735 files changed, 33688 insertions, 43728 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 61dbe52bb3a3..281a1ed03a04 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -637,7 +637,7 @@ static int v9fs_init_inode_cache(void)
 	v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache",
 					  sizeof(struct v9fs_inode),
 					  0, (SLAB_RECLAIM_ACCOUNT|
-					      SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+					      SLAB_ACCOUNT),
 					  v9fs_inode_init_once);
 	if (!v9fs_inode_cache)
 		return -ENOMEM;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 698c43dd5dc8..9defa12208f9 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -179,16 +179,13 @@ extern int v9fs_vfs_rename(struct mnt_idmap *idmap,
 			   struct inode *old_dir, struct dentry *old_dentry,
 			   struct inode *new_dir, struct dentry *new_dentry,
 			   unsigned int flags);
-extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses,
-					 struct p9_fid *fid,
-					 struct super_block *sb, int new);
+extern struct inode *v9fs_fid_iget(struct super_block *sb, struct p9_fid *fid);
 extern const struct inode_operations v9fs_dir_inode_operations_dotl;
 extern const struct inode_operations v9fs_file_inode_operations_dotl;
 extern const struct inode_operations v9fs_symlink_inode_operations_dotl;
 extern const struct netfs_request_ops v9fs_req_ops;
-extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses,
-					      struct p9_fid *fid,
-					      struct super_block *sb, int new);
+extern struct inode *v9fs_fid_iget_dotl(struct super_block *sb,
+					struct p9_fid *fid);
 
 /* other default globals */
 #define V9FS_PORT	564
@@ -230,27 +227,9 @@ v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 			struct super_block *sb)
 {
 	if (v9fs_proto_dotl(v9ses))
-		return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 0);
+		return v9fs_fid_iget_dotl(sb, fid);
 	else
-		return v9fs_inode_from_fid(v9ses, fid, sb, 0);
-}
-
-/**
- * v9fs_get_new_inode_from_fid - Helper routine to populate an inode by
- * issuing a attribute request
- * @v9ses: session information
- * @fid: fid to issue attribute request for
- * @sb: superblock on which to create inode
- *
- */
-static inline struct inode *
-v9fs_get_new_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
-			    struct super_block *sb)
-{
-	if (v9fs_proto_dotl(v9ses))
-		return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 1);
-	else
-		return v9fs_inode_from_fid(v9ses, fid, sb, 1);
+		return v9fs_fid_iget(sb, fid);
 }
 
 #endif
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 0e8418066a48..7923c3c347cb 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -40,13 +40,16 @@ extern struct kmem_cache *v9fs_inode_cache;
 
 struct inode *v9fs_alloc_inode(struct super_block *sb);
 void v9fs_free_inode(struct inode *inode);
-struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode,
-			     dev_t rdev);
 void v9fs_set_netfs_context(struct inode *inode);
 int v9fs_init_inode(struct v9fs_session_info *v9ses,
-		    struct inode *inode, umode_t mode, dev_t rdev);
+		    struct inode *inode, struct p9_qid *qid, umode_t mode, dev_t rdev);
 void v9fs_evict_inode(struct inode *inode);
-ino_t v9fs_qid2ino(struct p9_qid *qid);
+#if (BITS_PER_LONG == 32)
+#define QID2INO(q) ((ino_t) (((q)->path+2) ^ (((q)->path) >> 32)))
+#else
+#define QID2INO(q) ((ino_t) ((q)->path+2))
+#endif
+
 void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 		      struct super_block *sb, unsigned int flags);
 void v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 4102759a5cb5..e0d34e4e9076 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -127,7 +127,7 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx)
 			}
 
 			over = !dir_emit(ctx, st.name, strlen(st.name),
-					 v9fs_qid2ino(&st.qid), dt_type(&st));
+					QID2INO(&st.qid), dt_type(&st));
 			p9stat_free(&st);
 			if (over)
 				return 0;
@@ -184,7 +184,7 @@ static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx)
 
 			if (!dir_emit(ctx, curdirent.d_name,
 				      strlen(curdirent.d_name),
-				      v9fs_qid2ino(&curdirent.qid),
+				      QID2INO(&curdirent.qid),
 				      curdirent.d_type))
 				return 0;
 
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index bae330c2f0cf..abdbbaee5184 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -107,7 +107,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
 
 	p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
 
-	if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
+	if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->c.flc_type != F_UNLCK) {
 		filemap_write_and_wait(inode->i_mapping);
 		invalidate_mapping_pages(&inode->i_data, 0, -1);
 	}
@@ -121,13 +121,12 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
 	struct p9_fid *fid;
 	uint8_t status = P9_LOCK_ERROR;
 	int res = 0;
-	unsigned char fl_type;
 	struct v9fs_session_info *v9ses;
 
 	fid = filp->private_data;
 	BUG_ON(fid == NULL);
 
-	BUG_ON((fl->fl_flags & FL_POSIX) != FL_POSIX);
+	BUG_ON((fl->c.flc_flags & FL_POSIX) != FL_POSIX);
 
 	res = locks_lock_file_wait(filp, fl);
 	if (res < 0)
@@ -136,7 +135,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
 	/* convert posix lock to p9 tlock args */
 	memset(&flock, 0, sizeof(flock));
 	/* map the lock type */
-	switch (fl->fl_type) {
+	switch (fl->c.flc_type) {
 	case F_RDLCK:
 		flock.type = P9_LOCK_TYPE_RDLCK;
 		break;
@@ -152,7 +151,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl)
 		flock.length = 0;
 	else
 		flock.length = fl->fl_end - fl->fl_start + 1;
-	flock.proc_id = fl->fl_pid;
+	flock.proc_id = fl->c.flc_pid;
 	flock.client_id = fid->clnt->name;
 	if (IS_SETLKW(cmd))
 		flock.flags = P9_LOCK_FLAGS_BLOCK;
@@ -207,12 +206,13 @@ out_unlock:
 	 * incase server returned error for lock request, revert
 	 * it locally
 	 */
-	if (res < 0 && fl->fl_type != F_UNLCK) {
-		fl_type = fl->fl_type;
-		fl->fl_type = F_UNLCK;
+	if (res < 0 && fl->c.flc_type != F_UNLCK) {
+		unsigned char type = fl->c.flc_type;
+
+		fl->c.flc_type = F_UNLCK;
 		/* Even if this fails we want to return the remote error */
 		locks_lock_file_wait(filp, fl);
-		fl->fl_type = fl_type;
+		fl->c.flc_type = type;
 	}
 	if (flock.client_id != fid->clnt->name)
 		kfree(flock.client_id);
@@ -234,7 +234,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
 	 * if we have a conflicting lock locally, no need to validate
 	 * with server
 	 */
-	if (fl->fl_type != F_UNLCK)
+	if (fl->c.flc_type != F_UNLCK)
 		return res;
 
 	/* convert posix lock to p9 tgetlock args */
@@ -245,7 +245,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
 		glock.length = 0;
 	else
 		glock.length = fl->fl_end - fl->fl_start + 1;
-	glock.proc_id = fl->fl_pid;
+	glock.proc_id = fl->c.flc_pid;
 	glock.client_id = fid->clnt->name;
 
 	res = p9_client_getlock_dotl(fid, &glock);
@@ -254,13 +254,13 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
 	/* map 9p lock type to os lock type */
 	switch (glock.type) {
 	case P9_LOCK_TYPE_RDLCK:
-		fl->fl_type = F_RDLCK;
+		fl->c.flc_type = F_RDLCK;
 		break;
 	case P9_LOCK_TYPE_WRLCK:
-		fl->fl_type = F_WRLCK;
+		fl->c.flc_type = F_WRLCK;
 		break;
 	case P9_LOCK_TYPE_UNLCK:
-		fl->fl_type = F_UNLCK;
+		fl->c.flc_type = F_UNLCK;
 		break;
 	}
 	if (glock.type != P9_LOCK_TYPE_UNLCK) {
@@ -269,7 +269,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl)
 			fl->fl_end = OFFSET_MAX;
 		else
 			fl->fl_end = glock.start + glock.length - 1;
-		fl->fl_pid = -glock.proc_id;
+		fl->c.flc_pid = -glock.proc_id;
 	}
 out:
 	if (glock.client_id != fid->clnt->name)
@@ -293,7 +293,7 @@ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl)
 	p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n",
 		 filp, cmd, fl, filp);
 
-	if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
+	if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->c.flc_type != F_UNLCK) {
 		filemap_write_and_wait(inode->i_mapping);
 		invalidate_mapping_pages(&inode->i_data, 0, -1);
 	}
@@ -324,16 +324,16 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd,
 	p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n",
 		 filp, cmd, fl, filp);
 
-	if (!(fl->fl_flags & FL_FLOCK))
+	if (!(fl->c.flc_flags & FL_FLOCK))
 		goto out_err;
 
-	if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
+	if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->c.flc_type != F_UNLCK) {
 		filemap_write_and_wait(inode->i_mapping);
 		invalidate_mapping_pages(&inode->i_data, 0, -1);
 	}
 	/* Convert flock to posix lock */
-	fl->fl_flags |= FL_POSIX;
-	fl->fl_flags ^= FL_FLOCK;
+	fl->c.flc_flags |= FL_POSIX;
+	fl->c.flc_flags ^= FL_FLOCK;
 
 	if (IS_SETLK(cmd) | IS_SETLKW(cmd))
 		ret = v9fs_file_do_lock(filp, cmd, fl);
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 32572982f72e..b01b1bbf2493 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -253,9 +253,12 @@ void v9fs_set_netfs_context(struct inode *inode)
 }
 
 int v9fs_init_inode(struct v9fs_session_info *v9ses,
-		    struct inode *inode, umode_t mode, dev_t rdev)
+		    struct inode *inode, struct p9_qid *qid, umode_t mode, dev_t rdev)
 {
 	int err = 0;
+	struct v9fs_inode *v9inode = V9FS_I(inode);
+
+	memcpy(&v9inode->qid, qid, sizeof(struct p9_qid));
 
 	inode_init_owner(&nop_mnt_idmap, inode, NULL, mode);
 	inode->i_blocks = 0;
@@ -332,36 +335,6 @@ error:
 }
 
 /**
- * v9fs_get_inode - helper function to setup an inode
- * @sb: superblock
- * @mode: mode to setup inode with
- * @rdev: The device numbers to set
- */
-
-struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev)
-{
-	int err;
-	struct inode *inode;
-	struct v9fs_session_info *v9ses = sb->s_fs_info;
-
-	p9_debug(P9_DEBUG_VFS, "super block: %p mode: %ho\n", sb, mode);
-
-	inode = new_inode(sb);
-	if (!inode) {
-		pr_warn("%s (%d): Problem allocating inode\n",
-			__func__, task_pid_nr(current));
-		return ERR_PTR(-ENOMEM);
-	}
-	err = v9fs_init_inode(v9ses, inode, mode, rdev);
-	if (err) {
-		iput(inode);
-		return ERR_PTR(err);
-	}
-	v9fs_set_netfs_context(inode);
-	return inode;
-}
-
-/**
  * v9fs_evict_inode - Remove an inode from the inode cache
  * @inode: inode to release
  *
@@ -371,95 +344,57 @@ void v9fs_evict_inode(struct inode *inode)
 	struct v9fs_inode __maybe_unused *v9inode = V9FS_I(inode);
 	__le32 __maybe_unused version;
 
-	truncate_inode_pages_final(&inode->i_data);
+	if (!is_bad_inode(inode)) {
+		truncate_inode_pages_final(&inode->i_data);
 
-	version = cpu_to_le32(v9inode->qid.version);
-	netfs_clear_inode_writeback(inode, &version);
+		version = cpu_to_le32(v9inode->qid.version);
+		netfs_clear_inode_writeback(inode, &version);
 
-	clear_inode(inode);
-	filemap_fdatawrite(&inode->i_data);
+		clear_inode(inode);
+		filemap_fdatawrite(&inode->i_data);
 
 #ifdef CONFIG_9P_FSCACHE
-	fscache_relinquish_cookie(v9fs_inode_cookie(v9inode), false);
+		if (v9fs_inode_cookie(v9inode))
+			fscache_relinquish_cookie(v9fs_inode_cookie(v9inode), false);
 #endif
+	} else
+		clear_inode(inode);
 }
 
-static int v9fs_test_inode(struct inode *inode, void *data)
-{
-	int umode;
-	dev_t rdev;
-	struct v9fs_inode *v9inode = V9FS_I(inode);
-	struct p9_wstat *st = (struct p9_wstat *)data;
-	struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode);
-
-	umode = p9mode2unixmode(v9ses, st, &rdev);
-	/* don't match inode of different type */
-	if (inode_wrong_type(inode, umode))
-		return 0;
-
-	/* compare qid details */
-	if (memcmp(&v9inode->qid.version,
-		   &st->qid.version, sizeof(v9inode->qid.version)))
-		return 0;
-
-	if (v9inode->qid.type != st->qid.type)
-		return 0;
-
-	if (v9inode->qid.path != st->qid.path)
-		return 0;
-	return 1;
-}
-
-static int v9fs_test_new_inode(struct inode *inode, void *data)
-{
-	return 0;
-}
-
-static int v9fs_set_inode(struct inode *inode,  void *data)
-{
-	struct v9fs_inode *v9inode = V9FS_I(inode);
-	struct p9_wstat *st = (struct p9_wstat *)data;
-
-	memcpy(&v9inode->qid, &st->qid, sizeof(st->qid));
-	return 0;
-}
-
-static struct inode *v9fs_qid_iget(struct super_block *sb,
-				   struct p9_qid *qid,
-				   struct p9_wstat *st,
-				   int new)
+struct inode *v9fs_fid_iget(struct super_block *sb, struct p9_fid *fid)
 {
 	dev_t rdev;
 	int retval;
 	umode_t umode;
-	unsigned long i_ino;
 	struct inode *inode;
+	struct p9_wstat *st;
 	struct v9fs_session_info *v9ses = sb->s_fs_info;
-	int (*test)(struct inode *inode, void *data);
 
-	if (new)
-		test = v9fs_test_new_inode;
-	else
-		test = v9fs_test_inode;
-
-	i_ino = v9fs_qid2ino(qid);
-	inode = iget5_locked(sb, i_ino, test, v9fs_set_inode, st);
-	if (!inode)
+	inode = iget_locked(sb, QID2INO(&fid->qid));
+	if (unlikely(!inode))
 		return ERR_PTR(-ENOMEM);
 	if (!(inode->i_state & I_NEW))
 		return inode;
+
 	/*
 	 * initialize the inode with the stat info
 	 * FIXME!! we may need support for stale inodes
 	 * later.
 	 */
-	inode->i_ino = i_ino;
+	st = p9_client_stat(fid);
+	if (IS_ERR(st)) {
+		retval = PTR_ERR(st);
+		goto error;
+	}
+
 	umode = p9mode2unixmode(v9ses, st, &rdev);
-	retval = v9fs_init_inode(v9ses, inode, umode, rdev);
+	retval = v9fs_init_inode(v9ses, inode, &fid->qid, umode, rdev);
+	v9fs_stat2inode(st, inode, sb, 0);
+	p9stat_free(st);
+	kfree(st);
 	if (retval)
 		goto error;
 
-	v9fs_stat2inode(st, inode, sb, 0);
 	v9fs_set_netfs_context(inode);
 	v9fs_cache_inode_get_cookie(inode);
 	unlock_new_inode(inode);
@@ -470,23 +405,6 @@ error:
 
 }
 
-struct inode *
-v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
-		    struct super_block *sb, int new)
-{
-	struct p9_wstat *st;
-	struct inode *inode = NULL;
-
-	st = p9_client_stat(fid);
-	if (IS_ERR(st))
-		return ERR_CAST(st);
-
-	inode = v9fs_qid_iget(sb, &st->qid, st, new);
-	p9stat_free(st);
-	kfree(st);
-	return inode;
-}
-
 /**
  * v9fs_at_to_dotl_flags- convert Linux specific AT flags to
  * plan 9 AT flag.
@@ -633,7 +551,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 		/*
 		 * instantiate inode and assign the unopened fid to the dentry
 		 */
-		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
+		inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
 		if (IS_ERR(inode)) {
 			err = PTR_ERR(inode);
 			p9_debug(P9_DEBUG_VFS,
@@ -761,10 +679,8 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
 		inode = NULL;
 	else if (IS_ERR(fid))
 		inode = ERR_CAST(fid);
-	else if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
-		inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
 	else
-		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
+		inode = v9fs_get_inode_from_fid(v9ses, fid, dir->i_sb);
 	/*
 	 * If we had a rename on the server and a parallel lookup
 	 * for the new name, then make sure we instantiate with
@@ -1187,26 +1103,6 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
 }
 
 /**
- * v9fs_qid2ino - convert qid into inode number
- * @qid: qid to hash
- *
- * BUG: potential for inode number collisions?
- */
-
-ino_t v9fs_qid2ino(struct p9_qid *qid)
-{
-	u64 path = qid->path + 2;
-	ino_t i = 0;
-
-	if (sizeof(ino_t) == sizeof(path))
-		memcpy(&i, &path, sizeof(ino_t));
-	else
-		i = (ino_t) (path ^ (path >> 32));
-
-	return i;
-}
-
-/**
  * v9fs_vfs_get_link - follow a symlink path
  * @dentry: dentry for symlink
  * @inode: inode for symlink
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 3505227e1704..55dde186041a 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -52,82 +52,37 @@ static kgid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
 	return current_fsgid();
 }
 
-static int v9fs_test_inode_dotl(struct inode *inode, void *data)
-{
-	struct v9fs_inode *v9inode = V9FS_I(inode);
-	struct p9_stat_dotl *st = (struct p9_stat_dotl *)data;
-
-	/* don't match inode of different type */
-	if (inode_wrong_type(inode, st->st_mode))
-		return 0;
-
-	if (inode->i_generation != st->st_gen)
-		return 0;
-
-	/* compare qid details */
-	if (memcmp(&v9inode->qid.version,
-		   &st->qid.version, sizeof(v9inode->qid.version)))
-		return 0;
-
-	if (v9inode->qid.type != st->qid.type)
-		return 0;
-
-	if (v9inode->qid.path != st->qid.path)
-		return 0;
-	return 1;
-}
-
-/* Always get a new inode */
-static int v9fs_test_new_inode_dotl(struct inode *inode, void *data)
-{
-	return 0;
-}
-
-static int v9fs_set_inode_dotl(struct inode *inode,  void *data)
-{
-	struct v9fs_inode *v9inode = V9FS_I(inode);
-	struct p9_stat_dotl *st = (struct p9_stat_dotl *)data;
-
-	memcpy(&v9inode->qid, &st->qid, sizeof(st->qid));
-	inode->i_generation = st->st_gen;
-	return 0;
-}
-
-static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
-					struct p9_qid *qid,
-					struct p9_fid *fid,
-					struct p9_stat_dotl *st,
-					int new)
+struct inode *v9fs_fid_iget_dotl(struct super_block *sb, struct p9_fid *fid)
 {
 	int retval;
-	unsigned long i_ino;
 	struct inode *inode;
+	struct p9_stat_dotl *st;
 	struct v9fs_session_info *v9ses = sb->s_fs_info;
-	int (*test)(struct inode *inode, void *data);
-
-	if (new)
-		test = v9fs_test_new_inode_dotl;
-	else
-		test = v9fs_test_inode_dotl;
 
-	i_ino = v9fs_qid2ino(qid);
-	inode = iget5_locked(sb, i_ino, test, v9fs_set_inode_dotl, st);
-	if (!inode)
+	inode = iget_locked(sb, QID2INO(&fid->qid));
+	if (unlikely(!inode))
 		return ERR_PTR(-ENOMEM);
 	if (!(inode->i_state & I_NEW))
 		return inode;
+
 	/*
 	 * initialize the inode with the stat info
 	 * FIXME!! we may need support for stale inodes
 	 * later.
 	 */
-	inode->i_ino = i_ino;
-	retval = v9fs_init_inode(v9ses, inode,
+	st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN);
+	if (IS_ERR(st)) {
+		retval = PTR_ERR(st);
+		goto error;
+	}
+
+	retval = v9fs_init_inode(v9ses, inode, &fid->qid,
 				 st->st_mode, new_decode_dev(st->st_rdev));
+	v9fs_stat2inode_dotl(st, inode, 0);
+	kfree(st);
 	if (retval)
 		goto error;
 
-	v9fs_stat2inode_dotl(st, inode, 0);
 	v9fs_set_netfs_context(inode);
 	v9fs_cache_inode_get_cookie(inode);
 	retval = v9fs_get_acl(inode, fid);
@@ -135,6 +90,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
 		goto error;
 
 	unlock_new_inode(inode);
+
 	return inode;
 error:
 	iget_failed(inode);
@@ -142,22 +98,6 @@ error:
 
 }
 
-struct inode *
-v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid,
-			 struct super_block *sb, int new)
-{
-	struct p9_stat_dotl *st;
-	struct inode *inode = NULL;
-
-	st = p9_client_getattr_dotl(fid, P9_STATS_BASIC | P9_STATS_GEN);
-	if (IS_ERR(st))
-		return ERR_CAST(st);
-
-	inode = v9fs_qid_iget_dotl(sb, &st->qid, fid, st, new);
-	kfree(st);
-	return inode;
-}
-
 struct dotl_openflag_map {
 	int open_flag;
 	int dotl_flag;
@@ -307,7 +247,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
 		p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
 		goto out;
 	}
-	inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
+	inode = v9fs_fid_iget_dotl(dir->i_sb, fid);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n", err);
@@ -357,7 +297,6 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
 			       umode_t omode)
 {
 	int err;
-	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid = NULL, *dfid = NULL;
 	kgid_t gid;
 	const unsigned char *name;
@@ -367,7 +306,6 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
 	struct posix_acl *dacl = NULL, *pacl = NULL;
 
 	p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry);
-	v9ses = v9fs_inode2v9ses(dir);
 
 	omode |= S_IFDIR;
 	if (dir->i_mode & S_ISGID)
@@ -402,32 +340,17 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
 	}
 
 	/* instantiate inode and assign the unopened fid to the dentry */
-	if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
-		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
-				 err);
-			goto error;
-		}
-		v9fs_fid_add(dentry, &fid);
-		v9fs_set_create_acl(inode, fid, dacl, pacl);
-		d_instantiate(dentry, inode);
-		err = 0;
-	} else {
-		/*
-		 * Not in cached mode. No need to populate
-		 * inode with stat. We need to get an inode
-		 * so that we can set the acl with dentry
-		 */
-		inode = v9fs_get_inode(dir->i_sb, mode, 0);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			goto error;
-		}
-		v9fs_set_create_acl(inode, fid, dacl, pacl);
-		d_instantiate(dentry, inode);
+	inode = v9fs_fid_iget_dotl(dir->i_sb, fid);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
+			 err);
+		goto error;
 	}
+	v9fs_fid_add(dentry, &fid);
+	v9fs_set_create_acl(inode, fid, dacl, pacl);
+	d_instantiate(dentry, inode);
+	err = 0;
 	inc_nlink(dir);
 	v9fs_invalidate_inode_attr(dir);
 error:
@@ -709,14 +632,11 @@ v9fs_vfs_symlink_dotl(struct mnt_idmap *idmap, struct inode *dir,
 	kgid_t gid;
 	const unsigned char *name;
 	struct p9_qid qid;
-	struct inode *inode;
 	struct p9_fid *dfid;
 	struct p9_fid *fid = NULL;
-	struct v9fs_session_info *v9ses;
 
 	name = dentry->d_name.name;
 	p9_debug(P9_DEBUG_VFS, "%lu,%s,%s\n", dir->i_ino, name, symname);
-	v9ses = v9fs_inode2v9ses(dir);
 
 	dfid = v9fs_parent_fid(dentry);
 	if (IS_ERR(dfid)) {
@@ -736,36 +656,6 @@ v9fs_vfs_symlink_dotl(struct mnt_idmap *idmap, struct inode *dir,
 	}
 
 	v9fs_invalidate_inode_attr(dir);
-	if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
-		/* Now walk from the parent so we can get an unopened fid. */
-		fid = p9_client_walk(dfid, 1, &name, 1);
-		if (IS_ERR(fid)) {
-			err = PTR_ERR(fid);
-			p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n",
-				 err);
-			goto error;
-		}
-
-		/* instantiate inode and assign the unopened fid to dentry */
-		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
-				 err);
-			goto error;
-		}
-		v9fs_fid_add(dentry, &fid);
-		d_instantiate(dentry, inode);
-		err = 0;
-	} else {
-		/* Not in cached mode. No need to populate inode with stat */
-		inode = v9fs_get_inode(dir->i_sb, S_IFLNK, 0);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			goto error;
-		}
-		d_instantiate(dentry, inode);
-	}
 
 error:
 	p9_fid_put(fid);
@@ -847,7 +737,6 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir,
 	kgid_t gid;
 	const unsigned char *name;
 	umode_t mode;
-	struct v9fs_session_info *v9ses;
 	struct p9_fid *fid = NULL, *dfid = NULL;
 	struct inode *inode;
 	struct p9_qid qid;
@@ -857,7 +746,6 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir,
 		 dir->i_ino, dentry, omode,
 		 MAJOR(rdev), MINOR(rdev));
 
-	v9ses = v9fs_inode2v9ses(dir);
 	dfid = v9fs_parent_fid(dentry);
 	if (IS_ERR(dfid)) {
 		err = PTR_ERR(dfid);
@@ -888,33 +776,17 @@ v9fs_vfs_mknod_dotl(struct mnt_idmap *idmap, struct inode *dir,
 			 err);
 		goto error;
 	}
-
-	/* instantiate inode and assign the unopened fid to the dentry */
-	if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) {
-		inode = v9fs_get_new_inode_from_fid(v9ses, fid, dir->i_sb);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
-				 err);
-			goto error;
-		}
-		v9fs_set_create_acl(inode, fid, dacl, pacl);
-		v9fs_fid_add(dentry, &fid);
-		d_instantiate(dentry, inode);
-		err = 0;
-	} else {
-		/*
-		 * Not in cached mode. No need to populate inode with stat.
-		 * socket syscall returns a fd, so we need instantiate
-		 */
-		inode = v9fs_get_inode(dir->i_sb, mode, rdev);
-		if (IS_ERR(inode)) {
-			err = PTR_ERR(inode);
-			goto error;
-		}
-		v9fs_set_create_acl(inode, fid, dacl, pacl);
-		d_instantiate(dentry, inode);
+	inode = v9fs_fid_iget_dotl(dir->i_sb, fid);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		p9_debug(P9_DEBUG_VFS, "inode creation failed %d\n",
+			 err);
+		goto error;
 	}
+	v9fs_set_create_acl(inode, fid, dacl, pacl);
+	v9fs_fid_add(dentry, &fid);
+	d_instantiate(dentry, inode);
+	err = 0;
 error:
 	p9_fid_put(fid);
 	v9fs_put_acl(dacl, pacl);
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 941f7d0e0bfa..4236058c7bbd 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -110,7 +110,6 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
 	struct inode *inode = NULL;
 	struct dentry *root = NULL;
 	struct v9fs_session_info *v9ses = NULL;
-	umode_t mode = 0777 | S_ISVTX;
 	struct p9_fid *fid;
 	int retval = 0;
 
@@ -140,7 +139,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
 	else
 		sb->s_d_op = &v9fs_dentry_operations;
 
-	inode = v9fs_get_inode(sb, S_IFDIR | mode, 0);
+	inode = v9fs_get_inode_from_fid(v9ses, fid, sb);
 	if (IS_ERR(inode)) {
 		retval = PTR_ERR(inode);
 		goto release_sb;
@@ -152,32 +151,6 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
 		goto release_sb;
 	}
 	sb->s_root = root;
-	if (v9fs_proto_dotl(v9ses)) {
-		struct p9_stat_dotl *st = NULL;
-
-		st = p9_client_getattr_dotl(fid, P9_STATS_BASIC);
-		if (IS_ERR(st)) {
-			retval = PTR_ERR(st);
-			goto release_sb;
-		}
-		d_inode(root)->i_ino = v9fs_qid2ino(&st->qid);
-		v9fs_stat2inode_dotl(st, d_inode(root), 0);
-		kfree(st);
-	} else {
-		struct p9_wstat *st = NULL;
-
-		st = p9_client_stat(fid);
-		if (IS_ERR(st)) {
-			retval = PTR_ERR(st);
-			goto release_sb;
-		}
-
-		d_inode(root)->i_ino = v9fs_qid2ino(&st->qid);
-		v9fs_stat2inode(st, d_inode(root), sb, 0);
-
-		p9stat_free(st);
-		kfree(st);
-	}
 	retval = v9fs_get_acl(inode, fid);
 	if (retval)
 		goto release_sb;
@@ -271,21 +244,6 @@ done:
 	return res;
 }
 
-static int v9fs_drop_inode(struct inode *inode)
-{
-	struct v9fs_session_info *v9ses;
-
-	v9ses = v9fs_inode2v9ses(inode);
-	if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
-		return generic_drop_inode(inode);
-	/*
-	 * in case of non cached mode always drop the
-	 * inode because we want the inode attribute
-	 * to always match that on the server.
-	 */
-	return 1;
-}
-
 static int v9fs_write_inode(struct inode *inode,
 			    struct writeback_control *wbc)
 {
@@ -320,7 +278,6 @@ static const struct super_operations v9fs_super_ops_dotl = {
 	.alloc_inode = v9fs_alloc_inode,
 	.free_inode = v9fs_free_inode,
 	.statfs = v9fs_statfs,
-	.drop_inode = v9fs_drop_inode,
 	.evict_inode = v9fs_evict_inode,
 	.show_options = v9fs_show_options,
 	.umount_begin = v9fs_umount_begin,
diff --git a/fs/Kconfig b/fs/Kconfig
index 89fdbefd1075..a46b0cbc4d8f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -60,7 +60,6 @@ endif # BLOCK
 config FS_DAX
 	bool "File system based Direct Access (DAX) support"
 	depends on MMU
-	depends on !(ARM || MIPS || SPARC)
 	depends on ZONE_DEVICE || FS_DAX_LIMITED
 	select FS_IOMAP
 	select DAX
@@ -162,7 +161,6 @@ menu "DOS/FAT/EXFAT/NT Filesystems"
 
 source "fs/fat/Kconfig"
 source "fs/exfat/Kconfig"
-source "fs/ntfs/Kconfig"
 source "fs/ntfs3/Kconfig"
 
 endmenu
@@ -262,6 +260,7 @@ menuconfig HUGETLBFS
 	depends on X86 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN
 	depends on (SYSFS || SYSCTL)
 	select MEMFD_CREATE
+	select PADATA if SMP
 	help
 	  hugetlbfs is a filesystem backing for HugeTLB pages, based on
 	  ramfs. For architectures that support it, say Y here and read
diff --git a/fs/Makefile b/fs/Makefile
index c09016257f05..6ecc9b0a53f2 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -15,7 +15,7 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		pnode.o splice.o sync.o utimes.o d_path.o \
 		stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
 		fs_types.o fs_context.o fs_parser.o fsopen.o init.o \
-		kernel_read_file.o mnt_idmapping.o remap_range.o
+		kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o
 
 obj-$(CONFIG_BUFFER_HEAD)	+= buffer.o mpage.o
 obj-$(CONFIG_PROC_FS)		+= proc_namespace.o
@@ -91,7 +91,6 @@ obj-y				+= unicode/
 obj-$(CONFIG_SYSV_FS)		+= sysv/
 obj-$(CONFIG_SMBFS)		+= smb/
 obj-$(CONFIG_HPFS_FS)		+= hpfs/
-obj-$(CONFIG_NTFS_FS)		+= ntfs/
 obj-$(CONFIG_NTFS3_FS)		+= ntfs3/
 obj-$(CONFIG_UFS_FS)		+= ufs/
 obj-$(CONFIG_EFS_FS)		+= efs/
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index e8bfc38239cd..9354b14bbfe3 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -249,7 +249,7 @@ static int __init init_inodecache(void)
 	adfs_inode_cachep = kmem_cache_create("adfs_inode_cache",
 					     sizeof(struct adfs_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+						SLAB_ACCOUNT),
 					     init_once);
 	if (adfs_inode_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/affs/super.c b/fs/affs/super.c
index b56a95cf414a..3c5821339609 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -130,8 +130,7 @@ static int __init init_inodecache(void)
 {
 	affs_inode_cachep = kmem_cache_create("affs_inode_cache",
 					     sizeof(struct affs_inode_info),
-					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+					     0, (SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT),
 					     init_once);
 	if (affs_inode_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 8a67fc427e74..67afe68972d5 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -474,16 +474,6 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode,
 			continue;
 		}
 
-		/* Don't expose silly rename entries to userspace. */
-		if (nlen > 6 &&
-		    dire->u.name[0] == '.' &&
-		    ctx->actor != afs_lookup_filldir &&
-		    ctx->actor != afs_lookup_one_filldir &&
-		    memcmp(dire->u.name, ".__afs", 6) == 0) {
-			ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent);
-			continue;
-		}
-
 		/* found the next entry */
 		if (!dir_emit(ctx, dire->u.name, nlen,
 			      ntohl(dire->u.vnode),
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 9c6dea3139f5..f0e96a35093f 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -93,13 +93,13 @@ static void afs_grant_locks(struct afs_vnode *vnode)
 	bool exclusive = (vnode->lock_type == AFS_LOCK_WRITE);
 
 	list_for_each_entry_safe(p, _p, &vnode->pending_locks, fl_u.afs.link) {
-		if (!exclusive && p->fl_type == F_WRLCK)
+		if (!exclusive && lock_is_write(p))
 			continue;
 
 		list_move_tail(&p->fl_u.afs.link, &vnode->granted_locks);
 		p->fl_u.afs.state = AFS_LOCK_GRANTED;
 		trace_afs_flock_op(vnode, p, afs_flock_op_grant);
-		wake_up(&p->fl_wait);
+		locks_wake_up(p);
 	}
 }
 
@@ -112,25 +112,24 @@ static void afs_next_locker(struct afs_vnode *vnode, int error)
 {
 	struct file_lock *p, *_p, *next = NULL;
 	struct key *key = vnode->lock_key;
-	unsigned int fl_type = F_RDLCK;
+	unsigned int type = F_RDLCK;
 
 	_enter("");
 
 	if (vnode->lock_type == AFS_LOCK_WRITE)
-		fl_type = F_WRLCK;
+		type = F_WRLCK;
 
 	list_for_each_entry_safe(p, _p, &vnode->pending_locks, fl_u.afs.link) {
 		if (error &&
-		    p->fl_type == fl_type &&
-		    afs_file_key(p->fl_file) == key) {
+		    p->c.flc_type == type &&
+		    afs_file_key(p->c.flc_file) == key) {
 			list_del_init(&p->fl_u.afs.link);
 			p->fl_u.afs.state = error;
-			wake_up(&p->fl_wait);
+			locks_wake_up(p);
 		}
 
 		/* Select the next locker to hand off to. */
-		if (next &&
-		    (next->fl_type == F_WRLCK || p->fl_type == F_RDLCK))
+		if (next && (lock_is_write(next) || lock_is_read(p)))
 			continue;
 		next = p;
 	}
@@ -142,7 +141,7 @@ static void afs_next_locker(struct afs_vnode *vnode, int error)
 		afs_set_lock_state(vnode, AFS_VNODE_LOCK_SETTING);
 		next->fl_u.afs.state = AFS_LOCK_YOUR_TRY;
 		trace_afs_flock_op(vnode, next, afs_flock_op_wake);
-		wake_up(&next->fl_wait);
+		locks_wake_up(next);
 	} else {
 		afs_set_lock_state(vnode, AFS_VNODE_LOCK_NONE);
 		trace_afs_flock_ev(vnode, NULL, afs_flock_no_lockers, 0);
@@ -166,7 +165,7 @@ static void afs_kill_lockers_enoent(struct afs_vnode *vnode)
 			       struct file_lock, fl_u.afs.link);
 		list_del_init(&p->fl_u.afs.link);
 		p->fl_u.afs.state = -ENOENT;
-		wake_up(&p->fl_wait);
+		locks_wake_up(p);
 	}
 
 	key_put(vnode->lock_key);
@@ -464,14 +463,14 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
 
 	_enter("{%llx:%llu},%llu-%llu,%u,%u",
 	       vnode->fid.vid, vnode->fid.vnode,
-	       fl->fl_start, fl->fl_end, fl->fl_type, mode);
+	       fl->fl_start, fl->fl_end, fl->c.flc_type, mode);
 
 	fl->fl_ops = &afs_lock_ops;
 	INIT_LIST_HEAD(&fl->fl_u.afs.link);
 	fl->fl_u.afs.state = AFS_LOCK_PENDING;
 
 	partial = (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX);
-	type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
+	type = lock_is_read(fl) ? AFS_LOCK_READ : AFS_LOCK_WRITE;
 	if (mode == afs_flock_mode_write && partial)
 		type = AFS_LOCK_WRITE;
 
@@ -524,7 +523,7 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl)
 	}
 
 	if (vnode->lock_state == AFS_VNODE_LOCK_NONE &&
-	    !(fl->fl_flags & FL_SLEEP)) {
+	    !(fl->c.flc_flags & FL_SLEEP)) {
 		ret = -EAGAIN;
 		if (type == AFS_LOCK_READ) {
 			if (vnode->status.lock_count == -1)
@@ -621,7 +620,7 @@ skip_server_lock:
 	return 0;
 
 lock_is_contended:
-	if (!(fl->fl_flags & FL_SLEEP)) {
+	if (!(fl->c.flc_flags & FL_SLEEP)) {
 		list_del_init(&fl->fl_u.afs.link);
 		afs_next_locker(vnode, 0);
 		ret = -EAGAIN;
@@ -641,7 +640,7 @@ need_to_wait:
 	spin_unlock(&vnode->lock);
 
 	trace_afs_flock_ev(vnode, fl, afs_flock_waiting, 0);
-	ret = wait_event_interruptible(fl->fl_wait,
+	ret = wait_event_interruptible(fl->c.flc_wait,
 				       fl->fl_u.afs.state != AFS_LOCK_PENDING);
 	trace_afs_flock_ev(vnode, fl, afs_flock_waited, ret);
 
@@ -704,7 +703,8 @@ static int afs_do_unlk(struct file *file, struct file_lock *fl)
 	struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
 	int ret;
 
-	_enter("{%llx:%llu},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type);
+	_enter("{%llx:%llu},%u", vnode->fid.vid, vnode->fid.vnode,
+	       fl->c.flc_type);
 
 	trace_afs_flock_op(vnode, fl, afs_flock_op_unlock);
 
@@ -730,11 +730,11 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl)
 	if (vnode->lock_state == AFS_VNODE_LOCK_DELETED)
 		return -ENOENT;
 
-	fl->fl_type = F_UNLCK;
+	fl->c.flc_type = F_UNLCK;
 
 	/* check local lock records first */
 	posix_test_lock(file, fl);
-	if (fl->fl_type == F_UNLCK) {
+	if (lock_is_unlock(fl)) {
 		/* no local locks; consult the server */
 		ret = afs_fetch_status(vnode, key, false, NULL);
 		if (ret < 0)
@@ -743,18 +743,18 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl)
 		lock_count = READ_ONCE(vnode->status.lock_count);
 		if (lock_count != 0) {
 			if (lock_count > 0)
-				fl->fl_type = F_RDLCK;
+				fl->c.flc_type = F_RDLCK;
 			else
-				fl->fl_type = F_WRLCK;
+				fl->c.flc_type = F_WRLCK;
 			fl->fl_start = 0;
 			fl->fl_end = OFFSET_MAX;
-			fl->fl_pid = 0;
+			fl->c.flc_pid = 0;
 		}
 	}
 
 	ret = 0;
 error:
-	_leave(" = %d [%hd]", ret, fl->fl_type);
+	_leave(" = %d [%hd]", ret, fl->c.flc_type);
 	return ret;
 }
 
@@ -769,7 +769,7 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl)
 
 	_enter("{%llx:%llu},%d,{t=%x,fl=%x,r=%Ld:%Ld}",
 	       vnode->fid.vid, vnode->fid.vnode, cmd,
-	       fl->fl_type, fl->fl_flags,
+	       fl->c.flc_type, fl->c.flc_flags,
 	       (long long) fl->fl_start, (long long) fl->fl_end);
 
 	if (IS_GETLK(cmd))
@@ -778,7 +778,7 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl)
 	fl->fl_u.afs.debug_id = atomic_inc_return(&afs_file_lock_debug_id);
 	trace_afs_flock_op(vnode, fl, afs_flock_op_lock);
 
-	if (fl->fl_type == F_UNLCK)
+	if (lock_is_unlock(fl))
 		ret = afs_do_unlk(file, fl);
 	else
 		ret = afs_do_setlk(file, fl);
@@ -804,7 +804,7 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl)
 
 	_enter("{%llx:%llu},%d,{t=%x,fl=%x}",
 	       vnode->fid.vid, vnode->fid.vnode, cmd,
-	       fl->fl_type, fl->fl_flags);
+	       fl->c.flc_type, fl->c.flc_flags);
 
 	/*
 	 * No BSD flocks over NFS allowed.
@@ -813,14 +813,14 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl)
 	 * Not sure whether that would be unique, though, or whether
 	 * that would break in other places.
 	 */
-	if (!(fl->fl_flags & FL_FLOCK))
+	if (!(fl->c.flc_flags & FL_FLOCK))
 		return -ENOLCK;
 
 	fl->fl_u.afs.debug_id = atomic_inc_return(&afs_file_lock_debug_id);
 	trace_afs_flock_op(vnode, fl, afs_flock_op_flock);
 
 	/* we're simulating flock() locks using posix locks on the server */
-	if (fl->fl_type == F_UNLCK)
+	if (lock_is_unlock(fl))
 		ret = afs_do_unlk(file, fl);
 	else
 		ret = afs_do_setlk(file, fl);
@@ -843,7 +843,7 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl)
  */
 static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl)
 {
-	struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->fl_file));
+	struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->c.flc_file));
 
 	_enter("");
 
@@ -861,7 +861,7 @@ static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl)
  */
 static void afs_fl_release_private(struct file_lock *fl)
 {
-	struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->fl_file));
+	struct afs_vnode *vnode = AFS_FS_I(file_inode(fl->c.flc_file));
 
 	_enter("");
 
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index 700a27bc8c25..ed04bd1eeae8 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -602,6 +602,8 @@ iterate_address:
 		goto wait_for_more_probe_results;
 
 	alist = op->estate->addresses;
+	best_prio = -1;
+	addr_index = 0;
 	for (i = 0; i < alist->nr_addrs; i++) {
 		if (alist->addrs[i].prio > best_prio) {
 			addr_index = i;
@@ -609,9 +611,7 @@ iterate_address:
 		}
 	}
 
-	addr_index = READ_ONCE(alist->preferred);
-	if (!test_bit(addr_index, &set))
-		addr_index = __ffs(set);
+	alist->preferred = addr_index;
 
 	op->addr_index = addr_index;
 	set_bit(addr_index, &op->addr_tried);
@@ -656,12 +656,6 @@ wait_for_more_probe_results:
 next_server:
 	trace_afs_rotate(op, afs_rotate_trace_next_server, 0);
 	_debug("next");
-	ASSERT(op->estate);
-	alist = op->estate->addresses;
-	if (op->call_responded &&
-	    op->addr_index != READ_ONCE(alist->preferred) &&
-	    test_bit(alist->preferred, &op->addr_tried))
-		WRITE_ONCE(alist->preferred, op->addr_index);
 	op->estate = NULL;
 	goto pick_server;
 
@@ -690,14 +684,7 @@ no_more_servers:
 failed:
 	trace_afs_rotate(op, afs_rotate_trace_failed, 0);
 	op->flags |= AFS_OPERATION_STOP;
-	if (op->estate) {
-		alist = op->estate->addresses;
-		if (op->call_responded &&
-		    op->addr_index != READ_ONCE(alist->preferred) &&
-		    test_bit(alist->preferred, &op->addr_tried))
-			WRITE_ONCE(alist->preferred, op->addr_index);
-		op->estate = NULL;
-	}
+	op->estate = NULL;
 	_leave(" = f [failed %d]", afs_op_error(op));
 	return false;
 }
diff --git a/fs/afs/validation.c b/fs/afs/validation.c
index 46b37f2cce7d..32a53fc8dfb2 100644
--- a/fs/afs/validation.c
+++ b/fs/afs/validation.c
@@ -122,6 +122,9 @@ bool afs_check_validity(const struct afs_vnode *vnode)
 	const struct afs_volume *volume = vnode->volume;
 	time64_t deadline = ktime_get_real_seconds() + 10;
 
+	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+		return true;
+
 	if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
 	    atomic64_read(&vnode->cb_expires_at)  <= deadline ||
 	    volume->cb_expires_at <= deadline ||
@@ -389,12 +392,17 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
 	       key_serial(key));
 
 	if (afs_check_validity(vnode))
-		return 0;
+		return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0;
 
 	ret = down_write_killable(&vnode->validate_lock);
 	if (ret < 0)
 		goto error;
 
+	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
+		ret = -ESTALE;
+		goto error_unlock;
+	}
+
 	/* Validate a volume after the v_break has changed or the volume
 	 * callback expired.  We only want to do this once per volume per
 	 * v_break change.  The actual work will be done when parsing the
@@ -448,12 +456,6 @@ int afs_validate(struct afs_vnode *vnode, struct key *key)
 	vnode->cb_ro_snapshot = cb_ro_snapshot;
 	vnode->cb_scrub = cb_scrub;
 
-	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
-		_debug("file already deleted");
-		ret = -ESTALE;
-		goto error_unlock;
-	}
-
 	/* if the vnode's data version number changed then its contents are
 	 * different */
 	zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
diff --git a/fs/aio.c b/fs/aio.c
index 28223f511931..0f4f531c9780 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -589,8 +589,8 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
 
 void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
 {
-	struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, rw);
-	struct kioctx *ctx = req->ki_ctx;
+	struct aio_kiocb *req;
+	struct kioctx *ctx;
 	unsigned long flags;
 
 	/*
@@ -600,9 +600,13 @@ void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
 	if (!(iocb->ki_flags & IOCB_AIO_RW))
 		return;
 
+	req = container_of(iocb, struct aio_kiocb, rw);
+
 	if (WARN_ON_ONCE(!list_empty(&req->ki_list)))
 		return;
 
+	ctx = req->ki_ctx;
+
 	spin_lock_irqsave(&ctx->ctx_lock, flags);
 	list_add_tail(&req->ki_list, &ctx->active_reqs);
 	req->ki_cancel = cancel;
@@ -1198,8 +1202,8 @@ static void aio_complete(struct aio_kiocb *iocb)
 		spin_lock_irqsave(&ctx->wait.lock, flags);
 		list_for_each_entry_safe(curr, next, &ctx->wait.head, w.entry)
 			if (avail >= curr->min_nr) {
-				list_del_init_careful(&curr->w.entry);
 				wake_up_process(curr->w.private);
+				list_del_init_careful(&curr->w.entry);
 			}
 		spin_unlock_irqrestore(&ctx->wait.lock, flags);
 	}
@@ -2165,11 +2169,14 @@ COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
 #endif
 
 /* sys_io_cancel:
- *	Attempts to cancel an iocb previously passed to io_submit(). If the
- *	operation is successfully cancelled 0 is returned. May fail with
- *	-EFAULT if any of the data structures pointed to are invalid. May
- *	fail with -EINVAL if aio_context specified by ctx_id is invalid. Will
- *	fail with -ENOSYS if not implemented.
+ *	Attempts to cancel an iocb previously passed to io_submit.  If
+ *	the operation is successfully cancelled, the resulting event is
+ *	copied into the memory pointed to by result without being placed
+ *	into the completion queue and 0 is returned.  May fail with
+ *	-EFAULT if any of the data structures pointed to are invalid.
+ *	May fail with -EINVAL if aio_context specified by ctx_id is
+ *	invalid.  May fail with -EAGAIN if the iocb specified was not
+ *	cancelled.  Will fail with -ENOSYS if not implemented.
  */
 SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 		struct io_event __user *, result)
@@ -2200,12 +2207,14 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 	}
 	spin_unlock_irq(&ctx->ctx_lock);
 
-	/*
-	 * The result argument is no longer used - the io_event is always
-	 * delivered via the ring buffer.
-	 */
-	if (ret == 0 && kiocb->rw.ki_flags & IOCB_AIO_RW)
-		aio_complete_rw(&kiocb->rw, -EINTR);
+	if (!ret) {
+		/*
+		 * The result argument is no longer used - the io_event is
+		 * always delivered via the ring buffer. -EINPROGRESS indicates
+		 * cancellation is progress:
+		 */
+		ret = -EINPROGRESS;
+	}
 
 	percpu_ref_put(&ctx->users);
 
diff --git a/fs/attr.c b/fs/attr.c
index 5a13f0c8495f..960a310581eb 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -16,8 +16,6 @@
 #include <linux/fcntl.h>
 #include <linux/filelock.h>
 #include <linux/security.h>
-#include <linux/evm.h>
-#include <linux/ima.h>
 
 #include "internal.h"
 
@@ -352,7 +350,7 @@ int may_setattr(struct mnt_idmap *idmap, struct inode *inode,
 EXPORT_SYMBOL(may_setattr);
 
 /**
- * notify_change - modify attributes of a filesytem object
+ * notify_change - modify attributes of a filesystem object
  * @idmap:	idmap of the mount the inode was found from
  * @dentry:	object affected
  * @attr:	new attributes
@@ -502,8 +500,7 @@ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
 
 	if (!error) {
 		fsnotify_change(dentry, ia_valid);
-		ima_inode_post_setattr(idmap, dentry);
-		evm_inode_post_setattr(dentry, ia_valid);
+		security_inode_post_setattr(idmap, dentry, ia_valid);
 	}
 
 	return error;
diff --git a/fs/backing-file.c b/fs/backing-file.c
index a681f38d84d8..740185198db3 100644
--- a/fs/backing-file.c
+++ b/fs/backing-file.c
@@ -325,9 +325,7 @@ EXPORT_SYMBOL_GPL(backing_file_mmap);
 
 static int __init backing_aio_init(void)
 {
-	backing_aio_cachep = kmem_cache_create("backing_aio",
-					       sizeof(struct backing_aio),
-					       0, SLAB_HWCACHE_ALIGN, NULL);
+	backing_aio_cachep = KMEM_CACHE(backing_aio, SLAB_HWCACHE_ALIGN);
 	if (!backing_aio_cachep)
 		return -ENOMEM;
 
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index 1a05cecda7cc..66ca0bbee639 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -17,6 +17,7 @@ bcachefs-y		:=	\
 	btree_journal_iter.o	\
 	btree_key_cache.o	\
 	btree_locking.o		\
+	btree_node_scan.o	\
 	btree_trans_commit.o	\
 	btree_update.o		\
 	btree_update_interior.o	\
@@ -37,6 +38,7 @@ bcachefs-y		:=	\
 	error.o			\
 	extents.o		\
 	extent_update.o		\
+	eytzinger.o		\
 	fs.o			\
 	fs-common.o		\
 	fs-ioctl.o		\
@@ -67,6 +69,7 @@ bcachefs-y		:=	\
 	quota.o			\
 	rebalance.o		\
 	recovery.o		\
+	recovery_passes.o	\
 	reflink.o		\
 	replicas.o		\
 	sb-clean.o		\
@@ -82,6 +85,7 @@ bcachefs-y		:=	\
 	super-io.o		\
 	sysfs.o			\
 	tests.o			\
+	time_stats.o		\
 	thread_with_file.o	\
 	trace.o			\
 	two_state_shared_lock.o	\
@@ -90,3 +94,6 @@ bcachefs-y		:=	\
 	xattr.o
 
 obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST)   += mean_and_variance_test.o
+
+# Silence "note: xyz changed in GCC X.X" messages
+subdir-ccflags-y += $(call cc-disable-warning, psabi)
diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c
index 3640f417cce1..5c180fdc3efb 100644
--- a/fs/bcachefs/acl.c
+++ b/fs/bcachefs/acl.c
@@ -281,7 +281,6 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap,
 	struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
 	struct btree_trans *trans = bch2_trans_get(c);
 	struct btree_iter iter = { NULL };
-	struct bkey_s_c_xattr xattr;
 	struct posix_acl *acl = NULL;
 	struct bkey_s_c k;
 	int ret;
@@ -290,28 +289,27 @@ retry:
 
 	ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
 			&hash, inode_inum(inode), &search, 0);
-	if (ret) {
-		if (!bch2_err_matches(ret, ENOENT))
-			acl = ERR_PTR(ret);
-		goto out;
-	}
+	if (ret)
+		goto err;
 
 	k = bch2_btree_iter_peek_slot(&iter);
 	ret = bkey_err(k);
-	if (ret) {
-		acl = ERR_PTR(ret);
-		goto out;
-	}
+	if (ret)
+		goto err;
 
-	xattr = bkey_s_c_to_xattr(k);
+	struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
 	acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
-			le16_to_cpu(xattr.v->x_val_len));
+				 le16_to_cpu(xattr.v->x_val_len));
+	ret = PTR_ERR_OR_ZERO(acl);
+err:
+	if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
+		goto retry;
 
-	if (!IS_ERR(acl))
+	if (ret)
+		acl = !bch2_err_matches(ret, ENOENT) ? ERR_PTR(ret) : NULL;
+
+	if (!IS_ERR_OR_NULL(acl))
 		set_cached_acl(&inode->v, type, acl);
-out:
-	if (bch2_err_matches(PTR_ERR_OR_ZERO(acl), BCH_ERR_transaction_restart))
-		goto retry;
 
 	bch2_trans_iter_exit(trans, &iter);
 	bch2_trans_put(trans);
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index fd3e175d8342..4ff56fa4d539 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -29,6 +29,8 @@
 #include <linux/sched/task.h>
 #include <linux/sort.h>
 
+static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket);
+
 /* Persistent alloc info: */
 
 static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = {
@@ -530,13 +532,13 @@ int bch2_bucket_gens_init(struct bch_fs *c)
 		u8 gen = bch2_alloc_to_v4(k, &a)->gen;
 		unsigned offset;
 		struct bpos pos = alloc_gens_pos(iter.pos, &offset);
+		int ret2 = 0;
 
 		if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
-			ret = commit_do(trans, NULL, NULL,
-					BCH_TRANS_COMMIT_no_enospc,
-				bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
-			if (ret)
-				break;
+			ret2 =  bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0) ?:
+				bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+			if (ret2)
+				goto iter_err;
 			have_bucket_gens_key = false;
 		}
 
@@ -547,7 +549,8 @@ int bch2_bucket_gens_init(struct bch_fs *c)
 		}
 
 		g.v.gens[offset] = gen;
-		0;
+iter_err:
+		ret2;
 	}));
 
 	if (have_bucket_gens_key && !ret)
@@ -850,7 +853,7 @@ int bch2_trigger_alloc(struct btree_trans *trans,
 					bucket_journal_seq);
 			if (ret) {
 				bch2_fs_fatal_error(c,
-					"error setting bucket_needs_journal_commit: %i", ret);
+					"setting bucket_needs_journal_commit: %s", bch2_err_str(ret));
 				return ret;
 			}
 		}
@@ -860,23 +863,28 @@ int bch2_trigger_alloc(struct btree_trans *trans,
 			*bucket_gen(ca, new.k->p.offset) = new_a->gen;
 
 		bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false);
+		percpu_up_read(&c->mark_lock);
+
+#define eval_state(_a, expr)		({ const struct bch_alloc_v4 *a = _a; expr; })
+#define statechange(expr)		!eval_state(old_a, expr) && eval_state(new_a, expr)
+#define bucket_flushed(a)		(!a->journal_seq || a->journal_seq <= c->journal.flushed_seq_ondisk)
 
-		if (new_a->data_type == BCH_DATA_free &&
-		    (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk))
+		if (statechange(a->data_type == BCH_DATA_free) &&
+		    bucket_flushed(new_a))
 			closure_wake_up(&c->freelist_wait);
 
-		if (new_a->data_type == BCH_DATA_need_discard &&
-		    (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk))
-			bch2_do_discards(c);
+		if (statechange(a->data_type == BCH_DATA_need_discard) &&
+		    !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
+		    bucket_flushed(new_a))
+			bch2_discard_one_bucket_fast(c, new.k->p);
 
-		if (old_a->data_type != BCH_DATA_cached &&
-		    new_a->data_type == BCH_DATA_cached &&
+		if (statechange(a->data_type == BCH_DATA_cached) &&
+		    !bch2_bucket_is_open(c, new.k->p.inode, new.k->p.offset) &&
 		    should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
 			bch2_do_invalidates(c);
 
-		if (new_a->data_type == BCH_DATA_need_gc_gens)
+		if (statechange(a->data_type == BCH_DATA_need_gc_gens))
 			bch2_do_gc_gens(c);
-		percpu_up_read(&c->mark_lock);
 	}
 
 	if ((flags & BTREE_TRIGGER_GC) &&
@@ -1045,14 +1053,13 @@ int bch2_check_alloc_key(struct btree_trans *trans,
 	if (ret)
 		goto err;
 
-	if (k.k->type != discard_key_type &&
-	    (c->opts.reconstruct_alloc ||
-	     fsck_err(c, need_discard_key_wrong,
-		      "incorrect key in need_discard btree (got %s should be %s)\n"
-		      "  %s",
-		      bch2_bkey_types[k.k->type],
-		      bch2_bkey_types[discard_key_type],
-		      (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+	if (fsck_err_on(k.k->type != discard_key_type,
+			c, need_discard_key_wrong,
+			"incorrect key in need_discard btree (got %s should be %s)\n"
+			"  %s",
+			bch2_bkey_types[k.k->type],
+			bch2_bkey_types[discard_key_type],
+			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
 		struct bkey_i *update =
 			bch2_trans_kmalloc(trans, sizeof(*update));
 
@@ -1076,15 +1083,14 @@ int bch2_check_alloc_key(struct btree_trans *trans,
 	if (ret)
 		goto err;
 
-	if (k.k->type != freespace_key_type &&
-	    (c->opts.reconstruct_alloc ||
-	     fsck_err(c, freespace_key_wrong,
-		      "incorrect key in freespace btree (got %s should be %s)\n"
-		      "  %s",
-		      bch2_bkey_types[k.k->type],
-		      bch2_bkey_types[freespace_key_type],
-		      (printbuf_reset(&buf),
-		       bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+	if (fsck_err_on(k.k->type != freespace_key_type,
+			c, freespace_key_wrong,
+			"incorrect key in freespace btree (got %s should be %s)\n"
+			"  %s",
+			bch2_bkey_types[k.k->type],
+			bch2_bkey_types[freespace_key_type],
+			(printbuf_reset(&buf),
+			 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
 		struct bkey_i *update =
 			bch2_trans_kmalloc(trans, sizeof(*update));
 
@@ -1108,14 +1114,13 @@ int bch2_check_alloc_key(struct btree_trans *trans,
 	if (ret)
 		goto err;
 
-	if (a->gen != alloc_gen(k, gens_offset) &&
-	    (c->opts.reconstruct_alloc ||
-	     fsck_err(c, bucket_gens_key_wrong,
-		      "incorrect gen in bucket_gens btree (got %u should be %u)\n"
-		      "  %s",
-		      alloc_gen(k, gens_offset), a->gen,
-		      (printbuf_reset(&buf),
-		       bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) {
+	if (fsck_err_on(a->gen != alloc_gen(k, gens_offset),
+			c, bucket_gens_key_wrong,
+			"incorrect gen in bucket_gens btree (got %u should be %u)\n"
+			"  %s",
+			alloc_gen(k, gens_offset), a->gen,
+			(printbuf_reset(&buf),
+			 bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
 		struct bkey_i_bucket_gens *g =
 			bch2_trans_kmalloc(trans, sizeof(*g));
 
@@ -1167,14 +1172,13 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans,
 
 	*end = bkey_min(k.k->p, *end);
 
-	if (k.k->type != KEY_TYPE_set &&
-	    (c->opts.reconstruct_alloc ||
-	     fsck_err(c, freespace_hole_missing,
-		      "hole in alloc btree missing in freespace btree\n"
-		      "  device %llu buckets %llu-%llu",
-		      freespace_iter->pos.inode,
-		      freespace_iter->pos.offset,
-		      end->offset))) {
+	if (fsck_err_on(k.k->type != KEY_TYPE_set,
+			c, freespace_hole_missing,
+			"hole in alloc btree missing in freespace btree\n"
+			"  device %llu buckets %llu-%llu",
+			freespace_iter->pos.inode,
+			freespace_iter->pos.offset,
+			end->offset)) {
 		struct bkey_i *update =
 			bch2_trans_kmalloc(trans, sizeof(*update));
 
@@ -1604,6 +1608,36 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
 	return ret;
 }
 
+static int discard_in_flight_add(struct bch_fs *c, struct bpos bucket)
+{
+	int ret;
+
+	mutex_lock(&c->discard_buckets_in_flight_lock);
+	darray_for_each(c->discard_buckets_in_flight, i)
+		if (bkey_eq(*i, bucket)) {
+			ret = -EEXIST;
+			goto out;
+		}
+
+	ret = darray_push(&c->discard_buckets_in_flight, bucket);
+out:
+	mutex_unlock(&c->discard_buckets_in_flight_lock);
+	return ret;
+}
+
+static void discard_in_flight_remove(struct bch_fs *c, struct bpos bucket)
+{
+	mutex_lock(&c->discard_buckets_in_flight_lock);
+	darray_for_each(c->discard_buckets_in_flight, i)
+		if (bkey_eq(*i, bucket)) {
+			darray_remove_item(&c->discard_buckets_in_flight, i);
+			goto found;
+		}
+	BUG();
+found:
+	mutex_unlock(&c->discard_buckets_in_flight_lock);
+}
+
 struct discard_buckets_state {
 	u64		seen;
 	u64		open;
@@ -1642,6 +1676,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
 	struct bch_dev *ca;
 	struct bkey_i_alloc_v4 *a;
 	struct printbuf buf = PRINTBUF;
+	bool discard_locked = false;
 	int ret = 0;
 
 	ca = bch_dev_bkey_exists(c, pos.inode);
@@ -1678,37 +1713,45 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
 	if (ret)
 		goto out;
 
-	if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
-		a->v.gen++;
-		SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
-		goto write;
-	}
-
-	if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
-		if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
-			bch2_trans_inconsistent(trans,
-				"clearing need_discard but journal_seq %llu > flushed_seq %llu\n"
-				"%s",
-				a->v.journal_seq,
-				c->journal.flushed_seq_ondisk,
-				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+	if (a->v.dirty_sectors) {
+		if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
+					       trans, "attempting to discard bucket with dirty data\n%s",
+					       (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
 			ret = -EIO;
-		}
 		goto out;
 	}
 
 	if (a->v.data_type != BCH_DATA_need_discard) {
-		if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) {
-			bch2_trans_inconsistent(trans,
-				"bucket incorrectly set in need_discard btree\n"
-				"%s",
-				(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
-			ret = -EIO;
+		if (data_type_is_empty(a->v.data_type) &&
+		    BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) {
+			a->v.gen++;
+			SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false);
+			goto write;
 		}
 
+		if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
+					       trans, "bucket incorrectly set in need_discard btree\n"
+					       "%s",
+					       (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+			ret = -EIO;
 		goto out;
 	}
 
+	if (a->v.journal_seq > c->journal.flushed_seq_ondisk) {
+		if (bch2_trans_inconsistent_on(c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info,
+					       trans, "clearing need_discard but journal_seq %llu > flushed_seq %llu\n%s",
+					       a->v.journal_seq,
+					       c->journal.flushed_seq_ondisk,
+					       (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
+			ret = -EIO;
+		goto out;
+	}
+
+	if (discard_in_flight_add(c, SPOS(iter.pos.inode, iter.pos.offset, true)))
+		goto out;
+
+	discard_locked = true;
+
 	if (!bkey_eq(*discard_pos_done, iter.pos) &&
 	    ca->mi.discard && !c->opts.nochanges) {
 		/*
@@ -1740,6 +1783,8 @@ write:
 	count_event(c, bucket_discard);
 	s->discarded++;
 out:
+	if (discard_locked)
+		discard_in_flight_remove(c, iter.pos);
 	s->seen++;
 	bch2_trans_iter_exit(trans, &iter);
 	percpu_ref_put(&ca->io_ref);
@@ -1779,6 +1824,94 @@ void bch2_do_discards(struct bch_fs *c)
 		bch2_write_ref_put(c, BCH_WRITE_REF_discard);
 }
 
+static int bch2_clear_bucket_needs_discard(struct btree_trans *trans, struct bpos bucket)
+{
+	struct btree_iter iter;
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, bucket, BTREE_ITER_INTENT);
+	struct bkey_s_c k = bch2_btree_iter_peek_slot(&iter);
+	int ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut(trans, k);
+	ret = PTR_ERR_OR_ZERO(a);
+	if (ret)
+		goto err;
+
+	BUG_ON(a->v.dirty_sectors);
+	SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false);
+	a->v.data_type = alloc_data_type(a->v, a->v.data_type);
+
+	ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+static void bch2_do_discards_fast_work(struct work_struct *work)
+{
+	struct bch_fs *c = container_of(work, struct bch_fs, discard_fast_work);
+
+	while (1) {
+		bool got_bucket = false;
+		struct bpos bucket;
+		struct bch_dev *ca;
+
+		mutex_lock(&c->discard_buckets_in_flight_lock);
+		darray_for_each(c->discard_buckets_in_flight, i) {
+			if (i->snapshot)
+				continue;
+
+			ca = bch_dev_bkey_exists(c, i->inode);
+
+			if (!percpu_ref_tryget(&ca->io_ref)) {
+				darray_remove_item(&c->discard_buckets_in_flight, i);
+				continue;
+			}
+
+			got_bucket = true;
+			bucket = *i;
+			i->snapshot = true;
+			break;
+		}
+		mutex_unlock(&c->discard_buckets_in_flight_lock);
+
+		if (!got_bucket)
+			break;
+
+		if (ca->mi.discard && !c->opts.nochanges)
+			blkdev_issue_discard(ca->disk_sb.bdev,
+					     bucket.offset * ca->mi.bucket_size,
+					     ca->mi.bucket_size,
+					     GFP_KERNEL);
+
+		int ret = bch2_trans_do(c, NULL, NULL,
+					BCH_WATERMARK_btree|
+					BCH_TRANS_COMMIT_no_enospc,
+					bch2_clear_bucket_needs_discard(trans, bucket));
+		bch_err_fn(c, ret);
+
+		percpu_ref_put(&ca->io_ref);
+		discard_in_flight_remove(c, bucket);
+
+		if (ret)
+			break;
+	}
+
+	bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
+}
+
+static void bch2_discard_one_bucket_fast(struct bch_fs *c, struct bpos bucket)
+{
+	struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
+
+	if (!percpu_ref_is_dying(&ca->io_ref) &&
+	    !discard_in_flight_add(c, bucket) &&
+	    bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast) &&
+	    !queue_work(c->write_ref_wq, &c->discard_fast_work))
+		bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast);
+}
+
 static int invalidate_one_bucket(struct btree_trans *trans,
 				 struct btree_iter *lru_iter,
 				 struct bkey_s_c lru_k,
@@ -1813,6 +1946,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
 		goto out;
 
 	BUG_ON(a->v.data_type != BCH_DATA_cached);
+	BUG_ON(a->v.dirty_sectors);
 
 	if (!a->v.cached_sectors)
 		bch_err(c, "invalidating empty bucket, confused");
@@ -2210,9 +2344,16 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca)
 			set_bit(ca->dev_idx, c->rw_devs[i].d);
 }
 
+void bch2_fs_allocator_background_exit(struct bch_fs *c)
+{
+	darray_exit(&c->discard_buckets_in_flight);
+}
+
 void bch2_fs_allocator_background_init(struct bch_fs *c)
 {
 	spin_lock_init(&c->freelist_lock);
+	mutex_init(&c->discard_buckets_in_flight_lock);
 	INIT_WORK(&c->discard_work, bch2_do_discards_work);
+	INIT_WORK(&c->discard_fast_work, bch2_do_discards_fast_work);
 	INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work);
 }
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index e7f7e842ee1b..052b2fac25d6 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -269,6 +269,7 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *);
 void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
 void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
 
+void bch2_fs_allocator_background_exit(struct bch_fs *);
 void bch2_fs_allocator_background_init(struct bch_fs *);
 
 #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 633d3223b353..a1fc30adf912 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -188,8 +188,10 @@ long bch2_bucket_alloc_new_fs(struct bch_dev *ca)
 static inline unsigned open_buckets_reserved(enum bch_watermark watermark)
 {
 	switch (watermark) {
-	case BCH_WATERMARK_reclaim:
+	case BCH_WATERMARK_interior_updates:
 		return 0;
+	case BCH_WATERMARK_reclaim:
+		return OPEN_BUCKETS_COUNT / 6;
 	case BCH_WATERMARK_btree:
 	case BCH_WATERMARK_btree_copygc:
 		return OPEN_BUCKETS_COUNT / 4;
@@ -236,8 +238,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
 		if (cl)
 			closure_wait(&c->open_buckets_wait, cl);
 
-		track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
-				   &c->blocked_allocate_open_bucket, true);
+		track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true);
 		spin_unlock(&c->freelist_lock);
 		return ERR_PTR(-BCH_ERR_open_buckets_empty);
 	}
@@ -263,11 +264,8 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
 	ca->nr_open_buckets++;
 	bch2_open_bucket_hash_add(c, ob);
 
-	track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
-			   &c->blocked_allocate_open_bucket, false);
-
-	track_event_change(&c->times[BCH_TIME_blocked_allocate],
-			   &c->blocked_allocate, false);
+	track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], false);
+	track_event_change(&c->times[BCH_TIME_blocked_allocate], false);
 
 	spin_unlock(&c->freelist_lock);
 	return ob;
@@ -555,8 +553,7 @@ again:
 			goto again;
 		}
 
-		track_event_change(&c->times[BCH_TIME_blocked_allocate],
-				   &c->blocked_allocate, true);
+		track_event_change(&c->times[BCH_TIME_blocked_allocate], true);
 
 		ob = ERR_PTR(-BCH_ERR_freelist_empty);
 		goto err;
@@ -1361,15 +1358,17 @@ retry:
 
 		/* Don't retry from all devices if we're out of open buckets: */
 		if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) {
-			int ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
+			int ret2 = open_bucket_add_buckets(trans, &ptrs, wp, devs_have,
 					      target, erasure_code,
 					      nr_replicas, &nr_effective,
 					      &have_cache, watermark,
 					      flags, cl);
-			if (!ret ||
-			    bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
-			    bch2_err_matches(ret, BCH_ERR_open_buckets_empty))
+			if (!ret2 ||
+			    bch2_err_matches(ret2, BCH_ERR_transaction_restart) ||
+			    bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) {
+				ret = ret2;
 				goto alloc_done;
+			}
 		}
 
 		/*
diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h
index b91b7a461056..c2226e947c41 100644
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
@@ -22,7 +22,8 @@ struct bucket_alloc_state {
 	x(copygc)			\
 	x(btree)			\
 	x(btree_copygc)			\
-	x(reclaim)
+	x(reclaim)			\
+	x(interior_updates)
 
 enum bch_watermark {
 #define x(name)	BCH_WATERMARK_##name,
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index 569b97904da4..fadb1078903d 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -8,6 +8,7 @@
 #include "btree_update.h"
 #include "btree_update_interior.h"
 #include "btree_write_buffer.h"
+#include "checksum.h"
 #include "error.h"
 
 #include <linux/mm.h>
@@ -29,8 +30,7 @@ static bool extent_matches_bp(struct bch_fs *c,
 		if (p.ptr.cached)
 			continue;
 
-		bch2_extent_ptr_to_bp(c, btree_id, level, k, p,
-				      &bucket2, &bp2);
+		bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bucket2, &bp2);
 		if (bpos_eq(bucket, bucket2) &&
 		    !memcmp(&bp, &bp2, sizeof(bp)))
 			return true;
@@ -44,13 +44,20 @@ int bch2_backpointer_invalid(struct bch_fs *c, struct bkey_s_c k,
 			     struct printbuf *err)
 {
 	struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k);
+
+	/* these will be caught by fsck */
+	if (!bch2_dev_exists2(c, bp.k->p.inode))
+		return 0;
+
+	struct bch_dev *ca = bch_dev_bkey_exists(c, bp.k->p.inode);
 	struct bpos bucket = bp_pos_to_bucket(c, bp.k->p);
 	int ret = 0;
 
-	bkey_fsck_err_on(!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)),
+	bkey_fsck_err_on((bp.v->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) >= ca->mi.bucket_size ||
+			 !bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset)),
 			 c, err,
-			 backpointer_pos_wrong,
-			 "backpointer at wrong pos");
+			 backpointer_bucket_offset_wrong,
+			 "backpointer bucket_offset wrong");
 fsck_err:
 	return ret;
 }
@@ -131,8 +138,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
 	printbuf_exit(&buf);
 
 	if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) {
-		bch2_inconsistent_error(c);
-		return -EIO;
+		return bch2_inconsistent_error(c) ? BCH_ERR_erofs_unfixed_errors : 0;
 	} else {
 		return 0;
 	}
@@ -379,7 +385,7 @@ static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_
 			backpointer_to_missing_alloc,
 			"backpointer for nonexistent alloc key: %llu:%llu:0\n%s",
 			alloc_iter.pos.inode, alloc_iter.pos.offset,
-			(bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
 		ret = bch2_btree_delete_at(trans, bp_iter, 0);
 		goto out;
 	}
@@ -415,6 +421,84 @@ struct extents_to_bp_state {
 	struct bkey_buf last_flushed;
 };
 
+static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree,
+			       struct bkey_s_c extent, unsigned dev)
+{
+	struct bkey_i *n = bch2_bkey_make_mut_noupdate(trans, extent);
+	int ret = PTR_ERR_OR_ZERO(n);
+	if (ret)
+		return ret;
+
+	bch2_bkey_drop_device(bkey_i_to_s(n), dev);
+	return bch2_btree_insert_trans(trans, btree, n, 0);
+}
+
+static int check_extent_checksum(struct btree_trans *trans,
+				 enum btree_id btree, struct bkey_s_c extent,
+				 enum btree_id o_btree, struct bkey_s_c extent2, unsigned dev)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent);
+	const union bch_extent_entry *entry;
+	struct extent_ptr_decoded p;
+	struct printbuf buf = PRINTBUF;
+	void *data_buf = NULL;
+	struct bio *bio = NULL;
+	size_t bytes;
+	int ret = 0;
+
+	if (bkey_is_btree_ptr(extent.k))
+		return false;
+
+	bkey_for_each_ptr_decode(extent.k, ptrs, p, entry)
+		if (p.ptr.dev == dev)
+			goto found;
+	BUG();
+found:
+	if (!p.crc.csum_type)
+		return false;
+
+	bytes = p.crc.compressed_size << 9;
+
+	struct bch_dev *ca = bch_dev_bkey_exists(c, dev);
+	if (!bch2_dev_get_ioref(ca, READ))
+		return false;
+
+	data_buf = kvmalloc(bytes, GFP_KERNEL);
+	if (!data_buf) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	bio = bio_alloc(ca->disk_sb.bdev, 1, REQ_OP_READ, GFP_KERNEL);
+	bio->bi_iter.bi_sector = p.ptr.offset;
+	bch2_bio_map(bio, data_buf, bytes);
+	ret = submit_bio_wait(bio);
+	if (ret)
+		goto err;
+
+	prt_str(&buf, "extents pointing to same space, but first extent checksum bad:");
+	prt_printf(&buf, "\n  %s ", bch2_btree_id_str(btree));
+	bch2_bkey_val_to_text(&buf, c, extent);
+	prt_printf(&buf, "\n  %s ", bch2_btree_id_str(o_btree));
+	bch2_bkey_val_to_text(&buf, c, extent2);
+
+	struct nonce nonce = extent_nonce(extent.k->version, p.crc);
+	struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes);
+	if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum),
+			c, dup_backpointer_to_bad_csum_extent,
+			"%s", buf.buf))
+		ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1;
+fsck_err:
+err:
+	if (bio)
+		bio_put(bio);
+	kvfree(data_buf);
+	percpu_ref_put(&ca->io_ref);
+	printbuf_exit(&buf);
+	return ret;
+}
+
 static int check_bp_exists(struct btree_trans *trans,
 			   struct extents_to_bp_state *s,
 			   struct bpos bucket,
@@ -422,7 +506,8 @@ static int check_bp_exists(struct btree_trans *trans,
 			   struct bkey_s_c orig_k)
 {
 	struct bch_fs *c = trans->c;
-	struct btree_iter bp_iter = { NULL };
+	struct btree_iter bp_iter = {};
+	struct btree_iter other_extent_iter = {};
 	struct printbuf buf = PRINTBUF;
 	struct bkey_s_c bp_k;
 	struct bkey_buf tmp;
@@ -430,13 +515,19 @@ static int check_bp_exists(struct btree_trans *trans,
 
 	bch2_bkey_buf_init(&tmp);
 
+	if (!bch2_dev_bucket_exists(c, bucket)) {
+		prt_str(&buf, "extent for nonexistent device:bucket ");
+		bch2_bpos_to_text(&buf, bucket);
+		prt_str(&buf, "\n  ");
+		bch2_bkey_val_to_text(&buf, c, orig_k);
+		bch_err(c, "%s", buf.buf);
+		return -BCH_ERR_fsck_repair_unimplemented;
+	}
+
 	if (bpos_lt(bucket, s->bucket_start) ||
 	    bpos_gt(bucket, s->bucket_end))
 		return 0;
 
-	if (!bch2_dev_bucket_exists(c, bucket))
-		goto missing;
-
 	bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
 				  bucket_pos_to_bp(c, bucket, bp.bucket_offset),
 				  0);
@@ -462,24 +553,96 @@ static int check_bp_exists(struct btree_trans *trans,
 			ret = -BCH_ERR_transaction_restart_write_buffer_flush;
 			goto out;
 		}
-		goto missing;
+
+		goto check_existing_bp;
 	}
 out:
 err:
 fsck_err:
+	bch2_trans_iter_exit(trans, &other_extent_iter);
 	bch2_trans_iter_exit(trans, &bp_iter);
 	bch2_bkey_buf_exit(&tmp, c);
 	printbuf_exit(&buf);
 	return ret;
+check_existing_bp:
+	/* Do we have a backpointer for a different extent? */
+	if (bp_k.k->type != KEY_TYPE_backpointer)
+		goto missing;
+
+	struct bch_backpointer other_bp = *bkey_s_c_to_backpointer(bp_k).v;
+
+	struct bkey_s_c other_extent =
+		bch2_backpointer_get_key(trans, &other_extent_iter, bp_k.k->p, other_bp, 0);
+	ret = bkey_err(other_extent);
+	if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node)
+		ret = 0;
+	if (ret)
+		goto err;
+
+	if (!other_extent.k)
+		goto missing;
+
+	if (bch2_extents_match(orig_k, other_extent)) {
+		printbuf_reset(&buf);
+		prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n  ");
+		bch2_bkey_val_to_text(&buf, c, orig_k);
+		prt_str(&buf, "\n  ");
+		bch2_bkey_val_to_text(&buf, c, other_extent);
+		bch_err(c, "%s", buf.buf);
+
+		if (other_extent.k->size <= orig_k.k->size) {
+			ret = drop_dev_and_update(trans, other_bp.btree_id, other_extent, bucket.inode);
+			if (ret)
+				goto err;
+			goto out;
+		} else {
+			ret = drop_dev_and_update(trans, bp.btree_id, orig_k, bucket.inode);
+			if (ret)
+				goto err;
+			goto missing;
+		}
+	}
+
+	ret = check_extent_checksum(trans, other_bp.btree_id, other_extent, bp.btree_id, orig_k, bucket.inode);
+	if (ret < 0)
+		goto err;
+	if (ret) {
+		ret = 0;
+		goto missing;
+	}
+
+	ret = check_extent_checksum(trans, bp.btree_id, orig_k, other_bp.btree_id, other_extent, bucket.inode);
+	if (ret < 0)
+		goto err;
+	if (ret) {
+		ret = 0;
+		goto out;
+	}
+
+	printbuf_reset(&buf);
+	prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n  ", bucket.inode);
+	bch2_bkey_val_to_text(&buf, c, orig_k);
+	prt_str(&buf, "\n  ");
+	bch2_bkey_val_to_text(&buf, c, other_extent);
+	bch_err(c, "%s", buf.buf);
+	ret = -BCH_ERR_fsck_repair_unimplemented;
+	goto err;
 missing:
+	printbuf_reset(&buf);
 	prt_printf(&buf, "missing backpointer for btree=%s l=%u ",
 	       bch2_btree_id_str(bp.btree_id), bp.level);
 	bch2_bkey_val_to_text(&buf, c, orig_k);
-	prt_printf(&buf, "\nbp pos ");
-	bch2_bpos_to_text(&buf, bp_iter.pos);
+	prt_printf(&buf, "\n  got:   ");
+	bch2_bkey_val_to_text(&buf, c, bp_k);
+
+	struct bkey_i_backpointer n_bp_k;
+	bkey_backpointer_init(&n_bp_k.k_i);
+	n_bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset);
+	n_bp_k.v = bp;
+	prt_printf(&buf, "\n  want:  ");
+	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&n_bp_k.k_i));
 
-	if (c->opts.reconstruct_alloc ||
-	    fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
+	if (fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
 		ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true);
 
 	goto out;
@@ -504,8 +667,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
 		if (p.ptr.cached)
 			continue;
 
-		bch2_extent_ptr_to_bp(c, btree, level,
-				      k, p, &bucket_pos, &bp);
+		bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bucket_pos, &bp);
 
 		ret = check_bp_exists(trans, s, bucket_pos, bp, k);
 		if (ret)
@@ -555,60 +717,61 @@ static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp)
 	};
 }
 
-static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
+static u64 mem_may_pin_bytes(struct bch_fs *c)
 {
 	struct sysinfo i;
-	u64 mem_bytes;
-
 	si_meminfo(&i);
-	mem_bytes = i.totalram * i.mem_unit;
-	return div_u64(mem_bytes >> 1, c->opts.btree_node_size);
+
+	u64 mem_bytes = i.totalram * i.mem_unit;
+	return div_u64(mem_bytes * c->opts.fsck_memory_usage_percent, 100);
+}
+
+static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
+{
+	return div_u64(mem_may_pin_bytes(c), c->opts.btree_node_size);
 }
 
 static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
-					unsigned btree_leaf_mask,
-					unsigned btree_interior_mask,
+					u64 btree_leaf_mask,
+					u64 btree_interior_mask,
 					struct bbpos start, struct bbpos *end)
 {
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	size_t btree_nodes = btree_nodes_fit_in_ram(trans->c);
-	enum btree_id btree;
+	struct bch_fs *c = trans->c;
+	s64 mem_may_pin = mem_may_pin_bytes(c);
 	int ret = 0;
 
-	for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) {
-		unsigned depth = ((1U << btree) & btree_leaf_mask) ? 1 : 2;
+	btree_interior_mask |= btree_leaf_mask;
+
+	c->btree_cache.pinned_nodes_leaf_mask		= btree_leaf_mask;
+	c->btree_cache.pinned_nodes_interior_mask	= btree_interior_mask;
+	c->btree_cache.pinned_nodes_start		= start;
+	c->btree_cache.pinned_nodes_end			= *end = BBPOS_MAX;
+
+	for (enum btree_id btree = start.btree;
+	     btree < BTREE_ID_NR && !ret;
+	     btree++) {
+		unsigned depth = ((1U << btree) & btree_leaf_mask) ? 0 : 1;
+		struct btree_iter iter;
+		struct btree *b;
 
 		if (!((1U << btree) & btree_leaf_mask) &&
 		    !((1U << btree) & btree_interior_mask))
 			continue;
 
-		bch2_trans_node_iter_init(trans, &iter, btree,
-					  btree == start.btree ? start.pos : POS_MIN,
-					  0, depth, 0);
-		/*
-		 * for_each_btree_key_contineu() doesn't check the return value
-		 * from bch2_btree_iter_advance(), which is needed when
-		 * iterating over interior nodes where we'll see keys at
-		 * SPOS_MAX:
-		 */
-		do {
-			k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0);
-			ret = bkey_err(k);
-			if (!k.k || ret)
-				break;
-
-			--btree_nodes;
-			if (!btree_nodes) {
-				*end = BBPOS(btree, k.k->p);
+		__for_each_btree_node(trans, iter, btree,
+				      btree == start.btree ? start.pos : POS_MIN,
+				      0, depth, BTREE_ITER_PREFETCH, b, ret) {
+			mem_may_pin -= btree_buf_bytes(b);
+			if (mem_may_pin <= 0) {
+				c->btree_cache.pinned_nodes_end = *end =
+					BBPOS(btree, b->key.k.p);
 				bch2_trans_iter_exit(trans, &iter);
 				return 0;
 			}
-		} while (bch2_btree_iter_advance(&iter));
+		}
 		bch2_trans_iter_exit(trans, &iter);
 	}
 
-	*end = BBPOS_MAX;
 	return ret;
 }
 
@@ -666,62 +829,6 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
 	return 0;
 }
 
-static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c,
-					 struct bpos bucket)
-{
-	return bch2_dev_exists2(c, bucket.inode)
-		? bucket_pos_to_bp(c, bucket, 0)
-		: bucket;
-}
-
-static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans,
-					struct bpos start, struct bpos *end)
-{
-	struct btree_iter alloc_iter;
-	struct btree_iter bp_iter;
-	struct bkey_s_c alloc_k, bp_k;
-	size_t btree_nodes = btree_nodes_fit_in_ram(trans->c);
-	bool alloc_end = false, bp_end = false;
-	int ret = 0;
-
-	bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc,
-				  start, 0, 1, 0);
-	bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers,
-				  bucket_pos_to_bp_safe(trans->c, start), 0, 1, 0);
-	while (1) {
-		alloc_k = !alloc_end
-			? __bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0)
-			: bkey_s_c_null;
-		bp_k = !bp_end
-			? __bch2_btree_iter_peek_and_restart(trans, &bp_iter, 0)
-			: bkey_s_c_null;
-
-		ret = bkey_err(alloc_k) ?: bkey_err(bp_k);
-		if ((!alloc_k.k && !bp_k.k) || ret) {
-			*end = SPOS_MAX;
-			break;
-		}
-
-		--btree_nodes;
-		if (!btree_nodes) {
-			*end = alloc_k.k ? alloc_k.k->p : SPOS_MAX;
-			break;
-		}
-
-		if (bpos_lt(alloc_iter.pos, SPOS_MAX) &&
-		    bpos_lt(bucket_pos_to_bp_safe(trans->c, alloc_iter.pos), bp_iter.pos)) {
-			if (!bch2_btree_iter_advance(&alloc_iter))
-				alloc_end = true;
-		} else {
-			if (!bch2_btree_iter_advance(&bp_iter))
-				bp_end = true;
-		}
-	}
-	bch2_trans_iter_exit(trans, &bp_iter);
-	bch2_trans_iter_exit(trans, &alloc_iter);
-	return ret;
-}
-
 int bch2_check_extents_to_backpointers(struct bch_fs *c)
 {
 	struct btree_trans *trans = bch2_trans_get(c);
@@ -732,10 +839,16 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
 	bkey_init(&s.last_flushed.k->k);
 
 	while (1) {
-		ret = bch2_get_alloc_in_memory_pos(trans, s.bucket_start, &s.bucket_end);
+		struct bbpos end;
+		ret = bch2_get_btree_in_memory_pos(trans,
+				BIT_ULL(BTREE_ID_backpointers),
+				BIT_ULL(BTREE_ID_backpointers),
+				BBPOS(BTREE_ID_backpointers, s.bucket_start), &end);
 		if (ret)
 			break;
 
+		s.bucket_end = end.pos;
+
 		if ( bpos_eq(s.bucket_start, POS_MIN) &&
 		    !bpos_eq(s.bucket_end, SPOS_MAX))
 			bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass",
@@ -763,6 +876,9 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c)
 	bch2_trans_put(trans);
 	bch2_bkey_buf_exit(&s.last_flushed, c);
 
+	c->btree_cache.pinned_nodes_leaf_mask = 0;
+	c->btree_cache.pinned_nodes_interior_mask = 0;
+
 	bch_err_fn(c, ret);
 	return ret;
 }
@@ -868,6 +984,9 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
 	}
 	bch2_trans_put(trans);
 
+	c->btree_cache.pinned_nodes_leaf_mask = 0;
+	c->btree_cache.pinned_nodes_interior_mask = 0;
+
 	bch_err_fn(c, ret);
 	return ret;
 }
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
index 327365a9feac..85949b9fd880 100644
--- a/fs/bcachefs/backpointers.h
+++ b/fs/bcachefs/backpointers.h
@@ -53,14 +53,11 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c,
 					   u64 bucket_offset)
 {
 	struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode);
-	struct bpos ret;
-
-	ret = POS(bucket.inode,
-		  (bucket_to_sector(ca, bucket.offset) <<
-		   MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset);
+	struct bpos ret = POS(bucket.inode,
+			      (bucket_to_sector(ca, bucket.offset) <<
+			       MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset);
 
 	EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret)));
-
 	return ret;
 }
 
@@ -90,20 +87,40 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
 	return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k.k_i);
 }
 
-static inline enum bch_data_type bkey_ptr_data_type(enum btree_id btree_id, unsigned level,
-						    struct bkey_s_c k, struct extent_ptr_decoded p)
+static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k,
+							 struct extent_ptr_decoded p,
+							 const union bch_extent_entry *entry)
 {
-	return  level		? BCH_DATA_btree :
-		p.has_ec	? BCH_DATA_stripe :
-				  BCH_DATA_user;
+	switch (k.k->type) {
+	case KEY_TYPE_btree_ptr:
+	case KEY_TYPE_btree_ptr_v2:
+		return BCH_DATA_btree;
+	case KEY_TYPE_extent:
+	case KEY_TYPE_reflink_v:
+		return p.has_ec ? BCH_DATA_stripe : BCH_DATA_user;
+	case KEY_TYPE_stripe: {
+		const struct bch_extent_ptr *ptr = &entry->ptr;
+		struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
+
+		BUG_ON(ptr < s.v->ptrs ||
+		       ptr >= s.v->ptrs + s.v->nr_blocks);
+
+		return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant
+			? BCH_DATA_parity
+			: BCH_DATA_user;
+	}
+	default:
+		BUG();
+	}
 }
 
 static inline void bch2_extent_ptr_to_bp(struct bch_fs *c,
 			   enum btree_id btree_id, unsigned level,
 			   struct bkey_s_c k, struct extent_ptr_decoded p,
+			   const union bch_extent_entry *entry,
 			   struct bpos *bucket_pos, struct bch_backpointer *bp)
 {
-	enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p);
+	enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry);
 	s64 sectors = level ? btree_sectors(c) : k.k->size;
 	u32 bucket_offset;
 
diff --git a/fs/bcachefs/bbpos_types.h b/fs/bcachefs/bbpos_types.h
index 5198e94cf3b8..f63893344f80 100644
--- a/fs/bcachefs/bbpos_types.h
+++ b/fs/bcachefs/bbpos_types.h
@@ -13,6 +13,6 @@ static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos)
 }
 
 #define BBPOS_MIN	BBPOS(0, POS_MIN)
-#define BBPOS_MAX	BBPOS(BTREE_ID_NR - 1, POS_MAX)
+#define BBPOS_MAX	BBPOS(BTREE_ID_NR - 1, SPOS_MAX)
 
 #endif /* _BCACHEFS_BBPOS_TYPES_H */
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index 69d0d60d50e3..91c3c1fef233 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -209,9 +209,10 @@
 #include "fifo.h"
 #include "nocow_locking_types.h"
 #include "opts.h"
-#include "recovery_types.h"
+#include "recovery_passes_types.h"
 #include "sb-errors_types.h"
 #include "seqmutex.h"
+#include "time_stats.h"
 #include "util.h"
 
 #ifdef CONFIG_BCACHEFS_DEBUG
@@ -266,6 +267,9 @@ do {									\
 #define bch2_fmt(_c, fmt)		bch2_log_msg(_c, fmt "\n")
 
 __printf(2, 3)
+void bch2_print_opts(struct bch_opts *, const char *, ...);
+
+__printf(2, 3)
 void __bch2_print(struct bch_fs *c, const char *fmt, ...);
 
 #define maybe_dev_to_fs(_c)	_Generic((_c),				\
@@ -452,6 +456,7 @@ enum bch_time_stats {
 
 #include "alloc_types.h"
 #include "btree_types.h"
+#include "btree_node_scan_types.h"
 #include "btree_write_buffer_types.h"
 #include "buckets_types.h"
 #include "buckets_waiting_for_journal_types.h"
@@ -504,6 +509,7 @@ enum gc_phase {
 	GC_PHASE_BTREE_deleted_inodes,
 	GC_PHASE_BTREE_logged_ops,
 	GC_PHASE_BTREE_rebalance_work,
+	GC_PHASE_BTREE_subvolume_children,
 
 	GC_PHASE_PENDING_DELETE,
 };
@@ -593,7 +599,7 @@ struct bch_dev {
 
 	/* The rest of this all shows up in sysfs */
 	atomic64_t		cur_latency[2];
-	struct bch2_time_stats	io_latency[2];
+	struct bch2_time_stats_quantiles io_latency[2];
 
 #define CONGESTED_MAX		1024
 	atomic_t		congested;
@@ -609,6 +615,7 @@ struct bch_dev {
  */
 
 #define BCH_FS_FLAGS()			\
+	x(new_fs)			\
 	x(started)			\
 	x(may_go_rw)			\
 	x(rw)				\
@@ -663,6 +670,8 @@ struct journal_seq_blacklist_table {
 };
 
 struct journal_keys {
+	/* must match layout in darray_types.h */
+	size_t			nr, size;
 	struct journal_key {
 		u64		journal_seq;
 		u32		journal_offset;
@@ -671,15 +680,13 @@ struct journal_keys {
 		bool		allocated;
 		bool		overwritten;
 		struct bkey_i	*k;
-	}			*d;
+	}			*data;
 	/*
 	 * Gap buffer: instead of all the empty space in the array being at the
 	 * end of the buffer - from @nr to @size - the empty space is at @gap.
 	 * This means that sequential insertions are O(n) instead of O(n^2).
 	 */
 	size_t			gap;
-	size_t			nr;
-	size_t			size;
 	atomic_t		ref;
 	bool			initial_ref_held;
 };
@@ -702,7 +709,10 @@ struct btree_trans_buf {
 	x(stripe_delete)						\
 	x(reflink)							\
 	x(fallocate)							\
+	x(fsync)							\
+	x(dio_write)							\
 	x(discard)							\
+	x(discard_fast)							\
 	x(invalidate)							\
 	x(delete_dead_snapshots)					\
 	x(snapshot_delete_pagecache)					\
@@ -790,6 +800,7 @@ struct bch_fs {
 		u64		features;
 		u64		compat;
 		unsigned long	errors_silent[BITS_TO_LONGS(BCH_SB_ERR_MAX)];
+		u64		btrees_lost_data;
 	}			sb;
 
 
@@ -804,7 +815,6 @@ struct bch_fs {
 
 	/* snapshot.c: */
 	struct snapshot_table __rcu *snapshots;
-	size_t			snapshot_table_size;
 	struct mutex		snapshot_table_lock;
 	struct rw_semaphore	snapshot_create_lock;
 
@@ -843,6 +853,8 @@ struct bch_fs {
 	struct workqueue_struct	*btree_interior_update_worker;
 	struct work_struct	btree_interior_update_work;
 
+	struct workqueue_struct	*btree_node_rewrite_worker;
+
 	struct list_head	pending_node_rewrites;
 	struct mutex		pending_node_rewrites_lock;
 
@@ -919,8 +931,6 @@ struct bch_fs {
 	/* ALLOCATOR */
 	spinlock_t		freelist_lock;
 	struct closure_waitlist	freelist_wait;
-	u64			blocked_allocate;
-	u64			blocked_allocate_open_bucket;
 
 	open_bucket_idx_t	open_buckets_freelist;
 	open_bucket_idx_t	open_buckets_nr_free;
@@ -940,8 +950,11 @@ struct bch_fs {
 	unsigned		write_points_nr;
 
 	struct buckets_waiting_for_journal buckets_waiting_for_journal;
-	struct work_struct	discard_work;
 	struct work_struct	invalidate_work;
+	struct work_struct	discard_work;
+	struct mutex		discard_buckets_in_flight_lock;
+	DARRAY(struct bpos)	discard_buckets_in_flight;
+	struct work_struct	discard_fast_work;
 
 	/* GARBAGE COLLECTION */
 	struct task_struct	*gc_thread;
@@ -1095,6 +1108,8 @@ struct bch_fs {
 	struct journal_keys	journal_keys;
 	struct list_head	journal_iters;
 
+	struct find_btree_nodes	found_btree_nodes;
+
 	u64			last_bucket_seq_cleanup;
 
 	u64			counters_on_mount[BCH_COUNTER_NR];
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index 0668b682a21c..085987435a5e 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -189,7 +189,11 @@ struct bversion {
 	__u32		hi;
 	__u64		lo;
 #endif
-} __packed __aligned(4);
+} __packed
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+__aligned(4)
+#endif
+;
 
 struct bkey {
 	/* Size of combined key and value, in u64s */
@@ -222,7 +226,36 @@ struct bkey {
 
 	__u8		pad[1];
 #endif
-} __packed __aligned(8);
+} __packed
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+/*
+ * The big-endian version of bkey can't be compiled by rustc with the "aligned"
+ * attr since it doesn't allow types to have both "packed" and "aligned" attrs.
+ * So for Rust compatibility, don't include this. It can be included in the LE
+ * version because the "packed" attr is redundant in that case.
+ *
+ * History: (quoting Kent)
+ *
+ * Specifically, when i was designing bkey, I wanted the header to be no
+ * bigger than necessary so that bkey_packed could use the rest. That means that
+ * decently offten extent keys will fit into only 8 bytes, instead of spilling over
+ * to 16.
+ *
+ * But packed_bkey treats the part after the header - the packed section -
+ * as a single multi word, variable length integer. And bkey, the unpacked
+ * version, is just a special case version of a bkey_packed; all the packed
+ * bkey code will work on keys in any packed format, the in-memory
+ * representation of an unpacked key also is just one type of packed key...
+ *
+ * So that constrains the key part of a bkig endian bkey to start right
+ * after the header.
+ *
+ * If we ever do a bkey_v2 and need to expand the hedaer by another byte for
+ * some reason - that will clean up this wart.
+ */
+__aligned(8)
+#endif
+;
 
 struct bkey_packed {
 	__u64		_data[0];
@@ -545,7 +578,8 @@ struct bch_member {
 	__le64			nbuckets;	/* device size */
 	__le16			first_bucket;   /* index of first bucket used */
 	__le16			bucket_size;	/* sectors */
-	__le32			pad;
+	__u8			btree_bitmap_shift;
+	__u8			pad[3];
 	__le64			last_mount;	/* time_t */
 
 	__le64			flags;
@@ -554,6 +588,7 @@ struct bch_member {
 	__le64			errors_at_reset[BCH_MEMBER_ERROR_NR];
 	__le64			errors_reset_time;
 	__le64			seq;
+	__le64			btree_allocated_bitmap;
 };
 
 #define BCH_MEMBER_V1_BYTES	56
@@ -785,6 +820,7 @@ struct bch_sb_field_ext {
 	struct bch_sb_field	field;
 	__le64			recovery_passes_required[2];
 	__le64			errors_silent[8];
+	__le64			btrees_lost_data;
 };
 
 struct bch_sb_field_downgrade_entry {
@@ -840,7 +876,10 @@ struct bch_sb_field_downgrade {
 	x(snapshot_skiplists,		BCH_VERSION(1,  1))		\
 	x(deleted_inodes,		BCH_VERSION(1,  2))		\
 	x(rebalance_work,		BCH_VERSION(1,  3))		\
-	x(member_seq,			BCH_VERSION(1,  4))
+	x(member_seq,			BCH_VERSION(1,  4))		\
+	x(subvolume_fs_parent,		BCH_VERSION(1,  5))		\
+	x(btree_subvolume_children,	BCH_VERSION(1,  6))		\
+	x(mi_btree_bitmap,		BCH_VERSION(1,  7))
 
 enum bcachefs_metadata_version {
 	bcachefs_metadata_version_min = 9,
@@ -1275,9 +1314,10 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
 	x(dev_usage,		8)		\
 	x(log,			9)		\
 	x(overwrite,		10)		\
-	x(write_buffer_keys,	11)
+	x(write_buffer_keys,	11)		\
+	x(datetime,		12)
 
-enum {
+enum bch_jset_entry_type {
 #define x(f, nr)	BCH_JSET_ENTRY_##f	= nr,
 	BCH_JSET_ENTRY_TYPES()
 #undef x
@@ -1323,7 +1363,7 @@ struct jset_entry_blacklist_v2 {
 	x(inodes,		1)		\
 	x(key_version,		2)
 
-enum {
+enum bch_fs_usage_type {
 #define x(f, nr)	BCH_FS_USAGE_##f	= nr,
 	BCH_FS_USAGE_TYPES()
 #undef x
@@ -1376,6 +1416,11 @@ struct jset_entry_log {
 	u8			d[];
 } __packed __aligned(8);
 
+struct jset_entry_datetime {
+	struct jset_entry	entry;
+	__le64			seconds;
+} __packed __aligned(8);
+
 /*
  * On disk format for a journal entry:
  * seq is monotonically increasing; every journal entry has its own unique
@@ -1482,7 +1527,9 @@ enum btree_id_flags {
 	  BIT_ULL(KEY_TYPE_logged_op_truncate)|					\
 	  BIT_ULL(KEY_TYPE_logged_op_finsert))					\
 	x(rebalance_work,	18,	BTREE_ID_SNAPSHOT_FIELD,		\
-	  BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))
+	  BIT_ULL(KEY_TYPE_set)|BIT_ULL(KEY_TYPE_cookie))			\
+	x(subvolume_children,	19,	0,					\
+	  BIT_ULL(KEY_TYPE_set))
 
 enum btree_id {
 #define x(name, nr, ...) BTREE_ID_##name = nr,
@@ -1491,6 +1538,20 @@ enum btree_id {
 	BTREE_ID_NR
 };
 
+static inline bool btree_id_is_alloc(enum btree_id id)
+{
+	switch (id) {
+	case BTREE_ID_alloc:
+	case BTREE_ID_backpointers:
+	case BTREE_ID_need_discard:
+	case BTREE_ID_freespace:
+	case BTREE_ID_bucket_gens:
+		return true;
+	default:
+		return false;
+	}
+}
+
 #define BTREE_MAX_DEPTH		4U
 
 /* Btree nodes */
diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h
index 831be01809f2..3a45d128f608 100644
--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
@@ -4,7 +4,7 @@
 
 #include <linux/bug.h>
 #include "bcachefs_format.h"
-
+#include "bkey_types.h"
 #include "btree_types.h"
 #include "util.h"
 #include "vstructs.h"
@@ -31,57 +31,6 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *,
 				     const struct bkey_format *,
 				     const struct bkey_packed *);
 
-/* bkey with split value, const */
-struct bkey_s_c {
-	const struct bkey	*k;
-	const struct bch_val	*v;
-};
-
-/* bkey with split value */
-struct bkey_s {
-	union {
-	struct {
-		struct bkey	*k;
-		struct bch_val	*v;
-	};
-	struct bkey_s_c		s_c;
-	};
-};
-
-#define bkey_p_next(_k)		vstruct_next(_k)
-
-static inline struct bkey_i *bkey_next(struct bkey_i *k)
-{
-	return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s);
-}
-
-#define bkey_val_u64s(_k)	((_k)->u64s - BKEY_U64s)
-
-static inline size_t bkey_val_bytes(const struct bkey *k)
-{
-	return bkey_val_u64s(k) * sizeof(u64);
-}
-
-static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
-{
-	unsigned u64s = BKEY_U64s + val_u64s;
-
-	BUG_ON(u64s > U8_MAX);
-	k->u64s = u64s;
-}
-
-static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
-{
-	set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64)));
-}
-
-#define bkey_val_end(_k)	((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
-
-#define bkey_deleted(_k)	((_k)->type == KEY_TYPE_deleted)
-
-#define bkey_whiteout(_k)				\
-	((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
-
 enum bkey_lr_packed {
 	BKEY_PACKED_BOTH,
 	BKEY_PACKED_RIGHT,
@@ -362,10 +311,13 @@ static inline struct bpos bkey_start_pos(const struct bkey *k)
 static inline unsigned bkeyp_key_u64s(const struct bkey_format *format,
 				      const struct bkey_packed *k)
 {
-	unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s;
+	return bkey_packed(k) ? format->key_u64s : BKEY_U64s;
+}
 
-	EBUG_ON(k->u64s < ret);
-	return ret;
+static inline bool bkeyp_u64s_valid(const struct bkey_format *f,
+				    const struct bkey_packed *k)
+{
+	return ((unsigned) k->u64s - bkeyp_key_u64s(f, k) <= U8_MAX - BKEY_U64s);
 }
 
 static inline unsigned bkeyp_key_bytes(const struct bkey_format *format,
@@ -553,155 +505,6 @@ static inline void bkey_reassemble(struct bkey_i *dst,
 	memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k));
 }
 
-#define bkey_s_null		((struct bkey_s)   { .k = NULL })
-#define bkey_s_c_null		((struct bkey_s_c) { .k = NULL })
-
-#define bkey_s_err(err)		((struct bkey_s)   { .k = ERR_PTR(err) })
-#define bkey_s_c_err(err)	((struct bkey_s_c) { .k = ERR_PTR(err) })
-
-static inline struct bkey_s bkey_to_s(struct bkey *k)
-{
-	return (struct bkey_s) { .k = k, .v = NULL };
-}
-
-static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
-{
-	return (struct bkey_s_c) { .k = k, .v = NULL };
-}
-
-static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
-{
-	return (struct bkey_s) { .k = &k->k, .v = &k->v };
-}
-
-static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
-{
-	return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
-}
-
-/*
- * For a given type of value (e.g. struct bch_extent), generates the types for
- * bkey + bch_extent - inline, split, split const - and also all the conversion
- * functions, which also check that the value is of the correct type.
- *
- * We use anonymous unions for upcasting - e.g. converting from e.g. a
- * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
- * functions.
- */
-#define x(name, ...)					\
-struct bkey_i_##name {							\
-	union {								\
-		struct bkey		k;				\
-		struct bkey_i		k_i;				\
-	};								\
-	struct bch_##name		v;				\
-};									\
-									\
-struct bkey_s_c_##name {						\
-	union {								\
-	struct {							\
-		const struct bkey	*k;				\
-		const struct bch_##name	*v;				\
-	};								\
-	struct bkey_s_c			s_c;				\
-	};								\
-};									\
-									\
-struct bkey_s_##name {							\
-	union {								\
-	struct {							\
-		struct bkey		*k;				\
-		struct bch_##name	*v;				\
-	};								\
-	struct bkey_s_c_##name		c;				\
-	struct bkey_s			s;				\
-	struct bkey_s_c			s_c;				\
-	};								\
-};									\
-									\
-static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k)	\
-{									\
-	EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name);	\
-	return container_of(&k->k, struct bkey_i_##name, k);		\
-}									\
-									\
-static inline const struct bkey_i_##name *				\
-bkey_i_to_##name##_c(const struct bkey_i *k)				\
-{									\
-	EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name);	\
-	return container_of(&k->k, struct bkey_i_##name, k);		\
-}									\
-									\
-static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k)	\
-{									\
-	EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name);	\
-	return (struct bkey_s_##name) {					\
-		.k = k.k,						\
-		.v = container_of(k.v, struct bch_##name, v),		\
-	};								\
-}									\
-									\
-static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
-{									\
-	EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name);	\
-	return (struct bkey_s_c_##name) {				\
-		.k = k.k,						\
-		.v = container_of(k.v, struct bch_##name, v),		\
-	};								\
-}									\
-									\
-static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
-{									\
-	return (struct bkey_s_##name) {					\
-		.k = &k->k,						\
-		.v = &k->v,						\
-	};								\
-}									\
-									\
-static inline struct bkey_s_c_##name					\
-name##_i_to_s_c(const struct bkey_i_##name *k)				\
-{									\
-	return (struct bkey_s_c_##name) {				\
-		.k = &k->k,						\
-		.v = &k->v,						\
-	};								\
-}									\
-									\
-static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k)	\
-{									\
-	EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name);	\
-	return (struct bkey_s_##name) {					\
-		.k = &k->k,						\
-		.v = container_of(&k->v, struct bch_##name, v),		\
-	};								\
-}									\
-									\
-static inline struct bkey_s_c_##name					\
-bkey_i_to_s_c_##name(const struct bkey_i *k)				\
-{									\
-	EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name);	\
-	return (struct bkey_s_c_##name) {				\
-		.k = &k->k,						\
-		.v = container_of(&k->v, struct bch_##name, v),		\
-	};								\
-}									\
-									\
-static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
-{									\
-	struct bkey_i_##name *k =					\
-		container_of(&_k->k, struct bkey_i_##name, k);		\
-									\
-	bkey_init(&k->k);						\
-	memset(&k->v, 0, sizeof(k->v));					\
-	k->k.type = KEY_TYPE_##name;					\
-	set_bkey_val_bytes(&k->k, sizeof(k->v));			\
-									\
-	return k;							\
-}
-
-BCH_BKEY_TYPES();
-#undef x
-
 /* byte order helpers */
 
 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index 5e52684764eb..db336a43fc08 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -171,11 +171,15 @@ int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k,
 	if (type >= BKEY_TYPE_NR)
 		return 0;
 
-	bkey_fsck_err_on((flags & BKEY_INVALID_COMMIT) &&
+	bkey_fsck_err_on((type == BKEY_TYPE_btree ||
+			  (flags & BKEY_INVALID_COMMIT)) &&
 			 !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, err,
 			 bkey_invalid_type_for_btree,
 			 "invalid key type for btree %s (%s)",
-			 bch2_btree_node_type_str(type), bch2_bkey_types[k.k->type]);
+			 bch2_btree_node_type_str(type),
+			 k.k->type < KEY_TYPE_MAX
+			 ? bch2_bkey_types[k.k->type]
+			 : "(unknown)");
 
 	if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) {
 		bkey_fsck_err_on(k.k->size == 0, c, err,
diff --git a/fs/bcachefs/bkey_types.h b/fs/bcachefs/bkey_types.h
new file mode 100644
index 000000000000..c9ae9e42b385
--- /dev/null
+++ b/fs/bcachefs/bkey_types.h
@@ -0,0 +1,213 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BKEY_TYPES_H
+#define _BCACHEFS_BKEY_TYPES_H
+
+#include "bcachefs_format.h"
+
+/*
+ * bkey_i	- bkey with inline value
+ * bkey_s	- bkey with split value
+ * bkey_s_c	- bkey with split value, const
+ */
+
+#define bkey_p_next(_k)		vstruct_next(_k)
+
+static inline struct bkey_i *bkey_next(struct bkey_i *k)
+{
+	return (struct bkey_i *) ((u64 *) k->_data + k->k.u64s);
+}
+
+#define bkey_val_u64s(_k)	((_k)->u64s - BKEY_U64s)
+
+static inline size_t bkey_val_bytes(const struct bkey *k)
+{
+	return bkey_val_u64s(k) * sizeof(u64);
+}
+
+static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s)
+{
+	unsigned u64s = BKEY_U64s + val_u64s;
+
+	BUG_ON(u64s > U8_MAX);
+	k->u64s = u64s;
+}
+
+static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes)
+{
+	set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64)));
+}
+
+#define bkey_val_end(_k)	((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k)))
+
+#define bkey_deleted(_k)	((_k)->type == KEY_TYPE_deleted)
+
+#define bkey_whiteout(_k)				\
+	((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout)
+
+/* bkey with split value, const */
+struct bkey_s_c {
+	const struct bkey	*k;
+	const struct bch_val	*v;
+};
+
+/* bkey with split value */
+struct bkey_s {
+	union {
+	struct {
+		struct bkey	*k;
+		struct bch_val	*v;
+	};
+	struct bkey_s_c		s_c;
+	};
+};
+
+#define bkey_s_null		((struct bkey_s)   { .k = NULL })
+#define bkey_s_c_null		((struct bkey_s_c) { .k = NULL })
+
+#define bkey_s_err(err)		((struct bkey_s)   { .k = ERR_PTR(err) })
+#define bkey_s_c_err(err)	((struct bkey_s_c) { .k = ERR_PTR(err) })
+
+static inline struct bkey_s bkey_to_s(struct bkey *k)
+{
+	return (struct bkey_s) { .k = k, .v = NULL };
+}
+
+static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k)
+{
+	return (struct bkey_s_c) { .k = k, .v = NULL };
+}
+
+static inline struct bkey_s bkey_i_to_s(struct bkey_i *k)
+{
+	return (struct bkey_s) { .k = &k->k, .v = &k->v };
+}
+
+static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k)
+{
+	return (struct bkey_s_c) { .k = &k->k, .v = &k->v };
+}
+
+/*
+ * For a given type of value (e.g. struct bch_extent), generates the types for
+ * bkey + bch_extent - inline, split, split const - and also all the conversion
+ * functions, which also check that the value is of the correct type.
+ *
+ * We use anonymous unions for upcasting - e.g. converting from e.g. a
+ * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion
+ * functions.
+ */
+#define x(name, ...)					\
+struct bkey_i_##name {							\
+	union {								\
+		struct bkey		k;				\
+		struct bkey_i		k_i;				\
+	};								\
+	struct bch_##name		v;				\
+};									\
+									\
+struct bkey_s_c_##name {						\
+	union {								\
+	struct {							\
+		const struct bkey	*k;				\
+		const struct bch_##name	*v;				\
+	};								\
+	struct bkey_s_c			s_c;				\
+	};								\
+};									\
+									\
+struct bkey_s_##name {							\
+	union {								\
+	struct {							\
+		struct bkey		*k;				\
+		struct bch_##name	*v;				\
+	};								\
+	struct bkey_s_c_##name		c;				\
+	struct bkey_s			s;				\
+	struct bkey_s_c			s_c;				\
+	};								\
+};									\
+									\
+static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k)	\
+{									\
+	EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name);	\
+	return container_of(&k->k, struct bkey_i_##name, k);		\
+}									\
+									\
+static inline const struct bkey_i_##name *				\
+bkey_i_to_##name##_c(const struct bkey_i *k)				\
+{									\
+	EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name);	\
+	return container_of(&k->k, struct bkey_i_##name, k);		\
+}									\
+									\
+static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k)	\
+{									\
+	EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name);	\
+	return (struct bkey_s_##name) {					\
+		.k = k.k,						\
+		.v = container_of(k.v, struct bch_##name, v),		\
+	};								\
+}									\
+									\
+static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\
+{									\
+	EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name);	\
+	return (struct bkey_s_c_##name) {				\
+		.k = k.k,						\
+		.v = container_of(k.v, struct bch_##name, v),		\
+	};								\
+}									\
+									\
+static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\
+{									\
+	return (struct bkey_s_##name) {					\
+		.k = &k->k,						\
+		.v = &k->v,						\
+	};								\
+}									\
+									\
+static inline struct bkey_s_c_##name					\
+name##_i_to_s_c(const struct bkey_i_##name *k)				\
+{									\
+	return (struct bkey_s_c_##name) {				\
+		.k = &k->k,						\
+		.v = &k->v,						\
+	};								\
+}									\
+									\
+static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k)	\
+{									\
+	EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name);	\
+	return (struct bkey_s_##name) {					\
+		.k = &k->k,						\
+		.v = container_of(&k->v, struct bch_##name, v),		\
+	};								\
+}									\
+									\
+static inline struct bkey_s_c_##name					\
+bkey_i_to_s_c_##name(const struct bkey_i *k)				\
+{									\
+	EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name);	\
+	return (struct bkey_s_c_##name) {				\
+		.k = &k->k,						\
+		.v = container_of(&k->v, struct bch_##name, v),		\
+	};								\
+}									\
+									\
+static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\
+{									\
+	struct bkey_i_##name *k =					\
+		container_of(&_k->k, struct bkey_i_##name, k);		\
+									\
+	bkey_init(&k->k);						\
+	memset(&k->v, 0, sizeof(k->v));					\
+	k->k.type = KEY_TYPE_##name;					\
+	set_bkey_val_bytes(&k->k, sizeof(k->v));			\
+									\
+	return k;							\
+}
+
+BCH_BKEY_TYPES();
+#undef x
+
+#endif /* _BCACHEFS_BKEY_TYPES_H */
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
index 3fd1085b6c61..3bb477840eab 100644
--- a/fs/bcachefs/bset.c
+++ b/fs/bcachefs/bset.c
@@ -134,18 +134,24 @@ void bch2_dump_btree_node_iter(struct btree *b,
 	printbuf_exit(&buf);
 }
 
-#ifdef CONFIG_BCACHEFS_DEBUG
-
-void __bch2_verify_btree_nr_keys(struct btree *b)
+struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b)
 {
 	struct bset_tree *t;
 	struct bkey_packed *k;
-	struct btree_nr_keys nr = { 0 };
+	struct btree_nr_keys nr = {};
 
 	for_each_bset(b, t)
 		bset_tree_for_each_key(b, t, k)
 			if (!bkey_deleted(k))
 				btree_keys_account_key_add(&nr, t - b->set, k);
+	return nr;
+}
+
+#ifdef CONFIG_BCACHEFS_DEBUG
+
+void __bch2_verify_btree_nr_keys(struct btree *b)
+{
+	struct btree_nr_keys nr = bch2_btree_node_count_keys(b);
 
 	BUG_ON(memcmp(&nr, &b->nr, sizeof(nr)));
 }
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
index 79c77baaa383..120a79fd456b 100644
--- a/fs/bcachefs/bset.h
+++ b/fs/bcachefs/bset.h
@@ -458,6 +458,8 @@ struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *,
 
 /* Accounting: */
 
+struct btree_nr_keys bch2_btree_node_count_keys(struct btree *);
+
 static inline void btree_keys_account_key(struct btree_nr_keys *n,
 					  unsigned bset,
 					  struct bkey_packed *k,
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index d7c81beac14a..02c70e813fac 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
+#include "bbpos.h"
 #include "bkey_buf.h"
 #include "btree_cache.h"
 #include "btree_io.h"
@@ -60,7 +61,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
 
 	clear_btree_node_just_written(b);
 
-	kvpfree(b->data, btree_buf_bytes(b));
+	kvfree(b->data);
 	b->data = NULL;
 #ifdef __KERNEL__
 	kvfree(b->aux_data);
@@ -94,7 +95,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
 {
 	BUG_ON(b->data || b->aux_data);
 
-	b->data = kvpmalloc(btree_buf_bytes(b), gfp);
+	b->data = kvmalloc(btree_buf_bytes(b), gfp);
 	if (!b->data)
 		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
 #ifdef __KERNEL__
@@ -107,7 +108,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
 		b->aux_data = NULL;
 #endif
 	if (!b->aux_data) {
-		kvpfree(b->data, btree_buf_bytes(b));
+		kvfree(b->data);
 		b->data = NULL;
 		return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
 	}
@@ -208,6 +209,18 @@ static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush)
 	int ret = 0;
 
 	lockdep_assert_held(&bc->lock);
+
+	struct bbpos pos = BBPOS(b->c.btree_id, b->key.k.p);
+
+	u64 mask = b->c.level
+		? bc->pinned_nodes_interior_mask
+		: bc->pinned_nodes_leaf_mask;
+
+	if ((mask & BIT_ULL(b->c.btree_id)) &&
+	    bbpos_cmp(bc->pinned_nodes_start, pos) < 0 &&
+	    bbpos_cmp(bc->pinned_nodes_end, pos) >= 0)
+		return -BCH_ERR_ENOMEM_btree_node_reclaim;
+
 wait_on_io:
 	if (b->flags & ((1U << BTREE_NODE_dirty)|
 			(1U << BTREE_NODE_read_in_flight)|
@@ -408,7 +421,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
 	if (c->verify_data)
 		list_move(&c->verify_data->list, &bc->live);
 
-	kvpfree(c->verify_ondisk, c->opts.btree_node_size);
+	kvfree(c->verify_ondisk);
 
 	for (i = 0; i < btree_id_nr_alive(c); i++) {
 		struct btree_root *r = bch2_btree_id_root(c, i);
@@ -696,9 +709,31 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
 	struct bch_fs *c = trans->c;
 	struct btree_cache *bc = &c->btree_cache;
 	struct btree *b;
-	u32 seq;
 
-	BUG_ON(level + 1 >= BTREE_MAX_DEPTH);
+	if (unlikely(level >= BTREE_MAX_DEPTH)) {
+		int ret = bch2_fs_topology_error(c, "attempting to get btree node at level %u, >= max depth %u",
+						 level, BTREE_MAX_DEPTH);
+		return ERR_PTR(ret);
+	}
+
+	if (unlikely(!bkey_is_btree_ptr(&k->k))) {
+		struct printbuf buf = PRINTBUF;
+		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+
+		int ret = bch2_fs_topology_error(c, "attempting to get btree node with non-btree key %s", buf.buf);
+		printbuf_exit(&buf);
+		return ERR_PTR(ret);
+	}
+
+	if (unlikely(k->k.u64s > BKEY_BTREE_PTR_U64s_MAX)) {
+		struct printbuf buf = PRINTBUF;
+		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
+
+		int ret = bch2_fs_topology_error(c, "attempting to get btree node with too big key %s", buf.buf);
+		printbuf_exit(&buf);
+		return ERR_PTR(ret);
+	}
+
 	/*
 	 * Parent node must be locked, else we could read in a btree node that's
 	 * been freed:
@@ -711,6 +746,9 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
 	b = bch2_btree_node_mem_alloc(trans, level != 0);
 
 	if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) {
+		if (!path)
+			return b;
+
 		trans->memory_allocation_failure = true;
 		trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path);
 		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail));
@@ -736,33 +774,26 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
 	}
 
 	set_btree_node_read_in_flight(b);
-
 	six_unlock_write(&b->c.lock);
-	seq = six_lock_seq(&b->c.lock);
-	six_unlock_intent(&b->c.lock);
 
-	/* Unlock before doing IO: */
-	if (path && sync)
-		bch2_trans_unlock_noassert(trans);
+	if (path) {
+		u32 seq = six_lock_seq(&b->c.lock);
 
-	bch2_btree_node_read(trans, b, sync);
+		/* Unlock before doing IO: */
+		six_unlock_intent(&b->c.lock);
+		bch2_trans_unlock_noassert(trans);
 
-	if (!sync)
-		return NULL;
+		bch2_btree_node_read(trans, b, sync);
 
-	if (path) {
-		int ret = bch2_trans_relock(trans) ?:
-			bch2_btree_path_relock_intent(trans, path);
-		if (ret) {
-			BUG_ON(!trans->restarted);
-			return ERR_PTR(ret);
-		}
-	}
+		if (!sync)
+			return NULL;
 
-	if (!six_relock_type(&b->c.lock, lock_type, seq)) {
-		if (path)
-			trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path);
-		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill));
+		if (!six_relock_type(&b->c.lock, lock_type, seq))
+			b = NULL;
+	} else {
+		bch2_btree_node_read(trans, b, sync);
+		if (lock_type == SIX_LOCK_read)
+			six_lock_downgrade(&b->c.lock);
 	}
 
 	return b;
@@ -791,7 +822,8 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b)
 	prt_printf(&buf, "\nmax ");
 	bch2_bpos_to_text(&buf, b->data->max_key);
 
-	bch2_fs_inconsistent(c, "%s", buf.buf);
+	bch2_fs_topology_error(c, "%s", buf.buf);
+
 	printbuf_exit(&buf);
 }
 
@@ -901,7 +933,7 @@ retry:
 
 	if (unlikely(btree_node_read_error(b))) {
 		six_unlock_type(&b->c.lock, lock_type);
-		return ERR_PTR(-EIO);
+		return ERR_PTR(-BCH_ERR_btree_node_read_error);
 	}
 
 	EBUG_ON(b->c.btree_id != path->btree_id);
@@ -992,7 +1024,7 @@ struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *
 
 	if (unlikely(btree_node_read_error(b))) {
 		six_unlock_type(&b->c.lock, lock_type);
-		return ERR_PTR(-EIO);
+		return ERR_PTR(-BCH_ERR_btree_node_read_error);
 	}
 
 	EBUG_ON(b->c.btree_id != path->btree_id);
@@ -1075,7 +1107,7 @@ lock_node:
 
 	if (unlikely(btree_node_read_error(b))) {
 		six_unlock_read(&b->c.lock);
-		b = ERR_PTR(-EIO);
+		b = ERR_PTR(-BCH_ERR_btree_node_read_error);
 		goto out;
 	}
 
@@ -1094,18 +1126,19 @@ int bch2_btree_node_prefetch(struct btree_trans *trans,
 {
 	struct bch_fs *c = trans->c;
 	struct btree_cache *bc = &c->btree_cache;
-	struct btree *b;
 
-	BUG_ON(trans && !btree_node_locked(path, level + 1));
+	BUG_ON(path && !btree_node_locked(path, level + 1));
 	BUG_ON(level >= BTREE_MAX_DEPTH);
 
-	b = btree_cache_find(bc, k);
+	struct btree *b = btree_cache_find(bc, k);
 	if (b)
 		return 0;
 
 	b = bch2_btree_node_fill(trans, path, k, btree_id,
 				 level, SIX_LOCK_read, false);
-	return PTR_ERR_OR_ZERO(b);
+	if (!IS_ERR_OR_NULL(b))
+		six_unlock_read(&b->c.lock);
+	return bch2_trans_relock(trans) ?: PTR_ERR_OR_ZERO(b);
 }
 
 void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
@@ -1117,6 +1150,8 @@ void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k)
 	b = btree_cache_find(bc, k);
 	if (!b)
 		return;
+
+	BUG_ON(b == btree_node_root(trans->c, b));
 wait_on_io:
 	/* not allowed to wait on io with btree locks held: */
 
@@ -1128,6 +1163,8 @@ wait_on_io:
 
 	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
 	btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
+	if (unlikely(b->hash_val != btree_ptr_hash_val(k)))
+		goto out;
 
 	if (btree_node_dirty(b)) {
 		__bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim);
@@ -1142,7 +1179,7 @@ wait_on_io:
 	btree_node_data_free(c, b);
 	bch2_btree_node_hash_remove(bc, b);
 	mutex_unlock(&bc->lock);
-
+out:
 	six_unlock_write(&b->c.lock);
 	six_unlock_intent(&b->c.lock);
 }
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 1102995643b1..ecbd9598f69f 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -7,11 +7,13 @@
 #include "bcachefs.h"
 #include "alloc_background.h"
 #include "alloc_foreground.h"
+#include "backpointers.h"
 #include "bkey_methods.h"
 #include "bkey_buf.h"
 #include "btree_journal_iter.h"
 #include "btree_key_cache.h"
 #include "btree_locking.h"
+#include "btree_node_scan.h"
 #include "btree_update_interior.h"
 #include "btree_io.h"
 #include "btree_gc.h"
@@ -24,7 +26,7 @@
 #include "journal.h"
 #include "keylist.h"
 #include "move.h"
-#include "recovery.h"
+#include "recovery_passes.h"
 #include "reflink.h"
 #include "replicas.h"
 #include "super-io.h"
@@ -40,6 +42,7 @@
 
 #define DROP_THIS_NODE		10
 #define DROP_PREV_NODE		11
+#define DID_FILL_FROM_SCAN	12
 
 static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k)
 {
@@ -70,90 +73,6 @@ static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos)
 	__gc_pos_set(c, new_pos);
 }
 
-/*
- * Missing: if an interior btree node is empty, we need to do something -
- * perhaps just kill it
- */
-static int bch2_gc_check_topology(struct bch_fs *c,
-				  struct btree *b,
-				  struct bkey_buf *prev,
-				  struct bkey_buf cur,
-				  bool is_last)
-{
-	struct bpos node_start	= b->data->min_key;
-	struct bpos node_end	= b->data->max_key;
-	struct bpos expected_start = bkey_deleted(&prev->k->k)
-		? node_start
-		: bpos_successor(prev->k->k.p);
-	struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
-	int ret = 0;
-
-	if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) {
-		struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k);
-
-		if (!bpos_eq(expected_start, bp->v.min_key)) {
-			bch2_topology_error(c);
-
-			if (bkey_deleted(&prev->k->k)) {
-				prt_printf(&buf1, "start of node: ");
-				bch2_bpos_to_text(&buf1, node_start);
-			} else {
-				bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k));
-			}
-			bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k));
-
-			if (__fsck_err(c,
-				       FSCK_CAN_FIX|
-				       FSCK_CAN_IGNORE|
-				       FSCK_NO_RATELIMIT,
-				       btree_node_topology_bad_min_key,
-				       "btree node with incorrect min_key at btree %s level %u:\n"
-				       "  prev %s\n"
-				       "  cur %s",
-				       bch2_btree_id_str(b->c.btree_id), b->c.level,
-				       buf1.buf, buf2.buf) && should_restart_for_topology_repair(c)) {
-				bch_info(c, "Halting mark and sweep to start topology repair pass");
-				ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
-				goto err;
-			} else {
-				set_bit(BCH_FS_initial_gc_unfixed, &c->flags);
-			}
-		}
-	}
-
-	if (is_last && !bpos_eq(cur.k->k.p, node_end)) {
-		bch2_topology_error(c);
-
-		printbuf_reset(&buf1);
-		printbuf_reset(&buf2);
-
-		bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k));
-		bch2_bpos_to_text(&buf2, node_end);
-
-		if (__fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE|FSCK_NO_RATELIMIT,
-			  btree_node_topology_bad_max_key,
-			  "btree node with incorrect max_key at btree %s level %u:\n"
-			  "  %s\n"
-			  "  expected %s",
-			  bch2_btree_id_str(b->c.btree_id), b->c.level,
-			  buf1.buf, buf2.buf) &&
-		    should_restart_for_topology_repair(c)) {
-			bch_info(c, "Halting mark and sweep to start topology repair pass");
-			ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
-			goto err;
-		} else {
-			set_bit(BCH_FS_initial_gc_unfixed, &c->flags);
-		}
-	}
-
-	bch2_bkey_buf_copy(prev, c, cur.k);
-err:
-fsck_err:
-	printbuf_exit(&buf2);
-	printbuf_exit(&buf1);
-	return ret;
-}
-
 static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst)
 {
 	switch (b->key.k.type) {
@@ -212,6 +131,17 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min)
 	struct bkey_i_btree_ptr_v2 *new;
 	int ret;
 
+	if (c->opts.verbose) {
+		struct printbuf buf = PRINTBUF;
+
+		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+		prt_str(&buf, " -> ");
+		bch2_bpos_to_text(&buf, new_min);
+
+		bch_info(c, "%s(): %s", __func__, buf.buf);
+		printbuf_exit(&buf);
+	}
+
 	new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL);
 	if (!new)
 		return -BCH_ERR_ENOMEM_gc_repair_key;
@@ -237,6 +167,17 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
 	struct bkey_i_btree_ptr_v2 *new;
 	int ret;
 
+	if (c->opts.verbose) {
+		struct printbuf buf = PRINTBUF;
+
+		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+		prt_str(&buf, " -> ");
+		bch2_bpos_to_text(&buf, new_max);
+
+		bch_info(c, "%s(): %s", __func__, buf.buf);
+		printbuf_exit(&buf);
+	}
+
 	ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p);
 	if (ret)
 		return ret;
@@ -268,128 +209,140 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max)
 	return 0;
 }
 
-static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b,
-					struct btree *prev, struct btree *cur)
+static int btree_check_node_boundaries(struct bch_fs *c, struct btree *b,
+				       struct btree *prev, struct btree *cur,
+				       struct bpos *pulled_from_scan)
 {
 	struct bpos expected_start = !prev
 		? b->data->min_key
 		: bpos_successor(prev->key.k.p);
-	struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+	struct printbuf buf = PRINTBUF;
 	int ret = 0;
 
-	if (!prev) {
-		prt_printf(&buf1, "start of node: ");
-		bch2_bpos_to_text(&buf1, b->data->min_key);
-	} else {
-		bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key));
+	BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+	       !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
+			b->data->min_key));
+
+	if (bpos_eq(expected_start, cur->data->min_key))
+		return 0;
+
+	prt_printf(&buf, "  at btree %s level %u:\n  parent: ",
+		   bch2_btree_id_str(b->c.btree_id), b->c.level);
+	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+	if (prev) {
+		prt_printf(&buf, "\n  prev: ");
+		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key));
 	}
 
-	bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key));
-
-	if (prev &&
-	    bpos_gt(expected_start, cur->data->min_key) &&
-	    BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) {
-		/* cur overwrites prev: */
-
-		if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key,
-						cur->data->min_key), c,
-				btree_node_topology_overwritten_by_next_node,
-				"btree node overwritten by next node at btree %s level %u:\n"
-				"  node %s\n"
-				"  next %s",
-				bch2_btree_id_str(b->c.btree_id), b->c.level,
-				buf1.buf, buf2.buf)) {
-			ret = DROP_PREV_NODE;
-			goto out;
-		}
+	prt_str(&buf, "\n  next: ");
+	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key));
 
-		if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p,
-						 bpos_predecessor(cur->data->min_key)), c,
-				btree_node_topology_bad_max_key,
-				"btree node with incorrect max_key at btree %s level %u:\n"
-				"  node %s\n"
-				"  next %s",
-				bch2_btree_id_str(b->c.btree_id), b->c.level,
-				buf1.buf, buf2.buf))
-			ret = set_node_max(c, prev,
-					   bpos_predecessor(cur->data->min_key));
-	} else {
-		/* prev overwrites cur: */
-
-		if (mustfix_fsck_err_on(bpos_ge(expected_start,
-						cur->data->max_key), c,
-				btree_node_topology_overwritten_by_prev_node,
-				"btree node overwritten by prev node at btree %s level %u:\n"
-				"  prev %s\n"
-				"  node %s",
-				bch2_btree_id_str(b->c.btree_id), b->c.level,
-				buf1.buf, buf2.buf)) {
-			ret = DROP_THIS_NODE;
-			goto out;
-		}
+	if (bpos_lt(expected_start, cur->data->min_key)) {				/* gap */
+		if (b->c.level == 1 &&
+		    bpos_lt(*pulled_from_scan, cur->data->min_key)) {
+			ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
+						     expected_start,
+						     bpos_predecessor(cur->data->min_key));
+			if (ret)
+				goto err;
 
-		if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c,
-				btree_node_topology_bad_min_key,
-				"btree node with incorrect min_key at btree %s level %u:\n"
-				"  prev %s\n"
-				"  node %s",
-				bch2_btree_id_str(b->c.btree_id), b->c.level,
-				buf1.buf, buf2.buf))
-			ret = set_node_min(c, cur, expected_start);
+			*pulled_from_scan = cur->data->min_key;
+			ret = DID_FILL_FROM_SCAN;
+		} else {
+			if (mustfix_fsck_err(c, btree_node_topology_bad_min_key,
+					     "btree node with incorrect min_key%s", buf.buf))
+				ret = set_node_min(c, cur, expected_start);
+		}
+	} else {									/* overlap */
+		if (prev && BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) {	/* cur overwrites prev */
+			if (bpos_ge(prev->data->min_key, cur->data->min_key)) {		/* fully? */
+				if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_next_node,
+						     "btree node overwritten by next node%s", buf.buf))
+					ret = DROP_PREV_NODE;
+			} else {
+				if (mustfix_fsck_err(c, btree_node_topology_bad_max_key,
+						     "btree node with incorrect max_key%s", buf.buf))
+					ret = set_node_max(c, prev,
+							   bpos_predecessor(cur->data->min_key));
+			}
+		} else {
+			if (bpos_ge(expected_start, cur->data->max_key)) {		/* fully? */
+				if (mustfix_fsck_err(c, btree_node_topology_overwritten_by_prev_node,
+						     "btree node overwritten by prev node%s", buf.buf))
+					ret = DROP_THIS_NODE;
+			} else {
+				if (mustfix_fsck_err(c, btree_node_topology_bad_min_key,
+						     "btree node with incorrect min_key%s", buf.buf))
+					ret = set_node_min(c, cur, expected_start);
+			}
+		}
 	}
-out:
+err:
 fsck_err:
-	printbuf_exit(&buf2);
-	printbuf_exit(&buf1);
+	printbuf_exit(&buf);
 	return ret;
 }
 
 static int btree_repair_node_end(struct bch_fs *c, struct btree *b,
-				 struct btree *child)
+				 struct btree *child, struct bpos *pulled_from_scan)
 {
-	struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+	struct printbuf buf = PRINTBUF;
 	int ret = 0;
 
-	bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key));
-	bch2_bpos_to_text(&buf2, b->key.k.p);
+	if (bpos_eq(child->key.k.p, b->key.k.p))
+		return 0;
 
-	if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c,
-				btree_node_topology_bad_max_key,
-			"btree node with incorrect max_key at btree %s level %u:\n"
-			"  %s\n"
-			"  expected %s",
-			bch2_btree_id_str(b->c.btree_id), b->c.level,
-			buf1.buf, buf2.buf)) {
-		ret = set_node_max(c, child, b->key.k.p);
-		if (ret)
-			goto err;
+	prt_printf(&buf, "at btree %s level %u:\n  parent: ",
+		   bch2_btree_id_str(b->c.btree_id), b->c.level);
+	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+	prt_str(&buf, "\n  child: ");
+	bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key));
+
+	if (mustfix_fsck_err(c, btree_node_topology_bad_max_key,
+			     "btree node with incorrect max_key%s", buf.buf)) {
+		if (b->c.level == 1 &&
+		    bpos_lt(*pulled_from_scan, b->key.k.p)) {
+			ret = bch2_get_scanned_nodes(c, b->c.btree_id, 0,
+						bpos_successor(child->key.k.p), b->key.k.p);
+			if (ret)
+				goto err;
+
+			*pulled_from_scan = b->key.k.p;
+			ret = DID_FILL_FROM_SCAN;
+		} else {
+			ret = set_node_max(c, child, b->key.k.p);
+		}
 	}
 err:
 fsck_err:
-	printbuf_exit(&buf2);
-	printbuf_exit(&buf1);
+	printbuf_exit(&buf);
 	return ret;
 }
 
-static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b)
+static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b,
+					      struct bpos *pulled_from_scan)
 {
 	struct bch_fs *c = trans->c;
 	struct btree_and_journal_iter iter;
 	struct bkey_s_c k;
 	struct bkey_buf prev_k, cur_k;
 	struct btree *prev = NULL, *cur = NULL;
-	bool have_child, dropped_children = false;
+	bool have_child, new_pass = false;
 	struct printbuf buf = PRINTBUF;
 	int ret = 0;
 
 	if (!b->c.level)
 		return 0;
-again:
-	prev = NULL;
-	have_child = dropped_children = false;
+
 	bch2_bkey_buf_init(&prev_k);
 	bch2_bkey_buf_init(&cur_k);
-	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+again:
+	cur = prev = NULL;
+	have_child = new_pass = false;
+	bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
+	iter.prefetch = true;
 
 	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
 		BUG_ON(bpos_lt(k.k->p, b->data->min_key));
@@ -406,7 +359,7 @@ again:
 		printbuf_reset(&buf);
 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k));
 
-		if (mustfix_fsck_err_on(ret == -EIO, c,
+		if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), c,
 				btree_node_unreadable,
 				"Topology repair: unreadable btree node at btree %s level %u:\n"
 				"  %s",
@@ -414,11 +367,17 @@ again:
 				b->c.level - 1,
 				buf.buf)) {
 			bch2_btree_node_evict(trans, cur_k.k);
+			cur = NULL;
 			ret = bch2_journal_key_delete(c, b->c.btree_id,
 						      b->c.level, cur_k.k->k.p);
-			cur = NULL;
 			if (ret)
 				break;
+
+			if (!btree_id_is_alloc(b->c.btree_id)) {
+				ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
+				if (ret)
+					break;
+			}
 			continue;
 		}
 
@@ -426,7 +385,23 @@ again:
 		if (ret)
 			break;
 
-		ret = btree_repair_node_boundaries(c, b, prev, cur);
+		if (bch2_btree_node_is_stale(c, cur)) {
+			bch_info(c, "btree node %s older than nodes found by scanning", buf.buf);
+			six_unlock_read(&cur->c.lock);
+			bch2_btree_node_evict(trans, cur_k.k);
+			ret = bch2_journal_key_delete(c, b->c.btree_id,
+						      b->c.level, cur_k.k->k.p);
+			cur = NULL;
+			if (ret)
+				break;
+			continue;
+		}
+
+		ret = btree_check_node_boundaries(c, b, prev, cur, pulled_from_scan);
+		if (ret == DID_FILL_FROM_SCAN) {
+			new_pass = true;
+			ret = 0;
+		}
 
 		if (ret == DROP_THIS_NODE) {
 			six_unlock_read(&cur->c.lock);
@@ -444,6 +419,7 @@ again:
 		prev = NULL;
 
 		if (ret == DROP_PREV_NODE) {
+			bch_info(c, "dropped prev node");
 			bch2_btree_node_evict(trans, prev_k.k);
 			ret = bch2_journal_key_delete(c, b->c.btree_id,
 						      b->c.level, prev_k.k->k.p);
@@ -451,8 +427,6 @@ again:
 				break;
 
 			bch2_btree_and_journal_iter_exit(&iter);
-			bch2_bkey_buf_exit(&prev_k, c);
-			bch2_bkey_buf_exit(&cur_k, c);
 			goto again;
 		} else if (ret)
 			break;
@@ -464,7 +438,11 @@ again:
 
 	if (!ret && !IS_ERR_OR_NULL(prev)) {
 		BUG_ON(cur);
-		ret = btree_repair_node_end(c, b, prev);
+		ret = btree_repair_node_end(c, b, prev, pulled_from_scan);
+		if (ret == DID_FILL_FROM_SCAN) {
+			new_pass = true;
+			ret = 0;
+		}
 	}
 
 	if (!IS_ERR_OR_NULL(prev))
@@ -478,7 +456,12 @@ again:
 		goto err;
 
 	bch2_btree_and_journal_iter_exit(&iter);
-	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+
+	if (new_pass)
+		goto again;
+
+	bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
+	iter.prefetch = true;
 
 	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
 		bch2_bkey_buf_reassemble(&cur_k, c, k);
@@ -493,7 +476,7 @@ again:
 		if (ret)
 			goto err;
 
-		ret = bch2_btree_repair_topology_recurse(trans, cur);
+		ret = bch2_btree_repair_topology_recurse(trans, cur, pulled_from_scan);
 		six_unlock_read(&cur->c.lock);
 		cur = NULL;
 
@@ -501,7 +484,7 @@ again:
 			bch2_btree_node_evict(trans, cur_k.k);
 			ret = bch2_journal_key_delete(c, b->c.btree_id,
 						      b->c.level, cur_k.k->k.p);
-			dropped_children = true;
+			new_pass = true;
 		}
 
 		if (ret)
@@ -528,12 +511,14 @@ fsck_err:
 		six_unlock_read(&cur->c.lock);
 
 	bch2_btree_and_journal_iter_exit(&iter);
-	bch2_bkey_buf_exit(&prev_k, c);
-	bch2_bkey_buf_exit(&cur_k, c);
 
-	if (!ret && dropped_children)
+	if (!ret && new_pass)
 		goto again;
 
+	BUG_ON(!ret && bch2_btree_node_check_topology(trans, b));
+
+	bch2_bkey_buf_exit(&prev_k, c);
+	bch2_bkey_buf_exit(&cur_k, c);
 	printbuf_exit(&buf);
 	return ret;
 }
@@ -541,32 +526,63 @@ fsck_err:
 int bch2_check_topology(struct bch_fs *c)
 {
 	struct btree_trans *trans = bch2_trans_get(c);
-	struct btree *b;
-	unsigned i;
+	struct bpos pulled_from_scan = POS_MIN;
 	int ret = 0;
 
-	for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
+	for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) {
 		struct btree_root *r = bch2_btree_id_root(c, i);
+		bool reconstructed_root = false;
 
-		if (!r->alive)
-			continue;
+		if (r->error) {
+			ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
+			if (ret)
+				break;
+reconstruct_root:
+			bch_info(c, "btree root %s unreadable, must recover from scan", bch2_btree_id_str(i));
 
-		b = r->b;
-		if (btree_node_fake(b))
-			continue;
+			r->alive = false;
+			r->error = 0;
+
+			if (!bch2_btree_has_scanned_nodes(c, i)) {
+				mustfix_fsck_err(c, btree_root_unreadable_and_scan_found_nothing,
+						 "no nodes found for btree %s, continue?", bch2_btree_id_str(i));
+				bch2_btree_root_alloc_fake(c, i, 0);
+			} else {
+				bch2_btree_root_alloc_fake(c, i, 1);
+				bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+				ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX);
+				if (ret)
+					break;
+			}
+
+			reconstructed_root = true;
+		}
+
+		struct btree *b = r->b;
 
 		btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read);
-		ret = bch2_btree_repair_topology_recurse(trans, b);
+		ret = bch2_btree_repair_topology_recurse(trans, b, &pulled_from_scan);
 		six_unlock_read(&b->c.lock);
 
 		if (ret == DROP_THIS_NODE) {
-			bch_err(c, "empty btree root - repair unimplemented");
-			ret = -BCH_ERR_fsck_repair_unimplemented;
+			bch2_btree_node_hash_remove(&c->btree_cache, b);
+			mutex_lock(&c->btree_cache.lock);
+			list_move(&b->list, &c->btree_cache.freeable);
+			mutex_unlock(&c->btree_cache.lock);
+
+			r->b = NULL;
+
+			if (!reconstructed_root)
+				goto reconstruct_root;
+
+			bch_err(c, "empty btree root %s", bch2_btree_id_str(i));
+			bch2_btree_root_alloc_fake(c, i, 0);
+			r->alive = false;
+			ret = 0;
 		}
 	}
-
+fsck_err:
 	bch2_trans_put(trans);
-
 	return ret;
 }
 
@@ -589,18 +605,17 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
 	bkey_for_each_ptr_decode(k->k, ptrs_c, p, entry_c) {
 		struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
 		struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
-		enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry_c->ptr);
-
-		if (!g->gen_valid &&
-		    (c->opts.reconstruct_alloc ||
-		     fsck_err(c, ptr_to_missing_alloc_key,
-			      "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
-			      "while marking %s",
-			      p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
-			      bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
-			      p.ptr.gen,
-			      (printbuf_reset(&buf),
-			       bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
+		enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, p, entry_c);
+
+		if (fsck_err_on(!g->gen_valid,
+				c, ptr_to_missing_alloc_key,
+				"bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
+				"while marking %s",
+				p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+				bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
+				p.ptr.gen,
+				(printbuf_reset(&buf),
+				 bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
 			if (!p.ptr.cached) {
 				g->gen_valid		= true;
 				g->gen			= p.ptr.gen;
@@ -609,16 +624,15 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
 			}
 		}
 
-		if (gen_cmp(p.ptr.gen, g->gen) > 0 &&
-		    (c->opts.reconstruct_alloc ||
-		     fsck_err(c, ptr_gen_newer_than_bucket_gen,
-			      "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
-			      "while marking %s",
-			      p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
-			      bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
-			      p.ptr.gen, g->gen,
-			      (printbuf_reset(&buf),
-			       bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
+		if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0,
+				c, ptr_gen_newer_than_bucket_gen,
+				"bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
+				"while marking %s",
+				p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+				bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
+				p.ptr.gen, g->gen,
+				(printbuf_reset(&buf),
+				 bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
 			if (!p.ptr.cached) {
 				g->gen_valid		= true;
 				g->gen			= p.ptr.gen;
@@ -631,35 +645,34 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
 			}
 		}
 
-		if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX &&
-		    (c->opts.reconstruct_alloc ||
-		     fsck_err(c, ptr_gen_newer_than_bucket_gen,
-			      "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
-			      "while marking %s",
-			      p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
-			      bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
-			      p.ptr.gen,
-			      (printbuf_reset(&buf),
-			       bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
+		if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX,
+				c, ptr_gen_newer_than_bucket_gen,
+				"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
+				"while marking %s",
+				p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
+				bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
+				p.ptr.gen,
+				(printbuf_reset(&buf),
+				 bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
 			do_update = true;
 
-		if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 &&
-		    (c->opts.reconstruct_alloc ||
-		     fsck_err(c, stale_dirty_ptr,
-			      "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
-			      "while marking %s",
-			      p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
-			      bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
-			      p.ptr.gen, g->gen,
-			      (printbuf_reset(&buf),
-			       bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
+		if (fsck_err_on(!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0,
+				c, stale_dirty_ptr,
+				"bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
+				"while marking %s",
+				p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
+				bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
+				p.ptr.gen, g->gen,
+				(printbuf_reset(&buf),
+				 bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))
 			do_update = true;
 
 		if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen)
 			continue;
 
 		if (fsck_err_on(bucket_data_type(g->data_type) &&
-				bucket_data_type(g->data_type) != data_type, c,
+				bucket_data_type(g->data_type) !=
+				bucket_data_type(data_type), c,
 				ptr_bucket_data_type_mismatch,
 				"bucket %u:%zu different types of data in same bucket: %s, %s\n"
 				"while marking %s",
@@ -700,18 +713,13 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
 	}
 
 	if (do_update) {
-		struct bkey_ptrs ptrs;
-		union bch_extent_entry *entry;
-		struct bch_extent_ptr *ptr;
-		struct bkey_i *new;
-
 		if (is_root) {
 			bch_err(c, "cannot update btree roots yet");
 			ret = -EINVAL;
 			goto err;
 		}
 
-		new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
+		struct bkey_i *new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
 		if (!new) {
 			ret = -BCH_ERR_ENOMEM_gc_repair_key;
 			bch_err_msg(c, ret, "allocating new key");
@@ -726,7 +734,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
 			 * btree node isn't there anymore, the read path will
 			 * sort it out:
 			 */
-			ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+			struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
 			bkey_for_each_ptr(ptrs, ptr) {
 				struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
 				struct bucket *g = PTR_GC_BUCKET(ca, ptr);
@@ -734,19 +742,26 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
 				ptr->gen = g->gen;
 			}
 		} else {
-			bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({
-				struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-				struct bucket *g = PTR_GC_BUCKET(ca, ptr);
-				enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr);
-
-				(ptr->cached &&
-				 (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) ||
-				(!ptr->cached &&
-				 gen_cmp(ptr->gen, g->gen) < 0) ||
-				gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX ||
-				(g->data_type &&
-				 g->data_type != data_type);
-			}));
+			struct bkey_ptrs ptrs;
+			union bch_extent_entry *entry;
+restart_drop_ptrs:
+			ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+			bkey_for_each_ptr_decode(bkey_i_to_s(new).k, ptrs, p, entry) {
+				struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
+				struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
+				enum bch_data_type data_type = bch2_bkey_ptr_data_type(bkey_i_to_s_c(new), p, entry);
+
+				if ((p.ptr.cached &&
+				     (!g->gen_valid || gen_cmp(p.ptr.gen, g->gen) > 0)) ||
+				    (!p.ptr.cached &&
+				     gen_cmp(p.ptr.gen, g->gen) < 0) ||
+				    gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX ||
+				    (g->data_type &&
+				     g->data_type != data_type)) {
+					bch2_bkey_drop_ptr(bkey_i_to_s(new), &entry->ptr);
+					goto restart_drop_ptrs;
+				}
+			}
 again:
 			ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
 			bkey_extent_entry_for_each(ptrs, entry) {
@@ -776,12 +791,6 @@ found:
 			}
 		}
 
-		ret = bch2_journal_key_insert_take(c, btree_id, level, new);
-		if (ret) {
-			kfree(new);
-			goto err;
-		}
-
 		if (level)
 			bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new);
 
@@ -795,6 +804,12 @@ found:
 			bch_info(c, "new key %s", buf.buf);
 		}
 
+		ret = bch2_journal_key_insert_take(c, btree_id, level, new);
+		if (ret) {
+			kfree(new);
+			goto err;
+		}
+
 		*k = bkey_i_to_s_c(new);
 	}
 err:
@@ -813,6 +828,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
 	struct bch_fs *c = trans->c;
 	struct bkey deleted = KEY(0, 0, 0);
 	struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
+	struct printbuf buf = PRINTBUF;
 	int ret = 0;
 
 	deleted.p = k->k->p;
@@ -821,10 +837,6 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
 		BUG_ON(bch2_journal_seq_verify &&
 		       k->k->version.lo > atomic64_read(&c->journal.seq));
 
-		ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k);
-		if (ret)
-			goto err;
-
 		if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c,
 				bkey_version_in_future,
 				"key version number higher than recorded: %llu > %llu",
@@ -833,52 +845,57 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
 			atomic64_set(&c->key_version, k->k->version.lo);
 	}
 
+	ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k);
+	if (ret)
+		goto err;
+
+	if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, *k),
+				c, btree_bitmap_not_marked,
+				"btree ptr not marked in member info btree allocated bitmap\n  %s",
+				(bch2_bkey_val_to_text(&buf, c, *k),
+				 buf.buf))) {
+		mutex_lock(&c->sb_lock);
+		bch2_dev_btree_bitmap_mark(c, *k);
+		bch2_write_super(c);
+		mutex_unlock(&c->sb_lock);
+	}
+
 	ret = commit_do(trans, NULL, NULL, 0,
-			bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC));
+			bch2_key_trigger(trans, btree_id, level, old,
+					 unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC));
 fsck_err:
 err:
+	printbuf_exit(&buf);
 	bch_err_fn(c, ret);
 	return ret;
 }
 
 static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial)
 {
-	struct bch_fs *c = trans->c;
 	struct btree_node_iter iter;
 	struct bkey unpacked;
 	struct bkey_s_c k;
-	struct bkey_buf prev, cur;
 	int ret = 0;
 
+	ret = bch2_btree_node_check_topology(trans, b);
+	if (ret)
+		return ret;
+
 	if (!btree_node_type_needs_gc(btree_node_type(b)))
 		return 0;
 
 	bch2_btree_node_iter_init_from_start(&iter, b);
-	bch2_bkey_buf_init(&prev);
-	bch2_bkey_buf_init(&cur);
-	bkey_init(&prev.k->k);
 
 	while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) {
 		ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false,
 				       &k, initial);
 		if (ret)
-			break;
+			return ret;
 
 		bch2_btree_node_iter_advance(&iter, b);
-
-		if (b->c.level) {
-			bch2_bkey_buf_reassemble(&cur, c, k);
-
-			ret = bch2_gc_check_topology(c, b, &prev, cur,
-					bch2_btree_node_iter_end(&iter));
-			if (ret)
-				break;
-		}
 	}
 
-	bch2_bkey_buf_exit(&cur, c);
-	bch2_bkey_buf_exit(&prev, c);
-	return ret;
+	return 0;
 }
 
 static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id,
@@ -927,14 +944,16 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
 	struct bch_fs *c = trans->c;
 	struct btree_and_journal_iter iter;
 	struct bkey_s_c k;
-	struct bkey_buf cur, prev;
+	struct bkey_buf cur;
 	struct printbuf buf = PRINTBUF;
 	int ret = 0;
 
-	bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
-	bch2_bkey_buf_init(&prev);
+	ret = bch2_btree_node_check_topology(trans, b);
+	if (ret)
+		return ret;
+
+	bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
 	bch2_bkey_buf_init(&cur);
-	bkey_init(&prev.k->k);
 
 	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
 		BUG_ON(bpos_lt(k.k->p, b->data->min_key));
@@ -945,25 +964,13 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
 		if (ret)
 			goto fsck_err;
 
-		if (b->c.level) {
-			bch2_bkey_buf_reassemble(&cur, c, k);
-			k = bkey_i_to_s_c(cur.k);
-
-			bch2_btree_and_journal_iter_advance(&iter);
-
-			ret = bch2_gc_check_topology(c, b,
-					&prev, cur,
-					!bch2_btree_and_journal_iter_peek(&iter).k);
-			if (ret)
-				goto fsck_err;
-		} else {
-			bch2_btree_and_journal_iter_advance(&iter);
-		}
+		bch2_btree_and_journal_iter_advance(&iter);
 	}
 
 	if (b->c.level > target_depth) {
 		bch2_btree_and_journal_iter_exit(&iter);
-		bch2_btree_and_journal_iter_init_node_iter(&iter, c, b);
+		bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
+		iter.prefetch = true;
 
 		while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
 			struct btree *child;
@@ -976,7 +983,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
 						false);
 			ret = PTR_ERR_OR_ZERO(child);
 
-			if (ret == -EIO) {
+			if (bch2_err_matches(ret, EIO)) {
 				bch2_topology_error(c);
 
 				if (__fsck_err(c,
@@ -1016,7 +1023,6 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
 	}
 fsck_err:
 	bch2_bkey_buf_exit(&cur, c);
-	bch2_bkey_buf_exit(&prev, c);
 	bch2_btree_and_journal_iter_exit(&iter);
 	printbuf_exit(&buf);
 	return ret;
@@ -1034,9 +1040,6 @@ static int bch2_gc_btree_init(struct btree_trans *trans,
 
 	b = bch2_btree_id_root(c, btree_id)->b;
 
-	if (btree_node_fake(b))
-		return 0;
-
 	six_lock_read(&b->c.lock, NULL, NULL);
 	printbuf_reset(&buf);
 	bch2_bpos_to_text(&buf, b->data->min_key);
@@ -1190,9 +1193,7 @@ static void bch2_gc_free(struct bch_fs *c)
 	genradix_free(&c->gc_stripes);
 
 	for_each_member_device(c, ca) {
-		kvpfree(rcu_dereference_protected(ca->buckets_gc, 1),
-			sizeof(struct bucket_array) +
-			ca->mi.nbuckets * sizeof(struct bucket));
+		kvfree(rcu_dereference_protected(ca->buckets_gc, 1));
 		ca->buckets_gc = NULL;
 
 		free_percpu(ca->usage_gc);
@@ -1365,11 +1366,10 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
 {
 	struct bch_fs *c = trans->c;
 	struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode);
-	struct bucket gc, *b;
+	struct bucket old_gc, gc, *b;
 	struct bkey_i_alloc_v4 *a;
 	struct bch_alloc_v4 old_convert, new;
 	const struct bch_alloc_v4 *old;
-	enum bch_data_type type;
 	int ret;
 
 	old = bch2_alloc_to_v4(k, &old_convert);
@@ -1377,28 +1377,29 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
 
 	percpu_down_read(&c->mark_lock);
 	b = gc_bucket(ca, iter->pos.offset);
+	old_gc = *b;
+
+	if ((old->data_type == BCH_DATA_sb ||
+	     old->data_type == BCH_DATA_journal) &&
+	    !bch2_dev_is_online(ca)) {
+		b->data_type = old->data_type;
+		b->dirty_sectors = old->dirty_sectors;
+	}
 
 	/*
 	 * b->data_type doesn't yet include need_discard & need_gc_gen states -
 	 * fix that here:
 	 */
-	type = __alloc_data_type(b->dirty_sectors,
-				 b->cached_sectors,
-				 b->stripe,
-				 *old,
-				 b->data_type);
-	if (b->data_type != type) {
-		struct bch_dev_usage *u;
-
-		preempt_disable();
-		u = this_cpu_ptr(ca->usage_gc);
-		u->d[b->data_type].buckets--;
-		b->data_type = type;
-		u->d[b->data_type].buckets++;
-		preempt_enable();
-	}
-
+	b->data_type = __alloc_data_type(b->dirty_sectors,
+					 b->cached_sectors,
+					 b->stripe,
+					 *old,
+					 b->data_type);
 	gc = *b;
+
+	if (gc.data_type != old_gc.data_type ||
+	    gc.dirty_sectors != old_gc.dirty_sectors)
+		bch2_dev_usage_update_m(c, ca, &old_gc, &gc);
 	percpu_up_read(&c->mark_lock);
 
 	if (metadata_only &&
@@ -1410,8 +1411,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
 	if (gen_after(old->gen, gc.gen))
 		return 0;
 
-	if (c->opts.reconstruct_alloc ||
-	    fsck_err_on(new.data_type != gc.data_type, c,
+	if (fsck_err_on(new.data_type != gc.data_type, c,
 			alloc_key_data_type_wrong,
 			"bucket %llu:%llu gen %u has wrong data_type"
 			": got %s, should be %s",
@@ -1422,8 +1422,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
 		new.data_type = gc.data_type;
 
 #define copy_bucket_field(_errtype, _f)					\
-	if (c->opts.reconstruct_alloc ||				\
-	    fsck_err_on(new._f != gc._f, c, _errtype,			\
+	if (fsck_err_on(new._f != gc._f, c, _errtype,			\
 			"bucket %llu:%llu gen %u data type %s has wrong " #_f	\
 			": got %u, should be %u",			\
 			iter->pos.inode, iter->pos.offset,		\
@@ -1491,7 +1490,7 @@ static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
 static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
 {
 	for_each_member_device(c, ca) {
-		struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
+		struct bucket_array *buckets = kvmalloc(sizeof(struct bucket_array) +
 				ca->mi.nbuckets * sizeof(struct bucket),
 				GFP_KERNEL|__GFP_ZERO);
 		if (!buckets) {
@@ -1585,8 +1584,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
 			"  should be %u",
 			(bch2_bkey_val_to_text(&buf, c, k), buf.buf),
 			r->refcount)) {
-		struct bkey_i *new = bch2_bkey_make_mut(trans, iter, &k, 0);
-
+		struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
 		ret = PTR_ERR_OR_ZERO(new);
 		if (ret)
 			return ret;
@@ -1595,6 +1593,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
 			new->k.type = KEY_TYPE_deleted;
 		else
 			*bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount);
+		ret = bch2_trans_update(trans, iter, new, 0);
 	}
 fsck_err:
 	printbuf_exit(&buf);
@@ -1817,10 +1816,10 @@ out:
 	if (!ret) {
 		bch2_journal_block(&c->journal);
 
-		ret   = bch2_gc_stripes_done(c, metadata_only) ?:
-			bch2_gc_reflink_done(c, metadata_only) ?:
-			bch2_gc_alloc_done(c, metadata_only) ?:
-			bch2_gc_done(c, initial, metadata_only);
+		ret   = bch2_gc_alloc_done(c, metadata_only) ?:
+			bch2_gc_done(c, initial, metadata_only) ?:
+			bch2_gc_stripes_done(c, metadata_only) ?:
+			bch2_gc_reflink_done(c, metadata_only);
 
 		bch2_journal_unblock(&c->journal);
 	}
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index aa9b6cbe3226..9678b2375bed 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -103,7 +103,7 @@ static void btree_bounce_free(struct bch_fs *c, size_t size,
 	if (used_mempool)
 		mempool_free(p, &c->btree_bounce_pool);
 	else
-		vpfree(p, size);
+		kvfree(p);
 }
 
 static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
@@ -115,7 +115,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
 	BUG_ON(size > c->opts.btree_node_size);
 
 	*used_mempool = false;
-	p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
+	p = kvmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
 	if (!p) {
 		*used_mempool = true;
 		p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
@@ -581,8 +581,7 @@ static int __btree_err(int ret,
 		break;
 	case -BCH_ERR_btree_node_read_err_bad_node:
 		bch2_print_string_as_lines(KERN_ERR, out.buf);
-		bch2_topology_error(c);
-		ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: -EIO;
+		ret = bch2_topology_error(c);
 		break;
 	case -BCH_ERR_btree_node_read_err_incompatible:
 		bch2_print_string_as_lines(KERN_ERR, out.buf);
@@ -655,6 +654,7 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b)
 	 */
 	bch2_bset_set_no_aux_tree(b, b->set);
 	bch2_btree_build_aux_trees(b);
+	b->nr = bch2_btree_node_count_keys(b);
 
 	struct bkey_s_c k;
 	struct bkey unpacked;
@@ -831,7 +831,7 @@ static int bset_key_invalid(struct bch_fs *c, struct btree *b,
 		(rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0);
 }
 
-static bool __bkey_valid(struct bch_fs *c, struct btree *b,
+static bool bkey_packed_valid(struct bch_fs *c, struct btree *b,
 			 struct bset *i, struct bkey_packed *k)
 {
 	if (bkey_p_next(k) > vstruct_last(i))
@@ -840,6 +840,9 @@ static bool __bkey_valid(struct bch_fs *c, struct btree *b,
 	if (k->format > KEY_FORMAT_CURRENT)
 		return false;
 
+	if (!bkeyp_u64s_valid(&b->format, k))
+		return false;
+
 	struct printbuf buf = PRINTBUF;
 	struct bkey tmp;
 	struct bkey_s u = __bkey_disassemble(b, k, &tmp);
@@ -881,7 +884,15 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
 				 "invalid bkey format %u", k->format))
 			goto drop_this_key;
 
-		/* XXX: validate k->u64s */
+		if (btree_err_on(!bkeyp_u64s_valid(&b->format, k),
+				 -BCH_ERR_btree_node_read_err_fixable,
+				 c, NULL, b, i,
+				 btree_node_bkey_bad_u64s,
+				 "bad k->u64s %u (min %u max %lu)", k->u64s,
+				 bkeyp_key_u64s(&b->format, k),
+				 U8_MAX - BKEY_U64s + bkeyp_key_u64s(&b->format, k)))
+			goto drop_this_key;
+
 		if (!write)
 			bch2_bkey_compat(b->c.level, b->c.btree_id, version,
 				    BSET_BIG_ENDIAN(i), write,
@@ -938,13 +949,12 @@ drop_this_key:
 			 * do
 			 */
 
-			if (!__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
+			if (!bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
 				for (next_good_key = 1;
 				     next_good_key < (u64 *) vstruct_last(i) - (u64 *) k;
 				     next_good_key++)
-					if (__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
+					if (bkey_packed_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
 						goto got_good_key;
-
 			}
 
 			/*
@@ -1058,7 +1068,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
 
 			ret = bset_encrypt(c, i, b->written << 9);
 			if (bch2_fs_fatal_err_on(ret, c,
-					"error decrypting btree node: %i", ret))
+					"decrypting btree node: %s", bch2_err_str(ret)))
 				goto fsck_err;
 
 			btree_err_on(btree_node_type_is_extents(btree_node_type(b)) &&
@@ -1099,7 +1109,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
 
 			ret = bset_encrypt(c, i, b->written << 9);
 			if (bch2_fs_fatal_err_on(ret, c,
-					"error decrypting btree node: %i\n", ret))
+					"decrypting btree node: %s", bch2_err_str(ret)))
 				goto fsck_err;
 
 			sectors = vstruct_sectors(bne, c->block_bits);
@@ -1255,10 +1265,12 @@ out:
 	return retry_read;
 fsck_err:
 	if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
-	    ret == -BCH_ERR_btree_node_read_err_must_retry)
+	    ret == -BCH_ERR_btree_node_read_err_must_retry) {
 		retry_read = 1;
-	else
+	} else {
 		set_btree_node_read_error(b);
+		bch2_btree_lost_data(c, b->c.btree_id);
+	}
 	goto out;
 }
 
@@ -1319,6 +1331,7 @@ start:
 
 		if (!can_retry) {
 			set_btree_node_read_error(b);
+			bch2_btree_lost_data(c, b->c.btree_id);
 			break;
 		}
 	}
@@ -1327,10 +1340,12 @@ start:
 			       rb->start_time);
 	bio_put(&rb->bio);
 
-	if (saw_error && !btree_node_read_error(b)) {
+	if (saw_error &&
+	    !btree_node_read_error(b) &&
+	    c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) {
 		printbuf_reset(&buf);
 		bch2_bpos_to_text(&buf, b->key.k.p);
-		bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
+		bch_err_ratelimited(c, "%s: rewriting btree node at btree=%s level=%u %s due to error",
 			 __func__, bch2_btree_id_str(b->c.btree_id), b->c.level, buf.buf);
 
 		bch2_btree_node_rewrite_async(c, b);
@@ -1518,9 +1533,10 @@ fsck_err:
 		ret = -1;
 	}
 
-	if (ret)
+	if (ret) {
 		set_btree_node_read_error(b);
-	else if (*saw_error)
+		bch2_btree_lost_data(c, b->c.btree_id);
+	} else if (*saw_error)
 		bch2_btree_node_rewrite_async(c, b);
 
 	for (i = 0; i < ra->nr; i++) {
@@ -1649,13 +1665,14 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
 
 		prt_str(&buf, "btree node read error: no device to read from\n at ");
 		bch2_btree_pos_to_text(&buf, c, b);
-		bch_err(c, "%s", buf.buf);
+		bch_err_ratelimited(c, "%s", buf.buf);
 
 		if (c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology) &&
 		    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology)
 			bch2_fatal_error(c);
 
 		set_btree_node_read_error(b);
+		bch2_btree_lost_data(c, b->c.btree_id);
 		clear_btree_node_read_in_flight(b);
 		wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);
 		printbuf_exit(&buf);
@@ -1737,7 +1754,7 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
 		list_move(&b->list, &c->btree_cache.freeable);
 		mutex_unlock(&c->btree_cache.lock);
 
-		ret = -EIO;
+		ret = -BCH_ERR_btree_node_read_error;
 		goto err;
 	}
 
@@ -1841,7 +1858,7 @@ static void btree_node_write_work(struct work_struct *work)
 		bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
 
 	if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
-		ret = -BCH_ERR_btree_write_all_failed;
+		ret = -BCH_ERR_btree_node_write_all_failed;
 		goto err;
 	}
 
@@ -1852,7 +1869,7 @@ static void btree_node_write_work(struct work_struct *work)
 	} else {
 		ret = bch2_trans_do(c, NULL, NULL, 0,
 			bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
-					BCH_WATERMARK_reclaim|
+					BCH_WATERMARK_interior_updates|
 					BCH_TRANS_COMMIT_journal_reclaim|
 					BCH_TRANS_COMMIT_no_enospc|
 					BCH_TRANS_COMMIT_no_check_rw,
@@ -1866,8 +1883,8 @@ out:
 	return;
 err:
 	set_btree_node_noevict(b);
-	if (!bch2_err_matches(ret, EROFS))
-		bch2_fs_fatal_error(c, "fatal error writing btree node: %s", bch2_err_str(ret));
+	bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c,
+			     "writing btree node: %s", bch2_err_str(ret));
 	goto out;
 }
 
@@ -2123,7 +2140,7 @@ do_write:
 
 	ret = bset_encrypt(c, i, b->written << 9);
 	if (bch2_fs_fatal_err_on(ret, c,
-			"error encrypting btree node: %i\n", ret))
+			"encrypting btree node: %s", bch2_err_str(ret)))
 		goto err;
 
 	nonce = btree_nonce(i, b->written << 9);
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index 3ef338df82f5..2a211a4bebd1 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -891,7 +891,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
 	struct bkey_s_c k;
 	int ret = 0;
 
-	__bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos);
+	__bch2_btree_and_journal_iter_init_node_iter(trans, &jiter, l->b, l->iter, path->pos);
 
 	k = bch2_btree_and_journal_iter_peek(&jiter);
 
@@ -927,8 +927,22 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
 		if (ret)
 			goto err;
 	} else {
-		bch2_bkey_buf_unpack(&tmp, c, l->b,
-				 bch2_btree_node_iter_peek(&l->iter, l->b));
+		struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b);
+		if (!k) {
+			struct printbuf buf = PRINTBUF;
+
+			prt_str(&buf, "node not found at pos ");
+			bch2_bpos_to_text(&buf, path->pos);
+			prt_str(&buf, " within parent node ");
+			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&l->b->key));
+
+			bch2_fs_fatal_error(c, "%s", buf.buf);
+			printbuf_exit(&buf);
+			ret = -BCH_ERR_btree_need_topology_repair;
+			goto err;
+		}
+
+		bch2_bkey_buf_unpack(&tmp, c, l->b, k);
 
 		if ((flags & BTREE_ITER_PREFETCH) &&
 		    c->opts.btree_node_prefetch) {
@@ -962,7 +976,6 @@ err:
 	return ret;
 }
 
-
 static int bch2_btree_path_traverse_all(struct btree_trans *trans)
 {
 	struct bch_fs *c = trans->c;
@@ -1146,7 +1159,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
 	path = &trans->paths[path_idx];
 
 	if (unlikely(path->level >= BTREE_MAX_DEPTH))
-		goto out;
+		goto out_uptodate;
 
 	path->level = btree_path_up_until_good_node(trans, path, 0);
 
@@ -1179,7 +1192,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
 			goto out;
 		}
 	}
-
+out_uptodate:
 	path->uptodate = BTREE_ITER_UPTODATE;
 out:
 	if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted)
@@ -1520,7 +1533,7 @@ static noinline void btree_paths_realloc(struct btree_trans *trans)
 {
 	unsigned nr = trans->nr_paths * 2;
 
-	void *p = kzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) +
+	void *p = kvzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) +
 			  sizeof(struct btree_trans_paths) +
 			  nr * sizeof(struct btree_path) +
 			  nr * sizeof(btree_path_idx_t) + 8 +
@@ -1729,7 +1742,9 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
 	if (ret)
 		return ret;
 
-	btree_path_set_should_be_locked(trans->paths + iter->path);
+	struct btree_path *path = btree_iter_path(trans, iter);
+	if (btree_path_node(path, path->level))
+		btree_path_set_should_be_locked(path);
 	return 0;
 }
 
@@ -2305,7 +2320,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
 		btree_iter_path(trans, iter)->level);
 
 	if (iter->flags & BTREE_ITER_WITH_JOURNAL)
-		return bkey_s_c_err(-EIO);
+		return bkey_s_c_err(-BCH_ERR_btree_iter_with_journal_not_supported);
 
 	bch2_btree_iter_verify(iter);
 	bch2_btree_iter_verify_entry_exit(iter);
@@ -2503,6 +2518,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
 			k = bch2_btree_iter_peek_upto(&iter2, end);
 
 			if (k.k && !bkey_err(k)) {
+				swap(iter->key_cache_path, iter2.key_cache_path);
 				iter->k = iter2.k;
 				k.k = &iter->k;
 			}
@@ -2762,6 +2778,9 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
 	struct btree_trans *trans = src->trans;
 
 	*dst = *src;
+#ifdef TRACK_PATH_ALLOCATED
+	dst->ip_allocated = _RET_IP_;
+#endif
 	if (src->path)
 		__btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_INTENT);
 	if (src->update_path)
@@ -2784,6 +2803,31 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
 	struct btree_transaction_stats *s = btree_trans_stats(trans);
 	s->max_mem = max(s->max_mem, new_bytes);
 
+	if (trans->used_mempool) {
+		if (trans->mem_bytes >= new_bytes)
+			goto out_change_top;
+
+		/* No more space from mempool item, need malloc new one */
+		new_mem = kmalloc(new_bytes, GFP_NOWAIT|__GFP_NOWARN);
+		if (unlikely(!new_mem)) {
+			bch2_trans_unlock(trans);
+
+			new_mem = kmalloc(new_bytes, GFP_KERNEL);
+			if (!new_mem)
+				return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
+
+			ret = bch2_trans_relock(trans);
+			if (ret) {
+				kfree(new_mem);
+				return ERR_PTR(ret);
+			}
+		}
+		memcpy(new_mem, trans->mem, trans->mem_top);
+		trans->used_mempool = false;
+		mempool_free(trans->mem, &c->btree_trans_mem_pool);
+		goto out_new_mem;
+	}
+
 	new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
 	if (unlikely(!new_mem)) {
 		bch2_trans_unlock(trans);
@@ -2792,6 +2836,8 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
 		if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
 			new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
 			new_bytes = BTREE_TRANS_MEM_MAX;
+			memcpy(new_mem, trans->mem, trans->mem_top);
+			trans->used_mempool = true;
 			kfree(trans->mem);
 		}
 
@@ -2805,7 +2851,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
 		if (ret)
 			return ERR_PTR(ret);
 	}
-
+out_new_mem:
 	trans->mem = new_mem;
 	trans->mem_bytes = new_bytes;
 
@@ -2813,7 +2859,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
 		trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
 		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
 	}
-
+out_change_top:
 	p = trans->mem + trans->mem_top;
 	trans->mem_top += size;
 	memset(p, 0, size);
@@ -3085,9 +3131,9 @@ void bch2_trans_put(struct btree_trans *trans)
 	trans->paths		= NULL;
 
 	if (paths_allocated != trans->_paths_allocated)
-		kfree_rcu_mightsleep(paths_allocated);
+		kvfree_rcu_mightsleep(paths_allocated);
 
-	if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
+	if (trans->used_mempool)
 		mempool_free(trans->mem, &c->btree_trans_mem_pool);
 	else
 		kfree(trans->mem);
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index 24772538e4cc..1c70836dd7cc 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -498,8 +498,13 @@ static inline void set_btree_iter_dontneed(struct btree_iter *iter)
 {
 	struct btree_trans *trans = iter->trans;
 
-	if (!trans->restarted)
-		btree_iter_path(trans, iter)->preserve = false;
+	if (!iter->path || trans->restarted)
+		return;
+
+	struct btree_path *path = btree_iter_path(trans, iter);
+	path->preserve		= false;
+	if (path->ref == 1)
+		path->should_be_locked	= false;
 }
 
 void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
@@ -642,7 +647,7 @@ int __bch2_btree_trans_too_many_iters(struct btree_trans *);
 
 static inline int btree_trans_too_many_iters(struct btree_trans *trans)
 {
-	if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_INITIAL - 8)
+	if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_NORMAL_LIMIT - 8)
 		return __bch2_btree_trans_too_many_iters(trans);
 
 	return 0;
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index 719a94a84950..1e8cf49a6935 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -1,7 +1,9 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
+#include "bkey_buf.h"
 #include "bset.h"
+#include "btree_cache.h"
 #include "btree_journal_iter.h"
 #include "journal_io.h"
 
@@ -40,7 +42,7 @@ static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx)
 
 static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx)
 {
-	return keys->d + idx_to_pos(keys, idx);
+	return keys->data + idx_to_pos(keys, idx);
 }
 
 static size_t __bch2_journal_key_search(struct journal_keys *keys,
@@ -128,12 +130,30 @@ struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree
 	return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
 }
 
+static void journal_iter_verify(struct journal_iter *iter)
+{
+	struct journal_keys *keys = iter->keys;
+	size_t gap_size = keys->size - keys->nr;
+
+	BUG_ON(iter->idx >= keys->gap &&
+	       iter->idx <  keys->gap + gap_size);
+
+	if (iter->idx < keys->size) {
+		struct journal_key *k = keys->data + iter->idx;
+
+		int cmp = cmp_int(k->btree_id,	iter->btree_id) ?:
+			  cmp_int(k->level,	iter->level);
+		BUG_ON(cmp < 0);
+	}
+}
+
 static void journal_iters_fix(struct bch_fs *c)
 {
 	struct journal_keys *keys = &c->journal_keys;
 	/* The key we just inserted is immediately before the gap: */
 	size_t gap_end = keys->gap + (keys->size - keys->nr);
-	struct btree_and_journal_iter *iter;
+	struct journal_key *new_key = &keys->data[keys->gap - 1];
+	struct journal_iter *iter;
 
 	/*
 	 * If an iterator points one after the key we just inserted, decrement
@@ -141,9 +161,14 @@ static void journal_iters_fix(struct bch_fs *c)
 	 * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
 	 * handle that:
 	 */
-	list_for_each_entry(iter, &c->journal_iters, journal.list)
-		if (iter->journal.idx == gap_end)
-			iter->journal.idx = keys->gap - 1;
+	list_for_each_entry(iter, &c->journal_iters, list) {
+		journal_iter_verify(iter);
+		if (iter->idx		== gap_end &&
+		    new_key->btree_id	== iter->btree_id &&
+		    new_key->level	== iter->level)
+			iter->idx = keys->gap - 1;
+		journal_iter_verify(iter);
+	}
 }
 
 static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
@@ -180,33 +205,38 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
 	BUG_ON(test_bit(BCH_FS_rw, &c->flags));
 
 	if (idx < keys->size &&
-	    journal_key_cmp(&n, &keys->d[idx]) == 0) {
-		if (keys->d[idx].allocated)
-			kfree(keys->d[idx].k);
-		keys->d[idx] = n;
+	    journal_key_cmp(&n, &keys->data[idx]) == 0) {
+		if (keys->data[idx].allocated)
+			kfree(keys->data[idx].k);
+		keys->data[idx] = n;
 		return 0;
 	}
 
 	if (idx > keys->gap)
 		idx -= keys->size - keys->nr;
 
+	size_t old_gap = keys->gap;
+
 	if (keys->nr == keys->size) {
+		journal_iters_move_gap(c, old_gap, keys->size);
+		old_gap = keys->size;
+
 		struct journal_keys new_keys = {
 			.nr			= keys->nr,
 			.size			= max_t(size_t, keys->size, 8) * 2,
 		};
 
-		new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL);
-		if (!new_keys.d) {
+		new_keys.data = kvmalloc_array(new_keys.size, sizeof(new_keys.data[0]), GFP_KERNEL);
+		if (!new_keys.data) {
 			bch_err(c, "%s: error allocating new key array (size %zu)",
 				__func__, new_keys.size);
 			return -BCH_ERR_ENOMEM_journal_key_insert;
 		}
 
 		/* Since @keys was full, there was no gap: */
-		memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr);
-		kvfree(keys->d);
-		keys->d		= new_keys.d;
+		memcpy(new_keys.data, keys->data, sizeof(keys->data[0]) * keys->nr);
+		kvfree(keys->data);
+		keys->data	= new_keys.data;
 		keys->nr	= new_keys.nr;
 		keys->size	= new_keys.size;
 
@@ -214,13 +244,12 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
 		keys->gap	= keys->nr;
 	}
 
-	journal_iters_move_gap(c, keys->gap, idx);
+	journal_iters_move_gap(c, old_gap, idx);
 
-	move_gap(keys->d, keys->nr, keys->size, keys->gap, idx);
-	keys->gap = idx;
+	move_gap(keys, idx);
 
 	keys->nr++;
-	keys->d[keys->gap++] = n;
+	keys->data[keys->gap++] = n;
 
 	journal_iters_fix(c);
 
@@ -260,6 +289,22 @@ int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id,
 	return bch2_journal_key_insert(c, id, level, &whiteout);
 }
 
+bool bch2_key_deleted_in_journal(struct btree_trans *trans, enum btree_id btree,
+				 unsigned level, struct bpos pos)
+{
+	struct journal_keys *keys = &trans->c->journal_keys;
+	size_t idx = bch2_journal_key_search(keys, btree, level, pos);
+
+	if (!trans->journal_replay_not_finished)
+		return false;
+
+	return (idx < keys->size &&
+		keys->data[idx].btree_id	== btree &&
+		keys->data[idx].level		== level &&
+		bpos_eq(keys->data[idx].k->k.p, pos) &&
+		bkey_deleted(&keys->data[idx].k->k));
+}
+
 void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
 				  unsigned level, struct bpos pos)
 {
@@ -267,10 +312,10 @@ void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree,
 	size_t idx = bch2_journal_key_search(keys, btree, level, pos);
 
 	if (idx < keys->size &&
-	    keys->d[idx].btree_id	== btree &&
-	    keys->d[idx].level		== level &&
-	    bpos_eq(keys->d[idx].k->k.p, pos))
-		keys->d[idx].overwritten = true;
+	    keys->data[idx].btree_id	== btree &&
+	    keys->data[idx].level	== level &&
+	    bpos_eq(keys->data[idx].k->k.p, pos))
+		keys->data[idx].overwritten = true;
 }
 
 static void bch2_journal_iter_advance(struct journal_iter *iter)
@@ -284,16 +329,21 @@ static void bch2_journal_iter_advance(struct journal_iter *iter)
 
 static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
 {
-	struct journal_key *k = iter->keys->d + iter->idx;
+	journal_iter_verify(iter);
+
+	while (iter->idx < iter->keys->size) {
+		struct journal_key *k = iter->keys->data + iter->idx;
+
+		int cmp = cmp_int(k->btree_id,	iter->btree_id) ?:
+			  cmp_int(k->level,	iter->level);
+		if (cmp > 0)
+			break;
+		BUG_ON(cmp);
 
-	while (k < iter->keys->d + iter->keys->size &&
-	       k->btree_id	== iter->btree_id &&
-	       k->level		== iter->level) {
 		if (!k->overwritten)
 			return bkey_i_to_s_c(k->k);
 
 		bch2_journal_iter_advance(iter);
-		k = iter->keys->d + iter->idx;
 	}
 
 	return bkey_s_c_null;
@@ -313,6 +363,8 @@ static void bch2_journal_iter_init(struct bch_fs *c,
 	iter->level	= level;
 	iter->keys	= &c->journal_keys;
 	iter->idx	= bch2_journal_key_search(&c->journal_keys, id, level, pos);
+
+	journal_iter_verify(iter);
 }
 
 static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
@@ -334,9 +386,38 @@ void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter)
 		iter->pos = bpos_successor(iter->pos);
 }
 
+static void btree_and_journal_iter_prefetch(struct btree_and_journal_iter *_iter)
+{
+	struct btree_and_journal_iter iter = *_iter;
+	struct bch_fs *c = iter.trans->c;
+	unsigned level = iter.journal.level;
+	struct bkey_buf tmp;
+	unsigned nr = test_bit(BCH_FS_started, &c->flags)
+		? (level > 1 ? 0 :  2)
+		: (level > 1 ? 1 : 16);
+
+	iter.prefetch = false;
+	bch2_bkey_buf_init(&tmp);
+
+	while (nr--) {
+		bch2_btree_and_journal_iter_advance(&iter);
+		struct bkey_s_c k = bch2_btree_and_journal_iter_peek(&iter);
+		if (!k.k)
+			break;
+
+		bch2_bkey_buf_reassemble(&tmp, c, k);
+		bch2_btree_node_prefetch(iter.trans, NULL, tmp.k, iter.journal.btree_id, level - 1);
+	}
+
+	bch2_bkey_buf_exit(&tmp, c);
+}
+
 struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter)
 {
-	struct bkey_s_c btree_k, journal_k, ret;
+	struct bkey_s_c btree_k, journal_k = bkey_s_c_null, ret;
+
+	if (iter->prefetch && iter->journal.level)
+		btree_and_journal_iter_prefetch(iter);
 again:
 	if (iter->at_end)
 		return bkey_s_c_null;
@@ -345,9 +426,10 @@ again:
 	       bpos_lt(btree_k.k->p, iter->pos))
 		bch2_journal_iter_advance_btree(iter);
 
-	while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
-	       bpos_lt(journal_k.k->p, iter->pos))
-		bch2_journal_iter_advance(&iter->journal);
+	if (iter->trans->journal_replay_not_finished)
+		while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k &&
+		       bpos_lt(journal_k.k->p, iter->pos))
+			bch2_journal_iter_advance(&iter->journal);
 
 	ret = journal_k.k &&
 		(!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p))
@@ -376,35 +458,40 @@ void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter)
 	bch2_journal_iter_exit(&iter->journal);
 }
 
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
-						  struct bch_fs *c,
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
+						  struct btree_and_journal_iter *iter,
 						  struct btree *b,
 						  struct btree_node_iter node_iter,
 						  struct bpos pos)
 {
 	memset(iter, 0, sizeof(*iter));
 
+	iter->trans = trans;
 	iter->b = b;
 	iter->node_iter = node_iter;
-	bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos);
-	INIT_LIST_HEAD(&iter->journal.list);
 	iter->pos = b->data->min_key;
 	iter->at_end = false;
+	INIT_LIST_HEAD(&iter->journal.list);
+
+	if (trans->journal_replay_not_finished) {
+		bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
+		if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags))
+			list_add(&iter->journal.list, &trans->c->journal_iters);
+	}
 }
 
 /*
  * this version is used by btree_gc before filesystem has gone RW and
  * multithreaded, so uses the journal_iters list:
  */
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter,
-						struct bch_fs *c,
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
+						struct btree_and_journal_iter *iter,
 						struct btree *b)
 {
 	struct btree_node_iter node_iter;
 
 	bch2_btree_node_iter_init_from_start(&node_iter, b);
-	__bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key);
-	list_add(&iter->journal.list, &c->journal_iters);
+	__bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
 }
 
 /* sort and dedup all keys in the journal: */
@@ -415,9 +502,7 @@ void bch2_journal_entries_free(struct bch_fs *c)
 	struct genradix_iter iter;
 
 	genradix_for_each(&c->journal_entries, iter, i)
-		if (*i)
-			kvpfree(*i, offsetof(struct journal_replay, j) +
-				vstruct_bytes(&(*i)->j));
+		kvfree(*i);
 	genradix_free(&c->journal_entries);
 }
 
@@ -437,22 +522,20 @@ static int journal_sort_key_cmp(const void *_l, const void *_r)
 void bch2_journal_keys_put(struct bch_fs *c)
 {
 	struct journal_keys *keys = &c->journal_keys;
-	struct journal_key *i;
 
 	BUG_ON(atomic_read(&keys->ref) <= 0);
 
 	if (!atomic_dec_and_test(&keys->ref))
 		return;
 
-	move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
-	keys->gap = keys->nr;
+	move_gap(keys, keys->nr);
 
-	for (i = keys->d; i < keys->d + keys->nr; i++)
+	darray_for_each(*keys, i)
 		if (i->allocated)
 			kfree(i->k);
 
-	kvfree(keys->d);
-	keys->d = NULL;
+	kvfree(keys->data);
+	keys->data = NULL;
 	keys->nr = keys->gap = keys->size = 0;
 
 	bch2_journal_entries_free(c);
@@ -460,83 +543,38 @@ void bch2_journal_keys_put(struct bch_fs *c)
 
 static void __journal_keys_sort(struct journal_keys *keys)
 {
-	struct journal_key *src, *dst;
+	sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL);
 
-	sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL);
+	struct journal_key *dst = keys->data;
 
-	src = dst = keys->d;
-	while (src < keys->d + keys->nr) {
-		while (src + 1 < keys->d + keys->nr &&
-		       !journal_key_cmp(src, src + 1))
-			src++;
+	darray_for_each(*keys, src) {
+		if (src + 1 < &darray_top(*keys) &&
+		    !journal_key_cmp(src, src + 1))
+			continue;
 
-		*dst++ = *src++;
+		*dst++ = *src;
 	}
 
-	keys->nr = dst - keys->d;
+	keys->nr = dst - keys->data;
 }
 
 int bch2_journal_keys_sort(struct bch_fs *c)
 {
 	struct genradix_iter iter;
 	struct journal_replay *i, **_i;
-	struct jset_entry *entry;
-	struct bkey_i *k;
 	struct journal_keys *keys = &c->journal_keys;
-	size_t nr_keys = 0, nr_read = 0;
+	size_t nr_read = 0;
 
 	genradix_for_each(&c->journal_entries, iter, _i) {
 		i = *_i;
 
-		if (!i || i->ignore)
-			continue;
-
-		for_each_jset_key(k, entry, &i->j)
-			nr_keys++;
-	}
-
-	if (!nr_keys)
-		return 0;
-
-	keys->size = roundup_pow_of_two(nr_keys);
-
-	keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
-	if (!keys->d) {
-		bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath",
-			nr_keys);
-
-		do {
-			keys->size >>= 1;
-			keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL);
-		} while (!keys->d && keys->size > nr_keys / 8);
-
-		if (!keys->d) {
-			bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting",
-				keys->size);
-			return -BCH_ERR_ENOMEM_journal_keys_sort;
-		}
-	}
-
-	genradix_for_each(&c->journal_entries, iter, _i) {
-		i = *_i;
-
-		if (!i || i->ignore)
+		if (journal_replay_ignore(i))
 			continue;
 
 		cond_resched();
 
 		for_each_jset_key(k, entry, &i->j) {
-			if (keys->nr == keys->size) {
-				__journal_keys_sort(keys);
-
-				if (keys->nr > keys->size * 7 / 8) {
-					bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu",
-						keys->nr, keys->size, nr_read, nr_keys);
-					return -BCH_ERR_ENOMEM_journal_keys_sort;
-				}
-			}
-
-			keys->d[keys->nr++] = (struct journal_key) {
+			struct journal_key n = (struct journal_key) {
 				.btree_id	= entry->btree_id,
 				.level		= entry->level,
 				.k		= k,
@@ -544,6 +582,18 @@ int bch2_journal_keys_sort(struct bch_fs *c)
 				.journal_offset	= k->_data - i->j._data,
 			};
 
+			if (darray_push(keys, n)) {
+				__journal_keys_sort(keys);
+
+				if (keys->nr * 8 > keys->size * 7) {
+					bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu",
+						keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq));
+					return -BCH_ERR_ENOMEM_journal_keys_sort;
+				}
+
+				BUG_ON(darray_push(keys, n));
+			}
+
 			nr_read++;
 		}
 	}
@@ -551,6 +601,25 @@ int bch2_journal_keys_sort(struct bch_fs *c)
 	__journal_keys_sort(keys);
 	keys->gap = keys->nr;
 
-	bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr);
+	bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_read, keys->nr);
 	return 0;
 }
+
+void bch2_shoot_down_journal_keys(struct bch_fs *c, enum btree_id btree,
+				  unsigned level_min, unsigned level_max,
+				  struct bpos start, struct bpos end)
+{
+	struct journal_keys *keys = &c->journal_keys;
+	size_t dst = 0;
+
+	move_gap(keys, keys->nr);
+
+	darray_for_each(*keys, i)
+		if (!(i->btree_id == btree &&
+		      i->level >= level_min &&
+		      i->level <= level_max &&
+		      bpos_ge(i->k->k.p, start) &&
+		      bpos_le(i->k->k.p, end)))
+			keys->data[dst++] = *i;
+	keys->nr = keys->gap = dst;
+}
diff --git a/fs/bcachefs/btree_journal_iter.h b/fs/bcachefs/btree_journal_iter.h
index 8ca4c100b2e3..af25046ebcaa 100644
--- a/fs/bcachefs/btree_journal_iter.h
+++ b/fs/bcachefs/btree_journal_iter.h
@@ -15,6 +15,7 @@ struct journal_iter {
  */
 
 struct btree_and_journal_iter {
+	struct btree_trans	*trans;
 	struct btree		*b;
 	struct btree_node_iter	node_iter;
 	struct bkey		unpacked;
@@ -22,6 +23,7 @@ struct btree_and_journal_iter {
 	struct journal_iter	journal;
 	struct bpos		pos;
 	bool			at_end;
+	bool			prefetch;
 };
 
 struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
@@ -29,25 +31,27 @@ struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id,
 struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id,
 					   unsigned, struct bpos);
 
+int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *,
+					 struct btree_and_journal_iter *);
+
 int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id,
 				 unsigned, struct bkey_i *);
 int bch2_journal_key_insert(struct bch_fs *, enum btree_id,
 			    unsigned, struct bkey_i *);
 int bch2_journal_key_delete(struct bch_fs *, enum btree_id,
 			    unsigned, struct bpos);
-void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id,
-				  unsigned, struct bpos);
+bool bch2_key_deleted_in_journal(struct btree_trans *, enum btree_id, unsigned, struct bpos);
+void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, unsigned, struct bpos);
 
 void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *);
 struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *);
 
 void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *);
-void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
-				struct bch_fs *, struct btree *,
+void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
+				struct btree_and_journal_iter *, struct btree *,
 				struct btree_node_iter, struct bpos);
-void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *,
-						struct bch_fs *,
-						struct btree *);
+void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *,
+				struct btree_and_journal_iter *, struct btree *);
 
 void bch2_journal_keys_put(struct bch_fs *);
 
@@ -62,4 +66,8 @@ void bch2_journal_entries_free(struct bch_fs *);
 
 int bch2_journal_keys_sort(struct bch_fs *);
 
+void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id,
+				  unsigned, unsigned,
+				  struct bpos, struct bpos);
+
 #endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 74e52fd28abe..88a3582a3275 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -169,6 +169,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
 	} else {
 		mutex_lock(&bc->lock);
 		list_move_tail(&ck->list, &bc->freed_pcpu);
+		bc->nr_freed_pcpu++;
 		mutex_unlock(&bc->lock);
 	}
 }
@@ -245,6 +246,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
 		if (!list_empty(&bc->freed_pcpu)) {
 			ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
 			list_del_init(&ck->list);
+			bc->nr_freed_pcpu--;
 		}
 		mutex_unlock(&bc->lock);
 	}
@@ -380,9 +382,11 @@ static int btree_key_cache_fill(struct btree_trans *trans,
 	struct bkey_i *new_k = NULL;
 	int ret;
 
-	k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos,
-			       BTREE_ITER_KEY_CACHE_FILL|
-			       BTREE_ITER_CACHED_NOFILL);
+	bch2_trans_iter_init(trans, &iter, ck->key.btree_id, ck->key.pos,
+			     BTREE_ITER_KEY_CACHE_FILL|
+			     BTREE_ITER_CACHED_NOFILL);
+	iter.flags &= ~BTREE_ITER_WITH_JOURNAL;
+	k = bch2_btree_iter_peek_slot(&iter);
 	ret = bkey_err(k);
 	if (ret)
 		goto err;
@@ -657,7 +661,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
 		commit_flags |= BCH_WATERMARK_reclaim;
 
 	if (ck->journal.seq != journal_last_seq(j) ||
-	    j->watermark == BCH_WATERMARK_stripe)
+	    !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags))
 		commit_flags |= BCH_TRANS_COMMIT_no_journal_res;
 
 	ret   = bch2_btree_iter_traverse(&b_iter) ?:
@@ -674,7 +678,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
 			     !bch2_err_matches(ret, BCH_ERR_transaction_restart) &&
 			     !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) &&
 			     !bch2_journal_error(j), c,
-			     "error flushing key cache: %s", bch2_err_str(ret));
+			     "flushing key cache: %s", bch2_err_str(ret));
 	if (ret)
 		goto out;
 
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index 684397442338..f2caf491957e 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -440,33 +440,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
 				       struct btree_path *path,
 				       struct btree_bkey_cached_common *b)
 {
-	struct btree_path *linked;
-	unsigned i, iter;
-	int ret;
-
-	/*
-	 * XXX BIG FAT NOTICE
-	 *
-	 * Drop all read locks before taking a write lock:
-	 *
-	 * This is a hack, because bch2_btree_node_lock_write_nofail() is a
-	 * hack - but by dropping read locks first, this should never fail, and
-	 * we only use this in code paths where whatever read locks we've
-	 * already taken are no longer needed:
-	 */
-
-	trans_for_each_path(trans, linked, iter) {
-		if (!linked->nodes_locked)
-			continue;
-
-		for (i = 0; i < BTREE_MAX_DEPTH; i++)
-			if (btree_node_read_locked(linked, i)) {
-				btree_node_unlock(trans, linked, i);
-				btree_path_set_dirty(linked, BTREE_ITER_NEED_RELOCK);
-			}
-	}
-
-	ret = __btree_node_lock_write(trans, path, b, true);
+	int ret = __btree_node_lock_write(trans, path, b, true);
 	BUG_ON(ret);
 }
 
@@ -747,7 +721,8 @@ void bch2_trans_downgrade(struct btree_trans *trans)
 		return;
 
 	trans_for_each_path(trans, path, i)
-		bch2_btree_path_downgrade(trans, path);
+		if (path->ref)
+			bch2_btree_path_downgrade(trans, path);
 }
 
 int bch2_trans_relock(struct btree_trans *trans)
diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c
new file mode 100644
index 000000000000..866bd278439f
--- /dev/null
+++ b/fs/bcachefs/btree_node_scan.c
@@ -0,0 +1,519 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "btree_cache.h"
+#include "btree_io.h"
+#include "btree_journal_iter.h"
+#include "btree_node_scan.h"
+#include "btree_update_interior.h"
+#include "buckets.h"
+#include "error.h"
+#include "journal_io.h"
+#include "recovery_passes.h"
+
+#include <linux/kthread.h>
+#include <linux/sort.h>
+
+struct find_btree_nodes_worker {
+	struct closure		*cl;
+	struct find_btree_nodes	*f;
+	struct bch_dev		*ca;
+};
+
+static void found_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struct found_btree_node *n)
+{
+	prt_printf(out, "%s l=%u seq=%u cookie=%llx ", bch2_btree_id_str(n->btree_id), n->level, n->seq, n->cookie);
+	bch2_bpos_to_text(out, n->min_key);
+	prt_str(out, "-");
+	bch2_bpos_to_text(out, n->max_key);
+
+	if (n->range_updated)
+		prt_str(out, " range updated");
+	if (n->overwritten)
+		prt_str(out, " overwritten");
+
+	for (unsigned i = 0; i < n->nr_ptrs; i++) {
+		prt_char(out, ' ');
+		bch2_extent_ptr_to_text(out, c, n->ptrs + i);
+	}
+}
+
+static void found_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c, found_btree_nodes nodes)
+{
+	printbuf_indent_add(out, 2);
+	darray_for_each(nodes, i) {
+		found_btree_node_to_text(out, c, i);
+		prt_newline(out);
+	}
+	printbuf_indent_sub(out, 2);
+}
+
+static void found_btree_node_to_key(struct bkey_i *k, const struct found_btree_node *f)
+{
+	struct bkey_i_btree_ptr_v2 *bp = bkey_btree_ptr_v2_init(k);
+
+	set_bkey_val_u64s(&bp->k, sizeof(struct bch_btree_ptr_v2) / sizeof(u64) + f->nr_ptrs);
+	bp->k.p			= f->max_key;
+	bp->v.seq		= cpu_to_le64(f->cookie);
+	bp->v.sectors_written	= 0;
+	bp->v.flags		= 0;
+	bp->v.min_key		= f->min_key;
+	SET_BTREE_PTR_RANGE_UPDATED(&bp->v, f->range_updated);
+	memcpy(bp->v.start, f->ptrs, sizeof(struct bch_extent_ptr) * f->nr_ptrs);
+}
+
+static bool found_btree_node_is_readable(struct btree_trans *trans,
+					 const struct found_btree_node *f)
+{
+	struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } k;
+
+	found_btree_node_to_key(&k.k, f);
+
+	struct btree *b = bch2_btree_node_get_noiter(trans, &k.k, f->btree_id, f->level, false);
+	bool ret = !IS_ERR_OR_NULL(b);
+	if (ret)
+		six_unlock_read(&b->c.lock);
+
+	/*
+	 * We might update this node's range; if that happens, we need the node
+	 * to be re-read so the read path can trim keys that are no longer in
+	 * this node
+	 */
+	if (b != btree_node_root(trans->c, b))
+		bch2_btree_node_evict(trans, &k.k);
+	return ret;
+}
+
+static int found_btree_node_cmp_cookie(const void *_l, const void *_r)
+{
+	const struct found_btree_node *l = _l;
+	const struct found_btree_node *r = _r;
+
+	return  cmp_int(l->btree_id,	r->btree_id) ?:
+		cmp_int(l->level,	r->level) ?:
+		cmp_int(l->cookie,	r->cookie);
+}
+
+/*
+ * Given two found btree nodes, if their sequence numbers are equal, take the
+ * one that's readable:
+ */
+static int found_btree_node_cmp_time(const struct found_btree_node *l,
+				     const struct found_btree_node *r)
+{
+	return cmp_int(l->seq, r->seq);
+}
+
+static int found_btree_node_cmp_pos(const void *_l, const void *_r)
+{
+	const struct found_btree_node *l = _l;
+	const struct found_btree_node *r = _r;
+
+	return  cmp_int(l->btree_id,	r->btree_id) ?:
+	       -cmp_int(l->level,	r->level) ?:
+		bpos_cmp(l->min_key,	r->min_key) ?:
+	       -found_btree_node_cmp_time(l, r);
+}
+
+static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
+				struct bio *bio, struct btree_node *bn, u64 offset)
+{
+	struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
+
+	bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ);
+	bio->bi_iter.bi_sector	= offset;
+	bch2_bio_map(bio, bn, PAGE_SIZE);
+
+	submit_bio_wait(bio);
+	if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read,
+			       "IO error in try_read_btree_node() at %llu: %s",
+			       offset, bch2_blk_status_to_str(bio->bi_status)))
+		return;
+
+	if (le64_to_cpu(bn->magic) != bset_magic(c))
+		return;
+
+	if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) {
+		struct nonce nonce = btree_nonce(&bn->keys, 0);
+		unsigned bytes = (void *) &bn->keys - (void *) &bn->flags;
+
+		bch2_encrypt(c, BSET_CSUM_TYPE(&bn->keys), nonce, &bn->flags, bytes);
+	}
+
+	if (btree_id_is_alloc(BTREE_NODE_ID(bn)))
+		return;
+
+	if (BTREE_NODE_LEVEL(bn) >= BTREE_MAX_DEPTH)
+		return;
+
+	rcu_read_lock();
+	struct found_btree_node n = {
+		.btree_id	= BTREE_NODE_ID(bn),
+		.level		= BTREE_NODE_LEVEL(bn),
+		.seq		= BTREE_NODE_SEQ(bn),
+		.cookie		= le64_to_cpu(bn->keys.seq),
+		.min_key	= bn->min_key,
+		.max_key	= bn->max_key,
+		.nr_ptrs	= 1,
+		.ptrs[0].type	= 1 << BCH_EXTENT_ENTRY_ptr,
+		.ptrs[0].offset	= offset,
+		.ptrs[0].dev	= ca->dev_idx,
+		.ptrs[0].gen	= *bucket_gen(ca, sector_to_bucket(ca, offset)),
+	};
+	rcu_read_unlock();
+
+	if (bch2_trans_run(c, found_btree_node_is_readable(trans, &n))) {
+		mutex_lock(&f->lock);
+		if (BSET_BIG_ENDIAN(&bn->keys) != CPU_BIG_ENDIAN) {
+			bch_err(c, "try_read_btree_node() can't handle endian conversion");
+			f->ret = -EINVAL;
+			goto unlock;
+		}
+
+		if (darray_push(&f->nodes, n))
+			f->ret = -ENOMEM;
+unlock:
+		mutex_unlock(&f->lock);
+	}
+}
+
+static int read_btree_nodes_worker(void *p)
+{
+	struct find_btree_nodes_worker *w = p;
+	struct bch_fs *c = container_of(w->f, struct bch_fs, found_btree_nodes);
+	struct bch_dev *ca = w->ca;
+	void *buf = (void *) __get_free_page(GFP_KERNEL);
+	struct bio *bio = bio_alloc(NULL, 1, 0, GFP_KERNEL);
+	unsigned long last_print = jiffies;
+
+	if (!buf || !bio) {
+		bch_err(c, "read_btree_nodes_worker: error allocating bio/buf");
+		w->f->ret = -ENOMEM;
+		goto err;
+	}
+
+	for (u64 bucket = ca->mi.first_bucket; bucket < ca->mi.nbuckets; bucket++)
+		for (unsigned bucket_offset = 0;
+		     bucket_offset + btree_sectors(c) <= ca->mi.bucket_size;
+		     bucket_offset += btree_sectors(c)) {
+			if (time_after(jiffies, last_print + HZ * 30)) {
+				u64 cur_sector = bucket * ca->mi.bucket_size + bucket_offset;
+				u64 end_sector = ca->mi.nbuckets * ca->mi.bucket_size;
+
+				bch_info(ca, "%s: %2u%% done", __func__,
+					 (unsigned) div64_u64(cur_sector * 100, end_sector));
+				last_print = jiffies;
+			}
+
+			u64 sector = bucket * ca->mi.bucket_size + bucket_offset;
+
+			if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_mi_btree_bitmap &&
+			    !bch2_dev_btree_bitmap_marked_sectors(ca, sector, btree_sectors(c)))
+				continue;
+
+			try_read_btree_node(w->f, ca, bio, buf, sector);
+		}
+err:
+	bio_put(bio);
+	free_page((unsigned long) buf);
+	percpu_ref_get(&ca->io_ref);
+	closure_put(w->cl);
+	kfree(w);
+	return 0;
+}
+
+static int read_btree_nodes(struct find_btree_nodes *f)
+{
+	struct bch_fs *c = container_of(f, struct bch_fs, found_btree_nodes);
+	struct closure cl;
+	int ret = 0;
+
+	closure_init_stack(&cl);
+
+	for_each_online_member(c, ca) {
+		if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
+			continue;
+
+		struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
+		struct task_struct *t;
+
+		if (!w) {
+			percpu_ref_put(&ca->io_ref);
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		percpu_ref_get(&ca->io_ref);
+		closure_get(&cl);
+		w->cl		= &cl;
+		w->f		= f;
+		w->ca		= ca;
+
+		t = kthread_run(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name);
+		ret = IS_ERR_OR_NULL(t);
+		if (ret) {
+			percpu_ref_put(&ca->io_ref);
+			closure_put(&cl);
+			f->ret = ret;
+			bch_err(c, "error starting kthread: %i", ret);
+			break;
+		}
+	}
+err:
+	closure_sync(&cl);
+	return f->ret ?: ret;
+}
+
+static void bubble_up(struct found_btree_node *n, struct found_btree_node *end)
+{
+	while (n + 1 < end &&
+	       found_btree_node_cmp_pos(n, n + 1) > 0) {
+		swap(n[0], n[1]);
+		n++;
+	}
+}
+
+static int handle_overwrites(struct bch_fs *c,
+			     struct found_btree_node *start,
+			     struct found_btree_node *end)
+{
+	struct found_btree_node *n;
+again:
+	for (n = start + 1;
+	     n < end &&
+	     n->btree_id	== start->btree_id &&
+	     n->level		== start->level &&
+	     bpos_lt(n->min_key, start->max_key);
+	     n++)  {
+		int cmp = found_btree_node_cmp_time(start, n);
+
+		if (cmp > 0) {
+			if (bpos_cmp(start->max_key, n->max_key) >= 0)
+				n->overwritten = true;
+			else {
+				n->range_updated = true;
+				n->min_key = bpos_successor(start->max_key);
+				n->range_updated = true;
+				bubble_up(n, end);
+				goto again;
+			}
+		} else if (cmp < 0) {
+			BUG_ON(bpos_cmp(n->min_key, start->min_key) <= 0);
+
+			start->max_key = bpos_predecessor(n->min_key);
+			start->range_updated = true;
+		} else {
+			struct printbuf buf = PRINTBUF;
+
+			prt_str(&buf, "overlapping btree nodes with same seq! halting\n  ");
+			found_btree_node_to_text(&buf, c, start);
+			prt_str(&buf, "\n  ");
+			found_btree_node_to_text(&buf, c, n);
+			bch_err(c, "%s", buf.buf);
+			printbuf_exit(&buf);
+			return -BCH_ERR_fsck_repair_unimplemented;
+		}
+	}
+
+	return 0;
+}
+
+int bch2_scan_for_btree_nodes(struct bch_fs *c)
+{
+	struct find_btree_nodes *f = &c->found_btree_nodes;
+	struct printbuf buf = PRINTBUF;
+	size_t dst;
+	int ret = 0;
+
+	if (f->nodes.nr)
+		return 0;
+
+	mutex_init(&f->lock);
+
+	ret = read_btree_nodes(f);
+	if (ret)
+		return ret;
+
+	if (!f->nodes.nr) {
+		bch_err(c, "%s: no btree nodes found", __func__);
+		ret = -EINVAL;
+		goto err;
+	}
+
+	if (0 && c->opts.verbose) {
+		printbuf_reset(&buf);
+		prt_printf(&buf, "%s: nodes found:\n", __func__);
+		found_btree_nodes_to_text(&buf, c, f->nodes);
+		bch2_print_string_as_lines(KERN_INFO, buf.buf);
+	}
+
+	sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL);
+
+	dst = 0;
+	darray_for_each(f->nodes, i) {
+		struct found_btree_node *prev = dst ? f->nodes.data + dst - 1 : NULL;
+
+		if (prev &&
+		    prev->cookie == i->cookie) {
+			if (prev->nr_ptrs == ARRAY_SIZE(prev->ptrs)) {
+				bch_err(c, "%s: found too many replicas for btree node", __func__);
+				ret = -EINVAL;
+				goto err;
+			}
+			prev->ptrs[prev->nr_ptrs++] = i->ptrs[0];
+		} else {
+			f->nodes.data[dst++] = *i;
+		}
+	}
+	f->nodes.nr = dst;
+
+	sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
+
+	if (0 && c->opts.verbose) {
+		printbuf_reset(&buf);
+		prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__);
+		found_btree_nodes_to_text(&buf, c, f->nodes);
+		bch2_print_string_as_lines(KERN_INFO, buf.buf);
+	}
+
+	dst = 0;
+	darray_for_each(f->nodes, i) {
+		if (i->overwritten)
+			continue;
+
+		ret = handle_overwrites(c, i, &darray_top(f->nodes));
+		if (ret)
+			goto err;
+
+		BUG_ON(i->overwritten);
+		f->nodes.data[dst++] = *i;
+	}
+	f->nodes.nr = dst;
+
+	if (c->opts.verbose) {
+		printbuf_reset(&buf);
+		prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__);
+		found_btree_nodes_to_text(&buf, c, f->nodes);
+		bch2_print_string_as_lines(KERN_INFO, buf.buf);
+	}
+
+	eytzinger0_sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL);
+err:
+	printbuf_exit(&buf);
+	return ret;
+}
+
+static int found_btree_node_range_start_cmp(const void *_l, const void *_r)
+{
+	const struct found_btree_node *l = _l;
+	const struct found_btree_node *r = _r;
+
+	return  cmp_int(l->btree_id,	r->btree_id) ?:
+	       -cmp_int(l->level,	r->level) ?:
+		bpos_cmp(l->max_key,	r->min_key);
+}
+
+#define for_each_found_btree_node_in_range(_f, _search, _idx)				\
+	for (size_t _idx = eytzinger0_find_gt((_f)->nodes.data, (_f)->nodes.nr,		\
+					sizeof((_f)->nodes.data[0]),			\
+					found_btree_node_range_start_cmp, &search);	\
+	     _idx < (_f)->nodes.nr &&							\
+	     (_f)->nodes.data[_idx].btree_id == _search.btree_id &&			\
+	     (_f)->nodes.data[_idx].level == _search.level &&				\
+	     bpos_lt((_f)->nodes.data[_idx].min_key, _search.max_key);			\
+	     _idx = eytzinger0_next(_idx, (_f)->nodes.nr))
+
+bool bch2_btree_node_is_stale(struct bch_fs *c, struct btree *b)
+{
+	struct find_btree_nodes *f = &c->found_btree_nodes;
+
+	struct found_btree_node search = {
+		.btree_id	= b->c.btree_id,
+		.level		= b->c.level,
+		.min_key	= b->data->min_key,
+		.max_key	= b->key.k.p,
+	};
+
+	for_each_found_btree_node_in_range(f, search, idx)
+		if (f->nodes.data[idx].seq > BTREE_NODE_SEQ(b->data))
+			return true;
+	return false;
+}
+
+bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
+{
+	struct found_btree_node search = {
+		.btree_id	= btree,
+		.level		= 0,
+		.min_key	= POS_MIN,
+		.max_key	= SPOS_MAX,
+	};
+
+	for_each_found_btree_node_in_range(&c->found_btree_nodes, search, idx)
+		return true;
+	return false;
+}
+
+int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
+			   unsigned level, struct bpos node_min, struct bpos node_max)
+{
+	if (btree_id_is_alloc(btree))
+		return 0;
+
+	struct find_btree_nodes *f = &c->found_btree_nodes;
+
+	int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
+	if (ret)
+		return ret;
+
+	if (c->opts.verbose) {
+		struct printbuf buf = PRINTBUF;
+
+		prt_printf(&buf, "recovering %s l=%u ", bch2_btree_id_str(btree), level);
+		bch2_bpos_to_text(&buf, node_min);
+		prt_str(&buf, " - ");
+		bch2_bpos_to_text(&buf, node_max);
+
+		bch_info(c, "%s(): %s", __func__, buf.buf);
+		printbuf_exit(&buf);
+	}
+
+	struct found_btree_node search = {
+		.btree_id	= btree,
+		.level		= level,
+		.min_key	= node_min,
+		.max_key	= node_max,
+	};
+
+	for_each_found_btree_node_in_range(f, search, idx) {
+		struct found_btree_node n = f->nodes.data[idx];
+
+		n.range_updated |= bpos_lt(n.min_key, node_min);
+		n.min_key = bpos_max(n.min_key, node_min);
+
+		n.range_updated |= bpos_gt(n.max_key, node_max);
+		n.max_key = bpos_min(n.max_key, node_max);
+
+		struct { __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); } tmp;
+
+		found_btree_node_to_key(&tmp.k, &n);
+
+		struct printbuf buf = PRINTBUF;
+		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k));
+		bch_verbose(c, "%s(): recovering %s", __func__, buf.buf);
+		printbuf_exit(&buf);
+
+		BUG_ON(bch2_bkey_invalid(c, bkey_i_to_s_c(&tmp.k), BKEY_TYPE_btree, 0, NULL));
+
+		ret = bch2_journal_key_insert(c, btree, level + 1, &tmp.k);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+void bch2_find_btree_nodes_exit(struct find_btree_nodes *f)
+{
+	darray_exit(&f->nodes);
+}
diff --git a/fs/bcachefs/btree_node_scan.h b/fs/bcachefs/btree_node_scan.h
new file mode 100644
index 000000000000..08687b209787
--- /dev/null
+++ b/fs/bcachefs/btree_node_scan.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_NODE_SCAN_H
+#define _BCACHEFS_BTREE_NODE_SCAN_H
+
+int bch2_scan_for_btree_nodes(struct bch_fs *);
+bool bch2_btree_node_is_stale(struct bch_fs *, struct btree *);
+bool bch2_btree_has_scanned_nodes(struct bch_fs *, enum btree_id);
+int bch2_get_scanned_nodes(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos);
+void bch2_find_btree_nodes_exit(struct find_btree_nodes *);
+
+#endif /* _BCACHEFS_BTREE_NODE_SCAN_H */
diff --git a/fs/bcachefs/btree_node_scan_types.h b/fs/bcachefs/btree_node_scan_types.h
new file mode 100644
index 000000000000..abb7b27d556a
--- /dev/null
+++ b/fs/bcachefs/btree_node_scan_types.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
+#define _BCACHEFS_BTREE_NODE_SCAN_TYPES_H
+
+#include "darray.h"
+
+struct found_btree_node {
+	bool			range_updated:1;
+	bool			overwritten:1;
+	u8			btree_id;
+	u8			level;
+	u32			seq;
+	u64			cookie;
+
+	struct bpos		min_key;
+	struct bpos		max_key;
+
+	unsigned		nr_ptrs;
+	struct bch_extent_ptr	ptrs[BCH_REPLICAS_MAX];
+};
+
+typedef DARRAY(struct found_btree_node)	found_btree_nodes;
+
+struct find_btree_nodes {
+	int			ret;
+	struct mutex		lock;
+	found_btree_nodes	nodes;
+};
+
+#endif /* _BCACHEFS_BTREE_NODE_SCAN_TYPES_H */
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index 30d69a6d133e..bbec91e8e650 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -318,7 +318,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans,
 		!(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
 		test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) &&
 		i->k->k.p.snapshot &&
-		bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot));
+		bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot) > 0);
 }
 
 static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans,
@@ -397,12 +397,13 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
 	struct bkey_cached *ck = (void *) path->l[0].b;
 	unsigned new_u64s;
 	struct bkey_i *new_k;
+	unsigned watermark = flags & BCH_WATERMARK_MASK;
 
 	EBUG_ON(path->level);
 
-	if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
-	    bch2_btree_key_cache_must_wait(c) &&
-	    !(flags & BCH_TRANS_COMMIT_journal_reclaim))
+	if (watermark < BCH_WATERMARK_reclaim &&
+	    !test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
+	    bch2_btree_key_cache_must_wait(c))
 		return -BCH_ERR_btree_insert_need_journal_reclaim;
 
 	/*
@@ -499,9 +500,8 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
 }
 
 static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
-			      struct btree_insert_entry *btree_id_start)
+			      unsigned btree_id_start)
 {
-	struct btree_insert_entry *i;
 	bool trans_trigger_run;
 	int ret, overwrite;
 
@@ -514,13 +514,13 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
 		do {
 			trans_trigger_run = false;
 
-			for (i = btree_id_start;
-			     i < trans->updates + trans->nr_updates && i->btree_id <= btree_id;
+			for (unsigned i = btree_id_start;
+			     i < trans->nr_updates && trans->updates[i].btree_id <= btree_id;
 			     i++) {
-				if (i->btree_id != btree_id)
+				if (trans->updates[i].btree_id != btree_id)
 					continue;
 
-				ret = run_one_trans_trigger(trans, i, overwrite);
+				ret = run_one_trans_trigger(trans, trans->updates + i, overwrite);
 				if (ret < 0)
 					return ret;
 				if (ret)
@@ -534,8 +534,7 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
 
 static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
 {
-	struct btree_insert_entry *btree_id_start = trans->updates;
-	unsigned btree_id = 0;
+	unsigned btree_id = 0, btree_id_start = 0;
 	int ret = 0;
 
 	/*
@@ -549,8 +548,8 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
 		if (btree_id == BTREE_ID_alloc)
 			continue;
 
-		while (btree_id_start < trans->updates + trans->nr_updates &&
-		       btree_id_start->btree_id < btree_id)
+		while (btree_id_start < trans->nr_updates &&
+		       trans->updates[btree_id_start].btree_id < btree_id)
 			btree_id_start++;
 
 		ret = run_btree_triggers(trans, btree_id, btree_id_start);
@@ -558,11 +557,13 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
 			return ret;
 	}
 
-	trans_for_each_update(trans, i) {
+	for (unsigned idx = 0; idx < trans->nr_updates; idx++) {
+		struct btree_insert_entry *i = trans->updates + idx;
+
 		if (i->btree_id > BTREE_ID_alloc)
 			break;
 		if (i->btree_id == BTREE_ID_alloc) {
-			ret = run_btree_triggers(trans, BTREE_ID_alloc, i);
+			ret = run_btree_triggers(trans, BTREE_ID_alloc, idx);
 			if (ret)
 				return ret;
 			break;
@@ -826,7 +827,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
 	struct bch_fs *c = trans->c;
 	int ret = 0, u64s_delta = 0;
 
-	trans_for_each_update(trans, i) {
+	for (unsigned idx = 0; idx < trans->nr_updates; idx++) {
+		struct btree_insert_entry *i = trans->updates + idx;
 		if (i->cached)
 			continue;
 
@@ -887,6 +889,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
 			    int ret, unsigned long trace_ip)
 {
 	struct bch_fs *c = trans->c;
+	enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
 
 	switch (ret) {
 	case -BCH_ERR_btree_insert_btree_node_full:
@@ -905,7 +908,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
 		 * flag
 		 */
 		if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
-		    (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) {
+		    watermark < BCH_WATERMARK_reclaim) {
 			ret = -BCH_ERR_journal_reclaim_would_deadlock;
 			break;
 		}
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 4a5a64499eb7..e0c982a4195c 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -5,6 +5,7 @@
 #include <linux/list.h>
 #include <linux/rhashtable.h>
 
+#include "bbpos_types.h"
 #include "btree_key_cache_types.h"
 #include "buckets_types.h"
 #include "darray.h"
@@ -173,6 +174,11 @@ struct btree_cache {
 	 */
 	struct task_struct	*alloc_lock;
 	struct closure_waitlist	alloc_wait;
+
+	struct bbpos		pinned_nodes_start;
+	struct bbpos		pinned_nodes_end;
+	u64			pinned_nodes_leaf_mask;
+	u64			pinned_nodes_interior_mask;
 };
 
 struct btree_node_iter {
@@ -358,7 +364,21 @@ struct btree_insert_entry {
 	unsigned long		ip_allocated;
 };
 
+/* Number of btree paths we preallocate, usually enough */
 #define BTREE_ITER_INITIAL		64
+/*
+ * Lmiit for btree_trans_too_many_iters(); this is enough that almost all code
+ * paths should run inside this limit, and if they don't it usually indicates a
+ * bug (leaking/duplicated btree paths).
+ *
+ * exception: some fsck paths
+ *
+ * bugs with excessive path usage seem to have possibly been eliminated now, so
+ * we might consider eliminating this (and btree_trans_too_many_iter()) at some
+ * point.
+ */
+#define BTREE_ITER_NORMAL_LIMIT		256
+/* never exceed limit */
 #define BTREE_ITER_MAX			(1U << 10)
 
 struct btree_trans_commit_hook;
@@ -654,6 +674,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type);
 	 BIT_ULL(BKEY_TYPE_inodes)|			\
 	 BIT_ULL(BKEY_TYPE_stripes)|			\
 	 BIT_ULL(BKEY_TYPE_reflink)|			\
+	 BIT_ULL(BKEY_TYPE_subvolumes)|			\
 	 BIT_ULL(BKEY_TYPE_btree))
 
 #define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS		\
@@ -727,7 +748,7 @@ struct btree_root {
 	__BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
 	u8			level;
 	u8			alive;
-	s8			error;
+	s16			error;
 };
 
 enum btree_gc_coalesce_fail_reason {
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index c3ff365acce9..8e47e260eba5 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -38,6 +38,9 @@ static noinline int extent_front_merge(struct btree_trans *trans,
 	struct bkey_i *update;
 	int ret;
 
+	if (unlikely(trans->journal_replay_not_finished))
+		return 0;
+
 	update = bch2_bkey_make_mut_noupdate(trans, k);
 	ret = PTR_ERR_OR_ZERO(update);
 	if (ret)
@@ -69,6 +72,9 @@ static noinline int extent_back_merge(struct btree_trans *trans,
 	struct bch_fs *c = trans->c;
 	int ret;
 
+	if (unlikely(trans->journal_replay_not_finished))
+		return 0;
+
 	ret =   bch2_key_has_snapshot_overwrites(trans, iter->btree_id, insert->k.p) ?:
 		bch2_key_has_snapshot_overwrites(trans, iter->btree_id, k.k->p);
 	if (ret < 0)
@@ -452,7 +458,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
 	 * the key cache - but the key has to exist in the btree for that to
 	 * work:
 	 */
-	if (path->cached && bkey_deleted(&i->old_k))
+	if (path->cached && !i->old_btree_u64s)
 		return flush_new_cached_update(trans, i, flags, ip);
 
 	return 0;
@@ -788,6 +794,27 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
 int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
 		       struct bpos pos, bool set)
 {
+	struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k));
+	int ret = PTR_ERR_OR_ZERO(k);
+	if (ret)
+		return ret;
+
+	bkey_init(&k->k);
+	k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
+	k->k.p = pos;
+
+	struct btree_iter iter;
+	bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT);
+
+	ret   = bch2_btree_iter_traverse(&iter) ?:
+		bch2_trans_update(trans, &iter, k, 0);
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree,
+				struct bpos pos, bool set)
+{
 	struct bkey_i k;
 
 	bkey_init(&k.k);
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index b9382b7b288b..cc7c53e83f89 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -63,11 +63,12 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
 			    struct bpos, struct bpos, unsigned, u64 *);
 
 int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool);
+int bch2_btree_bit_mod_buffered(struct btree_trans *, enum btree_id, struct bpos, bool);
 
 static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans,
 						enum btree_id btree, struct bpos pos)
 {
-	return bch2_btree_bit_mod(trans, btree, pos, false);
+	return bch2_btree_bit_mod_buffered(trans, btree, pos, false);
 }
 
 int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id,
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 4530b14ff2c3..6030c396754f 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -2,6 +2,7 @@
 
 #include "bcachefs.h"
 #include "alloc_foreground.h"
+#include "bkey_buf.h"
 #include "bkey_methods.h"
 #include "btree_cache.h"
 #include "btree_gc.h"
@@ -18,15 +19,23 @@
 #include "journal.h"
 #include "journal_reclaim.h"
 #include "keylist.h"
+#include "recovery_passes.h"
 #include "replicas.h"
+#include "sb-members.h"
 #include "super-io.h"
 #include "trace.h"
 
 #include <linux/random.h>
 
+static const char * const bch2_btree_update_modes[] = {
+#define x(t) #t,
+	BTREE_UPDATE_MODES()
+#undef x
+	NULL
+};
+
 static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
-				  btree_path_idx_t, struct btree *,
-				  struct keylist *, unsigned);
+				  btree_path_idx_t, struct btree *, struct keylist *);
 static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
 
 static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans,
@@ -45,56 +54,103 @@ static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans,
 	return path_idx;
 }
 
-/* Debug code: */
-
 /*
  * Verify that child nodes correctly span parent node's range:
  */
-static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
+int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b)
 {
-#ifdef CONFIG_BCACHEFS_DEBUG
-	struct bpos next_node = b->data->min_key;
-	struct btree_node_iter iter;
+	struct bch_fs *c = trans->c;
+	struct bpos node_min = b->key.k.type == KEY_TYPE_btree_ptr_v2
+		? bkey_i_to_btree_ptr_v2(&b->key)->v.min_key
+		: b->data->min_key;
+	struct btree_and_journal_iter iter;
 	struct bkey_s_c k;
-	struct bkey_s_c_btree_ptr_v2 bp;
-	struct bkey unpacked;
-	struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;
+	struct printbuf buf = PRINTBUF;
+	struct bkey_buf prev;
+	int ret = 0;
 
-	BUG_ON(!b->c.level);
+	BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
+	       !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key,
+			b->data->min_key));
 
-	if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
-		return;
+	if (!b->c.level)
+		return 0;
 
-	bch2_btree_node_iter_init_from_start(&iter, b);
+	bch2_bkey_buf_init(&prev);
+	bkey_init(&prev.k->k);
+	bch2_btree_and_journal_iter_init_node_iter(trans, &iter, b);
 
-	while (1) {
-		k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked);
+	while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) {
 		if (k.k->type != KEY_TYPE_btree_ptr_v2)
-			break;
-		bp = bkey_s_c_to_btree_ptr_v2(k);
+			goto out;
 
-		if (!bpos_eq(next_node, bp.v->min_key)) {
-			bch2_dump_btree_node(c, b);
-			bch2_bpos_to_text(&buf1, next_node);
-			bch2_bpos_to_text(&buf2, bp.v->min_key);
-			panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf);
-		}
+		struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
 
-		bch2_btree_node_iter_advance(&iter, b);
+		struct bpos expected_min = bkey_deleted(&prev.k->k)
+			? node_min
+			: bpos_successor(prev.k->k.p);
 
-		if (bch2_btree_node_iter_end(&iter)) {
-			if (!bpos_eq(k.k->p, b->key.k.p)) {
-				bch2_dump_btree_node(c, b);
-				bch2_bpos_to_text(&buf1, b->key.k.p);
-				bch2_bpos_to_text(&buf2, k.k->p);
-				panic("expected end %s got %s\n", buf1.buf, buf2.buf);
-			}
-			break;
+		if (!bpos_eq(expected_min, bp.v->min_key)) {
+			bch2_topology_error(c);
+
+			printbuf_reset(&buf);
+			prt_str(&buf, "end of prev node doesn't match start of next node\n"),
+			prt_printf(&buf, "  in btree %s level %u node ",
+				   bch2_btree_id_str(b->c.btree_id), b->c.level);
+			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+			prt_str(&buf, "\n  prev ");
+			bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
+			prt_str(&buf, "\n  next ");
+			bch2_bkey_val_to_text(&buf, c, k);
+
+			need_fsck_err(c, btree_node_topology_bad_min_key, "%s", buf.buf);
+			goto topology_repair;
 		}
 
-		next_node = bpos_successor(k.k->p);
+		bch2_bkey_buf_reassemble(&prev, c, k);
+		bch2_btree_and_journal_iter_advance(&iter);
+	}
+
+	if (bkey_deleted(&prev.k->k)) {
+		bch2_topology_error(c);
+
+		printbuf_reset(&buf);
+		prt_str(&buf, "empty interior node\n");
+		prt_printf(&buf, "  in btree %s level %u node ",
+			   bch2_btree_id_str(b->c.btree_id), b->c.level);
+		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+
+		need_fsck_err(c, btree_node_topology_empty_interior_node, "%s", buf.buf);
+		goto topology_repair;
+	} else if (!bpos_eq(prev.k->k.p, b->key.k.p)) {
+		bch2_topology_error(c);
+
+		printbuf_reset(&buf);
+		prt_str(&buf, "last child node doesn't end at end of parent node\n");
+		prt_printf(&buf, "  in btree %s level %u node ",
+			   bch2_btree_id_str(b->c.btree_id), b->c.level);
+		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
+		prt_str(&buf, "\n  last key ");
+		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k));
+
+		need_fsck_err(c, btree_node_topology_bad_max_key, "%s", buf.buf);
+		goto topology_repair;
 	}
-#endif
+out:
+fsck_err:
+	bch2_btree_and_journal_iter_exit(&iter);
+	bch2_bkey_buf_exit(&prev, c);
+	printbuf_exit(&buf);
+	return ret;
+topology_repair:
+	if ((c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_check_topology)) &&
+	    c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) {
+		bch2_inconsistent_error(c);
+		ret = -BCH_ERR_btree_need_topology_repair;
+	} else {
+		ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
+	}
+	goto out;
 }
 
 /* Calculate ideal packed bkey format for new btree nodes: */
@@ -255,7 +311,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
 	struct open_buckets obs = { .nr = 0 };
 	struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
 	enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
-	unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim
+	unsigned nr_reserve = watermark < BCH_WATERMARK_reclaim
 		? BTREE_NODE_RESERVE
 		: 0;
 	int ret;
@@ -550,6 +606,26 @@ static void btree_update_add_key(struct btree_update *as,
 	bch2_keylist_push(keys);
 }
 
+static bool btree_update_new_nodes_marked_sb(struct btree_update *as)
+{
+	for_each_keylist_key(&as->new_keys, k)
+		if (!bch2_dev_btree_bitmap_marked(as->c, bkey_i_to_s_c(k)))
+			return false;
+	return true;
+}
+
+static void btree_update_new_nodes_mark_sb(struct btree_update *as)
+{
+	struct bch_fs *c = as->c;
+
+	mutex_lock(&c->sb_lock);
+	for_each_keylist_key(&as->new_keys, k)
+		bch2_dev_btree_bitmap_mark(c, bkey_i_to_s_c(k));
+
+	bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+}
+
 /*
  * The transactional part of an interior btree node update, where we journal the
  * update we did to the interior node and update alloc info:
@@ -607,6 +683,9 @@ static void btree_update_nodes_written(struct btree_update *as)
 	if (ret)
 		goto err;
 
+	if (!btree_update_new_nodes_marked_sb(as))
+		btree_update_new_nodes_mark_sb(as);
+
 	/*
 	 * Wait for any in flight writes to finish before we free the old nodes
 	 * on disk:
@@ -639,7 +718,7 @@ static void btree_update_nodes_written(struct btree_update *as)
 	 * which may require allocations as well.
 	 */
 	ret = commit_do(trans, &as->disk_res, &journal_seq,
-			BCH_WATERMARK_reclaim|
+			BCH_WATERMARK_interior_updates|
 			BCH_TRANS_COMMIT_no_enospc|
 			BCH_TRANS_COMMIT_no_check_rw|
 			BCH_TRANS_COMMIT_journal_reclaim,
@@ -647,11 +726,15 @@ static void btree_update_nodes_written(struct btree_update *as)
 	bch2_trans_unlock(trans);
 
 	bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
-			     "%s(): error %s", __func__, bch2_err_str(ret));
+			     "%s", bch2_err_str(ret));
 err:
-	if (as->b) {
-
-		b = as->b;
+	/*
+	 * We have to be careful because another thread might be getting ready
+	 * to free as->b and calling btree_update_reparent() on us - we'll
+	 * recheck under btree_update_lock below:
+	 */
+	b = READ_ONCE(as->b);
+	if (b) {
 		btree_path_idx_t path_idx = get_unlocked_mut_path(trans,
 						as->btree_id, b->c.level, b->key.k.p);
 		struct btree_path *path = trans->paths + path_idx;
@@ -795,15 +878,17 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
 {
 	struct bch_fs *c = as->c;
 
-	mutex_lock(&c->btree_interior_update_lock);
-	list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
-
-	BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+	BUG_ON(as->mode != BTREE_UPDATE_none);
+	BUG_ON(as->update_level_end < b->c.level);
 	BUG_ON(!btree_node_dirty(b));
 	BUG_ON(!b->c.level);
 
-	as->mode	= BTREE_INTERIOR_UPDATING_NODE;
+	mutex_lock(&c->btree_interior_update_lock);
+	list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
+
+	as->mode	= BTREE_UPDATE_node;
 	as->b		= b;
+	as->update_level_end = b->c.level;
 
 	set_btree_node_write_blocked(b);
 	list_add(&as->write_blocked_list, &b->write_blocked);
@@ -825,7 +910,7 @@ static void btree_update_reparent(struct btree_update *as,
 	lockdep_assert_held(&c->btree_interior_update_lock);
 
 	child->b = NULL;
-	child->mode = BTREE_INTERIOR_UPDATING_AS;
+	child->mode = BTREE_UPDATE_update;
 
 	bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal,
 			      bch2_update_reparent_journal_pin_flush);
@@ -836,7 +921,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b)
 	struct bkey_i *insert = &b->key;
 	struct bch_fs *c = as->c;
 
-	BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
+	BUG_ON(as->mode != BTREE_UPDATE_none);
 
 	BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
 	       ARRAY_SIZE(as->journal_entries));
@@ -850,7 +935,7 @@ static void btree_update_updated_root(struct btree_update *as, struct btree *b)
 	mutex_lock(&c->btree_interior_update_lock);
 	list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
 
-	as->mode	= BTREE_INTERIOR_UPDATING_ROOT;
+	as->mode	= BTREE_UPDATE_root;
 	mutex_unlock(&c->btree_interior_update_lock);
 }
 
@@ -1028,7 +1113,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *
 	struct bch_fs *c = as->c;
 	u64 start_time = as->start_time;
 
-	BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);
+	BUG_ON(as->mode == BTREE_UPDATE_none);
 
 	if (as->took_gc_lock)
 		up_read(&as->c->gc_lock);
@@ -1045,7 +1130,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *
 
 static struct btree_update *
 bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
-			unsigned level, bool split, unsigned flags)
+			unsigned level_start, bool split, unsigned flags)
 {
 	struct bch_fs *c = trans->c;
 	struct btree_update *as;
@@ -1053,7 +1138,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
 	int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc)
 		? BCH_DISK_RESERVATION_NOFAIL : 0;
 	unsigned nr_nodes[2] = { 0, 0 };
-	unsigned update_level = level;
+	unsigned level_end = level_start;
 	enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
 	int ret = 0;
 	u32 restart_count = trans->restart_count;
@@ -1068,29 +1153,30 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
 	flags &= ~BCH_WATERMARK_MASK;
 	flags |= watermark;
 
-	if (!(flags & BCH_TRANS_COMMIT_journal_reclaim) &&
-	    watermark < c->journal.watermark) {
-		struct journal_res res = { 0 };
+	if (watermark < BCH_WATERMARK_reclaim &&
+	    test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)) {
+		if (flags & BCH_TRANS_COMMIT_journal_reclaim)
+			return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock);
 
-		ret = drop_locks_do(trans,
-			bch2_journal_res_get(&c->journal, &res, 1,
-					     watermark|JOURNAL_RES_GET_CHECK));
+		bch2_trans_unlock(trans);
+		wait_event(c->journal.wait, !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags));
+		ret = bch2_trans_relock(trans);
 		if (ret)
 			return ERR_PTR(ret);
 	}
 
 	while (1) {
-		nr_nodes[!!update_level] += 1 + split;
-		update_level++;
+		nr_nodes[!!level_end] += 1 + split;
+		level_end++;
 
-		ret = bch2_btree_path_upgrade(trans, path, update_level + 1);
+		ret = bch2_btree_path_upgrade(trans, path, level_end + 1);
 		if (ret)
 			return ERR_PTR(ret);
 
-		if (!btree_path_node(path, update_level)) {
+		if (!btree_path_node(path, level_end)) {
 			/* Allocating new root? */
 			nr_nodes[1] += split;
-			update_level = BTREE_MAX_DEPTH;
+			level_end = BTREE_MAX_DEPTH;
 			break;
 		}
 
@@ -1098,11 +1184,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
 		 * Always check for space for two keys, even if we won't have to
 		 * split at prior level - it might have been a merge instead:
 		 */
-		if (bch2_btree_node_insert_fits(path->l[update_level].b,
+		if (bch2_btree_node_insert_fits(path->l[level_end].b,
 						BKEY_BTREE_PTR_U64s_MAX * 2))
 			break;
 
-		split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
+		split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
 	}
 
 	if (!down_read_trylock(&c->gc_lock)) {
@@ -1116,12 +1202,15 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
 	as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS);
 	memset(as, 0, sizeof(*as));
 	closure_init(&as->cl, NULL);
-	as->c		= c;
-	as->start_time	= start_time;
-	as->mode	= BTREE_INTERIOR_NO_UPDATE;
-	as->took_gc_lock = true;
-	as->btree_id	= path->btree_id;
-	as->update_level = update_level;
+	as->c			= c;
+	as->start_time		= start_time;
+	as->ip_started		= _RET_IP_;
+	as->mode		= BTREE_UPDATE_none;
+	as->watermark		= watermark;
+	as->took_gc_lock	= true;
+	as->btree_id		= path->btree_id;
+	as->update_level_start	= level_start;
+	as->update_level_end	= level_end;
 	INIT_LIST_HEAD(&as->list);
 	INIT_LIST_HEAD(&as->unwritten_list);
 	INIT_LIST_HEAD(&as->write_blocked_list);
@@ -1163,7 +1252,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
 		 */
 		if (bch2_err_matches(ret, ENOSPC) &&
 		    (flags & BCH_TRANS_COMMIT_journal_reclaim) &&
-		    watermark != BCH_WATERMARK_reclaim) {
+		    watermark < BCH_WATERMARK_reclaim) {
 			ret = -BCH_ERR_journal_reclaim_would_deadlock;
 			goto err;
 		}
@@ -1193,7 +1282,8 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
 err:
 	bch2_btree_update_free(as, trans);
 	if (!bch2_err_matches(ret, ENOSPC) &&
-	    !bch2_err_matches(ret, EROFS))
+	    !bch2_err_matches(ret, EROFS) &&
+	    ret != -BCH_ERR_journal_reclaim_would_deadlock)
 		bch_err_fn_ratelimited(c, ret);
 	return ERR_PTR(ret);
 }
@@ -1208,33 +1298,35 @@ static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
 	mutex_unlock(&c->btree_cache.lock);
 
 	mutex_lock(&c->btree_root_lock);
-	BUG_ON(btree_node_root(c, b) &&
-	       (b->c.level < btree_node_root(c, b)->c.level ||
-		!btree_node_dying(btree_node_root(c, b))));
-
 	bch2_btree_id_root(c, b->c.btree_id)->b = b;
 	mutex_unlock(&c->btree_root_lock);
 
 	bch2_recalc_btree_reserve(c);
 }
 
-static void bch2_btree_set_root(struct btree_update *as,
-				struct btree_trans *trans,
-				struct btree_path *path,
-				struct btree *b)
+static int bch2_btree_set_root(struct btree_update *as,
+			       struct btree_trans *trans,
+			       struct btree_path *path,
+			       struct btree *b,
+			       bool nofail)
 {
 	struct bch_fs *c = as->c;
-	struct btree *old;
 
 	trace_and_count(c, btree_node_set_root, trans, b);
 
-	old = btree_node_root(c, b);
+	struct btree *old = btree_node_root(c, b);
 
 	/*
 	 * Ensure no one is using the old root while we switch to the
 	 * new root:
 	 */
-	bch2_btree_node_lock_write_nofail(trans, path, &old->c);
+	if (nofail) {
+		bch2_btree_node_lock_write_nofail(trans, path, &old->c);
+	} else {
+		int ret = bch2_btree_node_lock_write(trans, path, &old->c);
+		if (ret)
+			return ret;
+	}
 
 	bch2_btree_set_root_inmem(c, b);
 
@@ -1248,6 +1340,7 @@ static void bch2_btree_set_root(struct btree_update *as,
 	 * depend on the new root would have to update the new root.
 	 */
 	bch2_btree_node_unlock_write(trans, path, old);
+	return 0;
 }
 
 /* Interior node updates: */
@@ -1314,12 +1407,12 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
 }
 
 static void
-__bch2_btree_insert_keys_interior(struct btree_update *as,
-				  struct btree_trans *trans,
-				  struct btree_path *path,
-				  struct btree *b,
-				  struct btree_node_iter node_iter,
-				  struct keylist *keys)
+bch2_btree_insert_keys_interior(struct btree_update *as,
+				struct btree_trans *trans,
+				struct btree_path *path,
+				struct btree *b,
+				struct btree_node_iter node_iter,
+				struct keylist *keys)
 {
 	struct bkey_i *insert = bch2_keylist_front(keys);
 	struct bkey_packed *k;
@@ -1378,9 +1471,16 @@ static void __btree_split_node(struct btree_update *as,
 		if (bkey_deleted(k))
 			continue;
 
+		uk = bkey_unpack_key(b, k);
+
+		if (b->c.level &&
+		    u64s < n1_u64s &&
+		    u64s + k->u64s >= n1_u64s &&
+		    bch2_key_deleted_in_journal(trans, b->c.btree_id, b->c.level, uk.p))
+			n1_u64s += k->u64s;
+
 		i = u64s >= n1_u64s;
 		u64s += k->u64s;
-		uk = bkey_unpack_key(b, k);
 		if (!i)
 			n1_pos = uk.p;
 		bch2_bkey_format_add_key(&format[i], &uk);
@@ -1439,8 +1539,7 @@ static void __btree_split_node(struct btree_update *as,
 
 		bch2_verify_btree_nr_keys(n[i]);
 
-		if (b->c.level)
-			btree_node_interior_verify(as->c, n[i]);
+		BUG_ON(bch2_btree_node_check_topology(trans, n[i]));
 	}
 }
 
@@ -1469,15 +1568,15 @@ static void btree_split_insert_keys(struct btree_update *as,
 
 		bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);
 
-		__bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
+		bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
 
-		btree_node_interior_verify(as->c, b);
+		BUG_ON(bch2_btree_node_check_topology(trans, b));
 	}
 }
 
 static int btree_split(struct btree_update *as, struct btree_trans *trans,
 		       btree_path_idx_t path, struct btree *b,
-		       struct keylist *keys, unsigned flags)
+		       struct keylist *keys)
 {
 	struct bch_fs *c = as->c;
 	struct btree *parent = btree_node_parent(trans->paths + path, b);
@@ -1486,9 +1585,14 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
 	u64 start_time = local_clock();
 	int ret = 0;
 
+	bch2_verify_btree_nr_keys(b);
 	BUG_ON(!parent && (b != btree_node_root(c, b)));
 	BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1));
 
+	ret = bch2_btree_node_check_topology(trans, b);
+	if (ret)
+		return ret;
+
 	bch2_btree_interior_update_will_free_node(as, b);
 
 	if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
@@ -1578,16 +1682,17 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
 
 	if (parent) {
 		/* Split a non root node */
-		ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
-		if (ret)
-			goto err;
+		ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
 	} else if (n3) {
-		bch2_btree_set_root(as, trans, trans->paths + path, n3);
+		ret = bch2_btree_set_root(as, trans, trans->paths + path, n3, false);
 	} else {
 		/* Root filled up but didn't need to be split */
-		bch2_btree_set_root(as, trans, trans->paths + path, n1);
+		ret = bch2_btree_set_root(as, trans, trans->paths + path, n1, false);
 	}
 
+	if (ret)
+		goto err;
+
 	if (n3) {
 		bch2_btree_update_get_open_buckets(as, n3);
 		bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
@@ -1644,27 +1749,6 @@ err:
 	goto out;
 }
 
-static void
-bch2_btree_insert_keys_interior(struct btree_update *as,
-				struct btree_trans *trans,
-				struct btree_path *path,
-				struct btree *b,
-				struct keylist *keys)
-{
-	struct btree_path *linked;
-	unsigned i;
-
-	__bch2_btree_insert_keys_interior(as, trans, path, b,
-					  path->l[b->c.level].iter, keys);
-
-	btree_update_updated_node(as, b);
-
-	trans_for_each_path_with_node(trans, b, linked, i)
-		bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
-
-	bch2_trans_verify_paths(trans);
-}
-
 /**
  * bch2_btree_insert_node - insert bkeys into a given btree node
  *
@@ -1673,7 +1757,6 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
  * @path_idx:		path that points to current node
  * @b:			node to insert keys into
  * @keys:		list of keys to insert
- * @flags:		transaction commit flags
  *
  * Returns: 0 on success, typically transaction restart error on failure
  *
@@ -1683,10 +1766,11 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
  */
 static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
 				  btree_path_idx_t path_idx, struct btree *b,
-				  struct keylist *keys, unsigned flags)
+				  struct keylist *keys)
 {
 	struct bch_fs *c = as->c;
-	struct btree_path *path = trans->paths + path_idx;
+	struct btree_path *path = trans->paths + path_idx, *linked;
+	unsigned i;
 	int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
 	int old_live_u64s = b->nr.live_u64s;
 	int live_u64s_added, u64s_added;
@@ -1709,9 +1793,19 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
 		goto split;
 	}
 
-	btree_node_interior_verify(c, b);
+	ret = bch2_btree_node_check_topology(trans, b);
+	if (ret) {
+		bch2_btree_node_unlock_write(trans, path, b);
+		return ret;
+	}
+
+	bch2_btree_insert_keys_interior(as, trans, path, b,
+					path->l[b->c.level].iter, keys);
+
+	trans_for_each_path_with_node(trans, b, linked, i)
+		bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
 
-	bch2_btree_insert_keys_interior(as, trans, path, b, keys);
+	bch2_trans_verify_paths(trans);
 
 	live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
 	u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
@@ -1725,21 +1819,22 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
 	    bch2_maybe_compact_whiteouts(c, b))
 		bch2_trans_node_reinit_iter(trans, b);
 
+	btree_update_updated_node(as, b);
 	bch2_btree_node_unlock_write(trans, path, b);
 
-	btree_node_interior_verify(c, b);
+	BUG_ON(bch2_btree_node_check_topology(trans, b));
 	return 0;
 split:
 	/*
 	 * We could attempt to avoid the transaction restart, by calling
 	 * bch2_btree_path_upgrade() and allocating more nodes:
 	 */
-	if (b->c.level >= as->update_level) {
+	if (b->c.level >= as->update_level_end) {
 		trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b);
 		return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
 	}
 
-	return btree_split(as, trans, path_idx, b, keys, flags);
+	return btree_split(as, trans, path_idx, b, keys);
 }
 
 int bch2_btree_split_leaf(struct btree_trans *trans,
@@ -1747,7 +1842,6 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
 			  unsigned flags)
 {
 	/* btree_split & merge may both cause paths array to be reallocated */
-
 	struct btree *b = path_l(trans->paths + path)->b;
 	struct btree_update *as;
 	unsigned l;
@@ -1759,7 +1853,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
 	if (IS_ERR(as))
 		return PTR_ERR(as);
 
-	ret = btree_split(as, trans, path, b, NULL, flags);
+	ret = btree_split(as, trans, path, b, NULL);
 	if (ret) {
 		bch2_btree_update_free(as, trans);
 		return ret;
@@ -1775,6 +1869,65 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
 	return ret;
 }
 
+static void __btree_increase_depth(struct btree_update *as, struct btree_trans *trans,
+				   btree_path_idx_t path_idx)
+{
+	struct bch_fs *c = as->c;
+	struct btree_path *path = trans->paths + path_idx;
+	struct btree *n, *b = bch2_btree_id_root(c, path->btree_id)->b;
+
+	BUG_ON(!btree_node_locked(path, b->c.level));
+
+	n = __btree_root_alloc(as, trans, b->c.level + 1);
+
+	bch2_btree_update_add_new_node(as, n);
+	six_unlock_write(&n->c.lock);
+
+	path->locks_want++;
+	BUG_ON(btree_node_locked(path, n->c.level));
+	six_lock_increment(&n->c.lock, SIX_LOCK_intent);
+	mark_btree_node_locked(trans, path, n->c.level, BTREE_NODE_INTENT_LOCKED);
+	bch2_btree_path_level_init(trans, path, n);
+
+	n->sib_u64s[0] = U16_MAX;
+	n->sib_u64s[1] = U16_MAX;
+
+	bch2_keylist_add(&as->parent_keys, &b->key);
+	btree_split_insert_keys(as, trans, path_idx, n, &as->parent_keys);
+
+	int ret = bch2_btree_set_root(as, trans, path, n, true);
+	BUG_ON(ret);
+
+	bch2_btree_update_get_open_buckets(as, n);
+	bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
+	bch2_trans_node_add(trans, path, n);
+	six_unlock_intent(&n->c.lock);
+
+	mutex_lock(&c->btree_cache.lock);
+	list_add_tail(&b->list, &c->btree_cache.live);
+	mutex_unlock(&c->btree_cache.lock);
+
+	bch2_trans_verify_locks(trans);
+}
+
+int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, unsigned flags)
+{
+	struct bch_fs *c = trans->c;
+	struct btree *b = bch2_btree_id_root(c, trans->paths[path].btree_id)->b;
+
+	if (btree_node_fake(b))
+		return bch2_btree_split_leaf(trans, path, flags);
+
+	struct btree_update *as =
+		bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags);
+	if (IS_ERR(as))
+		return PTR_ERR(as);
+
+	__btree_increase_depth(as, trans, path);
+	bch2_btree_update_done(as, trans);
+	return 0;
+}
+
 int __bch2_foreground_maybe_merge(struct btree_trans *trans,
 				  btree_path_idx_t path,
 				  unsigned level,
@@ -1797,6 +1950,18 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
 	BUG_ON(!trans->paths[path].should_be_locked);
 	BUG_ON(!btree_node_locked(&trans->paths[path], level));
 
+	/*
+	 * Work around a deadlock caused by the btree write buffer not doing
+	 * merges and leaving tons of merges for us to do - we really don't need
+	 * to be doing merges at all from the interior update path, and if the
+	 * interior update path is generating too many new interior updates we
+	 * deadlock:
+	 */
+	if ((flags & BCH_WATERMARK_MASK) == BCH_WATERMARK_interior_updates)
+		return 0;
+
+	flags &= ~BCH_WATERMARK_MASK;
+
 	b = trans->paths[path].l[level].b;
 
 	if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) ||
@@ -1845,8 +2010,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
 			__func__, buf1.buf, buf2.buf);
 		printbuf_exit(&buf1);
 		printbuf_exit(&buf2);
-		bch2_topology_error(c);
-		ret = -EIO;
+		ret = bch2_topology_error(c);
 		goto err;
 	}
 
@@ -1916,7 +2080,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
 
 	bch2_trans_verify_paths(trans);
 
-	ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
+	ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys);
 	if (ret)
 		goto err_free_update;
 
@@ -1943,6 +2107,10 @@ err:
 		bch2_path_put(trans, new_path, true);
 	bch2_path_put(trans, sib_path, true);
 	bch2_trans_verify_locks(trans);
+	if (ret == -BCH_ERR_journal_reclaim_would_deadlock)
+		ret = 0;
+	if (!ret)
+		ret = bch2_trans_relock(trans);
 	return ret;
 err_free_update:
 	bch2_btree_node_free_never_used(as, trans, n);
@@ -1987,14 +2155,14 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
 
 	if (parent) {
 		bch2_keylist_add(&as->parent_keys, &n->key);
-		ret = bch2_btree_insert_node(as, trans, iter->path,
-					     parent, &as->parent_keys, flags);
-		if (ret)
-			goto err;
+		ret = bch2_btree_insert_node(as, trans, iter->path, parent, &as->parent_keys);
 	} else {
-		bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n);
+		ret = bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n, false);
 	}
 
+	if (ret)
+		goto err;
+
 	bch2_btree_update_get_open_buckets(as, n);
 	bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
 
@@ -2069,7 +2237,7 @@ static void async_btree_node_rewrite_work(struct work_struct *work)
 
 	ret = bch2_trans_do(c, NULL, NULL, 0,
 		      async_btree_node_rewrite_trans(trans, a));
-	bch_err_fn(c, ret);
+	bch_err_fn_ratelimited(c, ret);
 	bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
 	kfree(a);
 }
@@ -2116,7 +2284,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
 		bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
 	}
 
-	queue_work(c->btree_interior_update_worker, &a->work);
+	queue_work(c->btree_node_rewrite_worker, &a->work);
 }
 
 void bch2_do_pending_node_rewrites(struct bch_fs *c)
@@ -2128,7 +2296,7 @@ void bch2_do_pending_node_rewrites(struct bch_fs *c)
 		list_del(&a->list);
 
 		bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite);
-		queue_work(c->btree_interior_update_worker, &a->work);
+		queue_work(c->btree_node_rewrite_worker, &a->work);
 	}
 	mutex_unlock(&c->pending_node_rewrites_lock);
 }
@@ -2339,7 +2507,7 @@ void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
 	bch2_btree_set_root_inmem(c, b);
 }
 
-static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
+static int __bch2_btree_root_alloc_fake(struct btree_trans *trans, enum btree_id id, unsigned level)
 {
 	struct bch_fs *c = trans->c;
 	struct closure cl;
@@ -2358,7 +2526,7 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
 
 	set_btree_node_fake(b);
 	set_btree_node_need_rewrite(b);
-	b->c.level	= 0;
+	b->c.level	= level;
 	b->c.btree_id	= id;
 
 	bkey_btree_ptr_init(&b->key);
@@ -2385,9 +2553,23 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
 	return 0;
 }
 
-void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
+void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned level)
 {
-	bch2_trans_run(c, __bch2_btree_root_alloc(trans, id));
+	bch2_trans_run(c, __bch2_btree_root_alloc_fake(trans, id, level));
+}
+
+static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)
+{
+	prt_printf(out, "%ps: btree=%s l=%u-%u watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
+		   (void *) as->ip_started,
+		   bch2_btree_id_str(as->btree_id),
+		   as->update_level_start,
+		   as->update_level_end,
+		   bch2_watermarks[as->watermark],
+		   bch2_btree_update_modes[as->mode],
+		   as->nodes_written,
+		   closure_nr_remaining(&as->cl),
+		   as->journal.seq);
 }
 
 void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
@@ -2396,12 +2578,7 @@ void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
 
 	mutex_lock(&c->btree_interior_update_lock);
 	list_for_each_entry(as, &c->btree_interior_update_list, list)
-		prt_printf(out, "%p m %u w %u r %u j %llu\n",
-		       as,
-		       as->mode,
-		       as->nodes_written,
-		       closure_nr_remaining(&as->cl),
-		       as->journal.seq);
+		bch2_btree_update_to_text(out, as);
 	mutex_unlock(&c->btree_interior_update_lock);
 }
 
@@ -2465,6 +2642,8 @@ bch2_btree_roots_to_journal_entries(struct bch_fs *c,
 
 void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
 {
+	if (c->btree_node_rewrite_worker)
+		destroy_workqueue(c->btree_node_rewrite_worker);
 	if (c->btree_interior_update_worker)
 		destroy_workqueue(c->btree_interior_update_worker);
 	mempool_exit(&c->btree_interior_update_pool);
@@ -2485,10 +2664,15 @@ void bch2_fs_btree_interior_update_init_early(struct bch_fs *c)
 int bch2_fs_btree_interior_update_init(struct bch_fs *c)
 {
 	c->btree_interior_update_worker =
-		alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
+		alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8);
 	if (!c->btree_interior_update_worker)
 		return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
 
+	c->btree_node_rewrite_worker =
+		alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND);
+	if (!c->btree_node_rewrite_worker)
+		return -BCH_ERR_ENOMEM_btree_interior_update_worker_init;
+
 	if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
 				      sizeof(struct btree_update)))
 		return -BCH_ERR_ENOMEM_btree_interior_update_pool_init;
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index c593c925d1e3..c1a479ebaad1 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -10,6 +10,20 @@
 
 #define BTREE_UPDATE_JOURNAL_RES	(BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1))
 
+int bch2_btree_node_check_topology(struct btree_trans *, struct btree *);
+
+#define BTREE_UPDATE_MODES()	\
+	x(none)			\
+	x(node)			\
+	x(root)			\
+	x(update)
+
+enum btree_update_mode {
+#define x(n)	BTREE_UPDATE_##n,
+	BTREE_UPDATE_MODES()
+#undef x
+};
+
 /*
  * Tracks an in progress split/rewrite of a btree node and the update to the
  * parent node:
@@ -32,28 +46,24 @@ struct btree_update {
 	struct closure			cl;
 	struct bch_fs			*c;
 	u64				start_time;
+	unsigned long			ip_started;
 
 	struct list_head		list;
 	struct list_head		unwritten_list;
 
-	/* What kind of update are we doing? */
-	enum {
-		BTREE_INTERIOR_NO_UPDATE,
-		BTREE_INTERIOR_UPDATING_NODE,
-		BTREE_INTERIOR_UPDATING_ROOT,
-		BTREE_INTERIOR_UPDATING_AS,
-	} mode;
-
+	enum btree_update_mode		mode;
+	enum bch_watermark		watermark;
 	unsigned			nodes_written:1;
 	unsigned			took_gc_lock:1;
 
 	enum btree_id			btree_id;
-	unsigned			update_level;
+	unsigned			update_level_start;
+	unsigned			update_level_end;
 
 	struct disk_reservation		disk_res;
 
 	/*
-	 * BTREE_INTERIOR_UPDATING_NODE:
+	 * BTREE_UPDATE_node:
 	 * The update that made the new nodes visible was a regular update to an
 	 * existing interior node - @b. We can't write out the update to @b
 	 * until the new nodes we created are finished writing, so we block @b
@@ -119,6 +129,8 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
 
 int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned);
 
+int bch2_btree_increase_depth(struct btree_trans *, btree_path_idx_t, unsigned);
+
 int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t,
 				  unsigned, unsigned, enum btree_node_sibling);
 
@@ -160,7 +172,7 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *,
 					struct bkey_i *, unsigned, bool);
 
 void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *);
-void bch2_btree_root_alloc(struct bch_fs *, enum btree_id);
+void bch2_btree_root_alloc_fake(struct bch_fs *, enum btree_id, unsigned);
 
 static inline unsigned btree_update_reserve_required(struct bch_fs *c,
 						     struct btree *b)
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index ac7844861966..36a6f42aba5e 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -11,6 +11,7 @@
 #include "journal_reclaim.h"
 
 #include <linux/prefetch.h>
+#include <linux/sort.h>
 
 static int bch2_btree_write_buffer_journal_flush(struct journal *,
 				struct journal_entry_pin *, u64);
@@ -46,6 +47,14 @@ static inline bool wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_ke
 #endif
 }
 
+static int wb_key_seq_cmp(const void *_l, const void *_r)
+{
+	const struct btree_write_buffered_key *l = _l;
+	const struct btree_write_buffered_key *r = _r;
+
+	return cmp_int(l->journal_seq, r->journal_seq);
+}
+
 /* Compare excluding idx, the low 24 bits: */
 static inline bool wb_key_eq(const void *_l, const void *_r)
 {
@@ -307,6 +316,16 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
 			    bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) {
 				bch2_btree_node_unlock_write(trans, path, path->l[0].b);
 				write_locked = false;
+
+				ret = lockrestart_do(trans,
+					bch2_btree_iter_traverse(&iter) ?:
+					bch2_foreground_maybe_merge(trans, iter.path, 0,
+							BCH_WATERMARK_reclaim|
+							BCH_TRANS_COMMIT_journal_reclaim|
+							BCH_TRANS_COMMIT_no_check_rw|
+							BCH_TRANS_COMMIT_no_enospc));
+				if (ret)
+					goto err;
 			}
 		}
 
@@ -357,6 +376,11 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
 		 */
 		trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr);
 
+		sort(wb->flushing.keys.data,
+		     wb->flushing.keys.nr,
+		     sizeof(wb->flushing.keys.data[0]),
+		     wb_key_seq_cmp, NULL);
+
 		darray_for_each(wb->flushing.keys, i) {
 			if (!i->journal_seq)
 				continue;
@@ -368,17 +392,17 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
 
 			ret = commit_do(trans, NULL, NULL,
 					BCH_WATERMARK_reclaim|
+					BCH_TRANS_COMMIT_journal_reclaim|
 					BCH_TRANS_COMMIT_no_check_rw|
 					BCH_TRANS_COMMIT_no_enospc|
-					BCH_TRANS_COMMIT_no_journal_res|
-					BCH_TRANS_COMMIT_journal_reclaim,
+					BCH_TRANS_COMMIT_no_journal_res ,
 					btree_write_buffered_insert(trans, i));
 			if (ret)
 				goto err;
 		}
 	}
 err:
-	bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
+	bch2_fs_fatal_err_on(ret, c, "%s", bch2_err_str(ret));
 	trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0);
 	bch2_journal_pin_drop(j, &wb->flushing.pin);
 	wb->flushing.keys.nr = 0;
@@ -574,8 +598,6 @@ void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys
 static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf)
 {
 	struct journal_keys_to_wb dst;
-	struct jset_entry *entry;
-	struct bkey_i *k;
 	int ret = 0;
 
 	bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq));
@@ -590,7 +612,9 @@ static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_bu
 		entry->type = BCH_JSET_ENTRY_btree_keys;
 	}
 
+	spin_lock(&c->journal.lock);
 	buf->need_flush_to_write_buffer = false;
+	spin_unlock(&c->journal.lock);
 out:
 	bch2_journal_keys_to_write_buffer_end(c, &dst);
 	return ret;
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 54f7826ac498..941401a210f5 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -525,6 +525,7 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
 			"different types of data in same bucket: %s, %s",
 			bch2_data_type_str(g->data_type),
 			bch2_data_type_str(data_type))) {
+		BUG();
 		ret = -EIO;
 		goto err;
 	}
@@ -628,6 +629,7 @@ int bch2_check_bucket_ref(struct btree_trans *trans,
 			bch2_data_type_str(ptr_data_type),
 			(printbuf_reset(&buf),
 			 bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+		BUG();
 		ret = -EIO;
 		goto err;
 	}
@@ -815,14 +817,14 @@ static int __mark_pointer(struct btree_trans *trans,
 static int bch2_trigger_pointer(struct btree_trans *trans,
 			enum btree_id btree_id, unsigned level,
 			struct bkey_s_c k, struct extent_ptr_decoded p,
-			s64 *sectors,
-			unsigned flags)
+			const union bch_extent_entry *entry,
+			s64 *sectors, unsigned flags)
 {
 	bool insert = !(flags & BTREE_TRIGGER_OVERWRITE);
 	struct bpos bucket;
 	struct bch_backpointer bp;
 
-	bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp);
+	bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, entry, &bucket, &bp);
 	*sectors = insert ? bp.bucket_len : -((s64) bp.bucket_len);
 
 	if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
@@ -851,7 +853,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans,
 	if (flags & BTREE_TRIGGER_GC) {
 		struct bch_fs *c = trans->c;
 		struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
-		enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p);
+		enum bch_data_type data_type = bch2_bkey_ptr_data_type(k, p, entry);
 
 		percpu_down_read(&c->mark_lock);
 		struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
@@ -979,7 +981,7 @@ static int __trigger_extent(struct btree_trans *trans,
 
 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
 		s64 disk_sectors;
-		ret = bch2_trigger_pointer(trans, btree_id, level, k, p, &disk_sectors, flags);
+		ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags);
 		if (ret < 0)
 			return ret;
 
@@ -990,8 +992,8 @@ static int __trigger_extent(struct btree_trans *trans,
 				ret = !gc
 					? bch2_update_cached_sectors_list(trans, p.ptr.dev, disk_sectors)
 					: update_cached_sectors(c, k, p.ptr.dev, disk_sectors, 0, true);
-				bch2_fs_fatal_err_on(ret && gc, c, "%s(): no replicas entry while updating cached sectors",
-						     __func__);
+				bch2_fs_fatal_err_on(ret && gc, c, "%s: no replicas entry while updating cached sectors",
+						     bch2_err_str(ret));
 				if (ret)
 					return ret;
 			}
@@ -1020,7 +1022,7 @@ static int __trigger_extent(struct btree_trans *trans,
 			struct printbuf buf = PRINTBUF;
 
 			bch2_bkey_val_to_text(&buf, c, k);
-			bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf);
+			bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf);
 			printbuf_exit(&buf);
 		}
 		if (ret)
@@ -1053,7 +1055,8 @@ int bch2_trigger_extent(struct btree_trans *trans,
 			  (int) bch2_bkey_needs_rebalance(c, old);
 
 		if (mod) {
-			int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new.k->p, mod > 0);
+			int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work,
+							      new.k->p, mod > 0);
 			if (ret)
 				return ret;
 		}
@@ -1335,7 +1338,7 @@ static void bucket_gens_free_rcu(struct rcu_head *rcu)
 	struct bucket_gens *buckets =
 		container_of(rcu, struct bucket_gens, rcu);
 
-	kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets);
+	kvfree(buckets);
 }
 
 int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
@@ -1345,16 +1348,16 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 	bool resize = ca->bucket_gens != NULL;
 	int ret;
 
-	if (!(bucket_gens	= kvpmalloc(sizeof(struct bucket_gens) + nbuckets,
-					    GFP_KERNEL|__GFP_ZERO))) {
+	if (!(bucket_gens	= kvmalloc(sizeof(struct bucket_gens) + nbuckets,
+					   GFP_KERNEL|__GFP_ZERO))) {
 		ret = -BCH_ERR_ENOMEM_bucket_gens;
 		goto err;
 	}
 
 	if ((c->opts.buckets_nouse &&
-	     !(buckets_nouse	= kvpmalloc(BITS_TO_LONGS(nbuckets) *
-					    sizeof(unsigned long),
-					    GFP_KERNEL|__GFP_ZERO)))) {
+	     !(buckets_nouse	= kvmalloc(BITS_TO_LONGS(nbuckets) *
+					   sizeof(unsigned long),
+					   GFP_KERNEL|__GFP_ZERO)))) {
 		ret = -BCH_ERR_ENOMEM_buckets_nouse;
 		goto err;
 	}
@@ -1397,8 +1400,7 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
 
 	ret = 0;
 err:
-	kvpfree(buckets_nouse,
-		BITS_TO_LONGS(nbuckets) * sizeof(unsigned long));
+	kvfree(buckets_nouse);
 	if (bucket_gens)
 		call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu);
 
@@ -1407,27 +1409,21 @@ err:
 
 void bch2_dev_buckets_free(struct bch_dev *ca)
 {
-	unsigned i;
-
-	kvpfree(ca->buckets_nouse,
-		BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long));
-	kvpfree(rcu_dereference_protected(ca->bucket_gens, 1),
-		sizeof(struct bucket_gens) + ca->mi.nbuckets);
+	kvfree(ca->buckets_nouse);
+	kvfree(rcu_dereference_protected(ca->bucket_gens, 1));
 
-	for (i = 0; i < ARRAY_SIZE(ca->usage); i++)
+	for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++)
 		free_percpu(ca->usage[i]);
 	kfree(ca->usage_base);
 }
 
 int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca)
 {
-	unsigned i;
-
 	ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL);
 	if (!ca->usage_base)
 		return -BCH_ERR_ENOMEM_usage_init;
 
-	for (i = 0; i < ARRAY_SIZE(ca->usage); i++) {
+	for (unsigned i = 0; i < ARRAY_SIZE(ca->usage); i++) {
 		ca->usage[i] = alloc_percpu(struct bch_dev_usage);
 		if (!ca->usage[i])
 			return -BCH_ERR_ENOMEM_usage_init;
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 6387e039f789..f9af5adabe83 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -226,6 +226,7 @@ static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_waterma
 		fallthrough;
 	case BCH_WATERMARK_btree_copygc:
 	case BCH_WATERMARK_reclaim:
+	case BCH_WATERMARK_interior_updates:
 		break;
 	}
 
@@ -394,14 +395,6 @@ static inline const char *bch2_data_type_str(enum bch_data_type type)
 		: "(invalid data type)";
 }
 
-static inline void bch2_prt_data_type(struct printbuf *out, enum bch_data_type type)
-{
-	if (type < BCH_DATA_NR)
-		prt_str(out, __bch2_data_types[type]);
-	else
-		prt_printf(out, "(invalid data type %u)", type);
-}
-
 /* disk reservations: */
 
 static inline void bch2_disk_reservation_put(struct bch_fs *c,
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 226b39c17667..72781aad6ba7 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -7,7 +7,7 @@
 #include "chardev.h"
 #include "journal.h"
 #include "move.h"
-#include "recovery.h"
+#include "recovery_passes.h"
 #include "replicas.h"
 #include "super.h"
 #include "super-io.h"
@@ -22,12 +22,6 @@
 #include <linux/slab.h>
 #include <linux/uaccess.h>
 
-__must_check
-static int copy_to_user_errcode(void __user *to, const void *from, unsigned long n)
-{
-	return copy_to_user(to, from, n) ? -EFAULT : 0;
-}
-
 /* returns with ref on ca->ref */
 static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
 					  unsigned flags)
@@ -140,39 +134,51 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg
 struct fsck_thread {
 	struct thread_with_stdio thr;
 	struct bch_fs		*c;
-	char			**devs;
-	size_t			nr_devs;
 	struct bch_opts		opts;
 };
 
 static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
 {
 	struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
-	if (thr->devs)
-		for (size_t i = 0; i < thr->nr_devs; i++)
-			kfree(thr->devs[i]);
-	kfree(thr->devs);
 	kfree(thr);
 }
 
-static int bch2_fsck_offline_thread_fn(void *arg)
+static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
 {
-	struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr);
-	struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts);
+	struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
+	struct bch_fs *c = thr->c;
 
-	thr->thr.thr.ret = PTR_ERR_OR_ZERO(c);
-	if (!thr->thr.thr.ret)
-		bch2_fs_stop(c);
+	int ret = PTR_ERR_OR_ZERO(c);
+	if (ret)
+		return ret;
 
-	thread_with_stdio_done(&thr->thr);
-	return 0;
+	ret = bch2_fs_start(thr->c);
+	if (ret)
+		goto err;
+
+	if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
+		bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
+		ret |= 1;
+	}
+	if (test_bit(BCH_FS_error, &c->flags)) {
+		bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
+		ret |= 4;
+	}
+err:
+	bch2_fs_stop(c);
+	return ret;
 }
 
+static const struct thread_with_stdio_ops bch2_offline_fsck_ops = {
+	.exit		= bch2_fsck_thread_exit,
+	.fn		= bch2_fsck_offline_thread_fn,
+};
+
 static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
 {
 	struct bch_ioctl_fsck_offline arg;
 	struct fsck_thread *thr = NULL;
-	u64 *devs = NULL;
+	darray_str(devs) = {};
 	long ret = 0;
 
 	if (copy_from_user(&arg, user_arg, sizeof(arg)))
@@ -184,29 +190,32 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if (!(devs = kcalloc(arg.nr_devs, sizeof(*devs), GFP_KERNEL)) ||
-	    !(thr = kzalloc(sizeof(*thr), GFP_KERNEL)) ||
-	    !(thr->devs = kcalloc(arg.nr_devs, sizeof(*thr->devs), GFP_KERNEL))) {
-		ret = -ENOMEM;
-		goto err;
-	}
+	for (size_t i = 0; i < arg.nr_devs; i++) {
+		u64 dev_u64;
+		ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64));
+		if (ret)
+			goto err;
 
-	thr->opts = bch2_opts_empty();
-	thr->nr_devs = arg.nr_devs;
+		char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX);
+		ret = PTR_ERR_OR_ZERO(dev_str);
+		if (ret)
+			goto err;
 
-	if (copy_from_user(devs, &user_arg->devs[0],
-			   array_size(sizeof(user_arg->devs[0]), arg.nr_devs))) {
-		ret = -EINVAL;
-		goto err;
+		ret = darray_push(&devs, dev_str);
+		if (ret) {
+			kfree(dev_str);
+			goto err;
+		}
 	}
 
-	for (size_t i = 0; i < arg.nr_devs; i++) {
-		thr->devs[i] = strndup_user((char __user *)(unsigned long) devs[i], PATH_MAX);
-		ret = PTR_ERR_OR_ZERO(thr->devs[i]);
-		if (ret)
-			goto err;
+	thr = kzalloc(sizeof(*thr), GFP_KERNEL);
+	if (!thr) {
+		ret = -ENOMEM;
+		goto err;
 	}
 
+	thr->opts = bch2_opts_empty();
+
 	if (arg.opts) {
 		char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
 
@@ -220,17 +229,26 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
 
 	opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
 
-	ret = bch2_run_thread_with_stdio(&thr->thr,
-			bch2_fsck_thread_exit,
-			bch2_fsck_offline_thread_fn);
-err:
-	if (ret < 0) {
-		if (thr)
-			bch2_fsck_thread_exit(&thr->thr);
-		pr_err("ret %s", bch2_err_str(ret));
-	}
-	kfree(devs);
+	/* We need request_key() to be called before we punt to kthread: */
+	opt_set(thr->opts, nostart, true);
+
+	thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts);
+
+	if (!IS_ERR(thr->c) &&
+	    thr->c->opts.errors == BCH_ON_ERROR_panic)
+		thr->c->opts.errors = BCH_ON_ERROR_ro;
+
+	ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_offline_fsck_ops);
+out:
+	darray_for_each(devs, i)
+		kfree(*i);
+	darray_exit(&devs);
 	return ret;
+err:
+	if (thr)
+		bch2_fsck_thread_exit(&thr->thr);
+	pr_err("ret %s", bch2_err_str(ret));
+	goto out;
 }
 
 static long bch2_global_ioctl(unsigned cmd, void __user *arg)
@@ -763,9 +781,9 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
 	return ret;
 }
 
-static int bch2_fsck_online_thread_fn(void *arg)
+static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio)
 {
-	struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr);
+	struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
 	struct bch_fs *c = thr->c;
 
 	c->stdio_filter = current;
@@ -793,13 +811,16 @@ static int bch2_fsck_online_thread_fn(void *arg)
 	c->stdio_filter = NULL;
 	c->opts.fix_errors = old_fix_errors;
 
-	thread_with_stdio_done(&thr->thr);
-
 	up(&c->online_fsck_mutex);
 	bch2_ro_ref_put(c);
-	return 0;
+	return ret;
 }
 
+static const struct thread_with_stdio_ops bch2_online_fsck_ops = {
+	.exit		= bch2_fsck_thread_exit,
+	.fn		= bch2_fsck_online_thread_fn,
+};
+
 static long bch2_ioctl_fsck_online(struct bch_fs *c,
 				   struct bch_ioctl_fsck_online arg)
 {
@@ -840,9 +861,7 @@ static long bch2_ioctl_fsck_online(struct bch_fs *c,
 			goto err;
 	}
 
-	ret = bch2_run_thread_with_stdio(&thr->thr,
-			bch2_fsck_thread_exit,
-			bch2_fsck_online_thread_fn);
+	ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_online_fsck_ops);
 err:
 	if (ret < 0) {
 		bch_err_fn(c, ret);
diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c
index 3c761ad6b1c8..7ed779b411f6 100644
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
@@ -429,15 +429,20 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio,
 				extent_nonce(version, crc_old), bio);
 
 	if (bch2_crc_cmp(merged, crc_old.csum) && !c->opts.no_data_io) {
-		bch_err(c, "checksum error in %s() (memory corruption or bug?)\n"
-			"expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)",
-			__func__,
-			crc_old.csum.hi,
-			crc_old.csum.lo,
-			merged.hi,
-			merged.lo,
-			bch2_csum_types[crc_old.csum_type],
-			bch2_csum_types[new_csum_type]);
+		struct printbuf buf = PRINTBUF;
+		prt_printf(&buf, "checksum error in %s() (memory corruption or bug?)\n"
+			   "expected %0llx:%0llx got %0llx:%0llx (old type ",
+			   __func__,
+			   crc_old.csum.hi,
+			   crc_old.csum.lo,
+			   merged.hi,
+			   merged.lo);
+		bch2_prt_csum_type(&buf, crc_old.csum_type);
+		prt_str(&buf, " new type ");
+		bch2_prt_csum_type(&buf, new_csum_type);
+		prt_str(&buf, ")");
+		bch_err(c, "%s", buf.buf);
+		printbuf_exit(&buf);
 		return -EIO;
 	}
 
@@ -558,7 +563,7 @@ got_key:
 	return 0;
 }
 
-#include "../crypto.h"
+#include "crypto.h"
 #endif
 
 int bch2_request_key(struct bch_sb *sb, struct bch_key *key)
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index 1b8c2c1016dc..e40499fde9a4 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -61,11 +61,12 @@ static inline void bch2_csum_err_msg(struct printbuf *out,
 				     struct bch_csum expected,
 				     struct bch_csum got)
 {
-	prt_printf(out, "checksum error: got ");
+	prt_str(out, "checksum error, type ");
+	bch2_prt_csum_type(out, type);
+	prt_str(out, ": got ");
 	bch2_csum_to_text(out, type, got);
 	prt_str(out, " should be ");
 	bch2_csum_to_text(out, type, expected);
-	prt_printf(out, " type %s", bch2_csum_types[type]);
 }
 
 int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index 33df8cf86bd8..1410365a8891 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -601,13 +601,13 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
 		return 0;
 
 	if (!mempool_initialized(&c->compression_bounce[READ]) &&
-	    mempool_init_kvpmalloc_pool(&c->compression_bounce[READ],
-					1, c->opts.encoded_extent_max))
+	    mempool_init_kvmalloc_pool(&c->compression_bounce[READ],
+				       1, c->opts.encoded_extent_max))
 		return -BCH_ERR_ENOMEM_compression_bounce_read_init;
 
 	if (!mempool_initialized(&c->compression_bounce[WRITE]) &&
-	    mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE],
-					1, c->opts.encoded_extent_max))
+	    mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE],
+				       1, c->opts.encoded_extent_max))
 		return -BCH_ERR_ENOMEM_compression_bounce_write_init;
 
 	for (i = compression_types;
@@ -622,15 +622,15 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
 		if (mempool_initialized(&c->compress_workspace[i->type]))
 			continue;
 
-		if (mempool_init_kvpmalloc_pool(
+		if (mempool_init_kvmalloc_pool(
 				&c->compress_workspace[i->type],
 				1, i->compress_workspace))
 			return -BCH_ERR_ENOMEM_compression_workspace_init;
 	}
 
 	if (!mempool_initialized(&c->decompress_workspace) &&
-	    mempool_init_kvpmalloc_pool(&c->decompress_workspace,
-					1, decompress_workspace_size))
+	    mempool_init_kvmalloc_pool(&c->decompress_workspace,
+				       1, decompress_workspace_size))
 		return -BCH_ERR_ENOMEM_decompression_workspace_init;
 
 	return 0;
diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h
index 58c2eb45570f..607fd5e232c9 100644
--- a/fs/bcachefs/compress.h
+++ b/fs/bcachefs/compress.h
@@ -47,14 +47,6 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
 	return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
 }
 
-static inline void bch2_prt_compression_type(struct printbuf *out, enum bch_compression_type type)
-{
-	if (type < BCH_COMPRESSION_TYPE_NR)
-		prt_str(out, __bch2_compression_types[type]);
-	else
-		prt_printf(out, "(invalid compression type %u)", type);
-}
-
 int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
 				struct bch_extent_crc_unpacked *);
 int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 4150feca42a2..0022b51ce3c0 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -14,6 +14,7 @@
 #include "move.h"
 #include "nocow_locking.h"
 #include "rebalance.h"
+#include "snapshot.h"
 #include "subvolume.h"
 #include "trace.h"
 
@@ -509,6 +510,14 @@ int bch2_data_update_init(struct btree_trans *trans,
 	unsigned ptrs_locked = 0;
 	int ret = 0;
 
+	/*
+	 * fs is corrupt  we have a key for a snapshot node that doesn't exist,
+	 * and we have to check for this because we go rw before repairing the
+	 * snapshots table - just skip it, we can move it later.
+	 */
+	if (unlikely(k.k->p.snapshot && !bch2_snapshot_equiv(c, k.k->p.snapshot)))
+		return -BCH_ERR_data_update_done;
+
 	bch2_bkey_buf_init(&m->k);
 	bch2_bkey_buf_reassemble(&m->k, c, k);
 	m->btree_id	= btree_id;
@@ -571,8 +580,7 @@ int bch2_data_update_init(struct btree_trans *trans,
 				move_ctxt_wait_event(ctxt,
 						(locked = bch2_bucket_nocow_trylock(&c->nocow_locks,
 									  PTR_BUCKET_POS(c, &p.ptr), 0)) ||
-						(!atomic_read(&ctxt->read_sectors) &&
-						 !atomic_read(&ctxt->write_sectors)));
+						list_empty(&ctxt->ios));
 
 				if (!locked)
 					bch2_bucket_nocow_lock(&c->nocow_locks,
@@ -590,6 +598,8 @@ int bch2_data_update_init(struct btree_trans *trans,
 		i++;
 	}
 
+	unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have));
+
 	/*
 	 * If current extent durability is less than io_opts.data_replicas,
 	 * we're not trying to rereplicate the extent up to data_replicas here -
@@ -599,7 +609,7 @@ int bch2_data_update_init(struct btree_trans *trans,
 	 * rereplicate, currently, so that users don't get an unexpected -ENOSPC
 	 */
 	if (!(m->data_opts.write_flags & BCH_WRITE_CACHED) &&
-	    durability_have >= io_opts.data_replicas) {
+	    !durability_required) {
 		m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
 		m->data_opts.rewrite_ptrs = 0;
 		/* if iter == NULL, it's just a promote */
@@ -608,11 +618,18 @@ int bch2_data_update_init(struct btree_trans *trans,
 		goto done;
 	}
 
-	m->op.nr_replicas = min(durability_removing, io_opts.data_replicas - durability_have) +
+	m->op.nr_replicas = min(durability_removing, durability_required) +
 		m->data_opts.extra_replicas;
-	m->op.nr_replicas_required = m->op.nr_replicas;
 
-	BUG_ON(!m->op.nr_replicas);
+	/*
+	 * If device(s) were set to durability=0 after data was written to them
+	 * we can end up with a duribilty=0 extent, and the normal algorithm
+	 * that tries not to increase durability doesn't work:
+	 */
+	if (!(durability_have + durability_removing))
+		m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1);
+
+	m->op.nr_replicas_required = m->op.nr_replicas;
 
 	if (reserve_sectors) {
 		ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 7bdba8507fc9..cd99b7399414 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -13,6 +13,7 @@
 #include "btree_iter.h"
 #include "btree_locking.h"
 #include "btree_update.h"
+#include "btree_update_interior.h"
 #include "buckets.h"
 #include "debug.h"
 #include "error.h"
@@ -137,7 +138,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
 	mutex_lock(&c->verify_lock);
 
 	if (!c->verify_ondisk) {
-		c->verify_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
+		c->verify_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
 		if (!c->verify_ondisk)
 			goto out;
 	}
@@ -170,7 +171,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
 		struct printbuf buf = PRINTBUF;
 
 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key));
-		bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf.buf);
+		bch2_fs_fatal_error(c, ": btree node verify failed for: %s\n", buf.buf);
 		printbuf_exit(&buf);
 	}
 out:
@@ -199,7 +200,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
 		return;
 	}
 
-	n_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
+	n_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL);
 	if (!n_ondisk) {
 		prt_printf(out, "memory allocation failure\n");
 		goto out;
@@ -293,7 +294,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
 out:
 	if (bio)
 		bio_put(bio);
-	kvpfree(n_ondisk, btree_buf_bytes(b));
+	kvfree(n_ondisk);
 	percpu_ref_put(&ca->io_ref);
 }
 
@@ -668,7 +669,7 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
 	i->size	= size;
 	i->ret	= 0;
 
-	do {
+	while (1) {
 		err = flush_buf(i);
 		if (err)
 			return err;
@@ -676,9 +677,12 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
 		if (!i->size)
 			break;
 
+		if (done)
+			break;
+
 		done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter);
 		i->iter++;
-	} while (!done);
+	}
 
 	if (i->buf.allocation_failure)
 		return -ENOMEM;
@@ -693,13 +697,45 @@ static const struct file_operations journal_pins_ops = {
 	.read		= bch2_journal_pins_read,
 };
 
+static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf,
+				       size_t size, loff_t *ppos)
+{
+	struct dump_iter *i = file->private_data;
+	struct bch_fs *c = i->c;
+	int err;
+
+	i->ubuf = buf;
+	i->size	= size;
+	i->ret	= 0;
+
+	if (!i->iter) {
+		bch2_btree_updates_to_text(&i->buf, c);
+		i->iter++;
+	}
+
+	err = flush_buf(i);
+	if (err)
+		return err;
+
+	if (i->buf.allocation_failure)
+		return -ENOMEM;
+
+	return i->ret;
+}
+
+static const struct file_operations btree_updates_ops = {
+	.owner		= THIS_MODULE,
+	.open		= bch2_dump_open,
+	.release	= bch2_dump_release,
+	.read		= bch2_btree_updates_read,
+};
+
 static int btree_transaction_stats_open(struct inode *inode, struct file *file)
 {
 	struct bch_fs *c = inode->i_private;
 	struct dump_iter *i;
 
 	i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
-
 	if (!i)
 		return -ENOMEM;
 
@@ -866,6 +902,20 @@ void bch2_fs_debug_exit(struct bch_fs *c)
 		debugfs_remove_recursive(c->fs_debug_dir);
 }
 
+static void bch2_fs_debug_btree_init(struct bch_fs *c, struct btree_debug *bd)
+{
+	struct dentry *d;
+
+	d = debugfs_create_dir(bch2_btree_id_str(bd->id), c->btree_debug_dir);
+
+	debugfs_create_file("keys", 0400, d, bd, &btree_debug_ops);
+
+	debugfs_create_file("formats", 0400, d, bd, &btree_format_debug_ops);
+
+	debugfs_create_file("bfloat-failed", 0400, d, bd,
+			    &bfloat_failed_debug_ops);
+}
+
 void bch2_fs_debug_init(struct bch_fs *c)
 {
 	struct btree_debug *bd;
@@ -888,6 +938,9 @@ void bch2_fs_debug_init(struct bch_fs *c)
 	debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
 			    c->btree_debug, &journal_pins_ops);
 
+	debugfs_create_file("btree_updates", 0400, c->fs_debug_dir,
+			    c->btree_debug, &btree_updates_ops);
+
 	debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir,
 			    c, &btree_transaction_stats_op);
 
@@ -902,21 +955,7 @@ void bch2_fs_debug_init(struct bch_fs *c)
 	     bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
 	     bd++) {
 		bd->id = bd - c->btree_debug;
-		debugfs_create_file(bch2_btree_id_str(bd->id),
-				    0400, c->btree_debug_dir, bd,
-				    &btree_debug_ops);
-
-		snprintf(name, sizeof(name), "%s-formats",
-			 bch2_btree_id_str(bd->id));
-
-		debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
-				    &btree_format_debug_ops);
-
-		snprintf(name, sizeof(name), "%s-bfloat-failed",
-			 bch2_btree_id_str(bd->id));
-
-		debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
-				    &bfloat_failed_debug_ops);
+		bch2_fs_debug_btree_init(c, bd);
 	}
 }
 
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 4ae1e9f002a0..d37bd07afbfe 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -144,19 +144,21 @@ fsck_err:
 	return ret;
 }
 
-void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c,
-			 struct bkey_s_c k)
+void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
 {
 	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
 	struct qstr d_name = bch2_dirent_get_name(d);
 
-	prt_printf(out, "%.*s -> %llu type %s",
-	       d_name.len,
-	       d_name.name,
-	       d.v->d_type != DT_SUBVOL
-	       ? le64_to_cpu(d.v->d_inum)
-	       : le32_to_cpu(d.v->d_child_subvol),
-	       bch2_d_type_str(d.v->d_type));
+	prt_printf(out, "%.*s -> ", d_name.len, d_name.name);
+
+	if (d.v->d_type != DT_SUBVOL)
+		prt_printf(out, "%llu", le64_to_cpu(d.v->d_inum));
+	else
+		prt_printf(out, "%u -> %u",
+			   le32_to_cpu(d.v->d_parent_subvol),
+			   le32_to_cpu(d.v->d_child_subvol));
+
+	prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type));
 }
 
 static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
@@ -199,17 +201,17 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
 }
 
 int bch2_dirent_create_snapshot(struct btree_trans *trans,
-			u64 dir, u32 snapshot,
+			u32 dir_subvol, u64 dir, u32 snapshot,
 			const struct bch_hash_info *hash_info,
 			u8 type, const struct qstr *name, u64 dst_inum,
 			u64 *dir_offset,
 			bch_str_hash_flags_t str_hash_flags)
 {
-	subvol_inum zero_inum = { 0 };
+	subvol_inum dir_inum = { .subvol = dir_subvol, .inum = dir };
 	struct bkey_i_dirent *dirent;
 	int ret;
 
-	dirent = dirent_create_key(trans, zero_inum, type, name, dst_inum);
+	dirent = dirent_create_key(trans, dir_inum, type, name, dst_inum);
 	ret = PTR_ERR_OR_ZERO(dirent);
 	if (ret)
 		return ret;
@@ -217,10 +219,10 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans,
 	dirent->k.p.inode	= dir;
 	dirent->k.p.snapshot	= snapshot;
 
-	ret = bch2_hash_set_snapshot(trans, bch2_dirent_hash_desc, hash_info,
-				     zero_inum, snapshot,
-				     &dirent->k_i, str_hash_flags,
-				     BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+	ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info,
+					dir_inum, snapshot,
+					&dirent->k_i, str_hash_flags,
+					BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
 	*dir_offset = dirent->k.p.offset;
 
 	return ret;
@@ -291,12 +293,10 @@ int bch2_dirent_rename(struct btree_trans *trans,
 	struct bkey_i_dirent *new_src = NULL, *new_dst = NULL;
 	struct bpos dst_pos =
 		POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name));
-	unsigned src_type = 0, dst_type = 0, src_update_flags = 0;
+	unsigned src_update_flags = 0;
+	bool delete_src, delete_dst;
 	int ret = 0;
 
-	if (src_dir.subvol != dst_dir.subvol)
-		return -EXDEV;
-
 	memset(src_inum, 0, sizeof(*src_inum));
 	memset(dst_inum, 0, sizeof(*dst_inum));
 
@@ -317,12 +317,6 @@ int bch2_dirent_rename(struct btree_trans *trans,
 	if (ret)
 		goto out;
 
-	src_type = bkey_s_c_to_dirent(old_src).v->d_type;
-
-	if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE)
-		return -EOPNOTSUPP;
-
-
 	/* Lookup dst: */
 	if (mode == BCH_RENAME) {
 		/*
@@ -350,11 +344,6 @@ int bch2_dirent_rename(struct btree_trans *trans,
 				bkey_s_c_to_dirent(old_dst), dst_inum);
 		if (ret)
 			goto out;
-
-		dst_type = bkey_s_c_to_dirent(old_dst).v->d_type;
-
-		if (dst_type == DT_SUBVOL)
-			return -EOPNOTSUPP;
 	}
 
 	if (mode != BCH_RENAME_EXCHANGE)
@@ -424,28 +413,55 @@ int bch2_dirent_rename(struct btree_trans *trans,
 		}
 	}
 
+	if (new_dst->v.d_type == DT_SUBVOL)
+		new_dst->v.d_parent_subvol = cpu_to_le32(dst_dir.subvol);
+
+	if ((mode == BCH_RENAME_EXCHANGE) &&
+	    new_src->v.d_type == DT_SUBVOL)
+		new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol);
+
 	ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0);
 	if (ret)
 		goto out;
 out_set_src:
-
 	/*
-	 * If we're deleting a subvolume, we need to really delete the dirent,
-	 * not just emit a whiteout in the current snapshot:
+	 * If we're deleting a subvolume we need to really delete the dirent,
+	 * not just emit a whiteout in the current snapshot - there can only be
+	 * single dirent that points to a given subvolume.
+	 *
+	 * IOW, we don't maintain multiple versions in different snapshots of
+	 * dirents that point to subvolumes - dirents that point to subvolumes
+	 * are only visible in one particular subvolume so it's not necessary,
+	 * and it would be particularly confusing for fsck to have to deal with.
 	 */
-	if (src_type == DT_SUBVOL) {
-		bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
-		ret = bch2_btree_iter_traverse(&src_iter);
+	delete_src = bkey_s_c_to_dirent(old_src).v->d_type == DT_SUBVOL &&
+		new_src->k.p.snapshot != old_src.k->p.snapshot;
+
+	delete_dst = old_dst.k &&
+		bkey_s_c_to_dirent(old_dst).v->d_type == DT_SUBVOL &&
+		new_dst->k.p.snapshot != old_dst.k->p.snapshot;
+
+	if (!delete_src || !bkey_deleted(&new_src->k)) {
+		ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
 		if (ret)
 			goto out;
+	}
 
-		new_src->k.p = src_iter.pos;
-		src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE;
+	if (delete_src) {
+		bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot);
+		ret =   bch2_btree_iter_traverse(&src_iter) ?:
+			bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+		if (ret)
+			goto out;
 	}
 
-	ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags);
-	if (ret)
-		goto out;
+	if (delete_dst) {
+		bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot);
+		ret =   bch2_btree_iter_traverse(&dst_iter) ?:
+			bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+		if (ret)
+			goto out;
+	}
 
 	if (mode == BCH_RENAME_EXCHANGE)
 		*src_offset = new_src->k.p.offset;
@@ -456,41 +472,29 @@ out:
 	return ret;
 }
 
-int __bch2_dirent_lookup_trans(struct btree_trans *trans,
-			       struct btree_iter *iter,
-			       subvol_inum dir,
-			       const struct bch_hash_info *hash_info,
-			       const struct qstr *name, subvol_inum *inum,
-			       unsigned flags)
+int bch2_dirent_lookup_trans(struct btree_trans *trans,
+			     struct btree_iter *iter,
+			     subvol_inum dir,
+			     const struct bch_hash_info *hash_info,
+			     const struct qstr *name, subvol_inum *inum,
+			     unsigned flags)
 {
-	struct bkey_s_c k;
-	struct bkey_s_c_dirent d;
-	u32 snapshot;
-	int ret;
-
-	ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot);
-	if (ret)
-		return ret;
-
-	ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
-			       hash_info, dir, name, flags);
+	int ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
+				   hash_info, dir, name, flags);
 	if (ret)
 		return ret;
 
-	k = bch2_btree_iter_peek_slot(iter);
+	struct bkey_s_c k = bch2_btree_iter_peek_slot(iter);
 	ret = bkey_err(k);
 	if (ret)
 		goto err;
 
-	d = bkey_s_c_to_dirent(k);
-
-	ret = bch2_dirent_read_target(trans, dir, d, inum);
+	ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), inum);
 	if (ret > 0)
 		ret = -ENOENT;
 err:
 	if (ret)
 		bch2_trans_iter_exit(trans, iter);
-
 	return ret;
 }
 
@@ -502,13 +506,13 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
 	struct btree_iter iter = { NULL };
 
 	int ret = lockrestart_do(trans,
-		__bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
+		bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
 	bch2_trans_iter_exit(trans, &iter);
 	bch2_trans_put(trans);
 	return ret;
 }
 
-int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 snapshot)
+int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 snapshot)
 {
 	struct btree_iter iter;
 	struct bkey_s_c k;
@@ -518,7 +522,10 @@ int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 snapshot)
 			   SPOS(dir, 0, snapshot),
 			   POS(dir, U64_MAX), 0, k, ret)
 		if (k.k->type == KEY_TYPE_dirent) {
-			ret = -ENOTEMPTY;
+			struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+			if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol)
+				continue;
+			ret = -BCH_ERR_ENOTEMPTY_dir_not_empty;
 			break;
 		}
 	bch2_trans_iter_exit(trans, &iter);
@@ -531,7 +538,7 @@ int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir)
 	u32 snapshot;
 
 	return bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot) ?:
-		bch2_empty_dir_snapshot(trans, dir.inum, snapshot);
+		bch2_empty_dir_snapshot(trans, dir.inum, dir.subvol, snapshot);
 }
 
 int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx)
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index 21ffeb78f02e..bee55cca2aa0 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -35,7 +35,7 @@ static inline unsigned dirent_val_u64s(unsigned len)
 int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
 			    struct bkey_s_c_dirent, subvol_inum *);
 
-int bch2_dirent_create_snapshot(struct btree_trans *, u64, u32,
+int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32,
 			const struct bch_hash_info *, u8,
 			const struct qstr *, u64, u64 *,
 			bch_str_hash_flags_t);
@@ -62,14 +62,14 @@ int bch2_dirent_rename(struct btree_trans *,
 		       const struct qstr *, subvol_inum *, u64 *,
 		       enum bch_rename_mode);
 
-int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
+int bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *,
 			       subvol_inum, const struct bch_hash_info *,
 			       const struct qstr *, subvol_inum *, unsigned);
 u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum,
 		       const struct bch_hash_info *,
 		       const struct qstr *, subvol_inum *);
 
-int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32);
+int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32);
 int bch2_empty_dir_trans(struct btree_trans *, subvol_inum);
 int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *);
 
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index d503af270024..556a217108d3 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -131,29 +131,33 @@ fsck_err:
 void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
 			 struct bkey_s_c k)
 {
-	const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
-	unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
+	const struct bch_stripe *sp = bkey_s_c_to_stripe(k).v;
+	struct bch_stripe s = {};
+
+	memcpy(&s, sp, min(sizeof(s), bkey_val_bytes(k.k)));
+
+	unsigned nr_data = s.nr_blocks - s.nr_redundant;
+
+	prt_printf(out, "algo %u sectors %u blocks %u:%u csum ",
+		   s.algorithm,
+		   le16_to_cpu(s.sectors),
+		   nr_data,
+		   s.nr_redundant);
+	bch2_prt_csum_type(out, s.csum_type);
+	prt_printf(out, " gran %u", 1U << s.csum_granularity_bits);
+
+	for (unsigned i = 0; i < s.nr_blocks; i++) {
+		const struct bch_extent_ptr *ptr = sp->ptrs + i;
+
+		if ((void *) ptr >= bkey_val_end(k))
+			break;
+
+		bch2_extent_ptr_to_text(out, c, ptr);
 
-	prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u",
-	       s->algorithm,
-	       le16_to_cpu(s->sectors),
-	       nr_data,
-	       s->nr_redundant,
-	       s->csum_type,
-	       1U << s->csum_granularity_bits);
-
-	for (i = 0; i < s->nr_blocks; i++) {
-		const struct bch_extent_ptr *ptr = s->ptrs + i;
-		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
-		u32 offset;
-		u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
-
-		prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset);
-		if (i < nr_data)
-			prt_printf(out, "#%u", stripe_blockcount_get(s, i));
-		prt_printf(out, " gen %u", ptr->gen);
-		if (ptr_stale(ca, ptr))
-			prt_printf(out, " stale");
+		if (s.csum_type < BCH_CSUM_NR &&
+		    i < nr_data &&
+		    stripe_blockcount_offset(&s, i) < bkey_val_bytes(k.k))
+			prt_printf(out,  "#%u", stripe_blockcount_get(sp, i));
 	}
 }
 
@@ -448,7 +452,7 @@ int bch2_trigger_stripe(struct btree_trans *trans,
 			struct printbuf buf = PRINTBUF;
 
 			bch2_bkey_val_to_text(&buf, c, new);
-			bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf);
+			bch2_fs_fatal_error(c, ": no replicas entry for %s", buf.buf);
 			printbuf_exit(&buf);
 			return ret;
 		}
@@ -504,7 +508,7 @@ static void ec_stripe_buf_exit(struct ec_stripe_buf *buf)
 		unsigned i;
 
 		for (i = 0; i < s->v.nr_blocks; i++) {
-			kvpfree(buf->data[i], buf->size << 9);
+			kvfree(buf->data[i]);
 			buf->data[i] = NULL;
 		}
 	}
@@ -531,7 +535,7 @@ static int ec_stripe_buf_init(struct ec_stripe_buf *buf,
 	memset(buf->valid, 0xFF, sizeof(buf->valid));
 
 	for (i = 0; i < v->nr_blocks; i++) {
-		buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL);
+		buf->data[i] = kvmalloc(buf->size << 9, GFP_KERNEL);
 		if (!buf->data[i])
 			goto err;
 	}
@@ -607,10 +611,8 @@ static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf)
 				struct printbuf err = PRINTBUF;
 				struct bch_dev *ca = bch_dev_bkey_exists(c, v->ptrs[i].dev);
 
-				prt_printf(&err, "stripe checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)\n",
-					   want.hi, want.lo,
-					   got.hi, got.lo,
-					   bch2_csum_types[v->csum_type]);
+				prt_str(&err, "stripe ");
+				bch2_csum_err_msg(&err, v->csum_type, want, got);
 				prt_printf(&err, "  for %ps at %u of\n  ", (void *) _RET_IP_, i);
 				bch2_bkey_val_to_text(&err, c, bkey_i_to_s_c(&buf->key));
 				bch_err_ratelimited(ca, "%s", err.buf);
@@ -1868,10 +1870,10 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri
 		return -BCH_ERR_stripe_alloc_blocked;
 
 	ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe);
+	bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c,
+			     "reading stripe key: %s", bch2_err_str(ret));
 	if (ret) {
 		bch2_stripe_close(c, h->s);
-		if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
-			bch2_fs_fatal_error(c, "error reading stripe key: %s", bch2_err_str(ret));
 		return ret;
 	}
 
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index f4369b02e805..f042616888b0 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -32,6 +32,8 @@ static inline unsigned stripe_csums_per_device(const struct bch_stripe *s)
 static inline unsigned stripe_csum_offset(const struct bch_stripe *s,
 					  unsigned dev, unsigned csum_idx)
 {
+	EBUG_ON(s->csum_type >= BCH_CSUM_NR);
+
 	unsigned csum_bytes = bch_crc_bytes[s->csum_type];
 
 	return sizeof(struct bch_stripe) +
diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c
index d260ff9bbfeb..43557bebd0f8 100644
--- a/fs/bcachefs/errcode.c
+++ b/fs/bcachefs/errcode.c
@@ -2,6 +2,7 @@
 
 #include "bcachefs.h"
 #include "errcode.h"
+#include "trace.h"
 
 #include <linux/errname.h>
 
@@ -49,15 +50,17 @@ bool __bch2_err_matches(int err, int class)
 	return err == class;
 }
 
-int __bch2_err_class(int err)
+int __bch2_err_class(int bch_err)
 {
-	err = -err;
-	BUG_ON((unsigned) err >= BCH_ERR_MAX);
+	int std_err = -bch_err;
+	BUG_ON((unsigned) std_err >= BCH_ERR_MAX);
 
-	while (err >= BCH_ERR_START && bch2_errcode_parents[err - BCH_ERR_START])
-		err = bch2_errcode_parents[err - BCH_ERR_START];
+	while (std_err >= BCH_ERR_START && bch2_errcode_parents[std_err - BCH_ERR_START])
+		std_err = bch2_errcode_parents[std_err - BCH_ERR_START];
+
+	trace_error_downcast(bch_err, std_err, _RET_IP_);
 
-	return -err;
+	return -std_err;
 }
 
 const char *bch2_blk_status_to_str(blk_status_t status)
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 8c40c2067a04..01a79fa3eacb 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -5,6 +5,10 @@
 #define BCH_ERRCODES()								\
 	x(ERANGE,			ERANGE_option_too_small)		\
 	x(ERANGE,			ERANGE_option_too_big)			\
+	x(EINVAL,			mount_option)				\
+	x(BCH_ERR_mount_option,		option_name)				\
+	x(BCH_ERR_mount_option,		option_value)				\
+	x(BCH_ERR_mount_option,         option_not_bool)                        \
 	x(ENOMEM,			ENOMEM_stripe_buf)			\
 	x(ENOMEM,			ENOMEM_replicas_table)			\
 	x(ENOMEM,			ENOMEM_cpu_replicas)			\
@@ -78,6 +82,7 @@
 	x(ENOMEM,			ENOMEM_fs_name_alloc)			\
 	x(ENOMEM,			ENOMEM_fs_other_alloc)			\
 	x(ENOMEM,			ENOMEM_dev_alloc)			\
+	x(ENOMEM,			ENOMEM_disk_accounting)			\
 	x(ENOSPC,			ENOSPC_disk_reservation)		\
 	x(ENOSPC,			ENOSPC_bucket_alloc)			\
 	x(ENOSPC,			ENOSPC_disk_label_add)			\
@@ -109,6 +114,8 @@
 	x(ENOENT,			ENOENT_dirent_doesnt_match_inode)	\
 	x(ENOENT,			ENOENT_dev_not_found)			\
 	x(ENOENT,			ENOENT_dev_idx_not_found)		\
+	x(ENOTEMPTY,			ENOTEMPTY_dir_not_empty)		\
+	x(ENOTEMPTY,			ENOTEMPTY_subvol_not_empty)		\
 	x(0,				open_buckets_empty)			\
 	x(0,				freelist_empty)				\
 	x(BCH_ERR_freelist_empty,	no_buckets_found)			\
@@ -176,6 +183,9 @@
 	x(EINVAL,			invalid)				\
 	x(EINVAL,			internal_fsck_err)			\
 	x(EINVAL,			opt_parse_error)			\
+	x(EINVAL,			remove_with_metadata_missing_unimplemented)\
+	x(EINVAL,			remove_would_lose_data)			\
+	x(EINVAL,			btree_iter_with_journal_not_supported)	\
 	x(EROFS,			erofs_trans_commit)			\
 	x(EROFS,			erofs_no_writes)			\
 	x(EROFS,			erofs_journal_err)			\
@@ -225,7 +235,10 @@
 	x(BCH_ERR_operation_blocked,    nocow_lock_blocked)			\
 	x(EIO,				btree_node_read_err)			\
 	x(EIO,				sb_not_downgraded)			\
-	x(EIO,				btree_write_all_failed)			\
+	x(EIO,				btree_node_write_all_failed)		\
+	x(EIO,				btree_node_read_error)			\
+	x(EIO,				btree_node_read_validate_error)		\
+	x(EIO,				btree_need_topology_repair)		\
 	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_fixable)		\
 	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_want_retry)		\
 	x(BCH_ERR_btree_node_read_err,	btree_node_read_err_must_retry)		\
@@ -238,7 +251,9 @@
 	x(BCH_ERR_nopromote,		nopromote_congested)			\
 	x(BCH_ERR_nopromote,		nopromote_in_flight)			\
 	x(BCH_ERR_nopromote,		nopromote_no_writes)			\
-	x(BCH_ERR_nopromote,		nopromote_enomem)
+	x(BCH_ERR_nopromote,		nopromote_enomem)			\
+	x(0,				need_inode_lock)			\
+	x(0,				invalid_snapshot_node)
 
 enum bch_errcode {
 	BCH_ERR_START		= 2048,
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index d32c8bebe46c..82a6656c941c 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "bcachefs.h"
 #include "error.h"
+#include "journal.h"
+#include "recovery_passes.h"
 #include "super.h"
 #include "thread_with_file.h"
 
@@ -15,7 +17,8 @@ bool bch2_inconsistent_error(struct bch_fs *c)
 		return false;
 	case BCH_ON_ERROR_ro:
 		if (bch2_fs_emergency_read_only(c))
-			bch_err(c, "inconsistency detected - emergency read only");
+			bch_err(c, "inconsistency detected - emergency read only at journal seq %llu",
+				journal_cur_seq(&c->journal));
 		return true;
 	case BCH_ON_ERROR_panic:
 		panic(bch2_fmt(c, "panic after error"));
@@ -25,11 +28,16 @@ bool bch2_inconsistent_error(struct bch_fs *c)
 	}
 }
 
-void bch2_topology_error(struct bch_fs *c)
+int bch2_topology_error(struct bch_fs *c)
 {
 	set_bit(BCH_FS_topology_error, &c->flags);
-	if (!test_bit(BCH_FS_fsck_running, &c->flags))
+	if (!test_bit(BCH_FS_fsck_running, &c->flags)) {
 		bch2_inconsistent_error(c);
+		return -BCH_ERR_btree_need_topology_repair;
+	} else {
+		return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?:
+			-BCH_ERR_btree_node_read_validate_error;
+	}
 }
 
 void bch2_fatal_error(struct bch_fs *c)
diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h
index fec17d1353d1..36caedf72d89 100644
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
@@ -30,7 +30,13 @@ struct work_struct;
 
 bool bch2_inconsistent_error(struct bch_fs *);
 
-void bch2_topology_error(struct bch_fs *);
+int bch2_topology_error(struct bch_fs *);
+
+#define bch2_fs_topology_error(c, ...)					\
+({									\
+	bch_err(c, "btree topology error: " __VA_ARGS__);		\
+	bch2_topology_error(c);						\
+})
 
 #define bch2_fs_inconsistent(c, ...)					\
 ({									\
@@ -191,9 +197,9 @@ do {									\
 
 void bch2_fatal_error(struct bch_fs *);
 
-#define bch2_fs_fatal_error(c, ...)					\
+#define bch2_fs_fatal_error(c, _msg, ...)				\
 do {									\
-	bch_err(c, __VA_ARGS__);					\
+	bch_err(c, "%s(): fatal error " _msg, __func__, ##__VA_ARGS__);	\
 	bch2_fatal_error(c);						\
 } while (0)
 
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 61395b113df9..1a331e539204 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -189,13 +189,18 @@ int bch2_btree_ptr_v2_invalid(struct bch_fs *c, struct bkey_s_c k,
 			      enum bkey_invalid_flags flags,
 			      struct printbuf *err)
 {
+	struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k);
 	int ret = 0;
 
-	bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX, c, err,
-			 btree_ptr_v2_val_too_big,
+	bkey_fsck_err_on(bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX,
+			 c, err, btree_ptr_v2_val_too_big,
 			 "value too big (%zu > %zu)",
 			 bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX);
 
+	bkey_fsck_err_on(bpos_ge(bp.v->min_key, bp.k->p),
+			 c, err, btree_ptr_v2_min_key_bad,
+			 "min_key > key");
+
 	ret = bch2_bkey_ptrs_invalid(c, k, flags, err);
 fsck_err:
 	return ret;
@@ -973,6 +978,33 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k)
 	return bkey_deleted(k.k);
 }
 
+void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr)
+{
+	struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
+		? bch_dev_bkey_exists(c, ptr->dev)
+		: NULL;
+
+	if (!ca) {
+		prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
+			   (u64) ptr->offset, ptr->gen,
+			   ptr->cached ? " cached" : "");
+	} else {
+		u32 offset;
+		u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
+
+		prt_printf(out, "ptr: %u:%llu:%u gen %u",
+			   ptr->dev, b, offset, ptr->gen);
+		if (ptr->cached)
+			prt_str(out, " cached");
+		if (ptr->unwritten)
+			prt_str(out, " unwritten");
+		if (b >= ca->mi.first_bucket &&
+		    b <  ca->mi.nbuckets &&
+		    ptr_stale(ca, ptr))
+			prt_printf(out, " stale");
+	}
+}
+
 void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
 			    struct bkey_s_c k)
 {
@@ -988,42 +1020,22 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
 			prt_printf(out, " ");
 
 		switch (__extent_entry_type(entry)) {
-		case BCH_EXTENT_ENTRY_ptr: {
-			const struct bch_extent_ptr *ptr = entry_to_ptr(entry);
-			struct bch_dev *ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev]
-				? bch_dev_bkey_exists(c, ptr->dev)
-				: NULL;
-
-			if (!ca) {
-				prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev,
-				       (u64) ptr->offset, ptr->gen,
-				       ptr->cached ? " cached" : "");
-			} else {
-				u32 offset;
-				u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset);
-
-				prt_printf(out, "ptr: %u:%llu:%u gen %u",
-					   ptr->dev, b, offset, ptr->gen);
-				if (ptr->cached)
-					prt_str(out, " cached");
-				if (ptr->unwritten)
-					prt_str(out, " unwritten");
-				if (ca && ptr_stale(ca, ptr))
-					prt_printf(out, " stale");
-			}
+		case BCH_EXTENT_ENTRY_ptr:
+			bch2_extent_ptr_to_text(out, c, entry_to_ptr(entry));
 			break;
-		}
+
 		case BCH_EXTENT_ENTRY_crc32:
 		case BCH_EXTENT_ENTRY_crc64:
 		case BCH_EXTENT_ENTRY_crc128: {
 			struct bch_extent_crc_unpacked crc =
 				bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
 
-			prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress ",
+			prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum ",
 			       crc.compressed_size,
 			       crc.uncompressed_size,
-			       crc.offset, crc.nonce,
-			       bch2_csum_types[crc.csum_type]);
+			       crc.offset, crc.nonce);
+			bch2_prt_csum_type(out, crc.csum_type);
+			prt_str(out, " compress ");
 			bch2_prt_compression_type(out, crc.compression_type);
 			break;
 		}
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 6bf839d69e84..528e817eacbd 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -43,6 +43,11 @@ enum bkey_invalid_flags;
 #define extent_entry_next(_entry)					\
 	((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry)))
 
+#define extent_entry_next_safe(_entry, _end)				\
+	(likely(__extent_entry_type(_entry) < BCH_EXTENT_ENTRY_MAX)	\
+	 ? extent_entry_next(_entry)					\
+	 : _end)
+
 static inline unsigned
 __extent_entry_type(const union bch_extent_entry *e)
 {
@@ -103,17 +108,17 @@ static inline void extent_entry_drop(struct bkey_s k, union bch_extent_entry *en
 
 static inline bool extent_entry_is_ptr(const union bch_extent_entry *e)
 {
-	return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
+	return __extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr;
 }
 
 static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e)
 {
-	return extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr;
+	return __extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr;
 }
 
 static inline bool extent_entry_is_crc(const union bch_extent_entry *e)
 {
-	switch (extent_entry_type(e)) {
+	switch (__extent_entry_type(e)) {
 	case BCH_EXTENT_ENTRY_crc32:
 	case BCH_EXTENT_ENTRY_crc64:
 	case BCH_EXTENT_ENTRY_crc128:
@@ -280,7 +285,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
 #define __bkey_extent_entry_for_each_from(_start, _end, _entry)		\
 	for ((_entry) = (_start);					\
 	     (_entry) < (_end);						\
-	     (_entry) = extent_entry_next(_entry))
+	     (_entry) = extent_entry_next_safe(_entry, _end))
 
 #define __bkey_ptr_next(_ptr, _end)					\
 ({									\
@@ -318,7 +323,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
 	(_ptr).has_ec	= false;					\
 									\
 	__bkey_extent_entry_for_each_from(_entry, _end, _entry)		\
-		switch (extent_entry_type(_entry)) {			\
+		switch (__extent_entry_type(_entry)) {			\
 		case BCH_EXTENT_ENTRY_ptr:				\
 			(_ptr).ptr		= _entry->ptr;		\
 			goto out;					\
@@ -344,7 +349,7 @@ out:									\
 	for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL),		\
 	     (_entry) = _start;						\
 	     __bkey_ptr_next_decode(_k, _end, _ptr, _entry);		\
-	     (_entry) = extent_entry_next(_entry))
+	     (_entry) = extent_entry_next_safe(_entry, _end))
 
 #define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry)			\
 	__bkey_for_each_ptr_decode(_k, (_p).start, (_p).end,		\
@@ -591,30 +596,6 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
 	return ret;
 }
 
-static inline unsigned bch2_bkey_ptr_data_type(struct bkey_s_c k, const struct bch_extent_ptr *ptr)
-{
-	switch (k.k->type) {
-	case KEY_TYPE_btree_ptr:
-	case KEY_TYPE_btree_ptr_v2:
-		return BCH_DATA_btree;
-	case KEY_TYPE_extent:
-	case KEY_TYPE_reflink_v:
-		return BCH_DATA_user;
-	case KEY_TYPE_stripe: {
-		struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
-
-		BUG_ON(ptr < s.v->ptrs ||
-		       ptr >= s.v->ptrs + s.v->nr_blocks);
-
-		return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant
-			? BCH_DATA_parity
-			: BCH_DATA_user;
-	}
-	default:
-		BUG();
-	}
-}
-
 unsigned bch2_bkey_nr_ptrs(struct bkey_s_c);
 unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
 unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c);
@@ -695,6 +676,7 @@ bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s);
 void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *);
 
 bool bch2_extent_normalize(struct bch_fs *, struct bkey_s);
+void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *, const struct bch_extent_ptr *);
 void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *,
 			    struct bkey_s_c);
 int bch2_bkey_ptrs_invalid(struct bch_fs *, struct bkey_s_c,
diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c
new file mode 100644
index 000000000000..0f955c3c76a7
--- /dev/null
+++ b/fs/bcachefs/eytzinger.c
@@ -0,0 +1,234 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "eytzinger.h"
+
+/**
+ * is_aligned - is this pointer & size okay for word-wide copying?
+ * @base: pointer to data
+ * @size: size of each element
+ * @align: required alignment (typically 4 or 8)
+ *
+ * Returns true if elements can be copied using word loads and stores.
+ * The size must be a multiple of the alignment, and the base address must
+ * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
+ *
+ * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
+ * to "if ((a | b) & mask)", so we do that by hand.
+ */
+__attribute_const__ __always_inline
+static bool is_aligned(const void *base, size_t size, unsigned char align)
+{
+	unsigned char lsbits = (unsigned char)size;
+
+	(void)base;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	lsbits |= (unsigned char)(uintptr_t)base;
+#endif
+	return (lsbits & (align - 1)) == 0;
+}
+
+/**
+ * swap_words_32 - swap two elements in 32-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 4)
+ *
+ * Exchange the two objects in memory.  This exploits base+index addressing,
+ * which basically all CPUs have, to minimize loop overhead computations.
+ *
+ * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
+ * bottom of the loop, even though the zero flag is still valid from the
+ * subtract (since the intervening mov instructions don't alter the flags).
+ * Gcc 8.1.0 doesn't have that problem.
+ */
+static void swap_words_32(void *a, void *b, size_t n)
+{
+	do {
+		u32 t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+	} while (n);
+}
+
+/**
+ * swap_words_64 - swap two elements in 64-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 8)
+ *
+ * Exchange the two objects in memory.  This exploits base+index
+ * addressing, which basically all CPUs have, to minimize loop overhead
+ * computations.
+ *
+ * We'd like to use 64-bit loads if possible.  If they're not, emulating
+ * one requires base+index+4 addressing which x86 has but most other
+ * processors do not.  If CONFIG_64BIT, we definitely have 64-bit loads,
+ * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
+ * x32 ABI).  Are there any cases the kernel needs to worry about?
+ */
+static void swap_words_64(void *a, void *b, size_t n)
+{
+	do {
+#ifdef CONFIG_64BIT
+		u64 t = *(u64 *)(a + (n -= 8));
+		*(u64 *)(a + n) = *(u64 *)(b + n);
+		*(u64 *)(b + n) = t;
+#else
+		/* Use two 32-bit transfers to avoid base+index+4 addressing */
+		u32 t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+
+		t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+#endif
+	} while (n);
+}
+
+/**
+ * swap_bytes - swap two elements a byte at a time
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size
+ *
+ * This is the fallback if alignment doesn't allow using larger chunks.
+ */
+static void swap_bytes(void *a, void *b, size_t n)
+{
+	do {
+		char t = ((char *)a)[--n];
+		((char *)a)[n] = ((char *)b)[n];
+		((char *)b)[n] = t;
+	} while (n);
+}
+
+/*
+ * The values are arbitrary as long as they can't be confused with
+ * a pointer, but small integers make for the smallest compare
+ * instructions.
+ */
+#define SWAP_WORDS_64 (swap_r_func_t)0
+#define SWAP_WORDS_32 (swap_r_func_t)1
+#define SWAP_BYTES    (swap_r_func_t)2
+#define SWAP_WRAPPER  (swap_r_func_t)3
+
+struct wrapper {
+	cmp_func_t cmp;
+	swap_func_t swap_func;
+};
+
+/*
+ * The function pointer is last to make tail calls most efficient if the
+ * compiler decides not to inline this function.
+ */
+static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv)
+{
+	if (swap_func == SWAP_WRAPPER) {
+		((const struct wrapper *)priv)->swap_func(a, b, (int)size);
+		return;
+	}
+
+	if (swap_func == SWAP_WORDS_64)
+		swap_words_64(a, b, size);
+	else if (swap_func == SWAP_WORDS_32)
+		swap_words_32(a, b, size);
+	else if (swap_func == SWAP_BYTES)
+		swap_bytes(a, b, size);
+	else
+		swap_func(a, b, (int)size, priv);
+}
+
+#define _CMP_WRAPPER ((cmp_r_func_t)0L)
+
+static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *priv)
+{
+	if (cmp == _CMP_WRAPPER)
+		return ((const struct wrapper *)priv)->cmp(a, b);
+	return cmp(a, b, priv);
+}
+
+static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size,
+			 cmp_r_func_t cmp_func, const void *priv,
+			 size_t l, size_t r)
+{
+	return do_cmp(base + inorder_to_eytzinger0(l, n) * size,
+		      base + inorder_to_eytzinger0(r, n) * size,
+		      cmp_func, priv);
+}
+
+static inline void eytzinger0_do_swap(void *base, size_t n, size_t size,
+			   swap_r_func_t swap_func, const void *priv,
+			   size_t l, size_t r)
+{
+	do_swap(base + inorder_to_eytzinger0(l, n) * size,
+		base + inorder_to_eytzinger0(r, n) * size,
+		size, swap_func, priv);
+}
+
+void eytzinger0_sort_r(void *base, size_t n, size_t size,
+		       cmp_r_func_t cmp_func,
+		       swap_r_func_t swap_func,
+		       const void *priv)
+{
+	int i, c, r;
+
+	/* called from 'sort' without swap function, let's pick the default */
+	if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func)
+		swap_func = NULL;
+
+	if (!swap_func) {
+		if (is_aligned(base, size, 8))
+			swap_func = SWAP_WORDS_64;
+		else if (is_aligned(base, size, 4))
+			swap_func = SWAP_WORDS_32;
+		else
+			swap_func = SWAP_BYTES;
+	}
+
+	/* heapify */
+	for (i = n / 2 - 1; i >= 0; --i) {
+		for (r = i; r * 2 + 1 < n; r = c) {
+			c = r * 2 + 1;
+
+			if (c + 1 < n &&
+			    eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
+				c++;
+
+			if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
+				break;
+
+			eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
+		}
+	}
+
+	/* sort */
+	for (i = n - 1; i > 0; --i) {
+		eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i);
+
+		for (r = 0; r * 2 + 1 < i; r = c) {
+			c = r * 2 + 1;
+
+			if (c + 1 < i &&
+			    eytzinger0_do_cmp(base, n, size, cmp_func, priv, c, c + 1) < 0)
+				c++;
+
+			if (eytzinger0_do_cmp(base, n, size, cmp_func, priv, r, c) >= 0)
+				break;
+
+			eytzinger0_do_swap(base, n, size, swap_func, priv, r, c);
+		}
+	}
+}
+
+void eytzinger0_sort(void *base, size_t n, size_t size,
+		     cmp_func_t cmp_func,
+		     swap_func_t swap_func)
+{
+	struct wrapper w = {
+		.cmp  = cmp_func,
+		.swap_func = swap_func,
+	};
+
+	return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
+}
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
index b04750dbf870..24840aee335c 100644
--- a/fs/bcachefs/eytzinger.h
+++ b/fs/bcachefs/eytzinger.h
@@ -5,23 +5,33 @@
 #include <linux/bitops.h>
 #include <linux/log2.h>
 
-#include "util.h"
+#ifdef EYTZINGER_DEBUG
+#define EYTZINGER_BUG_ON(cond)		BUG_ON(cond)
+#else
+#define EYTZINGER_BUG_ON(cond)
+#endif
 
 /*
  * Traversal for trees in eytzinger layout - a full binary tree layed out in an
- * array
- */
-
-/*
- * One based indexing version:
+ * array.
+ *
+ * Consider using an eytzinger tree any time you would otherwise be doing binary
+ * search over an array. Binary search is a worst case scenario for branch
+ * prediction and prefetching, but in an eytzinger tree every node's children
+ * are adjacent in memory, thus we can prefetch children before knowing the
+ * result of the comparison, assuming multiple nodes fit on a cacheline.
  *
- * With one based indexing each level of the tree starts at a power of two -
- * good for cacheline alignment:
+ * Two variants are provided, for one based indexing and zero based indexing.
+ *
+ * Zero based indexing is more convenient, but one based indexing has better
+ * alignment and thus better performance because each new level of the tree
+ * starts at a power of two, and thus if element 0 was cacheline aligned, each
+ * new level will be as well.
  */
 
 static inline unsigned eytzinger1_child(unsigned i, unsigned child)
 {
-	EBUG_ON(child > 1);
+	EYTZINGER_BUG_ON(child > 1);
 
 	return (i << 1) + child;
 }
@@ -58,7 +68,7 @@ static inline unsigned eytzinger1_last(unsigned size)
 
 static inline unsigned eytzinger1_next(unsigned i, unsigned size)
 {
-	EBUG_ON(i > size);
+	EYTZINGER_BUG_ON(i > size);
 
 	if (eytzinger1_right_child(i) <= size) {
 		i = eytzinger1_right_child(i);
@@ -74,7 +84,7 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size)
 
 static inline unsigned eytzinger1_prev(unsigned i, unsigned size)
 {
-	EBUG_ON(i > size);
+	EYTZINGER_BUG_ON(i > size);
 
 	if (eytzinger1_left_child(i) <= size) {
 		i = eytzinger1_left_child(i) + 1;
@@ -101,7 +111,7 @@ static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size,
 	unsigned shift = __fls(size) - b;
 	int s;
 
-	EBUG_ON(!i || i > size);
+	EYTZINGER_BUG_ON(!i || i > size);
 
 	i  ^= 1U << b;
 	i <<= 1;
@@ -126,7 +136,7 @@ static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size,
 	unsigned shift;
 	int s;
 
-	EBUG_ON(!i || i > size);
+	EYTZINGER_BUG_ON(!i || i > size);
 
 	/*
 	 * sign bit trick:
@@ -164,7 +174,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
 
 static inline unsigned eytzinger0_child(unsigned i, unsigned child)
 {
-	EBUG_ON(child > 1);
+	EYTZINGER_BUG_ON(child > 1);
 
 	return (i << 1) + 1 + child;
 }
@@ -231,11 +241,9 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
 	     (_i) != -1;				\
 	     (_i) = eytzinger0_next((_i), (_size)))
 
-typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size);
-
 /* return greatest node <= @search, or -1 if not found */
-static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
-					 eytzinger_cmp_fn cmp, const void *search)
+static inline int eytzinger0_find_le(void *base, size_t nr, size_t size,
+				     cmp_func_t cmp, const void *search)
 {
 	unsigned i, n = 0;
 
@@ -244,21 +252,38 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
 
 	do {
 		i = n;
-		n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0);
+		n = eytzinger0_child(i, cmp(base + i * size, search) <= 0);
 	} while (n < nr);
 
 	if (n & 1) {
-		/* @i was greater than @search, return previous node: */
-
-		if (i == eytzinger0_first(nr))
-			return -1;
-
+		/*
+		 * @i was greater than @search, return previous node:
+		 *
+		 * if @i was leftmost/smallest element,
+		 * eytzinger0_prev(eytzinger0_first())) returns -1, as expected
+		 */
 		return eytzinger0_prev(i, nr);
 	} else {
 		return i;
 	}
 }
 
+static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size,
+				     cmp_func_t cmp, const void *search)
+{
+	ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search);
+
+	/*
+	 * if eytitzinger0_find_le() returned -1 - no element was <= search - we
+	 * want to return the first element; next/prev identities mean this work
+	 * as expected
+	 *
+	 * similarly if find_le() returns last element, we should return -1;
+	 * identities mean this all works out:
+	 */
+	return eytzinger0_next(idx, nr);
+}
+
 #define eytzinger0_find(base, nr, size, _cmp, search)			\
 ({									\
 	void *_base		= (base);				\
@@ -269,13 +294,13 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
 	int _res;							\
 									\
 	while (_i < _nr &&						\
-	       (_res = _cmp(_search, _base + _i * _size, _size)))	\
+	       (_res = _cmp(_search, _base + _i * _size)))		\
 		_i = eytzinger0_child(_i, _res > 0);			\
 	_i;								\
 })
 
-void eytzinger0_sort(void *, size_t, size_t,
-		    int (*cmp_func)(const void *, const void *, size_t),
-		    void (*swap_func)(void *, void *, size_t));
+void eytzinger0_sort_r(void *, size_t, size_t,
+		       cmp_r_func_t, swap_r_func_t, const void *);
+void eytzinger0_sort(void *, size_t, size_t, cmp_func_t, swap_func_t);
 
 #endif /* _EYTZINGER_H */
diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h
index 66b945be10c2..d8153fe27037 100644
--- a/fs/bcachefs/fifo.h
+++ b/fs/bcachefs/fifo.h
@@ -24,12 +24,12 @@ struct {								\
 	(fifo)->mask	= (fifo)->size					\
 		? roundup_pow_of_two((fifo)->size) - 1			\
 		: 0;							\
-	(fifo)->data	= kvpmalloc(fifo_buf_size(fifo), (_gfp));	\
+	(fifo)->data	= kvmalloc(fifo_buf_size(fifo), (_gfp));	\
 })
 
 #define free_fifo(fifo)							\
 do {									\
-	kvpfree((fifo)->data, fifo_buf_size(fifo));			\
+	kvfree((fifo)->data);						\
 	(fifo)->data = NULL;						\
 } while (0)
 
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c
index 1c1ea0f0c692..624e6f963240 100644
--- a/fs/bcachefs/fs-common.c
+++ b/fs/bcachefs/fs-common.c
@@ -107,6 +107,7 @@ int bch2_create_trans(struct btree_trans *trans,
 		u32 new_subvol, dir_snapshot;
 
 		ret = bch2_subvolume_create(trans, new_inode->bi_inum,
+					    dir.subvol,
 					    snapshot_src.subvol,
 					    &new_subvol, &snapshot,
 					    (flags & BCH_CREATE_SNAPSHOT_RO) != 0);
@@ -242,7 +243,7 @@ int bch2_unlink_trans(struct btree_trans *trans,
 		      struct bch_inode_unpacked *dir_u,
 		      struct bch_inode_unpacked *inode_u,
 		      const struct qstr *name,
-		      bool deleting_snapshot)
+		      bool deleting_subvol)
 {
 	struct bch_fs *c = trans->c;
 	struct btree_iter dir_iter = { NULL };
@@ -260,8 +261,8 @@ int bch2_unlink_trans(struct btree_trans *trans,
 
 	dir_hash = bch2_hash_info_init(c, dir_u);
 
-	ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
-					 name, &inum, BTREE_ITER_INTENT);
+	ret = bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash,
+				       name, &inum, BTREE_ITER_INTENT);
 	if (ret)
 		goto err;
 
@@ -270,18 +271,25 @@ int bch2_unlink_trans(struct btree_trans *trans,
 	if (ret)
 		goto err;
 
-	if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) {
+	if (!deleting_subvol && S_ISDIR(inode_u->bi_mode)) {
 		ret = bch2_empty_dir_trans(trans, inum);
 		if (ret)
 			goto err;
 	}
 
-	if (deleting_snapshot && !inode_u->bi_subvol) {
+	if (deleting_subvol && !inode_u->bi_subvol) {
 		ret = -BCH_ERR_ENOENT_not_subvol;
 		goto err;
 	}
 
-	if (deleting_snapshot || inode_u->bi_subvol) {
+	if (inode_u->bi_subvol) {
+		/* Recursive subvolume destroy not allowed (yet?) */
+		ret = bch2_subvol_has_children(trans, inode_u->bi_subvol);
+		if (ret)
+			goto err;
+	}
+
+	if (deleting_subvol || inode_u->bi_subvol) {
 		ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol);
 		if (ret)
 			goto err;
@@ -349,6 +357,22 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u,
 	return ret;
 }
 
+static int subvol_update_parent(struct btree_trans *trans, u32 subvol, u32 new_parent)
+{
+	struct btree_iter iter;
+	struct bkey_i_subvolume *s =
+		bch2_bkey_get_mut_typed(trans, &iter,
+			BTREE_ID_subvolumes, POS(0, subvol),
+			BTREE_ITER_CACHED, subvolume);
+	int ret = PTR_ERR_OR_ZERO(s);
+	if (ret)
+		return ret;
+
+	s->v.fs_path_parent = cpu_to_le32(new_parent);
+	bch2_trans_iter_exit(trans, &iter);
+	return 0;
+}
+
 int bch2_rename_trans(struct btree_trans *trans,
 		      subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u,
 		      subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u,
@@ -410,6 +434,36 @@ int bch2_rename_trans(struct btree_trans *trans,
 			goto err;
 	}
 
+	if (src_inode_u->bi_subvol &&
+	    dst_dir.subvol != src_inode_u->bi_parent_subvol) {
+		ret = subvol_update_parent(trans, src_inode_u->bi_subvol, dst_dir.subvol);
+		if (ret)
+			goto err;
+	}
+
+	if (mode == BCH_RENAME_EXCHANGE &&
+	    dst_inode_u->bi_subvol &&
+	    src_dir.subvol != dst_inode_u->bi_parent_subvol) {
+		ret = subvol_update_parent(trans, dst_inode_u->bi_subvol, src_dir.subvol);
+		if (ret)
+			goto err;
+	}
+
+	/* Can't move across subvolumes, unless it's a subvolume root: */
+	if (src_dir.subvol != dst_dir.subvol &&
+	    (!src_inode_u->bi_subvol ||
+	     (dst_inum.inum && !dst_inode_u->bi_subvol))) {
+		ret = -EXDEV;
+		goto err;
+	}
+
+	if (src_inode_u->bi_parent_subvol)
+		src_inode_u->bi_parent_subvol = dst_dir.subvol;
+
+	if ((mode == BCH_RENAME_EXCHANGE) &&
+	    dst_inode_u->bi_parent_subvol)
+		dst_inode_u->bi_parent_subvol = src_dir.subvol;
+
 	src_inode_u->bi_dir		= dst_dir_u->bi_inum;
 	src_inode_u->bi_dir_offset	= dst_offset;
 
@@ -432,10 +486,10 @@ int bch2_rename_trans(struct btree_trans *trans,
 			goto err;
 		}
 
-		if (S_ISDIR(dst_inode_u->bi_mode) &&
-		    bch2_empty_dir_trans(trans, dst_inum)) {
-			ret = -ENOTEMPTY;
-			goto err;
+		if (S_ISDIR(dst_inode_u->bi_mode)) {
+			ret = bch2_empty_dir_trans(trans, dst_inum);
+			if (ret)
+				goto err;
 		}
 	}
 
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index 27710cdd5710..39292e7ef342 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -810,7 +810,8 @@ static noinline void folios_trunc(folios *fs, struct folio **fi)
 static int __bch2_buffered_write(struct bch_inode_info *inode,
 				 struct address_space *mapping,
 				 struct iov_iter *iter,
-				 loff_t pos, unsigned len)
+				 loff_t pos, unsigned len,
+				 bool inode_locked)
 {
 	struct bch_fs *c = inode->v.i_sb->s_fs_info;
 	struct bch2_folio_reservation res;
@@ -835,6 +836,15 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
 
 	BUG_ON(!fs.nr);
 
+	/*
+	 * If we're not using the inode lock, we need to lock all the folios for
+	 * atomiticity of writes vs. other writes:
+	 */
+	if (!inode_locked && folio_end_pos(darray_last(fs)) < end) {
+		ret = -BCH_ERR_need_inode_lock;
+		goto out;
+	}
+
 	f = darray_first(fs);
 	if (pos != folio_pos(f) && !folio_test_uptodate(f)) {
 		ret = bch2_read_single_folio(f, mapping);
@@ -929,8 +939,10 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
 	end = pos + copied;
 
 	spin_lock(&inode->v.i_lock);
-	if (end > inode->v.i_size)
+	if (end > inode->v.i_size) {
+		BUG_ON(!inode_locked);
 		i_size_write(&inode->v, end);
+	}
 	spin_unlock(&inode->v.i_lock);
 
 	f_pos = pos;
@@ -974,12 +986,68 @@ static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter)
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	struct bch_inode_info *inode = file_bch_inode(file);
-	loff_t pos = iocb->ki_pos;
-	ssize_t written = 0;
-	int ret = 0;
+	loff_t pos;
+	bool inode_locked = false;
+	ssize_t written = 0, written2 = 0, ret = 0;
+
+	/*
+	 * We don't take the inode lock unless i_size will be changing. Folio
+	 * locks provide exclusion with other writes, and the pagecache add lock
+	 * provides exclusion with truncate and hole punching.
+	 *
+	 * There is one nasty corner case where atomicity would be broken
+	 * without great care: when copying data from userspace to the page
+	 * cache, we do that with faults disable - a page fault would recurse
+	 * back into the filesystem, taking filesystem locks again, and
+	 * deadlock; so it's done with faults disabled, and we fault in the user
+	 * buffer when we aren't holding locks.
+	 *
+	 * If we do part of the write, but we then race and in the userspace
+	 * buffer have been evicted and are no longer resident, then we have to
+	 * drop our folio locks to re-fault them in, breaking write atomicity.
+	 *
+	 * To fix this, we restart the write from the start, if we weren't
+	 * holding the inode lock.
+	 *
+	 * There is another wrinkle after that; if we restart the write from the
+	 * start, and then get an unrecoverable error, we _cannot_ claim to
+	 * userspace that we did not write data we actually did - so we must
+	 * track (written2) the most we ever wrote.
+	 */
+
+	if ((iocb->ki_flags & IOCB_APPEND) ||
+	    (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v))) {
+		inode_lock(&inode->v);
+		inode_locked = true;
+	}
+
+	ret = generic_write_checks(iocb, iter);
+	if (ret <= 0)
+		goto unlock;
+
+	ret = file_remove_privs_flags(file, !inode_locked ? IOCB_NOWAIT : 0);
+	if (ret) {
+		if (!inode_locked) {
+			inode_lock(&inode->v);
+			inode_locked = true;
+			ret = file_remove_privs_flags(file, 0);
+		}
+		if (ret)
+			goto unlock;
+	}
+
+	ret = file_update_time(file);
+	if (ret)
+		goto unlock;
+
+	pos = iocb->ki_pos;
 
 	bch2_pagecache_add_get(inode);
 
+	if (!inode_locked &&
+	    (iocb->ki_pos + iov_iter_count(iter) > i_size_read(&inode->v)))
+		goto get_inode_lock;
+
 	do {
 		unsigned offset = pos & (PAGE_SIZE - 1);
 		unsigned bytes = iov_iter_count(iter);
@@ -1004,12 +1072,17 @@ again:
 			}
 		}
 
+		if (unlikely(bytes != iov_iter_count(iter) && !inode_locked))
+			goto get_inode_lock;
+
 		if (unlikely(fatal_signal_pending(current))) {
 			ret = -EINTR;
 			break;
 		}
 
-		ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes);
+		ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes, inode_locked);
+		if (ret == -BCH_ERR_need_inode_lock)
+			goto get_inode_lock;
 		if (unlikely(ret < 0))
 			break;
 
@@ -1030,50 +1103,46 @@ again:
 		}
 		pos += ret;
 		written += ret;
+		written2 = max(written, written2);
+
+		if (ret != bytes && !inode_locked)
+			goto get_inode_lock;
 		ret = 0;
 
 		balance_dirty_pages_ratelimited(mapping);
-	} while (iov_iter_count(iter));
 
+		if (0) {
+get_inode_lock:
+			bch2_pagecache_add_put(inode);
+			inode_lock(&inode->v);
+			inode_locked = true;
+			bch2_pagecache_add_get(inode);
+
+			iov_iter_revert(iter, written);
+			pos -= written;
+			written = 0;
+			ret = 0;
+		}
+	} while (iov_iter_count(iter));
 	bch2_pagecache_add_put(inode);
+unlock:
+	if (inode_locked)
+		inode_unlock(&inode->v);
+
+	iocb->ki_pos += written;
 
-	return written ? written : ret;
+	ret = max(written, written2) ?: ret;
+	if (ret > 0)
+		ret = generic_write_sync(iocb, ret);
+	return ret;
 }
 
-ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from)
+ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *iter)
 {
-	struct file *file = iocb->ki_filp;
-	struct bch_inode_info *inode = file_bch_inode(file);
-	ssize_t ret;
-
-	if (iocb->ki_flags & IOCB_DIRECT) {
-		ret = bch2_direct_write(iocb, from);
-		goto out;
-	}
-
-	inode_lock(&inode->v);
-
-	ret = generic_write_checks(iocb, from);
-	if (ret <= 0)
-		goto unlock;
-
-	ret = file_remove_privs(file);
-	if (ret)
-		goto unlock;
-
-	ret = file_update_time(file);
-	if (ret)
-		goto unlock;
-
-	ret = bch2_buffered_write(iocb, from);
-	if (likely(ret > 0))
-		iocb->ki_pos += ret;
-unlock:
-	inode_unlock(&inode->v);
+	ssize_t ret = iocb->ki_flags & IOCB_DIRECT
+		? bch2_direct_write(iocb, iter)
+		: bch2_buffered_write(iocb, iter);
 
-	if (ret > 0)
-		ret = generic_write_sync(iocb, ret);
-out:
 	return bch2_err_class(ret);
 }
 
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index 33cb6da3a5ad..b889370a5088 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -387,6 +387,8 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio)
 	ret = dio->op.error ?: ((long) dio->written << 9);
 	bio_put(&dio->op.wbio.bio);
 
+	bch2_write_ref_put(dio->op.c, BCH_WRITE_REF_dio_write);
+
 	/* inode->i_dio_count is our ref on inode and thus bch_fs */
 	inode_dio_end(&inode->v);
 
@@ -536,7 +538,7 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio)
 		if (likely(!dio->iter.count) || dio->op.error)
 			break;
 
-		bio_reset(bio, NULL, REQ_OP_WRITE);
+		bio_reset(bio, NULL, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
 	}
 out:
 	return bch2_dio_write_done(dio);
@@ -590,22 +592,25 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
 	prefetch(&inode->ei_inode);
 	prefetch((void *) &inode->ei_inode + 64);
 
+	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_dio_write))
+		return -EROFS;
+
 	inode_lock(&inode->v);
 
 	ret = generic_write_checks(req, iter);
 	if (unlikely(ret <= 0))
-		goto err;
+		goto err_put_write_ref;
 
 	ret = file_remove_privs(file);
 	if (unlikely(ret))
-		goto err;
+		goto err_put_write_ref;
 
 	ret = file_update_time(file);
 	if (unlikely(ret))
-		goto err;
+		goto err_put_write_ref;
 
 	if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1)))
-		goto err;
+		goto err_put_write_ref;
 
 	inode_dio_begin(&inode->v);
 	bch2_pagecache_block_get(inode);
@@ -618,7 +623,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
 
 	bio = bio_alloc_bioset(NULL,
 			       bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
-			       REQ_OP_WRITE,
+			       REQ_OP_WRITE | REQ_SYNC | REQ_IDLE,
 			       GFP_KERNEL,
 			       &c->dio_write_bioset);
 	dio = container_of(bio, struct dio_write, op.wbio.bio);
@@ -645,7 +650,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
 	}
 
 	ret = bch2_dio_write_loop(dio);
-err:
+out:
 	if (locked)
 		inode_unlock(&inode->v);
 	return ret;
@@ -653,7 +658,9 @@ err_put_bio:
 	bch2_pagecache_block_put(inode);
 	bio_put(bio);
 	inode_dio_end(&inode->v);
-	goto err;
+err_put_write_ref:
+	bch2_write_ref_put(c, BCH_WRITE_REF_dio_write);
+	goto out;
 }
 
 void bch2_fs_fs_io_direct_exit(struct bch_fs *c)
diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h
index 8cbaba6565b4..828c3d7c8f19 100644
--- a/fs/bcachefs/fs-io-pagecache.h
+++ b/fs/bcachefs/fs-io-pagecache.h
@@ -51,13 +51,10 @@ enum bch_folio_sector_state {
 
 struct bch_folio_sector {
 	/* Uncompressed, fully allocated replicas (or on disk reservation): */
-	unsigned		nr_replicas:4;
-
+	u8			nr_replicas:4,
 	/* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */
-	unsigned		replicas_reserved:4;
-
-	/* i_sectors: */
-	enum bch_folio_sector_state state:8;
+				replicas_reserved:4;
+	u8			state;
 };
 
 struct bch_folio {
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index 8c70123b6a0c..20b40477425f 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -174,18 +174,18 @@ void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode,
 static int bch2_flush_inode(struct bch_fs *c,
 			    struct bch_inode_info *inode)
 {
-	struct bch_inode_unpacked u;
-	int ret;
-
 	if (c->opts.journal_flush_disabled)
 		return 0;
 
-	ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u);
-	if (ret)
-		return ret;
+	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync))
+		return -EROFS;
 
-	return bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?:
-		bch2_inode_flush_nocow_writes(c, inode);
+	struct bch_inode_unpacked u;
+	int ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u) ?:
+		  bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?:
+		  bch2_inode_flush_nocow_writes(c, inode);
+	bch2_write_ref_put(c, BCH_WRITE_REF_fsync);
+	return ret;
 }
 
 int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 77ae65542db9..b5ea9fa1259d 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -108,7 +108,8 @@ retry:
 		goto retry;
 
 	bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c,
-			     "inode %u:%llu not found when updating",
+			     "%s: inode %u:%llu not found when updating",
+			     bch2_err_str(ret),
 			     inode_inum(inode).subvol,
 			     inode_inum(inode).inum);
 
@@ -176,45 +177,88 @@ static unsigned bch2_inode_hash(subvol_inum inum)
 	return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL);
 }
 
-struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
+static struct bch_inode_info *bch2_inode_insert(struct bch_fs *c, struct bch_inode_info *inode)
 {
-	struct bch_inode_unpacked inode_u;
-	struct bch_inode_info *inode;
-	struct btree_trans *trans;
-	struct bch_subvolume subvol;
-	int ret;
+	subvol_inum inum = inode_inum(inode);
+	struct bch_inode_info *old = to_bch_ei(inode_insert5(&inode->v,
+				      bch2_inode_hash(inum),
+				      bch2_iget5_test,
+				      bch2_iget5_set,
+				      &inum));
+	BUG_ON(!old);
 
-	inode = to_bch_ei(iget5_locked(c->vfs_sb,
-				       bch2_inode_hash(inum),
-				       bch2_iget5_test,
-				       bch2_iget5_set,
-				       &inum));
-	if (unlikely(!inode))
-		return ERR_PTR(-ENOMEM);
-	if (!(inode->v.i_state & I_NEW))
-		return &inode->v;
+	if (unlikely(old != inode)) {
+		discard_new_inode(&inode->v);
+		inode = old;
+	} else {
+		mutex_lock(&c->vfs_inodes_lock);
+		list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
+		mutex_unlock(&c->vfs_inodes_lock);
+		/*
+		 * we really don't want insert_inode_locked2() to be setting
+		 * I_NEW...
+		 */
+		unlock_new_inode(&inode->v);
+	}
 
-	trans = bch2_trans_get(c);
-	ret = lockrestart_do(trans,
-		bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
-		bch2_inode_find_by_inum_trans(trans, inum, &inode_u));
+	return inode;
+}
 
-	if (!ret)
-		bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
-	bch2_trans_put(trans);
+#define memalloc_flags_do(_flags, _do)						\
+({										\
+	unsigned _saved_flags = memalloc_flags_save(_flags);			\
+	typeof(_do) _ret = _do;							\
+	memalloc_noreclaim_restore(_saved_flags);				\
+	_ret;									\
+})
 
-	if (ret) {
-		iget_failed(&inode->v);
-		return ERR_PTR(bch2_err_class(ret));
+/*
+ * Allocate a new inode, dropping/retaking btree locks if necessary:
+ */
+static struct bch_inode_info *bch2_new_inode(struct btree_trans *trans)
+{
+	struct bch_fs *c = trans->c;
+
+	struct bch_inode_info *inode =
+		memalloc_flags_do(PF_MEMALLOC_NORECLAIM|PF_MEMALLOC_NOWARN,
+				  to_bch_ei(new_inode(c->vfs_sb)));
+
+	if (unlikely(!inode)) {
+		int ret = drop_locks_do(trans, (inode = to_bch_ei(new_inode(c->vfs_sb))) ? 0 : -ENOMEM);
+		if (ret && inode)
+			discard_new_inode(&inode->v);
+		if (ret)
+			return ERR_PTR(ret);
 	}
 
-	mutex_lock(&c->vfs_inodes_lock);
-	list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
-	mutex_unlock(&c->vfs_inodes_lock);
+	return inode;
+}
 
-	unlock_new_inode(&inode->v);
+struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum)
+{
+	struct bch_inode_info *inode =
+		to_bch_ei(ilookup5_nowait(c->vfs_sb,
+					  bch2_inode_hash(inum),
+					  bch2_iget5_test,
+					  &inum));
+	if (inode)
+		return &inode->v;
 
-	return &inode->v;
+	struct btree_trans *trans = bch2_trans_get(c);
+
+	struct bch_inode_unpacked inode_u;
+	struct bch_subvolume subvol;
+	int ret = lockrestart_do(trans,
+		bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
+		bch2_inode_find_by_inum_trans(trans, inum, &inode_u)) ?:
+		PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
+	if (!ret) {
+		bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
+		inode = bch2_inode_insert(c, inode);
+	}
+	bch2_trans_put(trans);
+
+	return ret ? ERR_PTR(ret) : &inode->v;
 }
 
 struct bch_inode_info *
@@ -226,7 +270,7 @@ __bch2_create(struct mnt_idmap *idmap,
 	struct bch_fs *c = dir->v.i_sb->s_fs_info;
 	struct btree_trans *trans;
 	struct bch_inode_unpacked dir_u;
-	struct bch_inode_info *inode, *old;
+	struct bch_inode_info *inode;
 	struct bch_inode_unpacked inode_u;
 	struct posix_acl *default_acl = NULL, *acl = NULL;
 	subvol_inum inum;
@@ -293,7 +337,6 @@ err_before_quota:
 		mutex_unlock(&dir->ei_update_lock);
 	}
 
-	bch2_iget5_set(&inode->v, &inum);
 	bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
 
 	set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl);
@@ -304,36 +347,7 @@ err_before_quota:
 	 * bch2_trans_exit() and dropping locks, else we could race with another
 	 * thread pulling the inode in and modifying it:
 	 */
-
-	inode->v.i_state |= I_CREATING;
-
-	old = to_bch_ei(inode_insert5(&inode->v,
-				      bch2_inode_hash(inum),
-				      bch2_iget5_test,
-				      bch2_iget5_set,
-				      &inum));
-	BUG_ON(!old);
-
-	if (unlikely(old != inode)) {
-		/*
-		 * We raced, another process pulled the new inode into cache
-		 * before us:
-		 */
-		make_bad_inode(&inode->v);
-		iput(&inode->v);
-
-		inode = old;
-	} else {
-		mutex_lock(&c->vfs_inodes_lock);
-		list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list);
-		mutex_unlock(&c->vfs_inodes_lock);
-		/*
-		 * we really don't want insert_inode_locked2() to be setting
-		 * I_NEW...
-		 */
-		unlock_new_inode(&inode->v);
-	}
-
+	inode = bch2_inode_insert(c, inode);
 	bch2_trans_put(trans);
 err:
 	posix_acl_release(default_acl);
@@ -352,23 +366,78 @@ err_trans:
 
 /* methods */
 
+static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans,
+			subvol_inum dir, struct bch_hash_info *dir_hash_info,
+			const struct qstr *name)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter dirent_iter = {};
+	subvol_inum inum = {};
+
+	int ret = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc,
+				   dir_hash_info, dir, name, 0);
+	if (ret)
+		return ERR_PTR(ret);
+
+	struct bkey_s_c k = bch2_btree_iter_peek_slot(&dirent_iter);
+	ret = bkey_err(k);
+	if (ret)
+		goto err;
+
+	ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum);
+	if (ret > 0)
+		ret = -ENOENT;
+	if (ret)
+		goto err;
+
+	struct bch_inode_info *inode =
+		to_bch_ei(ilookup5_nowait(c->vfs_sb,
+					  bch2_inode_hash(inum),
+					  bch2_iget5_test,
+					  &inum));
+	if (inode)
+		goto out;
+
+	struct bch_subvolume subvol;
+	struct bch_inode_unpacked inode_u;
+	ret =   bch2_subvolume_get(trans, inum.subvol, true, 0, &subvol) ?:
+		bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?:
+		PTR_ERR_OR_ZERO(inode = bch2_new_inode(trans));
+	if (bch2_err_matches(ret, ENOENT)) {
+		struct printbuf buf = PRINTBUF;
+
+		bch2_bkey_val_to_text(&buf, c, k);
+		bch_err(c, "%s points to missing inode", buf.buf);
+		printbuf_exit(&buf);
+	}
+	if (ret)
+		goto err;
+
+	bch2_vfs_inode_init(trans, inum, inode, &inode_u, &subvol);
+	inode = bch2_inode_insert(c, inode);
+out:
+	bch2_trans_iter_exit(trans, &dirent_iter);
+	return inode;
+err:
+	inode = ERR_PTR(ret);
+	goto out;
+}
+
 static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry,
 				  unsigned int flags)
 {
 	struct bch_fs *c = vdir->i_sb->s_fs_info;
 	struct bch_inode_info *dir = to_bch_ei(vdir);
 	struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode);
-	struct inode *vinode = NULL;
-	subvol_inum inum = { .subvol = 1 };
-	int ret;
-
-	ret = bch2_dirent_lookup(c, inode_inum(dir), &hash,
-				 &dentry->d_name, &inum);
 
-	if (!ret)
-		vinode = bch2_vfs_inode_get(c, inum);
+	struct bch_inode_info *inode;
+	bch2_trans_do(c, NULL, NULL, 0,
+		PTR_ERR_OR_ZERO(inode = bch2_lookup_trans(trans, inode_inum(dir),
+							  &hash, &dentry->d_name)));
+	if (IS_ERR(inode))
+		inode = NULL;
 
-	return d_splice_alias(vinode, dentry);
+	return d_splice_alias(&inode->v, dentry);
 }
 
 static int bch2_mknod(struct mnt_idmap *idmap,
@@ -1372,6 +1441,7 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum,
 				struct bch_inode_unpacked *bi,
 				struct bch_subvolume *subvol)
 {
+	bch2_iget5_set(&inode->v, &inum);
 	bch2_inode_update_after_write(trans, inode, bi, ~0);
 
 	if (BCH_SUBVOLUME_SNAP(subvol))
@@ -1572,7 +1642,6 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
 	 * number:
 	 */
 	u64 avail_inodes = ((usage.capacity - usage.used) << 3);
-	u64 fsid;
 
 	buf->f_type	= BCACHEFS_STATFS_MAGIC;
 	buf->f_bsize	= sb->s_blocksize;
@@ -1583,10 +1652,7 @@ static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_files	= usage.nr_inodes + avail_inodes;
 	buf->f_ffree	= avail_inodes;
 
-	fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^
-	       le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64));
-	buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
-	buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+	buf->f_fsid	= uuid_to_fsid(c->sb.user_uuid.b);
 	buf->f_namelen	= BCH_NAME_MAX;
 
 	return 0;
@@ -1805,8 +1871,10 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
 	opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
 
 	ret = bch2_parse_mount_opts(NULL, &opts, data);
-	if (ret)
+	if (ret) {
+		ret = bch2_err_class(ret);
 		return ERR_PTR(ret);
+	}
 
 	if (!dev_name || strlen(dev_name) == 0)
 		return ERR_PTR(-EINVAL);
@@ -1882,6 +1950,7 @@ got_sb:
 	sb->s_time_gran		= c->sb.nsec_per_time_unit;
 	sb->s_time_min		= div_s64(S64_MIN, c->sb.time_units_per_sec) + 1;
 	sb->s_time_max		= div_s64(S64_MAX, c->sb.time_units_per_sec);
+	sb->s_uuid		= c->sb.user_uuid;
 	c->vfs_sb		= sb;
 	strscpy(sb->s_id, c->name, sizeof(sb->s_id));
 
@@ -1928,6 +1997,7 @@ out:
 	return dget(sb->s_root);
 
 err_put_super:
+	__bch2_fs_stop(c);
 	deactivate_locked_super(sb);
 	return ERR_PTR(bch2_err_class(ret));
 }
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index 6a760777bafb..8e2010212cc3 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -12,7 +12,7 @@
 #include "fsck.h"
 #include "inode.h"
 #include "keylist.h"
-#include "recovery.h"
+#include "recovery_passes.h"
 #include "snapshot.h"
 #include "super.h"
 #include "xattr.h"
@@ -63,9 +63,7 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol,
 			 u32 *snapshot, u64 *inum)
 {
 	struct bch_subvolume s;
-	int ret;
-
-	ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
+	int ret = bch2_subvolume_get(trans, subvol, false, 0, &s);
 
 	*snapshot = le32_to_cpu(s.snapshot);
 	*inum = le64_to_cpu(s.inode);
@@ -100,8 +98,8 @@ err:
 }
 
 static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
-			  struct bch_inode_unpacked *inode,
-			  u32 *snapshot)
+			struct bch_inode_unpacked *inode,
+			u32 *snapshot)
 {
 	struct btree_iter iter;
 	struct bkey_s_c k;
@@ -142,34 +140,6 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans,
 	return 0;
 }
 
-static int __write_inode(struct btree_trans *trans,
-			 struct bch_inode_unpacked *inode,
-			 u32 snapshot)
-{
-	struct bkey_inode_buf *inode_p =
-		bch2_trans_kmalloc(trans, sizeof(*inode_p));
-
-	if (IS_ERR(inode_p))
-		return PTR_ERR(inode_p);
-
-	bch2_inode_pack(inode_p, inode);
-	inode_p->inode.k.p.snapshot = snapshot;
-
-	return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
-				&inode_p->inode.k_i,
-				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
-}
-
-static int fsck_write_inode(struct btree_trans *trans,
-			    struct bch_inode_unpacked *inode,
-			    u32 snapshot)
-{
-	int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
-			    __write_inode(trans, inode, snapshot));
-	bch_err_fn(trans->c, ret);
-	return ret;
-}
-
 static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
 {
 	struct bch_fs *c = trans->c;
@@ -186,9 +156,10 @@ static int __remove_dirent(struct btree_trans *trans, struct bpos pos)
 
 	bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT);
 
-	ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
-				  &dir_hash_info, &iter,
-				  BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+	ret =   bch2_btree_iter_traverse(&iter) ?:
+		bch2_hash_delete_at(trans, bch2_dirent_hash_desc,
+				    &dir_hash_info, &iter,
+				    BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
 	bch2_trans_iter_exit(trans, &iter);
 err:
 	bch_err_fn(c, ret);
@@ -197,7 +168,8 @@ err:
 
 /* Get lost+found, create if it doesn't exist: */
 static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
-			    struct bch_inode_unpacked *lostfound)
+			    struct bch_inode_unpacked *lostfound,
+			    u64 reattaching_inum)
 {
 	struct bch_fs *c = trans->c;
 	struct qstr lostfound_str = QSTR("lost+found");
@@ -212,19 +184,36 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
 		return ret;
 
 	subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) };
-	u32 subvol_snapshot;
 
-	ret = subvol_lookup(trans, le32_to_cpu(st.master_subvol),
-			    &subvol_snapshot, &root_inum.inum);
-	bch_err_msg(c, ret, "looking up root subvol");
+	struct bch_subvolume subvol;
+	ret = bch2_subvolume_get(trans, le32_to_cpu(st.master_subvol),
+				 false, 0, &subvol);
+	bch_err_msg(c, ret, "looking up root subvol %u for snapshot %u",
+		    le32_to_cpu(st.master_subvol), snapshot);
 	if (ret)
 		return ret;
 
+	if (!subvol.inode) {
+		struct btree_iter iter;
+		struct bkey_i_subvolume *subvol = bch2_bkey_get_mut_typed(trans, &iter,
+				BTREE_ID_subvolumes, POS(0, le32_to_cpu(st.master_subvol)),
+				0, subvolume);
+		ret = PTR_ERR_OR_ZERO(subvol);
+		if (ret)
+			return ret;
+
+		subvol->v.inode = cpu_to_le64(reattaching_inum);
+		bch2_trans_iter_exit(trans, &iter);
+	}
+
+	root_inum.inum = le64_to_cpu(subvol.inode);
+
 	struct bch_inode_unpacked root_inode;
 	struct bch_hash_info root_hash_info;
 	u32 root_inode_snapshot = snapshot;
 	ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
-	bch_err_msg(c, ret, "looking up root inode");
+	bch_err_msg(c, ret, "looking up root inode %llu for subvol %u",
+		    root_inum.inum, le32_to_cpu(st.master_subvol));
 	if (ret)
 		return ret;
 
@@ -280,7 +269,7 @@ create_lostfound:
 		goto err;
 
 	ret =   bch2_dirent_create_snapshot(trans,
-				root_inode.bi_inum, snapshot, &root_hash_info,
+				0, root_inode.bi_inum, snapshot, &root_hash_info,
 				mode_to_type(lostfound->bi_mode),
 				&lostfound_str,
 				lostfound->bi_inum,
@@ -303,30 +292,47 @@ static int reattach_inode(struct btree_trans *trans,
 	char name_buf[20];
 	struct qstr name;
 	u64 dir_offset = 0;
+	u32 dirent_snapshot = inode_snapshot;
 	int ret;
 
-	ret = lookup_lostfound(trans, inode_snapshot, &lostfound);
+	if (inode->bi_subvol) {
+		inode->bi_parent_subvol = BCACHEFS_ROOT_SUBVOL;
+
+		u64 root_inum;
+		ret = subvol_lookup(trans, inode->bi_parent_subvol,
+				    &dirent_snapshot, &root_inum);
+		if (ret)
+			return ret;
+
+		snprintf(name_buf, sizeof(name_buf), "subvol-%u", inode->bi_subvol);
+	} else {
+		snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
+	}
+
+	ret = lookup_lostfound(trans, dirent_snapshot, &lostfound, inode->bi_inum);
 	if (ret)
 		return ret;
 
 	if (S_ISDIR(inode->bi_mode)) {
 		lostfound.bi_nlink++;
 
-		ret = __write_inode(trans, &lostfound, U32_MAX);
+		ret = __bch2_fsck_write_inode(trans, &lostfound, U32_MAX);
 		if (ret)
 			return ret;
 	}
 
 	dir_hash = bch2_hash_info_init(trans->c, &lostfound);
 
-	snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
 	name = (struct qstr) QSTR(name_buf);
 
 	ret = bch2_dirent_create_snapshot(trans,
-				lostfound.bi_inum, inode_snapshot,
+				inode->bi_parent_subvol, lostfound.bi_inum,
+				dirent_snapshot,
 				&dir_hash,
 				inode_d_type(inode),
-				&name, inode->bi_inum, &dir_offset,
+				&name,
+				inode->bi_subvol ?: inode->bi_inum,
+				&dir_offset,
 				BCH_HASH_SET_MUST_CREATE);
 	if (ret)
 		return ret;
@@ -334,7 +340,7 @@ static int reattach_inode(struct btree_trans *trans,
 	inode->bi_dir		= lostfound.bi_inum;
 	inode->bi_dir_offset	= dir_offset;
 
-	return __write_inode(trans, inode, inode_snapshot);
+	return __bch2_fsck_write_inode(trans, inode, inode_snapshot);
 }
 
 static int remove_backpointer(struct btree_trans *trans,
@@ -353,6 +359,133 @@ static int remove_backpointer(struct btree_trans *trans,
 	return ret;
 }
 
+static int reattach_subvol(struct btree_trans *trans, struct bkey_s_c_subvolume s)
+{
+	struct bch_fs *c = trans->c;
+
+	struct bch_inode_unpacked inode;
+	int ret = bch2_inode_find_by_inum_trans(trans,
+				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
+				&inode);
+	if (ret)
+		return ret;
+
+	ret = remove_backpointer(trans, &inode);
+	bch_err_msg(c, ret, "removing dirent");
+	if (ret)
+		return ret;
+
+	ret = reattach_inode(trans, &inode, le32_to_cpu(s.v->snapshot));
+	bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
+	return ret;
+}
+
+static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 subvolid, u64 inum)
+{
+	struct bch_fs *c = trans->c;
+
+	if (!bch2_snapshot_is_leaf(c, snapshotid)) {
+		bch_err(c, "need to reconstruct subvol, but have interior node snapshot");
+		return -BCH_ERR_fsck_repair_unimplemented;
+	}
+
+	/*
+	 * If inum isn't set, that means we're being called from check_dirents,
+	 * not check_inodes - the root of this subvolume doesn't exist or we
+	 * would have found it there:
+	 */
+	if (!inum) {
+		struct btree_iter inode_iter = {};
+		struct bch_inode_unpacked new_inode;
+		u64 cpu = raw_smp_processor_id();
+
+		bch2_inode_init_early(c, &new_inode);
+		bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL);
+
+		new_inode.bi_subvol = subvolid;
+
+		int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?:
+			  bch2_btree_iter_traverse(&inode_iter) ?:
+			  bch2_inode_write(trans, &inode_iter, &new_inode);
+		bch2_trans_iter_exit(trans, &inode_iter);
+		if (ret)
+			return ret;
+
+		inum = new_inode.bi_inum;
+	}
+
+	bch_info(c, "reconstructing subvol %u with root inode %llu", subvolid, inum);
+
+	struct bkey_i_subvolume *new_subvol = bch2_trans_kmalloc(trans, sizeof(*new_subvol));
+	int ret = PTR_ERR_OR_ZERO(new_subvol);
+	if (ret)
+		return ret;
+
+	bkey_subvolume_init(&new_subvol->k_i);
+	new_subvol->k.p.offset	= subvolid;
+	new_subvol->v.snapshot	= cpu_to_le32(snapshotid);
+	new_subvol->v.inode	= cpu_to_le64(inum);
+	ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &new_subvol->k_i, 0);
+	if (ret)
+		return ret;
+
+	struct btree_iter iter;
+	struct bkey_i_snapshot *s = bch2_bkey_get_mut_typed(trans, &iter,
+			BTREE_ID_snapshots, POS(0, snapshotid),
+			0, snapshot);
+	ret = PTR_ERR_OR_ZERO(s);
+	bch_err_msg(c, ret, "getting snapshot %u", snapshotid);
+	if (ret)
+		return ret;
+
+	u32 snapshot_tree = le32_to_cpu(s->v.tree);
+
+	s->v.subvol = cpu_to_le32(subvolid);
+	SET_BCH_SNAPSHOT_SUBVOL(&s->v, true);
+	bch2_trans_iter_exit(trans, &iter);
+
+	struct bkey_i_snapshot_tree *st = bch2_bkey_get_mut_typed(trans, &iter,
+			BTREE_ID_snapshot_trees, POS(0, snapshot_tree),
+			0, snapshot_tree);
+	ret = PTR_ERR_OR_ZERO(st);
+	bch_err_msg(c, ret, "getting snapshot tree %u", snapshot_tree);
+	if (ret)
+		return ret;
+
+	if (!st->v.master_subvol)
+		st->v.master_subvol = cpu_to_le32(subvolid);
+
+	bch2_trans_iter_exit(trans, &iter);
+	return 0;
+}
+
+static int reconstruct_inode(struct btree_trans *trans, u32 snapshot, u64 inum, u64 size, unsigned mode)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_inode_unpacked new_inode;
+
+	bch2_inode_init_early(c, &new_inode);
+	bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, mode|0755, 0, NULL);
+	new_inode.bi_size = size;
+	new_inode.bi_inum = inum;
+
+	return __bch2_fsck_write_inode(trans, &new_inode, snapshot);
+}
+
+static int reconstruct_reg_inode(struct btree_trans *trans, u32 snapshot, u64 inum)
+{
+	struct btree_iter iter = {};
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0);
+	struct bkey_s_c k = bch2_btree_iter_peek_prev(&iter);
+	bch2_trans_iter_exit(trans, &iter);
+	int ret = bkey_err(k);
+	if (ret)
+		return ret;
+
+	return reconstruct_inode(trans, snapshot, inum, k.k->p.offset << 9, S_IFREG);
+}
+
 struct snapshots_seen_entry {
 	u32				id;
 	u32				equiv;
@@ -592,13 +725,12 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
 }
 
 static struct inode_walker_entry *
-lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w,
-			  u32 snapshot, bool is_whiteout)
+lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k)
 {
-	struct inode_walker_entry *i;
-
-	snapshot = bch2_snapshot_equiv(c, snapshot);
+	bool is_whiteout = k.k->type == KEY_TYPE_whiteout;
+	u32 snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot);
 
+	struct inode_walker_entry *i;
 	__darray_for_each(w->inodes, i)
 		if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot))
 			goto found;
@@ -609,20 +741,24 @@ found:
 
 	if (snapshot != i->snapshot && !is_whiteout) {
 		struct inode_walker_entry new = *i;
-		size_t pos;
-		int ret;
 
 		new.snapshot = snapshot;
 		new.count = 0;
 
-		bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u",
-			 w->last_pos.inode, snapshot, i->snapshot);
+		struct printbuf buf = PRINTBUF;
+		bch2_bkey_val_to_text(&buf, c, k);
+
+		bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n"
+			 "unexpected because we should always update the inode when we update a key in that inode\n"
+			 "%s",
+			 w->last_pos.inode, snapshot, i->snapshot, buf.buf);
+		printbuf_exit(&buf);
 
 		while (i > w->inodes.data && i[-1].snapshot > snapshot)
 			--i;
 
-		pos = i - w->inodes.data;
-		ret = darray_insert_item(&w->inodes, pos, new);
+		size_t pos = i - w->inodes.data;
+		int ret = darray_insert_item(&w->inodes, pos, new);
 		if (ret)
 			return ERR_PTR(ret);
 
@@ -633,21 +769,21 @@ found:
 }
 
 static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
-					     struct inode_walker *w, struct bpos pos,
-					     bool is_whiteout)
+					     struct inode_walker *w,
+					     struct bkey_s_c k)
 {
-	if (w->last_pos.inode != pos.inode) {
-		int ret = get_inodes_all_snapshots(trans, w, pos.inode);
+	if (w->last_pos.inode != k.k->p.inode) {
+		int ret = get_inodes_all_snapshots(trans, w, k.k->p.inode);
 		if (ret)
 			return ERR_PTR(ret);
-	} else if (bkey_cmp(w->last_pos, pos)) {
+	} else if (bkey_cmp(w->last_pos, k.k->p)) {
 		darray_for_each(w->inodes, i)
 			i->seen_this_pos = false;
 	}
 
-	w->last_pos = pos;
+	w->last_pos = k.k->p;
 
-	return lookup_inode_for_snapshot(trans->c, w, pos.snapshot, is_whiteout);
+	return lookup_inode_for_snapshot(trans->c, w, k);
 }
 
 static int __get_visible_inodes(struct btree_trans *trans,
@@ -722,7 +858,7 @@ static int hash_redo_key(struct btree_trans *trans,
 	delete->k.p = k_iter->pos;
 	return  bch2_btree_iter_traverse(k_iter) ?:
 		bch2_trans_update(trans, k_iter, delete, 0) ?:
-		bch2_hash_set_snapshot(trans, desc, hash_info,
+		bch2_hash_set_in_snapshot(trans, desc, hash_info,
 				       (subvol_inum) { 0, k.k->p.inode },
 				       k.k->p.snapshot, tmp,
 				       BCH_HASH_SET_MUST_CREATE,
@@ -795,16 +931,93 @@ fsck_err:
 	goto out;
 }
 
+static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
+						struct btree_iter *iter,
+						struct bpos pos)
+{
+	return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
+}
+
+static struct bkey_s_c_dirent inode_get_dirent(struct btree_trans *trans,
+					       struct btree_iter *iter,
+					       struct bch_inode_unpacked *inode,
+					       u32 *snapshot)
+{
+	if (inode->bi_subvol) {
+		u64 inum;
+		int ret = subvol_lookup(trans, inode->bi_parent_subvol, snapshot, &inum);
+		if (ret)
+			return ((struct bkey_s_c_dirent) { .k = ERR_PTR(ret) });
+	}
+
+	return dirent_get_by_pos(trans, iter, SPOS(inode->bi_dir, inode->bi_dir_offset, *snapshot));
+}
+
+static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
+				   struct bkey_s_c_dirent d)
+{
+	return  inode->bi_dir		== d.k->p.inode &&
+		inode->bi_dir_offset	== d.k->p.offset;
+}
+
+static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
+				   struct bch_inode_unpacked *inode)
+{
+	return d.v->d_type == DT_SUBVOL
+		? le32_to_cpu(d.v->d_child_subvol)	== inode->bi_subvol
+		: le64_to_cpu(d.v->d_inum)		== inode->bi_inum;
+}
+
 static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
 {
 	struct btree_iter iter;
 	struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
-	int ret = bkey_err(k);
-	if (ret)
+	int ret = bkey_err(k) ?: k.k->type == KEY_TYPE_set;
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+static int check_inode_dirent_inode(struct btree_trans *trans, struct bkey_s_c inode_k,
+				    struct bch_inode_unpacked *inode,
+				    u32 inode_snapshot, bool *write_inode)
+{
+	struct bch_fs *c = trans->c;
+	struct printbuf buf = PRINTBUF;
+
+	struct btree_iter dirent_iter = {};
+	struct bkey_s_c_dirent d = inode_get_dirent(trans, &dirent_iter, inode, &inode_snapshot);
+	int ret = bkey_err(d);
+	if (ret && !bch2_err_matches(ret, ENOENT))
 		return ret;
 
-	bch2_trans_iter_exit(trans, &iter);
-	return k.k->type == KEY_TYPE_set;
+	if (fsck_err_on(ret,
+			c, inode_points_to_missing_dirent,
+			"inode points to missing dirent\n%s",
+			(bch2_bkey_val_to_text(&buf, c, inode_k), buf.buf)) ||
+	    fsck_err_on(!ret && !dirent_points_to_inode(d, inode),
+			c, inode_points_to_wrong_dirent,
+			"inode points to dirent that does not point back:\n%s",
+			(bch2_bkey_val_to_text(&buf, c, inode_k),
+			 prt_newline(&buf),
+			 bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
+		/*
+		 * We just clear the backpointer fields for now. If we find a
+		 * dirent that points to this inode in check_dirents(), we'll
+		 * update it then; then when we get to check_path() if the
+		 * backpointer is still 0 we'll reattach it.
+		 */
+		inode->bi_dir = 0;
+		inode->bi_dir_offset = 0;
+		inode->bi_flags &= ~BCH_INODE_backptr_untrusted;
+		*write_inode = true;
+	}
+
+	ret = 0;
+fsck_err:
+	bch2_trans_iter_exit(trans, &dirent_iter);
+	printbuf_exit(&buf);
+	bch_err_fn(c, ret);
+	return ret;
 }
 
 static int check_inode(struct btree_trans *trans,
@@ -861,7 +1074,8 @@ static int check_inode(struct btree_trans *trans,
 
 		u.bi_flags &= ~BCH_INODE_i_size_dirty|BCH_INODE_unlinked;
 
-		ret = __write_inode(trans, &u, iter->pos.snapshot);
+		ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
+
 		bch_err_msg(c, ret, "in fsck updating inode");
 		if (ret)
 			return ret;
@@ -876,7 +1090,7 @@ static int check_inode(struct btree_trans *trans,
 		if (ret < 0)
 			return ret;
 
-		fsck_err_on(ret, c, unlinked_inode_not_on_deleted_list,
+		fsck_err_on(!ret, c, unlinked_inode_not_on_deleted_list,
 			    "inode %llu:%u unlinked, but not on deleted list",
 			    u.bi_inum, k.k->p.snapshot);
 		ret = 0;
@@ -950,8 +1164,54 @@ static int check_inode(struct btree_trans *trans,
 		do_update = true;
 	}
 
+	if (u.bi_dir || u.bi_dir_offset) {
+		ret = check_inode_dirent_inode(trans, k, &u, k.k->p.snapshot, &do_update);
+		if (ret)
+			goto err;
+	}
+
+	if (fsck_err_on(u.bi_parent_subvol &&
+			(u.bi_subvol == 0 ||
+			 u.bi_subvol == BCACHEFS_ROOT_SUBVOL),
+			c, inode_bi_parent_nonzero,
+			"inode %llu:%u has subvol %u but nonzero parent subvol %u",
+			u.bi_inum, k.k->p.snapshot, u.bi_subvol, u.bi_parent_subvol)) {
+		u.bi_parent_subvol = 0;
+		do_update = true;
+	}
+
+	if (u.bi_subvol) {
+		struct bch_subvolume s;
+
+		ret = bch2_subvolume_get(trans, u.bi_subvol, false, 0, &s);
+		if (ret && !bch2_err_matches(ret, ENOENT))
+			goto err;
+
+		if (ret && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
+			ret = reconstruct_subvol(trans, k.k->p.snapshot, u.bi_subvol, u.bi_inum);
+			goto do_update;
+		}
+
+		if (fsck_err_on(ret,
+				c, inode_bi_subvol_missing,
+				"inode %llu:%u bi_subvol points to missing subvolume %u",
+				u.bi_inum, k.k->p.snapshot, u.bi_subvol) ||
+		    fsck_err_on(le64_to_cpu(s.inode) != u.bi_inum ||
+				!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.snapshot),
+							   k.k->p.snapshot),
+				c, inode_bi_subvol_wrong,
+				"inode %llu:%u points to subvol %u, but subvol points to %llu:%u",
+				u.bi_inum, k.k->p.snapshot, u.bi_subvol,
+				le64_to_cpu(s.inode),
+				le32_to_cpu(s.snapshot))) {
+			u.bi_subvol = 0;
+			u.bi_parent_subvol = 0;
+			do_update = true;
+		}
+	}
+do_update:
 	if (do_update) {
-		ret = __write_inode(trans, &u, iter->pos.snapshot);
+		ret = __bch2_fsck_write_inode(trans, &u, iter->pos.snapshot);
 		bch_err_msg(c, ret, "in fsck updating inode");
 		if (ret)
 			return ret;
@@ -982,32 +1242,9 @@ int bch2_check_inodes(struct bch_fs *c)
 	return ret;
 }
 
-static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans,
-						struct btree_iter *iter,
-						struct bpos pos)
-{
-	return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent);
-}
-
-static bool inode_points_to_dirent(struct bch_inode_unpacked *inode,
-				   struct bkey_s_c_dirent d)
-{
-	return  inode->bi_dir		== d.k->p.inode &&
-		inode->bi_dir_offset	== d.k->p.offset;
-}
-
-static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
-				   struct bch_inode_unpacked *inode)
-{
-	return d.v->d_type == DT_SUBVOL
-		? le32_to_cpu(d.v->d_child_subvol)	== inode->bi_subvol
-		: le64_to_cpu(d.v->d_inum)		== inode->bi_inum;
-}
-
-static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
+static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_walker *w)
 {
 	struct bch_fs *c = trans->c;
-	u32 restart_count = trans->restart_count;
 	int ret = 0;
 	s64 count2;
 
@@ -1021,8 +1258,8 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
 			i->count = count2;
 
 		if (i->count != count2) {
-			bch_err(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
-				w->last_pos.inode, i->snapshot, i->count, count2);
+			bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu",
+					    w->last_pos.inode, i->snapshot, i->count, count2);
 			return -BCH_ERR_internal_fsck_err;
 		}
 
@@ -1032,14 +1269,21 @@ static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
 				w->last_pos.inode, i->snapshot,
 				i->inode.bi_sectors, i->count)) {
 			i->inode.bi_sectors = i->count;
-			ret = fsck_write_inode(trans, &i->inode, i->snapshot);
+			ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
 			if (ret)
 				break;
 		}
 	}
 fsck_err:
 	bch_err_fn(c, ret);
-	return ret ?: trans_was_restarted(trans, restart_count);
+	return ret;
+}
+
+static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
+{
+	u32 restart_count = trans->restart_count;
+	return check_i_sectors_notnested(trans, w) ?:
+		trans_was_restarted(trans, restart_count);
 }
 
 struct extent_end {
@@ -1255,10 +1499,6 @@ static int check_overlapping_extents(struct btree_trans *trans,
 			goto err;
 	}
 
-	ret = extent_ends_at(c, extent_ends, seen, k);
-	if (ret)
-		goto err;
-
 	extent_ends->last_pos = k.k->p;
 err:
 	return ret;
@@ -1312,7 +1552,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
 			goto err;
 	}
 
-	i = walk_inode(trans, inode, equiv, k.k->type == KEY_TYPE_whiteout);
+	i = walk_inode(trans, inode, k);
 	ret = PTR_ERR_OR_ZERO(i);
 	if (ret)
 		goto err;
@@ -1322,6 +1562,17 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
 		goto err;
 
 	if (k.k->type != KEY_TYPE_whiteout) {
+		if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
+			ret =   reconstruct_reg_inode(trans, k.k->p.snapshot, k.k->p.inode) ?:
+				bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+			if (ret)
+				goto err;
+
+			inode->last_pos.inode--;
+			ret = -BCH_ERR_transaction_restart_nested;
+			goto err;
+		}
+
 		if (fsck_err_on(!i, c, extent_in_missing_inode,
 				"extent in missing inode:\n  %s",
 				(printbuf_reset(&buf),
@@ -1388,6 +1639,12 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
 
 		i->seen_this_pos = true;
 	}
+
+	if (k.k->type != KEY_TYPE_whiteout) {
+		ret = extent_ends_at(c, extent_ends, s, k);
+		if (ret)
+			goto err;
+	}
 out:
 err:
 fsck_err:
@@ -1423,7 +1680,7 @@ int bch2_check_extents(struct bch_fs *c)
 			check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
 			check_extent_overbig(trans, &iter, k);
 		})) ?:
-		check_i_sectors(trans, &w));
+		check_i_sectors_notnested(trans, &w));
 
 	bch2_disk_reservation_put(c, &res);
 	extent_ends_exit(&extent_ends);
@@ -1453,10 +1710,9 @@ int bch2_check_indirect_extents(struct bch_fs *c)
 	return ret;
 }
 
-static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
+static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_walker *w)
 {
 	struct bch_fs *c = trans->c;
-	u32 restart_count = trans->restart_count;
 	int ret = 0;
 	s64 count2;
 
@@ -1469,8 +1725,8 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
 			return count2;
 
 		if (i->count != count2) {
-			bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu",
-				i->count, count2);
+			bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu",
+					    w->last_pos.inode, i->snapshot, i->count, count2);
 			i->count = count2;
 			if (i->inode.bi_nlink == i->count)
 				continue;
@@ -1481,96 +1737,123 @@ static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
 				"directory %llu:%u with wrong i_nlink: got %u, should be %llu",
 				w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) {
 			i->inode.bi_nlink = i->count;
-			ret = fsck_write_inode(trans, &i->inode, i->snapshot);
+			ret = bch2_fsck_write_inode(trans, &i->inode, i->snapshot);
 			if (ret)
 				break;
 		}
 	}
 fsck_err:
 	bch_err_fn(c, ret);
-	return ret ?: trans_was_restarted(trans, restart_count);
+	return ret;
 }
 
-static int check_dirent_target(struct btree_trans *trans,
-			       struct btree_iter *iter,
-			       struct bkey_s_c_dirent d,
-			       struct bch_inode_unpacked *target,
-			       u32 target_snapshot)
+static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
+{
+	u32 restart_count = trans->restart_count;
+	return check_subdir_count_notnested(trans, w) ?:
+		trans_was_restarted(trans, restart_count);
+}
+
+static int check_dirent_inode_dirent(struct btree_trans *trans,
+				   struct btree_iter *iter,
+				   struct bkey_s_c_dirent d,
+				   struct bch_inode_unpacked *target,
+				   u32 target_snapshot)
 {
 	struct bch_fs *c = trans->c;
-	struct bkey_i_dirent *n;
 	struct printbuf buf = PRINTBUF;
-	struct btree_iter bp_iter = { NULL };
 	int ret = 0;
 
+	if (inode_points_to_dirent(target, d))
+		return 0;
+
 	if (!target->bi_dir &&
 	    !target->bi_dir_offset) {
 		target->bi_dir		= d.k->p.inode;
 		target->bi_dir_offset	= d.k->p.offset;
-
-		ret = __write_inode(trans, target, target_snapshot);
-		if (ret)
-			goto err;
+		return __bch2_fsck_write_inode(trans, target, target_snapshot);
 	}
 
-	if (!inode_points_to_dirent(target, d)) {
-		struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
-				      SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
-		ret = bkey_err(bp_dirent);
-		if (ret && !bch2_err_matches(ret, ENOENT))
-			goto err;
+	struct btree_iter bp_iter = { NULL };
+	struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
+			      SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
+	ret = bkey_err(bp_dirent);
+	if (ret && !bch2_err_matches(ret, ENOENT))
+		goto err;
 
-		bool backpointer_exists = !ret;
-		ret = 0;
+	bool backpointer_exists = !ret;
+	ret = 0;
+
+	if (fsck_err_on(!backpointer_exists,
+			c, inode_wrong_backpointer,
+			"inode %llu:%u has wrong backpointer:\n"
+			"got       %llu:%llu\n"
+			"should be %llu:%llu",
+			target->bi_inum, target_snapshot,
+			target->bi_dir,
+			target->bi_dir_offset,
+			d.k->p.inode,
+			d.k->p.offset)) {
+		target->bi_dir		= d.k->p.inode;
+		target->bi_dir_offset	= d.k->p.offset;
+		ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
+		goto out;
+	}
 
-		bch2_bkey_val_to_text(&buf, c, d.s_c);
-		prt_newline(&buf);
-		if (backpointer_exists)
-			bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
+	bch2_bkey_val_to_text(&buf, c, d.s_c);
+	prt_newline(&buf);
+	if (backpointer_exists)
+		bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
+
+	if (fsck_err_on(backpointer_exists &&
+			(S_ISDIR(target->bi_mode) ||
+			 target->bi_subvol),
+			c, inode_dir_multiple_links,
+			"%s %llu:%u with multiple links\n%s",
+			S_ISDIR(target->bi_mode) ? "directory" : "subvolume",
+			target->bi_inum, target_snapshot, buf.buf)) {
+		ret = __remove_dirent(trans, d.k->p);
+		goto out;
+	}
 
-		if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists,
-				c, inode_dir_multiple_links,
-				"directory %llu:%u with multiple links\n%s",
-				target->bi_inum, target_snapshot, buf.buf)) {
-			ret = __remove_dirent(trans, d.k->p);
-			goto out;
-		}
+	/*
+	 * hardlinked file with nlink 0:
+	 * We're just adjusting nlink here so check_nlinks() will pick
+	 * it up, it ignores inodes with nlink 0
+	 */
+	if (fsck_err_on(backpointer_exists && !target->bi_nlink,
+			c, inode_multiple_links_but_nlink_0,
+			"inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
+			target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
+		target->bi_nlink++;
+		target->bi_flags &= ~BCH_INODE_unlinked;
+		ret = __bch2_fsck_write_inode(trans, target, target_snapshot);
+		if (ret)
+			goto err;
+	}
+out:
+err:
+fsck_err:
+	bch2_trans_iter_exit(trans, &bp_iter);
+	printbuf_exit(&buf);
+	bch_err_fn(c, ret);
+	return ret;
+}
 
-		/*
-		 * hardlinked file with nlink 0:
-		 * We're just adjusting nlink here so check_nlinks() will pick
-		 * it up, it ignores inodes with nlink 0
-		 */
-		if (fsck_err_on(backpointer_exists && !target->bi_nlink,
-				c, inode_multiple_links_but_nlink_0,
-				"inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
-				target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
-			target->bi_nlink++;
-			target->bi_flags &= ~BCH_INODE_unlinked;
-
-			ret = __write_inode(trans, target, target_snapshot);
-			if (ret)
-				goto err;
-		}
+static int check_dirent_target(struct btree_trans *trans,
+			       struct btree_iter *iter,
+			       struct bkey_s_c_dirent d,
+			       struct bch_inode_unpacked *target,
+			       u32 target_snapshot)
+{
+	struct bch_fs *c = trans->c;
+	struct bkey_i_dirent *n;
+	struct printbuf buf = PRINTBUF;
+	int ret = 0;
 
-		if (fsck_err_on(!backpointer_exists,
-				c, inode_wrong_backpointer,
-				"inode %llu:%u has wrong backpointer:\n"
-				"got       %llu:%llu\n"
-				"should be %llu:%llu",
-				target->bi_inum, target_snapshot,
-				target->bi_dir,
-				target->bi_dir_offset,
-				d.k->p.inode,
-				d.k->p.offset)) {
-			target->bi_dir		= d.k->p.inode;
-			target->bi_dir_offset	= d.k->p.offset;
-
-			ret = __write_inode(trans, target, target_snapshot);
-			if (ret)
-				goto err;
-		}
-	}
+	ret = check_dirent_inode_dirent(trans, iter, d, target, target_snapshot);
+	if (ret)
+		goto err;
 
 	if (fsck_err_on(d.v->d_type != inode_d_type(target),
 			c, dirent_d_type_wrong,
@@ -1586,6 +1869,12 @@ static int check_dirent_target(struct btree_trans *trans,
 
 		bkey_reassemble(&n->k_i, d.s_c);
 		n->v.d_type = inode_d_type(target);
+		if (n->v.d_type == DT_SUBVOL) {
+			n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
+			n->v.d_child_subvol = cpu_to_le32(target->bi_subvol);
+		} else {
+			n->v.d_inum = cpu_to_le64(target->bi_inum);
+		}
 
 		ret = bch2_trans_update(trans, iter, &n->k_i, 0);
 		if (ret)
@@ -1593,33 +1882,163 @@ static int check_dirent_target(struct btree_trans *trans,
 
 		d = dirent_i_to_s_c(n);
 	}
+err:
+fsck_err:
+	printbuf_exit(&buf);
+	bch_err_fn(c, ret);
+	return ret;
+}
 
-	if (fsck_err_on(d.v->d_type == DT_SUBVOL &&
-			target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol),
-			c, dirent_d_parent_subvol_wrong,
-			"dirent has wrong d_parent_subvol field: got %u, should be %u",
-			le32_to_cpu(d.v->d_parent_subvol),
-			target->bi_parent_subvol)) {
-		n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
-		ret = PTR_ERR_OR_ZERO(n);
+/* find a subvolume that's a descendent of @snapshot: */
+static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	int ret;
+
+	for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, 0, k, ret) {
+		if (k.k->type != KEY_TYPE_subvolume)
+			continue;
+
+		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
+		if (bch2_snapshot_is_ancestor(trans->c, le32_to_cpu(s.v->snapshot), snapshot)) {
+			bch2_trans_iter_exit(trans, &iter);
+			*subvolid = k.k->p.offset;
+			goto found;
+		}
+	}
+	if (!ret)
+		ret = -ENOENT;
+found:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter *iter,
+				  struct bkey_s_c_dirent d)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter subvol_iter = {};
+	struct bch_inode_unpacked subvol_root;
+	u32 parent_subvol = le32_to_cpu(d.v->d_parent_subvol);
+	u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
+	u32 parent_snapshot;
+	u32 new_parent_subvol = 0;
+	u64 parent_inum;
+	struct printbuf buf = PRINTBUF;
+	int ret = 0;
+
+	ret = subvol_lookup(trans, parent_subvol, &parent_snapshot, &parent_inum);
+	if (ret && !bch2_err_matches(ret, ENOENT))
+		return ret;
+
+	if (ret ||
+	    (!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot))) {
+		int ret2 = find_snapshot_subvol(trans, d.k->p.snapshot, &new_parent_subvol);
+		if (ret2 && !bch2_err_matches(ret, ENOENT))
+			return ret2;
+	}
+
+	if (ret &&
+	    !new_parent_subvol &&
+	    (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_subvolumes))) {
+		/*
+		 * Couldn't find a subvol for dirent's snapshot - but we lost
+		 * subvols, so we need to reconstruct:
+		 */
+		ret = reconstruct_subvol(trans, d.k->p.snapshot, parent_subvol, 0);
+		if (ret)
+			return ret;
+
+		parent_snapshot = d.k->p.snapshot;
+	}
+
+	if (fsck_err_on(ret, c, dirent_to_missing_parent_subvol,
+			"dirent parent_subvol points to missing subvolume\n%s",
+			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)) ||
+	    fsck_err_on(!ret && !bch2_snapshot_is_ancestor(c, parent_snapshot, d.k->p.snapshot),
+			c, dirent_not_visible_in_parent_subvol,
+			"dirent not visible in parent_subvol (not an ancestor of subvol snap %u)\n%s",
+			parent_snapshot,
+			(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) {
+		if (!new_parent_subvol) {
+			bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot);
+			return -BCH_ERR_fsck_repair_unimplemented;
+		}
+
+		struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent);
+		ret = PTR_ERR_OR_ZERO(new_dirent);
 		if (ret)
 			goto err;
 
-		bkey_reassemble(&n->k_i, d.s_c);
-		n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol);
+		new_dirent->v.d_parent_subvol = cpu_to_le32(new_parent_subvol);
+	}
 
-		ret = bch2_trans_update(trans, iter, &n->k_i, 0);
+	struct bkey_s_c_subvolume s =
+		bch2_bkey_get_iter_typed(trans, &subvol_iter,
+					 BTREE_ID_subvolumes, POS(0, target_subvol),
+					 0, subvolume);
+	ret = bkey_err(s.s_c);
+	if (ret && !bch2_err_matches(ret, ENOENT))
+		return ret;
+
+	if (ret) {
+		if (fsck_err(c, dirent_to_missing_subvol,
+			     "dirent points to missing subvolume\n%s",
+			     (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)))
+			return __remove_dirent(trans, d.k->p);
+		ret = 0;
+		goto out;
+	}
+
+	if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol,
+			c, subvol_fs_path_parent_wrong,
+			"subvol with wrong fs_path_parent, should be be %u\n%s",
+			parent_subvol,
+			(bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) {
+		struct bkey_i_subvolume *n =
+			bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume);
+		ret = PTR_ERR_OR_ZERO(n);
 		if (ret)
 			goto err;
 
-		d = dirent_i_to_s_c(n);
+		n->v.fs_path_parent = cpu_to_le32(parent_subvol);
+	}
+
+	u64 target_inum = le64_to_cpu(s.v->inode);
+	u32 target_snapshot = le32_to_cpu(s.v->snapshot);
+
+	ret = lookup_inode(trans, target_inum, &subvol_root, &target_snapshot);
+	if (ret && !bch2_err_matches(ret, ENOENT))
+		goto err;
+
+	if (ret) {
+		bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum);
+		ret = -BCH_ERR_fsck_repair_unimplemented;
+		ret = 0;
+		goto err;
+	}
+
+	if (fsck_err_on(!ret && parent_subvol != subvol_root.bi_parent_subvol,
+			c, inode_bi_parent_wrong,
+			"subvol root %llu has wrong bi_parent_subvol: got %u, should be %u",
+			target_inum,
+			subvol_root.bi_parent_subvol, parent_subvol)) {
+		subvol_root.bi_parent_subvol = parent_subvol;
+		ret = __bch2_fsck_write_inode(trans, &subvol_root, target_snapshot);
+		if (ret)
+			goto err;
 	}
+
+	ret = check_dirent_target(trans, iter, d, &subvol_root,
+				  target_snapshot);
+	if (ret)
+		goto err;
 out:
 err:
 fsck_err:
-	bch2_trans_iter_exit(trans, &bp_iter);
+	bch2_trans_iter_exit(trans, &subvol_iter);
 	printbuf_exit(&buf);
-	bch_err_fn(c, ret);
 	return ret;
 }
 
@@ -1631,7 +2050,6 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
 			struct snapshots_seen *s)
 {
 	struct bch_fs *c = trans->c;
-	struct bkey_s_c_dirent d;
 	struct inode_walker_entry *i;
 	struct printbuf buf = PRINTBUF;
 	struct bpos equiv;
@@ -1661,7 +2079,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
 
 	BUG_ON(!btree_iter_path(trans, iter)->should_be_locked);
 
-	i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout);
+	i = walk_inode(trans, dir, k);
 	ret = PTR_ERR_OR_ZERO(i);
 	if (ret < 0)
 		goto err;
@@ -1670,6 +2088,17 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
 		*hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode);
 	dir->first_this_inode = false;
 
+	if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) {
+		ret =   reconstruct_inode(trans, k.k->p.snapshot, k.k->p.inode, 0, S_IFDIR) ?:
+			bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
+		if (ret)
+			goto err;
+
+		dir->last_pos.inode--;
+		ret = -BCH_ERR_transaction_restart_nested;
+		goto err;
+	}
+
 	if (fsck_err_on(!i, c, dirent_in_missing_dir_inode,
 			"dirent in nonexisting directory:\n%s",
 			(printbuf_reset(&buf),
@@ -1704,53 +2133,10 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
 	if (k.k->type != KEY_TYPE_dirent)
 		goto out;
 
-	d = bkey_s_c_to_dirent(k);
+	struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
 
 	if (d.v->d_type == DT_SUBVOL) {
-		struct bch_inode_unpacked subvol_root;
-		u32 target_subvol = le32_to_cpu(d.v->d_child_subvol);
-		u32 target_snapshot;
-		u64 target_inum;
-
-		ret = subvol_lookup(trans, target_subvol,
-				      &target_snapshot, &target_inum);
-		if (ret && !bch2_err_matches(ret, ENOENT))
-			goto err;
-
-		if (fsck_err_on(ret, c, dirent_to_missing_subvol,
-				"dirent points to missing subvolume %u",
-				le32_to_cpu(d.v->d_child_subvol))) {
-			ret = __remove_dirent(trans, d.k->p);
-			goto err;
-		}
-
-		ret = lookup_inode(trans, target_inum,
-				   &subvol_root, &target_snapshot);
-		if (ret && !bch2_err_matches(ret, ENOENT))
-			goto err;
-
-		if (fsck_err_on(ret, c, subvol_to_missing_root,
-				"subvolume %u points to missing subvolume root %llu",
-				target_subvol,
-				target_inum)) {
-			bch_err(c, "repair not implemented yet");
-			ret = -EINVAL;
-			goto err;
-		}
-
-		if (fsck_err_on(subvol_root.bi_subvol != target_subvol,
-				c, subvol_root_wrong_bi_subvol,
-				"subvol root %llu has wrong bi_subvol field: got %u, should be %u",
-				target_inum,
-				subvol_root.bi_subvol, target_subvol)) {
-			subvol_root.bi_subvol = target_subvol;
-			ret = __write_inode(trans, &subvol_root, target_snapshot);
-			if (ret)
-				goto err;
-		}
-
-		ret = check_dirent_target(trans, iter, d, &subvol_root,
-					  target_snapshot);
+		ret = check_dirent_to_subvol(trans, iter, d);
 		if (ret)
 			goto err;
 	} else {
@@ -1776,12 +2162,11 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
 			if (ret)
 				goto err;
 		}
-	}
-
-	if (d.v->d_type == DT_DIR)
-		for_each_visible_inode(c, s, dir, equiv.snapshot, i)
-			i->count++;
 
+		if (d.v->d_type == DT_DIR)
+			for_each_visible_inode(c, s, dir, equiv.snapshot, i)
+				i->count++;
+	}
 out:
 err:
 fsck_err:
@@ -1810,7 +2195,8 @@ int bch2_check_dirents(struct bch_fs *c)
 				k,
 				NULL, NULL,
 				BCH_TRANS_COMMIT_no_enospc,
-			check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)));
+			check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?:
+		check_subdir_count_notnested(trans, &dir));
 
 	snapshots_seen_exit(&s);
 	inode_walker_exit(&dir);
@@ -1829,10 +2215,12 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
 	int ret;
 
 	ret = check_key_has_snapshot(trans, iter, k);
-	if (ret)
+	if (ret < 0)
 		return ret;
+	if (ret)
+		return 0;
 
-	i = walk_inode(trans, inode, k.k->p, k.k->type == KEY_TYPE_whiteout);
+	i = walk_inode(trans, inode, k);
 	ret = PTR_ERR_OR_ZERO(i);
 	if (ret)
 		return ret;
@@ -1890,17 +2278,21 @@ static int check_root_trans(struct btree_trans *trans)
 
 	if (mustfix_fsck_err_on(ret, c, root_subvol_missing,
 				"root subvol missing")) {
-		struct bkey_i_subvolume root_subvol;
+		struct bkey_i_subvolume *root_subvol =
+			bch2_trans_kmalloc(trans, sizeof(*root_subvol));
+		ret = PTR_ERR_OR_ZERO(root_subvol);
+		if (ret)
+			goto err;
 
 		snapshot	= U32_MAX;
 		inum		= BCACHEFS_ROOT_INO;
 
-		bkey_subvolume_init(&root_subvol.k_i);
-		root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL;
-		root_subvol.v.flags	= 0;
-		root_subvol.v.snapshot	= cpu_to_le32(snapshot);
-		root_subvol.v.inode	= cpu_to_le64(inum);
-		ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol.k_i, 0);
+		bkey_subvolume_init(&root_subvol->k_i);
+		root_subvol->k.p.offset = BCACHEFS_ROOT_SUBVOL;
+		root_subvol->v.flags	= 0;
+		root_subvol->v.snapshot	= cpu_to_le32(snapshot);
+		root_subvol->v.inode	= cpu_to_le64(inum);
+		ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol->k_i, 0);
 		bch_err_msg(c, ret, "writing root subvol");
 		if (ret)
 			goto err;
@@ -1919,7 +2311,7 @@ static int check_root_trans(struct btree_trans *trans)
 				0, NULL);
 		root_inode.bi_inum = inum;
 
-		ret = __write_inode(trans, &root_inode, snapshot);
+		ret = __bch2_fsck_write_inode(trans, &root_inode, snapshot);
 		bch_err_msg(c, ret, "writing root inode");
 	}
 err:
@@ -1936,6 +2328,107 @@ int bch2_check_root(struct bch_fs *c)
 	return ret;
 }
 
+typedef DARRAY(u32) darray_u32;
+
+static bool darray_u32_has(darray_u32 *d, u32 v)
+{
+	darray_for_each(*d, i)
+		if (*i == v)
+			return true;
+	return false;
+}
+
+/*
+ * We've checked that inode backpointers point to valid dirents; here, it's
+ * sufficient to check that the subvolume root has a dirent:
+ */
+static int subvol_has_dirent(struct btree_trans *trans, struct bkey_s_c_subvolume s)
+{
+	struct bch_inode_unpacked inode;
+	int ret = bch2_inode_find_by_inum_trans(trans,
+				(subvol_inum) { s.k->p.offset, le64_to_cpu(s.v->inode) },
+				&inode);
+	if (ret)
+		return ret;
+
+	return inode.bi_dir != 0;
+}
+
+static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_iter parent_iter = {};
+	darray_u32 subvol_path = {};
+	struct printbuf buf = PRINTBUF;
+	int ret = 0;
+
+	if (k.k->type != KEY_TYPE_subvolume)
+		return 0;
+
+	while (k.k->p.offset != BCACHEFS_ROOT_SUBVOL) {
+		ret = darray_push(&subvol_path, k.k->p.offset);
+		if (ret)
+			goto err;
+
+		struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
+
+		ret = subvol_has_dirent(trans, s);
+		if (ret < 0)
+			break;
+
+		if (fsck_err_on(!ret,
+				c, subvol_unreachable,
+				"unreachable subvolume %s",
+				(bch2_bkey_val_to_text(&buf, c, s.s_c),
+				 buf.buf))) {
+			ret = reattach_subvol(trans, s);
+			break;
+		}
+
+		u32 parent = le32_to_cpu(s.v->fs_path_parent);
+
+		if (darray_u32_has(&subvol_path, parent)) {
+			if (fsck_err(c, subvol_loop, "subvolume loop"))
+				ret = reattach_subvol(trans, s);
+			break;
+		}
+
+		bch2_trans_iter_exit(trans, &parent_iter);
+		bch2_trans_iter_init(trans, &parent_iter,
+				     BTREE_ID_subvolumes, POS(0, parent), 0);
+		k = bch2_btree_iter_peek_slot(&parent_iter);
+		ret = bkey_err(k);
+		if (ret)
+			goto err;
+
+		if (fsck_err_on(k.k->type != KEY_TYPE_subvolume,
+				c, subvol_unreachable,
+				"unreachable subvolume %s",
+				(bch2_bkey_val_to_text(&buf, c, s.s_c),
+				 buf.buf))) {
+			ret = reattach_subvol(trans, s);
+			break;
+		}
+	}
+fsck_err:
+err:
+	printbuf_exit(&buf);
+	darray_exit(&subvol_path);
+	bch2_trans_iter_exit(trans, &parent_iter);
+	return ret;
+}
+
+int bch2_check_subvolume_structure(struct bch_fs *c)
+{
+	int ret = bch2_trans_run(c,
+		for_each_btree_key_commit(trans, iter,
+				BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
+				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+			check_subvol_path(trans, &iter, k)));
+	bch_err_fn(c, ret);
+	return ret;
+}
+
 struct pathbuf_entry {
 	u64	inum;
 	u32	snapshot;
@@ -1952,89 +2445,71 @@ static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
 	return false;
 }
 
-static int path_down(struct bch_fs *c, pathbuf *p,
-		     u64 inum, u32 snapshot)
-{
-	int ret = darray_push(p, ((struct pathbuf_entry) {
-		.inum		= inum,
-		.snapshot	= snapshot,
-	}));
-
-	if (ret)
-		bch_err(c, "fsck: error allocating memory for pathbuf, size %zu",
-			p->size);
-	return ret;
-}
-
 /*
- * Check that a given inode is reachable from the root:
+ * Check that a given inode is reachable from its subvolume root - we already
+ * verified subvolume connectivity:
  *
  * XXX: we should also be verifying that inodes are in the right subvolumes
  */
-static int check_path(struct btree_trans *trans,
-		      pathbuf *p,
-		      struct bch_inode_unpacked *inode,
-		      u32 snapshot)
+static int check_path(struct btree_trans *trans, pathbuf *p, struct bkey_s_c inode_k)
 {
 	struct bch_fs *c = trans->c;
+	struct btree_iter inode_iter = {};
+	struct bch_inode_unpacked inode;
+	struct printbuf buf = PRINTBUF;
+	u32 snapshot = bch2_snapshot_equiv(c, inode_k.k->p.snapshot);
 	int ret = 0;
 
-	snapshot = bch2_snapshot_equiv(c, snapshot);
 	p->nr = 0;
 
-	while (!(inode->bi_inum == BCACHEFS_ROOT_INO &&
-		 inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) {
+	BUG_ON(bch2_inode_unpack(inode_k, &inode));
+
+	while (!inode.bi_subvol) {
 		struct btree_iter dirent_iter;
 		struct bkey_s_c_dirent d;
 		u32 parent_snapshot = snapshot;
 
-		if (inode->bi_subvol) {
-			u64 inum;
-
-			ret = subvol_lookup(trans, inode->bi_parent_subvol,
-					    &parent_snapshot, &inum);
-			if (ret)
-				break;
-		}
-
-		d = dirent_get_by_pos(trans, &dirent_iter,
-				      SPOS(inode->bi_dir, inode->bi_dir_offset,
-					   parent_snapshot));
+		d = inode_get_dirent(trans, &dirent_iter, &inode, &parent_snapshot);
 		ret = bkey_err(d.s_c);
 		if (ret && !bch2_err_matches(ret, ENOENT))
 			break;
 
-		if (!ret && !dirent_points_to_inode(d, inode)) {
+		if (!ret && !dirent_points_to_inode(d, &inode)) {
 			bch2_trans_iter_exit(trans, &dirent_iter);
 			ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode;
 		}
 
 		if (bch2_err_matches(ret, ENOENT)) {
-			if (fsck_err(c,  inode_unreachable,
-				     "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu",
-				     inode->bi_inum, snapshot,
-				     bch2_d_type_str(inode_d_type(inode)),
-				     inode->bi_nlink,
-				     inode->bi_dir,
-				     inode->bi_dir_offset))
-				ret = reattach_inode(trans, inode, snapshot);
-			break;
+			ret = 0;
+			if (fsck_err(c, inode_unreachable,
+				     "unreachable inode\n%s",
+				     (printbuf_reset(&buf),
+				      bch2_bkey_val_to_text(&buf, c, inode_k),
+				      buf.buf)))
+				ret = reattach_inode(trans, &inode, snapshot);
+			goto out;
 		}
 
 		bch2_trans_iter_exit(trans, &dirent_iter);
 
-		if (!S_ISDIR(inode->bi_mode))
+		if (!S_ISDIR(inode.bi_mode))
 			break;
 
-		ret = path_down(c, p, inode->bi_inum, snapshot);
-		if (ret) {
-			bch_err(c, "memory allocation failure");
+		ret = darray_push(p, ((struct pathbuf_entry) {
+			.inum		= inode.bi_inum,
+			.snapshot	= snapshot,
+		}));
+		if (ret)
 			return ret;
-		}
 
 		snapshot = parent_snapshot;
 
-		ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
+		bch2_trans_iter_exit(trans, &inode_iter);
+		inode_k = bch2_bkey_get_iter(trans, &inode_iter, BTREE_ID_inodes,
+					     SPOS(0, inode.bi_dir, snapshot), 0);
+		ret = bkey_err(inode_k) ?:
+			!bkey_is_inode(inode_k.k) ? -BCH_ERR_ENOENT_inode
+			: bch2_inode_unpack(inode_k, &inode);
 		if (ret) {
 			/* Should have been caught in dirents pass */
 			if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -2042,30 +2517,32 @@ static int check_path(struct btree_trans *trans,
 			break;
 		}
 
-		if (path_is_dup(p, inode->bi_inum, snapshot)) {
+		snapshot = inode_k.k->p.snapshot;
+
+		if (path_is_dup(p, inode.bi_inum, snapshot)) {
 			/* XXX print path */
 			bch_err(c, "directory structure loop");
 
 			darray_for_each(*p, i)
 				pr_err("%llu:%u", i->inum, i->snapshot);
-			pr_err("%llu:%u", inode->bi_inum, snapshot);
-
-			if (!fsck_err(c, dir_loop, "directory structure loop"))
-				return 0;
+			pr_err("%llu:%u", inode.bi_inum, snapshot);
 
-			ret = remove_backpointer(trans, inode);
-			if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+			if (fsck_err(c, dir_loop, "directory structure loop")) {
+				ret = remove_backpointer(trans, &inode);
 				bch_err_msg(c, ret, "removing dirent");
-			if (ret)
-				break;
+				if (ret)
+					break;
 
-			ret = reattach_inode(trans, inode, snapshot);
-			if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
-				bch_err_msg(c, ret, "reattaching inode %llu", inode->bi_inum);
+				ret = reattach_inode(trans, &inode, snapshot);
+				bch_err_msg(c, ret, "reattaching inode %llu", inode.bi_inum);
+			}
 			break;
 		}
 	}
+out:
 fsck_err:
+	bch2_trans_iter_exit(trans, &inode_iter);
+	printbuf_exit(&buf);
 	bch_err_fn(c, ret);
 	return ret;
 }
@@ -2077,7 +2554,6 @@ fsck_err:
  */
 int bch2_check_directory_structure(struct bch_fs *c)
 {
-	struct bch_inode_unpacked u;
 	pathbuf path = { 0, };
 	int ret;
 
@@ -2090,12 +2566,10 @@ int bch2_check_directory_structure(struct bch_fs *c)
 			if (!bkey_is_inode(k.k))
 				continue;
 
-			BUG_ON(bch2_inode_unpack(k, &u));
-
-			if (u.bi_flags & BCH_INODE_unlinked)
+			if (bch2_inode_flags(k) & BCH_INODE_unlinked)
 				continue;
 
-			check_path(trans, &path, &u, iter.pos.snapshot);
+			check_path(trans, &path, k);
 		})));
 	darray_exit(&path);
 
@@ -2291,7 +2765,7 @@ static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_ite
 			u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)],
 			bch2_inode_nlink_get(&u), link->count)) {
 		bch2_inode_nlink_set(&u, link->count);
-		ret = __write_inode(trans, &u, k.k->p.snapshot);
+		ret = __bch2_fsck_write_inode(trans, &u, k.k->p.snapshot);
 	}
 fsck_err:
 	return ret;
diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h
index da991e8cf27e..a4ef94271784 100644
--- a/fs/bcachefs/fsck.h
+++ b/fs/bcachefs/fsck.h
@@ -8,6 +8,7 @@ int bch2_check_indirect_extents(struct bch_fs *);
 int bch2_check_dirents(struct bch_fs *);
 int bch2_check_xattrs(struct bch_fs *);
 int bch2_check_root(struct bch_fs *);
+int bch2_check_subvolume_structure(struct bch_fs *);
 int bch2_check_directory_structure(struct bch_fs *);
 int bch2_check_nlinks(struct bch_fs *);
 int bch2_fix_reflink_p(struct bch_fs *);
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 086f0090b03a..ca4a066e9a54 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -324,7 +324,7 @@ int bch2_inode_unpack(struct bkey_s_c k,
 	return bch2_inode_unpack_slowpath(k, unpacked);
 }
 
-static int bch2_inode_peek_nowarn(struct btree_trans *trans,
+int bch2_inode_peek_nowarn(struct btree_trans *trans,
 		    struct btree_iter *iter,
 		    struct bch_inode_unpacked *inode,
 		    subvol_inum inum, unsigned flags)
@@ -384,6 +384,34 @@ int bch2_inode_write_flags(struct btree_trans *trans,
 	return bch2_trans_update(trans, iter, &inode_p->inode.k_i, flags);
 }
 
+int __bch2_fsck_write_inode(struct btree_trans *trans,
+			 struct bch_inode_unpacked *inode,
+			 u32 snapshot)
+{
+	struct bkey_inode_buf *inode_p =
+		bch2_trans_kmalloc(trans, sizeof(*inode_p));
+
+	if (IS_ERR(inode_p))
+		return PTR_ERR(inode_p);
+
+	bch2_inode_pack(inode_p, inode);
+	inode_p->inode.k.p.snapshot = snapshot;
+
+	return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes,
+				&inode_p->inode.k_i,
+				BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+}
+
+int bch2_fsck_write_inode(struct btree_trans *trans,
+			    struct bch_inode_unpacked *inode,
+			    u32 snapshot)
+{
+	int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+			    __bch2_fsck_write_inode(trans, inode, snapshot));
+	bch_err_fn(trans->c, ret);
+	return ret;
+}
+
 struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k)
 {
 	struct bch_inode_unpacked u;
@@ -524,8 +552,8 @@ static void __bch2_inode_unpacked_to_text(struct printbuf *out,
 	prt_printf(out, "bi_sectors=%llu", inode->bi_sectors);
 	prt_newline(out);
 
-	prt_newline(out);
 	prt_printf(out, "bi_version=%llu", inode->bi_version);
+	prt_newline(out);
 
 #define x(_name, _bits)						\
 	prt_printf(out, #_name "=%llu", (u64) inode->_name);	\
@@ -592,7 +620,8 @@ int bch2_trigger_inode(struct btree_trans *trans,
 		bool old_deleted = bkey_is_deleted_inode(old);
 		bool new_deleted = bkey_is_deleted_inode(new.s_c);
 		if (old_deleted != new_deleted) {
-			int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new.k->p, new_deleted);
+			int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes,
+							      new.k->p, new_deleted);
 			if (ret)
 				return ret;
 		}
@@ -1088,8 +1117,9 @@ static int may_delete_deleted_inode(struct btree_trans *trans,
 		goto out;
 
 	if (S_ISDIR(inode.bi_mode)) {
-		ret = bch2_empty_dir_snapshot(trans, pos.offset, pos.snapshot);
-		if (fsck_err_on(ret == -ENOTEMPTY, c, deleted_inode_is_dir,
+		ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot);
+		if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY),
+				c, deleted_inode_is_dir,
 				"non empty directory %llu:%u in deleted_inodes btree",
 				pos.offset, pos.snapshot))
 			goto delete;
@@ -1141,7 +1171,7 @@ fsck_err:
 	bch2_trans_iter_exit(trans, &inode_iter);
 	return ret;
 delete:
-	ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, pos, false);
+	ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_deleted_inodes, pos, false);
 	goto out;
 }
 
@@ -1151,6 +1181,15 @@ int bch2_delete_dead_inodes(struct bch_fs *c)
 	bool need_another_pass;
 	int ret;
 again:
+	/*
+	 * if we ran check_inodes() unlinked inodes will have already been
+	 * cleaned up but the write buffer will be out of sync; therefore we
+	 * alway need a write buffer flush
+	 */
+	ret = bch2_btree_write_buffer_flush_sync(trans);
+	if (ret)
+		goto err;
+
 	need_another_pass = false;
 
 	/*
@@ -1183,12 +1222,8 @@ again:
 		ret;
 	}));
 
-	if (!ret && need_another_pass) {
-		ret = bch2_btree_write_buffer_flush_sync(trans);
-		if (ret)
-			goto err;
+	if (!ret && need_another_pass)
 		goto again;
-	}
 err:
 	bch2_trans_put(trans);
 	return ret;
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index b63f312581cf..056298050550 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -95,6 +95,8 @@ struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *);
 
 void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *);
 
+int bch2_inode_peek_nowarn(struct btree_trans *, struct btree_iter *,
+		    struct bch_inode_unpacked *, subvol_inum, unsigned);
 int bch2_inode_peek(struct btree_trans *, struct btree_iter *,
 		    struct bch_inode_unpacked *, subvol_inum, unsigned);
 
@@ -108,6 +110,9 @@ static inline int bch2_inode_write(struct btree_trans *trans,
 	return bch2_inode_write_flags(trans, iter, inode, 0);
 }
 
+int __bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32);
+int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *, u32);
+
 void bch2_inode_init_early(struct bch_fs *,
 			   struct bch_inode_unpacked *);
 void bch2_inode_init_late(struct bch_inode_unpacked *, u64,
@@ -172,6 +177,20 @@ static inline u8 inode_d_type(struct bch_inode_unpacked *inode)
 	return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode);
 }
 
+static inline u32 bch2_inode_flags(struct bkey_s_c k)
+{
+	switch (k.k->type) {
+	case KEY_TYPE_inode:
+		return le32_to_cpu(bkey_s_c_to_inode(k).v->bi_flags);
+	case KEY_TYPE_inode_v2:
+		return le64_to_cpu(bkey_s_c_to_inode_v2(k).v->bi_flags);
+	case KEY_TYPE_inode_v3:
+		return le64_to_cpu(bkey_s_c_to_inode_v3(k).v->bi_flags);
+	default:
+		return 0;
+	}
+}
+
 /* i_nlink: */
 
 static inline unsigned nlink_bias(umode_t mode)
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index 1baf78594cca..82f9170dab3f 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -264,6 +264,7 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
 		ret = 0;
 err:
 	bch2_logged_op_finish(trans, op_k);
+	bch_err_fn(c, ret);
 	return ret;
 }
 
@@ -476,6 +477,7 @@ case LOGGED_OP_FINSERT_finish:
 	break;
 	}
 err:
+	bch_err_fn(c, ret);
 	bch2_logged_op_finish(trans, op_k);
 	bch2_trans_iter_exit(trans, &iter);
 	return ret;
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index 3c574d8873a1..8a556e6d1ab6 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -174,7 +174,7 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
 	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
 		return ERR_PTR(-BCH_ERR_nopromote_no_writes);
 
-	op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_KERNEL);
+	op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL);
 	if (!op) {
 		ret = -BCH_ERR_nopromote_enomem;
 		goto err;
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 2c098ac017b3..f137252bccc5 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -88,7 +88,7 @@ void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw)
 
 	bch2_congested_acct(ca, io_latency, now, rw);
 
-	__bch2_time_stats_update(&ca->io_latency[rw], submit_time, now);
+	__bch2_time_stats_update(&ca->io_latency[rw].stats, submit_time, now);
 }
 
 #endif
@@ -530,7 +530,8 @@ static void __bch2_write_index(struct bch_write_op *op)
 
 			bch_err_inum_offset_ratelimited(c,
 				insert->k.p.inode, insert->k.p.offset << 9,
-				"write error while doing btree update: %s",
+				"%s write error while doing btree update: %s",
+				op->flags & BCH_WRITE_MOVE ? "move" : "user",
 				bch2_err_str(ret));
 		}
 
@@ -1067,7 +1068,8 @@ do_write:
 	*_dst = dst;
 	return more;
 csum_err:
-	bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)");
+	bch_err(c, "%s writ error: error verifying existing checksum while rewriting existing data (memory corruption?)",
+		op->flags & BCH_WRITE_MOVE ? "move" : "user");
 	ret = -EIO;
 err:
 	if (to_wbio(dst)->bounce)
@@ -1169,7 +1171,8 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
 
 			bch_err_inum_offset_ratelimited(c,
 				insert->k.p.inode, insert->k.p.offset << 9,
-				"write error while doing btree update: %s",
+				"%s write error while doing btree update: %s",
+				op->flags & BCH_WRITE_MOVE ? "move" : "user",
 				bch2_err_str(ret));
 		}
 
@@ -1449,7 +1452,9 @@ err:
 					bch_err_inum_offset_ratelimited(c,
 						op->pos.inode,
 						op->pos.offset << 9,
-						"%s(): error: %s", __func__, bch2_err_str(ret));
+						"%s(): %s error: %s", __func__,
+						op->flags & BCH_WRITE_MOVE ? "move" : "user",
+						bch2_err_str(ret));
 				op->error = ret;
 				break;
 			}
@@ -1573,7 +1578,8 @@ CLOSURE_CALLBACK(bch2_write)
 		bch_err_inum_offset_ratelimited(c,
 			op->pos.inode,
 			op->pos.offset << 9,
-			"misaligned write");
+			"%s write error: misaligned write",
+			op->flags & BCH_WRITE_MOVE ? "move" : "user");
 		op->error = -EIO;
 		goto err;
 	}
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index bc890776eb57..9c9a25dbd613 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -27,33 +27,71 @@ static const char * const bch2_journal_errors[] = {
 	NULL
 };
 
+static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
+{
+	return seq > j->seq_ondisk;
+}
+
+static bool __journal_entry_is_open(union journal_res_state state)
+{
+	return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
+}
+
+static inline unsigned nr_unwritten_journal_entries(struct journal *j)
+{
+	return atomic64_read(&j->seq) - j->seq_ondisk;
+}
+
+static bool journal_entry_is_open(struct journal *j)
+{
+	return __journal_entry_is_open(j->reservations);
+}
+
 static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq)
 {
 	union journal_res_state s = READ_ONCE(j->reservations);
 	unsigned i = seq & JOURNAL_BUF_MASK;
 	struct journal_buf *buf = j->buf + i;
 
-	prt_printf(out, "seq:");
+	prt_str(out, "seq:");
 	prt_tab(out);
 	prt_printf(out, "%llu", seq);
 	prt_newline(out);
 	printbuf_indent_add(out, 2);
 
-	prt_printf(out, "refcount:");
+	prt_str(out, "refcount:");
 	prt_tab(out);
 	prt_printf(out, "%u", journal_state_count(s, i));
 	prt_newline(out);
 
-	prt_printf(out, "size:");
+	prt_str(out, "size:");
 	prt_tab(out);
 	prt_human_readable_u64(out, vstruct_bytes(buf->data));
 	prt_newline(out);
 
-	prt_printf(out, "expires");
+	prt_str(out, "expires:");
 	prt_tab(out);
 	prt_printf(out, "%li jiffies", buf->expires - jiffies);
 	prt_newline(out);
 
+	prt_str(out, "flags:");
+	prt_tab(out);
+	if (buf->noflush)
+		prt_str(out, "noflush ");
+	if (buf->must_flush)
+		prt_str(out, "must_flush ");
+	if (buf->separate_flush)
+		prt_str(out, "separate_flush ");
+	if (buf->need_flush_to_write_buffer)
+		prt_str(out, "need_flush_to_write_buffer ");
+	if (buf->write_started)
+		prt_str(out, "write_started ");
+	if (buf->write_allocated)
+		prt_str(out, "write allocated ");
+	if (buf->write_done)
+		prt_str(out, "write done");
+	prt_newline(out);
+
 	printbuf_indent_sub(out, 2);
 }
 
@@ -66,26 +104,7 @@ static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
 	     seq <= journal_cur_seq(j);
 	     seq++)
 		bch2_journal_buf_to_text(out, j, seq);
-}
-
-static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
-{
-	return seq > j->seq_ondisk;
-}
-
-static bool __journal_entry_is_open(union journal_res_state state)
-{
-	return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL;
-}
-
-static inline unsigned nr_unwritten_journal_entries(struct journal *j)
-{
-	return atomic64_read(&j->seq) - j->seq_ondisk;
-}
-
-static bool journal_entry_is_open(struct journal *j)
-{
-	return __journal_entry_is_open(j->reservations);
+	prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed");
 }
 
 static inline struct journal_buf *
@@ -174,21 +193,40 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags)
 	return stuck;
 }
 
+void bch2_journal_do_writes(struct journal *j)
+{
+	for (u64 seq = journal_last_unwritten_seq(j);
+	     seq <= journal_cur_seq(j);
+	     seq++) {
+		unsigned idx = seq & JOURNAL_BUF_MASK;
+		struct journal_buf *w = j->buf + idx;
+
+		if (w->write_started && !w->write_allocated)
+			break;
+		if (w->write_started)
+			continue;
+
+		if (!journal_state_count(j->reservations, idx)) {
+			w->write_started = true;
+			closure_call(&w->io, bch2_journal_write, j->wq, NULL);
+		}
+
+		break;
+	}
+}
+
 /*
  * Final processing when the last reference of a journal buffer has been
  * dropped. Drop the pin list reference acquired at journal entry open and write
  * the buffer, if requested.
  */
-void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write)
+void bch2_journal_buf_put_final(struct journal *j, u64 seq)
 {
-	struct bch_fs *c = container_of(j, struct bch_fs, journal);
-
 	lockdep_assert_held(&j->lock);
 
 	if (__bch2_journal_pin_put(j, seq))
 		bch2_journal_reclaim_fast(j);
-	if (write)
-		closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
+	bch2_journal_do_writes(j);
 }
 
 /*
@@ -380,11 +418,14 @@ static int journal_entry_open(struct journal *j)
 	BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf);
 
 	bkey_extent_init(&buf->key);
-	buf->noflush	= false;
-	buf->must_flush	= false;
-	buf->separate_flush = false;
-	buf->flush_time	= 0;
+	buf->noflush		= false;
+	buf->must_flush		= false;
+	buf->separate_flush	= false;
+	buf->flush_time		= 0;
 	buf->need_flush_to_write_buffer = true;
+	buf->write_started	= false;
+	buf->write_allocated	= false;
+	buf->write_done		= false;
 
 	memset(buf->data, 0, sizeof(*buf->data));
 	buf->data->seq	= cpu_to_le64(journal_cur_seq(j));
@@ -418,9 +459,10 @@ static int journal_entry_open(struct journal *j)
 	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
 				       old.v, new.v)) != old.v);
 
-	mod_delayed_work(c->io_complete_wq,
-			 &j->write_work,
-			 msecs_to_jiffies(c->opts.journal_flush_delay));
+	if (nr_unwritten_journal_entries(j) == 1)
+		mod_delayed_work(j->wq,
+				 &j->write_work,
+				 msecs_to_jiffies(c->opts.journal_flush_delay));
 	journal_wake(j);
 
 	if (j->early_journal_entries.nr)
@@ -445,20 +487,16 @@ static void journal_quiesce(struct journal *j)
 static void journal_write_work(struct work_struct *work)
 {
 	struct journal *j = container_of(work, struct journal, write_work.work);
-	struct bch_fs *c = container_of(j, struct bch_fs, journal);
-	long delta;
 
 	spin_lock(&j->lock);
-	if (!__journal_entry_is_open(j->reservations))
-		goto unlock;
+	if (__journal_entry_is_open(j->reservations)) {
+		long delta = journal_cur_buf(j)->expires - jiffies;
 
-	delta = journal_cur_buf(j)->expires - jiffies;
-
-	if (delta > 0)
-		mod_delayed_work(c->io_complete_wq, &j->write_work, delta);
-	else
-		__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
-unlock:
+		if (delta > 0)
+			mod_delayed_work(j->wq, &j->write_work, delta);
+		else
+			__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
+	}
 	spin_unlock(&j->lock);
 }
 
@@ -476,30 +514,29 @@ retry:
 	if (bch2_journal_error(j))
 		return -BCH_ERR_erofs_journal_err;
 
-	spin_lock(&j->lock);
+	if (j->blocked)
+		return -BCH_ERR_journal_res_get_blocked;
 
-	/* check once more in case somebody else shut things down... */
-	if (bch2_journal_error(j)) {
-		spin_unlock(&j->lock);
-		return -BCH_ERR_erofs_journal_err;
+	if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
+		ret = JOURNAL_ERR_journal_full;
+		can_discard = j->can_discard;
+		goto out;
+	}
+
+	if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) {
+		ret = JOURNAL_ERR_max_in_flight;
+		goto out;
 	}
 
+	spin_lock(&j->lock);
+
 	/*
 	 * Recheck after taking the lock, so we don't race with another thread
 	 * that just did journal_entry_open() and call bch2_journal_entry_close()
 	 * unnecessarily
 	 */
 	if (journal_res_get_fast(j, res, flags)) {
-		spin_unlock(&j->lock);
-		return 0;
-	}
-
-	if ((flags & BCH_WATERMARK_MASK) < j->watermark) {
-		/*
-		 * Don't want to close current journal entry, just need to
-		 * invoke reclaim:
-		 */
-		ret = JOURNAL_ERR_journal_full;
+		ret = 0;
 		goto unlock;
 	}
 
@@ -515,30 +552,30 @@ retry:
 		j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
 
 	__journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false);
-	ret = journal_entry_open(j);
-
-	if (ret == JOURNAL_ERR_max_in_flight) {
-		track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
-				   &j->max_in_flight_start, true);
-		if (trace_journal_entry_full_enabled()) {
-			struct printbuf buf = PRINTBUF;
-			buf.atomic++;
-
-			bch2_journal_bufs_to_text(&buf, j);
-			trace_journal_entry_full(c, buf.buf);
-			printbuf_exit(&buf);
-		}
-		count_event(c, journal_entry_full);
-	}
+	ret = journal_entry_open(j) ?: JOURNAL_ERR_retry;
 unlock:
 	can_discard = j->can_discard;
 	spin_unlock(&j->lock);
-
-	if (!ret)
+out:
+	if (ret == JOURNAL_ERR_retry)
 		goto retry;
+	if (!ret)
+		return 0;
+
 	if (journal_error_check_stuck(j, ret, flags))
 		ret = -BCH_ERR_journal_res_get_blocked;
 
+	if (ret == JOURNAL_ERR_max_in_flight &&
+	    track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) {
+
+		struct printbuf buf = PRINTBUF;
+		prt_printf(&buf, "seq %llu\n", journal_cur_seq(j));
+		bch2_journal_bufs_to_text(&buf, j);
+		trace_journal_entry_full(c, buf.buf);
+		printbuf_exit(&buf);
+		count_event(c, journal_entry_full);
+	}
+
 	/*
 	 * Journal is full - can't rely on reclaim from work item due to
 	 * freezing:
@@ -674,7 +711,7 @@ recheck_need_open:
 			return ret;
 
 		seq = res.seq;
-		buf = j->buf + (seq & JOURNAL_BUF_MASK);
+		buf = journal_seq_to_buf(j, seq);
 		buf->must_flush = true;
 
 		if (!buf->flush_time) {
@@ -692,8 +729,8 @@ recheck_need_open:
 	}
 
 	/*
-	 * if write was kicked off without a flush, flush the next sequence
-	 * number instead
+	 * if write was kicked off without a flush, or if we promised it
+	 * wouldn't be a flush, flush the next sequence number instead
 	 */
 	buf = journal_seq_to_buf(j, seq);
 	if (buf->noflush) {
@@ -771,8 +808,8 @@ bool bch2_journal_noflush_seq(struct journal *j, u64 seq)
 	     unwritten_seq++) {
 		struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq);
 
-		/* journal write is already in flight, and was a flush write: */
-		if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush)
+		/* journal flush already in flight, or flush requseted */
+		if (buf->must_flush)
 			goto out;
 
 		buf->noflush = true;
@@ -1157,13 +1194,12 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
 	struct journal_replay *i, **_i;
 	struct genradix_iter iter;
 	bool had_entries = false;
-	unsigned ptr;
 	u64 last_seq = cur_seq, nr, seq;
 
 	genradix_for_each_reverse(&c->journal_entries, iter, _i) {
 		i = *_i;
 
-		if (!i || i->ignore)
+		if (journal_replay_ignore(i))
 			continue;
 
 		last_seq = le64_to_cpu(i->j.last_seq);
@@ -1196,7 +1232,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
 	genradix_for_each(&c->journal_entries, iter, _i) {
 		i = *_i;
 
-		if (!i || i->ignore)
+		if (journal_replay_ignore(i))
 			continue;
 
 		seq = le64_to_cpu(i->j.seq);
@@ -1211,8 +1247,8 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
 		p = journal_seq_pin(j, seq);
 
 		p->devs.nr = 0;
-		for (ptr = 0; ptr < i->nr_ptrs; ptr++)
-			bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev);
+		darray_for_each(i->ptrs, ptr)
+			bch2_dev_list_add_dev(&p->devs, ptr->dev);
 
 		had_entries = true;
 	}
@@ -1240,13 +1276,17 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq)
 
 void bch2_dev_journal_exit(struct bch_dev *ca)
 {
-	kfree(ca->journal.bio);
-	kfree(ca->journal.buckets);
-	kfree(ca->journal.bucket_seq);
+	struct journal_device *ja = &ca->journal;
+
+	for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
+		kfree(ja->bio[i]);
+		ja->bio[i] = NULL;
+	}
 
-	ca->journal.bio		= NULL;
-	ca->journal.buckets	= NULL;
-	ca->journal.bucket_seq	= NULL;
+	kfree(ja->buckets);
+	kfree(ja->bucket_seq);
+	ja->buckets	= NULL;
+	ja->bucket_seq	= NULL;
 }
 
 int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
@@ -1256,14 +1296,13 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
 		bch2_sb_field_get(sb, journal);
 	struct bch_sb_field_journal_v2 *journal_buckets_v2 =
 		bch2_sb_field_get(sb, journal_v2);
-	unsigned i, nr_bvecs;
 
 	ja->nr = 0;
 
 	if (journal_buckets_v2) {
 		unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
 
-		for (i = 0; i < nr; i++)
+		for (unsigned i = 0; i < nr; i++)
 			ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr);
 	} else if (journal_buckets) {
 		ja->nr = bch2_nr_journal_buckets(journal_buckets);
@@ -1273,13 +1312,18 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
 	if (!ja->bucket_seq)
 		return -BCH_ERR_ENOMEM_dev_journal_init;
 
-	nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
+	unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE);
 
-	ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL);
-	if (!ca->journal.bio)
-		return -BCH_ERR_ENOMEM_dev_journal_init;
+	for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) {
+		ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs,
+				     nr_bvecs), GFP_KERNEL);
+		if (!ja->bio[i])
+			return -BCH_ERR_ENOMEM_dev_journal_init;
 
-	bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0);
+		ja->bio[i]->ca = ca;
+		ja->bio[i]->buf_idx = i;
+		bio_init(&ja->bio[i]->bio, NULL, ja->bio[i]->bio.bi_inline_vecs, nr_bvecs, 0);
+	}
 
 	ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);
 	if (!ja->buckets)
@@ -1287,14 +1331,14 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
 
 	if (journal_buckets_v2) {
 		unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2);
-		unsigned j, dst = 0;
+		unsigned dst = 0;
 
-		for (i = 0; i < nr; i++)
-			for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
+		for (unsigned i = 0; i < nr; i++)
+			for (unsigned j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++)
 				ja->buckets[dst++] =
 					le64_to_cpu(journal_buckets_v2->d[i].start) + j;
 	} else if (journal_buckets) {
-		for (i = 0; i < ja->nr; i++)
+		for (unsigned i = 0; i < ja->nr; i++)
 			ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]);
 	}
 
@@ -1303,19 +1347,19 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb)
 
 void bch2_fs_journal_exit(struct journal *j)
 {
-	unsigned i;
+	if (j->wq)
+		destroy_workqueue(j->wq);
 
 	darray_exit(&j->early_journal_entries);
 
-	for (i = 0; i < ARRAY_SIZE(j->buf); i++)
-		kvpfree(j->buf[i].data, j->buf[i].buf_size);
+	for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++)
+		kvfree(j->buf[i].data);
 	free_fifo(&j->pin);
 }
 
 int bch2_fs_journal_init(struct journal *j)
 {
 	static struct lock_class_key res_key;
-	unsigned i;
 
 	mutex_init(&j->buf_lock);
 	spin_lock_init(&j->lock);
@@ -1336,14 +1380,20 @@ int bch2_fs_journal_init(struct journal *j)
 	if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)))
 		return -BCH_ERR_ENOMEM_journal_pin_fifo;
 
-	for (i = 0; i < ARRAY_SIZE(j->buf); i++) {
+	for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) {
 		j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN;
-		j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL);
+		j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL);
 		if (!j->buf[i].data)
 			return -BCH_ERR_ENOMEM_journal_buf;
+		j->buf[i].idx = i;
 	}
 
 	j->pin.front = j->pin.back = 1;
+
+	j->wq = alloc_workqueue("bcachefs_journal",
+				WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512);
+	if (!j->wq)
+		return -BCH_ERR_ENOMEM_fs_other_alloc;
 	return 0;
 }
 
@@ -1381,6 +1431,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
 	prt_printf(out, "reclaim kicked:\t\t%u\n",		j->reclaim_kicked);
 	prt_printf(out, "reclaim runs in:\t%u ms\n",		time_after(j->next_reclaim, now)
 	       ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
+	prt_printf(out, "blocked:\t\t%u\n",			j->blocked);
 	prt_printf(out, "current entry sectors:\t%u\n",		j->cur_entry_sectors);
 	prt_printf(out, "current entry error:\t%s\n",		bch2_journal_errors[j->cur_entry_error]);
 	prt_printf(out, "current entry:\t\t");
@@ -1455,7 +1506,6 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
 {
 	struct journal_entry_pin_list *pin_list;
 	struct journal_entry_pin *pin;
-	unsigned i;
 
 	spin_lock(&j->lock);
 	*seq = max(*seq, j->pin.front);
@@ -1473,7 +1523,7 @@ bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64
 	prt_newline(out);
 	printbuf_indent_add(out, 2);
 
-	for (i = 0; i < ARRAY_SIZE(pin_list->list); i++)
+	for (unsigned i = 0; i < ARRAY_SIZE(pin_list->list); i++)
 		list_for_each_entry(pin, &pin_list->list[i], list) {
 			prt_printf(out, "\t%px %ps", pin, pin->flush);
 			prt_newline(out);
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 4544ce24bb8a..7c7528f839c5 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -264,7 +264,8 @@ static inline union journal_res_state journal_state_buf_put(struct journal *j, u
 }
 
 bool bch2_journal_entry_close(struct journal *);
-void bch2_journal_buf_put_final(struct journal *, u64, bool);
+void bch2_journal_do_writes(struct journal *);
+void bch2_journal_buf_put_final(struct journal *, u64);
 
 static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
 {
@@ -272,7 +273,7 @@ static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 s
 
 	s = journal_state_buf_put(j, idx);
 	if (!journal_state_count(s, idx))
-		bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+		bch2_journal_buf_put_final(j, seq);
 }
 
 static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq)
@@ -282,7 +283,7 @@ static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq
 	s = journal_state_buf_put(j, idx);
 	if (!journal_state_count(s, idx)) {
 		spin_lock(&j->lock);
-		bch2_journal_buf_put_final(j, seq, idx == s.unwritten_idx);
+		bch2_journal_buf_put_final(j, seq);
 		spin_unlock(&j->lock);
 	}
 }
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 47805193f18c..9aa28b52ab92 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -17,6 +17,37 @@
 #include "sb-clean.h"
 #include "trace.h"
 
+void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
+			       struct journal_replay *j)
+{
+	darray_for_each(j->ptrs, i) {
+		struct bch_dev *ca = bch_dev_bkey_exists(c, i->dev);
+		u64 offset;
+
+		div64_u64_rem(i->sector, ca->mi.bucket_size, &offset);
+
+		if (i != j->ptrs.data)
+			prt_printf(out, " ");
+		prt_printf(out, "%u:%u:%u (sector %llu)",
+			   i->dev, i->bucket, i->bucket_offset, i->sector);
+	}
+}
+
+static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c,
+					struct journal_replay *j)
+{
+	prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq));
+
+	bch2_journal_ptrs_to_text(out, c, j);
+
+	for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) {
+		struct jset_entry_datetime *datetime =
+			container_of(entry, struct jset_entry_datetime, entry);
+		bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
+		break;
+	}
+}
+
 static struct nonce journal_nonce(const struct jset *jset)
 {
 	return (struct nonce) {{
@@ -52,13 +83,15 @@ static void __journal_replay_free(struct bch_fs *c,
 
 	BUG_ON(*p != i);
 	*p = NULL;
-	kvpfree(i, offsetof(struct journal_replay, j) +
-		vstruct_bytes(&i->j));
+	kvfree(i);
 }
 
-static void journal_replay_free(struct bch_fs *c, struct journal_replay *i)
+static void journal_replay_free(struct bch_fs *c, struct journal_replay *i, bool blacklisted)
 {
-	i->ignore = true;
+	if (blacklisted)
+		i->ignore_blacklisted = true;
+	else
+		i->ignore_not_dirty = true;
 
 	if (!c->opts.read_entire_journal)
 		__journal_replay_free(c, i);
@@ -84,9 +117,9 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
 {
 	struct genradix_iter iter;
 	struct journal_replay **_i, *i, *dup;
-	struct journal_ptr *ptr;
 	size_t bytes = vstruct_bytes(j);
 	u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0;
+	struct printbuf buf = PRINTBUF;
 	int ret = JOURNAL_ENTRY_ADD_OK;
 
 	/* Is this entry older than the range we need? */
@@ -108,12 +141,13 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
 				       journal_entry_radix_idx(c, jlist->last_seq)) {
 			i = *_i;
 
-			if (!i || i->ignore)
+			if (journal_replay_ignore(i))
 				continue;
 
 			if (le64_to_cpu(i->j.seq) >= last_seq)
 				break;
-			journal_replay_free(c, i);
+
+			journal_replay_free(c, i, false);
 		}
 	}
 
@@ -131,72 +165,62 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca,
 	 */
 	dup = *_i;
 	if (dup) {
-		if (bytes == vstruct_bytes(&dup->j) &&
-		    !memcmp(j, &dup->j, bytes)) {
-			i = dup;
-			goto found;
-		}
+		bool identical = bytes == vstruct_bytes(&dup->j) &&
+			!memcmp(j, &dup->j, bytes);
+		bool not_identical = !identical &&
+			entry_ptr.csum_good &&
+			dup->csum_good;
+
+		bool same_device = false;
+		darray_for_each(dup->ptrs, ptr)
+			if (ptr->dev == ca->dev_idx)
+				same_device = true;
+
+		ret = darray_push(&dup->ptrs, entry_ptr);
+		if (ret)
+			goto out;
 
-		if (!entry_ptr.csum_good) {
-			i = dup;
-			goto found;
-		}
+		bch2_journal_replay_to_text(&buf, c, dup);
 
-		if (!dup->csum_good)
+		fsck_err_on(same_device,
+			    c, journal_entry_dup_same_device,
+			    "duplicate journal entry on same device\n  %s",
+			    buf.buf);
+
+		fsck_err_on(not_identical,
+			    c, journal_entry_replicas_data_mismatch,
+			    "found duplicate but non identical journal entries\n  %s",
+			    buf.buf);
+
+		if (entry_ptr.csum_good && !identical)
 			goto replace;
 
-		fsck_err(c, journal_entry_replicas_data_mismatch,
-			 "found duplicate but non identical journal entries (seq %llu)",
-			 le64_to_cpu(j->seq));
-		i = dup;
-		goto found;
+		goto out;
 	}
 replace:
-	i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
+	i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL);
 	if (!i)
 		return -BCH_ERR_ENOMEM_journal_entry_add;
 
-	i->nr_ptrs	= 0;
-	i->csum_good	= entry_ptr.csum_good;
-	i->ignore	= false;
+	darray_init(&i->ptrs);
+	i->csum_good		= entry_ptr.csum_good;
+	i->ignore_blacklisted	= false;
+	i->ignore_not_dirty	= false;
 	unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct");
-	i->ptrs[i->nr_ptrs++] = entry_ptr;
 
 	if (dup) {
-		if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) {
-			bch_err(c, "found too many copies of journal entry %llu",
-				le64_to_cpu(i->j.seq));
-			dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1;
-		}
-
 		/* The first ptr should represent the jset we kept: */
-		memcpy(i->ptrs + i->nr_ptrs,
-		       dup->ptrs,
-		       sizeof(dup->ptrs[0]) * dup->nr_ptrs);
-		i->nr_ptrs += dup->nr_ptrs;
+		darray_for_each(dup->ptrs, ptr)
+			darray_push(&i->ptrs, *ptr);
 		__journal_replay_free(c, dup);
+	} else {
+		darray_push(&i->ptrs, entry_ptr);
 	}
 
 	*_i = i;
-	return 0;
-found:
-	for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) {
-		if (ptr->dev == ca->dev_idx) {
-			bch_err(c, "duplicate journal entry %llu on same device",
-				le64_to_cpu(i->j.seq));
-			goto out;
-		}
-	}
-
-	if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) {
-		bch_err(c, "found too many copies of journal entry %llu",
-			le64_to_cpu(i->j.seq));
-		goto out;
-	}
-
-	i->ptrs[i->nr_ptrs++] = entry_ptr;
 out:
 fsck_err:
+	printbuf_exit(&buf);
 	return ret;
 }
 
@@ -223,7 +247,7 @@ static void journal_entry_err_msg(struct printbuf *out,
 
 	if (entry) {
 		prt_str(out, " type=");
-		prt_str(out, bch2_jset_entry_types[entry->type]);
+		bch2_prt_jset_entry_type(out, entry->type);
 	}
 
 	if (!jset) {
@@ -374,13 +398,13 @@ static int journal_entry_btree_keys_validate(struct bch_fs *c,
 static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c,
 					     struct jset_entry *entry)
 {
-	struct bkey_i *k;
 	bool first = true;
 
 	jset_entry_for_each_key(entry, k) {
 		if (!first) {
 			prt_newline(out);
-			prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+			bch2_prt_jset_entry_type(out, entry->type);
+			prt_str(out, ": ");
 		}
 		prt_printf(out, "btree=%s l=%u ", bch2_btree_id_str(entry->btree_id), entry->level);
 		bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k));
@@ -540,9 +564,9 @@ static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c,
 	struct jset_entry_usage *u =
 		container_of(entry, struct jset_entry_usage, entry);
 
-	prt_printf(out, "type=%s v=%llu",
-	       bch2_fs_usage_types[u->entry.btree_id],
-	       le64_to_cpu(u->v));
+	prt_str(out, "type=");
+	bch2_prt_fs_usage_type(out, u->entry.btree_id);
+	prt_printf(out, " v=%llu", le64_to_cpu(u->v));
 }
 
 static int journal_entry_data_usage_validate(struct bch_fs *c,
@@ -741,6 +765,37 @@ static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct
 	journal_entry_btree_keys_to_text(out, c, entry);
 }
 
+static int journal_entry_datetime_validate(struct bch_fs *c,
+				struct jset *jset,
+				struct jset_entry *entry,
+				unsigned version, int big_endian,
+				enum bkey_invalid_flags flags)
+{
+	unsigned bytes = vstruct_bytes(entry);
+	unsigned expected = 16;
+	int ret = 0;
+
+	if (journal_entry_err_on(vstruct_bytes(entry) < expected,
+				 c, version, jset, entry,
+				 journal_entry_dev_usage_bad_size,
+				 "bad size (%u < %u)",
+				 bytes, expected)) {
+		journal_entry_null_range(entry, vstruct_next(entry));
+		return ret;
+	}
+fsck_err:
+	return ret;
+}
+
+static void journal_entry_datetime_to_text(struct printbuf *out, struct bch_fs *c,
+					    struct jset_entry *entry)
+{
+	struct jset_entry_datetime *datetime =
+		container_of(entry, struct jset_entry_datetime, entry);
+
+	bch2_prt_datetime(out, le64_to_cpu(datetime->seconds));
+}
+
 struct jset_entry_ops {
 	int (*validate)(struct bch_fs *, struct jset *,
 			struct jset_entry *, unsigned, int,
@@ -773,11 +828,11 @@ int bch2_journal_entry_validate(struct bch_fs *c,
 void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
 				struct jset_entry *entry)
 {
+	bch2_prt_jset_entry_type(out, entry->type);
+
 	if (entry->type < BCH_JSET_ENTRY_NR) {
-		prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]);
+		prt_str(out, ": ");
 		bch2_jset_entry_ops[entry->type].to_text(out, c, entry);
-	} else {
-		prt_printf(out, "(unknown type %u)", entry->type);
 	}
 }
 
@@ -913,11 +968,11 @@ static int journal_read_buf_realloc(struct journal_read_buf *b,
 		return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
 
 	new_size = roundup_pow_of_two(new_size);
-	n = kvpmalloc(new_size, GFP_KERNEL);
+	n = kvmalloc(new_size, GFP_KERNEL);
 	if (!n)
 		return -BCH_ERR_ENOMEM_journal_read_buf_realloc;
 
-	kvpfree(b->data, b->size);
+	kvfree(b->data);
 	b->data = n;
 	b->size = new_size;
 	return 0;
@@ -1028,9 +1083,7 @@ reread:
 		ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
 			     j->encrypted_start,
 			     vstruct_end(j) - (void *) j->encrypted_start);
-		bch2_fs_fatal_err_on(ret, c,
-				"error decrypting journal entry: %s",
-				bch2_err_str(ret));
+		bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret));
 
 		mutex_lock(&jlist->lock);
 		ret = journal_entry_add(c, ca, (struct journal_ptr) {
@@ -1102,16 +1155,15 @@ static CLOSURE_CALLBACK(bch2_journal_read_device)
 		if (!r)
 			continue;
 
-		for (i = 0; i < r->nr_ptrs; i++) {
-			if (r->ptrs[i].dev == ca->dev_idx) {
-				unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) +
+		darray_for_each(r->ptrs, i)
+			if (i->dev == ca->dev_idx) {
+				unsigned wrote = bucket_remainder(ca, i->sector) +
 					vstruct_sectors(&r->j, c->block_bits);
 
-				ja->cur_idx = r->ptrs[i].bucket;
+				ja->cur_idx = i->bucket;
 				ja->sectors_free = ca->mi.bucket_size - wrote;
 				goto found;
 			}
-		}
 	}
 found:
 	mutex_unlock(&jlist->lock);
@@ -1144,7 +1196,7 @@ found:
 		ja->dirty_idx = (ja->cur_idx + 1) % ja->nr;
 out:
 	bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret);
-	kvpfree(buf.data, buf.size);
+	kvfree(buf.data);
 	percpu_ref_put(&ca->io_ref);
 	closure_return(cl);
 	return;
@@ -1155,27 +1207,6 @@ err:
 	goto out;
 }
 
-void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
-			       struct journal_replay *j)
-{
-	unsigned i;
-
-	for (i = 0; i < j->nr_ptrs; i++) {
-		struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev);
-		u64 offset;
-
-		div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset);
-
-		if (i)
-			prt_printf(out, " ");
-		prt_printf(out, "%u:%u:%u (sector %llu)",
-		       j->ptrs[i].dev,
-		       j->ptrs[i].bucket,
-		       j->ptrs[i].bucket_offset,
-		       j->ptrs[i].sector);
-	}
-}
-
 int bch2_journal_read(struct bch_fs *c,
 		      u64 *last_seq,
 		      u64 *blacklist_seq,
@@ -1228,20 +1259,20 @@ int bch2_journal_read(struct bch_fs *c,
 
 		i = *_i;
 
-		if (!i || i->ignore)
+		if (journal_replay_ignore(i))
 			continue;
 
 		if (!*start_seq)
 			*blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1;
 
 		if (JSET_NO_FLUSH(&i->j)) {
-			i->ignore = true;
+			i->ignore_blacklisted = true;
 			continue;
 		}
 
 		if (!last_write_torn && !i->csum_good) {
 			last_write_torn = true;
-			i->ignore = true;
+			i->ignore_blacklisted = true;
 			continue;
 		}
 
@@ -1280,12 +1311,12 @@ int bch2_journal_read(struct bch_fs *c,
 	genradix_for_each(&c->journal_entries, radix_iter, _i) {
 		i = *_i;
 
-		if (!i || i->ignore)
+		if (journal_replay_ignore(i))
 			continue;
 
 		seq = le64_to_cpu(i->j.seq);
 		if (seq < *last_seq) {
-			journal_replay_free(c, i);
+			journal_replay_free(c, i, false);
 			continue;
 		}
 
@@ -1293,7 +1324,7 @@ int bch2_journal_read(struct bch_fs *c,
 			fsck_err_on(!JSET_NO_FLUSH(&i->j), c,
 				    jset_seq_blacklisted,
 				    "found blacklisted journal entry %llu", seq);
-			i->ignore = true;
+			i->ignore_blacklisted = true;
 		}
 	}
 
@@ -1302,7 +1333,7 @@ int bch2_journal_read(struct bch_fs *c,
 	genradix_for_each(&c->journal_entries, radix_iter, _i) {
 		i = *_i;
 
-		if (!i || i->ignore)
+		if (journal_replay_ignore(i))
 			continue;
 
 		BUG_ON(seq > le64_to_cpu(i->j.seq));
@@ -1353,32 +1384,31 @@ int bch2_journal_read(struct bch_fs *c,
 			.e.data_type = BCH_DATA_journal,
 			.e.nr_required = 1,
 		};
-		unsigned ptr;
 
 		i = *_i;
-		if (!i || i->ignore)
+		if (journal_replay_ignore(i))
 			continue;
 
-		for (ptr = 0; ptr < i->nr_ptrs; ptr++) {
-			struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
+		darray_for_each(i->ptrs, ptr) {
+			struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
 
-			if (!i->ptrs[ptr].csum_good)
-				bch_err_dev_offset(ca, i->ptrs[ptr].sector,
+			if (!ptr->csum_good)
+				bch_err_dev_offset(ca, ptr->sector,
 						   "invalid journal checksum, seq %llu%s",
 						   le64_to_cpu(i->j.seq),
 						   i->csum_good ? " (had good copy on another device)" : "");
 		}
 
 		ret = jset_validate(c,
-				    bch_dev_bkey_exists(c, i->ptrs[0].dev),
+				    bch_dev_bkey_exists(c, i->ptrs.data[0].dev),
 				    &i->j,
-				    i->ptrs[0].sector,
+				    i->ptrs.data[0].sector,
 				    READ);
 		if (ret)
 			goto err;
 
-		for (ptr = 0; ptr < i->nr_ptrs; ptr++)
-			replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev;
+		darray_for_each(i->ptrs, ptr)
+			replicas.e.devs[replicas.e.nr_devs++] = ptr->dev;
 
 		bch2_replicas_entry_sort(&replicas.e);
 
@@ -1547,7 +1577,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
 	if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size))
 		return;
 
-	new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
+	new_buf = kvmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
 	if (!new_buf)
 		return;
 
@@ -1558,7 +1588,7 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
 	swap(buf->buf_size,	new_size);
 	spin_unlock(&j->lock);
 
-	kvpfree(new_buf, new_size);
+	kvfree(new_buf);
 }
 
 static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
@@ -1568,12 +1598,12 @@ static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j)
 
 static CLOSURE_CALLBACK(journal_write_done)
 {
-	closure_type(j, struct journal, io);
+	closure_type(w, struct journal_buf, io);
+	struct journal *j = container_of(w, struct journal, buf[w->idx]);
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
-	struct journal_buf *w = journal_last_unwritten_buf(j);
 	struct bch_replicas_padded replicas;
 	union journal_res_state old, new;
-	u64 v, seq;
+	u64 v, seq = le64_to_cpu(w->data->seq);
 	int err = 0;
 
 	bch2_time_stats_update(!JSET_NO_FLUSH(w->data)
@@ -1593,63 +1623,68 @@ static CLOSURE_CALLBACK(journal_write_done)
 	if (err)
 		bch2_fatal_error(c);
 
-	spin_lock(&j->lock);
-	seq = le64_to_cpu(w->data->seq);
+	closure_debug_destroy(cl);
 
+	spin_lock(&j->lock);
 	if (seq >= j->pin.front)
 		journal_seq_pin(j, seq)->devs = w->devs_written;
+	if (err && (!j->err_seq || seq < j->err_seq))
+		j->err_seq	= seq;
+	w->write_done = true;
 
-	if (!err) {
-		if (!JSET_NO_FLUSH(w->data)) {
+	bool completed = false;
+
+	for (seq = journal_last_unwritten_seq(j);
+	     seq <= journal_cur_seq(j);
+	     seq++) {
+		w = j->buf + (seq & JOURNAL_BUF_MASK);
+		if (!w->write_done)
+			break;
+
+		if (!j->err_seq && !JSET_NO_FLUSH(w->data)) {
 			j->flushed_seq_ondisk = seq;
 			j->last_seq_ondisk = w->last_seq;
 
 			bch2_do_discards(c);
 			closure_wake_up(&c->freelist_wait);
-
 			bch2_reset_alloc_cursors(c);
 		}
-	} else if (!j->err_seq || seq < j->err_seq)
-		j->err_seq	= seq;
 
-	j->seq_ondisk		= seq;
+		j->seq_ondisk = seq;
 
-	/*
-	 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
-	 * more buckets:
-	 *
-	 * Must come before signaling write completion, for
-	 * bch2_fs_journal_stop():
-	 */
-	if (j->watermark != BCH_WATERMARK_stripe)
-		journal_reclaim_kick(&c->journal);
+		/*
+		 * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard
+		 * more buckets:
+		 *
+		 * Must come before signaling write completion, for
+		 * bch2_fs_journal_stop():
+		 */
+		if (j->watermark != BCH_WATERMARK_stripe)
+			journal_reclaim_kick(&c->journal);
 
-	/* also must come before signalling write completion: */
-	closure_debug_destroy(cl);
+		v = atomic64_read(&j->reservations.counter);
+		do {
+			old.v = new.v = v;
+			BUG_ON(journal_state_count(new, new.unwritten_idx));
+			BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK));
 
-	v = atomic64_read(&j->reservations.counter);
-	do {
-		old.v = new.v = v;
-		BUG_ON(journal_state_count(new, new.unwritten_idx));
+			new.unwritten_idx++;
+		} while ((v = atomic64_cmpxchg(&j->reservations.counter, old.v, new.v)) != old.v);
 
-		new.unwritten_idx++;
-	} while ((v = atomic64_cmpxchg(&j->reservations.counter,
-				       old.v, new.v)) != old.v);
+		closure_wake_up(&w->wait);
+		completed = true;
+	}
 
-	bch2_journal_reclaim_fast(j);
-	bch2_journal_space_available(j);
+	if (completed) {
+		bch2_journal_reclaim_fast(j);
+		bch2_journal_space_available(j);
 
-	track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
-			   &j->max_in_flight_start, false);
+		track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], false);
 
-	closure_wake_up(&w->wait);
-	journal_wake(j);
+		journal_wake(j);
+	}
 
-	if (!journal_state_count(new, new.unwritten_idx) &&
-	    journal_last_unwritten_seq(j) <= journal_cur_seq(j)) {
-		spin_unlock(&j->lock);
-		closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL);
-	} else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
+	if (journal_last_unwritten_seq(j) == journal_cur_seq(j) &&
 		   new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) {
 		struct journal_buf *buf = journal_cur_buf(j);
 		long delta = buf->expires - jiffies;
@@ -1659,46 +1694,46 @@ static CLOSURE_CALLBACK(journal_write_done)
 		 * previous entries still in flight - the current journal entry
 		 * might want to be written now:
 		 */
-
-		spin_unlock(&j->lock);
-		mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta));
-	} else {
-		spin_unlock(&j->lock);
+		mod_delayed_work(j->wq, &j->write_work, max(0L, delta));
 	}
+
+	spin_unlock(&j->lock);
 }
 
 static void journal_write_endio(struct bio *bio)
 {
-	struct bch_dev *ca = bio->bi_private;
+	struct journal_bio *jbio = container_of(bio, struct journal_bio, bio);
+	struct bch_dev *ca = jbio->ca;
 	struct journal *j = &ca->fs->journal;
-	struct journal_buf *w = journal_last_unwritten_buf(j);
-	unsigned long flags;
+	struct journal_buf *w = j->buf + jbio->buf_idx;
 
 	if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write,
 			       "error writing journal entry %llu: %s",
 			       le64_to_cpu(w->data->seq),
 			       bch2_blk_status_to_str(bio->bi_status)) ||
 	    bch2_meta_write_fault("journal")) {
+		unsigned long flags;
+
 		spin_lock_irqsave(&j->err_lock, flags);
 		bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx);
 		spin_unlock_irqrestore(&j->err_lock, flags);
 	}
 
-	closure_put(&j->io);
+	closure_put(&w->io);
 	percpu_ref_put(&ca->io_ref);
 }
 
 static CLOSURE_CALLBACK(do_journal_write)
 {
-	closure_type(j, struct journal, io);
+	closure_type(w, struct journal_buf, io);
+	struct journal *j = container_of(w, struct journal, buf[w->idx]);
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
-	struct bch_dev *ca;
-	struct journal_buf *w = journal_last_unwritten_buf(j);
-	struct bio *bio;
 	unsigned sectors = vstruct_sectors(w->data, c->block_bits);
 
 	extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) {
-		ca = bch_dev_bkey_exists(c, ptr->dev);
+		struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+		struct journal_device *ja = &ca->journal;
+
 		if (!percpu_ref_tryget(&ca->io_ref)) {
 			/* XXX: fix this */
 			bch_err(c, "missing device for journal write\n");
@@ -1708,7 +1743,7 @@ static CLOSURE_CALLBACK(do_journal_write)
 		this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal],
 			     sectors);
 
-		bio = ca->journal.bio;
+		struct bio *bio = &ja->bio[w->idx]->bio;
 		bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META);
 		bio->bi_iter.bi_sector	= ptr->offset;
 		bio->bi_end_io		= journal_write_endio;
@@ -1727,11 +1762,10 @@ static CLOSURE_CALLBACK(do_journal_write)
 		trace_and_count(c, journal_write, bio);
 		closure_bio_submit(bio, cl);
 
-		ca->journal.bucket_seq[ca->journal.cur_idx] =
-			le64_to_cpu(w->data->seq);
+		ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq);
 	}
 
-	continue_at(cl, journal_write_done, c->io_complete_wq);
+	continue_at(cl, journal_write_done, j->wq);
 }
 
 static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
@@ -1782,11 +1816,11 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
 			if (!wb.wb)
 				bch2_journal_keys_to_write_buffer_start(c, &wb, seq);
 
-			struct bkey_i *k;
 			jset_entry_for_each_key(i, k) {
 				ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k);
 				if (ret) {
-					bch2_fs_fatal_error(c, "-ENOMEM flushing journal keys to btree write buffer");
+					bch2_fs_fatal_error(c, "flushing journal keys to btree write buffer: %s",
+							    bch2_err_str(ret));
 					bch2_journal_keys_to_write_buffer_end(c, &wb);
 					return ret;
 				}
@@ -1798,15 +1832,24 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
 
 	if (wb.wb)
 		bch2_journal_keys_to_write_buffer_end(c, &wb);
+
+	spin_lock(&c->journal.lock);
 	w->need_flush_to_write_buffer = false;
+	spin_unlock(&c->journal.lock);
 
 	start = end = vstruct_last(jset);
 
 	end	= bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
 
+	struct jset_entry_datetime *d =
+		container_of(jset_entry_init(&end, sizeof(*d)), struct jset_entry_datetime, entry);
+	d->entry.type	= BCH_JSET_ENTRY_datetime;
+	d->seconds	= cpu_to_le64(ktime_get_real_seconds());
+
 	bch2_journal_super_entries_add_common(c, &end, seq);
 	u64s	= (u64 *) end - (u64 *) start;
-	BUG_ON(u64s > j->entry_u64s_reserved);
+
+	WARN_ON(u64s > j->entry_u64s_reserved);
 
 	le32_add_cpu(&jset->u64s, u64s);
 
@@ -1814,7 +1857,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
 	bytes	= vstruct_bytes(jset);
 
 	if (sectors > w->sectors) {
-		bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
+		bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)",
 				    vstruct_bytes(jset), w->sectors << 9,
 				    u64s, w->u64s_reserved, j->entry_u64s_reserved);
 		return -EINVAL;
@@ -1842,8 +1885,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
 	ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset),
 		    jset->encrypted_start,
 		    vstruct_end(jset) - (void *) jset->encrypted_start);
-	if (bch2_fs_fatal_err_on(ret, c,
-			"error decrypting journal entry: %i", ret))
+	if (bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret)))
 		return ret;
 
 	jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset),
@@ -1893,6 +1935,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
 
 		j->nr_noflush_writes++;
 	} else {
+		w->must_flush = true;
 		j->last_flush_write = jiffies;
 		j->nr_flush_writes++;
 		clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags);
@@ -1903,20 +1946,28 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
 
 CLOSURE_CALLBACK(bch2_journal_write)
 {
-	closure_type(j, struct journal, io);
+	closure_type(w, struct journal_buf, io);
+	struct journal *j = container_of(w, struct journal, buf[w->idx]);
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
-	struct journal_buf *w = journal_last_unwritten_buf(j);
 	struct bch_replicas_padded replicas;
-	struct bio *bio;
 	struct printbuf journal_debug_buf = PRINTBUF;
 	unsigned nr_rw_members = 0;
 	int ret;
 
+	for_each_rw_member(c, ca)
+		nr_rw_members++;
+
 	BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
+	BUG_ON(!w->write_started);
+	BUG_ON(w->write_allocated);
+	BUG_ON(w->write_done);
 
 	j->write_start_time = local_clock();
 
 	spin_lock(&j->lock);
+	if (nr_rw_members > 1)
+		w->separate_flush = true;
+
 	ret = bch2_journal_write_pick_flush(j, w);
 	spin_unlock(&j->lock);
 	if (ret)
@@ -1956,12 +2007,14 @@ CLOSURE_CALLBACK(bch2_journal_write)
 	 * bch2_journal_space_available():
 	 */
 	w->sectors = 0;
+	w->write_allocated = true;
 
 	/*
 	 * journal entry has been compacted and allocated, recalculate space
 	 * available:
 	 */
 	bch2_journal_space_available(j);
+	bch2_journal_do_writes(j);
 	spin_unlock(&j->lock);
 
 	w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key));
@@ -1969,12 +2022,6 @@ CLOSURE_CALLBACK(bch2_journal_write)
 	if (c->opts.nochanges)
 		goto no_io;
 
-	for_each_rw_member(c, ca)
-		nr_rw_members++;
-
-	if (nr_rw_members > 1)
-		w->separate_flush = true;
-
 	/*
 	 * Mark journal replicas before we submit the write to guarantee
 	 * recovery will find the journal entries after a crash.
@@ -1985,25 +2032,29 @@ CLOSURE_CALLBACK(bch2_journal_write)
 	if (ret)
 		goto err;
 
+	if (!JSET_NO_FLUSH(w->data))
+		closure_wait_event(&j->async_wait, j->seq_ondisk + 1 == le64_to_cpu(w->data->seq));
+
 	if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
 		for_each_rw_member(c, ca) {
 			percpu_ref_get(&ca->io_ref);
 
-			bio = ca->journal.bio;
+			struct journal_device *ja = &ca->journal;
+			struct bio *bio = &ja->bio[w->idx]->bio;
 			bio_reset(bio, ca->disk_sb.bdev,
-				  REQ_OP_WRITE|REQ_PREFLUSH);
+				  REQ_OP_WRITE|REQ_SYNC|REQ_META|REQ_PREFLUSH);
 			bio->bi_end_io		= journal_write_endio;
 			bio->bi_private		= ca;
 			closure_bio_submit(bio, cl);
 		}
 	}
 
-	continue_at(cl, do_journal_write, c->io_complete_wq);
+	continue_at(cl, do_journal_write, j->wq);
 	return;
 no_io:
-	continue_at(cl, journal_write_done, c->io_complete_wq);
+	continue_at(cl, journal_write_done, j->wq);
 	return;
 err:
 	bch2_fatal_error(c);
-	continue_at(cl, journal_write_done, c->io_complete_wq);
+	continue_at(cl, journal_write_done, j->wq);
 }
diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h
index c035e7c108e1..4f1e763ab506 100644
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
@@ -2,26 +2,35 @@
 #ifndef _BCACHEFS_JOURNAL_IO_H
 #define _BCACHEFS_JOURNAL_IO_H
 
+#include "darray.h"
+
+struct journal_ptr {
+	bool		csum_good;
+	u8		dev;
+	u32		bucket;
+	u32		bucket_offset;
+	u64		sector;
+};
+
 /*
  * Only used for holding the journal entries we read in btree_journal_read()
  * during cache_registration
  */
 struct journal_replay {
-	struct journal_ptr {
-		bool		csum_good;
-		u8		dev;
-		u32		bucket;
-		u32		bucket_offset;
-		u64		sector;
-	}			ptrs[BCH_REPLICAS_MAX];
-	unsigned		nr_ptrs;
+	DARRAY_PREALLOCATED(struct journal_ptr, 8) ptrs;
 
 	bool			csum_good;
-	bool			ignore;
+	bool			ignore_blacklisted;
+	bool			ignore_not_dirty;
 	/* must be last: */
 	struct jset		j;
 };
 
+static inline bool journal_replay_ignore(struct journal_replay *i)
+{
+	return !i || i->ignore_blacklisted || i->ignore_not_dirty;
+}
+
 static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
 					struct jset_entry *entry, unsigned type)
 {
@@ -36,12 +45,12 @@ static inline struct jset_entry *__jset_entry_type_next(struct jset *jset,
 }
 
 #define for_each_jset_entry_type(entry, jset, type)			\
-	for (entry = (jset)->start;					\
+	for (struct jset_entry *entry = (jset)->start;			\
 	     (entry = __jset_entry_type_next(jset, entry, type));	\
 	     entry = vstruct_next(entry))
 
 #define jset_entry_for_each_key(_e, _k)					\
-	for (_k = (_e)->start;						\
+	for (struct bkey_i *_k = (_e)->start;				\
 	     _k < vstruct_last(_e);					\
 	     _k = bkey_next(_k))
 
@@ -62,4 +71,20 @@ int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *);
 
 CLOSURE_CALLBACK(bch2_journal_write);
 
+static inline struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
+{
+	struct jset_entry *entry = *end;
+	unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
+
+	memset(entry, 0, u64s * sizeof(u64));
+	/*
+	 * The u64s field counts from the start of data, ignoring the shared
+	 * fields.
+	 */
+	entry->u64s = cpu_to_le16(u64s - 1);
+
+	*end = vstruct_next(*end);
+	return entry;
+}
+
 #endif /* _BCACHEFS_JOURNAL_IO_H */
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index c33dca641575..04a577848b01 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -62,14 +62,13 @@ void bch2_journal_set_watermark(struct journal *j)
 		? BCH_WATERMARK_reclaim
 		: BCH_WATERMARK_stripe;
 
-	if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space],
-			       &j->low_on_space_start, low_on_space) ||
-	    track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin],
-			       &j->low_on_pin_start, low_on_pin) ||
-	    track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full],
-			       &j->write_buffer_full_start, low_on_wb))
+	if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space], low_on_space) ||
+	    track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin], low_on_pin) ||
+	    track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb))
 		trace_and_count(c, journal_full, c);
 
+	mod_bit(JOURNAL_SPACE_LOW, &j->flags, low_on_space || low_on_pin);
+
 	swap(watermark, j->watermark);
 	if (watermark > j->watermark)
 		journal_wake(j);
@@ -394,8 +393,6 @@ void bch2_journal_pin_copy(struct journal *j,
 			   struct journal_entry_pin *src,
 			   journal_pin_flush_fn flush_fn)
 {
-	bool reclaim;
-
 	spin_lock(&j->lock);
 
 	u64 seq = READ_ONCE(src->seq);
@@ -411,44 +408,44 @@ void bch2_journal_pin_copy(struct journal *j,
 		return;
 	}
 
-	reclaim = __journal_pin_drop(j, dst);
+	bool reclaim = __journal_pin_drop(j, dst);
 
 	bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));
 
 	if (reclaim)
 		bch2_journal_reclaim_fast(j);
-	spin_unlock(&j->lock);
 
 	/*
 	 * If the journal is currently full,  we might want to call flush_fn
 	 * immediately:
 	 */
-	journal_wake(j);
+	if (seq == journal_last_seq(j))
+		journal_wake(j);
+	spin_unlock(&j->lock);
 }
 
 void bch2_journal_pin_set(struct journal *j, u64 seq,
 			  struct journal_entry_pin *pin,
 			  journal_pin_flush_fn flush_fn)
 {
-	bool reclaim;
-
 	spin_lock(&j->lock);
 
 	BUG_ON(seq < journal_last_seq(j));
 
-	reclaim = __journal_pin_drop(j, pin);
+	bool reclaim = __journal_pin_drop(j, pin);
 
 	bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));
 
 	if (reclaim)
 		bch2_journal_reclaim_fast(j);
-	spin_unlock(&j->lock);
-
 	/*
 	 * If the journal is currently full,  we might want to call flush_fn
 	 * immediately:
 	 */
-	journal_wake(j);
+	if (seq == journal_last_seq(j))
+		journal_wake(j);
+
+	spin_unlock(&j->lock);
 }
 
 /**
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index 0200e299cfbb..37a024e034d4 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -43,61 +43,36 @@ static unsigned sb_blacklist_u64s(unsigned nr)
 	return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64);
 }
 
-static struct bch_sb_field_journal_seq_blacklist *
-blacklist_entry_try_merge(struct bch_fs *c,
-			  struct bch_sb_field_journal_seq_blacklist *bl,
-			  unsigned i)
-{
-	unsigned nr = blacklist_nr_entries(bl);
-
-	if (le64_to_cpu(bl->start[i].end) >=
-	    le64_to_cpu(bl->start[i + 1].start)) {
-		bl->start[i].end = bl->start[i + 1].end;
-		--nr;
-		memmove(&bl->start[i],
-			&bl->start[i + 1],
-			sizeof(bl->start[0]) * (nr - i));
-
-		bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
-					  sb_blacklist_u64s(nr));
-		BUG_ON(!bl);
-	}
-
-	return bl;
-}
-
-static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e,
-					u64 start, u64 end)
-{
-	return !(end < le64_to_cpu(e->start) || le64_to_cpu(e->end) < start);
-}
-
 int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
 {
 	struct bch_sb_field_journal_seq_blacklist *bl;
-	unsigned i, nr;
+	unsigned i = 0, nr;
 	int ret = 0;
 
 	mutex_lock(&c->sb_lock);
 	bl = bch2_sb_field_get(c->disk_sb.sb, journal_seq_blacklist);
 	nr = blacklist_nr_entries(bl);
 
-	for (i = 0; i < nr; i++) {
+	while (i < nr) {
 		struct journal_seq_blacklist_entry *e =
 			bl->start + i;
 
-		if (bl_entry_contig_or_overlaps(e, start, end)) {
-			e->start = cpu_to_le64(min(start, le64_to_cpu(e->start)));
-			e->end	= cpu_to_le64(max(end, le64_to_cpu(e->end)));
-
-			if (i + 1 < nr)
-				bl = blacklist_entry_try_merge(c,
-							bl, i);
-			if (i)
-				bl = blacklist_entry_try_merge(c,
-							bl, i - 1);
-			goto out_write_sb;
+		if (end < le64_to_cpu(e->start))
+			break;
+
+		if (start > le64_to_cpu(e->end)) {
+			i++;
+			continue;
 		}
+
+		/*
+		 * Entry is contiguous or overlapping with new entry: merge it
+		 * with new entry, and delete:
+		 */
+
+		start	= min(start,	le64_to_cpu(e->start));
+		end	= max(end,	le64_to_cpu(e->end));
+		array_remove_item(bl->start, nr, i);
 	}
 
 	bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist,
@@ -107,9 +82,10 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end)
 		goto out;
 	}
 
-	bl->start[nr].start	= cpu_to_le64(start);
-	bl->start[nr].end	= cpu_to_le64(end);
-out_write_sb:
+	array_insert_item(bl->start, nr, i, ((struct journal_seq_blacklist_entry) {
+		.start	= cpu_to_le64(start),
+		.end	= cpu_to_le64(end),
+	}));
 	c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3);
 
 	ret = bch2_write_super(c);
@@ -119,8 +95,7 @@ out:
 	return ret ?: bch2_blacklist_table_initialize(c);
 }
 
-static int journal_seq_blacklist_table_cmp(const void *_l,
-					   const void *_r, size_t size)
+static int journal_seq_blacklist_table_cmp(const void *_l, const void *_r)
 {
 	const struct journal_seq_blacklist_table_entry *l = _l;
 	const struct journal_seq_blacklist_table_entry *r = _r;
@@ -165,8 +140,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c)
 	if (!bl)
 		return 0;
 
-	t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr,
-		    GFP_KERNEL);
+	t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL);
 	if (!t)
 		return -BCH_ERR_ENOMEM_blacklist_table_init;
 
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index 38817c7a0851..b5161b5d76a0 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -18,6 +18,7 @@
  * the journal that are being staged or in flight.
  */
 struct journal_buf {
+	struct closure		io;
 	struct jset		*data;
 
 	__BKEY_PADDED(key, BCH_REPLICAS_MAX);
@@ -33,10 +34,14 @@ struct journal_buf {
 	unsigned		disk_sectors;	/* maximum size entry could have been, if
 						   buf_size was bigger */
 	unsigned		u64s_reserved;
-	bool			noflush;	/* write has already been kicked off, and was noflush */
-	bool			must_flush;	/* something wants a flush */
-	bool			separate_flush;
-	bool			need_flush_to_write_buffer;
+	bool			noflush:1;	/* write has already been kicked off, and was noflush */
+	bool			must_flush:1;	/* something wants a flush */
+	bool			separate_flush:1;
+	bool			need_flush_to_write_buffer:1;
+	bool			write_started:1;
+	bool			write_allocated:1;
+	bool			write_done:1;
+	u8			idx;
 };
 
 /*
@@ -129,11 +134,13 @@ enum journal_flags {
 	JOURNAL_STARTED,
 	JOURNAL_MAY_SKIP_FLUSH,
 	JOURNAL_NEED_FLUSH_WRITE,
+	JOURNAL_SPACE_LOW,
 };
 
 /* Reasons we may fail to get a journal reservation: */
 #define JOURNAL_ERRORS()		\
 	x(ok)				\
+	x(retry)			\
 	x(blocked)			\
 	x(max_in_flight)		\
 	x(journal_full)			\
@@ -149,6 +156,13 @@ enum journal_errors {
 
 typedef DARRAY(u64)		darray_u64;
 
+struct journal_bio {
+	struct bch_dev		*ca;
+	unsigned		buf_idx;
+
+	struct bio		bio;
+};
+
 /* Embedded in struct bch_fs */
 struct journal {
 	/* Fastpath stuff up front: */
@@ -203,8 +217,8 @@ struct journal {
 	wait_queue_head_t	wait;
 	struct closure_waitlist	async_wait;
 
-	struct closure		io;
 	struct delayed_work	write_work;
+	struct workqueue_struct *wq;
 
 	/* Sequence number of most recent journal entry (last entry in @pin) */
 	atomic64_t		seq;
@@ -274,11 +288,6 @@ struct journal {
 	u64			nr_noflush_writes;
 	u64			entry_bytes_written;
 
-	u64			low_on_space_start;
-	u64			low_on_pin_start;
-	u64			max_in_flight_start;
-	u64			write_buffer_full_start;
-
 	struct bch2_time_stats	*flush_write_time;
 	struct bch2_time_stats	*noflush_write_time;
 	struct bch2_time_stats	*flush_seq_time;
@@ -313,7 +322,7 @@ struct journal_device {
 	u64			*buckets;
 
 	/* Bio for journal reads/writes to this device */
-	struct bio		*bio;
+	struct journal_bio	*bio[JOURNAL_BUF_NR];
 
 	/* for bch_journal_read_device */
 	struct closure		read;
diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c
index ad598105c587..b82f8209041f 100644
--- a/fs/bcachefs/logged_ops.c
+++ b/fs/bcachefs/logged_ops.c
@@ -37,7 +37,6 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
 	const struct bch_logged_op_fn *fn = logged_op_fn(k.k->type);
 	struct bkey_buf sk;
 	u32 restart_count = trans->restart_count;
-	int ret;
 
 	if (!fn)
 		return 0;
@@ -45,11 +44,11 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
 	bch2_bkey_buf_init(&sk);
 	bch2_bkey_buf_reassemble(&sk, c, k);
 
-	ret =   drop_locks_do(trans, (bch2_fs_lazy_rw(c), 0)) ?:
-		fn->resume(trans, sk.k) ?: trans_was_restarted(trans, restart_count);
+	fn->resume(trans, sk.k);
 
 	bch2_bkey_buf_exit(&sk, c);
-	return ret;
+
+	return trans_was_restarted(trans, restart_count);
 }
 
 int bch2_resume_logged_ops(struct bch_fs *c)
@@ -101,8 +100,8 @@ void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
 		struct printbuf buf = PRINTBUF;
 
 		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k));
-		bch2_fs_fatal_error(c, "%s: error deleting logged operation %s: %s",
-				     __func__, buf.buf, bch2_err_str(ret));
+		bch2_fs_fatal_error(c, "deleting logged operation %s: %s",
+				    buf.buf, bch2_err_str(ret));
 		printbuf_exit(&buf);
 	}
 }
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
index 7a4ca5a28b3e..26569043e368 100644
--- a/fs/bcachefs/lru.c
+++ b/fs/bcachefs/lru.c
@@ -44,8 +44,8 @@ static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id,
 			  u64 dev_bucket, u64 time, bool set)
 {
 	return time
-		? bch2_btree_bit_mod(trans, BTREE_ID_lru,
-				     lru_pos(lru_id, dev_bucket, time), set)
+		? bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru,
+					      lru_pos(lru_id, dev_bucket, time), set)
 		: 0;
 }
 
@@ -125,8 +125,7 @@ static int bch2_check_lru_key(struct btree_trans *trans,
 			goto out;
 		}
 
-		if (c->opts.reconstruct_alloc ||
-		    fsck_err(c, lru_entry_bad,
+		if (fsck_err(c, lru_entry_bad,
 			     "incorrect lru entry: lru %s time %llu\n"
 			     "  %s\n"
 			     "  for %s",
diff --git a/fs/bcachefs/mean_and_variance.c b/fs/bcachefs/mean_and_variance.c
index bf0ef668fd38..0ea9f30803a2 100644
--- a/fs/bcachefs/mean_and_variance.c
+++ b/fs/bcachefs/mean_and_variance.c
@@ -103,14 +103,17 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev);
  * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update()
  * @s: mean and variance number of samples and their sums
  * @x: new value to include in the &mean_and_variance_weighted
+ * @initted: caller must track whether this is the first use or not
+ * @weight: ewma weight
  *
  * see linked pdf: function derived from equations 140-143 where alpha = 2^w.
  * values are stored bitshifted for performance and added precision.
  */
-void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 x)
+void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s,
+		s64 x, bool initted, u8 weight)
 {
 	// previous weighted variance.
-	u8 w		= s->weight;
+	u8 w		= weight;
 	u64 var_w0	= s->variance;
 	// new value weighted.
 	s64 x_w		= x << w;
@@ -119,45 +122,50 @@ void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64
 	// new mean weighted.
 	s64 u_w1	= s->mean + diff;
 
-	if (!s->init) {
+	if (!initted) {
 		s->mean = x_w;
 		s->variance = 0;
 	} else {
 		s->mean = u_w1;
 		s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w;
 	}
-	s->init = true;
 }
 EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
 
 /**
  * mean_and_variance_weighted_get_mean() - get mean from @s
  * @s: mean and variance number of samples and their sums
+ * @weight: ewma weight
  */
-s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s)
+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s,
+		u8 weight)
 {
-	return fast_divpow2(s.mean, s.weight);
+	return fast_divpow2(s.mean, weight);
 }
 EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
 
 /**
  * mean_and_variance_weighted_get_variance() -- get variance from @s
  * @s: mean and variance number of samples and their sums
+ * @weight: ewma weight
  */
-u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s)
+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s,
+		u8 weight)
 {
 	// always positive don't need fast divpow2
-	return s.variance >> s.weight;
+	return s.variance >> weight;
 }
 EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance);
 
 /**
  * mean_and_variance_weighted_get_stddev() - get standard deviation from @s
  * @s: mean and variance number of samples and their sums
+ * @weight: ewma weight
  */
-u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s)
+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s,
+		u8 weight)
 {
-	return int_sqrt64(mean_and_variance_weighted_get_variance(s));
+	return int_sqrt64(mean_and_variance_weighted_get_variance(s, weight));
 }
 EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev);
 
diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h
index 64df11ab422b..4fcf062dd22c 100644
--- a/fs/bcachefs/mean_and_variance.h
+++ b/fs/bcachefs/mean_and_variance.h
@@ -154,8 +154,6 @@ struct mean_and_variance {
 
 /* expontentially weighted variant */
 struct mean_and_variance_weighted {
-	bool	init;
-	u8	weight;	/* base 2 logarithim */
 	s64	mean;
 	u64	variance;
 };
@@ -192,10 +190,14 @@ s64 mean_and_variance_get_mean(struct mean_and_variance s);
 u64 mean_and_variance_get_variance(struct mean_and_variance s1);
 u32 mean_and_variance_get_stddev(struct mean_and_variance s);
 
-void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 v);
+void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s,
+		s64 v, bool initted, u8 weight);
 
-s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s);
-u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s);
-u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s);
+s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s,
+		u8 weight);
+u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s,
+		u8 weight);
+u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s,
+		u8 weight);
 
 #endif // MEAN_AND_VAIRANCE_H_
diff --git a/fs/bcachefs/mean_and_variance_test.c b/fs/bcachefs/mean_and_variance_test.c
index 019583c3ca0e..4c298e74723d 100644
--- a/fs/bcachefs/mean_and_variance_test.c
+++ b/fs/bcachefs/mean_and_variance_test.c
@@ -31,53 +31,59 @@ static void mean_and_variance_basic_test(struct kunit *test)
 
 static void mean_and_variance_weighted_test(struct kunit *test)
 {
-	struct mean_and_variance_weighted s = { .weight = 2 };
+	struct mean_and_variance_weighted s = { };
 
-	mean_and_variance_weighted_update(&s, 10);
-	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 10);
-	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0);
+	mean_and_variance_weighted_update(&s, 10, false, 2);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 10);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0);
 
-	mean_and_variance_weighted_update(&s, 20);
-	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 12);
-	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18);
+	mean_and_variance_weighted_update(&s, 20, true, 2);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 12);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18);
 
-	mean_and_variance_weighted_update(&s, 30);
-	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 16);
-	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72);
+	mean_and_variance_weighted_update(&s, 30, true, 2);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), 16);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72);
 
-	s = (struct mean_and_variance_weighted) { .weight = 2 };
+	s = (struct mean_and_variance_weighted) { };
 
-	mean_and_variance_weighted_update(&s, -10);
-	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -10);
-	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0);
+	mean_and_variance_weighted_update(&s, -10, false, 2);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -10);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 0);
 
-	mean_and_variance_weighted_update(&s, -20);
-	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -12);
-	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18);
+	mean_and_variance_weighted_update(&s, -20, true, 2);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -12);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 18);
 
-	mean_and_variance_weighted_update(&s, -30);
-	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -16);
-	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72);
+	mean_and_variance_weighted_update(&s, -30, true, 2);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 2), -16);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 2), 72);
 }
 
 static void mean_and_variance_weighted_advanced_test(struct kunit *test)
 {
-	struct mean_and_variance_weighted s = { .weight = 8 };
+	struct mean_and_variance_weighted s = { };
+	bool initted = false;
 	s64 i;
 
-	for (i = 10; i <= 100; i += 10)
-		mean_and_variance_weighted_update(&s, i);
+	for (i = 10; i <= 100; i += 10) {
+		mean_and_variance_weighted_update(&s, i, initted, 8);
+		initted = true;
+	}
 
-	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 11);
-	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), 11);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107);
 
-	s = (struct mean_and_variance_weighted) { .weight = 8 };
+	s = (struct mean_and_variance_weighted) { };
+	initted = false;
 
-	for (i = -10; i >= -100; i -= 10)
-		mean_and_variance_weighted_update(&s, i);
+	for (i = -10; i >= -100; i -= 10) {
+		mean_and_variance_weighted_update(&s, i, initted, 8);
+		initted = true;
+	}
 
-	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -11);
-	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s, 8), -11);
+	KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s, 8), 107);
 }
 
 static void do_mean_and_variance_test(struct kunit *test,
@@ -92,26 +98,26 @@ static void do_mean_and_variance_test(struct kunit *test,
 				      s64 *weighted_stddev)
 {
 	struct mean_and_variance mv = {};
-	struct mean_and_variance_weighted vw = { .weight = weight };
+	struct mean_and_variance_weighted vw = { };
 
 	for (unsigned i = 0; i < initial_n; i++) {
 		mean_and_variance_update(&mv, initial_value);
-		mean_and_variance_weighted_update(&vw, initial_value);
+		mean_and_variance_weighted_update(&vw, initial_value, false, weight);
 
 		KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv),		initial_value);
 		KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv),		0);
-		KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw),	initial_value);
-		KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),0);
+		KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight),	initial_value);
+		KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),0);
 	}
 
 	for (unsigned i = 0; i < n; i++) {
 		mean_and_variance_update(&mv, data[i]);
-		mean_and_variance_weighted_update(&vw, data[i]);
+		mean_and_variance_weighted_update(&vw, data[i], true, weight);
 
 		KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv),		mean[i]);
 		KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv),		stddev[i]);
-		KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw),	weighted_mean[i]);
-		KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),weighted_stddev[i]);
+		KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw, weight),	weighted_mean[i]);
+		KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw, weight),weighted_stddev[i]);
 	}
 
 	KUNIT_EXPECT_EQ(test, mv.n, initial_n + n);
@@ -130,20 +136,8 @@ static void mean_and_variance_test_1(struct kunit *test)
 			d, mean, stddev, weighted_mean, weighted_stddev);
 }
 
-static void mean_and_variance_test_2(struct kunit *test)
-{
-	s64 d[]			= { 100, 10, 10, 10, 10, 10, 10 };
-	s64 mean[]		= {  10, 10, 10, 10, 10, 10, 10 };
-	s64 stddev[]		= {   9,  9,  9,  9,  9,  9,  9 };
-	s64 weighted_mean[]	= {  32, 27, 22, 19, 17, 15, 14 };
-	s64 weighted_stddev[]	= {  38, 35, 31, 27, 24, 21, 18 };
-
-	do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
-			d, mean, stddev, weighted_mean, weighted_stddev);
-}
-
 /* Test behaviour where we switch from one steady state to another: */
-static void mean_and_variance_test_3(struct kunit *test)
+static void mean_and_variance_test_2(struct kunit *test)
 {
 	s64 d[]			= { 100, 100, 100, 100, 100 };
 	s64 mean[]		= {  22,  32,  40,  46,  50 };
@@ -155,18 +149,6 @@ static void mean_and_variance_test_3(struct kunit *test)
 			d, mean, stddev, weighted_mean, weighted_stddev);
 }
 
-static void mean_and_variance_test_4(struct kunit *test)
-{
-	s64 d[]			= { 100, 100, 100, 100, 100 };
-	s64 mean[]		= {  10,  11,  12,  13,  14 };
-	s64 stddev[]		= {   9,  13,  15,  17,  19 };
-	s64 weighted_mean[]	= {  32,  49,  61,  71,  78 };
-	s64 weighted_stddev[]	= {  38,  44,  44,  41,  38 };
-
-	do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2,
-			d, mean, stddev, weighted_mean, weighted_stddev);
-}
-
 static void mean_and_variance_fast_divpow2(struct kunit *test)
 {
 	s64 i;
@@ -224,8 +206,6 @@ static struct kunit_case mean_and_variance_test_cases[] = {
 	KUNIT_CASE(mean_and_variance_weighted_advanced_test),
 	KUNIT_CASE(mean_and_variance_test_1),
 	KUNIT_CASE(mean_and_variance_test_2),
-	KUNIT_CASE(mean_and_variance_test_3),
-	KUNIT_CASE(mean_and_variance_test_4),
 	{}
 };
 
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index 5623cee3ef86..69098eeb5d48 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -31,7 +31,7 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k,
 	nr_good = bch2_bkey_durability(c, k.s_c);
 	if ((!nr_good && !(flags & lost)) ||
 	    (nr_good < replicas && !(flags & degraded)))
-		return -EINVAL;
+		return -BCH_ERR_remove_would_lose_data;
 
 	return 0;
 }
@@ -111,7 +111,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 
 	/* don't handle this yet: */
 	if (flags & BCH_FORCE_IF_METADATA_LOST)
-		return -EINVAL;
+		return -BCH_ERR_remove_with_metadata_missing_unimplemented;
 
 	trans = bch2_trans_get(c);
 	bch2_bkey_buf_init(&k);
@@ -132,10 +132,8 @@ retry:
 
 			ret = drop_dev_ptrs(c, bkey_i_to_s(k.k),
 					    dev_idx, flags, true);
-			if (ret) {
-				bch_err(c, "Cannot drop device without losing data");
+			if (ret)
 				break;
-			}
 
 			ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false);
 			if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index 69e06a84dad4..0d2b82d8d11f 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -155,8 +155,7 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
 	if (bch2_err_matches(ret, EROFS))
 		return ret;
 
-	if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_tryflush()",
-				 __func__, bch2_err_str(ret)))
+	if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret)))
 		return ret;
 
 	ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru,
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index b1ed0b9a20d3..bb068fd72465 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -7,6 +7,7 @@
 #include "disk_groups.h"
 #include "error.h"
 #include "opts.h"
+#include "recovery_passes.h"
 #include "super-io.h"
 #include "util.h"
 
@@ -42,7 +43,7 @@ const char * const __bch2_btree_ids[] = {
 	NULL
 };
 
-const char * const bch2_csum_types[] = {
+static const char * const __bch2_csum_types[] = {
 	BCH_CSUM_TYPES()
 	NULL
 };
@@ -52,7 +53,7 @@ const char * const bch2_csum_opts[] = {
 	NULL
 };
 
-const char * const __bch2_compression_types[] = {
+static const char * const __bch2_compression_types[] = {
 	BCH_COMPRESSION_TYPES()
 	NULL
 };
@@ -82,18 +83,39 @@ const char * const bch2_member_states[] = {
 	NULL
 };
 
-const char * const bch2_jset_entry_types[] = {
+static const char * const __bch2_jset_entry_types[] = {
 	BCH_JSET_ENTRY_TYPES()
 	NULL
 };
 
-const char * const bch2_fs_usage_types[] = {
+static const char * const __bch2_fs_usage_types[] = {
 	BCH_FS_USAGE_TYPES()
 	NULL
 };
 
 #undef x
 
+static void prt_str_opt_boundscheck(struct printbuf *out, const char * const opts[],
+				    unsigned nr, const char *type, unsigned idx)
+{
+	if (idx < nr)
+		prt_str(out, opts[idx]);
+	else
+		prt_printf(out, "(unknown %s %u)", type, idx);
+}
+
+#define PRT_STR_OPT_BOUNDSCHECKED(name, type)					\
+void bch2_prt_##name(struct printbuf *out, type t)				\
+{										\
+	prt_str_opt_boundscheck(out, __bch2_##name##s, ARRAY_SIZE(__bch2_##name##s) - 1, #name, t);\
+}
+
+PRT_STR_OPT_BOUNDSCHECKED(jset_entry_type,	enum bch_jset_entry_type);
+PRT_STR_OPT_BOUNDSCHECKED(fs_usage_type,	enum bch_fs_usage_type);
+PRT_STR_OPT_BOUNDSCHECKED(data_type,		enum bch_data_type);
+PRT_STR_OPT_BOUNDSCHECKED(csum_type,		enum bch_csum_type);
+PRT_STR_OPT_BOUNDSCHECKED(compression_type,	enum bch_compression_type);
+
 static int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res,
 				     struct printbuf *err)
 {
@@ -205,6 +227,9 @@ const struct bch_option bch2_opt_table[] = {
 #define OPT_STR(_choices)	.type = BCH_OPT_STR,			\
 				.min = 0, .max = ARRAY_SIZE(_choices),	\
 				.choices = _choices
+#define OPT_STR_NOLIMIT(_choices)	.type = BCH_OPT_STR,		\
+				.min = 0, .max = U64_MAX,		\
+				.choices = _choices
 #define OPT_FN(_fn)		.type = BCH_OPT_FN, .fn	= _fn
 
 #define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help)	\
@@ -314,7 +339,7 @@ int bch2_opt_parse(struct bch_fs *c,
 		if (ret < 0 || (*res != 0 && *res != 1)) {
 			if (err)
 				prt_printf(err, "%s: must be bool", opt->attr.name);
-			return ret;
+			return ret < 0 ? ret : -BCH_ERR_option_not_bool;
 		}
 		break;
 	case BCH_OPT_UINT:
@@ -456,7 +481,7 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
 
 	copied_opts = kstrdup(options, GFP_KERNEL);
 	if (!copied_opts)
-		return -1;
+		return -ENOMEM;
 	copied_opts_start = copied_opts;
 
 	while ((opt = strsep(&copied_opts, ",")) != NULL) {
@@ -501,11 +526,11 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts,
 
 bad_opt:
 	pr_err("Bad mount option %s", name);
-	ret = -1;
+	ret = -BCH_ERR_option_name;
 	goto out;
 bad_val:
 	pr_err("Invalid mount option %s", err.buf);
-	ret = -1;
+	ret = -BCH_ERR_option_value;
 	goto out;
 out:
 	kfree(copied_opts_start);
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 9a4b7faa3765..84e452835a17 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -16,18 +16,20 @@ extern const char * const bch2_version_upgrade_opts[];
 extern const char * const bch2_sb_features[];
 extern const char * const bch2_sb_compat[];
 extern const char * const __bch2_btree_ids[];
-extern const char * const bch2_csum_types[];
 extern const char * const bch2_csum_opts[];
-extern const char * const __bch2_compression_types[];
 extern const char * const bch2_compression_opts[];
 extern const char * const bch2_str_hash_types[];
 extern const char * const bch2_str_hash_opts[];
 extern const char * const __bch2_data_types[];
 extern const char * const bch2_member_states[];
-extern const char * const bch2_jset_entry_types[];
-extern const char * const bch2_fs_usage_types[];
 extern const char * const bch2_d_types[];
 
+void bch2_prt_jset_entry_type(struct printbuf *,	enum bch_jset_entry_type);
+void bch2_prt_fs_usage_type(struct printbuf *,		enum bch_fs_usage_type);
+void bch2_prt_data_type(struct printbuf *,		enum bch_data_type);
+void bch2_prt_csum_type(struct printbuf *,		enum bch_csum_type);
+void bch2_prt_compression_type(struct printbuf *,	enum bch_compression_type);
+
 static inline const char *bch2_d_type_str(unsigned d_type)
 {
 	return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)";
@@ -290,6 +292,11 @@ enum fsck_err_opts {
 	  OPT_BOOL(),							\
 	  BCH2_NO_SB_OPT,		false,				\
 	  NULL,		"Allow mounting in when data will be missing")	\
+	x(no_splitbrain_check,		u8,				\
+	  OPT_FS|OPT_MOUNT,						\
+	  OPT_BOOL(),							\
+	  BCH2_NO_SB_OPT,		false,				\
+	  NULL,		"Don't kick drives out when splitbrain detected")\
 	x(discard,			u8,				\
 	  OPT_FS|OPT_MOUNT|OPT_DEVICE,					\
 	  OPT_BOOL(),							\
@@ -332,6 +339,11 @@ enum fsck_err_opts {
 	  OPT_BOOL(),							\
 	  BCH2_NO_SB_OPT,		false,				\
 	  NULL,		"Run fsck on mount")				\
+	x(fsck_memory_usage_percent,	u8,				\
+	  OPT_FS|OPT_MOUNT,						\
+	  OPT_UINT(20, 70),						\
+	  BCH2_NO_SB_OPT,		50,				\
+	  NULL,		"Maximum percentage of system ram fsck is allowed to pin")\
 	x(fix_errors,			u8,				\
 	  OPT_FS|OPT_MOUNT,						\
 	  OPT_FN(bch2_opt_fix_errors),					\
@@ -352,12 +364,17 @@ enum fsck_err_opts {
 	  OPT_FS|OPT_MOUNT,						\
 	  OPT_BOOL(),							\
 	  BCH2_NO_SB_OPT,		false,				\
-	  NULL,		"Don't replay the journal")			\
-	x(keep_journal,			u8,				\
+	  NULL,		"Exit recovery immediately prior to journal replay")\
+	x(recovery_pass_last,		u8,				\
+	  OPT_FS|OPT_MOUNT,						\
+	  OPT_STR_NOLIMIT(bch2_recovery_passes),			\
+	  BCH2_NO_SB_OPT,		0,				\
+	  NULL,		"Exit recovery after specified pass")		\
+	x(retain_recovery_info,		u8,				\
 	  0,								\
 	  OPT_BOOL(),							\
 	  BCH2_NO_SB_OPT,		false,				\
-	  NULL,		"Don't free journal entries/keys after startup")\
+	  NULL,		"Don't free journal entries/keys, scanned btree nodes after startup")\
 	x(read_entire_journal,		u8,				\
 	  0,								\
 	  OPT_BOOL(),							\
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 22d1017aa49b..56336f3dd1d0 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -412,11 +412,11 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c)
 		u64 now = atomic64_read(&c->io_clock[WRITE].now);
 
 		prt_str(out, "io wait duration:  ");
-		bch2_prt_human_readable_s64(out, r->wait_iotime_end - r->wait_iotime_start);
+		bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9);
 		prt_newline(out);
 
 		prt_str(out, "io wait remaining: ");
-		bch2_prt_human_readable_s64(out, r->wait_iotime_end - now);
+		bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9);
 		prt_newline(out);
 
 		prt_str(out, "duration waited:   ");
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 21e13bb4335b..0f328aba9760 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -1,35 +1,31 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
-#include "backpointers.h"
-#include "bkey_buf.h"
 #include "alloc_background.h"
-#include "btree_gc.h"
+#include "bkey_buf.h"
 #include "btree_journal_iter.h"
+#include "btree_node_scan.h"
 #include "btree_update.h"
 #include "btree_update_interior.h"
 #include "btree_io.h"
 #include "buckets.h"
 #include "dirent.h"
-#include "ec.h"
 #include "errcode.h"
 #include "error.h"
 #include "fs-common.h"
-#include "fsck.h"
 #include "journal_io.h"
 #include "journal_reclaim.h"
 #include "journal_seq_blacklist.h"
-#include "lru.h"
 #include "logged_ops.h"
 #include "move.h"
 #include "quota.h"
 #include "rebalance.h"
 #include "recovery.h"
+#include "recovery_passes.h"
 #include "replicas.h"
 #include "sb-clean.h"
 #include "sb-downgrade.h"
 #include "snapshot.h"
-#include "subvolume.h"
 #include "super-io.h"
 
 #include <linux/sort.h>
@@ -37,30 +33,67 @@
 
 #define QSTR(n) { { { .len = strlen(n) } }, .name = n }
 
-static bool btree_id_is_alloc(enum btree_id id)
+void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
 {
-	switch (id) {
-	case BTREE_ID_alloc:
-	case BTREE_ID_backpointers:
-	case BTREE_ID_need_discard:
-	case BTREE_ID_freespace:
-	case BTREE_ID_bucket_gens:
-		return true;
-	default:
-		return false;
+	u64 b = BIT_ULL(btree);
+
+	if (!(c->sb.btrees_lost_data & b)) {
+		bch_err(c, "flagging btree %s lost data", bch2_btree_id_str(btree));
+
+		mutex_lock(&c->sb_lock);
+		bch2_sb_field_get(c->disk_sb.sb, ext)->btrees_lost_data |= cpu_to_le64(b);
+		bch2_write_super(c);
+		mutex_unlock(&c->sb_lock);
 	}
 }
 
 /* for -o reconstruct_alloc: */
-static void drop_alloc_keys(struct journal_keys *keys)
+static void bch2_reconstruct_alloc(struct bch_fs *c)
 {
-	size_t src, dst;
+	bch2_journal_log_msg(c, "dropping alloc info");
+	bch_info(c, "dropping and reconstructing all alloc info");
+
+	mutex_lock(&c->sb_lock);
+	struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_allocations, ext->recovery_passes_required);
+	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_info, ext->recovery_passes_required);
+	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_lrus, ext->recovery_passes_required);
+	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_extents_to_backpointers, ext->recovery_passes_required);
+	__set_bit_le64(BCH_RECOVERY_PASS_STABLE_check_alloc_to_lru_refs, ext->recovery_passes_required);
+
+	__set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_alloc_key, ext->errors_silent);
+	__set_bit_le64(BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen, ext->errors_silent);
+	__set_bit_le64(BCH_FSCK_ERR_stale_dirty_ptr, ext->errors_silent);
+	__set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent);
+	__set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent);
+	__set_bit_le64(BCH_FSCK_ERR_alloc_key_dirty_sectors_wrong, ext->errors_silent);
+	__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_wrong, ext->errors_silent);
+	__set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent);
+	__set_bit_le64(BCH_FSCK_ERR_need_discard_key_wrong, ext->errors_silent);
+	__set_bit_le64(BCH_FSCK_ERR_freespace_key_wrong, ext->errors_silent);
+	__set_bit_le64(BCH_FSCK_ERR_bucket_gens_key_wrong, ext->errors_silent);
+	__set_bit_le64(BCH_FSCK_ERR_freespace_hole_missing, ext->errors_silent);
+	__set_bit_le64(BCH_FSCK_ERR_ptr_to_missing_backpointer, ext->errors_silent);
+	__set_bit_le64(BCH_FSCK_ERR_lru_entry_bad, ext->errors_silent);
+	c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
 
-	for (src = 0, dst = 0; src < keys->nr; src++)
-		if (!btree_id_is_alloc(keys->d[src].btree_id))
-			keys->d[dst++] = keys->d[src];
+	bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+
+	c->recovery_passes_explicit |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0]));
 
-	keys->nr = dst;
+
+	bch2_shoot_down_journal_keys(c, BTREE_ID_alloc,
+				     0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+	bch2_shoot_down_journal_keys(c, BTREE_ID_backpointers,
+				     0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+	bch2_shoot_down_journal_keys(c, BTREE_ID_need_discard,
+				     0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+	bch2_shoot_down_journal_keys(c, BTREE_ID_freespace,
+				     0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
+	bch2_shoot_down_journal_keys(c, BTREE_ID_bucket_gens,
+				     0, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
 }
 
 /*
@@ -70,9 +103,7 @@ static void drop_alloc_keys(struct journal_keys *keys)
  */
 static void zero_out_btree_mem_ptr(struct journal_keys *keys)
 {
-	struct journal_key *i;
-
-	for (i = keys->d; i < keys->d + keys->nr; i++)
+	darray_for_each(*keys, i)
 		if (i->k->k.type == KEY_TYPE_btree_ptr_v2)
 			bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0;
 }
@@ -124,6 +155,17 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
 	if (ret)
 		goto out;
 
+	struct btree_path *path = btree_iter_path(trans, &iter);
+	if (unlikely(!btree_path_node(path, k->level))) {
+		bch2_trans_iter_exit(trans, &iter);
+		bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
+					  BTREE_MAX_DEPTH, 0, iter_flags);
+		ret =   bch2_btree_iter_traverse(&iter) ?:
+			bch2_btree_increase_depth(trans, iter.path, 0) ?:
+			-BCH_ERR_transaction_restart_nested;
+		goto out;
+	}
+
 	/* Must be checked with btree locked: */
 	if (k->overwritten)
 		goto out;
@@ -142,7 +184,7 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
 	return cmp_int(l->journal_seq, r->journal_seq);
 }
 
-static int bch2_journal_replay(struct bch_fs *c)
+int bch2_journal_replay(struct bch_fs *c)
 {
 	struct journal_keys *keys = &c->journal_keys;
 	DARRAY(struct journal_key *) keys_sorted = { 0 };
@@ -150,6 +192,7 @@ static int bch2_journal_replay(struct bch_fs *c)
 	u64 start_seq	= c->journal_replay_seq_start;
 	u64 end_seq	= c->journal_replay_seq_start;
 	struct btree_trans *trans = bch2_trans_get(c);
+	bool immediate_flush = false;
 	int ret = 0;
 
 	if (keys->nr) {
@@ -161,15 +204,22 @@ static int bch2_journal_replay(struct bch_fs *c)
 
 	BUG_ON(!atomic_read(&keys->ref));
 
+	move_gap(keys, keys->nr);
+
 	/*
 	 * First, attempt to replay keys in sorted order. This is more
 	 * efficient - better locality of btree access -  but some might fail if
 	 * that would cause a journal deadlock.
 	 */
-	for (size_t i = 0; i < keys->nr; i++) {
+	darray_for_each(*keys, k) {
 		cond_resched();
 
-		struct journal_key *k = keys->d + i;
+		/*
+		 * k->allocated means the key wasn't read in from the journal,
+		 * rather it was from early repair code
+		 */
+		if (k->allocated)
+			immediate_flush = true;
 
 		/* Skip fastpath if we're low on space in the journal */
 		ret = c->journal.watermark ? -1 :
@@ -222,7 +272,8 @@ static int bch2_journal_replay(struct bch_fs *c)
 	bch2_trans_put(trans);
 	trans = NULL;
 
-	if (!c->opts.keep_journal)
+	if (!c->opts.retain_recovery_info &&
+	    c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay)
 		bch2_journal_keys_put_initial(c);
 
 	replay_now_at(j, j->replay_journal_seq_end);
@@ -230,6 +281,12 @@ static int bch2_journal_replay(struct bch_fs *c)
 
 	bch2_journal_set_replay_done(j);
 
+	/* if we did any repair, flush it immediately */
+	if (immediate_flush) {
+		bch2_journal_flush_all_pins(&c->journal);
+		ret = bch2_journal_meta(&c->journal);
+	}
+
 	if (keys->nr)
 		bch2_journal_log_msg(c, "journal replay finished");
 err:
@@ -264,7 +321,7 @@ static int journal_replay_entry_early(struct bch_fs *c,
 			bkey_copy(&r->key, (struct bkey_i *) entry->start);
 			r->error = 0;
 		} else {
-			r->error = -EIO;
+			r->error = -BCH_ERR_btree_node_read_error;
 		}
 		r->alive = true;
 		break;
@@ -359,7 +416,7 @@ static int journal_replay_early(struct bch_fs *c,
 		genradix_for_each(&c->journal_entries, iter, _i) {
 			i = *_i;
 
-			if (!i || i->ignore)
+			if (journal_replay_ignore(i))
 				continue;
 
 			vstruct_for_each(&i->j, entry) {
@@ -379,202 +436,57 @@ static int journal_replay_early(struct bch_fs *c,
 
 static int read_btree_roots(struct bch_fs *c)
 {
-	unsigned i;
 	int ret = 0;
 
-	for (i = 0; i < btree_id_nr_alive(c); i++) {
+	for (unsigned i = 0; i < btree_id_nr_alive(c); i++) {
 		struct btree_root *r = bch2_btree_id_root(c, i);
 
 		if (!r->alive)
 			continue;
 
-		if (btree_id_is_alloc(i) &&
-		    c->opts.reconstruct_alloc) {
-			c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
+		if (btree_id_is_alloc(i) && c->opts.reconstruct_alloc)
 			continue;
-		}
 
-		if (r->error) {
-			__fsck_err(c,
-				   btree_id_is_alloc(i)
-				   ? FSCK_CAN_IGNORE : 0,
-				   btree_root_bkey_invalid,
-				   "invalid btree root %s",
-				   bch2_btree_id_str(i));
-			if (i == BTREE_ID_alloc)
+		if (mustfix_fsck_err_on((ret = r->error),
+					c, btree_root_bkey_invalid,
+					"invalid btree root %s",
+					bch2_btree_id_str(i)) ||
+		    mustfix_fsck_err_on((ret = r->error = bch2_btree_root_read(c, i, &r->key, r->level)),
+					c, btree_root_read_error,
+					"error reading btree root %s l=%u: %s",
+					bch2_btree_id_str(i), r->level, bch2_err_str(ret))) {
+			if (btree_id_is_alloc(i)) {
+				c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_allocations);
+				c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_info);
+				c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_lrus);
+				c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers);
+				c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs);
 				c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
-		}
+				r->error = 0;
+			} else if (!(c->recovery_passes_explicit & BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes))) {
+				bch_info(c, "will run btree node scan");
+				c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_scan_for_btree_nodes);
+				c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
+			}
 
-		ret = bch2_btree_root_read(c, i, &r->key, r->level);
-		if (ret) {
-			fsck_err(c,
-				 btree_root_read_error,
-				 "error reading btree root %s",
-				 bch2_btree_id_str(i));
-			if (btree_id_is_alloc(i))
-				c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
 			ret = 0;
+			bch2_btree_lost_data(c, i);
 		}
 	}
 
-	for (i = 0; i < BTREE_ID_NR; i++) {
+	for (unsigned i = 0; i < BTREE_ID_NR; i++) {
 		struct btree_root *r = bch2_btree_id_root(c, i);
 
-		if (!r->b) {
+		if (!r->b && !r->error) {
 			r->alive = false;
 			r->level = 0;
-			bch2_btree_root_alloc(c, i);
+			bch2_btree_root_alloc_fake(c, i, 0);
 		}
 	}
 fsck_err:
 	return ret;
 }
 
-static int bch2_initialize_subvolumes(struct bch_fs *c)
-{
-	struct bkey_i_snapshot_tree	root_tree;
-	struct bkey_i_snapshot		root_snapshot;
-	struct bkey_i_subvolume		root_volume;
-	int ret;
-
-	bkey_snapshot_tree_init(&root_tree.k_i);
-	root_tree.k.p.offset		= 1;
-	root_tree.v.master_subvol	= cpu_to_le32(1);
-	root_tree.v.root_snapshot	= cpu_to_le32(U32_MAX);
-
-	bkey_snapshot_init(&root_snapshot.k_i);
-	root_snapshot.k.p.offset = U32_MAX;
-	root_snapshot.v.flags	= 0;
-	root_snapshot.v.parent	= 0;
-	root_snapshot.v.subvol	= cpu_to_le32(BCACHEFS_ROOT_SUBVOL);
-	root_snapshot.v.tree	= cpu_to_le32(1);
-	SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true);
-
-	bkey_subvolume_init(&root_volume.k_i);
-	root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
-	root_volume.v.flags	= 0;
-	root_volume.v.snapshot	= cpu_to_le32(U32_MAX);
-	root_volume.v.inode	= cpu_to_le64(BCACHEFS_ROOT_INO);
-
-	ret =   bch2_btree_insert(c, BTREE_ID_snapshot_trees,	&root_tree.k_i, NULL, 0) ?:
-		bch2_btree_insert(c, BTREE_ID_snapshots,	&root_snapshot.k_i, NULL, 0) ?:
-		bch2_btree_insert(c, BTREE_ID_subvolumes,	&root_volume.k_i, NULL, 0);
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
-{
-	struct btree_iter iter;
-	struct bkey_s_c k;
-	struct bch_inode_unpacked inode;
-	int ret;
-
-	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
-			       SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0);
-	ret = bkey_err(k);
-	if (ret)
-		return ret;
-
-	if (!bkey_is_inode(k.k)) {
-		bch_err(trans->c, "root inode not found");
-		ret = -BCH_ERR_ENOENT_inode;
-		goto err;
-	}
-
-	ret = bch2_inode_unpack(k, &inode);
-	BUG_ON(ret);
-
-	inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
-
-	ret = bch2_inode_write(trans, &iter, &inode);
-err:
-	bch2_trans_iter_exit(trans, &iter);
-	return ret;
-}
-
-/* set bi_subvol on root inode */
-noinline_for_stack
-static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
-{
-	int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
-				__bch2_fs_upgrade_for_subvolumes(trans));
-	bch_err_fn(c, ret);
-	return ret;
-}
-
-const char * const bch2_recovery_passes[] = {
-#define x(_fn, ...)	#_fn,
-	BCH_RECOVERY_PASSES()
-#undef x
-	NULL
-};
-
-static int bch2_check_allocations(struct bch_fs *c)
-{
-	return bch2_gc(c, true, c->opts.norecovery);
-}
-
-static int bch2_set_may_go_rw(struct bch_fs *c)
-{
-	struct journal_keys *keys = &c->journal_keys;
-
-	/*
-	 * After we go RW, the journal keys buffer can't be modified (except for
-	 * setting journal_key->overwritten: it will be accessed by multiple
-	 * threads
-	 */
-	move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
-	keys->gap = keys->nr;
-
-	set_bit(BCH_FS_may_go_rw, &c->flags);
-
-	if (keys->nr || c->opts.fsck || !c->sb.clean)
-		return bch2_fs_read_write_early(c);
-	return 0;
-}
-
-struct recovery_pass_fn {
-	int		(*fn)(struct bch_fs *);
-	unsigned	when;
-};
-
-static struct recovery_pass_fn recovery_pass_fns[] = {
-#define x(_fn, _id, _when)	{ .fn = bch2_##_fn, .when = _when },
-	BCH_RECOVERY_PASSES()
-#undef x
-};
-
-u64 bch2_recovery_passes_to_stable(u64 v)
-{
-	static const u8 map[] = {
-#define x(n, id, ...)	[BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
-	BCH_RECOVERY_PASSES()
-#undef x
-	};
-
-	u64 ret = 0;
-	for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
-		if (v & BIT_ULL(i))
-			ret |= BIT_ULL(map[i]);
-	return ret;
-}
-
-u64 bch2_recovery_passes_from_stable(u64 v)
-{
-	static const u8 map[] = {
-#define x(n, id, ...)	[BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
-	BCH_RECOVERY_PASSES()
-#undef x
-	};
-
-	u64 ret = 0;
-	for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
-		if (v & BIT_ULL(i))
-			ret |= BIT_ULL(map[i]);
-	return ret;
-}
-
 static bool check_version_upgrade(struct bch_fs *c)
 {
 	unsigned latest_version	= bcachefs_metadata_version_current;
@@ -647,96 +559,6 @@ static bool check_version_upgrade(struct bch_fs *c)
 	return false;
 }
 
-u64 bch2_fsck_recovery_passes(void)
-{
-	u64 ret = 0;
-
-	for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
-		if (recovery_pass_fns[i].when & PASS_FSCK)
-			ret |= BIT_ULL(i);
-	return ret;
-}
-
-static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
-{
-	struct recovery_pass_fn *p = recovery_pass_fns + pass;
-
-	if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read)
-		return false;
-	if (c->recovery_passes_explicit & BIT_ULL(pass))
-		return true;
-	if ((p->when & PASS_FSCK) && c->opts.fsck)
-		return true;
-	if ((p->when & PASS_UNCLEAN) && !c->sb.clean)
-		return true;
-	if (p->when & PASS_ALWAYS)
-		return true;
-	return false;
-}
-
-static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
-{
-	struct recovery_pass_fn *p = recovery_pass_fns + pass;
-	int ret;
-
-	if (!(p->when & PASS_SILENT))
-		bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
-			   bch2_recovery_passes[pass]);
-	ret = p->fn(c);
-	if (ret)
-		return ret;
-	if (!(p->when & PASS_SILENT))
-		bch2_print(c, KERN_CONT " done\n");
-
-	return 0;
-}
-
-static int bch2_run_recovery_passes(struct bch_fs *c)
-{
-	int ret = 0;
-
-	while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
-		if (should_run_recovery_pass(c, c->curr_recovery_pass)) {
-			unsigned pass = c->curr_recovery_pass;
-
-			ret = bch2_run_recovery_pass(c, c->curr_recovery_pass);
-			if (bch2_err_matches(ret, BCH_ERR_restart_recovery) ||
-			    (ret && c->curr_recovery_pass < pass))
-				continue;
-			if (ret)
-				break;
-
-			c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass);
-		}
-		c->curr_recovery_pass++;
-		c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass);
-	}
-
-	return ret;
-}
-
-int bch2_run_online_recovery_passes(struct bch_fs *c)
-{
-	int ret = 0;
-
-	for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) {
-		struct recovery_pass_fn *p = recovery_pass_fns + i;
-
-		if (!(p->when & PASS_ONLINE))
-			continue;
-
-		ret = bch2_run_recovery_pass(c, i);
-		if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) {
-			i = c->curr_recovery_pass;
-			continue;
-		}
-		if (ret)
-			break;
-	}
-
-	return ret;
-}
-
 int bch2_fs_recovery(struct bch_fs *c)
 {
 	struct bch_sb_field_clean *clean = NULL;
@@ -769,24 +591,14 @@ int bch2_fs_recovery(struct bch_fs *c)
 		goto err;
 	}
 
-	if (c->opts.fsck && c->opts.norecovery) {
-		bch_err(c, "cannot select both norecovery and fsck");
-		ret = -EINVAL;
-		goto err;
-	}
+	if (c->opts.norecovery)
+		c->opts.recovery_pass_last = BCH_RECOVERY_PASS_journal_replay - 1;
 
 	if (!c->opts.nochanges) {
 		mutex_lock(&c->sb_lock);
+		struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
 		bool write_sb = false;
 
-		struct bch_sb_field_ext *ext =
-			bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64));
-		if (!ext) {
-			ret = -BCH_ERR_ENOSPC_sb;
-			mutex_unlock(&c->sb_lock);
-			goto err;
-		}
-
 		if (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb)) {
 			ext->recovery_passes_required[0] |=
 				cpu_to_le64(bch2_recovery_passes_to_stable(BIT_ULL(BCH_RECOVERY_PASS_check_topology)));
@@ -845,7 +657,7 @@ int bch2_fs_recovery(struct bch_fs *c)
 		goto err;
 	}
 
-	if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) {
+	if (!c->sb.clean || c->opts.fsck || c->opts.retain_recovery_info) {
 		struct genradix_iter iter;
 		struct journal_replay **i;
 
@@ -862,7 +674,7 @@ int bch2_fs_recovery(struct bch_fs *c)
 			goto out;
 
 		genradix_for_each_reverse(&c->journal_entries, iter, i)
-			if (*i && !(*i)->ignore) {
+			if (!journal_replay_ignore(*i)) {
 				last_journal_entry = &(*i)->j;
 				break;
 			}
@@ -887,7 +699,8 @@ int bch2_fs_recovery(struct bch_fs *c)
 			genradix_for_each_reverse(&c->journal_entries, iter, i)
 				if (*i) {
 					last_journal_entry = &(*i)->j;
-					(*i)->ignore = false;
+					(*i)->ignore_blacklisted = false;
+					(*i)->ignore_not_dirty= false;
 					/*
 					 * This was probably a NO_FLUSH entry,
 					 * so last_seq was garbage - but we know
@@ -923,10 +736,8 @@ use_clean:
 	c->journal_replay_seq_start	= last_seq;
 	c->journal_replay_seq_end	= blacklist_seq - 1;
 
-	if (c->opts.reconstruct_alloc) {
-		c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info);
-		drop_alloc_keys(&c->journal_keys);
-	}
+	if (c->opts.reconstruct_alloc)
+		bch2_reconstruct_alloc(c);
 
 	zero_out_btree_mem_ptr(&c->journal_keys);
 
@@ -950,7 +761,7 @@ use_clean:
 			bch2_journal_seq_blacklist_add(c,
 					blacklist_seq, journal_seq);
 		if (ret) {
-			bch_err(c, "error creating new journal seq blacklist entry");
+			bch_err_msg(c, ret, "error creating new journal seq blacklist entry");
 			goto err;
 		}
 	}
@@ -961,9 +772,6 @@ use_clean:
 	if (ret)
 		goto err;
 
-	if (c->opts.reconstruct_alloc)
-		bch2_journal_log_msg(c, "dropping alloc info");
-
 	/*
 	 * Skip past versions that might have possibly been used (as nonces),
 	 * but hadn't had their pointers written:
@@ -981,6 +789,12 @@ use_clean:
 
 	clear_bit(BCH_FS_fsck_running, &c->flags);
 
+	/* fsync if we fixed errors */
+	if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
+		bch2_journal_flush_all_pins(&c->journal);
+		bch2_journal_meta(&c->journal);
+	}
+
 	/* If we fixed errors, verify that fs is actually clean now: */
 	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
 	    test_bit(BCH_FS_errors_fixed, &c->flags) &&
@@ -1015,6 +829,7 @@ use_clean:
 	}
 
 	mutex_lock(&c->sb_lock);
+	struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
 	bool write_sb = false;
 
 	if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != le16_to_cpu(c->disk_sb.sb->version)) {
@@ -1028,15 +843,18 @@ use_clean:
 		write_sb = true;
 	}
 
-	if (!test_bit(BCH_FS_error, &c->flags)) {
-		struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
-		if (ext &&
-		    (!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) ||
-		     !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent)))) {
-			memset(ext->recovery_passes_required, 0, sizeof(ext->recovery_passes_required));
-			memset(ext->errors_silent, 0, sizeof(ext->errors_silent));
-			write_sb = true;
-		}
+	if (!test_bit(BCH_FS_error, &c->flags) &&
+	    !bch2_is_zero(ext->errors_silent, sizeof(ext->errors_silent))) {
+		memset(ext->errors_silent, 0, sizeof(ext->errors_silent));
+		write_sb = true;
+	}
+
+	if (c->opts.fsck &&
+	    !test_bit(BCH_FS_error, &c->flags) &&
+	    c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 &&
+	    ext->btrees_lost_data) {
+		ext->btrees_lost_data = 0;
+		write_sb = true;
 	}
 
 	if (c->opts.fsck &&
@@ -1077,9 +895,10 @@ use_clean:
 out:
 	bch2_flush_fsck_errs(c);
 
-	if (!c->opts.keep_journal &&
-	    test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
+	if (!c->opts.retain_recovery_info) {
 		bch2_journal_keys_put_initial(c);
+		bch2_find_btree_nodes_exit(&c->found_btree_nodes);
+	}
 	kfree(clean);
 
 	if (!ret &&
@@ -1105,6 +924,7 @@ int bch2_fs_initialize(struct bch_fs *c)
 	int ret;
 
 	bch_notice(c, "initializing new filesystem");
+	set_bit(BCH_FS_new_fs, &c->flags);
 
 	mutex_lock(&c->sb_lock);
 	c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done);
@@ -1119,11 +939,11 @@ int bch2_fs_initialize(struct bch_fs *c)
 	}
 	mutex_unlock(&c->sb_lock);
 
-	c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns);
+	c->curr_recovery_pass = BCH_RECOVERY_PASS_NR;
 	set_bit(BCH_FS_may_go_rw, &c->flags);
 
 	for (unsigned i = 0; i < BTREE_ID_NR; i++)
-		bch2_btree_root_alloc(c, i);
+		bch2_btree_root_alloc_fake(c, i, 0);
 
 	for_each_member_device(c, ca)
 		bch2_dev_usage_init(ca);
@@ -1194,7 +1014,7 @@ int bch2_fs_initialize(struct bch_fs *c)
 	if (ret)
 		goto err;
 
-	c->recovery_pass_done = ARRAY_SIZE(recovery_pass_fns) - 1;
+	c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1;
 
 	if (enabled_qtypes(c)) {
 		ret = bch2_fs_quota_read(c);
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
index 4e9d24719b2e..4bf818de1f2f 100644
--- a/fs/bcachefs/recovery.h
+++ b/fs/bcachefs/recovery.h
@@ -2,37 +2,9 @@
 #ifndef _BCACHEFS_RECOVERY_H
 #define _BCACHEFS_RECOVERY_H
 
-extern const char * const bch2_recovery_passes[];
+void bch2_btree_lost_data(struct bch_fs *, enum btree_id);
 
-u64 bch2_recovery_passes_to_stable(u64 v);
-u64 bch2_recovery_passes_from_stable(u64 v);
-
-/*
- * For when we need to rewind recovery passes and run a pass we skipped:
- */
-static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c,
-						  enum bch_recovery_pass pass)
-{
-	if (c->recovery_passes_explicit & BIT_ULL(pass))
-		return 0;
-
-	bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
-		 bch2_recovery_passes[pass], pass,
-		 bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
-
-	c->recovery_passes_explicit |= BIT_ULL(pass);
-
-	if (c->curr_recovery_pass >= pass) {
-		c->curr_recovery_pass = pass;
-		c->recovery_passes_complete &= (1ULL << pass) >> 1;
-		return -BCH_ERR_restart_recovery;
-	} else {
-		return 0;
-	}
-}
-
-int bch2_run_online_recovery_passes(struct bch_fs *);
-u64 bch2_fsck_recovery_passes(void);
+int bch2_journal_replay(struct bch_fs *);
 
 int bch2_fs_recovery(struct bch_fs *);
 int bch2_fs_initialize(struct bch_fs *);
diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c
new file mode 100644
index 000000000000..0cec0f7d9703
--- /dev/null
+++ b/fs/bcachefs/recovery_passes.c
@@ -0,0 +1,249 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "bcachefs.h"
+#include "alloc_background.h"
+#include "backpointers.h"
+#include "btree_gc.h"
+#include "btree_node_scan.h"
+#include "ec.h"
+#include "fsck.h"
+#include "inode.h"
+#include "journal.h"
+#include "lru.h"
+#include "logged_ops.h"
+#include "rebalance.h"
+#include "recovery.h"
+#include "recovery_passes.h"
+#include "snapshot.h"
+#include "subvolume.h"
+#include "super.h"
+#include "super-io.h"
+
+const char * const bch2_recovery_passes[] = {
+#define x(_fn, ...)	#_fn,
+	BCH_RECOVERY_PASSES()
+#undef x
+	NULL
+};
+
+static int bch2_check_allocations(struct bch_fs *c)
+{
+	return bch2_gc(c, true, false);
+}
+
+static int bch2_set_may_go_rw(struct bch_fs *c)
+{
+	struct journal_keys *keys = &c->journal_keys;
+
+	/*
+	 * After we go RW, the journal keys buffer can't be modified (except for
+	 * setting journal_key->overwritten: it will be accessed by multiple
+	 * threads
+	 */
+	move_gap(keys, keys->nr);
+
+	set_bit(BCH_FS_may_go_rw, &c->flags);
+
+	if (keys->nr || c->opts.fsck || !c->sb.clean || c->recovery_passes_explicit)
+		return bch2_fs_read_write_early(c);
+	return 0;
+}
+
+struct recovery_pass_fn {
+	int		(*fn)(struct bch_fs *);
+	unsigned	when;
+};
+
+static struct recovery_pass_fn recovery_pass_fns[] = {
+#define x(_fn, _id, _when)	{ .fn = bch2_##_fn, .when = _when },
+	BCH_RECOVERY_PASSES()
+#undef x
+};
+
+static const u8 passes_to_stable_map[] = {
+#define x(n, id, ...)	[BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n,
+	BCH_RECOVERY_PASSES()
+#undef x
+};
+
+static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass)
+{
+	return passes_to_stable_map[pass];
+}
+
+u64 bch2_recovery_passes_to_stable(u64 v)
+{
+	u64 ret = 0;
+	for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++)
+		if (v & BIT_ULL(i))
+			ret |= BIT_ULL(passes_to_stable_map[i]);
+	return ret;
+}
+
+u64 bch2_recovery_passes_from_stable(u64 v)
+{
+	static const u8 map[] = {
+#define x(n, id, ...)	[BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n,
+	BCH_RECOVERY_PASSES()
+#undef x
+	};
+
+	u64 ret = 0;
+	for (unsigned i = 0; i < ARRAY_SIZE(map); i++)
+		if (v & BIT_ULL(i))
+			ret |= BIT_ULL(map[i]);
+	return ret;
+}
+
+/*
+ * For when we need to rewind recovery passes and run a pass we skipped:
+ */
+int bch2_run_explicit_recovery_pass(struct bch_fs *c,
+				    enum bch_recovery_pass pass)
+{
+	if (c->recovery_passes_explicit & BIT_ULL(pass))
+		return 0;
+
+	bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)",
+		 bch2_recovery_passes[pass], pass,
+		 bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass);
+
+	c->recovery_passes_explicit |= BIT_ULL(pass);
+
+	if (c->curr_recovery_pass >= pass) {
+		c->curr_recovery_pass = pass;
+		c->recovery_passes_complete &= (1ULL << pass) >> 1;
+		return -BCH_ERR_restart_recovery;
+	} else {
+		return 0;
+	}
+}
+
+int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c,
+					       enum bch_recovery_pass pass)
+{
+	enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass);
+
+	mutex_lock(&c->sb_lock);
+	struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+	if (!test_bit_le64(s, ext->recovery_passes_required)) {
+		__set_bit_le64(s, ext->recovery_passes_required);
+		bch2_write_super(c);
+	}
+	mutex_unlock(&c->sb_lock);
+
+	return bch2_run_explicit_recovery_pass(c, pass);
+}
+
+static void bch2_clear_recovery_pass_required(struct bch_fs *c,
+					      enum bch_recovery_pass pass)
+{
+	enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass);
+
+	mutex_lock(&c->sb_lock);
+	struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+	if (test_bit_le64(s, ext->recovery_passes_required)) {
+		__clear_bit_le64(s, ext->recovery_passes_required);
+		bch2_write_super(c);
+	}
+	mutex_unlock(&c->sb_lock);
+}
+
+u64 bch2_fsck_recovery_passes(void)
+{
+	u64 ret = 0;
+
+	for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++)
+		if (recovery_pass_fns[i].when & PASS_FSCK)
+			ret |= BIT_ULL(i);
+	return ret;
+}
+
+static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
+{
+	struct recovery_pass_fn *p = recovery_pass_fns + pass;
+
+	if (c->recovery_passes_explicit & BIT_ULL(pass))
+		return true;
+	if ((p->when & PASS_FSCK) && c->opts.fsck)
+		return true;
+	if ((p->when & PASS_UNCLEAN) && !c->sb.clean)
+		return true;
+	if (p->when & PASS_ALWAYS)
+		return true;
+	return false;
+}
+
+static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
+{
+	struct recovery_pass_fn *p = recovery_pass_fns + pass;
+	int ret;
+
+	if (!(p->when & PASS_SILENT))
+		bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
+			   bch2_recovery_passes[pass]);
+	ret = p->fn(c);
+	if (ret)
+		return ret;
+	if (!(p->when & PASS_SILENT))
+		bch2_print(c, KERN_CONT " done\n");
+
+	return 0;
+}
+
+int bch2_run_online_recovery_passes(struct bch_fs *c)
+{
+	int ret = 0;
+
+	for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) {
+		struct recovery_pass_fn *p = recovery_pass_fns + i;
+
+		if (!(p->when & PASS_ONLINE))
+			continue;
+
+		ret = bch2_run_recovery_pass(c, i);
+		if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) {
+			i = c->curr_recovery_pass;
+			continue;
+		}
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
+int bch2_run_recovery_passes(struct bch_fs *c)
+{
+	int ret = 0;
+
+	while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
+		if (c->opts.recovery_pass_last &&
+		    c->curr_recovery_pass > c->opts.recovery_pass_last)
+			break;
+
+		if (should_run_recovery_pass(c, c->curr_recovery_pass)) {
+			unsigned pass = c->curr_recovery_pass;
+
+			ret = bch2_run_recovery_pass(c, c->curr_recovery_pass);
+			if (bch2_err_matches(ret, BCH_ERR_restart_recovery) ||
+			    (ret && c->curr_recovery_pass < pass))
+				continue;
+			if (ret)
+				break;
+
+			c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass);
+		}
+
+		c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass);
+
+		if (!test_bit(BCH_FS_error, &c->flags))
+			bch2_clear_recovery_pass_required(c, c->curr_recovery_pass);
+
+		c->curr_recovery_pass++;
+	}
+
+	return ret;
+}
diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h
new file mode 100644
index 000000000000..99b464e127b8
--- /dev/null
+++ b/fs/bcachefs/recovery_passes.h
@@ -0,0 +1,17 @@
+#ifndef _BCACHEFS_RECOVERY_PASSES_H
+#define _BCACHEFS_RECOVERY_PASSES_H
+
+extern const char * const bch2_recovery_passes[];
+
+u64 bch2_recovery_passes_to_stable(u64 v);
+u64 bch2_recovery_passes_from_stable(u64 v);
+
+u64 bch2_fsck_recovery_passes(void);
+
+int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass);
+int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass);
+
+int bch2_run_online_recovery_passes(struct bch_fs *);
+int bch2_run_recovery_passes(struct bch_fs *);
+
+#endif /* _BCACHEFS_RECOVERY_PASSES_H */
diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_passes_types.h
index fa0c8efd2a1b..773aea9a0080 100644
--- a/fs/bcachefs/recovery_types.h
+++ b/fs/bcachefs/recovery_passes_types.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_RECOVERY_TYPES_H
-#define _BCACHEFS_RECOVERY_TYPES_H
+#ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H
+#define _BCACHEFS_RECOVERY_PASSES_TYPES_H
 
 #define PASS_SILENT		BIT(0)
 #define PASS_FSCK		BIT(1)
@@ -13,11 +13,12 @@
  * must never change:
  */
 #define BCH_RECOVERY_PASSES()							\
+	x(scan_for_btree_nodes,			37, 0)				\
+	x(check_topology,			 4, 0)				\
 	x(alloc_read,				 0, PASS_ALWAYS)		\
 	x(stripes_read,				 1, PASS_ALWAYS)		\
 	x(initialize_subvolumes,		 2, 0)				\
 	x(snapshots_read,			 3, PASS_ALWAYS)		\
-	x(check_topology,			 4, 0)				\
 	x(check_allocations,			 5, PASS_FSCK)			\
 	x(trans_mark_dev_sbs,			 6, PASS_ALWAYS|PASS_SILENT)	\
 	x(fs_journal_alloc,			 7, PASS_ALWAYS|PASS_SILENT)	\
@@ -31,20 +32,23 @@
 	x(check_alloc_to_lru_refs,		15, PASS_ONLINE|PASS_FSCK)	\
 	x(fs_freespace_init,			16, PASS_ALWAYS|PASS_SILENT)	\
 	x(bucket_gens_init,			17, 0)				\
+	x(reconstruct_snapshots,		38, 0)				\
 	x(check_snapshot_trees,			18, PASS_ONLINE|PASS_FSCK)	\
 	x(check_snapshots,			19, PASS_ONLINE|PASS_FSCK)	\
 	x(check_subvols,			20, PASS_ONLINE|PASS_FSCK)	\
+	x(check_subvol_children,		35, PASS_ONLINE|PASS_FSCK)	\
 	x(delete_dead_snapshots,		21, PASS_ONLINE|PASS_FSCK)	\
 	x(fs_upgrade_for_subvolumes,		22, 0)				\
-	x(resume_logged_ops,			23, PASS_ALWAYS)		\
 	x(check_inodes,				24, PASS_FSCK)			\
 	x(check_extents,			25, PASS_FSCK)			\
 	x(check_indirect_extents,		26, PASS_FSCK)			\
 	x(check_dirents,			27, PASS_FSCK)			\
 	x(check_xattrs,				28, PASS_FSCK)			\
 	x(check_root,				29, PASS_ONLINE|PASS_FSCK)	\
+	x(check_subvolume_structure,		36, PASS_ONLINE|PASS_FSCK)	\
 	x(check_directory_structure,		30, PASS_ONLINE|PASS_FSCK)	\
 	x(check_nlinks,				31, PASS_FSCK)			\
+	x(resume_logged_ops,			23, PASS_ALWAYS)		\
 	x(delete_dead_inodes,			32, PASS_FSCK|PASS_UNCLEAN)	\
 	x(fix_reflink_p,			33, 0)				\
 	x(set_fs_needs_rebalance,		34, 0)				\
@@ -54,6 +58,7 @@ enum bch_recovery_pass {
 #define x(n, id, when)	BCH_RECOVERY_PASS_##n,
 	BCH_RECOVERY_PASSES()
 #undef x
+	BCH_RECOVERY_PASS_NR
 };
 
 /* But we also need stable identifiers that can be used in the superblock */
@@ -63,4 +68,4 @@ enum bch_recovery_pass_stable {
 #undef x
 };
 
-#endif /* _BCACHEFS_RECOVERY_TYPES_H */
+#endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index c47c66c2b394..ff7864731a07 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -185,8 +185,7 @@ not_found:
 		} else {
 			bkey_error_init(update);
 			update->k.p		= p.k->p;
-			update->k.p.offset	= next_idx;
-			update->k.size		= next_idx - *idx;
+			update->k.size		= p.k->size;
 			set_bkey_val_u64s(&update->k, 0);
 		}
 
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index cc2672c12031..678b9c20e251 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -6,12 +6,15 @@
 #include "replicas.h"
 #include "super-io.h"
 
+#include <linux/sort.h>
+
 static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
 					    struct bch_replicas_cpu *);
 
 /* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
-static int bch2_memcmp(const void *l, const void *r, size_t size)
+static int bch2_memcmp(const void *l, const void *r,  const void *priv)
 {
+	size_t size = (size_t) priv;
 	return memcmp(l, r, size);
 }
 
@@ -39,7 +42,8 @@ void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
 
 static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
 {
-	eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL);
+	eytzinger0_sort_r(r->entries, r->nr, r->entry_size,
+			  bch2_memcmp, NULL, (void *)(size_t)r->entry_size);
 }
 
 static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
@@ -228,7 +232,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
 
 	verify_replicas_entry(search);
 
-#define entry_cmp(_l, _r, size)	memcmp(_l, _r, entry_size)
+#define entry_cmp(_l, _r)	memcmp(_l, _r, entry_size)
 	idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
 			      entry_cmp, search);
 #undef entry_cmp
@@ -824,10 +828,11 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
 {
 	unsigned i;
 
-	sort_cmp_size(cpu_r->entries,
-		      cpu_r->nr,
-		      cpu_r->entry_size,
-		      bch2_memcmp, NULL);
+	sort_r(cpu_r->entries,
+	       cpu_r->nr,
+	       cpu_r->entry_size,
+	       bch2_memcmp, NULL,
+	       (void *)(size_t)cpu_r->entry_size);
 
 	for (i = 0; i < cpu_r->nr; i++) {
 		struct bch_replicas_entry_v1 *e =
diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c
index b6bf0ebe7e84..5980ba2563fe 100644
--- a/fs/bcachefs/sb-clean.c
+++ b/fs/bcachefs/sb-clean.c
@@ -171,22 +171,6 @@ fsck_err:
 	return ERR_PTR(ret);
 }
 
-static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size)
-{
-	struct jset_entry *entry = *end;
-	unsigned u64s = DIV_ROUND_UP(size, sizeof(u64));
-
-	memset(entry, 0, u64s * sizeof(u64));
-	/*
-	 * The u64s field counts from the start of data, ignoring the shared
-	 * fields.
-	 */
-	entry->u64s = cpu_to_le16(u64s - 1);
-
-	*end = vstruct_next(*end);
-	return entry;
-}
-
 void bch2_journal_super_entries_add_common(struct bch_fs *c,
 					   struct jset_entry **end,
 					   u64 journal_seq)
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index 441dcb1bf160..a98ef940b7a3 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -7,7 +7,7 @@
 
 #include "bcachefs.h"
 #include "darray.h"
-#include "recovery.h"
+#include "recovery_passes.h"
 #include "sb-downgrade.h"
 #include "sb-errors.h"
 #include "super-io.h"
@@ -45,7 +45,16 @@
 	  BIT_ULL(BCH_RECOVERY_PASS_check_inodes),		\
 	  BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list)	\
 	x(rebalance_work,					\
-	  BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance))
+	  BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance))	\
+	x(subvolume_fs_parent,					\
+	  BIT_ULL(BCH_RECOVERY_PASS_check_dirents),		\
+	  BCH_FSCK_ERR_subvol_fs_path_parent_wrong)		\
+	x(btree_subvolume_children,				\
+	  BIT_ULL(BCH_RECOVERY_PASS_check_subvols),		\
+	  BCH_FSCK_ERR_subvol_children_not_set)			\
+	x(mi_btree_bitmap,					\
+	  BIT_ULL(BCH_RECOVERY_PASS_check_allocations),		\
+	  BCH_FSCK_ERR_btree_bitmap_not_marked)
 
 #define DOWNGRADE_TABLE()
 
@@ -253,7 +262,7 @@ void bch2_sb_set_downgrade(struct bch_fs *c, unsigned new_minor, unsigned old_mi
 				if (e < BCH_SB_ERR_MAX)
 					__set_bit(e, c->sb.errors_silent);
 				if (e < sizeof(ext->errors_silent) * 8)
-					ext->errors_silent[e / 64] |= cpu_to_le64(BIT_ULL(e % 64));
+					__set_bit_le64(e, ext->errors_silent);
 			}
 		}
 	}
diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h
index c08aacdfd073..4ca6e7b0d8aa 100644
--- a/fs/bcachefs/sb-errors_types.h
+++ b/fs/bcachefs/sb-errors_types.h
@@ -130,7 +130,7 @@
 	x(bucket_gens_nonzero_for_invalid_buckets,		122)	\
 	x(need_discard_freespace_key_to_invalid_dev_bucket,	123)	\
 	x(need_discard_freespace_key_bad,			124)	\
-	x(backpointer_pos_wrong,				125)	\
+	x(backpointer_bucket_offset_wrong,			125)	\
 	x(backpointer_to_missing_device,			126)	\
 	x(backpointer_to_missing_alloc,				127)	\
 	x(backpointer_to_missing_ptr,				128)	\
@@ -231,7 +231,7 @@
 	x(dirent_name_dot_or_dotdot,				223)	\
 	x(dirent_name_has_slash,				224)	\
 	x(dirent_d_type_wrong,					225)	\
-	x(dirent_d_parent_subvol_wrong,				226)	\
+	x(inode_bi_parent_wrong,				226)	\
 	x(dirent_in_missing_dir_inode,				227)	\
 	x(dirent_in_non_dir_inode,				228)	\
 	x(dirent_to_missing_inode,				229)	\
@@ -250,7 +250,28 @@
 	x(hash_table_key_duplicate,				242)	\
 	x(hash_table_key_wrong_offset,				243)	\
 	x(unlinked_inode_not_on_deleted_list,			244)	\
-	x(reflink_p_front_pad_bad,				245)
+	x(reflink_p_front_pad_bad,				245)	\
+	x(journal_entry_dup_same_device,			246)	\
+	x(inode_bi_subvol_missing,				247)	\
+	x(inode_bi_subvol_wrong,				248)	\
+	x(inode_points_to_missing_dirent,			249)	\
+	x(inode_points_to_wrong_dirent,				250)	\
+	x(inode_bi_parent_nonzero,				251)	\
+	x(dirent_to_missing_parent_subvol,			252)	\
+	x(dirent_not_visible_in_parent_subvol,			253)	\
+	x(subvol_fs_path_parent_wrong,				254)	\
+	x(subvol_root_fs_path_parent_nonzero,			255)	\
+	x(subvol_children_not_set,				256)	\
+	x(subvol_children_bad,					257)	\
+	x(subvol_loop,						258)	\
+	x(subvol_unreachable,					259)	\
+	x(btree_node_bkey_bad_u64s,				260)	\
+	x(btree_node_topology_empty_interior_node,		261)	\
+	x(btree_ptr_v2_min_key_bad,				262)	\
+	x(btree_root_unreadable_and_scan_found_nothing,		263)	\
+	x(snapshot_node_missing,				264)	\
+	x(dup_backpointer_to_bad_csum_extent,			265)	\
+	x(btree_bitmap_not_marked,				266)
 
 enum bch_sb_error_id {
 #define x(t, n) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index eff5ce18c69c..522a969345e5 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include "bcachefs.h"
+#include "btree_cache.h"
 #include "disk_groups.h"
 #include "opts.h"
 #include "replicas.h"
@@ -426,3 +427,55 @@ void bch2_dev_errors_reset(struct bch_dev *ca)
 	bch2_write_super(c);
 	mutex_unlock(&c->sb_lock);
 }
+
+/*
+ * Per member "range has btree nodes" bitmap:
+ *
+ * This is so that if we ever have to run the btree node scan to repair we don't
+ * have to scan full devices:
+ */
+
+bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k)
+{
+	bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr)
+		if (!bch2_dev_btree_bitmap_marked_sectors(bch_dev_bkey_exists(c, ptr->dev),
+							  ptr->offset, btree_sectors(c)))
+			return false;
+	return true;
+}
+
+static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev,
+				u64 start, unsigned sectors)
+{
+	struct bch_member *m = __bch2_members_v2_get_mut(mi, dev);
+	u64 bitmap = le64_to_cpu(m->btree_allocated_bitmap);
+
+	u64 end = start + sectors;
+
+	int resize = ilog2(roundup_pow_of_two(end)) - (m->btree_bitmap_shift + 6);
+	if (resize > 0) {
+		u64 new_bitmap = 0;
+
+		for (unsigned i = 0; i < 64; i++)
+			if (bitmap & BIT_ULL(i))
+				new_bitmap |= BIT_ULL(i >> resize);
+		bitmap = new_bitmap;
+		m->btree_bitmap_shift += resize;
+	}
+
+	for (unsigned bit = sectors >> m->btree_bitmap_shift;
+	     bit << m->btree_bitmap_shift < end;
+	     bit++)
+		bitmap |= BIT_ULL(bit);
+
+	m->btree_allocated_bitmap = cpu_to_le64(bitmap);
+}
+
+void bch2_dev_btree_bitmap_mark(struct bch_fs *c, struct bkey_s_c k)
+{
+	lockdep_assert_held(&c->sb_lock);
+
+	struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+	bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr)
+		__bch2_dev_btree_bitmap_mark(mi, ptr->dev, ptr->offset, btree_sectors(c));
+}
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
index be0a94183271..b27c3e4467cf 100644
--- a/fs/bcachefs/sb-members.h
+++ b/fs/bcachefs/sb-members.h
@@ -3,6 +3,7 @@
 #define _BCACHEFS_SB_MEMBERS_H
 
 #include "darray.h"
+#include "bkey_types.h"
 
 extern char * const bch2_member_error_strs[];
 
@@ -220,6 +221,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi)
 			: 1,
 		.freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi),
 		.valid		= bch2_member_exists(mi),
+		.btree_bitmap_shift	= mi->btree_bitmap_shift,
+		.btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap),
 	};
 }
 
@@ -228,4 +231,22 @@ void bch2_sb_members_from_cpu(struct bch_fs *);
 void bch2_dev_io_errors_to_text(struct printbuf *, struct bch_dev *);
 void bch2_dev_errors_reset(struct bch_dev *);
 
+static inline bool bch2_dev_btree_bitmap_marked_sectors(struct bch_dev *ca, u64 start, unsigned sectors)
+{
+	u64 end = start + sectors;
+
+	if (end > 64 << ca->mi.btree_bitmap_shift)
+		return false;
+
+	for (unsigned bit = sectors >> ca->mi.btree_bitmap_shift;
+	     bit << ca->mi.btree_bitmap_shift < end;
+	     bit++)
+		if (!(ca->mi.btree_allocated_bitmap & BIT_ULL(bit)))
+			return false;
+	return true;
+}
+
+bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c);
+void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c);
+
 #endif /* _BCACHEFS_SB_MEMBERS_H */
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index ac6ba04d5521..544322d5c251 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -8,6 +8,7 @@
 #include "errcode.h"
 #include "error.h"
 #include "fs.h"
+#include "recovery_passes.h"
 #include "snapshot.h"
 
 #include <linux/random.h>
@@ -91,23 +92,29 @@ static int bch2_snapshot_tree_create(struct btree_trans *trans,
 
 /* Snapshot nodes: */
 
-static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
+static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
 {
-	struct snapshot_table *t;
+	while (id && id < ancestor) {
+		const struct snapshot_t *s = __snapshot_t(t, id);
+		id = s ? s->parent : 0;
+	}
+	return id == ancestor;
+}
 
+static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
+{
 	rcu_read_lock();
-	t = rcu_dereference(c->snapshots);
-
-	while (id && id < ancestor)
-		id = __snapshot_t(t, id)->parent;
+	bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
 	rcu_read_unlock();
 
-	return id == ancestor;
+	return ret;
 }
 
 static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
 {
 	const struct snapshot_t *s = __snapshot_t(t, id);
+	if (!s)
+		return 0;
 
 	if (s->skip[2] <= ancestor)
 		return s->skip[2];
@@ -118,27 +125,36 @@ static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ances
 	return s->parent;
 }
 
+static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
+{
+	const struct snapshot_t *s = __snapshot_t(t, id);
+	if (!s)
+		return false;
+
+	return test_bit(ancestor - id - 1, s->is_ancestor);
+}
+
 bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
 {
-	struct snapshot_table *t;
 	bool ret;
 
-	EBUG_ON(c->recovery_pass_done <= BCH_RECOVERY_PASS_check_snapshots);
-
 	rcu_read_lock();
-	t = rcu_dereference(c->snapshots);
+	struct snapshot_table *t = rcu_dereference(c->snapshots);
+
+	if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) {
+		ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
+		goto out;
+	}
 
 	while (id && id < ancestor - IS_ANCESTOR_BITMAP)
 		id = get_ancestor_below(t, id, ancestor);
 
-	if (id && id < ancestor) {
-		ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor);
-
-		EBUG_ON(ret != bch2_snapshot_is_ancestor_early(c, id, ancestor));
-	} else {
-		ret = id == ancestor;
-	}
+	ret = id && id < ancestor
+		? test_ancestor_bitmap(t, id, ancestor)
+		: id == ancestor;
 
+	EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
+out:
 	rcu_read_unlock();
 
 	return ret;
@@ -147,36 +163,39 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
 static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
 {
 	size_t idx = U32_MAX - id;
-	size_t new_size;
 	struct snapshot_table *new, *old;
 
-	new_size = max(16UL, roundup_pow_of_two(idx + 1));
+	size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
+	size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);
 
-	new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL);
+	new = kvzalloc(new_bytes, GFP_KERNEL);
 	if (!new)
 		return NULL;
 
+	new->nr = new_size;
+
 	old = rcu_dereference_protected(c->snapshots, true);
 	if (old)
-		memcpy(new->s,
-		       rcu_dereference_protected(c->snapshots, true)->s,
-		       sizeof(new->s[0]) * c->snapshot_table_size);
+		memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr);
 
 	rcu_assign_pointer(c->snapshots, new);
-	c->snapshot_table_size = new_size;
-	kvfree_rcu_mightsleep(old);
+	kvfree_rcu(old, rcu);
 
-	return &rcu_dereference_protected(c->snapshots, true)->s[idx];
+	return &rcu_dereference_protected(c->snapshots,
+				lockdep_is_held(&c->snapshot_table_lock))->s[idx];
 }
 
 static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
 {
 	size_t idx = U32_MAX - id;
+	struct snapshot_table *table =
+		rcu_dereference_protected(c->snapshots,
+				lockdep_is_held(&c->snapshot_table_lock));
 
 	lockdep_assert_held(&c->snapshot_table_lock);
 
-	if (likely(idx < c->snapshot_table_size))
-		return &rcu_dereference_protected(c->snapshots, true)->s[idx];
+	if (likely(table && idx < table->nr))
+		return &table->s[idx];
 
 	return __snapshot_t_mut(c, id);
 }
@@ -547,7 +566,7 @@ static int check_snapshot_tree(struct btree_trans *trans,
 			"snapshot tree points to missing subvolume:\n  %s",
 			(printbuf_reset(&buf),
 			 bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
-	    fsck_err_on(!bch2_snapshot_is_ancestor_early(c,
+	    fsck_err_on(!bch2_snapshot_is_ancestor(c,
 						le32_to_cpu(subvol.snapshot),
 						root_id),
 			c, snapshot_tree_to_wrong_subvol,
@@ -563,6 +582,13 @@ static int check_snapshot_tree(struct btree_trans *trans,
 		u32 subvol_id;
 
 		ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
+		bch_err_fn(c, ret);
+
+		if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */
+			ret = 0;
+			goto err;
+		}
+
 		if (ret)
 			goto err;
 
@@ -720,7 +746,6 @@ static int check_snapshot(struct btree_trans *trans,
 	u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
 	u32 real_depth;
 	struct printbuf buf = PRINTBUF;
-	bool should_have_subvol;
 	u32 i, id;
 	int ret = 0;
 
@@ -766,7 +791,7 @@ static int check_snapshot(struct btree_trans *trans,
 		}
 	}
 
-	should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
+	bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
 		!BCH_SNAPSHOT_DELETED(&s);
 
 	if (should_have_subvol) {
@@ -868,6 +893,154 @@ int bch2_check_snapshots(struct bch_fs *c)
 	return ret;
 }
 
+static int check_snapshot_exists(struct btree_trans *trans, u32 id)
+{
+	struct bch_fs *c = trans->c;
+
+	if (bch2_snapshot_equiv(c, id))
+		return 0;
+
+	u32 tree_id;
+	int ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
+	if (ret)
+		return ret;
+
+	struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
+	ret = PTR_ERR_OR_ZERO(snapshot);
+	if (ret)
+		return ret;
+
+	bkey_snapshot_init(&snapshot->k_i);
+	snapshot->k.p		= POS(0, id);
+	snapshot->v.tree	= cpu_to_le32(tree_id);
+	snapshot->v.btime.lo	= cpu_to_le64(bch2_current_time(c));
+
+	return  bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
+		bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
+				   bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?:
+		bch2_snapshot_set_equiv(trans, bkey_i_to_s_c(&snapshot->k_i));
+}
+
+/* Figure out which snapshot nodes belong in the same tree: */
+struct snapshot_tree_reconstruct {
+	enum btree_id			btree;
+	struct bpos			cur_pos;
+	snapshot_id_list		cur_ids;
+	DARRAY(snapshot_id_list)	trees;
+};
+
+static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
+{
+	darray_for_each(r->trees, i)
+		darray_exit(i);
+	darray_exit(&r->trees);
+	darray_exit(&r->cur_ids);
+}
+
+static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos)
+{
+	return r->btree == BTREE_ID_inodes
+		? r->cur_pos.offset == pos.offset
+		: r->cur_pos.inode == pos.inode;
+}
+
+static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
+{
+	darray_for_each(*l, i)
+		if (snapshot_list_has_id(r, *i))
+			return true;
+	return false;
+}
+
+static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
+{
+	bool first = true;
+	darray_for_each(*s, i) {
+		if (!first)
+			prt_char(out, ' ');
+		first = false;
+		prt_printf(out, "%u", *i);
+	}
+}
+
+static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r)
+{
+	if (r->cur_ids.nr) {
+		darray_for_each(r->trees, i)
+			if (snapshot_id_lists_have_common(i, &r->cur_ids)) {
+				int ret = snapshot_list_merge(c, i, &r->cur_ids);
+				if (ret)
+					return ret;
+				goto out;
+			}
+		darray_push(&r->trees, r->cur_ids);
+		darray_init(&r->cur_ids);
+	}
+out:
+	r->cur_ids.nr = 0;
+	return 0;
+}
+
+static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos)
+{
+	if (!same_snapshot(r, pos))
+		snapshot_tree_reconstruct_next(c, r);
+	r->cur_pos = pos;
+	return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot);
+}
+
+int bch2_reconstruct_snapshots(struct bch_fs *c)
+{
+	struct btree_trans *trans = bch2_trans_get(c);
+	struct printbuf buf = PRINTBUF;
+	struct snapshot_tree_reconstruct r = {};
+	int ret = 0;
+
+	for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
+		if (btree_type_has_snapshots(btree)) {
+			r.btree = btree;
+
+			ret = for_each_btree_key(trans, iter, btree, POS_MIN,
+					BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_PREFETCH, k, ({
+				get_snapshot_trees(c, &r, k.k->p);
+			}));
+			if (ret)
+				goto err;
+
+			snapshot_tree_reconstruct_next(c, &r);
+		}
+	}
+
+	darray_for_each(r.trees, t) {
+		printbuf_reset(&buf);
+		snapshot_id_list_to_text(&buf, t);
+
+		darray_for_each(*t, id) {
+			if (fsck_err_on(!bch2_snapshot_equiv(c, *id),
+					c, snapshot_node_missing,
+					"snapshot node %u from tree %s missing", *id, buf.buf)) {
+				if (t->nr > 1) {
+					bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
+					ret = -BCH_ERR_fsck_repair_unimplemented;
+					goto err;
+				}
+
+				ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+						check_snapshot_exists(trans, *id));
+				if (ret)
+					goto err;
+			}
+		}
+	}
+fsck_err:
+err:
+	bch2_trans_put(trans);
+	snapshot_tree_reconstruct_exit(&r);
+	printbuf_exit(&buf);
+	bch_err_fn(c, ret);
+	return ret;
+}
+
 /*
  * Mark a snapshot as deleted, for future cleanup:
  */
@@ -1678,6 +1851,20 @@ int bch2_snapshots_read(struct bch_fs *c)
 				   POS_MIN, 0, k,
 			   (set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
 	bch_err_fn(c, ret);
+
+	/*
+	 * It's important that we check if we need to reconstruct snapshots
+	 * before going RW, so we mark that pass as required in the superblock -
+	 * otherwise, we could end up deleting keys with missing snapshot nodes
+	 * instead
+	 */
+	BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
+	       test_bit(BCH_FS_may_go_rw, &c->flags));
+
+	if (bch2_err_matches(ret, EIO) ||
+	    (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)))
+		ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots);
+
 	return ret;
 }
 
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
index 7c66ffc06385..b7d2fed37c4f 100644
--- a/fs/bcachefs/snapshot.h
+++ b/fs/bcachefs/snapshot.h
@@ -33,7 +33,11 @@ int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned,
 
 static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id)
 {
-	return &t->s[U32_MAX - id];
+	u32 idx = U32_MAX - id;
+
+	return likely(t && idx < t->nr)
+		? &t->s[idx]
+		: NULL;
 }
 
 static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
@@ -44,7 +48,8 @@ static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id)
 static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
 {
 	rcu_read_lock();
-	id = snapshot_t(c, id)->tree;
+	const struct snapshot_t *s = snapshot_t(c, id);
+	id = s ? s->tree : 0;
 	rcu_read_unlock();
 
 	return id;
@@ -52,7 +57,8 @@ static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id)
 
 static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
 {
-	return snapshot_t(c, id)->parent;
+	const struct snapshot_t *s = snapshot_t(c, id);
+	return s ? s->parent : 0;
 }
 
 static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
@@ -66,19 +72,19 @@ static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id)
 
 static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id)
 {
-#ifdef CONFIG_BCACHEFS_DEBUG
-	u32 parent = snapshot_t(c, id)->parent;
+	const struct snapshot_t *s = snapshot_t(c, id);
+	if (!s)
+		return 0;
 
-	if (parent &&
-	    snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1)
+	u32 parent = s->parent;
+	if (IS_ENABLED(CONFIG_BCACHEFS_DEBU) &&
+	    parent &&
+	    s->depth != snapshot_t(c, parent)->depth + 1)
 		panic("id %u depth=%u parent %u depth=%u\n",
 		      id, snapshot_t(c, id)->depth,
 		      parent, snapshot_t(c, parent)->depth);
 
 	return parent;
-#else
-	return snapshot_t(c, id)->parent;
-#endif
 }
 
 static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id)
@@ -116,7 +122,8 @@ static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id)
 
 static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id)
 {
-	return snapshot_t(c, id)->equiv;
+	const struct snapshot_t *s = snapshot_t(c, id);
+	return s ? s->equiv : 0;
 }
 
 static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id)
@@ -133,38 +140,22 @@ static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id)
 	return id == bch2_snapshot_equiv(c, id);
 }
 
-static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
+static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id)
 {
-	const struct snapshot_t *s;
-	bool ret;
-
 	rcu_read_lock();
-	s = snapshot_t(c, id);
-	ret = s->children[0];
+	const struct snapshot_t *s = snapshot_t(c, id);
+	int ret = s ? s->children[0] : -BCH_ERR_invalid_snapshot_node;
 	rcu_read_unlock();
 
 	return ret;
 }
 
-static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
-{
-	return !bch2_snapshot_is_internal_node(c, id);
-}
-
-static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id)
+static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id)
 {
-	const struct snapshot_t *s;
-	u32 parent = __bch2_snapshot_parent(c, id);
-
-	if (!parent)
-		return 0;
-
-	s = snapshot_t(c, __bch2_snapshot_parent(c, id));
-	if (id == s->children[0])
-		return s->children[1];
-	if (id == s->children[1])
-		return s->children[0];
-	return 0;
+	int ret = bch2_snapshot_is_internal_node(c, id);
+	if (ret < 0)
+		return ret;
+	return !ret;
 }
 
 static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent)
@@ -218,15 +209,34 @@ static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list
 
 static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id)
 {
-	int ret;
-
 	BUG_ON(snapshot_list_has_id(s, id));
-	ret = darray_push(s, id);
+	int ret = darray_push(s, id);
 	if (ret)
 		bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
 	return ret;
 }
 
+static inline int snapshot_list_add_nodup(struct bch_fs *c, snapshot_id_list *s, u32 id)
+{
+	int ret = snapshot_list_has_id(s, id)
+		? 0
+		: darray_push(s, id);
+	if (ret)
+		bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size);
+	return ret;
+}
+
+static inline int snapshot_list_merge(struct bch_fs *c, snapshot_id_list *dst, snapshot_id_list *src)
+{
+	darray_for_each(*src, i) {
+		int ret = snapshot_list_add_nodup(c, dst, *i);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
 int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
 			 struct bch_snapshot *s);
 int bch2_snapshot_get_subvol(struct btree_trans *, u32,
@@ -238,6 +248,7 @@ int bch2_snapshot_node_create(struct btree_trans *, u32,
 
 int bch2_check_snapshot_trees(struct bch_fs *);
 int bch2_check_snapshots(struct bch_fs *);
+int bch2_reconstruct_snapshots(struct bch_fs *);
 
 int bch2_snapshot_node_set_deleted(struct btree_trans *, u32);
 void bch2_delete_dead_snapshots_work(struct work_struct *);
@@ -249,7 +260,7 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
 					  struct bpos pos)
 {
 	if (!btree_type_has_snapshots(id) ||
-	    bch2_snapshot_is_leaf(trans->c, pos.snapshot))
+	    bch2_snapshot_is_leaf(trans->c, pos.snapshot) > 0)
 		return 0;
 
 	return __bch2_key_has_snapshot_overwrites(trans, id, pos);
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index fcaa5a888744..3976f80721bf 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -259,7 +259,7 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans,
 }
 
 static __always_inline
-int bch2_hash_set_snapshot(struct btree_trans *trans,
+int bch2_hash_set_in_snapshot(struct btree_trans *trans,
 			   const struct bch_hash_desc desc,
 			   const struct bch_hash_info *info,
 			   subvol_inum inum, u32 snapshot,
@@ -328,17 +328,12 @@ int bch2_hash_set(struct btree_trans *trans,
 		  struct bkey_i *insert,
 		  bch_str_hash_flags_t str_hash_flags)
 {
-	u32 snapshot;
-	int ret;
-
-	ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
-	if (ret)
-		return ret;
-
 	insert->k.p.inode = inum.inum;
 
-	return bch2_hash_set_snapshot(trans, desc, info, inum,
-				      snapshot, insert, str_hash_flags, 0);
+	u32 snapshot;
+	return  bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
+		bch2_hash_set_in_snapshot(trans, desc, info, inum,
+					  snapshot, insert, str_hash_flags, 0);
 }
 
 static __always_inline
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index 7c67c28d3ef8..88a79c823276 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -13,13 +13,26 @@
 
 static int bch2_subvolume_delete(struct btree_trans *, u32);
 
+static struct bpos subvolume_children_pos(struct bkey_s_c k)
+{
+	if (k.k->type != KEY_TYPE_subvolume)
+		return POS_MIN;
+
+	struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
+	if (!s.v->fs_path_parent)
+		return POS_MIN;
+	return POS(le32_to_cpu(s.v->fs_path_parent), s.k->p.offset);
+}
+
 static int check_subvol(struct btree_trans *trans,
 			struct btree_iter *iter,
 			struct bkey_s_c k)
 {
 	struct bch_fs *c = trans->c;
 	struct bkey_s_c_subvolume subvol;
+	struct btree_iter subvol_children_iter = {};
 	struct bch_snapshot snapshot;
+	struct printbuf buf = PRINTBUF;
 	unsigned snapid;
 	int ret = 0;
 
@@ -42,6 +55,72 @@ static int check_subvol(struct btree_trans *trans,
 		return ret ?: -BCH_ERR_transaction_restart_nested;
 	}
 
+	if (fsck_err_on(subvol.k->p.offset == BCACHEFS_ROOT_SUBVOL &&
+			subvol.v->fs_path_parent,
+			c, subvol_root_fs_path_parent_nonzero,
+			"root subvolume has nonzero fs_path_parent\n%s",
+			(bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+		struct bkey_i_subvolume *n =
+			bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume);
+		ret = PTR_ERR_OR_ZERO(n);
+		if (ret)
+			goto err;
+
+		n->v.fs_path_parent = 0;
+	}
+
+	if (subvol.v->fs_path_parent) {
+		struct bpos pos = subvolume_children_pos(k);
+
+		struct bkey_s_c subvol_children_k =
+			bch2_bkey_get_iter(trans, &subvol_children_iter,
+					   BTREE_ID_subvolume_children, pos, 0);
+		ret = bkey_err(subvol_children_k);
+		if (ret)
+			goto err;
+
+		if (fsck_err_on(subvol_children_k.k->type != KEY_TYPE_set,
+				c, subvol_children_not_set,
+				"subvolume not set in subvolume_children btree at %llu:%llu\n%s",
+				pos.inode, pos.offset,
+				(printbuf_reset(&buf),
+				 bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
+			ret = bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, true);
+			if (ret)
+				goto err;
+		}
+	}
+
+	struct bch_inode_unpacked inode;
+	struct btree_iter inode_iter = {};
+	ret = bch2_inode_peek_nowarn(trans, &inode_iter, &inode,
+				    (subvol_inum) { k.k->p.offset, le64_to_cpu(subvol.v->inode) },
+				    0);
+	bch2_trans_iter_exit(trans, &inode_iter);
+
+	if (ret && !bch2_err_matches(ret, ENOENT))
+		return ret;
+
+	if (fsck_err_on(ret, c, subvol_to_missing_root,
+			"subvolume %llu points to missing subvolume root %llu:%u",
+			k.k->p.offset, le64_to_cpu(subvol.v->inode),
+			le32_to_cpu(subvol.v->snapshot))) {
+		ret = bch2_subvolume_delete(trans, iter->pos.offset);
+		bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
+		return ret ?: -BCH_ERR_transaction_restart_nested;
+	}
+
+	if (fsck_err_on(inode.bi_subvol != subvol.k->p.offset,
+			c, subvol_root_wrong_bi_subvol,
+			"subvol root %llu:%u has wrong bi_subvol field: got %u, should be %llu",
+			inode.bi_inum, inode_iter.k.p.snapshot,
+			inode.bi_subvol, subvol.k->p.offset)) {
+		inode.bi_subvol = subvol.k->p.offset;
+		ret = __bch2_fsck_write_inode(trans, &inode, le32_to_cpu(subvol.v->snapshot));
+		if (ret)
+			goto err;
+	}
+
 	if (!BCH_SUBVOLUME_SNAP(subvol.v)) {
 		u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot));
 		u32 snapshot_tree;
@@ -72,8 +151,10 @@ static int check_subvol(struct btree_trans *trans,
 			SET_BCH_SUBVOLUME_SNAP(&s->v, true);
 		}
 	}
-
+err:
 fsck_err:
+	bch2_trans_iter_exit(trans, &subvol_children_iter);
+	printbuf_exit(&buf);
 	return ret;
 }
 
@@ -88,6 +169,42 @@ int bch2_check_subvols(struct bch_fs *c)
 	return ret;
 }
 
+static int check_subvol_child(struct btree_trans *trans,
+			      struct btree_iter *child_iter,
+			      struct bkey_s_c child_k)
+{
+	struct bch_fs *c = trans->c;
+	struct bch_subvolume s;
+	int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, child_k.k->p.offset),
+					  0, subvolume, &s);
+	if (ret && !bch2_err_matches(ret, ENOENT))
+		return ret;
+
+	if (fsck_err_on(ret ||
+			le32_to_cpu(s.fs_path_parent) != child_k.k->p.inode,
+			c, subvol_children_bad,
+			"incorrect entry in subvolume_children btree %llu:%llu",
+			child_k.k->p.inode, child_k.k->p.offset)) {
+		ret = bch2_btree_delete_at(trans, child_iter, 0);
+		if (ret)
+			goto err;
+	}
+err:
+fsck_err:
+	return ret;
+}
+
+int bch2_check_subvol_children(struct bch_fs *c)
+{
+	int ret = bch2_trans_run(c,
+		for_each_btree_key_commit(trans, iter,
+				BTREE_ID_subvolume_children, POS_MIN, BTREE_ITER_PREFETCH, k,
+				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+			check_subvol_child(trans, &iter, k)));
+	bch_err_fn(c, ret);
+	return 0;
+}
+
 /* Subvolumes: */
 
 int bch2_subvolume_invalid(struct bch_fs *c, struct bkey_s_c k,
@@ -112,8 +229,50 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c,
 		   le64_to_cpu(s.v->inode),
 		   le32_to_cpu(s.v->snapshot));
 
-	if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, parent))
-		prt_printf(out, " parent %u", le32_to_cpu(s.v->parent));
+	if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, creation_parent)) {
+		prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent));
+		prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent));
+	}
+}
+
+static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set)
+{
+	return !bpos_eq(pos, POS_MIN)
+		? bch2_btree_bit_mod(trans, BTREE_ID_subvolume_children, pos, set)
+		: 0;
+}
+
+int bch2_subvolume_trigger(struct btree_trans *trans,
+			   enum btree_id btree_id, unsigned level,
+			   struct bkey_s_c old, struct bkey_s new,
+			   unsigned flags)
+{
+	if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+		struct bpos children_pos_old = subvolume_children_pos(old);
+		struct bpos children_pos_new = subvolume_children_pos(new.s_c);
+
+		if (!bpos_eq(children_pos_old, children_pos_new)) {
+			int ret = subvolume_children_mod(trans, children_pos_old, false) ?:
+				  subvolume_children_mod(trans, children_pos_new, true);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol)
+{
+	struct btree_iter iter;
+
+	bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0);
+	struct bkey_s_c k = bch2_btree_iter_peek(&iter);
+	bch2_trans_iter_exit(trans, &iter);
+
+	return bkey_err(k) ?: k.k && k.k->p.inode == subvol
+		? -BCH_ERR_ENOTEMPTY_subvol_not_empty
+		: 0;
 }
 
 static __always_inline int
@@ -197,8 +356,8 @@ static int bch2_subvolume_reparent(struct btree_trans *trans,
 	if (k.k->type != KEY_TYPE_subvolume)
 		return 0;
 
-	if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, parent) &&
-	    le32_to_cpu(bkey_s_c_to_subvolume(k).v->parent) != old_parent)
+	if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, creation_parent) &&
+	    le32_to_cpu(bkey_s_c_to_subvolume(k).v->creation_parent) != old_parent)
 		return 0;
 
 	s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume);
@@ -206,7 +365,7 @@ static int bch2_subvolume_reparent(struct btree_trans *trans,
 	if (ret)
 		return ret;
 
-	s->v.parent = cpu_to_le32(new_parent);
+	s->v.creation_parent = cpu_to_le32(new_parent);
 	return 0;
 }
 
@@ -229,7 +388,7 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d
 				BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
 				NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
 			bch2_subvolume_reparent(trans, &iter, k,
-					subvolid_to_delete, le32_to_cpu(s.parent)));
+					subvolid_to_delete, le32_to_cpu(s.creation_parent)));
 }
 
 /*
@@ -360,6 +519,7 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid)
 }
 
 int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
+			  u32 parent_subvolid,
 			  u32 src_subvolid,
 			  u32 *new_subvolid,
 			  u32 *new_snapshotid,
@@ -416,12 +576,13 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode,
 	if (ret)
 		goto err;
 
-	new_subvol->v.flags	= 0;
-	new_subvol->v.snapshot	= cpu_to_le32(new_nodes[0]);
-	new_subvol->v.inode	= cpu_to_le64(inode);
-	new_subvol->v.parent	= cpu_to_le32(src_subvolid);
-	new_subvol->v.otime.lo	= cpu_to_le64(bch2_current_time(c));
-	new_subvol->v.otime.hi	= 0;
+	new_subvol->v.flags		= 0;
+	new_subvol->v.snapshot		= cpu_to_le32(new_nodes[0]);
+	new_subvol->v.inode		= cpu_to_le64(inode);
+	new_subvol->v.creation_parent	= cpu_to_le32(src_subvolid);
+	new_subvol->v.fs_path_parent	= cpu_to_le32(parent_subvolid);
+	new_subvol->v.otime.lo		= cpu_to_le64(bch2_current_time(c));
+	new_subvol->v.otime.hi		= 0;
 
 	SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro);
 	SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0);
@@ -434,6 +595,78 @@ err:
 	return ret;
 }
 
+int bch2_initialize_subvolumes(struct bch_fs *c)
+{
+	struct bkey_i_snapshot_tree	root_tree;
+	struct bkey_i_snapshot		root_snapshot;
+	struct bkey_i_subvolume		root_volume;
+	int ret;
+
+	bkey_snapshot_tree_init(&root_tree.k_i);
+	root_tree.k.p.offset		= 1;
+	root_tree.v.master_subvol	= cpu_to_le32(1);
+	root_tree.v.root_snapshot	= cpu_to_le32(U32_MAX);
+
+	bkey_snapshot_init(&root_snapshot.k_i);
+	root_snapshot.k.p.offset = U32_MAX;
+	root_snapshot.v.flags	= 0;
+	root_snapshot.v.parent	= 0;
+	root_snapshot.v.subvol	= cpu_to_le32(BCACHEFS_ROOT_SUBVOL);
+	root_snapshot.v.tree	= cpu_to_le32(1);
+	SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true);
+
+	bkey_subvolume_init(&root_volume.k_i);
+	root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL;
+	root_volume.v.flags	= 0;
+	root_volume.v.snapshot	= cpu_to_le32(U32_MAX);
+	root_volume.v.inode	= cpu_to_le64(BCACHEFS_ROOT_INO);
+
+	ret =   bch2_btree_insert(c, BTREE_ID_snapshot_trees,	&root_tree.k_i, NULL, 0) ?:
+		bch2_btree_insert(c, BTREE_ID_snapshots,	&root_snapshot.k_i, NULL, 0) ?:
+		bch2_btree_insert(c, BTREE_ID_subvolumes,	&root_volume.k_i, NULL, 0);
+	bch_err_fn(c, ret);
+	return ret;
+}
+
+static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans)
+{
+	struct btree_iter iter;
+	struct bkey_s_c k;
+	struct bch_inode_unpacked inode;
+	int ret;
+
+	k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes,
+			       SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0);
+	ret = bkey_err(k);
+	if (ret)
+		return ret;
+
+	if (!bkey_is_inode(k.k)) {
+		bch_err(trans->c, "root inode not found");
+		ret = -BCH_ERR_ENOENT_inode;
+		goto err;
+	}
+
+	ret = bch2_inode_unpack(k, &inode);
+	BUG_ON(ret);
+
+	inode.bi_subvol = BCACHEFS_ROOT_SUBVOL;
+
+	ret = bch2_inode_write(trans, &iter, &inode);
+err:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+/* set bi_subvol on root inode */
+int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
+{
+	int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
+				__bch2_fs_upgrade_for_subvolumes(trans));
+	bch_err_fn(c, ret);
+	return ret;
+}
+
 int bch2_fs_subvolumes_init(struct bch_fs *c)
 {
 	INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work);
diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h
index a6f56f66e27c..d2015d549bd2 100644
--- a/fs/bcachefs/subvolume.h
+++ b/fs/bcachefs/subvolume.h
@@ -8,17 +8,22 @@
 enum bkey_invalid_flags;
 
 int bch2_check_subvols(struct bch_fs *);
+int bch2_check_subvol_children(struct bch_fs *);
 
 int bch2_subvolume_invalid(struct bch_fs *, struct bkey_s_c,
 			   enum bkey_invalid_flags, struct printbuf *);
 void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
+int bch2_subvolume_trigger(struct btree_trans *, enum btree_id, unsigned,
+			   struct bkey_s_c, struct bkey_s, unsigned);
 
 #define bch2_bkey_ops_subvolume ((struct bkey_ops) {		\
 	.key_invalid	= bch2_subvolume_invalid,		\
 	.val_to_text	= bch2_subvolume_to_text,		\
+	.trigger	= bch2_subvolume_trigger,		\
 	.min_val_size	= 16,					\
 })
 
+int bch2_subvol_has_children(struct btree_trans *, u32);
 int bch2_subvolume_get(struct btree_trans *, unsigned,
 		       bool, int, struct bch_subvolume *);
 int bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *);
@@ -30,8 +35,10 @@ int bch2_delete_dead_snapshots(struct bch_fs *);
 void bch2_delete_dead_snapshots_async(struct bch_fs *);
 
 int bch2_subvolume_unlink(struct btree_trans *, u32);
-int bch2_subvolume_create(struct btree_trans *, u64, u32,
-			  u32 *, u32 *, bool);
+int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool);
+
+int bch2_initialize_subvolumes(struct bch_fs *);
+int bch2_fs_upgrade_for_subvolumes(struct bch_fs *);
 
 int bch2_fs_subvolumes_init(struct bch_fs *);
 
diff --git a/fs/bcachefs/subvolume_format.h b/fs/bcachefs/subvolume_format.h
index af79134b07d6..e029df7ba89f 100644
--- a/fs/bcachefs/subvolume_format.h
+++ b/fs/bcachefs/subvolume_format.h
@@ -19,8 +19,8 @@ struct bch_subvolume {
 	 * This is _not_ necessarily the subvolume of the directory containing
 	 * this subvolume:
 	 */
-	__le32			parent;
-	__le32			pad;
+	__le32			creation_parent;
+	__le32			fs_path_parent;
 	bch_le128		otime;
 };
 
diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h
index ae644adfc391..9b10c8947828 100644
--- a/fs/bcachefs/subvolume_types.h
+++ b/fs/bcachefs/subvolume_types.h
@@ -20,6 +20,8 @@ struct snapshot_t {
 };
 
 struct snapshot_table {
+	struct rcu_head		rcu;
+	size_t			nr;
 #ifndef RUST_BINDGEN
 	DECLARE_FLEX_ARRAY(struct snapshot_t, s);
 #else
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 36988add581f..08ea3dbbbe97 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -8,7 +8,7 @@
 #include "journal.h"
 #include "journal_sb.h"
 #include "journal_seq_blacklist.h"
-#include "recovery.h"
+#include "recovery_passes.h"
 #include "replicas.h"
 #include "quota.h"
 #include "sb-clean.h"
@@ -142,8 +142,8 @@ void bch2_sb_field_delete(struct bch_sb_handle *sb,
 void bch2_free_super(struct bch_sb_handle *sb)
 {
 	kfree(sb->bio);
-	if (!IS_ERR_OR_NULL(sb->bdev_handle))
-		bdev_release(sb->bdev_handle);
+	if (!IS_ERR_OR_NULL(sb->s_bdev_file))
+		bdev_fput(sb->s_bdev_file);
 	kfree(sb->holder);
 	kfree(sb->sb_name);
 
@@ -470,6 +470,14 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
 			return ret;
 	}
 
+	if (rw == WRITE &&
+	    bch2_sb_member_get(sb, sb->dev_idx).seq != sb->seq) {
+		prt_printf(out, "Invalid superblock: member seq %llu != sb seq %llu",
+			   le64_to_cpu(bch2_sb_member_get(sb, sb->dev_idx).seq),
+			   le64_to_cpu(sb->seq));
+		return -BCH_ERR_invalid_sb_members_missing;
+	}
+
 	return 0;
 }
 
@@ -519,9 +527,11 @@ static void bch2_sb_update(struct bch_fs *c)
 	memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent));
 
 	struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext);
-	if (ext)
+	if (ext) {
 		le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent,
 				    sizeof(c->sb.errors_silent) * 8);
+		c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data);
+	}
 
 	for_each_member_device(c, ca) {
 		struct bch_member m = bch2_sb_member_get(src, ca->dev_idx);
@@ -690,8 +700,11 @@ retry:
 		return -ENOMEM;
 
 	sb->sb_name = kstrdup(path, GFP_KERNEL);
-	if (!sb->sb_name)
-		return -ENOMEM;
+	if (!sb->sb_name) {
+		ret = -ENOMEM;
+		prt_printf(&err, "error allocating memory for sb_name");
+		goto err;
+	}
 
 #ifndef __KERNEL__
 	if (opt_get(*opts, direct_io) == false)
@@ -704,22 +717,23 @@ retry:
 	if (!opt_get(*opts, nochanges))
 		sb->mode |= BLK_OPEN_WRITE;
 
-	sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
-	if (IS_ERR(sb->bdev_handle) &&
-	    PTR_ERR(sb->bdev_handle) == -EACCES &&
+	sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+	if (IS_ERR(sb->s_bdev_file) &&
+	    PTR_ERR(sb->s_bdev_file) == -EACCES &&
 	    opt_get(*opts, read_only)) {
 		sb->mode &= ~BLK_OPEN_WRITE;
 
-		sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
-		if (!IS_ERR(sb->bdev_handle))
+		sb->s_bdev_file = bdev_file_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+		if (!IS_ERR(sb->s_bdev_file))
 			opt_set(*opts, nochanges, true);
 	}
 
-	if (IS_ERR(sb->bdev_handle)) {
-		ret = PTR_ERR(sb->bdev_handle);
+	if (IS_ERR(sb->s_bdev_file)) {
+		ret = PTR_ERR(sb->s_bdev_file);
+		prt_printf(&err, "error opening %s: %s", path, bch2_err_str(ret));
 		goto err;
 	}
-	sb->bdev = sb->bdev_handle->bdev;
+	sb->bdev = file_bdev(sb->s_bdev_file);
 
 	ret = bch2_sb_realloc(sb, 0);
 	if (ret) {
@@ -743,9 +757,9 @@ retry:
 	prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n",
 	       path, err.buf);
 	if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg)
-		printk(KERN_INFO "%s", err2.buf);
+		bch2_print_opts(opts, KERN_INFO "%s", err2.buf);
 	else
-		printk(KERN_ERR "%s", err2.buf);
+		bch2_print_opts(opts, KERN_ERR "%s", err2.buf);
 
 	printbuf_exit(&err2);
 	printbuf_reset(&err);
@@ -803,21 +817,20 @@ got_super:
 		goto err;
 	}
 
-	ret = 0;
 	sb->have_layout = true;
 
 	ret = bch2_sb_validate(sb, &err, READ);
 	if (ret) {
-		printk(KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
-		       path, err.buf);
+		bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n",
+				path, err.buf);
 		goto err_no_print;
 	}
 out:
 	printbuf_exit(&err);
 	return ret;
 err:
-	printk(KERN_ERR "bcachefs (%s): error reading superblock: %s\n",
-	       path, err.buf);
+	bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error reading superblock: %s\n",
+			path, err.buf);
 err_no_print:
 	bch2_free_super(sb);
 	goto out;
@@ -977,7 +990,7 @@ int bch2_write_super(struct bch_fs *c)
 		prt_str(&buf, " > ");
 		bch2_version_to_text(&buf, bcachefs_metadata_version_current);
 		prt_str(&buf, ")");
-		bch2_fs_fatal_error(c, "%s", buf.buf);
+		bch2_fs_fatal_error(c, ": %s", buf.buf);
 		printbuf_exit(&buf);
 		return -BCH_ERR_sb_not_downgraded;
 	}
@@ -997,7 +1010,7 @@ int bch2_write_super(struct bch_fs *c)
 
 		if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) {
 			bch2_fs_fatal_error(c,
-				"Superblock write was silently dropped! (seq %llu expected %llu)",
+				": Superblock write was silently dropped! (seq %llu expected %llu)",
 				le64_to_cpu(ca->sb_read_scratch->seq),
 				ca->disk_sb.seq);
 			percpu_ref_put(&ca->io_ref);
@@ -1007,7 +1020,7 @@ int bch2_write_super(struct bch_fs *c)
 
 		if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) {
 			bch2_fs_fatal_error(c,
-				"Superblock modified by another process (seq %llu expected %llu)",
+				": Superblock modified by another process (seq %llu expected %llu)",
 				le64_to_cpu(ca->sb_read_scratch->seq),
 				ca->disk_sb.seq);
 			percpu_ref_put(&ca->io_ref);
@@ -1058,7 +1071,7 @@ int bch2_write_super(struct bch_fs *c)
 				 !can_mount_with_written ||
 				 (can_mount_without_written &&
 				  !can_mount_with_written), c,
-		"Unable to write superblock to sufficient devices (from %ps)",
+		": Unable to write superblock to sufficient devices (from %ps)",
 		(void *) _RET_IP_))
 		ret = -1;
 out:
@@ -1154,6 +1167,11 @@ static void bch2_sb_ext_to_text(struct printbuf *out, struct bch_sb *sb,
 
 		kfree(errors_silent);
 	}
+
+	prt_printf(out, "Btrees with missing data:");
+	prt_tab(out);
+	prt_bitflags(out, __bch2_btree_ids, le64_to_cpu(e->btrees_lost_data));
+	prt_newline(out);
 }
 
 static const struct bch_sb_field_ops bch_sb_field_ops_ext = {
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 6b23e11825e6..8daf80a38d60 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -15,6 +15,7 @@
 #include "btree_gc.h"
 #include "btree_journal_iter.h"
 #include "btree_key_cache.h"
+#include "btree_node_scan.h"
 #include "btree_update_interior.h"
 #include "btree_io.h"
 #include "btree_write_buffer.h"
@@ -56,6 +57,7 @@
 #include "super.h"
 #include "super-io.h"
 #include "sysfs.h"
+#include "thread_with_file.h"
 #include "trace.h"
 
 #include <linux/backing-dev.h>
@@ -86,26 +88,38 @@ const char * const bch2_fs_flag_strs[] = {
 	NULL
 };
 
-void __bch2_print(struct bch_fs *c, const char *fmt, ...)
+__printf(2, 0)
+static void bch2_print_maybe_redirect(struct stdio_redirect *stdio, const char *fmt, va_list args)
 {
-	struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);
+#ifdef __KERNEL__
+	if (unlikely(stdio)) {
+		if (fmt[0] == KERN_SOH[0])
+			fmt += 2;
+
+		bch2_stdio_redirect_vprintf(stdio, true, fmt, args);
+		return;
+	}
+#endif
+	vprintk(fmt, args);
+}
+
+void bch2_print_opts(struct bch_opts *opts, const char *fmt, ...)
+{
+	struct stdio_redirect *stdio = (void *)(unsigned long)opts->stdio;
 
 	va_list args;
 	va_start(args, fmt);
-	if (likely(!stdio)) {
-		vprintk(fmt, args);
-	} else {
-		unsigned long flags;
-
-		if (fmt[0] == KERN_SOH[0])
-			fmt += 2;
+	bch2_print_maybe_redirect(stdio, fmt, args);
+	va_end(args);
+}
 
-		spin_lock_irqsave(&stdio->output_lock, flags);
-		prt_vprintf(&stdio->output_buf, fmt, args);
-		spin_unlock_irqrestore(&stdio->output_lock, flags);
+void __bch2_print(struct bch_fs *c, const char *fmt, ...)
+{
+	struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);
 
-		wake_up(&stdio->output_wait);
-	}
+	va_list args;
+	va_start(args, fmt);
+	bch2_print_maybe_redirect(stdio, fmt, args);
 	va_end(args);
 }
 
@@ -274,8 +288,13 @@ static void __bch2_fs_read_only(struct bch_fs *c)
 	if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) &&
 	    !test_bit(BCH_FS_emergency_ro, &c->flags))
 		set_bit(BCH_FS_clean_shutdown, &c->flags);
+
 	bch2_fs_journal_stop(&c->journal);
 
+	bch_info(c, "%sshutdown complete, journal seq %llu",
+		 test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un",
+		 c->journal.seq_ondisk);
+
 	/*
 	 * After stopping journal:
 	 */
@@ -352,7 +371,7 @@ void bch2_fs_read_only(struct bch_fs *c)
 	    !test_bit(BCH_FS_emergency_ro, &c->flags) &&
 	    test_bit(BCH_FS_started, &c->flags) &&
 	    test_bit(BCH_FS_clean_shutdown, &c->flags) &&
-	    !c->opts.norecovery) {
+	    c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) {
 		BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
 		BUG_ON(atomic_read(&c->btree_cache.dirty));
 		BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
@@ -497,7 +516,8 @@ err:
 
 int bch2_fs_read_write(struct bch_fs *c)
 {
-	if (c->opts.norecovery)
+	if (c->opts.recovery_pass_last &&
+	    c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay)
 		return -BCH_ERR_erofs_norecovery;
 
 	if (c->opts.nochanges)
@@ -522,6 +542,7 @@ static void __bch2_fs_free(struct bch_fs *c)
 	for (i = 0; i < BCH_TIME_STAT_NR; i++)
 		bch2_time_stats_exit(&c->times[i]);
 
+	bch2_find_btree_nodes_exit(&c->found_btree_nodes);
 	bch2_free_pending_node_rewrites(c);
 	bch2_fs_sb_errors_exit(c);
 	bch2_fs_counters_exit(c);
@@ -546,6 +567,7 @@ static void __bch2_fs_free(struct bch_fs *c)
 	bch2_io_clock_exit(&c->io_clock[READ]);
 	bch2_fs_compress_exit(c);
 	bch2_journal_keys_put_initial(c);
+	bch2_find_btree_nodes_exit(&c->found_btree_nodes);
 	BUG_ON(atomic_read(&c->journal_keys.ref));
 	bch2_fs_btree_write_buffer_exit(c);
 	percpu_free_rwsem(&c->mark_lock);
@@ -576,7 +598,7 @@ static void __bch2_fs_free(struct bch_fs *c)
 		destroy_workqueue(c->btree_update_wq);
 
 	bch2_free_super(&c->disk_sb);
-	kvpfree(c, sizeof(*c));
+	kvfree(c);
 	module_put(THIS_MODULE);
 }
 
@@ -715,7 +737,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 	unsigned i, iter_size;
 	int ret = 0;
 
-	c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
+	c = kvmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO);
 	if (!c) {
 		c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc);
 		goto out;
@@ -818,13 +840,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 		goto err;
 
 	pr_uuid(&name, c->sb.user_uuid.b);
-	strscpy(c->name, name.buf, sizeof(c->name));
-	printbuf_exit(&name);
-
 	ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
 	if (ret)
 		goto err;
 
+	strscpy(c->name, name.buf, sizeof(c->name));
+	printbuf_exit(&name);
+
 	/* Compat: */
 	if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 &&
 	    !BCH_SB_JOURNAL_FLUSH_DELAY(sb))
@@ -862,13 +884,13 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 	c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus()));
 
 	if (!(c->btree_update_wq = alloc_workqueue("bcachefs",
-				WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512)) ||
+				WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) ||
 	    !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io",
-				WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
+				WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) ||
 	    !(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
-				WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
+				WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
 	    !(c->io_complete_wq = alloc_workqueue("bcachefs_io",
-				WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 512)) ||
+				WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) ||
 	    !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
 				WQ_FREEZABLE, 0)) ||
 #ifndef BCH_WRITE_REF_DEBUG
@@ -882,8 +904,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 			BIOSET_NEED_BVECS) ||
 	    !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
 	    !(c->online_reserved = alloc_percpu(u64)) ||
-	    mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
-					c->opts.btree_node_size) ||
+	    mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1,
+				       c->opts.btree_node_size) ||
 	    mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
 	    !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
 					      sizeof(u64), GFP_KERNEL))) {
@@ -1002,8 +1024,16 @@ int bch2_fs_start(struct bch_fs *c)
 	for_each_online_member(c, ca)
 		bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now);
 
+	struct bch_sb_field_ext *ext =
+		bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64));
 	mutex_unlock(&c->sb_lock);
 
+	if (!ext) {
+		bch_err(c, "insufficient space in superblock for sb_field_ext");
+		ret = -BCH_ERR_ENOSPC_sb;
+		goto err;
+	}
+
 	for_each_rw_member(c, ca)
 		bch2_dev_allocator_add(c, ca);
 	bch2_recalc_capacity(c);
@@ -1061,7 +1091,8 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
 }
 
 static int bch2_dev_in_fs(struct bch_sb_handle *fs,
-			  struct bch_sb_handle *sb)
+			  struct bch_sb_handle *sb,
+			  struct bch_opts *opts)
 {
 	if (fs == sb)
 		return 0;
@@ -1102,11 +1133,14 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
 		bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));;
 		prt_newline(&buf);
 
-		prt_printf(&buf, "Not using older sb");
+		if (!opts->no_splitbrain_check)
+			prt_printf(&buf, "Not using older sb");
 
 		pr_err("%s", buf.buf);
 		printbuf_exit(&buf);
-		return -BCH_ERR_device_splitbrain;
+
+		if (!opts->no_splitbrain_check)
+			return -BCH_ERR_device_splitbrain;
 	}
 
 	struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx);
@@ -1124,17 +1158,22 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs,
 		prt_newline(&buf);
 
 		prt_bdevname(&buf, fs->bdev);
-		prt_str(&buf, "believes seq of ");
+		prt_str(&buf, " believes seq of ");
 		prt_bdevname(&buf, sb->bdev);
 		prt_printf(&buf, " to be %llu, but ", seq_from_fs);
 		prt_bdevname(&buf, sb->bdev);
 		prt_printf(&buf, " has %llu\n", seq_from_member);
-		prt_str(&buf, "Not using ");
-		prt_bdevname(&buf, sb->bdev);
+
+		if (!opts->no_splitbrain_check) {
+			prt_str(&buf, "Not using ");
+			prt_bdevname(&buf, sb->bdev);
+		}
 
 		pr_err("%s", buf.buf);
 		printbuf_exit(&buf);
-		return -BCH_ERR_device_splitbrain;
+
+		if (!opts->no_splitbrain_check)
+			return -BCH_ERR_device_splitbrain;
 	}
 
 	return 0;
@@ -1168,8 +1207,8 @@ static void bch2_dev_free(struct bch_dev *ca)
 	bch2_dev_buckets_free(ca);
 	free_page((unsigned long) ca->sb_read_scratch);
 
-	bch2_time_stats_exit(&ca->io_latency[WRITE]);
-	bch2_time_stats_exit(&ca->io_latency[READ]);
+	bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]);
+	bch2_time_stats_quantiles_exit(&ca->io_latency[READ]);
 
 	percpu_ref_exit(&ca->io_ref);
 	percpu_ref_exit(&ca->ref);
@@ -1260,8 +1299,8 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c,
 
 	INIT_WORK(&ca->io_error_work, bch2_io_error_work);
 
-	bch2_time_stats_init(&ca->io_latency[READ]);
-	bch2_time_stats_init(&ca->io_latency[WRITE]);
+	bch2_time_stats_quantiles_init(&ca->io_latency[READ]);
+	bch2_time_stats_quantiles_init(&ca->io_latency[WRITE]);
 
 	ca->mi = bch2_mi_to_cpu(member);
 
@@ -1597,27 +1636,27 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
 	__bch2_dev_read_only(c, ca);
 
 	ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
-	bch_err_msg(ca, ret, "dropping data");
+	bch_err_msg(ca, ret, "bch2_dev_data_drop()");
 	if (ret)
 		goto err;
 
 	ret = bch2_dev_remove_alloc(c, ca);
-	bch_err_msg(ca, ret, "deleting alloc info");
+	bch_err_msg(ca, ret, "bch2_dev_remove_alloc()");
 	if (ret)
 		goto err;
 
 	ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
-	bch_err_msg(ca, ret, "flushing journal");
+	bch_err_msg(ca, ret, "bch2_journal_flush_device_pins()");
 	if (ret)
 		goto err;
 
 	ret = bch2_journal_flush(&c->journal);
-	bch_err(ca, "journal error");
+	bch_err_msg(ca, ret, "bch2_journal_flush()");
 	if (ret)
 		goto err;
 
 	ret = bch2_replicas_gc2(c);
-	bch_err_msg(ca, ret, "in replicas_gc2()");
+	bch_err_msg(ca, ret, "bch2_replicas_gc2()");
 	if (ret)
 		goto err;
 
@@ -1835,7 +1874,7 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
 
 	dev_idx = sb.sb->dev_idx;
 
-	ret = bch2_dev_in_fs(&c->disk_sb, &sb);
+	ret = bch2_dev_in_fs(&c->disk_sb, &sb, &c->opts);
 	bch_err_msg(c, ret, "bringing %s online", path);
 	if (ret)
 		goto err;
@@ -2023,7 +2062,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
 			best = sb;
 
 	darray_for_each_reverse(sbs, sb) {
-		ret = bch2_dev_in_fs(best, sb);
+		ret = bch2_dev_in_fs(best, sb, &opts);
 
 		if (ret == -BCH_ERR_device_has_been_removed ||
 		    ret == -BCH_ERR_device_splitbrain) {
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
index 0e5a14fc8e7f..11bcef170c2c 100644
--- a/fs/bcachefs/super_types.h
+++ b/fs/bcachefs/super_types.h
@@ -4,7 +4,7 @@
 
 struct bch_sb_handle {
 	struct bch_sb		*sb;
-	struct bdev_handle	*bdev_handle;
+	struct file		*s_bdev_file;
 	struct block_device	*bdev;
 	char			*sb_name;
 	struct bio		*bio;
@@ -37,6 +37,8 @@ struct bch_member_cpu {
 	u8			durability;
 	u8			freespace_initialized;
 	u8			valid;
+	u8			btree_bitmap_shift;
+	u64			btree_allocated_bitmap;
 };
 
 #endif /* _BCACHEFS_SUPER_TYPES_H */
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index cee80c47feea..5be92fe3f4ea 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -17,7 +17,6 @@
 #include "btree_iter.h"
 #include "btree_key_cache.h"
 #include "btree_update.h"
-#include "btree_update_interior.h"
 #include "btree_gc.h"
 #include "buckets.h"
 #include "clock.h"
@@ -26,6 +25,7 @@
 #include "ec.h"
 #include "inode.h"
 #include "journal.h"
+#include "journal_reclaim.h"
 #include "keylist.h"
 #include "move.h"
 #include "movinggc.h"
@@ -139,6 +139,7 @@ do {									\
 write_attribute(trigger_gc);
 write_attribute(trigger_discards);
 write_attribute(trigger_invalidates);
+write_attribute(trigger_journal_flush);
 write_attribute(prune_cache);
 write_attribute(btree_wakeup);
 rw_attribute(btree_gc_periodic);
@@ -166,7 +167,6 @@ read_attribute(btree_write_stats);
 read_attribute(btree_cache_size);
 read_attribute(compression_stats);
 read_attribute(journal_debug);
-read_attribute(btree_updates);
 read_attribute(btree_cache);
 read_attribute(btree_key_cache);
 read_attribute(stripes_heap);
@@ -415,9 +415,6 @@ SHOW(bch2_fs)
 	if (attr == &sysfs_journal_debug)
 		bch2_journal_debug_to_text(out, &c->journal);
 
-	if (attr == &sysfs_btree_updates)
-		bch2_btree_updates_to_text(out, c);
-
 	if (attr == &sysfs_btree_cache)
 		bch2_btree_cache_to_text(out, c);
 
@@ -505,7 +502,7 @@ STORE(bch2_fs)
 
 	/* Debugging: */
 
-	if (!test_bit(BCH_FS_rw, &c->flags))
+	if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))
 		return -EROFS;
 
 	if (attr == &sysfs_prune_cache) {
@@ -538,6 +535,11 @@ STORE(bch2_fs)
 	if (attr == &sysfs_trigger_invalidates)
 		bch2_do_invalidates(c);
 
+	if (attr == &sysfs_trigger_journal_flush) {
+		bch2_journal_flush_all_pins(&c->journal);
+		bch2_journal_meta(&c->journal);
+	}
+
 #ifdef CONFIG_BCACHEFS_TESTS
 	if (attr == &sysfs_perf_test) {
 		char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp;
@@ -558,6 +560,7 @@ STORE(bch2_fs)
 			size = ret;
 	}
 #endif
+	bch2_write_ref_put(c, BCH_WRITE_REF_sysfs);
 	return size;
 }
 SYSFS_OPS(bch2_fs);
@@ -639,7 +642,6 @@ SYSFS_OPS(bch2_fs_internal);
 struct attribute *bch2_fs_internal_files[] = {
 	&sysfs_flags,
 	&sysfs_journal_debug,
-	&sysfs_btree_updates,
 	&sysfs_btree_cache,
 	&sysfs_btree_key_cache,
 	&sysfs_new_stripes,
@@ -657,6 +659,7 @@ struct attribute *bch2_fs_internal_files[] = {
 	&sysfs_trigger_gc,
 	&sysfs_trigger_discards,
 	&sysfs_trigger_invalidates,
+	&sysfs_trigger_journal_flush,
 	&sysfs_prune_cache,
 	&sysfs_btree_wakeup,
 
@@ -930,10 +933,10 @@ SHOW(bch2_dev)
 	sysfs_print(io_latency_write,		atomic64_read(&ca->cur_latency[WRITE]));
 
 	if (attr == &sysfs_io_latency_stats_read)
-		bch2_time_stats_to_text(out, &ca->io_latency[READ]);
+		bch2_time_stats_to_text(out, &ca->io_latency[READ].stats);
 
 	if (attr == &sysfs_io_latency_stats_write)
-		bch2_time_stats_to_text(out, &ca->io_latency[WRITE]);
+		bch2_time_stats_to_text(out, &ca->io_latency[WRITE].stats);
 
 	sysfs_printf(congested,			"%u%%",
 		     clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX)
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index b3fe9fc57747..bfec656f94c0 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -672,7 +672,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos)
 
 	bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
 			     BTREE_ITER_INTENT);
-	k = bch2_btree_iter_peek(&iter);
+	k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX));
 	ret = bkey_err(k);
 	if (ret)
 		goto err;
diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c
index 9220d7de10db..940db15d6a93 100644
--- a/fs/bcachefs/thread_with_file.c
+++ b/fs/bcachefs/thread_with_file.c
@@ -2,7 +2,6 @@
 #ifndef NO_BCACHEFS_FS
 
 #include "bcachefs.h"
-#include "printbuf.h"
 #include "thread_with_file.h"
 
 #include <linux/anon_inodes.h>
@@ -10,6 +9,7 @@
 #include <linux/kthread.h>
 #include <linux/pagemap.h>
 #include <linux/poll.h>
+#include <linux/sched/sysctl.h>
 
 void bch2_thread_with_file_exit(struct thread_with_file *thr)
 {
@@ -65,68 +65,82 @@ err:
 	return ret;
 }
 
-static inline bool thread_with_stdio_has_output(struct thread_with_stdio *thr)
+/* stdio_redirect */
+
+static bool stdio_redirect_has_input(struct stdio_redirect *stdio)
 {
-	return thr->stdio.output_buf.pos ||
-		thr->output2.nr ||
-		thr->thr.done;
+	return stdio->input.buf.nr || stdio->done;
 }
 
-static ssize_t thread_with_stdio_read(struct file *file, char __user *buf,
-				      size_t len, loff_t *ppos)
+static bool stdio_redirect_has_output(struct stdio_redirect *stdio)
 {
-	struct thread_with_stdio *thr =
-		container_of(file->private_data, struct thread_with_stdio, thr);
-	size_t copied = 0, b;
-	int ret = 0;
+	return stdio->output.buf.nr || stdio->done;
+}
 
-	if ((file->f_flags & O_NONBLOCK) &&
-	    !thread_with_stdio_has_output(thr))
-		return -EAGAIN;
+#define STDIO_REDIRECT_BUFSIZE		4096
 
-	ret = wait_event_interruptible(thr->stdio.output_wait,
-		thread_with_stdio_has_output(thr));
-	if (ret)
-		return ret;
+static bool stdio_redirect_has_input_space(struct stdio_redirect *stdio)
+{
+	return stdio->input.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done;
+}
 
-	if (thr->thr.done)
-		return 0;
+static bool stdio_redirect_has_output_space(struct stdio_redirect *stdio)
+{
+	return stdio->output.buf.nr < STDIO_REDIRECT_BUFSIZE || stdio->done;
+}
 
-	while (len) {
-		ret = darray_make_room(&thr->output2, thr->stdio.output_buf.pos);
-		if (ret)
-			break;
+static void stdio_buf_init(struct stdio_buf *buf)
+{
+	spin_lock_init(&buf->lock);
+	init_waitqueue_head(&buf->wait);
+	darray_init(&buf->buf);
+}
 
-		spin_lock_irq(&thr->stdio.output_lock);
-		b = min_t(size_t, darray_room(thr->output2), thr->stdio.output_buf.pos);
+/* thread_with_stdio */
 
-		memcpy(&darray_top(thr->output2), thr->stdio.output_buf.buf, b);
-		memmove(thr->stdio.output_buf.buf,
-			thr->stdio.output_buf.buf + b,
-			thr->stdio.output_buf.pos - b);
+static void thread_with_stdio_done(struct thread_with_stdio *thr)
+{
+	thr->thr.done = true;
+	thr->stdio.done = true;
+	wake_up(&thr->stdio.input.wait);
+	wake_up(&thr->stdio.output.wait);
+}
 
-		thr->output2.nr += b;
-		thr->stdio.output_buf.pos -= b;
-		spin_unlock_irq(&thr->stdio.output_lock);
+static ssize_t thread_with_stdio_read(struct file *file, char __user *ubuf,
+				      size_t len, loff_t *ppos)
+{
+	struct thread_with_stdio *thr =
+		container_of(file->private_data, struct thread_with_stdio, thr);
+	struct stdio_buf *buf = &thr->stdio.output;
+	size_t copied = 0, b;
+	int ret = 0;
 
-		b = min(len, thr->output2.nr);
-		if (!b)
-			break;
+	if (!(file->f_flags & O_NONBLOCK)) {
+		ret = wait_event_interruptible(buf->wait, stdio_redirect_has_output(&thr->stdio));
+		if (ret)
+			return ret;
+	} else if (!stdio_redirect_has_output(&thr->stdio))
+		return -EAGAIN;
 
-		b -= copy_to_user(buf, thr->output2.data, b);
-		if (!b) {
+	while (len && buf->buf.nr) {
+		if (fault_in_writeable(ubuf, len) == len) {
 			ret = -EFAULT;
 			break;
 		}
 
-		copied	+= b;
-		buf	+= b;
-		len	-= b;
-
-		memmove(thr->output2.data,
-			thr->output2.data + b,
-			thr->output2.nr - b);
-		thr->output2.nr -= b;
+		spin_lock_irq(&buf->lock);
+		b = min_t(size_t, len, buf->buf.nr);
+
+		if (b && !copy_to_user_nofault(ubuf, buf->buf.data, b)) {
+			ubuf	+= b;
+			len	-= b;
+			copied	+= b;
+			buf->buf.nr -= b;
+			memmove(buf->buf.data,
+				buf->buf.data + b,
+				buf->buf.nr);
+		}
+		spin_unlock_irq(&buf->lock);
 	}
 
 	return copied ?: ret;
@@ -137,27 +151,20 @@ static int thread_with_stdio_release(struct inode *inode, struct file *file)
 	struct thread_with_stdio *thr =
 		container_of(file->private_data, struct thread_with_stdio, thr);
 
+	thread_with_stdio_done(thr);
 	bch2_thread_with_file_exit(&thr->thr);
-	printbuf_exit(&thr->stdio.input_buf);
-	printbuf_exit(&thr->stdio.output_buf);
-	darray_exit(&thr->output2);
-	thr->exit(thr);
+	darray_exit(&thr->stdio.input.buf);
+	darray_exit(&thr->stdio.output.buf);
+	thr->ops->exit(thr);
 	return 0;
 }
 
-#define WRITE_BUFFER		4096
-
-static inline bool thread_with_stdio_has_input_space(struct thread_with_stdio *thr)
-{
-	return thr->stdio.input_buf.pos < WRITE_BUFFER || thr->thr.done;
-}
-
 static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf,
 				       size_t len, loff_t *ppos)
 {
 	struct thread_with_stdio *thr =
 		container_of(file->private_data, struct thread_with_stdio, thr);
-	struct printbuf *buf = &thr->stdio.input_buf;
+	struct stdio_buf *buf = &thr->stdio.input;
 	size_t copied = 0;
 	ssize_t ret = 0;
 
@@ -173,29 +180,30 @@ static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubu
 			break;
 		}
 
-		spin_lock(&thr->stdio.input_lock);
-		if (buf->pos < WRITE_BUFFER)
-			bch2_printbuf_make_room(buf, min(b, WRITE_BUFFER - buf->pos));
-		b = min(len, printbuf_remaining_size(buf));
-
-		if (b && !copy_from_user_nofault(&buf->buf[buf->pos], ubuf, b)) {
-			ubuf += b;
-			len -= b;
-			copied += b;
-			buf->pos += b;
+		spin_lock(&buf->lock);
+		if (buf->buf.nr < STDIO_REDIRECT_BUFSIZE)
+			darray_make_room_gfp(&buf->buf,
+				min(b, STDIO_REDIRECT_BUFSIZE - buf->buf.nr), GFP_NOWAIT);
+		b = min(len, darray_room(buf->buf));
+
+		if (b && !copy_from_user_nofault(&darray_top(buf->buf), ubuf, b)) {
+			buf->buf.nr += b;
+			ubuf	+= b;
+			len	-= b;
+			copied	+= b;
 		}
-		spin_unlock(&thr->stdio.input_lock);
+		spin_unlock(&buf->lock);
 
 		if (b) {
-			wake_up(&thr->stdio.input_wait);
+			wake_up(&buf->wait);
 		} else {
 			if ((file->f_flags & O_NONBLOCK)) {
 				ret = -EAGAIN;
 				break;
 			}
 
-			ret = wait_event_interruptible(thr->stdio.input_wait,
-					thread_with_stdio_has_input_space(thr));
+			ret = wait_event_interruptible(buf->wait,
+					stdio_redirect_has_input_space(&thr->stdio));
 			if (ret)
 				break;
 		}
@@ -209,90 +217,233 @@ static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_stru
 	struct thread_with_stdio *thr =
 		container_of(file->private_data, struct thread_with_stdio, thr);
 
-	poll_wait(file, &thr->stdio.output_wait, wait);
-	poll_wait(file, &thr->stdio.input_wait, wait);
+	poll_wait(file, &thr->stdio.output.wait, wait);
+	poll_wait(file, &thr->stdio.input.wait, wait);
 
 	__poll_t mask = 0;
 
-	if (thread_with_stdio_has_output(thr))
+	if (stdio_redirect_has_output(&thr->stdio))
 		mask |= EPOLLIN;
-	if (thread_with_stdio_has_input_space(thr))
+	if (stdio_redirect_has_input_space(&thr->stdio))
 		mask |= EPOLLOUT;
 	if (thr->thr.done)
 		mask |= EPOLLHUP|EPOLLERR;
 	return mask;
 }
 
+static __poll_t thread_with_stdout_poll(struct file *file, struct poll_table_struct *wait)
+{
+	struct thread_with_stdio *thr =
+		container_of(file->private_data, struct thread_with_stdio, thr);
+
+	poll_wait(file, &thr->stdio.output.wait, wait);
+
+	__poll_t mask = 0;
+
+	if (stdio_redirect_has_output(&thr->stdio))
+		mask |= EPOLLIN;
+	if (thr->thr.done)
+		mask |= EPOLLHUP|EPOLLERR;
+	return mask;
+}
+
+static int thread_with_stdio_flush(struct file *file, fl_owner_t id)
+{
+	struct thread_with_stdio *thr =
+		container_of(file->private_data, struct thread_with_stdio, thr);
+
+	return thr->thr.ret;
+}
+
+static long thread_with_stdio_ioctl(struct file *file, unsigned int cmd, unsigned long p)
+{
+	struct thread_with_stdio *thr =
+		container_of(file->private_data, struct thread_with_stdio, thr);
+
+	if (thr->ops->unlocked_ioctl)
+		return thr->ops->unlocked_ioctl(thr, cmd, p);
+	return -ENOTTY;
+}
+
 static const struct file_operations thread_with_stdio_fops = {
-	.release	= thread_with_stdio_release,
+	.llseek		= no_llseek,
 	.read		= thread_with_stdio_read,
 	.write		= thread_with_stdio_write,
 	.poll		= thread_with_stdio_poll,
+	.flush		= thread_with_stdio_flush,
+	.release	= thread_with_stdio_release,
+	.unlocked_ioctl	= thread_with_stdio_ioctl,
+};
+
+static const struct file_operations thread_with_stdout_fops = {
 	.llseek		= no_llseek,
+	.read		= thread_with_stdio_read,
+	.poll		= thread_with_stdout_poll,
+	.flush		= thread_with_stdio_flush,
+	.release	= thread_with_stdio_release,
+	.unlocked_ioctl	= thread_with_stdio_ioctl,
 };
 
+static int thread_with_stdio_fn(void *arg)
+{
+	struct thread_with_stdio *thr = arg;
+
+	thr->thr.ret = thr->ops->fn(thr);
+
+	thread_with_stdio_done(thr);
+	return 0;
+}
+
 int bch2_run_thread_with_stdio(struct thread_with_stdio *thr,
-			       void (*exit)(struct thread_with_stdio *),
-			       int (*fn)(void *))
+			       const struct thread_with_stdio_ops *ops)
 {
-	thr->stdio.input_buf = PRINTBUF;
-	thr->stdio.input_buf.atomic++;
-	spin_lock_init(&thr->stdio.input_lock);
-	init_waitqueue_head(&thr->stdio.input_wait);
+	stdio_buf_init(&thr->stdio.input);
+	stdio_buf_init(&thr->stdio.output);
+	thr->ops = ops;
 
-	thr->stdio.output_buf = PRINTBUF;
-	thr->stdio.output_buf.atomic++;
-	spin_lock_init(&thr->stdio.output_lock);
-	init_waitqueue_head(&thr->stdio.output_wait);
+	return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, thread_with_stdio_fn);
+}
 
-	darray_init(&thr->output2);
-	thr->exit = exit;
+int bch2_run_thread_with_stdout(struct thread_with_stdio *thr,
+				const struct thread_with_stdio_ops *ops)
+{
+	stdio_buf_init(&thr->stdio.input);
+	stdio_buf_init(&thr->stdio.output);
+	thr->ops = ops;
 
-	return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, fn);
+	return bch2_run_thread_with_file(&thr->thr, &thread_with_stdout_fops, thread_with_stdio_fn);
 }
+EXPORT_SYMBOL_GPL(bch2_run_thread_with_stdout);
 
-int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *buf, size_t len)
+int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *ubuf, size_t len)
 {
-	wait_event(stdio->input_wait,
-		   stdio->input_buf.pos || stdio->done);
+	struct stdio_buf *buf = &stdio->input;
+
+	/*
+	 * we're waiting on user input (or for the file descriptor to be
+	 * closed), don't want a hung task warning:
+	 */
+	do {
+		wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio),
+				   sysctl_hung_task_timeout_secs * HZ / 2);
+	} while (!stdio_redirect_has_input(stdio));
 
 	if (stdio->done)
 		return -1;
 
-	spin_lock(&stdio->input_lock);
-	int ret = min(len, stdio->input_buf.pos);
-	stdio->input_buf.pos -= ret;
-	memcpy(buf, stdio->input_buf.buf, ret);
-	memmove(stdio->input_buf.buf,
-		stdio->input_buf.buf + ret,
-		stdio->input_buf.pos);
-	spin_unlock(&stdio->input_lock);
+	spin_lock(&buf->lock);
+	int ret = min(len, buf->buf.nr);
+	buf->buf.nr -= ret;
+	memcpy(ubuf, buf->buf.data, ret);
+	memmove(buf->buf.data,
+		buf->buf.data + ret,
+		buf->buf.nr);
+	spin_unlock(&buf->lock);
 
-	wake_up(&stdio->input_wait);
+	wake_up(&buf->wait);
 	return ret;
 }
 
-int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *buf, size_t len)
+int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *ubuf, size_t len)
 {
-	wait_event(stdio->input_wait,
-		   stdio->input_buf.pos || stdio->done);
-
-	if (stdio->done)
-		return -1;
+	struct stdio_buf *buf = &stdio->input;
+	size_t copied = 0;
+	ssize_t ret = 0;
+again:
+	do {
+		wait_event_timeout(buf->wait, stdio_redirect_has_input(stdio),
+				   sysctl_hung_task_timeout_secs * HZ / 2);
+	} while (!stdio_redirect_has_input(stdio));
+
+	if (stdio->done) {
+		ret = -1;
+		goto out;
+	}
 
-	spin_lock(&stdio->input_lock);
-	int ret = min(len, stdio->input_buf.pos);
-	char *n = memchr(stdio->input_buf.buf, '\n', ret);
+	spin_lock(&buf->lock);
+	size_t b = min(len, buf->buf.nr);
+	char *n = memchr(buf->buf.data, '\n', b);
 	if (n)
-		ret = min(ret, n + 1 - stdio->input_buf.buf);
-	stdio->input_buf.pos -= ret;
-	memcpy(buf, stdio->input_buf.buf, ret);
-	memmove(stdio->input_buf.buf,
-		stdio->input_buf.buf + ret,
-		stdio->input_buf.pos);
-	spin_unlock(&stdio->input_lock);
-
-	wake_up(&stdio->input_wait);
+		b = min_t(size_t, b, n + 1 - buf->buf.data);
+	buf->buf.nr -= b;
+	memcpy(ubuf, buf->buf.data, b);
+	memmove(buf->buf.data,
+		buf->buf.data + b,
+		buf->buf.nr);
+	ubuf += b;
+	len -= b;
+	copied += b;
+	spin_unlock(&buf->lock);
+
+	wake_up(&buf->wait);
+
+	if (!n && len)
+		goto again;
+out:
+	return copied ?: ret;
+}
+
+__printf(3, 0)
+static ssize_t bch2_darray_vprintf(darray_char *out, gfp_t gfp, const char *fmt, va_list args)
+{
+	ssize_t ret;
+
+	do {
+		va_list args2;
+		size_t len;
+
+		va_copy(args2, args);
+		len = vsnprintf(out->data + out->nr, darray_room(*out), fmt, args2);
+		va_end(args2);
+
+		if (len + 1 <= darray_room(*out)) {
+			out->nr += len;
+			return len;
+		}
+
+		ret = darray_make_room_gfp(out, len + 1, gfp);
+	} while (ret == 0);
+
+	return ret;
+}
+
+ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocking,
+				    const char *fmt, va_list args)
+{
+	struct stdio_buf *buf = &stdio->output;
+	unsigned long flags;
+	ssize_t ret;
+
+again:
+	spin_lock_irqsave(&buf->lock, flags);
+	ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args);
+	spin_unlock_irqrestore(&buf->lock, flags);
+
+	if (ret < 0) {
+		if (nonblocking)
+			return -EAGAIN;
+
+		ret = wait_event_interruptible(buf->wait,
+				stdio_redirect_has_output_space(stdio));
+		if (ret)
+			return ret;
+		goto again;
+	}
+
+	wake_up(&buf->wait);
+	return ret;
+}
+
+ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *stdio, bool nonblocking,
+				const char *fmt, ...)
+{
+	va_list args;
+	ssize_t ret;
+
+	va_start(args, fmt);
+	ret = bch2_stdio_redirect_vprintf(stdio, nonblocking, fmt, args);
+	va_end(args);
+
 	return ret;
 }
 
diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h
index 05879c5048c8..af54ea8f5b0f 100644
--- a/fs/bcachefs/thread_with_file.h
+++ b/fs/bcachefs/thread_with_file.h
@@ -4,6 +4,38 @@
 
 #include "thread_with_file_types.h"
 
+/*
+ * Thread with file: Run a kthread and connect it to a file descriptor, so that
+ * it can be interacted with via fd read/write methods and closing the file
+ * descriptor stops the kthread.
+ *
+ * We have two different APIs:
+ *
+ * thread_with_file, the low level version.
+ * You get to define the full file_operations, including your release function,
+ * which means that you must call bch2_thread_with_file_exit() from your
+ * .release method
+ *
+ * thread_with_stdio, the higher level version
+ * This implements full piping of input and output, including .poll.
+ *
+ * Notes on behaviour:
+ *  - kthread shutdown behaves like writing or reading from a pipe that has been
+ *    closed
+ *  - Input and output buffers are 4096 bytes, although buffers may in some
+ *    situations slightly exceed that limit so as to avoid chopping off a
+ *    message in the middle in nonblocking mode.
+ *  - Input/output buffers are lazily allocated, with GFP_NOWAIT allocations -
+ *    should be fine but might change in future revisions.
+ *  - Output buffer may grow past 4096 bytes to deal with messages that are
+ *    bigger than 4096 bytes
+ *  - Writing may be done blocking or nonblocking; in nonblocking mode, we only
+ *    drop entire messages.
+ *
+ * To write, use stdio_redirect_printf()
+ * To read, use stdio_redirect_read() or stdio_redirect_readline()
+ */
+
 struct task_struct;
 
 struct thread_with_file {
@@ -17,25 +49,28 @@ int bch2_run_thread_with_file(struct thread_with_file *,
 			      const struct file_operations *,
 			      int (*fn)(void *));
 
+struct thread_with_stdio;
+
+struct thread_with_stdio_ops {
+	void (*exit)(struct thread_with_stdio *);
+	int (*fn)(struct thread_with_stdio *);
+	long (*unlocked_ioctl)(struct thread_with_stdio *, unsigned int, unsigned long);
+};
+
 struct thread_with_stdio {
 	struct thread_with_file	thr;
 	struct stdio_redirect	stdio;
-	DARRAY(char)		output2;
-	void			(*exit)(struct thread_with_stdio *);
+	const struct thread_with_stdio_ops	*ops;
 };
 
-static inline void thread_with_stdio_done(struct thread_with_stdio *thr)
-{
-	thr->thr.done = true;
-	thr->stdio.done = true;
-	wake_up(&thr->stdio.input_wait);
-	wake_up(&thr->stdio.output_wait);
-}
-
 int bch2_run_thread_with_stdio(struct thread_with_stdio *,
-			       void (*exit)(struct thread_with_stdio *),
-			       int (*fn)(void *));
+			       const struct thread_with_stdio_ops *);
+int bch2_run_thread_with_stdout(struct thread_with_stdio *,
+				const struct thread_with_stdio_ops *);
 int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t);
 int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t);
 
+__printf(3, 0) ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *, bool, const char *, va_list);
+__printf(3, 4) ssize_t bch2_stdio_redirect_printf(struct stdio_redirect *, bool, const char *, ...);
+
 #endif /* _BCACHEFS_THREAD_WITH_FILE_H */
diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h
index 90b5e645e98c..e0daf4eec341 100644
--- a/fs/bcachefs/thread_with_file_types.h
+++ b/fs/bcachefs/thread_with_file_types.h
@@ -2,14 +2,21 @@
 #ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H
 #define _BCACHEFS_THREAD_WITH_FILE_TYPES_H
 
+#include "darray.h"
+
+struct stdio_buf {
+	spinlock_t		lock;
+	wait_queue_head_t	wait;
+	darray_char		buf;
+};
+
 struct stdio_redirect {
-	spinlock_t		output_lock;
-	wait_queue_head_t	output_wait;
-	struct printbuf		output_buf;
+	struct stdio_buf	input;
+	struct stdio_buf	output;
 
 	spinlock_t		input_lock;
 	wait_queue_head_t	input_wait;
-	struct printbuf		input_buf;
+	darray_char		input_buf;
 	bool			done;
 };
 
diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c
new file mode 100644
index 000000000000..4508e9dcbee2
--- /dev/null
+++ b/fs/bcachefs/time_stats.c
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/preempt.h>
+#include <linux/time.h>
+#include <linux/spinlock.h>
+
+#include "eytzinger.h"
+#include "time_stats.h"
+
+static const struct time_unit time_units[] = {
+	{ "ns",		1		 },
+	{ "us",		NSEC_PER_USEC	 },
+	{ "ms",		NSEC_PER_MSEC	 },
+	{ "s",		NSEC_PER_SEC	 },
+	{ "m",          (u64) NSEC_PER_SEC * 60},
+	{ "h",          (u64) NSEC_PER_SEC * 3600},
+	{ "d",          (u64) NSEC_PER_SEC * 3600 * 24},
+	{ "w",          (u64) NSEC_PER_SEC * 3600 * 24 * 7},
+	{ "y",          (u64) NSEC_PER_SEC * ((3600 * 24 * 7 * 365) + (3600 * (24 / 4) * 7))}, /* 365.25d */
+	{ "eon",        U64_MAX          },
+};
+
+const struct time_unit *bch2_pick_time_units(u64 ns)
+{
+	const struct time_unit *u;
+
+	for (u = time_units;
+	     u + 1 < time_units + ARRAY_SIZE(time_units) &&
+	     ns >= u[1].nsecs << 1;
+	     u++)
+		;
+
+	return u;
+}
+
+static void quantiles_update(struct quantiles *q, u64 v)
+{
+	unsigned i = 0;
+
+	while (i < ARRAY_SIZE(q->entries)) {
+		struct quantile_entry *e = q->entries + i;
+
+		if (unlikely(!e->step)) {
+			e->m = v;
+			e->step = max_t(unsigned, v / 2, 1024);
+		} else if (e->m > v) {
+			e->m = e->m >= e->step
+				? e->m - e->step
+				: 0;
+		} else if (e->m < v) {
+			e->m = e->m + e->step > e->m
+				? e->m + e->step
+				: U32_MAX;
+		}
+
+		if ((e->m > v ? e->m - v : v - e->m) < e->step)
+			e->step = max_t(unsigned, e->step / 2, 1);
+
+		if (v >= e->m)
+			break;
+
+		i = eytzinger0_child(i, v > e->m);
+	}
+}
+
+static inline void time_stats_update_one(struct bch2_time_stats *stats,
+					      u64 start, u64 end)
+{
+	u64 duration, freq;
+	bool initted = stats->last_event != 0;
+
+	if (time_after64(end, start)) {
+		struct quantiles *quantiles = time_stats_to_quantiles(stats);
+
+		duration = end - start;
+		mean_and_variance_update(&stats->duration_stats, duration);
+		mean_and_variance_weighted_update(&stats->duration_stats_weighted,
+				duration, initted, TIME_STATS_MV_WEIGHT);
+		stats->max_duration = max(stats->max_duration, duration);
+		stats->min_duration = min(stats->min_duration, duration);
+		stats->total_duration += duration;
+
+		if (quantiles)
+			quantiles_update(quantiles, duration);
+	}
+
+	if (stats->last_event && time_after64(end, stats->last_event)) {
+		freq = end - stats->last_event;
+		mean_and_variance_update(&stats->freq_stats, freq);
+		mean_and_variance_weighted_update(&stats->freq_stats_weighted,
+				freq, initted, TIME_STATS_MV_WEIGHT);
+		stats->max_freq = max(stats->max_freq, freq);
+		stats->min_freq = min(stats->min_freq, freq);
+	}
+
+	stats->last_event = end;
+}
+
+void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
+				    struct time_stat_buffer *b)
+{
+	for (struct time_stat_buffer_entry *i = b->entries;
+	     i < b->entries + ARRAY_SIZE(b->entries);
+	     i++)
+		time_stats_update_one(stats, i->start, i->end);
+	b->nr = 0;
+}
+
+static noinline void time_stats_clear_buffer(struct bch2_time_stats *stats,
+					     struct time_stat_buffer *b)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&stats->lock, flags);
+	__bch2_time_stats_clear_buffer(stats, b);
+	spin_unlock_irqrestore(&stats->lock, flags);
+}
+
+void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
+{
+	unsigned long flags;
+
+	if (!stats->buffer) {
+		spin_lock_irqsave(&stats->lock, flags);
+		time_stats_update_one(stats, start, end);
+
+		if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 &&
+		    stats->duration_stats.n > 1024)
+			stats->buffer =
+				alloc_percpu_gfp(struct time_stat_buffer,
+						 GFP_ATOMIC);
+		spin_unlock_irqrestore(&stats->lock, flags);
+	} else {
+		struct time_stat_buffer *b;
+
+		preempt_disable();
+		b = this_cpu_ptr(stats->buffer);
+
+		BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
+		b->entries[b->nr++] = (struct time_stat_buffer_entry) {
+			.start = start,
+			.end = end
+		};
+
+		if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
+			time_stats_clear_buffer(stats, b);
+		preempt_enable();
+	}
+}
+
+void bch2_time_stats_exit(struct bch2_time_stats *stats)
+{
+	free_percpu(stats->buffer);
+}
+
+void bch2_time_stats_init(struct bch2_time_stats *stats)
+{
+	memset(stats, 0, sizeof(*stats));
+	stats->min_duration = U64_MAX;
+	stats->min_freq = U64_MAX;
+	spin_lock_init(&stats->lock);
+}
diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h
new file mode 100644
index 000000000000..5df61403744b
--- /dev/null
+++ b/fs/bcachefs/time_stats.h
@@ -0,0 +1,159 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * bch2_time_stats - collect statistics on events that have a duration, with nicely
+ * formatted textual output on demand
+ *
+ * - percpu buffering of event collection: cheap enough to shotgun
+ *   everywhere without worrying about overhead
+ *
+ * tracks:
+ *  - number of events
+ *  - maximum event duration ever seen
+ *  - sum of all event durations
+ *  - average event duration, standard and weighted
+ *  - standard deviation of event durations, standard and weighted
+ * and analagous statistics for the frequency of events
+ *
+ * We provide both mean and weighted mean (exponentially weighted), and standard
+ * deviation and weighted standard deviation, to give an efficient-to-compute
+ * view of current behaviour versus. average behaviour - "did this event source
+ * just become wonky, or is this typical?".
+ *
+ * Particularly useful for tracking down latency issues.
+ */
+#ifndef _BCACHEFS_TIME_STATS_H
+#define _BCACHEFS_TIME_STATS_H
+
+#include <linux/sched/clock.h>
+#include <linux/spinlock_types.h>
+#include <linux/string.h>
+
+#include "mean_and_variance.h"
+
+struct time_unit {
+	const char	*name;
+	u64		nsecs;
+};
+
+/*
+ * given a nanosecond value, pick the preferred time units for printing:
+ */
+const struct time_unit *bch2_pick_time_units(u64 ns);
+
+/*
+ * quantiles - do not use:
+ *
+ * Only enabled if bch2_time_stats->quantiles_enabled has been manually set - don't
+ * use in new code.
+ */
+
+#define NR_QUANTILES	15
+#define QUANTILE_IDX(i)	inorder_to_eytzinger0(i, NR_QUANTILES)
+#define QUANTILE_FIRST	eytzinger0_first(NR_QUANTILES)
+#define QUANTILE_LAST	eytzinger0_last(NR_QUANTILES)
+
+struct quantiles {
+	struct quantile_entry {
+		u64	m;
+		u64	step;
+	}		entries[NR_QUANTILES];
+};
+
+struct time_stat_buffer {
+	unsigned	nr;
+	struct time_stat_buffer_entry {
+		u64	start;
+		u64	end;
+	}		entries[31];
+};
+
+struct bch2_time_stats {
+	spinlock_t	lock;
+	bool		have_quantiles;
+	/* all fields are in nanoseconds */
+	u64             min_duration;
+	u64		max_duration;
+	u64		total_duration;
+	u64             max_freq;
+	u64             min_freq;
+	u64		last_event;
+	u64		last_event_start;
+
+	struct mean_and_variance	  duration_stats;
+	struct mean_and_variance	  freq_stats;
+
+/* default weight for weighted mean and variance calculations */
+#define TIME_STATS_MV_WEIGHT	8
+
+	struct mean_and_variance_weighted duration_stats_weighted;
+	struct mean_and_variance_weighted freq_stats_weighted;
+	struct time_stat_buffer __percpu *buffer;
+};
+
+struct bch2_time_stats_quantiles {
+	struct bch2_time_stats	stats;
+	struct quantiles	quantiles;
+};
+
+static inline struct quantiles *time_stats_to_quantiles(struct bch2_time_stats *stats)
+{
+	return stats->have_quantiles
+		? &container_of(stats, struct bch2_time_stats_quantiles, stats)->quantiles
+		: NULL;
+}
+
+void __bch2_time_stats_clear_buffer(struct bch2_time_stats *, struct time_stat_buffer *);
+void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
+
+/**
+ * time_stats_update - collect a new event being tracked
+ *
+ * @stats	- bch2_time_stats to update
+ * @start	- start time of event, recorded with local_clock()
+ *
+ * The end duration of the event will be the current time
+ */
+static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
+{
+	__bch2_time_stats_update(stats, start, local_clock());
+}
+
+/**
+ * track_event_change - track state change events
+ *
+ * @stats	- bch2_time_stats to update
+ * @v		- new state, true or false
+ *
+ * Use this when tracking time stats for state changes, i.e. resource X becoming
+ * blocked/unblocked.
+ */
+static inline bool track_event_change(struct bch2_time_stats *stats, bool v)
+{
+	if (v != !!stats->last_event_start) {
+		if (!v) {
+			bch2_time_stats_update(stats, stats->last_event_start);
+			stats->last_event_start = 0;
+		} else {
+			stats->last_event_start = local_clock() ?: 1;
+			return true;
+		}
+	}
+
+	return false;
+}
+
+void bch2_time_stats_exit(struct bch2_time_stats *);
+void bch2_time_stats_init(struct bch2_time_stats *);
+
+static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq)
+{
+	bch2_time_stats_exit(&statq->stats);
+}
+static inline void bch2_time_stats_quantiles_init(struct bch2_time_stats_quantiles *statq)
+{
+	bch2_time_stats_init(&statq->stats);
+	statq->stats.have_quantiles = true;
+	memset(&statq->quantiles, 0, sizeof(statq->quantiles));
+}
+
+#endif /* _BCACHEFS_TIME_STATS_H */
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index 293b90d704fb..6aa81d1e6d36 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -1431,6 +1431,25 @@ DEFINE_EVENT(fs_str, data_update,
 	TP_ARGS(c, str)
 );
 
+TRACE_EVENT(error_downcast,
+	TP_PROTO(int bch_err, int std_err, unsigned long ip),
+	TP_ARGS(bch_err, std_err, ip),
+
+	TP_STRUCT__entry(
+		__array(char,		bch_err, 32		)
+		__array(char,		std_err, 32		)
+		__array(char,		ip, 32			)
+	),
+
+	TP_fast_assign(
+		strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err));
+		strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err));
+		snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip);
+	),
+
+	TP_printk("%s -> %s %s", __entry->bch_err, __entry->std_err, __entry->ip)
+);
+
 #endif /* _TRACE_BCACHEFS_H */
 
 /* This part must be outside protection */
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 3a32faa86b5c..92c6ad75e702 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -337,157 +337,16 @@ void bch2_prt_datetime(struct printbuf *out, time64_t sec)
 }
 #endif
 
-static const struct time_unit {
-	const char	*name;
-	u64		nsecs;
-} time_units[] = {
-	{ "ns",		1		 },
-	{ "us",		NSEC_PER_USEC	 },
-	{ "ms",		NSEC_PER_MSEC	 },
-	{ "s",		NSEC_PER_SEC	 },
-	{ "m",          (u64) NSEC_PER_SEC * 60},
-	{ "h",          (u64) NSEC_PER_SEC * 3600},
-	{ "eon",        U64_MAX          },
-};
-
-static const struct time_unit *pick_time_units(u64 ns)
-{
-	const struct time_unit *u;
-
-	for (u = time_units;
-	     u + 1 < time_units + ARRAY_SIZE(time_units) &&
-	     ns >= u[1].nsecs << 1;
-	     u++)
-		;
-
-	return u;
-}
-
 void bch2_pr_time_units(struct printbuf *out, u64 ns)
 {
-	const struct time_unit *u = pick_time_units(ns);
+	const struct time_unit *u = bch2_pick_time_units(ns);
 
 	prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
 }
 
-/* time stats: */
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v)
-{
-	unsigned i = 0;
-
-	while (i < ARRAY_SIZE(q->entries)) {
-		struct bch2_quantile_entry *e = q->entries + i;
-
-		if (unlikely(!e->step)) {
-			e->m = v;
-			e->step = max_t(unsigned, v / 2, 1024);
-		} else if (e->m > v) {
-			e->m = e->m >= e->step
-				? e->m - e->step
-				: 0;
-		} else if (e->m < v) {
-			e->m = e->m + e->step > e->m
-				? e->m + e->step
-				: U32_MAX;
-		}
-
-		if ((e->m > v ? e->m - v : v - e->m) < e->step)
-			e->step = max_t(unsigned, e->step / 2, 1);
-
-		if (v >= e->m)
-			break;
-
-		i = eytzinger0_child(i, v > e->m);
-	}
-}
-
-static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
-					      u64 start, u64 end)
-{
-	u64 duration, freq;
-
-	if (time_after64(end, start)) {
-		duration = end - start;
-		mean_and_variance_update(&stats->duration_stats, duration);
-		mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration);
-		stats->max_duration = max(stats->max_duration, duration);
-		stats->min_duration = min(stats->min_duration, duration);
-		stats->total_duration += duration;
-		bch2_quantiles_update(&stats->quantiles, duration);
-	}
-
-	if (stats->last_event && time_after64(end, stats->last_event)) {
-		freq = end - stats->last_event;
-		mean_and_variance_update(&stats->freq_stats, freq);
-		mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq);
-		stats->max_freq = max(stats->max_freq, freq);
-		stats->min_freq = min(stats->min_freq, freq);
-	}
-
-	stats->last_event = end;
-}
-
-static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
-					   struct bch2_time_stat_buffer *b)
-{
-	for (struct bch2_time_stat_buffer_entry *i = b->entries;
-	     i < b->entries + ARRAY_SIZE(b->entries);
-	     i++)
-		bch2_time_stats_update_one(stats, i->start, i->end);
-	b->nr = 0;
-}
-
-static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
-						  struct bch2_time_stat_buffer *b)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&stats->lock, flags);
-	__bch2_time_stats_clear_buffer(stats, b);
-	spin_unlock_irqrestore(&stats->lock, flags);
-}
-
-void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
-{
-	unsigned long flags;
-
-	WARN_ONCE(!stats->duration_stats_weighted.weight ||
-		  !stats->freq_stats_weighted.weight,
-		  "uninitialized time_stats");
-
-	if (!stats->buffer) {
-		spin_lock_irqsave(&stats->lock, flags);
-		bch2_time_stats_update_one(stats, start, end);
-
-		if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 &&
-		    stats->duration_stats.n > 1024)
-			stats->buffer =
-				alloc_percpu_gfp(struct bch2_time_stat_buffer,
-						 GFP_ATOMIC);
-		spin_unlock_irqrestore(&stats->lock, flags);
-	} else {
-		struct bch2_time_stat_buffer *b;
-
-		preempt_disable();
-		b = this_cpu_ptr(stats->buffer);
-
-		BUG_ON(b->nr >= ARRAY_SIZE(b->entries));
-		b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) {
-			.start = start,
-			.end = end
-		};
-
-		if (unlikely(b->nr == ARRAY_SIZE(b->entries)))
-			bch2_time_stats_clear_buffer(stats, b);
-		preempt_enable();
-	}
-}
-
 static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
 {
-	const struct time_unit *u = pick_time_units(ns);
+	const struct time_unit *u = bch2_pick_time_units(ns);
 
 	prt_printf(out, "%llu ", div64_u64(ns, u->nsecs));
 	prt_tab_rjust(out);
@@ -506,10 +365,9 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64
 
 void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
 {
-	const struct time_unit *u;
+	struct quantiles *quantiles = time_stats_to_quantiles(stats);
 	s64 f_mean = 0, d_mean = 0;
-	u64 q, last_q = 0, f_stddev = 0, d_stddev = 0;
-	int i;
+	u64 f_stddev = 0, d_stddev = 0;
 
 	if (stats->buffer) {
 		int cpu;
@@ -571,14 +429,14 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
 	prt_tab(out);
 	bch2_pr_time_units_aligned(out, d_mean);
 	prt_tab(out);
-	bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted));
+	bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
 	prt_newline(out);
 
 	prt_printf(out, "stddev:");
 	prt_tab(out);
 	bch2_pr_time_units_aligned(out, d_stddev);
 	prt_tab(out);
-	bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted));
+	bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted, TIME_STATS_MV_WEIGHT));
 
 	printbuf_indent_sub(out, 2);
 	prt_newline(out);
@@ -594,53 +452,38 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
 	prt_tab(out);
 	bch2_pr_time_units_aligned(out, f_mean);
 	prt_tab(out);
-	bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted));
+	bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
 	prt_newline(out);
 
 	prt_printf(out, "stddev:");
 	prt_tab(out);
 	bch2_pr_time_units_aligned(out, f_stddev);
 	prt_tab(out);
-	bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted));
+	bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT));
 
 	printbuf_indent_sub(out, 2);
 	prt_newline(out);
 
 	printbuf_tabstops_reset(out);
 
-	i = eytzinger0_first(NR_QUANTILES);
-	u = pick_time_units(stats->quantiles.entries[i].m);
-
-	prt_printf(out, "quantiles (%s):\t", u->name);
-	eytzinger0_for_each(i, NR_QUANTILES) {
-		bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
-
-		q = max(stats->quantiles.entries[i].m, last_q);
-		prt_printf(out, "%llu ",
-		       div_u64(q, u->nsecs));
-		if (is_last)
-			prt_newline(out);
-		last_q = q;
+	if (quantiles) {
+		int i = eytzinger0_first(NR_QUANTILES);
+		const struct time_unit *u =
+			bch2_pick_time_units(quantiles->entries[i].m);
+		u64 last_q = 0;
+
+		prt_printf(out, "quantiles (%s):\t", u->name);
+		eytzinger0_for_each(i, NR_QUANTILES) {
+			bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1;
+
+			u64 q = max(quantiles->entries[i].m, last_q);
+			prt_printf(out, "%llu ", div_u64(q, u->nsecs));
+			if (is_last)
+				prt_newline(out);
+			last_q = q;
+		}
 	}
 }
-#else
-void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {}
-#endif
-
-void bch2_time_stats_exit(struct bch2_time_stats *stats)
-{
-	free_percpu(stats->buffer);
-}
-
-void bch2_time_stats_init(struct bch2_time_stats *stats)
-{
-	memset(stats, 0, sizeof(*stats));
-	stats->duration_stats_weighted.weight = 8;
-	stats->freq_stats_weighted.weight = 8;
-	stats->min_duration = U64_MAX;
-	stats->min_freq = U64_MAX;
-	spin_lock_init(&stats->lock);
-}
 
 /* ratelimit: */
 
@@ -864,171 +707,6 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter)
 	}
 }
 
-static int alignment_ok(const void *base, size_t align)
-{
-	return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
-		((unsigned long)base & (align - 1)) == 0;
-}
-
-static void u32_swap(void *a, void *b, size_t size)
-{
-	u32 t = *(u32 *)a;
-	*(u32 *)a = *(u32 *)b;
-	*(u32 *)b = t;
-}
-
-static void u64_swap(void *a, void *b, size_t size)
-{
-	u64 t = *(u64 *)a;
-	*(u64 *)a = *(u64 *)b;
-	*(u64 *)b = t;
-}
-
-static void generic_swap(void *a, void *b, size_t size)
-{
-	char t;
-
-	do {
-		t = *(char *)a;
-		*(char *)a++ = *(char *)b;
-		*(char *)b++ = t;
-	} while (--size > 0);
-}
-
-static inline int do_cmp(void *base, size_t n, size_t size,
-			 int (*cmp_func)(const void *, const void *, size_t),
-			 size_t l, size_t r)
-{
-	return cmp_func(base + inorder_to_eytzinger0(l, n) * size,
-			base + inorder_to_eytzinger0(r, n) * size,
-			size);
-}
-
-static inline void do_swap(void *base, size_t n, size_t size,
-			   void (*swap_func)(void *, void *, size_t),
-			   size_t l, size_t r)
-{
-	swap_func(base + inorder_to_eytzinger0(l, n) * size,
-		  base + inorder_to_eytzinger0(r, n) * size,
-		  size);
-}
-
-void eytzinger0_sort(void *base, size_t n, size_t size,
-		     int (*cmp_func)(const void *, const void *, size_t),
-		     void (*swap_func)(void *, void *, size_t))
-{
-	int i, c, r;
-
-	if (!swap_func) {
-		if (size == 4 && alignment_ok(base, 4))
-			swap_func = u32_swap;
-		else if (size == 8 && alignment_ok(base, 8))
-			swap_func = u64_swap;
-		else
-			swap_func = generic_swap;
-	}
-
-	/* heapify */
-	for (i = n / 2 - 1; i >= 0; --i) {
-		for (r = i; r * 2 + 1 < n; r = c) {
-			c = r * 2 + 1;
-
-			if (c + 1 < n &&
-			    do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
-				c++;
-
-			if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
-				break;
-
-			do_swap(base, n, size, swap_func, r, c);
-		}
-	}
-
-	/* sort */
-	for (i = n - 1; i > 0; --i) {
-		do_swap(base, n, size, swap_func, 0, i);
-
-		for (r = 0; r * 2 + 1 < i; r = c) {
-			c = r * 2 + 1;
-
-			if (c + 1 < i &&
-			    do_cmp(base, n, size, cmp_func, c, c + 1) < 0)
-				c++;
-
-			if (do_cmp(base, n, size, cmp_func, r, c) >= 0)
-				break;
-
-			do_swap(base, n, size, swap_func, r, c);
-		}
-	}
-}
-
-void sort_cmp_size(void *base, size_t num, size_t size,
-	  int (*cmp_func)(const void *, const void *, size_t),
-	  void (*swap_func)(void *, void *, size_t size))
-{
-	/* pre-scale counters for performance */
-	int i = (num/2 - 1) * size, n = num * size, c, r;
-
-	if (!swap_func) {
-		if (size == 4 && alignment_ok(base, 4))
-			swap_func = u32_swap;
-		else if (size == 8 && alignment_ok(base, 8))
-			swap_func = u64_swap;
-		else
-			swap_func = generic_swap;
-	}
-
-	/* heapify */
-	for ( ; i >= 0; i -= size) {
-		for (r = i; r * 2 + size < n; r  = c) {
-			c = r * 2 + size;
-			if (c < n - size &&
-			    cmp_func(base + c, base + c + size, size) < 0)
-				c += size;
-			if (cmp_func(base + r, base + c, size) >= 0)
-				break;
-			swap_func(base + r, base + c, size);
-		}
-	}
-
-	/* sort */
-	for (i = n - size; i > 0; i -= size) {
-		swap_func(base, base + i, size);
-		for (r = 0; r * 2 + size < i; r = c) {
-			c = r * 2 + size;
-			if (c < i - size &&
-			    cmp_func(base + c, base + c + size, size) < 0)
-				c += size;
-			if (cmp_func(base + r, base + c, size) >= 0)
-				break;
-			swap_func(base + r, base + c, size);
-		}
-	}
-}
-
-static void mempool_free_vp(void *element, void *pool_data)
-{
-	size_t size = (size_t) pool_data;
-
-	vpfree(element, size);
-}
-
-static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data)
-{
-	size_t size = (size_t) pool_data;
-
-	return vpmalloc(size, gfp_mask);
-}
-
-int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size)
-{
-	return size < PAGE_SIZE
-		? mempool_init_kmalloc_pool(pool, min_nr, size)
-		: mempool_init(pool, min_nr, mempool_alloc_vp,
-			       mempool_free_vp, (void *) size);
-}
-
 #if 0
 void eytzinger1_test(void)
 {
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index b414736d59a5..5cf885b09986 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -21,6 +21,7 @@
 #include "mean_and_variance.h"
 
 #include "darray.h"
+#include "time_stats.h"
 
 struct closure;
 
@@ -53,38 +54,6 @@ static inline size_t buf_pages(void *p, size_t len)
 			    PAGE_SIZE);
 }
 
-static inline void vpfree(void *p, size_t size)
-{
-	if (is_vmalloc_addr(p))
-		vfree(p);
-	else
-		free_pages((unsigned long) p, get_order(size));
-}
-
-static inline void *vpmalloc(size_t size, gfp_t gfp_mask)
-{
-	return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN,
-					 get_order(size)) ?:
-		__vmalloc(size, gfp_mask);
-}
-
-static inline void kvpfree(void *p, size_t size)
-{
-	if (size < PAGE_SIZE)
-		kfree(p);
-	else
-		vpfree(p, size);
-}
-
-static inline void *kvpmalloc(size_t size, gfp_t gfp_mask)
-{
-	return size < PAGE_SIZE
-		? kmalloc(size, gfp_mask)
-		: vpmalloc(size, gfp_mask);
-}
-
-int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t);
-
 #define HEAP(type)							\
 struct {								\
 	size_t size, used;						\
@@ -97,13 +66,13 @@ struct {								\
 ({									\
 	(heap)->used = 0;						\
 	(heap)->size = (_size);						\
-	(heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\
+	(heap)->data = kvmalloc((heap)->size * sizeof((heap)->data[0]),\
 				 (gfp));				\
 })
 
 #define free_heap(heap)							\
 do {									\
-	kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0]));	\
+	kvfree((heap)->data);						\
 	(heap)->data = NULL;						\
 } while (0)
 
@@ -361,84 +330,8 @@ static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev)
 #endif
 }
 
-#define NR_QUANTILES	15
-#define QUANTILE_IDX(i)	inorder_to_eytzinger0(i, NR_QUANTILES)
-#define QUANTILE_FIRST	eytzinger0_first(NR_QUANTILES)
-#define QUANTILE_LAST	eytzinger0_last(NR_QUANTILES)
-
-struct bch2_quantiles {
-	struct bch2_quantile_entry {
-		u64	m;
-		u64	step;
-	}		entries[NR_QUANTILES];
-};
-
-struct bch2_time_stat_buffer {
-	unsigned	nr;
-	struct bch2_time_stat_buffer_entry {
-		u64	start;
-		u64	end;
-	}		entries[32];
-};
-
-struct bch2_time_stats {
-	spinlock_t	lock;
-	/* all fields are in nanoseconds */
-	u64             min_duration;
-	u64		max_duration;
-	u64		total_duration;
-	u64             max_freq;
-	u64             min_freq;
-	u64		last_event;
-	struct bch2_quantiles quantiles;
-
-	struct mean_and_variance	  duration_stats;
-	struct mean_and_variance_weighted duration_stats_weighted;
-	struct mean_and_variance	  freq_stats;
-	struct mean_and_variance_weighted freq_stats_weighted;
-	struct bch2_time_stat_buffer __percpu *buffer;
-};
-
-#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
-void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
-
-static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
-{
-	__bch2_time_stats_update(stats, start, local_clock());
-}
-
-static inline bool track_event_change(struct bch2_time_stats *stats,
-				      u64 *start, bool v)
-{
-	if (v != !!*start) {
-		if (!v) {
-			bch2_time_stats_update(stats, *start);
-			*start = 0;
-		} else {
-			*start = local_clock() ?: 1;
-			return true;
-		}
-	}
-
-	return false;
-}
-#else
-static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
-static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {}
-static inline bool track_event_change(struct bch2_time_stats *stats,
-				      u64 *start, bool v)
-{
-	bool ret = v && !*start;
-	*start = v;
-	return ret;
-}
-#endif
-
 void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
 
-void bch2_time_stats_exit(struct bch2_time_stats *);
-void bch2_time_stats_init(struct bch2_time_stats *);
-
 #define ewma_add(ewma, val, weight)					\
 ({									\
 	typeof(ewma) _ewma = (ewma);					\
@@ -738,10 +631,6 @@ static inline void memset_u64s_tail(void *s, int c, unsigned bytes)
 	memset(s + bytes, c, rem);
 }
 
-void sort_cmp_size(void *base, size_t num, size_t size,
-	  int (*cmp_func)(const void *, const void *, size_t),
-	  void (*swap_func)(void *, void *, size_t));
-
 /* just the memmove, doesn't update @_nr */
 #define __array_insert_item(_array, _nr, _pos)				\
 	memmove(&(_array)[(_pos) + 1],					\
@@ -788,8 +677,15 @@ static inline void __move_gap(void *array, size_t element_size,
 }
 
 /* Move the gap in a gap buffer: */
-#define move_gap(_array, _nr, _size, _old_gap, _new_gap)	\
-	__move_gap(_array, sizeof(_array[0]), _nr, _size, _old_gap, _new_gap)
+#define move_gap(_d, _new_gap)						\
+do {									\
+	BUG_ON(_new_gap > (_d)->nr);					\
+	BUG_ON((_d)->gap > (_d)->nr);					\
+									\
+	__move_gap((_d)->data, sizeof((_d)->data[0]),			\
+		   (_d)->nr, (_d)->size, (_d)->gap, _new_gap);		\
+	(_d)->gap = _new_gap;						\
+} while (0)
 
 #define bubble_sort(_base, _nr, _cmp)					\
 do {									\
@@ -876,4 +772,43 @@ static inline bool qstr_eq(const struct qstr l, const struct qstr r)
 void bch2_darray_str_exit(darray_str *);
 int bch2_split_devs(const char *, darray_str *);
 
+#ifdef __KERNEL__
+
+__must_check
+static inline int copy_to_user_errcode(void __user *to, const void *from, unsigned long n)
+{
+	return copy_to_user(to, from, n) ? -EFAULT : 0;
+}
+
+__must_check
+static inline int copy_from_user_errcode(void *to, const void __user *from, unsigned long n)
+{
+	return copy_from_user(to, from, n) ? -EFAULT : 0;
+}
+
+#endif
+
+static inline void mod_bit(long nr, volatile unsigned long *addr, bool v)
+{
+	if (v)
+		set_bit(nr, addr);
+	else
+		clear_bit(nr, addr);
+}
+
+static inline void __set_bit_le64(size_t bit, __le64 *addr)
+{
+	addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64));
+}
+
+static inline void __clear_bit_le64(size_t bit, __le64 *addr)
+{
+	addr[bit / 64] &= ~cpu_to_le64(BIT_ULL(bit % 64));
+}
+
+static inline bool test_bit_le64(size_t bit, __le64 *addr)
+{
+	return (addr[bit / 64] & cpu_to_le64(BIT_ULL(bit % 64))) != 0;
+}
+
 #endif /* _BCACHEFS_UTIL_H */
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index 9c0d2316031b..754f17bba68e 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -544,11 +544,11 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler,
 		kfree(buf);
 
 		if (ret < 0)
-			return ret;
+			goto err_class_exit;
 
 		ret = bch2_opt_check_may_set(c, opt_id, v);
 		if (ret < 0)
-			return ret;
+			goto err_class_exit;
 
 		s.v = v + 1;
 		s.defined = true;
@@ -595,6 +595,7 @@ err:
 	     (opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression))))
 		bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum);
 
+err_class_exit:
 	return bch2_err_class(ret);
 }
 
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 2b4dda047450..d76f406d3b2e 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -435,8 +435,7 @@ befs_init_inodecache(void)
 {
 	befs_inode_cachep = kmem_cache_create_usercopy("befs_inode_cache",
 				sizeof(struct befs_inode_info), 0,
-				(SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
-					SLAB_ACCOUNT),
+				SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
 				offsetof(struct befs_inode_info,
 					i_data.symlink),
 				sizeof_field(struct befs_inode_info,
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 355957dbce39..db81570c9637 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -259,7 +259,7 @@ static int __init init_inodecache(void)
 	bfs_inode_cachep = kmem_cache_create("bfs_inode_cache",
 					     sizeof(struct bfs_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+						SLAB_ACCOUNT),
 					     init_once);
 	if (bfs_inode_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index fefc642541cb..3314249e8674 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -320,7 +320,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
 	else
 		executable_stack = EXSTACK_DEFAULT;
 
-	if (stack_size == 0) {
+	if (stack_size == 0 && interp_params.flags & ELF_FDPIC_FLAG_PRESENT) {
 		stack_size = interp_params.stack_size;
 		if (interp_params.flags & ELF_FDPIC_FLAG_EXEC_STACK)
 			executable_stack = EXSTACK_ENABLE_X;
@@ -1359,7 +1359,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p,
 	SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid));
 	SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid));
 	rcu_read_unlock();
-	strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname));
+	get_task_comm(psinfo->pr_fname, p);
 
 	return 0;
 }
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index e24d784898fc..7696beec4c21 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -244,7 +244,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 {
 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 	struct btrfs_device *device;
-	struct bdev_handle *bdev_handle;
+	struct file *bdev_file;
 	struct block_device *bdev;
 	u64 devid = BTRFS_DEV_REPLACE_DEVID;
 	int ret = 0;
@@ -255,13 +255,13 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 		return -EINVAL;
 	}
 
-	bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE,
+	bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
 					fs_info->bdev_holder, NULL);
-	if (IS_ERR(bdev_handle)) {
+	if (IS_ERR(bdev_file)) {
 		btrfs_err(fs_info, "target device %s is invalid!", device_path);
-		return PTR_ERR(bdev_handle);
+		return PTR_ERR(bdev_file);
 	}
-	bdev = bdev_handle->bdev;
+	bdev = file_bdev(bdev_file);
 
 	if (!btrfs_check_device_zone_type(fs_info, bdev)) {
 		btrfs_err(fs_info,
@@ -312,7 +312,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 	device->commit_bytes_used = device->bytes_used;
 	device->fs_info = fs_info;
 	device->bdev = bdev;
-	device->bdev_handle = bdev_handle;
+	device->bdev_file = bdev_file;
 	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
 	set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
 	device->dev_stats_valid = 1;
@@ -333,7 +333,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 	return 0;
 
 error:
-	bdev_release(bdev_handle);
+	fput(bdev_file);
 	return ret;
 }
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 5a507237c4fa..55f3ba6a831c 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2736,7 +2736,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
 	struct inode *inode = file_inode(file);
 	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
 	struct btrfs_ioctl_vol_args_v2 *vol_args;
-	struct bdev_handle *bdev_handle = NULL;
+	struct file *bdev_file = NULL;
 	int ret;
 	bool cancel = false;
 
@@ -2776,7 +2776,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
 		goto err_drop;
 
 	/* Exclusive operation is now claimed */
-	ret = btrfs_rm_device(fs_info, &args, &bdev_handle);
+	ret = btrfs_rm_device(fs_info, &args, &bdev_file);
 
 	btrfs_exclop_finish(fs_info);
 
@@ -2790,8 +2790,8 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
 	}
 err_drop:
 	mnt_drop_write_file(file);
-	if (bdev_handle)
-		bdev_release(bdev_handle);
+	if (bdev_file)
+		fput(bdev_file);
 out:
 	btrfs_put_dev_args_from_path(&args);
 	kfree(vol_args);
@@ -2804,7 +2804,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
 	struct inode *inode = file_inode(file);
 	struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
 	struct btrfs_ioctl_vol_args *vol_args;
-	struct bdev_handle *bdev_handle = NULL;
+	struct file *bdev_file = NULL;
 	int ret;
 	bool cancel = false;
 
@@ -2834,15 +2834,15 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
 	ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
 					   cancel);
 	if (ret == 0) {
-		ret = btrfs_rm_device(fs_info, &args, &bdev_handle);
+		ret = btrfs_rm_device(fs_info, &args, &bdev_file);
 		if (!ret)
 			btrfs_info(fs_info, "disk deleted %s", vol_args->name);
 		btrfs_exclop_finish(fs_info);
 	}
 
 	mnt_drop_write_file(file);
-	if (bdev_handle)
-		bdev_release(bdev_handle);
+	if (bdev_file)
+		fput(bdev_file);
 out:
 	btrfs_put_dev_args_from_path(&args);
 out_free:
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dedec3d9b111..f15591f3e54f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -466,39 +466,39 @@ static noinline struct btrfs_fs_devices *find_fsid(
 
 static int
 btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
-		      int flush, struct bdev_handle **bdev_handle,
+		      int flush, struct file **bdev_file,
 		      struct btrfs_super_block **disk_super)
 {
 	struct block_device *bdev;
 	int ret;
 
-	*bdev_handle = bdev_open_by_path(device_path, flags, holder, NULL);
+	*bdev_file = bdev_file_open_by_path(device_path, flags, holder, NULL);
 
-	if (IS_ERR(*bdev_handle)) {
-		ret = PTR_ERR(*bdev_handle);
+	if (IS_ERR(*bdev_file)) {
+		ret = PTR_ERR(*bdev_file);
 		goto error;
 	}
-	bdev = (*bdev_handle)->bdev;
+	bdev = file_bdev(*bdev_file);
 
 	if (flush)
 		sync_blockdev(bdev);
 	ret = set_blocksize(bdev, BTRFS_BDEV_BLOCKSIZE);
 	if (ret) {
-		bdev_release(*bdev_handle);
+		fput(*bdev_file);
 		goto error;
 	}
 	invalidate_bdev(bdev);
 	*disk_super = btrfs_read_dev_super(bdev);
 	if (IS_ERR(*disk_super)) {
 		ret = PTR_ERR(*disk_super);
-		bdev_release(*bdev_handle);
+		fput(*bdev_file);
 		goto error;
 	}
 
 	return 0;
 
 error:
-	*bdev_handle = NULL;
+	*bdev_file = NULL;
 	return ret;
 }
 
@@ -641,7 +641,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
 			struct btrfs_device *device, blk_mode_t flags,
 			void *holder)
 {
-	struct bdev_handle *bdev_handle;
+	struct file *bdev_file;
 	struct btrfs_super_block *disk_super;
 	u64 devid;
 	int ret;
@@ -652,7 +652,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
 		return -EINVAL;
 
 	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
-				    &bdev_handle, &disk_super);
+				    &bdev_file, &disk_super);
 	if (ret)
 		return ret;
 
@@ -676,20 +676,20 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
 		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 		fs_devices->seeding = true;
 	} else {
-		if (bdev_read_only(bdev_handle->bdev))
+		if (bdev_read_only(file_bdev(bdev_file)))
 			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 		else
 			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 	}
 
-	if (!bdev_nonrot(bdev_handle->bdev))
+	if (!bdev_nonrot(file_bdev(bdev_file)))
 		fs_devices->rotating = true;
 
-	if (bdev_max_discard_sectors(bdev_handle->bdev))
+	if (bdev_max_discard_sectors(file_bdev(bdev_file)))
 		fs_devices->discardable = true;
 
-	device->bdev_handle = bdev_handle;
-	device->bdev = bdev_handle->bdev;
+	device->bdev_file = bdev_file;
+	device->bdev = file_bdev(bdev_file);
 	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
 
 	if (device->devt != device->bdev->bd_dev) {
@@ -714,7 +714,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
 
 error_free_page:
 	btrfs_release_disk_super(disk_super);
-	bdev_release(bdev_handle);
+	fput(bdev_file);
 
 	return -EINVAL;
 }
@@ -1027,10 +1027,10 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
 		if (device->devid == BTRFS_DEV_REPLACE_DEVID)
 			continue;
 
-		if (device->bdev_handle) {
-			bdev_release(device->bdev_handle);
+		if (device->bdev_file) {
+			fput(device->bdev_file);
 			device->bdev = NULL;
-			device->bdev_handle = NULL;
+			device->bdev_file = NULL;
 			fs_devices->open_devices--;
 		}
 		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
@@ -1075,7 +1075,7 @@ static void btrfs_close_bdev(struct btrfs_device *device)
 		invalidate_bdev(device->bdev);
 	}
 
-	bdev_release(device->bdev_handle);
+	fput(device->bdev_file);
 }
 
 static void btrfs_close_one_device(struct btrfs_device *device)
@@ -1320,6 +1320,47 @@ int btrfs_forget_devices(dev_t devt)
 	return ret;
 }
 
+static bool btrfs_skip_registration(struct btrfs_super_block *disk_super,
+				    const char *path, dev_t devt,
+				    bool mount_arg_dev)
+{
+	struct btrfs_fs_devices *fs_devices;
+
+	/*
+	 * Do not skip device registration for mounted devices with matching
+	 * maj:min but different paths. Booting without initrd relies on
+	 * /dev/root initially, later replaced with the actual root device.
+	 * A successful scan ensures grub2-probe selects the correct device.
+	 */
+	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+		struct btrfs_device *device;
+
+		mutex_lock(&fs_devices->device_list_mutex);
+
+		if (!fs_devices->opened) {
+			mutex_unlock(&fs_devices->device_list_mutex);
+			continue;
+		}
+
+		list_for_each_entry(device, &fs_devices->devices, dev_list) {
+			if (device->bdev && (device->bdev->bd_dev == devt) &&
+			    strcmp(device->name->str, path) != 0) {
+				mutex_unlock(&fs_devices->device_list_mutex);
+
+				/* Do not skip registration. */
+				return false;
+			}
+		}
+		mutex_unlock(&fs_devices->device_list_mutex);
+	}
+
+	if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 &&
+	    !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING))
+		return true;
+
+	return false;
+}
+
 /*
  * Look for a btrfs signature on a device. This may be called out of the mount path
  * and we are not allowed to call set_blocksize during the scan. The superblock
@@ -1335,8 +1376,9 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
 	struct btrfs_super_block *disk_super;
 	bool new_device_added = false;
 	struct btrfs_device *device = NULL;
-	struct bdev_handle *bdev_handle;
+	struct file *bdev_file;
 	u64 bytenr, bytenr_orig;
+	dev_t devt;
 	int ret;
 
 	lockdep_assert_held(&uuid_mutex);
@@ -1358,37 +1400,31 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
 	 * values temporarily, as the device paths of the fsid are the only
 	 * required information for assembling the volume.
 	 */
-	bdev_handle = bdev_open_by_path(path, flags, NULL, NULL);
-	if (IS_ERR(bdev_handle))
-		return ERR_CAST(bdev_handle);
+	bdev_file = bdev_file_open_by_path(path, flags, NULL, NULL);
+	if (IS_ERR(bdev_file))
+		return ERR_CAST(bdev_file);
 
 	bytenr_orig = btrfs_sb_offset(0);
-	ret = btrfs_sb_log_location_bdev(bdev_handle->bdev, 0, READ, &bytenr);
+	ret = btrfs_sb_log_location_bdev(file_bdev(bdev_file), 0, READ, &bytenr);
 	if (ret) {
 		device = ERR_PTR(ret);
 		goto error_bdev_put;
 	}
 
-	disk_super = btrfs_read_disk_super(bdev_handle->bdev, bytenr,
+	disk_super = btrfs_read_disk_super(file_bdev(bdev_file), bytenr,
 					   bytenr_orig);
 	if (IS_ERR(disk_super)) {
 		device = ERR_CAST(disk_super);
 		goto error_bdev_put;
 	}
 
-	if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 &&
-	    !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)) {
-		dev_t devt;
+	devt = file_bdev(bdev_file)->bd_dev;
+	if (btrfs_skip_registration(disk_super, path, devt, mount_arg_dev)) {
+		pr_debug("BTRFS: skip registering single non-seed device %s (%d:%d)\n",
+			  path, MAJOR(devt), MINOR(devt));
 
-		ret = lookup_bdev(path, &devt);
-		if (ret)
-			btrfs_warn(NULL, "lookup bdev failed for path %s: %d",
-				   path, ret);
-		else
-			btrfs_free_stale_devices(devt, NULL);
+		btrfs_free_stale_devices(devt, NULL);
 
-	pr_debug("BTRFS: skip registering single non-seed device %s (%d:%d)\n",
-			path, MAJOR(devt), MINOR(devt));
 		device = NULL;
 		goto free_disk_super;
 	}
@@ -1401,7 +1437,7 @@ free_disk_super:
 	btrfs_release_disk_super(disk_super);
 
 error_bdev_put:
-	bdev_release(bdev_handle);
+	fput(bdev_file);
 
 	return device;
 }
@@ -2076,7 +2112,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_devic
 
 int btrfs_rm_device(struct btrfs_fs_info *fs_info,
 		    struct btrfs_dev_lookup_args *args,
-		    struct bdev_handle **bdev_handle)
+		    struct file **bdev_file)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_device *device;
@@ -2185,7 +2221,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
 
 	btrfs_assign_next_active_device(device, NULL);
 
-	if (device->bdev_handle) {
+	if (device->bdev_file) {
 		cur_devices->open_devices--;
 		/* remove sysfs entry */
 		btrfs_sysfs_remove_device(device);
@@ -2201,9 +2237,9 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
 	 * free the device.
 	 *
 	 * We cannot call btrfs_close_bdev() here because we're holding the sb
-	 * write lock, and bdev_release() will pull in the ->open_mutex on
-	 * the block device and it's dependencies.  Instead just flush the
-	 * device and let the caller do the final bdev_release.
+	 * write lock, and fput() on the block device will pull in the
+	 * ->open_mutex on the block device and it's dependencies.  Instead
+	 *  just flush the device and let the caller do the final bdev_release.
 	 */
 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
 		btrfs_scratch_superblocks(fs_info, device);
@@ -2213,7 +2249,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
 		}
 	}
 
-	*bdev_handle = device->bdev_handle;
+	*bdev_file = device->bdev_file;
 	synchronize_rcu();
 	btrfs_free_device(device);
 
@@ -2349,7 +2385,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
 				 const char *path)
 {
 	struct btrfs_super_block *disk_super;
-	struct bdev_handle *bdev_handle;
+	struct file *bdev_file;
 	int ret;
 
 	if (!path || !path[0])
@@ -2367,7 +2403,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
 	}
 
 	ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0,
-				    &bdev_handle, &disk_super);
+				    &bdev_file, &disk_super);
 	if (ret) {
 		btrfs_put_dev_args_from_path(args);
 		return ret;
@@ -2380,7 +2416,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
 	else
 		memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
 	btrfs_release_disk_super(disk_super);
-	bdev_release(bdev_handle);
+	fput(bdev_file);
 	return 0;
 }
 
@@ -2600,7 +2636,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	struct btrfs_root *root = fs_info->dev_root;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_device *device;
-	struct bdev_handle *bdev_handle;
+	struct file *bdev_file;
 	struct super_block *sb = fs_info->sb;
 	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 	struct btrfs_fs_devices *seed_devices = NULL;
@@ -2613,12 +2649,12 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	if (sb_rdonly(sb) && !fs_devices->seeding)
 		return -EROFS;
 
-	bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE,
+	bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
 					fs_info->bdev_holder, NULL);
-	if (IS_ERR(bdev_handle))
-		return PTR_ERR(bdev_handle);
+	if (IS_ERR(bdev_file))
+		return PTR_ERR(bdev_file);
 
-	if (!btrfs_check_device_zone_type(fs_info, bdev_handle->bdev)) {
+	if (!btrfs_check_device_zone_type(fs_info, file_bdev(bdev_file))) {
 		ret = -EINVAL;
 		goto error;
 	}
@@ -2630,11 +2666,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 		locked = true;
 	}
 
-	sync_blockdev(bdev_handle->bdev);
+	sync_blockdev(file_bdev(bdev_file));
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
-		if (device->bdev == bdev_handle->bdev) {
+		if (device->bdev == file_bdev(bdev_file)) {
 			ret = -EEXIST;
 			rcu_read_unlock();
 			goto error;
@@ -2650,8 +2686,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
 	}
 
 	device->fs_info = fs_info;
-	device->bdev_handle = bdev_handle;
-	device->bdev = bdev_handle->bdev;
+	device->bdev_file = bdev_file;
+	device->bdev = file_bdev(bdev_file);
 	ret = lookup_bdev(device_path, &device->devt);
 	if (ret)
 		goto error_free_device;
@@ -2834,7 +2870,7 @@ error_free_zone:
 error_free_device:
 	btrfs_free_device(device);
 error:
-	bdev_release(bdev_handle);
+	fput(bdev_file);
 	if (locked) {
 		mutex_unlock(&uuid_mutex);
 		up_write(&sb->s_umount);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index feba8d53526c..93854609a4d5 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -105,7 +105,7 @@ struct btrfs_device {
 
 	u64 generation;
 
-	struct bdev_handle *bdev_handle;
+	struct file *bdev_file;
 	struct block_device *bdev;
 
 	struct btrfs_zoned_device_info *zone_info;
@@ -698,7 +698,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
 void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args);
 int btrfs_rm_device(struct btrfs_fs_info *fs_info,
 		    struct btrfs_dev_lookup_args *args,
-		    struct bdev_handle **bdev_handle);
+		    struct file **bdev_file);
 void __exit btrfs_cleanup_fs_uuids(void);
 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
 int btrfs_grow_device(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 459d1af02c3c..4cba80b34387 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -822,11 +822,14 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
 			reset = &zones[1];
 
 		if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
+			unsigned int nofs_flags;
+
 			ASSERT(sb_zone_is_full(reset));
 
+			nofs_flags = memalloc_nofs_save();
 			ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
-					       reset->start, reset->len,
-					       GFP_NOFS);
+					       reset->start, reset->len);
+			memalloc_nofs_restore(nofs_flags);
 			if (ret)
 				return ret;
 
@@ -972,11 +975,14 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
 			 * explicit ZONE_FINISH is not necessary.
 			 */
 			if (zone->wp != zone->start + zone->capacity) {
+				unsigned int nofs_flags;
 				int ret;
 
+				nofs_flags = memalloc_nofs_save();
 				ret = blkdev_zone_mgmt(device->bdev,
 						REQ_OP_ZONE_FINISH, zone->start,
-						zone->len, GFP_NOFS);
+						zone->len);
+				memalloc_nofs_restore(nofs_flags);
 				if (ret)
 					return ret;
 			}
@@ -994,11 +1000,13 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
 
 int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
 {
+	unsigned int nofs_flags;
 	sector_t zone_sectors;
 	sector_t nr_sectors;
 	u8 zone_sectors_shift;
 	u32 sb_zone;
 	u32 nr_zones;
+	int ret;
 
 	zone_sectors = bdev_zone_sectors(bdev);
 	zone_sectors_shift = ilog2(zone_sectors);
@@ -1009,9 +1017,12 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
 	if (sb_zone + 1 >= nr_zones)
 		return -ENOENT;
 
-	return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
-				zone_start_sector(sb_zone, bdev),
-				zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
+	nofs_flags = memalloc_nofs_save();
+	ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
+			       zone_start_sector(sb_zone, bdev),
+			       zone_sectors * BTRFS_NR_SB_LOG_ZONES);
+	memalloc_nofs_restore(nofs_flags);
+	return ret;
 }
 
 /*
@@ -1122,12 +1133,14 @@ static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
 int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
 			    u64 length, u64 *bytes)
 {
+	unsigned int nofs_flags;
 	int ret;
 
 	*bytes = 0;
+	nofs_flags = memalloc_nofs_save();
 	ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
-			       physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
-			       GFP_NOFS);
+			       physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT);
+	memalloc_nofs_restore(nofs_flags);
 	if (ret)
 		return ret;
 
@@ -2239,14 +2252,16 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
 		struct btrfs_device *device = map->stripes[i].dev;
 		const u64 physical = map->stripes[i].physical;
 		struct btrfs_zoned_device_info *zinfo = device->zone_info;
+		unsigned int nofs_flags;
 
 		if (zinfo->max_active_zones == 0)
 			continue;
 
+		nofs_flags = memalloc_nofs_save();
 		ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
 				       physical >> SECTOR_SHIFT,
-				       zinfo->zone_size >> SECTOR_SHIFT,
-				       GFP_NOFS);
+				       zinfo->zone_size >> SECTOR_SHIFT);
+		memalloc_nofs_restore(nofs_flags);
 
 		if (ret) {
 			up_read(&dev_replace->rwsem);
diff --git a/fs/buffer.c b/fs/buffer.c
index d3bcf601d3e5..4f73d23c2c46 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -55,7 +55,7 @@
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
-			  struct writeback_control *wbc);
+			  enum rw_hint hint, struct writeback_control *wbc);
 
 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
 
@@ -464,7 +464,7 @@ EXPORT_SYMBOL(mark_buffer_async_write);
  * a successful fsync().  For example, ext2 indirect blocks need to be
  * written back and waited upon before fsync() returns.
  *
- * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
+ * The functions mark_buffer_dirty_inode(), fsync_inode_buffers(),
  * inode_has_buffers() and invalidate_inode_buffers() are provided for the
  * management of a list of dependent buffers at ->i_mapping->i_private_list.
  *
@@ -1889,7 +1889,8 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio,
 	do {
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
-			submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc);
+			submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
+				      inode->i_write_hint, wbc);
 			nr_underway++;
 		}
 		bh = next;
@@ -1944,7 +1945,8 @@ recover:
 		struct buffer_head *next = bh->b_this_page;
 		if (buffer_async_write(bh)) {
 			clear_buffer_dirty(bh);
-			submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc);
+			submit_bh_wbc(REQ_OP_WRITE | write_flags, bh,
+				      inode->i_write_hint, wbc);
 			nr_underway++;
 		}
 		bh = next;
@@ -2756,6 +2758,7 @@ static void end_bio_bh_io_sync(struct bio *bio)
 }
 
 static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
+			  enum rw_hint write_hint,
 			  struct writeback_control *wbc)
 {
 	const enum req_op op = opf & REQ_OP_MASK;
@@ -2783,6 +2786,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
 	fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
 
 	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+	bio->bi_write_hint = write_hint;
 
 	__bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
 
@@ -2802,7 +2806,7 @@ static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
 
 void submit_bh(blk_opf_t opf, struct buffer_head *bh)
 {
-	submit_bh_wbc(opf, bh, NULL);
+	submit_bh_wbc(opf, bh, WRITE_LIFE_NOT_SET, NULL);
 }
 EXPORT_SYMBOL(submit_bh);
 
@@ -3121,12 +3125,8 @@ void __init buffer_init(void)
 	unsigned long nrpages;
 	int ret;
 
-	bh_cachep = kmem_cache_create("buffer_head",
-			sizeof(struct buffer_head), 0,
-				(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
-				SLAB_MEM_SPREAD),
-				NULL);
-
+	bh_cachep = KMEM_CACHE(buffer_head,
+				SLAB_RECLAIM_ACCOUNT|SLAB_PANIC);
 	/*
 	 * Limit the bh occupancy to 10% of ZONE_NORMAL
 	 */
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 1340d77124ae..ee9caf7916fb 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -795,8 +795,10 @@ static int ceph_writepage(struct page *page, struct writeback_control *wbc)
 	ihold(inode);
 
 	if (wbc->sync_mode == WB_SYNC_NONE &&
-	    ceph_inode_to_fs_client(inode)->write_congested)
+	    ceph_inode_to_fs_client(inode)->write_congested) {
+		redirty_page_for_writepage(wbc, page);
 		return AOP_WRITEPAGE_ACTIVATE;
+	}
 
 	wait_on_page_fscache(page);
 
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 7fb4aae97412..c4941ba245ac 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -4634,6 +4634,14 @@ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
 			iput(inode);
 			spin_lock(&mdsc->cap_delay_lock);
 		}
+
+		/*
+		 * Make sure too many dirty caps or general
+		 * slowness doesn't block mdsc delayed work,
+		 * preventing send_renew_caps() from running.
+		 */
+		if (jiffies - loop_start >= 5 * HZ)
+			break;
 	}
 	spin_unlock(&mdsc->cap_delay_lock);
 	doutc(cl, "done\n");
@@ -4775,13 +4783,13 @@ int ceph_drop_caps_for_unlink(struct inode *inode)
 
 			doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode,
 			      ceph_vinop(inode));
-			spin_lock(&mdsc->cap_unlink_delay_lock);
+			spin_lock(&mdsc->cap_delay_lock);
 			ci->i_ceph_flags |= CEPH_I_FLUSH;
 			if (!list_empty(&ci->i_cap_delay_list))
 				list_del_init(&ci->i_cap_delay_list);
 			list_add_tail(&ci->i_cap_delay_list,
 				      &mdsc->cap_unlink_delay_list);
-			spin_unlock(&mdsc->cap_unlink_delay_lock);
+			spin_unlock(&mdsc->cap_delay_lock);
 
 			/*
 			 * Fire the work immediately, because the MDS maybe
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index abe8028d95bf..16873d07692f 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1138,7 +1138,12 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
 		}
 
 		idx = 0;
-		left = ret > 0 ? ret : 0;
+		if (ret <= 0)
+			left = 0;
+		else if (off + ret > i_size)
+			left = i_size - off;
+		else
+			left = ret;
 		while (left > 0) {
 			size_t plen, copied;
 
@@ -1167,15 +1172,13 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
 	}
 
 	if (ret > 0) {
-		if (off > *ki_pos) {
-			if (off >= i_size) {
-				*retry_op = CHECK_EOF;
-				ret = i_size - *ki_pos;
-				*ki_pos = i_size;
-			} else {
-				ret = off - *ki_pos;
-				*ki_pos = off;
-			}
+		if (off >= i_size) {
+			*retry_op = CHECK_EOF;
+			ret = i_size - *ki_pos;
+			*ki_pos = i_size;
+		} else {
+			ret = off - *ki_pos;
+			*ki_pos = off;
 		}
 
 		if (last_objver)
@@ -2126,14 +2129,16 @@ again:
 		int statret;
 		struct page *page = NULL;
 		loff_t i_size;
+		int mask = CEPH_STAT_CAP_SIZE;
 		if (retry_op == READ_INLINE) {
 			page = __page_cache_alloc(GFP_KERNEL);
 			if (!page)
 				return -ENOMEM;
+
+			mask = CEPH_STAT_CAP_INLINE_DATA;
 		}
 
-		statret = __ceph_do_getattr(inode, page,
-					    CEPH_STAT_CAP_INLINE_DATA, !!page);
+		statret = __ceph_do_getattr(inode, page, mask, !!page);
 		if (statret < 0) {
 			if (page)
 				__free_page(page);
@@ -2174,7 +2179,7 @@ again:
 		/* hit EOF or hole? */
 		if (retry_op == CHECK_EOF && iocb->ki_pos < i_size &&
 		    ret < len) {
-			doutc(cl, "hit hole, ppos %lld < size %lld, reading more\n",
+			doutc(cl, "may hit hole, ppos %lld < size %lld, reading more\n",
 			      iocb->ki_pos, i_size);
 
 			read += ret;
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index e07ad29ff8b9..ebf4ac0055dd 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -33,7 +33,7 @@ void __init ceph_flock_init(void)
 
 static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
 {
-	struct inode *inode = file_inode(dst->fl_file);
+	struct inode *inode = file_inode(dst->c.flc_file);
 	atomic_inc(&ceph_inode(inode)->i_filelock_ref);
 	dst->fl_u.ceph.inode = igrab(inode);
 }
@@ -110,17 +110,18 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
 	else
 		length = fl->fl_end - fl->fl_start + 1;
 
-	owner = secure_addr(fl->fl_owner);
+	owner = secure_addr(fl->c.flc_owner);
 
 	doutc(cl, "rule: %d, op: %d, owner: %llx, pid: %llu, "
 		    "start: %llu, length: %llu, wait: %d, type: %d\n",
-		    (int)lock_type, (int)operation, owner, (u64)fl->fl_pid,
-		    fl->fl_start, length, wait, fl->fl_type);
+		    (int)lock_type, (int)operation, owner,
+		    (u64) fl->c.flc_pid,
+		    fl->fl_start, length, wait, fl->c.flc_type);
 
 	req->r_args.filelock_change.rule = lock_type;
 	req->r_args.filelock_change.type = cmd;
 	req->r_args.filelock_change.owner = cpu_to_le64(owner);
-	req->r_args.filelock_change.pid = cpu_to_le64((u64)fl->fl_pid);
+	req->r_args.filelock_change.pid = cpu_to_le64((u64) fl->c.flc_pid);
 	req->r_args.filelock_change.start = cpu_to_le64(fl->fl_start);
 	req->r_args.filelock_change.length = cpu_to_le64(length);
 	req->r_args.filelock_change.wait = wait;
@@ -130,13 +131,13 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
 		err = ceph_mdsc_wait_request(mdsc, req, wait ?
 					ceph_lock_wait_for_completion : NULL);
 	if (!err && operation == CEPH_MDS_OP_GETFILELOCK) {
-		fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
+		fl->c.flc_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
 		if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
-			fl->fl_type = F_RDLCK;
+			fl->c.flc_type = F_RDLCK;
 		else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type)
-			fl->fl_type = F_WRLCK;
+			fl->c.flc_type = F_WRLCK;
 		else
-			fl->fl_type = F_UNLCK;
+			fl->c.flc_type = F_UNLCK;
 
 		fl->fl_start = le64_to_cpu(req->r_reply_info.filelock_reply->start);
 		length = le64_to_cpu(req->r_reply_info.filelock_reply->start) +
@@ -150,8 +151,8 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
 	ceph_mdsc_put_request(req);
 	doutc(cl, "rule: %d, op: %d, pid: %llu, start: %llu, "
 	      "length: %llu, wait: %d, type: %d, err code %d\n",
-	      (int)lock_type, (int)operation, (u64)fl->fl_pid,
-	      fl->fl_start, length, wait, fl->fl_type, err);
+	      (int)lock_type, (int)operation, (u64) fl->c.flc_pid,
+	      fl->fl_start, length, wait, fl->c.flc_type, err);
 	return err;
 }
 
@@ -227,10 +228,10 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
 static int try_unlock_file(struct file *file, struct file_lock *fl)
 {
 	int err;
-	unsigned int orig_flags = fl->fl_flags;
-	fl->fl_flags |= FL_EXISTS;
+	unsigned int orig_flags = fl->c.flc_flags;
+	fl->c.flc_flags |= FL_EXISTS;
 	err = locks_lock_file_wait(file, fl);
-	fl->fl_flags = orig_flags;
+	fl->c.flc_flags = orig_flags;
 	if (err == -ENOENT) {
 		if (!(orig_flags & FL_EXISTS))
 			err = 0;
@@ -253,13 +254,13 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
 	u8 wait = 0;
 	u8 lock_cmd;
 
-	if (!(fl->fl_flags & FL_POSIX))
+	if (!(fl->c.flc_flags & FL_POSIX))
 		return -ENOLCK;
 
 	if (ceph_inode_is_shutdown(inode))
 		return -ESTALE;
 
-	doutc(cl, "fl_owner: %p\n", fl->fl_owner);
+	doutc(cl, "fl_owner: %p\n", fl->c.flc_owner);
 
 	/* set wait bit as appropriate, then make command as Ceph expects it*/
 	if (IS_GETLK(cmd))
@@ -273,19 +274,19 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
 	}
 	spin_unlock(&ci->i_ceph_lock);
 	if (err < 0) {
-		if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type)
+		if (op == CEPH_MDS_OP_SETFILELOCK && lock_is_unlock(fl))
 			posix_lock_file(file, fl, NULL);
 		return err;
 	}
 
-	if (F_RDLCK == fl->fl_type)
+	if (lock_is_read(fl))
 		lock_cmd = CEPH_LOCK_SHARED;
-	else if (F_WRLCK == fl->fl_type)
+	else if (lock_is_write(fl))
 		lock_cmd = CEPH_LOCK_EXCL;
 	else
 		lock_cmd = CEPH_LOCK_UNLOCK;
 
-	if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type) {
+	if (op == CEPH_MDS_OP_SETFILELOCK && lock_is_unlock(fl)) {
 		err = try_unlock_file(file, fl);
 		if (err <= 0)
 			return err;
@@ -293,7 +294,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
 
 	err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl);
 	if (!err) {
-		if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->fl_type) {
+		if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK != fl->c.flc_type) {
 			doutc(cl, "locking locally\n");
 			err = posix_lock_file(file, fl, NULL);
 			if (err) {
@@ -319,13 +320,13 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
 	u8 wait = 0;
 	u8 lock_cmd;
 
-	if (!(fl->fl_flags & FL_FLOCK))
+	if (!(fl->c.flc_flags & FL_FLOCK))
 		return -ENOLCK;
 
 	if (ceph_inode_is_shutdown(inode))
 		return -ESTALE;
 
-	doutc(cl, "fl_file: %p\n", fl->fl_file);
+	doutc(cl, "fl_file: %p\n", fl->c.flc_file);
 
 	spin_lock(&ci->i_ceph_lock);
 	if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) {
@@ -333,7 +334,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
 	}
 	spin_unlock(&ci->i_ceph_lock);
 	if (err < 0) {
-		if (F_UNLCK == fl->fl_type)
+		if (lock_is_unlock(fl))
 			locks_lock_file_wait(file, fl);
 		return err;
 	}
@@ -341,14 +342,14 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
 	if (IS_SETLKW(cmd))
 		wait = 1;
 
-	if (F_RDLCK == fl->fl_type)
+	if (lock_is_read(fl))
 		lock_cmd = CEPH_LOCK_SHARED;
-	else if (F_WRLCK == fl->fl_type)
+	else if (lock_is_write(fl))
 		lock_cmd = CEPH_LOCK_EXCL;
 	else
 		lock_cmd = CEPH_LOCK_UNLOCK;
 
-	if (F_UNLCK == fl->fl_type) {
+	if (lock_is_unlock(fl)) {
 		err = try_unlock_file(file, fl);
 		if (err <= 0)
 			return err;
@@ -356,7 +357,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
 
 	err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
 				inode, lock_cmd, wait, fl);
-	if (!err && F_UNLCK != fl->fl_type) {
+	if (!err && F_UNLCK != fl->c.flc_type) {
 		err = locks_lock_file_wait(file, fl);
 		if (err) {
 			ceph_lock_message(CEPH_LOCK_FLOCK,
@@ -385,9 +386,9 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
 	ctx = locks_inode_context(inode);
 	if (ctx) {
 		spin_lock(&ctx->flc_lock);
-		list_for_each_entry(lock, &ctx->flc_posix, fl_list)
+		for_each_file_lock(lock, &ctx->flc_posix)
 			++(*fcntl_count);
-		list_for_each_entry(lock, &ctx->flc_flock, fl_list)
+		for_each_file_lock(lock, &ctx->flc_flock)
 			++(*flock_count);
 		spin_unlock(&ctx->flc_lock);
 	}
@@ -408,10 +409,10 @@ static int lock_to_ceph_filelock(struct inode *inode,
 	cephlock->start = cpu_to_le64(lock->fl_start);
 	cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
 	cephlock->client = cpu_to_le64(0);
-	cephlock->pid = cpu_to_le64((u64)lock->fl_pid);
-	cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner));
+	cephlock->pid = cpu_to_le64((u64) lock->c.flc_pid);
+	cephlock->owner = cpu_to_le64(secure_addr(lock->c.flc_owner));
 
-	switch (lock->fl_type) {
+	switch (lock->c.flc_type) {
 	case F_RDLCK:
 		cephlock->type = CEPH_LOCK_SHARED;
 		break;
@@ -422,7 +423,8 @@ static int lock_to_ceph_filelock(struct inode *inode,
 		cephlock->type = CEPH_LOCK_UNLOCK;
 		break;
 	default:
-		doutc(cl, "Have unknown lock type %d\n", lock->fl_type);
+		doutc(cl, "Have unknown lock type %d\n",
+		      lock->c.flc_type);
 		err = -EINVAL;
 	}
 
@@ -453,7 +455,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
 		return 0;
 
 	spin_lock(&ctx->flc_lock);
-	list_for_each_entry(lock, &ctx->flc_posix, fl_list) {
+	for_each_file_lock(lock, &ctx->flc_posix) {
 		++seen_fcntl;
 		if (seen_fcntl > num_fcntl_locks) {
 			err = -ENOSPC;
@@ -464,7 +466,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
 			goto fail;
 		++l;
 	}
-	list_for_each_entry(lock, &ctx->flc_flock, fl_list) {
+	for_each_file_lock(lock, &ctx->flc_flock) {
 		++seen_flock;
 		if (seen_flock > num_flock_locks) {
 			err = -ENOSPC;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 3ab9c268a8bb..360b686c3c67 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2504,7 +2504,7 @@ static void ceph_cap_unlink_work(struct work_struct *work)
 	struct ceph_client *cl = mdsc->fsc->client;
 
 	doutc(cl, "begin\n");
-	spin_lock(&mdsc->cap_unlink_delay_lock);
+	spin_lock(&mdsc->cap_delay_lock);
 	while (!list_empty(&mdsc->cap_unlink_delay_list)) {
 		struct ceph_inode_info *ci;
 		struct inode *inode;
@@ -2516,15 +2516,15 @@ static void ceph_cap_unlink_work(struct work_struct *work)
 
 		inode = igrab(&ci->netfs.inode);
 		if (inode) {
-			spin_unlock(&mdsc->cap_unlink_delay_lock);
+			spin_unlock(&mdsc->cap_delay_lock);
 			doutc(cl, "on %p %llx.%llx\n", inode,
 			      ceph_vinop(inode));
 			ceph_check_caps(ci, CHECK_CAPS_FLUSH);
 			iput(inode);
-			spin_lock(&mdsc->cap_unlink_delay_lock);
+			spin_lock(&mdsc->cap_delay_lock);
 		}
 	}
-	spin_unlock(&mdsc->cap_unlink_delay_lock);
+	spin_unlock(&mdsc->cap_delay_lock);
 	doutc(cl, "done\n");
 }
 
@@ -5404,7 +5404,6 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
 	INIT_LIST_HEAD(&mdsc->cap_wait_list);
 	spin_lock_init(&mdsc->cap_delay_lock);
 	INIT_LIST_HEAD(&mdsc->cap_unlink_delay_list);
-	spin_lock_init(&mdsc->cap_unlink_delay_lock);
 	INIT_LIST_HEAD(&mdsc->snap_flush_list);
 	spin_lock_init(&mdsc->snap_flush_lock);
 	mdsc->last_cap_flush_tid = 1;
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 03f8ff00874f..b88e80415224 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -461,9 +461,8 @@ struct ceph_mds_client {
 	struct delayed_work    delayed_work;  /* delayed work */
 	unsigned long    last_renew_caps;  /* last time we renewed our caps */
 	struct list_head cap_delay_list;   /* caps with delayed release */
-	spinlock_t       cap_delay_lock;   /* protects cap_delay_list */
 	struct list_head cap_unlink_delay_list;  /* caps with delayed release for unlink */
-	spinlock_t       cap_unlink_delay_lock;  /* protects cap_unlink_delay_list */
+	spinlock_t       cap_delay_lock;   /* protects cap_delay_list and cap_unlink_delay_list */
 	struct list_head snap_flush_list;  /* cap_snaps ready to flush */
 	spinlock_t       snap_flush_lock;
 
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 5ec102f6b1ac..885cb5d4e771 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -928,36 +928,36 @@ static int __init init_caches(void)
 	ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
 				      sizeof(struct ceph_inode_info),
 				      __alignof__(struct ceph_inode_info),
-				      SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
-				      SLAB_ACCOUNT, ceph_inode_init_once);
+				      SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
+				      ceph_inode_init_once);
 	if (!ceph_inode_cachep)
 		return -ENOMEM;
 
-	ceph_cap_cachep = KMEM_CACHE(ceph_cap, SLAB_MEM_SPREAD);
+	ceph_cap_cachep = KMEM_CACHE(ceph_cap, 0);
 	if (!ceph_cap_cachep)
 		goto bad_cap;
-	ceph_cap_snap_cachep = KMEM_CACHE(ceph_cap_snap, SLAB_MEM_SPREAD);
+	ceph_cap_snap_cachep = KMEM_CACHE(ceph_cap_snap, 0);
 	if (!ceph_cap_snap_cachep)
 		goto bad_cap_snap;
 	ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush,
-					   SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+					   SLAB_RECLAIM_ACCOUNT);
 	if (!ceph_cap_flush_cachep)
 		goto bad_cap_flush;
 
 	ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
-					SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+					SLAB_RECLAIM_ACCOUNT);
 	if (!ceph_dentry_cachep)
 		goto bad_dentry;
 
-	ceph_file_cachep = KMEM_CACHE(ceph_file_info, SLAB_MEM_SPREAD);
+	ceph_file_cachep = KMEM_CACHE(ceph_file_info, 0);
 	if (!ceph_file_cachep)
 		goto bad_file;
 
-	ceph_dir_file_cachep = KMEM_CACHE(ceph_dir_file_info, SLAB_MEM_SPREAD);
+	ceph_dir_file_cachep = KMEM_CACHE(ceph_dir_file_info, 0);
 	if (!ceph_dir_file_cachep)
 		goto bad_dir_file;
 
-	ceph_mds_request_cachep = KMEM_CACHE(ceph_mds_request, SLAB_MEM_SPREAD);
+	ceph_mds_request_cachep = KMEM_CACHE(ceph_mds_request, 0);
 	if (!ceph_mds_request_cachep)
 		goto bad_mds_req;
 
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 0c7c2528791e..6898dc621011 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -24,6 +24,8 @@
 #include <linux/pid_namespace.h>
 #include <linux/uaccess.h>
 #include <linux/fs.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
 #include <linux/vmalloc.h>
 
 #include <linux/coda.h>
@@ -70,8 +72,8 @@ int __init coda_init_inodecache(void)
 {
 	coda_inode_cachep = kmem_cache_create("coda_inode_cache",
 				sizeof(struct coda_inode_info), 0,
-				SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
-				SLAB_ACCOUNT, init_once);
+				SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
+				init_once);
 	if (coda_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
@@ -87,10 +89,10 @@ void coda_destroy_inodecache(void)
 	kmem_cache_destroy(coda_inode_cachep);
 }
 
-static int coda_remount(struct super_block *sb, int *flags, char *data)
+static int coda_reconfigure(struct fs_context *fc)
 {
-	sync_filesystem(sb);
-	*flags |= SB_NOATIME;
+	sync_filesystem(fc->root->d_sb);
+	fc->sb_flags |= SB_NOATIME;
 	return 0;
 }
 
@@ -102,78 +104,102 @@ static const struct super_operations coda_super_operations =
 	.evict_inode	= coda_evict_inode,
 	.put_super	= coda_put_super,
 	.statfs		= coda_statfs,
-	.remount_fs	= coda_remount,
 };
 
-static int get_device_index(struct coda_mount_data *data)
+struct coda_fs_context {
+	int	idx;
+};
+
+enum {
+	Opt_fd,
+};
+
+static const struct fs_parameter_spec coda_param_specs[] = {
+	fsparam_fd	("fd",	Opt_fd),
+	{}
+};
+
+static int coda_parse_fd(struct fs_context *fc, int fd)
 {
+	struct coda_fs_context *ctx = fc->fs_private;
 	struct fd f;
 	struct inode *inode;
 	int idx;
 
-	if (data == NULL) {
-		pr_warn("%s: Bad mount data\n", __func__);
-		return -1;
-	}
-
-	if (data->version != CODA_MOUNT_VERSION) {
-		pr_warn("%s: Bad mount version\n", __func__);
-		return -1;
-	}
-
-	f = fdget(data->fd);
+	f = fdget(fd);
 	if (!f.file)
-		goto Ebadf;
+		return -EBADF;
 	inode = file_inode(f.file);
 	if (!S_ISCHR(inode->i_mode) || imajor(inode) != CODA_PSDEV_MAJOR) {
 		fdput(f);
-		goto Ebadf;
+		return invalf(fc, "code: Not coda psdev");
 	}
 
 	idx = iminor(inode);
 	fdput(f);
 
-	if (idx < 0 || idx >= MAX_CODADEVS) {
-		pr_warn("%s: Bad minor number\n", __func__);
-		return -1;
+	if (idx < 0 || idx >= MAX_CODADEVS)
+		return invalf(fc, "coda: Bad minor number");
+	ctx->idx = idx;
+	return 0;
+}
+
+static int coda_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+	struct fs_parse_result result;
+	int opt;
+
+	opt = fs_parse(fc, coda_param_specs, param, &result);
+	if (opt < 0)
+		return opt;
+
+	switch (opt) {
+	case Opt_fd:
+		return coda_parse_fd(fc, result.uint_32);
 	}
 
-	return idx;
-Ebadf:
-	pr_warn("%s: Bad file\n", __func__);
-	return -1;
+	return 0;
+}
+
+/*
+ * Parse coda's binary mount data form.  We ignore any errors and go with index
+ * 0 if we get one for backward compatibility.
+ */
+static int coda_parse_monolithic(struct fs_context *fc, void *_data)
+{
+	struct coda_mount_data *data = _data;
+
+	if (!data)
+		return invalf(fc, "coda: Bad mount data");
+
+	if (data->version != CODA_MOUNT_VERSION)
+		return invalf(fc, "coda: Bad mount version");
+
+	coda_parse_fd(fc, data->fd);
+	return 0;
 }
 
-static int coda_fill_super(struct super_block *sb, void *data, int silent)
+static int coda_fill_super(struct super_block *sb, struct fs_context *fc)
 {
+	struct coda_fs_context *ctx = fc->fs_private;
 	struct inode *root = NULL;
 	struct venus_comm *vc;
 	struct CodaFid fid;
 	int error;
-	int idx;
-
-	if (task_active_pid_ns(current) != &init_pid_ns)
-		return -EINVAL;
-
-	idx = get_device_index((struct coda_mount_data *) data);
 
-	/* Ignore errors in data, for backward compatibility */
-	if(idx == -1)
-		idx = 0;
-	
-	pr_info("%s: device index: %i\n", __func__,  idx);
+	infof(fc, "coda: device index: %i\n", ctx->idx);
 
-	vc = &coda_comms[idx];
+	vc = &coda_comms[ctx->idx];
 	mutex_lock(&vc->vc_mutex);
 
 	if (!vc->vc_inuse) {
-		pr_warn("%s: No pseudo device\n", __func__);
+		errorf(fc, "coda: No pseudo device");
 		error = -EINVAL;
 		goto unlock_out;
 	}
 
 	if (vc->vc_sb) {
-		pr_warn("%s: Device already mounted\n", __func__);
+		errorf(fc, "coda: Device already mounted");
 		error = -EBUSY;
 		goto unlock_out;
 	}
@@ -313,18 +339,45 @@ static int coda_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0; 
 }
 
-/* init_coda: used by filesystems.c to register coda */
+static int coda_get_tree(struct fs_context *fc)
+{
+	if (task_active_pid_ns(current) != &init_pid_ns)
+		return -EINVAL;
 
-static struct dentry *coda_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
+	return get_tree_nodev(fc, coda_fill_super);
+}
+
+static void coda_free_fc(struct fs_context *fc)
 {
-	return mount_nodev(fs_type, flags, data, coda_fill_super);
+	kfree(fc->fs_private);
+}
+
+static const struct fs_context_operations coda_context_ops = {
+	.free		= coda_free_fc,
+	.parse_param	= coda_parse_param,
+	.parse_monolithic = coda_parse_monolithic,
+	.get_tree	= coda_get_tree,
+	.reconfigure	= coda_reconfigure,
+};
+
+static int coda_init_fs_context(struct fs_context *fc)
+{
+	struct coda_fs_context *ctx;
+
+	ctx = kzalloc(sizeof(struct coda_fs_context), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+
+	fc->fs_private = ctx;
+	fc->ops = &coda_context_ops;
+	return 0;
 }
 
 struct file_system_type coda_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "coda",
-	.mount		= coda_mount,
+	.init_fs_context = coda_init_fs_context,
+	.parameters	= coda_param_specs,
 	.kill_sb	= kill_anon_super,
 	.fs_flags	= FS_BINARY_MOUNTDATA,
 };
diff --git a/fs/coredump.c b/fs/coredump.c
index f258c17c1841..be6403b4b14b 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -872,6 +872,9 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page)
 	loff_t pos;
 	ssize_t n;
 
+	if (!page)
+		return 0;
+
 	if (cprm->to_skip) {
 		if (!__dump_skip(cprm, cprm->to_skip))
 			return 0;
@@ -884,7 +887,6 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page)
 	pos = file->f_pos;
 	bvec_set_page(&bvec, page, PAGE_SIZE, 0);
 	iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE);
-	iov_iter_set_copy_mc(&iter);
 	n = __kernel_write_iter(cprm->file, &iter, &pos);
 	if (n != PAGE_SIZE)
 		return 0;
@@ -895,10 +897,44 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page)
 	return 1;
 }
 
+/*
+ * If we might get machine checks from kernel accesses during the
+ * core dump, let's get those errors early rather than during the
+ * IO. This is not performance-critical enough to warrant having
+ * all the machine check logic in the iovec paths.
+ */
+#ifdef copy_mc_to_kernel
+
+#define dump_page_alloc() alloc_page(GFP_KERNEL)
+#define dump_page_free(x) __free_page(x)
+static struct page *dump_page_copy(struct page *src, struct page *dst)
+{
+	void *buf = kmap_local_page(src);
+	size_t left = copy_mc_to_kernel(page_address(dst), buf, PAGE_SIZE);
+	kunmap_local(buf);
+	return left ? NULL : dst;
+}
+
+#else
+
+/* We just want to return non-NULL; it's never used. */
+#define dump_page_alloc() ERR_PTR(-EINVAL)
+#define dump_page_free(x) ((void)(x))
+static inline struct page *dump_page_copy(struct page *src, struct page *dst)
+{
+	return src;
+}
+#endif
+
 int dump_user_range(struct coredump_params *cprm, unsigned long start,
 		    unsigned long len)
 {
 	unsigned long addr;
+	struct page *dump_page;
+
+	dump_page = dump_page_alloc();
+	if (!dump_page)
+		return 0;
 
 	for (addr = start; addr < start + len; addr += PAGE_SIZE) {
 		struct page *page;
@@ -912,14 +948,17 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
 		 */
 		page = get_dump_page(addr);
 		if (page) {
-			int stop = !dump_emit_page(cprm, page);
+			int stop = !dump_emit_page(cprm, dump_page_copy(page, dump_page));
 			put_page(page);
-			if (stop)
+			if (stop) {
+				dump_page_free(dump_page);
 				return 0;
+			}
 		} else {
 			dump_skip(cprm, PAGE_SIZE);
 		}
 	}
+	dump_page_free(dump_page);
 	return 1;
 }
 #endif
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 60dbfa0f8805..9901057a15ba 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -495,7 +495,7 @@ static void cramfs_kill_sb(struct super_block *sb)
 		sb->s_mtd = NULL;
 	} else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) {
 		sync_blockdev(sb->s_bdev);
-		bdev_release(sb->s_bdev_handle);
+		bdev_fput(sb->s_bdev_file);
 	}
 	kfree(sbi);
 }
diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c
index 7b3fc189593a..0ad52fbe51c9 100644
--- a/fs/crypto/fname.c
+++ b/fs/crypto/fname.c
@@ -74,13 +74,7 @@ struct fscrypt_nokey_name {
 
 static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
 {
-	if (str->len == 1 && str->name[0] == '.')
-		return true;
-
-	if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.')
-		return true;
-
-	return false;
+	return is_dot_dotdot(str->name, str->len);
 }
 
 /**
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 1892356cf924..8371e4e1f596 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -222,16 +222,19 @@ struct fscrypt_inode_info {
 	struct fscrypt_prepared_key ci_enc_key;
 
 	/* True if ci_enc_key should be freed when this struct is freed */
-	bool ci_owns_key;
+	u8 ci_owns_key : 1;
 
 #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT
 	/*
 	 * True if this inode will use inline encryption (blk-crypto) instead of
 	 * the traditional filesystem-layer encryption.
 	 */
-	bool ci_inlinecrypt;
+	u8 ci_inlinecrypt : 1;
 #endif
 
+	/* True if ci_dirhash_key is initialized */
+	u8 ci_dirhash_key_initialized : 1;
+
 	/*
 	 * log2 of the data unit size (granularity of contents encryption) of
 	 * this file.  This is computable from ci_policy and ci_inode but is
@@ -242,6 +245,9 @@ struct fscrypt_inode_info {
 	/* Cached value: log2 of number of data units per FS block */
 	u8 ci_data_units_per_block_bits;
 
+	/* Hashed inode number.  Only set for IV_INO_LBLK_32 */
+	u32 ci_hashed_ino;
+
 	/*
 	 * Encryption mode used for this inode.  It corresponds to either the
 	 * contents or filenames encryption mode, depending on the inode type.
@@ -276,16 +282,12 @@ struct fscrypt_inode_info {
 	 * the plaintext filenames -- currently just casefolded directories.
 	 */
 	siphash_key_t ci_dirhash_key;
-	bool ci_dirhash_key_initialized;
 
 	/* The encryption policy used by this inode */
 	union fscrypt_policy ci_policy;
 
 	/* This inode's nonce, copied from the fscrypt_context */
 	u8 ci_nonce[FSCRYPT_FILE_NONCE_SIZE];
-
-	/* Hashed inode number.  Only set for IV_INO_LBLK_32 */
-	u32 ci_hashed_ino;
 };
 
 typedef enum {
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index 52504dd478d3..104771c3d3f6 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -102,11 +102,8 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry,
 	if (err && err != -ENOENT)
 		return err;
 
-	if (fname->is_nokey_name) {
-		spin_lock(&dentry->d_lock);
-		dentry->d_flags |= DCACHE_NOKEY_NAME;
-		spin_unlock(&dentry->d_lock);
-	}
+	fscrypt_prepare_dentry(dentry, fname->is_nokey_name);
+
 	return err;
 }
 EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup);
@@ -131,12 +128,10 @@ EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup);
 int fscrypt_prepare_lookup_partial(struct inode *dir, struct dentry *dentry)
 {
 	int err = fscrypt_get_encryption_info(dir, true);
+	bool is_nokey_name = (!err && !fscrypt_has_encryption_key(dir));
+
+	fscrypt_prepare_dentry(dentry, is_nokey_name);
 
-	if (!err && !fscrypt_has_encryption_key(dir)) {
-		spin_lock(&dentry->d_lock);
-		dentry->d_flags |= DCACHE_NOKEY_NAME;
-		spin_unlock(&dentry->d_lock);
-	}
 	return err;
 }
 EXPORT_SYMBOL_GPL(fscrypt_prepare_lookup_partial);
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index 0edf0b58daa7..6681a71625f0 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -74,8 +74,12 @@ void fscrypt_put_master_key(struct fscrypt_master_key *mk)
 	 * that concurrent keyring lookups can no longer find it.
 	 */
 	WARN_ON_ONCE(refcount_read(&mk->mk_active_refs) != 0);
-	key_put(mk->mk_users);
-	mk->mk_users = NULL;
+	if (mk->mk_users) {
+		/* Clear the keyring so the quota gets released right away. */
+		keyring_clear(mk->mk_users);
+		key_put(mk->mk_users);
+		mk->mk_users = NULL;
+	}
 	call_rcu(&mk->mk_rcu_head, fscrypt_free_master_key);
 }
 
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index d71f7c799e79..b4fe01ea4bd4 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -23,7 +23,7 @@ struct fscrypt_mode fscrypt_modes[] = {
 		.blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_256_XTS,
 	},
 	[FSCRYPT_MODE_AES_256_CTS] = {
-		.friendly_name = "AES-256-CTS-CBC",
+		.friendly_name = "AES-256-CBC-CTS",
 		.cipher_str = "cts(cbc(aes))",
 		.keysize = 32,
 		.security_strength = 32,
@@ -38,7 +38,7 @@ struct fscrypt_mode fscrypt_modes[] = {
 		.blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV,
 	},
 	[FSCRYPT_MODE_AES_128_CTS] = {
-		.friendly_name = "AES-128-CTS-CBC",
+		.friendly_name = "AES-128-CBC-CTS",
 		.cipher_str = "cts(cbc(aes))",
 		.keysize = 16,
 		.security_strength = 16,
@@ -53,7 +53,7 @@ struct fscrypt_mode fscrypt_modes[] = {
 		.blk_crypto_mode = BLK_ENCRYPTION_MODE_SM4_XTS,
 	},
 	[FSCRYPT_MODE_SM4_CTS] = {
-		.friendly_name = "SM4-CTS-CBC",
+		.friendly_name = "SM4-CBC-CTS",
 		.cipher_str = "cts(cbc(sm4))",
 		.keysize = 16,
 		.security_strength = 16,
@@ -687,7 +687,7 @@ int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported)
 /**
  * fscrypt_prepare_new_inode() - prepare to create a new inode in a directory
  * @dir: a possibly-encrypted directory
- * @inode: the new inode.  ->i_mode must be set already.
+ * @inode: the new inode.  ->i_mode and ->i_blkbits must be set already.
  *	   ->i_ino doesn't need to be set yet.
  * @encrypt_ret: (output) set to %true if the new inode will be encrypted
  *
@@ -717,6 +717,9 @@ int fscrypt_prepare_new_inode(struct inode *dir, struct inode *inode,
 	if (IS_ERR(policy))
 		return PTR_ERR(policy);
 
+	if (WARN_ON_ONCE(inode->i_blkbits == 0))
+		return -EINVAL;
+
 	if (WARN_ON_ONCE(inode->i_mode == 0))
 		return -EINVAL;
 
diff --git a/fs/dcache.c b/fs/dcache.c
index 6ebccba33336..71a8e943a0fa 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -3139,7 +3139,7 @@ static void __init dcache_init(void)
 	 * of the dcache.
 	 */
 	dentry_cache = KMEM_CACHE_USERCOPY(dentry,
-		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
+		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_ACCOUNT,
 		d_iname);
 
 	/* Hash may have been set up in dcache_init_early */
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 034a617cb1a5..a40da0065433 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -751,13 +751,28 @@ static void __debugfs_file_removed(struct dentry *dentry)
 	if ((unsigned long)fsd & DEBUGFS_FSDATA_IS_REAL_FOPS_BIT)
 		return;
 
-	/* if we hit zero, just wait for all to finish */
-	if (!refcount_dec_and_test(&fsd->active_users)) {
-		wait_for_completion(&fsd->active_users_drained);
+	/* if this was the last reference, we're done */
+	if (refcount_dec_and_test(&fsd->active_users))
 		return;
-	}
 
-	/* if we didn't hit zero, try to cancel any we can */
+	/*
+	 * If there's still a reference, the code that obtained it can
+	 * be in different states:
+	 *  - The common case of not using cancellations, or already
+	 *    after debugfs_leave_cancellation(), where we just need
+	 *    to wait for debugfs_file_put() which signals the completion;
+	 *  - inside a cancellation section, i.e. between
+	 *    debugfs_enter_cancellation() and debugfs_leave_cancellation(),
+	 *    in which case we need to trigger the ->cancel() function,
+	 *    and then wait for debugfs_file_put() just like in the
+	 *    previous case;
+	 *  - before debugfs_enter_cancellation() (but obviously after
+	 *    debugfs_file_get()), in which case we may not see the
+	 *    cancellation in the list on the first round of the loop,
+	 *    but debugfs_enter_cancellation() signals the completion
+	 *    after adding it, so this code gets woken up to call the
+	 *    ->cancel() function.
+	 */
 	while (refcount_read(&fsd->active_users)) {
 		struct debugfs_cancellation *c;
 
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 60456263a338..62c97ff9e852 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -410,6 +410,8 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio,
 		bio->bi_end_io = dio_bio_end_io;
 	if (dio->is_pinned)
 		bio_set_flag(bio, BIO_PAGE_PINNED);
+	bio->bi_write_hint = file_inode(dio->iocb->ki_filp)->i_write_hint;
+
 	sdio->bio = bio;
 	sdio->logical_offset_in_bio = sdio->cur_page_fs_offset;
 }
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index dfc444dad329..3b4dbce849f0 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -246,7 +246,7 @@ struct dlm_lkb {
 	int8_t			lkb_highbast;	/* highest mode bast sent for */
 
 	int8_t			lkb_wait_type;	/* type of reply waiting for */
-	atomic_t		lkb_wait_count;
+	int8_t			lkb_wait_count;
 	int			lkb_wait_nodeid; /* for debugging */
 
 	struct list_head	lkb_statequeue;	/* rsb g/c/w list */
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 652c51fbbf76..fd752dd03896 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1407,7 +1407,6 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid)
 {
 	struct dlm_ls *ls = lkb->lkb_resource->res_ls;
 	int error = 0;
-	int wc;
 
 	mutex_lock(&ls->ls_waiters_mutex);
 
@@ -1429,17 +1428,20 @@ static int add_to_waiters(struct dlm_lkb *lkb, int mstype, int to_nodeid)
 			error = -EBUSY;
 			goto out;
 		}
-		wc = atomic_inc_return(&lkb->lkb_wait_count);
+		lkb->lkb_wait_count++;
 		hold_lkb(lkb);
 
 		log_debug(ls, "addwait %x cur %d overlap %d count %d f %x",
-			  lkb->lkb_id, lkb->lkb_wait_type, mstype, wc,
-			  dlm_iflags_val(lkb));
+			  lkb->lkb_id, lkb->lkb_wait_type, mstype,
+			  lkb->lkb_wait_count, dlm_iflags_val(lkb));
 		goto out;
 	}
 
-	wc = atomic_fetch_inc(&lkb->lkb_wait_count);
-	DLM_ASSERT(!wc, dlm_print_lkb(lkb); printk("wait_count %d\n", wc););
+	DLM_ASSERT(!lkb->lkb_wait_count,
+		   dlm_print_lkb(lkb);
+		   printk("wait_count %d\n", lkb->lkb_wait_count););
+
+	lkb->lkb_wait_count++;
 	lkb->lkb_wait_type = mstype;
 	lkb->lkb_wait_nodeid = to_nodeid; /* for debugging */
 	hold_lkb(lkb);
@@ -1502,7 +1504,7 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype,
 		log_debug(ls, "remwait %x convert_reply zap overlap_cancel",
 			  lkb->lkb_id);
 		lkb->lkb_wait_type = 0;
-		atomic_dec(&lkb->lkb_wait_count);
+		lkb->lkb_wait_count--;
 		unhold_lkb(lkb);
 		goto out_del;
 	}
@@ -1529,15 +1531,16 @@ static int _remove_from_waiters(struct dlm_lkb *lkb, int mstype,
 	if (overlap_done && lkb->lkb_wait_type) {
 		log_error(ls, "remwait error %x reply %d wait_type %d overlap",
 			  lkb->lkb_id, mstype, lkb->lkb_wait_type);
-		atomic_dec(&lkb->lkb_wait_count);
+		lkb->lkb_wait_count--;
 		unhold_lkb(lkb);
 		lkb->lkb_wait_type = 0;
 	}
 
-	DLM_ASSERT(atomic_read(&lkb->lkb_wait_count), dlm_print_lkb(lkb););
+	DLM_ASSERT(lkb->lkb_wait_count, dlm_print_lkb(lkb););
 
 	clear_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags);
-	if (atomic_dec_and_test(&lkb->lkb_wait_count))
+	lkb->lkb_wait_count--;
+	if (!lkb->lkb_wait_count)
 		list_del_init(&lkb->lkb_wait_reply);
 	unhold_lkb(lkb);
 	return 0;
@@ -2666,7 +2669,7 @@ static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
 			goto out;
 
 		/* lock not allowed if there's any op in progress */
-		if (lkb->lkb_wait_type || atomic_read(&lkb->lkb_wait_count))
+		if (lkb->lkb_wait_type || lkb->lkb_wait_count)
 			goto out;
 
 		if (is_overlap(lkb))
@@ -2728,7 +2731,7 @@ static int validate_unlock_args(struct dlm_lkb *lkb, struct dlm_args *args)
 
 	/* normal unlock not allowed if there's any op in progress */
 	if (!(args->flags & (DLM_LKF_CANCEL | DLM_LKF_FORCEUNLOCK)) &&
-	    (lkb->lkb_wait_type || atomic_read(&lkb->lkb_wait_count)))
+	    (lkb->lkb_wait_type || lkb->lkb_wait_count))
 		goto out;
 
 	/* an lkb may be waiting for an rsb lookup to complete where the
@@ -5011,21 +5014,32 @@ static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls)
 	return lkb;
 }
 
-/* Deal with lookups and lkb's marked RESEND from _pre.  We may now be the
-   master or dir-node for r.  Processing the lkb may result in it being placed
-   back on waiters. */
-
-/* We do this after normal locking has been enabled and any saved messages
-   (in requestqueue) have been processed.  We should be confident that at
-   this point we won't get or process a reply to any of these waiting
-   operations.  But, new ops may be coming in on the rsbs/locks here from
-   userspace or remotely. */
-
-/* there may have been an overlap unlock/cancel prior to recovery or after
-   recovery.  if before, the lkb may still have a pos wait_count; if after, the
-   overlap flag would just have been set and nothing new sent.  we can be
-   confident here than any replies to either the initial op or overlap ops
-   prior to recovery have been received. */
+/*
+ * Forced state reset for locks that were in the middle of remote operations
+ * when recovery happened (i.e. lkbs that were on the waiters list, waiting
+ * for a reply from a remote operation.)  The lkbs remaining on the waiters
+ * list need to be reevaluated; some may need resending to a different node
+ * than previously, and some may now need local handling rather than remote.
+ *
+ * First, the lkb state for the voided remote operation is forcibly reset,
+ * equivalent to what remove_from_waiters() would normally do:
+ * . lkb removed from ls_waiters list
+ * . lkb wait_type cleared
+ * . lkb waiters_count cleared
+ * . lkb ref count decremented for each waiters_count (almost always 1,
+ *   but possibly 2 in case of cancel/unlock overlapping, which means
+ *   two remote replies were being expected for the lkb.)
+ *
+ * Second, the lkb is reprocessed like an original operation would be,
+ * by passing it to _request_lock or _convert_lock, which will either
+ * process the lkb operation locally, or send it to a remote node again
+ * and put the lkb back onto the waiters list.
+ *
+ * When reprocessing the lkb, we may find that it's flagged for an overlapping
+ * force-unlock or cancel, either from before recovery began, or after recovery
+ * finished.  If this is the case, the unlock/cancel is done directly, and the
+ * original operation is not initiated again (no _request_lock/_convert_lock.)
+ */
 
 int dlm_recover_waiters_post(struct dlm_ls *ls)
 {
@@ -5040,6 +5054,11 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
 			break;
 		}
 
+		/* 
+		 * Find an lkb from the waiters list that's been affected by
+		 * recovery node changes, and needs to be reprocessed.  Does
+		 * hold_lkb(), adding a refcount.
+		 */
 		lkb = find_resend_waiter(ls);
 		if (!lkb)
 			break;
@@ -5048,6 +5067,11 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
 		hold_rsb(r);
 		lock_rsb(r);
 
+		/*
+		 * If the lkb has been flagged for a force unlock or cancel,
+		 * then the reprocessing below will be replaced by just doing
+		 * the unlock/cancel directly.
+		 */
 		mstype = lkb->lkb_wait_type;
 		oc = test_and_clear_bit(DLM_IFL_OVERLAP_CANCEL_BIT,
 					&lkb->lkb_iflags);
@@ -5061,22 +5085,40 @@ int dlm_recover_waiters_post(struct dlm_ls *ls)
 			  r->res_nodeid, lkb->lkb_nodeid, lkb->lkb_wait_nodeid,
 			  dlm_dir_nodeid(r), oc, ou);
 
-		/* At this point we assume that we won't get a reply to any
-		   previous op or overlap op on this lock.  First, do a big
-		   remove_from_waiters() for all previous ops. */
+		/*
+		 * No reply to the pre-recovery operation will now be received,
+		 * so a forced equivalent of remove_from_waiters() is needed to
+		 * reset the waiters state that was in place before recovery.
+		 */
 
 		clear_bit(DLM_IFL_RESEND_BIT, &lkb->lkb_iflags);
+
+		/* Forcibly clear wait_type */
 		lkb->lkb_wait_type = 0;
-		/* drop all wait_count references we still
-		 * hold a reference for this iteration.
+
+		/*
+		 * Forcibly reset wait_count and associated refcount.  The
+		 * wait_count will almost always be 1, but in case of an
+		 * overlapping unlock/cancel it could be 2: see where
+		 * add_to_waiters() finds the lkb is already on the waiters
+		 * list and does lkb_wait_count++; hold_lkb().
 		 */
-		while (!atomic_dec_and_test(&lkb->lkb_wait_count))
+		while (lkb->lkb_wait_count) {
+			lkb->lkb_wait_count--;
 			unhold_lkb(lkb);
+		}
 
+		/* Forcibly remove from waiters list */
 		mutex_lock(&ls->ls_waiters_mutex);
 		list_del_init(&lkb->lkb_wait_reply);
 		mutex_unlock(&ls->ls_waiters_mutex);
 
+		/*
+		 * The lkb is now clear of all prior waiters state and can be
+		 * processed locally, or sent to remote node again, or directly
+		 * cancelled/unlocked.
+		 */
+
 		if (oc || ou) {
 			/* do an unlock or cancel instead of resending */
 			switch (mstype) {
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index d814c5121367..9ca83ef70ed1 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -138,14 +138,14 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
 	}
 
 	op->info.optype		= DLM_PLOCK_OP_LOCK;
-	op->info.pid		= fl->fl_pid;
-	op->info.ex		= (fl->fl_type == F_WRLCK);
-	op->info.wait		= !!(fl->fl_flags & FL_SLEEP);
+	op->info.pid		= fl->c.flc_pid;
+	op->info.ex		= lock_is_write(fl);
+	op->info.wait		= !!(fl->c.flc_flags & FL_SLEEP);
 	op->info.fsid		= ls->ls_global_id;
 	op->info.number		= number;
 	op->info.start		= fl->fl_start;
 	op->info.end		= fl->fl_end;
-	op->info.owner = (__u64)(long)fl->fl_owner;
+	op->info.owner = (__u64)(long) fl->c.flc_owner;
 	/* async handling */
 	if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
 		op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
@@ -258,7 +258,7 @@ static int dlm_plock_callback(struct plock_op *op)
 	}
 
 	/* got fs lock; bookkeep locally as well: */
-	flc->fl_flags &= ~FL_SLEEP;
+	flc->c.flc_flags &= ~FL_SLEEP;
 	if (posix_lock_file(file, flc, NULL)) {
 		/*
 		 * This can only happen in the case of kmalloc() failure.
@@ -291,7 +291,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
 	struct dlm_ls *ls;
 	struct plock_op *op;
 	int rv;
-	unsigned char fl_flags = fl->fl_flags;
+	unsigned char saved_flags = fl->c.flc_flags;
 
 	ls = dlm_find_lockspace_local(lockspace);
 	if (!ls)
@@ -304,7 +304,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
 	}
 
 	/* cause the vfs unlock to return ENOENT if lock is not found */
-	fl->fl_flags |= FL_EXISTS;
+	fl->c.flc_flags |= FL_EXISTS;
 
 	rv = locks_lock_file_wait(file, fl);
 	if (rv == -ENOENT) {
@@ -317,14 +317,14 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
 	}
 
 	op->info.optype		= DLM_PLOCK_OP_UNLOCK;
-	op->info.pid		= fl->fl_pid;
+	op->info.pid		= fl->c.flc_pid;
 	op->info.fsid		= ls->ls_global_id;
 	op->info.number		= number;
 	op->info.start		= fl->fl_start;
 	op->info.end		= fl->fl_end;
-	op->info.owner = (__u64)(long)fl->fl_owner;
+	op->info.owner = (__u64)(long) fl->c.flc_owner;
 
-	if (fl->fl_flags & FL_CLOSE) {
+	if (fl->c.flc_flags & FL_CLOSE) {
 		op->info.flags |= DLM_PLOCK_FL_CLOSE;
 		send_op(op);
 		rv = 0;
@@ -345,7 +345,7 @@ out_free:
 	dlm_release_plock_op(op);
 out:
 	dlm_put_lockspace(ls);
-	fl->fl_flags = fl_flags;
+	fl->c.flc_flags = saved_flags;
 	return rv;
 }
 EXPORT_SYMBOL_GPL(dlm_posix_unlock);
@@ -375,14 +375,14 @@ int dlm_posix_cancel(dlm_lockspace_t *lockspace, u64 number, struct file *file,
 		return -EINVAL;
 
 	memset(&info, 0, sizeof(info));
-	info.pid = fl->fl_pid;
-	info.ex = (fl->fl_type == F_WRLCK);
+	info.pid = fl->c.flc_pid;
+	info.ex = lock_is_write(fl);
 	info.fsid = ls->ls_global_id;
 	dlm_put_lockspace(ls);
 	info.number = number;
 	info.start = fl->fl_start;
 	info.end = fl->fl_end;
-	info.owner = (__u64)(long)fl->fl_owner;
+	info.owner = (__u64)(long) fl->c.flc_owner;
 
 	rv = do_lock_cancel(&info);
 	switch (rv) {
@@ -437,13 +437,13 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
 	}
 
 	op->info.optype		= DLM_PLOCK_OP_GET;
-	op->info.pid		= fl->fl_pid;
-	op->info.ex		= (fl->fl_type == F_WRLCK);
+	op->info.pid		= fl->c.flc_pid;
+	op->info.ex		= lock_is_write(fl);
 	op->info.fsid		= ls->ls_global_id;
 	op->info.number		= number;
 	op->info.start		= fl->fl_start;
 	op->info.end		= fl->fl_end;
-	op->info.owner = (__u64)(long)fl->fl_owner;
+	op->info.owner = (__u64)(long) fl->c.flc_owner;
 
 	send_op(op);
 	wait_event(recv_wq, (op->done != 0));
@@ -455,16 +455,16 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
 
 	rv = op->info.rv;
 
-	fl->fl_type = F_UNLCK;
+	fl->c.flc_type = F_UNLCK;
 	if (rv == -ENOENT)
 		rv = 0;
 	else if (rv > 0) {
 		locks_init_lock(fl);
-		fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
-		fl->fl_flags = FL_POSIX;
-		fl->fl_pid = op->info.pid;
+		fl->c.flc_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
+		fl->c.flc_flags = FL_POSIX;
+		fl->c.flc_pid = op->info.pid;
 		if (op->info.nodeid != dlm_our_nodeid())
-			fl->fl_pid = -fl->fl_pid;
+			fl->c.flc_pid = -fl->c.flc_pid;
 		fl->fl_start = op->info.start;
 		fl->fl_end = op->info.end;
 		rv = 0;
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 695e691b38b3..9f9b68448830 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -806,7 +806,7 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
 	struct dlm_lkb *lkb;
 	DECLARE_WAITQUEUE(wait, current);
 	struct dlm_callback *cb;
-	int rv, copy_lvb = 0;
+	int rv, ret, copy_lvb = 0;
 	int old_mode, new_mode;
 
 	if (count == sizeof(struct dlm_device_version)) {
@@ -906,9 +906,9 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
 		trace_dlm_ast(lkb->lkb_resource->res_ls, lkb);
 	}
 
-	rv = copy_result_to_user(lkb->lkb_ua,
-				 test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
-				 cb->flags, cb->mode, copy_lvb, buf, count);
+	ret = copy_result_to_user(lkb->lkb_ua,
+				  test_bit(DLM_PROC_FLAGS_COMPAT, &proc->flags),
+				  cb->flags, cb->mode, copy_lvb, buf, count);
 
 	kref_put(&cb->ref, dlm_release_callback);
 
@@ -916,7 +916,7 @@ static ssize_t device_read(struct file *file, char __user *buf, size_t count,
 	if (rv == DLM_DEQUEUE_CALLBACK_LAST)
 		dlm_put_lkb(lkb);
 
-	return rv;
+	return ret;
 }
 
 static __poll_t device_poll(struct file *file, poll_table *wait)
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 03bd55069d86..2fe0f3af1a08 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1949,16 +1949,6 @@ out:
 	return rc;
 }
 
-static bool is_dot_dotdot(const char *name, size_t name_size)
-{
-	if (name_size == 1 && name[0] == '.')
-		return true;
-	else if (name_size == 2 && name[0] == '.' && name[1] == '.')
-		return true;
-
-	return false;
-}
-
 /**
  * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext
  * @plaintext_name: The plaintext name
diff --git a/fs/efs/super.c b/fs/efs/super.c
index f17fdac76b2e..e4421c10caeb 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -14,19 +14,14 @@
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
 #include <linux/blkdev.h>
-
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
 #include "efs.h"
 #include <linux/efs_vh.h>
 #include <linux/efs_fs_sb.h>
 
 static int efs_statfs(struct dentry *dentry, struct kstatfs *buf);
-static int efs_fill_super(struct super_block *s, void *d, int silent);
-
-static struct dentry *efs_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
-{
-	return mount_bdev(fs_type, flags, dev_name, data, efs_fill_super);
-}
+static int efs_init_fs_context(struct fs_context *fc);
 
 static void efs_kill_sb(struct super_block *s)
 {
@@ -35,15 +30,6 @@ static void efs_kill_sb(struct super_block *s)
 	kfree(sbi);
 }
 
-static struct file_system_type efs_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "efs",
-	.mount		= efs_mount,
-	.kill_sb	= efs_kill_sb,
-	.fs_flags	= FS_REQUIRES_DEV,
-};
-MODULE_ALIAS_FS("efs");
-
 static struct pt_types sgi_pt_types[] = {
 	{0x00,		"SGI vh"},
 	{0x01,		"SGI trkrepl"},
@@ -63,6 +49,27 @@ static struct pt_types sgi_pt_types[] = {
 	{0,		NULL}
 };
 
+enum {
+	Opt_explicit_open,
+};
+
+static const struct fs_parameter_spec efs_param_spec[] = {
+	fsparam_flag    ("explicit-open",       Opt_explicit_open),
+	{}
+};
+
+/*
+ * File system definition and registration.
+ */
+static struct file_system_type efs_fs_type = {
+	.owner			= THIS_MODULE,
+	.name			= "efs",
+	.kill_sb		= efs_kill_sb,
+	.fs_flags		= FS_REQUIRES_DEV,
+	.init_fs_context	= efs_init_fs_context,
+	.parameters		= efs_param_spec,
+};
+MODULE_ALIAS_FS("efs");
 
 static struct kmem_cache * efs_inode_cachep;
 
@@ -91,8 +98,8 @@ static int __init init_inodecache(void)
 {
 	efs_inode_cachep = kmem_cache_create("efs_inode_cache",
 				sizeof(struct efs_inode_info), 0,
-				SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
-				SLAB_ACCOUNT, init_once);
+				SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
+				init_once);
 	if (efs_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
@@ -108,18 +115,10 @@ static void destroy_inodecache(void)
 	kmem_cache_destroy(efs_inode_cachep);
 }
 
-static int efs_remount(struct super_block *sb, int *flags, char *data)
-{
-	sync_filesystem(sb);
-	*flags |= SB_RDONLY;
-	return 0;
-}
-
 static const struct super_operations efs_superblock_operations = {
 	.alloc_inode	= efs_alloc_inode,
 	.free_inode	= efs_free_inode,
 	.statfs		= efs_statfs,
-	.remount_fs	= efs_remount,
 };
 
 static const struct export_operations efs_export_ops = {
@@ -249,26 +248,26 @@ static int efs_validate_super(struct efs_sb_info *sb, struct efs_super *super) {
 	return 0;    
 }
 
-static int efs_fill_super(struct super_block *s, void *d, int silent)
+static int efs_fill_super(struct super_block *s, struct fs_context *fc)
 {
 	struct efs_sb_info *sb;
 	struct buffer_head *bh;
 	struct inode *root;
 
- 	sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL);
+	sb = kzalloc(sizeof(struct efs_sb_info), GFP_KERNEL);
 	if (!sb)
 		return -ENOMEM;
 	s->s_fs_info = sb;
 	s->s_time_min = 0;
 	s->s_time_max = U32_MAX;
- 
+
 	s->s_magic		= EFS_SUPER_MAGIC;
 	if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) {
 		pr_err("device does not support %d byte blocks\n",
 			EFS_BLOCKSIZE);
 		return -EINVAL;
 	}
-  
+
 	/* read the vh (volume header) block */
 	bh = sb_bread(s, 0);
 
@@ -294,7 +293,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
 		pr_err("cannot read superblock\n");
 		return -EIO;
 	}
-		
+
 	if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) {
 #ifdef DEBUG
 		pr_warn("invalid superblock at block %u\n",
@@ -328,6 +327,61 @@ static int efs_fill_super(struct super_block *s, void *d, int silent)
 	return 0;
 }
 
+static void efs_free_fc(struct fs_context *fc)
+{
+	kfree(fc->fs_private);
+}
+
+static int efs_get_tree(struct fs_context *fc)
+{
+	return get_tree_bdev(fc, efs_fill_super);
+}
+
+static int efs_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+	int token;
+	struct fs_parse_result result;
+
+	token = fs_parse(fc, efs_param_spec, param, &result);
+	if (token < 0)
+		return token;
+	return 0;
+}
+
+static int efs_reconfigure(struct fs_context *fc)
+{
+	sync_filesystem(fc->root->d_sb);
+
+	return 0;
+}
+
+struct efs_context {
+	unsigned long s_mount_opts;
+};
+
+static const struct fs_context_operations efs_context_opts = {
+	.parse_param	= efs_parse_param,
+	.get_tree	= efs_get_tree,
+	.reconfigure	= efs_reconfigure,
+	.free		= efs_free_fc,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+static int efs_init_fs_context(struct fs_context *fc)
+{
+	struct efs_context *ctx;
+
+	ctx = kzalloc(sizeof(struct efs_context), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+	fc->fs_private = ctx;
+	fc->ops = &efs_context_opts;
+
+	return 0;
+}
+
 static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) {
 	struct super_block *sb = dentry->d_sb;
 	struct efs_sb_info *sbi = SUPER_INFO(sb);
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index 7cc5841577b2..333587ba6183 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -81,13 +81,6 @@ static inline bool z_erofs_put_shortlivedpage(struct page **pagepool,
 	return true;
 }
 
-#define MNGD_MAPPING(sbi)	((sbi)->managed_cache->i_mapping)
-static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
-					 struct page *page)
-{
-	return page->mapping == MNGD_MAPPING(sbi);
-}
-
 int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf,
 			 unsigned int padbufsize);
 extern const struct z_erofs_decompressor erofs_decompressors[];
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index c98aeda8abb2..52524bd9698b 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -220,7 +220,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
 			up_read(&devs->rwsem);
 			return 0;
 		}
-		map->m_bdev = dif->bdev_handle ? dif->bdev_handle->bdev : NULL;
+		map->m_bdev = dif->bdev_file ? file_bdev(dif->bdev_file) : NULL;
 		map->m_daxdev = dif->dax_dev;
 		map->m_dax_part_off = dif->dax_part_off;
 		map->m_fscache = dif->fscache;
@@ -238,8 +238,8 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
 			if (map->m_pa >= startoff &&
 			    map->m_pa < startoff + length) {
 				map->m_pa -= startoff;
-				map->m_bdev = dif->bdev_handle ?
-					      dif->bdev_handle->bdev : NULL;
+				map->m_bdev = dif->bdev_file ?
+					      file_bdev(dif->bdev_file) : NULL;
 				map->m_daxdev = dif->dax_dev;
 				map->m_dax_part_off = dif->dax_part_off;
 				map->m_fscache = dif->fscache;
@@ -447,5 +447,6 @@ const struct file_operations erofs_file_fops = {
 	.llseek		= generic_file_llseek,
 	.read_iter	= erofs_file_read_iter,
 	.mmap		= erofs_file_mmap,
+	.get_unmapped_area = thp_get_unmapped_area,
 	.splice_read	= filemap_splice_read,
 };
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index d4cee95af14c..2ec9b2bb628d 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -323,7 +323,8 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
 	unsigned int cur = 0, ni = 0, no, pi, po, insz, cnt;
 	u8 *kin;
 
-	DBG_BUGON(rq->outputsize > rq->inputsize);
+	if (rq->outputsize > rq->inputsize)
+		return -EOPNOTSUPP;
 	if (rq->alg == Z_EROFS_COMPRESSION_INTERLACED) {
 		cur = bs - (rq->pageofs_out & (bs - 1));
 		pi = (rq->pageofs_in + rq->inputsize - cur) & ~PAGE_MASK;
diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c
index b98872058abe..81e65c453ef0 100644
--- a/fs/erofs/decompressor_deflate.c
+++ b/fs/erofs/decompressor_deflate.c
@@ -212,9 +212,6 @@ again:
 
 			if (rq->out[no] != rq->in[j])
 				continue;
-
-			DBG_BUGON(erofs_page_is_managed(EROFS_SB(sb),
-							rq->in[j]));
 			tmppage = erofs_allocpage(pgpl, rq->gfp);
 			if (!tmppage) {
 				err = -ENOMEM;
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
index 6ca357d83cfa..4b28dc130c9f 100644
--- a/fs/erofs/decompressor_lzma.c
+++ b/fs/erofs/decompressor_lzma.c
@@ -258,9 +258,6 @@ again:
 
 			if (rq->out[no] != rq->in[j])
 				continue;
-
-			DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb),
-							rq->in[j]));
 			tmppage = erofs_allocpage(pgpl, rq->gfp);
 			if (!tmppage) {
 				err = -ENOMEM;
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index 89a7c2453aae..8aff1a724805 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -3,6 +3,7 @@
  * Copyright (C) 2022, Alibaba Cloud
  * Copyright (C) 2022, Bytedance Inc. All rights reserved.
  */
+#include <linux/pseudo_fs.h>
 #include <linux/fscache.h>
 #include "internal.h"
 
@@ -12,9 +13,27 @@ static LIST_HEAD(erofs_domain_list);
 static LIST_HEAD(erofs_domain_cookies_list);
 static struct vfsmount *erofs_pseudo_mnt;
 
-struct erofs_fscache_request {
-	struct erofs_fscache_request *primary;
-	struct netfs_cache_resources cache_resources;
+static int erofs_anon_init_fs_context(struct fs_context *fc)
+{
+	return init_pseudo(fc, EROFS_SUPER_MAGIC) ? 0 : -ENOMEM;
+}
+
+static struct file_system_type erofs_anon_fs_type = {
+	.owner		= THIS_MODULE,
+	.name           = "pseudo_erofs",
+	.init_fs_context = erofs_anon_init_fs_context,
+	.kill_sb        = kill_anon_super,
+};
+
+struct erofs_fscache_io {
+	struct netfs_cache_resources cres;
+	struct iov_iter		iter;
+	netfs_io_terminated_t	end_io;
+	void			*private;
+	refcount_t		ref;
+};
+
+struct erofs_fscache_rq {
 	struct address_space	*mapping;	/* The mapping being accessed */
 	loff_t			start;		/* Start position */
 	size_t			len;		/* Length of the request */
@@ -23,44 +42,17 @@ struct erofs_fscache_request {
 	refcount_t		ref;
 };
 
-static struct erofs_fscache_request *erofs_fscache_req_alloc(struct address_space *mapping,
-					     loff_t start, size_t len)
+static bool erofs_fscache_io_put(struct erofs_fscache_io *io)
 {
-	struct erofs_fscache_request *req;
-
-	req = kzalloc(sizeof(struct erofs_fscache_request), GFP_KERNEL);
-	if (!req)
-		return ERR_PTR(-ENOMEM);
-
-	req->mapping = mapping;
-	req->start   = start;
-	req->len     = len;
-	refcount_set(&req->ref, 1);
-
-	return req;
+	if (!refcount_dec_and_test(&io->ref))
+		return false;
+	if (io->cres.ops)
+		io->cres.ops->end_operation(&io->cres);
+	kfree(io);
+	return true;
 }
 
-static struct erofs_fscache_request *erofs_fscache_req_chain(struct erofs_fscache_request *primary,
-					     size_t len)
-{
-	struct erofs_fscache_request *req;
-
-	/* use primary request for the first submission */
-	if (!primary->submitted) {
-		refcount_inc(&primary->ref);
-		return primary;
-	}
-
-	req = erofs_fscache_req_alloc(primary->mapping,
-			primary->start + primary->submitted, len);
-	if (!IS_ERR(req)) {
-		req->primary = primary;
-		refcount_inc(&primary->ref);
-	}
-	return req;
-}
-
-static void erofs_fscache_req_complete(struct erofs_fscache_request *req)
+static void erofs_fscache_req_complete(struct erofs_fscache_rq *req)
 {
 	struct folio *folio;
 	bool failed = req->error;
@@ -80,120 +72,196 @@ static void erofs_fscache_req_complete(struct erofs_fscache_request *req)
 	rcu_read_unlock();
 }
 
-static void erofs_fscache_req_put(struct erofs_fscache_request *req)
+static void erofs_fscache_req_put(struct erofs_fscache_rq *req)
 {
-	if (refcount_dec_and_test(&req->ref)) {
-		if (req->cache_resources.ops)
-			req->cache_resources.ops->end_operation(&req->cache_resources);
-		if (!req->primary)
-			erofs_fscache_req_complete(req);
-		else
-			erofs_fscache_req_put(req->primary);
-		kfree(req);
-	}
+	if (!refcount_dec_and_test(&req->ref))
+		return;
+	erofs_fscache_req_complete(req);
+	kfree(req);
 }
 
-static void erofs_fscache_subreq_complete(void *priv,
+static struct erofs_fscache_rq *erofs_fscache_req_alloc(struct address_space *mapping,
+						loff_t start, size_t len)
+{
+	struct erofs_fscache_rq *req = kzalloc(sizeof(*req), GFP_KERNEL);
+
+	if (!req)
+		return NULL;
+	req->mapping = mapping;
+	req->start = start;
+	req->len = len;
+	refcount_set(&req->ref, 1);
+	return req;
+}
+
+static void erofs_fscache_req_io_put(struct erofs_fscache_io *io)
+{
+	struct erofs_fscache_rq *req = io->private;
+
+	if (erofs_fscache_io_put(io))
+		erofs_fscache_req_put(req);
+}
+
+static void erofs_fscache_req_end_io(void *priv,
 		ssize_t transferred_or_error, bool was_async)
 {
-	struct erofs_fscache_request *req = priv;
+	struct erofs_fscache_io *io = priv;
+	struct erofs_fscache_rq *req = io->private;
 
-	if (IS_ERR_VALUE(transferred_or_error)) {
-		if (req->primary)
-			req->primary->error = transferred_or_error;
-		else
-			req->error = transferred_or_error;
-	}
-	erofs_fscache_req_put(req);
+	if (IS_ERR_VALUE(transferred_or_error))
+		req->error = transferred_or_error;
+	erofs_fscache_req_io_put(io);
+}
+
+static struct erofs_fscache_io *erofs_fscache_req_io_alloc(struct erofs_fscache_rq *req)
+{
+	struct erofs_fscache_io *io = kzalloc(sizeof(*io), GFP_KERNEL);
+
+	if (!io)
+		return NULL;
+	io->end_io = erofs_fscache_req_end_io;
+	io->private = req;
+	refcount_inc(&req->ref);
+	refcount_set(&io->ref, 1);
+	return io;
 }
 
 /*
- * Read data from fscache (cookie, pstart, len), and fill the read data into
- * page cache described by (req->mapping, lstart, len). @pstart describeis the
- * start physical address in the cache file.
+ * Read data from fscache described by cookie at pstart physical address
+ * offset, and fill the read data into buffer described by io->iter.
  */
-static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie,
-		struct erofs_fscache_request *req, loff_t pstart, size_t len)
+static int erofs_fscache_read_io_async(struct fscache_cookie *cookie,
+		loff_t pstart, struct erofs_fscache_io *io)
 {
 	enum netfs_io_source source;
-	struct super_block *sb = req->mapping->host->i_sb;
-	struct netfs_cache_resources *cres = &req->cache_resources;
-	struct iov_iter iter;
-	loff_t lstart = req->start + req->submitted;
-	size_t done = 0;
+	struct netfs_cache_resources *cres = &io->cres;
+	struct iov_iter *iter = &io->iter;
 	int ret;
 
-	DBG_BUGON(len > req->len - req->submitted);
-
 	ret = fscache_begin_read_operation(cres, cookie);
 	if (ret)
 		return ret;
 
-	while (done < len) {
-		loff_t sstart = pstart + done;
-		size_t slen = len - done;
+	while (iov_iter_count(iter)) {
+		size_t orig_count = iov_iter_count(iter), len = orig_count;
 		unsigned long flags = 1 << NETFS_SREQ_ONDEMAND;
 
 		source = cres->ops->prepare_ondemand_read(cres,
-				sstart, &slen, LLONG_MAX, &flags, 0);
-		if (WARN_ON(slen == 0))
+				pstart, &len, LLONG_MAX, &flags, 0);
+		if (WARN_ON(len == 0))
 			source = NETFS_INVALID_READ;
 		if (source != NETFS_READ_FROM_CACHE) {
-			erofs_err(sb, "failed to fscache prepare_read (source %d)", source);
+			erofs_err(NULL, "prepare_read failed (source %d)", source);
 			return -EIO;
 		}
 
-		refcount_inc(&req->ref);
-		iov_iter_xarray(&iter, ITER_DEST, &req->mapping->i_pages,
-				lstart + done, slen);
-
-		ret = fscache_read(cres, sstart, &iter, NETFS_READ_HOLE_FAIL,
-				   erofs_fscache_subreq_complete, req);
+		iov_iter_truncate(iter, len);
+		refcount_inc(&io->ref);
+		ret = fscache_read(cres, pstart, iter, NETFS_READ_HOLE_FAIL,
+				   io->end_io, io);
 		if (ret == -EIOCBQUEUED)
 			ret = 0;
 		if (ret) {
-			erofs_err(sb, "failed to fscache_read (ret %d)", ret);
+			erofs_err(NULL, "fscache_read failed (ret %d)", ret);
 			return ret;
 		}
+		if (WARN_ON(iov_iter_count(iter)))
+			return -EIO;
 
-		done += slen;
+		iov_iter_reexpand(iter, orig_count - len);
+		pstart += len;
 	}
-	DBG_BUGON(done != len);
 	return 0;
 }
 
-static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio)
+struct erofs_fscache_bio {
+	struct erofs_fscache_io io;
+	struct bio bio;		/* w/o bdev to share bio_add_page/endio() */
+	struct bio_vec bvecs[BIO_MAX_VECS];
+};
+
+static void erofs_fscache_bio_endio(void *priv,
+		ssize_t transferred_or_error, bool was_async)
+{
+	struct erofs_fscache_bio *io = priv;
+
+	if (IS_ERR_VALUE(transferred_or_error))
+		io->bio.bi_status = errno_to_blk_status(transferred_or_error);
+	io->bio.bi_end_io(&io->bio);
+	BUILD_BUG_ON(offsetof(struct erofs_fscache_bio, io) != 0);
+	erofs_fscache_io_put(&io->io);
+}
+
+struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev)
 {
+	struct erofs_fscache_bio *io;
+
+	io = kmalloc(sizeof(*io), GFP_KERNEL | __GFP_NOFAIL);
+	bio_init(&io->bio, NULL, io->bvecs, BIO_MAX_VECS, REQ_OP_READ);
+	io->io.private = mdev->m_fscache->cookie;
+	io->io.end_io = erofs_fscache_bio_endio;
+	refcount_set(&io->io.ref, 1);
+	return &io->bio;
+}
+
+void erofs_fscache_submit_bio(struct bio *bio)
+{
+	struct erofs_fscache_bio *io = container_of(bio,
+			struct erofs_fscache_bio, bio);
 	int ret;
+
+	iov_iter_bvec(&io->io.iter, ITER_DEST, io->bvecs, bio->bi_vcnt,
+		      bio->bi_iter.bi_size);
+	ret = erofs_fscache_read_io_async(io->io.private,
+				bio->bi_iter.bi_sector << 9, &io->io);
+	erofs_fscache_io_put(&io->io);
+	if (!ret)
+		return;
+	bio->bi_status = errno_to_blk_status(ret);
+	bio->bi_end_io(bio);
+}
+
+static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio)
+{
 	struct erofs_fscache *ctx = folio->mapping->host->i_private;
-	struct erofs_fscache_request *req;
+	int ret = -ENOMEM;
+	struct erofs_fscache_rq *req;
+	struct erofs_fscache_io *io;
 
 	req = erofs_fscache_req_alloc(folio->mapping,
 				folio_pos(folio), folio_size(folio));
-	if (IS_ERR(req)) {
+	if (!req) {
 		folio_unlock(folio);
-		return PTR_ERR(req);
+		return ret;
 	}
 
-	ret = erofs_fscache_read_folios_async(ctx->cookie, req,
-				folio_pos(folio), folio_size(folio));
+	io = erofs_fscache_req_io_alloc(req);
+	if (!io) {
+		req->error = ret;
+		goto out;
+	}
+	iov_iter_xarray(&io->iter, ITER_DEST, &folio->mapping->i_pages,
+			folio_pos(folio), folio_size(folio));
+
+	ret = erofs_fscache_read_io_async(ctx->cookie, folio_pos(folio), io);
 	if (ret)
 		req->error = ret;
 
+	erofs_fscache_req_io_put(io);
+out:
 	erofs_fscache_req_put(req);
 	return ret;
 }
 
-static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary)
+static int erofs_fscache_data_read_slice(struct erofs_fscache_rq *req)
 {
-	struct address_space *mapping = primary->mapping;
+	struct address_space *mapping = req->mapping;
 	struct inode *inode = mapping->host;
 	struct super_block *sb = inode->i_sb;
-	struct erofs_fscache_request *req;
+	struct erofs_fscache_io *io;
 	struct erofs_map_blocks map;
 	struct erofs_map_dev mdev;
-	struct iov_iter iter;
-	loff_t pos = primary->start + primary->submitted;
+	loff_t pos = req->start + req->submitted;
 	size_t count;
 	int ret;
 
@@ -204,6 +272,7 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary)
 
 	if (map.m_flags & EROFS_MAP_META) {
 		struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+		struct iov_iter iter;
 		erofs_blk_t blknr;
 		size_t offset, size;
 		void *src;
@@ -224,15 +293,17 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary)
 		}
 		iov_iter_zero(PAGE_SIZE - size, &iter);
 		erofs_put_metabuf(&buf);
-		primary->submitted += PAGE_SIZE;
+		req->submitted += PAGE_SIZE;
 		return 0;
 	}
 
-	count = primary->len - primary->submitted;
+	count = req->len - req->submitted;
 	if (!(map.m_flags & EROFS_MAP_MAPPED)) {
+		struct iov_iter iter;
+
 		iov_iter_xarray(&iter, ITER_DEST, &mapping->i_pages, pos, count);
 		iov_iter_zero(count, &iter);
-		primary->submitted += count;
+		req->submitted += count;
 		return 0;
 	}
 
@@ -247,18 +318,19 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary)
 	if (ret)
 		return ret;
 
-	req = erofs_fscache_req_chain(primary, count);
-	if (IS_ERR(req))
-		return PTR_ERR(req);
+	io = erofs_fscache_req_io_alloc(req);
+	if (!io)
+		return -ENOMEM;
+	iov_iter_xarray(&io->iter, ITER_DEST, &mapping->i_pages, pos, count);
+	ret = erofs_fscache_read_io_async(mdev.m_fscache->cookie,
+			mdev.m_pa + (pos - map.m_la), io);
+	erofs_fscache_req_io_put(io);
 
-	ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
-			req, mdev.m_pa + (pos - map.m_la), count);
-	erofs_fscache_req_put(req);
-	primary->submitted += count;
+	req->submitted += count;
 	return ret;
 }
 
-static int erofs_fscache_data_read(struct erofs_fscache_request *req)
+static int erofs_fscache_data_read(struct erofs_fscache_rq *req)
 {
 	int ret;
 
@@ -267,20 +339,19 @@ static int erofs_fscache_data_read(struct erofs_fscache_request *req)
 		if (ret)
 			req->error = ret;
 	} while (!ret && req->submitted < req->len);
-
 	return ret;
 }
 
 static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
 {
-	struct erofs_fscache_request *req;
+	struct erofs_fscache_rq *req;
 	int ret;
 
 	req = erofs_fscache_req_alloc(folio->mapping,
 			folio_pos(folio), folio_size(folio));
-	if (IS_ERR(req)) {
+	if (!req) {
 		folio_unlock(folio);
-		return PTR_ERR(req);
+		return -ENOMEM;
 	}
 
 	ret = erofs_fscache_data_read(req);
@@ -290,14 +361,14 @@ static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
 
 static void erofs_fscache_readahead(struct readahead_control *rac)
 {
-	struct erofs_fscache_request *req;
+	struct erofs_fscache_rq *req;
 
 	if (!readahead_count(rac))
 		return;
 
 	req = erofs_fscache_req_alloc(rac->mapping,
 			readahead_pos(rac), readahead_length(rac));
-	if (IS_ERR(req))
+	if (!req)
 		return;
 
 	/* The request completion will drop refs on the folios. */
@@ -381,7 +452,7 @@ static int erofs_fscache_init_domain(struct super_block *sb)
 		goto out;
 
 	if (!erofs_pseudo_mnt) {
-		struct vfsmount *mnt = kern_mount(&erofs_fs_type);
+		struct vfsmount *mnt = kern_mount(&erofs_anon_fs_type);
 		if (IS_ERR(mnt)) {
 			err = PTR_ERR(mnt);
 			goto out;
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 36e638e8b53a..0eb0e6f933c3 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -259,14 +259,12 @@ static int erofs_fill_inode(struct inode *inode)
 
 	if (erofs_inode_is_data_compressed(vi->datalayout)) {
 #ifdef CONFIG_EROFS_FS_ZIP
-		if (!erofs_is_fscache_mode(inode->i_sb)) {
-			DO_ONCE_LITE_IF(inode->i_sb->s_blocksize != PAGE_SIZE,
-				  erofs_info, inode->i_sb,
-				  "EXPERIMENTAL EROFS subpage compressed block support in use. Use at your own risk!");
-			inode->i_mapping->a_ops = &z_erofs_aops;
-			err = 0;
-			goto out_unlock;
-		}
+		DO_ONCE_LITE_IF(inode->i_blkbits != PAGE_SHIFT,
+			  erofs_info, inode->i_sb,
+			  "EXPERIMENTAL EROFS subpage compressed block support in use. Use at your own risk!");
+		inode->i_mapping->a_ops = &z_erofs_aops;
+		err = 0;
+		goto out_unlock;
 #endif
 		err = -EOPNOTSUPP;
 		goto out_unlock;
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index b0409badb017..39c67119f43b 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -49,7 +49,7 @@ typedef u32 erofs_blk_t;
 struct erofs_device_info {
 	char *path;
 	struct erofs_fscache *fscache;
-	struct bdev_handle *bdev_handle;
+	struct file *bdev_file;
 	struct dax_device *dax_dev;
 	u64 dax_part_off;
 
@@ -385,7 +385,6 @@ struct erofs_map_dev {
 	unsigned int m_deviceid;
 };
 
-extern struct file_system_type erofs_fs_type;
 extern const struct super_operations erofs_sops;
 
 extern const struct address_space_operations erofs_raw_access_aops;
@@ -467,8 +466,8 @@ int __init erofs_init_shrinker(void);
 void erofs_exit_shrinker(void);
 int __init z_erofs_init_zip_subsystem(void);
 void z_erofs_exit_zip_subsystem(void);
-int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
-				       struct erofs_workgroup *egrp);
+int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
+					struct erofs_workgroup *egrp);
 int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
 			    int flags);
 void *erofs_get_pcpubuf(unsigned int requiredpages);
@@ -513,6 +512,8 @@ void erofs_fscache_unregister_fs(struct super_block *sb);
 struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
 					char *name, unsigned int flags);
 void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache);
+struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev);
+void erofs_fscache_submit_bio(struct bio *bio);
 #else
 static inline int erofs_fscache_register_fs(struct super_block *sb)
 {
@@ -530,6 +531,8 @@ struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
 static inline void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache)
 {
 }
+static inline struct bio *erofs_fscache_bio_alloc(struct erofs_map_dev *mdev) { return NULL; }
+static inline void erofs_fscache_submit_bio(struct bio *bio) {}
 #endif
 
 #define EFSCORRUPTED    EUCLEAN         /* Filesystem is corrupted */
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 5f60f163bd56..c0eb139adb07 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -177,7 +177,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
 	struct erofs_sb_info *sbi = EROFS_SB(sb);
 	struct erofs_fscache *fscache;
 	struct erofs_deviceslot *dis;
-	struct bdev_handle *bdev_handle;
+	struct file *bdev_file;
 	void *ptr;
 
 	ptr = erofs_read_metabuf(buf, sb, erofs_blknr(sb, *pos), EROFS_KMAP);
@@ -201,12 +201,12 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
 			return PTR_ERR(fscache);
 		dif->fscache = fscache;
 	} else if (!sbi->devs->flatdev) {
-		bdev_handle = bdev_open_by_path(dif->path, BLK_OPEN_READ,
+		bdev_file = bdev_file_open_by_path(dif->path, BLK_OPEN_READ,
 						sb->s_type, NULL);
-		if (IS_ERR(bdev_handle))
-			return PTR_ERR(bdev_handle);
-		dif->bdev_handle = bdev_handle;
-		dif->dax_dev = fs_dax_get_by_bdev(bdev_handle->bdev,
+		if (IS_ERR(bdev_file))
+			return PTR_ERR(bdev_file);
+		dif->bdev_file = bdev_file;
+		dif->dax_dev = fs_dax_get_by_bdev(file_bdev(bdev_file),
 				&dif->dax_part_off, NULL, NULL);
 	}
 
@@ -430,7 +430,6 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode)
 
 	switch (mode) {
 	case EROFS_MOUNT_DAX_ALWAYS:
-		warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
 		set_opt(&ctx->opt, DAX_ALWAYS);
 		clear_opt(&ctx->opt, DAX_NEVER);
 		return true;
@@ -579,13 +578,6 @@ static const struct export_operations erofs_export_ops = {
 	.get_parent = erofs_get_parent,
 };
 
-static int erofs_fc_fill_pseudo_super(struct super_block *sb, struct fs_context *fc)
-{
-	static const struct tree_descr empty_descr = {""};
-
-	return simple_fill_super(sb, EROFS_SUPER_MAGIC, &empty_descr);
-}
-
 static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
 {
 	struct inode *inode;
@@ -712,11 +704,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
 	return 0;
 }
 
-static int erofs_fc_anon_get_tree(struct fs_context *fc)
-{
-	return get_tree_nodev(fc, erofs_fc_fill_pseudo_super);
-}
-
 static int erofs_fc_get_tree(struct fs_context *fc)
 {
 	struct erofs_fs_context *ctx = fc->fs_private;
@@ -754,8 +741,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data)
 	struct erofs_device_info *dif = ptr;
 
 	fs_put_dax(dif->dax_dev, NULL);
-	if (dif->bdev_handle)
-		bdev_release(dif->bdev_handle);
+	if (dif->bdev_file)
+		fput(dif->bdev_file);
 	erofs_fscache_unregister_cookie(dif->fscache);
 	dif->fscache = NULL;
 	kfree(dif->path);
@@ -789,20 +776,10 @@ static const struct fs_context_operations erofs_context_ops = {
 	.free		= erofs_fc_free,
 };
 
-static const struct fs_context_operations erofs_anon_context_ops = {
-	.get_tree       = erofs_fc_anon_get_tree,
-};
-
 static int erofs_init_fs_context(struct fs_context *fc)
 {
 	struct erofs_fs_context *ctx;
 
-	/* pseudo mount for anon inodes */
-	if (fc->sb_flags & SB_KERNMOUNT) {
-		fc->ops = &erofs_anon_context_ops;
-		return 0;
-	}
-
 	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 	if (!ctx)
 		return -ENOMEM;
@@ -824,12 +801,6 @@ static void erofs_kill_sb(struct super_block *sb)
 {
 	struct erofs_sb_info *sbi;
 
-	/* pseudo mount for anon inodes */
-	if (sb->s_flags & SB_KERNMOUNT) {
-		kill_anon_super(sb);
-		return;
-	}
-
 	if (erofs_is_fscache_mode(sb))
 		kill_anon_super(sb);
 	else
@@ -868,7 +839,7 @@ static void erofs_put_super(struct super_block *sb)
 	erofs_fscache_unregister_fs(sb);
 }
 
-struct file_system_type erofs_fs_type = {
+static struct file_system_type erofs_fs_type = {
 	.owner          = THIS_MODULE,
 	.name           = "erofs",
 	.init_fs_context = erofs_init_fs_context,
@@ -885,7 +856,7 @@ static int __init erofs_module_init(void)
 
 	erofs_inode_cachep = kmem_cache_create("erofs_inode",
 			sizeof(struct erofs_inode), 0,
-			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
+			SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
 			erofs_inode_init_once);
 	if (!erofs_inode_cachep)
 		return -ENOMEM;
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index e146d09151af..518bdd69c823 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -129,7 +129,7 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
 	 * the XArray. Otherwise some cached pages could be still attached to
 	 * the orphan old workgroup when the new one is available in the tree.
 	 */
-	if (erofs_try_to_free_all_cached_pages(sbi, grp))
+	if (erofs_try_to_free_all_cached_folios(sbi, grp))
 		goto out;
 
 	/*
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index ff0aa72b0db3..3216b920d369 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -19,7 +19,10 @@
 typedef void *z_erofs_next_pcluster_t;
 
 struct z_erofs_bvec {
-	struct page *page;
+	union {
+		struct page *page;
+		struct folio *folio;
+	};
 	int offset;
 	unsigned int end;
 };
@@ -116,47 +119,46 @@ static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
 	return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT;
 }
 
+#define MNGD_MAPPING(sbi)	((sbi)->managed_cache->i_mapping)
+static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo)
+{
+	return fo->mapping == MNGD_MAPPING(sbi);
+}
+
 /*
- * bit 30: I/O error occurred on this page
- * bit 0 - 29: remaining parts to complete this page
+ * bit 30: I/O error occurred on this folio
+ * bit 0 - 29: remaining parts to complete this folio
  */
-#define Z_EROFS_PAGE_EIO			(1 << 30)
+#define Z_EROFS_FOLIO_EIO			(1 << 30)
 
-static inline void z_erofs_onlinepage_init(struct page *page)
+static void z_erofs_onlinefolio_init(struct folio *folio)
 {
 	union {
 		atomic_t o;
-		unsigned long v;
+		void *v;
 	} u = { .o = ATOMIC_INIT(1) };
 
-	set_page_private(page, u.v);
-	smp_wmb();
-	SetPagePrivate(page);
+	folio->private = u.v;	/* valid only if file-backed folio is locked */
 }
 
-static inline void z_erofs_onlinepage_split(struct page *page)
+static void z_erofs_onlinefolio_split(struct folio *folio)
 {
-	atomic_inc((atomic_t *)&page->private);
+	atomic_inc((atomic_t *)&folio->private);
 }
 
-static void z_erofs_onlinepage_endio(struct page *page, int err)
+static void z_erofs_onlinefolio_end(struct folio *folio, int err)
 {
 	int orig, v;
 
-	DBG_BUGON(!PagePrivate(page));
-
 	do {
-		orig = atomic_read((atomic_t *)&page->private);
-		v = (orig - 1) | (err ? Z_EROFS_PAGE_EIO : 0);
-	} while (atomic_cmpxchg((atomic_t *)&page->private, orig, v) != orig);
-
-	if (!(v & ~Z_EROFS_PAGE_EIO)) {
-		set_page_private(page, 0);
-		ClearPagePrivate(page);
-		if (!(v & Z_EROFS_PAGE_EIO))
-			SetPageUptodate(page);
-		unlock_page(page);
-	}
+		orig = atomic_read((atomic_t *)&folio->private);
+		v = (orig - 1) | (err ? Z_EROFS_FOLIO_EIO : 0);
+	} while (atomic_cmpxchg((atomic_t *)&folio->private, orig, v) != orig);
+
+	if (v & ~Z_EROFS_FOLIO_EIO)
+		return;
+	folio->private = 0;
+	folio_end_read(folio, !(v & Z_EROFS_FOLIO_EIO));
 }
 
 #define Z_EROFS_ONSTACK_PAGES		32
@@ -572,17 +574,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
 
 	for (i = 0; i < pclusterpages; ++i) {
 		struct page *page, *newpage;
-		void *t;	/* mark pages just found for debugging */
 
 		/* Inaccurate check w/o locking to avoid unneeded lookups */
 		if (READ_ONCE(pcl->compressed_bvecs[i].page))
 			continue;
 
 		page = find_get_page(mc, pcl->obj.index + i);
-		if (page) {
-			t = (void *)((unsigned long)page | 1);
-			newpage = NULL;
-		} else {
+		if (!page) {
 			/* I/O is needed, no possible to decompress directly */
 			standalone = false;
 			if (!shouldalloc)
@@ -596,11 +594,10 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
 			if (!newpage)
 				continue;
 			set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
-			t = (void *)((unsigned long)newpage | 1);
 		}
 		spin_lock(&pcl->obj.lockref.lock);
 		if (!pcl->compressed_bvecs[i].page) {
-			pcl->compressed_bvecs[i].page = t;
+			pcl->compressed_bvecs[i].page = page ? page : newpage;
 			spin_unlock(&pcl->obj.lockref.lock);
 			continue;
 		}
@@ -620,9 +617,9 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
 		fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
 }
 
-/* called by erofs_shrinker to get rid of all compressed_pages */
-int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
-				       struct erofs_workgroup *grp)
+/* called by erofs_shrinker to get rid of all cached compressed bvecs */
+int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi,
+					struct erofs_workgroup *grp)
 {
 	struct z_erofs_pcluster *const pcl =
 		container_of(grp, struct z_erofs_pcluster, obj);
@@ -630,27 +627,22 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
 	int i;
 
 	DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
-	/*
-	 * refcount of workgroup is now freezed as 0,
-	 * therefore no need to worry about available decompression users.
-	 */
+	/* There is no actice user since the pcluster is now freezed */
 	for (i = 0; i < pclusterpages; ++i) {
-		struct page *page = pcl->compressed_bvecs[i].page;
+		struct folio *folio = pcl->compressed_bvecs[i].folio;
 
-		if (!page)
+		if (!folio)
 			continue;
 
-		/* block other users from reclaiming or migrating the page */
-		if (!trylock_page(page))
+		/* Avoid reclaiming or migrating this folio */
+		if (!folio_trylock(folio))
 			return -EBUSY;
 
-		if (!erofs_page_is_managed(sbi, page))
+		if (!erofs_folio_is_managed(sbi, folio))
 			continue;
-
-		/* barrier is implied in the following 'unlock_page' */
-		WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
-		detach_page_private(page);
-		unlock_page(page);
+		pcl->compressed_bvecs[i].folio = NULL;
+		folio_detach_private(folio);
+		folio_unlock(folio);
 	}
 	return 0;
 }
@@ -667,20 +659,17 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
 
 	ret = false;
 	spin_lock(&pcl->obj.lockref.lock);
-	if (pcl->obj.lockref.count > 0)
-		goto out;
-
-	DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
-	for (i = 0; i < pclusterpages; ++i) {
-		if (pcl->compressed_bvecs[i].page == &folio->page) {
-			WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
-			ret = true;
-			break;
+	if (pcl->obj.lockref.count <= 0) {
+		DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
+		for (i = 0; i < pclusterpages; ++i) {
+			if (pcl->compressed_bvecs[i].folio == folio) {
+				pcl->compressed_bvecs[i].folio = NULL;
+				folio_detach_private(folio);
+				ret = true;
+				break;
+			}
 		}
 	}
-	if (ret)
-		folio_detach_private(folio);
-out:
 	spin_unlock(&pcl->obj.lockref.lock);
 	return ret;
 }
@@ -962,20 +951,20 @@ static int z_erofs_read_fragment(struct super_block *sb, struct page *page,
 	return 0;
 }
 
-static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
-				struct page *page, bool ra)
+static int z_erofs_scan_folio(struct z_erofs_decompress_frontend *fe,
+			      struct folio *folio, bool ra)
 {
 	struct inode *const inode = fe->inode;
 	struct erofs_map_blocks *const map = &fe->map;
-	const loff_t offset = page_offset(page);
-	const unsigned int bs = i_blocksize(inode);
+	const loff_t offset = folio_pos(folio);
+	const unsigned int bs = i_blocksize(inode), fs = folio_size(folio);
 	bool tight = true, exclusive;
 	unsigned int cur, end, len, split;
 	int err = 0;
 
-	z_erofs_onlinepage_init(page);
+	z_erofs_onlinefolio_init(folio);
 	split = 0;
-	end = PAGE_SIZE;
+	end = fs;
 repeat:
 	if (offset + end - 1 < map->m_la ||
 	    offset + end - 1 >= map->m_la + map->m_llen) {
@@ -992,7 +981,7 @@ repeat:
 	++split;
 
 	if (!(map->m_flags & EROFS_MAP_MAPPED)) {
-		zero_user_segment(page, cur, end);
+		folio_zero_segment(folio, cur, end);
 		tight = false;
 		goto next_part;
 	}
@@ -1001,8 +990,8 @@ repeat:
 		erofs_off_t fpos = offset + cur - map->m_la;
 
 		len = min_t(unsigned int, map->m_llen - fpos, end - cur);
-		err = z_erofs_read_fragment(inode->i_sb, page, cur, cur + len,
-				EROFS_I(inode)->z_fragmentoff + fpos);
+		err = z_erofs_read_fragment(inode->i_sb, &folio->page, cur,
+			cur + len, EROFS_I(inode)->z_fragmentoff + fpos);
 		if (err)
 			goto out;
 		tight = false;
@@ -1017,25 +1006,25 @@ repeat:
 	}
 
 	/*
-	 * Ensure the current partial page belongs to this submit chain rather
+	 * Ensure the current partial folio belongs to this submit chain rather
 	 * than other concurrent submit chains or the noio(bypass) chain since
-	 * those chains are handled asynchronously thus the page cannot be used
+	 * those chains are handled asynchronously thus the folio cannot be used
 	 * for inplace I/O or bvpage (should be processed in a strict order.)
 	 */
 	tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
-	exclusive = (!cur && ((split <= 1) || (tight && bs == PAGE_SIZE)));
+	exclusive = (!cur && ((split <= 1) || (tight && bs == fs)));
 	if (cur)
 		tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED);
 
 	err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) {
-					.page = page,
+					.page = &folio->page,
 					.offset = offset - map->m_la,
 					.end = end,
 				  }), exclusive);
 	if (err)
 		goto out;
 
-	z_erofs_onlinepage_split(page);
+	z_erofs_onlinefolio_split(folio);
 	if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK))
 		fe->pcl->multibases = true;
 	if (fe->pcl->length < offset + end - map->m_la) {
@@ -1056,7 +1045,7 @@ next_part:
 		goto repeat;
 
 out:
-	z_erofs_onlinepage_endio(page, err);
+	z_erofs_onlinefolio_end(folio, err);
 	return err;
 }
 
@@ -1159,7 +1148,7 @@ static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be,
 			cur += len;
 		}
 		kunmap_local(dst);
-		z_erofs_onlinepage_endio(bvi->bvec.page, err);
+		z_erofs_onlinefolio_end(page_folio(bvi->bvec.page), err);
 		list_del(p);
 		kfree(bvi);
 	}
@@ -1210,7 +1199,7 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be,
 		be->compressed_pages[i] = page;
 
 		if (z_erofs_is_inline_pcluster(pcl) ||
-		    erofs_page_is_managed(EROFS_SB(be->sb), page)) {
+		    erofs_folio_is_managed(EROFS_SB(be->sb), page_folio(page))) {
 			if (!PageUptodate(page))
 				err = -EIO;
 			continue;
@@ -1295,7 +1284,8 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
 			/* consider shortlived pages added when decompressing */
 			page = be->compressed_pages[i];
 
-			if (!page || erofs_page_is_managed(sbi, page))
+			if (!page ||
+			    erofs_folio_is_managed(sbi, page_folio(page)))
 				continue;
 			(void)z_erofs_put_shortlivedpage(be->pagepool, page);
 			WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
@@ -1316,7 +1306,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
 		/* recycle all individual short-lived pages */
 		if (z_erofs_put_shortlivedpage(be->pagepool, page))
 			continue;
-		z_erofs_onlinepage_endio(page, err);
+		z_erofs_onlinefolio_end(page_folio(page), err);
 	}
 
 	if (be->decompressed_pages != be->onstack_pages)
@@ -1430,38 +1420,34 @@ static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
 	struct z_erofs_bvec zbv;
 	struct address_space *mapping;
 	struct page *page;
-	int justfound, bs = i_blocksize(f->inode);
+	int bs = i_blocksize(f->inode);
 
-	/* Except for inplace pages, the entire page can be used for I/Os */
+	/* Except for inplace folios, the entire folio can be used for I/Os */
 	bvec->bv_offset = 0;
 	bvec->bv_len = PAGE_SIZE;
 repeat:
 	spin_lock(&pcl->obj.lockref.lock);
 	zbv = pcl->compressed_bvecs[nr];
-	page = zbv.page;
-	justfound = (unsigned long)page & 1UL;
-	page = (struct page *)((unsigned long)page & ~1UL);
-	pcl->compressed_bvecs[nr].page = page;
 	spin_unlock(&pcl->obj.lockref.lock);
-	if (!page)
-		goto out_allocpage;
+	if (!zbv.folio)
+		goto out_allocfolio;
 
-	bvec->bv_page = page;
-	DBG_BUGON(z_erofs_is_shortlived_page(page));
+	bvec->bv_page = &zbv.folio->page;
+	DBG_BUGON(z_erofs_is_shortlived_page(bvec->bv_page));
 	/*
-	 * Handle preallocated cached pages.  We tried to allocate such pages
+	 * Handle preallocated cached folios.  We tried to allocate such folios
 	 * without triggering direct reclaim.  If allocation failed, inplace
-	 * file-backed pages will be used instead.
+	 * file-backed folios will be used instead.
 	 */
-	if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
-		set_page_private(page, 0);
+	if (zbv.folio->private == (void *)Z_EROFS_PREALLOCATED_PAGE) {
+		zbv.folio->private = 0;
 		tocache = true;
 		goto out_tocache;
 	}
 
-	mapping = READ_ONCE(page->mapping);
+	mapping = READ_ONCE(zbv.folio->mapping);
 	/*
-	 * File-backed pages for inplace I/Os are all locked steady,
+	 * File-backed folios for inplace I/Os are all locked steady,
 	 * therefore it is impossible for `mapping` to be NULL.
 	 */
 	if (mapping && mapping != mc) {
@@ -1471,26 +1457,21 @@ repeat:
 		return;
 	}
 
-	lock_page(page);
-	/* only true if page reclaim goes wrong, should never happen */
-	DBG_BUGON(justfound && PagePrivate(page));
-
-	/* the cached page is still in managed cache */
-	if (page->mapping == mc) {
+	folio_lock(zbv.folio);
+	if (zbv.folio->mapping == mc) {
 		/*
-		 * The cached page is still available but without a valid
-		 * `->private` pcluster hint.  Let's reconnect them.
+		 * The cached folio is still in managed cache but without
+		 * a valid `->private` pcluster hint.  Let's reconnect them.
 		 */
-		if (!PagePrivate(page)) {
-			DBG_BUGON(!justfound);
-			/* compressed_bvecs[] already takes a ref */
-			attach_page_private(page, pcl);
-			put_page(page);
+		if (!folio_test_private(zbv.folio)) {
+			folio_attach_private(zbv.folio, pcl);
+			/* compressed_bvecs[] already takes a ref before */
+			folio_put(zbv.folio);
 		}
 
 		/* no need to submit if it is already up-to-date */
-		if (PageUptodate(page)) {
-			unlock_page(page);
+		if (folio_test_uptodate(zbv.folio)) {
+			folio_unlock(zbv.folio);
 			bvec->bv_page = NULL;
 		}
 		return;
@@ -1500,34 +1481,32 @@ repeat:
 	 * It has been truncated, so it's unsafe to reuse this one. Let's
 	 * allocate a new page for compressed data.
 	 */
-	DBG_BUGON(page->mapping);
-	DBG_BUGON(!justfound);
-
+	DBG_BUGON(zbv.folio->mapping);
 	tocache = true;
-	unlock_page(page);
-	put_page(page);
-out_allocpage:
+	folio_unlock(zbv.folio);
+	folio_put(zbv.folio);
+out_allocfolio:
 	page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
 	spin_lock(&pcl->obj.lockref.lock);
-	if (pcl->compressed_bvecs[nr].page) {
+	if (pcl->compressed_bvecs[nr].folio) {
 		erofs_pagepool_add(&f->pagepool, page);
 		spin_unlock(&pcl->obj.lockref.lock);
 		cond_resched();
 		goto repeat;
 	}
-	pcl->compressed_bvecs[nr].page = page;
+	pcl->compressed_bvecs[nr].folio = zbv.folio = page_folio(page);
 	spin_unlock(&pcl->obj.lockref.lock);
 	bvec->bv_page = page;
 out_tocache:
 	if (!tocache || bs != PAGE_SIZE ||
-	    add_to_page_cache_lru(page, mc, pcl->obj.index + nr, gfp)) {
-		/* turn into a temporary shortlived page (1 ref) */
-		set_page_private(page, Z_EROFS_SHORTLIVED_PAGE);
+	    filemap_add_folio(mc, zbv.folio, pcl->obj.index + nr, gfp)) {
+		/* turn into a temporary shortlived folio (1 ref) */
+		zbv.folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE;
 		return;
 	}
-	attach_page_private(page, pcl);
+	folio_attach_private(zbv.folio, pcl);
 	/* drop a refcount added by allocpage (then 2 refs in total here) */
-	put_page(page);
+	folio_put(zbv.folio);
 }
 
 static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb,
@@ -1582,28 +1561,29 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
 	qtail[JQ_BYPASS] = &pcl->next;
 }
 
-static void z_erofs_submissionqueue_endio(struct bio *bio)
+static void z_erofs_endio(struct bio *bio)
 {
 	struct z_erofs_decompressqueue *q = bio->bi_private;
 	blk_status_t err = bio->bi_status;
-	struct bio_vec *bvec;
-	struct bvec_iter_all iter_all;
+	struct folio_iter fi;
 
-	bio_for_each_segment_all(bvec, bio, iter_all) {
-		struct page *page = bvec->bv_page;
+	bio_for_each_folio_all(fi, bio) {
+		struct folio *folio = fi.folio;
 
-		DBG_BUGON(PageUptodate(page));
-		DBG_BUGON(z_erofs_page_is_invalidated(page));
-		if (erofs_page_is_managed(EROFS_SB(q->sb), page)) {
-			if (!err)
-				SetPageUptodate(page);
-			unlock_page(page);
-		}
+		DBG_BUGON(folio_test_uptodate(folio));
+		DBG_BUGON(z_erofs_page_is_invalidated(&folio->page));
+		if (!erofs_folio_is_managed(EROFS_SB(q->sb), folio))
+			continue;
+
+		if (!err)
+			folio_mark_uptodate(folio);
+		folio_unlock(folio);
 	}
 	if (err)
 		q->eio = true;
 	z_erofs_decompress_kickoff(q, -1);
-	bio_put(bio);
+	if (bio->bi_bdev)
+		bio_put(bio);
 }
 
 static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
@@ -1617,7 +1597,6 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
 	z_erofs_next_pcluster_t owned_head = f->owned_head;
 	/* bio is NULL initially, so no need to initialize last_{index,bdev} */
 	erofs_off_t last_pa;
-	struct block_device *last_bdev;
 	unsigned int nr_bios = 0;
 	struct bio *bio = NULL;
 	unsigned long pflags;
@@ -1664,9 +1643,13 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
 				continue;
 
 			if (bio && (cur != last_pa ||
-				    last_bdev != mdev.m_bdev)) {
-submit_bio_retry:
-				submit_bio(bio);
+				    bio->bi_bdev != mdev.m_bdev)) {
+io_retry:
+				if (!erofs_is_fscache_mode(sb))
+					submit_bio(bio);
+				else
+					erofs_fscache_submit_bio(bio);
+
 				if (memstall) {
 					psi_memstall_leave(&pflags);
 					memstall = 0;
@@ -1681,15 +1664,16 @@ submit_bio_retry:
 			}
 
 			if (!bio) {
-				bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
-						REQ_OP_READ, GFP_NOIO);
-				bio->bi_end_io = z_erofs_submissionqueue_endio;
+				bio = erofs_is_fscache_mode(sb) ?
+					erofs_fscache_bio_alloc(&mdev) :
+					bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
+						  REQ_OP_READ, GFP_NOIO);
+				bio->bi_end_io = z_erofs_endio;
 				bio->bi_iter.bi_sector = cur >> 9;
 				bio->bi_private = q[JQ_SUBMIT];
 				if (readahead)
 					bio->bi_opf |= REQ_RAHEAD;
 				++nr_bios;
-				last_bdev = mdev.m_bdev;
 			}
 
 			if (cur + bvec.bv_len > end)
@@ -1697,7 +1681,7 @@ submit_bio_retry:
 			DBG_BUGON(bvec.bv_len < sb->s_blocksize);
 			if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
 					  bvec.bv_offset))
-				goto submit_bio_retry;
+				goto io_retry;
 
 			last_pa = cur + bvec.bv_len;
 			bypass = false;
@@ -1710,7 +1694,10 @@ submit_bio_retry:
 	} while (owned_head != Z_EROFS_PCLUSTER_TAIL);
 
 	if (bio) {
-		submit_bio(bio);
+		if (!erofs_is_fscache_mode(sb))
+			submit_bio(bio);
+		else
+			erofs_fscache_submit_bio(bio);
 		if (memstall)
 			psi_memstall_leave(&pflags);
 	}
@@ -1795,7 +1782,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
 			if (PageUptodate(page))
 				unlock_page(page);
 			else
-				(void)z_erofs_do_read_page(f, page, !!rac);
+				z_erofs_scan_folio(f, page_folio(page), !!rac);
 			put_page(page);
 		}
 
@@ -1816,7 +1803,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio)
 	f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;
 
 	z_erofs_pcluster_readmore(&f, NULL, true);
-	err = z_erofs_do_read_page(&f, &folio->page, false);
+	err = z_erofs_scan_folio(&f, folio, false);
 	z_erofs_pcluster_readmore(&f, NULL, false);
 	z_erofs_pcluster_end(&f);
 
@@ -1857,7 +1844,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
 		folio = head;
 		head = folio_get_private(folio);
 
-		err = z_erofs_do_read_page(&f, &folio->page, true);
+		err = z_erofs_scan_folio(&f, folio, true);
 		if (err && err != -EINTR)
 			erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu",
 				  folio->index, EROFS_I(inode)->nid);
diff --git a/fs/eventfd.c b/fs/eventfd.c
index ad8186d47ba7..9afdb722fa92 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -251,7 +251,7 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
 	ssize_t res;
 	__u64 ucnt;
 
-	if (count < sizeof(ucnt))
+	if (count != sizeof(ucnt))
 		return -EINVAL;
 	if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
 		return -EFAULT;
@@ -283,13 +283,18 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
 static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
 {
 	struct eventfd_ctx *ctx = f->private_data;
+	__u64 cnt;
 
 	spin_lock_irq(&ctx->wqh.lock);
-	seq_printf(m, "eventfd-count: %16llx\n",
-		   (unsigned long long)ctx->count);
+	cnt = ctx->count;
 	spin_unlock_irq(&ctx->wqh.lock);
-	seq_printf(m, "eventfd-id: %d\n", ctx->id);
-	seq_printf(m, "eventfd-semaphore: %d\n",
+
+	seq_printf(m,
+		   "eventfd-count: %16llx\n"
+		   "eventfd-id: %d\n"
+		   "eventfd-semaphore: %d\n",
+		   cnt,
+		   ctx->id,
 		   !!(ctx->flags & EFD_SEMAPHORE));
 }
 #endif
@@ -383,6 +388,7 @@ static int do_eventfd(unsigned int count, int flags)
 	/* Check the EFD_* constants for consistency.  */
 	BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
 	BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
+	BUILD_BUG_ON(EFD_SEMAPHORE != (1 << 0));
 
 	if (flags & ~EFD_FLAGS_SET)
 		return -EINVAL;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 3534d36a1474..882b89edc52a 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -37,6 +37,7 @@
 #include <linux/seq_file.h>
 #include <linux/compat.h>
 #include <linux/rculist.h>
+#include <linux/capability.h>
 #include <net/busy_poll.h>
 
 /*
@@ -206,7 +207,7 @@ struct eventpoll {
 	 */
 	struct epitem *ovflist;
 
-	/* wakeup_source used when ep_scan_ready_list is running */
+	/* wakeup_source used when ep_send_events or __ep_eventpoll_poll is running */
 	struct wakeup_source *ws;
 
 	/* The user that created the eventpoll descriptor */
@@ -227,6 +228,11 @@ struct eventpoll {
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	/* used to track busy poll napi_id */
 	unsigned int napi_id;
+	/* busy poll timeout */
+	u32 busy_poll_usecs;
+	/* busy poll packet budget */
+	u16 busy_poll_budget;
+	bool prefer_busy_poll;
 #endif
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -387,11 +393,41 @@ static inline int ep_events_available(struct eventpoll *ep)
 }
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
+/**
+ * busy_loop_ep_timeout - check if busy poll has timed out. The timeout value
+ * from the epoll instance ep is preferred, but if it is not set fallback to
+ * the system-wide global via busy_loop_timeout.
+ *
+ * @start_time: The start time used to compute the remaining time until timeout.
+ * @ep: Pointer to the eventpoll context.
+ *
+ * Return: true if the timeout has expired, false otherwise.
+ */
+static bool busy_loop_ep_timeout(unsigned long start_time,
+				 struct eventpoll *ep)
+{
+	unsigned long bp_usec = READ_ONCE(ep->busy_poll_usecs);
+
+	if (bp_usec) {
+		unsigned long end_time = start_time + bp_usec;
+		unsigned long now = busy_loop_current_time();
+
+		return time_after(now, end_time);
+	} else {
+		return busy_loop_timeout(start_time);
+	}
+}
+
+static bool ep_busy_loop_on(struct eventpoll *ep)
+{
+	return !!ep->busy_poll_usecs || net_busy_loop_on();
+}
+
 static bool ep_busy_loop_end(void *p, unsigned long start_time)
 {
 	struct eventpoll *ep = p;
 
-	return ep_events_available(ep) || busy_loop_timeout(start_time);
+	return ep_events_available(ep) || busy_loop_ep_timeout(start_time, ep);
 }
 
 /*
@@ -403,10 +439,15 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time)
 static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
 {
 	unsigned int napi_id = READ_ONCE(ep->napi_id);
+	u16 budget = READ_ONCE(ep->busy_poll_budget);
+	bool prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll);
 
-	if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) {
-		napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false,
-			       BUSY_POLL_BUDGET);
+	if (!budget)
+		budget = BUSY_POLL_BUDGET;
+
+	if (napi_id >= MIN_NAPI_ID && ep_busy_loop_on(ep)) {
+		napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end,
+			       ep, prefer_busy_poll, budget);
 		if (ep_events_available(ep))
 			return true;
 		/*
@@ -425,12 +466,12 @@ static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
  */
 static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
 {
-	struct eventpoll *ep;
+	struct eventpoll *ep = epi->ep;
 	unsigned int napi_id;
 	struct socket *sock;
 	struct sock *sk;
 
-	if (!net_busy_loop_on())
+	if (!ep_busy_loop_on(ep))
 		return;
 
 	sock = sock_from_file(epi->ffd.file);
@@ -442,7 +483,6 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
 		return;
 
 	napi_id = READ_ONCE(sk->sk_napi_id);
-	ep = epi->ep;
 
 	/* Non-NAPI IDs can be rejected
 	 *	or
@@ -455,6 +495,49 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
 	ep->napi_id = napi_id;
 }
 
+static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
+				  unsigned long arg)
+{
+	struct eventpoll *ep = file->private_data;
+	void __user *uarg = (void __user *)arg;
+	struct epoll_params epoll_params;
+
+	switch (cmd) {
+	case EPIOCSPARAMS:
+		if (copy_from_user(&epoll_params, uarg, sizeof(epoll_params)))
+			return -EFAULT;
+
+		/* pad byte must be zero */
+		if (epoll_params.__pad)
+			return -EINVAL;
+
+		if (epoll_params.busy_poll_usecs > S32_MAX)
+			return -EINVAL;
+
+		if (epoll_params.prefer_busy_poll > 1)
+			return -EINVAL;
+
+		if (epoll_params.busy_poll_budget > NAPI_POLL_WEIGHT &&
+		    !capable(CAP_NET_ADMIN))
+			return -EPERM;
+
+		WRITE_ONCE(ep->busy_poll_usecs, epoll_params.busy_poll_usecs);
+		WRITE_ONCE(ep->busy_poll_budget, epoll_params.busy_poll_budget);
+		WRITE_ONCE(ep->prefer_busy_poll, epoll_params.prefer_busy_poll);
+		return 0;
+	case EPIOCGPARAMS:
+		memset(&epoll_params, 0, sizeof(epoll_params));
+		epoll_params.busy_poll_usecs = READ_ONCE(ep->busy_poll_usecs);
+		epoll_params.busy_poll_budget = READ_ONCE(ep->busy_poll_budget);
+		epoll_params.prefer_busy_poll = READ_ONCE(ep->prefer_busy_poll);
+		if (copy_to_user(uarg, &epoll_params, sizeof(epoll_params)))
+			return -EFAULT;
+		return 0;
+	default:
+		return -ENOIOCTLCMD;
+	}
+}
+
 #else
 
 static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
@@ -466,6 +549,12 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi)
 {
 }
 
+static long ep_eventpoll_bp_ioctl(struct file *file, unsigned int cmd,
+				  unsigned long arg)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif /* CONFIG_NET_RX_BUSY_POLL */
 
 /*
@@ -678,12 +767,6 @@ static void ep_done_scan(struct eventpoll *ep,
 	write_unlock_irq(&ep->lock);
 }
 
-static void epi_rcu_free(struct rcu_head *head)
-{
-	struct epitem *epi = container_of(head, struct epitem, rcu);
-	kmem_cache_free(epi_cache, epi);
-}
-
 static void ep_get(struct eventpoll *ep)
 {
 	refcount_inc(&ep->refcount);
@@ -767,7 +850,7 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
 	 * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
 	 * use of the rbn field.
 	 */
-	call_rcu(&epi->rcu, epi_rcu_free);
+	kfree_rcu(epi, rcu);
 
 	percpu_counter_dec(&ep->user->epoll_watches);
 	return ep_refcount_dec_and_test(ep);
@@ -825,6 +908,27 @@ static void ep_clear_and_put(struct eventpoll *ep)
 		ep_free(ep);
 }
 
+static long ep_eventpoll_ioctl(struct file *file, unsigned int cmd,
+			       unsigned long arg)
+{
+	int ret;
+
+	if (!is_file_epoll(file))
+		return -EINVAL;
+
+	switch (cmd) {
+	case EPIOCSPARAMS:
+	case EPIOCGPARAMS:
+		ret = ep_eventpoll_bp_ioctl(file, cmd, arg);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
 static int ep_eventpoll_release(struct inode *inode, struct file *file)
 {
 	struct eventpoll *ep = file->private_data;
@@ -931,6 +1035,8 @@ static const struct file_operations eventpoll_fops = {
 	.release	= ep_eventpoll_release,
 	.poll		= ep_eventpoll_poll,
 	.llseek		= noop_llseek,
+	.unlocked_ioctl	= ep_eventpoll_ioctl,
+	.compat_ioctl   = compat_ptr_ioctl,
 };
 
 /*
@@ -1153,7 +1259,7 @@ static inline bool chain_epi_lockless(struct epitem *epi)
  * This callback takes a read lock in order not to contend with concurrent
  * events from another file descriptor, thus all modifications to ->rdllist
  * or ->ovflist are lockless.  Read lock is paired with the write lock from
- * ep_scan_ready_list(), which stops all list modifications and guarantees
+ * ep_start/done_scan(), which stops all list modifications and guarantees
  * that lists state is seen correctly.
  *
  * Another thing worth to mention is that ep_poll_callback() can be called
@@ -1751,7 +1857,7 @@ static int ep_send_events(struct eventpoll *ep,
 			 * availability. At this point, no one can insert
 			 * into ep->rdllist besides us. The epoll_ctl()
 			 * callers are locked out by
-			 * ep_scan_ready_list() holding "mtx" and the
+			 * ep_send_events() holding "mtx" and the
 			 * poll callback will queue them in ep->ovflist.
 			 */
 			list_add_tail(&epi->rdllink, &ep->rdllist);
@@ -1904,7 +2010,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		__set_current_state(TASK_INTERRUPTIBLE);
 
 		/*
-		 * Do the final check under the lock. ep_scan_ready_list()
+		 * Do the final check under the lock. ep_start/done_scan()
 		 * plays with two lists (->rdllist and ->ovflist) and there
 		 * is always a race when both lists are empty for short
 		 * period of time although events are pending, so lock is
@@ -2058,6 +2164,11 @@ static int do_epoll_create(int flags)
 		error = PTR_ERR(file);
 		goto out_free_fd;
 	}
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	ep->busy_poll_usecs = 0;
+	ep->busy_poll_budget = 0;
+	ep->prefer_busy_poll = false;
+#endif
 	ep->file = file;
 	fd_install(fd, file);
 	return fd;
diff --git a/fs/exec.c b/fs/exec.c
index af4fbb61cd53..cf1df7f16e55 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -895,6 +895,7 @@ int transfer_args_to_stack(struct linux_binprm *bprm,
 			goto out;
 	}
 
+	bprm->exec += *sp_location - MAX_ARG_PAGES * PAGE_SIZE;
 	*sp_location = sp;
 
 out:
@@ -1158,7 +1159,6 @@ static int de_thread(struct task_struct *tsk)
 
 		BUG_ON(leader->exit_state != EXIT_ZOMBIE);
 		leader->exit_state = EXIT_DEAD;
-
 		/*
 		 * We are going to release_task()->ptrace_unlink() silently,
 		 * the tracer can sleep in do_wait(). EXIT_DEAD guarantees
@@ -1720,7 +1720,6 @@ static int prepare_binprm(struct linux_binprm *bprm)
  */
 int remove_arg_zero(struct linux_binprm *bprm)
 {
-	int ret = 0;
 	unsigned long offset;
 	char *kaddr;
 	struct page *page;
@@ -1731,10 +1730,8 @@ int remove_arg_zero(struct linux_binprm *bprm)
 	do {
 		offset = bprm->p & ~PAGE_MASK;
 		page = get_arg_page(bprm, bprm->p, 0);
-		if (!page) {
-			ret = -EFAULT;
-			goto out;
-		}
+		if (!page)
+			return -EFAULT;
 		kaddr = kmap_local_page(page);
 
 		for (; offset < PAGE_SIZE && kaddr[offset];
@@ -1747,10 +1744,8 @@ int remove_arg_zero(struct linux_binprm *bprm)
 
 	bprm->p++;
 	bprm->argc--;
-	ret = 0;
 
-out:
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL(remove_arg_zero);
 
diff --git a/fs/exfat/cache.c b/fs/exfat/cache.c
index 5a2f119b7e8c..7cc200d89821 100644
--- a/fs/exfat/cache.c
+++ b/fs/exfat/cache.c
@@ -46,7 +46,7 @@ int exfat_cache_init(void)
 {
 	exfat_cachep = kmem_cache_create("exfat_cache",
 				sizeof(struct exfat_cache),
-				0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+				0, SLAB_RECLAIM_ACCOUNT,
 				exfat_cache_init_once);
 	if (!exfat_cachep)
 		return -ENOMEM;
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c
index 9f9295847a4e..077944d3c2c0 100644
--- a/fs/exfat/dir.c
+++ b/fs/exfat/dir.c
@@ -448,88 +448,34 @@ static void exfat_init_name_entry(struct exfat_dentry *ep,
 	}
 }
 
-int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir,
-		int entry, unsigned int type, unsigned int start_clu,
-		unsigned long long size)
+void exfat_init_dir_entry(struct exfat_entry_set_cache *es,
+		unsigned int type, unsigned int start_clu,
+		unsigned long long size, struct timespec64 *ts)
 {
-	struct super_block *sb = inode->i_sb;
+	struct super_block *sb = es->sb;
 	struct exfat_sb_info *sbi = EXFAT_SB(sb);
-	struct timespec64 ts = current_time(inode);
 	struct exfat_dentry *ep;
-	struct buffer_head *bh;
-
-	/*
-	 * We cannot use exfat_get_dentry_set here because file ep is not
-	 * initialized yet.
-	 */
-	ep = exfat_get_dentry(sb, p_dir, entry, &bh);
-	if (!ep)
-		return -EIO;
 
+	ep = exfat_get_dentry_cached(es, ES_IDX_FILE);
 	exfat_set_entry_type(ep, type);
-	exfat_set_entry_time(sbi, &ts,
+	exfat_set_entry_time(sbi, ts,
 			&ep->dentry.file.create_tz,
 			&ep->dentry.file.create_time,
 			&ep->dentry.file.create_date,
 			&ep->dentry.file.create_time_cs);
-	exfat_set_entry_time(sbi, &ts,
+	exfat_set_entry_time(sbi, ts,
 			&ep->dentry.file.modify_tz,
 			&ep->dentry.file.modify_time,
 			&ep->dentry.file.modify_date,
 			&ep->dentry.file.modify_time_cs);
-	exfat_set_entry_time(sbi, &ts,
+	exfat_set_entry_time(sbi, ts,
 			&ep->dentry.file.access_tz,
 			&ep->dentry.file.access_time,
 			&ep->dentry.file.access_date,
 			NULL);
 
-	exfat_update_bh(bh, IS_DIRSYNC(inode));
-	brelse(bh);
-
-	ep = exfat_get_dentry(sb, p_dir, entry + 1, &bh);
-	if (!ep)
-		return -EIO;
-
+	ep = exfat_get_dentry_cached(es, ES_IDX_STREAM);
 	exfat_init_stream_entry(ep, start_clu, size);
-	exfat_update_bh(bh, IS_DIRSYNC(inode));
-	brelse(bh);
-
-	return 0;
-}
-
-int exfat_update_dir_chksum(struct inode *inode, struct exfat_chain *p_dir,
-		int entry)
-{
-	struct super_block *sb = inode->i_sb;
-	int ret = 0;
-	int i, num_entries;
-	u16 chksum;
-	struct exfat_dentry *ep, *fep;
-	struct buffer_head *fbh, *bh;
-
-	fep = exfat_get_dentry(sb, p_dir, entry, &fbh);
-	if (!fep)
-		return -EIO;
-
-	num_entries = fep->dentry.file.num_ext + 1;
-	chksum = exfat_calc_chksum16(fep, DENTRY_SIZE, 0, CS_DIR_ENTRY);
-
-	for (i = 1; i < num_entries; i++) {
-		ep = exfat_get_dentry(sb, p_dir, entry + i, &bh);
-		if (!ep) {
-			ret = -EIO;
-			goto release_fbh;
-		}
-		chksum = exfat_calc_chksum16(ep, DENTRY_SIZE, chksum,
-				CS_DEFAULT);
-		brelse(bh);
-	}
-
-	fep->dentry.file.checksum = cpu_to_le16(chksum);
-	exfat_update_bh(fbh, IS_DIRSYNC(inode));
-release_fbh:
-	brelse(fbh);
-	return ret;
 }
 
 static void exfat_free_benign_secondary_clusters(struct inode *inode,
@@ -551,76 +497,49 @@ static void exfat_free_benign_secondary_clusters(struct inode *inode,
 	exfat_free_cluster(inode, &dir);
 }
 
-int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir,
-		int entry, int num_entries, struct exfat_uni_name *p_uniname)
+void exfat_init_ext_entry(struct exfat_entry_set_cache *es, int num_entries,
+		struct exfat_uni_name *p_uniname)
 {
-	struct super_block *sb = inode->i_sb;
 	int i;
 	unsigned short *uniname = p_uniname->name;
 	struct exfat_dentry *ep;
-	struct buffer_head *bh;
-	int sync = IS_DIRSYNC(inode);
-
-	ep = exfat_get_dentry(sb, p_dir, entry, &bh);
-	if (!ep)
-		return -EIO;
 
+	ep = exfat_get_dentry_cached(es, ES_IDX_FILE);
 	ep->dentry.file.num_ext = (unsigned char)(num_entries - 1);
-	exfat_update_bh(bh, sync);
-	brelse(bh);
-
-	ep = exfat_get_dentry(sb, p_dir, entry + 1, &bh);
-	if (!ep)
-		return -EIO;
 
+	ep = exfat_get_dentry_cached(es, ES_IDX_STREAM);
 	ep->dentry.stream.name_len = p_uniname->name_len;
 	ep->dentry.stream.name_hash = cpu_to_le16(p_uniname->name_hash);
-	exfat_update_bh(bh, sync);
-	brelse(bh);
-
-	for (i = EXFAT_FIRST_CLUSTER; i < num_entries; i++) {
-		ep = exfat_get_dentry(sb, p_dir, entry + i, &bh);
-		if (!ep)
-			return -EIO;
-
-		if (exfat_get_entry_type(ep) & TYPE_BENIGN_SEC)
-			exfat_free_benign_secondary_clusters(inode, ep);
 
+	for (i = ES_IDX_FIRST_FILENAME; i < num_entries; i++) {
+		ep = exfat_get_dentry_cached(es, i);
 		exfat_init_name_entry(ep, uniname);
-		exfat_update_bh(bh, sync);
-		brelse(bh);
 		uniname += EXFAT_FILE_NAME_LEN;
 	}
 
-	exfat_update_dir_chksum(inode, p_dir, entry);
-	return 0;
+	exfat_update_dir_chksum(es);
 }
 
-int exfat_remove_entries(struct inode *inode, struct exfat_chain *p_dir,
-		int entry, int order, int num_entries)
+void exfat_remove_entries(struct inode *inode, struct exfat_entry_set_cache *es,
+		int order)
 {
-	struct super_block *sb = inode->i_sb;
 	int i;
 	struct exfat_dentry *ep;
-	struct buffer_head *bh;
 
-	for (i = order; i < num_entries; i++) {
-		ep = exfat_get_dentry(sb, p_dir, entry + i, &bh);
-		if (!ep)
-			return -EIO;
+	for (i = order; i < es->num_entries; i++) {
+		ep = exfat_get_dentry_cached(es, i);
 
 		if (exfat_get_entry_type(ep) & TYPE_BENIGN_SEC)
 			exfat_free_benign_secondary_clusters(inode, ep);
 
 		exfat_set_entry_type(ep, TYPE_DELETED);
-		exfat_update_bh(bh, IS_DIRSYNC(inode));
-		brelse(bh);
 	}
 
-	return 0;
+	if (order < es->num_entries)
+		es->modified = true;
 }
 
-void exfat_update_dir_chksum_with_entry_set(struct exfat_entry_set_cache *es)
+void exfat_update_dir_chksum(struct exfat_entry_set_cache *es)
 {
 	int chksum_type = CS_DIR_ENTRY, i;
 	unsigned short chksum = 0;
@@ -775,7 +694,6 @@ struct exfat_dentry *exfat_get_dentry(struct super_block *sb,
 }
 
 enum exfat_validate_dentry_mode {
-	ES_MODE_STARTED,
 	ES_MODE_GET_FILE_ENTRY,
 	ES_MODE_GET_STRM_ENTRY,
 	ES_MODE_GET_NAME_ENTRY,
@@ -790,11 +708,6 @@ static bool exfat_validate_entry(unsigned int type,
 		return false;
 
 	switch (*mode) {
-	case ES_MODE_STARTED:
-		if  (type != TYPE_FILE && type != TYPE_DIR)
-			return false;
-		*mode = ES_MODE_GET_FILE_ENTRY;
-		break;
 	case ES_MODE_GET_FILE_ENTRY:
 		if (type != TYPE_STREAM)
 			return false;
@@ -834,7 +747,7 @@ struct exfat_dentry *exfat_get_dentry_cached(
 }
 
 /*
- * Returns a set of dentries for a file or dir.
+ * Returns a set of dentries.
  *
  * Note It provides a direct pointer to bh->data via exfat_get_dentry_cached().
  * User should call exfat_get_dentry_set() after setting 'modified' to apply
@@ -842,22 +755,24 @@ struct exfat_dentry *exfat_get_dentry_cached(
  *
  * in:
  *   sb+p_dir+entry: indicates a file/dir
- *   type:  specifies how many dentries should be included.
+ *   num_entries: specifies how many dentries should be included.
+ *                It will be set to es->num_entries if it is not 0.
+ *                If num_entries is 0, es->num_entries will be obtained
+ *                from the first dentry.
+ * out:
+ *   es: pointer of entry set on success.
  * return:
- *   pointer of entry set on success,
- *   NULL on failure.
+ *   0 on success
+ *   -error code on failure
  */
-int exfat_get_dentry_set(struct exfat_entry_set_cache *es,
+static int __exfat_get_dentry_set(struct exfat_entry_set_cache *es,
 		struct super_block *sb, struct exfat_chain *p_dir, int entry,
-		unsigned int type)
+		unsigned int num_entries)
 {
 	int ret, i, num_bh;
 	unsigned int off;
 	sector_t sec;
 	struct exfat_sb_info *sbi = EXFAT_SB(sb);
-	struct exfat_dentry *ep;
-	int num_entries;
-	enum exfat_validate_dentry_mode mode = ES_MODE_STARTED;
 	struct buffer_head *bh;
 
 	if (p_dir->dir == DIR_DELETED) {
@@ -880,12 +795,18 @@ int exfat_get_dentry_set(struct exfat_entry_set_cache *es,
 		return -EIO;
 	es->bh[es->num_bh++] = bh;
 
-	ep = exfat_get_dentry_cached(es, ES_IDX_FILE);
-	if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode))
-		goto put_es;
+	if (num_entries == ES_ALL_ENTRIES) {
+		struct exfat_dentry *ep;
+
+		ep = exfat_get_dentry_cached(es, ES_IDX_FILE);
+		if (ep->type != EXFAT_FILE) {
+			brelse(bh);
+			return -EIO;
+		}
+
+		num_entries = ep->dentry.file.num_ext + 1;
+	}
 
-	num_entries = type == ES_ALL_ENTRIES ?
-		ep->dentry.file.num_ext + 1 : type;
 	es->num_entries = num_entries;
 
 	num_bh = EXFAT_B_TO_BLK_ROUND_UP(off + num_entries * DENTRY_SIZE, sb);
@@ -918,8 +839,27 @@ int exfat_get_dentry_set(struct exfat_entry_set_cache *es,
 		es->bh[es->num_bh++] = bh;
 	}
 
+	return 0;
+
+put_es:
+	exfat_put_dentry_set(es, false);
+	return -EIO;
+}
+
+int exfat_get_dentry_set(struct exfat_entry_set_cache *es,
+		struct super_block *sb, struct exfat_chain *p_dir,
+		int entry, unsigned int num_entries)
+{
+	int ret, i;
+	struct exfat_dentry *ep;
+	enum exfat_validate_dentry_mode mode = ES_MODE_GET_FILE_ENTRY;
+
+	ret = __exfat_get_dentry_set(es, sb, p_dir, entry, num_entries);
+	if (ret < 0)
+		return ret;
+
 	/* validate cached dentries */
-	for (i = ES_IDX_STREAM; i < num_entries; i++) {
+	for (i = ES_IDX_STREAM; i < es->num_entries; i++) {
 		ep = exfat_get_dentry_cached(es, i);
 		if (!exfat_validate_entry(exfat_get_entry_type(ep), &mode))
 			goto put_es;
@@ -931,6 +871,85 @@ put_es:
 	return -EIO;
 }
 
+static int exfat_validate_empty_dentry_set(struct exfat_entry_set_cache *es)
+{
+	struct exfat_dentry *ep;
+	struct buffer_head *bh;
+	int i, off;
+	bool unused_hit = false;
+
+	/*
+	 * ONLY UNUSED OR DELETED DENTRIES ARE ALLOWED:
+	 * Although it violates the specification for a deleted entry to
+	 * follow an unused entry, some exFAT implementations could work
+	 * like this. Therefore, to improve compatibility, let's allow it.
+	 */
+	for (i = 0; i < es->num_entries; i++) {
+		ep = exfat_get_dentry_cached(es, i);
+		if (ep->type == EXFAT_UNUSED) {
+			unused_hit = true;
+		} else if (!IS_EXFAT_DELETED(ep->type)) {
+			if (unused_hit)
+				goto err_used_follow_unused;
+			i++;
+			goto count_skip_entries;
+		}
+	}
+
+	return 0;
+
+err_used_follow_unused:
+	off = es->start_off + (i << DENTRY_SIZE_BITS);
+	bh = es->bh[EXFAT_B_TO_BLK(off, es->sb)];
+
+	exfat_fs_error(es->sb,
+		"in sector %lld, dentry %d should be unused, but 0x%x",
+		bh->b_blocknr, off >> DENTRY_SIZE_BITS, ep->type);
+
+	return -EIO;
+
+count_skip_entries:
+	es->num_entries = EXFAT_B_TO_DEN(EXFAT_BLK_TO_B(es->num_bh, es->sb) - es->start_off);
+	for (; i < es->num_entries; i++) {
+		ep = exfat_get_dentry_cached(es, i);
+		if (IS_EXFAT_DELETED(ep->type))
+			break;
+	}
+
+	return i;
+}
+
+/*
+ * Get an empty dentry set.
+ *
+ * in:
+ *   sb+p_dir+entry: indicates the empty dentry location
+ *   num_entries: specifies how many empty dentries should be included.
+ * out:
+ *   es: pointer of empty dentry set on success.
+ * return:
+ *   0  : on success
+ *   >0 : the dentries are not empty, the return value is the number of
+ *        dentries to be skipped for the next lookup.
+ *   <0 : on failure
+ */
+int exfat_get_empty_dentry_set(struct exfat_entry_set_cache *es,
+		struct super_block *sb, struct exfat_chain *p_dir,
+		int entry, unsigned int num_entries)
+{
+	int ret;
+
+	ret = __exfat_get_dentry_set(es, sb, p_dir, entry, num_entries);
+	if (ret < 0)
+		return ret;
+
+	ret = exfat_validate_empty_dentry_set(es);
+	if (ret)
+		exfat_put_dentry_set(es, false);
+
+	return ret;
+}
+
 static inline void exfat_reset_empty_hint(struct exfat_hint_femp *hint_femp)
 {
 	hint_femp->eidx = EXFAT_HINT_NONE;
@@ -1187,27 +1206,6 @@ found:
 	return dentry - num_ext;
 }
 
-int exfat_count_ext_entries(struct super_block *sb, struct exfat_chain *p_dir,
-		int entry, struct exfat_dentry *ep)
-{
-	int i, count = 0;
-	unsigned int type;
-	struct exfat_dentry *ext_ep;
-	struct buffer_head *bh;
-
-	for (i = 0, entry++; i < ep->dentry.file.num_ext; i++, entry++) {
-		ext_ep = exfat_get_dentry(sb, p_dir, entry, &bh);
-		if (!ext_ep)
-			return -EIO;
-
-		type = exfat_get_entry_type(ext_ep);
-		brelse(bh);
-		if (type & TYPE_CRITICAL_SEC || type & TYPE_BENIGN_SEC)
-			count++;
-	}
-	return count;
-}
-
 int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir)
 {
 	int i, count = 0;
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index 361595433480..ecc5db952deb 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -431,8 +431,6 @@ int exfat_ent_get(struct super_block *sb, unsigned int loc,
 		unsigned int *content);
 int exfat_ent_set(struct super_block *sb, unsigned int loc,
 		unsigned int content);
-int exfat_count_ext_entries(struct super_block *sb, struct exfat_chain *p_dir,
-		int entry, struct exfat_dentry *p_entry);
 int exfat_chain_cont_cluster(struct super_block *sb, unsigned int chain,
 		unsigned int len);
 int exfat_zeroed_cluster(struct inode *dir, unsigned int clu);
@@ -480,16 +478,14 @@ int exfat_get_cluster(struct inode *inode, unsigned int cluster,
 extern const struct inode_operations exfat_dir_inode_operations;
 extern const struct file_operations exfat_dir_operations;
 unsigned int exfat_get_entry_type(struct exfat_dentry *p_entry);
-int exfat_init_dir_entry(struct inode *inode, struct exfat_chain *p_dir,
-		int entry, unsigned int type, unsigned int start_clu,
-		unsigned long long size);
-int exfat_init_ext_entry(struct inode *inode, struct exfat_chain *p_dir,
-		int entry, int num_entries, struct exfat_uni_name *p_uniname);
-int exfat_remove_entries(struct inode *inode, struct exfat_chain *p_dir,
-		int entry, int order, int num_entries);
-int exfat_update_dir_chksum(struct inode *inode, struct exfat_chain *p_dir,
-		int entry);
-void exfat_update_dir_chksum_with_entry_set(struct exfat_entry_set_cache *es);
+void exfat_init_dir_entry(struct exfat_entry_set_cache *es,
+		unsigned int type, unsigned int start_clu,
+		unsigned long long size, struct timespec64 *ts);
+void exfat_init_ext_entry(struct exfat_entry_set_cache *es, int num_entries,
+		struct exfat_uni_name *p_uniname);
+void exfat_remove_entries(struct inode *inode, struct exfat_entry_set_cache *es,
+		int order);
+void exfat_update_dir_chksum(struct exfat_entry_set_cache *es);
 int exfat_calc_num_entries(struct exfat_uni_name *p_uniname);
 int exfat_find_dir_entry(struct super_block *sb, struct exfat_inode_info *ei,
 		struct exfat_chain *p_dir, struct exfat_uni_name *p_uniname,
@@ -501,7 +497,10 @@ struct exfat_dentry *exfat_get_dentry_cached(struct exfat_entry_set_cache *es,
 		int num);
 int exfat_get_dentry_set(struct exfat_entry_set_cache *es,
 		struct super_block *sb, struct exfat_chain *p_dir, int entry,
-		unsigned int type);
+		unsigned int num_entries);
+int exfat_get_empty_dentry_set(struct exfat_entry_set_cache *es,
+		struct super_block *sb, struct exfat_chain *p_dir, int entry,
+		unsigned int num_entries);
 int exfat_put_dentry_set(struct exfat_entry_set_cache *es, int sync);
 int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir);
 
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index 0687f952956c..dd894e558c91 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -94,7 +94,7 @@ int __exfat_write_inode(struct inode *inode, int sync)
 		ep2->dentry.stream.start_clu = EXFAT_FREE_CLUSTER;
 	}
 
-	exfat_update_dir_chksum_with_entry_set(&es);
+	exfat_update_dir_chksum(&es);
 	return exfat_put_dentry_set(&es, sync);
 }
 
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index 9c549fd11fc8..631ad9e8e32a 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -204,21 +204,16 @@ const struct dentry_operations exfat_utf8_dentry_ops = {
 	.d_compare	= exfat_utf8_d_cmp,
 };
 
-/* used only in search empty_slot() */
-#define CNT_UNUSED_NOHIT        (-1)
-#define CNT_UNUSED_HIT          (-2)
 /* search EMPTY CONTINUOUS "num_entries" entries */
 static int exfat_search_empty_slot(struct super_block *sb,
 		struct exfat_hint_femp *hint_femp, struct exfat_chain *p_dir,
-		int num_entries)
+		int num_entries, struct exfat_entry_set_cache *es)
 {
-	int i, dentry, num_empty = 0;
+	int i, dentry, ret;
 	int dentries_per_clu;
-	unsigned int type;
 	struct exfat_chain clu;
-	struct exfat_dentry *ep;
 	struct exfat_sb_info *sbi = EXFAT_SB(sb);
-	struct buffer_head *bh;
+	int total_entries = EXFAT_CLU_TO_DEN(p_dir->size, sbi);
 
 	dentries_per_clu = sbi->dentries_per_clu;
 
@@ -231,7 +226,7 @@ static int exfat_search_empty_slot(struct super_block *sb,
 		 * Otherwise, and if "dentry + hint_famp->count" is also equal
 		 * to "p_dir->size * dentries_per_clu", it means ENOSPC.
 		 */
-		if (dentry + hint_femp->count == p_dir->size * dentries_per_clu &&
+		if (dentry + hint_femp->count == total_entries &&
 		    num_entries > hint_femp->count)
 			return -ENOSPC;
 
@@ -242,69 +237,41 @@ static int exfat_search_empty_slot(struct super_block *sb,
 		dentry = 0;
 	}
 
-	while (clu.dir != EXFAT_EOF_CLUSTER) {
+	while (dentry + num_entries < total_entries &&
+	       clu.dir != EXFAT_EOF_CLUSTER) {
 		i = dentry & (dentries_per_clu - 1);
 
-		for (; i < dentries_per_clu; i++, dentry++) {
-			ep = exfat_get_dentry(sb, &clu, i, &bh);
-			if (!ep)
-				return -EIO;
-			type = exfat_get_entry_type(ep);
-			brelse(bh);
-
-			if (type == TYPE_UNUSED || type == TYPE_DELETED) {
-				num_empty++;
-				if (hint_femp->eidx == EXFAT_HINT_NONE) {
-					hint_femp->eidx = dentry;
-					hint_femp->count = CNT_UNUSED_NOHIT;
-					exfat_chain_set(&hint_femp->cur,
-						clu.dir, clu.size, clu.flags);
-				}
-
-				if (type == TYPE_UNUSED &&
-				    hint_femp->count != CNT_UNUSED_HIT)
-					hint_femp->count = CNT_UNUSED_HIT;
+		ret = exfat_get_empty_dentry_set(es, sb, &clu, i, num_entries);
+		if (ret < 0)
+			return ret;
+		else if (ret == 0)
+			return dentry;
+
+		dentry += ret;
+		i += ret;
+
+		while (i >= dentries_per_clu) {
+			if (clu.flags == ALLOC_NO_FAT_CHAIN) {
+				if (--clu.size > 0)
+					clu.dir++;
+				else
+					clu.dir = EXFAT_EOF_CLUSTER;
 			} else {
-				if (hint_femp->eidx != EXFAT_HINT_NONE &&
-				    hint_femp->count == CNT_UNUSED_HIT) {
-					/* unused empty group means
-					 * an empty group which includes
-					 * unused dentry
-					 */
-					exfat_fs_error(sb,
-						"found bogus dentry(%d) beyond unused empty group(%d) (start_clu : %u, cur_clu : %u)",
-						dentry, hint_femp->eidx,
-						p_dir->dir, clu.dir);
+				if (exfat_get_next_cluster(sb, &clu.dir))
 					return -EIO;
-				}
-
-				num_empty = 0;
-				hint_femp->eidx = EXFAT_HINT_NONE;
 			}
 
-			if (num_empty >= num_entries) {
-				/* found and invalidate hint_femp */
-				hint_femp->eidx = EXFAT_HINT_NONE;
-				return (dentry - (num_entries - 1));
-			}
-		}
-
-		if (clu.flags == ALLOC_NO_FAT_CHAIN) {
-			if (--clu.size > 0)
-				clu.dir++;
-			else
-				clu.dir = EXFAT_EOF_CLUSTER;
-		} else {
-			if (exfat_get_next_cluster(sb, &clu.dir))
-				return -EIO;
+			i -= dentries_per_clu;
 		}
 	}
 
-	hint_femp->eidx = p_dir->size * dentries_per_clu - num_empty;
-	hint_femp->count = num_empty;
-	if (num_empty == 0)
+	hint_femp->eidx = dentry;
+	hint_femp->count = 0;
+	if (dentry == total_entries || clu.dir == EXFAT_EOF_CLUSTER)
 		exfat_chain_set(&hint_femp->cur, EXFAT_EOF_CLUSTER, 0,
 				clu.flags);
+	else
+		hint_femp->cur = clu;
 
 	return -ENOSPC;
 }
@@ -325,7 +292,8 @@ static int exfat_check_max_dentries(struct inode *inode)
  * if there isn't any empty slot, expand cluster chain.
  */
 static int exfat_find_empty_entry(struct inode *inode,
-		struct exfat_chain *p_dir, int num_entries)
+		struct exfat_chain *p_dir, int num_entries,
+		struct exfat_entry_set_cache *es)
 {
 	int dentry;
 	unsigned int ret, last_clu;
@@ -344,7 +312,7 @@ static int exfat_find_empty_entry(struct inode *inode,
 	}
 
 	while ((dentry = exfat_search_empty_slot(sb, &hint_femp, p_dir,
-					num_entries)) < 0) {
+					num_entries, es)) < 0) {
 		if (dentry == -EIO)
 			break;
 
@@ -499,6 +467,8 @@ static int exfat_add_entry(struct inode *inode, const char *path,
 	struct exfat_sb_info *sbi = EXFAT_SB(sb);
 	struct exfat_uni_name uniname;
 	struct exfat_chain clu;
+	struct timespec64 ts = current_time(inode);
+	struct exfat_entry_set_cache es;
 	int clu_size = 0;
 	unsigned int start_clu = EXFAT_FREE_CLUSTER;
 
@@ -513,7 +483,7 @@ static int exfat_add_entry(struct inode *inode, const char *path,
 	}
 
 	/* exfat_find_empty_entry must be called before alloc_cluster() */
-	dentry = exfat_find_empty_entry(inode, p_dir, num_entries);
+	dentry = exfat_find_empty_entry(inode, p_dir, num_entries, &es);
 	if (dentry < 0) {
 		ret = dentry; /* -EIO or -ENOSPC */
 		goto out;
@@ -521,8 +491,10 @@ static int exfat_add_entry(struct inode *inode, const char *path,
 
 	if (type == TYPE_DIR && !sbi->options.zero_size_dir) {
 		ret = exfat_alloc_new_dir(inode, &clu);
-		if (ret)
+		if (ret) {
+			exfat_put_dentry_set(&es, false);
 			goto out;
+		}
 		start_clu = clu.dir;
 		clu_size = sbi->cluster_size;
 	}
@@ -531,12 +503,10 @@ static int exfat_add_entry(struct inode *inode, const char *path,
 	/* fill the dos name directory entry information of the created file.
 	 * the first cluster is not determined yet. (0)
 	 */
-	ret = exfat_init_dir_entry(inode, p_dir, dentry, type,
-		start_clu, clu_size);
-	if (ret)
-		goto out;
+	exfat_init_dir_entry(&es, type, start_clu, clu_size, &ts);
+	exfat_init_ext_entry(&es, num_entries, &uniname);
 
-	ret = exfat_init_ext_entry(inode, p_dir, dentry, num_entries, &uniname);
+	ret = exfat_put_dentry_set(&es, IS_DIRSYNC(inode));
 	if (ret)
 		goto out;
 
@@ -577,6 +547,7 @@ static int exfat_create(struct mnt_idmap *idmap, struct inode *dir,
 	struct exfat_dir_entry info;
 	loff_t i_pos;
 	int err;
+	loff_t size = i_size_read(dir);
 
 	mutex_lock(&EXFAT_SB(sb)->s_lock);
 	exfat_set_volume_dirty(sb);
@@ -587,7 +558,7 @@ static int exfat_create(struct mnt_idmap *idmap, struct inode *dir,
 
 	inode_inc_iversion(dir);
 	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
-	if (IS_DIRSYNC(dir))
+	if (IS_DIRSYNC(dir) && size != i_size_read(dir))
 		exfat_sync_inode(dir);
 	else
 		mark_inode_dirty(dir);
@@ -795,12 +766,11 @@ unlock:
 static int exfat_unlink(struct inode *dir, struct dentry *dentry)
 {
 	struct exfat_chain cdir;
-	struct exfat_dentry *ep;
 	struct super_block *sb = dir->i_sb;
 	struct inode *inode = dentry->d_inode;
 	struct exfat_inode_info *ei = EXFAT_I(inode);
-	struct buffer_head *bh;
-	int num_entries, entry, err = 0;
+	struct exfat_entry_set_cache es;
+	int entry, err = 0;
 
 	mutex_lock(&EXFAT_SB(sb)->s_lock);
 	exfat_chain_dup(&cdir, &ei->dir);
@@ -811,26 +781,20 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry)
 		goto unlock;
 	}
 
-	ep = exfat_get_dentry(sb, &cdir, entry, &bh);
-	if (!ep) {
-		err = -EIO;
-		goto unlock;
-	}
-	num_entries = exfat_count_ext_entries(sb, &cdir, entry, ep);
-	if (num_entries < 0) {
+	err = exfat_get_dentry_set(&es, sb, &cdir, entry, ES_ALL_ENTRIES);
+	if (err) {
 		err = -EIO;
-		brelse(bh);
 		goto unlock;
 	}
-	num_entries++;
-	brelse(bh);
 
 	exfat_set_volume_dirty(sb);
+
 	/* update the directory entry */
-	if (exfat_remove_entries(dir, &cdir, entry, 0, num_entries)) {
-		err = -EIO;
+	exfat_remove_entries(inode, &es, ES_IDX_FILE);
+
+	err = exfat_put_dentry_set(&es, IS_DIRSYNC(inode));
+	if (err)
 		goto unlock;
-	}
 
 	/* This doesn't modify ei */
 	ei->dir.dir = DIR_DELETED;
@@ -838,10 +802,7 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry)
 	inode_inc_iversion(dir);
 	simple_inode_init_ts(dir);
 	exfat_truncate_inode_atime(dir);
-	if (IS_DIRSYNC(dir))
-		exfat_sync_inode(dir);
-	else
-		mark_inode_dirty(dir);
+	mark_inode_dirty(dir);
 
 	clear_nlink(inode);
 	simple_inode_init_ts(inode);
@@ -862,6 +823,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 	struct exfat_chain cdir;
 	loff_t i_pos;
 	int err;
+	loff_t size = i_size_read(dir);
 
 	mutex_lock(&EXFAT_SB(sb)->s_lock);
 	exfat_set_volume_dirty(sb);
@@ -872,7 +834,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 
 	inode_inc_iversion(dir);
 	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
-	if (IS_DIRSYNC(dir))
+	if (IS_DIRSYNC(dir) && size != i_size_read(dir))
 		exfat_sync_inode(dir);
 	else
 		mark_inode_dirty(dir);
@@ -946,13 +908,12 @@ static int exfat_check_dir_empty(struct super_block *sb,
 static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
-	struct exfat_dentry *ep;
 	struct exfat_chain cdir, clu_to_free;
 	struct super_block *sb = inode->i_sb;
 	struct exfat_sb_info *sbi = EXFAT_SB(sb);
 	struct exfat_inode_info *ei = EXFAT_I(inode);
-	struct buffer_head *bh;
-	int num_entries, entry, err;
+	struct exfat_entry_set_cache es;
+	int entry, err;
 
 	mutex_lock(&EXFAT_SB(inode->i_sb)->s_lock);
 
@@ -976,27 +937,20 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
 		goto unlock;
 	}
 
-	ep = exfat_get_dentry(sb, &cdir, entry, &bh);
-	if (!ep) {
-		err = -EIO;
-		goto unlock;
-	}
-
-	num_entries = exfat_count_ext_entries(sb, &cdir, entry, ep);
-	if (num_entries < 0) {
+	err = exfat_get_dentry_set(&es, sb, &cdir, entry, ES_ALL_ENTRIES);
+	if (err) {
 		err = -EIO;
-		brelse(bh);
 		goto unlock;
 	}
-	num_entries++;
-	brelse(bh);
 
 	exfat_set_volume_dirty(sb);
-	err = exfat_remove_entries(dir, &cdir, entry, 0, num_entries);
-	if (err) {
-		exfat_err(sb, "failed to exfat_remove_entries : err(%d)", err);
+
+	exfat_remove_entries(inode, &es, ES_IDX_FILE);
+
+	err = exfat_put_dentry_set(&es, IS_DIRSYNC(dir));
+	if (err)
 		goto unlock;
-	}
+
 	ei->dir.dir = DIR_DELETED;
 
 	inode_inc_iversion(dir);
@@ -1022,67 +976,52 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
 		int oldentry, struct exfat_uni_name *p_uniname,
 		struct exfat_inode_info *ei)
 {
-	int ret, num_old_entries, num_new_entries;
+	int ret, num_new_entries;
 	struct exfat_dentry *epold, *epnew;
 	struct super_block *sb = inode->i_sb;
-	struct buffer_head *new_bh, *old_bh;
+	struct exfat_entry_set_cache old_es, new_es;
 	int sync = IS_DIRSYNC(inode);
 
-	epold = exfat_get_dentry(sb, p_dir, oldentry, &old_bh);
-	if (!epold)
-		return -EIO;
-
-	num_old_entries = exfat_count_ext_entries(sb, p_dir, oldentry, epold);
-	if (num_old_entries < 0)
-		return -EIO;
-	num_old_entries++;
-
 	num_new_entries = exfat_calc_num_entries(p_uniname);
 	if (num_new_entries < 0)
 		return num_new_entries;
 
-	if (num_old_entries < num_new_entries) {
-		int newentry;
+	ret = exfat_get_dentry_set(&old_es, sb, p_dir, oldentry, ES_ALL_ENTRIES);
+	if (ret) {
+		ret = -EIO;
+		return ret;
+	}
 
-		newentry =
-			exfat_find_empty_entry(inode, p_dir, num_new_entries);
-		if (newentry < 0)
-			return newentry; /* -EIO or -ENOSPC */
+	epold = exfat_get_dentry_cached(&old_es, ES_IDX_FILE);
 
-		epnew = exfat_get_dentry(sb, p_dir, newentry, &new_bh);
-		if (!epnew)
-			return -EIO;
+	if (old_es.num_entries < num_new_entries) {
+		int newentry;
 
+		newentry = exfat_find_empty_entry(inode, p_dir, num_new_entries,
+				&new_es);
+		if (newentry < 0) {
+			ret = newentry; /* -EIO or -ENOSPC */
+			goto put_old_es;
+		}
+
+		epnew = exfat_get_dentry_cached(&new_es, ES_IDX_FILE);
 		*epnew = *epold;
 		if (exfat_get_entry_type(epnew) == TYPE_FILE) {
 			epnew->dentry.file.attr |= cpu_to_le16(EXFAT_ATTR_ARCHIVE);
 			ei->attr |= EXFAT_ATTR_ARCHIVE;
 		}
-		exfat_update_bh(new_bh, sync);
-		brelse(old_bh);
-		brelse(new_bh);
-
-		epold = exfat_get_dentry(sb, p_dir, oldentry + 1, &old_bh);
-		if (!epold)
-			return -EIO;
-		epnew = exfat_get_dentry(sb, p_dir, newentry + 1, &new_bh);
-		if (!epnew) {
-			brelse(old_bh);
-			return -EIO;
-		}
 
+		epold = exfat_get_dentry_cached(&old_es, ES_IDX_STREAM);
+		epnew = exfat_get_dentry_cached(&new_es, ES_IDX_STREAM);
 		*epnew = *epold;
-		exfat_update_bh(new_bh, sync);
-		brelse(old_bh);
-		brelse(new_bh);
 
-		ret = exfat_init_ext_entry(inode, p_dir, newentry,
-			num_new_entries, p_uniname);
+		exfat_init_ext_entry(&new_es, num_new_entries, p_uniname);
+
+		ret = exfat_put_dentry_set(&new_es, sync);
 		if (ret)
-			return ret;
+			goto put_old_es;
 
-		exfat_remove_entries(inode, p_dir, oldentry, 0,
-			num_old_entries);
+		exfat_remove_entries(inode, &old_es, ES_IDX_FILE);
 		ei->dir = *p_dir;
 		ei->entry = newentry;
 	} else {
@@ -1090,85 +1029,72 @@ static int exfat_rename_file(struct inode *inode, struct exfat_chain *p_dir,
 			epold->dentry.file.attr |= cpu_to_le16(EXFAT_ATTR_ARCHIVE);
 			ei->attr |= EXFAT_ATTR_ARCHIVE;
 		}
-		exfat_update_bh(old_bh, sync);
-		brelse(old_bh);
-		ret = exfat_init_ext_entry(inode, p_dir, oldentry,
-			num_new_entries, p_uniname);
-		if (ret)
-			return ret;
 
-		exfat_remove_entries(inode, p_dir, oldentry, num_new_entries,
-			num_old_entries);
+		exfat_remove_entries(inode, &old_es, ES_IDX_FIRST_FILENAME + 1);
+		exfat_init_ext_entry(&old_es, num_new_entries, p_uniname);
 	}
-	return 0;
+	return exfat_put_dentry_set(&old_es, sync);
+
+put_old_es:
+	exfat_put_dentry_set(&old_es, false);
+	return ret;
 }
 
 static int exfat_move_file(struct inode *inode, struct exfat_chain *p_olddir,
 		int oldentry, struct exfat_chain *p_newdir,
 		struct exfat_uni_name *p_uniname, struct exfat_inode_info *ei)
 {
-	int ret, newentry, num_new_entries, num_old_entries;
+	int ret, newentry, num_new_entries;
 	struct exfat_dentry *epmov, *epnew;
 	struct super_block *sb = inode->i_sb;
-	struct buffer_head *mov_bh, *new_bh;
-
-	epmov = exfat_get_dentry(sb, p_olddir, oldentry, &mov_bh);
-	if (!epmov)
-		return -EIO;
-
-	num_old_entries = exfat_count_ext_entries(sb, p_olddir, oldentry,
-		epmov);
-	if (num_old_entries < 0)
-		return -EIO;
-	num_old_entries++;
+	struct exfat_entry_set_cache mov_es, new_es;
 
 	num_new_entries = exfat_calc_num_entries(p_uniname);
 	if (num_new_entries < 0)
 		return num_new_entries;
 
-	newentry = exfat_find_empty_entry(inode, p_newdir, num_new_entries);
-	if (newentry < 0)
-		return newentry; /* -EIO or -ENOSPC */
-
-	epnew = exfat_get_dentry(sb, p_newdir, newentry, &new_bh);
-	if (!epnew)
+	ret = exfat_get_dentry_set(&mov_es, sb, p_olddir, oldentry,
+			ES_ALL_ENTRIES);
+	if (ret)
 		return -EIO;
 
+	newentry = exfat_find_empty_entry(inode, p_newdir, num_new_entries,
+			&new_es);
+	if (newentry < 0) {
+		ret = newentry; /* -EIO or -ENOSPC */
+		goto put_mov_es;
+	}
+
+	epmov = exfat_get_dentry_cached(&mov_es, ES_IDX_FILE);
+	epnew = exfat_get_dentry_cached(&new_es, ES_IDX_FILE);
 	*epnew = *epmov;
 	if (exfat_get_entry_type(epnew) == TYPE_FILE) {
 		epnew->dentry.file.attr |= cpu_to_le16(EXFAT_ATTR_ARCHIVE);
 		ei->attr |= EXFAT_ATTR_ARCHIVE;
 	}
-	exfat_update_bh(new_bh, IS_DIRSYNC(inode));
-	brelse(mov_bh);
-	brelse(new_bh);
-
-	epmov = exfat_get_dentry(sb, p_olddir, oldentry + 1, &mov_bh);
-	if (!epmov)
-		return -EIO;
-	epnew = exfat_get_dentry(sb, p_newdir, newentry + 1, &new_bh);
-	if (!epnew) {
-		brelse(mov_bh);
-		return -EIO;
-	}
 
+	epmov = exfat_get_dentry_cached(&mov_es, ES_IDX_STREAM);
+	epnew = exfat_get_dentry_cached(&new_es, ES_IDX_STREAM);
 	*epnew = *epmov;
-	exfat_update_bh(new_bh, IS_DIRSYNC(inode));
-	brelse(mov_bh);
-	brelse(new_bh);
-
-	ret = exfat_init_ext_entry(inode, p_newdir, newentry, num_new_entries,
-		p_uniname);
-	if (ret)
-		return ret;
 
-	exfat_remove_entries(inode, p_olddir, oldentry, 0, num_old_entries);
+	exfat_init_ext_entry(&new_es, num_new_entries, p_uniname);
+	exfat_remove_entries(inode, &mov_es, ES_IDX_FILE);
 
 	exfat_chain_set(&ei->dir, p_newdir->dir, p_newdir->size,
 		p_newdir->flags);
 
 	ei->entry = newentry;
-	return 0;
+
+	ret = exfat_put_dentry_set(&new_es, IS_DIRSYNC(inode));
+	if (ret)
+		goto put_mov_es;
+
+	return exfat_put_dentry_set(&mov_es, IS_DIRSYNC(inode));
+
+put_mov_es:
+	exfat_put_dentry_set(&mov_es, false);
+
+	return ret;
 }
 
 /* rename or move a old file into a new file */
@@ -1186,7 +1112,6 @@ static int __exfat_rename(struct inode *old_parent_inode,
 	struct exfat_sb_info *sbi = EXFAT_SB(sb);
 	const unsigned char *new_path = new_dentry->d_name.name;
 	struct inode *new_inode = new_dentry->d_inode;
-	int num_entries;
 	struct exfat_inode_info *new_ei = NULL;
 	unsigned int new_entry_type = TYPE_UNUSED;
 	int new_entry = 0;
@@ -1257,25 +1182,21 @@ static int __exfat_rename(struct inode *old_parent_inode,
 				&newdir, &uni_name, ei);
 
 	if (!ret && new_inode) {
+		struct exfat_entry_set_cache es;
+
 		/* delete entries of new_dir */
-		ep = exfat_get_dentry(sb, p_dir, new_entry, &new_bh);
-		if (!ep) {
+		ret = exfat_get_dentry_set(&es, sb, p_dir, new_entry,
+				ES_ALL_ENTRIES);
+		if (ret) {
 			ret = -EIO;
 			goto del_out;
 		}
 
-		num_entries = exfat_count_ext_entries(sb, p_dir, new_entry, ep);
-		if (num_entries < 0) {
-			ret = -EIO;
-			goto del_out;
-		}
-		brelse(new_bh);
+		exfat_remove_entries(new_inode, &es, ES_IDX_FILE);
 
-		if (exfat_remove_entries(new_inode, p_dir, new_entry, 0,
-				num_entries + 1)) {
-			ret = -EIO;
+		ret = exfat_put_dentry_set(&es, IS_DIRSYNC(new_inode));
+		if (ret)
 			goto del_out;
-		}
 
 		/* Free the clusters if new_inode is a dir(as if exfat_rmdir) */
 		if (new_entry_type == TYPE_DIR &&
@@ -1317,6 +1238,7 @@ static int exfat_rename(struct mnt_idmap *idmap,
 	struct super_block *sb = old_dir->i_sb;
 	loff_t i_pos;
 	int err;
+	loff_t size = i_size_read(new_dir);
 
 	/*
 	 * The VFS already checks for existence, so for local filesystems
@@ -1338,7 +1260,7 @@ static int exfat_rename(struct mnt_idmap *idmap,
 	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
 	EXFAT_I(new_dir)->i_crtime = current_time(new_dir);
 	exfat_truncate_inode_atime(new_dir);
-	if (IS_DIRSYNC(new_dir))
+	if (IS_DIRSYNC(new_dir) && size != i_size_read(new_dir))
 		exfat_sync_inode(new_dir);
 	else
 		mark_inode_dirty(new_dir);
@@ -1359,9 +1281,7 @@ static int exfat_rename(struct mnt_idmap *idmap,
 	}
 
 	inode_inc_iversion(old_dir);
-	if (IS_DIRSYNC(old_dir))
-		exfat_sync_inode(old_dir);
-	else
+	if (new_dir != old_dir)
 		mark_inode_dirty(old_dir);
 
 	if (new_inode) {
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index fcb658267765..3d5ea2cfad66 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -813,7 +813,7 @@ static int __init init_exfat_fs(void)
 
 	exfat_inode_cachep = kmem_cache_create("exfat_inode_cache",
 			sizeof(struct exfat_inode_info),
-			0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+			0, SLAB_RECLAIM_ACCOUNT,
 			exfat_inode_init_once);
 	if (!exfat_inode_cachep) {
 		err = -ENOMEM;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 3ae0154c5680..07ea3d62b298 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -255,7 +255,7 @@ static bool filldir_one(struct dir_context *ctx, const char *name, int len,
 		container_of(ctx, struct getdents_callback, ctx);
 
 	buf->sequence++;
-	if (buf->ino == ino && len <= NAME_MAX) {
+	if (buf->ino == ino && len <= NAME_MAX && !is_dot_dotdot(name, len)) {
 		memcpy(buf->name, name, len);
 		buf->name[len] = '\0';
 		buf->found = 1;
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
index 74d98965902e..d6cfb1849580 100644
--- a/fs/ext2/Kconfig
+++ b/fs/ext2/Kconfig
@@ -1,16 +1,23 @@
 # SPDX-License-Identifier: GPL-2.0-only
 config EXT2_FS
-	tristate "Second extended fs support"
+	tristate "Second extended fs support (DEPRECATED)"
 	select BUFFER_HEAD
 	select FS_IOMAP
 	select LEGACY_DIRECT_IO
 	help
 	  Ext2 is a standard Linux file system for hard disks.
 
-	  To compile this file system support as a module, choose M here: the
-	  module will be called ext2.
+	  This filesystem driver is deprecated because it does not properly
+	  support inode time stamps beyond 03:14:07 UTC on 19 January 2038.
 
-	  If unsure, say Y.
+	  Ext2 users are advised to use ext4 driver to access their filesystem.
+	  The driver is fully compatible, supports filesystems without journal
+          or extents, and also supports larger time stamps if the filesystem
+          is created with at least 256 byte inodes.
+
+	  This code is kept as a simple reference for filesystem developers.
+
+	  If unsure, say N.
 
 config EXT2_FS_XATTR
 	bool "Ext2 extended attributes"
diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c
index e124f3d709b2..1bfd6ab11038 100644
--- a/fs/ext2/balloc.c
+++ b/fs/ext2/balloc.c
@@ -412,7 +412,7 @@ void ext2_init_block_alloc_info(struct inode *inode)
 	struct ext2_block_alloc_info *block_i;
 	struct super_block *sb = inode->i_sb;
 
-	block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
+	block_i = kmalloc(sizeof(*block_i), GFP_KERNEL);
 	if (block_i) {
 		struct ext2_reserve_window_node *rsv = &block_i->rsv_window_node;
 
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 677a9ad45dcb..f38bdd46e4f7 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -674,7 +674,7 @@ struct ext2_inode_info {
 	struct inode	vfs_inode;
 	struct list_head i_orphan;	/* unlinked but open inodes */
 #ifdef CONFIG_QUOTA
-	struct dquot *i_dquot[MAXQUOTAS];
+	struct dquot __rcu *i_dquot[MAXQUOTAS];
 #endif
 };
 
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 5a4272b2c6b0..f3d570a9302b 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -754,7 +754,7 @@ static int ext2_get_blocks(struct inode *inode,
 		 */
 		err = sb_issue_zeroout(inode->i_sb,
 				le32_to_cpu(chain[depth-1].key), count,
-				GFP_NOFS);
+				GFP_KERNEL);
 		if (err) {
 			mutex_unlock(&ei->truncate_mutex);
 			goto cleanup;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 01f9addc8b1f..37f7ce56adce 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -213,8 +213,7 @@ static int __init init_inodecache(void)
 {
 	ext2_inode_cachep = kmem_cache_create_usercopy("ext2_inode_cache",
 				sizeof(struct ext2_inode_info), 0,
-				(SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
-					SLAB_ACCOUNT),
+				SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
 				offsetof(struct ext2_inode_info, i_data),
 				sizeof_field(struct ext2_inode_info, i_data),
 				init_once);
@@ -320,7 +319,7 @@ static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, siz
 static ssize_t ext2_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off);
 static int ext2_quota_on(struct super_block *sb, int type, int format_id,
 			 const struct path *path);
-static struct dquot **ext2_get_dquots(struct inode *inode)
+static struct dquot __rcu **ext2_get_dquots(struct inode *inode)
 {
 	return EXT2_I(inode)->i_dquot;
 }
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index e849241ebb8f..c885dcc3bd0d 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -874,7 +874,7 @@ ext2_xattr_cache_insert(struct mb_cache *cache, struct buffer_head *bh)
 	__u32 hash = le32_to_cpu(HDR(bh)->h_hash);
 	int error;
 
-	error = mb_cache_entry_create(cache, GFP_NOFS, hash, bh->b_blocknr,
+	error = mb_cache_entry_create(cache, GFP_KERNEL, hash, bh->b_blocknr,
 				      true);
 	if (error) {
 		if (error == -EBUSY) {
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 023571f8dd1b..8d126654019e 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1158,7 +1158,7 @@ struct ext4_inode_info {
 	tid_t i_datasync_tid;
 
 #ifdef CONFIG_QUOTA
-	struct dquot *i_dquot[MAXQUOTAS];
+	struct dquot __rcu *i_dquot[MAXQUOTAS];
 #endif
 
 	/* Precomputed uuid+inum+igen checksum for seeding inode checksums */
@@ -1550,7 +1550,7 @@ struct ext4_sb_info {
 	unsigned long s_commit_interval;
 	u32 s_max_batch_time;
 	u32 s_min_batch_time;
-	struct bdev_handle *s_journal_bdev_handle;
+	struct file *s_journal_bdev_file;
 #ifdef CONFIG_QUOTA
 	/* Names of quota files with journalled quota */
 	char __rcu *s_qf_names[EXT4_MAXQUOTAS];
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7669d154c05e..e57054bdc5fd 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4111,10 +4111,10 @@ insert_hole:
  *
  * Need to be called with
  * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
- * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
+ * (ie, flags is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
  *
  * return > 0, number of blocks already mapped/allocated
- *          if create == 0 and these are pre-allocated blocks
+ *          if flags doesn't contain EXT4_GET_BLOCKS_CREATE and these are pre-allocated blocks
  *          	buffer head is unmapped
  *          otherwise blocks are mapped
  *
@@ -4218,7 +4218,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 
 	/*
 	 * requested block isn't allocated yet;
-	 * we couldn't try to create block if create flag is zero
+	 * we couldn't try to create block if flags doesn't contain EXT4_GET_BLOCKS_CREATE
 	 */
 	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
 		ext4_lblk_t len;
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index 11e6f33677a2..df853c4d3a8c 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -576,9 +576,9 @@ static bool ext4_getfsmap_is_valid_device(struct super_block *sb,
 	if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX ||
 	    fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev))
 		return true;
-	if (EXT4_SB(sb)->s_journal_bdev_handle &&
+	if (EXT4_SB(sb)->s_journal_bdev_file &&
 	    fm->fmr_device ==
-	    new_encode_dev(EXT4_SB(sb)->s_journal_bdev_handle->bdev->bd_dev))
+	    new_encode_dev(file_bdev(EXT4_SB(sb)->s_journal_bdev_file)->bd_dev))
 		return true;
 	return false;
 }
@@ -648,9 +648,9 @@ int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head,
 	memset(handlers, 0, sizeof(handlers));
 	handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev);
 	handlers[0].gfd_fn = ext4_getfsmap_datadev;
-	if (EXT4_SB(sb)->s_journal_bdev_handle) {
+	if (EXT4_SB(sb)->s_journal_bdev_file) {
 		handlers[1].gfd_dev = new_encode_dev(
-			EXT4_SB(sb)->s_journal_bdev_handle->bdev->bd_dev);
+			file_bdev(EXT4_SB(sb)->s_journal_bdev_file)->bd_dev);
 		handlers[1].gfd_fn = ext4_getfsmap_logdev;
 	}
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2ccf3b5e3a7c..537803250ca9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -465,9 +465,10 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
  * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
  * based files
  *
- * On success, it returns the number of blocks being mapped or allocated.  if
- * create==0 and the blocks are pre-allocated and unwritten, the resulting @map
- * is marked as unwritten. If the create == 1, it will mark @map as mapped.
+ * On success, it returns the number of blocks being mapped or allocated.
+ * If flags doesn't contain EXT4_GET_BLOCKS_CREATE the blocks are
+ * pre-allocated and unwritten, the resulting @map is marked as unwritten.
+ * If the flags contain EXT4_GET_BLOCKS_CREATE, it will mark @map as mapped.
  *
  * It returns 0 if plain look up failed (blocks have not been allocated), in
  * that case, @map is returned as unmapped but we still do fill map->m_len to
@@ -589,8 +590,7 @@ found:
 	 * Returns if the blocks have already allocated
 	 *
 	 * Note that if blocks have been preallocated
-	 * ext4_ext_get_block() returns the create = 0
-	 * with buffer head unmapped.
+	 * ext4_ext_map_blocks() returns with buffer head unmapped
 	 */
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
 		/*
diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c
index f94901fd3835..044ca5238f41 100644
--- a/fs/ext4/mballoc-test.c
+++ b/fs/ext4/mballoc-test.c
@@ -5,6 +5,7 @@
 
 #include <kunit/test.h>
 #include <kunit/static_stub.h>
+#include <linux/random.h>
 
 #include "ext4.h"
 
@@ -20,41 +21,135 @@ struct mbt_ctx {
 };
 
 struct mbt_ext4_super_block {
-	struct super_block sb;
+	struct ext4_super_block es;
+	struct ext4_sb_info sbi;
 	struct mbt_ctx mbt_ctx;
 };
 
-#define MBT_CTX(_sb) (&(container_of((_sb), struct mbt_ext4_super_block, sb)->mbt_ctx))
+#define MBT_SB(_sb) (container_of((_sb)->s_fs_info, struct mbt_ext4_super_block, sbi))
+#define MBT_CTX(_sb) (&MBT_SB(_sb)->mbt_ctx)
 #define MBT_GRP_CTX(_sb, _group) (&MBT_CTX(_sb)->grp_ctx[_group])
 
+static const struct super_operations mbt_sops = {
+};
+
+static void mbt_kill_sb(struct super_block *sb)
+{
+	generic_shutdown_super(sb);
+}
+
+static struct file_system_type mbt_fs_type = {
+	.name			= "mballoc test",
+	.kill_sb		= mbt_kill_sb,
+};
+
+static int mbt_mb_init(struct super_block *sb)
+{
+	ext4_fsblk_t block;
+	int ret;
+
+	/* needed by ext4_mb_init->bdev_nonrot(sb->s_bdev) */
+	sb->s_bdev = kzalloc(sizeof(*sb->s_bdev), GFP_KERNEL);
+	if (sb->s_bdev == NULL)
+		return -ENOMEM;
+
+	sb->s_bdev->bd_queue = kzalloc(sizeof(struct request_queue), GFP_KERNEL);
+	if (sb->s_bdev->bd_queue == NULL) {
+		kfree(sb->s_bdev);
+		return -ENOMEM;
+	}
+
+	/*
+	 * needed by ext4_mb_init->ext4_mb_init_backend-> sbi->s_buddy_cache =
+	 * new_inode(sb);
+	 */
+	INIT_LIST_HEAD(&sb->s_inodes);
+	sb->s_op = &mbt_sops;
+
+	ret = ext4_mb_init(sb);
+	if (ret != 0)
+		goto err_out;
+
+	block = ext4_count_free_clusters(sb);
+	ret = percpu_counter_init(&EXT4_SB(sb)->s_freeclusters_counter, block,
+				  GFP_KERNEL);
+	if (ret != 0)
+		goto err_mb_release;
+
+	ret = percpu_counter_init(&EXT4_SB(sb)->s_dirtyclusters_counter, 0,
+				  GFP_KERNEL);
+	if (ret != 0)
+		goto err_freeclusters;
+
+	return 0;
+
+err_freeclusters:
+	percpu_counter_destroy(&EXT4_SB(sb)->s_freeclusters_counter);
+err_mb_release:
+	ext4_mb_release(sb);
+err_out:
+	kfree(sb->s_bdev->bd_queue);
+	kfree(sb->s_bdev);
+	return ret;
+}
+
+static void mbt_mb_release(struct super_block *sb)
+{
+	percpu_counter_destroy(&EXT4_SB(sb)->s_dirtyclusters_counter);
+	percpu_counter_destroy(&EXT4_SB(sb)->s_freeclusters_counter);
+	ext4_mb_release(sb);
+	kfree(sb->s_bdev->bd_queue);
+	kfree(sb->s_bdev);
+}
+
+static int mbt_set(struct super_block *sb, void *data)
+{
+	return 0;
+}
+
 static struct super_block *mbt_ext4_alloc_super_block(void)
 {
-	struct ext4_super_block *es = kzalloc(sizeof(*es), GFP_KERNEL);
-	struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
-	struct mbt_ext4_super_block *fsb = kzalloc(sizeof(*fsb), GFP_KERNEL);
+	struct mbt_ext4_super_block *fsb;
+	struct super_block *sb;
+	struct ext4_sb_info *sbi;
+
+	fsb = kzalloc(sizeof(*fsb), GFP_KERNEL);
+	if (fsb == NULL)
+		return NULL;
 
-	if (fsb == NULL || sbi == NULL || es == NULL)
+	sb = sget(&mbt_fs_type, NULL, mbt_set, 0, NULL);
+	if (IS_ERR(sb))
 		goto out;
 
-	sbi->s_es = es;
-	fsb->sb.s_fs_info = sbi;
-	return &fsb->sb;
+	sbi = &fsb->sbi;
+
+	sbi->s_blockgroup_lock =
+		kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+	if (!sbi->s_blockgroup_lock)
+		goto out_deactivate;
+
+	bgl_lock_init(sbi->s_blockgroup_lock);
+
+	sbi->s_es = &fsb->es;
+	sb->s_fs_info = sbi;
+
+	up_write(&sb->s_umount);
+	return sb;
 
+out_deactivate:
+	deactivate_locked_super(sb);
 out:
 	kfree(fsb);
-	kfree(sbi);
-	kfree(es);
 	return NULL;
 }
 
 static void mbt_ext4_free_super_block(struct super_block *sb)
 {
-	struct mbt_ext4_super_block *fsb =
-		container_of(sb, struct mbt_ext4_super_block, sb);
+	struct mbt_ext4_super_block *fsb = MBT_SB(sb);
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-	kfree(sbi->s_es);
-	kfree(sbi);
+	kfree(sbi->s_blockgroup_lock);
+	deactivate_super(sb);
 	kfree(fsb);
 }
 
@@ -82,6 +177,9 @@ static void mbt_init_sb_layout(struct super_block *sb,
 	sbi->s_clusters_per_group = layout->blocks_per_group >>
 				    layout->cluster_bits;
 	sbi->s_desc_size = layout->desc_size;
+	sbi->s_desc_per_block_bits =
+		sb->s_blocksize_bits - (fls(layout->desc_size) - 1);
+	sbi->s_desc_per_block = 1 << sbi->s_desc_per_block_bits;
 
 	es->s_first_data_block = cpu_to_le32(0);
 	es->s_blocks_count_lo = cpu_to_le32(layout->blocks_per_group *
@@ -91,9 +189,13 @@ static void mbt_init_sb_layout(struct super_block *sb,
 static int mbt_grp_ctx_init(struct super_block *sb,
 			    struct mbt_grp_ctx *grp_ctx)
 {
+	ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
+
 	grp_ctx->bitmap_bh.b_data = kzalloc(EXT4_BLOCK_SIZE(sb), GFP_KERNEL);
 	if (grp_ctx->bitmap_bh.b_data == NULL)
 		return -ENOMEM;
+	mb_set_bits(grp_ctx->bitmap_bh.b_data, max, sb->s_blocksize * 8 - max);
+	ext4_free_group_clusters_set(sb, &grp_ctx->desc, max);
 
 	return 0;
 }
@@ -112,6 +214,13 @@ static void mbt_ctx_mark_used(struct super_block *sb, ext4_group_t group,
 	mb_set_bits(grp_ctx->bitmap_bh.b_data, start, len);
 }
 
+static void *mbt_ctx_bitmap(struct super_block *sb, ext4_group_t group)
+{
+	struct mbt_grp_ctx *grp_ctx = MBT_GRP_CTX(sb, group);
+
+	return grp_ctx->bitmap_bh.b_data;
+}
+
 /* called after mbt_init_sb_layout */
 static int mbt_ctx_init(struct super_block *sb)
 {
@@ -133,6 +242,8 @@ static int mbt_ctx_init(struct super_block *sb)
 	 * block which will fail ext4_sb_block_valid check.
 	 */
 	mb_set_bits(ctx->grp_ctx[0].bitmap_bh.b_data, 0, 1);
+	ext4_free_group_clusters_set(sb, &ctx->grp_ctx[0].desc,
+				     EXT4_CLUSTERS_PER_GROUP(sb) - 1);
 
 	return 0;
 out:
@@ -167,6 +278,13 @@ static int ext4_wait_block_bitmap_stub(struct super_block *sb,
 				       ext4_group_t block_group,
 				       struct buffer_head *bh)
 {
+	/*
+	 * real ext4_wait_block_bitmap will set these flags and
+	 * functions like ext4_mb_init_cache will verify the flags.
+	 */
+	set_buffer_uptodate(bh);
+	set_bitmap_uptodate(bh);
+	set_buffer_verified(bh);
 	return 0;
 }
 
@@ -232,6 +350,14 @@ static int mbt_kunit_init(struct kunit *test)
 	kunit_activate_static_stub(test,
 				   ext4_mb_mark_context,
 				   ext4_mb_mark_context_stub);
+
+	/* stub function will be called in mbt_mb_init->ext4_mb_init */
+	if (mbt_mb_init(sb) != 0) {
+		mbt_ctx_release(sb);
+		mbt_ext4_free_super_block(sb);
+		return -ENOMEM;
+	}
+
 	return 0;
 }
 
@@ -239,6 +365,7 @@ static void mbt_kunit_exit(struct kunit *test)
 {
 	struct super_block *sb = (struct super_block *)test->priv;
 
+	mbt_mb_release(sb);
 	mbt_ctx_release(sb);
 	mbt_ext4_free_super_block(sb);
 }
@@ -246,14 +373,19 @@ static void mbt_kunit_exit(struct kunit *test)
 static void test_new_blocks_simple(struct kunit *test)
 {
 	struct super_block *sb = (struct super_block *)test->priv;
-	struct inode inode = { .i_sb = sb, };
+	struct inode *inode;
 	struct ext4_allocation_request ar;
 	ext4_group_t i, goal_group = TEST_GOAL_GROUP;
 	int err = 0;
 	ext4_fsblk_t found;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-	ar.inode = &inode;
+	inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL);
+	if (!inode)
+		return;
+
+	inode->i_sb = sb;
+	ar.inode = inode;
 
 	/* get block at goal */
 	ar.goal = ext4_group_first_block_no(sb, goal_group);
@@ -297,6 +429,436 @@ static void test_new_blocks_simple(struct kunit *test)
 		"unexpectedly get block when no block is available");
 }
 
+#define TEST_RANGE_COUNT 8
+
+struct test_range {
+	ext4_grpblk_t start;
+	ext4_grpblk_t len;
+};
+
+static void
+mbt_generate_test_ranges(struct super_block *sb, struct test_range *ranges,
+			 int count)
+{
+	ext4_grpblk_t start, len, max;
+	int i;
+
+	max = EXT4_CLUSTERS_PER_GROUP(sb) / count;
+	for (i = 0; i < count; i++) {
+		start = get_random_u32() % max;
+		len = get_random_u32() % max;
+		len = min(len, max - start);
+
+		ranges[i].start = start + i * max;
+		ranges[i].len = len;
+	}
+}
+
+static void
+validate_free_blocks_simple(struct kunit *test, struct super_block *sb,
+			    ext4_group_t goal_group, ext4_grpblk_t start,
+			    ext4_grpblk_t len)
+{
+	void *bitmap;
+	ext4_grpblk_t bit, max = EXT4_CLUSTERS_PER_GROUP(sb);
+	ext4_group_t i;
+
+	for (i = 0; i < ext4_get_groups_count(sb); i++) {
+		if (i == goal_group)
+			continue;
+
+		bitmap = mbt_ctx_bitmap(sb, i);
+		bit = mb_find_next_zero_bit(bitmap, max, 0);
+		KUNIT_ASSERT_EQ_MSG(test, bit, max,
+				    "free block on unexpected group %d", i);
+	}
+
+	bitmap = mbt_ctx_bitmap(sb, goal_group);
+	bit = mb_find_next_zero_bit(bitmap, max, 0);
+	KUNIT_ASSERT_EQ(test, bit, start);
+
+	bit = mb_find_next_bit(bitmap, max, bit + 1);
+	KUNIT_ASSERT_EQ(test, bit, start + len);
+}
+
+static void
+test_free_blocks_simple_range(struct kunit *test, ext4_group_t goal_group,
+			      ext4_grpblk_t start, ext4_grpblk_t len)
+{
+	struct super_block *sb = (struct super_block *)test->priv;
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct inode *inode;
+	ext4_fsblk_t block;
+
+	inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL);
+	if (!inode)
+		return;
+	inode->i_sb = sb;
+
+	if (len == 0)
+		return;
+
+	block = ext4_group_first_block_no(sb, goal_group) +
+		EXT4_C2B(sbi, start);
+	ext4_free_blocks_simple(inode, block, len);
+	validate_free_blocks_simple(test, sb, goal_group, start, len);
+	mbt_ctx_mark_used(sb, goal_group, 0, EXT4_CLUSTERS_PER_GROUP(sb));
+}
+
+static void test_free_blocks_simple(struct kunit *test)
+{
+	struct super_block *sb = (struct super_block *)test->priv;
+	ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
+	ext4_group_t i;
+	struct test_range ranges[TEST_RANGE_COUNT];
+
+	for (i = 0; i < ext4_get_groups_count(sb); i++)
+		mbt_ctx_mark_used(sb, i, 0, max);
+
+	mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+	for (i = 0; i < TEST_RANGE_COUNT; i++)
+		test_free_blocks_simple_range(test, TEST_GOAL_GROUP,
+			ranges[i].start, ranges[i].len);
+}
+
+static void
+test_mark_diskspace_used_range(struct kunit *test,
+			       struct ext4_allocation_context *ac,
+			       ext4_grpblk_t start,
+			       ext4_grpblk_t len)
+{
+	struct super_block *sb = (struct super_block *)test->priv;
+	int ret;
+	void *bitmap;
+	ext4_grpblk_t i, max;
+
+	/* ext4_mb_mark_diskspace_used will BUG if len is 0 */
+	if (len == 0)
+		return;
+
+	ac->ac_b_ex.fe_group = TEST_GOAL_GROUP;
+	ac->ac_b_ex.fe_start = start;
+	ac->ac_b_ex.fe_len = len;
+
+	bitmap = mbt_ctx_bitmap(sb, TEST_GOAL_GROUP);
+	memset(bitmap, 0, sb->s_blocksize);
+	ret = ext4_mb_mark_diskspace_used(ac, NULL, 0);
+	KUNIT_ASSERT_EQ(test, ret, 0);
+
+	max = EXT4_CLUSTERS_PER_GROUP(sb);
+	i = mb_find_next_bit(bitmap, max, 0);
+	KUNIT_ASSERT_EQ(test, i, start);
+	i = mb_find_next_zero_bit(bitmap, max, i + 1);
+	KUNIT_ASSERT_EQ(test, i, start + len);
+	i = mb_find_next_bit(bitmap, max, i + 1);
+	KUNIT_ASSERT_EQ(test, max, i);
+}
+
+static void test_mark_diskspace_used(struct kunit *test)
+{
+	struct super_block *sb = (struct super_block *)test->priv;
+	struct inode *inode;
+	struct ext4_allocation_context ac;
+	struct test_range ranges[TEST_RANGE_COUNT];
+	int i;
+
+	mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+
+	inode = kunit_kzalloc(test, sizeof(*inode), GFP_KERNEL);
+	if (!inode)
+		return;
+	inode->i_sb = sb;
+
+	ac.ac_status = AC_STATUS_FOUND;
+	ac.ac_sb = sb;
+	ac.ac_inode = inode;
+	for (i = 0; i < TEST_RANGE_COUNT; i++)
+		test_mark_diskspace_used_range(test, &ac, ranges[i].start,
+					       ranges[i].len);
+}
+
+static void mbt_generate_buddy(struct super_block *sb, void *buddy,
+			       void *bitmap, struct ext4_group_info *grp)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	uint32_t order, off;
+	void *bb, *bb_h;
+	int max;
+
+	memset(buddy, 0xff, sb->s_blocksize);
+	memset(grp, 0, offsetof(struct ext4_group_info,
+				 bb_counters[MB_NUM_ORDERS(sb)]));
+
+	bb = bitmap;
+	max = EXT4_CLUSTERS_PER_GROUP(sb);
+	bb_h = buddy + sbi->s_mb_offsets[1];
+
+	off = mb_find_next_zero_bit(bb, max, 0);
+	grp->bb_first_free = off;
+	while (off < max) {
+		grp->bb_counters[0]++;
+		grp->bb_free++;
+
+		if (!(off & 1) && !mb_test_bit(off + 1, bb)) {
+			grp->bb_free++;
+			grp->bb_counters[0]--;
+			mb_clear_bit(off >> 1, bb_h);
+			grp->bb_counters[1]++;
+			grp->bb_largest_free_order = 1;
+			off++;
+		}
+
+		off = mb_find_next_zero_bit(bb, max, off + 1);
+	}
+
+	for (order = 1; order < MB_NUM_ORDERS(sb) - 1; order++) {
+		bb = buddy + sbi->s_mb_offsets[order];
+		bb_h = buddy + sbi->s_mb_offsets[order + 1];
+		max = max >> 1;
+		off = mb_find_next_zero_bit(bb, max, 0);
+
+		while (off < max) {
+			if (!(off & 1) && !mb_test_bit(off + 1, bb)) {
+				mb_set_bits(bb, off, 2);
+				grp->bb_counters[order] -= 2;
+				mb_clear_bit(off >> 1, bb_h);
+				grp->bb_counters[order + 1]++;
+				grp->bb_largest_free_order = order + 1;
+				off++;
+			}
+
+			off = mb_find_next_zero_bit(bb, max, off + 1);
+		}
+	}
+
+	max = EXT4_CLUSTERS_PER_GROUP(sb);
+	off = mb_find_next_zero_bit(bitmap, max, 0);
+	while (off < max) {
+		grp->bb_fragments++;
+
+		off = mb_find_next_bit(bitmap, max, off + 1);
+		if (off + 1 >= max)
+			break;
+
+		off = mb_find_next_zero_bit(bitmap, max, off + 1);
+	}
+}
+
+static void
+mbt_validate_group_info(struct kunit *test, struct ext4_group_info *grp1,
+			struct ext4_group_info *grp2)
+{
+	struct super_block *sb = (struct super_block *)test->priv;
+	int i;
+
+	KUNIT_ASSERT_EQ(test, grp1->bb_first_free,
+			grp2->bb_first_free);
+	KUNIT_ASSERT_EQ(test, grp1->bb_fragments,
+			grp2->bb_fragments);
+	KUNIT_ASSERT_EQ(test, grp1->bb_free, grp2->bb_free);
+	KUNIT_ASSERT_EQ(test, grp1->bb_largest_free_order,
+			grp2->bb_largest_free_order);
+
+	for (i = 1; i < MB_NUM_ORDERS(sb); i++) {
+		KUNIT_ASSERT_EQ_MSG(test, grp1->bb_counters[i],
+				    grp2->bb_counters[i],
+				    "bb_counters[%d] diffs, expected %d, generated %d",
+				    i, grp1->bb_counters[i],
+				    grp2->bb_counters[i]);
+	}
+}
+
+static void
+do_test_generate_buddy(struct kunit *test, struct super_block *sb, void *bitmap,
+			   void *mbt_buddy, struct ext4_group_info *mbt_grp,
+			   void *ext4_buddy, struct ext4_group_info *ext4_grp)
+{
+	int i;
+
+	mbt_generate_buddy(sb, mbt_buddy, bitmap, mbt_grp);
+
+	for (i = 0; i < MB_NUM_ORDERS(sb); i++)
+		ext4_grp->bb_counters[i] = 0;
+	/* needed by validation in ext4_mb_generate_buddy */
+	ext4_grp->bb_free = mbt_grp->bb_free;
+	memset(ext4_buddy, 0xff, sb->s_blocksize);
+	ext4_mb_generate_buddy(sb, ext4_buddy, bitmap, TEST_GOAL_GROUP,
+			       ext4_grp);
+
+	KUNIT_ASSERT_EQ(test, memcmp(mbt_buddy, ext4_buddy, sb->s_blocksize),
+			0);
+	mbt_validate_group_info(test, mbt_grp, ext4_grp);
+}
+
+static void test_mb_generate_buddy(struct kunit *test)
+{
+	struct super_block *sb = (struct super_block *)test->priv;
+	void *bitmap, *expected_bb, *generate_bb;
+	struct ext4_group_info *expected_grp, *generate_grp;
+	struct test_range ranges[TEST_RANGE_COUNT];
+	int i;
+
+	bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap);
+	expected_bb = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, expected_bb);
+	generate_bb = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, generate_bb);
+	expected_grp = kunit_kzalloc(test, offsetof(struct ext4_group_info,
+				bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, expected_grp);
+	generate_grp = ext4_get_group_info(sb, TEST_GOAL_GROUP);
+	KUNIT_ASSERT_NOT_NULL(test, generate_grp);
+
+	mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+	for (i = 0; i < TEST_RANGE_COUNT; i++) {
+		mb_set_bits(bitmap, ranges[i].start, ranges[i].len);
+		do_test_generate_buddy(test, sb, bitmap, expected_bb,
+				       expected_grp, generate_bb, generate_grp);
+	}
+}
+
+static void
+test_mb_mark_used_range(struct kunit *test, struct ext4_buddy *e4b,
+			ext4_grpblk_t start, ext4_grpblk_t len, void *bitmap,
+			void *buddy, struct ext4_group_info *grp)
+{
+	struct super_block *sb = (struct super_block *)test->priv;
+	struct ext4_free_extent ex;
+	int i;
+
+	/* mb_mark_used only accepts non-zero len */
+	if (len == 0)
+		return;
+
+	ex.fe_start = start;
+	ex.fe_len = len;
+	ex.fe_group = TEST_GOAL_GROUP;
+
+	ext4_lock_group(sb, TEST_GOAL_GROUP);
+	mb_mark_used(e4b, &ex);
+	ext4_unlock_group(sb, TEST_GOAL_GROUP);
+
+	mb_set_bits(bitmap, start, len);
+	/* bypass bb_free validatoin in ext4_mb_generate_buddy */
+	grp->bb_free -= len;
+	memset(buddy, 0xff, sb->s_blocksize);
+	for (i = 0; i < MB_NUM_ORDERS(sb); i++)
+		grp->bb_counters[i] = 0;
+	ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp);
+
+	KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize),
+			0);
+	mbt_validate_group_info(test, grp, e4b->bd_info);
+}
+
+static void test_mb_mark_used(struct kunit *test)
+{
+	struct ext4_buddy e4b;
+	struct super_block *sb = (struct super_block *)test->priv;
+	void *bitmap, *buddy;
+	struct ext4_group_info *grp;
+	int ret;
+	struct test_range ranges[TEST_RANGE_COUNT];
+	int i;
+
+	/* buddy cache assumes that each page contains at least one block */
+	if (sb->s_blocksize > PAGE_SIZE)
+		kunit_skip(test, "blocksize exceeds pagesize");
+
+	bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap);
+	buddy = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy);
+	grp = kunit_kzalloc(test, offsetof(struct ext4_group_info,
+				bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL);
+
+	ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b);
+	KUNIT_ASSERT_EQ(test, ret, 0);
+
+	grp->bb_free = EXT4_CLUSTERS_PER_GROUP(sb);
+	mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+	for (i = 0; i < TEST_RANGE_COUNT; i++)
+		test_mb_mark_used_range(test, &e4b, ranges[i].start,
+					ranges[i].len, bitmap, buddy, grp);
+
+	ext4_mb_unload_buddy(&e4b);
+}
+
+static void
+test_mb_free_blocks_range(struct kunit *test, struct ext4_buddy *e4b,
+			  ext4_grpblk_t start, ext4_grpblk_t len, void *bitmap,
+			  void *buddy, struct ext4_group_info *grp)
+{
+	struct super_block *sb = (struct super_block *)test->priv;
+	int i;
+
+	/* mb_free_blocks will WARN if len is 0 */
+	if (len == 0)
+		return;
+
+	ext4_lock_group(sb, e4b->bd_group);
+	mb_free_blocks(NULL, e4b, start, len);
+	ext4_unlock_group(sb, e4b->bd_group);
+
+	mb_clear_bits(bitmap, start, len);
+	/* bypass bb_free validatoin in ext4_mb_generate_buddy */
+	grp->bb_free += len;
+	memset(buddy, 0xff, sb->s_blocksize);
+	for (i = 0; i < MB_NUM_ORDERS(sb); i++)
+		grp->bb_counters[i] = 0;
+	ext4_mb_generate_buddy(sb, buddy, bitmap, 0, grp);
+
+	KUNIT_ASSERT_EQ(test, memcmp(buddy, e4b->bd_buddy, sb->s_blocksize),
+			0);
+	mbt_validate_group_info(test, grp, e4b->bd_info);
+
+}
+
+static void test_mb_free_blocks(struct kunit *test)
+{
+	struct ext4_buddy e4b;
+	struct super_block *sb = (struct super_block *)test->priv;
+	void *bitmap, *buddy;
+	struct ext4_group_info *grp;
+	struct ext4_free_extent ex;
+	int ret;
+	int i;
+	struct test_range ranges[TEST_RANGE_COUNT];
+
+	/* buddy cache assumes that each page contains at least one block */
+	if (sb->s_blocksize > PAGE_SIZE)
+		kunit_skip(test, "blocksize exceeds pagesize");
+
+	bitmap = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, bitmap);
+	buddy = kunit_kzalloc(test, sb->s_blocksize, GFP_KERNEL);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy);
+	grp = kunit_kzalloc(test, offsetof(struct ext4_group_info,
+				bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL);
+
+	ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b);
+	KUNIT_ASSERT_EQ(test, ret, 0);
+
+	ex.fe_start = 0;
+	ex.fe_len = EXT4_CLUSTERS_PER_GROUP(sb);
+	ex.fe_group = TEST_GOAL_GROUP;
+
+	ext4_lock_group(sb, TEST_GOAL_GROUP);
+	mb_mark_used(&e4b, &ex);
+	ext4_unlock_group(sb, TEST_GOAL_GROUP);
+
+	grp->bb_free = 0;
+	memset(bitmap, 0xff, sb->s_blocksize);
+
+	mbt_generate_test_ranges(sb, ranges, TEST_RANGE_COUNT);
+	for (i = 0; i < TEST_RANGE_COUNT; i++)
+		test_mb_free_blocks_range(test, &e4b, ranges[i].start,
+					  ranges[i].len, bitmap, buddy, grp);
+
+	ext4_mb_unload_buddy(&e4b);
+}
+
 static const struct mbt_ext4_block_layout mbt_test_layouts[] = {
 	{
 		.blocksize_bits = 10,
@@ -334,6 +896,11 @@ KUNIT_ARRAY_PARAM(mbt_layouts, mbt_test_layouts, mbt_show_layout);
 
 static struct kunit_case mbt_test_cases[] = {
 	KUNIT_CASE_PARAM(test_new_blocks_simple, mbt_layouts_gen_params),
+	KUNIT_CASE_PARAM(test_free_blocks_simple, mbt_layouts_gen_params),
+	KUNIT_CASE_PARAM(test_mb_generate_buddy, mbt_layouts_gen_params),
+	KUNIT_CASE_PARAM(test_mb_mark_used, mbt_layouts_gen_params),
+	KUNIT_CASE_PARAM(test_mb_free_blocks, mbt_layouts_gen_params),
+	KUNIT_CASE_PARAM(test_mark_diskspace_used, mbt_layouts_gen_params),
 	{}
 };
 
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e4f7cf9d89c4..12b3f196010b 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3015,8 +3015,8 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
 {
 	struct super_block *sb = pde_data(file_inode(seq->file));
 	ext4_group_t group = (ext4_group_t) ((unsigned long) v);
-	int i;
-	int err, buddy_loaded = 0;
+	int i, err;
+	char nbuf[16];
 	struct ext4_buddy e4b;
 	struct ext4_group_info *grinfo;
 	unsigned char blocksize_bits = min_t(unsigned char,
@@ -3043,23 +3043,26 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
 	if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
 		err = ext4_mb_load_buddy(sb, group, &e4b);
 		if (err) {
-			seq_printf(seq, "#%-5u: I/O error\n", group);
+			seq_printf(seq, "#%-5u: %s\n", group, ext4_decode_error(NULL, err, nbuf));
 			return 0;
 		}
-		buddy_loaded = 1;
+		ext4_mb_unload_buddy(&e4b);
 	}
 
+	/*
+	 * We care only about free space counters in the group info and
+	 * these are safe to access even after the buddy has been unloaded
+	 */
 	memcpy(&sg, grinfo, i);
-
-	if (buddy_loaded)
-		ext4_mb_unload_buddy(&e4b);
-
 	seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
 			sg.info.bb_fragments, sg.info.bb_first_free);
 	for (i = 0; i <= 13; i++)
 		seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
 				sg.info.bb_counters[i] : 0);
-	seq_puts(seq, " ]\n");
+	seq_puts(seq, " ]");
+	if (EXT4_MB_GRP_BBITMAP_CORRUPT(&sg.info))
+		seq_puts(seq, " Block bitmap corrupted!");
+	seq_puts(seq, "\n");
 
 	return 0;
 }
@@ -3829,8 +3832,7 @@ void ext4_mb_release(struct super_block *sb)
 }
 
 static inline int ext4_issue_discard(struct super_block *sb,
-		ext4_group_t block_group, ext4_grpblk_t cluster, int count,
-		struct bio **biop)
+		ext4_group_t block_group, ext4_grpblk_t cluster, int count)
 {
 	ext4_fsblk_t discard_block;
 
@@ -3839,13 +3841,8 @@ static inline int ext4_issue_discard(struct super_block *sb,
 	count = EXT4_C2B(EXT4_SB(sb), count);
 	trace_ext4_discard_blocks(sb,
 			(unsigned long long) discard_block, count);
-	if (biop) {
-		return __blkdev_issue_discard(sb->s_bdev,
-			(sector_t)discard_block << (sb->s_blocksize_bits - 9),
-			(sector_t)count << (sb->s_blocksize_bits - 9),
-			GFP_NOFS, biop);
-	} else
-		return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
+
+	return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
 }
 
 static void ext4_free_data_in_buddy(struct super_block *sb,
@@ -5169,10 +5166,16 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
 			.fe_len = ac->ac_orig_goal_len,
 		};
 		loff_t orig_goal_end = extent_logical_end(sbi, &ex);
+		loff_t o_ex_end = extent_logical_end(sbi, &ac->ac_o_ex);
 
-		/* we can't allocate as much as normalizer wants.
-		 * so, found space must get proper lstart
-		 * to cover original request */
+		/*
+		 * We can't allocate as much as normalizer wants, so we try
+		 * to get proper lstart to cover the original request, except
+		 * when the goal doesn't cover the original request as below:
+		 *
+		 * orig_ex:2045/2055(10), isize:8417280 -> normalized:0/2048
+		 * best_ex:0/200(200) -> adjusted: 1848/2048(200)
+		 */
 		BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
 		BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
 
@@ -5184,7 +5187,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
 		 * 1. Check if best ex can be kept at end of goal (before
 		 *    cr_best_avail trimmed it) and still cover original start
 		 * 2. Else, check if best ex can be kept at start of goal and
-		 *    still cover original start
+		 *    still cover original end
 		 * 3. Else, keep the best ex at start of original request.
 		 */
 		ex.fe_len = ac->ac_b_ex.fe_len;
@@ -5194,7 +5197,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
 			goto adjust_bex;
 
 		ex.fe_logical = ac->ac_g_ex.fe_logical;
-		if (ac->ac_o_ex.fe_logical < extent_logical_end(sbi, &ex))
+		if (o_ex_end <= extent_logical_end(sbi, &ex))
 			goto adjust_bex;
 
 		ex.fe_logical = ac->ac_o_ex.fe_logical;
@@ -5202,7 +5205,6 @@ adjust_bex:
 		ac->ac_b_ex.fe_logical = ex.fe_logical;
 
 		BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
-		BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
 		BUG_ON(extent_logical_end(sbi, &ex) > orig_goal_end);
 	}
 
@@ -6487,8 +6489,14 @@ do_more:
 	} else {
 		if (test_opt(sb, DISCARD)) {
 			err = ext4_issue_discard(sb, block_group, bit,
-						 count_clusters, NULL);
-			if (err && err != -EOPNOTSUPP)
+						 count_clusters);
+			/*
+			 * Ignore EOPNOTSUPP error. This is consistent with
+			 * what happens when using journal.
+			 */
+			if (err == -EOPNOTSUPP)
+				err = 0;
+			if (err)
 				ext4_msg(sb, KERN_WARNING, "discard request in"
 					 " group:%u block:%d count:%lu failed"
 					 " with %d", block_group, bit, count,
@@ -6738,7 +6746,7 @@ __acquires(bitlock)
 	 */
 	mb_mark_used(e4b, &ex);
 	ext4_unlock_group(sb, group);
-	ret = ext4_issue_discard(sb, group, start, count, NULL);
+	ret = ext4_issue_discard(sb, group, start, count);
 	ext4_lock_group(sb, group);
 	mb_free_blocks(NULL, e4b, start, ex.fe_len);
 	return ret;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 05b647e6bc19..5e4f65c14dfb 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1762,7 +1762,6 @@ static struct buffer_head *ext4_lookup_entry(struct inode *dir,
 	struct buffer_head *bh;
 
 	err = ext4_fname_prepare_lookup(dir, dentry, &fname);
-	generic_set_encrypted_ci_d_ops(dentry);
 	if (err == -ENOENT)
 		return NULL;
 	if (err)
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 4d4a5a32e310..0ba9837d65ca 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1602,7 +1602,8 @@ exit_journal:
 		int gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
 		int gdb_num_end = ((group + flex_gd->count - 1) /
 				   EXT4_DESC_PER_BLOCK(sb));
-		int meta_bg = ext4_has_feature_meta_bg(sb);
+		int meta_bg = ext4_has_feature_meta_bg(sb) &&
+			      gdb_num >= le32_to_cpu(es->s_first_meta_bg);
 		sector_t padding_blocks = meta_bg ? 0 : sbi->s_sbh->b_blocknr -
 					 ext4_group_first_block_no(sb, 0);
 
@@ -2084,7 +2085,7 @@ retry:
 		}
 	}
 
-	if ((!resize_inode && !meta_bg) || n_blocks_count == o_blocks_count) {
+	if ((!resize_inode && !meta_bg && n_desc_blocks > o_desc_blocks) || n_blocks_count == o_blocks_count) {
 		err = ext4_convert_meta_bg(sb, resize_inode);
 		if (err)
 			goto out;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0f931d0c227d..044135796f2b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1359,14 +1359,14 @@ static void ext4_put_super(struct super_block *sb)
 
 	sync_blockdev(sb->s_bdev);
 	invalidate_bdev(sb->s_bdev);
-	if (sbi->s_journal_bdev_handle) {
+	if (sbi->s_journal_bdev_file) {
 		/*
 		 * Invalidate the journal device's buffers.  We don't want them
 		 * floating about in memory - the physical journal device may
 		 * hotswapped, and it breaks the `ro-after' testing code.
 		 */
-		sync_blockdev(sbi->s_journal_bdev_handle->bdev);
-		invalidate_bdev(sbi->s_journal_bdev_handle->bdev);
+		sync_blockdev(file_bdev(sbi->s_journal_bdev_file));
+		invalidate_bdev(file_bdev(sbi->s_journal_bdev_file));
 	}
 
 	ext4_xattr_destroy_cache(sbi->s_ea_inode_cache);
@@ -1500,8 +1500,7 @@ static int __init init_inodecache(void)
 {
 	ext4_inode_cachep = kmem_cache_create_usercopy("ext4_inode_cache",
 				sizeof(struct ext4_inode_info), 0,
-				(SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
-					SLAB_ACCOUNT),
+				SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
 				offsetof(struct ext4_inode_info, i_data),
 				sizeof_field(struct ext4_inode_info, i_data),
 				init_once);
@@ -1600,7 +1599,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
 			     unsigned int flags);
 
-static struct dquot **ext4_get_dquots(struct inode *inode)
+static struct dquot __rcu **ext4_get_dquots(struct inode *inode)
 {
 	return EXT4_I(inode)->i_dquot;
 }
@@ -4233,7 +4232,7 @@ int ext4_calculate_overhead(struct super_block *sb)
 	 * Add the internal journal blocks whether the journal has been
 	 * loaded or not
 	 */
-	if (sbi->s_journal && !sbi->s_journal_bdev_handle)
+	if (sbi->s_journal && !sbi->s_journal_bdev_file)
 		overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len);
 	else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) {
 		/* j_inum for internal journal is non-zero */
@@ -4422,22 +4421,6 @@ static int ext4_handle_clustersize(struct super_block *sb)
 		}
 		sbi->s_cluster_bits = le32_to_cpu(es->s_log_cluster_size) -
 			le32_to_cpu(es->s_log_block_size);
-		sbi->s_clusters_per_group =
-			le32_to_cpu(es->s_clusters_per_group);
-		if (sbi->s_clusters_per_group > sb->s_blocksize * 8) {
-			ext4_msg(sb, KERN_ERR,
-				 "#clusters per group too big: %lu",
-				 sbi->s_clusters_per_group);
-			return -EINVAL;
-		}
-		if (sbi->s_blocks_per_group !=
-		    (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) {
-			ext4_msg(sb, KERN_ERR, "blocks per group (%lu) and "
-				 "clusters per group (%lu) inconsistent",
-				 sbi->s_blocks_per_group,
-				 sbi->s_clusters_per_group);
-			return -EINVAL;
-		}
 	} else {
 		if (clustersize != sb->s_blocksize) {
 			ext4_msg(sb, KERN_ERR,
@@ -4451,9 +4434,21 @@ static int ext4_handle_clustersize(struct super_block *sb)
 				 sbi->s_blocks_per_group);
 			return -EINVAL;
 		}
-		sbi->s_clusters_per_group = sbi->s_blocks_per_group;
 		sbi->s_cluster_bits = 0;
 	}
+	sbi->s_clusters_per_group = le32_to_cpu(es->s_clusters_per_group);
+	if (sbi->s_clusters_per_group > sb->s_blocksize * 8) {
+		ext4_msg(sb, KERN_ERR, "#clusters per group too big: %lu",
+			 sbi->s_clusters_per_group);
+		return -EINVAL;
+	}
+	if (sbi->s_blocks_per_group !=
+	    (sbi->s_clusters_per_group * (clustersize / sb->s_blocksize))) {
+		ext4_msg(sb, KERN_ERR,
+			 "blocks per group (%lu) and clusters per group (%lu) inconsistent",
+			 sbi->s_blocks_per_group, sbi->s_clusters_per_group);
+		return -EINVAL;
+	}
 	sbi->s_cluster_ratio = clustersize / sb->s_blocksize;
 
 	/* Do we have standard group size of clustersize * 8 blocks ? */
@@ -5346,7 +5341,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 		sb->s_qcop = &ext4_qctl_operations;
 	sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
 #endif
-	memcpy(&sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
+	super_set_uuid(sb, es->s_uuid, sizeof(es->s_uuid));
 
 	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
 	mutex_init(&sbi->s_orphan_lock);
@@ -5484,6 +5479,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
 		goto failed_mount4;
 	}
 
+	generic_set_sb_d_ops(sb);
 	sb->s_root = d_make_root(root);
 	if (!sb->s_root) {
 		ext4_msg(sb, KERN_ERR, "get root dentry failed");
@@ -5670,9 +5666,9 @@ failed_mount:
 #endif
 	fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy);
 	brelse(sbi->s_sbh);
-	if (sbi->s_journal_bdev_handle) {
-		invalidate_bdev(sbi->s_journal_bdev_handle->bdev);
-		bdev_release(sbi->s_journal_bdev_handle);
+	if (sbi->s_journal_bdev_file) {
+		invalidate_bdev(file_bdev(sbi->s_journal_bdev_file));
+		bdev_fput(sbi->s_journal_bdev_file);
 	}
 out_fail:
 	invalidate_bdev(sb->s_bdev);
@@ -5842,30 +5838,30 @@ static journal_t *ext4_open_inode_journal(struct super_block *sb,
 	return journal;
 }
 
-static struct bdev_handle *ext4_get_journal_blkdev(struct super_block *sb,
+static struct file *ext4_get_journal_blkdev(struct super_block *sb,
 					dev_t j_dev, ext4_fsblk_t *j_start,
 					ext4_fsblk_t *j_len)
 {
 	struct buffer_head *bh;
 	struct block_device *bdev;
-	struct bdev_handle *bdev_handle;
+	struct file *bdev_file;
 	int hblock, blocksize;
 	ext4_fsblk_t sb_block;
 	unsigned long offset;
 	struct ext4_super_block *es;
 	int errno;
 
-	bdev_handle = bdev_open_by_dev(j_dev,
+	bdev_file = bdev_file_open_by_dev(j_dev,
 		BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES,
 		sb, &fs_holder_ops);
-	if (IS_ERR(bdev_handle)) {
+	if (IS_ERR(bdev_file)) {
 		ext4_msg(sb, KERN_ERR,
 			 "failed to open journal device unknown-block(%u,%u) %ld",
-			 MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_handle));
-		return bdev_handle;
+			 MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_file));
+		return bdev_file;
 	}
 
-	bdev = bdev_handle->bdev;
+	bdev = file_bdev(bdev_file);
 	blocksize = sb->s_blocksize;
 	hblock = bdev_logical_block_size(bdev);
 	if (blocksize < hblock) {
@@ -5912,12 +5908,12 @@ static struct bdev_handle *ext4_get_journal_blkdev(struct super_block *sb,
 	*j_start = sb_block + 1;
 	*j_len = ext4_blocks_count(es);
 	brelse(bh);
-	return bdev_handle;
+	return bdev_file;
 
 out_bh:
 	brelse(bh);
 out_bdev:
-	bdev_release(bdev_handle);
+	bdev_fput(bdev_file);
 	return ERR_PTR(errno);
 }
 
@@ -5927,14 +5923,14 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb,
 	journal_t *journal;
 	ext4_fsblk_t j_start;
 	ext4_fsblk_t j_len;
-	struct bdev_handle *bdev_handle;
+	struct file *bdev_file;
 	int errno = 0;
 
-	bdev_handle = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len);
-	if (IS_ERR(bdev_handle))
-		return ERR_CAST(bdev_handle);
+	bdev_file = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len);
+	if (IS_ERR(bdev_file))
+		return ERR_CAST(bdev_file);
 
-	journal = jbd2_journal_init_dev(bdev_handle->bdev, sb->s_bdev, j_start,
+	journal = jbd2_journal_init_dev(file_bdev(bdev_file), sb->s_bdev, j_start,
 					j_len, sb->s_blocksize);
 	if (IS_ERR(journal)) {
 		ext4_msg(sb, KERN_ERR, "failed to create device journal");
@@ -5949,14 +5945,14 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb,
 		goto out_journal;
 	}
 	journal->j_private = sb;
-	EXT4_SB(sb)->s_journal_bdev_handle = bdev_handle;
+	EXT4_SB(sb)->s_journal_bdev_file = bdev_file;
 	ext4_init_journal_params(sb, journal);
 	return journal;
 
 out_journal:
 	jbd2_journal_destroy(journal);
 out_bdev:
-	bdev_release(bdev_handle);
+	bdev_fput(bdev_file);
 	return ERR_PTR(errno);
 }
 
@@ -6864,6 +6860,10 @@ static int ext4_write_dquot(struct dquot *dquot)
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 	ret = dquot_commit(dquot);
+	if (ret < 0)
+		ext4_error_err(dquot->dq_sb, -ret,
+			       "Failed to commit dquot type %d",
+			       dquot->dq_id.type);
 	err = ext4_journal_stop(handle);
 	if (!ret)
 		ret = err;
@@ -6880,6 +6880,10 @@ static int ext4_acquire_dquot(struct dquot *dquot)
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 	ret = dquot_acquire(dquot);
+	if (ret < 0)
+		ext4_error_err(dquot->dq_sb, -ret,
+			      "Failed to acquire dquot type %d",
+			      dquot->dq_id.type);
 	err = ext4_journal_stop(handle);
 	if (!ret)
 		ret = err;
@@ -6899,6 +6903,10 @@ static int ext4_release_dquot(struct dquot *dquot)
 		return PTR_ERR(handle);
 	}
 	ret = dquot_release(dquot);
+	if (ret < 0)
+		ext4_error_err(dquot->dq_sb, -ret,
+			       "Failed to release dquot type %d",
+			       dquot->dq_id.type);
 	err = ext4_journal_stop(handle);
 	if (!ret)
 		ret = err;
@@ -7314,12 +7322,12 @@ static inline int ext3_feature_set_ok(struct super_block *sb)
 static void ext4_kill_sb(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct bdev_handle *handle = sbi ? sbi->s_journal_bdev_handle : NULL;
+	struct file *bdev_file = sbi ? sbi->s_journal_bdev_file : NULL;
 
 	kill_block_super(sb);
 
-	if (handle)
-		bdev_release(handle);
+	if (bdev_file)
+		bdev_fput(bdev_file);
 }
 
 static struct file_system_type ext4_fs_type = {
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 82dc5e673d5c..b67a176bfcf9 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1565,46 +1565,49 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
 /*
  * Add value of the EA in an inode.
  */
-static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode,
-					  const void *value, size_t value_len,
-					  struct inode **ret_inode)
+static struct inode *ext4_xattr_inode_lookup_create(handle_t *handle,
+		struct inode *inode, const void *value, size_t value_len)
 {
 	struct inode *ea_inode;
 	u32 hash;
 	int err;
 
+	/* Account inode & space to quota even if sharing... */
+	err = ext4_xattr_inode_alloc_quota(inode, value_len);
+	if (err)
+		return ERR_PTR(err);
+
 	hash = ext4_xattr_inode_hash(EXT4_SB(inode->i_sb), value, value_len);
 	ea_inode = ext4_xattr_inode_cache_find(inode, value, value_len, hash);
 	if (ea_inode) {
 		err = ext4_xattr_inode_inc_ref(handle, ea_inode);
-		if (err) {
-			iput(ea_inode);
-			return err;
-		}
-
-		*ret_inode = ea_inode;
-		return 0;
+		if (err)
+			goto out_err;
+		return ea_inode;
 	}
 
 	/* Create an inode for the EA value */
 	ea_inode = ext4_xattr_inode_create(handle, inode, hash);
-	if (IS_ERR(ea_inode))
-		return PTR_ERR(ea_inode);
+	if (IS_ERR(ea_inode)) {
+		ext4_xattr_inode_free_quota(inode, NULL, value_len);
+		return ea_inode;
+	}
 
 	err = ext4_xattr_inode_write(handle, ea_inode, value, value_len);
 	if (err) {
 		if (ext4_xattr_inode_dec_ref(handle, ea_inode))
 			ext4_warning_inode(ea_inode, "cleanup dec ref error %d", err);
-		iput(ea_inode);
-		return err;
+		goto out_err;
 	}
 
 	if (EA_INODE_CACHE(inode))
 		mb_cache_entry_create(EA_INODE_CACHE(inode), GFP_NOFS, hash,
 				      ea_inode->i_ino, true /* reusable */);
-
-	*ret_inode = ea_inode;
-	return 0;
+	return ea_inode;
+out_err:
+	iput(ea_inode);
+	ext4_xattr_inode_free_quota(inode, NULL, value_len);
+	return ERR_PTR(err);
 }
 
 /*
@@ -1712,16 +1715,11 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
 	if (i->value && in_inode) {
 		WARN_ON_ONCE(!i->value_len);
 
-		ret = ext4_xattr_inode_alloc_quota(inode, i->value_len);
-		if (ret)
-			goto out;
-
-		ret = ext4_xattr_inode_lookup_create(handle, inode, i->value,
-						     i->value_len,
-						     &new_ea_inode);
-		if (ret) {
+		new_ea_inode = ext4_xattr_inode_lookup_create(handle, inode,
+					i->value, i->value_len);
+		if (IS_ERR(new_ea_inode)) {
+			ret = PTR_ERR(new_ea_inode);
 			new_ea_inode = NULL;
-			ext4_xattr_inode_free_quota(inode, NULL, i->value_len);
 			goto out;
 		}
 	}
@@ -2160,17 +2158,6 @@ getblk_failed:
 						      ENTRY(header(s->base)+1));
 			if (error)
 				goto getblk_failed;
-			if (ea_inode) {
-				/* Drop the extra ref on ea_inode. */
-				error = ext4_xattr_inode_dec_ref(handle,
-								 ea_inode);
-				if (error)
-					ext4_warning_inode(ea_inode,
-							   "dec ref error=%d",
-							   error);
-				iput(ea_inode);
-				ea_inode = NULL;
-			}
 
 			lock_buffer(new_bh);
 			error = ext4_journal_get_create_access(handle, sb,
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index b0597a539fc5..eac698b8dd38 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -154,49 +154,47 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr,
 	if (unlikely(f2fs_cp_error(sbi)))
 		return exist;
 
-	if (exist && type == DATA_GENERIC_ENHANCE_UPDATE) {
-		f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d",
-			 blkaddr, exist);
-		set_sbi_flag(sbi, SBI_NEED_FSCK);
-		return exist;
-	}
+	if ((exist && type == DATA_GENERIC_ENHANCE_UPDATE) ||
+		(!exist && type == DATA_GENERIC_ENHANCE))
+		goto out_err;
+	if (!exist && type != DATA_GENERIC_ENHANCE_UPDATE)
+		goto out_handle;
+	return exist;
 
-	if (!exist && type == DATA_GENERIC_ENHANCE) {
-		f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d",
-			 blkaddr, exist);
-		set_sbi_flag(sbi, SBI_NEED_FSCK);
-		dump_stack();
-	}
+out_err:
+	f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d",
+		 blkaddr, exist);
+	set_sbi_flag(sbi, SBI_NEED_FSCK);
+	dump_stack();
+out_handle:
+	f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
 	return exist;
 }
 
-bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
+static bool __f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
 					block_t blkaddr, int type)
 {
-	if (time_to_inject(sbi, FAULT_BLKADDR))
-		return false;
-
 	switch (type) {
 	case META_NAT:
 		break;
 	case META_SIT:
 		if (unlikely(blkaddr >= SIT_BLK_CNT(sbi)))
-			return false;
+			goto err;
 		break;
 	case META_SSA:
 		if (unlikely(blkaddr >= MAIN_BLKADDR(sbi) ||
 			blkaddr < SM_I(sbi)->ssa_blkaddr))
-			return false;
+			goto err;
 		break;
 	case META_CP:
 		if (unlikely(blkaddr >= SIT_I(sbi)->sit_base_addr ||
 			blkaddr < __start_cp_addr(sbi)))
-			return false;
+			goto err;
 		break;
 	case META_POR:
 		if (unlikely(blkaddr >= MAX_BLKADDR(sbi) ||
 			blkaddr < MAIN_BLKADDR(sbi)))
-			return false;
+			goto err;
 		break;
 	case DATA_GENERIC:
 	case DATA_GENERIC_ENHANCE:
@@ -213,7 +211,7 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
 				  blkaddr);
 			set_sbi_flag(sbi, SBI_NEED_FSCK);
 			dump_stack();
-			return false;
+			goto err;
 		} else {
 			return __is_bitmap_valid(sbi, blkaddr, type);
 		}
@@ -221,13 +219,30 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
 	case META_GENERIC:
 		if (unlikely(blkaddr < SEG0_BLKADDR(sbi) ||
 			blkaddr >= MAIN_BLKADDR(sbi)))
-			return false;
+			goto err;
 		break;
 	default:
 		BUG();
 	}
 
 	return true;
+err:
+	f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
+	return false;
+}
+
+bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
+					block_t blkaddr, int type)
+{
+	if (time_to_inject(sbi, FAULT_BLKADDR_VALIDITY))
+		return false;
+	return __f2fs_is_valid_blkaddr(sbi, blkaddr, type);
+}
+
+bool f2fs_is_valid_blkaddr_raw(struct f2fs_sb_info *sbi,
+					block_t blkaddr, int type)
+{
+	return __f2fs_is_valid_blkaddr(sbi, blkaddr, type);
 }
 
 /*
@@ -889,7 +904,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
 
 	cp_blocks = le32_to_cpu(cp_block->cp_pack_total_block_count);
 
-	if (cp_blocks > sbi->blocks_per_seg || cp_blocks <= F2FS_CP_PACKS) {
+	if (cp_blocks > BLKS_PER_SEG(sbi) || cp_blocks <= F2FS_CP_PACKS) {
 		f2fs_warn(sbi, "invalid cp_pack_total_block_count:%u",
 			  le32_to_cpu(cp_block->cp_pack_total_block_count));
 		goto invalid_cp;
@@ -1324,7 +1339,7 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 
 	if (cpc->reason & CP_UMOUNT) {
 		if (le32_to_cpu(ckpt->cp_pack_total_block_count) +
-			NM_I(sbi)->nat_bits_blocks > sbi->blocks_per_seg) {
+			NM_I(sbi)->nat_bits_blocks > BLKS_PER_SEG(sbi)) {
 			clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
 			f2fs_notice(sbi, "Disable nat_bits due to no space");
 		} else if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG) &&
@@ -1527,7 +1542,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 		cp_ver |= ((__u64)crc32 << 32);
 		*(__le64 *)nm_i->nat_bits = cpu_to_le64(cp_ver);
 
-		blk = start_blk + sbi->blocks_per_seg - nm_i->nat_bits_blocks;
+		blk = start_blk + BLKS_PER_SEG(sbi) - nm_i->nat_bits_blocks;
 		for (i = 0; i < nm_i->nat_bits_blocks; i++)
 			f2fs_update_meta_page(sbi, nm_i->nat_bits +
 					(i << F2FS_BLKSIZE_BITS), blk + i);
@@ -1587,8 +1602,9 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	 */
 	if (f2fs_sb_has_encrypt(sbi) || f2fs_sb_has_verity(sbi) ||
 		f2fs_sb_has_compression(sbi))
-		invalidate_mapping_pages(META_MAPPING(sbi),
-				MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1);
+		f2fs_bug_on(sbi,
+			invalidate_inode_pages2_range(META_MAPPING(sbi),
+				MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1));
 
 	f2fs_release_ino_entry(sbi, false);
 
@@ -1730,9 +1746,9 @@ void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi)
 		im->ino_num = 0;
 	}
 
-	sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS -
+	sbi->max_orphans = (BLKS_PER_SEG(sbi) - F2FS_CP_PACKS -
 			NR_CURSEG_PERSIST_TYPE - __cp_payload(sbi)) *
-				F2FS_ORPHANS_PER_BLOCK;
+			F2FS_ORPHANS_PER_BLOCK;
 }
 
 int __init f2fs_create_checkpoint_caches(void)
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 531517dac079..8892c8262141 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -512,8 +512,8 @@ static int lzorle_compress_pages(struct compress_ctx *cc)
 	ret = lzorle1x_1_compress(cc->rbuf, cc->rlen, cc->cbuf->cdata,
 					&cc->clen, cc->private);
 	if (ret != LZO_E_OK) {
-		printk_ratelimited("%sF2FS-fs (%s): lzo-rle compress failed, ret:%d\n",
-				KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id, ret);
+		f2fs_err_ratelimited(F2FS_I_SB(cc->inode),
+				"lzo-rle compress failed, ret:%d", ret);
 		return -EIO;
 	}
 	return 0;
@@ -780,9 +780,9 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task)
 		if (provided != calculated) {
 			if (!is_inode_flag_set(dic->inode, FI_COMPRESS_CORRUPT)) {
 				set_inode_flag(dic->inode, FI_COMPRESS_CORRUPT);
-				printk_ratelimited(
-					"%sF2FS-fs (%s): checksum invalid, nid = %lu, %x vs %x",
-					KERN_INFO, sbi->sb->s_id, dic->inode->i_ino,
+				f2fs_info_ratelimited(sbi,
+					"checksum invalid, nid = %lu, %x vs %x",
+					dic->inode->i_ino,
 					provided, calculated);
 			}
 			set_sbi_flag(sbi, SBI_NEED_FSCK);
@@ -1418,6 +1418,8 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page)
 	struct f2fs_sb_info *sbi = bio->bi_private;
 	struct compress_io_ctx *cic =
 			(struct compress_io_ctx *)page_private(page);
+	enum count_type type = WB_DATA_TYPE(page,
+				f2fs_is_compressed_page(page));
 	int i;
 
 	if (unlikely(bio->bi_status))
@@ -1425,7 +1427,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page)
 
 	f2fs_compress_free_page(page);
 
-	dec_page_count(sbi, F2FS_WB_DATA);
+	dec_page_count(sbi, type);
 
 	if (atomic_dec_return(&cic->pending_pages))
 		return;
@@ -1441,12 +1443,14 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page)
 }
 
 static int f2fs_write_raw_pages(struct compress_ctx *cc,
-					int *submitted,
+					int *submitted_p,
 					struct writeback_control *wbc,
 					enum iostat_type io_type)
 {
 	struct address_space *mapping = cc->inode->i_mapping;
-	int _submitted, compr_blocks, ret, i;
+	struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
+	int submitted, compr_blocks, i;
+	int ret = 0;
 
 	compr_blocks = f2fs_compressed_blocks(cc);
 
@@ -1461,6 +1465,10 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc,
 	if (compr_blocks < 0)
 		return compr_blocks;
 
+	/* overwrite compressed cluster w/ normal cluster */
+	if (compr_blocks > 0)
+		f2fs_lock_op(sbi);
+
 	for (i = 0; i < cc->cluster_size; i++) {
 		if (!cc->rpages[i])
 			continue;
@@ -1485,7 +1493,7 @@ continue_unlock:
 		if (!clear_page_dirty_for_io(cc->rpages[i]))
 			goto continue_unlock;
 
-		ret = f2fs_write_single_data_page(cc->rpages[i], &_submitted,
+		ret = f2fs_write_single_data_page(cc->rpages[i], &submitted,
 						NULL, NULL, wbc, io_type,
 						compr_blocks, false);
 		if (ret) {
@@ -1493,26 +1501,29 @@ continue_unlock:
 				unlock_page(cc->rpages[i]);
 				ret = 0;
 			} else if (ret == -EAGAIN) {
+				ret = 0;
 				/*
 				 * for quota file, just redirty left pages to
 				 * avoid deadlock caused by cluster update race
 				 * from foreground operation.
 				 */
 				if (IS_NOQUOTA(cc->inode))
-					return 0;
-				ret = 0;
+					goto out;
 				f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT);
 				goto retry_write;
 			}
-			return ret;
+			goto out;
 		}
 
-		*submitted += _submitted;
+		*submitted_p += submitted;
 	}
 
-	f2fs_balance_fs(F2FS_M_SB(mapping), true);
+out:
+	if (compr_blocks > 0)
+		f2fs_unlock_op(sbi);
 
-	return 0;
+	f2fs_balance_fs(sbi, true);
+	return ret;
 }
 
 int f2fs_write_multi_pages(struct compress_ctx *cc,
@@ -1806,16 +1817,18 @@ void f2fs_put_page_dic(struct page *page, bool in_task)
  * check whether cluster blocks are contiguous, and add extent cache entry
  * only if cluster blocks are logically and physically contiguous.
  */
-unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn)
+unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn,
+						unsigned int ofs_in_node)
 {
-	bool compressed = f2fs_data_blkaddr(dn) == COMPRESS_ADDR;
+	bool compressed = data_blkaddr(dn->inode, dn->node_page,
+					ofs_in_node) == COMPRESS_ADDR;
 	int i = compressed ? 1 : 0;
 	block_t first_blkaddr = data_blkaddr(dn->inode, dn->node_page,
-						dn->ofs_in_node + i);
+							ofs_in_node + i);
 
 	for (i += 1; i < F2FS_I(dn->inode)->i_cluster_size; i++) {
 		block_t blkaddr = data_blkaddr(dn->inode, dn->node_page,
-						dn->ofs_in_node + i);
+							ofs_in_node + i);
 
 		if (!__is_valid_data_blkaddr(blkaddr))
 			break;
@@ -1878,12 +1891,8 @@ void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page,
 
 	set_page_private_data(cpage, ino);
 
-	if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE_READ))
-		goto out;
-
 	memcpy(page_address(cpage), page_address(page), PAGE_SIZE);
 	SetPageUptodate(cpage);
-out:
 	f2fs_put_page(cpage, 1);
 }
 
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 26e317696b33..d9494b5fc7c1 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -48,7 +48,7 @@ void f2fs_destroy_bioset(void)
 	bioset_exit(&f2fs_bioset);
 }
 
-static bool __is_cp_guaranteed(struct page *page)
+bool f2fs_is_cp_guaranteed(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 	struct inode *inode;
@@ -65,8 +65,6 @@ static bool __is_cp_guaranteed(struct page *page)
 			S_ISDIR(inode->i_mode))
 		return true;
 
-	if (f2fs_is_compressed_page(page))
-		return false;
 	if ((S_ISREG(inode->i_mode) && IS_NOQUOTA(inode)) ||
 			page_private_gcing(page))
 		return true;
@@ -338,18 +336,7 @@ static void f2fs_write_end_io(struct bio *bio)
 
 	bio_for_each_segment_all(bvec, bio, iter_all) {
 		struct page *page = bvec->bv_page;
-		enum count_type type = WB_DATA_TYPE(page);
-
-		if (page_private_dummy(page)) {
-			clear_page_private_dummy(page);
-			unlock_page(page);
-			mempool_free(page, sbi->write_io_dummy);
-
-			if (unlikely(bio->bi_status))
-				f2fs_stop_checkpoint(sbi, true,
-						STOP_CP_REASON_WRITE_FAIL);
-			continue;
-		}
+		enum count_type type = WB_DATA_TYPE(page, false);
 
 		fscrypt_finalize_bounce_page(&page);
 
@@ -524,50 +511,13 @@ void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
 	submit_bio(bio);
 }
 
-static void f2fs_align_write_bio(struct f2fs_sb_info *sbi, struct bio *bio)
-{
-	unsigned int start =
-		(bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS) % F2FS_IO_SIZE(sbi);
-
-	if (start == 0)
-		return;
-
-	/* fill dummy pages */
-	for (; start < F2FS_IO_SIZE(sbi); start++) {
-		struct page *page =
-			mempool_alloc(sbi->write_io_dummy,
-				      GFP_NOIO | __GFP_NOFAIL);
-		f2fs_bug_on(sbi, !page);
-
-		lock_page(page);
-
-		zero_user_segment(page, 0, PAGE_SIZE);
-		set_page_private_dummy(page);
-
-		if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
-			f2fs_bug_on(sbi, 1);
-	}
-}
-
 static void f2fs_submit_write_bio(struct f2fs_sb_info *sbi, struct bio *bio,
 				  enum page_type type)
 {
 	WARN_ON_ONCE(is_read_io(bio_op(bio)));
 
-	if (type == DATA || type == NODE) {
-		if (f2fs_lfs_mode(sbi) && current->plug)
-			blk_finish_plug(current->plug);
-
-		if (F2FS_IO_ALIGNED(sbi)) {
-			f2fs_align_write_bio(sbi, bio);
-			/*
-			 * In the NODE case, we lose next block address chain.
-			 * So, we need to do checkpoint in f2fs_sync_file.
-			 */
-			if (type == NODE)
-				set_sbi_flag(sbi, SBI_NEED_CP);
-		}
-	}
+	if (f2fs_lfs_mode(sbi) && current->plug && PAGE_TYPE_ON_MAIN(type))
+		blk_finish_plug(current->plug);
 
 	trace_f2fs_submit_write_bio(sbi->sb, type, bio);
 	iostat_update_submit_ctx(bio, type);
@@ -740,10 +690,8 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
 
 	if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr,
 			fio->is_por ? META_POR : (__is_meta_io(fio) ?
-			META_GENERIC : DATA_GENERIC_ENHANCE))) {
-		f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR);
+			META_GENERIC : DATA_GENERIC_ENHANCE)))
 		return -EFSCORRUPTED;
-	}
 
 	trace_f2fs_submit_page_bio(page, fio);
 
@@ -762,7 +710,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
 		wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
 
 	inc_page_count(fio->sbi, is_read_io(fio->op) ?
-			__read_io_type(page) : WB_DATA_TYPE(fio->page));
+			__read_io_type(page) : WB_DATA_TYPE(fio->page, false));
 
 	if (is_read_io(bio_op(bio)))
 		f2fs_submit_read_bio(fio->sbi, bio, fio->type);
@@ -796,16 +744,6 @@ static bool io_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio,
 					block_t last_blkaddr,
 					block_t cur_blkaddr)
 {
-	if (F2FS_IO_ALIGNED(sbi) && (fio->type == DATA || fio->type == NODE)) {
-		unsigned int filled_blocks =
-				F2FS_BYTES_TO_BLK(bio->bi_iter.bi_size);
-		unsigned int io_size = F2FS_IO_SIZE(sbi);
-		unsigned int left_vecs = bio->bi_max_vecs - bio->bi_vcnt;
-
-		/* IOs in bio is aligned and left space of vectors is not enough */
-		if (!(filled_blocks % io_size) && left_vecs < io_size)
-			return false;
-	}
 	if (!page_is_mergeable(sbi, bio, last_blkaddr, cur_blkaddr))
 		return false;
 	return io_type_is_mergeable(io, fio);
@@ -948,10 +886,8 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio)
 			fio->encrypted_page : fio->page;
 
 	if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr,
-			__is_meta_io(fio) ? META_GENERIC : DATA_GENERIC)) {
-		f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR);
+			__is_meta_io(fio) ? META_GENERIC : DATA_GENERIC))
 		return -EFSCORRUPTED;
-	}
 
 	trace_f2fs_submit_page_bio(page, fio);
 
@@ -973,7 +909,7 @@ alloc_new:
 	if (fio->io_wbc)
 		wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
 
-	inc_page_count(fio->sbi, WB_DATA_TYPE(page));
+	inc_page_count(fio->sbi, WB_DATA_TYPE(page, false));
 
 	*fio->last_block = fio->new_blkaddr;
 	*fio->bio = bio;
@@ -1007,11 +943,12 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio)
 	enum page_type btype = PAGE_TYPE_OF_BIO(fio->type);
 	struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp;
 	struct page *bio_page;
+	enum count_type type;
 
 	f2fs_bug_on(sbi, is_read_io(fio->op));
 
 	f2fs_down_write(&io->io_rwsem);
-
+next:
 #ifdef CONFIG_BLK_DEV_ZONED
 	if (f2fs_sb_has_blkzoned(sbi) && btype < META && io->zone_pending_bio) {
 		wait_for_completion_io(&io->zone_wait);
@@ -1021,7 +958,6 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio)
 	}
 #endif
 
-next:
 	if (fio->in_list) {
 		spin_lock(&io->io_lock);
 		if (list_empty(&io->io_list)) {
@@ -1046,7 +982,8 @@ next:
 	/* set submitted = true as a return value */
 	fio->submitted = 1;
 
-	inc_page_count(sbi, WB_DATA_TYPE(bio_page));
+	type = WB_DATA_TYPE(bio_page, fio->compressed_page);
+	inc_page_count(sbi, type);
 
 	if (io->bio &&
 	    (!io_is_mergeable(sbi, io->bio, io, fio, io->last_block_in_bio,
@@ -1056,13 +993,6 @@ next:
 		__submit_merged_bio(io);
 alloc_new:
 	if (io->bio == NULL) {
-		if (F2FS_IO_ALIGNED(sbi) &&
-				(fio->type == DATA || fio->type == NODE) &&
-				fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) {
-			dec_page_count(sbi, WB_DATA_TYPE(bio_page));
-			fio->retry = 1;
-			goto skip;
-		}
 		io->bio = __bio_alloc(fio, BIO_MAX_VECS);
 		f2fs_set_bio_crypt_ctx(io->bio, fio->page->mapping->host,
 				       bio_page->index, fio, GFP_NOIO);
@@ -1080,10 +1010,6 @@ alloc_new:
 	io->last_block_in_bio = fio->new_blkaddr;
 
 	trace_f2fs_submit_page_write(fio->page, fio);
-skip:
-	if (fio->in_list)
-		goto next;
-out:
 #ifdef CONFIG_BLK_DEV_ZONED
 	if (f2fs_sb_has_blkzoned(sbi) && btype < META &&
 			is_end_zone_blkaddr(sbi, fio->new_blkaddr)) {
@@ -1096,6 +1022,9 @@ out:
 		__submit_merged_bio(io);
 	}
 #endif
+	if (fio->in_list)
+		goto next;
+out:
 	if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) ||
 				!f2fs_is_checkpoint_ready(sbi))
 		__submit_merged_bio(io);
@@ -1218,7 +1147,8 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count)
 
 	if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC)))
 		return -EPERM;
-	if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count))))
+	err = inc_valid_block_count(sbi, dn->inode, &count, true);
+	if (unlikely(err))
 		return err;
 
 	trace_f2fs_reserve_new_blocks(dn->inode, dn->nid,
@@ -1285,8 +1215,6 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
 		if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr,
 						DATA_GENERIC_ENHANCE_READ)) {
 			err = -EFSCORRUPTED;
-			f2fs_handle_error(F2FS_I_SB(inode),
-						ERROR_INVALID_BLKADDR);
 			goto put_err;
 		}
 		goto got_it;
@@ -1312,8 +1240,6 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
 						dn.data_blkaddr,
 						DATA_GENERIC_ENHANCE)) {
 		err = -EFSCORRUPTED;
-		f2fs_handle_error(F2FS_I_SB(inode),
-					ERROR_INVALID_BLKADDR);
 		goto put_err;
 	}
 got_it:
@@ -1475,15 +1401,18 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type)
 
 	dn->data_blkaddr = f2fs_data_blkaddr(dn);
 	if (dn->data_blkaddr == NULL_ADDR) {
-		err = inc_valid_block_count(sbi, dn->inode, &count);
+		err = inc_valid_block_count(sbi, dn->inode, &count, true);
 		if (unlikely(err))
 			return err;
 	}
 
 	set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
 	old_blkaddr = dn->data_blkaddr;
-	f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr,
-				&sum, seg_type, NULL);
+	err = f2fs_allocate_data_block(sbi, NULL, old_blkaddr,
+				&dn->data_blkaddr, &sum, seg_type, NULL);
+	if (err)
+		return err;
+
 	if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
 		f2fs_invalidate_internal_cache(sbi, old_blkaddr);
 
@@ -1641,7 +1570,6 @@ next_block:
 	if (!is_hole &&
 	    !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) {
 		err = -EFSCORRUPTED;
-		f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
 		goto sync_out;
 	}
 
@@ -2165,8 +2093,6 @@ got_it:
 		if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr,
 						DATA_GENERIC_ENHANCE_READ)) {
 			ret = -EFSCORRUPTED;
-			f2fs_handle_error(F2FS_I_SB(inode),
-						ERROR_INVALID_BLKADDR);
 			goto out;
 		}
 	} else {
@@ -2668,8 +2594,6 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio)
 	if (fio) {
 		if (page_private_gcing(fio->page))
 			return true;
-		if (page_private_dummy(fio->page))
-			return true;
 		if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
 			f2fs_is_checkpointed_data(sbi, fio->old_blkaddr)))
 			return true;
@@ -2706,11 +2630,8 @@ int f2fs_do_write_data_page(struct f2fs_io_info *fio)
 	    f2fs_lookup_read_extent_cache_block(inode, page->index,
 						&fio->old_blkaddr)) {
 		if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr,
-						DATA_GENERIC_ENHANCE)) {
-			f2fs_handle_error(fio->sbi,
-						ERROR_INVALID_BLKADDR);
+						DATA_GENERIC_ENHANCE))
 			return -EFSCORRUPTED;
-		}
 
 		ipu_force = true;
 		fio->need_lock = LOCK_DONE;
@@ -2738,7 +2659,6 @@ got_it:
 		!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr,
 						DATA_GENERIC_ENHANCE)) {
 		err = -EFSCORRUPTED;
-		f2fs_handle_error(fio->sbi, ERROR_INVALID_BLKADDR);
 		goto out_writepage;
 	}
 
@@ -2838,7 +2758,7 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
 		.encrypted_page = NULL,
 		.submitted = 0,
 		.compr_blocks = compr_blocks,
-		.need_lock = LOCK_RETRY,
+		.need_lock = compr_blocks ? LOCK_DONE : LOCK_RETRY,
 		.post_read = f2fs_post_read_required(inode) ? 1 : 0,
 		.io_type = io_type,
 		.io_wbc = wbc,
@@ -2919,6 +2839,7 @@ write:
 	if (err == -EAGAIN) {
 		err = f2fs_do_write_data_page(&fio);
 		if (err == -EAGAIN) {
+			f2fs_bug_on(sbi, compr_blocks);
 			fio.need_lock = LOCK_REQ;
 			err = f2fs_do_write_data_page(&fio);
 		}
@@ -3704,7 +3625,6 @@ repeat:
 		if (!f2fs_is_valid_blkaddr(sbi, blkaddr,
 				DATA_GENERIC_ENHANCE_READ)) {
 			err = -EFSCORRUPTED;
-			f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
 			goto fail;
 		}
 		err = f2fs_submit_page_read(use_cow ?
@@ -3905,26 +3825,36 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk,
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	unsigned int blkofs;
 	unsigned int blk_per_sec = BLKS_PER_SEC(sbi);
+	unsigned int end_blk = start_blk + blkcnt - 1;
 	unsigned int secidx = start_blk / blk_per_sec;
-	unsigned int end_sec = secidx + blkcnt / blk_per_sec;
+	unsigned int end_sec;
 	int ret = 0;
 
+	if (!blkcnt)
+		return 0;
+	end_sec = end_blk / blk_per_sec;
+
 	f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
 	filemap_invalidate_lock(inode->i_mapping);
 
 	set_inode_flag(inode, FI_ALIGNED_WRITE);
 	set_inode_flag(inode, FI_OPU_WRITE);
 
-	for (; secidx < end_sec; secidx++) {
+	for (; secidx <= end_sec; secidx++) {
+		unsigned int blkofs_end = secidx == end_sec ?
+				end_blk % blk_per_sec : blk_per_sec - 1;
+
 		f2fs_down_write(&sbi->pin_sem);
 
-		f2fs_lock_op(sbi);
-		f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false);
-		f2fs_unlock_op(sbi);
+		ret = f2fs_allocate_pinning_section(sbi);
+		if (ret) {
+			f2fs_up_write(&sbi->pin_sem);
+			break;
+		}
 
 		set_inode_flag(inode, FI_SKIP_WRITES);
 
-		for (blkofs = 0; blkofs < blk_per_sec; blkofs++) {
+		for (blkofs = 0; blkofs <= blkofs_end; blkofs++) {
 			struct page *page;
 			unsigned int blkidx = secidx * blk_per_sec + blkofs;
 
@@ -4013,27 +3943,34 @@ retry:
 		nr_pblocks = map.m_len;
 
 		if ((pblock - SM_I(sbi)->main_blkaddr) & sec_blks_mask ||
-				nr_pblocks & sec_blks_mask) {
+				nr_pblocks & sec_blks_mask ||
+				!f2fs_valid_pinned_area(sbi, pblock)) {
+			bool last_extent = false;
+
 			not_aligned++;
 
 			nr_pblocks = roundup(nr_pblocks, blks_per_sec);
 			if (cur_lblock + nr_pblocks > sis->max)
 				nr_pblocks -= blks_per_sec;
 
+			/* this extent is last one */
 			if (!nr_pblocks) {
-				/* this extent is last one */
-				nr_pblocks = map.m_len;
-				f2fs_warn(sbi, "Swapfile: last extent is not aligned to section");
-				goto next;
+				nr_pblocks = last_lblock - cur_lblock;
+				last_extent = true;
 			}
 
 			ret = f2fs_migrate_blocks(inode, cur_lblock,
 							nr_pblocks);
-			if (ret)
+			if (ret) {
+				if (ret == -ENOENT)
+					ret = -EINVAL;
 				goto out;
-			goto retry;
+			}
+
+			if (!last_extent)
+				goto retry;
 		}
-next:
+
 		if (cur_lblock + nr_pblocks >= sis->max)
 			nr_pblocks = sis->max - cur_lblock;
 
@@ -4071,17 +4008,17 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file,
 				sector_t *span)
 {
 	struct inode *inode = file_inode(file);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	int ret;
 
 	if (!S_ISREG(inode->i_mode))
 		return -EINVAL;
 
-	if (f2fs_readonly(F2FS_I_SB(inode)->sb))
+	if (f2fs_readonly(sbi->sb))
 		return -EROFS;
 
-	if (f2fs_lfs_mode(F2FS_I_SB(inode))) {
-		f2fs_err(F2FS_I_SB(inode),
-			"Swapfile not supported in LFS mode");
+	if (f2fs_lfs_mode(sbi) && !f2fs_sb_has_blkzoned(sbi)) {
+		f2fs_err(sbi, "Swapfile not supported in LFS mode");
 		return -EINVAL;
 	}
 
@@ -4092,6 +4029,10 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file,
 	if (!f2fs_disable_compressed_file(inode))
 		return -EINVAL;
 
+	ret = filemap_fdatawrite(inode->i_mapping);
+	if (ret < 0)
+		return ret;
+
 	f2fs_precache_extents(inode);
 
 	ret = check_swap_activate(sis, file, span);
@@ -4100,7 +4041,7 @@ static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file,
 
 	stat_inc_swapfile_inode(inode);
 	set_inode_flag(inode, FI_PIN_FILE);
-	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+	f2fs_update_time(sbi, REQ_TIME);
 	return ret;
 }
 
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index fdbf994f1271..8b0e1e71b667 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -41,7 +41,7 @@ void f2fs_update_sit_info(struct f2fs_sb_info *sbi)
 	total_vblocks = 0;
 	blks_per_sec = CAP_BLKS_PER_SEC(sbi);
 	hblks_per_sec = blks_per_sec / 2;
-	for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
+	for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) {
 		vblocks = get_valid_blocks(sbi, segno, true);
 		dist = abs(vblocks - hblks_per_sec);
 		bimodal += dist * dist;
@@ -135,7 +135,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 	si->cur_ckpt_time = sbi->cprc_info.cur_time;
 	si->peak_ckpt_time = sbi->cprc_info.peak_time;
 	spin_unlock(&sbi->cprc_info.stat_lock);
-	si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
+	si->total_count = BLKS_TO_SEGS(sbi, (int)sbi->user_block_count);
 	si->rsvd_segs = reserved_segments(sbi);
 	si->overp_segs = overprovision_segments(sbi);
 	si->valid_count = valid_user_blocks(sbi);
@@ -176,11 +176,10 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 	si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID];
 	si->io_skip_bggc = sbi->io_skip_bggc;
 	si->other_skip_bggc = sbi->other_skip_bggc;
-	si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
+	si->util_free = (int)(BLKS_TO_SEGS(sbi, free_user_blocks(sbi)))
 		* 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
 		/ 2;
-	si->util_valid = (int)(written_block_count(sbi) >>
-						sbi->log_blocks_per_seg)
+	si->util_valid = (int)(BLKS_TO_SEGS(sbi, written_block_count(sbi)))
 		* 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
 		/ 2;
 	si->util_invalid = 50 - si->util_free - si->util_valid;
@@ -208,7 +207,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 		if (!blks)
 			continue;
 
-		if (blks == sbi->blocks_per_seg)
+		if (blks == BLKS_PER_SEG(sbi))
 			si->full_seg[type]++;
 		else
 			si->dirty_seg[type]++;
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 042593aed1ec..02c9355176d3 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -830,13 +830,14 @@ int f2fs_do_add_link(struct inode *dir, const struct qstr *name,
 	return err;
 }
 
-int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
+int f2fs_do_tmpfile(struct inode *inode, struct inode *dir,
+					struct f2fs_filename *fname)
 {
 	struct page *page;
 	int err = 0;
 
 	f2fs_down_write(&F2FS_I(inode)->i_sem);
-	page = f2fs_init_inode_metadata(inode, dir, NULL, NULL);
+	page = f2fs_init_inode_metadata(inode, dir, fname, NULL);
 	if (IS_ERR(page)) {
 		err = PTR_ERR(page);
 		goto fail;
@@ -995,9 +996,8 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d,
 		de = &d->dentry[bit_pos];
 		if (de->name_len == 0) {
 			if (found_valid_dirent || !bit_pos) {
-				printk_ratelimited(
-					"%sF2FS-fs (%s): invalid namelen(0), ino:%u, run fsck to fix.",
-					KERN_WARNING, sbi->sb->s_id,
+				f2fs_warn_ratelimited(sbi,
+					"invalid namelen(0), ino:%u, run fsck to fix.",
 					le32_to_cpu(de->ino));
 				set_sbi_flag(sbi, SBI_NEED_FSCK);
 			}
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index ad8dfac73bd4..48048fa36427 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -43,7 +43,6 @@ bool sanity_check_extent_cache(struct inode *inode)
 	if (!f2fs_is_valid_blkaddr(sbi, ei->blk, DATA_GENERIC_ENHANCE) ||
 	    !f2fs_is_valid_blkaddr(sbi, ei->blk + ei->len - 1,
 					DATA_GENERIC_ENHANCE)) {
-		set_sbi_flag(sbi, SBI_NEED_FSCK);
 		f2fs_warn(sbi, "%s: inode (ino=%lx) extent info [%u, %u, %u] is incorrect, run fsck to fix",
 			  __func__, inode->i_ino,
 			  ei->blk, ei->fofs, ei->len);
@@ -856,10 +855,8 @@ static int __get_new_block_age(struct inode *inode, struct extent_info *ei,
 		goto out;
 
 	if (__is_valid_data_blkaddr(blkaddr) &&
-	    !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) {
-		f2fs_bug_on(sbi, 1);
+	    !f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE))
 		return -EINVAL;
-	}
 out:
 	/*
 	 * init block age with zero, this can happen when the block age extent
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 65294e3b0bef..fced2b7652f4 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -24,6 +24,7 @@
 #include <linux/blkdev.h>
 #include <linux/quotaops.h>
 #include <linux/part_stat.h>
+#include <linux/rw_hint.h>
 #include <crypto/hash.h>
 
 #include <linux/fscrypt.h>
@@ -60,7 +61,9 @@ enum {
 	FAULT_SLAB_ALLOC,
 	FAULT_DQUOT_INIT,
 	FAULT_LOCK_OP,
-	FAULT_BLKADDR,
+	FAULT_BLKADDR_VALIDITY,
+	FAULT_BLKADDR_CONSISTENCE,
+	FAULT_NO_SEGMENT,
 	FAULT_MAX,
 };
 
@@ -75,6 +78,11 @@ struct f2fs_fault_info {
 
 extern const char *f2fs_fault_name[FAULT_MAX];
 #define IS_FAULT_SET(fi, type) ((fi)->inject_type & BIT(type))
+
+/* maximum retry count for injected failure */
+#define DEFAULT_FAILURE_RETRY_COUNT		8
+#else
+#define DEFAULT_FAILURE_RETRY_COUNT		1
 #endif
 
 /*
@@ -142,7 +150,6 @@ struct f2fs_rwsem {
 
 struct f2fs_mount_info {
 	unsigned int opt;
-	int write_io_size_bits;		/* Write IO size bits */
 	block_t root_reserved_blocks;	/* root reserved blocks */
 	kuid_t s_resuid;		/* reserved blocks for uid */
 	kgid_t s_resgid;		/* reserved blocks for gid */
@@ -829,7 +836,7 @@ struct f2fs_inode_info {
 	spinlock_t i_size_lock;		/* protect last_disk_size */
 
 #ifdef CONFIG_QUOTA
-	struct dquot *i_dquot[MAXQUOTAS];
+	struct dquot __rcu *i_dquot[MAXQUOTAS];
 
 	/* quota space reservation, managed internally by quota code */
 	qsize_t i_reserved_quota;
@@ -1080,7 +1087,8 @@ struct f2fs_sm_info {
  * f2fs monitors the number of several block types such as on-writeback,
  * dirty dentry blocks, dirty node blocks, and dirty meta blocks.
  */
-#define WB_DATA_TYPE(p)	(__is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA)
+#define WB_DATA_TYPE(p, f)			\
+	(f || f2fs_is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA)
 enum count_type {
 	F2FS_DIRTY_DENTS,
 	F2FS_DIRTY_DATA,
@@ -1110,6 +1118,7 @@ enum count_type {
  * ...			Only can be used with META.
  */
 #define PAGE_TYPE_OF_BIO(type)	((type) > META ? META : (type))
+#define PAGE_TYPE_ON_MAIN(type)	((type) == DATA || (type) == NODE)
 enum page_type {
 	DATA = 0,
 	NODE = 1,	/* should not change this */
@@ -1204,7 +1213,6 @@ struct f2fs_io_info {
 	unsigned int submitted:1;	/* indicate IO submission */
 	unsigned int in_list:1;		/* indicate fio is in io_list */
 	unsigned int is_por:1;		/* indicate IO is from recovery or not */
-	unsigned int retry:1;		/* need to reallocate block address */
 	unsigned int encrypted:1;	/* indicate file is encrypted */
 	unsigned int post_read:1;	/* require post read */
 	enum iostat_type io_type;	/* io type */
@@ -1239,7 +1247,7 @@ struct f2fs_bio_info {
 #define FDEV(i)				(sbi->devs[i])
 #define RDEV(i)				(raw_super->devs[i])
 struct f2fs_dev_info {
-	struct bdev_handle *bdev_handle;
+	struct file *bdev_file;
 	struct block_device *bdev;
 	char path[MAX_PATH_LEN];
 	unsigned int total_segments;
@@ -1406,18 +1414,16 @@ static inline void f2fs_clear_bit(unsigned int nr, char *addr);
  * Layout A: lowest bit should be 1
  * | bit0 = 1 | bit1 | bit2 | ... | bit MAX | private data .... |
  * bit 0	PAGE_PRIVATE_NOT_POINTER
- * bit 1	PAGE_PRIVATE_DUMMY_WRITE
- * bit 2	PAGE_PRIVATE_ONGOING_MIGRATION
- * bit 3	PAGE_PRIVATE_INLINE_INODE
- * bit 4	PAGE_PRIVATE_REF_RESOURCE
- * bit 5-	f2fs private data
+ * bit 1	PAGE_PRIVATE_ONGOING_MIGRATION
+ * bit 2	PAGE_PRIVATE_INLINE_INODE
+ * bit 3	PAGE_PRIVATE_REF_RESOURCE
+ * bit 4-	f2fs private data
  *
  * Layout B: lowest bit should be 0
  * page.private is a wrapped pointer.
  */
 enum {
 	PAGE_PRIVATE_NOT_POINTER,		/* private contains non-pointer data */
-	PAGE_PRIVATE_DUMMY_WRITE,		/* data page for padding aligned IO */
 	PAGE_PRIVATE_ONGOING_MIGRATION,		/* data page which is on-going migrating */
 	PAGE_PRIVATE_INLINE_INODE,		/* inode page contains inline data */
 	PAGE_PRIVATE_REF_RESOURCE,		/* dirty page has referenced resources */
@@ -1564,7 +1570,6 @@ struct f2fs_sb_info {
 	struct f2fs_bio_info *write_io[NR_PAGE_TYPE];	/* for write bios */
 	/* keep migration IO order for LFS mode */
 	struct f2fs_rwsem io_order_lock;
-	mempool_t *write_io_dummy;		/* Dummy pages */
 	pgoff_t page_eio_ofs[NR_PAGE_TYPE];	/* EIO page offset */
 	int page_eio_cnt[NR_PAGE_TYPE];		/* EIO count */
 
@@ -1810,6 +1815,37 @@ struct f2fs_sb_info {
 #endif
 };
 
+/* Definitions to access f2fs_sb_info */
+#define SEGS_TO_BLKS(sbi, segs)					\
+		((segs) << (sbi)->log_blocks_per_seg)
+#define BLKS_TO_SEGS(sbi, blks)					\
+		((blks) >> (sbi)->log_blocks_per_seg)
+
+#define BLKS_PER_SEG(sbi)	((sbi)->blocks_per_seg)
+#define BLKS_PER_SEC(sbi)	(SEGS_TO_BLKS(sbi, (sbi)->segs_per_sec))
+#define SEGS_PER_SEC(sbi)	((sbi)->segs_per_sec)
+
+__printf(3, 4)
+void f2fs_printk(struct f2fs_sb_info *sbi, bool limit_rate, const char *fmt, ...);
+
+#define f2fs_err(sbi, fmt, ...)						\
+	f2fs_printk(sbi, false, KERN_ERR fmt, ##__VA_ARGS__)
+#define f2fs_warn(sbi, fmt, ...)					\
+	f2fs_printk(sbi, false, KERN_WARNING fmt, ##__VA_ARGS__)
+#define f2fs_notice(sbi, fmt, ...)					\
+	f2fs_printk(sbi, false, KERN_NOTICE fmt, ##__VA_ARGS__)
+#define f2fs_info(sbi, fmt, ...)					\
+	f2fs_printk(sbi, false, KERN_INFO fmt, ##__VA_ARGS__)
+#define f2fs_debug(sbi, fmt, ...)					\
+	f2fs_printk(sbi, false, KERN_DEBUG fmt, ##__VA_ARGS__)
+
+#define f2fs_err_ratelimited(sbi, fmt, ...)				\
+	f2fs_printk(sbi, true, KERN_ERR fmt, ##__VA_ARGS__)
+#define f2fs_warn_ratelimited(sbi, fmt, ...)				\
+	f2fs_printk(sbi, true, KERN_WARNING fmt, ##__VA_ARGS__)
+#define f2fs_info_ratelimited(sbi, fmt, ...)				\
+	f2fs_printk(sbi, true, KERN_INFO fmt, ##__VA_ARGS__)
+
 #ifdef CONFIG_F2FS_FAULT_INJECTION
 #define time_to_inject(sbi, type) __time_to_inject(sbi, type, __func__,	\
 									__builtin_return_address(0))
@@ -1827,9 +1863,8 @@ static inline bool __time_to_inject(struct f2fs_sb_info *sbi, int type,
 	atomic_inc(&ffi->inject_ops);
 	if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) {
 		atomic_set(&ffi->inject_ops, 0);
-		printk_ratelimited("%sF2FS-fs (%s) : inject %s in %s of %pS\n",
-			KERN_INFO, sbi->sb->s_id, f2fs_fault_name[type],
-			func, parent_func);
+		f2fs_info_ratelimited(sbi, "inject %s in %s of %pS",
+				f2fs_fault_name[type], func, parent_func);
 		return true;
 	}
 	return false;
@@ -2249,9 +2284,30 @@ static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi,
 	return false;
 }
 
+static inline unsigned int get_available_block_count(struct f2fs_sb_info *sbi,
+						struct inode *inode, bool cap)
+{
+	block_t avail_user_block_count;
+
+	avail_user_block_count = sbi->user_block_count -
+					sbi->current_reserved_blocks;
+
+	if (!__allow_reserved_blocks(sbi, inode, cap))
+		avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks;
+
+	if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
+		if (avail_user_block_count > sbi->unusable_block_count)
+			avail_user_block_count -= sbi->unusable_block_count;
+		else
+			avail_user_block_count = 0;
+	}
+
+	return avail_user_block_count;
+}
+
 static inline void f2fs_i_blocks_write(struct inode *, block_t, bool, bool);
 static inline int inc_valid_block_count(struct f2fs_sb_info *sbi,
-				 struct inode *inode, blkcnt_t *count)
+				 struct inode *inode, blkcnt_t *count, bool partial)
 {
 	blkcnt_t diff = 0, release = 0;
 	block_t avail_user_block_count;
@@ -2274,23 +2330,14 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi,
 
 	spin_lock(&sbi->stat_lock);
 	sbi->total_valid_block_count += (block_t)(*count);
-	avail_user_block_count = sbi->user_block_count -
-					sbi->current_reserved_blocks;
+	avail_user_block_count = get_available_block_count(sbi, inode, true);
 
-	if (!__allow_reserved_blocks(sbi, inode, true))
-		avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks;
-
-	if (F2FS_IO_ALIGNED(sbi))
-		avail_user_block_count -= sbi->blocks_per_seg *
-				SM_I(sbi)->additional_reserved_segments;
-
-	if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
-		if (avail_user_block_count > sbi->unusable_block_count)
-			avail_user_block_count -= sbi->unusable_block_count;
-		else
-			avail_user_block_count = 0;
-	}
 	if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) {
+		if (!partial) {
+			spin_unlock(&sbi->stat_lock);
+			goto enospc;
+		}
+
 		diff = sbi->total_valid_block_count - avail_user_block_count;
 		if (diff > *count)
 			diff = *count;
@@ -2318,20 +2365,6 @@ release_quota:
 	return -ENOSPC;
 }
 
-__printf(2, 3)
-void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...);
-
-#define f2fs_err(sbi, fmt, ...)						\
-	f2fs_printk(sbi, KERN_ERR fmt, ##__VA_ARGS__)
-#define f2fs_warn(sbi, fmt, ...)					\
-	f2fs_printk(sbi, KERN_WARNING fmt, ##__VA_ARGS__)
-#define f2fs_notice(sbi, fmt, ...)					\
-	f2fs_printk(sbi, KERN_NOTICE fmt, ##__VA_ARGS__)
-#define f2fs_info(sbi, fmt, ...)					\
-	f2fs_printk(sbi, KERN_INFO fmt, ##__VA_ARGS__)
-#define f2fs_debug(sbi, fmt, ...)					\
-	f2fs_printk(sbi, KERN_DEBUG fmt, ##__VA_ARGS__)
-
 #define PAGE_PRIVATE_GET_FUNC(name, flagname) \
 static inline bool page_private_##name(struct page *page) \
 { \
@@ -2360,17 +2393,14 @@ static inline void clear_page_private_##name(struct page *page) \
 PAGE_PRIVATE_GET_FUNC(nonpointer, NOT_POINTER);
 PAGE_PRIVATE_GET_FUNC(inline, INLINE_INODE);
 PAGE_PRIVATE_GET_FUNC(gcing, ONGOING_MIGRATION);
-PAGE_PRIVATE_GET_FUNC(dummy, DUMMY_WRITE);
 
 PAGE_PRIVATE_SET_FUNC(reference, REF_RESOURCE);
 PAGE_PRIVATE_SET_FUNC(inline, INLINE_INODE);
 PAGE_PRIVATE_SET_FUNC(gcing, ONGOING_MIGRATION);
-PAGE_PRIVATE_SET_FUNC(dummy, DUMMY_WRITE);
 
 PAGE_PRIVATE_CLEAR_FUNC(reference, REF_RESOURCE);
 PAGE_PRIVATE_CLEAR_FUNC(inline, INLINE_INODE);
 PAGE_PRIVATE_CLEAR_FUNC(gcing, ONGOING_MIGRATION);
-PAGE_PRIVATE_CLEAR_FUNC(dummy, DUMMY_WRITE);
 
 static inline unsigned long get_page_private_data(struct page *page)
 {
@@ -2504,11 +2534,8 @@ static inline int get_dirty_pages(struct inode *inode)
 
 static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
 {
-	unsigned int pages_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg;
-	unsigned int segs = (get_pages(sbi, block_type) + pages_per_sec - 1) >>
-						sbi->log_blocks_per_seg;
-
-	return segs / sbi->segs_per_sec;
+	return div_u64(get_pages(sbi, block_type) + BLKS_PER_SEC(sbi) - 1,
+							BLKS_PER_SEC(sbi));
 }
 
 static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi)
@@ -2572,7 +2599,7 @@ static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi)
 	block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
 
 	if (sbi->cur_cp_pack == 2)
-		start_addr += sbi->blocks_per_seg;
+		start_addr += BLKS_PER_SEG(sbi);
 	return start_addr;
 }
 
@@ -2581,7 +2608,7 @@ static inline block_t __start_cp_next_addr(struct f2fs_sb_info *sbi)
 	block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr);
 
 	if (sbi->cur_cp_pack == 1)
-		start_addr += sbi->blocks_per_seg;
+		start_addr += BLKS_PER_SEG(sbi);
 	return start_addr;
 }
 
@@ -2600,7 +2627,8 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi,
 					struct inode *inode, bool is_inode)
 {
 	block_t	valid_block_count;
-	unsigned int valid_node_count, user_block_count;
+	unsigned int valid_node_count;
+	unsigned int avail_user_block_count;
 	int err;
 
 	if (is_inode) {
@@ -2620,21 +2648,10 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi,
 
 	spin_lock(&sbi->stat_lock);
 
-	valid_block_count = sbi->total_valid_block_count +
-					sbi->current_reserved_blocks + 1;
-
-	if (!__allow_reserved_blocks(sbi, inode, false))
-		valid_block_count += F2FS_OPTION(sbi).root_reserved_blocks;
+	valid_block_count = sbi->total_valid_block_count + 1;
+	avail_user_block_count = get_available_block_count(sbi, inode, false);
 
-	if (F2FS_IO_ALIGNED(sbi))
-		valid_block_count += sbi->blocks_per_seg *
-				SM_I(sbi)->additional_reserved_segments;
-
-	user_block_count = sbi->user_block_count;
-	if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
-		user_block_count -= sbi->unusable_block_count;
-
-	if (unlikely(valid_block_count > user_block_count)) {
+	if (unlikely(valid_block_count > avail_user_block_count)) {
 		spin_unlock(&sbi->stat_lock);
 		goto enospc;
 	}
@@ -3021,6 +3038,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode,
 	case FI_INLINE_DOTS:
 	case FI_PIN_FILE:
 	case FI_COMPRESS_RELEASED:
+	case FI_ATOMIC_COMMITTED:
 		f2fs_mark_inode_dirty_sync(inode, true);
 	}
 }
@@ -3364,17 +3382,6 @@ static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi)
 	return is_set_ckpt_flags(sbi, CP_ERROR_FLAG);
 }
 
-static inline bool is_dot_dotdot(const u8 *name, size_t len)
-{
-	if (len == 1 && name[0] == '.')
-		return true;
-
-	if (len == 2 && name[0] == '.' && name[1] == '.')
-		return true;
-
-	return false;
-}
-
 static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi,
 					size_t size, gfp_t flags)
 {
@@ -3455,7 +3462,7 @@ static inline __le32 *get_dnode_addr(struct inode *inode,
 		sizeof((f2fs_inode)->field))			\
 		<= (F2FS_OLD_ATTRIBUTE_SIZE + (extra_isize)))	\
 
-#define __is_large_section(sbi)		((sbi)->segs_per_sec > 1)
+#define __is_large_section(sbi)		(SEGS_PER_SEC(sbi) > 1)
 
 #define __is_meta_io(fio) (PAGE_TYPE_OF_BIO((fio)->type) == META)
 
@@ -3464,11 +3471,9 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
 static inline void verify_blkaddr(struct f2fs_sb_info *sbi,
 					block_t blkaddr, int type)
 {
-	if (!f2fs_is_valid_blkaddr(sbi, blkaddr, type)) {
+	if (!f2fs_is_valid_blkaddr(sbi, blkaddr, type))
 		f2fs_err(sbi, "invalid blkaddr: %u, type: %d, run fsck to fix.",
 			 blkaddr, type);
-		f2fs_bug_on(sbi, 1);
-	}
 }
 
 static inline bool __is_valid_data_blkaddr(block_t blkaddr)
@@ -3570,7 +3575,8 @@ int f2fs_do_add_link(struct inode *dir, const struct qstr *name,
 			struct inode *inode, nid_t ino, umode_t mode);
 void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
 			struct inode *dir, struct inode *inode);
-int f2fs_do_tmpfile(struct inode *inode, struct inode *dir);
+int f2fs_do_tmpfile(struct inode *inode, struct inode *dir,
+					struct f2fs_filename *fname);
 bool f2fs_empty_dir(struct inode *dir);
 
 static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode)
@@ -3685,15 +3691,14 @@ int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable);
 void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi);
 int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
 bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno);
-void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi);
+int f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi);
 void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi);
 void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi);
-void f2fs_get_new_segment(struct f2fs_sb_info *sbi,
-			unsigned int *newseg, bool new_sec, int dir);
-void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
+int f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
 					unsigned int start, unsigned int end);
-void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force);
-void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi);
+int f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force);
+int f2fs_allocate_pinning_section(struct f2fs_sb_info *sbi);
+int f2fs_allocate_new_segments(struct f2fs_sb_info *sbi);
 int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range);
 bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
 					struct cp_control *cpc);
@@ -3714,7 +3719,7 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
 			block_t old_addr, block_t new_addr,
 			unsigned char version, bool recover_curseg,
 			bool recover_newaddr);
-void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
+int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
 			block_t old_blkaddr, block_t *new_blkaddr,
 			struct f2fs_summary *sum, int type,
 			struct f2fs_io_info *fio);
@@ -3764,6 +3769,8 @@ struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index);
 struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index);
 bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
 					block_t blkaddr, int type);
+bool f2fs_is_valid_blkaddr_raw(struct f2fs_sb_info *sbi,
+					block_t blkaddr, int type);
 int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
 			int type, bool sync);
 void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index,
@@ -3804,6 +3811,7 @@ void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi);
  */
 int __init f2fs_init_bioset(void);
 void f2fs_destroy_bioset(void);
+bool f2fs_is_cp_guaranteed(struct page *page);
 int f2fs_init_bio_entry_cache(void);
 void f2fs_destroy_bio_entry_cache(void);
 void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
@@ -3867,6 +3875,9 @@ void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi);
 block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode);
 int f2fs_gc(struct f2fs_sb_info *sbi, struct f2fs_gc_control *gc_control);
 void f2fs_build_gc_manager(struct f2fs_sb_info *sbi);
+int f2fs_gc_range(struct f2fs_sb_info *sbi,
+		unsigned int start_seg, unsigned int end_seg,
+		bool dry_run, unsigned int dry_run_sections);
 int f2fs_resize_fs(struct file *filp, __u64 block_count);
 int __init f2fs_create_garbage_collection_cache(void);
 void f2fs_destroy_garbage_collection_cache(void);
@@ -4287,7 +4298,8 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc);
 void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed,
 				bool in_task);
 void f2fs_put_page_dic(struct page *page, bool in_task);
-unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn);
+unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn,
+						unsigned int ofs_in_node);
 int f2fs_init_compress_ctx(struct compress_ctx *cc);
 void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse);
 void f2fs_init_compress_info(struct f2fs_sb_info *sbi);
@@ -4344,7 +4356,8 @@ static inline void f2fs_put_page_dic(struct page *page, bool in_task)
 {
 	WARN_ON_ONCE(1);
 }
-static inline unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn) { return 0; }
+static inline unsigned int f2fs_cluster_blocks_are_contiguous(
+			struct dnode_of_data *dn, unsigned int ofs_in_node) { return 0; }
 static inline bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) { return false; }
 static inline int f2fs_init_compress_inode(struct f2fs_sb_info *sbi) { return 0; }
 static inline void f2fs_destroy_compress_inode(struct f2fs_sb_info *sbi) { }
@@ -4401,15 +4414,24 @@ static inline bool f2fs_disable_compressed_file(struct inode *inode)
 {
 	struct f2fs_inode_info *fi = F2FS_I(inode);
 
-	if (!f2fs_compressed_file(inode))
+	f2fs_down_write(&F2FS_I(inode)->i_sem);
+
+	if (!f2fs_compressed_file(inode)) {
+		f2fs_up_write(&F2FS_I(inode)->i_sem);
 		return true;
-	if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode))
+	}
+	if (f2fs_is_mmap_file(inode) ||
+		(S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode))) {
+		f2fs_up_write(&F2FS_I(inode)->i_sem);
 		return false;
+	}
 
 	fi->i_flags &= ~F2FS_COMPR_FL;
 	stat_dec_compr_inode(inode);
 	clear_inode_flag(inode, FI_COMPRESSED_FILE);
 	f2fs_mark_inode_dirty_sync(inode, true);
+
+	f2fs_up_write(&F2FS_I(inode)->i_sem);
 	return true;
 }
 
@@ -4512,6 +4534,17 @@ static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi)
 	return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS;
 }
 
+static inline bool f2fs_valid_pinned_area(struct f2fs_sb_info *sbi,
+					  block_t blkaddr)
+{
+	if (f2fs_sb_has_blkzoned(sbi)) {
+		int devi = f2fs_target_device_index(sbi, blkaddr);
+
+		return !bdev_is_zoned(FDEV(devi).bdev);
+	}
+	return true;
+}
+
 static inline bool f2fs_low_mem_mode(struct f2fs_sb_info *sbi)
 {
 	return F2FS_OPTION(sbi).memory_mode == MEMORY_MODE_LOW;
@@ -4613,10 +4646,36 @@ static inline bool f2fs_is_readonly(struct f2fs_sb_info *sbi)
 	return f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb);
 }
 
+static inline void f2fs_truncate_meta_inode_pages(struct f2fs_sb_info *sbi,
+					block_t blkaddr, unsigned int cnt)
+{
+	bool need_submit = false;
+	int i = 0;
+
+	do {
+		struct page *page;
+
+		page = find_get_page(META_MAPPING(sbi), blkaddr + i);
+		if (page) {
+			if (PageWriteback(page))
+				need_submit = true;
+			f2fs_put_page(page, 0);
+		}
+	} while (++i < cnt && !need_submit);
+
+	if (need_submit)
+		f2fs_submit_merged_write_cond(sbi, sbi->meta_inode,
+							NULL, 0, DATA);
+
+	truncate_inode_pages_range(META_MAPPING(sbi),
+			F2FS_BLK_TO_BYTES((loff_t)blkaddr),
+			F2FS_BLK_END_BYTES((loff_t)(blkaddr + cnt - 1)));
+}
+
 static inline void f2fs_invalidate_internal_cache(struct f2fs_sb_info *sbi,
 								block_t blkaddr)
 {
-	invalidate_mapping_pages(META_MAPPING(sbi), blkaddr, blkaddr);
+	f2fs_truncate_meta_inode_pages(sbi, blkaddr, 1);
 	f2fs_invalidate_compress_page(sbi, blkaddr);
 }
 
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index b58ab1157b7e..1761ad125f97 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -39,6 +39,7 @@
 static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf)
 {
 	struct inode *inode = file_inode(vmf->vma->vm_file);
+	vm_flags_t flags = vmf->vma->vm_flags;
 	vm_fault_t ret;
 
 	ret = filemap_fault(vmf);
@@ -46,7 +47,7 @@ static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf)
 		f2fs_update_iostat(F2FS_I_SB(inode), inode,
 					APP_MAPPED_READ_IO, F2FS_BLKSIZE);
 
-	trace_f2fs_filemap_fault(inode, vmf->pgoff, vmf->vma->vm_flags, ret);
+	trace_f2fs_filemap_fault(inode, vmf->pgoff, flags, ret);
 
 	return ret;
 }
@@ -394,9 +395,20 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	return f2fs_do_sync_file(file, start, end, datasync, false);
 }
 
-static bool __found_offset(struct address_space *mapping, block_t blkaddr,
-				pgoff_t index, int whence)
+static bool __found_offset(struct address_space *mapping,
+		struct dnode_of_data *dn, pgoff_t index, int whence)
 {
+	block_t blkaddr = f2fs_data_blkaddr(dn);
+	struct inode *inode = mapping->host;
+	bool compressed_cluster = false;
+
+	if (f2fs_compressed_file(inode)) {
+		block_t first_blkaddr = data_blkaddr(dn->inode, dn->node_page,
+		    ALIGN_DOWN(dn->ofs_in_node, F2FS_I(inode)->i_cluster_size));
+
+		compressed_cluster = first_blkaddr == COMPRESS_ADDR;
+	}
+
 	switch (whence) {
 	case SEEK_DATA:
 		if (__is_valid_data_blkaddr(blkaddr))
@@ -404,8 +416,12 @@ static bool __found_offset(struct address_space *mapping, block_t blkaddr,
 		if (blkaddr == NEW_ADDR &&
 		    xa_get_mark(&mapping->i_pages, index, PAGECACHE_TAG_DIRTY))
 			return true;
+		if (compressed_cluster)
+			return true;
 		break;
 	case SEEK_HOLE:
+		if (compressed_cluster)
+			return false;
 		if (blkaddr == NULL_ADDR)
 			return true;
 		break;
@@ -474,7 +490,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
 				goto fail;
 			}
 
-			if (__found_offset(file->f_mapping, blkaddr,
+			if (__found_offset(file->f_mapping, &dn,
 							pgofs, whence)) {
 				f2fs_put_dnode(&dn);
 				goto found;
@@ -590,8 +606,10 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count)
 		f2fs_set_data_blkaddr(dn, NULL_ADDR);
 
 		if (__is_valid_data_blkaddr(blkaddr)) {
-			if (!f2fs_is_valid_blkaddr(sbi, blkaddr,
-					DATA_GENERIC_ENHANCE))
+			if (time_to_inject(sbi, FAULT_BLKADDR_CONSISTENCE))
+				continue;
+			if (!f2fs_is_valid_blkaddr_raw(sbi, blkaddr,
+						DATA_GENERIC_ENHANCE))
 				continue;
 			if (compressed_cluster)
 				valid_blocks++;
@@ -818,8 +836,6 @@ static bool f2fs_force_buffered_io(struct inode *inode, int rw)
 	 */
 	if (f2fs_sb_has_blkzoned(sbi) && (rw == WRITE))
 		return true;
-	if (f2fs_lfs_mode(sbi) && rw == WRITE && F2FS_IO_ALIGNED(sbi))
-		return true;
 	if (is_sbi_flag_set(sbi, SBI_CP_DISABLED))
 		return true;
 
@@ -1192,7 +1208,6 @@ next_dnode:
 			!f2fs_is_valid_blkaddr(sbi, *blkaddr,
 					DATA_GENERIC_ENHANCE)) {
 			f2fs_put_dnode(&dn);
-			f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
 			return -EFSCORRUPTED;
 		}
 
@@ -1478,7 +1493,6 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
 		if (!f2fs_is_valid_blkaddr(sbi, dn->data_blkaddr,
 					DATA_GENERIC_ENHANCE)) {
 			ret = -EFSCORRUPTED;
-			f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
 			break;
 		}
 
@@ -1662,10 +1676,12 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
 	}
 	filemap_invalidate_unlock(mapping);
 	f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
+	if (ret)
+		return ret;
 
 	/* write out all moved pages, if possible */
 	filemap_invalidate_lock(mapping);
-	filemap_write_and_wait_range(mapping, offset, LLONG_MAX);
+	ret = filemap_write_and_wait_range(mapping, offset, LLONG_MAX);
 	truncate_pagecache(inode, offset);
 	filemap_invalidate_unlock(mapping);
 
@@ -1731,9 +1747,11 @@ next_alloc:
 
 		f2fs_down_write(&sbi->pin_sem);
 
-		f2fs_lock_op(sbi);
-		f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false);
-		f2fs_unlock_op(sbi);
+		err = f2fs_allocate_pinning_section(sbi);
+		if (err) {
+			f2fs_up_write(&sbi->pin_sem);
+			goto out_err;
+		}
 
 		map.m_seg_type = CURSEG_COLD_DATA_PINNED;
 		err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_PRE_DIO);
@@ -2066,7 +2084,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp, bool truncate)
 
 	inode_lock(inode);
 
-	if (!f2fs_disable_compressed_file(inode)) {
+	if (!f2fs_disable_compressed_file(inode) ||
+			f2fs_is_pinned_file(inode)) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -2243,8 +2262,11 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
 	case F2FS_GOING_DOWN_METASYNC:
 		/* do checkpoint only */
 		ret = f2fs_sync_fs(sb, 1);
-		if (ret)
+		if (ret) {
+			if (ret == -EIO)
+				ret = 0;
 			goto out;
+		}
 		f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN);
 		break;
 	case F2FS_GOING_DOWN_NOSYNC:
@@ -2260,6 +2282,8 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
 		set_sbi_flag(sbi, SBI_IS_DIRTY);
 		/* do checkpoint only */
 		ret = f2fs_sync_fs(sb, 1);
+		if (ret == -EIO)
+			ret = 0;
 		goto out;
 	default:
 		ret = -EINVAL;
@@ -2578,7 +2602,6 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
 					.m_may_create = false };
 	struct extent_info ei = {};
 	pgoff_t pg_start, pg_end, next_pgofs;
-	unsigned int blk_per_seg = sbi->blocks_per_seg;
 	unsigned int total = 0, sec_num;
 	block_t blk_end = 0;
 	bool fragmented = false;
@@ -2687,7 +2710,8 @@ do_map:
 		set_inode_flag(inode, FI_SKIP_WRITES);
 
 		idx = map.m_lblk;
-		while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) {
+		while (idx < map.m_lblk + map.m_len &&
+						cnt < BLKS_PER_SEG(sbi)) {
 			struct page *page;
 
 			page = f2fs_get_lock_data_page(inode, idx, true);
@@ -2707,7 +2731,7 @@ do_map:
 
 		map.m_lblk = idx;
 check:
-		if (map.m_lblk < pg_end && cnt < blk_per_seg)
+		if (map.m_lblk < pg_end && cnt < BLKS_PER_SEG(sbi))
 			goto do_map;
 
 		clear_inode_flag(inode, FI_SKIP_WRITES);
@@ -2976,8 +3000,8 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg)
 
 	if (!f2fs_is_multi_device(sbi) || sbi->s_ndevs - 1 <= range.dev_num ||
 			__is_large_section(sbi)) {
-		f2fs_warn(sbi, "Can't flush %u in %d for segs_per_sec %u != 1",
-			  range.dev_num, sbi->s_ndevs, sbi->segs_per_sec);
+		f2fs_warn(sbi, "Can't flush %u in %d for SEGS_PER_SEC %u != 1",
+			  range.dev_num, sbi->s_ndevs, SEGS_PER_SEC(sbi));
 		return -EINVAL;
 	}
 
@@ -3183,6 +3207,7 @@ int f2fs_pin_file_control(struct inode *inode, bool inc)
 static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	__u32 pin;
 	int ret = 0;
 
@@ -3192,7 +3217,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
 	if (!S_ISREG(inode->i_mode))
 		return -EINVAL;
 
-	if (f2fs_readonly(F2FS_I_SB(inode)->sb))
+	if (f2fs_readonly(sbi->sb))
 		return -EROFS;
 
 	ret = mnt_want_write_file(filp);
@@ -3205,9 +3230,18 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
 		clear_inode_flag(inode, FI_PIN_FILE);
 		f2fs_i_gc_failures_write(inode, 0);
 		goto done;
+	} else if (f2fs_is_pinned_file(inode)) {
+		goto done;
 	}
 
-	if (f2fs_should_update_outplace(inode, NULL)) {
+	if (f2fs_sb_has_blkzoned(sbi) && F2FS_HAS_BLOCKS(inode)) {
+		ret = -EFBIG;
+		goto out;
+	}
+
+	/* Let's allow file pinning on zoned device. */
+	if (!f2fs_sb_has_blkzoned(sbi) &&
+	    f2fs_should_update_outplace(inode, NULL)) {
 		ret = -EINVAL;
 		goto out;
 	}
@@ -3229,7 +3263,7 @@ static int f2fs_ioc_set_pin_file(struct file *filp, unsigned long arg)
 	set_inode_flag(inode, FI_PIN_FILE);
 	ret = F2FS_I(inode)->i_gc_failures[GC_FAILURE_PIN];
 done:
-	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+	f2fs_update_time(sbi, REQ_TIME);
 out:
 	inode_unlock(inode);
 	mnt_drop_write_file(filp);
@@ -3438,10 +3472,8 @@ static int release_compress_blocks(struct dnode_of_data *dn, pgoff_t count)
 		if (!__is_valid_data_blkaddr(blkaddr))
 			continue;
 		if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr,
-					DATA_GENERIC_ENHANCE))) {
-			f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
+					DATA_GENERIC_ENHANCE)))
 			return -EFSCORRUPTED;
-		}
 	}
 
 	while (count) {
@@ -3588,10 +3620,10 @@ out:
 	return ret;
 }
 
-static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count)
+static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count,
+		unsigned int *reserved_blocks)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
-	unsigned int reserved_blocks = 0;
 	int cluster_size = F2FS_I(dn->inode)->i_cluster_size;
 	block_t blkaddr;
 	int i;
@@ -3603,10 +3635,8 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count)
 		if (!__is_valid_data_blkaddr(blkaddr))
 			continue;
 		if (unlikely(!f2fs_is_valid_blkaddr(sbi, blkaddr,
-					DATA_GENERIC_ENHANCE))) {
-			f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
+					DATA_GENERIC_ENHANCE)))
 			return -EFSCORRUPTED;
-		}
 	}
 
 	while (count) {
@@ -3614,40 +3644,53 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count)
 		blkcnt_t reserved;
 		int ret;
 
-		for (i = 0; i < cluster_size; i++, dn->ofs_in_node++) {
-			blkaddr = f2fs_data_blkaddr(dn);
+		for (i = 0; i < cluster_size; i++) {
+			blkaddr = data_blkaddr(dn->inode, dn->node_page,
+						dn->ofs_in_node + i);
 
 			if (i == 0) {
-				if (blkaddr == COMPRESS_ADDR)
-					continue;
-				dn->ofs_in_node += cluster_size;
-				goto next;
+				if (blkaddr != COMPRESS_ADDR) {
+					dn->ofs_in_node += cluster_size;
+					goto next;
+				}
+				continue;
 			}
 
-			if (__is_valid_data_blkaddr(blkaddr)) {
+			/*
+			 * compressed cluster was not released due to it
+			 * fails in release_compress_blocks(), so NEW_ADDR
+			 * is a possible case.
+			 */
+			if (blkaddr == NEW_ADDR ||
+				__is_valid_data_blkaddr(blkaddr)) {
 				compr_blocks++;
 				continue;
 			}
-
-			f2fs_set_data_blkaddr(dn, NEW_ADDR);
 		}
 
 		reserved = cluster_size - compr_blocks;
-		ret = inc_valid_block_count(sbi, dn->inode, &reserved);
-		if (ret)
+
+		/* for the case all blocks in cluster were reserved */
+		if (reserved == 1)
+			goto next;
+
+		ret = inc_valid_block_count(sbi, dn->inode, &reserved, false);
+		if (unlikely(ret))
 			return ret;
 
-		if (reserved != cluster_size - compr_blocks)
-			return -ENOSPC;
+		for (i = 0; i < cluster_size; i++, dn->ofs_in_node++) {
+			if (f2fs_data_blkaddr(dn) == NULL_ADDR)
+				f2fs_set_data_blkaddr(dn, NEW_ADDR);
+		}
 
 		f2fs_i_compr_blocks_update(dn->inode, compr_blocks, true);
 
-		reserved_blocks += reserved;
+		*reserved_blocks += reserved;
 next:
 		count -= cluster_size;
 	}
 
-	return reserved_blocks;
+	return 0;
 }
 
 static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
@@ -3671,9 +3714,6 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
 	if (ret)
 		return ret;
 
-	if (atomic_read(&F2FS_I(inode)->i_compr_blocks))
-		goto out;
-
 	f2fs_balance_fs(sbi, true);
 
 	inode_lock(inode);
@@ -3683,6 +3723,9 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
 		goto unlock_inode;
 	}
 
+	if (atomic_read(&F2FS_I(inode)->i_compr_blocks))
+		goto unlock_inode;
+
 	f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
 	filemap_invalidate_lock(inode->i_mapping);
 
@@ -3708,7 +3751,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
 		count = min(end_offset - dn.ofs_in_node, last_idx - page_idx);
 		count = round_up(count, F2FS_I(inode)->i_cluster_size);
 
-		ret = reserve_compress_blocks(&dn, count);
+		ret = reserve_compress_blocks(&dn, count, &reserved_blocks);
 
 		f2fs_put_dnode(&dn);
 
@@ -3716,23 +3759,21 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg)
 			break;
 
 		page_idx += count;
-		reserved_blocks += ret;
 	}
 
 	filemap_invalidate_unlock(inode->i_mapping);
 	f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
 
-	if (ret >= 0) {
+	if (!ret) {
 		clear_inode_flag(inode, FI_COMPRESS_RELEASED);
 		inode_set_ctime_current(inode);
 		f2fs_mark_inode_dirty_sync(inode, true);
 	}
 unlock_inode:
 	inode_unlock(inode);
-out:
 	mnt_drop_write_file(filp);
 
-	if (ret >= 0) {
+	if (!ret) {
 		ret = put_user(reserved_blocks, (u64 __user *)arg);
 	} else if (reserved_blocks &&
 			atomic_read(&F2FS_I(inode)->i_compr_blocks)) {
@@ -3877,8 +3918,6 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg)
 						DATA_GENERIC_ENHANCE)) {
 				ret = -EFSCORRUPTED;
 				f2fs_put_dnode(&dn);
-				f2fs_handle_error(sbi,
-						ERROR_INVALID_BLKADDR);
 				goto out;
 			}
 
@@ -3981,16 +4020,20 @@ static int f2fs_ioc_set_compress_option(struct file *filp, unsigned long arg)
 				sizeof(option)))
 		return -EFAULT;
 
-	if (!f2fs_compressed_file(inode) ||
-			option.log_cluster_size < MIN_COMPRESS_LOG_SIZE ||
-			option.log_cluster_size > MAX_COMPRESS_LOG_SIZE ||
-			option.algorithm >= COMPRESS_MAX)
+	if (option.log_cluster_size < MIN_COMPRESS_LOG_SIZE ||
+		option.log_cluster_size > MAX_COMPRESS_LOG_SIZE ||
+		option.algorithm >= COMPRESS_MAX)
 		return -EINVAL;
 
 	file_start_write(filp);
 	inode_lock(inode);
 
 	f2fs_down_write(&F2FS_I(inode)->i_sem);
+	if (!f2fs_compressed_file(inode)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
 	if (f2fs_is_mmap_file(inode) || get_dirty_pages(inode)) {
 		ret = -EBUSY;
 		goto out;
@@ -4066,7 +4109,6 @@ static int f2fs_ioc_decompress_file(struct file *filp)
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct f2fs_inode_info *fi = F2FS_I(inode);
 	pgoff_t page_idx = 0, last_idx;
-	unsigned int blk_per_seg = sbi->blocks_per_seg;
 	int cluster_size = fi->i_cluster_size;
 	int count, ret;
 
@@ -4110,7 +4152,7 @@ static int f2fs_ioc_decompress_file(struct file *filp)
 		if (ret < 0)
 			break;
 
-		if (get_dirty_pages(inode) >= blk_per_seg) {
+		if (get_dirty_pages(inode) >= BLKS_PER_SEG(sbi)) {
 			ret = filemap_fdatawrite(inode->i_mapping);
 			if (ret < 0)
 				break;
@@ -4145,7 +4187,6 @@ static int f2fs_ioc_compress_file(struct file *filp)
 	struct inode *inode = file_inode(filp);
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	pgoff_t page_idx = 0, last_idx;
-	unsigned int blk_per_seg = sbi->blocks_per_seg;
 	int cluster_size = F2FS_I(inode)->i_cluster_size;
 	int count, ret;
 
@@ -4188,7 +4229,7 @@ static int f2fs_ioc_compress_file(struct file *filp)
 		if (ret < 0)
 			break;
 
-		if (get_dirty_pages(inode) >= blk_per_seg) {
+		if (get_dirty_pages(inode) >= BLKS_PER_SEG(sbi)) {
 			ret = filemap_fdatawrite(inode->i_mapping);
 			if (ret < 0)
 				break;
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index a079eebfb080..8852814dab7f 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -259,7 +259,7 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
 		p->ofs_unit = 1;
 	} else {
 		p->gc_mode = select_gc_type(sbi, gc_type);
-		p->ofs_unit = sbi->segs_per_sec;
+		p->ofs_unit = SEGS_PER_SEC(sbi);
 		if (__is_large_section(sbi)) {
 			p->dirty_bitmap = dirty_i->dirty_secmap;
 			p->max_search = count_bits(p->dirty_bitmap,
@@ -280,11 +280,11 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
 			p->max_search > sbi->max_victim_search)
 		p->max_search = sbi->max_victim_search;
 
-	/* let's select beginning hot/small space first in no_heap mode*/
+	/* let's select beginning hot/small space first. */
 	if (f2fs_need_rand_seg(sbi))
-		p->offset = get_random_u32_below(MAIN_SECS(sbi) * sbi->segs_per_sec);
-	else if (test_opt(sbi, NOHEAP) &&
-		(type == CURSEG_HOT_DATA || IS_NODESEG(type)))
+		p->offset = get_random_u32_below(MAIN_SECS(sbi) *
+						SEGS_PER_SEC(sbi));
+	else if (type == CURSEG_HOT_DATA || IS_NODESEG(type))
 		p->offset = 0;
 	else
 		p->offset = SIT_I(sbi)->last_victim[p->gc_mode];
@@ -295,13 +295,13 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
 {
 	/* SSR allocates in a segment unit */
 	if (p->alloc_mode == SSR)
-		return sbi->blocks_per_seg;
+		return BLKS_PER_SEG(sbi);
 	else if (p->alloc_mode == AT_SSR)
 		return UINT_MAX;
 
 	/* LFS */
 	if (p->gc_mode == GC_GREEDY)
-		return 2 * sbi->blocks_per_seg * p->ofs_unit;
+		return SEGS_TO_BLKS(sbi, 2 * p->ofs_unit);
 	else if (p->gc_mode == GC_CB)
 		return UINT_MAX;
 	else if (p->gc_mode == GC_AT)
@@ -348,7 +348,7 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
 	mtime = div_u64(mtime, usable_segs_per_sec);
 	vblocks = div_u64(vblocks, usable_segs_per_sec);
 
-	u = (vblocks * 100) >> sbi->log_blocks_per_seg;
+	u = BLKS_TO_SEGS(sbi, vblocks * 100);
 
 	/* Handle if the system time has changed by the user */
 	if (mtime < sit_i->min_mtime)
@@ -496,9 +496,9 @@ static void add_victim_entry(struct f2fs_sb_info *sbi,
 			return;
 	}
 
-	for (i = 0; i < sbi->segs_per_sec; i++)
+	for (i = 0; i < SEGS_PER_SEC(sbi); i++)
 		mtime += get_seg_entry(sbi, start + i)->mtime;
-	mtime = div_u64(mtime, sbi->segs_per_sec);
+	mtime = div_u64(mtime, SEGS_PER_SEC(sbi));
 
 	/* Handle if the system time has changed by the user */
 	if (mtime < sit_i->min_mtime)
@@ -599,7 +599,6 @@ static void atssr_lookup_victim(struct f2fs_sb_info *sbi,
 	unsigned long long age;
 	unsigned long long max_mtime = sit_i->dirty_max_mtime;
 	unsigned long long min_mtime = sit_i->dirty_min_mtime;
-	unsigned int seg_blocks = sbi->blocks_per_seg;
 	unsigned int vblocks;
 	unsigned int dirty_threshold = max(am->max_candidate_count,
 					am->candidate_ratio *
@@ -629,7 +628,7 @@ next_node:
 	f2fs_bug_on(sbi, !vblocks);
 
 	/* rare case */
-	if (vblocks == seg_blocks)
+	if (vblocks == BLKS_PER_SEG(sbi))
 		goto skip_node;
 
 	iter++;
@@ -755,7 +754,7 @@ int f2fs_get_victim(struct f2fs_sb_info *sbi, unsigned int *result,
 	int ret = 0;
 
 	mutex_lock(&dirty_i->seglist_lock);
-	last_segment = MAIN_SECS(sbi) * sbi->segs_per_sec;
+	last_segment = MAIN_SECS(sbi) * SEGS_PER_SEC(sbi);
 
 	p.alloc_mode = alloc_mode;
 	p.age = age;
@@ -896,7 +895,7 @@ next:
 			else
 				sm->last_victim[p.gc_mode] = segno + p.ofs_unit;
 			sm->last_victim[p.gc_mode] %=
-				(MAIN_SECS(sbi) * sbi->segs_per_sec);
+				(MAIN_SECS(sbi) * SEGS_PER_SEC(sbi));
 			break;
 		}
 	}
@@ -1184,7 +1183,6 @@ static int ra_data_block(struct inode *inode, pgoff_t index)
 		.op_flags = 0,
 		.encrypted_page = NULL,
 		.in_list = 0,
-		.retry = 0,
 	};
 	int err;
 
@@ -1197,7 +1195,6 @@ static int ra_data_block(struct inode *inode, pgoff_t index)
 		if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
 						DATA_GENERIC_ENHANCE_READ))) {
 			err = -EFSCORRUPTED;
-			f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
 			goto put_page;
 		}
 		goto got_it;
@@ -1216,7 +1213,6 @@ static int ra_data_block(struct inode *inode, pgoff_t index)
 	if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
 						DATA_GENERIC_ENHANCE))) {
 		err = -EFSCORRUPTED;
-		f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
 		goto put_page;
 	}
 got_it:
@@ -1273,7 +1269,6 @@ static int move_data_block(struct inode *inode, block_t bidx,
 		.op_flags = 0,
 		.encrypted_page = NULL,
 		.in_list = 0,
-		.retry = 0,
 	};
 	struct dnode_of_data dn;
 	struct f2fs_summary sum;
@@ -1364,8 +1359,13 @@ static int move_data_block(struct inode *inode, block_t bidx,
 	set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
 
 	/* allocate block address */
-	f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
+	err = f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
 				&sum, type, NULL);
+	if (err) {
+		f2fs_put_page(mpage, 1);
+		/* filesystem should shutdown, no need to recovery block */
+		goto up_out;
+	}
 
 	fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi),
 				newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS);
@@ -1393,18 +1393,12 @@ static int move_data_block(struct inode *inode, block_t bidx,
 	fio.op_flags = REQ_SYNC;
 	fio.new_blkaddr = newaddr;
 	f2fs_submit_page_write(&fio);
-	if (fio.retry) {
-		err = -EAGAIN;
-		if (PageWriteback(fio.encrypted_page))
-			end_page_writeback(fio.encrypted_page);
-		goto put_page_out;
-	}
 
 	f2fs_update_iostat(fio.sbi, NULL, FS_GC_DATA_IO, F2FS_BLKSIZE);
 
 	f2fs_update_data_blkaddr(&dn, newaddr);
 	set_inode_flag(inode, FI_APPEND_WRITE);
-put_page_out:
+
 	f2fs_put_page(fio.encrypted_page, 1);
 recover_block:
 	if (err)
@@ -1678,7 +1672,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
 	struct f2fs_summary_block *sum;
 	struct blk_plug plug;
 	unsigned int segno = start_segno;
-	unsigned int end_segno = start_segno + sbi->segs_per_sec;
+	unsigned int end_segno = start_segno + SEGS_PER_SEC(sbi);
 	int seg_freed = 0, migrated = 0;
 	unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
 						SUM_TYPE_DATA : SUM_TYPE_NODE;
@@ -1686,7 +1680,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
 	int submitted = 0;
 
 	if (__is_large_section(sbi))
-		end_segno = rounddown(end_segno, sbi->segs_per_sec);
+		end_segno = rounddown(end_segno, SEGS_PER_SEC(sbi));
 
 	/*
 	 * zone-capacity can be less than zone-size in zoned devices,
@@ -1694,7 +1688,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
 	 * calculate the end segno in the zone which can be garbage collected
 	 */
 	if (f2fs_sb_has_blkzoned(sbi))
-		end_segno -= sbi->segs_per_sec -
+		end_segno -= SEGS_PER_SEC(sbi) -
 					f2fs_usable_segs_in_sec(sbi, segno);
 
 	sanity_check_seg_type(sbi, get_seg_entry(sbi, segno)->type);
@@ -1983,10 +1977,43 @@ void f2fs_build_gc_manager(struct f2fs_sb_info *sbi)
 	init_atgc_management(sbi);
 }
 
+int f2fs_gc_range(struct f2fs_sb_info *sbi,
+		unsigned int start_seg, unsigned int end_seg,
+		bool dry_run, unsigned int dry_run_sections)
+{
+	unsigned int segno;
+	unsigned int gc_secs = dry_run_sections;
+
+	if (unlikely(f2fs_cp_error(sbi)))
+		return -EIO;
+
+	for (segno = start_seg; segno <= end_seg; segno += SEGS_PER_SEC(sbi)) {
+		struct gc_inode_list gc_list = {
+			.ilist = LIST_HEAD_INIT(gc_list.ilist),
+			.iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
+		};
+
+		do_garbage_collect(sbi, segno, &gc_list, FG_GC,
+						dry_run_sections == 0);
+		put_gc_inode(&gc_list);
+
+		if (!dry_run && get_valid_blocks(sbi, segno, true))
+			return -EAGAIN;
+		if (dry_run && dry_run_sections &&
+		    !get_valid_blocks(sbi, segno, true) && --gc_secs == 0)
+			break;
+
+		if (fatal_signal_pending(current))
+			return -ERESTARTSYS;
+	}
+
+	return 0;
+}
+
 static int free_segment_range(struct f2fs_sb_info *sbi,
-				unsigned int secs, bool gc_only)
+				unsigned int secs, bool dry_run)
 {
-	unsigned int segno, next_inuse, start, end;
+	unsigned int next_inuse, start, end;
 	struct cp_control cpc = { CP_RESIZE, 0, 0, 0 };
 	int gc_mode, gc_type;
 	int err = 0;
@@ -1994,7 +2021,7 @@ static int free_segment_range(struct f2fs_sb_info *sbi,
 
 	/* Force block allocation for GC */
 	MAIN_SECS(sbi) -= secs;
-	start = MAIN_SECS(sbi) * sbi->segs_per_sec;
+	start = MAIN_SECS(sbi) * SEGS_PER_SEC(sbi);
 	end = MAIN_SEGS(sbi) - 1;
 
 	mutex_lock(&DIRTY_I(sbi)->seglist_lock);
@@ -2008,29 +2035,15 @@ static int free_segment_range(struct f2fs_sb_info *sbi,
 	mutex_unlock(&DIRTY_I(sbi)->seglist_lock);
 
 	/* Move out cursegs from the target range */
-	for (type = CURSEG_HOT_DATA; type < NR_CURSEG_PERSIST_TYPE; type++)
-		f2fs_allocate_segment_for_resize(sbi, type, start, end);
-
-	/* do GC to move out valid blocks in the range */
-	for (segno = start; segno <= end; segno += sbi->segs_per_sec) {
-		struct gc_inode_list gc_list = {
-			.ilist = LIST_HEAD_INIT(gc_list.ilist),
-			.iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
-		};
-
-		do_garbage_collect(sbi, segno, &gc_list, FG_GC, true);
-		put_gc_inode(&gc_list);
-
-		if (!gc_only && get_valid_blocks(sbi, segno, true)) {
-			err = -EAGAIN;
-			goto out;
-		}
-		if (fatal_signal_pending(current)) {
-			err = -ERESTARTSYS;
+	for (type = CURSEG_HOT_DATA; type < NR_CURSEG_PERSIST_TYPE; type++) {
+		err = f2fs_allocate_segment_for_resize(sbi, type, start, end);
+		if (err)
 			goto out;
-		}
 	}
-	if (gc_only)
+
+	/* do GC to move out valid blocks in the range */
+	err = f2fs_gc_range(sbi, start, end, dry_run, 0);
+	if (err || dry_run)
 		goto out;
 
 	stat_inc_cp_call_count(sbi, TOTAL_CALL);
@@ -2056,7 +2069,7 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
 	int segment_count;
 	int segment_count_main;
 	long long block_count;
-	int segs = secs * sbi->segs_per_sec;
+	int segs = secs * SEGS_PER_SEC(sbi);
 
 	f2fs_down_write(&sbi->sb_lock);
 
@@ -2069,7 +2082,7 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
 	raw_sb->segment_count = cpu_to_le32(segment_count + segs);
 	raw_sb->segment_count_main = cpu_to_le32(segment_count_main + segs);
 	raw_sb->block_count = cpu_to_le64(block_count +
-					(long long)segs * sbi->blocks_per_seg);
+			(long long)SEGS_TO_BLKS(sbi, segs));
 	if (f2fs_is_multi_device(sbi)) {
 		int last_dev = sbi->s_ndevs - 1;
 		int dev_segs =
@@ -2084,8 +2097,8 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
 
 static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs)
 {
-	int segs = secs * sbi->segs_per_sec;
-	long long blks = (long long)segs * sbi->blocks_per_seg;
+	int segs = secs * SEGS_PER_SEC(sbi);
+	long long blks = SEGS_TO_BLKS(sbi, segs);
 	long long user_block_count =
 				le64_to_cpu(F2FS_CKPT(sbi)->user_block_count);
 
@@ -2127,7 +2140,7 @@ int f2fs_resize_fs(struct file *filp, __u64 block_count)
 		int last_dev = sbi->s_ndevs - 1;
 		__u64 last_segs = FDEV(last_dev).total_segments;
 
-		if (block_count + last_segs * sbi->blocks_per_seg <=
+		if (block_count + SEGS_TO_BLKS(sbi, last_segs) <=
 								old_block_count)
 			return -EINVAL;
 	}
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 28a00942802c..9c0d06c4d19a 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -96,7 +96,7 @@ static inline block_t free_segs_blk_count(struct f2fs_sb_info *sbi)
 	if (f2fs_sb_has_blkzoned(sbi))
 		return free_segs_blk_count_zoned(sbi);
 
-	return free_segments(sbi) << sbi->log_blocks_per_seg;
+	return SEGS_TO_BLKS(sbi, free_segments(sbi));
 }
 
 static inline block_t free_user_blocks(struct f2fs_sb_info *sbi)
@@ -104,7 +104,7 @@ static inline block_t free_user_blocks(struct f2fs_sb_info *sbi)
 	block_t free_blks, ovp_blks;
 
 	free_blks = free_segs_blk_count(sbi);
-	ovp_blks = overprovision_segments(sbi) << sbi->log_blocks_per_seg;
+	ovp_blks = SEGS_TO_BLKS(sbi, overprovision_segments(sbi));
 
 	if (free_blks < ovp_blks)
 		return 0;
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index b3bb815fc6aa..e54f8c08bda8 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -531,7 +531,6 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
 	}
 
 	err = f2fs_prepare_lookup(dir, dentry, &fname);
-	generic_set_encrypted_ci_d_ops(dentry);
 	if (err == -ENOENT)
 		goto out_splice;
 	if (err)
@@ -852,7 +851,7 @@ out:
 
 static int __f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
 			  struct file *file, umode_t mode, bool is_whiteout,
-			  struct inode **new_inode)
+			  struct inode **new_inode, struct f2fs_filename *fname)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
 	struct inode *inode;
@@ -880,7 +879,7 @@ static int __f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
 	if (err)
 		goto out;
 
-	err = f2fs_do_tmpfile(inode, dir);
+	err = f2fs_do_tmpfile(inode, dir, fname);
 	if (err)
 		goto release_out;
 
@@ -931,22 +930,24 @@ static int f2fs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
 	if (!f2fs_is_checkpoint_ready(sbi))
 		return -ENOSPC;
 
-	err = __f2fs_tmpfile(idmap, dir, file, mode, false, NULL);
+	err = __f2fs_tmpfile(idmap, dir, file, mode, false, NULL, NULL);
 
 	return finish_open_simple(file, err);
 }
 
 static int f2fs_create_whiteout(struct mnt_idmap *idmap,
-				struct inode *dir, struct inode **whiteout)
+				struct inode *dir, struct inode **whiteout,
+				struct f2fs_filename *fname)
 {
-	return __f2fs_tmpfile(idmap, dir, NULL,
-				S_IFCHR | WHITEOUT_MODE, true, whiteout);
+	return __f2fs_tmpfile(idmap, dir, NULL, S_IFCHR | WHITEOUT_MODE,
+						true, whiteout, fname);
 }
 
 int f2fs_get_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
 		     struct inode **new_inode)
 {
-	return __f2fs_tmpfile(idmap, dir, NULL, S_IFREG, false, new_inode);
+	return __f2fs_tmpfile(idmap, dir, NULL, S_IFREG,
+				false, new_inode, NULL);
 }
 
 static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
@@ -990,7 +991,14 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 	}
 
 	if (flags & RENAME_WHITEOUT) {
-		err = f2fs_create_whiteout(idmap, old_dir, &whiteout);
+		struct f2fs_filename fname;
+
+		err = f2fs_setup_filename(old_dir, &old_dentry->d_name,
+							0, &fname);
+		if (err)
+			return err;
+
+		err = f2fs_create_whiteout(idmap, old_dir, &whiteout, &fname);
 		if (err)
 			return err;
 	}
@@ -1105,14 +1113,11 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 		iput(whiteout);
 	}
 
-	if (old_is_dir) {
-		if (old_dir_entry)
-			f2fs_set_link(old_inode, old_dir_entry,
-						old_dir_page, new_dir);
-		else
-			f2fs_put_page(old_dir_page, 0);
+	if (old_dir_entry)
+		f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir);
+	if (old_is_dir)
 		f2fs_i_links_write(old_dir, false);
-	}
+
 	if (F2FS_OPTION(sbi).fsync_mode == FSYNC_MODE_STRICT) {
 		f2fs_add_ino_entry(sbi, new_dir->i_ino, TRANS_DIR_INO);
 		if (S_ISDIR(old_inode->i_mode))
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 9b546fd21010..b3de6d6cdb02 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -852,21 +852,29 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
 
 	if (is_inode_flag_set(dn->inode, FI_COMPRESSED_FILE) &&
 					f2fs_sb_has_readonly(sbi)) {
-		unsigned int c_len = f2fs_cluster_blocks_are_contiguous(dn);
+		unsigned int cluster_size = F2FS_I(dn->inode)->i_cluster_size;
+		unsigned int ofs_in_node = dn->ofs_in_node;
+		pgoff_t fofs = index;
+		unsigned int c_len;
 		block_t blkaddr;
 
+		/* should align fofs and ofs_in_node to cluster_size */
+		if (fofs % cluster_size) {
+			fofs = round_down(fofs, cluster_size);
+			ofs_in_node = round_down(ofs_in_node, cluster_size);
+		}
+
+		c_len = f2fs_cluster_blocks_are_contiguous(dn, ofs_in_node);
 		if (!c_len)
 			goto out;
 
-		blkaddr = f2fs_data_blkaddr(dn);
+		blkaddr = data_blkaddr(dn->inode, dn->node_page, ofs_in_node);
 		if (blkaddr == COMPRESS_ADDR)
 			blkaddr = data_blkaddr(dn->inode, dn->node_page,
-						dn->ofs_in_node + 1);
+						ofs_in_node + 1);
 
 		f2fs_update_read_extent_tree_range_compressed(dn->inode,
-					index, blkaddr,
-					F2FS_I(dn->inode)->i_cluster_size,
-					c_len);
+					fofs, blkaddr, cluster_size, c_len);
 	}
 out:
 	return 0;
@@ -1919,7 +1927,7 @@ void f2fs_flush_inline_data(struct f2fs_sb_info *sbi)
 		for (i = 0; i < nr_folios; i++) {
 			struct page *page = &fbatch.folios[i]->page;
 
-			if (!IS_DNODE(page))
+			if (!IS_INODE(page))
 				continue;
 
 			lock_page(page);
@@ -2841,7 +2849,7 @@ int f2fs_restore_node_summary(struct f2fs_sb_info *sbi,
 	int i, idx, last_offset, nrpages;
 
 	/* scan the node segment */
-	last_offset = sbi->blocks_per_seg;
+	last_offset = BLKS_PER_SEG(sbi);
 	addr = START_BLOCK(sbi, segno);
 	sum_entry = &sum->entries[0];
 
@@ -3158,7 +3166,7 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
 	if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG))
 		return 0;
 
-	nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg -
+	nat_bits_addr = __start_cp_addr(sbi) + BLKS_PER_SEG(sbi) -
 						nm_i->nat_bits_blocks;
 	for (i = 0; i < nm_i->nat_bits_blocks; i++) {
 		struct page *page;
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index 5bd16a95eef8..6aea13024ac1 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -208,10 +208,10 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start)
 
 	block_addr = (pgoff_t)(nm_i->nat_blkaddr +
 		(block_off << 1) -
-		(block_off & (sbi->blocks_per_seg - 1)));
+		(block_off & (BLKS_PER_SEG(sbi) - 1)));
 
 	if (f2fs_test_bit(block_off, nm_i->nat_bitmap))
-		block_addr += sbi->blocks_per_seg;
+		block_addr += BLKS_PER_SEG(sbi);
 
 	return block_addr;
 }
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index d0f24ccbd1ac..e7bf15b8240a 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -354,7 +354,7 @@ static unsigned int adjust_por_ra_blocks(struct f2fs_sb_info *sbi,
 	if (blkaddr + 1 == next_blkaddr)
 		ra_blocks = min_t(unsigned int, RECOVERY_MAX_RA_BLOCKS,
 							ra_blocks * 2);
-	else if (next_blkaddr % sbi->blocks_per_seg)
+	else if (next_blkaddr % BLKS_PER_SEG(sbi))
 		ra_blocks = max_t(unsigned int, RECOVERY_MIN_RA_BLOCKS,
 							ra_blocks / 2);
 	return ra_blocks;
@@ -611,6 +611,19 @@ truncate_out:
 	return 0;
 }
 
+static int f2fs_reserve_new_block_retry(struct dnode_of_data *dn)
+{
+	int i, err = 0;
+
+	for (i = DEFAULT_FAILURE_RETRY_COUNT; i > 0; i--) {
+		err = f2fs_reserve_new_block(dn);
+		if (!err)
+			break;
+	}
+
+	return err;
+}
+
 static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 					struct page *page)
 {
@@ -680,14 +693,12 @@ retry_dn:
 		if (__is_valid_data_blkaddr(src) &&
 			!f2fs_is_valid_blkaddr(sbi, src, META_POR)) {
 			err = -EFSCORRUPTED;
-			f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
 			goto err;
 		}
 
 		if (__is_valid_data_blkaddr(dest) &&
 			!f2fs_is_valid_blkaddr(sbi, dest, META_POR)) {
 			err = -EFSCORRUPTED;
-			f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
 			goto err;
 		}
 
@@ -712,14 +723,8 @@ retry_dn:
 		 */
 		if (dest == NEW_ADDR) {
 			f2fs_truncate_data_blocks_range(&dn, 1);
-			do {
-				err = f2fs_reserve_new_block(&dn);
-				if (err == -ENOSPC) {
-					f2fs_bug_on(sbi, 1);
-					break;
-				}
-			} while (err &&
-				IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION));
+
+			err = f2fs_reserve_new_block_retry(&dn);
 			if (err)
 				goto err;
 			continue;
@@ -727,16 +732,8 @@ retry_dn:
 
 		/* dest is valid block, try to recover from src to dest */
 		if (f2fs_is_valid_blkaddr(sbi, dest, META_POR)) {
-
 			if (src == NULL_ADDR) {
-				do {
-					err = f2fs_reserve_new_block(&dn);
-					if (err == -ENOSPC) {
-						f2fs_bug_on(sbi, 1);
-						break;
-					}
-				} while (err &&
-					IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION));
+				err = f2fs_reserve_new_block_retry(&dn);
 				if (err)
 					goto err;
 			}
@@ -756,8 +753,6 @@ retry_prev:
 				f2fs_err(sbi, "Inconsistent dest blkaddr:%u, ino:%lu, ofs:%u",
 					dest, inode->i_ino, dn.ofs_in_node);
 				err = -EFSCORRUPTED;
-				f2fs_handle_error(sbi,
-						ERROR_INVALID_BLKADDR);
 				goto err;
 			}
 
@@ -852,7 +847,7 @@ next:
 		f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks);
 	}
 	if (!err)
-		f2fs_allocate_new_segments(sbi);
+		err = f2fs_allocate_new_segments(sbi);
 	return err;
 }
 
@@ -864,7 +859,6 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
 	int ret = 0;
 	unsigned long s_flags = sbi->sb->s_flags;
 	bool need_writecp = false;
-	bool fix_curseg_write_pointer = false;
 
 	if (is_sbi_flag_set(sbi, SBI_IS_WRITABLE))
 		f2fs_info(sbi, "recover fsync data on readonly fs");
@@ -895,8 +889,6 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
 	else
 		f2fs_bug_on(sbi, sbi->sb->s_flags & SB_ACTIVE);
 skip:
-	fix_curseg_write_pointer = !check_only || list_empty(&inode_list);
-
 	destroy_fsync_dnodes(&inode_list, err);
 	destroy_fsync_dnodes(&tmp_inode_list, err);
 
@@ -914,11 +906,13 @@ skip:
 	 * and the f2fs is not read only, check and fix zoned block devices'
 	 * write pointer consistency.
 	 */
-	if (!err && fix_curseg_write_pointer && !f2fs_readonly(sbi->sb) &&
-			f2fs_sb_has_blkzoned(sbi)) {
-		err = f2fs_fix_curseg_write_pointer(sbi);
-		if (!err)
-			err = f2fs_check_write_pointer(sbi);
+	if (f2fs_sb_has_blkzoned(sbi) && !f2fs_readonly(sbi->sb)) {
+		int err2 = f2fs_fix_curseg_write_pointer(sbi);
+
+		if (!err2)
+			err2 = f2fs_check_write_pointer(sbi);
+		if (err2)
+			err = err2;
 		ret = err;
 	}
 
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 4c8836ded90f..4fd76e867e0a 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -192,6 +192,9 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean)
 	if (!f2fs_is_atomic_file(inode))
 		return;
 
+	if (clean)
+		truncate_inode_pages_final(inode->i_mapping);
+
 	release_atomic_write_cnt(inode);
 	clear_inode_flag(inode, FI_ATOMIC_COMMITTED);
 	clear_inode_flag(inode, FI_ATOMIC_REPLACE);
@@ -201,7 +204,6 @@ void f2fs_abort_atomic_write(struct inode *inode, bool clean)
 	F2FS_I(inode)->atomic_write_task = NULL;
 
 	if (clean) {
-		truncate_inode_pages_final(inode->i_mapping);
 		f2fs_i_size_write(inode, fi->original_i_size);
 		fi->original_i_size = 0;
 	}
@@ -248,7 +250,7 @@ retry:
 	} else {
 		blkcnt_t count = 1;
 
-		err = inc_valid_block_count(sbi, inode, &count);
+		err = inc_valid_block_count(sbi, inode, &count, true);
 		if (err) {
 			f2fs_put_dnode(&dn);
 			return err;
@@ -334,8 +336,6 @@ static int __f2fs_commit_atomic_write(struct inode *inode)
 					DATA_GENERIC_ENHANCE)) {
 				f2fs_put_dnode(&dn);
 				ret = -EFSCORRUPTED;
-				f2fs_handle_error(sbi,
-						ERROR_INVALID_BLKADDR);
 				goto out;
 			}
 
@@ -400,6 +400,9 @@ int f2fs_commit_atomic_write(struct inode *inode)
  */
 void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
 {
+	if (f2fs_cp_error(sbi))
+		return;
+
 	if (time_to_inject(sbi, FAULT_CHECKPOINT))
 		f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_FAULT_INJECT);
 
@@ -448,8 +451,8 @@ static inline bool excess_dirty_threshold(struct f2fs_sb_info *sbi)
 	unsigned int nodes = get_pages(sbi, F2FS_DIRTY_NODES);
 	unsigned int meta = get_pages(sbi, F2FS_DIRTY_META);
 	unsigned int imeta = get_pages(sbi, F2FS_DIRTY_IMETA);
-	unsigned int threshold = sbi->blocks_per_seg * factor *
-					DEFAULT_DIRTY_THRESHOLD;
+	unsigned int threshold =
+		SEGS_TO_BLKS(sbi, (factor * DEFAULT_DIRTY_THRESHOLD));
 	unsigned int global_threshold = threshold * 3 / 2;
 
 	if (dents >= threshold || qdata >= threshold ||
@@ -872,7 +875,7 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi)
 {
 	int ovp_hole_segs =
 		(overprovision_segments(sbi) - reserved_segments(sbi));
-	block_t ovp_holes = ovp_hole_segs << sbi->log_blocks_per_seg;
+	block_t ovp_holes = SEGS_TO_BLKS(sbi, ovp_hole_segs);
 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
 	block_t holes[2] = {0, 0};	/* DATA and NODE */
 	block_t unusable;
@@ -901,11 +904,16 @@ int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable)
 {
 	int ovp_hole_segs =
 		(overprovision_segments(sbi) - reserved_segments(sbi));
+
+	if (F2FS_OPTION(sbi).unusable_cap_perc == 100)
+		return 0;
 	if (unusable > F2FS_OPTION(sbi).unusable_cap)
 		return -EAGAIN;
 	if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK) &&
 		dirty_segments(sbi) > ovp_hole_segs)
 		return -EAGAIN;
+	if (has_not_enough_free_secs(sbi, 0, 0))
+		return -EAGAIN;
 	return 0;
 }
 
@@ -1132,8 +1140,7 @@ static void __check_sit_bitmap(struct f2fs_sb_info *sbi,
 	struct seg_entry *sentry;
 	unsigned int segno;
 	block_t blk = start;
-	unsigned long offset, size, max_blocks = sbi->blocks_per_seg;
-	unsigned long *map;
+	unsigned long offset, size, *map;
 
 	while (blk < end) {
 		segno = GET_SEGNO(sbi, blk);
@@ -1143,7 +1150,7 @@ static void __check_sit_bitmap(struct f2fs_sb_info *sbi,
 		if (end < START_BLOCK(sbi, segno + 1))
 			size = GET_BLKOFF_FROM_SEG0(sbi, end);
 		else
-			size = max_blocks;
+			size = BLKS_PER_SEG(sbi);
 		map = (unsigned long *)(sentry->cur_valid_map);
 		offset = __find_rev_next_bit(map, size, offset);
 		f2fs_bug_on(sbi, offset != size);
@@ -1971,9 +1978,15 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
 		}
 
 		if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) {
+			unsigned int nofs_flags;
+			int ret;
+
 			trace_f2fs_issue_reset_zone(bdev, blkstart);
-			return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
-						sector, nr_sects, GFP_NOFS);
+			nofs_flags = memalloc_nofs_save();
+			ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
+						sector, nr_sects);
+			memalloc_nofs_restore(nofs_flags);
+			return ret;
 		}
 
 		__queue_zone_reset_cmd(sbi, bdev, blkstart, lblkstart, blklen);
@@ -2042,7 +2055,6 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
 							bool check_only)
 {
 	int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
-	int max_blocks = sbi->blocks_per_seg;
 	struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start);
 	unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
 	unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
@@ -2054,8 +2066,9 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
 	struct list_head *head = &SM_I(sbi)->dcc_info->entry_list;
 	int i;
 
-	if (se->valid_blocks == max_blocks || !f2fs_hw_support_discard(sbi) ||
-			!f2fs_block_unit_discard(sbi))
+	if (se->valid_blocks == BLKS_PER_SEG(sbi) ||
+	    !f2fs_hw_support_discard(sbi) ||
+	    !f2fs_block_unit_discard(sbi))
 		return false;
 
 	if (!force) {
@@ -2072,13 +2085,14 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc,
 
 	while (force || SM_I(sbi)->dcc_info->nr_discards <=
 				SM_I(sbi)->dcc_info->max_discards) {
-		start = __find_rev_next_bit(dmap, max_blocks, end + 1);
-		if (start >= max_blocks)
+		start = __find_rev_next_bit(dmap, BLKS_PER_SEG(sbi), end + 1);
+		if (start >= BLKS_PER_SEG(sbi))
 			break;
 
-		end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1);
-		if (force && start && end != max_blocks
-					&& (end - start) < cpc->trim_minlen)
+		end = __find_rev_next_zero_bit(dmap,
+						BLKS_PER_SEG(sbi), start + 1);
+		if (force && start && end != BLKS_PER_SEG(sbi) &&
+		    (end - start) < cpc->trim_minlen)
 			continue;
 
 		if (check_only)
@@ -2160,8 +2174,8 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
 								start + 1);
 
 		if (section_alignment) {
-			start = rounddown(start, sbi->segs_per_sec);
-			end = roundup(end, sbi->segs_per_sec);
+			start = rounddown(start, SEGS_PER_SEC(sbi));
+			end = roundup(end, SEGS_PER_SEC(sbi));
 		}
 
 		for (i = start; i < end; i++) {
@@ -2180,7 +2194,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
 		if (!f2fs_sb_has_blkzoned(sbi) &&
 		    (!f2fs_lfs_mode(sbi) || !__is_large_section(sbi))) {
 			f2fs_issue_discard(sbi, START_BLOCK(sbi, start),
-				(end - start) << sbi->log_blocks_per_seg);
+				SEGS_TO_BLKS(sbi, end - start));
 			continue;
 		}
 next:
@@ -2189,9 +2203,9 @@ next:
 		if (!IS_CURSEC(sbi, secno) &&
 			!get_valid_blocks(sbi, start, true))
 			f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno),
-				sbi->segs_per_sec << sbi->log_blocks_per_seg);
+						BLKS_PER_SEC(sbi));
 
-		start = start_segno + sbi->segs_per_sec;
+		start = start_segno + SEGS_PER_SEC(sbi);
 		if (start < end)
 			goto next;
 		else
@@ -2210,7 +2224,7 @@ next:
 find_next:
 		if (is_valid) {
 			next_pos = find_next_zero_bit_le(entry->discard_map,
-					sbi->blocks_per_seg, cur_pos);
+						BLKS_PER_SEG(sbi), cur_pos);
 			len = next_pos - cur_pos;
 
 			if (f2fs_sb_has_blkzoned(sbi) ||
@@ -2222,13 +2236,13 @@ find_next:
 			total_len += len;
 		} else {
 			next_pos = find_next_bit_le(entry->discard_map,
-					sbi->blocks_per_seg, cur_pos);
+						BLKS_PER_SEG(sbi), cur_pos);
 		}
 skip:
 		cur_pos = next_pos;
 		is_valid = !is_valid;
 
-		if (cur_pos < sbi->blocks_per_seg)
+		if (cur_pos < BLKS_PER_SEG(sbi))
 			goto find_next;
 
 		release_discard_addr(entry);
@@ -2245,6 +2259,12 @@ int f2fs_start_discard_thread(struct f2fs_sb_info *sbi)
 	struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
 	int err = 0;
 
+	if (f2fs_sb_has_readonly(sbi)) {
+		f2fs_info(sbi,
+			"Skip to start discard thread for readonly image");
+		return 0;
+	}
+
 	if (!f2fs_realtime_discard_enable(sbi))
 		return 0;
 
@@ -2277,7 +2297,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
 	dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY;
 	dcc->discard_io_aware = DPOLICY_IO_AWARE_ENABLE;
 	if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT)
-		dcc->discard_granularity = sbi->blocks_per_seg;
+		dcc->discard_granularity = BLKS_PER_SEG(sbi);
 	else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION)
 		dcc->discard_granularity = BLKS_PER_SEC(sbi);
 
@@ -2291,7 +2311,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
 	atomic_set(&dcc->queued_discard, 0);
 	atomic_set(&dcc->discard_cmd_cnt, 0);
 	dcc->nr_discards = 0;
-	dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg;
+	dcc->max_discards = SEGS_TO_BLKS(sbi, MAIN_SEGS(sbi));
 	dcc->max_discard_request = DEF_MAX_DISCARD_REQUEST;
 	dcc->min_discard_issue_time = DEF_MIN_DISCARD_ISSUE_TIME;
 	dcc->mid_discard_issue_time = DEF_MID_DISCARD_ISSUE_TIME;
@@ -2399,6 +2419,8 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
 #endif
 
 	segno = GET_SEGNO(sbi, blkaddr);
+	if (segno == NULL_SEGNO)
+		return;
 
 	se = get_seg_entry(sbi, segno);
 	new_vblocks = se->valid_blocks + del;
@@ -2540,7 +2562,7 @@ static unsigned short f2fs_curseg_valid_blocks(struct f2fs_sb_info *sbi, int typ
 	struct curseg_info *curseg = CURSEG_I(sbi, type);
 
 	if (sbi->ckpt->alloc_type[type] == SSR)
-		return sbi->blocks_per_seg;
+		return BLKS_PER_SEG(sbi);
 	return curseg->next_blkoff;
 }
 
@@ -2628,7 +2650,7 @@ static int is_next_segment_free(struct f2fs_sb_info *sbi,
 	unsigned int segno = curseg->segno + 1;
 	struct free_segmap_info *free_i = FREE_I(sbi);
 
-	if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec)
+	if (segno < MAIN_SEGS(sbi) && segno % SEGS_PER_SEC(sbi))
 		return !test_bit(segno, free_i->free_segmap);
 	return 0;
 }
@@ -2637,54 +2659,51 @@ static int is_next_segment_free(struct f2fs_sb_info *sbi,
  * Find a new segment from the free segments bitmap to right order
  * This function should be returned with success, otherwise BUG
  */
-static void get_new_segment(struct f2fs_sb_info *sbi,
-			unsigned int *newseg, bool new_sec, int dir)
+static int get_new_segment(struct f2fs_sb_info *sbi,
+			unsigned int *newseg, bool new_sec, bool pinning)
 {
 	struct free_segmap_info *free_i = FREE_I(sbi);
 	unsigned int segno, secno, zoneno;
 	unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone;
 	unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg);
 	unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg);
-	unsigned int left_start = hint;
 	bool init = true;
-	int go_left = 0;
 	int i;
+	int ret = 0;
 
 	spin_lock(&free_i->segmap_lock);
 
-	if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
+	if (time_to_inject(sbi, FAULT_NO_SEGMENT)) {
+		ret = -ENOSPC;
+		goto out_unlock;
+	}
+
+	if (!new_sec && ((*newseg + 1) % SEGS_PER_SEC(sbi))) {
 		segno = find_next_zero_bit(free_i->free_segmap,
 			GET_SEG_FROM_SEC(sbi, hint + 1), *newseg + 1);
 		if (segno < GET_SEG_FROM_SEC(sbi, hint + 1))
 			goto got_it;
 	}
+
+	/*
+	 * If we format f2fs on zoned storage, let's try to get pinned sections
+	 * from beginning of the storage, which should be a conventional one.
+	 */
+	if (f2fs_sb_has_blkzoned(sbi)) {
+		segno = pinning ? 0 : max(first_zoned_segno(sbi), *newseg);
+		hint = GET_SEC_FROM_SEG(sbi, segno);
+	}
+
 find_other_zone:
 	secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint);
 	if (secno >= MAIN_SECS(sbi)) {
-		if (dir == ALLOC_RIGHT) {
-			secno = find_first_zero_bit(free_i->free_secmap,
+		secno = find_first_zero_bit(free_i->free_secmap,
 							MAIN_SECS(sbi));
-			f2fs_bug_on(sbi, secno >= MAIN_SECS(sbi));
-		} else {
-			go_left = 1;
-			left_start = hint - 1;
-		}
-	}
-	if (go_left == 0)
-		goto skip_left;
-
-	while (test_bit(left_start, free_i->free_secmap)) {
-		if (left_start > 0) {
-			left_start--;
-			continue;
+		if (secno >= MAIN_SECS(sbi)) {
+			ret = -ENOSPC;
+			goto out_unlock;
 		}
-		left_start = find_first_zero_bit(free_i->free_secmap,
-							MAIN_SECS(sbi));
-		f2fs_bug_on(sbi, left_start >= MAIN_SECS(sbi));
-		break;
 	}
-	secno = left_start;
-skip_left:
 	segno = GET_SEG_FROM_SEC(sbi, secno);
 	zoneno = GET_ZONE_FROM_SEC(sbi, secno);
 
@@ -2695,21 +2714,13 @@ skip_left:
 		goto got_it;
 	if (zoneno == old_zoneno)
 		goto got_it;
-	if (dir == ALLOC_LEFT) {
-		if (!go_left && zoneno + 1 >= total_zones)
-			goto got_it;
-		if (go_left && zoneno == 0)
-			goto got_it;
-	}
 	for (i = 0; i < NR_CURSEG_TYPE; i++)
 		if (CURSEG_I(sbi, i)->zone == zoneno)
 			break;
 
 	if (i < NR_CURSEG_TYPE) {
 		/* zone is in user, try another */
-		if (go_left)
-			hint = zoneno * sbi->secs_per_zone - 1;
-		else if (zoneno + 1 >= total_zones)
+		if (zoneno + 1 >= total_zones)
 			hint = 0;
 		else
 			hint = (zoneno + 1) * sbi->secs_per_zone;
@@ -2719,9 +2730,23 @@ skip_left:
 got_it:
 	/* set it as dirty segment in free segmap */
 	f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap));
+
+	/* no free section in conventional zone */
+	if (new_sec && pinning &&
+		!f2fs_valid_pinned_area(sbi, START_BLOCK(sbi, segno))) {
+		ret = -EAGAIN;
+		goto out_unlock;
+	}
 	__set_inuse(sbi, segno);
 	*newseg = segno;
+out_unlock:
 	spin_unlock(&free_i->segmap_lock);
+
+	if (ret == -ENOSPC) {
+		f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_NO_SEGMENT);
+		f2fs_bug_on(sbi, 1);
+	}
+	return ret;
 }
 
 static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
@@ -2730,6 +2755,10 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
 	struct summary_footer *sum_footer;
 	unsigned short seg_type = curseg->seg_type;
 
+	/* only happen when get_new_segment() fails */
+	if (curseg->next_segno == NULL_SEGNO)
+		return;
+
 	curseg->inited = true;
 	curseg->segno = curseg->next_segno;
 	curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno);
@@ -2755,9 +2784,8 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
 
 	sanity_check_seg_type(sbi, seg_type);
 	if (f2fs_need_rand_seg(sbi))
-		return get_random_u32_below(MAIN_SECS(sbi) * sbi->segs_per_sec);
+		return get_random_u32_below(MAIN_SECS(sbi) * SEGS_PER_SEC(sbi));
 
-	/* if segs_per_sec is large than 1, we need to keep original policy. */
 	if (__is_large_section(sbi))
 		return curseg->segno;
 
@@ -2768,8 +2796,7 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
 	if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED)))
 		return 0;
 
-	if (test_opt(sbi, NOHEAP) &&
-		(seg_type == CURSEG_HOT_DATA || IS_NODESEG(seg_type)))
+	if (seg_type == CURSEG_HOT_DATA || IS_NODESEG(seg_type))
 		return 0;
 
 	if (SIT_I(sbi)->last_victim[ALLOC_NEXT])
@@ -2786,30 +2813,31 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type)
  * Allocate a current working segment.
  * This function always allocates a free segment in LFS manner.
  */
-static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
+static int new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec)
 {
 	struct curseg_info *curseg = CURSEG_I(sbi, type);
-	unsigned short seg_type = curseg->seg_type;
 	unsigned int segno = curseg->segno;
-	int dir = ALLOC_LEFT;
+	bool pinning = type == CURSEG_COLD_DATA_PINNED;
+	int ret;
 
 	if (curseg->inited)
-		write_sum_page(sbi, curseg->sum_blk,
-				GET_SUM_BLOCK(sbi, segno));
-	if (seg_type == CURSEG_WARM_DATA || seg_type == CURSEG_COLD_DATA)
-		dir = ALLOC_RIGHT;
-
-	if (test_opt(sbi, NOHEAP))
-		dir = ALLOC_RIGHT;
+		write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, segno));
 
 	segno = __get_next_segno(sbi, type);
-	get_new_segment(sbi, &segno, new_sec, dir);
+	ret = get_new_segment(sbi, &segno, new_sec, pinning);
+	if (ret) {
+		if (ret == -ENOSPC)
+			curseg->segno = NULL_SEGNO;
+		return ret;
+	}
+
 	curseg->next_segno = segno;
 	reset_curseg(sbi, type, 1);
 	curseg->alloc_type = LFS;
 	if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK)
 		curseg->fragment_remained_chunk =
 				get_random_u32_inclusive(1, sbi->max_fragment_chunk);
+	return 0;
 }
 
 static int __next_free_blkoff(struct f2fs_sb_info *sbi,
@@ -2825,7 +2853,7 @@ static int __next_free_blkoff(struct f2fs_sb_info *sbi,
 	for (i = 0; i < entries; i++)
 		target_map[i] = ckpt_map[i] | cur_map[i];
 
-	return __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start);
+	return __find_rev_next_zero_bit(target_map, BLKS_PER_SEG(sbi), start);
 }
 
 static int f2fs_find_next_ssr_block(struct f2fs_sb_info *sbi,
@@ -2836,14 +2864,14 @@ static int f2fs_find_next_ssr_block(struct f2fs_sb_info *sbi,
 
 bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno)
 {
-	return __next_free_blkoff(sbi, segno, 0) < sbi->blocks_per_seg;
+	return __next_free_blkoff(sbi, segno, 0) < BLKS_PER_SEG(sbi);
 }
 
 /*
  * This function always allocates a used segment(from dirty seglist) by SSR
  * manner, so it should recover the existing segment information of valid blocks
  */
-static void change_curseg(struct f2fs_sb_info *sbi, int type)
+static int change_curseg(struct f2fs_sb_info *sbi, int type)
 {
 	struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
 	struct curseg_info *curseg = CURSEG_I(sbi, type);
@@ -2868,21 +2896,23 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type)
 	if (IS_ERR(sum_page)) {
 		/* GC won't be able to use stale summary pages by cp_error */
 		memset(curseg->sum_blk, 0, SUM_ENTRY_SIZE);
-		return;
+		return PTR_ERR(sum_page);
 	}
 	sum_node = (struct f2fs_summary_block *)page_address(sum_page);
 	memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE);
 	f2fs_put_page(sum_page, 1);
+	return 0;
 }
 
 static int get_ssr_segment(struct f2fs_sb_info *sbi, int type,
 				int alloc_mode, unsigned long long age);
 
-static void get_atssr_segment(struct f2fs_sb_info *sbi, int type,
+static int get_atssr_segment(struct f2fs_sb_info *sbi, int type,
 					int target_type, int alloc_mode,
 					unsigned long long age)
 {
 	struct curseg_info *curseg = CURSEG_I(sbi, type);
+	int ret = 0;
 
 	curseg->seg_type = target_type;
 
@@ -2890,38 +2920,41 @@ static void get_atssr_segment(struct f2fs_sb_info *sbi, int type,
 		struct seg_entry *se = get_seg_entry(sbi, curseg->next_segno);
 
 		curseg->seg_type = se->type;
-		change_curseg(sbi, type);
+		ret = change_curseg(sbi, type);
 	} else {
 		/* allocate cold segment by default */
 		curseg->seg_type = CURSEG_COLD_DATA;
-		new_curseg(sbi, type, true);
+		ret = new_curseg(sbi, type, true);
 	}
 	stat_inc_seg_type(sbi, curseg);
+	return ret;
 }
 
-static void __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi)
+static int __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi)
 {
 	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC);
+	int ret = 0;
 
 	if (!sbi->am.atgc_enabled)
-		return;
+		return 0;
 
 	f2fs_down_read(&SM_I(sbi)->curseg_lock);
 
 	mutex_lock(&curseg->curseg_mutex);
 	down_write(&SIT_I(sbi)->sentry_lock);
 
-	get_atssr_segment(sbi, CURSEG_ALL_DATA_ATGC, CURSEG_COLD_DATA, SSR, 0);
+	ret = get_atssr_segment(sbi, CURSEG_ALL_DATA_ATGC,
+					CURSEG_COLD_DATA, SSR, 0);
 
 	up_write(&SIT_I(sbi)->sentry_lock);
 	mutex_unlock(&curseg->curseg_mutex);
 
 	f2fs_up_read(&SM_I(sbi)->curseg_lock);
-
+	return ret;
 }
-void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi)
+int f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi)
 {
-	__f2fs_init_atgc_curseg(sbi);
+	return __f2fs_init_atgc_curseg(sbi);
 }
 
 static void __f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi, int type)
@@ -3049,11 +3082,12 @@ static bool need_new_seg(struct f2fs_sb_info *sbi, int type)
 	return false;
 }
 
-void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
+int f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
 					unsigned int start, unsigned int end)
 {
 	struct curseg_info *curseg = CURSEG_I(sbi, type);
 	unsigned int segno;
+	int ret = 0;
 
 	f2fs_down_read(&SM_I(sbi)->curseg_lock);
 	mutex_lock(&curseg->curseg_mutex);
@@ -3064,9 +3098,9 @@ void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type,
 		goto unlock;
 
 	if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type, SSR, 0))
-		change_curseg(sbi, type);
+		ret = change_curseg(sbi, type);
 	else
-		new_curseg(sbi, type, true);
+		ret = new_curseg(sbi, type, true);
 
 	stat_inc_seg_type(sbi, curseg);
 
@@ -3080,45 +3114,84 @@ unlock:
 
 	mutex_unlock(&curseg->curseg_mutex);
 	f2fs_up_read(&SM_I(sbi)->curseg_lock);
+	return ret;
 }
 
-static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type,
+static int __allocate_new_segment(struct f2fs_sb_info *sbi, int type,
 						bool new_sec, bool force)
 {
 	struct curseg_info *curseg = CURSEG_I(sbi, type);
 	unsigned int old_segno;
+	int err = 0;
+
+	if (type == CURSEG_COLD_DATA_PINNED && !curseg->inited)
+		goto allocate;
 
 	if (!force && curseg->inited &&
 	    !curseg->next_blkoff &&
 	    !get_valid_blocks(sbi, curseg->segno, new_sec) &&
 	    !get_ckpt_valid_blocks(sbi, curseg->segno, new_sec))
-		return;
+		return 0;
 
+allocate:
 	old_segno = curseg->segno;
-	new_curseg(sbi, type, true);
+	err = new_curseg(sbi, type, true);
+	if (err)
+		return err;
 	stat_inc_seg_type(sbi, curseg);
 	locate_dirty_segment(sbi, old_segno);
+	return 0;
 }
 
-void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force)
+int f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force)
 {
+	int ret;
+
 	f2fs_down_read(&SM_I(sbi)->curseg_lock);
 	down_write(&SIT_I(sbi)->sentry_lock);
-	__allocate_new_segment(sbi, type, true, force);
+	ret = __allocate_new_segment(sbi, type, true, force);
 	up_write(&SIT_I(sbi)->sentry_lock);
 	f2fs_up_read(&SM_I(sbi)->curseg_lock);
+
+	return ret;
+}
+
+int f2fs_allocate_pinning_section(struct f2fs_sb_info *sbi)
+{
+	int err;
+	bool gc_required = true;
+
+retry:
+	f2fs_lock_op(sbi);
+	err = f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false);
+	f2fs_unlock_op(sbi);
+
+	if (f2fs_sb_has_blkzoned(sbi) && err == -EAGAIN && gc_required) {
+		f2fs_down_write(&sbi->gc_lock);
+		err = f2fs_gc_range(sbi, 0, GET_SEGNO(sbi, FDEV(0).end_blk), true, 1);
+		f2fs_up_write(&sbi->gc_lock);
+
+		gc_required = false;
+		if (!err)
+			goto retry;
+	}
+
+	return err;
 }
 
-void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
+int f2fs_allocate_new_segments(struct f2fs_sb_info *sbi)
 {
 	int i;
+	int err = 0;
 
 	f2fs_down_read(&SM_I(sbi)->curseg_lock);
 	down_write(&SIT_I(sbi)->sentry_lock);
 	for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++)
-		__allocate_new_segment(sbi, i, false, false);
+		err += __allocate_new_segment(sbi, i, false, false);
 	up_write(&SIT_I(sbi)->sentry_lock);
 	f2fs_up_read(&SM_I(sbi)->curseg_lock);
+
+	return err;
 }
 
 bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi,
@@ -3236,8 +3309,8 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
 	end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 :
 						GET_SEGNO(sbi, end);
 	if (need_align) {
-		start_segno = rounddown(start_segno, sbi->segs_per_sec);
-		end_segno = roundup(end_segno + 1, sbi->segs_per_sec) - 1;
+		start_segno = rounddown(start_segno, SEGS_PER_SEC(sbi));
+		end_segno = roundup(end_segno + 1, SEGS_PER_SEC(sbi)) - 1;
 	}
 
 	cpc.reason = CP_DISCARD;
@@ -3410,7 +3483,14 @@ static void f2fs_randomize_chunk(struct f2fs_sb_info *sbi,
 		get_random_u32_inclusive(1, sbi->max_fragment_hole);
 }
 
-void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
+static void reset_curseg_fields(struct curseg_info *curseg)
+{
+	curseg->inited = false;
+	curseg->segno = NULL_SEGNO;
+	curseg->next_segno = 0;
+}
+
+int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
 		block_t old_blkaddr, block_t *new_blkaddr,
 		struct f2fs_summary *sum, int type,
 		struct f2fs_io_info *fio)
@@ -3421,12 +3501,18 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
 	bool from_gc = (type == CURSEG_ALL_DATA_ATGC);
 	struct seg_entry *se = NULL;
 	bool segment_full = false;
+	int ret = 0;
 
 	f2fs_down_read(&SM_I(sbi)->curseg_lock);
 
 	mutex_lock(&curseg->curseg_mutex);
 	down_write(&sit_i->sentry_lock);
 
+	if (curseg->segno == NULL_SEGNO) {
+		ret = -ENOSPC;
+		goto out_err;
+	}
+
 	if (from_gc) {
 		f2fs_bug_on(sbi, GET_SEGNO(sbi, old_blkaddr) == NULL_SEGNO);
 		se = get_seg_entry(sbi, GET_SEGNO(sbi, old_blkaddr));
@@ -3435,7 +3521,7 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
 	}
 	*new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
 
-	f2fs_bug_on(sbi, curseg->next_blkoff >= sbi->blocks_per_seg);
+	f2fs_bug_on(sbi, curseg->next_blkoff >= BLKS_PER_SEG(sbi));
 
 	f2fs_wait_discard_bio(sbi, *new_blkaddr);
 
@@ -3464,25 +3550,35 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
 	 * since SSR needs latest valid block information.
 	 */
 	update_sit_entry(sbi, *new_blkaddr, 1);
-	if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
-		update_sit_entry(sbi, old_blkaddr, -1);
+	update_sit_entry(sbi, old_blkaddr, -1);
 
 	/*
 	 * If the current segment is full, flush it out and replace it with a
 	 * new segment.
 	 */
 	if (segment_full) {
+		if (type == CURSEG_COLD_DATA_PINNED &&
+		    !((curseg->segno + 1) % sbi->segs_per_sec)) {
+			reset_curseg_fields(curseg);
+			goto skip_new_segment;
+		}
+
 		if (from_gc) {
-			get_atssr_segment(sbi, type, se->type,
+			ret = get_atssr_segment(sbi, type, se->type,
 						AT_SSR, se->mtime);
 		} else {
 			if (need_new_seg(sbi, type))
-				new_curseg(sbi, type, false);
+				ret = new_curseg(sbi, type, false);
 			else
-				change_curseg(sbi, type);
+				ret = change_curseg(sbi, type);
 			stat_inc_seg_type(sbi, curseg);
 		}
+
+		if (ret)
+			goto out_err;
 	}
+
+skip_new_segment:
 	/*
 	 * segment dirty status should be updated after segment allocation,
 	 * so we just need to update status only one time after previous
@@ -3491,12 +3587,12 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
 	locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
 	locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr));
 
-	if (IS_DATASEG(type))
+	if (IS_DATASEG(curseg->seg_type))
 		atomic64_inc(&sbi->allocated_data_blocks);
 
 	up_write(&sit_i->sentry_lock);
 
-	if (page && IS_NODESEG(type)) {
+	if (page && IS_NODESEG(curseg->seg_type)) {
 		fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg));
 
 		f2fs_inode_chksum_set(sbi, page);
@@ -3505,9 +3601,6 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
 	if (fio) {
 		struct f2fs_bio_info *io;
 
-		if (F2FS_IO_ALIGNED(sbi))
-			fio->retry = 0;
-
 		INIT_LIST_HEAD(&fio->list);
 		fio->in_list = 1;
 		io = sbi->write_io[fio->type] + fio->temp;
@@ -3517,8 +3610,15 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
 	}
 
 	mutex_unlock(&curseg->curseg_mutex);
-
 	f2fs_up_read(&SM_I(sbi)->curseg_lock);
+	return 0;
+out_err:
+	*new_blkaddr = NULL_ADDR;
+	up_write(&sit_i->sentry_lock);
+	mutex_unlock(&curseg->curseg_mutex);
+	f2fs_up_read(&SM_I(sbi)->curseg_lock);
+	return ret;
+
 }
 
 void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino,
@@ -3555,21 +3655,25 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
 
 	if (keep_order)
 		f2fs_down_read(&fio->sbi->io_order_lock);
-reallocate:
-	f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
-			&fio->new_blkaddr, sum, type, fio);
+
+	if (f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
+			&fio->new_blkaddr, sum, type, fio)) {
+		if (fscrypt_inode_uses_fs_layer_crypto(fio->page->mapping->host))
+			fscrypt_finalize_bounce_page(&fio->encrypted_page);
+		if (PageWriteback(fio->page))
+			end_page_writeback(fio->page);
+		if (f2fs_in_warm_node_list(fio->sbi, fio->page))
+			f2fs_del_fsync_node_entry(fio->sbi, fio->page);
+		goto out;
+	}
 	if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO)
 		f2fs_invalidate_internal_cache(fio->sbi, fio->old_blkaddr);
 
 	/* writeout dirty page into bdev */
 	f2fs_submit_page_write(fio);
-	if (fio->retry) {
-		fio->old_blkaddr = fio->new_blkaddr;
-		goto reallocate;
-	}
 
 	f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1);
-
+out:
 	if (keep_order)
 		f2fs_up_read(&fio->sbi->io_order_lock);
 }
@@ -3653,8 +3757,7 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio)
 	}
 
 	if (fio->post_read)
-		invalidate_mapping_pages(META_MAPPING(sbi),
-				fio->new_blkaddr, fio->new_blkaddr);
+		f2fs_truncate_meta_inode_pages(sbi, fio->new_blkaddr, 1);
 
 	stat_inc_inplace_blocks(fio->sbi);
 
@@ -3743,7 +3846,8 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
 	/* change the current segment */
 	if (segno != curseg->segno) {
 		curseg->next_segno = segno;
-		change_curseg(sbi, type);
+		if (change_curseg(sbi, type))
+			goto out_unlock;
 	}
 
 	curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
@@ -3769,12 +3873,14 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
 	if (recover_curseg) {
 		if (old_cursegno != curseg->segno) {
 			curseg->next_segno = old_cursegno;
-			change_curseg(sbi, type);
+			if (change_curseg(sbi, type))
+				goto out_unlock;
 		}
 		curseg->next_blkoff = old_blkoff;
 		curseg->alloc_type = old_alloc_type;
 	}
 
+out_unlock:
 	up_write(&sit_i->sentry_lock);
 	mutex_unlock(&curseg->curseg_mutex);
 	f2fs_up_write(&SM_I(sbi)->curseg_lock);
@@ -3844,7 +3950,7 @@ void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr,
 	for (i = 0; i < len; i++)
 		f2fs_wait_on_block_writeback(inode, blkaddr + i);
 
-	invalidate_mapping_pages(META_MAPPING(sbi), blkaddr, blkaddr + len - 1);
+	f2fs_truncate_meta_inode_pages(sbi, blkaddr, len);
 }
 
 static int read_compacted_summaries(struct f2fs_sb_info *sbi)
@@ -3886,7 +3992,7 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi)
 		seg_i->next_blkoff = blk_off;
 
 		if (seg_i->alloc_type == SSR)
-			blk_off = sbi->blocks_per_seg;
+			blk_off = BLKS_PER_SEG(sbi);
 
 		for (j = 0; j < blk_off; j++) {
 			struct f2fs_summary *s;
@@ -3954,7 +4060,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
 			struct f2fs_summary *ns = &sum->entries[0];
 			int i;
 
-			for (i = 0; i < sbi->blocks_per_seg; i++, ns++) {
+			for (i = 0; i < BLKS_PER_SEG(sbi); i++, ns++) {
 				ns->version = 0;
 				ns->ofs_in_node = 0;
 			}
@@ -4460,7 +4566,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
 #endif
 
 	sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr);
-	sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg;
+	sit_i->sit_blocks = SEGS_TO_BLKS(sbi, sit_segs);
 	sit_i->written_valid_blocks = 0;
 	sit_i->bitmap_size = sit_bitmap_size;
 	sit_i->dirty_sentries = 0;
@@ -4533,9 +4639,7 @@ static int build_curseg(struct f2fs_sb_info *sbi)
 			array[i].seg_type = CURSEG_COLD_DATA;
 		else if (i == CURSEG_ALL_DATA_ATGC)
 			array[i].seg_type = CURSEG_COLD_DATA;
-		array[i].segno = NULL_SEGNO;
-		array[i].next_blkoff = 0;
-		array[i].inited = false;
+		reset_curseg_fields(&array[i]);
 	}
 	return restore_curseg_summaries(sbi);
 }
@@ -4587,21 +4691,20 @@ static int build_sit_entries(struct f2fs_sb_info *sbi)
 
 			sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks;
 
-			if (f2fs_block_unit_discard(sbi)) {
-				/* build discard map only one time */
-				if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
-					memset(se->discard_map, 0xff,
+			if (!f2fs_block_unit_discard(sbi))
+				goto init_discard_map_done;
+
+			/* build discard map only one time */
+			if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
+				memset(se->discard_map, 0xff,
 						SIT_VBLOCK_MAP_SIZE);
-				} else {
-					memcpy(se->discard_map,
-						se->cur_valid_map,
+				goto init_discard_map_done;
+			}
+			memcpy(se->discard_map, se->cur_valid_map,
 						SIT_VBLOCK_MAP_SIZE);
-					sbi->discard_blks +=
-						sbi->blocks_per_seg -
+			sbi->discard_blks += BLKS_PER_SEG(sbi) -
 						se->valid_blocks;
-				}
-			}
-
+init_discard_map_done:
 			if (__is_large_section(sbi))
 				get_sec_entry(sbi, start)->valid_blocks +=
 							se->valid_blocks;
@@ -4741,7 +4844,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi)
 		return;
 
 	mutex_lock(&dirty_i->seglist_lock);
-	for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
+	for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) {
 		valid_blocks = get_valid_blocks(sbi, segno, true);
 		secno = GET_SEC_FROM_SEG(sbi, segno);
 
@@ -4840,7 +4943,7 @@ static int sanity_check_curseg(struct f2fs_sb_info *sbi)
 		if (curseg->alloc_type == SSR)
 			continue;
 
-		for (blkofs += 1; blkofs < sbi->blocks_per_seg; blkofs++) {
+		for (blkofs += 1; blkofs < BLKS_PER_SEG(sbi); blkofs++) {
 			if (!f2fs_test_bit(blkofs, se->cur_valid_map))
 				continue;
 out:
@@ -4856,6 +4959,16 @@ out:
 }
 
 #ifdef CONFIG_BLK_DEV_ZONED
+static const char *f2fs_zone_status[BLK_ZONE_COND_OFFLINE + 1] = {
+	[BLK_ZONE_COND_NOT_WP]		= "NOT_WP",
+	[BLK_ZONE_COND_EMPTY]		= "EMPTY",
+	[BLK_ZONE_COND_IMP_OPEN]	= "IMPLICIT_OPEN",
+	[BLK_ZONE_COND_EXP_OPEN]	= "EXPLICIT_OPEN",
+	[BLK_ZONE_COND_CLOSED]		= "CLOSED",
+	[BLK_ZONE_COND_READONLY]	= "READONLY",
+	[BLK_ZONE_COND_FULL]		= "FULL",
+	[BLK_ZONE_COND_OFFLINE]		= "OFFLINE",
+};
 
 static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
 				    struct f2fs_dev_info *fdev,
@@ -4865,6 +4978,7 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
 	block_t zone_block, valid_block_cnt;
 	unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT;
 	int ret;
+	unsigned int nofs_flags;
 
 	if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ)
 		return 0;
@@ -4876,14 +4990,19 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
 	 * Skip check of zones cursegs point to, since
 	 * fix_curseg_write_pointer() checks them.
 	 */
-	if (zone_segno >= MAIN_SEGS(sbi) ||
-	    IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, zone_segno)))
+	if (zone_segno >= MAIN_SEGS(sbi))
 		return 0;
 
 	/*
 	 * Get # of valid block of the zone.
 	 */
 	valid_block_cnt = get_valid_blocks(sbi, zone_segno, true);
+	if (IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, zone_segno))) {
+		f2fs_notice(sbi, "Open zones: valid block[0x%x,0x%x] cond[%s]",
+				zone_segno, valid_block_cnt,
+				f2fs_zone_status[zone->cond]);
+		return 0;
+	}
 
 	if ((!valid_block_cnt && zone->cond == BLK_ZONE_COND_EMPTY) ||
 	    (valid_block_cnt && zone->cond == BLK_ZONE_COND_FULL))
@@ -4891,8 +5010,8 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
 
 	if (!valid_block_cnt) {
 		f2fs_notice(sbi, "Zone without valid block has non-zero write "
-			    "pointer. Reset the write pointer: cond[0x%x]",
-			    zone->cond);
+			    "pointer. Reset the write pointer: cond[%s]",
+			    f2fs_zone_status[zone->cond]);
 		ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block,
 					zone->len >> log_sectors_per_block);
 		if (ret)
@@ -4909,11 +5028,13 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
 	 * selected for write operation until it get discarded.
 	 */
 	f2fs_notice(sbi, "Valid blocks are not aligned with write "
-		    "pointer: valid block[0x%x,0x%x] cond[0x%x]",
-		    zone_segno, valid_block_cnt, zone->cond);
+		    "pointer: valid block[0x%x,0x%x] cond[%s]",
+		    zone_segno, valid_block_cnt, f2fs_zone_status[zone->cond]);
 
+	nofs_flags = memalloc_nofs_save();
 	ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH,
-				zone->start, zone->len, GFP_NOFS);
+				zone->start, zone->len);
+	memalloc_nofs_restore(nofs_flags);
 	if (ret == -EOPNOTSUPP) {
 		ret = blkdev_issue_zeroout(fdev->bdev, zone->wp,
 					zone->len - (zone->wp - zone->start),
@@ -5119,7 +5240,7 @@ static inline unsigned int f2fs_usable_zone_blks_in_seg(
 	unsigned int secno;
 
 	if (!sbi->unusable_blocks_per_sec)
-		return sbi->blocks_per_seg;
+		return BLKS_PER_SEG(sbi);
 
 	secno = GET_SEC_FROM_SEG(sbi, segno);
 	seg_start = START_BLOCK(sbi, segno);
@@ -5134,10 +5255,10 @@ static inline unsigned int f2fs_usable_zone_blks_in_seg(
 	 */
 	if (seg_start >= sec_cap_blkaddr)
 		return 0;
-	if (seg_start + sbi->blocks_per_seg > sec_cap_blkaddr)
+	if (seg_start + BLKS_PER_SEG(sbi) > sec_cap_blkaddr)
 		return sec_cap_blkaddr - seg_start;
 
-	return sbi->blocks_per_seg;
+	return BLKS_PER_SEG(sbi);
 }
 #else
 int f2fs_fix_curseg_write_pointer(struct f2fs_sb_info *sbi)
@@ -5163,7 +5284,7 @@ unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi,
 	if (f2fs_sb_has_blkzoned(sbi))
 		return f2fs_usable_zone_blks_in_seg(sbi, segno);
 
-	return sbi->blocks_per_seg;
+	return BLKS_PER_SEG(sbi);
 }
 
 unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi,
@@ -5172,7 +5293,7 @@ unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi,
 	if (f2fs_sb_has_blkzoned(sbi))
 		return CAP_SEGS_PER_SEC(sbi);
 
-	return sbi->segs_per_sec;
+	return SEGS_PER_SEC(sbi);
 }
 
 /*
@@ -5187,14 +5308,14 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi)
 
 	sit_i->min_mtime = ULLONG_MAX;
 
-	for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
+	for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) {
 		unsigned int i;
 		unsigned long long mtime = 0;
 
-		for (i = 0; i < sbi->segs_per_sec; i++)
+		for (i = 0; i < SEGS_PER_SEC(sbi); i++)
 			mtime += get_seg_entry(sbi, segno + i)->mtime;
 
-		mtime = div_u64(mtime, sbi->segs_per_sec);
+		mtime = div_u64(mtime, SEGS_PER_SEC(sbi));
 
 		if (sit_i->min_mtime > mtime)
 			sit_i->min_mtime = mtime;
@@ -5233,7 +5354,7 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi)
 		sm_info->ipu_policy = BIT(F2FS_IPU_FSYNC);
 	sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
 	sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
-	sm_info->min_seq_blocks = sbi->blocks_per_seg;
+	sm_info->min_seq_blocks = BLKS_PER_SEG(sbi);
 	sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS;
 	sm_info->min_ssr_sections = reserved_sections(sbi);
 
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 8129be788bd5..e1c0f418aa11 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -48,21 +48,21 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
 
 #define IS_CURSEC(sbi, secno)						\
 	(((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno /		\
-	  (sbi)->segs_per_sec) ||	\
+	  SEGS_PER_SEC(sbi)) ||	\
 	 ((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno /		\
-	  (sbi)->segs_per_sec) ||	\
+	  SEGS_PER_SEC(sbi)) ||	\
 	 ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno /		\
-	  (sbi)->segs_per_sec) ||	\
+	  SEGS_PER_SEC(sbi)) ||	\
 	 ((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno /		\
-	  (sbi)->segs_per_sec) ||	\
+	  SEGS_PER_SEC(sbi)) ||	\
 	 ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno /		\
-	  (sbi)->segs_per_sec) ||	\
+	  SEGS_PER_SEC(sbi)) ||	\
 	 ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno /		\
-	  (sbi)->segs_per_sec) ||	\
+	  SEGS_PER_SEC(sbi)) ||	\
 	 ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA_PINNED)->segno /	\
-	  (sbi)->segs_per_sec) ||	\
+	  SEGS_PER_SEC(sbi)) ||	\
 	 ((secno) == CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC)->segno /	\
-	  (sbi)->segs_per_sec))
+	  SEGS_PER_SEC(sbi)))
 
 #define MAIN_BLKADDR(sbi)						\
 	(SM_I(sbi) ? SM_I(sbi)->main_blkaddr : 				\
@@ -77,40 +77,37 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
 #define TOTAL_SEGS(sbi)							\
 	(SM_I(sbi) ? SM_I(sbi)->segment_count : 				\
 		le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count))
-#define TOTAL_BLKS(sbi)	(TOTAL_SEGS(sbi) << (sbi)->log_blocks_per_seg)
+#define TOTAL_BLKS(sbi)	(SEGS_TO_BLKS(sbi, TOTAL_SEGS(sbi)))
 
 #define MAX_BLKADDR(sbi)	(SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi))
 #define SEGMENT_SIZE(sbi)	(1ULL << ((sbi)->log_blocksize +	\
 					(sbi)->log_blocks_per_seg))
 
 #define START_BLOCK(sbi, segno)	(SEG0_BLKADDR(sbi) +			\
-	 (GET_R2L_SEGNO(FREE_I(sbi), segno) << (sbi)->log_blocks_per_seg))
+	 (SEGS_TO_BLKS(sbi, GET_R2L_SEGNO(FREE_I(sbi), segno))))
 
 #define NEXT_FREE_BLKADDR(sbi, curseg)					\
 	(START_BLOCK(sbi, (curseg)->segno) + (curseg)->next_blkoff)
 
 #define GET_SEGOFF_FROM_SEG0(sbi, blk_addr)	((blk_addr) - SEG0_BLKADDR(sbi))
 #define GET_SEGNO_FROM_SEG0(sbi, blk_addr)				\
-	(GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> (sbi)->log_blocks_per_seg)
+	(BLKS_TO_SEGS(sbi, GET_SEGOFF_FROM_SEG0(sbi, blk_addr)))
 #define GET_BLKOFF_FROM_SEG0(sbi, blk_addr)				\
-	(GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1))
+	(GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (BLKS_PER_SEG(sbi) - 1))
 
 #define GET_SEGNO(sbi, blk_addr)					\
 	((!__is_valid_data_blkaddr(blk_addr)) ?			\
 	NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi),			\
 		GET_SEGNO_FROM_SEG0(sbi, blk_addr)))
-#define BLKS_PER_SEC(sbi)					\
-	((sbi)->segs_per_sec * (sbi)->blocks_per_seg)
 #define CAP_BLKS_PER_SEC(sbi)					\
-	((sbi)->segs_per_sec * (sbi)->blocks_per_seg -		\
-	 (sbi)->unusable_blocks_per_sec)
+	(BLKS_PER_SEC(sbi) - (sbi)->unusable_blocks_per_sec)
 #define CAP_SEGS_PER_SEC(sbi)					\
-	((sbi)->segs_per_sec - ((sbi)->unusable_blocks_per_sec >>\
-	(sbi)->log_blocks_per_seg))
+	(SEGS_PER_SEC(sbi) -					\
+	BLKS_TO_SEGS(sbi, (sbi)->unusable_blocks_per_sec))
 #define GET_SEC_FROM_SEG(sbi, segno)				\
-	(((segno) == -1) ? -1 : (segno) / (sbi)->segs_per_sec)
+	(((segno) == -1) ? -1 : (segno) / SEGS_PER_SEC(sbi))
 #define GET_SEG_FROM_SEC(sbi, secno)				\
-	((secno) * (sbi)->segs_per_sec)
+	((secno) * SEGS_PER_SEC(sbi))
 #define GET_ZONE_FROM_SEC(sbi, secno)				\
 	(((secno) == -1) ? -1 : (secno) / (sbi)->secs_per_zone)
 #define GET_ZONE_FROM_SEG(sbi, segno)				\
@@ -139,16 +136,6 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
 	((sectors) >> F2FS_LOG_SECTORS_PER_BLOCK)
 
 /*
- * indicate a block allocation direction: RIGHT and LEFT.
- * RIGHT means allocating new sections towards the end of volume.
- * LEFT means the opposite direction.
- */
-enum {
-	ALLOC_RIGHT = 0,
-	ALLOC_LEFT
-};
-
-/*
  * In the victim_sel_policy->alloc_mode, there are three block allocation modes.
  * LFS writes data sequentially with cleaning operations.
  * SSR (Slack Space Recycle) reuses obsolete space without cleaning operations.
@@ -364,7 +351,7 @@ static inline unsigned int get_ckpt_valid_blocks(struct f2fs_sb_info *sbi,
 		unsigned int blocks = 0;
 		int i;
 
-		for (i = 0; i < sbi->segs_per_sec; i++, start_segno++) {
+		for (i = 0; i < SEGS_PER_SEC(sbi); i++, start_segno++) {
 			struct seg_entry *se = get_seg_entry(sbi, start_segno);
 
 			blocks += se->ckpt_valid_blocks;
@@ -449,7 +436,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
 	free_i->free_segments++;
 
 	next = find_next_bit(free_i->free_segmap,
-			start_segno + sbi->segs_per_sec, start_segno);
+			start_segno + SEGS_PER_SEC(sbi), start_segno);
 	if (next >= start_segno + usable_segs) {
 		clear_bit(secno, free_i->free_secmap);
 		free_i->free_sections++;
@@ -485,7 +472,7 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
 		if (!inmem && IS_CURSEC(sbi, secno))
 			goto skip_free;
 		next = find_next_bit(free_i->free_segmap,
-				start_segno + sbi->segs_per_sec, start_segno);
+				start_segno + SEGS_PER_SEC(sbi), start_segno);
 		if (next >= start_segno + usable_segs) {
 			if (test_and_clear_bit(secno, free_i->free_secmap))
 				free_i->free_sections++;
@@ -573,23 +560,22 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi,
 			unsigned int node_blocks, unsigned int dent_blocks)
 {
 
-	unsigned int segno, left_blocks;
+	unsigned segno, left_blocks;
 	int i;
 
-	/* check current node segment */
+	/* check current node sections in the worst case. */
 	for (i = CURSEG_HOT_NODE; i <= CURSEG_COLD_NODE; i++) {
 		segno = CURSEG_I(sbi, i)->segno;
-		left_blocks = f2fs_usable_blks_in_seg(sbi, segno) -
-				get_seg_entry(sbi, segno)->ckpt_valid_blocks;
-
+		left_blocks = CAP_BLKS_PER_SEC(sbi) -
+				get_ckpt_valid_blocks(sbi, segno, true);
 		if (node_blocks > left_blocks)
 			return false;
 	}
 
-	/* check current data segment */
+	/* check current data section for dentry blocks. */
 	segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno;
-	left_blocks = f2fs_usable_blks_in_seg(sbi, segno) -
-			get_seg_entry(sbi, segno)->ckpt_valid_blocks;
+	left_blocks = CAP_BLKS_PER_SEC(sbi) -
+			get_ckpt_valid_blocks(sbi, segno, true);
 	if (dent_blocks > left_blocks)
 		return false;
 	return true;
@@ -638,7 +624,7 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi,
 
 	if (free_secs > upper_secs)
 		return false;
-	else if (free_secs <= lower_secs)
+	if (free_secs <= lower_secs)
 		return true;
 	return !curseg_space;
 }
@@ -793,10 +779,10 @@ static inline int check_block_count(struct f2fs_sb_info *sbi,
 		return -EFSCORRUPTED;
 	}
 
-	if (usable_blks_per_seg < sbi->blocks_per_seg)
+	if (usable_blks_per_seg < BLKS_PER_SEG(sbi))
 		f2fs_bug_on(sbi, find_next_bit_le(&raw_sit->valid_map,
-				sbi->blocks_per_seg,
-				usable_blks_per_seg) != sbi->blocks_per_seg);
+				BLKS_PER_SEG(sbi),
+				usable_blks_per_seg) != BLKS_PER_SEG(sbi));
 
 	/* check segment usage, and check boundary of a given segment number */
 	if (unlikely(GET_SIT_VBLOCKS(raw_sit) > usable_blks_per_seg
@@ -915,9 +901,9 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
 		return 0;
 
 	if (type == DATA)
-		return sbi->blocks_per_seg;
+		return BLKS_PER_SEG(sbi);
 	else if (type == NODE)
-		return 8 * sbi->blocks_per_seg;
+		return SEGS_TO_BLKS(sbi, 8);
 	else if (type == META)
 		return 8 * BIO_MAX_VECS;
 	else
@@ -969,3 +955,13 @@ wake_up:
 	dcc->discard_wake = true;
 	wake_up_interruptible_all(&dcc->discard_wait_queue);
 }
+
+static inline unsigned int first_zoned_segno(struct f2fs_sb_info *sbi)
+{
+	int devi;
+
+	for (devi = 0; devi < sbi->s_ndevs; devi++)
+		if (bdev_is_zoned(FDEV(devi).bdev))
+			return GET_SEGNO(sbi, FDEV(devi).start_blk);
+	return 0;
+}
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index d45ab0992ae5..a4bc26dfdb1a 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -44,24 +44,26 @@ static struct kmem_cache *f2fs_inode_cachep;
 #ifdef CONFIG_F2FS_FAULT_INJECTION
 
 const char *f2fs_fault_name[FAULT_MAX] = {
-	[FAULT_KMALLOC]		= "kmalloc",
-	[FAULT_KVMALLOC]	= "kvmalloc",
-	[FAULT_PAGE_ALLOC]	= "page alloc",
-	[FAULT_PAGE_GET]	= "page get",
-	[FAULT_ALLOC_NID]	= "alloc nid",
-	[FAULT_ORPHAN]		= "orphan",
-	[FAULT_BLOCK]		= "no more block",
-	[FAULT_DIR_DEPTH]	= "too big dir depth",
-	[FAULT_EVICT_INODE]	= "evict_inode fail",
-	[FAULT_TRUNCATE]	= "truncate fail",
-	[FAULT_READ_IO]		= "read IO error",
-	[FAULT_CHECKPOINT]	= "checkpoint error",
-	[FAULT_DISCARD]		= "discard error",
-	[FAULT_WRITE_IO]	= "write IO error",
-	[FAULT_SLAB_ALLOC]	= "slab alloc",
-	[FAULT_DQUOT_INIT]	= "dquot initialize",
-	[FAULT_LOCK_OP]		= "lock_op",
-	[FAULT_BLKADDR]		= "invalid blkaddr",
+	[FAULT_KMALLOC]			= "kmalloc",
+	[FAULT_KVMALLOC]		= "kvmalloc",
+	[FAULT_PAGE_ALLOC]		= "page alloc",
+	[FAULT_PAGE_GET]		= "page get",
+	[FAULT_ALLOC_NID]		= "alloc nid",
+	[FAULT_ORPHAN]			= "orphan",
+	[FAULT_BLOCK]			= "no more block",
+	[FAULT_DIR_DEPTH]		= "too big dir depth",
+	[FAULT_EVICT_INODE]		= "evict_inode fail",
+	[FAULT_TRUNCATE]		= "truncate fail",
+	[FAULT_READ_IO]			= "read IO error",
+	[FAULT_CHECKPOINT]		= "checkpoint error",
+	[FAULT_DISCARD]			= "discard error",
+	[FAULT_WRITE_IO]		= "write IO error",
+	[FAULT_SLAB_ALLOC]		= "slab alloc",
+	[FAULT_DQUOT_INIT]		= "dquot initialize",
+	[FAULT_LOCK_OP]			= "lock_op",
+	[FAULT_BLKADDR_VALIDITY]	= "invalid blkaddr",
+	[FAULT_BLKADDR_CONSISTENCE]	= "inconsistent blkaddr",
+	[FAULT_NO_SEGMENT]		= "no free segment",
 };
 
 void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate,
@@ -137,7 +139,6 @@ enum {
 	Opt_resgid,
 	Opt_resuid,
 	Opt_mode,
-	Opt_io_size_bits,
 	Opt_fault_injection,
 	Opt_fault_type,
 	Opt_lazytime,
@@ -216,7 +217,6 @@ static match_table_t f2fs_tokens = {
 	{Opt_resgid, "resgid=%u"},
 	{Opt_resuid, "resuid=%u"},
 	{Opt_mode, "mode=%s"},
-	{Opt_io_size_bits, "io_bits=%u"},
 	{Opt_fault_injection, "fault_injection=%u"},
 	{Opt_fault_type, "fault_type=%u"},
 	{Opt_lazytime, "lazytime"},
@@ -263,7 +263,8 @@ static match_table_t f2fs_tokens = {
 	{Opt_err, NULL},
 };
 
-void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...)
+void f2fs_printk(struct f2fs_sb_info *sbi, bool limit_rate,
+						const char *fmt, ...)
 {
 	struct va_format vaf;
 	va_list args;
@@ -274,8 +275,12 @@ void f2fs_printk(struct f2fs_sb_info *sbi, const char *fmt, ...)
 	level = printk_get_level(fmt);
 	vaf.fmt = printk_skip_level(fmt);
 	vaf.va = &args;
-	printk("%c%cF2FS-fs (%s): %pV\n",
-	       KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf);
+	if (limit_rate)
+		printk_ratelimited("%c%cF2FS-fs (%s): %pV\n",
+			KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf);
+	else
+		printk("%c%cF2FS-fs (%s): %pV\n",
+			KERN_SOH_ASCII, level, sbi->sb->s_id, &vaf);
 
 	va_end(args);
 }
@@ -343,46 +348,6 @@ static inline void limit_reserve_root(struct f2fs_sb_info *sbi)
 					   F2FS_OPTION(sbi).s_resgid));
 }
 
-static inline int adjust_reserved_segment(struct f2fs_sb_info *sbi)
-{
-	unsigned int sec_blks = sbi->blocks_per_seg * sbi->segs_per_sec;
-	unsigned int avg_vblocks;
-	unsigned int wanted_reserved_segments;
-	block_t avail_user_block_count;
-
-	if (!F2FS_IO_ALIGNED(sbi))
-		return 0;
-
-	/* average valid block count in section in worst case */
-	avg_vblocks = sec_blks / F2FS_IO_SIZE(sbi);
-
-	/*
-	 * we need enough free space when migrating one section in worst case
-	 */
-	wanted_reserved_segments = (F2FS_IO_SIZE(sbi) / avg_vblocks) *
-						reserved_segments(sbi);
-	wanted_reserved_segments -= reserved_segments(sbi);
-
-	avail_user_block_count = sbi->user_block_count -
-				sbi->current_reserved_blocks -
-				F2FS_OPTION(sbi).root_reserved_blocks;
-
-	if (wanted_reserved_segments * sbi->blocks_per_seg >
-					avail_user_block_count) {
-		f2fs_err(sbi, "IO align feature can't grab additional reserved segment: %u, available segments: %u",
-			wanted_reserved_segments,
-			avail_user_block_count >> sbi->log_blocks_per_seg);
-		return -ENOSPC;
-	}
-
-	SM_I(sbi)->additional_reserved_segments = wanted_reserved_segments;
-
-	f2fs_info(sbi, "IO align feature needs additional reserved segment: %u",
-			 wanted_reserved_segments);
-
-	return 0;
-}
-
 static inline void adjust_unusable_cap_perc(struct f2fs_sb_info *sbi)
 {
 	if (!F2FS_OPTION(sbi).unusable_cap_perc)
@@ -663,7 +628,7 @@ static int f2fs_set_lz4hc_level(struct f2fs_sb_info *sbi, const char *str)
 #ifdef CONFIG_F2FS_FS_ZSTD
 static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str)
 {
-	unsigned int level;
+	int level;
 	int len = 4;
 
 	if (strlen(str) == len) {
@@ -677,9 +642,15 @@ static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str)
 		f2fs_info(sbi, "wrong format, e.g. <alg_name>:<compr_level>");
 		return -EINVAL;
 	}
-	if (kstrtouint(str + 1, 10, &level))
+	if (kstrtoint(str + 1, 10, &level))
 		return -EINVAL;
 
+	/* f2fs does not support negative compress level now */
+	if (level < 0) {
+		f2fs_info(sbi, "do not support negative compress level: %d", level);
+		return -ERANGE;
+	}
+
 	if (!f2fs_is_compress_level_valid(COMPRESS_ZSTD, level)) {
 		f2fs_info(sbi, "invalid zstd compress level: %d", level);
 		return -EINVAL;
@@ -763,10 +734,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
 			clear_opt(sbi, DISCARD);
 			break;
 		case Opt_noheap:
-			set_opt(sbi, NOHEAP);
-			break;
 		case Opt_heap:
-			clear_opt(sbi, NOHEAP);
+			f2fs_warn(sbi, "heap/no_heap options were deprecated");
 			break;
 #ifdef CONFIG_F2FS_FS_XATTR
 		case Opt_user_xattr:
@@ -913,16 +882,6 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount)
 			}
 			kfree(name);
 			break;
-		case Opt_io_size_bits:
-			if (args->from && match_int(args, &arg))
-				return -EINVAL;
-			if (arg <= 0 || arg > __ilog2_u32(BIO_MAX_VECS)) {
-				f2fs_warn(sbi, "Not support %ld, larger than %d",
-					BIT(arg), BIO_MAX_VECS);
-				return -EINVAL;
-			}
-			F2FS_OPTION(sbi).write_io_size_bits = arg;
-			break;
 #ifdef CONFIG_F2FS_FAULT_INJECTION
 		case Opt_fault_injection:
 			if (args->from && match_int(args, &arg))
@@ -1392,12 +1351,6 @@ default_check:
 	}
 #endif
 
-	if (F2FS_IO_SIZE_BITS(sbi) && !f2fs_lfs_mode(sbi)) {
-		f2fs_err(sbi, "Should set mode=lfs with %luKB-sized IO",
-			 F2FS_IO_SIZE_KB(sbi));
-		return -EINVAL;
-	}
-
 	if (test_opt(sbi, INLINE_XATTR_SIZE)) {
 		int min_size, max_size;
 
@@ -1605,7 +1558,7 @@ static void destroy_device_list(struct f2fs_sb_info *sbi)
 
 	for (i = 0; i < sbi->s_ndevs; i++) {
 		if (i > 0)
-			bdev_release(FDEV(i).bdev_handle);
+			bdev_fput(FDEV(i).bdev_file);
 #ifdef CONFIG_BLK_DEV_ZONED
 		kvfree(FDEV(i).blkz_seq);
 #endif
@@ -1718,7 +1671,6 @@ static void f2fs_put_super(struct super_block *sb)
 
 	f2fs_destroy_page_array_cache(sbi);
 	f2fs_destroy_xattr_caches(sbi);
-	mempool_destroy(sbi->write_io_dummy);
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < MAXQUOTAS; i++)
 		kfree(F2FS_OPTION(sbi).s_qf_names[i]);
@@ -2009,10 +1961,6 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
 	} else {
 		seq_puts(seq, ",nodiscard");
 	}
-	if (test_opt(sbi, NOHEAP))
-		seq_puts(seq, ",no_heap");
-	else
-		seq_puts(seq, ",heap");
 #ifdef CONFIG_F2FS_FS_XATTR
 	if (test_opt(sbi, XATTR_USER))
 		seq_puts(seq, ",user_xattr");
@@ -2078,9 +2026,6 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
 					F2FS_OPTION(sbi).s_resuid),
 				from_kgid_munged(&init_user_ns,
 					F2FS_OPTION(sbi).s_resgid));
-	if (F2FS_IO_SIZE_BITS(sbi))
-		seq_printf(seq, ",io_bits=%u",
-				F2FS_OPTION(sbi).write_io_size_bits);
 #ifdef CONFIG_F2FS_FAULT_INJECTION
 	if (test_opt(sbi, FAULT_INJECTION)) {
 		seq_printf(seq, ",fault_injection=%u",
@@ -2192,7 +2137,6 @@ static void default_options(struct f2fs_sb_info *sbi, bool remount)
 	set_opt(sbi, INLINE_XATTR);
 	set_opt(sbi, INLINE_DATA);
 	set_opt(sbi, INLINE_DENTRY);
-	set_opt(sbi, NOHEAP);
 	set_opt(sbi, MERGE_CHECKPOINT);
 	F2FS_OPTION(sbi).unusable_cap = 0;
 	sbi->sb->s_flags |= SB_LAZYTIME;
@@ -2247,6 +2191,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
 			.init_gc_type = FG_GC,
 			.should_migrate_blocks = false,
 			.err_gc_skipped = true,
+			.no_bg_gc = true,
 			.nr_free_secs = 1 };
 
 		f2fs_down_write(&sbi->gc_lock);
@@ -2332,7 +2277,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
 	bool no_read_extent_cache = !test_opt(sbi, READ_EXTENT_CACHE);
 	bool no_age_extent_cache = !test_opt(sbi, AGE_EXTENT_CACHE);
 	bool enable_checkpoint = !test_opt(sbi, DISABLE_CHECKPOINT);
-	bool no_io_align = !F2FS_IO_ALIGNED(sbi);
 	bool no_atgc = !test_opt(sbi, ATGC);
 	bool no_discard = !test_opt(sbi, DISCARD);
 	bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE);
@@ -2440,12 +2384,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
 		goto restore_opts;
 	}
 
-	if (no_io_align == !!F2FS_IO_ALIGNED(sbi)) {
-		err = -EINVAL;
-		f2fs_warn(sbi, "switch io_bits option is not allowed");
-		goto restore_opts;
-	}
-
 	if (no_compress_cache == !!test_opt(sbi, COMPRESS_CACHE)) {
 		err = -EINVAL;
 		f2fs_warn(sbi, "switch compress_cache option is not allowed");
@@ -2768,7 +2706,7 @@ int f2fs_dquot_initialize(struct inode *inode)
 	return dquot_initialize(inode);
 }
 
-static struct dquot **f2fs_get_dquots(struct inode *inode)
+static struct dquot __rcu **f2fs_get_dquots(struct inode *inode)
 {
 	return F2FS_I(inode)->i_dquot;
 }
@@ -3706,7 +3644,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi)
 	}
 
 	main_segs = le32_to_cpu(raw_super->segment_count_main);
-	blocks_per_seg = sbi->blocks_per_seg;
+	blocks_per_seg = BLKS_PER_SEG(sbi);
 
 	for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
 		if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs ||
@@ -3818,9 +3756,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
 	sbi->segs_per_sec = le32_to_cpu(raw_super->segs_per_sec);
 	sbi->secs_per_zone = le32_to_cpu(raw_super->secs_per_zone);
 	sbi->total_sections = le32_to_cpu(raw_super->section_count);
-	sbi->total_node_count =
-		(le32_to_cpu(raw_super->segment_count_nat) / 2)
-			* sbi->blocks_per_seg * NAT_ENTRY_PER_BLOCK;
+	sbi->total_node_count = SEGS_TO_BLKS(sbi,
+			((le32_to_cpu(raw_super->segment_count_nat) / 2) *
+			NAT_ENTRY_PER_BLOCK));
 	F2FS_ROOT_INO(sbi) = le32_to_cpu(raw_super->root_ino);
 	F2FS_NODE_INO(sbi) = le32_to_cpu(raw_super->node_ino);
 	F2FS_META_INO(sbi) = le32_to_cpu(raw_super->meta_ino);
@@ -3829,7 +3767,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
 	sbi->next_victim_seg[BG_GC] = NULL_SEGNO;
 	sbi->next_victim_seg[FG_GC] = NULL_SEGNO;
 	sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH;
-	sbi->migration_granularity = sbi->segs_per_sec;
+	sbi->migration_granularity = SEGS_PER_SEC(sbi);
 	sbi->seq_file_ra_mul = MIN_RA_MUL;
 	sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE;
 	sbi->max_fragment_hole = DEF_FRAGMENT_SIZE;
@@ -3930,11 +3868,6 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
 		return 0;
 
 	zone_sectors = bdev_zone_sectors(bdev);
-	if (!is_power_of_2(zone_sectors)) {
-		f2fs_err(sbi, "F2FS does not support non power of 2 zone sizes\n");
-		return -EINVAL;
-	}
-
 	if (sbi->blocks_per_blkz && sbi->blocks_per_blkz !=
 				SECTOR_TO_BLOCK(zone_sectors))
 		return -EINVAL;
@@ -4090,7 +4023,9 @@ static void f2fs_record_stop_reason(struct f2fs_sb_info *sbi)
 
 	f2fs_up_write(&sbi->sb_lock);
 	if (err)
-		f2fs_err(sbi, "f2fs_commit_super fails to record err:%d", err);
+		f2fs_err_ratelimited(sbi,
+			"f2fs_commit_super fails to record stop_reason, err:%d",
+			err);
 }
 
 void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag)
@@ -4133,8 +4068,9 @@ static void f2fs_record_errors(struct f2fs_sb_info *sbi, unsigned char error)
 
 	err = f2fs_commit_super(sbi, false);
 	if (err)
-		f2fs_err(sbi, "f2fs_commit_super fails to record errors:%u, err:%d",
-								error, err);
+		f2fs_err_ratelimited(sbi,
+			"f2fs_commit_super fails to record errors:%u, err:%d",
+			error, err);
 out_unlock:
 	f2fs_up_write(&sbi->sb_lock);
 }
@@ -4247,7 +4183,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
 
 	for (i = 0; i < max_devices; i++) {
 		if (i == 0)
-			FDEV(0).bdev_handle = sbi->sb->s_bdev_handle;
+			FDEV(0).bdev_file = sbi->sb->s_bdev_file;
 		else if (!RDEV(i).path[0])
 			break;
 
@@ -4259,22 +4195,22 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
 			if (i == 0) {
 				FDEV(i).start_blk = 0;
 				FDEV(i).end_blk = FDEV(i).start_blk +
-				    (FDEV(i).total_segments <<
-				    sbi->log_blocks_per_seg) - 1 +
-				    le32_to_cpu(raw_super->segment0_blkaddr);
+					SEGS_TO_BLKS(sbi,
+					FDEV(i).total_segments) - 1 +
+					le32_to_cpu(raw_super->segment0_blkaddr);
 			} else {
 				FDEV(i).start_blk = FDEV(i - 1).end_blk + 1;
 				FDEV(i).end_blk = FDEV(i).start_blk +
-					(FDEV(i).total_segments <<
-					sbi->log_blocks_per_seg) - 1;
-				FDEV(i).bdev_handle = bdev_open_by_path(
+						SEGS_TO_BLKS(sbi,
+						FDEV(i).total_segments) - 1;
+				FDEV(i).bdev_file = bdev_file_open_by_path(
 					FDEV(i).path, mode, sbi->sb, NULL);
 			}
 		}
-		if (IS_ERR(FDEV(i).bdev_handle))
-			return PTR_ERR(FDEV(i).bdev_handle);
+		if (IS_ERR(FDEV(i).bdev_file))
+			return PTR_ERR(FDEV(i).bdev_file);
 
-		FDEV(i).bdev = FDEV(i).bdev_handle->bdev;
+		FDEV(i).bdev = file_bdev(FDEV(i).bdev_file);
 		/* to release errored devices */
 		sbi->s_ndevs = i + 1;
 
@@ -4305,8 +4241,6 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
 			  FDEV(i).total_segments,
 			  FDEV(i).start_blk, FDEV(i).end_blk);
 	}
-	f2fs_info(sbi,
-		  "IO Block Size: %8ld KB", F2FS_IO_SIZE_KB(sbi));
 	return 0;
 }
 
@@ -4496,7 +4430,7 @@ try_onemore:
 	sb->s_time_gran = 1;
 	sb->s_flags = (sb->s_flags & ~SB_POSIXACL) |
 		(test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0);
-	memcpy(&sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid));
+	super_set_uuid(sb, (void *) raw_super->uuid, sizeof(raw_super->uuid));
 	sb->s_iflags |= SB_I_CGROUPWB;
 
 	/* init f2fs-specific super block info */
@@ -4519,19 +4453,10 @@ try_onemore:
 	if (err)
 		goto free_iostat;
 
-	if (F2FS_IO_ALIGNED(sbi)) {
-		sbi->write_io_dummy =
-			mempool_create_page_pool(2 * (F2FS_IO_SIZE(sbi) - 1), 0);
-		if (!sbi->write_io_dummy) {
-			err = -ENOMEM;
-			goto free_percpu;
-		}
-	}
-
 	/* init per sbi slab cache */
 	err = f2fs_init_xattr_caches(sbi);
 	if (err)
-		goto free_io_dummy;
+		goto free_percpu;
 	err = f2fs_init_page_array_cache(sbi);
 	if (err)
 		goto free_xattr_cache;
@@ -4619,10 +4544,6 @@ try_onemore:
 		goto free_nm;
 	}
 
-	err = adjust_reserved_segment(sbi);
-	if (err)
-		goto free_nm;
-
 	/* For write statistics */
 	sbi->sectors_written_start = f2fs_get_sectors_written(sbi);
 
@@ -4660,6 +4581,7 @@ try_onemore:
 		goto free_node_inode;
 	}
 
+	generic_set_sb_d_ops(sb);
 	sb->s_root = d_make_root(root); /* allocate root dentry */
 	if (!sb->s_root) {
 		err = -ENOMEM;
@@ -4748,13 +4670,20 @@ reset_checkpoint:
 	 * If the f2fs is not readonly and fsync data recovery succeeds,
 	 * check zoned block devices' write pointer consistency.
 	 */
-	if (!err && !f2fs_readonly(sb) && f2fs_sb_has_blkzoned(sbi)) {
-		err = f2fs_check_write_pointer(sbi);
-		if (err)
-			goto free_meta;
+	if (f2fs_sb_has_blkzoned(sbi) && !f2fs_readonly(sb)) {
+		int err2;
+
+		f2fs_notice(sbi, "Checking entire write pointers");
+		err2 = f2fs_check_write_pointer(sbi);
+		if (err2)
+			err = err2;
 	}
+	if (err)
+		goto free_meta;
 
-	f2fs_init_inmem_curseg(sbi);
+	err = f2fs_init_inmem_curseg(sbi);
+	if (err)
+		goto sync_free_meta;
 
 	/* f2fs_recover_fsync_data() cleared this already */
 	clear_sbi_flag(sbi, SBI_POR_DOING);
@@ -4853,8 +4782,6 @@ free_page_array_cache:
 	f2fs_destroy_page_array_cache(sbi);
 free_xattr_cache:
 	f2fs_destroy_xattr_caches(sbi);
-free_io_dummy:
-	mempool_destroy(sbi->write_io_dummy);
 free_percpu:
 	destroy_percpu_info(sbi);
 free_iostat:
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index a7ec55c7bb20..a568ce96cf56 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -493,8 +493,8 @@ out:
 		spin_lock(&sbi->stat_lock);
 		if (t > (unsigned long)(sbi->user_block_count -
 				F2FS_OPTION(sbi).root_reserved_blocks -
-				sbi->blocks_per_seg *
-				SM_I(sbi)->additional_reserved_segments)) {
+				SEGS_TO_BLKS(sbi,
+				SM_I(sbi)->additional_reserved_segments))) {
 			spin_unlock(&sbi->stat_lock);
 			return -EINVAL;
 		}
@@ -551,7 +551,7 @@ out:
 	}
 
 	if (!strcmp(a->attr.name, "migration_granularity")) {
-		if (t == 0 || t > sbi->segs_per_sec)
+		if (t == 0 || t > SEGS_PER_SEC(sbi))
 			return -EINVAL;
 	}
 
@@ -1492,6 +1492,50 @@ static int __maybe_unused discard_plist_seq_show(struct seq_file *seq,
 	return 0;
 }
 
+static int __maybe_unused disk_map_seq_show(struct seq_file *seq,
+						void *offset)
+{
+	struct super_block *sb = seq->private;
+	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+	int i;
+
+	seq_printf(seq, "Address Layout   : %5luB Block address (# of Segments)\n",
+					F2FS_BLKSIZE);
+	seq_printf(seq, " SB            : %12s\n", "0/1024B");
+	seq_printf(seq, " seg0_blkaddr  : 0x%010x\n", SEG0_BLKADDR(sbi));
+	seq_printf(seq, " Checkpoint    : 0x%010x (%10d)\n",
+			le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr), 2);
+	seq_printf(seq, " SIT           : 0x%010x (%10d)\n",
+			SIT_I(sbi)->sit_base_addr,
+			le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_sit));
+	seq_printf(seq, " NAT           : 0x%010x (%10d)\n",
+			NM_I(sbi)->nat_blkaddr,
+			le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_nat));
+	seq_printf(seq, " SSA           : 0x%010x (%10d)\n",
+			SM_I(sbi)->ssa_blkaddr,
+			le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_ssa));
+	seq_printf(seq, " Main          : 0x%010x (%10d)\n",
+			SM_I(sbi)->main_blkaddr,
+			le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_main));
+	seq_printf(seq, " # of Sections : %12d\n",
+			le32_to_cpu(F2FS_RAW_SUPER(sbi)->section_count));
+	seq_printf(seq, " Segs/Sections : %12d\n",
+			SEGS_PER_SEC(sbi));
+	seq_printf(seq, " Section size  : %12d MB\n",
+			SEGS_PER_SEC(sbi) << 1);
+
+	if (!f2fs_is_multi_device(sbi))
+		return 0;
+
+	seq_puts(seq, "\nDisk Map for multi devices:\n");
+	for (i = 0; i < sbi->s_ndevs; i++)
+		seq_printf(seq, "Disk:%2d (zoned=%d): 0x%010x - 0x%010x on %s\n",
+			i, bdev_is_zoned(FDEV(i).bdev),
+			FDEV(i).start_blk, FDEV(i).end_blk,
+			FDEV(i).path);
+	return 0;
+}
+
 int __init f2fs_init_sysfs(void)
 {
 	int ret;
@@ -1573,6 +1617,8 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi)
 				victim_bits_seq_show, sb);
 	proc_create_single_data("discard_plist_info", 0444, sbi->s_proc,
 				discard_plist_seq_show, sb);
+	proc_create_single_data("disk_map", 0444, sbi->s_proc,
+				disk_map_seq_show, sb);
 	return 0;
 put_feature_list_kobj:
 	kobject_put(&sbi->s_feature_list_kobj);
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index 4fc95f353a7a..f7bb0c54502c 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -258,21 +258,23 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode,
 					       pgoff_t index,
 					       unsigned long num_ra_pages)
 {
-	struct page *page;
+	struct folio *folio;
 
 	index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT;
 
-	page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
-	if (!page || !PageUptodate(page)) {
+	folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0);
+	if (IS_ERR(folio) || !folio_test_uptodate(folio)) {
 		DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index);
 
-		if (page)
-			put_page(page);
+		if (!IS_ERR(folio))
+			folio_put(folio);
 		else if (num_ra_pages > 1)
 			page_cache_ra_unbounded(&ractl, num_ra_pages, 0);
-		page = read_mapping_page(inode->i_mapping, index, NULL);
+		folio = read_mapping_folio(inode->i_mapping, index, NULL);
+		if (IS_ERR(folio))
+			return ERR_CAST(folio);
 	}
-	return page;
+	return folio_file_page(folio, index);
 }
 
 static int f2fs_write_merkle_tree_block(struct inode *inode, const void *buf,
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 738e427e2d21..2af424e200b3 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -47,7 +47,7 @@ int __init fat_cache_init(void)
 {
 	fat_cache_cachep = kmem_cache_create("fat_cache",
 				sizeof(struct fat_cache),
-				0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+				0, SLAB_RECLAIM_ACCOUNT,
 				init_once);
 	if (fat_cache_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 1fac3dabf130..d9e6fbb6f246 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -787,7 +787,7 @@ static int __init fat_init_inodecache(void)
 	fat_inode_cachep = kmem_cache_create("fat_inode_cache",
 					     sizeof(struct msdos_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+						SLAB_ACCOUNT),
 					     init_once);
 	if (fat_inode_cachep == NULL)
 		return -ENOMEM;
@@ -1762,6 +1762,9 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
 	else /* fat 16 or 12 */
 		sbi->vol_id = bpb.fat16_vol_id;
 
+	__le32 vol_id_le = cpu_to_le32(sbi->vol_id);
+	super_set_uuid(sb, (void *) &vol_id_le, sizeof(vol_id_le));
+
 	sbi->dir_per_block = sb->s_blocksize / sizeof(struct msdos_dir_entry);
 	sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1;
 
diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c
index c52e63e10d35..509eea96a457 100644
--- a/fs/fat/nfs.c
+++ b/fs/fat/nfs.c
@@ -130,6 +130,12 @@ fat_encode_fh_nostale(struct inode *inode, __u32 *fh, int *lenp,
 		fid->parent_i_gen = parent->i_generation;
 		type = FILEID_FAT_WITH_PARENT;
 		*lenp = FAT_FID_SIZE_WITH_PARENT;
+	} else {
+		/*
+		 * We need to initialize this field because the fh is actually
+		 * 12 bytes long
+		 */
+		fid->parent_i_pos_hi = 0;
 	}
 
 	return type;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index c80a6acad742..54cc85d3338e 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -27,6 +27,7 @@
 #include <linux/memfd.h>
 #include <linux/compat.h>
 #include <linux/mount.h>
+#include <linux/rw_hint.h>
 
 #include <linux/poll.h>
 #include <asm/siginfo.h>
@@ -268,8 +269,15 @@ static int f_getowner_uids(struct file *filp, unsigned long arg)
 }
 #endif
 
-static bool rw_hint_valid(enum rw_hint hint)
+static bool rw_hint_valid(u64 hint)
 {
+	BUILD_BUG_ON(WRITE_LIFE_NOT_SET != RWH_WRITE_LIFE_NOT_SET);
+	BUILD_BUG_ON(WRITE_LIFE_NONE != RWH_WRITE_LIFE_NONE);
+	BUILD_BUG_ON(WRITE_LIFE_SHORT != RWH_WRITE_LIFE_SHORT);
+	BUILD_BUG_ON(WRITE_LIFE_MEDIUM != RWH_WRITE_LIFE_MEDIUM);
+	BUILD_BUG_ON(WRITE_LIFE_LONG != RWH_WRITE_LIFE_LONG);
+	BUILD_BUG_ON(WRITE_LIFE_EXTREME != RWH_WRITE_LIFE_EXTREME);
+
 	switch (hint) {
 	case RWH_WRITE_LIFE_NOT_SET:
 	case RWH_WRITE_LIFE_NONE:
@@ -283,34 +291,40 @@ static bool rw_hint_valid(enum rw_hint hint)
 	}
 }
 
-static long fcntl_rw_hint(struct file *file, unsigned int cmd,
-			  unsigned long arg)
+static long fcntl_get_rw_hint(struct file *file, unsigned int cmd,
+			      unsigned long arg)
 {
 	struct inode *inode = file_inode(file);
 	u64 __user *argp = (u64 __user *)arg;
-	enum rw_hint hint;
-	u64 h;
+	u64 hint = READ_ONCE(inode->i_write_hint);
 
-	switch (cmd) {
-	case F_GET_RW_HINT:
-		h = inode->i_write_hint;
-		if (copy_to_user(argp, &h, sizeof(*argp)))
-			return -EFAULT;
-		return 0;
-	case F_SET_RW_HINT:
-		if (copy_from_user(&h, argp, sizeof(h)))
-			return -EFAULT;
-		hint = (enum rw_hint) h;
-		if (!rw_hint_valid(hint))
-			return -EINVAL;
+	if (copy_to_user(argp, &hint, sizeof(*argp)))
+		return -EFAULT;
+	return 0;
+}
 
-		inode_lock(inode);
-		inode->i_write_hint = hint;
-		inode_unlock(inode);
-		return 0;
-	default:
+static long fcntl_set_rw_hint(struct file *file, unsigned int cmd,
+			      unsigned long arg)
+{
+	struct inode *inode = file_inode(file);
+	u64 __user *argp = (u64 __user *)arg;
+	u64 hint;
+
+	if (copy_from_user(&hint, argp, sizeof(hint)))
+		return -EFAULT;
+	if (!rw_hint_valid(hint))
 		return -EINVAL;
-	}
+
+	WRITE_ONCE(inode->i_write_hint, hint);
+
+	/*
+	 * file->f_mapping->host may differ from inode. As an example,
+	 * blkdev_open() modifies file->f_mapping.
+	 */
+	if (file->f_mapping->host != inode)
+		WRITE_ONCE(file->f_mapping->host->i_write_hint, hint);
+
+	return 0;
 }
 
 static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
@@ -416,8 +430,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 		err = memfd_fcntl(filp, cmd, argi);
 		break;
 	case F_GET_RW_HINT:
+		err = fcntl_get_rw_hint(filp, cmd, arg);
+		break;
 	case F_SET_RW_HINT:
-		err = fcntl_rw_hint(filp, cmd, arg);
+		err = fcntl_set_rw_hint(filp, cmd, arg);
 		break;
 	default:
 		break;
@@ -846,12 +862,6 @@ int send_sigurg(struct fown_struct *fown)
 static DEFINE_SPINLOCK(fasync_lock);
 static struct kmem_cache *fasync_cache __ro_after_init;
 
-static void fasync_free_rcu(struct rcu_head *head)
-{
-	kmem_cache_free(fasync_cache,
-			container_of(head, struct fasync_struct, fa_rcu));
-}
-
 /*
  * Remove a fasync entry. If successfully removed, return
  * positive and clear the FASYNC flag. If no entry exists,
@@ -877,7 +887,7 @@ int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp)
 		write_unlock_irq(&fa->fa_lock);
 
 		*fp = fa->fa_next;
-		call_rcu(&fa->fa_rcu, fasync_free_rcu);
+		kfree_rcu(fa, fa_rcu);
 		filp->f_flags &= ~FASYNC;
 		result = 1;
 		break;
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 18b3ba8dc8ea..57a12614addf 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -36,7 +36,7 @@ static long do_sys_name_to_handle(const struct path *path,
 	if (f_handle.handle_bytes > MAX_HANDLE_SZ)
 		return -EINVAL;
 
-	handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
+	handle = kzalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
 			 GFP_KERNEL);
 	if (!handle)
 		return -ENOMEM;
diff --git a/fs/file_table.c b/fs/file_table.c
index b991f90571b4..4f03beed4737 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -26,7 +26,6 @@
 #include <linux/percpu_counter.h>
 #include <linux/percpu.h>
 #include <linux/task_work.h>
-#include <linux/ima.h>
 #include <linux/swap.h>
 #include <linux/kmemleak.h>
 
@@ -276,21 +275,15 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred)
 }
 
 /**
- * alloc_file - allocate and initialize a 'struct file'
+ * file_init_path - initialize a 'struct file' based on path
  *
+ * @file: the file to set up
  * @path: the (dentry, vfsmount) pair for the new file
- * @flags: O_... flags with which the new file will be opened
  * @fop: the 'struct file_operations' for the new file
  */
-static struct file *alloc_file(const struct path *path, int flags,
-		const struct file_operations *fop)
+static void file_init_path(struct file *file, const struct path *path,
+			   const struct file_operations *fop)
 {
-	struct file *file;
-
-	file = alloc_empty_file(flags, current_cred());
-	if (IS_ERR(file))
-		return file;
-
 	file->f_path = *path;
 	file->f_inode = path->dentry->d_inode;
 	file->f_mapping = path->dentry->d_inode->i_mapping;
@@ -309,22 +302,51 @@ static struct file *alloc_file(const struct path *path, int flags,
 	file->f_op = fop;
 	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
 		i_readcount_inc(path->dentry->d_inode);
+}
+
+/**
+ * alloc_file - allocate and initialize a 'struct file'
+ *
+ * @path: the (dentry, vfsmount) pair for the new file
+ * @flags: O_... flags with which the new file will be opened
+ * @fop: the 'struct file_operations' for the new file
+ */
+static struct file *alloc_file(const struct path *path, int flags,
+		const struct file_operations *fop)
+{
+	struct file *file;
+
+	file = alloc_empty_file(flags, current_cred());
+	if (!IS_ERR(file))
+		file_init_path(file, path, fop);
 	return file;
 }
 
-struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
-				const char *name, int flags,
-				const struct file_operations *fops)
+static inline int alloc_path_pseudo(const char *name, struct inode *inode,
+				    struct vfsmount *mnt, struct path *path)
 {
 	struct qstr this = QSTR_INIT(name, strlen(name));
+
+	path->dentry = d_alloc_pseudo(mnt->mnt_sb, &this);
+	if (!path->dentry)
+		return -ENOMEM;
+	path->mnt = mntget(mnt);
+	d_instantiate(path->dentry, inode);
+	return 0;
+}
+
+struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
+			       const char *name, int flags,
+			       const struct file_operations *fops)
+{
+	int ret;
 	struct path path;
 	struct file *file;
 
-	path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this);
-	if (!path.dentry)
-		return ERR_PTR(-ENOMEM);
-	path.mnt = mntget(mnt);
-	d_instantiate(path.dentry, inode);
+	ret = alloc_path_pseudo(name, inode, mnt, &path);
+	if (ret)
+		return ERR_PTR(ret);
+
 	file = alloc_file(&path, flags, fops);
 	if (IS_ERR(file)) {
 		ihold(inode);
@@ -334,6 +356,30 @@ struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
 }
 EXPORT_SYMBOL(alloc_file_pseudo);
 
+struct file *alloc_file_pseudo_noaccount(struct inode *inode,
+					 struct vfsmount *mnt, const char *name,
+					 int flags,
+					 const struct file_operations *fops)
+{
+	int ret;
+	struct path path;
+	struct file *file;
+
+	ret = alloc_path_pseudo(name, inode, mnt, &path);
+	if (ret)
+		return ERR_PTR(ret);
+
+	file = alloc_empty_file_noaccount(flags, current_cred());
+	if (IS_ERR(file)) {
+		ihold(inode);
+		path_put(&path);
+		return file;
+	}
+	file_init_path(file, &path, fops);
+	return file;
+}
+EXPORT_SYMBOL_GPL(alloc_file_pseudo_noaccount);
+
 struct file *alloc_file_clone(struct file *base, int flags,
 				const struct file_operations *fops)
 {
@@ -367,7 +413,7 @@ static void __fput(struct file *file)
 	eventpoll_release(file);
 	locks_remove_file(file);
 
-	ima_file_free(file);
+	security_file_release(file);
 	if (unlikely(file->f_flags & FASYNC)) {
 		if (file->f_op->fasync)
 			file->f_op->fasync(-1, file, 0);
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index e6e2a2185e7c..42e03b6b1cc7 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -307,7 +307,7 @@ vxfs_init(void)
 
 	vxfs_inode_cachep = kmem_cache_create_usercopy("vxfs_inode",
 			sizeof(struct vxfs_inode_info), 0,
-			SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+			SLAB_RECLAIM_ACCOUNT,
 			offsetof(struct vxfs_inode_info, vii_immed.vi_immed),
 			sizeof_field(struct vxfs_inode_info,
 				vii_immed.vi_immed),
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 3d84fcc471c6..e4f17c53ddfc 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -141,6 +141,31 @@ static void wb_wakeup(struct bdi_writeback *wb)
 	spin_unlock_irq(&wb->work_lock);
 }
 
+/*
+ * This function is used when the first inode for this wb is marked dirty. It
+ * wakes-up the corresponding bdi thread which should then take care of the
+ * periodic background write-out of dirty inodes. Since the write-out would
+ * starts only 'dirty_writeback_interval' centisecs from now anyway, we just
+ * set up a timer which wakes the bdi thread up later.
+ *
+ * Note, we wouldn't bother setting up the timer, but this function is on the
+ * fast-path (used by '__mark_inode_dirty()'), so we save few context switches
+ * by delaying the wake-up.
+ *
+ * We have to be careful not to postpone flush work if it is scheduled for
+ * earlier. Thus we use queue_delayed_work().
+ */
+static void wb_wakeup_delayed(struct bdi_writeback *wb)
+{
+	unsigned long timeout;
+
+	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
+	spin_lock_irq(&wb->work_lock);
+	if (test_bit(WB_registered, &wb->state))
+		queue_delayed_work(bdi_wq, &wb->dwork, timeout);
+	spin_unlock_irq(&wb->work_lock);
+}
+
 static void finish_writeback_work(struct bdi_writeback *wb,
 				  struct wb_writeback_work *work)
 {
diff --git a/fs/fs_parser.c b/fs/fs_parser.c
index edb3712dcfa5..a4d6ca0b8971 100644
--- a/fs/fs_parser.c
+++ b/fs/fs_parser.c
@@ -83,8 +83,8 @@ static const struct fs_parameter_spec *fs_lookup_key(
 }
 
 /*
- * fs_parse - Parse a filesystem configuration parameter
- * @fc: The filesystem context to log errors through.
+ * __fs_parse - Parse a filesystem configuration parameter
+ * @log: The filesystem context to log errors through.
  * @desc: The parameter description to use.
  * @param: The parameter.
  * @result: Where to place the result of the parse
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index 038ed0b9aaa5..8674dbfbe59d 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -52,3 +52,14 @@ config FUSE_DAX
 
 	  If you want to allow mounting a Virtio Filesystem with the "dax"
 	  option, answer Y.
+
+config FUSE_PASSTHROUGH
+	bool "FUSE passthrough operations support"
+	default y
+	depends on FUSE_FS
+	select FS_STACK
+	help
+	  This allows bypassing FUSE server by mapping specific FUSE operations
+	  to be performed directly on a backing file.
+
+	  If you want to allow passthrough operations, answer Y.
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 0c48b35c058d..6e0228c6d0cb 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -8,6 +8,8 @@ obj-$(CONFIG_CUSE) += cuse.o
 obj-$(CONFIG_VIRTIO_FS) += virtiofs.o
 
 fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o
+fuse-y += iomode.o
 fuse-$(CONFIG_FUSE_DAX) += dax.o
+fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o
 
 virtiofs-y := virtio_fs.o
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index 284a35006462..97ac994ff78f 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -174,11 +174,7 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
 	if (!fc)
 		goto out;
 
-	down_read(&fc->killsb);
-	spin_lock(&fc->bg_lock);
-	fc->congestion_threshold = val;
-	spin_unlock(&fc->bg_lock);
-	up_read(&fc->killsb);
+	WRITE_ONCE(fc->congestion_threshold, val);
 	fuse_conn_put(fc);
 out:
 	return ret;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 1a8f82f478cb..3ec8bb5e68ff 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1775,6 +1775,61 @@ copy_finish:
 	return err;
 }
 
+/*
+ * Resending all processing queue requests.
+ *
+ * During a FUSE daemon panics and failover, it is possible for some inflight
+ * requests to be lost and never returned. As a result, applications awaiting
+ * replies would become stuck forever. To address this, we can use notification
+ * to trigger resending of these pending requests to the FUSE daemon, ensuring
+ * they are properly processed again.
+ *
+ * Please note that this strategy is applicable only to idempotent requests or
+ * if the FUSE daemon takes careful measures to avoid processing duplicated
+ * non-idempotent requests.
+ */
+static void fuse_resend(struct fuse_conn *fc)
+{
+	struct fuse_dev *fud;
+	struct fuse_req *req, *next;
+	struct fuse_iqueue *fiq = &fc->iq;
+	LIST_HEAD(to_queue);
+	unsigned int i;
+
+	spin_lock(&fc->lock);
+	if (!fc->connected) {
+		spin_unlock(&fc->lock);
+		return;
+	}
+
+	list_for_each_entry(fud, &fc->devices, entry) {
+		struct fuse_pqueue *fpq = &fud->pq;
+
+		spin_lock(&fpq->lock);
+		for (i = 0; i < FUSE_PQ_HASH_SIZE; i++)
+			list_splice_tail_init(&fpq->processing[i], &to_queue);
+		spin_unlock(&fpq->lock);
+	}
+	spin_unlock(&fc->lock);
+
+	list_for_each_entry_safe(req, next, &to_queue, list) {
+		__set_bit(FR_PENDING, &req->flags);
+		/* mark the request as resend request */
+		req->in.h.unique |= FUSE_UNIQUE_RESEND;
+	}
+
+	spin_lock(&fiq->lock);
+	/* iq and pq requests are both oldest to newest */
+	list_splice(&to_queue, &fiq->pending);
+	fiq->ops->wake_pending_and_unlock(fiq);
+}
+
+static int fuse_notify_resend(struct fuse_conn *fc)
+{
+	fuse_resend(fc);
+	return 0;
+}
+
 static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 		       unsigned int size, struct fuse_copy_state *cs)
 {
@@ -1800,6 +1855,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 	case FUSE_NOTIFY_DELETE:
 		return fuse_notify_delete(fc, size, cs);
 
+	case FUSE_NOTIFY_RESEND:
+		return fuse_notify_resend(fc);
+
 	default:
 		fuse_copy_finish(cs);
 		return -EINVAL;
@@ -2251,43 +2309,91 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new)
 	return 0;
 }
 
-static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
-			   unsigned long arg)
+static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp)
 {
 	int res;
 	int oldfd;
 	struct fuse_dev *fud = NULL;
 	struct fd f;
 
+	if (get_user(oldfd, argp))
+		return -EFAULT;
+
+	f = fdget(oldfd);
+	if (!f.file)
+		return -EINVAL;
+
+	/*
+	 * Check against file->f_op because CUSE
+	 * uses the same ioctl handler.
+	 */
+	if (f.file->f_op == file->f_op)
+		fud = fuse_get_dev(f.file);
+
+	res = -EINVAL;
+	if (fud) {
+		mutex_lock(&fuse_mutex);
+		res = fuse_device_clone(fud->fc, file);
+		mutex_unlock(&fuse_mutex);
+	}
+
+	fdput(f);
+	return res;
+}
+
+static long fuse_dev_ioctl_backing_open(struct file *file,
+					struct fuse_backing_map __user *argp)
+{
+	struct fuse_dev *fud = fuse_get_dev(file);
+	struct fuse_backing_map map;
+
+	if (!fud)
+		return -EPERM;
+
+	if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&map, argp, sizeof(map)))
+		return -EFAULT;
+
+	return fuse_backing_open(fud->fc, &map);
+}
+
+static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp)
+{
+	struct fuse_dev *fud = fuse_get_dev(file);
+	int backing_id;
+
+	if (!fud)
+		return -EPERM;
+
+	if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
+		return -EOPNOTSUPP;
+
+	if (get_user(backing_id, argp))
+		return -EFAULT;
+
+	return fuse_backing_close(fud->fc, backing_id);
+}
+
+static long fuse_dev_ioctl(struct file *file, unsigned int cmd,
+			   unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+
 	switch (cmd) {
 	case FUSE_DEV_IOC_CLONE:
-		if (get_user(oldfd, (__u32 __user *)arg))
-			return -EFAULT;
+		return fuse_dev_ioctl_clone(file, argp);
 
-		f = fdget(oldfd);
-		if (!f.file)
-			return -EINVAL;
+	case FUSE_DEV_IOC_BACKING_OPEN:
+		return fuse_dev_ioctl_backing_open(file, argp);
+
+	case FUSE_DEV_IOC_BACKING_CLOSE:
+		return fuse_dev_ioctl_backing_close(file, argp);
 
-		/*
-		 * Check against file->f_op because CUSE
-		 * uses the same ioctl handler.
-		 */
-		if (f.file->f_op == file->f_op)
-			fud = fuse_get_dev(f.file);
-
-		res = -EINVAL;
-		if (fud) {
-			mutex_lock(&fuse_mutex);
-			res = fuse_device_clone(fud->fc, file);
-			mutex_unlock(&fuse_mutex);
-		}
-		fdput(f);
-		break;
 	default:
-		res = -ENOTTY;
-		break;
+		return -ENOTTY;
 	}
-	return res;
 }
 
 const struct file_operations fuse_dev_operations = {
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index d19cbf34c634..4a6df591add6 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -391,6 +391,10 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name
 	err = -EIO;
 	if (fuse_invalid_attr(&outarg->attr))
 		goto out_put_forget;
+	if (outarg->nodeid == FUSE_ROOT_ID && outarg->generation != 0) {
+		pr_warn_once("root generation should be zero\n");
+		outarg->generation = 0;
+	}
 
 	*inode = fuse_iget(sb, outarg->nodeid, outarg->generation,
 			   &outarg->attr, ATTR_TIMEOUT(outarg),
@@ -615,7 +619,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
 	FUSE_ARGS(args);
 	struct fuse_forget_link *forget;
 	struct fuse_create_in inarg;
-	struct fuse_open_out outopen;
+	struct fuse_open_out *outopenp;
 	struct fuse_entry_out outentry;
 	struct fuse_inode *fi;
 	struct fuse_file *ff;
@@ -630,7 +634,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
 		goto out_err;
 
 	err = -ENOMEM;
-	ff = fuse_file_alloc(fm);
+	ff = fuse_file_alloc(fm, true);
 	if (!ff)
 		goto out_put_forget_req;
 
@@ -659,8 +663,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
 	args.out_numargs = 2;
 	args.out_args[0].size = sizeof(outentry);
 	args.out_args[0].value = &outentry;
-	args.out_args[1].size = sizeof(outopen);
-	args.out_args[1].value = &outopen;
+	/* Store outarg for fuse_finish_open() */
+	outopenp = &ff->args->open_outarg;
+	args.out_args[1].size = sizeof(*outopenp);
+	args.out_args[1].value = outopenp;
 
 	err = get_create_ext(&args, dir, entry, mode);
 	if (err)
@@ -676,9 +682,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
 	    fuse_invalid_attr(&outentry.attr))
 		goto out_free_ff;
 
-	ff->fh = outopen.fh;
+	ff->fh = outopenp->fh;
 	ff->nodeid = outentry.nodeid;
-	ff->open_flags = outopen.open_flags;
+	ff->open_flags = outopenp->open_flags;
 	inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
 			  &outentry.attr, ATTR_TIMEOUT(&outentry), 0);
 	if (!inode) {
@@ -692,13 +698,15 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
 	d_instantiate(entry, inode);
 	fuse_change_entry_timeout(entry, &outentry);
 	fuse_dir_changed(dir);
-	err = finish_open(file, entry, generic_file_open);
+	err = generic_file_open(inode, file);
+	if (!err) {
+		file->private_data = ff;
+		err = finish_open(file, entry, fuse_finish_open);
+	}
 	if (err) {
 		fi = get_fuse_inode(inode);
 		fuse_sync_release(fi, ff, flags);
 	} else {
-		file->private_data = ff;
-		fuse_finish_open(inode, file);
 		if (fm->fc->atomic_o_trunc && trunc)
 			truncate_pagecache(inode, 0);
 		else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
@@ -1210,7 +1218,7 @@ static int fuse_do_statx(struct inode *inode, struct file *file,
 	if (((sx->mask & STATX_SIZE) && !fuse_valid_size(sx->size)) ||
 	    ((sx->mask & STATX_TYPE) && (!fuse_valid_type(sx->mode) ||
 					 inode_wrong_type(inode, sx->mode)))) {
-		make_bad_inode(inode);
+		fuse_make_bad(inode);
 		return -EIO;
 	}
 
@@ -1485,7 +1493,7 @@ static int fuse_perm_getattr(struct inode *inode, int mask)
  *
  * 1) Local access checking ('default_permissions' mount option) based
  * on file mode.  This is the plain old disk filesystem permission
- * modell.
+ * model.
  *
  * 2) "Remote" access checking, where server is responsible for
  * checking permission in each inode operation.  An exception to this
@@ -1630,7 +1638,30 @@ out_err:
 
 static int fuse_dir_open(struct inode *inode, struct file *file)
 {
-	return fuse_open_common(inode, file, true);
+	struct fuse_mount *fm = get_fuse_mount(inode);
+	int err;
+
+	if (fuse_is_bad(inode))
+		return -EIO;
+
+	err = generic_file_open(inode, file);
+	if (err)
+		return err;
+
+	err = fuse_do_open(fm, get_node_id(inode), file, true);
+	if (!err) {
+		struct fuse_file *ff = file->private_data;
+
+		/*
+		 * Keep handling FOPEN_STREAM and FOPEN_NONSEEKABLE for
+		 * directories for backward compatibility, though it's unlikely
+		 * to be useful.
+		 */
+		if (ff->open_flags & (FOPEN_STREAM | FOPEN_NONSEEKABLE))
+			nonseekable_open(inode, file);
+	}
+
+	return err;
 }
 
 static int fuse_dir_release(struct inode *inode, struct file *file)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 148a71b8b4d0..a56e7bffd000 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -20,6 +20,7 @@
 #include <linux/fs.h>
 #include <linux/filelock.h>
 #include <linux/splice.h>
+#include <linux/task_io_accounting_ops.h>
 
 static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
 			  unsigned int open_flags, int opcode,
@@ -50,13 +51,7 @@ static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
 	return fuse_simple_request(fm, &args);
 }
 
-struct fuse_release_args {
-	struct fuse_args args;
-	struct fuse_release_in inarg;
-	struct inode *inode;
-};
-
-struct fuse_file *fuse_file_alloc(struct fuse_mount *fm)
+struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release)
 {
 	struct fuse_file *ff;
 
@@ -65,15 +60,15 @@ struct fuse_file *fuse_file_alloc(struct fuse_mount *fm)
 		return NULL;
 
 	ff->fm = fm;
-	ff->release_args = kzalloc(sizeof(*ff->release_args),
-				   GFP_KERNEL_ACCOUNT);
-	if (!ff->release_args) {
-		kfree(ff);
-		return NULL;
+	if (release) {
+		ff->args = kzalloc(sizeof(*ff->args), GFP_KERNEL_ACCOUNT);
+		if (!ff->args) {
+			kfree(ff);
+			return NULL;
+		}
 	}
 
 	INIT_LIST_HEAD(&ff->write_entry);
-	mutex_init(&ff->readdir.lock);
 	refcount_set(&ff->count, 1);
 	RB_CLEAR_NODE(&ff->polled_node);
 	init_waitqueue_head(&ff->poll_wait);
@@ -85,8 +80,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_mount *fm)
 
 void fuse_file_free(struct fuse_file *ff)
 {
-	kfree(ff->release_args);
-	mutex_destroy(&ff->readdir.lock);
+	kfree(ff->args);
 	kfree(ff);
 }
 
@@ -105,14 +99,17 @@ static void fuse_release_end(struct fuse_mount *fm, struct fuse_args *args,
 	kfree(ra);
 }
 
-static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
+static void fuse_file_put(struct fuse_file *ff, bool sync)
 {
 	if (refcount_dec_and_test(&ff->count)) {
-		struct fuse_args *args = &ff->release_args->args;
+		struct fuse_release_args *ra = &ff->args->release_args;
+		struct fuse_args *args = (ra ? &ra->args : NULL);
 
-		if (isdir ? ff->fm->fc->no_opendir : ff->fm->fc->no_open) {
-			/* Do nothing when client does not implement 'open' */
-			fuse_release_end(ff->fm, args, 0);
+		if (ra && ra->inode)
+			fuse_file_io_release(ff, ra->inode);
+
+		if (!args) {
+			/* Do nothing when server does not implement 'open' */
 		} else if (sync) {
 			fuse_simple_request(ff->fm, args);
 			fuse_release_end(ff->fm, args, 0);
@@ -132,27 +129,31 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
 	struct fuse_conn *fc = fm->fc;
 	struct fuse_file *ff;
 	int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
+	bool open = isdir ? !fc->no_opendir : !fc->no_open;
 
-	ff = fuse_file_alloc(fm);
+	ff = fuse_file_alloc(fm, open);
 	if (!ff)
 		return ERR_PTR(-ENOMEM);
 
 	ff->fh = 0;
 	/* Default for no-open */
 	ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0);
-	if (isdir ? !fc->no_opendir : !fc->no_open) {
-		struct fuse_open_out outarg;
+	if (open) {
+		/* Store outarg for fuse_finish_open() */
+		struct fuse_open_out *outargp = &ff->args->open_outarg;
 		int err;
 
-		err = fuse_send_open(fm, nodeid, open_flags, opcode, &outarg);
+		err = fuse_send_open(fm, nodeid, open_flags, opcode, outargp);
 		if (!err) {
-			ff->fh = outarg.fh;
-			ff->open_flags = outarg.open_flags;
-
+			ff->fh = outargp->fh;
+			ff->open_flags = outargp->open_flags;
 		} else if (err != -ENOSYS) {
 			fuse_file_free(ff);
 			return ERR_PTR(err);
 		} else {
+			/* No release needed */
+			kfree(ff->args);
+			ff->args = NULL;
 			if (isdir)
 				fc->no_opendir = 1;
 			else
@@ -195,40 +196,50 @@ static void fuse_link_write_file(struct file *file)
 	spin_unlock(&fi->lock);
 }
 
-void fuse_finish_open(struct inode *inode, struct file *file)
+int fuse_finish_open(struct inode *inode, struct file *file)
 {
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = get_fuse_conn(inode);
+	int err;
+
+	err = fuse_file_io_open(file, inode);
+	if (err)
+		return err;
 
 	if (ff->open_flags & FOPEN_STREAM)
 		stream_open(inode, file);
 	else if (ff->open_flags & FOPEN_NONSEEKABLE)
 		nonseekable_open(inode, file);
 
-	if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
-		struct fuse_inode *fi = get_fuse_inode(inode);
-
-		spin_lock(&fi->lock);
-		fi->attr_version = atomic64_inc_return(&fc->attr_version);
-		i_size_write(inode, 0);
-		spin_unlock(&fi->lock);
-		file_update_time(file);
-		fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
-	}
 	if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
 		fuse_link_write_file(file);
+
+	return 0;
 }
 
-int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
+static void fuse_truncate_update_attr(struct inode *inode, struct file *file)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
+	spin_lock(&fi->lock);
+	fi->attr_version = atomic64_inc_return(&fc->attr_version);
+	i_size_write(inode, 0);
+	spin_unlock(&fi->lock);
+	file_update_time(file);
+	fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
+}
+
+static int fuse_open(struct inode *inode, struct file *file)
 {
 	struct fuse_mount *fm = get_fuse_mount(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct fuse_conn *fc = fm->fc;
+	struct fuse_file *ff;
 	int err;
-	bool is_wb_truncate = (file->f_flags & O_TRUNC) &&
-			  fc->atomic_o_trunc &&
-			  fc->writeback_cache;
-	bool dax_truncate = (file->f_flags & O_TRUNC) &&
-			  fc->atomic_o_trunc && FUSE_IS_DAX(inode);
+	bool is_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc;
+	bool is_wb_truncate = is_truncate && fc->writeback_cache;
+	bool dax_truncate = is_truncate && FUSE_IS_DAX(inode);
 
 	if (fuse_is_bad(inode))
 		return -EIO;
@@ -250,16 +261,20 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
 	if (is_wb_truncate || dax_truncate)
 		fuse_set_nowrite(inode);
 
-	err = fuse_do_open(fm, get_node_id(inode), file, isdir);
-	if (!err)
-		fuse_finish_open(inode, file);
+	err = fuse_do_open(fm, get_node_id(inode), file, false);
+	if (!err) {
+		ff = file->private_data;
+		err = fuse_finish_open(inode, file);
+		if (err)
+			fuse_sync_release(fi, ff, file->f_flags);
+		else if (is_truncate)
+			fuse_truncate_update_attr(inode, file);
+	}
 
 	if (is_wb_truncate || dax_truncate)
 		fuse_release_nowrite(inode);
 	if (!err) {
-		struct fuse_file *ff = file->private_data;
-
-		if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC))
+		if (is_truncate)
 			truncate_pagecache(inode, 0);
 		else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
 			invalidate_inode_pages2(inode->i_mapping);
@@ -274,10 +289,13 @@ out_inode_unlock:
 }
 
 static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
-				 unsigned int flags, int opcode)
+				 unsigned int flags, int opcode, bool sync)
 {
 	struct fuse_conn *fc = ff->fm->fc;
-	struct fuse_release_args *ra = ff->release_args;
+	struct fuse_release_args *ra = &ff->args->release_args;
+
+	if (fuse_file_passthrough(ff))
+		fuse_passthrough_release(ff, fuse_inode_backing(fi));
 
 	/* Inode is NULL on error path of fuse_create_open() */
 	if (likely(fi)) {
@@ -292,6 +310,11 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
 
 	wake_up_interruptible_all(&ff->poll_wait);
 
+	if (!ra)
+		return;
+
+	/* ff->args was used for open outarg */
+	memset(ff->args, 0, sizeof(*ff->args));
 	ra->inarg.fh = ff->fh;
 	ra->inarg.flags = flags;
 	ra->args.in_numargs = 1;
@@ -301,23 +324,28 @@ static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
 	ra->args.nodeid = ff->nodeid;
 	ra->args.force = true;
 	ra->args.nocreds = true;
+
+	/*
+	 * Hold inode until release is finished.
+	 * From fuse_sync_release() the refcount is 1 and everything's
+	 * synchronous, so we are fine with not doing igrab() here.
+	 */
+	ra->inode = sync ? NULL : igrab(&fi->inode);
 }
 
 void fuse_file_release(struct inode *inode, struct fuse_file *ff,
 		       unsigned int open_flags, fl_owner_t id, bool isdir)
 {
 	struct fuse_inode *fi = get_fuse_inode(inode);
-	struct fuse_release_args *ra = ff->release_args;
+	struct fuse_release_args *ra = &ff->args->release_args;
 	int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;
 
-	fuse_prepare_release(fi, ff, open_flags, opcode);
+	fuse_prepare_release(fi, ff, open_flags, opcode, false);
 
-	if (ff->flock) {
+	if (ra && ff->flock) {
 		ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
 		ra->inarg.lock_owner = fuse_lock_owner_id(ff->fm->fc, id);
 	}
-	/* Hold inode until release is finished */
-	ra->inode = igrab(inode);
 
 	/*
 	 * Normally this will send the RELEASE request, however if
@@ -328,7 +356,7 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff,
 	 * synchronous RELEASE is allowed (and desirable) in this case
 	 * because the server can be trusted not to screw up.
 	 */
-	fuse_file_put(ff, ff->fm->fc->destroy, isdir);
+	fuse_file_put(ff, ff->fm->fc->destroy);
 }
 
 void fuse_release_common(struct file *file, bool isdir)
@@ -337,11 +365,6 @@ void fuse_release_common(struct file *file, bool isdir)
 			  (fl_owner_t) file, isdir);
 }
 
-static int fuse_open(struct inode *inode, struct file *file)
-{
-	return fuse_open_common(inode, file, false);
-}
-
 static int fuse_release(struct inode *inode, struct file *file)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
@@ -363,12 +386,8 @@ void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff,
 		       unsigned int flags)
 {
 	WARN_ON(refcount_read(&ff->count) > 1);
-	fuse_prepare_release(fi, ff, flags, FUSE_RELEASE);
-	/*
-	 * iput(NULL) is a no-op and since the refcount is 1 and everything's
-	 * synchronous, we are fine with not doing igrab() here"
-	 */
-	fuse_file_put(ff, true, false);
+	fuse_prepare_release(fi, ff, flags, FUSE_RELEASE, true);
+	fuse_file_put(ff, true);
 }
 EXPORT_SYMBOL_GPL(fuse_sync_release);
 
@@ -634,7 +653,8 @@ static void fuse_release_user_pages(struct fuse_args_pages *ap,
 	for (i = 0; i < ap->num_pages; i++) {
 		if (should_dirty)
 			set_page_dirty_lock(ap->pages[i]);
-		put_page(ap->pages[i]);
+		if (ap->args.is_pinned)
+			unpin_user_page(ap->pages[i]);
 	}
 }
 
@@ -925,7 +945,7 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args,
 		put_page(page);
 	}
 	if (ia->ff)
-		fuse_file_put(ia->ff, false, false);
+		fuse_file_put(ia->ff, false);
 
 	fuse_io_free(ia);
 }
@@ -1299,13 +1319,93 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii)
 	return res;
 }
 
+static bool fuse_io_past_eof(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+
+	return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode);
+}
+
+/*
+ * @return true if an exclusive lock for direct IO writes is needed
+ */
+static bool fuse_dio_wr_exclusive_lock(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	struct fuse_file *ff = file->private_data;
+	struct inode *inode = file_inode(iocb->ki_filp);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
+	/* Server side has to advise that it supports parallel dio writes. */
+	if (!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES))
+		return true;
+
+	/*
+	 * Append will need to know the eventual EOF - always needs an
+	 * exclusive lock.
+	 */
+	if (iocb->ki_flags & IOCB_APPEND)
+		return true;
+
+	/* shared locks are not allowed with parallel page cache IO */
+	if (test_bit(FUSE_I_CACHE_IO_MODE, &fi->state))
+		return false;
+
+	/* Parallel dio beyond EOF is not supported, at least for now. */
+	if (fuse_io_past_eof(iocb, from))
+		return true;
+
+	return false;
+}
+
+static void fuse_dio_lock(struct kiocb *iocb, struct iov_iter *from,
+			  bool *exclusive)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	struct fuse_file *ff = iocb->ki_filp->private_data;
+
+	*exclusive = fuse_dio_wr_exclusive_lock(iocb, from);
+	if (*exclusive) {
+		inode_lock(inode);
+	} else {
+		inode_lock_shared(inode);
+		/*
+		 * New parallal dio allowed only if inode is not in caching
+		 * mode and denies new opens in caching mode. This check
+		 * should be performed only after taking shared inode lock.
+		 * Previous past eof check was without inode lock and might
+		 * have raced, so check it again.
+		 */
+		if (fuse_io_past_eof(iocb, from) ||
+		    fuse_file_uncached_io_start(inode, ff, NULL) != 0) {
+			inode_unlock_shared(inode);
+			inode_lock(inode);
+			*exclusive = true;
+		}
+	}
+}
+
+static void fuse_dio_unlock(struct kiocb *iocb, bool exclusive)
+{
+	struct inode *inode = file_inode(iocb->ki_filp);
+	struct fuse_file *ff = iocb->ki_filp->private_data;
+
+	if (exclusive) {
+		inode_unlock(inode);
+	} else {
+		/* Allow opens in caching mode after last parallel dio end */
+		fuse_file_uncached_io_end(inode, ff);
+		inode_unlock_shared(inode);
+	}
+}
+
 static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	ssize_t written = 0;
 	struct inode *inode = mapping->host;
-	ssize_t err;
+	ssize_t err, count;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 
 	if (fc->writeback_cache) {
@@ -1327,10 +1427,12 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 writethrough:
 	inode_lock(inode);
 
-	err = generic_write_checks(iocb, from);
+	err = count = generic_write_checks(iocb, from);
 	if (err <= 0)
 		goto out;
 
+	task_io_account_write(count);
+
 	err = file_remove_privs(file);
 	if (err)
 		goto out;
@@ -1392,10 +1494,13 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
 	while (nbytes < *nbytesp && ap->num_pages < max_pages) {
 		unsigned npages;
 		size_t start;
-		ret = iov_iter_get_pages2(ii, &ap->pages[ap->num_pages],
-					*nbytesp - nbytes,
-					max_pages - ap->num_pages,
-					&start);
+		struct page **pt_pages;
+
+		pt_pages = &ap->pages[ap->num_pages];
+		ret = iov_iter_extract_pages(ii, &pt_pages,
+					     *nbytesp - nbytes,
+					     max_pages - ap->num_pages,
+					     0, &start);
 		if (ret < 0)
 			break;
 
@@ -1412,6 +1517,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
 			(PAGE_SIZE - ret) & (PAGE_SIZE - 1);
 	}
 
+	ap->args.is_pinned = iov_iter_extract_will_pin(ii);
 	ap->args.user_pages = true;
 	if (write)
 		ap->args.in_pages = true;
@@ -1558,51 +1664,17 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	return res;
 }
 
-static bool fuse_direct_write_extending_i_size(struct kiocb *iocb,
-					       struct iov_iter *iter)
-{
-	struct inode *inode = file_inode(iocb->ki_filp);
-
-	return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode);
-}
-
 static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct inode *inode = file_inode(iocb->ki_filp);
-	struct file *file = iocb->ki_filp;
-	struct fuse_file *ff = file->private_data;
 	struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
 	ssize_t res;
-	bool exclusive_lock =
-		!(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) ||
-		get_fuse_conn(inode)->direct_io_allow_mmap ||
-		iocb->ki_flags & IOCB_APPEND ||
-		fuse_direct_write_extending_i_size(iocb, from);
-
-	/*
-	 * Take exclusive lock if
-	 * - Parallel direct writes are disabled - a user space decision
-	 * - Parallel direct writes are enabled and i_size is being extended.
-	 * - Shared mmap on direct_io file is supported (FUSE_DIRECT_IO_ALLOW_MMAP).
-	 *   This might not be needed at all, but needs further investigation.
-	 */
-	if (exclusive_lock)
-		inode_lock(inode);
-	else {
-		inode_lock_shared(inode);
-
-		/* A race with truncate might have come up as the decision for
-		 * the lock type was done without holding the lock, check again.
-		 */
-		if (fuse_direct_write_extending_i_size(iocb, from)) {
-			inode_unlock_shared(inode);
-			inode_lock(inode);
-			exclusive_lock = true;
-		}
-	}
+	bool exclusive;
 
+	fuse_dio_lock(iocb, from, &exclusive);
 	res = generic_write_checks(iocb, from);
 	if (res > 0) {
+		task_io_account_write(res);
 		if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
 			res = fuse_direct_IO(iocb, from);
 		} else {
@@ -1611,10 +1683,7 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
 			fuse_write_update_attr(inode, iocb->ki_pos, res);
 		}
 	}
-	if (exclusive_lock)
-		inode_unlock(inode);
-	else
-		inode_unlock_shared(inode);
+	fuse_dio_unlock(iocb, exclusive);
 
 	return res;
 }
@@ -1631,10 +1700,13 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	if (FUSE_IS_DAX(inode))
 		return fuse_dax_read_iter(iocb, to);
 
-	if (!(ff->open_flags & FOPEN_DIRECT_IO))
-		return fuse_cache_read_iter(iocb, to);
-	else
+	/* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
+	if (ff->open_flags & FOPEN_DIRECT_IO)
 		return fuse_direct_read_iter(iocb, to);
+	else if (fuse_file_passthrough(ff))
+		return fuse_passthrough_read_iter(iocb, to);
+	else
+		return fuse_cache_read_iter(iocb, to);
 }
 
 static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
@@ -1649,10 +1721,38 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (FUSE_IS_DAX(inode))
 		return fuse_dax_write_iter(iocb, from);
 
-	if (!(ff->open_flags & FOPEN_DIRECT_IO))
+	/* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
+	if (ff->open_flags & FOPEN_DIRECT_IO)
+		return fuse_direct_write_iter(iocb, from);
+	else if (fuse_file_passthrough(ff))
+		return fuse_passthrough_write_iter(iocb, from);
+	else
 		return fuse_cache_write_iter(iocb, from);
+}
+
+static ssize_t fuse_splice_read(struct file *in, loff_t *ppos,
+				struct pipe_inode_info *pipe, size_t len,
+				unsigned int flags)
+{
+	struct fuse_file *ff = in->private_data;
+
+	/* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
+	if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO))
+		return fuse_passthrough_splice_read(in, ppos, pipe, len, flags);
 	else
-		return fuse_direct_write_iter(iocb, from);
+		return filemap_splice_read(in, ppos, pipe, len, flags);
+}
+
+static ssize_t fuse_splice_write(struct pipe_inode_info *pipe, struct file *out,
+				 loff_t *ppos, size_t len, unsigned int flags)
+{
+	struct fuse_file *ff = out->private_data;
+
+	/* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
+	if (fuse_file_passthrough(ff) && !(ff->open_flags & FOPEN_DIRECT_IO))
+		return fuse_passthrough_splice_write(pipe, out, ppos, len, flags);
+	else
+		return iter_file_splice_write(pipe, out, ppos, len, flags);
 }
 
 static void fuse_writepage_free(struct fuse_writepage_args *wpa)
@@ -1667,7 +1767,7 @@ static void fuse_writepage_free(struct fuse_writepage_args *wpa)
 		__free_page(ap->pages[i]);
 
 	if (wpa->ia.ff)
-		fuse_file_put(wpa->ia.ff, false, false);
+		fuse_file_put(wpa->ia.ff, false);
 
 	kfree(ap->pages);
 	kfree(wpa);
@@ -1909,7 +2009,7 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
 	ff = __fuse_write_file_get(fi);
 	err = fuse_flush_times(inode, ff);
 	if (ff)
-		fuse_file_put(ff, false, false);
+		fuse_file_put(ff, false);
 
 	return err;
 }
@@ -1947,26 +2047,26 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc,
 	rcu_read_unlock();
 }
 
-static int fuse_writepage_locked(struct page *page)
+static int fuse_writepage_locked(struct folio *folio)
 {
-	struct address_space *mapping = page->mapping;
+	struct address_space *mapping = folio->mapping;
 	struct inode *inode = mapping->host;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct fuse_writepage_args *wpa;
 	struct fuse_args_pages *ap;
-	struct page *tmp_page;
+	struct folio *tmp_folio;
 	int error = -ENOMEM;
 
-	set_page_writeback(page);
+	folio_start_writeback(folio);
 
 	wpa = fuse_writepage_args_alloc();
 	if (!wpa)
 		goto err;
 	ap = &wpa->ia.ap;
 
-	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
-	if (!tmp_page)
+	tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0);
+	if (!tmp_folio)
 		goto err_free;
 
 	error = -EIO;
@@ -1975,21 +2075,21 @@ static int fuse_writepage_locked(struct page *page)
 		goto err_nofile;
 
 	fuse_writepage_add_to_bucket(fc, wpa);
-	fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0);
+	fuse_write_args_fill(&wpa->ia, wpa->ia.ff, folio_pos(folio), 0);
 
-	copy_highpage(tmp_page, page);
+	folio_copy(tmp_folio, folio);
 	wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
 	wpa->next = NULL;
 	ap->args.in_pages = true;
 	ap->num_pages = 1;
-	ap->pages[0] = tmp_page;
+	ap->pages[0] = &tmp_folio->page;
 	ap->descs[0].offset = 0;
 	ap->descs[0].length = PAGE_SIZE;
 	ap->args.end = fuse_writepage_end;
 	wpa->inode = inode;
 
 	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
-	inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);
+	node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP);
 
 	spin_lock(&fi->lock);
 	tree_insert(&fi->writepages, wpa);
@@ -1997,48 +2097,20 @@ static int fuse_writepage_locked(struct page *page)
 	fuse_flush_writepages(inode);
 	spin_unlock(&fi->lock);
 
-	end_page_writeback(page);
+	folio_end_writeback(folio);
 
 	return 0;
 
 err_nofile:
-	__free_page(tmp_page);
+	folio_put(tmp_folio);
 err_free:
 	kfree(wpa);
 err:
-	mapping_set_error(page->mapping, error);
-	end_page_writeback(page);
+	mapping_set_error(folio->mapping, error);
+	folio_end_writeback(folio);
 	return error;
 }
 
-static int fuse_writepage(struct page *page, struct writeback_control *wbc)
-{
-	struct fuse_conn *fc = get_fuse_conn(page->mapping->host);
-	int err;
-
-	if (fuse_page_is_writeback(page->mapping->host, page->index)) {
-		/*
-		 * ->writepages() should be called for sync() and friends.  We
-		 * should only get here on direct reclaim and then we are
-		 * allowed to skip a page which is already in flight
-		 */
-		WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
-
-		redirty_page_for_writepage(wbc, page);
-		unlock_page(page);
-		return 0;
-	}
-
-	if (wbc->sync_mode == WB_SYNC_NONE &&
-	    fc->num_background >= fc->congestion_threshold)
-		return AOP_WRITEPAGE_ACTIVATE;
-
-	err = fuse_writepage_locked(page);
-	unlock_page(page);
-
-	return err;
-}
-
 struct fuse_fill_wb_data {
 	struct fuse_writepage_args *wpa;
 	struct fuse_file *ff;
@@ -2307,7 +2379,7 @@ static int fuse_writepages(struct address_space *mapping,
 		fuse_writepages_send(&data);
 	}
 	if (data.ff)
-		fuse_file_put(data.ff, false, false);
+		fuse_file_put(data.ff, false);
 
 	kfree(data.orig_pages);
 out:
@@ -2401,7 +2473,7 @@ static int fuse_launder_folio(struct folio *folio)
 
 		/* Serialize with pending writeback for the same page */
 		fuse_wait_on_page_writeback(inode, folio->index);
-		err = fuse_writepage_locked(&folio->page);
+		err = fuse_writepage_locked(folio);
 		if (!err)
 			fuse_wait_on_page_writeback(inode, folio->index);
 	}
@@ -2462,13 +2534,30 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = ff->fm->fc;
+	struct inode *inode = file_inode(file);
+	int rc;
 
 	/* DAX mmap is superior to direct_io mmap */
-	if (FUSE_IS_DAX(file_inode(file)))
+	if (FUSE_IS_DAX(inode))
 		return fuse_dax_mmap(file, vma);
 
+	/*
+	 * If inode is in passthrough io mode, because it has some file open
+	 * in passthrough mode, either mmap to backing file or fail mmap,
+	 * because mixing cached mmap and passthrough io mode is not allowed.
+	 */
+	if (fuse_file_passthrough(ff))
+		return fuse_passthrough_mmap(file, vma);
+	else if (fuse_inode_backing(get_fuse_inode(inode)))
+		return -ENODEV;
+
+	/*
+	 * FOPEN_DIRECT_IO handling is special compared to O_DIRECT,
+	 * as does not allow MAP_SHARED mmap without FUSE_DIRECT_IO_ALLOW_MMAP.
+	 */
 	if (ff->open_flags & FOPEN_DIRECT_IO) {
-		/* Can't provide the coherency needed for MAP_SHARED
+		/*
+		 * Can't provide the coherency needed for MAP_SHARED
 		 * if FUSE_DIRECT_IO_ALLOW_MMAP isn't set.
 		 */
 		if ((vma->vm_flags & VM_MAYSHARE) && !fc->direct_io_allow_mmap)
@@ -2476,7 +2565,19 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
 
 		invalidate_inode_pages2(file->f_mapping);
 
-		return generic_file_mmap(file, vma);
+		if (!(vma->vm_flags & VM_MAYSHARE)) {
+			/* MAP_PRIVATE */
+			return generic_file_mmap(file, vma);
+		}
+
+		/*
+		 * First mmap of direct_io file enters caching inode io mode.
+		 * Also waits for parallel dio writers to go into serial mode
+		 * (exclusive instead of shared lock).
+		 */
+		rc = fuse_file_cached_io_start(inode, ff);
+		if (rc)
+			return rc;
 	}
 
 	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
@@ -2509,14 +2610,14 @@ static int convert_fuse_file_lock(struct fuse_conn *fc,
 		 * translate it into the caller's pid namespace.
 		 */
 		rcu_read_lock();
-		fl->fl_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns);
+		fl->c.flc_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns);
 		rcu_read_unlock();
 		break;
 
 	default:
 		return -EIO;
 	}
-	fl->fl_type = ffl->type;
+	fl->c.flc_type = ffl->type;
 	return 0;
 }
 
@@ -2530,10 +2631,10 @@ static void fuse_lk_fill(struct fuse_args *args, struct file *file,
 
 	memset(inarg, 0, sizeof(*inarg));
 	inarg->fh = ff->fh;
-	inarg->owner = fuse_lock_owner_id(fc, fl->fl_owner);
+	inarg->owner = fuse_lock_owner_id(fc, fl->c.flc_owner);
 	inarg->lk.start = fl->fl_start;
 	inarg->lk.end = fl->fl_end;
-	inarg->lk.type = fl->fl_type;
+	inarg->lk.type = fl->c.flc_type;
 	inarg->lk.pid = pid;
 	if (flock)
 		inarg->lk_flags |= FUSE_LK_FLOCK;
@@ -2570,8 +2671,8 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
 	struct fuse_mount *fm = get_fuse_mount(inode);
 	FUSE_ARGS(args);
 	struct fuse_lk_in inarg;
-	int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
-	struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL;
+	int opcode = (fl->c.flc_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
+	struct pid *pid = fl->c.flc_type != F_UNLCK ? task_tgid(current) : NULL;
 	pid_t pid_nr = pid_nr_ns(pid, fm->fc->pid_ns);
 	int err;
 
@@ -2580,10 +2681,6 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
 		return -ENOLCK;
 	}
 
-	/* Unlock on close is handled by the flush method */
-	if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX)
-		return 0;
-
 	fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
 	err = fuse_simple_request(fm, &args);
 
@@ -3213,8 +3310,8 @@ static const struct file_operations fuse_file_operations = {
 	.lock		= fuse_file_lock,
 	.get_unmapped_area = thp_get_unmapped_area,
 	.flock		= fuse_file_flock,
-	.splice_read	= filemap_splice_read,
-	.splice_write	= iter_file_splice_write,
+	.splice_read	= fuse_splice_read,
+	.splice_write	= fuse_splice_write,
 	.unlocked_ioctl	= fuse_file_ioctl,
 	.compat_ioctl	= fuse_file_compat_ioctl,
 	.poll		= fuse_file_poll,
@@ -3225,10 +3322,10 @@ static const struct file_operations fuse_file_operations = {
 static const struct address_space_operations fuse_file_aops  = {
 	.read_folio	= fuse_read_folio,
 	.readahead	= fuse_readahead,
-	.writepage	= fuse_writepage,
 	.writepages	= fuse_writepages,
 	.launder_folio	= fuse_launder_folio,
 	.dirty_folio	= filemap_dirty_folio,
+	.migrate_folio	= filemap_migrate_folio,
 	.bmap		= fuse_bmap,
 	.direct_IO	= fuse_direct_IO,
 	.write_begin	= fuse_write_begin,
@@ -3245,7 +3342,9 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags)
 	INIT_LIST_HEAD(&fi->write_files);
 	INIT_LIST_HEAD(&fi->queued_writes);
 	fi->writectr = 0;
+	fi->iocachectr = 0;
 	init_waitqueue_head(&fi->page_waitq);
+	init_waitqueue_head(&fi->direct_io_waitq);
 	fi->writepages = RB_ROOT;
 
 	if (IS_ENABLED(CONFIG_FUSE_DAX))
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index bcbe34488862..b24084b60864 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -76,6 +76,16 @@ struct fuse_submount_lookup {
 	struct fuse_forget_link *forget;
 };
 
+/** Container for data related to mapping to backing file */
+struct fuse_backing {
+	struct file *file;
+	struct cred *cred;
+
+	/** refcount */
+	refcount_t count;
+	struct rcu_head rcu;
+};
+
 /** FUSE inode */
 struct fuse_inode {
 	/** Inode data */
@@ -111,7 +121,7 @@ struct fuse_inode {
 	u64 attr_version;
 
 	union {
-		/* Write related fields (regular file only) */
+		/* read/write io cache (regular file only) */
 		struct {
 			/* Files usable in writepage.  Protected by fi->lock */
 			struct list_head write_files;
@@ -123,9 +133,15 @@ struct fuse_inode {
 			 * (FUSE_NOWRITE) means more writes are blocked */
 			int writectr;
 
+			/** Number of files/maps using page cache */
+			int iocachectr;
+
 			/* Waitq for writepage completion */
 			wait_queue_head_t page_waitq;
 
+			/* waitq for direct-io completion */
+			wait_queue_head_t direct_io_waitq;
+
 			/* List of writepage requestst (pending or sent) */
 			struct rb_root writepages;
 		};
@@ -173,6 +189,10 @@ struct fuse_inode {
 #endif
 	/** Submount specific lookup tracking */
 	struct fuse_submount_lookup *submount_lookup;
+#ifdef CONFIG_FUSE_PASSTHROUGH
+	/** Reference to backing file in passthrough mode */
+	struct fuse_backing *fb;
+#endif
 };
 
 /** FUSE inode state bits */
@@ -187,19 +207,21 @@ enum {
 	FUSE_I_BAD,
 	/* Has btime */
 	FUSE_I_BTIME,
+	/* Wants or already has page cache IO */
+	FUSE_I_CACHE_IO_MODE,
 };
 
 struct fuse_conn;
 struct fuse_mount;
-struct fuse_release_args;
+union fuse_file_args;
 
 /** FUSE specific file data */
 struct fuse_file {
 	/** Fuse connection for this file */
 	struct fuse_mount *fm;
 
-	/* Argument space reserved for release */
-	struct fuse_release_args *release_args;
+	/* Argument space reserved for open/release */
+	union fuse_file_args *args;
 
 	/** Kernel file handle guaranteed to be unique */
 	u64 kh;
@@ -221,12 +243,6 @@ struct fuse_file {
 
 	/* Readdir related */
 	struct {
-		/*
-		 * Protects below fields against (crazy) parallel readdir on
-		 * same open file.  Uncontended in the normal case.
-		 */
-		struct mutex lock;
-
 		/* Dir stream position */
 		loff_t pos;
 
@@ -244,6 +260,15 @@ struct fuse_file {
 	/** Wait queue head for poll */
 	wait_queue_head_t poll_wait;
 
+	/** Does file hold a fi->iocachectr refcount? */
+	enum { IOM_NONE, IOM_CACHED, IOM_UNCACHED } iomode;
+
+#ifdef CONFIG_FUSE_PASSTHROUGH
+	/** Reference to backing file in passthrough mode */
+	struct file *passthrough;
+	const struct cred *cred;
+#endif
+
 	/** Has flock been performed on this file? */
 	bool flock:1;
 };
@@ -283,6 +308,7 @@ struct fuse_args {
 	bool page_replace:1;
 	bool may_block:1;
 	bool is_ext:1;
+	bool is_pinned:1;
 	struct fuse_in_arg in_args[3];
 	struct fuse_arg out_args[2];
 	void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error);
@@ -295,6 +321,19 @@ struct fuse_args_pages {
 	unsigned int num_pages;
 };
 
+struct fuse_release_args {
+	struct fuse_args args;
+	struct fuse_release_in inarg;
+	struct inode *inode;
+};
+
+union fuse_file_args {
+	/* Used during open() */
+	struct fuse_open_out open_outarg;
+	/* Used during release() */
+	struct fuse_release_args release_args;
+};
+
 #define FUSE_ARGS(args) struct fuse_args args = {}
 
 /** The request IO state (for asynchronous processing) */
@@ -818,6 +857,12 @@ struct fuse_conn {
 	/* Is statx not implemented by fs? */
 	unsigned int no_statx:1;
 
+	/** Passthrough support for read/write IO */
+	unsigned int passthrough:1;
+
+	/** Maximum stack depth for passthrough backing files */
+	int max_stack_depth;
+
 	/** The number of requests waiting for completion */
 	atomic_t num_waiting;
 
@@ -867,6 +912,11 @@ struct fuse_conn {
 
 	/* New writepages go into this bucket */
 	struct fuse_sync_bucket __rcu *curr_bucket;
+
+#ifdef CONFIG_FUSE_PASSTHROUGH
+	/** IDR for backing files ids */
+	struct idr backing_files_map;
+#endif
 };
 
 /*
@@ -940,7 +990,6 @@ static inline bool fuse_stale_inode(const struct inode *inode, int generation,
 
 static inline void fuse_make_bad(struct inode *inode)
 {
-	remove_inode_hash(inode);
 	set_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state);
 }
 
@@ -1032,14 +1081,9 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
 			 size_t count, int opcode);
 
 
-/**
- * Send OPEN or OPENDIR request
- */
-int fuse_open_common(struct inode *inode, struct file *file, bool isdir);
-
-struct fuse_file *fuse_file_alloc(struct fuse_mount *fm);
+struct fuse_file *fuse_file_alloc(struct fuse_mount *fm, bool release);
 void fuse_file_free(struct fuse_file *ff);
-void fuse_finish_open(struct inode *inode, struct file *file);
+int fuse_finish_open(struct inode *inode, struct file *file);
 
 void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff,
 		       unsigned int flags);
@@ -1349,11 +1393,82 @@ int fuse_fileattr_get(struct dentry *dentry, struct fileattr *fa);
 int fuse_fileattr_set(struct mnt_idmap *idmap,
 		      struct dentry *dentry, struct fileattr *fa);
 
-/* file.c */
+/* iomode.c */
+int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff);
+int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struct fuse_backing *fb);
+void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff);
+
+int fuse_file_io_open(struct file *file, struct inode *inode);
+void fuse_file_io_release(struct fuse_file *ff, struct inode *inode);
 
+/* file.c */
 struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid,
 				 unsigned int open_flags, bool isdir);
 void fuse_file_release(struct inode *inode, struct fuse_file *ff,
 		       unsigned int open_flags, fl_owner_t id, bool isdir);
 
+/* passthrough.c */
+static inline struct fuse_backing *fuse_inode_backing(struct fuse_inode *fi)
+{
+#ifdef CONFIG_FUSE_PASSTHROUGH
+	return READ_ONCE(fi->fb);
+#else
+	return NULL;
+#endif
+}
+
+static inline struct fuse_backing *fuse_inode_backing_set(struct fuse_inode *fi,
+							  struct fuse_backing *fb)
+{
+#ifdef CONFIG_FUSE_PASSTHROUGH
+	return xchg(&fi->fb, fb);
+#else
+	return NULL;
+#endif
+}
+
+#ifdef CONFIG_FUSE_PASSTHROUGH
+struct fuse_backing *fuse_backing_get(struct fuse_backing *fb);
+void fuse_backing_put(struct fuse_backing *fb);
+#else
+
+static inline struct fuse_backing *fuse_backing_get(struct fuse_backing *fb)
+{
+	return NULL;
+}
+
+static inline void fuse_backing_put(struct fuse_backing *fb)
+{
+}
+#endif
+
+void fuse_backing_files_init(struct fuse_conn *fc);
+void fuse_backing_files_free(struct fuse_conn *fc);
+int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map);
+int fuse_backing_close(struct fuse_conn *fc, int backing_id);
+
+struct fuse_backing *fuse_passthrough_open(struct file *file,
+					   struct inode *inode,
+					   int backing_id);
+void fuse_passthrough_release(struct fuse_file *ff, struct fuse_backing *fb);
+
+static inline struct file *fuse_file_passthrough(struct fuse_file *ff)
+{
+#ifdef CONFIG_FUSE_PASSTHROUGH
+	return ff->passthrough;
+#else
+	return NULL;
+#endif
+}
+
+ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter);
+ssize_t fuse_passthrough_write_iter(struct kiocb *iocb, struct iov_iter *iter);
+ssize_t fuse_passthrough_splice_read(struct file *in, loff_t *ppos,
+				     struct pipe_inode_info *pipe,
+				     size_t len, unsigned int flags);
+ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe,
+				      struct file *out, loff_t *ppos,
+				      size_t len, unsigned int flags);
+ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma);
+
 #endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 516ea2979a90..3a5d88878335 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -111,6 +111,9 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
 	if (IS_ENABLED(CONFIG_FUSE_DAX) && !fuse_dax_inode_alloc(sb, fi))
 		goto out_free_forget;
 
+	if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
+		fuse_inode_backing_set(fi, NULL);
+
 	return &fi->inode;
 
 out_free_forget:
@@ -129,6 +132,9 @@ static void fuse_free_inode(struct inode *inode)
 #ifdef CONFIG_FUSE_DAX
 	kfree(fi->dax);
 #endif
+	if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
+		fuse_backing_put(fuse_inode_backing(fi));
+
 	kmem_cache_free(fuse_inode_cachep, fi);
 }
 
@@ -469,8 +475,11 @@ retry:
 	} else if (fuse_stale_inode(inode, generation, attr)) {
 		/* nodeid was reused, any I/O on the old inode should fail */
 		fuse_make_bad(inode);
-		iput(inode);
-		goto retry;
+		if (inode != d_inode(sb->s_root)) {
+			remove_inode_hash(inode);
+			iput(inode);
+			goto retry;
+		}
 	}
 	fi = get_fuse_inode(inode);
 	spin_lock(&fi->lock);
@@ -924,6 +933,9 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
 	fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ;
 	fc->max_pages_limit = FUSE_MAX_MAX_PAGES;
 
+	if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
+		fuse_backing_files_init(fc);
+
 	INIT_LIST_HEAD(&fc->mounts);
 	list_add(&fm->fc_entry, &fc->mounts);
 	fm->fc = fc;
@@ -954,6 +966,8 @@ void fuse_conn_put(struct fuse_conn *fc)
 			WARN_ON(atomic_read(&bucket->count) != 1);
 			kfree(bucket);
 		}
+		if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
+			fuse_backing_files_free(fc);
 		call_rcu(&fc->rcu, delayed_release);
 	}
 }
@@ -974,7 +988,7 @@ static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
 	attr.mode = mode;
 	attr.ino = FUSE_ROOT_ID;
 	attr.nlink = 1;
-	return fuse_iget(sb, 1, 0, &attr, 0, 0);
+	return fuse_iget(sb, FUSE_ROOT_ID, 0, &attr, 0, 0);
 }
 
 struct fuse_inode_handle {
@@ -1117,6 +1131,11 @@ static struct dentry *fuse_get_parent(struct dentry *child)
 	return parent;
 }
 
+/* only for fid encoding; no support for file handle */
+static const struct export_operations fuse_export_fid_operations = {
+	.encode_fh	= fuse_encode_fh,
+};
+
 static const struct export_operations fuse_export_operations = {
 	.fh_to_dentry	= fuse_fh_to_dentry,
 	.fh_to_parent	= fuse_fh_to_parent,
@@ -1291,6 +1310,26 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
 				fc->create_supp_group = 1;
 			if (flags & FUSE_DIRECT_IO_ALLOW_MMAP)
 				fc->direct_io_allow_mmap = 1;
+			/*
+			 * max_stack_depth is the max stack depth of FUSE fs,
+			 * so it has to be at least 1 to support passthrough
+			 * to backing files.
+			 *
+			 * with max_stack_depth > 1, the backing files can be
+			 * on a stacked fs (e.g. overlayfs) themselves and with
+			 * max_stack_depth == 1, FUSE fs can be stacked as the
+			 * underlying fs of a stacked fs (e.g. overlayfs).
+			 */
+			if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH) &&
+			    (flags & FUSE_PASSTHROUGH) &&
+			    arg->max_stack_depth > 0 &&
+			    arg->max_stack_depth <= FILESYSTEM_MAX_STACK_DEPTH) {
+				fc->passthrough = 1;
+				fc->max_stack_depth = arg->max_stack_depth;
+				fm->sb->s_stack_depth = arg->max_stack_depth;
+			}
+			if (flags & FUSE_NO_EXPORT_SUPPORT)
+				fm->sb->s_export_op = &fuse_export_fid_operations;
 		} else {
 			ra_pages = fc->max_read / PAGE_SIZE;
 			fc->no_lock = 1;
@@ -1337,7 +1376,8 @@ void fuse_send_init(struct fuse_mount *fm)
 		FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA |
 		FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT |
 		FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP |
-		FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP;
+		FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP |
+		FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND;
 #ifdef CONFIG_FUSE_DAX
 	if (fm->fc->dax)
 		flags |= FUSE_MAP_ALIGNMENT;
@@ -1346,6 +1386,8 @@ void fuse_send_init(struct fuse_mount *fm)
 #endif
 	if (fm->fc->auto_submounts)
 		flags |= FUSE_SUBMOUNTS;
+	if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
+		flags |= FUSE_PASSTHROUGH;
 
 	ia->in.flags = flags;
 	ia->in.flags2 = flags >> 32;
@@ -1496,8 +1538,8 @@ static void fuse_fill_attr_from_inode(struct fuse_attr *attr,
 		.ctimensec	= ctime.tv_nsec,
 		.mode		= fi->inode.i_mode,
 		.nlink		= fi->inode.i_nlink,
-		.uid		= fi->inode.i_uid.val,
-		.gid		= fi->inode.i_gid.val,
+		.uid		= __kuid_val(fi->inode.i_uid),
+		.gid		= __kgid_val(fi->inode.i_gid),
 		.rdev		= fi->inode.i_rdev,
 		.blksize	= 1u << fi->inode.i_blkbits,
 	};
@@ -1534,6 +1576,7 @@ static int fuse_fill_super_submount(struct super_block *sb,
 	sb->s_bdi = bdi_get(parent_sb->s_bdi);
 
 	sb->s_xattr = parent_sb->s_xattr;
+	sb->s_export_op = parent_sb->s_export_op;
 	sb->s_time_gran = parent_sb->s_time_gran;
 	sb->s_blocksize = parent_sb->s_blocksize;
 	sb->s_blocksize_bits = parent_sb->s_blocksize_bits;
diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c
new file mode 100644
index 000000000000..c653ddcf0578
--- /dev/null
+++ b/fs/fuse/iomode.c
@@ -0,0 +1,254 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * FUSE inode io modes.
+ *
+ * Copyright (c) 2024 CTERA Networks.
+ */
+
+#include "fuse_i.h"
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+
+/*
+ * Return true if need to wait for new opens in caching mode.
+ */
+static inline bool fuse_is_io_cache_wait(struct fuse_inode *fi)
+{
+	return READ_ONCE(fi->iocachectr) < 0 && !fuse_inode_backing(fi);
+}
+
+/*
+ * Start cached io mode.
+ *
+ * Blocks new parallel dio writes and waits for the in-progress parallel dio
+ * writes to complete.
+ */
+int fuse_file_cached_io_start(struct inode *inode, struct fuse_file *ff)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
+	/* There are no io modes if server does not implement open */
+	if (!ff->args)
+		return 0;
+
+	spin_lock(&fi->lock);
+	/*
+	 * Setting the bit advises new direct-io writes to use an exclusive
+	 * lock - without it the wait below might be forever.
+	 */
+	while (fuse_is_io_cache_wait(fi)) {
+		set_bit(FUSE_I_CACHE_IO_MODE, &fi->state);
+		spin_unlock(&fi->lock);
+		wait_event(fi->direct_io_waitq, !fuse_is_io_cache_wait(fi));
+		spin_lock(&fi->lock);
+	}
+
+	/*
+	 * Check if inode entered passthrough io mode while waiting for parallel
+	 * dio write completion.
+	 */
+	if (fuse_inode_backing(fi)) {
+		clear_bit(FUSE_I_CACHE_IO_MODE, &fi->state);
+		spin_unlock(&fi->lock);
+		return -ETXTBSY;
+	}
+
+	WARN_ON(ff->iomode == IOM_UNCACHED);
+	if (ff->iomode == IOM_NONE) {
+		ff->iomode = IOM_CACHED;
+		if (fi->iocachectr == 0)
+			set_bit(FUSE_I_CACHE_IO_MODE, &fi->state);
+		fi->iocachectr++;
+	}
+	spin_unlock(&fi->lock);
+	return 0;
+}
+
+static void fuse_file_cached_io_end(struct inode *inode, struct fuse_file *ff)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
+	spin_lock(&fi->lock);
+	WARN_ON(fi->iocachectr <= 0);
+	WARN_ON(ff->iomode != IOM_CACHED);
+	ff->iomode = IOM_NONE;
+	fi->iocachectr--;
+	if (fi->iocachectr == 0)
+		clear_bit(FUSE_I_CACHE_IO_MODE, &fi->state);
+	spin_unlock(&fi->lock);
+}
+
+/* Start strictly uncached io mode where cache access is not allowed */
+int fuse_file_uncached_io_start(struct inode *inode, struct fuse_file *ff, struct fuse_backing *fb)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_backing *oldfb;
+	int err = 0;
+
+	spin_lock(&fi->lock);
+	/* deny conflicting backing files on same fuse inode */
+	oldfb = fuse_inode_backing(fi);
+	if (oldfb && oldfb != fb) {
+		err = -EBUSY;
+		goto unlock;
+	}
+	if (fi->iocachectr > 0) {
+		err = -ETXTBSY;
+		goto unlock;
+	}
+	WARN_ON(ff->iomode != IOM_NONE);
+	fi->iocachectr--;
+	ff->iomode = IOM_UNCACHED;
+
+	/* fuse inode holds a single refcount of backing file */
+	if (!oldfb) {
+		oldfb = fuse_inode_backing_set(fi, fb);
+		WARN_ON_ONCE(oldfb != NULL);
+	} else {
+		fuse_backing_put(fb);
+	}
+unlock:
+	spin_unlock(&fi->lock);
+	return err;
+}
+
+void fuse_file_uncached_io_end(struct inode *inode, struct fuse_file *ff)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_backing *oldfb = NULL;
+
+	spin_lock(&fi->lock);
+	WARN_ON(fi->iocachectr >= 0);
+	WARN_ON(ff->iomode != IOM_UNCACHED);
+	ff->iomode = IOM_NONE;
+	fi->iocachectr++;
+	if (!fi->iocachectr) {
+		wake_up(&fi->direct_io_waitq);
+		oldfb = fuse_inode_backing_set(fi, NULL);
+	}
+	spin_unlock(&fi->lock);
+	if (oldfb)
+		fuse_backing_put(oldfb);
+}
+
+/*
+ * Open flags that are allowed in combination with FOPEN_PASSTHROUGH.
+ * A combination of FOPEN_PASSTHROUGH and FOPEN_DIRECT_IO means that read/write
+ * operations go directly to the server, but mmap is done on the backing file.
+ * FOPEN_PASSTHROUGH mode should not co-exist with any users of the fuse inode
+ * page cache, so FOPEN_KEEP_CACHE is a strange and undesired combination.
+ */
+#define FOPEN_PASSTHROUGH_MASK \
+	(FOPEN_PASSTHROUGH | FOPEN_DIRECT_IO | FOPEN_PARALLEL_DIRECT_WRITES | \
+	 FOPEN_NOFLUSH)
+
+static int fuse_file_passthrough_open(struct inode *inode, struct file *file)
+{
+	struct fuse_file *ff = file->private_data;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_backing *fb;
+	int err;
+
+	/* Check allowed conditions for file open in passthrough mode */
+	if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH) || !fc->passthrough ||
+	    (ff->open_flags & ~FOPEN_PASSTHROUGH_MASK))
+		return -EINVAL;
+
+	fb = fuse_passthrough_open(file, inode,
+				   ff->args->open_outarg.backing_id);
+	if (IS_ERR(fb))
+		return PTR_ERR(fb);
+
+	/* First passthrough file open denies caching inode io mode */
+	err = fuse_file_uncached_io_start(inode, ff, fb);
+	if (!err)
+		return 0;
+
+	fuse_passthrough_release(ff, fb);
+	fuse_backing_put(fb);
+
+	return err;
+}
+
+/* Request access to submit new io to inode via open file */
+int fuse_file_io_open(struct file *file, struct inode *inode)
+{
+	struct fuse_file *ff = file->private_data;
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	int err;
+
+	/*
+	 * io modes are not relevant with DAX and with server that does not
+	 * implement open.
+	 */
+	if (FUSE_IS_DAX(inode) || !ff->args)
+		return 0;
+
+	/*
+	 * Server is expected to use FOPEN_PASSTHROUGH for all opens of an inode
+	 * which is already open for passthrough.
+	 */
+	err = -EINVAL;
+	if (fuse_inode_backing(fi) && !(ff->open_flags & FOPEN_PASSTHROUGH))
+		goto fail;
+
+	/*
+	 * FOPEN_PARALLEL_DIRECT_WRITES requires FOPEN_DIRECT_IO.
+	 */
+	if (!(ff->open_flags & FOPEN_DIRECT_IO))
+		ff->open_flags &= ~FOPEN_PARALLEL_DIRECT_WRITES;
+
+	/*
+	 * First passthrough file open denies caching inode io mode.
+	 * First caching file open enters caching inode io mode.
+	 *
+	 * Note that if user opens a file open with O_DIRECT, but server did
+	 * not specify FOPEN_DIRECT_IO, a later fcntl() could remove O_DIRECT,
+	 * so we put the inode in caching mode to prevent parallel dio.
+	 */
+	if ((ff->open_flags & FOPEN_DIRECT_IO) &&
+	    !(ff->open_flags & FOPEN_PASSTHROUGH))
+		return 0;
+
+	if (ff->open_flags & FOPEN_PASSTHROUGH)
+		err = fuse_file_passthrough_open(inode, file);
+	else
+		err = fuse_file_cached_io_start(inode, ff);
+	if (err)
+		goto fail;
+
+	return 0;
+
+fail:
+	pr_debug("failed to open file in requested io mode (open_flags=0x%x, err=%i).\n",
+		 ff->open_flags, err);
+	/*
+	 * The file open mode determines the inode io mode.
+	 * Using incorrect open mode is a server mistake, which results in
+	 * user visible failure of open() with EIO error.
+	 */
+	return -EIO;
+}
+
+/* No more pending io and no new io possible to inode via open/mmapped file */
+void fuse_file_io_release(struct fuse_file *ff, struct inode *inode)
+{
+	/*
+	 * Last parallel dio close allows caching inode io mode.
+	 * Last caching file close exits caching inode io mode.
+	 */
+	switch (ff->iomode) {
+	case IOM_NONE:
+		/* Nothing to do */
+		break;
+	case IOM_UNCACHED:
+		fuse_file_uncached_io_end(inode, ff);
+		break;
+	case IOM_CACHED:
+		fuse_file_cached_io_end(inode, ff);
+		break;
+	}
+}
diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c
new file mode 100644
index 000000000000..1567f0323858
--- /dev/null
+++ b/fs/fuse/passthrough.c
@@ -0,0 +1,355 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * FUSE passthrough to backing file.
+ *
+ * Copyright (c) 2023 CTERA Networks.
+ */
+
+#include "fuse_i.h"
+
+#include <linux/file.h>
+#include <linux/backing-file.h>
+#include <linux/splice.h>
+
+static void fuse_file_accessed(struct file *file)
+{
+	struct inode *inode = file_inode(file);
+
+	fuse_invalidate_atime(inode);
+}
+
+static void fuse_file_modified(struct file *file)
+{
+	struct inode *inode = file_inode(file);
+
+	fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE);
+}
+
+ssize_t fuse_passthrough_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+	struct file *file = iocb->ki_filp;
+	struct fuse_file *ff = file->private_data;
+	struct file *backing_file = fuse_file_passthrough(ff);
+	size_t count = iov_iter_count(iter);
+	ssize_t ret;
+	struct backing_file_ctx ctx = {
+		.cred = ff->cred,
+		.user_file = file,
+		.accessed = fuse_file_accessed,
+	};
+
+
+	pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu\n", __func__,
+		 backing_file, iocb->ki_pos, count);
+
+	if (!count)
+		return 0;
+
+	ret = backing_file_read_iter(backing_file, iter, iocb, iocb->ki_flags,
+				     &ctx);
+
+	return ret;
+}
+
+ssize_t fuse_passthrough_write_iter(struct kiocb *iocb,
+				    struct iov_iter *iter)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file);
+	struct fuse_file *ff = file->private_data;
+	struct file *backing_file = fuse_file_passthrough(ff);
+	size_t count = iov_iter_count(iter);
+	ssize_t ret;
+	struct backing_file_ctx ctx = {
+		.cred = ff->cred,
+		.user_file = file,
+		.end_write = fuse_file_modified,
+	};
+
+	pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu\n", __func__,
+		 backing_file, iocb->ki_pos, count);
+
+	if (!count)
+		return 0;
+
+	inode_lock(inode);
+	ret = backing_file_write_iter(backing_file, iter, iocb, iocb->ki_flags,
+				      &ctx);
+	inode_unlock(inode);
+
+	return ret;
+}
+
+ssize_t fuse_passthrough_splice_read(struct file *in, loff_t *ppos,
+				     struct pipe_inode_info *pipe,
+				     size_t len, unsigned int flags)
+{
+	struct fuse_file *ff = in->private_data;
+	struct file *backing_file = fuse_file_passthrough(ff);
+	struct backing_file_ctx ctx = {
+		.cred = ff->cred,
+		.user_file = in,
+		.accessed = fuse_file_accessed,
+	};
+
+	pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu, flags=0x%x\n", __func__,
+		 backing_file, ppos ? *ppos : 0, len, flags);
+
+	return backing_file_splice_read(backing_file, ppos, pipe, len, flags,
+					&ctx);
+}
+
+ssize_t fuse_passthrough_splice_write(struct pipe_inode_info *pipe,
+				      struct file *out, loff_t *ppos,
+				      size_t len, unsigned int flags)
+{
+	struct fuse_file *ff = out->private_data;
+	struct file *backing_file = fuse_file_passthrough(ff);
+	struct inode *inode = file_inode(out);
+	ssize_t ret;
+	struct backing_file_ctx ctx = {
+		.cred = ff->cred,
+		.user_file = out,
+		.end_write = fuse_file_modified,
+	};
+
+	pr_debug("%s: backing_file=0x%p, pos=%lld, len=%zu, flags=0x%x\n", __func__,
+		 backing_file, ppos ? *ppos : 0, len, flags);
+
+	inode_lock(inode);
+	ret = backing_file_splice_write(pipe, backing_file, ppos, len, flags,
+					&ctx);
+	inode_unlock(inode);
+
+	return ret;
+}
+
+ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct fuse_file *ff = file->private_data;
+	struct file *backing_file = fuse_file_passthrough(ff);
+	struct backing_file_ctx ctx = {
+		.cred = ff->cred,
+		.user_file = file,
+		.accessed = fuse_file_accessed,
+	};
+
+	pr_debug("%s: backing_file=0x%p, start=%lu, end=%lu\n", __func__,
+		 backing_file, vma->vm_start, vma->vm_end);
+
+	return backing_file_mmap(backing_file, vma, &ctx);
+}
+
+struct fuse_backing *fuse_backing_get(struct fuse_backing *fb)
+{
+	if (fb && refcount_inc_not_zero(&fb->count))
+		return fb;
+	return NULL;
+}
+
+static void fuse_backing_free(struct fuse_backing *fb)
+{
+	pr_debug("%s: fb=0x%p\n", __func__, fb);
+
+	if (fb->file)
+		fput(fb->file);
+	put_cred(fb->cred);
+	kfree_rcu(fb, rcu);
+}
+
+void fuse_backing_put(struct fuse_backing *fb)
+{
+	if (fb && refcount_dec_and_test(&fb->count))
+		fuse_backing_free(fb);
+}
+
+void fuse_backing_files_init(struct fuse_conn *fc)
+{
+	idr_init(&fc->backing_files_map);
+}
+
+static int fuse_backing_id_alloc(struct fuse_conn *fc, struct fuse_backing *fb)
+{
+	int id;
+
+	idr_preload(GFP_KERNEL);
+	spin_lock(&fc->lock);
+	/* FIXME: xarray might be space inefficient */
+	id = idr_alloc_cyclic(&fc->backing_files_map, fb, 1, 0, GFP_ATOMIC);
+	spin_unlock(&fc->lock);
+	idr_preload_end();
+
+	WARN_ON_ONCE(id == 0);
+	return id;
+}
+
+static struct fuse_backing *fuse_backing_id_remove(struct fuse_conn *fc,
+						   int id)
+{
+	struct fuse_backing *fb;
+
+	spin_lock(&fc->lock);
+	fb = idr_remove(&fc->backing_files_map, id);
+	spin_unlock(&fc->lock);
+
+	return fb;
+}
+
+static int fuse_backing_id_free(int id, void *p, void *data)
+{
+	struct fuse_backing *fb = p;
+
+	WARN_ON_ONCE(refcount_read(&fb->count) != 1);
+	fuse_backing_free(fb);
+	return 0;
+}
+
+void fuse_backing_files_free(struct fuse_conn *fc)
+{
+	idr_for_each(&fc->backing_files_map, fuse_backing_id_free, NULL);
+	idr_destroy(&fc->backing_files_map);
+}
+
+int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map)
+{
+	struct file *file;
+	struct super_block *backing_sb;
+	struct fuse_backing *fb = NULL;
+	int res;
+
+	pr_debug("%s: fd=%d flags=0x%x\n", __func__, map->fd, map->flags);
+
+	/* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */
+	res = -EPERM;
+	if (!fc->passthrough || !capable(CAP_SYS_ADMIN))
+		goto out;
+
+	res = -EINVAL;
+	if (map->flags)
+		goto out;
+
+	file = fget(map->fd);
+	res = -EBADF;
+	if (!file)
+		goto out;
+
+	res = -EOPNOTSUPP;
+	if (!file->f_op->read_iter || !file->f_op->write_iter)
+		goto out_fput;
+
+	backing_sb = file_inode(file)->i_sb;
+	res = -ELOOP;
+	if (backing_sb->s_stack_depth >= fc->max_stack_depth)
+		goto out_fput;
+
+	fb = kmalloc(sizeof(struct fuse_backing), GFP_KERNEL);
+	res = -ENOMEM;
+	if (!fb)
+		goto out_fput;
+
+	fb->file = file;
+	fb->cred = prepare_creds();
+	refcount_set(&fb->count, 1);
+
+	res = fuse_backing_id_alloc(fc, fb);
+	if (res < 0) {
+		fuse_backing_free(fb);
+		fb = NULL;
+	}
+
+out:
+	pr_debug("%s: fb=0x%p, ret=%i\n", __func__, fb, res);
+
+	return res;
+
+out_fput:
+	fput(file);
+	goto out;
+}
+
+int fuse_backing_close(struct fuse_conn *fc, int backing_id)
+{
+	struct fuse_backing *fb = NULL;
+	int err;
+
+	pr_debug("%s: backing_id=%d\n", __func__, backing_id);
+
+	/* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */
+	err = -EPERM;
+	if (!fc->passthrough || !capable(CAP_SYS_ADMIN))
+		goto out;
+
+	err = -EINVAL;
+	if (backing_id <= 0)
+		goto out;
+
+	err = -ENOENT;
+	fb = fuse_backing_id_remove(fc, backing_id);
+	if (!fb)
+		goto out;
+
+	fuse_backing_put(fb);
+	err = 0;
+out:
+	pr_debug("%s: fb=0x%p, err=%i\n", __func__, fb, err);
+
+	return err;
+}
+
+/*
+ * Setup passthrough to a backing file.
+ *
+ * Returns an fb object with elevated refcount to be stored in fuse inode.
+ */
+struct fuse_backing *fuse_passthrough_open(struct file *file,
+					   struct inode *inode,
+					   int backing_id)
+{
+	struct fuse_file *ff = file->private_data;
+	struct fuse_conn *fc = ff->fm->fc;
+	struct fuse_backing *fb = NULL;
+	struct file *backing_file;
+	int err;
+
+	err = -EINVAL;
+	if (backing_id <= 0)
+		goto out;
+
+	rcu_read_lock();
+	fb = idr_find(&fc->backing_files_map, backing_id);
+	fb = fuse_backing_get(fb);
+	rcu_read_unlock();
+
+	err = -ENOENT;
+	if (!fb)
+		goto out;
+
+	/* Allocate backing file per fuse file to store fuse path */
+	backing_file = backing_file_open(&file->f_path, file->f_flags,
+					 &fb->file->f_path, fb->cred);
+	err = PTR_ERR(backing_file);
+	if (IS_ERR(backing_file)) {
+		fuse_backing_put(fb);
+		goto out;
+	}
+
+	err = 0;
+	ff->passthrough = backing_file;
+	ff->cred = get_cred(fb->cred);
+out:
+	pr_debug("%s: backing_id=%d, fb=0x%p, backing_file=0x%p, err=%i\n", __func__,
+		 backing_id, fb, ff->passthrough, err);
+
+	return err ? ERR_PTR(err) : fb;
+}
+
+void fuse_passthrough_release(struct fuse_file *ff, struct fuse_backing *fb)
+{
+	pr_debug("%s: fb=0x%p, backing_file=0x%p\n", __func__,
+		 fb, ff->passthrough);
+
+	fput(ff->passthrough);
+	ff->passthrough = NULL;
+	put_cred(ff->cred);
+	ff->cred = NULL;
+}
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index c66a54d6c7d3..0377b6dc24c8 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -592,15 +592,11 @@ int fuse_readdir(struct file *file, struct dir_context *ctx)
 	if (fuse_is_bad(inode))
 		return -EIO;
 
-	mutex_lock(&ff->readdir.lock);
-
 	err = UNCACHED;
 	if (ff->open_flags & FOPEN_CACHE_DIR)
 		err = fuse_readdir_cached(file, ctx);
 	if (err == UNCACHED)
 		err = fuse_readdir_uncached(file, ctx);
 
-	mutex_unlock(&ff->readdir.lock);
-
 	return err;
 }
diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
index 5f1be1da92ce..322af827a232 100644
--- a/fs/fuse/virtio_fs.c
+++ b/fs/fuse/virtio_fs.c
@@ -16,6 +16,7 @@
 #include <linux/fs_context.h>
 #include <linux/fs_parser.h>
 #include <linux/highmem.h>
+#include <linux/cleanup.h>
 #include <linux/uio.h>
 #include "fuse_i.h"
 
@@ -31,6 +32,9 @@
 static DEFINE_MUTEX(virtio_fs_mutex);
 static LIST_HEAD(virtio_fs_instances);
 
+/* The /sys/fs/virtio_fs/ kset */
+static struct kset *virtio_fs_kset;
+
 enum {
 	VQ_HIPRIO,
 	VQ_REQUEST
@@ -55,7 +59,7 @@ struct virtio_fs_vq {
 
 /* A virtio-fs device instance */
 struct virtio_fs {
-	struct kref refcount;
+	struct kobject kobj;
 	struct list_head list;    /* on virtio_fs_instances */
 	char *tag;
 	struct virtio_fs_vq *vqs;
@@ -161,18 +165,40 @@ static inline void dec_in_flight_req(struct virtio_fs_vq *fsvq)
 		complete(&fsvq->in_flight_zero);
 }
 
-static void release_virtio_fs_obj(struct kref *ref)
+static ssize_t tag_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	struct virtio_fs *fs = container_of(kobj, struct virtio_fs, kobj);
+
+	return sysfs_emit(buf, fs->tag);
+}
+
+static struct kobj_attribute virtio_fs_tag_attr = __ATTR_RO(tag);
+
+static struct attribute *virtio_fs_attrs[] = {
+	&virtio_fs_tag_attr.attr,
+	NULL
+};
+ATTRIBUTE_GROUPS(virtio_fs);
+
+static void virtio_fs_ktype_release(struct kobject *kobj)
 {
-	struct virtio_fs *vfs = container_of(ref, struct virtio_fs, refcount);
+	struct virtio_fs *vfs = container_of(kobj, struct virtio_fs, kobj);
 
 	kfree(vfs->vqs);
 	kfree(vfs);
 }
 
+static const struct kobj_type virtio_fs_ktype = {
+	.release = virtio_fs_ktype_release,
+	.sysfs_ops = &kobj_sysfs_ops,
+	.default_groups = virtio_fs_groups,
+};
+
 /* Make sure virtiofs_mutex is held */
 static void virtio_fs_put(struct virtio_fs *fs)
 {
-	kref_put(&fs->refcount, release_virtio_fs_obj);
+	kobject_put(&fs->kobj);
 }
 
 static void virtio_fs_fiq_release(struct fuse_iqueue *fiq)
@@ -243,25 +269,46 @@ static void virtio_fs_start_all_queues(struct virtio_fs *fs)
 }
 
 /* Add a new instance to the list or return -EEXIST if tag name exists*/
-static int virtio_fs_add_instance(struct virtio_fs *fs)
+static int virtio_fs_add_instance(struct virtio_device *vdev,
+				  struct virtio_fs *fs)
 {
 	struct virtio_fs *fs2;
-	bool duplicate = false;
+	int ret;
 
 	mutex_lock(&virtio_fs_mutex);
 
 	list_for_each_entry(fs2, &virtio_fs_instances, list) {
-		if (strcmp(fs->tag, fs2->tag) == 0)
-			duplicate = true;
+		if (strcmp(fs->tag, fs2->tag) == 0) {
+			mutex_unlock(&virtio_fs_mutex);
+			return -EEXIST;
+		}
 	}
 
-	if (!duplicate)
-		list_add_tail(&fs->list, &virtio_fs_instances);
+	/* Use the virtio_device's index as a unique identifier, there is no
+	 * need to allocate our own identifiers because the virtio_fs instance
+	 * is only visible to userspace as long as the underlying virtio_device
+	 * exists.
+	 */
+	fs->kobj.kset = virtio_fs_kset;
+	ret = kobject_add(&fs->kobj, NULL, "%d", vdev->index);
+	if (ret < 0) {
+		mutex_unlock(&virtio_fs_mutex);
+		return ret;
+	}
+
+	ret = sysfs_create_link(&fs->kobj, &vdev->dev.kobj, "device");
+	if (ret < 0) {
+		kobject_del(&fs->kobj);
+		mutex_unlock(&virtio_fs_mutex);
+		return ret;
+	}
+
+	list_add_tail(&fs->list, &virtio_fs_instances);
 
 	mutex_unlock(&virtio_fs_mutex);
 
-	if (duplicate)
-		return -EEXIST;
+	kobject_uevent(&fs->kobj, KOBJ_ADD);
+
 	return 0;
 }
 
@@ -274,7 +321,7 @@ static struct virtio_fs *virtio_fs_find_instance(const char *tag)
 
 	list_for_each_entry(fs, &virtio_fs_instances, list) {
 		if (strcmp(fs->tag, tag) == 0) {
-			kref_get(&fs->refcount);
+			kobject_get(&fs->kobj);
 			goto found;
 		}
 	}
@@ -323,6 +370,16 @@ static int virtio_fs_read_tag(struct virtio_device *vdev, struct virtio_fs *fs)
 		return -ENOMEM;
 	memcpy(fs->tag, tag_buf, len);
 	fs->tag[len] = '\0';
+
+	/* While the VIRTIO specification allows any character, newlines are
+	 * awkward on mount(8) command-lines and cause problems in the sysfs
+	 * "tag" attr and uevent TAG= properties. Forbid them.
+	 */
+	if (strchr(fs->tag, '\n')) {
+		dev_dbg(&vdev->dev, "refusing virtiofs tag with newline character\n");
+		return -EINVAL;
+	}
+
 	return 0;
 }
 
@@ -345,7 +402,7 @@ static void virtio_fs_hiprio_done_work(struct work_struct *work)
 			kfree(req);
 			dec_in_flight_req(fsvq);
 		}
-	} while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq)));
+	} while (!virtqueue_enable_cb(vq));
 	spin_unlock(&fsvq->lock);
 }
 
@@ -627,7 +684,7 @@ static void virtio_fs_requests_done_work(struct work_struct *work)
 			list_move_tail(&req->list, &reqs);
 			spin_unlock(&fpq->lock);
 		}
-	} while (!virtqueue_enable_cb(vq) && likely(!virtqueue_is_broken(vq)));
+	} while (!virtqueue_enable_cb(vq));
 	spin_unlock(&fsvq->lock);
 
 	/* End requests */
@@ -795,8 +852,11 @@ static void virtio_fs_cleanup_dax(void *data)
 	put_dax(dax_dev);
 }
 
+DEFINE_FREE(cleanup_dax, struct dax_dev *, if (!IS_ERR_OR_NULL(_T)) virtio_fs_cleanup_dax(_T))
+
 static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
 {
+	struct dax_device *dax_dev __free(cleanup_dax) = NULL;
 	struct virtio_shm_region cache_reg;
 	struct dev_pagemap *pgmap;
 	bool have_cache;
@@ -804,6 +864,12 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
 	if (!IS_ENABLED(CONFIG_FUSE_DAX))
 		return 0;
 
+	dax_dev = alloc_dax(fs, &virtio_fs_dax_ops);
+	if (IS_ERR(dax_dev)) {
+		int rc = PTR_ERR(dax_dev);
+		return rc == -EOPNOTSUPP ? 0 : rc;
+	}
+
 	/* Get cache region */
 	have_cache = virtio_get_shm_region(vdev, &cache_reg,
 					   (u8)VIRTIO_FS_SHMCAP_ID_CACHE);
@@ -849,10 +915,7 @@ static int virtio_fs_setup_dax(struct virtio_device *vdev, struct virtio_fs *fs)
 	dev_dbg(&vdev->dev, "%s: window kaddr 0x%px phys_addr 0x%llx len 0x%llx\n",
 		__func__, fs->window_kaddr, cache_reg.addr, cache_reg.len);
 
-	fs->dax_dev = alloc_dax(fs, &virtio_fs_dax_ops);
-	if (IS_ERR(fs->dax_dev))
-		return PTR_ERR(fs->dax_dev);
-
+	fs->dax_dev = no_free_ptr(dax_dev);
 	return devm_add_action_or_reset(&vdev->dev, virtio_fs_cleanup_dax,
 					fs->dax_dev);
 }
@@ -865,7 +928,7 @@ static int virtio_fs_probe(struct virtio_device *vdev)
 	fs = kzalloc(sizeof(*fs), GFP_KERNEL);
 	if (!fs)
 		return -ENOMEM;
-	kref_init(&fs->refcount);
+	kobject_init(&fs->kobj, &virtio_fs_ktype);
 	vdev->priv = fs;
 
 	ret = virtio_fs_read_tag(vdev, fs);
@@ -887,7 +950,7 @@ static int virtio_fs_probe(struct virtio_device *vdev)
 	 */
 	virtio_device_ready(vdev);
 
-	ret = virtio_fs_add_instance(fs);
+	ret = virtio_fs_add_instance(vdev, fs);
 	if (ret < 0)
 		goto out_vqs;
 
@@ -896,11 +959,10 @@ static int virtio_fs_probe(struct virtio_device *vdev)
 out_vqs:
 	virtio_reset_device(vdev);
 	virtio_fs_cleanup_vqs(vdev);
-	kfree(fs->vqs);
 
 out:
 	vdev->priv = NULL;
-	kfree(fs);
+	kobject_put(&fs->kobj);
 	return ret;
 }
 
@@ -924,6 +986,8 @@ static void virtio_fs_remove(struct virtio_device *vdev)
 	mutex_lock(&virtio_fs_mutex);
 	/* This device is going away. No one should get new reference */
 	list_del_init(&fs->list);
+	sysfs_remove_link(&fs->kobj, "device");
+	kobject_del(&fs->kobj);
 	virtio_fs_stop_all_queues(fs);
 	virtio_fs_drain_all_queues_locked(fs);
 	virtio_reset_device(vdev);
@@ -1510,21 +1574,56 @@ static struct file_system_type virtio_fs_type = {
 	.kill_sb	= virtio_kill_sb,
 };
 
+static int virtio_fs_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
+{
+	const struct virtio_fs *fs = container_of(kobj, struct virtio_fs, kobj);
+
+	add_uevent_var(env, "TAG=%s", fs->tag);
+	return 0;
+}
+
+static const struct kset_uevent_ops virtio_fs_uevent_ops = {
+	.uevent = virtio_fs_uevent,
+};
+
+static int __init virtio_fs_sysfs_init(void)
+{
+	virtio_fs_kset = kset_create_and_add("virtiofs", &virtio_fs_uevent_ops,
+					     fs_kobj);
+	if (!virtio_fs_kset)
+		return -ENOMEM;
+	return 0;
+}
+
+static void virtio_fs_sysfs_exit(void)
+{
+	kset_unregister(virtio_fs_kset);
+	virtio_fs_kset = NULL;
+}
+
 static int __init virtio_fs_init(void)
 {
 	int ret;
 
-	ret = register_virtio_driver(&virtio_fs_driver);
+	ret = virtio_fs_sysfs_init();
 	if (ret < 0)
 		return ret;
 
+	ret = register_virtio_driver(&virtio_fs_driver);
+	if (ret < 0)
+		goto sysfs_exit;
+
 	ret = register_filesystem(&virtio_fs_type);
-	if (ret < 0) {
-		unregister_virtio_driver(&virtio_fs_driver);
-		return ret;
-	}
+	if (ret < 0)
+		goto unregister_virtio_driver;
 
 	return 0;
+
+unregister_virtio_driver:
+	unregister_virtio_driver(&virtio_fs_driver);
+sysfs_exit:
+	virtio_fs_sysfs_exit();
+	return ret;
 }
 module_init(virtio_fs_init);
 
@@ -1532,6 +1631,7 @@ static void __exit virtio_fs_exit(void)
 {
 	unregister_filesystem(&virtio_fs_type);
 	unregister_virtio_driver(&virtio_fs_driver);
+	virtio_fs_sysfs_exit();
 }
 module_exit(virtio_fs_exit);
 
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index d9ccfd27e4f1..aa1626955b2c 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1718,7 +1718,8 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
 	struct buffer_head *dibh, *bh;
 	struct gfs2_holder rd_gh;
 	unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift;
-	u64 lblock = (offset + (1 << bsize_shift) - 1) >> bsize_shift;
+	unsigned int bsize = 1 << bsize_shift;
+	u64 lblock = (offset + bsize - 1) >> bsize_shift;
 	__u16 start_list[GFS2_MAX_META_HEIGHT];
 	__u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL;
 	unsigned int start_aligned, end_aligned;
@@ -1729,7 +1730,7 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
 	u64 prev_bnr = 0;
 	__be64 *start, *end;
 
-	if (offset >= maxsize) {
+	if (offset + bsize - 1 >= maxsize) {
 		/*
 		 * The starting point lies beyond the allocated metadata;
 		 * there are no blocks to deallocate.
@@ -2465,7 +2466,7 @@ out:
 }
 
 static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode,
-		loff_t offset)
+		loff_t offset, unsigned int len)
 {
 	int ret;
 
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 992ca4effb50..4c42ada60ae7 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1440,10 +1440,10 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
 	struct gfs2_sbd *sdp = GFS2_SB(file->f_mapping->host);
 	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
 
-	if (!(fl->fl_flags & FL_POSIX))
+	if (!(fl->c.flc_flags & FL_POSIX))
 		return -ENOLCK;
 	if (gfs2_withdrawing_or_withdrawn(sdp)) {
-		if (fl->fl_type == F_UNLCK)
+		if (lock_is_unlock(fl))
 			locks_lock_file_wait(file, fl);
 		return -EIO;
 	}
@@ -1451,7 +1451,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
 		return dlm_posix_cancel(ls->ls_dlm, ip->i_no_addr, file, fl);
 	else if (IS_GETLK(cmd))
 		return dlm_posix_get(ls->ls_dlm, ip->i_no_addr, file, fl);
-	else if (fl->fl_type == F_UNLCK)
+	else if (lock_is_unlock(fl))
 		return dlm_posix_unlock(ls->ls_dlm, ip->i_no_addr, file, fl);
 	else
 		return dlm_posix_lock(ls->ls_dlm, ip->i_no_addr, file, cmd, fl);
@@ -1483,7 +1483,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
 	int error = 0;
 	int sleeptime;
 
-	state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
+	state = lock_is_write(fl) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
 	flags = GL_EXACT | GL_NOPID;
 	if (!IS_SETLKW(cmd))
 		flags |= LM_FLAG_TRY_1CB;
@@ -1495,8 +1495,8 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
 		if (fl_gh->gh_state == state)
 			goto out;
 		locks_init_lock(&request);
-		request.fl_type = F_UNLCK;
-		request.fl_flags = FL_FLOCK;
+		request.c.flc_type = F_UNLCK;
+		request.c.flc_flags = FL_FLOCK;
 		locks_lock_file_wait(file, &request);
 		gfs2_glock_dq(fl_gh);
 		gfs2_holder_reinit(state, flags, fl_gh);
@@ -1557,10 +1557,10 @@ static void do_unflock(struct file *file, struct file_lock *fl)
 
 static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
 {
-	if (!(fl->fl_flags & FL_FLOCK))
+	if (!(fl->c.flc_flags & FL_FLOCK))
 		return -ENOLCK;
 
-	if (fl->fl_type == F_UNLCK) {
+	if (lock_is_unlock(fl)) {
 		do_unflock(file, fl);
 		return 0;
 	} else {
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 79be0cdc730c..04cadc02e5a6 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -111,7 +111,6 @@ static int __init init_gfs2_fs(void)
 	gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
 					      sizeof(struct gfs2_inode),
 					      0,  SLAB_RECLAIM_ACCOUNT|
-						  SLAB_MEM_SPREAD|
 						  SLAB_ACCOUNT,
 					      gfs2_init_inode_once);
 	if (!gfs2_inode_cachep)
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 1281e60be639..572d58e86296 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -214,7 +214,7 @@ static void gfs2_sb_in(struct gfs2_sbd *sdp, const void *buf)
 
 	memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
 	memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
-	memcpy(&s->s_uuid, str->sb_uuid, 16);
+	super_set_uuid(s, str->sb_uuid, 16);
 }
 
 /**
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index b0cb70400996..ce9346099c72 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -30,7 +30,7 @@ struct hfsplus_wd {
  * @sector: block to read or write, for blocks of HFSPLUS_SECTOR_SIZE bytes
  * @buf: buffer for I/O
  * @data: output pointer for location of requested data
- * @opf: request op flags
+ * @opf: I/O operation type and flags
  *
  * The unit of I/O is hfsplus_min_io_size(sb), which may be bigger than
  * HFSPLUS_SECTOR_SIZE, and @buf must be sized accordingly. On reads
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 6b0ba3c1efba..314834a078e9 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -255,7 +255,7 @@ static int init_inodecache(void)
 	hpfs_inode_cachep = kmem_cache_create("hpfs_inode_cache",
 					     sizeof(struct hpfs_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+						SLAB_ACCOUNT),
 					     init_once);
 	if (hpfs_inode_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index d746866ae3b6..6502c7e776d1 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -933,7 +933,7 @@ static int hugetlbfs_setattr(struct mnt_idmap *idmap,
 	unsigned int ia_valid = attr->ia_valid;
 	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
 
-	error = setattr_prepare(&nop_mnt_idmap, dentry, attr);
+	error = setattr_prepare(idmap, dentry, attr);
 	if (error)
 		return error;
 
@@ -950,7 +950,7 @@ static int hugetlbfs_setattr(struct mnt_idmap *idmap,
 		hugetlb_vmtruncate(inode, newsize);
 	}
 
-	setattr_copy(&nop_mnt_idmap, inode, attr);
+	setattr_copy(idmap, inode, attr);
 	mark_inode_dirty(inode);
 	return 0;
 }
@@ -985,6 +985,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
 static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
 
 static struct inode *hugetlbfs_get_inode(struct super_block *sb,
+					struct mnt_idmap *idmap,
 					struct inode *dir,
 					umode_t mode, dev_t dev)
 {
@@ -1006,7 +1007,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
 		struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
 
 		inode->i_ino = get_next_ino();
-		inode_init_owner(&nop_mnt_idmap, inode, dir, mode);
+		inode_init_owner(idmap, inode, dir, mode);
 		lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
 				&hugetlbfs_i_mmap_rwsem_key);
 		inode->i_mapping->a_ops = &hugetlbfs_aops;
@@ -1050,7 +1051,7 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
 {
 	struct inode *inode;
 
-	inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
+	inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, dev);
 	if (!inode)
 		return -ENOSPC;
 	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
@@ -1062,7 +1063,7 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
 static int hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 			   struct dentry *dentry, umode_t mode)
 {
-	int retval = hugetlbfs_mknod(&nop_mnt_idmap, dir, dentry,
+	int retval = hugetlbfs_mknod(idmap, dir, dentry,
 				     mode | S_IFDIR, 0);
 	if (!retval)
 		inc_nlink(dir);
@@ -1073,7 +1074,7 @@ static int hugetlbfs_create(struct mnt_idmap *idmap,
 			    struct inode *dir, struct dentry *dentry,
 			    umode_t mode, bool excl)
 {
-	return hugetlbfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFREG, 0);
+	return hugetlbfs_mknod(idmap, dir, dentry, mode | S_IFREG, 0);
 }
 
 static int hugetlbfs_tmpfile(struct mnt_idmap *idmap,
@@ -1082,7 +1083,7 @@ static int hugetlbfs_tmpfile(struct mnt_idmap *idmap,
 {
 	struct inode *inode;
 
-	inode = hugetlbfs_get_inode(dir->i_sb, dir, mode | S_IFREG, 0);
+	inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode | S_IFREG, 0);
 	if (!inode)
 		return -ENOSPC;
 	inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
@@ -1094,10 +1095,11 @@ static int hugetlbfs_symlink(struct mnt_idmap *idmap,
 			     struct inode *dir, struct dentry *dentry,
 			     const char *symname)
 {
+	const umode_t mode = S_IFLNK|S_IRWXUGO;
 	struct inode *inode;
 	int error = -ENOSPC;
 
-	inode = hugetlbfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
+	inode = hugetlbfs_get_inode(dir->i_sb, idmap, dir, mode, 0);
 	if (inode) {
 		int l = strlen(symname)+1;
 		error = page_symlink(inode, symname, l);
@@ -1566,6 +1568,7 @@ static struct file_system_type hugetlbfs_fs_type = {
 	.init_fs_context	= hugetlbfs_init_fs_context,
 	.parameters		= hugetlb_fs_parameters,
 	.kill_sb		= kill_litter_super,
+	.fs_flags               = FS_ALLOW_IDMAP,
 };
 
 static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
@@ -1619,7 +1622,9 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
 	}
 
 	file = ERR_PTR(-ENOSPC);
-	inode = hugetlbfs_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0);
+	/* hugetlbfs_vfsmount[] mounts do not use idmapped mounts.  */
+	inode = hugetlbfs_get_inode(mnt->mnt_sb, &nop_mnt_idmap, NULL,
+				    S_IFREG | S_IRWXUGO, 0);
 	if (!inode)
 		goto out;
 	if (creat_flags == HUGETLB_SHMFS_INODE)
diff --git a/fs/inode.c b/fs/inode.c
index 91048c4c9c9e..3a41f83a4ba5 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -20,6 +20,7 @@
 #include <linux/ratelimit.h>
 #include <linux/list_lru.h>
 #include <linux/iversion.h>
+#include <linux/rw_hint.h>
 #include <trace/events/writeback.h>
 #include "internal.h"
 
@@ -588,7 +589,8 @@ void dump_mapping(const struct address_space *mapping)
 	}
 
 	dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias);
-	if (get_kernel_nofault(dentry, dentry_ptr)) {
+	if (get_kernel_nofault(dentry, dentry_ptr) ||
+	    !dentry.d_parent || !dentry.d_name.name) {
 		pr_warn("aops:%ps ino:%lx invalid dentry:%px\n",
 				a_ops, ino, dentry_ptr);
 		return;
@@ -2031,7 +2033,7 @@ static int __remove_privs(struct mnt_idmap *idmap,
 	return notify_change(idmap, dentry, &newattrs, NULL);
 }
 
-static int __file_remove_privs(struct file *file, unsigned int flags)
+int file_remove_privs_flags(struct file *file, unsigned int flags)
 {
 	struct dentry *dentry = file_dentry(file);
 	struct inode *inode = file_inode(file);
@@ -2056,6 +2058,7 @@ static int __file_remove_privs(struct file *file, unsigned int flags)
 		inode_has_no_xattr(inode);
 	return error;
 }
+EXPORT_SYMBOL_GPL(file_remove_privs_flags);
 
 /**
  * file_remove_privs - remove special file privileges (suid, capabilities)
@@ -2068,7 +2071,7 @@ static int __file_remove_privs(struct file *file, unsigned int flags)
  */
 int file_remove_privs(struct file *file)
 {
-	return __file_remove_privs(file, 0);
+	return file_remove_privs_flags(file, 0);
 }
 EXPORT_SYMBOL(file_remove_privs);
 
@@ -2161,7 +2164,7 @@ static int file_modified_flags(struct file *file, int flags)
 	 * Clear the security bits if the process is not being run by root.
 	 * This keeps people from modifying setuid and setgid binaries.
 	 */
-	ret = __file_remove_privs(file, flags);
+	ret = file_remove_privs_flags(file, flags);
 	if (ret)
 		return ret;
 
@@ -2285,7 +2288,7 @@ void __init inode_init(void)
 					 sizeof(struct inode),
 					 0,
 					 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
-					 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+					 SLAB_ACCOUNT),
 					 init_once);
 
 	/* Hash may have been set up in inode_init_early */
@@ -2509,7 +2512,7 @@ struct timespec64 inode_set_ctime_current(struct inode *inode)
 {
 	struct timespec64 now = current_time(inode);
 
-	inode_set_ctime(inode, now.tv_sec, now.tv_nsec);
+	inode_set_ctime_to_ts(inode, now);
 	return now;
 }
 EXPORT_SYMBOL(inode_set_ctime_current);
diff --git a/fs/internal.h b/fs/internal.h
index b67406435fc0..7ca738904e34 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -183,6 +183,7 @@ extern struct open_how build_open_how(int flags, umode_t mode);
 extern int build_open_flags(const struct open_how *how, struct open_flags *op);
 struct file *file_close_fd_locked(struct files_struct *files, unsigned fd);
 
+long do_ftruncate(struct file *file, loff_t length, int small);
 long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
 int chmod_common(const struct path *path, umode_t mode);
 int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
@@ -310,3 +311,10 @@ ssize_t __kernel_write_iter(struct file *file, struct iov_iter *from, loff_t *po
 struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns);
 struct mnt_idmap *mnt_idmap_get(struct mnt_idmap *idmap);
 void mnt_idmap_put(struct mnt_idmap *idmap);
+struct stashed_operations {
+	void (*put_data)(void *data);
+	int (*init_inode)(struct inode *inode, void *data);
+};
+int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
+		      struct path *path);
+void stashed_dentry_prune(struct dentry *dentry);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 76cf22ac97d7..1d5abfdf0f22 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -763,6 +763,33 @@ static int ioctl_fssetxattr(struct file *file, void __user *argp)
 	return err;
 }
 
+static int ioctl_getfsuuid(struct file *file, void __user *argp)
+{
+	struct super_block *sb = file_inode(file)->i_sb;
+	struct fsuuid2 u = { .len = sb->s_uuid_len, };
+
+	if (!sb->s_uuid_len)
+		return -ENOIOCTLCMD;
+
+	memcpy(&u.uuid[0], &sb->s_uuid, sb->s_uuid_len);
+
+	return copy_to_user(argp, &u, sizeof(u)) ? -EFAULT : 0;
+}
+
+static int ioctl_get_fs_sysfs_path(struct file *file, void __user *argp)
+{
+	struct super_block *sb = file_inode(file)->i_sb;
+
+	if (!strlen(sb->s_sysfs_name))
+		return -ENOIOCTLCMD;
+
+	struct fs_sysfs_path u = {};
+
+	u.len = scnprintf(u.name, sizeof(u.name), "%s/%s", sb->s_type->name, sb->s_sysfs_name);
+
+	return copy_to_user(argp, &u, sizeof(u)) ? -EFAULT : 0;
+}
+
 /*
  * do_vfs_ioctl() is not for drivers and not intended to be EXPORT_SYMBOL()'d.
  * It's just a simple helper for sys_ioctl and compat_sys_ioctl.
@@ -845,6 +872,12 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd,
 	case FS_IOC_FSSETXATTR:
 		return ioctl_fssetxattr(filp, argp);
 
+	case FS_IOC_GETFSUUID:
+		return ioctl_getfsuuid(filp, argp);
+
+	case FS_IOC_GETFSSYSFSPATH:
+		return ioctl_get_fs_sysfs_path(filp, argp);
+
 	default:
 		if (S_ISREG(inode->i_mode))
 			return file_ioctl(filp, cmd, argp);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 093c4515b22a..4e8e41c8b3c0 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 /*
  * Copyright (C) 2010 Red Hat, Inc.
- * Copyright (C) 2016-2019 Christoph Hellwig.
+ * Copyright (C) 2016-2023 Christoph Hellwig.
  */
 #include <linux/module.h>
 #include <linux/compiler.h>
@@ -95,6 +95,44 @@ static inline bool ifs_block_is_dirty(struct folio *folio,
 	return test_bit(block + blks_per_folio, ifs->state);
 }
 
+static unsigned ifs_find_dirty_range(struct folio *folio,
+		struct iomap_folio_state *ifs, u64 *range_start, u64 range_end)
+{
+	struct inode *inode = folio->mapping->host;
+	unsigned start_blk =
+		offset_in_folio(folio, *range_start) >> inode->i_blkbits;
+	unsigned end_blk = min_not_zero(
+		offset_in_folio(folio, range_end) >> inode->i_blkbits,
+		i_blocks_per_folio(inode, folio));
+	unsigned nblks = 1;
+
+	while (!ifs_block_is_dirty(folio, ifs, start_blk))
+		if (++start_blk == end_blk)
+			return 0;
+
+	while (start_blk + nblks < end_blk) {
+		if (!ifs_block_is_dirty(folio, ifs, start_blk + nblks))
+			break;
+		nblks++;
+	}
+
+	*range_start = folio_pos(folio) + (start_blk << inode->i_blkbits);
+	return nblks << inode->i_blkbits;
+}
+
+static unsigned iomap_find_dirty_range(struct folio *folio, u64 *range_start,
+		u64 range_end)
+{
+	struct iomap_folio_state *ifs = folio->private;
+
+	if (*range_start >= range_end)
+		return 0;
+
+	if (ifs)
+		return ifs_find_dirty_range(folio, ifs, range_start, range_end);
+	return range_end - *range_start;
+}
+
 static void ifs_clear_range_dirty(struct folio *folio,
 		struct iomap_folio_state *ifs, size_t off, size_t len)
 {
@@ -1454,15 +1492,10 @@ out_unlock:
 EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
 
 static void iomap_finish_folio_write(struct inode *inode, struct folio *folio,
-		size_t len, int error)
+		size_t len)
 {
 	struct iomap_folio_state *ifs = folio->private;
 
-	if (error) {
-		folio_set_error(folio);
-		mapping_set_error(inode->i_mapping, error);
-	}
-
 	WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs);
 	WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0);
 
@@ -1479,40 +1512,29 @@ static u32
 iomap_finish_ioend(struct iomap_ioend *ioend, int error)
 {
 	struct inode *inode = ioend->io_inode;
-	struct bio *bio = &ioend->io_inline_bio;
-	struct bio *last = ioend->io_bio, *next;
-	u64 start = bio->bi_iter.bi_sector;
-	loff_t offset = ioend->io_offset;
-	bool quiet = bio_flagged(bio, BIO_QUIET);
+	struct bio *bio = &ioend->io_bio;
+	struct folio_iter fi;
 	u32 folio_count = 0;
 
-	for (bio = &ioend->io_inline_bio; bio; bio = next) {
-		struct folio_iter fi;
-
-		/*
-		 * For the last bio, bi_private points to the ioend, so we
-		 * need to explicitly end the iteration here.
-		 */
-		if (bio == last)
-			next = NULL;
-		else
-			next = bio->bi_private;
-
-		/* walk all folios in bio, ending page IO on them */
-		bio_for_each_folio_all(fi, bio) {
-			iomap_finish_folio_write(inode, fi.folio, fi.length,
-					error);
-			folio_count++;
+	if (error) {
+		mapping_set_error(inode->i_mapping, error);
+		if (!bio_flagged(bio, BIO_QUIET)) {
+			pr_err_ratelimited(
+"%s: writeback error on inode %lu, offset %lld, sector %llu",
+				inode->i_sb->s_id, inode->i_ino,
+				ioend->io_offset, ioend->io_sector);
 		}
-		bio_put(bio);
 	}
-	/* The ioend has been freed by bio_put() */
 
-	if (unlikely(error && !quiet)) {
-		printk_ratelimited(KERN_ERR
-"%s: writeback error on inode %lu, offset %lld, sector %llu",
-			inode->i_sb->s_id, inode->i_ino, offset, start);
+	/* walk all folios in bio, ending page IO on them */
+	bio_for_each_folio_all(fi, bio) {
+		if (error)
+			folio_set_error(fi.folio);
+		iomap_finish_folio_write(inode, fi.folio, fi.length);
+		folio_count++;
 	}
+
+	bio_put(bio);	/* frees the ioend */
 	return folio_count;
 }
 
@@ -1553,7 +1575,7 @@ EXPORT_SYMBOL_GPL(iomap_finish_ioends);
 static bool
 iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next)
 {
-	if (ioend->io_bio->bi_status != next->io_bio->bi_status)
+	if (ioend->io_bio.bi_status != next->io_bio.bi_status)
 		return false;
 	if ((ioend->io_flags & IOMAP_F_SHARED) ^
 	    (next->io_flags & IOMAP_F_SHARED))
@@ -1618,47 +1640,46 @@ EXPORT_SYMBOL_GPL(iomap_sort_ioends);
 
 static void iomap_writepage_end_bio(struct bio *bio)
 {
-	struct iomap_ioend *ioend = bio->bi_private;
-
-	iomap_finish_ioend(ioend, blk_status_to_errno(bio->bi_status));
+	iomap_finish_ioend(iomap_ioend_from_bio(bio),
+			blk_status_to_errno(bio->bi_status));
 }
 
 /*
  * Submit the final bio for an ioend.
  *
  * If @error is non-zero, it means that we have a situation where some part of
- * the submission process has failed after we've marked pages for writeback
- * and unlocked them.  In this situation, we need to fail the bio instead of
- * submitting it.  This typically only happens on a filesystem shutdown.
+ * the submission process has failed after we've marked pages for writeback.
+ * We cannot cancel ioend directly in that case, so call the bio end I/O handler
+ * with the error status here to run the normal I/O completion handler to clear
+ * the writeback bit and let the file system proess the errors.
  */
-static int
-iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend,
-		int error)
+static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error)
 {
-	ioend->io_bio->bi_private = ioend;
-	ioend->io_bio->bi_end_io = iomap_writepage_end_bio;
+	if (!wpc->ioend)
+		return error;
 
+	/*
+	 * Let the file systems prepare the I/O submission and hook in an I/O
+	 * comletion handler.  This also needs to happen in case after a
+	 * failure happened so that the file system end I/O handler gets called
+	 * to clean up.
+	 */
 	if (wpc->ops->prepare_ioend)
-		error = wpc->ops->prepare_ioend(ioend, error);
+		error = wpc->ops->prepare_ioend(wpc->ioend, error);
+
 	if (error) {
-		/*
-		 * If we're failing the IO now, just mark the ioend with an
-		 * error and finish it.  This will run IO completion immediately
-		 * as there is only one reference to the ioend at this point in
-		 * time.
-		 */
-		ioend->io_bio->bi_status = errno_to_blk_status(error);
-		bio_endio(ioend->io_bio);
-		return error;
+		wpc->ioend->io_bio.bi_status = errno_to_blk_status(error);
+		bio_endio(&wpc->ioend->io_bio);
+	} else {
+		submit_bio(&wpc->ioend->io_bio);
 	}
 
-	submit_bio(ioend->io_bio);
-	return 0;
+	wpc->ioend = NULL;
+	return error;
 }
 
-static struct iomap_ioend *
-iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
-		loff_t offset, sector_t sector, struct writeback_control *wbc)
+static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc,
+		struct writeback_control *wbc, struct inode *inode, loff_t pos)
 {
 	struct iomap_ioend *ioend;
 	struct bio *bio;
@@ -1666,63 +1687,42 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc,
 	bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS,
 			       REQ_OP_WRITE | wbc_to_write_flags(wbc),
 			       GFP_NOFS, &iomap_ioend_bioset);
-	bio->bi_iter.bi_sector = sector;
+	bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos);
+	bio->bi_end_io = iomap_writepage_end_bio;
 	wbc_init_bio(wbc, bio);
+	bio->bi_write_hint = inode->i_write_hint;
 
-	ioend = container_of(bio, struct iomap_ioend, io_inline_bio);
+	ioend = iomap_ioend_from_bio(bio);
 	INIT_LIST_HEAD(&ioend->io_list);
 	ioend->io_type = wpc->iomap.type;
 	ioend->io_flags = wpc->iomap.flags;
 	ioend->io_inode = inode;
 	ioend->io_size = 0;
-	ioend->io_folios = 0;
-	ioend->io_offset = offset;
-	ioend->io_bio = bio;
-	ioend->io_sector = sector;
-	return ioend;
-}
-
-/*
- * Allocate a new bio, and chain the old bio to the new one.
- *
- * Note that we have to perform the chaining in this unintuitive order
- * so that the bi_private linkage is set up in the right direction for the
- * traversal in iomap_finish_ioend().
- */
-static struct bio *
-iomap_chain_bio(struct bio *prev)
-{
-	struct bio *new;
-
-	new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS);
-	bio_clone_blkg_association(new, prev);
-	new->bi_iter.bi_sector = bio_end_sector(prev);
+	ioend->io_offset = pos;
+	ioend->io_sector = bio->bi_iter.bi_sector;
 
-	bio_chain(prev, new);
-	bio_get(prev);		/* for iomap_finish_ioend */
-	submit_bio(prev);
-	return new;
+	wpc->nr_folios = 0;
+	return ioend;
 }
 
-static bool
-iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
-		sector_t sector)
+static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos)
 {
 	if ((wpc->iomap.flags & IOMAP_F_SHARED) !=
 	    (wpc->ioend->io_flags & IOMAP_F_SHARED))
 		return false;
 	if (wpc->iomap.type != wpc->ioend->io_type)
 		return false;
-	if (offset != wpc->ioend->io_offset + wpc->ioend->io_size)
+	if (pos != wpc->ioend->io_offset + wpc->ioend->io_size)
 		return false;
-	if (sector != bio_end_sector(wpc->ioend->io_bio))
+	if (iomap_sector(&wpc->iomap, pos) !=
+	    bio_end_sector(&wpc->ioend->io_bio))
 		return false;
 	/*
 	 * Limit ioend bio chain lengths to minimise IO completion latency. This
 	 * also prevents long tight loops ending page writeback on all the
 	 * folios in the ioend.
 	 */
-	if (wpc->ioend->io_folios >= IOEND_BATCH_SIZE)
+	if (wpc->nr_folios >= IOEND_BATCH_SIZE)
 		return false;
 	return true;
 }
@@ -1730,255 +1730,238 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset,
 /*
  * Test to see if we have an existing ioend structure that we could append to
  * first; otherwise finish off the current ioend and start another.
+ *
+ * If a new ioend is created and cached, the old ioend is submitted to the block
+ * layer instantly.  Batching optimisations are provided by higher level block
+ * plugging.
+ *
+ * At the end of a writeback pass, there will be a cached ioend remaining on the
+ * writepage context that the caller will need to submit.
  */
-static void
-iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio,
-		struct iomap_folio_state *ifs, struct iomap_writepage_ctx *wpc,
-		struct writeback_control *wbc, struct list_head *iolist)
+static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc,
+		struct writeback_control *wbc, struct folio *folio,
+		struct inode *inode, loff_t pos, unsigned len)
 {
-	sector_t sector = iomap_sector(&wpc->iomap, pos);
-	unsigned len = i_blocksize(inode);
+	struct iomap_folio_state *ifs = folio->private;
 	size_t poff = offset_in_folio(folio, pos);
+	int error;
 
-	if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, sector)) {
-		if (wpc->ioend)
-			list_add(&wpc->ioend->io_list, iolist);
-		wpc->ioend = iomap_alloc_ioend(inode, wpc, pos, sector, wbc);
+	if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) {
+new_ioend:
+		error = iomap_submit_ioend(wpc, 0);
+		if (error)
+			return error;
+		wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos);
 	}
 
-	if (!bio_add_folio(wpc->ioend->io_bio, folio, len, poff)) {
-		wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio);
-		bio_add_folio_nofail(wpc->ioend->io_bio, folio, len, poff);
-	}
+	if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff))
+		goto new_ioend;
 
 	if (ifs)
 		atomic_add(len, &ifs->write_bytes_pending);
 	wpc->ioend->io_size += len;
 	wbc_account_cgroup_owner(wbc, &folio->page, len);
+	return 0;
 }
 
-/*
- * We implement an immediate ioend submission policy here to avoid needing to
- * chain multiple ioends and hence nest mempool allocations which can violate
- * the forward progress guarantees we need to provide. The current ioend we're
- * adding blocks to is cached in the writepage context, and if the new block
- * doesn't append to the cached ioend, it will create a new ioend and cache that
- * instead.
- *
- * If a new ioend is created and cached, the old ioend is returned and queued
- * locally for submission once the entire page is processed or an error has been
- * detected.  While ioends are submitted immediately after they are completed,
- * batching optimisations are provided by higher level block plugging.
- *
- * At the end of a writeback pass, there will be a cached ioend remaining on the
- * writepage context that the caller will need to submit.
- */
-static int
-iomap_writepage_map(struct iomap_writepage_ctx *wpc,
-		struct writeback_control *wbc, struct inode *inode,
-		struct folio *folio, u64 end_pos)
+static int iomap_writepage_map_blocks(struct iomap_writepage_ctx *wpc,
+		struct writeback_control *wbc, struct folio *folio,
+		struct inode *inode, u64 pos, unsigned dirty_len,
+		unsigned *count)
 {
-	struct iomap_folio_state *ifs = folio->private;
-	struct iomap_ioend *ioend, *next;
-	unsigned len = i_blocksize(inode);
-	unsigned nblocks = i_blocks_per_folio(inode, folio);
-	u64 pos = folio_pos(folio);
-	int error = 0, count = 0, i;
-	LIST_HEAD(submit_list);
-
-	WARN_ON_ONCE(end_pos <= pos);
-
-	if (!ifs && nblocks > 1) {
-		ifs = ifs_alloc(inode, folio, 0);
-		iomap_set_range_dirty(folio, 0, end_pos - pos);
-	}
+	int error;
 
-	WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0);
-
-	/*
-	 * Walk through the folio to find areas to write back. If we
-	 * run off the end of the current map or find the current map
-	 * invalid, grab a new one.
-	 */
-	for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) {
-		if (ifs && !ifs_block_is_dirty(folio, ifs, i))
-			continue;
+	do {
+		unsigned map_len;
 
-		error = wpc->ops->map_blocks(wpc, inode, pos);
+		error = wpc->ops->map_blocks(wpc, inode, pos, dirty_len);
 		if (error)
 			break;
-		trace_iomap_writepage_map(inode, &wpc->iomap);
-		if (WARN_ON_ONCE(wpc->iomap.type == IOMAP_INLINE))
-			continue;
-		if (wpc->iomap.type == IOMAP_HOLE)
-			continue;
-		iomap_add_to_ioend(inode, pos, folio, ifs, wpc, wbc,
-				 &submit_list);
-		count++;
-	}
-	if (count)
-		wpc->ioend->io_folios++;
+		trace_iomap_writepage_map(inode, pos, dirty_len, &wpc->iomap);
 
-	WARN_ON_ONCE(!wpc->ioend && !list_empty(&submit_list));
-	WARN_ON_ONCE(!folio_test_locked(folio));
-	WARN_ON_ONCE(folio_test_writeback(folio));
-	WARN_ON_ONCE(folio_test_dirty(folio));
+		map_len = min_t(u64, dirty_len,
+			wpc->iomap.offset + wpc->iomap.length - pos);
+		WARN_ON_ONCE(!folio->private && map_len < dirty_len);
+
+		switch (wpc->iomap.type) {
+		case IOMAP_INLINE:
+			WARN_ON_ONCE(1);
+			error = -EIO;
+			break;
+		case IOMAP_HOLE:
+			break;
+		default:
+			error = iomap_add_to_ioend(wpc, wbc, folio, inode, pos,
+					map_len);
+			if (!error)
+				(*count)++;
+			break;
+		}
+		dirty_len -= map_len;
+		pos += map_len;
+	} while (dirty_len && !error);
 
 	/*
 	 * We cannot cancel the ioend directly here on error.  We may have
 	 * already set other pages under writeback and hence we have to run I/O
 	 * completion to mark the error state of the pages under writeback
 	 * appropriately.
+	 *
+	 * Just let the file system know what portion of the folio failed to
+	 * map.
 	 */
-	if (unlikely(error)) {
-		/*
-		 * Let the filesystem know what portion of the current page
-		 * failed to map. If the page hasn't been added to ioend, it
-		 * won't be affected by I/O completion and we must unlock it
-		 * now.
-		 */
-		if (wpc->ops->discard_folio)
-			wpc->ops->discard_folio(folio, pos);
-		if (!count) {
-			folio_unlock(folio);
-			goto done;
-		}
-	}
-
-	/*
-	 * We can have dirty bits set past end of file in page_mkwrite path
-	 * while mapping the last partial folio. Hence it's better to clear
-	 * all the dirty bits in the folio here.
-	 */
-	iomap_clear_range_dirty(folio, 0, folio_size(folio));
-	folio_start_writeback(folio);
-	folio_unlock(folio);
-
-	/*
-	 * Preserve the original error if there was one; catch
-	 * submission errors here and propagate into subsequent ioend
-	 * submissions.
-	 */
-	list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
-		int error2;
-
-		list_del_init(&ioend->io_list);
-		error2 = iomap_submit_ioend(wpc, ioend, error);
-		if (error2 && !error)
-			error = error2;
-	}
-
-	/*
-	 * We can end up here with no error and nothing to write only if we race
-	 * with a partial page truncate on a sub-page block sized filesystem.
-	 */
-	if (!count)
-		folio_end_writeback(folio);
-done:
-	mapping_set_error(inode->i_mapping, error);
+	if (error && wpc->ops->discard_folio)
+		wpc->ops->discard_folio(folio, pos);
 	return error;
 }
 
 /*
- * Write out a dirty page.
+ * Check interaction of the folio with the file end.
  *
- * For delalloc space on the page, we need to allocate space and flush it.
- * For unwritten space on the page, we need to start the conversion to
- * regular allocated space.
+ * If the folio is entirely beyond i_size, return false.  If it straddles
+ * i_size, adjust end_pos and zero all data beyond i_size.
  */
-static int iomap_do_writepage(struct folio *folio,
-		struct writeback_control *wbc, void *data)
+static bool iomap_writepage_handle_eof(struct folio *folio, struct inode *inode,
+		u64 *end_pos)
 {
-	struct iomap_writepage_ctx *wpc = data;
-	struct inode *inode = folio->mapping->host;
-	u64 end_pos, isize;
-
-	trace_iomap_writepage(inode, folio_pos(folio), folio_size(folio));
+	u64 isize = i_size_read(inode);
 
-	/*
-	 * Refuse to write the folio out if we're called from reclaim context.
-	 *
-	 * This avoids stack overflows when called from deeply used stacks in
-	 * random callers for direct reclaim or memcg reclaim.  We explicitly
-	 * allow reclaim from kswapd as the stack usage there is relatively low.
-	 *
-	 * This should never happen except in the case of a VM regression so
-	 * warn about it.
-	 */
-	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
-			PF_MEMALLOC))
-		goto redirty;
-
-	/*
-	 * Is this folio beyond the end of the file?
-	 *
-	 * The folio index is less than the end_index, adjust the end_pos
-	 * to the highest offset that this folio should represent.
-	 * -----------------------------------------------------
-	 * |			file mapping	       | <EOF> |
-	 * -----------------------------------------------------
-	 * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
-	 * ^--------------------------------^----------|--------
-	 * |     desired writeback range    |      see else    |
-	 * ---------------------------------^------------------|
-	 */
-	isize = i_size_read(inode);
-	end_pos = folio_pos(folio) + folio_size(folio);
-	if (end_pos > isize) {
-		/*
-		 * Check whether the page to write out is beyond or straddles
-		 * i_size or not.
-		 * -------------------------------------------------------
-		 * |		file mapping		        | <EOF>  |
-		 * -------------------------------------------------------
-		 * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
-		 * ^--------------------------------^-----------|---------
-		 * |				    |      Straddles     |
-		 * ---------------------------------^-----------|--------|
-		 */
+	if (*end_pos > isize) {
 		size_t poff = offset_in_folio(folio, isize);
 		pgoff_t end_index = isize >> PAGE_SHIFT;
 
 		/*
-		 * Skip the page if it's fully outside i_size, e.g.
-		 * due to a truncate operation that's in progress.  We've
-		 * cleaned this page and truncate will finish things off for
-		 * us.
+		 * If the folio is entirely ouside of i_size, skip it.
+		 *
+		 * This can happen due to a truncate operation that is in
+		 * progress and in that case truncate will finish it off once
+		 * we've dropped the folio lock.
 		 *
-		 * Note that the end_index is unsigned long.  If the given
-		 * offset is greater than 16TB on a 32-bit system then if we
-		 * checked if the page is fully outside i_size with
-		 * "if (page->index >= end_index + 1)", "end_index + 1" would
-		 * overflow and evaluate to 0.  Hence this page would be
+		 * Note that the pgoff_t used for end_index is an unsigned long.
+		 * If the given offset is greater than 16TB on a 32-bit system,
+		 * then if we checked if the folio is fully outside i_size with
+		 * "if (folio->index >= end_index + 1)", "end_index + 1" would
+		 * overflow and evaluate to 0.  Hence this folio would be
 		 * redirtied and written out repeatedly, which would result in
 		 * an infinite loop; the user program performing this operation
 		 * would hang.  Instead, we can detect this situation by
-		 * checking if the page is totally beyond i_size or if its
+		 * checking if the folio is totally beyond i_size or if its
 		 * offset is just equal to the EOF.
 		 */
 		if (folio->index > end_index ||
 		    (folio->index == end_index && poff == 0))
-			goto unlock;
+			return false;
 
 		/*
-		 * The page straddles i_size.  It must be zeroed out on each
-		 * and every writepage invocation because it may be mmapped.
-		 * "A file is mapped in multiples of the page size.  For a file
-		 * that is not a multiple of the page size, the remaining
-		 * memory is zeroed when mapped, and writes to that region are
-		 * not written out to the file."
+		 * The folio straddles i_size.
+		 *
+		 * It must be zeroed out on each and every writepage invocation
+		 * because it may be mmapped:
+		 *
+		 *    A file is mapped in multiples of the page size.  For a
+		 *    file that is not a multiple of the page size, the
+		 *    remaining memory is zeroed when mapped, and writes to that
+		 *    region are not written out to the file.
+		 *
+		 * Also adjust the writeback range to skip all blocks entirely
+		 * beyond i_size.
 		 */
 		folio_zero_segment(folio, poff, folio_size(folio));
-		end_pos = isize;
+		*end_pos = round_up(isize, i_blocksize(inode));
+	}
+
+	return true;
+}
+
+static int iomap_writepage_map(struct iomap_writepage_ctx *wpc,
+		struct writeback_control *wbc, struct folio *folio)
+{
+	struct iomap_folio_state *ifs = folio->private;
+	struct inode *inode = folio->mapping->host;
+	u64 pos = folio_pos(folio);
+	u64 end_pos = pos + folio_size(folio);
+	unsigned count = 0;
+	int error = 0;
+	u32 rlen;
+
+	WARN_ON_ONCE(!folio_test_locked(folio));
+	WARN_ON_ONCE(folio_test_dirty(folio));
+	WARN_ON_ONCE(folio_test_writeback(folio));
+
+	trace_iomap_writepage(inode, pos, folio_size(folio));
+
+	if (!iomap_writepage_handle_eof(folio, inode, &end_pos)) {
+		folio_unlock(folio);
+		return 0;
+	}
+	WARN_ON_ONCE(end_pos <= pos);
+
+	if (i_blocks_per_folio(inode, folio) > 1) {
+		if (!ifs) {
+			ifs = ifs_alloc(inode, folio, 0);
+			iomap_set_range_dirty(folio, 0, end_pos - pos);
+		}
+
+		/*
+		 * Keep the I/O completion handler from clearing the writeback
+		 * bit until we have submitted all blocks by adding a bias to
+		 * ifs->write_bytes_pending, which is dropped after submitting
+		 * all blocks.
+		 */
+		WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending) != 0);
+		atomic_inc(&ifs->write_bytes_pending);
 	}
 
-	return iomap_writepage_map(wpc, wbc, inode, folio, end_pos);
+	/*
+	 * Set the writeback bit ASAP, as the I/O completion for the single
+	 * block per folio case happen hit as soon as we're submitting the bio.
+	 */
+	folio_start_writeback(folio);
 
-redirty:
-	folio_redirty_for_writepage(wbc, folio);
-unlock:
+	/*
+	 * Walk through the folio to find dirty areas to write back.
+	 */
+	while ((rlen = iomap_find_dirty_range(folio, &pos, end_pos))) {
+		error = iomap_writepage_map_blocks(wpc, wbc, folio, inode,
+				pos, rlen, &count);
+		if (error)
+			break;
+		pos += rlen;
+	}
+
+	if (count)
+		wpc->nr_folios++;
+
+	/*
+	 * We can have dirty bits set past end of file in page_mkwrite path
+	 * while mapping the last partial folio. Hence it's better to clear
+	 * all the dirty bits in the folio here.
+	 */
+	iomap_clear_range_dirty(folio, 0, folio_size(folio));
+
+	/*
+	 * Usually the writeback bit is cleared by the I/O completion handler.
+	 * But we may end up either not actually writing any blocks, or (when
+	 * there are multiple blocks in a folio) all I/O might have finished
+	 * already at this point.  In that case we need to clear the writeback
+	 * bit ourselves right after unlocking the page.
+	 */
 	folio_unlock(folio);
-	return 0;
+	if (ifs) {
+		if (atomic_dec_and_test(&ifs->write_bytes_pending))
+			folio_end_writeback(folio);
+	} else {
+		if (!count)
+			folio_end_writeback(folio);
+	}
+	mapping_set_error(inode->i_mapping, error);
+	return error;
+}
+
+static int iomap_do_writepage(struct folio *folio,
+		struct writeback_control *wbc, void *data)
+{
+	return iomap_writepage_map(data, wbc, folio);
 }
 
 int
@@ -1988,18 +1971,24 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc,
 {
 	int			ret;
 
+	/*
+	 * Writeback from reclaim context should never happen except in the case
+	 * of a VM regression so warn about it and refuse to write the data.
+	 */
+	if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC | PF_KSWAPD)) ==
+			PF_MEMALLOC))
+		return -EIO;
+
 	wpc->ops = ops;
 	ret = write_cache_pages(mapping, wbc, iomap_do_writepage, wpc);
-	if (!wpc->ioend)
-		return ret;
-	return iomap_submit_ioend(wpc, wpc->ioend, ret);
+	return iomap_submit_ioend(wpc, ret);
 }
 EXPORT_SYMBOL_GPL(iomap_writepages);
 
 static int __init iomap_init(void)
 {
 	return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE),
-			   offsetof(struct iomap_ioend, io_inline_bio),
+			   offsetof(struct iomap_ioend, io_bio),
 			   BIOSET_NEED_BVECS);
 }
 fs_initcall(iomap_init);
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index bcd3f8cf5ea4..f3b43d223a46 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -380,6 +380,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
 		fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
 					  GFP_KERNEL);
 		bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
+		bio->bi_write_hint = inode->i_write_hint;
 		bio->bi_ioprio = dio->iocb->ki_ioprio;
 		bio->bi_private = dio;
 		bio->bi_end_io = iomap_dio_bio_end_io;
diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h
index c16fd55f5595..0a991c4ce87d 100644
--- a/fs/iomap/trace.h
+++ b/fs/iomap/trace.h
@@ -154,7 +154,48 @@ DEFINE_EVENT(iomap_class, name,	\
 	TP_ARGS(inode, iomap))
 DEFINE_IOMAP_EVENT(iomap_iter_dstmap);
 DEFINE_IOMAP_EVENT(iomap_iter_srcmap);
-DEFINE_IOMAP_EVENT(iomap_writepage_map);
+
+TRACE_EVENT(iomap_writepage_map,
+	TP_PROTO(struct inode *inode, u64 pos, unsigned int dirty_len,
+		 struct iomap *iomap),
+	TP_ARGS(inode, pos, dirty_len, iomap),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(u64, ino)
+		__field(u64, pos)
+		__field(u64, dirty_len)
+		__field(u64, addr)
+		__field(loff_t, offset)
+		__field(u64, length)
+		__field(u16, type)
+		__field(u16, flags)
+		__field(dev_t, bdev)
+	),
+	TP_fast_assign(
+		__entry->dev = inode->i_sb->s_dev;
+		__entry->ino = inode->i_ino;
+		__entry->pos = pos;
+		__entry->dirty_len = dirty_len;
+		__entry->addr = iomap->addr;
+		__entry->offset = iomap->offset;
+		__entry->length = iomap->length;
+		__entry->type = iomap->type;
+		__entry->flags = iomap->flags;
+		__entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0;
+	),
+	TP_printk("dev %d:%d ino 0x%llx bdev %d:%d pos 0x%llx dirty len 0x%llx "
+		  "addr 0x%llx offset 0x%llx length 0x%llx type %s flags %s",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  MAJOR(__entry->bdev), MINOR(__entry->bdev),
+		  __entry->pos,
+		  __entry->dirty_len,
+		  __entry->addr,
+		  __entry->offset,
+		  __entry->length,
+		  __print_symbolic(__entry->type, IOMAP_TYPE_STRINGS),
+		  __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS))
+);
 
 TRACE_EVENT(iomap_iter,
 	TP_PROTO(struct iomap_iter *iter, const void *ops,
@@ -165,6 +206,7 @@ TRACE_EVENT(iomap_iter,
 		__field(u64, ino)
 		__field(loff_t, pos)
 		__field(u64, length)
+		__field(s64, processed)
 		__field(unsigned int, flags)
 		__field(const void *, ops)
 		__field(unsigned long, caller)
@@ -174,15 +216,17 @@ TRACE_EVENT(iomap_iter,
 		__entry->ino = iter->inode->i_ino;
 		__entry->pos = iter->pos;
 		__entry->length = iomap_length(iter);
+		__entry->processed = iter->processed;
 		__entry->flags = iter->flags;
 		__entry->ops = ops;
 		__entry->caller = caller;
 	),
-	TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx flags %s (0x%x) ops %ps caller %pS",
+	TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx processed %lld flags %s (0x%x) ops %ps caller %pS",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		   __entry->ino,
 		   __entry->pos,
 		   __entry->length,
+		   __entry->processed,
 		   __print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS),
 		   __entry->flags,
 		   __entry->ops,
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 3e4d53e26f94..2a616a9f289d 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -93,7 +93,7 @@ static int __init init_inodecache(void)
 	isofs_inode_cachep = kmem_cache_create("isofs_inode_cache",
 					sizeof(struct iso_inode_info),
 					0, (SLAB_RECLAIM_ACCOUNT|
-					SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+					SLAB_ACCOUNT),
 					init_once);
 	if (!isofs_inode_cachep)
 		return -ENOMEM;
@@ -908,8 +908,22 @@ root_found:
 	 * we then decide whether to use the Joliet descriptor.
 	 */
 	inode = isofs_iget(s, sbi->s_firstdatazone, 0);
-	if (IS_ERR(inode))
-		goto out_no_root;
+
+	/*
+	 * Fix for broken CDs with a corrupt root inode but a correct Joliet
+	 * root directory.
+	 */
+	if (IS_ERR(inode)) {
+		if (joliet_level && sbi->s_firstdatazone != first_data_zone) {
+			printk(KERN_NOTICE
+			       "ISOFS: root inode is unusable. "
+			       "Disabling Rock Ridge and switching to Joliet.");
+			sbi->s_rock = 0;
+			inode = NULL;
+		} else {
+			goto out_no_root;
+		}
+	}
 
 	/*
 	 * Fix for broken CDs with Rock Ridge and empty ISO root directory but
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index f99591a634b4..aede1be4dc0c 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -387,7 +387,7 @@ static int __init init_jffs2_fs(void)
 	jffs2_inode_cachep = kmem_cache_create("jffs2_i",
 					     sizeof(struct jffs2_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+						SLAB_ACCOUNT),
 					     jffs2_i_init_once);
 	if (!jffs2_inode_cachep) {
 		pr_err("error: Failed to initialise inode cache\n");
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
index dd4264aa9bed..10934f9a11be 100644
--- a/fs/jfs/jfs_incore.h
+++ b/fs/jfs/jfs_incore.h
@@ -92,7 +92,7 @@ struct jfs_inode_info {
 		} link;
 	} u;
 #ifdef CONFIG_QUOTA
-	struct dquot *i_dquot[MAXQUOTAS];
+	struct dquot __rcu *i_dquot[MAXQUOTAS];
 #endif
 	u32 dev;	/* will die when we get wide dev_t */
 	struct inode	vfs_inode;
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index cb6d1fda66a7..9609349e92e5 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1058,7 +1058,7 @@ void jfs_syncpt(struct jfs_log *log, int hard_sync)
 int lmLogOpen(struct super_block *sb)
 {
 	int rc;
-	struct bdev_handle *bdev_handle;
+	struct file *bdev_file;
 	struct jfs_log *log;
 	struct jfs_sb_info *sbi = JFS_SBI(sb);
 
@@ -1070,7 +1070,7 @@ int lmLogOpen(struct super_block *sb)
 
 	mutex_lock(&jfs_log_mutex);
 	list_for_each_entry(log, &jfs_external_logs, journal_list) {
-		if (log->bdev_handle->bdev->bd_dev == sbi->logdev) {
+		if (file_bdev(log->bdev_file)->bd_dev == sbi->logdev) {
 			if (!uuid_equal(&log->uuid, &sbi->loguuid)) {
 				jfs_warn("wrong uuid on JFS journal");
 				mutex_unlock(&jfs_log_mutex);
@@ -1100,14 +1100,14 @@ int lmLogOpen(struct super_block *sb)
 	 * file systems to log may have n-to-1 relationship;
 	 */
 
-	bdev_handle = bdev_open_by_dev(sbi->logdev,
+	bdev_file = bdev_file_open_by_dev(sbi->logdev,
 			BLK_OPEN_READ | BLK_OPEN_WRITE, log, NULL);
-	if (IS_ERR(bdev_handle)) {
-		rc = PTR_ERR(bdev_handle);
+	if (IS_ERR(bdev_file)) {
+		rc = PTR_ERR(bdev_file);
 		goto free;
 	}
 
-	log->bdev_handle = bdev_handle;
+	log->bdev_file = bdev_file;
 	uuid_copy(&log->uuid, &sbi->loguuid);
 
 	/*
@@ -1141,7 +1141,7 @@ journal_found:
 	lbmLogShutdown(log);
 
       close:		/* close external log device */
-	bdev_release(bdev_handle);
+	bdev_fput(bdev_file);
 
       free:		/* free log descriptor */
 	mutex_unlock(&jfs_log_mutex);
@@ -1162,7 +1162,7 @@ static int open_inline_log(struct super_block *sb)
 	init_waitqueue_head(&log->syncwait);
 
 	set_bit(log_INLINELOG, &log->flag);
-	log->bdev_handle = sb->s_bdev_handle;
+	log->bdev_file = sb->s_bdev_file;
 	log->base = addressPXD(&JFS_SBI(sb)->logpxd);
 	log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >>
 	    (L2LOGPSIZE - sb->s_blocksize_bits);
@@ -1436,7 +1436,7 @@ int lmLogClose(struct super_block *sb)
 {
 	struct jfs_sb_info *sbi = JFS_SBI(sb);
 	struct jfs_log *log = sbi->log;
-	struct bdev_handle *bdev_handle;
+	struct file *bdev_file;
 	int rc = 0;
 
 	jfs_info("lmLogClose: log:0x%p", log);
@@ -1482,10 +1482,10 @@ int lmLogClose(struct super_block *sb)
 	 *	external log as separate logical volume
 	 */
 	list_del(&log->journal_list);
-	bdev_handle = log->bdev_handle;
+	bdev_file = log->bdev_file;
 	rc = lmLogShutdown(log);
 
-	bdev_release(bdev_handle);
+	bdev_fput(bdev_file);
 
 	kfree(log);
 
@@ -1972,7 +1972,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
 
 	bp->l_flag |= lbmREAD;
 
-	bio = bio_alloc(log->bdev_handle->bdev, 1, REQ_OP_READ, GFP_NOFS);
+	bio = bio_alloc(file_bdev(log->bdev_file), 1, REQ_OP_READ, GFP_NOFS);
 	bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9);
 	__bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset);
 	BUG_ON(bio->bi_iter.bi_size != LOGPSIZE);
@@ -2115,7 +2115,7 @@ static void lbmStartIO(struct lbuf * bp)
 	jfs_info("lbmStartIO");
 
 	if (!log->no_integrity)
-		bdev = log->bdev_handle->bdev;
+		bdev = file_bdev(log->bdev_file);
 
 	bio = bio_alloc(bdev, 1, REQ_OP_WRITE | REQ_SYNC,
 			GFP_NOFS);
diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h
index 84aa2d253907..8b8994e48cd0 100644
--- a/fs/jfs/jfs_logmgr.h
+++ b/fs/jfs/jfs_logmgr.h
@@ -356,7 +356,7 @@ struct jfs_log {
 				 *    before writing syncpt.
 				 */
 	struct list_head journal_list; /* Global list */
-	struct bdev_handle *bdev_handle; /* 4: log lv pointer */
+	struct file *bdev_file;	/* 4: log lv pointer */
 	int serial;		/* 4: log mount serial number */
 
 	s64 base;		/* @8: log extent address (inline log ) */
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 9b5c6a20b30c..98f9a432c336 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -431,7 +431,7 @@ int updateSuper(struct super_block *sb, uint state)
 	if (state == FM_MOUNT) {
 		/* record log's dev_t and mount serial number */
 		j_sb->s_logdev = cpu_to_le32(
-			new_encode_dev(sbi->log->bdev_handle->bdev->bd_dev));
+			new_encode_dev(file_bdev(sbi->log->bdev_file)->bd_dev));
 		j_sb->s_logserial = cpu_to_le32(sbi->log->serial);
 	} else if (state == FM_CLEAN) {
 		/*
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 8d8e556bd610..e1be21ca5d6e 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -824,7 +824,7 @@ out:
 	return len - towrite;
 }
 
-static struct dquot **jfs_get_dquots(struct inode *inode)
+static struct dquot __rcu **jfs_get_dquots(struct inode *inode)
 {
 	return JFS_IP(inode)->i_dquot;
 }
@@ -932,7 +932,7 @@ static int __init init_jfs_fs(void)
 
 	jfs_inode_cachep =
 	    kmem_cache_create_usercopy("jfs_ip", sizeof(struct jfs_inode_info),
-			0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
+			0, SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
 			offsetof(struct jfs_inode_info, i_inline_all),
 			sizeof_field(struct jfs_inode_info, i_inline_all),
 			init_once);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index bce1d7ac95ca..458519e416fe 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -529,6 +529,20 @@ void kernfs_get(struct kernfs_node *kn)
 }
 EXPORT_SYMBOL_GPL(kernfs_get);
 
+static void kernfs_free_rcu(struct rcu_head *rcu)
+{
+	struct kernfs_node *kn = container_of(rcu, struct kernfs_node, rcu);
+
+	kfree_const(kn->name);
+
+	if (kn->iattr) {
+		simple_xattrs_free(&kn->iattr->xattrs, NULL);
+		kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
+	}
+
+	kmem_cache_free(kernfs_node_cache, kn);
+}
+
 /**
  * kernfs_put - put a reference count on a kernfs_node
  * @kn: the target kernfs_node
@@ -557,16 +571,11 @@ void kernfs_put(struct kernfs_node *kn)
 	if (kernfs_type(kn) == KERNFS_LINK)
 		kernfs_put(kn->symlink.target_kn);
 
-	kfree_const(kn->name);
-
-	if (kn->iattr) {
-		simple_xattrs_free(&kn->iattr->xattrs, NULL);
-		kmem_cache_free(kernfs_iattrs_cache, kn->iattr);
-	}
 	spin_lock(&kernfs_idr_lock);
 	idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
 	spin_unlock(&kernfs_idr_lock);
-	kmem_cache_free(kernfs_node_cache, kn);
+
+	call_rcu(&kn->rcu, kernfs_free_rcu);
 
 	kn = parent;
 	if (kn) {
@@ -575,7 +584,7 @@ void kernfs_put(struct kernfs_node *kn)
 	} else {
 		/* just released the root kn, free @root too */
 		idr_destroy(&root->ino_idr);
-		kfree(root);
+		kfree_rcu(root, rcu);
 	}
 }
 EXPORT_SYMBOL_GPL(kernfs_put);
@@ -715,7 +724,7 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
 	ino_t ino = kernfs_id_ino(id);
 	u32 gen = kernfs_id_gen(id);
 
-	spin_lock(&kernfs_idr_lock);
+	rcu_read_lock();
 
 	kn = idr_find(&root->ino_idr, (u32)ino);
 	if (!kn)
@@ -739,10 +748,10 @@ struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root,
 	if (unlikely(!__kernfs_active(kn) || !atomic_inc_not_zero(&kn->count)))
 		goto err_unlock;
 
-	spin_unlock(&kernfs_idr_lock);
+	rcu_read_unlock();
 	return kn;
 err_unlock:
-	spin_unlock(&kernfs_idr_lock);
+	rcu_read_unlock();
 	return NULL;
 }
 
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index ffa4565c275a..8502ef68459b 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -483,9 +483,11 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
 		goto out_put;
 
 	rc = 0;
-	of->mmapped = true;
-	of_on(of)->nr_mmapped++;
-	of->vm_ops = vma->vm_ops;
+	if (!of->mmapped) {
+		of->mmapped = true;
+		of_on(of)->nr_mmapped++;
+		of->vm_ops = vma->vm_ops;
+	}
 	vma->vm_ops = &kernfs_vm_ops;
 out_put:
 	kernfs_put_active(of->kn);
@@ -634,11 +636,18 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
 	 * each file a separate locking class.  Let's differentiate on
 	 * whether the file has mmap or not for now.
 	 *
-	 * Both paths of the branch look the same.  They're supposed to
+	 * For similar reasons, writable and readonly files are given different
+	 * lockdep key, because the writable file /sys/power/resume may call vfs
+	 * lookup helpers for arbitrary paths and readonly files can be read by
+	 * overlayfs from vfs helpers when sysfs is a lower layer of overalyfs.
+	 *
+	 * All three cases look the same.  They're supposed to
 	 * look that way and give @of->mutex different static lockdep keys.
 	 */
 	if (has_mmap)
 		mutex_init(&of->mutex);
+	else if (file->f_mode & FMODE_WRITE)
+		mutex_init(&of->mutex);
 	else
 		mutex_init(&of->mutex);
 
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 237f2764b941..b42ee6547cdc 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -49,6 +49,8 @@ struct kernfs_root {
 	struct rw_semaphore	kernfs_rwsem;
 	struct rw_semaphore	kernfs_iattr_rwsem;
 	struct rw_semaphore	kernfs_supers_rwsem;
+
+	struct rcu_head		rcu;
 };
 
 /* +1 to avoid triggering overflow warning when negating it */
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 0c93cad0f0ac..e29f4edf9572 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -358,7 +358,9 @@ int kernfs_get_tree(struct fs_context *fc)
 		}
 		sb->s_flags |= SB_ACTIVE;
 
-		uuid_gen(&sb->s_uuid);
+		uuid_t uuid;
+		uuid_gen(&uuid);
+		super_set_uuid(sb, uuid.b, sizeof(uuid));
 
 		down_write(&root->kernfs_supers_rwsem);
 		list_add(&info->node, &info->root->supers);
diff --git a/fs/libfs.c b/fs/libfs.c
index eec6031b0155..3a6f2cb364f8 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -23,6 +23,7 @@
 #include <linux/fsnotify.h>
 #include <linux/unicode.h>
 #include <linux/fscrypt.h>
+#include <linux/pidfs.h>
 
 #include <linux/uaccess.h>
 
@@ -240,17 +241,22 @@ const struct inode_operations simple_dir_inode_operations = {
 };
 EXPORT_SYMBOL(simple_dir_inode_operations);
 
-static void offset_set(struct dentry *dentry, u32 offset)
+/* 0 is '.', 1 is '..', so always start with offset 2 or more */
+enum {
+	DIR_OFFSET_MIN	= 2,
+};
+
+static void offset_set(struct dentry *dentry, long offset)
 {
-	dentry->d_fsdata = (void *)((uintptr_t)(offset));
+	dentry->d_fsdata = (void *)offset;
 }
 
-static u32 dentry2offset(struct dentry *dentry)
+static long dentry2offset(struct dentry *dentry)
 {
-	return (u32)((uintptr_t)(dentry->d_fsdata));
+	return (long)dentry->d_fsdata;
 }
 
-static struct lock_class_key simple_offset_xa_lock;
+static struct lock_class_key simple_offset_lock_class;
 
 /**
  * simple_offset_init - initialize an offset_ctx
@@ -259,11 +265,9 @@ static struct lock_class_key simple_offset_xa_lock;
  */
 void simple_offset_init(struct offset_ctx *octx)
 {
-	xa_init_flags(&octx->xa, XA_FLAGS_ALLOC1);
-	lockdep_set_class(&octx->xa.xa_lock, &simple_offset_xa_lock);
-
-	/* 0 is '.', 1 is '..', so always start with offset 2 */
-	octx->next_offset = 2;
+	mt_init_flags(&octx->mt, MT_FLAGS_ALLOC_RANGE);
+	lockdep_set_class(&octx->mt.ma_lock, &simple_offset_lock_class);
+	octx->next_offset = DIR_OFFSET_MIN;
 }
 
 /**
@@ -271,20 +275,19 @@ void simple_offset_init(struct offset_ctx *octx)
  * @octx: directory offset ctx to be updated
  * @dentry: new dentry being added
  *
- * Returns zero on success. @so_ctx and the dentry offset are updated.
+ * Returns zero on success. @octx and the dentry's offset are updated.
  * Otherwise, a negative errno value is returned.
  */
 int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry)
 {
-	static const struct xa_limit limit = XA_LIMIT(2, U32_MAX);
-	u32 offset;
+	unsigned long offset;
 	int ret;
 
 	if (dentry2offset(dentry) != 0)
 		return -EBUSY;
 
-	ret = xa_alloc_cyclic(&octx->xa, &offset, dentry, limit,
-			      &octx->next_offset, GFP_KERNEL);
+	ret = mtree_alloc_cyclic(&octx->mt, &offset, dentry, DIR_OFFSET_MIN,
+				 LONG_MAX, &octx->next_offset, GFP_KERNEL);
 	if (ret < 0)
 		return ret;
 
@@ -300,17 +303,49 @@ int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry)
  */
 void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry)
 {
-	u32 offset;
+	long offset;
 
 	offset = dentry2offset(dentry);
 	if (offset == 0)
 		return;
 
-	xa_erase(&octx->xa, offset);
+	mtree_erase(&octx->mt, offset);
 	offset_set(dentry, 0);
 }
 
 /**
+ * simple_offset_empty - Check if a dentry can be unlinked
+ * @dentry: dentry to be tested
+ *
+ * Returns 0 if @dentry is a non-empty directory; otherwise returns 1.
+ */
+int simple_offset_empty(struct dentry *dentry)
+{
+	struct inode *inode = d_inode(dentry);
+	struct offset_ctx *octx;
+	struct dentry *child;
+	unsigned long index;
+	int ret = 1;
+
+	if (!inode || !S_ISDIR(inode->i_mode))
+		return ret;
+
+	index = DIR_OFFSET_MIN;
+	octx = inode->i_op->get_offset_ctx(inode);
+	mt_for_each(&octx->mt, child, index, LONG_MAX) {
+		spin_lock(&child->d_lock);
+		if (simple_positive(child)) {
+			spin_unlock(&child->d_lock);
+			ret = 0;
+			break;
+		}
+		spin_unlock(&child->d_lock);
+	}
+
+	return ret;
+}
+
+/**
  * simple_offset_rename_exchange - exchange rename with directory offsets
  * @old_dir: parent of dentry being moved
  * @old_dentry: dentry being moved
@@ -327,8 +362,8 @@ int simple_offset_rename_exchange(struct inode *old_dir,
 {
 	struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir);
 	struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir);
-	u32 old_index = dentry2offset(old_dentry);
-	u32 new_index = dentry2offset(new_dentry);
+	long old_index = dentry2offset(old_dentry);
+	long new_index = dentry2offset(new_dentry);
 	int ret;
 
 	simple_offset_remove(old_ctx, old_dentry);
@@ -354,9 +389,9 @@ int simple_offset_rename_exchange(struct inode *old_dir,
 
 out_restore:
 	offset_set(old_dentry, old_index);
-	xa_store(&old_ctx->xa, old_index, old_dentry, GFP_KERNEL);
+	mtree_store(&old_ctx->mt, old_index, old_dentry, GFP_KERNEL);
 	offset_set(new_dentry, new_index);
-	xa_store(&new_ctx->xa, new_index, new_dentry, GFP_KERNEL);
+	mtree_store(&new_ctx->mt, new_index, new_dentry, GFP_KERNEL);
 	return ret;
 }
 
@@ -369,7 +404,7 @@ out_restore:
  */
 void simple_offset_destroy(struct offset_ctx *octx)
 {
-	xa_destroy(&octx->xa);
+	mtree_destroy(&octx->mt);
 }
 
 /**
@@ -399,15 +434,16 @@ static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
 
 	/* In this case, ->private_data is protected by f_pos_lock */
 	file->private_data = NULL;
-	return vfs_setpos(file, offset, U32_MAX);
+	return vfs_setpos(file, offset, LONG_MAX);
 }
 
-static struct dentry *offset_find_next(struct xa_state *xas)
+static struct dentry *offset_find_next(struct offset_ctx *octx, loff_t offset)
 {
+	MA_STATE(mas, &octx->mt, offset, offset);
 	struct dentry *child, *found = NULL;
 
 	rcu_read_lock();
-	child = xas_next_entry(xas, U32_MAX);
+	child = mas_find(&mas, LONG_MAX);
 	if (!child)
 		goto out;
 	spin_lock(&child->d_lock);
@@ -421,8 +457,8 @@ out:
 
 static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
 {
-	u32 offset = dentry2offset(dentry);
 	struct inode *inode = d_inode(dentry);
+	long offset = dentry2offset(dentry);
 
 	return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset,
 			  inode->i_ino, fs_umode_to_dtype(inode->i_mode));
@@ -430,12 +466,11 @@ static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
 
 static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
 {
-	struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode);
-	XA_STATE(xas, &so_ctx->xa, ctx->pos);
+	struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode);
 	struct dentry *dentry;
 
 	while (true) {
-		dentry = offset_find_next(&xas);
+		dentry = offset_find_next(octx, ctx->pos);
 		if (!dentry)
 			return ERR_PTR(-ENOENT);
 
@@ -444,8 +479,8 @@ static void *offset_iterate_dir(struct inode *inode, struct dir_context *ctx)
 			break;
 		}
 
+		ctx->pos = dentry2offset(dentry) + 1;
 		dput(dentry);
-		ctx->pos = xas.xa_index + 1;
 	}
 	return NULL;
 }
@@ -481,7 +516,7 @@ static int offset_readdir(struct file *file, struct dir_context *ctx)
 		return 0;
 
 	/* In this case, ->private_data is protected by f_pos_lock */
-	if (ctx->pos == 2)
+	if (ctx->pos == DIR_OFFSET_MIN)
 		file->private_data = NULL;
 	else if (file->private_data == ERR_PTR(-ENOENT))
 		return 0;
@@ -1580,7 +1615,7 @@ EXPORT_SYMBOL(alloc_anon_inode);
  * All arguments are ignored and it just returns -EINVAL.
  */
 int
-simple_nosetlease(struct file *filp, int arg, struct file_lock **flp,
+simple_nosetlease(struct file *filp, int arg, struct file_lease **flp,
 		  void **priv)
 {
 	return -EINVAL;
@@ -1704,16 +1739,28 @@ bool is_empty_dir_inode(struct inode *inode)
 static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
 				const char *str, const struct qstr *name)
 {
-	const struct dentry *parent = READ_ONCE(dentry->d_parent);
-	const struct inode *dir = READ_ONCE(parent->d_inode);
-	const struct super_block *sb = dentry->d_sb;
-	const struct unicode_map *um = sb->s_encoding;
-	struct qstr qstr = QSTR_INIT(str, len);
+	const struct dentry *parent;
+	const struct inode *dir;
 	char strbuf[DNAME_INLINE_LEN];
-	int ret;
+	struct qstr qstr;
+
+	/*
+	 * Attempt a case-sensitive match first. It is cheaper and
+	 * should cover most lookups, including all the sane
+	 * applications that expect a case-sensitive filesystem.
+	 *
+	 * This comparison is safe under RCU because the caller
+	 * guarantees the consistency between str and len. See
+	 * __d_lookup_rcu_op_compare() for details.
+	 */
+	if (len == name->len && !memcmp(str, name->name, len))
+		return 0;
 
+	parent = READ_ONCE(dentry->d_parent);
+	dir = READ_ONCE(parent->d_inode);
 	if (!dir || !IS_CASEFOLDED(dir))
-		goto fallback;
+		return 1;
+
 	/*
 	 * If the dentry name is stored in-line, then it may be concurrently
 	 * modified by a rename.  If this happens, the VFS will eventually retry
@@ -1724,20 +1771,14 @@ static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
 	if (len <= DNAME_INLINE_LEN - 1) {
 		memcpy(strbuf, str, len);
 		strbuf[len] = 0;
-		qstr.name = strbuf;
+		str = strbuf;
 		/* prevent compiler from optimizing out the temporary buffer */
 		barrier();
 	}
-	ret = utf8_strncasecmp(um, name, &qstr);
-	if (ret >= 0)
-		return ret;
+	qstr.len = len;
+	qstr.name = str;
 
-	if (sb_has_strict_encoding(sb))
-		return -EINVAL;
-fallback:
-	if (len != name->len)
-		return 1;
-	return !!memcmp(str, name->name, len);
+	return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr);
 }
 
 /**
@@ -1752,7 +1793,7 @@ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
 	const struct inode *dir = READ_ONCE(dentry->d_inode);
 	struct super_block *sb = dentry->d_sb;
 	const struct unicode_map *um = sb->s_encoding;
-	int ret = 0;
+	int ret;
 
 	if (!dir || !IS_CASEFOLDED(dir))
 		return 0;
@@ -1766,73 +1807,45 @@ static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
 static const struct dentry_operations generic_ci_dentry_ops = {
 	.d_hash = generic_ci_d_hash,
 	.d_compare = generic_ci_d_compare,
-};
-#endif
-
 #ifdef CONFIG_FS_ENCRYPTION
-static const struct dentry_operations generic_encrypted_dentry_ops = {
 	.d_revalidate = fscrypt_d_revalidate,
+#endif
 };
 #endif
 
-#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE)
-static const struct dentry_operations generic_encrypted_ci_dentry_ops = {
-	.d_hash = generic_ci_d_hash,
-	.d_compare = generic_ci_d_compare,
+#ifdef CONFIG_FS_ENCRYPTION
+static const struct dentry_operations generic_encrypted_dentry_ops = {
 	.d_revalidate = fscrypt_d_revalidate,
 };
 #endif
 
 /**
- * generic_set_encrypted_ci_d_ops - helper for setting d_ops for given dentry
- * @dentry:	dentry to set ops on
- *
- * Casefolded directories need d_hash and d_compare set, so that the dentries
- * contained in them are handled case-insensitively.  Note that these operations
- * are needed on the parent directory rather than on the dentries in it, and
- * while the casefolding flag can be toggled on and off on an empty directory,
- * dentry_operations can't be changed later.  As a result, if the filesystem has
- * casefolding support enabled at all, we have to give all dentries the
- * casefolding operations even if their inode doesn't have the casefolding flag
- * currently (and thus the casefolding ops would be no-ops for now).
+ * generic_set_sb_d_ops - helper for choosing the set of
+ * filesystem-wide dentry operations for the enabled features
+ * @sb: superblock to be configured
  *
- * Encryption works differently in that the only dentry operation it needs is
- * d_revalidate, which it only needs on dentries that have the no-key name flag.
- * The no-key flag can't be set "later", so we don't have to worry about that.
- *
- * Finally, to maximize compatibility with overlayfs (which isn't compatible
- * with certain dentry operations) and to avoid taking an unnecessary
- * performance hit, we use custom dentry_operations for each possible
- * combination rather than always installing all operations.
+ * Filesystems supporting casefolding and/or fscrypt can call this
+ * helper at mount-time to configure sb->s_d_op to best set of dentry
+ * operations required for the enabled features. The helper must be
+ * called after these have been configured, but before the root dentry
+ * is created.
  */
-void generic_set_encrypted_ci_d_ops(struct dentry *dentry)
+void generic_set_sb_d_ops(struct super_block *sb)
 {
-#ifdef CONFIG_FS_ENCRYPTION
-	bool needs_encrypt_ops = dentry->d_flags & DCACHE_NOKEY_NAME;
-#endif
 #if IS_ENABLED(CONFIG_UNICODE)
-	bool needs_ci_ops = dentry->d_sb->s_encoding;
-#endif
-#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE)
-	if (needs_encrypt_ops && needs_ci_ops) {
-		d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops);
+	if (sb->s_encoding) {
+		sb->s_d_op = &generic_ci_dentry_ops;
 		return;
 	}
 #endif
 #ifdef CONFIG_FS_ENCRYPTION
-	if (needs_encrypt_ops) {
-		d_set_d_op(dentry, &generic_encrypted_dentry_ops);
-		return;
-	}
-#endif
-#if IS_ENABLED(CONFIG_UNICODE)
-	if (needs_ci_ops) {
-		d_set_d_op(dentry, &generic_ci_dentry_ops);
+	if (sb->s_cop) {
+		sb->s_d_op = &generic_encrypted_dentry_ops;
 		return;
 	}
 #endif
 }
-EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops);
+EXPORT_SYMBOL(generic_set_sb_d_ops);
 
 /**
  * inode_maybe_inc_iversion - increments i_version
@@ -1973,3 +1986,147 @@ struct timespec64 simple_inode_init_ts(struct inode *inode)
 	return ts;
 }
 EXPORT_SYMBOL(simple_inode_init_ts);
+
+static inline struct dentry *get_stashed_dentry(struct dentry *stashed)
+{
+	struct dentry *dentry;
+
+	guard(rcu)();
+	dentry = READ_ONCE(stashed);
+	if (!dentry)
+		return NULL;
+	if (!lockref_get_not_dead(&dentry->d_lockref))
+		return NULL;
+	return dentry;
+}
+
+static struct dentry *prepare_anon_dentry(struct dentry **stashed,
+					  struct super_block *sb,
+					  void *data)
+{
+	struct dentry *dentry;
+	struct inode *inode;
+	const struct stashed_operations *sops = sb->s_fs_info;
+	int ret;
+
+	inode = new_inode_pseudo(sb);
+	if (!inode) {
+		sops->put_data(data);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	inode->i_flags |= S_IMMUTABLE;
+	inode->i_mode = S_IFREG;
+	simple_inode_init_ts(inode);
+
+	ret = sops->init_inode(inode, data);
+	if (ret < 0) {
+		iput(inode);
+		return ERR_PTR(ret);
+	}
+
+	/* Notice when this is changed. */
+	WARN_ON_ONCE(!S_ISREG(inode->i_mode));
+	WARN_ON_ONCE(!IS_IMMUTABLE(inode));
+
+	dentry = d_alloc_anon(sb);
+	if (!dentry) {
+		iput(inode);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* Store address of location where dentry's supposed to be stashed. */
+	dentry->d_fsdata = stashed;
+
+	/* @data is now owned by the fs */
+	d_instantiate(dentry, inode);
+	return dentry;
+}
+
+static struct dentry *stash_dentry(struct dentry **stashed,
+				   struct dentry *dentry)
+{
+	guard(rcu)();
+	for (;;) {
+		struct dentry *old;
+
+		/* Assume any old dentry was cleared out. */
+		old = cmpxchg(stashed, NULL, dentry);
+		if (likely(!old))
+			return dentry;
+
+		/* Check if somebody else installed a reusable dentry. */
+		if (lockref_get_not_dead(&old->d_lockref))
+			return old;
+
+		/* There's an old dead dentry there, try to take it over. */
+		if (likely(try_cmpxchg(stashed, &old, dentry)))
+			return dentry;
+	}
+}
+
+/**
+ * path_from_stashed - create path from stashed or new dentry
+ * @stashed:    where to retrieve or stash dentry
+ * @mnt:        mnt of the filesystems to use
+ * @data:       data to store in inode->i_private
+ * @path:       path to create
+ *
+ * The function tries to retrieve a stashed dentry from @stashed. If the dentry
+ * is still valid then it will be reused. If the dentry isn't able the function
+ * will allocate a new dentry and inode. It will then check again whether it
+ * can reuse an existing dentry in case one has been added in the meantime or
+ * update @stashed with the newly added dentry.
+ *
+ * Special-purpose helper for nsfs and pidfs.
+ *
+ * Return: On success zero and on failure a negative error is returned.
+ */
+int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data,
+		      struct path *path)
+{
+	struct dentry *dentry;
+	const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info;
+
+	/* See if dentry can be reused. */
+	path->dentry = get_stashed_dentry(*stashed);
+	if (path->dentry) {
+		sops->put_data(data);
+		goto out_path;
+	}
+
+	/* Allocate a new dentry. */
+	dentry = prepare_anon_dentry(stashed, mnt->mnt_sb, data);
+	if (IS_ERR(dentry))
+		return PTR_ERR(dentry);
+
+	/* Added a new dentry. @data is now owned by the filesystem. */
+	path->dentry = stash_dentry(stashed, dentry);
+	if (path->dentry != dentry)
+		dput(dentry);
+
+out_path:
+	WARN_ON_ONCE(path->dentry->d_fsdata != stashed);
+	WARN_ON_ONCE(d_inode(path->dentry)->i_private != data);
+	path->mnt = mntget(mnt);
+	return 0;
+}
+
+void stashed_dentry_prune(struct dentry *dentry)
+{
+	struct dentry **stashed = dentry->d_fsdata;
+	struct inode *inode = d_inode(dentry);
+
+	if (WARN_ON_ONCE(!stashed))
+		return;
+
+	if (!inode)
+		return;
+
+	/*
+	 * Only replace our own @dentry as someone else might've
+	 * already cleared out @dentry and stashed their own
+	 * dentry in there.
+	 */
+	cmpxchg(stashed, dentry, NULL);
+}
diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c
index 8161667c976f..527458db4525 100644
--- a/fs/lockd/clnt4xdr.c
+++ b/fs/lockd/clnt4xdr.c
@@ -243,7 +243,7 @@ static void encode_nlm4_holder(struct xdr_stream *xdr,
 	u64 l_offset, l_len;
 	__be32 *p;
 
-	encode_bool(xdr, lock->fl.fl_type == F_RDLCK);
+	encode_bool(xdr, lock->fl.c.flc_type == F_RDLCK);
 	encode_int32(xdr, lock->svid);
 	encode_netobj(xdr, lock->oh.data, lock->oh.len);
 
@@ -270,7 +270,7 @@ static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result)
 		goto out_overflow;
 	exclusive = be32_to_cpup(p++);
 	lock->svid = be32_to_cpup(p);
-	fl->fl_pid = (pid_t)lock->svid;
+	fl->c.flc_pid = (pid_t)lock->svid;
 
 	error = decode_netobj(xdr, &lock->oh);
 	if (unlikely(error))
@@ -280,8 +280,8 @@ static int decode_nlm4_holder(struct xdr_stream *xdr, struct nlm_res *result)
 	if (unlikely(p == NULL))
 		goto out_overflow;
 
-	fl->fl_flags = FL_POSIX;
-	fl->fl_type  = exclusive != 0 ? F_WRLCK : F_RDLCK;
+	fl->c.flc_flags = FL_POSIX;
+	fl->c.flc_type  = exclusive != 0 ? F_WRLCK : F_RDLCK;
 	p = xdr_decode_hyper(p, &l_offset);
 	xdr_decode_hyper(p, &l_len);
 	nlm4svc_set_file_lock_range(fl, l_offset, l_len);
@@ -357,7 +357,7 @@ static void nlm4_xdr_enc_testargs(struct rpc_rqst *req,
 	const struct nlm_lock *lock = &args->lock;
 
 	encode_cookie(xdr, &args->cookie);
-	encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+	encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK);
 	encode_nlm4_lock(xdr, lock);
 }
 
@@ -380,7 +380,7 @@ static void nlm4_xdr_enc_lockargs(struct rpc_rqst *req,
 
 	encode_cookie(xdr, &args->cookie);
 	encode_bool(xdr, args->block);
-	encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+	encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK);
 	encode_nlm4_lock(xdr, lock);
 	encode_bool(xdr, args->reclaim);
 	encode_int32(xdr, args->state);
@@ -403,7 +403,7 @@ static void nlm4_xdr_enc_cancargs(struct rpc_rqst *req,
 
 	encode_cookie(xdr, &args->cookie);
 	encode_bool(xdr, args->block);
-	encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+	encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK);
 	encode_nlm4_lock(xdr, lock);
 }
 
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 5d85715be763..a7e0519ec024 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -185,7 +185,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
 			continue;
 		if (!rpc_cmp_addr(nlm_addr(block->b_host), addr))
 			continue;
-		if (nfs_compare_fh(NFS_FH(file_inode(fl_blocked->fl_file)), fh) != 0)
+		if (nfs_compare_fh(NFS_FH(file_inode(fl_blocked->c.flc_file)), fh) != 0)
 			continue;
 		/* Alright, we found a lock. Set the return status
 		 * and wake up the caller
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c
index fba6c7fa7474..cebcc283b7ce 100644
--- a/fs/lockd/clntproc.c
+++ b/fs/lockd/clntproc.c
@@ -133,7 +133,8 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl)
 	char *nodename = req->a_host->h_rpcclnt->cl_nodename;
 
 	nlmclnt_next_cookie(&argp->cookie);
-	memcpy(&lock->fh, NFS_FH(file_inode(fl->fl_file)), sizeof(struct nfs_fh));
+	memcpy(&lock->fh, NFS_FH(file_inode(fl->c.flc_file)),
+	       sizeof(struct nfs_fh));
 	lock->caller  = nodename;
 	lock->oh.data = req->a_owner;
 	lock->oh.len  = snprintf(req->a_owner, sizeof(req->a_owner), "%u@%s",
@@ -142,7 +143,7 @@ static void nlmclnt_setlockargs(struct nlm_rqst *req, struct file_lock *fl)
 	lock->svid = fl->fl_u.nfs_fl.owner->pid;
 	lock->fl.fl_start = fl->fl_start;
 	lock->fl.fl_end = fl->fl_end;
-	lock->fl.fl_type = fl->fl_type;
+	lock->fl.c.flc_type = fl->c.flc_type;
 }
 
 static void nlmclnt_release_lockargs(struct nlm_rqst *req)
@@ -182,7 +183,7 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl, void *dat
 	call->a_callback_data = data;
 
 	if (IS_SETLK(cmd) || IS_SETLKW(cmd)) {
-		if (fl->fl_type != F_UNLCK) {
+		if (fl->c.flc_type != F_UNLCK) {
 			call->a_args.block = IS_SETLKW(cmd) ? 1 : 0;
 			status = nlmclnt_lock(call, fl);
 		} else
@@ -432,13 +433,14 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl)
 {
 	int	status;
 
-	status = nlmclnt_call(nfs_file_cred(fl->fl_file), req, NLMPROC_TEST);
+	status = nlmclnt_call(nfs_file_cred(fl->c.flc_file), req,
+			      NLMPROC_TEST);
 	if (status < 0)
 		goto out;
 
 	switch (req->a_res.status) {
 		case nlm_granted:
-			fl->fl_type = F_UNLCK;
+			fl->c.flc_type = F_UNLCK;
 			break;
 		case nlm_lck_denied:
 			/*
@@ -446,8 +448,8 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl)
 			 */
 			fl->fl_start = req->a_res.lock.fl.fl_start;
 			fl->fl_end = req->a_res.lock.fl.fl_end;
-			fl->fl_type = req->a_res.lock.fl.fl_type;
-			fl->fl_pid = -req->a_res.lock.fl.fl_pid;
+			fl->c.flc_type = req->a_res.lock.fl.c.flc_type;
+			fl->c.flc_pid = -req->a_res.lock.fl.c.flc_pid;
 			break;
 		default:
 			status = nlm_stat_to_errno(req->a_res.status);
@@ -485,14 +487,15 @@ static const struct file_lock_operations nlmclnt_lock_ops = {
 static void nlmclnt_locks_init_private(struct file_lock *fl, struct nlm_host *host)
 {
 	fl->fl_u.nfs_fl.state = 0;
-	fl->fl_u.nfs_fl.owner = nlmclnt_find_lockowner(host, fl->fl_owner);
+	fl->fl_u.nfs_fl.owner = nlmclnt_find_lockowner(host,
+						       fl->c.flc_owner);
 	INIT_LIST_HEAD(&fl->fl_u.nfs_fl.list);
 	fl->fl_ops = &nlmclnt_lock_ops;
 }
 
 static int do_vfs_lock(struct file_lock *fl)
 {
-	return locks_lock_file_wait(fl->fl_file, fl);
+	return locks_lock_file_wait(fl->c.flc_file, fl);
 }
 
 /*
@@ -518,12 +521,12 @@ static int do_vfs_lock(struct file_lock *fl)
 static int
 nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 {
-	const struct cred *cred = nfs_file_cred(fl->fl_file);
+	const struct cred *cred = nfs_file_cred(fl->c.flc_file);
 	struct nlm_host	*host = req->a_host;
 	struct nlm_res	*resp = &req->a_res;
 	struct nlm_wait block;
-	unsigned char fl_flags = fl->fl_flags;
-	unsigned char fl_type;
+	unsigned char flags = fl->c.flc_flags;
+	unsigned char type;
 	__be32 b_status;
 	int status = -ENOLCK;
 
@@ -531,9 +534,9 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl)
 		goto out;
 	req->a_args.state = nsm_local_state;
 
-	fl->fl_flags |= FL_ACCESS;
+	fl->c.flc_flags |= FL_ACCESS;
 	status = do_vfs_lock(fl);
-	fl->fl_flags = fl_flags;
+	fl->c.flc_flags = flags;
 	if (status < 0)
 		goto out;
 
@@ -591,11 +594,11 @@ again:
 			goto again;
 		}
 		/* Ensure the resulting lock will get added to granted list */
-		fl->fl_flags |= FL_SLEEP;
+		fl->c.flc_flags |= FL_SLEEP;
 		if (do_vfs_lock(fl) < 0)
 			printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__);
 		up_read(&host->h_rwsem);
-		fl->fl_flags = fl_flags;
+		fl->c.flc_flags = flags;
 		status = 0;
 	}
 	if (status < 0)
@@ -605,7 +608,7 @@ again:
 	 * cases NLM_LCK_DENIED is returned for a permanent error.  So
 	 * turn it into an ENOLCK.
 	 */
-	if (resp->status == nlm_lck_denied && (fl_flags & FL_SLEEP))
+	if (resp->status == nlm_lck_denied && (flags & FL_SLEEP))
 		status = -ENOLCK;
 	else
 		status = nlm_stat_to_errno(resp->status);
@@ -622,13 +625,13 @@ out_unlock:
 			   req->a_host->h_addrlen, req->a_res.status);
 	dprintk("lockd: lock attempt ended in fatal error.\n"
 		"       Attempting to unlock.\n");
-	fl_type = fl->fl_type;
-	fl->fl_type = F_UNLCK;
+	type = fl->c.flc_type;
+	fl->c.flc_type = F_UNLCK;
 	down_read(&host->h_rwsem);
 	do_vfs_lock(fl);
 	up_read(&host->h_rwsem);
-	fl->fl_type = fl_type;
-	fl->fl_flags = fl_flags;
+	fl->c.flc_type = type;
+	fl->c.flc_flags = flags;
 	nlmclnt_async_call(cred, req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
 	return status;
 }
@@ -651,12 +654,14 @@ nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl,
 	nlmclnt_setlockargs(req, fl);
 	req->a_args.reclaim = 1;
 
-	status = nlmclnt_call(nfs_file_cred(fl->fl_file), req, NLMPROC_LOCK);
+	status = nlmclnt_call(nfs_file_cred(fl->c.flc_file), req,
+			      NLMPROC_LOCK);
 	if (status >= 0 && req->a_res.status == nlm_granted)
 		return 0;
 
 	printk(KERN_WARNING "lockd: failed to reclaim lock for pid %d "
-				"(errno %d, status %d)\n", fl->fl_pid,
+				"(errno %d, status %d)\n",
+				fl->c.flc_pid,
 				status, ntohl(req->a_res.status));
 
 	/*
@@ -683,26 +688,26 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl)
 	struct nlm_host	*host = req->a_host;
 	struct nlm_res	*resp = &req->a_res;
 	int status;
-	unsigned char fl_flags = fl->fl_flags;
+	unsigned char flags = fl->c.flc_flags;
 
 	/*
 	 * Note: the server is supposed to either grant us the unlock
 	 * request, or to deny it with NLM_LCK_DENIED_GRACE_PERIOD. In either
 	 * case, we want to unlock.
 	 */
-	fl->fl_flags |= FL_EXISTS;
+	fl->c.flc_flags |= FL_EXISTS;
 	down_read(&host->h_rwsem);
 	status = do_vfs_lock(fl);
 	up_read(&host->h_rwsem);
-	fl->fl_flags = fl_flags;
+	fl->c.flc_flags = flags;
 	if (status == -ENOENT) {
 		status = 0;
 		goto out;
 	}
 
 	refcount_inc(&req->a_count);
-	status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req,
-			NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
+	status = nlmclnt_async_call(nfs_file_cred(fl->c.flc_file), req,
+				    NLMPROC_UNLOCK, &nlmclnt_unlock_ops);
 	if (status < 0)
 		goto out;
 
@@ -795,8 +800,8 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl
 	req->a_args.block = block;
 
 	refcount_inc(&req->a_count);
-	status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req,
-			NLMPROC_CANCEL, &nlmclnt_cancel_ops);
+	status = nlmclnt_async_call(nfs_file_cred(fl->c.flc_file), req,
+				    NLMPROC_CANCEL, &nlmclnt_cancel_ops);
 	if (status == 0 && req->a_res.status == nlm_lck_denied)
 		status = -ENOLCK;
 	nlmclnt_release_call(req);
diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c
index 4df62f635529..a3e97278b997 100644
--- a/fs/lockd/clntxdr.c
+++ b/fs/lockd/clntxdr.c
@@ -238,7 +238,7 @@ static void encode_nlm_holder(struct xdr_stream *xdr,
 	u32 l_offset, l_len;
 	__be32 *p;
 
-	encode_bool(xdr, lock->fl.fl_type == F_RDLCK);
+	encode_bool(xdr, lock->fl.c.flc_type == F_RDLCK);
 	encode_int32(xdr, lock->svid);
 	encode_netobj(xdr, lock->oh.data, lock->oh.len);
 
@@ -265,7 +265,7 @@ static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result)
 		goto out_overflow;
 	exclusive = be32_to_cpup(p++);
 	lock->svid = be32_to_cpup(p);
-	fl->fl_pid = (pid_t)lock->svid;
+	fl->c.flc_pid = (pid_t)lock->svid;
 
 	error = decode_netobj(xdr, &lock->oh);
 	if (unlikely(error))
@@ -275,8 +275,8 @@ static int decode_nlm_holder(struct xdr_stream *xdr, struct nlm_res *result)
 	if (unlikely(p == NULL))
 		goto out_overflow;
 
-	fl->fl_flags = FL_POSIX;
-	fl->fl_type  = exclusive != 0 ? F_WRLCK : F_RDLCK;
+	fl->c.flc_flags = FL_POSIX;
+	fl->c.flc_type  = exclusive != 0 ? F_WRLCK : F_RDLCK;
 	l_offset = be32_to_cpup(p++);
 	l_len = be32_to_cpup(p);
 	end = l_offset + l_len - 1;
@@ -357,7 +357,7 @@ static void nlm_xdr_enc_testargs(struct rpc_rqst *req,
 	const struct nlm_lock *lock = &args->lock;
 
 	encode_cookie(xdr, &args->cookie);
-	encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+	encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK);
 	encode_nlm_lock(xdr, lock);
 }
 
@@ -380,7 +380,7 @@ static void nlm_xdr_enc_lockargs(struct rpc_rqst *req,
 
 	encode_cookie(xdr, &args->cookie);
 	encode_bool(xdr, args->block);
-	encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+	encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK);
 	encode_nlm_lock(xdr, lock);
 	encode_bool(xdr, args->reclaim);
 	encode_int32(xdr, args->state);
@@ -403,7 +403,7 @@ static void nlm_xdr_enc_cancargs(struct rpc_rqst *req,
 
 	encode_cookie(xdr, &args->cookie);
 	encode_bool(xdr, args->block);
-	encode_bool(xdr, lock->fl.fl_type == F_WRLCK);
+	encode_bool(xdr, lock->fl.c.flc_type == F_WRLCK);
 	encode_nlm_lock(xdr, lock);
 }
 
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index ce5862482097..ab8042a5b895 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -710,8 +710,6 @@ static const struct svc_version *nlmsvc_version[] = {
 #endif
 };
 
-static struct svc_stat		nlmsvc_stats;
-
 #define NLM_NRVERS	ARRAY_SIZE(nlmsvc_version)
 static struct svc_program	nlmsvc_program = {
 	.pg_prog		= NLM_PROGRAM,		/* program number */
@@ -719,7 +717,6 @@ static struct svc_program	nlmsvc_program = {
 	.pg_vers		= nlmsvc_version,	/* version table */
 	.pg_name		= "lockd",		/* service name */
 	.pg_class		= "nfsd",		/* share authentication with nfsd */
-	.pg_stats		= &nlmsvc_stats,	/* stats table */
 	.pg_authenticate	= &lockd_authenticate,	/* export authentication */
 	.pg_init_request	= svc_generic_init_request,
 	.pg_rpcbind_set		= svc_generic_rpcbind_set,
diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c
index b72023a6b4c1..8a72c418cdcc 100644
--- a/fs/lockd/svc4proc.c
+++ b/fs/lockd/svc4proc.c
@@ -52,16 +52,16 @@ nlm4svc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
 		*filp = file;
 
 		/* Set up the missing parts of the file_lock structure */
-		lock->fl.fl_flags = FL_POSIX;
-		lock->fl.fl_file  = file->f_file[mode];
-		lock->fl.fl_pid = current->tgid;
+		lock->fl.c.flc_flags = FL_POSIX;
+		lock->fl.c.flc_file  = file->f_file[mode];
+		lock->fl.c.flc_pid = current->tgid;
 		lock->fl.fl_start = (loff_t)lock->lock_start;
 		lock->fl.fl_end = lock->lock_len ?
 				   (loff_t)(lock->lock_start + lock->lock_len - 1) :
 				   OFFSET_MAX;
 		lock->fl.fl_lmops = &nlmsvc_lock_operations;
 		nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid);
-		if (!lock->fl.fl_owner) {
+		if (!lock->fl.c.flc_owner) {
 			/* lockowner allocation has failed */
 			nlmsvc_release_host(host);
 			return nlm_lck_denied_nolocks;
@@ -106,7 +106,7 @@ __nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
 	if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file)))
 		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
 
-	test_owner = argp->lock.fl.fl_owner;
+	test_owner = argp->lock.fl.c.flc_owner;
 	/* Now check for conflicting locks */
 	resp->status = nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie);
 	if (resp->status == nlm_drop_reply)
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 2dc10900ad1c..1f2149db10f2 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -150,16 +150,17 @@ nlmsvc_lookup_block(struct nlm_file *file, struct nlm_lock *lock)
 	struct file_lock	*fl;
 
 	dprintk("lockd: nlmsvc_lookup_block f=%p pd=%d %Ld-%Ld ty=%d\n",
-				file, lock->fl.fl_pid,
+				file, lock->fl.c.flc_pid,
 				(long long)lock->fl.fl_start,
-				(long long)lock->fl.fl_end, lock->fl.fl_type);
+				(long long)lock->fl.fl_end,
+				lock->fl.c.flc_type);
 	spin_lock(&nlm_blocked_lock);
 	list_for_each_entry(block, &nlm_blocked, b_list) {
 		fl = &block->b_call->a_args.lock.fl;
 		dprintk("lockd: check f=%p pd=%d %Ld-%Ld ty=%d cookie=%s\n",
-				block->b_file, fl->fl_pid,
+				block->b_file, fl->c.flc_pid,
 				(long long)fl->fl_start,
-				(long long)fl->fl_end, fl->fl_type,
+				(long long)fl->fl_end, fl->c.flc_type,
 				nlmdbg_cookie2a(&block->b_call->a_args.cookie));
 		if (block->b_file == file && nlm_compare_locks(fl, &lock->fl)) {
 			kref_get(&block->b_count);
@@ -244,7 +245,7 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host,
 		goto failed_free;
 
 	/* Set notifier function for VFS, and init args */
-	call->a_args.lock.fl.fl_flags |= FL_SLEEP;
+	call->a_args.lock.fl.c.flc_flags |= FL_SLEEP;
 	call->a_args.lock.fl.fl_lmops = &nlmsvc_lock_operations;
 	nlmclnt_next_cookie(&call->a_args.cookie);
 
@@ -402,14 +403,14 @@ static struct nlm_lockowner *nlmsvc_find_lockowner(struct nlm_host *host, pid_t
 void
 nlmsvc_release_lockowner(struct nlm_lock *lock)
 {
-	if (lock->fl.fl_owner)
-		nlmsvc_put_lockowner(lock->fl.fl_owner);
+	if (lock->fl.c.flc_owner)
+		nlmsvc_put_lockowner(lock->fl.c.flc_owner);
 }
 
 void nlmsvc_locks_init_private(struct file_lock *fl, struct nlm_host *host,
 						pid_t pid)
 {
-	fl->fl_owner = nlmsvc_find_lockowner(host, pid);
+	fl->c.flc_owner = nlmsvc_find_lockowner(host, pid);
 }
 
 /*
@@ -425,7 +426,7 @@ static int nlmsvc_setgrantargs(struct nlm_rqst *call, struct nlm_lock *lock)
 
 	/* set default data area */
 	call->a_args.lock.oh.data = call->a_owner;
-	call->a_args.lock.svid = ((struct nlm_lockowner *)lock->fl.fl_owner)->pid;
+	call->a_args.lock.svid = ((struct nlm_lockowner *) lock->fl.c.flc_owner)->pid;
 
 	if (lock->oh.len > NLMCLNT_OHSIZE) {
 		void *data = kmalloc(lock->oh.len, GFP_KERNEL);
@@ -489,7 +490,8 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 
 	dprintk("lockd: nlmsvc_lock(%s/%ld, ty=%d, pi=%d, %Ld-%Ld, bl=%d)\n",
 				inode->i_sb->s_id, inode->i_ino,
-				lock->fl.fl_type, lock->fl.fl_pid,
+				lock->fl.c.flc_type,
+				lock->fl.c.flc_pid,
 				(long long)lock->fl.fl_start,
 				(long long)lock->fl.fl_end,
 				wait);
@@ -512,7 +514,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 			goto out;
 		lock = &block->b_call->a_args.lock;
 	} else
-		lock->fl.fl_flags &= ~FL_SLEEP;
+		lock->fl.c.flc_flags &= ~FL_SLEEP;
 
 	if (block->b_flags & B_QUEUED) {
 		dprintk("lockd: nlmsvc_lock deferred block %p flags %d\n",
@@ -560,10 +562,10 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 	spin_unlock(&nlm_blocked_lock);
 
 	if (!wait)
-		lock->fl.fl_flags &= ~FL_SLEEP;
+		lock->fl.c.flc_flags &= ~FL_SLEEP;
 	mode = lock_to_openmode(&lock->fl);
 	error = vfs_lock_file(file->f_file[mode], F_SETLK, &lock->fl, NULL);
-	lock->fl.fl_flags &= ~FL_SLEEP;
+	lock->fl.c.flc_flags &= ~FL_SLEEP;
 
 	dprintk("lockd: vfs_lock_file returned %d\n", error);
 	switch (error) {
@@ -616,7 +618,7 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
 	dprintk("lockd: nlmsvc_testlock(%s/%ld, ty=%d, %Ld-%Ld)\n",
 				nlmsvc_file_inode(file)->i_sb->s_id,
 				nlmsvc_file_inode(file)->i_ino,
-				lock->fl.fl_type,
+				lock->fl.c.flc_type,
 				(long long)lock->fl.fl_start,
 				(long long)lock->fl.fl_end);
 
@@ -636,19 +638,19 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file,
 		goto out;
 	}
 
-	if (lock->fl.fl_type == F_UNLCK) {
+	if (lock->fl.c.flc_type == F_UNLCK) {
 		ret = nlm_granted;
 		goto out;
 	}
 
 	dprintk("lockd: conflicting lock(ty=%d, %Ld-%Ld)\n",
-		lock->fl.fl_type, (long long)lock->fl.fl_start,
+		lock->fl.c.flc_type, (long long)lock->fl.fl_start,
 		(long long)lock->fl.fl_end);
 	conflock->caller = "somehost";	/* FIXME */
 	conflock->len = strlen(conflock->caller);
 	conflock->oh.len = 0;		/* don't return OH info */
-	conflock->svid = lock->fl.fl_pid;
-	conflock->fl.fl_type = lock->fl.fl_type;
+	conflock->svid = lock->fl.c.flc_pid;
+	conflock->fl.c.flc_type = lock->fl.c.flc_type;
 	conflock->fl.fl_start = lock->fl.fl_start;
 	conflock->fl.fl_end = lock->fl.fl_end;
 	locks_release_private(&lock->fl);
@@ -673,21 +675,21 @@ nlmsvc_unlock(struct net *net, struct nlm_file *file, struct nlm_lock *lock)
 	dprintk("lockd: nlmsvc_unlock(%s/%ld, pi=%d, %Ld-%Ld)\n",
 				nlmsvc_file_inode(file)->i_sb->s_id,
 				nlmsvc_file_inode(file)->i_ino,
-				lock->fl.fl_pid,
+				lock->fl.c.flc_pid,
 				(long long)lock->fl.fl_start,
 				(long long)lock->fl.fl_end);
 
 	/* First, cancel any lock that might be there */
 	nlmsvc_cancel_blocked(net, file, lock);
 
-	lock->fl.fl_type = F_UNLCK;
-	lock->fl.fl_file = file->f_file[O_RDONLY];
-	if (lock->fl.fl_file)
-		error = vfs_lock_file(lock->fl.fl_file, F_SETLK,
+	lock->fl.c.flc_type = F_UNLCK;
+	lock->fl.c.flc_file = file->f_file[O_RDONLY];
+	if (lock->fl.c.flc_file)
+		error = vfs_lock_file(lock->fl.c.flc_file, F_SETLK,
 					&lock->fl, NULL);
-	lock->fl.fl_file = file->f_file[O_WRONLY];
-	if (lock->fl.fl_file)
-		error |= vfs_lock_file(lock->fl.fl_file, F_SETLK,
+	lock->fl.c.flc_file = file->f_file[O_WRONLY];
+	if (lock->fl.c.flc_file)
+		error |= vfs_lock_file(lock->fl.c.flc_file, F_SETLK,
 					&lock->fl, NULL);
 
 	return (error < 0)? nlm_lck_denied_nolocks : nlm_granted;
@@ -710,7 +712,7 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l
 	dprintk("lockd: nlmsvc_cancel(%s/%ld, pi=%d, %Ld-%Ld)\n",
 				nlmsvc_file_inode(file)->i_sb->s_id,
 				nlmsvc_file_inode(file)->i_ino,
-				lock->fl.fl_pid,
+				lock->fl.c.flc_pid,
 				(long long)lock->fl.fl_start,
 				(long long)lock->fl.fl_end);
 
@@ -863,12 +865,12 @@ nlmsvc_grant_blocked(struct nlm_block *block)
 	/* vfs_lock_file() can mangle fl_start and fl_end, but we need
 	 * them unchanged for the GRANT_MSG
 	 */
-	lock->fl.fl_flags |= FL_SLEEP;
+	lock->fl.c.flc_flags |= FL_SLEEP;
 	fl_start = lock->fl.fl_start;
 	fl_end = lock->fl.fl_end;
 	mode = lock_to_openmode(&lock->fl);
 	error = vfs_lock_file(file->f_file[mode], F_SETLK, &lock->fl, NULL);
-	lock->fl.fl_flags &= ~FL_SLEEP;
+	lock->fl.c.flc_flags &= ~FL_SLEEP;
 	lock->fl.fl_start = fl_start;
 	lock->fl.fl_end = fl_end;
 
@@ -993,8 +995,8 @@ nlmsvc_grant_reply(struct nlm_cookie *cookie, __be32 status)
 		/* Client doesn't want it, just unlock it */
 		nlmsvc_unlink_block(block);
 		fl = &block->b_call->a_args.lock.fl;
-		fl->fl_type = F_UNLCK;
-		error = vfs_lock_file(fl->fl_file, F_SETLK, fl, NULL);
+		fl->c.flc_type = F_UNLCK;
+		error = vfs_lock_file(fl->c.flc_file, F_SETLK, fl, NULL);
 		if (error)
 			pr_warn("lockd: unable to unlock lock rejected by client!\n");
 		break;
diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c
index 32784f508c81..a03220e66ce0 100644
--- a/fs/lockd/svcproc.c
+++ b/fs/lockd/svcproc.c
@@ -77,12 +77,12 @@ nlmsvc_retrieve_args(struct svc_rqst *rqstp, struct nlm_args *argp,
 
 		/* Set up the missing parts of the file_lock structure */
 		mode = lock_to_openmode(&lock->fl);
-		lock->fl.fl_flags = FL_POSIX;
-		lock->fl.fl_file  = file->f_file[mode];
-		lock->fl.fl_pid = current->tgid;
+		lock->fl.c.flc_flags = FL_POSIX;
+		lock->fl.c.flc_file  = file->f_file[mode];
+		lock->fl.c.flc_pid = current->tgid;
 		lock->fl.fl_lmops = &nlmsvc_lock_operations;
 		nlmsvc_locks_init_private(&lock->fl, host, (pid_t)lock->svid);
-		if (!lock->fl.fl_owner) {
+		if (!lock->fl.c.flc_owner) {
 			/* lockowner allocation has failed */
 			nlmsvc_release_host(host);
 			return nlm_lck_denied_nolocks;
@@ -127,7 +127,7 @@ __nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_res *resp)
 	if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file)))
 		return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success;
 
-	test_owner = argp->lock.fl.fl_owner;
+	test_owner = argp->lock.fl.c.flc_owner;
 
 	/* Now check for conflicting locks */
 	resp->status = cast_status(nlmsvc_testlock(rqstp, file, host, &argp->lock, &resp->lock, &resp->cookie));
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index e3b6229e7ae5..9103896164f6 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -73,7 +73,7 @@ static inline unsigned int file_hash(struct nfs_fh *f)
 
 int lock_to_openmode(struct file_lock *lock)
 {
-	return (lock->fl_type == F_WRLCK) ? O_WRONLY : O_RDONLY;
+	return lock_is_write(lock) ? O_WRONLY : O_RDONLY;
 }
 
 /*
@@ -181,18 +181,18 @@ static int nlm_unlock_files(struct nlm_file *file, const struct file_lock *fl)
 	struct file_lock lock;
 
 	locks_init_lock(&lock);
-	lock.fl_type  = F_UNLCK;
+	lock.c.flc_type  = F_UNLCK;
 	lock.fl_start = 0;
 	lock.fl_end   = OFFSET_MAX;
-	lock.fl_owner = fl->fl_owner;
-	lock.fl_pid   = fl->fl_pid;
-	lock.fl_flags = FL_POSIX;
+	lock.c.flc_owner = fl->c.flc_owner;
+	lock.c.flc_pid   = fl->c.flc_pid;
+	lock.c.flc_flags = FL_POSIX;
 
-	lock.fl_file = file->f_file[O_RDONLY];
-	if (lock.fl_file && vfs_lock_file(lock.fl_file, F_SETLK, &lock, NULL))
+	lock.c.flc_file = file->f_file[O_RDONLY];
+	if (lock.c.flc_file && vfs_lock_file(lock.c.flc_file, F_SETLK, &lock, NULL))
 		goto out_err;
-	lock.fl_file = file->f_file[O_WRONLY];
-	if (lock.fl_file && vfs_lock_file(lock.fl_file, F_SETLK, &lock, NULL))
+	lock.c.flc_file = file->f_file[O_WRONLY];
+	if (lock.c.flc_file && vfs_lock_file(lock.c.flc_file, F_SETLK, &lock, NULL))
 		goto out_err;
 	return 0;
 out_err:
@@ -218,14 +218,14 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
 again:
 	file->f_locks = 0;
 	spin_lock(&flctx->flc_lock);
-	list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
+	for_each_file_lock(fl, &flctx->flc_posix) {
 		if (fl->fl_lmops != &nlmsvc_lock_operations)
 			continue;
 
 		/* update current lock count */
 		file->f_locks++;
 
-		lockhost = ((struct nlm_lockowner *)fl->fl_owner)->host;
+		lockhost = ((struct nlm_lockowner *) fl->c.flc_owner)->host;
 		if (match(lockhost, host)) {
 
 			spin_unlock(&flctx->flc_lock);
@@ -272,7 +272,7 @@ nlm_file_inuse(struct nlm_file *file)
 
 	if (flctx && !list_empty_careful(&flctx->flc_posix)) {
 		spin_lock(&flctx->flc_lock);
-		list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
+		for_each_file_lock(fl, &flctx->flc_posix) {
 			if (fl->fl_lmops == &nlmsvc_lock_operations) {
 				spin_unlock(&flctx->flc_lock);
 				return 1;
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 2fb5748dae0c..adfcce2bf11b 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -88,8 +88,8 @@ svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock)
 		return false;
 
 	locks_init_lock(fl);
-	fl->fl_flags = FL_POSIX;
-	fl->fl_type  = F_RDLCK;
+	fl->c.flc_flags = FL_POSIX;
+	fl->c.flc_type  = F_RDLCK;
 	end = start + len - 1;
 	fl->fl_start = s32_to_loff_t(start);
 	if (len == 0 || end < 0)
@@ -107,7 +107,7 @@ svcxdr_encode_holder(struct xdr_stream *xdr, const struct nlm_lock *lock)
 	s32 start, len;
 
 	/* exclusive */
-	if (xdr_stream_encode_bool(xdr, fl->fl_type != F_RDLCK) < 0)
+	if (xdr_stream_encode_bool(xdr, fl->c.flc_type != F_RDLCK) < 0)
 		return false;
 	if (xdr_stream_encode_u32(xdr, lock->svid) < 0)
 		return false;
@@ -164,7 +164,7 @@ nlmsvc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
 	if (!svcxdr_decode_lock(xdr, &argp->lock))
 		return false;
 	if (exclusive)
-		argp->lock.fl.fl_type = F_WRLCK;
+		argp->lock.fl.c.flc_type = F_WRLCK;
 
 	return true;
 }
@@ -184,7 +184,7 @@ nlmsvc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
 	if (!svcxdr_decode_lock(xdr, &argp->lock))
 		return false;
 	if (exclusive)
-		argp->lock.fl.fl_type = F_WRLCK;
+		argp->lock.fl.c.flc_type = F_WRLCK;
 	if (xdr_stream_decode_bool(xdr, &argp->reclaim) < 0)
 		return false;
 	if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
@@ -209,7 +209,7 @@ nlmsvc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
 	if (!svcxdr_decode_lock(xdr, &argp->lock))
 		return false;
 	if (exclusive)
-		argp->lock.fl.fl_type = F_WRLCK;
+		argp->lock.fl.c.flc_type = F_WRLCK;
 
 	return true;
 }
@@ -223,7 +223,7 @@ nlmsvc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
 		return false;
 	if (!svcxdr_decode_lock(xdr, &argp->lock))
 		return false;
-	argp->lock.fl.fl_type = F_UNLCK;
+	argp->lock.fl.c.flc_type = F_UNLCK;
 
 	return true;
 }
diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c
index 5fcbf30cd275..3d28b9c3ed15 100644
--- a/fs/lockd/xdr4.c
+++ b/fs/lockd/xdr4.c
@@ -89,8 +89,8 @@ svcxdr_decode_lock(struct xdr_stream *xdr, struct nlm_lock *lock)
 		return false;
 
 	locks_init_lock(fl);
-	fl->fl_flags = FL_POSIX;
-	fl->fl_type  = F_RDLCK;
+	fl->c.flc_flags = FL_POSIX;
+	fl->c.flc_type  = F_RDLCK;
 	nlm4svc_set_file_lock_range(fl, lock->lock_start, lock->lock_len);
 	return true;
 }
@@ -102,7 +102,7 @@ svcxdr_encode_holder(struct xdr_stream *xdr, const struct nlm_lock *lock)
 	s64 start, len;
 
 	/* exclusive */
-	if (xdr_stream_encode_bool(xdr, fl->fl_type != F_RDLCK) < 0)
+	if (xdr_stream_encode_bool(xdr, fl->c.flc_type != F_RDLCK) < 0)
 		return false;
 	if (xdr_stream_encode_u32(xdr, lock->svid) < 0)
 		return false;
@@ -159,7 +159,7 @@ nlm4svc_decode_testargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
 	if (!svcxdr_decode_lock(xdr, &argp->lock))
 		return false;
 	if (exclusive)
-		argp->lock.fl.fl_type = F_WRLCK;
+		argp->lock.fl.c.flc_type = F_WRLCK;
 
 	return true;
 }
@@ -179,7 +179,7 @@ nlm4svc_decode_lockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
 	if (!svcxdr_decode_lock(xdr, &argp->lock))
 		return false;
 	if (exclusive)
-		argp->lock.fl.fl_type = F_WRLCK;
+		argp->lock.fl.c.flc_type = F_WRLCK;
 	if (xdr_stream_decode_bool(xdr, &argp->reclaim) < 0)
 		return false;
 	if (xdr_stream_decode_u32(xdr, &argp->state) < 0)
@@ -204,7 +204,7 @@ nlm4svc_decode_cancargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
 	if (!svcxdr_decode_lock(xdr, &argp->lock))
 		return false;
 	if (exclusive)
-		argp->lock.fl.fl_type = F_WRLCK;
+		argp->lock.fl.c.flc_type = F_WRLCK;
 
 	return true;
 }
@@ -218,7 +218,7 @@ nlm4svc_decode_unlockargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
 		return false;
 	if (!svcxdr_decode_lock(xdr, &argp->lock))
 		return false;
-	argp->lock.fl.fl_type = F_UNLCK;
+	argp->lock.fl.c.flc_type = F_UNLCK;
 
 	return true;
 }
diff --git a/fs/locks.c b/fs/locks.c
index cc7c117ee192..90c8746874de 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -48,7 +48,6 @@
  * children.
  *
  */
-
 #include <linux/capability.h>
 #include <linux/file.h>
 #include <linux/fdtable.h>
@@ -70,24 +69,28 @@
 
 #include <linux/uaccess.h>
 
-#define IS_POSIX(fl)	(fl->fl_flags & FL_POSIX)
-#define IS_FLOCK(fl)	(fl->fl_flags & FL_FLOCK)
-#define IS_LEASE(fl)	(fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))
-#define IS_OFDLCK(fl)	(fl->fl_flags & FL_OFDLCK)
-#define IS_REMOTELCK(fl)	(fl->fl_pid <= 0)
+static struct file_lock *file_lock(struct file_lock_core *flc)
+{
+	return container_of(flc, struct file_lock, c);
+}
+
+static struct file_lease *file_lease(struct file_lock_core *flc)
+{
+	return container_of(flc, struct file_lease, c);
+}
 
-static bool lease_breaking(struct file_lock *fl)
+static bool lease_breaking(struct file_lease *fl)
 {
-	return fl->fl_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING);
+	return fl->c.flc_flags & (FL_UNLOCK_PENDING | FL_DOWNGRADE_PENDING);
 }
 
-static int target_leasetype(struct file_lock *fl)
+static int target_leasetype(struct file_lease *fl)
 {
-	if (fl->fl_flags & FL_UNLOCK_PENDING)
+	if (fl->c.flc_flags & FL_UNLOCK_PENDING)
 		return F_UNLCK;
-	if (fl->fl_flags & FL_DOWNGRADE_PENDING)
+	if (fl->c.flc_flags & FL_DOWNGRADE_PENDING)
 		return F_RDLCK;
-	return fl->fl_type;
+	return fl->c.flc_type;
 }
 
 static int leases_enable = 1;
@@ -168,6 +171,7 @@ static DEFINE_SPINLOCK(blocked_lock_lock);
 
 static struct kmem_cache *flctx_cache __ro_after_init;
 static struct kmem_cache *filelock_cache __ro_after_init;
+static struct kmem_cache *filelease_cache __ro_after_init;
 
 static struct file_lock_context *
 locks_get_lock_context(struct inode *inode, int type)
@@ -204,11 +208,12 @@ out:
 static void
 locks_dump_ctx_list(struct list_head *list, char *list_type)
 {
-	struct file_lock *fl;
+	struct file_lock_core *flc;
 
-	list_for_each_entry(fl, list, fl_list) {
-		pr_warn("%s: fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n", list_type, fl->fl_owner, fl->fl_flags, fl->fl_type, fl->fl_pid);
-	}
+	list_for_each_entry(flc, list, flc_list)
+		pr_warn("%s: fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n",
+			list_type, flc->flc_owner, flc->flc_flags,
+			flc->flc_type, flc->flc_pid);
 }
 
 static void
@@ -229,19 +234,19 @@ locks_check_ctx_lists(struct inode *inode)
 }
 
 static void
-locks_check_ctx_file_list(struct file *filp, struct list_head *list,
-				char *list_type)
+locks_check_ctx_file_list(struct file *filp, struct list_head *list, char *list_type)
 {
-	struct file_lock *fl;
+	struct file_lock_core *flc;
 	struct inode *inode = file_inode(filp);
 
-	list_for_each_entry(fl, list, fl_list)
-		if (fl->fl_file == filp)
+	list_for_each_entry(flc, list, flc_list)
+		if (flc->flc_file == filp)
 			pr_warn("Leaked %s lock on dev=0x%x:0x%x ino=0x%lx "
 				" fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n",
 				list_type, MAJOR(inode->i_sb->s_dev),
 				MINOR(inode->i_sb->s_dev), inode->i_ino,
-				fl->fl_owner, fl->fl_flags, fl->fl_type, fl->fl_pid);
+				flc->flc_owner, flc->flc_flags,
+				flc->flc_type, flc->flc_pid);
 }
 
 void
@@ -255,13 +260,13 @@ locks_free_lock_context(struct inode *inode)
 	}
 }
 
-static void locks_init_lock_heads(struct file_lock *fl)
+static void locks_init_lock_heads(struct file_lock_core *flc)
 {
-	INIT_HLIST_NODE(&fl->fl_link);
-	INIT_LIST_HEAD(&fl->fl_list);
-	INIT_LIST_HEAD(&fl->fl_blocked_requests);
-	INIT_LIST_HEAD(&fl->fl_blocked_member);
-	init_waitqueue_head(&fl->fl_wait);
+	INIT_HLIST_NODE(&flc->flc_link);
+	INIT_LIST_HEAD(&flc->flc_list);
+	INIT_LIST_HEAD(&flc->flc_blocked_requests);
+	INIT_LIST_HEAD(&flc->flc_blocked_member);
+	init_waitqueue_head(&flc->flc_wait);
 }
 
 /* Allocate an empty lock structure. */
@@ -270,19 +275,33 @@ struct file_lock *locks_alloc_lock(void)
 	struct file_lock *fl = kmem_cache_zalloc(filelock_cache, GFP_KERNEL);
 
 	if (fl)
-		locks_init_lock_heads(fl);
+		locks_init_lock_heads(&fl->c);
 
 	return fl;
 }
 EXPORT_SYMBOL_GPL(locks_alloc_lock);
 
+/* Allocate an empty lock structure. */
+struct file_lease *locks_alloc_lease(void)
+{
+	struct file_lease *fl = kmem_cache_zalloc(filelease_cache, GFP_KERNEL);
+
+	if (fl)
+		locks_init_lock_heads(&fl->c);
+
+	return fl;
+}
+EXPORT_SYMBOL_GPL(locks_alloc_lease);
+
 void locks_release_private(struct file_lock *fl)
 {
-	BUG_ON(waitqueue_active(&fl->fl_wait));
-	BUG_ON(!list_empty(&fl->fl_list));
-	BUG_ON(!list_empty(&fl->fl_blocked_requests));
-	BUG_ON(!list_empty(&fl->fl_blocked_member));
-	BUG_ON(!hlist_unhashed(&fl->fl_link));
+	struct file_lock_core *flc = &fl->c;
+
+	BUG_ON(waitqueue_active(&flc->flc_wait));
+	BUG_ON(!list_empty(&flc->flc_list));
+	BUG_ON(!list_empty(&flc->flc_blocked_requests));
+	BUG_ON(!list_empty(&flc->flc_blocked_member));
+	BUG_ON(!hlist_unhashed(&flc->flc_link));
 
 	if (fl->fl_ops) {
 		if (fl->fl_ops->fl_release_private)
@@ -292,8 +311,8 @@ void locks_release_private(struct file_lock *fl)
 
 	if (fl->fl_lmops) {
 		if (fl->fl_lmops->lm_put_owner) {
-			fl->fl_lmops->lm_put_owner(fl->fl_owner);
-			fl->fl_owner = NULL;
+			fl->fl_lmops->lm_put_owner(flc->flc_owner);
+			flc->flc_owner = NULL;
 		}
 		fl->fl_lmops = NULL;
 	}
@@ -309,16 +328,15 @@ EXPORT_SYMBOL_GPL(locks_release_private);
  *   %true: @owner has at least one blocker
  *   %false: @owner has no blockers
  */
-bool locks_owner_has_blockers(struct file_lock_context *flctx,
-		fl_owner_t owner)
+bool locks_owner_has_blockers(struct file_lock_context *flctx, fl_owner_t owner)
 {
-	struct file_lock *fl;
+	struct file_lock_core *flc;
 
 	spin_lock(&flctx->flc_lock);
-	list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
-		if (fl->fl_owner != owner)
+	list_for_each_entry(flc, &flctx->flc_posix, flc_list) {
+		if (flc->flc_owner != owner)
 			continue;
-		if (!list_empty(&fl->fl_blocked_requests)) {
+		if (!list_empty(&flc->flc_blocked_requests)) {
 			spin_unlock(&flctx->flc_lock);
 			return true;
 		}
@@ -336,35 +354,52 @@ void locks_free_lock(struct file_lock *fl)
 }
 EXPORT_SYMBOL(locks_free_lock);
 
+/* Free a lease which is not in use. */
+void locks_free_lease(struct file_lease *fl)
+{
+	kmem_cache_free(filelease_cache, fl);
+}
+EXPORT_SYMBOL(locks_free_lease);
+
 static void
 locks_dispose_list(struct list_head *dispose)
 {
-	struct file_lock *fl;
+	struct file_lock_core *flc;
 
 	while (!list_empty(dispose)) {
-		fl = list_first_entry(dispose, struct file_lock, fl_list);
-		list_del_init(&fl->fl_list);
-		locks_free_lock(fl);
+		flc = list_first_entry(dispose, struct file_lock_core, flc_list);
+		list_del_init(&flc->flc_list);
+		if (flc->flc_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))
+			locks_free_lease(file_lease(flc));
+		else
+			locks_free_lock(file_lock(flc));
 	}
 }
 
 void locks_init_lock(struct file_lock *fl)
 {
 	memset(fl, 0, sizeof(struct file_lock));
-	locks_init_lock_heads(fl);
+	locks_init_lock_heads(&fl->c);
 }
 EXPORT_SYMBOL(locks_init_lock);
 
+void locks_init_lease(struct file_lease *fl)
+{
+	memset(fl, 0, sizeof(*fl));
+	locks_init_lock_heads(&fl->c);
+}
+EXPORT_SYMBOL(locks_init_lease);
+
 /*
  * Initialize a new lock from an existing file_lock structure.
  */
 void locks_copy_conflock(struct file_lock *new, struct file_lock *fl)
 {
-	new->fl_owner = fl->fl_owner;
-	new->fl_pid = fl->fl_pid;
-	new->fl_file = NULL;
-	new->fl_flags = fl->fl_flags;
-	new->fl_type = fl->fl_type;
+	new->c.flc_owner = fl->c.flc_owner;
+	new->c.flc_pid = fl->c.flc_pid;
+	new->c.flc_file = NULL;
+	new->c.flc_flags = fl->c.flc_flags;
+	new->c.flc_type = fl->c.flc_type;
 	new->fl_start = fl->fl_start;
 	new->fl_end = fl->fl_end;
 	new->fl_lmops = fl->fl_lmops;
@@ -372,7 +407,7 @@ void locks_copy_conflock(struct file_lock *new, struct file_lock *fl)
 
 	if (fl->fl_lmops) {
 		if (fl->fl_lmops->lm_get_owner)
-			fl->fl_lmops->lm_get_owner(fl->fl_owner);
+			fl->fl_lmops->lm_get_owner(fl->c.flc_owner);
 	}
 }
 EXPORT_SYMBOL(locks_copy_conflock);
@@ -384,7 +419,7 @@ void locks_copy_lock(struct file_lock *new, struct file_lock *fl)
 
 	locks_copy_conflock(new, fl);
 
-	new->fl_file = fl->fl_file;
+	new->c.flc_file = fl->c.flc_file;
 	new->fl_ops = fl->fl_ops;
 
 	if (fl->fl_ops) {
@@ -400,15 +435,17 @@ static void locks_move_blocks(struct file_lock *new, struct file_lock *fl)
 
 	/*
 	 * As ctx->flc_lock is held, new requests cannot be added to
-	 * ->fl_blocked_requests, so we don't need a lock to check if it
+	 * ->flc_blocked_requests, so we don't need a lock to check if it
 	 * is empty.
 	 */
-	if (list_empty(&fl->fl_blocked_requests))
+	if (list_empty(&fl->c.flc_blocked_requests))
 		return;
 	spin_lock(&blocked_lock_lock);
-	list_splice_init(&fl->fl_blocked_requests, &new->fl_blocked_requests);
-	list_for_each_entry(f, &new->fl_blocked_requests, fl_blocked_member)
-		f->fl_blocker = new;
+	list_splice_init(&fl->c.flc_blocked_requests,
+			 &new->c.flc_blocked_requests);
+	list_for_each_entry(f, &new->c.flc_blocked_requests,
+			    c.flc_blocked_member)
+		f->c.flc_blocker = &new->c;
 	spin_unlock(&blocked_lock_lock);
 }
 
@@ -429,21 +466,21 @@ static void flock_make_lock(struct file *filp, struct file_lock *fl, int type)
 {
 	locks_init_lock(fl);
 
-	fl->fl_file = filp;
-	fl->fl_owner = filp;
-	fl->fl_pid = current->tgid;
-	fl->fl_flags = FL_FLOCK;
-	fl->fl_type = type;
+	fl->c.flc_file = filp;
+	fl->c.flc_owner = filp;
+	fl->c.flc_pid = current->tgid;
+	fl->c.flc_flags = FL_FLOCK;
+	fl->c.flc_type = type;
 	fl->fl_end = OFFSET_MAX;
 }
 
-static int assign_type(struct file_lock *fl, int type)
+static int assign_type(struct file_lock_core *flc, int type)
 {
 	switch (type) {
 	case F_RDLCK:
 	case F_WRLCK:
 	case F_UNLCK:
-		fl->fl_type = type;
+		flc->flc_type = type;
 		break;
 	default:
 		return -EINVAL;
@@ -488,14 +525,14 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
 	} else
 		fl->fl_end = OFFSET_MAX;
 
-	fl->fl_owner = current->files;
-	fl->fl_pid = current->tgid;
-	fl->fl_file = filp;
-	fl->fl_flags = FL_POSIX;
+	fl->c.flc_owner = current->files;
+	fl->c.flc_pid = current->tgid;
+	fl->c.flc_file = filp;
+	fl->c.flc_flags = FL_POSIX;
 	fl->fl_ops = NULL;
 	fl->fl_lmops = NULL;
 
-	return assign_type(fl, l->l_type);
+	return assign_type(&fl->c, l->l_type);
 }
 
 /* Verify a "struct flock" and copy it to a "struct file_lock" as a POSIX
@@ -516,16 +553,16 @@ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl,
 
 /* default lease lock manager operations */
 static bool
-lease_break_callback(struct file_lock *fl)
+lease_break_callback(struct file_lease *fl)
 {
 	kill_fasync(&fl->fl_fasync, SIGIO, POLL_MSG);
 	return false;
 }
 
 static void
-lease_setup(struct file_lock *fl, void **priv)
+lease_setup(struct file_lease *fl, void **priv)
 {
-	struct file *filp = fl->fl_file;
+	struct file *filp = fl->c.flc_file;
 	struct fasync_struct *fa = *priv;
 
 	/*
@@ -539,7 +576,7 @@ lease_setup(struct file_lock *fl, void **priv)
 	__f_setown(filp, task_pid(current), PIDTYPE_TGID, 0);
 }
 
-static const struct lock_manager_operations lease_manager_ops = {
+static const struct lease_manager_operations lease_manager_ops = {
 	.lm_break = lease_break_callback,
 	.lm_change = lease_modify,
 	.lm_setup = lease_setup,
@@ -548,27 +585,24 @@ static const struct lock_manager_operations lease_manager_ops = {
 /*
  * Initialize a lease, use the default lock manager operations
  */
-static int lease_init(struct file *filp, int type, struct file_lock *fl)
+static int lease_init(struct file *filp, int type, struct file_lease *fl)
 {
-	if (assign_type(fl, type) != 0)
+	if (assign_type(&fl->c, type) != 0)
 		return -EINVAL;
 
-	fl->fl_owner = filp;
-	fl->fl_pid = current->tgid;
+	fl->c.flc_owner = filp;
+	fl->c.flc_pid = current->tgid;
 
-	fl->fl_file = filp;
-	fl->fl_flags = FL_LEASE;
-	fl->fl_start = 0;
-	fl->fl_end = OFFSET_MAX;
-	fl->fl_ops = NULL;
+	fl->c.flc_file = filp;
+	fl->c.flc_flags = FL_LEASE;
 	fl->fl_lmops = &lease_manager_ops;
 	return 0;
 }
 
 /* Allocate a file_lock initialised to this type of lease */
-static struct file_lock *lease_alloc(struct file *filp, int type)
+static struct file_lease *lease_alloc(struct file *filp, int type)
 {
-	struct file_lock *fl = locks_alloc_lock();
+	struct file_lease *fl = locks_alloc_lease();
 	int error = -ENOMEM;
 
 	if (fl == NULL)
@@ -576,7 +610,7 @@ static struct file_lock *lease_alloc(struct file *filp, int type)
 
 	error = lease_init(filp, type, fl);
 	if (error) {
-		locks_free_lock(fl);
+		locks_free_lease(fl);
 		return ERR_PTR(error);
 	}
 	return fl;
@@ -593,26 +627,26 @@ static inline int locks_overlap(struct file_lock *fl1, struct file_lock *fl2)
 /*
  * Check whether two locks have the same owner.
  */
-static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
+static int posix_same_owner(struct file_lock_core *fl1, struct file_lock_core *fl2)
 {
-	return fl1->fl_owner == fl2->fl_owner;
+	return fl1->flc_owner == fl2->flc_owner;
 }
 
 /* Must be called with the flc_lock held! */
-static void locks_insert_global_locks(struct file_lock *fl)
+static void locks_insert_global_locks(struct file_lock_core *flc)
 {
 	struct file_lock_list_struct *fll = this_cpu_ptr(&file_lock_list);
 
 	percpu_rwsem_assert_held(&file_rwsem);
 
 	spin_lock(&fll->lock);
-	fl->fl_link_cpu = smp_processor_id();
-	hlist_add_head(&fl->fl_link, &fll->hlist);
+	flc->flc_link_cpu = smp_processor_id();
+	hlist_add_head(&flc->flc_link, &fll->hlist);
 	spin_unlock(&fll->lock);
 }
 
 /* Must be called with the flc_lock held! */
-static void locks_delete_global_locks(struct file_lock *fl)
+static void locks_delete_global_locks(struct file_lock_core *flc)
 {
 	struct file_lock_list_struct *fll;
 
@@ -623,33 +657,33 @@ static void locks_delete_global_locks(struct file_lock *fl)
 	 * is done while holding the flc_lock, and new insertions into the list
 	 * also require that it be held.
 	 */
-	if (hlist_unhashed(&fl->fl_link))
+	if (hlist_unhashed(&flc->flc_link))
 		return;
 
-	fll = per_cpu_ptr(&file_lock_list, fl->fl_link_cpu);
+	fll = per_cpu_ptr(&file_lock_list, flc->flc_link_cpu);
 	spin_lock(&fll->lock);
-	hlist_del_init(&fl->fl_link);
+	hlist_del_init(&flc->flc_link);
 	spin_unlock(&fll->lock);
 }
 
 static unsigned long
-posix_owner_key(struct file_lock *fl)
+posix_owner_key(struct file_lock_core *flc)
 {
-	return (unsigned long)fl->fl_owner;
+	return (unsigned long) flc->flc_owner;
 }
 
-static void locks_insert_global_blocked(struct file_lock *waiter)
+static void locks_insert_global_blocked(struct file_lock_core *waiter)
 {
 	lockdep_assert_held(&blocked_lock_lock);
 
-	hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter));
+	hash_add(blocked_hash, &waiter->flc_link, posix_owner_key(waiter));
 }
 
-static void locks_delete_global_blocked(struct file_lock *waiter)
+static void locks_delete_global_blocked(struct file_lock_core *waiter)
 {
 	lockdep_assert_held(&blocked_lock_lock);
 
-	hash_del(&waiter->fl_link);
+	hash_del(&waiter->flc_link);
 }
 
 /* Remove waiter from blocker's block list.
@@ -657,41 +691,39 @@ static void locks_delete_global_blocked(struct file_lock *waiter)
  *
  * Must be called with blocked_lock_lock held.
  */
-static void __locks_delete_block(struct file_lock *waiter)
+static void __locks_unlink_block(struct file_lock_core *waiter)
 {
 	locks_delete_global_blocked(waiter);
-	list_del_init(&waiter->fl_blocked_member);
+	list_del_init(&waiter->flc_blocked_member);
 }
 
-static void __locks_wake_up_blocks(struct file_lock *blocker)
+static void __locks_wake_up_blocks(struct file_lock_core *blocker)
 {
-	while (!list_empty(&blocker->fl_blocked_requests)) {
-		struct file_lock *waiter;
+	while (!list_empty(&blocker->flc_blocked_requests)) {
+		struct file_lock_core *waiter;
+		struct file_lock *fl;
 
-		waiter = list_first_entry(&blocker->fl_blocked_requests,
-					  struct file_lock, fl_blocked_member);
-		__locks_delete_block(waiter);
-		if (waiter->fl_lmops && waiter->fl_lmops->lm_notify)
-			waiter->fl_lmops->lm_notify(waiter);
+		waiter = list_first_entry(&blocker->flc_blocked_requests,
+					  struct file_lock_core, flc_blocked_member);
+
+		fl = file_lock(waiter);
+		__locks_unlink_block(waiter);
+		if ((waiter->flc_flags & (FL_POSIX | FL_FLOCK)) &&
+		    fl->fl_lmops && fl->fl_lmops->lm_notify)
+			fl->fl_lmops->lm_notify(fl);
 		else
-			wake_up(&waiter->fl_wait);
+			locks_wake_up(fl);
 
 		/*
-		 * The setting of fl_blocker to NULL marks the "done"
+		 * The setting of flc_blocker to NULL marks the "done"
 		 * point in deleting a block. Paired with acquire at the top
 		 * of locks_delete_block().
 		 */
-		smp_store_release(&waiter->fl_blocker, NULL);
+		smp_store_release(&waiter->flc_blocker, NULL);
 	}
 }
 
-/**
- *	locks_delete_block - stop waiting for a file lock
- *	@waiter: the lock which was waiting
- *
- *	lockd/nfsd need to disconnect the lock while working on it.
- */
-int locks_delete_block(struct file_lock *waiter)
+static int __locks_delete_block(struct file_lock_core *waiter)
 {
 	int status = -ENOENT;
 
@@ -716,24 +748,35 @@ int locks_delete_block(struct file_lock *waiter)
 	 * no new locks can be inserted into its fl_blocked_requests list, and
 	 * can avoid doing anything further if the list is empty.
 	 */
-	if (!smp_load_acquire(&waiter->fl_blocker) &&
-	    list_empty(&waiter->fl_blocked_requests))
+	if (!smp_load_acquire(&waiter->flc_blocker) &&
+	    list_empty(&waiter->flc_blocked_requests))
 		return status;
 
 	spin_lock(&blocked_lock_lock);
-	if (waiter->fl_blocker)
+	if (waiter->flc_blocker)
 		status = 0;
 	__locks_wake_up_blocks(waiter);
-	__locks_delete_block(waiter);
+	__locks_unlink_block(waiter);
 
 	/*
 	 * The setting of fl_blocker to NULL marks the "done" point in deleting
 	 * a block. Paired with acquire at the top of this function.
 	 */
-	smp_store_release(&waiter->fl_blocker, NULL);
+	smp_store_release(&waiter->flc_blocker, NULL);
 	spin_unlock(&blocked_lock_lock);
 	return status;
 }
+
+/**
+ *	locks_delete_block - stop waiting for a file lock
+ *	@waiter: the lock which was waiting
+ *
+ *	lockd/nfsd need to disconnect the lock while working on it.
+ */
+int locks_delete_block(struct file_lock *waiter)
+{
+	return __locks_delete_block(&waiter->c);
+}
 EXPORT_SYMBOL(locks_delete_block);
 
 /* Insert waiter into blocker's block list.
@@ -751,26 +794,28 @@ EXPORT_SYMBOL(locks_delete_block);
  * waiters, and add beneath any waiter that blocks the new waiter.
  * Thus wakeups don't happen until needed.
  */
-static void __locks_insert_block(struct file_lock *blocker,
-				 struct file_lock *waiter,
-				 bool conflict(struct file_lock *,
-					       struct file_lock *))
+static void __locks_insert_block(struct file_lock_core *blocker,
+				 struct file_lock_core *waiter,
+				 bool conflict(struct file_lock_core *,
+					       struct file_lock_core *))
 {
-	struct file_lock *fl;
-	BUG_ON(!list_empty(&waiter->fl_blocked_member));
+	struct file_lock_core *flc;
 
+	BUG_ON(!list_empty(&waiter->flc_blocked_member));
 new_blocker:
-	list_for_each_entry(fl, &blocker->fl_blocked_requests, fl_blocked_member)
-		if (conflict(fl, waiter)) {
-			blocker =  fl;
+	list_for_each_entry(flc, &blocker->flc_blocked_requests, flc_blocked_member)
+		if (conflict(flc, waiter)) {
+			blocker =  flc;
 			goto new_blocker;
 		}
-	waiter->fl_blocker = blocker;
-	list_add_tail(&waiter->fl_blocked_member, &blocker->fl_blocked_requests);
-	if (IS_POSIX(blocker) && !IS_OFDLCK(blocker))
+	waiter->flc_blocker = blocker;
+	list_add_tail(&waiter->flc_blocked_member,
+		      &blocker->flc_blocked_requests);
+
+	if ((blocker->flc_flags & (FL_POSIX|FL_OFDLCK)) == FL_POSIX)
 		locks_insert_global_blocked(waiter);
 
-	/* The requests in waiter->fl_blocked are known to conflict with
+	/* The requests in waiter->flc_blocked are known to conflict with
 	 * waiter, but might not conflict with blocker, or the requests
 	 * and lock which block it.  So they all need to be woken.
 	 */
@@ -778,10 +823,10 @@ new_blocker:
 }
 
 /* Must be called with flc_lock held. */
-static void locks_insert_block(struct file_lock *blocker,
-			       struct file_lock *waiter,
-			       bool conflict(struct file_lock *,
-					     struct file_lock *))
+static void locks_insert_block(struct file_lock_core *blocker,
+			       struct file_lock_core *waiter,
+			       bool conflict(struct file_lock_core *,
+					     struct file_lock_core *))
 {
 	spin_lock(&blocked_lock_lock);
 	__locks_insert_block(blocker, waiter, conflict);
@@ -793,7 +838,7 @@ static void locks_insert_block(struct file_lock *blocker,
  *
  * Must be called with the inode->flc_lock held!
  */
-static void locks_wake_up_blocks(struct file_lock *blocker)
+static void locks_wake_up_blocks(struct file_lock_core *blocker)
 {
 	/*
 	 * Avoid taking global lock if list is empty. This is safe since new
@@ -802,7 +847,7 @@ static void locks_wake_up_blocks(struct file_lock *blocker)
 	 * fl_blocked_requests list does not require the flc_lock, so we must
 	 * recheck list_empty() after acquiring the blocked_lock_lock.
 	 */
-	if (list_empty(&blocker->fl_blocked_requests))
+	if (list_empty(&blocker->flc_blocked_requests))
 		return;
 
 	spin_lock(&blocked_lock_lock);
@@ -811,39 +856,39 @@ static void locks_wake_up_blocks(struct file_lock *blocker)
 }
 
 static void
-locks_insert_lock_ctx(struct file_lock *fl, struct list_head *before)
+locks_insert_lock_ctx(struct file_lock_core *fl, struct list_head *before)
 {
-	list_add_tail(&fl->fl_list, before);
+	list_add_tail(&fl->flc_list, before);
 	locks_insert_global_locks(fl);
 }
 
 static void
-locks_unlink_lock_ctx(struct file_lock *fl)
+locks_unlink_lock_ctx(struct file_lock_core *fl)
 {
 	locks_delete_global_locks(fl);
-	list_del_init(&fl->fl_list);
+	list_del_init(&fl->flc_list);
 	locks_wake_up_blocks(fl);
 }
 
 static void
-locks_delete_lock_ctx(struct file_lock *fl, struct list_head *dispose)
+locks_delete_lock_ctx(struct file_lock_core *fl, struct list_head *dispose)
 {
 	locks_unlink_lock_ctx(fl);
 	if (dispose)
-		list_add(&fl->fl_list, dispose);
+		list_add(&fl->flc_list, dispose);
 	else
-		locks_free_lock(fl);
+		locks_free_lock(file_lock(fl));
 }
 
 /* Determine if lock sys_fl blocks lock caller_fl. Common functionality
  * checks for shared/exclusive status of overlapping locks.
  */
-static bool locks_conflict(struct file_lock *caller_fl,
-			   struct file_lock *sys_fl)
+static bool locks_conflict(struct file_lock_core *caller_flc,
+			   struct file_lock_core *sys_flc)
 {
-	if (sys_fl->fl_type == F_WRLCK)
+	if (sys_flc->flc_type == F_WRLCK)
 		return true;
-	if (caller_fl->fl_type == F_WRLCK)
+	if (caller_flc->flc_type == F_WRLCK)
 		return true;
 	return false;
 }
@@ -851,20 +896,23 @@ static bool locks_conflict(struct file_lock *caller_fl,
 /* Determine if lock sys_fl blocks lock caller_fl. POSIX specific
  * checking before calling the locks_conflict().
  */
-static bool posix_locks_conflict(struct file_lock *caller_fl,
-				 struct file_lock *sys_fl)
+static bool posix_locks_conflict(struct file_lock_core *caller_flc,
+				 struct file_lock_core *sys_flc)
 {
+	struct file_lock *caller_fl = file_lock(caller_flc);
+	struct file_lock *sys_fl = file_lock(sys_flc);
+
 	/* POSIX locks owned by the same process do not conflict with
 	 * each other.
 	 */
-	if (posix_same_owner(caller_fl, sys_fl))
+	if (posix_same_owner(caller_flc, sys_flc))
 		return false;
 
 	/* Check whether they overlap */
 	if (!locks_overlap(caller_fl, sys_fl))
 		return false;
 
-	return locks_conflict(caller_fl, sys_fl);
+	return locks_conflict(caller_flc, sys_flc);
 }
 
 /* Determine if lock sys_fl blocks lock caller_fl. Used on xx_GETLK
@@ -873,28 +921,31 @@ static bool posix_locks_conflict(struct file_lock *caller_fl,
 static bool posix_test_locks_conflict(struct file_lock *caller_fl,
 				      struct file_lock *sys_fl)
 {
+	struct file_lock_core *caller = &caller_fl->c;
+	struct file_lock_core *sys = &sys_fl->c;
+
 	/* F_UNLCK checks any locks on the same fd. */
-	if (caller_fl->fl_type == F_UNLCK) {
-		if (!posix_same_owner(caller_fl, sys_fl))
+	if (lock_is_unlock(caller_fl)) {
+		if (!posix_same_owner(caller, sys))
 			return false;
 		return locks_overlap(caller_fl, sys_fl);
 	}
-	return posix_locks_conflict(caller_fl, sys_fl);
+	return posix_locks_conflict(caller, sys);
 }
 
 /* Determine if lock sys_fl blocks lock caller_fl. FLOCK specific
  * checking before calling the locks_conflict().
  */
-static bool flock_locks_conflict(struct file_lock *caller_fl,
-				 struct file_lock *sys_fl)
+static bool flock_locks_conflict(struct file_lock_core *caller_flc,
+				 struct file_lock_core *sys_flc)
 {
 	/* FLOCK locks referring to the same filp do not conflict with
 	 * each other.
 	 */
-	if (caller_fl->fl_file == sys_fl->fl_file)
+	if (caller_flc->flc_file == sys_flc->flc_file)
 		return false;
 
-	return locks_conflict(caller_fl, sys_fl);
+	return locks_conflict(caller_flc, sys_flc);
 }
 
 void
@@ -908,13 +959,13 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
 
 	ctx = locks_inode_context(inode);
 	if (!ctx || list_empty_careful(&ctx->flc_posix)) {
-		fl->fl_type = F_UNLCK;
+		fl->c.flc_type = F_UNLCK;
 		return;
 	}
 
 retry:
 	spin_lock(&ctx->flc_lock);
-	list_for_each_entry(cfl, &ctx->flc_posix, fl_list) {
+	list_for_each_entry(cfl, &ctx->flc_posix, c.flc_list) {
 		if (!posix_test_locks_conflict(fl, cfl))
 			continue;
 		if (cfl->fl_lmops && cfl->fl_lmops->lm_lock_expirable
@@ -930,7 +981,7 @@ retry:
 		locks_copy_conflock(fl, cfl);
 		goto out;
 	}
-	fl->fl_type = F_UNLCK;
+	fl->c.flc_type = F_UNLCK;
 out:
 	spin_unlock(&ctx->flc_lock);
 	return;
@@ -972,25 +1023,27 @@ EXPORT_SYMBOL(posix_test_lock);
 
 #define MAX_DEADLK_ITERATIONS 10
 
-/* Find a lock that the owner of the given block_fl is blocking on. */
-static struct file_lock *what_owner_is_waiting_for(struct file_lock *block_fl)
+/* Find a lock that the owner of the given @blocker is blocking on. */
+static struct file_lock_core *what_owner_is_waiting_for(struct file_lock_core *blocker)
 {
-	struct file_lock *fl;
+	struct file_lock_core *flc;
 
-	hash_for_each_possible(blocked_hash, fl, fl_link, posix_owner_key(block_fl)) {
-		if (posix_same_owner(fl, block_fl)) {
-			while (fl->fl_blocker)
-				fl = fl->fl_blocker;
-			return fl;
+	hash_for_each_possible(blocked_hash, flc, flc_link, posix_owner_key(blocker)) {
+		if (posix_same_owner(flc, blocker)) {
+			while (flc->flc_blocker)
+				flc = flc->flc_blocker;
+			return flc;
 		}
 	}
 	return NULL;
 }
 
 /* Must be called with the blocked_lock_lock held! */
-static int posix_locks_deadlock(struct file_lock *caller_fl,
-				struct file_lock *block_fl)
+static bool posix_locks_deadlock(struct file_lock *caller_fl,
+				 struct file_lock *block_fl)
 {
+	struct file_lock_core *caller = &caller_fl->c;
+	struct file_lock_core *blocker = &block_fl->c;
 	int i = 0;
 
 	lockdep_assert_held(&blocked_lock_lock);
@@ -999,16 +1052,16 @@ static int posix_locks_deadlock(struct file_lock *caller_fl,
 	 * This deadlock detector can't reasonably detect deadlocks with
 	 * FL_OFDLCK locks, since they aren't owned by a process, per-se.
 	 */
-	if (IS_OFDLCK(caller_fl))
-		return 0;
+	if (caller->flc_flags & FL_OFDLCK)
+		return false;
 
-	while ((block_fl = what_owner_is_waiting_for(block_fl))) {
+	while ((blocker = what_owner_is_waiting_for(blocker))) {
 		if (i++ > MAX_DEADLK_ITERATIONS)
-			return 0;
-		if (posix_same_owner(caller_fl, block_fl))
-			return 1;
+			return false;
+		if (posix_same_owner(caller, blocker))
+			return true;
 	}
-	return 0;
+	return false;
 }
 
 /* Try to create a FLOCK lock on filp. We always insert new FLOCK locks
@@ -1027,14 +1080,14 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
 	bool found = false;
 	LIST_HEAD(dispose);
 
-	ctx = locks_get_lock_context(inode, request->fl_type);
+	ctx = locks_get_lock_context(inode, request->c.flc_type);
 	if (!ctx) {
-		if (request->fl_type != F_UNLCK)
+		if (request->c.flc_type != F_UNLCK)
 			return -ENOMEM;
-		return (request->fl_flags & FL_EXISTS) ? -ENOENT : 0;
+		return (request->c.flc_flags & FL_EXISTS) ? -ENOENT : 0;
 	}
 
-	if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) {
+	if (!(request->c.flc_flags & FL_ACCESS) && (request->c.flc_type != F_UNLCK)) {
 		new_fl = locks_alloc_lock();
 		if (!new_fl)
 			return -ENOMEM;
@@ -1042,41 +1095,41 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
 
 	percpu_down_read(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
-	if (request->fl_flags & FL_ACCESS)
+	if (request->c.flc_flags & FL_ACCESS)
 		goto find_conflict;
 
-	list_for_each_entry(fl, &ctx->flc_flock, fl_list) {
-		if (request->fl_file != fl->fl_file)
+	list_for_each_entry(fl, &ctx->flc_flock, c.flc_list) {
+		if (request->c.flc_file != fl->c.flc_file)
 			continue;
-		if (request->fl_type == fl->fl_type)
+		if (request->c.flc_type == fl->c.flc_type)
 			goto out;
 		found = true;
-		locks_delete_lock_ctx(fl, &dispose);
+		locks_delete_lock_ctx(&fl->c, &dispose);
 		break;
 	}
 
-	if (request->fl_type == F_UNLCK) {
-		if ((request->fl_flags & FL_EXISTS) && !found)
+	if (lock_is_unlock(request)) {
+		if ((request->c.flc_flags & FL_EXISTS) && !found)
 			error = -ENOENT;
 		goto out;
 	}
 
 find_conflict:
-	list_for_each_entry(fl, &ctx->flc_flock, fl_list) {
-		if (!flock_locks_conflict(request, fl))
+	list_for_each_entry(fl, &ctx->flc_flock, c.flc_list) {
+		if (!flock_locks_conflict(&request->c, &fl->c))
 			continue;
 		error = -EAGAIN;
-		if (!(request->fl_flags & FL_SLEEP))
+		if (!(request->c.flc_flags & FL_SLEEP))
 			goto out;
 		error = FILE_LOCK_DEFERRED;
-		locks_insert_block(fl, request, flock_locks_conflict);
+		locks_insert_block(&fl->c, &request->c, flock_locks_conflict);
 		goto out;
 	}
-	if (request->fl_flags & FL_ACCESS)
+	if (request->c.flc_flags & FL_ACCESS)
 		goto out;
 	locks_copy_lock(new_fl, request);
 	locks_move_blocks(new_fl, request);
-	locks_insert_lock_ctx(new_fl, &ctx->flc_flock);
+	locks_insert_lock_ctx(&new_fl->c, &ctx->flc_flock);
 	new_fl = NULL;
 	error = 0;
 
@@ -1105,9 +1158,9 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
 	void *owner;
 	void (*func)(void);
 
-	ctx = locks_get_lock_context(inode, request->fl_type);
+	ctx = locks_get_lock_context(inode, request->c.flc_type);
 	if (!ctx)
-		return (request->fl_type == F_UNLCK) ? 0 : -ENOMEM;
+		return lock_is_unlock(request) ? 0 : -ENOMEM;
 
 	/*
 	 * We may need two file_lock structures for this operation,
@@ -1115,8 +1168,8 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
 	 *
 	 * In some cases we can be sure, that no new locks will be needed
 	 */
-	if (!(request->fl_flags & FL_ACCESS) &&
-	    (request->fl_type != F_UNLCK ||
+	if (!(request->c.flc_flags & FL_ACCESS) &&
+	    (request->c.flc_type != F_UNLCK ||
 	     request->fl_start != 0 || request->fl_end != OFFSET_MAX)) {
 		new_fl = locks_alloc_lock();
 		new_fl2 = locks_alloc_lock();
@@ -1130,9 +1183,9 @@ retry:
 	 * there are any, either return error or put the request on the
 	 * blocker's list of waiters and the global blocked_hash.
 	 */
-	if (request->fl_type != F_UNLCK) {
-		list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
-			if (!posix_locks_conflict(request, fl))
+	if (request->c.flc_type != F_UNLCK) {
+		list_for_each_entry(fl, &ctx->flc_posix, c.flc_list) {
+			if (!posix_locks_conflict(&request->c, &fl->c))
 				continue;
 			if (fl->fl_lmops && fl->fl_lmops->lm_lock_expirable
 				&& (*fl->fl_lmops->lm_lock_expirable)(fl)) {
@@ -1148,7 +1201,7 @@ retry:
 			if (conflock)
 				locks_copy_conflock(conflock, fl);
 			error = -EAGAIN;
-			if (!(request->fl_flags & FL_SLEEP))
+			if (!(request->c.flc_flags & FL_SLEEP))
 				goto out;
 			/*
 			 * Deadlock detection and insertion into the blocked
@@ -1160,10 +1213,10 @@ retry:
 			 * Ensure that we don't find any locks blocked on this
 			 * request during deadlock detection.
 			 */
-			__locks_wake_up_blocks(request);
+			__locks_wake_up_blocks(&request->c);
 			if (likely(!posix_locks_deadlock(request, fl))) {
 				error = FILE_LOCK_DEFERRED;
-				__locks_insert_block(fl, request,
+				__locks_insert_block(&fl->c, &request->c,
 						     posix_locks_conflict);
 			}
 			spin_unlock(&blocked_lock_lock);
@@ -1173,22 +1226,22 @@ retry:
 
 	/* If we're just looking for a conflict, we're done. */
 	error = 0;
-	if (request->fl_flags & FL_ACCESS)
+	if (request->c.flc_flags & FL_ACCESS)
 		goto out;
 
 	/* Find the first old lock with the same owner as the new lock */
-	list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
-		if (posix_same_owner(request, fl))
+	list_for_each_entry(fl, &ctx->flc_posix, c.flc_list) {
+		if (posix_same_owner(&request->c, &fl->c))
 			break;
 	}
 
 	/* Process locks with this owner. */
-	list_for_each_entry_safe_from(fl, tmp, &ctx->flc_posix, fl_list) {
-		if (!posix_same_owner(request, fl))
+	list_for_each_entry_safe_from(fl, tmp, &ctx->flc_posix, c.flc_list) {
+		if (!posix_same_owner(&request->c, &fl->c))
 			break;
 
 		/* Detect adjacent or overlapping regions (if same lock type) */
-		if (request->fl_type == fl->fl_type) {
+		if (request->c.flc_type == fl->c.flc_type) {
 			/* In all comparisons of start vs end, use
 			 * "start - 1" rather than "end + 1". If end
 			 * is OFFSET_MAX, end + 1 will become negative.
@@ -1215,7 +1268,7 @@ retry:
 			else
 				request->fl_end = fl->fl_end;
 			if (added) {
-				locks_delete_lock_ctx(fl, &dispose);
+				locks_delete_lock_ctx(&fl->c, &dispose);
 				continue;
 			}
 			request = fl;
@@ -1228,7 +1281,7 @@ retry:
 				continue;
 			if (fl->fl_start > request->fl_end)
 				break;
-			if (request->fl_type == F_UNLCK)
+			if (lock_is_unlock(request))
 				added = true;
 			if (fl->fl_start < request->fl_start)
 				left = fl;
@@ -1244,7 +1297,7 @@ retry:
 				 * one (This may happen several times).
 				 */
 				if (added) {
-					locks_delete_lock_ctx(fl, &dispose);
+					locks_delete_lock_ctx(&fl->c, &dispose);
 					continue;
 				}
 				/*
@@ -1261,8 +1314,9 @@ retry:
 				locks_move_blocks(new_fl, request);
 				request = new_fl;
 				new_fl = NULL;
-				locks_insert_lock_ctx(request, &fl->fl_list);
-				locks_delete_lock_ctx(fl, &dispose);
+				locks_insert_lock_ctx(&request->c,
+						      &fl->c.flc_list);
+				locks_delete_lock_ctx(&fl->c, &dispose);
 				added = true;
 			}
 		}
@@ -1279,8 +1333,8 @@ retry:
 
 	error = 0;
 	if (!added) {
-		if (request->fl_type == F_UNLCK) {
-			if (request->fl_flags & FL_EXISTS)
+		if (lock_is_unlock(request)) {
+			if (request->c.flc_flags & FL_EXISTS)
 				error = -ENOENT;
 			goto out;
 		}
@@ -1291,7 +1345,7 @@ retry:
 		}
 		locks_copy_lock(new_fl, request);
 		locks_move_blocks(new_fl, request);
-		locks_insert_lock_ctx(new_fl, &fl->fl_list);
+		locks_insert_lock_ctx(&new_fl->c, &fl->c.flc_list);
 		fl = new_fl;
 		new_fl = NULL;
 	}
@@ -1303,14 +1357,14 @@ retry:
 			left = new_fl2;
 			new_fl2 = NULL;
 			locks_copy_lock(left, right);
-			locks_insert_lock_ctx(left, &fl->fl_list);
+			locks_insert_lock_ctx(&left->c, &fl->c.flc_list);
 		}
 		right->fl_start = request->fl_end + 1;
-		locks_wake_up_blocks(right);
+		locks_wake_up_blocks(&right->c);
 	}
 	if (left) {
 		left->fl_end = request->fl_start - 1;
-		locks_wake_up_blocks(left);
+		locks_wake_up_blocks(&left->c);
 	}
  out:
 	spin_unlock(&ctx->flc_lock);
@@ -1364,8 +1418,8 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
 		error = posix_lock_inode(inode, fl, NULL);
 		if (error != FILE_LOCK_DEFERRED)
 			break;
-		error = wait_event_interruptible(fl->fl_wait,
-					list_empty(&fl->fl_blocked_member));
+		error = wait_event_interruptible(fl->c.flc_wait,
+						 list_empty(&fl->c.flc_blocked_member));
 		if (error)
 			break;
 	}
@@ -1373,37 +1427,37 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
 	return error;
 }
 
-static void lease_clear_pending(struct file_lock *fl, int arg)
+static void lease_clear_pending(struct file_lease *fl, int arg)
 {
 	switch (arg) {
 	case F_UNLCK:
-		fl->fl_flags &= ~FL_UNLOCK_PENDING;
+		fl->c.flc_flags &= ~FL_UNLOCK_PENDING;
 		fallthrough;
 	case F_RDLCK:
-		fl->fl_flags &= ~FL_DOWNGRADE_PENDING;
+		fl->c.flc_flags &= ~FL_DOWNGRADE_PENDING;
 	}
 }
 
 /* We already had a lease on this file; just change its type */
-int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose)
+int lease_modify(struct file_lease *fl, int arg, struct list_head *dispose)
 {
-	int error = assign_type(fl, arg);
+	int error = assign_type(&fl->c, arg);
 
 	if (error)
 		return error;
 	lease_clear_pending(fl, arg);
-	locks_wake_up_blocks(fl);
+	locks_wake_up_blocks(&fl->c);
 	if (arg == F_UNLCK) {
-		struct file *filp = fl->fl_file;
+		struct file *filp = fl->c.flc_file;
 
 		f_delown(filp);
 		filp->f_owner.signum = 0;
-		fasync_helper(0, fl->fl_file, 0, &fl->fl_fasync);
+		fasync_helper(0, fl->c.flc_file, 0, &fl->fl_fasync);
 		if (fl->fl_fasync != NULL) {
 			printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync);
 			fl->fl_fasync = NULL;
 		}
-		locks_delete_lock_ctx(fl, dispose);
+		locks_delete_lock_ctx(&fl->c, dispose);
 	}
 	return 0;
 }
@@ -1420,11 +1474,11 @@ static bool past_time(unsigned long then)
 static void time_out_leases(struct inode *inode, struct list_head *dispose)
 {
 	struct file_lock_context *ctx = inode->i_flctx;
-	struct file_lock *fl, *tmp;
+	struct file_lease *fl, *tmp;
 
 	lockdep_assert_held(&ctx->flc_lock);
 
-	list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) {
+	list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, c.flc_list) {
 		trace_time_out_leases(inode, fl);
 		if (past_time(fl->fl_downgrade_time))
 			lease_modify(fl, F_RDLCK, dispose);
@@ -1433,38 +1487,40 @@ static void time_out_leases(struct inode *inode, struct list_head *dispose)
 	}
 }
 
-static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
+static bool leases_conflict(struct file_lock_core *lc, struct file_lock_core *bc)
 {
 	bool rc;
+	struct file_lease *lease = file_lease(lc);
+	struct file_lease *breaker = file_lease(bc);
 
 	if (lease->fl_lmops->lm_breaker_owns_lease
 			&& lease->fl_lmops->lm_breaker_owns_lease(lease))
 		return false;
-	if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT)) {
+	if ((bc->flc_flags & FL_LAYOUT) != (lc->flc_flags & FL_LAYOUT)) {
 		rc = false;
 		goto trace;
 	}
-	if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE)) {
+	if ((bc->flc_flags & FL_DELEG) && (lc->flc_flags & FL_LEASE)) {
 		rc = false;
 		goto trace;
 	}
 
-	rc = locks_conflict(breaker, lease);
+	rc = locks_conflict(bc, lc);
 trace:
 	trace_leases_conflict(rc, lease, breaker);
 	return rc;
 }
 
 static bool
-any_leases_conflict(struct inode *inode, struct file_lock *breaker)
+any_leases_conflict(struct inode *inode, struct file_lease *breaker)
 {
 	struct file_lock_context *ctx = inode->i_flctx;
-	struct file_lock *fl;
+	struct file_lock_core *flc;
 
 	lockdep_assert_held(&ctx->flc_lock);
 
-	list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
-		if (leases_conflict(fl, breaker))
+	list_for_each_entry(flc, &ctx->flc_lease, flc_list) {
+		if (leases_conflict(flc, &breaker->c))
 			return true;
 	}
 	return false;
@@ -1487,7 +1543,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
 {
 	int error = 0;
 	struct file_lock_context *ctx;
-	struct file_lock *new_fl, *fl, *tmp;
+	struct file_lease *new_fl, *fl, *tmp;
 	unsigned long break_time;
 	int want_write = (mode & O_ACCMODE) != O_RDONLY;
 	LIST_HEAD(dispose);
@@ -1495,7 +1551,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
 	new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
 	if (IS_ERR(new_fl))
 		return PTR_ERR(new_fl);
-	new_fl->fl_flags = type;
+	new_fl->c.flc_flags = type;
 
 	/* typically we will check that ctx is non-NULL before calling */
 	ctx = locks_inode_context(inode);
@@ -1519,22 +1575,22 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
 			break_time++;	/* so that 0 means no break time */
 	}
 
-	list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) {
-		if (!leases_conflict(fl, new_fl))
+	list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, c.flc_list) {
+		if (!leases_conflict(&fl->c, &new_fl->c))
 			continue;
 		if (want_write) {
-			if (fl->fl_flags & FL_UNLOCK_PENDING)
+			if (fl->c.flc_flags & FL_UNLOCK_PENDING)
 				continue;
-			fl->fl_flags |= FL_UNLOCK_PENDING;
+			fl->c.flc_flags |= FL_UNLOCK_PENDING;
 			fl->fl_break_time = break_time;
 		} else {
 			if (lease_breaking(fl))
 				continue;
-			fl->fl_flags |= FL_DOWNGRADE_PENDING;
+			fl->c.flc_flags |= FL_DOWNGRADE_PENDING;
 			fl->fl_downgrade_time = break_time;
 		}
 		if (fl->fl_lmops->lm_break(fl))
-			locks_delete_lock_ctx(fl, &dispose);
+			locks_delete_lock_ctx(&fl->c, &dispose);
 	}
 
 	if (list_empty(&ctx->flc_lease))
@@ -1547,26 +1603,26 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
 	}
 
 restart:
-	fl = list_first_entry(&ctx->flc_lease, struct file_lock, fl_list);
+	fl = list_first_entry(&ctx->flc_lease, struct file_lease, c.flc_list);
 	break_time = fl->fl_break_time;
 	if (break_time != 0)
 		break_time -= jiffies;
 	if (break_time == 0)
 		break_time++;
-	locks_insert_block(fl, new_fl, leases_conflict);
+	locks_insert_block(&fl->c, &new_fl->c, leases_conflict);
 	trace_break_lease_block(inode, new_fl);
 	spin_unlock(&ctx->flc_lock);
 	percpu_up_read(&file_rwsem);
 
 	locks_dispose_list(&dispose);
-	error = wait_event_interruptible_timeout(new_fl->fl_wait,
-					list_empty(&new_fl->fl_blocked_member),
-					break_time);
+	error = wait_event_interruptible_timeout(new_fl->c.flc_wait,
+						 list_empty(&new_fl->c.flc_blocked_member),
+						 break_time);
 
 	percpu_down_read(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
 	trace_break_lease_unblock(inode, new_fl);
-	locks_delete_block(new_fl);
+	__locks_delete_block(&new_fl->c);
 	if (error >= 0) {
 		/*
 		 * Wait for the next conflicting lease that has not been
@@ -1583,7 +1639,7 @@ out:
 	percpu_up_read(&file_rwsem);
 	locks_dispose_list(&dispose);
 free_lock:
-	locks_free_lock(new_fl);
+	locks_free_lease(new_fl);
 	return error;
 }
 EXPORT_SYMBOL(__break_lease);
@@ -1601,14 +1657,14 @@ void lease_get_mtime(struct inode *inode, struct timespec64 *time)
 {
 	bool has_lease = false;
 	struct file_lock_context *ctx;
-	struct file_lock *fl;
+	struct file_lock_core *flc;
 
 	ctx = locks_inode_context(inode);
 	if (ctx && !list_empty_careful(&ctx->flc_lease)) {
 		spin_lock(&ctx->flc_lock);
-		fl = list_first_entry_or_null(&ctx->flc_lease,
-					      struct file_lock, fl_list);
-		if (fl && (fl->fl_type == F_WRLCK))
+		flc = list_first_entry_or_null(&ctx->flc_lease,
+					       struct file_lock_core, flc_list);
+		if (flc && flc->flc_type == F_WRLCK)
 			has_lease = true;
 		spin_unlock(&ctx->flc_lock);
 	}
@@ -1643,7 +1699,7 @@ EXPORT_SYMBOL(lease_get_mtime);
  */
 int fcntl_getlease(struct file *filp)
 {
-	struct file_lock *fl;
+	struct file_lease *fl;
 	struct inode *inode = file_inode(filp);
 	struct file_lock_context *ctx;
 	int type = F_UNLCK;
@@ -1654,8 +1710,8 @@ int fcntl_getlease(struct file *filp)
 		percpu_down_read(&file_rwsem);
 		spin_lock(&ctx->flc_lock);
 		time_out_leases(inode, &dispose);
-		list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
-			if (fl->fl_file != filp)
+		list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) {
+			if (fl->c.flc_file != filp)
 				continue;
 			type = target_leasetype(fl);
 			break;
@@ -1715,12 +1771,12 @@ check_conflicting_open(struct file *filp, const int arg, int flags)
 }
 
 static int
-generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **priv)
+generic_add_lease(struct file *filp, int arg, struct file_lease **flp, void **priv)
 {
-	struct file_lock *fl, *my_fl = NULL, *lease;
+	struct file_lease *fl, *my_fl = NULL, *lease;
 	struct inode *inode = file_inode(filp);
 	struct file_lock_context *ctx;
-	bool is_deleg = (*flp)->fl_flags & FL_DELEG;
+	bool is_deleg = (*flp)->c.flc_flags & FL_DELEG;
 	int error;
 	LIST_HEAD(dispose);
 
@@ -1746,7 +1802,7 @@ generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **pri
 	percpu_down_read(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
 	time_out_leases(inode, &dispose);
-	error = check_conflicting_open(filp, arg, lease->fl_flags);
+	error = check_conflicting_open(filp, arg, lease->c.flc_flags);
 	if (error)
 		goto out;
 
@@ -1759,9 +1815,9 @@ generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **pri
 	 * except for this filp.
 	 */
 	error = -EAGAIN;
-	list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
-		if (fl->fl_file == filp &&
-		    fl->fl_owner == lease->fl_owner) {
+	list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) {
+		if (fl->c.flc_file == filp &&
+		    fl->c.flc_owner == lease->c.flc_owner) {
 			my_fl = fl;
 			continue;
 		}
@@ -1776,7 +1832,7 @@ generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **pri
 		 * Modifying our existing lease is OK, but no getting a
 		 * new lease if someone else is opening for write:
 		 */
-		if (fl->fl_flags & FL_UNLOCK_PENDING)
+		if (fl->c.flc_flags & FL_UNLOCK_PENDING)
 			goto out;
 	}
 
@@ -1792,7 +1848,7 @@ generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **pri
 	if (!leases_enable)
 		goto out;
 
-	locks_insert_lock_ctx(lease, &ctx->flc_lease);
+	locks_insert_lock_ctx(&lease->c, &ctx->flc_lease);
 	/*
 	 * The check in break_lease() is lockless. It's possible for another
 	 * open to race in after we did the earlier check for a conflicting
@@ -1803,9 +1859,9 @@ generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **pri
 	 * precedes these checks.
 	 */
 	smp_mb();
-	error = check_conflicting_open(filp, arg, lease->fl_flags);
+	error = check_conflicting_open(filp, arg, lease->c.flc_flags);
 	if (error) {
-		locks_unlink_lock_ctx(lease);
+		locks_unlink_lock_ctx(&lease->c);
 		goto out;
 	}
 
@@ -1826,7 +1882,7 @@ out:
 static int generic_delete_lease(struct file *filp, void *owner)
 {
 	int error = -EAGAIN;
-	struct file_lock *fl, *victim = NULL;
+	struct file_lease *fl, *victim = NULL;
 	struct inode *inode = file_inode(filp);
 	struct file_lock_context *ctx;
 	LIST_HEAD(dispose);
@@ -1839,9 +1895,9 @@ static int generic_delete_lease(struct file *filp, void *owner)
 
 	percpu_down_read(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
-	list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
-		if (fl->fl_file == filp &&
-		    fl->fl_owner == owner) {
+	list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) {
+		if (fl->c.flc_file == filp &&
+		    fl->c.flc_owner == owner) {
 			victim = fl;
 			break;
 		}
@@ -1866,21 +1922,9 @@ static int generic_delete_lease(struct file *filp, void *owner)
  *	The (input) flp->fl_lmops->lm_break function is required
  *	by break_lease().
  */
-int generic_setlease(struct file *filp, int arg, struct file_lock **flp,
+int generic_setlease(struct file *filp, int arg, struct file_lease **flp,
 			void **priv)
 {
-	struct inode *inode = file_inode(filp);
-	vfsuid_t vfsuid = i_uid_into_vfsuid(file_mnt_idmap(filp), inode);
-	int error;
-
-	if ((!vfsuid_eq_kuid(vfsuid, current_fsuid())) && !capable(CAP_LEASE))
-		return -EACCES;
-	if (!S_ISREG(inode->i_mode))
-		return -EINVAL;
-	error = security_file_lock(filp, arg);
-	if (error)
-		return error;
-
 	switch (arg) {
 	case F_UNLCK:
 		return generic_delete_lease(filp, *priv);
@@ -1913,7 +1957,7 @@ lease_notifier_chain_init(void)
 }
 
 static inline void
-setlease_notifier(int arg, struct file_lock *lease)
+setlease_notifier(int arg, struct file_lease *lease)
 {
 	if (arg != F_UNLCK)
 		srcu_notifier_call_chain(&lease_notifier_chain, arg, lease);
@@ -1931,6 +1975,19 @@ void lease_unregister_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(lease_unregister_notifier);
 
+
+int
+kernel_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv)
+{
+	if (lease)
+		setlease_notifier(arg, *lease);
+	if (filp->f_op->setlease)
+		return filp->f_op->setlease(filp, arg, lease, priv);
+	else
+		return generic_setlease(filp, arg, lease, priv);
+}
+EXPORT_SYMBOL_GPL(kernel_setlease);
+
 /**
  * vfs_setlease        -       sets a lease on an open file
  * @filp:	file pointer
@@ -1949,20 +2006,26 @@ EXPORT_SYMBOL_GPL(lease_unregister_notifier);
  * may be NULL if the lm_setup operation doesn't require it.
  */
 int
-vfs_setlease(struct file *filp, int arg, struct file_lock **lease, void **priv)
+vfs_setlease(struct file *filp, int arg, struct file_lease **lease, void **priv)
 {
-	if (lease)
-		setlease_notifier(arg, *lease);
-	if (filp->f_op->setlease)
-		return filp->f_op->setlease(filp, arg, lease, priv);
-	else
-		return generic_setlease(filp, arg, lease, priv);
+	struct inode *inode = file_inode(filp);
+	vfsuid_t vfsuid = i_uid_into_vfsuid(file_mnt_idmap(filp), inode);
+	int error;
+
+	if ((!vfsuid_eq_kuid(vfsuid, current_fsuid())) && !capable(CAP_LEASE))
+		return -EACCES;
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
+	error = security_file_lock(filp, arg);
+	if (error)
+		return error;
+	return kernel_setlease(filp, arg, lease, priv);
 }
 EXPORT_SYMBOL_GPL(vfs_setlease);
 
 static int do_fcntl_add_lease(unsigned int fd, struct file *filp, int arg)
 {
-	struct file_lock *fl;
+	struct file_lease *fl;
 	struct fasync_struct *new;
 	int error;
 
@@ -1972,14 +2035,14 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, int arg)
 
 	new = fasync_alloc();
 	if (!new) {
-		locks_free_lock(fl);
+		locks_free_lease(fl);
 		return -ENOMEM;
 	}
 	new->fa_fd = fd;
 
 	error = vfs_setlease(filp, arg, &fl, (void **)&new);
 	if (fl)
-		locks_free_lock(fl);
+		locks_free_lease(fl);
 	if (new)
 		fasync_free(new);
 	return error;
@@ -2017,8 +2080,8 @@ static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
 		error = flock_lock_inode(inode, fl);
 		if (error != FILE_LOCK_DEFERRED)
 			break;
-		error = wait_event_interruptible(fl->fl_wait,
-				list_empty(&fl->fl_blocked_member));
+		error = wait_event_interruptible(fl->c.flc_wait,
+						 list_empty(&fl->c.flc_blocked_member));
 		if (error)
 			break;
 	}
@@ -2036,7 +2099,7 @@ static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
 int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl)
 {
 	int res = 0;
-	switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) {
+	switch (fl->c.flc_flags & (FL_POSIX|FL_FLOCK)) {
 		case FL_POSIX:
 			res = posix_lock_inode_wait(inode, fl);
 			break;
@@ -2098,13 +2161,13 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
 
 	flock_make_lock(f.file, &fl, type);
 
-	error = security_file_lock(f.file, fl.fl_type);
+	error = security_file_lock(f.file, fl.c.flc_type);
 	if (error)
 		goto out_putf;
 
 	can_sleep = !(cmd & LOCK_NB);
 	if (can_sleep)
-		fl.fl_flags |= FL_SLEEP;
+		fl.c.flc_flags |= FL_SLEEP;
 
 	if (f.file->f_op->flock)
 		error = f.file->f_op->flock(f.file,
@@ -2130,7 +2193,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
  */
 int vfs_test_lock(struct file *filp, struct file_lock *fl)
 {
-	WARN_ON_ONCE(filp != fl->fl_file);
+	WARN_ON_ONCE(filp != fl->c.flc_file);
 	if (filp->f_op->lock)
 		return filp->f_op->lock(filp, F_GETLK, fl);
 	posix_test_lock(filp, fl);
@@ -2145,25 +2208,28 @@ EXPORT_SYMBOL_GPL(vfs_test_lock);
  *
  * Used to translate a fl_pid into a namespace virtual pid number
  */
-static pid_t locks_translate_pid(struct file_lock *fl, struct pid_namespace *ns)
+static pid_t locks_translate_pid(struct file_lock_core *fl, struct pid_namespace *ns)
 {
 	pid_t vnr;
 	struct pid *pid;
 
-	if (IS_OFDLCK(fl))
+	if (fl->flc_flags & FL_OFDLCK)
 		return -1;
-	if (IS_REMOTELCK(fl))
-		return fl->fl_pid;
+
+	/* Remote locks report a negative pid value */
+	if (fl->flc_pid <= 0)
+		return fl->flc_pid;
+
 	/*
 	 * If the flock owner process is dead and its pid has been already
 	 * freed, the translation below won't work, but we still want to show
 	 * flock owner pid number in init pidns.
 	 */
 	if (ns == &init_pid_ns)
-		return (pid_t)fl->fl_pid;
+		return (pid_t) fl->flc_pid;
 
 	rcu_read_lock();
-	pid = find_pid_ns(fl->fl_pid, &init_pid_ns);
+	pid = find_pid_ns(fl->flc_pid, &init_pid_ns);
 	vnr = pid_nr_ns(pid, ns);
 	rcu_read_unlock();
 	return vnr;
@@ -2171,7 +2237,7 @@ static pid_t locks_translate_pid(struct file_lock *fl, struct pid_namespace *ns)
 
 static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
 {
-	flock->l_pid = locks_translate_pid(fl, task_active_pid_ns(current));
+	flock->l_pid = locks_translate_pid(&fl->c, task_active_pid_ns(current));
 #if BITS_PER_LONG == 32
 	/*
 	 * Make sure we can represent the posix lock via
@@ -2186,19 +2252,19 @@ static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl)
 	flock->l_len = fl->fl_end == OFFSET_MAX ? 0 :
 		fl->fl_end - fl->fl_start + 1;
 	flock->l_whence = 0;
-	flock->l_type = fl->fl_type;
+	flock->l_type = fl->c.flc_type;
 	return 0;
 }
 
 #if BITS_PER_LONG == 32
 static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl)
 {
-	flock->l_pid = locks_translate_pid(fl, task_active_pid_ns(current));
+	flock->l_pid = locks_translate_pid(&fl->c, task_active_pid_ns(current));
 	flock->l_start = fl->fl_start;
 	flock->l_len = fl->fl_end == OFFSET_MAX ? 0 :
 		fl->fl_end - fl->fl_start + 1;
 	flock->l_whence = 0;
-	flock->l_type = fl->fl_type;
+	flock->l_type = fl->c.flc_type;
 }
 #endif
 
@@ -2227,16 +2293,16 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock)
 		if (flock->l_pid != 0)
 			goto out;
 
-		fl->fl_flags |= FL_OFDLCK;
-		fl->fl_owner = filp;
+		fl->c.flc_flags |= FL_OFDLCK;
+		fl->c.flc_owner = filp;
 	}
 
 	error = vfs_test_lock(filp, fl);
 	if (error)
 		goto out;
 
-	flock->l_type = fl->fl_type;
-	if (fl->fl_type != F_UNLCK) {
+	flock->l_type = fl->c.flc_type;
+	if (fl->c.flc_type != F_UNLCK) {
 		error = posix_lock_to_flock(flock, fl);
 		if (error)
 			goto out;
@@ -2283,7 +2349,7 @@ out:
  */
 int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf)
 {
-	WARN_ON_ONCE(filp != fl->fl_file);
+	WARN_ON_ONCE(filp != fl->c.flc_file);
 	if (filp->f_op->lock)
 		return filp->f_op->lock(filp, cmd, fl);
 	else
@@ -2296,7 +2362,7 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
 {
 	int error;
 
-	error = security_file_lock(filp, fl->fl_type);
+	error = security_file_lock(filp, fl->c.flc_type);
 	if (error)
 		return error;
 
@@ -2304,8 +2370,8 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
 		error = vfs_lock_file(filp, cmd, fl, NULL);
 		if (error != FILE_LOCK_DEFERRED)
 			break;
-		error = wait_event_interruptible(fl->fl_wait,
-					list_empty(&fl->fl_blocked_member));
+		error = wait_event_interruptible(fl->c.flc_wait,
+						 list_empty(&fl->c.flc_blocked_member));
 		if (error)
 			break;
 	}
@@ -2318,13 +2384,13 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
 static int
 check_fmode_for_setlk(struct file_lock *fl)
 {
-	switch (fl->fl_type) {
+	switch (fl->c.flc_type) {
 	case F_RDLCK:
-		if (!(fl->fl_file->f_mode & FMODE_READ))
+		if (!(fl->c.flc_file->f_mode & FMODE_READ))
 			return -EBADF;
 		break;
 	case F_WRLCK:
-		if (!(fl->fl_file->f_mode & FMODE_WRITE))
+		if (!(fl->c.flc_file->f_mode & FMODE_WRITE))
 			return -EBADF;
 	}
 	return 0;
@@ -2363,8 +2429,8 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
 			goto out;
 
 		cmd = F_SETLK;
-		file_lock->fl_flags |= FL_OFDLCK;
-		file_lock->fl_owner = filp;
+		file_lock->c.flc_flags |= FL_OFDLCK;
+		file_lock->c.flc_owner = filp;
 		break;
 	case F_OFD_SETLKW:
 		error = -EINVAL;
@@ -2372,11 +2438,11 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
 			goto out;
 
 		cmd = F_SETLKW;
-		file_lock->fl_flags |= FL_OFDLCK;
-		file_lock->fl_owner = filp;
+		file_lock->c.flc_flags |= FL_OFDLCK;
+		file_lock->c.flc_owner = filp;
 		fallthrough;
 	case F_SETLKW:
-		file_lock->fl_flags |= FL_SLEEP;
+		file_lock->c.flc_flags |= FL_SLEEP;
 	}
 
 	error = do_lock_file_wait(filp, cmd, file_lock);
@@ -2386,8 +2452,8 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
 	 * lock that was just acquired. There is no need to do that when we're
 	 * unlocking though, or for OFD locks.
 	 */
-	if (!error && file_lock->fl_type != F_UNLCK &&
-	    !(file_lock->fl_flags & FL_OFDLCK)) {
+	if (!error && file_lock->c.flc_type != F_UNLCK &&
+	    !(file_lock->c.flc_flags & FL_OFDLCK)) {
 		struct files_struct *files = current->files;
 		/*
 		 * We need that spin_lock here - it prevents reordering between
@@ -2398,7 +2464,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
 		f = files_lookup_fd_locked(files, fd);
 		spin_unlock(&files->file_lock);
 		if (f != filp) {
-			file_lock->fl_type = F_UNLCK;
+			file_lock->c.flc_type = F_UNLCK;
 			error = do_lock_file_wait(filp, cmd, file_lock);
 			WARN_ON_ONCE(error);
 			error = -EBADF;
@@ -2437,16 +2503,16 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock)
 		if (flock->l_pid != 0)
 			goto out;
 
-		fl->fl_flags |= FL_OFDLCK;
-		fl->fl_owner = filp;
+		fl->c.flc_flags |= FL_OFDLCK;
+		fl->c.flc_owner = filp;
 	}
 
 	error = vfs_test_lock(filp, fl);
 	if (error)
 		goto out;
 
-	flock->l_type = fl->fl_type;
-	if (fl->fl_type != F_UNLCK)
+	flock->l_type = fl->c.flc_type;
+	if (fl->c.flc_type != F_UNLCK)
 		posix_lock_to_flock64(flock, fl);
 
 out:
@@ -2486,8 +2552,8 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
 			goto out;
 
 		cmd = F_SETLK64;
-		file_lock->fl_flags |= FL_OFDLCK;
-		file_lock->fl_owner = filp;
+		file_lock->c.flc_flags |= FL_OFDLCK;
+		file_lock->c.flc_owner = filp;
 		break;
 	case F_OFD_SETLKW:
 		error = -EINVAL;
@@ -2495,11 +2561,11 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
 			goto out;
 
 		cmd = F_SETLKW64;
-		file_lock->fl_flags |= FL_OFDLCK;
-		file_lock->fl_owner = filp;
+		file_lock->c.flc_flags |= FL_OFDLCK;
+		file_lock->c.flc_owner = filp;
 		fallthrough;
 	case F_SETLKW64:
-		file_lock->fl_flags |= FL_SLEEP;
+		file_lock->c.flc_flags |= FL_SLEEP;
 	}
 
 	error = do_lock_file_wait(filp, cmd, file_lock);
@@ -2509,8 +2575,8 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
 	 * lock that was just acquired. There is no need to do that when we're
 	 * unlocking though, or for OFD locks.
 	 */
-	if (!error && file_lock->fl_type != F_UNLCK &&
-	    !(file_lock->fl_flags & FL_OFDLCK)) {
+	if (!error && file_lock->c.flc_type != F_UNLCK &&
+	    !(file_lock->c.flc_flags & FL_OFDLCK)) {
 		struct files_struct *files = current->files;
 		/*
 		 * We need that spin_lock here - it prevents reordering between
@@ -2521,7 +2587,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
 		f = files_lookup_fd_locked(files, fd);
 		spin_unlock(&files->file_lock);
 		if (f != filp) {
-			file_lock->fl_type = F_UNLCK;
+			file_lock->c.flc_type = F_UNLCK;
 			error = do_lock_file_wait(filp, cmd, file_lock);
 			WARN_ON_ONCE(error);
 			error = -EBADF;
@@ -2555,13 +2621,13 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
 		return;
 
 	locks_init_lock(&lock);
-	lock.fl_type = F_UNLCK;
-	lock.fl_flags = FL_POSIX | FL_CLOSE;
+	lock.c.flc_type = F_UNLCK;
+	lock.c.flc_flags = FL_POSIX | FL_CLOSE;
 	lock.fl_start = 0;
 	lock.fl_end = OFFSET_MAX;
-	lock.fl_owner = owner;
-	lock.fl_pid = current->tgid;
-	lock.fl_file = filp;
+	lock.c.flc_owner = owner;
+	lock.c.flc_pid = current->tgid;
+	lock.c.flc_file = filp;
 	lock.fl_ops = NULL;
 	lock.fl_lmops = NULL;
 
@@ -2584,7 +2650,7 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
 		return;
 
 	flock_make_lock(filp, &fl, F_UNLCK);
-	fl.fl_flags |= FL_CLOSE;
+	fl.c.flc_flags |= FL_CLOSE;
 
 	if (filp->f_op->flock)
 		filp->f_op->flock(filp, F_SETLKW, &fl);
@@ -2599,7 +2665,7 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx)
 static void
 locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
 {
-	struct file_lock *fl, *tmp;
+	struct file_lease *fl, *tmp;
 	LIST_HEAD(dispose);
 
 	if (list_empty(&ctx->flc_lease))
@@ -2607,8 +2673,8 @@ locks_remove_lease(struct file *filp, struct file_lock_context *ctx)
 
 	percpu_down_read(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
-	list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
-		if (filp == fl->fl_file)
+	list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, c.flc_list)
+		if (filp == fl->c.flc_file)
 			lease_modify(fl, F_UNLCK, &dispose);
 	spin_unlock(&ctx->flc_lock);
 	percpu_up_read(&file_rwsem);
@@ -2652,7 +2718,7 @@ void locks_remove_file(struct file *filp)
  */
 int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
 {
-	WARN_ON_ONCE(filp != fl->fl_file);
+	WARN_ON_ONCE(filp != fl->c.flc_file);
 	if (filp->f_op->lock)
 		return filp->f_op->lock(filp, F_CANCELLK, fl);
 	return 0;
@@ -2691,69 +2757,73 @@ struct locks_iterator {
 	loff_t	li_pos;
 };
 
-static void lock_get_status(struct seq_file *f, struct file_lock *fl,
+static void lock_get_status(struct seq_file *f, struct file_lock_core *flc,
 			    loff_t id, char *pfx, int repeat)
 {
 	struct inode *inode = NULL;
-	unsigned int fl_pid;
+	unsigned int pid;
 	struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb);
-	int type;
+	int type = flc->flc_type;
+	struct file_lock *fl = file_lock(flc);
+
+	pid = locks_translate_pid(flc, proc_pidns);
 
-	fl_pid = locks_translate_pid(fl, proc_pidns);
 	/*
 	 * If lock owner is dead (and pid is freed) or not visible in current
 	 * pidns, zero is shown as a pid value. Check lock info from
 	 * init_pid_ns to get saved lock pid value.
 	 */
-
-	if (fl->fl_file != NULL)
-		inode = file_inode(fl->fl_file);
+	if (flc->flc_file != NULL)
+		inode = file_inode(flc->flc_file);
 
 	seq_printf(f, "%lld: ", id);
 
 	if (repeat)
 		seq_printf(f, "%*s", repeat - 1 + (int)strlen(pfx), pfx);
 
-	if (IS_POSIX(fl)) {
-		if (fl->fl_flags & FL_ACCESS)
+	if (flc->flc_flags & FL_POSIX) {
+		if (flc->flc_flags & FL_ACCESS)
 			seq_puts(f, "ACCESS");
-		else if (IS_OFDLCK(fl))
+		else if (flc->flc_flags & FL_OFDLCK)
 			seq_puts(f, "OFDLCK");
 		else
 			seq_puts(f, "POSIX ");
 
 		seq_printf(f, " %s ",
 			     (inode == NULL) ? "*NOINODE*" : "ADVISORY ");
-	} else if (IS_FLOCK(fl)) {
+	} else if (flc->flc_flags & FL_FLOCK) {
 		seq_puts(f, "FLOCK  ADVISORY  ");
-	} else if (IS_LEASE(fl)) {
-		if (fl->fl_flags & FL_DELEG)
+	} else if (flc->flc_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) {
+		struct file_lease *lease = file_lease(flc);
+
+		type = target_leasetype(lease);
+
+		if (flc->flc_flags & FL_DELEG)
 			seq_puts(f, "DELEG  ");
 		else
 			seq_puts(f, "LEASE  ");
 
-		if (lease_breaking(fl))
+		if (lease_breaking(lease))
 			seq_puts(f, "BREAKING  ");
-		else if (fl->fl_file)
+		else if (flc->flc_file)
 			seq_puts(f, "ACTIVE    ");
 		else
 			seq_puts(f, "BREAKER   ");
 	} else {
 		seq_puts(f, "UNKNOWN UNKNOWN  ");
 	}
-	type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type;
 
 	seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" :
 			     (type == F_RDLCK) ? "READ" : "UNLCK");
 	if (inode) {
 		/* userspace relies on this representation of dev_t */
-		seq_printf(f, "%d %02x:%02x:%lu ", fl_pid,
+		seq_printf(f, "%d %02x:%02x:%lu ", pid,
 				MAJOR(inode->i_sb->s_dev),
 				MINOR(inode->i_sb->s_dev), inode->i_ino);
 	} else {
-		seq_printf(f, "%d <none>:0 ", fl_pid);
+		seq_printf(f, "%d <none>:0 ", pid);
 	}
-	if (IS_POSIX(fl)) {
+	if (flc->flc_flags & FL_POSIX) {
 		if (fl->fl_end == OFFSET_MAX)
 			seq_printf(f, "%Ld EOF\n", fl->fl_start);
 		else
@@ -2763,17 +2833,18 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
 	}
 }
 
-static struct file_lock *get_next_blocked_member(struct file_lock *node)
+static struct file_lock_core *get_next_blocked_member(struct file_lock_core *node)
 {
-	struct file_lock *tmp;
+	struct file_lock_core *tmp;
 
 	/* NULL node or root node */
-	if (node == NULL || node->fl_blocker == NULL)
+	if (node == NULL || node->flc_blocker == NULL)
 		return NULL;
 
 	/* Next member in the linked list could be itself */
-	tmp = list_next_entry(node, fl_blocked_member);
-	if (list_entry_is_head(tmp, &node->fl_blocker->fl_blocked_requests, fl_blocked_member)
+	tmp = list_next_entry(node, flc_blocked_member);
+	if (list_entry_is_head(tmp, &node->flc_blocker->flc_blocked_requests,
+			       flc_blocked_member)
 		|| tmp == node) {
 		return NULL;
 	}
@@ -2784,18 +2855,18 @@ static struct file_lock *get_next_blocked_member(struct file_lock *node)
 static int locks_show(struct seq_file *f, void *v)
 {
 	struct locks_iterator *iter = f->private;
-	struct file_lock *cur, *tmp;
+	struct file_lock_core *cur, *tmp;
 	struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb);
 	int level = 0;
 
-	cur = hlist_entry(v, struct file_lock, fl_link);
+	cur = hlist_entry(v, struct file_lock_core, flc_link);
 
 	if (locks_translate_pid(cur, proc_pidns) == 0)
 		return 0;
 
-	/* View this crossed linked list as a binary tree, the first member of fl_blocked_requests
-	 * is the left child of current node, the next silibing in fl_blocked_member is the
-	 * right child, we can alse get the parent of current node from fl_blocker, so this
+	/* View this crossed linked list as a binary tree, the first member of flc_blocked_requests
+	 * is the left child of current node, the next silibing in flc_blocked_member is the
+	 * right child, we can alse get the parent of current node from flc_blocker, so this
 	 * question becomes traversal of a binary tree
 	 */
 	while (cur != NULL) {
@@ -2804,17 +2875,18 @@ static int locks_show(struct seq_file *f, void *v)
 		else
 			lock_get_status(f, cur, iter->li_pos, "", level);
 
-		if (!list_empty(&cur->fl_blocked_requests)) {
+		if (!list_empty(&cur->flc_blocked_requests)) {
 			/* Turn left */
-			cur = list_first_entry_or_null(&cur->fl_blocked_requests,
-				struct file_lock, fl_blocked_member);
+			cur = list_first_entry_or_null(&cur->flc_blocked_requests,
+						       struct file_lock_core,
+						       flc_blocked_member);
 			level++;
 		} else {
 			/* Turn right */
 			tmp = get_next_blocked_member(cur);
 			/* Fall back to parent node */
-			while (tmp == NULL && cur->fl_blocker != NULL) {
-				cur = cur->fl_blocker;
+			while (tmp == NULL && cur->flc_blocker != NULL) {
+				cur = cur->flc_blocker;
 				level--;
 				tmp = get_next_blocked_member(cur);
 			}
@@ -2829,14 +2901,13 @@ static void __show_fd_locks(struct seq_file *f,
 			struct list_head *head, int *id,
 			struct file *filp, struct files_struct *files)
 {
-	struct file_lock *fl;
+	struct file_lock_core *fl;
 
-	list_for_each_entry(fl, head, fl_list) {
+	list_for_each_entry(fl, head, flc_list) {
 
-		if (filp != fl->fl_file)
+		if (filp != fl->flc_file)
 			continue;
-		if (fl->fl_owner != files &&
-		    fl->fl_owner != filp)
+		if (fl->flc_owner != files && fl->flc_owner != filp)
 			continue;
 
 		(*id)++;
@@ -2915,6 +2986,9 @@ static int __init filelock_init(void)
 	filelock_cache = kmem_cache_create("file_lock_cache",
 			sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
 
+	filelease_cache = kmem_cache_create("file_lock_cache",
+			sizeof(struct file_lease), 0, SLAB_PANIC, NULL);
+
 	for_each_possible_cpu(i) {
 		struct file_lock_list_struct *fll = per_cpu_ptr(&file_lock_list, i);
 
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 82aa7a35db26..e60a840999aa 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -426,9 +426,7 @@ EXPORT_SYMBOL(mb_cache_destroy);
 
 static int __init mbcache_init(void)
 {
-	mb_entry_cache = kmem_cache_create("mbcache",
-				sizeof(struct mb_cache_entry), 0,
-				SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
+	mb_entry_cache = KMEM_CACHE(mb_cache_entry, SLAB_RECLAIM_ACCOUNT);
 	if (!mb_entry_cache)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 73f37f298087..7cbd2b9f4d11 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -87,7 +87,7 @@ static int __init init_inodecache(void)
 	minix_inode_cachep = kmem_cache_create("minix_inode_cache",
 					     sizeof(struct minix_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+						SLAB_ACCOUNT),
 					     init_once);
 	if (minix_inode_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c
index 64c5205e2b5e..3c60f1eaca61 100644
--- a/fs/mnt_idmapping.c
+++ b/fs/mnt_idmapping.c
@@ -214,7 +214,7 @@ static int copy_mnt_idmap(struct uid_gid_map *map_from,
 	 * anything at all.
 	 */
 	if (nr_extents == 0)
-		return 0;
+		return -EINVAL;
 
 	/*
 	 * Here we know that nr_extents is greater than zero which means
diff --git a/fs/mpage.c b/fs/mpage.c
index 738882e0766d..fa8b99a199fa 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -605,6 +605,7 @@ alloc_new:
 				GFP_NOFS);
 		bio->bi_iter.bi_sector = first_block << (blkbits - 9);
 		wbc_init_bio(wbc, bio);
+		bio->bi_write_hint = inode->i_write_hint;
 	}
 
 	/*
diff --git a/fs/namei.c b/fs/namei.c
index 9342fa6a38c2..c5b2a25be7d0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -17,8 +17,8 @@
 
 #include <linux/init.h>
 #include <linux/export.h>
-#include <linux/kernel.h>
 #include <linux/slab.h>
+#include <linux/wordpart.h>
 #include <linux/fs.h>
 #include <linux/filelock.h>
 #include <linux/namei.h>
@@ -27,7 +27,6 @@
 #include <linux/fsnotify.h>
 #include <linux/personality.h>
 #include <linux/security.h>
-#include <linux/ima.h>
 #include <linux/syscalls.h>
 #include <linux/mount.h>
 #include <linux/audit.h>
@@ -2680,10 +2679,8 @@ static int lookup_one_common(struct mnt_idmap *idmap,
 	if (!len)
 		return -EACCES;
 
-	if (unlikely(name[0] == '.')) {
-		if (len < 2 || (len == 2 && name[1] == '.'))
-			return -EACCES;
-	}
+	if (is_dot_dotdot(name, len))
+		return -EACCES;
 
 	while (len--) {
 		unsigned int c = *(const unsigned char *)name++;
@@ -3644,7 +3641,7 @@ static int do_open(struct nameidata *nd,
 	if (!error && !(file->f_mode & FMODE_OPENED))
 		error = vfs_open(&nd->path, file);
 	if (!error)
-		error = ima_file_check(file, op->acc_mode);
+		error = security_file_post_open(file, op->acc_mode);
 	if (!error && do_truncate)
 		error = handle_truncate(idmap, file);
 	if (unlikely(error > 0)) {
@@ -3707,7 +3704,7 @@ static int vfs_tmpfile(struct mnt_idmap *idmap,
 		inode->i_state |= I_LINKABLE;
 		spin_unlock(&inode->i_lock);
 	}
-	ima_post_create_tmpfile(idmap, inode);
+	security_inode_post_create_tmpfile(idmap, inode);
 	return 0;
 }
 
@@ -4054,7 +4051,7 @@ retry:
 			error = vfs_create(idmap, path.dentry->d_inode,
 					   dentry, mode, true);
 			if (!error)
-				ima_post_path_mknod(idmap, dentry);
+				security_path_post_mknod(idmap, dentry);
 			break;
 		case S_IFCHR: case S_IFBLK:
 			error = vfs_mknod(idmap, path.dentry->d_inode,
diff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c
index ad572f7ee897..43a651ed8264 100644
--- a/fs/netfs/fscache_io.c
+++ b/fs/netfs/fscache_io.c
@@ -83,8 +83,10 @@ static int fscache_begin_operation(struct netfs_cache_resources *cres,
 	cres->debug_id		= cookie->debug_id;
 	cres->inval_counter	= cookie->inval_counter;
 
-	if (!fscache_begin_cookie_access(cookie, why))
+	if (!fscache_begin_cookie_access(cookie, why)) {
+		cres->cache_priv = NULL;
 		return -ENOBUFS;
+	}
 
 again:
 	spin_lock(&cookie->lock);
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index b4294a8aa2d4..f1eeb4914199 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -108,7 +108,7 @@ struct pnfs_block_dev {
 	struct pnfs_block_dev		*children;
 	u64				chunk_size;
 
-	struct bdev_handle		*bdev_handle;
+	struct file			*bdev_file;
 	u64				disk_offset;
 
 	u64				pr_key;
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index c97ebc42ec0f..93ef7f864980 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -25,17 +25,17 @@ bl_free_device(struct pnfs_block_dev *dev)
 	} else {
 		if (dev->pr_registered) {
 			const struct pr_ops *ops =
-				dev->bdev_handle->bdev->bd_disk->fops->pr_ops;
+				file_bdev(dev->bdev_file)->bd_disk->fops->pr_ops;
 			int error;
 
-			error = ops->pr_register(dev->bdev_handle->bdev,
+			error = ops->pr_register(file_bdev(dev->bdev_file),
 				dev->pr_key, 0, false);
 			if (error)
 				pr_err("failed to unregister PR key.\n");
 		}
 
-		if (dev->bdev_handle)
-			bdev_release(dev->bdev_handle);
+		if (dev->bdev_file)
+			fput(dev->bdev_file);
 	}
 }
 
@@ -169,7 +169,7 @@ static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset,
 	map->start = dev->start;
 	map->len = dev->len;
 	map->disk_offset = dev->disk_offset;
-	map->bdev = dev->bdev_handle->bdev;
+	map->bdev = file_bdev(dev->bdev_file);
 	return true;
 }
 
@@ -236,26 +236,26 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
 		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
 {
 	struct pnfs_block_volume *v = &volumes[idx];
-	struct bdev_handle *bdev_handle;
+	struct file *bdev_file;
 	dev_t dev;
 
 	dev = bl_resolve_deviceid(server, v, gfp_mask);
 	if (!dev)
 		return -EIO;
 
-	bdev_handle = bdev_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE,
+	bdev_file = bdev_file_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE,
 				       NULL, NULL);
-	if (IS_ERR(bdev_handle)) {
+	if (IS_ERR(bdev_file)) {
 		printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
-			MAJOR(dev), MINOR(dev), PTR_ERR(bdev_handle));
-		return PTR_ERR(bdev_handle);
+			MAJOR(dev), MINOR(dev), PTR_ERR(bdev_file));
+		return PTR_ERR(bdev_file);
 	}
-	d->bdev_handle = bdev_handle;
-	d->len = bdev_nr_bytes(bdev_handle->bdev);
+	d->bdev_file = bdev_file;
+	d->len = bdev_nr_bytes(file_bdev(bdev_file));
 	d->map = bl_map_simple;
 
 	printk(KERN_INFO "pNFS: using block device %s\n",
-		bdev_handle->bdev->bd_disk->disk_name);
+		file_bdev(bdev_file)->bd_disk->disk_name);
 	return 0;
 }
 
@@ -300,10 +300,10 @@ bl_validate_designator(struct pnfs_block_volume *v)
 	}
 }
 
-static struct bdev_handle *
+static struct file *
 bl_open_path(struct pnfs_block_volume *v, const char *prefix)
 {
-	struct bdev_handle *bdev_handle;
+	struct file *bdev_file;
 	const char *devname;
 
 	devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/%s%*phN",
@@ -311,15 +311,15 @@ bl_open_path(struct pnfs_block_volume *v, const char *prefix)
 	if (!devname)
 		return ERR_PTR(-ENOMEM);
 
-	bdev_handle = bdev_open_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE,
+	bdev_file = bdev_file_open_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE,
 					NULL, NULL);
-	if (IS_ERR(bdev_handle)) {
+	if (IS_ERR(bdev_file)) {
 		pr_warn("pNFS: failed to open device %s (%ld)\n",
-			devname, PTR_ERR(bdev_handle));
+			devname, PTR_ERR(bdev_file));
 	}
 
 	kfree(devname);
-	return bdev_handle;
+	return bdev_file;
 }
 
 static int
@@ -327,7 +327,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
 		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
 {
 	struct pnfs_block_volume *v = &volumes[idx];
-	struct bdev_handle *bdev_handle;
+	struct file *bdev_file;
 	const struct pr_ops *ops;
 	int error;
 
@@ -340,14 +340,14 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
 	 * On other distributions like Debian, the default SCSI by-id path will
 	 * point to the dm-multipath device if one exists.
 	 */
-	bdev_handle = bl_open_path(v, "dm-uuid-mpath-0x");
-	if (IS_ERR(bdev_handle))
-		bdev_handle = bl_open_path(v, "wwn-0x");
-	if (IS_ERR(bdev_handle))
-		return PTR_ERR(bdev_handle);
-	d->bdev_handle = bdev_handle;
-
-	d->len = bdev_nr_bytes(d->bdev_handle->bdev);
+	bdev_file = bl_open_path(v, "dm-uuid-mpath-0x");
+	if (IS_ERR(bdev_file))
+		bdev_file = bl_open_path(v, "wwn-0x");
+	if (IS_ERR(bdev_file))
+		return PTR_ERR(bdev_file);
+	d->bdev_file = bdev_file;
+
+	d->len = bdev_nr_bytes(file_bdev(d->bdev_file));
 	d->map = bl_map_simple;
 	d->pr_key = v->scsi.pr_key;
 
@@ -355,20 +355,20 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
 		return -ENODEV;
 
 	pr_info("pNFS: using block device %s (reservation key 0x%llx)\n",
-		d->bdev_handle->bdev->bd_disk->disk_name, d->pr_key);
+		file_bdev(d->bdev_file)->bd_disk->disk_name, d->pr_key);
 
-	ops = d->bdev_handle->bdev->bd_disk->fops->pr_ops;
+	ops = file_bdev(d->bdev_file)->bd_disk->fops->pr_ops;
 	if (!ops) {
 		pr_err("pNFS: block device %s does not support reservations.",
-				d->bdev_handle->bdev->bd_disk->disk_name);
+				file_bdev(d->bdev_file)->bd_disk->disk_name);
 		error = -EINVAL;
 		goto out_blkdev_put;
 	}
 
-	error = ops->pr_register(d->bdev_handle->bdev, 0, d->pr_key, true);
+	error = ops->pr_register(file_bdev(d->bdev_file), 0, d->pr_key, true);
 	if (error) {
 		pr_err("pNFS: failed to register key for block device %s.",
-				d->bdev_handle->bdev->bd_disk->disk_name);
+				file_bdev(d->bdev_file)->bd_disk->disk_name);
 		goto out_blkdev_put;
 	}
 
@@ -376,7 +376,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
 	return 0;
 
 out_blkdev_put:
-	bdev_release(d->bdev_handle);
+	fput(d->bdev_file);
 	return error;
 }
 
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 760d27dd7225..8adfcd4c8c1a 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -356,15 +356,12 @@ static const struct svc_version *nfs4_callback_version[] = {
 	[4] = &nfs4_callback_version4,
 };
 
-static struct svc_stat nfs4_callback_stats;
-
 static struct svc_program nfs4_callback_program = {
 	.pg_prog = NFS4_CALLBACK,			/* RPC service number */
 	.pg_nvers = ARRAY_SIZE(nfs4_callback_version),	/* Number of entries */
 	.pg_vers = nfs4_callback_version,		/* version table */
 	.pg_name = "NFSv4 callback",			/* service name */
 	.pg_class = "nfs",				/* authentication class */
-	.pg_stats = &nfs4_callback_stats,
 	.pg_authenticate = nfs_callback_authenticate,
 	.pg_init_request = svc_generic_init_request,
 	.pg_rpcbind_set	= svc_generic_rpcbind_set,
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index fbdc9ca80f71..de77848ae654 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -73,14 +73,9 @@ const struct rpc_program nfs_program = {
 	.number			= NFS_PROGRAM,
 	.nrvers			= ARRAY_SIZE(nfs_version),
 	.version		= nfs_version,
-	.stats			= &nfs_rpcstat,
 	.pipe_dir_name		= NFS_PIPE_DIRNAME,
 };
 
-struct rpc_stat nfs_rpcstat = {
-	.program		= &nfs_program
-};
-
 static struct nfs_subversion *find_nfs_version(unsigned int version)
 {
 	struct nfs_subversion *nfs;
@@ -502,6 +497,7 @@ int nfs_create_rpc_client(struct nfs_client *clp,
 			  const struct nfs_client_initdata *cl_init,
 			  rpc_authflavor_t flavor)
 {
+	struct nfs_net		*nn = net_generic(clp->cl_net, nfs_net_id);
 	struct rpc_clnt		*clnt = NULL;
 	struct rpc_create_args args = {
 		.net		= clp->cl_net,
@@ -513,6 +509,7 @@ int nfs_create_rpc_client(struct nfs_client *clp,
 		.servername	= clp->cl_hostname,
 		.nodename	= cl_init->nodename,
 		.program	= &nfs_program,
+		.stats		= &nn->rpcstats,
 		.version	= clp->rpc_ops->version,
 		.authflavor	= flavor,
 		.cred		= cl_init->cred,
@@ -1182,6 +1179,8 @@ void nfs_clients_init(struct net *net)
 #endif
 	spin_lock_init(&nn->nfs_client_lock);
 	nn->boot_time = ktime_get_real();
+	memset(&nn->rpcstats, 0, sizeof(nn->rpcstats));
+	nn->rpcstats.program = &nfs_program;
 
 	nfs_netns_sysfs_setup(nn, net);
 }
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index fa1a14def45c..6bace5fece04 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -156,8 +156,8 @@ static int nfs_delegation_claim_locks(struct nfs4_state *state, const nfs4_state
 	list = &flctx->flc_posix;
 	spin_lock(&flctx->flc_lock);
 restart:
-	list_for_each_entry(fl, list, fl_list) {
-		if (nfs_file_open_context(fl->fl_file)->state != state)
+	for_each_file_lock(fl, list) {
+		if (nfs_file_open_context(fl->c.flc_file)->state != state)
 			continue;
 		spin_unlock(&flctx->flc_lock);
 		status = nfs4_lock_delegation_recall(fl, state, stateid);
@@ -181,7 +181,6 @@ static int nfs_delegation_claim_opens(struct inode *inode,
 	struct nfs_open_context *ctx;
 	struct nfs4_state_owner *sp;
 	struct nfs4_state *state;
-	unsigned int seq;
 	int err;
 
 again:
@@ -202,12 +201,9 @@ again:
 		sp = state->owner;
 		/* Block nfs4_proc_unlck */
 		mutex_lock(&sp->so_delegreturn_mutex);
-		seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
 		err = nfs4_open_delegation_recall(ctx, state, stateid);
 		if (!err)
 			err = nfs_delegation_claim_locks(state, stateid);
-		if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
-			err = -EAGAIN;
 		mutex_unlock(&sp->so_delegreturn_mutex);
 		put_nfs_open_context(ctx);
 		if (err != 0)
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index c03926a1cc73..bb2f583eb28b 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -606,6 +606,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
 
 	trace_nfs_direct_commit_complete(dreq);
 
+	spin_lock(&dreq->lock);
 	if (status < 0) {
 		/* Errors in commit are fatal */
 		dreq->error = status;
@@ -613,6 +614,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
 	} else {
 		status = dreq->error;
 	}
+	spin_unlock(&dreq->lock);
 
 	nfs_init_cinfo_from_dreq(&cinfo, dreq);
 
@@ -625,7 +627,10 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
 			spin_unlock(&dreq->lock);
 			nfs_release_request(req);
 		} else if (!nfs_write_match_verf(verf, req)) {
-			dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+			spin_lock(&dreq->lock);
+			if (dreq->flags == 0)
+				dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+			spin_unlock(&dreq->lock);
 			/*
 			 * Despite the reboot, the write was successful,
 			 * so reset wb_nio.
@@ -667,10 +672,17 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 	LIST_HEAD(mds_list);
 
 	nfs_init_cinfo_from_dreq(&cinfo, dreq);
+	nfs_commit_begin(cinfo.mds);
 	nfs_scan_commit(dreq->inode, &mds_list, &cinfo);
 	res = nfs_generic_commit_list(dreq->inode, &mds_list, 0, &cinfo);
-	if (res < 0) /* res == -ENOMEM */
-		nfs_direct_write_reschedule(dreq);
+	if (res < 0) { /* res == -ENOMEM */
+		spin_lock(&dreq->lock);
+		if (dreq->flags == 0)
+			dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+		spin_unlock(&dreq->lock);
+	}
+	if (nfs_commit_end(cinfo.mds))
+		nfs_direct_write_complete(dreq);
 }
 
 static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq)
@@ -1037,8 +1049,7 @@ int __init nfs_init_directcache(void)
 {
 	nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
 						sizeof(struct nfs_direct_req),
-						0, (SLAB_RECLAIM_ACCOUNT|
-							SLAB_MEM_SPREAD),
+						0, SLAB_RECLAIM_ACCOUNT,
 						NULL);
 	if (nfs_direct_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 8577ccf621f5..407c6e15afe2 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -720,15 +720,15 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 {
 	struct inode *inode = filp->f_mapping->host;
 	int status = 0;
-	unsigned int saved_type = fl->fl_type;
+	unsigned int saved_type = fl->c.flc_type;
 
 	/* Try local locking first */
 	posix_test_lock(filp, fl);
-	if (fl->fl_type != F_UNLCK) {
+	if (fl->c.flc_type != F_UNLCK) {
 		/* found a conflict */
 		goto out;
 	}
-	fl->fl_type = saved_type;
+	fl->c.flc_type = saved_type;
 
 	if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
 		goto out_noconflict;
@@ -740,7 +740,7 @@ do_getlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 out:
 	return status;
 out_noconflict:
-	fl->fl_type = F_UNLCK;
+	fl->c.flc_type = F_UNLCK;
 	goto out;
 }
 
@@ -765,7 +765,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 		 * 	If we're signalled while cleaning up locks on process exit, we
 		 * 	still need to complete the unlock.
 		 */
-		if (status < 0 && !(fl->fl_flags & FL_CLOSE))
+		if (status < 0 && !(fl->c.flc_flags & FL_CLOSE))
 			return status;
 	}
 
@@ -832,12 +832,12 @@ int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
 	int is_local = 0;
 
 	dprintk("NFS: lock(%pD2, t=%x, fl=%x, r=%lld:%lld)\n",
-			filp, fl->fl_type, fl->fl_flags,
+			filp, fl->c.flc_type, fl->c.flc_flags,
 			(long long)fl->fl_start, (long long)fl->fl_end);
 
 	nfs_inc_stats(inode, NFSIOS_VFSLOCK);
 
-	if (fl->fl_flags & FL_RECLAIM)
+	if (fl->c.flc_flags & FL_RECLAIM)
 		return -ENOGRACE;
 
 	if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FCNTL)
@@ -851,7 +851,7 @@ int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
 
 	if (IS_GETLK(cmd))
 		ret = do_getlk(filp, cmd, fl, is_local);
-	else if (fl->fl_type == F_UNLCK)
+	else if (lock_is_unlock(fl))
 		ret = do_unlk(filp, cmd, fl, is_local);
 	else
 		ret = do_setlk(filp, cmd, fl, is_local);
@@ -869,16 +869,16 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
 	int is_local = 0;
 
 	dprintk("NFS: flock(%pD2, t=%x, fl=%x)\n",
-			filp, fl->fl_type, fl->fl_flags);
+			filp, fl->c.flc_type, fl->c.flc_flags);
 
-	if (!(fl->fl_flags & FL_FLOCK))
+	if (!(fl->c.flc_flags & FL_FLOCK))
 		return -ENOLCK;
 
 	if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK)
 		is_local = 1;
 
 	/* We're simulating flock() locks using posix locks on the server */
-	if (fl->fl_type == F_UNLCK)
+	if (lock_is_unlock(fl))
 		return do_unlk(filp, cmd, fl, is_local);
 	return do_setlk(filp, cmd, fl, is_local);
 }
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index acf4b88889dc..4fa304fa5bc4 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -35,6 +35,7 @@
 #include "../internal.h"
 #include "../nfs4session.h"
 #include "filelayout.h"
+#include "../nfs4trace.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PNFS_LD
 
@@ -172,6 +173,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
 		dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
 		if (!dsaddr->ds_list[i])
 			goto out_err_drain_dsaddrs;
+		trace_fl_getdevinfo(server, &pdev->dev_id, dsaddr->ds_list[i]->ds_remotestr);
 
 		/* If DS was already in cache, free ds addrs */
 		while (!list_empty(&dsaddrs)) {
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index ef817a0475ff..3e724cb7ef01 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -2016,7 +2016,7 @@ static void ff_layout_cancel_io(struct pnfs_layout_segment *lseg)
 	for (idx = 0; idx < flseg->mirror_array_cnt; idx++) {
 		mirror = flseg->mirror_array[idx];
 		mirror_ds = mirror->mirror_ds;
-		if (!mirror_ds)
+		if (IS_ERR_OR_NULL(mirror_ds))
 			continue;
 		ds = mirror->mirror_ds->ds;
 		if (!ds)
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index 853e8d609bb3..d0a0956f8a13 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -652,6 +652,7 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
 		ctx->fscache_uniq = NULL;
 		break;
 	case Opt_fscache:
+		trace_nfs_mount_assign(param->key, param->string);
 		ctx->options |= NFS_OPTION_FSCACHE;
 		kfree(ctx->fscache_uniq);
 		ctx->fscache_uniq = param->string;
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 2d1bfee225c3..ddc1ee031955 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -301,11 +301,11 @@ static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq)
 	struct inode *inode = sreq->rreq->inode;
 	struct nfs_open_context *ctx = sreq->rreq->netfs_priv;
 	struct page *page;
+	unsigned long idx;
 	int err;
 	pgoff_t start = (sreq->start + sreq->transferred) >> PAGE_SHIFT;
 	pgoff_t last = ((sreq->start + sreq->len -
 			 sreq->transferred - 1) >> PAGE_SHIFT);
-	XA_STATE(xas, &sreq->rreq->mapping->i_pages, start);
 
 	nfs_pageio_init_read(&pgio, inode, false,
 			     &nfs_async_read_completion_ops);
@@ -316,19 +316,14 @@ static void nfs_netfs_issue_read(struct netfs_io_subrequest *sreq)
 
 	pgio.pg_netfs = netfs; /* used in completion */
 
-	xas_lock(&xas);
-	xas_for_each(&xas, page, last) {
+	xa_for_each_range(&sreq->rreq->mapping->i_pages, idx, page, start, last) {
 		/* nfs_read_add_folio() may schedule() due to pNFS layout and other RPCs  */
-		xas_pause(&xas);
-		xas_unlock(&xas);
 		err = nfs_read_add_folio(&pgio, ctx, page_folio(page));
 		if (err < 0) {
 			netfs->error = err;
 			goto out;
 		}
-		xas_lock(&xas);
 	}
-	xas_unlock(&xas);
 out:
 	nfs_pageio_complete_read(&pgio);
 	nfs_netfs_put(netfs);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index ebb8d60e1152..c709c296ea9a 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -2372,7 +2372,7 @@ static int __init nfs_init_inodecache(void)
 	nfs_inode_cachep = kmem_cache_create("nfs_inode_cache",
 					     sizeof(struct nfs_inode),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+						SLAB_ACCOUNT),
 					     init_once);
 	if (nfs_inode_cachep == NULL)
 		return -ENOMEM;
@@ -2426,12 +2426,16 @@ EXPORT_SYMBOL_GPL(nfs_net_id);
 
 static int nfs_net_init(struct net *net)
 {
+	struct nfs_net *nn = net_generic(net, nfs_net_id);
+
 	nfs_clients_init(net);
+	rpc_proc_register(net, &nn->rpcstats);
 	return nfs_fs_proc_net_init(net);
 }
 
 static void nfs_net_exit(struct net *net)
 {
+	rpc_proc_unregister(net, "nfs");
 	nfs_fs_proc_net_exit(net);
 	nfs_clients_exit(net);
 }
@@ -2486,15 +2490,12 @@ static int __init init_nfs_fs(void)
 	if (err)
 		goto out1;
 
-	rpc_proc_register(&init_net, &nfs_rpcstat);
-
 	err = register_nfs_fs();
 	if (err)
 		goto out0;
 
 	return 0;
 out0:
-	rpc_proc_unregister(&init_net, "nfs");
 	nfs_destroy_directcache();
 out1:
 	nfs_destroy_writepagecache();
@@ -2524,7 +2525,6 @@ static void __exit exit_nfs_fs(void)
 	nfs_destroy_inodecache();
 	nfs_destroy_nfspagecache();
 	unregister_pernet_subsys(&nfs_net_ops);
-	rpc_proc_unregister(&init_net, "nfs");
 	unregister_nfs_fs();
 	nfs_fs_proc_exit();
 	nfsiod_stop();
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index e3722ce6722e..06253695fe53 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -449,8 +449,6 @@ int nfs_try_get_tree(struct fs_context *);
 int nfs_get_tree_common(struct fs_context *);
 void nfs_kill_super(struct super_block *);
 
-extern struct rpc_stat nfs_rpcstat;
-
 extern int __init register_nfs_fs(void);
 extern void __exit unregister_nfs_fs(void);
 extern bool nfs_sb_active(struct super_block *sb);
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index c8374f74dce1..a68b21603ea9 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -9,6 +9,7 @@
 #include <linux/nfs4.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
+#include <linux/sunrpc/stats.h>
 
 struct bl_dev_msg {
 	int32_t status;
@@ -34,6 +35,7 @@ struct nfs_net {
 	struct nfs_netns_client *nfs_client;
 	spinlock_t nfs_client_lock;
 	ktime_t boot_time;
+	struct rpc_stat rpcstats;
 #ifdef CONFIG_PROC_FS
 	struct proc_dir_entry *proc_nfsfs;
 #endif
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index 674c012868b1..b0c8a39c2bbd 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -111,6 +111,7 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
 	cl_init.hostname = buf;
 
 	switch (ds_proto) {
+	case XPRT_TRANSPORT_RDMA:
 	case XPRT_TRANSPORT_TCP:
 	case XPRT_TRANSPORT_TCP_TLS:
 		if (mds_clp->cl_nconnect > 1)
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 2de66e4e8280..cbbe3f0193b8 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -963,7 +963,7 @@ nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl)
 	struct nfs_open_context *ctx = nfs_file_open_context(filp);
 	int status;
 
-	if (fl->fl_flags & FL_CLOSE) {
+	if (fl->c.flc_flags & FL_CLOSE) {
 		l_ctx = nfs_get_lock_context(ctx);
 		if (IS_ERR(l_ctx))
 			l_ctx = NULL;
diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h
index b59876b01a1e..0282d93c8bcc 100644
--- a/fs/nfs/nfs42.h
+++ b/fs/nfs/nfs42.h
@@ -55,11 +55,14 @@ int nfs42_proc_removexattr(struct inode *inode, const char *name);
  * They would be 7 bytes long in the eventual buffer ("user.x\0"), and
  * 8 bytes long XDR-encoded.
  *
- * Include the trailing eof word as well.
+ * Include the trailing eof word as well and make the result a multiple
+ * of 4 bytes.
  */
 static inline u32 nfs42_listxattr_xdrsize(u32 buflen)
 {
-	return ((buflen / (XATTR_USER_PREFIX_LEN + 2)) * 8) + 4;
+	u32 size = 8 * buflen / (XATTR_USER_PREFIX_LEN + 2) + 4;
+
+	return (size + 3) & ~3;
 }
 #endif /* CONFIG_NFS_V4_2 */
 #endif /* __LINUX_FS_NFS_NFS4_2_H */
diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c
index 49aaf28a6950..b6e3d8f77b91 100644
--- a/fs/nfs/nfs42xattr.c
+++ b/fs/nfs/nfs42xattr.c
@@ -1017,7 +1017,7 @@ int __init nfs4_xattr_cache_init(void)
 
 	nfs4_xattr_cache_cachep = kmem_cache_create("nfs4_xattr_cache_cache",
 	    sizeof(struct nfs4_xattr_cache), 0,
-	    (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+	    (SLAB_RECLAIM_ACCOUNT),
 	    nfs4_xattr_cache_init_once);
 	if (nfs4_xattr_cache_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 581698f1b7b2..7024230f0d1d 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -120,7 +120,6 @@ struct nfs4_state_owner {
 	unsigned long	     so_flags;
 	struct list_head     so_states;
 	struct nfs_seqid_counter so_seqid;
-	seqcount_spinlock_t  so_reclaim_seqcount;
 	struct mutex	     so_delegreturn_mutex;
 };
 
@@ -330,7 +329,7 @@ extern int update_open_stateid(struct nfs4_state *state,
 				const nfs4_stateid *deleg_stateid,
 				fmode_t fmode);
 extern int nfs4_proc_setlease(struct file *file, int arg,
-			      struct file_lock **lease, void **priv);
+			      struct file_lease **lease, void **priv);
 extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
 		struct nfs_fsinfo *fsinfo);
 extern void nfs4_update_changeattr(struct inode *dir,
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 11e3a285594c..84573df5cf5a 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -924,6 +924,7 @@ static int nfs4_set_client(struct nfs_server *server,
 	else
 		cl_init.max_connect = max_connect;
 	switch (proto) {
+	case XPRT_TRANSPORT_RDMA:
 	case XPRT_TRANSPORT_TCP:
 	case XPRT_TRANSPORT_TCP_TLS:
 		cl_init.nconnect = nconnect;
@@ -1000,6 +1001,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
 	cl_init.hostname = buf;
 
 	switch (ds_proto) {
+	case XPRT_TRANSPORT_RDMA:
 	case XPRT_TRANSPORT_TCP:
 	case XPRT_TRANSPORT_TCP_TLS:
 		if (mds_clp->cl_nconnect > 1) {
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index e238abc78a13..1cd9652f3c28 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -439,7 +439,7 @@ void nfs42_ssc_unregister_ops(void)
 }
 #endif /* CONFIG_NFS_V4_2 */
 
-static int nfs4_setlease(struct file *file, int arg, struct file_lock **lease,
+static int nfs4_setlease(struct file *file, int arg, struct file_lease **lease,
 			 void **priv)
 {
 	return nfs4_proc_setlease(file, arg, lease, priv);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 23819a756508..ea390db94b62 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3069,10 +3069,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
 	fmode_t acc_mode = _nfs4_ctx_to_accessmode(ctx);
 	struct inode *dir = d_inode(opendata->dir);
 	unsigned long dir_verifier;
-	unsigned int seq;
 	int ret;
 
-	seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
 	dir_verifier = nfs_save_change_attribute(dir);
 
 	ret = _nfs4_proc_open(opendata, ctx);
@@ -3125,11 +3123,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
 	if (ret != 0)
 		goto out;
 
-	if (d_inode(dentry) == state->inode) {
+	if (d_inode(dentry) == state->inode)
 		nfs_inode_attach_open_context(ctx);
-		if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq))
-			nfs4_schedule_stateid_recovery(server, state);
-	}
 
 out:
 	if (!opendata->cancelled) {
@@ -6800,7 +6795,7 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
 	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
 	switch (status) {
 		case 0:
-			request->fl_type = F_UNLCK;
+			request->c.flc_type = F_UNLCK;
 			break;
 		case -NFS4ERR_DENIED:
 			status = 0;
@@ -7018,8 +7013,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
 	/* Ensure this is an unlock - when canceling a lock, the
 	 * canceled lock is passed in, and it won't be an unlock.
 	 */
-	fl->fl_type = F_UNLCK;
-	if (fl->fl_flags & FL_CLOSE)
+	fl->c.flc_type = F_UNLCK;
+	if (fl->c.flc_flags & FL_CLOSE)
 		set_bit(NFS_CONTEXT_UNLOCK, &ctx->flags);
 
 	data = nfs4_alloc_unlockdata(fl, ctx, lsp, seqid);
@@ -7045,11 +7040,11 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
 	struct rpc_task *task;
 	struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
 	int status = 0;
-	unsigned char fl_flags = request->fl_flags;
+	unsigned char saved_flags = request->c.flc_flags;
 
 	status = nfs4_set_lock_state(state, request);
 	/* Unlock _before_ we do the RPC call */
-	request->fl_flags |= FL_EXISTS;
+	request->c.flc_flags |= FL_EXISTS;
 	/* Exclude nfs_delegation_claim_locks() */
 	mutex_lock(&sp->so_delegreturn_mutex);
 	/* Exclude nfs4_reclaim_open_stateid() - note nesting! */
@@ -7073,14 +7068,16 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
 	status = -ENOMEM;
 	if (IS_ERR(seqid))
 		goto out;
-	task = nfs4_do_unlck(request, nfs_file_open_context(request->fl_file), lsp, seqid);
+	task = nfs4_do_unlck(request,
+			     nfs_file_open_context(request->c.flc_file),
+			     lsp, seqid);
 	status = PTR_ERR(task);
 	if (IS_ERR(task))
 		goto out;
 	status = rpc_wait_for_completion_task(task);
 	rpc_put_task(task);
 out:
-	request->fl_flags = fl_flags;
+	request->c.flc_flags = saved_flags;
 	trace_nfs4_unlock(request, state, F_SETLK, status);
 	return status;
 }
@@ -7191,7 +7188,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
 		renew_lease(NFS_SERVER(d_inode(data->ctx->dentry)),
 				data->timestamp);
 		if (data->arg.new_lock && !data->cancelled) {
-			data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS);
+			data->fl.c.flc_flags &= ~(FL_SLEEP | FL_ACCESS);
 			if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0)
 				goto out_restart;
 		}
@@ -7292,7 +7289,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
 	if (nfs_server_capable(state->inode, NFS_CAP_MOVEABLE))
 		task_setup_data.flags |= RPC_TASK_MOVEABLE;
 
-	data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file),
+	data = nfs4_alloc_lockdata(fl,
+				   nfs_file_open_context(fl->c.flc_file),
 				   fl->fl_u.nfs4_fl.owner, GFP_KERNEL);
 	if (data == NULL)
 		return -ENOMEM;
@@ -7398,10 +7396,10 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
 {
 	struct nfs_inode *nfsi = NFS_I(state->inode);
 	struct nfs4_state_owner *sp = state->owner;
-	unsigned char fl_flags = request->fl_flags;
+	unsigned char flags = request->c.flc_flags;
 	int status;
 
-	request->fl_flags |= FL_ACCESS;
+	request->c.flc_flags |= FL_ACCESS;
 	status = locks_lock_inode_wait(state->inode, request);
 	if (status < 0)
 		goto out;
@@ -7410,7 +7408,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
 	if (test_bit(NFS_DELEGATED_STATE, &state->flags)) {
 		/* Yes: cache locks! */
 		/* ...but avoid races with delegation recall... */
-		request->fl_flags = fl_flags & ~FL_SLEEP;
+		request->c.flc_flags = flags & ~FL_SLEEP;
 		status = locks_lock_inode_wait(state->inode, request);
 		up_read(&nfsi->rwsem);
 		mutex_unlock(&sp->so_delegreturn_mutex);
@@ -7420,7 +7418,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
 	mutex_unlock(&sp->so_delegreturn_mutex);
 	status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW);
 out:
-	request->fl_flags = fl_flags;
+	request->c.flc_flags = flags;
 	return status;
 }
 
@@ -7562,7 +7560,7 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
 	if (!(IS_SETLK(cmd) || IS_SETLKW(cmd)))
 		return -EINVAL;
 
-	if (request->fl_type == F_UNLCK) {
+	if (lock_is_unlock(request)) {
 		if (state != NULL)
 			return nfs4_proc_unlck(state, cmd, request);
 		return 0;
@@ -7571,7 +7569,7 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
 	if (state == NULL)
 		return -ENOLCK;
 
-	if ((request->fl_flags & FL_POSIX) &&
+	if ((request->c.flc_flags & FL_POSIX) &&
 	    !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags))
 		return -ENOLCK;
 
@@ -7579,7 +7577,7 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
 	 * Don't rely on the VFS having checked the file open mode,
 	 * since it won't do this for flock() locks.
 	 */
-	switch (request->fl_type) {
+	switch (request->c.flc_type) {
 	case F_RDLCK:
 		if (!(filp->f_mode & FMODE_READ))
 			return -EBADF;
@@ -7601,7 +7599,7 @@ static int nfs4_delete_lease(struct file *file, void **priv)
 	return generic_setlease(file, F_UNLCK, NULL, priv);
 }
 
-static int nfs4_add_lease(struct file *file, int arg, struct file_lock **lease,
+static int nfs4_add_lease(struct file *file, int arg, struct file_lease **lease,
 			  void **priv)
 {
 	struct inode *inode = file_inode(file);
@@ -7619,7 +7617,7 @@ static int nfs4_add_lease(struct file *file, int arg, struct file_lock **lease,
 	return -EAGAIN;
 }
 
-int nfs4_proc_setlease(struct file *file, int arg, struct file_lock **lease,
+int nfs4_proc_setlease(struct file *file, int arg, struct file_lease **lease,
 		       void **priv)
 {
 	switch (arg) {
@@ -8970,10 +8968,12 @@ try_again:
 		return;
 
 	status = task->tk_status;
-	if (status == 0)
+	if (status == 0) {
 		status = nfs4_detect_session_trunking(adata->clp,
 				task->tk_msg.rpc_resp, xprt);
-
+		trace_nfs4_trunked_exchange_id(adata->clp,
+			xprt->address_strings[RPC_DISPLAY_ADDR], status);
+	}
 	if (status == 0)
 		rpc_clnt_xprt_switch_add_xprt(clnt, xprt);
 	else if (status != -NFS4ERR_DELAY && rpc_clnt_xprt_switch_has_addr(clnt,
@@ -10615,29 +10615,33 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
 static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
 {
 	ssize_t error, error2, error3;
+	size_t left = size;
 
-	error = generic_listxattr(dentry, list, size);
+	error = generic_listxattr(dentry, list, left);
 	if (error < 0)
 		return error;
 	if (list) {
 		list += error;
-		size -= error;
+		left -= error;
 	}
 
-	error2 = nfs4_listxattr_nfs4_label(d_inode(dentry), list, size);
+	error2 = nfs4_listxattr_nfs4_label(d_inode(dentry), list, left);
 	if (error2 < 0)
 		return error2;
 
 	if (list) {
 		list += error2;
-		size -= error2;
+		left -= error2;
 	}
 
-	error3 = nfs4_listxattr_nfs4_user(d_inode(dentry), list, size);
+	error3 = nfs4_listxattr_nfs4_user(d_inode(dentry), list, left);
 	if (error3 < 0)
 		return error3;
 
-	return error + error2 + error3;
+	error += error2 + error3;
+	if (size && error > size)
+		return -ERANGE;
+	return error;
 }
 
 static void nfs4_enable_swap(struct inode *inode)
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 9a5d911a7edc..662e86ea3a2d 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -513,7 +513,6 @@ nfs4_alloc_state_owner(struct nfs_server *server,
 	nfs4_init_seqid_counter(&sp->so_seqid);
 	atomic_set(&sp->so_count, 1);
 	INIT_LIST_HEAD(&sp->so_lru);
-	seqcount_spinlock_init(&sp->so_reclaim_seqcount, &sp->so_lock);
 	mutex_init(&sp->so_delegreturn_mutex);
 	return sp;
 }
@@ -847,15 +846,15 @@ void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode)
  */
 static struct nfs4_lock_state *
 __nfs4_find_lock_state(struct nfs4_state *state,
-		       fl_owner_t fl_owner, fl_owner_t fl_owner2)
+		       fl_owner_t owner, fl_owner_t owner2)
 {
 	struct nfs4_lock_state *pos, *ret = NULL;
 	list_for_each_entry(pos, &state->lock_states, ls_locks) {
-		if (pos->ls_owner == fl_owner) {
+		if (pos->ls_owner == owner) {
 			ret = pos;
 			break;
 		}
-		if (pos->ls_owner == fl_owner2)
+		if (pos->ls_owner == owner2)
 			ret = pos;
 	}
 	if (ret)
@@ -868,7 +867,7 @@ __nfs4_find_lock_state(struct nfs4_state *state,
  * exists, return an uninitialized one.
  *
  */
-static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
+static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t owner)
 {
 	struct nfs4_lock_state *lsp;
 	struct nfs_server *server = state->owner->so_server;
@@ -879,7 +878,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
 	nfs4_init_seqid_counter(&lsp->ls_seqid);
 	refcount_set(&lsp->ls_count, 1);
 	lsp->ls_state = state;
-	lsp->ls_owner = fl_owner;
+	lsp->ls_owner = owner;
 	lsp->ls_seqid.owner_id = ida_alloc(&server->lockowner_id, GFP_KERNEL_ACCOUNT);
 	if (lsp->ls_seqid.owner_id < 0)
 		goto out_free;
@@ -980,7 +979,7 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl)
 
 	if (fl->fl_ops != NULL)
 		return 0;
-	lsp = nfs4_get_lock_state(state, fl->fl_owner);
+	lsp = nfs4_get_lock_state(state, fl->c.flc_owner);
 	if (lsp == NULL)
 		return -ENOMEM;
 	fl->fl_u.nfs4_fl.owner = lsp;
@@ -993,7 +992,7 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
 		const struct nfs_lock_context *l_ctx)
 {
 	struct nfs4_lock_state *lsp;
-	fl_owner_t fl_owner, fl_flock_owner;
+	fl_owner_t owner, fl_flock_owner;
 	int ret = -ENOENT;
 
 	if (l_ctx == NULL)
@@ -1002,11 +1001,11 @@ static int nfs4_copy_lock_stateid(nfs4_stateid *dst,
 	if (test_bit(LK_STATE_IN_USE, &state->flags) == 0)
 		goto out;
 
-	fl_owner = l_ctx->lockowner;
+	owner = l_ctx->lockowner;
 	fl_flock_owner = l_ctx->open_context->flock_owner;
 
 	spin_lock(&state->state_lock);
-	lsp = __nfs4_find_lock_state(state, fl_owner, fl_flock_owner);
+	lsp = __nfs4_find_lock_state(state, owner, fl_flock_owner);
 	if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
 		ret = -EIO;
 	else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) {
@@ -1529,8 +1528,8 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
 	down_write(&nfsi->rwsem);
 	spin_lock(&flctx->flc_lock);
 restart:
-	list_for_each_entry(fl, list, fl_list) {
-		if (nfs_file_open_context(fl->fl_file)->state != state)
+	for_each_file_lock(fl, list) {
+		if (nfs_file_open_context(fl->c.flc_file)->state != state)
 			continue;
 		spin_unlock(&flctx->flc_lock);
 		status = ops->recover_lock(state, fl);
@@ -1667,7 +1666,6 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp,
 	 * server that doesn't support a grace period.
 	 */
 	spin_lock(&sp->so_lock);
-	raw_write_seqcount_begin(&sp->so_reclaim_seqcount);
 restart:
 	list_for_each_entry(state, &sp->so_states, open_states) {
 		if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
@@ -1735,7 +1733,6 @@ restart:
 		spin_lock(&sp->so_lock);
 		goto restart;
 	}
-	raw_write_seqcount_end(&sp->so_reclaim_seqcount);
 	spin_unlock(&sp->so_lock);
 #ifdef CONFIG_NFS_V4_2
 	if (found_ssc_copy_state)
@@ -1745,7 +1742,6 @@ restart:
 out_err:
 	nfs4_put_open_state(state);
 	spin_lock(&sp->so_lock);
-	raw_write_seqcount_end(&sp->so_reclaim_seqcount);
 	spin_unlock(&sp->so_lock);
 	return status;
 }
@@ -1928,9 +1924,12 @@ static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recov
 	struct nfs_server *server;
 	struct rb_node *pos;
 	LIST_HEAD(freeme);
-	int status = 0;
 	int lost_locks = 0;
+	int status;
 
+	status = nfs4_begin_drain_session(clp);
+	if (status < 0)
+		return status;
 restart:
 	rcu_read_lock();
 	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
@@ -2694,6 +2693,9 @@ static void nfs4_state_manager(struct nfs_client *clp)
 		/* Detect expired delegations... */
 		if (test_and_clear_bit(NFS4CLNT_DELEGATION_EXPIRED, &clp->cl_state)) {
 			section = "detect expired delegations";
+			status = nfs4_begin_drain_session(clp);
+			if (status < 0)
+				goto out_error;
 			nfs_reap_expired_delegations(clp);
 			continue;
 		}
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index d09bcfd7db89..8da5a9c000f4 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -145,6 +145,7 @@ static int do_nfs4_mount(struct nfs_server *server,
 			 const char *export_path)
 {
 	struct nfs_fs_context *root_ctx;
+	struct nfs_fs_context *ctx;
 	struct fs_context *root_fc;
 	struct vfsmount *root_mnt;
 	struct dentry *dentry;
@@ -157,6 +158,12 @@ static int do_nfs4_mount(struct nfs_server *server,
 		.dirfd	= -1,
 	};
 
+	struct fs_parameter param_fsc = {
+		.key	= "fsc",
+		.type	= fs_value_is_string,
+		.dirfd	= -1,
+	};
+
 	if (IS_ERR(server))
 		return PTR_ERR(server);
 
@@ -168,9 +175,26 @@ static int do_nfs4_mount(struct nfs_server *server,
 	kfree(root_fc->source);
 	root_fc->source = NULL;
 
+	ctx = nfs_fc2context(fc);
 	root_ctx = nfs_fc2context(root_fc);
 	root_ctx->internal = true;
 	root_ctx->server = server;
+
+	if (ctx->fscache_uniq) {
+		len = strlen(ctx->fscache_uniq);
+		param_fsc.size = len;
+		param_fsc.string = kmemdup_nul(ctx->fscache_uniq, len, GFP_KERNEL);
+		if (param_fsc.string == NULL) {
+			put_fs_context(root_fc);
+			return -ENOMEM;
+		}
+		ret = vfs_parse_fs_param(root_fc, &param_fsc);
+		kfree(param_fsc.string);
+		if (ret < 0) {
+			put_fs_context(root_fc);
+			return ret;
+		}
+	}
 	/* We leave export_path unset as it's not used to find the root. */
 
 	len = strlen(hostname) + 5;
diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c
index d9ac556bebcf..d22c6670f770 100644
--- a/fs/nfs/nfs4trace.c
+++ b/fs/nfs/nfs4trace.c
@@ -28,4 +28,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pnfs_mds_fallback_write_pagelist);
 EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_read_error);
 EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_write_error);
 EXPORT_TRACEPOINT_SYMBOL_GPL(ff_layout_commit_error);
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(fl_getdevinfo);
 #endif
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index d27919d7241d..10985a4b8259 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -77,6 +77,36 @@ DEFINE_NFS4_CLIENTID_EVENT(nfs4_bind_conn_to_session);
 DEFINE_NFS4_CLIENTID_EVENT(nfs4_sequence);
 DEFINE_NFS4_CLIENTID_EVENT(nfs4_reclaim_complete);
 
+TRACE_EVENT(nfs4_trunked_exchange_id,
+		TP_PROTO(
+			const struct nfs_client *clp,
+			const char *addr,
+			int error
+		),
+
+		TP_ARGS(clp, addr, error),
+
+		TP_STRUCT__entry(
+			__string(main_addr, clp->cl_hostname)
+			__string(trunk_addr, addr)
+			__field(unsigned long, error)
+		),
+
+		TP_fast_assign(
+			__entry->error = error < 0 ? -error : 0;
+			__assign_str(main_addr, clp->cl_hostname);
+			__assign_str(trunk_addr, addr);
+		),
+
+		TP_printk(
+			"error=%ld (%s) main_addr=%s trunk_addr=%s",
+			-__entry->error,
+			show_nfs4_status(__entry->error),
+			__get_str(main_addr),
+			__get_str(trunk_addr)
+		)
+);
+
 TRACE_EVENT(nfs4_sequence_done,
 		TP_PROTO(
 			const struct nfs4_session *session,
@@ -699,7 +729,7 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
 
 			__entry->error = error < 0 ? -error : 0;
 			__entry->cmd = cmd;
-			__entry->type = request->fl_type;
+			__entry->type = request->c.flc_type;
 			__entry->start = request->fl_start;
 			__entry->end = request->fl_end;
 			__entry->dev = inode->i_sb->s_dev;
@@ -771,7 +801,7 @@ TRACE_EVENT(nfs4_set_lock,
 
 			__entry->error = error < 0 ? -error : 0;
 			__entry->cmd = cmd;
-			__entry->type = request->fl_type;
+			__entry->type = request->c.flc_type;
 			__entry->start = request->fl_start;
 			__entry->end = request->fl_end;
 			__entry->dev = inode->i_sb->s_dev;
@@ -1991,6 +2021,34 @@ DECLARE_EVENT_CLASS(nfs4_deviceid_status,
 DEFINE_PNFS_DEVICEID_STATUS(nfs4_getdeviceinfo);
 DEFINE_PNFS_DEVICEID_STATUS(nfs4_find_deviceid);
 
+TRACE_EVENT(fl_getdevinfo,
+		TP_PROTO(
+			const struct nfs_server *server,
+			const struct nfs4_deviceid *deviceid,
+			char *ds_remotestr
+		),
+		TP_ARGS(server, deviceid, ds_remotestr),
+
+		TP_STRUCT__entry(
+			__string(mds_addr, server->nfs_client->cl_hostname)
+			__array(unsigned char, deviceid, NFS4_DEVICEID4_SIZE)
+			__string(ds_ips, ds_remotestr)
+		),
+
+		TP_fast_assign(
+			__assign_str(mds_addr, server->nfs_client->cl_hostname);
+			__assign_str(ds_ips, ds_remotestr);
+			memcpy(__entry->deviceid, deviceid->data,
+			       NFS4_DEVICEID4_SIZE);
+		),
+		TP_printk(
+			"deviceid=%s, mds_addr=%s, ds_ips=%s",
+			__print_hex(__entry->deviceid, NFS4_DEVICEID4_SIZE),
+			__get_str(mds_addr),
+			__get_str(ds_ips)
+		)
+);
+
 DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event,
 		TP_PROTO(
 			const struct nfs_pgio_header *hdr
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 69406e60f391..1416099dfcd1 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1305,7 +1305,7 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct
 
 static inline int nfs4_lock_type(struct file_lock *fl, int block)
 {
-	if (fl->fl_type == F_RDLCK)
+	if (lock_is_read(fl))
 		return block ? NFS4_READW_LT : NFS4_READ_LT;
 	return block ? NFS4_WRITEW_LT : NFS4_WRITE_LT;
 }
@@ -5052,10 +5052,10 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
 		fl->fl_end = fl->fl_start + (loff_t)length - 1;
 		if (length == ~(uint64_t)0)
 			fl->fl_end = OFFSET_MAX;
-		fl->fl_type = F_WRLCK;
+		fl->c.flc_type = F_WRLCK;
 		if (type & 1)
-			fl->fl_type = F_RDLCK;
-		fl->fl_pid = 0;
+			fl->c.flc_type = F_RDLCK;
+		fl->c.flc_pid = 0;
 	}
 	p = xdr_decode_hyper(p, &clientid); /* read 8 bytes */
 	namelen = be32_to_cpup(p); /* read 4 bytes */  /* have read all 32 bytes now */
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 7600100ba26f..432612d22437 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -175,10 +175,10 @@ static int __init root_nfs_cat(char *dest, const char *src,
 	size_t len = strlen(dest);
 
 	if (len && dest[len - 1] != ',')
-		if (strlcat(dest, ",", destlen) > destlen)
+		if (strlcat(dest, ",", destlen) >= destlen)
 			return -1;
 
-	if (strlcat(dest, src, destlen) > destlen)
+	if (strlcat(dest, src, destlen) >= destlen)
 		return -1;
 	return 0;
 }
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 0c0fed1ecd0b..a5cc6199127f 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1999,6 +1999,14 @@ pnfs_update_layout(struct inode *ino,
 	}
 
 lookup_again:
+	if (!nfs4_valid_open_stateid(ctx->state)) {
+		trace_pnfs_update_layout(ino, pos, count,
+					 iomode, lo, lseg,
+					 PNFS_UPDATE_LAYOUT_INVALID_OPEN);
+		lseg = ERR_PTR(-EIO);
+		goto out;
+	}
+
 	lseg = ERR_PTR(nfs4_client_recover_expired_lease(clp));
 	if (IS_ERR(lseg))
 		goto out;
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index afd23910f3bf..88e061bd711b 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -919,6 +919,8 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
 	dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr);
 
 	list_for_each_entry(da, &ds->ds_addrs, da_node) {
+		char servername[48];
+
 		dprintk("%s: DS %s: trying address %s\n",
 			__func__, ds->ds_remotestr, da->da_remotestr);
 
@@ -929,6 +931,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
 				.dstaddr = (struct sockaddr *)&da->da_addr,
 				.addrlen = da->da_addrlen,
 				.servername = clp->cl_hostname,
+				.xprtsec = clp->cl_xprtsec,
 			};
 			struct nfs4_add_xprt_data xprtdata = {
 				.clp = clp,
@@ -938,10 +941,45 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
 				.data = &xprtdata,
 			};
 
-			if (da->da_transport != clp->cl_proto)
+			if (da->da_transport != clp->cl_proto &&
+					clp->cl_proto != XPRT_TRANSPORT_TCP_TLS)
 				continue;
+			if (da->da_transport == XPRT_TRANSPORT_TCP &&
+				mds_srv->nfs_client->cl_proto ==
+					XPRT_TRANSPORT_TCP_TLS) {
+				struct sockaddr *addr =
+					(struct sockaddr *)&da->da_addr;
+				struct sockaddr_in *sin =
+					(struct sockaddr_in *)&da->da_addr;
+				struct sockaddr_in6 *sin6 =
+					(struct sockaddr_in6 *)&da->da_addr;
+
+				/* for NFS with TLS we need to supply a correct
+				 * servername of the trunked transport, not the
+				 * servername of the main transport stored in
+				 * clp->cl_hostname. And set the protocol to
+				 * indicate to use TLS
+				 */
+				servername[0] = '\0';
+				switch(addr->sa_family) {
+				case AF_INET:
+					snprintf(servername, sizeof(servername),
+						"%pI4", &sin->sin_addr.s_addr);
+					break;
+				case AF_INET6:
+					snprintf(servername, sizeof(servername),
+						"%pI6", &sin6->sin6_addr);
+					break;
+				default:
+					/* do not consider this address */
+					continue;
+				}
+				xprt_args.ident = XPRT_TRANSPORT_TCP_TLS;
+				xprt_args.servername = servername;
+			}
 			if (da->da_addr.ss_family != clp->cl_addr.ss_family)
 				continue;
+
 			/**
 			* Test this address for session trunking and
 			* add as an alias
@@ -953,6 +991,10 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
 			if (xprtdata.cred)
 				put_cred(xprtdata.cred);
 		} else {
+			if (da->da_transport == XPRT_TRANSPORT_TCP &&
+				mds_srv->nfs_client->cl_proto ==
+					XPRT_TRANSPORT_TCP_TLS)
+				da->da_transport = XPRT_TRANSPORT_TCP_TLS;
 			clp = nfs4_set_ds_client(mds_srv,
 						&da->da_addr,
 						da->da_addrlen,
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 7dc21a48e3e7..a142287d86f6 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -305,6 +305,8 @@ int nfs_read_add_folio(struct nfs_pageio_descriptor *pgio,
 	new = nfs_page_create_from_folio(ctx, folio, 0, aligned_len);
 	if (IS_ERR(new)) {
 		error = PTR_ERR(new);
+		if (nfs_netfs_folio_unlock(folio))
+			folio_unlock(folio);
 		goto out;
 	}
 
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 075b31c93f87..dc03f98f7616 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -516,8 +516,16 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 	else
 		nfs_show_nfsv4_options(m, nfss, showdefaults);
 
-	if (nfss->options & NFS_OPTION_FSCACHE)
+	if (nfss->options & NFS_OPTION_FSCACHE) {
+#ifdef CONFIG_NFS_FSCACHE
+		if (nfss->fscache_uniq)
+			seq_printf(m, ",fsc=%s", nfss->fscache_uniq);
+		else
+			seq_puts(m, ",fsc");
+#else
 		seq_puts(m, ",fsc");
+#endif
+	}
 
 	if (nfss->options & NFS_OPTION_MIGRATION)
 		seq_puts(m, ",migration");
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index bb79d3a886ae..5de85d725fb9 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -667,10 +667,6 @@ static int nfs_writepage_locked(struct folio *folio,
 	struct inode *inode = folio_file_mapping(folio)->host;
 	int err;
 
-	if (wbc->sync_mode == WB_SYNC_NONE &&
-	    NFS_SERVER(inode)->write_congested)
-		return AOP_WRITEPAGE_ACTIVATE;
-
 	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
 	nfs_pageio_init_write(&pgio, inode, 0, false,
 			      &nfs_async_write_completion_ops);
@@ -1301,7 +1297,7 @@ static bool
 is_whole_file_wrlock(struct file_lock *fl)
 {
 	return fl->fl_start == 0 && fl->fl_end == OFFSET_MAX &&
-			fl->fl_type == F_WRLCK;
+			lock_is_write(fl);
 }
 
 /* If we know the page is up to date, and we're not using byte range locks (or
@@ -1335,13 +1331,13 @@ static int nfs_can_extend_write(struct file *file, struct folio *folio,
 	spin_lock(&flctx->flc_lock);
 	if (!list_empty(&flctx->flc_posix)) {
 		fl = list_first_entry(&flctx->flc_posix, struct file_lock,
-					fl_list);
+					c.flc_list);
 		if (is_whole_file_wrlock(fl))
 			ret = 1;
 	} else if (!list_empty(&flctx->flc_flock)) {
 		fl = list_first_entry(&flctx->flc_flock, struct file_lock,
-					fl_list);
-		if (fl->fl_type == F_WRLCK)
+					c.flc_list);
+		if (lock_is_write(fl))
 			ret = 1;
 	}
 	spin_unlock(&flctx->flc_lock);
@@ -1650,7 +1646,7 @@ static int wait_on_commit(struct nfs_mds_commit_info *cinfo)
 				       !atomic_read(&cinfo->rpcs_out));
 }
 
-static void nfs_commit_begin(struct nfs_mds_commit_info *cinfo)
+void nfs_commit_begin(struct nfs_mds_commit_info *cinfo)
 {
 	atomic_inc(&cinfo->rpcs_out);
 }
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
index 46fd74d91ea9..3c040c81c77d 100644
--- a/fs/nfsd/blocklayout.c
+++ b/fs/nfsd/blocklayout.c
@@ -328,10 +328,10 @@ nfsd4_scsi_proc_layoutcommit(struct inode *inode,
 }
 
 static void
-nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls)
+nfsd4_scsi_fence_client(struct nfs4_layout_stateid *ls, struct nfsd_file *file)
 {
 	struct nfs4_client *clp = ls->ls_stid.sc_client;
-	struct block_device *bdev = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_bdev;
+	struct block_device *bdev = file->nf_file->f_path.mnt->mnt_sb->s_bdev;
 
 	bdev->bd_disk->fops->pr_ops->pr_preempt(bdev, NFSD_MDS_PR_KEY,
 			nfsd4_scsi_pr_key(clp), 0, true);
diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h
index 4cbe0434cbb8..66a05fefae98 100644
--- a/fs/nfsd/cache.h
+++ b/fs/nfsd/cache.h
@@ -80,8 +80,6 @@ enum {
 
 int	nfsd_drc_slab_create(void);
 void	nfsd_drc_slab_free(void);
-int	nfsd_net_reply_cache_init(struct nfsd_net *nn);
-void	nfsd_net_reply_cache_destroy(struct nfsd_net *nn);
 int	nfsd_reply_cache_init(struct nfsd_net *);
 void	nfsd_reply_cache_shutdown(struct nfsd_net *);
 int	nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index 9cb7f0c33df5..ddd3e0d9cfa6 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -61,13 +61,10 @@ static DEFINE_PER_CPU(unsigned long, nfsd_file_total_age);
 static DEFINE_PER_CPU(unsigned long, nfsd_file_evictions);
 
 struct nfsd_fcache_disposal {
-	struct work_struct work;
 	spinlock_t lock;
 	struct list_head freeme;
 };
 
-static struct workqueue_struct *nfsd_filecache_wq __read_mostly;
-
 static struct kmem_cache		*nfsd_file_slab;
 static struct kmem_cache		*nfsd_file_mark_slab;
 static struct list_lru			nfsd_file_lru;
@@ -283,7 +280,7 @@ nfsd_file_free(struct nfsd_file *nf)
 		nfsd_file_mark_put(nf->nf_mark);
 	if (nf->nf_file) {
 		nfsd_file_check_write_error(nf);
-		filp_close(nf->nf_file, NULL);
+		nfsd_filp_close(nf->nf_file);
 	}
 
 	/*
@@ -421,7 +418,37 @@ nfsd_file_dispose_list_delayed(struct list_head *dispose)
 		spin_lock(&l->lock);
 		list_move_tail(&nf->nf_lru, &l->freeme);
 		spin_unlock(&l->lock);
-		queue_work(nfsd_filecache_wq, &l->work);
+		svc_wake_up(nn->nfsd_serv);
+	}
+}
+
+/**
+ * nfsd_file_net_dispose - deal with nfsd_files waiting to be disposed.
+ * @nn: nfsd_net in which to find files to be disposed.
+ *
+ * When files held open for nfsv3 are removed from the filecache, whether
+ * due to memory pressure or garbage collection, they are queued to
+ * a per-net-ns queue.  This function completes the disposal, either
+ * directly or by waking another nfsd thread to help with the work.
+ */
+void nfsd_file_net_dispose(struct nfsd_net *nn)
+{
+	struct nfsd_fcache_disposal *l = nn->fcache_disposal;
+
+	if (!list_empty(&l->freeme)) {
+		LIST_HEAD(dispose);
+		int i;
+
+		spin_lock(&l->lock);
+		for (i = 0; i < 8 && !list_empty(&l->freeme); i++)
+			list_move(l->freeme.next, &dispose);
+		spin_unlock(&l->lock);
+		if (!list_empty(&l->freeme))
+			/* Wake up another thread to share the work
+			 * *before* doing any actual disposing.
+			 */
+			svc_wake_up(nn->nfsd_serv);
+		nfsd_file_dispose_list(&dispose);
 	}
 }
 
@@ -631,28 +658,6 @@ nfsd_file_close_inode_sync(struct inode *inode)
 		list_del_init(&nf->nf_lru);
 		nfsd_file_free(nf);
 	}
-	flush_delayed_fput();
-}
-
-/**
- * nfsd_file_delayed_close - close unused nfsd_files
- * @work: dummy
- *
- * Scrape the freeme list for this nfsd_net, and then dispose of them
- * all.
- */
-static void
-nfsd_file_delayed_close(struct work_struct *work)
-{
-	LIST_HEAD(head);
-	struct nfsd_fcache_disposal *l = container_of(work,
-			struct nfsd_fcache_disposal, work);
-
-	spin_lock(&l->lock);
-	list_splice_init(&l->freeme, &head);
-	spin_unlock(&l->lock);
-
-	nfsd_file_dispose_list(&head);
 }
 
 static int
@@ -662,8 +667,8 @@ nfsd_file_lease_notifier_call(struct notifier_block *nb, unsigned long arg,
 	struct file_lock *fl = data;
 
 	/* Only close files for F_SETLEASE leases */
-	if (fl->fl_flags & FL_LEASE)
-		nfsd_file_close_inode(file_inode(fl->fl_file));
+	if (fl->c.flc_flags & FL_LEASE)
+		nfsd_file_close_inode(file_inode(fl->c.flc_file));
 	return 0;
 }
 
@@ -717,25 +722,18 @@ nfsd_file_cache_init(void)
 		return ret;
 
 	ret = -ENOMEM;
-	nfsd_filecache_wq = alloc_workqueue("nfsd_filecache", WQ_UNBOUND, 0);
-	if (!nfsd_filecache_wq)
-		goto out;
-
-	nfsd_file_slab = kmem_cache_create("nfsd_file",
-				sizeof(struct nfsd_file), 0, 0, NULL);
+	nfsd_file_slab = KMEM_CACHE(nfsd_file, 0);
 	if (!nfsd_file_slab) {
 		pr_err("nfsd: unable to create nfsd_file_slab\n");
 		goto out_err;
 	}
 
-	nfsd_file_mark_slab = kmem_cache_create("nfsd_file_mark",
-					sizeof(struct nfsd_file_mark), 0, 0, NULL);
+	nfsd_file_mark_slab = KMEM_CACHE(nfsd_file_mark, 0);
 	if (!nfsd_file_mark_slab) {
 		pr_err("nfsd: unable to create nfsd_file_mark_slab\n");
 		goto out_err;
 	}
 
-
 	ret = list_lru_init(&nfsd_file_lru);
 	if (ret) {
 		pr_err("nfsd: failed to init nfsd_file_lru: %d\n", ret);
@@ -785,8 +783,6 @@ out_err:
 	nfsd_file_slab = NULL;
 	kmem_cache_destroy(nfsd_file_mark_slab);
 	nfsd_file_mark_slab = NULL;
-	destroy_workqueue(nfsd_filecache_wq);
-	nfsd_filecache_wq = NULL;
 	rhltable_destroy(&nfsd_file_rhltable);
 	goto out;
 }
@@ -832,7 +828,6 @@ nfsd_alloc_fcache_disposal(void)
 	l = kmalloc(sizeof(*l), GFP_KERNEL);
 	if (!l)
 		return NULL;
-	INIT_WORK(&l->work, nfsd_file_delayed_close);
 	spin_lock_init(&l->lock);
 	INIT_LIST_HEAD(&l->freeme);
 	return l;
@@ -841,7 +836,6 @@ nfsd_alloc_fcache_disposal(void)
 static void
 nfsd_free_fcache_disposal(struct nfsd_fcache_disposal *l)
 {
-	cancel_work_sync(&l->work);
 	nfsd_file_dispose_list(&l->freeme);
 	kfree(l);
 }
@@ -910,8 +904,6 @@ nfsd_file_cache_shutdown(void)
 	fsnotify_wait_marks_destroyed();
 	kmem_cache_destroy(nfsd_file_mark_slab);
 	nfsd_file_mark_slab = NULL;
-	destroy_workqueue(nfsd_filecache_wq);
-	nfsd_filecache_wq = NULL;
 	rhltable_destroy(&nfsd_file_rhltable);
 
 	for_each_possible_cpu(i) {
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index e54165a3224f..c61884def906 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -56,6 +56,7 @@ void nfsd_file_cache_shutdown_net(struct net *net);
 void nfsd_file_put(struct nfsd_file *nf);
 struct nfsd_file *nfsd_file_get(struct nfsd_file *nf);
 void nfsd_file_close_inode_sync(struct inode *inode);
+void nfsd_file_net_dispose(struct nfsd_net *nn);
 bool nfsd_file_is_cached(struct inode *inode);
 __be32 nfsd_file_acquire_gc(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		  unsigned int may_flags, struct nfsd_file **nfp);
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 74b4360779a1..d4be519b5734 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -11,8 +11,10 @@
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 #include <linux/filelock.h>
+#include <linux/nfs4.h>
 #include <linux/percpu_counter.h>
 #include <linux/siphash.h>
+#include <linux/sunrpc/stats.h>
 
 /* Hash tables for nfs4_clientid state */
 #define CLIENT_HASH_BITS                 4
@@ -26,10 +28,22 @@ struct nfsd4_client_tracking_ops;
 
 enum {
 	/* cache misses due only to checksum comparison failures */
-	NFSD_NET_PAYLOAD_MISSES,
+	NFSD_STATS_PAYLOAD_MISSES,
 	/* amount of memory (in bytes) currently consumed by the DRC */
-	NFSD_NET_DRC_MEM_USAGE,
-	NFSD_NET_COUNTERS_NUM
+	NFSD_STATS_DRC_MEM_USAGE,
+	NFSD_STATS_RC_HITS,		/* repcache hits */
+	NFSD_STATS_RC_MISSES,		/* repcache misses */
+	NFSD_STATS_RC_NOCACHE,		/* uncached reqs */
+	NFSD_STATS_FH_STALE,		/* FH stale error */
+	NFSD_STATS_IO_READ,		/* bytes returned to read requests */
+	NFSD_STATS_IO_WRITE,		/* bytes passed in write requests */
+#ifdef CONFIG_NFSD_V4
+	NFSD_STATS_FIRST_NFS4_OP,	/* count of individual nfsv4 operations */
+	NFSD_STATS_LAST_NFS4_OP = NFSD_STATS_FIRST_NFS4_OP + LAST_NFS4_OP,
+#define NFSD_STATS_NFS4_OP(op)	(NFSD_STATS_FIRST_NFS4_OP + (op))
+	NFSD_STATS_WDELEG_GETATTR,	/* count of getattr conflict with wdeleg */
+#endif
+	NFSD_STATS_COUNTERS_NUM
 };
 
 /*
@@ -164,7 +178,10 @@ struct nfsd_net {
 	atomic_t                 num_drc_entries;
 
 	/* Per-netns stats counters */
-	struct percpu_counter    counter[NFSD_NET_COUNTERS_NUM];
+	struct percpu_counter    counter[NFSD_STATS_COUNTERS_NUM];
+
+	/* sunrpc svc stats */
+	struct svc_stat          nfsd_svcstats;
 
 	/* longest hash chain seen */
 	unsigned int             longest_chain;
@@ -192,6 +209,10 @@ struct nfsd_net {
 	atomic_t		nfsd_courtesy_clients;
 	struct shrinker		*nfsd_client_shrinker;
 	struct work_struct	nfsd_shrinker_work;
+
+	/* last time an admin-revoke happened for NFSv4.0 */
+	time64_t		nfs40_last_revoke;
+
 };
 
 /* Simple check to find out if a given net was properly initialized */
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index b78eceebd945..dfcc957e460d 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -71,13 +71,15 @@ nfsd3_proc_setattr(struct svc_rqst *rqstp)
 	struct nfsd_attrs attrs = {
 		.na_iattr	= &argp->attrs,
 	};
+	const struct timespec64 *guardtime = NULL;
 
 	dprintk("nfsd: SETATTR(3)  %s\n",
 				SVCFH_fmt(&argp->fh));
 
 	fh_copy(&resp->fh, &argp->fh);
-	resp->status = nfsd_setattr(rqstp, &resp->fh, &attrs,
-				    argp->check_guard, argp->guardtime);
+	if (argp->check_guard)
+		guardtime = &argp->guardtime;
+	resp->status = nfsd_setattr(rqstp, &resp->fh, &attrs, guardtime);
 	return rpc_success;
 }
 
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index f32128955ec8..a7a07470c1f8 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -295,17 +295,14 @@ svcxdr_decode_sattr3(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 static bool
 svcxdr_decode_sattrguard3(struct xdr_stream *xdr, struct nfsd3_sattrargs *args)
 {
-	__be32 *p;
 	u32 check;
 
 	if (xdr_stream_decode_bool(xdr, &check) < 0)
 		return false;
 	if (check) {
-		p = xdr_inline_decode(xdr, XDR_UNIT * 2);
-		if (!p)
+		if (!svcxdr_decode_nfstime3(xdr, &args->guardtime))
 			return false;
 		args->check_guard = 1;
-		args->guardtime = be32_to_cpup(p);
 	} else
 		args->check_guard = 0;
 
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 926c29879c6a..87c9547989f6 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -45,7 +45,7 @@
 
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
-static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason);
+static void nfsd4_mark_cb_fault(struct nfs4_client *clp);
 
 #define NFSPROC4_CB_NULL 0
 #define NFSPROC4_CB_COMPOUND 1
@@ -85,7 +85,21 @@ static void encode_uint32(struct xdr_stream *xdr, u32 n)
 static void encode_bitmap4(struct xdr_stream *xdr, const __u32 *bitmap,
 			   size_t len)
 {
-	WARN_ON_ONCE(xdr_stream_encode_uint32_array(xdr, bitmap, len) < 0);
+	xdr_stream_encode_uint32_array(xdr, bitmap, len);
+}
+
+static int decode_cb_fattr4(struct xdr_stream *xdr, uint32_t *bitmap,
+				struct nfs4_cb_fattr *fattr)
+{
+	fattr->ncf_cb_change = 0;
+	fattr->ncf_cb_fsize = 0;
+	if (bitmap[0] & FATTR4_WORD0_CHANGE)
+		if (xdr_stream_decode_u64(xdr, &fattr->ncf_cb_change) < 0)
+			return -NFSERR_BAD_XDR;
+	if (bitmap[0] & FATTR4_WORD0_SIZE)
+		if (xdr_stream_decode_u64(xdr, &fattr->ncf_cb_fsize) < 0)
+			return -NFSERR_BAD_XDR;
+	return 0;
 }
 
 static void encode_nfs_cb_opnum4(struct xdr_stream *xdr, enum nfs_cb_opnum4 op)
@@ -334,6 +348,30 @@ encode_cb_recallany4args(struct xdr_stream *xdr,
 }
 
 /*
+ * CB_GETATTR4args
+ *	struct CB_GETATTR4args {
+ *	   nfs_fh4 fh;
+ *	   bitmap4 attr_request;
+ *	};
+ *
+ * The size and change attributes are the only one
+ * guaranteed to be serviced by the client.
+ */
+static void
+encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr,
+			struct nfs4_cb_fattr *fattr)
+{
+	struct nfs4_delegation *dp =
+		container_of(fattr, struct nfs4_delegation, dl_cb_fattr);
+	struct knfsd_fh *fh = &dp->dl_stid.sc_file->fi_fhandle;
+
+	encode_nfs_cb_opnum4(xdr, OP_CB_GETATTR);
+	encode_nfs_fh4(xdr, fh);
+	encode_bitmap4(xdr, fattr->ncf_cb_bmap, ARRAY_SIZE(fattr->ncf_cb_bmap));
+	hdr->nops++;
+}
+
+/*
  * CB_SEQUENCE4args
  *
  *	struct CB_SEQUENCE4args {
@@ -469,6 +507,26 @@ static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
 }
 
 /*
+ * 20.1.  Operation 3: CB_GETATTR - Get Attributes
+ */
+static void nfs4_xdr_enc_cb_getattr(struct rpc_rqst *req,
+		struct xdr_stream *xdr, const void *data)
+{
+	const struct nfsd4_callback *cb = data;
+	struct nfs4_cb_fattr *ncf =
+		container_of(cb, struct nfs4_cb_fattr, ncf_getattr);
+	struct nfs4_cb_compound_hdr hdr = {
+		.ident = cb->cb_clp->cl_cb_ident,
+		.minorversion = cb->cb_clp->cl_minorversion,
+	};
+
+	encode_cb_compound4args(xdr, &hdr);
+	encode_cb_sequence4args(xdr, cb, &hdr);
+	encode_cb_getattr4args(xdr, &hdr, ncf);
+	encode_cb_nops(&hdr);
+}
+
+/*
  * 20.2. Operation 4: CB_RECALL - Recall a Delegation
  */
 static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
@@ -524,6 +582,42 @@ static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr,
 }
 
 /*
+ * 20.1.  Operation 3: CB_GETATTR - Get Attributes
+ */
+static int nfs4_xdr_dec_cb_getattr(struct rpc_rqst *rqstp,
+				  struct xdr_stream *xdr,
+				  void *data)
+{
+	struct nfsd4_callback *cb = data;
+	struct nfs4_cb_compound_hdr hdr;
+	int status;
+	u32 bitmap[3] = {0};
+	u32 attrlen;
+	struct nfs4_cb_fattr *ncf =
+		container_of(cb, struct nfs4_cb_fattr, ncf_getattr);
+
+	status = decode_cb_compound4res(xdr, &hdr);
+	if (unlikely(status))
+		return status;
+
+	status = decode_cb_sequence4res(xdr, cb);
+	if (unlikely(status || cb->cb_seq_status))
+		return status;
+
+	status = decode_cb_op_status(xdr, OP_CB_GETATTR, &cb->cb_status);
+	if (status)
+		return status;
+	if (xdr_stream_decode_uint32_array(xdr, bitmap, 3) < 0)
+		return -NFSERR_BAD_XDR;
+	if (xdr_stream_decode_u32(xdr, &attrlen) < 0)
+		return -NFSERR_BAD_XDR;
+	if (attrlen > (sizeof(ncf->ncf_cb_change) + sizeof(ncf->ncf_cb_fsize)))
+		return -NFSERR_BAD_XDR;
+	status = decode_cb_fattr4(xdr, bitmap, ncf);
+	return status;
+}
+
+/*
  * 20.2. Operation 4: CB_RECALL - Recall a Delegation
  */
 static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp,
@@ -674,7 +768,7 @@ static void nfs4_xdr_enc_cb_notify_lock(struct rpc_rqst *req,
 	const struct nfsd4_callback *cb = data;
 	const struct nfsd4_blocked_lock *nbl =
 		container_of(cb, struct nfsd4_blocked_lock, nbl_cb);
-	struct nfs4_lockowner *lo = (struct nfs4_lockowner *)nbl->nbl_lock.fl_owner;
+	struct nfs4_lockowner *lo = (struct nfs4_lockowner *)nbl->nbl_lock.c.flc_owner;
 	struct nfs4_cb_compound_hdr hdr = {
 		.ident = 0,
 		.minorversion = cb->cb_clp->cl_minorversion,
@@ -831,6 +925,7 @@ static const struct rpc_procinfo nfs4_cb_procedures[] = {
 	PROC(CB_NOTIFY_LOCK,	COMPOUND,	cb_notify_lock,	cb_notify_lock),
 	PROC(CB_OFFLOAD,	COMPOUND,	cb_offload,	cb_offload),
 	PROC(CB_RECALL_ANY,	COMPOUND,	cb_recall_any,	cb_recall_any),
+	PROC(CB_GETATTR,	COMPOUND,	cb_getattr,	cb_getattr),
 };
 
 static unsigned int nfs4_cb_counts[ARRAY_SIZE(nfs4_cb_procedures)];
@@ -887,7 +982,16 @@ static struct workqueue_struct *callback_wq;
 
 static bool nfsd4_queue_cb(struct nfsd4_callback *cb)
 {
-	return queue_work(callback_wq, &cb->cb_work);
+	trace_nfsd_cb_queue(cb->cb_clp, cb);
+	return queue_delayed_work(callback_wq, &cb->cb_work, 0);
+}
+
+static void nfsd4_queue_cb_delayed(struct nfsd4_callback *cb,
+				   unsigned long msecs)
+{
+	trace_nfsd_cb_queue(cb->cb_clp, cb);
+	queue_delayed_work(callback_wq, &cb->cb_work,
+			   msecs_to_jiffies(msecs));
 }
 
 static void nfsd41_cb_inflight_begin(struct nfs4_client *clp)
@@ -999,18 +1103,18 @@ static void nfsd4_mark_cb_state(struct nfs4_client *clp, int newstate)
 {
 	if (clp->cl_cb_state != newstate) {
 		clp->cl_cb_state = newstate;
-		trace_nfsd_cb_state(clp);
+		trace_nfsd_cb_new_state(clp);
 	}
 }
 
-static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason)
+static void nfsd4_mark_cb_down(struct nfs4_client *clp)
 {
 	if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags))
 		return;
 	nfsd4_mark_cb_state(clp, NFSD4_CB_DOWN);
 }
 
-static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason)
+static void nfsd4_mark_cb_fault(struct nfs4_client *clp)
 {
 	if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags))
 		return;
@@ -1022,7 +1126,7 @@ static void nfsd4_cb_probe_done(struct rpc_task *task, void *calldata)
 	struct nfs4_client *clp = container_of(calldata, struct nfs4_client, cl_cb_null);
 
 	if (task->tk_status)
-		nfsd4_mark_cb_down(clp, task->tk_status);
+		nfsd4_mark_cb_down(clp);
 	else
 		nfsd4_mark_cb_state(clp, NFSD4_CB_UP);
 }
@@ -1106,6 +1210,7 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
 {
 	struct nfs4_client *clp = cb->cb_clp;
 
+	trace_nfsd_cb_destroy(clp, cb);
 	nfsd41_cb_release_slot(cb);
 	if (cb->cb_ops && cb->cb_ops->release)
 		cb->cb_ops->release(cb);
@@ -1158,6 +1263,8 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
 	if (!cb->cb_holds_slot)
 		goto need_restart;
 
+	/* This is the operation status code for CB_SEQUENCE */
+	trace_nfsd_cb_seq_status(task, cb);
 	switch (cb->cb_seq_status) {
 	case 0:
 		/*
@@ -1171,13 +1278,23 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
 		break;
 	case -ESERVERFAULT:
 		++session->se_cb_seq_nr;
-		fallthrough;
+		nfsd4_mark_cb_fault(cb->cb_clp);
+		ret = false;
+		break;
 	case 1:
+		/*
+		 * cb_seq_status remains 1 if an RPC Reply was never
+		 * received. NFSD can't know if the client processed
+		 * the CB_SEQUENCE operation. Ask the client to send a
+		 * DESTROY_SESSION to recover.
+		 */
+		fallthrough;
 	case -NFS4ERR_BADSESSION:
-		nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status);
+		nfsd4_mark_cb_fault(cb->cb_clp);
 		ret = false;
-		break;
+		goto need_restart;
 	case -NFS4ERR_DELAY:
+		cb->cb_seq_status = 1;
 		if (!rpc_restart_call(task))
 			goto out;
 
@@ -1192,14 +1309,11 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback
 		}
 		break;
 	default:
-		nfsd4_mark_cb_fault(cb->cb_clp, cb->cb_seq_status);
-		dprintk("%s: unprocessed error %d\n", __func__,
-			cb->cb_seq_status);
+		nfsd4_mark_cb_fault(cb->cb_clp);
 	}
-
 	nfsd41_cb_release_slot(cb);
-	dprintk("%s: freed slot, new seqid=%d\n", __func__,
-		clp->cl_cb_session->se_cb_seq_nr);
+
+	trace_nfsd_cb_free_slot(task, cb);
 
 	if (RPC_SIGNALLED(task))
 		goto need_restart;
@@ -1211,6 +1325,7 @@ retry_nowait:
 	goto out;
 need_restart:
 	if (!test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) {
+		trace_nfsd_cb_restart(clp, cb);
 		task->tk_status = 0;
 		cb->cb_need_restart = true;
 	}
@@ -1240,7 +1355,7 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata)
 		case -EIO:
 		case -ETIMEDOUT:
 		case -EACCES:
-			nfsd4_mark_cb_down(clp, task->tk_status);
+			nfsd4_mark_cb_down(clp);
 		}
 		break;
 	default:
@@ -1295,12 +1410,13 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp)
 	nfsd41_cb_inflight_wait_complete(clp);
 }
 
-/* requires cl_lock: */
 static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp)
 {
 	struct nfsd4_session *s;
 	struct nfsd4_conn *c;
 
+	lockdep_assert_held(&clp->cl_lock);
+
 	list_for_each_entry(s, &clp->cl_sessions, se_perclnt) {
 		list_for_each_entry(c, &s->se_conns, cn_persession) {
 			if (c->cn_flags & NFS4_CDFC4_BACK)
@@ -1324,11 +1440,14 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
 	struct nfsd4_conn *c;
 	int err;
 
+	trace_nfsd_cb_bc_update(clp, cb);
+
 	/*
 	 * This is either an update, or the client dying; in either case,
 	 * kill the old client:
 	 */
 	if (clp->cl_cb_client) {
+		trace_nfsd_cb_bc_shutdown(clp, cb);
 		rpc_shutdown_client(clp->cl_cb_client);
 		clp->cl_cb_client = NULL;
 		put_cred(clp->cl_cb_cred);
@@ -1340,13 +1459,15 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
 	}
 	if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags))
 		return;
+
 	spin_lock(&clp->cl_lock);
 	/*
 	 * Only serialized callback code is allowed to clear these
 	 * flags; main nfsd code can only set them:
 	 */
-	BUG_ON(!(clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK));
+	WARN_ON(!(clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK));
 	clear_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags);
+
 	memcpy(&conn, &cb->cb_clp->cl_cb_conn, sizeof(struct nfs4_cb_conn));
 	c = __nfsd4_find_backchannel(clp);
 	if (c) {
@@ -1358,7 +1479,7 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
 
 	err = setup_callback_client(clp, &conn, ses);
 	if (err) {
-		nfsd4_mark_cb_down(clp, err);
+		nfsd4_mark_cb_down(clp);
 		if (c)
 			svc_xprt_put(c->cn_xprt);
 		return;
@@ -1369,25 +1490,28 @@ static void
 nfsd4_run_cb_work(struct work_struct *work)
 {
 	struct nfsd4_callback *cb =
-		container_of(work, struct nfsd4_callback, cb_work);
+		container_of(work, struct nfsd4_callback, cb_work.work);
 	struct nfs4_client *clp = cb->cb_clp;
 	struct rpc_clnt *clnt;
 	int flags;
 
-	if (cb->cb_need_restart) {
-		cb->cb_need_restart = false;
-	} else {
-		if (cb->cb_ops && cb->cb_ops->prepare)
-			cb->cb_ops->prepare(cb);
-	}
+	trace_nfsd_cb_start(clp);
 
 	if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK)
 		nfsd4_process_cb_update(cb);
 
 	clnt = clp->cl_cb_client;
 	if (!clnt) {
-		/* Callback channel broken, or client killed; give up: */
-		nfsd41_destroy_cb(cb);
+		if (test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags))
+			nfsd41_destroy_cb(cb);
+		else {
+			/*
+			 * XXX: Ideally, we could wait for the client to
+			 *	reconnect, but I haven't figured out how
+			 *	to do that yet.
+			 */
+			nfsd4_queue_cb_delayed(cb, 25);
+		}
 		return;
 	}
 
@@ -1400,6 +1524,12 @@ nfsd4_run_cb_work(struct work_struct *work)
 		return;
 	}
 
+	if (cb->cb_need_restart) {
+		cb->cb_need_restart = false;
+	} else {
+		if (cb->cb_ops && cb->cb_ops->prepare)
+			cb->cb_ops->prepare(cb);
+	}
 	cb->cb_msg.rpc_cred = clp->cl_cb_cred;
 	flags = clp->cl_minorversion ? RPC_TASK_NOCONNECT : RPC_TASK_SOFTCONN;
 	rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | flags,
@@ -1414,8 +1544,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
 	cb->cb_msg.rpc_argp = cb;
 	cb->cb_msg.rpc_resp = cb;
 	cb->cb_ops = ops;
-	INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
-	cb->cb_seq_status = 1;
+	INIT_DELAYED_WORK(&cb->cb_work, nfsd4_run_cb_work);
 	cb->cb_status = 0;
 	cb->cb_need_restart = false;
 	cb->cb_holds_slot = false;
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 5e8096bc5eaa..4f3072b5979a 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -25,7 +25,7 @@ static struct kmem_cache *nfs4_layout_cache;
 static struct kmem_cache *nfs4_layout_stateid_cache;
 
 static const struct nfsd4_callback_ops nfsd4_cb_layout_ops;
-static const struct lock_manager_operations nfsd4_layouts_lm_ops;
+static const struct lease_manager_operations nfsd4_layouts_lm_ops;
 
 const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] =  {
 #ifdef CONFIG_NFSD_FLEXFILELAYOUT
@@ -152,6 +152,23 @@ void nfsd4_setup_layout_type(struct svc_export *exp)
 #endif
 }
 
+void nfsd4_close_layout(struct nfs4_layout_stateid *ls)
+{
+	struct nfsd_file *fl;
+
+	spin_lock(&ls->ls_stid.sc_file->fi_lock);
+	fl = ls->ls_file;
+	ls->ls_file = NULL;
+	spin_unlock(&ls->ls_stid.sc_file->fi_lock);
+
+	if (fl) {
+		if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls)
+			kernel_setlease(fl->nf_file, F_UNLCK, NULL,
+					(void **)&ls);
+		nfsd_file_put(fl);
+	}
+}
+
 static void
 nfsd4_free_layout_stateid(struct nfs4_stid *stid)
 {
@@ -169,9 +186,7 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid)
 	list_del_init(&ls->ls_perfile);
 	spin_unlock(&fp->fi_lock);
 
-	if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls)
-		vfs_setlease(ls->ls_file->nf_file, F_UNLCK, NULL, (void **)&ls);
-	nfsd_file_put(ls->ls_file);
+	nfsd4_close_layout(ls);
 
 	if (ls->ls_recalled)
 		atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls);
@@ -182,27 +197,26 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid)
 static int
 nfsd4_layout_setlease(struct nfs4_layout_stateid *ls)
 {
-	struct file_lock *fl;
+	struct file_lease *fl;
 	int status;
 
 	if (nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls)
 		return 0;
 
-	fl = locks_alloc_lock();
+	fl = locks_alloc_lease();
 	if (!fl)
 		return -ENOMEM;
-	locks_init_lock(fl);
+	locks_init_lease(fl);
 	fl->fl_lmops = &nfsd4_layouts_lm_ops;
-	fl->fl_flags = FL_LAYOUT;
-	fl->fl_type = F_RDLCK;
-	fl->fl_end = OFFSET_MAX;
-	fl->fl_owner = ls;
-	fl->fl_pid = current->tgid;
-	fl->fl_file = ls->ls_file->nf_file;
-
-	status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL);
+	fl->c.flc_flags = FL_LAYOUT;
+	fl->c.flc_type = F_RDLCK;
+	fl->c.flc_owner = ls;
+	fl->c.flc_pid = current->tgid;
+	fl->c.flc_file = ls->ls_file->nf_file;
+
+	status = kernel_setlease(fl->c.flc_file, fl->c.flc_type, &fl, NULL);
 	if (status) {
-		locks_free_lock(fl);
+		locks_free_lease(fl);
 		return status;
 	}
 	BUG_ON(fl != NULL);
@@ -236,7 +250,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
 	nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops,
 			NFSPROC4_CLNT_CB_LAYOUT);
 
-	if (parent->sc_type == NFS4_DELEG_STID)
+	if (parent->sc_type == SC_TYPE_DELEG)
 		ls->ls_file = nfsd_file_get(fp->fi_deleg_file);
 	else
 		ls->ls_file = find_any_file(fp);
@@ -250,7 +264,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
 	}
 
 	spin_lock(&clp->cl_lock);
-	stp->sc_type = NFS4_LAYOUT_STID;
+	stp->sc_type = SC_TYPE_LAYOUT;
 	list_add(&ls->ls_perclnt, &clp->cl_lo_states);
 	spin_unlock(&clp->cl_lock);
 
@@ -269,13 +283,13 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
 {
 	struct nfs4_layout_stateid *ls;
 	struct nfs4_stid *stid;
-	unsigned char typemask = NFS4_LAYOUT_STID;
+	unsigned short typemask = SC_TYPE_LAYOUT;
 	__be32 status;
 
 	if (create)
-		typemask |= (NFS4_OPEN_STID | NFS4_LOCK_STID | NFS4_DELEG_STID);
+		typemask |= (SC_TYPE_OPEN | SC_TYPE_LOCK | SC_TYPE_DELEG);
 
-	status = nfsd4_lookup_stateid(cstate, stateid, typemask, &stid,
+	status = nfsd4_lookup_stateid(cstate, stateid, typemask, 0, &stid,
 			net_generic(SVC_NET(rqstp), nfsd_net_id));
 	if (status)
 		goto out;
@@ -286,7 +300,7 @@ nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
 		goto out_put_stid;
 	}
 
-	if (stid->sc_type != NFS4_LAYOUT_STID) {
+	if (stid->sc_type != SC_TYPE_LAYOUT) {
 		ls = nfsd4_alloc_layout_stateid(cstate, stid, layout_type);
 		nfs4_put_stid(stid);
 
@@ -518,7 +532,7 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp,
 		lrp->lrs_present = true;
 	} else {
 		trace_nfsd_layoutstate_unhash(&ls->ls_stid.sc_stateid);
-		nfs4_unhash_stid(&ls->ls_stid);
+		ls->ls_stid.sc_status |= SC_STATUS_CLOSED;
 		lrp->lrs_present = false;
 	}
 	spin_unlock(&ls->ls_lock);
@@ -605,7 +619,7 @@ nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp)
 }
 
 static void
-nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
+nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls, struct nfsd_file *file)
 {
 	struct nfs4_client *clp = ls->ls_stid.sc_client;
 	char addr_str[INET6_ADDRSTRLEN];
@@ -627,7 +641,7 @@ nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
 
 	argv[0] = (char *)nfsd_recall_failed;
 	argv[1] = addr_str;
-	argv[2] = ls->ls_file->nf_file->f_path.mnt->mnt_sb->s_id;
+	argv[2] = file->nf_file->f_path.mnt->mnt_sb->s_id;
 	argv[3] = NULL;
 
 	error = call_usermodehelper(nfsd_recall_failed, argv, envp,
@@ -657,6 +671,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
 	struct nfsd_net *nn;
 	ktime_t now, cutoff;
 	const struct nfsd4_layout_ops *ops;
+	struct nfsd_file *fl;
 
 	trace_nfsd_cb_layout_done(&ls->ls_stid.sc_stateid, task);
 	switch (task->tk_status) {
@@ -688,12 +703,17 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
 		 * Unknown error or non-responding client, we'll need to fence.
 		 */
 		trace_nfsd_layout_recall_fail(&ls->ls_stid.sc_stateid);
-
-		ops = nfsd4_layout_ops[ls->ls_layout_type];
-		if (ops->fence_client)
-			ops->fence_client(ls);
-		else
-			nfsd4_cb_layout_fail(ls);
+		rcu_read_lock();
+		fl = nfsd_file_get(ls->ls_file);
+		rcu_read_unlock();
+		if (fl) {
+			ops = nfsd4_layout_ops[ls->ls_layout_type];
+			if (ops->fence_client)
+				ops->fence_client(ls, fl);
+			else
+				nfsd4_cb_layout_fail(ls, fl);
+			nfsd_file_put(fl);
+		}
 		return 1;
 	case -NFS4ERR_NOMATCHING_LAYOUT:
 		trace_nfsd_layout_recall_done(&ls->ls_stid.sc_stateid);
@@ -723,7 +743,7 @@ static const struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
 };
 
 static bool
-nfsd4_layout_lm_break(struct file_lock *fl)
+nfsd4_layout_lm_break(struct file_lease *fl)
 {
 	/*
 	 * We don't want the locks code to timeout the lease for us;
@@ -731,19 +751,19 @@ nfsd4_layout_lm_break(struct file_lock *fl)
 	 * in time:
 	 */
 	fl->fl_break_time = 0;
-	nfsd4_recall_file_layout(fl->fl_owner);
+	nfsd4_recall_file_layout(fl->c.flc_owner);
 	return false;
 }
 
 static int
-nfsd4_layout_lm_change(struct file_lock *onlist, int arg,
+nfsd4_layout_lm_change(struct file_lease *onlist, int arg,
 		struct list_head *dispose)
 {
 	BUG_ON(!(arg & F_UNLCK));
 	return lease_modify(onlist, arg, dispose);
 }
 
-static const struct lock_manager_operations nfsd4_layouts_lm_ops = {
+static const struct lease_manager_operations nfsd4_layouts_lm_ops = {
 	.lm_break	= nfsd4_layout_lm_break,
 	.lm_change	= nfsd4_layout_lm_change,
 };
@@ -756,13 +776,11 @@ nfsd4_init_pnfs(void)
 	for (i = 0; i < DEVID_HASH_SIZE; i++)
 		INIT_LIST_HEAD(&nfsd_devid_hash[i]);
 
-	nfs4_layout_cache = kmem_cache_create("nfs4_layout",
-			sizeof(struct nfs4_layout), 0, 0, NULL);
+	nfs4_layout_cache = KMEM_CACHE(nfs4_layout, 0);
 	if (!nfs4_layout_cache)
 		return -ENOMEM;
 
-	nfs4_layout_stateid_cache = kmem_cache_create("nfs4_layout_stateid",
-			sizeof(struct nfs4_layout_stateid), 0, 0, NULL);
+	nfs4_layout_stateid_cache = KMEM_CACHE(nfs4_layout_stateid, 0);
 	if (!nfs4_layout_stateid_cache) {
 		kmem_cache_destroy(nfs4_layout_cache);
 		return -ENOMEM;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 14712fa08f76..2927b1263f08 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1143,6 +1143,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	};
 	struct inode *inode;
 	__be32 status = nfs_ok;
+	bool save_no_wcc;
 	int err;
 
 	if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
@@ -1168,8 +1169,10 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	if (status)
 		goto out;
-	status = nfsd_setattr(rqstp, &cstate->current_fh, &attrs,
-				0, (time64_t)0);
+	save_no_wcc = cstate->current_fh.fh_no_wcc;
+	cstate->current_fh.fh_no_wcc = true;
+	status = nfsd_setattr(rqstp, &cstate->current_fh, &attrs, NULL);
+	cstate->current_fh.fh_no_wcc = save_no_wcc;
 	if (!status)
 		status = nfserrno(attrs.na_labelerr);
 	if (!status)
@@ -2490,10 +2493,10 @@ nfsd4_proc_null(struct svc_rqst *rqstp)
 	return rpc_success;
 }
 
-static inline void nfsd4_increment_op_stats(u32 opnum)
+static inline void nfsd4_increment_op_stats(struct nfsd_net *nn, u32 opnum)
 {
 	if (opnum >= FIRST_NFS4_OP && opnum <= LAST_NFS4_OP)
-		percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_NFS4_OP(opnum)]);
+		percpu_counter_inc(&nn->counter[NFSD_STATS_NFS4_OP(opnum)]);
 }
 
 static const struct nfsd4_operation nfsd4_ops[];
@@ -2768,7 +2771,7 @@ encode_op:
 					   status, nfsd4_op_name(op->opnum));
 
 		nfsd4_cstate_clear_replay(cstate);
-		nfsd4_increment_op_stats(op->opnum);
+		nfsd4_increment_op_stats(nn, op->opnum);
 	}
 
 	fh_put(current_fh);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 7d6c657e0409..84d4093ca713 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -87,6 +87,7 @@ static void nfs4_free_ol_stateid(struct nfs4_stid *stid);
 void nfsd4_end_grace(struct nfsd_net *nn);
 static void _free_cpntf_state_locked(struct nfsd_net *nn, struct nfs4_cpntf_state *cps);
 static void nfsd4_file_hash_remove(struct nfs4_file *fi);
+static void deleg_reaper(struct nfsd_net *nn);
 
 /* Locking: */
 
@@ -127,6 +128,7 @@ static void free_session(struct nfsd4_session *);
 
 static const struct nfsd4_callback_ops nfsd4_cb_recall_ops;
 static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops;
+static const struct nfsd4_callback_ops nfsd4_cb_getattr_ops;
 
 static struct workqueue_struct *laundry_wq;
 
@@ -318,6 +320,7 @@ free_nbl(struct kref *kref)
 	struct nfsd4_blocked_lock *nbl;
 
 	nbl = container_of(kref, struct nfsd4_blocked_lock, nbl_kref);
+	locks_release_private(&nbl->nbl_lock);
 	kfree(nbl);
 }
 
@@ -325,7 +328,6 @@ static void
 free_blocked_lock(struct nfsd4_blocked_lock *nbl)
 {
 	locks_delete_block(&nbl->nbl_lock);
-	locks_release_private(&nbl->nbl_lock);
 	kref_put(&nbl->nbl_kref, free_nbl);
 }
 
@@ -1189,6 +1191,10 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
 	dp->dl_recalled = false;
 	nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
 		      &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL);
+	nfsd4_init_cb(&dp->dl_cb_fattr.ncf_getattr, dp->dl_stid.sc_client,
+			&nfsd4_cb_getattr_ops, NFSPROC4_CLNT_CB_GETATTR);
+	dp->dl_cb_fattr.ncf_file_modified = false;
+	dp->dl_cb_fattr.ncf_cb_bmap[0] = FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE;
 	get_nfs4_file(fp);
 	dp->dl_stid.sc_file = fp;
 	return dp;
@@ -1210,6 +1216,8 @@ nfs4_put_stid(struct nfs4_stid *s)
 		return;
 	}
 	idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id);
+	if (s->sc_status & SC_STATUS_ADMIN_REVOKED)
+		atomic_dec(&s->sc_client->cl_admin_revoked);
 	nfs4_free_cpntf_statelist(clp->net, s);
 	spin_unlock(&clp->cl_lock);
 	s->sc_free(s);
@@ -1249,7 +1257,7 @@ static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp)
 
 	WARN_ON_ONCE(!fp->fi_delegees);
 
-	vfs_setlease(nf->nf_file, F_UNLCK, NULL, (void **)&dp);
+	kernel_setlease(nf->nf_file, F_UNLCK, NULL, (void **)&dp);
 	put_deleg_file(fp);
 }
 
@@ -1260,11 +1268,6 @@ static void destroy_unhashed_deleg(struct nfs4_delegation *dp)
 	nfs4_put_stid(&dp->dl_stid);
 }
 
-void nfs4_unhash_stid(struct nfs4_stid *s)
-{
-	s->sc_type = 0;
-}
-
 /**
  * nfs4_delegation_exists - Discover if this delegation already exists
  * @clp:     a pointer to the nfs4_client we're granting a delegation to
@@ -1312,11 +1315,12 @@ hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp)
 
 	lockdep_assert_held(&state_lock);
 	lockdep_assert_held(&fp->fi_lock);
+	lockdep_assert_held(&clp->cl_lock);
 
 	if (nfs4_delegation_exists(clp, fp))
 		return -EAGAIN;
 	refcount_inc(&dp->dl_stid.sc_count);
-	dp->dl_stid.sc_type = NFS4_DELEG_STID;
+	dp->dl_stid.sc_type = SC_TYPE_DELEG;
 	list_add(&dp->dl_perfile, &fp->fi_delegations);
 	list_add(&dp->dl_perclnt, &clp->cl_delegations);
 	return 0;
@@ -1328,7 +1332,7 @@ static bool delegation_hashed(struct nfs4_delegation *dp)
 }
 
 static bool
-unhash_delegation_locked(struct nfs4_delegation *dp)
+unhash_delegation_locked(struct nfs4_delegation *dp, unsigned short statusmask)
 {
 	struct nfs4_file *fp = dp->dl_stid.sc_file;
 
@@ -1337,7 +1341,13 @@ unhash_delegation_locked(struct nfs4_delegation *dp)
 	if (!delegation_hashed(dp))
 		return false;
 
-	dp->dl_stid.sc_type = NFS4_CLOSED_DELEG_STID;
+	if (statusmask == SC_STATUS_REVOKED &&
+	    dp->dl_stid.sc_client->cl_minorversion == 0)
+		statusmask = SC_STATUS_CLOSED;
+	dp->dl_stid.sc_status |= statusmask;
+	if (statusmask & SC_STATUS_ADMIN_REVOKED)
+		atomic_inc(&dp->dl_stid.sc_client->cl_admin_revoked);
+
 	/* Ensure that deleg break won't try to requeue it */
 	++dp->dl_time;
 	spin_lock(&fp->fi_lock);
@@ -1353,7 +1363,7 @@ static void destroy_delegation(struct nfs4_delegation *dp)
 	bool unhashed;
 
 	spin_lock(&state_lock);
-	unhashed = unhash_delegation_locked(dp);
+	unhashed = unhash_delegation_locked(dp, SC_STATUS_CLOSED);
 	spin_unlock(&state_lock);
 	if (unhashed)
 		destroy_unhashed_deleg(dp);
@@ -1367,9 +1377,9 @@ static void revoke_delegation(struct nfs4_delegation *dp)
 
 	trace_nfsd_stid_revoke(&dp->dl_stid);
 
-	if (clp->cl_minorversion) {
+	if (dp->dl_stid.sc_status &
+	    (SC_STATUS_REVOKED | SC_STATUS_ADMIN_REVOKED)) {
 		spin_lock(&clp->cl_lock);
-		dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID;
 		refcount_inc(&dp->dl_stid.sc_count);
 		list_add(&dp->dl_recall_lru, &clp->cl_revoked);
 		spin_unlock(&clp->cl_lock);
@@ -1377,8 +1387,8 @@ static void revoke_delegation(struct nfs4_delegation *dp)
 	destroy_unhashed_deleg(dp);
 }
 
-/* 
- * SETCLIENTID state 
+/*
+ * SETCLIENTID state
  */
 
 static unsigned int clientid_hashval(u32 id)
@@ -1531,6 +1541,8 @@ static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp,
 	}
 
 	idr_remove(&clp->cl_stateids, s->sc_stateid.si_opaque.so_id);
+	if (s->sc_status & SC_STATUS_ADMIN_REVOKED)
+		atomic_dec(&s->sc_client->cl_admin_revoked);
 	list_add(&stp->st_locks, reaplist);
 }
 
@@ -1541,7 +1553,7 @@ static bool unhash_lock_stateid(struct nfs4_ol_stateid *stp)
 	if (!unhash_ol_stateid(stp))
 		return false;
 	list_del_init(&stp->st_locks);
-	nfs4_unhash_stid(&stp->st_stid);
+	stp->st_stid.sc_status |= SC_STATUS_CLOSED;
 	return true;
 }
 
@@ -1599,7 +1611,7 @@ static void release_open_stateid_locks(struct nfs4_ol_stateid *open_stp,
 	while (!list_empty(&open_stp->st_locks)) {
 		stp = list_entry(open_stp->st_locks.next,
 				struct nfs4_ol_stateid, st_locks);
-		WARN_ON(!unhash_lock_stateid(stp));
+		unhash_lock_stateid(stp);
 		put_ol_stateid_locked(stp, reaplist);
 	}
 }
@@ -1620,6 +1632,7 @@ static void release_open_stateid(struct nfs4_ol_stateid *stp)
 	LIST_HEAD(reaplist);
 
 	spin_lock(&stp->st_stid.sc_client->cl_lock);
+	stp->st_stid.sc_status |= SC_STATUS_CLOSED;
 	if (unhash_open_stateid(stp, &reaplist))
 		put_ol_stateid_locked(stp, &reaplist);
 	spin_unlock(&stp->st_stid.sc_client->cl_lock);
@@ -1675,6 +1688,136 @@ static void release_openowner(struct nfs4_openowner *oo)
 	nfs4_put_stateowner(&oo->oo_owner);
 }
 
+static struct nfs4_stid *find_one_sb_stid(struct nfs4_client *clp,
+					  struct super_block *sb,
+					  unsigned int sc_types)
+{
+	unsigned long id, tmp;
+	struct nfs4_stid *stid;
+
+	spin_lock(&clp->cl_lock);
+	idr_for_each_entry_ul(&clp->cl_stateids, stid, tmp, id)
+		if ((stid->sc_type & sc_types) &&
+		    stid->sc_status == 0 &&
+		    stid->sc_file->fi_inode->i_sb == sb) {
+			refcount_inc(&stid->sc_count);
+			break;
+		}
+	spin_unlock(&clp->cl_lock);
+	return stid;
+}
+
+/**
+ * nfsd4_revoke_states - revoke all nfsv4 states associated with given filesystem
+ * @net:  used to identify instance of nfsd (there is one per net namespace)
+ * @sb:   super_block used to identify target filesystem
+ *
+ * All nfs4 states (open, lock, delegation, layout) held by the server instance
+ * and associated with a file on the given filesystem will be revoked resulting
+ * in any files being closed and so all references from nfsd to the filesystem
+ * being released.  Thus nfsd will no longer prevent the filesystem from being
+ * unmounted.
+ *
+ * The clients which own the states will subsequently being notified that the
+ * states have been "admin-revoked".
+ */
+void nfsd4_revoke_states(struct net *net, struct super_block *sb)
+{
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	unsigned int idhashval;
+	unsigned int sc_types;
+
+	sc_types = SC_TYPE_OPEN | SC_TYPE_LOCK | SC_TYPE_DELEG | SC_TYPE_LAYOUT;
+
+	spin_lock(&nn->client_lock);
+	for (idhashval = 0; idhashval < CLIENT_HASH_MASK; idhashval++) {
+		struct list_head *head = &nn->conf_id_hashtbl[idhashval];
+		struct nfs4_client *clp;
+	retry:
+		list_for_each_entry(clp, head, cl_idhash) {
+			struct nfs4_stid *stid = find_one_sb_stid(clp, sb,
+								  sc_types);
+			if (stid) {
+				struct nfs4_ol_stateid *stp;
+				struct nfs4_delegation *dp;
+				struct nfs4_layout_stateid *ls;
+
+				spin_unlock(&nn->client_lock);
+				switch (stid->sc_type) {
+				case SC_TYPE_OPEN:
+					stp = openlockstateid(stid);
+					mutex_lock_nested(&stp->st_mutex,
+							  OPEN_STATEID_MUTEX);
+
+					spin_lock(&clp->cl_lock);
+					if (stid->sc_status == 0) {
+						stid->sc_status |=
+							SC_STATUS_ADMIN_REVOKED;
+						atomic_inc(&clp->cl_admin_revoked);
+						spin_unlock(&clp->cl_lock);
+						release_all_access(stp);
+					} else
+						spin_unlock(&clp->cl_lock);
+					mutex_unlock(&stp->st_mutex);
+					break;
+				case SC_TYPE_LOCK:
+					stp = openlockstateid(stid);
+					mutex_lock_nested(&stp->st_mutex,
+							  LOCK_STATEID_MUTEX);
+					spin_lock(&clp->cl_lock);
+					if (stid->sc_status == 0) {
+						struct nfs4_lockowner *lo =
+							lockowner(stp->st_stateowner);
+						struct nfsd_file *nf;
+
+						stid->sc_status |=
+							SC_STATUS_ADMIN_REVOKED;
+						atomic_inc(&clp->cl_admin_revoked);
+						spin_unlock(&clp->cl_lock);
+						nf = find_any_file(stp->st_stid.sc_file);
+						if (nf) {
+							get_file(nf->nf_file);
+							filp_close(nf->nf_file,
+								   (fl_owner_t)lo);
+							nfsd_file_put(nf);
+						}
+						release_all_access(stp);
+					} else
+						spin_unlock(&clp->cl_lock);
+					mutex_unlock(&stp->st_mutex);
+					break;
+				case SC_TYPE_DELEG:
+					dp = delegstateid(stid);
+					spin_lock(&state_lock);
+					if (!unhash_delegation_locked(
+						    dp, SC_STATUS_ADMIN_REVOKED))
+						dp = NULL;
+					spin_unlock(&state_lock);
+					if (dp)
+						revoke_delegation(dp);
+					break;
+				case SC_TYPE_LAYOUT:
+					ls = layoutstateid(stid);
+					nfsd4_close_layout(ls);
+					break;
+				}
+				nfs4_put_stid(stid);
+				spin_lock(&nn->client_lock);
+				if (clp->cl_minorversion == 0)
+					/* Allow cleanup after a lease period.
+					 * store_release ensures cleanup will
+					 * see any newly revoked states if it
+					 * sees the time updated.
+					 */
+					nn->nfs40_last_revoke =
+						ktime_get_boottime_seconds();
+				goto retry;
+			}
+		}
+	}
+	spin_unlock(&nn->client_lock);
+}
+
 static inline int
 hash_sessionid(struct nfs4_sessionid *sessionid)
 {
@@ -2228,7 +2371,7 @@ __destroy_client(struct nfs4_client *clp)
 	spin_lock(&state_lock);
 	while (!list_empty(&clp->cl_delegations)) {
 		dp = list_entry(clp->cl_delegations.next, struct nfs4_delegation, dl_perclnt);
-		WARN_ON(!unhash_delegation_locked(dp));
+		unhash_delegation_locked(dp, SC_STATUS_CLOSED);
 		list_add(&dp->dl_recall_lru, &reaplist);
 	}
 	spin_unlock(&state_lock);
@@ -2460,14 +2603,16 @@ find_stateid_locked(struct nfs4_client *cl, stateid_t *t)
 }
 
 static struct nfs4_stid *
-find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask)
+find_stateid_by_type(struct nfs4_client *cl, stateid_t *t,
+		     unsigned short typemask, unsigned short ok_states)
 {
 	struct nfs4_stid *s;
 
 	spin_lock(&cl->cl_lock);
 	s = find_stateid_locked(cl, t);
 	if (s != NULL) {
-		if (typemask & s->sc_type)
+		if ((s->sc_status & ~ok_states) == 0 &&
+		    (typemask & s->sc_type))
 			refcount_inc(&s->sc_count);
 		else
 			s = NULL;
@@ -2487,9 +2632,9 @@ static struct nfs4_client *get_nfsdfs_clp(struct inode *inode)
 
 static void seq_quote_mem(struct seq_file *m, char *data, int len)
 {
-	seq_printf(m, "\"");
+	seq_puts(m, "\"");
 	seq_escape_mem(m, data, len, ESCAPE_HEX | ESCAPE_NAP | ESCAPE_APPEND, "\"\\");
-	seq_printf(m, "\"");
+	seq_puts(m, "\"");
 }
 
 static const char *cb_state2str(int state)
@@ -2530,20 +2675,22 @@ static int client_info_show(struct seq_file *m, void *v)
 		seq_puts(m, "status: unconfirmed\n");
 	seq_printf(m, "seconds from last renew: %lld\n",
 		ktime_get_boottime_seconds() - clp->cl_time);
-	seq_printf(m, "name: ");
+	seq_puts(m, "name: ");
 	seq_quote_mem(m, clp->cl_name.data, clp->cl_name.len);
 	seq_printf(m, "\nminor version: %d\n", clp->cl_minorversion);
 	if (clp->cl_nii_domain.data) {
-		seq_printf(m, "Implementation domain: ");
+		seq_puts(m, "Implementation domain: ");
 		seq_quote_mem(m, clp->cl_nii_domain.data,
 					clp->cl_nii_domain.len);
-		seq_printf(m, "\nImplementation name: ");
+		seq_puts(m, "\nImplementation name: ");
 		seq_quote_mem(m, clp->cl_nii_name.data, clp->cl_nii_name.len);
 		seq_printf(m, "\nImplementation time: [%lld, %ld]\n",
 			clp->cl_nii_time.tv_sec, clp->cl_nii_time.tv_nsec);
 	}
 	seq_printf(m, "callback state: %s\n", cb_state2str(clp->cl_cb_state));
 	seq_printf(m, "callback address: %pISpc\n", &clp->cl_cb_conn.cb_addr);
+	seq_printf(m, "admin-revoked states: %d\n",
+		   atomic_read(&clp->cl_admin_revoked));
 	drop_client(clp);
 
 	return 0;
@@ -2602,7 +2749,7 @@ static void nfs4_show_superblock(struct seq_file *s, struct nfsd_file *f)
 
 static void nfs4_show_owner(struct seq_file *s, struct nfs4_stateowner *oo)
 {
-	seq_printf(s, "owner: ");
+	seq_puts(s, "owner: ");
 	seq_quote_mem(s, oo->so_owner.data, oo->so_owner.len);
 }
 
@@ -2620,20 +2767,13 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st)
 	struct nfs4_stateowner *oo;
 	unsigned int access, deny;
 
-	if (st->sc_type != NFS4_OPEN_STID && st->sc_type != NFS4_LOCK_STID)
-		return 0; /* XXX: or SEQ_SKIP? */
 	ols = openlockstateid(st);
 	oo = ols->st_stateowner;
 	nf = st->sc_file;
 
-	spin_lock(&nf->fi_lock);
-	file = find_any_file_locked(nf);
-	if (!file)
-		goto out;
-
-	seq_printf(s, "- ");
+	seq_puts(s, "- ");
 	nfs4_show_stateid(s, &st->sc_stateid);
-	seq_printf(s, ": { type: open, ");
+	seq_puts(s, ": { type: open, ");
 
 	access = bmap_to_share_mode(ols->st_access_bmap);
 	deny   = bmap_to_share_mode(ols->st_deny_bmap);
@@ -2645,14 +2785,19 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st)
 		deny & NFS4_SHARE_ACCESS_READ ? "r" : "-",
 		deny & NFS4_SHARE_ACCESS_WRITE ? "w" : "-");
 
-	nfs4_show_superblock(s, file);
-	seq_printf(s, ", ");
-	nfs4_show_fname(s, file);
-	seq_printf(s, ", ");
-	nfs4_show_owner(s, oo);
-	seq_printf(s, " }\n");
-out:
+	spin_lock(&nf->fi_lock);
+	file = find_any_file_locked(nf);
+	if (file) {
+		nfs4_show_superblock(s, file);
+		seq_puts(s, ", ");
+		nfs4_show_fname(s, file);
+		seq_puts(s, ", ");
+	}
 	spin_unlock(&nf->fi_lock);
+	nfs4_show_owner(s, oo);
+	if (st->sc_status & SC_STATUS_ADMIN_REVOKED)
+		seq_puts(s, ", admin-revoked");
+	seq_puts(s, " }\n");
 	return 0;
 }
 
@@ -2666,30 +2811,31 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st)
 	ols = openlockstateid(st);
 	oo = ols->st_stateowner;
 	nf = st->sc_file;
-	spin_lock(&nf->fi_lock);
-	file = find_any_file_locked(nf);
-	if (!file)
-		goto out;
 
-	seq_printf(s, "- ");
+	seq_puts(s, "- ");
 	nfs4_show_stateid(s, &st->sc_stateid);
-	seq_printf(s, ": { type: lock, ");
+	seq_puts(s, ": { type: lock, ");
 
-	/*
-	 * Note: a lock stateid isn't really the same thing as a lock,
-	 * it's the locking state held by one owner on a file, and there
-	 * may be multiple (or no) lock ranges associated with it.
-	 * (Same for the matter is true of open stateids.)
-	 */
+	spin_lock(&nf->fi_lock);
+	file = find_any_file_locked(nf);
+	if (file) {
+		/*
+		 * Note: a lock stateid isn't really the same thing as a lock,
+		 * it's the locking state held by one owner on a file, and there
+		 * may be multiple (or no) lock ranges associated with it.
+		 * (Same for the matter is true of open stateids.)
+		 */
 
-	nfs4_show_superblock(s, file);
-	/* XXX: open stateid? */
-	seq_printf(s, ", ");
-	nfs4_show_fname(s, file);
-	seq_printf(s, ", ");
+		nfs4_show_superblock(s, file);
+		/* XXX: open stateid? */
+		seq_puts(s, ", ");
+		nfs4_show_fname(s, file);
+		seq_puts(s, ", ");
+	}
 	nfs4_show_owner(s, oo);
-	seq_printf(s, " }\n");
-out:
+	if (st->sc_status & SC_STATUS_ADMIN_REVOKED)
+		seq_puts(s, ", admin-revoked");
+	seq_puts(s, " }\n");
 	spin_unlock(&nf->fi_lock);
 	return 0;
 }
@@ -2702,27 +2848,28 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st)
 
 	ds = delegstateid(st);
 	nf = st->sc_file;
-	spin_lock(&nf->fi_lock);
-	file = nf->fi_deleg_file;
-	if (!file)
-		goto out;
 
-	seq_printf(s, "- ");
+	seq_puts(s, "- ");
 	nfs4_show_stateid(s, &st->sc_stateid);
-	seq_printf(s, ": { type: deleg, ");
+	seq_puts(s, ": { type: deleg, ");
 
-	/* Kinda dead code as long as we only support read delegs: */
-	seq_printf(s, "access: %s, ",
-		ds->dl_type == NFS4_OPEN_DELEGATE_READ ? "r" : "w");
+	seq_printf(s, "access: %s",
+		   ds->dl_type == NFS4_OPEN_DELEGATE_READ ? "r" : "w");
 
 	/* XXX: lease time, whether it's being recalled. */
 
-	nfs4_show_superblock(s, file);
-	seq_printf(s, ", ");
-	nfs4_show_fname(s, file);
-	seq_printf(s, " }\n");
-out:
+	spin_lock(&nf->fi_lock);
+	file = nf->fi_deleg_file;
+	if (file) {
+		seq_puts(s, ", ");
+		nfs4_show_superblock(s, file);
+		seq_puts(s, ", ");
+		nfs4_show_fname(s, file);
+	}
 	spin_unlock(&nf->fi_lock);
+	if (st->sc_status & SC_STATUS_ADMIN_REVOKED)
+		seq_puts(s, ", admin-revoked");
+	seq_puts(s, " }\n");
 	return 0;
 }
 
@@ -2732,18 +2879,25 @@ static int nfs4_show_layout(struct seq_file *s, struct nfs4_stid *st)
 	struct nfsd_file *file;
 
 	ls = container_of(st, struct nfs4_layout_stateid, ls_stid);
-	file = ls->ls_file;
 
-	seq_printf(s, "- ");
+	seq_puts(s, "- ");
 	nfs4_show_stateid(s, &st->sc_stateid);
-	seq_printf(s, ": { type: layout, ");
+	seq_puts(s, ": { type: layout");
 
 	/* XXX: What else would be useful? */
 
-	nfs4_show_superblock(s, file);
-	seq_printf(s, ", ");
-	nfs4_show_fname(s, file);
-	seq_printf(s, " }\n");
+	spin_lock(&ls->ls_stid.sc_file->fi_lock);
+	file = ls->ls_file;
+	if (file) {
+		seq_puts(s, ", ");
+		nfs4_show_superblock(s, file);
+		seq_puts(s, ", ");
+		nfs4_show_fname(s, file);
+	}
+	spin_unlock(&ls->ls_stid.sc_file->fi_lock);
+	if (st->sc_status & SC_STATUS_ADMIN_REVOKED)
+		seq_puts(s, ", admin-revoked");
+	seq_puts(s, " }\n");
 
 	return 0;
 }
@@ -2753,13 +2907,13 @@ static int states_show(struct seq_file *s, void *v)
 	struct nfs4_stid *st = v;
 
 	switch (st->sc_type) {
-	case NFS4_OPEN_STID:
+	case SC_TYPE_OPEN:
 		return nfs4_show_open(s, st);
-	case NFS4_LOCK_STID:
+	case SC_TYPE_LOCK:
 		return nfs4_show_lock(s, st);
-	case NFS4_DELEG_STID:
+	case SC_TYPE_DELEG:
 		return nfs4_show_deleg(s, st);
-	case NFS4_LAYOUT_STID:
+	case SC_TYPE_LAYOUT:
 		return nfs4_show_layout(s, st);
 	default:
 		return 0; /* XXX: or SEQ_SKIP? */
@@ -2888,12 +3042,38 @@ static void
 nfsd4_cb_recall_any_release(struct nfsd4_callback *cb)
 {
 	struct nfs4_client *clp = cb->cb_clp;
-	struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
 
-	spin_lock(&nn->client_lock);
 	clear_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags);
-	put_client_renew_locked(clp);
-	spin_unlock(&nn->client_lock);
+	drop_client(clp);
+}
+
+static int
+nfsd4_cb_getattr_done(struct nfsd4_callback *cb, struct rpc_task *task)
+{
+	struct nfs4_cb_fattr *ncf =
+			container_of(cb, struct nfs4_cb_fattr, ncf_getattr);
+
+	ncf->ncf_cb_status = task->tk_status;
+	switch (task->tk_status) {
+	case -NFS4ERR_DELAY:
+		rpc_delay(task, 2 * HZ);
+		return 0;
+	default:
+		return 1;
+	}
+}
+
+static void
+nfsd4_cb_getattr_release(struct nfsd4_callback *cb)
+{
+	struct nfs4_cb_fattr *ncf =
+			container_of(cb, struct nfs4_cb_fattr, ncf_getattr);
+	struct nfs4_delegation *dp =
+			container_of(ncf, struct nfs4_delegation, dl_cb_fattr);
+
+	nfs4_put_stid(&dp->dl_stid);
+	clear_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags);
+	wake_up_bit(&ncf->ncf_cb_flags, CB_GETATTR_BUSY);
 }
 
 static const struct nfsd4_callback_ops nfsd4_cb_recall_any_ops = {
@@ -2901,6 +3081,25 @@ static const struct nfsd4_callback_ops nfsd4_cb_recall_any_ops = {
 	.release	= nfsd4_cb_recall_any_release,
 };
 
+static const struct nfsd4_callback_ops nfsd4_cb_getattr_ops = {
+	.done		= nfsd4_cb_getattr_done,
+	.release	= nfsd4_cb_getattr_release,
+};
+
+static void nfs4_cb_getattr(struct nfs4_cb_fattr *ncf)
+{
+	struct nfs4_delegation *dp =
+			container_of(ncf, struct nfs4_delegation, dl_cb_fattr);
+
+	if (test_and_set_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags))
+		return;
+	/* set to proper status when nfsd4_cb_getattr_done runs */
+	ncf->ncf_cb_status = NFS4ERR_IO;
+
+	refcount_inc(&dp->dl_stid.sc_count);
+	nfsd4_run_cb(&ncf->ncf_getattr);
+}
+
 static struct nfs4_client *create_client(struct xdr_netobj name,
 		struct svc_rqst *rqstp, nfs4_verifier *verf)
 {
@@ -3414,6 +3613,9 @@ out_new:
 	new->cl_spo_must_allow.u.words[0] = exid->spo_must_allow[0];
 	new->cl_spo_must_allow.u.words[1] = exid->spo_must_allow[1];
 
+	/* Contrived initial CREATE_SESSION response */
+	new->cl_cs_slot.sl_status = nfserr_seq_misordered;
+
 	add_to_unconfirmed(new);
 	swap(new, conf);
 out_copy:
@@ -3584,10 +3786,10 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 	struct nfsd4_create_session *cr_ses = &u->create_session;
 	struct sockaddr *sa = svc_addr(rqstp);
 	struct nfs4_client *conf, *unconf;
+	struct nfsd4_clid_slot *cs_slot;
 	struct nfs4_client *old = NULL;
 	struct nfsd4_session *new;
 	struct nfsd4_conn *conn;
-	struct nfsd4_clid_slot *cs_slot = NULL;
 	__be32 status = 0;
 	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 
@@ -3611,53 +3813,63 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 		goto out_free_session;
 
 	spin_lock(&nn->client_lock);
+
+	/* RFC 8881 Section 18.36.4 Phase 1: Client record look-up. */
 	unconf = find_unconfirmed_client(&cr_ses->clientid, true, nn);
 	conf = find_confirmed_client(&cr_ses->clientid, true, nn);
-	WARN_ON_ONCE(conf && unconf);
+	if (!conf && !unconf) {
+		status = nfserr_stale_clientid;
+		goto out_free_conn;
+	}
 
+	/* RFC 8881 Section 18.36.4 Phase 2: Sequence ID processing. */
+	if (conf)
+		cs_slot = &conf->cl_cs_slot;
+	else
+		cs_slot = &unconf->cl_cs_slot;
+	status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
+	switch (status) {
+	case nfs_ok:
+		cs_slot->sl_seqid++;
+		cr_ses->seqid = cs_slot->sl_seqid;
+		break;
+	case nfserr_replay_cache:
+		status = nfsd4_replay_create_session(cr_ses, cs_slot);
+		fallthrough;
+	case nfserr_jukebox:
+		/* The server MUST NOT cache NFS4ERR_DELAY */
+		goto out_free_conn;
+	default:
+		goto out_cache_error;
+	}
+
+	/* RFC 8881 Section 18.36.4 Phase 3: Client ID confirmation. */
 	if (conf) {
 		status = nfserr_wrong_cred;
 		if (!nfsd4_mach_creds_match(conf, rqstp))
-			goto out_free_conn;
-		cs_slot = &conf->cl_cs_slot;
-		status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
-		if (status) {
-			if (status == nfserr_replay_cache)
-				status = nfsd4_replay_create_session(cr_ses, cs_slot);
-			goto out_free_conn;
-		}
-	} else if (unconf) {
+			goto out_cache_error;
+	} else {
 		status = nfserr_clid_inuse;
 		if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
 		    !rpc_cmp_addr(sa, (struct sockaddr *) &unconf->cl_addr)) {
 			trace_nfsd_clid_cred_mismatch(unconf, rqstp);
-			goto out_free_conn;
+			goto out_cache_error;
 		}
 		status = nfserr_wrong_cred;
 		if (!nfsd4_mach_creds_match(unconf, rqstp))
-			goto out_free_conn;
-		cs_slot = &unconf->cl_cs_slot;
-		status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
-		if (status) {
-			/* an unconfirmed replay returns misordered */
-			status = nfserr_seq_misordered;
-			goto out_free_conn;
-		}
+			goto out_cache_error;
 		old = find_confirmed_client_by_name(&unconf->cl_name, nn);
 		if (old) {
 			status = mark_client_expired_locked(old);
-			if (status) {
-				old = NULL;
-				goto out_free_conn;
-			}
+			if (status)
+				goto out_expired_error;
 			trace_nfsd_clid_replaced(&old->cl_clientid);
 		}
 		move_to_confirmed(unconf);
 		conf = unconf;
-	} else {
-		status = nfserr_stale_clientid;
-		goto out_free_conn;
 	}
+
+	/* RFC 8881 Section 18.36.4 Phase 4: Session creation. */
 	status = nfs_ok;
 	/* Persistent sessions are not supported */
 	cr_ses->flags &= ~SESSION4_PERSIST;
@@ -3669,8 +3881,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 
 	memcpy(cr_ses->sessionid.data, new->se_sessionid.data,
 	       NFS4_MAX_SESSIONID_LEN);
-	cs_slot->sl_seqid++;
-	cr_ses->seqid = cs_slot->sl_seqid;
 
 	/* cache solo and embedded create sessions under the client_lock */
 	nfsd4_cache_create_session(cr_ses, cs_slot, status);
@@ -3683,6 +3893,20 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 	if (old)
 		expire_client(old);
 	return status;
+
+out_expired_error:
+	old = NULL;
+	/*
+	 * Revert the slot seq_nr change so the server will process
+	 * the client's resend instead of returning a cached response.
+	 */
+	if (status == nfserr_jukebox) {
+		cs_slot->sl_seqid--;
+		cr_ses->seqid = cs_slot->sl_seqid;
+		goto out_free_conn;
+	}
+out_cache_error:
+	nfsd4_cache_create_session(cr_ses, cs_slot, status);
 out_free_conn:
 	spin_unlock(&nn->client_lock);
 	free_conn(conn);
@@ -4058,6 +4282,9 @@ out:
 	}
 	if (!list_empty(&clp->cl_revoked))
 		seq->status_flags |= SEQ4_STATUS_RECALLABLE_STATE_REVOKED;
+	if (atomic_read(&clp->cl_admin_revoked))
+		seq->status_flags |= SEQ4_STATUS_ADMIN_STATE_REVOKED;
+	trace_nfsd_seq4_status(rqstp, seq);
 out_no_session:
 	if (conn)
 		free_conn(conn);
@@ -4352,32 +4579,25 @@ nfsd4_free_slabs(void)
 int
 nfsd4_init_slabs(void)
 {
-	client_slab = kmem_cache_create("nfsd4_clients",
-			sizeof(struct nfs4_client), 0, 0, NULL);
+	client_slab = KMEM_CACHE(nfs4_client, 0);
 	if (client_slab == NULL)
 		goto out;
-	openowner_slab = kmem_cache_create("nfsd4_openowners",
-			sizeof(struct nfs4_openowner), 0, 0, NULL);
+	openowner_slab = KMEM_CACHE(nfs4_openowner, 0);
 	if (openowner_slab == NULL)
 		goto out_free_client_slab;
-	lockowner_slab = kmem_cache_create("nfsd4_lockowners",
-			sizeof(struct nfs4_lockowner), 0, 0, NULL);
+	lockowner_slab = KMEM_CACHE(nfs4_lockowner, 0);
 	if (lockowner_slab == NULL)
 		goto out_free_openowner_slab;
-	file_slab = kmem_cache_create("nfsd4_files",
-			sizeof(struct nfs4_file), 0, 0, NULL);
+	file_slab = KMEM_CACHE(nfs4_file, 0);
 	if (file_slab == NULL)
 		goto out_free_lockowner_slab;
-	stateid_slab = kmem_cache_create("nfsd4_stateids",
-			sizeof(struct nfs4_ol_stateid), 0, 0, NULL);
+	stateid_slab = KMEM_CACHE(nfs4_ol_stateid, 0);
 	if (stateid_slab == NULL)
 		goto out_free_file_slab;
-	deleg_slab = kmem_cache_create("nfsd4_delegations",
-			sizeof(struct nfs4_delegation), 0, 0, NULL);
+	deleg_slab = KMEM_CACHE(nfs4_delegation, 0);
 	if (deleg_slab == NULL)
 		goto out_free_stateid_slab;
-	odstate_slab = kmem_cache_create("nfsd4_odstate",
-			sizeof(struct nfs4_clnt_odstate), 0, 0, NULL);
+	odstate_slab = KMEM_CACHE(nfs4_clnt_odstate, 0);
 	if (odstate_slab == NULL)
 		goto out_free_deleg_slab;
 	return 0;
@@ -4531,7 +4751,8 @@ nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
 			continue;
 		if (local->st_stateowner != &oo->oo_owner)
 			continue;
-		if (local->st_stid.sc_type == NFS4_OPEN_STID) {
+		if (local->st_stid.sc_type == SC_TYPE_OPEN &&
+		    !local->st_stid.sc_status) {
 			ret = local;
 			refcount_inc(&ret->st_stid.sc_count);
 			break;
@@ -4540,22 +4761,75 @@ nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open)
 	return ret;
 }
 
-static __be32
-nfsd4_verify_open_stid(struct nfs4_stid *s)
+static void nfsd4_drop_revoked_stid(struct nfs4_stid *s)
+	__releases(&s->sc_client->cl_lock)
 {
-	__be32 ret = nfs_ok;
+	struct nfs4_client *cl = s->sc_client;
+	LIST_HEAD(reaplist);
+	struct nfs4_ol_stateid *stp;
+	struct nfs4_delegation *dp;
+	bool unhashed;
 
 	switch (s->sc_type) {
-	default:
+	case SC_TYPE_OPEN:
+		stp = openlockstateid(s);
+		if (unhash_open_stateid(stp, &reaplist))
+			put_ol_stateid_locked(stp, &reaplist);
+		spin_unlock(&cl->cl_lock);
+		free_ol_stateid_reaplist(&reaplist);
 		break;
-	case 0:
-	case NFS4_CLOSED_STID:
-	case NFS4_CLOSED_DELEG_STID:
-		ret = nfserr_bad_stateid;
+	case SC_TYPE_LOCK:
+		stp = openlockstateid(s);
+		unhashed = unhash_lock_stateid(stp);
+		spin_unlock(&cl->cl_lock);
+		if (unhashed)
+			nfs4_put_stid(s);
 		break;
-	case NFS4_REVOKED_DELEG_STID:
-		ret = nfserr_deleg_revoked;
+	case SC_TYPE_DELEG:
+		dp = delegstateid(s);
+		list_del_init(&dp->dl_recall_lru);
+		spin_unlock(&cl->cl_lock);
+		nfs4_put_stid(s);
+		break;
+	default:
+		spin_unlock(&cl->cl_lock);
+	}
+}
+
+static void nfsd40_drop_revoked_stid(struct nfs4_client *cl,
+				    stateid_t *stid)
+{
+	/* NFSv4.0 has no way for the client to tell the server
+	 * that it can forget an admin-revoked stateid.
+	 * So we keep it around until the first time that the
+	 * client uses it, and drop it the first time
+	 * nfserr_admin_revoked is returned.
+	 * For v4.1 and later we wait until explicitly told
+	 * to free the stateid.
+	 */
+	if (cl->cl_minorversion == 0) {
+		struct nfs4_stid *st;
+
+		spin_lock(&cl->cl_lock);
+		st = find_stateid_locked(cl, stid);
+		if (st)
+			nfsd4_drop_revoked_stid(st);
+		else
+			spin_unlock(&cl->cl_lock);
 	}
+}
+
+static __be32
+nfsd4_verify_open_stid(struct nfs4_stid *s)
+{
+	__be32 ret = nfs_ok;
+
+	if (s->sc_status & SC_STATUS_ADMIN_REVOKED)
+		ret = nfserr_admin_revoked;
+	else if (s->sc_status & SC_STATUS_REVOKED)
+		ret = nfserr_deleg_revoked;
+	else if (s->sc_status & SC_STATUS_CLOSED)
+		ret = nfserr_bad_stateid;
 	return ret;
 }
 
@@ -4567,6 +4841,10 @@ nfsd4_lock_ol_stateid(struct nfs4_ol_stateid *stp)
 
 	mutex_lock_nested(&stp->st_mutex, LOCK_STATEID_MUTEX);
 	ret = nfsd4_verify_open_stid(&stp->st_stid);
+	if (ret == nfserr_admin_revoked)
+		nfsd40_drop_revoked_stid(stp->st_stid.sc_client,
+					&stp->st_stid.sc_stateid);
+
 	if (ret != nfs_ok)
 		mutex_unlock(&stp->st_mutex);
 	return ret;
@@ -4641,7 +4919,7 @@ retry:
 
 	open->op_stp = NULL;
 	refcount_inc(&stp->st_stid.sc_count);
-	stp->st_stid.sc_type = NFS4_OPEN_STID;
+	stp->st_stid.sc_type = SC_TYPE_OPEN;
 	INIT_LIST_HEAD(&stp->st_locks);
 	stp->st_stateowner = nfs4_get_stateowner(&oo->oo_owner);
 	get_nfs4_file(fp);
@@ -4868,9 +5146,9 @@ static int nfsd4_cb_recall_done(struct nfsd4_callback *cb,
 
 	trace_nfsd_cb_recall_done(&dp->dl_stid.sc_stateid, task);
 
-	if (dp->dl_stid.sc_type == NFS4_CLOSED_DELEG_STID ||
-	    dp->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID)
-	        return 1;
+	if (dp->dl_stid.sc_status)
+		/* CLOSED or REVOKED */
+		return 1;
 
 	switch (task->tk_status) {
 	case 0:
@@ -4922,9 +5200,9 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
 
 /* Called from break_lease() with flc_lock held. */
 static bool
-nfsd_break_deleg_cb(struct file_lock *fl)
+nfsd_break_deleg_cb(struct file_lease *fl)
 {
-	struct nfs4_delegation *dp = (struct nfs4_delegation *)fl->fl_owner;
+	struct nfs4_delegation *dp = (struct nfs4_delegation *) fl->c.flc_owner;
 	struct nfs4_file *fp = dp->dl_stid.sc_file;
 	struct nfs4_client *clp = dp->dl_stid.sc_client;
 	struct nfsd_net *nn;
@@ -4958,9 +5236,9 @@ nfsd_break_deleg_cb(struct file_lock *fl)
  *   %true: Lease conflict was resolved
  *   %false: Lease conflict was not resolved.
  */
-static bool nfsd_breaker_owns_lease(struct file_lock *fl)
+static bool nfsd_breaker_owns_lease(struct file_lease *fl)
 {
-	struct nfs4_delegation *dl = fl->fl_owner;
+	struct nfs4_delegation *dl = fl->c.flc_owner;
 	struct svc_rqst *rqst;
 	struct nfs4_client *clp;
 
@@ -4975,10 +5253,10 @@ static bool nfsd_breaker_owns_lease(struct file_lock *fl)
 }
 
 static int
-nfsd_change_deleg_cb(struct file_lock *onlist, int arg,
+nfsd_change_deleg_cb(struct file_lease *onlist, int arg,
 		     struct list_head *dispose)
 {
-	struct nfs4_delegation *dp = (struct nfs4_delegation *)onlist->fl_owner;
+	struct nfs4_delegation *dp = (struct nfs4_delegation *) onlist->c.flc_owner;
 	struct nfs4_client *clp = dp->dl_stid.sc_client;
 
 	if (arg & F_UNLCK) {
@@ -4989,7 +5267,7 @@ nfsd_change_deleg_cb(struct file_lock *onlist, int arg,
 		return -EAGAIN;
 }
 
-static const struct lock_manager_operations nfsd_lease_mng_ops = {
+static const struct lease_manager_operations nfsd_lease_mng_ops = {
 	.lm_breaker_owns_lease = nfsd_breaker_owns_lease,
 	.lm_break = nfsd_break_deleg_cb,
 	.lm_change = nfsd_change_deleg_cb,
@@ -5113,12 +5391,12 @@ static int share_access_to_flags(u32 share_access)
 	return share_access == NFS4_SHARE_ACCESS_READ ? RD_STATE : WR_STATE;
 }
 
-static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, stateid_t *s)
+static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl,
+						  stateid_t *s)
 {
 	struct nfs4_stid *ret;
 
-	ret = find_stateid_by_type(cl, s,
-				NFS4_DELEG_STID|NFS4_REVOKED_DELEG_STID);
+	ret = find_stateid_by_type(cl, s, SC_TYPE_DELEG, SC_STATUS_REVOKED);
 	if (!ret)
 		return NULL;
 	return delegstateid(ret);
@@ -5141,10 +5419,15 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open,
 	deleg = find_deleg_stateid(cl, &open->op_delegate_stateid);
 	if (deleg == NULL)
 		goto out;
-	if (deleg->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) {
+	if (deleg->dl_stid.sc_status & SC_STATUS_ADMIN_REVOKED) {
 		nfs4_put_stid(&deleg->dl_stid);
-		if (cl->cl_minorversion)
-			status = nfserr_deleg_revoked;
+		status = nfserr_admin_revoked;
+		goto out;
+	}
+	if (deleg->dl_stid.sc_status & SC_STATUS_REVOKED) {
+		nfs4_put_stid(&deleg->dl_stid);
+		nfsd40_drop_revoked_stid(cl, &open->op_delegate_stateid);
+		status = nfserr_deleg_revoked;
 		goto out;
 	}
 	flags = share_access_to_flags(open->op_share_access);
@@ -5189,7 +5472,7 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
 		return 0;
 	if (!(open->op_share_access & NFS4_SHARE_ACCESS_WRITE))
 		return nfserr_inval;
-	return nfsd_setattr(rqstp, fh, &attrs, 0, (time64_t)0);
+	return nfsd_setattr(rqstp, fh, &attrs, NULL);
 }
 
 static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file *fp,
@@ -5329,21 +5612,20 @@ static bool nfsd4_cb_channel_good(struct nfs4_client *clp)
 	return clp->cl_minorversion && clp->cl_cb_state == NFSD4_CB_UNKNOWN;
 }
 
-static struct file_lock *nfs4_alloc_init_lease(struct nfs4_delegation *dp,
+static struct file_lease *nfs4_alloc_init_lease(struct nfs4_delegation *dp,
 						int flag)
 {
-	struct file_lock *fl;
+	struct file_lease *fl;
 
-	fl = locks_alloc_lock();
+	fl = locks_alloc_lease();
 	if (!fl)
 		return NULL;
 	fl->fl_lmops = &nfsd_lease_mng_ops;
-	fl->fl_flags = FL_DELEG;
-	fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
-	fl->fl_end = OFFSET_MAX;
-	fl->fl_owner = (fl_owner_t)dp;
-	fl->fl_pid = current->tgid;
-	fl->fl_file = dp->dl_stid.sc_file->fi_deleg_file->nf_file;
+	fl->c.flc_flags = FL_DELEG;
+	fl->c.flc_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK;
+	fl->c.flc_owner = (fl_owner_t)dp;
+	fl->c.flc_pid = current->tgid;
+	fl->c.flc_file = dp->dl_stid.sc_file->fi_deleg_file->nf_file;
 	return fl;
 }
 
@@ -5461,7 +5743,7 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
 	struct nfs4_clnt_odstate *odstate = stp->st_clnt_odstate;
 	struct nfs4_delegation *dp;
 	struct nfsd_file *nf = NULL;
-	struct file_lock *fl;
+	struct file_lease *fl;
 	u32 dl_type;
 
 	/*
@@ -5531,9 +5813,10 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
 	if (!fl)
 		goto out_clnt_odstate;
 
-	status = vfs_setlease(fp->fi_deleg_file->nf_file, fl->fl_type, &fl, NULL);
+	status = kernel_setlease(fp->fi_deleg_file->nf_file,
+				      fl->c.flc_type, &fl, NULL);
 	if (fl)
-		locks_free_lock(fl);
+		locks_free_lease(fl);
 	if (status)
 		goto out_clnt_odstate;
 
@@ -5560,9 +5843,11 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
 		goto out_unlock;
 
 	spin_lock(&state_lock);
+	spin_lock(&clp->cl_lock);
 	spin_lock(&fp->fi_lock);
 	status = hash_delegation_locked(dp, fp);
 	spin_unlock(&fp->fi_lock);
+	spin_unlock(&clp->cl_lock);
 	spin_unlock(&state_lock);
 
 	if (status)
@@ -5570,7 +5855,7 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
 
 	return dp;
 out_unlock:
-	vfs_setlease(fp->fi_deleg_file->nf_file, F_UNLCK, NULL, (void **)&dp);
+	kernel_setlease(fp->fi_deleg_file->nf_file, F_UNLCK, NULL, (void **)&dp);
 out_clnt_odstate:
 	put_clnt_odstate(dp->dl_clnt_odstate);
 	nfs4_put_stid(&dp->dl_stid);
@@ -5634,6 +5919,8 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
 	struct svc_fh *parent = NULL;
 	int cb_up;
 	int status = 0;
+	struct kstat stat;
+	struct path path;
 
 	cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client);
 	open->op_recall = false;
@@ -5671,6 +5958,18 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
 	if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) {
 		open->op_delegate_type = NFS4_OPEN_DELEGATE_WRITE;
 		trace_nfsd_deleg_write(&dp->dl_stid.sc_stateid);
+		path.mnt = currentfh->fh_export->ex_path.mnt;
+		path.dentry = currentfh->fh_dentry;
+		if (vfs_getattr(&path, &stat,
+				(STATX_SIZE | STATX_CTIME | STATX_CHANGE_COOKIE),
+				AT_STATX_SYNC_AS_STAT)) {
+			nfs4_put_stid(&dp->dl_stid);
+			destroy_delegation(dp);
+			goto out_no_deleg;
+		}
+		dp->dl_cb_fattr.ncf_cur_fsize = stat.size;
+		dp->dl_cb_fattr.ncf_initial_cinfo =
+			nfsd4_change_attribute(&stat, d_inode(currentfh->fh_dentry));
 	} else {
 		open->op_delegate_type = NFS4_OPEN_DELEGATE_READ;
 		trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid);
@@ -5774,7 +6073,6 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
 	} else {
 		status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open, true);
 		if (status) {
-			stp->st_stid.sc_type = NFS4_CLOSED_STID;
 			release_open_stateid(stp);
 			mutex_unlock(&stp->st_mutex);
 			goto out;
@@ -6128,6 +6426,43 @@ nfs4_process_client_reaplist(struct list_head *reaplist)
 	}
 }
 
+static void nfs40_clean_admin_revoked(struct nfsd_net *nn,
+				      struct laundry_time *lt)
+{
+	struct nfs4_client *clp;
+
+	spin_lock(&nn->client_lock);
+	if (nn->nfs40_last_revoke == 0 ||
+	    nn->nfs40_last_revoke > lt->cutoff) {
+		spin_unlock(&nn->client_lock);
+		return;
+	}
+	nn->nfs40_last_revoke = 0;
+
+retry:
+	list_for_each_entry(clp, &nn->client_lru, cl_lru) {
+		unsigned long id, tmp;
+		struct nfs4_stid *stid;
+
+		if (atomic_read(&clp->cl_admin_revoked) == 0)
+			continue;
+
+		spin_lock(&clp->cl_lock);
+		idr_for_each_entry_ul(&clp->cl_stateids, stid, tmp, id)
+			if (stid->sc_status & SC_STATUS_ADMIN_REVOKED) {
+				refcount_inc(&stid->sc_count);
+				spin_unlock(&nn->client_lock);
+				/* this function drops ->cl_lock */
+				nfsd4_drop_revoked_stid(stid);
+				nfs4_put_stid(stid);
+				spin_lock(&nn->client_lock);
+				goto retry;
+			}
+		spin_unlock(&clp->cl_lock);
+	}
+	spin_unlock(&nn->client_lock);
+}
+
 static time64_t
 nfs4_laundromat(struct nfsd_net *nn)
 {
@@ -6161,12 +6496,14 @@ nfs4_laundromat(struct nfsd_net *nn)
 	nfs4_get_client_reaplist(nn, &reaplist, &lt);
 	nfs4_process_client_reaplist(&reaplist);
 
+	nfs40_clean_admin_revoked(nn, &lt);
+
 	spin_lock(&state_lock);
 	list_for_each_safe(pos, next, &nn->del_recall_lru) {
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
 		if (!state_expired(&lt, dp->dl_time))
 			break;
-		WARN_ON(!unhash_delegation_locked(dp));
+		unhash_delegation_locked(dp, SC_STATUS_REVOKED);
 		list_add(&dp->dl_recall_lru, &reaplist);
 	}
 	spin_unlock(&state_lock);
@@ -6225,6 +6562,8 @@ nfs4_laundromat(struct nfsd_net *nn)
 	/* service the server-to-server copy delayed unmount list */
 	nfsd4_ssc_expire_umount(nn);
 #endif
+	if (atomic_long_read(&num_delegations) >= max_delegations)
+		deleg_reaper(nn);
 out:
 	return max_t(time64_t, lt.new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT);
 }
@@ -6274,7 +6613,7 @@ deleg_reaper(struct nfsd_net *nn)
 		list_add(&clp->cl_ra_cblist, &cblist);
 
 		/* release in nfsd4_cb_recall_any_release */
-		atomic_inc(&clp->cl_rpc_users);
+		kref_get(&clp->cl_nfsdfs.cl_ref);
 		set_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags);
 		clp->cl_ra_time = ktime_get_boottime_seconds();
 	}
@@ -6286,6 +6625,8 @@ deleg_reaper(struct nfsd_net *nn)
 		list_del_init(&clp->cl_ra_cblist);
 		clp->cl_ra->ra_keep = 0;
 		clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG);
+		clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG) |
+						BIT(RCA4_TYPE_MASK_WDATA_DLG);
 		trace_nfsd_cb_recall_any(clp->cl_ra);
 		nfsd4_run_cb(&clp->cl_ra->ra_cb);
 	}
@@ -6379,6 +6720,9 @@ static __be32 nfsd4_stid_check_stateid_generation(stateid_t *in, struct nfs4_sti
 	if (ret == nfs_ok)
 		ret = check_stateid_generation(in, &s->sc_stateid, has_session);
 	spin_unlock(&s->sc_lock);
+	if (ret == nfserr_admin_revoked)
+		nfsd40_drop_revoked_stid(s->sc_client,
+					&s->sc_stateid);
 	return ret;
 }
 
@@ -6405,32 +6749,33 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
 	status = nfsd4_stid_check_stateid_generation(stateid, s, 1);
 	if (status)
 		goto out_unlock;
+	status = nfsd4_verify_open_stid(s);
+	if (status)
+		goto out_unlock;
+
 	switch (s->sc_type) {
-	case NFS4_DELEG_STID:
+	case SC_TYPE_DELEG:
 		status = nfs_ok;
 		break;
-	case NFS4_REVOKED_DELEG_STID:
-		status = nfserr_deleg_revoked;
-		break;
-	case NFS4_OPEN_STID:
-	case NFS4_LOCK_STID:
+	case SC_TYPE_OPEN:
+	case SC_TYPE_LOCK:
 		status = nfsd4_check_openowner_confirmed(openlockstateid(s));
 		break;
 	default:
 		printk("unknown stateid type %x\n", s->sc_type);
-		fallthrough;
-	case NFS4_CLOSED_STID:
-	case NFS4_CLOSED_DELEG_STID:
 		status = nfserr_bad_stateid;
 	}
 out_unlock:
 	spin_unlock(&cl->cl_lock);
+	if (status == nfserr_admin_revoked)
+		nfsd40_drop_revoked_stid(cl, stateid);
 	return status;
 }
 
 __be32
 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
-		     stateid_t *stateid, unsigned char typemask,
+		     stateid_t *stateid,
+		     unsigned short typemask, unsigned short statusmask,
 		     struct nfs4_stid **s, struct nfsd_net *nn)
 {
 	__be32 status;
@@ -6441,10 +6786,15 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
 	 *  only return revoked delegations if explicitly asked.
 	 *  otherwise we report revoked or bad_stateid status.
 	 */
-	if (typemask & NFS4_REVOKED_DELEG_STID)
+	if (statusmask & SC_STATUS_REVOKED)
 		return_revoked = true;
-	else if (typemask & NFS4_DELEG_STID)
-		typemask |= NFS4_REVOKED_DELEG_STID;
+	if (typemask & SC_TYPE_DELEG)
+		/* Always allow REVOKED for DELEG so we can
+		 * retturn the appropriate error.
+		 */
+		statusmask |= SC_STATUS_REVOKED;
+
+	statusmask |= SC_STATUS_ADMIN_REVOKED;
 
 	if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) ||
 		CLOSE_STATEID(stateid))
@@ -6457,14 +6807,17 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
 	}
 	if (status)
 		return status;
-	stid = find_stateid_by_type(cstate->clp, stateid, typemask);
+	stid = find_stateid_by_type(cstate->clp, stateid, typemask, statusmask);
 	if (!stid)
 		return nfserr_bad_stateid;
-	if ((stid->sc_type == NFS4_REVOKED_DELEG_STID) && !return_revoked) {
+	if ((stid->sc_status & SC_STATUS_REVOKED) && !return_revoked) {
 		nfs4_put_stid(stid);
-		if (cstate->minorversion)
-			return nfserr_deleg_revoked;
-		return nfserr_bad_stateid;
+		return nfserr_deleg_revoked;
+	}
+	if (stid->sc_status & SC_STATUS_ADMIN_REVOKED) {
+		nfsd40_drop_revoked_stid(cstate->clp, stateid);
+		nfs4_put_stid(stid);
+		return nfserr_admin_revoked;
 	}
 	*s = stid;
 	return nfs_ok;
@@ -6475,17 +6828,17 @@ nfs4_find_file(struct nfs4_stid *s, int flags)
 {
 	struct nfsd_file *ret = NULL;
 
-	if (!s)
+	if (!s || s->sc_status)
 		return NULL;
 
 	switch (s->sc_type) {
-	case NFS4_DELEG_STID:
+	case SC_TYPE_DELEG:
 		spin_lock(&s->sc_file->fi_lock);
 		ret = nfsd_file_get(s->sc_file->fi_deleg_file);
 		spin_unlock(&s->sc_file->fi_lock);
 		break;
-	case NFS4_OPEN_STID:
-	case NFS4_LOCK_STID:
+	case SC_TYPE_OPEN:
+	case SC_TYPE_LOCK:
 		if (flags & RD_STATE)
 			ret = find_readable_file(s->sc_file);
 		else
@@ -6598,7 +6951,8 @@ static __be32 find_cpntf_state(struct nfsd_net *nn, stateid_t *st,
 		goto out;
 
 	*stid = find_stateid_by_type(found, &cps->cp_p_stateid,
-			NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID);
+				     SC_TYPE_DELEG|SC_TYPE_OPEN|SC_TYPE_LOCK,
+				     0);
 	if (*stid)
 		status = nfs_ok;
 	else
@@ -6655,8 +7009,8 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
 	}
 
 	status = nfsd4_lookup_stateid(cstate, stateid,
-				NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID,
-				&s, nn);
+				SC_TYPE_DELEG|SC_TYPE_OPEN|SC_TYPE_LOCK,
+				0, &s, nn);
 	if (status == nfserr_bad_stateid)
 		status = find_cpntf_state(nn, stateid, &s);
 	if (status)
@@ -6667,16 +7021,13 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
 		goto out;
 
 	switch (s->sc_type) {
-	case NFS4_DELEG_STID:
+	case SC_TYPE_DELEG:
 		status = nfs4_check_delegmode(delegstateid(s), flags);
 		break;
-	case NFS4_OPEN_STID:
-	case NFS4_LOCK_STID:
+	case SC_TYPE_OPEN:
+	case SC_TYPE_LOCK:
 		status = nfs4_check_olstateid(openlockstateid(s), flags);
 		break;
-	default:
-		status = nfserr_bad_stateid;
-		break;
 	}
 	if (status)
 		goto out;
@@ -6755,34 +7106,39 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	spin_lock(&cl->cl_lock);
 	s = find_stateid_locked(cl, stateid);
-	if (!s)
+	if (!s || s->sc_status & SC_STATUS_CLOSED)
 		goto out_unlock;
+	if (s->sc_status & SC_STATUS_ADMIN_REVOKED) {
+		nfsd4_drop_revoked_stid(s);
+		ret = nfs_ok;
+		goto out;
+	}
 	spin_lock(&s->sc_lock);
 	switch (s->sc_type) {
-	case NFS4_DELEG_STID:
+	case SC_TYPE_DELEG:
+		if (s->sc_status & SC_STATUS_REVOKED) {
+			spin_unlock(&s->sc_lock);
+			dp = delegstateid(s);
+			list_del_init(&dp->dl_recall_lru);
+			spin_unlock(&cl->cl_lock);
+			nfs4_put_stid(s);
+			ret = nfs_ok;
+			goto out;
+		}
 		ret = nfserr_locks_held;
 		break;
-	case NFS4_OPEN_STID:
+	case SC_TYPE_OPEN:
 		ret = check_stateid_generation(stateid, &s->sc_stateid, 1);
 		if (ret)
 			break;
 		ret = nfserr_locks_held;
 		break;
-	case NFS4_LOCK_STID:
+	case SC_TYPE_LOCK:
 		spin_unlock(&s->sc_lock);
 		refcount_inc(&s->sc_count);
 		spin_unlock(&cl->cl_lock);
 		ret = nfsd4_free_lock_stateid(stateid, s);
 		goto out;
-	case NFS4_REVOKED_DELEG_STID:
-		spin_unlock(&s->sc_lock);
-		dp = delegstateid(s);
-		list_del_init(&dp->dl_recall_lru);
-		spin_unlock(&cl->cl_lock);
-		nfs4_put_stid(s);
-		ret = nfs_ok;
-		goto out;
-	/* Default falls through and returns nfserr_bad_stateid */
 	}
 	spin_unlock(&s->sc_lock);
 out_unlock:
@@ -6824,6 +7180,7 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_
  * @seqid: seqid (provided by client)
  * @stateid: stateid (provided by client)
  * @typemask: mask of allowable types for this operation
+ * @statusmask: mask of allowed states: 0 or STID_CLOSED
  * @stpp: return pointer for the stateid found
  * @nn: net namespace for request
  *
@@ -6833,7 +7190,8 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_
  */
 static __be32
 nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
-			 stateid_t *stateid, char typemask,
+			 stateid_t *stateid,
+			 unsigned short typemask, unsigned short statusmask,
 			 struct nfs4_ol_stateid **stpp,
 			 struct nfsd_net *nn)
 {
@@ -6844,7 +7202,8 @@ nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
 	trace_nfsd_preprocess(seqid, stateid);
 
 	*stpp = NULL;
-	status = nfsd4_lookup_stateid(cstate, stateid, typemask, &s, nn);
+	status = nfsd4_lookup_stateid(cstate, stateid,
+				      typemask, statusmask, &s, nn);
 	if (status)
 		return status;
 	stp = openlockstateid(s);
@@ -6866,7 +7225,7 @@ static __be32 nfs4_preprocess_confirmed_seqid_op(struct nfsd4_compound_state *cs
 	struct nfs4_ol_stateid *stp;
 
 	status = nfs4_preprocess_seqid_op(cstate, seqid, stateid,
-						NFS4_OPEN_STID, &stp, nn);
+					  SC_TYPE_OPEN, 0, &stp, nn);
 	if (status)
 		return status;
 	oo = openowner(stp->st_stateowner);
@@ -6897,8 +7256,8 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		return status;
 
 	status = nfs4_preprocess_seqid_op(cstate,
-					oc->oc_seqid, &oc->oc_req_stateid,
-					NFS4_OPEN_STID, &stp, nn);
+					  oc->oc_seqid, &oc->oc_req_stateid,
+					  SC_TYPE_OPEN, 0, &stp, nn);
 	if (status)
 		goto out;
 	oo = openowner(stp->st_stateowner);
@@ -7028,18 +7387,20 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct net *net = SVC_NET(rqstp);
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
-	dprintk("NFSD: nfsd4_close on file %pd\n", 
+	dprintk("NFSD: nfsd4_close on file %pd\n",
 			cstate->current_fh.fh_dentry);
 
 	status = nfs4_preprocess_seqid_op(cstate, close->cl_seqid,
-					&close->cl_stateid,
-					NFS4_OPEN_STID|NFS4_CLOSED_STID,
-					&stp, nn);
+					  &close->cl_stateid,
+					  SC_TYPE_OPEN, SC_STATUS_CLOSED,
+					  &stp, nn);
 	nfsd4_bump_seqid(cstate, status);
 	if (status)
-		goto out; 
+		goto out;
 
-	stp->st_stid.sc_type = NFS4_CLOSED_STID;
+	spin_lock(&stp->st_stid.sc_client->cl_lock);
+	stp->st_stid.sc_status |= SC_STATUS_CLOSED;
+	spin_unlock(&stp->st_stid.sc_client->cl_lock);
 
 	/*
 	 * Technically we don't _really_ have to increment or copy it, since
@@ -7081,7 +7442,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
 		return status;
 
-	status = nfsd4_lookup_stateid(cstate, stateid, NFS4_DELEG_STID, &s, nn);
+	status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, 0, &s, nn);
 	if (status)
 		goto out;
 	dp = delegstateid(s);
@@ -7148,7 +7509,7 @@ nfsd4_lm_put_owner(fl_owner_t owner)
 static bool
 nfsd4_lm_lock_expirable(struct file_lock *cfl)
 {
-	struct nfs4_lockowner *lo = (struct nfs4_lockowner *)cfl->fl_owner;
+	struct nfs4_lockowner *lo = (struct nfs4_lockowner *) cfl->c.flc_owner;
 	struct nfs4_client *clp = lo->lo_owner.so_client;
 	struct nfsd_net *nn;
 
@@ -7170,7 +7531,7 @@ nfsd4_lm_expire_lock(void)
 static void
 nfsd4_lm_notify(struct file_lock *fl)
 {
-	struct nfs4_lockowner		*lo = (struct nfs4_lockowner *)fl->fl_owner;
+	struct nfs4_lockowner		*lo = (struct nfs4_lockowner *) fl->c.flc_owner;
 	struct net			*net = lo->lo_owner.so_client->net;
 	struct nfsd_net			*nn = net_generic(net, nfsd_net_id);
 	struct nfsd4_blocked_lock	*nbl = container_of(fl,
@@ -7207,7 +7568,7 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny)
 	struct nfs4_lockowner *lo;
 
 	if (fl->fl_lmops == &nfsd_posix_mng_ops) {
-		lo = (struct nfs4_lockowner *) fl->fl_owner;
+		lo = (struct nfs4_lockowner *) fl->c.flc_owner;
 		xdr_netobj_dup(&deny->ld_owner, &lo->lo_owner.so_owner,
 						GFP_KERNEL);
 		if (!deny->ld_owner.data)
@@ -7226,7 +7587,7 @@ nevermind:
 	if (fl->fl_end != NFS4_MAX_UINT64)
 		deny->ld_length = fl->fl_end - fl->fl_start + 1;        
 	deny->ld_type = NFS4_READ_LT;
-	if (fl->fl_type != F_RDLCK)
+	if (fl->c.flc_type != F_RDLCK)
 		deny->ld_type = NFS4_WRITE_LT;
 }
 
@@ -7348,7 +7709,7 @@ retry:
 	if (retstp)
 		goto out_found;
 	refcount_inc(&stp->st_stid.sc_count);
-	stp->st_stid.sc_type = NFS4_LOCK_STID;
+	stp->st_stid.sc_type = SC_TYPE_LOCK;
 	stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner);
 	get_nfs4_file(fp);
 	stp->st_stid.sc_file = fp;
@@ -7492,8 +7853,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	int lkflg;
 	int err;
 	bool new = false;
-	unsigned char fl_type;
-	unsigned int fl_flags = FL_POSIX;
+	unsigned char type;
+	unsigned int flags = FL_POSIX;
 	struct net *net = SVC_NET(rqstp);
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
@@ -7535,9 +7896,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 							&lock_stp, &new);
 	} else {
 		status = nfs4_preprocess_seqid_op(cstate,
-				       lock->lk_old_lock_seqid,
-				       &lock->lk_old_lock_stateid,
-				       NFS4_LOCK_STID, &lock_stp, nn);
+						  lock->lk_old_lock_seqid,
+						  &lock->lk_old_lock_stateid,
+						  SC_TYPE_LOCK, 0, &lock_stp,
+						  nn);
 	}
 	if (status)
 		goto out;
@@ -7556,14 +7918,14 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		goto out;
 
 	if (lock->lk_reclaim)
-		fl_flags |= FL_RECLAIM;
+		flags |= FL_RECLAIM;
 
 	fp = lock_stp->st_stid.sc_file;
 	switch (lock->lk_type) {
 		case NFS4_READW_LT:
 			if (nfsd4_has_session(cstate) ||
 			    exportfs_lock_op_is_async(sb->s_export_op))
-				fl_flags |= FL_SLEEP;
+				flags |= FL_SLEEP;
 			fallthrough;
 		case NFS4_READ_LT:
 			spin_lock(&fp->fi_lock);
@@ -7571,12 +7933,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 			if (nf)
 				get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ);
 			spin_unlock(&fp->fi_lock);
-			fl_type = F_RDLCK;
+			type = F_RDLCK;
 			break;
 		case NFS4_WRITEW_LT:
 			if (nfsd4_has_session(cstate) ||
 			    exportfs_lock_op_is_async(sb->s_export_op))
-				fl_flags |= FL_SLEEP;
+				flags |= FL_SLEEP;
 			fallthrough;
 		case NFS4_WRITE_LT:
 			spin_lock(&fp->fi_lock);
@@ -7584,7 +7946,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 			if (nf)
 				get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE);
 			spin_unlock(&fp->fi_lock);
-			fl_type = F_WRLCK;
+			type = F_WRLCK;
 			break;
 		default:
 			status = nfserr_inval;
@@ -7604,7 +7966,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	 * on those filesystems:
 	 */
 	if (!exportfs_lock_op_is_async(sb->s_export_op))
-		fl_flags &= ~FL_SLEEP;
+		flags &= ~FL_SLEEP;
 
 	nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn);
 	if (!nbl) {
@@ -7614,11 +7976,11 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	}
 
 	file_lock = &nbl->nbl_lock;
-	file_lock->fl_type = fl_type;
-	file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner));
-	file_lock->fl_pid = current->tgid;
-	file_lock->fl_file = nf->nf_file;
-	file_lock->fl_flags = fl_flags;
+	file_lock->c.flc_type = type;
+	file_lock->c.flc_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner));
+	file_lock->c.flc_pid = current->tgid;
+	file_lock->c.flc_file = nf->nf_file;
+	file_lock->c.flc_flags = flags;
 	file_lock->fl_lmops = &nfsd_posix_mng_ops;
 	file_lock->fl_start = lock->lk_offset;
 	file_lock->fl_end = last_byte_offset(lock->lk_offset, lock->lk_length);
@@ -7631,7 +7993,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		goto out;
 	}
 
-	if (fl_flags & FL_SLEEP) {
+	if (flags & FL_SLEEP) {
 		nbl->nbl_time = ktime_get_boottime_seconds();
 		spin_lock(&nn->blocked_locks_lock);
 		list_add_tail(&nbl->nbl_list, &lock_sop->lo_blocked);
@@ -7668,7 +8030,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 out:
 	if (nbl) {
 		/* dequeue it if we queued it before */
-		if (fl_flags & FL_SLEEP) {
+		if (flags & FL_SLEEP) {
 			spin_lock(&nn->blocked_locks_lock);
 			if (!list_empty(&nbl->nbl_list) &&
 			    !list_empty(&nbl->nbl_lru)) {
@@ -7736,9 +8098,9 @@ static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct
 	err = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ));
 	if (err)
 		goto out;
-	lock->fl_file = nf->nf_file;
+	lock->c.flc_file = nf->nf_file;
 	err = nfserrno(vfs_test_lock(nf->nf_file, lock));
-	lock->fl_file = NULL;
+	lock->c.flc_file = NULL;
 out:
 	inode_unlock(inode);
 	nfsd_file_put(nf);
@@ -7783,11 +8145,11 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	switch (lockt->lt_type) {
 		case NFS4_READ_LT:
 		case NFS4_READW_LT:
-			file_lock->fl_type = F_RDLCK;
+			file_lock->c.flc_type = F_RDLCK;
 			break;
 		case NFS4_WRITE_LT:
 		case NFS4_WRITEW_LT:
-			file_lock->fl_type = F_WRLCK;
+			file_lock->c.flc_type = F_WRLCK;
 			break;
 		default:
 			dprintk("NFSD: nfs4_lockt: bad lock type!\n");
@@ -7797,9 +8159,9 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	lo = find_lockowner_str(cstate->clp, &lockt->lt_owner);
 	if (lo)
-		file_lock->fl_owner = (fl_owner_t)lo;
-	file_lock->fl_pid = current->tgid;
-	file_lock->fl_flags = FL_POSIX;
+		file_lock->c.flc_owner = (fl_owner_t)lo;
+	file_lock->c.flc_pid = current->tgid;
+	file_lock->c.flc_flags = FL_POSIX;
 
 	file_lock->fl_start = lockt->lt_offset;
 	file_lock->fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length);
@@ -7810,7 +8172,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (status)
 		goto out;
 
-	if (file_lock->fl_type != F_UNLCK) {
+	if (file_lock->c.flc_type != F_UNLCK) {
 		status = nfserr_denied;
 		nfs4_set_lock_denied(file_lock, &lockt->lt_denied);
 	}
@@ -7850,8 +8212,8 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		 return nfserr_inval;
 
 	status = nfs4_preprocess_seqid_op(cstate, locku->lu_seqid,
-					&locku->lu_stateid, NFS4_LOCK_STID,
-					&stp, nn);
+					  &locku->lu_stateid, SC_TYPE_LOCK, 0,
+					  &stp, nn);
 	if (status)
 		goto out;
 	nf = find_any_file(stp->st_stid.sc_file);
@@ -7866,11 +8228,11 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		goto put_file;
 	}
 
-	file_lock->fl_type = F_UNLCK;
-	file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(stp->st_stateowner));
-	file_lock->fl_pid = current->tgid;
-	file_lock->fl_file = nf->nf_file;
-	file_lock->fl_flags = FL_POSIX;
+	file_lock->c.flc_type = F_UNLCK;
+	file_lock->c.flc_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(stp->st_stateowner));
+	file_lock->c.flc_pid = current->tgid;
+	file_lock->c.flc_file = nf->nf_file;
+	file_lock->c.flc_flags = FL_POSIX;
 	file_lock->fl_lmops = &nfsd_posix_mng_ops;
 	file_lock->fl_start = locku->lu_offset;
 
@@ -7927,8 +8289,8 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
 
 	if (flctx && !list_empty_careful(&flctx->flc_posix)) {
 		spin_lock(&flctx->flc_lock);
-		list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
-			if (fl->fl_owner == (fl_owner_t)lowner) {
+		for_each_file_lock(fl, &flctx->flc_posix) {
+			if (fl->c.flc_owner == (fl_owner_t)lowner) {
 				status = true;
 				break;
 			}
@@ -7996,7 +8358,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
 		stp = list_first_entry(&lo->lo_owner.so_stateids,
 				       struct nfs4_ol_stateid,
 				       st_perstateowner);
-		WARN_ON(!unhash_lock_stateid(stp));
+		unhash_lock_stateid(stp);
 		put_ol_stateid_locked(stp, &reaplist);
 	}
 	spin_unlock(&clp->cl_lock);
@@ -8289,7 +8651,7 @@ nfs4_state_shutdown_net(struct net *net)
 	spin_lock(&state_lock);
 	list_for_each_safe(pos, next, &nn->del_recall_lru) {
 		dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru);
-		WARN_ON(!unhash_delegation_locked(dp));
+		unhash_delegation_locked(dp, SC_STATUS_CLOSED);
 		list_add(&dp->dl_recall_lru, &reaplist);
 	}
 	spin_unlock(&state_lock);
@@ -8431,6 +8793,8 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate,
  * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict
  * @rqstp: RPC transaction context
  * @inode: file to be checked for a conflict
+ * @modified: return true if file was modified
+ * @size: new size of file if modified is true
  *
  * This function is called when there is a conflict between a write
  * delegation and a change/size GETATTR from another client. The server
@@ -8439,27 +8803,30 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate,
  * delegation before replying to the GETATTR. See RFC 8881 section
  * 18.7.4.
  *
- * The current implementation does not support CB_GETATTR yet. However
- * this can avoid recalling the delegation could be added in follow up
- * work.
- *
  * Returns 0 if there is no conflict; otherwise an nfs_stat
  * code is returned.
  */
 __be32
-nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode)
+nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode,
+				bool *modified, u64 *size)
 {
 	__be32 status;
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 	struct file_lock_context *ctx;
-	struct file_lock *fl;
+	struct file_lease *fl;
 	struct nfs4_delegation *dp;
+	struct iattr attrs;
+	struct nfs4_cb_fattr *ncf;
 
+	*modified = false;
 	ctx = locks_inode_context(inode);
 	if (!ctx)
 		return 0;
 	spin_lock(&ctx->flc_lock);
-	list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
-		if (fl->fl_flags == FL_LAYOUT)
+	for_each_file_lock(fl, &ctx->flc_lease) {
+		unsigned char type = fl->c.flc_type;
+
+		if (fl->c.flc_flags == FL_LAYOUT)
 			continue;
 		if (fl->fl_lmops != &nfsd_lease_mng_ops) {
 			/*
@@ -8467,23 +8834,49 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode)
 			 * we are done; there isn't any write delegation
 			 * on this inode
 			 */
-			if (fl->fl_type == F_RDLCK)
+			if (type == F_RDLCK)
 				break;
 			goto break_lease;
 		}
-		if (fl->fl_type == F_WRLCK) {
-			dp = fl->fl_owner;
+		if (type == F_WRLCK) {
+			dp = fl->c.flc_owner;
 			if (dp->dl_recall.cb_clp == *(rqstp->rq_lease_breaker)) {
 				spin_unlock(&ctx->flc_lock);
 				return 0;
 			}
 break_lease:
+			nfsd_stats_wdeleg_getattr_inc(nn);
+			dp = fl->c.flc_owner;
+			ncf = &dp->dl_cb_fattr;
+			nfs4_cb_getattr(&dp->dl_cb_fattr);
 			spin_unlock(&ctx->flc_lock);
-			nfsd_stats_wdeleg_getattr_inc();
-			status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ));
-			if (status != nfserr_jukebox ||
-					!nfsd_wait_for_delegreturn(rqstp, inode))
-				return status;
+			wait_on_bit_timeout(&ncf->ncf_cb_flags, CB_GETATTR_BUSY,
+					TASK_INTERRUPTIBLE, NFSD_CB_GETATTR_TIMEOUT);
+			if (ncf->ncf_cb_status) {
+				/* Recall delegation only if client didn't respond */
+				status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ));
+				if (status != nfserr_jukebox ||
+						!nfsd_wait_for_delegreturn(rqstp, inode))
+					return status;
+			}
+			if (!ncf->ncf_file_modified &&
+					(ncf->ncf_initial_cinfo != ncf->ncf_cb_change ||
+					ncf->ncf_cur_fsize != ncf->ncf_cb_fsize))
+				ncf->ncf_file_modified = true;
+			if (ncf->ncf_file_modified) {
+				/*
+				 * Per section 10.4.3 of RFC 8881, the server would
+				 * not update the file's metadata with the client's
+				 * modified size
+				 */
+				attrs.ia_mtime = attrs.ia_ctime = current_time(inode);
+				attrs.ia_valid = ATTR_MTIME | ATTR_CTIME;
+				setattr_copy(&nop_mnt_idmap, inode, &attrs);
+				mark_inode_dirty(inode);
+				ncf->ncf_cur_fsize = ncf->ncf_cb_fsize;
+				*size = ncf->ncf_cur_fsize;
+				*modified = true;
+			}
 			return 0;
 		}
 		break;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c719c475a068..1955481832e0 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3490,11 +3490,13 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 		    struct dentry *dentry, const u32 *bmval,
 		    int ignore_crossmnt)
 {
+	DECLARE_BITMAP(attr_bitmap, ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops));
 	struct nfsd4_fattr_args args;
 	struct svc_fh *tempfh = NULL;
 	int starting_len = xdr->buf->len;
 	__be32 *attrlen_p, status;
 	int attrlen_offset;
+	u32 attrmask[3];
 	int err;
 	struct nfsd4_compoundres *resp = rqstp->rq_resp;
 	u32 minorversion = resp->cstate.minorversion;
@@ -3502,11 +3504,9 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 		.mnt	= exp->ex_path.mnt,
 		.dentry	= dentry,
 	};
-	union {
-		u32		attrmask[3];
-		unsigned long	mask[2];
-	} u;
 	unsigned long bit;
+	bool file_modified = false;
+	u64 size = 0;
 
 	WARN_ON_ONCE(bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1);
 	WARN_ON_ONCE(!nfsd_attrs_supported(minorversion, bmval));
@@ -3519,21 +3519,21 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 	/*
 	 * Make a local copy of the attribute bitmap that can be modified.
 	 */
-	memset(&u, 0, sizeof(u));
-	u.attrmask[0] = bmval[0];
-	u.attrmask[1] = bmval[1];
-	u.attrmask[2] = bmval[2];
+	attrmask[0] = bmval[0];
+	attrmask[1] = bmval[1];
+	attrmask[2] = bmval[2];
 
 	args.rdattr_err = 0;
 	if (exp->ex_fslocs.migrated) {
-		status = fattr_handle_absent_fs(&u.attrmask[0], &u.attrmask[1],
-						&u.attrmask[2], &args.rdattr_err);
+		status = fattr_handle_absent_fs(&attrmask[0], &attrmask[1],
+						&attrmask[2], &args.rdattr_err);
 		if (status)
 			goto out;
 	}
 	args.size = 0;
-	if (u.attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) {
-		status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry));
+	if (attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) {
+		status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry),
+					&file_modified, &size);
 		if (status)
 			goto out;
 	}
@@ -3543,20 +3543,23 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 			  AT_STATX_SYNC_AS_STAT);
 	if (err)
 		goto out_nfserr;
-	args.size = args.stat.size;
+	if (file_modified)
+		args.size = size;
+	else
+		args.size = args.stat.size;
 
 	if (!(args.stat.result_mask & STATX_BTIME))
 		/* underlying FS does not offer btime so we can't share it */
-		u.attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE;
-	if ((u.attrmask[0] & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE |
+		attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE;
+	if ((attrmask[0] & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE |
 			FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) ||
-	    (u.attrmask[1] & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
+	    (attrmask[1] & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE |
 		       FATTR4_WORD1_SPACE_TOTAL))) {
 		err = vfs_statfs(&path, &args.statfs);
 		if (err)
 			goto out_nfserr;
 	}
-	if ((u.attrmask[0] & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) &&
+	if ((attrmask[0] & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) &&
 	    !fhp) {
 		tempfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL);
 		status = nfserr_jukebox;
@@ -3571,10 +3574,10 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 		args.fhp = fhp;
 
 	args.acl = NULL;
-	if (u.attrmask[0] & FATTR4_WORD0_ACL) {
+	if (attrmask[0] & FATTR4_WORD0_ACL) {
 		err = nfsd4_get_nfs4_acl(rqstp, dentry, &args.acl);
 		if (err == -EOPNOTSUPP)
-			u.attrmask[0] &= ~FATTR4_WORD0_ACL;
+			attrmask[0] &= ~FATTR4_WORD0_ACL;
 		else if (err == -EINVAL) {
 			status = nfserr_attrnotsupp;
 			goto out;
@@ -3586,17 +3589,17 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 
 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL
 	args.context = NULL;
-	if ((u.attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) ||
-	     u.attrmask[0] & FATTR4_WORD0_SUPPORTED_ATTRS) {
+	if ((attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) ||
+	     attrmask[0] & FATTR4_WORD0_SUPPORTED_ATTRS) {
 		if (exp->ex_flags & NFSEXP_SECURITY_LABEL)
 			err = security_inode_getsecctx(d_inode(dentry),
 						&args.context, &args.contextlen);
 		else
 			err = -EOPNOTSUPP;
 		args.contextsupport = (err == 0);
-		if (u.attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) {
+		if (attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) {
 			if (err == -EOPNOTSUPP)
-				u.attrmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
+				attrmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
 			else if (err)
 				goto out_nfserr;
 		}
@@ -3604,8 +3607,8 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 #endif /* CONFIG_NFSD_V4_SECURITY_LABEL */
 
 	/* attrmask */
-	status = nfsd4_encode_bitmap4(xdr, u.attrmask[0],
-				      u.attrmask[1], u.attrmask[2]);
+	status = nfsd4_encode_bitmap4(xdr, attrmask[0], attrmask[1],
+				      attrmask[2]);
 	if (status)
 		goto out;
 
@@ -3614,7 +3617,9 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 	attrlen_p = xdr_reserve_space(xdr, XDR_UNIT);
 	if (!attrlen_p)
 		goto out_resource;
-	for_each_set_bit(bit, (const unsigned long *)&u.mask,
+	bitmap_from_arr32(attr_bitmap, attrmask,
+			  ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops));
+	for_each_set_bit(bit, attr_bitmap,
 			 ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops)) {
 		status = nfsd4_enc_fattr4_encode_ops[bit](xdr, &args);
 		if (status != nfs_ok)
@@ -5386,16 +5391,11 @@ nfsd4_listxattr_validate_cookie(struct nfsd4_listxattrs *listxattrs,
 
 	/*
 	 * If the cookie is larger than the maximum number we can fit
-	 * in either the buffer we just got back from vfs_listxattr, or,
-	 * XDR-encoded, in the return buffer, it's invalid.
+	 * in the buffer we just got back from vfs_listxattr, it's invalid.
 	 */
 	if (cookie > (listxattrs->lsxa_len) / (XATTR_USER_PREFIX_LEN + 2))
 		return nfserr_badcookie;
 
-	if (cookie > (listxattrs->lsxa_maxcount /
-		      (XDR_QUADLEN(XATTR_USER_PREFIX_LEN + 2) + 4)))
-		return nfserr_badcookie;
-
 	*offsetp = (u32)cookie;
 	return 0;
 }
@@ -5412,6 +5412,7 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr,
 	u64 cookie;
 	char *sp;
 	__be32 status, tmp;
+	__be64 wire_cookie;
 	__be32 *p;
 	u32 nuser;
 
@@ -5427,7 +5428,7 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr,
 	 */
 	cookie_offset = xdr->buf->len;
 	count_offset = cookie_offset + 8;
-	p = xdr_reserve_space(xdr, 12);
+	p = xdr_reserve_space(xdr, XDR_UNIT * 3);
 	if (!p) {
 		status = nfserr_resource;
 		goto out;
@@ -5438,7 +5439,8 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr,
 	sp = listxattrs->lsxa_buf;
 	nuser = 0;
 
-	xdrleft = listxattrs->lsxa_maxcount;
+	/* Bytes left is maxcount - 8 (cookie) - 4 (array count) */
+	xdrleft = listxattrs->lsxa_maxcount - XDR_UNIT * 3;
 
 	while (left > 0 && xdrleft > 0) {
 		slen = strlen(sp);
@@ -5451,7 +5453,8 @@ nfsd4_encode_listxattrs(struct nfsd4_compoundres *resp, __be32 nfserr,
 
 		slen -= XATTR_USER_PREFIX_LEN;
 		xdrlen = 4 + ((slen + 3) & ~3);
-		if (xdrlen > xdrleft) {
+		/* Check if both entry and eof can fit in the XDR buffer */
+		if (xdrlen + XDR_UNIT > xdrleft) {
 			if (count == 0) {
 				/*
 				 * Can't even fit the first attribute name.
@@ -5503,7 +5506,8 @@ wreof:
 
 	cookie = offset + count;
 
-	write_bytes_to_xdr_buf(xdr->buf, cookie_offset, &cookie, 8);
+	wire_cookie = cpu_to_be64(cookie);
+	write_bytes_to_xdr_buf(xdr->buf, cookie_offset, &wire_cookie, 8);
 	tmp = cpu_to_be32(count);
 	write_bytes_to_xdr_buf(xdr->buf, count_offset, &tmp, 4);
 out:
@@ -5727,27 +5731,24 @@ release:
 	rqstp->rq_next_page = xdr->page_ptr + 1;
 }
 
-/* 
- * Encode the reply stored in the stateowner reply cache 
- * 
- * XDR note: do not encode rp->rp_buflen: the buffer contains the
- * previously sent already encoded operation.
+/**
+ * nfsd4_encode_replay - encode a result stored in the stateowner reply cache
+ * @xdr: send buffer's XDR stream
+ * @op: operation being replayed
+ *
+ * @op->replay->rp_buf contains the previously-sent already-encoded result.
  */
-void
-nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op)
+void nfsd4_encode_replay(struct xdr_stream *xdr, struct nfsd4_op *op)
 {
-	__be32 *p;
 	struct nfs4_replay *rp = op->replay;
 
-	p = xdr_reserve_space(xdr, 8 + rp->rp_buflen);
-	if (!p) {
-		WARN_ON_ONCE(1);
-		return;
-	}
-	*p++ = cpu_to_be32(op->opnum);
-	*p++ = rp->rp_status;  /* already xdr'ed */
+	trace_nfsd_stateowner_replay(op->opnum, rp);
 
-	p = xdr_encode_opaque_fixed(p, rp->rp_buf, rp->rp_buflen);
+	if (xdr_stream_encode_u32(xdr, op->opnum) != XDR_UNIT)
+		return;
+	if (xdr_stream_encode_be32(xdr, rp->rp_status) != XDR_UNIT)
+		return;
+	xdr_stream_encode_opaque_fixed(xdr, rp->rp_buf, rp->rp_buflen);
 }
 
 void nfsd4_release_compoundargs(struct svc_rqst *rqstp)
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 5c1a4a0aa605..ba9d326b3de6 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -166,8 +166,7 @@ nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct nfsd_cacherep *rp,
 
 int nfsd_drc_slab_create(void)
 {
-	drc_slab = kmem_cache_create("nfsd_drc",
-				sizeof(struct nfsd_cacherep), 0, 0, NULL);
+	drc_slab = KMEM_CACHE(nfsd_cacherep, 0);
 	return drc_slab ? 0: -ENOMEM;
 }
 
@@ -176,27 +175,6 @@ void nfsd_drc_slab_free(void)
 	kmem_cache_destroy(drc_slab);
 }
 
-/**
- * nfsd_net_reply_cache_init - per net namespace reply cache set-up
- * @nn: nfsd_net being initialized
- *
- * Returns zero on succes; otherwise a negative errno is returned.
- */
-int nfsd_net_reply_cache_init(struct nfsd_net *nn)
-{
-	return nfsd_percpu_counters_init(nn->counter, NFSD_NET_COUNTERS_NUM);
-}
-
-/**
- * nfsd_net_reply_cache_destroy - per net namespace reply cache tear-down
- * @nn: nfsd_net being freed
- *
- */
-void nfsd_net_reply_cache_destroy(struct nfsd_net *nn)
-{
-	nfsd_percpu_counters_destroy(nn->counter, NFSD_NET_COUNTERS_NUM);
-}
-
 int nfsd_reply_cache_init(struct nfsd_net *nn)
 {
 	unsigned int hashsize;
@@ -501,7 +479,7 @@ out:
 int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
 		      unsigned int len, struct nfsd_cacherep **cacherep)
 {
-	struct nfsd_net		*nn;
+	struct nfsd_net		*nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 	struct nfsd_cacherep	*rp, *found;
 	__wsum			csum;
 	struct nfsd_drc_bucket	*b;
@@ -510,7 +488,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
 	int rtn = RC_DOIT;
 
 	if (type == RC_NOCACHE) {
-		nfsd_stats_rc_nocache_inc();
+		nfsd_stats_rc_nocache_inc(nn);
 		goto out;
 	}
 
@@ -520,7 +498,6 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
 	 * Since the common case is a cache miss followed by an insert,
 	 * preallocate an entry.
 	 */
-	nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 	rp = nfsd_cacherep_alloc(rqstp, csum, nn);
 	if (!rp)
 		goto out;
@@ -537,7 +514,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
 
 	nfsd_cacherep_dispose(&dispose);
 
-	nfsd_stats_rc_misses_inc();
+	nfsd_stats_rc_misses_inc(nn);
 	atomic_inc(&nn->num_drc_entries);
 	nfsd_stats_drc_mem_usage_add(nn, sizeof(*rp));
 	goto out;
@@ -545,7 +522,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
 found_entry:
 	/* We found a matching entry which is either in progress or done. */
 	nfsd_reply_cache_free_locked(NULL, rp, nn);
-	nfsd_stats_rc_hits_inc();
+	nfsd_stats_rc_hits_inc(nn);
 	rtn = RC_DROPIT;
 	rp = found;
 
@@ -687,15 +664,15 @@ int nfsd_reply_cache_stats_show(struct seq_file *m, void *v)
 		   atomic_read(&nn->num_drc_entries));
 	seq_printf(m, "hash buckets:          %u\n", 1 << nn->maskbits);
 	seq_printf(m, "mem usage:             %lld\n",
-		   percpu_counter_sum_positive(&nn->counter[NFSD_NET_DRC_MEM_USAGE]));
+		   percpu_counter_sum_positive(&nn->counter[NFSD_STATS_DRC_MEM_USAGE]));
 	seq_printf(m, "cache hits:            %lld\n",
-		   percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS]));
+		   percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_HITS]));
 	seq_printf(m, "cache misses:          %lld\n",
-		   percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_MISSES]));
+		   percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_MISSES]));
 	seq_printf(m, "not cached:            %lld\n",
-		   percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]));
+		   percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_NOCACHE]));
 	seq_printf(m, "payload misses:        %lld\n",
-		   percpu_counter_sum_positive(&nn->counter[NFSD_NET_PAYLOAD_MISSES]));
+		   percpu_counter_sum_positive(&nn->counter[NFSD_STATS_PAYLOAD_MISSES]));
 	seq_printf(m, "longest chain len:     %u\n", nn->longest_chain);
 	seq_printf(m, "cachesize at longest:  %u\n", nn->longest_chain_cachesize);
 	return 0;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index f206ca32e7f5..ecd18bffeebc 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -281,6 +281,7 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
 	 * 3.  Is that directory the root of an exported file system?
 	 */
 	error = nlmsvc_unlock_all_by_sb(path.dentry->d_sb);
+	nfsd4_revoke_states(netns(file), path.dentry->d_sb);
 
 	path_put(&path);
 	return error;
@@ -1671,14 +1672,17 @@ static __net_init int nfsd_net_init(struct net *net)
 	retval = nfsd_idmap_init(net);
 	if (retval)
 		goto out_idmap_error;
-	retval = nfsd_net_reply_cache_init(nn);
+	retval = nfsd_stat_counters_init(nn);
 	if (retval)
 		goto out_repcache_error;
+	memset(&nn->nfsd_svcstats, 0, sizeof(nn->nfsd_svcstats));
+	nn->nfsd_svcstats.program = &nfsd_program;
 	nn->nfsd_versions = NULL;
 	nn->nfsd4_minorversions = NULL;
 	nfsd4_init_leases_net(nn);
 	get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key));
 	seqlock_init(&nn->writeverf_lock);
+	nfsd_proc_stat_init(net);
 
 	return 0;
 
@@ -1699,7 +1703,8 @@ static __net_exit void nfsd_net_exit(struct net *net)
 {
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
-	nfsd_net_reply_cache_destroy(nn);
+	nfsd_proc_stat_shutdown(net);
+	nfsd_stat_counters_destroy(nn);
 	nfsd_idmap_shutdown(net);
 	nfsd_export_shutdown(net);
 	nfsd_netns_free_versions(nn);
@@ -1722,12 +1727,9 @@ static int __init init_nfsd(void)
 	retval = nfsd4_init_pnfs();
 	if (retval)
 		goto out_free_slabs;
-	retval = nfsd_stat_init();	/* Statistics */
-	if (retval)
-		goto out_free_pnfs;
 	retval = nfsd_drc_slab_create();
 	if (retval)
-		goto out_free_stat;
+		goto out_free_pnfs;
 	nfsd_lockd_init();	/* lockd->nfsd callbacks */
 	retval = create_proc_exports_entry();
 	if (retval)
@@ -1761,8 +1763,6 @@ out_free_exports:
 out_free_lockd:
 	nfsd_lockd_shutdown();
 	nfsd_drc_slab_free();
-out_free_stat:
-	nfsd_stat_shutdown();
 out_free_pnfs:
 	nfsd4_exit_pnfs();
 out_free_slabs:
@@ -1780,7 +1780,6 @@ static void __exit exit_nfsd(void)
 	nfsd_drc_slab_free();
 	remove_proc_entry("fs/nfs/exports", NULL);
 	remove_proc_entry("fs/nfs", NULL);
-	nfsd_stat_shutdown();
 	nfsd_lockd_shutdown();
 	nfsd4_free_slabs();
 	nfsd4_exit_pnfs();
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 304e9728b929..16c5a05f340e 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -86,6 +86,7 @@ extern struct mutex		nfsd_mutex;
 extern spinlock_t		nfsd_drc_lock;
 extern unsigned long		nfsd_drc_max_mem;
 extern unsigned long		nfsd_drc_mem_used;
+extern atomic_t			nfsd_th_cnt;		/* number of available threads */
 
 extern const struct seq_operations nfs_exports_op;
 
@@ -274,6 +275,7 @@ void		nfsd_lockd_shutdown(void);
 #define	nfserr_no_grace		cpu_to_be32(NFSERR_NO_GRACE)
 #define	nfserr_reclaim_bad	cpu_to_be32(NFSERR_RECLAIM_BAD)
 #define	nfserr_badname		cpu_to_be32(NFSERR_BADNAME)
+#define	nfserr_admin_revoked	cpu_to_be32(NFS4ERR_ADMIN_REVOKED)
 #define	nfserr_cb_path_down	cpu_to_be32(NFSERR_CB_PATH_DOWN)
 #define	nfserr_locked		cpu_to_be32(NFSERR_LOCKED)
 #define	nfserr_wrongsec		cpu_to_be32(NFSERR_WRONGSEC)
@@ -365,6 +367,7 @@ void		nfsd_lockd_shutdown(void);
 #define	NFSD_CLIENT_MAX_TRIM_PER_RUN	128
 #define	NFS4_CLIENTS_PER_GB		1024
 #define NFSD_DELEGRETURN_TIMEOUT	(HZ / 34)	/* 30ms */
+#define	NFSD_CB_GETATTR_TIMEOUT		NFSD_DELEGRETURN_TIMEOUT
 
 /*
  * The following attributes are currently not supported by the NFSv4 server:
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index dbfa0ac13564..40fecf7b224f 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -327,6 +327,7 @@ out:
 __be32
 fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
 {
+	struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
 	struct svc_export *exp = NULL;
 	struct dentry	*dentry;
 	__be32		error;
@@ -395,7 +396,7 @@ skip_pseudoflavor_check:
 out:
 	trace_nfsd_fh_verify_err(rqstp, fhp, type, access, error);
 	if (error == nfserr_stale)
-		nfsd_stats_fh_stale_inc(exp);
+		nfsd_stats_fh_stale_inc(nn, exp);
 	return error;
 }
 
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index a7315928a760..36370b957b63 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -103,7 +103,7 @@ nfsd_proc_setattr(struct svc_rqst *rqstp)
 		}
 	}
 
-	resp->status = nfsd_setattr(rqstp, fhp, &attrs, 0, (time64_t)0);
+	resp->status = nfsd_setattr(rqstp, fhp, &attrs, NULL);
 	if (resp->status != nfs_ok)
 		goto out;
 
@@ -390,8 +390,8 @@ nfsd_proc_create(struct svc_rqst *rqstp)
 		 */
 		attr->ia_valid &= ATTR_SIZE;
 		if (attr->ia_valid)
-			resp->status = nfsd_setattr(rqstp, newfhp, &attrs, 0,
-						    (time64_t)0);
+			resp->status = nfsd_setattr(rqstp, newfhp, &attrs,
+						    NULL);
 	}
 
 out_unlock:
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index a667802e08e7..c0d17b92b249 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -34,6 +34,7 @@
 
 #define NFSDDBG_FACILITY	NFSDDBG_SVC
 
+atomic_t			nfsd_th_cnt = ATOMIC_INIT(0);
 extern struct svc_program	nfsd_program;
 static int			nfsd(void *vrqstp);
 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
@@ -80,7 +81,6 @@ unsigned long	nfsd_drc_max_mem;
 unsigned long	nfsd_drc_mem_used;
 
 #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL)
-static struct svc_stat	nfsd_acl_svcstats;
 static const struct svc_version *nfsd_acl_version[] = {
 # if defined(CONFIG_NFSD_V2_ACL)
 	[2] = &nfsd_acl_version2,
@@ -99,15 +99,11 @@ static struct svc_program	nfsd_acl_program = {
 	.pg_vers		= nfsd_acl_version,
 	.pg_name		= "nfsacl",
 	.pg_class		= "nfsd",
-	.pg_stats		= &nfsd_acl_svcstats,
 	.pg_authenticate	= &svc_set_client,
 	.pg_init_request	= nfsd_acl_init_request,
 	.pg_rpcbind_set		= nfsd_acl_rpcbind_set,
 };
 
-static struct svc_stat	nfsd_acl_svcstats = {
-	.program	= &nfsd_acl_program,
-};
 #endif /* defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) */
 
 static const struct svc_version *nfsd_version[] = {
@@ -132,7 +128,6 @@ struct svc_program		nfsd_program = {
 	.pg_vers		= nfsd_version,		/* version table */
 	.pg_name		= "nfsd",		/* program name */
 	.pg_class		= "nfsd",		/* authentication class */
-	.pg_stats		= &nfsd_svcstats,	/* version table */
 	.pg_authenticate	= &svc_set_client,	/* export authentication */
 	.pg_init_request	= nfsd_init_request,
 	.pg_rpcbind_set		= nfsd_rpcbind_set,
@@ -666,7 +661,8 @@ int nfsd_create_serv(struct net *net)
 	if (nfsd_max_blksize == 0)
 		nfsd_max_blksize = nfsd_get_default_max_blksize();
 	nfsd_reset_versions(nn);
-	serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, nfsd);
+	serv = svc_create_pooled(&nfsd_program, &nn->nfsd_svcstats,
+				 nfsd_max_blksize, nfsd);
 	if (serv == NULL)
 		return -ENOMEM;
 
@@ -929,7 +925,7 @@ nfsd(void *vrqstp)
 
 	current->fs->umask = 0;
 
-	atomic_inc(&nfsdstats.th_cnt);
+	atomic_inc(&nfsd_th_cnt);
 
 	set_freezable();
 
@@ -941,9 +937,11 @@ nfsd(void *vrqstp)
 		rqstp->rq_server->sv_maxconn = nn->max_connections;
 
 		svc_recv(rqstp);
+
+		nfsd_file_net_dispose(nn);
 	}
 
-	atomic_dec(&nfsdstats.th_cnt);
+	atomic_dec(&nfsd_th_cnt);
 
 out:
 	/* Release the thread */
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
index de1e0dfed06a..925817f66917 100644
--- a/fs/nfsd/pnfs.h
+++ b/fs/nfsd/pnfs.h
@@ -37,7 +37,8 @@ struct nfsd4_layout_ops {
 	__be32 (*proc_layoutcommit)(struct inode *inode,
 			struct nfsd4_layoutcommit *lcp);
 
-	void (*fence_client)(struct nfs4_layout_stateid *ls);
+	void (*fence_client)(struct nfs4_layout_stateid *ls,
+			     struct nfsd_file *file);
 };
 
 extern const struct nfsd4_layout_ops *nfsd4_layout_ops[];
@@ -72,11 +73,13 @@ void nfsd4_setup_layout_type(struct svc_export *exp);
 void nfsd4_return_all_client_layouts(struct nfs4_client *);
 void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
 		struct nfs4_file *fp);
+void nfsd4_close_layout(struct nfs4_layout_stateid *ls);
 int nfsd4_init_pnfs(void);
 void nfsd4_exit_pnfs(void);
 #else
 struct nfs4_client;
 struct nfs4_file;
+struct nfs4_layout_stateid;
 
 static inline void nfsd4_setup_layout_type(struct svc_export *exp)
 {
@@ -89,6 +92,9 @@ static inline void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
 		struct nfs4_file *fp)
 {
 }
+static inline void nfsd4_close_layout(struct nfs4_layout_stateid *ls)
+{
+}
 static inline void nfsd4_exit_pnfs(void)
 {
 }
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 41bdc913fa71..01c6f3445646 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -68,7 +68,7 @@ struct nfsd4_callback {
 	struct nfs4_client *cb_clp;
 	struct rpc_message cb_msg;
 	const struct nfsd4_callback_ops *cb_ops;
-	struct work_struct cb_work;
+	struct delayed_work cb_work;
 	int cb_seq_status;
 	int cb_status;
 	bool cb_need_restart;
@@ -88,17 +88,34 @@ struct nfsd4_callback_ops {
  */
 struct nfs4_stid {
 	refcount_t		sc_count;
-#define NFS4_OPEN_STID 1
-#define NFS4_LOCK_STID 2
-#define NFS4_DELEG_STID 4
-/* For an open stateid kept around *only* to process close replays: */
-#define NFS4_CLOSED_STID 8
+
+	/* A new stateid is added to the cl_stateids idr early before it
+	 * is fully initialised.  Its sc_type is then zero.  After
+	 * initialisation the sc_type it set under cl_lock, and then
+	 * never changes.
+	 */
+#define SC_TYPE_OPEN		BIT(0)
+#define SC_TYPE_LOCK		BIT(1)
+#define SC_TYPE_DELEG		BIT(2)
+#define SC_TYPE_LAYOUT		BIT(3)
+	unsigned short		sc_type;
+
+/* state_lock protects sc_status for delegation stateids.
+ * ->cl_lock protects sc_status for open and lock stateids.
+ * ->st_mutex also protect sc_status for open stateids.
+ * ->ls_lock protects sc_status for layout stateids.
+ */
+/*
+ * For an open stateid kept around *only* to process close replays.
+ * For deleg stateid, kept in idr until last reference is dropped.
+ */
+#define SC_STATUS_CLOSED	BIT(0)
 /* For a deleg stateid kept around only to process free_stateid's: */
-#define NFS4_REVOKED_DELEG_STID 16
-#define NFS4_CLOSED_DELEG_STID 32
-#define NFS4_LAYOUT_STID 64
+#define SC_STATUS_REVOKED	BIT(1)
+#define SC_STATUS_ADMIN_REVOKED	BIT(2)
+	unsigned short		sc_status;
+
 	struct list_head	sc_cp_list;
-	unsigned char		sc_type;
 	stateid_t		sc_stateid;
 	spinlock_t		sc_lock;
 	struct nfs4_client	*sc_client;
@@ -117,6 +134,24 @@ struct nfs4_cpntf_state {
 	time64_t		cpntf_time;	/* last time stateid used */
 };
 
+struct nfs4_cb_fattr {
+	struct nfsd4_callback ncf_getattr;
+	u32 ncf_cb_status;
+	u32 ncf_cb_bmap[1];
+
+	/* from CB_GETATTR reply */
+	u64 ncf_cb_change;
+	u64 ncf_cb_fsize;
+
+	unsigned long ncf_cb_flags;
+	bool ncf_file_modified;
+	u64 ncf_initial_cinfo;
+	u64 ncf_cur_fsize;
+};
+
+/* bits for ncf_cb_flags */
+#define	CB_GETATTR_BUSY		0
+
 /*
  * Represents a delegation stateid. The nfs4_client holds references to these
  * and they are put when it is being destroyed or when the delegation is
@@ -150,6 +185,9 @@ struct nfs4_delegation {
 	int			dl_retries;
 	struct nfsd4_callback	dl_recall;
 	bool			dl_recalled;
+
+	/* for CB_GETATTR */
+	struct nfs4_cb_fattr    dl_cb_fattr;
 };
 
 #define cb_to_delegation(cb) \
@@ -317,8 +355,9 @@ enum {
  * 0. If they are not renewed within a lease period, they become eligible for
  * destruction by the laundromat.
  *
- * These objects can also be destroyed prematurely by the fault injection code,
- * or if the client sends certain forms of SETCLIENTID or EXCHANGE_ID updates.
+ * These objects can also be destroyed if the client sends certain forms of
+ * SETCLIENTID or EXCHANGE_ID operations.
+ *
  * Care is taken *not* to do this however when the objects have an elevated
  * refcount.
  *
@@ -326,7 +365,7 @@ enum {
  *
  * o Each nfs4_clients is also hashed by name (the opaque quantity initially
  *   sent by the client to identify itself).
- * 	  
+ *
  * o cl_perclient list is used to ensure no dangling stateowner references
  *   when we expire the nfs4_client
  */
@@ -351,6 +390,7 @@ struct nfs4_client {
 	clientid_t		cl_clientid;	/* generated by server */
 	nfs4_verifier		cl_confirm;	/* generated by server */
 	u32			cl_minorversion;
+	atomic_t		cl_admin_revoked; /* count of admin-revoked states */
 	/* NFSv4.1 client implementation id: */
 	struct xdr_netobj	cl_nii_domain;
 	struct xdr_netobj	cl_nii_name;
@@ -640,6 +680,7 @@ enum nfsd4_cb_op {
 	NFSPROC4_CLNT_CB_SEQUENCE,
 	NFSPROC4_CLNT_CB_NOTIFY_LOCK,
 	NFSPROC4_CLNT_CB_RECALL_ANY,
+	NFSPROC4_CLNT_CB_GETATTR,
 };
 
 /* Returns true iff a is later than b: */
@@ -672,15 +713,15 @@ extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
 		stateid_t *stateid, int flags, struct nfsd_file **filp,
 		struct nfs4_stid **cstid);
 __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
-		     stateid_t *stateid, unsigned char typemask,
-		     struct nfs4_stid **s, struct nfsd_net *nn);
+			    stateid_t *stateid, unsigned short typemask,
+			    unsigned short statusmask,
+			    struct nfs4_stid **s, struct nfsd_net *nn);
 struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab,
 				  void (*sc_free)(struct nfs4_stid *));
 int nfs4_init_copy_state(struct nfsd_net *nn, struct nfsd4_copy *copy);
 void nfs4_free_copy_state(struct nfsd4_copy *copy);
 struct nfs4_cpntf_state *nfs4_alloc_init_cpntf_state(struct nfsd_net *nn,
 			struct nfs4_stid *p_stid);
-void nfs4_unhash_stid(struct nfs4_stid *s);
 void nfs4_put_stid(struct nfs4_stid *s);
 void nfs4_inc_and_copy_stateid(stateid_t *dst, struct nfs4_stid *stid);
 void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
@@ -714,6 +755,14 @@ static inline void get_nfs4_file(struct nfs4_file *fi)
 }
 struct nfsd_file *find_any_file(struct nfs4_file *f);
 
+#ifdef CONFIG_NFSD_V4
+void nfsd4_revoke_states(struct net *net, struct super_block *sb);
+#else
+static inline void nfsd4_revoke_states(struct net *net, struct super_block *sb)
+{
+}
+#endif
+
 /* grace period management */
 void nfsd4_end_grace(struct nfsd_net *nn);
 
@@ -732,5 +781,5 @@ static inline bool try_to_expire_client(struct nfs4_client *clp)
 }
 
 extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp,
-				struct inode *inode);
+		struct inode *inode, bool *file_modified, u64 *size);
 #endif   /* NFSD4_STATE_H */
diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c
index 12d79f5d4eb1..be52fb1e928e 100644
--- a/fs/nfsd/stats.c
+++ b/fs/nfsd/stats.c
@@ -27,25 +27,22 @@
 
 #include "nfsd.h"
 
-struct nfsd_stats	nfsdstats;
-struct svc_stat		nfsd_svcstats = {
-	.program	= &nfsd_program,
-};
-
 static int nfsd_show(struct seq_file *seq, void *v)
 {
+	struct net *net = pde_data(file_inode(seq->file));
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 	int i;
 
 	seq_printf(seq, "rc %lld %lld %lld\nfh %lld 0 0 0 0\nio %lld %lld\n",
-		   percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_HITS]),
-		   percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_MISSES]),
-		   percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]),
-		   percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_FH_STALE]),
-		   percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_READ]),
-		   percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_IO_WRITE]));
+		   percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_HITS]),
+		   percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_MISSES]),
+		   percpu_counter_sum_positive(&nn->counter[NFSD_STATS_RC_NOCACHE]),
+		   percpu_counter_sum_positive(&nn->counter[NFSD_STATS_FH_STALE]),
+		   percpu_counter_sum_positive(&nn->counter[NFSD_STATS_IO_READ]),
+		   percpu_counter_sum_positive(&nn->counter[NFSD_STATS_IO_WRITE]));
 
 	/* thread usage: */
-	seq_printf(seq, "th %u 0", atomic_read(&nfsdstats.th_cnt));
+	seq_printf(seq, "th %u 0", atomic_read(&nfsd_th_cnt));
 
 	/* deprecated thread usage histogram stats */
 	for (i = 0; i < 10; i++)
@@ -55,7 +52,7 @@ static int nfsd_show(struct seq_file *seq, void *v)
 	seq_puts(seq, "\nra 0 0 0 0 0 0 0 0 0 0 0 0\n");
 
 	/* show my rpc info */
-	svc_seq_show(seq, &nfsd_svcstats);
+	svc_seq_show(seq, &nn->nfsd_svcstats);
 
 #ifdef CONFIG_NFSD_V4
 	/* Show count for individual nfsv4 operations */
@@ -63,10 +60,10 @@ static int nfsd_show(struct seq_file *seq, void *v)
 	seq_printf(seq, "proc4ops %u", LAST_NFS4_OP + 1);
 	for (i = 0; i <= LAST_NFS4_OP; i++) {
 		seq_printf(seq, " %lld",
-			   percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_NFS4_OP(i)]));
+			   percpu_counter_sum_positive(&nn->counter[NFSD_STATS_NFS4_OP(i)]));
 	}
 	seq_printf(seq, "\nwdeleg_getattr %lld",
-		percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_WDELEG_GETATTR]));
+		percpu_counter_sum_positive(&nn->counter[NFSD_STATS_WDELEG_GETATTR]));
 
 	seq_putc(seq, '\n');
 #endif
@@ -108,31 +105,24 @@ void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num)
 		percpu_counter_destroy(&counters[i]);
 }
 
-static int nfsd_stat_counters_init(void)
+int nfsd_stat_counters_init(struct nfsd_net *nn)
 {
-	return nfsd_percpu_counters_init(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM);
+	return nfsd_percpu_counters_init(nn->counter, NFSD_STATS_COUNTERS_NUM);
 }
 
-static void nfsd_stat_counters_destroy(void)
+void nfsd_stat_counters_destroy(struct nfsd_net *nn)
 {
-	nfsd_percpu_counters_destroy(nfsdstats.counter, NFSD_STATS_COUNTERS_NUM);
+	nfsd_percpu_counters_destroy(nn->counter, NFSD_STATS_COUNTERS_NUM);
 }
 
-int nfsd_stat_init(void)
+void nfsd_proc_stat_init(struct net *net)
 {
-	int err;
-
-	err = nfsd_stat_counters_init();
-	if (err)
-		return err;
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
-	svc_proc_register(&init_net, &nfsd_svcstats, &nfsd_proc_ops);
-
-	return 0;
+	svc_proc_register(net, &nn->nfsd_svcstats, &nfsd_proc_ops);
 }
 
-void nfsd_stat_shutdown(void)
+void nfsd_proc_stat_shutdown(struct net *net)
 {
-	nfsd_stat_counters_destroy();
-	svc_proc_unregister(&init_net, "nfsd");
+	svc_proc_unregister(net, "nfsd");
 }
diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h
index 14f50c660b61..d2753e975dfd 100644
--- a/fs/nfsd/stats.h
+++ b/fs/nfsd/stats.h
@@ -10,94 +10,72 @@
 #include <uapi/linux/nfsd/stats.h>
 #include <linux/percpu_counter.h>
 
-
-enum {
-	NFSD_STATS_RC_HITS,		/* repcache hits */
-	NFSD_STATS_RC_MISSES,		/* repcache misses */
-	NFSD_STATS_RC_NOCACHE,		/* uncached reqs */
-	NFSD_STATS_FH_STALE,		/* FH stale error */
-	NFSD_STATS_IO_READ,		/* bytes returned to read requests */
-	NFSD_STATS_IO_WRITE,		/* bytes passed in write requests */
-#ifdef CONFIG_NFSD_V4
-	NFSD_STATS_FIRST_NFS4_OP,	/* count of individual nfsv4 operations */
-	NFSD_STATS_LAST_NFS4_OP = NFSD_STATS_FIRST_NFS4_OP + LAST_NFS4_OP,
-#define NFSD_STATS_NFS4_OP(op)	(NFSD_STATS_FIRST_NFS4_OP + (op))
-	NFSD_STATS_WDELEG_GETATTR,	/* count of getattr conflict with wdeleg */
-#endif
-	NFSD_STATS_COUNTERS_NUM
-};
-
-struct nfsd_stats {
-	struct percpu_counter	counter[NFSD_STATS_COUNTERS_NUM];
-
-	atomic_t	th_cnt;		/* number of available threads */
-};
-
-extern struct nfsd_stats	nfsdstats;
-
-extern struct svc_stat		nfsd_svcstats;
-
 int nfsd_percpu_counters_init(struct percpu_counter *counters, int num);
 void nfsd_percpu_counters_reset(struct percpu_counter *counters, int num);
 void nfsd_percpu_counters_destroy(struct percpu_counter *counters, int num);
-int nfsd_stat_init(void);
-void nfsd_stat_shutdown(void);
+int nfsd_stat_counters_init(struct nfsd_net *nn);
+void nfsd_stat_counters_destroy(struct nfsd_net *nn);
+void nfsd_proc_stat_init(struct net *net);
+void nfsd_proc_stat_shutdown(struct net *net);
 
-static inline void nfsd_stats_rc_hits_inc(void)
+static inline void nfsd_stats_rc_hits_inc(struct nfsd_net *nn)
 {
-	percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_HITS]);
+	percpu_counter_inc(&nn->counter[NFSD_STATS_RC_HITS]);
 }
 
-static inline void nfsd_stats_rc_misses_inc(void)
+static inline void nfsd_stats_rc_misses_inc(struct nfsd_net *nn)
 {
-	percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_MISSES]);
+	percpu_counter_inc(&nn->counter[NFSD_STATS_RC_MISSES]);
 }
 
-static inline void nfsd_stats_rc_nocache_inc(void)
+static inline void nfsd_stats_rc_nocache_inc(struct nfsd_net *nn)
 {
-	percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_RC_NOCACHE]);
+	percpu_counter_inc(&nn->counter[NFSD_STATS_RC_NOCACHE]);
 }
 
-static inline void nfsd_stats_fh_stale_inc(struct svc_export *exp)
+static inline void nfsd_stats_fh_stale_inc(struct nfsd_net *nn,
+					   struct svc_export *exp)
 {
-	percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_FH_STALE]);
+	percpu_counter_inc(&nn->counter[NFSD_STATS_FH_STALE]);
 	if (exp && exp->ex_stats)
 		percpu_counter_inc(&exp->ex_stats->counter[EXP_STATS_FH_STALE]);
 }
 
-static inline void nfsd_stats_io_read_add(struct svc_export *exp, s64 amount)
+static inline void nfsd_stats_io_read_add(struct nfsd_net *nn,
+					  struct svc_export *exp, s64 amount)
 {
-	percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_READ], amount);
+	percpu_counter_add(&nn->counter[NFSD_STATS_IO_READ], amount);
 	if (exp && exp->ex_stats)
 		percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_READ], amount);
 }
 
-static inline void nfsd_stats_io_write_add(struct svc_export *exp, s64 amount)
+static inline void nfsd_stats_io_write_add(struct nfsd_net *nn,
+					   struct svc_export *exp, s64 amount)
 {
-	percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_WRITE], amount);
+	percpu_counter_add(&nn->counter[NFSD_STATS_IO_WRITE], amount);
 	if (exp && exp->ex_stats)
 		percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_WRITE], amount);
 }
 
 static inline void nfsd_stats_payload_misses_inc(struct nfsd_net *nn)
 {
-	percpu_counter_inc(&nn->counter[NFSD_NET_PAYLOAD_MISSES]);
+	percpu_counter_inc(&nn->counter[NFSD_STATS_PAYLOAD_MISSES]);
 }
 
 static inline void nfsd_stats_drc_mem_usage_add(struct nfsd_net *nn, s64 amount)
 {
-	percpu_counter_add(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount);
+	percpu_counter_add(&nn->counter[NFSD_STATS_DRC_MEM_USAGE], amount);
 }
 
 static inline void nfsd_stats_drc_mem_usage_sub(struct nfsd_net *nn, s64 amount)
 {
-	percpu_counter_sub(&nn->counter[NFSD_NET_DRC_MEM_USAGE], amount);
+	percpu_counter_sub(&nn->counter[NFSD_STATS_DRC_MEM_USAGE], amount);
 }
 
 #ifdef CONFIG_NFSD_V4
-static inline void nfsd_stats_wdeleg_getattr_inc(void)
+static inline void nfsd_stats_wdeleg_getattr_inc(struct nfsd_net *nn)
 {
-	percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_WDELEG_GETATTR]);
+	percpu_counter_inc(&nn->counter[NFSD_STATS_WDELEG_GETATTR]);
 }
 #endif
 #endif /* _NFSD_STATS_H */
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index d1e8cf079b0f..1cd2076210b1 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -9,8 +9,10 @@
 #define _NFSD_TRACE_H
 
 #include <linux/tracepoint.h>
+#include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/xprt.h>
 #include <trace/misc/nfs.h>
+#include <trace/misc/sunrpc.h>
 
 #include "export.h"
 #include "nfsfh.h"
@@ -102,7 +104,7 @@ TRACE_EVENT(nfsd_compound,
 	TP_fast_assign(
 		__entry->xid = be32_to_cpu(rqst->rq_xid);
 		__entry->opcnt = opcnt;
-		__assign_str_len(tag, tag, taglen);
+		__assign_str(tag, tag);
 	),
 	TP_printk("xid=0x%08x opcnt=%u tag=%s",
 		__entry->xid, __entry->opcnt, __get_str(tag)
@@ -483,7 +485,7 @@ TRACE_EVENT(nfsd_dirent,
 	TP_fast_assign(
 		__entry->fh_hash = fhp ? knfsd_fh_hash(&fhp->fh_handle) : 0;
 		__entry->ino = ino;
-		__assign_str_len(name, name, namlen)
+		__assign_str(name, name);
 	),
 	TP_printk("fh_hash=0x%08x ino=%llu name=%s",
 		__entry->fh_hash, __entry->ino, __get_str(name)
@@ -641,23 +643,18 @@ DEFINE_EVENT(nfsd_stateseqid_class, nfsd_##name, \
 DEFINE_STATESEQID_EVENT(preprocess);
 DEFINE_STATESEQID_EVENT(open_confirm);
 
-TRACE_DEFINE_ENUM(NFS4_OPEN_STID);
-TRACE_DEFINE_ENUM(NFS4_LOCK_STID);
-TRACE_DEFINE_ENUM(NFS4_DELEG_STID);
-TRACE_DEFINE_ENUM(NFS4_CLOSED_STID);
-TRACE_DEFINE_ENUM(NFS4_REVOKED_DELEG_STID);
-TRACE_DEFINE_ENUM(NFS4_CLOSED_DELEG_STID);
-TRACE_DEFINE_ENUM(NFS4_LAYOUT_STID);
-
 #define show_stid_type(x)						\
 	__print_flags(x, "|",						\
-		{ NFS4_OPEN_STID,		"OPEN" },		\
-		{ NFS4_LOCK_STID,		"LOCK" },		\
-		{ NFS4_DELEG_STID,		"DELEG" },		\
-		{ NFS4_CLOSED_STID,		"CLOSED" },		\
-		{ NFS4_REVOKED_DELEG_STID,	"REVOKED" },		\
-		{ NFS4_CLOSED_DELEG_STID,	"CLOSED_DELEG" },	\
-		{ NFS4_LAYOUT_STID,		"LAYOUT" })
+		{ SC_TYPE_OPEN,		"OPEN" },		\
+		{ SC_TYPE_LOCK,		"LOCK" },		\
+		{ SC_TYPE_DELEG,		"DELEG" },		\
+		{ SC_TYPE_LAYOUT,		"LAYOUT" })
+
+#define show_stid_status(x)						\
+	__print_flags(x, "|",						\
+		{ SC_STATUS_CLOSED,		"CLOSED" },		\
+		{ SC_STATUS_REVOKED,		"REVOKED" },		\
+		{ SC_STATUS_ADMIN_REVOKED,	"ADMIN_REVOKED" })
 
 DECLARE_EVENT_CLASS(nfsd_stid_class,
 	TP_PROTO(
@@ -666,6 +663,7 @@ DECLARE_EVENT_CLASS(nfsd_stid_class,
 	TP_ARGS(stid),
 	TP_STRUCT__entry(
 		__field(unsigned long, sc_type)
+		__field(unsigned long, sc_status)
 		__field(int, sc_count)
 		__field(u32, cl_boot)
 		__field(u32, cl_id)
@@ -676,16 +674,18 @@ DECLARE_EVENT_CLASS(nfsd_stid_class,
 		const stateid_t *stp = &stid->sc_stateid;
 
 		__entry->sc_type = stid->sc_type;
+		__entry->sc_status = stid->sc_status;
 		__entry->sc_count = refcount_read(&stid->sc_count);
 		__entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
 		__entry->cl_id = stp->si_opaque.so_clid.cl_id;
 		__entry->si_id = stp->si_opaque.so_id;
 		__entry->si_generation = stp->si_generation;
 	),
-	TP_printk("client %08x:%08x stateid %08x:%08x ref=%d type=%s",
+	TP_printk("client %08x:%08x stateid %08x:%08x ref=%d type=%s state=%s",
 		__entry->cl_boot, __entry->cl_id,
 		__entry->si_id, __entry->si_generation,
-		__entry->sc_count, show_stid_type(__entry->sc_type)
+		__entry->sc_count, show_stid_type(__entry->sc_type),
+		show_stid_status(__entry->sc_status)
 	)
 );
 
@@ -696,6 +696,59 @@ DEFINE_EVENT(nfsd_stid_class, nfsd_stid_##name,			\
 
 DEFINE_STID_EVENT(revoke);
 
+TRACE_EVENT(nfsd_stateowner_replay,
+	TP_PROTO(
+		u32 opnum,
+		const struct nfs4_replay *rp
+	),
+	TP_ARGS(opnum, rp),
+	TP_STRUCT__entry(
+		__field(unsigned long, status)
+		__field(u32, opnum)
+	),
+	TP_fast_assign(
+		__entry->status = be32_to_cpu(rp->rp_status);
+		__entry->opnum = opnum;
+	),
+	TP_printk("opnum=%u status=%lu",
+		__entry->opnum, __entry->status)
+);
+
+TRACE_EVENT_CONDITION(nfsd_seq4_status,
+	TP_PROTO(
+		const struct svc_rqst *rqstp,
+		const struct nfsd4_sequence *sequence
+	),
+	TP_ARGS(rqstp, sequence),
+	TP_CONDITION(sequence->status_flags),
+	TP_STRUCT__entry(
+		__field(unsigned int, netns_ino)
+		__field(u32, xid)
+		__field(u32, cl_boot)
+		__field(u32, cl_id)
+		__field(u32, seqno)
+		__field(u32, reserved)
+		__field(unsigned long, status_flags)
+	),
+	TP_fast_assign(
+		const struct nfsd4_sessionid *sid =
+			(struct nfsd4_sessionid *)&sequence->sessionid;
+
+		__entry->netns_ino = SVC_NET(rqstp)->ns.inum;
+		__entry->xid = be32_to_cpu(rqstp->rq_xid);
+		__entry->cl_boot = sid->clientid.cl_boot;
+		__entry->cl_id = sid->clientid.cl_id;
+		__entry->seqno = sid->sequence;
+		__entry->reserved = sid->reserved;
+		__entry->status_flags = sequence->status_flags;
+	),
+	TP_printk("xid=0x%08x sessionid=%08x:%08x:%08x:%08x status_flags=%s",
+		__entry->xid, __entry->cl_boot, __entry->cl_id,
+		__entry->seqno, __entry->reserved,
+		show_nfs4_seq4_status(__entry->status_flags)
+	)
+);
+
 DECLARE_EVENT_CLASS(nfsd_clientid_class,
 	TP_PROTO(const clientid_t *clid),
 	TP_ARGS(clid),
@@ -843,7 +896,7 @@ DECLARE_EVENT_CLASS(nfsd_clid_class,
 		__array(unsigned char, addr, sizeof(struct sockaddr_in6))
 		__field(unsigned long, flavor)
 		__array(unsigned char, verifier, NFS4_VERIFIER_SIZE)
-		__string_len(name, name, clp->cl_name.len)
+		__string_len(name, clp->cl_name.data, clp->cl_name.len)
 	),
 	TP_fast_assign(
 		__entry->cl_boot = clp->cl_clientid.cl_boot;
@@ -853,7 +906,7 @@ DECLARE_EVENT_CLASS(nfsd_clid_class,
 		__entry->flavor = clp->cl_cred.cr_flavor;
 		memcpy(__entry->verifier, (void *)&clp->cl_verifier,
 		       NFS4_VERIFIER_SIZE);
-		__assign_str_len(name, clp->cl_name.data, clp->cl_name.len);
+		__assign_str(name, clp->cl_name.data);
 	),
 	TP_printk("addr=%pISpc name='%s' verifier=0x%s flavor=%s client=%08x:%08x",
 		__entry->addr, __get_str(name),
@@ -1334,7 +1387,8 @@ DEFINE_EVENT(nfsd_cb_class, nfsd_cb_##name,		\
 	TP_PROTO(const struct nfs4_client *clp),	\
 	TP_ARGS(clp))
 
-DEFINE_NFSD_CB_EVENT(state);
+DEFINE_NFSD_CB_EVENT(start);
+DEFINE_NFSD_CB_EVENT(new_state);
 DEFINE_NFSD_CB_EVENT(probe);
 DEFINE_NFSD_CB_EVENT(lost);
 DEFINE_NFSD_CB_EVENT(shutdown);
@@ -1405,6 +1459,128 @@ TRACE_EVENT(nfsd_cb_setup_err,
 		__entry->error)
 );
 
+DECLARE_EVENT_CLASS(nfsd_cb_lifetime_class,
+	TP_PROTO(
+		const struct nfs4_client *clp,
+		const struct nfsd4_callback *cb
+	),
+	TP_ARGS(clp, cb),
+	TP_STRUCT__entry(
+		__field(u32, cl_boot)
+		__field(u32, cl_id)
+		__field(const void *, cb)
+		__field(bool, need_restart)
+		__sockaddr(addr, clp->cl_cb_conn.cb_addrlen)
+	),
+	TP_fast_assign(
+		__entry->cl_boot = clp->cl_clientid.cl_boot;
+		__entry->cl_id = clp->cl_clientid.cl_id;
+		__entry->cb = cb;
+		__entry->need_restart = cb->cb_need_restart;
+		__assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr,
+				  clp->cl_cb_conn.cb_addrlen)
+	),
+	TP_printk("addr=%pISpc client %08x:%08x cb=%p%s",
+		__get_sockaddr(addr), __entry->cl_boot, __entry->cl_id,
+		__entry->cb, __entry->need_restart ?
+			" (need restart)" : " (first try)"
+	)
+);
+
+#define DEFINE_NFSD_CB_LIFETIME_EVENT(name)		\
+DEFINE_EVENT(nfsd_cb_lifetime_class, nfsd_cb_##name,	\
+	TP_PROTO(					\
+		const struct nfs4_client *clp,		\
+		const struct nfsd4_callback *cb		\
+	),						\
+	TP_ARGS(clp, cb))
+
+DEFINE_NFSD_CB_LIFETIME_EVENT(queue);
+DEFINE_NFSD_CB_LIFETIME_EVENT(destroy);
+DEFINE_NFSD_CB_LIFETIME_EVENT(restart);
+DEFINE_NFSD_CB_LIFETIME_EVENT(bc_update);
+DEFINE_NFSD_CB_LIFETIME_EVENT(bc_shutdown);
+
+TRACE_EVENT(nfsd_cb_seq_status,
+	TP_PROTO(
+		const struct rpc_task *task,
+		const struct nfsd4_callback *cb
+	),
+	TP_ARGS(task, cb),
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(u32, cl_boot)
+		__field(u32, cl_id)
+		__field(u32, seqno)
+		__field(u32, reserved)
+		__field(int, tk_status)
+		__field(int, seq_status)
+	),
+	TP_fast_assign(
+		const struct nfs4_client *clp = cb->cb_clp;
+		const struct nfsd4_session *session = clp->cl_cb_session;
+		const struct nfsd4_sessionid *sid =
+			(struct nfsd4_sessionid *)&session->se_sessionid;
+
+		__entry->task_id = task->tk_pid;
+		__entry->client_id = task->tk_client ?
+				     task->tk_client->cl_clid : -1;
+		__entry->cl_boot = sid->clientid.cl_boot;
+		__entry->cl_id = sid->clientid.cl_id;
+		__entry->seqno = sid->sequence;
+		__entry->reserved = sid->reserved;
+		__entry->tk_status = task->tk_status;
+		__entry->seq_status = cb->cb_seq_status;
+	),
+	TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
+		" sessionid=%08x:%08x:%08x:%08x tk_status=%d seq_status=%d\n",
+		__entry->task_id, __entry->client_id,
+		__entry->cl_boot, __entry->cl_id,
+		__entry->seqno, __entry->reserved,
+		__entry->tk_status, __entry->seq_status
+	)
+);
+
+TRACE_EVENT(nfsd_cb_free_slot,
+	TP_PROTO(
+		const struct rpc_task *task,
+		const struct nfsd4_callback *cb
+	),
+	TP_ARGS(task, cb),
+	TP_STRUCT__entry(
+		__field(unsigned int, task_id)
+		__field(unsigned int, client_id)
+		__field(u32, cl_boot)
+		__field(u32, cl_id)
+		__field(u32, seqno)
+		__field(u32, reserved)
+		__field(u32, slot_seqno)
+	),
+	TP_fast_assign(
+		const struct nfs4_client *clp = cb->cb_clp;
+		const struct nfsd4_session *session = clp->cl_cb_session;
+		const struct nfsd4_sessionid *sid =
+			(struct nfsd4_sessionid *)&session->se_sessionid;
+
+		__entry->task_id = task->tk_pid;
+		__entry->client_id = task->tk_client ?
+				     task->tk_client->cl_clid : -1;
+		__entry->cl_boot = sid->clientid.cl_boot;
+		__entry->cl_id = sid->clientid.cl_id;
+		__entry->seqno = sid->sequence;
+		__entry->reserved = sid->reserved;
+		__entry->slot_seqno = session->se_cb_seq_nr;
+	),
+	TP_printk(SUNRPC_TRACE_TASK_SPECIFIER
+		" sessionid=%08x:%08x:%08x:%08x new slot seqno=%u\n",
+		__entry->task_id, __entry->client_id,
+		__entry->cl_boot, __entry->cl_id,
+		__entry->seqno, __entry->reserved,
+		__entry->slot_seqno
+	)
+);
+
 TRACE_EVENT_CONDITION(nfsd_cb_recall,
 	TP_PROTO(
 		const struct nfs4_stid *stid
@@ -1800,7 +1976,7 @@ TRACE_EVENT(nfsd_ctl_time,
 	TP_fast_assign(
 		__entry->netns_ino = net->ns.inum;
 		__entry->time = time;
-		__assign_str_len(name, name, namelen);
+		__assign_str(name, name);
 	),
 	TP_printk("file=%s time=%d\n",
 		__get_str(name), __entry->time
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index b7c7a9273ea0..2e41eb4c3cec 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -25,7 +25,6 @@
 #include <linux/posix_acl_xattr.h>
 #include <linux/xattr.h>
 #include <linux/jhash.h>
-#include <linux/ima.h>
 #include <linux/pagemap.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
@@ -476,7 +475,6 @@ static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap)
  * @rqstp: controlling RPC transaction
  * @fhp: filehandle of target
  * @attr: attributes to set
- * @check_guard: set to 1 if guardtime is a valid timestamp
  * @guardtime: do not act if ctime.tv_sec does not match this timestamp
  *
  * This call may adjust the contents of @attr (in particular, this
@@ -488,8 +486,7 @@ static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap)
  */
 __be32
 nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
-	     struct nfsd_attrs *attr,
-	     int check_guard, time64_t guardtime)
+	     struct nfsd_attrs *attr, const struct timespec64 *guardtime)
 {
 	struct dentry	*dentry;
 	struct inode	*inode;
@@ -497,7 +494,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	int		accmode = NFSD_MAY_SATTR;
 	umode_t		ftype = 0;
 	__be32		err;
-	int		host_err;
+	int		host_err = 0;
 	bool		get_write_count;
 	bool		size_change = (iap->ia_valid & ATTR_SIZE);
 	int		retries;
@@ -538,9 +535,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
 
 	nfsd_sanitize_attrs(inode, iap);
 
-	if (check_guard && guardtime != inode_get_ctime_sec(inode))
-		return nfserr_notsync;
-
 	/*
 	 * The size case is special, it changes the file in addition to the
 	 * attributes, and file systems don't expect it to be mixed with
@@ -555,6 +549,19 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	}
 
 	inode_lock(inode);
+	err = fh_fill_pre_attrs(fhp);
+	if (err)
+		goto out_unlock;
+
+	if (guardtime) {
+		struct timespec64 ctime = inode_get_ctime(inode);
+		if ((u32)guardtime->tv_sec != (u32)ctime.tv_sec ||
+		    guardtime->tv_nsec != ctime.tv_nsec) {
+			err = nfserr_notsync;
+			goto out_fill_attrs;
+		}
+	}
+
 	for (retries = 1;;) {
 		struct iattr attrs;
 
@@ -582,13 +589,23 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		attr->na_aclerr = set_posix_acl(&nop_mnt_idmap,
 						dentry, ACL_TYPE_DEFAULT,
 						attr->na_dpacl);
+out_fill_attrs:
+	/*
+	 * RFC 1813 Section 3.3.2 does not mandate that an NFS server
+	 * returns wcc_data for SETATTR. Some client implementations
+	 * depend on receiving wcc_data, however, to sort out partial
+	 * updates (eg., the client requested that size and mode be
+	 * modified, but the server changed only the file mode).
+	 */
+	fh_fill_post_attrs(fhp);
+out_unlock:
 	inode_unlock(inode);
 	if (size_change)
 		put_write_access(inode);
 out:
 	if (!host_err)
 		host_err = commit_metadata(fhp);
-	return nfserrno(host_err);
+	return err != 0 ? err : nfserrno(host_err);
 }
 
 #if defined(CONFIG_NFSD_V4)
@@ -877,7 +894,7 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
 		goto out;
 	}
 
-	host_err = ima_file_check(file, may_flags);
+	host_err = security_file_post_open(file, may_flags);
 	if (host_err) {
 		fput(file);
 		goto out;
@@ -1002,7 +1019,9 @@ static __be32 nfsd_finish_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
 			       unsigned long *count, u32 *eof, ssize_t host_err)
 {
 	if (host_err >= 0) {
-		nfsd_stats_io_read_add(fhp->fh_export, host_err);
+		struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
+
+		nfsd_stats_io_read_add(nn, fhp->fh_export, host_err);
 		*eof = nfsd_eof_on_read(file, offset, host_err, *count);
 		*count = host_err;
 		fsnotify_access(file);
@@ -1185,7 +1204,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
 		goto out_nfserr;
 	}
 	*cnt = host_err;
-	nfsd_stats_io_write_add(exp, *cnt);
+	nfsd_stats_io_write_add(nn, exp, *cnt);
 	fsnotify_modify(file);
 	host_err = filemap_check_wb_err(file->f_mapping, since);
 	if (host_err < 0)
@@ -1404,7 +1423,7 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	 * if the attributes have not changed.
 	 */
 	if (iap->ia_valid)
-		status = nfsd_setattr(rqstp, resfhp, attrs, 0, (time64_t)0);
+		status = nfsd_setattr(rqstp, resfhp, attrs, NULL);
 	else
 		status = nfserrno(commit_metadata(resfhp));
 
@@ -1833,7 +1852,7 @@ retry:
 	trap = lock_rename(tdentry, fdentry);
 	if (IS_ERR(trap)) {
 		err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev;
-		goto out;
+		goto out_want_write;
 	}
 	err = fh_fill_pre_attrs(ffhp);
 	if (err != nfs_ok)
@@ -1903,13 +1922,14 @@ retry:
 	}
 out_unlock:
 	unlock_rename(tdentry, fdentry);
+out_want_write:
 	fh_drop_write(ffhp);
 
 	/*
-	 * If the target dentry has cached open files, then we need to try to
-	 * close them prior to doing the rename. Flushing delayed fput
-	 * shouldn't be done with locks held however, so we delay it until this
-	 * point and then reattempt the whole shebang.
+	 * If the target dentry has cached open files, then we need to
+	 * try to close them prior to doing the rename.  Final fput
+	 * shouldn't be done with locks held however, so we delay it
+	 * until this point and then reattempt the whole shebang.
 	 */
 	if (close_cached) {
 		close_cached = false;
@@ -2177,11 +2197,43 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
 	if (err == nfserr_eof || err == nfserr_toosmall)
 		err = nfs_ok; /* can still be found in ->err */
 out_close:
-	fput(file);
+	nfsd_filp_close(file);
 out:
 	return err;
 }
 
+/**
+ * nfsd_filp_close: close a file synchronously
+ * @fp: the file to close
+ *
+ * nfsd_filp_close() is similar in behaviour to filp_close().
+ * The difference is that if this is the final close on the
+ * file, the that finalisation happens immediately, rather then
+ * being handed over to a work_queue, as it the case for
+ * filp_close().
+ * When a user-space process closes a file (even when using
+ * filp_close() the finalisation happens before returning to
+ * userspace, so it is effectively synchronous.  When a kernel thread
+ * uses file_close(), on the other hand, the handling is completely
+ * asynchronous.  This means that any cost imposed by that finalisation
+ * is not imposed on the nfsd thread, and nfsd could potentually
+ * close files more quickly than the work queue finalises the close,
+ * which would lead to unbounded growth in the queue.
+ *
+ * In some contexts is it not safe to synchronously wait for
+ * close finalisation (see comment for __fput_sync()), but nfsd
+ * does not match those contexts.  In partcilarly it does not, at the
+ * time that this function is called, hold and locks and no finalisation
+ * of any file, socket, or device driver would have any cause to wait
+ * for nfsd to make progress.
+ */
+void nfsd_filp_close(struct file *fp)
+{
+	get_file(fp);
+	filp_close(fp, NULL);
+	__fput_sync(fp);
+}
+
 /*
  * Get file system stats
  * N.B. After this call fhp needs an fh_put
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 702fbc4483bf..c60fdb6200fd 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -69,7 +69,7 @@ __be32		 nfsd_lookup_dentry(struct svc_rqst *, struct svc_fh *,
 				const char *, unsigned int,
 				struct svc_export **, struct dentry **);
 __be32		nfsd_setattr(struct svc_rqst *, struct svc_fh *,
-				struct nfsd_attrs *, int, time64_t);
+			     struct nfsd_attrs *, const struct timespec64 *);
 int nfsd_mountpoint(struct dentry *, struct svc_export *);
 #ifdef CONFIG_NFSD_V4
 __be32		nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *,
@@ -148,6 +148,8 @@ __be32		nfsd_statfs(struct svc_rqst *, struct svc_fh *,
 __be32		nfsd_permission(struct svc_rqst *, struct svc_export *,
 				struct dentry *, int);
 
+void		nfsd_filp_close(struct file *fp);
+
 static inline int fh_want_write(struct svc_fh *fh)
 {
 	int ret;
diff --git a/fs/nfsd/xdr3.h b/fs/nfsd/xdr3.h
index 03fe4e21306c..522067b7fd75 100644
--- a/fs/nfsd/xdr3.h
+++ b/fs/nfsd/xdr3.h
@@ -14,7 +14,7 @@ struct nfsd3_sattrargs {
 	struct svc_fh		fh;
 	struct iattr		attrs;
 	int			check_guard;
-	time64_t		guardtime;
+	struct timespec64	guardtime;
 };
 
 struct nfsd3_diropargs {
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
index 0d39af1b00a0..e8b00309c449 100644
--- a/fs/nfsd/xdr4cb.h
+++ b/fs/nfsd/xdr4cb.h
@@ -54,3 +54,21 @@
 #define NFS4_dec_cb_recall_any_sz	(cb_compound_dec_hdr_sz  +      \
 					cb_sequence_dec_sz +            \
 					op_dec_sz)
+
+/*
+ * 1: CB_GETATTR opcode (32-bit)
+ * N: file_handle
+ * 1: number of entry in attribute array (32-bit)
+ * 1: entry 0 in attribute array (32-bit)
+ */
+#define NFS4_enc_cb_getattr_sz		(cb_compound_enc_hdr_sz +       \
+					cb_sequence_enc_sz +            \
+					1 + enc_nfs4_fh_sz + 1 + 1)
+/*
+ * 4: fattr_bitmap_maxsz
+ * 1: attribute array len
+ * 2: change attr (64-bit)
+ * 2: size (64-bit)
+ */
+#define NFS4_dec_cb_getattr_sz		(cb_compound_dec_hdr_sz  +      \
+			cb_sequence_dec_sz + 4 + 1 + 2 + 2 + op_dec_sz)
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 7342de296ec3..89caef7513db 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -525,54 +525,55 @@ int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
 		ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh);
 		if (ret < 0)
 			return ret;
-		desc_kaddr = kmap(desc_bh->b_page);
+		desc_kaddr = kmap_local_page(desc_bh->b_page);
 		desc = nilfs_palloc_block_get_group_desc(
 			inode, group, desc_bh, desc_kaddr);
 		n = nilfs_palloc_rest_groups_in_desc_block(inode, group,
 							   maxgroup);
-		for (j = 0; j < n; j++, desc++, group++) {
+		for (j = 0; j < n; j++, desc++, group++, group_offset = 0) {
 			lock = nilfs_mdt_bgl_lock(inode, group);
-			if (nilfs_palloc_group_desc_nfrees(desc, lock) > 0) {
-				ret = nilfs_palloc_get_bitmap_block(
-					inode, group, 1, &bitmap_bh);
-				if (ret < 0)
-					goto out_desc;
-				bitmap_kaddr = kmap(bitmap_bh->b_page);
-				bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
-				pos = nilfs_palloc_find_available_slot(
-					bitmap, group_offset,
-					entries_per_group, lock);
-				if (pos >= 0) {
-					/* found a free entry */
-					nilfs_palloc_group_desc_add_entries(
-						desc, lock, -1);
-					req->pr_entry_nr =
-						entries_per_group * group + pos;
-					kunmap(desc_bh->b_page);
-					kunmap(bitmap_bh->b_page);
-
-					req->pr_desc_bh = desc_bh;
-					req->pr_bitmap_bh = bitmap_bh;
-					return 0;
-				}
-				kunmap(bitmap_bh->b_page);
-				brelse(bitmap_bh);
+			if (nilfs_palloc_group_desc_nfrees(desc, lock) == 0)
+				continue;
+
+			kunmap_local(desc_kaddr);
+			ret = nilfs_palloc_get_bitmap_block(inode, group, 1,
+							    &bitmap_bh);
+			if (unlikely(ret < 0)) {
+				brelse(desc_bh);
+				return ret;
 			}
 
-			group_offset = 0;
+			desc_kaddr = kmap_local_page(desc_bh->b_page);
+			desc = nilfs_palloc_block_get_group_desc(
+				inode, group, desc_bh, desc_kaddr);
+
+			bitmap_kaddr = kmap_local_page(bitmap_bh->b_page);
+			bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
+			pos = nilfs_palloc_find_available_slot(
+				bitmap, group_offset, entries_per_group, lock);
+			kunmap_local(bitmap_kaddr);
+			if (pos >= 0)
+				goto found;
+
+			brelse(bitmap_bh);
 		}
 
-		kunmap(desc_bh->b_page);
+		kunmap_local(desc_kaddr);
 		brelse(desc_bh);
 	}
 
 	/* no entries left */
 	return -ENOSPC;
 
- out_desc:
-	kunmap(desc_bh->b_page);
-	brelse(desc_bh);
-	return ret;
+found:
+	/* found a free entry */
+	nilfs_palloc_group_desc_add_entries(desc, lock, -1);
+	req->pr_entry_nr = entries_per_group * group + pos;
+	kunmap_local(desc_kaddr);
+
+	req->pr_desc_bh = desc_bh;
+	req->pr_bitmap_bh = bitmap_bh;
+	return 0;
 }
 
 /**
@@ -606,10 +607,10 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
 	spinlock_t *lock;
 
 	group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
-	desc_kaddr = kmap(req->pr_desc_bh->b_page);
+	desc_kaddr = kmap_local_page(req->pr_desc_bh->b_page);
 	desc = nilfs_palloc_block_get_group_desc(inode, group,
 						 req->pr_desc_bh, desc_kaddr);
-	bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
+	bitmap_kaddr = kmap_local_page(req->pr_bitmap_bh->b_page);
 	bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
 	lock = nilfs_mdt_bgl_lock(inode, group);
 
@@ -621,8 +622,8 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
 	else
 		nilfs_palloc_group_desc_add_entries(desc, lock, 1);
 
-	kunmap(req->pr_bitmap_bh->b_page);
-	kunmap(req->pr_desc_bh->b_page);
+	kunmap_local(bitmap_kaddr);
+	kunmap_local(desc_kaddr);
 
 	mark_buffer_dirty(req->pr_desc_bh);
 	mark_buffer_dirty(req->pr_bitmap_bh);
@@ -647,10 +648,10 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
 	spinlock_t *lock;
 
 	group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
-	desc_kaddr = kmap(req->pr_desc_bh->b_page);
+	desc_kaddr = kmap_local_page(req->pr_desc_bh->b_page);
 	desc = nilfs_palloc_block_get_group_desc(inode, group,
 						 req->pr_desc_bh, desc_kaddr);
-	bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
+	bitmap_kaddr = kmap_local_page(req->pr_bitmap_bh->b_page);
 	bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh);
 	lock = nilfs_mdt_bgl_lock(inode, group);
 
@@ -662,8 +663,8 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
 	else
 		nilfs_palloc_group_desc_add_entries(desc, lock, 1);
 
-	kunmap(req->pr_bitmap_bh->b_page);
-	kunmap(req->pr_desc_bh->b_page);
+	kunmap_local(bitmap_kaddr);
+	kunmap_local(desc_kaddr);
 
 	brelse(req->pr_bitmap_bh);
 	brelse(req->pr_desc_bh);
@@ -755,7 +756,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
 		/* Get the first entry number of the group */
 		group_min_nr = (__u64)group * epg;
 
-		bitmap_kaddr = kmap(bitmap_bh->b_page);
+		bitmap_kaddr = kmap_local_page(bitmap_bh->b_page);
 		bitmap = bitmap_kaddr + bh_offset(bitmap_bh);
 		lock = nilfs_mdt_bgl_lock(inode, group);
 
@@ -801,7 +802,7 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
 			entry_start = rounddown(group_offset, epb);
 		} while (true);
 
-		kunmap(bitmap_bh->b_page);
+		kunmap_local(bitmap_kaddr);
 		mark_buffer_dirty(bitmap_bh);
 		brelse(bitmap_bh);
 
@@ -815,11 +816,11 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
 					   inode->i_ino);
 		}
 
-		desc_kaddr = kmap_atomic(desc_bh->b_page);
+		desc_kaddr = kmap_local_page(desc_bh->b_page);
 		desc = nilfs_palloc_block_get_group_desc(
 			inode, group, desc_bh, desc_kaddr);
 		nfree = nilfs_palloc_group_desc_add_entries(desc, lock, n);
-		kunmap_atomic(desc_kaddr);
+		kunmap_local(desc_kaddr);
 		mark_buffer_dirty(desc_bh);
 		nilfs_mdt_mark_dirty(inode);
 		brelse(desc_bh);
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 7a8f166f2c8d..383f0afa2cea 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -548,13 +548,10 @@ int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
  */
 void nilfs_bmap_write(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
 {
-	down_write(&bmap->b_sem);
 	memcpy(raw_inode->i_bmap, bmap->b_u.u_data,
 	       NILFS_INODE_BMAP_SIZE * sizeof(__le64));
 	if (bmap->b_inode->i_ino == NILFS_DAT_INO)
 		bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT;
-
-	up_write(&bmap->b_sem);
 }
 
 void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 13592e82eaf6..65659fa0372e 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -724,7 +724,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree,
 		dat = nilfs_bmap_get_dat(btree);
 		ret = nilfs_dat_translate(dat, ptr, &blocknr);
 		if (ret < 0)
-			goto out;
+			goto dat_error;
 		ptr = blocknr;
 	}
 	cnt = 1;
@@ -743,7 +743,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree,
 			if (dat) {
 				ret = nilfs_dat_translate(dat, ptr2, &blocknr);
 				if (ret < 0)
-					goto out;
+					goto dat_error;
 				ptr2 = blocknr;
 			}
 			if (ptr2 != ptr + cnt || ++cnt == maxblocks)
@@ -781,6 +781,11 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree,
  out:
 	nilfs_btree_free_path(path);
 	return ret;
+
+ dat_error:
+	if (ret == -ENOENT)
+		ret = -EINVAL;  /* Notify bmap layer of metadata corruption */
+	goto out;
 }
 
 static void nilfs_btree_promote_key(struct nilfs_bmap *btree,
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 39136637f715..69a5cced1e84 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -28,7 +28,7 @@ nilfs_cpfile_get_blkoff(const struct inode *cpfile, __u64 cno)
 {
 	__u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1;
 
-	do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
+	tcno = div64_ul(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
 	return (unsigned long)tcno;
 }
 
@@ -187,35 +187,90 @@ static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile,
 }
 
 /**
- * nilfs_cpfile_get_checkpoint - get a checkpoint
- * @cpfile: inode of checkpoint file
- * @cno: checkpoint number
- * @create: create flag
- * @cpp: pointer to a checkpoint
- * @bhp: pointer to a buffer head
- *
- * Description: nilfs_cpfile_get_checkpoint() acquires the checkpoint
- * specified by @cno. A new checkpoint will be created if @cno is the current
- * checkpoint number and @create is nonzero.
- *
- * Return Value: On success, 0 is returned, and the checkpoint and the
- * buffer head of the buffer on which the checkpoint is located are stored in
- * the place pointed by @cpp and @bhp, respectively. On error, one of the
- * following negative error codes is returned.
+ * nilfs_cpfile_read_checkpoint - read a checkpoint entry in cpfile
+ * @cpfile: checkpoint file inode
+ * @cno:    number of checkpoint entry to read
+ * @root:   nilfs root object
+ * @ifile:  ifile's inode to read and attach to @root
  *
- * %-EIO - I/O error.
+ * This function imports checkpoint information from the checkpoint file and
+ * stores it to the inode file given by @ifile and the nilfs root object
+ * given by @root.
  *
- * %-ENOMEM - Insufficient amount of memory available.
+ * Return: 0 on success, or the following negative error code on failure.
+ * * %-EINVAL	- Invalid checkpoint.
+ * * %-ENOMEM	- Insufficient memory available.
+ * * %-EIO	- I/O error (including metadata corruption).
+ */
+int nilfs_cpfile_read_checkpoint(struct inode *cpfile, __u64 cno,
+				 struct nilfs_root *root, struct inode *ifile)
+{
+	struct buffer_head *cp_bh;
+	struct nilfs_checkpoint *cp;
+	void *kaddr;
+	int ret;
+
+	if (cno < 1 || cno > nilfs_mdt_cno(cpfile))
+		return -EINVAL;
+
+	down_read(&NILFS_MDT(cpfile)->mi_sem);
+	ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
+	if (unlikely(ret < 0)) {
+		if (ret == -ENOENT)
+			ret = -EINVAL;
+		goto out_sem;
+	}
+
+	kaddr = kmap_local_page(cp_bh->b_page);
+	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
+	if (nilfs_checkpoint_invalid(cp)) {
+		ret = -EINVAL;
+		goto put_cp;
+	}
+
+	ret = nilfs_read_inode_common(ifile, &cp->cp_ifile_inode);
+	if (unlikely(ret)) {
+		/*
+		 * Since this inode is on a checkpoint entry, treat errors
+		 * as metadata corruption.
+		 */
+		nilfs_err(cpfile->i_sb,
+			  "ifile inode (checkpoint number=%llu) corrupted",
+			  (unsigned long long)cno);
+		ret = -EIO;
+		goto put_cp;
+	}
+
+	/* Configure the nilfs root object */
+	atomic64_set(&root->inodes_count, le64_to_cpu(cp->cp_inodes_count));
+	atomic64_set(&root->blocks_count, le64_to_cpu(cp->cp_blocks_count));
+	root->ifile = ifile;
+
+put_cp:
+	kunmap_local(kaddr);
+	brelse(cp_bh);
+out_sem:
+	up_read(&NILFS_MDT(cpfile)->mi_sem);
+	return ret;
+}
+
+/**
+ * nilfs_cpfile_create_checkpoint - create a checkpoint entry on cpfile
+ * @cpfile: checkpoint file inode
+ * @cno:    number of checkpoint to set up
  *
- * %-ENOENT - No such checkpoint.
+ * This function creates a checkpoint with the number specified by @cno on
+ * cpfile.  If the specified checkpoint entry already exists due to a past
+ * failure, it will be reused without returning an error.
+ * In either case, the buffer of the block containing the checkpoint entry
+ * and the cpfile inode are made dirty for inclusion in the write log.
  *
- * %-EINVAL - invalid checkpoint.
+ * Return: 0 on success, or the following negative error code on failure.
+ * * %-ENOMEM	- Insufficient memory available.
+ * * %-EIO	- I/O error (including metadata corruption).
+ * * %-EROFS	- Read only filesystem
  */
-int nilfs_cpfile_get_checkpoint(struct inode *cpfile,
-				__u64 cno,
-				int create,
-				struct nilfs_checkpoint **cpp,
-				struct buffer_head **bhp)
+int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno)
 {
 	struct buffer_head *header_bh, *cp_bh;
 	struct nilfs_cpfile_header *header;
@@ -223,70 +278,128 @@ int nilfs_cpfile_get_checkpoint(struct inode *cpfile,
 	void *kaddr;
 	int ret;
 
-	if (unlikely(cno < 1 || cno > nilfs_mdt_cno(cpfile) ||
-		     (cno < nilfs_mdt_cno(cpfile) && create)))
-		return -EINVAL;
+	if (WARN_ON_ONCE(cno < 1))
+		return -EIO;
 
 	down_write(&NILFS_MDT(cpfile)->mi_sem);
-
 	ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
-	if (ret < 0)
+	if (unlikely(ret < 0)) {
+		if (ret == -ENOENT) {
+			nilfs_error(cpfile->i_sb,
+				    "checkpoint creation failed due to metadata corruption.");
+			ret = -EIO;
+		}
 		goto out_sem;
-	ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, create, &cp_bh);
-	if (ret < 0)
+	}
+	ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 1, &cp_bh);
+	if (unlikely(ret < 0))
 		goto out_header;
-	kaddr = kmap(cp_bh->b_page);
+
+	kaddr = kmap_local_page(cp_bh->b_page);
 	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
 	if (nilfs_checkpoint_invalid(cp)) {
-		if (!create) {
-			kunmap(cp_bh->b_page);
-			brelse(cp_bh);
-			ret = -ENOENT;
-			goto out_header;
-		}
 		/* a newly-created checkpoint */
 		nilfs_checkpoint_clear_invalid(cp);
 		if (!nilfs_cpfile_is_in_first(cpfile, cno))
 			nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh,
 								 kaddr, 1);
-		mark_buffer_dirty(cp_bh);
+		kunmap_local(kaddr);
 
-		kaddr = kmap_atomic(header_bh->b_page);
+		kaddr = kmap_local_page(header_bh->b_page);
 		header = nilfs_cpfile_block_get_header(cpfile, header_bh,
 						       kaddr);
 		le64_add_cpu(&header->ch_ncheckpoints, 1);
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		mark_buffer_dirty(header_bh);
-		nilfs_mdt_mark_dirty(cpfile);
+	} else {
+		kunmap_local(kaddr);
 	}
 
-	if (cpp != NULL)
-		*cpp = cp;
-	*bhp = cp_bh;
+	/* Force the buffer and the inode to become dirty */
+	mark_buffer_dirty(cp_bh);
+	brelse(cp_bh);
+	nilfs_mdt_mark_dirty(cpfile);
 
- out_header:
+out_header:
 	brelse(header_bh);
 
- out_sem:
+out_sem:
 	up_write(&NILFS_MDT(cpfile)->mi_sem);
 	return ret;
 }
 
 /**
- * nilfs_cpfile_put_checkpoint - put a checkpoint
- * @cpfile: inode of checkpoint file
- * @cno: checkpoint number
- * @bh: buffer head
+ * nilfs_cpfile_finalize_checkpoint - fill in a checkpoint entry in cpfile
+ * @cpfile: checkpoint file inode
+ * @cno:    checkpoint number
+ * @root:   nilfs root object
+ * @blkinc: number of blocks added by this checkpoint
+ * @ctime:  checkpoint creation time
+ * @minor:  minor checkpoint flag
+ *
+ * This function completes the checkpoint entry numbered by @cno in the
+ * cpfile with the data given by the arguments @root, @blkinc, @ctime, and
+ * @minor.
  *
- * Description: nilfs_cpfile_put_checkpoint() releases the checkpoint
- * specified by @cno. @bh must be the buffer head which has been returned by
- * a previous call to nilfs_cpfile_get_checkpoint() with @cno.
+ * Return: 0 on success, or the following negative error code on failure.
+ * * %-ENOMEM	- Insufficient memory available.
+ * * %-EIO	- I/O error (including metadata corruption).
  */
-void nilfs_cpfile_put_checkpoint(struct inode *cpfile, __u64 cno,
-				 struct buffer_head *bh)
+int nilfs_cpfile_finalize_checkpoint(struct inode *cpfile, __u64 cno,
+				     struct nilfs_root *root, __u64 blkinc,
+				     time64_t ctime, bool minor)
 {
-	kunmap(bh->b_page);
-	brelse(bh);
+	struct buffer_head *cp_bh;
+	struct nilfs_checkpoint *cp;
+	void *kaddr;
+	int ret;
+
+	if (WARN_ON_ONCE(cno < 1))
+		return -EIO;
+
+	down_write(&NILFS_MDT(cpfile)->mi_sem);
+	ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
+	if (unlikely(ret < 0)) {
+		if (ret == -ENOENT)
+			goto error;
+		goto out_sem;
+	}
+
+	kaddr = kmap_local_page(cp_bh->b_page);
+	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
+	if (unlikely(nilfs_checkpoint_invalid(cp))) {
+		kunmap_local(kaddr);
+		brelse(cp_bh);
+		goto error;
+	}
+
+	cp->cp_snapshot_list.ssl_next = 0;
+	cp->cp_snapshot_list.ssl_prev = 0;
+	cp->cp_inodes_count = cpu_to_le64(atomic64_read(&root->inodes_count));
+	cp->cp_blocks_count = cpu_to_le64(atomic64_read(&root->blocks_count));
+	cp->cp_nblk_inc = cpu_to_le64(blkinc);
+	cp->cp_create = cpu_to_le64(ctime);
+	cp->cp_cno = cpu_to_le64(cno);
+
+	if (minor)
+		nilfs_checkpoint_set_minor(cp);
+	else
+		nilfs_checkpoint_clear_minor(cp);
+
+	nilfs_write_inode_common(root->ifile, &cp->cp_ifile_inode);
+	nilfs_bmap_write(NILFS_I(root->ifile)->i_bmap, &cp->cp_ifile_inode);
+
+	kunmap_local(kaddr);
+	brelse(cp_bh);
+out_sem:
+	up_write(&NILFS_MDT(cpfile)->mi_sem);
+	return ret;
+
+error:
+	nilfs_error(cpfile->i_sb,
+		    "checkpoint finalization failed due to metadata corruption.");
+	ret = -EIO;
+	goto out_sem;
 }
 
 /**
@@ -347,7 +460,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
 			continue;
 		}
 
-		kaddr = kmap_atomic(cp_bh->b_page);
+		kaddr = kmap_local_page(cp_bh->b_page);
 		cp = nilfs_cpfile_block_get_checkpoint(
 			cpfile, cno, cp_bh, kaddr);
 		nicps = 0;
@@ -369,7 +482,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
 						cpfile, cp_bh, kaddr, nicps);
 				if (count == 0) {
 					/* make hole */
-					kunmap_atomic(kaddr);
+					kunmap_local(kaddr);
 					brelse(cp_bh);
 					ret =
 					  nilfs_cpfile_delete_checkpoint_block(
@@ -384,18 +497,18 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
 			}
 		}
 
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		brelse(cp_bh);
 	}
 
 	if (tnicps > 0) {
-		kaddr = kmap_atomic(header_bh->b_page);
+		kaddr = kmap_local_page(header_bh->b_page);
 		header = nilfs_cpfile_block_get_header(cpfile, header_bh,
 						       kaddr);
 		le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps);
 		mark_buffer_dirty(header_bh);
 		nilfs_mdt_mark_dirty(cpfile);
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 	}
 
 	brelse(header_bh);
@@ -447,7 +560,7 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
 		}
 		ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno);
 
-		kaddr = kmap_atomic(bh->b_page);
+		kaddr = kmap_local_page(bh->b_page);
 		cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
 		for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) {
 			if (!nilfs_checkpoint_invalid(cp)) {
@@ -457,7 +570,7 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
 				n++;
 			}
 		}
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		brelse(bh);
 	}
 
@@ -491,10 +604,10 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
 		ret = nilfs_cpfile_get_header_block(cpfile, &bh);
 		if (ret < 0)
 			goto out;
-		kaddr = kmap_atomic(bh->b_page);
+		kaddr = kmap_local_page(bh->b_page);
 		header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
 		curr = le64_to_cpu(header->ch_snapshot_list.ssl_next);
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		brelse(bh);
 		if (curr == 0) {
 			ret = 0;
@@ -512,7 +625,7 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
 			ret = 0; /* No snapshots (started from a hole block) */
 		goto out;
 	}
-	kaddr = kmap_atomic(bh->b_page);
+	kaddr = kmap_local_page(bh->b_page);
 	while (n < nci) {
 		cp = nilfs_cpfile_block_get_checkpoint(cpfile, curr, bh, kaddr);
 		curr = ~(__u64)0; /* Terminator */
@@ -528,7 +641,7 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
 
 		next_blkoff = nilfs_cpfile_get_blkoff(cpfile, next);
 		if (curr_blkoff != next_blkoff) {
-			kunmap_atomic(kaddr);
+			kunmap_local(kaddr);
 			brelse(bh);
 			ret = nilfs_cpfile_get_checkpoint_block(cpfile, next,
 								0, &bh);
@@ -536,12 +649,12 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
 				WARN_ON(ret == -ENOENT);
 				goto out;
 			}
-			kaddr = kmap_atomic(bh->b_page);
+			kaddr = kmap_local_page(bh->b_page);
 		}
 		curr = next;
 		curr_blkoff = next_blkoff;
 	}
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 	brelse(bh);
 	*cnop = curr;
 	ret = n;
@@ -650,24 +763,24 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
 	ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
 	if (ret < 0)
 		goto out_sem;
-	kaddr = kmap_atomic(cp_bh->b_page);
+	kaddr = kmap_local_page(cp_bh->b_page);
 	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
 	if (nilfs_checkpoint_invalid(cp)) {
 		ret = -ENOENT;
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		goto out_cp;
 	}
 	if (nilfs_checkpoint_snapshot(cp)) {
 		ret = 0;
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		goto out_cp;
 	}
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
 	if (ret < 0)
 		goto out_cp;
-	kaddr = kmap_atomic(header_bh->b_page);
+	kaddr = kmap_local_page(header_bh->b_page);
 	header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
 	list = &header->ch_snapshot_list;
 	curr_bh = header_bh;
@@ -679,13 +792,13 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
 		prev_blkoff = nilfs_cpfile_get_blkoff(cpfile, prev);
 		curr = prev;
 		if (curr_blkoff != prev_blkoff) {
-			kunmap_atomic(kaddr);
+			kunmap_local(kaddr);
 			brelse(curr_bh);
 			ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr,
 								0, &curr_bh);
 			if (ret < 0)
 				goto out_header;
-			kaddr = kmap_atomic(curr_bh->b_page);
+			kaddr = kmap_local_page(curr_bh->b_page);
 		}
 		curr_blkoff = prev_blkoff;
 		cp = nilfs_cpfile_block_get_checkpoint(
@@ -693,7 +806,7 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
 		list = &cp->cp_snapshot_list;
 		prev = le64_to_cpu(list->ssl_prev);
 	}
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	if (prev != 0) {
 		ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0,
@@ -705,29 +818,29 @@ static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
 		get_bh(prev_bh);
 	}
 
-	kaddr = kmap_atomic(curr_bh->b_page);
+	kaddr = kmap_local_page(curr_bh->b_page);
 	list = nilfs_cpfile_block_get_snapshot_list(
 		cpfile, curr, curr_bh, kaddr);
 	list->ssl_prev = cpu_to_le64(cno);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
-	kaddr = kmap_atomic(cp_bh->b_page);
+	kaddr = kmap_local_page(cp_bh->b_page);
 	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
 	cp->cp_snapshot_list.ssl_next = cpu_to_le64(curr);
 	cp->cp_snapshot_list.ssl_prev = cpu_to_le64(prev);
 	nilfs_checkpoint_set_snapshot(cp);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
-	kaddr = kmap_atomic(prev_bh->b_page);
+	kaddr = kmap_local_page(prev_bh->b_page);
 	list = nilfs_cpfile_block_get_snapshot_list(
 		cpfile, prev, prev_bh, kaddr);
 	list->ssl_next = cpu_to_le64(cno);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
-	kaddr = kmap_atomic(header_bh->b_page);
+	kaddr = kmap_local_page(header_bh->b_page);
 	header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
 	le64_add_cpu(&header->ch_nsnapshots, 1);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	mark_buffer_dirty(prev_bh);
 	mark_buffer_dirty(curr_bh);
@@ -768,23 +881,23 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno)
 	ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
 	if (ret < 0)
 		goto out_sem;
-	kaddr = kmap_atomic(cp_bh->b_page);
+	kaddr = kmap_local_page(cp_bh->b_page);
 	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
 	if (nilfs_checkpoint_invalid(cp)) {
 		ret = -ENOENT;
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		goto out_cp;
 	}
 	if (!nilfs_checkpoint_snapshot(cp)) {
 		ret = 0;
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		goto out_cp;
 	}
 
 	list = &cp->cp_snapshot_list;
 	next = le64_to_cpu(list->ssl_next);
 	prev = le64_to_cpu(list->ssl_prev);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
 	if (ret < 0)
@@ -808,29 +921,29 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno)
 		get_bh(prev_bh);
 	}
 
-	kaddr = kmap_atomic(next_bh->b_page);
+	kaddr = kmap_local_page(next_bh->b_page);
 	list = nilfs_cpfile_block_get_snapshot_list(
 		cpfile, next, next_bh, kaddr);
 	list->ssl_prev = cpu_to_le64(prev);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
-	kaddr = kmap_atomic(prev_bh->b_page);
+	kaddr = kmap_local_page(prev_bh->b_page);
 	list = nilfs_cpfile_block_get_snapshot_list(
 		cpfile, prev, prev_bh, kaddr);
 	list->ssl_next = cpu_to_le64(next);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
-	kaddr = kmap_atomic(cp_bh->b_page);
+	kaddr = kmap_local_page(cp_bh->b_page);
 	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
 	cp->cp_snapshot_list.ssl_next = cpu_to_le64(0);
 	cp->cp_snapshot_list.ssl_prev = cpu_to_le64(0);
 	nilfs_checkpoint_clear_snapshot(cp);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
-	kaddr = kmap_atomic(header_bh->b_page);
+	kaddr = kmap_local_page(header_bh->b_page);
 	header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
 	le64_add_cpu(&header->ch_nsnapshots, -1);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	mark_buffer_dirty(next_bh);
 	mark_buffer_dirty(prev_bh);
@@ -889,13 +1002,13 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
 	ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
 	if (ret < 0)
 		goto out;
-	kaddr = kmap_atomic(bh->b_page);
+	kaddr = kmap_local_page(bh->b_page);
 	cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
 	if (nilfs_checkpoint_invalid(cp))
 		ret = -ENOENT;
 	else
 		ret = nilfs_checkpoint_snapshot(cp);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 	brelse(bh);
 
  out:
@@ -972,12 +1085,12 @@ int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat)
 	ret = nilfs_cpfile_get_header_block(cpfile, &bh);
 	if (ret < 0)
 		goto out_sem;
-	kaddr = kmap_atomic(bh->b_page);
+	kaddr = kmap_local_page(bh->b_page);
 	header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
 	cpstat->cs_cno = nilfs_mdt_cno(cpfile);
 	cpstat->cs_ncps = le64_to_cpu(header->ch_ncheckpoints);
 	cpstat->cs_nsss = le64_to_cpu(header->ch_nsnapshots);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 	brelse(bh);
 
  out_sem:
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
index edabb2dc5756..f5b1d59289eb 100644
--- a/fs/nilfs2/cpfile.h
+++ b/fs/nilfs2/cpfile.h
@@ -16,10 +16,12 @@
 #include <linux/nilfs2_ondisk.h>	/* nilfs_inode, nilfs_checkpoint */
 
 
-int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int,
-				struct nilfs_checkpoint **,
-				struct buffer_head **);
-void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *);
+int nilfs_cpfile_read_checkpoint(struct inode *cpfile, __u64 cno,
+				 struct nilfs_root *root, struct inode *ifile);
+int nilfs_cpfile_create_checkpoint(struct inode *cpfile, __u64 cno);
+int nilfs_cpfile_finalize_checkpoint(struct inode *cpfile, __u64 cno,
+				     struct nilfs_root *root, __u64 blkinc,
+				     time64_t ctime, bool minor);
 int nilfs_cpfile_delete_checkpoints(struct inode *, __u64, __u64);
 int nilfs_cpfile_delete_checkpoint(struct inode *, __u64);
 int nilfs_cpfile_change_cpmode(struct inode *, __u64, int);
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 9cf6ba58f585..180fc8d36213 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -91,13 +91,13 @@ void nilfs_dat_commit_alloc(struct inode *dat, struct nilfs_palloc_req *req)
 	struct nilfs_dat_entry *entry;
 	void *kaddr;
 
-	kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+	kaddr = kmap_local_page(req->pr_entry_bh->b_page);
 	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
 					     req->pr_entry_bh, kaddr);
 	entry->de_start = cpu_to_le64(NILFS_CNO_MIN);
 	entry->de_end = cpu_to_le64(NILFS_CNO_MAX);
 	entry->de_blocknr = cpu_to_le64(0);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	nilfs_palloc_commit_alloc_entry(dat, req);
 	nilfs_dat_commit_entry(dat, req);
@@ -115,13 +115,13 @@ static void nilfs_dat_commit_free(struct inode *dat,
 	struct nilfs_dat_entry *entry;
 	void *kaddr;
 
-	kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+	kaddr = kmap_local_page(req->pr_entry_bh->b_page);
 	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
 					     req->pr_entry_bh, kaddr);
 	entry->de_start = cpu_to_le64(NILFS_CNO_MIN);
 	entry->de_end = cpu_to_le64(NILFS_CNO_MIN);
 	entry->de_blocknr = cpu_to_le64(0);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	nilfs_dat_commit_entry(dat, req);
 
@@ -145,12 +145,12 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
 	struct nilfs_dat_entry *entry;
 	void *kaddr;
 
-	kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+	kaddr = kmap_local_page(req->pr_entry_bh->b_page);
 	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
 					     req->pr_entry_bh, kaddr);
 	entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat));
 	entry->de_blocknr = cpu_to_le64(blocknr);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	nilfs_dat_commit_entry(dat, req);
 }
@@ -167,12 +167,12 @@ int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
 	if (ret < 0)
 		return ret;
 
-	kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+	kaddr = kmap_local_page(req->pr_entry_bh->b_page);
 	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
 					     req->pr_entry_bh, kaddr);
 	start = le64_to_cpu(entry->de_start);
 	blocknr = le64_to_cpu(entry->de_blocknr);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	if (blocknr == 0) {
 		ret = nilfs_palloc_prepare_free_entry(dat, req);
@@ -202,7 +202,7 @@ void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req,
 	sector_t blocknr;
 	void *kaddr;
 
-	kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+	kaddr = kmap_local_page(req->pr_entry_bh->b_page);
 	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
 					     req->pr_entry_bh, kaddr);
 	end = start = le64_to_cpu(entry->de_start);
@@ -212,7 +212,7 @@ void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req,
 	}
 	entry->de_end = cpu_to_le64(end);
 	blocknr = le64_to_cpu(entry->de_blocknr);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	if (blocknr == 0)
 		nilfs_dat_commit_free(dat, req);
@@ -227,12 +227,12 @@ void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req)
 	sector_t blocknr;
 	void *kaddr;
 
-	kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+	kaddr = kmap_local_page(req->pr_entry_bh->b_page);
 	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
 					     req->pr_entry_bh, kaddr);
 	start = le64_to_cpu(entry->de_start);
 	blocknr = le64_to_cpu(entry->de_blocknr);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	if (start == nilfs_mdt_cno(dat) && blocknr == 0)
 		nilfs_palloc_abort_free_entry(dat, req);
@@ -362,7 +362,7 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
 		}
 	}
 
-	kaddr = kmap_atomic(entry_bh->b_page);
+	kaddr = kmap_local_page(entry_bh->b_page);
 	entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
 	if (unlikely(entry->de_blocknr == cpu_to_le64(0))) {
 		nilfs_crit(dat->i_sb,
@@ -370,13 +370,13 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
 			   __func__, (unsigned long long)vblocknr,
 			   (unsigned long long)le64_to_cpu(entry->de_start),
 			   (unsigned long long)le64_to_cpu(entry->de_end));
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		brelse(entry_bh);
 		return -EINVAL;
 	}
 	WARN_ON(blocknr == 0);
 	entry->de_blocknr = cpu_to_le64(blocknr);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	mark_buffer_dirty(entry_bh);
 	nilfs_mdt_mark_dirty(dat);
@@ -426,7 +426,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
 		}
 	}
 
-	kaddr = kmap_atomic(entry_bh->b_page);
+	kaddr = kmap_local_page(entry_bh->b_page);
 	entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
 	blocknr = le64_to_cpu(entry->de_blocknr);
 	if (blocknr == 0) {
@@ -436,7 +436,7 @@ int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
 	*blocknrp = blocknr;
 
  out:
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 	brelse(entry_bh);
 	return ret;
 }
@@ -457,10 +457,10 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned int visz,
 						   0, &entry_bh);
 		if (ret < 0)
 			return ret;
-		kaddr = kmap_atomic(entry_bh->b_page);
+		kaddr = kmap_local_page(entry_bh->b_page);
 		/* last virtual block number in this block */
 		first = vinfo->vi_vblocknr;
-		do_div(first, entries_per_block);
+		first = div64_ul(first, entries_per_block);
 		first *= entries_per_block;
 		last = first + entries_per_block - 1;
 		for (j = i, n = 0;
@@ -473,7 +473,7 @@ ssize_t nilfs_dat_get_vinfo(struct inode *dat, void *buf, unsigned int visz,
 			vinfo->vi_end = le64_to_cpu(entry->de_end);
 			vinfo->vi_blocknr = le64_to_cpu(entry->de_blocknr);
 		}
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		brelse(entry_bh);
 	}
 
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 4c85914f2abc..893ab36824cc 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -66,7 +66,7 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct,
 		dat = nilfs_bmap_get_dat(direct);
 		ret = nilfs_dat_translate(dat, ptr, &blocknr);
 		if (ret < 0)
-			return ret;
+			goto dat_error;
 		ptr = blocknr;
 	}
 
@@ -79,7 +79,7 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct,
 		if (dat) {
 			ret = nilfs_dat_translate(dat, ptr2, &blocknr);
 			if (ret < 0)
-				return ret;
+				goto dat_error;
 			ptr2 = blocknr;
 		}
 		if (ptr2 != ptr + cnt)
@@ -87,6 +87,11 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct,
 	}
 	*ptrp = ptr;
 	return cnt;
+
+ dat_error:
+	if (ret == -ENOENT)
+		ret = -EINVAL;  /* Notify bmap layer of metadata corruption */
+	return ret;
 }
 
 static __u64
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index a8a4bc8490b4..612e609158b5 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -15,6 +15,7 @@
 #include "mdt.h"
 #include "alloc.h"
 #include "ifile.h"
+#include "cpfile.h"
 
 /**
  * struct nilfs_ifile_info - on-memory private data of ifile
@@ -115,11 +116,11 @@ int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino)
 		return ret;
 	}
 
-	kaddr = kmap_atomic(req.pr_entry_bh->b_page);
+	kaddr = kmap_local_page(req.pr_entry_bh->b_page);
 	raw_inode = nilfs_palloc_block_get_entry(ifile, req.pr_entry_nr,
 						 req.pr_entry_bh, kaddr);
 	raw_inode->i_flags = 0;
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	mark_buffer_dirty(req.pr_entry_bh);
 	brelse(req.pr_entry_bh);
@@ -173,14 +174,18 @@ int nilfs_ifile_count_free_inodes(struct inode *ifile,
  * nilfs_ifile_read - read or get ifile inode
  * @sb: super block instance
  * @root: root object
+ * @cno: number of checkpoint entry to read
  * @inode_size: size of an inode
- * @raw_inode: on-disk ifile inode
- * @inodep: buffer to store the inode
+ *
+ * Return: 0 on success, or the following negative error code on failure.
+ * * %-EINVAL	- Invalid checkpoint.
+ * * %-ENOMEM	- Insufficient memory available.
+ * * %-EIO	- I/O error (including metadata corruption).
  */
 int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
-		     size_t inode_size, struct nilfs_inode *raw_inode,
-		     struct inode **inodep)
+		     __u64 cno, size_t inode_size)
 {
+	struct the_nilfs *nilfs;
 	struct inode *ifile;
 	int err;
 
@@ -201,13 +206,13 @@ int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
 
 	nilfs_palloc_setup_cache(ifile, &NILFS_IFILE_I(ifile)->palloc_cache);
 
-	err = nilfs_read_inode_common(ifile, raw_inode);
+	nilfs = sb->s_fs_info;
+	err = nilfs_cpfile_read_checkpoint(nilfs->ns_cpfile, cno, root, ifile);
 	if (err)
 		goto failed;
 
 	unlock_new_inode(ifile);
  out:
-	*inodep = ifile;
 	return 0;
  failed:
 	iget_failed(ifile);
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
index 35c5273f4821..625545cc2a98 100644
--- a/fs/nilfs2/ifile.h
+++ b/fs/nilfs2/ifile.h
@@ -21,15 +21,14 @@
 static inline struct nilfs_inode *
 nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh)
 {
-	void *kaddr = kmap(ibh->b_page);
+	void *kaddr = kmap_local_page(ibh->b_page);
 
 	return nilfs_palloc_block_get_entry(ifile, ino, ibh, kaddr);
 }
 
-static inline void nilfs_ifile_unmap_inode(struct inode *ifile, ino_t ino,
-					   struct buffer_head *ibh)
+static inline void nilfs_ifile_unmap_inode(struct nilfs_inode *raw_inode)
 {
-	kunmap(ibh->b_page);
+	kunmap_local(raw_inode);
 }
 
 int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
@@ -39,7 +38,6 @@ int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
 int nilfs_ifile_count_free_inodes(struct inode *, u64 *, u64 *);
 
 int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root,
-		     size_t inode_size, struct nilfs_inode *raw_inode,
-		     struct inode **inodep);
+		     __u64 cno, size_t inode_size);
 
 #endif	/* _NILFS_IFILE_H */
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 9c334c722fc1..7340a01d80e1 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -112,7 +112,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
 					   "%s (ino=%lu): a race condition while inserting a data block at offset=%llu",
 					   __func__, inode->i_ino,
 					   (unsigned long long)blkoff);
-				err = 0;
+				err = -EAGAIN;
 			}
 			nilfs_transaction_abort(inode->i_sb);
 			goto out;
@@ -520,7 +520,7 @@ static int __nilfs_read_inode(struct super_block *sb,
 			inode, inode->i_mode,
 			huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
 	}
-	nilfs_ifile_unmap_inode(root->ifile, ino, bh);
+	nilfs_ifile_unmap_inode(raw_inode);
 	brelse(bh);
 	up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
 	nilfs_set_inode_flags(inode);
@@ -529,7 +529,7 @@ static int __nilfs_read_inode(struct super_block *sb,
 	return 0;
 
  failed_unmap:
-	nilfs_ifile_unmap_inode(root->ifile, ino, bh);
+	nilfs_ifile_unmap_inode(raw_inode);
 	brelse(bh);
 
  bad_inode:
@@ -759,8 +759,18 @@ struct inode *nilfs_iget_for_shadow(struct inode *inode)
 	return s_inode;
 }
 
+/**
+ * nilfs_write_inode_common - export common inode information to on-disk inode
+ * @inode:     inode object
+ * @raw_inode: on-disk inode
+ *
+ * This function writes standard information from the on-memory inode @inode
+ * to @raw_inode on ifile, cpfile or a super root block.  Since inode bmap
+ * data is not exported, nilfs_bmap_write() must be called separately during
+ * log writing.
+ */
 void nilfs_write_inode_common(struct inode *inode,
-			      struct nilfs_inode *raw_inode, int has_bmap)
+			      struct nilfs_inode *raw_inode)
 {
 	struct nilfs_inode_info *ii = NILFS_I(inode);
 
@@ -778,21 +788,6 @@ void nilfs_write_inode_common(struct inode *inode,
 	raw_inode->i_flags = cpu_to_le32(ii->i_flags);
 	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
 
-	if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) {
-		struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
-
-		/* zero-fill unused portion in the case of super root block */
-		raw_inode->i_xattr = 0;
-		raw_inode->i_pad = 0;
-		memset((void *)raw_inode + sizeof(*raw_inode), 0,
-		       nilfs->ns_inode_size - sizeof(*raw_inode));
-	}
-
-	if (has_bmap)
-		nilfs_bmap_write(ii->i_bmap, raw_inode);
-	else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
-		raw_inode->i_device_code =
-			cpu_to_le64(huge_encode_dev(inode->i_rdev));
 	/*
 	 * When extending inode, nilfs->ns_inode_size should be checked
 	 * for substitutions of appended fields.
@@ -813,14 +808,13 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags)
 	if (flags & I_DIRTY_DATASYNC)
 		set_bit(NILFS_I_INODE_SYNC, &ii->i_state);
 
-	nilfs_write_inode_common(inode, raw_inode, 0);
-		/*
-		 * XXX: call with has_bmap = 0 is a workaround to avoid
-		 * deadlock of bmap.  This delays update of i_bmap to just
-		 * before writing.
-		 */
+	nilfs_write_inode_common(inode, raw_inode);
+
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
+		raw_inode->i_device_code =
+			cpu_to_le64(huge_encode_dev(inode->i_rdev));
 
-	nilfs_ifile_unmap_inode(ifile, ino, ibh);
+	nilfs_ifile_unmap_inode(raw_inode);
 }
 
 #define NILFS_MAX_TRUNCATE_BLOCKS	16384  /* 64MB for 4KB block */
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index cfb6aca5ec38..f1a01c191cf5 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -1111,7 +1111,7 @@ static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp)
 	segbytes = nilfs->ns_blocks_per_segment * nilfs->ns_blocksize;
 
 	minseg = range[0] + segbytes - 1;
-	do_div(minseg, segbytes);
+	minseg = div64_ul(minseg, segbytes);
 
 	if (range[1] < 4096)
 		goto out;
@@ -1120,7 +1120,7 @@ static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp)
 	if (maxseg < segbytes)
 		goto out;
 
-	do_div(maxseg, segbytes);
+	maxseg = div64_ul(maxseg, segbytes);
 	maxseg--;
 
 	ret = nilfs_sufile_set_alloc_range(nilfs->ns_sufile, minseg, maxseg);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index e45c01a559c0..4f792a0ad0f0 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -47,12 +47,12 @@ nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
 
 	set_buffer_mapped(bh);
 
-	kaddr = kmap_atomic(bh->b_page);
+	kaddr = kmap_local_page(bh->b_page);
 	memset(kaddr + bh_offset(bh), 0, i_blocksize(inode));
 	if (init_block)
 		init_block(inode, bh, kaddr);
 	flush_dcache_page(bh->b_page);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	set_buffer_uptodate(bh);
 	mark_buffer_dirty(bh);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 98cffaf0ac12..2e29b98ba8ba 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -256,7 +256,8 @@ extern struct inode *nilfs_new_inode(struct inode *, umode_t);
 extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
 extern void nilfs_set_inode_flags(struct inode *);
 extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *);
-extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int);
+void nilfs_write_inode_common(struct inode *inode,
+			      struct nilfs_inode *raw_inode);
 struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
 			    unsigned long ino);
 struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 5c2eba1987bd..14e470fb8870 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -103,11 +103,11 @@ void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh)
 	struct page *spage = sbh->b_page, *dpage = dbh->b_page;
 	struct buffer_head *bh;
 
-	kaddr0 = kmap_atomic(spage);
-	kaddr1 = kmap_atomic(dpage);
+	kaddr0 = kmap_local_page(spage);
+	kaddr1 = kmap_local_page(dpage);
 	memcpy(kaddr1 + bh_offset(dbh), kaddr0 + bh_offset(sbh), sbh->b_size);
-	kunmap_atomic(kaddr1);
-	kunmap_atomic(kaddr0);
+	kunmap_local(kaddr1);
+	kunmap_local(kaddr0);
 
 	dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS;
 	dbh->b_blocknr = sbh->b_blocknr;
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index a9b8d77c8c1d..49a70c68bf3c 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -482,9 +482,9 @@ static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
 	if (unlikely(!bh_org))
 		return -EIO;
 
-	kaddr = kmap_atomic(page);
+	kaddr = kmap_local_page(page);
 	memcpy(kaddr + from, bh_org->b_data, bh_org->b_size);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 	brelse(bh_org);
 	return 0;
 }
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 6e59dc19a732..dc431b4c34c9 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -220,9 +220,9 @@ static void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
 		crc = crc32_le(crc, bh->b_data, bh->b_size);
 	}
 	list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
-		kaddr = kmap_atomic(bh->b_page);
+		kaddr = kmap_local_page(bh->b_page);
 		crc = crc32_le(crc, kaddr + bh_offset(bh), bh->b_size);
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 	}
 	raw_sum->ss_datasum = cpu_to_le32(crc);
 }
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 2bfb08052d39..aa5290cb7467 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -880,76 +880,6 @@ static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
 	nilfs_mdt_clear_dirty(nilfs->ns_dat);
 }
 
-static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
-{
-	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
-	struct buffer_head *bh_cp;
-	struct nilfs_checkpoint *raw_cp;
-	int err;
-
-	/* XXX: this interface will be changed */
-	err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 1,
-					  &raw_cp, &bh_cp);
-	if (likely(!err)) {
-		/*
-		 * The following code is duplicated with cpfile.  But, it is
-		 * needed to collect the checkpoint even if it was not newly
-		 * created.
-		 */
-		mark_buffer_dirty(bh_cp);
-		nilfs_mdt_mark_dirty(nilfs->ns_cpfile);
-		nilfs_cpfile_put_checkpoint(
-			nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
-	} else if (err == -EINVAL || err == -ENOENT) {
-		nilfs_error(sci->sc_super,
-			    "checkpoint creation failed due to metadata corruption.");
-		err = -EIO;
-	}
-	return err;
-}
-
-static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
-{
-	struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
-	struct buffer_head *bh_cp;
-	struct nilfs_checkpoint *raw_cp;
-	int err;
-
-	err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0,
-					  &raw_cp, &bh_cp);
-	if (unlikely(err)) {
-		if (err == -EINVAL || err == -ENOENT) {
-			nilfs_error(sci->sc_super,
-				    "checkpoint finalization failed due to metadata corruption.");
-			err = -EIO;
-		}
-		goto failed_ibh;
-	}
-	raw_cp->cp_snapshot_list.ssl_next = 0;
-	raw_cp->cp_snapshot_list.ssl_prev = 0;
-	raw_cp->cp_inodes_count =
-		cpu_to_le64(atomic64_read(&sci->sc_root->inodes_count));
-	raw_cp->cp_blocks_count =
-		cpu_to_le64(atomic64_read(&sci->sc_root->blocks_count));
-	raw_cp->cp_nblk_inc =
-		cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc);
-	raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime);
-	raw_cp->cp_cno = cpu_to_le64(nilfs->ns_cno);
-
-	if (test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
-		nilfs_checkpoint_clear_minor(raw_cp);
-	else
-		nilfs_checkpoint_set_minor(raw_cp);
-
-	nilfs_write_inode_common(sci->sc_root->ifile,
-				 &raw_cp->cp_ifile_inode, 1);
-	nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
-	return 0;
-
- failed_ibh:
-	return err;
-}
-
 static void nilfs_fill_in_file_bmap(struct inode *ifile,
 				    struct nilfs_inode_info *ii)
 
@@ -963,7 +893,7 @@ static void nilfs_fill_in_file_bmap(struct inode *ifile,
 		raw_inode = nilfs_ifile_map_inode(ifile, ii->vfs_inode.i_ino,
 						  ibh);
 		nilfs_bmap_write(ii->i_bmap, raw_inode);
-		nilfs_ifile_unmap_inode(ifile, ii->vfs_inode.i_ino, ibh);
+		nilfs_ifile_unmap_inode(raw_inode);
 	}
 }
 
@@ -977,6 +907,33 @@ static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci)
 	}
 }
 
+/**
+ * nilfs_write_root_mdt_inode - export root metadata inode information to
+ *                              the on-disk inode
+ * @inode:     inode object of the root metadata file
+ * @raw_inode: on-disk inode
+ *
+ * nilfs_write_root_mdt_inode() writes inode information and bmap data of
+ * @inode to the inode area of the metadata file allocated on the super root
+ * block created to finalize the log.  Since super root blocks are configured
+ * each time, this function zero-fills the unused area of @raw_inode.
+ */
+static void nilfs_write_root_mdt_inode(struct inode *inode,
+				       struct nilfs_inode *raw_inode)
+{
+	struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
+
+	nilfs_write_inode_common(inode, raw_inode);
+
+	/* zero-fill unused portion of raw_inode */
+	raw_inode->i_xattr = 0;
+	raw_inode->i_pad = 0;
+	memset((void *)raw_inode + sizeof(*raw_inode), 0,
+	       nilfs->ns_inode_size - sizeof(*raw_inode));
+
+	nilfs_bmap_write(NILFS_I(inode)->i_bmap, raw_inode);
+}
+
 static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
 					     struct the_nilfs *nilfs)
 {
@@ -998,12 +955,13 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
 			      nilfs->ns_nongc_ctime : sci->sc_seg_ctime);
 	raw_sr->sr_flags = 0;
 
-	nilfs_write_inode_common(nilfs->ns_dat, (void *)raw_sr +
-				 NILFS_SR_DAT_OFFSET(isz), 1);
-	nilfs_write_inode_common(nilfs->ns_cpfile, (void *)raw_sr +
-				 NILFS_SR_CPFILE_OFFSET(isz), 1);
-	nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr +
-				 NILFS_SR_SUFILE_OFFSET(isz), 1);
+	nilfs_write_root_mdt_inode(nilfs->ns_dat, (void *)raw_sr +
+				   NILFS_SR_DAT_OFFSET(isz));
+	nilfs_write_root_mdt_inode(nilfs->ns_cpfile, (void *)raw_sr +
+				   NILFS_SR_CPFILE_OFFSET(isz));
+	nilfs_write_root_mdt_inode(nilfs->ns_sufile, (void *)raw_sr +
+				   NILFS_SR_SUFILE_OFFSET(isz));
+
 	memset((void *)raw_sr + srsz, 0, nilfs->ns_blocksize - srsz);
 	set_buffer_uptodate(bh_sr);
 	unlock_buffer(bh_sr);
@@ -1230,7 +1188,8 @@ static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
 			break;
 		nilfs_sc_cstage_inc(sci);
 		/* Creating a checkpoint */
-		err = nilfs_segctor_create_checkpoint(sci);
+		err = nilfs_cpfile_create_checkpoint(nilfs->ns_cpfile,
+						     nilfs->ns_cno);
 		if (unlikely(err))
 			break;
 		fallthrough;
@@ -2101,7 +2060,11 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
 
 		if (mode == SC_LSEG_SR &&
 		    nilfs_sc_cstage_get(sci) >= NILFS_ST_CPFILE) {
-			err = nilfs_segctor_fill_in_checkpoint(sci);
+			err = nilfs_cpfile_finalize_checkpoint(
+				nilfs->ns_cpfile, nilfs->ns_cno, sci->sc_root,
+				sci->sc_nblk_inc + sci->sc_nblk_this_inc,
+				sci->sc_seg_ctime,
+				!test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags));
 			if (unlikely(err))
 				goto failed_to_write;
 
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 0a8119456c21..6748218be7c5 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -48,7 +48,7 @@ nilfs_sufile_get_blkoff(const struct inode *sufile, __u64 segnum)
 {
 	__u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset;
 
-	do_div(t, nilfs_sufile_segment_usages_per_block(sufile));
+	t = div64_ul(t, nilfs_sufile_segment_usages_per_block(sufile));
 	return (unsigned long)t;
 }
 
@@ -107,11 +107,11 @@ static void nilfs_sufile_mod_counter(struct buffer_head *header_bh,
 	struct nilfs_sufile_header *header;
 	void *kaddr;
 
-	kaddr = kmap_atomic(header_bh->b_page);
+	kaddr = kmap_local_page(header_bh->b_page);
 	header = kaddr + bh_offset(header_bh);
 	le64_add_cpu(&header->sh_ncleansegs, ncleanadd);
 	le64_add_cpu(&header->sh_ndirtysegs, ndirtyadd);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	mark_buffer_dirty(header_bh);
 }
@@ -315,10 +315,10 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
 	ret = nilfs_sufile_get_header_block(sufile, &header_bh);
 	if (ret < 0)
 		goto out_sem;
-	kaddr = kmap_atomic(header_bh->b_page);
+	kaddr = kmap_local_page(header_bh->b_page);
 	header = kaddr + bh_offset(header_bh);
 	last_alloc = le64_to_cpu(header->sh_last_alloc);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	nsegments = nilfs_sufile_get_nsegments(sufile);
 	maxsegnum = sui->allocmax;
@@ -352,7 +352,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
 							   &su_bh);
 		if (ret < 0)
 			goto out_header;
-		kaddr = kmap_atomic(su_bh->b_page);
+		kaddr = kmap_local_page(su_bh->b_page);
 		su = nilfs_sufile_block_get_segment_usage(
 			sufile, segnum, su_bh, kaddr);
 
@@ -363,14 +363,14 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
 				continue;
 			/* found a clean segment */
 			nilfs_segment_usage_set_dirty(su);
-			kunmap_atomic(kaddr);
+			kunmap_local(kaddr);
 
-			kaddr = kmap_atomic(header_bh->b_page);
+			kaddr = kmap_local_page(header_bh->b_page);
 			header = kaddr + bh_offset(header_bh);
 			le64_add_cpu(&header->sh_ncleansegs, -1);
 			le64_add_cpu(&header->sh_ndirtysegs, 1);
 			header->sh_last_alloc = cpu_to_le64(segnum);
-			kunmap_atomic(kaddr);
+			kunmap_local(kaddr);
 
 			sui->ncleansegs--;
 			mark_buffer_dirty(header_bh);
@@ -384,7 +384,7 @@ int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
 			goto out_header;
 		}
 
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		brelse(su_bh);
 	}
 
@@ -406,16 +406,16 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum,
 	struct nilfs_segment_usage *su;
 	void *kaddr;
 
-	kaddr = kmap_atomic(su_bh->b_page);
+	kaddr = kmap_local_page(su_bh->b_page);
 	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
 	if (unlikely(!nilfs_segment_usage_clean(su))) {
 		nilfs_warn(sufile->i_sb, "%s: segment %llu must be clean",
 			   __func__, (unsigned long long)segnum);
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		return;
 	}
 	nilfs_segment_usage_set_dirty(su);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	nilfs_sufile_mod_counter(header_bh, -1, 1);
 	NILFS_SUI(sufile)->ncleansegs--;
@@ -432,11 +432,11 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
 	void *kaddr;
 	int clean, dirty;
 
-	kaddr = kmap_atomic(su_bh->b_page);
+	kaddr = kmap_local_page(su_bh->b_page);
 	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
 	if (su->su_flags == cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY)) &&
 	    su->su_nblocks == cpu_to_le32(0)) {
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		return;
 	}
 	clean = nilfs_segment_usage_clean(su);
@@ -446,7 +446,7 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 segnum,
 	su->su_lastmod = cpu_to_le64(0);
 	su->su_nblocks = cpu_to_le32(0);
 	su->su_flags = cpu_to_le32(BIT(NILFS_SEGMENT_USAGE_DIRTY));
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	nilfs_sufile_mod_counter(header_bh, clean ? (u64)-1 : 0, dirty ? 0 : 1);
 	NILFS_SUI(sufile)->ncleansegs -= clean;
@@ -463,12 +463,12 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
 	void *kaddr;
 	int sudirty;
 
-	kaddr = kmap_atomic(su_bh->b_page);
+	kaddr = kmap_local_page(su_bh->b_page);
 	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
 	if (nilfs_segment_usage_clean(su)) {
 		nilfs_warn(sufile->i_sb, "%s: segment %llu is already clean",
 			   __func__, (unsigned long long)segnum);
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		return;
 	}
 	if (unlikely(nilfs_segment_usage_error(su)))
@@ -481,7 +481,7 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
 			   (unsigned long long)segnum);
 
 	nilfs_segment_usage_set_clean(su);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 	mark_buffer_dirty(su_bh);
 
 	nilfs_sufile_mod_counter(header_bh, 1, sudirty ? (u64)-1 : 0);
@@ -509,12 +509,12 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
 	if (ret)
 		goto out_sem;
 
-	kaddr = kmap_atomic(bh->b_page);
+	kaddr = kmap_local_page(bh->b_page);
 	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
 	if (unlikely(nilfs_segment_usage_error(su))) {
 		struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
 
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		brelse(bh);
 		if (nilfs_segment_is_active(nilfs, segnum)) {
 			nilfs_error(sufile->i_sb,
@@ -532,7 +532,7 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
 		ret = -EIO;
 	} else {
 		nilfs_segment_usage_set_dirty(su);
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		mark_buffer_dirty(bh);
 		nilfs_mdt_mark_dirty(sufile);
 		brelse(bh);
@@ -562,7 +562,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
 	if (ret < 0)
 		goto out_sem;
 
-	kaddr = kmap_atomic(bh->b_page);
+	kaddr = kmap_local_page(bh->b_page);
 	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
 	if (modtime) {
 		/*
@@ -573,7 +573,7 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
 		su->su_lastmod = cpu_to_le64(modtime);
 	}
 	su->su_nblocks = cpu_to_le32(nblocks);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	mark_buffer_dirty(bh);
 	nilfs_mdt_mark_dirty(sufile);
@@ -614,7 +614,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
 	if (ret < 0)
 		goto out_sem;
 
-	kaddr = kmap_atomic(header_bh->b_page);
+	kaddr = kmap_local_page(header_bh->b_page);
 	header = kaddr + bh_offset(header_bh);
 	sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile);
 	sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs);
@@ -624,7 +624,7 @@ int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
 	spin_lock(&nilfs->ns_last_segment_lock);
 	sustat->ss_prot_seq = nilfs->ns_prot_seq;
 	spin_unlock(&nilfs->ns_last_segment_lock);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 	brelse(header_bh);
 
  out_sem:
@@ -640,15 +640,15 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum,
 	void *kaddr;
 	int suclean;
 
-	kaddr = kmap_atomic(su_bh->b_page);
+	kaddr = kmap_local_page(su_bh->b_page);
 	su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
 	if (nilfs_segment_usage_error(su)) {
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		return;
 	}
 	suclean = nilfs_segment_usage_clean(su);
 	nilfs_segment_usage_set_error(su);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	if (suclean) {
 		nilfs_sufile_mod_counter(header_bh, -1, 0);
@@ -717,7 +717,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile,
 			/* hole */
 			continue;
 		}
-		kaddr = kmap_atomic(su_bh->b_page);
+		kaddr = kmap_local_page(su_bh->b_page);
 		su = nilfs_sufile_block_get_segment_usage(
 			sufile, segnum, su_bh, kaddr);
 		su2 = su;
@@ -726,7 +726,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile,
 			     ~BIT(NILFS_SEGMENT_USAGE_ERROR)) ||
 			    nilfs_segment_is_active(nilfs, segnum + j)) {
 				ret = -EBUSY;
-				kunmap_atomic(kaddr);
+				kunmap_local(kaddr);
 				brelse(su_bh);
 				goto out_header;
 			}
@@ -738,7 +738,7 @@ static int nilfs_sufile_truncate_range(struct inode *sufile,
 				nc++;
 			}
 		}
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		if (nc > 0) {
 			mark_buffer_dirty(su_bh);
 			ncleaned += nc;
@@ -823,10 +823,10 @@ int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs)
 		sui->allocmin = 0;
 	}
 
-	kaddr = kmap_atomic(header_bh->b_page);
+	kaddr = kmap_local_page(header_bh->b_page);
 	header = kaddr + bh_offset(header_bh);
 	header->sh_ncleansegs = cpu_to_le64(sui->ncleansegs);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 
 	mark_buffer_dirty(header_bh);
 	nilfs_mdt_mark_dirty(sufile);
@@ -891,7 +891,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
 			continue;
 		}
 
-		kaddr = kmap_atomic(su_bh->b_page);
+		kaddr = kmap_local_page(su_bh->b_page);
 		su = nilfs_sufile_block_get_segment_usage(
 			sufile, segnum, su_bh, kaddr);
 		for (j = 0; j < n;
@@ -904,7 +904,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf,
 				si->sui_flags |=
 					BIT(NILFS_SEGMENT_USAGE_ACTIVE);
 		}
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		brelse(su_bh);
 	}
 	ret = nsegs;
@@ -973,7 +973,7 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf,
 		goto out_header;
 
 	for (;;) {
-		kaddr = kmap_atomic(bh->b_page);
+		kaddr = kmap_local_page(bh->b_page);
 		su = nilfs_sufile_block_get_segment_usage(
 			sufile, sup->sup_segnum, bh, kaddr);
 
@@ -1010,7 +1010,7 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf,
 			su->su_flags = cpu_to_le32(sup->sup_sui.sui_flags);
 		}
 
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 
 		sup = (void *)sup + supsz;
 		if (sup >= supend)
@@ -1115,7 +1115,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
 			continue;
 		}
 
-		kaddr = kmap_atomic(su_bh->b_page);
+		kaddr = kmap_local_page(su_bh->b_page);
 		su = nilfs_sufile_block_get_segment_usage(sufile, segnum,
 				su_bh, kaddr);
 		for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) {
@@ -1145,7 +1145,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
 			}
 
 			if (nblocks >= minlen) {
-				kunmap_atomic(kaddr);
+				kunmap_local(kaddr);
 
 				ret = blkdev_issue_discard(nilfs->ns_bdev,
 						start * sects_per_block,
@@ -1157,7 +1157,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
 				}
 
 				ndiscarded += nblocks;
-				kaddr = kmap_atomic(su_bh->b_page);
+				kaddr = kmap_local_page(su_bh->b_page);
 				su = nilfs_sufile_block_get_segment_usage(
 					sufile, segnum, su_bh, kaddr);
 			}
@@ -1166,7 +1166,7 @@ int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range)
 			start = seg_start;
 			nblocks = seg_end - seg_start + 1;
 		}
-		kunmap_atomic(kaddr);
+		kunmap_local(kaddr);
 		put_bh(su_bh);
 	}
 
@@ -1246,10 +1246,10 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize,
 		goto failed;
 
 	sui = NILFS_SUI(sufile);
-	kaddr = kmap_atomic(header_bh->b_page);
+	kaddr = kmap_local_page(header_bh->b_page);
 	header = kaddr + bh_offset(header_bh);
 	sui->ncleansegs = le64_to_cpu(header->sh_ncleansegs);
-	kunmap_atomic(kaddr);
+	kunmap_local(kaddr);
 	brelse(header_bh);
 
 	sui->allocmax = nilfs_sufile_get_nsegments(sufile) - 1;
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index df8674173b22..ac24ed109ce9 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -448,7 +448,7 @@ int nilfs_resize_fs(struct super_block *sb, __u64 newsize)
 
 	sb2off = NILFS_SB2_OFFSET_BYTES(newsize);
 	newnsegs = sb2off >> nilfs->ns_blocksize_bits;
-	do_div(newnsegs, nilfs->ns_blocks_per_segment);
+	newnsegs = div64_ul(newnsegs, nilfs->ns_blocks_per_segment);
 
 	ret = nilfs_sufile_resize(nilfs->ns_sufile, newnsegs);
 	up_write(&nilfs->ns_segctor_sem);
@@ -544,8 +544,6 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
 {
 	struct the_nilfs *nilfs = sb->s_fs_info;
 	struct nilfs_root *root;
-	struct nilfs_checkpoint *raw_cp;
-	struct buffer_head *bh_cp;
 	int err = -ENOMEM;
 
 	root = nilfs_find_or_create_root(
@@ -557,38 +555,19 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
 		goto reuse; /* already attached checkpoint */
 
 	down_read(&nilfs->ns_segctor_sem);
-	err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
-					  &bh_cp);
+	err = nilfs_ifile_read(sb, root, cno, nilfs->ns_inode_size);
 	up_read(&nilfs->ns_segctor_sem);
-	if (unlikely(err)) {
-		if (err == -ENOENT || err == -EINVAL) {
-			nilfs_err(sb,
-				  "Invalid checkpoint (checkpoint number=%llu)",
-				  (unsigned long long)cno);
-			err = -EINVAL;
-		}
+	if (unlikely(err))
 		goto failed;
-	}
-
-	err = nilfs_ifile_read(sb, root, nilfs->ns_inode_size,
-			       &raw_cp->cp_ifile_inode, &root->ifile);
-	if (err)
-		goto failed_bh;
-
-	atomic64_set(&root->inodes_count,
-			le64_to_cpu(raw_cp->cp_inodes_count));
-	atomic64_set(&root->blocks_count,
-			le64_to_cpu(raw_cp->cp_blocks_count));
-
-	nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
 
  reuse:
 	*rootp = root;
 	return 0;
 
- failed_bh:
-	nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
  failed:
+	if (err == -EINVAL)
+		nilfs_err(sb, "Invalid checkpoint (checkpoint number=%llu)",
+			  (unsigned long long)cno);
 	nilfs_put_root(root);
 
 	return err;
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 71400496ed36..2ae2c1bbf6d1 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -413,7 +413,7 @@ static u64 nilfs_max_segment_count(struct the_nilfs *nilfs)
 {
 	u64 max_count = U64_MAX;
 
-	do_div(max_count, nilfs->ns_blocks_per_segment);
+	max_count = div64_ul(max_count, nilfs->ns_blocks_per_segment);
 	return min_t(u64, max_count, ULONG_MAX);
 }
 
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 1e4def21811e..224bccaab4cc 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -228,8 +228,10 @@ static int fanotify_get_response(struct fsnotify_group *group,
 
 	pr_debug("%s: group=%p event=%p\n", __func__, group, event);
 
-	ret = wait_event_killable(group->fanotify_data.access_waitq,
-				  event->state == FAN_EVENT_ANSWERED);
+	ret = wait_event_state(group->fanotify_data.access_waitq,
+				  event->state == FAN_EVENT_ANSWERED,
+				  (TASK_KILLABLE|TASK_FREEZABLE));
+
 	/* Signal pending? */
 	if (ret < 0) {
 		spin_lock(&group->notification_lock);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 8bfd690e9f10..2fc105a72a8f 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -141,7 +141,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
 }
 
 /* Are inode/sb/mount interested in parent and name info with this event? */
-static bool fsnotify_event_needs_parent(struct inode *inode, struct mount *mnt,
+static bool fsnotify_event_needs_parent(struct inode *inode, __u32 mnt_mask,
 					__u32 mask)
 {
 	__u32 marks_mask = 0;
@@ -160,13 +160,22 @@ static bool fsnotify_event_needs_parent(struct inode *inode, struct mount *mnt,
 	/* Did either inode/sb/mount subscribe for events with parent/name? */
 	marks_mask |= fsnotify_parent_needed_mask(inode->i_fsnotify_mask);
 	marks_mask |= fsnotify_parent_needed_mask(inode->i_sb->s_fsnotify_mask);
-	if (mnt)
-		marks_mask |= fsnotify_parent_needed_mask(mnt->mnt_fsnotify_mask);
+	marks_mask |= fsnotify_parent_needed_mask(mnt_mask);
 
 	/* Did they subscribe for this event with parent/name info? */
 	return mask & marks_mask;
 }
 
+/* Are there any inode/mount/sb objects that are interested in this event? */
+static inline bool fsnotify_object_watched(struct inode *inode, __u32 mnt_mask,
+					   __u32 mask)
+{
+	__u32 marks_mask = inode->i_fsnotify_mask | mnt_mask |
+			   inode->i_sb->s_fsnotify_mask;
+
+	return mask & marks_mask & ALL_FSNOTIFY_EVENTS;
+}
+
 /*
  * Notify this dentry's parent about a child's events with child name info
  * if parent is watching or if inode/sb/mount are interested in events with
@@ -179,7 +188,7 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
 		      int data_type)
 {
 	const struct path *path = fsnotify_data_path(data, data_type);
-	struct mount *mnt = path ? real_mount(path->mnt) : NULL;
+	__u32 mnt_mask = path ? real_mount(path->mnt)->mnt_fsnotify_mask : 0;
 	struct inode *inode = d_inode(dentry);
 	struct dentry *parent;
 	bool parent_watched = dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED;
@@ -190,16 +199,13 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
 	struct qstr *file_name = NULL;
 	int ret = 0;
 
-	/*
-	 * Do inode/sb/mount care about parent and name info on non-dir?
-	 * Do they care about any event at all?
-	 */
-	if (!inode->i_fsnotify_marks && !inode->i_sb->s_fsnotify_marks &&
-	    (!mnt || !mnt->mnt_fsnotify_marks) && !parent_watched)
+	/* Optimize the likely case of nobody watching this path */
+	if (likely(!parent_watched &&
+		   !fsnotify_object_watched(inode, mnt_mask, mask)))
 		return 0;
 
 	parent = NULL;
-	parent_needed = fsnotify_event_needs_parent(inode, mnt, mask);
+	parent_needed = fsnotify_event_needs_parent(inode, mnt_mask, mask);
 	if (!parent_watched && !parent_needed)
 		goto notify;
 
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 34e1e3e36733..07e22a15ef02 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -27,26 +27,17 @@ static const struct file_operations ns_file_operations = {
 static char *ns_dname(struct dentry *dentry, char *buffer, int buflen)
 {
 	struct inode *inode = d_inode(dentry);
-	const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
+	struct ns_common *ns = inode->i_private;
+	const struct proc_ns_operations *ns_ops = ns->ops;
 
 	return dynamic_dname(buffer, buflen, "%s:[%lu]",
 		ns_ops->name, inode->i_ino);
 }
 
-static void ns_prune_dentry(struct dentry *dentry)
-{
-	struct inode *inode = d_inode(dentry);
-	if (inode) {
-		struct ns_common *ns = inode->i_private;
-		atomic_long_set(&ns->stashed, 0);
-	}
-}
-
-const struct dentry_operations ns_dentry_operations =
-{
-	.d_prune	= ns_prune_dentry,
+const struct dentry_operations ns_dentry_operations = {
 	.d_delete	= always_delete_dentry,
 	.d_dname	= ns_dname,
+	.d_prune	= stashed_dentry_prune,
 };
 
 static void nsfs_evict(struct inode *inode)
@@ -56,67 +47,16 @@ static void nsfs_evict(struct inode *inode)
 	ns->ops->put(ns);
 }
 
-static int __ns_get_path(struct path *path, struct ns_common *ns)
-{
-	struct vfsmount *mnt = nsfs_mnt;
-	struct dentry *dentry;
-	struct inode *inode;
-	unsigned long d;
-
-	rcu_read_lock();
-	d = atomic_long_read(&ns->stashed);
-	if (!d)
-		goto slow;
-	dentry = (struct dentry *)d;
-	if (!lockref_get_not_dead(&dentry->d_lockref))
-		goto slow;
-	rcu_read_unlock();
-	ns->ops->put(ns);
-got_it:
-	path->mnt = mntget(mnt);
-	path->dentry = dentry;
-	return 0;
-slow:
-	rcu_read_unlock();
-	inode = new_inode_pseudo(mnt->mnt_sb);
-	if (!inode) {
-		ns->ops->put(ns);
-		return -ENOMEM;
-	}
-	inode->i_ino = ns->inum;
-	simple_inode_init_ts(inode);
-	inode->i_flags |= S_IMMUTABLE;
-	inode->i_mode = S_IFREG | S_IRUGO;
-	inode->i_fop = &ns_file_operations;
-	inode->i_private = ns;
-
-	dentry = d_make_root(inode);	/* not the normal use, but... */
-	if (!dentry)
-		return -ENOMEM;
-	dentry->d_fsdata = (void *)ns->ops;
-	d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry);
-	if (d) {
-		d_delete(dentry);	/* make sure ->d_prune() does nothing */
-		dput(dentry);
-		cpu_relax();
-		return -EAGAIN;
-	}
-	goto got_it;
-}
-
 int ns_get_path_cb(struct path *path, ns_get_path_helper_t *ns_get_cb,
 		     void *private_data)
 {
-	int ret;
+	struct ns_common *ns;
 
-	do {
-		struct ns_common *ns = ns_get_cb(private_data);
-		if (!ns)
-			return -ENOENT;
-		ret = __ns_get_path(path, ns);
-	} while (ret == -EAGAIN);
+	ns = ns_get_cb(private_data);
+	if (!ns)
+		return -ENOENT;
 
-	return ret;
+	return path_from_stashed(&ns->stashed, nsfs_mnt, ns, path);
 }
 
 struct ns_get_path_task_args {
@@ -146,6 +86,7 @@ int open_related_ns(struct ns_common *ns,
 		   struct ns_common *(*get_ns)(struct ns_common *ns))
 {
 	struct path path = {};
+	struct ns_common *relative;
 	struct file *f;
 	int err;
 	int fd;
@@ -154,19 +95,14 @@ int open_related_ns(struct ns_common *ns,
 	if (fd < 0)
 		return fd;
 
-	do {
-		struct ns_common *relative;
-
-		relative = get_ns(ns);
-		if (IS_ERR(relative)) {
-			put_unused_fd(fd);
-			return PTR_ERR(relative);
-		}
-
-		err = __ns_get_path(&path, relative);
-	} while (err == -EAGAIN);
+	relative = get_ns(ns);
+	if (IS_ERR(relative)) {
+		put_unused_fd(fd);
+		return PTR_ERR(relative);
+	}
 
-	if (err) {
+	err = path_from_stashed(&relative->stashed, nsfs_mnt, relative, &path);
+	if (err < 0) {
 		put_unused_fd(fd);
 		return err;
 	}
@@ -249,7 +185,8 @@ bool ns_match(const struct ns_common *ns, dev_t dev, ino_t ino)
 static int nsfs_show_path(struct seq_file *seq, struct dentry *dentry)
 {
 	struct inode *inode = d_inode(dentry);
-	const struct proc_ns_operations *ns_ops = dentry->d_fsdata;
+	const struct ns_common *ns = inode->i_private;
+	const struct proc_ns_operations *ns_ops = ns->ops;
 
 	seq_printf(seq, "%s:[%lu]", ns_ops->name, inode->i_ino);
 	return 0;
@@ -261,6 +198,28 @@ static const struct super_operations nsfs_ops = {
 	.show_path = nsfs_show_path,
 };
 
+static int nsfs_init_inode(struct inode *inode, void *data)
+{
+	struct ns_common *ns = data;
+
+	inode->i_private = data;
+	inode->i_mode |= S_IRUGO;
+	inode->i_fop = &ns_file_operations;
+	inode->i_ino = ns->inum;
+	return 0;
+}
+
+static void nsfs_put_data(void *data)
+{
+	struct ns_common *ns = data;
+	ns->ops->put(ns);
+}
+
+static const struct stashed_operations nsfs_stashed_ops = {
+	.init_inode = nsfs_init_inode,
+	.put_data = nsfs_put_data,
+};
+
 static int nsfs_init_fs_context(struct fs_context *fc)
 {
 	struct pseudo_fs_context *ctx = init_pseudo(fc, NSFS_MAGIC);
@@ -268,6 +227,7 @@ static int nsfs_init_fs_context(struct fs_context *fc)
 		return -ENOMEM;
 	ctx->ops = &nsfs_ops;
 	ctx->dops = &ns_dentry_operations;
+	fc->s_fs_info = (void *)&nsfs_stashed_ops;
 	return 0;
 }
 
diff --git a/fs/ntfs/Kconfig b/fs/ntfs/Kconfig
deleted file mode 100644
index 7b2509741735..000000000000
--- a/fs/ntfs/Kconfig
+++ /dev/null
@@ -1,81 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-config NTFS_FS
-	tristate "NTFS file system support"
-	select BUFFER_HEAD
-	select NLS
-	help
-	  NTFS is the file system of Microsoft Windows NT, 2000, XP and 2003.
-
-	  Saying Y or M here enables read support.  There is partial, but
-	  safe, write support available.  For write support you must also
-	  say Y to "NTFS write support" below.
-
-	  There are also a number of user-space tools available, called
-	  ntfsprogs.  These include ntfsundelete and ntfsresize, that work
-	  without NTFS support enabled in the kernel.
-
-	  This is a rewrite from scratch of Linux NTFS support and replaced
-	  the old NTFS code starting with Linux 2.5.11.  A backport to
-	  the Linux 2.4 kernel series is separately available as a patch
-	  from the project web site.
-
-	  For more information see <file:Documentation/filesystems/ntfs.rst>
-	  and <http://www.linux-ntfs.org/>.
-
-	  To compile this file system support as a module, choose M here: the
-	  module will be called ntfs.
-
-	  If you are not using Windows NT, 2000, XP or 2003 in addition to
-	  Linux on your computer it is safe to say N.
-
-config NTFS_DEBUG
-	bool "NTFS debugging support"
-	depends on NTFS_FS
-	help
-	  If you are experiencing any problems with the NTFS file system, say
-	  Y here.  This will result in additional consistency checks to be
-	  performed by the driver as well as additional debugging messages to
-	  be written to the system log.  Note that debugging messages are
-	  disabled by default.  To enable them, supply the option debug_msgs=1
-	  at the kernel command line when booting the kernel or as an option
-	  to insmod when loading the ntfs module.  Once the driver is active,
-	  you can enable debugging messages by doing (as root):
-	  echo 1 > /proc/sys/fs/ntfs-debug
-	  Replacing the "1" with "0" would disable debug messages.
-
-	  If you leave debugging messages disabled, this results in little
-	  overhead, but enabling debug messages results in very significant
-	  slowdown of the system.
-
-	  When reporting bugs, please try to have available a full dump of
-	  debugging messages while the misbehaviour was occurring.
-
-config NTFS_RW
-	bool "NTFS write support"
-	depends on NTFS_FS
-	depends on PAGE_SIZE_LESS_THAN_64KB
-	help
-	  This enables the partial, but safe, write support in the NTFS driver.
-
-	  The only supported operation is overwriting existing files, without
-	  changing the file length.  No file or directory creation, deletion or
-	  renaming is possible.  Note only non-resident files can be written to
-	  so you may find that some very small files (<500 bytes or so) cannot
-	  be written to.
-
-	  While we cannot guarantee that it will not damage any data, we have
-	  so far not received a single report where the driver would have
-	  damaged someones data so we assume it is perfectly safe to use.
-
-	  Note:  While write support is safe in this version (a rewrite from
-	  scratch of the NTFS support), it should be noted that the old NTFS
-	  write support, included in Linux 2.5.10 and before (since 1997),
-	  is not safe.
-
-	  This is currently useful with TopologiLinux.  TopologiLinux is run
-	  on top of any DOS/Microsoft Windows system without partitioning your
-	  hard disk.  Unlike other Linux distributions TopologiLinux does not
-	  need its own partition.  For more information see
-	  <http://topologi-linux.sourceforge.net/>
-
-	  It is perfectly safe to say N here.
diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile
deleted file mode 100644
index 3e736572ed00..000000000000
--- a/fs/ntfs/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-# Rules for making the NTFS driver.
-
-obj-$(CONFIG_NTFS_FS) += ntfs.o
-
-ntfs-y := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \
-	  index.o inode.o mft.o mst.o namei.o runlist.o super.o sysctl.o \
-	  unistr.o upcase.o
-
-ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o
-
-ccflags-y := -DNTFS_VERSION=\"2.1.32\"
-ccflags-$(CONFIG_NTFS_DEBUG)	+= -DDEBUG
-ccflags-$(CONFIG_NTFS_RW)	+= -DNTFS_RW
-
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
deleted file mode 100644
index 2d01517a2d59..000000000000
--- a/fs/ntfs/aops.c
+++ /dev/null
@@ -1,1744 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * aops.c - NTFS kernel address space operations and page cache handling.
- *
- * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
- * Copyright (c) 2002 Richard Russon
- */
-
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/gfp.h>
-#include <linux/mm.h>
-#include <linux/pagemap.h>
-#include <linux/swap.h>
-#include <linux/buffer_head.h>
-#include <linux/writeback.h>
-#include <linux/bit_spinlock.h>
-#include <linux/bio.h>
-
-#include "aops.h"
-#include "attrib.h"
-#include "debug.h"
-#include "inode.h"
-#include "mft.h"
-#include "runlist.h"
-#include "types.h"
-#include "ntfs.h"
-
-/**
- * ntfs_end_buffer_async_read - async io completion for reading attributes
- * @bh:		buffer head on which io is completed
- * @uptodate:	whether @bh is now uptodate or not
- *
- * Asynchronous I/O completion handler for reading pages belonging to the
- * attribute address space of an inode.  The inodes can either be files or
- * directories or they can be fake inodes describing some attribute.
- *
- * If NInoMstProtected(), perform the post read mst fixups when all IO on the
- * page has been completed and mark the page uptodate or set the error bit on
- * the page.  To determine the size of the records that need fixing up, we
- * cheat a little bit by setting the index_block_size in ntfs_inode to the ntfs
- * record size, and index_block_size_bits, to the log(base 2) of the ntfs
- * record size.
- */
-static void ntfs_end_buffer_async_read(struct buffer_head *bh, int uptodate)
-{
-	unsigned long flags;
-	struct buffer_head *first, *tmp;
-	struct page *page;
-	struct inode *vi;
-	ntfs_inode *ni;
-	int page_uptodate = 1;
-
-	page = bh->b_page;
-	vi = page->mapping->host;
-	ni = NTFS_I(vi);
-
-	if (likely(uptodate)) {
-		loff_t i_size;
-		s64 file_ofs, init_size;
-
-		set_buffer_uptodate(bh);
-
-		file_ofs = ((s64)page->index << PAGE_SHIFT) +
-				bh_offset(bh);
-		read_lock_irqsave(&ni->size_lock, flags);
-		init_size = ni->initialized_size;
-		i_size = i_size_read(vi);
-		read_unlock_irqrestore(&ni->size_lock, flags);
-		if (unlikely(init_size > i_size)) {
-			/* Race with shrinking truncate. */
-			init_size = i_size;
-		}
-		/* Check for the current buffer head overflowing. */
-		if (unlikely(file_ofs + bh->b_size > init_size)) {
-			int ofs;
-			void *kaddr;
-
-			ofs = 0;
-			if (file_ofs < init_size)
-				ofs = init_size - file_ofs;
-			kaddr = kmap_atomic(page);
-			memset(kaddr + bh_offset(bh) + ofs, 0,
-					bh->b_size - ofs);
-			flush_dcache_page(page);
-			kunmap_atomic(kaddr);
-		}
-	} else {
-		clear_buffer_uptodate(bh);
-		SetPageError(page);
-		ntfs_error(ni->vol->sb, "Buffer I/O error, logical block "
-				"0x%llx.", (unsigned long long)bh->b_blocknr);
-	}
-	first = page_buffers(page);
-	spin_lock_irqsave(&first->b_uptodate_lock, flags);
-	clear_buffer_async_read(bh);
-	unlock_buffer(bh);
-	tmp = bh;
-	do {
-		if (!buffer_uptodate(tmp))
-			page_uptodate = 0;
-		if (buffer_async_read(tmp)) {
-			if (likely(buffer_locked(tmp)))
-				goto still_busy;
-			/* Async buffers must be locked. */
-			BUG();
-		}
-		tmp = tmp->b_this_page;
-	} while (tmp != bh);
-	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
-	/*
-	 * If none of the buffers had errors then we can set the page uptodate,
-	 * but we first have to perform the post read mst fixups, if the
-	 * attribute is mst protected, i.e. if NInoMstProteced(ni) is true.
-	 * Note we ignore fixup errors as those are detected when
-	 * map_mft_record() is called which gives us per record granularity
-	 * rather than per page granularity.
-	 */
-	if (!NInoMstProtected(ni)) {
-		if (likely(page_uptodate && !PageError(page)))
-			SetPageUptodate(page);
-	} else {
-		u8 *kaddr;
-		unsigned int i, recs;
-		u32 rec_size;
-
-		rec_size = ni->itype.index.block_size;
-		recs = PAGE_SIZE / rec_size;
-		/* Should have been verified before we got here... */
-		BUG_ON(!recs);
-		kaddr = kmap_atomic(page);
-		for (i = 0; i < recs; i++)
-			post_read_mst_fixup((NTFS_RECORD*)(kaddr +
-					i * rec_size), rec_size);
-		kunmap_atomic(kaddr);
-		flush_dcache_page(page);
-		if (likely(page_uptodate && !PageError(page)))
-			SetPageUptodate(page);
-	}
-	unlock_page(page);
-	return;
-still_busy:
-	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
-	return;
-}
-
-/**
- * ntfs_read_block - fill a @folio of an address space with data
- * @folio:	page cache folio to fill with data
- *
- * We read each buffer asynchronously and when all buffers are read in, our io
- * completion handler ntfs_end_buffer_read_async(), if required, automatically
- * applies the mst fixups to the folio before finally marking it uptodate and
- * unlocking it.
- *
- * We only enforce allocated_size limit because i_size is checked for in
- * generic_file_read().
- *
- * Return 0 on success and -errno on error.
- *
- * Contains an adapted version of fs/buffer.c::block_read_full_folio().
- */
-static int ntfs_read_block(struct folio *folio)
-{
-	loff_t i_size;
-	VCN vcn;
-	LCN lcn;
-	s64 init_size;
-	struct inode *vi;
-	ntfs_inode *ni;
-	ntfs_volume *vol;
-	runlist_element *rl;
-	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
-	sector_t iblock, lblock, zblock;
-	unsigned long flags;
-	unsigned int blocksize, vcn_ofs;
-	int i, nr;
-	unsigned char blocksize_bits;
-
-	vi = folio->mapping->host;
-	ni = NTFS_I(vi);
-	vol = ni->vol;
-
-	/* $MFT/$DATA must have its complete runlist in memory at all times. */
-	BUG_ON(!ni->runlist.rl && !ni->mft_no && !NInoAttr(ni));
-
-	blocksize = vol->sb->s_blocksize;
-	blocksize_bits = vol->sb->s_blocksize_bits;
-
-	head = folio_buffers(folio);
-	if (!head)
-		head = create_empty_buffers(folio, blocksize, 0);
-	bh = head;
-
-	/*
-	 * We may be racing with truncate.  To avoid some of the problems we
-	 * now take a snapshot of the various sizes and use those for the whole
-	 * of the function.  In case of an extending truncate it just means we
-	 * may leave some buffers unmapped which are now allocated.  This is
-	 * not a problem since these buffers will just get mapped when a write
-	 * occurs.  In case of a shrinking truncate, we will detect this later
-	 * on due to the runlist being incomplete and if the folio is being
-	 * fully truncated, truncate will throw it away as soon as we unlock
-	 * it so no need to worry what we do with it.
-	 */
-	iblock = (s64)folio->index << (PAGE_SHIFT - blocksize_bits);
-	read_lock_irqsave(&ni->size_lock, flags);
-	lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits;
-	init_size = ni->initialized_size;
-	i_size = i_size_read(vi);
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	if (unlikely(init_size > i_size)) {
-		/* Race with shrinking truncate. */
-		init_size = i_size;
-	}
-	zblock = (init_size + blocksize - 1) >> blocksize_bits;
-
-	/* Loop through all the buffers in the folio. */
-	rl = NULL;
-	nr = i = 0;
-	do {
-		int err = 0;
-
-		if (unlikely(buffer_uptodate(bh)))
-			continue;
-		if (unlikely(buffer_mapped(bh))) {
-			arr[nr++] = bh;
-			continue;
-		}
-		bh->b_bdev = vol->sb->s_bdev;
-		/* Is the block within the allowed limits? */
-		if (iblock < lblock) {
-			bool is_retry = false;
-
-			/* Convert iblock into corresponding vcn and offset. */
-			vcn = (VCN)iblock << blocksize_bits >>
-					vol->cluster_size_bits;
-			vcn_ofs = ((VCN)iblock << blocksize_bits) &
-					vol->cluster_size_mask;
-			if (!rl) {
-lock_retry_remap:
-				down_read(&ni->runlist.lock);
-				rl = ni->runlist.rl;
-			}
-			if (likely(rl != NULL)) {
-				/* Seek to element containing target vcn. */
-				while (rl->length && rl[1].vcn <= vcn)
-					rl++;
-				lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
-			} else
-				lcn = LCN_RL_NOT_MAPPED;
-			/* Successful remap. */
-			if (lcn >= 0) {
-				/* Setup buffer head to correct block. */
-				bh->b_blocknr = ((lcn << vol->cluster_size_bits)
-						+ vcn_ofs) >> blocksize_bits;
-				set_buffer_mapped(bh);
-				/* Only read initialized data blocks. */
-				if (iblock < zblock) {
-					arr[nr++] = bh;
-					continue;
-				}
-				/* Fully non-initialized data block, zero it. */
-				goto handle_zblock;
-			}
-			/* It is a hole, need to zero it. */
-			if (lcn == LCN_HOLE)
-				goto handle_hole;
-			/* If first try and runlist unmapped, map and retry. */
-			if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
-				is_retry = true;
-				/*
-				 * Attempt to map runlist, dropping lock for
-				 * the duration.
-				 */
-				up_read(&ni->runlist.lock);
-				err = ntfs_map_runlist(ni, vcn);
-				if (likely(!err))
-					goto lock_retry_remap;
-				rl = NULL;
-			} else if (!rl)
-				up_read(&ni->runlist.lock);
-			/*
-			 * If buffer is outside the runlist, treat it as a
-			 * hole.  This can happen due to concurrent truncate
-			 * for example.
-			 */
-			if (err == -ENOENT || lcn == LCN_ENOENT) {
-				err = 0;
-				goto handle_hole;
-			}
-			/* Hard error, zero out region. */
-			if (!err)
-				err = -EIO;
-			bh->b_blocknr = -1;
-			folio_set_error(folio);
-			ntfs_error(vol->sb, "Failed to read from inode 0x%lx, "
-					"attribute type 0x%x, vcn 0x%llx, "
-					"offset 0x%x because its location on "
-					"disk could not be determined%s "
-					"(error code %i).", ni->mft_no,
-					ni->type, (unsigned long long)vcn,
-					vcn_ofs, is_retry ? " even after "
-					"retrying" : "", err);
-		}
-		/*
-		 * Either iblock was outside lblock limits or
-		 * ntfs_rl_vcn_to_lcn() returned error.  Just zero that portion
-		 * of the folio and set the buffer uptodate.
-		 */
-handle_hole:
-		bh->b_blocknr = -1UL;
-		clear_buffer_mapped(bh);
-handle_zblock:
-		folio_zero_range(folio, i * blocksize, blocksize);
-		if (likely(!err))
-			set_buffer_uptodate(bh);
-	} while (i++, iblock++, (bh = bh->b_this_page) != head);
-
-	/* Release the lock if we took it. */
-	if (rl)
-		up_read(&ni->runlist.lock);
-
-	/* Check we have at least one buffer ready for i/o. */
-	if (nr) {
-		struct buffer_head *tbh;
-
-		/* Lock the buffers. */
-		for (i = 0; i < nr; i++) {
-			tbh = arr[i];
-			lock_buffer(tbh);
-			tbh->b_end_io = ntfs_end_buffer_async_read;
-			set_buffer_async_read(tbh);
-		}
-		/* Finally, start i/o on the buffers. */
-		for (i = 0; i < nr; i++) {
-			tbh = arr[i];
-			if (likely(!buffer_uptodate(tbh)))
-				submit_bh(REQ_OP_READ, tbh);
-			else
-				ntfs_end_buffer_async_read(tbh, 1);
-		}
-		return 0;
-	}
-	/* No i/o was scheduled on any of the buffers. */
-	if (likely(!folio_test_error(folio)))
-		folio_mark_uptodate(folio);
-	else /* Signal synchronous i/o error. */
-		nr = -EIO;
-	folio_unlock(folio);
-	return nr;
-}
-
-/**
- * ntfs_read_folio - fill a @folio of a @file with data from the device
- * @file:	open file to which the folio @folio belongs or NULL
- * @folio:	page cache folio to fill with data
- *
- * For non-resident attributes, ntfs_read_folio() fills the @folio of the open
- * file @file by calling the ntfs version of the generic block_read_full_folio()
- * function, ntfs_read_block(), which in turn creates and reads in the buffers
- * associated with the folio asynchronously.
- *
- * For resident attributes, OTOH, ntfs_read_folio() fills @folio by copying the
- * data from the mft record (which at this stage is most likely in memory) and
- * fills the remainder with zeroes. Thus, in this case, I/O is synchronous, as
- * even if the mft record is not cached at this point in time, we need to wait
- * for it to be read in before we can do the copy.
- *
- * Return 0 on success and -errno on error.
- */
-static int ntfs_read_folio(struct file *file, struct folio *folio)
-{
-	struct page *page = &folio->page;
-	loff_t i_size;
-	struct inode *vi;
-	ntfs_inode *ni, *base_ni;
-	u8 *addr;
-	ntfs_attr_search_ctx *ctx;
-	MFT_RECORD *mrec;
-	unsigned long flags;
-	u32 attr_len;
-	int err = 0;
-
-retry_readpage:
-	BUG_ON(!PageLocked(page));
-	vi = page->mapping->host;
-	i_size = i_size_read(vi);
-	/* Is the page fully outside i_size? (truncate in progress) */
-	if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >>
-			PAGE_SHIFT)) {
-		zero_user(page, 0, PAGE_SIZE);
-		ntfs_debug("Read outside i_size - truncated?");
-		goto done;
-	}
-	/*
-	 * This can potentially happen because we clear PageUptodate() during
-	 * ntfs_writepage() of MstProtected() attributes.
-	 */
-	if (PageUptodate(page)) {
-		unlock_page(page);
-		return 0;
-	}
-	ni = NTFS_I(vi);
-	/*
-	 * Only $DATA attributes can be encrypted and only unnamed $DATA
-	 * attributes can be compressed.  Index root can have the flags set but
-	 * this means to create compressed/encrypted files, not that the
-	 * attribute is compressed/encrypted.  Note we need to check for
-	 * AT_INDEX_ALLOCATION since this is the type of both directory and
-	 * index inodes.
-	 */
-	if (ni->type != AT_INDEX_ALLOCATION) {
-		/* If attribute is encrypted, deny access, just like NT4. */
-		if (NInoEncrypted(ni)) {
-			BUG_ON(ni->type != AT_DATA);
-			err = -EACCES;
-			goto err_out;
-		}
-		/* Compressed data streams are handled in compress.c. */
-		if (NInoNonResident(ni) && NInoCompressed(ni)) {
-			BUG_ON(ni->type != AT_DATA);
-			BUG_ON(ni->name_len);
-			return ntfs_read_compressed_block(page);
-		}
-	}
-	/* NInoNonResident() == NInoIndexAllocPresent() */
-	if (NInoNonResident(ni)) {
-		/* Normal, non-resident data stream. */
-		return ntfs_read_block(folio);
-	}
-	/*
-	 * Attribute is resident, implying it is not compressed or encrypted.
-	 * This also means the attribute is smaller than an mft record and
-	 * hence smaller than a page, so can simply zero out any pages with
-	 * index above 0.  Note the attribute can actually be marked compressed
-	 * but if it is resident the actual data is not compressed so we are
-	 * ok to ignore the compressed flag here.
-	 */
-	if (unlikely(page->index > 0)) {
-		zero_user(page, 0, PAGE_SIZE);
-		goto done;
-	}
-	if (!NInoAttr(ni))
-		base_ni = ni;
-	else
-		base_ni = ni->ext.base_ntfs_ino;
-	/* Map, pin, and lock the mft record. */
-	mrec = map_mft_record(base_ni);
-	if (IS_ERR(mrec)) {
-		err = PTR_ERR(mrec);
-		goto err_out;
-	}
-	/*
-	 * If a parallel write made the attribute non-resident, drop the mft
-	 * record and retry the read_folio.
-	 */
-	if (unlikely(NInoNonResident(ni))) {
-		unmap_mft_record(base_ni);
-		goto retry_readpage;
-	}
-	ctx = ntfs_attr_get_search_ctx(base_ni, mrec);
-	if (unlikely(!ctx)) {
-		err = -ENOMEM;
-		goto unm_err_out;
-	}
-	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err))
-		goto put_unm_err_out;
-	attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
-	read_lock_irqsave(&ni->size_lock, flags);
-	if (unlikely(attr_len > ni->initialized_size))
-		attr_len = ni->initialized_size;
-	i_size = i_size_read(vi);
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	if (unlikely(attr_len > i_size)) {
-		/* Race with shrinking truncate. */
-		attr_len = i_size;
-	}
-	addr = kmap_atomic(page);
-	/* Copy the data to the page. */
-	memcpy(addr, (u8*)ctx->attr +
-			le16_to_cpu(ctx->attr->data.resident.value_offset),
-			attr_len);
-	/* Zero the remainder of the page. */
-	memset(addr + attr_len, 0, PAGE_SIZE - attr_len);
-	flush_dcache_page(page);
-	kunmap_atomic(addr);
-put_unm_err_out:
-	ntfs_attr_put_search_ctx(ctx);
-unm_err_out:
-	unmap_mft_record(base_ni);
-done:
-	SetPageUptodate(page);
-err_out:
-	unlock_page(page);
-	return err;
-}
-
-#ifdef NTFS_RW
-
-/**
- * ntfs_write_block - write a @folio to the backing store
- * @folio:	page cache folio to write out
- * @wbc:	writeback control structure
- *
- * This function is for writing folios belonging to non-resident, non-mst
- * protected attributes to their backing store.
- *
- * For a folio with buffers, map and write the dirty buffers asynchronously
- * under folio writeback. For a folio without buffers, create buffers for the
- * folio, then proceed as above.
- *
- * If a folio doesn't have buffers the folio dirty state is definitive. If
- * a folio does have buffers, the folio dirty state is just a hint,
- * and the buffer dirty state is definitive. (A hint which has rules:
- * dirty buffers against a clean folio is illegal. Other combinations are
- * legal and need to be handled. In particular a dirty folio containing
- * clean buffers for example.)
- *
- * Return 0 on success and -errno on error.
- *
- * Based on ntfs_read_block() and __block_write_full_folio().
- */
-static int ntfs_write_block(struct folio *folio, struct writeback_control *wbc)
-{
-	VCN vcn;
-	LCN lcn;
-	s64 initialized_size;
-	loff_t i_size;
-	sector_t block, dblock, iblock;
-	struct inode *vi;
-	ntfs_inode *ni;
-	ntfs_volume *vol;
-	runlist_element *rl;
-	struct buffer_head *bh, *head;
-	unsigned long flags;
-	unsigned int blocksize, vcn_ofs;
-	int err;
-	bool need_end_writeback;
-	unsigned char blocksize_bits;
-
-	vi = folio->mapping->host;
-	ni = NTFS_I(vi);
-	vol = ni->vol;
-
-	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
-			"0x%lx.", ni->mft_no, ni->type, folio->index);
-
-	BUG_ON(!NInoNonResident(ni));
-	BUG_ON(NInoMstProtected(ni));
-	blocksize = vol->sb->s_blocksize;
-	blocksize_bits = vol->sb->s_blocksize_bits;
-	head = folio_buffers(folio);
-	if (!head) {
-		BUG_ON(!folio_test_uptodate(folio));
-		head = create_empty_buffers(folio, blocksize,
-				(1 << BH_Uptodate) | (1 << BH_Dirty));
-	}
-	bh = head;
-
-	/* NOTE: Different naming scheme to ntfs_read_block()! */
-
-	/* The first block in the folio. */
-	block = (s64)folio->index << (PAGE_SHIFT - blocksize_bits);
-
-	read_lock_irqsave(&ni->size_lock, flags);
-	i_size = i_size_read(vi);
-	initialized_size = ni->initialized_size;
-	read_unlock_irqrestore(&ni->size_lock, flags);
-
-	/* The first out of bounds block for the data size. */
-	dblock = (i_size + blocksize - 1) >> blocksize_bits;
-
-	/* The last (fully or partially) initialized block. */
-	iblock = initialized_size >> blocksize_bits;
-
-	/*
-	 * Be very careful.  We have no exclusion from block_dirty_folio
-	 * here, and the (potentially unmapped) buffers may become dirty at
-	 * any time.  If a buffer becomes dirty here after we've inspected it
-	 * then we just miss that fact, and the folio stays dirty.
-	 *
-	 * Buffers outside i_size may be dirtied by block_dirty_folio;
-	 * handle that here by just cleaning them.
-	 */
-
-	/*
-	 * Loop through all the buffers in the folio, mapping all the dirty
-	 * buffers to disk addresses and handling any aliases from the
-	 * underlying block device's mapping.
-	 */
-	rl = NULL;
-	err = 0;
-	do {
-		bool is_retry = false;
-
-		if (unlikely(block >= dblock)) {
-			/*
-			 * Mapped buffers outside i_size will occur, because
-			 * this folio can be outside i_size when there is a
-			 * truncate in progress. The contents of such buffers
-			 * were zeroed by ntfs_writepage().
-			 *
-			 * FIXME: What about the small race window where
-			 * ntfs_writepage() has not done any clearing because
-			 * the folio was within i_size but before we get here,
-			 * vmtruncate() modifies i_size?
-			 */
-			clear_buffer_dirty(bh);
-			set_buffer_uptodate(bh);
-			continue;
-		}
-
-		/* Clean buffers are not written out, so no need to map them. */
-		if (!buffer_dirty(bh))
-			continue;
-
-		/* Make sure we have enough initialized size. */
-		if (unlikely((block >= iblock) &&
-				(initialized_size < i_size))) {
-			/*
-			 * If this folio is fully outside initialized
-			 * size, zero out all folios between the current
-			 * initialized size and the current folio. Just
-			 * use ntfs_read_folio() to do the zeroing
-			 * transparently.
-			 */
-			if (block > iblock) {
-				// TODO:
-				// For each folio do:
-				// - read_cache_folio()
-				// Again for each folio do:
-				// - wait_on_folio_locked()
-				// - Check (folio_test_uptodate(folio) &&
-				//		!folio_test_error(folio))
-				// Update initialized size in the attribute and
-				// in the inode.
-				// Again, for each folio do:
-				//	block_dirty_folio();
-				// folio_put()
-				// We don't need to wait on the writes.
-				// Update iblock.
-			}
-			/*
-			 * The current folio straddles initialized size. Zero
-			 * all non-uptodate buffers and set them uptodate (and
-			 * dirty?). Note, there aren't any non-uptodate buffers
-			 * if the folio is uptodate.
-			 * FIXME: For an uptodate folio, the buffers may need to
-			 * be written out because they were not initialized on
-			 * disk before.
-			 */
-			if (!folio_test_uptodate(folio)) {
-				// TODO:
-				// Zero any non-uptodate buffers up to i_size.
-				// Set them uptodate and dirty.
-			}
-			// TODO:
-			// Update initialized size in the attribute and in the
-			// inode (up to i_size).
-			// Update iblock.
-			// FIXME: This is inefficient. Try to batch the two
-			// size changes to happen in one go.
-			ntfs_error(vol->sb, "Writing beyond initialized size "
-					"is not supported yet. Sorry.");
-			err = -EOPNOTSUPP;
-			break;
-			// Do NOT set_buffer_new() BUT DO clear buffer range
-			// outside write request range.
-			// set_buffer_uptodate() on complete buffers as well as
-			// set_buffer_dirty().
-		}
-
-		/* No need to map buffers that are already mapped. */
-		if (buffer_mapped(bh))
-			continue;
-
-		/* Unmapped, dirty buffer. Need to map it. */
-		bh->b_bdev = vol->sb->s_bdev;
-
-		/* Convert block into corresponding vcn and offset. */
-		vcn = (VCN)block << blocksize_bits;
-		vcn_ofs = vcn & vol->cluster_size_mask;
-		vcn >>= vol->cluster_size_bits;
-		if (!rl) {
-lock_retry_remap:
-			down_read(&ni->runlist.lock);
-			rl = ni->runlist.rl;
-		}
-		if (likely(rl != NULL)) {
-			/* Seek to element containing target vcn. */
-			while (rl->length && rl[1].vcn <= vcn)
-				rl++;
-			lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
-		} else
-			lcn = LCN_RL_NOT_MAPPED;
-		/* Successful remap. */
-		if (lcn >= 0) {
-			/* Setup buffer head to point to correct block. */
-			bh->b_blocknr = ((lcn << vol->cluster_size_bits) +
-					vcn_ofs) >> blocksize_bits;
-			set_buffer_mapped(bh);
-			continue;
-		}
-		/* It is a hole, need to instantiate it. */
-		if (lcn == LCN_HOLE) {
-			u8 *kaddr;
-			unsigned long *bpos, *bend;
-
-			/* Check if the buffer is zero. */
-			kaddr = kmap_local_folio(folio, bh_offset(bh));
-			bpos = (unsigned long *)kaddr;
-			bend = (unsigned long *)(kaddr + blocksize);
-			do {
-				if (unlikely(*bpos))
-					break;
-			} while (likely(++bpos < bend));
-			kunmap_local(kaddr);
-			if (bpos == bend) {
-				/*
-				 * Buffer is zero and sparse, no need to write
-				 * it.
-				 */
-				bh->b_blocknr = -1;
-				clear_buffer_dirty(bh);
-				continue;
-			}
-			// TODO: Instantiate the hole.
-			// clear_buffer_new(bh);
-			// clean_bdev_bh_alias(bh);
-			ntfs_error(vol->sb, "Writing into sparse regions is "
-					"not supported yet. Sorry.");
-			err = -EOPNOTSUPP;
-			break;
-		}
-		/* If first try and runlist unmapped, map and retry. */
-		if (!is_retry && lcn == LCN_RL_NOT_MAPPED) {
-			is_retry = true;
-			/*
-			 * Attempt to map runlist, dropping lock for
-			 * the duration.
-			 */
-			up_read(&ni->runlist.lock);
-			err = ntfs_map_runlist(ni, vcn);
-			if (likely(!err))
-				goto lock_retry_remap;
-			rl = NULL;
-		} else if (!rl)
-			up_read(&ni->runlist.lock);
-		/*
-		 * If buffer is outside the runlist, truncate has cut it out
-		 * of the runlist.  Just clean and clear the buffer and set it
-		 * uptodate so it can get discarded by the VM.
-		 */
-		if (err == -ENOENT || lcn == LCN_ENOENT) {
-			bh->b_blocknr = -1;
-			clear_buffer_dirty(bh);
-			folio_zero_range(folio, bh_offset(bh), blocksize);
-			set_buffer_uptodate(bh);
-			err = 0;
-			continue;
-		}
-		/* Failed to map the buffer, even after retrying. */
-		if (!err)
-			err = -EIO;
-		bh->b_blocknr = -1;
-		ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
-				"attribute type 0x%x, vcn 0x%llx, offset 0x%x "
-				"because its location on disk could not be "
-				"determined%s (error code %i).", ni->mft_no,
-				ni->type, (unsigned long long)vcn,
-				vcn_ofs, is_retry ? " even after "
-				"retrying" : "", err);
-		break;
-	} while (block++, (bh = bh->b_this_page) != head);
-
-	/* Release the lock if we took it. */
-	if (rl)
-		up_read(&ni->runlist.lock);
-
-	/* For the error case, need to reset bh to the beginning. */
-	bh = head;
-
-	/* Just an optimization, so ->read_folio() is not called later. */
-	if (unlikely(!folio_test_uptodate(folio))) {
-		int uptodate = 1;
-		do {
-			if (!buffer_uptodate(bh)) {
-				uptodate = 0;
-				bh = head;
-				break;
-			}
-		} while ((bh = bh->b_this_page) != head);
-		if (uptodate)
-			folio_mark_uptodate(folio);
-	}
-
-	/* Setup all mapped, dirty buffers for async write i/o. */
-	do {
-		if (buffer_mapped(bh) && buffer_dirty(bh)) {
-			lock_buffer(bh);
-			if (test_clear_buffer_dirty(bh)) {
-				BUG_ON(!buffer_uptodate(bh));
-				mark_buffer_async_write(bh);
-			} else
-				unlock_buffer(bh);
-		} else if (unlikely(err)) {
-			/*
-			 * For the error case. The buffer may have been set
-			 * dirty during attachment to a dirty folio.
-			 */
-			if (err != -ENOMEM)
-				clear_buffer_dirty(bh);
-		}
-	} while ((bh = bh->b_this_page) != head);
-
-	if (unlikely(err)) {
-		// TODO: Remove the -EOPNOTSUPP check later on...
-		if (unlikely(err == -EOPNOTSUPP))
-			err = 0;
-		else if (err == -ENOMEM) {
-			ntfs_warning(vol->sb, "Error allocating memory. "
-					"Redirtying folio so we try again "
-					"later.");
-			/*
-			 * Put the folio back on mapping->dirty_pages, but
-			 * leave its buffer's dirty state as-is.
-			 */
-			folio_redirty_for_writepage(wbc, folio);
-			err = 0;
-		} else
-			folio_set_error(folio);
-	}
-
-	BUG_ON(folio_test_writeback(folio));
-	folio_start_writeback(folio);	/* Keeps try_to_free_buffers() away. */
-
-	/* Submit the prepared buffers for i/o. */
-	need_end_writeback = true;
-	do {
-		struct buffer_head *next = bh->b_this_page;
-		if (buffer_async_write(bh)) {
-			submit_bh(REQ_OP_WRITE, bh);
-			need_end_writeback = false;
-		}
-		bh = next;
-	} while (bh != head);
-	folio_unlock(folio);
-
-	/* If no i/o was started, need to end writeback here. */
-	if (unlikely(need_end_writeback))
-		folio_end_writeback(folio);
-
-	ntfs_debug("Done.");
-	return err;
-}
-
-/**
- * ntfs_write_mst_block - write a @page to the backing store
- * @page:	page cache page to write out
- * @wbc:	writeback control structure
- *
- * This function is for writing pages belonging to non-resident, mst protected
- * attributes to their backing store.  The only supported attributes are index
- * allocation and $MFT/$DATA.  Both directory inodes and index inodes are
- * supported for the index allocation case.
- *
- * The page must remain locked for the duration of the write because we apply
- * the mst fixups, write, and then undo the fixups, so if we were to unlock the
- * page before undoing the fixups, any other user of the page will see the
- * page contents as corrupt.
- *
- * We clear the page uptodate flag for the duration of the function to ensure
- * exclusion for the $MFT/$DATA case against someone mapping an mft record we
- * are about to apply the mst fixups to.
- *
- * Return 0 on success and -errno on error.
- *
- * Based on ntfs_write_block(), ntfs_mft_writepage(), and
- * write_mft_record_nolock().
- */
-static int ntfs_write_mst_block(struct page *page,
-		struct writeback_control *wbc)
-{
-	sector_t block, dblock, rec_block;
-	struct inode *vi = page->mapping->host;
-	ntfs_inode *ni = NTFS_I(vi);
-	ntfs_volume *vol = ni->vol;
-	u8 *kaddr;
-	unsigned int rec_size = ni->itype.index.block_size;
-	ntfs_inode *locked_nis[PAGE_SIZE / NTFS_BLOCK_SIZE];
-	struct buffer_head *bh, *head, *tbh, *rec_start_bh;
-	struct buffer_head *bhs[MAX_BUF_PER_PAGE];
-	runlist_element *rl;
-	int i, nr_locked_nis, nr_recs, nr_bhs, max_bhs, bhs_per_rec, err, err2;
-	unsigned bh_size, rec_size_bits;
-	bool sync, is_mft, page_is_dirty, rec_is_dirty;
-	unsigned char bh_size_bits;
-
-	if (WARN_ON(rec_size < NTFS_BLOCK_SIZE))
-		return -EINVAL;
-
-	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index "
-			"0x%lx.", vi->i_ino, ni->type, page->index);
-	BUG_ON(!NInoNonResident(ni));
-	BUG_ON(!NInoMstProtected(ni));
-	is_mft = (S_ISREG(vi->i_mode) && !vi->i_ino);
-	/*
-	 * NOTE: ntfs_write_mst_block() would be called for $MFTMirr if a page
-	 * in its page cache were to be marked dirty.  However this should
-	 * never happen with the current driver and considering we do not
-	 * handle this case here we do want to BUG(), at least for now.
-	 */
-	BUG_ON(!(is_mft || S_ISDIR(vi->i_mode) ||
-			(NInoAttr(ni) && ni->type == AT_INDEX_ALLOCATION)));
-	bh_size = vol->sb->s_blocksize;
-	bh_size_bits = vol->sb->s_blocksize_bits;
-	max_bhs = PAGE_SIZE / bh_size;
-	BUG_ON(!max_bhs);
-	BUG_ON(max_bhs > MAX_BUF_PER_PAGE);
-
-	/* Were we called for sync purposes? */
-	sync = (wbc->sync_mode == WB_SYNC_ALL);
-
-	/* Make sure we have mapped buffers. */
-	bh = head = page_buffers(page);
-	BUG_ON(!bh);
-
-	rec_size_bits = ni->itype.index.block_size_bits;
-	BUG_ON(!(PAGE_SIZE >> rec_size_bits));
-	bhs_per_rec = rec_size >> bh_size_bits;
-	BUG_ON(!bhs_per_rec);
-
-	/* The first block in the page. */
-	rec_block = block = (sector_t)page->index <<
-			(PAGE_SHIFT - bh_size_bits);
-
-	/* The first out of bounds block for the data size. */
-	dblock = (i_size_read(vi) + bh_size - 1) >> bh_size_bits;
-
-	rl = NULL;
-	err = err2 = nr_bhs = nr_recs = nr_locked_nis = 0;
-	page_is_dirty = rec_is_dirty = false;
-	rec_start_bh = NULL;
-	do {
-		bool is_retry = false;
-
-		if (likely(block < rec_block)) {
-			if (unlikely(block >= dblock)) {
-				clear_buffer_dirty(bh);
-				set_buffer_uptodate(bh);
-				continue;
-			}
-			/*
-			 * This block is not the first one in the record.  We
-			 * ignore the buffer's dirty state because we could
-			 * have raced with a parallel mark_ntfs_record_dirty().
-			 */
-			if (!rec_is_dirty)
-				continue;
-			if (unlikely(err2)) {
-				if (err2 != -ENOMEM)
-					clear_buffer_dirty(bh);
-				continue;
-			}
-		} else /* if (block == rec_block) */ {
-			BUG_ON(block > rec_block);
-			/* This block is the first one in the record. */
-			rec_block += bhs_per_rec;
-			err2 = 0;
-			if (unlikely(block >= dblock)) {
-				clear_buffer_dirty(bh);
-				continue;
-			}
-			if (!buffer_dirty(bh)) {
-				/* Clean records are not written out. */
-				rec_is_dirty = false;
-				continue;
-			}
-			rec_is_dirty = true;
-			rec_start_bh = bh;
-		}
-		/* Need to map the buffer if it is not mapped already. */
-		if (unlikely(!buffer_mapped(bh))) {
-			VCN vcn;
-			LCN lcn;
-			unsigned int vcn_ofs;
-
-			bh->b_bdev = vol->sb->s_bdev;
-			/* Obtain the vcn and offset of the current block. */
-			vcn = (VCN)block << bh_size_bits;
-			vcn_ofs = vcn & vol->cluster_size_mask;
-			vcn >>= vol->cluster_size_bits;
-			if (!rl) {
-lock_retry_remap:
-				down_read(&ni->runlist.lock);
-				rl = ni->runlist.rl;
-			}
-			if (likely(rl != NULL)) {
-				/* Seek to element containing target vcn. */
-				while (rl->length && rl[1].vcn <= vcn)
-					rl++;
-				lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
-			} else
-				lcn = LCN_RL_NOT_MAPPED;
-			/* Successful remap. */
-			if (likely(lcn >= 0)) {
-				/* Setup buffer head to correct block. */
-				bh->b_blocknr = ((lcn <<
-						vol->cluster_size_bits) +
-						vcn_ofs) >> bh_size_bits;
-				set_buffer_mapped(bh);
-			} else {
-				/*
-				 * Remap failed.  Retry to map the runlist once
-				 * unless we are working on $MFT which always
-				 * has the whole of its runlist in memory.
-				 */
-				if (!is_mft && !is_retry &&
-						lcn == LCN_RL_NOT_MAPPED) {
-					is_retry = true;
-					/*
-					 * Attempt to map runlist, dropping
-					 * lock for the duration.
-					 */
-					up_read(&ni->runlist.lock);
-					err2 = ntfs_map_runlist(ni, vcn);
-					if (likely(!err2))
-						goto lock_retry_remap;
-					if (err2 == -ENOMEM)
-						page_is_dirty = true;
-					lcn = err2;
-				} else {
-					err2 = -EIO;
-					if (!rl)
-						up_read(&ni->runlist.lock);
-				}
-				/* Hard error.  Abort writing this record. */
-				if (!err || err == -ENOMEM)
-					err = err2;
-				bh->b_blocknr = -1;
-				ntfs_error(vol->sb, "Cannot write ntfs record "
-						"0x%llx (inode 0x%lx, "
-						"attribute type 0x%x) because "
-						"its location on disk could "
-						"not be determined (error "
-						"code %lli).",
-						(long long)block <<
-						bh_size_bits >>
-						vol->mft_record_size_bits,
-						ni->mft_no, ni->type,
-						(long long)lcn);
-				/*
-				 * If this is not the first buffer, remove the
-				 * buffers in this record from the list of
-				 * buffers to write and clear their dirty bit
-				 * if not error -ENOMEM.
-				 */
-				if (rec_start_bh != bh) {
-					while (bhs[--nr_bhs] != rec_start_bh)
-						;
-					if (err2 != -ENOMEM) {
-						do {
-							clear_buffer_dirty(
-								rec_start_bh);
-						} while ((rec_start_bh =
-								rec_start_bh->
-								b_this_page) !=
-								bh);
-					}
-				}
-				continue;
-			}
-		}
-		BUG_ON(!buffer_uptodate(bh));
-		BUG_ON(nr_bhs >= max_bhs);
-		bhs[nr_bhs++] = bh;
-	} while (block++, (bh = bh->b_this_page) != head);
-	if (unlikely(rl))
-		up_read(&ni->runlist.lock);
-	/* If there were no dirty buffers, we are done. */
-	if (!nr_bhs)
-		goto done;
-	/* Map the page so we can access its contents. */
-	kaddr = kmap(page);
-	/* Clear the page uptodate flag whilst the mst fixups are applied. */
-	BUG_ON(!PageUptodate(page));
-	ClearPageUptodate(page);
-	for (i = 0; i < nr_bhs; i++) {
-		unsigned int ofs;
-
-		/* Skip buffers which are not at the beginning of records. */
-		if (i % bhs_per_rec)
-			continue;
-		tbh = bhs[i];
-		ofs = bh_offset(tbh);
-		if (is_mft) {
-			ntfs_inode *tni;
-			unsigned long mft_no;
-
-			/* Get the mft record number. */
-			mft_no = (((s64)page->index << PAGE_SHIFT) + ofs)
-					>> rec_size_bits;
-			/* Check whether to write this mft record. */
-			tni = NULL;
-			if (!ntfs_may_write_mft_record(vol, mft_no,
-					(MFT_RECORD*)(kaddr + ofs), &tni)) {
-				/*
-				 * The record should not be written.  This
-				 * means we need to redirty the page before
-				 * returning.
-				 */
-				page_is_dirty = true;
-				/*
-				 * Remove the buffers in this mft record from
-				 * the list of buffers to write.
-				 */
-				do {
-					bhs[i] = NULL;
-				} while (++i % bhs_per_rec);
-				continue;
-			}
-			/*
-			 * The record should be written.  If a locked ntfs
-			 * inode was returned, add it to the array of locked
-			 * ntfs inodes.
-			 */
-			if (tni)
-				locked_nis[nr_locked_nis++] = tni;
-		}
-		/* Apply the mst protection fixups. */
-		err2 = pre_write_mst_fixup((NTFS_RECORD*)(kaddr + ofs),
-				rec_size);
-		if (unlikely(err2)) {
-			if (!err || err == -ENOMEM)
-				err = -EIO;
-			ntfs_error(vol->sb, "Failed to apply mst fixups "
-					"(inode 0x%lx, attribute type 0x%x, "
-					"page index 0x%lx, page offset 0x%x)!"
-					"  Unmount and run chkdsk.", vi->i_ino,
-					ni->type, page->index, ofs);
-			/*
-			 * Mark all the buffers in this record clean as we do
-			 * not want to write corrupt data to disk.
-			 */
-			do {
-				clear_buffer_dirty(bhs[i]);
-				bhs[i] = NULL;
-			} while (++i % bhs_per_rec);
-			continue;
-		}
-		nr_recs++;
-	}
-	/* If no records are to be written out, we are done. */
-	if (!nr_recs)
-		goto unm_done;
-	flush_dcache_page(page);
-	/* Lock buffers and start synchronous write i/o on them. */
-	for (i = 0; i < nr_bhs; i++) {
-		tbh = bhs[i];
-		if (!tbh)
-			continue;
-		if (!trylock_buffer(tbh))
-			BUG();
-		/* The buffer dirty state is now irrelevant, just clean it. */
-		clear_buffer_dirty(tbh);
-		BUG_ON(!buffer_uptodate(tbh));
-		BUG_ON(!buffer_mapped(tbh));
-		get_bh(tbh);
-		tbh->b_end_io = end_buffer_write_sync;
-		submit_bh(REQ_OP_WRITE, tbh);
-	}
-	/* Synchronize the mft mirror now if not @sync. */
-	if (is_mft && !sync)
-		goto do_mirror;
-do_wait:
-	/* Wait on i/o completion of buffers. */
-	for (i = 0; i < nr_bhs; i++) {
-		tbh = bhs[i];
-		if (!tbh)
-			continue;
-		wait_on_buffer(tbh);
-		if (unlikely(!buffer_uptodate(tbh))) {
-			ntfs_error(vol->sb, "I/O error while writing ntfs "
-					"record buffer (inode 0x%lx, "
-					"attribute type 0x%x, page index "
-					"0x%lx, page offset 0x%lx)!  Unmount "
-					"and run chkdsk.", vi->i_ino, ni->type,
-					page->index, bh_offset(tbh));
-			if (!err || err == -ENOMEM)
-				err = -EIO;
-			/*
-			 * Set the buffer uptodate so the page and buffer
-			 * states do not become out of sync.
-			 */
-			set_buffer_uptodate(tbh);
-		}
-	}
-	/* If @sync, now synchronize the mft mirror. */
-	if (is_mft && sync) {
-do_mirror:
-		for (i = 0; i < nr_bhs; i++) {
-			unsigned long mft_no;
-			unsigned int ofs;
-
-			/*
-			 * Skip buffers which are not at the beginning of
-			 * records.
-			 */
-			if (i % bhs_per_rec)
-				continue;
-			tbh = bhs[i];
-			/* Skip removed buffers (and hence records). */
-			if (!tbh)
-				continue;
-			ofs = bh_offset(tbh);
-			/* Get the mft record number. */
-			mft_no = (((s64)page->index << PAGE_SHIFT) + ofs)
-					>> rec_size_bits;
-			if (mft_no < vol->mftmirr_size)
-				ntfs_sync_mft_mirror(vol, mft_no,
-						(MFT_RECORD*)(kaddr + ofs),
-						sync);
-		}
-		if (!sync)
-			goto do_wait;
-	}
-	/* Remove the mst protection fixups again. */
-	for (i = 0; i < nr_bhs; i++) {
-		if (!(i % bhs_per_rec)) {
-			tbh = bhs[i];
-			if (!tbh)
-				continue;
-			post_write_mst_fixup((NTFS_RECORD*)(kaddr +
-					bh_offset(tbh)));
-		}
-	}
-	flush_dcache_page(page);
-unm_done:
-	/* Unlock any locked inodes. */
-	while (nr_locked_nis-- > 0) {
-		ntfs_inode *tni, *base_tni;
-		
-		tni = locked_nis[nr_locked_nis];
-		/* Get the base inode. */
-		mutex_lock(&tni->extent_lock);
-		if (tni->nr_extents >= 0)
-			base_tni = tni;
-		else {
-			base_tni = tni->ext.base_ntfs_ino;
-			BUG_ON(!base_tni);
-		}
-		mutex_unlock(&tni->extent_lock);
-		ntfs_debug("Unlocking %s inode 0x%lx.",
-				tni == base_tni ? "base" : "extent",
-				tni->mft_no);
-		mutex_unlock(&tni->mrec_lock);
-		atomic_dec(&tni->count);
-		iput(VFS_I(base_tni));
-	}
-	SetPageUptodate(page);
-	kunmap(page);
-done:
-	if (unlikely(err && err != -ENOMEM)) {
-		/*
-		 * Set page error if there is only one ntfs record in the page.
-		 * Otherwise we would loose per-record granularity.
-		 */
-		if (ni->itype.index.block_size == PAGE_SIZE)
-			SetPageError(page);
-		NVolSetErrors(vol);
-	}
-	if (page_is_dirty) {
-		ntfs_debug("Page still contains one or more dirty ntfs "
-				"records.  Redirtying the page starting at "
-				"record 0x%lx.", page->index <<
-				(PAGE_SHIFT - rec_size_bits));
-		redirty_page_for_writepage(wbc, page);
-		unlock_page(page);
-	} else {
-		/*
-		 * Keep the VM happy.  This must be done otherwise the
-		 * radix-tree tag PAGECACHE_TAG_DIRTY remains set even though
-		 * the page is clean.
-		 */
-		BUG_ON(PageWriteback(page));
-		set_page_writeback(page);
-		unlock_page(page);
-		end_page_writeback(page);
-	}
-	if (likely(!err))
-		ntfs_debug("Done.");
-	return err;
-}
-
-/**
- * ntfs_writepage - write a @page to the backing store
- * @page:	page cache page to write out
- * @wbc:	writeback control structure
- *
- * This is called from the VM when it wants to have a dirty ntfs page cache
- * page cleaned.  The VM has already locked the page and marked it clean.
- *
- * For non-resident attributes, ntfs_writepage() writes the @page by calling
- * the ntfs version of the generic block_write_full_folio() function,
- * ntfs_write_block(), which in turn if necessary creates and writes the
- * buffers associated with the page asynchronously.
- *
- * For resident attributes, OTOH, ntfs_writepage() writes the @page by copying
- * the data to the mft record (which at this stage is most likely in memory).
- * The mft record is then marked dirty and written out asynchronously via the
- * vfs inode dirty code path for the inode the mft record belongs to or via the
- * vm page dirty code path for the page the mft record is in.
- *
- * Based on ntfs_read_folio() and fs/buffer.c::block_write_full_folio().
- *
- * Return 0 on success and -errno on error.
- */
-static int ntfs_writepage(struct page *page, struct writeback_control *wbc)
-{
-	struct folio *folio = page_folio(page);
-	loff_t i_size;
-	struct inode *vi = folio->mapping->host;
-	ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi);
-	char *addr;
-	ntfs_attr_search_ctx *ctx = NULL;
-	MFT_RECORD *m = NULL;
-	u32 attr_len;
-	int err;
-
-retry_writepage:
-	BUG_ON(!folio_test_locked(folio));
-	i_size = i_size_read(vi);
-	/* Is the folio fully outside i_size? (truncate in progress) */
-	if (unlikely(folio->index >= (i_size + PAGE_SIZE - 1) >>
-			PAGE_SHIFT)) {
-		/*
-		 * The folio may have dirty, unmapped buffers.  Make them
-		 * freeable here, so the page does not leak.
-		 */
-		block_invalidate_folio(folio, 0, folio_size(folio));
-		folio_unlock(folio);
-		ntfs_debug("Write outside i_size - truncated?");
-		return 0;
-	}
-	/*
-	 * Only $DATA attributes can be encrypted and only unnamed $DATA
-	 * attributes can be compressed.  Index root can have the flags set but
-	 * this means to create compressed/encrypted files, not that the
-	 * attribute is compressed/encrypted.  Note we need to check for
-	 * AT_INDEX_ALLOCATION since this is the type of both directory and
-	 * index inodes.
-	 */
-	if (ni->type != AT_INDEX_ALLOCATION) {
-		/* If file is encrypted, deny access, just like NT4. */
-		if (NInoEncrypted(ni)) {
-			folio_unlock(folio);
-			BUG_ON(ni->type != AT_DATA);
-			ntfs_debug("Denying write access to encrypted file.");
-			return -EACCES;
-		}
-		/* Compressed data streams are handled in compress.c. */
-		if (NInoNonResident(ni) && NInoCompressed(ni)) {
-			BUG_ON(ni->type != AT_DATA);
-			BUG_ON(ni->name_len);
-			// TODO: Implement and replace this with
-			// return ntfs_write_compressed_block(page);
-			folio_unlock(folio);
-			ntfs_error(vi->i_sb, "Writing to compressed files is "
-					"not supported yet.  Sorry.");
-			return -EOPNOTSUPP;
-		}
-		// TODO: Implement and remove this check.
-		if (NInoNonResident(ni) && NInoSparse(ni)) {
-			folio_unlock(folio);
-			ntfs_error(vi->i_sb, "Writing to sparse files is not "
-					"supported yet.  Sorry.");
-			return -EOPNOTSUPP;
-		}
-	}
-	/* NInoNonResident() == NInoIndexAllocPresent() */
-	if (NInoNonResident(ni)) {
-		/* We have to zero every time due to mmap-at-end-of-file. */
-		if (folio->index >= (i_size >> PAGE_SHIFT)) {
-			/* The folio straddles i_size. */
-			unsigned int ofs = i_size & (folio_size(folio) - 1);
-			folio_zero_segment(folio, ofs, folio_size(folio));
-		}
-		/* Handle mst protected attributes. */
-		if (NInoMstProtected(ni))
-			return ntfs_write_mst_block(page, wbc);
-		/* Normal, non-resident data stream. */
-		return ntfs_write_block(folio, wbc);
-	}
-	/*
-	 * Attribute is resident, implying it is not compressed, encrypted, or
-	 * mst protected.  This also means the attribute is smaller than an mft
-	 * record and hence smaller than a folio, so can simply return error on
-	 * any folios with index above 0.  Note the attribute can actually be
-	 * marked compressed but if it is resident the actual data is not
-	 * compressed so we are ok to ignore the compressed flag here.
-	 */
-	BUG_ON(folio_buffers(folio));
-	BUG_ON(!folio_test_uptodate(folio));
-	if (unlikely(folio->index > 0)) {
-		ntfs_error(vi->i_sb, "BUG()! folio->index (0x%lx) > 0.  "
-				"Aborting write.", folio->index);
-		BUG_ON(folio_test_writeback(folio));
-		folio_start_writeback(folio);
-		folio_unlock(folio);
-		folio_end_writeback(folio);
-		return -EIO;
-	}
-	if (!NInoAttr(ni))
-		base_ni = ni;
-	else
-		base_ni = ni->ext.base_ntfs_ino;
-	/* Map, pin, and lock the mft record. */
-	m = map_mft_record(base_ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		m = NULL;
-		ctx = NULL;
-		goto err_out;
-	}
-	/*
-	 * If a parallel write made the attribute non-resident, drop the mft
-	 * record and retry the writepage.
-	 */
-	if (unlikely(NInoNonResident(ni))) {
-		unmap_mft_record(base_ni);
-		goto retry_writepage;
-	}
-	ctx = ntfs_attr_get_search_ctx(base_ni, m);
-	if (unlikely(!ctx)) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err))
-		goto err_out;
-	/*
-	 * Keep the VM happy.  This must be done otherwise
-	 * PAGECACHE_TAG_DIRTY remains set even though the folio is clean.
-	 */
-	BUG_ON(folio_test_writeback(folio));
-	folio_start_writeback(folio);
-	folio_unlock(folio);
-	attr_len = le32_to_cpu(ctx->attr->data.resident.value_length);
-	i_size = i_size_read(vi);
-	if (unlikely(attr_len > i_size)) {
-		/* Race with shrinking truncate or a failed truncate. */
-		attr_len = i_size;
-		/*
-		 * If the truncate failed, fix it up now.  If a concurrent
-		 * truncate, we do its job, so it does not have to do anything.
-		 */
-		err = ntfs_resident_attr_value_resize(ctx->mrec, ctx->attr,
-				attr_len);
-		/* Shrinking cannot fail. */
-		BUG_ON(err);
-	}
-	addr = kmap_local_folio(folio, 0);
-	/* Copy the data from the folio to the mft record. */
-	memcpy((u8*)ctx->attr +
-			le16_to_cpu(ctx->attr->data.resident.value_offset),
-			addr, attr_len);
-	/* Zero out of bounds area in the page cache folio. */
-	memset(addr + attr_len, 0, folio_size(folio) - attr_len);
-	kunmap_local(addr);
-	flush_dcache_folio(folio);
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	/* We are done with the folio. */
-	folio_end_writeback(folio);
-	/* Finally, mark the mft record dirty, so it gets written back. */
-	mark_mft_record_dirty(ctx->ntfs_ino);
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(base_ni);
-	return 0;
-err_out:
-	if (err == -ENOMEM) {
-		ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying "
-				"page so we try again later.");
-		/*
-		 * Put the folio back on mapping->dirty_pages, but leave its
-		 * buffers' dirty state as-is.
-		 */
-		folio_redirty_for_writepage(wbc, folio);
-		err = 0;
-	} else {
-		ntfs_error(vi->i_sb, "Resident attribute write failed with "
-				"error %i.", err);
-		folio_set_error(folio);
-		NVolSetErrors(ni->vol);
-	}
-	folio_unlock(folio);
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(base_ni);
-	return err;
-}
-
-#endif	/* NTFS_RW */
-
-/**
- * ntfs_bmap - map logical file block to physical device block
- * @mapping:	address space mapping to which the block to be mapped belongs
- * @block:	logical block to map to its physical device block
- *
- * For regular, non-resident files (i.e. not compressed and not encrypted), map
- * the logical @block belonging to the file described by the address space
- * mapping @mapping to its physical device block.
- *
- * The size of the block is equal to the @s_blocksize field of the super block
- * of the mounted file system which is guaranteed to be smaller than or equal
- * to the cluster size thus the block is guaranteed to fit entirely inside the
- * cluster which means we do not need to care how many contiguous bytes are
- * available after the beginning of the block.
- *
- * Return the physical device block if the mapping succeeded or 0 if the block
- * is sparse or there was an error.
- *
- * Note: This is a problem if someone tries to run bmap() on $Boot system file
- * as that really is in block zero but there is nothing we can do.  bmap() is
- * just broken in that respect (just like it cannot distinguish sparse from
- * not available or error).
- */
-static sector_t ntfs_bmap(struct address_space *mapping, sector_t block)
-{
-	s64 ofs, size;
-	loff_t i_size;
-	LCN lcn;
-	unsigned long blocksize, flags;
-	ntfs_inode *ni = NTFS_I(mapping->host);
-	ntfs_volume *vol = ni->vol;
-	unsigned delta;
-	unsigned char blocksize_bits, cluster_size_shift;
-
-	ntfs_debug("Entering for mft_no 0x%lx, logical block 0x%llx.",
-			ni->mft_no, (unsigned long long)block);
-	if (ni->type != AT_DATA || !NInoNonResident(ni) || NInoEncrypted(ni)) {
-		ntfs_error(vol->sb, "BMAP does not make sense for %s "
-				"attributes, returning 0.",
-				(ni->type != AT_DATA) ? "non-data" :
-				(!NInoNonResident(ni) ? "resident" :
-				"encrypted"));
-		return 0;
-	}
-	/* None of these can happen. */
-	BUG_ON(NInoCompressed(ni));
-	BUG_ON(NInoMstProtected(ni));
-	blocksize = vol->sb->s_blocksize;
-	blocksize_bits = vol->sb->s_blocksize_bits;
-	ofs = (s64)block << blocksize_bits;
-	read_lock_irqsave(&ni->size_lock, flags);
-	size = ni->initialized_size;
-	i_size = i_size_read(VFS_I(ni));
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	/*
-	 * If the offset is outside the initialized size or the block straddles
-	 * the initialized size then pretend it is a hole unless the
-	 * initialized size equals the file size.
-	 */
-	if (unlikely(ofs >= size || (ofs + blocksize > size && size < i_size)))
-		goto hole;
-	cluster_size_shift = vol->cluster_size_bits;
-	down_read(&ni->runlist.lock);
-	lcn = ntfs_attr_vcn_to_lcn_nolock(ni, ofs >> cluster_size_shift, false);
-	up_read(&ni->runlist.lock);
-	if (unlikely(lcn < LCN_HOLE)) {
-		/*
-		 * Step down to an integer to avoid gcc doing a long long
-		 * comparision in the switch when we know @lcn is between
-		 * LCN_HOLE and LCN_EIO (i.e. -1 to -5).
-		 *
-		 * Otherwise older gcc (at least on some architectures) will
-		 * try to use __cmpdi2() which is of course not available in
-		 * the kernel.
-		 */
-		switch ((int)lcn) {
-		case LCN_ENOENT:
-			/*
-			 * If the offset is out of bounds then pretend it is a
-			 * hole.
-			 */
-			goto hole;
-		case LCN_ENOMEM:
-			ntfs_error(vol->sb, "Not enough memory to complete "
-					"mapping for inode 0x%lx.  "
-					"Returning 0.", ni->mft_no);
-			break;
-		default:
-			ntfs_error(vol->sb, "Failed to complete mapping for "
-					"inode 0x%lx.  Run chkdsk.  "
-					"Returning 0.", ni->mft_no);
-			break;
-		}
-		return 0;
-	}
-	if (lcn < 0) {
-		/* It is a hole. */
-hole:
-		ntfs_debug("Done (returning hole).");
-		return 0;
-	}
-	/*
-	 * The block is really allocated and fullfils all our criteria.
-	 * Convert the cluster to units of block size and return the result.
-	 */
-	delta = ofs & vol->cluster_size_mask;
-	if (unlikely(sizeof(block) < sizeof(lcn))) {
-		block = lcn = ((lcn << cluster_size_shift) + delta) >>
-				blocksize_bits;
-		/* If the block number was truncated return 0. */
-		if (unlikely(block != lcn)) {
-			ntfs_error(vol->sb, "Physical block 0x%llx is too "
-					"large to be returned, returning 0.",
-					(long long)lcn);
-			return 0;
-		}
-	} else
-		block = ((lcn << cluster_size_shift) + delta) >>
-				blocksize_bits;
-	ntfs_debug("Done (returning block 0x%llx).", (unsigned long long)lcn);
-	return block;
-}
-
-/*
- * ntfs_normal_aops - address space operations for normal inodes and attributes
- *
- * Note these are not used for compressed or mst protected inodes and
- * attributes.
- */
-const struct address_space_operations ntfs_normal_aops = {
-	.read_folio	= ntfs_read_folio,
-#ifdef NTFS_RW
-	.writepage	= ntfs_writepage,
-	.dirty_folio	= block_dirty_folio,
-#endif /* NTFS_RW */
-	.bmap		= ntfs_bmap,
-	.migrate_folio	= buffer_migrate_folio,
-	.is_partially_uptodate = block_is_partially_uptodate,
-	.error_remove_folio = generic_error_remove_folio,
-};
-
-/*
- * ntfs_compressed_aops - address space operations for compressed inodes
- */
-const struct address_space_operations ntfs_compressed_aops = {
-	.read_folio	= ntfs_read_folio,
-#ifdef NTFS_RW
-	.writepage	= ntfs_writepage,
-	.dirty_folio	= block_dirty_folio,
-#endif /* NTFS_RW */
-	.migrate_folio	= buffer_migrate_folio,
-	.is_partially_uptodate = block_is_partially_uptodate,
-	.error_remove_folio = generic_error_remove_folio,
-};
-
-/*
- * ntfs_mst_aops - general address space operations for mst protecteed inodes
- *			  and attributes
- */
-const struct address_space_operations ntfs_mst_aops = {
-	.read_folio	= ntfs_read_folio,	/* Fill page with data. */
-#ifdef NTFS_RW
-	.writepage	= ntfs_writepage,	/* Write dirty page to disk. */
-	.dirty_folio	= filemap_dirty_folio,
-#endif /* NTFS_RW */
-	.migrate_folio	= buffer_migrate_folio,
-	.is_partially_uptodate	= block_is_partially_uptodate,
-	.error_remove_folio = generic_error_remove_folio,
-};
-
-#ifdef NTFS_RW
-
-/**
- * mark_ntfs_record_dirty - mark an ntfs record dirty
- * @page:	page containing the ntfs record to mark dirty
- * @ofs:	byte offset within @page at which the ntfs record begins
- *
- * Set the buffers and the page in which the ntfs record is located dirty.
- *
- * The latter also marks the vfs inode the ntfs record belongs to dirty
- * (I_DIRTY_PAGES only).
- *
- * If the page does not have buffers, we create them and set them uptodate.
- * The page may not be locked which is why we need to handle the buffers under
- * the mapping->i_private_lock.  Once the buffers are marked dirty we no longer
- * need the lock since try_to_free_buffers() does not free dirty buffers.
- */
-void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
-	struct address_space *mapping = page->mapping;
-	ntfs_inode *ni = NTFS_I(mapping->host);
-	struct buffer_head *bh, *head, *buffers_to_free = NULL;
-	unsigned int end, bh_size, bh_ofs;
-
-	BUG_ON(!PageUptodate(page));
-	end = ofs + ni->itype.index.block_size;
-	bh_size = VFS_I(ni)->i_sb->s_blocksize;
-	spin_lock(&mapping->i_private_lock);
-	if (unlikely(!page_has_buffers(page))) {
-		spin_unlock(&mapping->i_private_lock);
-		bh = head = alloc_page_buffers(page, bh_size, true);
-		spin_lock(&mapping->i_private_lock);
-		if (likely(!page_has_buffers(page))) {
-			struct buffer_head *tail;
-
-			do {
-				set_buffer_uptodate(bh);
-				tail = bh;
-				bh = bh->b_this_page;
-			} while (bh);
-			tail->b_this_page = head;
-			attach_page_private(page, head);
-		} else
-			buffers_to_free = bh;
-	}
-	bh = head = page_buffers(page);
-	BUG_ON(!bh);
-	do {
-		bh_ofs = bh_offset(bh);
-		if (bh_ofs + bh_size <= ofs)
-			continue;
-		if (unlikely(bh_ofs >= end))
-			break;
-		set_buffer_dirty(bh);
-	} while ((bh = bh->b_this_page) != head);
-	spin_unlock(&mapping->i_private_lock);
-	filemap_dirty_folio(mapping, page_folio(page));
-	if (unlikely(buffers_to_free)) {
-		do {
-			bh = buffers_to_free->b_this_page;
-			free_buffer_head(buffers_to_free);
-			buffers_to_free = bh;
-		} while (buffers_to_free);
-	}
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/aops.h b/fs/ntfs/aops.h
deleted file mode 100644
index 8d0958a149cb..000000000000
--- a/fs/ntfs/aops.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * aops.h - Defines for NTFS kernel address space operations and page cache
- *	    handling.  Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#ifndef _LINUX_NTFS_AOPS_H
-#define _LINUX_NTFS_AOPS_H
-
-#include <linux/mm.h>
-#include <linux/highmem.h>
-#include <linux/pagemap.h>
-#include <linux/fs.h>
-
-#include "inode.h"
-
-/**
- * ntfs_unmap_page - release a page that was mapped using ntfs_map_page()
- * @page:	the page to release
- *
- * Unpin, unmap and release a page that was obtained from ntfs_map_page().
- */
-static inline void ntfs_unmap_page(struct page *page)
-{
-	kunmap(page);
-	put_page(page);
-}
-
-/**
- * ntfs_map_page - map a page into accessible memory, reading it if necessary
- * @mapping:	address space for which to obtain the page
- * @index:	index into the page cache for @mapping of the page to map
- *
- * Read a page from the page cache of the address space @mapping at position
- * @index, where @index is in units of PAGE_SIZE, and not in bytes.
- *
- * If the page is not in memory it is loaded from disk first using the
- * read_folio method defined in the address space operations of @mapping
- * and the page is added to the page cache of @mapping in the process.
- *
- * If the page belongs to an mst protected attribute and it is marked as such
- * in its ntfs inode (NInoMstProtected()) the mst fixups are applied but no
- * error checking is performed.  This means the caller has to verify whether
- * the ntfs record(s) contained in the page are valid or not using one of the
- * ntfs_is_XXXX_record{,p}() macros, where XXXX is the record type you are
- * expecting to see.  (For details of the macros, see fs/ntfs/layout.h.)
- *
- * If the page is in high memory it is mapped into memory directly addressible
- * by the kernel.
- *
- * Finally the page count is incremented, thus pinning the page into place.
- *
- * The above means that page_address(page) can be used on all pages obtained
- * with ntfs_map_page() to get the kernel virtual address of the page.
- *
- * When finished with the page, the caller has to call ntfs_unmap_page() to
- * unpin, unmap and release the page.
- *
- * Note this does not grant exclusive access. If such is desired, the caller
- * must provide it independently of the ntfs_{un}map_page() calls by using
- * a {rw_}semaphore or other means of serialization. A spin lock cannot be
- * used as ntfs_map_page() can block.
- *
- * The unlocked and uptodate page is returned on success or an encoded error
- * on failure. Caller has to test for error using the IS_ERR() macro on the
- * return value. If that evaluates to 'true', the negative error code can be
- * obtained using PTR_ERR() on the return value of ntfs_map_page().
- */
-static inline struct page *ntfs_map_page(struct address_space *mapping,
-		unsigned long index)
-{
-	struct page *page = read_mapping_page(mapping, index, NULL);
-
-	if (!IS_ERR(page))
-		kmap(page);
-	return page;
-}
-
-#ifdef NTFS_RW
-
-extern void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs);
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_AOPS_H */
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
deleted file mode 100644
index f79408f9127a..000000000000
--- a/fs/ntfs/attrib.c
+++ /dev/null
@@ -1,2624 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * attrib.c - NTFS attribute operations.  Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
- * Copyright (c) 2002 Richard Russon
- */
-
-#include <linux/buffer_head.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/swap.h>
-#include <linux/writeback.h>
-
-#include "attrib.h"
-#include "debug.h"
-#include "layout.h"
-#include "lcnalloc.h"
-#include "malloc.h"
-#include "mft.h"
-#include "ntfs.h"
-#include "types.h"
-
-/**
- * ntfs_map_runlist_nolock - map (a part of) a runlist of an ntfs inode
- * @ni:		ntfs inode for which to map (part of) a runlist
- * @vcn:	map runlist part containing this vcn
- * @ctx:	active attribute search context if present or NULL if not
- *
- * Map the part of a runlist containing the @vcn of the ntfs inode @ni.
- *
- * If @ctx is specified, it is an active search context of @ni and its base mft
- * record.  This is needed when ntfs_map_runlist_nolock() encounters unmapped
- * runlist fragments and allows their mapping.  If you do not have the mft
- * record mapped, you can specify @ctx as NULL and ntfs_map_runlist_nolock()
- * will perform the necessary mapping and unmapping.
- *
- * Note, ntfs_map_runlist_nolock() saves the state of @ctx on entry and
- * restores it before returning.  Thus, @ctx will be left pointing to the same
- * attribute on return as on entry.  However, the actual pointers in @ctx may
- * point to different memory locations on return, so you must remember to reset
- * any cached pointers from the @ctx, i.e. after the call to
- * ntfs_map_runlist_nolock(), you will probably want to do:
- *	m = ctx->mrec;
- *	a = ctx->attr;
- * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
- * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
- *
- * Return 0 on success and -errno on error.  There is one special error code
- * which is not an error as such.  This is -ENOENT.  It means that @vcn is out
- * of bounds of the runlist.
- *
- * Note the runlist can be NULL after this function returns if @vcn is zero and
- * the attribute has zero allocated size, i.e. there simply is no runlist.
- *
- * WARNING: If @ctx is supplied, regardless of whether success or failure is
- *	    returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
- *	    is no longer valid, i.e. you need to either call
- *	    ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
- *	    In that case PTR_ERR(@ctx->mrec) will give you the error code for
- *	    why the mapping of the old inode failed.
- *
- * Locking: - The runlist described by @ni must be locked for writing on entry
- *	      and is locked on return.  Note the runlist will be modified.
- *	    - If @ctx is NULL, the base mft record of @ni must not be mapped on
- *	      entry and it will be left unmapped on return.
- *	    - If @ctx is not NULL, the base mft record must be mapped on entry
- *	      and it will be left mapped on return.
- */
-int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn, ntfs_attr_search_ctx *ctx)
-{
-	VCN end_vcn;
-	unsigned long flags;
-	ntfs_inode *base_ni;
-	MFT_RECORD *m;
-	ATTR_RECORD *a;
-	runlist_element *rl;
-	struct page *put_this_page = NULL;
-	int err = 0;
-	bool ctx_is_temporary, ctx_needs_reset;
-	ntfs_attr_search_ctx old_ctx = { NULL, };
-
-	ntfs_debug("Mapping runlist part containing vcn 0x%llx.",
-			(unsigned long long)vcn);
-	if (!NInoAttr(ni))
-		base_ni = ni;
-	else
-		base_ni = ni->ext.base_ntfs_ino;
-	if (!ctx) {
-		ctx_is_temporary = ctx_needs_reset = true;
-		m = map_mft_record(base_ni);
-		if (IS_ERR(m))
-			return PTR_ERR(m);
-		ctx = ntfs_attr_get_search_ctx(base_ni, m);
-		if (unlikely(!ctx)) {
-			err = -ENOMEM;
-			goto err_out;
-		}
-	} else {
-		VCN allocated_size_vcn;
-
-		BUG_ON(IS_ERR(ctx->mrec));
-		a = ctx->attr;
-		BUG_ON(!a->non_resident);
-		ctx_is_temporary = false;
-		end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
-		read_lock_irqsave(&ni->size_lock, flags);
-		allocated_size_vcn = ni->allocated_size >>
-				ni->vol->cluster_size_bits;
-		read_unlock_irqrestore(&ni->size_lock, flags);
-		if (!a->data.non_resident.lowest_vcn && end_vcn <= 0)
-			end_vcn = allocated_size_vcn - 1;
-		/*
-		 * If we already have the attribute extent containing @vcn in
-		 * @ctx, no need to look it up again.  We slightly cheat in
-		 * that if vcn exceeds the allocated size, we will refuse to
-		 * map the runlist below, so there is definitely no need to get
-		 * the right attribute extent.
-		 */
-		if (vcn >= allocated_size_vcn || (a->type == ni->type &&
-				a->name_length == ni->name_len &&
-				!memcmp((u8*)a + le16_to_cpu(a->name_offset),
-				ni->name, ni->name_len) &&
-				sle64_to_cpu(a->data.non_resident.lowest_vcn)
-				<= vcn && end_vcn >= vcn))
-			ctx_needs_reset = false;
-		else {
-			/* Save the old search context. */
-			old_ctx = *ctx;
-			/*
-			 * If the currently mapped (extent) inode is not the
-			 * base inode we will unmap it when we reinitialize the
-			 * search context which means we need to get a
-			 * reference to the page containing the mapped mft
-			 * record so we do not accidentally drop changes to the
-			 * mft record when it has not been marked dirty yet.
-			 */
-			if (old_ctx.base_ntfs_ino && old_ctx.ntfs_ino !=
-					old_ctx.base_ntfs_ino) {
-				put_this_page = old_ctx.ntfs_ino->page;
-				get_page(put_this_page);
-			}
-			/*
-			 * Reinitialize the search context so we can lookup the
-			 * needed attribute extent.
-			 */
-			ntfs_attr_reinit_search_ctx(ctx);
-			ctx_needs_reset = true;
-		}
-	}
-	if (ctx_needs_reset) {
-		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-				CASE_SENSITIVE, vcn, NULL, 0, ctx);
-		if (unlikely(err)) {
-			if (err == -ENOENT)
-				err = -EIO;
-			goto err_out;
-		}
-		BUG_ON(!ctx->attr->non_resident);
-	}
-	a = ctx->attr;
-	/*
-	 * Only decompress the mapping pairs if @vcn is inside it.  Otherwise
-	 * we get into problems when we try to map an out of bounds vcn because
-	 * we then try to map the already mapped runlist fragment and
-	 * ntfs_mapping_pairs_decompress() fails.
-	 */
-	end_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn) + 1;
-	if (unlikely(vcn && vcn >= end_vcn)) {
-		err = -ENOENT;
-		goto err_out;
-	}
-	rl = ntfs_mapping_pairs_decompress(ni->vol, a, ni->runlist.rl);
-	if (IS_ERR(rl))
-		err = PTR_ERR(rl);
-	else
-		ni->runlist.rl = rl;
-err_out:
-	if (ctx_is_temporary) {
-		if (likely(ctx))
-			ntfs_attr_put_search_ctx(ctx);
-		unmap_mft_record(base_ni);
-	} else if (ctx_needs_reset) {
-		/*
-		 * If there is no attribute list, restoring the search context
-		 * is accomplished simply by copying the saved context back over
-		 * the caller supplied context.  If there is an attribute list,
-		 * things are more complicated as we need to deal with mapping
-		 * of mft records and resulting potential changes in pointers.
-		 */
-		if (NInoAttrList(base_ni)) {
-			/*
-			 * If the currently mapped (extent) inode is not the
-			 * one we had before, we need to unmap it and map the
-			 * old one.
-			 */
-			if (ctx->ntfs_ino != old_ctx.ntfs_ino) {
-				/*
-				 * If the currently mapped inode is not the
-				 * base inode, unmap it.
-				 */
-				if (ctx->base_ntfs_ino && ctx->ntfs_ino !=
-						ctx->base_ntfs_ino) {
-					unmap_extent_mft_record(ctx->ntfs_ino);
-					ctx->mrec = ctx->base_mrec;
-					BUG_ON(!ctx->mrec);
-				}
-				/*
-				 * If the old mapped inode is not the base
-				 * inode, map it.
-				 */
-				if (old_ctx.base_ntfs_ino &&
-						old_ctx.ntfs_ino !=
-						old_ctx.base_ntfs_ino) {
-retry_map:
-					ctx->mrec = map_mft_record(
-							old_ctx.ntfs_ino);
-					/*
-					 * Something bad has happened.  If out
-					 * of memory retry till it succeeds.
-					 * Any other errors are fatal and we
-					 * return the error code in ctx->mrec.
-					 * Let the caller deal with it...  We
-					 * just need to fudge things so the
-					 * caller can reinit and/or put the
-					 * search context safely.
-					 */
-					if (IS_ERR(ctx->mrec)) {
-						if (PTR_ERR(ctx->mrec) ==
-								-ENOMEM) {
-							schedule();
-							goto retry_map;
-						} else
-							old_ctx.ntfs_ino =
-								old_ctx.
-								base_ntfs_ino;
-					}
-				}
-			}
-			/* Update the changed pointers in the saved context. */
-			if (ctx->mrec != old_ctx.mrec) {
-				if (!IS_ERR(ctx->mrec))
-					old_ctx.attr = (ATTR_RECORD*)(
-							(u8*)ctx->mrec +
-							((u8*)old_ctx.attr -
-							(u8*)old_ctx.mrec));
-				old_ctx.mrec = ctx->mrec;
-			}
-		}
-		/* Restore the search context to the saved one. */
-		*ctx = old_ctx;
-		/*
-		 * We drop the reference on the page we took earlier.  In the
-		 * case that IS_ERR(ctx->mrec) is true this means we might lose
-		 * some changes to the mft record that had been made between
-		 * the last time it was marked dirty/written out and now.  This
-		 * at this stage is not a problem as the mapping error is fatal
-		 * enough that the mft record cannot be written out anyway and
-		 * the caller is very likely to shutdown the whole inode
-		 * immediately and mark the volume dirty for chkdsk to pick up
-		 * the pieces anyway.
-		 */
-		if (put_this_page)
-			put_page(put_this_page);
-	}
-	return err;
-}
-
-/**
- * ntfs_map_runlist - map (a part of) a runlist of an ntfs inode
- * @ni:		ntfs inode for which to map (part of) a runlist
- * @vcn:	map runlist part containing this vcn
- *
- * Map the part of a runlist containing the @vcn of the ntfs inode @ni.
- *
- * Return 0 on success and -errno on error.  There is one special error code
- * which is not an error as such.  This is -ENOENT.  It means that @vcn is out
- * of bounds of the runlist.
- *
- * Locking: - The runlist must be unlocked on entry and is unlocked on return.
- *	    - This function takes the runlist lock for writing and may modify
- *	      the runlist.
- */
-int ntfs_map_runlist(ntfs_inode *ni, VCN vcn)
-{
-	int err = 0;
-
-	down_write(&ni->runlist.lock);
-	/* Make sure someone else didn't do the work while we were sleeping. */
-	if (likely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) <=
-			LCN_RL_NOT_MAPPED))
-		err = ntfs_map_runlist_nolock(ni, vcn, NULL);
-	up_write(&ni->runlist.lock);
-	return err;
-}
-
-/**
- * ntfs_attr_vcn_to_lcn_nolock - convert a vcn into a lcn given an ntfs inode
- * @ni:			ntfs inode of the attribute whose runlist to search
- * @vcn:		vcn to convert
- * @write_locked:	true if the runlist is locked for writing
- *
- * Find the virtual cluster number @vcn in the runlist of the ntfs attribute
- * described by the ntfs inode @ni and return the corresponding logical cluster
- * number (lcn).
- *
- * If the @vcn is not mapped yet, the attempt is made to map the attribute
- * extent containing the @vcn and the vcn to lcn conversion is retried.
- *
- * If @write_locked is true the caller has locked the runlist for writing and
- * if false for reading.
- *
- * Since lcns must be >= 0, we use negative return codes with special meaning:
- *
- * Return code	Meaning / Description
- * ==========================================
- *  LCN_HOLE	Hole / not allocated on disk.
- *  LCN_ENOENT	There is no such vcn in the runlist, i.e. @vcn is out of bounds.
- *  LCN_ENOMEM	Not enough memory to map runlist.
- *  LCN_EIO	Critical error (runlist/file is corrupt, i/o error, etc).
- *
- * Locking: - The runlist must be locked on entry and is left locked on return.
- *	    - If @write_locked is 'false', i.e. the runlist is locked for reading,
- *	      the lock may be dropped inside the function so you cannot rely on
- *	      the runlist still being the same when this function returns.
- */
-LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
-		const bool write_locked)
-{
-	LCN lcn;
-	unsigned long flags;
-	bool is_retry = false;
-
-	BUG_ON(!ni);
-	ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, %s_locked.",
-			ni->mft_no, (unsigned long long)vcn,
-			write_locked ? "write" : "read");
-	BUG_ON(!NInoNonResident(ni));
-	BUG_ON(vcn < 0);
-	if (!ni->runlist.rl) {
-		read_lock_irqsave(&ni->size_lock, flags);
-		if (!ni->allocated_size) {
-			read_unlock_irqrestore(&ni->size_lock, flags);
-			return LCN_ENOENT;
-		}
-		read_unlock_irqrestore(&ni->size_lock, flags);
-	}
-retry_remap:
-	/* Convert vcn to lcn.  If that fails map the runlist and retry once. */
-	lcn = ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn);
-	if (likely(lcn >= LCN_HOLE)) {
-		ntfs_debug("Done, lcn 0x%llx.", (long long)lcn);
-		return lcn;
-	}
-	if (lcn != LCN_RL_NOT_MAPPED) {
-		if (lcn != LCN_ENOENT)
-			lcn = LCN_EIO;
-	} else if (!is_retry) {
-		int err;
-
-		if (!write_locked) {
-			up_read(&ni->runlist.lock);
-			down_write(&ni->runlist.lock);
-			if (unlikely(ntfs_rl_vcn_to_lcn(ni->runlist.rl, vcn) !=
-					LCN_RL_NOT_MAPPED)) {
-				up_write(&ni->runlist.lock);
-				down_read(&ni->runlist.lock);
-				goto retry_remap;
-			}
-		}
-		err = ntfs_map_runlist_nolock(ni, vcn, NULL);
-		if (!write_locked) {
-			up_write(&ni->runlist.lock);
-			down_read(&ni->runlist.lock);
-		}
-		if (likely(!err)) {
-			is_retry = true;
-			goto retry_remap;
-		}
-		if (err == -ENOENT)
-			lcn = LCN_ENOENT;
-		else if (err == -ENOMEM)
-			lcn = LCN_ENOMEM;
-		else
-			lcn = LCN_EIO;
-	}
-	if (lcn != LCN_ENOENT)
-		ntfs_error(ni->vol->sb, "Failed with error code %lli.",
-				(long long)lcn);
-	return lcn;
-}
-
-/**
- * ntfs_attr_find_vcn_nolock - find a vcn in the runlist of an ntfs inode
- * @ni:		ntfs inode describing the runlist to search
- * @vcn:	vcn to find
- * @ctx:	active attribute search context if present or NULL if not
- *
- * Find the virtual cluster number @vcn in the runlist described by the ntfs
- * inode @ni and return the address of the runlist element containing the @vcn.
- *
- * If the @vcn is not mapped yet, the attempt is made to map the attribute
- * extent containing the @vcn and the vcn to lcn conversion is retried.
- *
- * If @ctx is specified, it is an active search context of @ni and its base mft
- * record.  This is needed when ntfs_attr_find_vcn_nolock() encounters unmapped
- * runlist fragments and allows their mapping.  If you do not have the mft
- * record mapped, you can specify @ctx as NULL and ntfs_attr_find_vcn_nolock()
- * will perform the necessary mapping and unmapping.
- *
- * Note, ntfs_attr_find_vcn_nolock() saves the state of @ctx on entry and
- * restores it before returning.  Thus, @ctx will be left pointing to the same
- * attribute on return as on entry.  However, the actual pointers in @ctx may
- * point to different memory locations on return, so you must remember to reset
- * any cached pointers from the @ctx, i.e. after the call to
- * ntfs_attr_find_vcn_nolock(), you will probably want to do:
- *	m = ctx->mrec;
- *	a = ctx->attr;
- * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
- * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
- * Note you need to distinguish between the lcn of the returned runlist element
- * being >= 0 and LCN_HOLE.  In the later case you have to return zeroes on
- * read and allocate clusters on write.
- *
- * Return the runlist element containing the @vcn on success and
- * ERR_PTR(-errno) on error.  You need to test the return value with IS_ERR()
- * to decide if the return is success or failure and PTR_ERR() to get to the
- * error code if IS_ERR() is true.
- *
- * The possible error return codes are:
- *	-ENOENT - No such vcn in the runlist, i.e. @vcn is out of bounds.
- *	-ENOMEM - Not enough memory to map runlist.
- *	-EIO	- Critical error (runlist/file is corrupt, i/o error, etc).
- *
- * WARNING: If @ctx is supplied, regardless of whether success or failure is
- *	    returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
- *	    is no longer valid, i.e. you need to either call
- *	    ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
- *	    In that case PTR_ERR(@ctx->mrec) will give you the error code for
- *	    why the mapping of the old inode failed.
- *
- * Locking: - The runlist described by @ni must be locked for writing on entry
- *	      and is locked on return.  Note the runlist may be modified when
- *	      needed runlist fragments need to be mapped.
- *	    - If @ctx is NULL, the base mft record of @ni must not be mapped on
- *	      entry and it will be left unmapped on return.
- *	    - If @ctx is not NULL, the base mft record must be mapped on entry
- *	      and it will be left mapped on return.
- */
-runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni, const VCN vcn,
-		ntfs_attr_search_ctx *ctx)
-{
-	unsigned long flags;
-	runlist_element *rl;
-	int err = 0;
-	bool is_retry = false;
-
-	BUG_ON(!ni);
-	ntfs_debug("Entering for i_ino 0x%lx, vcn 0x%llx, with%s ctx.",
-			ni->mft_no, (unsigned long long)vcn, ctx ? "" : "out");
-	BUG_ON(!NInoNonResident(ni));
-	BUG_ON(vcn < 0);
-	if (!ni->runlist.rl) {
-		read_lock_irqsave(&ni->size_lock, flags);
-		if (!ni->allocated_size) {
-			read_unlock_irqrestore(&ni->size_lock, flags);
-			return ERR_PTR(-ENOENT);
-		}
-		read_unlock_irqrestore(&ni->size_lock, flags);
-	}
-retry_remap:
-	rl = ni->runlist.rl;
-	if (likely(rl && vcn >= rl[0].vcn)) {
-		while (likely(rl->length)) {
-			if (unlikely(vcn < rl[1].vcn)) {
-				if (likely(rl->lcn >= LCN_HOLE)) {
-					ntfs_debug("Done.");
-					return rl;
-				}
-				break;
-			}
-			rl++;
-		}
-		if (likely(rl->lcn != LCN_RL_NOT_MAPPED)) {
-			if (likely(rl->lcn == LCN_ENOENT))
-				err = -ENOENT;
-			else
-				err = -EIO;
-		}
-	}
-	if (!err && !is_retry) {
-		/*
-		 * If the search context is invalid we cannot map the unmapped
-		 * region.
-		 */
-		if (IS_ERR(ctx->mrec))
-			err = PTR_ERR(ctx->mrec);
-		else {
-			/*
-			 * The @vcn is in an unmapped region, map the runlist
-			 * and retry.
-			 */
-			err = ntfs_map_runlist_nolock(ni, vcn, ctx);
-			if (likely(!err)) {
-				is_retry = true;
-				goto retry_remap;
-			}
-		}
-		if (err == -EINVAL)
-			err = -EIO;
-	} else if (!err)
-		err = -EIO;
-	if (err != -ENOENT)
-		ntfs_error(ni->vol->sb, "Failed with error code %i.", err);
-	return ERR_PTR(err);
-}
-
-/**
- * ntfs_attr_find - find (next) attribute in mft record
- * @type:	attribute type to find
- * @name:	attribute name to find (optional, i.e. NULL means don't care)
- * @name_len:	attribute name length (only needed if @name present)
- * @ic:		IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present)
- * @val:	attribute value to find (optional, resident attributes only)
- * @val_len:	attribute value length
- * @ctx:	search context with mft record and attribute to search from
- *
- * You should not need to call this function directly.  Use ntfs_attr_lookup()
- * instead.
- *
- * ntfs_attr_find() takes a search context @ctx as parameter and searches the
- * mft record specified by @ctx->mrec, beginning at @ctx->attr, for an
- * attribute of @type, optionally @name and @val.
- *
- * If the attribute is found, ntfs_attr_find() returns 0 and @ctx->attr will
- * point to the found attribute.
- *
- * If the attribute is not found, ntfs_attr_find() returns -ENOENT and
- * @ctx->attr will point to the attribute before which the attribute being
- * searched for would need to be inserted if such an action were to be desired.
- *
- * On actual error, ntfs_attr_find() returns -EIO.  In this case @ctx->attr is
- * undefined and in particular do not rely on it not changing.
- *
- * If @ctx->is_first is 'true', the search begins with @ctx->attr itself.  If it
- * is 'false', the search begins after @ctx->attr.
- *
- * If @ic is IGNORE_CASE, the @name comparisson is not case sensitive and
- * @ctx->ntfs_ino must be set to the ntfs inode to which the mft record
- * @ctx->mrec belongs.  This is so we can get at the ntfs volume and hence at
- * the upcase table.  If @ic is CASE_SENSITIVE, the comparison is case
- * sensitive.  When @name is present, @name_len is the @name length in Unicode
- * characters.
- *
- * If @name is not present (NULL), we assume that the unnamed attribute is
- * being searched for.
- *
- * Finally, the resident attribute value @val is looked for, if present.  If
- * @val is not present (NULL), @val_len is ignored.
- *
- * ntfs_attr_find() only searches the specified mft record and it ignores the
- * presence of an attribute list attribute (unless it is the one being searched
- * for, obviously).  If you need to take attribute lists into consideration,
- * use ntfs_attr_lookup() instead (see below).  This also means that you cannot
- * use ntfs_attr_find() to search for extent records of non-resident
- * attributes, as extents with lowest_vcn != 0 are usually described by the
- * attribute list attribute only. - Note that it is possible that the first
- * extent is only in the attribute list while the last extent is in the base
- * mft record, so do not rely on being able to find the first extent in the
- * base mft record.
- *
- * Warning: Never use @val when looking for attribute types which can be
- *	    non-resident as this most likely will result in a crash!
- */
-static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name,
-		const u32 name_len, const IGNORE_CASE_BOOL ic,
-		const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx)
-{
-	ATTR_RECORD *a;
-	ntfs_volume *vol = ctx->ntfs_ino->vol;
-	ntfschar *upcase = vol->upcase;
-	u32 upcase_len = vol->upcase_len;
-
-	/*
-	 * Iterate over attributes in mft record starting at @ctx->attr, or the
-	 * attribute following that, if @ctx->is_first is 'true'.
-	 */
-	if (ctx->is_first) {
-		a = ctx->attr;
-		ctx->is_first = false;
-	} else
-		a = (ATTR_RECORD*)((u8*)ctx->attr +
-				le32_to_cpu(ctx->attr->length));
-	for (;;	a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) {
-		u8 *mrec_end = (u8 *)ctx->mrec +
-		               le32_to_cpu(ctx->mrec->bytes_allocated);
-		u8 *name_end;
-
-		/* check whether ATTR_RECORD wrap */
-		if ((u8 *)a < (u8 *)ctx->mrec)
-			break;
-
-		/* check whether Attribute Record Header is within bounds */
-		if ((u8 *)a > mrec_end ||
-		    (u8 *)a + sizeof(ATTR_RECORD) > mrec_end)
-			break;
-
-		/* check whether ATTR_RECORD's name is within bounds */
-		name_end = (u8 *)a + le16_to_cpu(a->name_offset) +
-			   a->name_length * sizeof(ntfschar);
-		if (name_end > mrec_end)
-			break;
-
-		ctx->attr = a;
-		if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) ||
-				a->type == AT_END))
-			return -ENOENT;
-		if (unlikely(!a->length))
-			break;
-
-		/* check whether ATTR_RECORD's length wrap */
-		if ((u8 *)a + le32_to_cpu(a->length) < (u8 *)a)
-			break;
-		/* check whether ATTR_RECORD's length is within bounds */
-		if ((u8 *)a + le32_to_cpu(a->length) > mrec_end)
-			break;
-
-		if (a->type != type)
-			continue;
-		/*
-		 * If @name is present, compare the two names.  If @name is
-		 * missing, assume we want an unnamed attribute.
-		 */
-		if (!name) {
-			/* The search failed if the found attribute is named. */
-			if (a->name_length)
-				return -ENOENT;
-		} else if (!ntfs_are_names_equal(name, name_len,
-			    (ntfschar*)((u8*)a + le16_to_cpu(a->name_offset)),
-			    a->name_length, ic, upcase, upcase_len)) {
-			register int rc;
-
-			rc = ntfs_collate_names(name, name_len,
-					(ntfschar*)((u8*)a +
-					le16_to_cpu(a->name_offset)),
-					a->name_length, 1, IGNORE_CASE,
-					upcase, upcase_len);
-			/*
-			 * If @name collates before a->name, there is no
-			 * matching attribute.
-			 */
-			if (rc == -1)
-				return -ENOENT;
-			/* If the strings are not equal, continue search. */
-			if (rc)
-				continue;
-			rc = ntfs_collate_names(name, name_len,
-					(ntfschar*)((u8*)a +
-					le16_to_cpu(a->name_offset)),
-					a->name_length, 1, CASE_SENSITIVE,
-					upcase, upcase_len);
-			if (rc == -1)
-				return -ENOENT;
-			if (rc)
-				continue;
-		}
-		/*
-		 * The names match or @name not present and attribute is
-		 * unnamed.  If no @val specified, we have found the attribute
-		 * and are done.
-		 */
-		if (!val)
-			return 0;
-		/* @val is present; compare values. */
-		else {
-			register int rc;
-
-			rc = memcmp(val, (u8*)a + le16_to_cpu(
-					a->data.resident.value_offset),
-					min_t(u32, val_len, le32_to_cpu(
-					a->data.resident.value_length)));
-			/*
-			 * If @val collates before the current attribute's
-			 * value, there is no matching attribute.
-			 */
-			if (!rc) {
-				register u32 avl;
-
-				avl = le32_to_cpu(
-						a->data.resident.value_length);
-				if (val_len == avl)
-					return 0;
-				if (val_len < avl)
-					return -ENOENT;
-			} else if (rc < 0)
-				return -ENOENT;
-		}
-	}
-	ntfs_error(vol->sb, "Inode is corrupt.  Run chkdsk.");
-	NVolSetErrors(vol);
-	return -EIO;
-}
-
-/**
- * load_attribute_list - load an attribute list into memory
- * @vol:		ntfs volume from which to read
- * @runlist:		runlist of the attribute list
- * @al_start:		destination buffer
- * @size:		size of the destination buffer in bytes
- * @initialized_size:	initialized size of the attribute list
- *
- * Walk the runlist @runlist and load all clusters from it copying them into
- * the linear buffer @al. The maximum number of bytes copied to @al is @size
- * bytes. Note, @size does not need to be a multiple of the cluster size. If
- * @initialized_size is less than @size, the region in @al between
- * @initialized_size and @size will be zeroed and not read from disk.
- *
- * Return 0 on success or -errno on error.
- */
-int load_attribute_list(ntfs_volume *vol, runlist *runlist, u8 *al_start,
-		const s64 size, const s64 initialized_size)
-{
-	LCN lcn;
-	u8 *al = al_start;
-	u8 *al_end = al + initialized_size;
-	runlist_element *rl;
-	struct buffer_head *bh;
-	struct super_block *sb;
-	unsigned long block_size;
-	unsigned long block, max_block;
-	int err = 0;
-	unsigned char block_size_bits;
-
-	ntfs_debug("Entering.");
-	if (!vol || !runlist || !al || size <= 0 || initialized_size < 0 ||
-			initialized_size > size)
-		return -EINVAL;
-	if (!initialized_size) {
-		memset(al, 0, size);
-		return 0;
-	}
-	sb = vol->sb;
-	block_size = sb->s_blocksize;
-	block_size_bits = sb->s_blocksize_bits;
-	down_read(&runlist->lock);
-	rl = runlist->rl;
-	if (!rl) {
-		ntfs_error(sb, "Cannot read attribute list since runlist is "
-				"missing.");
-		goto err_out;	
-	}
-	/* Read all clusters specified by the runlist one run at a time. */
-	while (rl->length) {
-		lcn = ntfs_rl_vcn_to_lcn(rl, rl->vcn);
-		ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.",
-				(unsigned long long)rl->vcn,
-				(unsigned long long)lcn);
-		/* The attribute list cannot be sparse. */
-		if (lcn < 0) {
-			ntfs_error(sb, "ntfs_rl_vcn_to_lcn() failed.  Cannot "
-					"read attribute list.");
-			goto err_out;
-		}
-		block = lcn << vol->cluster_size_bits >> block_size_bits;
-		/* Read the run from device in chunks of block_size bytes. */
-		max_block = block + (rl->length << vol->cluster_size_bits >>
-				block_size_bits);
-		ntfs_debug("max_block = 0x%lx.", max_block);
-		do {
-			ntfs_debug("Reading block = 0x%lx.", block);
-			bh = sb_bread(sb, block);
-			if (!bh) {
-				ntfs_error(sb, "sb_bread() failed. Cannot "
-						"read attribute list.");
-				goto err_out;
-			}
-			if (al + block_size >= al_end)
-				goto do_final;
-			memcpy(al, bh->b_data, block_size);
-			brelse(bh);
-			al += block_size;
-		} while (++block < max_block);
-		rl++;
-	}
-	if (initialized_size < size) {
-initialize:
-		memset(al_start + initialized_size, 0, size - initialized_size);
-	}
-done:
-	up_read(&runlist->lock);
-	return err;
-do_final:
-	if (al < al_end) {
-		/*
-		 * Partial block.
-		 *
-		 * Note: The attribute list can be smaller than its allocation
-		 * by multiple clusters.  This has been encountered by at least
-		 * two people running Windows XP, thus we cannot do any
-		 * truncation sanity checking here. (AIA)
-		 */
-		memcpy(al, bh->b_data, al_end - al);
-		brelse(bh);
-		if (initialized_size < size)
-			goto initialize;
-		goto done;
-	}
-	brelse(bh);
-	/* Real overflow! */
-	ntfs_error(sb, "Attribute list buffer overflow. Read attribute list "
-			"is truncated.");
-err_out:
-	err = -EIO;
-	goto done;
-}
-
-/**
- * ntfs_external_attr_find - find an attribute in the attribute list of an inode
- * @type:	attribute type to find
- * @name:	attribute name to find (optional, i.e. NULL means don't care)
- * @name_len:	attribute name length (only needed if @name present)
- * @ic:		IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present)
- * @lowest_vcn:	lowest vcn to find (optional, non-resident attributes only)
- * @val:	attribute value to find (optional, resident attributes only)
- * @val_len:	attribute value length
- * @ctx:	search context with mft record and attribute to search from
- *
- * You should not need to call this function directly.  Use ntfs_attr_lookup()
- * instead.
- *
- * Find an attribute by searching the attribute list for the corresponding
- * attribute list entry.  Having found the entry, map the mft record if the
- * attribute is in a different mft record/inode, ntfs_attr_find() the attribute
- * in there and return it.
- *
- * On first search @ctx->ntfs_ino must be the base mft record and @ctx must
- * have been obtained from a call to ntfs_attr_get_search_ctx().  On subsequent
- * calls @ctx->ntfs_ino can be any extent inode, too (@ctx->base_ntfs_ino is
- * then the base inode).
- *
- * After finishing with the attribute/mft record you need to call
- * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any
- * mapped inodes, etc).
- *
- * If the attribute is found, ntfs_external_attr_find() returns 0 and
- * @ctx->attr will point to the found attribute.  @ctx->mrec will point to the
- * mft record in which @ctx->attr is located and @ctx->al_entry will point to
- * the attribute list entry for the attribute.
- *
- * If the attribute is not found, ntfs_external_attr_find() returns -ENOENT and
- * @ctx->attr will point to the attribute in the base mft record before which
- * the attribute being searched for would need to be inserted if such an action
- * were to be desired.  @ctx->mrec will point to the mft record in which
- * @ctx->attr is located and @ctx->al_entry will point to the attribute list
- * entry of the attribute before which the attribute being searched for would
- * need to be inserted if such an action were to be desired.
- *
- * Thus to insert the not found attribute, one wants to add the attribute to
- * @ctx->mrec (the base mft record) and if there is not enough space, the
- * attribute should be placed in a newly allocated extent mft record.  The
- * attribute list entry for the inserted attribute should be inserted in the
- * attribute list attribute at @ctx->al_entry.
- *
- * On actual error, ntfs_external_attr_find() returns -EIO.  In this case
- * @ctx->attr is undefined and in particular do not rely on it not changing.
- */
-static int ntfs_external_attr_find(const ATTR_TYPE type,
-		const ntfschar *name, const u32 name_len,
-		const IGNORE_CASE_BOOL ic, const VCN lowest_vcn,
-		const u8 *val, const u32 val_len, ntfs_attr_search_ctx *ctx)
-{
-	ntfs_inode *base_ni, *ni;
-	ntfs_volume *vol;
-	ATTR_LIST_ENTRY *al_entry, *next_al_entry;
-	u8 *al_start, *al_end;
-	ATTR_RECORD *a;
-	ntfschar *al_name;
-	u32 al_name_len;
-	int err = 0;
-	static const char *es = " Unmount and run chkdsk.";
-
-	ni = ctx->ntfs_ino;
-	base_ni = ctx->base_ntfs_ino;
-	ntfs_debug("Entering for inode 0x%lx, type 0x%x.", ni->mft_no, type);
-	if (!base_ni) {
-		/* First call happens with the base mft record. */
-		base_ni = ctx->base_ntfs_ino = ctx->ntfs_ino;
-		ctx->base_mrec = ctx->mrec;
-	}
-	if (ni == base_ni)
-		ctx->base_attr = ctx->attr;
-	if (type == AT_END)
-		goto not_found;
-	vol = base_ni->vol;
-	al_start = base_ni->attr_list;
-	al_end = al_start + base_ni->attr_list_size;
-	if (!ctx->al_entry)
-		ctx->al_entry = (ATTR_LIST_ENTRY*)al_start;
-	/*
-	 * Iterate over entries in attribute list starting at @ctx->al_entry,
-	 * or the entry following that, if @ctx->is_first is 'true'.
-	 */
-	if (ctx->is_first) {
-		al_entry = ctx->al_entry;
-		ctx->is_first = false;
-	} else
-		al_entry = (ATTR_LIST_ENTRY*)((u8*)ctx->al_entry +
-				le16_to_cpu(ctx->al_entry->length));
-	for (;; al_entry = next_al_entry) {
-		/* Out of bounds check. */
-		if ((u8*)al_entry < base_ni->attr_list ||
-				(u8*)al_entry > al_end)
-			break;	/* Inode is corrupt. */
-		ctx->al_entry = al_entry;
-		/* Catch the end of the attribute list. */
-		if ((u8*)al_entry == al_end)
-			goto not_found;
-		if (!al_entry->length)
-			break;
-		if ((u8*)al_entry + 6 > al_end || (u8*)al_entry +
-				le16_to_cpu(al_entry->length) > al_end)
-			break;
-		next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
-				le16_to_cpu(al_entry->length));
-		if (le32_to_cpu(al_entry->type) > le32_to_cpu(type))
-			goto not_found;
-		if (type != al_entry->type)
-			continue;
-		/*
-		 * If @name is present, compare the two names.  If @name is
-		 * missing, assume we want an unnamed attribute.
-		 */
-		al_name_len = al_entry->name_length;
-		al_name = (ntfschar*)((u8*)al_entry + al_entry->name_offset);
-		if (!name) {
-			if (al_name_len)
-				goto not_found;
-		} else if (!ntfs_are_names_equal(al_name, al_name_len, name,
-				name_len, ic, vol->upcase, vol->upcase_len)) {
-			register int rc;
-
-			rc = ntfs_collate_names(name, name_len, al_name,
-					al_name_len, 1, IGNORE_CASE,
-					vol->upcase, vol->upcase_len);
-			/*
-			 * If @name collates before al_name, there is no
-			 * matching attribute.
-			 */
-			if (rc == -1)
-				goto not_found;
-			/* If the strings are not equal, continue search. */
-			if (rc)
-				continue;
-			/*
-			 * FIXME: Reverse engineering showed 0, IGNORE_CASE but
-			 * that is inconsistent with ntfs_attr_find().  The
-			 * subsequent rc checks were also different.  Perhaps I
-			 * made a mistake in one of the two.  Need to recheck
-			 * which is correct or at least see what is going on...
-			 * (AIA)
-			 */
-			rc = ntfs_collate_names(name, name_len, al_name,
-					al_name_len, 1, CASE_SENSITIVE,
-					vol->upcase, vol->upcase_len);
-			if (rc == -1)
-				goto not_found;
-			if (rc)
-				continue;
-		}
-		/*
-		 * The names match or @name not present and attribute is
-		 * unnamed.  Now check @lowest_vcn.  Continue search if the
-		 * next attribute list entry still fits @lowest_vcn.  Otherwise
-		 * we have reached the right one or the search has failed.
-		 */
-		if (lowest_vcn && (u8*)next_al_entry >= al_start	    &&
-				(u8*)next_al_entry + 6 < al_end		    &&
-				(u8*)next_al_entry + le16_to_cpu(
-					next_al_entry->length) <= al_end    &&
-				sle64_to_cpu(next_al_entry->lowest_vcn) <=
-					lowest_vcn			    &&
-				next_al_entry->type == al_entry->type	    &&
-				next_al_entry->name_length == al_name_len   &&
-				ntfs_are_names_equal((ntfschar*)((u8*)
-					next_al_entry +
-					next_al_entry->name_offset),
-					next_al_entry->name_length,
-					al_name, al_name_len, CASE_SENSITIVE,
-					vol->upcase, vol->upcase_len))
-			continue;
-		if (MREF_LE(al_entry->mft_reference) == ni->mft_no) {
-			if (MSEQNO_LE(al_entry->mft_reference) != ni->seq_no) {
-				ntfs_error(vol->sb, "Found stale mft "
-						"reference in attribute list "
-						"of base inode 0x%lx.%s",
-						base_ni->mft_no, es);
-				err = -EIO;
-				break;
-			}
-		} else { /* Mft references do not match. */
-			/* If there is a mapped record unmap it first. */
-			if (ni != base_ni)
-				unmap_extent_mft_record(ni);
-			/* Do we want the base record back? */
-			if (MREF_LE(al_entry->mft_reference) ==
-					base_ni->mft_no) {
-				ni = ctx->ntfs_ino = base_ni;
-				ctx->mrec = ctx->base_mrec;
-			} else {
-				/* We want an extent record. */
-				ctx->mrec = map_extent_mft_record(base_ni,
-						le64_to_cpu(
-						al_entry->mft_reference), &ni);
-				if (IS_ERR(ctx->mrec)) {
-					ntfs_error(vol->sb, "Failed to map "
-							"extent mft record "
-							"0x%lx of base inode "
-							"0x%lx.%s",
-							MREF_LE(al_entry->
-							mft_reference),
-							base_ni->mft_no, es);
-					err = PTR_ERR(ctx->mrec);
-					if (err == -ENOENT)
-						err = -EIO;
-					/* Cause @ctx to be sanitized below. */
-					ni = NULL;
-					break;
-				}
-				ctx->ntfs_ino = ni;
-			}
-			ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec +
-					le16_to_cpu(ctx->mrec->attrs_offset));
-		}
-		/*
-		 * ctx->vfs_ino, ctx->mrec, and ctx->attr now point to the
-		 * mft record containing the attribute represented by the
-		 * current al_entry.
-		 */
-		/*
-		 * We could call into ntfs_attr_find() to find the right
-		 * attribute in this mft record but this would be less
-		 * efficient and not quite accurate as ntfs_attr_find() ignores
-		 * the attribute instance numbers for example which become
-		 * important when one plays with attribute lists.  Also,
-		 * because a proper match has been found in the attribute list
-		 * entry above, the comparison can now be optimized.  So it is
-		 * worth re-implementing a simplified ntfs_attr_find() here.
-		 */
-		a = ctx->attr;
-		/*
-		 * Use a manual loop so we can still use break and continue
-		 * with the same meanings as above.
-		 */
-do_next_attr_loop:
-		if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec +
-				le32_to_cpu(ctx->mrec->bytes_allocated))
-			break;
-		if (a->type == AT_END)
-			break;
-		if (!a->length)
-			break;
-		if (al_entry->instance != a->instance)
-			goto do_next_attr;
-		/*
-		 * If the type and/or the name are mismatched between the
-		 * attribute list entry and the attribute record, there is
-		 * corruption so we break and return error EIO.
-		 */
-		if (al_entry->type != a->type)
-			break;
-		if (!ntfs_are_names_equal((ntfschar*)((u8*)a +
-				le16_to_cpu(a->name_offset)), a->name_length,
-				al_name, al_name_len, CASE_SENSITIVE,
-				vol->upcase, vol->upcase_len))
-			break;
-		ctx->attr = a;
-		/*
-		 * If no @val specified or @val specified and it matches, we
-		 * have found it!
-		 */
-		if (!val || (!a->non_resident && le32_to_cpu(
-				a->data.resident.value_length) == val_len &&
-				!memcmp((u8*)a +
-				le16_to_cpu(a->data.resident.value_offset),
-				val, val_len))) {
-			ntfs_debug("Done, found.");
-			return 0;
-		}
-do_next_attr:
-		/* Proceed to the next attribute in the current mft record. */
-		a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length));
-		goto do_next_attr_loop;
-	}
-	if (!err) {
-		ntfs_error(vol->sb, "Base inode 0x%lx contains corrupt "
-				"attribute list attribute.%s", base_ni->mft_no,
-				es);
-		err = -EIO;
-	}
-	if (ni != base_ni) {
-		if (ni)
-			unmap_extent_mft_record(ni);
-		ctx->ntfs_ino = base_ni;
-		ctx->mrec = ctx->base_mrec;
-		ctx->attr = ctx->base_attr;
-	}
-	if (err != -ENOMEM)
-		NVolSetErrors(vol);
-	return err;
-not_found:
-	/*
-	 * If we were looking for AT_END, we reset the search context @ctx and
-	 * use ntfs_attr_find() to seek to the end of the base mft record.
-	 */
-	if (type == AT_END) {
-		ntfs_attr_reinit_search_ctx(ctx);
-		return ntfs_attr_find(AT_END, name, name_len, ic, val, val_len,
-				ctx);
-	}
-	/*
-	 * The attribute was not found.  Before we return, we want to ensure
-	 * @ctx->mrec and @ctx->attr indicate the position at which the
-	 * attribute should be inserted in the base mft record.  Since we also
-	 * want to preserve @ctx->al_entry we cannot reinitialize the search
-	 * context using ntfs_attr_reinit_search_ctx() as this would set
-	 * @ctx->al_entry to NULL.  Thus we do the necessary bits manually (see
-	 * ntfs_attr_init_search_ctx() below).  Note, we _only_ preserve
-	 * @ctx->al_entry as the remaining fields (base_*) are identical to
-	 * their non base_ counterparts and we cannot set @ctx->base_attr
-	 * correctly yet as we do not know what @ctx->attr will be set to by
-	 * the call to ntfs_attr_find() below.
-	 */
-	if (ni != base_ni)
-		unmap_extent_mft_record(ni);
-	ctx->mrec = ctx->base_mrec;
-	ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec +
-			le16_to_cpu(ctx->mrec->attrs_offset));
-	ctx->is_first = true;
-	ctx->ntfs_ino = base_ni;
-	ctx->base_ntfs_ino = NULL;
-	ctx->base_mrec = NULL;
-	ctx->base_attr = NULL;
-	/*
-	 * In case there are multiple matches in the base mft record, need to
-	 * keep enumerating until we get an attribute not found response (or
-	 * another error), otherwise we would keep returning the same attribute
-	 * over and over again and all programs using us for enumeration would
-	 * lock up in a tight loop.
-	 */
-	do {
-		err = ntfs_attr_find(type, name, name_len, ic, val, val_len,
-				ctx);
-	} while (!err);
-	ntfs_debug("Done, not found.");
-	return err;
-}
-
-/**
- * ntfs_attr_lookup - find an attribute in an ntfs inode
- * @type:	attribute type to find
- * @name:	attribute name to find (optional, i.e. NULL means don't care)
- * @name_len:	attribute name length (only needed if @name present)
- * @ic:		IGNORE_CASE or CASE_SENSITIVE (ignored if @name not present)
- * @lowest_vcn:	lowest vcn to find (optional, non-resident attributes only)
- * @val:	attribute value to find (optional, resident attributes only)
- * @val_len:	attribute value length
- * @ctx:	search context with mft record and attribute to search from
- *
- * Find an attribute in an ntfs inode.  On first search @ctx->ntfs_ino must
- * be the base mft record and @ctx must have been obtained from a call to
- * ntfs_attr_get_search_ctx().
- *
- * This function transparently handles attribute lists and @ctx is used to
- * continue searches where they were left off at.
- *
- * After finishing with the attribute/mft record you need to call
- * ntfs_attr_put_search_ctx() to cleanup the search context (unmapping any
- * mapped inodes, etc).
- *
- * Return 0 if the search was successful and -errno if not.
- *
- * When 0, @ctx->attr is the found attribute and it is in mft record
- * @ctx->mrec.  If an attribute list attribute is present, @ctx->al_entry is
- * the attribute list entry of the found attribute.
- *
- * When -ENOENT, @ctx->attr is the attribute which collates just after the
- * attribute being searched for, i.e. if one wants to add the attribute to the
- * mft record this is the correct place to insert it into.  If an attribute
- * list attribute is present, @ctx->al_entry is the attribute list entry which
- * collates just after the attribute list entry of the attribute being searched
- * for, i.e. if one wants to add the attribute to the mft record this is the
- * correct place to insert its attribute list entry into.
- *
- * When -errno != -ENOENT, an error occurred during the lookup.  @ctx->attr is
- * then undefined and in particular you should not rely on it not changing.
- */
-int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
-		const u32 name_len, const IGNORE_CASE_BOOL ic,
-		const VCN lowest_vcn, const u8 *val, const u32 val_len,
-		ntfs_attr_search_ctx *ctx)
-{
-	ntfs_inode *base_ni;
-
-	ntfs_debug("Entering.");
-	BUG_ON(IS_ERR(ctx->mrec));
-	if (ctx->base_ntfs_ino)
-		base_ni = ctx->base_ntfs_ino;
-	else
-		base_ni = ctx->ntfs_ino;
-	/* Sanity check, just for debugging really. */
-	BUG_ON(!base_ni);
-	if (!NInoAttrList(base_ni) || type == AT_ATTRIBUTE_LIST)
-		return ntfs_attr_find(type, name, name_len, ic, val, val_len,
-				ctx);
-	return ntfs_external_attr_find(type, name, name_len, ic, lowest_vcn,
-			val, val_len, ctx);
-}
-
-/**
- * ntfs_attr_init_search_ctx - initialize an attribute search context
- * @ctx:	attribute search context to initialize
- * @ni:		ntfs inode with which to initialize the search context
- * @mrec:	mft record with which to initialize the search context
- *
- * Initialize the attribute search context @ctx with @ni and @mrec.
- */
-static inline void ntfs_attr_init_search_ctx(ntfs_attr_search_ctx *ctx,
-		ntfs_inode *ni, MFT_RECORD *mrec)
-{
-	*ctx = (ntfs_attr_search_ctx) {
-		.mrec = mrec,
-		/* Sanity checks are performed elsewhere. */
-		.attr = (ATTR_RECORD*)((u8*)mrec +
-				le16_to_cpu(mrec->attrs_offset)),
-		.is_first = true,
-		.ntfs_ino = ni,
-	};
-}
-
-/**
- * ntfs_attr_reinit_search_ctx - reinitialize an attribute search context
- * @ctx:	attribute search context to reinitialize
- *
- * Reinitialize the attribute search context @ctx, unmapping an associated
- * extent mft record if present, and initialize the search context again.
- *
- * This is used when a search for a new attribute is being started to reset
- * the search context to the beginning.
- */
-void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx)
-{
-	if (likely(!ctx->base_ntfs_ino)) {
-		/* No attribute list. */
-		ctx->is_first = true;
-		/* Sanity checks are performed elsewhere. */
-		ctx->attr = (ATTR_RECORD*)((u8*)ctx->mrec +
-				le16_to_cpu(ctx->mrec->attrs_offset));
-		/*
-		 * This needs resetting due to ntfs_external_attr_find() which
-		 * can leave it set despite having zeroed ctx->base_ntfs_ino.
-		 */
-		ctx->al_entry = NULL;
-		return;
-	} /* Attribute list. */
-	if (ctx->ntfs_ino != ctx->base_ntfs_ino)
-		unmap_extent_mft_record(ctx->ntfs_ino);
-	ntfs_attr_init_search_ctx(ctx, ctx->base_ntfs_ino, ctx->base_mrec);
-	return;
-}
-
-/**
- * ntfs_attr_get_search_ctx - allocate/initialize a new attribute search context
- * @ni:		ntfs inode with which to initialize the search context
- * @mrec:	mft record with which to initialize the search context
- *
- * Allocate a new attribute search context, initialize it with @ni and @mrec,
- * and return it. Return NULL if allocation failed.
- */
-ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni, MFT_RECORD *mrec)
-{
-	ntfs_attr_search_ctx *ctx;
-
-	ctx = kmem_cache_alloc(ntfs_attr_ctx_cache, GFP_NOFS);
-	if (ctx)
-		ntfs_attr_init_search_ctx(ctx, ni, mrec);
-	return ctx;
-}
-
-/**
- * ntfs_attr_put_search_ctx - release an attribute search context
- * @ctx:	attribute search context to free
- *
- * Release the attribute search context @ctx, unmapping an associated extent
- * mft record if present.
- */
-void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx)
-{
-	if (ctx->base_ntfs_ino && ctx->ntfs_ino != ctx->base_ntfs_ino)
-		unmap_extent_mft_record(ctx->ntfs_ino);
-	kmem_cache_free(ntfs_attr_ctx_cache, ctx);
-	return;
-}
-
-#ifdef NTFS_RW
-
-/**
- * ntfs_attr_find_in_attrdef - find an attribute in the $AttrDef system file
- * @vol:	ntfs volume to which the attribute belongs
- * @type:	attribute type which to find
- *
- * Search for the attribute definition record corresponding to the attribute
- * @type in the $AttrDef system file.
- *
- * Return the attribute type definition record if found and NULL if not found.
- */
-static ATTR_DEF *ntfs_attr_find_in_attrdef(const ntfs_volume *vol,
-		const ATTR_TYPE type)
-{
-	ATTR_DEF *ad;
-
-	BUG_ON(!vol->attrdef);
-	BUG_ON(!type);
-	for (ad = vol->attrdef; (u8*)ad - (u8*)vol->attrdef <
-			vol->attrdef_size && ad->type; ++ad) {
-		/* We have not found it yet, carry on searching. */
-		if (likely(le32_to_cpu(ad->type) < le32_to_cpu(type)))
-			continue;
-		/* We found the attribute; return it. */
-		if (likely(ad->type == type))
-			return ad;
-		/* We have gone too far already.  No point in continuing. */
-		break;
-	}
-	/* Attribute not found. */
-	ntfs_debug("Attribute type 0x%x not found in $AttrDef.",
-			le32_to_cpu(type));
-	return NULL;
-}
-
-/**
- * ntfs_attr_size_bounds_check - check a size of an attribute type for validity
- * @vol:	ntfs volume to which the attribute belongs
- * @type:	attribute type which to check
- * @size:	size which to check
- *
- * Check whether the @size in bytes is valid for an attribute of @type on the
- * ntfs volume @vol.  This information is obtained from $AttrDef system file.
- *
- * Return 0 if valid, -ERANGE if not valid, or -ENOENT if the attribute is not
- * listed in $AttrDef.
- */
-int ntfs_attr_size_bounds_check(const ntfs_volume *vol, const ATTR_TYPE type,
-		const s64 size)
-{
-	ATTR_DEF *ad;
-
-	BUG_ON(size < 0);
-	/*
-	 * $ATTRIBUTE_LIST has a maximum size of 256kiB, but this is not
-	 * listed in $AttrDef.
-	 */
-	if (unlikely(type == AT_ATTRIBUTE_LIST && size > 256 * 1024))
-		return -ERANGE;
-	/* Get the $AttrDef entry for the attribute @type. */
-	ad = ntfs_attr_find_in_attrdef(vol, type);
-	if (unlikely(!ad))
-		return -ENOENT;
-	/* Do the bounds check. */
-	if (((sle64_to_cpu(ad->min_size) > 0) &&
-			size < sle64_to_cpu(ad->min_size)) ||
-			((sle64_to_cpu(ad->max_size) > 0) && size >
-			sle64_to_cpu(ad->max_size)))
-		return -ERANGE;
-	return 0;
-}
-
-/**
- * ntfs_attr_can_be_non_resident - check if an attribute can be non-resident
- * @vol:	ntfs volume to which the attribute belongs
- * @type:	attribute type which to check
- *
- * Check whether the attribute of @type on the ntfs volume @vol is allowed to
- * be non-resident.  This information is obtained from $AttrDef system file.
- *
- * Return 0 if the attribute is allowed to be non-resident, -EPERM if not, and
- * -ENOENT if the attribute is not listed in $AttrDef.
- */
-int ntfs_attr_can_be_non_resident(const ntfs_volume *vol, const ATTR_TYPE type)
-{
-	ATTR_DEF *ad;
-
-	/* Find the attribute definition record in $AttrDef. */
-	ad = ntfs_attr_find_in_attrdef(vol, type);
-	if (unlikely(!ad))
-		return -ENOENT;
-	/* Check the flags and return the result. */
-	if (ad->flags & ATTR_DEF_RESIDENT)
-		return -EPERM;
-	return 0;
-}
-
-/**
- * ntfs_attr_can_be_resident - check if an attribute can be resident
- * @vol:	ntfs volume to which the attribute belongs
- * @type:	attribute type which to check
- *
- * Check whether the attribute of @type on the ntfs volume @vol is allowed to
- * be resident.  This information is derived from our ntfs knowledge and may
- * not be completely accurate, especially when user defined attributes are
- * present.  Basically we allow everything to be resident except for index
- * allocation and $EA attributes.
- *
- * Return 0 if the attribute is allowed to be non-resident and -EPERM if not.
- *
- * Warning: In the system file $MFT the attribute $Bitmap must be non-resident
- *	    otherwise windows will not boot (blue screen of death)!  We cannot
- *	    check for this here as we do not know which inode's $Bitmap is
- *	    being asked about so the caller needs to special case this.
- */
-int ntfs_attr_can_be_resident(const ntfs_volume *vol, const ATTR_TYPE type)
-{
-	if (type == AT_INDEX_ALLOCATION)
-		return -EPERM;
-	return 0;
-}
-
-/**
- * ntfs_attr_record_resize - resize an attribute record
- * @m:		mft record containing attribute record
- * @a:		attribute record to resize
- * @new_size:	new size in bytes to which to resize the attribute record @a
- *
- * Resize the attribute record @a, i.e. the resident part of the attribute, in
- * the mft record @m to @new_size bytes.
- *
- * Return 0 on success and -errno on error.  The following error codes are
- * defined:
- *	-ENOSPC	- Not enough space in the mft record @m to perform the resize.
- *
- * Note: On error, no modifications have been performed whatsoever.
- *
- * Warning: If you make a record smaller without having copied all the data you
- *	    are interested in the data may be overwritten.
- */
-int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size)
-{
-	ntfs_debug("Entering for new_size %u.", new_size);
-	/* Align to 8 bytes if it is not already done. */
-	if (new_size & 7)
-		new_size = (new_size + 7) & ~7;
-	/* If the actual attribute length has changed, move things around. */
-	if (new_size != le32_to_cpu(a->length)) {
-		u32 new_muse = le32_to_cpu(m->bytes_in_use) -
-				le32_to_cpu(a->length) + new_size;
-		/* Not enough space in this mft record. */
-		if (new_muse > le32_to_cpu(m->bytes_allocated))
-			return -ENOSPC;
-		/* Move attributes following @a to their new location. */
-		memmove((u8*)a + new_size, (u8*)a + le32_to_cpu(a->length),
-				le32_to_cpu(m->bytes_in_use) - ((u8*)a -
-				(u8*)m) - le32_to_cpu(a->length));
-		/* Adjust @m to reflect the change in used space. */
-		m->bytes_in_use = cpu_to_le32(new_muse);
-		/* Adjust @a to reflect the new size. */
-		if (new_size >= offsetof(ATTR_REC, length) + sizeof(a->length))
-			a->length = cpu_to_le32(new_size);
-	}
-	return 0;
-}
-
-/**
- * ntfs_resident_attr_value_resize - resize the value of a resident attribute
- * @m:		mft record containing attribute record
- * @a:		attribute record whose value to resize
- * @new_size:	new size in bytes to which to resize the attribute value of @a
- *
- * Resize the value of the attribute @a in the mft record @m to @new_size bytes.
- * If the value is made bigger, the newly allocated space is cleared.
- *
- * Return 0 on success and -errno on error.  The following error codes are
- * defined:
- *	-ENOSPC	- Not enough space in the mft record @m to perform the resize.
- *
- * Note: On error, no modifications have been performed whatsoever.
- *
- * Warning: If you make a record smaller without having copied all the data you
- *	    are interested in the data may be overwritten.
- */
-int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
-		const u32 new_size)
-{
-	u32 old_size;
-
-	/* Resize the resident part of the attribute record. */
-	if (ntfs_attr_record_resize(m, a,
-			le16_to_cpu(a->data.resident.value_offset) + new_size))
-		return -ENOSPC;
-	/*
-	 * The resize succeeded!  If we made the attribute value bigger, clear
-	 * the area between the old size and @new_size.
-	 */
-	old_size = le32_to_cpu(a->data.resident.value_length);
-	if (new_size > old_size)
-		memset((u8*)a + le16_to_cpu(a->data.resident.value_offset) +
-				old_size, 0, new_size - old_size);
-	/* Finally update the length of the attribute value. */
-	a->data.resident.value_length = cpu_to_le32(new_size);
-	return 0;
-}
-
-/**
- * ntfs_attr_make_non_resident - convert a resident to a non-resident attribute
- * @ni:		ntfs inode describing the attribute to convert
- * @data_size:	size of the resident data to copy to the non-resident attribute
- *
- * Convert the resident ntfs attribute described by the ntfs inode @ni to a
- * non-resident one.
- *
- * @data_size must be equal to the attribute value size.  This is needed since
- * we need to know the size before we can map the mft record and our callers
- * always know it.  The reason we cannot simply read the size from the vfs
- * inode i_size is that this is not necessarily uptodate.  This happens when
- * ntfs_attr_make_non_resident() is called in the ->truncate call path(s).
- *
- * Return 0 on success and -errno on error.  The following error return codes
- * are defined:
- *	-EPERM	- The attribute is not allowed to be non-resident.
- *	-ENOMEM	- Not enough memory.
- *	-ENOSPC	- Not enough disk space.
- *	-EINVAL	- Attribute not defined on the volume.
- *	-EIO	- I/o error or other error.
- * Note that -ENOSPC is also returned in the case that there is not enough
- * space in the mft record to do the conversion.  This can happen when the mft
- * record is already very full.  The caller is responsible for trying to make
- * space in the mft record and trying again.  FIXME: Do we need a separate
- * error return code for this kind of -ENOSPC or is it always worth trying
- * again in case the attribute may then fit in a resident state so no need to
- * make it non-resident at all?  Ho-hum...  (AIA)
- *
- * NOTE to self: No changes in the attribute list are required to move from
- *		 a resident to a non-resident attribute.
- *
- * Locking: - The caller must hold i_mutex on the inode.
- */
-int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size)
-{
-	s64 new_size;
-	struct inode *vi = VFS_I(ni);
-	ntfs_volume *vol = ni->vol;
-	ntfs_inode *base_ni;
-	MFT_RECORD *m;
-	ATTR_RECORD *a;
-	ntfs_attr_search_ctx *ctx;
-	struct page *page;
-	runlist_element *rl;
-	u8 *kaddr;
-	unsigned long flags;
-	int mp_size, mp_ofs, name_ofs, arec_size, err, err2;
-	u32 attr_size;
-	u8 old_res_attr_flags;
-
-	/* Check that the attribute is allowed to be non-resident. */
-	err = ntfs_attr_can_be_non_resident(vol, ni->type);
-	if (unlikely(err)) {
-		if (err == -EPERM)
-			ntfs_debug("Attribute is not allowed to be "
-					"non-resident.");
-		else
-			ntfs_debug("Attribute not defined on the NTFS "
-					"volume!");
-		return err;
-	}
-	/*
-	 * FIXME: Compressed and encrypted attributes are not supported when
-	 * writing and we should never have gotten here for them.
-	 */
-	BUG_ON(NInoCompressed(ni));
-	BUG_ON(NInoEncrypted(ni));
-	/*
-	 * The size needs to be aligned to a cluster boundary for allocation
-	 * purposes.
-	 */
-	new_size = (data_size + vol->cluster_size - 1) &
-			~(vol->cluster_size - 1);
-	if (new_size > 0) {
-		/*
-		 * Will need the page later and since the page lock nests
-		 * outside all ntfs locks, we need to get the page now.
-		 */
-		page = find_or_create_page(vi->i_mapping, 0,
-				mapping_gfp_mask(vi->i_mapping));
-		if (unlikely(!page))
-			return -ENOMEM;
-		/* Start by allocating clusters to hold the attribute value. */
-		rl = ntfs_cluster_alloc(vol, 0, new_size >>
-				vol->cluster_size_bits, -1, DATA_ZONE, true);
-		if (IS_ERR(rl)) {
-			err = PTR_ERR(rl);
-			ntfs_debug("Failed to allocate cluster%s, error code "
-					"%i.", (new_size >>
-					vol->cluster_size_bits) > 1 ? "s" : "",
-					err);
-			goto page_err_out;
-		}
-	} else {
-		rl = NULL;
-		page = NULL;
-	}
-	/* Determine the size of the mapping pairs array. */
-	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl, 0, -1);
-	if (unlikely(mp_size < 0)) {
-		err = mp_size;
-		ntfs_debug("Failed to get size for mapping pairs array, error "
-				"code %i.", err);
-		goto rl_err_out;
-	}
-	down_write(&ni->runlist.lock);
-	if (!NInoAttr(ni))
-		base_ni = ni;
-	else
-		base_ni = ni->ext.base_ntfs_ino;
-	m = map_mft_record(base_ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		m = NULL;
-		ctx = NULL;
-		goto err_out;
-	}
-	ctx = ntfs_attr_get_search_ctx(base_ni, m);
-	if (unlikely(!ctx)) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT)
-			err = -EIO;
-		goto err_out;
-	}
-	m = ctx->mrec;
-	a = ctx->attr;
-	BUG_ON(NInoNonResident(ni));
-	BUG_ON(a->non_resident);
-	/*
-	 * Calculate new offsets for the name and the mapping pairs array.
-	 */
-	if (NInoSparse(ni) || NInoCompressed(ni))
-		name_ofs = (offsetof(ATTR_REC,
-				data.non_resident.compressed_size) +
-				sizeof(a->data.non_resident.compressed_size) +
-				7) & ~7;
-	else
-		name_ofs = (offsetof(ATTR_REC,
-				data.non_resident.compressed_size) + 7) & ~7;
-	mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7;
-	/*
-	 * Determine the size of the resident part of the now non-resident
-	 * attribute record.
-	 */
-	arec_size = (mp_ofs + mp_size + 7) & ~7;
-	/*
-	 * If the page is not uptodate bring it uptodate by copying from the
-	 * attribute value.
-	 */
-	attr_size = le32_to_cpu(a->data.resident.value_length);
-	BUG_ON(attr_size != data_size);
-	if (page && !PageUptodate(page)) {
-		kaddr = kmap_atomic(page);
-		memcpy(kaddr, (u8*)a +
-				le16_to_cpu(a->data.resident.value_offset),
-				attr_size);
-		memset(kaddr + attr_size, 0, PAGE_SIZE - attr_size);
-		kunmap_atomic(kaddr);
-		flush_dcache_page(page);
-		SetPageUptodate(page);
-	}
-	/* Backup the attribute flag. */
-	old_res_attr_flags = a->data.resident.flags;
-	/* Resize the resident part of the attribute record. */
-	err = ntfs_attr_record_resize(m, a, arec_size);
-	if (unlikely(err))
-		goto err_out;
-	/*
-	 * Convert the resident part of the attribute record to describe a
-	 * non-resident attribute.
-	 */
-	a->non_resident = 1;
-	/* Move the attribute name if it exists and update the offset. */
-	if (a->name_length)
-		memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset),
-				a->name_length * sizeof(ntfschar));
-	a->name_offset = cpu_to_le16(name_ofs);
-	/* Setup the fields specific to non-resident attributes. */
-	a->data.non_resident.lowest_vcn = 0;
-	a->data.non_resident.highest_vcn = cpu_to_sle64((new_size - 1) >>
-			vol->cluster_size_bits);
-	a->data.non_resident.mapping_pairs_offset = cpu_to_le16(mp_ofs);
-	memset(&a->data.non_resident.reserved, 0,
-			sizeof(a->data.non_resident.reserved));
-	a->data.non_resident.allocated_size = cpu_to_sle64(new_size);
-	a->data.non_resident.data_size =
-			a->data.non_resident.initialized_size =
-			cpu_to_sle64(attr_size);
-	if (NInoSparse(ni) || NInoCompressed(ni)) {
-		a->data.non_resident.compression_unit = 0;
-		if (NInoCompressed(ni) || vol->major_ver < 3)
-			a->data.non_resident.compression_unit = 4;
-		a->data.non_resident.compressed_size =
-				a->data.non_resident.allocated_size;
-	} else
-		a->data.non_resident.compression_unit = 0;
-	/* Generate the mapping pairs array into the attribute record. */
-	err = ntfs_mapping_pairs_build(vol, (u8*)a + mp_ofs,
-			arec_size - mp_ofs, rl, 0, -1, NULL);
-	if (unlikely(err)) {
-		ntfs_debug("Failed to build mapping pairs, error code %i.",
-				err);
-		goto undo_err_out;
-	}
-	/* Setup the in-memory attribute structure to be non-resident. */
-	ni->runlist.rl = rl;
-	write_lock_irqsave(&ni->size_lock, flags);
-	ni->allocated_size = new_size;
-	if (NInoSparse(ni) || NInoCompressed(ni)) {
-		ni->itype.compressed.size = ni->allocated_size;
-		if (a->data.non_resident.compression_unit) {
-			ni->itype.compressed.block_size = 1U << (a->data.
-					non_resident.compression_unit +
-					vol->cluster_size_bits);
-			ni->itype.compressed.block_size_bits =
-					ffs(ni->itype.compressed.block_size) -
-					1;
-			ni->itype.compressed.block_clusters = 1U <<
-					a->data.non_resident.compression_unit;
-		} else {
-			ni->itype.compressed.block_size = 0;
-			ni->itype.compressed.block_size_bits = 0;
-			ni->itype.compressed.block_clusters = 0;
-		}
-		vi->i_blocks = ni->itype.compressed.size >> 9;
-	} else
-		vi->i_blocks = ni->allocated_size >> 9;
-	write_unlock_irqrestore(&ni->size_lock, flags);
-	/*
-	 * This needs to be last since the address space operations ->read_folio
-	 * and ->writepage can run concurrently with us as they are not
-	 * serialized on i_mutex.  Note, we are not allowed to fail once we flip
-	 * this switch, which is another reason to do this last.
-	 */
-	NInoSetNonResident(ni);
-	/* Mark the mft record dirty, so it gets written back. */
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(base_ni);
-	up_write(&ni->runlist.lock);
-	if (page) {
-		set_page_dirty(page);
-		unlock_page(page);
-		put_page(page);
-	}
-	ntfs_debug("Done.");
-	return 0;
-undo_err_out:
-	/* Convert the attribute back into a resident attribute. */
-	a->non_resident = 0;
-	/* Move the attribute name if it exists and update the offset. */
-	name_ofs = (offsetof(ATTR_RECORD, data.resident.reserved) +
-			sizeof(a->data.resident.reserved) + 7) & ~7;
-	if (a->name_length)
-		memmove((u8*)a + name_ofs, (u8*)a + le16_to_cpu(a->name_offset),
-				a->name_length * sizeof(ntfschar));
-	mp_ofs = (name_ofs + a->name_length * sizeof(ntfschar) + 7) & ~7;
-	a->name_offset = cpu_to_le16(name_ofs);
-	arec_size = (mp_ofs + attr_size + 7) & ~7;
-	/* Resize the resident part of the attribute record. */
-	err2 = ntfs_attr_record_resize(m, a, arec_size);
-	if (unlikely(err2)) {
-		/*
-		 * This cannot happen (well if memory corruption is at work it
-		 * could happen in theory), but deal with it as well as we can.
-		 * If the old size is too small, truncate the attribute,
-		 * otherwise simply give it a larger allocated size.
-		 * FIXME: Should check whether chkdsk complains when the
-		 * allocated size is much bigger than the resident value size.
-		 */
-		arec_size = le32_to_cpu(a->length);
-		if ((mp_ofs + attr_size) > arec_size) {
-			err2 = attr_size;
-			attr_size = arec_size - mp_ofs;
-			ntfs_error(vol->sb, "Failed to undo partial resident "
-					"to non-resident attribute "
-					"conversion.  Truncating inode 0x%lx, "
-					"attribute type 0x%x from %i bytes to "
-					"%i bytes to maintain metadata "
-					"consistency.  THIS MEANS YOU ARE "
-					"LOSING %i BYTES DATA FROM THIS %s.",
-					vi->i_ino,
-					(unsigned)le32_to_cpu(ni->type),
-					err2, attr_size, err2 - attr_size,
-					((ni->type == AT_DATA) &&
-					!ni->name_len) ? "FILE": "ATTRIBUTE");
-			write_lock_irqsave(&ni->size_lock, flags);
-			ni->initialized_size = attr_size;
-			i_size_write(vi, attr_size);
-			write_unlock_irqrestore(&ni->size_lock, flags);
-		}
-	}
-	/* Setup the fields specific to resident attributes. */
-	a->data.resident.value_length = cpu_to_le32(attr_size);
-	a->data.resident.value_offset = cpu_to_le16(mp_ofs);
-	a->data.resident.flags = old_res_attr_flags;
-	memset(&a->data.resident.reserved, 0,
-			sizeof(a->data.resident.reserved));
-	/* Copy the data from the page back to the attribute value. */
-	if (page) {
-		kaddr = kmap_atomic(page);
-		memcpy((u8*)a + mp_ofs, kaddr, attr_size);
-		kunmap_atomic(kaddr);
-	}
-	/* Setup the allocated size in the ntfs inode in case it changed. */
-	write_lock_irqsave(&ni->size_lock, flags);
-	ni->allocated_size = arec_size - mp_ofs;
-	write_unlock_irqrestore(&ni->size_lock, flags);
-	/* Mark the mft record dirty, so it gets written back. */
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-err_out:
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(base_ni);
-	ni->runlist.rl = NULL;
-	up_write(&ni->runlist.lock);
-rl_err_out:
-	if (rl) {
-		if (ntfs_cluster_free_from_rl(vol, rl) < 0) {
-			ntfs_error(vol->sb, "Failed to release allocated "
-					"cluster(s) in error code path.  Run "
-					"chkdsk to recover the lost "
-					"cluster(s).");
-			NVolSetErrors(vol);
-		}
-		ntfs_free(rl);
-page_err_out:
-		unlock_page(page);
-		put_page(page);
-	}
-	if (err == -EINVAL)
-		err = -EIO;
-	return err;
-}
-
-/**
- * ntfs_attr_extend_allocation - extend the allocated space of an attribute
- * @ni:			ntfs inode of the attribute whose allocation to extend
- * @new_alloc_size:	new size in bytes to which to extend the allocation to
- * @new_data_size:	new size in bytes to which to extend the data to
- * @data_start:		beginning of region which is required to be non-sparse
- *
- * Extend the allocated space of an attribute described by the ntfs inode @ni
- * to @new_alloc_size bytes.  If @data_start is -1, the whole extension may be
- * implemented as a hole in the file (as long as both the volume and the ntfs
- * inode @ni have sparse support enabled).  If @data_start is >= 0, then the
- * region between the old allocated size and @data_start - 1 may be made sparse
- * but the regions between @data_start and @new_alloc_size must be backed by
- * actual clusters.
- *
- * If @new_data_size is -1, it is ignored.  If it is >= 0, then the data size
- * of the attribute is extended to @new_data_size.  Note that the i_size of the
- * vfs inode is not updated.  Only the data size in the base attribute record
- * is updated.  The caller has to update i_size separately if this is required.
- * WARNING: It is a BUG() for @new_data_size to be smaller than the old data
- * size as well as for @new_data_size to be greater than @new_alloc_size.
- *
- * For resident attributes this involves resizing the attribute record and if
- * necessary moving it and/or other attributes into extent mft records and/or
- * converting the attribute to a non-resident attribute which in turn involves
- * extending the allocation of a non-resident attribute as described below.
- *
- * For non-resident attributes this involves allocating clusters in the data
- * zone on the volume (except for regions that are being made sparse) and
- * extending the run list to describe the allocated clusters as well as
- * updating the mapping pairs array of the attribute.  This in turn involves
- * resizing the attribute record and if necessary moving it and/or other
- * attributes into extent mft records and/or splitting the attribute record
- * into multiple extent attribute records.
- *
- * Also, the attribute list attribute is updated if present and in some of the
- * above cases (the ones where extent mft records/attributes come into play),
- * an attribute list attribute is created if not already present.
- *
- * Return the new allocated size on success and -errno on error.  In the case
- * that an error is encountered but a partial extension at least up to
- * @data_start (if present) is possible, the allocation is partially extended
- * and this is returned.  This means the caller must check the returned size to
- * determine if the extension was partial.  If @data_start is -1 then partial
- * allocations are not performed.
- *
- * WARNING: Do not call ntfs_attr_extend_allocation() for $MFT/$DATA.
- *
- * Locking: This function takes the runlist lock of @ni for writing as well as
- * locking the mft record of the base ntfs inode.  These locks are maintained
- * throughout execution of the function.  These locks are required so that the
- * attribute can be resized safely and so that it can for example be converted
- * from resident to non-resident safely.
- *
- * TODO: At present attribute list attribute handling is not implemented.
- *
- * TODO: At present it is not safe to call this function for anything other
- * than the $DATA attribute(s) of an uncompressed and unencrypted file.
- */
-s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size,
-		const s64 new_data_size, const s64 data_start)
-{
-	VCN vcn;
-	s64 ll, allocated_size, start = data_start;
-	struct inode *vi = VFS_I(ni);
-	ntfs_volume *vol = ni->vol;
-	ntfs_inode *base_ni;
-	MFT_RECORD *m;
-	ATTR_RECORD *a;
-	ntfs_attr_search_ctx *ctx;
-	runlist_element *rl, *rl2;
-	unsigned long flags;
-	int err, mp_size;
-	u32 attr_len = 0; /* Silence stupid gcc warning. */
-	bool mp_rebuilt;
-
-#ifdef DEBUG
-	read_lock_irqsave(&ni->size_lock, flags);
-	allocated_size = ni->allocated_size;
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
-			"old_allocated_size 0x%llx, "
-			"new_allocated_size 0x%llx, new_data_size 0x%llx, "
-			"data_start 0x%llx.", vi->i_ino,
-			(unsigned)le32_to_cpu(ni->type),
-			(unsigned long long)allocated_size,
-			(unsigned long long)new_alloc_size,
-			(unsigned long long)new_data_size,
-			(unsigned long long)start);
-#endif
-retry_extend:
-	/*
-	 * For non-resident attributes, @start and @new_size need to be aligned
-	 * to cluster boundaries for allocation purposes.
-	 */
-	if (NInoNonResident(ni)) {
-		if (start > 0)
-			start &= ~(s64)vol->cluster_size_mask;
-		new_alloc_size = (new_alloc_size + vol->cluster_size - 1) &
-				~(s64)vol->cluster_size_mask;
-	}
-	BUG_ON(new_data_size >= 0 && new_data_size > new_alloc_size);
-	/* Check if new size is allowed in $AttrDef. */
-	err = ntfs_attr_size_bounds_check(vol, ni->type, new_alloc_size);
-	if (unlikely(err)) {
-		/* Only emit errors when the write will fail completely. */
-		read_lock_irqsave(&ni->size_lock, flags);
-		allocated_size = ni->allocated_size;
-		read_unlock_irqrestore(&ni->size_lock, flags);
-		if (start < 0 || start >= allocated_size) {
-			if (err == -ERANGE) {
-				ntfs_error(vol->sb, "Cannot extend allocation "
-						"of inode 0x%lx, attribute "
-						"type 0x%x, because the new "
-						"allocation would exceed the "
-						"maximum allowed size for "
-						"this attribute type.",
-						vi->i_ino, (unsigned)
-						le32_to_cpu(ni->type));
-			} else {
-				ntfs_error(vol->sb, "Cannot extend allocation "
-						"of inode 0x%lx, attribute "
-						"type 0x%x, because this "
-						"attribute type is not "
-						"defined on the NTFS volume.  "
-						"Possible corruption!  You "
-						"should run chkdsk!",
-						vi->i_ino, (unsigned)
-						le32_to_cpu(ni->type));
-			}
-		}
-		/* Translate error code to be POSIX conformant for write(2). */
-		if (err == -ERANGE)
-			err = -EFBIG;
-		else
-			err = -EIO;
-		return err;
-	}
-	if (!NInoAttr(ni))
-		base_ni = ni;
-	else
-		base_ni = ni->ext.base_ntfs_ino;
-	/*
-	 * We will be modifying both the runlist (if non-resident) and the mft
-	 * record so lock them both down.
-	 */
-	down_write(&ni->runlist.lock);
-	m = map_mft_record(base_ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		m = NULL;
-		ctx = NULL;
-		goto err_out;
-	}
-	ctx = ntfs_attr_get_search_ctx(base_ni, m);
-	if (unlikely(!ctx)) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	read_lock_irqsave(&ni->size_lock, flags);
-	allocated_size = ni->allocated_size;
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	/*
-	 * If non-resident, seek to the last extent.  If resident, there is
-	 * only one extent, so seek to that.
-	 */
-	vcn = NInoNonResident(ni) ? allocated_size >> vol->cluster_size_bits :
-			0;
-	/*
-	 * Abort if someone did the work whilst we waited for the locks.  If we
-	 * just converted the attribute from resident to non-resident it is
-	 * likely that exactly this has happened already.  We cannot quite
-	 * abort if we need to update the data size.
-	 */
-	if (unlikely(new_alloc_size <= allocated_size)) {
-		ntfs_debug("Allocated size already exceeds requested size.");
-		new_alloc_size = allocated_size;
-		if (new_data_size < 0)
-			goto done;
-		/*
-		 * We want the first attribute extent so that we can update the
-		 * data size.
-		 */
-		vcn = 0;
-	}
-	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-			CASE_SENSITIVE, vcn, NULL, 0, ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT)
-			err = -EIO;
-		goto err_out;
-	}
-	m = ctx->mrec;
-	a = ctx->attr;
-	/* Use goto to reduce indentation. */
-	if (a->non_resident)
-		goto do_non_resident_extend;
-	BUG_ON(NInoNonResident(ni));
-	/* The total length of the attribute value. */
-	attr_len = le32_to_cpu(a->data.resident.value_length);
-	/*
-	 * Extend the attribute record to be able to store the new attribute
-	 * size.  ntfs_attr_record_resize() will not do anything if the size is
-	 * not changing.
-	 */
-	if (new_alloc_size < vol->mft_record_size &&
-			!ntfs_attr_record_resize(m, a,
-			le16_to_cpu(a->data.resident.value_offset) +
-			new_alloc_size)) {
-		/* The resize succeeded! */
-		write_lock_irqsave(&ni->size_lock, flags);
-		ni->allocated_size = le32_to_cpu(a->length) -
-				le16_to_cpu(a->data.resident.value_offset);
-		write_unlock_irqrestore(&ni->size_lock, flags);
-		if (new_data_size >= 0) {
-			BUG_ON(new_data_size < attr_len);
-			a->data.resident.value_length =
-					cpu_to_le32((u32)new_data_size);
-		}
-		goto flush_done;
-	}
-	/*
-	 * We have to drop all the locks so we can call
-	 * ntfs_attr_make_non_resident().  This could be optimised by try-
-	 * locking the first page cache page and only if that fails dropping
-	 * the locks, locking the page, and redoing all the locking and
-	 * lookups.  While this would be a huge optimisation, it is not worth
-	 * it as this is definitely a slow code path.
-	 */
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(base_ni);
-	up_write(&ni->runlist.lock);
-	/*
-	 * Not enough space in the mft record, try to make the attribute
-	 * non-resident and if successful restart the extension process.
-	 */
-	err = ntfs_attr_make_non_resident(ni, attr_len);
-	if (likely(!err))
-		goto retry_extend;
-	/*
-	 * Could not make non-resident.  If this is due to this not being
-	 * permitted for this attribute type or there not being enough space,
-	 * try to make other attributes non-resident.  Otherwise fail.
-	 */
-	if (unlikely(err != -EPERM && err != -ENOSPC)) {
-		/* Only emit errors when the write will fail completely. */
-		read_lock_irqsave(&ni->size_lock, flags);
-		allocated_size = ni->allocated_size;
-		read_unlock_irqrestore(&ni->size_lock, flags);
-		if (start < 0 || start >= allocated_size)
-			ntfs_error(vol->sb, "Cannot extend allocation of "
-					"inode 0x%lx, attribute type 0x%x, "
-					"because the conversion from resident "
-					"to non-resident attribute failed "
-					"with error code %i.", vi->i_ino,
-					(unsigned)le32_to_cpu(ni->type), err);
-		if (err != -ENOMEM)
-			err = -EIO;
-		goto conv_err_out;
-	}
-	/* TODO: Not implemented from here, abort. */
-	read_lock_irqsave(&ni->size_lock, flags);
-	allocated_size = ni->allocated_size;
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	if (start < 0 || start >= allocated_size) {
-		if (err == -ENOSPC)
-			ntfs_error(vol->sb, "Not enough space in the mft "
-					"record/on disk for the non-resident "
-					"attribute value.  This case is not "
-					"implemented yet.");
-		else /* if (err == -EPERM) */
-			ntfs_error(vol->sb, "This attribute type may not be "
-					"non-resident.  This case is not "
-					"implemented yet.");
-	}
-	err = -EOPNOTSUPP;
-	goto conv_err_out;
-#if 0
-	// TODO: Attempt to make other attributes non-resident.
-	if (!err)
-		goto do_resident_extend;
-	/*
-	 * Both the attribute list attribute and the standard information
-	 * attribute must remain in the base inode.  Thus, if this is one of
-	 * these attributes, we have to try to move other attributes out into
-	 * extent mft records instead.
-	 */
-	if (ni->type == AT_ATTRIBUTE_LIST ||
-			ni->type == AT_STANDARD_INFORMATION) {
-		// TODO: Attempt to move other attributes into extent mft
-		// records.
-		err = -EOPNOTSUPP;
-		if (!err)
-			goto do_resident_extend;
-		goto err_out;
-	}
-	// TODO: Attempt to move this attribute to an extent mft record, but
-	// only if it is not already the only attribute in an mft record in
-	// which case there would be nothing to gain.
-	err = -EOPNOTSUPP;
-	if (!err)
-		goto do_resident_extend;
-	/* There is nothing we can do to make enough space. )-: */
-	goto err_out;
-#endif
-do_non_resident_extend:
-	BUG_ON(!NInoNonResident(ni));
-	if (new_alloc_size == allocated_size) {
-		BUG_ON(vcn);
-		goto alloc_done;
-	}
-	/*
-	 * If the data starts after the end of the old allocation, this is a
-	 * $DATA attribute and sparse attributes are enabled on the volume and
-	 * for this inode, then create a sparse region between the old
-	 * allocated size and the start of the data.  Otherwise simply proceed
-	 * with filling the whole space between the old allocated size and the
-	 * new allocated size with clusters.
-	 */
-	if ((start >= 0 && start <= allocated_size) || ni->type != AT_DATA ||
-			!NVolSparseEnabled(vol) || NInoSparseDisabled(ni))
-		goto skip_sparse;
-	// TODO: This is not implemented yet.  We just fill in with real
-	// clusters for now...
-	ntfs_debug("Inserting holes is not-implemented yet.  Falling back to "
-			"allocating real clusters instead.");
-skip_sparse:
-	rl = ni->runlist.rl;
-	if (likely(rl)) {
-		/* Seek to the end of the runlist. */
-		while (rl->length)
-			rl++;
-	}
-	/* If this attribute extent is not mapped, map it now. */
-	if (unlikely(!rl || rl->lcn == LCN_RL_NOT_MAPPED ||
-			(rl->lcn == LCN_ENOENT && rl > ni->runlist.rl &&
-			(rl-1)->lcn == LCN_RL_NOT_MAPPED))) {
-		if (!rl && !allocated_size)
-			goto first_alloc;
-		rl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl);
-		if (IS_ERR(rl)) {
-			err = PTR_ERR(rl);
-			if (start < 0 || start >= allocated_size)
-				ntfs_error(vol->sb, "Cannot extend allocation "
-						"of inode 0x%lx, attribute "
-						"type 0x%x, because the "
-						"mapping of a runlist "
-						"fragment failed with error "
-						"code %i.", vi->i_ino,
-						(unsigned)le32_to_cpu(ni->type),
-						err);
-			if (err != -ENOMEM)
-				err = -EIO;
-			goto err_out;
-		}
-		ni->runlist.rl = rl;
-		/* Seek to the end of the runlist. */
-		while (rl->length)
-			rl++;
-	}
-	/*
-	 * We now know the runlist of the last extent is mapped and @rl is at
-	 * the end of the runlist.  We want to begin allocating clusters
-	 * starting at the last allocated cluster to reduce fragmentation.  If
-	 * there are no valid LCNs in the attribute we let the cluster
-	 * allocator choose the starting cluster.
-	 */
-	/* If the last LCN is a hole or simillar seek back to last real LCN. */
-	while (rl->lcn < 0 && rl > ni->runlist.rl)
-		rl--;
-first_alloc:
-	// FIXME: Need to implement partial allocations so at least part of the
-	// write can be performed when start >= 0.  (Needed for POSIX write(2)
-	// conformance.)
-	rl2 = ntfs_cluster_alloc(vol, allocated_size >> vol->cluster_size_bits,
-			(new_alloc_size - allocated_size) >>
-			vol->cluster_size_bits, (rl && (rl->lcn >= 0)) ?
-			rl->lcn + rl->length : -1, DATA_ZONE, true);
-	if (IS_ERR(rl2)) {
-		err = PTR_ERR(rl2);
-		if (start < 0 || start >= allocated_size)
-			ntfs_error(vol->sb, "Cannot extend allocation of "
-					"inode 0x%lx, attribute type 0x%x, "
-					"because the allocation of clusters "
-					"failed with error code %i.", vi->i_ino,
-					(unsigned)le32_to_cpu(ni->type), err);
-		if (err != -ENOMEM && err != -ENOSPC)
-			err = -EIO;
-		goto err_out;
-	}
-	rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
-	if (IS_ERR(rl)) {
-		err = PTR_ERR(rl);
-		if (start < 0 || start >= allocated_size)
-			ntfs_error(vol->sb, "Cannot extend allocation of "
-					"inode 0x%lx, attribute type 0x%x, "
-					"because the runlist merge failed "
-					"with error code %i.", vi->i_ino,
-					(unsigned)le32_to_cpu(ni->type), err);
-		if (err != -ENOMEM)
-			err = -EIO;
-		if (ntfs_cluster_free_from_rl(vol, rl2)) {
-			ntfs_error(vol->sb, "Failed to release allocated "
-					"cluster(s) in error code path.  Run "
-					"chkdsk to recover the lost "
-					"cluster(s).");
-			NVolSetErrors(vol);
-		}
-		ntfs_free(rl2);
-		goto err_out;
-	}
-	ni->runlist.rl = rl;
-	ntfs_debug("Allocated 0x%llx clusters.", (long long)(new_alloc_size -
-			allocated_size) >> vol->cluster_size_bits);
-	/* Find the runlist element with which the attribute extent starts. */
-	ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
-	rl2 = ntfs_rl_find_vcn_nolock(rl, ll);
-	BUG_ON(!rl2);
-	BUG_ON(!rl2->length);
-	BUG_ON(rl2->lcn < LCN_HOLE);
-	mp_rebuilt = false;
-	/* Get the size for the new mapping pairs array for this extent. */
-	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
-	if (unlikely(mp_size <= 0)) {
-		err = mp_size;
-		if (start < 0 || start >= allocated_size)
-			ntfs_error(vol->sb, "Cannot extend allocation of "
-					"inode 0x%lx, attribute type 0x%x, "
-					"because determining the size for the "
-					"mapping pairs failed with error code "
-					"%i.", vi->i_ino,
-					(unsigned)le32_to_cpu(ni->type), err);
-		err = -EIO;
-		goto undo_alloc;
-	}
-	/* Extend the attribute record to fit the bigger mapping pairs array. */
-	attr_len = le32_to_cpu(a->length);
-	err = ntfs_attr_record_resize(m, a, mp_size +
-			le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
-	if (unlikely(err)) {
-		BUG_ON(err != -ENOSPC);
-		// TODO: Deal with this by moving this extent to a new mft
-		// record or by starting a new extent in a new mft record,
-		// possibly by extending this extent partially and filling it
-		// and creating a new extent for the remainder, or by making
-		// other attributes non-resident and/or by moving other
-		// attributes out of this mft record.
-		if (start < 0 || start >= allocated_size)
-			ntfs_error(vol->sb, "Not enough space in the mft "
-					"record for the extended attribute "
-					"record.  This case is not "
-					"implemented yet.");
-		err = -EOPNOTSUPP;
-		goto undo_alloc;
-	}
-	mp_rebuilt = true;
-	/* Generate the mapping pairs array directly into the attr record. */
-	err = ntfs_mapping_pairs_build(vol, (u8*)a +
-			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
-			mp_size, rl2, ll, -1, NULL);
-	if (unlikely(err)) {
-		if (start < 0 || start >= allocated_size)
-			ntfs_error(vol->sb, "Cannot extend allocation of "
-					"inode 0x%lx, attribute type 0x%x, "
-					"because building the mapping pairs "
-					"failed with error code %i.", vi->i_ino,
-					(unsigned)le32_to_cpu(ni->type), err);
-		err = -EIO;
-		goto undo_alloc;
-	}
-	/* Update the highest_vcn. */
-	a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >>
-			vol->cluster_size_bits) - 1);
-	/*
-	 * We now have extended the allocated size of the attribute.  Reflect
-	 * this in the ntfs_inode structure and the attribute record.
-	 */
-	if (a->data.non_resident.lowest_vcn) {
-		/*
-		 * We are not in the first attribute extent, switch to it, but
-		 * first ensure the changes will make it to disk later.
-		 */
-		flush_dcache_mft_record_page(ctx->ntfs_ino);
-		mark_mft_record_dirty(ctx->ntfs_ino);
-		ntfs_attr_reinit_search_ctx(ctx);
-		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-				CASE_SENSITIVE, 0, NULL, 0, ctx);
-		if (unlikely(err))
-			goto restore_undo_alloc;
-		/* @m is not used any more so no need to set it. */
-		a = ctx->attr;
-	}
-	write_lock_irqsave(&ni->size_lock, flags);
-	ni->allocated_size = new_alloc_size;
-	a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size);
-	/*
-	 * FIXME: This would fail if @ni is a directory, $MFT, or an index,
-	 * since those can have sparse/compressed set.  For example can be
-	 * set compressed even though it is not compressed itself and in that
-	 * case the bit means that files are to be created compressed in the
-	 * directory...  At present this is ok as this code is only called for
-	 * regular files, and only for their $DATA attribute(s).
-	 * FIXME: The calculation is wrong if we created a hole above.  For now
-	 * it does not matter as we never create holes.
-	 */
-	if (NInoSparse(ni) || NInoCompressed(ni)) {
-		ni->itype.compressed.size += new_alloc_size - allocated_size;
-		a->data.non_resident.compressed_size =
-				cpu_to_sle64(ni->itype.compressed.size);
-		vi->i_blocks = ni->itype.compressed.size >> 9;
-	} else
-		vi->i_blocks = new_alloc_size >> 9;
-	write_unlock_irqrestore(&ni->size_lock, flags);
-alloc_done:
-	if (new_data_size >= 0) {
-		BUG_ON(new_data_size <
-				sle64_to_cpu(a->data.non_resident.data_size));
-		a->data.non_resident.data_size = cpu_to_sle64(new_data_size);
-	}
-flush_done:
-	/* Ensure the changes make it to disk. */
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-done:
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(base_ni);
-	up_write(&ni->runlist.lock);
-	ntfs_debug("Done, new_allocated_size 0x%llx.",
-			(unsigned long long)new_alloc_size);
-	return new_alloc_size;
-restore_undo_alloc:
-	if (start < 0 || start >= allocated_size)
-		ntfs_error(vol->sb, "Cannot complete extension of allocation "
-				"of inode 0x%lx, attribute type 0x%x, because "
-				"lookup of first attribute extent failed with "
-				"error code %i.", vi->i_ino,
-				(unsigned)le32_to_cpu(ni->type), err);
-	if (err == -ENOENT)
-		err = -EIO;
-	ntfs_attr_reinit_search_ctx(ctx);
-	if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len, CASE_SENSITIVE,
-			allocated_size >> vol->cluster_size_bits, NULL, 0,
-			ctx)) {
-		ntfs_error(vol->sb, "Failed to find last attribute extent of "
-				"attribute in error code path.  Run chkdsk to "
-				"recover.");
-		write_lock_irqsave(&ni->size_lock, flags);
-		ni->allocated_size = new_alloc_size;
-		/*
-		 * FIXME: This would fail if @ni is a directory...  See above.
-		 * FIXME: The calculation is wrong if we created a hole above.
-		 * For now it does not matter as we never create holes.
-		 */
-		if (NInoSparse(ni) || NInoCompressed(ni)) {
-			ni->itype.compressed.size += new_alloc_size -
-					allocated_size;
-			vi->i_blocks = ni->itype.compressed.size >> 9;
-		} else
-			vi->i_blocks = new_alloc_size >> 9;
-		write_unlock_irqrestore(&ni->size_lock, flags);
-		ntfs_attr_put_search_ctx(ctx);
-		unmap_mft_record(base_ni);
-		up_write(&ni->runlist.lock);
-		/*
-		 * The only thing that is now wrong is the allocated size of the
-		 * base attribute extent which chkdsk should be able to fix.
-		 */
-		NVolSetErrors(vol);
-		return err;
-	}
-	ctx->attr->data.non_resident.highest_vcn = cpu_to_sle64(
-			(allocated_size >> vol->cluster_size_bits) - 1);
-undo_alloc:
-	ll = allocated_size >> vol->cluster_size_bits;
-	if (ntfs_cluster_free(ni, ll, -1, ctx) < 0) {
-		ntfs_error(vol->sb, "Failed to release allocated cluster(s) "
-				"in error code path.  Run chkdsk to recover "
-				"the lost cluster(s).");
-		NVolSetErrors(vol);
-	}
-	m = ctx->mrec;
-	a = ctx->attr;
-	/*
-	 * If the runlist truncation fails and/or the search context is no
-	 * longer valid, we cannot resize the attribute record or build the
-	 * mapping pairs array thus we mark the inode bad so that no access to
-	 * the freed clusters can happen.
-	 */
-	if (ntfs_rl_truncate_nolock(vol, &ni->runlist, ll) || IS_ERR(m)) {
-		ntfs_error(vol->sb, "Failed to %s in error code path.  Run "
-				"chkdsk to recover.", IS_ERR(m) ?
-				"restore attribute search context" :
-				"truncate attribute runlist");
-		NVolSetErrors(vol);
-	} else if (mp_rebuilt) {
-		if (ntfs_attr_record_resize(m, a, attr_len)) {
-			ntfs_error(vol->sb, "Failed to restore attribute "
-					"record in error code path.  Run "
-					"chkdsk to recover.");
-			NVolSetErrors(vol);
-		} else /* if (success) */ {
-			if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
-					a->data.non_resident.
-					mapping_pairs_offset), attr_len -
-					le16_to_cpu(a->data.non_resident.
-					mapping_pairs_offset), rl2, ll, -1,
-					NULL)) {
-				ntfs_error(vol->sb, "Failed to restore "
-						"mapping pairs array in error "
-						"code path.  Run chkdsk to "
-						"recover.");
-				NVolSetErrors(vol);
-			}
-			flush_dcache_mft_record_page(ctx->ntfs_ino);
-			mark_mft_record_dirty(ctx->ntfs_ino);
-		}
-	}
-err_out:
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(base_ni);
-	up_write(&ni->runlist.lock);
-conv_err_out:
-	ntfs_debug("Failed.  Returning error code %i.", err);
-	return err;
-}
-
-/**
- * ntfs_attr_set - fill (a part of) an attribute with a byte
- * @ni:		ntfs inode describing the attribute to fill
- * @ofs:	offset inside the attribute at which to start to fill
- * @cnt:	number of bytes to fill
- * @val:	the unsigned 8-bit value with which to fill the attribute
- *
- * Fill @cnt bytes of the attribute described by the ntfs inode @ni starting at
- * byte offset @ofs inside the attribute with the constant byte @val.
- *
- * This function is effectively like memset() applied to an ntfs attribute.
- * Note this function actually only operates on the page cache pages belonging
- * to the ntfs attribute and it marks them dirty after doing the memset().
- * Thus it relies on the vm dirty page write code paths to cause the modified
- * pages to be written to the mft record/disk.
- *
- * Return 0 on success and -errno on error.  An error code of -ESPIPE means
- * that @ofs + @cnt were outside the end of the attribute and no write was
- * performed.
- */
-int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt, const u8 val)
-{
-	ntfs_volume *vol = ni->vol;
-	struct address_space *mapping;
-	struct page *page;
-	u8 *kaddr;
-	pgoff_t idx, end;
-	unsigned start_ofs, end_ofs, size;
-
-	ntfs_debug("Entering for ofs 0x%llx, cnt 0x%llx, val 0x%hx.",
-			(long long)ofs, (long long)cnt, val);
-	BUG_ON(ofs < 0);
-	BUG_ON(cnt < 0);
-	if (!cnt)
-		goto done;
-	/*
-	 * FIXME: Compressed and encrypted attributes are not supported when
-	 * writing and we should never have gotten here for them.
-	 */
-	BUG_ON(NInoCompressed(ni));
-	BUG_ON(NInoEncrypted(ni));
-	mapping = VFS_I(ni)->i_mapping;
-	/* Work out the starting index and page offset. */
-	idx = ofs >> PAGE_SHIFT;
-	start_ofs = ofs & ~PAGE_MASK;
-	/* Work out the ending index and page offset. */
-	end = ofs + cnt;
-	end_ofs = end & ~PAGE_MASK;
-	/* If the end is outside the inode size return -ESPIPE. */
-	if (unlikely(end > i_size_read(VFS_I(ni)))) {
-		ntfs_error(vol->sb, "Request exceeds end of attribute.");
-		return -ESPIPE;
-	}
-	end >>= PAGE_SHIFT;
-	/* If there is a first partial page, need to do it the slow way. */
-	if (start_ofs) {
-		page = read_mapping_page(mapping, idx, NULL);
-		if (IS_ERR(page)) {
-			ntfs_error(vol->sb, "Failed to read first partial "
-					"page (error, index 0x%lx).", idx);
-			return PTR_ERR(page);
-		}
-		/*
-		 * If the last page is the same as the first page, need to
-		 * limit the write to the end offset.
-		 */
-		size = PAGE_SIZE;
-		if (idx == end)
-			size = end_ofs;
-		kaddr = kmap_atomic(page);
-		memset(kaddr + start_ofs, val, size - start_ofs);
-		flush_dcache_page(page);
-		kunmap_atomic(kaddr);
-		set_page_dirty(page);
-		put_page(page);
-		balance_dirty_pages_ratelimited(mapping);
-		cond_resched();
-		if (idx == end)
-			goto done;
-		idx++;
-	}
-	/* Do the whole pages the fast way. */
-	for (; idx < end; idx++) {
-		/* Find or create the current page.  (The page is locked.) */
-		page = grab_cache_page(mapping, idx);
-		if (unlikely(!page)) {
-			ntfs_error(vol->sb, "Insufficient memory to grab "
-					"page (index 0x%lx).", idx);
-			return -ENOMEM;
-		}
-		kaddr = kmap_atomic(page);
-		memset(kaddr, val, PAGE_SIZE);
-		flush_dcache_page(page);
-		kunmap_atomic(kaddr);
-		/*
-		 * If the page has buffers, mark them uptodate since buffer
-		 * state and not page state is definitive in 2.6 kernels.
-		 */
-		if (page_has_buffers(page)) {
-			struct buffer_head *bh, *head;
-
-			bh = head = page_buffers(page);
-			do {
-				set_buffer_uptodate(bh);
-			} while ((bh = bh->b_this_page) != head);
-		}
-		/* Now that buffers are uptodate, set the page uptodate, too. */
-		SetPageUptodate(page);
-		/*
-		 * Set the page and all its buffers dirty and mark the inode
-		 * dirty, too.  The VM will write the page later on.
-		 */
-		set_page_dirty(page);
-		/* Finally unlock and release the page. */
-		unlock_page(page);
-		put_page(page);
-		balance_dirty_pages_ratelimited(mapping);
-		cond_resched();
-	}
-	/* If there is a last partial page, need to do it the slow way. */
-	if (end_ofs) {
-		page = read_mapping_page(mapping, idx, NULL);
-		if (IS_ERR(page)) {
-			ntfs_error(vol->sb, "Failed to read last partial page "
-					"(error, index 0x%lx).", idx);
-			return PTR_ERR(page);
-		}
-		kaddr = kmap_atomic(page);
-		memset(kaddr, val, end_ofs);
-		flush_dcache_page(page);
-		kunmap_atomic(kaddr);
-		set_page_dirty(page);
-		put_page(page);
-		balance_dirty_pages_ratelimited(mapping);
-		cond_resched();
-	}
-done:
-	ntfs_debug("Done.");
-	return 0;
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/attrib.h b/fs/ntfs/attrib.h
deleted file mode 100644
index fe0890d3d072..000000000000
--- a/fs/ntfs/attrib.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * attrib.h - Defines for attribute handling in NTFS Linux kernel driver.
- *	      Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2005 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#ifndef _LINUX_NTFS_ATTRIB_H
-#define _LINUX_NTFS_ATTRIB_H
-
-#include "endian.h"
-#include "types.h"
-#include "layout.h"
-#include "inode.h"
-#include "runlist.h"
-#include "volume.h"
-
-/**
- * ntfs_attr_search_ctx - used in attribute search functions
- * @mrec:	buffer containing mft record to search
- * @attr:	attribute record in @mrec where to begin/continue search
- * @is_first:	if true ntfs_attr_lookup() begins search with @attr, else after
- *
- * Structure must be initialized to zero before the first call to one of the
- * attribute search functions. Initialize @mrec to point to the mft record to
- * search, and @attr to point to the first attribute within @mrec (not necessary
- * if calling the _first() functions), and set @is_first to 'true' (not necessary
- * if calling the _first() functions).
- *
- * If @is_first is 'true', the search begins with @attr. If @is_first is 'false',
- * the search begins after @attr. This is so that, after the first call to one
- * of the search attribute functions, we can call the function again, without
- * any modification of the search context, to automagically get the next
- * matching attribute.
- */
-typedef struct {
-	MFT_RECORD *mrec;
-	ATTR_RECORD *attr;
-	bool is_first;
-	ntfs_inode *ntfs_ino;
-	ATTR_LIST_ENTRY *al_entry;
-	ntfs_inode *base_ntfs_ino;
-	MFT_RECORD *base_mrec;
-	ATTR_RECORD *base_attr;
-} ntfs_attr_search_ctx;
-
-extern int ntfs_map_runlist_nolock(ntfs_inode *ni, VCN vcn,
-		ntfs_attr_search_ctx *ctx);
-extern int ntfs_map_runlist(ntfs_inode *ni, VCN vcn);
-
-extern LCN ntfs_attr_vcn_to_lcn_nolock(ntfs_inode *ni, const VCN vcn,
-		const bool write_locked);
-
-extern runlist_element *ntfs_attr_find_vcn_nolock(ntfs_inode *ni,
-		const VCN vcn, ntfs_attr_search_ctx *ctx);
-
-int ntfs_attr_lookup(const ATTR_TYPE type, const ntfschar *name,
-		const u32 name_len, const IGNORE_CASE_BOOL ic,
-		const VCN lowest_vcn, const u8 *val, const u32 val_len,
-		ntfs_attr_search_ctx *ctx);
-
-extern int load_attribute_list(ntfs_volume *vol, runlist *rl, u8 *al_start,
-		const s64 size, const s64 initialized_size);
-
-static inline s64 ntfs_attr_size(const ATTR_RECORD *a)
-{
-	if (!a->non_resident)
-		return (s64)le32_to_cpu(a->data.resident.value_length);
-	return sle64_to_cpu(a->data.non_resident.data_size);
-}
-
-extern void ntfs_attr_reinit_search_ctx(ntfs_attr_search_ctx *ctx);
-extern ntfs_attr_search_ctx *ntfs_attr_get_search_ctx(ntfs_inode *ni,
-		MFT_RECORD *mrec);
-extern void ntfs_attr_put_search_ctx(ntfs_attr_search_ctx *ctx);
-
-#ifdef NTFS_RW
-
-extern int ntfs_attr_size_bounds_check(const ntfs_volume *vol,
-		const ATTR_TYPE type, const s64 size);
-extern int ntfs_attr_can_be_non_resident(const ntfs_volume *vol,
-		const ATTR_TYPE type);
-extern int ntfs_attr_can_be_resident(const ntfs_volume *vol,
-		const ATTR_TYPE type);
-
-extern int ntfs_attr_record_resize(MFT_RECORD *m, ATTR_RECORD *a, u32 new_size);
-extern int ntfs_resident_attr_value_resize(MFT_RECORD *m, ATTR_RECORD *a,
-		const u32 new_size);
-
-extern int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size);
-
-extern s64 ntfs_attr_extend_allocation(ntfs_inode *ni, s64 new_alloc_size,
-		const s64 new_data_size, const s64 data_start);
-
-extern int ntfs_attr_set(ntfs_inode *ni, const s64 ofs, const s64 cnt,
-		const u8 val);
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_ATTRIB_H */
diff --git a/fs/ntfs/bitmap.c b/fs/ntfs/bitmap.c
deleted file mode 100644
index 0675b2400873..000000000000
--- a/fs/ntfs/bitmap.c
+++ /dev/null
@@ -1,179 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * bitmap.c - NTFS kernel bitmap handling.  Part of the Linux-NTFS project.
- *
- * Copyright (c) 2004-2005 Anton Altaparmakov
- */
-
-#ifdef NTFS_RW
-
-#include <linux/pagemap.h>
-
-#include "bitmap.h"
-#include "debug.h"
-#include "aops.h"
-#include "ntfs.h"
-
-/**
- * __ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value
- * @vi:			vfs inode describing the bitmap
- * @start_bit:		first bit to set
- * @count:		number of bits to set
- * @value:		value to set the bits to (i.e. 0 or 1)
- * @is_rollback:	if 'true' this is a rollback operation
- *
- * Set @count bits starting at bit @start_bit in the bitmap described by the
- * vfs inode @vi to @value, where @value is either 0 or 1.
- *
- * @is_rollback should always be 'false', it is for internal use to rollback
- * errors.  You probably want to use ntfs_bitmap_set_bits_in_run() instead.
- *
- * Return 0 on success and -errno on error.
- */
-int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
-		const s64 count, const u8 value, const bool is_rollback)
-{
-	s64 cnt = count;
-	pgoff_t index, end_index;
-	struct address_space *mapping;
-	struct page *page;
-	u8 *kaddr;
-	int pos, len;
-	u8 bit;
-
-	BUG_ON(!vi);
-	ntfs_debug("Entering for i_ino 0x%lx, start_bit 0x%llx, count 0x%llx, "
-			"value %u.%s", vi->i_ino, (unsigned long long)start_bit,
-			(unsigned long long)cnt, (unsigned int)value,
-			is_rollback ? " (rollback)" : "");
-	BUG_ON(start_bit < 0);
-	BUG_ON(cnt < 0);
-	BUG_ON(value > 1);
-	/*
-	 * Calculate the indices for the pages containing the first and last
-	 * bits, i.e. @start_bit and @start_bit + @cnt - 1, respectively.
-	 */
-	index = start_bit >> (3 + PAGE_SHIFT);
-	end_index = (start_bit + cnt - 1) >> (3 + PAGE_SHIFT);
-
-	/* Get the page containing the first bit (@start_bit). */
-	mapping = vi->i_mapping;
-	page = ntfs_map_page(mapping, index);
-	if (IS_ERR(page)) {
-		if (!is_rollback)
-			ntfs_error(vi->i_sb, "Failed to map first page (error "
-					"%li), aborting.", PTR_ERR(page));
-		return PTR_ERR(page);
-	}
-	kaddr = page_address(page);
-
-	/* Set @pos to the position of the byte containing @start_bit. */
-	pos = (start_bit >> 3) & ~PAGE_MASK;
-
-	/* Calculate the position of @start_bit in the first byte. */
-	bit = start_bit & 7;
-
-	/* If the first byte is partial, modify the appropriate bits in it. */
-	if (bit) {
-		u8 *byte = kaddr + pos;
-		while ((bit & 7) && cnt) {
-			cnt--;
-			if (value)
-				*byte |= 1 << bit++;
-			else
-				*byte &= ~(1 << bit++);
-		}
-		/* If we are done, unmap the page and return success. */
-		if (!cnt)
-			goto done;
-
-		/* Update @pos to the new position. */
-		pos++;
-	}
-	/*
-	 * Depending on @value, modify all remaining whole bytes in the page up
-	 * to @cnt.
-	 */
-	len = min_t(s64, cnt >> 3, PAGE_SIZE - pos);
-	memset(kaddr + pos, value ? 0xff : 0, len);
-	cnt -= len << 3;
-
-	/* Update @len to point to the first not-done byte in the page. */
-	if (cnt < 8)
-		len += pos;
-
-	/* If we are not in the last page, deal with all subsequent pages. */
-	while (index < end_index) {
-		BUG_ON(cnt <= 0);
-
-		/* Update @index and get the next page. */
-		flush_dcache_page(page);
-		set_page_dirty(page);
-		ntfs_unmap_page(page);
-		page = ntfs_map_page(mapping, ++index);
-		if (IS_ERR(page))
-			goto rollback;
-		kaddr = page_address(page);
-		/*
-		 * Depending on @value, modify all remaining whole bytes in the
-		 * page up to @cnt.
-		 */
-		len = min_t(s64, cnt >> 3, PAGE_SIZE);
-		memset(kaddr, value ? 0xff : 0, len);
-		cnt -= len << 3;
-	}
-	/*
-	 * The currently mapped page is the last one.  If the last byte is
-	 * partial, modify the appropriate bits in it.  Note, @len is the
-	 * position of the last byte inside the page.
-	 */
-	if (cnt) {
-		u8 *byte;
-
-		BUG_ON(cnt > 7);
-
-		bit = cnt;
-		byte = kaddr + len;
-		while (bit--) {
-			if (value)
-				*byte |= 1 << bit;
-			else
-				*byte &= ~(1 << bit);
-		}
-	}
-done:
-	/* We are done.  Unmap the page and return success. */
-	flush_dcache_page(page);
-	set_page_dirty(page);
-	ntfs_unmap_page(page);
-	ntfs_debug("Done.");
-	return 0;
-rollback:
-	/*
-	 * Current state:
-	 *	- no pages are mapped
-	 *	- @count - @cnt is the number of bits that have been modified
-	 */
-	if (is_rollback)
-		return PTR_ERR(page);
-	if (count != cnt)
-		pos = __ntfs_bitmap_set_bits_in_run(vi, start_bit, count - cnt,
-				value ? 0 : 1, true);
-	else
-		pos = 0;
-	if (!pos) {
-		/* Rollback was successful. */
-		ntfs_error(vi->i_sb, "Failed to map subsequent page (error "
-				"%li), aborting.", PTR_ERR(page));
-	} else {
-		/* Rollback failed. */
-		ntfs_error(vi->i_sb, "Failed to map subsequent page (error "
-				"%li) and rollback failed (error %i).  "
-				"Aborting and leaving inconsistent metadata.  "
-				"Unmount and run chkdsk.", PTR_ERR(page), pos);
-		NVolSetErrors(NTFS_SB(vi->i_sb));
-	}
-	return PTR_ERR(page);
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/bitmap.h b/fs/ntfs/bitmap.h
deleted file mode 100644
index 9dd2224ca9c4..000000000000
--- a/fs/ntfs/bitmap.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * bitmap.h - Defines for NTFS kernel bitmap handling.  Part of the Linux-NTFS
- *	      project.
- *
- * Copyright (c) 2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_BITMAP_H
-#define _LINUX_NTFS_BITMAP_H
-
-#ifdef NTFS_RW
-
-#include <linux/fs.h>
-
-#include "types.h"
-
-extern int __ntfs_bitmap_set_bits_in_run(struct inode *vi, const s64 start_bit,
-		const s64 count, const u8 value, const bool is_rollback);
-
-/**
- * ntfs_bitmap_set_bits_in_run - set a run of bits in a bitmap to a value
- * @vi:			vfs inode describing the bitmap
- * @start_bit:		first bit to set
- * @count:		number of bits to set
- * @value:		value to set the bits to (i.e. 0 or 1)
- *
- * Set @count bits starting at bit @start_bit in the bitmap described by the
- * vfs inode @vi to @value, where @value is either 0 or 1.
- *
- * Return 0 on success and -errno on error.
- */
-static inline int ntfs_bitmap_set_bits_in_run(struct inode *vi,
-		const s64 start_bit, const s64 count, const u8 value)
-{
-	return __ntfs_bitmap_set_bits_in_run(vi, start_bit, count, value,
-			false);
-}
-
-/**
- * ntfs_bitmap_set_run - set a run of bits in a bitmap
- * @vi:		vfs inode describing the bitmap
- * @start_bit:	first bit to set
- * @count:	number of bits to set
- *
- * Set @count bits starting at bit @start_bit in the bitmap described by the
- * vfs inode @vi.
- *
- * Return 0 on success and -errno on error.
- */
-static inline int ntfs_bitmap_set_run(struct inode *vi, const s64 start_bit,
-		const s64 count)
-{
-	return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 1);
-}
-
-/**
- * ntfs_bitmap_clear_run - clear a run of bits in a bitmap
- * @vi:		vfs inode describing the bitmap
- * @start_bit:	first bit to clear
- * @count:	number of bits to clear
- *
- * Clear @count bits starting at bit @start_bit in the bitmap described by the
- * vfs inode @vi.
- *
- * Return 0 on success and -errno on error.
- */
-static inline int ntfs_bitmap_clear_run(struct inode *vi, const s64 start_bit,
-		const s64 count)
-{
-	return ntfs_bitmap_set_bits_in_run(vi, start_bit, count, 0);
-}
-
-/**
- * ntfs_bitmap_set_bit - set a bit in a bitmap
- * @vi:		vfs inode describing the bitmap
- * @bit:	bit to set
- *
- * Set bit @bit in the bitmap described by the vfs inode @vi.
- *
- * Return 0 on success and -errno on error.
- */
-static inline int ntfs_bitmap_set_bit(struct inode *vi, const s64 bit)
-{
-	return ntfs_bitmap_set_run(vi, bit, 1);
-}
-
-/**
- * ntfs_bitmap_clear_bit - clear a bit in a bitmap
- * @vi:		vfs inode describing the bitmap
- * @bit:	bit to clear
- *
- * Clear bit @bit in the bitmap described by the vfs inode @vi.
- *
- * Return 0 on success and -errno on error.
- */
-static inline int ntfs_bitmap_clear_bit(struct inode *vi, const s64 bit)
-{
-	return ntfs_bitmap_clear_run(vi, bit, 1);
-}
-
-#endif /* NTFS_RW */
-
-#endif /* defined _LINUX_NTFS_BITMAP_H */
diff --git a/fs/ntfs/collate.c b/fs/ntfs/collate.c
deleted file mode 100644
index 3ab6ec96abfe..000000000000
--- a/fs/ntfs/collate.c
+++ /dev/null
@@ -1,110 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * collate.c - NTFS kernel collation handling.  Part of the Linux-NTFS project.
- *
- * Copyright (c) 2004 Anton Altaparmakov
- */
-
-#include "collate.h"
-#include "debug.h"
-#include "ntfs.h"
-
-static int ntfs_collate_binary(ntfs_volume *vol,
-		const void *data1, const int data1_len,
-		const void *data2, const int data2_len)
-{
-	int rc;
-
-	ntfs_debug("Entering.");
-	rc = memcmp(data1, data2, min(data1_len, data2_len));
-	if (!rc && (data1_len != data2_len)) {
-		if (data1_len < data2_len)
-			rc = -1;
-		else
-			rc = 1;
-	}
-	ntfs_debug("Done, returning %i", rc);
-	return rc;
-}
-
-static int ntfs_collate_ntofs_ulong(ntfs_volume *vol,
-		const void *data1, const int data1_len,
-		const void *data2, const int data2_len)
-{
-	int rc;
-	u32 d1, d2;
-
-	ntfs_debug("Entering.");
-	// FIXME:  We don't really want to bug here.
-	BUG_ON(data1_len != data2_len);
-	BUG_ON(data1_len != 4);
-	d1 = le32_to_cpup(data1);
-	d2 = le32_to_cpup(data2);
-	if (d1 < d2)
-		rc = -1;
-	else {
-		if (d1 == d2)
-			rc = 0;
-		else
-			rc = 1;
-	}
-	ntfs_debug("Done, returning %i", rc);
-	return rc;
-}
-
-typedef int (*ntfs_collate_func_t)(ntfs_volume *, const void *, const int,
-		const void *, const int);
-
-static ntfs_collate_func_t ntfs_do_collate0x0[3] = {
-	ntfs_collate_binary,
-	NULL/*ntfs_collate_file_name*/,
-	NULL/*ntfs_collate_unicode_string*/,
-};
-
-static ntfs_collate_func_t ntfs_do_collate0x1[4] = {
-	ntfs_collate_ntofs_ulong,
-	NULL/*ntfs_collate_ntofs_sid*/,
-	NULL/*ntfs_collate_ntofs_security_hash*/,
-	NULL/*ntfs_collate_ntofs_ulongs*/,
-};
-
-/**
- * ntfs_collate - collate two data items using a specified collation rule
- * @vol:	ntfs volume to which the data items belong
- * @cr:		collation rule to use when comparing the items
- * @data1:	first data item to collate
- * @data1_len:	length in bytes of @data1
- * @data2:	second data item to collate
- * @data2_len:	length in bytes of @data2
- *
- * Collate the two data items @data1 and @data2 using the collation rule @cr
- * and return -1, 0, ir 1 if @data1 is found, respectively, to collate before,
- * to match, or to collate after @data2.
- *
- * For speed we use the collation rule @cr as an index into two tables of
- * function pointers to call the appropriate collation function.
- */
-int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr,
-		const void *data1, const int data1_len,
-		const void *data2, const int data2_len) {
-	int i;
-
-	ntfs_debug("Entering.");
-	/*
-	 * FIXME:  At the moment we only support COLLATION_BINARY and
-	 * COLLATION_NTOFS_ULONG, so we BUG() for everything else for now.
-	 */
-	BUG_ON(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG);
-	i = le32_to_cpu(cr);
-	BUG_ON(i < 0);
-	if (i <= 0x02)
-		return ntfs_do_collate0x0[i](vol, data1, data1_len,
-				data2, data2_len);
-	BUG_ON(i < 0x10);
-	i -= 0x10;
-	if (likely(i <= 3))
-		return ntfs_do_collate0x1[i](vol, data1, data1_len,
-				data2, data2_len);
-	BUG();
-	return 0;
-}
diff --git a/fs/ntfs/collate.h b/fs/ntfs/collate.h
deleted file mode 100644
index f2255619b4f4..000000000000
--- a/fs/ntfs/collate.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * collate.h - Defines for NTFS kernel collation handling.  Part of the
- *	       Linux-NTFS project.
- *
- * Copyright (c) 2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_COLLATE_H
-#define _LINUX_NTFS_COLLATE_H
-
-#include "types.h"
-#include "volume.h"
-
-static inline bool ntfs_is_collation_rule_supported(COLLATION_RULE cr) {
-	int i;
-
-	/*
-	 * FIXME:  At the moment we only support COLLATION_BINARY and
-	 * COLLATION_NTOFS_ULONG, so we return false for everything else for
-	 * now.
-	 */
-	if (unlikely(cr != COLLATION_BINARY && cr != COLLATION_NTOFS_ULONG))
-		return false;
-	i = le32_to_cpu(cr);
-	if (likely(((i >= 0) && (i <= 0x02)) ||
-			((i >= 0x10) && (i <= 0x13))))
-		return true;
-	return false;
-}
-
-extern int ntfs_collate(ntfs_volume *vol, COLLATION_RULE cr,
-		const void *data1, const int data1_len,
-		const void *data2, const int data2_len);
-
-#endif /* _LINUX_NTFS_COLLATE_H */
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
deleted file mode 100644
index 761aaa0195d6..000000000000
--- a/fs/ntfs/compress.c
+++ /dev/null
@@ -1,950 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * compress.c - NTFS kernel compressed attributes handling.
- *		Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#include <linux/fs.h>
-#include <linux/buffer_head.h>
-#include <linux/blkdev.h>
-#include <linux/vmalloc.h>
-#include <linux/slab.h>
-
-#include "attrib.h"
-#include "inode.h"
-#include "debug.h"
-#include "ntfs.h"
-
-/**
- * ntfs_compression_constants - enum of constants used in the compression code
- */
-typedef enum {
-	/* Token types and access mask. */
-	NTFS_SYMBOL_TOKEN	=	0,
-	NTFS_PHRASE_TOKEN	=	1,
-	NTFS_TOKEN_MASK		=	1,
-
-	/* Compression sub-block constants. */
-	NTFS_SB_SIZE_MASK	=	0x0fff,
-	NTFS_SB_SIZE		=	0x1000,
-	NTFS_SB_IS_COMPRESSED	=	0x8000,
-
-	/*
-	 * The maximum compression block size is by definition 16 * the cluster
-	 * size, with the maximum supported cluster size being 4kiB. Thus the
-	 * maximum compression buffer size is 64kiB, so we use this when
-	 * initializing the compression buffer.
-	 */
-	NTFS_MAX_CB_SIZE	= 64 * 1024,
-} ntfs_compression_constants;
-
-/*
- * ntfs_compression_buffer - one buffer for the decompression engine
- */
-static u8 *ntfs_compression_buffer;
-
-/*
- * ntfs_cb_lock - spinlock which protects ntfs_compression_buffer
- */
-static DEFINE_SPINLOCK(ntfs_cb_lock);
-
-/**
- * allocate_compression_buffers - allocate the decompression buffers
- *
- * Caller has to hold the ntfs_lock mutex.
- *
- * Return 0 on success or -ENOMEM if the allocations failed.
- */
-int allocate_compression_buffers(void)
-{
-	BUG_ON(ntfs_compression_buffer);
-
-	ntfs_compression_buffer = vmalloc(NTFS_MAX_CB_SIZE);
-	if (!ntfs_compression_buffer)
-		return -ENOMEM;
-	return 0;
-}
-
-/**
- * free_compression_buffers - free the decompression buffers
- *
- * Caller has to hold the ntfs_lock mutex.
- */
-void free_compression_buffers(void)
-{
-	BUG_ON(!ntfs_compression_buffer);
-	vfree(ntfs_compression_buffer);
-	ntfs_compression_buffer = NULL;
-}
-
-/**
- * zero_partial_compressed_page - zero out of bounds compressed page region
- */
-static void zero_partial_compressed_page(struct page *page,
-		const s64 initialized_size)
-{
-	u8 *kp = page_address(page);
-	unsigned int kp_ofs;
-
-	ntfs_debug("Zeroing page region outside initialized size.");
-	if (((s64)page->index << PAGE_SHIFT) >= initialized_size) {
-		clear_page(kp);
-		return;
-	}
-	kp_ofs = initialized_size & ~PAGE_MASK;
-	memset(kp + kp_ofs, 0, PAGE_SIZE - kp_ofs);
-	return;
-}
-
-/**
- * handle_bounds_compressed_page - test for&handle out of bounds compressed page
- */
-static inline void handle_bounds_compressed_page(struct page *page,
-		const loff_t i_size, const s64 initialized_size)
-{
-	if ((page->index >= (initialized_size >> PAGE_SHIFT)) &&
-			(initialized_size < i_size))
-		zero_partial_compressed_page(page, initialized_size);
-	return;
-}
-
-/**
- * ntfs_decompress - decompress a compression block into an array of pages
- * @dest_pages:		destination array of pages
- * @completed_pages:	scratch space to track completed pages
- * @dest_index:		current index into @dest_pages (IN/OUT)
- * @dest_ofs:		current offset within @dest_pages[@dest_index] (IN/OUT)
- * @dest_max_index:	maximum index into @dest_pages (IN)
- * @dest_max_ofs:	maximum offset within @dest_pages[@dest_max_index] (IN)
- * @xpage:		the target page (-1 if none) (IN)
- * @xpage_done:		set to 1 if xpage was completed successfully (IN/OUT)
- * @cb_start:		compression block to decompress (IN)
- * @cb_size:		size of compression block @cb_start in bytes (IN)
- * @i_size:		file size when we started the read (IN)
- * @initialized_size:	initialized file size when we started the read (IN)
- *
- * The caller must have disabled preemption. ntfs_decompress() reenables it when
- * the critical section is finished.
- *
- * This decompresses the compression block @cb_start into the array of
- * destination pages @dest_pages starting at index @dest_index into @dest_pages
- * and at offset @dest_pos into the page @dest_pages[@dest_index].
- *
- * When the page @dest_pages[@xpage] is completed, @xpage_done is set to 1.
- * If xpage is -1 or @xpage has not been completed, @xpage_done is not modified.
- *
- * @cb_start is a pointer to the compression block which needs decompressing
- * and @cb_size is the size of @cb_start in bytes (8-64kiB).
- *
- * Return 0 if success or -EOVERFLOW on error in the compressed stream.
- * @xpage_done indicates whether the target page (@dest_pages[@xpage]) was
- * completed during the decompression of the compression block (@cb_start).
- *
- * Warning: This function *REQUIRES* PAGE_SIZE >= 4096 or it will blow up
- * unpredicatbly! You have been warned!
- *
- * Note to hackers: This function may not sleep until it has finished accessing
- * the compression block @cb_start as it is a per-CPU buffer.
- */
-static int ntfs_decompress(struct page *dest_pages[], int completed_pages[],
-		int *dest_index, int *dest_ofs, const int dest_max_index,
-		const int dest_max_ofs, const int xpage, char *xpage_done,
-		u8 *const cb_start, const u32 cb_size, const loff_t i_size,
-		const s64 initialized_size)
-{
-	/*
-	 * Pointers into the compressed data, i.e. the compression block (cb),
-	 * and the therein contained sub-blocks (sb).
-	 */
-	u8 *cb_end = cb_start + cb_size; /* End of cb. */
-	u8 *cb = cb_start;	/* Current position in cb. */
-	u8 *cb_sb_start;	/* Beginning of the current sb in the cb. */
-	u8 *cb_sb_end;		/* End of current sb / beginning of next sb. */
-
-	/* Variables for uncompressed data / destination. */
-	struct page *dp;	/* Current destination page being worked on. */
-	u8 *dp_addr;		/* Current pointer into dp. */
-	u8 *dp_sb_start;	/* Start of current sub-block in dp. */
-	u8 *dp_sb_end;		/* End of current sb in dp (dp_sb_start +
-				   NTFS_SB_SIZE). */
-	u16 do_sb_start;	/* @dest_ofs when starting this sub-block. */
-	u16 do_sb_end;		/* @dest_ofs of end of this sb (do_sb_start +
-				   NTFS_SB_SIZE). */
-
-	/* Variables for tag and token parsing. */
-	u8 tag;			/* Current tag. */
-	int token;		/* Loop counter for the eight tokens in tag. */
-	int nr_completed_pages = 0;
-
-	/* Default error code. */
-	int err = -EOVERFLOW;
-
-	ntfs_debug("Entering, cb_size = 0x%x.", cb_size);
-do_next_sb:
-	ntfs_debug("Beginning sub-block at offset = 0x%zx in the cb.",
-			cb - cb_start);
-	/*
-	 * Have we reached the end of the compression block or the end of the
-	 * decompressed data?  The latter can happen for example if the current
-	 * position in the compression block is one byte before its end so the
-	 * first two checks do not detect it.
-	 */
-	if (cb == cb_end || !le16_to_cpup((le16*)cb) ||
-			(*dest_index == dest_max_index &&
-			*dest_ofs == dest_max_ofs)) {
-		int i;
-
-		ntfs_debug("Completed. Returning success (0).");
-		err = 0;
-return_error:
-		/* We can sleep from now on, so we drop lock. */
-		spin_unlock(&ntfs_cb_lock);
-		/* Second stage: finalize completed pages. */
-		if (nr_completed_pages > 0) {
-			for (i = 0; i < nr_completed_pages; i++) {
-				int di = completed_pages[i];
-
-				dp = dest_pages[di];
-				/*
-				 * If we are outside the initialized size, zero
-				 * the out of bounds page range.
-				 */
-				handle_bounds_compressed_page(dp, i_size,
-						initialized_size);
-				flush_dcache_page(dp);
-				kunmap(dp);
-				SetPageUptodate(dp);
-				unlock_page(dp);
-				if (di == xpage)
-					*xpage_done = 1;
-				else
-					put_page(dp);
-				dest_pages[di] = NULL;
-			}
-		}
-		return err;
-	}
-
-	/* Setup offsets for the current sub-block destination. */
-	do_sb_start = *dest_ofs;
-	do_sb_end = do_sb_start + NTFS_SB_SIZE;
-
-	/* Check that we are still within allowed boundaries. */
-	if (*dest_index == dest_max_index && do_sb_end > dest_max_ofs)
-		goto return_overflow;
-
-	/* Does the minimum size of a compressed sb overflow valid range? */
-	if (cb + 6 > cb_end)
-		goto return_overflow;
-
-	/* Setup the current sub-block source pointers and validate range. */
-	cb_sb_start = cb;
-	cb_sb_end = cb_sb_start + (le16_to_cpup((le16*)cb) & NTFS_SB_SIZE_MASK)
-			+ 3;
-	if (cb_sb_end > cb_end)
-		goto return_overflow;
-
-	/* Get the current destination page. */
-	dp = dest_pages[*dest_index];
-	if (!dp) {
-		/* No page present. Skip decompression of this sub-block. */
-		cb = cb_sb_end;
-
-		/* Advance destination position to next sub-block. */
-		*dest_ofs = (*dest_ofs + NTFS_SB_SIZE) & ~PAGE_MASK;
-		if (!*dest_ofs && (++*dest_index > dest_max_index))
-			goto return_overflow;
-		goto do_next_sb;
-	}
-
-	/* We have a valid destination page. Setup the destination pointers. */
-	dp_addr = (u8*)page_address(dp) + do_sb_start;
-
-	/* Now, we are ready to process the current sub-block (sb). */
-	if (!(le16_to_cpup((le16*)cb) & NTFS_SB_IS_COMPRESSED)) {
-		ntfs_debug("Found uncompressed sub-block.");
-		/* This sb is not compressed, just copy it into destination. */
-
-		/* Advance source position to first data byte. */
-		cb += 2;
-
-		/* An uncompressed sb must be full size. */
-		if (cb_sb_end - cb != NTFS_SB_SIZE)
-			goto return_overflow;
-
-		/* Copy the block and advance the source position. */
-		memcpy(dp_addr, cb, NTFS_SB_SIZE);
-		cb += NTFS_SB_SIZE;
-
-		/* Advance destination position to next sub-block. */
-		*dest_ofs += NTFS_SB_SIZE;
-		if (!(*dest_ofs &= ~PAGE_MASK)) {
-finalize_page:
-			/*
-			 * First stage: add current page index to array of
-			 * completed pages.
-			 */
-			completed_pages[nr_completed_pages++] = *dest_index;
-			if (++*dest_index > dest_max_index)
-				goto return_overflow;
-		}
-		goto do_next_sb;
-	}
-	ntfs_debug("Found compressed sub-block.");
-	/* This sb is compressed, decompress it into destination. */
-
-	/* Setup destination pointers. */
-	dp_sb_start = dp_addr;
-	dp_sb_end = dp_sb_start + NTFS_SB_SIZE;
-
-	/* Forward to the first tag in the sub-block. */
-	cb += 2;
-do_next_tag:
-	if (cb == cb_sb_end) {
-		/* Check if the decompressed sub-block was not full-length. */
-		if (dp_addr < dp_sb_end) {
-			int nr_bytes = do_sb_end - *dest_ofs;
-
-			ntfs_debug("Filling incomplete sub-block with "
-					"zeroes.");
-			/* Zero remainder and update destination position. */
-			memset(dp_addr, 0, nr_bytes);
-			*dest_ofs += nr_bytes;
-		}
-		/* We have finished the current sub-block. */
-		if (!(*dest_ofs &= ~PAGE_MASK))
-			goto finalize_page;
-		goto do_next_sb;
-	}
-
-	/* Check we are still in range. */
-	if (cb > cb_sb_end || dp_addr > dp_sb_end)
-		goto return_overflow;
-
-	/* Get the next tag and advance to first token. */
-	tag = *cb++;
-
-	/* Parse the eight tokens described by the tag. */
-	for (token = 0; token < 8; token++, tag >>= 1) {
-		u16 lg, pt, length, max_non_overlap;
-		register u16 i;
-		u8 *dp_back_addr;
-
-		/* Check if we are done / still in range. */
-		if (cb >= cb_sb_end || dp_addr > dp_sb_end)
-			break;
-
-		/* Determine token type and parse appropriately.*/
-		if ((tag & NTFS_TOKEN_MASK) == NTFS_SYMBOL_TOKEN) {
-			/*
-			 * We have a symbol token, copy the symbol across, and
-			 * advance the source and destination positions.
-			 */
-			*dp_addr++ = *cb++;
-			++*dest_ofs;
-
-			/* Continue with the next token. */
-			continue;
-		}
-
-		/*
-		 * We have a phrase token. Make sure it is not the first tag in
-		 * the sb as this is illegal and would confuse the code below.
-		 */
-		if (dp_addr == dp_sb_start)
-			goto return_overflow;
-
-		/*
-		 * Determine the number of bytes to go back (p) and the number
-		 * of bytes to copy (l). We use an optimized algorithm in which
-		 * we first calculate log2(current destination position in sb),
-		 * which allows determination of l and p in O(1) rather than
-		 * O(n). We just need an arch-optimized log2() function now.
-		 */
-		lg = 0;
-		for (i = *dest_ofs - do_sb_start - 1; i >= 0x10; i >>= 1)
-			lg++;
-
-		/* Get the phrase token into i. */
-		pt = le16_to_cpup((le16*)cb);
-
-		/*
-		 * Calculate starting position of the byte sequence in
-		 * the destination using the fact that p = (pt >> (12 - lg)) + 1
-		 * and make sure we don't go too far back.
-		 */
-		dp_back_addr = dp_addr - (pt >> (12 - lg)) - 1;
-		if (dp_back_addr < dp_sb_start)
-			goto return_overflow;
-
-		/* Now calculate the length of the byte sequence. */
-		length = (pt & (0xfff >> lg)) + 3;
-
-		/* Advance destination position and verify it is in range. */
-		*dest_ofs += length;
-		if (*dest_ofs > do_sb_end)
-			goto return_overflow;
-
-		/* The number of non-overlapping bytes. */
-		max_non_overlap = dp_addr - dp_back_addr;
-
-		if (length <= max_non_overlap) {
-			/* The byte sequence doesn't overlap, just copy it. */
-			memcpy(dp_addr, dp_back_addr, length);
-
-			/* Advance destination pointer. */
-			dp_addr += length;
-		} else {
-			/*
-			 * The byte sequence does overlap, copy non-overlapping
-			 * part and then do a slow byte by byte copy for the
-			 * overlapping part. Also, advance the destination
-			 * pointer.
-			 */
-			memcpy(dp_addr, dp_back_addr, max_non_overlap);
-			dp_addr += max_non_overlap;
-			dp_back_addr += max_non_overlap;
-			length -= max_non_overlap;
-			while (length--)
-				*dp_addr++ = *dp_back_addr++;
-		}
-
-		/* Advance source position and continue with the next token. */
-		cb += 2;
-	}
-
-	/* No tokens left in the current tag. Continue with the next tag. */
-	goto do_next_tag;
-
-return_overflow:
-	ntfs_error(NULL, "Failed. Returning -EOVERFLOW.");
-	goto return_error;
-}
-
-/**
- * ntfs_read_compressed_block - read a compressed block into the page cache
- * @page:	locked page in the compression block(s) we need to read
- *
- * When we are called the page has already been verified to be locked and the
- * attribute is known to be non-resident, not encrypted, but compressed.
- *
- * 1. Determine which compression block(s) @page is in.
- * 2. Get hold of all pages corresponding to this/these compression block(s).
- * 3. Read the (first) compression block.
- * 4. Decompress it into the corresponding pages.
- * 5. Throw the compressed data away and proceed to 3. for the next compression
- *    block or return success if no more compression blocks left.
- *
- * Warning: We have to be careful what we do about existing pages. They might
- * have been written to so that we would lose data if we were to just overwrite
- * them with the out-of-date uncompressed data.
- *
- * FIXME: For PAGE_SIZE > cb_size we are not doing the Right Thing(TM) at
- * the end of the file I think. We need to detect this case and zero the out
- * of bounds remainder of the page in question and mark it as handled. At the
- * moment we would just return -EIO on such a page. This bug will only become
- * apparent if pages are above 8kiB and the NTFS volume only uses 512 byte
- * clusters so is probably not going to be seen by anyone. Still this should
- * be fixed. (AIA)
- *
- * FIXME: Again for PAGE_SIZE > cb_size we are screwing up both in
- * handling sparse and compressed cbs. (AIA)
- *
- * FIXME: At the moment we don't do any zeroing out in the case that
- * initialized_size is less than data_size. This should be safe because of the
- * nature of the compression algorithm used. Just in case we check and output
- * an error message in read inode if the two sizes are not equal for a
- * compressed file. (AIA)
- */
-int ntfs_read_compressed_block(struct page *page)
-{
-	loff_t i_size;
-	s64 initialized_size;
-	struct address_space *mapping = page->mapping;
-	ntfs_inode *ni = NTFS_I(mapping->host);
-	ntfs_volume *vol = ni->vol;
-	struct super_block *sb = vol->sb;
-	runlist_element *rl;
-	unsigned long flags, block_size = sb->s_blocksize;
-	unsigned char block_size_bits = sb->s_blocksize_bits;
-	u8 *cb, *cb_pos, *cb_end;
-	struct buffer_head **bhs;
-	unsigned long offset, index = page->index;
-	u32 cb_size = ni->itype.compressed.block_size;
-	u64 cb_size_mask = cb_size - 1UL;
-	VCN vcn;
-	LCN lcn;
-	/* The first wanted vcn (minimum alignment is PAGE_SIZE). */
-	VCN start_vcn = (((s64)index << PAGE_SHIFT) & ~cb_size_mask) >>
-			vol->cluster_size_bits;
-	/*
-	 * The first vcn after the last wanted vcn (minimum alignment is again
-	 * PAGE_SIZE.
-	 */
-	VCN end_vcn = ((((s64)(index + 1UL) << PAGE_SHIFT) + cb_size - 1)
-			& ~cb_size_mask) >> vol->cluster_size_bits;
-	/* Number of compression blocks (cbs) in the wanted vcn range. */
-	unsigned int nr_cbs = (end_vcn - start_vcn) << vol->cluster_size_bits
-			>> ni->itype.compressed.block_size_bits;
-	/*
-	 * Number of pages required to store the uncompressed data from all
-	 * compression blocks (cbs) overlapping @page. Due to alignment
-	 * guarantees of start_vcn and end_vcn, no need to round up here.
-	 */
-	unsigned int nr_pages = (end_vcn - start_vcn) <<
-			vol->cluster_size_bits >> PAGE_SHIFT;
-	unsigned int xpage, max_page, cur_page, cur_ofs, i;
-	unsigned int cb_clusters, cb_max_ofs;
-	int block, max_block, cb_max_page, bhs_size, nr_bhs, err = 0;
-	struct page **pages;
-	int *completed_pages;
-	unsigned char xpage_done = 0;
-
-	ntfs_debug("Entering, page->index = 0x%lx, cb_size = 0x%x, nr_pages = "
-			"%i.", index, cb_size, nr_pages);
-	/*
-	 * Bad things happen if we get here for anything that is not an
-	 * unnamed $DATA attribute.
-	 */
-	BUG_ON(ni->type != AT_DATA);
-	BUG_ON(ni->name_len);
-
-	pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS);
-	completed_pages = kmalloc_array(nr_pages + 1, sizeof(int), GFP_NOFS);
-
-	/* Allocate memory to store the buffer heads we need. */
-	bhs_size = cb_size / block_size * sizeof(struct buffer_head *);
-	bhs = kmalloc(bhs_size, GFP_NOFS);
-
-	if (unlikely(!pages || !bhs || !completed_pages)) {
-		kfree(bhs);
-		kfree(pages);
-		kfree(completed_pages);
-		unlock_page(page);
-		ntfs_error(vol->sb, "Failed to allocate internal buffers.");
-		return -ENOMEM;
-	}
-
-	/*
-	 * We have already been given one page, this is the one we must do.
-	 * Once again, the alignment guarantees keep it simple.
-	 */
-	offset = start_vcn << vol->cluster_size_bits >> PAGE_SHIFT;
-	xpage = index - offset;
-	pages[xpage] = page;
-	/*
-	 * The remaining pages need to be allocated and inserted into the page
-	 * cache, alignment guarantees keep all the below much simpler. (-8
-	 */
-	read_lock_irqsave(&ni->size_lock, flags);
-	i_size = i_size_read(VFS_I(ni));
-	initialized_size = ni->initialized_size;
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	max_page = ((i_size + PAGE_SIZE - 1) >> PAGE_SHIFT) -
-			offset;
-	/* Is the page fully outside i_size? (truncate in progress) */
-	if (xpage >= max_page) {
-		kfree(bhs);
-		kfree(pages);
-		kfree(completed_pages);
-		zero_user(page, 0, PAGE_SIZE);
-		ntfs_debug("Compressed read outside i_size - truncated?");
-		SetPageUptodate(page);
-		unlock_page(page);
-		return 0;
-	}
-	if (nr_pages < max_page)
-		max_page = nr_pages;
-	for (i = 0; i < max_page; i++, offset++) {
-		if (i != xpage)
-			pages[i] = grab_cache_page_nowait(mapping, offset);
-		page = pages[i];
-		if (page) {
-			/*
-			 * We only (re)read the page if it isn't already read
-			 * in and/or dirty or we would be losing data or at
-			 * least wasting our time.
-			 */
-			if (!PageDirty(page) && (!PageUptodate(page) ||
-					PageError(page))) {
-				ClearPageError(page);
-				kmap(page);
-				continue;
-			}
-			unlock_page(page);
-			put_page(page);
-			pages[i] = NULL;
-		}
-	}
-
-	/*
-	 * We have the runlist, and all the destination pages we need to fill.
-	 * Now read the first compression block.
-	 */
-	cur_page = 0;
-	cur_ofs = 0;
-	cb_clusters = ni->itype.compressed.block_clusters;
-do_next_cb:
-	nr_cbs--;
-	nr_bhs = 0;
-
-	/* Read all cb buffer heads one cluster at a time. */
-	rl = NULL;
-	for (vcn = start_vcn, start_vcn += cb_clusters; vcn < start_vcn;
-			vcn++) {
-		bool is_retry = false;
-
-		if (!rl) {
-lock_retry_remap:
-			down_read(&ni->runlist.lock);
-			rl = ni->runlist.rl;
-		}
-		if (likely(rl != NULL)) {
-			/* Seek to element containing target vcn. */
-			while (rl->length && rl[1].vcn <= vcn)
-				rl++;
-			lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
-		} else
-			lcn = LCN_RL_NOT_MAPPED;
-		ntfs_debug("Reading vcn = 0x%llx, lcn = 0x%llx.",
-				(unsigned long long)vcn,
-				(unsigned long long)lcn);
-		if (lcn < 0) {
-			/*
-			 * When we reach the first sparse cluster we have
-			 * finished with the cb.
-			 */
-			if (lcn == LCN_HOLE)
-				break;
-			if (is_retry || lcn != LCN_RL_NOT_MAPPED)
-				goto rl_err;
-			is_retry = true;
-			/*
-			 * Attempt to map runlist, dropping lock for the
-			 * duration.
-			 */
-			up_read(&ni->runlist.lock);
-			if (!ntfs_map_runlist(ni, vcn))
-				goto lock_retry_remap;
-			goto map_rl_err;
-		}
-		block = lcn << vol->cluster_size_bits >> block_size_bits;
-		/* Read the lcn from device in chunks of block_size bytes. */
-		max_block = block + (vol->cluster_size >> block_size_bits);
-		do {
-			ntfs_debug("block = 0x%x.", block);
-			if (unlikely(!(bhs[nr_bhs] = sb_getblk(sb, block))))
-				goto getblk_err;
-			nr_bhs++;
-		} while (++block < max_block);
-	}
-
-	/* Release the lock if we took it. */
-	if (rl)
-		up_read(&ni->runlist.lock);
-
-	/* Setup and initiate io on all buffer heads. */
-	for (i = 0; i < nr_bhs; i++) {
-		struct buffer_head *tbh = bhs[i];
-
-		if (!trylock_buffer(tbh))
-			continue;
-		if (unlikely(buffer_uptodate(tbh))) {
-			unlock_buffer(tbh);
-			continue;
-		}
-		get_bh(tbh);
-		tbh->b_end_io = end_buffer_read_sync;
-		submit_bh(REQ_OP_READ, tbh);
-	}
-
-	/* Wait for io completion on all buffer heads. */
-	for (i = 0; i < nr_bhs; i++) {
-		struct buffer_head *tbh = bhs[i];
-
-		if (buffer_uptodate(tbh))
-			continue;
-		wait_on_buffer(tbh);
-		/*
-		 * We need an optimization barrier here, otherwise we start
-		 * hitting the below fixup code when accessing a loopback
-		 * mounted ntfs partition. This indicates either there is a
-		 * race condition in the loop driver or, more likely, gcc
-		 * overoptimises the code without the barrier and it doesn't
-		 * do the Right Thing(TM).
-		 */
-		barrier();
-		if (unlikely(!buffer_uptodate(tbh))) {
-			ntfs_warning(vol->sb, "Buffer is unlocked but not "
-					"uptodate! Unplugging the disk queue "
-					"and rescheduling.");
-			get_bh(tbh);
-			io_schedule();
-			put_bh(tbh);
-			if (unlikely(!buffer_uptodate(tbh)))
-				goto read_err;
-			ntfs_warning(vol->sb, "Buffer is now uptodate. Good.");
-		}
-	}
-
-	/*
-	 * Get the compression buffer. We must not sleep any more
-	 * until we are finished with it.
-	 */
-	spin_lock(&ntfs_cb_lock);
-	cb = ntfs_compression_buffer;
-
-	BUG_ON(!cb);
-
-	cb_pos = cb;
-	cb_end = cb + cb_size;
-
-	/* Copy the buffer heads into the contiguous buffer. */
-	for (i = 0; i < nr_bhs; i++) {
-		memcpy(cb_pos, bhs[i]->b_data, block_size);
-		cb_pos += block_size;
-	}
-
-	/* Just a precaution. */
-	if (cb_pos + 2 <= cb + cb_size)
-		*(u16*)cb_pos = 0;
-
-	/* Reset cb_pos back to the beginning. */
-	cb_pos = cb;
-
-	/* We now have both source (if present) and destination. */
-	ntfs_debug("Successfully read the compression block.");
-
-	/* The last page and maximum offset within it for the current cb. */
-	cb_max_page = (cur_page << PAGE_SHIFT) + cur_ofs + cb_size;
-	cb_max_ofs = cb_max_page & ~PAGE_MASK;
-	cb_max_page >>= PAGE_SHIFT;
-
-	/* Catch end of file inside a compression block. */
-	if (cb_max_page > max_page)
-		cb_max_page = max_page;
-
-	if (vcn == start_vcn - cb_clusters) {
-		/* Sparse cb, zero out page range overlapping the cb. */
-		ntfs_debug("Found sparse compression block.");
-		/* We can sleep from now on, so we drop lock. */
-		spin_unlock(&ntfs_cb_lock);
-		if (cb_max_ofs)
-			cb_max_page--;
-		for (; cur_page < cb_max_page; cur_page++) {
-			page = pages[cur_page];
-			if (page) {
-				if (likely(!cur_ofs))
-					clear_page(page_address(page));
-				else
-					memset(page_address(page) + cur_ofs, 0,
-							PAGE_SIZE -
-							cur_ofs);
-				flush_dcache_page(page);
-				kunmap(page);
-				SetPageUptodate(page);
-				unlock_page(page);
-				if (cur_page == xpage)
-					xpage_done = 1;
-				else
-					put_page(page);
-				pages[cur_page] = NULL;
-			}
-			cb_pos += PAGE_SIZE - cur_ofs;
-			cur_ofs = 0;
-			if (cb_pos >= cb_end)
-				break;
-		}
-		/* If we have a partial final page, deal with it now. */
-		if (cb_max_ofs && cb_pos < cb_end) {
-			page = pages[cur_page];
-			if (page)
-				memset(page_address(page) + cur_ofs, 0,
-						cb_max_ofs - cur_ofs);
-			/*
-			 * No need to update cb_pos at this stage:
-			 *	cb_pos += cb_max_ofs - cur_ofs;
-			 */
-			cur_ofs = cb_max_ofs;
-		}
-	} else if (vcn == start_vcn) {
-		/* We can't sleep so we need two stages. */
-		unsigned int cur2_page = cur_page;
-		unsigned int cur_ofs2 = cur_ofs;
-		u8 *cb_pos2 = cb_pos;
-
-		ntfs_debug("Found uncompressed compression block.");
-		/* Uncompressed cb, copy it to the destination pages. */
-		/*
-		 * TODO: As a big optimization, we could detect this case
-		 * before we read all the pages and use block_read_full_folio()
-		 * on all full pages instead (we still have to treat partial
-		 * pages especially but at least we are getting rid of the
-		 * synchronous io for the majority of pages.
-		 * Or if we choose not to do the read-ahead/-behind stuff, we
-		 * could just return block_read_full_folio(pages[xpage]) as long
-		 * as PAGE_SIZE <= cb_size.
-		 */
-		if (cb_max_ofs)
-			cb_max_page--;
-		/* First stage: copy data into destination pages. */
-		for (; cur_page < cb_max_page; cur_page++) {
-			page = pages[cur_page];
-			if (page)
-				memcpy(page_address(page) + cur_ofs, cb_pos,
-						PAGE_SIZE - cur_ofs);
-			cb_pos += PAGE_SIZE - cur_ofs;
-			cur_ofs = 0;
-			if (cb_pos >= cb_end)
-				break;
-		}
-		/* If we have a partial final page, deal with it now. */
-		if (cb_max_ofs && cb_pos < cb_end) {
-			page = pages[cur_page];
-			if (page)
-				memcpy(page_address(page) + cur_ofs, cb_pos,
-						cb_max_ofs - cur_ofs);
-			cb_pos += cb_max_ofs - cur_ofs;
-			cur_ofs = cb_max_ofs;
-		}
-		/* We can sleep from now on, so drop lock. */
-		spin_unlock(&ntfs_cb_lock);
-		/* Second stage: finalize pages. */
-		for (; cur2_page < cb_max_page; cur2_page++) {
-			page = pages[cur2_page];
-			if (page) {
-				/*
-				 * If we are outside the initialized size, zero
-				 * the out of bounds page range.
-				 */
-				handle_bounds_compressed_page(page, i_size,
-						initialized_size);
-				flush_dcache_page(page);
-				kunmap(page);
-				SetPageUptodate(page);
-				unlock_page(page);
-				if (cur2_page == xpage)
-					xpage_done = 1;
-				else
-					put_page(page);
-				pages[cur2_page] = NULL;
-			}
-			cb_pos2 += PAGE_SIZE - cur_ofs2;
-			cur_ofs2 = 0;
-			if (cb_pos2 >= cb_end)
-				break;
-		}
-	} else {
-		/* Compressed cb, decompress it into the destination page(s). */
-		unsigned int prev_cur_page = cur_page;
-
-		ntfs_debug("Found compressed compression block.");
-		err = ntfs_decompress(pages, completed_pages, &cur_page,
-				&cur_ofs, cb_max_page, cb_max_ofs, xpage,
-				&xpage_done, cb_pos, cb_size - (cb_pos - cb),
-				i_size, initialized_size);
-		/*
-		 * We can sleep from now on, lock already dropped by
-		 * ntfs_decompress().
-		 */
-		if (err) {
-			ntfs_error(vol->sb, "ntfs_decompress() failed in inode "
-					"0x%lx with error code %i. Skipping "
-					"this compression block.",
-					ni->mft_no, -err);
-			/* Release the unfinished pages. */
-			for (; prev_cur_page < cur_page; prev_cur_page++) {
-				page = pages[prev_cur_page];
-				if (page) {
-					flush_dcache_page(page);
-					kunmap(page);
-					unlock_page(page);
-					if (prev_cur_page != xpage)
-						put_page(page);
-					pages[prev_cur_page] = NULL;
-				}
-			}
-		}
-	}
-
-	/* Release the buffer heads. */
-	for (i = 0; i < nr_bhs; i++)
-		brelse(bhs[i]);
-
-	/* Do we have more work to do? */
-	if (nr_cbs)
-		goto do_next_cb;
-
-	/* We no longer need the list of buffer heads. */
-	kfree(bhs);
-
-	/* Clean up if we have any pages left. Should never happen. */
-	for (cur_page = 0; cur_page < max_page; cur_page++) {
-		page = pages[cur_page];
-		if (page) {
-			ntfs_error(vol->sb, "Still have pages left! "
-					"Terminating them with extreme "
-					"prejudice.  Inode 0x%lx, page index "
-					"0x%lx.", ni->mft_no, page->index);
-			flush_dcache_page(page);
-			kunmap(page);
-			unlock_page(page);
-			if (cur_page != xpage)
-				put_page(page);
-			pages[cur_page] = NULL;
-		}
-	}
-
-	/* We no longer need the list of pages. */
-	kfree(pages);
-	kfree(completed_pages);
-
-	/* If we have completed the requested page, we return success. */
-	if (likely(xpage_done))
-		return 0;
-
-	ntfs_debug("Failed. Returning error code %s.", err == -EOVERFLOW ?
-			"EOVERFLOW" : (!err ? "EIO" : "unknown error"));
-	return err < 0 ? err : -EIO;
-
-read_err:
-	ntfs_error(vol->sb, "IO error while reading compressed data.");
-	/* Release the buffer heads. */
-	for (i = 0; i < nr_bhs; i++)
-		brelse(bhs[i]);
-	goto err_out;
-
-map_rl_err:
-	ntfs_error(vol->sb, "ntfs_map_runlist() failed. Cannot read "
-			"compression block.");
-	goto err_out;
-
-rl_err:
-	up_read(&ni->runlist.lock);
-	ntfs_error(vol->sb, "ntfs_rl_vcn_to_lcn() failed. Cannot read "
-			"compression block.");
-	goto err_out;
-
-getblk_err:
-	up_read(&ni->runlist.lock);
-	ntfs_error(vol->sb, "getblk() failed. Cannot read compression block.");
-
-err_out:
-	kfree(bhs);
-	for (i = cur_page; i < max_page; i++) {
-		page = pages[i];
-		if (page) {
-			flush_dcache_page(page);
-			kunmap(page);
-			unlock_page(page);
-			if (i != xpage)
-				put_page(page);
-		}
-	}
-	kfree(pages);
-	kfree(completed_pages);
-	return -EIO;
-}
diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c
deleted file mode 100644
index a3c1c5656f8f..000000000000
--- a/fs/ntfs/debug.c
+++ /dev/null
@@ -1,159 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * debug.c - NTFS kernel debug support. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include "debug.h"
-
-/**
- * __ntfs_warning - output a warning to the syslog
- * @function:	name of function outputting the warning
- * @sb:		super block of mounted ntfs filesystem
- * @fmt:	warning string containing format specifications
- * @...:	a variable number of arguments specified in @fmt
- *
- * Outputs a warning to the syslog for the mounted ntfs filesystem described
- * by @sb.
- *
- * @fmt and the corresponding @... is printf style format string containing
- * the warning string and the corresponding format arguments, respectively.
- *
- * @function is the name of the function from which __ntfs_warning is being
- * called.
- *
- * Note, you should be using debug.h::ntfs_warning(@sb, @fmt, @...) instead
- * as this provides the @function parameter automatically.
- */
-void __ntfs_warning(const char *function, const struct super_block *sb,
-		const char *fmt, ...)
-{
-	struct va_format vaf;
-	va_list args;
-	int flen = 0;
-
-#ifndef DEBUG
-	if (!printk_ratelimit())
-		return;
-#endif
-	if (function)
-		flen = strlen(function);
-	va_start(args, fmt);
-	vaf.fmt = fmt;
-	vaf.va = &args;
-	if (sb)
-		pr_warn("(device %s): %s(): %pV\n",
-			sb->s_id, flen ? function : "", &vaf);
-	else
-		pr_warn("%s(): %pV\n", flen ? function : "", &vaf);
-	va_end(args);
-}
-
-/**
- * __ntfs_error - output an error to the syslog
- * @function:	name of function outputting the error
- * @sb:		super block of mounted ntfs filesystem
- * @fmt:	error string containing format specifications
- * @...:	a variable number of arguments specified in @fmt
- *
- * Outputs an error to the syslog for the mounted ntfs filesystem described
- * by @sb.
- *
- * @fmt and the corresponding @... is printf style format string containing
- * the error string and the corresponding format arguments, respectively.
- *
- * @function is the name of the function from which __ntfs_error is being
- * called.
- *
- * Note, you should be using debug.h::ntfs_error(@sb, @fmt, @...) instead
- * as this provides the @function parameter automatically.
- */
-void __ntfs_error(const char *function, const struct super_block *sb,
-		const char *fmt, ...)
-{
-	struct va_format vaf;
-	va_list args;
-	int flen = 0;
-
-#ifndef DEBUG
-	if (!printk_ratelimit())
-		return;
-#endif
-	if (function)
-		flen = strlen(function);
-	va_start(args, fmt);
-	vaf.fmt = fmt;
-	vaf.va = &args;
-	if (sb)
-		pr_err("(device %s): %s(): %pV\n",
-		       sb->s_id, flen ? function : "", &vaf);
-	else
-		pr_err("%s(): %pV\n", flen ? function : "", &vaf);
-	va_end(args);
-}
-
-#ifdef DEBUG
-
-/* If 1, output debug messages, and if 0, don't. */
-int debug_msgs = 0;
-
-void __ntfs_debug(const char *file, int line, const char *function,
-		const char *fmt, ...)
-{
-	struct va_format vaf;
-	va_list args;
-	int flen = 0;
-
-	if (!debug_msgs)
-		return;
-	if (function)
-		flen = strlen(function);
-	va_start(args, fmt);
-	vaf.fmt = fmt;
-	vaf.va = &args;
-	pr_debug("(%s, %d): %s(): %pV", file, line, flen ? function : "", &vaf);
-	va_end(args);
-}
-
-/* Dump a runlist. Caller has to provide synchronisation for @rl. */
-void ntfs_debug_dump_runlist(const runlist_element *rl)
-{
-	int i;
-	const char *lcn_str[5] = { "LCN_HOLE         ", "LCN_RL_NOT_MAPPED",
-				   "LCN_ENOENT       ", "LCN_unknown      " };
-
-	if (!debug_msgs)
-		return;
-	pr_debug("Dumping runlist (values in hex):\n");
-	if (!rl) {
-		pr_debug("Run list not present.\n");
-		return;
-	}
-	pr_debug("VCN              LCN               Run length\n");
-	for (i = 0; ; i++) {
-		LCN lcn = (rl + i)->lcn;
-
-		if (lcn < (LCN)0) {
-			int index = -lcn - 1;
-
-			if (index > -LCN_ENOENT - 1)
-				index = 3;
-			pr_debug("%-16Lx %s %-16Lx%s\n",
-					(long long)(rl + i)->vcn, lcn_str[index],
-					(long long)(rl + i)->length,
-					(rl + i)->length ? "" :
-						" (runlist end)");
-		} else
-			pr_debug("%-16Lx %-16Lx  %-16Lx%s\n",
-					(long long)(rl + i)->vcn,
-					(long long)(rl + i)->lcn,
-					(long long)(rl + i)->length,
-					(rl + i)->length ? "" :
-						" (runlist end)");
-		if (!(rl + i)->length)
-			break;
-	}
-}
-
-#endif
diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h
deleted file mode 100644
index 6fdef388f129..000000000000
--- a/fs/ntfs/debug.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * debug.h - NTFS kernel debug support. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_DEBUG_H
-#define _LINUX_NTFS_DEBUG_H
-
-#include <linux/fs.h>
-
-#include "runlist.h"
-
-#ifdef DEBUG
-
-extern int debug_msgs;
-
-extern __printf(4, 5)
-void __ntfs_debug(const char *file, int line, const char *function,
-		  const char *format, ...);
-/**
- * ntfs_debug - write a debug level message to syslog
- * @f:		a printf format string containing the message
- * @...:	the variables to substitute into @f
- *
- * ntfs_debug() writes a DEBUG level message to the syslog but only if the
- * driver was compiled with -DDEBUG. Otherwise, the call turns into a NOP.
- */
-#define ntfs_debug(f, a...)						\
-	__ntfs_debug(__FILE__, __LINE__, __func__, f, ##a)
-
-extern void ntfs_debug_dump_runlist(const runlist_element *rl);
-
-#else	/* !DEBUG */
-
-#define ntfs_debug(fmt, ...)						\
-do {									\
-	if (0)								\
-		no_printk(fmt, ##__VA_ARGS__);				\
-} while (0)
-
-#define ntfs_debug_dump_runlist(rl)	do {} while (0)
-
-#endif	/* !DEBUG */
-
-extern  __printf(3, 4)
-void __ntfs_warning(const char *function, const struct super_block *sb,
-		    const char *fmt, ...);
-#define ntfs_warning(sb, f, a...)	__ntfs_warning(__func__, sb, f, ##a)
-
-extern  __printf(3, 4)
-void __ntfs_error(const char *function, const struct super_block *sb,
-		  const char *fmt, ...);
-#define ntfs_error(sb, f, a...)		__ntfs_error(__func__, sb, f, ##a)
-
-#endif /* _LINUX_NTFS_DEBUG_H */
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
deleted file mode 100644
index 629723a8d712..000000000000
--- a/fs/ntfs/dir.c
+++ /dev/null
@@ -1,1540 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * dir.c - NTFS kernel directory operations. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2007 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#include <linux/buffer_head.h>
-#include <linux/slab.h>
-#include <linux/blkdev.h>
-
-#include "dir.h"
-#include "aops.h"
-#include "attrib.h"
-#include "mft.h"
-#include "debug.h"
-#include "ntfs.h"
-
-/*
- * The little endian Unicode string $I30 as a global constant.
- */
-ntfschar I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'),
-		cpu_to_le16('3'),	cpu_to_le16('0'), 0 };
-
-/**
- * ntfs_lookup_inode_by_name - find an inode in a directory given its name
- * @dir_ni:	ntfs inode of the directory in which to search for the name
- * @uname:	Unicode name for which to search in the directory
- * @uname_len:	length of the name @uname in Unicode characters
- * @res:	return the found file name if necessary (see below)
- *
- * Look for an inode with name @uname in the directory with inode @dir_ni.
- * ntfs_lookup_inode_by_name() walks the contents of the directory looking for
- * the Unicode name. If the name is found in the directory, the corresponding
- * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it
- * is a 64-bit number containing the sequence number.
- *
- * On error, a negative value is returned corresponding to the error code. In
- * particular if the inode is not found -ENOENT is returned. Note that you
- * can't just check the return value for being negative, you have to check the
- * inode number for being negative which you can extract using MREC(return
- * value).
- *
- * Note, @uname_len does not include the (optional) terminating NULL character.
- *
- * Note, we look for a case sensitive match first but we also look for a case
- * insensitive match at the same time. If we find a case insensitive match, we
- * save that for the case that we don't find an exact match, where we return
- * the case insensitive match and setup @res (which we allocate!) with the mft
- * reference, the file name type, length and with a copy of the little endian
- * Unicode file name itself. If we match a file name which is in the DOS name
- * space, we only return the mft reference and file name type in @res.
- * ntfs_lookup() then uses this to find the long file name in the inode itself.
- * This is to avoid polluting the dcache with short file names. We want them to
- * work but we don't care for how quickly one can access them. This also fixes
- * the dcache aliasing issues.
- *
- * Locking:  - Caller must hold i_mutex on the directory.
- *	     - Each page cache page in the index allocation mapping must be
- *	       locked whilst being accessed otherwise we may find a corrupt
- *	       page due to it being under ->writepage at the moment which
- *	       applies the mst protection fixups before writing out and then
- *	       removes them again after the write is complete after which it 
- *	       unlocks the page.
- */
-MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname,
-		const int uname_len, ntfs_name **res)
-{
-	ntfs_volume *vol = dir_ni->vol;
-	struct super_block *sb = vol->sb;
-	MFT_RECORD *m;
-	INDEX_ROOT *ir;
-	INDEX_ENTRY *ie;
-	INDEX_ALLOCATION *ia;
-	u8 *index_end;
-	u64 mref;
-	ntfs_attr_search_ctx *ctx;
-	int err, rc;
-	VCN vcn, old_vcn;
-	struct address_space *ia_mapping;
-	struct page *page;
-	u8 *kaddr;
-	ntfs_name *name = NULL;
-
-	BUG_ON(!S_ISDIR(VFS_I(dir_ni)->i_mode));
-	BUG_ON(NInoAttr(dir_ni));
-	/* Get hold of the mft record for the directory. */
-	m = map_mft_record(dir_ni);
-	if (IS_ERR(m)) {
-		ntfs_error(sb, "map_mft_record() failed with error code %ld.",
-				-PTR_ERR(m));
-		return ERR_MREF(PTR_ERR(m));
-	}
-	ctx = ntfs_attr_get_search_ctx(dir_ni, m);
-	if (unlikely(!ctx)) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	/* Find the index root attribute in the mft record. */
-	err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
-			0, ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT) {
-			ntfs_error(sb, "Index root attribute missing in "
-					"directory inode 0x%lx.",
-					dir_ni->mft_no);
-			err = -EIO;
-		}
-		goto err_out;
-	}
-	/* Get to the index root value (it's been verified in read_inode). */
-	ir = (INDEX_ROOT*)((u8*)ctx->attr +
-			le16_to_cpu(ctx->attr->data.resident.value_offset));
-	index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
-	/* The first index entry. */
-	ie = (INDEX_ENTRY*)((u8*)&ir->index +
-			le32_to_cpu(ir->index.entries_offset));
-	/*
-	 * Loop until we exceed valid memory (corruption case) or until we
-	 * reach the last entry.
-	 */
-	for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
-		/* Bounds checks. */
-		if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie +
-				sizeof(INDEX_ENTRY_HEADER) > index_end ||
-				(u8*)ie + le16_to_cpu(ie->key_length) >
-				index_end)
-			goto dir_err_out;
-		/*
-		 * The last entry cannot contain a name. It can however contain
-		 * a pointer to a child node in the B+tree so we just break out.
-		 */
-		if (ie->flags & INDEX_ENTRY_END)
-			break;
-		/*
-		 * We perform a case sensitive comparison and if that matches
-		 * we are done and return the mft reference of the inode (i.e.
-		 * the inode number together with the sequence number for
-		 * consistency checking). We convert it to cpu format before
-		 * returning.
-		 */
-		if (ntfs_are_names_equal(uname, uname_len,
-				(ntfschar*)&ie->key.file_name.file_name,
-				ie->key.file_name.file_name_length,
-				CASE_SENSITIVE, vol->upcase, vol->upcase_len)) {
-found_it:
-			/*
-			 * We have a perfect match, so we don't need to care
-			 * about having matched imperfectly before, so we can
-			 * free name and set *res to NULL.
-			 * However, if the perfect match is a short file name,
-			 * we need to signal this through *res, so that
-			 * ntfs_lookup() can fix dcache aliasing issues.
-			 * As an optimization we just reuse an existing
-			 * allocation of *res.
-			 */
-			if (ie->key.file_name.file_name_type == FILE_NAME_DOS) {
-				if (!name) {
-					name = kmalloc(sizeof(ntfs_name),
-							GFP_NOFS);
-					if (!name) {
-						err = -ENOMEM;
-						goto err_out;
-					}
-				}
-				name->mref = le64_to_cpu(
-						ie->data.dir.indexed_file);
-				name->type = FILE_NAME_DOS;
-				name->len = 0;
-				*res = name;
-			} else {
-				kfree(name);
-				*res = NULL;
-			}
-			mref = le64_to_cpu(ie->data.dir.indexed_file);
-			ntfs_attr_put_search_ctx(ctx);
-			unmap_mft_record(dir_ni);
-			return mref;
-		}
-		/*
-		 * For a case insensitive mount, we also perform a case
-		 * insensitive comparison (provided the file name is not in the
-		 * POSIX namespace). If the comparison matches, and the name is
-		 * in the WIN32 namespace, we cache the filename in *res so
-		 * that the caller, ntfs_lookup(), can work on it. If the
-		 * comparison matches, and the name is in the DOS namespace, we
-		 * only cache the mft reference and the file name type (we set
-		 * the name length to zero for simplicity).
-		 */
-		if (!NVolCaseSensitive(vol) &&
-				ie->key.file_name.file_name_type &&
-				ntfs_are_names_equal(uname, uname_len,
-				(ntfschar*)&ie->key.file_name.file_name,
-				ie->key.file_name.file_name_length,
-				IGNORE_CASE, vol->upcase, vol->upcase_len)) {
-			int name_size = sizeof(ntfs_name);
-			u8 type = ie->key.file_name.file_name_type;
-			u8 len = ie->key.file_name.file_name_length;
-
-			/* Only one case insensitive matching name allowed. */
-			if (name) {
-				ntfs_error(sb, "Found already allocated name "
-						"in phase 1. Please run chkdsk "
-						"and if that doesn't find any "
-						"errors please report you saw "
-						"this message to "
-						"linux-ntfs-dev@lists."
-						"sourceforge.net.");
-				goto dir_err_out;
-			}
-
-			if (type != FILE_NAME_DOS)
-				name_size += len * sizeof(ntfschar);
-			name = kmalloc(name_size, GFP_NOFS);
-			if (!name) {
-				err = -ENOMEM;
-				goto err_out;
-			}
-			name->mref = le64_to_cpu(ie->data.dir.indexed_file);
-			name->type = type;
-			if (type != FILE_NAME_DOS) {
-				name->len = len;
-				memcpy(name->name, ie->key.file_name.file_name,
-						len * sizeof(ntfschar));
-			} else
-				name->len = 0;
-			*res = name;
-		}
-		/*
-		 * Not a perfect match, need to do full blown collation so we
-		 * know which way in the B+tree we have to go.
-		 */
-		rc = ntfs_collate_names(uname, uname_len,
-				(ntfschar*)&ie->key.file_name.file_name,
-				ie->key.file_name.file_name_length, 1,
-				IGNORE_CASE, vol->upcase, vol->upcase_len);
-		/*
-		 * If uname collates before the name of the current entry, there
-		 * is definitely no such name in this index but we might need to
-		 * descend into the B+tree so we just break out of the loop.
-		 */
-		if (rc == -1)
-			break;
-		/* The names are not equal, continue the search. */
-		if (rc)
-			continue;
-		/*
-		 * Names match with case insensitive comparison, now try the
-		 * case sensitive comparison, which is required for proper
-		 * collation.
-		 */
-		rc = ntfs_collate_names(uname, uname_len,
-				(ntfschar*)&ie->key.file_name.file_name,
-				ie->key.file_name.file_name_length, 1,
-				CASE_SENSITIVE, vol->upcase, vol->upcase_len);
-		if (rc == -1)
-			break;
-		if (rc)
-			continue;
-		/*
-		 * Perfect match, this will never happen as the
-		 * ntfs_are_names_equal() call will have gotten a match but we
-		 * still treat it correctly.
-		 */
-		goto found_it;
-	}
-	/*
-	 * We have finished with this index without success. Check for the
-	 * presence of a child node and if not present return -ENOENT, unless
-	 * we have got a matching name cached in name in which case return the
-	 * mft reference associated with it.
-	 */
-	if (!(ie->flags & INDEX_ENTRY_NODE)) {
-		if (name) {
-			ntfs_attr_put_search_ctx(ctx);
-			unmap_mft_record(dir_ni);
-			return name->mref;
-		}
-		ntfs_debug("Entry not found.");
-		err = -ENOENT;
-		goto err_out;
-	} /* Child node present, descend into it. */
-	/* Consistency check: Verify that an index allocation exists. */
-	if (!NInoIndexAllocPresent(dir_ni)) {
-		ntfs_error(sb, "No index allocation attribute but index entry "
-				"requires one. Directory inode 0x%lx is "
-				"corrupt or driver bug.", dir_ni->mft_no);
-		goto err_out;
-	}
-	/* Get the starting vcn of the index_block holding the child node. */
-	vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8));
-	ia_mapping = VFS_I(dir_ni)->i_mapping;
-	/*
-	 * We are done with the index root and the mft record. Release them,
-	 * otherwise we deadlock with ntfs_map_page().
-	 */
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(dir_ni);
-	m = NULL;
-	ctx = NULL;
-descend_into_child_node:
-	/*
-	 * Convert vcn to index into the index allocation attribute in units
-	 * of PAGE_SIZE and map the page cache page, reading it from
-	 * disk if necessary.
-	 */
-	page = ntfs_map_page(ia_mapping, vcn <<
-			dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT);
-	if (IS_ERR(page)) {
-		ntfs_error(sb, "Failed to map directory index page, error %ld.",
-				-PTR_ERR(page));
-		err = PTR_ERR(page);
-		goto err_out;
-	}
-	lock_page(page);
-	kaddr = (u8*)page_address(page);
-fast_descend_into_child_node:
-	/* Get to the index allocation block. */
-	ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
-			dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK));
-	/* Bounds checks. */
-	if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) {
-		ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
-				"inode 0x%lx or driver bug.", dir_ni->mft_no);
-		goto unm_err_out;
-	}
-	/* Catch multi sector transfer fixup errors. */
-	if (unlikely(!ntfs_is_indx_record(ia->magic))) {
-		ntfs_error(sb, "Directory index record with vcn 0x%llx is "
-				"corrupt.  Corrupt inode 0x%lx.  Run chkdsk.",
-				(unsigned long long)vcn, dir_ni->mft_no);
-		goto unm_err_out;
-	}
-	if (sle64_to_cpu(ia->index_block_vcn) != vcn) {
-		ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
-				"different from expected VCN (0x%llx). "
-				"Directory inode 0x%lx is corrupt or driver "
-				"bug.", (unsigned long long)
-				sle64_to_cpu(ia->index_block_vcn),
-				(unsigned long long)vcn, dir_ni->mft_no);
-		goto unm_err_out;
-	}
-	if (le32_to_cpu(ia->index.allocated_size) + 0x18 !=
-			dir_ni->itype.index.block_size) {
-		ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
-				"0x%lx has a size (%u) differing from the "
-				"directory specified size (%u). Directory "
-				"inode is corrupt or driver bug.",
-				(unsigned long long)vcn, dir_ni->mft_no,
-				le32_to_cpu(ia->index.allocated_size) + 0x18,
-				dir_ni->itype.index.block_size);
-		goto unm_err_out;
-	}
-	index_end = (u8*)ia + dir_ni->itype.index.block_size;
-	if (index_end > kaddr + PAGE_SIZE) {
-		ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
-				"0x%lx crosses page boundary. Impossible! "
-				"Cannot access! This is probably a bug in the "
-				"driver.", (unsigned long long)vcn,
-				dir_ni->mft_no);
-		goto unm_err_out;
-	}
-	index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
-	if (index_end > (u8*)ia + dir_ni->itype.index.block_size) {
-		ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory "
-				"inode 0x%lx exceeds maximum size.",
-				(unsigned long long)vcn, dir_ni->mft_no);
-		goto unm_err_out;
-	}
-	/* The first index entry. */
-	ie = (INDEX_ENTRY*)((u8*)&ia->index +
-			le32_to_cpu(ia->index.entries_offset));
-	/*
-	 * Iterate similar to above big loop but applied to index buffer, thus
-	 * loop until we exceed valid memory (corruption case) or until we
-	 * reach the last entry.
-	 */
-	for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
-		/* Bounds check. */
-		if ((u8*)ie < (u8*)ia || (u8*)ie +
-				sizeof(INDEX_ENTRY_HEADER) > index_end ||
-				(u8*)ie + le16_to_cpu(ie->key_length) >
-				index_end) {
-			ntfs_error(sb, "Index entry out of bounds in "
-					"directory inode 0x%lx.",
-					dir_ni->mft_no);
-			goto unm_err_out;
-		}
-		/*
-		 * The last entry cannot contain a name. It can however contain
-		 * a pointer to a child node in the B+tree so we just break out.
-		 */
-		if (ie->flags & INDEX_ENTRY_END)
-			break;
-		/*
-		 * We perform a case sensitive comparison and if that matches
-		 * we are done and return the mft reference of the inode (i.e.
-		 * the inode number together with the sequence number for
-		 * consistency checking). We convert it to cpu format before
-		 * returning.
-		 */
-		if (ntfs_are_names_equal(uname, uname_len,
-				(ntfschar*)&ie->key.file_name.file_name,
-				ie->key.file_name.file_name_length,
-				CASE_SENSITIVE, vol->upcase, vol->upcase_len)) {
-found_it2:
-			/*
-			 * We have a perfect match, so we don't need to care
-			 * about having matched imperfectly before, so we can
-			 * free name and set *res to NULL.
-			 * However, if the perfect match is a short file name,
-			 * we need to signal this through *res, so that
-			 * ntfs_lookup() can fix dcache aliasing issues.
-			 * As an optimization we just reuse an existing
-			 * allocation of *res.
-			 */
-			if (ie->key.file_name.file_name_type == FILE_NAME_DOS) {
-				if (!name) {
-					name = kmalloc(sizeof(ntfs_name),
-							GFP_NOFS);
-					if (!name) {
-						err = -ENOMEM;
-						goto unm_err_out;
-					}
-				}
-				name->mref = le64_to_cpu(
-						ie->data.dir.indexed_file);
-				name->type = FILE_NAME_DOS;
-				name->len = 0;
-				*res = name;
-			} else {
-				kfree(name);
-				*res = NULL;
-			}
-			mref = le64_to_cpu(ie->data.dir.indexed_file);
-			unlock_page(page);
-			ntfs_unmap_page(page);
-			return mref;
-		}
-		/*
-		 * For a case insensitive mount, we also perform a case
-		 * insensitive comparison (provided the file name is not in the
-		 * POSIX namespace). If the comparison matches, and the name is
-		 * in the WIN32 namespace, we cache the filename in *res so
-		 * that the caller, ntfs_lookup(), can work on it. If the
-		 * comparison matches, and the name is in the DOS namespace, we
-		 * only cache the mft reference and the file name type (we set
-		 * the name length to zero for simplicity).
-		 */
-		if (!NVolCaseSensitive(vol) &&
-				ie->key.file_name.file_name_type &&
-				ntfs_are_names_equal(uname, uname_len,
-				(ntfschar*)&ie->key.file_name.file_name,
-				ie->key.file_name.file_name_length,
-				IGNORE_CASE, vol->upcase, vol->upcase_len)) {
-			int name_size = sizeof(ntfs_name);
-			u8 type = ie->key.file_name.file_name_type;
-			u8 len = ie->key.file_name.file_name_length;
-
-			/* Only one case insensitive matching name allowed. */
-			if (name) {
-				ntfs_error(sb, "Found already allocated name "
-						"in phase 2. Please run chkdsk "
-						"and if that doesn't find any "
-						"errors please report you saw "
-						"this message to "
-						"linux-ntfs-dev@lists."
-						"sourceforge.net.");
-				unlock_page(page);
-				ntfs_unmap_page(page);
-				goto dir_err_out;
-			}
-
-			if (type != FILE_NAME_DOS)
-				name_size += len * sizeof(ntfschar);
-			name = kmalloc(name_size, GFP_NOFS);
-			if (!name) {
-				err = -ENOMEM;
-				goto unm_err_out;
-			}
-			name->mref = le64_to_cpu(ie->data.dir.indexed_file);
-			name->type = type;
-			if (type != FILE_NAME_DOS) {
-				name->len = len;
-				memcpy(name->name, ie->key.file_name.file_name,
-						len * sizeof(ntfschar));
-			} else
-				name->len = 0;
-			*res = name;
-		}
-		/*
-		 * Not a perfect match, need to do full blown collation so we
-		 * know which way in the B+tree we have to go.
-		 */
-		rc = ntfs_collate_names(uname, uname_len,
-				(ntfschar*)&ie->key.file_name.file_name,
-				ie->key.file_name.file_name_length, 1,
-				IGNORE_CASE, vol->upcase, vol->upcase_len);
-		/*
-		 * If uname collates before the name of the current entry, there
-		 * is definitely no such name in this index but we might need to
-		 * descend into the B+tree so we just break out of the loop.
-		 */
-		if (rc == -1)
-			break;
-		/* The names are not equal, continue the search. */
-		if (rc)
-			continue;
-		/*
-		 * Names match with case insensitive comparison, now try the
-		 * case sensitive comparison, which is required for proper
-		 * collation.
-		 */
-		rc = ntfs_collate_names(uname, uname_len,
-				(ntfschar*)&ie->key.file_name.file_name,
-				ie->key.file_name.file_name_length, 1,
-				CASE_SENSITIVE, vol->upcase, vol->upcase_len);
-		if (rc == -1)
-			break;
-		if (rc)
-			continue;
-		/*
-		 * Perfect match, this will never happen as the
-		 * ntfs_are_names_equal() call will have gotten a match but we
-		 * still treat it correctly.
-		 */
-		goto found_it2;
-	}
-	/*
-	 * We have finished with this index buffer without success. Check for
-	 * the presence of a child node.
-	 */
-	if (ie->flags & INDEX_ENTRY_NODE) {
-		if ((ia->index.flags & NODE_MASK) == LEAF_NODE) {
-			ntfs_error(sb, "Index entry with child node found in "
-					"a leaf node in directory inode 0x%lx.",
-					dir_ni->mft_no);
-			goto unm_err_out;
-		}
-		/* Child node present, descend into it. */
-		old_vcn = vcn;
-		vcn = sle64_to_cpup((sle64*)((u8*)ie +
-				le16_to_cpu(ie->length) - 8));
-		if (vcn >= 0) {
-			/* If vcn is in the same page cache page as old_vcn we
-			 * recycle the mapped page. */
-			if (old_vcn << vol->cluster_size_bits >>
-					PAGE_SHIFT == vcn <<
-					vol->cluster_size_bits >>
-					PAGE_SHIFT)
-				goto fast_descend_into_child_node;
-			unlock_page(page);
-			ntfs_unmap_page(page);
-			goto descend_into_child_node;
-		}
-		ntfs_error(sb, "Negative child node vcn in directory inode "
-				"0x%lx.", dir_ni->mft_no);
-		goto unm_err_out;
-	}
-	/*
-	 * No child node present, return -ENOENT, unless we have got a matching
-	 * name cached in name in which case return the mft reference
-	 * associated with it.
-	 */
-	if (name) {
-		unlock_page(page);
-		ntfs_unmap_page(page);
-		return name->mref;
-	}
-	ntfs_debug("Entry not found.");
-	err = -ENOENT;
-unm_err_out:
-	unlock_page(page);
-	ntfs_unmap_page(page);
-err_out:
-	if (!err)
-		err = -EIO;
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(dir_ni);
-	if (name) {
-		kfree(name);
-		*res = NULL;
-	}
-	return ERR_MREF(err);
-dir_err_out:
-	ntfs_error(sb, "Corrupt directory.  Aborting lookup.");
-	goto err_out;
-}
-
-#if 0
-
-// TODO: (AIA)
-// The algorithm embedded in this code will be required for the time when we
-// want to support adding of entries to directories, where we require correct
-// collation of file names in order not to cause corruption of the filesystem.
-
-/**
- * ntfs_lookup_inode_by_name - find an inode in a directory given its name
- * @dir_ni:	ntfs inode of the directory in which to search for the name
- * @uname:	Unicode name for which to search in the directory
- * @uname_len:	length of the name @uname in Unicode characters
- *
- * Look for an inode with name @uname in the directory with inode @dir_ni.
- * ntfs_lookup_inode_by_name() walks the contents of the directory looking for
- * the Unicode name. If the name is found in the directory, the corresponding
- * inode number (>= 0) is returned as a mft reference in cpu format, i.e. it
- * is a 64-bit number containing the sequence number.
- *
- * On error, a negative value is returned corresponding to the error code. In
- * particular if the inode is not found -ENOENT is returned. Note that you
- * can't just check the return value for being negative, you have to check the
- * inode number for being negative which you can extract using MREC(return
- * value).
- *
- * Note, @uname_len does not include the (optional) terminating NULL character.
- */
-u64 ntfs_lookup_inode_by_name(ntfs_inode *dir_ni, const ntfschar *uname,
-		const int uname_len)
-{
-	ntfs_volume *vol = dir_ni->vol;
-	struct super_block *sb = vol->sb;
-	MFT_RECORD *m;
-	INDEX_ROOT *ir;
-	INDEX_ENTRY *ie;
-	INDEX_ALLOCATION *ia;
-	u8 *index_end;
-	u64 mref;
-	ntfs_attr_search_ctx *ctx;
-	int err, rc;
-	IGNORE_CASE_BOOL ic;
-	VCN vcn, old_vcn;
-	struct address_space *ia_mapping;
-	struct page *page;
-	u8 *kaddr;
-
-	/* Get hold of the mft record for the directory. */
-	m = map_mft_record(dir_ni);
-	if (IS_ERR(m)) {
-		ntfs_error(sb, "map_mft_record() failed with error code %ld.",
-				-PTR_ERR(m));
-		return ERR_MREF(PTR_ERR(m));
-	}
-	ctx = ntfs_attr_get_search_ctx(dir_ni, m);
-	if (!ctx) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	/* Find the index root attribute in the mft record. */
-	err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
-			0, ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT) {
-			ntfs_error(sb, "Index root attribute missing in "
-					"directory inode 0x%lx.",
-					dir_ni->mft_no);
-			err = -EIO;
-		}
-		goto err_out;
-	}
-	/* Get to the index root value (it's been verified in read_inode). */
-	ir = (INDEX_ROOT*)((u8*)ctx->attr +
-			le16_to_cpu(ctx->attr->data.resident.value_offset));
-	index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
-	/* The first index entry. */
-	ie = (INDEX_ENTRY*)((u8*)&ir->index +
-			le32_to_cpu(ir->index.entries_offset));
-	/*
-	 * Loop until we exceed valid memory (corruption case) or until we
-	 * reach the last entry.
-	 */
-	for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
-		/* Bounds checks. */
-		if ((u8*)ie < (u8*)ctx->mrec || (u8*)ie +
-				sizeof(INDEX_ENTRY_HEADER) > index_end ||
-				(u8*)ie + le16_to_cpu(ie->key_length) >
-				index_end)
-			goto dir_err_out;
-		/*
-		 * The last entry cannot contain a name. It can however contain
-		 * a pointer to a child node in the B+tree so we just break out.
-		 */
-		if (ie->flags & INDEX_ENTRY_END)
-			break;
-		/*
-		 * If the current entry has a name type of POSIX, the name is
-		 * case sensitive and not otherwise. This has the effect of us
-		 * not being able to access any POSIX file names which collate
-		 * after the non-POSIX one when they only differ in case, but
-		 * anyone doing screwy stuff like that deserves to burn in
-		 * hell... Doing that kind of stuff on NT4 actually causes
-		 * corruption on the partition even when using SP6a and Linux
-		 * is not involved at all.
-		 */
-		ic = ie->key.file_name.file_name_type ? IGNORE_CASE :
-				CASE_SENSITIVE;
-		/*
-		 * If the names match perfectly, we are done and return the
-		 * mft reference of the inode (i.e. the inode number together
-		 * with the sequence number for consistency checking. We
-		 * convert it to cpu format before returning.
-		 */
-		if (ntfs_are_names_equal(uname, uname_len,
-				(ntfschar*)&ie->key.file_name.file_name,
-				ie->key.file_name.file_name_length, ic,
-				vol->upcase, vol->upcase_len)) {
-found_it:
-			mref = le64_to_cpu(ie->data.dir.indexed_file);
-			ntfs_attr_put_search_ctx(ctx);
-			unmap_mft_record(dir_ni);
-			return mref;
-		}
-		/*
-		 * Not a perfect match, need to do full blown collation so we
-		 * know which way in the B+tree we have to go.
-		 */
-		rc = ntfs_collate_names(uname, uname_len,
-				(ntfschar*)&ie->key.file_name.file_name,
-				ie->key.file_name.file_name_length, 1,
-				IGNORE_CASE, vol->upcase, vol->upcase_len);
-		/*
-		 * If uname collates before the name of the current entry, there
-		 * is definitely no such name in this index but we might need to
-		 * descend into the B+tree so we just break out of the loop.
-		 */
-		if (rc == -1)
-			break;
-		/* The names are not equal, continue the search. */
-		if (rc)
-			continue;
-		/*
-		 * Names match with case insensitive comparison, now try the
-		 * case sensitive comparison, which is required for proper
-		 * collation.
-		 */
-		rc = ntfs_collate_names(uname, uname_len,
-				(ntfschar*)&ie->key.file_name.file_name,
-				ie->key.file_name.file_name_length, 1,
-				CASE_SENSITIVE, vol->upcase, vol->upcase_len);
-		if (rc == -1)
-			break;
-		if (rc)
-			continue;
-		/*
-		 * Perfect match, this will never happen as the
-		 * ntfs_are_names_equal() call will have gotten a match but we
-		 * still treat it correctly.
-		 */
-		goto found_it;
-	}
-	/*
-	 * We have finished with this index without success. Check for the
-	 * presence of a child node.
-	 */
-	if (!(ie->flags & INDEX_ENTRY_NODE)) {
-		/* No child node, return -ENOENT. */
-		err = -ENOENT;
-		goto err_out;
-	} /* Child node present, descend into it. */
-	/* Consistency check: Verify that an index allocation exists. */
-	if (!NInoIndexAllocPresent(dir_ni)) {
-		ntfs_error(sb, "No index allocation attribute but index entry "
-				"requires one. Directory inode 0x%lx is "
-				"corrupt or driver bug.", dir_ni->mft_no);
-		goto err_out;
-	}
-	/* Get the starting vcn of the index_block holding the child node. */
-	vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8);
-	ia_mapping = VFS_I(dir_ni)->i_mapping;
-	/*
-	 * We are done with the index root and the mft record. Release them,
-	 * otherwise we deadlock with ntfs_map_page().
-	 */
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(dir_ni);
-	m = NULL;
-	ctx = NULL;
-descend_into_child_node:
-	/*
-	 * Convert vcn to index into the index allocation attribute in units
-	 * of PAGE_SIZE and map the page cache page, reading it from
-	 * disk if necessary.
-	 */
-	page = ntfs_map_page(ia_mapping, vcn <<
-			dir_ni->itype.index.vcn_size_bits >> PAGE_SHIFT);
-	if (IS_ERR(page)) {
-		ntfs_error(sb, "Failed to map directory index page, error %ld.",
-				-PTR_ERR(page));
-		err = PTR_ERR(page);
-		goto err_out;
-	}
-	lock_page(page);
-	kaddr = (u8*)page_address(page);
-fast_descend_into_child_node:
-	/* Get to the index allocation block. */
-	ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
-			dir_ni->itype.index.vcn_size_bits) & ~PAGE_MASK));
-	/* Bounds checks. */
-	if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) {
-		ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
-				"inode 0x%lx or driver bug.", dir_ni->mft_no);
-		goto unm_err_out;
-	}
-	/* Catch multi sector transfer fixup errors. */
-	if (unlikely(!ntfs_is_indx_record(ia->magic))) {
-		ntfs_error(sb, "Directory index record with vcn 0x%llx is "
-				"corrupt.  Corrupt inode 0x%lx.  Run chkdsk.",
-				(unsigned long long)vcn, dir_ni->mft_no);
-		goto unm_err_out;
-	}
-	if (sle64_to_cpu(ia->index_block_vcn) != vcn) {
-		ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
-				"different from expected VCN (0x%llx). "
-				"Directory inode 0x%lx is corrupt or driver "
-				"bug.", (unsigned long long)
-				sle64_to_cpu(ia->index_block_vcn),
-				(unsigned long long)vcn, dir_ni->mft_no);
-		goto unm_err_out;
-	}
-	if (le32_to_cpu(ia->index.allocated_size) + 0x18 !=
-			dir_ni->itype.index.block_size) {
-		ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
-				"0x%lx has a size (%u) differing from the "
-				"directory specified size (%u). Directory "
-				"inode is corrupt or driver bug.",
-				(unsigned long long)vcn, dir_ni->mft_no,
-				le32_to_cpu(ia->index.allocated_size) + 0x18,
-				dir_ni->itype.index.block_size);
-		goto unm_err_out;
-	}
-	index_end = (u8*)ia + dir_ni->itype.index.block_size;
-	if (index_end > kaddr + PAGE_SIZE) {
-		ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
-				"0x%lx crosses page boundary. Impossible! "
-				"Cannot access! This is probably a bug in the "
-				"driver.", (unsigned long long)vcn,
-				dir_ni->mft_no);
-		goto unm_err_out;
-	}
-	index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
-	if (index_end > (u8*)ia + dir_ni->itype.index.block_size) {
-		ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory "
-				"inode 0x%lx exceeds maximum size.",
-				(unsigned long long)vcn, dir_ni->mft_no);
-		goto unm_err_out;
-	}
-	/* The first index entry. */
-	ie = (INDEX_ENTRY*)((u8*)&ia->index +
-			le32_to_cpu(ia->index.entries_offset));
-	/*
-	 * Iterate similar to above big loop but applied to index buffer, thus
-	 * loop until we exceed valid memory (corruption case) or until we
-	 * reach the last entry.
-	 */
-	for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
-		/* Bounds check. */
-		if ((u8*)ie < (u8*)ia || (u8*)ie +
-				sizeof(INDEX_ENTRY_HEADER) > index_end ||
-				(u8*)ie + le16_to_cpu(ie->key_length) >
-				index_end) {
-			ntfs_error(sb, "Index entry out of bounds in "
-					"directory inode 0x%lx.",
-					dir_ni->mft_no);
-			goto unm_err_out;
-		}
-		/*
-		 * The last entry cannot contain a name. It can however contain
-		 * a pointer to a child node in the B+tree so we just break out.
-		 */
-		if (ie->flags & INDEX_ENTRY_END)
-			break;
-		/*
-		 * If the current entry has a name type of POSIX, the name is
-		 * case sensitive and not otherwise. This has the effect of us
-		 * not being able to access any POSIX file names which collate
-		 * after the non-POSIX one when they only differ in case, but
-		 * anyone doing screwy stuff like that deserves to burn in
-		 * hell... Doing that kind of stuff on NT4 actually causes
-		 * corruption on the partition even when using SP6a and Linux
-		 * is not involved at all.
-		 */
-		ic = ie->key.file_name.file_name_type ? IGNORE_CASE :
-				CASE_SENSITIVE;
-		/*
-		 * If the names match perfectly, we are done and return the
-		 * mft reference of the inode (i.e. the inode number together
-		 * with the sequence number for consistency checking. We
-		 * convert it to cpu format before returning.
-		 */
-		if (ntfs_are_names_equal(uname, uname_len,
-				(ntfschar*)&ie->key.file_name.file_name,
-				ie->key.file_name.file_name_length, ic,
-				vol->upcase, vol->upcase_len)) {
-found_it2:
-			mref = le64_to_cpu(ie->data.dir.indexed_file);
-			unlock_page(page);
-			ntfs_unmap_page(page);
-			return mref;
-		}
-		/*
-		 * Not a perfect match, need to do full blown collation so we
-		 * know which way in the B+tree we have to go.
-		 */
-		rc = ntfs_collate_names(uname, uname_len,
-				(ntfschar*)&ie->key.file_name.file_name,
-				ie->key.file_name.file_name_length, 1,
-				IGNORE_CASE, vol->upcase, vol->upcase_len);
-		/*
-		 * If uname collates before the name of the current entry, there
-		 * is definitely no such name in this index but we might need to
-		 * descend into the B+tree so we just break out of the loop.
-		 */
-		if (rc == -1)
-			break;
-		/* The names are not equal, continue the search. */
-		if (rc)
-			continue;
-		/*
-		 * Names match with case insensitive comparison, now try the
-		 * case sensitive comparison, which is required for proper
-		 * collation.
-		 */
-		rc = ntfs_collate_names(uname, uname_len,
-				(ntfschar*)&ie->key.file_name.file_name,
-				ie->key.file_name.file_name_length, 1,
-				CASE_SENSITIVE, vol->upcase, vol->upcase_len);
-		if (rc == -1)
-			break;
-		if (rc)
-			continue;
-		/*
-		 * Perfect match, this will never happen as the
-		 * ntfs_are_names_equal() call will have gotten a match but we
-		 * still treat it correctly.
-		 */
-		goto found_it2;
-	}
-	/*
-	 * We have finished with this index buffer without success. Check for
-	 * the presence of a child node.
-	 */
-	if (ie->flags & INDEX_ENTRY_NODE) {
-		if ((ia->index.flags & NODE_MASK) == LEAF_NODE) {
-			ntfs_error(sb, "Index entry with child node found in "
-					"a leaf node in directory inode 0x%lx.",
-					dir_ni->mft_no);
-			goto unm_err_out;
-		}
-		/* Child node present, descend into it. */
-		old_vcn = vcn;
-		vcn = sle64_to_cpup((u8*)ie + le16_to_cpu(ie->length) - 8);
-		if (vcn >= 0) {
-			/* If vcn is in the same page cache page as old_vcn we
-			 * recycle the mapped page. */
-			if (old_vcn << vol->cluster_size_bits >>
-					PAGE_SHIFT == vcn <<
-					vol->cluster_size_bits >>
-					PAGE_SHIFT)
-				goto fast_descend_into_child_node;
-			unlock_page(page);
-			ntfs_unmap_page(page);
-			goto descend_into_child_node;
-		}
-		ntfs_error(sb, "Negative child node vcn in directory inode "
-				"0x%lx.", dir_ni->mft_no);
-		goto unm_err_out;
-	}
-	/* No child node, return -ENOENT. */
-	ntfs_debug("Entry not found.");
-	err = -ENOENT;
-unm_err_out:
-	unlock_page(page);
-	ntfs_unmap_page(page);
-err_out:
-	if (!err)
-		err = -EIO;
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(dir_ni);
-	return ERR_MREF(err);
-dir_err_out:
-	ntfs_error(sb, "Corrupt directory. Aborting lookup.");
-	goto err_out;
-}
-
-#endif
-
-/**
- * ntfs_filldir - ntfs specific filldir method
- * @vol:	current ntfs volume
- * @ndir:	ntfs inode of current directory
- * @ia_page:	page in which the index allocation buffer @ie is in resides
- * @ie:		current index entry
- * @name:	buffer to use for the converted name
- * @actor:	what to feed the entries to
- *
- * Convert the Unicode @name to the loaded NLS and pass it to the @filldir
- * callback.
- *
- * If @ia_page is not NULL it is the locked page containing the index
- * allocation block containing the index entry @ie.
- *
- * Note, we drop (and then reacquire) the page lock on @ia_page across the
- * @filldir() call otherwise we would deadlock with NFSd when it calls ->lookup
- * since ntfs_lookup() will lock the same page.  As an optimization, we do not
- * retake the lock if we are returning a non-zero value as ntfs_readdir()
- * would need to drop the lock immediately anyway.
- */
-static inline int ntfs_filldir(ntfs_volume *vol,
-		ntfs_inode *ndir, struct page *ia_page, INDEX_ENTRY *ie,
-		u8 *name, struct dir_context *actor)
-{
-	unsigned long mref;
-	int name_len;
-	unsigned dt_type;
-	FILE_NAME_TYPE_FLAGS name_type;
-
-	name_type = ie->key.file_name.file_name_type;
-	if (name_type == FILE_NAME_DOS) {
-		ntfs_debug("Skipping DOS name space entry.");
-		return 0;
-	}
-	if (MREF_LE(ie->data.dir.indexed_file) == FILE_root) {
-		ntfs_debug("Skipping root directory self reference entry.");
-		return 0;
-	}
-	if (MREF_LE(ie->data.dir.indexed_file) < FILE_first_user &&
-			!NVolShowSystemFiles(vol)) {
-		ntfs_debug("Skipping system file.");
-		return 0;
-	}
-	name_len = ntfs_ucstonls(vol, (ntfschar*)&ie->key.file_name.file_name,
-			ie->key.file_name.file_name_length, &name,
-			NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1);
-	if (name_len <= 0) {
-		ntfs_warning(vol->sb, "Skipping unrepresentable inode 0x%llx.",
-				(long long)MREF_LE(ie->data.dir.indexed_file));
-		return 0;
-	}
-	if (ie->key.file_name.file_attributes &
-			FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT)
-		dt_type = DT_DIR;
-	else
-		dt_type = DT_REG;
-	mref = MREF_LE(ie->data.dir.indexed_file);
-	/*
-	 * Drop the page lock otherwise we deadlock with NFS when it calls
-	 * ->lookup since ntfs_lookup() will lock the same page.
-	 */
-	if (ia_page)
-		unlock_page(ia_page);
-	ntfs_debug("Calling filldir for %s with len %i, fpos 0x%llx, inode "
-			"0x%lx, DT_%s.", name, name_len, actor->pos, mref,
-			dt_type == DT_DIR ? "DIR" : "REG");
-	if (!dir_emit(actor, name, name_len, mref, dt_type))
-		return 1;
-	/* Relock the page but not if we are aborting ->readdir. */
-	if (ia_page)
-		lock_page(ia_page);
-	return 0;
-}
-
-/*
- * We use the same basic approach as the old NTFS driver, i.e. we parse the
- * index root entries and then the index allocation entries that are marked
- * as in use in the index bitmap.
- *
- * While this will return the names in random order this doesn't matter for
- * ->readdir but OTOH results in a faster ->readdir.
- *
- * VFS calls ->readdir without BKL but with i_mutex held. This protects the VFS
- * parts (e.g. ->f_pos and ->i_size, and it also protects against directory
- * modifications).
- *
- * Locking:  - Caller must hold i_mutex on the directory.
- *	     - Each page cache page in the index allocation mapping must be
- *	       locked whilst being accessed otherwise we may find a corrupt
- *	       page due to it being under ->writepage at the moment which
- *	       applies the mst protection fixups before writing out and then
- *	       removes them again after the write is complete after which it 
- *	       unlocks the page.
- */
-static int ntfs_readdir(struct file *file, struct dir_context *actor)
-{
-	s64 ia_pos, ia_start, prev_ia_pos, bmp_pos;
-	loff_t i_size;
-	struct inode *bmp_vi, *vdir = file_inode(file);
-	struct super_block *sb = vdir->i_sb;
-	ntfs_inode *ndir = NTFS_I(vdir);
-	ntfs_volume *vol = NTFS_SB(sb);
-	MFT_RECORD *m;
-	INDEX_ROOT *ir = NULL;
-	INDEX_ENTRY *ie;
-	INDEX_ALLOCATION *ia;
-	u8 *name = NULL;
-	int rc, err, ir_pos, cur_bmp_pos;
-	struct address_space *ia_mapping, *bmp_mapping;
-	struct page *bmp_page = NULL, *ia_page = NULL;
-	u8 *kaddr, *bmp, *index_end;
-	ntfs_attr_search_ctx *ctx;
-
-	ntfs_debug("Entering for inode 0x%lx, fpos 0x%llx.",
-			vdir->i_ino, actor->pos);
-	rc = err = 0;
-	/* Are we at end of dir yet? */
-	i_size = i_size_read(vdir);
-	if (actor->pos >= i_size + vol->mft_record_size)
-		return 0;
-	/* Emulate . and .. for all directories. */
-	if (!dir_emit_dots(file, actor))
-		return 0;
-	m = NULL;
-	ctx = NULL;
-	/*
-	 * Allocate a buffer to store the current name being processed
-	 * converted to format determined by current NLS.
-	 */
-	name = kmalloc(NTFS_MAX_NAME_LEN * NLS_MAX_CHARSET_SIZE + 1, GFP_NOFS);
-	if (unlikely(!name)) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	/* Are we jumping straight into the index allocation attribute? */
-	if (actor->pos >= vol->mft_record_size)
-		goto skip_index_root;
-	/* Get hold of the mft record for the directory. */
-	m = map_mft_record(ndir);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		m = NULL;
-		goto err_out;
-	}
-	ctx = ntfs_attr_get_search_ctx(ndir, m);
-	if (unlikely(!ctx)) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	/* Get the offset into the index root attribute. */
-	ir_pos = (s64)actor->pos;
-	/* Find the index root attribute in the mft record. */
-	err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE, 0, NULL,
-			0, ctx);
-	if (unlikely(err)) {
-		ntfs_error(sb, "Index root attribute missing in directory "
-				"inode 0x%lx.", vdir->i_ino);
-		goto err_out;
-	}
-	/*
-	 * Copy the index root attribute value to a buffer so that we can put
-	 * the search context and unmap the mft record before calling the
-	 * filldir() callback.  We need to do this because of NFSd which calls
-	 * ->lookup() from its filldir callback() and this causes NTFS to
-	 * deadlock as ntfs_lookup() maps the mft record of the directory and
-	 * we have got it mapped here already.  The only solution is for us to
-	 * unmap the mft record here so that a call to ntfs_lookup() is able to
-	 * map the mft record without deadlocking.
-	 */
-	rc = le32_to_cpu(ctx->attr->data.resident.value_length);
-	ir = kmalloc(rc, GFP_NOFS);
-	if (unlikely(!ir)) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	/* Copy the index root value (it has been verified in read_inode). */
-	memcpy(ir, (u8*)ctx->attr +
-			le16_to_cpu(ctx->attr->data.resident.value_offset), rc);
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(ndir);
-	ctx = NULL;
-	m = NULL;
-	index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
-	/* The first index entry. */
-	ie = (INDEX_ENTRY*)((u8*)&ir->index +
-			le32_to_cpu(ir->index.entries_offset));
-	/*
-	 * Loop until we exceed valid memory (corruption case) or until we
-	 * reach the last entry or until filldir tells us it has had enough
-	 * or signals an error (both covered by the rc test).
-	 */
-	for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
-		ntfs_debug("In index root, offset 0x%zx.", (u8*)ie - (u8*)ir);
-		/* Bounds checks. */
-		if (unlikely((u8*)ie < (u8*)ir || (u8*)ie +
-				sizeof(INDEX_ENTRY_HEADER) > index_end ||
-				(u8*)ie + le16_to_cpu(ie->key_length) >
-				index_end))
-			goto err_out;
-		/* The last entry cannot contain a name. */
-		if (ie->flags & INDEX_ENTRY_END)
-			break;
-		/* Skip index root entry if continuing previous readdir. */
-		if (ir_pos > (u8*)ie - (u8*)ir)
-			continue;
-		/* Advance the position even if going to skip the entry. */
-		actor->pos = (u8*)ie - (u8*)ir;
-		/* Submit the name to the filldir callback. */
-		rc = ntfs_filldir(vol, ndir, NULL, ie, name, actor);
-		if (rc) {
-			kfree(ir);
-			goto abort;
-		}
-	}
-	/* We are done with the index root and can free the buffer. */
-	kfree(ir);
-	ir = NULL;
-	/* If there is no index allocation attribute we are finished. */
-	if (!NInoIndexAllocPresent(ndir))
-		goto EOD;
-	/* Advance fpos to the beginning of the index allocation. */
-	actor->pos = vol->mft_record_size;
-skip_index_root:
-	kaddr = NULL;
-	prev_ia_pos = -1LL;
-	/* Get the offset into the index allocation attribute. */
-	ia_pos = (s64)actor->pos - vol->mft_record_size;
-	ia_mapping = vdir->i_mapping;
-	ntfs_debug("Inode 0x%lx, getting index bitmap.", vdir->i_ino);
-	bmp_vi = ntfs_attr_iget(vdir, AT_BITMAP, I30, 4);
-	if (IS_ERR(bmp_vi)) {
-		ntfs_error(sb, "Failed to get bitmap attribute.");
-		err = PTR_ERR(bmp_vi);
-		goto err_out;
-	}
-	bmp_mapping = bmp_vi->i_mapping;
-	/* Get the starting bitmap bit position and sanity check it. */
-	bmp_pos = ia_pos >> ndir->itype.index.block_size_bits;
-	if (unlikely(bmp_pos >> 3 >= i_size_read(bmp_vi))) {
-		ntfs_error(sb, "Current index allocation position exceeds "
-				"index bitmap size.");
-		goto iput_err_out;
-	}
-	/* Get the starting bit position in the current bitmap page. */
-	cur_bmp_pos = bmp_pos & ((PAGE_SIZE * 8) - 1);
-	bmp_pos &= ~(u64)((PAGE_SIZE * 8) - 1);
-get_next_bmp_page:
-	ntfs_debug("Reading bitmap with page index 0x%llx, bit ofs 0x%llx",
-			(unsigned long long)bmp_pos >> (3 + PAGE_SHIFT),
-			(unsigned long long)bmp_pos &
-			(unsigned long long)((PAGE_SIZE * 8) - 1));
-	bmp_page = ntfs_map_page(bmp_mapping,
-			bmp_pos >> (3 + PAGE_SHIFT));
-	if (IS_ERR(bmp_page)) {
-		ntfs_error(sb, "Reading index bitmap failed.");
-		err = PTR_ERR(bmp_page);
-		bmp_page = NULL;
-		goto iput_err_out;
-	}
-	bmp = (u8*)page_address(bmp_page);
-	/* Find next index block in use. */
-	while (!(bmp[cur_bmp_pos >> 3] & (1 << (cur_bmp_pos & 7)))) {
-find_next_index_buffer:
-		cur_bmp_pos++;
-		/*
-		 * If we have reached the end of the bitmap page, get the next
-		 * page, and put away the old one.
-		 */
-		if (unlikely((cur_bmp_pos >> 3) >= PAGE_SIZE)) {
-			ntfs_unmap_page(bmp_page);
-			bmp_pos += PAGE_SIZE * 8;
-			cur_bmp_pos = 0;
-			goto get_next_bmp_page;
-		}
-		/* If we have reached the end of the bitmap, we are done. */
-		if (unlikely(((bmp_pos + cur_bmp_pos) >> 3) >= i_size))
-			goto unm_EOD;
-		ia_pos = (bmp_pos + cur_bmp_pos) <<
-				ndir->itype.index.block_size_bits;
-	}
-	ntfs_debug("Handling index buffer 0x%llx.",
-			(unsigned long long)bmp_pos + cur_bmp_pos);
-	/* If the current index buffer is in the same page we reuse the page. */
-	if ((prev_ia_pos & (s64)PAGE_MASK) !=
-			(ia_pos & (s64)PAGE_MASK)) {
-		prev_ia_pos = ia_pos;
-		if (likely(ia_page != NULL)) {
-			unlock_page(ia_page);
-			ntfs_unmap_page(ia_page);
-		}
-		/*
-		 * Map the page cache page containing the current ia_pos,
-		 * reading it from disk if necessary.
-		 */
-		ia_page = ntfs_map_page(ia_mapping, ia_pos >> PAGE_SHIFT);
-		if (IS_ERR(ia_page)) {
-			ntfs_error(sb, "Reading index allocation data failed.");
-			err = PTR_ERR(ia_page);
-			ia_page = NULL;
-			goto err_out;
-		}
-		lock_page(ia_page);
-		kaddr = (u8*)page_address(ia_page);
-	}
-	/* Get the current index buffer. */
-	ia = (INDEX_ALLOCATION*)(kaddr + (ia_pos & ~PAGE_MASK &
-					  ~(s64)(ndir->itype.index.block_size - 1)));
-	/* Bounds checks. */
-	if (unlikely((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE)) {
-		ntfs_error(sb, "Out of bounds check failed. Corrupt directory "
-				"inode 0x%lx or driver bug.", vdir->i_ino);
-		goto err_out;
-	}
-	/* Catch multi sector transfer fixup errors. */
-	if (unlikely(!ntfs_is_indx_record(ia->magic))) {
-		ntfs_error(sb, "Directory index record with vcn 0x%llx is "
-				"corrupt.  Corrupt inode 0x%lx.  Run chkdsk.",
-				(unsigned long long)ia_pos >>
-				ndir->itype.index.vcn_size_bits, vdir->i_ino);
-		goto err_out;
-	}
-	if (unlikely(sle64_to_cpu(ia->index_block_vcn) != (ia_pos &
-			~(s64)(ndir->itype.index.block_size - 1)) >>
-			ndir->itype.index.vcn_size_bits)) {
-		ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
-				"different from expected VCN (0x%llx). "
-				"Directory inode 0x%lx is corrupt or driver "
-				"bug. ", (unsigned long long)
-				sle64_to_cpu(ia->index_block_vcn),
-				(unsigned long long)ia_pos >>
-				ndir->itype.index.vcn_size_bits, vdir->i_ino);
-		goto err_out;
-	}
-	if (unlikely(le32_to_cpu(ia->index.allocated_size) + 0x18 !=
-			ndir->itype.index.block_size)) {
-		ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
-				"0x%lx has a size (%u) differing from the "
-				"directory specified size (%u). Directory "
-				"inode is corrupt or driver bug.",
-				(unsigned long long)ia_pos >>
-				ndir->itype.index.vcn_size_bits, vdir->i_ino,
-				le32_to_cpu(ia->index.allocated_size) + 0x18,
-				ndir->itype.index.block_size);
-		goto err_out;
-	}
-	index_end = (u8*)ia + ndir->itype.index.block_size;
-	if (unlikely(index_end > kaddr + PAGE_SIZE)) {
-		ntfs_error(sb, "Index buffer (VCN 0x%llx) of directory inode "
-				"0x%lx crosses page boundary. Impossible! "
-				"Cannot access! This is probably a bug in the "
-				"driver.", (unsigned long long)ia_pos >>
-				ndir->itype.index.vcn_size_bits, vdir->i_ino);
-		goto err_out;
-	}
-	ia_start = ia_pos & ~(s64)(ndir->itype.index.block_size - 1);
-	index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
-	if (unlikely(index_end > (u8*)ia + ndir->itype.index.block_size)) {
-		ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of directory "
-				"inode 0x%lx exceeds maximum size.",
-				(unsigned long long)ia_pos >>
-				ndir->itype.index.vcn_size_bits, vdir->i_ino);
-		goto err_out;
-	}
-	/* The first index entry in this index buffer. */
-	ie = (INDEX_ENTRY*)((u8*)&ia->index +
-			le32_to_cpu(ia->index.entries_offset));
-	/*
-	 * Loop until we exceed valid memory (corruption case) or until we
-	 * reach the last entry or until filldir tells us it has had enough
-	 * or signals an error (both covered by the rc test).
-	 */
-	for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
-		ntfs_debug("In index allocation, offset 0x%llx.",
-				(unsigned long long)ia_start +
-				(unsigned long long)((u8*)ie - (u8*)ia));
-		/* Bounds checks. */
-		if (unlikely((u8*)ie < (u8*)ia || (u8*)ie +
-				sizeof(INDEX_ENTRY_HEADER) > index_end ||
-				(u8*)ie + le16_to_cpu(ie->key_length) >
-				index_end))
-			goto err_out;
-		/* The last entry cannot contain a name. */
-		if (ie->flags & INDEX_ENTRY_END)
-			break;
-		/* Skip index block entry if continuing previous readdir. */
-		if (ia_pos - ia_start > (u8*)ie - (u8*)ia)
-			continue;
-		/* Advance the position even if going to skip the entry. */
-		actor->pos = (u8*)ie - (u8*)ia +
-				(sle64_to_cpu(ia->index_block_vcn) <<
-				ndir->itype.index.vcn_size_bits) +
-				vol->mft_record_size;
-		/*
-		 * Submit the name to the @filldir callback.  Note,
-		 * ntfs_filldir() drops the lock on @ia_page but it retakes it
-		 * before returning, unless a non-zero value is returned in
-		 * which case the page is left unlocked.
-		 */
-		rc = ntfs_filldir(vol, ndir, ia_page, ie, name, actor);
-		if (rc) {
-			/* @ia_page is already unlocked in this case. */
-			ntfs_unmap_page(ia_page);
-			ntfs_unmap_page(bmp_page);
-			iput(bmp_vi);
-			goto abort;
-		}
-	}
-	goto find_next_index_buffer;
-unm_EOD:
-	if (ia_page) {
-		unlock_page(ia_page);
-		ntfs_unmap_page(ia_page);
-	}
-	ntfs_unmap_page(bmp_page);
-	iput(bmp_vi);
-EOD:
-	/* We are finished, set fpos to EOD. */
-	actor->pos = i_size + vol->mft_record_size;
-abort:
-	kfree(name);
-	return 0;
-err_out:
-	if (bmp_page) {
-		ntfs_unmap_page(bmp_page);
-iput_err_out:
-		iput(bmp_vi);
-	}
-	if (ia_page) {
-		unlock_page(ia_page);
-		ntfs_unmap_page(ia_page);
-	}
-	kfree(ir);
-	kfree(name);
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(ndir);
-	if (!err)
-		err = -EIO;
-	ntfs_debug("Failed. Returning error code %i.", -err);
-	return err;
-}
-
-/**
- * ntfs_dir_open - called when an inode is about to be opened
- * @vi:		inode to be opened
- * @filp:	file structure describing the inode
- *
- * Limit directory size to the page cache limit on architectures where unsigned
- * long is 32-bits. This is the most we can do for now without overflowing the
- * page cache page index. Doing it this way means we don't run into problems
- * because of existing too large directories. It would be better to allow the
- * user to read the accessible part of the directory but I doubt very much
- * anyone is going to hit this check on a 32-bit architecture, so there is no
- * point in adding the extra complexity required to support this.
- *
- * On 64-bit architectures, the check is hopefully optimized away by the
- * compiler.
- */
-static int ntfs_dir_open(struct inode *vi, struct file *filp)
-{
-	if (sizeof(unsigned long) < 8) {
-		if (i_size_read(vi) > MAX_LFS_FILESIZE)
-			return -EFBIG;
-	}
-	return 0;
-}
-
-#ifdef NTFS_RW
-
-/**
- * ntfs_dir_fsync - sync a directory to disk
- * @filp:	directory to be synced
- * @start:	offset in bytes of the beginning of data range to sync
- * @end:	offset in bytes of the end of data range (inclusive)
- * @datasync:	if non-zero only flush user data and not metadata
- *
- * Data integrity sync of a directory to disk.  Used for fsync, fdatasync, and
- * msync system calls.  This function is based on file.c::ntfs_file_fsync().
- *
- * Write the mft record and all associated extent mft records as well as the
- * $INDEX_ALLOCATION and $BITMAP attributes and then sync the block device.
- *
- * If @datasync is true, we do not wait on the inode(s) to be written out
- * but we always wait on the page cache pages to be written out.
- *
- * Note: In the past @filp could be NULL so we ignore it as we don't need it
- * anyway.
- *
- * Locking: Caller must hold i_mutex on the inode.
- *
- * TODO: We should probably also write all attribute/index inodes associated
- * with this inode but since we have no simple way of getting to them we ignore
- * this problem for now.  We do write the $BITMAP attribute if it is present
- * which is the important one for a directory so things are not too bad.
- */
-static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
-			  int datasync)
-{
-	struct inode *bmp_vi, *vi = filp->f_mapping->host;
-	int err, ret;
-	ntfs_attr na;
-
-	ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
-
-	err = file_write_and_wait_range(filp, start, end);
-	if (err)
-		return err;
-	inode_lock(vi);
-
-	BUG_ON(!S_ISDIR(vi->i_mode));
-	/* If the bitmap attribute inode is in memory sync it, too. */
-	na.mft_no = vi->i_ino;
-	na.type = AT_BITMAP;
-	na.name = I30;
-	na.name_len = 4;
-	bmp_vi = ilookup5(vi->i_sb, vi->i_ino, ntfs_test_inode, &na);
-	if (bmp_vi) {
- 		write_inode_now(bmp_vi, !datasync);
-		iput(bmp_vi);
-	}
-	ret = __ntfs_write_inode(vi, 1);
-	write_inode_now(vi, !datasync);
-	err = sync_blockdev(vi->i_sb->s_bdev);
-	if (unlikely(err && !ret))
-		ret = err;
-	if (likely(!ret))
-		ntfs_debug("Done.");
-	else
-		ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx.  Error "
-				"%u.", datasync ? "data" : "", vi->i_ino, -ret);
-	inode_unlock(vi);
-	return ret;
-}
-
-#endif /* NTFS_RW */
-
-WRAP_DIR_ITER(ntfs_readdir) // FIXME!
-const struct file_operations ntfs_dir_ops = {
-	.llseek		= generic_file_llseek,	/* Seek inside directory. */
-	.read		= generic_read_dir,	/* Return -EISDIR. */
-	.iterate_shared	= shared_ntfs_readdir,	/* Read directory contents. */
-#ifdef NTFS_RW
-	.fsync		= ntfs_dir_fsync,	/* Sync a directory to disk. */
-#endif /* NTFS_RW */
-	/*.ioctl	= ,*/			/* Perform function on the
-						   mounted filesystem. */
-	.open		= ntfs_dir_open,	/* Open directory. */
-};
diff --git a/fs/ntfs/dir.h b/fs/ntfs/dir.h
deleted file mode 100644
index 0e326753df40..000000000000
--- a/fs/ntfs/dir.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * dir.h - Defines for directory handling in NTFS Linux kernel driver. Part of
- *	   the Linux-NTFS project.
- *
- * Copyright (c) 2002-2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_DIR_H
-#define _LINUX_NTFS_DIR_H
-
-#include "layout.h"
-#include "inode.h"
-#include "types.h"
-
-/*
- * ntfs_name is used to return the file name to the caller of
- * ntfs_lookup_inode_by_name() in order for the caller (namei.c::ntfs_lookup())
- * to be able to deal with dcache aliasing issues.
- */
-typedef struct {
-	MFT_REF mref;
-	FILE_NAME_TYPE_FLAGS type;
-	u8 len;
-	ntfschar name[0];
-} __attribute__ ((__packed__)) ntfs_name;
-
-/* The little endian Unicode string $I30 as a global constant. */
-extern ntfschar I30[5];
-
-extern MFT_REF ntfs_lookup_inode_by_name(ntfs_inode *dir_ni,
-		const ntfschar *uname, const int uname_len, ntfs_name **res);
-
-#endif /* _LINUX_NTFS_FS_DIR_H */
diff --git a/fs/ntfs/endian.h b/fs/ntfs/endian.h
deleted file mode 100644
index f30c139bf9ae..000000000000
--- a/fs/ntfs/endian.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * endian.h - Defines for endianness handling in NTFS Linux kernel driver.
- *	      Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_ENDIAN_H
-#define _LINUX_NTFS_ENDIAN_H
-
-#include <asm/byteorder.h>
-#include "types.h"
-
-/*
- * Signed endianness conversion functions.
- */
-
-static inline s16 sle16_to_cpu(sle16 x)
-{
-	return le16_to_cpu((__force le16)x);
-}
-
-static inline s32 sle32_to_cpu(sle32 x)
-{
-	return le32_to_cpu((__force le32)x);
-}
-
-static inline s64 sle64_to_cpu(sle64 x)
-{
-	return le64_to_cpu((__force le64)x);
-}
-
-static inline s16 sle16_to_cpup(sle16 *x)
-{
-	return le16_to_cpu(*(__force le16*)x);
-}
-
-static inline s32 sle32_to_cpup(sle32 *x)
-{
-	return le32_to_cpu(*(__force le32*)x);
-}
-
-static inline s64 sle64_to_cpup(sle64 *x)
-{
-	return le64_to_cpu(*(__force le64*)x);
-}
-
-static inline sle16 cpu_to_sle16(s16 x)
-{
-	return (__force sle16)cpu_to_le16(x);
-}
-
-static inline sle32 cpu_to_sle32(s32 x)
-{
-	return (__force sle32)cpu_to_le32(x);
-}
-
-static inline sle64 cpu_to_sle64(s64 x)
-{
-	return (__force sle64)cpu_to_le64(x);
-}
-
-static inline sle16 cpu_to_sle16p(s16 *x)
-{
-	return (__force sle16)cpu_to_le16(*x);
-}
-
-static inline sle32 cpu_to_sle32p(s32 *x)
-{
-	return (__force sle32)cpu_to_le32(*x);
-}
-
-static inline sle64 cpu_to_sle64p(s64 *x)
-{
-	return (__force sle64)cpu_to_le64(*x);
-}
-
-#endif /* _LINUX_NTFS_ENDIAN_H */
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
deleted file mode 100644
index 297c0b9db621..000000000000
--- a/fs/ntfs/file.c
+++ /dev/null
@@ -1,1997 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * file.c - NTFS kernel file operations.  Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc.
- */
-
-#include <linux/blkdev.h>
-#include <linux/backing-dev.h>
-#include <linux/buffer_head.h>
-#include <linux/gfp.h>
-#include <linux/pagemap.h>
-#include <linux/pagevec.h>
-#include <linux/sched/signal.h>
-#include <linux/swap.h>
-#include <linux/uio.h>
-#include <linux/writeback.h>
-
-#include <asm/page.h>
-#include <linux/uaccess.h>
-
-#include "attrib.h"
-#include "bitmap.h"
-#include "inode.h"
-#include "debug.h"
-#include "lcnalloc.h"
-#include "malloc.h"
-#include "mft.h"
-#include "ntfs.h"
-
-/**
- * ntfs_file_open - called when an inode is about to be opened
- * @vi:		inode to be opened
- * @filp:	file structure describing the inode
- *
- * Limit file size to the page cache limit on architectures where unsigned long
- * is 32-bits. This is the most we can do for now without overflowing the page
- * cache page index. Doing it this way means we don't run into problems because
- * of existing too large files. It would be better to allow the user to read
- * the beginning of the file but I doubt very much anyone is going to hit this
- * check on a 32-bit architecture, so there is no point in adding the extra
- * complexity required to support this.
- *
- * On 64-bit architectures, the check is hopefully optimized away by the
- * compiler.
- *
- * After the check passes, just call generic_file_open() to do its work.
- */
-static int ntfs_file_open(struct inode *vi, struct file *filp)
-{
-	if (sizeof(unsigned long) < 8) {
-		if (i_size_read(vi) > MAX_LFS_FILESIZE)
-			return -EOVERFLOW;
-	}
-	return generic_file_open(vi, filp);
-}
-
-#ifdef NTFS_RW
-
-/**
- * ntfs_attr_extend_initialized - extend the initialized size of an attribute
- * @ni:			ntfs inode of the attribute to extend
- * @new_init_size:	requested new initialized size in bytes
- *
- * Extend the initialized size of an attribute described by the ntfs inode @ni
- * to @new_init_size bytes.  This involves zeroing any non-sparse space between
- * the old initialized size and @new_init_size both in the page cache and on
- * disk (if relevant complete pages are already uptodate in the page cache then
- * these are simply marked dirty).
- *
- * As a side-effect, the file size (vfs inode->i_size) may be incremented as,
- * in the resident attribute case, it is tied to the initialized size and, in
- * the non-resident attribute case, it may not fall below the initialized size.
- *
- * Note that if the attribute is resident, we do not need to touch the page
- * cache at all.  This is because if the page cache page is not uptodate we
- * bring it uptodate later, when doing the write to the mft record since we
- * then already have the page mapped.  And if the page is uptodate, the
- * non-initialized region will already have been zeroed when the page was
- * brought uptodate and the region may in fact already have been overwritten
- * with new data via mmap() based writes, so we cannot just zero it.  And since
- * POSIX specifies that the behaviour of resizing a file whilst it is mmap()ped
- * is unspecified, we choose not to do zeroing and thus we do not need to touch
- * the page at all.  For a more detailed explanation see ntfs_truncate() in
- * fs/ntfs/inode.c.
- *
- * Return 0 on success and -errno on error.  In the case that an error is
- * encountered it is possible that the initialized size will already have been
- * incremented some way towards @new_init_size but it is guaranteed that if
- * this is the case, the necessary zeroing will also have happened and that all
- * metadata is self-consistent.
- *
- * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be
- *	    held by the caller.
- */
-static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
-{
-	s64 old_init_size;
-	loff_t old_i_size;
-	pgoff_t index, end_index;
-	unsigned long flags;
-	struct inode *vi = VFS_I(ni);
-	ntfs_inode *base_ni;
-	MFT_RECORD *m = NULL;
-	ATTR_RECORD *a;
-	ntfs_attr_search_ctx *ctx = NULL;
-	struct address_space *mapping;
-	struct page *page = NULL;
-	u8 *kattr;
-	int err;
-	u32 attr_len;
-
-	read_lock_irqsave(&ni->size_lock, flags);
-	old_init_size = ni->initialized_size;
-	old_i_size = i_size_read(vi);
-	BUG_ON(new_init_size > ni->allocated_size);
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
-			"old_initialized_size 0x%llx, "
-			"new_initialized_size 0x%llx, i_size 0x%llx.",
-			vi->i_ino, (unsigned)le32_to_cpu(ni->type),
-			(unsigned long long)old_init_size,
-			(unsigned long long)new_init_size, old_i_size);
-	if (!NInoAttr(ni))
-		base_ni = ni;
-	else
-		base_ni = ni->ext.base_ntfs_ino;
-	/* Use goto to reduce indentation and we need the label below anyway. */
-	if (NInoNonResident(ni))
-		goto do_non_resident_extend;
-	BUG_ON(old_init_size != old_i_size);
-	m = map_mft_record(base_ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		m = NULL;
-		goto err_out;
-	}
-	ctx = ntfs_attr_get_search_ctx(base_ni, m);
-	if (unlikely(!ctx)) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT)
-			err = -EIO;
-		goto err_out;
-	}
-	m = ctx->mrec;
-	a = ctx->attr;
-	BUG_ON(a->non_resident);
-	/* The total length of the attribute value. */
-	attr_len = le32_to_cpu(a->data.resident.value_length);
-	BUG_ON(old_i_size != (loff_t)attr_len);
-	/*
-	 * Do the zeroing in the mft record and update the attribute size in
-	 * the mft record.
-	 */
-	kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
-	memset(kattr + attr_len, 0, new_init_size - attr_len);
-	a->data.resident.value_length = cpu_to_le32((u32)new_init_size);
-	/* Finally, update the sizes in the vfs and ntfs inodes. */
-	write_lock_irqsave(&ni->size_lock, flags);
-	i_size_write(vi, new_init_size);
-	ni->initialized_size = new_init_size;
-	write_unlock_irqrestore(&ni->size_lock, flags);
-	goto done;
-do_non_resident_extend:
-	/*
-	 * If the new initialized size @new_init_size exceeds the current file
-	 * size (vfs inode->i_size), we need to extend the file size to the
-	 * new initialized size.
-	 */
-	if (new_init_size > old_i_size) {
-		m = map_mft_record(base_ni);
-		if (IS_ERR(m)) {
-			err = PTR_ERR(m);
-			m = NULL;
-			goto err_out;
-		}
-		ctx = ntfs_attr_get_search_ctx(base_ni, m);
-		if (unlikely(!ctx)) {
-			err = -ENOMEM;
-			goto err_out;
-		}
-		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-				CASE_SENSITIVE, 0, NULL, 0, ctx);
-		if (unlikely(err)) {
-			if (err == -ENOENT)
-				err = -EIO;
-			goto err_out;
-		}
-		m = ctx->mrec;
-		a = ctx->attr;
-		BUG_ON(!a->non_resident);
-		BUG_ON(old_i_size != (loff_t)
-				sle64_to_cpu(a->data.non_resident.data_size));
-		a->data.non_resident.data_size = cpu_to_sle64(new_init_size);
-		flush_dcache_mft_record_page(ctx->ntfs_ino);
-		mark_mft_record_dirty(ctx->ntfs_ino);
-		/* Update the file size in the vfs inode. */
-		i_size_write(vi, new_init_size);
-		ntfs_attr_put_search_ctx(ctx);
-		ctx = NULL;
-		unmap_mft_record(base_ni);
-		m = NULL;
-	}
-	mapping = vi->i_mapping;
-	index = old_init_size >> PAGE_SHIFT;
-	end_index = (new_init_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	do {
-		/*
-		 * Read the page.  If the page is not present, this will zero
-		 * the uninitialized regions for us.
-		 */
-		page = read_mapping_page(mapping, index, NULL);
-		if (IS_ERR(page)) {
-			err = PTR_ERR(page);
-			goto init_err_out;
-		}
-		/*
-		 * Update the initialized size in the ntfs inode.  This is
-		 * enough to make ntfs_writepage() work.
-		 */
-		write_lock_irqsave(&ni->size_lock, flags);
-		ni->initialized_size = (s64)(index + 1) << PAGE_SHIFT;
-		if (ni->initialized_size > new_init_size)
-			ni->initialized_size = new_init_size;
-		write_unlock_irqrestore(&ni->size_lock, flags);
-		/* Set the page dirty so it gets written out. */
-		set_page_dirty(page);
-		put_page(page);
-		/*
-		 * Play nice with the vm and the rest of the system.  This is
-		 * very much needed as we can potentially be modifying the
-		 * initialised size from a very small value to a really huge
-		 * value, e.g.
-		 *	f = open(somefile, O_TRUNC);
-		 *	truncate(f, 10GiB);
-		 *	seek(f, 10GiB);
-		 *	write(f, 1);
-		 * And this would mean we would be marking dirty hundreds of
-		 * thousands of pages or as in the above example more than
-		 * two and a half million pages!
-		 *
-		 * TODO: For sparse pages could optimize this workload by using
-		 * the FsMisc / MiscFs page bit as a "PageIsSparse" bit.  This
-		 * would be set in read_folio for sparse pages and here we would
-		 * not need to mark dirty any pages which have this bit set.
-		 * The only caveat is that we have to clear the bit everywhere
-		 * where we allocate any clusters that lie in the page or that
-		 * contain the page.
-		 *
-		 * TODO: An even greater optimization would be for us to only
-		 * call read_folio() on pages which are not in sparse regions as
-		 * determined from the runlist.  This would greatly reduce the
-		 * number of pages we read and make dirty in the case of sparse
-		 * files.
-		 */
-		balance_dirty_pages_ratelimited(mapping);
-		cond_resched();
-	} while (++index < end_index);
-	read_lock_irqsave(&ni->size_lock, flags);
-	BUG_ON(ni->initialized_size != new_init_size);
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	/* Now bring in sync the initialized_size in the mft record. */
-	m = map_mft_record(base_ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		m = NULL;
-		goto init_err_out;
-	}
-	ctx = ntfs_attr_get_search_ctx(base_ni, m);
-	if (unlikely(!ctx)) {
-		err = -ENOMEM;
-		goto init_err_out;
-	}
-	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT)
-			err = -EIO;
-		goto init_err_out;
-	}
-	m = ctx->mrec;
-	a = ctx->attr;
-	BUG_ON(!a->non_resident);
-	a->data.non_resident.initialized_size = cpu_to_sle64(new_init_size);
-done:
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(base_ni);
-	ntfs_debug("Done, initialized_size 0x%llx, i_size 0x%llx.",
-			(unsigned long long)new_init_size, i_size_read(vi));
-	return 0;
-init_err_out:
-	write_lock_irqsave(&ni->size_lock, flags);
-	ni->initialized_size = old_init_size;
-	write_unlock_irqrestore(&ni->size_lock, flags);
-err_out:
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(base_ni);
-	ntfs_debug("Failed.  Returning error code %i.", err);
-	return err;
-}
-
-static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb,
-		struct iov_iter *from)
-{
-	loff_t pos;
-	s64 end, ll;
-	ssize_t err;
-	unsigned long flags;
-	struct file *file = iocb->ki_filp;
-	struct inode *vi = file_inode(file);
-	ntfs_inode *ni = NTFS_I(vi);
-	ntfs_volume *vol = ni->vol;
-
-	ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
-			"0x%llx, count 0x%zx.", vi->i_ino,
-			(unsigned)le32_to_cpu(ni->type),
-			(unsigned long long)iocb->ki_pos,
-			iov_iter_count(from));
-	err = generic_write_checks(iocb, from);
-	if (unlikely(err <= 0))
-		goto out;
-	/*
-	 * All checks have passed.  Before we start doing any writing we want
-	 * to abort any totally illegal writes.
-	 */
-	BUG_ON(NInoMstProtected(ni));
-	BUG_ON(ni->type != AT_DATA);
-	/* If file is encrypted, deny access, just like NT4. */
-	if (NInoEncrypted(ni)) {
-		/* Only $DATA attributes can be encrypted. */
-		/*
-		 * Reminder for later: Encrypted files are _always_
-		 * non-resident so that the content can always be encrypted.
-		 */
-		ntfs_debug("Denying write access to encrypted file.");
-		err = -EACCES;
-		goto out;
-	}
-	if (NInoCompressed(ni)) {
-		/* Only unnamed $DATA attribute can be compressed. */
-		BUG_ON(ni->name_len);
-		/*
-		 * Reminder for later: If resident, the data is not actually
-		 * compressed.  Only on the switch to non-resident does
-		 * compression kick in.  This is in contrast to encrypted files
-		 * (see above).
-		 */
-		ntfs_error(vi->i_sb, "Writing to compressed files is not "
-				"implemented yet.  Sorry.");
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-	err = file_remove_privs(file);
-	if (unlikely(err))
-		goto out;
-	/*
-	 * Our ->update_time method always succeeds thus file_update_time()
-	 * cannot fail either so there is no need to check the return code.
-	 */
-	file_update_time(file);
-	pos = iocb->ki_pos;
-	/* The first byte after the last cluster being written to. */
-	end = (pos + iov_iter_count(from) + vol->cluster_size_mask) &
-			~(u64)vol->cluster_size_mask;
-	/*
-	 * If the write goes beyond the allocated size, extend the allocation
-	 * to cover the whole of the write, rounded up to the nearest cluster.
-	 */
-	read_lock_irqsave(&ni->size_lock, flags);
-	ll = ni->allocated_size;
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	if (end > ll) {
-		/*
-		 * Extend the allocation without changing the data size.
-		 *
-		 * Note we ensure the allocation is big enough to at least
-		 * write some data but we do not require the allocation to be
-		 * complete, i.e. it may be partial.
-		 */
-		ll = ntfs_attr_extend_allocation(ni, end, -1, pos);
-		if (likely(ll >= 0)) {
-			BUG_ON(pos >= ll);
-			/* If the extension was partial truncate the write. */
-			if (end > ll) {
-				ntfs_debug("Truncating write to inode 0x%lx, "
-						"attribute type 0x%x, because "
-						"the allocation was only "
-						"partially extended.",
-						vi->i_ino, (unsigned)
-						le32_to_cpu(ni->type));
-				iov_iter_truncate(from, ll - pos);
-			}
-		} else {
-			err = ll;
-			read_lock_irqsave(&ni->size_lock, flags);
-			ll = ni->allocated_size;
-			read_unlock_irqrestore(&ni->size_lock, flags);
-			/* Perform a partial write if possible or fail. */
-			if (pos < ll) {
-				ntfs_debug("Truncating write to inode 0x%lx "
-						"attribute type 0x%x, because "
-						"extending the allocation "
-						"failed (error %d).",
-						vi->i_ino, (unsigned)
-						le32_to_cpu(ni->type),
-						(int)-err);
-				iov_iter_truncate(from, ll - pos);
-			} else {
-				if (err != -ENOSPC)
-					ntfs_error(vi->i_sb, "Cannot perform "
-							"write to inode "
-							"0x%lx, attribute "
-							"type 0x%x, because "
-							"extending the "
-							"allocation failed "
-							"(error %ld).",
-							vi->i_ino, (unsigned)
-							le32_to_cpu(ni->type),
-							(long)-err);
-				else
-					ntfs_debug("Cannot perform write to "
-							"inode 0x%lx, "
-							"attribute type 0x%x, "
-							"because there is not "
-							"space left.",
-							vi->i_ino, (unsigned)
-							le32_to_cpu(ni->type));
-				goto out;
-			}
-		}
-	}
-	/*
-	 * If the write starts beyond the initialized size, extend it up to the
-	 * beginning of the write and initialize all non-sparse space between
-	 * the old initialized size and the new one.  This automatically also
-	 * increments the vfs inode->i_size to keep it above or equal to the
-	 * initialized_size.
-	 */
-	read_lock_irqsave(&ni->size_lock, flags);
-	ll = ni->initialized_size;
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	if (pos > ll) {
-		/*
-		 * Wait for ongoing direct i/o to complete before proceeding.
-		 * New direct i/o cannot start as we hold i_mutex.
-		 */
-		inode_dio_wait(vi);
-		err = ntfs_attr_extend_initialized(ni, pos);
-		if (unlikely(err < 0))
-			ntfs_error(vi->i_sb, "Cannot perform write to inode "
-					"0x%lx, attribute type 0x%x, because "
-					"extending the initialized size "
-					"failed (error %d).", vi->i_ino,
-					(unsigned)le32_to_cpu(ni->type),
-					(int)-err);
-	}
-out:
-	return err;
-}
-
-/**
- * __ntfs_grab_cache_pages - obtain a number of locked pages
- * @mapping:	address space mapping from which to obtain page cache pages
- * @index:	starting index in @mapping at which to begin obtaining pages
- * @nr_pages:	number of page cache pages to obtain
- * @pages:	array of pages in which to return the obtained page cache pages
- * @cached_page: allocated but as yet unused page
- *
- * Obtain @nr_pages locked page cache pages from the mapping @mapping and
- * starting at index @index.
- *
- * If a page is newly created, add it to lru list
- *
- * Note, the page locks are obtained in ascending page index order.
- */
-static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
-		pgoff_t index, const unsigned nr_pages, struct page **pages,
-		struct page **cached_page)
-{
-	int err, nr;
-
-	BUG_ON(!nr_pages);
-	err = nr = 0;
-	do {
-		pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK |
-				FGP_ACCESSED);
-		if (!pages[nr]) {
-			if (!*cached_page) {
-				*cached_page = page_cache_alloc(mapping);
-				if (unlikely(!*cached_page)) {
-					err = -ENOMEM;
-					goto err_out;
-				}
-			}
-			err = add_to_page_cache_lru(*cached_page, mapping,
-				   index,
-				   mapping_gfp_constraint(mapping, GFP_KERNEL));
-			if (unlikely(err)) {
-				if (err == -EEXIST)
-					continue;
-				goto err_out;
-			}
-			pages[nr] = *cached_page;
-			*cached_page = NULL;
-		}
-		index++;
-		nr++;
-	} while (nr < nr_pages);
-out:
-	return err;
-err_out:
-	while (nr > 0) {
-		unlock_page(pages[--nr]);
-		put_page(pages[nr]);
-	}
-	goto out;
-}
-
-static inline void ntfs_submit_bh_for_read(struct buffer_head *bh)
-{
-	lock_buffer(bh);
-	get_bh(bh);
-	bh->b_end_io = end_buffer_read_sync;
-	submit_bh(REQ_OP_READ, bh);
-}
-
-/**
- * ntfs_prepare_pages_for_non_resident_write - prepare pages for receiving data
- * @pages:	array of destination pages
- * @nr_pages:	number of pages in @pages
- * @pos:	byte position in file at which the write begins
- * @bytes:	number of bytes to be written
- *
- * This is called for non-resident attributes from ntfs_file_buffered_write()
- * with i_mutex held on the inode (@pages[0]->mapping->host).  There are
- * @nr_pages pages in @pages which are locked but not kmap()ped.  The source
- * data has not yet been copied into the @pages.
- * 
- * Need to fill any holes with actual clusters, allocate buffers if necessary,
- * ensure all the buffers are mapped, and bring uptodate any buffers that are
- * only partially being written to.
- *
- * If @nr_pages is greater than one, we are guaranteed that the cluster size is
- * greater than PAGE_SIZE, that all pages in @pages are entirely inside
- * the same cluster and that they are the entirety of that cluster, and that
- * the cluster is sparse, i.e. we need to allocate a cluster to fill the hole.
- *
- * i_size is not to be modified yet.
- *
- * Return 0 on success or -errno on error.
- */
-static int ntfs_prepare_pages_for_non_resident_write(struct page **pages,
-		unsigned nr_pages, s64 pos, size_t bytes)
-{
-	VCN vcn, highest_vcn = 0, cpos, cend, bh_cpos, bh_cend;
-	LCN lcn;
-	s64 bh_pos, vcn_len, end, initialized_size;
-	sector_t lcn_block;
-	struct folio *folio;
-	struct inode *vi;
-	ntfs_inode *ni, *base_ni = NULL;
-	ntfs_volume *vol;
-	runlist_element *rl, *rl2;
-	struct buffer_head *bh, *head, *wait[2], **wait_bh = wait;
-	ntfs_attr_search_ctx *ctx = NULL;
-	MFT_RECORD *m = NULL;
-	ATTR_RECORD *a = NULL;
-	unsigned long flags;
-	u32 attr_rec_len = 0;
-	unsigned blocksize, u;
-	int err, mp_size;
-	bool rl_write_locked, was_hole, is_retry;
-	unsigned char blocksize_bits;
-	struct {
-		u8 runlist_merged:1;
-		u8 mft_attr_mapped:1;
-		u8 mp_rebuilt:1;
-		u8 attr_switched:1;
-	} status = { 0, 0, 0, 0 };
-
-	BUG_ON(!nr_pages);
-	BUG_ON(!pages);
-	BUG_ON(!*pages);
-	vi = pages[0]->mapping->host;
-	ni = NTFS_I(vi);
-	vol = ni->vol;
-	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
-			"index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
-			vi->i_ino, ni->type, pages[0]->index, nr_pages,
-			(long long)pos, bytes);
-	blocksize = vol->sb->s_blocksize;
-	blocksize_bits = vol->sb->s_blocksize_bits;
-	rl_write_locked = false;
-	rl = NULL;
-	err = 0;
-	vcn = lcn = -1;
-	vcn_len = 0;
-	lcn_block = -1;
-	was_hole = false;
-	cpos = pos >> vol->cluster_size_bits;
-	end = pos + bytes;
-	cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits;
-	/*
-	 * Loop over each buffer in each folio.  Use goto to
-	 * reduce indentation.
-	 */
-	u = 0;
-do_next_folio:
-	folio = page_folio(pages[u]);
-	bh_pos = folio_pos(folio);
-	head = folio_buffers(folio);
-	if (!head)
-		/*
-		 * create_empty_buffers() will create uptodate/dirty
-		 * buffers if the folio is uptodate/dirty.
-		 */
-		head = create_empty_buffers(folio, blocksize, 0);
-	bh = head;
-	do {
-		VCN cdelta;
-		s64 bh_end;
-		unsigned bh_cofs;
-
-		/* Clear buffer_new on all buffers to reinitialise state. */
-		if (buffer_new(bh))
-			clear_buffer_new(bh);
-		bh_end = bh_pos + blocksize;
-		bh_cpos = bh_pos >> vol->cluster_size_bits;
-		bh_cofs = bh_pos & vol->cluster_size_mask;
-		if (buffer_mapped(bh)) {
-			/*
-			 * The buffer is already mapped.  If it is uptodate,
-			 * ignore it.
-			 */
-			if (buffer_uptodate(bh))
-				continue;
-			/*
-			 * The buffer is not uptodate.  If the folio is uptodate
-			 * set the buffer uptodate and otherwise ignore it.
-			 */
-			if (folio_test_uptodate(folio)) {
-				set_buffer_uptodate(bh);
-				continue;
-			}
-			/*
-			 * Neither the folio nor the buffer are uptodate.  If
-			 * the buffer is only partially being written to, we
-			 * need to read it in before the write, i.e. now.
-			 */
-			if ((bh_pos < pos && bh_end > pos) ||
-					(bh_pos < end && bh_end > end)) {
-				/*
-				 * If the buffer is fully or partially within
-				 * the initialized size, do an actual read.
-				 * Otherwise, simply zero the buffer.
-				 */
-				read_lock_irqsave(&ni->size_lock, flags);
-				initialized_size = ni->initialized_size;
-				read_unlock_irqrestore(&ni->size_lock, flags);
-				if (bh_pos < initialized_size) {
-					ntfs_submit_bh_for_read(bh);
-					*wait_bh++ = bh;
-				} else {
-					folio_zero_range(folio, bh_offset(bh),
-							blocksize);
-					set_buffer_uptodate(bh);
-				}
-			}
-			continue;
-		}
-		/* Unmapped buffer.  Need to map it. */
-		bh->b_bdev = vol->sb->s_bdev;
-		/*
-		 * If the current buffer is in the same clusters as the map
-		 * cache, there is no need to check the runlist again.  The
-		 * map cache is made up of @vcn, which is the first cached file
-		 * cluster, @vcn_len which is the number of cached file
-		 * clusters, @lcn is the device cluster corresponding to @vcn,
-		 * and @lcn_block is the block number corresponding to @lcn.
-		 */
-		cdelta = bh_cpos - vcn;
-		if (likely(!cdelta || (cdelta > 0 && cdelta < vcn_len))) {
-map_buffer_cached:
-			BUG_ON(lcn < 0);
-			bh->b_blocknr = lcn_block +
-					(cdelta << (vol->cluster_size_bits -
-					blocksize_bits)) +
-					(bh_cofs >> blocksize_bits);
-			set_buffer_mapped(bh);
-			/*
-			 * If the folio is uptodate so is the buffer.  If the
-			 * buffer is fully outside the write, we ignore it if
-			 * it was already allocated and we mark it dirty so it
-			 * gets written out if we allocated it.  On the other
-			 * hand, if we allocated the buffer but we are not
-			 * marking it dirty we set buffer_new so we can do
-			 * error recovery.
-			 */
-			if (folio_test_uptodate(folio)) {
-				if (!buffer_uptodate(bh))
-					set_buffer_uptodate(bh);
-				if (unlikely(was_hole)) {
-					/* We allocated the buffer. */
-					clean_bdev_bh_alias(bh);
-					if (bh_end <= pos || bh_pos >= end)
-						mark_buffer_dirty(bh);
-					else
-						set_buffer_new(bh);
-				}
-				continue;
-			}
-			/* Page is _not_ uptodate. */
-			if (likely(!was_hole)) {
-				/*
-				 * Buffer was already allocated.  If it is not
-				 * uptodate and is only partially being written
-				 * to, we need to read it in before the write,
-				 * i.e. now.
-				 */
-				if (!buffer_uptodate(bh) && bh_pos < end &&
-						bh_end > pos &&
-						(bh_pos < pos ||
-						bh_end > end)) {
-					/*
-					 * If the buffer is fully or partially
-					 * within the initialized size, do an
-					 * actual read.  Otherwise, simply zero
-					 * the buffer.
-					 */
-					read_lock_irqsave(&ni->size_lock,
-							flags);
-					initialized_size = ni->initialized_size;
-					read_unlock_irqrestore(&ni->size_lock,
-							flags);
-					if (bh_pos < initialized_size) {
-						ntfs_submit_bh_for_read(bh);
-						*wait_bh++ = bh;
-					} else {
-						folio_zero_range(folio,
-								bh_offset(bh),
-								blocksize);
-						set_buffer_uptodate(bh);
-					}
-				}
-				continue;
-			}
-			/* We allocated the buffer. */
-			clean_bdev_bh_alias(bh);
-			/*
-			 * If the buffer is fully outside the write, zero it,
-			 * set it uptodate, and mark it dirty so it gets
-			 * written out.  If it is partially being written to,
-			 * zero region surrounding the write but leave it to
-			 * commit write to do anything else.  Finally, if the
-			 * buffer is fully being overwritten, do nothing.
-			 */
-			if (bh_end <= pos || bh_pos >= end) {
-				if (!buffer_uptodate(bh)) {
-					folio_zero_range(folio, bh_offset(bh),
-							blocksize);
-					set_buffer_uptodate(bh);
-				}
-				mark_buffer_dirty(bh);
-				continue;
-			}
-			set_buffer_new(bh);
-			if (!buffer_uptodate(bh) &&
-					(bh_pos < pos || bh_end > end)) {
-				u8 *kaddr;
-				unsigned pofs;
-					
-				kaddr = kmap_local_folio(folio, 0);
-				if (bh_pos < pos) {
-					pofs = bh_pos & ~PAGE_MASK;
-					memset(kaddr + pofs, 0, pos - bh_pos);
-				}
-				if (bh_end > end) {
-					pofs = end & ~PAGE_MASK;
-					memset(kaddr + pofs, 0, bh_end - end);
-				}
-				kunmap_local(kaddr);
-				flush_dcache_folio(folio);
-			}
-			continue;
-		}
-		/*
-		 * Slow path: this is the first buffer in the cluster.  If it
-		 * is outside allocated size and is not uptodate, zero it and
-		 * set it uptodate.
-		 */
-		read_lock_irqsave(&ni->size_lock, flags);
-		initialized_size = ni->allocated_size;
-		read_unlock_irqrestore(&ni->size_lock, flags);
-		if (bh_pos > initialized_size) {
-			if (folio_test_uptodate(folio)) {
-				if (!buffer_uptodate(bh))
-					set_buffer_uptodate(bh);
-			} else if (!buffer_uptodate(bh)) {
-				folio_zero_range(folio, bh_offset(bh),
-						blocksize);
-				set_buffer_uptodate(bh);
-			}
-			continue;
-		}
-		is_retry = false;
-		if (!rl) {
-			down_read(&ni->runlist.lock);
-retry_remap:
-			rl = ni->runlist.rl;
-		}
-		if (likely(rl != NULL)) {
-			/* Seek to element containing target cluster. */
-			while (rl->length && rl[1].vcn <= bh_cpos)
-				rl++;
-			lcn = ntfs_rl_vcn_to_lcn(rl, bh_cpos);
-			if (likely(lcn >= 0)) {
-				/*
-				 * Successful remap, setup the map cache and
-				 * use that to deal with the buffer.
-				 */
-				was_hole = false;
-				vcn = bh_cpos;
-				vcn_len = rl[1].vcn - vcn;
-				lcn_block = lcn << (vol->cluster_size_bits -
-						blocksize_bits);
-				cdelta = 0;
-				/*
-				 * If the number of remaining clusters touched
-				 * by the write is smaller or equal to the
-				 * number of cached clusters, unlock the
-				 * runlist as the map cache will be used from
-				 * now on.
-				 */
-				if (likely(vcn + vcn_len >= cend)) {
-					if (rl_write_locked) {
-						up_write(&ni->runlist.lock);
-						rl_write_locked = false;
-					} else
-						up_read(&ni->runlist.lock);
-					rl = NULL;
-				}
-				goto map_buffer_cached;
-			}
-		} else
-			lcn = LCN_RL_NOT_MAPPED;
-		/*
-		 * If it is not a hole and not out of bounds, the runlist is
-		 * probably unmapped so try to map it now.
-		 */
-		if (unlikely(lcn != LCN_HOLE && lcn != LCN_ENOENT)) {
-			if (likely(!is_retry && lcn == LCN_RL_NOT_MAPPED)) {
-				/* Attempt to map runlist. */
-				if (!rl_write_locked) {
-					/*
-					 * We need the runlist locked for
-					 * writing, so if it is locked for
-					 * reading relock it now and retry in
-					 * case it changed whilst we dropped
-					 * the lock.
-					 */
-					up_read(&ni->runlist.lock);
-					down_write(&ni->runlist.lock);
-					rl_write_locked = true;
-					goto retry_remap;
-				}
-				err = ntfs_map_runlist_nolock(ni, bh_cpos,
-						NULL);
-				if (likely(!err)) {
-					is_retry = true;
-					goto retry_remap;
-				}
-				/*
-				 * If @vcn is out of bounds, pretend @lcn is
-				 * LCN_ENOENT.  As long as the buffer is out
-				 * of bounds this will work fine.
-				 */
-				if (err == -ENOENT) {
-					lcn = LCN_ENOENT;
-					err = 0;
-					goto rl_not_mapped_enoent;
-				}
-			} else
-				err = -EIO;
-			/* Failed to map the buffer, even after retrying. */
-			bh->b_blocknr = -1;
-			ntfs_error(vol->sb, "Failed to write to inode 0x%lx, "
-					"attribute type 0x%x, vcn 0x%llx, "
-					"vcn offset 0x%x, because its "
-					"location on disk could not be "
-					"determined%s (error code %i).",
-					ni->mft_no, ni->type,
-					(unsigned long long)bh_cpos,
-					(unsigned)bh_pos &
-					vol->cluster_size_mask,
-					is_retry ? " even after retrying" : "",
-					err);
-			break;
-		}
-rl_not_mapped_enoent:
-		/*
-		 * The buffer is in a hole or out of bounds.  We need to fill
-		 * the hole, unless the buffer is in a cluster which is not
-		 * touched by the write, in which case we just leave the buffer
-		 * unmapped.  This can only happen when the cluster size is
-		 * less than the page cache size.
-		 */
-		if (unlikely(vol->cluster_size < PAGE_SIZE)) {
-			bh_cend = (bh_end + vol->cluster_size - 1) >>
-					vol->cluster_size_bits;
-			if ((bh_cend <= cpos || bh_cpos >= cend)) {
-				bh->b_blocknr = -1;
-				/*
-				 * If the buffer is uptodate we skip it.  If it
-				 * is not but the folio is uptodate, we can set
-				 * the buffer uptodate.  If the folio is not
-				 * uptodate, we can clear the buffer and set it
-				 * uptodate.  Whether this is worthwhile is
-				 * debatable and this could be removed.
-				 */
-				if (folio_test_uptodate(folio)) {
-					if (!buffer_uptodate(bh))
-						set_buffer_uptodate(bh);
-				} else if (!buffer_uptodate(bh)) {
-					folio_zero_range(folio, bh_offset(bh),
-						blocksize);
-					set_buffer_uptodate(bh);
-				}
-				continue;
-			}
-		}
-		/*
-		 * Out of bounds buffer is invalid if it was not really out of
-		 * bounds.
-		 */
-		BUG_ON(lcn != LCN_HOLE);
-		/*
-		 * We need the runlist locked for writing, so if it is locked
-		 * for reading relock it now and retry in case it changed
-		 * whilst we dropped the lock.
-		 */
-		BUG_ON(!rl);
-		if (!rl_write_locked) {
-			up_read(&ni->runlist.lock);
-			down_write(&ni->runlist.lock);
-			rl_write_locked = true;
-			goto retry_remap;
-		}
-		/* Find the previous last allocated cluster. */
-		BUG_ON(rl->lcn != LCN_HOLE);
-		lcn = -1;
-		rl2 = rl;
-		while (--rl2 >= ni->runlist.rl) {
-			if (rl2->lcn >= 0) {
-				lcn = rl2->lcn + rl2->length;
-				break;
-			}
-		}
-		rl2 = ntfs_cluster_alloc(vol, bh_cpos, 1, lcn, DATA_ZONE,
-				false);
-		if (IS_ERR(rl2)) {
-			err = PTR_ERR(rl2);
-			ntfs_debug("Failed to allocate cluster, error code %i.",
-					err);
-			break;
-		}
-		lcn = rl2->lcn;
-		rl = ntfs_runlists_merge(ni->runlist.rl, rl2);
-		if (IS_ERR(rl)) {
-			err = PTR_ERR(rl);
-			if (err != -ENOMEM)
-				err = -EIO;
-			if (ntfs_cluster_free_from_rl(vol, rl2)) {
-				ntfs_error(vol->sb, "Failed to release "
-						"allocated cluster in error "
-						"code path.  Run chkdsk to "
-						"recover the lost cluster.");
-				NVolSetErrors(vol);
-			}
-			ntfs_free(rl2);
-			break;
-		}
-		ni->runlist.rl = rl;
-		status.runlist_merged = 1;
-		ntfs_debug("Allocated cluster, lcn 0x%llx.",
-				(unsigned long long)lcn);
-		/* Map and lock the mft record and get the attribute record. */
-		if (!NInoAttr(ni))
-			base_ni = ni;
-		else
-			base_ni = ni->ext.base_ntfs_ino;
-		m = map_mft_record(base_ni);
-		if (IS_ERR(m)) {
-			err = PTR_ERR(m);
-			break;
-		}
-		ctx = ntfs_attr_get_search_ctx(base_ni, m);
-		if (unlikely(!ctx)) {
-			err = -ENOMEM;
-			unmap_mft_record(base_ni);
-			break;
-		}
-		status.mft_attr_mapped = 1;
-		err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-				CASE_SENSITIVE, bh_cpos, NULL, 0, ctx);
-		if (unlikely(err)) {
-			if (err == -ENOENT)
-				err = -EIO;
-			break;
-		}
-		m = ctx->mrec;
-		a = ctx->attr;
-		/*
-		 * Find the runlist element with which the attribute extent
-		 * starts.  Note, we cannot use the _attr_ version because we
-		 * have mapped the mft record.  That is ok because we know the
-		 * runlist fragment must be mapped already to have ever gotten
-		 * here, so we can just use the _rl_ version.
-		 */
-		vcn = sle64_to_cpu(a->data.non_resident.lowest_vcn);
-		rl2 = ntfs_rl_find_vcn_nolock(rl, vcn);
-		BUG_ON(!rl2);
-		BUG_ON(!rl2->length);
-		BUG_ON(rl2->lcn < LCN_HOLE);
-		highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
-		/*
-		 * If @highest_vcn is zero, calculate the real highest_vcn
-		 * (which can really be zero).
-		 */
-		if (!highest_vcn)
-			highest_vcn = (sle64_to_cpu(
-					a->data.non_resident.allocated_size) >>
-					vol->cluster_size_bits) - 1;
-		/*
-		 * Determine the size of the mapping pairs array for the new
-		 * extent, i.e. the old extent with the hole filled.
-		 */
-		mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, vcn,
-				highest_vcn);
-		if (unlikely(mp_size <= 0)) {
-			if (!(err = mp_size))
-				err = -EIO;
-			ntfs_debug("Failed to get size for mapping pairs "
-					"array, error code %i.", err);
-			break;
-		}
-		/*
-		 * Resize the attribute record to fit the new mapping pairs
-		 * array.
-		 */
-		attr_rec_len = le32_to_cpu(a->length);
-		err = ntfs_attr_record_resize(m, a, mp_size + le16_to_cpu(
-				a->data.non_resident.mapping_pairs_offset));
-		if (unlikely(err)) {
-			BUG_ON(err != -ENOSPC);
-			// TODO: Deal with this by using the current attribute
-			// and fill it with as much of the mapping pairs
-			// array as possible.  Then loop over each attribute
-			// extent rewriting the mapping pairs arrays as we go
-			// along and if when we reach the end we have not
-			// enough space, try to resize the last attribute
-			// extent and if even that fails, add a new attribute
-			// extent.
-			// We could also try to resize at each step in the hope
-			// that we will not need to rewrite every single extent.
-			// Note, we may need to decompress some extents to fill
-			// the runlist as we are walking the extents...
-			ntfs_error(vol->sb, "Not enough space in the mft "
-					"record for the extended attribute "
-					"record.  This case is not "
-					"implemented yet.");
-			err = -EOPNOTSUPP;
-			break ;
-		}
-		status.mp_rebuilt = 1;
-		/*
-		 * Generate the mapping pairs array directly into the attribute
-		 * record.
-		 */
-		err = ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
-				a->data.non_resident.mapping_pairs_offset),
-				mp_size, rl2, vcn, highest_vcn, NULL);
-		if (unlikely(err)) {
-			ntfs_error(vol->sb, "Cannot fill hole in inode 0x%lx, "
-					"attribute type 0x%x, because building "
-					"the mapping pairs failed with error "
-					"code %i.", vi->i_ino,
-					(unsigned)le32_to_cpu(ni->type), err);
-			err = -EIO;
-			break;
-		}
-		/* Update the highest_vcn but only if it was not set. */
-		if (unlikely(!a->data.non_resident.highest_vcn))
-			a->data.non_resident.highest_vcn =
-					cpu_to_sle64(highest_vcn);
-		/*
-		 * If the attribute is sparse/compressed, update the compressed
-		 * size in the ntfs_inode structure and the attribute record.
-		 */
-		if (likely(NInoSparse(ni) || NInoCompressed(ni))) {
-			/*
-			 * If we are not in the first attribute extent, switch
-			 * to it, but first ensure the changes will make it to
-			 * disk later.
-			 */
-			if (a->data.non_resident.lowest_vcn) {
-				flush_dcache_mft_record_page(ctx->ntfs_ino);
-				mark_mft_record_dirty(ctx->ntfs_ino);
-				ntfs_attr_reinit_search_ctx(ctx);
-				err = ntfs_attr_lookup(ni->type, ni->name,
-						ni->name_len, CASE_SENSITIVE,
-						0, NULL, 0, ctx);
-				if (unlikely(err)) {
-					status.attr_switched = 1;
-					break;
-				}
-				/* @m is not used any more so do not set it. */
-				a = ctx->attr;
-			}
-			write_lock_irqsave(&ni->size_lock, flags);
-			ni->itype.compressed.size += vol->cluster_size;
-			a->data.non_resident.compressed_size =
-					cpu_to_sle64(ni->itype.compressed.size);
-			write_unlock_irqrestore(&ni->size_lock, flags);
-		}
-		/* Ensure the changes make it to disk. */
-		flush_dcache_mft_record_page(ctx->ntfs_ino);
-		mark_mft_record_dirty(ctx->ntfs_ino);
-		ntfs_attr_put_search_ctx(ctx);
-		unmap_mft_record(base_ni);
-		/* Successfully filled the hole. */
-		status.runlist_merged = 0;
-		status.mft_attr_mapped = 0;
-		status.mp_rebuilt = 0;
-		/* Setup the map cache and use that to deal with the buffer. */
-		was_hole = true;
-		vcn = bh_cpos;
-		vcn_len = 1;
-		lcn_block = lcn << (vol->cluster_size_bits - blocksize_bits);
-		cdelta = 0;
-		/*
-		 * If the number of remaining clusters in the @pages is smaller
-		 * or equal to the number of cached clusters, unlock the
-		 * runlist as the map cache will be used from now on.
-		 */
-		if (likely(vcn + vcn_len >= cend)) {
-			up_write(&ni->runlist.lock);
-			rl_write_locked = false;
-			rl = NULL;
-		}
-		goto map_buffer_cached;
-	} while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
-	/* If there are no errors, do the next page. */
-	if (likely(!err && ++u < nr_pages))
-		goto do_next_folio;
-	/* If there are no errors, release the runlist lock if we took it. */
-	if (likely(!err)) {
-		if (unlikely(rl_write_locked)) {
-			up_write(&ni->runlist.lock);
-			rl_write_locked = false;
-		} else if (unlikely(rl))
-			up_read(&ni->runlist.lock);
-		rl = NULL;
-	}
-	/* If we issued read requests, let them complete. */
-	read_lock_irqsave(&ni->size_lock, flags);
-	initialized_size = ni->initialized_size;
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	while (wait_bh > wait) {
-		bh = *--wait_bh;
-		wait_on_buffer(bh);
-		if (likely(buffer_uptodate(bh))) {
-			folio = bh->b_folio;
-			bh_pos = folio_pos(folio) + bh_offset(bh);
-			/*
-			 * If the buffer overflows the initialized size, need
-			 * to zero the overflowing region.
-			 */
-			if (unlikely(bh_pos + blocksize > initialized_size)) {
-				int ofs = 0;
-
-				if (likely(bh_pos < initialized_size))
-					ofs = initialized_size - bh_pos;
-				folio_zero_segment(folio, bh_offset(bh) + ofs,
-						blocksize);
-			}
-		} else /* if (unlikely(!buffer_uptodate(bh))) */
-			err = -EIO;
-	}
-	if (likely(!err)) {
-		/* Clear buffer_new on all buffers. */
-		u = 0;
-		do {
-			bh = head = page_buffers(pages[u]);
-			do {
-				if (buffer_new(bh))
-					clear_buffer_new(bh);
-			} while ((bh = bh->b_this_page) != head);
-		} while (++u < nr_pages);
-		ntfs_debug("Done.");
-		return err;
-	}
-	if (status.attr_switched) {
-		/* Get back to the attribute extent we modified. */
-		ntfs_attr_reinit_search_ctx(ctx);
-		if (ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-				CASE_SENSITIVE, bh_cpos, NULL, 0, ctx)) {
-			ntfs_error(vol->sb, "Failed to find required "
-					"attribute extent of attribute in "
-					"error code path.  Run chkdsk to "
-					"recover.");
-			write_lock_irqsave(&ni->size_lock, flags);
-			ni->itype.compressed.size += vol->cluster_size;
-			write_unlock_irqrestore(&ni->size_lock, flags);
-			flush_dcache_mft_record_page(ctx->ntfs_ino);
-			mark_mft_record_dirty(ctx->ntfs_ino);
-			/*
-			 * The only thing that is now wrong is the compressed
-			 * size of the base attribute extent which chkdsk
-			 * should be able to fix.
-			 */
-			NVolSetErrors(vol);
-		} else {
-			m = ctx->mrec;
-			a = ctx->attr;
-			status.attr_switched = 0;
-		}
-	}
-	/*
-	 * If the runlist has been modified, need to restore it by punching a
-	 * hole into it and we then need to deallocate the on-disk cluster as
-	 * well.  Note, we only modify the runlist if we are able to generate a
-	 * new mapping pairs array, i.e. only when the mapped attribute extent
-	 * is not switched.
-	 */
-	if (status.runlist_merged && !status.attr_switched) {
-		BUG_ON(!rl_write_locked);
-		/* Make the file cluster we allocated sparse in the runlist. */
-		if (ntfs_rl_punch_nolock(vol, &ni->runlist, bh_cpos, 1)) {
-			ntfs_error(vol->sb, "Failed to punch hole into "
-					"attribute runlist in error code "
-					"path.  Run chkdsk to recover the "
-					"lost cluster.");
-			NVolSetErrors(vol);
-		} else /* if (success) */ {
-			status.runlist_merged = 0;
-			/*
-			 * Deallocate the on-disk cluster we allocated but only
-			 * if we succeeded in punching its vcn out of the
-			 * runlist.
-			 */
-			down_write(&vol->lcnbmp_lock);
-			if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
-				ntfs_error(vol->sb, "Failed to release "
-						"allocated cluster in error "
-						"code path.  Run chkdsk to "
-						"recover the lost cluster.");
-				NVolSetErrors(vol);
-			}
-			up_write(&vol->lcnbmp_lock);
-		}
-	}
-	/*
-	 * Resize the attribute record to its old size and rebuild the mapping
-	 * pairs array.  Note, we only can do this if the runlist has been
-	 * restored to its old state which also implies that the mapped
-	 * attribute extent is not switched.
-	 */
-	if (status.mp_rebuilt && !status.runlist_merged) {
-		if (ntfs_attr_record_resize(m, a, attr_rec_len)) {
-			ntfs_error(vol->sb, "Failed to restore attribute "
-					"record in error code path.  Run "
-					"chkdsk to recover.");
-			NVolSetErrors(vol);
-		} else /* if (success) */ {
-			if (ntfs_mapping_pairs_build(vol, (u8*)a +
-					le16_to_cpu(a->data.non_resident.
-					mapping_pairs_offset), attr_rec_len -
-					le16_to_cpu(a->data.non_resident.
-					mapping_pairs_offset), ni->runlist.rl,
-					vcn, highest_vcn, NULL)) {
-				ntfs_error(vol->sb, "Failed to restore "
-						"mapping pairs array in error "
-						"code path.  Run chkdsk to "
-						"recover.");
-				NVolSetErrors(vol);
-			}
-			flush_dcache_mft_record_page(ctx->ntfs_ino);
-			mark_mft_record_dirty(ctx->ntfs_ino);
-		}
-	}
-	/* Release the mft record and the attribute. */
-	if (status.mft_attr_mapped) {
-		ntfs_attr_put_search_ctx(ctx);
-		unmap_mft_record(base_ni);
-	}
-	/* Release the runlist lock. */
-	if (rl_write_locked)
-		up_write(&ni->runlist.lock);
-	else if (rl)
-		up_read(&ni->runlist.lock);
-	/*
-	 * Zero out any newly allocated blocks to avoid exposing stale data.
-	 * If BH_New is set, we know that the block was newly allocated above
-	 * and that it has not been fully zeroed and marked dirty yet.
-	 */
-	nr_pages = u;
-	u = 0;
-	end = bh_cpos << vol->cluster_size_bits;
-	do {
-		folio = page_folio(pages[u]);
-		bh = head = folio_buffers(folio);
-		do {
-			if (u == nr_pages &&
-			    folio_pos(folio) + bh_offset(bh) >= end)
-				break;
-			if (!buffer_new(bh))
-				continue;
-			clear_buffer_new(bh);
-			if (!buffer_uptodate(bh)) {
-				if (folio_test_uptodate(folio))
-					set_buffer_uptodate(bh);
-				else {
-					folio_zero_range(folio, bh_offset(bh),
-							blocksize);
-					set_buffer_uptodate(bh);
-				}
-			}
-			mark_buffer_dirty(bh);
-		} while ((bh = bh->b_this_page) != head);
-	} while (++u <= nr_pages);
-	ntfs_error(vol->sb, "Failed.  Returning error code %i.", err);
-	return err;
-}
-
-static inline void ntfs_flush_dcache_pages(struct page **pages,
-		unsigned nr_pages)
-{
-	BUG_ON(!nr_pages);
-	/*
-	 * Warning: Do not do the decrement at the same time as the call to
-	 * flush_dcache_page() because it is a NULL macro on i386 and hence the
-	 * decrement never happens so the loop never terminates.
-	 */
-	do {
-		--nr_pages;
-		flush_dcache_page(pages[nr_pages]);
-	} while (nr_pages > 0);
-}
-
-/**
- * ntfs_commit_pages_after_non_resident_write - commit the received data
- * @pages:	array of destination pages
- * @nr_pages:	number of pages in @pages
- * @pos:	byte position in file at which the write begins
- * @bytes:	number of bytes to be written
- *
- * See description of ntfs_commit_pages_after_write(), below.
- */
-static inline int ntfs_commit_pages_after_non_resident_write(
-		struct page **pages, const unsigned nr_pages,
-		s64 pos, size_t bytes)
-{
-	s64 end, initialized_size;
-	struct inode *vi;
-	ntfs_inode *ni, *base_ni;
-	struct buffer_head *bh, *head;
-	ntfs_attr_search_ctx *ctx;
-	MFT_RECORD *m;
-	ATTR_RECORD *a;
-	unsigned long flags;
-	unsigned blocksize, u;
-	int err;
-
-	vi = pages[0]->mapping->host;
-	ni = NTFS_I(vi);
-	blocksize = vi->i_sb->s_blocksize;
-	end = pos + bytes;
-	u = 0;
-	do {
-		s64 bh_pos;
-		struct page *page;
-		bool partial;
-
-		page = pages[u];
-		bh_pos = (s64)page->index << PAGE_SHIFT;
-		bh = head = page_buffers(page);
-		partial = false;
-		do {
-			s64 bh_end;
-
-			bh_end = bh_pos + blocksize;
-			if (bh_end <= pos || bh_pos >= end) {
-				if (!buffer_uptodate(bh))
-					partial = true;
-			} else {
-				set_buffer_uptodate(bh);
-				mark_buffer_dirty(bh);
-			}
-		} while (bh_pos += blocksize, (bh = bh->b_this_page) != head);
-		/*
-		 * If all buffers are now uptodate but the page is not, set the
-		 * page uptodate.
-		 */
-		if (!partial && !PageUptodate(page))
-			SetPageUptodate(page);
-	} while (++u < nr_pages);
-	/*
-	 * Finally, if we do not need to update initialized_size or i_size we
-	 * are finished.
-	 */
-	read_lock_irqsave(&ni->size_lock, flags);
-	initialized_size = ni->initialized_size;
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	if (end <= initialized_size) {
-		ntfs_debug("Done.");
-		return 0;
-	}
-	/*
-	 * Update initialized_size/i_size as appropriate, both in the inode and
-	 * the mft record.
-	 */
-	if (!NInoAttr(ni))
-		base_ni = ni;
-	else
-		base_ni = ni->ext.base_ntfs_ino;
-	/* Map, pin, and lock the mft record. */
-	m = map_mft_record(base_ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		m = NULL;
-		ctx = NULL;
-		goto err_out;
-	}
-	BUG_ON(!NInoNonResident(ni));
-	ctx = ntfs_attr_get_search_ctx(base_ni, m);
-	if (unlikely(!ctx)) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT)
-			err = -EIO;
-		goto err_out;
-	}
-	a = ctx->attr;
-	BUG_ON(!a->non_resident);
-	write_lock_irqsave(&ni->size_lock, flags);
-	BUG_ON(end > ni->allocated_size);
-	ni->initialized_size = end;
-	a->data.non_resident.initialized_size = cpu_to_sle64(end);
-	if (end > i_size_read(vi)) {
-		i_size_write(vi, end);
-		a->data.non_resident.data_size =
-				a->data.non_resident.initialized_size;
-	}
-	write_unlock_irqrestore(&ni->size_lock, flags);
-	/* Mark the mft record dirty, so it gets written back. */
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(base_ni);
-	ntfs_debug("Done.");
-	return 0;
-err_out:
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(base_ni);
-	ntfs_error(vi->i_sb, "Failed to update initialized_size/i_size (error "
-			"code %i).", err);
-	if (err != -ENOMEM)
-		NVolSetErrors(ni->vol);
-	return err;
-}
-
-/**
- * ntfs_commit_pages_after_write - commit the received data
- * @pages:	array of destination pages
- * @nr_pages:	number of pages in @pages
- * @pos:	byte position in file at which the write begins
- * @bytes:	number of bytes to be written
- *
- * This is called from ntfs_file_buffered_write() with i_mutex held on the inode
- * (@pages[0]->mapping->host).  There are @nr_pages pages in @pages which are
- * locked but not kmap()ped.  The source data has already been copied into the
- * @page.  ntfs_prepare_pages_for_non_resident_write() has been called before
- * the data was copied (for non-resident attributes only) and it returned
- * success.
- *
- * Need to set uptodate and mark dirty all buffers within the boundary of the
- * write.  If all buffers in a page are uptodate we set the page uptodate, too.
- *
- * Setting the buffers dirty ensures that they get written out later when
- * ntfs_writepage() is invoked by the VM.
- *
- * Finally, we need to update i_size and initialized_size as appropriate both
- * in the inode and the mft record.
- *
- * This is modelled after fs/buffer.c::generic_commit_write(), which marks
- * buffers uptodate and dirty, sets the page uptodate if all buffers in the
- * page are uptodate, and updates i_size if the end of io is beyond i_size.  In
- * that case, it also marks the inode dirty.
- *
- * If things have gone as outlined in
- * ntfs_prepare_pages_for_non_resident_write(), we do not need to do any page
- * content modifications here for non-resident attributes.  For resident
- * attributes we need to do the uptodate bringing here which we combine with
- * the copying into the mft record which means we save one atomic kmap.
- *
- * Return 0 on success or -errno on error.
- */
-static int ntfs_commit_pages_after_write(struct page **pages,
-		const unsigned nr_pages, s64 pos, size_t bytes)
-{
-	s64 end, initialized_size;
-	loff_t i_size;
-	struct inode *vi;
-	ntfs_inode *ni, *base_ni;
-	struct page *page;
-	ntfs_attr_search_ctx *ctx;
-	MFT_RECORD *m;
-	ATTR_RECORD *a;
-	char *kattr, *kaddr;
-	unsigned long flags;
-	u32 attr_len;
-	int err;
-
-	BUG_ON(!nr_pages);
-	BUG_ON(!pages);
-	page = pages[0];
-	BUG_ON(!page);
-	vi = page->mapping->host;
-	ni = NTFS_I(vi);
-	ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, start page "
-			"index 0x%lx, nr_pages 0x%x, pos 0x%llx, bytes 0x%zx.",
-			vi->i_ino, ni->type, page->index, nr_pages,
-			(long long)pos, bytes);
-	if (NInoNonResident(ni))
-		return ntfs_commit_pages_after_non_resident_write(pages,
-				nr_pages, pos, bytes);
-	BUG_ON(nr_pages > 1);
-	/*
-	 * Attribute is resident, implying it is not compressed, encrypted, or
-	 * sparse.
-	 */
-	if (!NInoAttr(ni))
-		base_ni = ni;
-	else
-		base_ni = ni->ext.base_ntfs_ino;
-	BUG_ON(NInoNonResident(ni));
-	/* Map, pin, and lock the mft record. */
-	m = map_mft_record(base_ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		m = NULL;
-		ctx = NULL;
-		goto err_out;
-	}
-	ctx = ntfs_attr_get_search_ctx(base_ni, m);
-	if (unlikely(!ctx)) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT)
-			err = -EIO;
-		goto err_out;
-	}
-	a = ctx->attr;
-	BUG_ON(a->non_resident);
-	/* The total length of the attribute value. */
-	attr_len = le32_to_cpu(a->data.resident.value_length);
-	i_size = i_size_read(vi);
-	BUG_ON(attr_len != i_size);
-	BUG_ON(pos > attr_len);
-	end = pos + bytes;
-	BUG_ON(end > le32_to_cpu(a->length) -
-			le16_to_cpu(a->data.resident.value_offset));
-	kattr = (u8*)a + le16_to_cpu(a->data.resident.value_offset);
-	kaddr = kmap_atomic(page);
-	/* Copy the received data from the page to the mft record. */
-	memcpy(kattr + pos, kaddr + pos, bytes);
-	/* Update the attribute length if necessary. */
-	if (end > attr_len) {
-		attr_len = end;
-		a->data.resident.value_length = cpu_to_le32(attr_len);
-	}
-	/*
-	 * If the page is not uptodate, bring the out of bounds area(s)
-	 * uptodate by copying data from the mft record to the page.
-	 */
-	if (!PageUptodate(page)) {
-		if (pos > 0)
-			memcpy(kaddr, kattr, pos);
-		if (end < attr_len)
-			memcpy(kaddr + end, kattr + end, attr_len - end);
-		/* Zero the region outside the end of the attribute value. */
-		memset(kaddr + attr_len, 0, PAGE_SIZE - attr_len);
-		flush_dcache_page(page);
-		SetPageUptodate(page);
-	}
-	kunmap_atomic(kaddr);
-	/* Update initialized_size/i_size if necessary. */
-	read_lock_irqsave(&ni->size_lock, flags);
-	initialized_size = ni->initialized_size;
-	BUG_ON(end > ni->allocated_size);
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	BUG_ON(initialized_size != i_size);
-	if (end > initialized_size) {
-		write_lock_irqsave(&ni->size_lock, flags);
-		ni->initialized_size = end;
-		i_size_write(vi, end);
-		write_unlock_irqrestore(&ni->size_lock, flags);
-	}
-	/* Mark the mft record dirty, so it gets written back. */
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(base_ni);
-	ntfs_debug("Done.");
-	return 0;
-err_out:
-	if (err == -ENOMEM) {
-		ntfs_warning(vi->i_sb, "Error allocating memory required to "
-				"commit the write.");
-		if (PageUptodate(page)) {
-			ntfs_warning(vi->i_sb, "Page is uptodate, setting "
-					"dirty so the write will be retried "
-					"later on by the VM.");
-			/*
-			 * Put the page on mapping->dirty_pages, but leave its
-			 * buffers' dirty state as-is.
-			 */
-			__set_page_dirty_nobuffers(page);
-			err = 0;
-		} else
-			ntfs_error(vi->i_sb, "Page is not uptodate.  Written "
-					"data has been lost.");
-	} else {
-		ntfs_error(vi->i_sb, "Resident attribute commit write failed "
-				"with error %i.", err);
-		NVolSetErrors(ni->vol);
-	}
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(base_ni);
-	return err;
-}
-
-/*
- * Copy as much as we can into the pages and return the number of bytes which
- * were successfully copied.  If a fault is encountered then clear the pages
- * out to (ofs + bytes) and return the number of bytes which were copied.
- */
-static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages,
-		unsigned ofs, struct iov_iter *i, size_t bytes)
-{
-	struct page **last_page = pages + nr_pages;
-	size_t total = 0;
-	unsigned len, copied;
-
-	do {
-		len = PAGE_SIZE - ofs;
-		if (len > bytes)
-			len = bytes;
-		copied = copy_page_from_iter_atomic(*pages, ofs, len, i);
-		total += copied;
-		bytes -= copied;
-		if (!bytes)
-			break;
-		if (copied < len)
-			goto err;
-		ofs = 0;
-	} while (++pages < last_page);
-out:
-	return total;
-err:
-	/* Zero the rest of the target like __copy_from_user(). */
-	len = PAGE_SIZE - copied;
-	do {
-		if (len > bytes)
-			len = bytes;
-		zero_user(*pages, copied, len);
-		bytes -= len;
-		copied = 0;
-		len = PAGE_SIZE;
-	} while (++pages < last_page);
-	goto out;
-}
-
-/**
- * ntfs_perform_write - perform buffered write to a file
- * @file:	file to write to
- * @i:		iov_iter with data to write
- * @pos:	byte offset in file at which to begin writing to
- */
-static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i,
-		loff_t pos)
-{
-	struct address_space *mapping = file->f_mapping;
-	struct inode *vi = mapping->host;
-	ntfs_inode *ni = NTFS_I(vi);
-	ntfs_volume *vol = ni->vol;
-	struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER];
-	struct page *cached_page = NULL;
-	VCN last_vcn;
-	LCN lcn;
-	size_t bytes;
-	ssize_t status, written = 0;
-	unsigned nr_pages;
-
-	ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos "
-			"0x%llx, count 0x%lx.", vi->i_ino,
-			(unsigned)le32_to_cpu(ni->type),
-			(unsigned long long)pos,
-			(unsigned long)iov_iter_count(i));
-	/*
-	 * If a previous ntfs_truncate() failed, repeat it and abort if it
-	 * fails again.
-	 */
-	if (unlikely(NInoTruncateFailed(ni))) {
-		int err;
-
-		inode_dio_wait(vi);
-		err = ntfs_truncate(vi);
-		if (err || NInoTruncateFailed(ni)) {
-			if (!err)
-				err = -EIO;
-			ntfs_error(vol->sb, "Cannot perform write to inode "
-					"0x%lx, attribute type 0x%x, because "
-					"ntfs_truncate() failed (error code "
-					"%i).", vi->i_ino,
-					(unsigned)le32_to_cpu(ni->type), err);
-			return err;
-		}
-	}
-	/*
-	 * Determine the number of pages per cluster for non-resident
-	 * attributes.
-	 */
-	nr_pages = 1;
-	if (vol->cluster_size > PAGE_SIZE && NInoNonResident(ni))
-		nr_pages = vol->cluster_size >> PAGE_SHIFT;
-	last_vcn = -1;
-	do {
-		VCN vcn;
-		pgoff_t start_idx;
-		unsigned ofs, do_pages, u;
-		size_t copied;
-
-		start_idx = pos >> PAGE_SHIFT;
-		ofs = pos & ~PAGE_MASK;
-		bytes = PAGE_SIZE - ofs;
-		do_pages = 1;
-		if (nr_pages > 1) {
-			vcn = pos >> vol->cluster_size_bits;
-			if (vcn != last_vcn) {
-				last_vcn = vcn;
-				/*
-				 * Get the lcn of the vcn the write is in.  If
-				 * it is a hole, need to lock down all pages in
-				 * the cluster.
-				 */
-				down_read(&ni->runlist.lock);
-				lcn = ntfs_attr_vcn_to_lcn_nolock(ni, pos >>
-						vol->cluster_size_bits, false);
-				up_read(&ni->runlist.lock);
-				if (unlikely(lcn < LCN_HOLE)) {
-					if (lcn == LCN_ENOMEM)
-						status = -ENOMEM;
-					else {
-						status = -EIO;
-						ntfs_error(vol->sb, "Cannot "
-							"perform write to "
-							"inode 0x%lx, "
-							"attribute type 0x%x, "
-							"because the attribute "
-							"is corrupt.",
-							vi->i_ino, (unsigned)
-							le32_to_cpu(ni->type));
-					}
-					break;
-				}
-				if (lcn == LCN_HOLE) {
-					start_idx = (pos & ~(s64)
-							vol->cluster_size_mask)
-							>> PAGE_SHIFT;
-					bytes = vol->cluster_size - (pos &
-							vol->cluster_size_mask);
-					do_pages = nr_pages;
-				}
-			}
-		}
-		if (bytes > iov_iter_count(i))
-			bytes = iov_iter_count(i);
-again:
-		/*
-		 * Bring in the user page(s) that we will copy from _first_.
-		 * Otherwise there is a nasty deadlock on copying from the same
-		 * page(s) as we are writing to, without it/them being marked
-		 * up-to-date.  Note, at present there is nothing to stop the
-		 * pages being swapped out between us bringing them into memory
-		 * and doing the actual copying.
-		 */
-		if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
-			status = -EFAULT;
-			break;
-		}
-		/* Get and lock @do_pages starting at index @start_idx. */
-		status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
-				pages, &cached_page);
-		if (unlikely(status))
-			break;
-		/*
-		 * For non-resident attributes, we need to fill any holes with
-		 * actual clusters and ensure all bufferes are mapped.  We also
-		 * need to bring uptodate any buffers that are only partially
-		 * being written to.
-		 */
-		if (NInoNonResident(ni)) {
-			status = ntfs_prepare_pages_for_non_resident_write(
-					pages, do_pages, pos, bytes);
-			if (unlikely(status)) {
-				do {
-					unlock_page(pages[--do_pages]);
-					put_page(pages[do_pages]);
-				} while (do_pages);
-				break;
-			}
-		}
-		u = (pos >> PAGE_SHIFT) - pages[0]->index;
-		copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs,
-					i, bytes);
-		ntfs_flush_dcache_pages(pages + u, do_pages - u);
-		status = 0;
-		if (likely(copied == bytes)) {
-			status = ntfs_commit_pages_after_write(pages, do_pages,
-					pos, bytes);
-		}
-		do {
-			unlock_page(pages[--do_pages]);
-			put_page(pages[do_pages]);
-		} while (do_pages);
-		if (unlikely(status < 0)) {
-			iov_iter_revert(i, copied);
-			break;
-		}
-		cond_resched();
-		if (unlikely(copied < bytes)) {
-			iov_iter_revert(i, copied);
-			if (copied)
-				bytes = copied;
-			else if (bytes > PAGE_SIZE - ofs)
-				bytes = PAGE_SIZE - ofs;
-			goto again;
-		}
-		pos += copied;
-		written += copied;
-		balance_dirty_pages_ratelimited(mapping);
-		if (fatal_signal_pending(current)) {
-			status = -EINTR;
-			break;
-		}
-	} while (iov_iter_count(i));
-	if (cached_page)
-		put_page(cached_page);
-	ntfs_debug("Done.  Returning %s (written 0x%lx, status %li).",
-			written ? "written" : "status", (unsigned long)written,
-			(long)status);
-	return written ? written : status;
-}
-
-/**
- * ntfs_file_write_iter - simple wrapper for ntfs_file_write_iter_nolock()
- * @iocb:	IO state structure
- * @from:	iov_iter with data to write
- *
- * Basically the same as generic_file_write_iter() except that it ends up
- * up calling ntfs_perform_write() instead of generic_perform_write() and that
- * O_DIRECT is not implemented.
- */
-static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
-{
-	struct file *file = iocb->ki_filp;
-	struct inode *vi = file_inode(file);
-	ssize_t written = 0;
-	ssize_t err;
-
-	inode_lock(vi);
-	/* We can write back this queue in page reclaim. */
-	err = ntfs_prepare_file_for_write(iocb, from);
-	if (iov_iter_count(from) && !err)
-		written = ntfs_perform_write(file, from, iocb->ki_pos);
-	inode_unlock(vi);
-	iocb->ki_pos += written;
-	if (likely(written > 0))
-		written = generic_write_sync(iocb, written);
-	return written ? written : err;
-}
-
-/**
- * ntfs_file_fsync - sync a file to disk
- * @filp:	file to be synced
- * @datasync:	if non-zero only flush user data and not metadata
- *
- * Data integrity sync of a file to disk.  Used for fsync, fdatasync, and msync
- * system calls.  This function is inspired by fs/buffer.c::file_fsync().
- *
- * If @datasync is false, write the mft record and all associated extent mft
- * records as well as the $DATA attribute and then sync the block device.
- *
- * If @datasync is true and the attribute is non-resident, we skip the writing
- * of the mft record and all associated extent mft records (this might still
- * happen due to the write_inode_now() call).
- *
- * Also, if @datasync is true, we do not wait on the inode to be written out
- * but we always wait on the page cache pages to be written out.
- *
- * Locking: Caller must hold i_mutex on the inode.
- *
- * TODO: We should probably also write all attribute/index inodes associated
- * with this inode but since we have no simple way of getting to them we ignore
- * this problem for now.
- */
-static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
-			   int datasync)
-{
-	struct inode *vi = filp->f_mapping->host;
-	int err, ret = 0;
-
-	ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
-
-	err = file_write_and_wait_range(filp, start, end);
-	if (err)
-		return err;
-	inode_lock(vi);
-
-	BUG_ON(S_ISDIR(vi->i_mode));
-	if (!datasync || !NInoNonResident(NTFS_I(vi)))
-		ret = __ntfs_write_inode(vi, 1);
-	write_inode_now(vi, !datasync);
-	/*
-	 * NOTE: If we were to use mapping->private_list (see ext2 and
-	 * fs/buffer.c) for dirty blocks then we could optimize the below to be
-	 * sync_mapping_buffers(vi->i_mapping).
-	 */
-	err = sync_blockdev(vi->i_sb->s_bdev);
-	if (unlikely(err && !ret))
-		ret = err;
-	if (likely(!ret))
-		ntfs_debug("Done.");
-	else
-		ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx.  Error "
-				"%u.", datasync ? "data" : "", vi->i_ino, -ret);
-	inode_unlock(vi);
-	return ret;
-}
-
-#endif /* NTFS_RW */
-
-const struct file_operations ntfs_file_ops = {
-	.llseek		= generic_file_llseek,
-	.read_iter	= generic_file_read_iter,
-#ifdef NTFS_RW
-	.write_iter	= ntfs_file_write_iter,
-	.fsync		= ntfs_file_fsync,
-#endif /* NTFS_RW */
-	.mmap		= generic_file_mmap,
-	.open		= ntfs_file_open,
-	.splice_read	= filemap_splice_read,
-};
-
-const struct inode_operations ntfs_file_inode_ops = {
-#ifdef NTFS_RW
-	.setattr	= ntfs_setattr,
-#endif /* NTFS_RW */
-};
-
-const struct file_operations ntfs_empty_file_ops = {};
-
-const struct inode_operations ntfs_empty_inode_ops = {};
diff --git a/fs/ntfs/index.c b/fs/ntfs/index.c
deleted file mode 100644
index d46c2c03a032..000000000000
--- a/fs/ntfs/index.c
+++ /dev/null
@@ -1,440 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * index.c - NTFS kernel index handling.  Part of the Linux-NTFS project.
- *
- * Copyright (c) 2004-2005 Anton Altaparmakov
- */
-
-#include <linux/slab.h>
-
-#include "aops.h"
-#include "collate.h"
-#include "debug.h"
-#include "index.h"
-#include "ntfs.h"
-
-/**
- * ntfs_index_ctx_get - allocate and initialize a new index context
- * @idx_ni:	ntfs index inode with which to initialize the context
- *
- * Allocate a new index context, initialize it with @idx_ni and return it.
- * Return NULL if allocation failed.
- *
- * Locking:  Caller must hold i_mutex on the index inode.
- */
-ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni)
-{
-	ntfs_index_context *ictx;
-
-	ictx = kmem_cache_alloc(ntfs_index_ctx_cache, GFP_NOFS);
-	if (ictx)
-		*ictx = (ntfs_index_context){ .idx_ni = idx_ni };
-	return ictx;
-}
-
-/**
- * ntfs_index_ctx_put - release an index context
- * @ictx:	index context to free
- *
- * Release the index context @ictx, releasing all associated resources.
- *
- * Locking:  Caller must hold i_mutex on the index inode.
- */
-void ntfs_index_ctx_put(ntfs_index_context *ictx)
-{
-	if (ictx->entry) {
-		if (ictx->is_in_root) {
-			if (ictx->actx)
-				ntfs_attr_put_search_ctx(ictx->actx);
-			if (ictx->base_ni)
-				unmap_mft_record(ictx->base_ni);
-		} else {
-			struct page *page = ictx->page;
-			if (page) {
-				BUG_ON(!PageLocked(page));
-				unlock_page(page);
-				ntfs_unmap_page(page);
-			}
-		}
-	}
-	kmem_cache_free(ntfs_index_ctx_cache, ictx);
-	return;
-}
-
-/**
- * ntfs_index_lookup - find a key in an index and return its index entry
- * @key:	[IN] key for which to search in the index
- * @key_len:	[IN] length of @key in bytes
- * @ictx:	[IN/OUT] context describing the index and the returned entry
- *
- * Before calling ntfs_index_lookup(), @ictx must have been obtained from a
- * call to ntfs_index_ctx_get().
- *
- * Look for the @key in the index specified by the index lookup context @ictx.
- * ntfs_index_lookup() walks the contents of the index looking for the @key.
- *
- * If the @key is found in the index, 0 is returned and @ictx is setup to
- * describe the index entry containing the matching @key.  @ictx->entry is the
- * index entry and @ictx->data and @ictx->data_len are the index entry data and
- * its length in bytes, respectively.
- *
- * If the @key is not found in the index, -ENOENT is returned and @ictx is
- * setup to describe the index entry whose key collates immediately after the
- * search @key, i.e. this is the position in the index at which an index entry
- * with a key of @key would need to be inserted.
- *
- * If an error occurs return the negative error code and @ictx is left
- * untouched.
- *
- * When finished with the entry and its data, call ntfs_index_ctx_put() to free
- * the context and other associated resources.
- *
- * If the index entry was modified, call flush_dcache_index_entry_page()
- * immediately after the modification and either ntfs_index_entry_mark_dirty()
- * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to
- * ensure that the changes are written to disk.
- *
- * Locking:  - Caller must hold i_mutex on the index inode.
- *	     - Each page cache page in the index allocation mapping must be
- *	       locked whilst being accessed otherwise we may find a corrupt
- *	       page due to it being under ->writepage at the moment which
- *	       applies the mst protection fixups before writing out and then
- *	       removes them again after the write is complete after which it 
- *	       unlocks the page.
- */
-int ntfs_index_lookup(const void *key, const int key_len,
-		ntfs_index_context *ictx)
-{
-	VCN vcn, old_vcn;
-	ntfs_inode *idx_ni = ictx->idx_ni;
-	ntfs_volume *vol = idx_ni->vol;
-	struct super_block *sb = vol->sb;
-	ntfs_inode *base_ni = idx_ni->ext.base_ntfs_ino;
-	MFT_RECORD *m;
-	INDEX_ROOT *ir;
-	INDEX_ENTRY *ie;
-	INDEX_ALLOCATION *ia;
-	u8 *index_end, *kaddr;
-	ntfs_attr_search_ctx *actx;
-	struct address_space *ia_mapping;
-	struct page *page;
-	int rc, err = 0;
-
-	ntfs_debug("Entering.");
-	BUG_ON(!NInoAttr(idx_ni));
-	BUG_ON(idx_ni->type != AT_INDEX_ALLOCATION);
-	BUG_ON(idx_ni->nr_extents != -1);
-	BUG_ON(!base_ni);
-	BUG_ON(!key);
-	BUG_ON(key_len <= 0);
-	if (!ntfs_is_collation_rule_supported(
-			idx_ni->itype.index.collation_rule)) {
-		ntfs_error(sb, "Index uses unsupported collation rule 0x%x.  "
-				"Aborting lookup.", le32_to_cpu(
-				idx_ni->itype.index.collation_rule));
-		return -EOPNOTSUPP;
-	}
-	/* Get hold of the mft record for the index inode. */
-	m = map_mft_record(base_ni);
-	if (IS_ERR(m)) {
-		ntfs_error(sb, "map_mft_record() failed with error code %ld.",
-				-PTR_ERR(m));
-		return PTR_ERR(m);
-	}
-	actx = ntfs_attr_get_search_ctx(base_ni, m);
-	if (unlikely(!actx)) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-	/* Find the index root attribute in the mft record. */
-	err = ntfs_attr_lookup(AT_INDEX_ROOT, idx_ni->name, idx_ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, actx);
-	if (unlikely(err)) {
-		if (err == -ENOENT) {
-			ntfs_error(sb, "Index root attribute missing in inode "
-					"0x%lx.", idx_ni->mft_no);
-			err = -EIO;
-		}
-		goto err_out;
-	}
-	/* Get to the index root value (it has been verified in read_inode). */
-	ir = (INDEX_ROOT*)((u8*)actx->attr +
-			le16_to_cpu(actx->attr->data.resident.value_offset));
-	index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
-	/* The first index entry. */
-	ie = (INDEX_ENTRY*)((u8*)&ir->index +
-			le32_to_cpu(ir->index.entries_offset));
-	/*
-	 * Loop until we exceed valid memory (corruption case) or until we
-	 * reach the last entry.
-	 */
-	for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
-		/* Bounds checks. */
-		if ((u8*)ie < (u8*)actx->mrec || (u8*)ie +
-				sizeof(INDEX_ENTRY_HEADER) > index_end ||
-				(u8*)ie + le16_to_cpu(ie->length) > index_end)
-			goto idx_err_out;
-		/*
-		 * The last entry cannot contain a key.  It can however contain
-		 * a pointer to a child node in the B+tree so we just break out.
-		 */
-		if (ie->flags & INDEX_ENTRY_END)
-			break;
-		/* Further bounds checks. */
-		if ((u32)sizeof(INDEX_ENTRY_HEADER) +
-				le16_to_cpu(ie->key_length) >
-				le16_to_cpu(ie->data.vi.data_offset) ||
-				(u32)le16_to_cpu(ie->data.vi.data_offset) +
-				le16_to_cpu(ie->data.vi.data_length) >
-				le16_to_cpu(ie->length))
-			goto idx_err_out;
-		/* If the keys match perfectly, we setup @ictx and return 0. */
-		if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key,
-				&ie->key, key_len)) {
-ir_done:
-			ictx->is_in_root = true;
-			ictx->ir = ir;
-			ictx->actx = actx;
-			ictx->base_ni = base_ni;
-			ictx->ia = NULL;
-			ictx->page = NULL;
-done:
-			ictx->entry = ie;
-			ictx->data = (u8*)ie +
-					le16_to_cpu(ie->data.vi.data_offset);
-			ictx->data_len = le16_to_cpu(ie->data.vi.data_length);
-			ntfs_debug("Done.");
-			return err;
-		}
-		/*
-		 * Not a perfect match, need to do full blown collation so we
-		 * know which way in the B+tree we have to go.
-		 */
-		rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key,
-				key_len, &ie->key, le16_to_cpu(ie->key_length));
-		/*
-		 * If @key collates before the key of the current entry, there
-		 * is definitely no such key in this index but we might need to
-		 * descend into the B+tree so we just break out of the loop.
-		 */
-		if (rc == -1)
-			break;
-		/*
-		 * A match should never happen as the memcmp() call should have
-		 * cought it, but we still treat it correctly.
-		 */
-		if (!rc)
-			goto ir_done;
-		/* The keys are not equal, continue the search. */
-	}
-	/*
-	 * We have finished with this index without success.  Check for the
-	 * presence of a child node and if not present setup @ictx and return
-	 * -ENOENT.
-	 */
-	if (!(ie->flags & INDEX_ENTRY_NODE)) {
-		ntfs_debug("Entry not found.");
-		err = -ENOENT;
-		goto ir_done;
-	} /* Child node present, descend into it. */
-	/* Consistency check: Verify that an index allocation exists. */
-	if (!NInoIndexAllocPresent(idx_ni)) {
-		ntfs_error(sb, "No index allocation attribute but index entry "
-				"requires one.  Inode 0x%lx is corrupt or "
-				"driver bug.", idx_ni->mft_no);
-		goto err_out;
-	}
-	/* Get the starting vcn of the index_block holding the child node. */
-	vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8));
-	ia_mapping = VFS_I(idx_ni)->i_mapping;
-	/*
-	 * We are done with the index root and the mft record.  Release them,
-	 * otherwise we deadlock with ntfs_map_page().
-	 */
-	ntfs_attr_put_search_ctx(actx);
-	unmap_mft_record(base_ni);
-	m = NULL;
-	actx = NULL;
-descend_into_child_node:
-	/*
-	 * Convert vcn to index into the index allocation attribute in units
-	 * of PAGE_SIZE and map the page cache page, reading it from
-	 * disk if necessary.
-	 */
-	page = ntfs_map_page(ia_mapping, vcn <<
-			idx_ni->itype.index.vcn_size_bits >> PAGE_SHIFT);
-	if (IS_ERR(page)) {
-		ntfs_error(sb, "Failed to map index page, error %ld.",
-				-PTR_ERR(page));
-		err = PTR_ERR(page);
-		goto err_out;
-	}
-	lock_page(page);
-	kaddr = (u8*)page_address(page);
-fast_descend_into_child_node:
-	/* Get to the index allocation block. */
-	ia = (INDEX_ALLOCATION*)(kaddr + ((vcn <<
-			idx_ni->itype.index.vcn_size_bits) & ~PAGE_MASK));
-	/* Bounds checks. */
-	if ((u8*)ia < kaddr || (u8*)ia > kaddr + PAGE_SIZE) {
-		ntfs_error(sb, "Out of bounds check failed.  Corrupt inode "
-				"0x%lx or driver bug.", idx_ni->mft_no);
-		goto unm_err_out;
-	}
-	/* Catch multi sector transfer fixup errors. */
-	if (unlikely(!ntfs_is_indx_record(ia->magic))) {
-		ntfs_error(sb, "Index record with vcn 0x%llx is corrupt.  "
-				"Corrupt inode 0x%lx.  Run chkdsk.",
-				(long long)vcn, idx_ni->mft_no);
-		goto unm_err_out;
-	}
-	if (sle64_to_cpu(ia->index_block_vcn) != vcn) {
-		ntfs_error(sb, "Actual VCN (0x%llx) of index buffer is "
-				"different from expected VCN (0x%llx).  Inode "
-				"0x%lx is corrupt or driver bug.",
-				(unsigned long long)
-				sle64_to_cpu(ia->index_block_vcn),
-				(unsigned long long)vcn, idx_ni->mft_no);
-		goto unm_err_out;
-	}
-	if (le32_to_cpu(ia->index.allocated_size) + 0x18 !=
-			idx_ni->itype.index.block_size) {
-		ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx has "
-				"a size (%u) differing from the index "
-				"specified size (%u).  Inode is corrupt or "
-				"driver bug.", (unsigned long long)vcn,
-				idx_ni->mft_no,
-				le32_to_cpu(ia->index.allocated_size) + 0x18,
-				idx_ni->itype.index.block_size);
-		goto unm_err_out;
-	}
-	index_end = (u8*)ia + idx_ni->itype.index.block_size;
-	if (index_end > kaddr + PAGE_SIZE) {
-		ntfs_error(sb, "Index buffer (VCN 0x%llx) of inode 0x%lx "
-				"crosses page boundary.  Impossible!  Cannot "
-				"access!  This is probably a bug in the "
-				"driver.", (unsigned long long)vcn,
-				idx_ni->mft_no);
-		goto unm_err_out;
-	}
-	index_end = (u8*)&ia->index + le32_to_cpu(ia->index.index_length);
-	if (index_end > (u8*)ia + idx_ni->itype.index.block_size) {
-		ntfs_error(sb, "Size of index buffer (VCN 0x%llx) of inode "
-				"0x%lx exceeds maximum size.",
-				(unsigned long long)vcn, idx_ni->mft_no);
-		goto unm_err_out;
-	}
-	/* The first index entry. */
-	ie = (INDEX_ENTRY*)((u8*)&ia->index +
-			le32_to_cpu(ia->index.entries_offset));
-	/*
-	 * Iterate similar to above big loop but applied to index buffer, thus
-	 * loop until we exceed valid memory (corruption case) or until we
-	 * reach the last entry.
-	 */
-	for (;; ie = (INDEX_ENTRY*)((u8*)ie + le16_to_cpu(ie->length))) {
-		/* Bounds checks. */
-		if ((u8*)ie < (u8*)ia || (u8*)ie +
-				sizeof(INDEX_ENTRY_HEADER) > index_end ||
-				(u8*)ie + le16_to_cpu(ie->length) > index_end) {
-			ntfs_error(sb, "Index entry out of bounds in inode "
-					"0x%lx.", idx_ni->mft_no);
-			goto unm_err_out;
-		}
-		/*
-		 * The last entry cannot contain a key.  It can however contain
-		 * a pointer to a child node in the B+tree so we just break out.
-		 */
-		if (ie->flags & INDEX_ENTRY_END)
-			break;
-		/* Further bounds checks. */
-		if ((u32)sizeof(INDEX_ENTRY_HEADER) +
-				le16_to_cpu(ie->key_length) >
-				le16_to_cpu(ie->data.vi.data_offset) ||
-				(u32)le16_to_cpu(ie->data.vi.data_offset) +
-				le16_to_cpu(ie->data.vi.data_length) >
-				le16_to_cpu(ie->length)) {
-			ntfs_error(sb, "Index entry out of bounds in inode "
-					"0x%lx.", idx_ni->mft_no);
-			goto unm_err_out;
-		}
-		/* If the keys match perfectly, we setup @ictx and return 0. */
-		if ((key_len == le16_to_cpu(ie->key_length)) && !memcmp(key,
-				&ie->key, key_len)) {
-ia_done:
-			ictx->is_in_root = false;
-			ictx->actx = NULL;
-			ictx->base_ni = NULL;
-			ictx->ia = ia;
-			ictx->page = page;
-			goto done;
-		}
-		/*
-		 * Not a perfect match, need to do full blown collation so we
-		 * know which way in the B+tree we have to go.
-		 */
-		rc = ntfs_collate(vol, idx_ni->itype.index.collation_rule, key,
-				key_len, &ie->key, le16_to_cpu(ie->key_length));
-		/*
-		 * If @key collates before the key of the current entry, there
-		 * is definitely no such key in this index but we might need to
-		 * descend into the B+tree so we just break out of the loop.
-		 */
-		if (rc == -1)
-			break;
-		/*
-		 * A match should never happen as the memcmp() call should have
-		 * cought it, but we still treat it correctly.
-		 */
-		if (!rc)
-			goto ia_done;
-		/* The keys are not equal, continue the search. */
-	}
-	/*
-	 * We have finished with this index buffer without success.  Check for
-	 * the presence of a child node and if not present return -ENOENT.
-	 */
-	if (!(ie->flags & INDEX_ENTRY_NODE)) {
-		ntfs_debug("Entry not found.");
-		err = -ENOENT;
-		goto ia_done;
-	}
-	if ((ia->index.flags & NODE_MASK) == LEAF_NODE) {
-		ntfs_error(sb, "Index entry with child node found in a leaf "
-				"node in inode 0x%lx.", idx_ni->mft_no);
-		goto unm_err_out;
-	}
-	/* Child node present, descend into it. */
-	old_vcn = vcn;
-	vcn = sle64_to_cpup((sle64*)((u8*)ie + le16_to_cpu(ie->length) - 8));
-	if (vcn >= 0) {
-		/*
-		 * If vcn is in the same page cache page as old_vcn we recycle
-		 * the mapped page.
-		 */
-		if (old_vcn << vol->cluster_size_bits >>
-				PAGE_SHIFT == vcn <<
-				vol->cluster_size_bits >>
-				PAGE_SHIFT)
-			goto fast_descend_into_child_node;
-		unlock_page(page);
-		ntfs_unmap_page(page);
-		goto descend_into_child_node;
-	}
-	ntfs_error(sb, "Negative child node vcn in inode 0x%lx.",
-			idx_ni->mft_no);
-unm_err_out:
-	unlock_page(page);
-	ntfs_unmap_page(page);
-err_out:
-	if (!err)
-		err = -EIO;
-	if (actx)
-		ntfs_attr_put_search_ctx(actx);
-	if (m)
-		unmap_mft_record(base_ni);
-	return err;
-idx_err_out:
-	ntfs_error(sb, "Corrupt index.  Aborting lookup.");
-	goto err_out;
-}
diff --git a/fs/ntfs/index.h b/fs/ntfs/index.h
deleted file mode 100644
index bb3c3ae55138..000000000000
--- a/fs/ntfs/index.h
+++ /dev/null
@@ -1,134 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * index.h - Defines for NTFS kernel index handling.  Part of the Linux-NTFS
- *	     project.
- *
- * Copyright (c) 2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_INDEX_H
-#define _LINUX_NTFS_INDEX_H
-
-#include <linux/fs.h>
-
-#include "types.h"
-#include "layout.h"
-#include "inode.h"
-#include "attrib.h"
-#include "mft.h"
-#include "aops.h"
-
-/**
- * @idx_ni:	index inode containing the @entry described by this context
- * @entry:	index entry (points into @ir or @ia)
- * @data:	index entry data (points into @entry)
- * @data_len:	length in bytes of @data
- * @is_in_root:	'true' if @entry is in @ir and 'false' if it is in @ia
- * @ir:		index root if @is_in_root and NULL otherwise
- * @actx:	attribute search context if @is_in_root and NULL otherwise
- * @base_ni:	base inode if @is_in_root and NULL otherwise
- * @ia:		index block if @is_in_root is 'false' and NULL otherwise
- * @page:	page if @is_in_root is 'false' and NULL otherwise
- *
- * @idx_ni is the index inode this context belongs to.
- *
- * @entry is the index entry described by this context.  @data and @data_len
- * are the index entry data and its length in bytes, respectively.  @data
- * simply points into @entry.  This is probably what the user is interested in.
- *
- * If @is_in_root is 'true', @entry is in the index root attribute @ir described
- * by the attribute search context @actx and the base inode @base_ni.  @ia and
- * @page are NULL in this case.
- *
- * If @is_in_root is 'false', @entry is in the index allocation attribute and @ia
- * and @page point to the index allocation block and the mapped, locked page it
- * is in, respectively.  @ir, @actx and @base_ni are NULL in this case.
- *
- * To obtain a context call ntfs_index_ctx_get().
- *
- * We use this context to allow ntfs_index_lookup() to return the found index
- * @entry and its @data without having to allocate a buffer and copy the @entry
- * and/or its @data into it.
- *
- * When finished with the @entry and its @data, call ntfs_index_ctx_put() to
- * free the context and other associated resources.
- *
- * If the index entry was modified, call flush_dcache_index_entry_page()
- * immediately after the modification and either ntfs_index_entry_mark_dirty()
- * or ntfs_index_entry_write() before the call to ntfs_index_ctx_put() to
- * ensure that the changes are written to disk.
- */
-typedef struct {
-	ntfs_inode *idx_ni;
-	INDEX_ENTRY *entry;
-	void *data;
-	u16 data_len;
-	bool is_in_root;
-	INDEX_ROOT *ir;
-	ntfs_attr_search_ctx *actx;
-	ntfs_inode *base_ni;
-	INDEX_ALLOCATION *ia;
-	struct page *page;
-} ntfs_index_context;
-
-extern ntfs_index_context *ntfs_index_ctx_get(ntfs_inode *idx_ni);
-extern void ntfs_index_ctx_put(ntfs_index_context *ictx);
-
-extern int ntfs_index_lookup(const void *key, const int key_len,
-		ntfs_index_context *ictx);
-
-#ifdef NTFS_RW
-
-/**
- * ntfs_index_entry_flush_dcache_page - flush_dcache_page() for index entries
- * @ictx:	ntfs index context describing the index entry
- *
- * Call flush_dcache_page() for the page in which an index entry resides.
- *
- * This must be called every time an index entry is modified, just after the
- * modification.
- *
- * If the index entry is in the index root attribute, simply flush the page
- * containing the mft record containing the index root attribute.
- *
- * If the index entry is in an index block belonging to the index allocation
- * attribute, simply flush the page cache page containing the index block.
- */
-static inline void ntfs_index_entry_flush_dcache_page(ntfs_index_context *ictx)
-{
-	if (ictx->is_in_root)
-		flush_dcache_mft_record_page(ictx->actx->ntfs_ino);
-	else
-		flush_dcache_page(ictx->page);
-}
-
-/**
- * ntfs_index_entry_mark_dirty - mark an index entry dirty
- * @ictx:	ntfs index context describing the index entry
- *
- * Mark the index entry described by the index entry context @ictx dirty.
- *
- * If the index entry is in the index root attribute, simply mark the mft
- * record containing the index root attribute dirty.  This ensures the mft
- * record, and hence the index root attribute, will be written out to disk
- * later.
- *
- * If the index entry is in an index block belonging to the index allocation
- * attribute, mark the buffers belonging to the index record as well as the
- * page cache page the index block is in dirty.  This automatically marks the
- * VFS inode of the ntfs index inode to which the index entry belongs dirty,
- * too (I_DIRTY_PAGES) and this in turn ensures the page buffers, and hence the
- * dirty index block, will be written out to disk later.
- */
-static inline void ntfs_index_entry_mark_dirty(ntfs_index_context *ictx)
-{
-	if (ictx->is_in_root)
-		mark_mft_record_dirty(ictx->actx->ntfs_ino);
-	else
-		mark_ntfs_record_dirty(ictx->page,
-				(u8*)ictx->ia - (u8*)page_address(ictx->page));
-}
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_INDEX_H */
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
deleted file mode 100644
index aba1e22db4e9..000000000000
--- a/fs/ntfs/inode.c
+++ /dev/null
@@ -1,3102 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * inode.c - NTFS kernel inode handling.
- *
- * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
- */
-
-#include <linux/buffer_head.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/mount.h>
-#include <linux/mutex.h>
-#include <linux/pagemap.h>
-#include <linux/quotaops.h>
-#include <linux/slab.h>
-#include <linux/log2.h>
-
-#include "aops.h"
-#include "attrib.h"
-#include "bitmap.h"
-#include "dir.h"
-#include "debug.h"
-#include "inode.h"
-#include "lcnalloc.h"
-#include "malloc.h"
-#include "mft.h"
-#include "time.h"
-#include "ntfs.h"
-
-/**
- * ntfs_test_inode - compare two (possibly fake) inodes for equality
- * @vi:		vfs inode which to test
- * @data:	data which is being tested with
- *
- * Compare the ntfs attribute embedded in the ntfs specific part of the vfs
- * inode @vi for equality with the ntfs attribute @data.
- *
- * If searching for the normal file/directory inode, set @na->type to AT_UNUSED.
- * @na->name and @na->name_len are then ignored.
- *
- * Return 1 if the attributes match and 0 if not.
- *
- * NOTE: This function runs with the inode_hash_lock spin lock held so it is not
- * allowed to sleep.
- */
-int ntfs_test_inode(struct inode *vi, void *data)
-{
-	ntfs_attr *na = (ntfs_attr *)data;
-	ntfs_inode *ni;
-
-	if (vi->i_ino != na->mft_no)
-		return 0;
-	ni = NTFS_I(vi);
-	/* If !NInoAttr(ni), @vi is a normal file or directory inode. */
-	if (likely(!NInoAttr(ni))) {
-		/* If not looking for a normal inode this is a mismatch. */
-		if (unlikely(na->type != AT_UNUSED))
-			return 0;
-	} else {
-		/* A fake inode describing an attribute. */
-		if (ni->type != na->type)
-			return 0;
-		if (ni->name_len != na->name_len)
-			return 0;
-		if (na->name_len && memcmp(ni->name, na->name,
-				na->name_len * sizeof(ntfschar)))
-			return 0;
-	}
-	/* Match! */
-	return 1;
-}
-
-/**
- * ntfs_init_locked_inode - initialize an inode
- * @vi:		vfs inode to initialize
- * @data:	data which to initialize @vi to
- *
- * Initialize the vfs inode @vi with the values from the ntfs attribute @data in
- * order to enable ntfs_test_inode() to do its work.
- *
- * If initializing the normal file/directory inode, set @na->type to AT_UNUSED.
- * In that case, @na->name and @na->name_len should be set to NULL and 0,
- * respectively. Although that is not strictly necessary as
- * ntfs_read_locked_inode() will fill them in later.
- *
- * Return 0 on success and -errno on error.
- *
- * NOTE: This function runs with the inode->i_lock spin lock held so it is not
- * allowed to sleep. (Hence the GFP_ATOMIC allocation.)
- */
-static int ntfs_init_locked_inode(struct inode *vi, void *data)
-{
-	ntfs_attr *na = (ntfs_attr *)data;
-	ntfs_inode *ni = NTFS_I(vi);
-
-	vi->i_ino = na->mft_no;
-
-	ni->type = na->type;
-	if (na->type == AT_INDEX_ALLOCATION)
-		NInoSetMstProtected(ni);
-
-	ni->name = na->name;
-	ni->name_len = na->name_len;
-
-	/* If initializing a normal inode, we are done. */
-	if (likely(na->type == AT_UNUSED)) {
-		BUG_ON(na->name);
-		BUG_ON(na->name_len);
-		return 0;
-	}
-
-	/* It is a fake inode. */
-	NInoSetAttr(ni);
-
-	/*
-	 * We have I30 global constant as an optimization as it is the name
-	 * in >99.9% of named attributes! The other <0.1% incur a GFP_ATOMIC
-	 * allocation but that is ok. And most attributes are unnamed anyway,
-	 * thus the fraction of named attributes with name != I30 is actually
-	 * absolutely tiny.
-	 */
-	if (na->name_len && na->name != I30) {
-		unsigned int i;
-
-		BUG_ON(!na->name);
-		i = na->name_len * sizeof(ntfschar);
-		ni->name = kmalloc(i + sizeof(ntfschar), GFP_ATOMIC);
-		if (!ni->name)
-			return -ENOMEM;
-		memcpy(ni->name, na->name, i);
-		ni->name[na->name_len] = 0;
-	}
-	return 0;
-}
-
-static int ntfs_read_locked_inode(struct inode *vi);
-static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi);
-static int ntfs_read_locked_index_inode(struct inode *base_vi,
-		struct inode *vi);
-
-/**
- * ntfs_iget - obtain a struct inode corresponding to a specific normal inode
- * @sb:		super block of mounted volume
- * @mft_no:	mft record number / inode number to obtain
- *
- * Obtain the struct inode corresponding to a specific normal inode (i.e. a
- * file or directory).
- *
- * If the inode is in the cache, it is just returned with an increased
- * reference count. Otherwise, a new struct inode is allocated and initialized,
- * and finally ntfs_read_locked_inode() is called to read in the inode and
- * fill in the remainder of the inode structure.
- *
- * Return the struct inode on success. Check the return value with IS_ERR() and
- * if true, the function failed and the error code is obtained from PTR_ERR().
- */
-struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no)
-{
-	struct inode *vi;
-	int err;
-	ntfs_attr na;
-
-	na.mft_no = mft_no;
-	na.type = AT_UNUSED;
-	na.name = NULL;
-	na.name_len = 0;
-
-	vi = iget5_locked(sb, mft_no, ntfs_test_inode,
-			ntfs_init_locked_inode, &na);
-	if (unlikely(!vi))
-		return ERR_PTR(-ENOMEM);
-
-	err = 0;
-
-	/* If this is a freshly allocated inode, need to read it now. */
-	if (vi->i_state & I_NEW) {
-		err = ntfs_read_locked_inode(vi);
-		unlock_new_inode(vi);
-	}
-	/*
-	 * There is no point in keeping bad inodes around if the failure was
-	 * due to ENOMEM. We want to be able to retry again later.
-	 */
-	if (unlikely(err == -ENOMEM)) {
-		iput(vi);
-		vi = ERR_PTR(err);
-	}
-	return vi;
-}
-
-/**
- * ntfs_attr_iget - obtain a struct inode corresponding to an attribute
- * @base_vi:	vfs base inode containing the attribute
- * @type:	attribute type
- * @name:	Unicode name of the attribute (NULL if unnamed)
- * @name_len:	length of @name in Unicode characters (0 if unnamed)
- *
- * Obtain the (fake) struct inode corresponding to the attribute specified by
- * @type, @name, and @name_len, which is present in the base mft record
- * specified by the vfs inode @base_vi.
- *
- * If the attribute inode is in the cache, it is just returned with an
- * increased reference count. Otherwise, a new struct inode is allocated and
- * initialized, and finally ntfs_read_locked_attr_inode() is called to read the
- * attribute and fill in the inode structure.
- *
- * Note, for index allocation attributes, you need to use ntfs_index_iget()
- * instead of ntfs_attr_iget() as working with indices is a lot more complex.
- *
- * Return the struct inode of the attribute inode on success. Check the return
- * value with IS_ERR() and if true, the function failed and the error code is
- * obtained from PTR_ERR().
- */
-struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type,
-		ntfschar *name, u32 name_len)
-{
-	struct inode *vi;
-	int err;
-	ntfs_attr na;
-
-	/* Make sure no one calls ntfs_attr_iget() for indices. */
-	BUG_ON(type == AT_INDEX_ALLOCATION);
-
-	na.mft_no = base_vi->i_ino;
-	na.type = type;
-	na.name = name;
-	na.name_len = name_len;
-
-	vi = iget5_locked(base_vi->i_sb, na.mft_no, ntfs_test_inode,
-			ntfs_init_locked_inode, &na);
-	if (unlikely(!vi))
-		return ERR_PTR(-ENOMEM);
-
-	err = 0;
-
-	/* If this is a freshly allocated inode, need to read it now. */
-	if (vi->i_state & I_NEW) {
-		err = ntfs_read_locked_attr_inode(base_vi, vi);
-		unlock_new_inode(vi);
-	}
-	/*
-	 * There is no point in keeping bad attribute inodes around. This also
-	 * simplifies things in that we never need to check for bad attribute
-	 * inodes elsewhere.
-	 */
-	if (unlikely(err)) {
-		iput(vi);
-		vi = ERR_PTR(err);
-	}
-	return vi;
-}
-
-/**
- * ntfs_index_iget - obtain a struct inode corresponding to an index
- * @base_vi:	vfs base inode containing the index related attributes
- * @name:	Unicode name of the index
- * @name_len:	length of @name in Unicode characters
- *
- * Obtain the (fake) struct inode corresponding to the index specified by @name
- * and @name_len, which is present in the base mft record specified by the vfs
- * inode @base_vi.
- *
- * If the index inode is in the cache, it is just returned with an increased
- * reference count.  Otherwise, a new struct inode is allocated and
- * initialized, and finally ntfs_read_locked_index_inode() is called to read
- * the index related attributes and fill in the inode structure.
- *
- * Return the struct inode of the index inode on success. Check the return
- * value with IS_ERR() and if true, the function failed and the error code is
- * obtained from PTR_ERR().
- */
-struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name,
-		u32 name_len)
-{
-	struct inode *vi;
-	int err;
-	ntfs_attr na;
-
-	na.mft_no = base_vi->i_ino;
-	na.type = AT_INDEX_ALLOCATION;
-	na.name = name;
-	na.name_len = name_len;
-
-	vi = iget5_locked(base_vi->i_sb, na.mft_no, ntfs_test_inode,
-			ntfs_init_locked_inode, &na);
-	if (unlikely(!vi))
-		return ERR_PTR(-ENOMEM);
-
-	err = 0;
-
-	/* If this is a freshly allocated inode, need to read it now. */
-	if (vi->i_state & I_NEW) {
-		err = ntfs_read_locked_index_inode(base_vi, vi);
-		unlock_new_inode(vi);
-	}
-	/*
-	 * There is no point in keeping bad index inodes around.  This also
-	 * simplifies things in that we never need to check for bad index
-	 * inodes elsewhere.
-	 */
-	if (unlikely(err)) {
-		iput(vi);
-		vi = ERR_PTR(err);
-	}
-	return vi;
-}
-
-struct inode *ntfs_alloc_big_inode(struct super_block *sb)
-{
-	ntfs_inode *ni;
-
-	ntfs_debug("Entering.");
-	ni = alloc_inode_sb(sb, ntfs_big_inode_cache, GFP_NOFS);
-	if (likely(ni != NULL)) {
-		ni->state = 0;
-		return VFS_I(ni);
-	}
-	ntfs_error(sb, "Allocation of NTFS big inode structure failed.");
-	return NULL;
-}
-
-void ntfs_free_big_inode(struct inode *inode)
-{
-	kmem_cache_free(ntfs_big_inode_cache, NTFS_I(inode));
-}
-
-static inline ntfs_inode *ntfs_alloc_extent_inode(void)
-{
-	ntfs_inode *ni;
-
-	ntfs_debug("Entering.");
-	ni = kmem_cache_alloc(ntfs_inode_cache, GFP_NOFS);
-	if (likely(ni != NULL)) {
-		ni->state = 0;
-		return ni;
-	}
-	ntfs_error(NULL, "Allocation of NTFS inode structure failed.");
-	return NULL;
-}
-
-static void ntfs_destroy_extent_inode(ntfs_inode *ni)
-{
-	ntfs_debug("Entering.");
-	BUG_ON(ni->page);
-	if (!atomic_dec_and_test(&ni->count))
-		BUG();
-	kmem_cache_free(ntfs_inode_cache, ni);
-}
-
-/*
- * The attribute runlist lock has separate locking rules from the
- * normal runlist lock, so split the two lock-classes:
- */
-static struct lock_class_key attr_list_rl_lock_class;
-
-/**
- * __ntfs_init_inode - initialize ntfs specific part of an inode
- * @sb:		super block of mounted volume
- * @ni:		freshly allocated ntfs inode which to initialize
- *
- * Initialize an ntfs inode to defaults.
- *
- * NOTE: ni->mft_no, ni->state, ni->type, ni->name, and ni->name_len are left
- * untouched. Make sure to initialize them elsewhere.
- *
- * Return zero on success and -ENOMEM on error.
- */
-void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni)
-{
-	ntfs_debug("Entering.");
-	rwlock_init(&ni->size_lock);
-	ni->initialized_size = ni->allocated_size = 0;
-	ni->seq_no = 0;
-	atomic_set(&ni->count, 1);
-	ni->vol = NTFS_SB(sb);
-	ntfs_init_runlist(&ni->runlist);
-	mutex_init(&ni->mrec_lock);
-	ni->page = NULL;
-	ni->page_ofs = 0;
-	ni->attr_list_size = 0;
-	ni->attr_list = NULL;
-	ntfs_init_runlist(&ni->attr_list_rl);
-	lockdep_set_class(&ni->attr_list_rl.lock,
-				&attr_list_rl_lock_class);
-	ni->itype.index.block_size = 0;
-	ni->itype.index.vcn_size = 0;
-	ni->itype.index.collation_rule = 0;
-	ni->itype.index.block_size_bits = 0;
-	ni->itype.index.vcn_size_bits = 0;
-	mutex_init(&ni->extent_lock);
-	ni->nr_extents = 0;
-	ni->ext.base_ntfs_ino = NULL;
-}
-
-/*
- * Extent inodes get MFT-mapped in a nested way, while the base inode
- * is still mapped. Teach this nesting to the lock validator by creating
- * a separate class for nested inode's mrec_lock's:
- */
-static struct lock_class_key extent_inode_mrec_lock_key;
-
-inline ntfs_inode *ntfs_new_extent_inode(struct super_block *sb,
-		unsigned long mft_no)
-{
-	ntfs_inode *ni = ntfs_alloc_extent_inode();
-
-	ntfs_debug("Entering.");
-	if (likely(ni != NULL)) {
-		__ntfs_init_inode(sb, ni);
-		lockdep_set_class(&ni->mrec_lock, &extent_inode_mrec_lock_key);
-		ni->mft_no = mft_no;
-		ni->type = AT_UNUSED;
-		ni->name = NULL;
-		ni->name_len = 0;
-	}
-	return ni;
-}
-
-/**
- * ntfs_is_extended_system_file - check if a file is in the $Extend directory
- * @ctx:	initialized attribute search context
- *
- * Search all file name attributes in the inode described by the attribute
- * search context @ctx and check if any of the names are in the $Extend system
- * directory.
- *
- * Return values:
- *	   1: file is in $Extend directory
- *	   0: file is not in $Extend directory
- *    -errno: failed to determine if the file is in the $Extend directory
- */
-static int ntfs_is_extended_system_file(ntfs_attr_search_ctx *ctx)
-{
-	int nr_links, err;
-
-	/* Restart search. */
-	ntfs_attr_reinit_search_ctx(ctx);
-
-	/* Get number of hard links. */
-	nr_links = le16_to_cpu(ctx->mrec->link_count);
-
-	/* Loop through all hard links. */
-	while (!(err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0, NULL, 0,
-			ctx))) {
-		FILE_NAME_ATTR *file_name_attr;
-		ATTR_RECORD *attr = ctx->attr;
-		u8 *p, *p2;
-
-		nr_links--;
-		/*
-		 * Maximum sanity checking as we are called on an inode that
-		 * we suspect might be corrupt.
-		 */
-		p = (u8*)attr + le32_to_cpu(attr->length);
-		if (p < (u8*)ctx->mrec || (u8*)p > (u8*)ctx->mrec +
-				le32_to_cpu(ctx->mrec->bytes_in_use)) {
-err_corrupt_attr:
-			ntfs_error(ctx->ntfs_ino->vol->sb, "Corrupt file name "
-					"attribute. You should run chkdsk.");
-			return -EIO;
-		}
-		if (attr->non_resident) {
-			ntfs_error(ctx->ntfs_ino->vol->sb, "Non-resident file "
-					"name. You should run chkdsk.");
-			return -EIO;
-		}
-		if (attr->flags) {
-			ntfs_error(ctx->ntfs_ino->vol->sb, "File name with "
-					"invalid flags. You should run "
-					"chkdsk.");
-			return -EIO;
-		}
-		if (!(attr->data.resident.flags & RESIDENT_ATTR_IS_INDEXED)) {
-			ntfs_error(ctx->ntfs_ino->vol->sb, "Unindexed file "
-					"name. You should run chkdsk.");
-			return -EIO;
-		}
-		file_name_attr = (FILE_NAME_ATTR*)((u8*)attr +
-				le16_to_cpu(attr->data.resident.value_offset));
-		p2 = (u8 *)file_name_attr + le32_to_cpu(attr->data.resident.value_length);
-		if (p2 < (u8*)attr || p2 > p)
-			goto err_corrupt_attr;
-		/* This attribute is ok, but is it in the $Extend directory? */
-		if (MREF_LE(file_name_attr->parent_directory) == FILE_Extend)
-			return 1;	/* YES, it's an extended system file. */
-	}
-	if (unlikely(err != -ENOENT))
-		return err;
-	if (unlikely(nr_links)) {
-		ntfs_error(ctx->ntfs_ino->vol->sb, "Inode hard link count "
-				"doesn't match number of name attributes. You "
-				"should run chkdsk.");
-		return -EIO;
-	}
-	return 0;	/* NO, it is not an extended system file. */
-}
-
-/**
- * ntfs_read_locked_inode - read an inode from its device
- * @vi:		inode to read
- *
- * ntfs_read_locked_inode() is called from ntfs_iget() to read the inode
- * described by @vi into memory from the device.
- *
- * The only fields in @vi that we need to/can look at when the function is
- * called are i_sb, pointing to the mounted device's super block, and i_ino,
- * the number of the inode to load.
- *
- * ntfs_read_locked_inode() maps, pins and locks the mft record number i_ino
- * for reading and sets up the necessary @vi fields as well as initializing
- * the ntfs inode.
- *
- * Q: What locks are held when the function is called?
- * A: i_state has I_NEW set, hence the inode is locked, also
- *    i_count is set to 1, so it is not going to go away
- *    i_flags is set to 0 and we have no business touching it.  Only an ioctl()
- *    is allowed to write to them. We should of course be honouring them but
- *    we need to do that using the IS_* macros defined in include/linux/fs.h.
- *    In any case ntfs_read_locked_inode() has nothing to do with i_flags.
- *
- * Return 0 on success and -errno on error.  In the error case, the inode will
- * have had make_bad_inode() executed on it.
- */
-static int ntfs_read_locked_inode(struct inode *vi)
-{
-	ntfs_volume *vol = NTFS_SB(vi->i_sb);
-	ntfs_inode *ni;
-	struct inode *bvi;
-	MFT_RECORD *m;
-	ATTR_RECORD *a;
-	STANDARD_INFORMATION *si;
-	ntfs_attr_search_ctx *ctx;
-	int err = 0;
-
-	ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino);
-
-	/* Setup the generic vfs inode parts now. */
-	vi->i_uid = vol->uid;
-	vi->i_gid = vol->gid;
-	vi->i_mode = 0;
-
-	/*
-	 * Initialize the ntfs specific part of @vi special casing
-	 * FILE_MFT which we need to do at mount time.
-	 */
-	if (vi->i_ino != FILE_MFT)
-		ntfs_init_big_inode(vi);
-	ni = NTFS_I(vi);
-
-	m = map_mft_record(ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		goto err_out;
-	}
-	ctx = ntfs_attr_get_search_ctx(ni, m);
-	if (!ctx) {
-		err = -ENOMEM;
-		goto unm_err_out;
-	}
-
-	if (!(m->flags & MFT_RECORD_IN_USE)) {
-		ntfs_error(vi->i_sb, "Inode is not in use!");
-		goto unm_err_out;
-	}
-	if (m->base_mft_record) {
-		ntfs_error(vi->i_sb, "Inode is an extent inode!");
-		goto unm_err_out;
-	}
-
-	/* Transfer information from mft record into vfs and ntfs inodes. */
-	vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
-
-	/*
-	 * FIXME: Keep in mind that link_count is two for files which have both
-	 * a long file name and a short file name as separate entries, so if
-	 * we are hiding short file names this will be too high. Either we need
-	 * to account for the short file names by subtracting them or we need
-	 * to make sure we delete files even though i_nlink is not zero which
-	 * might be tricky due to vfs interactions. Need to think about this
-	 * some more when implementing the unlink command.
-	 */
-	set_nlink(vi, le16_to_cpu(m->link_count));
-	/*
-	 * FIXME: Reparse points can have the directory bit set even though
-	 * they would be S_IFLNK. Need to deal with this further below when we
-	 * implement reparse points / symbolic links but it will do for now.
-	 * Also if not a directory, it could be something else, rather than
-	 * a regular file. But again, will do for now.
-	 */
-	/* Everyone gets all permissions. */
-	vi->i_mode |= S_IRWXUGO;
-	/* If read-only, no one gets write permissions. */
-	if (IS_RDONLY(vi))
-		vi->i_mode &= ~S_IWUGO;
-	if (m->flags & MFT_RECORD_IS_DIRECTORY) {
-		vi->i_mode |= S_IFDIR;
-		/*
-		 * Apply the directory permissions mask set in the mount
-		 * options.
-		 */
-		vi->i_mode &= ~vol->dmask;
-		/* Things break without this kludge! */
-		if (vi->i_nlink > 1)
-			set_nlink(vi, 1);
-	} else {
-		vi->i_mode |= S_IFREG;
-		/* Apply the file permissions mask set in the mount options. */
-		vi->i_mode &= ~vol->fmask;
-	}
-	/*
-	 * Find the standard information attribute in the mft record. At this
-	 * stage we haven't setup the attribute list stuff yet, so this could
-	 * in fact fail if the standard information is in an extent record, but
-	 * I don't think this actually ever happens.
-	 */
-	err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0, 0, 0, NULL, 0,
-			ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT) {
-			/*
-			 * TODO: We should be performing a hot fix here (if the
-			 * recover mount option is set) by creating a new
-			 * attribute.
-			 */
-			ntfs_error(vi->i_sb, "$STANDARD_INFORMATION attribute "
-					"is missing.");
-		}
-		goto unm_err_out;
-	}
-	a = ctx->attr;
-	/* Get the standard information attribute value. */
-	if ((u8 *)a + le16_to_cpu(a->data.resident.value_offset)
-			+ le32_to_cpu(a->data.resident.value_length) >
-			(u8 *)ctx->mrec + vol->mft_record_size) {
-		ntfs_error(vi->i_sb, "Corrupt standard information attribute in inode.");
-		goto unm_err_out;
-	}
-	si = (STANDARD_INFORMATION*)((u8*)a +
-			le16_to_cpu(a->data.resident.value_offset));
-
-	/* Transfer information from the standard information into vi. */
-	/*
-	 * Note: The i_?times do not quite map perfectly onto the NTFS times,
-	 * but they are close enough, and in the end it doesn't really matter
-	 * that much...
-	 */
-	/*
-	 * mtime is the last change of the data within the file. Not changed
-	 * when only metadata is changed, e.g. a rename doesn't affect mtime.
-	 */
-	inode_set_mtime_to_ts(vi, ntfs2utc(si->last_data_change_time));
-	/*
-	 * ctime is the last change of the metadata of the file. This obviously
-	 * always changes, when mtime is changed. ctime can be changed on its
-	 * own, mtime is then not changed, e.g. when a file is renamed.
-	 */
-	inode_set_ctime_to_ts(vi, ntfs2utc(si->last_mft_change_time));
-	/*
-	 * Last access to the data within the file. Not changed during a rename
-	 * for example but changed whenever the file is written to.
-	 */
-	inode_set_atime_to_ts(vi, ntfs2utc(si->last_access_time));
-
-	/* Find the attribute list attribute if present. */
-	ntfs_attr_reinit_search_ctx(ctx);
-	err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx);
-	if (err) {
-		if (unlikely(err != -ENOENT)) {
-			ntfs_error(vi->i_sb, "Failed to lookup attribute list "
-					"attribute.");
-			goto unm_err_out;
-		}
-	} else /* if (!err) */ {
-		if (vi->i_ino == FILE_MFT)
-			goto skip_attr_list_load;
-		ntfs_debug("Attribute list found in inode 0x%lx.", vi->i_ino);
-		NInoSetAttrList(ni);
-		a = ctx->attr;
-		if (a->flags & ATTR_COMPRESSION_MASK) {
-			ntfs_error(vi->i_sb, "Attribute list attribute is "
-					"compressed.");
-			goto unm_err_out;
-		}
-		if (a->flags & ATTR_IS_ENCRYPTED ||
-				a->flags & ATTR_IS_SPARSE) {
-			if (a->non_resident) {
-				ntfs_error(vi->i_sb, "Non-resident attribute "
-						"list attribute is encrypted/"
-						"sparse.");
-				goto unm_err_out;
-			}
-			ntfs_warning(vi->i_sb, "Resident attribute list "
-					"attribute in inode 0x%lx is marked "
-					"encrypted/sparse which is not true.  "
-					"However, Windows allows this and "
-					"chkdsk does not detect or correct it "
-					"so we will just ignore the invalid "
-					"flags and pretend they are not set.",
-					vi->i_ino);
-		}
-		/* Now allocate memory for the attribute list. */
-		ni->attr_list_size = (u32)ntfs_attr_size(a);
-		ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size);
-		if (!ni->attr_list) {
-			ntfs_error(vi->i_sb, "Not enough memory to allocate "
-					"buffer for attribute list.");
-			err = -ENOMEM;
-			goto unm_err_out;
-		}
-		if (a->non_resident) {
-			NInoSetAttrListNonResident(ni);
-			if (a->data.non_resident.lowest_vcn) {
-				ntfs_error(vi->i_sb, "Attribute list has non "
-						"zero lowest_vcn.");
-				goto unm_err_out;
-			}
-			/*
-			 * Setup the runlist. No need for locking as we have
-			 * exclusive access to the inode at this time.
-			 */
-			ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol,
-					a, NULL);
-			if (IS_ERR(ni->attr_list_rl.rl)) {
-				err = PTR_ERR(ni->attr_list_rl.rl);
-				ni->attr_list_rl.rl = NULL;
-				ntfs_error(vi->i_sb, "Mapping pairs "
-						"decompression failed.");
-				goto unm_err_out;
-			}
-			/* Now load the attribute list. */
-			if ((err = load_attribute_list(vol, &ni->attr_list_rl,
-					ni->attr_list, ni->attr_list_size,
-					sle64_to_cpu(a->data.non_resident.
-					initialized_size)))) {
-				ntfs_error(vi->i_sb, "Failed to load "
-						"attribute list attribute.");
-				goto unm_err_out;
-			}
-		} else /* if (!a->non_resident) */ {
-			if ((u8*)a + le16_to_cpu(a->data.resident.value_offset)
-					+ le32_to_cpu(
-					a->data.resident.value_length) >
-					(u8*)ctx->mrec + vol->mft_record_size) {
-				ntfs_error(vi->i_sb, "Corrupt attribute list "
-						"in inode.");
-				goto unm_err_out;
-			}
-			/* Now copy the attribute list. */
-			memcpy(ni->attr_list, (u8*)a + le16_to_cpu(
-					a->data.resident.value_offset),
-					le32_to_cpu(
-					a->data.resident.value_length));
-		}
-	}
-skip_attr_list_load:
-	/*
-	 * If an attribute list is present we now have the attribute list value
-	 * in ntfs_ino->attr_list and it is ntfs_ino->attr_list_size bytes.
-	 */
-	if (S_ISDIR(vi->i_mode)) {
-		loff_t bvi_size;
-		ntfs_inode *bni;
-		INDEX_ROOT *ir;
-		u8 *ir_end, *index_end;
-
-		/* It is a directory, find index root attribute. */
-		ntfs_attr_reinit_search_ctx(ctx);
-		err = ntfs_attr_lookup(AT_INDEX_ROOT, I30, 4, CASE_SENSITIVE,
-				0, NULL, 0, ctx);
-		if (unlikely(err)) {
-			if (err == -ENOENT) {
-				// FIXME: File is corrupt! Hot-fix with empty
-				// index root attribute if recovery option is
-				// set.
-				ntfs_error(vi->i_sb, "$INDEX_ROOT attribute "
-						"is missing.");
-			}
-			goto unm_err_out;
-		}
-		a = ctx->attr;
-		/* Set up the state. */
-		if (unlikely(a->non_resident)) {
-			ntfs_error(vol->sb, "$INDEX_ROOT attribute is not "
-					"resident.");
-			goto unm_err_out;
-		}
-		/* Ensure the attribute name is placed before the value. */
-		if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
-				le16_to_cpu(a->data.resident.value_offset)))) {
-			ntfs_error(vol->sb, "$INDEX_ROOT attribute name is "
-					"placed after the attribute value.");
-			goto unm_err_out;
-		}
-		/*
-		 * Compressed/encrypted index root just means that the newly
-		 * created files in that directory should be created compressed/
-		 * encrypted. However index root cannot be both compressed and
-		 * encrypted.
-		 */
-		if (a->flags & ATTR_COMPRESSION_MASK)
-			NInoSetCompressed(ni);
-		if (a->flags & ATTR_IS_ENCRYPTED) {
-			if (a->flags & ATTR_COMPRESSION_MASK) {
-				ntfs_error(vi->i_sb, "Found encrypted and "
-						"compressed attribute.");
-				goto unm_err_out;
-			}
-			NInoSetEncrypted(ni);
-		}
-		if (a->flags & ATTR_IS_SPARSE)
-			NInoSetSparse(ni);
-		ir = (INDEX_ROOT*)((u8*)a +
-				le16_to_cpu(a->data.resident.value_offset));
-		ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length);
-		if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) {
-			ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is "
-					"corrupt.");
-			goto unm_err_out;
-		}
-		index_end = (u8*)&ir->index +
-				le32_to_cpu(ir->index.index_length);
-		if (index_end > ir_end) {
-			ntfs_error(vi->i_sb, "Directory index is corrupt.");
-			goto unm_err_out;
-		}
-		if (ir->type != AT_FILE_NAME) {
-			ntfs_error(vi->i_sb, "Indexed attribute is not "
-					"$FILE_NAME.");
-			goto unm_err_out;
-		}
-		if (ir->collation_rule != COLLATION_FILE_NAME) {
-			ntfs_error(vi->i_sb, "Index collation rule is not "
-					"COLLATION_FILE_NAME.");
-			goto unm_err_out;
-		}
-		ni->itype.index.collation_rule = ir->collation_rule;
-		ni->itype.index.block_size = le32_to_cpu(ir->index_block_size);
-		if (ni->itype.index.block_size &
-				(ni->itype.index.block_size - 1)) {
-			ntfs_error(vi->i_sb, "Index block size (%u) is not a "
-					"power of two.",
-					ni->itype.index.block_size);
-			goto unm_err_out;
-		}
-		if (ni->itype.index.block_size > PAGE_SIZE) {
-			ntfs_error(vi->i_sb, "Index block size (%u) > "
-					"PAGE_SIZE (%ld) is not "
-					"supported.  Sorry.",
-					ni->itype.index.block_size,
-					PAGE_SIZE);
-			err = -EOPNOTSUPP;
-			goto unm_err_out;
-		}
-		if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) {
-			ntfs_error(vi->i_sb, "Index block size (%u) < "
-					"NTFS_BLOCK_SIZE (%i) is not "
-					"supported.  Sorry.",
-					ni->itype.index.block_size,
-					NTFS_BLOCK_SIZE);
-			err = -EOPNOTSUPP;
-			goto unm_err_out;
-		}
-		ni->itype.index.block_size_bits =
-				ffs(ni->itype.index.block_size) - 1;
-		/* Determine the size of a vcn in the directory index. */
-		if (vol->cluster_size <= ni->itype.index.block_size) {
-			ni->itype.index.vcn_size = vol->cluster_size;
-			ni->itype.index.vcn_size_bits = vol->cluster_size_bits;
-		} else {
-			ni->itype.index.vcn_size = vol->sector_size;
-			ni->itype.index.vcn_size_bits = vol->sector_size_bits;
-		}
-
-		/* Setup the index allocation attribute, even if not present. */
-		NInoSetMstProtected(ni);
-		ni->type = AT_INDEX_ALLOCATION;
-		ni->name = I30;
-		ni->name_len = 4;
-
-		if (!(ir->index.flags & LARGE_INDEX)) {
-			/* No index allocation. */
-			vi->i_size = ni->initialized_size =
-					ni->allocated_size = 0;
-			/* We are done with the mft record, so we release it. */
-			ntfs_attr_put_search_ctx(ctx);
-			unmap_mft_record(ni);
-			m = NULL;
-			ctx = NULL;
-			goto skip_large_dir_stuff;
-		} /* LARGE_INDEX: Index allocation present. Setup state. */
-		NInoSetIndexAllocPresent(ni);
-		/* Find index allocation attribute. */
-		ntfs_attr_reinit_search_ctx(ctx);
-		err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, I30, 4,
-				CASE_SENSITIVE, 0, NULL, 0, ctx);
-		if (unlikely(err)) {
-			if (err == -ENOENT)
-				ntfs_error(vi->i_sb, "$INDEX_ALLOCATION "
-						"attribute is not present but "
-						"$INDEX_ROOT indicated it is.");
-			else
-				ntfs_error(vi->i_sb, "Failed to lookup "
-						"$INDEX_ALLOCATION "
-						"attribute.");
-			goto unm_err_out;
-		}
-		a = ctx->attr;
-		if (!a->non_resident) {
-			ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
-					"is resident.");
-			goto unm_err_out;
-		}
-		/*
-		 * Ensure the attribute name is placed before the mapping pairs
-		 * array.
-		 */
-		if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
-				le16_to_cpu(
-				a->data.non_resident.mapping_pairs_offset)))) {
-			ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name "
-					"is placed after the mapping pairs "
-					"array.");
-			goto unm_err_out;
-		}
-		if (a->flags & ATTR_IS_ENCRYPTED) {
-			ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
-					"is encrypted.");
-			goto unm_err_out;
-		}
-		if (a->flags & ATTR_IS_SPARSE) {
-			ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
-					"is sparse.");
-			goto unm_err_out;
-		}
-		if (a->flags & ATTR_COMPRESSION_MASK) {
-			ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute "
-					"is compressed.");
-			goto unm_err_out;
-		}
-		if (a->data.non_resident.lowest_vcn) {
-			ntfs_error(vi->i_sb, "First extent of "
-					"$INDEX_ALLOCATION attribute has non "
-					"zero lowest_vcn.");
-			goto unm_err_out;
-		}
-		vi->i_size = sle64_to_cpu(a->data.non_resident.data_size);
-		ni->initialized_size = sle64_to_cpu(
-				a->data.non_resident.initialized_size);
-		ni->allocated_size = sle64_to_cpu(
-				a->data.non_resident.allocated_size);
-		/*
-		 * We are done with the mft record, so we release it. Otherwise
-		 * we would deadlock in ntfs_attr_iget().
-		 */
-		ntfs_attr_put_search_ctx(ctx);
-		unmap_mft_record(ni);
-		m = NULL;
-		ctx = NULL;
-		/* Get the index bitmap attribute inode. */
-		bvi = ntfs_attr_iget(vi, AT_BITMAP, I30, 4);
-		if (IS_ERR(bvi)) {
-			ntfs_error(vi->i_sb, "Failed to get bitmap attribute.");
-			err = PTR_ERR(bvi);
-			goto unm_err_out;
-		}
-		bni = NTFS_I(bvi);
-		if (NInoCompressed(bni) || NInoEncrypted(bni) ||
-				NInoSparse(bni)) {
-			ntfs_error(vi->i_sb, "$BITMAP attribute is compressed "
-					"and/or encrypted and/or sparse.");
-			goto iput_unm_err_out;
-		}
-		/* Consistency check bitmap size vs. index allocation size. */
-		bvi_size = i_size_read(bvi);
-		if ((bvi_size << 3) < (vi->i_size >>
-				ni->itype.index.block_size_bits)) {
-			ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) "
-					"for index allocation (0x%llx).",
-					bvi_size << 3, vi->i_size);
-			goto iput_unm_err_out;
-		}
-		/* No longer need the bitmap attribute inode. */
-		iput(bvi);
-skip_large_dir_stuff:
-		/* Setup the operations for this inode. */
-		vi->i_op = &ntfs_dir_inode_ops;
-		vi->i_fop = &ntfs_dir_ops;
-		vi->i_mapping->a_ops = &ntfs_mst_aops;
-	} else {
-		/* It is a file. */
-		ntfs_attr_reinit_search_ctx(ctx);
-
-		/* Setup the data attribute, even if not present. */
-		ni->type = AT_DATA;
-		ni->name = NULL;
-		ni->name_len = 0;
-
-		/* Find first extent of the unnamed data attribute. */
-		err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, 0, NULL, 0, ctx);
-		if (unlikely(err)) {
-			vi->i_size = ni->initialized_size =
-					ni->allocated_size = 0;
-			if (err != -ENOENT) {
-				ntfs_error(vi->i_sb, "Failed to lookup $DATA "
-						"attribute.");
-				goto unm_err_out;
-			}
-			/*
-			 * FILE_Secure does not have an unnamed $DATA
-			 * attribute, so we special case it here.
-			 */
-			if (vi->i_ino == FILE_Secure)
-				goto no_data_attr_special_case;
-			/*
-			 * Most if not all the system files in the $Extend
-			 * system directory do not have unnamed data
-			 * attributes so we need to check if the parent
-			 * directory of the file is FILE_Extend and if it is
-			 * ignore this error. To do this we need to get the
-			 * name of this inode from the mft record as the name
-			 * contains the back reference to the parent directory.
-			 */
-			if (ntfs_is_extended_system_file(ctx) > 0)
-				goto no_data_attr_special_case;
-			// FIXME: File is corrupt! Hot-fix with empty data
-			// attribute if recovery option is set.
-			ntfs_error(vi->i_sb, "$DATA attribute is missing.");
-			goto unm_err_out;
-		}
-		a = ctx->attr;
-		/* Setup the state. */
-		if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) {
-			if (a->flags & ATTR_COMPRESSION_MASK) {
-				NInoSetCompressed(ni);
-				if (vol->cluster_size > 4096) {
-					ntfs_error(vi->i_sb, "Found "
-							"compressed data but "
-							"compression is "
-							"disabled due to "
-							"cluster size (%i) > "
-							"4kiB.",
-							vol->cluster_size);
-					goto unm_err_out;
-				}
-				if ((a->flags & ATTR_COMPRESSION_MASK)
-						!= ATTR_IS_COMPRESSED) {
-					ntfs_error(vi->i_sb, "Found unknown "
-							"compression method "
-							"or corrupt file.");
-					goto unm_err_out;
-				}
-			}
-			if (a->flags & ATTR_IS_SPARSE)
-				NInoSetSparse(ni);
-		}
-		if (a->flags & ATTR_IS_ENCRYPTED) {
-			if (NInoCompressed(ni)) {
-				ntfs_error(vi->i_sb, "Found encrypted and "
-						"compressed data.");
-				goto unm_err_out;
-			}
-			NInoSetEncrypted(ni);
-		}
-		if (a->non_resident) {
-			NInoSetNonResident(ni);
-			if (NInoCompressed(ni) || NInoSparse(ni)) {
-				if (NInoCompressed(ni) && a->data.non_resident.
-						compression_unit != 4) {
-					ntfs_error(vi->i_sb, "Found "
-							"non-standard "
-							"compression unit (%u "
-							"instead of 4).  "
-							"Cannot handle this.",
-							a->data.non_resident.
-							compression_unit);
-					err = -EOPNOTSUPP;
-					goto unm_err_out;
-				}
-				if (a->data.non_resident.compression_unit) {
-					ni->itype.compressed.block_size = 1U <<
-							(a->data.non_resident.
-							compression_unit +
-							vol->cluster_size_bits);
-					ni->itype.compressed.block_size_bits =
-							ffs(ni->itype.
-							compressed.
-							block_size) - 1;
-					ni->itype.compressed.block_clusters =
-							1U << a->data.
-							non_resident.
-							compression_unit;
-				} else {
-					ni->itype.compressed.block_size = 0;
-					ni->itype.compressed.block_size_bits =
-							0;
-					ni->itype.compressed.block_clusters =
-							0;
-				}
-				ni->itype.compressed.size = sle64_to_cpu(
-						a->data.non_resident.
-						compressed_size);
-			}
-			if (a->data.non_resident.lowest_vcn) {
-				ntfs_error(vi->i_sb, "First extent of $DATA "
-						"attribute has non zero "
-						"lowest_vcn.");
-				goto unm_err_out;
-			}
-			vi->i_size = sle64_to_cpu(
-					a->data.non_resident.data_size);
-			ni->initialized_size = sle64_to_cpu(
-					a->data.non_resident.initialized_size);
-			ni->allocated_size = sle64_to_cpu(
-					a->data.non_resident.allocated_size);
-		} else { /* Resident attribute. */
-			vi->i_size = ni->initialized_size = le32_to_cpu(
-					a->data.resident.value_length);
-			ni->allocated_size = le32_to_cpu(a->length) -
-					le16_to_cpu(
-					a->data.resident.value_offset);
-			if (vi->i_size > ni->allocated_size) {
-				ntfs_error(vi->i_sb, "Resident data attribute "
-						"is corrupt (size exceeds "
-						"allocation).");
-				goto unm_err_out;
-			}
-		}
-no_data_attr_special_case:
-		/* We are done with the mft record, so we release it. */
-		ntfs_attr_put_search_ctx(ctx);
-		unmap_mft_record(ni);
-		m = NULL;
-		ctx = NULL;
-		/* Setup the operations for this inode. */
-		vi->i_op = &ntfs_file_inode_ops;
-		vi->i_fop = &ntfs_file_ops;
-		vi->i_mapping->a_ops = &ntfs_normal_aops;
-		if (NInoMstProtected(ni))
-			vi->i_mapping->a_ops = &ntfs_mst_aops;
-		else if (NInoCompressed(ni))
-			vi->i_mapping->a_ops = &ntfs_compressed_aops;
-	}
-	/*
-	 * The number of 512-byte blocks used on disk (for stat). This is in so
-	 * far inaccurate as it doesn't account for any named streams or other
-	 * special non-resident attributes, but that is how Windows works, too,
-	 * so we are at least consistent with Windows, if not entirely
-	 * consistent with the Linux Way. Doing it the Linux Way would cause a
-	 * significant slowdown as it would involve iterating over all
-	 * attributes in the mft record and adding the allocated/compressed
-	 * sizes of all non-resident attributes present to give us the Linux
-	 * correct size that should go into i_blocks (after division by 512).
-	 */
-	if (S_ISREG(vi->i_mode) && (NInoCompressed(ni) || NInoSparse(ni)))
-		vi->i_blocks = ni->itype.compressed.size >> 9;
-	else
-		vi->i_blocks = ni->allocated_size >> 9;
-	ntfs_debug("Done.");
-	return 0;
-iput_unm_err_out:
-	iput(bvi);
-unm_err_out:
-	if (!err)
-		err = -EIO;
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(ni);
-err_out:
-	ntfs_error(vol->sb, "Failed with error code %i.  Marking corrupt "
-			"inode 0x%lx as bad.  Run chkdsk.", err, vi->i_ino);
-	make_bad_inode(vi);
-	if (err != -EOPNOTSUPP && err != -ENOMEM)
-		NVolSetErrors(vol);
-	return err;
-}
-
-/**
- * ntfs_read_locked_attr_inode - read an attribute inode from its base inode
- * @base_vi:	base inode
- * @vi:		attribute inode to read
- *
- * ntfs_read_locked_attr_inode() is called from ntfs_attr_iget() to read the
- * attribute inode described by @vi into memory from the base mft record
- * described by @base_ni.
- *
- * ntfs_read_locked_attr_inode() maps, pins and locks the base inode for
- * reading and looks up the attribute described by @vi before setting up the
- * necessary fields in @vi as well as initializing the ntfs inode.
- *
- * Q: What locks are held when the function is called?
- * A: i_state has I_NEW set, hence the inode is locked, also
- *    i_count is set to 1, so it is not going to go away
- *
- * Return 0 on success and -errno on error.  In the error case, the inode will
- * have had make_bad_inode() executed on it.
- *
- * Note this cannot be called for AT_INDEX_ALLOCATION.
- */
-static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
-{
-	ntfs_volume *vol = NTFS_SB(vi->i_sb);
-	ntfs_inode *ni, *base_ni;
-	MFT_RECORD *m;
-	ATTR_RECORD *a;
-	ntfs_attr_search_ctx *ctx;
-	int err = 0;
-
-	ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino);
-
-	ntfs_init_big_inode(vi);
-
-	ni	= NTFS_I(vi);
-	base_ni = NTFS_I(base_vi);
-
-	/* Just mirror the values from the base inode. */
-	vi->i_uid	= base_vi->i_uid;
-	vi->i_gid	= base_vi->i_gid;
-	set_nlink(vi, base_vi->i_nlink);
-	inode_set_mtime_to_ts(vi, inode_get_mtime(base_vi));
-	inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi));
-	inode_set_atime_to_ts(vi, inode_get_atime(base_vi));
-	vi->i_generation = ni->seq_no = base_ni->seq_no;
-
-	/* Set inode type to zero but preserve permissions. */
-	vi->i_mode	= base_vi->i_mode & ~S_IFMT;
-
-	m = map_mft_record(base_ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		goto err_out;
-	}
-	ctx = ntfs_attr_get_search_ctx(base_ni, m);
-	if (!ctx) {
-		err = -ENOMEM;
-		goto unm_err_out;
-	}
-	/* Find the attribute. */
-	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err))
-		goto unm_err_out;
-	a = ctx->attr;
-	if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_SPARSE)) {
-		if (a->flags & ATTR_COMPRESSION_MASK) {
-			NInoSetCompressed(ni);
-			if ((ni->type != AT_DATA) || (ni->type == AT_DATA &&
-					ni->name_len)) {
-				ntfs_error(vi->i_sb, "Found compressed "
-						"non-data or named data "
-						"attribute.  Please report "
-						"you saw this message to "
-						"linux-ntfs-dev@lists."
-						"sourceforge.net");
-				goto unm_err_out;
-			}
-			if (vol->cluster_size > 4096) {
-				ntfs_error(vi->i_sb, "Found compressed "
-						"attribute but compression is "
-						"disabled due to cluster size "
-						"(%i) > 4kiB.",
-						vol->cluster_size);
-				goto unm_err_out;
-			}
-			if ((a->flags & ATTR_COMPRESSION_MASK) !=
-					ATTR_IS_COMPRESSED) {
-				ntfs_error(vi->i_sb, "Found unknown "
-						"compression method.");
-				goto unm_err_out;
-			}
-		}
-		/*
-		 * The compressed/sparse flag set in an index root just means
-		 * to compress all files.
-		 */
-		if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) {
-			ntfs_error(vi->i_sb, "Found mst protected attribute "
-					"but the attribute is %s.  Please "
-					"report you saw this message to "
-					"linux-ntfs-dev@lists.sourceforge.net",
-					NInoCompressed(ni) ? "compressed" :
-					"sparse");
-			goto unm_err_out;
-		}
-		if (a->flags & ATTR_IS_SPARSE)
-			NInoSetSparse(ni);
-	}
-	if (a->flags & ATTR_IS_ENCRYPTED) {
-		if (NInoCompressed(ni)) {
-			ntfs_error(vi->i_sb, "Found encrypted and compressed "
-					"data.");
-			goto unm_err_out;
-		}
-		/*
-		 * The encryption flag set in an index root just means to
-		 * encrypt all files.
-		 */
-		if (NInoMstProtected(ni) && ni->type != AT_INDEX_ROOT) {
-			ntfs_error(vi->i_sb, "Found mst protected attribute "
-					"but the attribute is encrypted.  "
-					"Please report you saw this message "
-					"to linux-ntfs-dev@lists.sourceforge."
-					"net");
-			goto unm_err_out;
-		}
-		if (ni->type != AT_DATA) {
-			ntfs_error(vi->i_sb, "Found encrypted non-data "
-					"attribute.");
-			goto unm_err_out;
-		}
-		NInoSetEncrypted(ni);
-	}
-	if (!a->non_resident) {
-		/* Ensure the attribute name is placed before the value. */
-		if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
-				le16_to_cpu(a->data.resident.value_offset)))) {
-			ntfs_error(vol->sb, "Attribute name is placed after "
-					"the attribute value.");
-			goto unm_err_out;
-		}
-		if (NInoMstProtected(ni)) {
-			ntfs_error(vi->i_sb, "Found mst protected attribute "
-					"but the attribute is resident.  "
-					"Please report you saw this message to "
-					"linux-ntfs-dev@lists.sourceforge.net");
-			goto unm_err_out;
-		}
-		vi->i_size = ni->initialized_size = le32_to_cpu(
-				a->data.resident.value_length);
-		ni->allocated_size = le32_to_cpu(a->length) -
-				le16_to_cpu(a->data.resident.value_offset);
-		if (vi->i_size > ni->allocated_size) {
-			ntfs_error(vi->i_sb, "Resident attribute is corrupt "
-					"(size exceeds allocation).");
-			goto unm_err_out;
-		}
-	} else {
-		NInoSetNonResident(ni);
-		/*
-		 * Ensure the attribute name is placed before the mapping pairs
-		 * array.
-		 */
-		if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
-				le16_to_cpu(
-				a->data.non_resident.mapping_pairs_offset)))) {
-			ntfs_error(vol->sb, "Attribute name is placed after "
-					"the mapping pairs array.");
-			goto unm_err_out;
-		}
-		if (NInoCompressed(ni) || NInoSparse(ni)) {
-			if (NInoCompressed(ni) && a->data.non_resident.
-					compression_unit != 4) {
-				ntfs_error(vi->i_sb, "Found non-standard "
-						"compression unit (%u instead "
-						"of 4).  Cannot handle this.",
-						a->data.non_resident.
-						compression_unit);
-				err = -EOPNOTSUPP;
-				goto unm_err_out;
-			}
-			if (a->data.non_resident.compression_unit) {
-				ni->itype.compressed.block_size = 1U <<
-						(a->data.non_resident.
-						compression_unit +
-						vol->cluster_size_bits);
-				ni->itype.compressed.block_size_bits =
-						ffs(ni->itype.compressed.
-						block_size) - 1;
-				ni->itype.compressed.block_clusters = 1U <<
-						a->data.non_resident.
-						compression_unit;
-			} else {
-				ni->itype.compressed.block_size = 0;
-				ni->itype.compressed.block_size_bits = 0;
-				ni->itype.compressed.block_clusters = 0;
-			}
-			ni->itype.compressed.size = sle64_to_cpu(
-					a->data.non_resident.compressed_size);
-		}
-		if (a->data.non_resident.lowest_vcn) {
-			ntfs_error(vi->i_sb, "First extent of attribute has "
-					"non-zero lowest_vcn.");
-			goto unm_err_out;
-		}
-		vi->i_size = sle64_to_cpu(a->data.non_resident.data_size);
-		ni->initialized_size = sle64_to_cpu(
-				a->data.non_resident.initialized_size);
-		ni->allocated_size = sle64_to_cpu(
-				a->data.non_resident.allocated_size);
-	}
-	vi->i_mapping->a_ops = &ntfs_normal_aops;
-	if (NInoMstProtected(ni))
-		vi->i_mapping->a_ops = &ntfs_mst_aops;
-	else if (NInoCompressed(ni))
-		vi->i_mapping->a_ops = &ntfs_compressed_aops;
-	if ((NInoCompressed(ni) || NInoSparse(ni)) && ni->type != AT_INDEX_ROOT)
-		vi->i_blocks = ni->itype.compressed.size >> 9;
-	else
-		vi->i_blocks = ni->allocated_size >> 9;
-	/*
-	 * Make sure the base inode does not go away and attach it to the
-	 * attribute inode.
-	 */
-	igrab(base_vi);
-	ni->ext.base_ntfs_ino = base_ni;
-	ni->nr_extents = -1;
-
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(base_ni);
-
-	ntfs_debug("Done.");
-	return 0;
-
-unm_err_out:
-	if (!err)
-		err = -EIO;
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(base_ni);
-err_out:
-	ntfs_error(vol->sb, "Failed with error code %i while reading attribute "
-			"inode (mft_no 0x%lx, type 0x%x, name_len %i).  "
-			"Marking corrupt inode and base inode 0x%lx as bad.  "
-			"Run chkdsk.", err, vi->i_ino, ni->type, ni->name_len,
-			base_vi->i_ino);
-	make_bad_inode(vi);
-	if (err != -ENOMEM)
-		NVolSetErrors(vol);
-	return err;
-}
-
-/**
- * ntfs_read_locked_index_inode - read an index inode from its base inode
- * @base_vi:	base inode
- * @vi:		index inode to read
- *
- * ntfs_read_locked_index_inode() is called from ntfs_index_iget() to read the
- * index inode described by @vi into memory from the base mft record described
- * by @base_ni.
- *
- * ntfs_read_locked_index_inode() maps, pins and locks the base inode for
- * reading and looks up the attributes relating to the index described by @vi
- * before setting up the necessary fields in @vi as well as initializing the
- * ntfs inode.
- *
- * Note, index inodes are essentially attribute inodes (NInoAttr() is true)
- * with the attribute type set to AT_INDEX_ALLOCATION.  Apart from that, they
- * are setup like directory inodes since directories are a special case of
- * indices ao they need to be treated in much the same way.  Most importantly,
- * for small indices the index allocation attribute might not actually exist.
- * However, the index root attribute always exists but this does not need to
- * have an inode associated with it and this is why we define a new inode type
- * index.  Also, like for directories, we need to have an attribute inode for
- * the bitmap attribute corresponding to the index allocation attribute and we
- * can store this in the appropriate field of the inode, just like we do for
- * normal directory inodes.
- *
- * Q: What locks are held when the function is called?
- * A: i_state has I_NEW set, hence the inode is locked, also
- *    i_count is set to 1, so it is not going to go away
- *
- * Return 0 on success and -errno on error.  In the error case, the inode will
- * have had make_bad_inode() executed on it.
- */
-static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi)
-{
-	loff_t bvi_size;
-	ntfs_volume *vol = NTFS_SB(vi->i_sb);
-	ntfs_inode *ni, *base_ni, *bni;
-	struct inode *bvi;
-	MFT_RECORD *m;
-	ATTR_RECORD *a;
-	ntfs_attr_search_ctx *ctx;
-	INDEX_ROOT *ir;
-	u8 *ir_end, *index_end;
-	int err = 0;
-
-	ntfs_debug("Entering for i_ino 0x%lx.", vi->i_ino);
-	ntfs_init_big_inode(vi);
-	ni	= NTFS_I(vi);
-	base_ni = NTFS_I(base_vi);
-	/* Just mirror the values from the base inode. */
-	vi->i_uid	= base_vi->i_uid;
-	vi->i_gid	= base_vi->i_gid;
-	set_nlink(vi, base_vi->i_nlink);
-	inode_set_mtime_to_ts(vi, inode_get_mtime(base_vi));
-	inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi));
-	inode_set_atime_to_ts(vi, inode_get_atime(base_vi));
-	vi->i_generation = ni->seq_no = base_ni->seq_no;
-	/* Set inode type to zero but preserve permissions. */
-	vi->i_mode	= base_vi->i_mode & ~S_IFMT;
-	/* Map the mft record for the base inode. */
-	m = map_mft_record(base_ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		goto err_out;
-	}
-	ctx = ntfs_attr_get_search_ctx(base_ni, m);
-	if (!ctx) {
-		err = -ENOMEM;
-		goto unm_err_out;
-	}
-	/* Find the index root attribute. */
-	err = ntfs_attr_lookup(AT_INDEX_ROOT, ni->name, ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT)
-			ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is "
-					"missing.");
-		goto unm_err_out;
-	}
-	a = ctx->attr;
-	/* Set up the state. */
-	if (unlikely(a->non_resident)) {
-		ntfs_error(vol->sb, "$INDEX_ROOT attribute is not resident.");
-		goto unm_err_out;
-	}
-	/* Ensure the attribute name is placed before the value. */
-	if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
-			le16_to_cpu(a->data.resident.value_offset)))) {
-		ntfs_error(vol->sb, "$INDEX_ROOT attribute name is placed "
-				"after the attribute value.");
-		goto unm_err_out;
-	}
-	/*
-	 * Compressed/encrypted/sparse index root is not allowed, except for
-	 * directories of course but those are not dealt with here.
-	 */
-	if (a->flags & (ATTR_COMPRESSION_MASK | ATTR_IS_ENCRYPTED |
-			ATTR_IS_SPARSE)) {
-		ntfs_error(vi->i_sb, "Found compressed/encrypted/sparse index "
-				"root attribute.");
-		goto unm_err_out;
-	}
-	ir = (INDEX_ROOT*)((u8*)a + le16_to_cpu(a->data.resident.value_offset));
-	ir_end = (u8*)ir + le32_to_cpu(a->data.resident.value_length);
-	if (ir_end > (u8*)ctx->mrec + vol->mft_record_size) {
-		ntfs_error(vi->i_sb, "$INDEX_ROOT attribute is corrupt.");
-		goto unm_err_out;
-	}
-	index_end = (u8*)&ir->index + le32_to_cpu(ir->index.index_length);
-	if (index_end > ir_end) {
-		ntfs_error(vi->i_sb, "Index is corrupt.");
-		goto unm_err_out;
-	}
-	if (ir->type) {
-		ntfs_error(vi->i_sb, "Index type is not 0 (type is 0x%x).",
-				le32_to_cpu(ir->type));
-		goto unm_err_out;
-	}
-	ni->itype.index.collation_rule = ir->collation_rule;
-	ntfs_debug("Index collation rule is 0x%x.",
-			le32_to_cpu(ir->collation_rule));
-	ni->itype.index.block_size = le32_to_cpu(ir->index_block_size);
-	if (!is_power_of_2(ni->itype.index.block_size)) {
-		ntfs_error(vi->i_sb, "Index block size (%u) is not a power of "
-				"two.", ni->itype.index.block_size);
-		goto unm_err_out;
-	}
-	if (ni->itype.index.block_size > PAGE_SIZE) {
-		ntfs_error(vi->i_sb, "Index block size (%u) > PAGE_SIZE "
-				"(%ld) is not supported.  Sorry.",
-				ni->itype.index.block_size, PAGE_SIZE);
-		err = -EOPNOTSUPP;
-		goto unm_err_out;
-	}
-	if (ni->itype.index.block_size < NTFS_BLOCK_SIZE) {
-		ntfs_error(vi->i_sb, "Index block size (%u) < NTFS_BLOCK_SIZE "
-				"(%i) is not supported.  Sorry.",
-				ni->itype.index.block_size, NTFS_BLOCK_SIZE);
-		err = -EOPNOTSUPP;
-		goto unm_err_out;
-	}
-	ni->itype.index.block_size_bits = ffs(ni->itype.index.block_size) - 1;
-	/* Determine the size of a vcn in the index. */
-	if (vol->cluster_size <= ni->itype.index.block_size) {
-		ni->itype.index.vcn_size = vol->cluster_size;
-		ni->itype.index.vcn_size_bits = vol->cluster_size_bits;
-	} else {
-		ni->itype.index.vcn_size = vol->sector_size;
-		ni->itype.index.vcn_size_bits = vol->sector_size_bits;
-	}
-	/* Check for presence of index allocation attribute. */
-	if (!(ir->index.flags & LARGE_INDEX)) {
-		/* No index allocation. */
-		vi->i_size = ni->initialized_size = ni->allocated_size = 0;
-		/* We are done with the mft record, so we release it. */
-		ntfs_attr_put_search_ctx(ctx);
-		unmap_mft_record(base_ni);
-		m = NULL;
-		ctx = NULL;
-		goto skip_large_index_stuff;
-	} /* LARGE_INDEX:  Index allocation present.  Setup state. */
-	NInoSetIndexAllocPresent(ni);
-	/* Find index allocation attribute. */
-	ntfs_attr_reinit_search_ctx(ctx);
-	err = ntfs_attr_lookup(AT_INDEX_ALLOCATION, ni->name, ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT)
-			ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
-					"not present but $INDEX_ROOT "
-					"indicated it is.");
-		else
-			ntfs_error(vi->i_sb, "Failed to lookup "
-					"$INDEX_ALLOCATION attribute.");
-		goto unm_err_out;
-	}
-	a = ctx->attr;
-	if (!a->non_resident) {
-		ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
-				"resident.");
-		goto unm_err_out;
-	}
-	/*
-	 * Ensure the attribute name is placed before the mapping pairs array.
-	 */
-	if (unlikely(a->name_length && (le16_to_cpu(a->name_offset) >=
-			le16_to_cpu(
-			a->data.non_resident.mapping_pairs_offset)))) {
-		ntfs_error(vol->sb, "$INDEX_ALLOCATION attribute name is "
-				"placed after the mapping pairs array.");
-		goto unm_err_out;
-	}
-	if (a->flags & ATTR_IS_ENCRYPTED) {
-		ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
-				"encrypted.");
-		goto unm_err_out;
-	}
-	if (a->flags & ATTR_IS_SPARSE) {
-		ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is sparse.");
-		goto unm_err_out;
-	}
-	if (a->flags & ATTR_COMPRESSION_MASK) {
-		ntfs_error(vi->i_sb, "$INDEX_ALLOCATION attribute is "
-				"compressed.");
-		goto unm_err_out;
-	}
-	if (a->data.non_resident.lowest_vcn) {
-		ntfs_error(vi->i_sb, "First extent of $INDEX_ALLOCATION "
-				"attribute has non zero lowest_vcn.");
-		goto unm_err_out;
-	}
-	vi->i_size = sle64_to_cpu(a->data.non_resident.data_size);
-	ni->initialized_size = sle64_to_cpu(
-			a->data.non_resident.initialized_size);
-	ni->allocated_size = sle64_to_cpu(a->data.non_resident.allocated_size);
-	/*
-	 * We are done with the mft record, so we release it.  Otherwise
-	 * we would deadlock in ntfs_attr_iget().
-	 */
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(base_ni);
-	m = NULL;
-	ctx = NULL;
-	/* Get the index bitmap attribute inode. */
-	bvi = ntfs_attr_iget(base_vi, AT_BITMAP, ni->name, ni->name_len);
-	if (IS_ERR(bvi)) {
-		ntfs_error(vi->i_sb, "Failed to get bitmap attribute.");
-		err = PTR_ERR(bvi);
-		goto unm_err_out;
-	}
-	bni = NTFS_I(bvi);
-	if (NInoCompressed(bni) || NInoEncrypted(bni) ||
-			NInoSparse(bni)) {
-		ntfs_error(vi->i_sb, "$BITMAP attribute is compressed and/or "
-				"encrypted and/or sparse.");
-		goto iput_unm_err_out;
-	}
-	/* Consistency check bitmap size vs. index allocation size. */
-	bvi_size = i_size_read(bvi);
-	if ((bvi_size << 3) < (vi->i_size >> ni->itype.index.block_size_bits)) {
-		ntfs_error(vi->i_sb, "Index bitmap too small (0x%llx) for "
-				"index allocation (0x%llx).", bvi_size << 3,
-				vi->i_size);
-		goto iput_unm_err_out;
-	}
-	iput(bvi);
-skip_large_index_stuff:
-	/* Setup the operations for this index inode. */
-	vi->i_mapping->a_ops = &ntfs_mst_aops;
-	vi->i_blocks = ni->allocated_size >> 9;
-	/*
-	 * Make sure the base inode doesn't go away and attach it to the
-	 * index inode.
-	 */
-	igrab(base_vi);
-	ni->ext.base_ntfs_ino = base_ni;
-	ni->nr_extents = -1;
-
-	ntfs_debug("Done.");
-	return 0;
-iput_unm_err_out:
-	iput(bvi);
-unm_err_out:
-	if (!err)
-		err = -EIO;
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(base_ni);
-err_out:
-	ntfs_error(vi->i_sb, "Failed with error code %i while reading index "
-			"inode (mft_no 0x%lx, name_len %i.", err, vi->i_ino,
-			ni->name_len);
-	make_bad_inode(vi);
-	if (err != -EOPNOTSUPP && err != -ENOMEM)
-		NVolSetErrors(vol);
-	return err;
-}
-
-/*
- * The MFT inode has special locking, so teach the lock validator
- * about this by splitting off the locking rules of the MFT from
- * the locking rules of other inodes. The MFT inode can never be
- * accessed from the VFS side (or even internally), only by the
- * map_mft functions.
- */
-static struct lock_class_key mft_ni_runlist_lock_key, mft_ni_mrec_lock_key;
-
-/**
- * ntfs_read_inode_mount - special read_inode for mount time use only
- * @vi:		inode to read
- *
- * Read inode FILE_MFT at mount time, only called with super_block lock
- * held from within the read_super() code path.
- *
- * This function exists because when it is called the page cache for $MFT/$DATA
- * is not initialized and hence we cannot get at the contents of mft records
- * by calling map_mft_record*().
- *
- * Further it needs to cope with the circular references problem, i.e. cannot
- * load any attributes other than $ATTRIBUTE_LIST until $DATA is loaded, because
- * we do not know where the other extent mft records are yet and again, because
- * we cannot call map_mft_record*() yet.  Obviously this applies only when an
- * attribute list is actually present in $MFT inode.
- *
- * We solve these problems by starting with the $DATA attribute before anything
- * else and iterating using ntfs_attr_lookup($DATA) over all extents.  As each
- * extent is found, we ntfs_mapping_pairs_decompress() including the implied
- * ntfs_runlists_merge().  Each step of the iteration necessarily provides
- * sufficient information for the next step to complete.
- *
- * This should work but there are two possible pit falls (see inline comments
- * below), but only time will tell if they are real pits or just smoke...
- */
-int ntfs_read_inode_mount(struct inode *vi)
-{
-	VCN next_vcn, last_vcn, highest_vcn;
-	s64 block;
-	struct super_block *sb = vi->i_sb;
-	ntfs_volume *vol = NTFS_SB(sb);
-	struct buffer_head *bh;
-	ntfs_inode *ni;
-	MFT_RECORD *m = NULL;
-	ATTR_RECORD *a;
-	ntfs_attr_search_ctx *ctx;
-	unsigned int i, nr_blocks;
-	int err;
-
-	ntfs_debug("Entering.");
-
-	/* Initialize the ntfs specific part of @vi. */
-	ntfs_init_big_inode(vi);
-
-	ni = NTFS_I(vi);
-
-	/* Setup the data attribute. It is special as it is mst protected. */
-	NInoSetNonResident(ni);
-	NInoSetMstProtected(ni);
-	NInoSetSparseDisabled(ni);
-	ni->type = AT_DATA;
-	ni->name = NULL;
-	ni->name_len = 0;
-	/*
-	 * This sets up our little cheat allowing us to reuse the async read io
-	 * completion handler for directories.
-	 */
-	ni->itype.index.block_size = vol->mft_record_size;
-	ni->itype.index.block_size_bits = vol->mft_record_size_bits;
-
-	/* Very important! Needed to be able to call map_mft_record*(). */
-	vol->mft_ino = vi;
-
-	/* Allocate enough memory to read the first mft record. */
-	if (vol->mft_record_size > 64 * 1024) {
-		ntfs_error(sb, "Unsupported mft record size %i (max 64kiB).",
-				vol->mft_record_size);
-		goto err_out;
-	}
-	i = vol->mft_record_size;
-	if (i < sb->s_blocksize)
-		i = sb->s_blocksize;
-	m = (MFT_RECORD*)ntfs_malloc_nofs(i);
-	if (!m) {
-		ntfs_error(sb, "Failed to allocate buffer for $MFT record 0.");
-		goto err_out;
-	}
-
-	/* Determine the first block of the $MFT/$DATA attribute. */
-	block = vol->mft_lcn << vol->cluster_size_bits >>
-			sb->s_blocksize_bits;
-	nr_blocks = vol->mft_record_size >> sb->s_blocksize_bits;
-	if (!nr_blocks)
-		nr_blocks = 1;
-
-	/* Load $MFT/$DATA's first mft record. */
-	for (i = 0; i < nr_blocks; i++) {
-		bh = sb_bread(sb, block++);
-		if (!bh) {
-			ntfs_error(sb, "Device read failed.");
-			goto err_out;
-		}
-		memcpy((char*)m + (i << sb->s_blocksize_bits), bh->b_data,
-				sb->s_blocksize);
-		brelse(bh);
-	}
-
-	if (le32_to_cpu(m->bytes_allocated) != vol->mft_record_size) {
-		ntfs_error(sb, "Incorrect mft record size %u in superblock, should be %u.",
-				le32_to_cpu(m->bytes_allocated), vol->mft_record_size);
-		goto err_out;
-	}
-
-	/* Apply the mst fixups. */
-	if (post_read_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size)) {
-		/* FIXME: Try to use the $MFTMirr now. */
-		ntfs_error(sb, "MST fixup failed. $MFT is corrupt.");
-		goto err_out;
-	}
-
-	/* Sanity check offset to the first attribute */
-	if (le16_to_cpu(m->attrs_offset) >= le32_to_cpu(m->bytes_allocated)) {
-		ntfs_error(sb, "Incorrect mft offset to the first attribute %u in superblock.",
-			       le16_to_cpu(m->attrs_offset));
-		goto err_out;
-	}
-
-	/* Need this to sanity check attribute list references to $MFT. */
-	vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
-
-	/* Provides read_folio() for map_mft_record(). */
-	vi->i_mapping->a_ops = &ntfs_mst_aops;
-
-	ctx = ntfs_attr_get_search_ctx(ni, m);
-	if (!ctx) {
-		err = -ENOMEM;
-		goto err_out;
-	}
-
-	/* Find the attribute list attribute if present. */
-	err = ntfs_attr_lookup(AT_ATTRIBUTE_LIST, NULL, 0, 0, 0, NULL, 0, ctx);
-	if (err) {
-		if (unlikely(err != -ENOENT)) {
-			ntfs_error(sb, "Failed to lookup attribute list "
-					"attribute. You should run chkdsk.");
-			goto put_err_out;
-		}
-	} else /* if (!err) */ {
-		ATTR_LIST_ENTRY *al_entry, *next_al_entry;
-		u8 *al_end;
-		static const char *es = "  Not allowed.  $MFT is corrupt.  "
-				"You should run chkdsk.";
-
-		ntfs_debug("Attribute list attribute found in $MFT.");
-		NInoSetAttrList(ni);
-		a = ctx->attr;
-		if (a->flags & ATTR_COMPRESSION_MASK) {
-			ntfs_error(sb, "Attribute list attribute is "
-					"compressed.%s", es);
-			goto put_err_out;
-		}
-		if (a->flags & ATTR_IS_ENCRYPTED ||
-				a->flags & ATTR_IS_SPARSE) {
-			if (a->non_resident) {
-				ntfs_error(sb, "Non-resident attribute list "
-						"attribute is encrypted/"
-						"sparse.%s", es);
-				goto put_err_out;
-			}
-			ntfs_warning(sb, "Resident attribute list attribute "
-					"in $MFT system file is marked "
-					"encrypted/sparse which is not true.  "
-					"However, Windows allows this and "
-					"chkdsk does not detect or correct it "
-					"so we will just ignore the invalid "
-					"flags and pretend they are not set.");
-		}
-		/* Now allocate memory for the attribute list. */
-		ni->attr_list_size = (u32)ntfs_attr_size(a);
-		if (!ni->attr_list_size) {
-			ntfs_error(sb, "Attr_list_size is zero");
-			goto put_err_out;
-		}
-		ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size);
-		if (!ni->attr_list) {
-			ntfs_error(sb, "Not enough memory to allocate buffer "
-					"for attribute list.");
-			goto put_err_out;
-		}
-		if (a->non_resident) {
-			NInoSetAttrListNonResident(ni);
-			if (a->data.non_resident.lowest_vcn) {
-				ntfs_error(sb, "Attribute list has non zero "
-						"lowest_vcn. $MFT is corrupt. "
-						"You should run chkdsk.");
-				goto put_err_out;
-			}
-			/* Setup the runlist. */
-			ni->attr_list_rl.rl = ntfs_mapping_pairs_decompress(vol,
-					a, NULL);
-			if (IS_ERR(ni->attr_list_rl.rl)) {
-				err = PTR_ERR(ni->attr_list_rl.rl);
-				ni->attr_list_rl.rl = NULL;
-				ntfs_error(sb, "Mapping pairs decompression "
-						"failed with error code %i.",
-						-err);
-				goto put_err_out;
-			}
-			/* Now load the attribute list. */
-			if ((err = load_attribute_list(vol, &ni->attr_list_rl,
-					ni->attr_list, ni->attr_list_size,
-					sle64_to_cpu(a->data.
-					non_resident.initialized_size)))) {
-				ntfs_error(sb, "Failed to load attribute list "
-						"attribute with error code %i.",
-						-err);
-				goto put_err_out;
-			}
-		} else /* if (!ctx.attr->non_resident) */ {
-			if ((u8*)a + le16_to_cpu(
-					a->data.resident.value_offset) +
-					le32_to_cpu(
-					a->data.resident.value_length) >
-					(u8*)ctx->mrec + vol->mft_record_size) {
-				ntfs_error(sb, "Corrupt attribute list "
-						"attribute.");
-				goto put_err_out;
-			}
-			/* Now copy the attribute list. */
-			memcpy(ni->attr_list, (u8*)a + le16_to_cpu(
-					a->data.resident.value_offset),
-					le32_to_cpu(
-					a->data.resident.value_length));
-		}
-		/* The attribute list is now setup in memory. */
-		/*
-		 * FIXME: I don't know if this case is actually possible.
-		 * According to logic it is not possible but I have seen too
-		 * many weird things in MS software to rely on logic... Thus we
-		 * perform a manual search and make sure the first $MFT/$DATA
-		 * extent is in the base inode. If it is not we abort with an
-		 * error and if we ever see a report of this error we will need
-		 * to do some magic in order to have the necessary mft record
-		 * loaded and in the right place in the page cache. But
-		 * hopefully logic will prevail and this never happens...
-		 */
-		al_entry = (ATTR_LIST_ENTRY*)ni->attr_list;
-		al_end = (u8*)al_entry + ni->attr_list_size;
-		for (;; al_entry = next_al_entry) {
-			/* Out of bounds check. */
-			if ((u8*)al_entry < ni->attr_list ||
-					(u8*)al_entry > al_end)
-				goto em_put_err_out;
-			/* Catch the end of the attribute list. */
-			if ((u8*)al_entry == al_end)
-				goto em_put_err_out;
-			if (!al_entry->length)
-				goto em_put_err_out;
-			if ((u8*)al_entry + 6 > al_end || (u8*)al_entry +
-					le16_to_cpu(al_entry->length) > al_end)
-				goto em_put_err_out;
-			next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
-					le16_to_cpu(al_entry->length));
-			if (le32_to_cpu(al_entry->type) > le32_to_cpu(AT_DATA))
-				goto em_put_err_out;
-			if (AT_DATA != al_entry->type)
-				continue;
-			/* We want an unnamed attribute. */
-			if (al_entry->name_length)
-				goto em_put_err_out;
-			/* Want the first entry, i.e. lowest_vcn == 0. */
-			if (al_entry->lowest_vcn)
-				goto em_put_err_out;
-			/* First entry has to be in the base mft record. */
-			if (MREF_LE(al_entry->mft_reference) != vi->i_ino) {
-				/* MFT references do not match, logic fails. */
-				ntfs_error(sb, "BUG: The first $DATA extent "
-						"of $MFT is not in the base "
-						"mft record. Please report "
-						"you saw this message to "
-						"linux-ntfs-dev@lists."
-						"sourceforge.net");
-				goto put_err_out;
-			} else {
-				/* Sequence numbers must match. */
-				if (MSEQNO_LE(al_entry->mft_reference) !=
-						ni->seq_no)
-					goto em_put_err_out;
-				/* Got it. All is ok. We can stop now. */
-				break;
-			}
-		}
-	}
-
-	ntfs_attr_reinit_search_ctx(ctx);
-
-	/* Now load all attribute extents. */
-	a = NULL;
-	next_vcn = last_vcn = highest_vcn = 0;
-	while (!(err = ntfs_attr_lookup(AT_DATA, NULL, 0, 0, next_vcn, NULL, 0,
-			ctx))) {
-		runlist_element *nrl;
-
-		/* Cache the current attribute. */
-		a = ctx->attr;
-		/* $MFT must be non-resident. */
-		if (!a->non_resident) {
-			ntfs_error(sb, "$MFT must be non-resident but a "
-					"resident extent was found. $MFT is "
-					"corrupt. Run chkdsk.");
-			goto put_err_out;
-		}
-		/* $MFT must be uncompressed and unencrypted. */
-		if (a->flags & ATTR_COMPRESSION_MASK ||
-				a->flags & ATTR_IS_ENCRYPTED ||
-				a->flags & ATTR_IS_SPARSE) {
-			ntfs_error(sb, "$MFT must be uncompressed, "
-					"non-sparse, and unencrypted but a "
-					"compressed/sparse/encrypted extent "
-					"was found. $MFT is corrupt. Run "
-					"chkdsk.");
-			goto put_err_out;
-		}
-		/*
-		 * Decompress the mapping pairs array of this extent and merge
-		 * the result into the existing runlist. No need for locking
-		 * as we have exclusive access to the inode at this time and we
-		 * are a mount in progress task, too.
-		 */
-		nrl = ntfs_mapping_pairs_decompress(vol, a, ni->runlist.rl);
-		if (IS_ERR(nrl)) {
-			ntfs_error(sb, "ntfs_mapping_pairs_decompress() "
-					"failed with error code %ld.  $MFT is "
-					"corrupt.", PTR_ERR(nrl));
-			goto put_err_out;
-		}
-		ni->runlist.rl = nrl;
-
-		/* Are we in the first extent? */
-		if (!next_vcn) {
-			if (a->data.non_resident.lowest_vcn) {
-				ntfs_error(sb, "First extent of $DATA "
-						"attribute has non zero "
-						"lowest_vcn. $MFT is corrupt. "
-						"You should run chkdsk.");
-				goto put_err_out;
-			}
-			/* Get the last vcn in the $DATA attribute. */
-			last_vcn = sle64_to_cpu(
-					a->data.non_resident.allocated_size)
-					>> vol->cluster_size_bits;
-			/* Fill in the inode size. */
-			vi->i_size = sle64_to_cpu(
-					a->data.non_resident.data_size);
-			ni->initialized_size = sle64_to_cpu(
-					a->data.non_resident.initialized_size);
-			ni->allocated_size = sle64_to_cpu(
-					a->data.non_resident.allocated_size);
-			/*
-			 * Verify the number of mft records does not exceed
-			 * 2^32 - 1.
-			 */
-			if ((vi->i_size >> vol->mft_record_size_bits) >=
-					(1ULL << 32)) {
-				ntfs_error(sb, "$MFT is too big! Aborting.");
-				goto put_err_out;
-			}
-			/*
-			 * We have got the first extent of the runlist for
-			 * $MFT which means it is now relatively safe to call
-			 * the normal ntfs_read_inode() function.
-			 * Complete reading the inode, this will actually
-			 * re-read the mft record for $MFT, this time entering
-			 * it into the page cache with which we complete the
-			 * kick start of the volume. It should be safe to do
-			 * this now as the first extent of $MFT/$DATA is
-			 * already known and we would hope that we don't need
-			 * further extents in order to find the other
-			 * attributes belonging to $MFT. Only time will tell if
-			 * this is really the case. If not we will have to play
-			 * magic at this point, possibly duplicating a lot of
-			 * ntfs_read_inode() at this point. We will need to
-			 * ensure we do enough of its work to be able to call
-			 * ntfs_read_inode() on extents of $MFT/$DATA. But lets
-			 * hope this never happens...
-			 */
-			ntfs_read_locked_inode(vi);
-			if (is_bad_inode(vi)) {
-				ntfs_error(sb, "ntfs_read_inode() of $MFT "
-						"failed. BUG or corrupt $MFT. "
-						"Run chkdsk and if no errors "
-						"are found, please report you "
-						"saw this message to "
-						"linux-ntfs-dev@lists."
-						"sourceforge.net");
-				ntfs_attr_put_search_ctx(ctx);
-				/* Revert to the safe super operations. */
-				ntfs_free(m);
-				return -1;
-			}
-			/*
-			 * Re-initialize some specifics about $MFT's inode as
-			 * ntfs_read_inode() will have set up the default ones.
-			 */
-			/* Set uid and gid to root. */
-			vi->i_uid = GLOBAL_ROOT_UID;
-			vi->i_gid = GLOBAL_ROOT_GID;
-			/* Regular file. No access for anyone. */
-			vi->i_mode = S_IFREG;
-			/* No VFS initiated operations allowed for $MFT. */
-			vi->i_op = &ntfs_empty_inode_ops;
-			vi->i_fop = &ntfs_empty_file_ops;
-		}
-
-		/* Get the lowest vcn for the next extent. */
-		highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
-		next_vcn = highest_vcn + 1;
-
-		/* Only one extent or error, which we catch below. */
-		if (next_vcn <= 0)
-			break;
-
-		/* Avoid endless loops due to corruption. */
-		if (next_vcn < sle64_to_cpu(
-				a->data.non_resident.lowest_vcn)) {
-			ntfs_error(sb, "$MFT has corrupt attribute list "
-					"attribute. Run chkdsk.");
-			goto put_err_out;
-		}
-	}
-	if (err != -ENOENT) {
-		ntfs_error(sb, "Failed to lookup $MFT/$DATA attribute extent. "
-				"$MFT is corrupt. Run chkdsk.");
-		goto put_err_out;
-	}
-	if (!a) {
-		ntfs_error(sb, "$MFT/$DATA attribute not found. $MFT is "
-				"corrupt. Run chkdsk.");
-		goto put_err_out;
-	}
-	if (highest_vcn && highest_vcn != last_vcn - 1) {
-		ntfs_error(sb, "Failed to load the complete runlist for "
-				"$MFT/$DATA. Driver bug or corrupt $MFT. "
-				"Run chkdsk.");
-		ntfs_debug("highest_vcn = 0x%llx, last_vcn - 1 = 0x%llx",
-				(unsigned long long)highest_vcn,
-				(unsigned long long)last_vcn - 1);
-		goto put_err_out;
-	}
-	ntfs_attr_put_search_ctx(ctx);
-	ntfs_debug("Done.");
-	ntfs_free(m);
-
-	/*
-	 * Split the locking rules of the MFT inode from the
-	 * locking rules of other inodes:
-	 */
-	lockdep_set_class(&ni->runlist.lock, &mft_ni_runlist_lock_key);
-	lockdep_set_class(&ni->mrec_lock, &mft_ni_mrec_lock_key);
-
-	return 0;
-
-em_put_err_out:
-	ntfs_error(sb, "Couldn't find first extent of $DATA attribute in "
-			"attribute list. $MFT is corrupt. Run chkdsk.");
-put_err_out:
-	ntfs_attr_put_search_ctx(ctx);
-err_out:
-	ntfs_error(sb, "Failed. Marking inode as bad.");
-	make_bad_inode(vi);
-	ntfs_free(m);
-	return -1;
-}
-
-static void __ntfs_clear_inode(ntfs_inode *ni)
-{
-	/* Free all alocated memory. */
-	down_write(&ni->runlist.lock);
-	if (ni->runlist.rl) {
-		ntfs_free(ni->runlist.rl);
-		ni->runlist.rl = NULL;
-	}
-	up_write(&ni->runlist.lock);
-
-	if (ni->attr_list) {
-		ntfs_free(ni->attr_list);
-		ni->attr_list = NULL;
-	}
-
-	down_write(&ni->attr_list_rl.lock);
-	if (ni->attr_list_rl.rl) {
-		ntfs_free(ni->attr_list_rl.rl);
-		ni->attr_list_rl.rl = NULL;
-	}
-	up_write(&ni->attr_list_rl.lock);
-
-	if (ni->name_len && ni->name != I30) {
-		/* Catch bugs... */
-		BUG_ON(!ni->name);
-		kfree(ni->name);
-	}
-}
-
-void ntfs_clear_extent_inode(ntfs_inode *ni)
-{
-	ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
-
-	BUG_ON(NInoAttr(ni));
-	BUG_ON(ni->nr_extents != -1);
-
-#ifdef NTFS_RW
-	if (NInoDirty(ni)) {
-		if (!is_bad_inode(VFS_I(ni->ext.base_ntfs_ino)))
-			ntfs_error(ni->vol->sb, "Clearing dirty extent inode!  "
-					"Losing data!  This is a BUG!!!");
-		// FIXME:  Do something!!!
-	}
-#endif /* NTFS_RW */
-
-	__ntfs_clear_inode(ni);
-
-	/* Bye, bye... */
-	ntfs_destroy_extent_inode(ni);
-}
-
-/**
- * ntfs_evict_big_inode - clean up the ntfs specific part of an inode
- * @vi:		vfs inode pending annihilation
- *
- * When the VFS is going to remove an inode from memory, ntfs_clear_big_inode()
- * is called, which deallocates all memory belonging to the NTFS specific part
- * of the inode and returns.
- *
- * If the MFT record is dirty, we commit it before doing anything else.
- */
-void ntfs_evict_big_inode(struct inode *vi)
-{
-	ntfs_inode *ni = NTFS_I(vi);
-
-	truncate_inode_pages_final(&vi->i_data);
-	clear_inode(vi);
-
-#ifdef NTFS_RW
-	if (NInoDirty(ni)) {
-		bool was_bad = (is_bad_inode(vi));
-
-		/* Committing the inode also commits all extent inodes. */
-		ntfs_commit_inode(vi);
-
-		if (!was_bad && (is_bad_inode(vi) || NInoDirty(ni))) {
-			ntfs_error(vi->i_sb, "Failed to commit dirty inode "
-					"0x%lx.  Losing data!", vi->i_ino);
-			// FIXME:  Do something!!!
-		}
-	}
-#endif /* NTFS_RW */
-
-	/* No need to lock at this stage as no one else has a reference. */
-	if (ni->nr_extents > 0) {
-		int i;
-
-		for (i = 0; i < ni->nr_extents; i++)
-			ntfs_clear_extent_inode(ni->ext.extent_ntfs_inos[i]);
-		kfree(ni->ext.extent_ntfs_inos);
-	}
-
-	__ntfs_clear_inode(ni);
-
-	if (NInoAttr(ni)) {
-		/* Release the base inode if we are holding it. */
-		if (ni->nr_extents == -1) {
-			iput(VFS_I(ni->ext.base_ntfs_ino));
-			ni->nr_extents = 0;
-			ni->ext.base_ntfs_ino = NULL;
-		}
-	}
-	BUG_ON(ni->page);
-	if (!atomic_dec_and_test(&ni->count))
-		BUG();
-	return;
-}
-
-/**
- * ntfs_show_options - show mount options in /proc/mounts
- * @sf:		seq_file in which to write our mount options
- * @root:	root of the mounted tree whose mount options to display
- *
- * Called by the VFS once for each mounted ntfs volume when someone reads
- * /proc/mounts in order to display the NTFS specific mount options of each
- * mount. The mount options of fs specified by @root are written to the seq file
- * @sf and success is returned.
- */
-int ntfs_show_options(struct seq_file *sf, struct dentry *root)
-{
-	ntfs_volume *vol = NTFS_SB(root->d_sb);
-	int i;
-
-	seq_printf(sf, ",uid=%i", from_kuid_munged(&init_user_ns, vol->uid));
-	seq_printf(sf, ",gid=%i", from_kgid_munged(&init_user_ns, vol->gid));
-	if (vol->fmask == vol->dmask)
-		seq_printf(sf, ",umask=0%o", vol->fmask);
-	else {
-		seq_printf(sf, ",fmask=0%o", vol->fmask);
-		seq_printf(sf, ",dmask=0%o", vol->dmask);
-	}
-	seq_printf(sf, ",nls=%s", vol->nls_map->charset);
-	if (NVolCaseSensitive(vol))
-		seq_printf(sf, ",case_sensitive");
-	if (NVolShowSystemFiles(vol))
-		seq_printf(sf, ",show_sys_files");
-	if (!NVolSparseEnabled(vol))
-		seq_printf(sf, ",disable_sparse");
-	for (i = 0; on_errors_arr[i].val; i++) {
-		if (on_errors_arr[i].val & vol->on_errors)
-			seq_printf(sf, ",errors=%s", on_errors_arr[i].str);
-	}
-	seq_printf(sf, ",mft_zone_multiplier=%i", vol->mft_zone_multiplier);
-	return 0;
-}
-
-#ifdef NTFS_RW
-
-static const char *es = "  Leaving inconsistent metadata.  Unmount and run "
-		"chkdsk.";
-
-/**
- * ntfs_truncate - called when the i_size of an ntfs inode is changed
- * @vi:		inode for which the i_size was changed
- *
- * We only support i_size changes for normal files at present, i.e. not
- * compressed and not encrypted.  This is enforced in ntfs_setattr(), see
- * below.
- *
- * The kernel guarantees that @vi is a regular file (S_ISREG() is true) and
- * that the change is allowed.
- *
- * This implies for us that @vi is a file inode rather than a directory, index,
- * or attribute inode as well as that @vi is a base inode.
- *
- * Returns 0 on success or -errno on error.
- *
- * Called with ->i_mutex held.
- */
-int ntfs_truncate(struct inode *vi)
-{
-	s64 new_size, old_size, nr_freed, new_alloc_size, old_alloc_size;
-	VCN highest_vcn;
-	unsigned long flags;
-	ntfs_inode *base_ni, *ni = NTFS_I(vi);
-	ntfs_volume *vol = ni->vol;
-	ntfs_attr_search_ctx *ctx;
-	MFT_RECORD *m;
-	ATTR_RECORD *a;
-	const char *te = "  Leaving file length out of sync with i_size.";
-	int err, mp_size, size_change, alloc_change;
-
-	ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
-	BUG_ON(NInoAttr(ni));
-	BUG_ON(S_ISDIR(vi->i_mode));
-	BUG_ON(NInoMstProtected(ni));
-	BUG_ON(ni->nr_extents < 0);
-retry_truncate:
-	/*
-	 * Lock the runlist for writing and map the mft record to ensure it is
-	 * safe to mess with the attribute runlist and sizes.
-	 */
-	down_write(&ni->runlist.lock);
-	if (!NInoAttr(ni))
-		base_ni = ni;
-	else
-		base_ni = ni->ext.base_ntfs_ino;
-	m = map_mft_record(base_ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		ntfs_error(vi->i_sb, "Failed to map mft record for inode 0x%lx "
-				"(error code %d).%s", vi->i_ino, err, te);
-		ctx = NULL;
-		m = NULL;
-		goto old_bad_out;
-	}
-	ctx = ntfs_attr_get_search_ctx(base_ni, m);
-	if (unlikely(!ctx)) {
-		ntfs_error(vi->i_sb, "Failed to allocate a search context for "
-				"inode 0x%lx (not enough memory).%s",
-				vi->i_ino, te);
-		err = -ENOMEM;
-		goto old_bad_out;
-	}
-	err = ntfs_attr_lookup(ni->type, ni->name, ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err)) {
-		if (err == -ENOENT) {
-			ntfs_error(vi->i_sb, "Open attribute is missing from "
-					"mft record.  Inode 0x%lx is corrupt.  "
-					"Run chkdsk.%s", vi->i_ino, te);
-			err = -EIO;
-		} else
-			ntfs_error(vi->i_sb, "Failed to lookup attribute in "
-					"inode 0x%lx (error code %d).%s",
-					vi->i_ino, err, te);
-		goto old_bad_out;
-	}
-	m = ctx->mrec;
-	a = ctx->attr;
-	/*
-	 * The i_size of the vfs inode is the new size for the attribute value.
-	 */
-	new_size = i_size_read(vi);
-	/* The current size of the attribute value is the old size. */
-	old_size = ntfs_attr_size(a);
-	/* Calculate the new allocated size. */
-	if (NInoNonResident(ni))
-		new_alloc_size = (new_size + vol->cluster_size - 1) &
-				~(s64)vol->cluster_size_mask;
-	else
-		new_alloc_size = (new_size + 7) & ~7;
-	/* The current allocated size is the old allocated size. */
-	read_lock_irqsave(&ni->size_lock, flags);
-	old_alloc_size = ni->allocated_size;
-	read_unlock_irqrestore(&ni->size_lock, flags);
-	/*
-	 * The change in the file size.  This will be 0 if no change, >0 if the
-	 * size is growing, and <0 if the size is shrinking.
-	 */
-	size_change = -1;
-	if (new_size - old_size >= 0) {
-		size_change = 1;
-		if (new_size == old_size)
-			size_change = 0;
-	}
-	/* As above for the allocated size. */
-	alloc_change = -1;
-	if (new_alloc_size - old_alloc_size >= 0) {
-		alloc_change = 1;
-		if (new_alloc_size == old_alloc_size)
-			alloc_change = 0;
-	}
-	/*
-	 * If neither the size nor the allocation are being changed there is
-	 * nothing to do.
-	 */
-	if (!size_change && !alloc_change)
-		goto unm_done;
-	/* If the size is changing, check if new size is allowed in $AttrDef. */
-	if (size_change) {
-		err = ntfs_attr_size_bounds_check(vol, ni->type, new_size);
-		if (unlikely(err)) {
-			if (err == -ERANGE) {
-				ntfs_error(vol->sb, "Truncate would cause the "
-						"inode 0x%lx to %simum size "
-						"for its attribute type "
-						"(0x%x).  Aborting truncate.",
-						vi->i_ino,
-						new_size > old_size ? "exceed "
-						"the max" : "go under the min",
-						le32_to_cpu(ni->type));
-				err = -EFBIG;
-			} else {
-				ntfs_error(vol->sb, "Inode 0x%lx has unknown "
-						"attribute type 0x%x.  "
-						"Aborting truncate.",
-						vi->i_ino,
-						le32_to_cpu(ni->type));
-				err = -EIO;
-			}
-			/* Reset the vfs inode size to the old size. */
-			i_size_write(vi, old_size);
-			goto err_out;
-		}
-	}
-	if (NInoCompressed(ni) || NInoEncrypted(ni)) {
-		ntfs_warning(vi->i_sb, "Changes in inode size are not "
-				"supported yet for %s files, ignoring.",
-				NInoCompressed(ni) ? "compressed" :
-				"encrypted");
-		err = -EOPNOTSUPP;
-		goto bad_out;
-	}
-	if (a->non_resident)
-		goto do_non_resident_truncate;
-	BUG_ON(NInoNonResident(ni));
-	/* Resize the attribute record to best fit the new attribute size. */
-	if (new_size < vol->mft_record_size &&
-			!ntfs_resident_attr_value_resize(m, a, new_size)) {
-		/* The resize succeeded! */
-		flush_dcache_mft_record_page(ctx->ntfs_ino);
-		mark_mft_record_dirty(ctx->ntfs_ino);
-		write_lock_irqsave(&ni->size_lock, flags);
-		/* Update the sizes in the ntfs inode and all is done. */
-		ni->allocated_size = le32_to_cpu(a->length) -
-				le16_to_cpu(a->data.resident.value_offset);
-		/*
-		 * Note ntfs_resident_attr_value_resize() has already done any
-		 * necessary data clearing in the attribute record.  When the
-		 * file is being shrunk vmtruncate() will already have cleared
-		 * the top part of the last partial page, i.e. since this is
-		 * the resident case this is the page with index 0.  However,
-		 * when the file is being expanded, the page cache page data
-		 * between the old data_size, i.e. old_size, and the new_size
-		 * has not been zeroed.  Fortunately, we do not need to zero it
-		 * either since on one hand it will either already be zero due
-		 * to both read_folio and writepage clearing partial page data
-		 * beyond i_size in which case there is nothing to do or in the
-		 * case of the file being mmap()ped at the same time, POSIX
-		 * specifies that the behaviour is unspecified thus we do not
-		 * have to do anything.  This means that in our implementation
-		 * in the rare case that the file is mmap()ped and a write
-		 * occurred into the mmap()ped region just beyond the file size
-		 * and writepage has not yet been called to write out the page
-		 * (which would clear the area beyond the file size) and we now
-		 * extend the file size to incorporate this dirty region
-		 * outside the file size, a write of the page would result in
-		 * this data being written to disk instead of being cleared.
-		 * Given both POSIX and the Linux mmap(2) man page specify that
-		 * this corner case is undefined, we choose to leave it like
-		 * that as this is much simpler for us as we cannot lock the
-		 * relevant page now since we are holding too many ntfs locks
-		 * which would result in a lock reversal deadlock.
-		 */
-		ni->initialized_size = new_size;
-		write_unlock_irqrestore(&ni->size_lock, flags);
-		goto unm_done;
-	}
-	/* If the above resize failed, this must be an attribute extension. */
-	BUG_ON(size_change < 0);
-	/*
-	 * We have to drop all the locks so we can call
-	 * ntfs_attr_make_non_resident().  This could be optimised by try-
-	 * locking the first page cache page and only if that fails dropping
-	 * the locks, locking the page, and redoing all the locking and
-	 * lookups.  While this would be a huge optimisation, it is not worth
-	 * it as this is definitely a slow code path as it only ever can happen
-	 * once for any given file.
-	 */
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(base_ni);
-	up_write(&ni->runlist.lock);
-	/*
-	 * Not enough space in the mft record, try to make the attribute
-	 * non-resident and if successful restart the truncation process.
-	 */
-	err = ntfs_attr_make_non_resident(ni, old_size);
-	if (likely(!err))
-		goto retry_truncate;
-	/*
-	 * Could not make non-resident.  If this is due to this not being
-	 * permitted for this attribute type or there not being enough space,
-	 * try to make other attributes non-resident.  Otherwise fail.
-	 */
-	if (unlikely(err != -EPERM && err != -ENOSPC)) {
-		ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, attribute "
-				"type 0x%x, because the conversion from "
-				"resident to non-resident attribute failed "
-				"with error code %i.", vi->i_ino,
-				(unsigned)le32_to_cpu(ni->type), err);
-		if (err != -ENOMEM)
-			err = -EIO;
-		goto conv_err_out;
-	}
-	/* TODO: Not implemented from here, abort. */
-	if (err == -ENOSPC)
-		ntfs_error(vol->sb, "Not enough space in the mft record/on "
-				"disk for the non-resident attribute value.  "
-				"This case is not implemented yet.");
-	else /* if (err == -EPERM) */
-		ntfs_error(vol->sb, "This attribute type may not be "
-				"non-resident.  This case is not implemented "
-				"yet.");
-	err = -EOPNOTSUPP;
-	goto conv_err_out;
-#if 0
-	// TODO: Attempt to make other attributes non-resident.
-	if (!err)
-		goto do_resident_extend;
-	/*
-	 * Both the attribute list attribute and the standard information
-	 * attribute must remain in the base inode.  Thus, if this is one of
-	 * these attributes, we have to try to move other attributes out into
-	 * extent mft records instead.
-	 */
-	if (ni->type == AT_ATTRIBUTE_LIST ||
-			ni->type == AT_STANDARD_INFORMATION) {
-		// TODO: Attempt to move other attributes into extent mft
-		// records.
-		err = -EOPNOTSUPP;
-		if (!err)
-			goto do_resident_extend;
-		goto err_out;
-	}
-	// TODO: Attempt to move this attribute to an extent mft record, but
-	// only if it is not already the only attribute in an mft record in
-	// which case there would be nothing to gain.
-	err = -EOPNOTSUPP;
-	if (!err)
-		goto do_resident_extend;
-	/* There is nothing we can do to make enough space. )-: */
-	goto err_out;
-#endif
-do_non_resident_truncate:
-	BUG_ON(!NInoNonResident(ni));
-	if (alloc_change < 0) {
-		highest_vcn = sle64_to_cpu(a->data.non_resident.highest_vcn);
-		if (highest_vcn > 0 &&
-				old_alloc_size >> vol->cluster_size_bits >
-				highest_vcn + 1) {
-			/*
-			 * This attribute has multiple extents.  Not yet
-			 * supported.
-			 */
-			ntfs_error(vol->sb, "Cannot truncate inode 0x%lx, "
-					"attribute type 0x%x, because the "
-					"attribute is highly fragmented (it "
-					"consists of multiple extents) and "
-					"this case is not implemented yet.",
-					vi->i_ino,
-					(unsigned)le32_to_cpu(ni->type));
-			err = -EOPNOTSUPP;
-			goto bad_out;
-		}
-	}
-	/*
-	 * If the size is shrinking, need to reduce the initialized_size and
-	 * the data_size before reducing the allocation.
-	 */
-	if (size_change < 0) {
-		/*
-		 * Make the valid size smaller (i_size is already up-to-date).
-		 */
-		write_lock_irqsave(&ni->size_lock, flags);
-		if (new_size < ni->initialized_size) {
-			ni->initialized_size = new_size;
-			a->data.non_resident.initialized_size =
-					cpu_to_sle64(new_size);
-		}
-		a->data.non_resident.data_size = cpu_to_sle64(new_size);
-		write_unlock_irqrestore(&ni->size_lock, flags);
-		flush_dcache_mft_record_page(ctx->ntfs_ino);
-		mark_mft_record_dirty(ctx->ntfs_ino);
-		/* If the allocated size is not changing, we are done. */
-		if (!alloc_change)
-			goto unm_done;
-		/*
-		 * If the size is shrinking it makes no sense for the
-		 * allocation to be growing.
-		 */
-		BUG_ON(alloc_change > 0);
-	} else /* if (size_change >= 0) */ {
-		/*
-		 * The file size is growing or staying the same but the
-		 * allocation can be shrinking, growing or staying the same.
-		 */
-		if (alloc_change > 0) {
-			/*
-			 * We need to extend the allocation and possibly update
-			 * the data size.  If we are updating the data size,
-			 * since we are not touching the initialized_size we do
-			 * not need to worry about the actual data on disk.
-			 * And as far as the page cache is concerned, there
-			 * will be no pages beyond the old data size and any
-			 * partial region in the last page between the old and
-			 * new data size (or the end of the page if the new
-			 * data size is outside the page) does not need to be
-			 * modified as explained above for the resident
-			 * attribute truncate case.  To do this, we simply drop
-			 * the locks we hold and leave all the work to our
-			 * friendly helper ntfs_attr_extend_allocation().
-			 */
-			ntfs_attr_put_search_ctx(ctx);
-			unmap_mft_record(base_ni);
-			up_write(&ni->runlist.lock);
-			err = ntfs_attr_extend_allocation(ni, new_size,
-					size_change > 0 ? new_size : -1, -1);
-			/*
-			 * ntfs_attr_extend_allocation() will have done error
-			 * output already.
-			 */
-			goto done;
-		}
-		if (!alloc_change)
-			goto alloc_done;
-	}
-	/* alloc_change < 0 */
-	/* Free the clusters. */
-	nr_freed = ntfs_cluster_free(ni, new_alloc_size >>
-			vol->cluster_size_bits, -1, ctx);
-	m = ctx->mrec;
-	a = ctx->attr;
-	if (unlikely(nr_freed < 0)) {
-		ntfs_error(vol->sb, "Failed to release cluster(s) (error code "
-				"%lli).  Unmount and run chkdsk to recover "
-				"the lost cluster(s).", (long long)nr_freed);
-		NVolSetErrors(vol);
-		nr_freed = 0;
-	}
-	/* Truncate the runlist. */
-	err = ntfs_rl_truncate_nolock(vol, &ni->runlist,
-			new_alloc_size >> vol->cluster_size_bits);
-	/*
-	 * If the runlist truncation failed and/or the search context is no
-	 * longer valid, we cannot resize the attribute record or build the
-	 * mapping pairs array thus we mark the inode bad so that no access to
-	 * the freed clusters can happen.
-	 */
-	if (unlikely(err || IS_ERR(m))) {
-		ntfs_error(vol->sb, "Failed to %s (error code %li).%s",
-				IS_ERR(m) ?
-				"restore attribute search context" :
-				"truncate attribute runlist",
-				IS_ERR(m) ? PTR_ERR(m) : err, es);
-		err = -EIO;
-		goto bad_out;
-	}
-	/* Get the size for the shrunk mapping pairs array for the runlist. */
-	mp_size = ntfs_get_size_for_mapping_pairs(vol, ni->runlist.rl, 0, -1);
-	if (unlikely(mp_size <= 0)) {
-		ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, "
-				"attribute type 0x%x, because determining the "
-				"size for the mapping pairs failed with error "
-				"code %i.%s", vi->i_ino,
-				(unsigned)le32_to_cpu(ni->type), mp_size, es);
-		err = -EIO;
-		goto bad_out;
-	}
-	/*
-	 * Shrink the attribute record for the new mapping pairs array.  Note,
-	 * this cannot fail since we are making the attribute smaller thus by
-	 * definition there is enough space to do so.
-	 */
-	err = ntfs_attr_record_resize(m, a, mp_size +
-			le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
-	BUG_ON(err);
-	/*
-	 * Generate the mapping pairs array directly into the attribute record.
-	 */
-	err = ntfs_mapping_pairs_build(vol, (u8*)a +
-			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
-			mp_size, ni->runlist.rl, 0, -1, NULL);
-	if (unlikely(err)) {
-		ntfs_error(vol->sb, "Cannot shrink allocation of inode 0x%lx, "
-				"attribute type 0x%x, because building the "
-				"mapping pairs failed with error code %i.%s",
-				vi->i_ino, (unsigned)le32_to_cpu(ni->type),
-				err, es);
-		err = -EIO;
-		goto bad_out;
-	}
-	/* Update the allocated/compressed size as well as the highest vcn. */
-	a->data.non_resident.highest_vcn = cpu_to_sle64((new_alloc_size >>
-			vol->cluster_size_bits) - 1);
-	write_lock_irqsave(&ni->size_lock, flags);
-	ni->allocated_size = new_alloc_size;
-	a->data.non_resident.allocated_size = cpu_to_sle64(new_alloc_size);
-	if (NInoSparse(ni) || NInoCompressed(ni)) {
-		if (nr_freed) {
-			ni->itype.compressed.size -= nr_freed <<
-					vol->cluster_size_bits;
-			BUG_ON(ni->itype.compressed.size < 0);
-			a->data.non_resident.compressed_size = cpu_to_sle64(
-					ni->itype.compressed.size);
-			vi->i_blocks = ni->itype.compressed.size >> 9;
-		}
-	} else
-		vi->i_blocks = new_alloc_size >> 9;
-	write_unlock_irqrestore(&ni->size_lock, flags);
-	/*
-	 * We have shrunk the allocation.  If this is a shrinking truncate we
-	 * have already dealt with the initialized_size and the data_size above
-	 * and we are done.  If the truncate is only changing the allocation
-	 * and not the data_size, we are also done.  If this is an extending
-	 * truncate, need to extend the data_size now which is ensured by the
-	 * fact that @size_change is positive.
-	 */
-alloc_done:
-	/*
-	 * If the size is growing, need to update it now.  If it is shrinking,
-	 * we have already updated it above (before the allocation change).
-	 */
-	if (size_change > 0)
-		a->data.non_resident.data_size = cpu_to_sle64(new_size);
-	/* Ensure the modified mft record is written out. */
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-unm_done:
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(base_ni);
-	up_write(&ni->runlist.lock);
-done:
-	/* Update the mtime and ctime on the base inode. */
-	/* normally ->truncate shouldn't update ctime or mtime,
-	 * but ntfs did before so it got a copy & paste version
-	 * of file_update_time.  one day someone should fix this
-	 * for real.
-	 */
-	if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) {
-		struct timespec64 now = current_time(VFS_I(base_ni));
-		struct timespec64 ctime = inode_get_ctime(VFS_I(base_ni));
-		struct timespec64 mtime = inode_get_mtime(VFS_I(base_ni));
-		int sync_it = 0;
-
-		if (!timespec64_equal(&mtime, &now) ||
-		    !timespec64_equal(&ctime, &now))
-			sync_it = 1;
-		inode_set_ctime_to_ts(VFS_I(base_ni), now);
-		inode_set_mtime_to_ts(VFS_I(base_ni), now);
-
-		if (sync_it)
-			mark_inode_dirty_sync(VFS_I(base_ni));
-	}
-
-	if (likely(!err)) {
-		NInoClearTruncateFailed(ni);
-		ntfs_debug("Done.");
-	}
-	return err;
-old_bad_out:
-	old_size = -1;
-bad_out:
-	if (err != -ENOMEM && err != -EOPNOTSUPP)
-		NVolSetErrors(vol);
-	if (err != -EOPNOTSUPP)
-		NInoSetTruncateFailed(ni);
-	else if (old_size >= 0)
-		i_size_write(vi, old_size);
-err_out:
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(base_ni);
-	up_write(&ni->runlist.lock);
-out:
-	ntfs_debug("Failed.  Returning error code %i.", err);
-	return err;
-conv_err_out:
-	if (err != -ENOMEM && err != -EOPNOTSUPP)
-		NVolSetErrors(vol);
-	if (err != -EOPNOTSUPP)
-		NInoSetTruncateFailed(ni);
-	else
-		i_size_write(vi, old_size);
-	goto out;
-}
-
-/**
- * ntfs_truncate_vfs - wrapper for ntfs_truncate() that has no return value
- * @vi:		inode for which the i_size was changed
- *
- * Wrapper for ntfs_truncate() that has no return value.
- *
- * See ntfs_truncate() description above for details.
- */
-#ifdef NTFS_RW
-void ntfs_truncate_vfs(struct inode *vi) {
-	ntfs_truncate(vi);
-}
-#endif
-
-/**
- * ntfs_setattr - called from notify_change() when an attribute is being changed
- * @idmap:	idmap of the mount the inode was found from
- * @dentry:	dentry whose attributes to change
- * @attr:	structure describing the attributes and the changes
- *
- * We have to trap VFS attempts to truncate the file described by @dentry as
- * soon as possible, because we do not implement changes in i_size yet.  So we
- * abort all i_size changes here.
- *
- * We also abort all changes of user, group, and mode as we do not implement
- * the NTFS ACLs yet.
- *
- * Called with ->i_mutex held.
- */
-int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
-		 struct iattr *attr)
-{
-	struct inode *vi = d_inode(dentry);
-	int err;
-	unsigned int ia_valid = attr->ia_valid;
-
-	err = setattr_prepare(&nop_mnt_idmap, dentry, attr);
-	if (err)
-		goto out;
-	/* We do not support NTFS ACLs yet. */
-	if (ia_valid & (ATTR_UID | ATTR_GID | ATTR_MODE)) {
-		ntfs_warning(vi->i_sb, "Changes in user/group/mode are not "
-				"supported yet, ignoring.");
-		err = -EOPNOTSUPP;
-		goto out;
-	}
-	if (ia_valid & ATTR_SIZE) {
-		if (attr->ia_size != i_size_read(vi)) {
-			ntfs_inode *ni = NTFS_I(vi);
-			/*
-			 * FIXME: For now we do not support resizing of
-			 * compressed or encrypted files yet.
-			 */
-			if (NInoCompressed(ni) || NInoEncrypted(ni)) {
-				ntfs_warning(vi->i_sb, "Changes in inode size "
-						"are not supported yet for "
-						"%s files, ignoring.",
-						NInoCompressed(ni) ?
-						"compressed" : "encrypted");
-				err = -EOPNOTSUPP;
-			} else {
-				truncate_setsize(vi, attr->ia_size);
-				ntfs_truncate_vfs(vi);
-			}
-			if (err || ia_valid == ATTR_SIZE)
-				goto out;
-		} else {
-			/*
-			 * We skipped the truncate but must still update
-			 * timestamps.
-			 */
-			ia_valid |= ATTR_MTIME | ATTR_CTIME;
-		}
-	}
-	if (ia_valid & ATTR_ATIME)
-		inode_set_atime_to_ts(vi, attr->ia_atime);
-	if (ia_valid & ATTR_MTIME)
-		inode_set_mtime_to_ts(vi, attr->ia_mtime);
-	if (ia_valid & ATTR_CTIME)
-		inode_set_ctime_to_ts(vi, attr->ia_ctime);
-	mark_inode_dirty(vi);
-out:
-	return err;
-}
-
-/**
- * __ntfs_write_inode - write out a dirty inode
- * @vi:		inode to write out
- * @sync:	if true, write out synchronously
- *
- * Write out a dirty inode to disk including any extent inodes if present.
- *
- * If @sync is true, commit the inode to disk and wait for io completion.  This
- * is done using write_mft_record().
- *
- * If @sync is false, just schedule the write to happen but do not wait for i/o
- * completion.  In 2.6 kernels, scheduling usually happens just by virtue of
- * marking the page (and in this case mft record) dirty but we do not implement
- * this yet as write_mft_record() largely ignores the @sync parameter and
- * always performs synchronous writes.
- *
- * Return 0 on success and -errno on error.
- */
-int __ntfs_write_inode(struct inode *vi, int sync)
-{
-	sle64 nt;
-	ntfs_inode *ni = NTFS_I(vi);
-	ntfs_attr_search_ctx *ctx;
-	MFT_RECORD *m;
-	STANDARD_INFORMATION *si;
-	int err = 0;
-	bool modified = false;
-
-	ntfs_debug("Entering for %sinode 0x%lx.", NInoAttr(ni) ? "attr " : "",
-			vi->i_ino);
-	/*
-	 * Dirty attribute inodes are written via their real inodes so just
-	 * clean them here.  Access time updates are taken care off when the
-	 * real inode is written.
-	 */
-	if (NInoAttr(ni)) {
-		NInoClearDirty(ni);
-		ntfs_debug("Done.");
-		return 0;
-	}
-	/* Map, pin, and lock the mft record belonging to the inode. */
-	m = map_mft_record(ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		goto err_out;
-	}
-	/* Update the access times in the standard information attribute. */
-	ctx = ntfs_attr_get_search_ctx(ni, m);
-	if (unlikely(!ctx)) {
-		err = -ENOMEM;
-		goto unm_err_out;
-	}
-	err = ntfs_attr_lookup(AT_STANDARD_INFORMATION, NULL, 0,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err)) {
-		ntfs_attr_put_search_ctx(ctx);
-		goto unm_err_out;
-	}
-	si = (STANDARD_INFORMATION*)((u8*)ctx->attr +
-			le16_to_cpu(ctx->attr->data.resident.value_offset));
-	/* Update the access times if they have changed. */
-	nt = utc2ntfs(inode_get_mtime(vi));
-	if (si->last_data_change_time != nt) {
-		ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, "
-				"new = 0x%llx", vi->i_ino, (long long)
-				sle64_to_cpu(si->last_data_change_time),
-				(long long)sle64_to_cpu(nt));
-		si->last_data_change_time = nt;
-		modified = true;
-	}
-	nt = utc2ntfs(inode_get_ctime(vi));
-	if (si->last_mft_change_time != nt) {
-		ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, "
-				"new = 0x%llx", vi->i_ino, (long long)
-				sle64_to_cpu(si->last_mft_change_time),
-				(long long)sle64_to_cpu(nt));
-		si->last_mft_change_time = nt;
-		modified = true;
-	}
-	nt = utc2ntfs(inode_get_atime(vi));
-	if (si->last_access_time != nt) {
-		ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, "
-				"new = 0x%llx", vi->i_ino,
-				(long long)sle64_to_cpu(si->last_access_time),
-				(long long)sle64_to_cpu(nt));
-		si->last_access_time = nt;
-		modified = true;
-	}
-	/*
-	 * If we just modified the standard information attribute we need to
-	 * mark the mft record it is in dirty.  We do this manually so that
-	 * mark_inode_dirty() is not called which would redirty the inode and
-	 * hence result in an infinite loop of trying to write the inode.
-	 * There is no need to mark the base inode nor the base mft record
-	 * dirty, since we are going to write this mft record below in any case
-	 * and the base mft record may actually not have been modified so it
-	 * might not need to be written out.
-	 * NOTE: It is not a problem when the inode for $MFT itself is being
-	 * written out as mark_ntfs_record_dirty() will only set I_DIRTY_PAGES
-	 * on the $MFT inode and hence __ntfs_write_inode() will not be
-	 * re-invoked because of it which in turn is ok since the dirtied mft
-	 * record will be cleaned and written out to disk below, i.e. before
-	 * this function returns.
-	 */
-	if (modified) {
-		flush_dcache_mft_record_page(ctx->ntfs_ino);
-		if (!NInoTestSetDirty(ctx->ntfs_ino))
-			mark_ntfs_record_dirty(ctx->ntfs_ino->page,
-					ctx->ntfs_ino->page_ofs);
-	}
-	ntfs_attr_put_search_ctx(ctx);
-	/* Now the access times are updated, write the base mft record. */
-	if (NInoDirty(ni))
-		err = write_mft_record(ni, m, sync);
-	/* Write all attached extent mft records. */
-	mutex_lock(&ni->extent_lock);
-	if (ni->nr_extents > 0) {
-		ntfs_inode **extent_nis = ni->ext.extent_ntfs_inos;
-		int i;
-
-		ntfs_debug("Writing %i extent inodes.", ni->nr_extents);
-		for (i = 0; i < ni->nr_extents; i++) {
-			ntfs_inode *tni = extent_nis[i];
-
-			if (NInoDirty(tni)) {
-				MFT_RECORD *tm = map_mft_record(tni);
-				int ret;
-
-				if (IS_ERR(tm)) {
-					if (!err || err == -ENOMEM)
-						err = PTR_ERR(tm);
-					continue;
-				}
-				ret = write_mft_record(tni, tm, sync);
-				unmap_mft_record(tni);
-				if (unlikely(ret)) {
-					if (!err || err == -ENOMEM)
-						err = ret;
-				}
-			}
-		}
-	}
-	mutex_unlock(&ni->extent_lock);
-	unmap_mft_record(ni);
-	if (unlikely(err))
-		goto err_out;
-	ntfs_debug("Done.");
-	return 0;
-unm_err_out:
-	unmap_mft_record(ni);
-err_out:
-	if (err == -ENOMEM) {
-		ntfs_warning(vi->i_sb, "Not enough memory to write inode.  "
-				"Marking the inode dirty again, so the VFS "
-				"retries later.");
-		mark_inode_dirty(vi);
-	} else {
-		ntfs_error(vi->i_sb, "Failed (error %i):  Run chkdsk.", -err);
-		NVolSetErrors(ni->vol);
-	}
-	return err;
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h
deleted file mode 100644
index 147ef4ddb691..000000000000
--- a/fs/ntfs/inode.h
+++ /dev/null
@@ -1,310 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * inode.h - Defines for inode structures NTFS Linux kernel driver. Part of
- *	     the Linux-NTFS project.
- *
- * Copyright (c) 2001-2007 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#ifndef _LINUX_NTFS_INODE_H
-#define _LINUX_NTFS_INODE_H
-
-#include <linux/atomic.h>
-
-#include <linux/fs.h>
-#include <linux/list.h>
-#include <linux/mm.h>
-#include <linux/mutex.h>
-#include <linux/seq_file.h>
-
-#include "layout.h"
-#include "volume.h"
-#include "types.h"
-#include "runlist.h"
-#include "debug.h"
-
-typedef struct _ntfs_inode ntfs_inode;
-
-/*
- * The NTFS in-memory inode structure. It is just used as an extension to the
- * fields already provided in the VFS inode.
- */
-struct _ntfs_inode {
-	rwlock_t size_lock;	/* Lock serializing access to inode sizes. */
-	s64 initialized_size;	/* Copy from the attribute record. */
-	s64 allocated_size;	/* Copy from the attribute record. */
-	unsigned long state;	/* NTFS specific flags describing this inode.
-				   See ntfs_inode_state_bits below. */
-	unsigned long mft_no;	/* Number of the mft record / inode. */
-	u16 seq_no;		/* Sequence number of the mft record. */
-	atomic_t count;		/* Inode reference count for book keeping. */
-	ntfs_volume *vol;	/* Pointer to the ntfs volume of this inode. */
-	/*
-	 * If NInoAttr() is true, the below fields describe the attribute which
-	 * this fake inode belongs to. The actual inode of this attribute is
-	 * pointed to by base_ntfs_ino and nr_extents is always set to -1 (see
-	 * below). For real inodes, we also set the type (AT_DATA for files and
-	 * AT_INDEX_ALLOCATION for directories), with the name = NULL and
-	 * name_len = 0 for files and name = I30 (global constant) and
-	 * name_len = 4 for directories.
-	 */
-	ATTR_TYPE type;	/* Attribute type of this fake inode. */
-	ntfschar *name;		/* Attribute name of this fake inode. */
-	u32 name_len;		/* Attribute name length of this fake inode. */
-	runlist runlist;	/* If state has the NI_NonResident bit set,
-				   the runlist of the unnamed data attribute
-				   (if a file) or of the index allocation
-				   attribute (directory) or of the attribute
-				   described by the fake inode (if NInoAttr()).
-				   If runlist.rl is NULL, the runlist has not
-				   been read in yet or has been unmapped. If
-				   NI_NonResident is clear, the attribute is
-				   resident (file and fake inode) or there is
-				   no $I30 index allocation attribute
-				   (small directory). In the latter case
-				   runlist.rl is always NULL.*/
-	/*
-	 * The following fields are only valid for real inodes and extent
-	 * inodes.
-	 */
-	struct mutex mrec_lock;	/* Lock for serializing access to the
-				   mft record belonging to this inode. */
-	struct page *page;	/* The page containing the mft record of the
-				   inode. This should only be touched by the
-				   (un)map_mft_record*() functions. */
-	int page_ofs;		/* Offset into the page at which the mft record
-				   begins. This should only be touched by the
-				   (un)map_mft_record*() functions. */
-	/*
-	 * Attribute list support (only for use by the attribute lookup
-	 * functions). Setup during read_inode for all inodes with attribute
-	 * lists. Only valid if NI_AttrList is set in state, and attr_list_rl is
-	 * further only valid if NI_AttrListNonResident is set.
-	 */
-	u32 attr_list_size;	/* Length of attribute list value in bytes. */
-	u8 *attr_list;		/* Attribute list value itself. */
-	runlist attr_list_rl;	/* Run list for the attribute list value. */
-	union {
-		struct { /* It is a directory, $MFT, or an index inode. */
-			u32 block_size;		/* Size of an index block. */
-			u32 vcn_size;		/* Size of a vcn in this
-						   index. */
-			COLLATION_RULE collation_rule; /* The collation rule
-						   for the index. */
-			u8 block_size_bits; 	/* Log2 of the above. */
-			u8 vcn_size_bits;	/* Log2 of the above. */
-		} index;
-		struct { /* It is a compressed/sparse file/attribute inode. */
-			s64 size;		/* Copy of compressed_size from
-						   $DATA. */
-			u32 block_size;		/* Size of a compression block
-						   (cb). */
-			u8 block_size_bits;	/* Log2 of the size of a cb. */
-			u8 block_clusters;	/* Number of clusters per cb. */
-		} compressed;
-	} itype;
-	struct mutex extent_lock;	/* Lock for accessing/modifying the
-					   below . */
-	s32 nr_extents;	/* For a base mft record, the number of attached extent
-			   inodes (0 if none), for extent records and for fake
-			   inodes describing an attribute this is -1. */
-	union {		/* This union is only used if nr_extents != 0. */
-		ntfs_inode **extent_ntfs_inos;	/* For nr_extents > 0, array of
-						   the ntfs inodes of the extent
-						   mft records belonging to
-						   this base inode which have
-						   been loaded. */
-		ntfs_inode *base_ntfs_ino;	/* For nr_extents == -1, the
-						   ntfs inode of the base mft
-						   record. For fake inodes, the
-						   real (base) inode to which
-						   the attribute belongs. */
-	} ext;
-};
-
-/*
- * Defined bits for the state field in the ntfs_inode structure.
- * (f) = files only, (d) = directories only, (a) = attributes/fake inodes only
- */
-typedef enum {
-	NI_Dirty,		/* 1: Mft record needs to be written to disk. */
-	NI_AttrList,		/* 1: Mft record contains an attribute list. */
-	NI_AttrListNonResident,	/* 1: Attribute list is non-resident. Implies
-				      NI_AttrList is set. */
-
-	NI_Attr,		/* 1: Fake inode for attribute i/o.
-				   0: Real inode or extent inode. */
-
-	NI_MstProtected,	/* 1: Attribute is protected by MST fixups.
-				   0: Attribute is not protected by fixups. */
-	NI_NonResident,		/* 1: Unnamed data attr is non-resident (f).
-				   1: Attribute is non-resident (a). */
-	NI_IndexAllocPresent = NI_NonResident,	/* 1: $I30 index alloc attr is
-						   present (d). */
-	NI_Compressed,		/* 1: Unnamed data attr is compressed (f).
-				   1: Create compressed files by default (d).
-				   1: Attribute is compressed (a). */
-	NI_Encrypted,		/* 1: Unnamed data attr is encrypted (f).
-				   1: Create encrypted files by default (d).
-				   1: Attribute is encrypted (a). */
-	NI_Sparse,		/* 1: Unnamed data attr is sparse (f).
-				   1: Create sparse files by default (d).
-				   1: Attribute is sparse (a). */
-	NI_SparseDisabled,	/* 1: May not create sparse regions. */
-	NI_TruncateFailed,	/* 1: Last ntfs_truncate() call failed. */
-} ntfs_inode_state_bits;
-
-/*
- * NOTE: We should be adding dirty mft records to a list somewhere and they
- * should be independent of the (ntfs/vfs) inode structure so that an inode can
- * be removed but the record can be left dirty for syncing later.
- */
-
-/*
- * Macro tricks to expand the NInoFoo(), NInoSetFoo(), and NInoClearFoo()
- * functions.
- */
-#define NINO_FNS(flag)					\
-static inline int NIno##flag(ntfs_inode *ni)		\
-{							\
-	return test_bit(NI_##flag, &(ni)->state);	\
-}							\
-static inline void NInoSet##flag(ntfs_inode *ni)	\
-{							\
-	set_bit(NI_##flag, &(ni)->state);		\
-}							\
-static inline void NInoClear##flag(ntfs_inode *ni)	\
-{							\
-	clear_bit(NI_##flag, &(ni)->state);		\
-}
-
-/*
- * As above for NInoTestSetFoo() and NInoTestClearFoo().
- */
-#define TAS_NINO_FNS(flag)					\
-static inline int NInoTestSet##flag(ntfs_inode *ni)		\
-{								\
-	return test_and_set_bit(NI_##flag, &(ni)->state);	\
-}								\
-static inline int NInoTestClear##flag(ntfs_inode *ni)		\
-{								\
-	return test_and_clear_bit(NI_##flag, &(ni)->state);	\
-}
-
-/* Emit the ntfs inode bitops functions. */
-NINO_FNS(Dirty)
-TAS_NINO_FNS(Dirty)
-NINO_FNS(AttrList)
-NINO_FNS(AttrListNonResident)
-NINO_FNS(Attr)
-NINO_FNS(MstProtected)
-NINO_FNS(NonResident)
-NINO_FNS(IndexAllocPresent)
-NINO_FNS(Compressed)
-NINO_FNS(Encrypted)
-NINO_FNS(Sparse)
-NINO_FNS(SparseDisabled)
-NINO_FNS(TruncateFailed)
-
-/*
- * The full structure containing a ntfs_inode and a vfs struct inode. Used for
- * all real and fake inodes but not for extent inodes which lack the vfs struct
- * inode.
- */
-typedef struct {
-	ntfs_inode ntfs_inode;
-	struct inode vfs_inode;		/* The vfs inode structure. */
-} big_ntfs_inode;
-
-/**
- * NTFS_I - return the ntfs inode given a vfs inode
- * @inode:	VFS inode
- *
- * NTFS_I() returns the ntfs inode associated with the VFS @inode.
- */
-static inline ntfs_inode *NTFS_I(struct inode *inode)
-{
-	return (ntfs_inode *)container_of(inode, big_ntfs_inode, vfs_inode);
-}
-
-static inline struct inode *VFS_I(ntfs_inode *ni)
-{
-	return &((big_ntfs_inode *)ni)->vfs_inode;
-}
-
-/**
- * ntfs_attr - ntfs in memory attribute structure
- * @mft_no:	mft record number of the base mft record of this attribute
- * @name:	Unicode name of the attribute (NULL if unnamed)
- * @name_len:	length of @name in Unicode characters (0 if unnamed)
- * @type:	attribute type (see layout.h)
- *
- * This structure exists only to provide a small structure for the
- * ntfs_{attr_}iget()/ntfs_test_inode()/ntfs_init_locked_inode() mechanism.
- *
- * NOTE: Elements are ordered by size to make the structure as compact as
- * possible on all architectures.
- */
-typedef struct {
-	unsigned long mft_no;
-	ntfschar *name;
-	u32 name_len;
-	ATTR_TYPE type;
-} ntfs_attr;
-
-extern int ntfs_test_inode(struct inode *vi, void *data);
-
-extern struct inode *ntfs_iget(struct super_block *sb, unsigned long mft_no);
-extern struct inode *ntfs_attr_iget(struct inode *base_vi, ATTR_TYPE type,
-		ntfschar *name, u32 name_len);
-extern struct inode *ntfs_index_iget(struct inode *base_vi, ntfschar *name,
-		u32 name_len);
-
-extern struct inode *ntfs_alloc_big_inode(struct super_block *sb);
-extern void ntfs_free_big_inode(struct inode *inode);
-extern void ntfs_evict_big_inode(struct inode *vi);
-
-extern void __ntfs_init_inode(struct super_block *sb, ntfs_inode *ni);
-
-static inline void ntfs_init_big_inode(struct inode *vi)
-{
-	ntfs_inode *ni = NTFS_I(vi);
-
-	ntfs_debug("Entering.");
-	__ntfs_init_inode(vi->i_sb, ni);
-	ni->mft_no = vi->i_ino;
-}
-
-extern ntfs_inode *ntfs_new_extent_inode(struct super_block *sb,
-		unsigned long mft_no);
-extern void ntfs_clear_extent_inode(ntfs_inode *ni);
-
-extern int ntfs_read_inode_mount(struct inode *vi);
-
-extern int ntfs_show_options(struct seq_file *sf, struct dentry *root);
-
-#ifdef NTFS_RW
-
-extern int ntfs_truncate(struct inode *vi);
-extern void ntfs_truncate_vfs(struct inode *vi);
-
-extern int ntfs_setattr(struct mnt_idmap *idmap,
-			struct dentry *dentry, struct iattr *attr);
-
-extern int __ntfs_write_inode(struct inode *vi, int sync);
-
-static inline void ntfs_commit_inode(struct inode *vi)
-{
-	if (!is_bad_inode(vi))
-		__ntfs_write_inode(vi, 1);
-	return;
-}
-
-#else
-
-static inline void ntfs_truncate_vfs(struct inode *vi) {}
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_INODE_H */
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
deleted file mode 100644
index 5d4bf7a3259f..000000000000
--- a/fs/ntfs/layout.h
+++ /dev/null
@@ -1,2421 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * layout.h - All NTFS associated on-disk structures. Part of the Linux-NTFS
- *	      project.
- *
- * Copyright (c) 2001-2005 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#ifndef _LINUX_NTFS_LAYOUT_H
-#define _LINUX_NTFS_LAYOUT_H
-
-#include <linux/types.h>
-#include <linux/bitops.h>
-#include <linux/list.h>
-#include <asm/byteorder.h>
-
-#include "types.h"
-
-/* The NTFS oem_id "NTFS    " */
-#define magicNTFS	cpu_to_le64(0x202020205346544eULL)
-
-/*
- * Location of bootsector on partition:
- *	The standard NTFS_BOOT_SECTOR is on sector 0 of the partition.
- *	On NT4 and above there is one backup copy of the boot sector to
- *	be found on the last sector of the partition (not normally accessible
- *	from within Windows as the bootsector contained number of sectors
- *	value is one less than the actual value!).
- *	On versions of NT 3.51 and earlier, the backup copy was located at
- *	number of sectors/2 (integer divide), i.e. in the middle of the volume.
- */
-
-/*
- * BIOS parameter block (bpb) structure.
- */
-typedef struct {
-	le16 bytes_per_sector;		/* Size of a sector in bytes. */
-	u8  sectors_per_cluster;	/* Size of a cluster in sectors. */
-	le16 reserved_sectors;		/* zero */
-	u8  fats;			/* zero */
-	le16 root_entries;		/* zero */
-	le16 sectors;			/* zero */
-	u8  media_type;			/* 0xf8 = hard disk */
-	le16 sectors_per_fat;		/* zero */
-	le16 sectors_per_track;		/* irrelevant */
-	le16 heads;			/* irrelevant */
-	le32 hidden_sectors;		/* zero */
-	le32 large_sectors;		/* zero */
-} __attribute__ ((__packed__)) BIOS_PARAMETER_BLOCK;
-
-/*
- * NTFS boot sector structure.
- */
-typedef struct {
-	u8  jump[3];			/* Irrelevant (jump to boot up code).*/
-	le64 oem_id;			/* Magic "NTFS    ". */
-	BIOS_PARAMETER_BLOCK bpb;	/* See BIOS_PARAMETER_BLOCK. */
-	u8  unused[4];			/* zero, NTFS diskedit.exe states that
-					   this is actually:
-						__u8 physical_drive;	// 0x80
-						__u8 current_head;	// zero
-						__u8 extended_boot_signature;
-									// 0x80
-						__u8 unused;		// zero
-					 */
-/*0x28*/sle64 number_of_sectors;	/* Number of sectors in volume. Gives
-					   maximum volume size of 2^63 sectors.
-					   Assuming standard sector size of 512
-					   bytes, the maximum byte size is
-					   approx. 4.7x10^21 bytes. (-; */
-	sle64 mft_lcn;			/* Cluster location of mft data. */
-	sle64 mftmirr_lcn;		/* Cluster location of copy of mft. */
-	s8  clusters_per_mft_record;	/* Mft record size in clusters. */
-	u8  reserved0[3];		/* zero */
-	s8  clusters_per_index_record;	/* Index block size in clusters. */
-	u8  reserved1[3];		/* zero */
-	le64 volume_serial_number;	/* Irrelevant (serial number). */
-	le32 checksum;			/* Boot sector checksum. */
-/*0x54*/u8  bootstrap[426];		/* Irrelevant (boot up code). */
-	le16 end_of_sector_marker;	/* End of bootsector magic. Always is
-					   0xaa55 in little endian. */
-/* sizeof() = 512 (0x200) bytes */
-} __attribute__ ((__packed__)) NTFS_BOOT_SECTOR;
-
-/*
- * Magic identifiers present at the beginning of all ntfs record containing
- * records (like mft records for example).
- */
-enum {
-	/* Found in $MFT/$DATA. */
-	magic_FILE = cpu_to_le32(0x454c4946), /* Mft entry. */
-	magic_INDX = cpu_to_le32(0x58444e49), /* Index buffer. */
-	magic_HOLE = cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */
-
-	/* Found in $LogFile/$DATA. */
-	magic_RSTR = cpu_to_le32(0x52545352), /* Restart page. */
-	magic_RCRD = cpu_to_le32(0x44524352), /* Log record page. */
-
-	/* Found in $LogFile/$DATA.  (May be found in $MFT/$DATA, also?) */
-	magic_CHKD = cpu_to_le32(0x444b4843), /* Modified by chkdsk. */
-
-	/* Found in all ntfs record containing records. */
-	magic_BAAD = cpu_to_le32(0x44414142), /* Failed multi sector
-						       transfer was detected. */
-	/*
-	 * Found in $LogFile/$DATA when a page is full of 0xff bytes and is
-	 * thus not initialized.  Page must be initialized before using it.
-	 */
-	magic_empty = cpu_to_le32(0xffffffff) /* Record is empty. */
-};
-
-typedef le32 NTFS_RECORD_TYPE;
-
-/*
- * Generic magic comparison macros. Finally found a use for the ## preprocessor
- * operator! (-8
- */
-
-static inline bool __ntfs_is_magic(le32 x, NTFS_RECORD_TYPE r)
-{
-	return (x == r);
-}
-#define ntfs_is_magic(x, m)	__ntfs_is_magic(x, magic_##m)
-
-static inline bool __ntfs_is_magicp(le32 *p, NTFS_RECORD_TYPE r)
-{
-	return (*p == r);
-}
-#define ntfs_is_magicp(p, m)	__ntfs_is_magicp(p, magic_##m)
-
-/*
- * Specialised magic comparison macros for the NTFS_RECORD_TYPEs defined above.
- */
-#define ntfs_is_file_record(x)		( ntfs_is_magic (x, FILE) )
-#define ntfs_is_file_recordp(p)		( ntfs_is_magicp(p, FILE) )
-#define ntfs_is_mft_record(x)		( ntfs_is_file_record (x) )
-#define ntfs_is_mft_recordp(p)		( ntfs_is_file_recordp(p) )
-#define ntfs_is_indx_record(x)		( ntfs_is_magic (x, INDX) )
-#define ntfs_is_indx_recordp(p)		( ntfs_is_magicp(p, INDX) )
-#define ntfs_is_hole_record(x)		( ntfs_is_magic (x, HOLE) )
-#define ntfs_is_hole_recordp(p)		( ntfs_is_magicp(p, HOLE) )
-
-#define ntfs_is_rstr_record(x)		( ntfs_is_magic (x, RSTR) )
-#define ntfs_is_rstr_recordp(p)		( ntfs_is_magicp(p, RSTR) )
-#define ntfs_is_rcrd_record(x)		( ntfs_is_magic (x, RCRD) )
-#define ntfs_is_rcrd_recordp(p)		( ntfs_is_magicp(p, RCRD) )
-
-#define ntfs_is_chkd_record(x)		( ntfs_is_magic (x, CHKD) )
-#define ntfs_is_chkd_recordp(p)		( ntfs_is_magicp(p, CHKD) )
-
-#define ntfs_is_baad_record(x)		( ntfs_is_magic (x, BAAD) )
-#define ntfs_is_baad_recordp(p)		( ntfs_is_magicp(p, BAAD) )
-
-#define ntfs_is_empty_record(x)		( ntfs_is_magic (x, empty) )
-#define ntfs_is_empty_recordp(p)	( ntfs_is_magicp(p, empty) )
-
-/*
- * The Update Sequence Array (usa) is an array of the le16 values which belong
- * to the end of each sector protected by the update sequence record in which
- * this array is contained. Note that the first entry is the Update Sequence
- * Number (usn), a cyclic counter of how many times the protected record has
- * been written to disk. The values 0 and -1 (ie. 0xffff) are not used. All
- * last le16's of each sector have to be equal to the usn (during reading) or
- * are set to it (during writing). If they are not, an incomplete multi sector
- * transfer has occurred when the data was written.
- * The maximum size for the update sequence array is fixed to:
- *	maximum size = usa_ofs + (usa_count * 2) = 510 bytes
- * The 510 bytes comes from the fact that the last le16 in the array has to
- * (obviously) finish before the last le16 of the first 512-byte sector.
- * This formula can be used as a consistency check in that usa_ofs +
- * (usa_count * 2) has to be less than or equal to 510.
- */
-typedef struct {
-	NTFS_RECORD_TYPE magic;	/* A four-byte magic identifying the record
-				   type and/or status. */
-	le16 usa_ofs;		/* Offset to the Update Sequence Array (usa)
-				   from the start of the ntfs record. */
-	le16 usa_count;		/* Number of le16 sized entries in the usa
-				   including the Update Sequence Number (usn),
-				   thus the number of fixups is the usa_count
-				   minus 1. */
-} __attribute__ ((__packed__)) NTFS_RECORD;
-
-/*
- * System files mft record numbers. All these files are always marked as used
- * in the bitmap attribute of the mft; presumably in order to avoid accidental
- * allocation for random other mft records. Also, the sequence number for each
- * of the system files is always equal to their mft record number and it is
- * never modified.
- */
-typedef enum {
-	FILE_MFT       = 0,	/* Master file table (mft). Data attribute
-				   contains the entries and bitmap attribute
-				   records which ones are in use (bit==1). */
-	FILE_MFTMirr   = 1,	/* Mft mirror: copy of first four mft records
-				   in data attribute. If cluster size > 4kiB,
-				   copy of first N mft records, with
-					N = cluster_size / mft_record_size. */
-	FILE_LogFile   = 2,	/* Journalling log in data attribute. */
-	FILE_Volume    = 3,	/* Volume name attribute and volume information
-				   attribute (flags and ntfs version). Windows
-				   refers to this file as volume DASD (Direct
-				   Access Storage Device). */
-	FILE_AttrDef   = 4,	/* Array of attribute definitions in data
-				   attribute. */
-	FILE_root      = 5,	/* Root directory. */
-	FILE_Bitmap    = 6,	/* Allocation bitmap of all clusters (lcns) in
-				   data attribute. */
-	FILE_Boot      = 7,	/* Boot sector (always at cluster 0) in data
-				   attribute. */
-	FILE_BadClus   = 8,	/* Contains all bad clusters in the non-resident
-				   data attribute. */
-	FILE_Secure    = 9,	/* Shared security descriptors in data attribute
-				   and two indexes into the descriptors.
-				   Appeared in Windows 2000. Before that, this
-				   file was named $Quota but was unused. */
-	FILE_UpCase    = 10,	/* Uppercase equivalents of all 65536 Unicode
-				   characters in data attribute. */
-	FILE_Extend    = 11,	/* Directory containing other system files (eg.
-				   $ObjId, $Quota, $Reparse and $UsnJrnl). This
-				   is new to NTFS3.0. */
-	FILE_reserved12 = 12,	/* Reserved for future use (records 12-15). */
-	FILE_reserved13 = 13,
-	FILE_reserved14 = 14,
-	FILE_reserved15 = 15,
-	FILE_first_user = 16,	/* First user file, used as test limit for
-				   whether to allow opening a file or not. */
-} NTFS_SYSTEM_FILES;
-
-/*
- * These are the so far known MFT_RECORD_* flags (16-bit) which contain
- * information about the mft record in which they are present.
- */
-enum {
-	MFT_RECORD_IN_USE	= cpu_to_le16(0x0001),
-	MFT_RECORD_IS_DIRECTORY = cpu_to_le16(0x0002),
-} __attribute__ ((__packed__));
-
-typedef le16 MFT_RECORD_FLAGS;
-
-/*
- * mft references (aka file references or file record segment references) are
- * used whenever a structure needs to refer to a record in the mft.
- *
- * A reference consists of a 48-bit index into the mft and a 16-bit sequence
- * number used to detect stale references.
- *
- * For error reporting purposes we treat the 48-bit index as a signed quantity.
- *
- * The sequence number is a circular counter (skipping 0) describing how many
- * times the referenced mft record has been (re)used. This has to match the
- * sequence number of the mft record being referenced, otherwise the reference
- * is considered stale and removed (FIXME: only ntfsck or the driver itself?).
- *
- * If the sequence number is zero it is assumed that no sequence number
- * consistency checking should be performed.
- *
- * FIXME: Since inodes are 32-bit as of now, the driver needs to always check
- * for high_part being 0 and if not either BUG(), cause a panic() or handle
- * the situation in some other way. This shouldn't be a problem as a volume has
- * to become HUGE in order to need more than 32-bits worth of mft records.
- * Assuming the standard mft record size of 1kb only the records (never mind
- * the non-resident attributes, etc.) would require 4Tb of space on their own
- * for the first 32 bits worth of records. This is only if some strange person
- * doesn't decide to foul play and make the mft sparse which would be a really
- * horrible thing to do as it would trash our current driver implementation. )-:
- * Do I hear screams "we want 64-bit inodes!" ?!? (-;
- *
- * FIXME: The mft zone is defined as the first 12% of the volume. This space is
- * reserved so that the mft can grow contiguously and hence doesn't become
- * fragmented. Volume free space includes the empty part of the mft zone and
- * when the volume's free 88% are used up, the mft zone is shrunk by a factor
- * of 2, thus making more space available for more files/data. This process is
- * repeated every time there is no more free space except for the mft zone until
- * there really is no more free space.
- */
-
-/*
- * Typedef the MFT_REF as a 64-bit value for easier handling.
- * Also define two unpacking macros to get to the reference (MREF) and
- * sequence number (MSEQNO) respectively.
- * The _LE versions are to be applied on little endian MFT_REFs.
- * Note: The _LE versions will return a CPU endian formatted value!
- */
-#define MFT_REF_MASK_CPU 0x0000ffffffffffffULL
-#define MFT_REF_MASK_LE cpu_to_le64(MFT_REF_MASK_CPU)
-
-typedef u64 MFT_REF;
-typedef le64 leMFT_REF;
-
-#define MK_MREF(m, s)	((MFT_REF)(((MFT_REF)(s) << 48) |		\
-					((MFT_REF)(m) & MFT_REF_MASK_CPU)))
-#define MK_LE_MREF(m, s) cpu_to_le64(MK_MREF(m, s))
-
-#define MREF(x)		((unsigned long)((x) & MFT_REF_MASK_CPU))
-#define MSEQNO(x)	((u16)(((x) >> 48) & 0xffff))
-#define MREF_LE(x)	((unsigned long)(le64_to_cpu(x) & MFT_REF_MASK_CPU))
-#define MSEQNO_LE(x)	((u16)((le64_to_cpu(x) >> 48) & 0xffff))
-
-#define IS_ERR_MREF(x)	(((x) & 0x0000800000000000ULL) ? true : false)
-#define ERR_MREF(x)	((u64)((s64)(x)))
-#define MREF_ERR(x)	((int)((s64)(x)))
-
-/*
- * The mft record header present at the beginning of every record in the mft.
- * This is followed by a sequence of variable length attribute records which
- * is terminated by an attribute of type AT_END which is a truncated attribute
- * in that it only consists of the attribute type code AT_END and none of the
- * other members of the attribute structure are present.
- */
-typedef struct {
-/*Ofs*/
-/*  0	NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
-	NTFS_RECORD_TYPE magic;	/* Usually the magic is "FILE". */
-	le16 usa_ofs;		/* See NTFS_RECORD definition above. */
-	le16 usa_count;		/* See NTFS_RECORD definition above. */
-
-/*  8*/	le64 lsn;		/* $LogFile sequence number for this record.
-				   Changed every time the record is modified. */
-/* 16*/	le16 sequence_number;	/* Number of times this mft record has been
-				   reused. (See description for MFT_REF
-				   above.) NOTE: The increment (skipping zero)
-				   is done when the file is deleted. NOTE: If
-				   this is zero it is left zero. */
-/* 18*/	le16 link_count;	/* Number of hard links, i.e. the number of
-				   directory entries referencing this record.
-				   NOTE: Only used in mft base records.
-				   NOTE: When deleting a directory entry we
-				   check the link_count and if it is 1 we
-				   delete the file. Otherwise we delete the
-				   FILE_NAME_ATTR being referenced by the
-				   directory entry from the mft record and
-				   decrement the link_count.
-				   FIXME: Careful with Win32 + DOS names! */
-/* 20*/	le16 attrs_offset;	/* Byte offset to the first attribute in this
-				   mft record from the start of the mft record.
-				   NOTE: Must be aligned to 8-byte boundary. */
-/* 22*/	MFT_RECORD_FLAGS flags;	/* Bit array of MFT_RECORD_FLAGS. When a file
-				   is deleted, the MFT_RECORD_IN_USE flag is
-				   set to zero. */
-/* 24*/	le32 bytes_in_use;	/* Number of bytes used in this mft record.
-				   NOTE: Must be aligned to 8-byte boundary. */
-/* 28*/	le32 bytes_allocated;	/* Number of bytes allocated for this mft
-				   record. This should be equal to the mft
-				   record size. */
-/* 32*/	leMFT_REF base_mft_record;/* This is zero for base mft records.
-				   When it is not zero it is a mft reference
-				   pointing to the base mft record to which
-				   this record belongs (this is then used to
-				   locate the attribute list attribute present
-				   in the base record which describes this
-				   extension record and hence might need
-				   modification when the extension record
-				   itself is modified, also locating the
-				   attribute list also means finding the other
-				   potential extents, belonging to the non-base
-				   mft record). */
-/* 40*/	le16 next_attr_instance;/* The instance number that will be assigned to
-				   the next attribute added to this mft record.
-				   NOTE: Incremented each time after it is used.
-				   NOTE: Every time the mft record is reused
-				   this number is set to zero.  NOTE: The first
-				   instance number is always 0. */
-/* The below fields are specific to NTFS 3.1+ (Windows XP and above): */
-/* 42*/ le16 reserved;		/* Reserved/alignment. */
-/* 44*/ le32 mft_record_number;	/* Number of this mft record. */
-/* sizeof() = 48 bytes */
-/*
- * When (re)using the mft record, we place the update sequence array at this
- * offset, i.e. before we start with the attributes.  This also makes sense,
- * otherwise we could run into problems with the update sequence array
- * containing in itself the last two bytes of a sector which would mean that
- * multi sector transfer protection wouldn't work.  As you can't protect data
- * by overwriting it since you then can't get it back...
- * When reading we obviously use the data from the ntfs record header.
- */
-} __attribute__ ((__packed__)) MFT_RECORD;
-
-/* This is the version without the NTFS 3.1+ specific fields. */
-typedef struct {
-/*Ofs*/
-/*  0	NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
-	NTFS_RECORD_TYPE magic;	/* Usually the magic is "FILE". */
-	le16 usa_ofs;		/* See NTFS_RECORD definition above. */
-	le16 usa_count;		/* See NTFS_RECORD definition above. */
-
-/*  8*/	le64 lsn;		/* $LogFile sequence number for this record.
-				   Changed every time the record is modified. */
-/* 16*/	le16 sequence_number;	/* Number of times this mft record has been
-				   reused. (See description for MFT_REF
-				   above.) NOTE: The increment (skipping zero)
-				   is done when the file is deleted. NOTE: If
-				   this is zero it is left zero. */
-/* 18*/	le16 link_count;	/* Number of hard links, i.e. the number of
-				   directory entries referencing this record.
-				   NOTE: Only used in mft base records.
-				   NOTE: When deleting a directory entry we
-				   check the link_count and if it is 1 we
-				   delete the file. Otherwise we delete the
-				   FILE_NAME_ATTR being referenced by the
-				   directory entry from the mft record and
-				   decrement the link_count.
-				   FIXME: Careful with Win32 + DOS names! */
-/* 20*/	le16 attrs_offset;	/* Byte offset to the first attribute in this
-				   mft record from the start of the mft record.
-				   NOTE: Must be aligned to 8-byte boundary. */
-/* 22*/	MFT_RECORD_FLAGS flags;	/* Bit array of MFT_RECORD_FLAGS. When a file
-				   is deleted, the MFT_RECORD_IN_USE flag is
-				   set to zero. */
-/* 24*/	le32 bytes_in_use;	/* Number of bytes used in this mft record.
-				   NOTE: Must be aligned to 8-byte boundary. */
-/* 28*/	le32 bytes_allocated;	/* Number of bytes allocated for this mft
-				   record. This should be equal to the mft
-				   record size. */
-/* 32*/	leMFT_REF base_mft_record;/* This is zero for base mft records.
-				   When it is not zero it is a mft reference
-				   pointing to the base mft record to which
-				   this record belongs (this is then used to
-				   locate the attribute list attribute present
-				   in the base record which describes this
-				   extension record and hence might need
-				   modification when the extension record
-				   itself is modified, also locating the
-				   attribute list also means finding the other
-				   potential extents, belonging to the non-base
-				   mft record). */
-/* 40*/	le16 next_attr_instance;/* The instance number that will be assigned to
-				   the next attribute added to this mft record.
-				   NOTE: Incremented each time after it is used.
-				   NOTE: Every time the mft record is reused
-				   this number is set to zero.  NOTE: The first
-				   instance number is always 0. */
-/* sizeof() = 42 bytes */
-/*
- * When (re)using the mft record, we place the update sequence array at this
- * offset, i.e. before we start with the attributes.  This also makes sense,
- * otherwise we could run into problems with the update sequence array
- * containing in itself the last two bytes of a sector which would mean that
- * multi sector transfer protection wouldn't work.  As you can't protect data
- * by overwriting it since you then can't get it back...
- * When reading we obviously use the data from the ntfs record header.
- */
-} __attribute__ ((__packed__)) MFT_RECORD_OLD;
-
-/*
- * System defined attributes (32-bit).  Each attribute type has a corresponding
- * attribute name (Unicode string of maximum 64 character length) as described
- * by the attribute definitions present in the data attribute of the $AttrDef
- * system file.  On NTFS 3.0 volumes the names are just as the types are named
- * in the below defines exchanging AT_ for the dollar sign ($).  If that is not
- * a revealing choice of symbol I do not know what is... (-;
- */
-enum {
-	AT_UNUSED			= cpu_to_le32(         0),
-	AT_STANDARD_INFORMATION		= cpu_to_le32(      0x10),
-	AT_ATTRIBUTE_LIST		= cpu_to_le32(      0x20),
-	AT_FILE_NAME			= cpu_to_le32(      0x30),
-	AT_OBJECT_ID			= cpu_to_le32(      0x40),
-	AT_SECURITY_DESCRIPTOR		= cpu_to_le32(      0x50),
-	AT_VOLUME_NAME			= cpu_to_le32(      0x60),
-	AT_VOLUME_INFORMATION		= cpu_to_le32(      0x70),
-	AT_DATA				= cpu_to_le32(      0x80),
-	AT_INDEX_ROOT			= cpu_to_le32(      0x90),
-	AT_INDEX_ALLOCATION		= cpu_to_le32(      0xa0),
-	AT_BITMAP			= cpu_to_le32(      0xb0),
-	AT_REPARSE_POINT		= cpu_to_le32(      0xc0),
-	AT_EA_INFORMATION		= cpu_to_le32(      0xd0),
-	AT_EA				= cpu_to_le32(      0xe0),
-	AT_PROPERTY_SET			= cpu_to_le32(      0xf0),
-	AT_LOGGED_UTILITY_STREAM	= cpu_to_le32(     0x100),
-	AT_FIRST_USER_DEFINED_ATTRIBUTE	= cpu_to_le32(    0x1000),
-	AT_END				= cpu_to_le32(0xffffffff)
-};
-
-typedef le32 ATTR_TYPE;
-
-/*
- * The collation rules for sorting views/indexes/etc (32-bit).
- *
- * COLLATION_BINARY - Collate by binary compare where the first byte is most
- *	significant.
- * COLLATION_UNICODE_STRING - Collate Unicode strings by comparing their binary
- *	Unicode values, except that when a character can be uppercased, the
- *	upper case value collates before the lower case one.
- * COLLATION_FILE_NAME - Collate file names as Unicode strings. The collation
- *	is done very much like COLLATION_UNICODE_STRING. In fact I have no idea
- *	what the difference is. Perhaps the difference is that file names
- *	would treat some special characters in an odd way (see
- *	unistr.c::ntfs_collate_names() and unistr.c::legal_ansi_char_array[]
- *	for what I mean but COLLATION_UNICODE_STRING would not give any special
- *	treatment to any characters at all, but this is speculation.
- * COLLATION_NTOFS_ULONG - Sorting is done according to ascending le32 key
- *	values. E.g. used for $SII index in FILE_Secure, which sorts by
- *	security_id (le32).
- * COLLATION_NTOFS_SID - Sorting is done according to ascending SID values.
- *	E.g. used for $O index in FILE_Extend/$Quota.
- * COLLATION_NTOFS_SECURITY_HASH - Sorting is done first by ascending hash
- *	values and second by ascending security_id values. E.g. used for $SDH
- *	index in FILE_Secure.
- * COLLATION_NTOFS_ULONGS - Sorting is done according to a sequence of ascending
- *	le32 key values. E.g. used for $O index in FILE_Extend/$ObjId, which
- *	sorts by object_id (16-byte), by splitting up the object_id in four
- *	le32 values and using them as individual keys. E.g. take the following
- *	two security_ids, stored as follows on disk:
- *		1st: a1 61 65 b7 65 7b d4 11 9e 3d 00 e0 81 10 42 59
- *		2nd: 38 14 37 d2 d2 f3 d4 11 a5 21 c8 6b 79 b1 97 45
- *	To compare them, they are split into four le32 values each, like so:
- *		1st: 0xb76561a1 0x11d47b65 0xe0003d9e 0x59421081
- *		2nd: 0xd2371438 0x11d4f3d2 0x6bc821a5 0x4597b179
- *	Now, it is apparent why the 2nd object_id collates after the 1st: the
- *	first le32 value of the 1st object_id is less than the first le32 of
- *	the 2nd object_id. If the first le32 values of both object_ids were
- *	equal then the second le32 values would be compared, etc.
- */
-enum {
-	COLLATION_BINARY		= cpu_to_le32(0x00),
-	COLLATION_FILE_NAME		= cpu_to_le32(0x01),
-	COLLATION_UNICODE_STRING	= cpu_to_le32(0x02),
-	COLLATION_NTOFS_ULONG		= cpu_to_le32(0x10),
-	COLLATION_NTOFS_SID		= cpu_to_le32(0x11),
-	COLLATION_NTOFS_SECURITY_HASH	= cpu_to_le32(0x12),
-	COLLATION_NTOFS_ULONGS		= cpu_to_le32(0x13),
-};
-
-typedef le32 COLLATION_RULE;
-
-/*
- * The flags (32-bit) describing attribute properties in the attribute
- * definition structure.  FIXME: This information is based on Regis's
- * information and, according to him, it is not certain and probably
- * incomplete.  The INDEXABLE flag is fairly certainly correct as only the file
- * name attribute has this flag set and this is the only attribute indexed in
- * NT4.
- */
-enum {
-	ATTR_DEF_INDEXABLE	= cpu_to_le32(0x02), /* Attribute can be
-					indexed. */
-	ATTR_DEF_MULTIPLE	= cpu_to_le32(0x04), /* Attribute type
-					can be present multiple times in the
-					mft records of an inode. */
-	ATTR_DEF_NOT_ZERO	= cpu_to_le32(0x08), /* Attribute value
-					must contain at least one non-zero
-					byte. */
-	ATTR_DEF_INDEXED_UNIQUE	= cpu_to_le32(0x10), /* Attribute must be
-					indexed and the attribute value must be
-					unique for the attribute type in all of
-					the mft records of an inode. */
-	ATTR_DEF_NAMED_UNIQUE	= cpu_to_le32(0x20), /* Attribute must be
-					named and the name must be unique for
-					the attribute type in all of the mft
-					records of an inode. */
-	ATTR_DEF_RESIDENT	= cpu_to_le32(0x40), /* Attribute must be
-					resident. */
-	ATTR_DEF_ALWAYS_LOG	= cpu_to_le32(0x80), /* Always log
-					modifications to this attribute,
-					regardless of whether it is resident or
-					non-resident.  Without this, only log
-					modifications if the attribute is
-					resident. */
-};
-
-typedef le32 ATTR_DEF_FLAGS;
-
-/*
- * The data attribute of FILE_AttrDef contains a sequence of attribute
- * definitions for the NTFS volume. With this, it is supposed to be safe for an
- * older NTFS driver to mount a volume containing a newer NTFS version without
- * damaging it (that's the theory. In practice it's: not damaging it too much).
- * Entries are sorted by attribute type. The flags describe whether the
- * attribute can be resident/non-resident and possibly other things, but the
- * actual bits are unknown.
- */
-typedef struct {
-/*hex ofs*/
-/*  0*/	ntfschar name[0x40];		/* Unicode name of the attribute. Zero
-					   terminated. */
-/* 80*/	ATTR_TYPE type;			/* Type of the attribute. */
-/* 84*/	le32 display_rule;		/* Default display rule.
-					   FIXME: What does it mean? (AIA) */
-/* 88*/ COLLATION_RULE collation_rule;	/* Default collation rule. */
-/* 8c*/	ATTR_DEF_FLAGS flags;		/* Flags describing the attribute. */
-/* 90*/	sle64 min_size;			/* Optional minimum attribute size. */
-/* 98*/	sle64 max_size;			/* Maximum size of attribute. */
-/* sizeof() = 0xa0 or 160 bytes */
-} __attribute__ ((__packed__)) ATTR_DEF;
-
-/*
- * Attribute flags (16-bit).
- */
-enum {
-	ATTR_IS_COMPRESSED    = cpu_to_le16(0x0001),
-	ATTR_COMPRESSION_MASK = cpu_to_le16(0x00ff), /* Compression method
-							      mask.  Also, first
-							      illegal value. */
-	ATTR_IS_ENCRYPTED     = cpu_to_le16(0x4000),
-	ATTR_IS_SPARSE	      = cpu_to_le16(0x8000),
-} __attribute__ ((__packed__));
-
-typedef le16 ATTR_FLAGS;
-
-/*
- * Attribute compression.
- *
- * Only the data attribute is ever compressed in the current ntfs driver in
- * Windows. Further, compression is only applied when the data attribute is
- * non-resident. Finally, to use compression, the maximum allowed cluster size
- * on a volume is 4kib.
- *
- * The compression method is based on independently compressing blocks of X
- * clusters, where X is determined from the compression_unit value found in the
- * non-resident attribute record header (more precisely: X = 2^compression_unit
- * clusters). On Windows NT/2k, X always is 16 clusters (compression_unit = 4).
- *
- * There are three different cases of how a compression block of X clusters
- * can be stored:
- *
- *   1) The data in the block is all zero (a sparse block):
- *	  This is stored as a sparse block in the runlist, i.e. the runlist
- *	  entry has length = X and lcn = -1. The mapping pairs array actually
- *	  uses a delta_lcn value length of 0, i.e. delta_lcn is not present at
- *	  all, which is then interpreted by the driver as lcn = -1.
- *	  NOTE: Even uncompressed files can be sparse on NTFS 3.0 volumes, then
- *	  the same principles apply as above, except that the length is not
- *	  restricted to being any particular value.
- *
- *   2) The data in the block is not compressed:
- *	  This happens when compression doesn't reduce the size of the block
- *	  in clusters. I.e. if compression has a small effect so that the
- *	  compressed data still occupies X clusters, then the uncompressed data
- *	  is stored in the block.
- *	  This case is recognised by the fact that the runlist entry has
- *	  length = X and lcn >= 0. The mapping pairs array stores this as
- *	  normal with a run length of X and some specific delta_lcn, i.e.
- *	  delta_lcn has to be present.
- *
- *   3) The data in the block is compressed:
- *	  The common case. This case is recognised by the fact that the run
- *	  list entry has length L < X and lcn >= 0. The mapping pairs array
- *	  stores this as normal with a run length of X and some specific
- *	  delta_lcn, i.e. delta_lcn has to be present. This runlist entry is
- *	  immediately followed by a sparse entry with length = X - L and
- *	  lcn = -1. The latter entry is to make up the vcn counting to the
- *	  full compression block size X.
- *
- * In fact, life is more complicated because adjacent entries of the same type
- * can be coalesced. This means that one has to keep track of the number of
- * clusters handled and work on a basis of X clusters at a time being one
- * block. An example: if length L > X this means that this particular runlist
- * entry contains a block of length X and part of one or more blocks of length
- * L - X. Another example: if length L < X, this does not necessarily mean that
- * the block is compressed as it might be that the lcn changes inside the block
- * and hence the following runlist entry describes the continuation of the
- * potentially compressed block. The block would be compressed if the
- * following runlist entry describes at least X - L sparse clusters, thus
- * making up the compression block length as described in point 3 above. (Of
- * course, there can be several runlist entries with small lengths so that the
- * sparse entry does not follow the first data containing entry with
- * length < X.)
- *
- * NOTE: At the end of the compressed attribute value, there most likely is not
- * just the right amount of data to make up a compression block, thus this data
- * is not even attempted to be compressed. It is just stored as is, unless
- * the number of clusters it occupies is reduced when compressed in which case
- * it is stored as a compressed compression block, complete with sparse
- * clusters at the end.
- */
-
-/*
- * Flags of resident attributes (8-bit).
- */
-enum {
-	RESIDENT_ATTR_IS_INDEXED = 0x01, /* Attribute is referenced in an index
-					    (has implications for deleting and
-					    modifying the attribute). */
-} __attribute__ ((__packed__));
-
-typedef u8 RESIDENT_ATTR_FLAGS;
-
-/*
- * Attribute record header. Always aligned to 8-byte boundary.
- */
-typedef struct {
-/*Ofs*/
-/*  0*/	ATTR_TYPE type;		/* The (32-bit) type of the attribute. */
-/*  4*/	le32 length;		/* Byte size of the resident part of the
-				   attribute (aligned to 8-byte boundary).
-				   Used to get to the next attribute. */
-/*  8*/	u8 non_resident;	/* If 0, attribute is resident.
-				   If 1, attribute is non-resident. */
-/*  9*/	u8 name_length;		/* Unicode character size of name of attribute.
-				   0 if unnamed. */
-/* 10*/	le16 name_offset;	/* If name_length != 0, the byte offset to the
-				   beginning of the name from the attribute
-				   record. Note that the name is stored as a
-				   Unicode string. When creating, place offset
-				   just at the end of the record header. Then,
-				   follow with attribute value or mapping pairs
-				   array, resident and non-resident attributes
-				   respectively, aligning to an 8-byte
-				   boundary. */
-/* 12*/	ATTR_FLAGS flags;	/* Flags describing the attribute. */
-/* 14*/	le16 instance;		/* The instance of this attribute record. This
-				   number is unique within this mft record (see
-				   MFT_RECORD/next_attribute_instance notes in
-				   mft.h for more details). */
-/* 16*/	union {
-		/* Resident attributes. */
-		struct {
-/* 16 */		le32 value_length;/* Byte size of attribute value. */
-/* 20 */		le16 value_offset;/* Byte offset of the attribute
-					     value from the start of the
-					     attribute record. When creating,
-					     align to 8-byte boundary if we
-					     have a name present as this might
-					     not have a length of a multiple
-					     of 8-bytes. */
-/* 22 */		RESIDENT_ATTR_FLAGS flags; /* See above. */
-/* 23 */		s8 reserved;	  /* Reserved/alignment to 8-byte
-					     boundary. */
-		} __attribute__ ((__packed__)) resident;
-		/* Non-resident attributes. */
-		struct {
-/* 16*/			leVCN lowest_vcn;/* Lowest valid virtual cluster number
-				for this portion of the attribute value or
-				0 if this is the only extent (usually the
-				case). - Only when an attribute list is used
-				does lowest_vcn != 0 ever occur. */
-/* 24*/			leVCN highest_vcn;/* Highest valid vcn of this extent of
-				the attribute value. - Usually there is only one
-				portion, so this usually equals the attribute
-				value size in clusters minus 1. Can be -1 for
-				zero length files. Can be 0 for "single extent"
-				attributes. */
-/* 32*/			le16 mapping_pairs_offset; /* Byte offset from the
-				beginning of the structure to the mapping pairs
-				array which contains the mappings between the
-				vcns and the logical cluster numbers (lcns).
-				When creating, place this at the end of this
-				record header aligned to 8-byte boundary. */
-/* 34*/			u8 compression_unit; /* The compression unit expressed
-				as the log to the base 2 of the number of
-				clusters in a compression unit.  0 means not
-				compressed.  (This effectively limits the
-				compression unit size to be a power of two
-				clusters.)  WinNT4 only uses a value of 4.
-				Sparse files have this set to 0 on XPSP2. */
-/* 35*/			u8 reserved[5];		/* Align to 8-byte boundary. */
-/* The sizes below are only used when lowest_vcn is zero, as otherwise it would
-   be difficult to keep them up-to-date.*/
-/* 40*/			sle64 allocated_size;	/* Byte size of disk space
-				allocated to hold the attribute value. Always
-				is a multiple of the cluster size. When a file
-				is compressed, this field is a multiple of the
-				compression block size (2^compression_unit) and
-				it represents the logically allocated space
-				rather than the actual on disk usage. For this
-				use the compressed_size (see below). */
-/* 48*/			sle64 data_size;	/* Byte size of the attribute
-				value. Can be larger than allocated_size if
-				attribute value is compressed or sparse. */
-/* 56*/			sle64 initialized_size;	/* Byte size of initialized
-				portion of the attribute value. Usually equals
-				data_size. */
-/* sizeof(uncompressed attr) = 64*/
-/* 64*/			sle64 compressed_size;	/* Byte size of the attribute
-				value after compression.  Only present when
-				compressed or sparse.  Always is a multiple of
-				the cluster size.  Represents the actual amount
-				of disk space being used on the disk. */
-/* sizeof(compressed attr) = 72*/
-		} __attribute__ ((__packed__)) non_resident;
-	} __attribute__ ((__packed__)) data;
-} __attribute__ ((__packed__)) ATTR_RECORD;
-
-typedef ATTR_RECORD ATTR_REC;
-
-/*
- * File attribute flags (32-bit) appearing in the file_attributes fields of the
- * STANDARD_INFORMATION attribute of MFT_RECORDs and the FILENAME_ATTR
- * attributes of MFT_RECORDs and directory index entries.
- *
- * All of the below flags appear in the directory index entries but only some
- * appear in the STANDARD_INFORMATION attribute whilst only some others appear
- * in the FILENAME_ATTR attribute of MFT_RECORDs.  Unless otherwise stated the
- * flags appear in all of the above.
- */
-enum {
-	FILE_ATTR_READONLY		= cpu_to_le32(0x00000001),
-	FILE_ATTR_HIDDEN		= cpu_to_le32(0x00000002),
-	FILE_ATTR_SYSTEM		= cpu_to_le32(0x00000004),
-	/* Old DOS volid. Unused in NT.	= cpu_to_le32(0x00000008), */
-
-	FILE_ATTR_DIRECTORY		= cpu_to_le32(0x00000010),
-	/* Note, FILE_ATTR_DIRECTORY is not considered valid in NT.  It is
-	   reserved for the DOS SUBDIRECTORY flag. */
-	FILE_ATTR_ARCHIVE		= cpu_to_le32(0x00000020),
-	FILE_ATTR_DEVICE		= cpu_to_le32(0x00000040),
-	FILE_ATTR_NORMAL		= cpu_to_le32(0x00000080),
-
-	FILE_ATTR_TEMPORARY		= cpu_to_le32(0x00000100),
-	FILE_ATTR_SPARSE_FILE		= cpu_to_le32(0x00000200),
-	FILE_ATTR_REPARSE_POINT		= cpu_to_le32(0x00000400),
-	FILE_ATTR_COMPRESSED		= cpu_to_le32(0x00000800),
-
-	FILE_ATTR_OFFLINE		= cpu_to_le32(0x00001000),
-	FILE_ATTR_NOT_CONTENT_INDEXED	= cpu_to_le32(0x00002000),
-	FILE_ATTR_ENCRYPTED		= cpu_to_le32(0x00004000),
-
-	FILE_ATTR_VALID_FLAGS		= cpu_to_le32(0x00007fb7),
-	/* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the
-	   FILE_ATTR_DEVICE and preserves everything else.  This mask is used
-	   to obtain all flags that are valid for reading. */
-	FILE_ATTR_VALID_SET_FLAGS	= cpu_to_le32(0x000031a7),
-	/* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the
-	   F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT,
-	   F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest.  This mask
-	   is used to obtain all flags that are valid for setting. */
-	/*
-	 * The flag FILE_ATTR_DUP_FILENAME_INDEX_PRESENT is present in all
-	 * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION
-	 * attribute of an mft record.
-	 */
-	FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT	= cpu_to_le32(0x10000000),
-	/* Note, this is a copy of the corresponding bit from the mft record,
-	   telling us whether this is a directory or not, i.e. whether it has
-	   an index root attribute or not. */
-	FILE_ATTR_DUP_VIEW_INDEX_PRESENT	= cpu_to_le32(0x20000000),
-	/* Note, this is a copy of the corresponding bit from the mft record,
-	   telling us whether this file has a view index present (eg. object id
-	   index, quota index, one of the security indexes or the encrypting
-	   filesystem related indexes). */
-};
-
-typedef le32 FILE_ATTR_FLAGS;
-
-/*
- * NOTE on times in NTFS: All times are in MS standard time format, i.e. they
- * are the number of 100-nanosecond intervals since 1st January 1601, 00:00:00
- * universal coordinated time (UTC). (In Linux time starts 1st January 1970,
- * 00:00:00 UTC and is stored as the number of 1-second intervals since then.)
- */
-
-/*
- * Attribute: Standard information (0x10).
- *
- * NOTE: Always resident.
- * NOTE: Present in all base file records on a volume.
- * NOTE: There is conflicting information about the meaning of each of the time
- *	 fields but the meaning as defined below has been verified to be
- *	 correct by practical experimentation on Windows NT4 SP6a and is hence
- *	 assumed to be the one and only correct interpretation.
- */
-typedef struct {
-/*Ofs*/
-/*  0*/	sle64 creation_time;		/* Time file was created. Updated when
-					   a filename is changed(?). */
-/*  8*/	sle64 last_data_change_time;	/* Time the data attribute was last
-					   modified. */
-/* 16*/	sle64 last_mft_change_time;	/* Time this mft record was last
-					   modified. */
-/* 24*/	sle64 last_access_time;		/* Approximate time when the file was
-					   last accessed (obviously this is not
-					   updated on read-only volumes). In
-					   Windows this is only updated when
-					   accessed if some time delta has
-					   passed since the last update. Also,
-					   last access time updates can be
-					   disabled altogether for speed. */
-/* 32*/	FILE_ATTR_FLAGS file_attributes; /* Flags describing the file. */
-/* 36*/	union {
-	/* NTFS 1.2 */
-		struct {
-		/* 36*/	u8 reserved12[12];	/* Reserved/alignment to 8-byte
-						   boundary. */
-		} __attribute__ ((__packed__)) v1;
-	/* sizeof() = 48 bytes */
-	/* NTFS 3.x */
-		struct {
-/*
- * If a volume has been upgraded from a previous NTFS version, then these
- * fields are present only if the file has been accessed since the upgrade.
- * Recognize the difference by comparing the length of the resident attribute
- * value. If it is 48, then the following fields are missing. If it is 72 then
- * the fields are present. Maybe just check like this:
- *	if (resident.ValueLength < sizeof(STANDARD_INFORMATION)) {
- *		Assume NTFS 1.2- format.
- *		If (volume version is 3.x)
- *			Upgrade attribute to NTFS 3.x format.
- *		else
- *			Use NTFS 1.2- format for access.
- *	} else
- *		Use NTFS 3.x format for access.
- * Only problem is that it might be legal to set the length of the value to
- * arbitrarily large values thus spoiling this check. - But chkdsk probably
- * views that as a corruption, assuming that it behaves like this for all
- * attributes.
- */
-		/* 36*/	le32 maximum_versions;	/* Maximum allowed versions for
-				file. Zero if version numbering is disabled. */
-		/* 40*/	le32 version_number;	/* This file's version (if any).
-				Set to zero if maximum_versions is zero. */
-		/* 44*/	le32 class_id;		/* Class id from bidirectional
-				class id index (?). */
-		/* 48*/	le32 owner_id;		/* Owner_id of the user owning
-				the file. Translate via $Q index in FILE_Extend
-				/$Quota to the quota control entry for the user
-				owning the file. Zero if quotas are disabled. */
-		/* 52*/	le32 security_id;	/* Security_id for the file.
-				Translate via $SII index and $SDS data stream
-				in FILE_Secure to the security descriptor. */
-		/* 56*/	le64 quota_charged;	/* Byte size of the charge to
-				the quota for all streams of the file. Note: Is
-				zero if quotas are disabled. */
-		/* 64*/	leUSN usn;		/* Last update sequence number
-				of the file.  This is a direct index into the
-				transaction log file ($UsnJrnl).  It is zero if
-				the usn journal is disabled or this file has
-				not been subject to logging yet.  See usnjrnl.h
-				for details. */
-		} __attribute__ ((__packed__)) v3;
-	/* sizeof() = 72 bytes (NTFS 3.x) */
-	} __attribute__ ((__packed__)) ver;
-} __attribute__ ((__packed__)) STANDARD_INFORMATION;
-
-/*
- * Attribute: Attribute list (0x20).
- *
- * - Can be either resident or non-resident.
- * - Value consists of a sequence of variable length, 8-byte aligned,
- * ATTR_LIST_ENTRY records.
- * - The list is not terminated by anything at all! The only way to know when
- * the end is reached is to keep track of the current offset and compare it to
- * the attribute value size.
- * - The attribute list attribute contains one entry for each attribute of
- * the file in which the list is located, except for the list attribute
- * itself. The list is sorted: first by attribute type, second by attribute
- * name (if present), third by instance number. The extents of one
- * non-resident attribute (if present) immediately follow after the initial
- * extent. They are ordered by lowest_vcn and have their instace set to zero.
- * It is not allowed to have two attributes with all sorting keys equal.
- * - Further restrictions:
- *	- If not resident, the vcn to lcn mapping array has to fit inside the
- *	  base mft record.
- *	- The attribute list attribute value has a maximum size of 256kb. This
- *	  is imposed by the Windows cache manager.
- * - Attribute lists are only used when the attributes of mft record do not
- * fit inside the mft record despite all attributes (that can be made
- * non-resident) having been made non-resident. This can happen e.g. when:
- *	- File has a large number of hard links (lots of file name
- *	  attributes present).
- *	- The mapping pairs array of some non-resident attribute becomes so
- *	  large due to fragmentation that it overflows the mft record.
- *	- The security descriptor is very complex (not applicable to
- *	  NTFS 3.0 volumes).
- *	- There are many named streams.
- */
-typedef struct {
-/*Ofs*/
-/*  0*/	ATTR_TYPE type;		/* Type of referenced attribute. */
-/*  4*/	le16 length;		/* Byte size of this entry (8-byte aligned). */
-/*  6*/	u8 name_length;		/* Size in Unicode chars of the name of the
-				   attribute or 0 if unnamed. */
-/*  7*/	u8 name_offset;		/* Byte offset to beginning of attribute name
-				   (always set this to where the name would
-				   start even if unnamed). */
-/*  8*/	leVCN lowest_vcn;	/* Lowest virtual cluster number of this portion
-				   of the attribute value. This is usually 0. It
-				   is non-zero for the case where one attribute
-				   does not fit into one mft record and thus
-				   several mft records are allocated to hold
-				   this attribute. In the latter case, each mft
-				   record holds one extent of the attribute and
-				   there is one attribute list entry for each
-				   extent. NOTE: This is DEFINITELY a signed
-				   value! The windows driver uses cmp, followed
-				   by jg when comparing this, thus it treats it
-				   as signed. */
-/* 16*/	leMFT_REF mft_reference;/* The reference of the mft record holding
-				   the ATTR_RECORD for this portion of the
-				   attribute value. */
-/* 24*/	le16 instance;		/* If lowest_vcn = 0, the instance of the
-				   attribute being referenced; otherwise 0. */
-/* 26*/	ntfschar name[0];	/* Use when creating only. When reading use
-				   name_offset to determine the location of the
-				   name. */
-/* sizeof() = 26 + (attribute_name_length * 2) bytes */
-} __attribute__ ((__packed__)) ATTR_LIST_ENTRY;
-
-/*
- * The maximum allowed length for a file name.
- */
-#define MAXIMUM_FILE_NAME_LENGTH	255
-
-/*
- * Possible namespaces for filenames in ntfs (8-bit).
- */
-enum {
-	FILE_NAME_POSIX		= 0x00,
-	/* This is the largest namespace. It is case sensitive and allows all
-	   Unicode characters except for: '\0' and '/'.  Beware that in
-	   WinNT/2k/2003 by default files which eg have the same name except
-	   for their case will not be distinguished by the standard utilities
-	   and thus a "del filename" will delete both "filename" and "fileName"
-	   without warning.  However if for example Services For Unix (SFU) are
-	   installed and the case sensitive option was enabled at installation
-	   time, then you can create/access/delete such files.
-	   Note that even SFU places restrictions on the filenames beyond the
-	   '\0' and '/' and in particular the following set of characters is
-	   not allowed: '"', '/', '<', '>', '\'.  All other characters,
-	   including the ones no allowed in WIN32 namespace are allowed.
-	   Tested with SFU 3.5 (this is now free) running on Windows XP. */
-	FILE_NAME_WIN32		= 0x01,
-	/* The standard WinNT/2k NTFS long filenames. Case insensitive.  All
-	   Unicode chars except: '\0', '"', '*', '/', ':', '<', '>', '?', '\',
-	   and '|'.  Further, names cannot end with a '.' or a space. */
-	FILE_NAME_DOS		= 0x02,
-	/* The standard DOS filenames (8.3 format). Uppercase only.  All 8-bit
-	   characters greater space, except: '"', '*', '+', ',', '/', ':', ';',
-	   '<', '=', '>', '?', and '\'. */
-	FILE_NAME_WIN32_AND_DOS	= 0x03,
-	/* 3 means that both the Win32 and the DOS filenames are identical and
-	   hence have been saved in this single filename record. */
-} __attribute__ ((__packed__));
-
-typedef u8 FILE_NAME_TYPE_FLAGS;
-
-/*
- * Attribute: Filename (0x30).
- *
- * NOTE: Always resident.
- * NOTE: All fields, except the parent_directory, are only updated when the
- *	 filename is changed. Until then, they just become out of sync with
- *	 reality and the more up to date values are present in the standard
- *	 information attribute.
- * NOTE: There is conflicting information about the meaning of each of the time
- *	 fields but the meaning as defined below has been verified to be
- *	 correct by practical experimentation on Windows NT4 SP6a and is hence
- *	 assumed to be the one and only correct interpretation.
- */
-typedef struct {
-/*hex ofs*/
-/*  0*/	leMFT_REF parent_directory;	/* Directory this filename is
-					   referenced from. */
-/*  8*/	sle64 creation_time;		/* Time file was created. */
-/* 10*/	sle64 last_data_change_time;	/* Time the data attribute was last
-					   modified. */
-/* 18*/	sle64 last_mft_change_time;	/* Time this mft record was last
-					   modified. */
-/* 20*/	sle64 last_access_time;		/* Time this mft record was last
-					   accessed. */
-/* 28*/	sle64 allocated_size;		/* Byte size of on-disk allocated space
-					   for the unnamed data attribute.  So
-					   for normal $DATA, this is the
-					   allocated_size from the unnamed
-					   $DATA attribute and for compressed
-					   and/or sparse $DATA, this is the
-					   compressed_size from the unnamed
-					   $DATA attribute.  For a directory or
-					   other inode without an unnamed $DATA
-					   attribute, this is always 0.  NOTE:
-					   This is a multiple of the cluster
-					   size. */
-/* 30*/	sle64 data_size;		/* Byte size of actual data in unnamed
-					   data attribute.  For a directory or
-					   other inode without an unnamed $DATA
-					   attribute, this is always 0. */
-/* 38*/	FILE_ATTR_FLAGS file_attributes;	/* Flags describing the file. */
-/* 3c*/	union {
-	/* 3c*/	struct {
-		/* 3c*/	le16 packed_ea_size;	/* Size of the buffer needed to
-						   pack the extended attributes
-						   (EAs), if such are present.*/
-		/* 3e*/	le16 reserved;		/* Reserved for alignment. */
-		} __attribute__ ((__packed__)) ea;
-	/* 3c*/	struct {
-		/* 3c*/	le32 reparse_point_tag;	/* Type of reparse point,
-						   present only in reparse
-						   points and only if there are
-						   no EAs. */
-		} __attribute__ ((__packed__)) rp;
-	} __attribute__ ((__packed__)) type;
-/* 40*/	u8 file_name_length;			/* Length of file name in
-						   (Unicode) characters. */
-/* 41*/	FILE_NAME_TYPE_FLAGS file_name_type;	/* Namespace of the file name.*/
-/* 42*/	ntfschar file_name[0];			/* File name in Unicode. */
-} __attribute__ ((__packed__)) FILE_NAME_ATTR;
-
-/*
- * GUID structures store globally unique identifiers (GUID). A GUID is a
- * 128-bit value consisting of one group of eight hexadecimal digits, followed
- * by three groups of four hexadecimal digits each, followed by one group of
- * twelve hexadecimal digits. GUIDs are Microsoft's implementation of the
- * distributed computing environment (DCE) universally unique identifier (UUID).
- * Example of a GUID:
- *	1F010768-5A73-BC91-0010A52216A7
- */
-typedef struct {
-	le32 data1;	/* The first eight hexadecimal digits of the GUID. */
-	le16 data2;	/* The first group of four hexadecimal digits. */
-	le16 data3;	/* The second group of four hexadecimal digits. */
-	u8 data4[8];	/* The first two bytes are the third group of four
-			   hexadecimal digits. The remaining six bytes are the
-			   final 12 hexadecimal digits. */
-} __attribute__ ((__packed__)) GUID;
-
-/*
- * FILE_Extend/$ObjId contains an index named $O. This index contains all
- * object_ids present on the volume as the index keys and the corresponding
- * mft_record numbers as the index entry data parts. The data part (defined
- * below) also contains three other object_ids:
- *	birth_volume_id - object_id of FILE_Volume on which the file was first
- *			  created. Optional (i.e. can be zero).
- *	birth_object_id - object_id of file when it was first created. Usually
- *			  equals the object_id. Optional (i.e. can be zero).
- *	domain_id	- Reserved (always zero).
- */
-typedef struct {
-	leMFT_REF mft_reference;/* Mft record containing the object_id in
-				   the index entry key. */
-	union {
-		struct {
-			GUID birth_volume_id;
-			GUID birth_object_id;
-			GUID domain_id;
-		} __attribute__ ((__packed__)) origin;
-		u8 extended_info[48];
-	} __attribute__ ((__packed__)) opt;
-} __attribute__ ((__packed__)) OBJ_ID_INDEX_DATA;
-
-/*
- * Attribute: Object id (NTFS 3.0+) (0x40).
- *
- * NOTE: Always resident.
- */
-typedef struct {
-	GUID object_id;				/* Unique id assigned to the
-						   file.*/
-	/* The following fields are optional. The attribute value size is 16
-	   bytes, i.e. sizeof(GUID), if these are not present at all. Note,
-	   the entries can be present but one or more (or all) can be zero
-	   meaning that that particular value(s) is(are) not defined. */
-	union {
-		struct {
-			GUID birth_volume_id;	/* Unique id of volume on which
-						   the file was first created.*/
-			GUID birth_object_id;	/* Unique id of file when it was
-						   first created. */
-			GUID domain_id;		/* Reserved, zero. */
-		} __attribute__ ((__packed__)) origin;
-		u8 extended_info[48];
-	} __attribute__ ((__packed__)) opt;
-} __attribute__ ((__packed__)) OBJECT_ID_ATTR;
-
-/*
- * The pre-defined IDENTIFIER_AUTHORITIES used as SID_IDENTIFIER_AUTHORITY in
- * the SID structure (see below).
- */
-//typedef enum {					/* SID string prefix. */
-//	SECURITY_NULL_SID_AUTHORITY	= {0, 0, 0, 0, 0, 0},	/* S-1-0 */
-//	SECURITY_WORLD_SID_AUTHORITY	= {0, 0, 0, 0, 0, 1},	/* S-1-1 */
-//	SECURITY_LOCAL_SID_AUTHORITY	= {0, 0, 0, 0, 0, 2},	/* S-1-2 */
-//	SECURITY_CREATOR_SID_AUTHORITY	= {0, 0, 0, 0, 0, 3},	/* S-1-3 */
-//	SECURITY_NON_UNIQUE_AUTHORITY	= {0, 0, 0, 0, 0, 4},	/* S-1-4 */
-//	SECURITY_NT_SID_AUTHORITY	= {0, 0, 0, 0, 0, 5},	/* S-1-5 */
-//} IDENTIFIER_AUTHORITIES;
-
-/*
- * These relative identifiers (RIDs) are used with the above identifier
- * authorities to make up universal well-known SIDs.
- *
- * Note: The relative identifier (RID) refers to the portion of a SID, which
- * identifies a user or group in relation to the authority that issued the SID.
- * For example, the universal well-known SID Creator Owner ID (S-1-3-0) is
- * made up of the identifier authority SECURITY_CREATOR_SID_AUTHORITY (3) and
- * the relative identifier SECURITY_CREATOR_OWNER_RID (0).
- */
-typedef enum {					/* Identifier authority. */
-	SECURITY_NULL_RID		  = 0,	/* S-1-0 */
-	SECURITY_WORLD_RID		  = 0,	/* S-1-1 */
-	SECURITY_LOCAL_RID		  = 0,	/* S-1-2 */
-
-	SECURITY_CREATOR_OWNER_RID	  = 0,	/* S-1-3 */
-	SECURITY_CREATOR_GROUP_RID	  = 1,	/* S-1-3 */
-
-	SECURITY_CREATOR_OWNER_SERVER_RID = 2,	/* S-1-3 */
-	SECURITY_CREATOR_GROUP_SERVER_RID = 3,	/* S-1-3 */
-
-	SECURITY_DIALUP_RID		  = 1,
-	SECURITY_NETWORK_RID		  = 2,
-	SECURITY_BATCH_RID		  = 3,
-	SECURITY_INTERACTIVE_RID	  = 4,
-	SECURITY_SERVICE_RID		  = 6,
-	SECURITY_ANONYMOUS_LOGON_RID	  = 7,
-	SECURITY_PROXY_RID		  = 8,
-	SECURITY_ENTERPRISE_CONTROLLERS_RID=9,
-	SECURITY_SERVER_LOGON_RID	  = 9,
-	SECURITY_PRINCIPAL_SELF_RID	  = 0xa,
-	SECURITY_AUTHENTICATED_USER_RID	  = 0xb,
-	SECURITY_RESTRICTED_CODE_RID	  = 0xc,
-	SECURITY_TERMINAL_SERVER_RID	  = 0xd,
-
-	SECURITY_LOGON_IDS_RID		  = 5,
-	SECURITY_LOGON_IDS_RID_COUNT	  = 3,
-
-	SECURITY_LOCAL_SYSTEM_RID	  = 0x12,
-
-	SECURITY_NT_NON_UNIQUE		  = 0x15,
-
-	SECURITY_BUILTIN_DOMAIN_RID	  = 0x20,
-
-	/*
-	 * Well-known domain relative sub-authority values (RIDs).
-	 */
-
-	/* Users. */
-	DOMAIN_USER_RID_ADMIN		  = 0x1f4,
-	DOMAIN_USER_RID_GUEST		  = 0x1f5,
-	DOMAIN_USER_RID_KRBTGT		  = 0x1f6,
-
-	/* Groups. */
-	DOMAIN_GROUP_RID_ADMINS		  = 0x200,
-	DOMAIN_GROUP_RID_USERS		  = 0x201,
-	DOMAIN_GROUP_RID_GUESTS		  = 0x202,
-	DOMAIN_GROUP_RID_COMPUTERS	  = 0x203,
-	DOMAIN_GROUP_RID_CONTROLLERS	  = 0x204,
-	DOMAIN_GROUP_RID_CERT_ADMINS	  = 0x205,
-	DOMAIN_GROUP_RID_SCHEMA_ADMINS	  = 0x206,
-	DOMAIN_GROUP_RID_ENTERPRISE_ADMINS= 0x207,
-	DOMAIN_GROUP_RID_POLICY_ADMINS	  = 0x208,
-
-	/* Aliases. */
-	DOMAIN_ALIAS_RID_ADMINS		  = 0x220,
-	DOMAIN_ALIAS_RID_USERS		  = 0x221,
-	DOMAIN_ALIAS_RID_GUESTS		  = 0x222,
-	DOMAIN_ALIAS_RID_POWER_USERS	  = 0x223,
-
-	DOMAIN_ALIAS_RID_ACCOUNT_OPS	  = 0x224,
-	DOMAIN_ALIAS_RID_SYSTEM_OPS	  = 0x225,
-	DOMAIN_ALIAS_RID_PRINT_OPS	  = 0x226,
-	DOMAIN_ALIAS_RID_BACKUP_OPS	  = 0x227,
-
-	DOMAIN_ALIAS_RID_REPLICATOR	  = 0x228,
-	DOMAIN_ALIAS_RID_RAS_SERVERS	  = 0x229,
-	DOMAIN_ALIAS_RID_PREW2KCOMPACCESS = 0x22a,
-} RELATIVE_IDENTIFIERS;
-
-/*
- * The universal well-known SIDs:
- *
- *	NULL_SID			S-1-0-0
- *	WORLD_SID			S-1-1-0
- *	LOCAL_SID			S-1-2-0
- *	CREATOR_OWNER_SID		S-1-3-0
- *	CREATOR_GROUP_SID		S-1-3-1
- *	CREATOR_OWNER_SERVER_SID	S-1-3-2
- *	CREATOR_GROUP_SERVER_SID	S-1-3-3
- *
- *	(Non-unique IDs)		S-1-4
- *
- * NT well-known SIDs:
- *
- *	NT_AUTHORITY_SID	S-1-5
- *	DIALUP_SID		S-1-5-1
- *
- *	NETWORD_SID		S-1-5-2
- *	BATCH_SID		S-1-5-3
- *	INTERACTIVE_SID		S-1-5-4
- *	SERVICE_SID		S-1-5-6
- *	ANONYMOUS_LOGON_SID	S-1-5-7		(aka null logon session)
- *	PROXY_SID		S-1-5-8
- *	SERVER_LOGON_SID	S-1-5-9		(aka domain controller account)
- *	SELF_SID		S-1-5-10	(self RID)
- *	AUTHENTICATED_USER_SID	S-1-5-11
- *	RESTRICTED_CODE_SID	S-1-5-12	(running restricted code)
- *	TERMINAL_SERVER_SID	S-1-5-13	(running on terminal server)
- *
- *	(Logon IDs)		S-1-5-5-X-Y
- *
- *	(NT non-unique IDs)	S-1-5-0x15-...
- *
- *	(Built-in domain)	S-1-5-0x20
- */
-
-/*
- * The SID_IDENTIFIER_AUTHORITY is a 48-bit value used in the SID structure.
- *
- * NOTE: This is stored as a big endian number, hence the high_part comes
- * before the low_part.
- */
-typedef union {
-	struct {
-		u16 high_part;	/* High 16-bits. */
-		u32 low_part;	/* Low 32-bits. */
-	} __attribute__ ((__packed__)) parts;
-	u8 value[6];		/* Value as individual bytes. */
-} __attribute__ ((__packed__)) SID_IDENTIFIER_AUTHORITY;
-
-/*
- * The SID structure is a variable-length structure used to uniquely identify
- * users or groups. SID stands for security identifier.
- *
- * The standard textual representation of the SID is of the form:
- *	S-R-I-S-S...
- * Where:
- *    - The first "S" is the literal character 'S' identifying the following
- *	digits as a SID.
- *    - R is the revision level of the SID expressed as a sequence of digits
- *	either in decimal or hexadecimal (if the later, prefixed by "0x").
- *    - I is the 48-bit identifier_authority, expressed as digits as R above.
- *    - S... is one or more sub_authority values, expressed as digits as above.
- *
- * Example SID; the domain-relative SID of the local Administrators group on
- * Windows NT/2k:
- *	S-1-5-32-544
- * This translates to a SID with:
- *	revision = 1,
- *	sub_authority_count = 2,
- *	identifier_authority = {0,0,0,0,0,5},	// SECURITY_NT_AUTHORITY
- *	sub_authority[0] = 32,			// SECURITY_BUILTIN_DOMAIN_RID
- *	sub_authority[1] = 544			// DOMAIN_ALIAS_RID_ADMINS
- */
-typedef struct {
-	u8 revision;
-	u8 sub_authority_count;
-	SID_IDENTIFIER_AUTHORITY identifier_authority;
-	le32 sub_authority[1];		/* At least one sub_authority. */
-} __attribute__ ((__packed__)) SID;
-
-/*
- * Current constants for SIDs.
- */
-typedef enum {
-	SID_REVISION			=  1,	/* Current revision level. */
-	SID_MAX_SUB_AUTHORITIES		= 15,	/* Maximum number of those. */
-	SID_RECOMMENDED_SUB_AUTHORITIES	=  1,	/* Will change to around 6 in
-						   a future revision. */
-} SID_CONSTANTS;
-
-/*
- * The predefined ACE types (8-bit, see below).
- */
-enum {
-	ACCESS_MIN_MS_ACE_TYPE		= 0,
-	ACCESS_ALLOWED_ACE_TYPE		= 0,
-	ACCESS_DENIED_ACE_TYPE		= 1,
-	SYSTEM_AUDIT_ACE_TYPE		= 2,
-	SYSTEM_ALARM_ACE_TYPE		= 3, /* Not implemented as of Win2k. */
-	ACCESS_MAX_MS_V2_ACE_TYPE	= 3,
-
-	ACCESS_ALLOWED_COMPOUND_ACE_TYPE= 4,
-	ACCESS_MAX_MS_V3_ACE_TYPE	= 4,
-
-	/* The following are Win2k only. */
-	ACCESS_MIN_MS_OBJECT_ACE_TYPE	= 5,
-	ACCESS_ALLOWED_OBJECT_ACE_TYPE	= 5,
-	ACCESS_DENIED_OBJECT_ACE_TYPE	= 6,
-	SYSTEM_AUDIT_OBJECT_ACE_TYPE	= 7,
-	SYSTEM_ALARM_OBJECT_ACE_TYPE	= 8,
-	ACCESS_MAX_MS_OBJECT_ACE_TYPE	= 8,
-
-	ACCESS_MAX_MS_V4_ACE_TYPE	= 8,
-
-	/* This one is for WinNT/2k. */
-	ACCESS_MAX_MS_ACE_TYPE		= 8,
-} __attribute__ ((__packed__));
-
-typedef u8 ACE_TYPES;
-
-/*
- * The ACE flags (8-bit) for audit and inheritance (see below).
- *
- * SUCCESSFUL_ACCESS_ACE_FLAG is only used with system audit and alarm ACE
- * types to indicate that a message is generated (in Windows!) for successful
- * accesses.
- *
- * FAILED_ACCESS_ACE_FLAG is only used with system audit and alarm ACE types
- * to indicate that a message is generated (in Windows!) for failed accesses.
- */
-enum {
-	/* The inheritance flags. */
-	OBJECT_INHERIT_ACE		= 0x01,
-	CONTAINER_INHERIT_ACE		= 0x02,
-	NO_PROPAGATE_INHERIT_ACE	= 0x04,
-	INHERIT_ONLY_ACE		= 0x08,
-	INHERITED_ACE			= 0x10,	/* Win2k only. */
-	VALID_INHERIT_FLAGS		= 0x1f,
-
-	/* The audit flags. */
-	SUCCESSFUL_ACCESS_ACE_FLAG	= 0x40,
-	FAILED_ACCESS_ACE_FLAG		= 0x80,
-} __attribute__ ((__packed__));
-
-typedef u8 ACE_FLAGS;
-
-/*
- * An ACE is an access-control entry in an access-control list (ACL).
- * An ACE defines access to an object for a specific user or group or defines
- * the types of access that generate system-administration messages or alarms
- * for a specific user or group. The user or group is identified by a security
- * identifier (SID).
- *
- * Each ACE starts with an ACE_HEADER structure (aligned on 4-byte boundary),
- * which specifies the type and size of the ACE. The format of the subsequent
- * data depends on the ACE type.
- */
-typedef struct {
-/*Ofs*/
-/*  0*/	ACE_TYPES type;		/* Type of the ACE. */
-/*  1*/	ACE_FLAGS flags;	/* Flags describing the ACE. */
-/*  2*/	le16 size;		/* Size in bytes of the ACE. */
-} __attribute__ ((__packed__)) ACE_HEADER;
-
-/*
- * The access mask (32-bit). Defines the access rights.
- *
- * The specific rights (bits 0 to 15).  These depend on the type of the object
- * being secured by the ACE.
- */
-enum {
-	/* Specific rights for files and directories are as follows: */
-
-	/* Right to read data from the file. (FILE) */
-	FILE_READ_DATA			= cpu_to_le32(0x00000001),
-	/* Right to list contents of a directory. (DIRECTORY) */
-	FILE_LIST_DIRECTORY		= cpu_to_le32(0x00000001),
-
-	/* Right to write data to the file. (FILE) */
-	FILE_WRITE_DATA			= cpu_to_le32(0x00000002),
-	/* Right to create a file in the directory. (DIRECTORY) */
-	FILE_ADD_FILE			= cpu_to_le32(0x00000002),
-
-	/* Right to append data to the file. (FILE) */
-	FILE_APPEND_DATA		= cpu_to_le32(0x00000004),
-	/* Right to create a subdirectory. (DIRECTORY) */
-	FILE_ADD_SUBDIRECTORY		= cpu_to_le32(0x00000004),
-
-	/* Right to read extended attributes. (FILE/DIRECTORY) */
-	FILE_READ_EA			= cpu_to_le32(0x00000008),
-
-	/* Right to write extended attributes. (FILE/DIRECTORY) */
-	FILE_WRITE_EA			= cpu_to_le32(0x00000010),
-
-	/* Right to execute a file. (FILE) */
-	FILE_EXECUTE			= cpu_to_le32(0x00000020),
-	/* Right to traverse the directory. (DIRECTORY) */
-	FILE_TRAVERSE			= cpu_to_le32(0x00000020),
-
-	/*
-	 * Right to delete a directory and all the files it contains (its
-	 * children), even if the files are read-only. (DIRECTORY)
-	 */
-	FILE_DELETE_CHILD		= cpu_to_le32(0x00000040),
-
-	/* Right to read file attributes. (FILE/DIRECTORY) */
-	FILE_READ_ATTRIBUTES		= cpu_to_le32(0x00000080),
-
-	/* Right to change file attributes. (FILE/DIRECTORY) */
-	FILE_WRITE_ATTRIBUTES		= cpu_to_le32(0x00000100),
-
-	/*
-	 * The standard rights (bits 16 to 23).  These are independent of the
-	 * type of object being secured.
-	 */
-
-	/* Right to delete the object. */
-	DELETE				= cpu_to_le32(0x00010000),
-
-	/*
-	 * Right to read the information in the object's security descriptor,
-	 * not including the information in the SACL, i.e. right to read the
-	 * security descriptor and owner.
-	 */
-	READ_CONTROL			= cpu_to_le32(0x00020000),
-
-	/* Right to modify the DACL in the object's security descriptor. */
-	WRITE_DAC			= cpu_to_le32(0x00040000),
-
-	/* Right to change the owner in the object's security descriptor. */
-	WRITE_OWNER			= cpu_to_le32(0x00080000),
-
-	/*
-	 * Right to use the object for synchronization.  Enables a process to
-	 * wait until the object is in the signalled state.  Some object types
-	 * do not support this access right.
-	 */
-	SYNCHRONIZE			= cpu_to_le32(0x00100000),
-
-	/*
-	 * The following STANDARD_RIGHTS_* are combinations of the above for
-	 * convenience and are defined by the Win32 API.
-	 */
-
-	/* These are currently defined to READ_CONTROL. */
-	STANDARD_RIGHTS_READ		= cpu_to_le32(0x00020000),
-	STANDARD_RIGHTS_WRITE		= cpu_to_le32(0x00020000),
-	STANDARD_RIGHTS_EXECUTE		= cpu_to_le32(0x00020000),
-
-	/* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */
-	STANDARD_RIGHTS_REQUIRED	= cpu_to_le32(0x000f0000),
-
-	/*
-	 * Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and
-	 * SYNCHRONIZE access.
-	 */
-	STANDARD_RIGHTS_ALL		= cpu_to_le32(0x001f0000),
-
-	/*
-	 * The access system ACL and maximum allowed access types (bits 24 to
-	 * 25, bits 26 to 27 are reserved).
-	 */
-	ACCESS_SYSTEM_SECURITY		= cpu_to_le32(0x01000000),
-	MAXIMUM_ALLOWED			= cpu_to_le32(0x02000000),
-
-	/*
-	 * The generic rights (bits 28 to 31).  These map onto the standard and
-	 * specific rights.
-	 */
-
-	/* Read, write, and execute access. */
-	GENERIC_ALL			= cpu_to_le32(0x10000000),
-
-	/* Execute access. */
-	GENERIC_EXECUTE			= cpu_to_le32(0x20000000),
-
-	/*
-	 * Write access.  For files, this maps onto:
-	 *	FILE_APPEND_DATA | FILE_WRITE_ATTRIBUTES | FILE_WRITE_DATA |
-	 *	FILE_WRITE_EA | STANDARD_RIGHTS_WRITE | SYNCHRONIZE
-	 * For directories, the mapping has the same numerical value.  See
-	 * above for the descriptions of the rights granted.
-	 */
-	GENERIC_WRITE			= cpu_to_le32(0x40000000),
-
-	/*
-	 * Read access.  For files, this maps onto:
-	 *	FILE_READ_ATTRIBUTES | FILE_READ_DATA | FILE_READ_EA |
-	 *	STANDARD_RIGHTS_READ | SYNCHRONIZE
-	 * For directories, the mapping has the same numberical value.  See
-	 * above for the descriptions of the rights granted.
-	 */
-	GENERIC_READ			= cpu_to_le32(0x80000000),
-};
-
-typedef le32 ACCESS_MASK;
-
-/*
- * The generic mapping array. Used to denote the mapping of each generic
- * access right to a specific access mask.
- *
- * FIXME: What exactly is this and what is it for? (AIA)
- */
-typedef struct {
-	ACCESS_MASK generic_read;
-	ACCESS_MASK generic_write;
-	ACCESS_MASK generic_execute;
-	ACCESS_MASK generic_all;
-} __attribute__ ((__packed__)) GENERIC_MAPPING;
-
-/*
- * The predefined ACE type structures are as defined below.
- */
-
-/*
- * ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE, SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE
- */
-typedef struct {
-/*  0	ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */
-	ACE_TYPES type;		/* Type of the ACE. */
-	ACE_FLAGS flags;	/* Flags describing the ACE. */
-	le16 size;		/* Size in bytes of the ACE. */
-/*  4*/	ACCESS_MASK mask;	/* Access mask associated with the ACE. */
-
-/*  8*/	SID sid;		/* The SID associated with the ACE. */
-} __attribute__ ((__packed__)) ACCESS_ALLOWED_ACE, ACCESS_DENIED_ACE,
-			       SYSTEM_AUDIT_ACE, SYSTEM_ALARM_ACE;
-
-/*
- * The object ACE flags (32-bit).
- */
-enum {
-	ACE_OBJECT_TYPE_PRESENT			= cpu_to_le32(1),
-	ACE_INHERITED_OBJECT_TYPE_PRESENT	= cpu_to_le32(2),
-};
-
-typedef le32 OBJECT_ACE_FLAGS;
-
-typedef struct {
-/*  0	ACE_HEADER; -- Unfolded here as gcc doesn't like unnamed structs. */
-	ACE_TYPES type;		/* Type of the ACE. */
-	ACE_FLAGS flags;	/* Flags describing the ACE. */
-	le16 size;		/* Size in bytes of the ACE. */
-/*  4*/	ACCESS_MASK mask;	/* Access mask associated with the ACE. */
-
-/*  8*/	OBJECT_ACE_FLAGS object_flags;	/* Flags describing the object ACE. */
-/* 12*/	GUID object_type;
-/* 28*/	GUID inherited_object_type;
-
-/* 44*/	SID sid;		/* The SID associated with the ACE. */
-} __attribute__ ((__packed__)) ACCESS_ALLOWED_OBJECT_ACE,
-			       ACCESS_DENIED_OBJECT_ACE,
-			       SYSTEM_AUDIT_OBJECT_ACE,
-			       SYSTEM_ALARM_OBJECT_ACE;
-
-/*
- * An ACL is an access-control list (ACL).
- * An ACL starts with an ACL header structure, which specifies the size of
- * the ACL and the number of ACEs it contains. The ACL header is followed by
- * zero or more access control entries (ACEs). The ACL as well as each ACE
- * are aligned on 4-byte boundaries.
- */
-typedef struct {
-	u8 revision;	/* Revision of this ACL. */
-	u8 alignment1;
-	le16 size;	/* Allocated space in bytes for ACL. Includes this
-			   header, the ACEs and the remaining free space. */
-	le16 ace_count;	/* Number of ACEs in the ACL. */
-	le16 alignment2;
-/* sizeof() = 8 bytes */
-} __attribute__ ((__packed__)) ACL;
-
-/*
- * Current constants for ACLs.
- */
-typedef enum {
-	/* Current revision. */
-	ACL_REVISION		= 2,
-	ACL_REVISION_DS		= 4,
-
-	/* History of revisions. */
-	ACL_REVISION1		= 1,
-	MIN_ACL_REVISION	= 2,
-	ACL_REVISION2		= 2,
-	ACL_REVISION3		= 3,
-	ACL_REVISION4		= 4,
-	MAX_ACL_REVISION	= 4,
-} ACL_CONSTANTS;
-
-/*
- * The security descriptor control flags (16-bit).
- *
- * SE_OWNER_DEFAULTED - This boolean flag, when set, indicates that the SID
- *	pointed to by the Owner field was provided by a defaulting mechanism
- *	rather than explicitly provided by the original provider of the
- *	security descriptor.  This may affect the treatment of the SID with
- *	respect to inheritance of an owner.
- *
- * SE_GROUP_DEFAULTED - This boolean flag, when set, indicates that the SID in
- *	the Group field was provided by a defaulting mechanism rather than
- *	explicitly provided by the original provider of the security
- *	descriptor.  This may affect the treatment of the SID with respect to
- *	inheritance of a primary group.
- *
- * SE_DACL_PRESENT - This boolean flag, when set, indicates that the security
- *	descriptor contains a discretionary ACL.  If this flag is set and the
- *	Dacl field of the SECURITY_DESCRIPTOR is null, then a null ACL is
- *	explicitly being specified.
- *
- * SE_DACL_DEFAULTED - This boolean flag, when set, indicates that the ACL
- *	pointed to by the Dacl field was provided by a defaulting mechanism
- *	rather than explicitly provided by the original provider of the
- *	security descriptor.  This may affect the treatment of the ACL with
- *	respect to inheritance of an ACL.  This flag is ignored if the
- *	DaclPresent flag is not set.
- *
- * SE_SACL_PRESENT - This boolean flag, when set,  indicates that the security
- *	descriptor contains a system ACL pointed to by the Sacl field.  If this
- *	flag is set and the Sacl field of the SECURITY_DESCRIPTOR is null, then
- *	an empty (but present) ACL is being specified.
- *
- * SE_SACL_DEFAULTED - This boolean flag, when set, indicates that the ACL
- *	pointed to by the Sacl field was provided by a defaulting mechanism
- *	rather than explicitly provided by the original provider of the
- *	security descriptor.  This may affect the treatment of the ACL with
- *	respect to inheritance of an ACL.  This flag is ignored if the
- *	SaclPresent flag is not set.
- *
- * SE_SELF_RELATIVE - This boolean flag, when set, indicates that the security
- *	descriptor is in self-relative form.  In this form, all fields of the
- *	security descriptor are contiguous in memory and all pointer fields are
- *	expressed as offsets from the beginning of the security descriptor.
- */
-enum {
-	SE_OWNER_DEFAULTED		= cpu_to_le16(0x0001),
-	SE_GROUP_DEFAULTED		= cpu_to_le16(0x0002),
-	SE_DACL_PRESENT			= cpu_to_le16(0x0004),
-	SE_DACL_DEFAULTED		= cpu_to_le16(0x0008),
-
-	SE_SACL_PRESENT			= cpu_to_le16(0x0010),
-	SE_SACL_DEFAULTED		= cpu_to_le16(0x0020),
-
-	SE_DACL_AUTO_INHERIT_REQ	= cpu_to_le16(0x0100),
-	SE_SACL_AUTO_INHERIT_REQ	= cpu_to_le16(0x0200),
-	SE_DACL_AUTO_INHERITED		= cpu_to_le16(0x0400),
-	SE_SACL_AUTO_INHERITED		= cpu_to_le16(0x0800),
-
-	SE_DACL_PROTECTED		= cpu_to_le16(0x1000),
-	SE_SACL_PROTECTED		= cpu_to_le16(0x2000),
-	SE_RM_CONTROL_VALID		= cpu_to_le16(0x4000),
-	SE_SELF_RELATIVE		= cpu_to_le16(0x8000)
-} __attribute__ ((__packed__));
-
-typedef le16 SECURITY_DESCRIPTOR_CONTROL;
-
-/*
- * Self-relative security descriptor. Contains the owner and group SIDs as well
- * as the sacl and dacl ACLs inside the security descriptor itself.
- */
-typedef struct {
-	u8 revision;	/* Revision level of the security descriptor. */
-	u8 alignment;
-	SECURITY_DESCRIPTOR_CONTROL control; /* Flags qualifying the type of
-			   the descriptor as well as the following fields. */
-	le32 owner;	/* Byte offset to a SID representing an object's
-			   owner. If this is NULL, no owner SID is present in
-			   the descriptor. */
-	le32 group;	/* Byte offset to a SID representing an object's
-			   primary group. If this is NULL, no primary group
-			   SID is present in the descriptor. */
-	le32 sacl;	/* Byte offset to a system ACL. Only valid, if
-			   SE_SACL_PRESENT is set in the control field. If
-			   SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL
-			   is specified. */
-	le32 dacl;	/* Byte offset to a discretionary ACL. Only valid, if
-			   SE_DACL_PRESENT is set in the control field. If
-			   SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL
-			   (unconditionally granting access) is specified. */
-/* sizeof() = 0x14 bytes */
-} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_RELATIVE;
-
-/*
- * Absolute security descriptor. Does not contain the owner and group SIDs, nor
- * the sacl and dacl ACLs inside the security descriptor. Instead, it contains
- * pointers to these structures in memory. Obviously, absolute security
- * descriptors are only useful for in memory representations of security
- * descriptors. On disk, a self-relative security descriptor is used.
- */
-typedef struct {
-	u8 revision;	/* Revision level of the security descriptor. */
-	u8 alignment;
-	SECURITY_DESCRIPTOR_CONTROL control;	/* Flags qualifying the type of
-			   the descriptor as well as the following fields. */
-	SID *owner;	/* Points to a SID representing an object's owner. If
-			   this is NULL, no owner SID is present in the
-			   descriptor. */
-	SID *group;	/* Points to a SID representing an object's primary
-			   group. If this is NULL, no primary group SID is
-			   present in the descriptor. */
-	ACL *sacl;	/* Points to a system ACL. Only valid, if
-			   SE_SACL_PRESENT is set in the control field. If
-			   SE_SACL_PRESENT is set but sacl is NULL, a NULL ACL
-			   is specified. */
-	ACL *dacl;	/* Points to a discretionary ACL. Only valid, if
-			   SE_DACL_PRESENT is set in the control field. If
-			   SE_DACL_PRESENT is set but dacl is NULL, a NULL ACL
-			   (unconditionally granting access) is specified. */
-} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR;
-
-/*
- * Current constants for security descriptors.
- */
-typedef enum {
-	/* Current revision. */
-	SECURITY_DESCRIPTOR_REVISION	= 1,
-	SECURITY_DESCRIPTOR_REVISION1	= 1,
-
-	/* The sizes of both the absolute and relative security descriptors is
-	   the same as pointers, at least on ia32 architecture are 32-bit. */
-	SECURITY_DESCRIPTOR_MIN_LENGTH	= sizeof(SECURITY_DESCRIPTOR),
-} SECURITY_DESCRIPTOR_CONSTANTS;
-
-/*
- * Attribute: Security descriptor (0x50). A standard self-relative security
- * descriptor.
- *
- * NOTE: Can be resident or non-resident.
- * NOTE: Not used in NTFS 3.0+, as security descriptors are stored centrally
- * in FILE_Secure and the correct descriptor is found using the security_id
- * from the standard information attribute.
- */
-typedef SECURITY_DESCRIPTOR_RELATIVE SECURITY_DESCRIPTOR_ATTR;
-
-/*
- * On NTFS 3.0+, all security descriptors are stored in FILE_Secure. Only one
- * referenced instance of each unique security descriptor is stored.
- *
- * FILE_Secure contains no unnamed data attribute, i.e. it has zero length. It
- * does, however, contain two indexes ($SDH and $SII) as well as a named data
- * stream ($SDS).
- *
- * Every unique security descriptor is assigned a unique security identifier
- * (security_id, not to be confused with a SID). The security_id is unique for
- * the NTFS volume and is used as an index into the $SII index, which maps
- * security_ids to the security descriptor's storage location within the $SDS
- * data attribute. The $SII index is sorted by ascending security_id.
- *
- * A simple hash is computed from each security descriptor. This hash is used
- * as an index into the $SDH index, which maps security descriptor hashes to
- * the security descriptor's storage location within the $SDS data attribute.
- * The $SDH index is sorted by security descriptor hash and is stored in a B+
- * tree. When searching $SDH (with the intent of determining whether or not a
- * new security descriptor is already present in the $SDS data stream), if a
- * matching hash is found, but the security descriptors do not match, the
- * search in the $SDH index is continued, searching for a next matching hash.
- *
- * When a precise match is found, the security_id coresponding to the security
- * descriptor in the $SDS attribute is read from the found $SDH index entry and
- * is stored in the $STANDARD_INFORMATION attribute of the file/directory to
- * which the security descriptor is being applied. The $STANDARD_INFORMATION
- * attribute is present in all base mft records (i.e. in all files and
- * directories).
- *
- * If a match is not found, the security descriptor is assigned a new unique
- * security_id and is added to the $SDS data attribute. Then, entries
- * referencing the this security descriptor in the $SDS data attribute are
- * added to the $SDH and $SII indexes.
- *
- * Note: Entries are never deleted from FILE_Secure, even if nothing
- * references an entry any more.
- */
-
-/*
- * This header precedes each security descriptor in the $SDS data stream.
- * This is also the index entry data part of both the $SII and $SDH indexes.
- */
-typedef struct {
-	le32 hash;	  /* Hash of the security descriptor. */
-	le32 security_id; /* The security_id assigned to the descriptor. */
-	le64 offset;	  /* Byte offset of this entry in the $SDS stream. */
-	le32 length;	  /* Size in bytes of this entry in $SDS stream. */
-} __attribute__ ((__packed__)) SECURITY_DESCRIPTOR_HEADER;
-
-/*
- * The $SDS data stream contains the security descriptors, aligned on 16-byte
- * boundaries, sorted by security_id in a B+ tree. Security descriptors cannot
- * cross 256kib boundaries (this restriction is imposed by the Windows cache
- * manager). Each security descriptor is contained in a SDS_ENTRY structure.
- * Also, each security descriptor is stored twice in the $SDS stream with a
- * fixed offset of 0x40000 bytes (256kib, the Windows cache manager's max size)
- * between them; i.e. if a SDS_ENTRY specifies an offset of 0x51d0, then the
- * first copy of the security descriptor will be at offset 0x51d0 in the
- * $SDS data stream and the second copy will be at offset 0x451d0.
- */
-typedef struct {
-/*Ofs*/
-/*  0	SECURITY_DESCRIPTOR_HEADER; -- Unfolded here as gcc doesn't like
-				       unnamed structs. */
-	le32 hash;	  /* Hash of the security descriptor. */
-	le32 security_id; /* The security_id assigned to the descriptor. */
-	le64 offset;	  /* Byte offset of this entry in the $SDS stream. */
-	le32 length;	  /* Size in bytes of this entry in $SDS stream. */
-/* 20*/	SECURITY_DESCRIPTOR_RELATIVE sid; /* The self-relative security
-					     descriptor. */
-} __attribute__ ((__packed__)) SDS_ENTRY;
-
-/*
- * The index entry key used in the $SII index. The collation type is
- * COLLATION_NTOFS_ULONG.
- */
-typedef struct {
-	le32 security_id; /* The security_id assigned to the descriptor. */
-} __attribute__ ((__packed__)) SII_INDEX_KEY;
-
-/*
- * The index entry key used in the $SDH index. The keys are sorted first by
- * hash and then by security_id. The collation rule is
- * COLLATION_NTOFS_SECURITY_HASH.
- */
-typedef struct {
-	le32 hash;	  /* Hash of the security descriptor. */
-	le32 security_id; /* The security_id assigned to the descriptor. */
-} __attribute__ ((__packed__)) SDH_INDEX_KEY;
-
-/*
- * Attribute: Volume name (0x60).
- *
- * NOTE: Always resident.
- * NOTE: Present only in FILE_Volume.
- */
-typedef struct {
-	ntfschar name[0];	/* The name of the volume in Unicode. */
-} __attribute__ ((__packed__)) VOLUME_NAME;
-
-/*
- * Possible flags for the volume (16-bit).
- */
-enum {
-	VOLUME_IS_DIRTY			= cpu_to_le16(0x0001),
-	VOLUME_RESIZE_LOG_FILE		= cpu_to_le16(0x0002),
-	VOLUME_UPGRADE_ON_MOUNT		= cpu_to_le16(0x0004),
-	VOLUME_MOUNTED_ON_NT4		= cpu_to_le16(0x0008),
-
-	VOLUME_DELETE_USN_UNDERWAY	= cpu_to_le16(0x0010),
-	VOLUME_REPAIR_OBJECT_ID		= cpu_to_le16(0x0020),
-
-	VOLUME_CHKDSK_UNDERWAY		= cpu_to_le16(0x4000),
-	VOLUME_MODIFIED_BY_CHKDSK	= cpu_to_le16(0x8000),
-
-	VOLUME_FLAGS_MASK		= cpu_to_le16(0xc03f),
-
-	/* To make our life easier when checking if we must mount read-only. */
-	VOLUME_MUST_MOUNT_RO_MASK	= cpu_to_le16(0xc027),
-} __attribute__ ((__packed__));
-
-typedef le16 VOLUME_FLAGS;
-
-/*
- * Attribute: Volume information (0x70).
- *
- * NOTE: Always resident.
- * NOTE: Present only in FILE_Volume.
- * NOTE: Windows 2000 uses NTFS 3.0 while Windows NT4 service pack 6a uses
- *	 NTFS 1.2. I haven't personally seen other values yet.
- */
-typedef struct {
-	le64 reserved;		/* Not used (yet?). */
-	u8 major_ver;		/* Major version of the ntfs format. */
-	u8 minor_ver;		/* Minor version of the ntfs format. */
-	VOLUME_FLAGS flags;	/* Bit array of VOLUME_* flags. */
-} __attribute__ ((__packed__)) VOLUME_INFORMATION;
-
-/*
- * Attribute: Data attribute (0x80).
- *
- * NOTE: Can be resident or non-resident.
- *
- * Data contents of a file (i.e. the unnamed stream) or of a named stream.
- */
-typedef struct {
-	u8 data[0];		/* The file's data contents. */
-} __attribute__ ((__packed__)) DATA_ATTR;
-
-/*
- * Index header flags (8-bit).
- */
-enum {
-	/*
-	 * When index header is in an index root attribute:
-	 */
-	SMALL_INDEX = 0, /* The index is small enough to fit inside the index
-			    root attribute and there is no index allocation
-			    attribute present. */
-	LARGE_INDEX = 1, /* The index is too large to fit in the index root
-			    attribute and/or an index allocation attribute is
-			    present. */
-	/*
-	 * When index header is in an index block, i.e. is part of index
-	 * allocation attribute:
-	 */
-	LEAF_NODE  = 0, /* This is a leaf node, i.e. there are no more nodes
-			   branching off it. */
-	INDEX_NODE = 1, /* This node indexes other nodes, i.e. it is not a leaf
-			   node. */
-	NODE_MASK  = 1, /* Mask for accessing the *_NODE bits. */
-} __attribute__ ((__packed__));
-
-typedef u8 INDEX_HEADER_FLAGS;
-
-/*
- * This is the header for indexes, describing the INDEX_ENTRY records, which
- * follow the INDEX_HEADER. Together the index header and the index entries
- * make up a complete index.
- *
- * IMPORTANT NOTE: The offset, length and size structure members are counted
- * relative to the start of the index header structure and not relative to the
- * start of the index root or index allocation structures themselves.
- */
-typedef struct {
-	le32 entries_offset;		/* Byte offset to first INDEX_ENTRY
-					   aligned to 8-byte boundary. */
-	le32 index_length;		/* Data size of the index in bytes,
-					   i.e. bytes used from allocated
-					   size, aligned to 8-byte boundary. */
-	le32 allocated_size;		/* Byte size of this index (block),
-					   multiple of 8 bytes. */
-	/* NOTE: For the index root attribute, the above two numbers are always
-	   equal, as the attribute is resident and it is resized as needed. In
-	   the case of the index allocation attribute the attribute is not
-	   resident and hence the allocated_size is a fixed value and must
-	   equal the index_block_size specified by the INDEX_ROOT attribute
-	   corresponding to the INDEX_ALLOCATION attribute this INDEX_BLOCK
-	   belongs to. */
-	INDEX_HEADER_FLAGS flags;	/* Bit field of INDEX_HEADER_FLAGS. */
-	u8 reserved[3];			/* Reserved/align to 8-byte boundary. */
-} __attribute__ ((__packed__)) INDEX_HEADER;
-
-/*
- * Attribute: Index root (0x90).
- *
- * NOTE: Always resident.
- *
- * This is followed by a sequence of index entries (INDEX_ENTRY structures)
- * as described by the index header.
- *
- * When a directory is small enough to fit inside the index root then this
- * is the only attribute describing the directory. When the directory is too
- * large to fit in the index root, on the other hand, two additional attributes
- * are present: an index allocation attribute, containing sub-nodes of the B+
- * directory tree (see below), and a bitmap attribute, describing which virtual
- * cluster numbers (vcns) in the index allocation attribute are in use by an
- * index block.
- *
- * NOTE: The root directory (FILE_root) contains an entry for itself. Other
- * directories do not contain entries for themselves, though.
- */
-typedef struct {
-	ATTR_TYPE type;			/* Type of the indexed attribute. Is
-					   $FILE_NAME for directories, zero
-					   for view indexes. No other values
-					   allowed. */
-	COLLATION_RULE collation_rule;	/* Collation rule used to sort the
-					   index entries. If type is $FILE_NAME,
-					   this must be COLLATION_FILE_NAME. */
-	le32 index_block_size;		/* Size of each index block in bytes (in
-					   the index allocation attribute). */
-	u8 clusters_per_index_block;	/* Cluster size of each index block (in
-					   the index allocation attribute), when
-					   an index block is >= than a cluster,
-					   otherwise this will be the log of
-					   the size (like how the encoding of
-					   the mft record size and the index
-					   record size found in the boot sector
-					   work). Has to be a power of 2. */
-	u8 reserved[3];			/* Reserved/align to 8-byte boundary. */
-	INDEX_HEADER index;		/* Index header describing the
-					   following index entries. */
-} __attribute__ ((__packed__)) INDEX_ROOT;
-
-/*
- * Attribute: Index allocation (0xa0).
- *
- * NOTE: Always non-resident (doesn't make sense to be resident anyway!).
- *
- * This is an array of index blocks. Each index block starts with an
- * INDEX_BLOCK structure containing an index header, followed by a sequence of
- * index entries (INDEX_ENTRY structures), as described by the INDEX_HEADER.
- */
-typedef struct {
-/*  0	NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
-	NTFS_RECORD_TYPE magic;	/* Magic is "INDX". */
-	le16 usa_ofs;		/* See NTFS_RECORD definition. */
-	le16 usa_count;		/* See NTFS_RECORD definition. */
-
-/*  8*/	sle64 lsn;		/* $LogFile sequence number of the last
-				   modification of this index block. */
-/* 16*/	leVCN index_block_vcn;	/* Virtual cluster number of the index block.
-				   If the cluster_size on the volume is <= the
-				   index_block_size of the directory,
-				   index_block_vcn counts in units of clusters,
-				   and in units of sectors otherwise. */
-/* 24*/	INDEX_HEADER index;	/* Describes the following index entries. */
-/* sizeof()= 40 (0x28) bytes */
-/*
- * When creating the index block, we place the update sequence array at this
- * offset, i.e. before we start with the index entries. This also makes sense,
- * otherwise we could run into problems with the update sequence array
- * containing in itself the last two bytes of a sector which would mean that
- * multi sector transfer protection wouldn't work. As you can't protect data
- * by overwriting it since you then can't get it back...
- * When reading use the data from the ntfs record header.
- */
-} __attribute__ ((__packed__)) INDEX_BLOCK;
-
-typedef INDEX_BLOCK INDEX_ALLOCATION;
-
-/*
- * The system file FILE_Extend/$Reparse contains an index named $R listing
- * all reparse points on the volume. The index entry keys are as defined
- * below. Note, that there is no index data associated with the index entries.
- *
- * The index entries are sorted by the index key file_id. The collation rule is
- * COLLATION_NTOFS_ULONGS. FIXME: Verify whether the reparse_tag is not the
- * primary key / is not a key at all. (AIA)
- */
-typedef struct {
-	le32 reparse_tag;	/* Reparse point type (inc. flags). */
-	leMFT_REF file_id;	/* Mft record of the file containing the
-				   reparse point attribute. */
-} __attribute__ ((__packed__)) REPARSE_INDEX_KEY;
-
-/*
- * Quota flags (32-bit).
- *
- * The user quota flags.  Names explain meaning.
- */
-enum {
-	QUOTA_FLAG_DEFAULT_LIMITS	= cpu_to_le32(0x00000001),
-	QUOTA_FLAG_LIMIT_REACHED	= cpu_to_le32(0x00000002),
-	QUOTA_FLAG_ID_DELETED		= cpu_to_le32(0x00000004),
-
-	QUOTA_FLAG_USER_MASK		= cpu_to_le32(0x00000007),
-	/* This is a bit mask for the user quota flags. */
-
-	/*
-	 * These flags are only present in the quota defaults index entry, i.e.
-	 * in the entry where owner_id = QUOTA_DEFAULTS_ID.
-	 */
-	QUOTA_FLAG_TRACKING_ENABLED	= cpu_to_le32(0x00000010),
-	QUOTA_FLAG_ENFORCEMENT_ENABLED	= cpu_to_le32(0x00000020),
-	QUOTA_FLAG_TRACKING_REQUESTED	= cpu_to_le32(0x00000040),
-	QUOTA_FLAG_LOG_THRESHOLD	= cpu_to_le32(0x00000080),
-
-	QUOTA_FLAG_LOG_LIMIT		= cpu_to_le32(0x00000100),
-	QUOTA_FLAG_OUT_OF_DATE		= cpu_to_le32(0x00000200),
-	QUOTA_FLAG_CORRUPT		= cpu_to_le32(0x00000400),
-	QUOTA_FLAG_PENDING_DELETES	= cpu_to_le32(0x00000800),
-};
-
-typedef le32 QUOTA_FLAGS;
-
-/*
- * The system file FILE_Extend/$Quota contains two indexes $O and $Q. Quotas
- * are on a per volume and per user basis.
- *
- * The $Q index contains one entry for each existing user_id on the volume. The
- * index key is the user_id of the user/group owning this quota control entry,
- * i.e. the key is the owner_id. The user_id of the owner of a file, i.e. the
- * owner_id, is found in the standard information attribute. The collation rule
- * for $Q is COLLATION_NTOFS_ULONG.
- *
- * The $O index contains one entry for each user/group who has been assigned
- * a quota on that volume. The index key holds the SID of the user_id the
- * entry belongs to, i.e. the owner_id. The collation rule for $O is
- * COLLATION_NTOFS_SID.
- *
- * The $O index entry data is the user_id of the user corresponding to the SID.
- * This user_id is used as an index into $Q to find the quota control entry
- * associated with the SID.
- *
- * The $Q index entry data is the quota control entry and is defined below.
- */
-typedef struct {
-	le32 version;		/* Currently equals 2. */
-	QUOTA_FLAGS flags;	/* Flags describing this quota entry. */
-	le64 bytes_used;	/* How many bytes of the quota are in use. */
-	sle64 change_time;	/* Last time this quota entry was changed. */
-	sle64 threshold;	/* Soft quota (-1 if not limited). */
-	sle64 limit;		/* Hard quota (-1 if not limited). */
-	sle64 exceeded_time;	/* How long the soft quota has been exceeded. */
-	SID sid;		/* The SID of the user/object associated with
-				   this quota entry.  Equals zero for the quota
-				   defaults entry (and in fact on a WinXP
-				   volume, it is not present at all). */
-} __attribute__ ((__packed__)) QUOTA_CONTROL_ENTRY;
-
-/*
- * Predefined owner_id values (32-bit).
- */
-enum {
-	QUOTA_INVALID_ID	= cpu_to_le32(0x00000000),
-	QUOTA_DEFAULTS_ID	= cpu_to_le32(0x00000001),
-	QUOTA_FIRST_USER_ID	= cpu_to_le32(0x00000100),
-};
-
-/*
- * Current constants for quota control entries.
- */
-typedef enum {
-	/* Current version. */
-	QUOTA_VERSION	= 2,
-} QUOTA_CONTROL_ENTRY_CONSTANTS;
-
-/*
- * Index entry flags (16-bit).
- */
-enum {
-	INDEX_ENTRY_NODE = cpu_to_le16(1), /* This entry contains a
-			sub-node, i.e. a reference to an index block in form of
-			a virtual cluster number (see below). */
-	INDEX_ENTRY_END  = cpu_to_le16(2), /* This signifies the last
-			entry in an index block.  The index entry does not
-			represent a file but it can point to a sub-node. */
-
-	INDEX_ENTRY_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force
-			enum bit width to 16-bit. */
-} __attribute__ ((__packed__));
-
-typedef le16 INDEX_ENTRY_FLAGS;
-
-/*
- * This the index entry header (see below).
- */
-typedef struct {
-/*  0*/	union {
-		struct { /* Only valid when INDEX_ENTRY_END is not set. */
-			leMFT_REF indexed_file;	/* The mft reference of the file
-						   described by this index
-						   entry. Used for directory
-						   indexes. */
-		} __attribute__ ((__packed__)) dir;
-		struct { /* Used for views/indexes to find the entry's data. */
-			le16 data_offset;	/* Data byte offset from this
-						   INDEX_ENTRY. Follows the
-						   index key. */
-			le16 data_length;	/* Data length in bytes. */
-			le32 reservedV;		/* Reserved (zero). */
-		} __attribute__ ((__packed__)) vi;
-	} __attribute__ ((__packed__)) data;
-/*  8*/	le16 length;		 /* Byte size of this index entry, multiple of
-				    8-bytes. */
-/* 10*/	le16 key_length;	 /* Byte size of the key value, which is in the
-				    index entry. It follows field reserved. Not
-				    multiple of 8-bytes. */
-/* 12*/	INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */
-/* 14*/	le16 reserved;		 /* Reserved/align to 8-byte boundary. */
-/* sizeof() = 16 bytes */
-} __attribute__ ((__packed__)) INDEX_ENTRY_HEADER;
-
-/*
- * This is an index entry. A sequence of such entries follows each INDEX_HEADER
- * structure. Together they make up a complete index. The index follows either
- * an index root attribute or an index allocation attribute.
- *
- * NOTE: Before NTFS 3.0 only filename attributes were indexed.
- */
-typedef struct {
-/*Ofs*/
-/*  0	INDEX_ENTRY_HEADER; -- Unfolded here as gcc dislikes unnamed structs. */
-	union {
-		struct { /* Only valid when INDEX_ENTRY_END is not set. */
-			leMFT_REF indexed_file;	/* The mft reference of the file
-						   described by this index
-						   entry. Used for directory
-						   indexes. */
-		} __attribute__ ((__packed__)) dir;
-		struct { /* Used for views/indexes to find the entry's data. */
-			le16 data_offset;	/* Data byte offset from this
-						   INDEX_ENTRY. Follows the
-						   index key. */
-			le16 data_length;	/* Data length in bytes. */
-			le32 reservedV;		/* Reserved (zero). */
-		} __attribute__ ((__packed__)) vi;
-	} __attribute__ ((__packed__)) data;
-	le16 length;		 /* Byte size of this index entry, multiple of
-				    8-bytes. */
-	le16 key_length;	 /* Byte size of the key value, which is in the
-				    index entry. It follows field reserved. Not
-				    multiple of 8-bytes. */
-	INDEX_ENTRY_FLAGS flags; /* Bit field of INDEX_ENTRY_* flags. */
-	le16 reserved;		 /* Reserved/align to 8-byte boundary. */
-
-/* 16*/	union {		/* The key of the indexed attribute. NOTE: Only present
-			   if INDEX_ENTRY_END bit in flags is not set. NOTE: On
-			   NTFS versions before 3.0 the only valid key is the
-			   FILE_NAME_ATTR. On NTFS 3.0+ the following
-			   additional index keys are defined: */
-		FILE_NAME_ATTR file_name;/* $I30 index in directories. */
-		SII_INDEX_KEY sii;	/* $SII index in $Secure. */
-		SDH_INDEX_KEY sdh;	/* $SDH index in $Secure. */
-		GUID object_id;		/* $O index in FILE_Extend/$ObjId: The
-					   object_id of the mft record found in
-					   the data part of the index. */
-		REPARSE_INDEX_KEY reparse;	/* $R index in
-						   FILE_Extend/$Reparse. */
-		SID sid;		/* $O index in FILE_Extend/$Quota:
-					   SID of the owner of the user_id. */
-		le32 owner_id;		/* $Q index in FILE_Extend/$Quota:
-					   user_id of the owner of the quota
-					   control entry in the data part of
-					   the index. */
-	} __attribute__ ((__packed__)) key;
-	/* The (optional) index data is inserted here when creating. */
-	// leVCN vcn;	/* If INDEX_ENTRY_NODE bit in flags is set, the last
-	//		   eight bytes of this index entry contain the virtual
-	//		   cluster number of the index block that holds the
-	//		   entries immediately preceding the current entry (the
-	//		   vcn references the corresponding cluster in the data
-	//		   of the non-resident index allocation attribute). If
-	//		   the key_length is zero, then the vcn immediately
-	//		   follows the INDEX_ENTRY_HEADER. Regardless of
-	//		   key_length, the address of the 8-byte boundary
-	//		   aligned vcn of INDEX_ENTRY{_HEADER} *ie is given by
-	//		   (char*)ie + le16_to_cpu(ie*)->length) - sizeof(VCN),
-	//		   where sizeof(VCN) can be hardcoded as 8 if wanted. */
-} __attribute__ ((__packed__)) INDEX_ENTRY;
-
-/*
- * Attribute: Bitmap (0xb0).
- *
- * Contains an array of bits (aka a bitfield).
- *
- * When used in conjunction with the index allocation attribute, each bit
- * corresponds to one index block within the index allocation attribute. Thus
- * the number of bits in the bitmap * index block size / cluster size is the
- * number of clusters in the index allocation attribute.
- */
-typedef struct {
-	u8 bitmap[0];			/* Array of bits. */
-} __attribute__ ((__packed__)) BITMAP_ATTR;
-
-/*
- * The reparse point tag defines the type of the reparse point. It also
- * includes several flags, which further describe the reparse point.
- *
- * The reparse point tag is an unsigned 32-bit value divided in three parts:
- *
- * 1. The least significant 16 bits (i.e. bits 0 to 15) specifiy the type of
- *    the reparse point.
- * 2. The 13 bits after this (i.e. bits 16 to 28) are reserved for future use.
- * 3. The most significant three bits are flags describing the reparse point.
- *    They are defined as follows:
- *	bit 29: Name surrogate bit. If set, the filename is an alias for
- *		another object in the system.
- *	bit 30: High-latency bit. If set, accessing the first byte of data will
- *		be slow. (E.g. the data is stored on a tape drive.)
- *	bit 31: Microsoft bit. If set, the tag is owned by Microsoft. User
- *		defined tags have to use zero here.
- *
- * These are the predefined reparse point tags:
- */
-enum {
-	IO_REPARSE_TAG_IS_ALIAS		= cpu_to_le32(0x20000000),
-	IO_REPARSE_TAG_IS_HIGH_LATENCY	= cpu_to_le32(0x40000000),
-	IO_REPARSE_TAG_IS_MICROSOFT	= cpu_to_le32(0x80000000),
-
-	IO_REPARSE_TAG_RESERVED_ZERO	= cpu_to_le32(0x00000000),
-	IO_REPARSE_TAG_RESERVED_ONE	= cpu_to_le32(0x00000001),
-	IO_REPARSE_TAG_RESERVED_RANGE	= cpu_to_le32(0x00000001),
-
-	IO_REPARSE_TAG_NSS		= cpu_to_le32(0x68000005),
-	IO_REPARSE_TAG_NSS_RECOVER	= cpu_to_le32(0x68000006),
-	IO_REPARSE_TAG_SIS		= cpu_to_le32(0x68000007),
-	IO_REPARSE_TAG_DFS		= cpu_to_le32(0x68000008),
-
-	IO_REPARSE_TAG_MOUNT_POINT	= cpu_to_le32(0x88000003),
-
-	IO_REPARSE_TAG_HSM		= cpu_to_le32(0xa8000004),
-
-	IO_REPARSE_TAG_SYMBOLIC_LINK	= cpu_to_le32(0xe8000000),
-
-	IO_REPARSE_TAG_VALID_VALUES	= cpu_to_le32(0xe000ffff),
-};
-
-/*
- * Attribute: Reparse point (0xc0).
- *
- * NOTE: Can be resident or non-resident.
- */
-typedef struct {
-	le32 reparse_tag;		/* Reparse point type (inc. flags). */
-	le16 reparse_data_length;	/* Byte size of reparse data. */
-	le16 reserved;			/* Align to 8-byte boundary. */
-	u8 reparse_data[0];		/* Meaning depends on reparse_tag. */
-} __attribute__ ((__packed__)) REPARSE_POINT;
-
-/*
- * Attribute: Extended attribute (EA) information (0xd0).
- *
- * NOTE: Always resident. (Is this true???)
- */
-typedef struct {
-	le16 ea_length;		/* Byte size of the packed extended
-				   attributes. */
-	le16 need_ea_count;	/* The number of extended attributes which have
-				   the NEED_EA bit set. */
-	le32 ea_query_length;	/* Byte size of the buffer required to query
-				   the extended attributes when calling
-				   ZwQueryEaFile() in Windows NT/2k. I.e. the
-				   byte size of the unpacked extended
-				   attributes. */
-} __attribute__ ((__packed__)) EA_INFORMATION;
-
-/*
- * Extended attribute flags (8-bit).
- */
-enum {
-	NEED_EA	= 0x80		/* If set the file to which the EA belongs
-				   cannot be interpreted without understanding
-				   the associates extended attributes. */
-} __attribute__ ((__packed__));
-
-typedef u8 EA_FLAGS;
-
-/*
- * Attribute: Extended attribute (EA) (0xe0).
- *
- * NOTE: Can be resident or non-resident.
- *
- * Like the attribute list and the index buffer list, the EA attribute value is
- * a sequence of EA_ATTR variable length records.
- */
-typedef struct {
-	le32 next_entry_offset;	/* Offset to the next EA_ATTR. */
-	EA_FLAGS flags;		/* Flags describing the EA. */
-	u8 ea_name_length;	/* Length of the name of the EA in bytes
-				   excluding the '\0' byte terminator. */
-	le16 ea_value_length;	/* Byte size of the EA's value. */
-	u8 ea_name[0];		/* Name of the EA.  Note this is ASCII, not
-				   Unicode and it is zero terminated. */
-	u8 ea_value[0];		/* The value of the EA.  Immediately follows
-				   the name. */
-} __attribute__ ((__packed__)) EA_ATTR;
-
-/*
- * Attribute: Property set (0xf0).
- *
- * Intended to support Native Structure Storage (NSS) - a feature removed from
- * NTFS 3.0 during beta testing.
- */
-typedef struct {
-	/* Irrelevant as feature unused. */
-} __attribute__ ((__packed__)) PROPERTY_SET;
-
-/*
- * Attribute: Logged utility stream (0x100).
- *
- * NOTE: Can be resident or non-resident.
- *
- * Operations on this attribute are logged to the journal ($LogFile) like
- * normal metadata changes.
- *
- * Used by the Encrypting File System (EFS). All encrypted files have this
- * attribute with the name $EFS.
- */
-typedef struct {
-	/* Can be anything the creator chooses. */
-	/* EFS uses it as follows: */
-	// FIXME: Type this info, verifying it along the way. (AIA)
-} __attribute__ ((__packed__)) LOGGED_UTILITY_STREAM, EFS_ATTR;
-
-#endif /* _LINUX_NTFS_LAYOUT_H */
diff --git a/fs/ntfs/lcnalloc.c b/fs/ntfs/lcnalloc.c
deleted file mode 100644
index eda9972e6159..000000000000
--- a/fs/ntfs/lcnalloc.c
+++ /dev/null
@@ -1,1000 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * lcnalloc.c - Cluster (de)allocation code.  Part of the Linux-NTFS project.
- *
- * Copyright (c) 2004-2005 Anton Altaparmakov
- */
-
-#ifdef NTFS_RW
-
-#include <linux/pagemap.h>
-
-#include "lcnalloc.h"
-#include "debug.h"
-#include "bitmap.h"
-#include "inode.h"
-#include "volume.h"
-#include "attrib.h"
-#include "malloc.h"
-#include "aops.h"
-#include "ntfs.h"
-
-/**
- * ntfs_cluster_free_from_rl_nolock - free clusters from runlist
- * @vol:	mounted ntfs volume on which to free the clusters
- * @rl:		runlist describing the clusters to free
- *
- * Free all the clusters described by the runlist @rl on the volume @vol.  In
- * the case of an error being returned, at least some of the clusters were not
- * freed.
- *
- * Return 0 on success and -errno on error.
- *
- * Locking: - The volume lcn bitmap must be locked for writing on entry and is
- *	      left locked on return.
- */
-int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
-		const runlist_element *rl)
-{
-	struct inode *lcnbmp_vi = vol->lcnbmp_ino;
-	int ret = 0;
-
-	ntfs_debug("Entering.");
-	if (!rl)
-		return 0;
-	for (; rl->length; rl++) {
-		int err;
-
-		if (rl->lcn < 0)
-			continue;
-		err = ntfs_bitmap_clear_run(lcnbmp_vi, rl->lcn, rl->length);
-		if (unlikely(err && (!ret || ret == -ENOMEM) && ret != err))
-			ret = err;
-	}
-	ntfs_debug("Done.");
-	return ret;
-}
-
-/**
- * ntfs_cluster_alloc - allocate clusters on an ntfs volume
- * @vol:	mounted ntfs volume on which to allocate the clusters
- * @start_vcn:	vcn to use for the first allocated cluster
- * @count:	number of clusters to allocate
- * @start_lcn:	starting lcn at which to allocate the clusters (or -1 if none)
- * @zone:	zone from which to allocate the clusters
- * @is_extension:	if 'true', this is an attribute extension
- *
- * Allocate @count clusters preferably starting at cluster @start_lcn or at the
- * current allocator position if @start_lcn is -1, on the mounted ntfs volume
- * @vol. @zone is either DATA_ZONE for allocation of normal clusters or
- * MFT_ZONE for allocation of clusters for the master file table, i.e. the
- * $MFT/$DATA attribute.
- *
- * @start_vcn specifies the vcn of the first allocated cluster.  This makes
- * merging the resulting runlist with the old runlist easier.
- *
- * If @is_extension is 'true', the caller is allocating clusters to extend an
- * attribute and if it is 'false', the caller is allocating clusters to fill a
- * hole in an attribute.  Practically the difference is that if @is_extension
- * is 'true' the returned runlist will be terminated with LCN_ENOENT and if
- * @is_extension is 'false' the runlist will be terminated with
- * LCN_RL_NOT_MAPPED.
- *
- * You need to check the return value with IS_ERR().  If this is false, the
- * function was successful and the return value is a runlist describing the
- * allocated cluster(s).  If IS_ERR() is true, the function failed and
- * PTR_ERR() gives you the error code.
- *
- * Notes on the allocation algorithm
- * =================================
- *
- * There are two data zones.  First is the area between the end of the mft zone
- * and the end of the volume, and second is the area between the start of the
- * volume and the start of the mft zone.  On unmodified/standard NTFS 1.x
- * volumes, the second data zone does not exist due to the mft zone being
- * expanded to cover the start of the volume in order to reserve space for the
- * mft bitmap attribute.
- *
- * This is not the prettiest function but the complexity stems from the need of
- * implementing the mft vs data zoned approach and from the fact that we have
- * access to the lcn bitmap in portions of up to 8192 bytes at a time, so we
- * need to cope with crossing over boundaries of two buffers.  Further, the
- * fact that the allocator allows for caller supplied hints as to the location
- * of where allocation should begin and the fact that the allocator keeps track
- * of where in the data zones the next natural allocation should occur,
- * contribute to the complexity of the function.  But it should all be
- * worthwhile, because this allocator should: 1) be a full implementation of
- * the MFT zone approach used by Windows NT, 2) cause reduction in
- * fragmentation, and 3) be speedy in allocations (the code is not optimized
- * for speed, but the algorithm is, so further speed improvements are probably
- * possible).
- *
- * FIXME: We should be monitoring cluster allocation and increment the MFT zone
- * size dynamically but this is something for the future.  We will just cause
- * heavier fragmentation by not doing it and I am not even sure Windows would
- * grow the MFT zone dynamically, so it might even be correct not to do this.
- * The overhead in doing dynamic MFT zone expansion would be very large and
- * unlikely worth the effort. (AIA)
- *
- * TODO: I have added in double the required zone position pointer wrap around
- * logic which can be optimized to having only one of the two logic sets.
- * However, having the double logic will work fine, but if we have only one of
- * the sets and we get it wrong somewhere, then we get into trouble, so
- * removing the duplicate logic requires _very_ careful consideration of _all_
- * possible code paths.  So at least for now, I am leaving the double logic -
- * better safe than sorry... (AIA)
- *
- * Locking: - The volume lcn bitmap must be unlocked on entry and is unlocked
- *	      on return.
- *	    - This function takes the volume lcn bitmap lock for writing and
- *	      modifies the bitmap contents.
- */
-runlist_element *ntfs_cluster_alloc(ntfs_volume *vol, const VCN start_vcn,
-		const s64 count, const LCN start_lcn,
-		const NTFS_CLUSTER_ALLOCATION_ZONES zone,
-		const bool is_extension)
-{
-	LCN zone_start, zone_end, bmp_pos, bmp_initial_pos, last_read_pos, lcn;
-	LCN prev_lcn = 0, prev_run_len = 0, mft_zone_size;
-	s64 clusters;
-	loff_t i_size;
-	struct inode *lcnbmp_vi;
-	runlist_element *rl = NULL;
-	struct address_space *mapping;
-	struct page *page = NULL;
-	u8 *buf, *byte;
-	int err = 0, rlpos, rlsize, buf_size;
-	u8 pass, done_zones, search_zone, need_writeback = 0, bit;
-
-	ntfs_debug("Entering for start_vcn 0x%llx, count 0x%llx, start_lcn "
-			"0x%llx, zone %s_ZONE.", (unsigned long long)start_vcn,
-			(unsigned long long)count,
-			(unsigned long long)start_lcn,
-			zone == MFT_ZONE ? "MFT" : "DATA");
-	BUG_ON(!vol);
-	lcnbmp_vi = vol->lcnbmp_ino;
-	BUG_ON(!lcnbmp_vi);
-	BUG_ON(start_vcn < 0);
-	BUG_ON(count < 0);
-	BUG_ON(start_lcn < -1);
-	BUG_ON(zone < FIRST_ZONE);
-	BUG_ON(zone > LAST_ZONE);
-
-	/* Return NULL if @count is zero. */
-	if (!count)
-		return NULL;
-	/* Take the lcnbmp lock for writing. */
-	down_write(&vol->lcnbmp_lock);
-	/*
-	 * If no specific @start_lcn was requested, use the current data zone
-	 * position, otherwise use the requested @start_lcn but make sure it
-	 * lies outside the mft zone.  Also set done_zones to 0 (no zones done)
-	 * and pass depending on whether we are starting inside a zone (1) or
-	 * at the beginning of a zone (2).  If requesting from the MFT_ZONE,
-	 * we either start at the current position within the mft zone or at
-	 * the specified position.  If the latter is out of bounds then we start
-	 * at the beginning of the MFT_ZONE.
-	 */
-	done_zones = 0;
-	pass = 1;
-	/*
-	 * zone_start and zone_end are the current search range.  search_zone
-	 * is 1 for mft zone, 2 for data zone 1 (end of mft zone till end of
-	 * volume) and 4 for data zone 2 (start of volume till start of mft
-	 * zone).
-	 */
-	zone_start = start_lcn;
-	if (zone_start < 0) {
-		if (zone == DATA_ZONE)
-			zone_start = vol->data1_zone_pos;
-		else
-			zone_start = vol->mft_zone_pos;
-		if (!zone_start) {
-			/*
-			 * Zone starts at beginning of volume which means a
-			 * single pass is sufficient.
-			 */
-			pass = 2;
-		}
-	} else if (zone == DATA_ZONE && zone_start >= vol->mft_zone_start &&
-			zone_start < vol->mft_zone_end) {
-		zone_start = vol->mft_zone_end;
-		/*
-		 * Starting at beginning of data1_zone which means a single
-		 * pass in this zone is sufficient.
-		 */
-		pass = 2;
-	} else if (zone == MFT_ZONE && (zone_start < vol->mft_zone_start ||
-			zone_start >= vol->mft_zone_end)) {
-		zone_start = vol->mft_lcn;
-		if (!vol->mft_zone_end)
-			zone_start = 0;
-		/*
-		 * Starting at beginning of volume which means a single pass
-		 * is sufficient.
-		 */
-		pass = 2;
-	}
-	if (zone == MFT_ZONE) {
-		zone_end = vol->mft_zone_end;
-		search_zone = 1;
-	} else /* if (zone == DATA_ZONE) */ {
-		/* Skip searching the mft zone. */
-		done_zones |= 1;
-		if (zone_start >= vol->mft_zone_end) {
-			zone_end = vol->nr_clusters;
-			search_zone = 2;
-		} else {
-			zone_end = vol->mft_zone_start;
-			search_zone = 4;
-		}
-	}
-	/*
-	 * bmp_pos is the current bit position inside the bitmap.  We use
-	 * bmp_initial_pos to determine whether or not to do a zone switch.
-	 */
-	bmp_pos = bmp_initial_pos = zone_start;
-
-	/* Loop until all clusters are allocated, i.e. clusters == 0. */
-	clusters = count;
-	rlpos = rlsize = 0;
-	mapping = lcnbmp_vi->i_mapping;
-	i_size = i_size_read(lcnbmp_vi);
-	while (1) {
-		ntfs_debug("Start of outer while loop: done_zones 0x%x, "
-				"search_zone %i, pass %i, zone_start 0x%llx, "
-				"zone_end 0x%llx, bmp_initial_pos 0x%llx, "
-				"bmp_pos 0x%llx, rlpos %i, rlsize %i.",
-				done_zones, search_zone, pass,
-				(unsigned long long)zone_start,
-				(unsigned long long)zone_end,
-				(unsigned long long)bmp_initial_pos,
-				(unsigned long long)bmp_pos, rlpos, rlsize);
-		/* Loop until we run out of free clusters. */
-		last_read_pos = bmp_pos >> 3;
-		ntfs_debug("last_read_pos 0x%llx.",
-				(unsigned long long)last_read_pos);
-		if (last_read_pos > i_size) {
-			ntfs_debug("End of attribute reached.  "
-					"Skipping to zone_pass_done.");
-			goto zone_pass_done;
-		}
-		if (likely(page)) {
-			if (need_writeback) {
-				ntfs_debug("Marking page dirty.");
-				flush_dcache_page(page);
-				set_page_dirty(page);
-				need_writeback = 0;
-			}
-			ntfs_unmap_page(page);
-		}
-		page = ntfs_map_page(mapping, last_read_pos >>
-				PAGE_SHIFT);
-		if (IS_ERR(page)) {
-			err = PTR_ERR(page);
-			ntfs_error(vol->sb, "Failed to map page.");
-			goto out;
-		}
-		buf_size = last_read_pos & ~PAGE_MASK;
-		buf = page_address(page) + buf_size;
-		buf_size = PAGE_SIZE - buf_size;
-		if (unlikely(last_read_pos + buf_size > i_size))
-			buf_size = i_size - last_read_pos;
-		buf_size <<= 3;
-		lcn = bmp_pos & 7;
-		bmp_pos &= ~(LCN)7;
-		ntfs_debug("Before inner while loop: buf_size %i, lcn 0x%llx, "
-				"bmp_pos 0x%llx, need_writeback %i.", buf_size,
-				(unsigned long long)lcn,
-				(unsigned long long)bmp_pos, need_writeback);
-		while (lcn < buf_size && lcn + bmp_pos < zone_end) {
-			byte = buf + (lcn >> 3);
-			ntfs_debug("In inner while loop: buf_size %i, "
-					"lcn 0x%llx, bmp_pos 0x%llx, "
-					"need_writeback %i, byte ofs 0x%x, "
-					"*byte 0x%x.", buf_size,
-					(unsigned long long)lcn,
-					(unsigned long long)bmp_pos,
-					need_writeback,
-					(unsigned int)(lcn >> 3),
-					(unsigned int)*byte);
-			/* Skip full bytes. */
-			if (*byte == 0xff) {
-				lcn = (lcn + 8) & ~(LCN)7;
-				ntfs_debug("Continuing while loop 1.");
-				continue;
-			}
-			bit = 1 << (lcn & 7);
-			ntfs_debug("bit 0x%x.", bit);
-			/* If the bit is already set, go onto the next one. */
-			if (*byte & bit) {
-				lcn++;
-				ntfs_debug("Continuing while loop 2.");
-				continue;
-			}
-			/*
-			 * Allocate more memory if needed, including space for
-			 * the terminator element.
-			 * ntfs_malloc_nofs() operates on whole pages only.
-			 */
-			if ((rlpos + 2) * sizeof(*rl) > rlsize) {
-				runlist_element *rl2;
-
-				ntfs_debug("Reallocating memory.");
-				if (!rl)
-					ntfs_debug("First free bit is at LCN "
-							"0x%llx.",
-							(unsigned long long)
-							(lcn + bmp_pos));
-				rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE);
-				if (unlikely(!rl2)) {
-					err = -ENOMEM;
-					ntfs_error(vol->sb, "Failed to "
-							"allocate memory.");
-					goto out;
-				}
-				memcpy(rl2, rl, rlsize);
-				ntfs_free(rl);
-				rl = rl2;
-				rlsize += PAGE_SIZE;
-				ntfs_debug("Reallocated memory, rlsize 0x%x.",
-						rlsize);
-			}
-			/* Allocate the bitmap bit. */
-			*byte |= bit;
-			/* We need to write this bitmap page to disk. */
-			need_writeback = 1;
-			ntfs_debug("*byte 0x%x, need_writeback is set.",
-					(unsigned int)*byte);
-			/*
-			 * Coalesce with previous run if adjacent LCNs.
-			 * Otherwise, append a new run.
-			 */
-			ntfs_debug("Adding run (lcn 0x%llx, len 0x%llx), "
-					"prev_lcn 0x%llx, lcn 0x%llx, "
-					"bmp_pos 0x%llx, prev_run_len 0x%llx, "
-					"rlpos %i.",
-					(unsigned long long)(lcn + bmp_pos),
-					1ULL, (unsigned long long)prev_lcn,
-					(unsigned long long)lcn,
-					(unsigned long long)bmp_pos,
-					(unsigned long long)prev_run_len,
-					rlpos);
-			if (prev_lcn == lcn + bmp_pos - prev_run_len && rlpos) {
-				ntfs_debug("Coalescing to run (lcn 0x%llx, "
-						"len 0x%llx).",
-						(unsigned long long)
-						rl[rlpos - 1].lcn,
-						(unsigned long long)
-						rl[rlpos - 1].length);
-				rl[rlpos - 1].length = ++prev_run_len;
-				ntfs_debug("Run now (lcn 0x%llx, len 0x%llx), "
-						"prev_run_len 0x%llx.",
-						(unsigned long long)
-						rl[rlpos - 1].lcn,
-						(unsigned long long)
-						rl[rlpos - 1].length,
-						(unsigned long long)
-						prev_run_len);
-			} else {
-				if (likely(rlpos)) {
-					ntfs_debug("Adding new run, (previous "
-							"run lcn 0x%llx, "
-							"len 0x%llx).",
-							(unsigned long long)
-							rl[rlpos - 1].lcn,
-							(unsigned long long)
-							rl[rlpos - 1].length);
-					rl[rlpos].vcn = rl[rlpos - 1].vcn +
-							prev_run_len;
-				} else {
-					ntfs_debug("Adding new run, is first "
-							"run.");
-					rl[rlpos].vcn = start_vcn;
-				}
-				rl[rlpos].lcn = prev_lcn = lcn + bmp_pos;
-				rl[rlpos].length = prev_run_len = 1;
-				rlpos++;
-			}
-			/* Done? */
-			if (!--clusters) {
-				LCN tc;
-				/*
-				 * Update the current zone position.  Positions
-				 * of already scanned zones have been updated
-				 * during the respective zone switches.
-				 */
-				tc = lcn + bmp_pos + 1;
-				ntfs_debug("Done. Updating current zone "
-						"position, tc 0x%llx, "
-						"search_zone %i.",
-						(unsigned long long)tc,
-						search_zone);
-				switch (search_zone) {
-				case 1:
-					ntfs_debug("Before checks, "
-							"vol->mft_zone_pos "
-							"0x%llx.",
-							(unsigned long long)
-							vol->mft_zone_pos);
-					if (tc >= vol->mft_zone_end) {
-						vol->mft_zone_pos =
-								vol->mft_lcn;
-						if (!vol->mft_zone_end)
-							vol->mft_zone_pos = 0;
-					} else if ((bmp_initial_pos >=
-							vol->mft_zone_pos ||
-							tc > vol->mft_zone_pos)
-							&& tc >= vol->mft_lcn)
-						vol->mft_zone_pos = tc;
-					ntfs_debug("After checks, "
-							"vol->mft_zone_pos "
-							"0x%llx.",
-							(unsigned long long)
-							vol->mft_zone_pos);
-					break;
-				case 2:
-					ntfs_debug("Before checks, "
-							"vol->data1_zone_pos "
-							"0x%llx.",
-							(unsigned long long)
-							vol->data1_zone_pos);
-					if (tc >= vol->nr_clusters)
-						vol->data1_zone_pos =
-							     vol->mft_zone_end;
-					else if ((bmp_initial_pos >=
-						    vol->data1_zone_pos ||
-						    tc > vol->data1_zone_pos)
-						    && tc >= vol->mft_zone_end)
-						vol->data1_zone_pos = tc;
-					ntfs_debug("After checks, "
-							"vol->data1_zone_pos "
-							"0x%llx.",
-							(unsigned long long)
-							vol->data1_zone_pos);
-					break;
-				case 4:
-					ntfs_debug("Before checks, "
-							"vol->data2_zone_pos "
-							"0x%llx.",
-							(unsigned long long)
-							vol->data2_zone_pos);
-					if (tc >= vol->mft_zone_start)
-						vol->data2_zone_pos = 0;
-					else if (bmp_initial_pos >=
-						      vol->data2_zone_pos ||
-						      tc > vol->data2_zone_pos)
-						vol->data2_zone_pos = tc;
-					ntfs_debug("After checks, "
-							"vol->data2_zone_pos "
-							"0x%llx.",
-							(unsigned long long)
-							vol->data2_zone_pos);
-					break;
-				default:
-					BUG();
-				}
-				ntfs_debug("Finished.  Going to out.");
-				goto out;
-			}
-			lcn++;
-		}
-		bmp_pos += buf_size;
-		ntfs_debug("After inner while loop: buf_size 0x%x, lcn "
-				"0x%llx, bmp_pos 0x%llx, need_writeback %i.",
-				buf_size, (unsigned long long)lcn,
-				(unsigned long long)bmp_pos, need_writeback);
-		if (bmp_pos < zone_end) {
-			ntfs_debug("Continuing outer while loop, "
-					"bmp_pos 0x%llx, zone_end 0x%llx.",
-					(unsigned long long)bmp_pos,
-					(unsigned long long)zone_end);
-			continue;
-		}
-zone_pass_done:	/* Finished with the current zone pass. */
-		ntfs_debug("At zone_pass_done, pass %i.", pass);
-		if (pass == 1) {
-			/*
-			 * Now do pass 2, scanning the first part of the zone
-			 * we omitted in pass 1.
-			 */
-			pass = 2;
-			zone_end = zone_start;
-			switch (search_zone) {
-			case 1: /* mft_zone */
-				zone_start = vol->mft_zone_start;
-				break;
-			case 2: /* data1_zone */
-				zone_start = vol->mft_zone_end;
-				break;
-			case 4: /* data2_zone */
-				zone_start = 0;
-				break;
-			default:
-				BUG();
-			}
-			/* Sanity check. */
-			if (zone_end < zone_start)
-				zone_end = zone_start;
-			bmp_pos = zone_start;
-			ntfs_debug("Continuing outer while loop, pass 2, "
-					"zone_start 0x%llx, zone_end 0x%llx, "
-					"bmp_pos 0x%llx.",
-					(unsigned long long)zone_start,
-					(unsigned long long)zone_end,
-					(unsigned long long)bmp_pos);
-			continue;
-		} /* pass == 2 */
-done_zones_check:
-		ntfs_debug("At done_zones_check, search_zone %i, done_zones "
-				"before 0x%x, done_zones after 0x%x.",
-				search_zone, done_zones,
-				done_zones | search_zone);
-		done_zones |= search_zone;
-		if (done_zones < 7) {
-			ntfs_debug("Switching zone.");
-			/* Now switch to the next zone we haven't done yet. */
-			pass = 1;
-			switch (search_zone) {
-			case 1:
-				ntfs_debug("Switching from mft zone to data1 "
-						"zone.");
-				/* Update mft zone position. */
-				if (rlpos) {
-					LCN tc;
-
-					ntfs_debug("Before checks, "
-							"vol->mft_zone_pos "
-							"0x%llx.",
-							(unsigned long long)
-							vol->mft_zone_pos);
-					tc = rl[rlpos - 1].lcn +
-							rl[rlpos - 1].length;
-					if (tc >= vol->mft_zone_end) {
-						vol->mft_zone_pos =
-								vol->mft_lcn;
-						if (!vol->mft_zone_end)
-							vol->mft_zone_pos = 0;
-					} else if ((bmp_initial_pos >=
-							vol->mft_zone_pos ||
-							tc > vol->mft_zone_pos)
-							&& tc >= vol->mft_lcn)
-						vol->mft_zone_pos = tc;
-					ntfs_debug("After checks, "
-							"vol->mft_zone_pos "
-							"0x%llx.",
-							(unsigned long long)
-							vol->mft_zone_pos);
-				}
-				/* Switch from mft zone to data1 zone. */
-switch_to_data1_zone:		search_zone = 2;
-				zone_start = bmp_initial_pos =
-						vol->data1_zone_pos;
-				zone_end = vol->nr_clusters;
-				if (zone_start == vol->mft_zone_end)
-					pass = 2;
-				if (zone_start >= zone_end) {
-					vol->data1_zone_pos = zone_start =
-							vol->mft_zone_end;
-					pass = 2;
-				}
-				break;
-			case 2:
-				ntfs_debug("Switching from data1 zone to "
-						"data2 zone.");
-				/* Update data1 zone position. */
-				if (rlpos) {
-					LCN tc;
-
-					ntfs_debug("Before checks, "
-							"vol->data1_zone_pos "
-							"0x%llx.",
-							(unsigned long long)
-							vol->data1_zone_pos);
-					tc = rl[rlpos - 1].lcn +
-							rl[rlpos - 1].length;
-					if (tc >= vol->nr_clusters)
-						vol->data1_zone_pos =
-							     vol->mft_zone_end;
-					else if ((bmp_initial_pos >=
-						    vol->data1_zone_pos ||
-						    tc > vol->data1_zone_pos)
-						    && tc >= vol->mft_zone_end)
-						vol->data1_zone_pos = tc;
-					ntfs_debug("After checks, "
-							"vol->data1_zone_pos "
-							"0x%llx.",
-							(unsigned long long)
-							vol->data1_zone_pos);
-				}
-				/* Switch from data1 zone to data2 zone. */
-				search_zone = 4;
-				zone_start = bmp_initial_pos =
-						vol->data2_zone_pos;
-				zone_end = vol->mft_zone_start;
-				if (!zone_start)
-					pass = 2;
-				if (zone_start >= zone_end) {
-					vol->data2_zone_pos = zone_start =
-							bmp_initial_pos = 0;
-					pass = 2;
-				}
-				break;
-			case 4:
-				ntfs_debug("Switching from data2 zone to "
-						"data1 zone.");
-				/* Update data2 zone position. */
-				if (rlpos) {
-					LCN tc;
-
-					ntfs_debug("Before checks, "
-							"vol->data2_zone_pos "
-							"0x%llx.",
-							(unsigned long long)
-							vol->data2_zone_pos);
-					tc = rl[rlpos - 1].lcn +
-							rl[rlpos - 1].length;
-					if (tc >= vol->mft_zone_start)
-						vol->data2_zone_pos = 0;
-					else if (bmp_initial_pos >=
-						      vol->data2_zone_pos ||
-						      tc > vol->data2_zone_pos)
-						vol->data2_zone_pos = tc;
-					ntfs_debug("After checks, "
-							"vol->data2_zone_pos "
-							"0x%llx.",
-							(unsigned long long)
-							vol->data2_zone_pos);
-				}
-				/* Switch from data2 zone to data1 zone. */
-				goto switch_to_data1_zone;
-			default:
-				BUG();
-			}
-			ntfs_debug("After zone switch, search_zone %i, "
-					"pass %i, bmp_initial_pos 0x%llx, "
-					"zone_start 0x%llx, zone_end 0x%llx.",
-					search_zone, pass,
-					(unsigned long long)bmp_initial_pos,
-					(unsigned long long)zone_start,
-					(unsigned long long)zone_end);
-			bmp_pos = zone_start;
-			if (zone_start == zone_end) {
-				ntfs_debug("Empty zone, going to "
-						"done_zones_check.");
-				/* Empty zone. Don't bother searching it. */
-				goto done_zones_check;
-			}
-			ntfs_debug("Continuing outer while loop.");
-			continue;
-		} /* done_zones == 7 */
-		ntfs_debug("All zones are finished.");
-		/*
-		 * All zones are finished!  If DATA_ZONE, shrink mft zone.  If
-		 * MFT_ZONE, we have really run out of space.
-		 */
-		mft_zone_size = vol->mft_zone_end - vol->mft_zone_start;
-		ntfs_debug("vol->mft_zone_start 0x%llx, vol->mft_zone_end "
-				"0x%llx, mft_zone_size 0x%llx.",
-				(unsigned long long)vol->mft_zone_start,
-				(unsigned long long)vol->mft_zone_end,
-				(unsigned long long)mft_zone_size);
-		if (zone == MFT_ZONE || mft_zone_size <= 0) {
-			ntfs_debug("No free clusters left, going to out.");
-			/* Really no more space left on device. */
-			err = -ENOSPC;
-			goto out;
-		} /* zone == DATA_ZONE && mft_zone_size > 0 */
-		ntfs_debug("Shrinking mft zone.");
-		zone_end = vol->mft_zone_end;
-		mft_zone_size >>= 1;
-		if (mft_zone_size > 0)
-			vol->mft_zone_end = vol->mft_zone_start + mft_zone_size;
-		else /* mft zone and data2 zone no longer exist. */
-			vol->data2_zone_pos = vol->mft_zone_start =
-					vol->mft_zone_end = 0;
-		if (vol->mft_zone_pos >= vol->mft_zone_end) {
-			vol->mft_zone_pos = vol->mft_lcn;
-			if (!vol->mft_zone_end)
-				vol->mft_zone_pos = 0;
-		}
-		bmp_pos = zone_start = bmp_initial_pos =
-				vol->data1_zone_pos = vol->mft_zone_end;
-		search_zone = 2;
-		pass = 2;
-		done_zones &= ~2;
-		ntfs_debug("After shrinking mft zone, mft_zone_size 0x%llx, "
-				"vol->mft_zone_start 0x%llx, "
-				"vol->mft_zone_end 0x%llx, "
-				"vol->mft_zone_pos 0x%llx, search_zone 2, "
-				"pass 2, dones_zones 0x%x, zone_start 0x%llx, "
-				"zone_end 0x%llx, vol->data1_zone_pos 0x%llx, "
-				"continuing outer while loop.",
-				(unsigned long long)mft_zone_size,
-				(unsigned long long)vol->mft_zone_start,
-				(unsigned long long)vol->mft_zone_end,
-				(unsigned long long)vol->mft_zone_pos,
-				done_zones, (unsigned long long)zone_start,
-				(unsigned long long)zone_end,
-				(unsigned long long)vol->data1_zone_pos);
-	}
-	ntfs_debug("After outer while loop.");
-out:
-	ntfs_debug("At out.");
-	/* Add runlist terminator element. */
-	if (likely(rl)) {
-		rl[rlpos].vcn = rl[rlpos - 1].vcn + rl[rlpos - 1].length;
-		rl[rlpos].lcn = is_extension ? LCN_ENOENT : LCN_RL_NOT_MAPPED;
-		rl[rlpos].length = 0;
-	}
-	if (likely(page && !IS_ERR(page))) {
-		if (need_writeback) {
-			ntfs_debug("Marking page dirty.");
-			flush_dcache_page(page);
-			set_page_dirty(page);
-			need_writeback = 0;
-		}
-		ntfs_unmap_page(page);
-	}
-	if (likely(!err)) {
-		up_write(&vol->lcnbmp_lock);
-		ntfs_debug("Done.");
-		return rl;
-	}
-	ntfs_error(vol->sb, "Failed to allocate clusters, aborting "
-			"(error %i).", err);
-	if (rl) {
-		int err2;
-
-		if (err == -ENOSPC)
-			ntfs_debug("Not enough space to complete allocation, "
-					"err -ENOSPC, first free lcn 0x%llx, "
-					"could allocate up to 0x%llx "
-					"clusters.",
-					(unsigned long long)rl[0].lcn,
-					(unsigned long long)(count - clusters));
-		/* Deallocate all allocated clusters. */
-		ntfs_debug("Attempting rollback...");
-		err2 = ntfs_cluster_free_from_rl_nolock(vol, rl);
-		if (err2) {
-			ntfs_error(vol->sb, "Failed to rollback (error %i).  "
-					"Leaving inconsistent metadata!  "
-					"Unmount and run chkdsk.", err2);
-			NVolSetErrors(vol);
-		}
-		/* Free the runlist. */
-		ntfs_free(rl);
-	} else if (err == -ENOSPC)
-		ntfs_debug("No space left at all, err = -ENOSPC, first free "
-				"lcn = 0x%llx.",
-				(long long)vol->data1_zone_pos);
-	up_write(&vol->lcnbmp_lock);
-	return ERR_PTR(err);
-}
-
-/**
- * __ntfs_cluster_free - free clusters on an ntfs volume
- * @ni:		ntfs inode whose runlist describes the clusters to free
- * @start_vcn:	vcn in the runlist of @ni at which to start freeing clusters
- * @count:	number of clusters to free or -1 for all clusters
- * @ctx:	active attribute search context if present or NULL if not
- * @is_rollback:	true if this is a rollback operation
- *
- * Free @count clusters starting at the cluster @start_vcn in the runlist
- * described by the vfs inode @ni.
- *
- * If @count is -1, all clusters from @start_vcn to the end of the runlist are
- * deallocated.  Thus, to completely free all clusters in a runlist, use
- * @start_vcn = 0 and @count = -1.
- *
- * If @ctx is specified, it is an active search context of @ni and its base mft
- * record.  This is needed when __ntfs_cluster_free() encounters unmapped
- * runlist fragments and allows their mapping.  If you do not have the mft
- * record mapped, you can specify @ctx as NULL and __ntfs_cluster_free() will
- * perform the necessary mapping and unmapping.
- *
- * Note, __ntfs_cluster_free() saves the state of @ctx on entry and restores it
- * before returning.  Thus, @ctx will be left pointing to the same attribute on
- * return as on entry.  However, the actual pointers in @ctx may point to
- * different memory locations on return, so you must remember to reset any
- * cached pointers from the @ctx, i.e. after the call to __ntfs_cluster_free(),
- * you will probably want to do:
- *	m = ctx->mrec;
- *	a = ctx->attr;
- * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
- * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
- *
- * @is_rollback should always be 'false', it is for internal use to rollback
- * errors.  You probably want to use ntfs_cluster_free() instead.
- *
- * Note, __ntfs_cluster_free() does not modify the runlist, so you have to
- * remove from the runlist or mark sparse the freed runs later.
- *
- * Return the number of deallocated clusters (not counting sparse ones) on
- * success and -errno on error.
- *
- * WARNING: If @ctx is supplied, regardless of whether success or failure is
- *	    returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
- *	    is no longer valid, i.e. you need to either call
- *	    ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
- *	    In that case PTR_ERR(@ctx->mrec) will give you the error code for
- *	    why the mapping of the old inode failed.
- *
- * Locking: - The runlist described by @ni must be locked for writing on entry
- *	      and is locked on return.  Note the runlist may be modified when
- *	      needed runlist fragments need to be mapped.
- *	    - The volume lcn bitmap must be unlocked on entry and is unlocked
- *	      on return.
- *	    - This function takes the volume lcn bitmap lock for writing and
- *	      modifies the bitmap contents.
- *	    - If @ctx is NULL, the base mft record of @ni must not be mapped on
- *	      entry and it will be left unmapped on return.
- *	    - If @ctx is not NULL, the base mft record must be mapped on entry
- *	      and it will be left mapped on return.
- */
-s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn, s64 count,
-		ntfs_attr_search_ctx *ctx, const bool is_rollback)
-{
-	s64 delta, to_free, total_freed, real_freed;
-	ntfs_volume *vol;
-	struct inode *lcnbmp_vi;
-	runlist_element *rl;
-	int err;
-
-	BUG_ON(!ni);
-	ntfs_debug("Entering for i_ino 0x%lx, start_vcn 0x%llx, count "
-			"0x%llx.%s", ni->mft_no, (unsigned long long)start_vcn,
-			(unsigned long long)count,
-			is_rollback ? " (rollback)" : "");
-	vol = ni->vol;
-	lcnbmp_vi = vol->lcnbmp_ino;
-	BUG_ON(!lcnbmp_vi);
-	BUG_ON(start_vcn < 0);
-	BUG_ON(count < -1);
-	/*
-	 * Lock the lcn bitmap for writing but only if not rolling back.  We
-	 * must hold the lock all the way including through rollback otherwise
-	 * rollback is not possible because once we have cleared a bit and
-	 * dropped the lock, anyone could have set the bit again, thus
-	 * allocating the cluster for another use.
-	 */
-	if (likely(!is_rollback))
-		down_write(&vol->lcnbmp_lock);
-
-	total_freed = real_freed = 0;
-
-	rl = ntfs_attr_find_vcn_nolock(ni, start_vcn, ctx);
-	if (IS_ERR(rl)) {
-		if (!is_rollback)
-			ntfs_error(vol->sb, "Failed to find first runlist "
-					"element (error %li), aborting.",
-					PTR_ERR(rl));
-		err = PTR_ERR(rl);
-		goto err_out;
-	}
-	if (unlikely(rl->lcn < LCN_HOLE)) {
-		if (!is_rollback)
-			ntfs_error(vol->sb, "First runlist element has "
-					"invalid lcn, aborting.");
-		err = -EIO;
-		goto err_out;
-	}
-	/* Find the starting cluster inside the run that needs freeing. */
-	delta = start_vcn - rl->vcn;
-
-	/* The number of clusters in this run that need freeing. */
-	to_free = rl->length - delta;
-	if (count >= 0 && to_free > count)
-		to_free = count;
-
-	if (likely(rl->lcn >= 0)) {
-		/* Do the actual freeing of the clusters in this run. */
-		err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn + delta,
-				to_free, likely(!is_rollback) ? 0 : 1);
-		if (unlikely(err)) {
-			if (!is_rollback)
-				ntfs_error(vol->sb, "Failed to clear first run "
-						"(error %i), aborting.", err);
-			goto err_out;
-		}
-		/* We have freed @to_free real clusters. */
-		real_freed = to_free;
-	};
-	/* Go to the next run and adjust the number of clusters left to free. */
-	++rl;
-	if (count >= 0)
-		count -= to_free;
-
-	/* Keep track of the total "freed" clusters, including sparse ones. */
-	total_freed = to_free;
-	/*
-	 * Loop over the remaining runs, using @count as a capping value, and
-	 * free them.
-	 */
-	for (; rl->length && count != 0; ++rl) {
-		if (unlikely(rl->lcn < LCN_HOLE)) {
-			VCN vcn;
-
-			/* Attempt to map runlist. */
-			vcn = rl->vcn;
-			rl = ntfs_attr_find_vcn_nolock(ni, vcn, ctx);
-			if (IS_ERR(rl)) {
-				err = PTR_ERR(rl);
-				if (!is_rollback)
-					ntfs_error(vol->sb, "Failed to map "
-							"runlist fragment or "
-							"failed to find "
-							"subsequent runlist "
-							"element.");
-				goto err_out;
-			}
-			if (unlikely(rl->lcn < LCN_HOLE)) {
-				if (!is_rollback)
-					ntfs_error(vol->sb, "Runlist element "
-							"has invalid lcn "
-							"(0x%llx).",
-							(unsigned long long)
-							rl->lcn);
-				err = -EIO;
-				goto err_out;
-			}
-		}
-		/* The number of clusters in this run that need freeing. */
-		to_free = rl->length;
-		if (count >= 0 && to_free > count)
-			to_free = count;
-
-		if (likely(rl->lcn >= 0)) {
-			/* Do the actual freeing of the clusters in the run. */
-			err = ntfs_bitmap_set_bits_in_run(lcnbmp_vi, rl->lcn,
-					to_free, likely(!is_rollback) ? 0 : 1);
-			if (unlikely(err)) {
-				if (!is_rollback)
-					ntfs_error(vol->sb, "Failed to clear "
-							"subsequent run.");
-				goto err_out;
-			}
-			/* We have freed @to_free real clusters. */
-			real_freed += to_free;
-		}
-		/* Adjust the number of clusters left to free. */
-		if (count >= 0)
-			count -= to_free;
-	
-		/* Update the total done clusters. */
-		total_freed += to_free;
-	}
-	if (likely(!is_rollback))
-		up_write(&vol->lcnbmp_lock);
-
-	BUG_ON(count > 0);
-
-	/* We are done.  Return the number of actually freed clusters. */
-	ntfs_debug("Done.");
-	return real_freed;
-err_out:
-	if (is_rollback)
-		return err;
-	/* If no real clusters were freed, no need to rollback. */
-	if (!real_freed) {
-		up_write(&vol->lcnbmp_lock);
-		return err;
-	}
-	/*
-	 * Attempt to rollback and if that succeeds just return the error code.
-	 * If rollback fails, set the volume errors flag, emit an error
-	 * message, and return the error code.
-	 */
-	delta = __ntfs_cluster_free(ni, start_vcn, total_freed, ctx, true);
-	if (delta < 0) {
-		ntfs_error(vol->sb, "Failed to rollback (error %i).  Leaving "
-				"inconsistent metadata!  Unmount and run "
-				"chkdsk.", (int)delta);
-		NVolSetErrors(vol);
-	}
-	up_write(&vol->lcnbmp_lock);
-	ntfs_error(vol->sb, "Aborting (error %i).", err);
-	return err;
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/lcnalloc.h b/fs/ntfs/lcnalloc.h
deleted file mode 100644
index 1589a6d8434b..000000000000
--- a/fs/ntfs/lcnalloc.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * lcnalloc.h - Exports for NTFS kernel cluster (de)allocation.  Part of the
- *		Linux-NTFS project.
- *
- * Copyright (c) 2004-2005 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_LCNALLOC_H
-#define _LINUX_NTFS_LCNALLOC_H
-
-#ifdef NTFS_RW
-
-#include <linux/fs.h>
-
-#include "attrib.h"
-#include "types.h"
-#include "inode.h"
-#include "runlist.h"
-#include "volume.h"
-
-typedef enum {
-	FIRST_ZONE	= 0,	/* For sanity checking. */
-	MFT_ZONE	= 0,	/* Allocate from $MFT zone. */
-	DATA_ZONE	= 1,	/* Allocate from $DATA zone. */
-	LAST_ZONE	= 1,	/* For sanity checking. */
-} NTFS_CLUSTER_ALLOCATION_ZONES;
-
-extern runlist_element *ntfs_cluster_alloc(ntfs_volume *vol,
-		const VCN start_vcn, const s64 count, const LCN start_lcn,
-		const NTFS_CLUSTER_ALLOCATION_ZONES zone,
-		const bool is_extension);
-
-extern s64 __ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
-		s64 count, ntfs_attr_search_ctx *ctx, const bool is_rollback);
-
-/**
- * ntfs_cluster_free - free clusters on an ntfs volume
- * @ni:		ntfs inode whose runlist describes the clusters to free
- * @start_vcn:	vcn in the runlist of @ni at which to start freeing clusters
- * @count:	number of clusters to free or -1 for all clusters
- * @ctx:	active attribute search context if present or NULL if not
- *
- * Free @count clusters starting at the cluster @start_vcn in the runlist
- * described by the ntfs inode @ni.
- *
- * If @count is -1, all clusters from @start_vcn to the end of the runlist are
- * deallocated.  Thus, to completely free all clusters in a runlist, use
- * @start_vcn = 0 and @count = -1.
- *
- * If @ctx is specified, it is an active search context of @ni and its base mft
- * record.  This is needed when ntfs_cluster_free() encounters unmapped runlist
- * fragments and allows their mapping.  If you do not have the mft record
- * mapped, you can specify @ctx as NULL and ntfs_cluster_free() will perform
- * the necessary mapping and unmapping.
- *
- * Note, ntfs_cluster_free() saves the state of @ctx on entry and restores it
- * before returning.  Thus, @ctx will be left pointing to the same attribute on
- * return as on entry.  However, the actual pointers in @ctx may point to
- * different memory locations on return, so you must remember to reset any
- * cached pointers from the @ctx, i.e. after the call to ntfs_cluster_free(),
- * you will probably want to do:
- *	m = ctx->mrec;
- *	a = ctx->attr;
- * Assuming you cache ctx->attr in a variable @a of type ATTR_RECORD * and that
- * you cache ctx->mrec in a variable @m of type MFT_RECORD *.
- *
- * Note, ntfs_cluster_free() does not modify the runlist, so you have to remove
- * from the runlist or mark sparse the freed runs later.
- *
- * Return the number of deallocated clusters (not counting sparse ones) on
- * success and -errno on error.
- *
- * WARNING: If @ctx is supplied, regardless of whether success or failure is
- *	    returned, you need to check IS_ERR(@ctx->mrec) and if 'true' the @ctx
- *	    is no longer valid, i.e. you need to either call
- *	    ntfs_attr_reinit_search_ctx() or ntfs_attr_put_search_ctx() on it.
- *	    In that case PTR_ERR(@ctx->mrec) will give you the error code for
- *	    why the mapping of the old inode failed.
- *
- * Locking: - The runlist described by @ni must be locked for writing on entry
- *	      and is locked on return.  Note the runlist may be modified when
- *	      needed runlist fragments need to be mapped.
- *	    - The volume lcn bitmap must be unlocked on entry and is unlocked
- *	      on return.
- *	    - This function takes the volume lcn bitmap lock for writing and
- *	      modifies the bitmap contents.
- *	    - If @ctx is NULL, the base mft record of @ni must not be mapped on
- *	      entry and it will be left unmapped on return.
- *	    - If @ctx is not NULL, the base mft record must be mapped on entry
- *	      and it will be left mapped on return.
- */
-static inline s64 ntfs_cluster_free(ntfs_inode *ni, const VCN start_vcn,
-		s64 count, ntfs_attr_search_ctx *ctx)
-{
-	return __ntfs_cluster_free(ni, start_vcn, count, ctx, false);
-}
-
-extern int ntfs_cluster_free_from_rl_nolock(ntfs_volume *vol,
-		const runlist_element *rl);
-
-/**
- * ntfs_cluster_free_from_rl - free clusters from runlist
- * @vol:	mounted ntfs volume on which to free the clusters
- * @rl:		runlist describing the clusters to free
- *
- * Free all the clusters described by the runlist @rl on the volume @vol.  In
- * the case of an error being returned, at least some of the clusters were not
- * freed.
- *
- * Return 0 on success and -errno on error.
- *
- * Locking: - This function takes the volume lcn bitmap lock for writing and
- *	      modifies the bitmap contents.
- *	    - The caller must have locked the runlist @rl for reading or
- *	      writing.
- */
-static inline int ntfs_cluster_free_from_rl(ntfs_volume *vol,
-		const runlist_element *rl)
-{
-	int ret;
-
-	down_write(&vol->lcnbmp_lock);
-	ret = ntfs_cluster_free_from_rl_nolock(vol, rl);
-	up_write(&vol->lcnbmp_lock);
-	return ret;
-}
-
-#endif /* NTFS_RW */
-
-#endif /* defined _LINUX_NTFS_LCNALLOC_H */
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
deleted file mode 100644
index 6ce60ffc6ac0..000000000000
--- a/fs/ntfs/logfile.c
+++ /dev/null
@@ -1,849 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * logfile.c - NTFS kernel journal handling. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2002-2007 Anton Altaparmakov
- */
-
-#ifdef NTFS_RW
-
-#include <linux/types.h>
-#include <linux/fs.h>
-#include <linux/highmem.h>
-#include <linux/buffer_head.h>
-#include <linux/bitops.h>
-#include <linux/log2.h>
-#include <linux/bio.h>
-
-#include "attrib.h"
-#include "aops.h"
-#include "debug.h"
-#include "logfile.h"
-#include "malloc.h"
-#include "volume.h"
-#include "ntfs.h"
-
-/**
- * ntfs_check_restart_page_header - check the page header for consistency
- * @vi:		$LogFile inode to which the restart page header belongs
- * @rp:		restart page header to check
- * @pos:	position in @vi at which the restart page header resides
- *
- * Check the restart page header @rp for consistency and return 'true' if it is
- * consistent and 'false' otherwise.
- *
- * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
- * require the full restart page.
- */
-static bool ntfs_check_restart_page_header(struct inode *vi,
-		RESTART_PAGE_HEADER *rp, s64 pos)
-{
-	u32 logfile_system_page_size, logfile_log_page_size;
-	u16 ra_ofs, usa_count, usa_ofs, usa_end = 0;
-	bool have_usa = true;
-
-	ntfs_debug("Entering.");
-	/*
-	 * If the system or log page sizes are smaller than the ntfs block size
-	 * or either is not a power of 2 we cannot handle this log file.
-	 */
-	logfile_system_page_size = le32_to_cpu(rp->system_page_size);
-	logfile_log_page_size = le32_to_cpu(rp->log_page_size);
-	if (logfile_system_page_size < NTFS_BLOCK_SIZE ||
-			logfile_log_page_size < NTFS_BLOCK_SIZE ||
-			logfile_system_page_size &
-			(logfile_system_page_size - 1) ||
-			!is_power_of_2(logfile_log_page_size)) {
-		ntfs_error(vi->i_sb, "$LogFile uses unsupported page size.");
-		return false;
-	}
-	/*
-	 * We must be either at !pos (1st restart page) or at pos = system page
-	 * size (2nd restart page).
-	 */
-	if (pos && pos != logfile_system_page_size) {
-		ntfs_error(vi->i_sb, "Found restart area in incorrect "
-				"position in $LogFile.");
-		return false;
-	}
-	/* We only know how to handle version 1.1. */
-	if (sle16_to_cpu(rp->major_ver) != 1 ||
-			sle16_to_cpu(rp->minor_ver) != 1) {
-		ntfs_error(vi->i_sb, "$LogFile version %i.%i is not "
-				"supported.  (This driver supports version "
-				"1.1 only.)", (int)sle16_to_cpu(rp->major_ver),
-				(int)sle16_to_cpu(rp->minor_ver));
-		return false;
-	}
-	/*
-	 * If chkdsk has been run the restart page may not be protected by an
-	 * update sequence array.
-	 */
-	if (ntfs_is_chkd_record(rp->magic) && !le16_to_cpu(rp->usa_count)) {
-		have_usa = false;
-		goto skip_usa_checks;
-	}
-	/* Verify the size of the update sequence array. */
-	usa_count = 1 + (logfile_system_page_size >> NTFS_BLOCK_SIZE_BITS);
-	if (usa_count != le16_to_cpu(rp->usa_count)) {
-		ntfs_error(vi->i_sb, "$LogFile restart page specifies "
-				"inconsistent update sequence array count.");
-		return false;
-	}
-	/* Verify the position of the update sequence array. */
-	usa_ofs = le16_to_cpu(rp->usa_ofs);
-	usa_end = usa_ofs + usa_count * sizeof(u16);
-	if (usa_ofs < sizeof(RESTART_PAGE_HEADER) ||
-			usa_end > NTFS_BLOCK_SIZE - sizeof(u16)) {
-		ntfs_error(vi->i_sb, "$LogFile restart page specifies "
-				"inconsistent update sequence array offset.");
-		return false;
-	}
-skip_usa_checks:
-	/*
-	 * Verify the position of the restart area.  It must be:
-	 *	- aligned to 8-byte boundary,
-	 *	- after the update sequence array, and
-	 *	- within the system page size.
-	 */
-	ra_ofs = le16_to_cpu(rp->restart_area_offset);
-	if (ra_ofs & 7 || (have_usa ? ra_ofs < usa_end :
-			ra_ofs < sizeof(RESTART_PAGE_HEADER)) ||
-			ra_ofs > logfile_system_page_size) {
-		ntfs_error(vi->i_sb, "$LogFile restart page specifies "
-				"inconsistent restart area offset.");
-		return false;
-	}
-	/*
-	 * Only restart pages modified by chkdsk are allowed to have chkdsk_lsn
-	 * set.
-	 */
-	if (!ntfs_is_chkd_record(rp->magic) && sle64_to_cpu(rp->chkdsk_lsn)) {
-		ntfs_error(vi->i_sb, "$LogFile restart page is not modified "
-				"by chkdsk but a chkdsk LSN is specified.");
-		return false;
-	}
-	ntfs_debug("Done.");
-	return true;
-}
-
-/**
- * ntfs_check_restart_area - check the restart area for consistency
- * @vi:		$LogFile inode to which the restart page belongs
- * @rp:		restart page whose restart area to check
- *
- * Check the restart area of the restart page @rp for consistency and return
- * 'true' if it is consistent and 'false' otherwise.
- *
- * This function assumes that the restart page header has already been
- * consistency checked.
- *
- * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
- * require the full restart page.
- */
-static bool ntfs_check_restart_area(struct inode *vi, RESTART_PAGE_HEADER *rp)
-{
-	u64 file_size;
-	RESTART_AREA *ra;
-	u16 ra_ofs, ra_len, ca_ofs;
-	u8 fs_bits;
-
-	ntfs_debug("Entering.");
-	ra_ofs = le16_to_cpu(rp->restart_area_offset);
-	ra = (RESTART_AREA*)((u8*)rp + ra_ofs);
-	/*
-	 * Everything before ra->file_size must be before the first word
-	 * protected by an update sequence number.  This ensures that it is
-	 * safe to access ra->client_array_offset.
-	 */
-	if (ra_ofs + offsetof(RESTART_AREA, file_size) >
-			NTFS_BLOCK_SIZE - sizeof(u16)) {
-		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
-				"inconsistent file offset.");
-		return false;
-	}
-	/*
-	 * Now that we can access ra->client_array_offset, make sure everything
-	 * up to the log client array is before the first word protected by an
-	 * update sequence number.  This ensures we can access all of the
-	 * restart area elements safely.  Also, the client array offset must be
-	 * aligned to an 8-byte boundary.
-	 */
-	ca_ofs = le16_to_cpu(ra->client_array_offset);
-	if (((ca_ofs + 7) & ~7) != ca_ofs ||
-			ra_ofs + ca_ofs > NTFS_BLOCK_SIZE - sizeof(u16)) {
-		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
-				"inconsistent client array offset.");
-		return false;
-	}
-	/*
-	 * The restart area must end within the system page size both when
-	 * calculated manually and as specified by ra->restart_area_length.
-	 * Also, the calculated length must not exceed the specified length.
-	 */
-	ra_len = ca_ofs + le16_to_cpu(ra->log_clients) *
-			sizeof(LOG_CLIENT_RECORD);
-	if (ra_ofs + ra_len > le32_to_cpu(rp->system_page_size) ||
-			ra_ofs + le16_to_cpu(ra->restart_area_length) >
-			le32_to_cpu(rp->system_page_size) ||
-			ra_len > le16_to_cpu(ra->restart_area_length)) {
-		ntfs_error(vi->i_sb, "$LogFile restart area is out of bounds "
-				"of the system page size specified by the "
-				"restart page header and/or the specified "
-				"restart area length is inconsistent.");
-		return false;
-	}
-	/*
-	 * The ra->client_free_list and ra->client_in_use_list must be either
-	 * LOGFILE_NO_CLIENT or less than ra->log_clients or they are
-	 * overflowing the client array.
-	 */
-	if ((ra->client_free_list != LOGFILE_NO_CLIENT &&
-			le16_to_cpu(ra->client_free_list) >=
-			le16_to_cpu(ra->log_clients)) ||
-			(ra->client_in_use_list != LOGFILE_NO_CLIENT &&
-			le16_to_cpu(ra->client_in_use_list) >=
-			le16_to_cpu(ra->log_clients))) {
-		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
-				"overflowing client free and/or in use lists.");
-		return false;
-	}
-	/*
-	 * Check ra->seq_number_bits against ra->file_size for consistency.
-	 * We cannot just use ffs() because the file size is not a power of 2.
-	 */
-	file_size = (u64)sle64_to_cpu(ra->file_size);
-	fs_bits = 0;
-	while (file_size) {
-		file_size >>= 1;
-		fs_bits++;
-	}
-	if (le32_to_cpu(ra->seq_number_bits) != 67 - fs_bits) {
-		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
-				"inconsistent sequence number bits.");
-		return false;
-	}
-	/* The log record header length must be a multiple of 8. */
-	if (((le16_to_cpu(ra->log_record_header_length) + 7) & ~7) !=
-			le16_to_cpu(ra->log_record_header_length)) {
-		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
-				"inconsistent log record header length.");
-		return false;
-	}
-	/* Dito for the log page data offset. */
-	if (((le16_to_cpu(ra->log_page_data_offset) + 7) & ~7) !=
-			le16_to_cpu(ra->log_page_data_offset)) {
-		ntfs_error(vi->i_sb, "$LogFile restart area specifies "
-				"inconsistent log page data offset.");
-		return false;
-	}
-	ntfs_debug("Done.");
-	return true;
-}
-
-/**
- * ntfs_check_log_client_array - check the log client array for consistency
- * @vi:		$LogFile inode to which the restart page belongs
- * @rp:		restart page whose log client array to check
- *
- * Check the log client array of the restart page @rp for consistency and
- * return 'true' if it is consistent and 'false' otherwise.
- *
- * This function assumes that the restart page header and the restart area have
- * already been consistency checked.
- *
- * Unlike ntfs_check_restart_page_header() and ntfs_check_restart_area(), this
- * function needs @rp->system_page_size bytes in @rp, i.e. it requires the full
- * restart page and the page must be multi sector transfer deprotected.
- */
-static bool ntfs_check_log_client_array(struct inode *vi,
-		RESTART_PAGE_HEADER *rp)
-{
-	RESTART_AREA *ra;
-	LOG_CLIENT_RECORD *ca, *cr;
-	u16 nr_clients, idx;
-	bool in_free_list, idx_is_first;
-
-	ntfs_debug("Entering.");
-	ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
-	ca = (LOG_CLIENT_RECORD*)((u8*)ra +
-			le16_to_cpu(ra->client_array_offset));
-	/*
-	 * Check the ra->client_free_list first and then check the
-	 * ra->client_in_use_list.  Check each of the log client records in
-	 * each of the lists and check that the array does not overflow the
-	 * ra->log_clients value.  Also keep track of the number of records
-	 * visited as there cannot be more than ra->log_clients records and
-	 * that way we detect eventual loops in within a list.
-	 */
-	nr_clients = le16_to_cpu(ra->log_clients);
-	idx = le16_to_cpu(ra->client_free_list);
-	in_free_list = true;
-check_list:
-	for (idx_is_first = true; idx != LOGFILE_NO_CLIENT_CPU; nr_clients--,
-			idx = le16_to_cpu(cr->next_client)) {
-		if (!nr_clients || idx >= le16_to_cpu(ra->log_clients))
-			goto err_out;
-		/* Set @cr to the current log client record. */
-		cr = ca + idx;
-		/* The first log client record must not have a prev_client. */
-		if (idx_is_first) {
-			if (cr->prev_client != LOGFILE_NO_CLIENT)
-				goto err_out;
-			idx_is_first = false;
-		}
-	}
-	/* Switch to and check the in use list if we just did the free list. */
-	if (in_free_list) {
-		in_free_list = false;
-		idx = le16_to_cpu(ra->client_in_use_list);
-		goto check_list;
-	}
-	ntfs_debug("Done.");
-	return true;
-err_out:
-	ntfs_error(vi->i_sb, "$LogFile log client array is corrupt.");
-	return false;
-}
-
-/**
- * ntfs_check_and_load_restart_page - check the restart page for consistency
- * @vi:		$LogFile inode to which the restart page belongs
- * @rp:		restart page to check
- * @pos:	position in @vi at which the restart page resides
- * @wrp:	[OUT] copy of the multi sector transfer deprotected restart page
- * @lsn:	[OUT] set to the current logfile lsn on success
- *
- * Check the restart page @rp for consistency and return 0 if it is consistent
- * and -errno otherwise.  The restart page may have been modified by chkdsk in
- * which case its magic is CHKD instead of RSTR.
- *
- * This function only needs NTFS_BLOCK_SIZE bytes in @rp, i.e. it does not
- * require the full restart page.
- *
- * If @wrp is not NULL, on success, *@wrp will point to a buffer containing a
- * copy of the complete multi sector transfer deprotected page.  On failure,
- * *@wrp is undefined.
- *
- * Simillarly, if @lsn is not NULL, on success *@lsn will be set to the current
- * logfile lsn according to this restart page.  On failure, *@lsn is undefined.
- *
- * The following error codes are defined:
- *	-EINVAL	- The restart page is inconsistent.
- *	-ENOMEM	- Not enough memory to load the restart page.
- *	-EIO	- Failed to reading from $LogFile.
- */
-static int ntfs_check_and_load_restart_page(struct inode *vi,
-		RESTART_PAGE_HEADER *rp, s64 pos, RESTART_PAGE_HEADER **wrp,
-		LSN *lsn)
-{
-	RESTART_AREA *ra;
-	RESTART_PAGE_HEADER *trp;
-	int size, err;
-
-	ntfs_debug("Entering.");
-	/* Check the restart page header for consistency. */
-	if (!ntfs_check_restart_page_header(vi, rp, pos)) {
-		/* Error output already done inside the function. */
-		return -EINVAL;
-	}
-	/* Check the restart area for consistency. */
-	if (!ntfs_check_restart_area(vi, rp)) {
-		/* Error output already done inside the function. */
-		return -EINVAL;
-	}
-	ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
-	/*
-	 * Allocate a buffer to store the whole restart page so we can multi
-	 * sector transfer deprotect it.
-	 */
-	trp = ntfs_malloc_nofs(le32_to_cpu(rp->system_page_size));
-	if (!trp) {
-		ntfs_error(vi->i_sb, "Failed to allocate memory for $LogFile "
-				"restart page buffer.");
-		return -ENOMEM;
-	}
-	/*
-	 * Read the whole of the restart page into the buffer.  If it fits
-	 * completely inside @rp, just copy it from there.  Otherwise map all
-	 * the required pages and copy the data from them.
-	 */
-	size = PAGE_SIZE - (pos & ~PAGE_MASK);
-	if (size >= le32_to_cpu(rp->system_page_size)) {
-		memcpy(trp, rp, le32_to_cpu(rp->system_page_size));
-	} else {
-		pgoff_t idx;
-		struct page *page;
-		int have_read, to_read;
-
-		/* First copy what we already have in @rp. */
-		memcpy(trp, rp, size);
-		/* Copy the remaining data one page at a time. */
-		have_read = size;
-		to_read = le32_to_cpu(rp->system_page_size) - size;
-		idx = (pos + size) >> PAGE_SHIFT;
-		BUG_ON((pos + size) & ~PAGE_MASK);
-		do {
-			page = ntfs_map_page(vi->i_mapping, idx);
-			if (IS_ERR(page)) {
-				ntfs_error(vi->i_sb, "Error mapping $LogFile "
-						"page (index %lu).", idx);
-				err = PTR_ERR(page);
-				if (err != -EIO && err != -ENOMEM)
-					err = -EIO;
-				goto err_out;
-			}
-			size = min_t(int, to_read, PAGE_SIZE);
-			memcpy((u8*)trp + have_read, page_address(page), size);
-			ntfs_unmap_page(page);
-			have_read += size;
-			to_read -= size;
-			idx++;
-		} while (to_read > 0);
-	}
-	/*
-	 * Perform the multi sector transfer deprotection on the buffer if the
-	 * restart page is protected.
-	 */
-	if ((!ntfs_is_chkd_record(trp->magic) || le16_to_cpu(trp->usa_count))
-			&& post_read_mst_fixup((NTFS_RECORD*)trp,
-			le32_to_cpu(rp->system_page_size))) {
-		/*
-		 * A multi sector tranfer error was detected.  We only need to
-		 * abort if the restart page contents exceed the multi sector
-		 * transfer fixup of the first sector.
-		 */
-		if (le16_to_cpu(rp->restart_area_offset) +
-				le16_to_cpu(ra->restart_area_length) >
-				NTFS_BLOCK_SIZE - sizeof(u16)) {
-			ntfs_error(vi->i_sb, "Multi sector transfer error "
-					"detected in $LogFile restart page.");
-			err = -EINVAL;
-			goto err_out;
-		}
-	}
-	/*
-	 * If the restart page is modified by chkdsk or there are no active
-	 * logfile clients, the logfile is consistent.  Otherwise, need to
-	 * check the log client records for consistency, too.
-	 */
-	err = 0;
-	if (ntfs_is_rstr_record(rp->magic) &&
-			ra->client_in_use_list != LOGFILE_NO_CLIENT) {
-		if (!ntfs_check_log_client_array(vi, trp)) {
-			err = -EINVAL;
-			goto err_out;
-		}
-	}
-	if (lsn) {
-		if (ntfs_is_rstr_record(rp->magic))
-			*lsn = sle64_to_cpu(ra->current_lsn);
-		else /* if (ntfs_is_chkd_record(rp->magic)) */
-			*lsn = sle64_to_cpu(rp->chkdsk_lsn);
-	}
-	ntfs_debug("Done.");
-	if (wrp)
-		*wrp = trp;
-	else {
-err_out:
-		ntfs_free(trp);
-	}
-	return err;
-}
-
-/**
- * ntfs_check_logfile - check the journal for consistency
- * @log_vi:	struct inode of loaded journal $LogFile to check
- * @rp:		[OUT] on success this is a copy of the current restart page
- *
- * Check the $LogFile journal for consistency and return 'true' if it is
- * consistent and 'false' if not.  On success, the current restart page is
- * returned in *@rp.  Caller must call ntfs_free(*@rp) when finished with it.
- *
- * At present we only check the two restart pages and ignore the log record
- * pages.
- *
- * Note that the MstProtected flag is not set on the $LogFile inode and hence
- * when reading pages they are not deprotected.  This is because we do not know
- * if the $LogFile was created on a system with a different page size to ours
- * yet and mst deprotection would fail if our page size is smaller.
- */
-bool ntfs_check_logfile(struct inode *log_vi, RESTART_PAGE_HEADER **rp)
-{
-	s64 size, pos;
-	LSN rstr1_lsn, rstr2_lsn;
-	ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
-	struct address_space *mapping = log_vi->i_mapping;
-	struct page *page = NULL;
-	u8 *kaddr = NULL;
-	RESTART_PAGE_HEADER *rstr1_ph = NULL;
-	RESTART_PAGE_HEADER *rstr2_ph = NULL;
-	int log_page_size, err;
-	bool logfile_is_empty = true;
-	u8 log_page_bits;
-
-	ntfs_debug("Entering.");
-	/* An empty $LogFile must have been clean before it got emptied. */
-	if (NVolLogFileEmpty(vol))
-		goto is_empty;
-	size = i_size_read(log_vi);
-	/* Make sure the file doesn't exceed the maximum allowed size. */
-	if (size > MaxLogFileSize)
-		size = MaxLogFileSize;
-	/*
-	 * Truncate size to a multiple of the page cache size or the default
-	 * log page size if the page cache size is between the default log page
-	 * log page size if the page cache size is between the default log page
-	 * size and twice that.
-	 */
-	if (PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <=
-			DefaultLogPageSize * 2)
-		log_page_size = DefaultLogPageSize;
-	else
-		log_page_size = PAGE_SIZE;
-	/*
-	 * Use ntfs_ffs() instead of ffs() to enable the compiler to
-	 * optimize log_page_size and log_page_bits into constants.
-	 */
-	log_page_bits = ntfs_ffs(log_page_size) - 1;
-	size &= ~(s64)(log_page_size - 1);
-	/*
-	 * Ensure the log file is big enough to store at least the two restart
-	 * pages and the minimum number of log record pages.
-	 */
-	if (size < log_page_size * 2 || (size - log_page_size * 2) >>
-			log_page_bits < MinLogRecordPages) {
-		ntfs_error(vol->sb, "$LogFile is too small.");
-		return false;
-	}
-	/*
-	 * Read through the file looking for a restart page.  Since the restart
-	 * page header is at the beginning of a page we only need to search at
-	 * what could be the beginning of a page (for each page size) rather
-	 * than scanning the whole file byte by byte.  If all potential places
-	 * contain empty and uninitialzed records, the log file can be assumed
-	 * to be empty.
-	 */
-	for (pos = 0; pos < size; pos <<= 1) {
-		pgoff_t idx = pos >> PAGE_SHIFT;
-		if (!page || page->index != idx) {
-			if (page)
-				ntfs_unmap_page(page);
-			page = ntfs_map_page(mapping, idx);
-			if (IS_ERR(page)) {
-				ntfs_error(vol->sb, "Error mapping $LogFile "
-						"page (index %lu).", idx);
-				goto err_out;
-			}
-		}
-		kaddr = (u8*)page_address(page) + (pos & ~PAGE_MASK);
-		/*
-		 * A non-empty block means the logfile is not empty while an
-		 * empty block after a non-empty block has been encountered
-		 * means we are done.
-		 */
-		if (!ntfs_is_empty_recordp((le32*)kaddr))
-			logfile_is_empty = false;
-		else if (!logfile_is_empty)
-			break;
-		/*
-		 * A log record page means there cannot be a restart page after
-		 * this so no need to continue searching.
-		 */
-		if (ntfs_is_rcrd_recordp((le32*)kaddr))
-			break;
-		/* If not a (modified by chkdsk) restart page, continue. */
-		if (!ntfs_is_rstr_recordp((le32*)kaddr) &&
-				!ntfs_is_chkd_recordp((le32*)kaddr)) {
-			if (!pos)
-				pos = NTFS_BLOCK_SIZE >> 1;
-			continue;
-		}
-		/*
-		 * Check the (modified by chkdsk) restart page for consistency
-		 * and get a copy of the complete multi sector transfer
-		 * deprotected restart page.
-		 */
-		err = ntfs_check_and_load_restart_page(log_vi,
-				(RESTART_PAGE_HEADER*)kaddr, pos,
-				!rstr1_ph ? &rstr1_ph : &rstr2_ph,
-				!rstr1_ph ? &rstr1_lsn : &rstr2_lsn);
-		if (!err) {
-			/*
-			 * If we have now found the first (modified by chkdsk)
-			 * restart page, continue looking for the second one.
-			 */
-			if (!pos) {
-				pos = NTFS_BLOCK_SIZE >> 1;
-				continue;
-			}
-			/*
-			 * We have now found the second (modified by chkdsk)
-			 * restart page, so we can stop looking.
-			 */
-			break;
-		}
-		/*
-		 * Error output already done inside the function.  Note, we do
-		 * not abort if the restart page was invalid as we might still
-		 * find a valid one further in the file.
-		 */
-		if (err != -EINVAL) {
-			ntfs_unmap_page(page);
-			goto err_out;
-		}
-		/* Continue looking. */
-		if (!pos)
-			pos = NTFS_BLOCK_SIZE >> 1;
-	}
-	if (page)
-		ntfs_unmap_page(page);
-	if (logfile_is_empty) {
-		NVolSetLogFileEmpty(vol);
-is_empty:
-		ntfs_debug("Done.  ($LogFile is empty.)");
-		return true;
-	}
-	if (!rstr1_ph) {
-		BUG_ON(rstr2_ph);
-		ntfs_error(vol->sb, "Did not find any restart pages in "
-				"$LogFile and it was not empty.");
-		return false;
-	}
-	/* If both restart pages were found, use the more recent one. */
-	if (rstr2_ph) {
-		/*
-		 * If the second restart area is more recent, switch to it.
-		 * Otherwise just throw it away.
-		 */
-		if (rstr2_lsn > rstr1_lsn) {
-			ntfs_debug("Using second restart page as it is more "
-					"recent.");
-			ntfs_free(rstr1_ph);
-			rstr1_ph = rstr2_ph;
-			/* rstr1_lsn = rstr2_lsn; */
-		} else {
-			ntfs_debug("Using first restart page as it is more "
-					"recent.");
-			ntfs_free(rstr2_ph);
-		}
-		rstr2_ph = NULL;
-	}
-	/* All consistency checks passed. */
-	if (rp)
-		*rp = rstr1_ph;
-	else
-		ntfs_free(rstr1_ph);
-	ntfs_debug("Done.");
-	return true;
-err_out:
-	if (rstr1_ph)
-		ntfs_free(rstr1_ph);
-	return false;
-}
-
-/**
- * ntfs_is_logfile_clean - check in the journal if the volume is clean
- * @log_vi:	struct inode of loaded journal $LogFile to check
- * @rp:		copy of the current restart page
- *
- * Analyze the $LogFile journal and return 'true' if it indicates the volume was
- * shutdown cleanly and 'false' if not.
- *
- * At present we only look at the two restart pages and ignore the log record
- * pages.  This is a little bit crude in that there will be a very small number
- * of cases where we think that a volume is dirty when in fact it is clean.
- * This should only affect volumes that have not been shutdown cleanly but did
- * not have any pending, non-check-pointed i/o, i.e. they were completely idle
- * at least for the five seconds preceding the unclean shutdown.
- *
- * This function assumes that the $LogFile journal has already been consistency
- * checked by a call to ntfs_check_logfile() and in particular if the $LogFile
- * is empty this function requires that NVolLogFileEmpty() is true otherwise an
- * empty volume will be reported as dirty.
- */
-bool ntfs_is_logfile_clean(struct inode *log_vi, const RESTART_PAGE_HEADER *rp)
-{
-	ntfs_volume *vol = NTFS_SB(log_vi->i_sb);
-	RESTART_AREA *ra;
-
-	ntfs_debug("Entering.");
-	/* An empty $LogFile must have been clean before it got emptied. */
-	if (NVolLogFileEmpty(vol)) {
-		ntfs_debug("Done.  ($LogFile is empty.)");
-		return true;
-	}
-	BUG_ON(!rp);
-	if (!ntfs_is_rstr_record(rp->magic) &&
-			!ntfs_is_chkd_record(rp->magic)) {
-		ntfs_error(vol->sb, "Restart page buffer is invalid.  This is "
-				"probably a bug in that the $LogFile should "
-				"have been consistency checked before calling "
-				"this function.");
-		return false;
-	}
-	ra = (RESTART_AREA*)((u8*)rp + le16_to_cpu(rp->restart_area_offset));
-	/*
-	 * If the $LogFile has active clients, i.e. it is open, and we do not
-	 * have the RESTART_VOLUME_IS_CLEAN bit set in the restart area flags,
-	 * we assume there was an unclean shutdown.
-	 */
-	if (ra->client_in_use_list != LOGFILE_NO_CLIENT &&
-			!(ra->flags & RESTART_VOLUME_IS_CLEAN)) {
-		ntfs_debug("Done.  $LogFile indicates a dirty shutdown.");
-		return false;
-	}
-	/* $LogFile indicates a clean shutdown. */
-	ntfs_debug("Done.  $LogFile indicates a clean shutdown.");
-	return true;
-}
-
-/**
- * ntfs_empty_logfile - empty the contents of the $LogFile journal
- * @log_vi:	struct inode of loaded journal $LogFile to empty
- *
- * Empty the contents of the $LogFile journal @log_vi and return 'true' on
- * success and 'false' on error.
- *
- * This function assumes that the $LogFile journal has already been consistency
- * checked by a call to ntfs_check_logfile() and that ntfs_is_logfile_clean()
- * has been used to ensure that the $LogFile is clean.
- */
-bool ntfs_empty_logfile(struct inode *log_vi)
-{
-	VCN vcn, end_vcn;
-	ntfs_inode *log_ni = NTFS_I(log_vi);
-	ntfs_volume *vol = log_ni->vol;
-	struct super_block *sb = vol->sb;
-	runlist_element *rl;
-	unsigned long flags;
-	unsigned block_size, block_size_bits;
-	int err;
-	bool should_wait = true;
-
-	ntfs_debug("Entering.");
-	if (NVolLogFileEmpty(vol)) {
-		ntfs_debug("Done.");
-		return true;
-	}
-	/*
-	 * We cannot use ntfs_attr_set() because we may be still in the middle
-	 * of a mount operation.  Thus we do the emptying by hand by first
-	 * zapping the page cache pages for the $LogFile/$DATA attribute and
-	 * then emptying each of the buffers in each of the clusters specified
-	 * by the runlist by hand.
-	 */
-	block_size = sb->s_blocksize;
-	block_size_bits = sb->s_blocksize_bits;
-	vcn = 0;
-	read_lock_irqsave(&log_ni->size_lock, flags);
-	end_vcn = (log_ni->initialized_size + vol->cluster_size_mask) >>
-			vol->cluster_size_bits;
-	read_unlock_irqrestore(&log_ni->size_lock, flags);
-	truncate_inode_pages(log_vi->i_mapping, 0);
-	down_write(&log_ni->runlist.lock);
-	rl = log_ni->runlist.rl;
-	if (unlikely(!rl || vcn < rl->vcn || !rl->length)) {
-map_vcn:
-		err = ntfs_map_runlist_nolock(log_ni, vcn, NULL);
-		if (err) {
-			ntfs_error(sb, "Failed to map runlist fragment (error "
-					"%d).", -err);
-			goto err;
-		}
-		rl = log_ni->runlist.rl;
-		BUG_ON(!rl || vcn < rl->vcn || !rl->length);
-	}
-	/* Seek to the runlist element containing @vcn. */
-	while (rl->length && vcn >= rl[1].vcn)
-		rl++;
-	do {
-		LCN lcn;
-		sector_t block, end_block;
-		s64 len;
-
-		/*
-		 * If this run is not mapped map it now and start again as the
-		 * runlist will have been updated.
-		 */
-		lcn = rl->lcn;
-		if (unlikely(lcn == LCN_RL_NOT_MAPPED)) {
-			vcn = rl->vcn;
-			goto map_vcn;
-		}
-		/* If this run is not valid abort with an error. */
-		if (unlikely(!rl->length || lcn < LCN_HOLE))
-			goto rl_err;
-		/* Skip holes. */
-		if (lcn == LCN_HOLE)
-			continue;
-		block = lcn << vol->cluster_size_bits >> block_size_bits;
-		len = rl->length;
-		if (rl[1].vcn > end_vcn)
-			len = end_vcn - rl->vcn;
-		end_block = (lcn + len) << vol->cluster_size_bits >>
-				block_size_bits;
-		/* Iterate over the blocks in the run and empty them. */
-		do {
-			struct buffer_head *bh;
-
-			/* Obtain the buffer, possibly not uptodate. */
-			bh = sb_getblk(sb, block);
-			BUG_ON(!bh);
-			/* Setup buffer i/o submission. */
-			lock_buffer(bh);
-			bh->b_end_io = end_buffer_write_sync;
-			get_bh(bh);
-			/* Set the entire contents of the buffer to 0xff. */
-			memset(bh->b_data, -1, block_size);
-			if (!buffer_uptodate(bh))
-				set_buffer_uptodate(bh);
-			if (buffer_dirty(bh))
-				clear_buffer_dirty(bh);
-			/*
-			 * Submit the buffer and wait for i/o to complete but
-			 * only for the first buffer so we do not miss really
-			 * serious i/o errors.  Once the first buffer has
-			 * completed ignore errors afterwards as we can assume
-			 * that if one buffer worked all of them will work.
-			 */
-			submit_bh(REQ_OP_WRITE, bh);
-			if (should_wait) {
-				should_wait = false;
-				wait_on_buffer(bh);
-				if (unlikely(!buffer_uptodate(bh)))
-					goto io_err;
-			}
-			brelse(bh);
-		} while (++block < end_block);
-	} while ((++rl)->vcn < end_vcn);
-	up_write(&log_ni->runlist.lock);
-	/*
-	 * Zap the pages again just in case any got instantiated whilst we were
-	 * emptying the blocks by hand.  FIXME: We may not have completed
-	 * writing to all the buffer heads yet so this may happen too early.
-	 * We really should use a kernel thread to do the emptying
-	 * asynchronously and then we can also set the volume dirty and output
-	 * an error message if emptying should fail.
-	 */
-	truncate_inode_pages(log_vi->i_mapping, 0);
-	/* Set the flag so we do not have to do it again on remount. */
-	NVolSetLogFileEmpty(vol);
-	ntfs_debug("Done.");
-	return true;
-io_err:
-	ntfs_error(sb, "Failed to write buffer.  Unmount and run chkdsk.");
-	goto dirty_err;
-rl_err:
-	ntfs_error(sb, "Runlist is corrupt.  Unmount and run chkdsk.");
-dirty_err:
-	NVolSetErrors(vol);
-	err = -EIO;
-err:
-	up_write(&log_ni->runlist.lock);
-	ntfs_error(sb, "Failed to fill $LogFile with 0xff bytes (error %d).",
-			-err);
-	return false;
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h
deleted file mode 100644
index 429d4909cc72..000000000000
--- a/fs/ntfs/logfile.h
+++ /dev/null
@@ -1,295 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * logfile.h - Defines for NTFS kernel journal ($LogFile) handling.  Part of
- *	       the Linux-NTFS project.
- *
- * Copyright (c) 2000-2005 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_LOGFILE_H
-#define _LINUX_NTFS_LOGFILE_H
-
-#ifdef NTFS_RW
-
-#include <linux/fs.h>
-
-#include "types.h"
-#include "endian.h"
-#include "layout.h"
-
-/*
- * Journal ($LogFile) organization:
- *
- * Two restart areas present in the first two pages (restart pages, one restart
- * area in each page).  When the volume is dismounted they should be identical,
- * except for the update sequence array which usually has a different update
- * sequence number.
- *
- * These are followed by log records organized in pages headed by a log record
- * header going up to log file size.  Not all pages contain log records when a
- * volume is first formatted, but as the volume ages, all records will be used.
- * When the log file fills up, the records at the beginning are purged (by
- * modifying the oldest_lsn to a higher value presumably) and writing begins
- * at the beginning of the file.  Effectively, the log file is viewed as a
- * circular entity.
- *
- * NOTE: Windows NT, 2000, and XP all use log file version 1.1 but they accept
- * versions <= 1.x, including 0.-1.  (Yes, that is a minus one in there!)  We
- * probably only want to support 1.1 as this seems to be the current version
- * and we don't know how that differs from the older versions.  The only
- * exception is if the journal is clean as marked by the two restart pages
- * then it doesn't matter whether we are on an earlier version.  We can just
- * reinitialize the logfile and start again with version 1.1.
- */
-
-/* Some $LogFile related constants. */
-#define MaxLogFileSize		0x100000000ULL
-#define DefaultLogPageSize	4096
-#define MinLogRecordPages	48
-
-/*
- * Log file restart page header (begins the restart area).
- */
-typedef struct {
-/*Ofs*/
-/*  0	NTFS_RECORD; -- Unfolded here as gcc doesn't like unnamed structs. */
-/*  0*/	NTFS_RECORD_TYPE magic;	/* The magic is "RSTR". */
-/*  4*/	le16 usa_ofs;		/* See NTFS_RECORD definition in layout.h.
-				   When creating, set this to be immediately
-				   after this header structure (without any
-				   alignment). */
-/*  6*/	le16 usa_count;		/* See NTFS_RECORD definition in layout.h. */
-
-/*  8*/	leLSN chkdsk_lsn;	/* The last log file sequence number found by
-				   chkdsk.  Only used when the magic is changed
-				   to "CHKD".  Otherwise this is zero. */
-/* 16*/	le32 system_page_size;	/* Byte size of system pages when the log file
-				   was created, has to be >= 512 and a power of
-				   2.  Use this to calculate the required size
-				   of the usa (usa_count) and add it to usa_ofs.
-				   Then verify that the result is less than the
-				   value of the restart_area_offset. */
-/* 20*/	le32 log_page_size;	/* Byte size of log file pages, has to be >=
-				   512 and a power of 2.  The default is 4096
-				   and is used when the system page size is
-				   between 4096 and 8192.  Otherwise this is
-				   set to the system page size instead. */
-/* 24*/	le16 restart_area_offset;/* Byte offset from the start of this header to
-				   the RESTART_AREA.  Value has to be aligned
-				   to 8-byte boundary.  When creating, set this
-				   to be after the usa. */
-/* 26*/	sle16 minor_ver;	/* Log file minor version.  Only check if major
-				   version is 1. */
-/* 28*/	sle16 major_ver;	/* Log file major version.  We only support
-				   version 1.1. */
-/* sizeof() = 30 (0x1e) bytes */
-} __attribute__ ((__packed__)) RESTART_PAGE_HEADER;
-
-/*
- * Constant for the log client indices meaning that there are no client records
- * in this particular client array.  Also inside the client records themselves,
- * this means that there are no client records preceding or following this one.
- */
-#define LOGFILE_NO_CLIENT	cpu_to_le16(0xffff)
-#define LOGFILE_NO_CLIENT_CPU	0xffff
-
-/*
- * These are the so far known RESTART_AREA_* flags (16-bit) which contain
- * information about the log file in which they are present.
- */
-enum {
-	RESTART_VOLUME_IS_CLEAN	= cpu_to_le16(0x0002),
-	RESTART_SPACE_FILLER	= cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */
-} __attribute__ ((__packed__));
-
-typedef le16 RESTART_AREA_FLAGS;
-
-/*
- * Log file restart area record.  The offset of this record is found by adding
- * the offset of the RESTART_PAGE_HEADER to the restart_area_offset value found
- * in it.  See notes at restart_area_offset above.
- */
-typedef struct {
-/*Ofs*/
-/*  0*/	leLSN current_lsn;	/* The current, i.e. last LSN inside the log
-				   when the restart area was last written.
-				   This happens often but what is the interval?
-				   Is it just fixed time or is it every time a
-				   check point is written or somethine else?
-				   On create set to 0. */
-/*  8*/	le16 log_clients;	/* Number of log client records in the array of
-				   log client records which follows this
-				   restart area.  Must be 1.  */
-/* 10*/	le16 client_free_list;	/* The index of the first free log client record
-				   in the array of log client records.
-				   LOGFILE_NO_CLIENT means that there are no
-				   free log client records in the array.
-				   If != LOGFILE_NO_CLIENT, check that
-				   log_clients > client_free_list.  On Win2k
-				   and presumably earlier, on a clean volume
-				   this is != LOGFILE_NO_CLIENT, and it should
-				   be 0, i.e. the first (and only) client
-				   record is free and thus the logfile is
-				   closed and hence clean.  A dirty volume
-				   would have left the logfile open and hence
-				   this would be LOGFILE_NO_CLIENT.  On WinXP
-				   and presumably later, the logfile is always
-				   open, even on clean shutdown so this should
-				   always be LOGFILE_NO_CLIENT. */
-/* 12*/	le16 client_in_use_list;/* The index of the first in-use log client
-				   record in the array of log client records.
-				   LOGFILE_NO_CLIENT means that there are no
-				   in-use log client records in the array.  If
-				   != LOGFILE_NO_CLIENT check that log_clients
-				   > client_in_use_list.  On Win2k and
-				   presumably earlier, on a clean volume this
-				   is LOGFILE_NO_CLIENT, i.e. there are no
-				   client records in use and thus the logfile
-				   is closed and hence clean.  A dirty volume
-				   would have left the logfile open and hence
-				   this would be != LOGFILE_NO_CLIENT, and it
-				   should be 0, i.e. the first (and only)
-				   client record is in use.  On WinXP and
-				   presumably later, the logfile is always
-				   open, even on clean shutdown so this should
-				   always be 0. */
-/* 14*/	RESTART_AREA_FLAGS flags;/* Flags modifying LFS behaviour.  On Win2k
-				   and presumably earlier this is always 0.  On
-				   WinXP and presumably later, if the logfile
-				   was shutdown cleanly, the second bit,
-				   RESTART_VOLUME_IS_CLEAN, is set.  This bit
-				   is cleared when the volume is mounted by
-				   WinXP and set when the volume is dismounted,
-				   thus if the logfile is dirty, this bit is
-				   clear.  Thus we don't need to check the
-				   Windows version to determine if the logfile
-				   is clean.  Instead if the logfile is closed,
-				   we know it must be clean.  If it is open and
-				   this bit is set, we also know it must be
-				   clean.  If on the other hand the logfile is
-				   open and this bit is clear, we can be almost
-				   certain that the logfile is dirty. */
-/* 16*/	le32 seq_number_bits;	/* How many bits to use for the sequence
-				   number.  This is calculated as 67 - the
-				   number of bits required to store the logfile
-				   size in bytes and this can be used in with
-				   the specified file_size as a consistency
-				   check. */
-/* 20*/	le16 restart_area_length;/* Length of the restart area including the
-				   client array.  Following checks required if
-				   version matches.  Otherwise, skip them.
-				   restart_area_offset + restart_area_length
-				   has to be <= system_page_size.  Also,
-				   restart_area_length has to be >=
-				   client_array_offset + (log_clients *
-				   sizeof(log client record)). */
-/* 22*/	le16 client_array_offset;/* Offset from the start of this record to
-				   the first log client record if versions are
-				   matched.  When creating, set this to be
-				   after this restart area structure, aligned
-				   to 8-bytes boundary.  If the versions do not
-				   match, this is ignored and the offset is
-				   assumed to be (sizeof(RESTART_AREA) + 7) &
-				   ~7, i.e. rounded up to first 8-byte
-				   boundary.  Either way, client_array_offset
-				   has to be aligned to an 8-byte boundary.
-				   Also, restart_area_offset +
-				   client_array_offset has to be <= 510.
-				   Finally, client_array_offset + (log_clients
-				   * sizeof(log client record)) has to be <=
-				   system_page_size.  On Win2k and presumably
-				   earlier, this is 0x30, i.e. immediately
-				   following this record.  On WinXP and
-				   presumably later, this is 0x40, i.e. there
-				   are 16 extra bytes between this record and
-				   the client array.  This probably means that
-				   the RESTART_AREA record is actually bigger
-				   in WinXP and later. */
-/* 24*/	sle64 file_size;	/* Usable byte size of the log file.  If the
-				   restart_area_offset + the offset of the
-				   file_size are > 510 then corruption has
-				   occurred.  This is the very first check when
-				   starting with the restart_area as if it
-				   fails it means that some of the above values
-				   will be corrupted by the multi sector
-				   transfer protection.  The file_size has to
-				   be rounded down to be a multiple of the
-				   log_page_size in the RESTART_PAGE_HEADER and
-				   then it has to be at least big enough to
-				   store the two restart pages and 48 (0x30)
-				   log record pages. */
-/* 32*/	le32 last_lsn_data_length;/* Length of data of last LSN, not including
-				   the log record header.  On create set to
-				   0. */
-/* 36*/	le16 log_record_header_length;/* Byte size of the log record header.
-				   If the version matches then check that the
-				   value of log_record_header_length is a
-				   multiple of 8, i.e.
-				   (log_record_header_length + 7) & ~7 ==
-				   log_record_header_length.  When creating set
-				   it to sizeof(LOG_RECORD_HEADER), aligned to
-				   8 bytes. */
-/* 38*/	le16 log_page_data_offset;/* Offset to the start of data in a log record
-				   page.  Must be a multiple of 8.  On create
-				   set it to immediately after the update
-				   sequence array of the log record page. */
-/* 40*/	le32 restart_log_open_count;/* A counter that gets incremented every
-				   time the logfile is restarted which happens
-				   at mount time when the logfile is opened.
-				   When creating set to a random value.  Win2k
-				   sets it to the low 32 bits of the current
-				   system time in NTFS format (see time.h). */
-/* 44*/	le32 reserved;		/* Reserved/alignment to 8-byte boundary. */
-/* sizeof() = 48 (0x30) bytes */
-} __attribute__ ((__packed__)) RESTART_AREA;
-
-/*
- * Log client record.  The offset of this record is found by adding the offset
- * of the RESTART_AREA to the client_array_offset value found in it.
- */
-typedef struct {
-/*Ofs*/
-/*  0*/	leLSN oldest_lsn;	/* Oldest LSN needed by this client.  On create
-				   set to 0. */
-/*  8*/	leLSN client_restart_lsn;/* LSN at which this client needs to restart
-				   the volume, i.e. the current position within
-				   the log file.  At present, if clean this
-				   should = current_lsn in restart area but it
-				   probably also = current_lsn when dirty most
-				   of the time.  At create set to 0. */
-/* 16*/	le16 prev_client;	/* The offset to the previous log client record
-				   in the array of log client records.
-				   LOGFILE_NO_CLIENT means there is no previous
-				   client record, i.e. this is the first one.
-				   This is always LOGFILE_NO_CLIENT. */
-/* 18*/	le16 next_client;	/* The offset to the next log client record in
-				   the array of log client records.
-				   LOGFILE_NO_CLIENT means there are no next
-				   client records, i.e. this is the last one.
-				   This is always LOGFILE_NO_CLIENT. */
-/* 20*/	le16 seq_number;	/* On Win2k and presumably earlier, this is set
-				   to zero every time the logfile is restarted
-				   and it is incremented when the logfile is
-				   closed at dismount time.  Thus it is 0 when
-				   dirty and 1 when clean.  On WinXP and
-				   presumably later, this is always 0. */
-/* 22*/	u8 reserved[6];		/* Reserved/alignment. */
-/* 28*/	le32 client_name_length;/* Length of client name in bytes.  Should
-				   always be 8. */
-/* 32*/	ntfschar client_name[64];/* Name of the client in Unicode.  Should
-				   always be "NTFS" with the remaining bytes
-				   set to 0. */
-/* sizeof() = 160 (0xa0) bytes */
-} __attribute__ ((__packed__)) LOG_CLIENT_RECORD;
-
-extern bool ntfs_check_logfile(struct inode *log_vi,
-		RESTART_PAGE_HEADER **rp);
-
-extern bool ntfs_is_logfile_clean(struct inode *log_vi,
-		const RESTART_PAGE_HEADER *rp);
-
-extern bool ntfs_empty_logfile(struct inode *log_vi);
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_LOGFILE_H */
diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h
deleted file mode 100644
index 7068425735f1..000000000000
--- a/fs/ntfs/malloc.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * malloc.h - NTFS kernel memory handling. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2005 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_MALLOC_H
-#define _LINUX_NTFS_MALLOC_H
-
-#include <linux/vmalloc.h>
-#include <linux/slab.h>
-#include <linux/highmem.h>
-
-/**
- * __ntfs_malloc - allocate memory in multiples of pages
- * @size:	number of bytes to allocate
- * @gfp_mask:	extra flags for the allocator
- *
- * Internal function.  You probably want ntfs_malloc_nofs()...
- *
- * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
- * returns a pointer to the allocated memory.
- *
- * If there was insufficient memory to complete the request, return NULL.
- * Depending on @gfp_mask the allocation may be guaranteed to succeed.
- */
-static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask)
-{
-	if (likely(size <= PAGE_SIZE)) {
-		BUG_ON(!size);
-		/* kmalloc() has per-CPU caches so is faster for now. */
-		return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM);
-		/* return (void *)__get_free_page(gfp_mask); */
-	}
-	if (likely((size >> PAGE_SHIFT) < totalram_pages()))
-		return __vmalloc(size, gfp_mask);
-	return NULL;
-}
-
-/**
- * ntfs_malloc_nofs - allocate memory in multiples of pages
- * @size:	number of bytes to allocate
- *
- * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
- * returns a pointer to the allocated memory.
- *
- * If there was insufficient memory to complete the request, return NULL.
- */
-static inline void *ntfs_malloc_nofs(unsigned long size)
-{
-	return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM);
-}
-
-/**
- * ntfs_malloc_nofs_nofail - allocate memory in multiples of pages
- * @size:	number of bytes to allocate
- *
- * Allocates @size bytes of memory, rounded up to multiples of PAGE_SIZE and
- * returns a pointer to the allocated memory.
- *
- * This function guarantees that the allocation will succeed.  It will sleep
- * for as long as it takes to complete the allocation.
- *
- * If there was insufficient memory to complete the request, return NULL.
- */
-static inline void *ntfs_malloc_nofs_nofail(unsigned long size)
-{
-	return __ntfs_malloc(size, GFP_NOFS | __GFP_HIGHMEM | __GFP_NOFAIL);
-}
-
-static inline void ntfs_free(void *addr)
-{
-	kvfree(addr);
-}
-
-#endif /* _LINUX_NTFS_MALLOC_H */
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
deleted file mode 100644
index 6fd1dc4b08c8..000000000000
--- a/fs/ntfs/mft.c
+++ /dev/null
@@ -1,2907 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * mft.c - NTFS kernel mft record operations. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
- * Copyright (c) 2002 Richard Russon
- */
-
-#include <linux/buffer_head.h>
-#include <linux/slab.h>
-#include <linux/swap.h>
-#include <linux/bio.h>
-
-#include "attrib.h"
-#include "aops.h"
-#include "bitmap.h"
-#include "debug.h"
-#include "dir.h"
-#include "lcnalloc.h"
-#include "malloc.h"
-#include "mft.h"
-#include "ntfs.h"
-
-#define MAX_BHS	(PAGE_SIZE / NTFS_BLOCK_SIZE)
-
-/**
- * map_mft_record_page - map the page in which a specific mft record resides
- * @ni:		ntfs inode whose mft record page to map
- *
- * This maps the page in which the mft record of the ntfs inode @ni is situated
- * and returns a pointer to the mft record within the mapped page.
- *
- * Return value needs to be checked with IS_ERR() and if that is true PTR_ERR()
- * contains the negative error code returned.
- */
-static inline MFT_RECORD *map_mft_record_page(ntfs_inode *ni)
-{
-	loff_t i_size;
-	ntfs_volume *vol = ni->vol;
-	struct inode *mft_vi = vol->mft_ino;
-	struct page *page;
-	unsigned long index, end_index;
-	unsigned ofs;
-
-	BUG_ON(ni->page);
-	/*
-	 * The index into the page cache and the offset within the page cache
-	 * page of the wanted mft record. FIXME: We need to check for
-	 * overflowing the unsigned long, but I don't think we would ever get
-	 * here if the volume was that big...
-	 */
-	index = (u64)ni->mft_no << vol->mft_record_size_bits >>
-			PAGE_SHIFT;
-	ofs = (ni->mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
-
-	i_size = i_size_read(mft_vi);
-	/* The maximum valid index into the page cache for $MFT's data. */
-	end_index = i_size >> PAGE_SHIFT;
-
-	/* If the wanted index is out of bounds the mft record doesn't exist. */
-	if (unlikely(index >= end_index)) {
-		if (index > end_index || (i_size & ~PAGE_MASK) < ofs +
-				vol->mft_record_size) {
-			page = ERR_PTR(-ENOENT);
-			ntfs_error(vol->sb, "Attempt to read mft record 0x%lx, "
-					"which is beyond the end of the mft.  "
-					"This is probably a bug in the ntfs "
-					"driver.", ni->mft_no);
-			goto err_out;
-		}
-	}
-	/* Read, map, and pin the page. */
-	page = ntfs_map_page(mft_vi->i_mapping, index);
-	if (!IS_ERR(page)) {
-		/* Catch multi sector transfer fixup errors. */
-		if (likely(ntfs_is_mft_recordp((le32*)(page_address(page) +
-				ofs)))) {
-			ni->page = page;
-			ni->page_ofs = ofs;
-			return page_address(page) + ofs;
-		}
-		ntfs_error(vol->sb, "Mft record 0x%lx is corrupt.  "
-				"Run chkdsk.", ni->mft_no);
-		ntfs_unmap_page(page);
-		page = ERR_PTR(-EIO);
-		NVolSetErrors(vol);
-	}
-err_out:
-	ni->page = NULL;
-	ni->page_ofs = 0;
-	return (void*)page;
-}
-
-/**
- * map_mft_record - map, pin and lock an mft record
- * @ni:		ntfs inode whose MFT record to map
- *
- * First, take the mrec_lock mutex.  We might now be sleeping, while waiting
- * for the mutex if it was already locked by someone else.
- *
- * The page of the record is mapped using map_mft_record_page() before being
- * returned to the caller.
- *
- * This in turn uses ntfs_map_page() to get the page containing the wanted mft
- * record (it in turn calls read_cache_page() which reads it in from disk if
- * necessary, increments the use count on the page so that it cannot disappear
- * under us and returns a reference to the page cache page).
- *
- * If read_cache_page() invokes ntfs_readpage() to load the page from disk, it
- * sets PG_locked and clears PG_uptodate on the page. Once I/O has completed
- * and the post-read mst fixups on each mft record in the page have been
- * performed, the page gets PG_uptodate set and PG_locked cleared (this is done
- * in our asynchronous I/O completion handler end_buffer_read_mft_async()).
- * ntfs_map_page() waits for PG_locked to become clear and checks if
- * PG_uptodate is set and returns an error code if not. This provides
- * sufficient protection against races when reading/using the page.
- *
- * However there is the write mapping to think about. Doing the above described
- * checking here will be fine, because when initiating the write we will set
- * PG_locked and clear PG_uptodate making sure nobody is touching the page
- * contents. Doing the locking this way means that the commit to disk code in
- * the page cache code paths is automatically sufficiently locked with us as
- * we will not touch a page that has been locked or is not uptodate. The only
- * locking problem then is them locking the page while we are accessing it.
- *
- * So that code will end up having to own the mrec_lock of all mft
- * records/inodes present in the page before I/O can proceed. In that case we
- * wouldn't need to bother with PG_locked and PG_uptodate as nobody will be
- * accessing anything without owning the mrec_lock mutex.  But we do need to
- * use them because of the read_cache_page() invocation and the code becomes so
- * much simpler this way that it is well worth it.
- *
- * The mft record is now ours and we return a pointer to it. You need to check
- * the returned pointer with IS_ERR() and if that is true, PTR_ERR() will return
- * the error code.
- *
- * NOTE: Caller is responsible for setting the mft record dirty before calling
- * unmap_mft_record(). This is obviously only necessary if the caller really
- * modified the mft record...
- * Q: Do we want to recycle one of the VFS inode state bits instead?
- * A: No, the inode ones mean we want to change the mft record, not we want to
- * write it out.
- */
-MFT_RECORD *map_mft_record(ntfs_inode *ni)
-{
-	MFT_RECORD *m;
-
-	ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
-
-	/* Make sure the ntfs inode doesn't go away. */
-	atomic_inc(&ni->count);
-
-	/* Serialize access to this mft record. */
-	mutex_lock(&ni->mrec_lock);
-
-	m = map_mft_record_page(ni);
-	if (!IS_ERR(m))
-		return m;
-
-	mutex_unlock(&ni->mrec_lock);
-	atomic_dec(&ni->count);
-	ntfs_error(ni->vol->sb, "Failed with error code %lu.", -PTR_ERR(m));
-	return m;
-}
-
-/**
- * unmap_mft_record_page - unmap the page in which a specific mft record resides
- * @ni:		ntfs inode whose mft record page to unmap
- *
- * This unmaps the page in which the mft record of the ntfs inode @ni is
- * situated and returns. This is a NOOP if highmem is not configured.
- *
- * The unmap happens via ntfs_unmap_page() which in turn decrements the use
- * count on the page thus releasing it from the pinned state.
- *
- * We do not actually unmap the page from memory of course, as that will be
- * done by the page cache code itself when memory pressure increases or
- * whatever.
- */
-static inline void unmap_mft_record_page(ntfs_inode *ni)
-{
-	BUG_ON(!ni->page);
-
-	// TODO: If dirty, blah...
-	ntfs_unmap_page(ni->page);
-	ni->page = NULL;
-	ni->page_ofs = 0;
-	return;
-}
-
-/**
- * unmap_mft_record - release a mapped mft record
- * @ni:		ntfs inode whose MFT record to unmap
- *
- * We release the page mapping and the mrec_lock mutex which unmaps the mft
- * record and releases it for others to get hold of. We also release the ntfs
- * inode by decrementing the ntfs inode reference count.
- *
- * NOTE: If caller has modified the mft record, it is imperative to set the mft
- * record dirty BEFORE calling unmap_mft_record().
- */
-void unmap_mft_record(ntfs_inode *ni)
-{
-	struct page *page = ni->page;
-
-	BUG_ON(!page);
-
-	ntfs_debug("Entering for mft_no 0x%lx.", ni->mft_no);
-
-	unmap_mft_record_page(ni);
-	mutex_unlock(&ni->mrec_lock);
-	atomic_dec(&ni->count);
-	/*
-	 * If pure ntfs_inode, i.e. no vfs inode attached, we leave it to
-	 * ntfs_clear_extent_inode() in the extent inode case, and to the
-	 * caller in the non-extent, yet pure ntfs inode case, to do the actual
-	 * tear down of all structures and freeing of all allocated memory.
-	 */
-	return;
-}
-
-/**
- * map_extent_mft_record - load an extent inode and attach it to its base
- * @base_ni:	base ntfs inode
- * @mref:	mft reference of the extent inode to load
- * @ntfs_ino:	on successful return, pointer to the ntfs_inode structure
- *
- * Load the extent mft record @mref and attach it to its base inode @base_ni.
- * Return the mapped extent mft record if IS_ERR(result) is false.  Otherwise
- * PTR_ERR(result) gives the negative error code.
- *
- * On successful return, @ntfs_ino contains a pointer to the ntfs_inode
- * structure of the mapped extent inode.
- */
-MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
-		ntfs_inode **ntfs_ino)
-{
-	MFT_RECORD *m;
-	ntfs_inode *ni = NULL;
-	ntfs_inode **extent_nis = NULL;
-	int i;
-	unsigned long mft_no = MREF(mref);
-	u16 seq_no = MSEQNO(mref);
-	bool destroy_ni = false;
-
-	ntfs_debug("Mapping extent mft record 0x%lx (base mft record 0x%lx).",
-			mft_no, base_ni->mft_no);
-	/* Make sure the base ntfs inode doesn't go away. */
-	atomic_inc(&base_ni->count);
-	/*
-	 * Check if this extent inode has already been added to the base inode,
-	 * in which case just return it. If not found, add it to the base
-	 * inode before returning it.
-	 */
-	mutex_lock(&base_ni->extent_lock);
-	if (base_ni->nr_extents > 0) {
-		extent_nis = base_ni->ext.extent_ntfs_inos;
-		for (i = 0; i < base_ni->nr_extents; i++) {
-			if (mft_no != extent_nis[i]->mft_no)
-				continue;
-			ni = extent_nis[i];
-			/* Make sure the ntfs inode doesn't go away. */
-			atomic_inc(&ni->count);
-			break;
-		}
-	}
-	if (likely(ni != NULL)) {
-		mutex_unlock(&base_ni->extent_lock);
-		atomic_dec(&base_ni->count);
-		/* We found the record; just have to map and return it. */
-		m = map_mft_record(ni);
-		/* map_mft_record() has incremented this on success. */
-		atomic_dec(&ni->count);
-		if (!IS_ERR(m)) {
-			/* Verify the sequence number. */
-			if (likely(le16_to_cpu(m->sequence_number) == seq_no)) {
-				ntfs_debug("Done 1.");
-				*ntfs_ino = ni;
-				return m;
-			}
-			unmap_mft_record(ni);
-			ntfs_error(base_ni->vol->sb, "Found stale extent mft "
-					"reference! Corrupt filesystem. "
-					"Run chkdsk.");
-			return ERR_PTR(-EIO);
-		}
-map_err_out:
-		ntfs_error(base_ni->vol->sb, "Failed to map extent "
-				"mft record, error code %ld.", -PTR_ERR(m));
-		return m;
-	}
-	/* Record wasn't there. Get a new ntfs inode and initialize it. */
-	ni = ntfs_new_extent_inode(base_ni->vol->sb, mft_no);
-	if (unlikely(!ni)) {
-		mutex_unlock(&base_ni->extent_lock);
-		atomic_dec(&base_ni->count);
-		return ERR_PTR(-ENOMEM);
-	}
-	ni->vol = base_ni->vol;
-	ni->seq_no = seq_no;
-	ni->nr_extents = -1;
-	ni->ext.base_ntfs_ino = base_ni;
-	/* Now map the record. */
-	m = map_mft_record(ni);
-	if (IS_ERR(m)) {
-		mutex_unlock(&base_ni->extent_lock);
-		atomic_dec(&base_ni->count);
-		ntfs_clear_extent_inode(ni);
-		goto map_err_out;
-	}
-	/* Verify the sequence number if it is present. */
-	if (seq_no && (le16_to_cpu(m->sequence_number) != seq_no)) {
-		ntfs_error(base_ni->vol->sb, "Found stale extent mft "
-				"reference! Corrupt filesystem. Run chkdsk.");
-		destroy_ni = true;
-		m = ERR_PTR(-EIO);
-		goto unm_err_out;
-	}
-	/* Attach extent inode to base inode, reallocating memory if needed. */
-	if (!(base_ni->nr_extents & 3)) {
-		ntfs_inode **tmp;
-		int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode *);
-
-		tmp = kmalloc(new_size, GFP_NOFS);
-		if (unlikely(!tmp)) {
-			ntfs_error(base_ni->vol->sb, "Failed to allocate "
-					"internal buffer.");
-			destroy_ni = true;
-			m = ERR_PTR(-ENOMEM);
-			goto unm_err_out;
-		}
-		if (base_ni->nr_extents) {
-			BUG_ON(!base_ni->ext.extent_ntfs_inos);
-			memcpy(tmp, base_ni->ext.extent_ntfs_inos, new_size -
-					4 * sizeof(ntfs_inode *));
-			kfree(base_ni->ext.extent_ntfs_inos);
-		}
-		base_ni->ext.extent_ntfs_inos = tmp;
-	}
-	base_ni->ext.extent_ntfs_inos[base_ni->nr_extents++] = ni;
-	mutex_unlock(&base_ni->extent_lock);
-	atomic_dec(&base_ni->count);
-	ntfs_debug("Done 2.");
-	*ntfs_ino = ni;
-	return m;
-unm_err_out:
-	unmap_mft_record(ni);
-	mutex_unlock(&base_ni->extent_lock);
-	atomic_dec(&base_ni->count);
-	/*
-	 * If the extent inode was not attached to the base inode we need to
-	 * release it or we will leak memory.
-	 */
-	if (destroy_ni)
-		ntfs_clear_extent_inode(ni);
-	return m;
-}
-
-#ifdef NTFS_RW
-
-/**
- * __mark_mft_record_dirty - set the mft record and the page containing it dirty
- * @ni:		ntfs inode describing the mapped mft record
- *
- * Internal function.  Users should call mark_mft_record_dirty() instead.
- *
- * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni,
- * as well as the page containing the mft record, dirty.  Also, mark the base
- * vfs inode dirty.  This ensures that any changes to the mft record are
- * written out to disk.
- *
- * NOTE:  We only set I_DIRTY_DATASYNC (and not I_DIRTY_PAGES)
- * on the base vfs inode, because even though file data may have been modified,
- * it is dirty in the inode meta data rather than the data page cache of the
- * inode, and thus there are no data pages that need writing out.  Therefore, a
- * full mark_inode_dirty() is overkill.  A mark_inode_dirty_sync(), on the
- * other hand, is not sufficient, because ->write_inode needs to be called even
- * in case of fdatasync. This needs to happen or the file data would not
- * necessarily hit the device synchronously, even though the vfs inode has the
- * O_SYNC flag set.  Also, I_DIRTY_DATASYNC simply "feels" better than just
- * I_DIRTY_SYNC, since the file data has not actually hit the block device yet,
- * which is not what I_DIRTY_SYNC on its own would suggest.
- */
-void __mark_mft_record_dirty(ntfs_inode *ni)
-{
-	ntfs_inode *base_ni;
-
-	ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
-	BUG_ON(NInoAttr(ni));
-	mark_ntfs_record_dirty(ni->page, ni->page_ofs);
-	/* Determine the base vfs inode and mark it dirty, too. */
-	mutex_lock(&ni->extent_lock);
-	if (likely(ni->nr_extents >= 0))
-		base_ni = ni;
-	else
-		base_ni = ni->ext.base_ntfs_ino;
-	mutex_unlock(&ni->extent_lock);
-	__mark_inode_dirty(VFS_I(base_ni), I_DIRTY_DATASYNC);
-}
-
-static const char *ntfs_please_email = "Please email "
-		"linux-ntfs-dev@lists.sourceforge.net and say that you saw "
-		"this message.  Thank you.";
-
-/**
- * ntfs_sync_mft_mirror_umount - synchronise an mft record to the mft mirror
- * @vol:	ntfs volume on which the mft record to synchronize resides
- * @mft_no:	mft record number of mft record to synchronize
- * @m:		mapped, mst protected (extent) mft record to synchronize
- *
- * Write the mapped, mst protected (extent) mft record @m with mft record
- * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol,
- * bypassing the page cache and the $MFTMirr inode itself.
- *
- * This function is only for use at umount time when the mft mirror inode has
- * already been disposed off.  We BUG() if we are called while the mft mirror
- * inode is still attached to the volume.
- *
- * On success return 0.  On error return -errno.
- *
- * NOTE:  This function is not implemented yet as I am not convinced it can
- * actually be triggered considering the sequence of commits we do in super.c::
- * ntfs_put_super().  But just in case we provide this place holder as the
- * alternative would be either to BUG() or to get a NULL pointer dereference
- * and Oops.
- */
-static int ntfs_sync_mft_mirror_umount(ntfs_volume *vol,
-		const unsigned long mft_no, MFT_RECORD *m)
-{
-	BUG_ON(vol->mftmirr_ino);
-	ntfs_error(vol->sb, "Umount time mft mirror syncing is not "
-			"implemented yet.  %s", ntfs_please_email);
-	return -EOPNOTSUPP;
-}
-
-/**
- * ntfs_sync_mft_mirror - synchronize an mft record to the mft mirror
- * @vol:	ntfs volume on which the mft record to synchronize resides
- * @mft_no:	mft record number of mft record to synchronize
- * @m:		mapped, mst protected (extent) mft record to synchronize
- * @sync:	if true, wait for i/o completion
- *
- * Write the mapped, mst protected (extent) mft record @m with mft record
- * number @mft_no to the mft mirror ($MFTMirr) of the ntfs volume @vol.
- *
- * On success return 0.  On error return -errno and set the volume errors flag
- * in the ntfs volume @vol.
- *
- * NOTE:  We always perform synchronous i/o and ignore the @sync parameter.
- *
- * TODO:  If @sync is false, want to do truly asynchronous i/o, i.e. just
- * schedule i/o via ->writepage or do it via kntfsd or whatever.
- */
-int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
-		MFT_RECORD *m, int sync)
-{
-	struct page *page;
-	unsigned int blocksize = vol->sb->s_blocksize;
-	int max_bhs = vol->mft_record_size / blocksize;
-	struct buffer_head *bhs[MAX_BHS];
-	struct buffer_head *bh, *head;
-	u8 *kmirr;
-	runlist_element *rl;
-	unsigned int block_start, block_end, m_start, m_end, page_ofs;
-	int i_bhs, nr_bhs, err = 0;
-	unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
-
-	ntfs_debug("Entering for inode 0x%lx.", mft_no);
-	BUG_ON(!max_bhs);
-	if (WARN_ON(max_bhs > MAX_BHS))
-		return -EINVAL;
-	if (unlikely(!vol->mftmirr_ino)) {
-		/* This could happen during umount... */
-		err = ntfs_sync_mft_mirror_umount(vol, mft_no, m);
-		if (likely(!err))
-			return err;
-		goto err_out;
-	}
-	/* Get the page containing the mirror copy of the mft record @m. */
-	page = ntfs_map_page(vol->mftmirr_ino->i_mapping, mft_no >>
-			(PAGE_SHIFT - vol->mft_record_size_bits));
-	if (IS_ERR(page)) {
-		ntfs_error(vol->sb, "Failed to map mft mirror page.");
-		err = PTR_ERR(page);
-		goto err_out;
-	}
-	lock_page(page);
-	BUG_ON(!PageUptodate(page));
-	ClearPageUptodate(page);
-	/* Offset of the mft mirror record inside the page. */
-	page_ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
-	/* The address in the page of the mirror copy of the mft record @m. */
-	kmirr = page_address(page) + page_ofs;
-	/* Copy the mst protected mft record to the mirror. */
-	memcpy(kmirr, m, vol->mft_record_size);
-	/* Create uptodate buffers if not present. */
-	if (unlikely(!page_has_buffers(page))) {
-		struct buffer_head *tail;
-
-		bh = head = alloc_page_buffers(page, blocksize, true);
-		do {
-			set_buffer_uptodate(bh);
-			tail = bh;
-			bh = bh->b_this_page;
-		} while (bh);
-		tail->b_this_page = head;
-		attach_page_private(page, head);
-	}
-	bh = head = page_buffers(page);
-	BUG_ON(!bh);
-	rl = NULL;
-	nr_bhs = 0;
-	block_start = 0;
-	m_start = kmirr - (u8*)page_address(page);
-	m_end = m_start + vol->mft_record_size;
-	do {
-		block_end = block_start + blocksize;
-		/* If the buffer is outside the mft record, skip it. */
-		if (block_end <= m_start)
-			continue;
-		if (unlikely(block_start >= m_end))
-			break;
-		/* Need to map the buffer if it is not mapped already. */
-		if (unlikely(!buffer_mapped(bh))) {
-			VCN vcn;
-			LCN lcn;
-			unsigned int vcn_ofs;
-
-			bh->b_bdev = vol->sb->s_bdev;
-			/* Obtain the vcn and offset of the current block. */
-			vcn = ((VCN)mft_no << vol->mft_record_size_bits) +
-					(block_start - m_start);
-			vcn_ofs = vcn & vol->cluster_size_mask;
-			vcn >>= vol->cluster_size_bits;
-			if (!rl) {
-				down_read(&NTFS_I(vol->mftmirr_ino)->
-						runlist.lock);
-				rl = NTFS_I(vol->mftmirr_ino)->runlist.rl;
-				/*
-				 * $MFTMirr always has the whole of its runlist
-				 * in memory.
-				 */
-				BUG_ON(!rl);
-			}
-			/* Seek to element containing target vcn. */
-			while (rl->length && rl[1].vcn <= vcn)
-				rl++;
-			lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
-			/* For $MFTMirr, only lcn >= 0 is a successful remap. */
-			if (likely(lcn >= 0)) {
-				/* Setup buffer head to correct block. */
-				bh->b_blocknr = ((lcn <<
-						vol->cluster_size_bits) +
-						vcn_ofs) >> blocksize_bits;
-				set_buffer_mapped(bh);
-			} else {
-				bh->b_blocknr = -1;
-				ntfs_error(vol->sb, "Cannot write mft mirror "
-						"record 0x%lx because its "
-						"location on disk could not "
-						"be determined (error code "
-						"%lli).", mft_no,
-						(long long)lcn);
-				err = -EIO;
-			}
-		}
-		BUG_ON(!buffer_uptodate(bh));
-		BUG_ON(!nr_bhs && (m_start != block_start));
-		BUG_ON(nr_bhs >= max_bhs);
-		bhs[nr_bhs++] = bh;
-		BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
-	} while (block_start = block_end, (bh = bh->b_this_page) != head);
-	if (unlikely(rl))
-		up_read(&NTFS_I(vol->mftmirr_ino)->runlist.lock);
-	if (likely(!err)) {
-		/* Lock buffers and start synchronous write i/o on them. */
-		for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
-			struct buffer_head *tbh = bhs[i_bhs];
-
-			if (!trylock_buffer(tbh))
-				BUG();
-			BUG_ON(!buffer_uptodate(tbh));
-			clear_buffer_dirty(tbh);
-			get_bh(tbh);
-			tbh->b_end_io = end_buffer_write_sync;
-			submit_bh(REQ_OP_WRITE, tbh);
-		}
-		/* Wait on i/o completion of buffers. */
-		for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
-			struct buffer_head *tbh = bhs[i_bhs];
-
-			wait_on_buffer(tbh);
-			if (unlikely(!buffer_uptodate(tbh))) {
-				err = -EIO;
-				/*
-				 * Set the buffer uptodate so the page and
-				 * buffer states do not become out of sync.
-				 */
-				set_buffer_uptodate(tbh);
-			}
-		}
-	} else /* if (unlikely(err)) */ {
-		/* Clean the buffers. */
-		for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
-			clear_buffer_dirty(bhs[i_bhs]);
-	}
-	/* Current state: all buffers are clean, unlocked, and uptodate. */
-	/* Remove the mst protection fixups again. */
-	post_write_mst_fixup((NTFS_RECORD*)kmirr);
-	flush_dcache_page(page);
-	SetPageUptodate(page);
-	unlock_page(page);
-	ntfs_unmap_page(page);
-	if (likely(!err)) {
-		ntfs_debug("Done.");
-	} else {
-		ntfs_error(vol->sb, "I/O error while writing mft mirror "
-				"record 0x%lx!", mft_no);
-err_out:
-		ntfs_error(vol->sb, "Failed to synchronize $MFTMirr (error "
-				"code %i).  Volume will be left marked dirty "
-				"on umount.  Run ntfsfix on the partition "
-				"after umounting to correct this.", -err);
-		NVolSetErrors(vol);
-	}
-	return err;
-}
-
-/**
- * write_mft_record_nolock - write out a mapped (extent) mft record
- * @ni:		ntfs inode describing the mapped (extent) mft record
- * @m:		mapped (extent) mft record to write
- * @sync:	if true, wait for i/o completion
- *
- * Write the mapped (extent) mft record @m described by the (regular or extent)
- * ntfs inode @ni to backing store.  If the mft record @m has a counterpart in
- * the mft mirror, that is also updated.
- *
- * We only write the mft record if the ntfs inode @ni is dirty and the first
- * buffer belonging to its mft record is dirty, too.  We ignore the dirty state
- * of subsequent buffers because we could have raced with
- * fs/ntfs/aops.c::mark_ntfs_record_dirty().
- *
- * On success, clean the mft record and return 0.  On error, leave the mft
- * record dirty and return -errno.
- *
- * NOTE:  We always perform synchronous i/o and ignore the @sync parameter.
- * However, if the mft record has a counterpart in the mft mirror and @sync is
- * true, we write the mft record, wait for i/o completion, and only then write
- * the mft mirror copy.  This ensures that if the system crashes either the mft
- * or the mft mirror will contain a self-consistent mft record @m.  If @sync is
- * false on the other hand, we start i/o on both and then wait for completion
- * on them.  This provides a speedup but no longer guarantees that you will end
- * up with a self-consistent mft record in the case of a crash but if you asked
- * for asynchronous writing you probably do not care about that anyway.
- *
- * TODO:  If @sync is false, want to do truly asynchronous i/o, i.e. just
- * schedule i/o via ->writepage or do it via kntfsd or whatever.
- */
-int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync)
-{
-	ntfs_volume *vol = ni->vol;
-	struct page *page = ni->page;
-	unsigned int blocksize = vol->sb->s_blocksize;
-	unsigned char blocksize_bits = vol->sb->s_blocksize_bits;
-	int max_bhs = vol->mft_record_size / blocksize;
-	struct buffer_head *bhs[MAX_BHS];
-	struct buffer_head *bh, *head;
-	runlist_element *rl;
-	unsigned int block_start, block_end, m_start, m_end;
-	int i_bhs, nr_bhs, err = 0;
-
-	ntfs_debug("Entering for inode 0x%lx.", ni->mft_no);
-	BUG_ON(NInoAttr(ni));
-	BUG_ON(!max_bhs);
-	BUG_ON(!PageLocked(page));
-	if (WARN_ON(max_bhs > MAX_BHS)) {
-		err = -EINVAL;
-		goto err_out;
-	}
-	/*
-	 * If the ntfs_inode is clean no need to do anything.  If it is dirty,
-	 * mark it as clean now so that it can be redirtied later on if needed.
-	 * There is no danger of races since the caller is holding the locks
-	 * for the mft record @m and the page it is in.
-	 */
-	if (!NInoTestClearDirty(ni))
-		goto done;
-	bh = head = page_buffers(page);
-	BUG_ON(!bh);
-	rl = NULL;
-	nr_bhs = 0;
-	block_start = 0;
-	m_start = ni->page_ofs;
-	m_end = m_start + vol->mft_record_size;
-	do {
-		block_end = block_start + blocksize;
-		/* If the buffer is outside the mft record, skip it. */
-		if (block_end <= m_start)
-			continue;
-		if (unlikely(block_start >= m_end))
-			break;
-		/*
-		 * If this block is not the first one in the record, we ignore
-		 * the buffer's dirty state because we could have raced with a
-		 * parallel mark_ntfs_record_dirty().
-		 */
-		if (block_start == m_start) {
-			/* This block is the first one in the record. */
-			if (!buffer_dirty(bh)) {
-				BUG_ON(nr_bhs);
-				/* Clean records are not written out. */
-				break;
-			}
-		}
-		/* Need to map the buffer if it is not mapped already. */
-		if (unlikely(!buffer_mapped(bh))) {
-			VCN vcn;
-			LCN lcn;
-			unsigned int vcn_ofs;
-
-			bh->b_bdev = vol->sb->s_bdev;
-			/* Obtain the vcn and offset of the current block. */
-			vcn = ((VCN)ni->mft_no << vol->mft_record_size_bits) +
-					(block_start - m_start);
-			vcn_ofs = vcn & vol->cluster_size_mask;
-			vcn >>= vol->cluster_size_bits;
-			if (!rl) {
-				down_read(&NTFS_I(vol->mft_ino)->runlist.lock);
-				rl = NTFS_I(vol->mft_ino)->runlist.rl;
-				BUG_ON(!rl);
-			}
-			/* Seek to element containing target vcn. */
-			while (rl->length && rl[1].vcn <= vcn)
-				rl++;
-			lcn = ntfs_rl_vcn_to_lcn(rl, vcn);
-			/* For $MFT, only lcn >= 0 is a successful remap. */
-			if (likely(lcn >= 0)) {
-				/* Setup buffer head to correct block. */
-				bh->b_blocknr = ((lcn <<
-						vol->cluster_size_bits) +
-						vcn_ofs) >> blocksize_bits;
-				set_buffer_mapped(bh);
-			} else {
-				bh->b_blocknr = -1;
-				ntfs_error(vol->sb, "Cannot write mft record "
-						"0x%lx because its location "
-						"on disk could not be "
-						"determined (error code %lli).",
-						ni->mft_no, (long long)lcn);
-				err = -EIO;
-			}
-		}
-		BUG_ON(!buffer_uptodate(bh));
-		BUG_ON(!nr_bhs && (m_start != block_start));
-		BUG_ON(nr_bhs >= max_bhs);
-		bhs[nr_bhs++] = bh;
-		BUG_ON((nr_bhs >= max_bhs) && (m_end != block_end));
-	} while (block_start = block_end, (bh = bh->b_this_page) != head);
-	if (unlikely(rl))
-		up_read(&NTFS_I(vol->mft_ino)->runlist.lock);
-	if (!nr_bhs)
-		goto done;
-	if (unlikely(err))
-		goto cleanup_out;
-	/* Apply the mst protection fixups. */
-	err = pre_write_mst_fixup((NTFS_RECORD*)m, vol->mft_record_size);
-	if (err) {
-		ntfs_error(vol->sb, "Failed to apply mst fixups!");
-		goto cleanup_out;
-	}
-	flush_dcache_mft_record_page(ni);
-	/* Lock buffers and start synchronous write i/o on them. */
-	for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
-		struct buffer_head *tbh = bhs[i_bhs];
-
-		if (!trylock_buffer(tbh))
-			BUG();
-		BUG_ON(!buffer_uptodate(tbh));
-		clear_buffer_dirty(tbh);
-		get_bh(tbh);
-		tbh->b_end_io = end_buffer_write_sync;
-		submit_bh(REQ_OP_WRITE, tbh);
-	}
-	/* Synchronize the mft mirror now if not @sync. */
-	if (!sync && ni->mft_no < vol->mftmirr_size)
-		ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync);
-	/* Wait on i/o completion of buffers. */
-	for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++) {
-		struct buffer_head *tbh = bhs[i_bhs];
-
-		wait_on_buffer(tbh);
-		if (unlikely(!buffer_uptodate(tbh))) {
-			err = -EIO;
-			/*
-			 * Set the buffer uptodate so the page and buffer
-			 * states do not become out of sync.
-			 */
-			if (PageUptodate(page))
-				set_buffer_uptodate(tbh);
-		}
-	}
-	/* If @sync, now synchronize the mft mirror. */
-	if (sync && ni->mft_no < vol->mftmirr_size)
-		ntfs_sync_mft_mirror(vol, ni->mft_no, m, sync);
-	/* Remove the mst protection fixups again. */
-	post_write_mst_fixup((NTFS_RECORD*)m);
-	flush_dcache_mft_record_page(ni);
-	if (unlikely(err)) {
-		/* I/O error during writing.  This is really bad! */
-		ntfs_error(vol->sb, "I/O error while writing mft record "
-				"0x%lx!  Marking base inode as bad.  You "
-				"should unmount the volume and run chkdsk.",
-				ni->mft_no);
-		goto err_out;
-	}
-done:
-	ntfs_debug("Done.");
-	return 0;
-cleanup_out:
-	/* Clean the buffers. */
-	for (i_bhs = 0; i_bhs < nr_bhs; i_bhs++)
-		clear_buffer_dirty(bhs[i_bhs]);
-err_out:
-	/*
-	 * Current state: all buffers are clean, unlocked, and uptodate.
-	 * The caller should mark the base inode as bad so that no more i/o
-	 * happens.  ->clear_inode() will still be invoked so all extent inodes
-	 * and other allocated memory will be freed.
-	 */
-	if (err == -ENOMEM) {
-		ntfs_error(vol->sb, "Not enough memory to write mft record.  "
-				"Redirtying so the write is retried later.");
-		mark_mft_record_dirty(ni);
-		err = 0;
-	} else
-		NVolSetErrors(vol);
-	return err;
-}
-
-/**
- * ntfs_may_write_mft_record - check if an mft record may be written out
- * @vol:	[IN]  ntfs volume on which the mft record to check resides
- * @mft_no:	[IN]  mft record number of the mft record to check
- * @m:		[IN]  mapped mft record to check
- * @locked_ni:	[OUT] caller has to unlock this ntfs inode if one is returned
- *
- * Check if the mapped (base or extent) mft record @m with mft record number
- * @mft_no belonging to the ntfs volume @vol may be written out.  If necessary
- * and possible the ntfs inode of the mft record is locked and the base vfs
- * inode is pinned.  The locked ntfs inode is then returned in @locked_ni.  The
- * caller is responsible for unlocking the ntfs inode and unpinning the base
- * vfs inode.
- *
- * Return 'true' if the mft record may be written out and 'false' if not.
- *
- * The caller has locked the page and cleared the uptodate flag on it which
- * means that we can safely write out any dirty mft records that do not have
- * their inodes in icache as determined by ilookup5() as anyone
- * opening/creating such an inode would block when attempting to map the mft
- * record in read_cache_page() until we are finished with the write out.
- *
- * Here is a description of the tests we perform:
- *
- * If the inode is found in icache we know the mft record must be a base mft
- * record.  If it is dirty, we do not write it and return 'false' as the vfs
- * inode write paths will result in the access times being updated which would
- * cause the base mft record to be redirtied and written out again.  (We know
- * the access time update will modify the base mft record because Windows
- * chkdsk complains if the standard information attribute is not in the base
- * mft record.)
- *
- * If the inode is in icache and not dirty, we attempt to lock the mft record
- * and if we find the lock was already taken, it is not safe to write the mft
- * record and we return 'false'.
- *
- * If we manage to obtain the lock we have exclusive access to the mft record,
- * which also allows us safe writeout of the mft record.  We then set
- * @locked_ni to the locked ntfs inode and return 'true'.
- *
- * Note we cannot just lock the mft record and sleep while waiting for the lock
- * because this would deadlock due to lock reversal (normally the mft record is
- * locked before the page is locked but we already have the page locked here
- * when we try to lock the mft record).
- *
- * If the inode is not in icache we need to perform further checks.
- *
- * If the mft record is not a FILE record or it is a base mft record, we can
- * safely write it and return 'true'.
- *
- * We now know the mft record is an extent mft record.  We check if the inode
- * corresponding to its base mft record is in icache and obtain a reference to
- * it if it is.  If it is not, we can safely write it and return 'true'.
- *
- * We now have the base inode for the extent mft record.  We check if it has an
- * ntfs inode for the extent mft record attached and if not it is safe to write
- * the extent mft record and we return 'true'.
- *
- * The ntfs inode for the extent mft record is attached to the base inode so we
- * attempt to lock the extent mft record and if we find the lock was already
- * taken, it is not safe to write the extent mft record and we return 'false'.
- *
- * If we manage to obtain the lock we have exclusive access to the extent mft
- * record, which also allows us safe writeout of the extent mft record.  We
- * set the ntfs inode of the extent mft record clean and then set @locked_ni to
- * the now locked ntfs inode and return 'true'.
- *
- * Note, the reason for actually writing dirty mft records here and not just
- * relying on the vfs inode dirty code paths is that we can have mft records
- * modified without them ever having actual inodes in memory.  Also we can have
- * dirty mft records with clean ntfs inodes in memory.  None of the described
- * cases would result in the dirty mft records being written out if we only
- * relied on the vfs inode dirty code paths.  And these cases can really occur
- * during allocation of new mft records and in particular when the
- * initialized_size of the $MFT/$DATA attribute is extended and the new space
- * is initialized using ntfs_mft_record_format().  The clean inode can then
- * appear if the mft record is reused for a new inode before it got written
- * out.
- */
-bool ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
-		const MFT_RECORD *m, ntfs_inode **locked_ni)
-{
-	struct super_block *sb = vol->sb;
-	struct inode *mft_vi = vol->mft_ino;
-	struct inode *vi;
-	ntfs_inode *ni, *eni, **extent_nis;
-	int i;
-	ntfs_attr na;
-
-	ntfs_debug("Entering for inode 0x%lx.", mft_no);
-	/*
-	 * Normally we do not return a locked inode so set @locked_ni to NULL.
-	 */
-	BUG_ON(!locked_ni);
-	*locked_ni = NULL;
-	/*
-	 * Check if the inode corresponding to this mft record is in the VFS
-	 * inode cache and obtain a reference to it if it is.
-	 */
-	ntfs_debug("Looking for inode 0x%lx in icache.", mft_no);
-	na.mft_no = mft_no;
-	na.name = NULL;
-	na.name_len = 0;
-	na.type = AT_UNUSED;
-	/*
-	 * Optimize inode 0, i.e. $MFT itself, since we have it in memory and
-	 * we get here for it rather often.
-	 */
-	if (!mft_no) {
-		/* Balance the below iput(). */
-		vi = igrab(mft_vi);
-		BUG_ON(vi != mft_vi);
-	} else {
-		/*
-		 * Have to use ilookup5_nowait() since ilookup5() waits for the
-		 * inode lock which causes ntfs to deadlock when a concurrent
-		 * inode write via the inode dirty code paths and the page
-		 * dirty code path of the inode dirty code path when writing
-		 * $MFT occurs.
-		 */
-		vi = ilookup5_nowait(sb, mft_no, ntfs_test_inode, &na);
-	}
-	if (vi) {
-		ntfs_debug("Base inode 0x%lx is in icache.", mft_no);
-		/* The inode is in icache. */
-		ni = NTFS_I(vi);
-		/* Take a reference to the ntfs inode. */
-		atomic_inc(&ni->count);
-		/* If the inode is dirty, do not write this record. */
-		if (NInoDirty(ni)) {
-			ntfs_debug("Inode 0x%lx is dirty, do not write it.",
-					mft_no);
-			atomic_dec(&ni->count);
-			iput(vi);
-			return false;
-		}
-		ntfs_debug("Inode 0x%lx is not dirty.", mft_no);
-		/* The inode is not dirty, try to take the mft record lock. */
-		if (unlikely(!mutex_trylock(&ni->mrec_lock))) {
-			ntfs_debug("Mft record 0x%lx is already locked, do "
-					"not write it.", mft_no);
-			atomic_dec(&ni->count);
-			iput(vi);
-			return false;
-		}
-		ntfs_debug("Managed to lock mft record 0x%lx, write it.",
-				mft_no);
-		/*
-		 * The write has to occur while we hold the mft record lock so
-		 * return the locked ntfs inode.
-		 */
-		*locked_ni = ni;
-		return true;
-	}
-	ntfs_debug("Inode 0x%lx is not in icache.", mft_no);
-	/* The inode is not in icache. */
-	/* Write the record if it is not a mft record (type "FILE"). */
-	if (!ntfs_is_mft_record(m->magic)) {
-		ntfs_debug("Mft record 0x%lx is not a FILE record, write it.",
-				mft_no);
-		return true;
-	}
-	/* Write the mft record if it is a base inode. */
-	if (!m->base_mft_record) {
-		ntfs_debug("Mft record 0x%lx is a base record, write it.",
-				mft_no);
-		return true;
-	}
-	/*
-	 * This is an extent mft record.  Check if the inode corresponding to
-	 * its base mft record is in icache and obtain a reference to it if it
-	 * is.
-	 */
-	na.mft_no = MREF_LE(m->base_mft_record);
-	ntfs_debug("Mft record 0x%lx is an extent record.  Looking for base "
-			"inode 0x%lx in icache.", mft_no, na.mft_no);
-	if (!na.mft_no) {
-		/* Balance the below iput(). */
-		vi = igrab(mft_vi);
-		BUG_ON(vi != mft_vi);
-	} else
-		vi = ilookup5_nowait(sb, na.mft_no, ntfs_test_inode,
-				&na);
-	if (!vi) {
-		/*
-		 * The base inode is not in icache, write this extent mft
-		 * record.
-		 */
-		ntfs_debug("Base inode 0x%lx is not in icache, write the "
-				"extent record.", na.mft_no);
-		return true;
-	}
-	ntfs_debug("Base inode 0x%lx is in icache.", na.mft_no);
-	/*
-	 * The base inode is in icache.  Check if it has the extent inode
-	 * corresponding to this extent mft record attached.
-	 */
-	ni = NTFS_I(vi);
-	mutex_lock(&ni->extent_lock);
-	if (ni->nr_extents <= 0) {
-		/*
-		 * The base inode has no attached extent inodes, write this
-		 * extent mft record.
-		 */
-		mutex_unlock(&ni->extent_lock);
-		iput(vi);
-		ntfs_debug("Base inode 0x%lx has no attached extent inodes, "
-				"write the extent record.", na.mft_no);
-		return true;
-	}
-	/* Iterate over the attached extent inodes. */
-	extent_nis = ni->ext.extent_ntfs_inos;
-	for (eni = NULL, i = 0; i < ni->nr_extents; ++i) {
-		if (mft_no == extent_nis[i]->mft_no) {
-			/*
-			 * Found the extent inode corresponding to this extent
-			 * mft record.
-			 */
-			eni = extent_nis[i];
-			break;
-		}
-	}
-	/*
-	 * If the extent inode was not attached to the base inode, write this
-	 * extent mft record.
-	 */
-	if (!eni) {
-		mutex_unlock(&ni->extent_lock);
-		iput(vi);
-		ntfs_debug("Extent inode 0x%lx is not attached to its base "
-				"inode 0x%lx, write the extent record.",
-				mft_no, na.mft_no);
-		return true;
-	}
-	ntfs_debug("Extent inode 0x%lx is attached to its base inode 0x%lx.",
-			mft_no, na.mft_no);
-	/* Take a reference to the extent ntfs inode. */
-	atomic_inc(&eni->count);
-	mutex_unlock(&ni->extent_lock);
-	/*
-	 * Found the extent inode coresponding to this extent mft record.
-	 * Try to take the mft record lock.
-	 */
-	if (unlikely(!mutex_trylock(&eni->mrec_lock))) {
-		atomic_dec(&eni->count);
-		iput(vi);
-		ntfs_debug("Extent mft record 0x%lx is already locked, do "
-				"not write it.", mft_no);
-		return false;
-	}
-	ntfs_debug("Managed to lock extent mft record 0x%lx, write it.",
-			mft_no);
-	if (NInoTestClearDirty(eni))
-		ntfs_debug("Extent inode 0x%lx is dirty, marking it clean.",
-				mft_no);
-	/*
-	 * The write has to occur while we hold the mft record lock so return
-	 * the locked extent ntfs inode.
-	 */
-	*locked_ni = eni;
-	return true;
-}
-
-static const char *es = "  Leaving inconsistent metadata.  Unmount and run "
-		"chkdsk.";
-
-/**
- * ntfs_mft_bitmap_find_and_alloc_free_rec_nolock - see name
- * @vol:	volume on which to search for a free mft record
- * @base_ni:	open base inode if allocating an extent mft record or NULL
- *
- * Search for a free mft record in the mft bitmap attribute on the ntfs volume
- * @vol.
- *
- * If @base_ni is NULL start the search at the default allocator position.
- *
- * If @base_ni is not NULL start the search at the mft record after the base
- * mft record @base_ni.
- *
- * Return the free mft record on success and -errno on error.  An error code of
- * -ENOSPC means that there are no free mft records in the currently
- * initialized mft bitmap.
- *
- * Locking: Caller must hold vol->mftbmp_lock for writing.
- */
-static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol,
-		ntfs_inode *base_ni)
-{
-	s64 pass_end, ll, data_pos, pass_start, ofs, bit;
-	unsigned long flags;
-	struct address_space *mftbmp_mapping;
-	u8 *buf, *byte;
-	struct page *page;
-	unsigned int page_ofs, size;
-	u8 pass, b;
-
-	ntfs_debug("Searching for free mft record in the currently "
-			"initialized mft bitmap.");
-	mftbmp_mapping = vol->mftbmp_ino->i_mapping;
-	/*
-	 * Set the end of the pass making sure we do not overflow the mft
-	 * bitmap.
-	 */
-	read_lock_irqsave(&NTFS_I(vol->mft_ino)->size_lock, flags);
-	pass_end = NTFS_I(vol->mft_ino)->allocated_size >>
-			vol->mft_record_size_bits;
-	read_unlock_irqrestore(&NTFS_I(vol->mft_ino)->size_lock, flags);
-	read_lock_irqsave(&NTFS_I(vol->mftbmp_ino)->size_lock, flags);
-	ll = NTFS_I(vol->mftbmp_ino)->initialized_size << 3;
-	read_unlock_irqrestore(&NTFS_I(vol->mftbmp_ino)->size_lock, flags);
-	if (pass_end > ll)
-		pass_end = ll;
-	pass = 1;
-	if (!base_ni)
-		data_pos = vol->mft_data_pos;
-	else
-		data_pos = base_ni->mft_no + 1;
-	if (data_pos < 24)
-		data_pos = 24;
-	if (data_pos >= pass_end) {
-		data_pos = 24;
-		pass = 2;
-		/* This happens on a freshly formatted volume. */
-		if (data_pos >= pass_end)
-			return -ENOSPC;
-	}
-	pass_start = data_pos;
-	ntfs_debug("Starting bitmap search: pass %u, pass_start 0x%llx, "
-			"pass_end 0x%llx, data_pos 0x%llx.", pass,
-			(long long)pass_start, (long long)pass_end,
-			(long long)data_pos);
-	/* Loop until a free mft record is found. */
-	for (; pass <= 2;) {
-		/* Cap size to pass_end. */
-		ofs = data_pos >> 3;
-		page_ofs = ofs & ~PAGE_MASK;
-		size = PAGE_SIZE - page_ofs;
-		ll = ((pass_end + 7) >> 3) - ofs;
-		if (size > ll)
-			size = ll;
-		size <<= 3;
-		/*
-		 * If we are still within the active pass, search the next page
-		 * for a zero bit.
-		 */
-		if (size) {
-			page = ntfs_map_page(mftbmp_mapping,
-					ofs >> PAGE_SHIFT);
-			if (IS_ERR(page)) {
-				ntfs_error(vol->sb, "Failed to read mft "
-						"bitmap, aborting.");
-				return PTR_ERR(page);
-			}
-			buf = (u8*)page_address(page) + page_ofs;
-			bit = data_pos & 7;
-			data_pos &= ~7ull;
-			ntfs_debug("Before inner for loop: size 0x%x, "
-					"data_pos 0x%llx, bit 0x%llx", size,
-					(long long)data_pos, (long long)bit);
-			for (; bit < size && data_pos + bit < pass_end;
-					bit &= ~7ull, bit += 8) {
-				byte = buf + (bit >> 3);
-				if (*byte == 0xff)
-					continue;
-				b = ffz((unsigned long)*byte);
-				if (b < 8 && b >= (bit & 7)) {
-					ll = data_pos + (bit & ~7ull) + b;
-					if (unlikely(ll > (1ll << 32))) {
-						ntfs_unmap_page(page);
-						return -ENOSPC;
-					}
-					*byte |= 1 << b;
-					flush_dcache_page(page);
-					set_page_dirty(page);
-					ntfs_unmap_page(page);
-					ntfs_debug("Done.  (Found and "
-							"allocated mft record "
-							"0x%llx.)",
-							(long long)ll);
-					return ll;
-				}
-			}
-			ntfs_debug("After inner for loop: size 0x%x, "
-					"data_pos 0x%llx, bit 0x%llx", size,
-					(long long)data_pos, (long long)bit);
-			data_pos += size;
-			ntfs_unmap_page(page);
-			/*
-			 * If the end of the pass has not been reached yet,
-			 * continue searching the mft bitmap for a zero bit.
-			 */
-			if (data_pos < pass_end)
-				continue;
-		}
-		/* Do the next pass. */
-		if (++pass == 2) {
-			/*
-			 * Starting the second pass, in which we scan the first
-			 * part of the zone which we omitted earlier.
-			 */
-			pass_end = pass_start;
-			data_pos = pass_start = 24;
-			ntfs_debug("pass %i, pass_start 0x%llx, pass_end "
-					"0x%llx.", pass, (long long)pass_start,
-					(long long)pass_end);
-			if (data_pos >= pass_end)
-				break;
-		}
-	}
-	/* No free mft records in currently initialized mft bitmap. */
-	ntfs_debug("Done.  (No free mft records left in currently initialized "
-			"mft bitmap.)");
-	return -ENOSPC;
-}
-
-/**
- * ntfs_mft_bitmap_extend_allocation_nolock - extend mft bitmap by a cluster
- * @vol:	volume on which to extend the mft bitmap attribute
- *
- * Extend the mft bitmap attribute on the ntfs volume @vol by one cluster.
- *
- * Note: Only changes allocated_size, i.e. does not touch initialized_size or
- * data_size.
- *
- * Return 0 on success and -errno on error.
- *
- * Locking: - Caller must hold vol->mftbmp_lock for writing.
- *	    - This function takes NTFS_I(vol->mftbmp_ino)->runlist.lock for
- *	      writing and releases it before returning.
- *	    - This function takes vol->lcnbmp_lock for writing and releases it
- *	      before returning.
- */
-static int ntfs_mft_bitmap_extend_allocation_nolock(ntfs_volume *vol)
-{
-	LCN lcn;
-	s64 ll;
-	unsigned long flags;
-	struct page *page;
-	ntfs_inode *mft_ni, *mftbmp_ni;
-	runlist_element *rl, *rl2 = NULL;
-	ntfs_attr_search_ctx *ctx = NULL;
-	MFT_RECORD *mrec;
-	ATTR_RECORD *a = NULL;
-	int ret, mp_size;
-	u32 old_alen = 0;
-	u8 *b, tb;
-	struct {
-		u8 added_cluster:1;
-		u8 added_run:1;
-		u8 mp_rebuilt:1;
-	} status = { 0, 0, 0 };
-
-	ntfs_debug("Extending mft bitmap allocation.");
-	mft_ni = NTFS_I(vol->mft_ino);
-	mftbmp_ni = NTFS_I(vol->mftbmp_ino);
-	/*
-	 * Determine the last lcn of the mft bitmap.  The allocated size of the
-	 * mft bitmap cannot be zero so we are ok to do this.
-	 */
-	down_write(&mftbmp_ni->runlist.lock);
-	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
-	ll = mftbmp_ni->allocated_size;
-	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-	rl = ntfs_attr_find_vcn_nolock(mftbmp_ni,
-			(ll - 1) >> vol->cluster_size_bits, NULL);
-	if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) {
-		up_write(&mftbmp_ni->runlist.lock);
-		ntfs_error(vol->sb, "Failed to determine last allocated "
-				"cluster of mft bitmap attribute.");
-		if (!IS_ERR(rl))
-			ret = -EIO;
-		else
-			ret = PTR_ERR(rl);
-		return ret;
-	}
-	lcn = rl->lcn + rl->length;
-	ntfs_debug("Last lcn of mft bitmap attribute is 0x%llx.",
-			(long long)lcn);
-	/*
-	 * Attempt to get the cluster following the last allocated cluster by
-	 * hand as it may be in the MFT zone so the allocator would not give it
-	 * to us.
-	 */
-	ll = lcn >> 3;
-	page = ntfs_map_page(vol->lcnbmp_ino->i_mapping,
-			ll >> PAGE_SHIFT);
-	if (IS_ERR(page)) {
-		up_write(&mftbmp_ni->runlist.lock);
-		ntfs_error(vol->sb, "Failed to read from lcn bitmap.");
-		return PTR_ERR(page);
-	}
-	b = (u8*)page_address(page) + (ll & ~PAGE_MASK);
-	tb = 1 << (lcn & 7ull);
-	down_write(&vol->lcnbmp_lock);
-	if (*b != 0xff && !(*b & tb)) {
-		/* Next cluster is free, allocate it. */
-		*b |= tb;
-		flush_dcache_page(page);
-		set_page_dirty(page);
-		up_write(&vol->lcnbmp_lock);
-		ntfs_unmap_page(page);
-		/* Update the mft bitmap runlist. */
-		rl->length++;
-		rl[1].vcn++;
-		status.added_cluster = 1;
-		ntfs_debug("Appending one cluster to mft bitmap.");
-	} else {
-		up_write(&vol->lcnbmp_lock);
-		ntfs_unmap_page(page);
-		/* Allocate a cluster from the DATA_ZONE. */
-		rl2 = ntfs_cluster_alloc(vol, rl[1].vcn, 1, lcn, DATA_ZONE,
-				true);
-		if (IS_ERR(rl2)) {
-			up_write(&mftbmp_ni->runlist.lock);
-			ntfs_error(vol->sb, "Failed to allocate a cluster for "
-					"the mft bitmap.");
-			return PTR_ERR(rl2);
-		}
-		rl = ntfs_runlists_merge(mftbmp_ni->runlist.rl, rl2);
-		if (IS_ERR(rl)) {
-			up_write(&mftbmp_ni->runlist.lock);
-			ntfs_error(vol->sb, "Failed to merge runlists for mft "
-					"bitmap.");
-			if (ntfs_cluster_free_from_rl(vol, rl2)) {
-				ntfs_error(vol->sb, "Failed to deallocate "
-						"allocated cluster.%s", es);
-				NVolSetErrors(vol);
-			}
-			ntfs_free(rl2);
-			return PTR_ERR(rl);
-		}
-		mftbmp_ni->runlist.rl = rl;
-		status.added_run = 1;
-		ntfs_debug("Adding one run to mft bitmap.");
-		/* Find the last run in the new runlist. */
-		for (; rl[1].length; rl++)
-			;
-	}
-	/*
-	 * Update the attribute record as well.  Note: @rl is the last
-	 * (non-terminator) runlist element of mft bitmap.
-	 */
-	mrec = map_mft_record(mft_ni);
-	if (IS_ERR(mrec)) {
-		ntfs_error(vol->sb, "Failed to map mft record.");
-		ret = PTR_ERR(mrec);
-		goto undo_alloc;
-	}
-	ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
-	if (unlikely(!ctx)) {
-		ntfs_error(vol->sb, "Failed to get search context.");
-		ret = -ENOMEM;
-		goto undo_alloc;
-	}
-	ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
-			mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL,
-			0, ctx);
-	if (unlikely(ret)) {
-		ntfs_error(vol->sb, "Failed to find last attribute extent of "
-				"mft bitmap attribute.");
-		if (ret == -ENOENT)
-			ret = -EIO;
-		goto undo_alloc;
-	}
-	a = ctx->attr;
-	ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
-	/* Search back for the previous last allocated cluster of mft bitmap. */
-	for (rl2 = rl; rl2 > mftbmp_ni->runlist.rl; rl2--) {
-		if (ll >= rl2->vcn)
-			break;
-	}
-	BUG_ON(ll < rl2->vcn);
-	BUG_ON(ll >= rl2->vcn + rl2->length);
-	/* Get the size for the new mapping pairs array for this extent. */
-	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
-	if (unlikely(mp_size <= 0)) {
-		ntfs_error(vol->sb, "Get size for mapping pairs failed for "
-				"mft bitmap attribute extent.");
-		ret = mp_size;
-		if (!ret)
-			ret = -EIO;
-		goto undo_alloc;
-	}
-	/* Expand the attribute record if necessary. */
-	old_alen = le32_to_cpu(a->length);
-	ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size +
-			le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
-	if (unlikely(ret)) {
-		if (ret != -ENOSPC) {
-			ntfs_error(vol->sb, "Failed to resize attribute "
-					"record for mft bitmap attribute.");
-			goto undo_alloc;
-		}
-		// TODO: Deal with this by moving this extent to a new mft
-		// record or by starting a new extent in a new mft record or by
-		// moving other attributes out of this mft record.
-		// Note: It will need to be a special mft record and if none of
-		// those are available it gets rather complicated...
-		ntfs_error(vol->sb, "Not enough space in this mft record to "
-				"accommodate extended mft bitmap attribute "
-				"extent.  Cannot handle this yet.");
-		ret = -EOPNOTSUPP;
-		goto undo_alloc;
-	}
-	status.mp_rebuilt = 1;
-	/* Generate the mapping pairs array directly into the attr record. */
-	ret = ntfs_mapping_pairs_build(vol, (u8*)a +
-			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
-			mp_size, rl2, ll, -1, NULL);
-	if (unlikely(ret)) {
-		ntfs_error(vol->sb, "Failed to build mapping pairs array for "
-				"mft bitmap attribute.");
-		goto undo_alloc;
-	}
-	/* Update the highest_vcn. */
-	a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1);
-	/*
-	 * We now have extended the mft bitmap allocated_size by one cluster.
-	 * Reflect this in the ntfs_inode structure and the attribute record.
-	 */
-	if (a->data.non_resident.lowest_vcn) {
-		/*
-		 * We are not in the first attribute extent, switch to it, but
-		 * first ensure the changes will make it to disk later.
-		 */
-		flush_dcache_mft_record_page(ctx->ntfs_ino);
-		mark_mft_record_dirty(ctx->ntfs_ino);
-		ntfs_attr_reinit_search_ctx(ctx);
-		ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
-				mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL,
-				0, ctx);
-		if (unlikely(ret)) {
-			ntfs_error(vol->sb, "Failed to find first attribute "
-					"extent of mft bitmap attribute.");
-			goto restore_undo_alloc;
-		}
-		a = ctx->attr;
-	}
-	write_lock_irqsave(&mftbmp_ni->size_lock, flags);
-	mftbmp_ni->allocated_size += vol->cluster_size;
-	a->data.non_resident.allocated_size =
-			cpu_to_sle64(mftbmp_ni->allocated_size);
-	write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-	/* Ensure the changes make it to disk. */
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(mft_ni);
-	up_write(&mftbmp_ni->runlist.lock);
-	ntfs_debug("Done.");
-	return 0;
-restore_undo_alloc:
-	ntfs_attr_reinit_search_ctx(ctx);
-	if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
-			mftbmp_ni->name_len, CASE_SENSITIVE, rl[1].vcn, NULL,
-			0, ctx)) {
-		ntfs_error(vol->sb, "Failed to find last attribute extent of "
-				"mft bitmap attribute.%s", es);
-		write_lock_irqsave(&mftbmp_ni->size_lock, flags);
-		mftbmp_ni->allocated_size += vol->cluster_size;
-		write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-		ntfs_attr_put_search_ctx(ctx);
-		unmap_mft_record(mft_ni);
-		up_write(&mftbmp_ni->runlist.lock);
-		/*
-		 * The only thing that is now wrong is ->allocated_size of the
-		 * base attribute extent which chkdsk should be able to fix.
-		 */
-		NVolSetErrors(vol);
-		return ret;
-	}
-	a = ctx->attr;
-	a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 2);
-undo_alloc:
-	if (status.added_cluster) {
-		/* Truncate the last run in the runlist by one cluster. */
-		rl->length--;
-		rl[1].vcn--;
-	} else if (status.added_run) {
-		lcn = rl->lcn;
-		/* Remove the last run from the runlist. */
-		rl->lcn = rl[1].lcn;
-		rl->length = 0;
-	}
-	/* Deallocate the cluster. */
-	down_write(&vol->lcnbmp_lock);
-	if (ntfs_bitmap_clear_bit(vol->lcnbmp_ino, lcn)) {
-		ntfs_error(vol->sb, "Failed to free allocated cluster.%s", es);
-		NVolSetErrors(vol);
-	}
-	up_write(&vol->lcnbmp_lock);
-	if (status.mp_rebuilt) {
-		if (ntfs_mapping_pairs_build(vol, (u8*)a + le16_to_cpu(
-				a->data.non_resident.mapping_pairs_offset),
-				old_alen - le16_to_cpu(
-				a->data.non_resident.mapping_pairs_offset),
-				rl2, ll, -1, NULL)) {
-			ntfs_error(vol->sb, "Failed to restore mapping pairs "
-					"array.%s", es);
-			NVolSetErrors(vol);
-		}
-		if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
-			ntfs_error(vol->sb, "Failed to restore attribute "
-					"record.%s", es);
-			NVolSetErrors(vol);
-		}
-		flush_dcache_mft_record_page(ctx->ntfs_ino);
-		mark_mft_record_dirty(ctx->ntfs_ino);
-	}
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (!IS_ERR(mrec))
-		unmap_mft_record(mft_ni);
-	up_write(&mftbmp_ni->runlist.lock);
-	return ret;
-}
-
-/**
- * ntfs_mft_bitmap_extend_initialized_nolock - extend mftbmp initialized data
- * @vol:	volume on which to extend the mft bitmap attribute
- *
- * Extend the initialized portion of the mft bitmap attribute on the ntfs
- * volume @vol by 8 bytes.
- *
- * Note:  Only changes initialized_size and data_size, i.e. requires that
- * allocated_size is big enough to fit the new initialized_size.
- *
- * Return 0 on success and -error on error.
- *
- * Locking: Caller must hold vol->mftbmp_lock for writing.
- */
-static int ntfs_mft_bitmap_extend_initialized_nolock(ntfs_volume *vol)
-{
-	s64 old_data_size, old_initialized_size;
-	unsigned long flags;
-	struct inode *mftbmp_vi;
-	ntfs_inode *mft_ni, *mftbmp_ni;
-	ntfs_attr_search_ctx *ctx;
-	MFT_RECORD *mrec;
-	ATTR_RECORD *a;
-	int ret;
-
-	ntfs_debug("Extending mft bitmap initiailized (and data) size.");
-	mft_ni = NTFS_I(vol->mft_ino);
-	mftbmp_vi = vol->mftbmp_ino;
-	mftbmp_ni = NTFS_I(mftbmp_vi);
-	/* Get the attribute record. */
-	mrec = map_mft_record(mft_ni);
-	if (IS_ERR(mrec)) {
-		ntfs_error(vol->sb, "Failed to map mft record.");
-		return PTR_ERR(mrec);
-	}
-	ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
-	if (unlikely(!ctx)) {
-		ntfs_error(vol->sb, "Failed to get search context.");
-		ret = -ENOMEM;
-		goto unm_err_out;
-	}
-	ret = ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
-			mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(ret)) {
-		ntfs_error(vol->sb, "Failed to find first attribute extent of "
-				"mft bitmap attribute.");
-		if (ret == -ENOENT)
-			ret = -EIO;
-		goto put_err_out;
-	}
-	a = ctx->attr;
-	write_lock_irqsave(&mftbmp_ni->size_lock, flags);
-	old_data_size = i_size_read(mftbmp_vi);
-	old_initialized_size = mftbmp_ni->initialized_size;
-	/*
-	 * We can simply update the initialized_size before filling the space
-	 * with zeroes because the caller is holding the mft bitmap lock for
-	 * writing which ensures that no one else is trying to access the data.
-	 */
-	mftbmp_ni->initialized_size += 8;
-	a->data.non_resident.initialized_size =
-			cpu_to_sle64(mftbmp_ni->initialized_size);
-	if (mftbmp_ni->initialized_size > old_data_size) {
-		i_size_write(mftbmp_vi, mftbmp_ni->initialized_size);
-		a->data.non_resident.data_size =
-				cpu_to_sle64(mftbmp_ni->initialized_size);
-	}
-	write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-	/* Ensure the changes make it to disk. */
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(mft_ni);
-	/* Initialize the mft bitmap attribute value with zeroes. */
-	ret = ntfs_attr_set(mftbmp_ni, old_initialized_size, 8, 0);
-	if (likely(!ret)) {
-		ntfs_debug("Done.  (Wrote eight initialized bytes to mft "
-				"bitmap.");
-		return 0;
-	}
-	ntfs_error(vol->sb, "Failed to write to mft bitmap.");
-	/* Try to recover from the error. */
-	mrec = map_mft_record(mft_ni);
-	if (IS_ERR(mrec)) {
-		ntfs_error(vol->sb, "Failed to map mft record.%s", es);
-		NVolSetErrors(vol);
-		return ret;
-	}
-	ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
-	if (unlikely(!ctx)) {
-		ntfs_error(vol->sb, "Failed to get search context.%s", es);
-		NVolSetErrors(vol);
-		goto unm_err_out;
-	}
-	if (ntfs_attr_lookup(mftbmp_ni->type, mftbmp_ni->name,
-			mftbmp_ni->name_len, CASE_SENSITIVE, 0, NULL, 0, ctx)) {
-		ntfs_error(vol->sb, "Failed to find first attribute extent of "
-				"mft bitmap attribute.%s", es);
-		NVolSetErrors(vol);
-put_err_out:
-		ntfs_attr_put_search_ctx(ctx);
-unm_err_out:
-		unmap_mft_record(mft_ni);
-		goto err_out;
-	}
-	a = ctx->attr;
-	write_lock_irqsave(&mftbmp_ni->size_lock, flags);
-	mftbmp_ni->initialized_size = old_initialized_size;
-	a->data.non_resident.initialized_size =
-			cpu_to_sle64(old_initialized_size);
-	if (i_size_read(mftbmp_vi) != old_data_size) {
-		i_size_write(mftbmp_vi, old_data_size);
-		a->data.non_resident.data_size = cpu_to_sle64(old_data_size);
-	}
-	write_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(mft_ni);
-#ifdef DEBUG
-	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
-	ntfs_debug("Restored status of mftbmp: allocated_size 0x%llx, "
-			"data_size 0x%llx, initialized_size 0x%llx.",
-			(long long)mftbmp_ni->allocated_size,
-			(long long)i_size_read(mftbmp_vi),
-			(long long)mftbmp_ni->initialized_size);
-	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-#endif /* DEBUG */
-err_out:
-	return ret;
-}
-
-/**
- * ntfs_mft_data_extend_allocation_nolock - extend mft data attribute
- * @vol:	volume on which to extend the mft data attribute
- *
- * Extend the mft data attribute on the ntfs volume @vol by 16 mft records
- * worth of clusters or if not enough space for this by one mft record worth
- * of clusters.
- *
- * Note:  Only changes allocated_size, i.e. does not touch initialized_size or
- * data_size.
- *
- * Return 0 on success and -errno on error.
- *
- * Locking: - Caller must hold vol->mftbmp_lock for writing.
- *	    - This function takes NTFS_I(vol->mft_ino)->runlist.lock for
- *	      writing and releases it before returning.
- *	    - This function calls functions which take vol->lcnbmp_lock for
- *	      writing and release it before returning.
- */
-static int ntfs_mft_data_extend_allocation_nolock(ntfs_volume *vol)
-{
-	LCN lcn;
-	VCN old_last_vcn;
-	s64 min_nr, nr, ll;
-	unsigned long flags;
-	ntfs_inode *mft_ni;
-	runlist_element *rl, *rl2;
-	ntfs_attr_search_ctx *ctx = NULL;
-	MFT_RECORD *mrec;
-	ATTR_RECORD *a = NULL;
-	int ret, mp_size;
-	u32 old_alen = 0;
-	bool mp_rebuilt = false;
-
-	ntfs_debug("Extending mft data allocation.");
-	mft_ni = NTFS_I(vol->mft_ino);
-	/*
-	 * Determine the preferred allocation location, i.e. the last lcn of
-	 * the mft data attribute.  The allocated size of the mft data
-	 * attribute cannot be zero so we are ok to do this.
-	 */
-	down_write(&mft_ni->runlist.lock);
-	read_lock_irqsave(&mft_ni->size_lock, flags);
-	ll = mft_ni->allocated_size;
-	read_unlock_irqrestore(&mft_ni->size_lock, flags);
-	rl = ntfs_attr_find_vcn_nolock(mft_ni,
-			(ll - 1) >> vol->cluster_size_bits, NULL);
-	if (IS_ERR(rl) || unlikely(!rl->length || rl->lcn < 0)) {
-		up_write(&mft_ni->runlist.lock);
-		ntfs_error(vol->sb, "Failed to determine last allocated "
-				"cluster of mft data attribute.");
-		if (!IS_ERR(rl))
-			ret = -EIO;
-		else
-			ret = PTR_ERR(rl);
-		return ret;
-	}
-	lcn = rl->lcn + rl->length;
-	ntfs_debug("Last lcn of mft data attribute is 0x%llx.", (long long)lcn);
-	/* Minimum allocation is one mft record worth of clusters. */
-	min_nr = vol->mft_record_size >> vol->cluster_size_bits;
-	if (!min_nr)
-		min_nr = 1;
-	/* Want to allocate 16 mft records worth of clusters. */
-	nr = vol->mft_record_size << 4 >> vol->cluster_size_bits;
-	if (!nr)
-		nr = min_nr;
-	/* Ensure we do not go above 2^32-1 mft records. */
-	read_lock_irqsave(&mft_ni->size_lock, flags);
-	ll = mft_ni->allocated_size;
-	read_unlock_irqrestore(&mft_ni->size_lock, flags);
-	if (unlikely((ll + (nr << vol->cluster_size_bits)) >>
-			vol->mft_record_size_bits >= (1ll << 32))) {
-		nr = min_nr;
-		if (unlikely((ll + (nr << vol->cluster_size_bits)) >>
-				vol->mft_record_size_bits >= (1ll << 32))) {
-			ntfs_warning(vol->sb, "Cannot allocate mft record "
-					"because the maximum number of inodes "
-					"(2^32) has already been reached.");
-			up_write(&mft_ni->runlist.lock);
-			return -ENOSPC;
-		}
-	}
-	ntfs_debug("Trying mft data allocation with %s cluster count %lli.",
-			nr > min_nr ? "default" : "minimal", (long long)nr);
-	old_last_vcn = rl[1].vcn;
-	do {
-		rl2 = ntfs_cluster_alloc(vol, old_last_vcn, nr, lcn, MFT_ZONE,
-				true);
-		if (!IS_ERR(rl2))
-			break;
-		if (PTR_ERR(rl2) != -ENOSPC || nr == min_nr) {
-			ntfs_error(vol->sb, "Failed to allocate the minimal "
-					"number of clusters (%lli) for the "
-					"mft data attribute.", (long long)nr);
-			up_write(&mft_ni->runlist.lock);
-			return PTR_ERR(rl2);
-		}
-		/*
-		 * There is not enough space to do the allocation, but there
-		 * might be enough space to do a minimal allocation so try that
-		 * before failing.
-		 */
-		nr = min_nr;
-		ntfs_debug("Retrying mft data allocation with minimal cluster "
-				"count %lli.", (long long)nr);
-	} while (1);
-	rl = ntfs_runlists_merge(mft_ni->runlist.rl, rl2);
-	if (IS_ERR(rl)) {
-		up_write(&mft_ni->runlist.lock);
-		ntfs_error(vol->sb, "Failed to merge runlists for mft data "
-				"attribute.");
-		if (ntfs_cluster_free_from_rl(vol, rl2)) {
-			ntfs_error(vol->sb, "Failed to deallocate clusters "
-					"from the mft data attribute.%s", es);
-			NVolSetErrors(vol);
-		}
-		ntfs_free(rl2);
-		return PTR_ERR(rl);
-	}
-	mft_ni->runlist.rl = rl;
-	ntfs_debug("Allocated %lli clusters.", (long long)nr);
-	/* Find the last run in the new runlist. */
-	for (; rl[1].length; rl++)
-		;
-	/* Update the attribute record as well. */
-	mrec = map_mft_record(mft_ni);
-	if (IS_ERR(mrec)) {
-		ntfs_error(vol->sb, "Failed to map mft record.");
-		ret = PTR_ERR(mrec);
-		goto undo_alloc;
-	}
-	ctx = ntfs_attr_get_search_ctx(mft_ni, mrec);
-	if (unlikely(!ctx)) {
-		ntfs_error(vol->sb, "Failed to get search context.");
-		ret = -ENOMEM;
-		goto undo_alloc;
-	}
-	ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
-			CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx);
-	if (unlikely(ret)) {
-		ntfs_error(vol->sb, "Failed to find last attribute extent of "
-				"mft data attribute.");
-		if (ret == -ENOENT)
-			ret = -EIO;
-		goto undo_alloc;
-	}
-	a = ctx->attr;
-	ll = sle64_to_cpu(a->data.non_resident.lowest_vcn);
-	/* Search back for the previous last allocated cluster of mft bitmap. */
-	for (rl2 = rl; rl2 > mft_ni->runlist.rl; rl2--) {
-		if (ll >= rl2->vcn)
-			break;
-	}
-	BUG_ON(ll < rl2->vcn);
-	BUG_ON(ll >= rl2->vcn + rl2->length);
-	/* Get the size for the new mapping pairs array for this extent. */
-	mp_size = ntfs_get_size_for_mapping_pairs(vol, rl2, ll, -1);
-	if (unlikely(mp_size <= 0)) {
-		ntfs_error(vol->sb, "Get size for mapping pairs failed for "
-				"mft data attribute extent.");
-		ret = mp_size;
-		if (!ret)
-			ret = -EIO;
-		goto undo_alloc;
-	}
-	/* Expand the attribute record if necessary. */
-	old_alen = le32_to_cpu(a->length);
-	ret = ntfs_attr_record_resize(ctx->mrec, a, mp_size +
-			le16_to_cpu(a->data.non_resident.mapping_pairs_offset));
-	if (unlikely(ret)) {
-		if (ret != -ENOSPC) {
-			ntfs_error(vol->sb, "Failed to resize attribute "
-					"record for mft data attribute.");
-			goto undo_alloc;
-		}
-		// TODO: Deal with this by moving this extent to a new mft
-		// record or by starting a new extent in a new mft record or by
-		// moving other attributes out of this mft record.
-		// Note: Use the special reserved mft records and ensure that
-		// this extent is not required to find the mft record in
-		// question.  If no free special records left we would need to
-		// move an existing record away, insert ours in its place, and
-		// then place the moved record into the newly allocated space
-		// and we would then need to update all references to this mft
-		// record appropriately.  This is rather complicated...
-		ntfs_error(vol->sb, "Not enough space in this mft record to "
-				"accommodate extended mft data attribute "
-				"extent.  Cannot handle this yet.");
-		ret = -EOPNOTSUPP;
-		goto undo_alloc;
-	}
-	mp_rebuilt = true;
-	/* Generate the mapping pairs array directly into the attr record. */
-	ret = ntfs_mapping_pairs_build(vol, (u8*)a +
-			le16_to_cpu(a->data.non_resident.mapping_pairs_offset),
-			mp_size, rl2, ll, -1, NULL);
-	if (unlikely(ret)) {
-		ntfs_error(vol->sb, "Failed to build mapping pairs array of "
-				"mft data attribute.");
-		goto undo_alloc;
-	}
-	/* Update the highest_vcn. */
-	a->data.non_resident.highest_vcn = cpu_to_sle64(rl[1].vcn - 1);
-	/*
-	 * We now have extended the mft data allocated_size by nr clusters.
-	 * Reflect this in the ntfs_inode structure and the attribute record.
-	 * @rl is the last (non-terminator) runlist element of mft data
-	 * attribute.
-	 */
-	if (a->data.non_resident.lowest_vcn) {
-		/*
-		 * We are not in the first attribute extent, switch to it, but
-		 * first ensure the changes will make it to disk later.
-		 */
-		flush_dcache_mft_record_page(ctx->ntfs_ino);
-		mark_mft_record_dirty(ctx->ntfs_ino);
-		ntfs_attr_reinit_search_ctx(ctx);
-		ret = ntfs_attr_lookup(mft_ni->type, mft_ni->name,
-				mft_ni->name_len, CASE_SENSITIVE, 0, NULL, 0,
-				ctx);
-		if (unlikely(ret)) {
-			ntfs_error(vol->sb, "Failed to find first attribute "
-					"extent of mft data attribute.");
-			goto restore_undo_alloc;
-		}
-		a = ctx->attr;
-	}
-	write_lock_irqsave(&mft_ni->size_lock, flags);
-	mft_ni->allocated_size += nr << vol->cluster_size_bits;
-	a->data.non_resident.allocated_size =
-			cpu_to_sle64(mft_ni->allocated_size);
-	write_unlock_irqrestore(&mft_ni->size_lock, flags);
-	/* Ensure the changes make it to disk. */
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(mft_ni);
-	up_write(&mft_ni->runlist.lock);
-	ntfs_debug("Done.");
-	return 0;
-restore_undo_alloc:
-	ntfs_attr_reinit_search_ctx(ctx);
-	if (ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
-			CASE_SENSITIVE, rl[1].vcn, NULL, 0, ctx)) {
-		ntfs_error(vol->sb, "Failed to find last attribute extent of "
-				"mft data attribute.%s", es);
-		write_lock_irqsave(&mft_ni->size_lock, flags);
-		mft_ni->allocated_size += nr << vol->cluster_size_bits;
-		write_unlock_irqrestore(&mft_ni->size_lock, flags);
-		ntfs_attr_put_search_ctx(ctx);
-		unmap_mft_record(mft_ni);
-		up_write(&mft_ni->runlist.lock);
-		/*
-		 * The only thing that is now wrong is ->allocated_size of the
-		 * base attribute extent which chkdsk should be able to fix.
-		 */
-		NVolSetErrors(vol);
-		return ret;
-	}
-	ctx->attr->data.non_resident.highest_vcn =
-			cpu_to_sle64(old_last_vcn - 1);
-undo_alloc:
-	if (ntfs_cluster_free(mft_ni, old_last_vcn, -1, ctx) < 0) {
-		ntfs_error(vol->sb, "Failed to free clusters from mft data "
-				"attribute.%s", es);
-		NVolSetErrors(vol);
-	}
-
-	if (ntfs_rl_truncate_nolock(vol, &mft_ni->runlist, old_last_vcn)) {
-		ntfs_error(vol->sb, "Failed to truncate mft data attribute "
-				"runlist.%s", es);
-		NVolSetErrors(vol);
-	}
-	if (ctx) {
-		a = ctx->attr;
-		if (mp_rebuilt && !IS_ERR(ctx->mrec)) {
-			if (ntfs_mapping_pairs_build(vol, (u8 *)a + le16_to_cpu(
-				a->data.non_resident.mapping_pairs_offset),
-				old_alen - le16_to_cpu(
-					a->data.non_resident.mapping_pairs_offset),
-				rl2, ll, -1, NULL)) {
-				ntfs_error(vol->sb, "Failed to restore mapping pairs "
-					"array.%s", es);
-				NVolSetErrors(vol);
-			}
-			if (ntfs_attr_record_resize(ctx->mrec, a, old_alen)) {
-				ntfs_error(vol->sb, "Failed to restore attribute "
-					"record.%s", es);
-				NVolSetErrors(vol);
-			}
-			flush_dcache_mft_record_page(ctx->ntfs_ino);
-			mark_mft_record_dirty(ctx->ntfs_ino);
-		} else if (IS_ERR(ctx->mrec)) {
-			ntfs_error(vol->sb, "Failed to restore attribute search "
-				"context.%s", es);
-			NVolSetErrors(vol);
-		}
-		ntfs_attr_put_search_ctx(ctx);
-	}
-	if (!IS_ERR(mrec))
-		unmap_mft_record(mft_ni);
-	up_write(&mft_ni->runlist.lock);
-	return ret;
-}
-
-/**
- * ntfs_mft_record_layout - layout an mft record into a memory buffer
- * @vol:	volume to which the mft record will belong
- * @mft_no:	mft reference specifying the mft record number
- * @m:		destination buffer of size >= @vol->mft_record_size bytes
- *
- * Layout an empty, unused mft record with the mft record number @mft_no into
- * the buffer @m.  The volume @vol is needed because the mft record structure
- * was modified in NTFS 3.1 so we need to know which volume version this mft
- * record will be used on.
- *
- * Return 0 on success and -errno on error.
- */
-static int ntfs_mft_record_layout(const ntfs_volume *vol, const s64 mft_no,
-		MFT_RECORD *m)
-{
-	ATTR_RECORD *a;
-
-	ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
-	if (mft_no >= (1ll << 32)) {
-		ntfs_error(vol->sb, "Mft record number 0x%llx exceeds "
-				"maximum of 2^32.", (long long)mft_no);
-		return -ERANGE;
-	}
-	/* Start by clearing the whole mft record to gives us a clean slate. */
-	memset(m, 0, vol->mft_record_size);
-	/* Aligned to 2-byte boundary. */
-	if (vol->major_ver < 3 || (vol->major_ver == 3 && !vol->minor_ver))
-		m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD_OLD) + 1) & ~1);
-	else {
-		m->usa_ofs = cpu_to_le16((sizeof(MFT_RECORD) + 1) & ~1);
-		/*
-		 * Set the NTFS 3.1+ specific fields while we know that the
-		 * volume version is 3.1+.
-		 */
-		m->reserved = 0;
-		m->mft_record_number = cpu_to_le32((u32)mft_no);
-	}
-	m->magic = magic_FILE;
-	if (vol->mft_record_size >= NTFS_BLOCK_SIZE)
-		m->usa_count = cpu_to_le16(vol->mft_record_size /
-				NTFS_BLOCK_SIZE + 1);
-	else {
-		m->usa_count = cpu_to_le16(1);
-		ntfs_warning(vol->sb, "Sector size is bigger than mft record "
-				"size.  Setting usa_count to 1.  If chkdsk "
-				"reports this as corruption, please email "
-				"linux-ntfs-dev@lists.sourceforge.net stating "
-				"that you saw this message and that the "
-				"modified filesystem created was corrupt.  "
-				"Thank you.");
-	}
-	/* Set the update sequence number to 1. */
-	*(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = cpu_to_le16(1);
-	m->lsn = 0;
-	m->sequence_number = cpu_to_le16(1);
-	m->link_count = 0;
-	/*
-	 * Place the attributes straight after the update sequence array,
-	 * aligned to 8-byte boundary.
-	 */
-	m->attrs_offset = cpu_to_le16((le16_to_cpu(m->usa_ofs) +
-			(le16_to_cpu(m->usa_count) << 1) + 7) & ~7);
-	m->flags = 0;
-	/*
-	 * Using attrs_offset plus eight bytes (for the termination attribute).
-	 * attrs_offset is already aligned to 8-byte boundary, so no need to
-	 * align again.
-	 */
-	m->bytes_in_use = cpu_to_le32(le16_to_cpu(m->attrs_offset) + 8);
-	m->bytes_allocated = cpu_to_le32(vol->mft_record_size);
-	m->base_mft_record = 0;
-	m->next_attr_instance = 0;
-	/* Add the termination attribute. */
-	a = (ATTR_RECORD*)((u8*)m + le16_to_cpu(m->attrs_offset));
-	a->type = AT_END;
-	a->length = 0;
-	ntfs_debug("Done.");
-	return 0;
-}
-
-/**
- * ntfs_mft_record_format - format an mft record on an ntfs volume
- * @vol:	volume on which to format the mft record
- * @mft_no:	mft record number to format
- *
- * Format the mft record @mft_no in $MFT/$DATA, i.e. lay out an empty, unused
- * mft record into the appropriate place of the mft data attribute.  This is
- * used when extending the mft data attribute.
- *
- * Return 0 on success and -errno on error.
- */
-static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no)
-{
-	loff_t i_size;
-	struct inode *mft_vi = vol->mft_ino;
-	struct page *page;
-	MFT_RECORD *m;
-	pgoff_t index, end_index;
-	unsigned int ofs;
-	int err;
-
-	ntfs_debug("Entering for mft record 0x%llx.", (long long)mft_no);
-	/*
-	 * The index into the page cache and the offset within the page cache
-	 * page of the wanted mft record.
-	 */
-	index = mft_no << vol->mft_record_size_bits >> PAGE_SHIFT;
-	ofs = (mft_no << vol->mft_record_size_bits) & ~PAGE_MASK;
-	/* The maximum valid index into the page cache for $MFT's data. */
-	i_size = i_size_read(mft_vi);
-	end_index = i_size >> PAGE_SHIFT;
-	if (unlikely(index >= end_index)) {
-		if (unlikely(index > end_index || ofs + vol->mft_record_size >=
-				(i_size & ~PAGE_MASK))) {
-			ntfs_error(vol->sb, "Tried to format non-existing mft "
-					"record 0x%llx.", (long long)mft_no);
-			return -ENOENT;
-		}
-	}
-	/* Read, map, and pin the page containing the mft record. */
-	page = ntfs_map_page(mft_vi->i_mapping, index);
-	if (IS_ERR(page)) {
-		ntfs_error(vol->sb, "Failed to map page containing mft record "
-				"to format 0x%llx.", (long long)mft_no);
-		return PTR_ERR(page);
-	}
-	lock_page(page);
-	BUG_ON(!PageUptodate(page));
-	ClearPageUptodate(page);
-	m = (MFT_RECORD*)((u8*)page_address(page) + ofs);
-	err = ntfs_mft_record_layout(vol, mft_no, m);
-	if (unlikely(err)) {
-		ntfs_error(vol->sb, "Failed to layout mft record 0x%llx.",
-				(long long)mft_no);
-		SetPageUptodate(page);
-		unlock_page(page);
-		ntfs_unmap_page(page);
-		return err;
-	}
-	flush_dcache_page(page);
-	SetPageUptodate(page);
-	unlock_page(page);
-	/*
-	 * Make sure the mft record is written out to disk.  We could use
-	 * ilookup5() to check if an inode is in icache and so on but this is
-	 * unnecessary as ntfs_writepage() will write the dirty record anyway.
-	 */
-	mark_ntfs_record_dirty(page, ofs);
-	ntfs_unmap_page(page);
-	ntfs_debug("Done.");
-	return 0;
-}
-
-/**
- * ntfs_mft_record_alloc - allocate an mft record on an ntfs volume
- * @vol:	[IN]  volume on which to allocate the mft record
- * @mode:	[IN]  mode if want a file or directory, i.e. base inode or 0
- * @base_ni:	[IN]  open base inode if allocating an extent mft record or NULL
- * @mrec:	[OUT] on successful return this is the mapped mft record
- *
- * Allocate an mft record in $MFT/$DATA of an open ntfs volume @vol.
- *
- * If @base_ni is NULL make the mft record a base mft record, i.e. a file or
- * direvctory inode, and allocate it at the default allocator position.  In
- * this case @mode is the file mode as given to us by the caller.  We in
- * particular use @mode to distinguish whether a file or a directory is being
- * created (S_IFDIR(mode) and S_IFREG(mode), respectively).
- *
- * If @base_ni is not NULL make the allocated mft record an extent record,
- * allocate it starting at the mft record after the base mft record and attach
- * the allocated and opened ntfs inode to the base inode @base_ni.  In this
- * case @mode must be 0 as it is meaningless for extent inodes.
- *
- * You need to check the return value with IS_ERR().  If false, the function
- * was successful and the return value is the now opened ntfs inode of the
- * allocated mft record.  *@mrec is then set to the allocated, mapped, pinned,
- * and locked mft record.  If IS_ERR() is true, the function failed and the
- * error code is obtained from PTR_ERR(return value).  *@mrec is undefined in
- * this case.
- *
- * Allocation strategy:
- *
- * To find a free mft record, we scan the mft bitmap for a zero bit.  To
- * optimize this we start scanning at the place specified by @base_ni or if
- * @base_ni is NULL we start where we last stopped and we perform wrap around
- * when we reach the end.  Note, we do not try to allocate mft records below
- * number 24 because numbers 0 to 15 are the defined system files anyway and 16
- * to 24 are special in that they are used for storing extension mft records
- * for the $DATA attribute of $MFT.  This is required to avoid the possibility
- * of creating a runlist with a circular dependency which once written to disk
- * can never be read in again.  Windows will only use records 16 to 24 for
- * normal files if the volume is completely out of space.  We never use them
- * which means that when the volume is really out of space we cannot create any
- * more files while Windows can still create up to 8 small files.  We can start
- * doing this at some later time, it does not matter much for now.
- *
- * When scanning the mft bitmap, we only search up to the last allocated mft
- * record.  If there are no free records left in the range 24 to number of
- * allocated mft records, then we extend the $MFT/$DATA attribute in order to
- * create free mft records.  We extend the allocated size of $MFT/$DATA by 16
- * records at a time or one cluster, if cluster size is above 16kiB.  If there
- * is not sufficient space to do this, we try to extend by a single mft record
- * or one cluster, if cluster size is above the mft record size.
- *
- * No matter how many mft records we allocate, we initialize only the first
- * allocated mft record, incrementing mft data size and initialized size
- * accordingly, open an ntfs_inode for it and return it to the caller, unless
- * there are less than 24 mft records, in which case we allocate and initialize
- * mft records until we reach record 24 which we consider as the first free mft
- * record for use by normal files.
- *
- * If during any stage we overflow the initialized data in the mft bitmap, we
- * extend the initialized size (and data size) by 8 bytes, allocating another
- * cluster if required.  The bitmap data size has to be at least equal to the
- * number of mft records in the mft, but it can be bigger, in which case the
- * superflous bits are padded with zeroes.
- *
- * Thus, when we return successfully (IS_ERR() is false), we will have:
- *	- initialized / extended the mft bitmap if necessary,
- *	- initialized / extended the mft data if necessary,
- *	- set the bit corresponding to the mft record being allocated in the
- *	  mft bitmap,
- *	- opened an ntfs_inode for the allocated mft record, and we will have
- *	- returned the ntfs_inode as well as the allocated mapped, pinned, and
- *	  locked mft record.
- *
- * On error, the volume will be left in a consistent state and no record will
- * be allocated.  If rolling back a partial operation fails, we may leave some
- * inconsistent metadata in which case we set NVolErrors() so the volume is
- * left dirty when unmounted.
- *
- * Note, this function cannot make use of most of the normal functions, like
- * for example for attribute resizing, etc, because when the run list overflows
- * the base mft record and an attribute list is used, it is very important that
- * the extension mft records used to store the $DATA attribute of $MFT can be
- * reached without having to read the information contained inside them, as
- * this would make it impossible to find them in the first place after the
- * volume is unmounted.  $MFT/$BITMAP probably does not need to follow this
- * rule because the bitmap is not essential for finding the mft records, but on
- * the other hand, handling the bitmap in this special way would make life
- * easier because otherwise there might be circular invocations of functions
- * when reading the bitmap.
- */
-ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
-		ntfs_inode *base_ni, MFT_RECORD **mrec)
-{
-	s64 ll, bit, old_data_initialized, old_data_size;
-	unsigned long flags;
-	struct inode *vi;
-	struct page *page;
-	ntfs_inode *mft_ni, *mftbmp_ni, *ni;
-	ntfs_attr_search_ctx *ctx;
-	MFT_RECORD *m;
-	ATTR_RECORD *a;
-	pgoff_t index;
-	unsigned int ofs;
-	int err;
-	le16 seq_no, usn;
-	bool record_formatted = false;
-
-	if (base_ni) {
-		ntfs_debug("Entering (allocating an extent mft record for "
-				"base mft record 0x%llx).",
-				(long long)base_ni->mft_no);
-		/* @mode and @base_ni are mutually exclusive. */
-		BUG_ON(mode);
-	} else
-		ntfs_debug("Entering (allocating a base mft record).");
-	if (mode) {
-		/* @mode and @base_ni are mutually exclusive. */
-		BUG_ON(base_ni);
-		/* We only support creation of normal files and directories. */
-		if (!S_ISREG(mode) && !S_ISDIR(mode))
-			return ERR_PTR(-EOPNOTSUPP);
-	}
-	BUG_ON(!mrec);
-	mft_ni = NTFS_I(vol->mft_ino);
-	mftbmp_ni = NTFS_I(vol->mftbmp_ino);
-	down_write(&vol->mftbmp_lock);
-	bit = ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(vol, base_ni);
-	if (bit >= 0) {
-		ntfs_debug("Found and allocated free record (#1), bit 0x%llx.",
-				(long long)bit);
-		goto have_alloc_rec;
-	}
-	if (bit != -ENOSPC) {
-		up_write(&vol->mftbmp_lock);
-		return ERR_PTR(bit);
-	}
-	/*
-	 * No free mft records left.  If the mft bitmap already covers more
-	 * than the currently used mft records, the next records are all free,
-	 * so we can simply allocate the first unused mft record.
-	 * Note: We also have to make sure that the mft bitmap at least covers
-	 * the first 24 mft records as they are special and whilst they may not
-	 * be in use, we do not allocate from them.
-	 */
-	read_lock_irqsave(&mft_ni->size_lock, flags);
-	ll = mft_ni->initialized_size >> vol->mft_record_size_bits;
-	read_unlock_irqrestore(&mft_ni->size_lock, flags);
-	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
-	old_data_initialized = mftbmp_ni->initialized_size;
-	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-	if (old_data_initialized << 3 > ll && old_data_initialized > 3) {
-		bit = ll;
-		if (bit < 24)
-			bit = 24;
-		if (unlikely(bit >= (1ll << 32)))
-			goto max_err_out;
-		ntfs_debug("Found free record (#2), bit 0x%llx.",
-				(long long)bit);
-		goto found_free_rec;
-	}
-	/*
-	 * The mft bitmap needs to be expanded until it covers the first unused
-	 * mft record that we can allocate.
-	 * Note: The smallest mft record we allocate is mft record 24.
-	 */
-	bit = old_data_initialized << 3;
-	if (unlikely(bit >= (1ll << 32)))
-		goto max_err_out;
-	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
-	old_data_size = mftbmp_ni->allocated_size;
-	ntfs_debug("Status of mftbmp before extension: allocated_size 0x%llx, "
-			"data_size 0x%llx, initialized_size 0x%llx.",
-			(long long)old_data_size,
-			(long long)i_size_read(vol->mftbmp_ino),
-			(long long)old_data_initialized);
-	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-	if (old_data_initialized + 8 > old_data_size) {
-		/* Need to extend bitmap by one more cluster. */
-		ntfs_debug("mftbmp: initialized_size + 8 > allocated_size.");
-		err = ntfs_mft_bitmap_extend_allocation_nolock(vol);
-		if (unlikely(err)) {
-			up_write(&vol->mftbmp_lock);
-			goto err_out;
-		}
-#ifdef DEBUG
-		read_lock_irqsave(&mftbmp_ni->size_lock, flags);
-		ntfs_debug("Status of mftbmp after allocation extension: "
-				"allocated_size 0x%llx, data_size 0x%llx, "
-				"initialized_size 0x%llx.",
-				(long long)mftbmp_ni->allocated_size,
-				(long long)i_size_read(vol->mftbmp_ino),
-				(long long)mftbmp_ni->initialized_size);
-		read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-#endif /* DEBUG */
-	}
-	/*
-	 * We now have sufficient allocated space, extend the initialized_size
-	 * as well as the data_size if necessary and fill the new space with
-	 * zeroes.
-	 */
-	err = ntfs_mft_bitmap_extend_initialized_nolock(vol);
-	if (unlikely(err)) {
-		up_write(&vol->mftbmp_lock);
-		goto err_out;
-	}
-#ifdef DEBUG
-	read_lock_irqsave(&mftbmp_ni->size_lock, flags);
-	ntfs_debug("Status of mftbmp after initialized extension: "
-			"allocated_size 0x%llx, data_size 0x%llx, "
-			"initialized_size 0x%llx.",
-			(long long)mftbmp_ni->allocated_size,
-			(long long)i_size_read(vol->mftbmp_ino),
-			(long long)mftbmp_ni->initialized_size);
-	read_unlock_irqrestore(&mftbmp_ni->size_lock, flags);
-#endif /* DEBUG */
-	ntfs_debug("Found free record (#3), bit 0x%llx.", (long long)bit);
-found_free_rec:
-	/* @bit is the found free mft record, allocate it in the mft bitmap. */
-	ntfs_debug("At found_free_rec.");
-	err = ntfs_bitmap_set_bit(vol->mftbmp_ino, bit);
-	if (unlikely(err)) {
-		ntfs_error(vol->sb, "Failed to allocate bit in mft bitmap.");
-		up_write(&vol->mftbmp_lock);
-		goto err_out;
-	}
-	ntfs_debug("Set bit 0x%llx in mft bitmap.", (long long)bit);
-have_alloc_rec:
-	/*
-	 * The mft bitmap is now uptodate.  Deal with mft data attribute now.
-	 * Note, we keep hold of the mft bitmap lock for writing until all
-	 * modifications to the mft data attribute are complete, too, as they
-	 * will impact decisions for mft bitmap and mft record allocation done
-	 * by a parallel allocation and if the lock is not maintained a
-	 * parallel allocation could allocate the same mft record as this one.
-	 */
-	ll = (bit + 1) << vol->mft_record_size_bits;
-	read_lock_irqsave(&mft_ni->size_lock, flags);
-	old_data_initialized = mft_ni->initialized_size;
-	read_unlock_irqrestore(&mft_ni->size_lock, flags);
-	if (ll <= old_data_initialized) {
-		ntfs_debug("Allocated mft record already initialized.");
-		goto mft_rec_already_initialized;
-	}
-	ntfs_debug("Initializing allocated mft record.");
-	/*
-	 * The mft record is outside the initialized data.  Extend the mft data
-	 * attribute until it covers the allocated record.  The loop is only
-	 * actually traversed more than once when a freshly formatted volume is
-	 * first written to so it optimizes away nicely in the common case.
-	 */
-	read_lock_irqsave(&mft_ni->size_lock, flags);
-	ntfs_debug("Status of mft data before extension: "
-			"allocated_size 0x%llx, data_size 0x%llx, "
-			"initialized_size 0x%llx.",
-			(long long)mft_ni->allocated_size,
-			(long long)i_size_read(vol->mft_ino),
-			(long long)mft_ni->initialized_size);
-	while (ll > mft_ni->allocated_size) {
-		read_unlock_irqrestore(&mft_ni->size_lock, flags);
-		err = ntfs_mft_data_extend_allocation_nolock(vol);
-		if (unlikely(err)) {
-			ntfs_error(vol->sb, "Failed to extend mft data "
-					"allocation.");
-			goto undo_mftbmp_alloc_nolock;
-		}
-		read_lock_irqsave(&mft_ni->size_lock, flags);
-		ntfs_debug("Status of mft data after allocation extension: "
-				"allocated_size 0x%llx, data_size 0x%llx, "
-				"initialized_size 0x%llx.",
-				(long long)mft_ni->allocated_size,
-				(long long)i_size_read(vol->mft_ino),
-				(long long)mft_ni->initialized_size);
-	}
-	read_unlock_irqrestore(&mft_ni->size_lock, flags);
-	/*
-	 * Extend mft data initialized size (and data size of course) to reach
-	 * the allocated mft record, formatting the mft records allong the way.
-	 * Note: We only modify the ntfs_inode structure as that is all that is
-	 * needed by ntfs_mft_record_format().  We will update the attribute
-	 * record itself in one fell swoop later on.
-	 */
-	write_lock_irqsave(&mft_ni->size_lock, flags);
-	old_data_initialized = mft_ni->initialized_size;
-	old_data_size = vol->mft_ino->i_size;
-	while (ll > mft_ni->initialized_size) {
-		s64 new_initialized_size, mft_no;
-		
-		new_initialized_size = mft_ni->initialized_size +
-				vol->mft_record_size;
-		mft_no = mft_ni->initialized_size >> vol->mft_record_size_bits;
-		if (new_initialized_size > i_size_read(vol->mft_ino))
-			i_size_write(vol->mft_ino, new_initialized_size);
-		write_unlock_irqrestore(&mft_ni->size_lock, flags);
-		ntfs_debug("Initializing mft record 0x%llx.",
-				(long long)mft_no);
-		err = ntfs_mft_record_format(vol, mft_no);
-		if (unlikely(err)) {
-			ntfs_error(vol->sb, "Failed to format mft record.");
-			goto undo_data_init;
-		}
-		write_lock_irqsave(&mft_ni->size_lock, flags);
-		mft_ni->initialized_size = new_initialized_size;
-	}
-	write_unlock_irqrestore(&mft_ni->size_lock, flags);
-	record_formatted = true;
-	/* Update the mft data attribute record to reflect the new sizes. */
-	m = map_mft_record(mft_ni);
-	if (IS_ERR(m)) {
-		ntfs_error(vol->sb, "Failed to map mft record.");
-		err = PTR_ERR(m);
-		goto undo_data_init;
-	}
-	ctx = ntfs_attr_get_search_ctx(mft_ni, m);
-	if (unlikely(!ctx)) {
-		ntfs_error(vol->sb, "Failed to get search context.");
-		err = -ENOMEM;
-		unmap_mft_record(mft_ni);
-		goto undo_data_init;
-	}
-	err = ntfs_attr_lookup(mft_ni->type, mft_ni->name, mft_ni->name_len,
-			CASE_SENSITIVE, 0, NULL, 0, ctx);
-	if (unlikely(err)) {
-		ntfs_error(vol->sb, "Failed to find first attribute extent of "
-				"mft data attribute.");
-		ntfs_attr_put_search_ctx(ctx);
-		unmap_mft_record(mft_ni);
-		goto undo_data_init;
-	}
-	a = ctx->attr;
-	read_lock_irqsave(&mft_ni->size_lock, flags);
-	a->data.non_resident.initialized_size =
-			cpu_to_sle64(mft_ni->initialized_size);
-	a->data.non_resident.data_size =
-			cpu_to_sle64(i_size_read(vol->mft_ino));
-	read_unlock_irqrestore(&mft_ni->size_lock, flags);
-	/* Ensure the changes make it to disk. */
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(mft_ni);
-	read_lock_irqsave(&mft_ni->size_lock, flags);
-	ntfs_debug("Status of mft data after mft record initialization: "
-			"allocated_size 0x%llx, data_size 0x%llx, "
-			"initialized_size 0x%llx.",
-			(long long)mft_ni->allocated_size,
-			(long long)i_size_read(vol->mft_ino),
-			(long long)mft_ni->initialized_size);
-	BUG_ON(i_size_read(vol->mft_ino) > mft_ni->allocated_size);
-	BUG_ON(mft_ni->initialized_size > i_size_read(vol->mft_ino));
-	read_unlock_irqrestore(&mft_ni->size_lock, flags);
-mft_rec_already_initialized:
-	/*
-	 * We can finally drop the mft bitmap lock as the mft data attribute
-	 * has been fully updated.  The only disparity left is that the
-	 * allocated mft record still needs to be marked as in use to match the
-	 * set bit in the mft bitmap but this is actually not a problem since
-	 * this mft record is not referenced from anywhere yet and the fact
-	 * that it is allocated in the mft bitmap means that no-one will try to
-	 * allocate it either.
-	 */
-	up_write(&vol->mftbmp_lock);
-	/*
-	 * We now have allocated and initialized the mft record.  Calculate the
-	 * index of and the offset within the page cache page the record is in.
-	 */
-	index = bit << vol->mft_record_size_bits >> PAGE_SHIFT;
-	ofs = (bit << vol->mft_record_size_bits) & ~PAGE_MASK;
-	/* Read, map, and pin the page containing the mft record. */
-	page = ntfs_map_page(vol->mft_ino->i_mapping, index);
-	if (IS_ERR(page)) {
-		ntfs_error(vol->sb, "Failed to map page containing allocated "
-				"mft record 0x%llx.", (long long)bit);
-		err = PTR_ERR(page);
-		goto undo_mftbmp_alloc;
-	}
-	lock_page(page);
-	BUG_ON(!PageUptodate(page));
-	ClearPageUptodate(page);
-	m = (MFT_RECORD*)((u8*)page_address(page) + ofs);
-	/* If we just formatted the mft record no need to do it again. */
-	if (!record_formatted) {
-		/* Sanity check that the mft record is really not in use. */
-		if (ntfs_is_file_record(m->magic) &&
-				(m->flags & MFT_RECORD_IN_USE)) {
-			ntfs_error(vol->sb, "Mft record 0x%llx was marked "
-					"free in mft bitmap but is marked "
-					"used itself.  Corrupt filesystem.  "
-					"Unmount and run chkdsk.",
-					(long long)bit);
-			err = -EIO;
-			SetPageUptodate(page);
-			unlock_page(page);
-			ntfs_unmap_page(page);
-			NVolSetErrors(vol);
-			goto undo_mftbmp_alloc;
-		}
-		/*
-		 * We need to (re-)format the mft record, preserving the
-		 * sequence number if it is not zero as well as the update
-		 * sequence number if it is not zero or -1 (0xffff).  This
-		 * means we do not need to care whether or not something went
-		 * wrong with the previous mft record.
-		 */
-		seq_no = m->sequence_number;
-		usn = *(le16*)((u8*)m + le16_to_cpu(m->usa_ofs));
-		err = ntfs_mft_record_layout(vol, bit, m);
-		if (unlikely(err)) {
-			ntfs_error(vol->sb, "Failed to layout allocated mft "
-					"record 0x%llx.", (long long)bit);
-			SetPageUptodate(page);
-			unlock_page(page);
-			ntfs_unmap_page(page);
-			goto undo_mftbmp_alloc;
-		}
-		if (seq_no)
-			m->sequence_number = seq_no;
-		if (usn && le16_to_cpu(usn) != 0xffff)
-			*(le16*)((u8*)m + le16_to_cpu(m->usa_ofs)) = usn;
-	}
-	/* Set the mft record itself in use. */
-	m->flags |= MFT_RECORD_IN_USE;
-	if (S_ISDIR(mode))
-		m->flags |= MFT_RECORD_IS_DIRECTORY;
-	flush_dcache_page(page);
-	SetPageUptodate(page);
-	if (base_ni) {
-		MFT_RECORD *m_tmp;
-
-		/*
-		 * Setup the base mft record in the extent mft record.  This
-		 * completes initialization of the allocated extent mft record
-		 * and we can simply use it with map_extent_mft_record().
-		 */
-		m->base_mft_record = MK_LE_MREF(base_ni->mft_no,
-				base_ni->seq_no);
-		/*
-		 * Allocate an extent inode structure for the new mft record,
-		 * attach it to the base inode @base_ni and map, pin, and lock
-		 * its, i.e. the allocated, mft record.
-		 */
-		m_tmp = map_extent_mft_record(base_ni, bit, &ni);
-		if (IS_ERR(m_tmp)) {
-			ntfs_error(vol->sb, "Failed to map allocated extent "
-					"mft record 0x%llx.", (long long)bit);
-			err = PTR_ERR(m_tmp);
-			/* Set the mft record itself not in use. */
-			m->flags &= cpu_to_le16(
-					~le16_to_cpu(MFT_RECORD_IN_USE));
-			flush_dcache_page(page);
-			/* Make sure the mft record is written out to disk. */
-			mark_ntfs_record_dirty(page, ofs);
-			unlock_page(page);
-			ntfs_unmap_page(page);
-			goto undo_mftbmp_alloc;
-		}
-		BUG_ON(m != m_tmp);
-		/*
-		 * Make sure the allocated mft record is written out to disk.
-		 * No need to set the inode dirty because the caller is going
-		 * to do that anyway after finishing with the new extent mft
-		 * record (e.g. at a minimum a new attribute will be added to
-		 * the mft record.
-		 */
-		mark_ntfs_record_dirty(page, ofs);
-		unlock_page(page);
-		/*
-		 * Need to unmap the page since map_extent_mft_record() mapped
-		 * it as well so we have it mapped twice at the moment.
-		 */
-		ntfs_unmap_page(page);
-	} else {
-		/*
-		 * Allocate a new VFS inode and set it up.  NOTE: @vi->i_nlink
-		 * is set to 1 but the mft record->link_count is 0.  The caller
-		 * needs to bear this in mind.
-		 */
-		vi = new_inode(vol->sb);
-		if (unlikely(!vi)) {
-			err = -ENOMEM;
-			/* Set the mft record itself not in use. */
-			m->flags &= cpu_to_le16(
-					~le16_to_cpu(MFT_RECORD_IN_USE));
-			flush_dcache_page(page);
-			/* Make sure the mft record is written out to disk. */
-			mark_ntfs_record_dirty(page, ofs);
-			unlock_page(page);
-			ntfs_unmap_page(page);
-			goto undo_mftbmp_alloc;
-		}
-		vi->i_ino = bit;
-
-		/* The owner and group come from the ntfs volume. */
-		vi->i_uid = vol->uid;
-		vi->i_gid = vol->gid;
-
-		/* Initialize the ntfs specific part of @vi. */
-		ntfs_init_big_inode(vi);
-		ni = NTFS_I(vi);
-		/*
-		 * Set the appropriate mode, attribute type, and name.  For
-		 * directories, also setup the index values to the defaults.
-		 */
-		if (S_ISDIR(mode)) {
-			vi->i_mode = S_IFDIR | S_IRWXUGO;
-			vi->i_mode &= ~vol->dmask;
-
-			NInoSetMstProtected(ni);
-			ni->type = AT_INDEX_ALLOCATION;
-			ni->name = I30;
-			ni->name_len = 4;
-
-			ni->itype.index.block_size = 4096;
-			ni->itype.index.block_size_bits = ntfs_ffs(4096) - 1;
-			ni->itype.index.collation_rule = COLLATION_FILE_NAME;
-			if (vol->cluster_size <= ni->itype.index.block_size) {
-				ni->itype.index.vcn_size = vol->cluster_size;
-				ni->itype.index.vcn_size_bits =
-						vol->cluster_size_bits;
-			} else {
-				ni->itype.index.vcn_size = vol->sector_size;
-				ni->itype.index.vcn_size_bits =
-						vol->sector_size_bits;
-			}
-		} else {
-			vi->i_mode = S_IFREG | S_IRWXUGO;
-			vi->i_mode &= ~vol->fmask;
-
-			ni->type = AT_DATA;
-			ni->name = NULL;
-			ni->name_len = 0;
-		}
-		if (IS_RDONLY(vi))
-			vi->i_mode &= ~S_IWUGO;
-
-		/* Set the inode times to the current time. */
-		simple_inode_init_ts(vi);
-		/*
-		 * Set the file size to 0, the ntfs inode sizes are set to 0 by
-		 * the call to ntfs_init_big_inode() below.
-		 */
-		vi->i_size = 0;
-		vi->i_blocks = 0;
-
-		/* Set the sequence number. */
-		vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
-		/*
-		 * Manually map, pin, and lock the mft record as we already
-		 * have its page mapped and it is very easy to do.
-		 */
-		atomic_inc(&ni->count);
-		mutex_lock(&ni->mrec_lock);
-		ni->page = page;
-		ni->page_ofs = ofs;
-		/*
-		 * Make sure the allocated mft record is written out to disk.
-		 * NOTE: We do not set the ntfs inode dirty because this would
-		 * fail in ntfs_write_inode() because the inode does not have a
-		 * standard information attribute yet.  Also, there is no need
-		 * to set the inode dirty because the caller is going to do
-		 * that anyway after finishing with the new mft record (e.g. at
-		 * a minimum some new attributes will be added to the mft
-		 * record.
-		 */
-		mark_ntfs_record_dirty(page, ofs);
-		unlock_page(page);
-
-		/* Add the inode to the inode hash for the superblock. */
-		insert_inode_hash(vi);
-
-		/* Update the default mft allocation position. */
-		vol->mft_data_pos = bit + 1;
-	}
-	/*
-	 * Return the opened, allocated inode of the allocated mft record as
-	 * well as the mapped, pinned, and locked mft record.
-	 */
-	ntfs_debug("Returning opened, allocated %sinode 0x%llx.",
-			base_ni ? "extent " : "", (long long)bit);
-	*mrec = m;
-	return ni;
-undo_data_init:
-	write_lock_irqsave(&mft_ni->size_lock, flags);
-	mft_ni->initialized_size = old_data_initialized;
-	i_size_write(vol->mft_ino, old_data_size);
-	write_unlock_irqrestore(&mft_ni->size_lock, flags);
-	goto undo_mftbmp_alloc_nolock;
-undo_mftbmp_alloc:
-	down_write(&vol->mftbmp_lock);
-undo_mftbmp_alloc_nolock:
-	if (ntfs_bitmap_clear_bit(vol->mftbmp_ino, bit)) {
-		ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es);
-		NVolSetErrors(vol);
-	}
-	up_write(&vol->mftbmp_lock);
-err_out:
-	return ERR_PTR(err);
-max_err_out:
-	ntfs_warning(vol->sb, "Cannot allocate mft record because the maximum "
-			"number of inodes (2^32) has already been reached.");
-	up_write(&vol->mftbmp_lock);
-	return ERR_PTR(-ENOSPC);
-}
-
-/**
- * ntfs_extent_mft_record_free - free an extent mft record on an ntfs volume
- * @ni:		ntfs inode of the mapped extent mft record to free
- * @m:		mapped extent mft record of the ntfs inode @ni
- *
- * Free the mapped extent mft record @m of the extent ntfs inode @ni.
- *
- * Note that this function unmaps the mft record and closes and destroys @ni
- * internally and hence you cannot use either @ni nor @m any more after this
- * function returns success.
- *
- * On success return 0 and on error return -errno.  @ni and @m are still valid
- * in this case and have not been freed.
- *
- * For some errors an error message is displayed and the success code 0 is
- * returned and the volume is then left dirty on umount.  This makes sense in
- * case we could not rollback the changes that were already done since the
- * caller no longer wants to reference this mft record so it does not matter to
- * the caller if something is wrong with it as long as it is properly detached
- * from the base inode.
- */
-int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m)
-{
-	unsigned long mft_no = ni->mft_no;
-	ntfs_volume *vol = ni->vol;
-	ntfs_inode *base_ni;
-	ntfs_inode **extent_nis;
-	int i, err;
-	le16 old_seq_no;
-	u16 seq_no;
-	
-	BUG_ON(NInoAttr(ni));
-	BUG_ON(ni->nr_extents != -1);
-
-	mutex_lock(&ni->extent_lock);
-	base_ni = ni->ext.base_ntfs_ino;
-	mutex_unlock(&ni->extent_lock);
-
-	BUG_ON(base_ni->nr_extents <= 0);
-
-	ntfs_debug("Entering for extent inode 0x%lx, base inode 0x%lx.\n",
-			mft_no, base_ni->mft_no);
-
-	mutex_lock(&base_ni->extent_lock);
-
-	/* Make sure we are holding the only reference to the extent inode. */
-	if (atomic_read(&ni->count) > 2) {
-		ntfs_error(vol->sb, "Tried to free busy extent inode 0x%lx, "
-				"not freeing.", base_ni->mft_no);
-		mutex_unlock(&base_ni->extent_lock);
-		return -EBUSY;
-	}
-
-	/* Dissociate the ntfs inode from the base inode. */
-	extent_nis = base_ni->ext.extent_ntfs_inos;
-	err = -ENOENT;
-	for (i = 0; i < base_ni->nr_extents; i++) {
-		if (ni != extent_nis[i])
-			continue;
-		extent_nis += i;
-		base_ni->nr_extents--;
-		memmove(extent_nis, extent_nis + 1, (base_ni->nr_extents - i) *
-				sizeof(ntfs_inode*));
-		err = 0;
-		break;
-	}
-
-	mutex_unlock(&base_ni->extent_lock);
-
-	if (unlikely(err)) {
-		ntfs_error(vol->sb, "Extent inode 0x%lx is not attached to "
-				"its base inode 0x%lx.", mft_no,
-				base_ni->mft_no);
-		BUG();
-	}
-
-	/*
-	 * The extent inode is no longer attached to the base inode so no one
-	 * can get a reference to it any more.
-	 */
-
-	/* Mark the mft record as not in use. */
-	m->flags &= ~MFT_RECORD_IN_USE;
-
-	/* Increment the sequence number, skipping zero, if it is not zero. */
-	old_seq_no = m->sequence_number;
-	seq_no = le16_to_cpu(old_seq_no);
-	if (seq_no == 0xffff)
-		seq_no = 1;
-	else if (seq_no)
-		seq_no++;
-	m->sequence_number = cpu_to_le16(seq_no);
-
-	/*
-	 * Set the ntfs inode dirty and write it out.  We do not need to worry
-	 * about the base inode here since whatever caused the extent mft
-	 * record to be freed is guaranteed to do it already.
-	 */
-	NInoSetDirty(ni);
-	err = write_mft_record(ni, m, 0);
-	if (unlikely(err)) {
-		ntfs_error(vol->sb, "Failed to write mft record 0x%lx, not "
-				"freeing.", mft_no);
-		goto rollback;
-	}
-rollback_error:
-	/* Unmap and throw away the now freed extent inode. */
-	unmap_extent_mft_record(ni);
-	ntfs_clear_extent_inode(ni);
-
-	/* Clear the bit in the $MFT/$BITMAP corresponding to this record. */
-	down_write(&vol->mftbmp_lock);
-	err = ntfs_bitmap_clear_bit(vol->mftbmp_ino, mft_no);
-	up_write(&vol->mftbmp_lock);
-	if (unlikely(err)) {
-		/*
-		 * The extent inode is gone but we failed to deallocate it in
-		 * the mft bitmap.  Just emit a warning and leave the volume
-		 * dirty on umount.
-		 */
-		ntfs_error(vol->sb, "Failed to clear bit in mft bitmap.%s", es);
-		NVolSetErrors(vol);
-	}
-	return 0;
-rollback:
-	/* Rollback what we did... */
-	mutex_lock(&base_ni->extent_lock);
-	extent_nis = base_ni->ext.extent_ntfs_inos;
-	if (!(base_ni->nr_extents & 3)) {
-		int new_size = (base_ni->nr_extents + 4) * sizeof(ntfs_inode*);
-
-		extent_nis = kmalloc(new_size, GFP_NOFS);
-		if (unlikely(!extent_nis)) {
-			ntfs_error(vol->sb, "Failed to allocate internal "
-					"buffer during rollback.%s", es);
-			mutex_unlock(&base_ni->extent_lock);
-			NVolSetErrors(vol);
-			goto rollback_error;
-		}
-		if (base_ni->nr_extents) {
-			BUG_ON(!base_ni->ext.extent_ntfs_inos);
-			memcpy(extent_nis, base_ni->ext.extent_ntfs_inos,
-					new_size - 4 * sizeof(ntfs_inode*));
-			kfree(base_ni->ext.extent_ntfs_inos);
-		}
-		base_ni->ext.extent_ntfs_inos = extent_nis;
-	}
-	m->flags |= MFT_RECORD_IN_USE;
-	m->sequence_number = old_seq_no;
-	extent_nis[base_ni->nr_extents++] = ni;
-	mutex_unlock(&base_ni->extent_lock);
-	mark_mft_record_dirty(ni);
-	return err;
-}
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/mft.h b/fs/ntfs/mft.h
deleted file mode 100644
index 49c001af16ed..000000000000
--- a/fs/ntfs/mft.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * mft.h - Defines for mft record handling in NTFS Linux kernel driver.
- *	   Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_MFT_H
-#define _LINUX_NTFS_MFT_H
-
-#include <linux/fs.h>
-#include <linux/highmem.h>
-#include <linux/pagemap.h>
-
-#include "inode.h"
-
-extern MFT_RECORD *map_mft_record(ntfs_inode *ni);
-extern void unmap_mft_record(ntfs_inode *ni);
-
-extern MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref,
-		ntfs_inode **ntfs_ino);
-
-static inline void unmap_extent_mft_record(ntfs_inode *ni)
-{
-	unmap_mft_record(ni);
-	return;
-}
-
-#ifdef NTFS_RW
-
-/**
- * flush_dcache_mft_record_page - flush_dcache_page() for mft records
- * @ni:		ntfs inode structure of mft record
- *
- * Call flush_dcache_page() for the page in which an mft record resides.
- *
- * This must be called every time an mft record is modified, just after the
- * modification.
- */
-static inline void flush_dcache_mft_record_page(ntfs_inode *ni)
-{
-	flush_dcache_page(ni->page);
-}
-
-extern void __mark_mft_record_dirty(ntfs_inode *ni);
-
-/**
- * mark_mft_record_dirty - set the mft record and the page containing it dirty
- * @ni:		ntfs inode describing the mapped mft record
- *
- * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni,
- * as well as the page containing the mft record, dirty.  Also, mark the base
- * vfs inode dirty.  This ensures that any changes to the mft record are
- * written out to disk.
- *
- * NOTE:  Do not do anything if the mft record is already marked dirty.
- */
-static inline void mark_mft_record_dirty(ntfs_inode *ni)
-{
-	if (!NInoTestSetDirty(ni))
-		__mark_mft_record_dirty(ni);
-}
-
-extern int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no,
-		MFT_RECORD *m, int sync);
-
-extern int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync);
-
-/**
- * write_mft_record - write out a mapped (extent) mft record
- * @ni:		ntfs inode describing the mapped (extent) mft record
- * @m:		mapped (extent) mft record to write
- * @sync:	if true, wait for i/o completion
- *
- * This is just a wrapper for write_mft_record_nolock() (see mft.c), which
- * locks the page for the duration of the write.  This ensures that there are
- * no race conditions between writing the mft record via the dirty inode code
- * paths and via the page cache write back code paths or between writing
- * neighbouring mft records residing in the same page.
- *
- * Locking the page also serializes us against ->read_folio() if the page is not
- * uptodate.
- *
- * On success, clean the mft record and return 0.  On error, leave the mft
- * record dirty and return -errno.
- */
-static inline int write_mft_record(ntfs_inode *ni, MFT_RECORD *m, int sync)
-{
-	struct page *page = ni->page;
-	int err;
-
-	BUG_ON(!page);
-	lock_page(page);
-	err = write_mft_record_nolock(ni, m, sync);
-	unlock_page(page);
-	return err;
-}
-
-extern bool ntfs_may_write_mft_record(ntfs_volume *vol,
-		const unsigned long mft_no, const MFT_RECORD *m,
-		ntfs_inode **locked_ni);
-
-extern ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode,
-		ntfs_inode *base_ni, MFT_RECORD **mrec);
-extern int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m);
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_MFT_H */
diff --git a/fs/ntfs/mst.c b/fs/ntfs/mst.c
deleted file mode 100644
index 16b3c884abfc..000000000000
--- a/fs/ntfs/mst.c
+++ /dev/null
@@ -1,189 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * mst.c - NTFS multi sector transfer protection handling code. Part of the
- *	   Linux-NTFS project.
- *
- * Copyright (c) 2001-2004 Anton Altaparmakov
- */
-
-#include "ntfs.h"
-
-/**
- * post_read_mst_fixup - deprotect multi sector transfer protected data
- * @b:		pointer to the data to deprotect
- * @size:	size in bytes of @b
- *
- * Perform the necessary post read multi sector transfer fixup and detect the
- * presence of incomplete multi sector transfers. - In that case, overwrite the
- * magic of the ntfs record header being processed with "BAAD" (in memory only!)
- * and abort processing.
- *
- * Return 0 on success and -EINVAL on error ("BAAD" magic will be present).
- *
- * NOTE: We consider the absence / invalidity of an update sequence array to
- * mean that the structure is not protected at all and hence doesn't need to
- * be fixed up. Thus, we return success and not failure in this case. This is
- * in contrast to pre_write_mst_fixup(), see below.
- */
-int post_read_mst_fixup(NTFS_RECORD *b, const u32 size)
-{
-	u16 usa_ofs, usa_count, usn;
-	u16 *usa_pos, *data_pos;
-
-	/* Setup the variables. */
-	usa_ofs = le16_to_cpu(b->usa_ofs);
-	/* Decrement usa_count to get number of fixups. */
-	usa_count = le16_to_cpu(b->usa_count) - 1;
-	/* Size and alignment checks. */
-	if ( size & (NTFS_BLOCK_SIZE - 1)	||
-	     usa_ofs & 1			||
-	     usa_ofs + (usa_count * 2) > size	||
-	     (size >> NTFS_BLOCK_SIZE_BITS) != usa_count)
-		return 0;
-	/* Position of usn in update sequence array. */
-	usa_pos = (u16*)b + usa_ofs/sizeof(u16);
-	/*
-	 * The update sequence number which has to be equal to each of the
-	 * u16 values before they are fixed up. Note no need to care for
-	 * endianness since we are comparing and moving data for on disk
-	 * structures which means the data is consistent. - If it is
-	 * consistenty the wrong endianness it doesn't make any difference.
-	 */
-	usn = *usa_pos;
-	/*
-	 * Position in protected data of first u16 that needs fixing up.
-	 */
-	data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1;
-	/*
-	 * Check for incomplete multi sector transfer(s).
-	 */
-	while (usa_count--) {
-		if (*data_pos != usn) {
-			/*
-			 * Incomplete multi sector transfer detected! )-:
-			 * Set the magic to "BAAD" and return failure.
-			 * Note that magic_BAAD is already converted to le32.
-			 */
-			b->magic = magic_BAAD;
-			return -EINVAL;
-		}
-		data_pos += NTFS_BLOCK_SIZE/sizeof(u16);
-	}
-	/* Re-setup the variables. */
-	usa_count = le16_to_cpu(b->usa_count) - 1;
-	data_pos = (u16*)b + NTFS_BLOCK_SIZE/sizeof(u16) - 1;
-	/* Fixup all sectors. */
-	while (usa_count--) {
-		/*
-		 * Increment position in usa and restore original data from
-		 * the usa into the data buffer.
-		 */
-		*data_pos = *(++usa_pos);
-		/* Increment position in data as well. */
-		data_pos += NTFS_BLOCK_SIZE/sizeof(u16);
-	}
-	return 0;
-}
-
-/**
- * pre_write_mst_fixup - apply multi sector transfer protection
- * @b:		pointer to the data to protect
- * @size:	size in bytes of @b
- *
- * Perform the necessary pre write multi sector transfer fixup on the data
- * pointer to by @b of @size.
- *
- * Return 0 if fixup applied (success) or -EINVAL if no fixup was performed
- * (assumed not needed). This is in contrast to post_read_mst_fixup() above.
- *
- * NOTE: We consider the absence / invalidity of an update sequence array to
- * mean that the structure is not subject to protection and hence doesn't need
- * to be fixed up. This means that you have to create a valid update sequence
- * array header in the ntfs record before calling this function, otherwise it
- * will fail (the header needs to contain the position of the update sequence
- * array together with the number of elements in the array). You also need to
- * initialise the update sequence number before calling this function
- * otherwise a random word will be used (whatever was in the record at that
- * position at that time).
- */
-int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size)
-{
-	le16 *usa_pos, *data_pos;
-	u16 usa_ofs, usa_count, usn;
-	le16 le_usn;
-
-	/* Sanity check + only fixup if it makes sense. */
-	if (!b || ntfs_is_baad_record(b->magic) ||
-			ntfs_is_hole_record(b->magic))
-		return -EINVAL;
-	/* Setup the variables. */
-	usa_ofs = le16_to_cpu(b->usa_ofs);
-	/* Decrement usa_count to get number of fixups. */
-	usa_count = le16_to_cpu(b->usa_count) - 1;
-	/* Size and alignment checks. */
-	if ( size & (NTFS_BLOCK_SIZE - 1)	||
-	     usa_ofs & 1			||
-	     usa_ofs + (usa_count * 2) > size	||
-	     (size >> NTFS_BLOCK_SIZE_BITS) != usa_count)
-		return -EINVAL;
-	/* Position of usn in update sequence array. */
-	usa_pos = (le16*)((u8*)b + usa_ofs);
-	/*
-	 * Cyclically increment the update sequence number
-	 * (skipping 0 and -1, i.e. 0xffff).
-	 */
-	usn = le16_to_cpup(usa_pos) + 1;
-	if (usn == 0xffff || !usn)
-		usn = 1;
-	le_usn = cpu_to_le16(usn);
-	*usa_pos = le_usn;
-	/* Position in data of first u16 that needs fixing up. */
-	data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1;
-	/* Fixup all sectors. */
-	while (usa_count--) {
-		/*
-		 * Increment the position in the usa and save the
-		 * original data from the data buffer into the usa.
-		 */
-		*(++usa_pos) = *data_pos;
-		/* Apply fixup to data. */
-		*data_pos = le_usn;
-		/* Increment position in data as well. */
-		data_pos += NTFS_BLOCK_SIZE/sizeof(le16);
-	}
-	return 0;
-}
-
-/**
- * post_write_mst_fixup - fast deprotect multi sector transfer protected data
- * @b:		pointer to the data to deprotect
- *
- * Perform the necessary post write multi sector transfer fixup, not checking
- * for any errors, because we assume we have just used pre_write_mst_fixup(),
- * thus the data will be fine or we would never have gotten here.
- */
-void post_write_mst_fixup(NTFS_RECORD *b)
-{
-	le16 *usa_pos, *data_pos;
-
-	u16 usa_ofs = le16_to_cpu(b->usa_ofs);
-	u16 usa_count = le16_to_cpu(b->usa_count) - 1;
-
-	/* Position of usn in update sequence array. */
-	usa_pos = (le16*)b + usa_ofs/sizeof(le16);
-
-	/* Position in protected data of first u16 that needs fixing up. */
-	data_pos = (le16*)b + NTFS_BLOCK_SIZE/sizeof(le16) - 1;
-
-	/* Fixup all sectors. */
-	while (usa_count--) {
-		/*
-		 * Increment position in usa and restore original data from
-		 * the usa into the data buffer.
-		 */
-		*data_pos = *(++usa_pos);
-
-		/* Increment position in data as well. */
-		data_pos += NTFS_BLOCK_SIZE/sizeof(le16);
-	}
-}
diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c
deleted file mode 100644
index d7498ddc4a72..000000000000
--- a/fs/ntfs/namei.c
+++ /dev/null
@@ -1,392 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * namei.c - NTFS kernel directory inode operations. Part of the Linux-NTFS
- *	     project.
- *
- * Copyright (c) 2001-2006 Anton Altaparmakov
- */
-
-#include <linux/dcache.h>
-#include <linux/exportfs.h>
-#include <linux/security.h>
-#include <linux/slab.h>
-
-#include "attrib.h"
-#include "debug.h"
-#include "dir.h"
-#include "mft.h"
-#include "ntfs.h"
-
-/**
- * ntfs_lookup - find the inode represented by a dentry in a directory inode
- * @dir_ino:	directory inode in which to look for the inode
- * @dent:	dentry representing the inode to look for
- * @flags:	lookup flags
- *
- * In short, ntfs_lookup() looks for the inode represented by the dentry @dent
- * in the directory inode @dir_ino and if found attaches the inode to the
- * dentry @dent.
- *
- * In more detail, the dentry @dent specifies which inode to look for by
- * supplying the name of the inode in @dent->d_name.name. ntfs_lookup()
- * converts the name to Unicode and walks the contents of the directory inode
- * @dir_ino looking for the converted Unicode name. If the name is found in the
- * directory, the corresponding inode is loaded by calling ntfs_iget() on its
- * inode number and the inode is associated with the dentry @dent via a call to
- * d_splice_alias().
- *
- * If the name is not found in the directory, a NULL inode is inserted into the
- * dentry @dent via a call to d_add(). The dentry is then termed a negative
- * dentry.
- *
- * Only if an actual error occurs, do we return an error via ERR_PTR().
- *
- * In order to handle the case insensitivity issues of NTFS with regards to the
- * dcache and the dcache requiring only one dentry per directory, we deal with
- * dentry aliases that only differ in case in ->ntfs_lookup() while maintaining
- * a case sensitive dcache. This means that we get the full benefit of dcache
- * speed when the file/directory is looked up with the same case as returned by
- * ->ntfs_readdir() but that a lookup for any other case (or for the short file
- * name) will not find anything in dcache and will enter ->ntfs_lookup()
- * instead, where we search the directory for a fully matching file name
- * (including case) and if that is not found, we search for a file name that
- * matches with different case and if that has non-POSIX semantics we return
- * that. We actually do only one search (case sensitive) and keep tabs on
- * whether we have found a case insensitive match in the process.
- *
- * To simplify matters for us, we do not treat the short vs long filenames as
- * two hard links but instead if the lookup matches a short filename, we
- * return the dentry for the corresponding long filename instead.
- *
- * There are three cases we need to distinguish here:
- *
- * 1) @dent perfectly matches (i.e. including case) a directory entry with a
- *    file name in the WIN32 or POSIX namespaces. In this case
- *    ntfs_lookup_inode_by_name() will return with name set to NULL and we
- *    just d_splice_alias() @dent.
- * 2) @dent matches (not including case) a directory entry with a file name in
- *    the WIN32 namespace. In this case ntfs_lookup_inode_by_name() will return
- *    with name set to point to a kmalloc()ed ntfs_name structure containing
- *    the properly cased little endian Unicode name. We convert the name to the
- *    current NLS code page, search if a dentry with this name already exists
- *    and if so return that instead of @dent.  At this point things are
- *    complicated by the possibility of 'disconnected' dentries due to NFS
- *    which we deal with appropriately (see the code comments).  The VFS will
- *    then destroy the old @dent and use the one we returned.  If a dentry is
- *    not found, we allocate a new one, d_splice_alias() it, and return it as
- *    above.
- * 3) @dent matches either perfectly or not (i.e. we don't care about case) a
- *    directory entry with a file name in the DOS namespace. In this case
- *    ntfs_lookup_inode_by_name() will return with name set to point to a
- *    kmalloc()ed ntfs_name structure containing the mft reference (cpu endian)
- *    of the inode. We use the mft reference to read the inode and to find the
- *    file name in the WIN32 namespace corresponding to the matched short file
- *    name. We then convert the name to the current NLS code page, and proceed
- *    searching for a dentry with this name, etc, as in case 2), above.
- *
- * Locking: Caller must hold i_mutex on the directory.
- */
-static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent,
-		unsigned int flags)
-{
-	ntfs_volume *vol = NTFS_SB(dir_ino->i_sb);
-	struct inode *dent_inode;
-	ntfschar *uname;
-	ntfs_name *name = NULL;
-	MFT_REF mref;
-	unsigned long dent_ino;
-	int uname_len;
-
-	ntfs_debug("Looking up %pd in directory inode 0x%lx.",
-			dent, dir_ino->i_ino);
-	/* Convert the name of the dentry to Unicode. */
-	uname_len = ntfs_nlstoucs(vol, dent->d_name.name, dent->d_name.len,
-			&uname);
-	if (uname_len < 0) {
-		if (uname_len != -ENAMETOOLONG)
-			ntfs_error(vol->sb, "Failed to convert name to "
-					"Unicode.");
-		return ERR_PTR(uname_len);
-	}
-	mref = ntfs_lookup_inode_by_name(NTFS_I(dir_ino), uname, uname_len,
-			&name);
-	kmem_cache_free(ntfs_name_cache, uname);
-	if (!IS_ERR_MREF(mref)) {
-		dent_ino = MREF(mref);
-		ntfs_debug("Found inode 0x%lx. Calling ntfs_iget.", dent_ino);
-		dent_inode = ntfs_iget(vol->sb, dent_ino);
-		if (!IS_ERR(dent_inode)) {
-			/* Consistency check. */
-			if (is_bad_inode(dent_inode) || MSEQNO(mref) ==
-					NTFS_I(dent_inode)->seq_no ||
-					dent_ino == FILE_MFT) {
-				/* Perfect WIN32/POSIX match. -- Case 1. */
-				if (!name) {
-					ntfs_debug("Done.  (Case 1.)");
-					return d_splice_alias(dent_inode, dent);
-				}
-				/*
-				 * We are too indented.  Handle imperfect
-				 * matches and short file names further below.
-				 */
-				goto handle_name;
-			}
-			ntfs_error(vol->sb, "Found stale reference to inode "
-					"0x%lx (reference sequence number = "
-					"0x%x, inode sequence number = 0x%x), "
-					"returning -EIO. Run chkdsk.",
-					dent_ino, MSEQNO(mref),
-					NTFS_I(dent_inode)->seq_no);
-			iput(dent_inode);
-			dent_inode = ERR_PTR(-EIO);
-		} else
-			ntfs_error(vol->sb, "ntfs_iget(0x%lx) failed with "
-					"error code %li.", dent_ino,
-					PTR_ERR(dent_inode));
-		kfree(name);
-		/* Return the error code. */
-		return ERR_CAST(dent_inode);
-	}
-	/* It is guaranteed that @name is no longer allocated at this point. */
-	if (MREF_ERR(mref) == -ENOENT) {
-		ntfs_debug("Entry was not found, adding negative dentry.");
-		/* The dcache will handle negative entries. */
-		d_add(dent, NULL);
-		ntfs_debug("Done.");
-		return NULL;
-	}
-	ntfs_error(vol->sb, "ntfs_lookup_ino_by_name() failed with error "
-			"code %i.", -MREF_ERR(mref));
-	return ERR_PTR(MREF_ERR(mref));
-	// TODO: Consider moving this lot to a separate function! (AIA)
-handle_name:
-   {
-	MFT_RECORD *m;
-	ntfs_attr_search_ctx *ctx;
-	ntfs_inode *ni = NTFS_I(dent_inode);
-	int err;
-	struct qstr nls_name;
-
-	nls_name.name = NULL;
-	if (name->type != FILE_NAME_DOS) {			/* Case 2. */
-		ntfs_debug("Case 2.");
-		nls_name.len = (unsigned)ntfs_ucstonls(vol,
-				(ntfschar*)&name->name, name->len,
-				(unsigned char**)&nls_name.name, 0);
-		kfree(name);
-	} else /* if (name->type == FILE_NAME_DOS) */ {		/* Case 3. */
-		FILE_NAME_ATTR *fn;
-
-		ntfs_debug("Case 3.");
-		kfree(name);
-
-		/* Find the WIN32 name corresponding to the matched DOS name. */
-		ni = NTFS_I(dent_inode);
-		m = map_mft_record(ni);
-		if (IS_ERR(m)) {
-			err = PTR_ERR(m);
-			m = NULL;
-			ctx = NULL;
-			goto err_out;
-		}
-		ctx = ntfs_attr_get_search_ctx(ni, m);
-		if (unlikely(!ctx)) {
-			err = -ENOMEM;
-			goto err_out;
-		}
-		do {
-			ATTR_RECORD *a;
-			u32 val_len;
-
-			err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, 0, 0,
-					NULL, 0, ctx);
-			if (unlikely(err)) {
-				ntfs_error(vol->sb, "Inode corrupt: No WIN32 "
-						"namespace counterpart to DOS "
-						"file name. Run chkdsk.");
-				if (err == -ENOENT)
-					err = -EIO;
-				goto err_out;
-			}
-			/* Consistency checks. */
-			a = ctx->attr;
-			if (a->non_resident || a->flags)
-				goto eio_err_out;
-			val_len = le32_to_cpu(a->data.resident.value_length);
-			if (le16_to_cpu(a->data.resident.value_offset) +
-					val_len > le32_to_cpu(a->length))
-				goto eio_err_out;
-			fn = (FILE_NAME_ATTR*)((u8*)ctx->attr + le16_to_cpu(
-					ctx->attr->data.resident.value_offset));
-			if ((u32)(fn->file_name_length * sizeof(ntfschar) +
-					sizeof(FILE_NAME_ATTR)) > val_len)
-				goto eio_err_out;
-		} while (fn->file_name_type != FILE_NAME_WIN32);
-
-		/* Convert the found WIN32 name to current NLS code page. */
-		nls_name.len = (unsigned)ntfs_ucstonls(vol,
-				(ntfschar*)&fn->file_name, fn->file_name_length,
-				(unsigned char**)&nls_name.name, 0);
-
-		ntfs_attr_put_search_ctx(ctx);
-		unmap_mft_record(ni);
-	}
-	m = NULL;
-	ctx = NULL;
-
-	/* Check if a conversion error occurred. */
-	if ((signed)nls_name.len < 0) {
-		err = (signed)nls_name.len;
-		goto err_out;
-	}
-	nls_name.hash = full_name_hash(dent, nls_name.name, nls_name.len);
-
-	dent = d_add_ci(dent, dent_inode, &nls_name);
-	kfree(nls_name.name);
-	return dent;
-
-eio_err_out:
-	ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk.");
-	err = -EIO;
-err_out:
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	if (m)
-		unmap_mft_record(ni);
-	iput(dent_inode);
-	ntfs_error(vol->sb, "Failed, returning error code %i.", err);
-	return ERR_PTR(err);
-   }
-}
-
-/*
- * Inode operations for directories.
- */
-const struct inode_operations ntfs_dir_inode_ops = {
-	.lookup	= ntfs_lookup,	/* VFS: Lookup directory. */
-};
-
-/**
- * ntfs_get_parent - find the dentry of the parent of a given directory dentry
- * @child_dent:		dentry of the directory whose parent directory to find
- *
- * Find the dentry for the parent directory of the directory specified by the
- * dentry @child_dent.  This function is called from
- * fs/exportfs/expfs.c::find_exported_dentry() which in turn is called from the
- * default ->decode_fh() which is export_decode_fh() in the same file.
- *
- * The code is based on the ext3 ->get_parent() implementation found in
- * fs/ext3/namei.c::ext3_get_parent().
- *
- * Note: ntfs_get_parent() is called with @d_inode(child_dent)->i_mutex down.
- *
- * Return the dentry of the parent directory on success or the error code on
- * error (IS_ERR() is true).
- */
-static struct dentry *ntfs_get_parent(struct dentry *child_dent)
-{
-	struct inode *vi = d_inode(child_dent);
-	ntfs_inode *ni = NTFS_I(vi);
-	MFT_RECORD *mrec;
-	ntfs_attr_search_ctx *ctx;
-	ATTR_RECORD *attr;
-	FILE_NAME_ATTR *fn;
-	unsigned long parent_ino;
-	int err;
-
-	ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
-	/* Get the mft record of the inode belonging to the child dentry. */
-	mrec = map_mft_record(ni);
-	if (IS_ERR(mrec))
-		return ERR_CAST(mrec);
-	/* Find the first file name attribute in the mft record. */
-	ctx = ntfs_attr_get_search_ctx(ni, mrec);
-	if (unlikely(!ctx)) {
-		unmap_mft_record(ni);
-		return ERR_PTR(-ENOMEM);
-	}
-try_next:
-	err = ntfs_attr_lookup(AT_FILE_NAME, NULL, 0, CASE_SENSITIVE, 0, NULL,
-			0, ctx);
-	if (unlikely(err)) {
-		ntfs_attr_put_search_ctx(ctx);
-		unmap_mft_record(ni);
-		if (err == -ENOENT)
-			ntfs_error(vi->i_sb, "Inode 0x%lx does not have a "
-					"file name attribute.  Run chkdsk.",
-					vi->i_ino);
-		return ERR_PTR(err);
-	}
-	attr = ctx->attr;
-	if (unlikely(attr->non_resident))
-		goto try_next;
-	fn = (FILE_NAME_ATTR *)((u8 *)attr +
-			le16_to_cpu(attr->data.resident.value_offset));
-	if (unlikely((u8 *)fn + le32_to_cpu(attr->data.resident.value_length) >
-			(u8*)attr + le32_to_cpu(attr->length)))
-		goto try_next;
-	/* Get the inode number of the parent directory. */
-	parent_ino = MREF_LE(fn->parent_directory);
-	/* Release the search context and the mft record of the child. */
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(ni);
-
-	return d_obtain_alias(ntfs_iget(vi->i_sb, parent_ino));
-}
-
-static struct inode *ntfs_nfs_get_inode(struct super_block *sb,
-		u64 ino, u32 generation)
-{
-	struct inode *inode;
-
-	inode = ntfs_iget(sb, ino);
-	if (!IS_ERR(inode)) {
-		if (is_bad_inode(inode) || inode->i_generation != generation) {
-			iput(inode);
-			inode = ERR_PTR(-ESTALE);
-		}
-	}
-
-	return inode;
-}
-
-static struct dentry *ntfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
-		int fh_len, int fh_type)
-{
-	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
-				    ntfs_nfs_get_inode);
-}
-
-static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid,
-		int fh_len, int fh_type)
-{
-	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
-				    ntfs_nfs_get_inode);
-}
-
-/*
- * Export operations allowing NFS exporting of mounted NTFS partitions.
- *
- * We use the default ->encode_fh() for now.  Note that they
- * use 32 bits to store the inode number which is an unsigned long so on 64-bit
- * architectures is usually 64 bits so it would all fail horribly on huge
- * volumes.  I guess we need to define our own encode and decode fh functions
- * that store 64-bit inode numbers at some point but for now we will ignore the
- * problem...
- *
- * We also use the default ->get_name() helper (used by ->decode_fh() via
- * fs/exportfs/expfs.c::find_exported_dentry()) as that is completely fs
- * independent.
- *
- * The default ->get_parent() just returns -EACCES so we have to provide our
- * own and the default ->get_dentry() is incompatible with NTFS due to not
- * allowing the inode number 0 which is used in NTFS for the system file $MFT
- * and due to using iget() whereas NTFS needs ntfs_iget().
- */
-const struct export_operations ntfs_export_ops = {
-	.encode_fh	= generic_encode_ino32_fh,
-	.get_parent	= ntfs_get_parent,	/* Find the parent of a given
-						   directory. */
-	.fh_to_dentry	= ntfs_fh_to_dentry,
-	.fh_to_parent	= ntfs_fh_to_parent,
-};
diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h
deleted file mode 100644
index e81376ea9152..000000000000
--- a/fs/ntfs/ntfs.h
+++ /dev/null
@@ -1,150 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * ntfs.h - Defines for NTFS Linux kernel driver.
- *
- * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.
- * Copyright (C) 2002 Richard Russon
- */
-
-#ifndef _LINUX_NTFS_H
-#define _LINUX_NTFS_H
-
-#include <linux/stddef.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/compiler.h>
-#include <linux/fs.h>
-#include <linux/nls.h>
-#include <linux/smp.h>
-#include <linux/pagemap.h>
-
-#include "types.h"
-#include "volume.h"
-#include "layout.h"
-
-typedef enum {
-	NTFS_BLOCK_SIZE		= 512,
-	NTFS_BLOCK_SIZE_BITS	= 9,
-	NTFS_SB_MAGIC		= 0x5346544e,	/* 'NTFS' */
-	NTFS_MAX_NAME_LEN	= 255,
-	NTFS_MAX_ATTR_NAME_LEN	= 255,
-	NTFS_MAX_CLUSTER_SIZE	= 64 * 1024,	/* 64kiB */
-	NTFS_MAX_PAGES_PER_CLUSTER = NTFS_MAX_CLUSTER_SIZE / PAGE_SIZE,
-} NTFS_CONSTANTS;
-
-/* Global variables. */
-
-/* Slab caches (from super.c). */
-extern struct kmem_cache *ntfs_name_cache;
-extern struct kmem_cache *ntfs_inode_cache;
-extern struct kmem_cache *ntfs_big_inode_cache;
-extern struct kmem_cache *ntfs_attr_ctx_cache;
-extern struct kmem_cache *ntfs_index_ctx_cache;
-
-/* The various operations structs defined throughout the driver files. */
-extern const struct address_space_operations ntfs_normal_aops;
-extern const struct address_space_operations ntfs_compressed_aops;
-extern const struct address_space_operations ntfs_mst_aops;
-
-extern const struct  file_operations ntfs_file_ops;
-extern const struct inode_operations ntfs_file_inode_ops;
-
-extern const struct  file_operations ntfs_dir_ops;
-extern const struct inode_operations ntfs_dir_inode_ops;
-
-extern const struct  file_operations ntfs_empty_file_ops;
-extern const struct inode_operations ntfs_empty_inode_ops;
-
-extern const struct export_operations ntfs_export_ops;
-
-/**
- * NTFS_SB - return the ntfs volume given a vfs super block
- * @sb:		VFS super block
- *
- * NTFS_SB() returns the ntfs volume associated with the VFS super block @sb.
- */
-static inline ntfs_volume *NTFS_SB(struct super_block *sb)
-{
-	return sb->s_fs_info;
-}
-
-/* Declarations of functions and global variables. */
-
-/* From fs/ntfs/compress.c */
-extern int ntfs_read_compressed_block(struct page *page);
-extern int allocate_compression_buffers(void);
-extern void free_compression_buffers(void);
-
-/* From fs/ntfs/super.c */
-#define default_upcase_len 0x10000
-extern struct mutex ntfs_lock;
-
-typedef struct {
-	int val;
-	char *str;
-} option_t;
-extern const option_t on_errors_arr[];
-
-/* From fs/ntfs/mst.c */
-extern int post_read_mst_fixup(NTFS_RECORD *b, const u32 size);
-extern int pre_write_mst_fixup(NTFS_RECORD *b, const u32 size);
-extern void post_write_mst_fixup(NTFS_RECORD *b);
-
-/* From fs/ntfs/unistr.c */
-extern bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len,
-		const ntfschar *s2, size_t s2_len,
-		const IGNORE_CASE_BOOL ic,
-		const ntfschar *upcase, const u32 upcase_size);
-extern int ntfs_collate_names(const ntfschar *name1, const u32 name1_len,
-		const ntfschar *name2, const u32 name2_len,
-		const int err_val, const IGNORE_CASE_BOOL ic,
-		const ntfschar *upcase, const u32 upcase_len);
-extern int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n);
-extern int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n,
-		const ntfschar *upcase, const u32 upcase_size);
-extern void ntfs_upcase_name(ntfschar *name, u32 name_len,
-		const ntfschar *upcase, const u32 upcase_len);
-extern void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr,
-		const ntfschar *upcase, const u32 upcase_len);
-extern int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1,
-		FILE_NAME_ATTR *file_name_attr2,
-		const int err_val, const IGNORE_CASE_BOOL ic,
-		const ntfschar *upcase, const u32 upcase_len);
-extern int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins,
-		const int ins_len, ntfschar **outs);
-extern int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins,
-		const int ins_len, unsigned char **outs, int outs_len);
-
-/* From fs/ntfs/upcase.c */
-extern ntfschar *generate_default_upcase(void);
-
-static inline int ntfs_ffs(int x)
-{
-	int r = 1;
-
-	if (!x)
-		return 0;
-	if (!(x & 0xffff)) {
-		x >>= 16;
-		r += 16;
-	}
-	if (!(x & 0xff)) {
-		x >>= 8;
-		r += 8;
-	}
-	if (!(x & 0xf)) {
-		x >>= 4;
-		r += 4;
-	}
-	if (!(x & 3)) {
-		x >>= 2;
-		r += 2;
-	}
-	if (!(x & 1)) {
-		x >>= 1;
-		r += 1;
-	}
-	return r;
-}
-
-#endif /* _LINUX_NTFS_H */
diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c
deleted file mode 100644
index 9160480222fd..000000000000
--- a/fs/ntfs/quota.c
+++ /dev/null
@@ -1,103 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * quota.c - NTFS kernel quota ($Quota) handling.  Part of the Linux-NTFS
- *	     project.
- *
- * Copyright (c) 2004 Anton Altaparmakov
- */
-
-#ifdef NTFS_RW
-
-#include "index.h"
-#include "quota.h"
-#include "debug.h"
-#include "ntfs.h"
-
-/**
- * ntfs_mark_quotas_out_of_date - mark the quotas out of date on an ntfs volume
- * @vol:	ntfs volume on which to mark the quotas out of date
- *
- * Mark the quotas out of date on the ntfs volume @vol and return 'true' on
- * success and 'false' on error.
- */
-bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol)
-{
-	ntfs_index_context *ictx;
-	QUOTA_CONTROL_ENTRY *qce;
-	const le32 qid = QUOTA_DEFAULTS_ID;
-	int err;
-
-	ntfs_debug("Entering.");
-	if (NVolQuotaOutOfDate(vol))
-		goto done;
-	if (!vol->quota_ino || !vol->quota_q_ino) {
-		ntfs_error(vol->sb, "Quota inodes are not open.");
-		return false;
-	}
-	inode_lock(vol->quota_q_ino);
-	ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino));
-	if (!ictx) {
-		ntfs_error(vol->sb, "Failed to get index context.");
-		goto err_out;
-	}
-	err = ntfs_index_lookup(&qid, sizeof(qid), ictx);
-	if (err) {
-		if (err == -ENOENT)
-			ntfs_error(vol->sb, "Quota defaults entry is not "
-					"present.");
-		else
-			ntfs_error(vol->sb, "Lookup of quota defaults entry "
-					"failed.");
-		goto err_out;
-	}
-	if (ictx->data_len < offsetof(QUOTA_CONTROL_ENTRY, sid)) {
-		ntfs_error(vol->sb, "Quota defaults entry size is invalid.  "
-				"Run chkdsk.");
-		goto err_out;
-	}
-	qce = (QUOTA_CONTROL_ENTRY*)ictx->data;
-	if (le32_to_cpu(qce->version) != QUOTA_VERSION) {
-		ntfs_error(vol->sb, "Quota defaults entry version 0x%x is not "
-				"supported.", le32_to_cpu(qce->version));
-		goto err_out;
-	}
-	ntfs_debug("Quota defaults flags = 0x%x.", le32_to_cpu(qce->flags));
-	/* If quotas are already marked out of date, no need to do anything. */
-	if (qce->flags & QUOTA_FLAG_OUT_OF_DATE)
-		goto set_done;
-	/*
-	 * If quota tracking is neither requested, nor enabled and there are no
-	 * pending deletes, no need to mark the quotas out of date.
-	 */
-	if (!(qce->flags & (QUOTA_FLAG_TRACKING_ENABLED |
-			QUOTA_FLAG_TRACKING_REQUESTED |
-			QUOTA_FLAG_PENDING_DELETES)))
-		goto set_done;
-	/*
-	 * Set the QUOTA_FLAG_OUT_OF_DATE bit thus marking quotas out of date.
-	 * This is verified on WinXP to be sufficient to cause windows to
-	 * rescan the volume on boot and update all quota entries.
-	 */
-	qce->flags |= QUOTA_FLAG_OUT_OF_DATE;
-	/* Ensure the modified flags are written to disk. */
-	ntfs_index_entry_flush_dcache_page(ictx);
-	ntfs_index_entry_mark_dirty(ictx);
-set_done:
-	ntfs_index_ctx_put(ictx);
-	inode_unlock(vol->quota_q_ino);
-	/*
-	 * We set the flag so we do not try to mark the quotas out of date
-	 * again on remount.
-	 */
-	NVolSetQuotaOutOfDate(vol);
-done:
-	ntfs_debug("Done.");
-	return true;
-err_out:
-	if (ictx)
-		ntfs_index_ctx_put(ictx);
-	inode_unlock(vol->quota_q_ino);
-	return false;
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/quota.h b/fs/ntfs/quota.h
deleted file mode 100644
index fe3132a3d6d2..000000000000
--- a/fs/ntfs/quota.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * quota.h - Defines for NTFS kernel quota ($Quota) handling.  Part of the
- *	     Linux-NTFS project.
- *
- * Copyright (c) 2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_QUOTA_H
-#define _LINUX_NTFS_QUOTA_H
-
-#ifdef NTFS_RW
-
-#include "types.h"
-#include "volume.h"
-
-extern bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol);
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_QUOTA_H */
diff --git a/fs/ntfs/runlist.c b/fs/ntfs/runlist.c
deleted file mode 100644
index 0d448e9881f7..000000000000
--- a/fs/ntfs/runlist.c
+++ /dev/null
@@ -1,1893 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * runlist.c - NTFS runlist handling code.  Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2007 Anton Altaparmakov
- * Copyright (c) 2002-2005 Richard Russon
- */
-
-#include "debug.h"
-#include "dir.h"
-#include "endian.h"
-#include "malloc.h"
-#include "ntfs.h"
-
-/**
- * ntfs_rl_mm - runlist memmove
- *
- * It is up to the caller to serialize access to the runlist @base.
- */
-static inline void ntfs_rl_mm(runlist_element *base, int dst, int src,
-		int size)
-{
-	if (likely((dst != src) && (size > 0)))
-		memmove(base + dst, base + src, size * sizeof(*base));
-}
-
-/**
- * ntfs_rl_mc - runlist memory copy
- *
- * It is up to the caller to serialize access to the runlists @dstbase and
- * @srcbase.
- */
-static inline void ntfs_rl_mc(runlist_element *dstbase, int dst,
-		runlist_element *srcbase, int src, int size)
-{
-	if (likely(size > 0))
-		memcpy(dstbase + dst, srcbase + src, size * sizeof(*dstbase));
-}
-
-/**
- * ntfs_rl_realloc - Reallocate memory for runlists
- * @rl:		original runlist
- * @old_size:	number of runlist elements in the original runlist @rl
- * @new_size:	number of runlist elements we need space for
- *
- * As the runlists grow, more memory will be required.  To prevent the
- * kernel having to allocate and reallocate large numbers of small bits of
- * memory, this function returns an entire page of memory.
- *
- * It is up to the caller to serialize access to the runlist @rl.
- *
- * N.B.  If the new allocation doesn't require a different number of pages in
- *       memory, the function will return the original pointer.
- *
- * On success, return a pointer to the newly allocated, or recycled, memory.
- * On error, return -errno. The following error codes are defined:
- *	-ENOMEM	- Not enough memory to allocate runlist array.
- *	-EINVAL	- Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_realloc(runlist_element *rl,
-		int old_size, int new_size)
-{
-	runlist_element *new_rl;
-
-	old_size = PAGE_ALIGN(old_size * sizeof(*rl));
-	new_size = PAGE_ALIGN(new_size * sizeof(*rl));
-	if (old_size == new_size)
-		return rl;
-
-	new_rl = ntfs_malloc_nofs(new_size);
-	if (unlikely(!new_rl))
-		return ERR_PTR(-ENOMEM);
-
-	if (likely(rl != NULL)) {
-		if (unlikely(old_size > new_size))
-			old_size = new_size;
-		memcpy(new_rl, rl, old_size);
-		ntfs_free(rl);
-	}
-	return new_rl;
-}
-
-/**
- * ntfs_rl_realloc_nofail - Reallocate memory for runlists
- * @rl:		original runlist
- * @old_size:	number of runlist elements in the original runlist @rl
- * @new_size:	number of runlist elements we need space for
- *
- * As the runlists grow, more memory will be required.  To prevent the
- * kernel having to allocate and reallocate large numbers of small bits of
- * memory, this function returns an entire page of memory.
- *
- * This function guarantees that the allocation will succeed.  It will sleep
- * for as long as it takes to complete the allocation.
- *
- * It is up to the caller to serialize access to the runlist @rl.
- *
- * N.B.  If the new allocation doesn't require a different number of pages in
- *       memory, the function will return the original pointer.
- *
- * On success, return a pointer to the newly allocated, or recycled, memory.
- * On error, return -errno. The following error codes are defined:
- *	-ENOMEM	- Not enough memory to allocate runlist array.
- *	-EINVAL	- Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_realloc_nofail(runlist_element *rl,
-		int old_size, int new_size)
-{
-	runlist_element *new_rl;
-
-	old_size = PAGE_ALIGN(old_size * sizeof(*rl));
-	new_size = PAGE_ALIGN(new_size * sizeof(*rl));
-	if (old_size == new_size)
-		return rl;
-
-	new_rl = ntfs_malloc_nofs_nofail(new_size);
-	BUG_ON(!new_rl);
-
-	if (likely(rl != NULL)) {
-		if (unlikely(old_size > new_size))
-			old_size = new_size;
-		memcpy(new_rl, rl, old_size);
-		ntfs_free(rl);
-	}
-	return new_rl;
-}
-
-/**
- * ntfs_are_rl_mergeable - test if two runlists can be joined together
- * @dst:	original runlist
- * @src:	new runlist to test for mergeability with @dst
- *
- * Test if two runlists can be joined together. For this, their VCNs and LCNs
- * must be adjacent.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- *
- * Return: true   Success, the runlists can be merged.
- *	   false  Failure, the runlists cannot be merged.
- */
-static inline bool ntfs_are_rl_mergeable(runlist_element *dst,
-		runlist_element *src)
-{
-	BUG_ON(!dst);
-	BUG_ON(!src);
-
-	/* We can merge unmapped regions even if they are misaligned. */
-	if ((dst->lcn == LCN_RL_NOT_MAPPED) && (src->lcn == LCN_RL_NOT_MAPPED))
-		return true;
-	/* If the runs are misaligned, we cannot merge them. */
-	if ((dst->vcn + dst->length) != src->vcn)
-		return false;
-	/* If both runs are non-sparse and contiguous, we can merge them. */
-	if ((dst->lcn >= 0) && (src->lcn >= 0) &&
-			((dst->lcn + dst->length) == src->lcn))
-		return true;
-	/* If we are merging two holes, we can merge them. */
-	if ((dst->lcn == LCN_HOLE) && (src->lcn == LCN_HOLE))
-		return true;
-	/* Cannot merge. */
-	return false;
-}
-
-/**
- * __ntfs_rl_merge - merge two runlists without testing if they can be merged
- * @dst:	original, destination runlist
- * @src:	new runlist to merge with @dst
- *
- * Merge the two runlists, writing into the destination runlist @dst. The
- * caller must make sure the runlists can be merged or this will corrupt the
- * destination runlist.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- */
-static inline void __ntfs_rl_merge(runlist_element *dst, runlist_element *src)
-{
-	dst->length += src->length;
-}
-
-/**
- * ntfs_rl_append - append a runlist after a given element
- * @dst:	original runlist to be worked on
- * @dsize:	number of elements in @dst (including end marker)
- * @src:	runlist to be inserted into @dst
- * @ssize:	number of elements in @src (excluding end marker)
- * @loc:	append the new runlist @src after this element in @dst
- *
- * Append the runlist @src after element @loc in @dst.  Merge the right end of
- * the new runlist, if necessary. Adjust the size of the hole before the
- * appended runlist.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- *
- * On success, return a pointer to the new, combined, runlist. Note, both
- * runlists @dst and @src are deallocated before returning so you cannot use
- * the pointers for anything any more. (Strictly speaking the returned runlist
- * may be the same as @dst but this is irrelevant.)
- *
- * On error, return -errno. Both runlists are left unmodified. The following
- * error codes are defined:
- *	-ENOMEM	- Not enough memory to allocate runlist array.
- *	-EINVAL	- Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_append(runlist_element *dst,
-		int dsize, runlist_element *src, int ssize, int loc)
-{
-	bool right = false;	/* Right end of @src needs merging. */
-	int marker;		/* End of the inserted runs. */
-
-	BUG_ON(!dst);
-	BUG_ON(!src);
-
-	/* First, check if the right hand end needs merging. */
-	if ((loc + 1) < dsize)
-		right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1);
-
-	/* Space required: @dst size + @src size, less one if we merged. */
-	dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - right);
-	if (IS_ERR(dst))
-		return dst;
-	/*
-	 * We are guaranteed to succeed from here so can start modifying the
-	 * original runlists.
-	 */
-
-	/* First, merge the right hand end, if necessary. */
-	if (right)
-		__ntfs_rl_merge(src + ssize - 1, dst + loc + 1);
-
-	/* First run after the @src runs that have been inserted. */
-	marker = loc + ssize + 1;
-
-	/* Move the tail of @dst out of the way, then copy in @src. */
-	ntfs_rl_mm(dst, marker, loc + 1 + right, dsize - (loc + 1 + right));
-	ntfs_rl_mc(dst, loc + 1, src, 0, ssize);
-
-	/* Adjust the size of the preceding hole. */
-	dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn;
-
-	/* We may have changed the length of the file, so fix the end marker */
-	if (dst[marker].lcn == LCN_ENOENT)
-		dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length;
-
-	return dst;
-}
-
-/**
- * ntfs_rl_insert - insert a runlist into another
- * @dst:	original runlist to be worked on
- * @dsize:	number of elements in @dst (including end marker)
- * @src:	new runlist to be inserted
- * @ssize:	number of elements in @src (excluding end marker)
- * @loc:	insert the new runlist @src before this element in @dst
- *
- * Insert the runlist @src before element @loc in the runlist @dst. Merge the
- * left end of the new runlist, if necessary. Adjust the size of the hole
- * after the inserted runlist.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- *
- * On success, return a pointer to the new, combined, runlist. Note, both
- * runlists @dst and @src are deallocated before returning so you cannot use
- * the pointers for anything any more. (Strictly speaking the returned runlist
- * may be the same as @dst but this is irrelevant.)
- *
- * On error, return -errno. Both runlists are left unmodified. The following
- * error codes are defined:
- *	-ENOMEM	- Not enough memory to allocate runlist array.
- *	-EINVAL	- Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_insert(runlist_element *dst,
-		int dsize, runlist_element *src, int ssize, int loc)
-{
-	bool left = false;	/* Left end of @src needs merging. */
-	bool disc = false;	/* Discontinuity between @dst and @src. */
-	int marker;		/* End of the inserted runs. */
-
-	BUG_ON(!dst);
-	BUG_ON(!src);
-
-	/*
-	 * disc => Discontinuity between the end of @dst and the start of @src.
-	 *	   This means we might need to insert a "not mapped" run.
-	 */
-	if (loc == 0)
-		disc = (src[0].vcn > 0);
-	else {
-		s64 merged_length;
-
-		left = ntfs_are_rl_mergeable(dst + loc - 1, src);
-
-		merged_length = dst[loc - 1].length;
-		if (left)
-			merged_length += src->length;
-
-		disc = (src[0].vcn > dst[loc - 1].vcn + merged_length);
-	}
-	/*
-	 * Space required: @dst size + @src size, less one if we merged, plus
-	 * one if there was a discontinuity.
-	 */
-	dst = ntfs_rl_realloc(dst, dsize, dsize + ssize - left + disc);
-	if (IS_ERR(dst))
-		return dst;
-	/*
-	 * We are guaranteed to succeed from here so can start modifying the
-	 * original runlist.
-	 */
-	if (left)
-		__ntfs_rl_merge(dst + loc - 1, src);
-	/*
-	 * First run after the @src runs that have been inserted.
-	 * Nominally,  @marker equals @loc + @ssize, i.e. location + number of
-	 * runs in @src.  However, if @left, then the first run in @src has
-	 * been merged with one in @dst.  And if @disc, then @dst and @src do
-	 * not meet and we need an extra run to fill the gap.
-	 */
-	marker = loc + ssize - left + disc;
-
-	/* Move the tail of @dst out of the way, then copy in @src. */
-	ntfs_rl_mm(dst, marker, loc, dsize - loc);
-	ntfs_rl_mc(dst, loc + disc, src, left, ssize - left);
-
-	/* Adjust the VCN of the first run after the insertion... */
-	dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length;
-	/* ... and the length. */
-	if (dst[marker].lcn == LCN_HOLE || dst[marker].lcn == LCN_RL_NOT_MAPPED)
-		dst[marker].length = dst[marker + 1].vcn - dst[marker].vcn;
-
-	/* Writing beyond the end of the file and there is a discontinuity. */
-	if (disc) {
-		if (loc > 0) {
-			dst[loc].vcn = dst[loc - 1].vcn + dst[loc - 1].length;
-			dst[loc].length = dst[loc + 1].vcn - dst[loc].vcn;
-		} else {
-			dst[loc].vcn = 0;
-			dst[loc].length = dst[loc + 1].vcn;
-		}
-		dst[loc].lcn = LCN_RL_NOT_MAPPED;
-	}
-	return dst;
-}
-
-/**
- * ntfs_rl_replace - overwrite a runlist element with another runlist
- * @dst:	original runlist to be worked on
- * @dsize:	number of elements in @dst (including end marker)
- * @src:	new runlist to be inserted
- * @ssize:	number of elements in @src (excluding end marker)
- * @loc:	index in runlist @dst to overwrite with @src
- *
- * Replace the runlist element @dst at @loc with @src. Merge the left and
- * right ends of the inserted runlist, if necessary.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- *
- * On success, return a pointer to the new, combined, runlist. Note, both
- * runlists @dst and @src are deallocated before returning so you cannot use
- * the pointers for anything any more. (Strictly speaking the returned runlist
- * may be the same as @dst but this is irrelevant.)
- *
- * On error, return -errno. Both runlists are left unmodified. The following
- * error codes are defined:
- *	-ENOMEM	- Not enough memory to allocate runlist array.
- *	-EINVAL	- Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_replace(runlist_element *dst,
-		int dsize, runlist_element *src, int ssize, int loc)
-{
-	signed delta;
-	bool left = false;	/* Left end of @src needs merging. */
-	bool right = false;	/* Right end of @src needs merging. */
-	int tail;		/* Start of tail of @dst. */
-	int marker;		/* End of the inserted runs. */
-
-	BUG_ON(!dst);
-	BUG_ON(!src);
-
-	/* First, see if the left and right ends need merging. */
-	if ((loc + 1) < dsize)
-		right = ntfs_are_rl_mergeable(src + ssize - 1, dst + loc + 1);
-	if (loc > 0)
-		left = ntfs_are_rl_mergeable(dst + loc - 1, src);
-	/*
-	 * Allocate some space.  We will need less if the left, right, or both
-	 * ends get merged.  The -1 accounts for the run being replaced.
-	 */
-	delta = ssize - 1 - left - right;
-	if (delta > 0) {
-		dst = ntfs_rl_realloc(dst, dsize, dsize + delta);
-		if (IS_ERR(dst))
-			return dst;
-	}
-	/*
-	 * We are guaranteed to succeed from here so can start modifying the
-	 * original runlists.
-	 */
-
-	/* First, merge the left and right ends, if necessary. */
-	if (right)
-		__ntfs_rl_merge(src + ssize - 1, dst + loc + 1);
-	if (left)
-		__ntfs_rl_merge(dst + loc - 1, src);
-	/*
-	 * Offset of the tail of @dst.  This needs to be moved out of the way
-	 * to make space for the runs to be copied from @src, i.e. the first
-	 * run of the tail of @dst.
-	 * Nominally, @tail equals @loc + 1, i.e. location, skipping the
-	 * replaced run.  However, if @right, then one of @dst's runs is
-	 * already merged into @src.
-	 */
-	tail = loc + right + 1;
-	/*
-	 * First run after the @src runs that have been inserted, i.e. where
-	 * the tail of @dst needs to be moved to.
-	 * Nominally, @marker equals @loc + @ssize, i.e. location + number of
-	 * runs in @src.  However, if @left, then the first run in @src has
-	 * been merged with one in @dst.
-	 */
-	marker = loc + ssize - left;
-
-	/* Move the tail of @dst out of the way, then copy in @src. */
-	ntfs_rl_mm(dst, marker, tail, dsize - tail);
-	ntfs_rl_mc(dst, loc, src, left, ssize - left);
-
-	/* We may have changed the length of the file, so fix the end marker. */
-	if (dsize - tail > 0 && dst[marker].lcn == LCN_ENOENT)
-		dst[marker].vcn = dst[marker - 1].vcn + dst[marker - 1].length;
-	return dst;
-}
-
-/**
- * ntfs_rl_split - insert a runlist into the centre of a hole
- * @dst:	original runlist to be worked on
- * @dsize:	number of elements in @dst (including end marker)
- * @src:	new runlist to be inserted
- * @ssize:	number of elements in @src (excluding end marker)
- * @loc:	index in runlist @dst at which to split and insert @src
- *
- * Split the runlist @dst at @loc into two and insert @new in between the two
- * fragments. No merging of runlists is necessary. Adjust the size of the
- * holes either side.
- *
- * It is up to the caller to serialize access to the runlists @dst and @src.
- *
- * On success, return a pointer to the new, combined, runlist. Note, both
- * runlists @dst and @src are deallocated before returning so you cannot use
- * the pointers for anything any more. (Strictly speaking the returned runlist
- * may be the same as @dst but this is irrelevant.)
- *
- * On error, return -errno. Both runlists are left unmodified. The following
- * error codes are defined:
- *	-ENOMEM	- Not enough memory to allocate runlist array.
- *	-EINVAL	- Invalid parameters were passed in.
- */
-static inline runlist_element *ntfs_rl_split(runlist_element *dst, int dsize,
-		runlist_element *src, int ssize, int loc)
-{
-	BUG_ON(!dst);
-	BUG_ON(!src);
-
-	/* Space required: @dst size + @src size + one new hole. */
-	dst = ntfs_rl_realloc(dst, dsize, dsize + ssize + 1);
-	if (IS_ERR(dst))
-		return dst;
-	/*
-	 * We are guaranteed to succeed from here so can start modifying the
-	 * original runlists.
-	 */
-
-	/* Move the tail of @dst out of the way, then copy in @src. */
-	ntfs_rl_mm(dst, loc + 1 + ssize, loc, dsize - loc);
-	ntfs_rl_mc(dst, loc + 1, src, 0, ssize);
-
-	/* Adjust the size of the holes either size of @src. */
-	dst[loc].length		= dst[loc+1].vcn       - dst[loc].vcn;
-	dst[loc+ssize+1].vcn    = dst[loc+ssize].vcn   + dst[loc+ssize].length;
-	dst[loc+ssize+1].length = dst[loc+ssize+2].vcn - dst[loc+ssize+1].vcn;
-
-	return dst;
-}
-
-/**
- * ntfs_runlists_merge - merge two runlists into one
- * @drl:	original runlist to be worked on
- * @srl:	new runlist to be merged into @drl
- *
- * First we sanity check the two runlists @srl and @drl to make sure that they
- * are sensible and can be merged. The runlist @srl must be either after the
- * runlist @drl or completely within a hole (or unmapped region) in @drl.
- *
- * It is up to the caller to serialize access to the runlists @drl and @srl.
- *
- * Merging of runlists is necessary in two cases:
- *   1. When attribute lists are used and a further extent is being mapped.
- *   2. When new clusters are allocated to fill a hole or extend a file.
- *
- * There are four possible ways @srl can be merged. It can:
- *	- be inserted at the beginning of a hole,
- *	- split the hole in two and be inserted between the two fragments,
- *	- be appended at the end of a hole, or it can
- *	- replace the whole hole.
- * It can also be appended to the end of the runlist, which is just a variant
- * of the insert case.
- *
- * On success, return a pointer to the new, combined, runlist. Note, both
- * runlists @drl and @srl are deallocated before returning so you cannot use
- * the pointers for anything any more. (Strictly speaking the returned runlist
- * may be the same as @dst but this is irrelevant.)
- *
- * On error, return -errno. Both runlists are left unmodified. The following
- * error codes are defined:
- *	-ENOMEM	- Not enough memory to allocate runlist array.
- *	-EINVAL	- Invalid parameters were passed in.
- *	-ERANGE	- The runlists overlap and cannot be merged.
- */
-runlist_element *ntfs_runlists_merge(runlist_element *drl,
-		runlist_element *srl)
-{
-	int di, si;		/* Current index into @[ds]rl. */
-	int sstart;		/* First index with lcn > LCN_RL_NOT_MAPPED. */
-	int dins;		/* Index into @drl at which to insert @srl. */
-	int dend, send;		/* Last index into @[ds]rl. */
-	int dfinal, sfinal;	/* The last index into @[ds]rl with
-				   lcn >= LCN_HOLE. */
-	int marker = 0;
-	VCN marker_vcn = 0;
-
-#ifdef DEBUG
-	ntfs_debug("dst:");
-	ntfs_debug_dump_runlist(drl);
-	ntfs_debug("src:");
-	ntfs_debug_dump_runlist(srl);
-#endif
-
-	/* Check for silly calling... */
-	if (unlikely(!srl))
-		return drl;
-	if (IS_ERR(srl) || IS_ERR(drl))
-		return ERR_PTR(-EINVAL);
-
-	/* Check for the case where the first mapping is being done now. */
-	if (unlikely(!drl)) {
-		drl = srl;
-		/* Complete the source runlist if necessary. */
-		if (unlikely(drl[0].vcn)) {
-			/* Scan to the end of the source runlist. */
-			for (dend = 0; likely(drl[dend].length); dend++)
-				;
-			dend++;
-			drl = ntfs_rl_realloc(drl, dend, dend + 1);
-			if (IS_ERR(drl))
-				return drl;
-			/* Insert start element at the front of the runlist. */
-			ntfs_rl_mm(drl, 1, 0, dend);
-			drl[0].vcn = 0;
-			drl[0].lcn = LCN_RL_NOT_MAPPED;
-			drl[0].length = drl[1].vcn;
-		}
-		goto finished;
-	}
-
-	si = di = 0;
-
-	/* Skip any unmapped start element(s) in the source runlist. */
-	while (srl[si].length && srl[si].lcn < LCN_HOLE)
-		si++;
-
-	/* Can't have an entirely unmapped source runlist. */
-	BUG_ON(!srl[si].length);
-
-	/* Record the starting points. */
-	sstart = si;
-
-	/*
-	 * Skip forward in @drl until we reach the position where @srl needs to
-	 * be inserted. If we reach the end of @drl, @srl just needs to be
-	 * appended to @drl.
-	 */
-	for (; drl[di].length; di++) {
-		if (drl[di].vcn + drl[di].length > srl[sstart].vcn)
-			break;
-	}
-	dins = di;
-
-	/* Sanity check for illegal overlaps. */
-	if ((drl[di].vcn == srl[si].vcn) && (drl[di].lcn >= 0) &&
-			(srl[si].lcn >= 0)) {
-		ntfs_error(NULL, "Run lists overlap. Cannot merge!");
-		return ERR_PTR(-ERANGE);
-	}
-
-	/* Scan to the end of both runlists in order to know their sizes. */
-	for (send = si; srl[send].length; send++)
-		;
-	for (dend = di; drl[dend].length; dend++)
-		;
-
-	if (srl[send].lcn == LCN_ENOENT)
-		marker_vcn = srl[marker = send].vcn;
-
-	/* Scan to the last element with lcn >= LCN_HOLE. */
-	for (sfinal = send; sfinal >= 0 && srl[sfinal].lcn < LCN_HOLE; sfinal--)
-		;
-	for (dfinal = dend; dfinal >= 0 && drl[dfinal].lcn < LCN_HOLE; dfinal--)
-		;
-
-	{
-	bool start;
-	bool finish;
-	int ds = dend + 1;		/* Number of elements in drl & srl */
-	int ss = sfinal - sstart + 1;
-
-	start  = ((drl[dins].lcn <  LCN_RL_NOT_MAPPED) ||    /* End of file   */
-		  (drl[dins].vcn == srl[sstart].vcn));	     /* Start of hole */
-	finish = ((drl[dins].lcn >= LCN_RL_NOT_MAPPED) &&    /* End of file   */
-		 ((drl[dins].vcn + drl[dins].length) <=      /* End of hole   */
-		  (srl[send - 1].vcn + srl[send - 1].length)));
-
-	/* Or we will lose an end marker. */
-	if (finish && !drl[dins].length)
-		ss++;
-	if (marker && (drl[dins].vcn + drl[dins].length > srl[send - 1].vcn))
-		finish = false;
-#if 0
-	ntfs_debug("dfinal = %i, dend = %i", dfinal, dend);
-	ntfs_debug("sstart = %i, sfinal = %i, send = %i", sstart, sfinal, send);
-	ntfs_debug("start = %i, finish = %i", start, finish);
-	ntfs_debug("ds = %i, ss = %i, dins = %i", ds, ss, dins);
-#endif
-	if (start) {
-		if (finish)
-			drl = ntfs_rl_replace(drl, ds, srl + sstart, ss, dins);
-		else
-			drl = ntfs_rl_insert(drl, ds, srl + sstart, ss, dins);
-	} else {
-		if (finish)
-			drl = ntfs_rl_append(drl, ds, srl + sstart, ss, dins);
-		else
-			drl = ntfs_rl_split(drl, ds, srl + sstart, ss, dins);
-	}
-	if (IS_ERR(drl)) {
-		ntfs_error(NULL, "Merge failed.");
-		return drl;
-	}
-	ntfs_free(srl);
-	if (marker) {
-		ntfs_debug("Triggering marker code.");
-		for (ds = dend; drl[ds].length; ds++)
-			;
-		/* We only need to care if @srl ended after @drl. */
-		if (drl[ds].vcn <= marker_vcn) {
-			int slots = 0;
-
-			if (drl[ds].vcn == marker_vcn) {
-				ntfs_debug("Old marker = 0x%llx, replacing "
-						"with LCN_ENOENT.",
-						(unsigned long long)
-						drl[ds].lcn);
-				drl[ds].lcn = LCN_ENOENT;
-				goto finished;
-			}
-			/*
-			 * We need to create an unmapped runlist element in
-			 * @drl or extend an existing one before adding the
-			 * ENOENT terminator.
-			 */
-			if (drl[ds].lcn == LCN_ENOENT) {
-				ds--;
-				slots = 1;
-			}
-			if (drl[ds].lcn != LCN_RL_NOT_MAPPED) {
-				/* Add an unmapped runlist element. */
-				if (!slots) {
-					drl = ntfs_rl_realloc_nofail(drl, ds,
-							ds + 2);
-					slots = 2;
-				}
-				ds++;
-				/* Need to set vcn if it isn't set already. */
-				if (slots != 1)
-					drl[ds].vcn = drl[ds - 1].vcn +
-							drl[ds - 1].length;
-				drl[ds].lcn = LCN_RL_NOT_MAPPED;
-				/* We now used up a slot. */
-				slots--;
-			}
-			drl[ds].length = marker_vcn - drl[ds].vcn;
-			/* Finally add the ENOENT terminator. */
-			ds++;
-			if (!slots)
-				drl = ntfs_rl_realloc_nofail(drl, ds, ds + 1);
-			drl[ds].vcn = marker_vcn;
-			drl[ds].lcn = LCN_ENOENT;
-			drl[ds].length = (s64)0;
-		}
-	}
-	}
-
-finished:
-	/* The merge was completed successfully. */
-	ntfs_debug("Merged runlist:");
-	ntfs_debug_dump_runlist(drl);
-	return drl;
-}
-
-/**
- * ntfs_mapping_pairs_decompress - convert mapping pairs array to runlist
- * @vol:	ntfs volume on which the attribute resides
- * @attr:	attribute record whose mapping pairs array to decompress
- * @old_rl:	optional runlist in which to insert @attr's runlist
- *
- * It is up to the caller to serialize access to the runlist @old_rl.
- *
- * Decompress the attribute @attr's mapping pairs array into a runlist. On
- * success, return the decompressed runlist.
- *
- * If @old_rl is not NULL, decompressed runlist is inserted into the
- * appropriate place in @old_rl and the resultant, combined runlist is
- * returned. The original @old_rl is deallocated.
- *
- * On error, return -errno. @old_rl is left unmodified in that case.
- *
- * The following error codes are defined:
- *	-ENOMEM	- Not enough memory to allocate runlist array.
- *	-EIO	- Corrupt runlist.
- *	-EINVAL	- Invalid parameters were passed in.
- *	-ERANGE	- The two runlists overlap.
- *
- * FIXME: For now we take the conceptionally simplest approach of creating the
- * new runlist disregarding the already existing one and then splicing the
- * two into one, if that is possible (we check for overlap and discard the new
- * runlist if overlap present before returning ERR_PTR(-ERANGE)).
- */
-runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol,
-		const ATTR_RECORD *attr, runlist_element *old_rl)
-{
-	VCN vcn;		/* Current vcn. */
-	LCN lcn;		/* Current lcn. */
-	s64 deltaxcn;		/* Change in [vl]cn. */
-	runlist_element *rl;	/* The output runlist. */
-	u8 *buf;		/* Current position in mapping pairs array. */
-	u8 *attr_end;		/* End of attribute. */
-	int rlsize;		/* Size of runlist buffer. */
-	u16 rlpos;		/* Current runlist position in units of
-				   runlist_elements. */
-	u8 b;			/* Current byte offset in buf. */
-
-#ifdef DEBUG
-	/* Make sure attr exists and is non-resident. */
-	if (!attr || !attr->non_resident || sle64_to_cpu(
-			attr->data.non_resident.lowest_vcn) < (VCN)0) {
-		ntfs_error(vol->sb, "Invalid arguments.");
-		return ERR_PTR(-EINVAL);
-	}
-#endif
-	/* Start at vcn = lowest_vcn and lcn 0. */
-	vcn = sle64_to_cpu(attr->data.non_resident.lowest_vcn);
-	lcn = 0;
-	/* Get start of the mapping pairs array. */
-	buf = (u8*)attr + le16_to_cpu(
-			attr->data.non_resident.mapping_pairs_offset);
-	attr_end = (u8*)attr + le32_to_cpu(attr->length);
-	if (unlikely(buf < (u8*)attr || buf > attr_end)) {
-		ntfs_error(vol->sb, "Corrupt attribute.");
-		return ERR_PTR(-EIO);
-	}
-	/* If the mapping pairs array is valid but empty, nothing to do. */
-	if (!vcn && !*buf)
-		return old_rl;
-	/* Current position in runlist array. */
-	rlpos = 0;
-	/* Allocate first page and set current runlist size to one page. */
-	rl = ntfs_malloc_nofs(rlsize = PAGE_SIZE);
-	if (unlikely(!rl))
-		return ERR_PTR(-ENOMEM);
-	/* Insert unmapped starting element if necessary. */
-	if (vcn) {
-		rl->vcn = 0;
-		rl->lcn = LCN_RL_NOT_MAPPED;
-		rl->length = vcn;
-		rlpos++;
-	}
-	while (buf < attr_end && *buf) {
-		/*
-		 * Allocate more memory if needed, including space for the
-		 * not-mapped and terminator elements. ntfs_malloc_nofs()
-		 * operates on whole pages only.
-		 */
-		if (((rlpos + 3) * sizeof(*old_rl)) > rlsize) {
-			runlist_element *rl2;
-
-			rl2 = ntfs_malloc_nofs(rlsize + (int)PAGE_SIZE);
-			if (unlikely(!rl2)) {
-				ntfs_free(rl);
-				return ERR_PTR(-ENOMEM);
-			}
-			memcpy(rl2, rl, rlsize);
-			ntfs_free(rl);
-			rl = rl2;
-			rlsize += PAGE_SIZE;
-		}
-		/* Enter the current vcn into the current runlist element. */
-		rl[rlpos].vcn = vcn;
-		/*
-		 * Get the change in vcn, i.e. the run length in clusters.
-		 * Doing it this way ensures that we signextend negative values.
-		 * A negative run length doesn't make any sense, but hey, I
-		 * didn't make up the NTFS specs and Windows NT4 treats the run
-		 * length as a signed value so that's how it is...
-		 */
-		b = *buf & 0xf;
-		if (b) {
-			if (unlikely(buf + b > attr_end))
-				goto io_error;
-			for (deltaxcn = (s8)buf[b--]; b; b--)
-				deltaxcn = (deltaxcn << 8) + buf[b];
-		} else { /* The length entry is compulsory. */
-			ntfs_error(vol->sb, "Missing length entry in mapping "
-					"pairs array.");
-			deltaxcn = (s64)-1;
-		}
-		/*
-		 * Assume a negative length to indicate data corruption and
-		 * hence clean-up and return NULL.
-		 */
-		if (unlikely(deltaxcn < 0)) {
-			ntfs_error(vol->sb, "Invalid length in mapping pairs "
-					"array.");
-			goto err_out;
-		}
-		/*
-		 * Enter the current run length into the current runlist
-		 * element.
-		 */
-		rl[rlpos].length = deltaxcn;
-		/* Increment the current vcn by the current run length. */
-		vcn += deltaxcn;
-		/*
-		 * There might be no lcn change at all, as is the case for
-		 * sparse clusters on NTFS 3.0+, in which case we set the lcn
-		 * to LCN_HOLE.
-		 */
-		if (!(*buf & 0xf0))
-			rl[rlpos].lcn = LCN_HOLE;
-		else {
-			/* Get the lcn change which really can be negative. */
-			u8 b2 = *buf & 0xf;
-			b = b2 + ((*buf >> 4) & 0xf);
-			if (buf + b > attr_end)
-				goto io_error;
-			for (deltaxcn = (s8)buf[b--]; b > b2; b--)
-				deltaxcn = (deltaxcn << 8) + buf[b];
-			/* Change the current lcn to its new value. */
-			lcn += deltaxcn;
-#ifdef DEBUG
-			/*
-			 * On NTFS 1.2-, apparently can have lcn == -1 to
-			 * indicate a hole. But we haven't verified ourselves
-			 * whether it is really the lcn or the deltaxcn that is
-			 * -1. So if either is found give us a message so we
-			 * can investigate it further!
-			 */
-			if (vol->major_ver < 3) {
-				if (unlikely(deltaxcn == (LCN)-1))
-					ntfs_error(vol->sb, "lcn delta == -1");
-				if (unlikely(lcn == (LCN)-1))
-					ntfs_error(vol->sb, "lcn == -1");
-			}
-#endif
-			/* Check lcn is not below -1. */
-			if (unlikely(lcn < (LCN)-1)) {
-				ntfs_error(vol->sb, "Invalid LCN < -1 in "
-						"mapping pairs array.");
-				goto err_out;
-			}
-			/* Enter the current lcn into the runlist element. */
-			rl[rlpos].lcn = lcn;
-		}
-		/* Get to the next runlist element. */
-		rlpos++;
-		/* Increment the buffer position to the next mapping pair. */
-		buf += (*buf & 0xf) + ((*buf >> 4) & 0xf) + 1;
-	}
-	if (unlikely(buf >= attr_end))
-		goto io_error;
-	/*
-	 * If there is a highest_vcn specified, it must be equal to the final
-	 * vcn in the runlist - 1, or something has gone badly wrong.
-	 */
-	deltaxcn = sle64_to_cpu(attr->data.non_resident.highest_vcn);
-	if (unlikely(deltaxcn && vcn - 1 != deltaxcn)) {
-mpa_err:
-		ntfs_error(vol->sb, "Corrupt mapping pairs array in "
-				"non-resident attribute.");
-		goto err_out;
-	}
-	/* Setup not mapped runlist element if this is the base extent. */
-	if (!attr->data.non_resident.lowest_vcn) {
-		VCN max_cluster;
-
-		max_cluster = ((sle64_to_cpu(
-				attr->data.non_resident.allocated_size) +
-				vol->cluster_size - 1) >>
-				vol->cluster_size_bits) - 1;
-		/*
-		 * A highest_vcn of zero means this is a single extent
-		 * attribute so simply terminate the runlist with LCN_ENOENT).
-		 */
-		if (deltaxcn) {
-			/*
-			 * If there is a difference between the highest_vcn and
-			 * the highest cluster, the runlist is either corrupt
-			 * or, more likely, there are more extents following
-			 * this one.
-			 */
-			if (deltaxcn < max_cluster) {
-				ntfs_debug("More extents to follow; deltaxcn "
-						"= 0x%llx, max_cluster = "
-						"0x%llx",
-						(unsigned long long)deltaxcn,
-						(unsigned long long)
-						max_cluster);
-				rl[rlpos].vcn = vcn;
-				vcn += rl[rlpos].length = max_cluster -
-						deltaxcn;
-				rl[rlpos].lcn = LCN_RL_NOT_MAPPED;
-				rlpos++;
-			} else if (unlikely(deltaxcn > max_cluster)) {
-				ntfs_error(vol->sb, "Corrupt attribute.  "
-						"deltaxcn = 0x%llx, "
-						"max_cluster = 0x%llx",
-						(unsigned long long)deltaxcn,
-						(unsigned long long)
-						max_cluster);
-				goto mpa_err;
-			}
-		}
-		rl[rlpos].lcn = LCN_ENOENT;
-	} else /* Not the base extent. There may be more extents to follow. */
-		rl[rlpos].lcn = LCN_RL_NOT_MAPPED;
-
-	/* Setup terminating runlist element. */
-	rl[rlpos].vcn = vcn;
-	rl[rlpos].length = (s64)0;
-	/* If no existing runlist was specified, we are done. */
-	if (!old_rl) {
-		ntfs_debug("Mapping pairs array successfully decompressed:");
-		ntfs_debug_dump_runlist(rl);
-		return rl;
-	}
-	/* Now combine the new and old runlists checking for overlaps. */
-	old_rl = ntfs_runlists_merge(old_rl, rl);
-	if (!IS_ERR(old_rl))
-		return old_rl;
-	ntfs_free(rl);
-	ntfs_error(vol->sb, "Failed to merge runlists.");
-	return old_rl;
-io_error:
-	ntfs_error(vol->sb, "Corrupt attribute.");
-err_out:
-	ntfs_free(rl);
-	return ERR_PTR(-EIO);
-}
-
-/**
- * ntfs_rl_vcn_to_lcn - convert a vcn into a lcn given a runlist
- * @rl:		runlist to use for conversion
- * @vcn:	vcn to convert
- *
- * Convert the virtual cluster number @vcn of an attribute into a logical
- * cluster number (lcn) of a device using the runlist @rl to map vcns to their
- * corresponding lcns.
- *
- * It is up to the caller to serialize access to the runlist @rl.
- *
- * Since lcns must be >= 0, we use negative return codes with special meaning:
- *
- * Return code		Meaning / Description
- * ==================================================
- *  LCN_HOLE		Hole / not allocated on disk.
- *  LCN_RL_NOT_MAPPED	This is part of the runlist which has not been
- *			inserted into the runlist yet.
- *  LCN_ENOENT		There is no such vcn in the attribute.
- *
- * Locking: - The caller must have locked the runlist (for reading or writing).
- *	    - This function does not touch the lock, nor does it modify the
- *	      runlist.
- */
-LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn)
-{
-	int i;
-
-	BUG_ON(vcn < 0);
-	/*
-	 * If rl is NULL, assume that we have found an unmapped runlist. The
-	 * caller can then attempt to map it and fail appropriately if
-	 * necessary.
-	 */
-	if (unlikely(!rl))
-		return LCN_RL_NOT_MAPPED;
-
-	/* Catch out of lower bounds vcn. */
-	if (unlikely(vcn < rl[0].vcn))
-		return LCN_ENOENT;
-
-	for (i = 0; likely(rl[i].length); i++) {
-		if (unlikely(vcn < rl[i+1].vcn)) {
-			if (likely(rl[i].lcn >= (LCN)0))
-				return rl[i].lcn + (vcn - rl[i].vcn);
-			return rl[i].lcn;
-		}
-	}
-	/*
-	 * The terminator element is setup to the correct value, i.e. one of
-	 * LCN_HOLE, LCN_RL_NOT_MAPPED, or LCN_ENOENT.
-	 */
-	if (likely(rl[i].lcn < (LCN)0))
-		return rl[i].lcn;
-	/* Just in case... We could replace this with BUG() some day. */
-	return LCN_ENOENT;
-}
-
-#ifdef NTFS_RW
-
-/**
- * ntfs_rl_find_vcn_nolock - find a vcn in a runlist
- * @rl:		runlist to search
- * @vcn:	vcn to find
- *
- * Find the virtual cluster number @vcn in the runlist @rl and return the
- * address of the runlist element containing the @vcn on success.
- *
- * Return NULL if @rl is NULL or @vcn is in an unmapped part/out of bounds of
- * the runlist.
- *
- * Locking: The runlist must be locked on entry.
- */
-runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl, const VCN vcn)
-{
-	BUG_ON(vcn < 0);
-	if (unlikely(!rl || vcn < rl[0].vcn))
-		return NULL;
-	while (likely(rl->length)) {
-		if (unlikely(vcn < rl[1].vcn)) {
-			if (likely(rl->lcn >= LCN_HOLE))
-				return rl;
-			return NULL;
-		}
-		rl++;
-	}
-	if (likely(rl->lcn == LCN_ENOENT))
-		return rl;
-	return NULL;
-}
-
-/**
- * ntfs_get_nr_significant_bytes - get number of bytes needed to store a number
- * @n:		number for which to get the number of bytes for
- *
- * Return the number of bytes required to store @n unambiguously as
- * a signed number.
- *
- * This is used in the context of the mapping pairs array to determine how
- * many bytes will be needed in the array to store a given logical cluster
- * number (lcn) or a specific run length.
- *
- * Return the number of bytes written.  This function cannot fail.
- */
-static inline int ntfs_get_nr_significant_bytes(const s64 n)
-{
-	s64 l = n;
-	int i;
-	s8 j;
-
-	i = 0;
-	do {
-		l >>= 8;
-		i++;
-	} while (l != 0 && l != -1);
-	j = (n >> 8 * (i - 1)) & 0xff;
-	/* If the sign bit is wrong, we need an extra byte. */
-	if ((n < 0 && j >= 0) || (n > 0 && j < 0))
-		i++;
-	return i;
-}
-
-/**
- * ntfs_get_size_for_mapping_pairs - get bytes needed for mapping pairs array
- * @vol:	ntfs volume (needed for the ntfs version)
- * @rl:		locked runlist to determine the size of the mapping pairs of
- * @first_vcn:	first vcn which to include in the mapping pairs array
- * @last_vcn:	last vcn which to include in the mapping pairs array
- *
- * Walk the locked runlist @rl and calculate the size in bytes of the mapping
- * pairs array corresponding to the runlist @rl, starting at vcn @first_vcn and
- * finishing with vcn @last_vcn.
- *
- * A @last_vcn of -1 means end of runlist and in that case the size of the
- * mapping pairs array corresponding to the runlist starting at vcn @first_vcn
- * and finishing at the end of the runlist is determined.
- *
- * This for example allows us to allocate a buffer of the right size when
- * building the mapping pairs array.
- *
- * If @rl is NULL, just return 1 (for the single terminator byte).
- *
- * Return the calculated size in bytes on success.  On error, return -errno.
- * The following error codes are defined:
- *	-EINVAL	- Run list contains unmapped elements.  Make sure to only pass
- *		  fully mapped runlists to this function.
- *	-EIO	- The runlist is corrupt.
- *
- * Locking: @rl must be locked on entry (either for reading or writing), it
- *	    remains locked throughout, and is left locked upon return.
- */
-int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
-		const runlist_element *rl, const VCN first_vcn,
-		const VCN last_vcn)
-{
-	LCN prev_lcn;
-	int rls;
-	bool the_end = false;
-
-	BUG_ON(first_vcn < 0);
-	BUG_ON(last_vcn < -1);
-	BUG_ON(last_vcn >= 0 && first_vcn > last_vcn);
-	if (!rl) {
-		BUG_ON(first_vcn);
-		BUG_ON(last_vcn > 0);
-		return 1;
-	}
-	/* Skip to runlist element containing @first_vcn. */
-	while (rl->length && first_vcn >= rl[1].vcn)
-		rl++;
-	if (unlikely((!rl->length && first_vcn > rl->vcn) ||
-			first_vcn < rl->vcn))
-		return -EINVAL;
-	prev_lcn = 0;
-	/* Always need the termining zero byte. */
-	rls = 1;
-	/* Do the first partial run if present. */
-	if (first_vcn > rl->vcn) {
-		s64 delta, length = rl->length;
-
-		/* We know rl->length != 0 already. */
-		if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
-			goto err_out;
-		/*
-		 * If @stop_vcn is given and finishes inside this run, cap the
-		 * run length.
-		 */
-		if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
-			s64 s1 = last_vcn + 1;
-			if (unlikely(rl[1].vcn > s1))
-				length = s1 - rl->vcn;
-			the_end = true;
-		}
-		delta = first_vcn - rl->vcn;
-		/* Header byte + length. */
-		rls += 1 + ntfs_get_nr_significant_bytes(length - delta);
-		/*
-		 * If the logical cluster number (lcn) denotes a hole and we
-		 * are on NTFS 3.0+, we don't store it at all, i.e. we need
-		 * zero space.  On earlier NTFS versions we just store the lcn.
-		 * Note: this assumes that on NTFS 1.2-, holes are stored with
-		 * an lcn of -1 and not a delta_lcn of -1 (unless both are -1).
-		 */
-		if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
-			prev_lcn = rl->lcn;
-			if (likely(rl->lcn >= 0))
-				prev_lcn += delta;
-			/* Change in lcn. */
-			rls += ntfs_get_nr_significant_bytes(prev_lcn);
-		}
-		/* Go to next runlist element. */
-		rl++;
-	}
-	/* Do the full runs. */
-	for (; rl->length && !the_end; rl++) {
-		s64 length = rl->length;
-
-		if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
-			goto err_out;
-		/*
-		 * If @stop_vcn is given and finishes inside this run, cap the
-		 * run length.
-		 */
-		if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
-			s64 s1 = last_vcn + 1;
-			if (unlikely(rl[1].vcn > s1))
-				length = s1 - rl->vcn;
-			the_end = true;
-		}
-		/* Header byte + length. */
-		rls += 1 + ntfs_get_nr_significant_bytes(length);
-		/*
-		 * If the logical cluster number (lcn) denotes a hole and we
-		 * are on NTFS 3.0+, we don't store it at all, i.e. we need
-		 * zero space.  On earlier NTFS versions we just store the lcn.
-		 * Note: this assumes that on NTFS 1.2-, holes are stored with
-		 * an lcn of -1 and not a delta_lcn of -1 (unless both are -1).
-		 */
-		if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
-			/* Change in lcn. */
-			rls += ntfs_get_nr_significant_bytes(rl->lcn -
-					prev_lcn);
-			prev_lcn = rl->lcn;
-		}
-	}
-	return rls;
-err_out:
-	if (rl->lcn == LCN_RL_NOT_MAPPED)
-		rls = -EINVAL;
-	else
-		rls = -EIO;
-	return rls;
-}
-
-/**
- * ntfs_write_significant_bytes - write the significant bytes of a number
- * @dst:	destination buffer to write to
- * @dst_max:	pointer to last byte of destination buffer for bounds checking
- * @n:		number whose significant bytes to write
- *
- * Store in @dst, the minimum bytes of the number @n which are required to
- * identify @n unambiguously as a signed number, taking care not to exceed
- * @dest_max, the maximum position within @dst to which we are allowed to
- * write.
- *
- * This is used when building the mapping pairs array of a runlist to compress
- * a given logical cluster number (lcn) or a specific run length to the minimum
- * size possible.
- *
- * Return the number of bytes written on success.  On error, i.e. the
- * destination buffer @dst is too small, return -ENOSPC.
- */
-static inline int ntfs_write_significant_bytes(s8 *dst, const s8 *dst_max,
-		const s64 n)
-{
-	s64 l = n;
-	int i;
-	s8 j;
-
-	i = 0;
-	do {
-		if (unlikely(dst > dst_max))
-			goto err_out;
-		*dst++ = l & 0xffll;
-		l >>= 8;
-		i++;
-	} while (l != 0 && l != -1);
-	j = (n >> 8 * (i - 1)) & 0xff;
-	/* If the sign bit is wrong, we need an extra byte. */
-	if (n < 0 && j >= 0) {
-		if (unlikely(dst > dst_max))
-			goto err_out;
-		i++;
-		*dst = (s8)-1;
-	} else if (n > 0 && j < 0) {
-		if (unlikely(dst > dst_max))
-			goto err_out;
-		i++;
-		*dst = (s8)0;
-	}
-	return i;
-err_out:
-	return -ENOSPC;
-}
-
-/**
- * ntfs_mapping_pairs_build - build the mapping pairs array from a runlist
- * @vol:	ntfs volume (needed for the ntfs version)
- * @dst:	destination buffer to which to write the mapping pairs array
- * @dst_len:	size of destination buffer @dst in bytes
- * @rl:		locked runlist for which to build the mapping pairs array
- * @first_vcn:	first vcn which to include in the mapping pairs array
- * @last_vcn:	last vcn which to include in the mapping pairs array
- * @stop_vcn:	first vcn outside destination buffer on success or -ENOSPC
- *
- * Create the mapping pairs array from the locked runlist @rl, starting at vcn
- * @first_vcn and finishing with vcn @last_vcn and save the array in @dst.
- * @dst_len is the size of @dst in bytes and it should be at least equal to the
- * value obtained by calling ntfs_get_size_for_mapping_pairs().
- *
- * A @last_vcn of -1 means end of runlist and in that case the mapping pairs
- * array corresponding to the runlist starting at vcn @first_vcn and finishing
- * at the end of the runlist is created.
- *
- * If @rl is NULL, just write a single terminator byte to @dst.
- *
- * On success or -ENOSPC error, if @stop_vcn is not NULL, *@stop_vcn is set to
- * the first vcn outside the destination buffer.  Note that on error, @dst has
- * been filled with all the mapping pairs that will fit, thus it can be treated
- * as partial success, in that a new attribute extent needs to be created or
- * the next extent has to be used and the mapping pairs build has to be
- * continued with @first_vcn set to *@stop_vcn.
- *
- * Return 0 on success and -errno on error.  The following error codes are
- * defined:
- *	-EINVAL	- Run list contains unmapped elements.  Make sure to only pass
- *		  fully mapped runlists to this function.
- *	-EIO	- The runlist is corrupt.
- *	-ENOSPC	- The destination buffer is too small.
- *
- * Locking: @rl must be locked on entry (either for reading or writing), it
- *	    remains locked throughout, and is left locked upon return.
- */
-int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
-		const int dst_len, const runlist_element *rl,
-		const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn)
-{
-	LCN prev_lcn;
-	s8 *dst_max, *dst_next;
-	int err = -ENOSPC;
-	bool the_end = false;
-	s8 len_len, lcn_len;
-
-	BUG_ON(first_vcn < 0);
-	BUG_ON(last_vcn < -1);
-	BUG_ON(last_vcn >= 0 && first_vcn > last_vcn);
-	BUG_ON(dst_len < 1);
-	if (!rl) {
-		BUG_ON(first_vcn);
-		BUG_ON(last_vcn > 0);
-		if (stop_vcn)
-			*stop_vcn = 0;
-		/* Terminator byte. */
-		*dst = 0;
-		return 0;
-	}
-	/* Skip to runlist element containing @first_vcn. */
-	while (rl->length && first_vcn >= rl[1].vcn)
-		rl++;
-	if (unlikely((!rl->length && first_vcn > rl->vcn) ||
-			first_vcn < rl->vcn))
-		return -EINVAL;
-	/*
-	 * @dst_max is used for bounds checking in
-	 * ntfs_write_significant_bytes().
-	 */
-	dst_max = dst + dst_len - 1;
-	prev_lcn = 0;
-	/* Do the first partial run if present. */
-	if (first_vcn > rl->vcn) {
-		s64 delta, length = rl->length;
-
-		/* We know rl->length != 0 already. */
-		if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
-			goto err_out;
-		/*
-		 * If @stop_vcn is given and finishes inside this run, cap the
-		 * run length.
-		 */
-		if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
-			s64 s1 = last_vcn + 1;
-			if (unlikely(rl[1].vcn > s1))
-				length = s1 - rl->vcn;
-			the_end = true;
-		}
-		delta = first_vcn - rl->vcn;
-		/* Write length. */
-		len_len = ntfs_write_significant_bytes(dst + 1, dst_max,
-				length - delta);
-		if (unlikely(len_len < 0))
-			goto size_err;
-		/*
-		 * If the logical cluster number (lcn) denotes a hole and we
-		 * are on NTFS 3.0+, we don't store it at all, i.e. we need
-		 * zero space.  On earlier NTFS versions we just write the lcn
-		 * change.  FIXME: Do we need to write the lcn change or just
-		 * the lcn in that case?  Not sure as I have never seen this
-		 * case on NT4. - We assume that we just need to write the lcn
-		 * change until someone tells us otherwise... (AIA)
-		 */
-		if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
-			prev_lcn = rl->lcn;
-			if (likely(rl->lcn >= 0))
-				prev_lcn += delta;
-			/* Write change in lcn. */
-			lcn_len = ntfs_write_significant_bytes(dst + 1 +
-					len_len, dst_max, prev_lcn);
-			if (unlikely(lcn_len < 0))
-				goto size_err;
-		} else
-			lcn_len = 0;
-		dst_next = dst + len_len + lcn_len + 1;
-		if (unlikely(dst_next > dst_max))
-			goto size_err;
-		/* Update header byte. */
-		*dst = lcn_len << 4 | len_len;
-		/* Position at next mapping pairs array element. */
-		dst = dst_next;
-		/* Go to next runlist element. */
-		rl++;
-	}
-	/* Do the full runs. */
-	for (; rl->length && !the_end; rl++) {
-		s64 length = rl->length;
-
-		if (unlikely(length < 0 || rl->lcn < LCN_HOLE))
-			goto err_out;
-		/*
-		 * If @stop_vcn is given and finishes inside this run, cap the
-		 * run length.
-		 */
-		if (unlikely(last_vcn >= 0 && rl[1].vcn > last_vcn)) {
-			s64 s1 = last_vcn + 1;
-			if (unlikely(rl[1].vcn > s1))
-				length = s1 - rl->vcn;
-			the_end = true;
-		}
-		/* Write length. */
-		len_len = ntfs_write_significant_bytes(dst + 1, dst_max,
-				length);
-		if (unlikely(len_len < 0))
-			goto size_err;
-		/*
-		 * If the logical cluster number (lcn) denotes a hole and we
-		 * are on NTFS 3.0+, we don't store it at all, i.e. we need
-		 * zero space.  On earlier NTFS versions we just write the lcn
-		 * change.  FIXME: Do we need to write the lcn change or just
-		 * the lcn in that case?  Not sure as I have never seen this
-		 * case on NT4. - We assume that we just need to write the lcn
-		 * change until someone tells us otherwise... (AIA)
-		 */
-		if (likely(rl->lcn >= 0 || vol->major_ver < 3)) {
-			/* Write change in lcn. */
-			lcn_len = ntfs_write_significant_bytes(dst + 1 +
-					len_len, dst_max, rl->lcn - prev_lcn);
-			if (unlikely(lcn_len < 0))
-				goto size_err;
-			prev_lcn = rl->lcn;
-		} else
-			lcn_len = 0;
-		dst_next = dst + len_len + lcn_len + 1;
-		if (unlikely(dst_next > dst_max))
-			goto size_err;
-		/* Update header byte. */
-		*dst = lcn_len << 4 | len_len;
-		/* Position at next mapping pairs array element. */
-		dst = dst_next;
-	}
-	/* Success. */
-	err = 0;
-size_err:
-	/* Set stop vcn. */
-	if (stop_vcn)
-		*stop_vcn = rl->vcn;
-	/* Add terminator byte. */
-	*dst = 0;
-	return err;
-err_out:
-	if (rl->lcn == LCN_RL_NOT_MAPPED)
-		err = -EINVAL;
-	else
-		err = -EIO;
-	return err;
-}
-
-/**
- * ntfs_rl_truncate_nolock - truncate a runlist starting at a specified vcn
- * @vol:	ntfs volume (needed for error output)
- * @runlist:	runlist to truncate
- * @new_length:	the new length of the runlist in VCNs
- *
- * Truncate the runlist described by @runlist as well as the memory buffer
- * holding the runlist elements to a length of @new_length VCNs.
- *
- * If @new_length lies within the runlist, the runlist elements with VCNs of
- * @new_length and above are discarded.  As a special case if @new_length is
- * zero, the runlist is discarded and set to NULL.
- *
- * If @new_length lies beyond the runlist, a sparse runlist element is added to
- * the end of the runlist @runlist or if the last runlist element is a sparse
- * one already, this is extended.
- *
- * Note, no checking is done for unmapped runlist elements.  It is assumed that
- * the caller has mapped any elements that need to be mapped already.
- *
- * Return 0 on success and -errno on error.
- *
- * Locking: The caller must hold @runlist->lock for writing.
- */
-int ntfs_rl_truncate_nolock(const ntfs_volume *vol, runlist *const runlist,
-		const s64 new_length)
-{
-	runlist_element *rl;
-	int old_size;
-
-	ntfs_debug("Entering for new_length 0x%llx.", (long long)new_length);
-	BUG_ON(!runlist);
-	BUG_ON(new_length < 0);
-	rl = runlist->rl;
-	if (!new_length) {
-		ntfs_debug("Freeing runlist.");
-		runlist->rl = NULL;
-		if (rl)
-			ntfs_free(rl);
-		return 0;
-	}
-	if (unlikely(!rl)) {
-		/*
-		 * Create a runlist consisting of a sparse runlist element of
-		 * length @new_length followed by a terminator runlist element.
-		 */
-		rl = ntfs_malloc_nofs(PAGE_SIZE);
-		if (unlikely(!rl)) {
-			ntfs_error(vol->sb, "Not enough memory to allocate "
-					"runlist element buffer.");
-			return -ENOMEM;
-		}
-		runlist->rl = rl;
-		rl[1].length = rl->vcn = 0;
-		rl->lcn = LCN_HOLE;
-		rl[1].vcn = rl->length = new_length;
-		rl[1].lcn = LCN_ENOENT;
-		return 0;
-	}
-	BUG_ON(new_length < rl->vcn);
-	/* Find @new_length in the runlist. */
-	while (likely(rl->length && new_length >= rl[1].vcn))
-		rl++;
-	/*
-	 * If not at the end of the runlist we need to shrink it.
-	 * If at the end of the runlist we need to expand it.
-	 */
-	if (rl->length) {
-		runlist_element *trl;
-		bool is_end;
-
-		ntfs_debug("Shrinking runlist.");
-		/* Determine the runlist size. */
-		trl = rl + 1;
-		while (likely(trl->length))
-			trl++;
-		old_size = trl - runlist->rl + 1;
-		/* Truncate the run. */
-		rl->length = new_length - rl->vcn;
-		/*
-		 * If a run was partially truncated, make the following runlist
-		 * element a terminator.
-		 */
-		is_end = false;
-		if (rl->length) {
-			rl++;
-			if (!rl->length)
-				is_end = true;
-			rl->vcn = new_length;
-			rl->length = 0;
-		}
-		rl->lcn = LCN_ENOENT;
-		/* Reallocate memory if necessary. */
-		if (!is_end) {
-			int new_size = rl - runlist->rl + 1;
-			rl = ntfs_rl_realloc(runlist->rl, old_size, new_size);
-			if (IS_ERR(rl))
-				ntfs_warning(vol->sb, "Failed to shrink "
-						"runlist buffer.  This just "
-						"wastes a bit of memory "
-						"temporarily so we ignore it "
-						"and return success.");
-			else
-				runlist->rl = rl;
-		}
-	} else if (likely(/* !rl->length && */ new_length > rl->vcn)) {
-		ntfs_debug("Expanding runlist.");
-		/*
-		 * If there is a previous runlist element and it is a sparse
-		 * one, extend it.  Otherwise need to add a new, sparse runlist
-		 * element.
-		 */
-		if ((rl > runlist->rl) && ((rl - 1)->lcn == LCN_HOLE))
-			(rl - 1)->length = new_length - (rl - 1)->vcn;
-		else {
-			/* Determine the runlist size. */
-			old_size = rl - runlist->rl + 1;
-			/* Reallocate memory if necessary. */
-			rl = ntfs_rl_realloc(runlist->rl, old_size,
-					old_size + 1);
-			if (IS_ERR(rl)) {
-				ntfs_error(vol->sb, "Failed to expand runlist "
-						"buffer, aborting.");
-				return PTR_ERR(rl);
-			}
-			runlist->rl = rl;
-			/*
-			 * Set @rl to the same runlist element in the new
-			 * runlist as before in the old runlist.
-			 */
-			rl += old_size - 1;
-			/* Add a new, sparse runlist element. */
-			rl->lcn = LCN_HOLE;
-			rl->length = new_length - rl->vcn;
-			/* Add a new terminator runlist element. */
-			rl++;
-			rl->length = 0;
-		}
-		rl->vcn = new_length;
-		rl->lcn = LCN_ENOENT;
-	} else /* if (unlikely(!rl->length && new_length == rl->vcn)) */ {
-		/* Runlist already has same size as requested. */
-		rl->lcn = LCN_ENOENT;
-	}
-	ntfs_debug("Done.");
-	return 0;
-}
-
-/**
- * ntfs_rl_punch_nolock - punch a hole into a runlist
- * @vol:	ntfs volume (needed for error output)
- * @runlist:	runlist to punch a hole into
- * @start:	starting VCN of the hole to be created
- * @length:	size of the hole to be created in units of clusters
- *
- * Punch a hole into the runlist @runlist starting at VCN @start and of size
- * @length clusters.
- *
- * Return 0 on success and -errno on error, in which case @runlist has not been
- * modified.
- *
- * If @start and/or @start + @length are outside the runlist return error code
- * -ENOENT.
- *
- * If the runlist contains unmapped or error elements between @start and @start
- * + @length return error code -EINVAL.
- *
- * Locking: The caller must hold @runlist->lock for writing.
- */
-int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist,
-		const VCN start, const s64 length)
-{
-	const VCN end = start + length;
-	s64 delta;
-	runlist_element *rl, *rl_end, *rl_real_end, *trl;
-	int old_size;
-	bool lcn_fixup = false;
-
-	ntfs_debug("Entering for start 0x%llx, length 0x%llx.",
-			(long long)start, (long long)length);
-	BUG_ON(!runlist);
-	BUG_ON(start < 0);
-	BUG_ON(length < 0);
-	BUG_ON(end < 0);
-	rl = runlist->rl;
-	if (unlikely(!rl)) {
-		if (likely(!start && !length))
-			return 0;
-		return -ENOENT;
-	}
-	/* Find @start in the runlist. */
-	while (likely(rl->length && start >= rl[1].vcn))
-		rl++;
-	rl_end = rl;
-	/* Find @end in the runlist. */
-	while (likely(rl_end->length && end >= rl_end[1].vcn)) {
-		/* Verify there are no unmapped or error elements. */
-		if (unlikely(rl_end->lcn < LCN_HOLE))
-			return -EINVAL;
-		rl_end++;
-	}
-	/* Check the last element. */
-	if (unlikely(rl_end->length && rl_end->lcn < LCN_HOLE))
-		return -EINVAL;
-	/* This covers @start being out of bounds, too. */
-	if (!rl_end->length && end > rl_end->vcn)
-		return -ENOENT;
-	if (!length)
-		return 0;
-	if (!rl->length)
-		return -ENOENT;
-	rl_real_end = rl_end;
-	/* Determine the runlist size. */
-	while (likely(rl_real_end->length))
-		rl_real_end++;
-	old_size = rl_real_end - runlist->rl + 1;
-	/* If @start is in a hole simply extend the hole. */
-	if (rl->lcn == LCN_HOLE) {
-		/*
-		 * If both @start and @end are in the same sparse run, we are
-		 * done.
-		 */
-		if (end <= rl[1].vcn) {
-			ntfs_debug("Done (requested hole is already sparse).");
-			return 0;
-		}
-extend_hole:
-		/* Extend the hole. */
-		rl->length = end - rl->vcn;
-		/* If @end is in a hole, merge it with the current one. */
-		if (rl_end->lcn == LCN_HOLE) {
-			rl_end++;
-			rl->length = rl_end->vcn - rl->vcn;
-		}
-		/* We have done the hole.  Now deal with the remaining tail. */
-		rl++;
-		/* Cut out all runlist elements up to @end. */
-		if (rl < rl_end)
-			memmove(rl, rl_end, (rl_real_end - rl_end + 1) *
-					sizeof(*rl));
-		/* Adjust the beginning of the tail if necessary. */
-		if (end > rl->vcn) {
-			delta = end - rl->vcn;
-			rl->vcn = end;
-			rl->length -= delta;
-			/* Only adjust the lcn if it is real. */
-			if (rl->lcn >= 0)
-				rl->lcn += delta;
-		}
-shrink_allocation:
-		/* Reallocate memory if the allocation changed. */
-		if (rl < rl_end) {
-			rl = ntfs_rl_realloc(runlist->rl, old_size,
-					old_size - (rl_end - rl));
-			if (IS_ERR(rl))
-				ntfs_warning(vol->sb, "Failed to shrink "
-						"runlist buffer.  This just "
-						"wastes a bit of memory "
-						"temporarily so we ignore it "
-						"and return success.");
-			else
-				runlist->rl = rl;
-		}
-		ntfs_debug("Done (extend hole).");
-		return 0;
-	}
-	/*
-	 * If @start is at the beginning of a run things are easier as there is
-	 * no need to split the first run.
-	 */
-	if (start == rl->vcn) {
-		/*
-		 * @start is at the beginning of a run.
-		 *
-		 * If the previous run is sparse, extend its hole.
-		 *
-		 * If @end is not in the same run, switch the run to be sparse
-		 * and extend the newly created hole.
-		 *
-		 * Thus both of these cases reduce the problem to the above
-		 * case of "@start is in a hole".
-		 */
-		if (rl > runlist->rl && (rl - 1)->lcn == LCN_HOLE) {
-			rl--;
-			goto extend_hole;
-		}
-		if (end >= rl[1].vcn) {
-			rl->lcn = LCN_HOLE;
-			goto extend_hole;
-		}
-		/*
-		 * The final case is when @end is in the same run as @start.
-		 * For this need to split the run into two.  One run for the
-		 * sparse region between the beginning of the old run, i.e.
-		 * @start, and @end and one for the remaining non-sparse
-		 * region, i.e. between @end and the end of the old run.
-		 */
-		trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1);
-		if (IS_ERR(trl))
-			goto enomem_out;
-		old_size++;
-		if (runlist->rl != trl) {
-			rl = trl + (rl - runlist->rl);
-			rl_end = trl + (rl_end - runlist->rl);
-			rl_real_end = trl + (rl_real_end - runlist->rl);
-			runlist->rl = trl;
-		}
-split_end:
-		/* Shift all the runs up by one. */
-		memmove(rl + 1, rl, (rl_real_end - rl + 1) * sizeof(*rl));
-		/* Finally, setup the two split runs. */
-		rl->lcn = LCN_HOLE;
-		rl->length = length;
-		rl++;
-		rl->vcn += length;
-		/* Only adjust the lcn if it is real. */
-		if (rl->lcn >= 0 || lcn_fixup)
-			rl->lcn += length;
-		rl->length -= length;
-		ntfs_debug("Done (split one).");
-		return 0;
-	}
-	/*
-	 * @start is neither in a hole nor at the beginning of a run.
-	 *
-	 * If @end is in a hole, things are easier as simply truncating the run
-	 * @start is in to end at @start - 1, deleting all runs after that up
-	 * to @end, and finally extending the beginning of the run @end is in
-	 * to be @start is all that is needed.
-	 */
-	if (rl_end->lcn == LCN_HOLE) {
-		/* Truncate the run containing @start. */
-		rl->length = start - rl->vcn;
-		rl++;
-		/* Cut out all runlist elements up to @end. */
-		if (rl < rl_end)
-			memmove(rl, rl_end, (rl_real_end - rl_end + 1) *
-					sizeof(*rl));
-		/* Extend the beginning of the run @end is in to be @start. */
-		rl->vcn = start;
-		rl->length = rl[1].vcn - start;
-		goto shrink_allocation;
-	}
-	/* 
-	 * If @end is not in a hole there are still two cases to distinguish.
-	 * Either @end is or is not in the same run as @start.
-	 *
-	 * The second case is easier as it can be reduced to an already solved
-	 * problem by truncating the run @start is in to end at @start - 1.
-	 * Then, if @end is in the next run need to split the run into a sparse
-	 * run followed by a non-sparse run (already covered above) and if @end
-	 * is not in the next run switching it to be sparse, again reduces the
-	 * problem to the already covered case of "@start is in a hole".
-	 */
-	if (end >= rl[1].vcn) {
-		/*
-		 * If @end is not in the next run, reduce the problem to the
-		 * case of "@start is in a hole".
-		 */
-		if (rl[1].length && end >= rl[2].vcn) {
-			/* Truncate the run containing @start. */
-			rl->length = start - rl->vcn;
-			rl++;
-			rl->vcn = start;
-			rl->lcn = LCN_HOLE;
-			goto extend_hole;
-		}
-		trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 1);
-		if (IS_ERR(trl))
-			goto enomem_out;
-		old_size++;
-		if (runlist->rl != trl) {
-			rl = trl + (rl - runlist->rl);
-			rl_end = trl + (rl_end - runlist->rl);
-			rl_real_end = trl + (rl_real_end - runlist->rl);
-			runlist->rl = trl;
-		}
-		/* Truncate the run containing @start. */
-		rl->length = start - rl->vcn;
-		rl++;
-		/*
-		 * @end is in the next run, reduce the problem to the case
-		 * where "@start is at the beginning of a run and @end is in
-		 * the same run as @start".
-		 */
-		delta = rl->vcn - start;
-		rl->vcn = start;
-		if (rl->lcn >= 0) {
-			rl->lcn -= delta;
-			/* Need this in case the lcn just became negative. */
-			lcn_fixup = true;
-		}
-		rl->length += delta;
-		goto split_end;
-	}
-	/*
-	 * The first case from above, i.e. @end is in the same run as @start.
-	 * We need to split the run into three.  One run for the non-sparse
-	 * region between the beginning of the old run and @start, one for the
-	 * sparse region between @start and @end, and one for the remaining
-	 * non-sparse region, i.e. between @end and the end of the old run.
-	 */
-	trl = ntfs_rl_realloc(runlist->rl, old_size, old_size + 2);
-	if (IS_ERR(trl))
-		goto enomem_out;
-	old_size += 2;
-	if (runlist->rl != trl) {
-		rl = trl + (rl - runlist->rl);
-		rl_end = trl + (rl_end - runlist->rl);
-		rl_real_end = trl + (rl_real_end - runlist->rl);
-		runlist->rl = trl;
-	}
-	/* Shift all the runs up by two. */
-	memmove(rl + 2, rl, (rl_real_end - rl + 1) * sizeof(*rl));
-	/* Finally, setup the three split runs. */
-	rl->length = start - rl->vcn;
-	rl++;
-	rl->vcn = start;
-	rl->lcn = LCN_HOLE;
-	rl->length = length;
-	rl++;
-	delta = end - rl->vcn;
-	rl->vcn = end;
-	rl->lcn += delta;
-	rl->length -= delta;
-	ntfs_debug("Done (split both).");
-	return 0;
-enomem_out:
-	ntfs_error(vol->sb, "Not enough memory to extend runlist buffer.");
-	return -ENOMEM;
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/runlist.h b/fs/ntfs/runlist.h
deleted file mode 100644
index 38de0a375f59..000000000000
--- a/fs/ntfs/runlist.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * runlist.h - Defines for runlist handling in NTFS Linux kernel driver.
- *	       Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2005 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#ifndef _LINUX_NTFS_RUNLIST_H
-#define _LINUX_NTFS_RUNLIST_H
-
-#include "types.h"
-#include "layout.h"
-#include "volume.h"
-
-/**
- * runlist_element - in memory vcn to lcn mapping array element
- * @vcn:	starting vcn of the current array element
- * @lcn:	starting lcn of the current array element
- * @length:	length in clusters of the current array element
- *
- * The last vcn (in fact the last vcn + 1) is reached when length == 0.
- *
- * When lcn == -1 this means that the count vcns starting at vcn are not
- * physically allocated (i.e. this is a hole / data is sparse).
- */
-typedef struct {	/* In memory vcn to lcn mapping structure element. */
-	VCN vcn;	/* vcn = Starting virtual cluster number. */
-	LCN lcn;	/* lcn = Starting logical cluster number. */
-	s64 length;	/* Run length in clusters. */
-} runlist_element;
-
-/**
- * runlist - in memory vcn to lcn mapping array including a read/write lock
- * @rl:		pointer to an array of runlist elements
- * @lock:	read/write spinlock for serializing access to @rl
- *
- */
-typedef struct {
-	runlist_element *rl;
-	struct rw_semaphore lock;
-} runlist;
-
-static inline void ntfs_init_runlist(runlist *rl)
-{
-	rl->rl = NULL;
-	init_rwsem(&rl->lock);
-}
-
-typedef enum {
-	LCN_HOLE		= -1,	/* Keep this as highest value or die! */
-	LCN_RL_NOT_MAPPED	= -2,
-	LCN_ENOENT		= -3,
-	LCN_ENOMEM		= -4,
-	LCN_EIO			= -5,
-} LCN_SPECIAL_VALUES;
-
-extern runlist_element *ntfs_runlists_merge(runlist_element *drl,
-		runlist_element *srl);
-
-extern runlist_element *ntfs_mapping_pairs_decompress(const ntfs_volume *vol,
-		const ATTR_RECORD *attr, runlist_element *old_rl);
-
-extern LCN ntfs_rl_vcn_to_lcn(const runlist_element *rl, const VCN vcn);
-
-#ifdef NTFS_RW
-
-extern runlist_element *ntfs_rl_find_vcn_nolock(runlist_element *rl,
-		const VCN vcn);
-
-extern int ntfs_get_size_for_mapping_pairs(const ntfs_volume *vol,
-		const runlist_element *rl, const VCN first_vcn,
-		const VCN last_vcn);
-
-extern int ntfs_mapping_pairs_build(const ntfs_volume *vol, s8 *dst,
-		const int dst_len, const runlist_element *rl,
-		const VCN first_vcn, const VCN last_vcn, VCN *const stop_vcn);
-
-extern int ntfs_rl_truncate_nolock(const ntfs_volume *vol,
-		runlist *const runlist, const s64 new_length);
-
-int ntfs_rl_punch_nolock(const ntfs_volume *vol, runlist *const runlist,
-		const VCN start, const s64 length);
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_RUNLIST_H */
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
deleted file mode 100644
index 56a7d5bd33e4..000000000000
--- a/fs/ntfs/super.c
+++ /dev/null
@@ -1,3202 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * super.c - NTFS kernel super block handling. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2012 Anton Altaparmakov and Tuxera Inc.
- * Copyright (c) 2001,2002 Richard Russon
- */
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/stddef.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/spinlock.h>
-#include <linux/blkdev.h>	/* For bdev_logical_block_size(). */
-#include <linux/backing-dev.h>
-#include <linux/buffer_head.h>
-#include <linux/vfs.h>
-#include <linux/moduleparam.h>
-#include <linux/bitmap.h>
-
-#include "sysctl.h"
-#include "logfile.h"
-#include "quota.h"
-#include "usnjrnl.h"
-#include "dir.h"
-#include "debug.h"
-#include "index.h"
-#include "inode.h"
-#include "aops.h"
-#include "layout.h"
-#include "malloc.h"
-#include "ntfs.h"
-
-/* Number of mounted filesystems which have compression enabled. */
-static unsigned long ntfs_nr_compression_users;
-
-/* A global default upcase table and a corresponding reference count. */
-static ntfschar *default_upcase;
-static unsigned long ntfs_nr_upcase_users;
-
-/* Error constants/strings used in inode.c::ntfs_show_options(). */
-typedef enum {
-	/* One of these must be present, default is ON_ERRORS_CONTINUE. */
-	ON_ERRORS_PANIC			= 0x01,
-	ON_ERRORS_REMOUNT_RO		= 0x02,
-	ON_ERRORS_CONTINUE		= 0x04,
-	/* Optional, can be combined with any of the above. */
-	ON_ERRORS_RECOVER		= 0x10,
-} ON_ERRORS_ACTIONS;
-
-const option_t on_errors_arr[] = {
-	{ ON_ERRORS_PANIC,	"panic" },
-	{ ON_ERRORS_REMOUNT_RO,	"remount-ro", },
-	{ ON_ERRORS_CONTINUE,	"continue", },
-	{ ON_ERRORS_RECOVER,	"recover" },
-	{ 0,			NULL }
-};
-
-/**
- * simple_getbool - convert input string to a boolean value
- * @s: input string to convert
- * @setval: where to store the output boolean value
- *
- * Copied from old ntfs driver (which copied from vfat driver).
- *
- * "1", "yes", "true", or an empty string are converted to %true.
- * "0", "no", and "false" are converted to %false.
- *
- * Return: %1 if the string is converted or was empty and *setval contains it;
- *	   %0 if the string was not valid.
- */
-static int simple_getbool(char *s, bool *setval)
-{
-	if (s) {
-		if (!strcmp(s, "1") || !strcmp(s, "yes") || !strcmp(s, "true"))
-			*setval = true;
-		else if (!strcmp(s, "0") || !strcmp(s, "no") ||
-							!strcmp(s, "false"))
-			*setval = false;
-		else
-			return 0;
-	} else
-		*setval = true;
-	return 1;
-}
-
-/**
- * parse_options - parse the (re)mount options
- * @vol:	ntfs volume
- * @opt:	string containing the (re)mount options
- *
- * Parse the recognized options in @opt for the ntfs volume described by @vol.
- */
-static bool parse_options(ntfs_volume *vol, char *opt)
-{
-	char *p, *v, *ov;
-	static char *utf8 = "utf8";
-	int errors = 0, sloppy = 0;
-	kuid_t uid = INVALID_UID;
-	kgid_t gid = INVALID_GID;
-	umode_t fmask = (umode_t)-1, dmask = (umode_t)-1;
-	int mft_zone_multiplier = -1, on_errors = -1;
-	int show_sys_files = -1, case_sensitive = -1, disable_sparse = -1;
-	struct nls_table *nls_map = NULL, *old_nls;
-
-	/* I am lazy... (-8 */
-#define NTFS_GETOPT_WITH_DEFAULT(option, variable, default_value)	\
-	if (!strcmp(p, option)) {					\
-		if (!v || !*v)						\
-			variable = default_value;			\
-		else {							\
-			variable = simple_strtoul(ov = v, &v, 0);	\
-			if (*v)						\
-				goto needs_val;				\
-		}							\
-	}
-#define NTFS_GETOPT(option, variable)					\
-	if (!strcmp(p, option)) {					\
-		if (!v || !*v)						\
-			goto needs_arg;					\
-		variable = simple_strtoul(ov = v, &v, 0);		\
-		if (*v)							\
-			goto needs_val;					\
-	}
-#define NTFS_GETOPT_UID(option, variable)				\
-	if (!strcmp(p, option)) {					\
-		uid_t uid_value;					\
-		if (!v || !*v)						\
-			goto needs_arg;					\
-		uid_value = simple_strtoul(ov = v, &v, 0);		\
-		if (*v)							\
-			goto needs_val;					\
-		variable = make_kuid(current_user_ns(), uid_value);	\
-		if (!uid_valid(variable))				\
-			goto needs_val;					\
-	}
-#define NTFS_GETOPT_GID(option, variable)				\
-	if (!strcmp(p, option)) {					\
-		gid_t gid_value;					\
-		if (!v || !*v)						\
-			goto needs_arg;					\
-		gid_value = simple_strtoul(ov = v, &v, 0);		\
-		if (*v)							\
-			goto needs_val;					\
-		variable = make_kgid(current_user_ns(), gid_value);	\
-		if (!gid_valid(variable))				\
-			goto needs_val;					\
-	}
-#define NTFS_GETOPT_OCTAL(option, variable)				\
-	if (!strcmp(p, option)) {					\
-		if (!v || !*v)						\
-			goto needs_arg;					\
-		variable = simple_strtoul(ov = v, &v, 8);		\
-		if (*v)							\
-			goto needs_val;					\
-	}
-#define NTFS_GETOPT_BOOL(option, variable)				\
-	if (!strcmp(p, option)) {					\
-		bool val;						\
-		if (!simple_getbool(v, &val))				\
-			goto needs_bool;				\
-		variable = val;						\
-	}
-#define NTFS_GETOPT_OPTIONS_ARRAY(option, variable, opt_array)		\
-	if (!strcmp(p, option)) {					\
-		int _i;							\
-		if (!v || !*v)						\
-			goto needs_arg;					\
-		ov = v;							\
-		if (variable == -1)					\
-			variable = 0;					\
-		for (_i = 0; opt_array[_i].str && *opt_array[_i].str; _i++) \
-			if (!strcmp(opt_array[_i].str, v)) {		\
-				variable |= opt_array[_i].val;		\
-				break;					\
-			}						\
-		if (!opt_array[_i].str || !*opt_array[_i].str)		\
-			goto needs_val;					\
-	}
-	if (!opt || !*opt)
-		goto no_mount_options;
-	ntfs_debug("Entering with mount options string: %s", opt);
-	while ((p = strsep(&opt, ","))) {
-		if ((v = strchr(p, '=')))
-			*v++ = 0;
-		NTFS_GETOPT_UID("uid", uid)
-		else NTFS_GETOPT_GID("gid", gid)
-		else NTFS_GETOPT_OCTAL("umask", fmask = dmask)
-		else NTFS_GETOPT_OCTAL("fmask", fmask)
-		else NTFS_GETOPT_OCTAL("dmask", dmask)
-		else NTFS_GETOPT("mft_zone_multiplier", mft_zone_multiplier)
-		else NTFS_GETOPT_WITH_DEFAULT("sloppy", sloppy, true)
-		else NTFS_GETOPT_BOOL("show_sys_files", show_sys_files)
-		else NTFS_GETOPT_BOOL("case_sensitive", case_sensitive)
-		else NTFS_GETOPT_BOOL("disable_sparse", disable_sparse)
-		else NTFS_GETOPT_OPTIONS_ARRAY("errors", on_errors,
-				on_errors_arr)
-		else if (!strcmp(p, "posix") || !strcmp(p, "show_inodes"))
-			ntfs_warning(vol->sb, "Ignoring obsolete option %s.",
-					p);
-		else if (!strcmp(p, "nls") || !strcmp(p, "iocharset")) {
-			if (!strcmp(p, "iocharset"))
-				ntfs_warning(vol->sb, "Option iocharset is "
-						"deprecated. Please use "
-						"option nls=<charsetname> in "
-						"the future.");
-			if (!v || !*v)
-				goto needs_arg;
-use_utf8:
-			old_nls = nls_map;
-			nls_map = load_nls(v);
-			if (!nls_map) {
-				if (!old_nls) {
-					ntfs_error(vol->sb, "NLS character set "
-							"%s not found.", v);
-					return false;
-				}
-				ntfs_error(vol->sb, "NLS character set %s not "
-						"found. Using previous one %s.",
-						v, old_nls->charset);
-				nls_map = old_nls;
-			} else /* nls_map */ {
-				unload_nls(old_nls);
-			}
-		} else if (!strcmp(p, "utf8")) {
-			bool val = false;
-			ntfs_warning(vol->sb, "Option utf8 is no longer "
-				   "supported, using option nls=utf8. Please "
-				   "use option nls=utf8 in the future and "
-				   "make sure utf8 is compiled either as a "
-				   "module or into the kernel.");
-			if (!v || !*v)
-				val = true;
-			else if (!simple_getbool(v, &val))
-				goto needs_bool;
-			if (val) {
-				v = utf8;
-				goto use_utf8;
-			}
-		} else {
-			ntfs_error(vol->sb, "Unrecognized mount option %s.", p);
-			if (errors < INT_MAX)
-				errors++;
-		}
-#undef NTFS_GETOPT_OPTIONS_ARRAY
-#undef NTFS_GETOPT_BOOL
-#undef NTFS_GETOPT
-#undef NTFS_GETOPT_WITH_DEFAULT
-	}
-no_mount_options:
-	if (errors && !sloppy)
-		return false;
-	if (sloppy)
-		ntfs_warning(vol->sb, "Sloppy option given. Ignoring "
-				"unrecognized mount option(s) and continuing.");
-	/* Keep this first! */
-	if (on_errors != -1) {
-		if (!on_errors) {
-			ntfs_error(vol->sb, "Invalid errors option argument "
-					"or bug in options parser.");
-			return false;
-		}
-	}
-	if (nls_map) {
-		if (vol->nls_map && vol->nls_map != nls_map) {
-			ntfs_error(vol->sb, "Cannot change NLS character set "
-					"on remount.");
-			return false;
-		} /* else (!vol->nls_map) */
-		ntfs_debug("Using NLS character set %s.", nls_map->charset);
-		vol->nls_map = nls_map;
-	} else /* (!nls_map) */ {
-		if (!vol->nls_map) {
-			vol->nls_map = load_nls_default();
-			if (!vol->nls_map) {
-				ntfs_error(vol->sb, "Failed to load default "
-						"NLS character set.");
-				return false;
-			}
-			ntfs_debug("Using default NLS character set (%s).",
-					vol->nls_map->charset);
-		}
-	}
-	if (mft_zone_multiplier != -1) {
-		if (vol->mft_zone_multiplier && vol->mft_zone_multiplier !=
-				mft_zone_multiplier) {
-			ntfs_error(vol->sb, "Cannot change mft_zone_multiplier "
-					"on remount.");
-			return false;
-		}
-		if (mft_zone_multiplier < 1 || mft_zone_multiplier > 4) {
-			ntfs_error(vol->sb, "Invalid mft_zone_multiplier. "
-					"Using default value, i.e. 1.");
-			mft_zone_multiplier = 1;
-		}
-		vol->mft_zone_multiplier = mft_zone_multiplier;
-	}
-	if (!vol->mft_zone_multiplier)
-		vol->mft_zone_multiplier = 1;
-	if (on_errors != -1)
-		vol->on_errors = on_errors;
-	if (!vol->on_errors || vol->on_errors == ON_ERRORS_RECOVER)
-		vol->on_errors |= ON_ERRORS_CONTINUE;
-	if (uid_valid(uid))
-		vol->uid = uid;
-	if (gid_valid(gid))
-		vol->gid = gid;
-	if (fmask != (umode_t)-1)
-		vol->fmask = fmask;
-	if (dmask != (umode_t)-1)
-		vol->dmask = dmask;
-	if (show_sys_files != -1) {
-		if (show_sys_files)
-			NVolSetShowSystemFiles(vol);
-		else
-			NVolClearShowSystemFiles(vol);
-	}
-	if (case_sensitive != -1) {
-		if (case_sensitive)
-			NVolSetCaseSensitive(vol);
-		else
-			NVolClearCaseSensitive(vol);
-	}
-	if (disable_sparse != -1) {
-		if (disable_sparse)
-			NVolClearSparseEnabled(vol);
-		else {
-			if (!NVolSparseEnabled(vol) &&
-					vol->major_ver && vol->major_ver < 3)
-				ntfs_warning(vol->sb, "Not enabling sparse "
-						"support due to NTFS volume "
-						"version %i.%i (need at least "
-						"version 3.0).", vol->major_ver,
-						vol->minor_ver);
-			else
-				NVolSetSparseEnabled(vol);
-		}
-	}
-	return true;
-needs_arg:
-	ntfs_error(vol->sb, "The %s option requires an argument.", p);
-	return false;
-needs_bool:
-	ntfs_error(vol->sb, "The %s option requires a boolean argument.", p);
-	return false;
-needs_val:
-	ntfs_error(vol->sb, "Invalid %s option argument: %s", p, ov);
-	return false;
-}
-
-#ifdef NTFS_RW
-
-/**
- * ntfs_write_volume_flags - write new flags to the volume information flags
- * @vol:	ntfs volume on which to modify the flags
- * @flags:	new flags value for the volume information flags
- *
- * Internal function.  You probably want to use ntfs_{set,clear}_volume_flags()
- * instead (see below).
- *
- * Replace the volume information flags on the volume @vol with the value
- * supplied in @flags.  Note, this overwrites the volume information flags, so
- * make sure to combine the flags you want to modify with the old flags and use
- * the result when calling ntfs_write_volume_flags().
- *
- * Return 0 on success and -errno on error.
- */
-static int ntfs_write_volume_flags(ntfs_volume *vol, const VOLUME_FLAGS flags)
-{
-	ntfs_inode *ni = NTFS_I(vol->vol_ino);
-	MFT_RECORD *m;
-	VOLUME_INFORMATION *vi;
-	ntfs_attr_search_ctx *ctx;
-	int err;
-
-	ntfs_debug("Entering, old flags = 0x%x, new flags = 0x%x.",
-			le16_to_cpu(vol->vol_flags), le16_to_cpu(flags));
-	if (vol->vol_flags == flags)
-		goto done;
-	BUG_ON(!ni);
-	m = map_mft_record(ni);
-	if (IS_ERR(m)) {
-		err = PTR_ERR(m);
-		goto err_out;
-	}
-	ctx = ntfs_attr_get_search_ctx(ni, m);
-	if (!ctx) {
-		err = -ENOMEM;
-		goto put_unm_err_out;
-	}
-	err = ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0,
-			ctx);
-	if (err)
-		goto put_unm_err_out;
-	vi = (VOLUME_INFORMATION*)((u8*)ctx->attr +
-			le16_to_cpu(ctx->attr->data.resident.value_offset));
-	vol->vol_flags = vi->flags = flags;
-	flush_dcache_mft_record_page(ctx->ntfs_ino);
-	mark_mft_record_dirty(ctx->ntfs_ino);
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(ni);
-done:
-	ntfs_debug("Done.");
-	return 0;
-put_unm_err_out:
-	if (ctx)
-		ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(ni);
-err_out:
-	ntfs_error(vol->sb, "Failed with error code %i.", -err);
-	return err;
-}
-
-/**
- * ntfs_set_volume_flags - set bits in the volume information flags
- * @vol:	ntfs volume on which to modify the flags
- * @flags:	flags to set on the volume
- *
- * Set the bits in @flags in the volume information flags on the volume @vol.
- *
- * Return 0 on success and -errno on error.
- */
-static inline int ntfs_set_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags)
-{
-	flags &= VOLUME_FLAGS_MASK;
-	return ntfs_write_volume_flags(vol, vol->vol_flags | flags);
-}
-
-/**
- * ntfs_clear_volume_flags - clear bits in the volume information flags
- * @vol:	ntfs volume on which to modify the flags
- * @flags:	flags to clear on the volume
- *
- * Clear the bits in @flags in the volume information flags on the volume @vol.
- *
- * Return 0 on success and -errno on error.
- */
-static inline int ntfs_clear_volume_flags(ntfs_volume *vol, VOLUME_FLAGS flags)
-{
-	flags &= VOLUME_FLAGS_MASK;
-	flags = vol->vol_flags & cpu_to_le16(~le16_to_cpu(flags));
-	return ntfs_write_volume_flags(vol, flags);
-}
-
-#endif /* NTFS_RW */
-
-/**
- * ntfs_remount - change the mount options of a mounted ntfs filesystem
- * @sb:		superblock of mounted ntfs filesystem
- * @flags:	remount flags
- * @opt:	remount options string
- *
- * Change the mount options of an already mounted ntfs filesystem.
- *
- * NOTE:  The VFS sets the @sb->s_flags remount flags to @flags after
- * ntfs_remount() returns successfully (i.e. returns 0).  Otherwise,
- * @sb->s_flags are not changed.
- */
-static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
-{
-	ntfs_volume *vol = NTFS_SB(sb);
-
-	ntfs_debug("Entering with remount options string: %s", opt);
-
-	sync_filesystem(sb);
-
-#ifndef NTFS_RW
-	/* For read-only compiled driver, enforce read-only flag. */
-	*flags |= SB_RDONLY;
-#else /* NTFS_RW */
-	/*
-	 * For the read-write compiled driver, if we are remounting read-write,
-	 * make sure there are no volume errors and that no unsupported volume
-	 * flags are set.  Also, empty the logfile journal as it would become
-	 * stale as soon as something is written to the volume and mark the
-	 * volume dirty so that chkdsk is run if the volume is not umounted
-	 * cleanly.  Finally, mark the quotas out of date so Windows rescans
-	 * the volume on boot and updates them.
-	 *
-	 * When remounting read-only, mark the volume clean if no volume errors
-	 * have occurred.
-	 */
-	if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) {
-		static const char *es = ".  Cannot remount read-write.";
-
-		/* Remounting read-write. */
-		if (NVolErrors(vol)) {
-			ntfs_error(sb, "Volume has errors and is read-only%s",
-					es);
-			return -EROFS;
-		}
-		if (vol->vol_flags & VOLUME_IS_DIRTY) {
-			ntfs_error(sb, "Volume is dirty and read-only%s", es);
-			return -EROFS;
-		}
-		if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
-			ntfs_error(sb, "Volume has been modified by chkdsk "
-					"and is read-only%s", es);
-			return -EROFS;
-		}
-		if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
-			ntfs_error(sb, "Volume has unsupported flags set "
-					"(0x%x) and is read-only%s",
-					(unsigned)le16_to_cpu(vol->vol_flags),
-					es);
-			return -EROFS;
-		}
-		if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
-			ntfs_error(sb, "Failed to set dirty bit in volume "
-					"information flags%s", es);
-			return -EROFS;
-		}
-#if 0
-		// TODO: Enable this code once we start modifying anything that
-		//	 is different between NTFS 1.2 and 3.x...
-		/* Set NT4 compatibility flag on newer NTFS version volumes. */
-		if ((vol->major_ver > 1)) {
-			if (ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) {
-				ntfs_error(sb, "Failed to set NT4 "
-						"compatibility flag%s", es);
-				NVolSetErrors(vol);
-				return -EROFS;
-			}
-		}
-#endif
-		if (!ntfs_empty_logfile(vol->logfile_ino)) {
-			ntfs_error(sb, "Failed to empty journal $LogFile%s",
-					es);
-			NVolSetErrors(vol);
-			return -EROFS;
-		}
-		if (!ntfs_mark_quotas_out_of_date(vol)) {
-			ntfs_error(sb, "Failed to mark quotas out of date%s",
-					es);
-			NVolSetErrors(vol);
-			return -EROFS;
-		}
-		if (!ntfs_stamp_usnjrnl(vol)) {
-			ntfs_error(sb, "Failed to stamp transaction log "
-					"($UsnJrnl)%s", es);
-			NVolSetErrors(vol);
-			return -EROFS;
-		}
-	} else if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) {
-		/* Remounting read-only. */
-		if (!NVolErrors(vol)) {
-			if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY))
-				ntfs_warning(sb, "Failed to clear dirty bit "
-						"in volume information "
-						"flags.  Run chkdsk.");
-		}
-	}
-#endif /* NTFS_RW */
-
-	// TODO: Deal with *flags.
-
-	if (!parse_options(vol, opt))
-		return -EINVAL;
-
-	ntfs_debug("Done.");
-	return 0;
-}
-
-/**
- * is_boot_sector_ntfs - check whether a boot sector is a valid NTFS boot sector
- * @sb:		Super block of the device to which @b belongs.
- * @b:		Boot sector of device @sb to check.
- * @silent:	If 'true', all output will be silenced.
- *
- * is_boot_sector_ntfs() checks whether the boot sector @b is a valid NTFS boot
- * sector. Returns 'true' if it is valid and 'false' if not.
- *
- * @sb is only needed for warning/error output, i.e. it can be NULL when silent
- * is 'true'.
- */
-static bool is_boot_sector_ntfs(const struct super_block *sb,
-		const NTFS_BOOT_SECTOR *b, const bool silent)
-{
-	/*
-	 * Check that checksum == sum of u32 values from b to the checksum
-	 * field.  If checksum is zero, no checking is done.  We will work when
-	 * the checksum test fails, since some utilities update the boot sector
-	 * ignoring the checksum which leaves the checksum out-of-date.  We
-	 * report a warning if this is the case.
-	 */
-	if ((void*)b < (void*)&b->checksum && b->checksum && !silent) {
-		le32 *u;
-		u32 i;
-
-		for (i = 0, u = (le32*)b; u < (le32*)(&b->checksum); ++u)
-			i += le32_to_cpup(u);
-		if (le32_to_cpu(b->checksum) != i)
-			ntfs_warning(sb, "Invalid boot sector checksum.");
-	}
-	/* Check OEMidentifier is "NTFS    " */
-	if (b->oem_id != magicNTFS)
-		goto not_ntfs;
-	/* Check bytes per sector value is between 256 and 4096. */
-	if (le16_to_cpu(b->bpb.bytes_per_sector) < 0x100 ||
-			le16_to_cpu(b->bpb.bytes_per_sector) > 0x1000)
-		goto not_ntfs;
-	/* Check sectors per cluster value is valid. */
-	switch (b->bpb.sectors_per_cluster) {
-	case 1: case 2: case 4: case 8: case 16: case 32: case 64: case 128:
-		break;
-	default:
-		goto not_ntfs;
-	}
-	/* Check the cluster size is not above the maximum (64kiB). */
-	if ((u32)le16_to_cpu(b->bpb.bytes_per_sector) *
-			b->bpb.sectors_per_cluster > NTFS_MAX_CLUSTER_SIZE)
-		goto not_ntfs;
-	/* Check reserved/unused fields are really zero. */
-	if (le16_to_cpu(b->bpb.reserved_sectors) ||
-			le16_to_cpu(b->bpb.root_entries) ||
-			le16_to_cpu(b->bpb.sectors) ||
-			le16_to_cpu(b->bpb.sectors_per_fat) ||
-			le32_to_cpu(b->bpb.large_sectors) || b->bpb.fats)
-		goto not_ntfs;
-	/* Check clusters per file mft record value is valid. */
-	if ((u8)b->clusters_per_mft_record < 0xe1 ||
-			(u8)b->clusters_per_mft_record > 0xf7)
-		switch (b->clusters_per_mft_record) {
-		case 1: case 2: case 4: case 8: case 16: case 32: case 64:
-			break;
-		default:
-			goto not_ntfs;
-		}
-	/* Check clusters per index block value is valid. */
-	if ((u8)b->clusters_per_index_record < 0xe1 ||
-			(u8)b->clusters_per_index_record > 0xf7)
-		switch (b->clusters_per_index_record) {
-		case 1: case 2: case 4: case 8: case 16: case 32: case 64:
-			break;
-		default:
-			goto not_ntfs;
-		}
-	/*
-	 * Check for valid end of sector marker. We will work without it, but
-	 * many BIOSes will refuse to boot from a bootsector if the magic is
-	 * incorrect, so we emit a warning.
-	 */
-	if (!silent && b->end_of_sector_marker != cpu_to_le16(0xaa55))
-		ntfs_warning(sb, "Invalid end of sector marker.");
-	return true;
-not_ntfs:
-	return false;
-}
-
-/**
- * read_ntfs_boot_sector - read the NTFS boot sector of a device
- * @sb:		super block of device to read the boot sector from
- * @silent:	if true, suppress all output
- *
- * Reads the boot sector from the device and validates it. If that fails, tries
- * to read the backup boot sector, first from the end of the device a-la NT4 and
- * later and then from the middle of the device a-la NT3.51 and before.
- *
- * If a valid boot sector is found but it is not the primary boot sector, we
- * repair the primary boot sector silently (unless the device is read-only or
- * the primary boot sector is not accessible).
- *
- * NOTE: To call this function, @sb must have the fields s_dev, the ntfs super
- * block (u.ntfs_sb), nr_blocks and the device flags (s_flags) initialized
- * to their respective values.
- *
- * Return the unlocked buffer head containing the boot sector or NULL on error.
- */
-static struct buffer_head *read_ntfs_boot_sector(struct super_block *sb,
-		const int silent)
-{
-	const char *read_err_str = "Unable to read %s boot sector.";
-	struct buffer_head *bh_primary, *bh_backup;
-	sector_t nr_blocks = NTFS_SB(sb)->nr_blocks;
-
-	/* Try to read primary boot sector. */
-	if ((bh_primary = sb_bread(sb, 0))) {
-		if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*)
-				bh_primary->b_data, silent))
-			return bh_primary;
-		if (!silent)
-			ntfs_error(sb, "Primary boot sector is invalid.");
-	} else if (!silent)
-		ntfs_error(sb, read_err_str, "primary");
-	if (!(NTFS_SB(sb)->on_errors & ON_ERRORS_RECOVER)) {
-		if (bh_primary)
-			brelse(bh_primary);
-		if (!silent)
-			ntfs_error(sb, "Mount option errors=recover not used. "
-					"Aborting without trying to recover.");
-		return NULL;
-	}
-	/* Try to read NT4+ backup boot sector. */
-	if ((bh_backup = sb_bread(sb, nr_blocks - 1))) {
-		if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*)
-				bh_backup->b_data, silent))
-			goto hotfix_primary_boot_sector;
-		brelse(bh_backup);
-	} else if (!silent)
-		ntfs_error(sb, read_err_str, "backup");
-	/* Try to read NT3.51- backup boot sector. */
-	if ((bh_backup = sb_bread(sb, nr_blocks >> 1))) {
-		if (is_boot_sector_ntfs(sb, (NTFS_BOOT_SECTOR*)
-				bh_backup->b_data, silent))
-			goto hotfix_primary_boot_sector;
-		if (!silent)
-			ntfs_error(sb, "Could not find a valid backup boot "
-					"sector.");
-		brelse(bh_backup);
-	} else if (!silent)
-		ntfs_error(sb, read_err_str, "backup");
-	/* We failed. Cleanup and return. */
-	if (bh_primary)
-		brelse(bh_primary);
-	return NULL;
-hotfix_primary_boot_sector:
-	if (bh_primary) {
-		/*
-		 * If we managed to read sector zero and the volume is not
-		 * read-only, copy the found, valid backup boot sector to the
-		 * primary boot sector.  Note we only copy the actual boot
-		 * sector structure, not the actual whole device sector as that
-		 * may be bigger and would potentially damage the $Boot system
-		 * file (FIXME: Would be nice to know if the backup boot sector
-		 * on a large sector device contains the whole boot loader or
-		 * just the first 512 bytes).
-		 */
-		if (!sb_rdonly(sb)) {
-			ntfs_warning(sb, "Hot-fix: Recovering invalid primary "
-					"boot sector from backup copy.");
-			memcpy(bh_primary->b_data, bh_backup->b_data,
-					NTFS_BLOCK_SIZE);
-			mark_buffer_dirty(bh_primary);
-			sync_dirty_buffer(bh_primary);
-			if (buffer_uptodate(bh_primary)) {
-				brelse(bh_backup);
-				return bh_primary;
-			}
-			ntfs_error(sb, "Hot-fix: Device write error while "
-					"recovering primary boot sector.");
-		} else {
-			ntfs_warning(sb, "Hot-fix: Recovery of primary boot "
-					"sector failed: Read-only mount.");
-		}
-		brelse(bh_primary);
-	}
-	ntfs_warning(sb, "Using backup boot sector.");
-	return bh_backup;
-}
-
-/**
- * parse_ntfs_boot_sector - parse the boot sector and store the data in @vol
- * @vol:	volume structure to initialise with data from boot sector
- * @b:		boot sector to parse
- *
- * Parse the ntfs boot sector @b and store all imporant information therein in
- * the ntfs super block @vol.  Return 'true' on success and 'false' on error.
- */
-static bool parse_ntfs_boot_sector(ntfs_volume *vol, const NTFS_BOOT_SECTOR *b)
-{
-	unsigned int sectors_per_cluster_bits, nr_hidden_sects;
-	int clusters_per_mft_record, clusters_per_index_record;
-	s64 ll;
-
-	vol->sector_size = le16_to_cpu(b->bpb.bytes_per_sector);
-	vol->sector_size_bits = ffs(vol->sector_size) - 1;
-	ntfs_debug("vol->sector_size = %i (0x%x)", vol->sector_size,
-			vol->sector_size);
-	ntfs_debug("vol->sector_size_bits = %i (0x%x)", vol->sector_size_bits,
-			vol->sector_size_bits);
-	if (vol->sector_size < vol->sb->s_blocksize) {
-		ntfs_error(vol->sb, "Sector size (%i) is smaller than the "
-				"device block size (%lu).  This is not "
-				"supported.  Sorry.", vol->sector_size,
-				vol->sb->s_blocksize);
-		return false;
-	}
-	ntfs_debug("sectors_per_cluster = 0x%x", b->bpb.sectors_per_cluster);
-	sectors_per_cluster_bits = ffs(b->bpb.sectors_per_cluster) - 1;
-	ntfs_debug("sectors_per_cluster_bits = 0x%x",
-			sectors_per_cluster_bits);
-	nr_hidden_sects = le32_to_cpu(b->bpb.hidden_sectors);
-	ntfs_debug("number of hidden sectors = 0x%x", nr_hidden_sects);
-	vol->cluster_size = vol->sector_size << sectors_per_cluster_bits;
-	vol->cluster_size_mask = vol->cluster_size - 1;
-	vol->cluster_size_bits = ffs(vol->cluster_size) - 1;
-	ntfs_debug("vol->cluster_size = %i (0x%x)", vol->cluster_size,
-			vol->cluster_size);
-	ntfs_debug("vol->cluster_size_mask = 0x%x", vol->cluster_size_mask);
-	ntfs_debug("vol->cluster_size_bits = %i", vol->cluster_size_bits);
-	if (vol->cluster_size < vol->sector_size) {
-		ntfs_error(vol->sb, "Cluster size (%i) is smaller than the "
-				"sector size (%i).  This is not supported.  "
-				"Sorry.", vol->cluster_size, vol->sector_size);
-		return false;
-	}
-	clusters_per_mft_record = b->clusters_per_mft_record;
-	ntfs_debug("clusters_per_mft_record = %i (0x%x)",
-			clusters_per_mft_record, clusters_per_mft_record);
-	if (clusters_per_mft_record > 0)
-		vol->mft_record_size = vol->cluster_size <<
-				(ffs(clusters_per_mft_record) - 1);
-	else
-		/*
-		 * When mft_record_size < cluster_size, clusters_per_mft_record
-		 * = -log2(mft_record_size) bytes. mft_record_size normaly is
-		 * 1024 bytes, which is encoded as 0xF6 (-10 in decimal).
-		 */
-		vol->mft_record_size = 1 << -clusters_per_mft_record;
-	vol->mft_record_size_mask = vol->mft_record_size - 1;
-	vol->mft_record_size_bits = ffs(vol->mft_record_size) - 1;
-	ntfs_debug("vol->mft_record_size = %i (0x%x)", vol->mft_record_size,
-			vol->mft_record_size);
-	ntfs_debug("vol->mft_record_size_mask = 0x%x",
-			vol->mft_record_size_mask);
-	ntfs_debug("vol->mft_record_size_bits = %i (0x%x)",
-			vol->mft_record_size_bits, vol->mft_record_size_bits);
-	/*
-	 * We cannot support mft record sizes above the PAGE_SIZE since
-	 * we store $MFT/$DATA, the table of mft records in the page cache.
-	 */
-	if (vol->mft_record_size > PAGE_SIZE) {
-		ntfs_error(vol->sb, "Mft record size (%i) exceeds the "
-				"PAGE_SIZE on your system (%lu).  "
-				"This is not supported.  Sorry.",
-				vol->mft_record_size, PAGE_SIZE);
-		return false;
-	}
-	/* We cannot support mft record sizes below the sector size. */
-	if (vol->mft_record_size < vol->sector_size) {
-		ntfs_error(vol->sb, "Mft record size (%i) is smaller than the "
-				"sector size (%i).  This is not supported.  "
-				"Sorry.", vol->mft_record_size,
-				vol->sector_size);
-		return false;
-	}
-	clusters_per_index_record = b->clusters_per_index_record;
-	ntfs_debug("clusters_per_index_record = %i (0x%x)",
-			clusters_per_index_record, clusters_per_index_record);
-	if (clusters_per_index_record > 0)
-		vol->index_record_size = vol->cluster_size <<
-				(ffs(clusters_per_index_record) - 1);
-	else
-		/*
-		 * When index_record_size < cluster_size,
-		 * clusters_per_index_record = -log2(index_record_size) bytes.
-		 * index_record_size normaly equals 4096 bytes, which is
-		 * encoded as 0xF4 (-12 in decimal).
-		 */
-		vol->index_record_size = 1 << -clusters_per_index_record;
-	vol->index_record_size_mask = vol->index_record_size - 1;
-	vol->index_record_size_bits = ffs(vol->index_record_size) - 1;
-	ntfs_debug("vol->index_record_size = %i (0x%x)",
-			vol->index_record_size, vol->index_record_size);
-	ntfs_debug("vol->index_record_size_mask = 0x%x",
-			vol->index_record_size_mask);
-	ntfs_debug("vol->index_record_size_bits = %i (0x%x)",
-			vol->index_record_size_bits,
-			vol->index_record_size_bits);
-	/* We cannot support index record sizes below the sector size. */
-	if (vol->index_record_size < vol->sector_size) {
-		ntfs_error(vol->sb, "Index record size (%i) is smaller than "
-				"the sector size (%i).  This is not "
-				"supported.  Sorry.", vol->index_record_size,
-				vol->sector_size);
-		return false;
-	}
-	/*
-	 * Get the size of the volume in clusters and check for 64-bit-ness.
-	 * Windows currently only uses 32 bits to save the clusters so we do
-	 * the same as it is much faster on 32-bit CPUs.
-	 */
-	ll = sle64_to_cpu(b->number_of_sectors) >> sectors_per_cluster_bits;
-	if ((u64)ll >= 1ULL << 32) {
-		ntfs_error(vol->sb, "Cannot handle 64-bit clusters.  Sorry.");
-		return false;
-	}
-	vol->nr_clusters = ll;
-	ntfs_debug("vol->nr_clusters = 0x%llx", (long long)vol->nr_clusters);
-	/*
-	 * On an architecture where unsigned long is 32-bits, we restrict the
-	 * volume size to 2TiB (2^41). On a 64-bit architecture, the compiler
-	 * will hopefully optimize the whole check away.
-	 */
-	if (sizeof(unsigned long) < 8) {
-		if ((ll << vol->cluster_size_bits) >= (1ULL << 41)) {
-			ntfs_error(vol->sb, "Volume size (%lluTiB) is too "
-					"large for this architecture.  "
-					"Maximum supported is 2TiB.  Sorry.",
-					(unsigned long long)ll >> (40 -
-					vol->cluster_size_bits));
-			return false;
-		}
-	}
-	ll = sle64_to_cpu(b->mft_lcn);
-	if (ll >= vol->nr_clusters) {
-		ntfs_error(vol->sb, "MFT LCN (%lli, 0x%llx) is beyond end of "
-				"volume.  Weird.", (unsigned long long)ll,
-				(unsigned long long)ll);
-		return false;
-	}
-	vol->mft_lcn = ll;
-	ntfs_debug("vol->mft_lcn = 0x%llx", (long long)vol->mft_lcn);
-	ll = sle64_to_cpu(b->mftmirr_lcn);
-	if (ll >= vol->nr_clusters) {
-		ntfs_error(vol->sb, "MFTMirr LCN (%lli, 0x%llx) is beyond end "
-				"of volume.  Weird.", (unsigned long long)ll,
-				(unsigned long long)ll);
-		return false;
-	}
-	vol->mftmirr_lcn = ll;
-	ntfs_debug("vol->mftmirr_lcn = 0x%llx", (long long)vol->mftmirr_lcn);
-#ifdef NTFS_RW
-	/*
-	 * Work out the size of the mft mirror in number of mft records. If the
-	 * cluster size is less than or equal to the size taken by four mft
-	 * records, the mft mirror stores the first four mft records. If the
-	 * cluster size is bigger than the size taken by four mft records, the
-	 * mft mirror contains as many mft records as will fit into one
-	 * cluster.
-	 */
-	if (vol->cluster_size <= (4 << vol->mft_record_size_bits))
-		vol->mftmirr_size = 4;
-	else
-		vol->mftmirr_size = vol->cluster_size >>
-				vol->mft_record_size_bits;
-	ntfs_debug("vol->mftmirr_size = %i", vol->mftmirr_size);
-#endif /* NTFS_RW */
-	vol->serial_no = le64_to_cpu(b->volume_serial_number);
-	ntfs_debug("vol->serial_no = 0x%llx",
-			(unsigned long long)vol->serial_no);
-	return true;
-}
-
-/**
- * ntfs_setup_allocators - initialize the cluster and mft allocators
- * @vol:	volume structure for which to setup the allocators
- *
- * Setup the cluster (lcn) and mft allocators to the starting values.
- */
-static void ntfs_setup_allocators(ntfs_volume *vol)
-{
-#ifdef NTFS_RW
-	LCN mft_zone_size, mft_lcn;
-#endif /* NTFS_RW */
-
-	ntfs_debug("vol->mft_zone_multiplier = 0x%x",
-			vol->mft_zone_multiplier);
-#ifdef NTFS_RW
-	/* Determine the size of the MFT zone. */
-	mft_zone_size = vol->nr_clusters;
-	switch (vol->mft_zone_multiplier) {  /* % of volume size in clusters */
-	case 4:
-		mft_zone_size >>= 1;			/* 50%   */
-		break;
-	case 3:
-		mft_zone_size = (mft_zone_size +
-				(mft_zone_size >> 1)) >> 2;	/* 37.5% */
-		break;
-	case 2:
-		mft_zone_size >>= 2;			/* 25%   */
-		break;
-	/* case 1: */
-	default:
-		mft_zone_size >>= 3;			/* 12.5% */
-		break;
-	}
-	/* Setup the mft zone. */
-	vol->mft_zone_start = vol->mft_zone_pos = vol->mft_lcn;
-	ntfs_debug("vol->mft_zone_pos = 0x%llx",
-			(unsigned long long)vol->mft_zone_pos);
-	/*
-	 * Calculate the mft_lcn for an unmodified NTFS volume (see mkntfs
-	 * source) and if the actual mft_lcn is in the expected place or even
-	 * further to the front of the volume, extend the mft_zone to cover the
-	 * beginning of the volume as well.  This is in order to protect the
-	 * area reserved for the mft bitmap as well within the mft_zone itself.
-	 * On non-standard volumes we do not protect it as the overhead would
-	 * be higher than the speed increase we would get by doing it.
-	 */
-	mft_lcn = (8192 + 2 * vol->cluster_size - 1) / vol->cluster_size;
-	if (mft_lcn * vol->cluster_size < 16 * 1024)
-		mft_lcn = (16 * 1024 + vol->cluster_size - 1) /
-				vol->cluster_size;
-	if (vol->mft_zone_start <= mft_lcn)
-		vol->mft_zone_start = 0;
-	ntfs_debug("vol->mft_zone_start = 0x%llx",
-			(unsigned long long)vol->mft_zone_start);
-	/*
-	 * Need to cap the mft zone on non-standard volumes so that it does
-	 * not point outside the boundaries of the volume.  We do this by
-	 * halving the zone size until we are inside the volume.
-	 */
-	vol->mft_zone_end = vol->mft_lcn + mft_zone_size;
-	while (vol->mft_zone_end >= vol->nr_clusters) {
-		mft_zone_size >>= 1;
-		vol->mft_zone_end = vol->mft_lcn + mft_zone_size;
-	}
-	ntfs_debug("vol->mft_zone_end = 0x%llx",
-			(unsigned long long)vol->mft_zone_end);
-	/*
-	 * Set the current position within each data zone to the start of the
-	 * respective zone.
-	 */
-	vol->data1_zone_pos = vol->mft_zone_end;
-	ntfs_debug("vol->data1_zone_pos = 0x%llx",
-			(unsigned long long)vol->data1_zone_pos);
-	vol->data2_zone_pos = 0;
-	ntfs_debug("vol->data2_zone_pos = 0x%llx",
-			(unsigned long long)vol->data2_zone_pos);
-
-	/* Set the mft data allocation position to mft record 24. */
-	vol->mft_data_pos = 24;
-	ntfs_debug("vol->mft_data_pos = 0x%llx",
-			(unsigned long long)vol->mft_data_pos);
-#endif /* NTFS_RW */
-}
-
-#ifdef NTFS_RW
-
-/**
- * load_and_init_mft_mirror - load and setup the mft mirror inode for a volume
- * @vol:	ntfs super block describing device whose mft mirror to load
- *
- * Return 'true' on success or 'false' on error.
- */
-static bool load_and_init_mft_mirror(ntfs_volume *vol)
-{
-	struct inode *tmp_ino;
-	ntfs_inode *tmp_ni;
-
-	ntfs_debug("Entering.");
-	/* Get mft mirror inode. */
-	tmp_ino = ntfs_iget(vol->sb, FILE_MFTMirr);
-	if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) {
-		if (!IS_ERR(tmp_ino))
-			iput(tmp_ino);
-		/* Caller will display error message. */
-		return false;
-	}
-	/*
-	 * Re-initialize some specifics about $MFTMirr's inode as
-	 * ntfs_read_inode() will have set up the default ones.
-	 */
-	/* Set uid and gid to root. */
-	tmp_ino->i_uid = GLOBAL_ROOT_UID;
-	tmp_ino->i_gid = GLOBAL_ROOT_GID;
-	/* Regular file.  No access for anyone. */
-	tmp_ino->i_mode = S_IFREG;
-	/* No VFS initiated operations allowed for $MFTMirr. */
-	tmp_ino->i_op = &ntfs_empty_inode_ops;
-	tmp_ino->i_fop = &ntfs_empty_file_ops;
-	/* Put in our special address space operations. */
-	tmp_ino->i_mapping->a_ops = &ntfs_mst_aops;
-	tmp_ni = NTFS_I(tmp_ino);
-	/* The $MFTMirr, like the $MFT is multi sector transfer protected. */
-	NInoSetMstProtected(tmp_ni);
-	NInoSetSparseDisabled(tmp_ni);
-	/*
-	 * Set up our little cheat allowing us to reuse the async read io
-	 * completion handler for directories.
-	 */
-	tmp_ni->itype.index.block_size = vol->mft_record_size;
-	tmp_ni->itype.index.block_size_bits = vol->mft_record_size_bits;
-	vol->mftmirr_ino = tmp_ino;
-	ntfs_debug("Done.");
-	return true;
-}
-
-/**
- * check_mft_mirror - compare contents of the mft mirror with the mft
- * @vol:	ntfs super block describing device whose mft mirror to check
- *
- * Return 'true' on success or 'false' on error.
- *
- * Note, this function also results in the mft mirror runlist being completely
- * mapped into memory.  The mft mirror write code requires this and will BUG()
- * should it find an unmapped runlist element.
- */
-static bool check_mft_mirror(ntfs_volume *vol)
-{
-	struct super_block *sb = vol->sb;
-	ntfs_inode *mirr_ni;
-	struct page *mft_page, *mirr_page;
-	u8 *kmft, *kmirr;
-	runlist_element *rl, rl2[2];
-	pgoff_t index;
-	int mrecs_per_page, i;
-
-	ntfs_debug("Entering.");
-	/* Compare contents of $MFT and $MFTMirr. */
-	mrecs_per_page = PAGE_SIZE / vol->mft_record_size;
-	BUG_ON(!mrecs_per_page);
-	BUG_ON(!vol->mftmirr_size);
-	mft_page = mirr_page = NULL;
-	kmft = kmirr = NULL;
-	index = i = 0;
-	do {
-		u32 bytes;
-
-		/* Switch pages if necessary. */
-		if (!(i % mrecs_per_page)) {
-			if (index) {
-				ntfs_unmap_page(mft_page);
-				ntfs_unmap_page(mirr_page);
-			}
-			/* Get the $MFT page. */
-			mft_page = ntfs_map_page(vol->mft_ino->i_mapping,
-					index);
-			if (IS_ERR(mft_page)) {
-				ntfs_error(sb, "Failed to read $MFT.");
-				return false;
-			}
-			kmft = page_address(mft_page);
-			/* Get the $MFTMirr page. */
-			mirr_page = ntfs_map_page(vol->mftmirr_ino->i_mapping,
-					index);
-			if (IS_ERR(mirr_page)) {
-				ntfs_error(sb, "Failed to read $MFTMirr.");
-				goto mft_unmap_out;
-			}
-			kmirr = page_address(mirr_page);
-			++index;
-		}
-		/* Do not check the record if it is not in use. */
-		if (((MFT_RECORD*)kmft)->flags & MFT_RECORD_IN_USE) {
-			/* Make sure the record is ok. */
-			if (ntfs_is_baad_recordp((le32*)kmft)) {
-				ntfs_error(sb, "Incomplete multi sector "
-						"transfer detected in mft "
-						"record %i.", i);
-mm_unmap_out:
-				ntfs_unmap_page(mirr_page);
-mft_unmap_out:
-				ntfs_unmap_page(mft_page);
-				return false;
-			}
-		}
-		/* Do not check the mirror record if it is not in use. */
-		if (((MFT_RECORD*)kmirr)->flags & MFT_RECORD_IN_USE) {
-			if (ntfs_is_baad_recordp((le32*)kmirr)) {
-				ntfs_error(sb, "Incomplete multi sector "
-						"transfer detected in mft "
-						"mirror record %i.", i);
-				goto mm_unmap_out;
-			}
-		}
-		/* Get the amount of data in the current record. */
-		bytes = le32_to_cpu(((MFT_RECORD*)kmft)->bytes_in_use);
-		if (bytes < sizeof(MFT_RECORD_OLD) ||
-				bytes > vol->mft_record_size ||
-				ntfs_is_baad_recordp((le32*)kmft)) {
-			bytes = le32_to_cpu(((MFT_RECORD*)kmirr)->bytes_in_use);
-			if (bytes < sizeof(MFT_RECORD_OLD) ||
-					bytes > vol->mft_record_size ||
-					ntfs_is_baad_recordp((le32*)kmirr))
-				bytes = vol->mft_record_size;
-		}
-		/* Compare the two records. */
-		if (memcmp(kmft, kmirr, bytes)) {
-			ntfs_error(sb, "$MFT and $MFTMirr (record %i) do not "
-					"match.  Run ntfsfix or chkdsk.", i);
-			goto mm_unmap_out;
-		}
-		kmft += vol->mft_record_size;
-		kmirr += vol->mft_record_size;
-	} while (++i < vol->mftmirr_size);
-	/* Release the last pages. */
-	ntfs_unmap_page(mft_page);
-	ntfs_unmap_page(mirr_page);
-
-	/* Construct the mft mirror runlist by hand. */
-	rl2[0].vcn = 0;
-	rl2[0].lcn = vol->mftmirr_lcn;
-	rl2[0].length = (vol->mftmirr_size * vol->mft_record_size +
-			vol->cluster_size - 1) / vol->cluster_size;
-	rl2[1].vcn = rl2[0].length;
-	rl2[1].lcn = LCN_ENOENT;
-	rl2[1].length = 0;
-	/*
-	 * Because we have just read all of the mft mirror, we know we have
-	 * mapped the full runlist for it.
-	 */
-	mirr_ni = NTFS_I(vol->mftmirr_ino);
-	down_read(&mirr_ni->runlist.lock);
-	rl = mirr_ni->runlist.rl;
-	/* Compare the two runlists.  They must be identical. */
-	i = 0;
-	do {
-		if (rl2[i].vcn != rl[i].vcn || rl2[i].lcn != rl[i].lcn ||
-				rl2[i].length != rl[i].length) {
-			ntfs_error(sb, "$MFTMirr location mismatch.  "
-					"Run chkdsk.");
-			up_read(&mirr_ni->runlist.lock);
-			return false;
-		}
-	} while (rl2[i++].length);
-	up_read(&mirr_ni->runlist.lock);
-	ntfs_debug("Done.");
-	return true;
-}
-
-/**
- * load_and_check_logfile - load and check the logfile inode for a volume
- * @vol:	ntfs super block describing device whose logfile to load
- *
- * Return 'true' on success or 'false' on error.
- */
-static bool load_and_check_logfile(ntfs_volume *vol,
-		RESTART_PAGE_HEADER **rp)
-{
-	struct inode *tmp_ino;
-
-	ntfs_debug("Entering.");
-	tmp_ino = ntfs_iget(vol->sb, FILE_LogFile);
-	if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) {
-		if (!IS_ERR(tmp_ino))
-			iput(tmp_ino);
-		/* Caller will display error message. */
-		return false;
-	}
-	if (!ntfs_check_logfile(tmp_ino, rp)) {
-		iput(tmp_ino);
-		/* ntfs_check_logfile() will have displayed error output. */
-		return false;
-	}
-	NInoSetSparseDisabled(NTFS_I(tmp_ino));
-	vol->logfile_ino = tmp_ino;
-	ntfs_debug("Done.");
-	return true;
-}
-
-#define NTFS_HIBERFIL_HEADER_SIZE	4096
-
-/**
- * check_windows_hibernation_status - check if Windows is suspended on a volume
- * @vol:	ntfs super block of device to check
- *
- * Check if Windows is hibernated on the ntfs volume @vol.  This is done by
- * looking for the file hiberfil.sys in the root directory of the volume.  If
- * the file is not present Windows is definitely not suspended.
- *
- * If hiberfil.sys exists and is less than 4kiB in size it means Windows is
- * definitely suspended (this volume is not the system volume).  Caveat:  on a
- * system with many volumes it is possible that the < 4kiB check is bogus but
- * for now this should do fine.
- *
- * If hiberfil.sys exists and is larger than 4kiB in size, we need to read the
- * hiberfil header (which is the first 4kiB).  If this begins with "hibr",
- * Windows is definitely suspended.  If it is completely full of zeroes,
- * Windows is definitely not hibernated.  Any other case is treated as if
- * Windows is suspended.  This caters for the above mentioned caveat of a
- * system with many volumes where no "hibr" magic would be present and there is
- * no zero header.
- *
- * Return 0 if Windows is not hibernated on the volume, >0 if Windows is
- * hibernated on the volume, and -errno on error.
- */
-static int check_windows_hibernation_status(ntfs_volume *vol)
-{
-	MFT_REF mref;
-	struct inode *vi;
-	struct page *page;
-	u32 *kaddr, *kend;
-	ntfs_name *name = NULL;
-	int ret = 1;
-	static const ntfschar hiberfil[13] = { cpu_to_le16('h'),
-			cpu_to_le16('i'), cpu_to_le16('b'),
-			cpu_to_le16('e'), cpu_to_le16('r'),
-			cpu_to_le16('f'), cpu_to_le16('i'),
-			cpu_to_le16('l'), cpu_to_le16('.'),
-			cpu_to_le16('s'), cpu_to_le16('y'),
-			cpu_to_le16('s'), 0 };
-
-	ntfs_debug("Entering.");
-	/*
-	 * Find the inode number for the hibernation file by looking up the
-	 * filename hiberfil.sys in the root directory.
-	 */
-	inode_lock(vol->root_ino);
-	mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12,
-			&name);
-	inode_unlock(vol->root_ino);
-	if (IS_ERR_MREF(mref)) {
-		ret = MREF_ERR(mref);
-		/* If the file does not exist, Windows is not hibernated. */
-		if (ret == -ENOENT) {
-			ntfs_debug("hiberfil.sys not present.  Windows is not "
-					"hibernated on the volume.");
-			return 0;
-		}
-		/* A real error occurred. */
-		ntfs_error(vol->sb, "Failed to find inode number for "
-				"hiberfil.sys.");
-		return ret;
-	}
-	/* We do not care for the type of match that was found. */
-	kfree(name);
-	/* Get the inode. */
-	vi = ntfs_iget(vol->sb, MREF(mref));
-	if (IS_ERR(vi) || is_bad_inode(vi)) {
-		if (!IS_ERR(vi))
-			iput(vi);
-		ntfs_error(vol->sb, "Failed to load hiberfil.sys.");
-		return IS_ERR(vi) ? PTR_ERR(vi) : -EIO;
-	}
-	if (unlikely(i_size_read(vi) < NTFS_HIBERFIL_HEADER_SIZE)) {
-		ntfs_debug("hiberfil.sys is smaller than 4kiB (0x%llx).  "
-				"Windows is hibernated on the volume.  This "
-				"is not the system volume.", i_size_read(vi));
-		goto iput_out;
-	}
-	page = ntfs_map_page(vi->i_mapping, 0);
-	if (IS_ERR(page)) {
-		ntfs_error(vol->sb, "Failed to read from hiberfil.sys.");
-		ret = PTR_ERR(page);
-		goto iput_out;
-	}
-	kaddr = (u32*)page_address(page);
-	if (*(le32*)kaddr == cpu_to_le32(0x72626968)/*'hibr'*/) {
-		ntfs_debug("Magic \"hibr\" found in hiberfil.sys.  Windows is "
-				"hibernated on the volume.  This is the "
-				"system volume.");
-		goto unm_iput_out;
-	}
-	kend = kaddr + NTFS_HIBERFIL_HEADER_SIZE/sizeof(*kaddr);
-	do {
-		if (unlikely(*kaddr)) {
-			ntfs_debug("hiberfil.sys is larger than 4kiB "
-					"(0x%llx), does not contain the "
-					"\"hibr\" magic, and does not have a "
-					"zero header.  Windows is hibernated "
-					"on the volume.  This is not the "
-					"system volume.", i_size_read(vi));
-			goto unm_iput_out;
-		}
-	} while (++kaddr < kend);
-	ntfs_debug("hiberfil.sys contains a zero header.  Windows is not "
-			"hibernated on the volume.  This is the system "
-			"volume.");
-	ret = 0;
-unm_iput_out:
-	ntfs_unmap_page(page);
-iput_out:
-	iput(vi);
-	return ret;
-}
-
-/**
- * load_and_init_quota - load and setup the quota file for a volume if present
- * @vol:	ntfs super block describing device whose quota file to load
- *
- * Return 'true' on success or 'false' on error.  If $Quota is not present, we
- * leave vol->quota_ino as NULL and return success.
- */
-static bool load_and_init_quota(ntfs_volume *vol)
-{
-	MFT_REF mref;
-	struct inode *tmp_ino;
-	ntfs_name *name = NULL;
-	static const ntfschar Quota[7] = { cpu_to_le16('$'),
-			cpu_to_le16('Q'), cpu_to_le16('u'),
-			cpu_to_le16('o'), cpu_to_le16('t'),
-			cpu_to_le16('a'), 0 };
-	static ntfschar Q[3] = { cpu_to_le16('$'),
-			cpu_to_le16('Q'), 0 };
-
-	ntfs_debug("Entering.");
-	/*
-	 * Find the inode number for the quota file by looking up the filename
-	 * $Quota in the extended system files directory $Extend.
-	 */
-	inode_lock(vol->extend_ino);
-	mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6,
-			&name);
-	inode_unlock(vol->extend_ino);
-	if (IS_ERR_MREF(mref)) {
-		/*
-		 * If the file does not exist, quotas are disabled and have
-		 * never been enabled on this volume, just return success.
-		 */
-		if (MREF_ERR(mref) == -ENOENT) {
-			ntfs_debug("$Quota not present.  Volume does not have "
-					"quotas enabled.");
-			/*
-			 * No need to try to set quotas out of date if they are
-			 * not enabled.
-			 */
-			NVolSetQuotaOutOfDate(vol);
-			return true;
-		}
-		/* A real error occurred. */
-		ntfs_error(vol->sb, "Failed to find inode number for $Quota.");
-		return false;
-	}
-	/* We do not care for the type of match that was found. */
-	kfree(name);
-	/* Get the inode. */
-	tmp_ino = ntfs_iget(vol->sb, MREF(mref));
-	if (IS_ERR(tmp_ino) || is_bad_inode(tmp_ino)) {
-		if (!IS_ERR(tmp_ino))
-			iput(tmp_ino);
-		ntfs_error(vol->sb, "Failed to load $Quota.");
-		return false;
-	}
-	vol->quota_ino = tmp_ino;
-	/* Get the $Q index allocation attribute. */
-	tmp_ino = ntfs_index_iget(vol->quota_ino, Q, 2);
-	if (IS_ERR(tmp_ino)) {
-		ntfs_error(vol->sb, "Failed to load $Quota/$Q index.");
-		return false;
-	}
-	vol->quota_q_ino = tmp_ino;
-	ntfs_debug("Done.");
-	return true;
-}
-
-/**
- * load_and_init_usnjrnl - load and setup the transaction log if present
- * @vol:	ntfs super block describing device whose usnjrnl file to load
- *
- * Return 'true' on success or 'false' on error.
- *
- * If $UsnJrnl is not present or in the process of being disabled, we set
- * NVolUsnJrnlStamped() and return success.
- *
- * If the $UsnJrnl $DATA/$J attribute has a size equal to the lowest valid usn,
- * i.e. transaction logging has only just been enabled or the journal has been
- * stamped and nothing has been logged since, we also set NVolUsnJrnlStamped()
- * and return success.
- */
-static bool load_and_init_usnjrnl(ntfs_volume *vol)
-{
-	MFT_REF mref;
-	struct inode *tmp_ino;
-	ntfs_inode *tmp_ni;
-	struct page *page;
-	ntfs_name *name = NULL;
-	USN_HEADER *uh;
-	static const ntfschar UsnJrnl[9] = { cpu_to_le16('$'),
-			cpu_to_le16('U'), cpu_to_le16('s'),
-			cpu_to_le16('n'), cpu_to_le16('J'),
-			cpu_to_le16('r'), cpu_to_le16('n'),
-			cpu_to_le16('l'), 0 };
-	static ntfschar Max[5] = { cpu_to_le16('$'),
-			cpu_to_le16('M'), cpu_to_le16('a'),
-			cpu_to_le16('x'), 0 };
-	static ntfschar J[3] = { cpu_to_le16('$'),
-			cpu_to_le16('J'), 0 };
-
-	ntfs_debug("Entering.");
-	/*
-	 * Find the inode number for the transaction log file by looking up the
-	 * filename $UsnJrnl in the extended system files directory $Extend.
-	 */
-	inode_lock(vol->extend_ino);
-	mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8,
-			&name);
-	inode_unlock(vol->extend_ino);
-	if (IS_ERR_MREF(mref)) {
-		/*
-		 * If the file does not exist, transaction logging is disabled,
-		 * just return success.
-		 */
-		if (MREF_ERR(mref) == -ENOENT) {
-			ntfs_debug("$UsnJrnl not present.  Volume does not "
-					"have transaction logging enabled.");
-not_enabled:
-			/*
-			 * No need to try to stamp the transaction log if
-			 * transaction logging is not enabled.
-			 */
-			NVolSetUsnJrnlStamped(vol);
-			return true;
-		}
-		/* A real error occurred. */
-		ntfs_error(vol->sb, "Failed to find inode number for "
-				"$UsnJrnl.");
-		return false;
-	}
-	/* We do not care for the type of match that was found. */
-	kfree(name);
-	/* Get the inode. */
-	tmp_ino = ntfs_iget(vol->sb, MREF(mref));
-	if (IS_ERR(tmp_ino) || unlikely(is_bad_inode(tmp_ino))) {
-		if (!IS_ERR(tmp_ino))
-			iput(tmp_ino);
-		ntfs_error(vol->sb, "Failed to load $UsnJrnl.");
-		return false;
-	}
-	vol->usnjrnl_ino = tmp_ino;
-	/*
-	 * If the transaction log is in the process of being deleted, we can
-	 * ignore it.
-	 */
-	if (unlikely(vol->vol_flags & VOLUME_DELETE_USN_UNDERWAY)) {
-		ntfs_debug("$UsnJrnl in the process of being disabled.  "
-				"Volume does not have transaction logging "
-				"enabled.");
-		goto not_enabled;
-	}
-	/* Get the $DATA/$Max attribute. */
-	tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, Max, 4);
-	if (IS_ERR(tmp_ino)) {
-		ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$Max "
-				"attribute.");
-		return false;
-	}
-	vol->usnjrnl_max_ino = tmp_ino;
-	if (unlikely(i_size_read(tmp_ino) < sizeof(USN_HEADER))) {
-		ntfs_error(vol->sb, "Found corrupt $UsnJrnl/$DATA/$Max "
-				"attribute (size is 0x%llx but should be at "
-				"least 0x%zx bytes).", i_size_read(tmp_ino),
-				sizeof(USN_HEADER));
-		return false;
-	}
-	/* Get the $DATA/$J attribute. */
-	tmp_ino = ntfs_attr_iget(vol->usnjrnl_ino, AT_DATA, J, 2);
-	if (IS_ERR(tmp_ino)) {
-		ntfs_error(vol->sb, "Failed to load $UsnJrnl/$DATA/$J "
-				"attribute.");
-		return false;
-	}
-	vol->usnjrnl_j_ino = tmp_ino;
-	/* Verify $J is non-resident and sparse. */
-	tmp_ni = NTFS_I(vol->usnjrnl_j_ino);
-	if (unlikely(!NInoNonResident(tmp_ni) || !NInoSparse(tmp_ni))) {
-		ntfs_error(vol->sb, "$UsnJrnl/$DATA/$J attribute is resident "
-				"and/or not sparse.");
-		return false;
-	}
-	/* Read the USN_HEADER from $DATA/$Max. */
-	page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0);
-	if (IS_ERR(page)) {
-		ntfs_error(vol->sb, "Failed to read from $UsnJrnl/$DATA/$Max "
-				"attribute.");
-		return false;
-	}
-	uh = (USN_HEADER*)page_address(page);
-	/* Sanity check the $Max. */
-	if (unlikely(sle64_to_cpu(uh->allocation_delta) >
-			sle64_to_cpu(uh->maximum_size))) {
-		ntfs_error(vol->sb, "Allocation delta (0x%llx) exceeds "
-				"maximum size (0x%llx).  $UsnJrnl is corrupt.",
-				(long long)sle64_to_cpu(uh->allocation_delta),
-				(long long)sle64_to_cpu(uh->maximum_size));
-		ntfs_unmap_page(page);
-		return false;
-	}
-	/*
-	 * If the transaction log has been stamped and nothing has been written
-	 * to it since, we do not need to stamp it.
-	 */
-	if (unlikely(sle64_to_cpu(uh->lowest_valid_usn) >=
-			i_size_read(vol->usnjrnl_j_ino))) {
-		if (likely(sle64_to_cpu(uh->lowest_valid_usn) ==
-				i_size_read(vol->usnjrnl_j_ino))) {
-			ntfs_unmap_page(page);
-			ntfs_debug("$UsnJrnl is enabled but nothing has been "
-					"logged since it was last stamped.  "
-					"Treating this as if the volume does "
-					"not have transaction logging "
-					"enabled.");
-			goto not_enabled;
-		}
-		ntfs_error(vol->sb, "$UsnJrnl has lowest valid usn (0x%llx) "
-				"which is out of bounds (0x%llx).  $UsnJrnl "
-				"is corrupt.",
-				(long long)sle64_to_cpu(uh->lowest_valid_usn),
-				i_size_read(vol->usnjrnl_j_ino));
-		ntfs_unmap_page(page);
-		return false;
-	}
-	ntfs_unmap_page(page);
-	ntfs_debug("Done.");
-	return true;
-}
-
-/**
- * load_and_init_attrdef - load the attribute definitions table for a volume
- * @vol:	ntfs super block describing device whose attrdef to load
- *
- * Return 'true' on success or 'false' on error.
- */
-static bool load_and_init_attrdef(ntfs_volume *vol)
-{
-	loff_t i_size;
-	struct super_block *sb = vol->sb;
-	struct inode *ino;
-	struct page *page;
-	pgoff_t index, max_index;
-	unsigned int size;
-
-	ntfs_debug("Entering.");
-	/* Read attrdef table and setup vol->attrdef and vol->attrdef_size. */
-	ino = ntfs_iget(sb, FILE_AttrDef);
-	if (IS_ERR(ino) || is_bad_inode(ino)) {
-		if (!IS_ERR(ino))
-			iput(ino);
-		goto failed;
-	}
-	NInoSetSparseDisabled(NTFS_I(ino));
-	/* The size of FILE_AttrDef must be above 0 and fit inside 31 bits. */
-	i_size = i_size_read(ino);
-	if (i_size <= 0 || i_size > 0x7fffffff)
-		goto iput_failed;
-	vol->attrdef = (ATTR_DEF*)ntfs_malloc_nofs(i_size);
-	if (!vol->attrdef)
-		goto iput_failed;
-	index = 0;
-	max_index = i_size >> PAGE_SHIFT;
-	size = PAGE_SIZE;
-	while (index < max_index) {
-		/* Read the attrdef table and copy it into the linear buffer. */
-read_partial_attrdef_page:
-		page = ntfs_map_page(ino->i_mapping, index);
-		if (IS_ERR(page))
-			goto free_iput_failed;
-		memcpy((u8*)vol->attrdef + (index++ << PAGE_SHIFT),
-				page_address(page), size);
-		ntfs_unmap_page(page);
-	}
-	if (size == PAGE_SIZE) {
-		size = i_size & ~PAGE_MASK;
-		if (size)
-			goto read_partial_attrdef_page;
-	}
-	vol->attrdef_size = i_size;
-	ntfs_debug("Read %llu bytes from $AttrDef.", i_size);
-	iput(ino);
-	return true;
-free_iput_failed:
-	ntfs_free(vol->attrdef);
-	vol->attrdef = NULL;
-iput_failed:
-	iput(ino);
-failed:
-	ntfs_error(sb, "Failed to initialize attribute definition table.");
-	return false;
-}
-
-#endif /* NTFS_RW */
-
-/**
- * load_and_init_upcase - load the upcase table for an ntfs volume
- * @vol:	ntfs super block describing device whose upcase to load
- *
- * Return 'true' on success or 'false' on error.
- */
-static bool load_and_init_upcase(ntfs_volume *vol)
-{
-	loff_t i_size;
-	struct super_block *sb = vol->sb;
-	struct inode *ino;
-	struct page *page;
-	pgoff_t index, max_index;
-	unsigned int size;
-	int i, max;
-
-	ntfs_debug("Entering.");
-	/* Read upcase table and setup vol->upcase and vol->upcase_len. */
-	ino = ntfs_iget(sb, FILE_UpCase);
-	if (IS_ERR(ino) || is_bad_inode(ino)) {
-		if (!IS_ERR(ino))
-			iput(ino);
-		goto upcase_failed;
-	}
-	/*
-	 * The upcase size must not be above 64k Unicode characters, must not
-	 * be zero and must be a multiple of sizeof(ntfschar).
-	 */
-	i_size = i_size_read(ino);
-	if (!i_size || i_size & (sizeof(ntfschar) - 1) ||
-			i_size > 64ULL * 1024 * sizeof(ntfschar))
-		goto iput_upcase_failed;
-	vol->upcase = (ntfschar*)ntfs_malloc_nofs(i_size);
-	if (!vol->upcase)
-		goto iput_upcase_failed;
-	index = 0;
-	max_index = i_size >> PAGE_SHIFT;
-	size = PAGE_SIZE;
-	while (index < max_index) {
-		/* Read the upcase table and copy it into the linear buffer. */
-read_partial_upcase_page:
-		page = ntfs_map_page(ino->i_mapping, index);
-		if (IS_ERR(page))
-			goto iput_upcase_failed;
-		memcpy((char*)vol->upcase + (index++ << PAGE_SHIFT),
-				page_address(page), size);
-		ntfs_unmap_page(page);
-	}
-	if (size == PAGE_SIZE) {
-		size = i_size & ~PAGE_MASK;
-		if (size)
-			goto read_partial_upcase_page;
-	}
-	vol->upcase_len = i_size >> UCHAR_T_SIZE_BITS;
-	ntfs_debug("Read %llu bytes from $UpCase (expected %zu bytes).",
-			i_size, 64 * 1024 * sizeof(ntfschar));
-	iput(ino);
-	mutex_lock(&ntfs_lock);
-	if (!default_upcase) {
-		ntfs_debug("Using volume specified $UpCase since default is "
-				"not present.");
-		mutex_unlock(&ntfs_lock);
-		return true;
-	}
-	max = default_upcase_len;
-	if (max > vol->upcase_len)
-		max = vol->upcase_len;
-	for (i = 0; i < max; i++)
-		if (vol->upcase[i] != default_upcase[i])
-			break;
-	if (i == max) {
-		ntfs_free(vol->upcase);
-		vol->upcase = default_upcase;
-		vol->upcase_len = max;
-		ntfs_nr_upcase_users++;
-		mutex_unlock(&ntfs_lock);
-		ntfs_debug("Volume specified $UpCase matches default. Using "
-				"default.");
-		return true;
-	}
-	mutex_unlock(&ntfs_lock);
-	ntfs_debug("Using volume specified $UpCase since it does not match "
-			"the default.");
-	return true;
-iput_upcase_failed:
-	iput(ino);
-	ntfs_free(vol->upcase);
-	vol->upcase = NULL;
-upcase_failed:
-	mutex_lock(&ntfs_lock);
-	if (default_upcase) {
-		vol->upcase = default_upcase;
-		vol->upcase_len = default_upcase_len;
-		ntfs_nr_upcase_users++;
-		mutex_unlock(&ntfs_lock);
-		ntfs_error(sb, "Failed to load $UpCase from the volume. Using "
-				"default.");
-		return true;
-	}
-	mutex_unlock(&ntfs_lock);
-	ntfs_error(sb, "Failed to initialize upcase table.");
-	return false;
-}
-
-/*
- * The lcn and mft bitmap inodes are NTFS-internal inodes with
- * their own special locking rules:
- */
-static struct lock_class_key
-	lcnbmp_runlist_lock_key, lcnbmp_mrec_lock_key,
-	mftbmp_runlist_lock_key, mftbmp_mrec_lock_key;
-
-/**
- * load_system_files - open the system files using normal functions
- * @vol:	ntfs super block describing device whose system files to load
- *
- * Open the system files with normal access functions and complete setting up
- * the ntfs super block @vol.
- *
- * Return 'true' on success or 'false' on error.
- */
-static bool load_system_files(ntfs_volume *vol)
-{
-	struct super_block *sb = vol->sb;
-	MFT_RECORD *m;
-	VOLUME_INFORMATION *vi;
-	ntfs_attr_search_ctx *ctx;
-#ifdef NTFS_RW
-	RESTART_PAGE_HEADER *rp;
-	int err;
-#endif /* NTFS_RW */
-
-	ntfs_debug("Entering.");
-#ifdef NTFS_RW
-	/* Get mft mirror inode compare the contents of $MFT and $MFTMirr. */
-	if (!load_and_init_mft_mirror(vol) || !check_mft_mirror(vol)) {
-		static const char *es1 = "Failed to load $MFTMirr";
-		static const char *es2 = "$MFTMirr does not match $MFT";
-		static const char *es3 = ".  Run ntfsfix and/or chkdsk.";
-
-		/* If a read-write mount, convert it to a read-only mount. */
-		if (!sb_rdonly(sb)) {
-			if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
-					ON_ERRORS_CONTINUE))) {
-				ntfs_error(sb, "%s and neither on_errors="
-						"continue nor on_errors="
-						"remount-ro was specified%s",
-						!vol->mftmirr_ino ? es1 : es2,
-						es3);
-				goto iput_mirr_err_out;
-			}
-			sb->s_flags |= SB_RDONLY;
-			ntfs_error(sb, "%s.  Mounting read-only%s",
-					!vol->mftmirr_ino ? es1 : es2, es3);
-		} else
-			ntfs_warning(sb, "%s.  Will not be able to remount "
-					"read-write%s",
-					!vol->mftmirr_ino ? es1 : es2, es3);
-		/* This will prevent a read-write remount. */
-		NVolSetErrors(vol);
-	}
-#endif /* NTFS_RW */
-	/* Get mft bitmap attribute inode. */
-	vol->mftbmp_ino = ntfs_attr_iget(vol->mft_ino, AT_BITMAP, NULL, 0);
-	if (IS_ERR(vol->mftbmp_ino)) {
-		ntfs_error(sb, "Failed to load $MFT/$BITMAP attribute.");
-		goto iput_mirr_err_out;
-	}
-	lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->runlist.lock,
-			   &mftbmp_runlist_lock_key);
-	lockdep_set_class(&NTFS_I(vol->mftbmp_ino)->mrec_lock,
-			   &mftbmp_mrec_lock_key);
-	/* Read upcase table and setup @vol->upcase and @vol->upcase_len. */
-	if (!load_and_init_upcase(vol))
-		goto iput_mftbmp_err_out;
-#ifdef NTFS_RW
-	/*
-	 * Read attribute definitions table and setup @vol->attrdef and
-	 * @vol->attrdef_size.
-	 */
-	if (!load_and_init_attrdef(vol))
-		goto iput_upcase_err_out;
-#endif /* NTFS_RW */
-	/*
-	 * Get the cluster allocation bitmap inode and verify the size, no
-	 * need for any locking at this stage as we are already running
-	 * exclusively as we are mount in progress task.
-	 */
-	vol->lcnbmp_ino = ntfs_iget(sb, FILE_Bitmap);
-	if (IS_ERR(vol->lcnbmp_ino) || is_bad_inode(vol->lcnbmp_ino)) {
-		if (!IS_ERR(vol->lcnbmp_ino))
-			iput(vol->lcnbmp_ino);
-		goto bitmap_failed;
-	}
-	lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->runlist.lock,
-			   &lcnbmp_runlist_lock_key);
-	lockdep_set_class(&NTFS_I(vol->lcnbmp_ino)->mrec_lock,
-			   &lcnbmp_mrec_lock_key);
-
-	NInoSetSparseDisabled(NTFS_I(vol->lcnbmp_ino));
-	if ((vol->nr_clusters + 7) >> 3 > i_size_read(vol->lcnbmp_ino)) {
-		iput(vol->lcnbmp_ino);
-bitmap_failed:
-		ntfs_error(sb, "Failed to load $Bitmap.");
-		goto iput_attrdef_err_out;
-	}
-	/*
-	 * Get the volume inode and setup our cache of the volume flags and
-	 * version.
-	 */
-	vol->vol_ino = ntfs_iget(sb, FILE_Volume);
-	if (IS_ERR(vol->vol_ino) || is_bad_inode(vol->vol_ino)) {
-		if (!IS_ERR(vol->vol_ino))
-			iput(vol->vol_ino);
-volume_failed:
-		ntfs_error(sb, "Failed to load $Volume.");
-		goto iput_lcnbmp_err_out;
-	}
-	m = map_mft_record(NTFS_I(vol->vol_ino));
-	if (IS_ERR(m)) {
-iput_volume_failed:
-		iput(vol->vol_ino);
-		goto volume_failed;
-	}
-	if (!(ctx = ntfs_attr_get_search_ctx(NTFS_I(vol->vol_ino), m))) {
-		ntfs_error(sb, "Failed to get attribute search context.");
-		goto get_ctx_vol_failed;
-	}
-	if (ntfs_attr_lookup(AT_VOLUME_INFORMATION, NULL, 0, 0, 0, NULL, 0,
-			ctx) || ctx->attr->non_resident || ctx->attr->flags) {
-err_put_vol:
-		ntfs_attr_put_search_ctx(ctx);
-get_ctx_vol_failed:
-		unmap_mft_record(NTFS_I(vol->vol_ino));
-		goto iput_volume_failed;
-	}
-	vi = (VOLUME_INFORMATION*)((char*)ctx->attr +
-			le16_to_cpu(ctx->attr->data.resident.value_offset));
-	/* Some bounds checks. */
-	if ((u8*)vi < (u8*)ctx->attr || (u8*)vi +
-			le32_to_cpu(ctx->attr->data.resident.value_length) >
-			(u8*)ctx->attr + le32_to_cpu(ctx->attr->length))
-		goto err_put_vol;
-	/* Copy the volume flags and version to the ntfs_volume structure. */
-	vol->vol_flags = vi->flags;
-	vol->major_ver = vi->major_ver;
-	vol->minor_ver = vi->minor_ver;
-	ntfs_attr_put_search_ctx(ctx);
-	unmap_mft_record(NTFS_I(vol->vol_ino));
-	pr_info("volume version %i.%i.\n", vol->major_ver,
-			vol->minor_ver);
-	if (vol->major_ver < 3 && NVolSparseEnabled(vol)) {
-		ntfs_warning(vol->sb, "Disabling sparse support due to NTFS "
-				"volume version %i.%i (need at least version "
-				"3.0).", vol->major_ver, vol->minor_ver);
-		NVolClearSparseEnabled(vol);
-	}
-#ifdef NTFS_RW
-	/* Make sure that no unsupported volume flags are set. */
-	if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
-		static const char *es1a = "Volume is dirty";
-		static const char *es1b = "Volume has been modified by chkdsk";
-		static const char *es1c = "Volume has unsupported flags set";
-		static const char *es2a = ".  Run chkdsk and mount in Windows.";
-		static const char *es2b = ".  Mount in Windows.";
-		const char *es1, *es2;
-
-		es2 = es2a;
-		if (vol->vol_flags & VOLUME_IS_DIRTY)
-			es1 = es1a;
-		else if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
-			es1 = es1b;
-			es2 = es2b;
-		} else {
-			es1 = es1c;
-			ntfs_warning(sb, "Unsupported volume flags 0x%x "
-					"encountered.",
-					(unsigned)le16_to_cpu(vol->vol_flags));
-		}
-		/* If a read-write mount, convert it to a read-only mount. */
-		if (!sb_rdonly(sb)) {
-			if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
-					ON_ERRORS_CONTINUE))) {
-				ntfs_error(sb, "%s and neither on_errors="
-						"continue nor on_errors="
-						"remount-ro was specified%s",
-						es1, es2);
-				goto iput_vol_err_out;
-			}
-			sb->s_flags |= SB_RDONLY;
-			ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
-		} else
-			ntfs_warning(sb, "%s.  Will not be able to remount "
-					"read-write%s", es1, es2);
-		/*
-		 * Do not set NVolErrors() because ntfs_remount() re-checks the
-		 * flags which we need to do in case any flags have changed.
-		 */
-	}
-	/*
-	 * Get the inode for the logfile, check it and determine if the volume
-	 * was shutdown cleanly.
-	 */
-	rp = NULL;
-	if (!load_and_check_logfile(vol, &rp) ||
-			!ntfs_is_logfile_clean(vol->logfile_ino, rp)) {
-		static const char *es1a = "Failed to load $LogFile";
-		static const char *es1b = "$LogFile is not clean";
-		static const char *es2 = ".  Mount in Windows.";
-		const char *es1;
-
-		es1 = !vol->logfile_ino ? es1a : es1b;
-		/* If a read-write mount, convert it to a read-only mount. */
-		if (!sb_rdonly(sb)) {
-			if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
-					ON_ERRORS_CONTINUE))) {
-				ntfs_error(sb, "%s and neither on_errors="
-						"continue nor on_errors="
-						"remount-ro was specified%s",
-						es1, es2);
-				if (vol->logfile_ino) {
-					BUG_ON(!rp);
-					ntfs_free(rp);
-				}
-				goto iput_logfile_err_out;
-			}
-			sb->s_flags |= SB_RDONLY;
-			ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
-		} else
-			ntfs_warning(sb, "%s.  Will not be able to remount "
-					"read-write%s", es1, es2);
-		/* This will prevent a read-write remount. */
-		NVolSetErrors(vol);
-	}
-	ntfs_free(rp);
-#endif /* NTFS_RW */
-	/* Get the root directory inode so we can do path lookups. */
-	vol->root_ino = ntfs_iget(sb, FILE_root);
-	if (IS_ERR(vol->root_ino) || is_bad_inode(vol->root_ino)) {
-		if (!IS_ERR(vol->root_ino))
-			iput(vol->root_ino);
-		ntfs_error(sb, "Failed to load root directory.");
-		goto iput_logfile_err_out;
-	}
-#ifdef NTFS_RW
-	/*
-	 * Check if Windows is suspended to disk on the target volume.  If it
-	 * is hibernated, we must not write *anything* to the disk so set
-	 * NVolErrors() without setting the dirty volume flag and mount
-	 * read-only.  This will prevent read-write remounting and it will also
-	 * prevent all writes.
-	 */
-	err = check_windows_hibernation_status(vol);
-	if (unlikely(err)) {
-		static const char *es1a = "Failed to determine if Windows is "
-				"hibernated";
-		static const char *es1b = "Windows is hibernated";
-		static const char *es2 = ".  Run chkdsk.";
-		const char *es1;
-
-		es1 = err < 0 ? es1a : es1b;
-		/* If a read-write mount, convert it to a read-only mount. */
-		if (!sb_rdonly(sb)) {
-			if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
-					ON_ERRORS_CONTINUE))) {
-				ntfs_error(sb, "%s and neither on_errors="
-						"continue nor on_errors="
-						"remount-ro was specified%s",
-						es1, es2);
-				goto iput_root_err_out;
-			}
-			sb->s_flags |= SB_RDONLY;
-			ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
-		} else
-			ntfs_warning(sb, "%s.  Will not be able to remount "
-					"read-write%s", es1, es2);
-		/* This will prevent a read-write remount. */
-		NVolSetErrors(vol);
-	}
-	/* If (still) a read-write mount, mark the volume dirty. */
-	if (!sb_rdonly(sb) && ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
-		static const char *es1 = "Failed to set dirty bit in volume "
-				"information flags";
-		static const char *es2 = ".  Run chkdsk.";
-
-		/* Convert to a read-only mount. */
-		if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
-				ON_ERRORS_CONTINUE))) {
-			ntfs_error(sb, "%s and neither on_errors=continue nor "
-					"on_errors=remount-ro was specified%s",
-					es1, es2);
-			goto iput_root_err_out;
-		}
-		ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
-		sb->s_flags |= SB_RDONLY;
-		/*
-		 * Do not set NVolErrors() because ntfs_remount() might manage
-		 * to set the dirty flag in which case all would be well.
-		 */
-	}
-#if 0
-	// TODO: Enable this code once we start modifying anything that is
-	//	 different between NTFS 1.2 and 3.x...
-	/*
-	 * If (still) a read-write mount, set the NT4 compatibility flag on
-	 * newer NTFS version volumes.
-	 */
-	if (!(sb->s_flags & SB_RDONLY) && (vol->major_ver > 1) &&
-			ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) {
-		static const char *es1 = "Failed to set NT4 compatibility flag";
-		static const char *es2 = ".  Run chkdsk.";
-
-		/* Convert to a read-only mount. */
-		if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
-				ON_ERRORS_CONTINUE))) {
-			ntfs_error(sb, "%s and neither on_errors=continue nor "
-					"on_errors=remount-ro was specified%s",
-					es1, es2);
-			goto iput_root_err_out;
-		}
-		ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
-		sb->s_flags |= SB_RDONLY;
-		NVolSetErrors(vol);
-	}
-#endif
-	/* If (still) a read-write mount, empty the logfile. */
-	if (!sb_rdonly(sb) && !ntfs_empty_logfile(vol->logfile_ino)) {
-		static const char *es1 = "Failed to empty $LogFile";
-		static const char *es2 = ".  Mount in Windows.";
-
-		/* Convert to a read-only mount. */
-		if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
-				ON_ERRORS_CONTINUE))) {
-			ntfs_error(sb, "%s and neither on_errors=continue nor "
-					"on_errors=remount-ro was specified%s",
-					es1, es2);
-			goto iput_root_err_out;
-		}
-		ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
-		sb->s_flags |= SB_RDONLY;
-		NVolSetErrors(vol);
-	}
-#endif /* NTFS_RW */
-	/* If on NTFS versions before 3.0, we are done. */
-	if (unlikely(vol->major_ver < 3))
-		return true;
-	/* NTFS 3.0+ specific initialization. */
-	/* Get the security descriptors inode. */
-	vol->secure_ino = ntfs_iget(sb, FILE_Secure);
-	if (IS_ERR(vol->secure_ino) || is_bad_inode(vol->secure_ino)) {
-		if (!IS_ERR(vol->secure_ino))
-			iput(vol->secure_ino);
-		ntfs_error(sb, "Failed to load $Secure.");
-		goto iput_root_err_out;
-	}
-	// TODO: Initialize security.
-	/* Get the extended system files' directory inode. */
-	vol->extend_ino = ntfs_iget(sb, FILE_Extend);
-	if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino) ||
-	    !S_ISDIR(vol->extend_ino->i_mode)) {
-		if (!IS_ERR(vol->extend_ino))
-			iput(vol->extend_ino);
-		ntfs_error(sb, "Failed to load $Extend.");
-		goto iput_sec_err_out;
-	}
-#ifdef NTFS_RW
-	/* Find the quota file, load it if present, and set it up. */
-	if (!load_and_init_quota(vol)) {
-		static const char *es1 = "Failed to load $Quota";
-		static const char *es2 = ".  Run chkdsk.";
-
-		/* If a read-write mount, convert it to a read-only mount. */
-		if (!sb_rdonly(sb)) {
-			if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
-					ON_ERRORS_CONTINUE))) {
-				ntfs_error(sb, "%s and neither on_errors="
-						"continue nor on_errors="
-						"remount-ro was specified%s",
-						es1, es2);
-				goto iput_quota_err_out;
-			}
-			sb->s_flags |= SB_RDONLY;
-			ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
-		} else
-			ntfs_warning(sb, "%s.  Will not be able to remount "
-					"read-write%s", es1, es2);
-		/* This will prevent a read-write remount. */
-		NVolSetErrors(vol);
-	}
-	/* If (still) a read-write mount, mark the quotas out of date. */
-	if (!sb_rdonly(sb) && !ntfs_mark_quotas_out_of_date(vol)) {
-		static const char *es1 = "Failed to mark quotas out of date";
-		static const char *es2 = ".  Run chkdsk.";
-
-		/* Convert to a read-only mount. */
-		if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
-				ON_ERRORS_CONTINUE))) {
-			ntfs_error(sb, "%s and neither on_errors=continue nor "
-					"on_errors=remount-ro was specified%s",
-					es1, es2);
-			goto iput_quota_err_out;
-		}
-		ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
-		sb->s_flags |= SB_RDONLY;
-		NVolSetErrors(vol);
-	}
-	/*
-	 * Find the transaction log file ($UsnJrnl), load it if present, check
-	 * it, and set it up.
-	 */
-	if (!load_and_init_usnjrnl(vol)) {
-		static const char *es1 = "Failed to load $UsnJrnl";
-		static const char *es2 = ".  Run chkdsk.";
-
-		/* If a read-write mount, convert it to a read-only mount. */
-		if (!sb_rdonly(sb)) {
-			if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
-					ON_ERRORS_CONTINUE))) {
-				ntfs_error(sb, "%s and neither on_errors="
-						"continue nor on_errors="
-						"remount-ro was specified%s",
-						es1, es2);
-				goto iput_usnjrnl_err_out;
-			}
-			sb->s_flags |= SB_RDONLY;
-			ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
-		} else
-			ntfs_warning(sb, "%s.  Will not be able to remount "
-					"read-write%s", es1, es2);
-		/* This will prevent a read-write remount. */
-		NVolSetErrors(vol);
-	}
-	/* If (still) a read-write mount, stamp the transaction log. */
-	if (!sb_rdonly(sb) && !ntfs_stamp_usnjrnl(vol)) {
-		static const char *es1 = "Failed to stamp transaction log "
-				"($UsnJrnl)";
-		static const char *es2 = ".  Run chkdsk.";
-
-		/* Convert to a read-only mount. */
-		if (!(vol->on_errors & (ON_ERRORS_REMOUNT_RO |
-				ON_ERRORS_CONTINUE))) {
-			ntfs_error(sb, "%s and neither on_errors=continue nor "
-					"on_errors=remount-ro was specified%s",
-					es1, es2);
-			goto iput_usnjrnl_err_out;
-		}
-		ntfs_error(sb, "%s.  Mounting read-only%s", es1, es2);
-		sb->s_flags |= SB_RDONLY;
-		NVolSetErrors(vol);
-	}
-#endif /* NTFS_RW */
-	return true;
-#ifdef NTFS_RW
-iput_usnjrnl_err_out:
-	iput(vol->usnjrnl_j_ino);
-	iput(vol->usnjrnl_max_ino);
-	iput(vol->usnjrnl_ino);
-iput_quota_err_out:
-	iput(vol->quota_q_ino);
-	iput(vol->quota_ino);
-	iput(vol->extend_ino);
-#endif /* NTFS_RW */
-iput_sec_err_out:
-	iput(vol->secure_ino);
-iput_root_err_out:
-	iput(vol->root_ino);
-iput_logfile_err_out:
-#ifdef NTFS_RW
-	iput(vol->logfile_ino);
-iput_vol_err_out:
-#endif /* NTFS_RW */
-	iput(vol->vol_ino);
-iput_lcnbmp_err_out:
-	iput(vol->lcnbmp_ino);
-iput_attrdef_err_out:
-	vol->attrdef_size = 0;
-	if (vol->attrdef) {
-		ntfs_free(vol->attrdef);
-		vol->attrdef = NULL;
-	}
-#ifdef NTFS_RW
-iput_upcase_err_out:
-#endif /* NTFS_RW */
-	vol->upcase_len = 0;
-	mutex_lock(&ntfs_lock);
-	if (vol->upcase == default_upcase) {
-		ntfs_nr_upcase_users--;
-		vol->upcase = NULL;
-	}
-	mutex_unlock(&ntfs_lock);
-	if (vol->upcase) {
-		ntfs_free(vol->upcase);
-		vol->upcase = NULL;
-	}
-iput_mftbmp_err_out:
-	iput(vol->mftbmp_ino);
-iput_mirr_err_out:
-#ifdef NTFS_RW
-	iput(vol->mftmirr_ino);
-#endif /* NTFS_RW */
-	return false;
-}
-
-/**
- * ntfs_put_super - called by the vfs to unmount a volume
- * @sb:		vfs superblock of volume to unmount
- *
- * ntfs_put_super() is called by the VFS (from fs/super.c::do_umount()) when
- * the volume is being unmounted (umount system call has been invoked) and it
- * releases all inodes and memory belonging to the NTFS specific part of the
- * super block.
- */
-static void ntfs_put_super(struct super_block *sb)
-{
-	ntfs_volume *vol = NTFS_SB(sb);
-
-	ntfs_debug("Entering.");
-
-#ifdef NTFS_RW
-	/*
-	 * Commit all inodes while they are still open in case some of them
-	 * cause others to be dirtied.
-	 */
-	ntfs_commit_inode(vol->vol_ino);
-
-	/* NTFS 3.0+ specific. */
-	if (vol->major_ver >= 3) {
-		if (vol->usnjrnl_j_ino)
-			ntfs_commit_inode(vol->usnjrnl_j_ino);
-		if (vol->usnjrnl_max_ino)
-			ntfs_commit_inode(vol->usnjrnl_max_ino);
-		if (vol->usnjrnl_ino)
-			ntfs_commit_inode(vol->usnjrnl_ino);
-		if (vol->quota_q_ino)
-			ntfs_commit_inode(vol->quota_q_ino);
-		if (vol->quota_ino)
-			ntfs_commit_inode(vol->quota_ino);
-		if (vol->extend_ino)
-			ntfs_commit_inode(vol->extend_ino);
-		if (vol->secure_ino)
-			ntfs_commit_inode(vol->secure_ino);
-	}
-
-	ntfs_commit_inode(vol->root_ino);
-
-	down_write(&vol->lcnbmp_lock);
-	ntfs_commit_inode(vol->lcnbmp_ino);
-	up_write(&vol->lcnbmp_lock);
-
-	down_write(&vol->mftbmp_lock);
-	ntfs_commit_inode(vol->mftbmp_ino);
-	up_write(&vol->mftbmp_lock);
-
-	if (vol->logfile_ino)
-		ntfs_commit_inode(vol->logfile_ino);
-
-	if (vol->mftmirr_ino)
-		ntfs_commit_inode(vol->mftmirr_ino);
-	ntfs_commit_inode(vol->mft_ino);
-
-	/*
-	 * If a read-write mount and no volume errors have occurred, mark the
-	 * volume clean.  Also, re-commit all affected inodes.
-	 */
-	if (!sb_rdonly(sb)) {
-		if (!NVolErrors(vol)) {
-			if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY))
-				ntfs_warning(sb, "Failed to clear dirty bit "
-						"in volume information "
-						"flags.  Run chkdsk.");
-			ntfs_commit_inode(vol->vol_ino);
-			ntfs_commit_inode(vol->root_ino);
-			if (vol->mftmirr_ino)
-				ntfs_commit_inode(vol->mftmirr_ino);
-			ntfs_commit_inode(vol->mft_ino);
-		} else {
-			ntfs_warning(sb, "Volume has errors.  Leaving volume "
-					"marked dirty.  Run chkdsk.");
-		}
-	}
-#endif /* NTFS_RW */
-
-	iput(vol->vol_ino);
-	vol->vol_ino = NULL;
-
-	/* NTFS 3.0+ specific clean up. */
-	if (vol->major_ver >= 3) {
-#ifdef NTFS_RW
-		if (vol->usnjrnl_j_ino) {
-			iput(vol->usnjrnl_j_ino);
-			vol->usnjrnl_j_ino = NULL;
-		}
-		if (vol->usnjrnl_max_ino) {
-			iput(vol->usnjrnl_max_ino);
-			vol->usnjrnl_max_ino = NULL;
-		}
-		if (vol->usnjrnl_ino) {
-			iput(vol->usnjrnl_ino);
-			vol->usnjrnl_ino = NULL;
-		}
-		if (vol->quota_q_ino) {
-			iput(vol->quota_q_ino);
-			vol->quota_q_ino = NULL;
-		}
-		if (vol->quota_ino) {
-			iput(vol->quota_ino);
-			vol->quota_ino = NULL;
-		}
-#endif /* NTFS_RW */
-		if (vol->extend_ino) {
-			iput(vol->extend_ino);
-			vol->extend_ino = NULL;
-		}
-		if (vol->secure_ino) {
-			iput(vol->secure_ino);
-			vol->secure_ino = NULL;
-		}
-	}
-
-	iput(vol->root_ino);
-	vol->root_ino = NULL;
-
-	down_write(&vol->lcnbmp_lock);
-	iput(vol->lcnbmp_ino);
-	vol->lcnbmp_ino = NULL;
-	up_write(&vol->lcnbmp_lock);
-
-	down_write(&vol->mftbmp_lock);
-	iput(vol->mftbmp_ino);
-	vol->mftbmp_ino = NULL;
-	up_write(&vol->mftbmp_lock);
-
-#ifdef NTFS_RW
-	if (vol->logfile_ino) {
-		iput(vol->logfile_ino);
-		vol->logfile_ino = NULL;
-	}
-	if (vol->mftmirr_ino) {
-		/* Re-commit the mft mirror and mft just in case. */
-		ntfs_commit_inode(vol->mftmirr_ino);
-		ntfs_commit_inode(vol->mft_ino);
-		iput(vol->mftmirr_ino);
-		vol->mftmirr_ino = NULL;
-	}
-	/*
-	 * We should have no dirty inodes left, due to
-	 * mft.c::ntfs_mft_writepage() cleaning all the dirty pages as
-	 * the underlying mft records are written out and cleaned.
-	 */
-	ntfs_commit_inode(vol->mft_ino);
-	write_inode_now(vol->mft_ino, 1);
-#endif /* NTFS_RW */
-
-	iput(vol->mft_ino);
-	vol->mft_ino = NULL;
-
-	/* Throw away the table of attribute definitions. */
-	vol->attrdef_size = 0;
-	if (vol->attrdef) {
-		ntfs_free(vol->attrdef);
-		vol->attrdef = NULL;
-	}
-	vol->upcase_len = 0;
-	/*
-	 * Destroy the global default upcase table if necessary.  Also decrease
-	 * the number of upcase users if we are a user.
-	 */
-	mutex_lock(&ntfs_lock);
-	if (vol->upcase == default_upcase) {
-		ntfs_nr_upcase_users--;
-		vol->upcase = NULL;
-	}
-	if (!ntfs_nr_upcase_users && default_upcase) {
-		ntfs_free(default_upcase);
-		default_upcase = NULL;
-	}
-	if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users)
-		free_compression_buffers();
-	mutex_unlock(&ntfs_lock);
-	if (vol->upcase) {
-		ntfs_free(vol->upcase);
-		vol->upcase = NULL;
-	}
-
-	unload_nls(vol->nls_map);
-
-	sb->s_fs_info = NULL;
-	kfree(vol);
-}
-
-/**
- * get_nr_free_clusters - return the number of free clusters on a volume
- * @vol:	ntfs volume for which to obtain free cluster count
- *
- * Calculate the number of free clusters on the mounted NTFS volume @vol. We
- * actually calculate the number of clusters in use instead because this
- * allows us to not care about partial pages as these will be just zero filled
- * and hence not be counted as allocated clusters.
- *
- * The only particularity is that clusters beyond the end of the logical ntfs
- * volume will be marked as allocated to prevent errors which means we have to
- * discount those at the end. This is important as the cluster bitmap always
- * has a size in multiples of 8 bytes, i.e. up to 63 clusters could be outside
- * the logical volume and marked in use when they are not as they do not exist.
- *
- * If any pages cannot be read we assume all clusters in the erroring pages are
- * in use. This means we return an underestimate on errors which is better than
- * an overestimate.
- */
-static s64 get_nr_free_clusters(ntfs_volume *vol)
-{
-	s64 nr_free = vol->nr_clusters;
-	struct address_space *mapping = vol->lcnbmp_ino->i_mapping;
-	struct page *page;
-	pgoff_t index, max_index;
-
-	ntfs_debug("Entering.");
-	/* Serialize accesses to the cluster bitmap. */
-	down_read(&vol->lcnbmp_lock);
-	/*
-	 * Convert the number of bits into bytes rounded up, then convert into
-	 * multiples of PAGE_SIZE, rounding up so that if we have one
-	 * full and one partial page max_index = 2.
-	 */
-	max_index = (((vol->nr_clusters + 7) >> 3) + PAGE_SIZE - 1) >>
-			PAGE_SHIFT;
-	/* Use multiples of 4 bytes, thus max_size is PAGE_SIZE / 4. */
-	ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.",
-			max_index, PAGE_SIZE / 4);
-	for (index = 0; index < max_index; index++) {
-		unsigned long *kaddr;
-
-		/*
-		 * Read the page from page cache, getting it from backing store
-		 * if necessary, and increment the use count.
-		 */
-		page = read_mapping_page(mapping, index, NULL);
-		/* Ignore pages which errored synchronously. */
-		if (IS_ERR(page)) {
-			ntfs_debug("read_mapping_page() error. Skipping "
-					"page (index 0x%lx).", index);
-			nr_free -= PAGE_SIZE * 8;
-			continue;
-		}
-		kaddr = kmap_atomic(page);
-		/*
-		 * Subtract the number of set bits. If this
-		 * is the last page and it is partial we don't really care as
-		 * it just means we do a little extra work but it won't affect
-		 * the result as all out of range bytes are set to zero by
-		 * ntfs_readpage().
-		 */
-		nr_free -= bitmap_weight(kaddr,
-					PAGE_SIZE * BITS_PER_BYTE);
-		kunmap_atomic(kaddr);
-		put_page(page);
-	}
-	ntfs_debug("Finished reading $Bitmap, last index = 0x%lx.", index - 1);
-	/*
-	 * Fixup for eventual bits outside logical ntfs volume (see function
-	 * description above).
-	 */
-	if (vol->nr_clusters & 63)
-		nr_free += 64 - (vol->nr_clusters & 63);
-	up_read(&vol->lcnbmp_lock);
-	/* If errors occurred we may well have gone below zero, fix this. */
-	if (nr_free < 0)
-		nr_free = 0;
-	ntfs_debug("Exiting.");
-	return nr_free;
-}
-
-/**
- * __get_nr_free_mft_records - return the number of free inodes on a volume
- * @vol:	ntfs volume for which to obtain free inode count
- * @nr_free:	number of mft records in filesystem
- * @max_index:	maximum number of pages containing set bits
- *
- * Calculate the number of free mft records (inodes) on the mounted NTFS
- * volume @vol. We actually calculate the number of mft records in use instead
- * because this allows us to not care about partial pages as these will be just
- * zero filled and hence not be counted as allocated mft record.
- *
- * If any pages cannot be read we assume all mft records in the erroring pages
- * are in use. This means we return an underestimate on errors which is better
- * than an overestimate.
- *
- * NOTE: Caller must hold mftbmp_lock rw_semaphore for reading or writing.
- */
-static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
-		s64 nr_free, const pgoff_t max_index)
-{
-	struct address_space *mapping = vol->mftbmp_ino->i_mapping;
-	struct page *page;
-	pgoff_t index;
-
-	ntfs_debug("Entering.");
-	/* Use multiples of 4 bytes, thus max_size is PAGE_SIZE / 4. */
-	ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = "
-			"0x%lx.", max_index, PAGE_SIZE / 4);
-	for (index = 0; index < max_index; index++) {
-		unsigned long *kaddr;
-
-		/*
-		 * Read the page from page cache, getting it from backing store
-		 * if necessary, and increment the use count.
-		 */
-		page = read_mapping_page(mapping, index, NULL);
-		/* Ignore pages which errored synchronously. */
-		if (IS_ERR(page)) {
-			ntfs_debug("read_mapping_page() error. Skipping "
-					"page (index 0x%lx).", index);
-			nr_free -= PAGE_SIZE * 8;
-			continue;
-		}
-		kaddr = kmap_atomic(page);
-		/*
-		 * Subtract the number of set bits. If this
-		 * is the last page and it is partial we don't really care as
-		 * it just means we do a little extra work but it won't affect
-		 * the result as all out of range bytes are set to zero by
-		 * ntfs_readpage().
-		 */
-		nr_free -= bitmap_weight(kaddr,
-					PAGE_SIZE * BITS_PER_BYTE);
-		kunmap_atomic(kaddr);
-		put_page(page);
-	}
-	ntfs_debug("Finished reading $MFT/$BITMAP, last index = 0x%lx.",
-			index - 1);
-	/* If errors occurred we may well have gone below zero, fix this. */
-	if (nr_free < 0)
-		nr_free = 0;
-	ntfs_debug("Exiting.");
-	return nr_free;
-}
-
-/**
- * ntfs_statfs - return information about mounted NTFS volume
- * @dentry:	dentry from mounted volume
- * @sfs:	statfs structure in which to return the information
- *
- * Return information about the mounted NTFS volume @dentry in the statfs structure
- * pointed to by @sfs (this is initialized with zeros before ntfs_statfs is
- * called). We interpret the values to be correct of the moment in time at
- * which we are called. Most values are variable otherwise and this isn't just
- * the free values but the totals as well. For example we can increase the
- * total number of file nodes if we run out and we can keep doing this until
- * there is no more space on the volume left at all.
- *
- * Called from vfs_statfs which is used to handle the statfs, fstatfs, and
- * ustat system calls.
- *
- * Return 0 on success or -errno on error.
- */
-static int ntfs_statfs(struct dentry *dentry, struct kstatfs *sfs)
-{
-	struct super_block *sb = dentry->d_sb;
-	s64 size;
-	ntfs_volume *vol = NTFS_SB(sb);
-	ntfs_inode *mft_ni = NTFS_I(vol->mft_ino);
-	pgoff_t max_index;
-	unsigned long flags;
-
-	ntfs_debug("Entering.");
-	/* Type of filesystem. */
-	sfs->f_type   = NTFS_SB_MAGIC;
-	/* Optimal transfer block size. */
-	sfs->f_bsize  = PAGE_SIZE;
-	/*
-	 * Total data blocks in filesystem in units of f_bsize and since
-	 * inodes are also stored in data blocs ($MFT is a file) this is just
-	 * the total clusters.
-	 */
-	sfs->f_blocks = vol->nr_clusters << vol->cluster_size_bits >>
-				PAGE_SHIFT;
-	/* Free data blocks in filesystem in units of f_bsize. */
-	size	      = get_nr_free_clusters(vol) << vol->cluster_size_bits >>
-				PAGE_SHIFT;
-	if (size < 0LL)
-		size = 0LL;
-	/* Free blocks avail to non-superuser, same as above on NTFS. */
-	sfs->f_bavail = sfs->f_bfree = size;
-	/* Serialize accesses to the inode bitmap. */
-	down_read(&vol->mftbmp_lock);
-	read_lock_irqsave(&mft_ni->size_lock, flags);
-	size = i_size_read(vol->mft_ino) >> vol->mft_record_size_bits;
-	/*
-	 * Convert the maximum number of set bits into bytes rounded up, then
-	 * convert into multiples of PAGE_SIZE, rounding up so that if we
-	 * have one full and one partial page max_index = 2.
-	 */
-	max_index = ((((mft_ni->initialized_size >> vol->mft_record_size_bits)
-			+ 7) >> 3) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	read_unlock_irqrestore(&mft_ni->size_lock, flags);
-	/* Number of inodes in filesystem (at this point in time). */
-	sfs->f_files = size;
-	/* Free inodes in fs (based on current total count). */
-	sfs->f_ffree = __get_nr_free_mft_records(vol, size, max_index);
-	up_read(&vol->mftbmp_lock);
-	/*
-	 * File system id. This is extremely *nix flavour dependent and even
-	 * within Linux itself all fs do their own thing. I interpret this to
-	 * mean a unique id associated with the mounted fs and not the id
-	 * associated with the filesystem driver, the latter is already given
-	 * by the filesystem type in sfs->f_type. Thus we use the 64-bit
-	 * volume serial number splitting it into two 32-bit parts. We enter
-	 * the least significant 32-bits in f_fsid[0] and the most significant
-	 * 32-bits in f_fsid[1].
-	 */
-	sfs->f_fsid = u64_to_fsid(vol->serial_no);
-	/* Maximum length of filenames. */
-	sfs->f_namelen	   = NTFS_MAX_NAME_LEN;
-	return 0;
-}
-
-#ifdef NTFS_RW
-static int ntfs_write_inode(struct inode *vi, struct writeback_control *wbc)
-{
-	return __ntfs_write_inode(vi, wbc->sync_mode == WB_SYNC_ALL);
-}
-#endif
-
-/*
- * The complete super operations.
- */
-static const struct super_operations ntfs_sops = {
-	.alloc_inode	= ntfs_alloc_big_inode,	  /* VFS: Allocate new inode. */
-	.free_inode	= ntfs_free_big_inode, /* VFS: Deallocate inode. */
-#ifdef NTFS_RW
-	.write_inode	= ntfs_write_inode,	/* VFS: Write dirty inode to
-						   disk. */
-#endif /* NTFS_RW */
-	.put_super	= ntfs_put_super,	/* Syscall: umount. */
-	.statfs		= ntfs_statfs,		/* Syscall: statfs */
-	.remount_fs	= ntfs_remount,		/* Syscall: mount -o remount. */
-	.evict_inode	= ntfs_evict_big_inode,	/* VFS: Called when an inode is
-						   removed from memory. */
-	.show_options	= ntfs_show_options,	/* Show mount options in
-						   proc. */
-};
-
-/**
- * ntfs_fill_super - mount an ntfs filesystem
- * @sb:		super block of ntfs filesystem to mount
- * @opt:	string containing the mount options
- * @silent:	silence error output
- *
- * ntfs_fill_super() is called by the VFS to mount the device described by @sb
- * with the mount otions in @data with the NTFS filesystem.
- *
- * If @silent is true, remain silent even if errors are detected. This is used
- * during bootup, when the kernel tries to mount the root filesystem with all
- * registered filesystems one after the other until one succeeds. This implies
- * that all filesystems except the correct one will quite correctly and
- * expectedly return an error, but nobody wants to see error messages when in
- * fact this is what is supposed to happen.
- *
- * NOTE: @sb->s_flags contains the mount options flags.
- */
-static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
-{
-	ntfs_volume *vol;
-	struct buffer_head *bh;
-	struct inode *tmp_ino;
-	int blocksize, result;
-
-	/*
-	 * We do a pretty difficult piece of bootstrap by reading the
-	 * MFT (and other metadata) from disk into memory. We'll only
-	 * release this metadata during umount, so the locking patterns
-	 * observed during bootstrap do not count. So turn off the
-	 * observation of locking patterns (strictly for this context
-	 * only) while mounting NTFS. [The validator is still active
-	 * otherwise, even for this context: it will for example record
-	 * lock class registrations.]
-	 */
-	lockdep_off();
-	ntfs_debug("Entering.");
-#ifndef NTFS_RW
-	sb->s_flags |= SB_RDONLY;
-#endif /* ! NTFS_RW */
-	/* Allocate a new ntfs_volume and place it in sb->s_fs_info. */
-	sb->s_fs_info = kmalloc(sizeof(ntfs_volume), GFP_NOFS);
-	vol = NTFS_SB(sb);
-	if (!vol) {
-		if (!silent)
-			ntfs_error(sb, "Allocation of NTFS volume structure "
-					"failed. Aborting mount...");
-		lockdep_on();
-		return -ENOMEM;
-	}
-	/* Initialize ntfs_volume structure. */
-	*vol = (ntfs_volume) {
-		.sb = sb,
-		/*
-		 * Default is group and other don't have any access to files or
-		 * directories while owner has full access. Further, files by
-		 * default are not executable but directories are of course
-		 * browseable.
-		 */
-		.fmask = 0177,
-		.dmask = 0077,
-	};
-	init_rwsem(&vol->mftbmp_lock);
-	init_rwsem(&vol->lcnbmp_lock);
-
-	/* By default, enable sparse support. */
-	NVolSetSparseEnabled(vol);
-
-	/* Important to get the mount options dealt with now. */
-	if (!parse_options(vol, (char*)opt))
-		goto err_out_now;
-
-	/* We support sector sizes up to the PAGE_SIZE. */
-	if (bdev_logical_block_size(sb->s_bdev) > PAGE_SIZE) {
-		if (!silent)
-			ntfs_error(sb, "Device has unsupported sector size "
-					"(%i).  The maximum supported sector "
-					"size on this architecture is %lu "
-					"bytes.",
-					bdev_logical_block_size(sb->s_bdev),
-					PAGE_SIZE);
-		goto err_out_now;
-	}
-	/*
-	 * Setup the device access block size to NTFS_BLOCK_SIZE or the hard
-	 * sector size, whichever is bigger.
-	 */
-	blocksize = sb_min_blocksize(sb, NTFS_BLOCK_SIZE);
-	if (blocksize < NTFS_BLOCK_SIZE) {
-		if (!silent)
-			ntfs_error(sb, "Unable to set device block size.");
-		goto err_out_now;
-	}
-	BUG_ON(blocksize != sb->s_blocksize);
-	ntfs_debug("Set device block size to %i bytes (block size bits %i).",
-			blocksize, sb->s_blocksize_bits);
-	/* Determine the size of the device in units of block_size bytes. */
-	vol->nr_blocks = sb_bdev_nr_blocks(sb);
-	if (!vol->nr_blocks) {
-		if (!silent)
-			ntfs_error(sb, "Unable to determine device size.");
-		goto err_out_now;
-	}
-	/* Read the boot sector and return unlocked buffer head to it. */
-	if (!(bh = read_ntfs_boot_sector(sb, silent))) {
-		if (!silent)
-			ntfs_error(sb, "Not an NTFS volume.");
-		goto err_out_now;
-	}
-	/*
-	 * Extract the data from the boot sector and setup the ntfs volume
-	 * using it.
-	 */
-	result = parse_ntfs_boot_sector(vol, (NTFS_BOOT_SECTOR*)bh->b_data);
-	brelse(bh);
-	if (!result) {
-		if (!silent)
-			ntfs_error(sb, "Unsupported NTFS filesystem.");
-		goto err_out_now;
-	}
-	/*
-	 * If the boot sector indicates a sector size bigger than the current
-	 * device block size, switch the device block size to the sector size.
-	 * TODO: It may be possible to support this case even when the set
-	 * below fails, we would just be breaking up the i/o for each sector
-	 * into multiple blocks for i/o purposes but otherwise it should just
-	 * work.  However it is safer to leave disabled until someone hits this
-	 * error message and then we can get them to try it without the setting
-	 * so we know for sure that it works.
-	 */
-	if (vol->sector_size > blocksize) {
-		blocksize = sb_set_blocksize(sb, vol->sector_size);
-		if (blocksize != vol->sector_size) {
-			if (!silent)
-				ntfs_error(sb, "Unable to set device block "
-						"size to sector size (%i).",
-						vol->sector_size);
-			goto err_out_now;
-		}
-		BUG_ON(blocksize != sb->s_blocksize);
-		vol->nr_blocks = sb_bdev_nr_blocks(sb);
-		ntfs_debug("Changed device block size to %i bytes (block size "
-				"bits %i) to match volume sector size.",
-				blocksize, sb->s_blocksize_bits);
-	}
-	/* Initialize the cluster and mft allocators. */
-	ntfs_setup_allocators(vol);
-	/* Setup remaining fields in the super block. */
-	sb->s_magic = NTFS_SB_MAGIC;
-	/*
-	 * Ntfs allows 63 bits for the file size, i.e. correct would be:
-	 *	sb->s_maxbytes = ~0ULL >> 1;
-	 * But the kernel uses a long as the page cache page index which on
-	 * 32-bit architectures is only 32-bits. MAX_LFS_FILESIZE is kernel
-	 * defined to the maximum the page cache page index can cope with
-	 * without overflowing the index or to 2^63 - 1, whichever is smaller.
-	 */
-	sb->s_maxbytes = MAX_LFS_FILESIZE;
-	/* Ntfs measures time in 100ns intervals. */
-	sb->s_time_gran = 100;
-	/*
-	 * Now load the metadata required for the page cache and our address
-	 * space operations to function. We do this by setting up a specialised
-	 * read_inode method and then just calling the normal iget() to obtain
-	 * the inode for $MFT which is sufficient to allow our normal inode
-	 * operations and associated address space operations to function.
-	 */
-	sb->s_op = &ntfs_sops;
-	tmp_ino = new_inode(sb);
-	if (!tmp_ino) {
-		if (!silent)
-			ntfs_error(sb, "Failed to load essential metadata.");
-		goto err_out_now;
-	}
-	tmp_ino->i_ino = FILE_MFT;
-	insert_inode_hash(tmp_ino);
-	if (ntfs_read_inode_mount(tmp_ino) < 0) {
-		if (!silent)
-			ntfs_error(sb, "Failed to load essential metadata.");
-		goto iput_tmp_ino_err_out_now;
-	}
-	mutex_lock(&ntfs_lock);
-	/*
-	 * The current mount is a compression user if the cluster size is
-	 * less than or equal 4kiB.
-	 */
-	if (vol->cluster_size <= 4096 && !ntfs_nr_compression_users++) {
-		result = allocate_compression_buffers();
-		if (result) {
-			ntfs_error(NULL, "Failed to allocate buffers "
-					"for compression engine.");
-			ntfs_nr_compression_users--;
-			mutex_unlock(&ntfs_lock);
-			goto iput_tmp_ino_err_out_now;
-		}
-	}
-	/*
-	 * Generate the global default upcase table if necessary.  Also
-	 * temporarily increment the number of upcase users to avoid race
-	 * conditions with concurrent (u)mounts.
-	 */
-	if (!default_upcase)
-		default_upcase = generate_default_upcase();
-	ntfs_nr_upcase_users++;
-	mutex_unlock(&ntfs_lock);
-	/*
-	 * From now on, ignore @silent parameter. If we fail below this line,
-	 * it will be due to a corrupt fs or a system error, so we report it.
-	 */
-	/*
-	 * Open the system files with normal access functions and complete
-	 * setting up the ntfs super block.
-	 */
-	if (!load_system_files(vol)) {
-		ntfs_error(sb, "Failed to load system files.");
-		goto unl_upcase_iput_tmp_ino_err_out_now;
-	}
-
-	/* We grab a reference, simulating an ntfs_iget(). */
-	ihold(vol->root_ino);
-	if ((sb->s_root = d_make_root(vol->root_ino))) {
-		ntfs_debug("Exiting, status successful.");
-		/* Release the default upcase if it has no users. */
-		mutex_lock(&ntfs_lock);
-		if (!--ntfs_nr_upcase_users && default_upcase) {
-			ntfs_free(default_upcase);
-			default_upcase = NULL;
-		}
-		mutex_unlock(&ntfs_lock);
-		sb->s_export_op = &ntfs_export_ops;
-		lockdep_on();
-		return 0;
-	}
-	ntfs_error(sb, "Failed to allocate root directory.");
-	/* Clean up after the successful load_system_files() call from above. */
-	// TODO: Use ntfs_put_super() instead of repeating all this code...
-	// FIXME: Should mark the volume clean as the error is most likely
-	// 	  -ENOMEM.
-	iput(vol->vol_ino);
-	vol->vol_ino = NULL;
-	/* NTFS 3.0+ specific clean up. */
-	if (vol->major_ver >= 3) {
-#ifdef NTFS_RW
-		if (vol->usnjrnl_j_ino) {
-			iput(vol->usnjrnl_j_ino);
-			vol->usnjrnl_j_ino = NULL;
-		}
-		if (vol->usnjrnl_max_ino) {
-			iput(vol->usnjrnl_max_ino);
-			vol->usnjrnl_max_ino = NULL;
-		}
-		if (vol->usnjrnl_ino) {
-			iput(vol->usnjrnl_ino);
-			vol->usnjrnl_ino = NULL;
-		}
-		if (vol->quota_q_ino) {
-			iput(vol->quota_q_ino);
-			vol->quota_q_ino = NULL;
-		}
-		if (vol->quota_ino) {
-			iput(vol->quota_ino);
-			vol->quota_ino = NULL;
-		}
-#endif /* NTFS_RW */
-		if (vol->extend_ino) {
-			iput(vol->extend_ino);
-			vol->extend_ino = NULL;
-		}
-		if (vol->secure_ino) {
-			iput(vol->secure_ino);
-			vol->secure_ino = NULL;
-		}
-	}
-	iput(vol->root_ino);
-	vol->root_ino = NULL;
-	iput(vol->lcnbmp_ino);
-	vol->lcnbmp_ino = NULL;
-	iput(vol->mftbmp_ino);
-	vol->mftbmp_ino = NULL;
-#ifdef NTFS_RW
-	if (vol->logfile_ino) {
-		iput(vol->logfile_ino);
-		vol->logfile_ino = NULL;
-	}
-	if (vol->mftmirr_ino) {
-		iput(vol->mftmirr_ino);
-		vol->mftmirr_ino = NULL;
-	}
-#endif /* NTFS_RW */
-	/* Throw away the table of attribute definitions. */
-	vol->attrdef_size = 0;
-	if (vol->attrdef) {
-		ntfs_free(vol->attrdef);
-		vol->attrdef = NULL;
-	}
-	vol->upcase_len = 0;
-	mutex_lock(&ntfs_lock);
-	if (vol->upcase == default_upcase) {
-		ntfs_nr_upcase_users--;
-		vol->upcase = NULL;
-	}
-	mutex_unlock(&ntfs_lock);
-	if (vol->upcase) {
-		ntfs_free(vol->upcase);
-		vol->upcase = NULL;
-	}
-	if (vol->nls_map) {
-		unload_nls(vol->nls_map);
-		vol->nls_map = NULL;
-	}
-	/* Error exit code path. */
-unl_upcase_iput_tmp_ino_err_out_now:
-	/*
-	 * Decrease the number of upcase users and destroy the global default
-	 * upcase table if necessary.
-	 */
-	mutex_lock(&ntfs_lock);
-	if (!--ntfs_nr_upcase_users && default_upcase) {
-		ntfs_free(default_upcase);
-		default_upcase = NULL;
-	}
-	if (vol->cluster_size <= 4096 && !--ntfs_nr_compression_users)
-		free_compression_buffers();
-	mutex_unlock(&ntfs_lock);
-iput_tmp_ino_err_out_now:
-	iput(tmp_ino);
-	if (vol->mft_ino && vol->mft_ino != tmp_ino)
-		iput(vol->mft_ino);
-	vol->mft_ino = NULL;
-	/* Errors at this stage are irrelevant. */
-err_out_now:
-	sb->s_fs_info = NULL;
-	kfree(vol);
-	ntfs_debug("Failed, returning -EINVAL.");
-	lockdep_on();
-	return -EINVAL;
-}
-
-/*
- * This is a slab cache to optimize allocations and deallocations of Unicode
- * strings of the maximum length allowed by NTFS, which is NTFS_MAX_NAME_LEN
- * (255) Unicode characters + a terminating NULL Unicode character.
- */
-struct kmem_cache *ntfs_name_cache;
-
-/* Slab caches for efficient allocation/deallocation of inodes. */
-struct kmem_cache *ntfs_inode_cache;
-struct kmem_cache *ntfs_big_inode_cache;
-
-/* Init once constructor for the inode slab cache. */
-static void ntfs_big_inode_init_once(void *foo)
-{
-	ntfs_inode *ni = (ntfs_inode *)foo;
-
-	inode_init_once(VFS_I(ni));
-}
-
-/*
- * Slab caches to optimize allocations and deallocations of attribute search
- * contexts and index contexts, respectively.
- */
-struct kmem_cache *ntfs_attr_ctx_cache;
-struct kmem_cache *ntfs_index_ctx_cache;
-
-/* Driver wide mutex. */
-DEFINE_MUTEX(ntfs_lock);
-
-static struct dentry *ntfs_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
-{
-	return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super);
-}
-
-static struct file_system_type ntfs_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "ntfs",
-	.mount		= ntfs_mount,
-	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV,
-};
-MODULE_ALIAS_FS("ntfs");
-
-/* Stable names for the slab caches. */
-static const char ntfs_index_ctx_cache_name[] = "ntfs_index_ctx_cache";
-static const char ntfs_attr_ctx_cache_name[] = "ntfs_attr_ctx_cache";
-static const char ntfs_name_cache_name[] = "ntfs_name_cache";
-static const char ntfs_inode_cache_name[] = "ntfs_inode_cache";
-static const char ntfs_big_inode_cache_name[] = "ntfs_big_inode_cache";
-
-static int __init init_ntfs_fs(void)
-{
-	int err = 0;
-
-	/* This may be ugly but it results in pretty output so who cares. (-8 */
-	pr_info("driver " NTFS_VERSION " [Flags: R/"
-#ifdef NTFS_RW
-			"W"
-#else
-			"O"
-#endif
-#ifdef DEBUG
-			" DEBUG"
-#endif
-#ifdef MODULE
-			" MODULE"
-#endif
-			"].\n");
-
-	ntfs_debug("Debug messages are enabled.");
-
-	ntfs_index_ctx_cache = kmem_cache_create(ntfs_index_ctx_cache_name,
-			sizeof(ntfs_index_context), 0 /* offset */,
-			SLAB_HWCACHE_ALIGN, NULL /* ctor */);
-	if (!ntfs_index_ctx_cache) {
-		pr_crit("Failed to create %s!\n", ntfs_index_ctx_cache_name);
-		goto ictx_err_out;
-	}
-	ntfs_attr_ctx_cache = kmem_cache_create(ntfs_attr_ctx_cache_name,
-			sizeof(ntfs_attr_search_ctx), 0 /* offset */,
-			SLAB_HWCACHE_ALIGN, NULL /* ctor */);
-	if (!ntfs_attr_ctx_cache) {
-		pr_crit("NTFS: Failed to create %s!\n",
-			ntfs_attr_ctx_cache_name);
-		goto actx_err_out;
-	}
-
-	ntfs_name_cache = kmem_cache_create(ntfs_name_cache_name,
-			(NTFS_MAX_NAME_LEN+1) * sizeof(ntfschar), 0,
-			SLAB_HWCACHE_ALIGN, NULL);
-	if (!ntfs_name_cache) {
-		pr_crit("Failed to create %s!\n", ntfs_name_cache_name);
-		goto name_err_out;
-	}
-
-	ntfs_inode_cache = kmem_cache_create(ntfs_inode_cache_name,
-			sizeof(ntfs_inode), 0,
-			SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
-	if (!ntfs_inode_cache) {
-		pr_crit("Failed to create %s!\n", ntfs_inode_cache_name);
-		goto inode_err_out;
-	}
-
-	ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name,
-			sizeof(big_ntfs_inode), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
-			SLAB_ACCOUNT, ntfs_big_inode_init_once);
-	if (!ntfs_big_inode_cache) {
-		pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name);
-		goto big_inode_err_out;
-	}
-
-	/* Register the ntfs sysctls. */
-	err = ntfs_sysctl(1);
-	if (err) {
-		pr_crit("Failed to register NTFS sysctls!\n");
-		goto sysctl_err_out;
-	}
-
-	err = register_filesystem(&ntfs_fs_type);
-	if (!err) {
-		ntfs_debug("NTFS driver registered successfully.");
-		return 0; /* Success! */
-	}
-	pr_crit("Failed to register NTFS filesystem driver!\n");
-
-	/* Unregister the ntfs sysctls. */
-	ntfs_sysctl(0);
-sysctl_err_out:
-	kmem_cache_destroy(ntfs_big_inode_cache);
-big_inode_err_out:
-	kmem_cache_destroy(ntfs_inode_cache);
-inode_err_out:
-	kmem_cache_destroy(ntfs_name_cache);
-name_err_out:
-	kmem_cache_destroy(ntfs_attr_ctx_cache);
-actx_err_out:
-	kmem_cache_destroy(ntfs_index_ctx_cache);
-ictx_err_out:
-	if (!err) {
-		pr_crit("Aborting NTFS filesystem driver registration...\n");
-		err = -ENOMEM;
-	}
-	return err;
-}
-
-static void __exit exit_ntfs_fs(void)
-{
-	ntfs_debug("Unregistering NTFS driver.");
-
-	unregister_filesystem(&ntfs_fs_type);
-
-	/*
-	 * Make sure all delayed rcu free inodes are flushed before we
-	 * destroy cache.
-	 */
-	rcu_barrier();
-	kmem_cache_destroy(ntfs_big_inode_cache);
-	kmem_cache_destroy(ntfs_inode_cache);
-	kmem_cache_destroy(ntfs_name_cache);
-	kmem_cache_destroy(ntfs_attr_ctx_cache);
-	kmem_cache_destroy(ntfs_index_ctx_cache);
-	/* Unregister the ntfs sysctls. */
-	ntfs_sysctl(0);
-}
-
-MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>");
-MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc.");
-MODULE_VERSION(NTFS_VERSION);
-MODULE_LICENSE("GPL");
-#ifdef DEBUG
-module_param(debug_msgs, bint, 0);
-MODULE_PARM_DESC(debug_msgs, "Enable debug messages.");
-#endif
-
-module_init(init_ntfs_fs)
-module_exit(exit_ntfs_fs)
diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c
deleted file mode 100644
index 4e980170d86a..000000000000
--- a/fs/ntfs/sysctl.c
+++ /dev/null
@@ -1,58 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * sysctl.c - Code for sysctl handling in NTFS Linux kernel driver. Part of
- *	      the Linux-NTFS project. Adapted from the old NTFS driver,
- *	      Copyright (C) 1997 Martin von Löwis, Régis Duchesne
- *
- * Copyright (c) 2002-2005 Anton Altaparmakov
- */
-
-#ifdef DEBUG
-
-#include <linux/module.h>
-
-#ifdef CONFIG_SYSCTL
-
-#include <linux/proc_fs.h>
-#include <linux/sysctl.h>
-
-#include "sysctl.h"
-#include "debug.h"
-
-/* Definition of the ntfs sysctl. */
-static struct ctl_table ntfs_sysctls[] = {
-	{
-		.procname	= "ntfs-debug",
-		.data		= &debug_msgs,		/* Data pointer and size. */
-		.maxlen		= sizeof(debug_msgs),
-		.mode		= 0644,			/* Mode, proc handler. */
-		.proc_handler	= proc_dointvec
-	},
-};
-
-/* Storage for the sysctls header. */
-static struct ctl_table_header *sysctls_root_table;
-
-/**
- * ntfs_sysctl - add or remove the debug sysctl
- * @add:	add (1) or remove (0) the sysctl
- *
- * Add or remove the debug sysctl. Return 0 on success or -errno on error.
- */
-int ntfs_sysctl(int add)
-{
-	if (add) {
-		BUG_ON(sysctls_root_table);
-		sysctls_root_table = register_sysctl("fs", ntfs_sysctls);
-		if (!sysctls_root_table)
-			return -ENOMEM;
-	} else {
-		BUG_ON(!sysctls_root_table);
-		unregister_sysctl_table(sysctls_root_table);
-		sysctls_root_table = NULL;
-	}
-	return 0;
-}
-
-#endif /* CONFIG_SYSCTL */
-#endif /* DEBUG */
diff --git a/fs/ntfs/sysctl.h b/fs/ntfs/sysctl.h
deleted file mode 100644
index 96bb2299d2d5..000000000000
--- a/fs/ntfs/sysctl.h
+++ /dev/null
@@ -1,27 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * sysctl.h - Defines for sysctl handling in NTFS Linux kernel driver. Part of
- *	      the Linux-NTFS project. Adapted from the old NTFS driver,
- *	      Copyright (C) 1997 Martin von Löwis, Régis Duchesne
- *
- * Copyright (c) 2002-2004 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_SYSCTL_H
-#define _LINUX_NTFS_SYSCTL_H
-
-
-#if defined(DEBUG) && defined(CONFIG_SYSCTL)
-
-extern int ntfs_sysctl(int add);
-
-#else
-
-/* Just return success. */
-static inline int ntfs_sysctl(int add)
-{
-	return 0;
-}
-
-#endif /* DEBUG && CONFIG_SYSCTL */
-#endif /* _LINUX_NTFS_SYSCTL_H */
diff --git a/fs/ntfs/time.h b/fs/ntfs/time.h
deleted file mode 100644
index 6b63261300cc..000000000000
--- a/fs/ntfs/time.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * time.h - NTFS time conversion functions.  Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2005 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_TIME_H
-#define _LINUX_NTFS_TIME_H
-
-#include <linux/time.h>		/* For current_kernel_time(). */
-#include <asm/div64.h>		/* For do_div(). */
-
-#include "endian.h"
-
-#define NTFS_TIME_OFFSET ((s64)(369 * 365 + 89) * 24 * 3600 * 10000000)
-
-/**
- * utc2ntfs - convert Linux UTC time to NTFS time
- * @ts:		Linux UTC time to convert to NTFS time
- *
- * Convert the Linux UTC time @ts to its corresponding NTFS time and return
- * that in little endian format.
- *
- * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec
- * and a long tv_nsec where tv_sec is the number of 1-second intervals since
- * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second
- * intervals since the value of tv_sec.
- *
- * NTFS uses Microsoft's standard time format which is stored in a s64 and is
- * measured as the number of 100-nano-second intervals since 1st January 1601,
- * 00:00:00 UTC.
- */
-static inline sle64 utc2ntfs(const struct timespec64 ts)
-{
-	/*
-	 * Convert the seconds to 100ns intervals, add the nano-seconds
-	 * converted to 100ns intervals, and then add the NTFS time offset.
-	 */
-	return cpu_to_sle64((s64)ts.tv_sec * 10000000 + ts.tv_nsec / 100 +
-			NTFS_TIME_OFFSET);
-}
-
-/**
- * get_current_ntfs_time - get the current time in little endian NTFS format
- *
- * Get the current time from the Linux kernel, convert it to its corresponding
- * NTFS time and return that in little endian format.
- */
-static inline sle64 get_current_ntfs_time(void)
-{
-	struct timespec64 ts;
-
-	ktime_get_coarse_real_ts64(&ts);
-	return utc2ntfs(ts);
-}
-
-/**
- * ntfs2utc - convert NTFS time to Linux time
- * @time:	NTFS time (little endian) to convert to Linux UTC
- *
- * Convert the little endian NTFS time @time to its corresponding Linux UTC
- * time and return that in cpu format.
- *
- * Linux stores time in a struct timespec64 consisting of a time64_t tv_sec
- * and a long tv_nsec where tv_sec is the number of 1-second intervals since
- * 1st January 1970, 00:00:00 UTC and tv_nsec is the number of 1-nano-second
- * intervals since the value of tv_sec.
- *
- * NTFS uses Microsoft's standard time format which is stored in a s64 and is
- * measured as the number of 100 nano-second intervals since 1st January 1601,
- * 00:00:00 UTC.
- */
-static inline struct timespec64 ntfs2utc(const sle64 time)
-{
-	struct timespec64 ts;
-
-	/* Subtract the NTFS time offset. */
-	u64 t = (u64)(sle64_to_cpu(time) - NTFS_TIME_OFFSET);
-	/*
-	 * Convert the time to 1-second intervals and the remainder to
-	 * 1-nano-second intervals.
-	 */
-	ts.tv_nsec = do_div(t, 10000000) * 100;
-	ts.tv_sec = t;
-	return ts;
-}
-
-#endif /* _LINUX_NTFS_TIME_H */
diff --git a/fs/ntfs/types.h b/fs/ntfs/types.h
deleted file mode 100644
index 9a47859e7a06..000000000000
--- a/fs/ntfs/types.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * types.h - Defines for NTFS Linux kernel driver specific types.
- *	     Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2005 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_TYPES_H
-#define _LINUX_NTFS_TYPES_H
-
-#include <linux/types.h>
-
-typedef __le16 le16;
-typedef __le32 le32;
-typedef __le64 le64;
-typedef __u16 __bitwise sle16;
-typedef __u32 __bitwise sle32;
-typedef __u64 __bitwise sle64;
-
-/* 2-byte Unicode character type. */
-typedef le16 ntfschar;
-#define UCHAR_T_SIZE_BITS 1
-
-/*
- * Clusters are signed 64-bit values on NTFS volumes. We define two types, LCN
- * and VCN, to allow for type checking and better code readability.
- */
-typedef s64 VCN;
-typedef sle64 leVCN;
-typedef s64 LCN;
-typedef sle64 leLCN;
-
-/*
- * The NTFS journal $LogFile uses log sequence numbers which are signed 64-bit
- * values.  We define our own type LSN, to allow for type checking and better
- * code readability.
- */
-typedef s64 LSN;
-typedef sle64 leLSN;
-
-/*
- * The NTFS transaction log $UsnJrnl uses usn which are signed 64-bit values.
- * We define our own type USN, to allow for type checking and better code
- * readability.
- */
-typedef s64 USN;
-typedef sle64 leUSN;
-
-typedef enum {
-	CASE_SENSITIVE = 0,
-	IGNORE_CASE = 1,
-} IGNORE_CASE_BOOL;
-
-#endif /* _LINUX_NTFS_TYPES_H */
diff --git a/fs/ntfs/unistr.c b/fs/ntfs/unistr.c
deleted file mode 100644
index a6b6c64f14a9..000000000000
--- a/fs/ntfs/unistr.c
+++ /dev/null
@@ -1,384 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * unistr.c - NTFS Unicode string handling. Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2006 Anton Altaparmakov
- */
-
-#include <linux/slab.h>
-
-#include "types.h"
-#include "debug.h"
-#include "ntfs.h"
-
-/*
- * IMPORTANT
- * =========
- *
- * All these routines assume that the Unicode characters are in little endian
- * encoding inside the strings!!!
- */
-
-/*
- * This is used by the name collation functions to quickly determine what
- * characters are (in)valid.
- */
-static const u8 legal_ansi_char_array[0x40] = {
-	0x00, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
-	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
-
-	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
-	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
-
-	0x17, 0x07, 0x18, 0x17, 0x17, 0x17, 0x17, 0x17,
-	0x17, 0x17, 0x18, 0x16, 0x16, 0x17, 0x07, 0x00,
-
-	0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17, 0x17,
-	0x17, 0x17, 0x04, 0x16, 0x18, 0x16, 0x18, 0x18,
-};
-
-/**
- * ntfs_are_names_equal - compare two Unicode names for equality
- * @s1:			name to compare to @s2
- * @s1_len:		length in Unicode characters of @s1
- * @s2:			name to compare to @s1
- * @s2_len:		length in Unicode characters of @s2
- * @ic:			ignore case bool
- * @upcase:		upcase table (only if @ic == IGNORE_CASE)
- * @upcase_size:	length in Unicode characters of @upcase (if present)
- *
- * Compare the names @s1 and @s2 and return 'true' (1) if the names are
- * identical, or 'false' (0) if they are not identical. If @ic is IGNORE_CASE,
- * the @upcase table is used to performa a case insensitive comparison.
- */
-bool ntfs_are_names_equal(const ntfschar *s1, size_t s1_len,
-		const ntfschar *s2, size_t s2_len, const IGNORE_CASE_BOOL ic,
-		const ntfschar *upcase, const u32 upcase_size)
-{
-	if (s1_len != s2_len)
-		return false;
-	if (ic == CASE_SENSITIVE)
-		return !ntfs_ucsncmp(s1, s2, s1_len);
-	return !ntfs_ucsncasecmp(s1, s2, s1_len, upcase, upcase_size);
-}
-
-/**
- * ntfs_collate_names - collate two Unicode names
- * @name1:	first Unicode name to compare
- * @name2:	second Unicode name to compare
- * @err_val:	if @name1 contains an invalid character return this value
- * @ic:		either CASE_SENSITIVE or IGNORE_CASE
- * @upcase:	upcase table (ignored if @ic is CASE_SENSITIVE)
- * @upcase_len:	upcase table size (ignored if @ic is CASE_SENSITIVE)
- *
- * ntfs_collate_names collates two Unicode names and returns:
- *
- *  -1 if the first name collates before the second one,
- *   0 if the names match,
- *   1 if the second name collates before the first one, or
- * @err_val if an invalid character is found in @name1 during the comparison.
- *
- * The following characters are considered invalid: '"', '*', '<', '>' and '?'.
- */
-int ntfs_collate_names(const ntfschar *name1, const u32 name1_len,
-		const ntfschar *name2, const u32 name2_len,
-		const int err_val, const IGNORE_CASE_BOOL ic,
-		const ntfschar *upcase, const u32 upcase_len)
-{
-	u32 cnt, min_len;
-	u16 c1, c2;
-
-	min_len = name1_len;
-	if (name1_len > name2_len)
-		min_len = name2_len;
-	for (cnt = 0; cnt < min_len; ++cnt) {
-		c1 = le16_to_cpu(*name1++);
-		c2 = le16_to_cpu(*name2++);
-		if (ic) {
-			if (c1 < upcase_len)
-				c1 = le16_to_cpu(upcase[c1]);
-			if (c2 < upcase_len)
-				c2 = le16_to_cpu(upcase[c2]);
-		}
-		if (c1 < 64 && legal_ansi_char_array[c1] & 8)
-			return err_val;
-		if (c1 < c2)
-			return -1;
-		if (c1 > c2)
-			return 1;
-	}
-	if (name1_len < name2_len)
-		return -1;
-	if (name1_len == name2_len)
-		return 0;
-	/* name1_len > name2_len */
-	c1 = le16_to_cpu(*name1);
-	if (c1 < 64 && legal_ansi_char_array[c1] & 8)
-		return err_val;
-	return 1;
-}
-
-/**
- * ntfs_ucsncmp - compare two little endian Unicode strings
- * @s1:		first string
- * @s2:		second string
- * @n:		maximum unicode characters to compare
- *
- * Compare the first @n characters of the Unicode strings @s1 and @s2,
- * The strings in little endian format and appropriate le16_to_cpu()
- * conversion is performed on non-little endian machines.
- *
- * The function returns an integer less than, equal to, or greater than zero
- * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
- * to be less than, to match, or be greater than @s2.
- */
-int ntfs_ucsncmp(const ntfschar *s1, const ntfschar *s2, size_t n)
-{
-	u16 c1, c2;
-	size_t i;
-
-	for (i = 0; i < n; ++i) {
-		c1 = le16_to_cpu(s1[i]);
-		c2 = le16_to_cpu(s2[i]);
-		if (c1 < c2)
-			return -1;
-		if (c1 > c2)
-			return 1;
-		if (!c1)
-			break;
-	}
-	return 0;
-}
-
-/**
- * ntfs_ucsncasecmp - compare two little endian Unicode strings, ignoring case
- * @s1:			first string
- * @s2:			second string
- * @n:			maximum unicode characters to compare
- * @upcase:		upcase table
- * @upcase_size:	upcase table size in Unicode characters
- *
- * Compare the first @n characters of the Unicode strings @s1 and @s2,
- * ignoring case. The strings in little endian format and appropriate
- * le16_to_cpu() conversion is performed on non-little endian machines.
- *
- * Each character is uppercased using the @upcase table before the comparison.
- *
- * The function returns an integer less than, equal to, or greater than zero
- * if @s1 (or the first @n Unicode characters thereof) is found, respectively,
- * to be less than, to match, or be greater than @s2.
- */
-int ntfs_ucsncasecmp(const ntfschar *s1, const ntfschar *s2, size_t n,
-		const ntfschar *upcase, const u32 upcase_size)
-{
-	size_t i;
-	u16 c1, c2;
-
-	for (i = 0; i < n; ++i) {
-		if ((c1 = le16_to_cpu(s1[i])) < upcase_size)
-			c1 = le16_to_cpu(upcase[c1]);
-		if ((c2 = le16_to_cpu(s2[i])) < upcase_size)
-			c2 = le16_to_cpu(upcase[c2]);
-		if (c1 < c2)
-			return -1;
-		if (c1 > c2)
-			return 1;
-		if (!c1)
-			break;
-	}
-	return 0;
-}
-
-void ntfs_upcase_name(ntfschar *name, u32 name_len, const ntfschar *upcase,
-		const u32 upcase_len)
-{
-	u32 i;
-	u16 u;
-
-	for (i = 0; i < name_len; i++)
-		if ((u = le16_to_cpu(name[i])) < upcase_len)
-			name[i] = upcase[u];
-}
-
-void ntfs_file_upcase_value(FILE_NAME_ATTR *file_name_attr,
-		const ntfschar *upcase, const u32 upcase_len)
-{
-	ntfs_upcase_name((ntfschar*)&file_name_attr->file_name,
-			file_name_attr->file_name_length, upcase, upcase_len);
-}
-
-int ntfs_file_compare_values(FILE_NAME_ATTR *file_name_attr1,
-		FILE_NAME_ATTR *file_name_attr2,
-		const int err_val, const IGNORE_CASE_BOOL ic,
-		const ntfschar *upcase, const u32 upcase_len)
-{
-	return ntfs_collate_names((ntfschar*)&file_name_attr1->file_name,
-			file_name_attr1->file_name_length,
-			(ntfschar*)&file_name_attr2->file_name,
-			file_name_attr2->file_name_length,
-			err_val, ic, upcase, upcase_len);
-}
-
-/**
- * ntfs_nlstoucs - convert NLS string to little endian Unicode string
- * @vol:	ntfs volume which we are working with
- * @ins:	input NLS string buffer
- * @ins_len:	length of input string in bytes
- * @outs:	on return contains the allocated output Unicode string buffer
- *
- * Convert the input string @ins, which is in whatever format the loaded NLS
- * map dictates, into a little endian, 2-byte Unicode string.
- *
- * This function allocates the string and the caller is responsible for
- * calling kmem_cache_free(ntfs_name_cache, *@outs); when finished with it.
- *
- * On success the function returns the number of Unicode characters written to
- * the output string *@outs (>= 0), not counting the terminating Unicode NULL
- * character. *@outs is set to the allocated output string buffer.
- *
- * On error, a negative number corresponding to the error code is returned. In
- * that case the output string is not allocated. Both *@outs and *@outs_len
- * are then undefined.
- *
- * This might look a bit odd due to fast path optimization...
- */
-int ntfs_nlstoucs(const ntfs_volume *vol, const char *ins,
-		const int ins_len, ntfschar **outs)
-{
-	struct nls_table *nls = vol->nls_map;
-	ntfschar *ucs;
-	wchar_t wc;
-	int i, o, wc_len;
-
-	/* We do not trust outside sources. */
-	if (likely(ins)) {
-		ucs = kmem_cache_alloc(ntfs_name_cache, GFP_NOFS);
-		if (likely(ucs)) {
-			for (i = o = 0; i < ins_len; i += wc_len) {
-				wc_len = nls->char2uni(ins + i, ins_len - i,
-						&wc);
-				if (likely(wc_len >= 0 &&
-						o < NTFS_MAX_NAME_LEN)) {
-					if (likely(wc)) {
-						ucs[o++] = cpu_to_le16(wc);
-						continue;
-					} /* else if (!wc) */
-					break;
-				} /* else if (wc_len < 0 ||
-						o >= NTFS_MAX_NAME_LEN) */
-				goto name_err;
-			}
-			ucs[o] = 0;
-			*outs = ucs;
-			return o;
-		} /* else if (!ucs) */
-		ntfs_error(vol->sb, "Failed to allocate buffer for converted "
-				"name from ntfs_name_cache.");
-		return -ENOMEM;
-	} /* else if (!ins) */
-	ntfs_error(vol->sb, "Received NULL pointer.");
-	return -EINVAL;
-name_err:
-	kmem_cache_free(ntfs_name_cache, ucs);
-	if (wc_len < 0) {
-		ntfs_error(vol->sb, "Name using character set %s contains "
-				"characters that cannot be converted to "
-				"Unicode.", nls->charset);
-		i = -EILSEQ;
-	} else /* if (o >= NTFS_MAX_NAME_LEN) */ {
-		ntfs_error(vol->sb, "Name is too long (maximum length for a "
-				"name on NTFS is %d Unicode characters.",
-				NTFS_MAX_NAME_LEN);
-		i = -ENAMETOOLONG;
-	}
-	return i;
-}
-
-/**
- * ntfs_ucstonls - convert little endian Unicode string to NLS string
- * @vol:	ntfs volume which we are working with
- * @ins:	input Unicode string buffer
- * @ins_len:	length of input string in Unicode characters
- * @outs:	on return contains the (allocated) output NLS string buffer
- * @outs_len:	length of output string buffer in bytes
- *
- * Convert the input little endian, 2-byte Unicode string @ins, of length
- * @ins_len into the string format dictated by the loaded NLS.
- *
- * If *@outs is NULL, this function allocates the string and the caller is
- * responsible for calling kfree(*@outs); when finished with it. In this case
- * @outs_len is ignored and can be 0.
- *
- * On success the function returns the number of bytes written to the output
- * string *@outs (>= 0), not counting the terminating NULL byte. If the output
- * string buffer was allocated, *@outs is set to it.
- *
- * On error, a negative number corresponding to the error code is returned. In
- * that case the output string is not allocated. The contents of *@outs are
- * then undefined.
- *
- * This might look a bit odd due to fast path optimization...
- */
-int ntfs_ucstonls(const ntfs_volume *vol, const ntfschar *ins,
-		const int ins_len, unsigned char **outs, int outs_len)
-{
-	struct nls_table *nls = vol->nls_map;
-	unsigned char *ns;
-	int i, o, ns_len, wc;
-
-	/* We don't trust outside sources. */
-	if (ins) {
-		ns = *outs;
-		ns_len = outs_len;
-		if (ns && !ns_len) {
-			wc = -ENAMETOOLONG;
-			goto conversion_err;
-		}
-		if (!ns) {
-			ns_len = ins_len * NLS_MAX_CHARSET_SIZE;
-			ns = kmalloc(ns_len + 1, GFP_NOFS);
-			if (!ns)
-				goto mem_err_out;
-		}
-		for (i = o = 0; i < ins_len; i++) {
-retry:			wc = nls->uni2char(le16_to_cpu(ins[i]), ns + o,
-					ns_len - o);
-			if (wc > 0) {
-				o += wc;
-				continue;
-			} else if (!wc)
-				break;
-			else if (wc == -ENAMETOOLONG && ns != *outs) {
-				unsigned char *tc;
-				/* Grow in multiples of 64 bytes. */
-				tc = kmalloc((ns_len + 64) &
-						~63, GFP_NOFS);
-				if (tc) {
-					memcpy(tc, ns, ns_len);
-					ns_len = ((ns_len + 64) & ~63) - 1;
-					kfree(ns);
-					ns = tc;
-					goto retry;
-				} /* No memory so goto conversion_error; */
-			} /* wc < 0, real error. */
-			goto conversion_err;
-		}
-		ns[o] = 0;
-		*outs = ns;
-		return o;
-	} /* else (!ins) */
-	ntfs_error(vol->sb, "Received NULL pointer.");
-	return -EINVAL;
-conversion_err:
-	ntfs_error(vol->sb, "Unicode name contains characters that cannot be "
-			"converted to character set %s.  You might want to "
-			"try to use the mount option nls=utf8.", nls->charset);
-	if (ns != *outs)
-		kfree(ns);
-	if (wc != -ENAMETOOLONG)
-		wc = -EILSEQ;
-	return wc;
-mem_err_out:
-	ntfs_error(vol->sb, "Failed to allocate name!");
-	return -ENOMEM;
-}
diff --git a/fs/ntfs/upcase.c b/fs/ntfs/upcase.c
deleted file mode 100644
index 4ebe84a78dea..000000000000
--- a/fs/ntfs/upcase.c
+++ /dev/null
@@ -1,73 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * upcase.c - Generate the full NTFS Unicode upcase table in little endian.
- *	      Part of the Linux-NTFS project.
- *
- * Copyright (c) 2001 Richard Russon <ntfs@flatcap.org>
- * Copyright (c) 2001-2006 Anton Altaparmakov
- */
-
-#include "malloc.h"
-#include "ntfs.h"
-
-ntfschar *generate_default_upcase(void)
-{
-	static const int uc_run_table[][3] = { /* Start, End, Add */
-	{0x0061, 0x007B,  -32}, {0x0451, 0x045D, -80}, {0x1F70, 0x1F72,  74},
-	{0x00E0, 0x00F7,  -32}, {0x045E, 0x0460, -80}, {0x1F72, 0x1F76,  86},
-	{0x00F8, 0x00FF,  -32}, {0x0561, 0x0587, -48}, {0x1F76, 0x1F78, 100},
-	{0x0256, 0x0258, -205}, {0x1F00, 0x1F08,   8}, {0x1F78, 0x1F7A, 128},
-	{0x028A, 0x028C, -217}, {0x1F10, 0x1F16,   8}, {0x1F7A, 0x1F7C, 112},
-	{0x03AC, 0x03AD,  -38}, {0x1F20, 0x1F28,   8}, {0x1F7C, 0x1F7E, 126},
-	{0x03AD, 0x03B0,  -37}, {0x1F30, 0x1F38,   8}, {0x1FB0, 0x1FB2,   8},
-	{0x03B1, 0x03C2,  -32}, {0x1F40, 0x1F46,   8}, {0x1FD0, 0x1FD2,   8},
-	{0x03C2, 0x03C3,  -31}, {0x1F51, 0x1F52,   8}, {0x1FE0, 0x1FE2,   8},
-	{0x03C3, 0x03CC,  -32}, {0x1F53, 0x1F54,   8}, {0x1FE5, 0x1FE6,   7},
-	{0x03CC, 0x03CD,  -64}, {0x1F55, 0x1F56,   8}, {0x2170, 0x2180, -16},
-	{0x03CD, 0x03CF,  -63}, {0x1F57, 0x1F58,   8}, {0x24D0, 0x24EA, -26},
-	{0x0430, 0x0450,  -32}, {0x1F60, 0x1F68,   8}, {0xFF41, 0xFF5B, -32},
-	{0}
-	};
-
-	static const int uc_dup_table[][2] = { /* Start, End */
-	{0x0100, 0x012F}, {0x01A0, 0x01A6}, {0x03E2, 0x03EF}, {0x04CB, 0x04CC},
-	{0x0132, 0x0137}, {0x01B3, 0x01B7}, {0x0460, 0x0481}, {0x04D0, 0x04EB},
-	{0x0139, 0x0149}, {0x01CD, 0x01DD}, {0x0490, 0x04BF}, {0x04EE, 0x04F5},
-	{0x014A, 0x0178}, {0x01DE, 0x01EF}, {0x04BF, 0x04BF}, {0x04F8, 0x04F9},
-	{0x0179, 0x017E}, {0x01F4, 0x01F5}, {0x04C1, 0x04C4}, {0x1E00, 0x1E95},
-	{0x018B, 0x018B}, {0x01FA, 0x0218}, {0x04C7, 0x04C8}, {0x1EA0, 0x1EF9},
-	{0}
-	};
-
-	static const int uc_word_table[][2] = { /* Offset, Value */
-	{0x00FF, 0x0178}, {0x01AD, 0x01AC}, {0x01F3, 0x01F1}, {0x0269, 0x0196},
-	{0x0183, 0x0182}, {0x01B0, 0x01AF}, {0x0253, 0x0181}, {0x026F, 0x019C},
-	{0x0185, 0x0184}, {0x01B9, 0x01B8}, {0x0254, 0x0186}, {0x0272, 0x019D},
-	{0x0188, 0x0187}, {0x01BD, 0x01BC}, {0x0259, 0x018F}, {0x0275, 0x019F},
-	{0x018C, 0x018B}, {0x01C6, 0x01C4}, {0x025B, 0x0190}, {0x0283, 0x01A9},
-	{0x0192, 0x0191}, {0x01C9, 0x01C7}, {0x0260, 0x0193}, {0x0288, 0x01AE},
-	{0x0199, 0x0198}, {0x01CC, 0x01CA}, {0x0263, 0x0194}, {0x0292, 0x01B7},
-	{0x01A8, 0x01A7}, {0x01DD, 0x018E}, {0x0268, 0x0197},
-	{0}
-	};
-
-	int i, r;
-	ntfschar *uc;
-
-	uc = ntfs_malloc_nofs(default_upcase_len * sizeof(ntfschar));
-	if (!uc)
-		return uc;
-	memset(uc, 0, default_upcase_len * sizeof(ntfschar));
-	/* Generate the little endian Unicode upcase table used by ntfs. */
-	for (i = 0; i < default_upcase_len; i++)
-		uc[i] = cpu_to_le16(i);
-	for (r = 0; uc_run_table[r][0]; r++)
-		for (i = uc_run_table[r][0]; i < uc_run_table[r][1]; i++)
-			le16_add_cpu(&uc[i], uc_run_table[r][2]);
-	for (r = 0; uc_dup_table[r][0]; r++)
-		for (i = uc_dup_table[r][0]; i < uc_dup_table[r][1]; i += 2)
-			le16_add_cpu(&uc[i + 1], -1);
-	for (r = 0; uc_word_table[r][0]; r++)
-		uc[uc_word_table[r][0]] = cpu_to_le16(uc_word_table[r][1]);
-	return uc;
-}
diff --git a/fs/ntfs/usnjrnl.c b/fs/ntfs/usnjrnl.c
deleted file mode 100644
index 9097a0b4ef25..000000000000
--- a/fs/ntfs/usnjrnl.c
+++ /dev/null
@@ -1,70 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * usnjrnl.h - NTFS kernel transaction log ($UsnJrnl) handling.  Part of the
- *	       Linux-NTFS project.
- *
- * Copyright (c) 2005 Anton Altaparmakov
- */
-
-#ifdef NTFS_RW
-
-#include <linux/fs.h>
-#include <linux/highmem.h>
-#include <linux/mm.h>
-
-#include "aops.h"
-#include "debug.h"
-#include "endian.h"
-#include "time.h"
-#include "types.h"
-#include "usnjrnl.h"
-#include "volume.h"
-
-/**
- * ntfs_stamp_usnjrnl - stamp the transaction log ($UsnJrnl) on an ntfs volume
- * @vol:	ntfs volume on which to stamp the transaction log
- *
- * Stamp the transaction log ($UsnJrnl) on the ntfs volume @vol and return
- * 'true' on success and 'false' on error.
- *
- * This function assumes that the transaction log has already been loaded and
- * consistency checked by a call to fs/ntfs/super.c::load_and_init_usnjrnl().
- */
-bool ntfs_stamp_usnjrnl(ntfs_volume *vol)
-{
-	ntfs_debug("Entering.");
-	if (likely(!NVolUsnJrnlStamped(vol))) {
-		sle64 stamp;
-		struct page *page;
-		USN_HEADER *uh;
-
-		page = ntfs_map_page(vol->usnjrnl_max_ino->i_mapping, 0);
-		if (IS_ERR(page)) {
-			ntfs_error(vol->sb, "Failed to read from "
-					"$UsnJrnl/$DATA/$Max attribute.");
-			return false;
-		}
-		uh = (USN_HEADER*)page_address(page);
-		stamp = get_current_ntfs_time();
-		ntfs_debug("Stamping transaction log ($UsnJrnl): old "
-				"journal_id 0x%llx, old lowest_valid_usn "
-				"0x%llx, new journal_id 0x%llx, new "
-				"lowest_valid_usn 0x%llx.",
-				(long long)sle64_to_cpu(uh->journal_id),
-				(long long)sle64_to_cpu(uh->lowest_valid_usn),
-				(long long)sle64_to_cpu(stamp),
-				i_size_read(vol->usnjrnl_j_ino));
-		uh->lowest_valid_usn =
-				cpu_to_sle64(i_size_read(vol->usnjrnl_j_ino));
-		uh->journal_id = stamp;
-		flush_dcache_page(page);
-		set_page_dirty(page);
-		ntfs_unmap_page(page);
-		/* Set the flag so we do not have to do it again on remount. */
-		NVolSetUsnJrnlStamped(vol);
-	}
-	ntfs_debug("Done.");
-	return true;
-}
-
-#endif /* NTFS_RW */
diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h
deleted file mode 100644
index 85f531b59395..000000000000
--- a/fs/ntfs/usnjrnl.h
+++ /dev/null
@@ -1,191 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * usnjrnl.h - Defines for NTFS kernel transaction log ($UsnJrnl) handling.
- *	       Part of the Linux-NTFS project.
- *
- * Copyright (c) 2005 Anton Altaparmakov
- */
-
-#ifndef _LINUX_NTFS_USNJRNL_H
-#define _LINUX_NTFS_USNJRNL_H
-
-#ifdef NTFS_RW
-
-#include "types.h"
-#include "endian.h"
-#include "layout.h"
-#include "volume.h"
-
-/*
- * Transaction log ($UsnJrnl) organization:
- *
- * The transaction log records whenever a file is modified in any way.  So for
- * example it will record that file "blah" was written to at a particular time
- * but not what was written.  If will record that a file was deleted or
- * created, that a file was truncated, etc.  See below for all the reason
- * codes used.
- *
- * The transaction log is in the $Extend directory which is in the root
- * directory of each volume.  If it is not present it means transaction
- * logging is disabled.  If it is present it means transaction logging is
- * either enabled or in the process of being disabled in which case we can
- * ignore it as it will go away as soon as Windows gets its hands on it.
- *
- * To determine whether the transaction logging is enabled or in the process
- * of being disabled, need to check the volume flags in the
- * $VOLUME_INFORMATION attribute in the $Volume system file (which is present
- * in the root directory and has a fixed mft record number, see layout.h).
- * If the flag VOLUME_DELETE_USN_UNDERWAY is set it means the transaction log
- * is in the process of being disabled and if this flag is clear it means the
- * transaction log is enabled.
- *
- * The transaction log consists of two parts; the $DATA/$Max attribute as well
- * as the $DATA/$J attribute.  $Max is a header describing the transaction
- * log whilst $J is the transaction log data itself as a sequence of variable
- * sized USN_RECORDs (see below for all the structures).
- *
- * We do not care about transaction logging at this point in time but we still
- * need to let windows know that the transaction log is out of date.  To do
- * this we need to stamp the transaction log.  This involves setting the
- * lowest_valid_usn field in the $DATA/$Max attribute to the usn to be used
- * for the next added USN_RECORD to the $DATA/$J attribute as well as
- * generating a new journal_id in $DATA/$Max.
- *
- * The journal_id is as of the current version (2.0) of the transaction log
- * simply the 64-bit timestamp of when the journal was either created or last
- * stamped.
- *
- * To determine the next usn there are two ways.  The first is to parse
- * $DATA/$J and to find the last USN_RECORD in it and to add its record_length
- * to its usn (which is the byte offset in the $DATA/$J attribute).  The
- * second is simply to take the data size of the attribute.  Since the usns
- * are simply byte offsets into $DATA/$J, this is exactly the next usn.  For
- * obvious reasons we use the second method as it is much simpler and faster.
- *
- * As an aside, note that to actually disable the transaction log, one would
- * need to set the VOLUME_DELETE_USN_UNDERWAY flag (see above), then go
- * through all the mft records on the volume and set the usn field in their
- * $STANDARD_INFORMATION attribute to zero.  Once that is done, one would need
- * to delete the transaction log file, i.e. \$Extent\$UsnJrnl, and finally,
- * one would need to clear the VOLUME_DELETE_USN_UNDERWAY flag.
- *
- * Note that if a volume is unmounted whilst the transaction log is being
- * disabled, the process will continue the next time the volume is mounted.
- * This is why we can safely mount read-write when we see a transaction log
- * in the process of being deleted.
- */
-
-/* Some $UsnJrnl related constants. */
-#define UsnJrnlMajorVer		2
-#define UsnJrnlMinorVer		0
-
-/*
- * $DATA/$Max attribute.  This is (always?) resident and has a fixed size of
- * 32 bytes.  It contains the header describing the transaction log.
- */
-typedef struct {
-/*Ofs*/
-/*   0*/sle64 maximum_size;	/* The maximum on-disk size of the $DATA/$J
-				   attribute. */
-/*   8*/sle64 allocation_delta;	/* Number of bytes by which to increase the
-				   size of the $DATA/$J attribute. */
-/*0x10*/sle64 journal_id;	/* Current id of the transaction log. */
-/*0x18*/leUSN lowest_valid_usn;	/* Lowest valid usn in $DATA/$J for the
-				   current journal_id. */
-/* sizeof() = 32 (0x20) bytes */
-} __attribute__ ((__packed__)) USN_HEADER;
-
-/*
- * Reason flags (32-bit).  Cumulative flags describing the change(s) to the
- * file since it was last opened.  I think the names speak for themselves but
- * if you disagree check out the descriptions in the Linux NTFS project NTFS
- * documentation: http://www.linux-ntfs.org/
- */
-enum {
-	USN_REASON_DATA_OVERWRITE	= cpu_to_le32(0x00000001),
-	USN_REASON_DATA_EXTEND		= cpu_to_le32(0x00000002),
-	USN_REASON_DATA_TRUNCATION	= cpu_to_le32(0x00000004),
-	USN_REASON_NAMED_DATA_OVERWRITE	= cpu_to_le32(0x00000010),
-	USN_REASON_NAMED_DATA_EXTEND	= cpu_to_le32(0x00000020),
-	USN_REASON_NAMED_DATA_TRUNCATION= cpu_to_le32(0x00000040),
-	USN_REASON_FILE_CREATE		= cpu_to_le32(0x00000100),
-	USN_REASON_FILE_DELETE		= cpu_to_le32(0x00000200),
-	USN_REASON_EA_CHANGE		= cpu_to_le32(0x00000400),
-	USN_REASON_SECURITY_CHANGE	= cpu_to_le32(0x00000800),
-	USN_REASON_RENAME_OLD_NAME	= cpu_to_le32(0x00001000),
-	USN_REASON_RENAME_NEW_NAME	= cpu_to_le32(0x00002000),
-	USN_REASON_INDEXABLE_CHANGE	= cpu_to_le32(0x00004000),
-	USN_REASON_BASIC_INFO_CHANGE	= cpu_to_le32(0x00008000),
-	USN_REASON_HARD_LINK_CHANGE	= cpu_to_le32(0x00010000),
-	USN_REASON_COMPRESSION_CHANGE	= cpu_to_le32(0x00020000),
-	USN_REASON_ENCRYPTION_CHANGE	= cpu_to_le32(0x00040000),
-	USN_REASON_OBJECT_ID_CHANGE	= cpu_to_le32(0x00080000),
-	USN_REASON_REPARSE_POINT_CHANGE	= cpu_to_le32(0x00100000),
-	USN_REASON_STREAM_CHANGE	= cpu_to_le32(0x00200000),
-	USN_REASON_CLOSE		= cpu_to_le32(0x80000000),
-};
-
-typedef le32 USN_REASON_FLAGS;
-
-/*
- * Source info flags (32-bit).  Information about the source of the change(s)
- * to the file.  For detailed descriptions of what these mean, see the Linux
- * NTFS project NTFS documentation:
- *	http://www.linux-ntfs.org/
- */
-enum {
-	USN_SOURCE_DATA_MANAGEMENT	  = cpu_to_le32(0x00000001),
-	USN_SOURCE_AUXILIARY_DATA	  = cpu_to_le32(0x00000002),
-	USN_SOURCE_REPLICATION_MANAGEMENT = cpu_to_le32(0x00000004),
-};
-
-typedef le32 USN_SOURCE_INFO_FLAGS;
-
-/*
- * $DATA/$J attribute.  This is always non-resident, is marked as sparse, and
- * is of variabled size.  It consists of a sequence of variable size
- * USN_RECORDS.  The minimum allocated_size is allocation_delta as
- * specified in $DATA/$Max.  When the maximum_size specified in $DATA/$Max is
- * exceeded by more than allocation_delta bytes, allocation_delta bytes are
- * allocated and appended to the $DATA/$J attribute and an equal number of
- * bytes at the beginning of the attribute are freed and made sparse.  Note the
- * making sparse only happens at volume checkpoints and hence the actual
- * $DATA/$J size can exceed maximum_size + allocation_delta temporarily.
- */
-typedef struct {
-/*Ofs*/
-/*   0*/le32 length;		/* Byte size of this record (8-byte
-				   aligned). */
-/*   4*/le16 major_ver;		/* Major version of the transaction log used
-				   for this record. */
-/*   6*/le16 minor_ver;		/* Minor version of the transaction log used
-				   for this record. */
-/*   8*/leMFT_REF mft_reference;/* The mft reference of the file (or
-				   directory) described by this record. */
-/*0x10*/leMFT_REF parent_directory;/* The mft reference of the parent
-				   directory of the file described by this
-				   record. */
-/*0x18*/leUSN usn;		/* The usn of this record.  Equals the offset
-				   within the $DATA/$J attribute. */
-/*0x20*/sle64 time;		/* Time when this record was created. */
-/*0x28*/USN_REASON_FLAGS reason;/* Reason flags (see above). */
-/*0x2c*/USN_SOURCE_INFO_FLAGS source_info;/* Source info flags (see above). */
-/*0x30*/le32 security_id;	/* File security_id copied from
-				   $STANDARD_INFORMATION. */
-/*0x34*/FILE_ATTR_FLAGS file_attributes;	/* File attributes copied from
-				   $STANDARD_INFORMATION or $FILE_NAME (not
-				   sure which). */
-/*0x38*/le16 file_name_size;	/* Size of the file name in bytes. */
-/*0x3a*/le16 file_name_offset;	/* Offset to the file name in bytes from the
-				   start of this record. */
-/*0x3c*/ntfschar file_name[0];	/* Use when creating only.  When reading use
-				   file_name_offset to determine the location
-				   of the name. */
-/* sizeof() = 60 (0x3c) bytes */
-} __attribute__ ((__packed__)) USN_RECORD;
-
-extern bool ntfs_stamp_usnjrnl(ntfs_volume *vol);
-
-#endif /* NTFS_RW */
-
-#endif /* _LINUX_NTFS_USNJRNL_H */
diff --git a/fs/ntfs/volume.h b/fs/ntfs/volume.h
deleted file mode 100644
index 930a9ae8a053..000000000000
--- a/fs/ntfs/volume.h
+++ /dev/null
@@ -1,164 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/*
- * volume.h - Defines for volume structures in NTFS Linux kernel driver. Part
- *	      of the Linux-NTFS project.
- *
- * Copyright (c) 2001-2006 Anton Altaparmakov
- * Copyright (c) 2002 Richard Russon
- */
-
-#ifndef _LINUX_NTFS_VOLUME_H
-#define _LINUX_NTFS_VOLUME_H
-
-#include <linux/rwsem.h>
-#include <linux/uidgid.h>
-
-#include "types.h"
-#include "layout.h"
-
-/*
- * The NTFS in memory super block structure.
- */
-typedef struct {
-	/*
-	 * FIXME: Reorder to have commonly used together element within the
-	 * same cache line, aiming at a cache line size of 32 bytes. Aim for
-	 * 64 bytes for less commonly used together elements. Put most commonly
-	 * used elements to front of structure. Obviously do this only when the
-	 * structure has stabilized... (AIA)
-	 */
-	/* Device specifics. */
-	struct super_block *sb;		/* Pointer back to the super_block. */
-	LCN nr_blocks;			/* Number of sb->s_blocksize bytes
-					   sized blocks on the device. */
-	/* Configuration provided by user at mount time. */
-	unsigned long flags;		/* Miscellaneous flags, see below. */
-	kuid_t uid;			/* uid that files will be mounted as. */
-	kgid_t gid;			/* gid that files will be mounted as. */
-	umode_t fmask;			/* The mask for file permissions. */
-	umode_t dmask;			/* The mask for directory
-					   permissions. */
-	u8 mft_zone_multiplier;		/* Initial mft zone multiplier. */
-	u8 on_errors;			/* What to do on filesystem errors. */
-	/* NTFS bootsector provided information. */
-	u16 sector_size;		/* in bytes */
-	u8 sector_size_bits;		/* log2(sector_size) */
-	u32 cluster_size;		/* in bytes */
-	u32 cluster_size_mask;		/* cluster_size - 1 */
-	u8 cluster_size_bits;		/* log2(cluster_size) */
-	u32 mft_record_size;		/* in bytes */
-	u32 mft_record_size_mask;	/* mft_record_size - 1 */
-	u8 mft_record_size_bits;	/* log2(mft_record_size) */
-	u32 index_record_size;		/* in bytes */
-	u32 index_record_size_mask;	/* index_record_size - 1 */
-	u8 index_record_size_bits;	/* log2(index_record_size) */
-	LCN nr_clusters;		/* Volume size in clusters == number of
-					   bits in lcn bitmap. */
-	LCN mft_lcn;			/* Cluster location of mft data. */
-	LCN mftmirr_lcn;		/* Cluster location of copy of mft. */
-	u64 serial_no;			/* The volume serial number. */
-	/* Mount specific NTFS information. */
-	u32 upcase_len;			/* Number of entries in upcase[]. */
-	ntfschar *upcase;		/* The upcase table. */
-
-	s32 attrdef_size;		/* Size of the attribute definition
-					   table in bytes. */
-	ATTR_DEF *attrdef;		/* Table of attribute definitions.
-					   Obtained from FILE_AttrDef. */
-
-#ifdef NTFS_RW
-	/* Variables used by the cluster and mft allocators. */
-	s64 mft_data_pos;		/* Mft record number at which to
-					   allocate the next mft record. */
-	LCN mft_zone_start;		/* First cluster of the mft zone. */
-	LCN mft_zone_end;		/* First cluster beyond the mft zone. */
-	LCN mft_zone_pos;		/* Current position in the mft zone. */
-	LCN data1_zone_pos;		/* Current position in the first data
-					   zone. */
-	LCN data2_zone_pos;		/* Current position in the second data
-					   zone. */
-#endif /* NTFS_RW */
-
-	struct inode *mft_ino;		/* The VFS inode of $MFT. */
-
-	struct inode *mftbmp_ino;	/* Attribute inode for $MFT/$BITMAP. */
-	struct rw_semaphore mftbmp_lock; /* Lock for serializing accesses to the
-					    mft record bitmap ($MFT/$BITMAP). */
-#ifdef NTFS_RW
-	struct inode *mftmirr_ino;	/* The VFS inode of $MFTMirr. */
-	int mftmirr_size;		/* Size of mft mirror in mft records. */
-
-	struct inode *logfile_ino;	/* The VFS inode of $LogFile. */
-#endif /* NTFS_RW */
-
-	struct inode *lcnbmp_ino;	/* The VFS inode of $Bitmap. */
-	struct rw_semaphore lcnbmp_lock; /* Lock for serializing accesses to the
-					    cluster bitmap ($Bitmap/$DATA). */
-
-	struct inode *vol_ino;		/* The VFS inode of $Volume. */
-	VOLUME_FLAGS vol_flags;		/* Volume flags. */
-	u8 major_ver;			/* Ntfs major version of volume. */
-	u8 minor_ver;			/* Ntfs minor version of volume. */
-
-	struct inode *root_ino;		/* The VFS inode of the root
-					   directory. */
-	struct inode *secure_ino;	/* The VFS inode of $Secure (NTFS3.0+
-					   only, otherwise NULL). */
-	struct inode *extend_ino;	/* The VFS inode of $Extend (NTFS3.0+
-					   only, otherwise NULL). */
-#ifdef NTFS_RW
-	/* $Quota stuff is NTFS3.0+ specific.  Unused/NULL otherwise. */
-	struct inode *quota_ino;	/* The VFS inode of $Quota. */
-	struct inode *quota_q_ino;	/* Attribute inode for $Quota/$Q. */
-	/* $UsnJrnl stuff is NTFS3.0+ specific.  Unused/NULL otherwise. */
-	struct inode *usnjrnl_ino;	/* The VFS inode of $UsnJrnl. */
-	struct inode *usnjrnl_max_ino;	/* Attribute inode for $UsnJrnl/$Max. */
-	struct inode *usnjrnl_j_ino;	/* Attribute inode for $UsnJrnl/$J. */
-#endif /* NTFS_RW */
-	struct nls_table *nls_map;
-} ntfs_volume;
-
-/*
- * Defined bits for the flags field in the ntfs_volume structure.
- */
-typedef enum {
-	NV_Errors,		/* 1: Volume has errors, prevent remount rw. */
-	NV_ShowSystemFiles,	/* 1: Return system files in ntfs_readdir(). */
-	NV_CaseSensitive,	/* 1: Treat file names as case sensitive and
-				      create filenames in the POSIX namespace.
-				      Otherwise be case insensitive but still
-				      create file names in POSIX namespace. */
-	NV_LogFileEmpty,	/* 1: $LogFile journal is empty. */
-	NV_QuotaOutOfDate,	/* 1: $Quota is out of date. */
-	NV_UsnJrnlStamped,	/* 1: $UsnJrnl has been stamped. */
-	NV_SparseEnabled,	/* 1: May create sparse files. */
-} ntfs_volume_flags;
-
-/*
- * Macro tricks to expand the NVolFoo(), NVolSetFoo(), and NVolClearFoo()
- * functions.
- */
-#define DEFINE_NVOL_BIT_OPS(flag)					\
-static inline int NVol##flag(ntfs_volume *vol)		\
-{							\
-	return test_bit(NV_##flag, &(vol)->flags);	\
-}							\
-static inline void NVolSet##flag(ntfs_volume *vol)	\
-{							\
-	set_bit(NV_##flag, &(vol)->flags);		\
-}							\
-static inline void NVolClear##flag(ntfs_volume *vol)	\
-{							\
-	clear_bit(NV_##flag, &(vol)->flags);		\
-}
-
-/* Emit the ntfs volume bitops functions. */
-DEFINE_NVOL_BIT_OPS(Errors)
-DEFINE_NVOL_BIT_OPS(ShowSystemFiles)
-DEFINE_NVOL_BIT_OPS(CaseSensitive)
-DEFINE_NVOL_BIT_OPS(LogFileEmpty)
-DEFINE_NVOL_BIT_OPS(QuotaOutOfDate)
-DEFINE_NVOL_BIT_OPS(UsnJrnlStamped)
-DEFINE_NVOL_BIT_OPS(SparseEnabled)
-
-#endif /* _LINUX_NTFS_VOLUME_H */
diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
index cae41db0aaa7..084d19d78397 100644
--- a/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@ -431,7 +431,7 @@ static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry,
 	 * fnd contains tree's path to insert to.
 	 * If fnd is not NULL then dir is locked.
 	 */
-	inode = ntfs_create_inode(mnt_idmap(file->f_path.mnt), dir, dentry, uni,
+	inode = ntfs_create_inode(file_mnt_idmap(file), dir, dentry, uni,
 				  mode, 0, NULL, 0, fnd);
 	err = IS_ERR(inode) ? PTR_ERR(inode) :
 			      finish_open(file, dentry, ntfs_file_open);
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index cef5467fd928..9df7c20d066f 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -1825,7 +1825,7 @@ static int __init init_ntfs_fs(void)
 
 	ntfs_inode_cachep = kmem_cache_create(
 		"ntfs_inode_cache", sizeof(struct ntfs_inode), 0,
-		(SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT),
+		(SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT),
 		init_once);
 	if (!ntfs_inode_cachep) {
 		err = -ENOMEM;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 4d7efefa98c5..1bde1281d514 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -213,7 +213,7 @@ struct o2hb_region {
 	unsigned int		hr_num_pages;
 
 	struct page             **hr_slot_data;
-	struct bdev_handle	*hr_bdev_handle;
+	struct file		*hr_bdev_file;
 	struct o2hb_disk_slot	*hr_slots;
 
 	/* live node map of this region */
@@ -263,7 +263,7 @@ struct o2hb_region {
 
 static inline struct block_device *reg_bdev(struct o2hb_region *reg)
 {
-	return reg->hr_bdev_handle ? reg->hr_bdev_handle->bdev : NULL;
+	return reg->hr_bdev_file ? file_bdev(reg->hr_bdev_file) : NULL;
 }
 
 struct o2hb_bio_wait_ctxt {
@@ -1509,8 +1509,8 @@ static void o2hb_region_release(struct config_item *item)
 		kfree(reg->hr_slot_data);
 	}
 
-	if (reg->hr_bdev_handle)
-		bdev_release(reg->hr_bdev_handle);
+	if (reg->hr_bdev_file)
+		fput(reg->hr_bdev_file);
 
 	kfree(reg->hr_slots);
 
@@ -1569,7 +1569,7 @@ static ssize_t o2hb_region_block_bytes_store(struct config_item *item,
 	unsigned long block_bytes;
 	unsigned int block_bits;
 
-	if (reg->hr_bdev_handle)
+	if (reg->hr_bdev_file)
 		return -EINVAL;
 
 	status = o2hb_read_block_input(reg, page, &block_bytes,
@@ -1598,7 +1598,7 @@ static ssize_t o2hb_region_start_block_store(struct config_item *item,
 	char *p = (char *)page;
 	ssize_t ret;
 
-	if (reg->hr_bdev_handle)
+	if (reg->hr_bdev_file)
 		return -EINVAL;
 
 	ret = kstrtoull(p, 0, &tmp);
@@ -1623,7 +1623,7 @@ static ssize_t o2hb_region_blocks_store(struct config_item *item,
 	unsigned long tmp;
 	char *p = (char *)page;
 
-	if (reg->hr_bdev_handle)
+	if (reg->hr_bdev_file)
 		return -EINVAL;
 
 	tmp = simple_strtoul(p, &p, 0);
@@ -1642,7 +1642,7 @@ static ssize_t o2hb_region_dev_show(struct config_item *item, char *page)
 {
 	unsigned int ret = 0;
 
-	if (to_o2hb_region(item)->hr_bdev_handle)
+	if (to_o2hb_region(item)->hr_bdev_file)
 		ret = sprintf(page, "%pg\n", reg_bdev(to_o2hb_region(item)));
 
 	return ret;
@@ -1753,7 +1753,7 @@ out:
 }
 
 /*
- * this is acting as commit; we set up all of hr_bdev_handle and hr_task or
+ * this is acting as commit; we set up all of hr_bdev_file and hr_task or
  * nothing
  */
 static ssize_t o2hb_region_dev_store(struct config_item *item,
@@ -1769,7 +1769,7 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
 	ssize_t ret = -EINVAL;
 	int live_threshold;
 
-	if (reg->hr_bdev_handle)
+	if (reg->hr_bdev_file)
 		goto out;
 
 	/* We can't heartbeat without having had our node number
@@ -1795,11 +1795,11 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
 	if (!S_ISBLK(f.file->f_mapping->host->i_mode))
 		goto out2;
 
-	reg->hr_bdev_handle = bdev_open_by_dev(f.file->f_mapping->host->i_rdev,
+	reg->hr_bdev_file = bdev_file_open_by_dev(f.file->f_mapping->host->i_rdev,
 			BLK_OPEN_WRITE | BLK_OPEN_READ, NULL, NULL);
-	if (IS_ERR(reg->hr_bdev_handle)) {
-		ret = PTR_ERR(reg->hr_bdev_handle);
-		reg->hr_bdev_handle = NULL;
+	if (IS_ERR(reg->hr_bdev_file)) {
+		ret = PTR_ERR(reg->hr_bdev_file);
+		reg->hr_bdev_file = NULL;
 		goto out2;
 	}
 
@@ -1903,8 +1903,8 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
 
 out3:
 	if (ret < 0) {
-		bdev_release(reg->hr_bdev_handle);
-		reg->hr_bdev_handle = NULL;
+		fput(reg->hr_bdev_file);
+		reg->hr_bdev_file = NULL;
 	}
 out2:
 	fdput(f);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 85215162c9dd..7fc0e920eda7 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -578,7 +578,7 @@ static int __init init_dlmfs_fs(void)
 	dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
 				sizeof(struct dlmfs_inode_private),
 				0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
-					SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+					SLAB_ACCOUNT),
 				dlmfs_init_once);
 	if (!dlmfs_inode_cache) {
 		status = -ENOMEM;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 64a6ef638495..cb40cafbc062 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1615,7 +1615,7 @@ update_holders:
 unlock:
 	lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
 
-	/* ocfs2_unblock_lock reques on seeing OCFS2_LOCK_UPCONVERT_FINISHING */
+	/* ocfs2_unblock_lock request on seeing OCFS2_LOCK_UPCONVERT_FINISHING */
 	kick_dc = (lockres->l_flags & OCFS2_LOCK_BLOCKED);
 
 	spin_unlock_irqrestore(&lockres->l_lock, flags);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 8b6d15010703..0da8e7bd3261 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2763,6 +2763,7 @@ const struct inode_operations ocfs2_file_iops = {
 const struct inode_operations ocfs2_special_file_iops = {
 	.setattr	= ocfs2_setattr,
 	.getattr	= ocfs2_getattr,
+	.listxattr	= ocfs2_listxattr,
 	.permission	= ocfs2_permission,
 	.get_inode_acl	= ocfs2_iop_get_acl,
 	.set_acl	= ocfs2_iop_set_acl,
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 82b28fdacc7e..accf03d4765e 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -65,7 +65,7 @@ struct ocfs2_inode_info
 	tid_t i_sync_tid;
 	tid_t i_datasync_tid;
 
-	struct dquot *i_dquot[MAXQUOTAS];
+	struct dquot __rcu *i_dquot[MAXQUOTAS];
 };
 
 /*
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index f37174e79fad..6de944818c56 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -27,7 +27,7 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode,
 	struct ocfs2_file_private *fp = file->private_data;
 	struct ocfs2_lock_res *lockres = &fp->fp_flock;
 
-	if (fl->fl_type == F_WRLCK)
+	if (lock_is_write(fl))
 		level = 1;
 	if (!IS_SETLKW(cmd))
 		trylock = 1;
@@ -53,8 +53,8 @@ static int ocfs2_do_flock(struct file *file, struct inode *inode,
 		 */
 
 		locks_init_lock(&request);
-		request.fl_type = F_UNLCK;
-		request.fl_flags = FL_FLOCK;
+		request.c.flc_type = F_UNLCK;
+		request.c.flc_flags = FL_FLOCK;
 		locks_lock_file_wait(file, &request);
 
 		ocfs2_file_unlock(file);
@@ -100,14 +100,14 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl)
 	struct inode *inode = file->f_mapping->host;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-	if (!(fl->fl_flags & FL_FLOCK))
+	if (!(fl->c.flc_flags & FL_FLOCK))
 		return -ENOLCK;
 
 	if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) ||
 	    ocfs2_mount_local(osb))
 		return locks_lock_file_wait(file, fl);
 
-	if (fl->fl_type == F_UNLCK)
+	if (lock_is_unlock(fl))
 		return ocfs2_do_funlock(file, cmd, fl);
 	else
 		return ocfs2_do_flock(file, inode, cmd, fl);
@@ -118,7 +118,7 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl)
 	struct inode *inode = file->f_mapping->host;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-	if (!(fl->fl_flags & FL_POSIX))
+	if (!(fl->c.flc_flags & FL_POSIX))
 		return -ENOLCK;
 
 	return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl);
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index dc9f76ab7e13..0575c2d060eb 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -447,14 +447,17 @@ int ocfs2_global_write_info(struct super_block *sb, int type)
 	int err;
 	struct quota_info *dqopt = sb_dqopt(sb);
 	struct ocfs2_mem_dqinfo *info = dqopt->info[type].dqi_priv;
+	unsigned int memalloc;
 
 	down_write(&dqopt->dqio_sem);
+	memalloc = memalloc_nofs_save();
 	err = ocfs2_qinfo_lock(info, 1);
 	if (err < 0)
 		goto out_sem;
 	err = __ocfs2_global_write_info(sb, type);
 	ocfs2_qinfo_unlock(info, 1);
 out_sem:
+	memalloc_nofs_restore(memalloc);
 	up_write(&dqopt->dqio_sem);
 	return err;
 }
@@ -601,6 +604,7 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
 	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 	int status = 0;
+	unsigned int memalloc;
 
 	trace_ocfs2_sync_dquot_helper(from_kqid(&init_user_ns, dquot->dq_id),
 				      dquot->dq_id.type,
@@ -618,6 +622,7 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
 		goto out_ilock;
 	}
 	down_write(&sb_dqopt(sb)->dqio_sem);
+	memalloc = memalloc_nofs_save();
 	status = ocfs2_sync_dquot(dquot);
 	if (status < 0)
 		mlog_errno(status);
@@ -625,6 +630,7 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
 	status = ocfs2_local_write_dquot(dquot);
 	if (status < 0)
 		mlog_errno(status);
+	memalloc_nofs_restore(memalloc);
 	up_write(&sb_dqopt(sb)->dqio_sem);
 	ocfs2_commit_trans(osb, handle);
 out_ilock:
@@ -662,6 +668,7 @@ static int ocfs2_write_dquot(struct dquot *dquot)
 	handle_t *handle;
 	struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
 	int status = 0;
+	unsigned int memalloc;
 
 	trace_ocfs2_write_dquot(from_kqid(&init_user_ns, dquot->dq_id),
 				dquot->dq_id.type);
@@ -673,7 +680,9 @@ static int ocfs2_write_dquot(struct dquot *dquot)
 		goto out;
 	}
 	down_write(&sb_dqopt(dquot->dq_sb)->dqio_sem);
+	memalloc = memalloc_nofs_save();
 	status = ocfs2_local_write_dquot(dquot);
+	memalloc_nofs_restore(memalloc);
 	up_write(&sb_dqopt(dquot->dq_sb)->dqio_sem);
 	ocfs2_commit_trans(osb, handle);
 out:
@@ -920,6 +929,7 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
 	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
 	handle_t *handle;
 	struct ocfs2_super *osb = OCFS2_SB(sb);
+	unsigned int memalloc;
 
 	trace_ocfs2_mark_dquot_dirty(from_kqid(&init_user_ns, dquot->dq_id),
 				     type);
@@ -946,6 +956,7 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
 		goto out_ilock;
 	}
 	down_write(&sb_dqopt(sb)->dqio_sem);
+	memalloc = memalloc_nofs_save();
 	status = ocfs2_sync_dquot(dquot);
 	if (status < 0) {
 		mlog_errno(status);
@@ -954,6 +965,7 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
 	/* Now write updated local dquot structure */
 	status = ocfs2_local_write_dquot(dquot);
 out_dlock:
+	memalloc_nofs_restore(memalloc);
 	up_write(&sb_dqopt(sb)->dqio_sem);
 	ocfs2_commit_trans(osb, handle);
 out_ilock:
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index e09842fc9d4d..8ce462c64c51 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -470,6 +470,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
 	int bit, chunk;
 	struct ocfs2_recovery_chunk *rchunk, *next;
 	qsize_t spacechange, inodechange;
+	unsigned int memalloc;
 
 	trace_ocfs2_recover_local_quota_file((unsigned long)lqinode->i_ino, type);
 
@@ -521,6 +522,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
 				goto out_drop_lock;
 			}
 			down_write(&sb_dqopt(sb)->dqio_sem);
+			memalloc = memalloc_nofs_save();
 			spin_lock(&dquot->dq_dqb_lock);
 			/* Add usage from quota entry into quota changes
 			 * of our node. Auxiliary variables are important
@@ -553,6 +555,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
 			unlock_buffer(qbh);
 			ocfs2_journal_dirty(handle, qbh);
 out_commit:
+			memalloc_nofs_restore(memalloc);
 			up_write(&sb_dqopt(sb)->dqio_sem);
 			ocfs2_commit_trans(OCFS2_SB(sb), handle);
 out_drop_lock:
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
index 9b76ee66aeb2..c11406cd87a8 100644
--- a/fs/ocfs2/stack_user.c
+++ b/fs/ocfs2/stack_user.c
@@ -744,7 +744,7 @@ static int user_plock(struct ocfs2_cluster_connection *conn,
 		return dlm_posix_cancel(conn->cc_lockspace, ino, file, fl);
 	else if (IS_GETLK(cmd))
 		return dlm_posix_get(conn->cc_lockspace, ino, file, fl);
-	else if (fl->fl_type == F_UNLCK)
+	else if (lock_is_unlock(fl))
 		return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl);
 	else
 		return dlm_posix_lock(conn->cc_lockspace, ino, file, cmd, fl);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 6b906424902b..8aabaed2c1cb 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -122,7 +122,7 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend);
 static int ocfs2_enable_quotas(struct ocfs2_super *osb);
 static void ocfs2_disable_quotas(struct ocfs2_super *osb);
 
-static struct dquot **ocfs2_get_dquots(struct inode *inode)
+static struct dquot __rcu **ocfs2_get_dquots(struct inode *inode)
 {
 	return OCFS2_I(inode)->i_dquot;
 }
@@ -1706,18 +1706,17 @@ static int ocfs2_initialize_mem_caches(void)
 				       sizeof(struct ocfs2_inode_info),
 				       0,
 				       (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+						SLAB_ACCOUNT),
 				       ocfs2_inode_init_once);
 	ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache",
 					sizeof(struct ocfs2_dquot),
 					0,
-					(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+					SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT,
 					NULL);
 	ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache",
 					sizeof(struct ocfs2_quota_chunk),
 					0,
-					(SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+					SLAB_RECLAIM_ACCOUNT,
 					NULL);
 	if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep ||
 	    !ocfs2_qf_chunk_cachep) {
@@ -2027,8 +2026,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits);
 	bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
 	sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits);
-	memcpy(&sb->s_uuid, di->id2.i_super.s_uuid,
-	       sizeof(di->id2.i_super.s_uuid));
+	super_set_uuid(sb, di->id2.i_super.s_uuid,
+		       sizeof(di->id2.i_super.s_uuid));
 
 	osb->osb_dx_mask = (1 << (cbits - bbits)) - 1;
 
diff --git a/fs/open.c b/fs/open.c
index a84d21e55c39..ee8460c83c77 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -29,7 +29,6 @@
 #include <linux/audit.h>
 #include <linux/falloc.h>
 #include <linux/fs_struct.h>
-#include <linux/ima.h>
 #include <linux/dnotify.h>
 #include <linux/compat.h>
 #include <linux/mnt_idmapping.h>
@@ -154,49 +153,52 @@ COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length
 }
 #endif
 
-long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
+long do_ftruncate(struct file *file, loff_t length, int small)
 {
 	struct inode *inode;
 	struct dentry *dentry;
-	struct fd f;
 	int error;
 
-	error = -EINVAL;
-	if (length < 0)
-		goto out;
-	error = -EBADF;
-	f = fdget(fd);
-	if (!f.file)
-		goto out;
-
 	/* explicitly opened as large or we are on 64-bit box */
-	if (f.file->f_flags & O_LARGEFILE)
+	if (file->f_flags & O_LARGEFILE)
 		small = 0;
 
-	dentry = f.file->f_path.dentry;
+	dentry = file->f_path.dentry;
 	inode = dentry->d_inode;
-	error = -EINVAL;
-	if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE))
-		goto out_putf;
+	if (!S_ISREG(inode->i_mode) || !(file->f_mode & FMODE_WRITE))
+		return -EINVAL;
 
-	error = -EINVAL;
 	/* Cannot ftruncate over 2^31 bytes without large file support */
 	if (small && length > MAX_NON_LFS)
-		goto out_putf;
+		return -EINVAL;
 
-	error = -EPERM;
 	/* Check IS_APPEND on real upper inode */
-	if (IS_APPEND(file_inode(f.file)))
-		goto out_putf;
+	if (IS_APPEND(file_inode(file)))
+		return -EPERM;
 	sb_start_write(inode->i_sb);
-	error = security_file_truncate(f.file);
+	error = security_file_truncate(file);
 	if (!error)
-		error = do_truncate(file_mnt_idmap(f.file), dentry, length,
-				    ATTR_MTIME | ATTR_CTIME, f.file);
+		error = do_truncate(file_mnt_idmap(file), dentry, length,
+				    ATTR_MTIME | ATTR_CTIME, file);
 	sb_end_write(inode->i_sb);
-out_putf:
+
+	return error;
+}
+
+long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
+{
+	struct fd f;
+	int error;
+
+	if (length < 0)
+		return -EINVAL;
+	f = fdget(fd);
+	if (!f.file)
+		return -EBADF;
+
+	error = do_ftruncate(f.file, length, small);
+
 	fdput(f);
-out:
 	return error;
 }
 
@@ -1364,7 +1366,7 @@ struct file *filp_open(const char *filename, int flags, umode_t mode)
 {
 	struct filename *name = getname_kernel(filename);
 	struct file *file = ERR_CAST(name);
-	
+
 	if (!IS_ERR(name)) {
 		file = file_open_name(name, flags, mode);
 		putname(name);
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index c4b65a6d41cc..4a0779e3ef79 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -446,7 +446,7 @@ static int __init init_openprom_fs(void)
 					    sizeof(struct op_inode_info),
 					    0,
 					    (SLAB_RECLAIM_ACCOUNT |
-					     SLAB_MEM_SPREAD | SLAB_ACCOUNT),
+					     SLAB_ACCOUNT),
 					    op_inode_init_once);
 	if (!op_inode_cachep)
 		return -ENOMEM;
diff --git a/fs/orangefs/orangefs-cache.c b/fs/orangefs/orangefs-cache.c
index 3b6982bf6bcf..e75e173a9186 100644
--- a/fs/orangefs/orangefs-cache.c
+++ b/fs/orangefs/orangefs-cache.c
@@ -22,7 +22,7 @@ int op_cache_initialize(void)
 	op_cache = kmem_cache_create("orangefs_op_cache",
 				     sizeof(struct orangefs_kernel_op_s),
 				     0,
-				     ORANGEFS_CACHE_CREATE_FLAGS,
+				     0,
 				     NULL);
 
 	if (!op_cache) {
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 926d9c0a428a..e2df7eeadc7a 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -93,16 +93,6 @@ enum orangefs_vfs_op_states {
 	OP_VFS_STATE_GIVEN_UP = 16,
 };
 
-/*
- * orangefs kernel memory related flags
- */
-
-#if (defined CONFIG_DEBUG_SLAB)
-#define ORANGEFS_CACHE_CREATE_FLAGS SLAB_RED_ZONE
-#else
-#define ORANGEFS_CACHE_CREATE_FLAGS 0
-#endif
-
 extern const struct xattr_handler * const orangefs_xattr_handlers[];
 
 extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu);
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index 5254256a224d..34849b4a3243 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -527,7 +527,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
 	sb->s_fs_info = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL);
 	if (!ORANGEFS_SB(sb)) {
 		d = ERR_PTR(-ENOMEM);
-		goto free_sb_and_op;
+		goto free_op;
 	}
 
 	ret = orangefs_fill_sb(sb,
@@ -644,7 +644,7 @@ int orangefs_inode_cache_initialize(void)
 					"orangefs_inode_cache",
 					sizeof(struct orangefs_inode_s),
 					0,
-					ORANGEFS_CACHE_CREATE_FLAGS,
+					0,
 					offsetof(struct orangefs_inode_s,
 						link_target),
 					sizeof_field(struct orangefs_inode_s,
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 8586e2f5d243..0762575a1e70 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -234,11 +234,11 @@ static int ovl_verify_area(loff_t pos, loff_t pos2, loff_t len, loff_t totlen)
 {
 	loff_t tmp;
 
-	if (WARN_ON_ONCE(pos != pos2))
+	if (pos != pos2)
 		return -EIO;
-	if (WARN_ON_ONCE(pos < 0 || len < 0 || totlen < 0))
+	if (pos < 0 || len < 0 || totlen < 0)
 		return -EIO;
-	if (WARN_ON_ONCE(check_add_overflow(pos, len, &tmp)))
+	if (check_add_overflow(pos, len, &tmp))
 		return -EIO;
 	return 0;
 }
diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
index 112b4b12f825..36dcc530ac28 100644
--- a/fs/overlayfs/params.c
+++ b/fs/overlayfs/params.c
@@ -280,12 +280,20 @@ static int ovl_mount_dir_check(struct fs_context *fc, const struct path *path,
 {
 	struct ovl_fs_context *ctx = fc->fs_private;
 
-	if (ovl_dentry_weird(path->dentry))
-		return invalfc(fc, "filesystem on %s not supported", name);
-
 	if (!d_is_dir(path->dentry))
 		return invalfc(fc, "%s is not a directory", name);
 
+	/*
+	 * Root dentries of case-insensitive capable filesystems might
+	 * not have the dentry operations set, but still be incompatible
+	 * with overlayfs.  Check explicitly to prevent post-mount
+	 * failures.
+	 */
+	if (sb_has_encoding(path->mnt->mnt_sb))
+		return invalfc(fc, "case-insensitive capable filesystem on %s not supported", name);
+
+	if (ovl_dentry_weird(path->dentry))
+		return invalfc(fc, "filesystem on %s not supported", name);
 
 	/*
 	 * Check whether upper path is read-only here to report failures
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 2eef6c70b2ae..a40fc7e05525 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -28,41 +28,38 @@ MODULE_LICENSE("GPL");
 
 struct ovl_dir_cache;
 
-static struct dentry *ovl_d_real(struct dentry *dentry,
-				 const struct inode *inode)
+static struct dentry *ovl_d_real(struct dentry *dentry, enum d_real_type type)
 {
-	struct dentry *real = NULL, *lower;
+	struct dentry *upper, *lower;
 	int err;
 
-	/*
-	 * vfs is only expected to call d_real() with NULL from d_real_inode()
-	 * and with overlay inode from file_dentry() on an overlay file.
-	 *
-	 * TODO: remove @inode argument from d_real() API, remove code in this
-	 * function that deals with non-NULL @inode and remove d_real() call
-	 * from file_dentry().
-	 */
-	if (inode && d_inode(dentry) == inode)
-		return dentry;
-	else if (inode)
+	switch (type) {
+	case D_REAL_DATA:
+	case D_REAL_METADATA:
+		break;
+	default:
 		goto bug;
+	}
 
 	if (!d_is_reg(dentry)) {
 		/* d_real_inode() is only relevant for regular files */
 		return dentry;
 	}
 
-	real = ovl_dentry_upper(dentry);
-	if (real && (inode == d_inode(real)))
-		return real;
+	upper = ovl_dentry_upper(dentry);
+	if (upper && (type == D_REAL_METADATA ||
+		      ovl_has_upperdata(d_inode(dentry))))
+		return upper;
 
-	if (real && !inode && ovl_has_upperdata(d_inode(dentry)))
-		return real;
+	if (type == D_REAL_METADATA) {
+		lower = ovl_dentry_lower(dentry);
+		goto real_lower;
+	}
 
 	/*
-	 * Best effort lazy lookup of lowerdata for !inode case to return
+	 * Best effort lazy lookup of lowerdata for D_REAL_DATA case to return
 	 * the real lowerdata dentry.  The only current caller of d_real() with
-	 * NULL inode is d_real_inode() from trace_uprobe and this caller is
+	 * D_REAL_DATA is d_real_inode() from trace_uprobe and this caller is
 	 * likely going to be followed reading from the file, before placing
 	 * uprobes on offset within the file, so lowerdata should be available
 	 * when setting the uprobe.
@@ -73,18 +70,13 @@ static struct dentry *ovl_d_real(struct dentry *dentry,
 	lower = ovl_dentry_lowerdata(dentry);
 	if (!lower)
 		goto bug;
-	real = lower;
 
-	/* Handle recursion */
-	real = d_real(real, inode);
+real_lower:
+	/* Handle recursion into stacked lower fs */
+	return d_real(lower, type);
 
-	if (!inode || inode == d_inode(real))
-		return real;
 bug:
-	WARN(1, "%s(%pd4, %s:%lu): real dentry (%p/%lu) not found\n",
-	     __func__, dentry, inode ? inode->i_sb->s_id : "NULL",
-	     inode ? inode->i_ino : 0, real,
-	     real && d_inode(real) ? d_inode(real)->i_ino : 0);
+	WARN(1, "%s(%pd4, %d): real dentry not found\n", __func__, dentry, type);
 	return dentry;
 }
 
@@ -1511,7 +1503,7 @@ static int __init ovl_init(void)
 	ovl_inode_cachep = kmem_cache_create("ovl_inode",
 					     sizeof(struct ovl_inode), 0,
 					     (SLAB_RECLAIM_ACCOUNT|
-					      SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+					      SLAB_ACCOUNT),
 					     ovl_inode_init_once);
 	if (ovl_inode_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index a8e17f14d7a2..d285d1d7baad 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -774,13 +774,14 @@ bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs,
 			 const struct path *upperpath)
 {
 	bool set = false;
+	uuid_t uuid;
 	int res;
 
 	/* Try to load existing persistent uuid */
-	res = ovl_path_getxattr(ofs, upperpath, OVL_XATTR_UUID, sb->s_uuid.b,
+	res = ovl_path_getxattr(ofs, upperpath, OVL_XATTR_UUID, uuid.b,
 				UUID_SIZE);
 	if (res == UUID_SIZE)
-		return true;
+		goto set_uuid;
 
 	if (res != -ENODATA)
 		goto fail;
@@ -808,17 +809,20 @@ bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs,
 	}
 
 	/* Generate overlay instance uuid */
-	uuid_gen(&sb->s_uuid);
+	uuid_gen(&uuid);
 
 	/* Try to store persistent uuid */
 	set = true;
-	res = ovl_setxattr(ofs, upperpath->dentry, OVL_XATTR_UUID, sb->s_uuid.b,
+	res = ovl_setxattr(ofs, upperpath->dentry, OVL_XATTR_UUID, uuid.b,
 			   UUID_SIZE);
-	if (res == 0)
-		return true;
+	if (res)
+		goto fail;
+
+set_uuid:
+	super_set_uuid(sb, uuid.b, sizeof(uuid));
+	return true;
 
 fail:
-	memset(sb->s_uuid.b, 0, UUID_SIZE);
 	ofs->config.uuid = OVL_UUID_NULL;
 	pr_warn("failed to %s uuid (%pd2, err=%i); falling back to uuid=null.\n",
 		set ? "set" : "get", upperpath->dentry, res);
diff --git a/fs/pidfs.c b/fs/pidfs.c
new file mode 100644
index 000000000000..a63d5d24aa02
--- /dev/null
+++ b/fs/pidfs.c
@@ -0,0 +1,283 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/magic.h>
+#include <linux/mount.h>
+#include <linux/pid.h>
+#include <linux/pidfs.h>
+#include <linux/pid_namespace.h>
+#include <linux/poll.h>
+#include <linux/proc_fs.h>
+#include <linux/proc_ns.h>
+#include <linux/pseudo_fs.h>
+#include <linux/seq_file.h>
+#include <uapi/linux/pidfd.h>
+
+#include "internal.h"
+
+#ifdef CONFIG_PROC_FS
+/**
+ * pidfd_show_fdinfo - print information about a pidfd
+ * @m: proc fdinfo file
+ * @f: file referencing a pidfd
+ *
+ * Pid:
+ * This function will print the pid that a given pidfd refers to in the
+ * pid namespace of the procfs instance.
+ * If the pid namespace of the process is not a descendant of the pid
+ * namespace of the procfs instance 0 will be shown as its pid. This is
+ * similar to calling getppid() on a process whose parent is outside of
+ * its pid namespace.
+ *
+ * NSpid:
+ * If pid namespaces are supported then this function will also print
+ * the pid of a given pidfd refers to for all descendant pid namespaces
+ * starting from the current pid namespace of the instance, i.e. the
+ * Pid field and the first entry in the NSpid field will be identical.
+ * If the pid namespace of the process is not a descendant of the pid
+ * namespace of the procfs instance 0 will be shown as its first NSpid
+ * entry and no others will be shown.
+ * Note that this differs from the Pid and NSpid fields in
+ * /proc/<pid>/status where Pid and NSpid are always shown relative to
+ * the  pid namespace of the procfs instance. The difference becomes
+ * obvious when sending around a pidfd between pid namespaces from a
+ * different branch of the tree, i.e. where no ancestral relation is
+ * present between the pid namespaces:
+ * - create two new pid namespaces ns1 and ns2 in the initial pid
+ *   namespace (also take care to create new mount namespaces in the
+ *   new pid namespace and mount procfs)
+ * - create a process with a pidfd in ns1
+ * - send pidfd from ns1 to ns2
+ * - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
+ *   have exactly one entry, which is 0
+ */
+static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
+{
+	struct pid *pid = pidfd_pid(f);
+	struct pid_namespace *ns;
+	pid_t nr = -1;
+
+	if (likely(pid_has_task(pid, PIDTYPE_PID))) {
+		ns = proc_pid_ns(file_inode(m->file)->i_sb);
+		nr = pid_nr_ns(pid, ns);
+	}
+
+	seq_put_decimal_ll(m, "Pid:\t", nr);
+
+#ifdef CONFIG_PID_NS
+	seq_put_decimal_ll(m, "\nNSpid:\t", nr);
+	if (nr > 0) {
+		int i;
+
+		/* If nr is non-zero it means that 'pid' is valid and that
+		 * ns, i.e. the pid namespace associated with the procfs
+		 * instance, is in the pid namespace hierarchy of pid.
+		 * Start at one below the already printed level.
+		 */
+		for (i = ns->level + 1; i <= pid->level; i++)
+			seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
+	}
+#endif
+	seq_putc(m, '\n');
+}
+#endif
+
+/*
+ * Poll support for process exit notification.
+ */
+static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts)
+{
+	struct pid *pid = pidfd_pid(file);
+	bool thread = file->f_flags & PIDFD_THREAD;
+	struct task_struct *task;
+	__poll_t poll_flags = 0;
+
+	poll_wait(file, &pid->wait_pidfd, pts);
+	/*
+	 * Depending on PIDFD_THREAD, inform pollers when the thread
+	 * or the whole thread-group exits.
+	 */
+	guard(rcu)();
+	task = pid_task(pid, PIDTYPE_PID);
+	if (!task)
+		poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP;
+	else if (task->exit_state && (thread || thread_group_empty(task)))
+		poll_flags = EPOLLIN | EPOLLRDNORM;
+
+	return poll_flags;
+}
+
+static const struct file_operations pidfs_file_operations = {
+	.poll		= pidfd_poll,
+#ifdef CONFIG_PROC_FS
+	.show_fdinfo	= pidfd_show_fdinfo,
+#endif
+};
+
+struct pid *pidfd_pid(const struct file *file)
+{
+	if (file->f_op != &pidfs_file_operations)
+		return ERR_PTR(-EBADF);
+	return file_inode(file)->i_private;
+}
+
+static struct vfsmount *pidfs_mnt __ro_after_init;
+
+#if BITS_PER_LONG == 32
+/*
+ * Provide a fallback mechanism for 32-bit systems so processes remain
+ * reliably comparable by inode number even on those systems.
+ */
+static DEFINE_IDA(pidfd_inum_ida);
+
+static int pidfs_inum(struct pid *pid, unsigned long *ino)
+{
+	int ret;
+
+	ret = ida_alloc_range(&pidfd_inum_ida, RESERVED_PIDS + 1,
+			      UINT_MAX, GFP_ATOMIC);
+	if (ret < 0)
+		return -ENOSPC;
+
+	*ino = ret;
+	return 0;
+}
+
+static inline void pidfs_free_inum(unsigned long ino)
+{
+	if (ino > 0)
+		ida_free(&pidfd_inum_ida, ino);
+}
+#else
+static inline int pidfs_inum(struct pid *pid, unsigned long *ino)
+{
+	*ino = pid->ino;
+	return 0;
+}
+#define pidfs_free_inum(ino) ((void)(ino))
+#endif
+
+/*
+ * The vfs falls back to simple_setattr() if i_op->setattr() isn't
+ * implemented. Let's reject it completely until we have a clean
+ * permission concept for pidfds.
+ */
+static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+			 struct iattr *attr)
+{
+	return -EOPNOTSUPP;
+}
+
+static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path,
+			 struct kstat *stat, u32 request_mask,
+			 unsigned int query_flags)
+{
+	struct inode *inode = d_inode(path->dentry);
+
+	generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
+	return 0;
+}
+
+static const struct inode_operations pidfs_inode_operations = {
+	.getattr = pidfs_getattr,
+	.setattr = pidfs_setattr,
+};
+
+static void pidfs_evict_inode(struct inode *inode)
+{
+	struct pid *pid = inode->i_private;
+
+	clear_inode(inode);
+	put_pid(pid);
+	pidfs_free_inum(inode->i_ino);
+}
+
+static const struct super_operations pidfs_sops = {
+	.drop_inode	= generic_delete_inode,
+	.evict_inode	= pidfs_evict_inode,
+	.statfs		= simple_statfs,
+};
+
+static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+	struct inode *inode = d_inode(dentry);
+	struct pid *pid = inode->i_private;
+
+	return dynamic_dname(buffer, buflen, "pidfd:[%llu]", pid->ino);
+}
+
+static const struct dentry_operations pidfs_dentry_operations = {
+	.d_delete	= always_delete_dentry,
+	.d_dname	= pidfs_dname,
+	.d_prune	= stashed_dentry_prune,
+};
+
+static int pidfs_init_inode(struct inode *inode, void *data)
+{
+	inode->i_private = data;
+	inode->i_flags |= S_PRIVATE;
+	inode->i_mode |= S_IRWXU;
+	inode->i_op = &pidfs_inode_operations;
+	inode->i_fop = &pidfs_file_operations;
+	/*
+	 * Inode numbering for pidfs start at RESERVED_PIDS + 1. This
+	 * avoids collisions with the root inode which is 1 for pseudo
+	 * filesystems.
+	 */
+	return pidfs_inum(data, &inode->i_ino);
+}
+
+static void pidfs_put_data(void *data)
+{
+	struct pid *pid = data;
+	put_pid(pid);
+}
+
+static const struct stashed_operations pidfs_stashed_ops = {
+	.init_inode = pidfs_init_inode,
+	.put_data = pidfs_put_data,
+};
+
+static int pidfs_init_fs_context(struct fs_context *fc)
+{
+	struct pseudo_fs_context *ctx;
+
+	ctx = init_pseudo(fc, PID_FS_MAGIC);
+	if (!ctx)
+		return -ENOMEM;
+
+	ctx->ops = &pidfs_sops;
+	ctx->dops = &pidfs_dentry_operations;
+	fc->s_fs_info = (void *)&pidfs_stashed_ops;
+	return 0;
+}
+
+static struct file_system_type pidfs_type = {
+	.name			= "pidfs",
+	.init_fs_context	= pidfs_init_fs_context,
+	.kill_sb		= kill_anon_super,
+};
+
+struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags)
+{
+
+	struct file *pidfd_file;
+	struct path path;
+	int ret;
+
+	ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	pidfd_file = dentry_open(&path, flags, current_cred());
+	path_put(&path);
+	return pidfd_file;
+}
+
+void __init pidfs_init(void)
+{
+	pidfs_mnt = kern_mount(&pidfs_type);
+	if (IS_ERR(pidfs_mnt))
+		panic("Failed to mount pidfs pseudo filesystem");
+}
diff --git a/fs/pipe.c b/fs/pipe.c
index f1adbfe743d4..50c8a8596b52 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -76,18 +76,20 @@ static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
  * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
  */
 
-static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
+#define cmp_int(l, r)		((l > r) - (l < r))
+
+#ifdef CONFIG_PROVE_LOCKING
+static int pipe_lock_cmp_fn(const struct lockdep_map *a,
+			    const struct lockdep_map *b)
 {
-	if (pipe->files)
-		mutex_lock_nested(&pipe->mutex, subclass);
+	return cmp_int((unsigned long) a, (unsigned long) b);
 }
+#endif
 
 void pipe_lock(struct pipe_inode_info *pipe)
 {
-	/*
-	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
-	 */
-	pipe_lock_nested(pipe, I_MUTEX_PARENT);
+	if (pipe->files)
+		mutex_lock(&pipe->mutex);
 }
 EXPORT_SYMBOL(pipe_lock);
 
@@ -98,28 +100,16 @@ void pipe_unlock(struct pipe_inode_info *pipe)
 }
 EXPORT_SYMBOL(pipe_unlock);
 
-static inline void __pipe_lock(struct pipe_inode_info *pipe)
-{
-	mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
-}
-
-static inline void __pipe_unlock(struct pipe_inode_info *pipe)
-{
-	mutex_unlock(&pipe->mutex);
-}
-
 void pipe_double_lock(struct pipe_inode_info *pipe1,
 		      struct pipe_inode_info *pipe2)
 {
 	BUG_ON(pipe1 == pipe2);
 
-	if (pipe1 < pipe2) {
-		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
-		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
-	} else {
-		pipe_lock_nested(pipe2, I_MUTEX_PARENT);
-		pipe_lock_nested(pipe1, I_MUTEX_CHILD);
-	}
+	if (pipe1 > pipe2)
+		swap(pipe1, pipe2);
+
+	pipe_lock(pipe1);
+	pipe_lock(pipe2);
 }
 
 static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
@@ -271,7 +261,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
 		return 0;
 
 	ret = 0;
-	__pipe_lock(pipe);
+	mutex_lock(&pipe->mutex);
 
 	/*
 	 * We only wake up writers if the pipe was full when we started
@@ -368,7 +358,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
 			ret = -EAGAIN;
 			break;
 		}
-		__pipe_unlock(pipe);
+		mutex_unlock(&pipe->mutex);
 
 		/*
 		 * We only get here if we didn't actually read anything.
@@ -400,13 +390,13 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
 		if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
 			return -ERESTARTSYS;
 
-		__pipe_lock(pipe);
+		mutex_lock(&pipe->mutex);
 		was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
 		wake_next_reader = true;
 	}
 	if (pipe_empty(pipe->head, pipe->tail))
 		wake_next_reader = false;
-	__pipe_unlock(pipe);
+	mutex_unlock(&pipe->mutex);
 
 	if (was_full)
 		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
@@ -462,7 +452,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
 	if (unlikely(total_len == 0))
 		return 0;
 
-	__pipe_lock(pipe);
+	mutex_lock(&pipe->mutex);
 
 	if (!pipe->readers) {
 		send_sig(SIGPIPE, current, 0);
@@ -582,19 +572,19 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
 		 * after waiting we need to re-check whether the pipe
 		 * become empty while we dropped the lock.
 		 */
-		__pipe_unlock(pipe);
+		mutex_unlock(&pipe->mutex);
 		if (was_empty)
 			wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 		wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
-		__pipe_lock(pipe);
+		mutex_lock(&pipe->mutex);
 		was_empty = pipe_empty(pipe->head, pipe->tail);
 		wake_next_writer = true;
 	}
 out:
 	if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
 		wake_next_writer = false;
-	__pipe_unlock(pipe);
+	mutex_unlock(&pipe->mutex);
 
 	/*
 	 * If we do do a wakeup event, we do a 'sync' wakeup, because we
@@ -629,7 +619,7 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 
 	switch (cmd) {
 	case FIONREAD:
-		__pipe_lock(pipe);
+		mutex_lock(&pipe->mutex);
 		count = 0;
 		head = pipe->head;
 		tail = pipe->tail;
@@ -639,16 +629,16 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			count += pipe->bufs[tail & mask].len;
 			tail++;
 		}
-		__pipe_unlock(pipe);
+		mutex_unlock(&pipe->mutex);
 
 		return put_user(count, (int __user *)arg);
 
 #ifdef CONFIG_WATCH_QUEUE
 	case IOC_WATCH_QUEUE_SET_SIZE: {
 		int ret;
-		__pipe_lock(pipe);
+		mutex_lock(&pipe->mutex);
 		ret = watch_queue_set_size(pipe, arg);
-		__pipe_unlock(pipe);
+		mutex_unlock(&pipe->mutex);
 		return ret;
 	}
 
@@ -734,7 +724,7 @@ pipe_release(struct inode *inode, struct file *file)
 {
 	struct pipe_inode_info *pipe = file->private_data;
 
-	__pipe_lock(pipe);
+	mutex_lock(&pipe->mutex);
 	if (file->f_mode & FMODE_READ)
 		pipe->readers--;
 	if (file->f_mode & FMODE_WRITE)
@@ -747,7 +737,7 @@ pipe_release(struct inode *inode, struct file *file)
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 	}
-	__pipe_unlock(pipe);
+	mutex_unlock(&pipe->mutex);
 
 	put_pipe_info(inode, pipe);
 	return 0;
@@ -759,7 +749,7 @@ pipe_fasync(int fd, struct file *filp, int on)
 	struct pipe_inode_info *pipe = filp->private_data;
 	int retval = 0;
 
-	__pipe_lock(pipe);
+	mutex_lock(&pipe->mutex);
 	if (filp->f_mode & FMODE_READ)
 		retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
 	if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
@@ -768,7 +758,7 @@ pipe_fasync(int fd, struct file *filp, int on)
 			/* this can happen only if on == T */
 			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
 	}
-	__pipe_unlock(pipe);
+	mutex_unlock(&pipe->mutex);
 	return retval;
 }
 
@@ -834,6 +824,7 @@ struct pipe_inode_info *alloc_pipe_info(void)
 		pipe->nr_accounted = pipe_bufs;
 		pipe->user = user;
 		mutex_init(&pipe->mutex);
+		lock_set_cmp_fn(&pipe->mutex, pipe_lock_cmp_fn, NULL);
 		return pipe;
 	}
 
@@ -1144,7 +1135,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
 	filp->private_data = pipe;
 	/* OK, we have a pipe and it's pinned down */
 
-	__pipe_lock(pipe);
+	mutex_lock(&pipe->mutex);
 
 	/* We can only do regular read/write on fifos */
 	stream_open(inode, filp);
@@ -1214,7 +1205,7 @@ static int fifo_open(struct inode *inode, struct file *filp)
 	}
 
 	/* Ok! */
-	__pipe_unlock(pipe);
+	mutex_unlock(&pipe->mutex);
 	return 0;
 
 err_rd:
@@ -1230,7 +1221,7 @@ err_wr:
 	goto err;
 
 err:
-	__pipe_unlock(pipe);
+	mutex_unlock(&pipe->mutex);
 
 	put_pipe_info(inode, pipe);
 	return ret;
@@ -1411,7 +1402,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
 	if (!pipe)
 		return -EBADF;
 
-	__pipe_lock(pipe);
+	mutex_lock(&pipe->mutex);
 
 	switch (cmd) {
 	case F_SETPIPE_SZ:
@@ -1425,7 +1416,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg)
 		break;
 	}
 
-	__pipe_unlock(pipe);
+	mutex_unlock(&pipe->mutex);
 	return ret;
 }
 
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index e1af20893ebe..3f87297dbfdb 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -26,7 +26,6 @@
 #include <linux/mnt_idmapping.h>
 #include <linux/iversion.h>
 #include <linux/security.h>
-#include <linux/evm.h>
 #include <linux/fsnotify.h>
 #include <linux/filelock.h>
 
@@ -786,12 +785,12 @@ struct posix_acl *posix_acl_from_xattr(struct user_namespace *userns,
 		return ERR_PTR(count);
 	if (count == 0)
 		return NULL;
-	
+
 	acl = posix_acl_alloc(count, GFP_NOFS);
 	if (!acl)
 		return ERR_PTR(-ENOMEM);
 	acl_e = acl->a_entries;
-	
+
 	for (end = entry + count; entry != end; acl_e++, entry++) {
 		acl_e->e_tag  = le16_to_cpu(entry->e_tag);
 		acl_e->e_perm = le16_to_cpu(entry->e_perm);
@@ -1137,7 +1136,7 @@ retry_deleg:
 		error = -EIO;
 	if (!error) {
 		fsnotify_xattr(dentry);
-		evm_inode_post_set_acl(dentry, acl_name, kacl);
+		security_inode_post_set_acl(dentry, acl_name, kacl);
 	}
 
 out_inode_unlock:
@@ -1245,7 +1244,7 @@ retry_deleg:
 		error = -EIO;
 	if (!error) {
 		fsnotify_xattr(dentry);
-		evm_inode_post_remove_acl(idmap, dentry, acl_name);
+		security_inode_post_remove_acl(idmap, dentry, acl_name);
 	}
 
 out_inode_unlock:
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index 32b1116ae137..d80a1431ef7b 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -32,7 +32,7 @@ config PROC_FS
 config PROC_KCORE
 	bool "/proc/kcore support" if !ARM
 	depends on PROC_FS && MMU
-	select CRASH_CORE
+	select VMCORE_INFO
 	help
 	  Provides a virtual ELF core file of the live kernel.  This can
 	  be read with gdb and other ELF tools.  No modifications can be
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index bd08616ed8ba..7b4db9c56e6a 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -5,7 +5,7 @@
 
 obj-y   += proc.o
 
-CFLAGS_task_mmu.o	+= $(call cc-option,-Wno-override-init,)
+CFLAGS_task_mmu.o	+= -Wno-override-init
 proc-y			:= nommu.o task_nommu.o
 proc-$(CONFIG_MMU)	:= task_mmu.o
 
diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c
index 902b326e1e56..87dcaae32ff8 100644
--- a/fs/proc/bootconfig.c
+++ b/fs/proc/bootconfig.c
@@ -62,12 +62,12 @@ static int __init copy_xbc_key_value_list(char *dst, size_t size)
 				break;
 			dst += ret;
 		}
-		if (ret >= 0 && boot_command_line[0]) {
-			ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n",
-				       boot_command_line);
-			if (ret > 0)
-				dst += ret;
-		}
+	}
+	if (cmdline_has_extra_options() && ret >= 0 && boot_command_line[0]) {
+		ret = snprintf(dst, rest(dst, end), "# Parameters from bootloader:\n# %s\n",
+			       boot_command_line);
+		if (ret > 0)
+			dst += ret;
 	}
 out:
 	kfree(key);
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 05350f3c2812..dcd513dccf55 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -92,7 +92,7 @@ void __init proc_init_kmemcache(void)
 	proc_inode_cachep = kmem_cache_create("proc_inode_cache",
 					     sizeof(struct proc_inode),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD|SLAB_ACCOUNT|
+						SLAB_ACCOUNT|
 						SLAB_PANIC),
 					     init_once);
 	pde_opener_cache =
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 6422e569b080..8e08a9a1b7ed 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -10,7 +10,7 @@
  *	Safe accesses to vmalloc/direct-mapped discontiguous areas, Kanoj Sarcar <kanoj@sgi.com>
  */
 
-#include <linux/crash_core.h>
+#include <linux/vmcore_info.h>
 #include <linux/mm.h>
 #include <linux/proc_fs.h>
 #include <linux/kcore.h>
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3f78ebbb795f..23fbab954c20 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1352,8 +1352,7 @@ static inline pagemap_entry_t make_pme(u64 frame, u64 flags)
 	return (pagemap_entry_t) { .pme = (frame & PM_PFRAME_MASK) | flags };
 }
 
-static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
-			  struct pagemapread *pm)
+static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm)
 {
 	pm->buffer[pm->pos++] = *pme;
 	if (pm->pos >= pm->len)
@@ -1380,7 +1379,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
 			hole_end = end;
 
 		for (; addr < hole_end; addr += PAGE_SIZE) {
-			err = add_to_pagemap(addr, &pme, pm);
+			err = add_to_pagemap(&pme, pm);
 			if (err)
 				goto out;
 		}
@@ -1392,7 +1391,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
 		if (vma->vm_flags & VM_SOFTDIRTY)
 			pme = make_pme(0, PM_SOFT_DIRTY);
 		for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
-			err = add_to_pagemap(addr, &pme, pm);
+			err = add_to_pagemap(&pme, pm);
 			if (err)
 				goto out;
 		}
@@ -1519,7 +1518,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 		for (; addr != end; addr += PAGE_SIZE) {
 			pagemap_entry_t pme = make_pme(frame, flags);
 
-			err = add_to_pagemap(addr, &pme, pm);
+			err = add_to_pagemap(&pme, pm);
 			if (err)
 				break;
 			if (pm->show_pfn) {
@@ -1547,7 +1546,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 		pagemap_entry_t pme;
 
 		pme = pte_to_pagemap_entry(pm, vma, addr, ptep_get(pte));
-		err = add_to_pagemap(addr, &pme, pm);
+		err = add_to_pagemap(&pme, pm);
 		if (err)
 			break;
 	}
@@ -1597,7 +1596,7 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
 	for (; addr != end; addr += PAGE_SIZE) {
 		pagemap_entry_t pme = make_pme(frame, flags);
 
-		err = add_to_pagemap(addr, &pme, pm);
+		err = add_to_pagemap(&pme, pm);
 		if (err)
 			return err;
 		if (pm->show_pfn && (flags & PM_PRESENT))
@@ -1807,7 +1806,7 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
 		if (p->masks_of_interest & PAGE_IS_FILE) {
 			swp = pte_to_swp_entry(pte);
 			if (is_pfn_swap_entry(swp) &&
-			    !PageAnon(pfn_swap_entry_to_page(swp)))
+			    !folio_test_anon(pfn_swap_entry_folio(swp)))
 				categories |= PAGE_IS_FILE;
 		}
 		if (pte_swp_soft_dirty(pte))
@@ -1873,7 +1872,7 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
 		if (p->masks_of_interest & PAGE_IS_FILE) {
 			swp = pmd_to_swp_entry(pmd);
 			if (is_pfn_swap_entry(swp) &&
-			    !PageAnon(pfn_swap_entry_to_page(swp)))
+			    !folio_test_anon(pfn_swap_entry_folio(swp)))
 				categories |= PAGE_IS_FILE;
 		}
 	}
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index d0d9bfdad30c..56815799ce79 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -307,7 +307,6 @@ int pstore_put_backend_records(struct pstore_info *psi)
 {
 	struct pstore_private *pos, *tmp;
 	struct dentry *root;
-	int rc = 0;
 
 	root = psinfo_lock_root();
 	if (!root)
@@ -317,11 +316,8 @@ int pstore_put_backend_records(struct pstore_info *psi)
 		list_for_each_entry_safe(pos, tmp, &records_list, list) {
 			if (pos->record->psi == psi) {
 				list_del_init(&pos->list);
-				rc = simple_unlink(d_inode(root), pos->dentry);
-				if (WARN_ON(rc))
-					break;
-				d_drop(pos->dentry);
-				dput(pos->dentry);
+				d_invalidate(pos->dentry);
+				simple_unlink(d_inode(root), pos->dentry);
 				pos->dentry = NULL;
 			}
 		}
@@ -329,7 +325,7 @@ int pstore_put_backend_records(struct pstore_info *psi)
 
 	inode_unlock(d_inode(root));
 
-	return rc;
+	return 0;
 }
 
 /*
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 88b34fdbf759..b1a455f42e93 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -893,6 +893,7 @@ static const struct of_device_id dt_match[] = {
 	{ .compatible = "ramoops" },
 	{}
 };
+MODULE_DEVICE_TABLE(of, dt_match);
 
 static struct platform_driver ramoops_driver = {
 	.probe		= ramoops_probe,
diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c
index 2770746bb7aa..694db616663f 100644
--- a/fs/pstore/zone.c
+++ b/fs/pstore/zone.c
@@ -973,6 +973,8 @@ static ssize_t psz_kmsg_read(struct pstore_zone *zone,
 		char *buf = kasprintf(GFP_KERNEL, "%s: Total %d times\n",
 				      kmsg_dump_reason_str(record->reason),
 				      record->count);
+		if (!buf)
+			return -ENOMEM;
 		hlen = strlen(buf);
 		record->buf = krealloc(buf, hlen + size, GFP_KERNEL);
 		if (!record->buf) {
@@ -1215,7 +1217,6 @@ static struct pstore_zone **psz_init_zones(enum pstore_type_id type,
 		pr_err("allocate for zones %s failed\n", name);
 		return ERR_PTR(-ENOMEM);
 	}
-	memset(zones, 0, c * sizeof(*zones));
 
 	for (i = 0; i < c; i++) {
 		zone = psz_init_zone(type, off, record_size);
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 6eb9bb369b57..d79841e94428 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -21,6 +21,7 @@
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
 #include <linux/statfs.h>
+#include <linux/fs_context.h>
 #include "qnx4.h"
 
 #define QNX4_VERSION  4
@@ -30,28 +31,33 @@ static const struct super_operations qnx4_sops;
 
 static struct inode *qnx4_alloc_inode(struct super_block *sb);
 static void qnx4_free_inode(struct inode *inode);
-static int qnx4_remount(struct super_block *sb, int *flags, char *data);
 static int qnx4_statfs(struct dentry *, struct kstatfs *);
+static int qnx4_get_tree(struct fs_context *fc);
 
 static const struct super_operations qnx4_sops =
 {
 	.alloc_inode	= qnx4_alloc_inode,
 	.free_inode	= qnx4_free_inode,
 	.statfs		= qnx4_statfs,
-	.remount_fs	= qnx4_remount,
 };
 
-static int qnx4_remount(struct super_block *sb, int *flags, char *data)
+static int qnx4_reconfigure(struct fs_context *fc)
 {
+	struct super_block *sb = fc->root->d_sb;
 	struct qnx4_sb_info *qs;
 
 	sync_filesystem(sb);
 	qs = qnx4_sb(sb);
 	qs->Version = QNX4_VERSION;
-	*flags |= SB_RDONLY;
+	fc->sb_flags |= SB_RDONLY;
 	return 0;
 }
 
+static const struct fs_context_operations qnx4_context_opts = {
+	.get_tree	= qnx4_get_tree,
+	.reconfigure	= qnx4_reconfigure,
+};
+
 static int qnx4_get_block( struct inode *inode, sector_t iblock, struct buffer_head *bh, int create )
 {
 	unsigned long phys;
@@ -183,12 +189,13 @@ static const char *qnx4_checkroot(struct super_block *sb,
 	return "bitmap file not found.";
 }
 
-static int qnx4_fill_super(struct super_block *s, void *data, int silent)
+static int qnx4_fill_super(struct super_block *s, struct fs_context *fc)
 {
 	struct buffer_head *bh;
 	struct inode *root;
 	const char *errmsg;
 	struct qnx4_sb_info *qs;
+	int silent = fc->sb_flags & SB_SILENT;
 
 	qs = kzalloc(sizeof(struct qnx4_sb_info), GFP_KERNEL);
 	if (!qs)
@@ -216,7 +223,7 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
 	errmsg = qnx4_checkroot(s, (struct qnx4_super_block *) bh->b_data);
 	brelse(bh);
 	if (errmsg != NULL) {
- 		if (!silent)
+		if (!silent)
 			printk(KERN_ERR "qnx4: %s\n", errmsg);
 		return -EINVAL;
 	}
@@ -235,6 +242,18 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
 	return 0;
 }
 
+static int qnx4_get_tree(struct fs_context *fc)
+{
+	return get_tree_bdev(fc, qnx4_fill_super);
+}
+
+static int qnx4_init_fs_context(struct fs_context *fc)
+{
+	fc->ops = &qnx4_context_opts;
+
+	return 0;
+}
+
 static void qnx4_kill_sb(struct super_block *sb)
 {
 	struct qnx4_sb_info *qs = qnx4_sb(sb);
@@ -359,7 +378,7 @@ static int init_inodecache(void)
 	qnx4_inode_cachep = kmem_cache_create("qnx4_inode_cache",
 					     sizeof(struct qnx4_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+						SLAB_ACCOUNT),
 					     init_once);
 	if (qnx4_inode_cachep == NULL)
 		return -ENOMEM;
@@ -376,18 +395,12 @@ static void destroy_inodecache(void)
 	kmem_cache_destroy(qnx4_inode_cachep);
 }
 
-static struct dentry *qnx4_mount(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data)
-{
-	return mount_bdev(fs_type, flags, dev_name, data, qnx4_fill_super);
-}
-
 static struct file_system_type qnx4_fs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "qnx4",
-	.mount		= qnx4_mount,
-	.kill_sb	= qnx4_kill_sb,
-	.fs_flags	= FS_REQUIRES_DEV,
+	.owner			= THIS_MODULE,
+	.name			= "qnx4",
+	.kill_sb		= qnx4_kill_sb,
+	.fs_flags		= FS_REQUIRES_DEV,
+	.init_fs_context	= qnx4_init_fs_context,
 };
 MODULE_ALIAS_FS("qnx4");
 
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index a286c545717f..405913f4faff 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -615,7 +615,7 @@ static int init_inodecache(void)
 	qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache",
 					     sizeof(struct qnx6_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+						SLAB_ACCOUNT),
 					     init_once);
 	if (!qnx6_inode_cachep)
 		return -ENOMEM;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 1f0c754416b6..dacbee455c03 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -399,15 +399,17 @@ int dquot_mark_dquot_dirty(struct dquot *dquot)
 EXPORT_SYMBOL(dquot_mark_dquot_dirty);
 
 /* Dirtify all the dquots - this can block when journalling */
-static inline int mark_all_dquot_dirty(struct dquot * const *dquot)
+static inline int mark_all_dquot_dirty(struct dquot __rcu * const *dquots)
 {
 	int ret, err, cnt;
+	struct dquot *dquot;
 
 	ret = err = 0;
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (dquot[cnt])
+		dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+		if (dquot)
 			/* Even in case of error we have to continue */
-			ret = mark_dquot_dirty(dquot[cnt]);
+			ret = mark_dquot_dirty(dquot);
 		if (!err)
 			err = ret;
 	}
@@ -875,10 +877,7 @@ void dqput(struct dquot *dquot)
 	}
 
 	/* Need to release dquot? */
-#ifdef CONFIG_QUOTA_DEBUG
-	/* sanity check */
-	BUG_ON(!list_empty(&dquot->dq_free));
-#endif
+	WARN_ON_ONCE(!list_empty(&dquot->dq_free));
 	put_releasing_dquots(dquot);
 	atomic_dec(&dquot->dq_count);
 	spin_unlock(&dq_list_lock);
@@ -987,9 +986,8 @@ we_slept:
 	 * smp_mb__before_atomic() in dquot_acquire().
 	 */
 	smp_rmb();
-#ifdef CONFIG_QUOTA_DEBUG
-	BUG_ON(!dquot->dq_sb);	/* Has somebody invalidated entry under us? */
-#endif
+	/* Has somebody invalidated entry under us? */
+	WARN_ON_ONCE(hlist_unhashed(&dquot->dq_hash));
 out:
 	if (empty)
 		do_destroy_dquot(empty);
@@ -998,14 +996,14 @@ out:
 }
 EXPORT_SYMBOL(dqget);
 
-static inline struct dquot **i_dquot(struct inode *inode)
+static inline struct dquot __rcu **i_dquot(struct inode *inode)
 {
 	return inode->i_sb->s_op->get_dquots(inode);
 }
 
 static int dqinit_needed(struct inode *inode, int type)
 {
-	struct dquot * const *dquots;
+	struct dquot __rcu * const *dquots;
 	int cnt;
 
 	if (IS_NOQUOTA(inode))
@@ -1095,14 +1093,16 @@ static void remove_dquot_ref(struct super_block *sb, int type)
 		 */
 		spin_lock(&dq_data_lock);
 		if (!IS_NOQUOTA(inode)) {
-			struct dquot **dquots = i_dquot(inode);
-			struct dquot *dquot = dquots[type];
+			struct dquot __rcu **dquots = i_dquot(inode);
+			struct dquot *dquot = srcu_dereference_check(
+				dquots[type], &dquot_srcu,
+				lockdep_is_held(&dq_data_lock));
 
 #ifdef CONFIG_QUOTA_DEBUG
 			if (unlikely(inode_get_rsv_space(inode) > 0))
 				reserved = 1;
 #endif
-			dquots[type] = NULL;
+			rcu_assign_pointer(dquots[type], NULL);
 			if (dquot)
 				dqput(dquot);
 		}
@@ -1455,7 +1455,8 @@ static int inode_quota_active(const struct inode *inode)
 static int __dquot_initialize(struct inode *inode, int type)
 {
 	int cnt, init_needed = 0;
-	struct dquot **dquots, *got[MAXQUOTAS] = {};
+	struct dquot __rcu **dquots;
+	struct dquot *got[MAXQUOTAS] = {};
 	struct super_block *sb = inode->i_sb;
 	qsize_t rsv;
 	int ret = 0;
@@ -1530,7 +1531,7 @@ static int __dquot_initialize(struct inode *inode, int type)
 		if (!got[cnt])
 			continue;
 		if (!dquots[cnt]) {
-			dquots[cnt] = got[cnt];
+			rcu_assign_pointer(dquots[cnt], got[cnt]);
 			got[cnt] = NULL;
 			/*
 			 * Make quota reservation system happy if someone
@@ -1538,12 +1539,16 @@ static int __dquot_initialize(struct inode *inode, int type)
 			 */
 			rsv = inode_get_rsv_space(inode);
 			if (unlikely(rsv)) {
+				struct dquot *dquot = srcu_dereference_check(
+					dquots[cnt], &dquot_srcu,
+					lockdep_is_held(&dq_data_lock));
+
 				spin_lock(&inode->i_lock);
 				/* Get reservation again under proper lock */
 				rsv = __inode_get_rsv_space(inode);
-				spin_lock(&dquots[cnt]->dq_dqb_lock);
-				dquots[cnt]->dq_dqb.dqb_rsvspace += rsv;
-				spin_unlock(&dquots[cnt]->dq_dqb_lock);
+				spin_lock(&dquot->dq_dqb_lock);
+				dquot->dq_dqb.dqb_rsvspace += rsv;
+				spin_unlock(&dquot->dq_dqb_lock);
 				spin_unlock(&inode->i_lock);
 			}
 		}
@@ -1565,7 +1570,7 @@ EXPORT_SYMBOL(dquot_initialize);
 
 bool dquot_initialize_needed(struct inode *inode)
 {
-	struct dquot **dquots;
+	struct dquot __rcu **dquots;
 	int i;
 
 	if (!inode_quota_active(inode))
@@ -1590,13 +1595,14 @@ EXPORT_SYMBOL(dquot_initialize_needed);
 static void __dquot_drop(struct inode *inode)
 {
 	int cnt;
-	struct dquot **dquots = i_dquot(inode);
+	struct dquot __rcu **dquots = i_dquot(inode);
 	struct dquot *put[MAXQUOTAS];
 
 	spin_lock(&dq_data_lock);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		put[cnt] = dquots[cnt];
-		dquots[cnt] = NULL;
+		put[cnt] = srcu_dereference_check(dquots[cnt], &dquot_srcu,
+					lockdep_is_held(&dq_data_lock));
+		rcu_assign_pointer(dquots[cnt], NULL);
 	}
 	spin_unlock(&dq_data_lock);
 	dqput_all(put);
@@ -1604,7 +1610,7 @@ static void __dquot_drop(struct inode *inode)
 
 void dquot_drop(struct inode *inode)
 {
-	struct dquot * const *dquots;
+	struct dquot __rcu * const *dquots;
 	int cnt;
 
 	if (IS_NOQUOTA(inode))
@@ -1677,7 +1683,8 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
 	int cnt, ret = 0, index;
 	struct dquot_warn warn[MAXQUOTAS];
 	int reserve = flags & DQUOT_SPACE_RESERVE;
-	struct dquot **dquots;
+	struct dquot __rcu **dquots;
+	struct dquot *dquot;
 
 	if (!inode_quota_active(inode)) {
 		if (reserve) {
@@ -1697,27 +1704,26 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
 	index = srcu_read_lock(&dquot_srcu);
 	spin_lock(&inode->i_lock);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (!dquots[cnt])
+		dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+		if (!dquot)
 			continue;
 		if (reserve) {
-			ret = dquot_add_space(dquots[cnt], 0, number, flags,
-					      &warn[cnt]);
+			ret = dquot_add_space(dquot, 0, number, flags, &warn[cnt]);
 		} else {
-			ret = dquot_add_space(dquots[cnt], number, 0, flags,
-					      &warn[cnt]);
+			ret = dquot_add_space(dquot, number, 0, flags, &warn[cnt]);
 		}
 		if (ret) {
 			/* Back out changes we already did */
 			for (cnt--; cnt >= 0; cnt--) {
-				if (!dquots[cnt])
+				dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+				if (!dquot)
 					continue;
-				spin_lock(&dquots[cnt]->dq_dqb_lock);
+				spin_lock(&dquot->dq_dqb_lock);
 				if (reserve)
-					dquot_free_reserved_space(dquots[cnt],
-								  number);
+					dquot_free_reserved_space(dquot, number);
 				else
-					dquot_decr_space(dquots[cnt], number);
-				spin_unlock(&dquots[cnt]->dq_dqb_lock);
+					dquot_decr_space(dquot, number);
+				spin_unlock(&dquot->dq_dqb_lock);
 			}
 			spin_unlock(&inode->i_lock);
 			goto out_flush_warn;
@@ -1747,7 +1753,8 @@ int dquot_alloc_inode(struct inode *inode)
 {
 	int cnt, ret = 0, index;
 	struct dquot_warn warn[MAXQUOTAS];
-	struct dquot * const *dquots;
+	struct dquot __rcu * const *dquots;
+	struct dquot *dquot;
 
 	if (!inode_quota_active(inode))
 		return 0;
@@ -1758,17 +1765,19 @@ int dquot_alloc_inode(struct inode *inode)
 	index = srcu_read_lock(&dquot_srcu);
 	spin_lock(&inode->i_lock);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (!dquots[cnt])
+		dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+		if (!dquot)
 			continue;
-		ret = dquot_add_inodes(dquots[cnt], 1, &warn[cnt]);
+		ret = dquot_add_inodes(dquot, 1, &warn[cnt]);
 		if (ret) {
 			for (cnt--; cnt >= 0; cnt--) {
-				if (!dquots[cnt])
+				dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+				if (!dquot)
 					continue;
 				/* Back out changes we already did */
-				spin_lock(&dquots[cnt]->dq_dqb_lock);
-				dquot_decr_inodes(dquots[cnt], 1);
-				spin_unlock(&dquots[cnt]->dq_dqb_lock);
+				spin_lock(&dquot->dq_dqb_lock);
+				dquot_decr_inodes(dquot, 1);
+				spin_unlock(&dquot->dq_dqb_lock);
 			}
 			goto warn_put_all;
 		}
@@ -1789,7 +1798,8 @@ EXPORT_SYMBOL(dquot_alloc_inode);
  */
 void dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
 {
-	struct dquot **dquots;
+	struct dquot __rcu **dquots;
+	struct dquot *dquot;
 	int cnt, index;
 
 	if (!inode_quota_active(inode)) {
@@ -1805,9 +1815,8 @@ void dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
 	spin_lock(&inode->i_lock);
 	/* Claim reserved quotas to allocated quotas */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (dquots[cnt]) {
-			struct dquot *dquot = dquots[cnt];
-
+		dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+		if (dquot) {
 			spin_lock(&dquot->dq_dqb_lock);
 			if (WARN_ON_ONCE(dquot->dq_dqb.dqb_rsvspace < number))
 				number = dquot->dq_dqb.dqb_rsvspace;
@@ -1831,7 +1840,8 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty);
  */
 void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
 {
-	struct dquot **dquots;
+	struct dquot __rcu **dquots;
+	struct dquot *dquot;
 	int cnt, index;
 
 	if (!inode_quota_active(inode)) {
@@ -1847,9 +1857,8 @@ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
 	spin_lock(&inode->i_lock);
 	/* Claim reserved quotas to allocated quotas */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (dquots[cnt]) {
-			struct dquot *dquot = dquots[cnt];
-
+		dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+		if (dquot) {
 			spin_lock(&dquot->dq_dqb_lock);
 			if (WARN_ON_ONCE(dquot->dq_dqb.dqb_curspace < number))
 				number = dquot->dq_dqb.dqb_curspace;
@@ -1875,7 +1884,8 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
 {
 	unsigned int cnt;
 	struct dquot_warn warn[MAXQUOTAS];
-	struct dquot **dquots;
+	struct dquot __rcu **dquots;
+	struct dquot *dquot;
 	int reserve = flags & DQUOT_SPACE_RESERVE, index;
 
 	if (!inode_quota_active(inode)) {
@@ -1896,17 +1906,18 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
 		int wtype;
 
 		warn[cnt].w_type = QUOTA_NL_NOWARN;
-		if (!dquots[cnt])
+		dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+		if (!dquot)
 			continue;
-		spin_lock(&dquots[cnt]->dq_dqb_lock);
-		wtype = info_bdq_free(dquots[cnt], number);
+		spin_lock(&dquot->dq_dqb_lock);
+		wtype = info_bdq_free(dquot, number);
 		if (wtype != QUOTA_NL_NOWARN)
-			prepare_warning(&warn[cnt], dquots[cnt], wtype);
+			prepare_warning(&warn[cnt], dquot, wtype);
 		if (reserve)
-			dquot_free_reserved_space(dquots[cnt], number);
+			dquot_free_reserved_space(dquot, number);
 		else
-			dquot_decr_space(dquots[cnt], number);
-		spin_unlock(&dquots[cnt]->dq_dqb_lock);
+			dquot_decr_space(dquot, number);
+		spin_unlock(&dquot->dq_dqb_lock);
 	}
 	if (reserve)
 		*inode_reserved_space(inode) -= number;
@@ -1930,7 +1941,8 @@ void dquot_free_inode(struct inode *inode)
 {
 	unsigned int cnt;
 	struct dquot_warn warn[MAXQUOTAS];
-	struct dquot * const *dquots;
+	struct dquot __rcu * const *dquots;
+	struct dquot *dquot;
 	int index;
 
 	if (!inode_quota_active(inode))
@@ -1941,16 +1953,16 @@ void dquot_free_inode(struct inode *inode)
 	spin_lock(&inode->i_lock);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
 		int wtype;
-
 		warn[cnt].w_type = QUOTA_NL_NOWARN;
-		if (!dquots[cnt])
+		dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+		if (!dquot)
 			continue;
-		spin_lock(&dquots[cnt]->dq_dqb_lock);
-		wtype = info_idq_free(dquots[cnt], 1);
+		spin_lock(&dquot->dq_dqb_lock);
+		wtype = info_idq_free(dquot, 1);
 		if (wtype != QUOTA_NL_NOWARN)
-			prepare_warning(&warn[cnt], dquots[cnt], wtype);
-		dquot_decr_inodes(dquots[cnt], 1);
-		spin_unlock(&dquots[cnt]->dq_dqb_lock);
+			prepare_warning(&warn[cnt], dquot, wtype);
+		dquot_decr_inodes(dquot, 1);
+		spin_unlock(&dquot->dq_dqb_lock);
 	}
 	spin_unlock(&inode->i_lock);
 	mark_all_dquot_dirty(dquots);
@@ -1976,8 +1988,9 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 	qsize_t cur_space;
 	qsize_t rsv_space = 0;
 	qsize_t inode_usage = 1;
+	struct dquot __rcu **dquots;
 	struct dquot *transfer_from[MAXQUOTAS] = {};
-	int cnt, ret = 0;
+	int cnt, index, ret = 0;
 	char is_valid[MAXQUOTAS] = {};
 	struct dquot_warn warn_to[MAXQUOTAS];
 	struct dquot_warn warn_from_inodes[MAXQUOTAS];
@@ -2008,6 +2021,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 	}
 	cur_space = __inode_get_bytes(inode);
 	rsv_space = __inode_get_rsv_space(inode);
+	dquots = i_dquot(inode);
 	/*
 	 * Build the transfer_from list, check limits, and update usage in
 	 * the target structures.
@@ -2022,7 +2036,8 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 		if (!sb_has_quota_active(inode->i_sb, cnt))
 			continue;
 		is_valid[cnt] = 1;
-		transfer_from[cnt] = i_dquot(inode)[cnt];
+		transfer_from[cnt] = srcu_dereference_check(dquots[cnt],
+				&dquot_srcu, lockdep_is_held(&dq_data_lock));
 		ret = dquot_add_inodes(transfer_to[cnt], inode_usage,
 				       &warn_to[cnt]);
 		if (ret)
@@ -2061,13 +2076,21 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
 						  rsv_space);
 			spin_unlock(&transfer_from[cnt]->dq_dqb_lock);
 		}
-		i_dquot(inode)[cnt] = transfer_to[cnt];
+		rcu_assign_pointer(dquots[cnt], transfer_to[cnt]);
 	}
 	spin_unlock(&inode->i_lock);
 	spin_unlock(&dq_data_lock);
 
-	mark_all_dquot_dirty(transfer_from);
-	mark_all_dquot_dirty(transfer_to);
+	/*
+	 * These arrays are local and we hold dquot references so we don't need
+	 * the srcu protection but still take dquot_srcu to avoid warning in
+	 * mark_all_dquot_dirty().
+	 */
+	index = srcu_read_lock(&dquot_srcu);
+	mark_all_dquot_dirty((struct dquot __rcu **)transfer_from);
+	mark_all_dquot_dirty((struct dquot __rcu **)transfer_to);
+	srcu_read_unlock(&dquot_srcu, index);
+
 	flush_warnings(warn_to);
 	flush_warnings(warn_from_inodes);
 	flush_warnings(warn_from_space);
@@ -2388,7 +2411,8 @@ int dquot_load_quota_sb(struct super_block *sb, int type, int format_id,
 	lockdep_assert_held_write(&sb->s_umount);
 
 	/* Just unsuspend quotas? */
-	BUG_ON(flags & DQUOT_SUSPENDED);
+	if (WARN_ON_ONCE(flags & DQUOT_SUSPENDED))
+		return -EINVAL;
 
 	if (!fmt)
 		return -ESRCH;
@@ -2984,7 +3008,7 @@ static int __init dquot_init(void)
 	dquot_cachep = kmem_cache_create("dquot",
 			sizeof(struct dquot), sizeof(unsigned long) * 4,
 			(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
-				SLAB_MEM_SPREAD|SLAB_PANIC),
+				SLAB_PANIC),
 			NULL);
 
 	order = 0;
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index 0f1493e0f6d0..afceef3ddfaa 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -21,6 +21,12 @@ MODULE_AUTHOR("Jan Kara");
 MODULE_DESCRIPTION("Quota trie support");
 MODULE_LICENSE("GPL");
 
+/*
+ * Maximum quota tree depth we support. Only to limit recursion when working
+ * with the tree.
+ */
+#define MAX_QTREE_DEPTH 6
+
 #define __QUOTA_QT_PARANOIA
 
 static int __get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth)
@@ -108,7 +114,7 @@ static int check_dquot_block_header(struct qtree_mem_dqinfo *info,
 /* Remove empty block from list and return it */
 static int get_free_dqblk(struct qtree_mem_dqinfo *info)
 {
-	char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+	char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
 	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
 	int ret, blk;
 
@@ -160,7 +166,7 @@ static int put_free_dqblk(struct qtree_mem_dqinfo *info, char *buf, uint blk)
 static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
 			       uint blk)
 {
-	char *tmpbuf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+	char *tmpbuf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
 	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
 	uint nextblk = le32_to_cpu(dh->dqdh_next_free);
 	uint prevblk = le32_to_cpu(dh->dqdh_prev_free);
@@ -207,7 +213,7 @@ out_buf:
 static int insert_free_dqentry(struct qtree_mem_dqinfo *info, char *buf,
 			       uint blk)
 {
-	char *tmpbuf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+	char *tmpbuf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
 	struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
 	int err;
 
@@ -255,7 +261,7 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
 {
 	uint blk, i;
 	struct qt_disk_dqdbheader *dh;
-	char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+	char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
 	char *ddquot;
 
 	*err = 0;
@@ -327,27 +333,36 @@ out_buf:
 
 /* Insert reference to structure into the trie */
 static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
-			  uint *treeblk, int depth)
+			  uint *blks, int depth)
 {
-	char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+	char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
 	int ret = 0, newson = 0, newact = 0;
 	__le32 *ref;
 	uint newblk;
+	int i;
 
 	if (!buf)
 		return -ENOMEM;
-	if (!*treeblk) {
+	if (!blks[depth]) {
 		ret = get_free_dqblk(info);
 		if (ret < 0)
 			goto out_buf;
-		*treeblk = ret;
+		for (i = 0; i < depth; i++)
+			if (ret == blks[i]) {
+				quota_error(dquot->dq_sb,
+					"Free block already used in tree: block %u",
+					ret);
+				ret = -EIO;
+				goto out_buf;
+			}
+		blks[depth] = ret;
 		memset(buf, 0, info->dqi_usable_bs);
 		newact = 1;
 	} else {
-		ret = read_blk(info, *treeblk, buf);
+		ret = read_blk(info, blks[depth], buf);
 		if (ret < 0) {
 			quota_error(dquot->dq_sb, "Can't read tree quota "
-				    "block %u", *treeblk);
+				    "block %u", blks[depth]);
 			goto out_buf;
 		}
 	}
@@ -357,8 +372,20 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 			     info->dqi_blocks - 1);
 	if (ret)
 		goto out_buf;
-	if (!newblk)
+	if (!newblk) {
 		newson = 1;
+	} else {
+		for (i = 0; i <= depth; i++)
+			if (newblk == blks[i]) {
+				quota_error(dquot->dq_sb,
+					"Cycle in quota tree detected: block %u index %u",
+					blks[depth],
+					get_index(info, dquot->dq_id, depth));
+				ret = -EIO;
+				goto out_buf;
+			}
+	}
+	blks[depth + 1] = newblk;
 	if (depth == info->dqi_qtree_depth - 1) {
 #ifdef __QUOTA_QT_PARANOIA
 		if (newblk) {
@@ -370,16 +397,16 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 			goto out_buf;
 		}
 #endif
-		newblk = find_free_dqentry(info, dquot, &ret);
+		blks[depth + 1] = find_free_dqentry(info, dquot, &ret);
 	} else {
-		ret = do_insert_tree(info, dquot, &newblk, depth+1);
+		ret = do_insert_tree(info, dquot, blks, depth + 1);
 	}
 	if (newson && ret >= 0) {
 		ref[get_index(info, dquot->dq_id, depth)] =
-							cpu_to_le32(newblk);
-		ret = write_blk(info, *treeblk, buf);
+						cpu_to_le32(blks[depth + 1]);
+		ret = write_blk(info, blks[depth], buf);
 	} else if (newact && ret < 0) {
-		put_free_dqblk(info, buf, *treeblk);
+		put_free_dqblk(info, buf, blks[depth]);
 	}
 out_buf:
 	kfree(buf);
@@ -390,7 +417,7 @@ out_buf:
 static inline int dq_insert_tree(struct qtree_mem_dqinfo *info,
 				 struct dquot *dquot)
 {
-	int tmp = QT_TREEOFF;
+	uint blks[MAX_QTREE_DEPTH] = { QT_TREEOFF };
 
 #ifdef __QUOTA_QT_PARANOIA
 	if (info->dqi_blocks <= QT_TREEOFF) {
@@ -398,7 +425,11 @@ static inline int dq_insert_tree(struct qtree_mem_dqinfo *info,
 		return -EIO;
 	}
 #endif
-	return do_insert_tree(info, dquot, &tmp, 0);
+	if (info->dqi_qtree_depth >= MAX_QTREE_DEPTH) {
+		quota_error(dquot->dq_sb, "Quota tree depth too big!");
+		return -EIO;
+	}
+	return do_insert_tree(info, dquot, blks, 0);
 }
 
 /*
@@ -410,7 +441,7 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 	int type = dquot->dq_id.type;
 	struct super_block *sb = dquot->dq_sb;
 	ssize_t ret;
-	char *ddquot = kmalloc(info->dqi_entry_size, GFP_NOFS);
+	char *ddquot = kmalloc(info->dqi_entry_size, GFP_KERNEL);
 
 	if (!ddquot)
 		return -ENOMEM;
@@ -449,7 +480,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 			uint blk)
 {
 	struct qt_disk_dqdbheader *dh;
-	char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+	char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
 	int ret = 0;
 
 	if (!buf)
@@ -511,19 +542,20 @@ out_buf:
 
 /* Remove reference to dquot from tree */
 static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
-		       uint *blk, int depth)
+		       uint *blks, int depth)
 {
-	char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+	char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
 	int ret = 0;
 	uint newblk;
 	__le32 *ref = (__le32 *)buf;
+	int i;
 
 	if (!buf)
 		return -ENOMEM;
-	ret = read_blk(info, *blk, buf);
+	ret = read_blk(info, blks[depth], buf);
 	if (ret < 0) {
 		quota_error(dquot->dq_sb, "Can't read quota data block %u",
-			    *blk);
+			    blks[depth]);
 		goto out_buf;
 	}
 	newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
@@ -532,29 +564,38 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
 	if (ret)
 		goto out_buf;
 
+	for (i = 0; i <= depth; i++)
+		if (newblk == blks[i]) {
+			quota_error(dquot->dq_sb,
+				"Cycle in quota tree detected: block %u index %u",
+				blks[depth],
+				get_index(info, dquot->dq_id, depth));
+			ret = -EIO;
+			goto out_buf;
+		}
 	if (depth == info->dqi_qtree_depth - 1) {
 		ret = free_dqentry(info, dquot, newblk);
-		newblk = 0;
+		blks[depth + 1] = 0;
 	} else {
-		ret = remove_tree(info, dquot, &newblk, depth+1);
+		blks[depth + 1] = newblk;
+		ret = remove_tree(info, dquot, blks, depth + 1);
 	}
-	if (ret >= 0 && !newblk) {
-		int i;
+	if (ret >= 0 && !blks[depth + 1]) {
 		ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(0);
 		/* Block got empty? */
 		for (i = 0; i < (info->dqi_usable_bs >> 2) && !ref[i]; i++)
 			;
 		/* Don't put the root block into the free block list */
 		if (i == (info->dqi_usable_bs >> 2)
-		    && *blk != QT_TREEOFF) {
-			put_free_dqblk(info, buf, *blk);
-			*blk = 0;
+		    && blks[depth] != QT_TREEOFF) {
+			put_free_dqblk(info, buf, blks[depth]);
+			blks[depth] = 0;
 		} else {
-			ret = write_blk(info, *blk, buf);
+			ret = write_blk(info, blks[depth], buf);
 			if (ret < 0)
 				quota_error(dquot->dq_sb,
 					    "Can't write quota tree block %u",
-					    *blk);
+					    blks[depth]);
 		}
 	}
 out_buf:
@@ -565,11 +606,15 @@ out_buf:
 /* Delete dquot from tree */
 int qtree_delete_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 {
-	uint tmp = QT_TREEOFF;
+	uint blks[MAX_QTREE_DEPTH] = { QT_TREEOFF };
 
 	if (!dquot->dq_off)	/* Even not allocated? */
 		return 0;
-	return remove_tree(info, dquot, &tmp, 0);
+	if (info->dqi_qtree_depth >= MAX_QTREE_DEPTH) {
+		quota_error(dquot->dq_sb, "Quota tree depth too big!");
+		return -EIO;
+	}
+	return remove_tree(info, dquot, blks, 0);
 }
 EXPORT_SYMBOL(qtree_delete_dquot);
 
@@ -577,7 +622,7 @@ EXPORT_SYMBOL(qtree_delete_dquot);
 static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
 				 struct dquot *dquot, uint blk)
 {
-	char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+	char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
 	loff_t ret = 0;
 	int i;
 	char *ddquot;
@@ -613,18 +658,20 @@ out_buf:
 
 /* Find entry for given id in the tree */
 static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
-				struct dquot *dquot, uint blk, int depth)
+				struct dquot *dquot, uint *blks, int depth)
 {
-	char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+	char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
 	loff_t ret = 0;
 	__le32 *ref = (__le32 *)buf;
+	uint blk;
+	int i;
 
 	if (!buf)
 		return -ENOMEM;
-	ret = read_blk(info, blk, buf);
+	ret = read_blk(info, blks[depth], buf);
 	if (ret < 0) {
 		quota_error(dquot->dq_sb, "Can't read quota tree block %u",
-			    blk);
+			    blks[depth]);
 		goto out_buf;
 	}
 	ret = 0;
@@ -636,8 +683,19 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
 	if (ret)
 		goto out_buf;
 
+	/* Check for cycles in the tree */
+	for (i = 0; i <= depth; i++)
+		if (blk == blks[i]) {
+			quota_error(dquot->dq_sb,
+				"Cycle in quota tree detected: block %u index %u",
+				blks[depth],
+				get_index(info, dquot->dq_id, depth));
+			ret = -EIO;
+			goto out_buf;
+		}
+	blks[depth + 1] = blk;
 	if (depth < info->dqi_qtree_depth - 1)
-		ret = find_tree_dqentry(info, dquot, blk, depth+1);
+		ret = find_tree_dqentry(info, dquot, blks, depth + 1);
 	else
 		ret = find_block_dqentry(info, dquot, blk);
 out_buf:
@@ -649,7 +707,13 @@ out_buf:
 static inline loff_t find_dqentry(struct qtree_mem_dqinfo *info,
 				  struct dquot *dquot)
 {
-	return find_tree_dqentry(info, dquot, QT_TREEOFF, 0);
+	uint blks[MAX_QTREE_DEPTH] = { QT_TREEOFF };
+
+	if (info->dqi_qtree_depth >= MAX_QTREE_DEPTH) {
+		quota_error(dquot->dq_sb, "Quota tree depth too big!");
+		return -EIO;
+	}
+	return find_tree_dqentry(info, dquot, blks, 0);
 }
 
 int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
@@ -684,7 +748,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
 		}
 		dquot->dq_off = offset;
 	}
-	ddquot = kmalloc(info->dqi_entry_size, GFP_NOFS);
+	ddquot = kmalloc(info->dqi_entry_size, GFP_KERNEL);
 	if (!ddquot)
 		return -ENOMEM;
 	ret = sb->s_op->quota_read(sb, type, ddquot, info->dqi_entry_size,
@@ -728,7 +792,7 @@ EXPORT_SYMBOL(qtree_release_dquot);
 static int find_next_id(struct qtree_mem_dqinfo *info, qid_t *id,
 			unsigned int blk, int depth)
 {
-	char *buf = kmalloc(info->dqi_usable_bs, GFP_NOFS);
+	char *buf = kmalloc(info->dqi_usable_bs, GFP_KERNEL);
 	__le32 *ref = (__le32 *)buf;
 	ssize_t ret;
 	unsigned int epb = info->dqi_usable_bs >> 2;
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index a0db3f195e95..3f3e8acc05db 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -160,9 +160,11 @@ static int v1_read_file_info(struct super_block *sb, int type)
 {
 	struct quota_info *dqopt = sb_dqopt(sb);
 	struct v1_disk_dqblk dqblk;
+	unsigned int memalloc;
 	int ret;
 
 	down_read(&dqopt->dqio_sem);
+	memalloc = memalloc_nofs_save();
 	ret = sb->s_op->quota_read(sb, type, (char *)&dqblk,
 				sizeof(struct v1_disk_dqblk), v1_dqoff(0));
 	if (ret != sizeof(struct v1_disk_dqblk)) {
@@ -179,6 +181,7 @@ static int v1_read_file_info(struct super_block *sb, int type)
 	dqopt->info[type].dqi_bgrace =
 			dqblk.dqb_btime ? dqblk.dqb_btime : MAX_DQ_TIME;
 out:
+	memalloc_nofs_restore(memalloc);
 	up_read(&dqopt->dqio_sem);
 	return ret;
 }
@@ -187,9 +190,11 @@ static int v1_write_file_info(struct super_block *sb, int type)
 {
 	struct quota_info *dqopt = sb_dqopt(sb);
 	struct v1_disk_dqblk dqblk;
+	unsigned int memalloc;
 	int ret;
 
 	down_write(&dqopt->dqio_sem);
+	memalloc = memalloc_nofs_save();
 	ret = sb->s_op->quota_read(sb, type, (char *)&dqblk,
 				sizeof(struct v1_disk_dqblk), v1_dqoff(0));
 	if (ret != sizeof(struct v1_disk_dqblk)) {
@@ -209,6 +214,7 @@ static int v1_write_file_info(struct super_block *sb, int type)
 	else if (ret >= 0)
 		ret = -EIO;
 out:
+	memalloc_nofs_restore(memalloc);
 	up_write(&dqopt->dqio_sem);
 	return ret;
 }
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index ae99e7b88205..c48c233f3bef 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -96,9 +96,11 @@ static int v2_read_file_info(struct super_block *sb, int type)
 	struct qtree_mem_dqinfo *qinfo;
 	ssize_t size;
 	unsigned int version;
+	unsigned int memalloc;
 	int ret;
 
 	down_read(&dqopt->dqio_sem);
+	memalloc = memalloc_nofs_save();
 	ret = v2_read_header(sb, type, &dqhead);
 	if (ret < 0)
 		goto out;
@@ -119,7 +121,7 @@ static int v2_read_file_info(struct super_block *sb, int type)
 			ret = -EIO;
 		goto out;
 	}
-	info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS);
+	info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_KERNEL);
 	if (!info->dqi_priv) {
 		ret = -ENOMEM;
 		goto out;
@@ -166,14 +168,17 @@ static int v2_read_file_info(struct super_block *sb, int type)
 		    i_size_read(sb_dqopt(sb)->files[type]));
 		goto out_free;
 	}
-	if (qinfo->dqi_free_blk >= qinfo->dqi_blocks) {
-		quota_error(sb, "Free block number too big (%u >= %u).",
-			    qinfo->dqi_free_blk, qinfo->dqi_blocks);
+	if (qinfo->dqi_free_blk && (qinfo->dqi_free_blk <= QT_TREEOFF ||
+	    qinfo->dqi_free_blk >= qinfo->dqi_blocks)) {
+		quota_error(sb, "Free block number %u out of range (%u, %u).",
+			    qinfo->dqi_free_blk, QT_TREEOFF, qinfo->dqi_blocks);
 		goto out_free;
 	}
-	if (qinfo->dqi_free_entry >= qinfo->dqi_blocks) {
-		quota_error(sb, "Block with free entry too big (%u >= %u).",
-			    qinfo->dqi_free_entry, qinfo->dqi_blocks);
+	if (qinfo->dqi_free_entry && (qinfo->dqi_free_entry <= QT_TREEOFF ||
+	    qinfo->dqi_free_entry >= qinfo->dqi_blocks)) {
+		quota_error(sb, "Block with free entry %u out of range (%u, %u).",
+			    qinfo->dqi_free_entry, QT_TREEOFF,
+			    qinfo->dqi_blocks);
 		goto out_free;
 	}
 	ret = 0;
@@ -183,6 +188,7 @@ out_free:
 		info->dqi_priv = NULL;
 	}
 out:
+	memalloc_nofs_restore(memalloc);
 	up_read(&dqopt->dqio_sem);
 	return ret;
 }
@@ -195,8 +201,10 @@ static int v2_write_file_info(struct super_block *sb, int type)
 	struct mem_dqinfo *info = &dqopt->info[type];
 	struct qtree_mem_dqinfo *qinfo = info->dqi_priv;
 	ssize_t size;
+	unsigned int memalloc;
 
 	down_write(&dqopt->dqio_sem);
+	memalloc = memalloc_nofs_save();
 	spin_lock(&dq_data_lock);
 	info->dqi_flags &= ~DQF_INFO_DIRTY;
 	dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace);
@@ -209,6 +217,7 @@ static int v2_write_file_info(struct super_block *sb, int type)
 	dinfo.dqi_free_entry = cpu_to_le32(qinfo->dqi_free_entry);
 	size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
 	       sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
+	memalloc_nofs_restore(memalloc);
 	up_write(&dqopt->dqio_sem);
 	if (size != sizeof(struct v2_disk_dqinfo)) {
 		quota_error(sb, "Can't write info structure");
@@ -328,11 +337,14 @@ static int v2_read_dquot(struct dquot *dquot)
 {
 	struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
 	int ret;
+	unsigned int memalloc;
 
 	down_read(&dqopt->dqio_sem);
+	memalloc = memalloc_nofs_save();
 	ret = qtree_read_dquot(
 			sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv,
 			dquot);
+	memalloc_nofs_restore(memalloc);
 	up_read(&dqopt->dqio_sem);
 	return ret;
 }
@@ -342,6 +354,7 @@ static int v2_write_dquot(struct dquot *dquot)
 	struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
 	int ret;
 	bool alloc = false;
+	unsigned int memalloc;
 
 	/*
 	 * If space for dquot is already allocated, we don't need any
@@ -355,9 +368,11 @@ static int v2_write_dquot(struct dquot *dquot)
 	} else {
 		down_read(&dqopt->dqio_sem);
 	}
+	memalloc = memalloc_nofs_save();
 	ret = qtree_write_dquot(
 			sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv,
 			dquot);
+	memalloc_nofs_restore(memalloc);
 	if (alloc)
 		up_write(&dqopt->dqio_sem);
 	else
@@ -368,10 +383,13 @@ static int v2_write_dquot(struct dquot *dquot)
 static int v2_release_dquot(struct dquot *dquot)
 {
 	struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
+	unsigned int memalloc;
 	int ret;
 
 	down_write(&dqopt->dqio_sem);
+	memalloc = memalloc_nofs_save();
 	ret = qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot);
+	memalloc_nofs_restore(memalloc);
 	up_write(&dqopt->dqio_sem);
 
 	return ret;
@@ -386,10 +404,13 @@ static int v2_free_file_info(struct super_block *sb, int type)
 static int v2_get_next_id(struct super_block *sb, struct kqid *qid)
 {
 	struct quota_info *dqopt = sb_dqopt(sb);
+	unsigned int memalloc;
 	int ret;
 
 	down_read(&dqopt->dqio_sem);
+	memalloc = memalloc_nofs_save();
 	ret = qtree_get_next_id(sb_dqinfo(sb, qid->type)->dqi_priv, qid);
+	memalloc_nofs_restore(memalloc);
 	up_read(&dqopt->dqio_sem);
 	return ret;
 }
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 4ac05a9e25bc..8006faaaf0ec 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -102,11 +102,20 @@ ramfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
 	int error = -ENOSPC;
 
 	if (inode) {
+		error = security_inode_init_security(inode, dir,
+						     &dentry->d_name, NULL,
+						     NULL);
+		if (error) {
+			iput(inode);
+			goto out;
+		}
+
 		d_instantiate(dentry, inode);
 		dget(dentry);	/* Extra count - pin the dentry in core */
 		error = 0;
 		inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
 	}
+out:
 	return error;
 }
 
@@ -134,6 +143,15 @@ static int ramfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
 	inode = ramfs_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0);
 	if (inode) {
 		int l = strlen(symname)+1;
+
+		error = security_inode_init_security(inode, dir,
+						     &dentry->d_name, NULL,
+						     NULL);
+		if (error) {
+			iput(inode);
+			goto out;
+		}
+
 		error = page_symlink(inode, symname, l);
 		if (!error) {
 			d_instantiate(dentry, inode);
@@ -143,6 +161,7 @@ static int ramfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
 		} else
 			iput(inode);
 	}
+out:
 	return error;
 }
 
@@ -150,12 +169,23 @@ static int ramfs_tmpfile(struct mnt_idmap *idmap,
 			 struct inode *dir, struct file *file, umode_t mode)
 {
 	struct inode *inode;
+	int error;
 
 	inode = ramfs_get_inode(dir->i_sb, dir, mode, 0);
 	if (!inode)
 		return -ENOSPC;
+
+	error = security_inode_init_security(inode, dir,
+					     &file_dentry(file)->d_name, NULL,
+					     NULL);
+	if (error) {
+		iput(inode);
+		goto out;
+	}
+
 	d_tmpfile(file, inode);
-	return finish_open_simple(file, 0);
+out:
+	return finish_open_simple(file, error);
 }
 
 static const struct inode_operations ramfs_dir_inode_operations = {
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 171c912af50f..e539ccd39e1e 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2386,7 +2386,7 @@ static int journal_read(struct super_block *sb)
 
 	cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb);
 	reiserfs_info(sb, "checking transaction log (%pg)\n",
-		      journal->j_bdev_handle->bdev);
+		      file_bdev(journal->j_bdev_file));
 	start = ktime_get_seconds();
 
 	/*
@@ -2447,7 +2447,7 @@ static int journal_read(struct super_block *sb)
 		 * device and journal device to be the same
 		 */
 		d_bh =
-		    reiserfs_breada(journal->j_bdev_handle->bdev, cur_dblock,
+		    reiserfs_breada(file_bdev(journal->j_bdev_file), cur_dblock,
 				    sb->s_blocksize,
 				    SB_ONDISK_JOURNAL_1st_BLOCK(sb) +
 				    SB_ONDISK_JOURNAL_SIZE(sb));
@@ -2588,9 +2588,9 @@ static void journal_list_init(struct super_block *sb)
 
 static void release_journal_dev(struct reiserfs_journal *journal)
 {
-	if (journal->j_bdev_handle) {
-		bdev_release(journal->j_bdev_handle);
-		journal->j_bdev_handle = NULL;
+	if (journal->j_bdev_file) {
+		bdev_fput(journal->j_bdev_file);
+		journal->j_bdev_file = NULL;
 	}
 }
 
@@ -2605,7 +2605,7 @@ static int journal_init_dev(struct super_block *super,
 
 	result = 0;
 
-	journal->j_bdev_handle = NULL;
+	journal->j_bdev_file = NULL;
 	jdev = SB_ONDISK_JOURNAL_DEVICE(super) ?
 	    new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev;
 
@@ -2616,37 +2616,37 @@ static int journal_init_dev(struct super_block *super,
 	if ((!jdev_name || !jdev_name[0])) {
 		if (jdev == super->s_dev)
 			holder = NULL;
-		journal->j_bdev_handle = bdev_open_by_dev(jdev, blkdev_mode,
+		journal->j_bdev_file = bdev_file_open_by_dev(jdev, blkdev_mode,
 							  holder, NULL);
-		if (IS_ERR(journal->j_bdev_handle)) {
-			result = PTR_ERR(journal->j_bdev_handle);
-			journal->j_bdev_handle = NULL;
+		if (IS_ERR(journal->j_bdev_file)) {
+			result = PTR_ERR(journal->j_bdev_file);
+			journal->j_bdev_file = NULL;
 			reiserfs_warning(super, "sh-458",
 					 "cannot init journal device unknown-block(%u,%u): %i",
 					 MAJOR(jdev), MINOR(jdev), result);
 			return result;
 		} else if (jdev != super->s_dev)
-			set_blocksize(journal->j_bdev_handle->bdev,
+			set_blocksize(file_bdev(journal->j_bdev_file),
 				      super->s_blocksize);
 
 		return 0;
 	}
 
-	journal->j_bdev_handle = bdev_open_by_path(jdev_name, blkdev_mode,
+	journal->j_bdev_file = bdev_file_open_by_path(jdev_name, blkdev_mode,
 						   holder, NULL);
-	if (IS_ERR(journal->j_bdev_handle)) {
-		result = PTR_ERR(journal->j_bdev_handle);
-		journal->j_bdev_handle = NULL;
+	if (IS_ERR(journal->j_bdev_file)) {
+		result = PTR_ERR(journal->j_bdev_file);
+		journal->j_bdev_file = NULL;
 		reiserfs_warning(super, "sh-457",
 				 "journal_init_dev: Cannot open '%s': %i",
 				 jdev_name, result);
 		return result;
 	}
 
-	set_blocksize(journal->j_bdev_handle->bdev, super->s_blocksize);
+	set_blocksize(file_bdev(journal->j_bdev_file), super->s_blocksize);
 	reiserfs_info(super,
 		      "journal_init_dev: journal device: %pg\n",
-		      journal->j_bdev_handle->bdev);
+		      file_bdev(journal->j_bdev_file));
 	return 0;
 }
 
@@ -2804,7 +2804,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
 				 "journal header magic %x (device %pg) does "
 				 "not match to magic found in super block %x",
 				 jh->jh_journal.jp_journal_magic,
-				 journal->j_bdev_handle->bdev,
+				 file_bdev(journal->j_bdev_file),
 				 sb_jp_journal_magic(rs));
 		brelse(bhjh);
 		goto free_and_return;
@@ -2828,7 +2828,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
 	reiserfs_info(sb, "journal params: device %pg, size %u, "
 		      "journal first block %u, max trans len %u, max batch %u, "
 		      "max commit age %u, max trans age %u\n",
-		      journal->j_bdev_handle->bdev,
+		      file_bdev(journal->j_bdev_file),
 		      SB_ONDISK_JOURNAL_SIZE(sb),
 		      SB_ONDISK_JOURNAL_1st_BLOCK(sb),
 		      journal->j_trans_max,
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 83cb9402e0f9..5c68a4a52d78 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -354,7 +354,7 @@ static int show_journal(struct seq_file *m, void *unused)
 		   "prepare: \t%12lu\n"
 		   "prepare_retry: \t%12lu\n",
 		   DJP(jp_journal_1st_block),
-		   SB_JOURNAL(sb)->j_bdev_handle->bdev,
+		   file_bdev(SB_JOURNAL(sb)->j_bdev_file),
 		   DJP(jp_journal_dev),
 		   DJP(jp_journal_size),
 		   DJP(jp_journal_trans_max),
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index 725667880e62..f0e1f29f20ee 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -97,7 +97,7 @@ struct reiserfs_inode_info {
 	struct rw_semaphore i_xattr_sem;
 #endif
 #ifdef CONFIG_QUOTA
-	struct dquot *i_dquot[MAXQUOTAS];
+	struct dquot __rcu *i_dquot[MAXQUOTAS];
 #endif
 
 	struct inode vfs_inode;
@@ -299,7 +299,7 @@ struct reiserfs_journal {
 	/* oldest journal block.  start here for traverse */
 	struct reiserfs_journal_cnode *j_first;
 
-	struct bdev_handle *j_bdev_handle;
+	struct file *j_bdev_file;
 
 	/* first block on s_dev of reserved area journal */
 	int j_1st_reserved_block;
@@ -2810,10 +2810,10 @@ struct reiserfs_journal_header {
 
 /* We need these to make journal.c code more readable */
 #define journal_find_get_block(s, block) __find_get_block(\
-		SB_JOURNAL(s)->j_bdev_handle->bdev, block, s->s_blocksize)
-#define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_bdev_handle->bdev,\
+		file_bdev(SB_JOURNAL(s)->j_bdev_file), block, s->s_blocksize)
+#define journal_getblk(s, block) __getblk(file_bdev(SB_JOURNAL(s)->j_bdev_file),\
 		block, s->s_blocksize)
-#define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_bdev_handle->bdev,\
+#define journal_bread(s, block) __bread(file_bdev(SB_JOURNAL(s)->j_bdev_file),\
 		block, s->s_blocksize)
 
 enum reiserfs_bh_state_bits {
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 67b5510beded..ab76468da02d 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -670,7 +670,6 @@ static int __init init_inodecache(void)
 						  sizeof(struct
 							 reiserfs_inode_info),
 						  0, (SLAB_RECLAIM_ACCOUNT|
-						      SLAB_MEM_SPREAD|
 						      SLAB_ACCOUNT),
 						  init_once);
 	if (reiserfs_inode_cachep == NULL)
@@ -802,7 +801,7 @@ static ssize_t reiserfs_quota_write(struct super_block *, int, const char *,
 static ssize_t reiserfs_quota_read(struct super_block *, int, char *, size_t,
 				   loff_t);
 
-static struct dquot **reiserfs_get_dquots(struct inode *inode)
+static struct dquot __rcu **reiserfs_get_dquots(struct inode *inode)
 {
 	return REISERFS_I(inode)->i_dquot;
 }
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 545ad44f96b8..2cbb92462074 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -594,7 +594,7 @@ static void romfs_kill_sb(struct super_block *sb)
 #ifdef CONFIG_ROMFS_ON_BLOCK
 	if (sb->s_bdev) {
 		sync_blockdev(sb->s_bdev);
-		bdev_release(sb->s_bdev_handle);
+		bdev_fput(sb->s_bdev_file);
 	}
 #endif
 }
@@ -630,8 +630,8 @@ static int __init init_romfs_fs(void)
 	romfs_inode_cachep =
 		kmem_cache_create("romfs_i",
 				  sizeof(struct romfs_inode_info), 0,
-				  SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
-				  SLAB_ACCOUNT, romfs_i_init_once);
+				  SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
+				  romfs_i_init_once);
 
 	if (!romfs_inode_cachep) {
 		pr_err("Failed to initialise inode cache\n");
diff --git a/fs/select.c b/fs/select.c
index 0ee55af1a55c..9515c3fa1a03 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -476,7 +476,7 @@ static inline void wait_key_set(poll_table *wait, unsigned long in,
 		wait->_key |= POLLOUT_SET;
 }
 
-static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
+static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
 {
 	ktime_t expire, *to = NULL;
 	struct poll_wqueues table;
@@ -839,7 +839,7 @@ SYSCALL_DEFINE1(old_select, struct sel_arg_struct __user *, arg)
 
 struct poll_list {
 	struct poll_list *next;
-	int len;
+	unsigned int len;
 	struct pollfd entries[];
 };
 
@@ -975,14 +975,15 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
 		struct timespec64 *end_time)
 {
 	struct poll_wqueues table;
-	int err = -EFAULT, fdcount, len;
+	int err = -EFAULT, fdcount;
 	/* Allocate small arguments on the stack to save memory and be
 	   faster - use long to make sure the buffer is aligned properly
 	   on 64 bit archs to avoid unaligned access */
 	long stack_pps[POLL_STACK_ALLOC/sizeof(long)];
 	struct poll_list *const head = (struct poll_list *)stack_pps;
  	struct poll_list *walk = head;
- 	unsigned long todo = nfds;
+	unsigned int todo = nfds;
+	unsigned int len;
 
 	if (nfds > rlimit(RLIMIT_NOFILE))
 		return -EINVAL;
@@ -998,9 +999,9 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
 					sizeof(struct pollfd) * walk->len))
 			goto out_fds;
 
-		todo -= walk->len;
-		if (!todo)
+		if (walk->len >= todo)
 			break;
+		todo -= walk->len;
 
 		len = min(todo, POLLFD_PER_PAGE);
 		walk = walk->next = kmalloc(struct_size(walk, entries, len),
@@ -1020,7 +1021,7 @@ static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
 
 	for (walk = head; walk; walk = walk->next) {
 		struct pollfd *fds = walk->entries;
-		int j;
+		unsigned int j;
 
 		for (j = walk->len; j; fds++, ufds++, j--)
 			unsafe_put_user(fds->revents, &ufds->revents, Efault);
diff --git a/fs/smb/client/Makefile b/fs/smb/client/Makefile
index 0b07eb94c93b..e11985f2460b 100644
--- a/fs/smb/client/Makefile
+++ b/fs/smb/client/Makefile
@@ -12,7 +12,7 @@ cifs-y := trace.o cifsfs.o cifs_debug.o connect.o dir.o file.o \
 	  smb2ops.o smb2maperror.o smb2transport.o \
 	  smb2misc.o smb2pdu.o smb2inode.o smb2file.o cifsacl.o fs_context.o \
 	  dns_resolve.o cifs_spnego_negtokeninit.asn1.o asn1.o \
-	  namespace.o
+	  namespace.o reparse.o
 
 $(obj)/asn1.o: $(obj)/cifs_spnego_negtokeninit.asn1.h
 
diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c
index 3de5047a7ff9..0ff2491c311d 100644
--- a/fs/smb/client/cached_dir.c
+++ b/fs/smb/client/cached_dir.c
@@ -239,7 +239,8 @@ replay_again:
 		.tcon = tcon,
 		.path = path,
 		.create_options = cifs_create_options(cifs_sb, CREATE_NOT_FILE),
-		.desired_access =  FILE_READ_DATA | FILE_READ_ATTRIBUTES,
+		.desired_access =  FILE_READ_DATA | FILE_READ_ATTRIBUTES |
+				   FILE_READ_EA,
 		.disposition = FILE_OPEN,
 		.fid = pfid,
 		.replay = !!(retries),
@@ -416,6 +417,7 @@ smb2_close_cached_fid(struct kref *ref)
 {
 	struct cached_fid *cfid = container_of(ref, struct cached_fid,
 					       refcount);
+	int rc;
 
 	spin_lock(&cfid->cfids->cfid_list_lock);
 	if (cfid->on_list) {
@@ -429,9 +431,10 @@ smb2_close_cached_fid(struct kref *ref)
 	cfid->dentry = NULL;
 
 	if (cfid->is_open) {
-		SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid,
+		rc = SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid,
 			   cfid->fid.volatile_fid);
-		atomic_dec(&cfid->tcon->num_remote_opens);
+		if (rc) /* should we retry on -EBUSY or -EAGAIN? */
+			cifs_dbg(VFS, "close cached dir rc %d\n", rc);
 	}
 
 	free_cached_dir(cfid);
diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c
index 3e4209f41c18..c71ae5c04306 100644
--- a/fs/smb/client/cifs_debug.c
+++ b/fs/smb/client/cifs_debug.c
@@ -250,6 +250,8 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v)
 	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
 		list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+			if (cifs_ses_exiting(ses))
+				continue;
 			list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
 				spin_lock(&tcon->open_file_lock);
 				list_for_each_entry(cfile, &tcon->openFileList, tlist) {
@@ -278,6 +280,24 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
+static __always_inline const char *compression_alg_str(__le16 alg)
+{
+	switch (alg) {
+	case SMB3_COMPRESS_NONE:
+		return "NONE";
+	case SMB3_COMPRESS_LZNT1:
+		return "LZNT1";
+	case SMB3_COMPRESS_LZ77:
+		return "LZ77";
+	case SMB3_COMPRESS_LZ77_HUFF:
+		return "LZ77-Huffman";
+	case SMB3_COMPRESS_PATTERN:
+		return "Pattern_V1";
+	default:
+		return "invalid";
+	}
+}
+
 static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
 {
 	struct mid_q_entry *mid_entry;
@@ -423,12 +443,6 @@ skip_rdma:
 			server->echo_credits,
 			server->oplock_credits,
 			server->dialect);
-		if (server->compress_algorithm == SMB3_COMPRESS_LZNT1)
-			seq_printf(m, " COMPRESS_LZNT1");
-		else if (server->compress_algorithm == SMB3_COMPRESS_LZ77)
-			seq_printf(m, " COMPRESS_LZ77");
-		else if (server->compress_algorithm == SMB3_COMPRESS_LZ77_HUFF)
-			seq_printf(m, " COMPRESS_LZ77_HUFF");
 		if (server->sign)
 			seq_printf(m, " signed");
 		if (server->posix_ext_supported)
@@ -460,6 +474,14 @@ skip_rdma:
 				   server->leaf_fullpath);
 		}
 
+		seq_puts(m, "\nCompression: ");
+		if (!server->compression.requested)
+			seq_puts(m, "disabled on mount");
+		else if (server->compression.enabled)
+			seq_printf(m, "enabled (%s)", compression_alg_str(server->compression.alg));
+		else
+			seq_puts(m, "disabled (not supported by this server)");
+
 		seq_printf(m, "\n\n\tSessions: ");
 		i = 0;
 		list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
@@ -488,6 +510,8 @@ skip_rdma:
 				ses->ses_count, ses->serverOS, ses->serverNOS,
 				ses->capabilities, ses->ses_status);
 			}
+			if (ses->expired_pwd)
+				seq_puts(m, "password no longer valid ");
 			spin_unlock(&ses->ses_lock);
 
 			seq_printf(m, "\n\tSecurity type: %s ",
@@ -654,6 +678,8 @@ static ssize_t cifs_stats_proc_write(struct file *file,
 			}
 #endif /* CONFIG_CIFS_STATS2 */
 			list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+				if (cifs_ses_exiting(ses))
+					continue;
 				list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
 					atomic_set(&tcon->num_smbs_sent, 0);
 					spin_lock(&tcon->stat_lock);
@@ -733,6 +759,8 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
 			}
 #endif /* STATS2 */
 		list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+			if (cifs_ses_exiting(ses))
+				continue;
 			list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
 				i++;
 				seq_printf(m, "\n%d) %s", i, tcon->tree_name);
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 0c269396ae15..d41eedbff674 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -151,15 +151,12 @@ MODULE_PARM_DESC(disable_legacy_dialects, "To improve security it may be "
 				  "vers=1.0 (CIFS/SMB1) and vers=2.0 are weaker"
 				  " and less secure. Default: n/N/0");
 
-extern mempool_t *cifs_sm_req_poolp;
-extern mempool_t *cifs_req_poolp;
-extern mempool_t *cifs_mid_poolp;
-
 struct workqueue_struct	*cifsiod_wq;
 struct workqueue_struct	*decrypt_wq;
 struct workqueue_struct	*fileinfo_put_wq;
 struct workqueue_struct	*cifsoplockd_wq;
 struct workqueue_struct	*deferredclose_wq;
+struct workqueue_struct	*serverclose_wq;
 __u32 cifs_lock_secret;
 
 /*
@@ -673,6 +670,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
 		seq_printf(s, ",backupgid=%u",
 			   from_kgid_munged(&init_user_ns,
 					    cifs_sb->ctx->backupgid));
+	seq_show_option(s, "reparse",
+			cifs_reparse_type_str(cifs_sb->ctx->reparse_type));
 
 	seq_printf(s, ",rsize=%u", cifs_sb->ctx->rsize);
 	seq_printf(s, ",wsize=%u", cifs_sb->ctx->wsize);
@@ -1085,7 +1084,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
 }
 
 static int
-cifs_setlease(struct file *file, int arg, struct file_lock **lease, void **priv)
+cifs_setlease(struct file *file, int arg, struct file_lease **lease, void **priv)
 {
 	/*
 	 * Note that this is called by vfs setlease with i_lock held to
@@ -1094,9 +1093,6 @@ cifs_setlease(struct file *file, int arg, struct file_lock **lease, void **priv)
 	struct inode *inode = file_inode(file);
 	struct cifsFileInfo *cfile = file->private_data;
 
-	if (!(S_ISREG(inode->i_mode)))
-		return -EINVAL;
-
 	/* Check if file is oplocked if this is request for new lease */
 	if (arg == F_UNLCK ||
 	    ((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) ||
@@ -1667,7 +1663,7 @@ cifs_init_inodecache(void)
 	cifs_inode_cachep = kmem_cache_create("cifs_inode_cache",
 					      sizeof(struct cifsInodeInfo),
 					      0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
+						SLAB_ACCOUNT),
 					      cifs_init_once);
 	if (cifs_inode_cachep == NULL)
 		return -ENOMEM;
@@ -1893,6 +1889,13 @@ init_cifs(void)
 		goto out_destroy_cifsoplockd_wq;
 	}
 
+	serverclose_wq = alloc_workqueue("serverclose",
+					   WQ_FREEZABLE|WQ_MEM_RECLAIM, 0);
+	if (!serverclose_wq) {
+		rc = -ENOMEM;
+		goto out_destroy_serverclose_wq;
+	}
+
 	rc = cifs_init_inodecache();
 	if (rc)
 		goto out_destroy_deferredclose_wq;
@@ -1967,6 +1970,8 @@ out_destroy_decrypt_wq:
 	destroy_workqueue(decrypt_wq);
 out_destroy_cifsiod_wq:
 	destroy_workqueue(cifsiod_wq);
+out_destroy_serverclose_wq:
+	destroy_workqueue(serverclose_wq);
 out_clean_proc:
 	cifs_proc_clean();
 	return rc;
@@ -1996,6 +2001,7 @@ exit_cifs(void)
 	destroy_workqueue(cifsoplockd_wq);
 	destroy_workqueue(decrypt_wq);
 	destroy_workqueue(fileinfo_put_wq);
+	destroy_workqueue(serverclose_wq);
 	destroy_workqueue(cifsiod_wq);
 	cifs_proc_clean();
 }
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index 685f7d1139c6..ca55d01117c8 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -152,6 +152,6 @@ extern const struct export_operations cifs_export_ops;
 #endif /* CONFIG_CIFS_NFSD_EXPORT */
 
 /* when changing internal version - update following two lines at same time */
-#define SMB3_PRODUCT_BUILD 47
-#define CIFS_VERSION   "2.47"
+#define SMB3_PRODUCT_BUILD 48
+#define CIFS_VERSION   "2.48"
 #endif				/* _CIFSFS_H */
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 53c75cfb33ab..d6669ce4ae87 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -153,6 +153,24 @@ enum securityEnum {
 	Kerberos,		/* Kerberos via SPNEGO */
 };
 
+enum cifs_reparse_type {
+	CIFS_REPARSE_TYPE_NFS,
+	CIFS_REPARSE_TYPE_WSL,
+	CIFS_REPARSE_TYPE_DEFAULT = CIFS_REPARSE_TYPE_NFS,
+};
+
+static inline const char *cifs_reparse_type_str(enum cifs_reparse_type type)
+{
+	switch (type) {
+	case CIFS_REPARSE_TYPE_NFS:
+		return "nfs";
+	case CIFS_REPARSE_TYPE_WSL:
+		return "wsl";
+	default:
+		return "unknown";
+	}
+}
+
 struct session_key {
 	unsigned int len;
 	char *response;
@@ -208,6 +226,10 @@ struct cifs_open_info_data {
 			struct reparse_posix_data *posix;
 		};
 	} reparse;
+	struct {
+		__u8		eas[SMB2_WSL_MAX_QUERY_EA_RESP_SIZE];
+		unsigned int	eas_len;
+	} wsl;
 	char *symlink_target;
 	struct cifs_sid posix_owner;
 	struct cifs_sid posix_group;
@@ -217,19 +239,6 @@ struct cifs_open_info_data {
 	};
 };
 
-static inline bool cifs_open_data_reparse(struct cifs_open_info_data *data)
-{
-	struct smb2_file_all_info *fi = &data->fi;
-	u32 attrs = le32_to_cpu(fi->Attributes);
-	bool ret;
-
-	ret = data->reparse_point || (attrs & ATTR_REPARSE);
-	if (ret)
-		attrs |= ATTR_REPARSE;
-	fi->Attributes = cpu_to_le32(attrs);
-	return ret;
-}
-
 /*
  *****************************************************************
  * Except the CIFS PDUs themselves all the
@@ -346,6 +355,9 @@ struct smb_version_operations {
 	/* informational QFS call */
 	void (*qfs_tcon)(const unsigned int, struct cifs_tcon *,
 			 struct cifs_sb_info *);
+	/* query for server interfaces */
+	int (*query_server_interfaces)(const unsigned int, struct cifs_tcon *,
+				       bool);
 	/* check if a path is accessible or not */
 	int (*is_path_accessible)(const unsigned int, struct cifs_tcon *,
 				  struct cifs_sb_info *, const char *);
@@ -371,7 +383,8 @@ struct smb_version_operations {
 			    struct cifs_open_info_data *data);
 	/* set size by path */
 	int (*set_path_size)(const unsigned int, struct cifs_tcon *,
-			     const char *, __u64, struct cifs_sb_info *, bool);
+			     const char *, __u64, struct cifs_sb_info *, bool,
+				 struct dentry *);
 	/* set size by file handle */
 	int (*set_file_size)(const unsigned int, struct cifs_tcon *,
 			     struct cifsFileInfo *, __u64, bool);
@@ -401,7 +414,7 @@ struct smb_version_operations {
 		     struct cifs_sb_info *);
 	/* unlink file */
 	int (*unlink)(const unsigned int, struct cifs_tcon *, const char *,
-		      struct cifs_sb_info *);
+		      struct cifs_sb_info *, struct dentry *);
 	/* open, rename and delete file */
 	int (*rename_pending_delete)(const char *, struct dentry *,
 				     const unsigned int);
@@ -429,10 +442,10 @@ struct smb_version_operations {
 	/* set fid protocol-specific info */
 	void (*set_fid)(struct cifsFileInfo *, struct cifs_fid *, __u32);
 	/* close a file */
-	void (*close)(const unsigned int, struct cifs_tcon *,
+	int (*close)(const unsigned int, struct cifs_tcon *,
 		      struct cifs_fid *);
 	/* close a file, returning file attributes and timestamps */
-	void (*close_getattr)(const unsigned int xid, struct cifs_tcon *tcon,
+	int (*close_getattr)(const unsigned int xid, struct cifs_tcon *tcon,
 		      struct cifsFileInfo *pfile_info);
 	/* send a flush request to the server */
 	int (*flush)(const unsigned int, struct cifs_tcon *, struct cifs_fid *);
@@ -759,7 +772,11 @@ struct TCP_Server_Info {
 	unsigned int	max_write;
 	unsigned int	min_offload;
 	unsigned int	retrans;
-	__le16	compress_algorithm;
+	struct {
+		bool requested; /* "compress" mount option set*/
+		bool enabled; /* actually negotiated with server */
+		__le16 alg; /* preferred alg negotiated with server */
+	} compression;
 	__u16	signing_algorithm;
 	__le16	cipher_type;
 	 /* save initital negprot hash */
@@ -1060,12 +1077,14 @@ struct cifs_ses {
 				   and after mount option parsing we fill it */
 	char *domainName;
 	char *password;
+	char *password2; /* When key rotation used, new password may be set before it expires */
 	char workstation_name[CIFS_MAX_WORKSTATION_LEN];
 	struct session_key auth_key;
 	struct ntlmssp_auth *ntlmssp; /* ciphertext, flags, server challenge */
 	enum securityEnum sectype; /* what security flavor was specified? */
 	bool sign;		/* is signing required? */
 	bool domainAuto:1;
+	bool expired_pwd;  /* track if access denied or expired pwd so can know if need to update */
 	unsigned int flags;
 	__u16 session_flags;
 	__u8 smb3signingkey[SMB3_SIGN_KEY_SIZE];
@@ -1263,7 +1282,6 @@ struct cifs_tcon {
 	struct cached_fids *cfids;
 	/* BB add field for back pointer to sb struct(s)? */
 #ifdef CONFIG_CIFS_DFS_UPCALL
-	struct list_head dfs_ses_list;
 	struct delayed_work dfs_cache_work;
 #endif
 	struct delayed_work	query_interfaces; /* query interfaces workqueue job */
@@ -1379,6 +1397,7 @@ struct cifs_open_parms {
 	umode_t mode;
 	bool reconnect:1;
 	bool replay:1; /* indicates that this open is for a replay */
+	struct kvec *ea_cctx;
 };
 
 struct cifs_fid {
@@ -1420,6 +1439,8 @@ struct cifsFileInfo {
 	bool invalidHandle:1;	/* file closed via session abend */
 	bool swapfile:1;
 	bool oplock_break_cancelled:1;
+	bool status_file_deleted:1; /* file has been deleted */
+	bool offload:1; /* offload final part of _put to a wq */
 	unsigned int oplock_epoch; /* epoch from the lease break */
 	__u32 oplock_level; /* oplock/lease level from the lease break */
 	int count;
@@ -1428,6 +1449,7 @@ struct cifsFileInfo {
 	struct cifs_search_info srch_inf;
 	struct work_struct oplock_break; /* work for oplock breaks */
 	struct work_struct put; /* work for the final part of _put */
+	struct work_struct serverclose; /* work for serverclose */
 	struct delayed_work deferred;
 	bool deferred_close_scheduled; /* Flag to indicate close is scheduled */
 	char *symlink_target;
@@ -1784,7 +1806,6 @@ struct cifs_mount_ctx {
 	struct TCP_Server_Info *server;
 	struct cifs_ses *ses;
 	struct cifs_tcon *tcon;
-	struct list_head dfs_ses_list;
 };
 
 static inline void __free_dfs_info_param(struct dfs_info3_param *param)
@@ -2085,8 +2106,11 @@ extern struct workqueue_struct *decrypt_wq;
 extern struct workqueue_struct *fileinfo_put_wq;
 extern struct workqueue_struct *cifsoplockd_wq;
 extern struct workqueue_struct *deferredclose_wq;
+extern struct workqueue_struct *serverclose_wq;
 extern __u32 cifs_lock_secret;
 
+extern mempool_t *cifs_sm_req_poolp;
+extern mempool_t *cifs_req_poolp;
 extern mempool_t *cifs_mid_poolp;
 
 /* Operations for different SMB versions */
@@ -2277,6 +2301,17 @@ static inline void cifs_sg_set_buf(struct sg_table *sgtable,
 	}
 }
 
+#define CIFS_OPARMS(_cifs_sb, _tcon, _path, _da, _cd, _co, _mode) \
+	((struct cifs_open_parms) { \
+		.tcon = _tcon, \
+		.path = _path, \
+		.desired_access = (_da), \
+		.disposition = (_cd), \
+		.create_options = cifs_create_options(_cifs_sb, (_co)), \
+		.mode = (_mode), \
+		.cifs_sb = _cifs_sb, \
+	})
+
 struct smb2_compound_vars {
 	struct cifs_open_parms oparms;
 	struct kvec rsp_iov[MAX_COMPOUND];
@@ -2288,6 +2323,17 @@ struct smb2_compound_vars {
 	struct kvec close_iov;
 	struct smb2_file_rename_info rename_info;
 	struct smb2_file_link_info link_info;
+	struct kvec ea_iov;
 };
 
+static inline bool cifs_ses_exiting(struct cifs_ses *ses)
+{
+	bool ret;
+
+	spin_lock(&ses->ses_lock);
+	ret = ses->ses_status == SES_EXITING;
+	spin_unlock(&ses->ses_lock);
+	return ret;
+}
+
 #endif	/* _CIFS_GLOB_H */
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index a841bf4967fa..8e0a348f1f66 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -144,7 +144,8 @@ extern int cifs_reconnect(struct TCP_Server_Info *server,
 extern int checkSMB(char *buf, unsigned int len, struct TCP_Server_Info *srvr);
 extern bool is_valid_oplock_break(char *, struct TCP_Server_Info *);
 extern bool backup_cred(struct cifs_sb_info *);
-extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
+extern bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 eof,
+				   bool from_readdir);
 extern void cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
 			    unsigned int bytes_written);
 extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, int);
@@ -201,17 +202,14 @@ extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr,
 				     struct cifs_sb_info *cifs_sb);
 extern void cifs_dir_info_to_fattr(struct cifs_fattr *, FILE_DIRECTORY_INFO *,
 					struct cifs_sb_info *);
-extern int cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr);
+extern int cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr,
+			       bool from_readdir);
 extern struct inode *cifs_iget(struct super_block *sb,
 			       struct cifs_fattr *fattr);
 
 int cifs_get_inode_info(struct inode **inode, const char *full_path,
 			struct cifs_open_info_data *data, struct super_block *sb, int xid,
 			const struct cifs_fid *fid);
-bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
-				 struct cifs_fattr *fattr,
-				 struct cifs_open_info_data *data);
-
 extern int smb311_posix_get_inode_info(struct inode **inode,
 				       const char *full_path,
 				       struct cifs_open_info_data *data,
@@ -296,6 +294,10 @@ extern void cifs_close_all_deferred_files(struct cifs_tcon *cifs_tcon);
 
 extern void cifs_close_deferred_file_under_dentry(struct cifs_tcon *cifs_tcon,
 				const char *path);
+
+extern void cifs_mark_open_handles_for_deleted_file(struct inode *inode,
+				const char *path);
+
 extern struct TCP_Server_Info *
 cifs_get_tcp_session(struct smb3_fs_context *ctx,
 		     struct TCP_Server_Info *primary_server);
@@ -402,7 +404,8 @@ extern int CIFSSMBSetFileDisposition(const unsigned int xid,
 				     __u32 pid_of_opener);
 extern int CIFSSMBSetEOF(const unsigned int xid, struct cifs_tcon *tcon,
 			 const char *file_name, __u64 size,
-			 struct cifs_sb_info *cifs_sb, bool set_allocation);
+			 struct cifs_sb_info *cifs_sb, bool set_allocation,
+			 struct dentry *dentry);
 extern int CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon,
 			      struct cifsFileInfo *cfile, __u64 size,
 			      bool set_allocation);
@@ -438,7 +441,8 @@ extern int CIFSPOSIXDelFile(const unsigned int xid, struct cifs_tcon *tcon,
 			const struct nls_table *nls_codepage,
 			int remap_special_chars);
 extern int CIFSSMBDelFile(const unsigned int xid, struct cifs_tcon *tcon,
-			  const char *name, struct cifs_sb_info *cifs_sb);
+			  const char *name, struct cifs_sb_info *cifs_sb,
+			  struct dentry *dentry);
 int CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon,
 		  struct dentry *source_dentry,
 		  const char *from_name, const char *to_name,
@@ -721,31 +725,31 @@ struct super_block *cifs_get_tcon_super(struct cifs_tcon *tcon);
 void cifs_put_tcon_super(struct super_block *sb);
 int cifs_wait_for_server_reconnect(struct TCP_Server_Info *server, bool retry);
 
-/* Put references of @ses and @ses->dfs_root_ses */
+/* Put references of @ses and its children */
 static inline void cifs_put_smb_ses(struct cifs_ses *ses)
 {
-	struct cifs_ses *rses = ses->dfs_root_ses;
+	struct cifs_ses *next;
 
-	__cifs_put_smb_ses(ses);
-	if (rses)
-		__cifs_put_smb_ses(rses);
+	do {
+		next = ses->dfs_root_ses;
+		__cifs_put_smb_ses(ses);
+	} while ((ses = next));
 }
 
-/* Get an active reference of @ses and @ses->dfs_root_ses.
+/* Get an active reference of @ses and its children.
  *
  * NOTE: make sure to call this function when incrementing reference count of
  * @ses to ensure that any DFS root session attached to it (@ses->dfs_root_ses)
  * will also get its reference count incremented.
  *
- * cifs_put_smb_ses() will put both references, so call it when you're done.
+ * cifs_put_smb_ses() will put all references, so call it when you're done.
  */
 static inline void cifs_smb_ses_inc_refcount(struct cifs_ses *ses)
 {
 	lockdep_assert_held(&cifs_tcp_ses_lock);
 
-	ses->ses_count++;
-	if (ses->dfs_root_ses)
-		ses->dfs_root_ses->ses_count++;
+	for (; ses; ses = ses->dfs_root_ses)
+		ses->ses_count++;
 }
 
 static inline bool dfs_src_pathname_equal(const char *s1, const char *s2)
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 01e89070df5a..23b5709ddc31 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -738,7 +738,7 @@ PsxDelete:
 
 int
 CIFSSMBDelFile(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
-	       struct cifs_sb_info *cifs_sb)
+	       struct cifs_sb_info *cifs_sb, struct dentry *dentry)
 {
 	DELETE_FILE_REQ *pSMB = NULL;
 	DELETE_FILE_RSP *pSMBr = NULL;
@@ -2066,20 +2066,20 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon,
 		parm_data = (struct cifs_posix_lock *)
 			((char *)&pSMBr->hdr.Protocol + data_offset);
 		if (parm_data->lock_type == cpu_to_le16(CIFS_UNLCK))
-			pLockData->fl_type = F_UNLCK;
+			pLockData->c.flc_type = F_UNLCK;
 		else {
 			if (parm_data->lock_type ==
 					cpu_to_le16(CIFS_RDLCK))
-				pLockData->fl_type = F_RDLCK;
+				pLockData->c.flc_type = F_RDLCK;
 			else if (parm_data->lock_type ==
 					cpu_to_le16(CIFS_WRLCK))
-				pLockData->fl_type = F_WRLCK;
+				pLockData->c.flc_type = F_WRLCK;
 
 			pLockData->fl_start = le64_to_cpu(parm_data->start);
 			pLockData->fl_end = pLockData->fl_start +
 				(le64_to_cpu(parm_data->length) ?
 				 le64_to_cpu(parm_data->length) - 1 : 0);
-			pLockData->fl_pid = -le32_to_cpu(parm_data->pid);
+			pLockData->c.flc_pid = -le32_to_cpu(parm_data->pid);
 		}
 	}
 
@@ -4993,7 +4993,7 @@ QFSPosixRetry:
 int
 CIFSSMBSetEOF(const unsigned int xid, struct cifs_tcon *tcon,
 	      const char *file_name, __u64 size, struct cifs_sb_info *cifs_sb,
-	      bool set_allocation)
+	      bool set_allocation, struct dentry *dentry)
 {
 	struct smb_com_transaction2_spi_req *pSMB = NULL;
 	struct smb_com_transaction2_spi_rsp *pSMBr = NULL;
@@ -5854,10 +5854,8 @@ SetEARetry:
 	parm_data->list.EA_flags = 0;
 	/* we checked above that name len is less than 255 */
 	parm_data->list.name_len = (__u8)name_len;
-	/* EA names are always ASCII */
-	if (ea_name)
-		strncpy(parm_data->list.name, ea_name, name_len);
-	parm_data->list.name[name_len] = '\0';
+	/* EA names are always ASCII and NUL-terminated */
+	strscpy(parm_data->list.name, ea_name ?: "", name_len + 1);
 	parm_data->list.value_len = cpu_to_le16(ea_value_len);
 	/* caller ensures that ea_value_len is less than 64K but
 	we need to ensure that it fits within the smb */
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index ac9595504f4b..4e35970681bf 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -52,9 +52,6 @@
 #include "fs_context.h"
 #include "cifs_swn.h"
 
-extern mempool_t *cifs_req_poolp;
-extern bool disable_legacy_dialects;
-
 /* FIXME: should these be tunable? */
 #define TLINK_ERROR_EXPIRE	(1 * HZ)
 #define TLINK_IDLE_EXPIRE	(600 * HZ)
@@ -123,12 +120,16 @@ static void smb2_query_server_interfaces(struct work_struct *work)
 	struct cifs_tcon *tcon = container_of(work,
 					struct cifs_tcon,
 					query_interfaces.work);
+	struct TCP_Server_Info *server = tcon->ses->server;
 
 	/*
 	 * query server network interfaces, in case they change
 	 */
+	if (!server->ops->query_server_interfaces)
+		return;
+
 	xid = get_xid();
-	rc = SMB3_request_interfaces(xid, tcon, false);
+	rc = server->ops->query_server_interfaces(xid, tcon, false);
 	free_xid(xid);
 
 	if (rc) {
@@ -174,6 +175,8 @@ cifs_signal_cifsd_for_reconnect(struct TCP_Server_Info *server,
 
 	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+		if (cifs_ses_exiting(ses))
+			continue;
 		spin_lock(&ses->chan_lock);
 		for (i = 0; i < ses->chan_count; i++) {
 			if (!ses->chans[i].server)
@@ -231,7 +234,13 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
 
 	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each_entry_safe(ses, nses, &pserver->smb_ses_list, smb_ses_list) {
-		/* check if iface is still active */
+		spin_lock(&ses->ses_lock);
+		if (ses->ses_status == SES_EXITING) {
+			spin_unlock(&ses->ses_lock);
+			continue;
+		}
+		spin_unlock(&ses->ses_lock);
+
 		spin_lock(&ses->chan_lock);
 		if (cifs_ses_get_chan_index(ses, server) ==
 		    CIFS_INVAL_CHAN_INDEX) {
@@ -1736,7 +1745,7 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx,
 	tcp_ses->channel_sequence_num = 0; /* only tracked for primary channel */
 	tcp_ses->reconnect_instance = 1;
 	tcp_ses->lstrp = jiffies;
-	tcp_ses->compress_algorithm = cpu_to_le16(ctx->compression);
+	tcp_ses->compression.requested = ctx->compress;
 	spin_lock_init(&tcp_ses->req_lock);
 	spin_lock_init(&tcp_ses->srv_lock);
 	spin_lock_init(&tcp_ses->mid_lock);
@@ -1859,6 +1868,9 @@ static int match_session(struct cifs_ses *ses, struct smb3_fs_context *ctx)
 	    ctx->sectype != ses->sectype)
 		return 0;
 
+	if (ctx->dfs_root_ses != ses->dfs_root_ses)
+		return 0;
+
 	/*
 	 * If an existing session is limited to less channels than
 	 * requested, it should not be reused
@@ -1962,31 +1974,6 @@ out:
 	return rc;
 }
 
-/**
- * cifs_free_ipc - helper to release the session IPC tcon
- * @ses: smb session to unmount the IPC from
- *
- * Needs to be called everytime a session is destroyed.
- *
- * On session close, the IPC is closed and the server must release all tcons of the session.
- * No need to send a tree disconnect here.
- *
- * Besides, it will make the server to not close durable and resilient files on session close, as
- * specified in MS-SMB2 3.3.5.6 Receiving an SMB2 LOGOFF Request.
- */
-static int
-cifs_free_ipc(struct cifs_ses *ses)
-{
-	struct cifs_tcon *tcon = ses->tcon_ipc;
-
-	if (tcon == NULL)
-		return 0;
-
-	tconInfoFree(tcon);
-	ses->tcon_ipc = NULL;
-	return 0;
-}
-
 static struct cifs_ses *
 cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
 {
@@ -2018,48 +2005,52 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
 void __cifs_put_smb_ses(struct cifs_ses *ses)
 {
 	struct TCP_Server_Info *server = ses->server;
+	struct cifs_tcon *tcon;
 	unsigned int xid;
 	size_t i;
+	bool do_logoff;
 	int rc;
 
+	spin_lock(&cifs_tcp_ses_lock);
 	spin_lock(&ses->ses_lock);
-	if (ses->ses_status == SES_EXITING) {
+	cifs_dbg(FYI, "%s: id=0x%llx ses_count=%d ses_status=%u ipc=%s\n",
+		 __func__, ses->Suid, ses->ses_count, ses->ses_status,
+		 ses->tcon_ipc ? ses->tcon_ipc->tree_name : "none");
+	if (ses->ses_status == SES_EXITING || --ses->ses_count > 0) {
 		spin_unlock(&ses->ses_lock);
+		spin_unlock(&cifs_tcp_ses_lock);
 		return;
 	}
-	spin_unlock(&ses->ses_lock);
+	/* ses_count can never go negative */
+	WARN_ON(ses->ses_count < 0);
 
-	cifs_dbg(FYI, "%s: ses_count=%d\n", __func__, ses->ses_count);
-	cifs_dbg(FYI,
-		 "%s: ses ipc: %s\n", __func__, ses->tcon_ipc ? ses->tcon_ipc->tree_name : "NONE");
+	spin_lock(&ses->chan_lock);
+	cifs_chan_clear_need_reconnect(ses, server);
+	spin_unlock(&ses->chan_lock);
 
-	spin_lock(&cifs_tcp_ses_lock);
-	if (--ses->ses_count > 0) {
-		spin_unlock(&cifs_tcp_ses_lock);
-		return;
-	}
-	spin_lock(&ses->ses_lock);
-	if (ses->ses_status == SES_GOOD)
-		ses->ses_status = SES_EXITING;
+	do_logoff = ses->ses_status == SES_GOOD && server->ops->logoff;
+	ses->ses_status = SES_EXITING;
+	tcon = ses->tcon_ipc;
+	ses->tcon_ipc = NULL;
 	spin_unlock(&ses->ses_lock);
 	spin_unlock(&cifs_tcp_ses_lock);
 
-	/* ses_count can never go negative */
-	WARN_ON(ses->ses_count < 0);
-
-	spin_lock(&ses->ses_lock);
-	if (ses->ses_status == SES_EXITING && server->ops->logoff) {
-		spin_unlock(&ses->ses_lock);
-		cifs_free_ipc(ses);
+	/*
+	 * On session close, the IPC is closed and the server must release all
+	 * tcons of the session.  No need to send a tree disconnect here.
+	 *
+	 * Besides, it will make the server to not close durable and resilient
+	 * files on session close, as specified in MS-SMB2 3.3.5.6 Receiving an
+	 * SMB2 LOGOFF Request.
+	 */
+	tconInfoFree(tcon);
+	if (do_logoff) {
 		xid = get_xid();
 		rc = server->ops->logoff(xid, ses);
 		if (rc)
 			cifs_server_dbg(VFS, "%s: Session Logoff failure rc=%d\n",
 				__func__, rc);
 		_free_xid(xid);
-	} else {
-		spin_unlock(&ses->ses_lock);
-		cifs_free_ipc(ses);
 	}
 
 	spin_lock(&cifs_tcp_ses_lock);
@@ -2192,6 +2183,7 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses)
 	}
 
 	++delim;
+	/* BB consider adding support for password2 (Key Rotation) for multiuser in future */
 	ctx->password = kstrndup(delim, len, GFP_KERNEL);
 	if (!ctx->password) {
 		cifs_dbg(FYI, "Unable to allocate %zd bytes for password\n",
@@ -2215,6 +2207,7 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses)
 			kfree(ctx->username);
 			ctx->username = NULL;
 			kfree_sensitive(ctx->password);
+			/* no need to free ctx->password2 since not allocated in this path */
 			ctx->password = NULL;
 			goto out_key_put;
 		}
@@ -2326,6 +2319,12 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
 		if (!ses->password)
 			goto get_ses_fail;
 	}
+	/* ctx->password freed at unmount */
+	if (ctx->password2) {
+		ses->password2 = kstrdup(ctx->password2, GFP_KERNEL);
+		if (!ses->password2)
+			goto get_ses_fail;
+	}
 	if (ctx->domainname) {
 		ses->domainName = kstrdup(ctx->domainname, GFP_KERNEL);
 		if (!ses->domainName)
@@ -2372,9 +2371,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx)
 	 * need to lock before changing something in the session.
 	 */
 	spin_lock(&cifs_tcp_ses_lock);
+	if (ctx->dfs_root_ses)
+		cifs_smb_ses_inc_refcount(ctx->dfs_root_ses);
 	ses->dfs_root_ses = ctx->dfs_root_ses;
-	if (ses->dfs_root_ses)
-		ses->dfs_root_ses->ses_count++;
 	list_add(&ses->smb_ses_list, &server->smb_ses_list);
 	spin_unlock(&cifs_tcp_ses_lock);
 
@@ -2803,6 +2802,8 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data)
 		return 0;
 	if (old->ctx->closetimeo != new->ctx->closetimeo)
 		return 0;
+	if (old->ctx->reparse_type != new->ctx->reparse_type)
+		return 0;
 
 	return 1;
 }
@@ -3323,6 +3324,9 @@ void cifs_mount_put_conns(struct cifs_mount_ctx *mnt_ctx)
 		cifs_put_smb_ses(mnt_ctx->ses);
 	else if (mnt_ctx->server)
 		cifs_put_tcp_session(mnt_ctx->server, 0);
+	mnt_ctx->ses = NULL;
+	mnt_ctx->tcon = NULL;
+	mnt_ctx->server = NULL;
 	mnt_ctx->cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_POSIX_PATHS;
 	free_xid(mnt_ctx->xid);
 }
@@ -3601,8 +3605,6 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
 	bool isdfs;
 	int rc;
 
-	INIT_LIST_HEAD(&mnt_ctx.dfs_ses_list);
-
 	rc = dfs_mount_share(&mnt_ctx, &isdfs);
 	if (rc)
 		goto error;
@@ -3633,7 +3635,6 @@ out:
 	return rc;
 
 error:
-	dfs_put_root_smb_sessions(&mnt_ctx.dfs_ses_list);
 	cifs_mount_put_conns(&mnt_ctx);
 	return rc;
 }
@@ -3648,6 +3649,18 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx)
 		goto error;
 
 	rc = cifs_mount_get_tcon(&mnt_ctx);
+	if (!rc) {
+		/*
+		 * Prevent superblock from being created with any missing
+		 * connections.
+		 */
+		if (WARN_ON(!mnt_ctx.server))
+			rc = -EHOSTDOWN;
+		else if (WARN_ON(!mnt_ctx.ses))
+			rc = -EACCES;
+		else if (WARN_ON(!mnt_ctx.tcon))
+			rc = -ENOENT;
+	}
 	if (rc)
 		goto error;
 
@@ -3985,13 +3998,14 @@ cifs_set_vol_auth(struct smb3_fs_context *ctx, struct cifs_ses *ses)
 }
 
 static struct cifs_tcon *
-cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
+__cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
 {
 	int rc;
 	struct cifs_tcon *master_tcon = cifs_sb_master_tcon(cifs_sb);
 	struct cifs_ses *ses;
 	struct cifs_tcon *tcon = NULL;
 	struct smb3_fs_context *ctx;
+	char *origin_fullpath = NULL;
 
 	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
 	if (ctx == NULL)
@@ -4015,6 +4029,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
 	ctx->sign = master_tcon->ses->sign;
 	ctx->seal = master_tcon->seal;
 	ctx->witness = master_tcon->use_witness;
+	ctx->dfs_root_ses = master_tcon->ses->dfs_root_ses;
 
 	rc = cifs_set_vol_auth(ctx, master_tcon->ses);
 	if (rc) {
@@ -4034,12 +4049,39 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
 		goto out;
 	}
 
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	spin_lock(&master_tcon->tc_lock);
+	if (master_tcon->origin_fullpath) {
+		spin_unlock(&master_tcon->tc_lock);
+		origin_fullpath = dfs_get_path(cifs_sb, cifs_sb->ctx->source);
+		if (IS_ERR(origin_fullpath)) {
+			tcon = ERR_CAST(origin_fullpath);
+			origin_fullpath = NULL;
+			cifs_put_smb_ses(ses);
+			goto out;
+		}
+	} else {
+		spin_unlock(&master_tcon->tc_lock);
+	}
+#endif
+
 	tcon = cifs_get_tcon(ses, ctx);
 	if (IS_ERR(tcon)) {
 		cifs_put_smb_ses(ses);
 		goto out;
 	}
 
+#ifdef CONFIG_CIFS_DFS_UPCALL
+	if (origin_fullpath) {
+		spin_lock(&tcon->tc_lock);
+		tcon->origin_fullpath = origin_fullpath;
+		spin_unlock(&tcon->tc_lock);
+		origin_fullpath = NULL;
+		queue_delayed_work(dfscache_wq, &tcon->dfs_cache_work,
+				   dfs_cache_get_ttl() * HZ);
+	}
+#endif
+
 #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
 	if (cap_unix(ses))
 		reset_cifs_unix_caps(0, tcon, NULL, ctx);
@@ -4048,11 +4090,23 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
 out:
 	kfree(ctx->username);
 	kfree_sensitive(ctx->password);
+	kfree(origin_fullpath);
 	kfree(ctx);
 
 	return tcon;
 }
 
+static struct cifs_tcon *
+cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
+{
+	struct cifs_tcon *ret;
+
+	cifs_mount_lock();
+	ret = __cifs_construct_tcon(cifs_sb, fsuid);
+	cifs_mount_unlock();
+	return ret;
+}
+
 struct cifs_tcon *
 cifs_sb_master_tcon(struct cifs_sb_info *cifs_sb)
 {
diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c
index 449c59830039..3ec965547e3d 100644
--- a/fs/smb/client/dfs.c
+++ b/fs/smb/client/dfs.c
@@ -66,33 +66,20 @@ static int get_session(struct cifs_mount_ctx *mnt_ctx, const char *full_path)
 }
 
 /*
- * Track individual DFS referral servers used by new DFS mount.
- *
- * On success, their lifetime will be shared by final tcon (dfs_ses_list).
- * Otherwise, they will be put by dfs_put_root_smb_sessions() in cifs_mount().
+ * Get an active reference of @ses so that next call to cifs_put_tcon() won't
+ * release it as any new DFS referrals must go through its IPC tcon.
  */
-static int add_root_smb_session(struct cifs_mount_ctx *mnt_ctx)
+static void add_root_smb_session(struct cifs_mount_ctx *mnt_ctx)
 {
 	struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
-	struct dfs_root_ses *root_ses;
 	struct cifs_ses *ses = mnt_ctx->ses;
 
 	if (ses) {
-		root_ses = kmalloc(sizeof(*root_ses), GFP_KERNEL);
-		if (!root_ses)
-			return -ENOMEM;
-
-		INIT_LIST_HEAD(&root_ses->list);
-
 		spin_lock(&cifs_tcp_ses_lock);
 		cifs_smb_ses_inc_refcount(ses);
 		spin_unlock(&cifs_tcp_ses_lock);
-		root_ses->ses = ses;
-		list_add_tail(&root_ses->list, &mnt_ctx->dfs_ses_list);
 	}
-	/* Select new DFS referral server so that new referrals go through it */
 	ctx->dfs_root_ses = ses;
-	return 0;
 }
 
 static inline int parse_dfs_target(struct smb3_fs_context *ctx,
@@ -185,11 +172,8 @@ again:
 					continue;
 			}
 
-			if (is_refsrv) {
-				rc = add_root_smb_session(mnt_ctx);
-				if (rc)
-					goto out;
-			}
+			if (is_refsrv)
+				add_root_smb_session(mnt_ctx);
 
 			rc = ref_walk_advance(rw);
 			if (!rc) {
@@ -232,6 +216,7 @@ static int __dfs_mount_share(struct cifs_mount_ctx *mnt_ctx)
 	struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
 	struct cifs_tcon *tcon;
 	char *origin_fullpath;
+	bool new_tcon = true;
 	int rc;
 
 	origin_fullpath = dfs_get_path(cifs_sb, ctx->source);
@@ -239,6 +224,18 @@ static int __dfs_mount_share(struct cifs_mount_ctx *mnt_ctx)
 		return PTR_ERR(origin_fullpath);
 
 	rc = dfs_referral_walk(mnt_ctx);
+	if (!rc) {
+		/*
+		 * Prevent superblock from being created with any missing
+		 * connections.
+		 */
+		if (WARN_ON(!mnt_ctx->server))
+			rc = -EHOSTDOWN;
+		else if (WARN_ON(!mnt_ctx->ses))
+			rc = -EACCES;
+		else if (WARN_ON(!mnt_ctx->tcon))
+			rc = -ENOENT;
+	}
 	if (rc)
 		goto out;
 
@@ -247,15 +244,14 @@ static int __dfs_mount_share(struct cifs_mount_ctx *mnt_ctx)
 	if (!tcon->origin_fullpath) {
 		tcon->origin_fullpath = origin_fullpath;
 		origin_fullpath = NULL;
+	} else {
+		new_tcon = false;
 	}
 	spin_unlock(&tcon->tc_lock);
 
-	if (list_empty(&tcon->dfs_ses_list)) {
-		list_replace_init(&mnt_ctx->dfs_ses_list, &tcon->dfs_ses_list);
+	if (new_tcon) {
 		queue_delayed_work(dfscache_wq, &tcon->dfs_cache_work,
 				   dfs_cache_get_ttl() * HZ);
-	} else {
-		dfs_put_root_smb_sessions(&mnt_ctx->dfs_ses_list);
 	}
 
 out:
@@ -298,7 +294,6 @@ int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs)
 	if (rc)
 		return rc;
 
-	ctx->dfs_root_ses = mnt_ctx->ses;
 	/*
 	 * If called with 'nodfs' mount option, then skip DFS resolving.  Otherwise unconditionally
 	 * try to get an DFS referral (even cached) to determine whether it is an DFS mount.
@@ -324,7 +319,9 @@ int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs)
 
 	*isdfs = true;
 	add_root_smb_session(mnt_ctx);
-	return __dfs_mount_share(mnt_ctx);
+	rc = __dfs_mount_share(mnt_ctx);
+	dfs_put_root_smb_sessions(mnt_ctx);
+	return rc;
 }
 
 /* Update dfs referral path of superblock */
diff --git a/fs/smb/client/dfs.h b/fs/smb/client/dfs.h
index 875ab7ae57fc..e5c4dcf83750 100644
--- a/fs/smb/client/dfs.h
+++ b/fs/smb/client/dfs.h
@@ -7,7 +7,9 @@
 #define _CIFS_DFS_H
 
 #include "cifsglob.h"
+#include "cifsproto.h"
 #include "fs_context.h"
+#include "dfs_cache.h"
 #include "cifs_unicode.h"
 #include <linux/namei.h>
 
@@ -114,11 +116,6 @@ static inline void ref_walk_set_tgt_hint(struct dfs_ref_walk *rw)
 				       ref_walk_tit(rw));
 }
 
-struct dfs_root_ses {
-	struct list_head list;
-	struct cifs_ses *ses;
-};
-
 int dfs_parse_target_referral(const char *full_path, const struct dfs_info3_param *ref,
 			      struct smb3_fs_context *ctx);
 int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs);
@@ -133,20 +130,32 @@ static inline int dfs_get_referral(struct cifs_mount_ctx *mnt_ctx, const char *p
 {
 	struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
 	struct cifs_sb_info *cifs_sb = mnt_ctx->cifs_sb;
+	struct cifs_ses *rses = ctx->dfs_root_ses ?: mnt_ctx->ses;
 
-	return dfs_cache_find(mnt_ctx->xid, ctx->dfs_root_ses, cifs_sb->local_nls,
+	return dfs_cache_find(mnt_ctx->xid, rses, cifs_sb->local_nls,
 			      cifs_remap(cifs_sb), path, ref, tl);
 }
 
-static inline void dfs_put_root_smb_sessions(struct list_head *head)
+/*
+ * cifs_get_smb_ses() already guarantees an active reference of
+ * @ses->dfs_root_ses when a new session is created, so we need to put extra
+ * references of all DFS root sessions that were used across the mount process
+ * in dfs_mount_share().
+ */
+static inline void dfs_put_root_smb_sessions(struct cifs_mount_ctx *mnt_ctx)
 {
-	struct dfs_root_ses *root, *tmp;
+	const struct smb3_fs_context *ctx = mnt_ctx->fs_ctx;
+	struct cifs_ses *ses = ctx->dfs_root_ses;
+	struct cifs_ses *cur;
+
+	if (!ses)
+		return;
 
-	list_for_each_entry_safe(root, tmp, head, list) {
-		list_del_init(&root->list);
-		cifs_put_smb_ses(root->ses);
-		kfree(root);
+	for (cur = ses; cur; cur = cur->dfs_root_ses) {
+		if (cur->dfs_root_ses)
+			cifs_put_smb_ses(cur->dfs_root_ses);
 	}
+	cifs_put_smb_ses(ses);
 }
 
 #endif /* _CIFS_DFS_H */
diff --git a/fs/smb/client/dfs_cache.c b/fs/smb/client/dfs_cache.c
index 508d831fabe3..11c8efecf7aa 100644
--- a/fs/smb/client/dfs_cache.c
+++ b/fs/smb/client/dfs_cache.c
@@ -1172,8 +1172,8 @@ static bool is_ses_good(struct cifs_ses *ses)
 	return ret;
 }
 
-/* Refresh dfs referral of tcon and mark it for reconnect if needed */
-static int __refresh_tcon(const char *path, struct cifs_ses *ses, bool force_refresh)
+/* Refresh dfs referral of @ses and mark it for reconnect if needed */
+static void __refresh_ses_referral(struct cifs_ses *ses, bool force_refresh)
 {
 	struct TCP_Server_Info *server = ses->server;
 	DFS_CACHE_TGT_LIST(old_tl);
@@ -1181,10 +1181,21 @@ static int __refresh_tcon(const char *path, struct cifs_ses *ses, bool force_ref
 	bool needs_refresh = false;
 	struct cache_entry *ce;
 	unsigned int xid;
+	char *path = NULL;
 	int rc = 0;
 
 	xid = get_xid();
 
+	mutex_lock(&server->refpath_lock);
+	if (server->leaf_fullpath) {
+		path = kstrdup(server->leaf_fullpath + 1, GFP_ATOMIC);
+		if (!path)
+			rc = -ENOMEM;
+	}
+	mutex_unlock(&server->refpath_lock);
+	if (!path)
+		goto out;
+
 	down_read(&htable_rw_lock);
 	ce = lookup_cache_entry(path);
 	needs_refresh = force_refresh || IS_ERR(ce) || cache_entry_expired(ce);
@@ -1218,19 +1229,17 @@ out:
 	free_xid(xid);
 	dfs_cache_free_tgts(&old_tl);
 	dfs_cache_free_tgts(&new_tl);
-	return rc;
+	kfree(path);
 }
 
-static int refresh_tcon(struct cifs_tcon *tcon, bool force_refresh)
+static inline void refresh_ses_referral(struct cifs_ses *ses)
 {
-	struct TCP_Server_Info *server = tcon->ses->server;
-	struct cifs_ses *ses = tcon->ses;
+	__refresh_ses_referral(ses, false);
+}
 
-	mutex_lock(&server->refpath_lock);
-	if (server->leaf_fullpath)
-		__refresh_tcon(server->leaf_fullpath + 1, ses, force_refresh);
-	mutex_unlock(&server->refpath_lock);
-	return 0;
+static inline void force_refresh_ses_referral(struct cifs_ses *ses)
+{
+	__refresh_ses_referral(ses, true);
 }
 
 /**
@@ -1271,34 +1280,20 @@ int dfs_cache_remount_fs(struct cifs_sb_info *cifs_sb)
 	 */
 	cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_USE_PREFIX_PATH;
 
-	return refresh_tcon(tcon, true);
+	force_refresh_ses_referral(tcon->ses);
+	return 0;
 }
 
 /* Refresh all DFS referrals related to DFS tcon */
 void dfs_cache_refresh(struct work_struct *work)
 {
-	struct TCP_Server_Info *server;
-	struct dfs_root_ses *rses;
 	struct cifs_tcon *tcon;
 	struct cifs_ses *ses;
 
 	tcon = container_of(work, struct cifs_tcon, dfs_cache_work.work);
-	ses = tcon->ses;
-	server = ses->server;
 
-	mutex_lock(&server->refpath_lock);
-	if (server->leaf_fullpath)
-		__refresh_tcon(server->leaf_fullpath + 1, ses, false);
-	mutex_unlock(&server->refpath_lock);
-
-	list_for_each_entry(rses, &tcon->dfs_ses_list, list) {
-		ses = rses->ses;
-		server = ses->server;
-		mutex_lock(&server->refpath_lock);
-		if (server->leaf_fullpath)
-			__refresh_tcon(server->leaf_fullpath + 1, ses, false);
-		mutex_unlock(&server->refpath_lock);
-	}
+	for (ses = tcon->ses; ses; ses = ses->dfs_root_ses)
+		refresh_ses_referral(ses);
 
 	queue_delayed_work(dfscache_wq, &tcon->dfs_cache_work,
 			   atomic_read(&dfs_cache_ttl) * HZ);
diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c
index 89333d9bce36..864b194dbaa0 100644
--- a/fs/smb/client/dir.c
+++ b/fs/smb/client/dir.c
@@ -189,6 +189,7 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned
 	int disposition;
 	struct TCP_Server_Info *server = tcon->ses->server;
 	struct cifs_open_parms oparms;
+	int rdwr_for_fscache = 0;
 
 	*oplock = 0;
 	if (tcon->ses->server->oplocks)
@@ -200,6 +201,10 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned
 		return PTR_ERR(full_path);
 	}
 
+	/* If we're caching, we need to be able to fill in around partial writes. */
+	if (cifs_fscache_enabled(inode) && (oflags & O_ACCMODE) == O_WRONLY)
+		rdwr_for_fscache = 1;
+
 #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
 	if (tcon->unix_ext && cap_unix(tcon->ses) && !tcon->broken_posix_open &&
 	    (CIFS_UNIX_POSIX_PATH_OPS_CAP &
@@ -276,6 +281,8 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned
 		desired_access |= GENERIC_READ; /* is this too little? */
 	if (OPEN_FMODE(oflags) & FMODE_WRITE)
 		desired_access |= GENERIC_WRITE;
+	if (rdwr_for_fscache == 1)
+		desired_access |= GENERIC_READ;
 
 	disposition = FILE_OVERWRITE_IF;
 	if ((oflags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
@@ -304,6 +311,7 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned
 	if (!tcon->unix_ext && (mode & S_IWUGO) == 0)
 		create_options |= CREATE_OPTION_READONLY;
 
+retry_open:
 	oparms = (struct cifs_open_parms) {
 		.tcon = tcon,
 		.cifs_sb = cifs_sb,
@@ -317,8 +325,15 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned
 	rc = server->ops->open(xid, &oparms, oplock, buf);
 	if (rc) {
 		cifs_dbg(FYI, "cifs_create returned 0x%x\n", rc);
+		if (rc == -EACCES && rdwr_for_fscache == 1) {
+			desired_access &= ~GENERIC_READ;
+			rdwr_for_fscache = 2;
+			goto retry_open;
+		}
 		goto out;
 	}
+	if (rdwr_for_fscache == 2)
+		cifs_invalidate_cache(inode, FSCACHE_INVAL_DIO_WRITE);
 
 #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
 	/*
@@ -612,11 +627,18 @@ int cifs_mknod(struct mnt_idmap *idmap, struct inode *inode,
 		goto mknod_out;
 	}
 
+	trace_smb3_mknod_enter(xid, tcon->ses->Suid, tcon->tid, full_path);
+
 	rc = tcon->ses->server->ops->make_node(xid, inode, direntry, tcon,
 					       full_path, mode,
 					       device_number);
 
 mknod_out:
+	if (rc)
+		trace_smb3_mknod_err(xid,  tcon->ses->Suid, tcon->tid, rc);
+	else
+		trace_smb3_mknod_done(xid, tcon->ses->Suid, tcon->tid);
+
 	free_dentry_path(page);
 	free_xid(xid);
 	cifs_put_tlink(tlink);
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index f391c9b803d8..9be37d0fe724 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -206,12 +206,12 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon)
 	 */
 }
 
-static inline int cifs_convert_flags(unsigned int flags)
+static inline int cifs_convert_flags(unsigned int flags, int rdwr_for_fscache)
 {
 	if ((flags & O_ACCMODE) == O_RDONLY)
 		return GENERIC_READ;
 	else if ((flags & O_ACCMODE) == O_WRONLY)
-		return GENERIC_WRITE;
+		return rdwr_for_fscache == 1 ? (GENERIC_READ | GENERIC_WRITE) : GENERIC_WRITE;
 	else if ((flags & O_ACCMODE) == O_RDWR) {
 		/* GENERIC_ALL is too much permission to request
 		   can cause unnecessary access denied on create */
@@ -329,7 +329,7 @@ int cifs_posix_open(const char *full_path, struct inode **pinode,
 		}
 	} else {
 		cifs_revalidate_mapping(*pinode);
-		rc = cifs_fattr_to_inode(*pinode, &fattr);
+		rc = cifs_fattr_to_inode(*pinode, &fattr, false);
 	}
 
 posix_open_ret:
@@ -348,11 +348,16 @@ static int cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_
 	int create_options = CREATE_NOT_DIR;
 	struct TCP_Server_Info *server = tcon->ses->server;
 	struct cifs_open_parms oparms;
+	int rdwr_for_fscache = 0;
 
 	if (!server->ops->open)
 		return -ENOSYS;
 
-	desired_access = cifs_convert_flags(f_flags);
+	/* If we're caching, we need to be able to fill in around partial writes. */
+	if (cifs_fscache_enabled(inode) && (f_flags & O_ACCMODE) == O_WRONLY)
+		rdwr_for_fscache = 1;
+
+	desired_access = cifs_convert_flags(f_flags, rdwr_for_fscache);
 
 /*********************************************************************
  *  open flag mapping table:
@@ -389,6 +394,7 @@ static int cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_
 	if (f_flags & O_DIRECT)
 		create_options |= CREATE_NO_BUFFER;
 
+retry_open:
 	oparms = (struct cifs_open_parms) {
 		.tcon = tcon,
 		.cifs_sb = cifs_sb,
@@ -400,8 +406,16 @@ static int cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_
 	};
 
 	rc = server->ops->open(xid, &oparms, oplock, buf);
-	if (rc)
+	if (rc) {
+		if (rc == -EACCES && rdwr_for_fscache == 1) {
+			desired_access = cifs_convert_flags(f_flags, 0);
+			rdwr_for_fscache = 2;
+			goto retry_open;
+		}
 		return rc;
+	}
+	if (rdwr_for_fscache == 2)
+		cifs_invalidate_cache(inode, FSCACHE_INVAL_DIO_WRITE);
 
 	/* TODO: Add support for calling posix query info but with passing in fid */
 	if (tcon->unix_ext)
@@ -445,6 +459,7 @@ cifs_down_write(struct rw_semaphore *sem)
 }
 
 static void cifsFileInfo_put_work(struct work_struct *work);
+void serverclose_work(struct work_struct *work);
 
 struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
 				       struct tcon_link *tlink, __u32 oplock,
@@ -491,6 +506,7 @@ struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
 	cfile->tlink = cifs_get_tlink(tlink);
 	INIT_WORK(&cfile->oplock_break, cifs_oplock_break);
 	INIT_WORK(&cfile->put, cifsFileInfo_put_work);
+	INIT_WORK(&cfile->serverclose, serverclose_work);
 	INIT_DELAYED_WORK(&cfile->deferred, smb2_deferred_work_close);
 	mutex_init(&cfile->fh_mutex);
 	spin_lock_init(&cfile->file_info_lock);
@@ -582,6 +598,40 @@ static void cifsFileInfo_put_work(struct work_struct *work)
 	cifsFileInfo_put_final(cifs_file);
 }
 
+void serverclose_work(struct work_struct *work)
+{
+	struct cifsFileInfo *cifs_file = container_of(work,
+			struct cifsFileInfo, serverclose);
+
+	struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink);
+
+	struct TCP_Server_Info *server = tcon->ses->server;
+	int rc = 0;
+	int retries = 0;
+	int MAX_RETRIES = 4;
+
+	do {
+		if (server->ops->close_getattr)
+			rc = server->ops->close_getattr(0, tcon, cifs_file);
+		else if (server->ops->close)
+			rc = server->ops->close(0, tcon, &cifs_file->fid);
+
+		if (rc == -EBUSY || rc == -EAGAIN) {
+			retries++;
+			msleep(250);
+		}
+	} while ((rc == -EBUSY || rc == -EAGAIN) && (retries < MAX_RETRIES)
+	);
+
+	if (retries == MAX_RETRIES)
+		pr_warn("Serverclose failed %d times, giving up\n", MAX_RETRIES);
+
+	if (cifs_file->offload)
+		queue_work(fileinfo_put_wq, &cifs_file->put);
+	else
+		cifsFileInfo_put_final(cifs_file);
+}
+
 /**
  * cifsFileInfo_put - release a reference of file priv data
  *
@@ -622,10 +672,13 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file,
 	struct cifs_fid fid = {};
 	struct cifs_pending_open open;
 	bool oplock_break_cancelled;
+	bool serverclose_offloaded = false;
 
 	spin_lock(&tcon->open_file_lock);
 	spin_lock(&cifsi->open_file_lock);
 	spin_lock(&cifs_file->file_info_lock);
+
+	cifs_file->offload = offload;
 	if (--cifs_file->count > 0) {
 		spin_unlock(&cifs_file->file_info_lock);
 		spin_unlock(&cifsi->open_file_lock);
@@ -667,13 +720,20 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file,
 	if (!tcon->need_reconnect && !cifs_file->invalidHandle) {
 		struct TCP_Server_Info *server = tcon->ses->server;
 		unsigned int xid;
+		int rc = 0;
 
 		xid = get_xid();
 		if (server->ops->close_getattr)
-			server->ops->close_getattr(xid, tcon, cifs_file);
+			rc = server->ops->close_getattr(xid, tcon, cifs_file);
 		else if (server->ops->close)
-			server->ops->close(xid, tcon, &cifs_file->fid);
+			rc = server->ops->close(xid, tcon, &cifs_file->fid);
 		_free_xid(xid);
+
+		if (rc == -EBUSY || rc == -EAGAIN) {
+			// Server close failed, hence offloading it as an async op
+			queue_work(serverclose_wq, &cifs_file->serverclose);
+			serverclose_offloaded = true;
+		}
 	}
 
 	if (oplock_break_cancelled)
@@ -681,10 +741,15 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file,
 
 	cifs_del_pending_open(&open);
 
-	if (offload)
-		queue_work(fileinfo_put_wq, &cifs_file->put);
-	else
-		cifsFileInfo_put_final(cifs_file);
+	// if serverclose has been offloaded to wq (on failure), it will
+	// handle offloading put as well. If serverclose not offloaded,
+	// we need to handle offloading put here.
+	if (!serverclose_offloaded) {
+		if (offload)
+			queue_work(fileinfo_put_wq, &cifs_file->put);
+		else
+			cifsFileInfo_put_final(cifs_file);
+	}
 }
 
 int cifs_open(struct inode *inode, struct file *file)
@@ -834,11 +899,11 @@ int cifs_open(struct inode *inode, struct file *file)
 use_cache:
 	fscache_use_cookie(cifs_inode_cookie(file_inode(file)),
 			   file->f_mode & FMODE_WRITE);
-	if (file->f_flags & O_DIRECT &&
-	    (!((file->f_flags & O_ACCMODE) != O_RDONLY) ||
-	     file->f_flags & O_APPEND))
-		cifs_invalidate_cache(file_inode(file),
-				      FSCACHE_INVAL_DIO_WRITE);
+	if (!(file->f_flags & O_DIRECT))
+		goto out;
+	if ((file->f_flags & (O_ACCMODE | O_APPEND)) == O_RDONLY)
+		goto out;
+	cifs_invalidate_cache(file_inode(file), FSCACHE_INVAL_DIO_WRITE);
 
 out:
 	free_dentry_path(page);
@@ -903,6 +968,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
 	int disposition = FILE_OPEN;
 	int create_options = CREATE_NOT_DIR;
 	struct cifs_open_parms oparms;
+	int rdwr_for_fscache = 0;
 
 	xid = get_xid();
 	mutex_lock(&cfile->fh_mutex);
@@ -966,7 +1032,11 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
 	}
 #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
 
-	desired_access = cifs_convert_flags(cfile->f_flags);
+	/* If we're caching, we need to be able to fill in around partial writes. */
+	if (cifs_fscache_enabled(inode) && (cfile->f_flags & O_ACCMODE) == O_WRONLY)
+		rdwr_for_fscache = 1;
+
+	desired_access = cifs_convert_flags(cfile->f_flags, rdwr_for_fscache);
 
 	/* O_SYNC also has bit for O_DSYNC so following check picks up either */
 	if (cfile->f_flags & O_SYNC)
@@ -978,6 +1048,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
 	if (server->ops->get_lease_key)
 		server->ops->get_lease_key(inode, &cfile->fid);
 
+retry_open:
 	oparms = (struct cifs_open_parms) {
 		.tcon = tcon,
 		.cifs_sb = cifs_sb,
@@ -1003,6 +1074,11 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
 		/* indicate that we need to relock the file */
 		oparms.reconnect = true;
 	}
+	if (rc == -EACCES && rdwr_for_fscache == 1) {
+		desired_access = cifs_convert_flags(cfile->f_flags, 0);
+		rdwr_for_fscache = 2;
+		goto retry_open;
+	}
 
 	if (rc) {
 		mutex_unlock(&cfile->fh_mutex);
@@ -1011,6 +1087,9 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
 		goto reopen_error_exit;
 	}
 
+	if (rdwr_for_fscache == 2)
+		cifs_invalidate_cache(inode, FSCACHE_INVAL_DIO_WRITE);
+
 #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
 reopen_success:
 #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
@@ -1072,6 +1151,19 @@ void smb2_deferred_work_close(struct work_struct *work)
 	_cifsFileInfo_put(cfile, true, false);
 }
 
+static bool
+smb2_can_defer_close(struct inode *inode, struct cifs_deferred_close *dclose)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsInodeInfo *cinode = CIFS_I(inode);
+
+	return (cifs_sb->ctx->closetimeo && cinode->lease_granted && dclose &&
+			(cinode->oplock == CIFS_CACHE_RHW_FLG ||
+			 cinode->oplock == CIFS_CACHE_RH_FLG) &&
+			!test_bit(CIFS_INO_CLOSE_ON_LOCK, &cinode->flags));
+
+}
+
 int cifs_close(struct inode *inode, struct file *file)
 {
 	struct cifsFileInfo *cfile;
@@ -1085,10 +1177,8 @@ int cifs_close(struct inode *inode, struct file *file)
 		cfile = file->private_data;
 		file->private_data = NULL;
 		dclose = kmalloc(sizeof(struct cifs_deferred_close), GFP_KERNEL);
-		if ((cifs_sb->ctx->closetimeo && cinode->oplock == CIFS_CACHE_RHW_FLG)
-		    && cinode->lease_granted &&
-		    !test_bit(CIFS_INO_CLOSE_ON_LOCK, &cinode->flags) &&
-		    dclose) {
+		if ((cfile->status_file_deleted == false) &&
+		    (smb2_can_defer_close(inode, dclose))) {
 			if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) {
 				inode_set_mtime_to_ts(inode,
 						      inode_set_ctime_current(inode));
@@ -1315,20 +1405,20 @@ cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
 	down_read(&cinode->lock_sem);
 
 	exist = cifs_find_lock_conflict(cfile, offset, length, type,
-					flock->fl_flags, &conf_lock,
+					flock->c.flc_flags, &conf_lock,
 					CIFS_LOCK_OP);
 	if (exist) {
 		flock->fl_start = conf_lock->offset;
 		flock->fl_end = conf_lock->offset + conf_lock->length - 1;
-		flock->fl_pid = conf_lock->pid;
+		flock->c.flc_pid = conf_lock->pid;
 		if (conf_lock->type & server->vals->shared_lock_type)
-			flock->fl_type = F_RDLCK;
+			flock->c.flc_type = F_RDLCK;
 		else
-			flock->fl_type = F_WRLCK;
+			flock->c.flc_type = F_WRLCK;
 	} else if (!cinode->can_cache_brlcks)
 		rc = 1;
 	else
-		flock->fl_type = F_UNLCK;
+		flock->c.flc_type = F_UNLCK;
 
 	up_read(&cinode->lock_sem);
 	return rc;
@@ -1404,16 +1494,16 @@ cifs_posix_lock_test(struct file *file, struct file_lock *flock)
 {
 	int rc = 0;
 	struct cifsInodeInfo *cinode = CIFS_I(file_inode(file));
-	unsigned char saved_type = flock->fl_type;
+	unsigned char saved_type = flock->c.flc_type;
 
-	if ((flock->fl_flags & FL_POSIX) == 0)
+	if ((flock->c.flc_flags & FL_POSIX) == 0)
 		return 1;
 
 	down_read(&cinode->lock_sem);
 	posix_test_lock(file, flock);
 
-	if (flock->fl_type == F_UNLCK && !cinode->can_cache_brlcks) {
-		flock->fl_type = saved_type;
+	if (lock_is_unlock(flock) && !cinode->can_cache_brlcks) {
+		flock->c.flc_type = saved_type;
 		rc = 1;
 	}
 
@@ -1434,7 +1524,7 @@ cifs_posix_lock_set(struct file *file, struct file_lock *flock)
 	struct cifsInodeInfo *cinode = CIFS_I(file_inode(file));
 	int rc = FILE_LOCK_DEFERRED + 1;
 
-	if ((flock->fl_flags & FL_POSIX) == 0)
+	if ((flock->c.flc_flags & FL_POSIX) == 0)
 		return rc;
 
 	cifs_down_write(&cinode->lock_sem);
@@ -1584,7 +1674,9 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
 
 	el = locks_to_send.next;
 	spin_lock(&flctx->flc_lock);
-	list_for_each_entry(flock, &flctx->flc_posix, fl_list) {
+	for_each_file_lock(flock, &flctx->flc_posix) {
+		unsigned char ftype = flock->c.flc_type;
+
 		if (el == &locks_to_send) {
 			/*
 			 * The list ended. We don't have enough allocated
@@ -1594,12 +1686,12 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
 			break;
 		}
 		length = cifs_flock_len(flock);
-		if (flock->fl_type == F_RDLCK || flock->fl_type == F_SHLCK)
+		if (ftype == F_RDLCK || ftype == F_SHLCK)
 			type = CIFS_RDLCK;
 		else
 			type = CIFS_WRLCK;
 		lck = list_entry(el, struct lock_to_push, llist);
-		lck->pid = hash_lockowner(flock->fl_owner);
+		lck->pid = hash_lockowner(flock->c.flc_owner);
 		lck->netfid = cfile->fid.netfid;
 		lck->length = length;
 		lck->type = type;
@@ -1666,42 +1758,43 @@ static void
 cifs_read_flock(struct file_lock *flock, __u32 *type, int *lock, int *unlock,
 		bool *wait_flag, struct TCP_Server_Info *server)
 {
-	if (flock->fl_flags & FL_POSIX)
+	if (flock->c.flc_flags & FL_POSIX)
 		cifs_dbg(FYI, "Posix\n");
-	if (flock->fl_flags & FL_FLOCK)
+	if (flock->c.flc_flags & FL_FLOCK)
 		cifs_dbg(FYI, "Flock\n");
-	if (flock->fl_flags & FL_SLEEP) {
+	if (flock->c.flc_flags & FL_SLEEP) {
 		cifs_dbg(FYI, "Blocking lock\n");
 		*wait_flag = true;
 	}
-	if (flock->fl_flags & FL_ACCESS)
+	if (flock->c.flc_flags & FL_ACCESS)
 		cifs_dbg(FYI, "Process suspended by mandatory locking - not implemented yet\n");
-	if (flock->fl_flags & FL_LEASE)
+	if (flock->c.flc_flags & FL_LEASE)
 		cifs_dbg(FYI, "Lease on file - not implemented yet\n");
-	if (flock->fl_flags &
+	if (flock->c.flc_flags &
 	    (~(FL_POSIX | FL_FLOCK | FL_SLEEP |
 	       FL_ACCESS | FL_LEASE | FL_CLOSE | FL_OFDLCK)))
-		cifs_dbg(FYI, "Unknown lock flags 0x%x\n", flock->fl_flags);
+		cifs_dbg(FYI, "Unknown lock flags 0x%x\n",
+		         flock->c.flc_flags);
 
 	*type = server->vals->large_lock_type;
-	if (flock->fl_type == F_WRLCK) {
+	if (lock_is_write(flock)) {
 		cifs_dbg(FYI, "F_WRLCK\n");
 		*type |= server->vals->exclusive_lock_type;
 		*lock = 1;
-	} else if (flock->fl_type == F_UNLCK) {
+	} else if (lock_is_unlock(flock)) {
 		cifs_dbg(FYI, "F_UNLCK\n");
 		*type |= server->vals->unlock_lock_type;
 		*unlock = 1;
 		/* Check if unlock includes more than one lock range */
-	} else if (flock->fl_type == F_RDLCK) {
+	} else if (lock_is_read(flock)) {
 		cifs_dbg(FYI, "F_RDLCK\n");
 		*type |= server->vals->shared_lock_type;
 		*lock = 1;
-	} else if (flock->fl_type == F_EXLCK) {
+	} else if (flock->c.flc_type == F_EXLCK) {
 		cifs_dbg(FYI, "F_EXLCK\n");
 		*type |= server->vals->exclusive_lock_type;
 		*lock = 1;
-	} else if (flock->fl_type == F_SHLCK) {
+	} else if (flock->c.flc_type == F_SHLCK) {
 		cifs_dbg(FYI, "F_SHLCK\n");
 		*type |= server->vals->shared_lock_type;
 		*lock = 1;
@@ -1733,7 +1826,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
 		else
 			posix_lock_type = CIFS_WRLCK;
 		rc = CIFSSMBPosixLock(xid, tcon, netfid,
-				      hash_lockowner(flock->fl_owner),
+				      hash_lockowner(flock->c.flc_owner),
 				      flock->fl_start, length, flock,
 				      posix_lock_type, wait_flag);
 		return rc;
@@ -1750,7 +1843,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
 	if (rc == 0) {
 		rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length,
 					    type, 0, 1, false);
-		flock->fl_type = F_UNLCK;
+		flock->c.flc_type = F_UNLCK;
 		if (rc != 0)
 			cifs_dbg(VFS, "Error unlocking previously locked range %d during test of lock\n",
 				 rc);
@@ -1758,7 +1851,7 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
 	}
 
 	if (type & server->vals->shared_lock_type) {
-		flock->fl_type = F_WRLCK;
+		flock->c.flc_type = F_WRLCK;
 		return 0;
 	}
 
@@ -1770,12 +1863,12 @@ cifs_getlk(struct file *file, struct file_lock *flock, __u32 type,
 	if (rc == 0) {
 		rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length,
 			type | server->vals->shared_lock_type, 0, 1, false);
-		flock->fl_type = F_RDLCK;
+		flock->c.flc_type = F_RDLCK;
 		if (rc != 0)
 			cifs_dbg(VFS, "Error unlocking previously locked range %d during test of lock\n",
 				 rc);
 	} else
-		flock->fl_type = F_WRLCK;
+		flock->c.flc_type = F_WRLCK;
 
 	return 0;
 }
@@ -1943,7 +2036,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
 			posix_lock_type = CIFS_UNLCK;
 
 		rc = CIFSSMBPosixLock(xid, tcon, cfile->fid.netfid,
-				      hash_lockowner(flock->fl_owner),
+				      hash_lockowner(flock->c.flc_owner),
 				      flock->fl_start, length,
 				      NULL, posix_lock_type, wait_flag);
 		goto out;
@@ -1953,7 +2046,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
 		struct cifsLockInfo *lock;
 
 		lock = cifs_lock_init(flock->fl_start, length, type,
-				      flock->fl_flags);
+				      flock->c.flc_flags);
 		if (!lock)
 			return -ENOMEM;
 
@@ -1992,7 +2085,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
 		rc = server->ops->mand_unlock_range(cfile, flock, xid);
 
 out:
-	if ((flock->fl_flags & FL_POSIX) || (flock->fl_flags & FL_FLOCK)) {
+	if ((flock->c.flc_flags & FL_POSIX) || (flock->c.flc_flags & FL_FLOCK)) {
 		/*
 		 * If this is a request to remove all locks because we
 		 * are closing the file, it doesn't matter if the
@@ -2001,7 +2094,7 @@ out:
 		 */
 		if (rc) {
 			cifs_dbg(VFS, "%s failed rc=%d\n", __func__, rc);
-			if (!(flock->fl_flags & FL_CLOSE))
+			if (!(flock->c.flc_flags & FL_CLOSE))
 				return rc;
 		}
 		rc = locks_lock_file_wait(file, flock);
@@ -2022,7 +2115,7 @@ int cifs_flock(struct file *file, int cmd, struct file_lock *fl)
 
 	xid = get_xid();
 
-	if (!(fl->fl_flags & FL_FLOCK)) {
+	if (!(fl->c.flc_flags & FL_FLOCK)) {
 		rc = -ENOLCK;
 		free_xid(xid);
 		return rc;
@@ -2073,7 +2166,8 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock)
 	xid = get_xid();
 
 	cifs_dbg(FYI, "%s: %pD2 cmd=0x%x type=0x%x flags=0x%x r=%lld:%lld\n", __func__, file, cmd,
-		 flock->fl_flags, flock->fl_type, (long long)flock->fl_start,
+		 flock->c.flc_flags, flock->c.flc_type,
+		 (long long)flock->fl_start,
 		 (long long)flock->fl_end);
 
 	cfile = (struct cifsFileInfo *)file->private_data;
@@ -2624,20 +2718,20 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
  * dirty pages if possible, but don't sleep while doing so.
  */
 static void cifs_extend_writeback(struct address_space *mapping,
+				  struct xa_state *xas,
 				  long *_count,
 				  loff_t start,
 				  int max_pages,
-				  size_t max_len,
-				  unsigned int *_len)
+				  loff_t max_len,
+				  size_t *_len)
 {
 	struct folio_batch batch;
 	struct folio *folio;
-	unsigned int psize, nr_pages;
-	size_t len = *_len;
-	pgoff_t index = (start + len) / PAGE_SIZE;
+	unsigned int nr_pages;
+	pgoff_t index = (start + *_len) / PAGE_SIZE;
+	size_t len;
 	bool stop = true;
 	unsigned int i;
-	XA_STATE(xas, &mapping->i_pages, index);
 
 	folio_batch_init(&batch);
 
@@ -2648,54 +2742,64 @@ static void cifs_extend_writeback(struct address_space *mapping,
 		 */
 		rcu_read_lock();
 
-		xas_for_each(&xas, folio, ULONG_MAX) {
+		xas_for_each(xas, folio, ULONG_MAX) {
 			stop = true;
-			if (xas_retry(&xas, folio))
+			if (xas_retry(xas, folio))
 				continue;
 			if (xa_is_value(folio))
 				break;
-			if (folio->index != index)
+			if (folio->index != index) {
+				xas_reset(xas);
 				break;
+			}
+
 			if (!folio_try_get_rcu(folio)) {
-				xas_reset(&xas);
+				xas_reset(xas);
 				continue;
 			}
 			nr_pages = folio_nr_pages(folio);
-			if (nr_pages > max_pages)
+			if (nr_pages > max_pages) {
+				xas_reset(xas);
 				break;
+			}
 
 			/* Has the page moved or been split? */
-			if (unlikely(folio != xas_reload(&xas))) {
+			if (unlikely(folio != xas_reload(xas))) {
 				folio_put(folio);
+				xas_reset(xas);
 				break;
 			}
 
 			if (!folio_trylock(folio)) {
 				folio_put(folio);
+				xas_reset(xas);
 				break;
 			}
-			if (!folio_test_dirty(folio) || folio_test_writeback(folio)) {
+			if (!folio_test_dirty(folio) ||
+			    folio_test_writeback(folio)) {
 				folio_unlock(folio);
 				folio_put(folio);
+				xas_reset(xas);
 				break;
 			}
 
 			max_pages -= nr_pages;
-			psize = folio_size(folio);
-			len += psize;
+			len = folio_size(folio);
 			stop = false;
-			if (max_pages <= 0 || len >= max_len || *_count <= 0)
-				stop = true;
 
 			index += nr_pages;
+			*_count -= nr_pages;
+			*_len += len;
+			if (max_pages <= 0 || *_len >= max_len || *_count <= 0)
+				stop = true;
+
 			if (!folio_batch_add(&batch, folio))
 				break;
 			if (stop)
 				break;
 		}
 
-		if (!stop)
-			xas_pause(&xas);
+		xas_pause(xas);
 		rcu_read_unlock();
 
 		/* Now, if we obtained any pages, we can shift them to being
@@ -2712,16 +2816,12 @@ static void cifs_extend_writeback(struct address_space *mapping,
 			if (!folio_clear_dirty_for_io(folio))
 				WARN_ON(1);
 			folio_start_writeback(folio);
-
-			*_count -= folio_nr_pages(folio);
 			folio_unlock(folio);
 		}
 
 		folio_batch_release(&batch);
 		cond_resched();
 	} while (!stop);
-
-	*_len = len;
 }
 
 /*
@@ -2729,8 +2829,10 @@ static void cifs_extend_writeback(struct address_space *mapping,
  */
 static ssize_t cifs_write_back_from_locked_folio(struct address_space *mapping,
 						 struct writeback_control *wbc,
+						 struct xa_state *xas,
 						 struct folio *folio,
-						 loff_t start, loff_t end)
+						 unsigned long long start,
+						 unsigned long long end)
 {
 	struct inode *inode = mapping->host;
 	struct TCP_Server_Info *server;
@@ -2739,17 +2841,18 @@ static ssize_t cifs_write_back_from_locked_folio(struct address_space *mapping,
 	struct cifs_credits credits_on_stack;
 	struct cifs_credits *credits = &credits_on_stack;
 	struct cifsFileInfo *cfile = NULL;
-	unsigned int xid, wsize, len;
-	loff_t i_size = i_size_read(inode);
-	size_t max_len;
+	unsigned long long i_size = i_size_read(inode), max_len;
+	unsigned int xid, wsize;
+	size_t len = folio_size(folio);
 	long count = wbc->nr_to_write;
 	int rc;
 
 	/* The folio should be locked, dirty and not undergoing writeback. */
+	if (!folio_clear_dirty_for_io(folio))
+		WARN_ON_ONCE(1);
 	folio_start_writeback(folio);
 
 	count -= folio_nr_pages(folio);
-	len = folio_size(folio);
 
 	xid = get_xid();
 	server = cifs_pick_channel(cifs_sb_master_tcon(cifs_sb)->ses);
@@ -2779,9 +2882,10 @@ static ssize_t cifs_write_back_from_locked_folio(struct address_space *mapping,
 	wdata->server = server;
 	cfile = NULL;
 
-	/* Find all consecutive lockable dirty pages, stopping when we find a
-	 * page that is not immediately lockable, is not dirty or is missing,
-	 * or we reach the end of the range.
+	/* Find all consecutive lockable dirty pages that have contiguous
+	 * written regions, stopping when we find a page that is not
+	 * immediately lockable, is not dirty or is missing, or we reach the
+	 * end of the range.
 	 */
 	if (start < i_size) {
 		/* Trim the write to the EOF; the extra data is ignored.  Also
@@ -2801,19 +2905,18 @@ static ssize_t cifs_write_back_from_locked_folio(struct address_space *mapping,
 			max_pages -= folio_nr_pages(folio);
 
 			if (max_pages > 0)
-				cifs_extend_writeback(mapping, &count, start,
+				cifs_extend_writeback(mapping, xas, &count, start,
 						      max_pages, max_len, &len);
 		}
-		len = min_t(loff_t, len, max_len);
 	}
-
-	wdata->bytes = len;
+	len = min_t(unsigned long long, len, i_size - start);
 
 	/* We now have a contiguous set of dirty pages, each with writeback
 	 * set; the first page is still locked at this point, but all the rest
 	 * have been unlocked.
 	 */
 	folio_unlock(folio);
+	wdata->bytes = len;
 
 	if (start < i_size) {
 		iov_iter_xarray(&wdata->iter, ITER_SOURCE, &mapping->i_pages,
@@ -2864,102 +2967,118 @@ err_xid:
 /*
  * write a region of pages back to the server
  */
-static int cifs_writepages_region(struct address_space *mapping,
-				  struct writeback_control *wbc,
-				  loff_t start, loff_t end, loff_t *_next)
+static ssize_t cifs_writepages_begin(struct address_space *mapping,
+				     struct writeback_control *wbc,
+				     struct xa_state *xas,
+				     unsigned long long *_start,
+				     unsigned long long end)
 {
-	struct folio_batch fbatch;
+	struct folio *folio;
+	unsigned long long start = *_start;
+	ssize_t ret;
 	int skips = 0;
 
-	folio_batch_init(&fbatch);
-	do {
-		int nr;
-		pgoff_t index = start / PAGE_SIZE;
+search_again:
+	/* Find the first dirty page. */
+	rcu_read_lock();
 
-		nr = filemap_get_folios_tag(mapping, &index, end / PAGE_SIZE,
-					    PAGECACHE_TAG_DIRTY, &fbatch);
-		if (!nr)
+	for (;;) {
+		folio = xas_find_marked(xas, end / PAGE_SIZE, PAGECACHE_TAG_DIRTY);
+		if (xas_retry(xas, folio) || xa_is_value(folio))
+			continue;
+		if (!folio)
 			break;
 
-		for (int i = 0; i < nr; i++) {
-			ssize_t ret;
-			struct folio *folio = fbatch.folios[i];
+		if (!folio_try_get_rcu(folio)) {
+			xas_reset(xas);
+			continue;
+		}
 
-redo_folio:
-			start = folio_pos(folio); /* May regress with THPs */
+		if (unlikely(folio != xas_reload(xas))) {
+			folio_put(folio);
+			xas_reset(xas);
+			continue;
+		}
 
-			/* At this point we hold neither the i_pages lock nor the
-			 * page lock: the page may be truncated or invalidated
-			 * (changing page->mapping to NULL), or even swizzled
-			 * back from swapper_space to tmpfs file mapping
-			 */
-			if (wbc->sync_mode != WB_SYNC_NONE) {
-				ret = folio_lock_killable(folio);
-				if (ret < 0)
-					goto write_error;
-			} else {
-				if (!folio_trylock(folio))
-					goto skip_write;
-			}
+		xas_pause(xas);
+		break;
+	}
+	rcu_read_unlock();
+	if (!folio)
+		return 0;
 
-			if (folio->mapping != mapping ||
-			    !folio_test_dirty(folio)) {
-				start += folio_size(folio);
-				folio_unlock(folio);
-				continue;
-			}
+	start = folio_pos(folio); /* May regress with THPs */
 
-			if (folio_test_writeback(folio) ||
-			    folio_test_fscache(folio)) {
-				folio_unlock(folio);
-				if (wbc->sync_mode == WB_SYNC_NONE)
-					goto skip_write;
+	/* At this point we hold neither the i_pages lock nor the page lock:
+	 * the page may be truncated or invalidated (changing page->mapping to
+	 * NULL), or even swizzled back from swapper_space to tmpfs file
+	 * mapping
+	 */
+lock_again:
+	if (wbc->sync_mode != WB_SYNC_NONE) {
+		ret = folio_lock_killable(folio);
+		if (ret < 0)
+			return ret;
+	} else {
+		if (!folio_trylock(folio))
+			goto search_again;
+	}
+
+	if (folio->mapping != mapping ||
+	    !folio_test_dirty(folio)) {
+		start += folio_size(folio);
+		folio_unlock(folio);
+		goto search_again;
+	}
 
-				folio_wait_writeback(folio);
+	if (folio_test_writeback(folio) ||
+	    folio_test_fscache(folio)) {
+		folio_unlock(folio);
+		if (wbc->sync_mode != WB_SYNC_NONE) {
+			folio_wait_writeback(folio);
 #ifdef CONFIG_CIFS_FSCACHE
-				folio_wait_fscache(folio);
+			folio_wait_fscache(folio);
 #endif
-				goto redo_folio;
-			}
-
-			if (!folio_clear_dirty_for_io(folio))
-				/* We hold the page lock - it should've been dirty. */
-				WARN_ON(1);
-
-			ret = cifs_write_back_from_locked_folio(mapping, wbc, folio, start, end);
-			if (ret < 0)
-				goto write_error;
-
-			start += ret;
-			continue;
-
-write_error:
-			folio_batch_release(&fbatch);
-			*_next = start;
-			return ret;
+			goto lock_again;
+		}
 
-skip_write:
-			/*
-			 * Too many skipped writes, or need to reschedule?
-			 * Treat it as a write error without an error code.
-			 */
+		start += folio_size(folio);
+		if (wbc->sync_mode == WB_SYNC_NONE) {
 			if (skips >= 5 || need_resched()) {
 				ret = 0;
-				goto write_error;
+				goto out;
 			}
-
-			/* Otherwise, just skip that folio and go on to the next */
 			skips++;
-			start += folio_size(folio);
-			continue;
 		}
+		goto search_again;
+	}
 
-		folio_batch_release(&fbatch);		
-		cond_resched();
-	} while (wbc->nr_to_write > 0);
+	ret = cifs_write_back_from_locked_folio(mapping, wbc, xas, folio, start, end);
+out:
+	if (ret > 0)
+		*_start = start + ret;
+	return ret;
+}
 
-	*_next = start;
-	return 0;
+/*
+ * Write a region of pages back to the server
+ */
+static int cifs_writepages_region(struct address_space *mapping,
+				  struct writeback_control *wbc,
+				  unsigned long long *_start,
+				  unsigned long long end)
+{
+	ssize_t ret;
+
+	XA_STATE(xas, &mapping->i_pages, *_start / PAGE_SIZE);
+
+	do {
+		ret = cifs_writepages_begin(mapping, wbc, &xas, _start, end);
+		if (ret > 0 && wbc->nr_to_write > 0)
+			cond_resched();
+	} while (ret > 0 && wbc->nr_to_write > 0);
+
+	return ret > 0 ? 0 : ret;
 }
 
 /*
@@ -2968,7 +3087,7 @@ skip_write:
 static int cifs_writepages(struct address_space *mapping,
 			   struct writeback_control *wbc)
 {
-	loff_t start, next;
+	loff_t start, end;
 	int ret;
 
 	/* We have to be careful as we can end up racing with setattr()
@@ -2976,28 +3095,34 @@ static int cifs_writepages(struct address_space *mapping,
 	 * to prevent it.
 	 */
 
-	if (wbc->range_cyclic) {
+	if (wbc->range_cyclic && mapping->writeback_index) {
 		start = mapping->writeback_index * PAGE_SIZE;
-		ret = cifs_writepages_region(mapping, wbc, start, LLONG_MAX, &next);
-		if (ret == 0) {
-			mapping->writeback_index = next / PAGE_SIZE;
-			if (start > 0 && wbc->nr_to_write > 0) {
-				ret = cifs_writepages_region(mapping, wbc, 0,
-							     start, &next);
-				if (ret == 0)
-					mapping->writeback_index =
-						next / PAGE_SIZE;
-			}
+		ret = cifs_writepages_region(mapping, wbc, &start, LLONG_MAX);
+		if (ret < 0)
+			goto out;
+
+		if (wbc->nr_to_write <= 0) {
+			mapping->writeback_index = start / PAGE_SIZE;
+			goto out;
 		}
+
+		start = 0;
+		end = mapping->writeback_index * PAGE_SIZE;
+		mapping->writeback_index = 0;
+		ret = cifs_writepages_region(mapping, wbc, &start, end);
+		if (ret == 0)
+			mapping->writeback_index = start / PAGE_SIZE;
 	} else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
-		ret = cifs_writepages_region(mapping, wbc, 0, LLONG_MAX, &next);
+		start = 0;
+		ret = cifs_writepages_region(mapping, wbc, &start, LLONG_MAX);
 		if (wbc->nr_to_write > 0 && ret == 0)
-			mapping->writeback_index = next / PAGE_SIZE;
+			mapping->writeback_index = start / PAGE_SIZE;
 	} else {
-		ret = cifs_writepages_region(mapping, wbc,
-					     wbc->range_start, wbc->range_end, &next);
+		start = wbc->range_start;
+		ret = cifs_writepages_region(mapping, wbc, &start, wbc->range_end);
 	}
 
+out:
 	return ret;
 }
 
@@ -3094,8 +3219,15 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
 	if (rc > 0) {
 		spin_lock(&inode->i_lock);
 		if (pos > inode->i_size) {
+			loff_t additional_blocks = (512 - 1 + copied) >> 9;
+
 			i_size_write(inode, pos);
-			inode->i_blocks = (512 - 1 + pos) >> 9;
+			/*
+			 * Estimate new allocation size based on the amount written.
+			 * This will be updated from server on close (and on queryinfo)
+			 */
+			inode->i_blocks = min_t(blkcnt_t, (512 - 1 + pos) >> 9,
+						inode->i_blocks + additional_blocks);
 		}
 		spin_unlock(&inode->i_lock);
 	}
@@ -4738,12 +4870,14 @@ static int is_inode_writable(struct cifsInodeInfo *cifs_inode)
    refreshing the inode only on increases in the file size
    but this is tricky to do without racing with writebehind
    page caching in the current Linux kernel design */
-bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file)
+bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file,
+			    bool from_readdir)
 {
 	if (!cifsInode)
 		return true;
 
-	if (is_inode_writable(cifsInode)) {
+	if (is_inode_writable(cifsInode) ||
+		((cifsInode->oplock & CIFS_CACHE_RW_FLG) != 0 && from_readdir)) {
 		/* This inode is open for write at least once */
 		struct cifs_sb_info *cifs_sb;
 
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index 4b2f5aa2ea0e..6c727d8c31e8 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -37,7 +37,7 @@
 #include "rfc1002pdu.h"
 #include "fs_context.h"
 
-static DEFINE_MUTEX(cifs_mount_mutex);
+DEFINE_MUTEX(cifs_mount_mutex);
 
 static const match_table_t cifs_smb_version_tokens = {
 	{ Smb_1, SMB1_VERSION_STRING },
@@ -162,6 +162,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
 	fsparam_string("username", Opt_user),
 	fsparam_string("pass", Opt_pass),
 	fsparam_string("password", Opt_pass),
+	fsparam_string("password2", Opt_pass2),
 	fsparam_string("ip", Opt_ip),
 	fsparam_string("addr", Opt_ip),
 	fsparam_string("domain", Opt_domain),
@@ -174,6 +175,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
 	fsparam_string("vers", Opt_vers),
 	fsparam_string("sec", Opt_sec),
 	fsparam_string("cache", Opt_cache),
+	fsparam_string("reparse", Opt_reparse),
 
 	/* Arguments that should be ignored */
 	fsparam_flag("guest", Opt_ignore),
@@ -296,6 +298,35 @@ cifs_parse_cache_flavor(struct fs_context *fc, char *value, struct smb3_fs_conte
 	return 0;
 }
 
+static const match_table_t reparse_flavor_tokens = {
+	{ Opt_reparse_default,	"default" },
+	{ Opt_reparse_nfs,	"nfs" },
+	{ Opt_reparse_wsl,	"wsl" },
+	{ Opt_reparse_err,	NULL },
+};
+
+static int parse_reparse_flavor(struct fs_context *fc, char *value,
+				struct smb3_fs_context *ctx)
+{
+	substring_t args[MAX_OPT_ARGS];
+
+	switch (match_token(value, reparse_flavor_tokens, args)) {
+	case Opt_reparse_default:
+		ctx->reparse_type = CIFS_REPARSE_TYPE_DEFAULT;
+		break;
+	case Opt_reparse_nfs:
+		ctx->reparse_type = CIFS_REPARSE_TYPE_NFS;
+		break;
+	case Opt_reparse_wsl:
+		ctx->reparse_type = CIFS_REPARSE_TYPE_WSL;
+		break;
+	default:
+		cifs_errorf(fc, "bad reparse= option: %s\n", value);
+		return 1;
+	}
+	return 0;
+}
+
 #define DUP_CTX_STR(field)						\
 do {									\
 	if (ctx->field) {						\
@@ -315,6 +346,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx
 	new_ctx->nodename = NULL;
 	new_ctx->username = NULL;
 	new_ctx->password = NULL;
+	new_ctx->password2 = NULL;
 	new_ctx->server_hostname = NULL;
 	new_ctx->domainname = NULL;
 	new_ctx->UNC = NULL;
@@ -327,6 +359,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx
 	DUP_CTX_STR(prepath);
 	DUP_CTX_STR(username);
 	DUP_CTX_STR(password);
+	DUP_CTX_STR(password2);
 	DUP_CTX_STR(server_hostname);
 	DUP_CTX_STR(UNC);
 	DUP_CTX_STR(source);
@@ -753,9 +786,9 @@ static int smb3_get_tree(struct fs_context *fc)
 
 	if (err)
 		return err;
-	mutex_lock(&cifs_mount_mutex);
+	cifs_mount_lock();
 	ret = smb3_get_tree_common(fc);
-	mutex_unlock(&cifs_mount_mutex);
+	cifs_mount_unlock();
 	return ret;
 }
 
@@ -772,7 +805,7 @@ static void smb3_fs_context_free(struct fs_context *fc)
  */
 static int smb3_verify_reconfigure_ctx(struct fs_context *fc,
 				       struct smb3_fs_context *new_ctx,
-				       struct smb3_fs_context *old_ctx)
+				       struct smb3_fs_context *old_ctx, bool need_recon)
 {
 	if (new_ctx->posix_paths != old_ctx->posix_paths) {
 		cifs_errorf(fc, "can not change posixpaths during remount\n");
@@ -798,8 +831,15 @@ static int smb3_verify_reconfigure_ctx(struct fs_context *fc,
 	}
 	if (new_ctx->password &&
 	    (!old_ctx->password || strcmp(new_ctx->password, old_ctx->password))) {
-		cifs_errorf(fc, "can not change password during remount\n");
-		return -EINVAL;
+		if (need_recon == false) {
+			cifs_errorf(fc,
+				    "can not change password of active session during remount\n");
+			return -EINVAL;
+		} else if (old_ctx->sectype == Kerberos) {
+			cifs_errorf(fc,
+				    "can not change password for Kerberos via remount\n");
+			return -EINVAL;
+		}
 	}
 	if (new_ctx->domainname &&
 	    (!old_ctx->domainname || strcmp(new_ctx->domainname, old_ctx->domainname))) {
@@ -843,9 +883,14 @@ static int smb3_reconfigure(struct fs_context *fc)
 	struct smb3_fs_context *ctx = smb3_fc2context(fc);
 	struct dentry *root = fc->root;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb);
+	struct cifs_ses *ses = cifs_sb_master_tcon(cifs_sb)->ses;
+	bool need_recon = false;
 	int rc;
 
-	rc = smb3_verify_reconfigure_ctx(fc, ctx, cifs_sb->ctx);
+	if (ses->expired_pwd)
+		need_recon = true;
+
+	rc = smb3_verify_reconfigure_ctx(fc, ctx, cifs_sb->ctx, need_recon);
 	if (rc)
 		return rc;
 
@@ -858,7 +903,14 @@ static int smb3_reconfigure(struct fs_context *fc)
 	STEAL_STRING(cifs_sb, ctx, UNC);
 	STEAL_STRING(cifs_sb, ctx, source);
 	STEAL_STRING(cifs_sb, ctx, username);
-	STEAL_STRING_SENSITIVE(cifs_sb, ctx, password);
+	if (need_recon == false)
+		STEAL_STRING_SENSITIVE(cifs_sb, ctx, password);
+	else  {
+		kfree_sensitive(ses->password);
+		ses->password = kstrdup(ctx->password, GFP_KERNEL);
+		kfree_sensitive(ses->password2);
+		ses->password2 = kstrdup(ctx->password2, GFP_KERNEL);
+	}
 	STEAL_STRING(cifs_sb, ctx, domainname);
 	STEAL_STRING(cifs_sb, ctx, nodename);
 	STEAL_STRING(cifs_sb, ctx, iocharset);
@@ -916,7 +968,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
 
 	switch (opt) {
 	case Opt_compress:
-		ctx->compression = UNKNOWN_TYPE;
+		ctx->compress = true;
 		cifs_dbg(VFS,
 			"SMB3 compression support is experimental\n");
 		break;
@@ -1258,6 +1310,18 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
 			goto cifs_parse_mount_err;
 		}
 		break;
+	case Opt_pass2:
+		kfree_sensitive(ctx->password2);
+		ctx->password2 = NULL;
+		if (strlen(param->string) == 0)
+			break;
+
+		ctx->password2 = kstrdup(param->string, GFP_KERNEL);
+		if (ctx->password2 == NULL) {
+			cifs_errorf(fc, "OOM when copying password2 string\n");
+			goto cifs_parse_mount_err;
+		}
+		break;
 	case Opt_ip:
 		if (strlen(param->string) == 0) {
 			ctx->got_ip = false;
@@ -1549,6 +1613,10 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
 	case Opt_rdma:
 		ctx->rdma = true;
 		break;
+	case Opt_reparse:
+		if (parse_reparse_flavor(fc, param->string, ctx))
+			goto cifs_parse_mount_err;
+		break;
 	}
 	/* case Opt_ignore: - is ignored as expected ... */
 
@@ -1557,6 +1625,8 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
  cifs_parse_mount_err:
 	kfree_sensitive(ctx->password);
 	ctx->password = NULL;
+	kfree_sensitive(ctx->password2);
+	ctx->password2 = NULL;
 	return -EINVAL;
 }
 
@@ -1635,6 +1705,7 @@ int smb3_init_fs_context(struct fs_context *fc)
 	ctx->backupgid_specified = false; /* no backup intent for a group */
 
 	ctx->retrans = 1;
+	ctx->reparse_type = CIFS_REPARSE_TYPE_DEFAULT;
 
 /*
  *	short int override_uid = -1;
@@ -1661,6 +1732,8 @@ smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx)
 	ctx->username = NULL;
 	kfree_sensitive(ctx->password);
 	ctx->password = NULL;
+	kfree_sensitive(ctx->password2);
+	ctx->password2 = NULL;
 	kfree(ctx->server_hostname);
 	ctx->server_hostname = NULL;
 	kfree(ctx->UNC);
diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h
index 182ce11cbe93..a947bddeba27 100644
--- a/fs/smb/client/fs_context.h
+++ b/fs/smb/client/fs_context.h
@@ -41,6 +41,13 @@ enum {
 	Opt_cache_err
 };
 
+enum cifs_reparse_parm {
+	Opt_reparse_default,
+	Opt_reparse_nfs,
+	Opt_reparse_wsl,
+	Opt_reparse_err
+};
+
 enum cifs_sec_param {
 	Opt_sec_krb5,
 	Opt_sec_krb5i,
@@ -138,6 +145,7 @@ enum cifs_param {
 	Opt_source,
 	Opt_user,
 	Opt_pass,
+	Opt_pass2,
 	Opt_ip,
 	Opt_domain,
 	Opt_srcaddr,
@@ -148,6 +156,7 @@ enum cifs_param {
 	Opt_vers,
 	Opt_sec,
 	Opt_cache,
+	Opt_reparse,
 
 	/* Mount options to be ignored */
 	Opt_ignore,
@@ -169,6 +178,7 @@ struct smb3_fs_context {
 
 	char *username;
 	char *password;
+	char *password2;
 	char *domainname;
 	char *source;
 	char *server_hostname;
@@ -265,12 +275,13 @@ struct smb3_fs_context {
 	unsigned int max_credits; /* smb3 max_credits 10 < credits < 60000 */
 	unsigned int max_channels;
 	unsigned int max_cached_dirs;
-	__u16 compression; /* compression algorithm 0xFFFF default 0=disabled */
+	bool compress; /* enable SMB2 messages (READ/WRITE) de/compression */
 	bool rootfs:1; /* if it's a SMB root file system */
 	bool witness:1; /* use witness protocol */
 	char *leaf_fullpath;
 	struct cifs_ses *dfs_root_ses;
 	bool dfs_automount:1; /* set for dfs automount only */
+	enum cifs_reparse_type reparse_type;
 };
 
 extern const struct fs_parameter_spec smb3_fs_parameters[];
@@ -295,4 +306,16 @@ extern void smb3_update_mnt_flags(struct cifs_sb_info *cifs_sb);
 #define MAX_CACHED_FIDS 16
 extern char *cifs_sanitize_prepath(char *prepath, gfp_t gfp);
 
+extern struct mutex cifs_mount_mutex;
+
+static inline void cifs_mount_lock(void)
+{
+	mutex_lock(&cifs_mount_mutex);
+}
+
+static inline void cifs_mount_unlock(void)
+{
+	mutex_unlock(&cifs_mount_mutex);
+}
+
 #endif
diff --git a/fs/smb/client/fscache.c b/fs/smb/client/fscache.c
index c4a3cb736881..340efce8f052 100644
--- a/fs/smb/client/fscache.c
+++ b/fs/smb/client/fscache.c
@@ -12,6 +12,16 @@
 #include "cifs_fs_sb.h"
 #include "cifsproto.h"
 
+/*
+ * Key for fscache inode.  [!] Contents must match comparisons in cifs_find_inode().
+ */
+struct cifs_fscache_inode_key {
+
+	__le64  uniqueid;	/* server inode number */
+	__le64  createtime;	/* creation time on server */
+	u8	type;		/* S_IFMT file type */
+} __packed;
+
 static void cifs_fscache_fill_volume_coherency(
 	struct cifs_tcon *tcon,
 	struct cifs_fscache_volume_coherency_data *cd)
@@ -97,15 +107,19 @@ void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon)
 void cifs_fscache_get_inode_cookie(struct inode *inode)
 {
 	struct cifs_fscache_inode_coherency_data cd;
+	struct cifs_fscache_inode_key key;
 	struct cifsInodeInfo *cifsi = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb);
 
+	key.uniqueid	= cpu_to_le64(cifsi->uniqueid);
+	key.createtime	= cpu_to_le64(cifsi->createtime);
+	key.type	= (inode->i_mode & S_IFMT) >> 12;
 	cifs_fscache_fill_coherency(&cifsi->netfs.inode, &cd);
 
 	cifsi->netfs.cache =
 		fscache_acquire_cookie(tcon->fscache, 0,
-				       &cifsi->uniqueid, sizeof(cifsi->uniqueid),
+				       &key, sizeof(key),
 				       &cd, sizeof(cd),
 				       i_size_read(&cifsi->netfs.inode));
 	if (cifsi->netfs.cache)
diff --git a/fs/smb/client/fscache.h b/fs/smb/client/fscache.h
index a3d73720914f..1f2ea9f5cc9a 100644
--- a/fs/smb/client/fscache.h
+++ b/fs/smb/client/fscache.h
@@ -109,6 +109,11 @@ static inline void cifs_readahead_to_fscache(struct inode *inode,
 		__cifs_readahead_to_fscache(inode, pos, len);
 }
 
+static inline bool cifs_fscache_enabled(struct inode *inode)
+{
+	return fscache_cookie_enabled(cifs_inode_cookie(inode));
+}
+
 #else /* CONFIG_CIFS_FSCACHE */
 static inline
 void cifs_fscache_fill_coherency(struct inode *inode,
@@ -124,6 +129,7 @@ static inline void cifs_fscache_release_inode_cookie(struct inode *inode) {}
 static inline void cifs_fscache_unuse_inode_cookie(struct inode *inode, bool update) {}
 static inline struct fscache_cookie *cifs_inode_cookie(struct inode *inode) { return NULL; }
 static inline void cifs_invalidate_cache(struct inode *inode, unsigned int flags) {}
+static inline bool cifs_fscache_enabled(struct inode *inode) { return false; }
 
 static inline int cifs_fscache_query_occupancy(struct inode *inode,
 					       pgoff_t first, unsigned int nr_pages,
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index d02f8ba29cb5..60afab5c83d4 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -26,6 +26,7 @@
 #include "fs_context.h"
 #include "cifs_ioctl.h"
 #include "cached_dir.h"
+#include "reparse.h"
 
 static void cifs_set_ops(struct inode *inode)
 {
@@ -147,7 +148,8 @@ cifs_nlink_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 
 /* populate an inode with info from a cifs_fattr struct */
 int
-cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
+cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr,
+		    bool from_readdir)
 {
 	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
@@ -199,7 +201,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
 	 * Can't safely change the file size here if the client is writing to
 	 * it due to potential races.
 	 */
-	if (is_size_safe_to_change(cifs_i, fattr->cf_eof)) {
+	if (is_size_safe_to_change(cifs_i, fattr->cf_eof, from_readdir)) {
 		i_size_write(inode, fattr->cf_eof);
 
 		/*
@@ -368,7 +370,7 @@ static int update_inode_info(struct super_block *sb,
 		CIFS_I(*inode)->time = 0; /* force reval */
 		return -ESTALE;
 	}
-	return cifs_fattr_to_inode(*inode, fattr);
+	return cifs_fattr_to_inode(*inode, fattr, false);
 }
 
 #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
@@ -399,11 +401,10 @@ cifs_get_file_info_unix(struct file *filp)
 		cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
 	} else if (rc == -EREMOTE) {
 		cifs_create_junction_fattr(&fattr, inode->i_sb);
-		rc = 0;
 	} else
 		goto cifs_gfiunix_out;
 
-	rc = cifs_fattr_to_inode(inode, &fattr);
+	rc = cifs_fattr_to_inode(inode, &fattr, false);
 
 cifs_gfiunix_out:
 	free_xid(xid);
@@ -727,84 +728,6 @@ out_reparse:
 		fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink);
 }
 
-static inline dev_t nfs_mkdev(struct reparse_posix_data *buf)
-{
-	u64 v = le64_to_cpu(*(__le64 *)buf->DataBuffer);
-
-	return MKDEV(v >> 32, v & 0xffffffff);
-}
-
-bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
-				 struct cifs_fattr *fattr,
-				 struct cifs_open_info_data *data)
-{
-	struct reparse_posix_data *buf = data->reparse.posix;
-	u32 tag = data->reparse.tag;
-
-	if (tag == IO_REPARSE_TAG_NFS && buf) {
-		switch (le64_to_cpu(buf->InodeType)) {
-		case NFS_SPECFILE_CHR:
-			fattr->cf_mode |= S_IFCHR;
-			fattr->cf_dtype = DT_CHR;
-			fattr->cf_rdev = nfs_mkdev(buf);
-			break;
-		case NFS_SPECFILE_BLK:
-			fattr->cf_mode |= S_IFBLK;
-			fattr->cf_dtype = DT_BLK;
-			fattr->cf_rdev = nfs_mkdev(buf);
-			break;
-		case NFS_SPECFILE_FIFO:
-			fattr->cf_mode |= S_IFIFO;
-			fattr->cf_dtype = DT_FIFO;
-			break;
-		case NFS_SPECFILE_SOCK:
-			fattr->cf_mode |= S_IFSOCK;
-			fattr->cf_dtype = DT_SOCK;
-			break;
-		case NFS_SPECFILE_LNK:
-			fattr->cf_mode |= S_IFLNK;
-			fattr->cf_dtype = DT_LNK;
-			break;
-		default:
-			WARN_ON_ONCE(1);
-			return false;
-		}
-		return true;
-	}
-
-	switch (tag) {
-	case IO_REPARSE_TAG_LX_SYMLINK:
-		fattr->cf_mode |= S_IFLNK;
-		fattr->cf_dtype = DT_LNK;
-		break;
-	case IO_REPARSE_TAG_LX_FIFO:
-		fattr->cf_mode |= S_IFIFO;
-		fattr->cf_dtype = DT_FIFO;
-		break;
-	case IO_REPARSE_TAG_AF_UNIX:
-		fattr->cf_mode |= S_IFSOCK;
-		fattr->cf_dtype = DT_SOCK;
-		break;
-	case IO_REPARSE_TAG_LX_CHR:
-		fattr->cf_mode |= S_IFCHR;
-		fattr->cf_dtype = DT_CHR;
-		break;
-	case IO_REPARSE_TAG_LX_BLK:
-		fattr->cf_mode |= S_IFBLK;
-		fattr->cf_dtype = DT_BLK;
-		break;
-	case 0: /* SMB1 symlink */
-	case IO_REPARSE_TAG_SYMLINK:
-	case IO_REPARSE_TAG_NFS:
-		fattr->cf_mode |= S_IFLNK;
-		fattr->cf_dtype = DT_LNK;
-		break;
-	default:
-		return false;
-	}
-	return true;
-}
-
 static void cifs_open_info_to_fattr(struct cifs_fattr *fattr,
 				    struct cifs_open_info_data *data,
 				    struct super_block *sb)
@@ -835,6 +758,8 @@ static void cifs_open_info_to_fattr(struct cifs_fattr *fattr,
 	fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
 	fattr->cf_createtime = le64_to_cpu(info->CreationTime);
 	fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
+	fattr->cf_uid = cifs_sb->ctx->linux_uid;
+	fattr->cf_gid = cifs_sb->ctx->linux_gid;
 
 	fattr->cf_mode = cifs_sb->ctx->file_mode;
 	if (cifs_open_data_reparse(data) &&
@@ -877,9 +802,6 @@ out_reparse:
 		fattr->cf_symlink_target = data->symlink_target;
 		data->symlink_target = NULL;
 	}
-
-	fattr->cf_uid = cifs_sb->ctx->linux_uid;
-	fattr->cf_gid = cifs_sb->ctx->linux_gid;
 }
 
 static int
@@ -893,9 +815,14 @@ cifs_get_file_info(struct file *filp)
 	struct cifsFileInfo *cfile = filp->private_data;
 	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
 	struct TCP_Server_Info *server = tcon->ses->server;
+	struct dentry *dentry = filp->f_path.dentry;
+	void *page = alloc_dentry_path();
+	const unsigned char *path;
 
-	if (!server->ops->query_file_info)
+	if (!server->ops->query_file_info) {
+		free_dentry_path(page);
 		return -ENOSYS;
+	}
 
 	xid = get_xid();
 	rc = server->ops->query_file_info(xid, tcon, cfile, &data);
@@ -907,11 +834,17 @@ cifs_get_file_info(struct file *filp)
 			data.symlink = true;
 			data.reparse.tag = IO_REPARSE_TAG_SYMLINK;
 		}
+		path = build_path_from_dentry(dentry, page);
+		if (IS_ERR(path)) {
+			rc = PTR_ERR(path);
+			goto cgfi_exit;
+		}
 		cifs_open_info_to_fattr(&fattr, &data, inode->i_sb);
+		if (fattr.cf_flags & CIFS_FATTR_DELETE_PENDING)
+			cifs_mark_open_handles_for_deleted_file(inode, path);
 		break;
 	case -EREMOTE:
 		cifs_create_junction_fattr(&fattr, inode->i_sb);
-		rc = 0;
 		break;
 	case -EOPNOTSUPP:
 	case -EINVAL:
@@ -934,9 +867,10 @@ cifs_get_file_info(struct file *filp)
 	fattr.cf_uniqueid = CIFS_I(inode)->uniqueid;
 	fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
 	/* if filetype is different, return error */
-	rc = cifs_fattr_to_inode(inode, &fattr);
+	rc = cifs_fattr_to_inode(inode, &fattr, false);
 cgfi_exit:
 	cifs_free_open_info(&data);
+	free_dentry_path(page);
 	free_xid(xid);
 	return rc;
 }
@@ -1171,6 +1105,9 @@ static int cifs_get_fattr(struct cifs_open_info_data *data,
 		} else {
 			cifs_open_info_to_fattr(fattr, data, sb);
 		}
+		if (!rc && *inode &&
+		    (fattr->cf_flags & CIFS_FATTR_DELETE_PENDING))
+			cifs_mark_open_handles_for_deleted_file(*inode, full_path);
 		break;
 	case -EREMOTE:
 		/* DFS link, no metadata available on this server */
@@ -1399,6 +1336,8 @@ int smb311_posix_get_inode_info(struct inode **inode,
 		goto out;
 
 	rc = update_inode_info(sb, &fattr, inode);
+	if (!rc && fattr.cf_flags & CIFS_FATTR_DELETE_PENDING)
+		cifs_mark_open_handles_for_deleted_file(*inode, full_path);
 out:
 	kfree(fattr.cf_symlink_target);
 	return rc;
@@ -1413,6 +1352,8 @@ cifs_find_inode(struct inode *inode, void *opaque)
 {
 	struct cifs_fattr *fattr = opaque;
 
+	/* [!] The compared values must be the same in struct cifs_fscache_inode_key. */
+
 	/* don't match inode with different uniqueid */
 	if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid)
 		return 0;
@@ -1491,7 +1432,7 @@ retry_iget5_locked:
 		}
 
 		/* can't fail - see cifs_find_inode() */
-		cifs_fattr_to_inode(inode, fattr);
+		cifs_fattr_to_inode(inode, fattr, false);
 		if (sb->s_flags & SB_NOATIME)
 			inode->i_flags |= S_NOATIME | S_NOCMTIME;
 		if (inode->i_state & I_NEW) {
@@ -1560,6 +1501,9 @@ iget_root:
 		goto out;
 	}
 
+	if (!rc && fattr.cf_flags & CIFS_FATTR_DELETE_PENDING)
+		cifs_mark_open_handles_for_deleted_file(inode, path);
+
 	if (rc && tcon->pipe) {
 		cifs_dbg(FYI, "ipc connection - fake read inode\n");
 		spin_lock(&inode->i_lock);
@@ -1846,20 +1790,24 @@ retry_std_delete:
 		goto psx_del_no_retry;
 	}
 
-	rc = server->ops->unlink(xid, tcon, full_path, cifs_sb);
+	rc = server->ops->unlink(xid, tcon, full_path, cifs_sb, dentry);
 
 psx_del_no_retry:
 	if (!rc) {
-		if (inode)
+		if (inode) {
+			cifs_mark_open_handles_for_deleted_file(inode, full_path);
 			cifs_drop_nlink(inode);
+		}
 	} else if (rc == -ENOENT) {
 		d_drop(dentry);
 	} else if (rc == -EBUSY) {
 		if (server->ops->rename_pending_delete) {
 			rc = server->ops->rename_pending_delete(full_path,
 								dentry, xid);
-			if (rc == 0)
+			if (rc == 0) {
+				cifs_mark_open_handles_for_deleted_file(inode, full_path);
 				cifs_drop_nlink(inode);
+			}
 		}
 	} else if ((rc == -EACCES) && (dosattr == 0) && inode) {
 		attrs = kzalloc(sizeof(*attrs), GFP_KERNEL);
@@ -2797,7 +2745,7 @@ void cifs_setsize(struct inode *inode, loff_t offset)
 
 static int
 cifs_set_file_size(struct inode *inode, struct iattr *attrs,
-		   unsigned int xid, const char *full_path)
+		   unsigned int xid, const char *full_path, struct dentry *dentry)
 {
 	int rc;
 	struct cifsFileInfo *open_file;
@@ -2848,7 +2796,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
 	 */
 	if (server->ops->set_path_size)
 		rc = server->ops->set_path_size(xid, tcon, full_path,
-						attrs->ia_size, cifs_sb, false);
+						attrs->ia_size, cifs_sb, false, dentry);
 	else
 		rc = -ENOSYS;
 	cifs_dbg(FYI, "SetEOF by path (setattrs) rc = %d\n", rc);
@@ -2938,7 +2886,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
 	rc = 0;
 
 	if (attrs->ia_valid & ATTR_SIZE) {
-		rc = cifs_set_file_size(inode, attrs, xid, full_path);
+		rc = cifs_set_file_size(inode, attrs, xid, full_path, direntry);
 		if (rc != 0)
 			goto out;
 	}
@@ -3105,7 +3053,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
 	}
 
 	if (attrs->ia_valid & ATTR_SIZE) {
-		rc = cifs_set_file_size(inode, attrs, xid, full_path);
+		rc = cifs_set_file_size(inode, attrs, xid, full_path, direntry);
 		if (rc != 0)
 			goto cifs_setattr_exit;
 	}
diff --git a/fs/smb/client/ioctl.c b/fs/smb/client/ioctl.c
index e2f92c21fff5..855ac5a62edf 100644
--- a/fs/smb/client/ioctl.c
+++ b/fs/smb/client/ioctl.c
@@ -247,7 +247,9 @@ static int cifs_dump_full_key(struct cifs_tcon *tcon, struct smb3_full_key_debug
 		spin_lock(&cifs_tcp_ses_lock);
 		list_for_each_entry(server_it, &cifs_tcp_ses_list, tcp_ses_list) {
 			list_for_each_entry(ses_it, &server_it->smb_ses_list, smb_ses_list) {
-				if (ses_it->Suid == out.session_id) {
+				spin_lock(&ses_it->ses_lock);
+				if (ses_it->ses_status != SES_EXITING &&
+				    ses_it->Suid == out.session_id) {
 					ses = ses_it;
 					/*
 					 * since we are using the session outside the crit
@@ -255,9 +257,11 @@ static int cifs_dump_full_key(struct cifs_tcon *tcon, struct smb3_full_key_debug
 					 * so increment its refcount
 					 */
 					cifs_smb_ses_inc_refcount(ses);
+					spin_unlock(&ses_it->ses_lock);
 					found = true;
 					goto search_end;
 				}
+				spin_unlock(&ses_it->ses_lock);
 			}
 		}
 search_end:
@@ -345,6 +349,11 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 	xid = get_xid();
 
 	cifs_dbg(FYI, "cifs ioctl 0x%x\n", command);
+	if (pSMBFile == NULL)
+		trace_smb3_ioctl(xid, 0, command);
+	else
+		trace_smb3_ioctl(xid, pSMBFile->fid.persistent_fid, command);
+
 	switch (command) {
 		case FS_IOC_GETFLAGS:
 			if (pSMBFile == NULL)
diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c
index 0748d7b757b9..7d15a1969b81 100644
--- a/fs/smb/client/misc.c
+++ b/fs/smb/client/misc.c
@@ -27,9 +27,6 @@
 #include "fs_context.h"
 #include "cached_dir.h"
 
-extern mempool_t *cifs_sm_req_poolp;
-extern mempool_t *cifs_req_poolp;
-
 /* The xid serves as a useful identifier for each incoming vfs request,
    in a similar way to the mid which is useful to track each sent smb,
    and CurrentXid can also provide a running counter (although it
@@ -101,6 +98,7 @@ sesInfoFree(struct cifs_ses *buf_to_free)
 	kfree(buf_to_free->serverDomain);
 	kfree(buf_to_free->serverNOS);
 	kfree_sensitive(buf_to_free->password);
+	kfree_sensitive(buf_to_free->password2);
 	kfree(buf_to_free->user_name);
 	kfree(buf_to_free->domainName);
 	kfree_sensitive(buf_to_free->auth_key.response);
@@ -141,9 +139,6 @@ tcon_info_alloc(bool dir_leases_enabled)
 	atomic_set(&ret_buf->num_local_opens, 0);
 	atomic_set(&ret_buf->num_remote_opens, 0);
 	ret_buf->stats_from_time = ktime_get_real_seconds();
-#ifdef CONFIG_CIFS_DFS_UPCALL
-	INIT_LIST_HEAD(&ret_buf->dfs_ses_list);
-#endif
 
 	return ret_buf;
 }
@@ -159,9 +154,6 @@ tconInfoFree(struct cifs_tcon *tcon)
 	atomic_dec(&tconInfoAllocCount);
 	kfree(tcon->nativeFileSystem);
 	kfree_sensitive(tcon->password);
-#ifdef CONFIG_CIFS_DFS_UPCALL
-	dfs_put_root_smb_sessions(&tcon->dfs_ses_list);
-#endif
 	kfree(tcon->origin_fullpath);
 	kfree(tcon);
 }
@@ -490,6 +482,8 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
 	/* look up tcon based on tid & uid */
 	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+		if (cifs_ses_exiting(ses))
+			continue;
 		list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
 			if (tcon->tid != buf->Tid)
 				continue;
@@ -853,6 +847,40 @@ cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path)
 	free_dentry_path(page);
 }
 
+/*
+ * If a dentry has been deleted, all corresponding open handles should know that
+ * so that we do not defer close them.
+ */
+void cifs_mark_open_handles_for_deleted_file(struct inode *inode,
+					     const char *path)
+{
+	struct cifsFileInfo *cfile;
+	void *page;
+	const char *full_path;
+	struct cifsInodeInfo *cinode = CIFS_I(inode);
+
+	page = alloc_dentry_path();
+	spin_lock(&cinode->open_file_lock);
+
+	/*
+	 * note: we need to construct path from dentry and compare only if the
+	 * inode has any hardlinks. When number of hardlinks is 1, we can just
+	 * mark all open handles since they are going to be from the same file.
+	 */
+	if (inode->i_nlink > 1) {
+		list_for_each_entry(cfile, &cinode->openFileList, flist) {
+			full_path = build_path_from_dentry(cfile->dentry, page);
+			if (!IS_ERR(full_path) && strcmp(full_path, path) == 0)
+				cfile->status_file_deleted = true;
+		}
+	} else {
+		list_for_each_entry(cfile, &cinode->openFileList, flist)
+			cfile->status_file_deleted = true;
+	}
+	spin_unlock(&cinode->open_file_lock);
+	free_dentry_path(page);
+}
+
 /* parses DFS referral V3 structure
  * caller is responsible for freeing target_nodes
  * returns:
diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c
index b520eea7bfce..ebe1cb30e18e 100644
--- a/fs/smb/client/readdir.c
+++ b/fs/smb/client/readdir.c
@@ -22,6 +22,7 @@
 #include "smb2proto.h"
 #include "fs_context.h"
 #include "cached_dir.h"
+#include "reparse.h"
 
 /*
  * To be safe - for UCS to UTF-8 with strings loaded with the rare long
@@ -56,23 +57,6 @@ static inline void dump_cifs_file_struct(struct file *file, char *label)
 #endif /* DEBUG2 */
 
 /*
- * Match a reparse point inode if reparse tag and ctime haven't changed.
- *
- * Windows Server updates ctime of reparse points when their data have changed.
- * The server doesn't allow changing reparse tags from existing reparse points,
- * though it's worth checking.
- */
-static inline bool reparse_inode_match(struct inode *inode,
-				       struct cifs_fattr *fattr)
-{
-	struct timespec64 ctime = inode_get_ctime(inode);
-
-	return (CIFS_I(inode)->cifsAttrs & ATTR_REPARSE) &&
-		CIFS_I(inode)->reparse_tag == fattr->cf_cifstag &&
-		timespec64_equal(&ctime, &fattr->cf_ctime);
-}
-
-/*
  * Attempt to preload the dcache with the results from the FIND_FIRST/NEXT
  *
  * Find the dentry that matches "name". If there isn't one, create one. If it's
@@ -141,6 +125,8 @@ retry:
 					if (likely(reparse_inode_match(inode, fattr))) {
 						fattr->cf_mode = inode->i_mode;
 						fattr->cf_rdev = inode->i_rdev;
+						fattr->cf_uid = inode->i_uid;
+						fattr->cf_gid = inode->i_gid;
 						fattr->cf_eof = CIFS_I(inode)->netfs.remote_i_size;
 						fattr->cf_symlink_target = NULL;
 					} else {
@@ -148,7 +134,7 @@ retry:
 						rc = -ESTALE;
 					}
 				}
-				if (!rc && !cifs_fattr_to_inode(inode, fattr)) {
+				if (!rc && !cifs_fattr_to_inode(inode, fattr, true)) {
 					dput(dentry);
 					return;
 				}
diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c
new file mode 100644
index 000000000000..a0ffbda90733
--- /dev/null
+++ b/fs/smb/client/reparse.c
@@ -0,0 +1,532 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024 Paulo Alcantara <pc@manguebit.com>
+ */
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <linux/slab.h>
+#include "cifsglob.h"
+#include "smb2proto.h"
+#include "cifsproto.h"
+#include "cifs_unicode.h"
+#include "cifs_debug.h"
+#include "fs_context.h"
+#include "reparse.h"
+
+int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode,
+				struct dentry *dentry, struct cifs_tcon *tcon,
+				const char *full_path, const char *symname)
+{
+	struct reparse_symlink_data_buffer *buf = NULL;
+	struct cifs_open_info_data data;
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct inode *new;
+	struct kvec iov;
+	__le16 *path;
+	char *sym, sep = CIFS_DIR_SEP(cifs_sb);
+	u16 len, plen;
+	int rc = 0;
+
+	sym = kstrdup(symname, GFP_KERNEL);
+	if (!sym)
+		return -ENOMEM;
+
+	data = (struct cifs_open_info_data) {
+		.reparse_point = true,
+		.reparse = { .tag = IO_REPARSE_TAG_SYMLINK, },
+		.symlink_target = sym,
+	};
+
+	convert_delimiter(sym, sep);
+	path = cifs_convert_path_to_utf16(sym, cifs_sb);
+	if (!path) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	plen = 2 * UniStrnlen((wchar_t *)path, PATH_MAX);
+	len = sizeof(*buf) + plen * 2;
+	buf = kzalloc(len, GFP_KERNEL);
+	if (!buf) {
+		rc = -ENOMEM;
+		goto out;
+	}
+
+	buf->ReparseTag = cpu_to_le32(IO_REPARSE_TAG_SYMLINK);
+	buf->ReparseDataLength = cpu_to_le16(len - sizeof(struct reparse_data_buffer));
+	buf->SubstituteNameOffset = cpu_to_le16(plen);
+	buf->SubstituteNameLength = cpu_to_le16(plen);
+	memcpy(&buf->PathBuffer[plen], path, plen);
+	buf->PrintNameOffset = 0;
+	buf->PrintNameLength = cpu_to_le16(plen);
+	memcpy(buf->PathBuffer, path, plen);
+	buf->Flags = cpu_to_le32(*symname != '/' ? SYMLINK_FLAG_RELATIVE : 0);
+	if (*sym != sep)
+		buf->Flags = cpu_to_le32(SYMLINK_FLAG_RELATIVE);
+
+	convert_delimiter(sym, '/');
+	iov.iov_base = buf;
+	iov.iov_len = len;
+	new = smb2_get_reparse_inode(&data, inode->i_sb, xid,
+				     tcon, full_path, &iov, NULL);
+	if (!IS_ERR(new))
+		d_instantiate(dentry, new);
+	else
+		rc = PTR_ERR(new);
+out:
+	kfree(path);
+	cifs_free_open_info(&data);
+	kfree(buf);
+	return rc;
+}
+
+static int nfs_set_reparse_buf(struct reparse_posix_data *buf,
+			       mode_t mode, dev_t dev,
+			       struct kvec *iov)
+{
+	u64 type;
+	u16 len, dlen;
+
+	len = sizeof(*buf);
+
+	switch ((type = reparse_mode_nfs_type(mode))) {
+	case NFS_SPECFILE_BLK:
+	case NFS_SPECFILE_CHR:
+		dlen = sizeof(__le64);
+		break;
+	case NFS_SPECFILE_FIFO:
+	case NFS_SPECFILE_SOCK:
+		dlen = 0;
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	buf->ReparseTag = cpu_to_le32(IO_REPARSE_TAG_NFS);
+	buf->Reserved = 0;
+	buf->InodeType = cpu_to_le64(type);
+	buf->ReparseDataLength = cpu_to_le16(len + dlen -
+					     sizeof(struct reparse_data_buffer));
+	*(__le64 *)buf->DataBuffer = cpu_to_le64(((u64)MAJOR(dev) << 32) |
+						 MINOR(dev));
+	iov->iov_base = buf;
+	iov->iov_len = len + dlen;
+	return 0;
+}
+
+static int mknod_nfs(unsigned int xid, struct inode *inode,
+		     struct dentry *dentry, struct cifs_tcon *tcon,
+		     const char *full_path, umode_t mode, dev_t dev)
+{
+	struct cifs_open_info_data data;
+	struct reparse_posix_data *p;
+	struct inode *new;
+	struct kvec iov;
+	__u8 buf[sizeof(*p) + sizeof(__le64)];
+	int rc;
+
+	p = (struct reparse_posix_data *)buf;
+	rc = nfs_set_reparse_buf(p, mode, dev, &iov);
+	if (rc)
+		return rc;
+
+	data = (struct cifs_open_info_data) {
+		.reparse_point = true,
+		.reparse = { .tag = IO_REPARSE_TAG_NFS, .posix = p, },
+	};
+
+	new = smb2_get_reparse_inode(&data, inode->i_sb, xid,
+				     tcon, full_path, &iov, NULL);
+	if (!IS_ERR(new))
+		d_instantiate(dentry, new);
+	else
+		rc = PTR_ERR(new);
+	cifs_free_open_info(&data);
+	return rc;
+}
+
+static int wsl_set_reparse_buf(struct reparse_data_buffer *buf,
+			       mode_t mode, struct kvec *iov)
+{
+	u32 tag;
+
+	switch ((tag = reparse_mode_wsl_tag(mode))) {
+	case IO_REPARSE_TAG_LX_BLK:
+	case IO_REPARSE_TAG_LX_CHR:
+	case IO_REPARSE_TAG_LX_FIFO:
+	case IO_REPARSE_TAG_AF_UNIX:
+		break;
+	default:
+		return -EOPNOTSUPP;
+	}
+
+	buf->ReparseTag = cpu_to_le32(tag);
+	buf->Reserved = 0;
+	buf->ReparseDataLength = 0;
+	iov->iov_base = buf;
+	iov->iov_len = sizeof(*buf);
+	return 0;
+}
+
+static struct smb2_create_ea_ctx *ea_create_context(u32 dlen, size_t *cc_len)
+{
+	struct smb2_create_ea_ctx *cc;
+
+	*cc_len = round_up(sizeof(*cc) + dlen, 8);
+	cc = kzalloc(*cc_len, GFP_KERNEL);
+	if (!cc)
+		return ERR_PTR(-ENOMEM);
+
+	cc->ctx.NameOffset = cpu_to_le16(offsetof(struct smb2_create_ea_ctx,
+						  name));
+	cc->ctx.NameLength = cpu_to_le16(4);
+	memcpy(cc->name, SMB2_CREATE_EA_BUFFER, strlen(SMB2_CREATE_EA_BUFFER));
+	cc->ctx.DataOffset = cpu_to_le16(offsetof(struct smb2_create_ea_ctx, ea));
+	cc->ctx.DataLength = cpu_to_le32(dlen);
+	return cc;
+}
+
+struct wsl_xattr {
+	const char	*name;
+	__le64		value;
+	u16		size;
+	u32		next;
+};
+
+static int wsl_set_xattrs(struct inode *inode, umode_t _mode,
+			  dev_t _dev, struct kvec *iov)
+{
+	struct smb2_file_full_ea_info *ea;
+	struct smb2_create_ea_ctx *cc;
+	struct smb3_fs_context *ctx = CIFS_SB(inode->i_sb)->ctx;
+	__le64 uid = cpu_to_le64(from_kuid(current_user_ns(), ctx->linux_uid));
+	__le64 gid = cpu_to_le64(from_kgid(current_user_ns(), ctx->linux_gid));
+	__le64 dev = cpu_to_le64(((u64)MINOR(_dev) << 32) | MAJOR(_dev));
+	__le64 mode = cpu_to_le64(_mode);
+	struct wsl_xattr xattrs[] = {
+		{ .name = SMB2_WSL_XATTR_UID,  .value = uid,  .size = SMB2_WSL_XATTR_UID_SIZE, },
+		{ .name = SMB2_WSL_XATTR_GID,  .value = gid,  .size = SMB2_WSL_XATTR_GID_SIZE, },
+		{ .name = SMB2_WSL_XATTR_MODE, .value = mode, .size = SMB2_WSL_XATTR_MODE_SIZE, },
+		{ .name = SMB2_WSL_XATTR_DEV,  .value = dev, .size = SMB2_WSL_XATTR_DEV_SIZE, },
+	};
+	size_t cc_len;
+	u32 dlen = 0, next = 0;
+	int i, num_xattrs;
+	u8 name_size = SMB2_WSL_XATTR_NAME_LEN + 1;
+
+	memset(iov, 0, sizeof(*iov));
+
+	/* Exclude $LXDEV xattr for sockets and fifos */
+	if (S_ISSOCK(_mode) || S_ISFIFO(_mode))
+		num_xattrs = ARRAY_SIZE(xattrs) - 1;
+	else
+		num_xattrs = ARRAY_SIZE(xattrs);
+
+	for (i = 0; i < num_xattrs; i++) {
+		xattrs[i].next = ALIGN(sizeof(*ea) + name_size +
+				       xattrs[i].size, 4);
+		dlen += xattrs[i].next;
+	}
+
+	cc = ea_create_context(dlen, &cc_len);
+	if (IS_ERR(cc))
+		return PTR_ERR(cc);
+
+	ea = &cc->ea;
+	for (i = 0; i < num_xattrs; i++) {
+		ea = (void *)((u8 *)ea + next);
+		next = xattrs[i].next;
+		ea->next_entry_offset = cpu_to_le32(next);
+
+		ea->ea_name_length = name_size - 1;
+		ea->ea_value_length = cpu_to_le16(xattrs[i].size);
+		memcpy(ea->ea_data, xattrs[i].name, name_size);
+		memcpy(&ea->ea_data[name_size],
+		       &xattrs[i].value, xattrs[i].size);
+	}
+	ea->next_entry_offset = 0;
+
+	iov->iov_base = cc;
+	iov->iov_len = cc_len;
+	return 0;
+}
+
+static int mknod_wsl(unsigned int xid, struct inode *inode,
+		     struct dentry *dentry, struct cifs_tcon *tcon,
+		     const char *full_path, umode_t mode, dev_t dev)
+{
+	struct cifs_open_info_data data;
+	struct reparse_data_buffer buf;
+	struct smb2_create_ea_ctx *cc;
+	struct inode *new;
+	unsigned int len;
+	struct kvec reparse_iov, xattr_iov;
+	int rc;
+
+	rc = wsl_set_reparse_buf(&buf, mode, &reparse_iov);
+	if (rc)
+		return rc;
+
+	rc = wsl_set_xattrs(inode, mode, dev, &xattr_iov);
+	if (rc)
+		return rc;
+
+	data = (struct cifs_open_info_data) {
+		.reparse_point = true,
+		.reparse = { .tag = le32_to_cpu(buf.ReparseTag), .buf = &buf, },
+	};
+
+	cc = xattr_iov.iov_base;
+	len = le32_to_cpu(cc->ctx.DataLength);
+	memcpy(data.wsl.eas, &cc->ea, len);
+	data.wsl.eas_len = len;
+
+	new = smb2_get_reparse_inode(&data, inode->i_sb,
+				     xid, tcon, full_path,
+				     &reparse_iov, &xattr_iov);
+	if (!IS_ERR(new))
+		d_instantiate(dentry, new);
+	else
+		rc = PTR_ERR(new);
+	cifs_free_open_info(&data);
+	kfree(xattr_iov.iov_base);
+	return rc;
+}
+
+int smb2_mknod_reparse(unsigned int xid, struct inode *inode,
+		       struct dentry *dentry, struct cifs_tcon *tcon,
+		       const char *full_path, umode_t mode, dev_t dev)
+{
+	struct smb3_fs_context *ctx = CIFS_SB(inode->i_sb)->ctx;
+	int rc = -EOPNOTSUPP;
+
+	switch (ctx->reparse_type) {
+	case CIFS_REPARSE_TYPE_NFS:
+		rc = mknod_nfs(xid, inode, dentry, tcon, full_path, mode, dev);
+		break;
+	case CIFS_REPARSE_TYPE_WSL:
+		rc = mknod_wsl(xid, inode, dentry, tcon, full_path, mode, dev);
+		break;
+	}
+	return rc;
+}
+
+/* See MS-FSCC 2.1.2.6 for the 'NFS' style reparse tags */
+static int parse_reparse_posix(struct reparse_posix_data *buf,
+			       struct cifs_sb_info *cifs_sb,
+			       struct cifs_open_info_data *data)
+{
+	unsigned int len;
+	u64 type;
+
+	switch ((type = le64_to_cpu(buf->InodeType))) {
+	case NFS_SPECFILE_LNK:
+		len = le16_to_cpu(buf->ReparseDataLength);
+		data->symlink_target = cifs_strndup_from_utf16(buf->DataBuffer,
+							       len, true,
+							       cifs_sb->local_nls);
+		if (!data->symlink_target)
+			return -ENOMEM;
+		convert_delimiter(data->symlink_target, '/');
+		cifs_dbg(FYI, "%s: target path: %s\n",
+			 __func__, data->symlink_target);
+		break;
+	case NFS_SPECFILE_CHR:
+	case NFS_SPECFILE_BLK:
+	case NFS_SPECFILE_FIFO:
+	case NFS_SPECFILE_SOCK:
+		break;
+	default:
+		cifs_dbg(VFS, "%s: unhandled inode type: 0x%llx\n",
+			 __func__, type);
+		return -EOPNOTSUPP;
+	}
+	return 0;
+}
+
+static int parse_reparse_symlink(struct reparse_symlink_data_buffer *sym,
+				 u32 plen, bool unicode,
+				 struct cifs_sb_info *cifs_sb,
+				 struct cifs_open_info_data *data)
+{
+	unsigned int len;
+	unsigned int offs;
+
+	/* We handle Symbolic Link reparse tag here. See: MS-FSCC 2.1.2.4 */
+
+	offs = le16_to_cpu(sym->SubstituteNameOffset);
+	len = le16_to_cpu(sym->SubstituteNameLength);
+	if (offs + 20 > plen || offs + len + 20 > plen) {
+		cifs_dbg(VFS, "srv returned malformed symlink buffer\n");
+		return -EIO;
+	}
+
+	data->symlink_target = cifs_strndup_from_utf16(sym->PathBuffer + offs,
+						       len, unicode,
+						       cifs_sb->local_nls);
+	if (!data->symlink_target)
+		return -ENOMEM;
+
+	convert_delimiter(data->symlink_target, '/');
+	cifs_dbg(FYI, "%s: target path: %s\n", __func__, data->symlink_target);
+
+	return 0;
+}
+
+int parse_reparse_point(struct reparse_data_buffer *buf,
+			u32 plen, struct cifs_sb_info *cifs_sb,
+			bool unicode, struct cifs_open_info_data *data)
+{
+	data->reparse.buf = buf;
+
+	/* See MS-FSCC 2.1.2 */
+	switch (le32_to_cpu(buf->ReparseTag)) {
+	case IO_REPARSE_TAG_NFS:
+		return parse_reparse_posix((struct reparse_posix_data *)buf,
+					   cifs_sb, data);
+	case IO_REPARSE_TAG_SYMLINK:
+		return parse_reparse_symlink(
+			(struct reparse_symlink_data_buffer *)buf,
+			plen, unicode, cifs_sb, data);
+	case IO_REPARSE_TAG_LX_SYMLINK:
+	case IO_REPARSE_TAG_AF_UNIX:
+	case IO_REPARSE_TAG_LX_FIFO:
+	case IO_REPARSE_TAG_LX_CHR:
+	case IO_REPARSE_TAG_LX_BLK:
+		return 0;
+	default:
+		cifs_dbg(VFS, "%s: unhandled reparse tag: 0x%08x\n",
+			 __func__, le32_to_cpu(buf->ReparseTag));
+		return -EOPNOTSUPP;
+	}
+}
+
+int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb,
+			     struct kvec *rsp_iov,
+			     struct cifs_open_info_data *data)
+{
+	struct reparse_data_buffer *buf;
+	struct smb2_ioctl_rsp *io = rsp_iov->iov_base;
+	u32 plen = le32_to_cpu(io->OutputCount);
+
+	buf = (struct reparse_data_buffer *)((u8 *)io +
+					     le32_to_cpu(io->OutputOffset));
+	return parse_reparse_point(buf, plen, cifs_sb, true, data);
+}
+
+static void wsl_to_fattr(struct cifs_open_info_data *data,
+			 struct cifs_sb_info *cifs_sb,
+			 u32 tag, struct cifs_fattr *fattr)
+{
+	struct smb2_file_full_ea_info *ea;
+	u32 next = 0;
+
+	switch (tag) {
+	case IO_REPARSE_TAG_LX_SYMLINK:
+		fattr->cf_mode |= S_IFLNK;
+		break;
+	case IO_REPARSE_TAG_LX_FIFO:
+		fattr->cf_mode |= S_IFIFO;
+		break;
+	case IO_REPARSE_TAG_AF_UNIX:
+		fattr->cf_mode |= S_IFSOCK;
+		break;
+	case IO_REPARSE_TAG_LX_CHR:
+		fattr->cf_mode |= S_IFCHR;
+		break;
+	case IO_REPARSE_TAG_LX_BLK:
+		fattr->cf_mode |= S_IFBLK;
+		break;
+	}
+
+	if (!data->wsl.eas_len)
+		goto out;
+
+	ea = (struct smb2_file_full_ea_info *)data->wsl.eas;
+	do {
+		const char *name;
+		void *v;
+		u8 nlen;
+
+		ea = (void *)((u8 *)ea + next);
+		next = le32_to_cpu(ea->next_entry_offset);
+		if (!le16_to_cpu(ea->ea_value_length))
+			continue;
+
+		name = ea->ea_data;
+		nlen = ea->ea_name_length;
+		v = (void *)((u8 *)ea->ea_data + ea->ea_name_length + 1);
+
+		if (!strncmp(name, SMB2_WSL_XATTR_UID, nlen))
+			fattr->cf_uid = wsl_make_kuid(cifs_sb, v);
+		else if (!strncmp(name, SMB2_WSL_XATTR_GID, nlen))
+			fattr->cf_gid = wsl_make_kgid(cifs_sb, v);
+		else if (!strncmp(name, SMB2_WSL_XATTR_MODE, nlen))
+			fattr->cf_mode = (umode_t)le32_to_cpu(*(__le32 *)v);
+		else if (!strncmp(name, SMB2_WSL_XATTR_DEV, nlen))
+			fattr->cf_rdev = wsl_mkdev(v);
+	} while (next);
+out:
+	fattr->cf_dtype = S_DT(fattr->cf_mode);
+}
+
+bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
+				 struct cifs_fattr *fattr,
+				 struct cifs_open_info_data *data)
+{
+	struct reparse_posix_data *buf = data->reparse.posix;
+	u32 tag = data->reparse.tag;
+
+	if (tag == IO_REPARSE_TAG_NFS && buf) {
+		switch (le64_to_cpu(buf->InodeType)) {
+		case NFS_SPECFILE_CHR:
+			fattr->cf_mode |= S_IFCHR;
+			fattr->cf_rdev = reparse_nfs_mkdev(buf);
+			break;
+		case NFS_SPECFILE_BLK:
+			fattr->cf_mode |= S_IFBLK;
+			fattr->cf_rdev = reparse_nfs_mkdev(buf);
+			break;
+		case NFS_SPECFILE_FIFO:
+			fattr->cf_mode |= S_IFIFO;
+			break;
+		case NFS_SPECFILE_SOCK:
+			fattr->cf_mode |= S_IFSOCK;
+			break;
+		case NFS_SPECFILE_LNK:
+			fattr->cf_mode |= S_IFLNK;
+			break;
+		default:
+			WARN_ON_ONCE(1);
+			return false;
+		}
+		goto out;
+	}
+
+	switch (tag) {
+	case IO_REPARSE_TAG_DFS:
+	case IO_REPARSE_TAG_DFSR:
+	case IO_REPARSE_TAG_MOUNT_POINT:
+		/* See cifs_create_junction_fattr() */
+		fattr->cf_mode = S_IFDIR | 0711;
+		break;
+	case IO_REPARSE_TAG_LX_SYMLINK:
+	case IO_REPARSE_TAG_LX_FIFO:
+	case IO_REPARSE_TAG_AF_UNIX:
+	case IO_REPARSE_TAG_LX_CHR:
+	case IO_REPARSE_TAG_LX_BLK:
+		wsl_to_fattr(data, cifs_sb, tag, fattr);
+		break;
+	case 0: /* SMB1 symlink */
+	case IO_REPARSE_TAG_SYMLINK:
+	case IO_REPARSE_TAG_NFS:
+		fattr->cf_mode |= S_IFLNK;
+		break;
+	default:
+		return false;
+	}
+out:
+	fattr->cf_dtype = S_DT(fattr->cf_mode);
+	return true;
+}
diff --git a/fs/smb/client/reparse.h b/fs/smb/client/reparse.h
new file mode 100644
index 000000000000..6b55d1df9e2f
--- /dev/null
+++ b/fs/smb/client/reparse.h
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024 Paulo Alcantara <pc@manguebit.com>
+ */
+
+#ifndef _CIFS_REPARSE_H
+#define _CIFS_REPARSE_H
+
+#include <linux/fs.h>
+#include <linux/stat.h>
+#include <linux/uidgid.h>
+#include "fs_context.h"
+#include "cifsglob.h"
+
+static inline dev_t reparse_nfs_mkdev(struct reparse_posix_data *buf)
+{
+	u64 v = le64_to_cpu(*(__le64 *)buf->DataBuffer);
+
+	return MKDEV(v >> 32, v & 0xffffffff);
+}
+
+static inline dev_t wsl_mkdev(void *ptr)
+{
+	u64 v = le64_to_cpu(*(__le64 *)ptr);
+
+	return MKDEV(v & 0xffffffff, v >> 32);
+}
+
+static inline kuid_t wsl_make_kuid(struct cifs_sb_info *cifs_sb,
+				   void *ptr)
+{
+	u32 uid = le32_to_cpu(*(__le32 *)ptr);
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
+		return cifs_sb->ctx->linux_uid;
+	return make_kuid(current_user_ns(), uid);
+}
+
+static inline kgid_t wsl_make_kgid(struct cifs_sb_info *cifs_sb,
+				   void *ptr)
+{
+	u32 gid = le32_to_cpu(*(__le32 *)ptr);
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
+		return cifs_sb->ctx->linux_gid;
+	return make_kgid(current_user_ns(), gid);
+}
+
+static inline u64 reparse_mode_nfs_type(mode_t mode)
+{
+	switch (mode & S_IFMT) {
+	case S_IFBLK: return NFS_SPECFILE_BLK;
+	case S_IFCHR: return NFS_SPECFILE_CHR;
+	case S_IFIFO: return NFS_SPECFILE_FIFO;
+	case S_IFSOCK: return NFS_SPECFILE_SOCK;
+	}
+	return 0;
+}
+
+static inline u32 reparse_mode_wsl_tag(mode_t mode)
+{
+	switch (mode & S_IFMT) {
+	case S_IFBLK: return IO_REPARSE_TAG_LX_BLK;
+	case S_IFCHR: return IO_REPARSE_TAG_LX_CHR;
+	case S_IFIFO: return IO_REPARSE_TAG_LX_FIFO;
+	case S_IFSOCK: return IO_REPARSE_TAG_AF_UNIX;
+	}
+	return 0;
+}
+
+/*
+ * Match a reparse point inode if reparse tag and ctime haven't changed.
+ *
+ * Windows Server updates ctime of reparse points when their data have changed.
+ * The server doesn't allow changing reparse tags from existing reparse points,
+ * though it's worth checking.
+ */
+static inline bool reparse_inode_match(struct inode *inode,
+				       struct cifs_fattr *fattr)
+{
+	struct timespec64 ctime = inode_get_ctime(inode);
+
+	return (CIFS_I(inode)->cifsAttrs & ATTR_REPARSE) &&
+		CIFS_I(inode)->reparse_tag == fattr->cf_cifstag &&
+		timespec64_equal(&ctime, &fattr->cf_ctime);
+}
+
+static inline bool cifs_open_data_reparse(struct cifs_open_info_data *data)
+{
+	struct smb2_file_all_info *fi = &data->fi;
+	u32 attrs = le32_to_cpu(fi->Attributes);
+	bool ret;
+
+	ret = data->reparse_point || (attrs & ATTR_REPARSE);
+	if (ret)
+		attrs |= ATTR_REPARSE;
+	fi->Attributes = cpu_to_le32(attrs);
+	return ret;
+}
+
+bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
+				 struct cifs_fattr *fattr,
+				 struct cifs_open_info_data *data);
+int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode,
+				struct dentry *dentry, struct cifs_tcon *tcon,
+				const char *full_path, const char *symname);
+int smb2_mknod_reparse(unsigned int xid, struct inode *inode,
+		       struct dentry *dentry, struct cifs_tcon *tcon,
+		       const char *full_path, umode_t mode, dev_t dev);
+int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, struct kvec *rsp_iov,
+			     struct cifs_open_info_data *data);
+
+#endif /* _CIFS_REPARSE_H */
diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c
index 8f37373fd333..3216f786908f 100644
--- a/fs/smb/client/sess.c
+++ b/fs/smb/client/sess.c
@@ -230,7 +230,7 @@ int cifs_try_adding_channels(struct cifs_ses *ses)
 		spin_lock(&ses->iface_lock);
 		if (!ses->iface_count) {
 			spin_unlock(&ses->iface_lock);
-			cifs_dbg(VFS, "server %s does not advertise interfaces\n",
+			cifs_dbg(ONCE, "server %s does not advertise interfaces\n",
 				      ses->server->hostname);
 			break;
 		}
@@ -396,7 +396,7 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
 	spin_lock(&ses->iface_lock);
 	if (!ses->iface_count) {
 		spin_unlock(&ses->iface_lock);
-		cifs_dbg(VFS, "server %s does not advertise interfaces\n", ses->server->hostname);
+		cifs_dbg(ONCE, "server %s does not advertise interfaces\n", ses->server->hostname);
 		return;
 	}
 
diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c
index a9eaba8083b0..212ec6f66ec6 100644
--- a/fs/smb/client/smb1ops.c
+++ b/fs/smb/client/smb1ops.c
@@ -753,11 +753,11 @@ cifs_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock)
 	cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode);
 }
 
-static void
+static int
 cifs_close_file(const unsigned int xid, struct cifs_tcon *tcon,
 		struct cifs_fid *fid)
 {
-	CIFSSMBClose(xid, tcon, fid->netfid);
+	return CIFSSMBClose(xid, tcon, fid->netfid);
 }
 
 static int
diff --git a/fs/smb/client/smb2file.c b/fs/smb/client/smb2file.c
index e0ee96d69d49..c23478ab1cf8 100644
--- a/fs/smb/client/smb2file.c
+++ b/fs/smb/client/smb2file.c
@@ -228,7 +228,7 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
 			 * flock and OFD lock are associated with an open
 			 * file description, not the process.
 			 */
-			if (!(flock->fl_flags & (FL_FLOCK | FL_OFDLCK)))
+			if (!(flock->c.flc_flags & (FL_FLOCK | FL_OFDLCK)))
 				continue;
 		if (cinode->can_cache_brlcks) {
 			/*
diff --git a/fs/smb/client/smb2glob.h b/fs/smb/client/smb2glob.h
index a0c156996fc5..2466e6155136 100644
--- a/fs/smb/client/smb2glob.h
+++ b/fs/smb/client/smb2glob.h
@@ -36,7 +36,8 @@ enum smb2_compound_ops {
 	SMB2_OP_RMDIR,
 	SMB2_OP_POSIX_QUERY_INFO,
 	SMB2_OP_SET_REPARSE,
-	SMB2_OP_GET_REPARSE
+	SMB2_OP_GET_REPARSE,
+	SMB2_OP_QUERY_WSL_EA,
 };
 
 /* Used when constructing chained read requests. */
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index 05818cd6d932..5c02a12251c8 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -85,6 +85,82 @@ static int parse_posix_sids(struct cifs_open_info_data *data,
 	return 0;
 }
 
+struct wsl_query_ea {
+	__le32	next;
+	__u8	name_len;
+	__u8	name[SMB2_WSL_XATTR_NAME_LEN + 1];
+} __packed;
+
+#define NEXT_OFF cpu_to_le32(sizeof(struct wsl_query_ea))
+
+static const struct wsl_query_ea wsl_query_eas[] = {
+	{ .next = NEXT_OFF, .name_len = SMB2_WSL_XATTR_NAME_LEN, .name = SMB2_WSL_XATTR_UID, },
+	{ .next = NEXT_OFF, .name_len = SMB2_WSL_XATTR_NAME_LEN, .name = SMB2_WSL_XATTR_GID, },
+	{ .next = NEXT_OFF, .name_len = SMB2_WSL_XATTR_NAME_LEN, .name = SMB2_WSL_XATTR_MODE, },
+	{ .next = 0,        .name_len = SMB2_WSL_XATTR_NAME_LEN, .name = SMB2_WSL_XATTR_DEV, },
+};
+
+static int check_wsl_eas(struct kvec *rsp_iov)
+{
+	struct smb2_file_full_ea_info *ea;
+	struct smb2_query_info_rsp *rsp = rsp_iov->iov_base;
+	unsigned long addr;
+	u32 outlen, next;
+	u16 vlen;
+	u8 nlen;
+	u8 *end;
+
+	outlen = le32_to_cpu(rsp->OutputBufferLength);
+	if (outlen < SMB2_WSL_MIN_QUERY_EA_RESP_SIZE ||
+	    outlen > SMB2_WSL_MAX_QUERY_EA_RESP_SIZE)
+		return -EINVAL;
+
+	ea = (void *)((u8 *)rsp_iov->iov_base +
+		      le16_to_cpu(rsp->OutputBufferOffset));
+	end = (u8 *)rsp_iov->iov_base + rsp_iov->iov_len;
+	for (;;) {
+		if ((u8 *)ea > end - sizeof(*ea))
+			return -EINVAL;
+
+		nlen = ea->ea_name_length;
+		vlen = le16_to_cpu(ea->ea_value_length);
+		if (nlen != SMB2_WSL_XATTR_NAME_LEN ||
+		    (u8 *)ea + nlen + 1 + vlen > end)
+			return -EINVAL;
+
+		switch (vlen) {
+		case 4:
+			if (strncmp(ea->ea_data, SMB2_WSL_XATTR_UID, nlen) &&
+			    strncmp(ea->ea_data, SMB2_WSL_XATTR_GID, nlen) &&
+			    strncmp(ea->ea_data, SMB2_WSL_XATTR_MODE, nlen))
+				return -EINVAL;
+			break;
+		case 8:
+			if (strncmp(ea->ea_data, SMB2_WSL_XATTR_DEV, nlen))
+				return -EINVAL;
+			break;
+		case 0:
+			if (!strncmp(ea->ea_data, SMB2_WSL_XATTR_UID, nlen) ||
+			    !strncmp(ea->ea_data, SMB2_WSL_XATTR_GID, nlen) ||
+			    !strncmp(ea->ea_data, SMB2_WSL_XATTR_MODE, nlen) ||
+			    !strncmp(ea->ea_data, SMB2_WSL_XATTR_DEV, nlen))
+				break;
+			fallthrough;
+		default:
+			return -EINVAL;
+		}
+
+		next = le32_to_cpu(ea->next_entry_offset);
+		if (!next)
+			break;
+		if (!IS_ALIGNED(next, 4) ||
+		    check_add_overflow((unsigned long)ea, next, &addr))
+			return -EINVAL;
+		ea = (void *)addr;
+	}
+	return 0;
+}
+
 /*
  * note: If cfile is passed, the reference to it is dropped here.
  * So make sure that you do not reuse cfile after return from this func.
@@ -95,10 +171,9 @@ static int parse_posix_sids(struct cifs_open_info_data *data,
  */
 static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
 			    struct cifs_sb_info *cifs_sb, const char *full_path,
-			    __u32 desired_access, __u32 create_disposition,
-			    __u32 create_options, umode_t mode, struct kvec *in_iov,
+			    struct cifs_open_parms *oparms, struct kvec *in_iov,
 			    int *cmds, int num_cmds, struct cifsFileInfo *cfile,
-			    struct kvec *out_iov, int *out_buftype)
+			    struct kvec *out_iov, int *out_buftype, struct dentry *dentry)
 {
 
 	struct reparse_data_buffer *rbuf;
@@ -115,11 +190,12 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
 	int resp_buftype[MAX_COMPOUND];
 	struct smb2_query_info_rsp *qi_rsp = NULL;
 	struct cifs_open_info_data *idata;
+	struct inode *inode = NULL;
 	int flags = 0;
 	__u8 delete_pending[8] = {1, 0, 0, 0, 0, 0, 0, 0};
 	unsigned int size[2];
 	void *data[2];
-	int len;
+	unsigned int len;
 	int retries = 0, cur_sleep = 1;
 
 replay_again:
@@ -152,16 +228,28 @@ replay_again:
 		goto finished;
 	}
 
-	vars->oparms = (struct cifs_open_parms) {
-		.tcon = tcon,
-		.path = full_path,
-		.desired_access = desired_access,
-		.disposition = create_disposition,
-		.create_options = cifs_create_options(cifs_sb, create_options),
-		.fid = &fid,
-		.mode = mode,
-		.cifs_sb = cifs_sb,
-	};
+	/* if there is an existing lease, reuse it */
+
+	/*
+	 * note: files with hardlinks cause unexpected behaviour. As per MS-SMB2,
+	 * lease keys are associated with the filepath. We are maintaining lease keys
+	 * with the inode on the client. If the file has hardlinks, it is possible
+	 * that the lease for a file be reused for an operation on its hardlink or
+	 * vice versa.
+	 * As a workaround, send request using an existing lease key and if the server
+	 * returns STATUS_INVALID_PARAMETER, which maps to EINVAL, send the request
+	 * again without the lease.
+	 */
+	if (dentry) {
+		inode = d_inode(dentry);
+		if (CIFS_I(inode)->lease_granted && server->ops->get_lease_key) {
+			oplock = SMB2_OPLOCK_LEVEL_LEASE;
+			server->ops->get_lease_key(inode, &fid);
+		}
+	}
+
+	vars->oparms = *oparms;
+	vars->oparms.fid = &fid;
 
 	rqst[num_rqst].rq_iov = &vars->open_iov[0];
 	rqst[num_rqst].rq_nvec = SMB2_CREATE_IOV_SIZE;
@@ -202,14 +290,13 @@ replay_again:
 							  SMB2_O_INFO_FILE, 0,
 							  sizeof(struct smb2_file_all_info) +
 							  PATH_MAX * 2, 0, NULL);
-				if (!rc) {
-					smb2_set_next_command(tcon, &rqst[num_rqst]);
-					smb2_set_related(&rqst[num_rqst]);
-				}
 			}
-
-			if (rc)
+			if (!rc && (!cfile || num_rqst > 1)) {
+				smb2_set_next_command(tcon, &rqst[num_rqst]);
+				smb2_set_related(&rqst[num_rqst]);
+			} else if (rc) {
 				goto finished;
+			}
 			num_rqst++;
 			trace_smb3_query_info_compound_enter(xid, ses->Suid,
 							     tcon->tid, full_path);
@@ -239,14 +326,13 @@ replay_again:
 							  sizeof(struct smb311_posix_qinfo *) +
 							  (PATH_MAX * 2) +
 							  (sizeof(struct cifs_sid) * 2), 0, NULL);
-				if (!rc) {
-					smb2_set_next_command(tcon, &rqst[num_rqst]);
-					smb2_set_related(&rqst[num_rqst]);
-				}
 			}
-
-			if (rc)
+			if (!rc && (!cfile || num_rqst > 1)) {
+				smb2_set_next_command(tcon, &rqst[num_rqst]);
+				smb2_set_related(&rqst[num_rqst]);
+			} else if (rc) {
 				goto finished;
+			}
 			num_rqst++;
 			trace_smb3_posix_query_info_compound_enter(xid, ses->Suid,
 								   tcon->tid, full_path);
@@ -304,13 +390,13 @@ replay_again:
 							FILE_END_OF_FILE_INFORMATION,
 							SMB2_O_INFO_FILE, 0,
 							data, size);
-				if (!rc) {
-					smb2_set_next_command(tcon, &rqst[num_rqst]);
-					smb2_set_related(&rqst[num_rqst]);
-				}
 			}
-			if (rc)
+			if (!rc && (!cfile || num_rqst > 1)) {
+				smb2_set_next_command(tcon, &rqst[num_rqst]);
+				smb2_set_related(&rqst[num_rqst]);
+			} else if (rc) {
 				goto finished;
+			}
 			num_rqst++;
 			trace_smb3_set_eof_enter(xid, ses->Suid, tcon->tid, full_path);
 			break;
@@ -335,14 +421,13 @@ replay_again:
 							COMPOUND_FID, current->tgid,
 							FILE_BASIC_INFORMATION,
 							SMB2_O_INFO_FILE, 0, data, size);
-				if (!rc) {
-					smb2_set_next_command(tcon, &rqst[num_rqst]);
-					smb2_set_related(&rqst[num_rqst]);
-				}
 			}
-
-			if (rc)
+			if (!rc && (!cfile || num_rqst > 1)) {
+				smb2_set_next_command(tcon, &rqst[num_rqst]);
+				smb2_set_related(&rqst[num_rqst]);
+			} else if (rc) {
 				goto finished;
+			}
 			num_rqst++;
 			trace_smb3_set_info_compound_enter(xid, ses->Suid,
 							   tcon->tid, full_path);
@@ -376,13 +461,13 @@ replay_again:
 							COMPOUND_FID, COMPOUND_FID,
 							current->tgid, FILE_RENAME_INFORMATION,
 							SMB2_O_INFO_FILE, 0, data, size);
-				if (!rc) {
-					smb2_set_next_command(tcon, &rqst[num_rqst]);
-					smb2_set_related(&rqst[num_rqst]);
-				}
 			}
-			if (rc)
+			if (!rc && (!cfile || num_rqst > 1)) {
+				smb2_set_next_command(tcon, &rqst[num_rqst]);
+				smb2_set_related(&rqst[num_rqst]);
+			} else if (rc) {
 				goto finished;
+			}
 			num_rqst++;
 			trace_smb3_rename_enter(xid, ses->Suid, tcon->tid, full_path);
 			break;
@@ -417,15 +502,27 @@ replay_again:
 			rqst[num_rqst].rq_iov = vars->io_iov;
 			rqst[num_rqst].rq_nvec = ARRAY_SIZE(vars->io_iov);
 
-			rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst],
-					     COMPOUND_FID, COMPOUND_FID,
-					     FSCTL_SET_REPARSE_POINT,
-					     in_iov[i].iov_base,
-					     in_iov[i].iov_len, 0);
-			if (rc)
+			if (cfile) {
+				rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst],
+						     cfile->fid.persistent_fid,
+						     cfile->fid.volatile_fid,
+						     FSCTL_SET_REPARSE_POINT,
+						     in_iov[i].iov_base,
+						     in_iov[i].iov_len, 0);
+			} else {
+				rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst],
+						     COMPOUND_FID, COMPOUND_FID,
+						     FSCTL_SET_REPARSE_POINT,
+						     in_iov[i].iov_base,
+						     in_iov[i].iov_len, 0);
+			}
+			if (!rc && (!cfile || num_rqst > 1)) {
+				smb2_set_next_command(tcon, &rqst[num_rqst]);
+				smb2_set_related(&rqst[num_rqst]);
+			} else if (rc) {
 				goto finished;
-			smb2_set_next_command(tcon, &rqst[num_rqst]);
-			smb2_set_related(&rqst[num_rqst++]);
+			}
+			num_rqst++;
 			trace_smb3_set_reparse_compound_enter(xid, ses->Suid,
 							      tcon->tid, full_path);
 			break;
@@ -433,17 +530,61 @@ replay_again:
 			rqst[num_rqst].rq_iov = vars->io_iov;
 			rqst[num_rqst].rq_nvec = ARRAY_SIZE(vars->io_iov);
 
-			rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst],
-					     COMPOUND_FID, COMPOUND_FID,
-					     FSCTL_GET_REPARSE_POINT,
-					     NULL, 0, CIFSMaxBufSize);
-			if (rc)
+			if (cfile) {
+				rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst],
+						     cfile->fid.persistent_fid,
+						     cfile->fid.volatile_fid,
+						     FSCTL_GET_REPARSE_POINT,
+						     NULL, 0, CIFSMaxBufSize);
+			} else {
+				rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst],
+						     COMPOUND_FID, COMPOUND_FID,
+						     FSCTL_GET_REPARSE_POINT,
+						     NULL, 0, CIFSMaxBufSize);
+			}
+			if (!rc && (!cfile || num_rqst > 1)) {
+				smb2_set_next_command(tcon, &rqst[num_rqst]);
+				smb2_set_related(&rqst[num_rqst]);
+			} else if (rc) {
 				goto finished;
-			smb2_set_next_command(tcon, &rqst[num_rqst]);
-			smb2_set_related(&rqst[num_rqst++]);
+			}
+			num_rqst++;
 			trace_smb3_get_reparse_compound_enter(xid, ses->Suid,
 							      tcon->tid, full_path);
 			break;
+		case SMB2_OP_QUERY_WSL_EA:
+			rqst[num_rqst].rq_iov = &vars->ea_iov;
+			rqst[num_rqst].rq_nvec = 1;
+
+			if (cfile) {
+				rc = SMB2_query_info_init(tcon, server,
+							  &rqst[num_rqst],
+							  cfile->fid.persistent_fid,
+							  cfile->fid.volatile_fid,
+							  FILE_FULL_EA_INFORMATION,
+							  SMB2_O_INFO_FILE, 0,
+							  SMB2_WSL_MAX_QUERY_EA_RESP_SIZE,
+							  sizeof(wsl_query_eas),
+							  (void *)wsl_query_eas);
+			} else {
+				rc = SMB2_query_info_init(tcon, server,
+							  &rqst[num_rqst],
+							  COMPOUND_FID,
+							  COMPOUND_FID,
+							  FILE_FULL_EA_INFORMATION,
+							  SMB2_O_INFO_FILE, 0,
+							  SMB2_WSL_MAX_QUERY_EA_RESP_SIZE,
+							  sizeof(wsl_query_eas),
+							  (void *)wsl_query_eas);
+			}
+			if (!rc && (!cfile || num_rqst > 1)) {
+				smb2_set_next_command(tcon, &rqst[num_rqst]);
+				smb2_set_related(&rqst[num_rqst]);
+			} else if (rc) {
+				goto finished;
+			}
+			num_rqst++;
+			break;
 		default:
 			cifs_dbg(VFS, "Invalid command\n");
 			rc = -EINVAL;
@@ -551,8 +692,15 @@ finished:
 		case SMB2_OP_DELETE:
 			if (rc)
 				trace_smb3_delete_err(xid,  ses->Suid, tcon->tid, rc);
-			else
+			else {
+				/*
+				 * If dentry (hence, inode) is NULL, lease break is going to
+				 * take care of degrading leases on handles for deleted files.
+				 */
+				if (inode)
+					cifs_mark_open_handles_for_deleted_file(inode, full_path);
 				trace_smb3_delete_done(xid, ses->Suid, tcon->tid);
+			}
 			break;
 		case SMB2_OP_MKDIR:
 			if (rc)
@@ -626,11 +774,32 @@ finished:
 				memset(iov, 0, sizeof(*iov));
 				resp_buftype[i + 1] = CIFS_NO_BUFFER;
 			} else {
-				trace_smb3_set_reparse_compound_err(xid,  ses->Suid,
+				trace_smb3_set_reparse_compound_err(xid, ses->Suid,
 								    tcon->tid, rc);
 			}
 			SMB2_ioctl_free(&rqst[num_rqst++]);
 			break;
+		case SMB2_OP_QUERY_WSL_EA:
+			if (!rc) {
+				idata = in_iov[i].iov_base;
+				qi_rsp = rsp_iov[i + 1].iov_base;
+				data[0] = (u8 *)qi_rsp + le16_to_cpu(qi_rsp->OutputBufferOffset);
+				size[0] = le32_to_cpu(qi_rsp->OutputBufferLength);
+				rc = check_wsl_eas(&rsp_iov[i + 1]);
+				if (!rc) {
+					memcpy(idata->wsl.eas, data[0], size[0]);
+					idata->wsl.eas_len = size[0];
+				}
+			}
+			if (!rc) {
+				trace_smb3_query_wsl_ea_compound_done(xid, ses->Suid,
+								      tcon->tid);
+			} else {
+				trace_smb3_query_wsl_ea_compound_err(xid, ses->Suid,
+								     tcon->tid, rc);
+			}
+			SMB2_query_info_free(&rqst[num_rqst++]);
+			break;
 		}
 	}
 	SMB2_close_free(&rqst[num_rqst]);
@@ -693,15 +862,16 @@ int smb2_query_path_info(const unsigned int xid,
 			 const char *full_path,
 			 struct cifs_open_info_data *data)
 {
+	struct cifs_open_parms oparms;
 	__u32 create_options = 0;
 	struct cifsFileInfo *cfile;
 	struct cached_fid *cfid = NULL;
 	struct smb2_hdr *hdr;
-	struct kvec in_iov[2], out_iov[3] = {};
+	struct kvec in_iov[3], out_iov[3] = {};
 	int out_buftype[3] = {};
-	int cmds[2];
+	int cmds[3];
 	bool islink;
-	int i, num_cmds;
+	int i, num_cmds = 0;
 	int rc, rc2;
 
 	data->adjust_tz = false;
@@ -734,20 +904,22 @@ int smb2_query_path_info(const unsigned int xid,
 			close_cached_dir(cfid);
 			return rc;
 		}
-		cmds[0] = SMB2_OP_QUERY_INFO;
+		cmds[num_cmds++] = SMB2_OP_QUERY_INFO;
 	} else {
-		cmds[0] = SMB2_OP_POSIX_QUERY_INFO;
+		cmds[num_cmds++] = SMB2_OP_POSIX_QUERY_INFO;
 	}
 
 	in_iov[0].iov_base = data;
 	in_iov[0].iov_len = sizeof(*data);
 	in_iov[1] = in_iov[0];
+	in_iov[2] = in_iov[0];
 
 	cifs_get_readable_path(tcon, full_path, &cfile);
+	oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, FILE_READ_ATTRIBUTES,
+			     FILE_OPEN, create_options, ACL_NO_MODE);
 	rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
-			      FILE_READ_ATTRIBUTES, FILE_OPEN,
-			      create_options, ACL_NO_MODE, in_iov,
-			      cmds, 1, cfile, out_iov, out_buftype);
+			      &oparms, in_iov, cmds, num_cmds,
+			      cfile, out_iov, out_buftype, NULL);
 	hdr = out_iov[0].iov_base;
 	/*
 	 * If first iov is unset, then SMB session was dropped or we've got a
@@ -767,19 +939,22 @@ int smb2_query_path_info(const unsigned int xid,
 		if (rc || !data->reparse_point)
 			goto out;
 
-		if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK) {
-			/* symlink already parsed in create response */
-			num_cmds = 1;
-		} else {
-			cmds[1] = SMB2_OP_GET_REPARSE;
-			num_cmds = 2;
-		}
-		create_options |= OPEN_REPARSE_POINT;
+		cmds[num_cmds++] = SMB2_OP_QUERY_WSL_EA;
+		/*
+		 * Skip SMB2_OP_GET_REPARSE if symlink already parsed in create
+		 * response.
+		 */
+		if (data->reparse.tag != IO_REPARSE_TAG_SYMLINK)
+			cmds[num_cmds++] = SMB2_OP_GET_REPARSE;
+
+		oparms = CIFS_OPARMS(cifs_sb, tcon, full_path,
+				     FILE_READ_ATTRIBUTES | FILE_READ_EA,
+				     FILE_OPEN, create_options |
+				     OPEN_REPARSE_POINT, ACL_NO_MODE);
 		cifs_get_readable_path(tcon, full_path, &cfile);
 		rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
-				      FILE_READ_ATTRIBUTES, FILE_OPEN,
-				      create_options, ACL_NO_MODE, in_iov,
-				      cmds, num_cmds, cfile, NULL, NULL);
+				      &oparms, in_iov, cmds, num_cmds,
+				      cfile, NULL, NULL, NULL);
 		break;
 	case -EREMOTE:
 		break;
@@ -807,11 +982,14 @@ smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode,
 	   struct cifs_tcon *tcon, const char *name,
 	   struct cifs_sb_info *cifs_sb)
 {
-	return smb2_compound_op(xid, tcon, cifs_sb, name,
-				FILE_WRITE_ATTRIBUTES, FILE_CREATE,
-				CREATE_NOT_FILE, mode,
-				NULL, &(int){SMB2_OP_MKDIR}, 1,
-				NULL, NULL, NULL);
+	struct cifs_open_parms oparms;
+
+	oparms = CIFS_OPARMS(cifs_sb, tcon, name, FILE_WRITE_ATTRIBUTES,
+			     FILE_CREATE, CREATE_NOT_FILE, mode);
+	return smb2_compound_op(xid, tcon, cifs_sb,
+				name, &oparms, NULL,
+				&(int){SMB2_OP_MKDIR}, 1,
+				NULL, NULL, NULL, NULL);
 }
 
 void
@@ -819,6 +997,7 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name,
 		   struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon,
 		   const unsigned int xid)
 {
+	struct cifs_open_parms oparms;
 	FILE_BASIC_INFO data = {};
 	struct cifsInodeInfo *cifs_i;
 	struct cifsFileInfo *cfile;
@@ -832,11 +1011,12 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name,
 	dosattrs = cifs_i->cifsAttrs | ATTR_READONLY;
 	data.Attributes = cpu_to_le32(dosattrs);
 	cifs_get_writable_path(tcon, name, FIND_WR_ANY, &cfile);
+	oparms = CIFS_OPARMS(cifs_sb, tcon, name, FILE_WRITE_ATTRIBUTES,
+			     FILE_CREATE, CREATE_NOT_FILE, ACL_NO_MODE);
 	tmprc = smb2_compound_op(xid, tcon, cifs_sb, name,
-				 FILE_WRITE_ATTRIBUTES, FILE_CREATE,
-				 CREATE_NOT_FILE, ACL_NO_MODE, &in_iov,
+				 &oparms, &in_iov,
 				 &(int){SMB2_OP_SET_INFO}, 1,
-				 cfile, NULL, NULL);
+				 cfile, NULL, NULL, NULL);
 	if (tmprc == 0)
 		cifs_i->cifsAttrs = dosattrs;
 }
@@ -845,31 +1025,47 @@ int
 smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
 	   struct cifs_sb_info *cifs_sb)
 {
+	struct cifs_open_parms oparms;
+
 	drop_cached_dir_by_name(xid, tcon, name, cifs_sb);
-	return smb2_compound_op(xid, tcon, cifs_sb, name,
-				DELETE, FILE_OPEN, CREATE_NOT_FILE,
-				ACL_NO_MODE, NULL,
+	oparms = CIFS_OPARMS(cifs_sb, tcon, name, DELETE,
+			     FILE_OPEN, CREATE_NOT_FILE, ACL_NO_MODE);
+	return smb2_compound_op(xid, tcon, cifs_sb,
+				name, &oparms, NULL,
 				&(int){SMB2_OP_RMDIR}, 1,
-				NULL, NULL, NULL);
+				NULL, NULL, NULL, NULL);
 }
 
 int
 smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
-	    struct cifs_sb_info *cifs_sb)
+	    struct cifs_sb_info *cifs_sb, struct dentry *dentry)
 {
-	return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
-				CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
-				ACL_NO_MODE, NULL,
-				&(int){SMB2_OP_DELETE}, 1,
-				NULL, NULL, NULL);
+	struct cifs_open_parms oparms;
+
+	oparms = CIFS_OPARMS(cifs_sb, tcon, name,
+			     DELETE, FILE_OPEN,
+			     CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
+			     ACL_NO_MODE);
+	int rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms,
+				  NULL, &(int){SMB2_OP_DELETE}, 1,
+				  NULL, NULL, NULL, dentry);
+	if (rc == -EINVAL) {
+		cifs_dbg(FYI, "invalid lease key, resending request without lease");
+		rc = smb2_compound_op(xid, tcon, cifs_sb, name, &oparms,
+				      NULL, &(int){SMB2_OP_DELETE}, 1,
+				      NULL, NULL, NULL, NULL);
+	}
+	return rc;
 }
 
 static int smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
 			      const char *from_name, const char *to_name,
 			      struct cifs_sb_info *cifs_sb,
 			      __u32 create_options, __u32 access,
-			      int command, struct cifsFileInfo *cfile)
+			      int command, struct cifsFileInfo *cfile,
+				  struct dentry *dentry)
 {
+	struct cifs_open_parms oparms;
 	struct kvec in_iov;
 	__le16 *smb2_to_name = NULL;
 	int rc;
@@ -881,9 +1077,11 @@ static int smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
 	}
 	in_iov.iov_base = smb2_to_name;
 	in_iov.iov_len = 2 * UniStrnlen((wchar_t *)smb2_to_name, PATH_MAX);
-	rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access,
-			      FILE_OPEN, create_options, ACL_NO_MODE,
-			      &in_iov, &command, 1, cfile, NULL, NULL);
+	oparms = CIFS_OPARMS(cifs_sb, tcon, from_name, access, FILE_OPEN,
+			     create_options, ACL_NO_MODE);
+	rc = smb2_compound_op(xid, tcon, cifs_sb, from_name,
+			      &oparms, &in_iov, &command, 1,
+			      cfile, NULL, NULL, dentry);
 smb2_rename_path:
 	kfree(smb2_to_name);
 	return rc;
@@ -901,8 +1099,14 @@ int smb2_rename_path(const unsigned int xid,
 	drop_cached_dir_by_name(xid, tcon, from_name, cifs_sb);
 	cifs_get_writable_path(tcon, from_name, FIND_WR_WITH_DELETE, &cfile);
 
-	return smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb,
-				  co, DELETE, SMB2_OP_RENAME, cfile);
+	int rc = smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb,
+				  co, DELETE, SMB2_OP_RENAME, cfile, source_dentry);
+	if (rc == -EINVAL) {
+		cifs_dbg(FYI, "invalid lease key, resending request without lease");
+		rc = smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb,
+				  co, DELETE, SMB2_OP_RENAME, cfile, NULL);
+	}
+	return rc;
 }
 
 int smb2_create_hardlink(const unsigned int xid,
@@ -915,32 +1119,46 @@ int smb2_create_hardlink(const unsigned int xid,
 
 	return smb2_set_path_attr(xid, tcon, from_name, to_name,
 				  cifs_sb, co, FILE_READ_ATTRIBUTES,
-				  SMB2_OP_HARDLINK, NULL);
+				  SMB2_OP_HARDLINK, NULL, NULL);
 }
 
 int
 smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
 		   const char *full_path, __u64 size,
-		   struct cifs_sb_info *cifs_sb, bool set_alloc)
+		   struct cifs_sb_info *cifs_sb, bool set_alloc,
+		   struct dentry *dentry)
 {
+	struct cifs_open_parms oparms;
 	struct cifsFileInfo *cfile;
 	struct kvec in_iov;
 	__le64 eof = cpu_to_le64(size);
+	int rc;
 
 	in_iov.iov_base = &eof;
 	in_iov.iov_len = sizeof(eof);
 	cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
-	return smb2_compound_op(xid, tcon, cifs_sb, full_path,
-				FILE_WRITE_DATA, FILE_OPEN,
-				0, ACL_NO_MODE, &in_iov,
-				&(int){SMB2_OP_SET_EOF}, 1,
-				cfile, NULL, NULL);
+
+	oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, FILE_WRITE_DATA,
+			     FILE_OPEN, 0, ACL_NO_MODE);
+	rc = smb2_compound_op(xid, tcon, cifs_sb,
+			      full_path, &oparms, &in_iov,
+			      &(int){SMB2_OP_SET_EOF}, 1,
+			      cfile, NULL, NULL, dentry);
+	if (rc == -EINVAL) {
+		cifs_dbg(FYI, "invalid lease key, resending request without lease");
+		rc = smb2_compound_op(xid, tcon, cifs_sb,
+				      full_path, &oparms, &in_iov,
+				      &(int){SMB2_OP_SET_EOF}, 1,
+				      cfile, NULL, NULL, NULL);
+	}
+	return rc;
 }
 
 int
 smb2_set_file_info(struct inode *inode, const char *full_path,
 		   FILE_BASIC_INFO *buf, const unsigned int xid)
 {
+	struct cifs_open_parms oparms;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	struct tcon_link *tlink;
 	struct cifs_tcon *tcon;
@@ -959,11 +1177,12 @@ smb2_set_file_info(struct inode *inode, const char *full_path,
 	tcon = tlink_tcon(tlink);
 
 	cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
-	rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
-			      FILE_WRITE_ATTRIBUTES, FILE_OPEN,
-			      0, ACL_NO_MODE, &in_iov,
+	oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, FILE_WRITE_ATTRIBUTES,
+			     FILE_OPEN, 0, ACL_NO_MODE);
+	rc = smb2_compound_op(xid, tcon, cifs_sb,
+			      full_path, &oparms, &in_iov,
 			      &(int){SMB2_OP_SET_INFO}, 1,
-			      cfile, NULL, NULL);
+			      cfile, NULL, NULL, NULL);
 	cifs_put_tlink(tlink);
 	return rc;
 }
@@ -973,32 +1192,37 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data,
 				     const unsigned int xid,
 				     struct cifs_tcon *tcon,
 				     const char *full_path,
-				     struct kvec *iov)
+				     struct kvec *reparse_iov,
+				     struct kvec *xattr_iov)
 {
+	struct cifs_open_parms oparms;
 	struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
 	struct cifsFileInfo *cfile;
 	struct inode *new = NULL;
 	struct kvec in_iov[2];
 	int cmds[2];
-	int da, co, cd;
 	int rc;
 
-	da = SYNCHRONIZE | DELETE |
-		FILE_READ_ATTRIBUTES |
-		FILE_WRITE_ATTRIBUTES;
-	co = CREATE_NOT_DIR | OPEN_REPARSE_POINT;
-	cd = FILE_CREATE;
+	oparms = CIFS_OPARMS(cifs_sb, tcon, full_path,
+			     SYNCHRONIZE | DELETE |
+			     FILE_READ_ATTRIBUTES |
+			     FILE_WRITE_ATTRIBUTES,
+			     FILE_CREATE,
+			     CREATE_NOT_DIR | OPEN_REPARSE_POINT,
+			     ACL_NO_MODE);
+	if (xattr_iov)
+		oparms.ea_cctx = xattr_iov;
+
 	cmds[0] = SMB2_OP_SET_REPARSE;
-	in_iov[0] = *iov;
+	in_iov[0] = *reparse_iov;
 	in_iov[1].iov_base = data;
 	in_iov[1].iov_len = sizeof(*data);
 
 	if (tcon->posix_extensions) {
 		cmds[1] = SMB2_OP_POSIX_QUERY_INFO;
 		cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
-		rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
-				      da, cd, co, ACL_NO_MODE, in_iov,
-				      cmds, 2, cfile, NULL, NULL);
+		rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms,
+				      in_iov, cmds, 2, cfile, NULL, NULL, NULL);
 		if (!rc) {
 			rc = smb311_posix_get_inode_info(&new, full_path,
 							 data, sb, xid);
@@ -1006,9 +1230,8 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data,
 	} else {
 		cmds[1] = SMB2_OP_QUERY_INFO;
 		cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
-		rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
-				      da, cd, co, ACL_NO_MODE, in_iov,
-				      cmds, 2, cfile, NULL, NULL);
+		rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms,
+				      in_iov, cmds, 2, cfile, NULL, NULL, NULL);
 		if (!rc) {
 			rc = cifs_get_inode_info(&new, full_path,
 						 data, sb, xid, NULL);
@@ -1024,6 +1247,7 @@ int smb2_query_reparse_point(const unsigned int xid,
 			     u32 *tag, struct kvec *rsp,
 			     int *rsp_buftype)
 {
+	struct cifs_open_parms oparms;
 	struct cifs_open_info_data data = {};
 	struct cifsFileInfo *cfile;
 	struct kvec in_iov = { .iov_base = &data, .iov_len = sizeof(data), };
@@ -1032,11 +1256,12 @@ int smb2_query_reparse_point(const unsigned int xid,
 	cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
 
 	cifs_get_readable_path(tcon, full_path, &cfile);
-	rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
-			      FILE_READ_ATTRIBUTES, FILE_OPEN,
-			      OPEN_REPARSE_POINT, ACL_NO_MODE, &in_iov,
+	oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, FILE_READ_ATTRIBUTES,
+			     FILE_OPEN, OPEN_REPARSE_POINT, ACL_NO_MODE);
+	rc = smb2_compound_op(xid, tcon, cifs_sb,
+			      full_path, &oparms, &in_iov,
 			      &(int){SMB2_OP_GET_REPARSE}, 1,
-			      cfile, NULL, NULL);
+			      cfile, NULL, NULL, NULL);
 	if (rc)
 		goto out;
 
diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c
index 82b84a4941dd..cc72be5a93a9 100644
--- a/fs/smb/client/smb2misc.c
+++ b/fs/smb/client/smb2misc.c
@@ -622,6 +622,8 @@ smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server)
 	/* look up tcon based on tid & uid */
 	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+		if (cifs_ses_exiting(ses))
+			continue;
 		list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
 			spin_lock(&tcon->open_file_lock);
 			cifs_stats_inc(
@@ -697,6 +699,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
 	/* look up tcon based on tid & uid */
 	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+		if (cifs_ses_exiting(ses))
+			continue;
 		list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
 
 			spin_lock(&tcon->open_file_lock);
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 4695433fcf39..78c94d0350fe 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -28,6 +28,7 @@
 #include "fscache.h"
 #include "fs_context.h"
 #include "cached_dir.h"
+#include "reparse.h"
 
 /* Change credits for different ops and return the total number of credits */
 static int
@@ -1411,14 +1412,14 @@ smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock)
 	memcpy(cfile->fid.create_guid, fid->create_guid, 16);
 }
 
-static void
+static int
 smb2_close_file(const unsigned int xid, struct cifs_tcon *tcon,
 		struct cifs_fid *fid)
 {
-	SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid);
+	return SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid);
 }
 
-static void
+static int
 smb2_close_getattr(const unsigned int xid, struct cifs_tcon *tcon,
 		   struct cifsFileInfo *cfile)
 {
@@ -1429,7 +1430,7 @@ smb2_close_getattr(const unsigned int xid, struct cifs_tcon *tcon,
 	rc = __SMB2_close(xid, tcon, cfile->fid.persistent_fid,
 		   cfile->fid.volatile_fid, &file_inf);
 	if (rc)
-		return;
+		return rc;
 
 	inode = d_inode(cfile->dentry);
 
@@ -1458,6 +1459,7 @@ smb2_close_getattr(const unsigned int xid, struct cifs_tcon *tcon,
 
 	/* End of file and Attributes should not have to be updated on close */
 	spin_unlock(&inode->i_lock);
+	return rc;
 }
 
 static int
@@ -2479,6 +2481,8 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server)
 
 	spin_lock(&cifs_tcp_ses_lock);
 	list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+		if (cifs_ses_exiting(ses))
+			continue;
 		list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
 			if (tcon->tid == le32_to_cpu(shdr->Id.SyncId.TreeId)) {
 				spin_lock(&tcon->tc_lock);
@@ -2986,109 +2990,6 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses,
 	return rc;
 }
 
-/* See MS-FSCC 2.1.2.6 for the 'NFS' style reparse tags */
-static int parse_reparse_posix(struct reparse_posix_data *buf,
-			       struct cifs_sb_info *cifs_sb,
-			       struct cifs_open_info_data *data)
-{
-	unsigned int len;
-	u64 type;
-
-	switch ((type = le64_to_cpu(buf->InodeType))) {
-	case NFS_SPECFILE_LNK:
-		len = le16_to_cpu(buf->ReparseDataLength);
-		data->symlink_target = cifs_strndup_from_utf16(buf->DataBuffer,
-							       len, true,
-							       cifs_sb->local_nls);
-		if (!data->symlink_target)
-			return -ENOMEM;
-		convert_delimiter(data->symlink_target, '/');
-		cifs_dbg(FYI, "%s: target path: %s\n",
-			 __func__, data->symlink_target);
-		break;
-	case NFS_SPECFILE_CHR:
-	case NFS_SPECFILE_BLK:
-	case NFS_SPECFILE_FIFO:
-	case NFS_SPECFILE_SOCK:
-		break;
-	default:
-		cifs_dbg(VFS, "%s: unhandled inode type: 0x%llx\n",
-			 __func__, type);
-		return -EOPNOTSUPP;
-	}
-	return 0;
-}
-
-static int parse_reparse_symlink(struct reparse_symlink_data_buffer *sym,
-				 u32 plen, bool unicode,
-				 struct cifs_sb_info *cifs_sb,
-				 struct cifs_open_info_data *data)
-{
-	unsigned int len;
-	unsigned int offs;
-
-	/* We handle Symbolic Link reparse tag here. See: MS-FSCC 2.1.2.4 */
-
-	offs = le16_to_cpu(sym->SubstituteNameOffset);
-	len = le16_to_cpu(sym->SubstituteNameLength);
-	if (offs + 20 > plen || offs + len + 20 > plen) {
-		cifs_dbg(VFS, "srv returned malformed symlink buffer\n");
-		return -EIO;
-	}
-
-	data->symlink_target = cifs_strndup_from_utf16(sym->PathBuffer + offs,
-						       len, unicode,
-						       cifs_sb->local_nls);
-	if (!data->symlink_target)
-		return -ENOMEM;
-
-	convert_delimiter(data->symlink_target, '/');
-	cifs_dbg(FYI, "%s: target path: %s\n", __func__, data->symlink_target);
-
-	return 0;
-}
-
-int parse_reparse_point(struct reparse_data_buffer *buf,
-			u32 plen, struct cifs_sb_info *cifs_sb,
-			bool unicode, struct cifs_open_info_data *data)
-{
-	data->reparse.buf = buf;
-
-	/* See MS-FSCC 2.1.2 */
-	switch (le32_to_cpu(buf->ReparseTag)) {
-	case IO_REPARSE_TAG_NFS:
-		return parse_reparse_posix((struct reparse_posix_data *)buf,
-					   cifs_sb, data);
-	case IO_REPARSE_TAG_SYMLINK:
-		return parse_reparse_symlink(
-			(struct reparse_symlink_data_buffer *)buf,
-			plen, unicode, cifs_sb, data);
-	case IO_REPARSE_TAG_LX_SYMLINK:
-	case IO_REPARSE_TAG_AF_UNIX:
-	case IO_REPARSE_TAG_LX_FIFO:
-	case IO_REPARSE_TAG_LX_CHR:
-	case IO_REPARSE_TAG_LX_BLK:
-		return 0;
-	default:
-		cifs_dbg(VFS, "%s: unhandled reparse tag: 0x%08x\n",
-			 __func__, le32_to_cpu(buf->ReparseTag));
-		return -EOPNOTSUPP;
-	}
-}
-
-static int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb,
-				    struct kvec *rsp_iov,
-				    struct cifs_open_info_data *data)
-{
-	struct reparse_data_buffer *buf;
-	struct smb2_ioctl_rsp *io = rsp_iov->iov_base;
-	u32 plen = le32_to_cpu(io->OutputCount);
-
-	buf = (struct reparse_data_buffer *)((u8 *)io +
-					     le32_to_cpu(io->OutputOffset));
-	return parse_reparse_point(buf, plen, cifs_sb, true, data);
-}
-
 static struct cifs_ntsd *
 get_smb2_acl_by_fid(struct cifs_sb_info *cifs_sb,
 		    const struct cifs_fid *cifsfid, u32 *pacllen, u32 info)
@@ -4015,7 +3916,7 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
 		strcat(message, "W");
 	}
 	if (!new_oplock)
-		strncpy(message, "None", sizeof(message));
+		strscpy(message, "None");
 
 	cinode->oplock = new_oplock;
 	cifs_dbg(FYI, "%s Lease granted on inode %p\n", message,
@@ -5063,214 +4964,84 @@ static int smb2_next_header(struct TCP_Server_Info *server, char *buf,
 	return 0;
 }
 
-int cifs_sfu_make_node(unsigned int xid, struct inode *inode,
-		       struct dentry *dentry, struct cifs_tcon *tcon,
-		       const char *full_path, umode_t mode, dev_t dev)
+static int __cifs_sfu_make_node(unsigned int xid, struct inode *inode,
+				struct dentry *dentry, struct cifs_tcon *tcon,
+				const char *full_path, umode_t mode, dev_t dev)
 {
-	struct cifs_open_info_data buf = {};
 	struct TCP_Server_Info *server = tcon->ses->server;
 	struct cifs_open_parms oparms;
 	struct cifs_io_parms io_parms = {};
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	struct cifs_fid fid;
 	unsigned int bytes_written;
-	struct win_dev *pdev;
+	struct win_dev pdev = {};
 	struct kvec iov[2];
 	__u32 oplock = server->oplocks ? REQ_OPLOCK : 0;
 	int rc;
 
-	if (!S_ISCHR(mode) && !S_ISBLK(mode) && !S_ISFIFO(mode))
+	switch (mode & S_IFMT) {
+	case S_IFCHR:
+		strscpy(pdev.type, "IntxCHR");
+		pdev.major = cpu_to_le64(MAJOR(dev));
+		pdev.minor = cpu_to_le64(MINOR(dev));
+		break;
+	case S_IFBLK:
+		strscpy(pdev.type, "IntxBLK");
+		pdev.major = cpu_to_le64(MAJOR(dev));
+		pdev.minor = cpu_to_le64(MINOR(dev));
+		break;
+	case S_IFIFO:
+		strscpy(pdev.type, "LnxFIFO");
+		break;
+	default:
 		return -EPERM;
+	}
 
-	oparms = (struct cifs_open_parms) {
-		.tcon = tcon,
-		.cifs_sb = cifs_sb,
-		.desired_access = GENERIC_WRITE,
-		.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR |
-						      CREATE_OPTION_SPECIAL),
-		.disposition = FILE_CREATE,
-		.path = full_path,
-		.fid = &fid,
-	};
+	oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, GENERIC_WRITE,
+			     FILE_CREATE, CREATE_NOT_DIR |
+			     CREATE_OPTION_SPECIAL, ACL_NO_MODE);
+	oparms.fid = &fid;
 
-	rc = server->ops->open(xid, &oparms, &oplock, &buf);
+	rc = server->ops->open(xid, &oparms, &oplock, NULL);
 	if (rc)
 		return rc;
 
-	/*
-	 * BB Do not bother to decode buf since no local inode yet to put
-	 * timestamps in, but we can reuse it safely.
-	 */
-	pdev = (struct win_dev *)&buf.fi;
 	io_parms.pid = current->tgid;
 	io_parms.tcon = tcon;
-	io_parms.length = sizeof(*pdev);
-	iov[1].iov_base = pdev;
-	iov[1].iov_len = sizeof(*pdev);
-	if (S_ISCHR(mode)) {
-		memcpy(pdev->type, "IntxCHR", 8);
-		pdev->major = cpu_to_le64(MAJOR(dev));
-		pdev->minor = cpu_to_le64(MINOR(dev));
-	} else if (S_ISBLK(mode)) {
-		memcpy(pdev->type, "IntxBLK", 8);
-		pdev->major = cpu_to_le64(MAJOR(dev));
-		pdev->minor = cpu_to_le64(MINOR(dev));
-	} else if (S_ISFIFO(mode)) {
-		memcpy(pdev->type, "LnxFIFO", 8);
-	}
+	io_parms.length = sizeof(pdev);
+	iov[1].iov_base = &pdev;
+	iov[1].iov_len = sizeof(pdev);
 
 	rc = server->ops->sync_write(xid, &fid, &io_parms,
 				     &bytes_written, iov, 1);
 	server->ops->close(xid, tcon, &fid);
-	d_drop(dentry);
-	/* FIXME: add code here to set EAs */
-	cifs_free_open_info(&buf);
 	return rc;
 }
 
-static inline u64 mode_nfs_type(mode_t mode)
-{
-	switch (mode & S_IFMT) {
-	case S_IFBLK: return NFS_SPECFILE_BLK;
-	case S_IFCHR: return NFS_SPECFILE_CHR;
-	case S_IFIFO: return NFS_SPECFILE_FIFO;
-	case S_IFSOCK: return NFS_SPECFILE_SOCK;
-	}
-	return 0;
-}
-
-static int nfs_set_reparse_buf(struct reparse_posix_data *buf,
-			       mode_t mode, dev_t dev,
-			       struct kvec *iov)
-{
-	u64 type;
-	u16 len, dlen;
-
-	len = sizeof(*buf);
-
-	switch ((type = mode_nfs_type(mode))) {
-	case NFS_SPECFILE_BLK:
-	case NFS_SPECFILE_CHR:
-		dlen = sizeof(__le64);
-		break;
-	case NFS_SPECFILE_FIFO:
-	case NFS_SPECFILE_SOCK:
-		dlen = 0;
-		break;
-	default:
-		return -EOPNOTSUPP;
-	}
-
-	buf->ReparseTag = cpu_to_le32(IO_REPARSE_TAG_NFS);
-	buf->Reserved = 0;
-	buf->InodeType = cpu_to_le64(type);
-	buf->ReparseDataLength = cpu_to_le16(len + dlen -
-					     sizeof(struct reparse_data_buffer));
-	*(__le64 *)buf->DataBuffer = cpu_to_le64(((u64)MAJOR(dev) << 32) |
-						 MINOR(dev));
-	iov->iov_base = buf;
-	iov->iov_len = len + dlen;
-	return 0;
-}
-
-static int nfs_make_node(unsigned int xid, struct inode *inode,
-			 struct dentry *dentry, struct cifs_tcon *tcon,
-			 const char *full_path, umode_t mode, dev_t dev)
+int cifs_sfu_make_node(unsigned int xid, struct inode *inode,
+		       struct dentry *dentry, struct cifs_tcon *tcon,
+		       const char *full_path, umode_t mode, dev_t dev)
 {
-	struct cifs_open_info_data data;
-	struct reparse_posix_data *p;
-	struct inode *new;
-	struct kvec iov;
-	__u8 buf[sizeof(*p) + sizeof(__le64)];
+	struct inode *new = NULL;
 	int rc;
 
-	p = (struct reparse_posix_data *)buf;
-	rc = nfs_set_reparse_buf(p, mode, dev, &iov);
+	rc = __cifs_sfu_make_node(xid, inode, dentry, tcon,
+				  full_path, mode, dev);
 	if (rc)
 		return rc;
 
-	data = (struct cifs_open_info_data) {
-		.reparse_point = true,
-		.reparse = { .tag = IO_REPARSE_TAG_NFS, .posix = p, },
-	};
-
-	new = smb2_get_reparse_inode(&data, inode->i_sb, xid,
-				     tcon, full_path, &iov);
-	if (!IS_ERR(new))
-		d_instantiate(dentry, new);
-	else
-		rc = PTR_ERR(new);
-	cifs_free_open_info(&data);
-	return rc;
-}
-
-static int smb2_create_reparse_symlink(const unsigned int xid,
-				       struct inode *inode,
-				       struct dentry *dentry,
-				       struct cifs_tcon *tcon,
-				       const char *full_path,
-				       const char *symname)
-{
-	struct reparse_symlink_data_buffer *buf = NULL;
-	struct cifs_open_info_data data;
-	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
-	struct inode *new;
-	struct kvec iov;
-	__le16 *path;
-	char *sym, sep = CIFS_DIR_SEP(cifs_sb);
-	u16 len, plen;
-	int rc = 0;
-
-	sym = kstrdup(symname, GFP_KERNEL);
-	if (!sym)
-		return -ENOMEM;
-
-	data = (struct cifs_open_info_data) {
-		.reparse_point = true,
-		.reparse = { .tag = IO_REPARSE_TAG_SYMLINK, },
-		.symlink_target = sym,
-	};
-
-	convert_delimiter(sym, sep);
-	path = cifs_convert_path_to_utf16(sym, cifs_sb);
-	if (!path) {
-		rc = -ENOMEM;
-		goto out;
-	}
-
-	plen = 2 * UniStrnlen((wchar_t *)path, PATH_MAX);
-	len = sizeof(*buf) + plen * 2;
-	buf = kzalloc(len, GFP_KERNEL);
-	if (!buf) {
-		rc = -ENOMEM;
-		goto out;
+	if (tcon->posix_extensions) {
+		rc = smb311_posix_get_inode_info(&new, full_path, NULL,
+						 inode->i_sb, xid);
+	} else if (tcon->unix_ext) {
+		rc = cifs_get_inode_info_unix(&new, full_path,
+					      inode->i_sb, xid);
+	} else {
+		rc = cifs_get_inode_info(&new, full_path, NULL,
+					 inode->i_sb, xid, NULL);
 	}
-
-	buf->ReparseTag = cpu_to_le32(IO_REPARSE_TAG_SYMLINK);
-	buf->ReparseDataLength = cpu_to_le16(len - sizeof(struct reparse_data_buffer));
-	buf->SubstituteNameOffset = cpu_to_le16(plen);
-	buf->SubstituteNameLength = cpu_to_le16(plen);
-	memcpy(&buf->PathBuffer[plen], path, plen);
-	buf->PrintNameOffset = 0;
-	buf->PrintNameLength = cpu_to_le16(plen);
-	memcpy(buf->PathBuffer, path, plen);
-	buf->Flags = cpu_to_le32(*symname != '/' ? SYMLINK_FLAG_RELATIVE : 0);
-	if (*sym != sep)
-		buf->Flags = cpu_to_le32(SYMLINK_FLAG_RELATIVE);
-
-	convert_delimiter(sym, '/');
-	iov.iov_base = buf;
-	iov.iov_len = len;
-	new = smb2_get_reparse_inode(&data, inode->i_sb, xid,
-				     tcon, full_path, &iov);
-	if (!IS_ERR(new))
+	if (!rc)
 		d_instantiate(dentry, new);
-	else
-		rc = PTR_ERR(new);
-out:
-	kfree(path);
-	cifs_free_open_info(&data);
-	kfree(buf);
 	return rc;
 }
 
@@ -5291,8 +5062,8 @@ static int smb2_make_node(unsigned int xid, struct inode *inode,
 		rc = cifs_sfu_make_node(xid, inode, dentry, tcon,
 					full_path, mode, dev);
 	} else {
-		rc = nfs_make_node(xid, inode, dentry, tcon,
-				   full_path, mode, dev);
+		rc = smb2_mknod_reparse(xid, inode, dentry, tcon,
+					full_path, mode, dev);
 	}
 	return rc;
 }
@@ -5538,6 +5309,7 @@ struct smb_version_operations smb30_operations = {
 	.tree_connect = SMB2_tcon,
 	.tree_disconnect = SMB2_tdis,
 	.qfs_tcon = smb3_qfs_tcon,
+	.query_server_interfaces = SMB3_request_interfaces,
 	.is_path_accessible = smb2_is_path_accessible,
 	.can_echo = smb2_can_echo,
 	.echo = SMB2_echo,
@@ -5653,6 +5425,7 @@ struct smb_version_operations smb311_operations = {
 	.tree_connect = SMB2_tcon,
 	.tree_disconnect = SMB2_tdis,
 	.qfs_tcon = smb3_qfs_tcon,
+	.query_server_interfaces = SMB3_request_interfaces,
 	.is_path_accessible = smb2_is_path_accessible,
 	.can_echo = smb2_can_echo,
 	.echo = SMB2_echo,
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 608ee05491e2..86c647a947cc 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -367,6 +367,17 @@ again:
 		}
 
 		rc = cifs_setup_session(0, ses, server, nls_codepage);
+		if ((rc == -EACCES) || (rc == -EKEYEXPIRED) || (rc == -EKEYREVOKED)) {
+			/*
+			 * Try alternate password for next reconnect (key rotation
+			 * could be enabled on the server e.g.) if an alternate
+			 * password is available and the current password is expired,
+			 * but do not swap on non pwd related errors like host down
+			 */
+			if (ses->password2)
+				swap(ses->password2, ses->password);
+		}
+
 		if ((rc == -EACCES) && !tcon->retry) {
 			mutex_unlock(&ses->session_mutex);
 			rc = -EHOSTDOWN;
@@ -409,14 +420,15 @@ skip_sess_setup:
 	spin_unlock(&ses->ses_lock);
 
 	if (!rc &&
-	    (server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
+	    (server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL) &&
+	    server->ops->query_server_interfaces) {
 		mutex_unlock(&ses->session_mutex);
 
 		/*
 		 * query server network interfaces, in case they change
 		 */
 		xid = get_xid();
-		rc = SMB3_request_interfaces(xid, tcon, false);
+		rc = server->ops->query_server_interfaces(xid, tcon, false);
 		free_xid(xid);
 
 		if (rc == -EOPNOTSUPP && ses->chan_count > 1) {
@@ -731,7 +743,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req,
 	pneg_ctxt += sizeof(struct smb2_posix_neg_context);
 	neg_context_count++;
 
-	if (server->compress_algorithm) {
+	if (server->compression.requested) {
 		build_compression_ctxt((struct smb2_compression_capabilities_context *)
 				pneg_ctxt);
 		ctxt_len = ALIGN(sizeof(struct smb2_compression_capabilities_context), 8);
@@ -779,6 +791,9 @@ static void decode_compress_ctx(struct TCP_Server_Info *server,
 			 struct smb2_compression_capabilities_context *ctxt)
 {
 	unsigned int len = le16_to_cpu(ctxt->DataLength);
+	__le16 alg;
+
+	server->compression.enabled = false;
 
 	/*
 	 * Caller checked that DataLength remains within SMB boundary. We still
@@ -789,15 +804,22 @@ static void decode_compress_ctx(struct TCP_Server_Info *server,
 		pr_warn_once("server sent bad compression cntxt\n");
 		return;
 	}
+
 	if (le16_to_cpu(ctxt->CompressionAlgorithmCount) != 1) {
-		pr_warn_once("Invalid SMB3 compress algorithm count\n");
+		pr_warn_once("invalid SMB3 compress algorithm count\n");
 		return;
 	}
-	if (le16_to_cpu(ctxt->CompressionAlgorithms[0]) > 3) {
-		pr_warn_once("unknown compression algorithm\n");
+
+	alg = ctxt->CompressionAlgorithms[0];
+
+	/* 'NONE' (0) compressor type is never negotiated */
+	if (alg == 0 || le16_to_cpu(alg) > 3) {
+		pr_warn_once("invalid compression algorithm '%u'\n", alg);
 		return;
 	}
-	server->compress_algorithm = ctxt->CompressionAlgorithms[0];
+
+	server->compression.alg = alg;
+	server->compression.enabled = true;
 }
 
 static int decode_encrypt_ctx(struct TCP_Server_Info *server,
@@ -1536,6 +1558,11 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data)
 			    &sess_data->buf0_type,
 			    CIFS_LOG_ERROR | CIFS_SESS_OP, &rsp_iov);
 	cifs_small_buf_release(sess_data->iov[0].iov_base);
+	if (rc == 0)
+		sess_data->ses->expired_pwd = false;
+	else if ((rc == -EACCES) || (rc == -EKEYEXPIRED) || (rc == -EKEYREVOKED))
+		sess_data->ses->expired_pwd = true;
+
 	memcpy(&sess_data->iov[0], &rsp_iov, sizeof(struct kvec));
 
 	return rc;
@@ -2715,6 +2742,17 @@ add_query_id_context(struct kvec *iov, unsigned int *num_iovec)
 	return 0;
 }
 
+static void add_ea_context(struct cifs_open_parms *oparms,
+			   struct kvec *rq_iov, unsigned int *num_iovs)
+{
+	struct kvec *iov = oparms->ea_cctx;
+
+	if (iov && iov->iov_base && iov->iov_len) {
+		rq_iov[(*num_iovs)++] = *iov;
+		memset(iov, 0, sizeof(*iov));
+	}
+}
+
 static int
 alloc_path_with_tree_prefix(__le16 **out_path, int *out_size, int *out_len,
 			    const char *treename, const __le16 *path)
@@ -3081,6 +3119,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
 	}
 
 	add_query_id_context(iov, &n_iov);
+	add_ea_context(oparms, iov, &n_iov);
 
 	if (n_iov > 2) {
 		/*
@@ -3600,9 +3639,9 @@ replay_again:
 			memcpy(&pbuf->network_open_info,
 			       &rsp->network_open_info,
 			       sizeof(pbuf->network_open_info));
+		atomic_dec(&tcon->num_remote_opens);
 	}
 
-	atomic_dec(&tcon->num_remote_opens);
 close_exit:
 	SMB2_close_free(&rqst);
 	free_rsp_buf(resp_buftype, rsp);
diff --git a/fs/smb/client/smb2pdu.h b/fs/smb/client/smb2pdu.h
index db08194484e0..c72a3b2886b7 100644
--- a/fs/smb/client/smb2pdu.h
+++ b/fs/smb/client/smb2pdu.h
@@ -117,9 +117,10 @@ struct share_redirect_error_context_rsp {
  * [4] : posix context
  * [5] : time warp context
  * [6] : query id context
- * [7] : compound padding
+ * [7] : create ea context
+ * [8] : compound padding
  */
-#define SMB2_CREATE_IOV_SIZE 8
+#define SMB2_CREATE_IOV_SIZE 9
 
 /*
  * Maximum size of a SMB2_CREATE response is 64 (smb2 header) +
@@ -413,4 +414,35 @@ struct smb2_posix_info_parsed {
 	const u8 *name;
 };
 
+struct smb2_create_ea_ctx {
+	struct create_context ctx;
+	__u8 name[8];
+	struct smb2_file_full_ea_info ea;
+} __packed;
+
+#define SMB2_WSL_XATTR_UID		"$LXUID"
+#define SMB2_WSL_XATTR_GID		"$LXGID"
+#define SMB2_WSL_XATTR_MODE		"$LXMOD"
+#define SMB2_WSL_XATTR_DEV		"$LXDEV"
+#define SMB2_WSL_XATTR_NAME_LEN	6
+#define SMB2_WSL_NUM_XATTRS		4
+
+#define SMB2_WSL_XATTR_UID_SIZE	4
+#define SMB2_WSL_XATTR_GID_SIZE	4
+#define SMB2_WSL_XATTR_MODE_SIZE	4
+#define SMB2_WSL_XATTR_DEV_SIZE	8
+
+#define SMB2_WSL_MIN_QUERY_EA_RESP_SIZE \
+	(ALIGN((SMB2_WSL_NUM_XATTRS - 1) * \
+	       (SMB2_WSL_XATTR_NAME_LEN + 1 + \
+		sizeof(struct smb2_file_full_ea_info)), 4) + \
+	 SMB2_WSL_XATTR_NAME_LEN + 1 + sizeof(struct smb2_file_full_ea_info))
+
+#define SMB2_WSL_MAX_QUERY_EA_RESP_SIZE \
+	(ALIGN(SMB2_WSL_MIN_QUERY_EA_RESP_SIZE + \
+	       SMB2_WSL_XATTR_UID_SIZE + \
+	       SMB2_WSL_XATTR_GID_SIZE + \
+	       SMB2_WSL_XATTR_MODE_SIZE + \
+	       SMB2_WSL_XATTR_DEV_SIZE, 4))
+
 #endif				/* _SMB2PDU_H */
diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h
index b3069911e9dd..732169d8a67a 100644
--- a/fs/smb/client/smb2proto.h
+++ b/fs/smb/client/smb2proto.h
@@ -61,7 +61,8 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data,
 				     const unsigned int xid,
 				     struct cifs_tcon *tcon,
 				     const char *full_path,
-				     struct kvec *iov);
+				     struct kvec *reparse_iov,
+				     struct kvec *xattr_iov);
 int smb2_query_reparse_point(const unsigned int xid,
 			     struct cifs_tcon *tcon,
 			     struct cifs_sb_info *cifs_sb,
@@ -75,7 +76,8 @@ int smb2_query_path_info(const unsigned int xid,
 			 struct cifs_open_info_data *data);
 extern int smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
 			      const char *full_path, __u64 size,
-			      struct cifs_sb_info *cifs_sb, bool set_alloc);
+			      struct cifs_sb_info *cifs_sb, bool set_alloc,
+				  struct dentry *dentry);
 extern int smb2_set_file_info(struct inode *inode, const char *full_path,
 			      FILE_BASIC_INFO *buf, const unsigned int xid);
 extern int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
@@ -91,7 +93,8 @@ extern void smb2_mkdir_setinfo(struct inode *inode, const char *full_path,
 extern int smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon,
 		      const char *name, struct cifs_sb_info *cifs_sb);
 extern int smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon,
-		       const char *name, struct cifs_sb_info *cifs_sb);
+		       const char *name, struct cifs_sb_info *cifs_sb,
+			   struct dentry *dentry);
 int smb2_rename_path(const unsigned int xid,
 		     struct cifs_tcon *tcon,
 		     struct dentry *source_dentry,
@@ -308,5 +311,11 @@ int smb311_posix_query_path_info(const unsigned int xid,
 int posix_info_parse(const void *beg, const void *end,
 		     struct smb2_posix_info_parsed *out);
 int posix_info_sid_size(const void *beg, const void *end);
+int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode,
+				struct dentry *dentry, struct cifs_tcon *tcon,
+				const char *full_path, const char *symname);
+int smb2_make_nfs_node(unsigned int xid, struct inode *inode,
+		       struct dentry *dentry, struct cifs_tcon *tcon,
+		       const char *full_path, umode_t mode, dev_t dev);
 
 #endif			/* _SMB2PROTO_H */
diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c
index 5a3ca62d2f07..1d6e54f7879e 100644
--- a/fs/smb/client/smb2transport.c
+++ b/fs/smb/client/smb2transport.c
@@ -659,7 +659,7 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 	}
 	spin_unlock(&server->srv_lock);
 	if (!is_binding && !server->session_estab) {
-		strncpy(shdr->Signature, "BSRSPYL", 8);
+		strscpy(shdr->Signature, "BSRSPYL");
 		return 0;
 	}
 
diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h
index 522fa387fcfd..5e83cb9da902 100644
--- a/fs/smb/client/trace.h
+++ b/fs/smb/client/trace.h
@@ -375,6 +375,7 @@ DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(get_reparse_compound_enter);
 DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter);
 DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter);
 DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(tdis_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mknod_enter);
 
 DECLARE_EVENT_CLASS(smb3_inf_compound_done_class,
 	TP_PROTO(unsigned int xid,
@@ -411,10 +412,11 @@ DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done);
 DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done);
 DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_reparse_compound_done);
 DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(get_reparse_compound_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(query_wsl_ea_compound_done);
 DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done);
 DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done);
 DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(tdis_done);
-
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mknod_done);
 
 DECLARE_EVENT_CLASS(smb3_inf_compound_err_class,
 	TP_PROTO(unsigned int xid,
@@ -456,9 +458,11 @@ DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err);
 DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err);
 DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_reparse_compound_err);
 DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(get_reparse_compound_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(query_wsl_ea_compound_err);
 DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err);
 DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err);
 DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(tdis_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mknod_err);
 
 /*
  * For logging SMB3 Status code and Command for responses which return errors
@@ -1030,6 +1034,38 @@ DEFINE_EVENT(smb3_ses_class, smb3_##name,  \
 
 DEFINE_SMB3_SES_EVENT(ses_not_found);
 
+DECLARE_EVENT_CLASS(smb3_ioctl_class,
+	TP_PROTO(unsigned int xid,
+		__u64	fid,
+		unsigned int command),
+	TP_ARGS(xid, fid, command),
+	TP_STRUCT__entry(
+		__field(unsigned int, xid)
+		__field(__u64, fid)
+		__field(unsigned int, command)
+	),
+	TP_fast_assign(
+		__entry->xid = xid;
+		__entry->fid = fid;
+		__entry->command = command;
+	),
+	TP_printk("xid=%u fid=0x%llx ioctl cmd=0x%x",
+		__entry->xid, __entry->fid, __entry->command)
+)
+
+#define DEFINE_SMB3_IOCTL_EVENT(name)        \
+DEFINE_EVENT(smb3_ioctl_class, smb3_##name,  \
+	TP_PROTO(unsigned int xid,	     \
+		__u64 fid,		     \
+		unsigned int command),	     \
+	TP_ARGS(xid, fid, command))
+
+DEFINE_SMB3_IOCTL_EVENT(ioctl);
+
+
+
+
+
 DECLARE_EVENT_CLASS(smb3_credit_class,
 	TP_PROTO(__u64	currmid,
 		__u64 conn_id,
diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h
index 57f2343164a3..1b594307c9d5 100644
--- a/fs/smb/common/smb2pdu.h
+++ b/fs/smb/common/smb2pdu.h
@@ -208,38 +208,45 @@ struct smb2_transform_hdr {
 	__le64  SessionId;
 } __packed;
 
+/*
+ * These are simplified versions from the spec, as we don't need a fully fledged
+ * form of both unchained and chained structs.
+ *
+ * Moreover, even in chained compressed payloads, the initial compression header
+ * has the form of the unchained one -- i.e. it never has the
+ * OriginalPayloadSize field and ::Offset field always represent an offset
+ * (instead of a length, as it is in the chained header).
+ *
+ * See MS-SMB2 2.2.42 for more details.
+ */
+#define SMB2_COMPRESSION_FLAG_NONE	0x0000
+#define SMB2_COMPRESSION_FLAG_CHAINED	0x0001
 
-/* See MS-SMB2 2.2.42 */
-struct smb2_compression_transform_hdr_unchained {
-	__le32 ProtocolId;	/* 0xFC 'S' 'M' 'B' */
+struct smb2_compression_hdr {
+	__le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */
 	__le32 OriginalCompressedSegmentSize;
 	__le16 CompressionAlgorithm;
 	__le16 Flags;
-	__le16 Length; /* if chained it is length, else offset */
+	__le32 Offset; /* this is the size of the uncompressed SMB2 header below */
+	/* uncompressed SMB2 header (READ or WRITE) goes here */
+	/* compressed data goes here */
 } __packed;
 
-/* See MS-SMB2 2.2.42.1 */
-#define SMB2_COMPRESSION_FLAG_NONE	0x0000
-#define SMB2_COMPRESSION_FLAG_CHAINED	0x0001
-
-struct compression_payload_header {
+/*
+ * ... OTOH, set compression payload header to always have OriginalPayloadSize
+ * as it's easier to pass the struct size minus sizeof(OriginalPayloadSize)
+ * than to juggle around the header/data memory.
+ */
+struct smb2_compression_payload_hdr {
 	__le16	CompressionAlgorithm;
 	__le16	Flags;
 	__le32	Length; /* length of compressed playload including field below if present */
-	/* __le32 OriginalPayloadSize; */ /* optional, present when LZNT1, LZ77, LZ77+Huffman */
+	__le32 OriginalPayloadSize; /* accounted when LZNT1, LZ77, LZ77+Huffman */
 } __packed;
 
-/* See MS-SMB2 2.2.42.2 */
-struct smb2_compression_transform_hdr_chained {
-	__le32 ProtocolId;	/* 0xFC 'S' 'M' 'B' */
-	__le32 OriginalCompressedSegmentSize;
-	/* struct compression_payload_header[] */
-} __packed;
-
-/* See MS-SMB2 2.2.42.2.2 */
-struct compression_pattern_payload_v1 {
-	__le16	Pattern;
-	__le16	Reserved1;
+struct smb2_compression_pattern_v1 {
+	__u8	Pattern;
+	__u8	Reserved1;
 	__le16	Reserved2;
 	__le32	Repetitions;
 } __packed;
@@ -273,15 +280,16 @@ struct smb3_blob_data {
 #define SE_GROUP_RESOURCE		0x20000000
 #define SE_GROUP_LOGON_ID		0xC0000000
 
-/* struct sid_attr_data is SidData array in BlobData format then le32 Attr */
-
 struct sid_array_data {
 	__le16 SidAttrCount;
 	/* SidAttrList - array of sid_attr_data structs */
 } __packed;
 
-struct luid_attr_data {
-
+/* struct sid_attr_data is SidData array in BlobData format then le32 Attr */
+struct sid_attr_data {
+	__le16 BlobSize;
+	__u8 BlobData[];
+	/* __le32 Attr */
 } __packed;
 
 /*
@@ -495,6 +503,7 @@ struct smb2_encryption_neg_context {
 #define SMB3_COMPRESS_LZ77_HUFF	cpu_to_le16(0x0003)
 /* Pattern scanning algorithm See MS-SMB2 3.1.4.4.1 */
 #define SMB3_COMPRESS_PATTERN	cpu_to_le16(0x0004) /* Pattern_V1 */
+#define SMB3_COMPRESS_LZ4	cpu_to_le16(0x0005)
 
 /* Compression Flags */
 #define SMB2_COMPRESSION_CAPABILITIES_FLAG_NONE		cpu_to_le32(0x00000000)
diff --git a/fs/smb/common/smbfsctl.h b/fs/smb/common/smbfsctl.h
index edd7fc2a7921..a94d658b88e8 100644
--- a/fs/smb/common/smbfsctl.h
+++ b/fs/smb/common/smbfsctl.h
@@ -158,12 +158,6 @@
 #define IO_REPARSE_TAG_LX_CHR	     0x80000025
 #define IO_REPARSE_TAG_LX_BLK	     0x80000026
 
-#define IO_REPARSE_TAG_LX_SYMLINK_LE	cpu_to_le32(0xA000001D)
-#define IO_REPARSE_TAG_AF_UNIX_LE	cpu_to_le32(0x80000023)
-#define IO_REPARSE_TAG_LX_FIFO_LE	cpu_to_le32(0x80000024)
-#define IO_REPARSE_TAG_LX_CHR_LE	cpu_to_le32(0x80000025)
-#define IO_REPARSE_TAG_LX_BLK_LE	cpu_to_le32(0x80000026)
-
 /* fsctl flags */
 /* If Flags is set to this value, the request is an FSCTL not ioctl request */
 #define SMB2_0_IOCTL_IS_FSCTL		0x00000001
diff --git a/fs/smb/server/glob.h b/fs/smb/server/glob.h
index 5b8f3e0ebdb3..d528b20b37a8 100644
--- a/fs/smb/server/glob.h
+++ b/fs/smb/server/glob.h
@@ -12,8 +12,6 @@
 #include "unicode.h"
 #include "vfs_cache.h"
 
-#define KSMBD_VERSION	"3.4.2"
-
 extern int ksmbd_debug_types;
 
 #define KSMBD_DEBUG_SMB		BIT(0)
diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h
index 0ebf91ffa236..686b321c5a8b 100644
--- a/fs/smb/server/ksmbd_netlink.h
+++ b/fs/smb/server/ksmbd_netlink.h
@@ -75,6 +75,7 @@ struct ksmbd_heartbeat {
 #define KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION	BIT(1)
 #define KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL	BIT(2)
 #define KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF	BIT(3)
+#define KSMBD_GLOBAL_FLAG_DURABLE_HANDLE	BIT(4)
 
 /*
  * IPC request for ksmbd server startup
@@ -166,7 +167,8 @@ struct ksmbd_share_config_response {
 	__u16	force_uid;
 	__u16	force_gid;
 	__s8	share_name[KSMBD_REQ_MAX_SHARE_NAME];
-	__u32	reserved[112];		/* Reserved room */
+	__u32	reserved[111];		/* Reserved room */
+	__u32	payload_sz;
 	__u32	veto_list_sz;
 	__s8	____payload[];
 };
diff --git a/fs/smb/server/mgmt/share_config.c b/fs/smb/server/mgmt/share_config.c
index 328a412259dc..a2f0a2edceb8 100644
--- a/fs/smb/server/mgmt/share_config.c
+++ b/fs/smb/server/mgmt/share_config.c
@@ -158,7 +158,12 @@ static struct ksmbd_share_config *share_config_request(struct unicode_map *um,
 	share->name = kstrdup(name, GFP_KERNEL);
 
 	if (!test_share_config_flag(share, KSMBD_SHARE_FLAG_PIPE)) {
-		share->path = kstrdup(ksmbd_share_config_path(resp),
+		int path_len = PATH_MAX;
+
+		if (resp->payload_sz)
+			path_len = resp->payload_sz - resp->veto_list_sz;
+
+		share->path = kstrndup(ksmbd_share_config_path(resp), path_len,
 				      GFP_KERNEL);
 		if (share->path)
 			share->path_sz = strlen(share->path);
diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c
index 15f68ee05089..aec0a7a12405 100644
--- a/fs/smb/server/mgmt/user_session.c
+++ b/fs/smb/server/mgmt/user_session.c
@@ -156,7 +156,7 @@ void ksmbd_session_destroy(struct ksmbd_session *sess)
 	kfree(sess);
 }
 
-static struct ksmbd_session *__session_lookup(unsigned long long id)
+struct ksmbd_session *__session_lookup(unsigned long long id)
 {
 	struct ksmbd_session *sess;
 
@@ -305,6 +305,32 @@ struct preauth_session *ksmbd_preauth_session_alloc(struct ksmbd_conn *conn,
 	return sess;
 }
 
+void destroy_previous_session(struct ksmbd_conn *conn,
+			      struct ksmbd_user *user, u64 id)
+{
+	struct ksmbd_session *prev_sess;
+	struct ksmbd_user *prev_user;
+
+	down_write(&sessions_table_lock);
+	down_write(&conn->session_lock);
+	prev_sess = __session_lookup(id);
+	if (!prev_sess || prev_sess->state == SMB2_SESSION_EXPIRED)
+		goto out;
+
+	prev_user = prev_sess->user;
+	if (!prev_user ||
+	    strcmp(user->name, prev_user->name) ||
+	    user->passkey_sz != prev_user->passkey_sz ||
+	    memcmp(user->passkey, prev_user->passkey, user->passkey_sz))
+		goto out;
+
+	ksmbd_destroy_file_table(&prev_sess->file_table);
+	prev_sess->state = SMB2_SESSION_EXPIRED;
+out:
+	up_write(&conn->session_lock);
+	up_write(&sessions_table_lock);
+}
+
 static bool ksmbd_preauth_session_id_match(struct preauth_session *sess,
 					   unsigned long long id)
 {
diff --git a/fs/smb/server/mgmt/user_session.h b/fs/smb/server/mgmt/user_session.h
index 63cb08fffde8..dc9fded2cd43 100644
--- a/fs/smb/server/mgmt/user_session.h
+++ b/fs/smb/server/mgmt/user_session.h
@@ -88,8 +88,11 @@ struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn,
 int ksmbd_session_register(struct ksmbd_conn *conn,
 			   struct ksmbd_session *sess);
 void ksmbd_sessions_deregister(struct ksmbd_conn *conn);
+struct ksmbd_session *__session_lookup(unsigned long long id);
 struct ksmbd_session *ksmbd_session_lookup_all(struct ksmbd_conn *conn,
 					       unsigned long long id);
+void destroy_previous_session(struct ksmbd_conn *conn,
+			      struct ksmbd_user *user, u64 id);
 struct preauth_session *ksmbd_preauth_session_alloc(struct ksmbd_conn *conn,
 						    u64 sess_id);
 struct preauth_session *ksmbd_preauth_session_lookup(struct ksmbd_conn *conn,
diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c
index 53dfaac425c6..4978edfb15f9 100644
--- a/fs/smb/server/oplock.c
+++ b/fs/smb/server/oplock.c
@@ -159,7 +159,8 @@ static struct oplock_info *opinfo_get_list(struct ksmbd_inode *ci)
 	opinfo = list_first_or_null_rcu(&ci->m_op_list, struct oplock_info,
 					op_entry);
 	if (opinfo) {
-		if (!atomic_inc_not_zero(&opinfo->refcount))
+		if (opinfo->conn == NULL ||
+		    !atomic_inc_not_zero(&opinfo->refcount))
 			opinfo = NULL;
 		else {
 			atomic_inc(&opinfo->conn->r_count);
@@ -527,7 +528,7 @@ static struct oplock_info *same_client_has_lease(struct ksmbd_inode *ci,
 	 */
 	read_lock(&ci->m_lock);
 	list_for_each_entry(opinfo, &ci->m_op_list, op_entry) {
-		if (!opinfo->is_lease)
+		if (!opinfo->is_lease || !opinfo->conn)
 			continue;
 		read_unlock(&ci->m_lock);
 		lease = opinfo->o_lease;
@@ -641,7 +642,7 @@ static void __smb2_oplock_break_noti(struct work_struct *wk)
 	struct smb2_hdr *rsp_hdr;
 	struct ksmbd_file *fp;
 
-	fp = ksmbd_lookup_durable_fd(br_info->fid);
+	fp = ksmbd_lookup_global_fd(br_info->fid);
 	if (!fp)
 		goto out;
 
@@ -1106,7 +1107,7 @@ void smb_send_parent_lease_break_noti(struct ksmbd_file *fp,
 
 	read_lock(&p_ci->m_lock);
 	list_for_each_entry(opinfo, &p_ci->m_op_list, op_entry) {
-		if (!opinfo->is_lease)
+		if (opinfo->conn == NULL || !opinfo->is_lease)
 			continue;
 
 		if (opinfo->o_lease->state != SMB2_OPLOCK_LEVEL_NONE &&
@@ -1142,7 +1143,7 @@ void smb_lazy_parent_lease_break_close(struct ksmbd_file *fp)
 	opinfo = rcu_dereference(fp->f_opinfo);
 	rcu_read_unlock();
 
-	if (!opinfo->is_lease || opinfo->o_lease->version != 2)
+	if (!opinfo || !opinfo->is_lease || opinfo->o_lease->version != 2)
 		return;
 
 	p_ci = ksmbd_inode_lookup_lock(fp->filp->f_path.dentry->d_parent);
@@ -1151,7 +1152,7 @@ void smb_lazy_parent_lease_break_close(struct ksmbd_file *fp)
 
 	read_lock(&p_ci->m_lock);
 	list_for_each_entry(opinfo, &p_ci->m_op_list, op_entry) {
-		if (!opinfo->is_lease)
+		if (opinfo->conn == NULL || !opinfo->is_lease)
 			continue;
 
 		if (opinfo->o_lease->state != SMB2_OPLOCK_LEVEL_NONE) {
@@ -1361,6 +1362,9 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp,
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(brk_op, &ci->m_op_list, op_entry) {
+		if (brk_op->conn == NULL)
+			continue;
+
 		if (!atomic_inc_not_zero(&brk_op->refcount))
 			continue;
 
@@ -1496,11 +1500,10 @@ void create_lease_buf(u8 *rbuf, struct lease *lease)
 /**
  * parse_lease_state() - parse lease context containted in file open request
  * @open_req:	buffer containing smb2 file open(create) request
- * @is_dir:	whether leasing file is directory
  *
  * Return:  oplock state, -ENOENT if create lease context not found
  */
-struct lease_ctx_info *parse_lease_state(void *open_req, bool is_dir)
+struct lease_ctx_info *parse_lease_state(void *open_req)
 {
 	struct create_context *cc;
 	struct smb2_create_req *req = (struct smb2_create_req *)open_req;
@@ -1518,12 +1521,7 @@ struct lease_ctx_info *parse_lease_state(void *open_req, bool is_dir)
 		struct create_lease_v2 *lc = (struct create_lease_v2 *)cc;
 
 		memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE);
-		if (is_dir) {
-			lreq->req_state = lc->lcontext.LeaseState &
-				~SMB2_LEASE_WRITE_CACHING_LE;
-			lreq->is_dir = true;
-		} else
-			lreq->req_state = lc->lcontext.LeaseState;
+		lreq->req_state = lc->lcontext.LeaseState;
 		lreq->flags = lc->lcontext.LeaseFlags;
 		lreq->epoch = lc->lcontext.Epoch;
 		lreq->duration = lc->lcontext.LeaseDuration;
@@ -1646,6 +1644,8 @@ void create_durable_v2_rsp_buf(char *cc, struct ksmbd_file *fp)
 	buf->Name[3] = 'Q';
 
 	buf->Timeout = cpu_to_le32(fp->durable_timeout);
+	if (fp->is_persistent)
+		buf->Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT);
 }
 
 /**
@@ -1813,3 +1813,71 @@ out:
 	read_unlock(&lease_list_lock);
 	return ret_op;
 }
+
+int smb2_check_durable_oplock(struct ksmbd_conn *conn,
+			      struct ksmbd_share_config *share,
+			      struct ksmbd_file *fp,
+			      struct lease_ctx_info *lctx,
+			      char *name)
+{
+	struct oplock_info *opinfo = opinfo_get(fp);
+	int ret = 0;
+
+	if (!opinfo)
+		return 0;
+
+	if (opinfo->is_lease == false) {
+		if (lctx) {
+			pr_err("create context include lease\n");
+			ret = -EBADF;
+			goto out;
+		}
+
+		if (opinfo->level != SMB2_OPLOCK_LEVEL_BATCH) {
+			pr_err("oplock level is not equal to SMB2_OPLOCK_LEVEL_BATCH\n");
+			ret = -EBADF;
+		}
+
+		goto out;
+	}
+
+	if (memcmp(conn->ClientGUID, fp->client_guid,
+				SMB2_CLIENT_GUID_SIZE)) {
+		ksmbd_debug(SMB, "Client guid of fp is not equal to the one of connection\n");
+		ret = -EBADF;
+		goto out;
+	}
+
+	if (!lctx) {
+		ksmbd_debug(SMB, "create context does not include lease\n");
+		ret = -EBADF;
+		goto out;
+	}
+
+	if (memcmp(opinfo->o_lease->lease_key, lctx->lease_key,
+				SMB2_LEASE_KEY_SIZE)) {
+		ksmbd_debug(SMB,
+			    "lease key of fp does not match lease key in create context\n");
+		ret = -EBADF;
+		goto out;
+	}
+
+	if (!(opinfo->o_lease->state & SMB2_LEASE_HANDLE_CACHING_LE)) {
+		ksmbd_debug(SMB, "lease state does not contain SMB2_LEASE_HANDLE_CACHING\n");
+		ret = -EBADF;
+		goto out;
+	}
+
+	if (opinfo->o_lease->version != lctx->version) {
+		ksmbd_debug(SMB,
+			    "lease version of fp does not match the one in create context\n");
+		ret = -EBADF;
+		goto out;
+	}
+
+	if (!ksmbd_inode_pending_delete(fp))
+		ret = ksmbd_validate_name_reconnect(share, fp, name);
+out:
+	opinfo_put(opinfo);
+	return ret;
+}
diff --git a/fs/smb/server/oplock.h b/fs/smb/server/oplock.h
index 5b93ea9196c0..e9da63f25b20 100644
--- a/fs/smb/server/oplock.h
+++ b/fs/smb/server/oplock.h
@@ -111,7 +111,7 @@ void opinfo_put(struct oplock_info *opinfo);
 
 /* Lease related functions */
 void create_lease_buf(u8 *rbuf, struct lease *lease);
-struct lease_ctx_info *parse_lease_state(void *open_req, bool is_dir);
+struct lease_ctx_info *parse_lease_state(void *open_req);
 __u8 smb2_map_lease_to_oplock(__le32 lease_state);
 int lease_read_to_write(struct oplock_info *opinfo);
 
@@ -130,4 +130,9 @@ void destroy_lease_table(struct ksmbd_conn *conn);
 void smb_send_parent_lease_break_noti(struct ksmbd_file *fp,
 				      struct lease_ctx_info *lctx);
 void smb_lazy_parent_lease_break_close(struct ksmbd_file *fp);
+int smb2_check_durable_oplock(struct ksmbd_conn *conn,
+			      struct ksmbd_share_config *share,
+			      struct ksmbd_file *fp,
+			      struct lease_ctx_info *lctx,
+			      char *name);
 #endif /* __KSMBD_OPLOCK_H */
diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c
index 3079e607c5fe..c0788188aa82 100644
--- a/fs/smb/server/server.c
+++ b/fs/smb/server/server.c
@@ -625,7 +625,6 @@ static void __exit ksmbd_server_exit(void)
 }
 
 MODULE_AUTHOR("Namjae Jeon <linkinjeon@kernel.org>");
-MODULE_VERSION(KSMBD_VERSION);
 MODULE_DESCRIPTION("Linux kernel CIFS/SMB SERVER");
 MODULE_LICENSE("GPL");
 MODULE_SOFTDEP("pre: ecb");
diff --git a/fs/smb/server/smb2misc.c b/fs/smb/server/smb2misc.c
index 03dded29a980..727cb49926ee 100644
--- a/fs/smb/server/smb2misc.c
+++ b/fs/smb/server/smb2misc.c
@@ -101,13 +101,17 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len,
 		*len = le16_to_cpu(((struct smb2_sess_setup_req *)hdr)->SecurityBufferLength);
 		break;
 	case SMB2_TREE_CONNECT:
-		*off = le16_to_cpu(((struct smb2_tree_connect_req *)hdr)->PathOffset);
+		*off = max_t(unsigned short int,
+			     le16_to_cpu(((struct smb2_tree_connect_req *)hdr)->PathOffset),
+			     offsetof(struct smb2_tree_connect_req, Buffer));
 		*len = le16_to_cpu(((struct smb2_tree_connect_req *)hdr)->PathLength);
 		break;
 	case SMB2_CREATE:
 	{
 		unsigned short int name_off =
-			le16_to_cpu(((struct smb2_create_req *)hdr)->NameOffset);
+			max_t(unsigned short int,
+			      le16_to_cpu(((struct smb2_create_req *)hdr)->NameOffset),
+			      offsetof(struct smb2_create_req, Buffer));
 		unsigned short int name_len =
 			le16_to_cpu(((struct smb2_create_req *)hdr)->NameLength);
 
@@ -128,11 +132,15 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len,
 		break;
 	}
 	case SMB2_QUERY_INFO:
-		*off = le16_to_cpu(((struct smb2_query_info_req *)hdr)->InputBufferOffset);
+		*off = max_t(unsigned int,
+			     le16_to_cpu(((struct smb2_query_info_req *)hdr)->InputBufferOffset),
+			     offsetof(struct smb2_query_info_req, Buffer));
 		*len = le32_to_cpu(((struct smb2_query_info_req *)hdr)->InputBufferLength);
 		break;
 	case SMB2_SET_INFO:
-		*off = le16_to_cpu(((struct smb2_set_info_req *)hdr)->BufferOffset);
+		*off = max_t(unsigned int,
+			     le16_to_cpu(((struct smb2_set_info_req *)hdr)->BufferOffset),
+			     offsetof(struct smb2_set_info_req, Buffer));
 		*len = le32_to_cpu(((struct smb2_set_info_req *)hdr)->BufferLength);
 		break;
 	case SMB2_READ:
@@ -142,7 +150,7 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len,
 	case SMB2_WRITE:
 		if (((struct smb2_write_req *)hdr)->DataOffset ||
 		    ((struct smb2_write_req *)hdr)->Length) {
-			*off = max_t(unsigned int,
+			*off = max_t(unsigned short int,
 				     le16_to_cpu(((struct smb2_write_req *)hdr)->DataOffset),
 				     offsetof(struct smb2_write_req, Buffer));
 			*len = le32_to_cpu(((struct smb2_write_req *)hdr)->Length);
@@ -153,7 +161,9 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len,
 		*len = le16_to_cpu(((struct smb2_write_req *)hdr)->WriteChannelInfoLength);
 		break;
 	case SMB2_QUERY_DIRECTORY:
-		*off = le16_to_cpu(((struct smb2_query_directory_req *)hdr)->FileNameOffset);
+		*off = max_t(unsigned short int,
+			     le16_to_cpu(((struct smb2_query_directory_req *)hdr)->FileNameOffset),
+			     offsetof(struct smb2_query_directory_req, Buffer));
 		*len = le16_to_cpu(((struct smb2_query_directory_req *)hdr)->FileNameLength);
 		break;
 	case SMB2_LOCK:
@@ -168,7 +178,9 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len,
 		break;
 	}
 	case SMB2_IOCTL:
-		*off = le32_to_cpu(((struct smb2_ioctl_req *)hdr)->InputOffset);
+		*off = max_t(unsigned int,
+			     le32_to_cpu(((struct smb2_ioctl_req *)hdr)->InputOffset),
+			     offsetof(struct smb2_ioctl_req, Buffer));
 		*len = le32_to_cpu(((struct smb2_ioctl_req *)hdr)->InputCount);
 		break;
 	default:
diff --git a/fs/smb/server/smb2ops.c b/fs/smb/server/smb2ops.c
index 27a9dce3e03a..606aa3c5189a 100644
--- a/fs/smb/server/smb2ops.c
+++ b/fs/smb/server/smb2ops.c
@@ -228,6 +228,11 @@ void init_smb3_0_server(struct ksmbd_conn *conn)
 	    conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION)
 		conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION;
 
+	if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION ||
+	    (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) &&
+	     conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION))
+		conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION;
+
 	if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL)
 		conn->vals->capabilities |= SMB2_GLOBAL_CAP_MULTI_CHANNEL;
 }
@@ -256,6 +261,9 @@ void init_smb3_02_server(struct ksmbd_conn *conn)
 
 	if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL)
 		conn->vals->capabilities |= SMB2_GLOBAL_CAP_MULTI_CHANNEL;
+
+	if (server_conf.flags & KSMBD_GLOBAL_FLAG_DURABLE_HANDLE)
+		conn->vals->capabilities |= SMB2_GLOBAL_CAP_PERSISTENT_HANDLES;
 }
 
 /**
@@ -275,14 +283,12 @@ int init_smb3_11_server(struct ksmbd_conn *conn)
 		conn->vals->capabilities |= SMB2_GLOBAL_CAP_LEASING |
 			SMB2_GLOBAL_CAP_DIRECTORY_LEASING;
 
-	if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION ||
-	    (!(server_conf.flags & KSMBD_GLOBAL_FLAG_SMB2_ENCRYPTION_OFF) &&
-	     conn->cli_cap & SMB2_GLOBAL_CAP_ENCRYPTION))
-		conn->vals->capabilities |= SMB2_GLOBAL_CAP_ENCRYPTION;
-
 	if (server_conf.flags & KSMBD_GLOBAL_FLAG_SMB3_MULTICHANNEL)
 		conn->vals->capabilities |= SMB2_GLOBAL_CAP_MULTI_CHANNEL;
 
+	if (server_conf.flags & KSMBD_GLOBAL_FLAG_DURABLE_HANDLE)
+		conn->vals->capabilities |= SMB2_GLOBAL_CAP_PERSISTENT_HANDLES;
+
 	INIT_LIST_HEAD(&conn->preauth_sess_table);
 	return 0;
 }
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 0c97d3c86072..5723bbf372d7 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -607,30 +607,6 @@ int smb2_check_user_session(struct ksmbd_work *work)
 	return -ENOENT;
 }
 
-static void destroy_previous_session(struct ksmbd_conn *conn,
-				     struct ksmbd_user *user, u64 id)
-{
-	struct ksmbd_session *prev_sess = ksmbd_session_lookup_slowpath(id);
-	struct ksmbd_user *prev_user;
-	struct channel *chann;
-	long index;
-
-	if (!prev_sess)
-		return;
-
-	prev_user = prev_sess->user;
-
-	if (!prev_user ||
-	    strcmp(user->name, prev_user->name) ||
-	    user->passkey_sz != prev_user->passkey_sz ||
-	    memcmp(user->passkey, prev_user->passkey, user->passkey_sz))
-		return;
-
-	prev_sess->state = SMB2_SESSION_EXPIRED;
-	xa_for_each(&prev_sess->ksmbd_chann_list, index, chann)
-		ksmbd_conn_set_exiting(chann->conn);
-}
-
 /**
  * smb2_get_name() - get filename string from on the wire smb format
  * @src:	source buffer
@@ -1951,7 +1927,7 @@ int smb2_tree_connect(struct ksmbd_work *work)
 
 	WORK_BUFFERS(work, req, rsp);
 
-	treename = smb_strndup_from_utf16(req->Buffer,
+	treename = smb_strndup_from_utf16((char *)req + le16_to_cpu(req->PathOffset),
 					  le16_to_cpu(req->PathLength), true,
 					  conn->local_nls);
 	if (IS_ERR(treename)) {
@@ -2642,6 +2618,165 @@ static void ksmbd_acls_fattr(struct smb_fattr *fattr,
 	}
 }
 
+enum {
+	DURABLE_RECONN_V2 = 1,
+	DURABLE_RECONN,
+	DURABLE_REQ_V2,
+	DURABLE_REQ,
+};
+
+struct durable_info {
+	struct ksmbd_file *fp;
+	unsigned short int type;
+	bool persistent;
+	bool reconnected;
+	unsigned int timeout;
+	char *CreateGuid;
+};
+
+static int parse_durable_handle_context(struct ksmbd_work *work,
+					struct smb2_create_req *req,
+					struct lease_ctx_info *lc,
+					struct durable_info *dh_info)
+{
+	struct ksmbd_conn *conn = work->conn;
+	struct create_context *context;
+	int dh_idx, err = 0;
+	u64 persistent_id = 0;
+	int req_op_level;
+	static const char * const durable_arr[] = {"DH2C", "DHnC", "DH2Q", "DHnQ"};
+
+	req_op_level = req->RequestedOplockLevel;
+	for (dh_idx = DURABLE_RECONN_V2; dh_idx <= ARRAY_SIZE(durable_arr);
+	     dh_idx++) {
+		context = smb2_find_context_vals(req, durable_arr[dh_idx - 1], 4);
+		if (IS_ERR(context)) {
+			err = PTR_ERR(context);
+			goto out;
+		}
+		if (!context)
+			continue;
+
+		switch (dh_idx) {
+		case DURABLE_RECONN_V2:
+		{
+			struct create_durable_reconn_v2_req *recon_v2;
+
+			if (dh_info->type == DURABLE_RECONN ||
+			    dh_info->type == DURABLE_REQ_V2) {
+				err = -EINVAL;
+				goto out;
+			}
+
+			recon_v2 = (struct create_durable_reconn_v2_req *)context;
+			persistent_id = recon_v2->Fid.PersistentFileId;
+			dh_info->fp = ksmbd_lookup_durable_fd(persistent_id);
+			if (!dh_info->fp) {
+				ksmbd_debug(SMB, "Failed to get durable handle state\n");
+				err = -EBADF;
+				goto out;
+			}
+
+			if (memcmp(dh_info->fp->create_guid, recon_v2->CreateGuid,
+				   SMB2_CREATE_GUID_SIZE)) {
+				err = -EBADF;
+				ksmbd_put_durable_fd(dh_info->fp);
+				goto out;
+			}
+
+			dh_info->type = dh_idx;
+			dh_info->reconnected = true;
+			ksmbd_debug(SMB,
+				"reconnect v2 Persistent-id from reconnect = %llu\n",
+					persistent_id);
+			break;
+		}
+		case DURABLE_RECONN:
+		{
+			struct create_durable_reconn_req *recon;
+
+			if (dh_info->type == DURABLE_RECONN_V2 ||
+			    dh_info->type == DURABLE_REQ_V2) {
+				err = -EINVAL;
+				goto out;
+			}
+
+			recon = (struct create_durable_reconn_req *)context;
+			persistent_id = recon->Data.Fid.PersistentFileId;
+			dh_info->fp = ksmbd_lookup_durable_fd(persistent_id);
+			if (!dh_info->fp) {
+				ksmbd_debug(SMB, "Failed to get durable handle state\n");
+				err = -EBADF;
+				goto out;
+			}
+
+			dh_info->type = dh_idx;
+			dh_info->reconnected = true;
+			ksmbd_debug(SMB, "reconnect Persistent-id from reconnect = %llu\n",
+				    persistent_id);
+			break;
+		}
+		case DURABLE_REQ_V2:
+		{
+			struct create_durable_req_v2 *durable_v2_blob;
+
+			if (dh_info->type == DURABLE_RECONN ||
+			    dh_info->type == DURABLE_RECONN_V2) {
+				err = -EINVAL;
+				goto out;
+			}
+
+			durable_v2_blob =
+				(struct create_durable_req_v2 *)context;
+			ksmbd_debug(SMB, "Request for durable v2 open\n");
+			dh_info->fp = ksmbd_lookup_fd_cguid(durable_v2_blob->CreateGuid);
+			if (dh_info->fp) {
+				if (!memcmp(conn->ClientGUID, dh_info->fp->client_guid,
+					    SMB2_CLIENT_GUID_SIZE)) {
+					if (!(req->hdr.Flags & SMB2_FLAGS_REPLAY_OPERATION)) {
+						err = -ENOEXEC;
+						goto out;
+					}
+
+					dh_info->fp->conn = conn;
+					dh_info->reconnected = true;
+					goto out;
+				}
+			}
+
+			if (((lc && (lc->req_state & SMB2_LEASE_HANDLE_CACHING_LE)) ||
+			     req_op_level == SMB2_OPLOCK_LEVEL_BATCH)) {
+				dh_info->CreateGuid =
+					durable_v2_blob->CreateGuid;
+				dh_info->persistent =
+					le32_to_cpu(durable_v2_blob->Flags);
+				dh_info->timeout =
+					le32_to_cpu(durable_v2_blob->Timeout);
+				dh_info->type = dh_idx;
+			}
+			break;
+		}
+		case DURABLE_REQ:
+			if (dh_info->type == DURABLE_RECONN)
+				goto out;
+			if (dh_info->type == DURABLE_RECONN_V2 ||
+			    dh_info->type == DURABLE_REQ_V2) {
+				err = -EINVAL;
+				goto out;
+			}
+
+			if (((lc && (lc->req_state & SMB2_LEASE_HANDLE_CACHING_LE)) ||
+			     req_op_level == SMB2_OPLOCK_LEVEL_BATCH)) {
+				ksmbd_debug(SMB, "Request for durable open\n");
+				dh_info->type = dh_idx;
+			}
+		}
+	}
+
+out:
+	return err;
+}
+
 /**
  * smb2_open() - handler for smb file open request
  * @work:	smb work containing request buffer
@@ -2665,6 +2800,7 @@ int smb2_open(struct ksmbd_work *work)
 	struct lease_ctx_info *lc = NULL;
 	struct create_ea_buf_req *ea_buf = NULL;
 	struct oplock_info *opinfo;
+	struct durable_info dh_info = {0};
 	__le32 *next_ptr = NULL;
 	int req_op_level = 0, open_flags = 0, may_flags = 0, file_info = 0;
 	int rc = 0;
@@ -2704,7 +2840,7 @@ int smb2_open(struct ksmbd_work *work)
 			goto err_out2;
 		}
 
-		name = smb2_get_name(req->Buffer,
+		name = smb2_get_name((char *)req + le16_to_cpu(req->NameOffset),
 				     le16_to_cpu(req->NameLength),
 				     work->conn->local_nls);
 		if (IS_ERR(name)) {
@@ -2745,6 +2881,49 @@ int smb2_open(struct ksmbd_work *work)
 		}
 	}
 
+	req_op_level = req->RequestedOplockLevel;
+
+	if (server_conf.flags & KSMBD_GLOBAL_FLAG_DURABLE_HANDLE &&
+	    req->CreateContextsOffset) {
+		lc = parse_lease_state(req);
+		rc = parse_durable_handle_context(work, req, lc, &dh_info);
+		if (rc) {
+			ksmbd_debug(SMB, "error parsing durable handle context\n");
+			goto err_out2;
+		}
+
+		if (dh_info.reconnected == true) {
+			rc = smb2_check_durable_oplock(conn, share, dh_info.fp, lc, name);
+			if (rc) {
+				ksmbd_put_durable_fd(dh_info.fp);
+				goto err_out2;
+			}
+
+			rc = ksmbd_reopen_durable_fd(work, dh_info.fp);
+			if (rc) {
+				ksmbd_put_durable_fd(dh_info.fp);
+				goto err_out2;
+			}
+
+			if (ksmbd_override_fsids(work)) {
+				rc = -ENOMEM;
+				ksmbd_put_durable_fd(dh_info.fp);
+				goto err_out2;
+			}
+
+			fp = dh_info.fp;
+			file_info = FILE_OPENED;
+
+			rc = ksmbd_vfs_getattr(&fp->filp->f_path, &stat);
+			if (rc)
+				goto err_out2;
+
+			ksmbd_put_durable_fd(fp);
+			goto reconnected_fp;
+		}
+	} else if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE)
+		lc = parse_lease_state(req);
+
 	if (le32_to_cpu(req->ImpersonationLevel) > le32_to_cpu(IL_DELEGATE)) {
 		pr_err("Invalid impersonationlevel : 0x%x\n",
 		       le32_to_cpu(req->ImpersonationLevel));
@@ -3207,10 +3386,6 @@ int smb2_open(struct ksmbd_work *work)
 		need_truncate = 1;
 	}
 
-	req_op_level = req->RequestedOplockLevel;
-	if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE)
-		lc = parse_lease_state(req, S_ISDIR(file_inode(filp)->i_mode));
-
 	share_ret = ksmbd_smb_check_shared_mode(fp->filp, fp);
 	if (!test_share_config_flag(work->tcon->share_conf, KSMBD_SHARE_FLAG_OPLOCKS) ||
 	    (req_op_level == SMB2_OPLOCK_LEVEL_LEASE &&
@@ -3221,6 +3396,11 @@ int smb2_open(struct ksmbd_work *work)
 		}
 	} else {
 		if (req_op_level == SMB2_OPLOCK_LEVEL_LEASE) {
+			if (S_ISDIR(file_inode(filp)->i_mode)) {
+				lc->req_state &= ~SMB2_LEASE_WRITE_CACHING_LE;
+				lc->is_dir = true;
+			}
+
 			/*
 			 * Compare parent lease using parent key. If there is no
 			 * a lease that has same parent key, Send lease break
@@ -3317,6 +3497,24 @@ int smb2_open(struct ksmbd_work *work)
 
 	memcpy(fp->client_guid, conn->ClientGUID, SMB2_CLIENT_GUID_SIZE);
 
+	if (dh_info.type == DURABLE_REQ_V2 || dh_info.type == DURABLE_REQ) {
+		if (dh_info.type == DURABLE_REQ_V2 && dh_info.persistent)
+			fp->is_persistent = true;
+		else
+			fp->is_durable = true;
+
+		if (dh_info.type == DURABLE_REQ_V2) {
+			memcpy(fp->create_guid, dh_info.CreateGuid,
+					SMB2_CREATE_GUID_SIZE);
+			if (dh_info.timeout)
+				fp->durable_timeout = min(dh_info.timeout,
+						300000);
+			else
+				fp->durable_timeout = 60;
+		}
+	}
+
+reconnected_fp:
 	rsp->StructureSize = cpu_to_le16(89);
 	rcu_read_lock();
 	opinfo = rcu_dereference(fp->f_opinfo);
@@ -3403,6 +3601,33 @@ int smb2_open(struct ksmbd_work *work)
 		next_off = conn->vals->create_disk_id_size;
 	}
 
+	if (dh_info.type == DURABLE_REQ || dh_info.type == DURABLE_REQ_V2) {
+		struct create_context *durable_ccontext;
+
+		durable_ccontext = (struct create_context *)(rsp->Buffer +
+				le32_to_cpu(rsp->CreateContextsLength));
+		contxt_cnt++;
+		if (dh_info.type == DURABLE_REQ) {
+			create_durable_rsp_buf(rsp->Buffer +
+					le32_to_cpu(rsp->CreateContextsLength));
+			le32_add_cpu(&rsp->CreateContextsLength,
+					conn->vals->create_durable_size);
+			iov_len += conn->vals->create_durable_size;
+		} else {
+			create_durable_v2_rsp_buf(rsp->Buffer +
+					le32_to_cpu(rsp->CreateContextsLength),
+					fp);
+			le32_add_cpu(&rsp->CreateContextsLength,
+					conn->vals->create_durable_v2_size);
+			iov_len += conn->vals->create_durable_v2_size;
+		}
+
+		if (next_ptr)
+			*next_ptr = cpu_to_le32(next_off);
+		next_ptr = &durable_ccontext->Next;
+		next_off = conn->vals->create_durable_size;
+	}
+
 	if (posix_ctxt) {
 		contxt_cnt++;
 		create_posix_rsp_buf(rsp->Buffer +
@@ -3828,11 +4053,16 @@ static int process_query_dir_entries(struct smb2_query_dir_private *priv)
 		}
 
 		ksmbd_kstat.kstat = &kstat;
-		if (priv->info_level != FILE_NAMES_INFORMATION)
-			ksmbd_vfs_fill_dentry_attrs(priv->work,
-						    idmap,
-						    dent,
-						    &ksmbd_kstat);
+		if (priv->info_level != FILE_NAMES_INFORMATION) {
+			rc = ksmbd_vfs_fill_dentry_attrs(priv->work,
+							 idmap,
+							 dent,
+							 &ksmbd_kstat);
+			if (rc) {
+				dput(dent);
+				continue;
+			}
+		}
 
 		rc = smb2_populate_readdir_entry(priv->work->conn,
 						 priv->info_level,
@@ -4075,7 +4305,7 @@ int smb2_query_dir(struct ksmbd_work *work)
 	}
 
 	srch_flag = req->Flags;
-	srch_ptr = smb_strndup_from_utf16(req->Buffer,
+	srch_ptr = smb_strndup_from_utf16((char *)req + le16_to_cpu(req->FileNameOffset),
 					  le16_to_cpu(req->FileNameLength), 1,
 					  conn->local_nls);
 	if (IS_ERR(srch_ptr)) {
@@ -4335,7 +4565,8 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp,
 		    sizeof(struct smb2_ea_info_req))
 			return -EINVAL;
 
-		ea_req = (struct smb2_ea_info_req *)req->Buffer;
+		ea_req = (struct smb2_ea_info_req *)((char *)req +
+						     le16_to_cpu(req->InputBufferOffset));
 	} else {
 		/* need to send all EAs, if no specific EA is requested*/
 		if (le32_to_cpu(req->Flags) & SL_RETURN_SINGLE_ENTRY)
@@ -4480,6 +4711,7 @@ static int get_file_basic_info(struct smb2_query_info_rsp *rsp,
 	struct smb2_file_basic_info *basic_info;
 	struct kstat stat;
 	u64 time;
+	int ret;
 
 	if (!(fp->daccess & FILE_READ_ATTRIBUTES_LE)) {
 		pr_err("no right to read the attributes : 0x%x\n",
@@ -4487,9 +4719,12 @@ static int get_file_basic_info(struct smb2_query_info_rsp *rsp,
 		return -EACCES;
 	}
 
+	ret = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+			  AT_STATX_SYNC_AS_STAT);
+	if (ret)
+		return ret;
+
 	basic_info = (struct smb2_file_basic_info *)rsp->Buffer;
-	generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS,
-			 file_inode(fp->filp), &stat);
 	basic_info->CreationTime = cpu_to_le64(fp->create_time);
 	time = ksmbd_UnixTimeToNT(stat.atime);
 	basic_info->LastAccessTime = cpu_to_le64(time);
@@ -4504,27 +4739,31 @@ static int get_file_basic_info(struct smb2_query_info_rsp *rsp,
 	return 0;
 }
 
-static void get_file_standard_info(struct smb2_query_info_rsp *rsp,
-				   struct ksmbd_file *fp, void *rsp_org)
+static int get_file_standard_info(struct smb2_query_info_rsp *rsp,
+				  struct ksmbd_file *fp, void *rsp_org)
 {
 	struct smb2_file_standard_info *sinfo;
 	unsigned int delete_pending;
-	struct inode *inode;
 	struct kstat stat;
+	int ret;
 
-	inode = file_inode(fp->filp);
-	generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, inode, &stat);
+	ret = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+			  AT_STATX_SYNC_AS_STAT);
+	if (ret)
+		return ret;
 
 	sinfo = (struct smb2_file_standard_info *)rsp->Buffer;
 	delete_pending = ksmbd_inode_pending_delete(fp);
 
-	sinfo->AllocationSize = cpu_to_le64(inode->i_blocks << 9);
+	sinfo->AllocationSize = cpu_to_le64(stat.blocks << 9);
 	sinfo->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size);
 	sinfo->NumberOfLinks = cpu_to_le32(get_nlink(&stat) - delete_pending);
 	sinfo->DeletePending = delete_pending;
 	sinfo->Directory = S_ISDIR(stat.mode) ? 1 : 0;
 	rsp->OutputBufferLength =
 		cpu_to_le32(sizeof(struct smb2_file_standard_info));
+
+	return 0;
 }
 
 static void get_file_alignment_info(struct smb2_query_info_rsp *rsp,
@@ -4546,11 +4785,11 @@ static int get_file_all_info(struct ksmbd_work *work,
 	struct ksmbd_conn *conn = work->conn;
 	struct smb2_file_all_info *file_info;
 	unsigned int delete_pending;
-	struct inode *inode;
 	struct kstat stat;
 	int conv_len;
 	char *filename;
 	u64 time;
+	int ret;
 
 	if (!(fp->daccess & FILE_READ_ATTRIBUTES_LE)) {
 		ksmbd_debug(SMB, "no right to read the attributes : 0x%x\n",
@@ -4562,8 +4801,10 @@ static int get_file_all_info(struct ksmbd_work *work,
 	if (IS_ERR(filename))
 		return PTR_ERR(filename);
 
-	inode = file_inode(fp->filp);
-	generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, inode, &stat);
+	ret = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+			  AT_STATX_SYNC_AS_STAT);
+	if (ret)
+		return ret;
 
 	ksmbd_debug(SMB, "filename = %s\n", filename);
 	delete_pending = ksmbd_inode_pending_delete(fp);
@@ -4579,7 +4820,7 @@ static int get_file_all_info(struct ksmbd_work *work,
 	file_info->Attributes = fp->f_ci->m_fattr;
 	file_info->Pad1 = 0;
 	file_info->AllocationSize =
-		cpu_to_le64(inode->i_blocks << 9);
+		cpu_to_le64(stat.blocks << 9);
 	file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size);
 	file_info->NumberOfLinks =
 			cpu_to_le32(get_nlink(&stat) - delete_pending);
@@ -4623,10 +4864,10 @@ static void get_file_alternate_info(struct ksmbd_work *work,
 		cpu_to_le32(sizeof(struct smb2_file_alt_name_info) + conv_len);
 }
 
-static void get_file_stream_info(struct ksmbd_work *work,
-				 struct smb2_query_info_rsp *rsp,
-				 struct ksmbd_file *fp,
-				 void *rsp_org)
+static int get_file_stream_info(struct ksmbd_work *work,
+				struct smb2_query_info_rsp *rsp,
+				struct ksmbd_file *fp,
+				void *rsp_org)
 {
 	struct ksmbd_conn *conn = work->conn;
 	struct smb2_file_stream_info *file_info;
@@ -4637,9 +4878,13 @@ static void get_file_stream_info(struct ksmbd_work *work,
 	int nbytes = 0, streamlen, stream_name_len, next, idx = 0;
 	int buf_free_len;
 	struct smb2_query_info_req *req = ksmbd_req_buf_next(work);
+	int ret;
+
+	ret = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+			  AT_STATX_SYNC_AS_STAT);
+	if (ret)
+		return ret;
 
-	generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS,
-			 file_inode(fp->filp), &stat);
 	file_info = (struct smb2_file_stream_info *)rsp->Buffer;
 
 	buf_free_len =
@@ -4720,29 +4965,37 @@ out:
 	kvfree(xattr_list);
 
 	rsp->OutputBufferLength = cpu_to_le32(nbytes);
+
+	return 0;
 }
 
-static void get_file_internal_info(struct smb2_query_info_rsp *rsp,
-				   struct ksmbd_file *fp, void *rsp_org)
+static int get_file_internal_info(struct smb2_query_info_rsp *rsp,
+				  struct ksmbd_file *fp, void *rsp_org)
 {
 	struct smb2_file_internal_info *file_info;
 	struct kstat stat;
+	int ret;
+
+	ret = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+			  AT_STATX_SYNC_AS_STAT);
+	if (ret)
+		return ret;
 
-	generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS,
-			 file_inode(fp->filp), &stat);
 	file_info = (struct smb2_file_internal_info *)rsp->Buffer;
 	file_info->IndexNumber = cpu_to_le64(stat.ino);
 	rsp->OutputBufferLength =
 		cpu_to_le32(sizeof(struct smb2_file_internal_info));
+
+	return 0;
 }
 
 static int get_file_network_open_info(struct smb2_query_info_rsp *rsp,
 				      struct ksmbd_file *fp, void *rsp_org)
 {
 	struct smb2_file_ntwrk_info *file_info;
-	struct inode *inode;
 	struct kstat stat;
 	u64 time;
+	int ret;
 
 	if (!(fp->daccess & FILE_READ_ATTRIBUTES_LE)) {
 		pr_err("no right to read the attributes : 0x%x\n",
@@ -4750,10 +5003,12 @@ static int get_file_network_open_info(struct smb2_query_info_rsp *rsp,
 		return -EACCES;
 	}
 
-	file_info = (struct smb2_file_ntwrk_info *)rsp->Buffer;
+	ret = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+			  AT_STATX_SYNC_AS_STAT);
+	if (ret)
+		return ret;
 
-	inode = file_inode(fp->filp);
-	generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, inode, &stat);
+	file_info = (struct smb2_file_ntwrk_info *)rsp->Buffer;
 
 	file_info->CreationTime = cpu_to_le64(fp->create_time);
 	time = ksmbd_UnixTimeToNT(stat.atime);
@@ -4763,8 +5018,7 @@ static int get_file_network_open_info(struct smb2_query_info_rsp *rsp,
 	time = ksmbd_UnixTimeToNT(stat.ctime);
 	file_info->ChangeTime = cpu_to_le64(time);
 	file_info->Attributes = fp->f_ci->m_fattr;
-	file_info->AllocationSize =
-		cpu_to_le64(inode->i_blocks << 9);
+	file_info->AllocationSize = cpu_to_le64(stat.blocks << 9);
 	file_info->EndOfFile = S_ISDIR(stat.mode) ? 0 : cpu_to_le64(stat.size);
 	file_info->Reserved = cpu_to_le32(0);
 	rsp->OutputBufferLength =
@@ -4804,14 +5058,17 @@ static void get_file_mode_info(struct smb2_query_info_rsp *rsp,
 		cpu_to_le32(sizeof(struct smb2_file_mode_info));
 }
 
-static void get_file_compression_info(struct smb2_query_info_rsp *rsp,
-				      struct ksmbd_file *fp, void *rsp_org)
+static int get_file_compression_info(struct smb2_query_info_rsp *rsp,
+				     struct ksmbd_file *fp, void *rsp_org)
 {
 	struct smb2_file_comp_info *file_info;
 	struct kstat stat;
+	int ret;
 
-	generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS,
-			 file_inode(fp->filp), &stat);
+	ret = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+			  AT_STATX_SYNC_AS_STAT);
+	if (ret)
+		return ret;
 
 	file_info = (struct smb2_file_comp_info *)rsp->Buffer;
 	file_info->CompressedFileSize = cpu_to_le64(stat.blocks << 9);
@@ -4823,6 +5080,8 @@ static void get_file_compression_info(struct smb2_query_info_rsp *rsp,
 
 	rsp->OutputBufferLength =
 		cpu_to_le32(sizeof(struct smb2_file_comp_info));
+
+	return 0;
 }
 
 static int get_file_attribute_tag_info(struct smb2_query_info_rsp *rsp,
@@ -4844,7 +5103,7 @@ static int get_file_attribute_tag_info(struct smb2_query_info_rsp *rsp,
 	return 0;
 }
 
-static void find_file_posix_info(struct smb2_query_info_rsp *rsp,
+static int find_file_posix_info(struct smb2_query_info_rsp *rsp,
 				struct ksmbd_file *fp, void *rsp_org)
 {
 	struct smb311_posix_qinfo *file_info;
@@ -4852,24 +5111,31 @@ static void find_file_posix_info(struct smb2_query_info_rsp *rsp,
 	struct mnt_idmap *idmap = file_mnt_idmap(fp->filp);
 	vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode);
 	vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode);
+	struct kstat stat;
 	u64 time;
 	int out_buf_len = sizeof(struct smb311_posix_qinfo) + 32;
+	int ret;
+
+	ret = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+			  AT_STATX_SYNC_AS_STAT);
+	if (ret)
+		return ret;
 
 	file_info = (struct smb311_posix_qinfo *)rsp->Buffer;
 	file_info->CreationTime = cpu_to_le64(fp->create_time);
-	time = ksmbd_UnixTimeToNT(inode_get_atime(inode));
+	time = ksmbd_UnixTimeToNT(stat.atime);
 	file_info->LastAccessTime = cpu_to_le64(time);
-	time = ksmbd_UnixTimeToNT(inode_get_mtime(inode));
+	time = ksmbd_UnixTimeToNT(stat.mtime);
 	file_info->LastWriteTime = cpu_to_le64(time);
-	time = ksmbd_UnixTimeToNT(inode_get_ctime(inode));
+	time = ksmbd_UnixTimeToNT(stat.ctime);
 	file_info->ChangeTime = cpu_to_le64(time);
 	file_info->DosAttributes = fp->f_ci->m_fattr;
-	file_info->Inode = cpu_to_le64(inode->i_ino);
-	file_info->EndOfFile = cpu_to_le64(inode->i_size);
-	file_info->AllocationSize = cpu_to_le64(inode->i_blocks << 9);
-	file_info->HardLinks = cpu_to_le32(inode->i_nlink);
-	file_info->Mode = cpu_to_le32(inode->i_mode & 0777);
-	file_info->DeviceId = cpu_to_le32(inode->i_rdev);
+	file_info->Inode = cpu_to_le64(stat.ino);
+	file_info->EndOfFile = cpu_to_le64(stat.size);
+	file_info->AllocationSize = cpu_to_le64(stat.blocks << 9);
+	file_info->HardLinks = cpu_to_le32(stat.nlink);
+	file_info->Mode = cpu_to_le32(stat.mode & 0777);
+	file_info->DeviceId = cpu_to_le32(stat.rdev);
 
 	/*
 	 * Sids(32) contain two sids(Domain sid(16), UNIX group sid(16)).
@@ -4882,6 +5148,8 @@ static void find_file_posix_info(struct smb2_query_info_rsp *rsp,
 		  SIDUNIX_GROUP, (struct smb_sid *)&file_info->Sids[16]);
 
 	rsp->OutputBufferLength = cpu_to_le32(out_buf_len);
+
+	return 0;
 }
 
 static int smb2_get_info_file(struct ksmbd_work *work,
@@ -4930,7 +5198,7 @@ static int smb2_get_info_file(struct ksmbd_work *work,
 		break;
 
 	case FILE_STANDARD_INFORMATION:
-		get_file_standard_info(rsp, fp, work->response_buf);
+		rc = get_file_standard_info(rsp, fp, work->response_buf);
 		break;
 
 	case FILE_ALIGNMENT_INFORMATION:
@@ -4946,11 +5214,11 @@ static int smb2_get_info_file(struct ksmbd_work *work,
 		break;
 
 	case FILE_STREAM_INFORMATION:
-		get_file_stream_info(work, rsp, fp, work->response_buf);
+		rc = get_file_stream_info(work, rsp, fp, work->response_buf);
 		break;
 
 	case FILE_INTERNAL_INFORMATION:
-		get_file_internal_info(rsp, fp, work->response_buf);
+		rc = get_file_internal_info(rsp, fp, work->response_buf);
 		break;
 
 	case FILE_NETWORK_OPEN_INFORMATION:
@@ -4974,7 +5242,7 @@ static int smb2_get_info_file(struct ksmbd_work *work,
 		break;
 
 	case FILE_COMPRESSION_INFORMATION:
-		get_file_compression_info(rsp, fp, work->response_buf);
+		rc = get_file_compression_info(rsp, fp, work->response_buf);
 		break;
 
 	case FILE_ATTRIBUTE_TAG_INFORMATION:
@@ -4985,7 +5253,7 @@ static int smb2_get_info_file(struct ksmbd_work *work,
 			pr_err("client doesn't negotiate with SMB3.1.1 POSIX Extensions\n");
 			rc = -EOPNOTSUPP;
 		} else {
-			find_file_posix_info(rsp, fp, work->response_buf);
+			rc = find_file_posix_info(rsp, fp, work->response_buf);
 		}
 		break;
 	default:
@@ -5398,7 +5666,6 @@ int smb2_close(struct ksmbd_work *work)
 	struct smb2_close_rsp *rsp;
 	struct ksmbd_conn *conn = work->conn;
 	struct ksmbd_file *fp;
-	struct inode *inode;
 	u64 time;
 	int err = 0;
 
@@ -5453,24 +5720,33 @@ int smb2_close(struct ksmbd_work *work)
 	rsp->Reserved = 0;
 
 	if (req->Flags == SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB) {
+		struct kstat stat;
+		int ret;
+
 		fp = ksmbd_lookup_fd_fast(work, volatile_id);
 		if (!fp) {
 			err = -ENOENT;
 			goto out;
 		}
 
-		inode = file_inode(fp->filp);
+		ret = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+				  AT_STATX_SYNC_AS_STAT);
+		if (ret) {
+			ksmbd_fd_put(work, fp);
+			goto out;
+		}
+
 		rsp->Flags = SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB;
-		rsp->AllocationSize = S_ISDIR(inode->i_mode) ? 0 :
-			cpu_to_le64(inode->i_blocks << 9);
-		rsp->EndOfFile = cpu_to_le64(inode->i_size);
+		rsp->AllocationSize = S_ISDIR(stat.mode) ? 0 :
+			cpu_to_le64(stat.blocks << 9);
+		rsp->EndOfFile = cpu_to_le64(stat.size);
 		rsp->Attributes = fp->f_ci->m_fattr;
 		rsp->CreationTime = cpu_to_le64(fp->create_time);
-		time = ksmbd_UnixTimeToNT(inode_get_atime(inode));
+		time = ksmbd_UnixTimeToNT(stat.atime);
 		rsp->LastAccessTime = cpu_to_le64(time);
-		time = ksmbd_UnixTimeToNT(inode_get_mtime(inode));
+		time = ksmbd_UnixTimeToNT(stat.mtime);
 		rsp->LastWriteTime = cpu_to_le64(time);
-		time = ksmbd_UnixTimeToNT(inode_get_ctime(inode));
+		time = ksmbd_UnixTimeToNT(stat.ctime);
 		rsp->ChangeTime = cpu_to_le64(time);
 		ksmbd_fd_put(work, fp);
 	} else {
@@ -5581,8 +5857,9 @@ static int smb2_rename(struct ksmbd_work *work,
 	if (!file_info->ReplaceIfExists)
 		flags = RENAME_NOREPLACE;
 
-	smb_break_all_levII_oplock(work, fp, 0);
 	rc = ksmbd_vfs_rename(work, &fp->filp->f_path, new_name, flags);
+	if (!rc)
+		smb_break_all_levII_oplock(work, fp, 0);
 out:
 	kfree(new_name);
 	return rc;
@@ -5759,15 +6036,21 @@ static int set_file_allocation_info(struct ksmbd_work *work,
 
 	loff_t alloc_blks;
 	struct inode *inode;
+	struct kstat stat;
 	int rc;
 
 	if (!(fp->daccess & FILE_WRITE_DATA_LE))
 		return -EACCES;
 
+	rc = vfs_getattr(&fp->filp->f_path, &stat, STATX_BASIC_STATS,
+			 AT_STATX_SYNC_AS_STAT);
+	if (rc)
+		return rc;
+
 	alloc_blks = (le64_to_cpu(file_alloc_info->AllocationSize) + 511) >> 9;
 	inode = file_inode(fp->filp);
 
-	if (alloc_blks > inode->i_blocks) {
+	if (alloc_blks > stat.blocks) {
 		smb_break_all_levII_oplock(work, fp, 1);
 		rc = vfs_fallocate(fp->filp, FALLOC_FL_KEEP_SIZE, 0,
 				   alloc_blks * 512);
@@ -5775,7 +6058,7 @@ static int set_file_allocation_info(struct ksmbd_work *work,
 			pr_err("vfs_fallocate is failed : %d\n", rc);
 			return rc;
 		}
-	} else if (alloc_blks < inode->i_blocks) {
+	} else if (alloc_blks < stat.blocks) {
 		loff_t size;
 
 		/*
@@ -5930,6 +6213,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
 			      struct ksmbd_share_config *share)
 {
 	unsigned int buf_len = le32_to_cpu(req->BufferLength);
+	char *buffer = (char *)req + le16_to_cpu(req->BufferOffset);
 
 	switch (req->FileInfoClass) {
 	case FILE_BASIC_INFORMATION:
@@ -5937,7 +6221,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
 		if (buf_len < sizeof(struct smb2_file_basic_info))
 			return -EINVAL;
 
-		return set_file_basic_info(fp, (struct smb2_file_basic_info *)req->Buffer, share);
+		return set_file_basic_info(fp, (struct smb2_file_basic_info *)buffer, share);
 	}
 	case FILE_ALLOCATION_INFORMATION:
 	{
@@ -5945,7 +6229,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
 			return -EINVAL;
 
 		return set_file_allocation_info(work, fp,
-						(struct smb2_file_alloc_info *)req->Buffer);
+						(struct smb2_file_alloc_info *)buffer);
 	}
 	case FILE_END_OF_FILE_INFORMATION:
 	{
@@ -5953,7 +6237,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
 			return -EINVAL;
 
 		return set_end_of_file_info(work, fp,
-					    (struct smb2_file_eof_info *)req->Buffer);
+					    (struct smb2_file_eof_info *)buffer);
 	}
 	case FILE_RENAME_INFORMATION:
 	{
@@ -5961,7 +6245,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
 			return -EINVAL;
 
 		return set_rename_info(work, fp,
-				       (struct smb2_file_rename_info *)req->Buffer,
+				       (struct smb2_file_rename_info *)buffer,
 				       buf_len);
 	}
 	case FILE_LINK_INFORMATION:
@@ -5970,7 +6254,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
 			return -EINVAL;
 
 		return smb2_create_link(work, work->tcon->share_conf,
-					(struct smb2_file_link_info *)req->Buffer,
+					(struct smb2_file_link_info *)buffer,
 					buf_len, fp->filp,
 					work->conn->local_nls);
 	}
@@ -5980,7 +6264,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
 			return -EINVAL;
 
 		return set_file_disposition_info(fp,
-						 (struct smb2_file_disposition_info *)req->Buffer);
+						 (struct smb2_file_disposition_info *)buffer);
 	}
 	case FILE_FULL_EA_INFORMATION:
 	{
@@ -5993,7 +6277,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
 		if (buf_len < sizeof(struct smb2_ea_info))
 			return -EINVAL;
 
-		return smb2_set_ea((struct smb2_ea_info *)req->Buffer,
+		return smb2_set_ea((struct smb2_ea_info *)buffer,
 				   buf_len, &fp->filp->f_path, true);
 	}
 	case FILE_POSITION_INFORMATION:
@@ -6001,14 +6285,14 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
 		if (buf_len < sizeof(struct smb2_file_pos_info))
 			return -EINVAL;
 
-		return set_file_position_info(fp, (struct smb2_file_pos_info *)req->Buffer);
+		return set_file_position_info(fp, (struct smb2_file_pos_info *)buffer);
 	}
 	case FILE_MODE_INFORMATION:
 	{
 		if (buf_len < sizeof(struct smb2_file_mode_info))
 			return -EINVAL;
 
-		return set_file_mode_info(fp, (struct smb2_file_mode_info *)req->Buffer);
+		return set_file_mode_info(fp, (struct smb2_file_mode_info *)buffer);
 	}
 	}
 
@@ -6089,7 +6373,7 @@ int smb2_set_info(struct ksmbd_work *work)
 		}
 		rc = smb2_set_info_sec(fp,
 				       le32_to_cpu(req->AdditionalInformation),
-				       req->Buffer,
+				       (char *)req + le16_to_cpu(req->BufferOffset),
 				       le32_to_cpu(req->BufferLength));
 		ksmbd_revert_fsids(work);
 		break;
@@ -6764,10 +7048,10 @@ struct file_lock *smb_flock_init(struct file *f)
 
 	locks_init_lock(fl);
 
-	fl->fl_owner = f;
-	fl->fl_pid = current->tgid;
-	fl->fl_file = f;
-	fl->fl_flags = FL_POSIX;
+	fl->c.flc_owner = f;
+	fl->c.flc_pid = current->tgid;
+	fl->c.flc_file = f;
+	fl->c.flc_flags = FL_POSIX;
 	fl->fl_ops = NULL;
 	fl->fl_lmops = NULL;
 
@@ -6784,30 +7068,30 @@ static int smb2_set_flock_flags(struct file_lock *flock, int flags)
 	case SMB2_LOCKFLAG_SHARED:
 		ksmbd_debug(SMB, "received shared request\n");
 		cmd = F_SETLKW;
-		flock->fl_type = F_RDLCK;
-		flock->fl_flags |= FL_SLEEP;
+		flock->c.flc_type = F_RDLCK;
+		flock->c.flc_flags |= FL_SLEEP;
 		break;
 	case SMB2_LOCKFLAG_EXCLUSIVE:
 		ksmbd_debug(SMB, "received exclusive request\n");
 		cmd = F_SETLKW;
-		flock->fl_type = F_WRLCK;
-		flock->fl_flags |= FL_SLEEP;
+		flock->c.flc_type = F_WRLCK;
+		flock->c.flc_flags |= FL_SLEEP;
 		break;
 	case SMB2_LOCKFLAG_SHARED | SMB2_LOCKFLAG_FAIL_IMMEDIATELY:
 		ksmbd_debug(SMB,
 			    "received shared & fail immediately request\n");
 		cmd = F_SETLK;
-		flock->fl_type = F_RDLCK;
+		flock->c.flc_type = F_RDLCK;
 		break;
 	case SMB2_LOCKFLAG_EXCLUSIVE | SMB2_LOCKFLAG_FAIL_IMMEDIATELY:
 		ksmbd_debug(SMB,
 			    "received exclusive & fail immediately request\n");
 		cmd = F_SETLK;
-		flock->fl_type = F_WRLCK;
+		flock->c.flc_type = F_WRLCK;
 		break;
 	case SMB2_LOCKFLAG_UNLOCK:
 		ksmbd_debug(SMB, "received unlock request\n");
-		flock->fl_type = F_UNLCK;
+		flock->c.flc_type = F_UNLCK;
 		cmd = F_SETLK;
 		break;
 	}
@@ -6845,13 +7129,13 @@ static void smb2_remove_blocked_lock(void **argv)
 	struct file_lock *flock = (struct file_lock *)argv[0];
 
 	ksmbd_vfs_posix_lock_unblock(flock);
-	wake_up(&flock->fl_wait);
+	locks_wake_up(flock);
 }
 
 static inline bool lock_defer_pending(struct file_lock *fl)
 {
 	/* check pending lock waiters */
-	return waitqueue_active(&fl->fl_wait);
+	return waitqueue_active(&fl->c.flc_wait);
 }
 
 /**
@@ -6942,8 +7226,8 @@ int smb2_lock(struct ksmbd_work *work)
 		list_for_each_entry(cmp_lock, &lock_list, llist) {
 			if (cmp_lock->fl->fl_start <= flock->fl_start &&
 			    cmp_lock->fl->fl_end >= flock->fl_end) {
-				if (cmp_lock->fl->fl_type != F_UNLCK &&
-				    flock->fl_type != F_UNLCK) {
+				if (cmp_lock->fl->c.flc_type != F_UNLCK &&
+				    flock->c.flc_type != F_UNLCK) {
 					pr_err("conflict two locks in one request\n");
 					err = -EINVAL;
 					locks_free_lock(flock);
@@ -6991,12 +7275,12 @@ int smb2_lock(struct ksmbd_work *work)
 		list_for_each_entry(conn, &conn_list, conns_list) {
 			spin_lock(&conn->llist_lock);
 			list_for_each_entry_safe(cmp_lock, tmp2, &conn->lock_list, clist) {
-				if (file_inode(cmp_lock->fl->fl_file) !=
-				    file_inode(smb_lock->fl->fl_file))
+				if (file_inode(cmp_lock->fl->c.flc_file) !=
+				    file_inode(smb_lock->fl->c.flc_file))
 					continue;
 
-				if (smb_lock->fl->fl_type == F_UNLCK) {
-					if (cmp_lock->fl->fl_file == smb_lock->fl->fl_file &&
+				if (lock_is_unlock(smb_lock->fl)) {
+					if (cmp_lock->fl->c.flc_file == smb_lock->fl->c.flc_file &&
 					    cmp_lock->start == smb_lock->start &&
 					    cmp_lock->end == smb_lock->end &&
 					    !lock_defer_pending(cmp_lock->fl)) {
@@ -7013,7 +7297,7 @@ int smb2_lock(struct ksmbd_work *work)
 					continue;
 				}
 
-				if (cmp_lock->fl->fl_file == smb_lock->fl->fl_file) {
+				if (cmp_lock->fl->c.flc_file == smb_lock->fl->c.flc_file) {
 					if (smb_lock->flags & SMB2_LOCKFLAG_SHARED)
 						continue;
 				} else {
@@ -7055,7 +7339,7 @@ int smb2_lock(struct ksmbd_work *work)
 		}
 		up_read(&conn_list_lock);
 out_check_cl:
-		if (smb_lock->fl->fl_type == F_UNLCK && nolock) {
+		if (lock_is_unlock(smb_lock->fl) && nolock) {
 			pr_err("Try to unlock nolocked range\n");
 			rsp->hdr.Status = STATUS_RANGE_NOT_LOCKED;
 			goto out;
@@ -7179,7 +7463,7 @@ out:
 		struct file_lock *rlock = NULL;
 
 		rlock = smb_flock_init(filp);
-		rlock->fl_type = F_UNLCK;
+		rlock->c.flc_type = F_UNLCK;
 		rlock->fl_start = smb_lock->start;
 		rlock->fl_end = smb_lock->end;
 
@@ -7535,7 +7819,7 @@ static int fsctl_pipe_transceive(struct ksmbd_work *work, u64 id,
 				 struct smb2_ioctl_rsp *rsp)
 {
 	struct ksmbd_rpc_command *rpc_resp;
-	char *data_buf = (char *)&req->Buffer[0];
+	char *data_buf = (char *)req + le32_to_cpu(req->InputOffset);
 	int nbytes = 0;
 
 	rpc_resp = ksmbd_rpc_ioctl(work->sess, id, data_buf,
@@ -7648,6 +7932,7 @@ int smb2_ioctl(struct ksmbd_work *work)
 	u64 id = KSMBD_NO_FID;
 	struct ksmbd_conn *conn = work->conn;
 	int ret = 0;
+	char *buffer;
 
 	if (work->next_smb2_rcv_hdr_off) {
 		req = ksmbd_req_buf_next(work);
@@ -7670,6 +7955,8 @@ int smb2_ioctl(struct ksmbd_work *work)
 		goto out;
 	}
 
+	buffer = (char *)req + le32_to_cpu(req->InputOffset);
+
 	cnt_code = le32_to_cpu(req->CtlCode);
 	ret = smb2_calc_max_out_buf_len(work, 48,
 					le32_to_cpu(req->MaxOutputResponse));
@@ -7727,7 +8014,7 @@ int smb2_ioctl(struct ksmbd_work *work)
 		}
 
 		ret = fsctl_validate_negotiate_info(conn,
-			(struct validate_negotiate_info_req *)&req->Buffer[0],
+			(struct validate_negotiate_info_req *)buffer,
 			(struct validate_negotiate_info_rsp *)&rsp->Buffer[0],
 			in_buf_len);
 		if (ret < 0)
@@ -7780,7 +8067,7 @@ int smb2_ioctl(struct ksmbd_work *work)
 		rsp->VolatileFileId = req->VolatileFileId;
 		rsp->PersistentFileId = req->PersistentFileId;
 		fsctl_copychunk(work,
-				(struct copychunk_ioctl_req *)&req->Buffer[0],
+				(struct copychunk_ioctl_req *)buffer,
 				le32_to_cpu(req->CtlCode),
 				le32_to_cpu(req->InputCount),
 				req->VolatileFileId,
@@ -7793,8 +8080,7 @@ int smb2_ioctl(struct ksmbd_work *work)
 			goto out;
 		}
 
-		ret = fsctl_set_sparse(work, id,
-				       (struct file_sparse *)&req->Buffer[0]);
+		ret = fsctl_set_sparse(work, id, (struct file_sparse *)buffer);
 		if (ret < 0)
 			goto out;
 		break;
@@ -7817,7 +8103,7 @@ int smb2_ioctl(struct ksmbd_work *work)
 		}
 
 		zero_data =
-			(struct file_zero_data_information *)&req->Buffer[0];
+			(struct file_zero_data_information *)buffer;
 
 		off = le64_to_cpu(zero_data->FileOffset);
 		bfz = le64_to_cpu(zero_data->BeyondFinalZero);
@@ -7848,7 +8134,7 @@ int smb2_ioctl(struct ksmbd_work *work)
 		}
 
 		ret = fsctl_query_allocated_ranges(work, id,
-			(struct file_allocated_range_buffer *)&req->Buffer[0],
+			(struct file_allocated_range_buffer *)buffer,
 			(struct file_allocated_range_buffer *)&rsp->Buffer[0],
 			out_buf_len /
 			sizeof(struct file_allocated_range_buffer), &nbytes);
@@ -7892,7 +8178,7 @@ int smb2_ioctl(struct ksmbd_work *work)
 			goto out;
 		}
 
-		dup_ext = (struct duplicate_extents_to_file *)&req->Buffer[0];
+		dup_ext = (struct duplicate_extents_to_file *)buffer;
 
 		fp_in = ksmbd_lookup_fd_slow(work, dup_ext->VolatileFileHandle,
 					     dup_ext->PersistentFileHandle);
diff --git a/fs/smb/server/smb2pdu.h b/fs/smb/server/smb2pdu.h
index d12cfd3b0927..bd1d2a0e9203 100644
--- a/fs/smb/server/smb2pdu.h
+++ b/fs/smb/server/smb2pdu.h
@@ -72,6 +72,18 @@ struct create_durable_req_v2 {
 	__u8 CreateGuid[16];
 } __packed;
 
+struct create_durable_reconn_req {
+	struct create_context ccontext;
+	__u8   Name[8];
+	union {
+		__u8  Reserved[16];
+		struct {
+			__u64 PersistentFileId;
+			__u64 VolatileFileId;
+		} Fid;
+	} Data;
+} __packed;
+
 struct create_durable_reconn_v2_req {
 	struct create_context ccontext;
 	__u8   Name[8];
@@ -98,6 +110,9 @@ struct create_durable_rsp {
 	} Data;
 } __packed;
 
+/* See MS-SMB2 2.2.13.2.11 */
+/* Flags */
+#define SMB2_DHANDLE_FLAG_PERSISTENT	0x00000002
 struct create_durable_v2_rsp {
 	struct create_context ccontext;
 	__u8   Name[8];
diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c
index 7c98bf699772..fcaf373cc008 100644
--- a/fs/smb/server/smb_common.c
+++ b/fs/smb/server/smb_common.c
@@ -457,10 +457,13 @@ int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level,
 			}
 
 			ksmbd_kstat.kstat = &kstat;
-			ksmbd_vfs_fill_dentry_attrs(work,
-						    idmap,
-						    dentry,
-						    &ksmbd_kstat);
+			rc = ksmbd_vfs_fill_dentry_attrs(work,
+							 idmap,
+							 dentry,
+							 &ksmbd_kstat);
+			if (rc)
+				break;
+
 			rc = fn(conn, info_level, d_info, &ksmbd_kstat);
 			if (rc)
 				break;
diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c
index f29bb03f0dc4..8752ac82c557 100644
--- a/fs/smb/server/transport_ipc.c
+++ b/fs/smb/server/transport_ipc.c
@@ -65,6 +65,7 @@ struct ipc_msg_table_entry {
 	struct hlist_node	ipc_table_hlist;
 
 	void			*response;
+	unsigned int		msg_sz;
 };
 
 static struct delayed_work ipc_timer_work;
@@ -275,6 +276,7 @@ static int handle_response(int type, void *payload, size_t sz)
 		}
 
 		memcpy(entry->response, payload, sz);
+		entry->msg_sz = sz;
 		wake_up_interruptible(&entry->wait);
 		ret = 0;
 		break;
@@ -453,6 +455,34 @@ out:
 	return ret;
 }
 
+static int ipc_validate_msg(struct ipc_msg_table_entry *entry)
+{
+	unsigned int msg_sz = entry->msg_sz;
+
+	if (entry->type == KSMBD_EVENT_RPC_REQUEST) {
+		struct ksmbd_rpc_command *resp = entry->response;
+
+		msg_sz = sizeof(struct ksmbd_rpc_command) + resp->payload_sz;
+	} else if (entry->type == KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST) {
+		struct ksmbd_spnego_authen_response *resp = entry->response;
+
+		msg_sz = sizeof(struct ksmbd_spnego_authen_response) +
+				resp->session_key_len + resp->spnego_blob_len;
+	} else if (entry->type == KSMBD_EVENT_SHARE_CONFIG_REQUEST) {
+		struct ksmbd_share_config_response *resp = entry->response;
+
+		if (resp->payload_sz) {
+			if (resp->payload_sz < resp->veto_list_sz)
+				return -EINVAL;
+
+			msg_sz = sizeof(struct ksmbd_share_config_response) +
+					resp->payload_sz;
+		}
+	}
+
+	return entry->msg_sz != msg_sz ? -EINVAL : 0;
+}
+
 static void *ipc_msg_send_request(struct ksmbd_ipc_msg *msg, unsigned int handle)
 {
 	struct ipc_msg_table_entry entry;
@@ -477,6 +507,13 @@ static void *ipc_msg_send_request(struct ksmbd_ipc_msg *msg, unsigned int handle
 	ret = wait_event_interruptible_timeout(entry.wait,
 					       entry.response != NULL,
 					       IPC_WAIT_TIMEOUT);
+	if (entry.response) {
+		ret = ipc_validate_msg(&entry);
+		if (ret) {
+			kvfree(entry.response);
+			entry.response = NULL;
+		}
+	}
 out:
 	down_write(&ipc_msg_table_lock);
 	hash_del(&entry.ipc_table_hlist);
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index a6961bfe3e13..22f0f3db3ac9 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -337,18 +337,18 @@ static int check_lock_range(struct file *filp, loff_t start, loff_t end,
 		return 0;
 
 	spin_lock(&ctx->flc_lock);
-	list_for_each_entry(flock, &ctx->flc_posix, fl_list) {
+	for_each_file_lock(flock, &ctx->flc_posix) {
 		/* check conflict locks */
 		if (flock->fl_end >= start && end >= flock->fl_start) {
-			if (flock->fl_type == F_RDLCK) {
+			if (lock_is_read(flock)) {
 				if (type == WRITE) {
 					pr_err("not allow write by shared lock\n");
 					error = 1;
 					goto out;
 				}
-			} else if (flock->fl_type == F_WRLCK) {
+			} else if (lock_is_write(flock)) {
 				/* check owner in lock */
-				if (flock->fl_file != filp) {
+				if (flock->c.flc_file != filp) {
 					error = 1;
 					pr_err("not allow rw access by exclusive lock from other opens\n");
 					goto out;
@@ -1682,11 +1682,19 @@ int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work,
 				struct dentry *dentry,
 				struct ksmbd_kstat *ksmbd_kstat)
 {
+	struct ksmbd_share_config *share_conf = work->tcon->share_conf;
 	u64 time;
 	int rc;
+	struct path path = {
+		.mnt = share_conf->vfs_path.mnt,
+		.dentry = dentry,
+	};
 
-	generic_fillattr(idmap, STATX_BASIC_STATS, d_inode(dentry),
-			 ksmbd_kstat->kstat);
+	rc = vfs_getattr(&path, ksmbd_kstat->kstat,
+			 STATX_BASIC_STATS | STATX_BTIME,
+			 AT_STATX_SYNC_AS_STAT);
+	if (rc)
+		return rc;
 
 	time = ksmbd_UnixTimeToNT(ksmbd_kstat->kstat->ctime);
 	ksmbd_kstat->create_time = time;
@@ -1837,13 +1845,13 @@ int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work,
 
 void ksmbd_vfs_posix_lock_wait(struct file_lock *flock)
 {
-	wait_event(flock->fl_wait, !flock->fl_blocker);
+	wait_event(flock->c.flc_wait, !flock->c.flc_blocker);
 }
 
 int ksmbd_vfs_posix_lock_wait_timeout(struct file_lock *flock, long timeout)
 {
-	return wait_event_interruptible_timeout(flock->fl_wait,
-						!flock->fl_blocker,
+	return wait_event_interruptible_timeout(flock->c.flc_wait,
+						!flock->c.flc_blocker,
 						timeout);
 }
 
diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c
index 4e82ff627d12..030f70700036 100644
--- a/fs/smb/server/vfs_cache.c
+++ b/fs/smb/server/vfs_cache.c
@@ -305,7 +305,8 @@ static void __ksmbd_close_fd(struct ksmbd_file_table *ft, struct ksmbd_file *fp)
 
 	fd_limit_close();
 	__ksmbd_remove_durable_fd(fp);
-	__ksmbd_remove_fd(ft, fp);
+	if (ft)
+		__ksmbd_remove_fd(ft, fp);
 
 	close_id_del_oplock(fp);
 	filp = fp->filp;
@@ -465,11 +466,32 @@ struct ksmbd_file *ksmbd_lookup_fd_slow(struct ksmbd_work *work, u64 id,
 	return fp;
 }
 
-struct ksmbd_file *ksmbd_lookup_durable_fd(unsigned long long id)
+struct ksmbd_file *ksmbd_lookup_global_fd(unsigned long long id)
 {
 	return __ksmbd_lookup_fd(&global_ft, id);
 }
 
+struct ksmbd_file *ksmbd_lookup_durable_fd(unsigned long long id)
+{
+	struct ksmbd_file *fp;
+
+	fp = __ksmbd_lookup_fd(&global_ft, id);
+	if (fp && fp->conn) {
+		ksmbd_put_durable_fd(fp);
+		fp = NULL;
+	}
+
+	return fp;
+}
+
+void ksmbd_put_durable_fd(struct ksmbd_file *fp)
+{
+	if (!atomic_dec_and_test(&fp->refcount))
+		return;
+
+	__ksmbd_close_fd(NULL, fp);
+}
+
 struct ksmbd_file *ksmbd_lookup_fd_cguid(char *cguid)
 {
 	struct ksmbd_file	*fp = NULL;
@@ -639,6 +661,32 @@ __close_file_table_ids(struct ksmbd_file_table *ft,
 	return num;
 }
 
+static inline bool is_reconnectable(struct ksmbd_file *fp)
+{
+	struct oplock_info *opinfo = opinfo_get(fp);
+	bool reconn = false;
+
+	if (!opinfo)
+		return false;
+
+	if (opinfo->op_state != OPLOCK_STATE_NONE) {
+		opinfo_put(opinfo);
+		return false;
+	}
+
+	if (fp->is_resilient || fp->is_persistent)
+		reconn = true;
+	else if (fp->is_durable && opinfo->is_lease &&
+		 opinfo->o_lease->state & SMB2_LEASE_HANDLE_CACHING_LE)
+		reconn = true;
+
+	else if (fp->is_durable && opinfo->level == SMB2_OPLOCK_LEVEL_BATCH)
+		reconn = true;
+
+	opinfo_put(opinfo);
+	return reconn;
+}
+
 static bool tree_conn_fd_check(struct ksmbd_tree_connect *tcon,
 			       struct ksmbd_file *fp)
 {
@@ -648,7 +696,28 @@ static bool tree_conn_fd_check(struct ksmbd_tree_connect *tcon,
 static bool session_fd_check(struct ksmbd_tree_connect *tcon,
 			     struct ksmbd_file *fp)
 {
-	return false;
+	struct ksmbd_inode *ci;
+	struct oplock_info *op;
+	struct ksmbd_conn *conn;
+
+	if (!is_reconnectable(fp))
+		return false;
+
+	conn = fp->conn;
+	ci = fp->f_ci;
+	write_lock(&ci->m_lock);
+	list_for_each_entry_rcu(op, &ci->m_op_list, op_entry) {
+		if (op->conn != conn)
+			continue;
+		op->conn = NULL;
+	}
+	write_unlock(&ci->m_lock);
+
+	fp->conn = NULL;
+	fp->tcon = NULL;
+	fp->volatile_id = KSMBD_NO_FID;
+
+	return true;
 }
 
 void ksmbd_close_tree_conn_fds(struct ksmbd_work *work)
@@ -687,6 +756,68 @@ void ksmbd_free_global_file_table(void)
 	ksmbd_destroy_file_table(&global_ft);
 }
 
+int ksmbd_validate_name_reconnect(struct ksmbd_share_config *share,
+				  struct ksmbd_file *fp, char *name)
+{
+	char *pathname, *ab_pathname;
+	int ret = 0;
+
+	pathname = kmalloc(PATH_MAX, GFP_KERNEL);
+	if (!pathname)
+		return -EACCES;
+
+	ab_pathname = d_path(&fp->filp->f_path, pathname, PATH_MAX);
+	if (IS_ERR(ab_pathname)) {
+		kfree(pathname);
+		return -EACCES;
+	}
+
+	if (name && strcmp(&ab_pathname[share->path_sz + 1], name)) {
+		ksmbd_debug(SMB, "invalid name reconnect %s\n", name);
+		ret = -EINVAL;
+	}
+
+	kfree(pathname);
+
+	return ret;
+}
+
+int ksmbd_reopen_durable_fd(struct ksmbd_work *work, struct ksmbd_file *fp)
+{
+	struct ksmbd_inode *ci;
+	struct oplock_info *op;
+
+	if (!fp->is_durable || fp->conn || fp->tcon) {
+		pr_err("Invalid durable fd [%p:%p]\n", fp->conn, fp->tcon);
+		return -EBADF;
+	}
+
+	if (has_file_id(fp->volatile_id)) {
+		pr_err("Still in use durable fd: %llu\n", fp->volatile_id);
+		return -EBADF;
+	}
+
+	fp->conn = work->conn;
+	fp->tcon = work->tcon;
+
+	ci = fp->f_ci;
+	write_lock(&ci->m_lock);
+	list_for_each_entry_rcu(op, &ci->m_op_list, op_entry) {
+		if (op->conn)
+			continue;
+		op->conn = fp->conn;
+	}
+	write_unlock(&ci->m_lock);
+
+	__open_id(&work->sess->file_table, fp, OPEN_ID_TYPE_VOLATILE_ID);
+	if (!has_file_id(fp->volatile_id)) {
+		fp->conn = NULL;
+		fp->tcon = NULL;
+		return -EBADF;
+	}
+	return 0;
+}
+
 int ksmbd_init_file_table(struct ksmbd_file_table *ft)
 {
 	ft->idr = kzalloc(sizeof(struct idr), GFP_KERNEL);
diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h
index a528f0cc775a..ed44fb4e18e7 100644
--- a/fs/smb/server/vfs_cache.h
+++ b/fs/smb/server/vfs_cache.h
@@ -14,6 +14,7 @@
 #include <linux/workqueue.h>
 
 #include "vfs.h"
+#include "mgmt/share_config.h"
 
 /* Windows style file permissions for extended response */
 #define	FILE_GENERIC_ALL	0x1F01FF
@@ -106,6 +107,9 @@ struct ksmbd_file {
 	int				dot_dotdot[2];
 	unsigned int			f_state;
 	bool				reserve_lease_break;
+	bool				is_durable;
+	bool				is_persistent;
+	bool				is_resilient;
 };
 
 static inline void set_ctx_actor(struct dir_context *ctx,
@@ -141,7 +145,9 @@ struct ksmbd_file *ksmbd_lookup_fd_slow(struct ksmbd_work *work, u64 id,
 void ksmbd_fd_put(struct ksmbd_work *work, struct ksmbd_file *fp);
 struct ksmbd_inode *ksmbd_inode_lookup_lock(struct dentry *d);
 void ksmbd_inode_put(struct ksmbd_inode *ci);
+struct ksmbd_file *ksmbd_lookup_global_fd(unsigned long long id);
 struct ksmbd_file *ksmbd_lookup_durable_fd(unsigned long long id);
+void ksmbd_put_durable_fd(struct ksmbd_file *fp);
 struct ksmbd_file *ksmbd_lookup_fd_cguid(char *cguid);
 struct ksmbd_file *ksmbd_lookup_fd_inode(struct dentry *dentry);
 unsigned int ksmbd_open_durable_fd(struct ksmbd_file *fp);
@@ -173,6 +179,9 @@ void ksmbd_set_inode_pending_delete(struct ksmbd_file *fp);
 void ksmbd_clear_inode_pending_delete(struct ksmbd_file *fp);
 void ksmbd_fd_set_delete_on_close(struct ksmbd_file *fp,
 				  int file_info);
+int ksmbd_reopen_durable_fd(struct ksmbd_work *work, struct ksmbd_file *fp);
+int ksmbd_validate_name_reconnect(struct ksmbd_share_config *share,
+				  struct ksmbd_file *fp, char *name);
 int ksmbd_init_file_cache(void);
 void ksmbd_exit_file_cache(void);
 #endif /* __VFS_CACHE_H__ */
diff --git a/fs/super.c b/fs/super.c
index d6efeba0d0ce..69ce6c600968 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1527,16 +1527,16 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
 		struct fs_context *fc)
 {
 	blk_mode_t mode = sb_open_mode(sb_flags);
-	struct bdev_handle *bdev_handle;
+	struct file *bdev_file;
 	struct block_device *bdev;
 
-	bdev_handle = bdev_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops);
-	if (IS_ERR(bdev_handle)) {
+	bdev_file = bdev_file_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops);
+	if (IS_ERR(bdev_file)) {
 		if (fc)
 			errorf(fc, "%s: Can't open blockdev", fc->source);
-		return PTR_ERR(bdev_handle);
+		return PTR_ERR(bdev_file);
 	}
-	bdev = bdev_handle->bdev;
+	bdev = file_bdev(bdev_file);
 
 	/*
 	 * This really should be in blkdev_get_by_dev, but right now can't due
@@ -1544,7 +1544,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
 	 * writable from userspace even for a read-only block device.
 	 */
 	if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) {
-		bdev_release(bdev_handle);
+		bdev_fput(bdev_file);
 		return -EACCES;
 	}
 
@@ -1555,11 +1555,11 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
 	if (atomic_read(&bdev->bd_fsfreeze_count) > 0) {
 		if (fc)
 			warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
-		bdev_release(bdev_handle);
+		bdev_fput(bdev_file);
 		return -EBUSY;
 	}
 	spin_lock(&sb_lock);
-	sb->s_bdev_handle = bdev_handle;
+	sb->s_bdev_file = bdev_file;
 	sb->s_bdev = bdev;
 	sb->s_bdi = bdi_get(bdev->bd_disk->bdi);
 	if (bdev_stable_writes(bdev))
@@ -1675,7 +1675,7 @@ void kill_block_super(struct super_block *sb)
 	generic_shutdown_super(sb);
 	if (bdev) {
 		sync_blockdev(bdev);
-		bdev_release(sb->s_bdev_handle);
+		bdev_fput(sb->s_bdev_file);
 	}
 }
 
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 138676463336..d22ad67a0f32 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -31,6 +31,17 @@ static void remove_files(struct kernfs_node *parent,
 			kernfs_remove_by_name(parent, (*bin_attr)->attr.name);
 }
 
+static umode_t __first_visible(const struct attribute_group *grp, struct kobject *kobj)
+{
+	if (grp->attrs && grp->attrs[0] && grp->is_visible)
+		return grp->is_visible(kobj, grp->attrs[0], 0);
+
+	if (grp->bin_attrs && grp->bin_attrs[0] && grp->is_bin_visible)
+		return grp->is_bin_visible(kobj, grp->bin_attrs[0], 0);
+
+	return 0;
+}
+
 static int create_files(struct kernfs_node *parent, struct kobject *kobj,
 			kuid_t uid, kgid_t gid,
 			const struct attribute_group *grp, int update)
@@ -52,6 +63,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
 				kernfs_remove_by_name(parent, (*attr)->name);
 			if (grp->is_visible) {
 				mode = grp->is_visible(kobj, *attr, i);
+				mode &= ~SYSFS_GROUP_INVISIBLE;
 				if (!mode)
 					continue;
 			}
@@ -81,6 +93,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
 						(*bin_attr)->attr.name);
 			if (grp->is_bin_visible) {
 				mode = grp->is_bin_visible(kobj, *bin_attr, i);
+				mode &= ~SYSFS_GROUP_INVISIBLE;
 				if (!mode)
 					continue;
 			}
@@ -127,16 +140,31 @@ static int internal_create_group(struct kobject *kobj, int update,
 
 	kobject_get_ownership(kobj, &uid, &gid);
 	if (grp->name) {
+		umode_t mode = __first_visible(grp, kobj);
+
+		if (mode & SYSFS_GROUP_INVISIBLE)
+			mode = 0;
+		else
+			mode = S_IRWXU | S_IRUGO | S_IXUGO;
+
 		if (update) {
 			kn = kernfs_find_and_get(kobj->sd, grp->name);
 			if (!kn) {
-				pr_warn("Can't update unknown attr grp name: %s/%s\n",
-					kobj->name, grp->name);
-				return -EINVAL;
+				pr_debug("attr grp %s/%s not created yet\n",
+					 kobj->name, grp->name);
+				/* may have been invisible prior to this update */
+				update = 0;
+			} else if (!mode) {
+				sysfs_remove_group(kobj, grp);
+				kernfs_put(kn);
+				return 0;
 			}
-		} else {
-			kn = kernfs_create_dir_ns(kobj->sd, grp->name,
-						  S_IRWXU | S_IRUGO | S_IXUGO,
+		}
+
+		if (!update) {
+			if (!mode)
+				return 0;
+			kn = kernfs_create_dir_ns(kobj->sd, grp->name, mode,
 						  uid, gid, kobj, NULL);
 			if (IS_ERR(kn)) {
 				if (PTR_ERR(kn) == -EEXIST)
@@ -279,9 +307,8 @@ void sysfs_remove_group(struct kobject *kobj,
 	if (grp->name) {
 		kn = kernfs_find_and_get(parent, grp->name);
 		if (!kn) {
-			WARN(!kn, KERN_WARNING
-			     "sysfs group '%s' not found for kobject '%s'\n",
-			     grp->name, kobject_name(kobj));
+			pr_debug("sysfs group '%s' not found for kobject '%s'\n",
+				 grp->name, kobject_name(kobj));
 			return;
 		}
 	} else {
@@ -318,13 +345,13 @@ void sysfs_remove_groups(struct kobject *kobj,
 EXPORT_SYMBOL_GPL(sysfs_remove_groups);
 
 /**
- * sysfs_merge_group - merge files into a pre-existing attribute group.
+ * sysfs_merge_group - merge files into a pre-existing named attribute group.
  * @kobj:	The kobject containing the group.
  * @grp:	The files to create and the attribute group they belong to.
  *
- * This function returns an error if the group doesn't exist or any of the
- * files already exist in that group, in which case none of the new files
- * are created.
+ * This function returns an error if the group doesn't exist, the .name field is
+ * NULL or any of the files already exist in that group, in which case none of
+ * the new files are created.
  */
 int sysfs_merge_group(struct kobject *kobj,
 		       const struct attribute_group *grp)
@@ -356,7 +383,7 @@ int sysfs_merge_group(struct kobject *kobj,
 EXPORT_SYMBOL_GPL(sysfs_merge_group);
 
 /**
- * sysfs_unmerge_group - remove files from a pre-existing attribute group.
+ * sysfs_unmerge_group - remove files from a pre-existing named attribute group.
  * @kobj:	The kobject containing the group.
  * @grp:	The files to remove and the attribute group they belong to.
  */
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 5a915b2e68f5..76bc2d5e75a9 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -336,7 +336,7 @@ int __init sysv_init_icache(void)
 {
 	sysv_inode_cachep = kmem_cache_create("sysv_inode_cache",
 			sizeof(struct sysv_inode_info), 0,
-			SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
+			SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
 			init_once);
 	if (!sysv_inode_cachep)
 		return -ENOMEM;
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 410ab2a44d2f..19bcb51a2203 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -83,9 +83,6 @@ static inline sysv_zone_t *block_end(struct buffer_head *bh)
 	return (sysv_zone_t*)((char*)bh->b_data + bh->b_size);
 }
 
-/*
- * Requires read_lock(&pointers_lock) or write_lock(&pointers_lock)
- */
 static Indirect *get_branch(struct inode *inode,
 			    int depth,
 			    int offsets[],
@@ -105,15 +102,18 @@ static Indirect *get_branch(struct inode *inode,
 		bh = sb_bread(sb, block);
 		if (!bh)
 			goto failure;
+		read_lock(&pointers_lock);
 		if (!verify_chain(chain, p))
 			goto changed;
 		add_chain(++p, bh, (sysv_zone_t*)bh->b_data + *++offsets);
+		read_unlock(&pointers_lock);
 		if (!p->key)
 			goto no_block;
 	}
 	return NULL;
 
 changed:
+	read_unlock(&pointers_lock);
 	brelse(bh);
 	*err = -EAGAIN;
 	goto no_block;
@@ -219,9 +219,7 @@ static int get_block(struct inode *inode, sector_t iblock, struct buffer_head *b
 		goto out;
 
 reread:
-	read_lock(&pointers_lock);
 	partial = get_branch(inode, depth, offsets, chain, &err);
-	read_unlock(&pointers_lock);
 
 	/* Simplest case - block found, no allocation needed */
 	if (!partial) {
@@ -291,9 +289,9 @@ static Indirect *find_shared(struct inode *inode,
 	*top = 0;
 	for (k = depth; k > 1 && !offsets[k-1]; k--)
 		;
+	partial = get_branch(inode, k, offsets, chain, &err);
 
 	write_lock(&pointers_lock);
-	partial = get_branch(inode, k, offsets, chain, &err);
 	if (!partial)
 		partial = chain + k-1;
 	/*
diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index 110e8a272189..894c6ca1e500 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -35,6 +35,17 @@ static DEFINE_MUTEX(eventfs_mutex);
 /* Choose something "unique" ;-) */
 #define EVENTFS_FILE_INODE_INO		0x12c4e37
 
+struct eventfs_root_inode {
+	struct eventfs_inode		ei;
+	struct dentry			*events_dir;
+};
+
+static struct eventfs_root_inode *get_root_inode(struct eventfs_inode *ei)
+{
+	WARN_ON_ONCE(!ei->is_events);
+	return container_of(ei, struct eventfs_root_inode, ei);
+}
+
 /* Just try to make something consistent and unique */
 static int eventfs_dir_ino(struct eventfs_inode *ei)
 {
@@ -73,12 +84,18 @@ enum {
 static void release_ei(struct kref *ref)
 {
 	struct eventfs_inode *ei = container_of(ref, struct eventfs_inode, kref);
+	struct eventfs_root_inode *rei;
 
 	WARN_ON_ONCE(!ei->is_freed);
 
 	kfree(ei->entry_attrs);
 	kfree_const(ei->name);
-	kfree_rcu(ei, rcu);
+	if (ei->is_events) {
+		rei = get_root_inode(ei);
+		kfree_rcu(rei, ei.rcu);
+	} else {
+		kfree_rcu(ei, rcu);
+	}
 }
 
 static inline void put_ei(struct eventfs_inode *ei)
@@ -319,6 +336,7 @@ static void update_inode_attr(struct dentry *dentry, struct inode *inode,
 
 /**
  * lookup_file - look up a file in the tracefs filesystem
+ * @parent_ei: Pointer to the eventfs_inode that represents parent of the file
  * @dentry: the dentry to look up
  * @mode: the permission that the file should have.
  * @attr: saved attributes changed by user
@@ -372,6 +390,7 @@ static struct dentry *lookup_file(struct eventfs_inode *parent_ei,
 /**
  * lookup_dir_entry - look up a dir in the tracefs filesystem
  * @dentry: the directory to look up
+ * @pei: Pointer to the parent eventfs_inode if available
  * @ei: the eventfs_inode that represents the directory to create
  *
  * This function will look up a dentry for a directory represented by
@@ -408,19 +427,43 @@ static struct dentry *lookup_dir_entry(struct dentry *dentry,
 	return NULL;
 }
 
+static inline struct eventfs_inode *init_ei(struct eventfs_inode *ei, const char *name)
+{
+	ei->name = kstrdup_const(name, GFP_KERNEL);
+	if (!ei->name)
+		return NULL;
+	kref_init(&ei->kref);
+	return ei;
+}
+
 static inline struct eventfs_inode *alloc_ei(const char *name)
 {
 	struct eventfs_inode *ei = kzalloc(sizeof(*ei), GFP_KERNEL);
+	struct eventfs_inode *result;
 
 	if (!ei)
 		return NULL;
 
-	ei->name = kstrdup_const(name, GFP_KERNEL);
-	if (!ei->name) {
+	result = init_ei(ei, name);
+	if (!result)
 		kfree(ei);
+
+	return result;
+}
+
+static inline struct eventfs_inode *alloc_root_ei(const char *name)
+{
+	struct eventfs_root_inode *rei = kzalloc(sizeof(*rei), GFP_KERNEL);
+	struct eventfs_inode *ei;
+
+	if (!rei)
 		return NULL;
-	}
-	kref_init(&ei->kref);
+
+	rei->ei.is_events = 1;
+	ei = init_ei(&rei->ei, name);
+	if (!ei)
+		kfree(rei);
+
 	return ei;
 }
 
@@ -437,16 +480,20 @@ void eventfs_d_release(struct dentry *dentry)
 
 /**
  * lookup_file_dentry - create a dentry for a file of an eventfs_inode
+ * @dentry: The parent dentry under which the new file's dentry will be created
  * @ei: the eventfs_inode that the file will be created under
  * @idx: the index into the entry_attrs[] of the @ei
- * @parent: The parent dentry of the created file.
- * @name: The name of the file to create
  * @mode: The mode of the file.
  * @data: The data to use to set the inode of the file with on open()
  * @fops: The fops of the file to be created.
  *
- * Create a dentry for a file of an eventfs_inode @ei and place it into the
- * address located at @e_dentry.
+ * This function creates a dentry for a file associated with an
+ * eventfs_inode @ei. It uses the entry attributes specified by @idx,
+ * if available. The file will have the specified @mode and its inode will be
+ * set up with @data upon open. The file operations will be set to @fops.
+ *
+ * Return: Returns a pointer to the newly created file's dentry or an error
+ * pointer.
  */
 static struct dentry *
 lookup_file_dentry(struct dentry *dentry,
@@ -483,7 +530,7 @@ static struct dentry *eventfs_root_lookup(struct inode *dir,
 	struct dentry *result = NULL;
 
 	ti = get_tracefs(dir);
-	if (!(ti->flags & TRACEFS_EVENT_INODE))
+	if (WARN_ON_ONCE(!(ti->flags & TRACEFS_EVENT_INODE)))
 		return ERR_PTR(-EIO);
 
 	mutex_lock(&eventfs_mutex);
@@ -495,7 +542,8 @@ static struct dentry *eventfs_root_lookup(struct inode *dir,
 	list_for_each_entry(ei_child, &ei->children, list) {
 		if (strcmp(ei_child->name, name) != 0)
 			continue;
-		if (ei_child->is_freed)
+		/* A child is freed and removed from the list at the same time */
+		if (WARN_ON_ONCE(ei_child->is_freed))
 			goto out;
 		result = lookup_dir_entry(dentry, ei, ei_child);
 		goto out;
@@ -709,6 +757,7 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
 						int size, void *data)
 {
 	struct dentry *dentry = tracefs_start_creating(name, parent);
+	struct eventfs_root_inode *rei;
 	struct eventfs_inode *ei;
 	struct tracefs_inode *ti;
 	struct inode *inode;
@@ -721,7 +770,7 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
 	if (IS_ERR(dentry))
 		return ERR_CAST(dentry);
 
-	ei = alloc_ei(name);
+	ei = alloc_root_ei(name);
 	if (!ei)
 		goto fail;
 
@@ -730,10 +779,11 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
 		goto fail;
 
 	// Note: we have a ref to the dentry from tracefs_start_creating()
-	ei->events_dir = dentry;
+	rei = get_root_inode(ei);
+	rei->events_dir = dentry;
+
 	ei->entries = entries;
 	ei->nr_entries = size;
-	ei->is_events = 1;
 	ei->data = data;
 
 	/* Save the ownership of this directory */
@@ -844,13 +894,15 @@ void eventfs_remove_dir(struct eventfs_inode *ei)
  */
 void eventfs_remove_events_dir(struct eventfs_inode *ei)
 {
+	struct eventfs_root_inode *rei;
 	struct dentry *dentry;
 
-	dentry = ei->events_dir;
+	rei = get_root_inode(ei);
+	dentry = rei->events_dir;
 	if (!dentry)
 		return;
 
-	ei->events_dir = NULL;
+	rei->events_dir = NULL;
 	eventfs_remove_dir(ei);
 
 	/*
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index d65ffad4c327..5545e6bf7d26 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -731,7 +731,6 @@ static int __init tracefs_init(void)
 	tracefs_inode_cachep = kmem_cache_create("tracefs_inode_cache",
 						 sizeof(struct tracefs_inode),
 						 0, (SLAB_RECLAIM_ACCOUNT|
-						     SLAB_MEM_SPREAD|
 						     SLAB_ACCOUNT),
 						 init_once);
 	if (!tracefs_inode_cachep)
diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h
index beb3dcd0e434..15c26f9aaad4 100644
--- a/fs/tracefs/internal.h
+++ b/fs/tracefs/internal.h
@@ -36,7 +36,6 @@ struct eventfs_attr {
  * @children:	link list into the child eventfs_inode
  * @entries:	the array of entries representing the files in the directory
  * @name:	the name of the directory to create
- * @events_dir: the dentry of the events directory
  * @entry_attrs: Saved mode and ownership of the @d_children
  * @data:	The private data to pass to the callbacks
  * @attr:	Saved mode and ownership of eventfs_inode itself
@@ -54,7 +53,6 @@ struct eventfs_inode {
 	struct list_head		children;
 	const struct eventfs_entry	*entries;
 	const char			*name;
-	struct dentry			*events_dir;
 	struct eventfs_attr		*entry_attrs;
 	void				*data;
 	struct eventfs_attr		attr;
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index d013c5b3f1ed..ac77ac1fd73e 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -1742,17 +1742,22 @@ int dbg_check_idx_size(struct ubifs_info *c, long long idx_size)
 	err = dbg_walk_index(c, NULL, add_size, &calc);
 	if (err) {
 		ubifs_err(c, "error %d while walking the index", err);
-		return err;
+		goto out_err;
 	}
 
 	if (calc != idx_size) {
 		ubifs_err(c, "index size check failed: calculated size is %lld, should be %lld",
 			  calc, idx_size);
 		dump_stack();
-		return -EINVAL;
+		err = -EINVAL;
+		goto out_err;
 	}
 
 	return 0;
+
+out_err:
+	ubifs_destroy_tnc_tree(c);
+	return err;
 }
 
 /**
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index e413a9cf8ee3..eac0fef801f1 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -205,7 +205,6 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
 	dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino);
 
 	err = fscrypt_prepare_lookup(dir, dentry, &nm);
-	generic_set_encrypted_ci_d_ops(dentry);
 	if (err == -ENOENT)
 		return d_splice_alias(NULL, dentry);
 	if (err)
@@ -1134,6 +1133,8 @@ out_cancel:
 	dir_ui->ui_size = dir->i_size;
 	mutex_unlock(&dir_ui->ui_mutex);
 out_inode:
+	/* Free inode->i_link before inode is marked as bad. */
+	fscrypt_free_inode(inode);
 	make_bad_inode(inode);
 	iput(inode);
 out_fname:
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 5029eb3390a5..a1f46919934c 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -96,36 +96,36 @@ dump:
 	return -EINVAL;
 }
 
-static int do_readpage(struct page *page)
+static int do_readpage(struct folio *folio)
 {
 	void *addr;
 	int err = 0, i;
 	unsigned int block, beyond;
-	struct ubifs_data_node *dn;
-	struct inode *inode = page->mapping->host;
+	struct ubifs_data_node *dn = NULL;
+	struct inode *inode = folio->mapping->host;
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
 	loff_t i_size = i_size_read(inode);
 
 	dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx",
-		inode->i_ino, page->index, i_size, page->flags);
-	ubifs_assert(c, !PageChecked(page));
-	ubifs_assert(c, !PagePrivate(page));
+		inode->i_ino, folio->index, i_size, folio->flags);
+	ubifs_assert(c, !folio_test_checked(folio));
+	ubifs_assert(c, !folio->private);
 
-	addr = kmap(page);
+	addr = kmap_local_folio(folio, 0);
 
-	block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
+	block = folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
 	beyond = (i_size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT;
 	if (block >= beyond) {
 		/* Reading beyond inode */
-		SetPageChecked(page);
-		memset(addr, 0, PAGE_SIZE);
+		folio_set_checked(folio);
+		addr = folio_zero_tail(folio, 0, addr);
 		goto out;
 	}
 
 	dn = kmalloc(UBIFS_MAX_DATA_NODE_SZ, GFP_NOFS);
 	if (!dn) {
 		err = -ENOMEM;
-		goto error;
+		goto out;
 	}
 
 	i = 0;
@@ -150,39 +150,35 @@ static int do_readpage(struct page *page)
 					memset(addr + ilen, 0, dlen - ilen);
 			}
 		}
-		if (++i >= UBIFS_BLOCKS_PER_PAGE)
+		if (++i >= (UBIFS_BLOCKS_PER_PAGE << folio_order(folio)))
 			break;
 		block += 1;
 		addr += UBIFS_BLOCK_SIZE;
+		if (folio_test_highmem(folio) && (offset_in_page(addr) == 0)) {
+			kunmap_local(addr - UBIFS_BLOCK_SIZE);
+			addr = kmap_local_folio(folio, i * UBIFS_BLOCK_SIZE);
+		}
 	}
+
 	if (err) {
 		struct ubifs_info *c = inode->i_sb->s_fs_info;
 		if (err == -ENOENT) {
 			/* Not found, so it must be a hole */
-			SetPageChecked(page);
+			folio_set_checked(folio);
 			dbg_gen("hole");
-			goto out_free;
+			err = 0;
+		} else {
+			ubifs_err(c, "cannot read page %lu of inode %lu, error %d",
+				  folio->index, inode->i_ino, err);
 		}
-		ubifs_err(c, "cannot read page %lu of inode %lu, error %d",
-			  page->index, inode->i_ino, err);
-		goto error;
 	}
 
-out_free:
-	kfree(dn);
 out:
-	SetPageUptodate(page);
-	ClearPageError(page);
-	flush_dcache_page(page);
-	kunmap(page);
-	return 0;
-
-error:
 	kfree(dn);
-	ClearPageUptodate(page);
-	SetPageError(page);
-	flush_dcache_page(page);
-	kunmap(page);
+	if (!err)
+		folio_mark_uptodate(folio);
+	flush_dcache_folio(folio);
+	kunmap_local(addr);
 	return err;
 }
 
@@ -222,16 +218,16 @@ static int write_begin_slow(struct address_space *mapping,
 	pgoff_t index = pos >> PAGE_SHIFT;
 	struct ubifs_budget_req req = { .new_page = 1 };
 	int err, appending = !!(pos + len > inode->i_size);
-	struct page *page;
+	struct folio *folio;
 
 	dbg_gen("ino %lu, pos %llu, len %u, i_size %lld",
 		inode->i_ino, pos, len, inode->i_size);
 
 	/*
-	 * At the slow path we have to budget before locking the page, because
-	 * budgeting may force write-back, which would wait on locked pages and
-	 * deadlock if we had the page locked. At this point we do not know
-	 * anything about the page, so assume that this is a new page which is
+	 * At the slow path we have to budget before locking the folio, because
+	 * budgeting may force write-back, which would wait on locked folios and
+	 * deadlock if we had the folio locked. At this point we do not know
+	 * anything about the folio, so assume that this is a new folio which is
 	 * written to a hole. This corresponds to largest budget. Later the
 	 * budget will be amended if this is not true.
 	 */
@@ -243,45 +239,43 @@ static int write_begin_slow(struct address_space *mapping,
 	if (unlikely(err))
 		return err;
 
-	page = grab_cache_page_write_begin(mapping, index);
-	if (unlikely(!page)) {
+	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
+			mapping_gfp_mask(mapping));
+	if (IS_ERR(folio)) {
 		ubifs_release_budget(c, &req);
-		return -ENOMEM;
+		return PTR_ERR(folio);
 	}
 
-	if (!PageUptodate(page)) {
-		if (!(pos & ~PAGE_MASK) && len == PAGE_SIZE)
-			SetPageChecked(page);
+	if (!folio_test_uptodate(folio)) {
+		if (pos == folio_pos(folio) && len >= folio_size(folio))
+			folio_set_checked(folio);
 		else {
-			err = do_readpage(page);
+			err = do_readpage(folio);
 			if (err) {
-				unlock_page(page);
-				put_page(page);
+				folio_unlock(folio);
+				folio_put(folio);
 				ubifs_release_budget(c, &req);
 				return err;
 			}
 		}
-
-		SetPageUptodate(page);
-		ClearPageError(page);
 	}
 
-	if (PagePrivate(page))
+	if (folio->private)
 		/*
-		 * The page is dirty, which means it was budgeted twice:
+		 * The folio is dirty, which means it was budgeted twice:
 		 *   o first time the budget was allocated by the task which
-		 *     made the page dirty and set the PG_private flag;
+		 *     made the folio dirty and set the private field;
 		 *   o and then we budgeted for it for the second time at the
 		 *     very beginning of this function.
 		 *
-		 * So what we have to do is to release the page budget we
+		 * So what we have to do is to release the folio budget we
 		 * allocated.
 		 */
 		release_new_page_budget(c);
-	else if (!PageChecked(page))
+	else if (!folio_test_checked(folio))
 		/*
-		 * We are changing a page which already exists on the media.
-		 * This means that changing the page does not make the amount
+		 * We are changing a folio which already exists on the media.
+		 * This means that changing the folio does not make the amount
 		 * of indexing information larger, and this part of the budget
 		 * which we have already acquired may be released.
 		 */
@@ -304,14 +298,14 @@ static int write_begin_slow(struct address_space *mapping,
 			ubifs_release_dirty_inode_budget(c, ui);
 	}
 
-	*pagep = page;
+	*pagep = &folio->page;
 	return 0;
 }
 
 /**
  * allocate_budget - allocate budget for 'ubifs_write_begin()'.
  * @c: UBIFS file-system description object
- * @page: page to allocate budget for
+ * @folio: folio to allocate budget for
  * @ui: UBIFS inode object the page belongs to
  * @appending: non-zero if the page is appended
  *
@@ -322,15 +316,15 @@ static int write_begin_slow(struct address_space *mapping,
  *
  * Returns: %0 in case of success and %-ENOSPC in case of failure.
  */
-static int allocate_budget(struct ubifs_info *c, struct page *page,
+static int allocate_budget(struct ubifs_info *c, struct folio *folio,
 			   struct ubifs_inode *ui, int appending)
 {
 	struct ubifs_budget_req req = { .fast = 1 };
 
-	if (PagePrivate(page)) {
+	if (folio->private) {
 		if (!appending)
 			/*
-			 * The page is dirty and we are not appending, which
+			 * The folio is dirty and we are not appending, which
 			 * means no budget is needed at all.
 			 */
 			return 0;
@@ -354,11 +348,11 @@ static int allocate_budget(struct ubifs_info *c, struct page *page,
 		 */
 		req.dirtied_ino = 1;
 	} else {
-		if (PageChecked(page))
+		if (folio_test_checked(folio))
 			/*
 			 * The page corresponds to a hole and does not
 			 * exist on the media. So changing it makes
-			 * make the amount of indexing information
+			 * the amount of indexing information
 			 * larger, and we have to budget for a new
 			 * page.
 			 */
@@ -428,7 +422,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 	pgoff_t index = pos >> PAGE_SHIFT;
 	int err, appending = !!(pos + len > inode->i_size);
 	int skipped_read = 0;
-	struct page *page;
+	struct folio *folio;
 
 	ubifs_assert(c, ubifs_inode(inode)->ui_size == inode->i_size);
 	ubifs_assert(c, !c->ro_media && !c->ro_mount);
@@ -437,13 +431,14 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 		return -EROFS;
 
 	/* Try out the fast-path part first */
-	page = grab_cache_page_write_begin(mapping, index);
-	if (unlikely(!page))
-		return -ENOMEM;
+	folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
+			mapping_gfp_mask(mapping));
+	if (IS_ERR(folio))
+		return PTR_ERR(folio);
 
-	if (!PageUptodate(page)) {
+	if (!folio_test_uptodate(folio)) {
 		/* The page is not loaded from the flash */
-		if (!(pos & ~PAGE_MASK) && len == PAGE_SIZE) {
+		if (pos == folio_pos(folio) && len >= folio_size(folio)) {
 			/*
 			 * We change whole page so no need to load it. But we
 			 * do not know whether this page exists on the media or
@@ -453,32 +448,27 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 			 * media. Thus, we are setting the @PG_checked flag
 			 * here.
 			 */
-			SetPageChecked(page);
+			folio_set_checked(folio);
 			skipped_read = 1;
 		} else {
-			err = do_readpage(page);
+			err = do_readpage(folio);
 			if (err) {
-				unlock_page(page);
-				put_page(page);
+				folio_unlock(folio);
+				folio_put(folio);
 				return err;
 			}
 		}
-
-		SetPageUptodate(page);
-		ClearPageError(page);
 	}
 
-	err = allocate_budget(c, page, ui, appending);
+	err = allocate_budget(c, folio, ui, appending);
 	if (unlikely(err)) {
 		ubifs_assert(c, err == -ENOSPC);
 		/*
 		 * If we skipped reading the page because we were going to
 		 * write all of it, then it is not up to date.
 		 */
-		if (skipped_read) {
-			ClearPageChecked(page);
-			ClearPageUptodate(page);
-		}
+		if (skipped_read)
+			folio_clear_checked(folio);
 		/*
 		 * Budgeting failed which means it would have to force
 		 * write-back but didn't, because we set the @fast flag in the
@@ -490,8 +480,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 			ubifs_assert(c, mutex_is_locked(&ui->ui_mutex));
 			mutex_unlock(&ui->ui_mutex);
 		}
-		unlock_page(page);
-		put_page(page);
+		folio_unlock(folio);
+		folio_put(folio);
 
 		return write_begin_slow(mapping, pos, len, pagep);
 	}
@@ -502,22 +492,21 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
 	 * with @ui->ui_mutex locked if we are appending pages, and unlocked
 	 * otherwise. This is an optimization (slightly hacky though).
 	 */
-	*pagep = page;
+	*pagep = &folio->page;
 	return 0;
-
 }
 
 /**
  * cancel_budget - cancel budget.
  * @c: UBIFS file-system description object
- * @page: page to cancel budget for
+ * @folio: folio to cancel budget for
  * @ui: UBIFS inode object the page belongs to
  * @appending: non-zero if the page is appended
  *
  * This is a helper function for a page write operation. It unlocks the
  * @ui->ui_mutex in case of appending.
  */
-static void cancel_budget(struct ubifs_info *c, struct page *page,
+static void cancel_budget(struct ubifs_info *c, struct folio *folio,
 			  struct ubifs_inode *ui, int appending)
 {
 	if (appending) {
@@ -525,8 +514,8 @@ static void cancel_budget(struct ubifs_info *c, struct page *page,
 			ubifs_release_dirty_inode_budget(c, ui);
 		mutex_unlock(&ui->ui_mutex);
 	}
-	if (!PagePrivate(page)) {
-		if (PageChecked(page))
+	if (!folio->private) {
+		if (folio_test_checked(folio))
 			release_new_page_budget(c);
 		else
 			release_existing_page_budget(c);
@@ -537,6 +526,7 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
 			   loff_t pos, unsigned len, unsigned copied,
 			   struct page *page, void *fsdata)
 {
+	struct folio *folio = page_folio(page);
 	struct inode *inode = mapping->host;
 	struct ubifs_inode *ui = ubifs_inode(inode);
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -544,44 +534,47 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
 	int appending = !!(end_pos > inode->i_size);
 
 	dbg_gen("ino %lu, pos %llu, pg %lu, len %u, copied %d, i_size %lld",
-		inode->i_ino, pos, page->index, len, copied, inode->i_size);
+		inode->i_ino, pos, folio->index, len, copied, inode->i_size);
 
-	if (unlikely(copied < len && len == PAGE_SIZE)) {
+	if (unlikely(copied < len && !folio_test_uptodate(folio))) {
 		/*
-		 * VFS copied less data to the page that it intended and
+		 * VFS copied less data to the folio than it intended and
 		 * declared in its '->write_begin()' call via the @len
-		 * argument. If the page was not up-to-date, and @len was
-		 * @PAGE_SIZE, the 'ubifs_write_begin()' function did
+		 * argument. If the folio was not up-to-date,
+		 * the 'ubifs_write_begin()' function did
 		 * not load it from the media (for optimization reasons). This
-		 * means that part of the page contains garbage. So read the
-		 * page now.
+		 * means that part of the folio contains garbage. So read the
+		 * folio now.
 		 */
 		dbg_gen("copied %d instead of %d, read page and repeat",
 			copied, len);
-		cancel_budget(c, page, ui, appending);
-		ClearPageChecked(page);
+		cancel_budget(c, folio, ui, appending);
+		folio_clear_checked(folio);
 
 		/*
 		 * Return 0 to force VFS to repeat the whole operation, or the
 		 * error code if 'do_readpage()' fails.
 		 */
-		copied = do_readpage(page);
+		copied = do_readpage(folio);
 		goto out;
 	}
 
-	if (!PagePrivate(page)) {
-		attach_page_private(page, (void *)1);
+	if (len == folio_size(folio))
+		folio_mark_uptodate(folio);
+
+	if (!folio->private) {
+		folio_attach_private(folio, (void *)1);
 		atomic_long_inc(&c->dirty_pg_cnt);
-		__set_page_dirty_nobuffers(page);
+		filemap_dirty_folio(mapping, folio);
 	}
 
 	if (appending) {
 		i_size_write(inode, end_pos);
 		ui->ui_size = end_pos;
 		/*
-		 * Note, we do not set @I_DIRTY_PAGES (which means that the
-		 * inode has dirty pages), this has been done in
-		 * '__set_page_dirty_nobuffers()'.
+		 * We do not set @I_DIRTY_PAGES (which means that
+		 * the inode has dirty pages), this was done in
+		 * filemap_dirty_folio().
 		 */
 		__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 		ubifs_assert(c, mutex_is_locked(&ui->ui_mutex));
@@ -589,43 +582,43 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
 	}
 
 out:
-	unlock_page(page);
-	put_page(page);
+	folio_unlock(folio);
+	folio_put(folio);
 	return copied;
 }
 
 /**
  * populate_page - copy data nodes into a page for bulk-read.
  * @c: UBIFS file-system description object
- * @page: page
+ * @folio: folio
  * @bu: bulk-read information
  * @n: next zbranch slot
  *
  * Returns: %0 on success and a negative error code on failure.
  */
-static int populate_page(struct ubifs_info *c, struct page *page,
+static int populate_page(struct ubifs_info *c, struct folio *folio,
 			 struct bu_info *bu, int *n)
 {
 	int i = 0, nn = *n, offs = bu->zbranch[0].offs, hole = 0, read = 0;
-	struct inode *inode = page->mapping->host;
+	struct inode *inode = folio->mapping->host;
 	loff_t i_size = i_size_read(inode);
 	unsigned int page_block;
 	void *addr, *zaddr;
 	pgoff_t end_index;
 
 	dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx",
-		inode->i_ino, page->index, i_size, page->flags);
+		inode->i_ino, folio->index, i_size, folio->flags);
 
-	addr = zaddr = kmap(page);
+	addr = zaddr = kmap_local_folio(folio, 0);
 
 	end_index = (i_size - 1) >> PAGE_SHIFT;
-	if (!i_size || page->index > end_index) {
+	if (!i_size || folio->index > end_index) {
 		hole = 1;
-		memset(addr, 0, PAGE_SIZE);
+		addr = folio_zero_tail(folio, 0, addr);
 		goto out_hole;
 	}
 
-	page_block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
+	page_block = folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
 	while (1) {
 		int err, len, out_len, dlen;
 
@@ -674,9 +667,13 @@ static int populate_page(struct ubifs_info *c, struct page *page,
 			break;
 		addr += UBIFS_BLOCK_SIZE;
 		page_block += 1;
+		if (folio_test_highmem(folio) && (offset_in_page(addr) == 0)) {
+			kunmap_local(addr - UBIFS_BLOCK_SIZE);
+			addr = kmap_local_folio(folio, i * UBIFS_BLOCK_SIZE);
+		}
 	}
 
-	if (end_index == page->index) {
+	if (end_index == folio->index) {
 		int len = i_size & (PAGE_SIZE - 1);
 
 		if (len && len < read)
@@ -685,22 +682,19 @@ static int populate_page(struct ubifs_info *c, struct page *page,
 
 out_hole:
 	if (hole) {
-		SetPageChecked(page);
+		folio_set_checked(folio);
 		dbg_gen("hole");
 	}
 
-	SetPageUptodate(page);
-	ClearPageError(page);
-	flush_dcache_page(page);
-	kunmap(page);
+	folio_mark_uptodate(folio);
+	flush_dcache_folio(folio);
+	kunmap_local(addr);
 	*n = nn;
 	return 0;
 
 out_err:
-	ClearPageUptodate(page);
-	SetPageError(page);
-	flush_dcache_page(page);
-	kunmap(page);
+	flush_dcache_folio(folio);
+	kunmap_local(addr);
 	ubifs_err(c, "bad data node (block %u, inode %lu)",
 		  page_block, inode->i_ino);
 	return -EINVAL;
@@ -710,15 +704,15 @@ out_err:
  * ubifs_do_bulk_read - do bulk-read.
  * @c: UBIFS file-system description object
  * @bu: bulk-read information
- * @page1: first page to read
+ * @folio1: first folio to read
  *
  * Returns: %1 if the bulk-read is done, otherwise %0 is returned.
  */
 static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu,
-			      struct page *page1)
+			      struct folio *folio1)
 {
-	pgoff_t offset = page1->index, end_index;
-	struct address_space *mapping = page1->mapping;
+	pgoff_t offset = folio1->index, end_index;
+	struct address_space *mapping = folio1->mapping;
 	struct inode *inode = mapping->host;
 	struct ubifs_inode *ui = ubifs_inode(inode);
 	int err, page_idx, page_cnt, ret = 0, n = 0;
@@ -768,11 +762,11 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu,
 			goto out_warn;
 	}
 
-	err = populate_page(c, page1, bu, &n);
+	err = populate_page(c, folio1, bu, &n);
 	if (err)
 		goto out_warn;
 
-	unlock_page(page1);
+	folio_unlock(folio1);
 	ret = 1;
 
 	isize = i_size_read(inode);
@@ -782,19 +776,19 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu,
 
 	for (page_idx = 1; page_idx < page_cnt; page_idx++) {
 		pgoff_t page_offset = offset + page_idx;
-		struct page *page;
+		struct folio *folio;
 
 		if (page_offset > end_index)
 			break;
-		page = pagecache_get_page(mapping, page_offset,
+		folio = __filemap_get_folio(mapping, page_offset,
 				 FGP_LOCK|FGP_ACCESSED|FGP_CREAT|FGP_NOWAIT,
 				 ra_gfp_mask);
-		if (!page)
+		if (IS_ERR(folio))
 			break;
-		if (!PageUptodate(page))
-			err = populate_page(c, page, bu, &n);
-		unlock_page(page);
-		put_page(page);
+		if (!folio_test_uptodate(folio))
+			err = populate_page(c, folio, bu, &n);
+		folio_unlock(folio);
+		folio_put(folio);
 		if (err)
 			break;
 	}
@@ -817,7 +811,7 @@ out_bu_off:
 
 /**
  * ubifs_bulk_read - determine whether to bulk-read and, if so, do it.
- * @page: page from which to start bulk-read.
+ * @folio: folio from which to start bulk-read.
  *
  * Some flash media are capable of reading sequentially at faster rates. UBIFS
  * bulk-read facility is designed to take advantage of that, by reading in one
@@ -826,12 +820,12 @@ out_bu_off:
  *
  * Returns: %1 if a bulk-read is done and %0 otherwise.
  */
-static int ubifs_bulk_read(struct page *page)
+static int ubifs_bulk_read(struct folio *folio)
 {
-	struct inode *inode = page->mapping->host;
+	struct inode *inode = folio->mapping->host;
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
 	struct ubifs_inode *ui = ubifs_inode(inode);
-	pgoff_t index = page->index, last_page_read = ui->last_page_read;
+	pgoff_t index = folio->index, last_page_read = ui->last_page_read;
 	struct bu_info *bu;
 	int err = 0, allocated = 0;
 
@@ -879,8 +873,8 @@ static int ubifs_bulk_read(struct page *page)
 
 	bu->buf_len = c->max_bu_buf_len;
 	data_key_init(c, &bu->key, inode->i_ino,
-		      page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT);
-	err = ubifs_do_bulk_read(c, bu, page);
+		      folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT);
+	err = ubifs_do_bulk_read(c, bu, folio);
 
 	if (!allocated)
 		mutex_unlock(&c->bu_mutex);
@@ -894,69 +888,71 @@ out_unlock:
 
 static int ubifs_read_folio(struct file *file, struct folio *folio)
 {
-	struct page *page = &folio->page;
-
-	if (ubifs_bulk_read(page))
+	if (ubifs_bulk_read(folio))
 		return 0;
-	do_readpage(page);
+	do_readpage(folio);
 	folio_unlock(folio);
 	return 0;
 }
 
-static int do_writepage(struct page *page, int len)
+static int do_writepage(struct folio *folio, size_t len)
 {
-	int err = 0, i, blen;
+	int err = 0, blen;
 	unsigned int block;
 	void *addr;
+	size_t offset = 0;
 	union ubifs_key key;
-	struct inode *inode = page->mapping->host;
+	struct inode *inode = folio->mapping->host;
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
 
 #ifdef UBIFS_DEBUG
 	struct ubifs_inode *ui = ubifs_inode(inode);
 	spin_lock(&ui->ui_lock);
-	ubifs_assert(c, page->index <= ui->synced_i_size >> PAGE_SHIFT);
+	ubifs_assert(c, folio->index <= ui->synced_i_size >> PAGE_SHIFT);
 	spin_unlock(&ui->ui_lock);
 #endif
 
-	/* Update radix tree tags */
-	set_page_writeback(page);
+	folio_start_writeback(folio);
 
-	addr = kmap(page);
-	block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
-	i = 0;
-	while (len) {
-		blen = min_t(int, len, UBIFS_BLOCK_SIZE);
+	addr = kmap_local_folio(folio, offset);
+	block = folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
+	for (;;) {
+		blen = min_t(size_t, len, UBIFS_BLOCK_SIZE);
 		data_key_init(c, &key, inode->i_ino, block);
 		err = ubifs_jnl_write_data(c, inode, &key, addr, blen);
 		if (err)
 			break;
-		if (++i >= UBIFS_BLOCKS_PER_PAGE)
+		len -= blen;
+		if (!len)
 			break;
 		block += 1;
 		addr += blen;
-		len -= blen;
+		if (folio_test_highmem(folio) && !offset_in_page(addr)) {
+			kunmap_local(addr - blen);
+			offset += PAGE_SIZE;
+			addr = kmap_local_folio(folio, offset);
+		}
 	}
+	kunmap_local(addr);
 	if (err) {
-		SetPageError(page);
-		ubifs_err(c, "cannot write page %lu of inode %lu, error %d",
-			  page->index, inode->i_ino, err);
+		mapping_set_error(folio->mapping, err);
+		ubifs_err(c, "cannot write folio %lu of inode %lu, error %d",
+			  folio->index, inode->i_ino, err);
 		ubifs_ro_mode(c, err);
 	}
 
-	ubifs_assert(c, PagePrivate(page));
-	if (PageChecked(page))
+	ubifs_assert(c, folio->private != NULL);
+	if (folio_test_checked(folio))
 		release_new_page_budget(c);
 	else
 		release_existing_page_budget(c);
 
 	atomic_long_dec(&c->dirty_pg_cnt);
-	detach_page_private(page);
-	ClearPageChecked(page);
+	folio_detach_private(folio);
+	folio_clear_checked(folio);
 
-	kunmap(page);
-	unlock_page(page);
-	end_page_writeback(page);
+	folio_unlock(folio);
+	folio_end_writeback(folio);
 	return err;
 }
 
@@ -1006,22 +1002,21 @@ static int do_writepage(struct page *page, int len)
  * on the page lock and it would not write the truncated inode node to the
  * journal before we have finished.
  */
-static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
+static int ubifs_writepage(struct folio *folio, struct writeback_control *wbc,
+		void *data)
 {
-	struct inode *inode = page->mapping->host;
+	struct inode *inode = folio->mapping->host;
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
 	struct ubifs_inode *ui = ubifs_inode(inode);
 	loff_t i_size =  i_size_read(inode), synced_i_size;
-	pgoff_t end_index = i_size >> PAGE_SHIFT;
-	int err, len = i_size & (PAGE_SIZE - 1);
-	void *kaddr;
+	int err, len = folio_size(folio);
 
 	dbg_gen("ino %lu, pg %lu, pg flags %#lx",
-		inode->i_ino, page->index, page->flags);
-	ubifs_assert(c, PagePrivate(page));
+		inode->i_ino, folio->index, folio->flags);
+	ubifs_assert(c, folio->private != NULL);
 
-	/* Is the page fully outside @i_size? (truncate in progress) */
-	if (page->index > end_index || (page->index == end_index && !len)) {
+	/* Is the folio fully outside @i_size? (truncate in progress) */
+	if (folio_pos(folio) >= i_size) {
 		err = 0;
 		goto out_unlock;
 	}
@@ -1030,9 +1025,9 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
 	synced_i_size = ui->synced_i_size;
 	spin_unlock(&ui->ui_lock);
 
-	/* Is the page fully inside @i_size? */
-	if (page->index < end_index) {
-		if (page->index >= synced_i_size >> PAGE_SHIFT) {
+	/* Is the folio fully inside i_size? */
+	if (folio_pos(folio) + len <= i_size) {
+		if (folio_pos(folio) >= synced_i_size) {
 			err = inode->i_sb->s_op->write_inode(inode, NULL);
 			if (err)
 				goto out_redirty;
@@ -1045,20 +1040,18 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
 			 * with this.
 			 */
 		}
-		return do_writepage(page, PAGE_SIZE);
+		return do_writepage(folio, len);
 	}
 
 	/*
-	 * The page straddles @i_size. It must be zeroed out on each and every
+	 * The folio straddles @i_size. It must be zeroed out on each and every
 	 * writepage invocation because it may be mmapped. "A file is mapped
 	 * in multiples of the page size. For a file that is not a multiple of
 	 * the page size, the remaining memory is zeroed when mapped, and
 	 * writes to that region are not written out to the file."
 	 */
-	kaddr = kmap_atomic(page);
-	memset(kaddr + len, 0, PAGE_SIZE - len);
-	flush_dcache_page(page);
-	kunmap_atomic(kaddr);
+	len = i_size - folio_pos(folio);
+	folio_zero_segment(folio, len, folio_size(folio));
 
 	if (i_size > synced_i_size) {
 		err = inode->i_sb->s_op->write_inode(inode, NULL);
@@ -1066,19 +1059,25 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
 			goto out_redirty;
 	}
 
-	return do_writepage(page, len);
+	return do_writepage(folio, len);
 out_redirty:
 	/*
-	 * redirty_page_for_writepage() won't call ubifs_dirty_inode() because
+	 * folio_redirty_for_writepage() won't call ubifs_dirty_inode() because
 	 * it passes I_DIRTY_PAGES flag while calling __mark_inode_dirty(), so
 	 * there is no need to do space budget for dirty inode.
 	 */
-	redirty_page_for_writepage(wbc, page);
+	folio_redirty_for_writepage(wbc, folio);
 out_unlock:
-	unlock_page(page);
+	folio_unlock(folio);
 	return err;
 }
 
+static int ubifs_writepages(struct address_space *mapping,
+		struct writeback_control *wbc)
+{
+	return write_cache_pages(mapping, wbc, ubifs_writepage, NULL);
+}
+
 /**
  * do_attr_changes - change inode attributes.
  * @inode: inode to change attributes for
@@ -1155,11 +1154,11 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
 
 	if (offset) {
 		pgoff_t index = new_size >> PAGE_SHIFT;
-		struct page *page;
+		struct folio *folio;
 
-		page = find_lock_page(inode->i_mapping, index);
-		if (page) {
-			if (PageDirty(page)) {
+		folio = filemap_lock_folio(inode->i_mapping, index);
+		if (!IS_ERR(folio)) {
+			if (folio_test_dirty(folio)) {
 				/*
 				 * 'ubifs_jnl_truncate()' will try to truncate
 				 * the last data node, but it contains
@@ -1168,14 +1167,14 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
 				 * 'ubifs_jnl_truncate()' will see an already
 				 * truncated (and up to date) data node.
 				 */
-				ubifs_assert(c, PagePrivate(page));
+				ubifs_assert(c, folio->private != NULL);
 
-				clear_page_dirty_for_io(page);
+				folio_clear_dirty_for_io(folio);
 				if (UBIFS_BLOCKS_PER_PAGE_SHIFT)
-					offset = new_size &
-						 (PAGE_SIZE - 1);
-				err = do_writepage(page, offset);
-				put_page(page);
+					offset = offset_in_folio(folio,
+							new_size);
+				err = do_writepage(folio, offset);
+				folio_put(folio);
 				if (err)
 					goto out_budg;
 				/*
@@ -1188,8 +1187,8 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
 				 * to 'ubifs_jnl_truncate()' to save it from
 				 * having to read it.
 				 */
-				unlock_page(page);
-				put_page(page);
+				folio_unlock(folio);
+				folio_put(folio);
 			}
 		}
 	}
@@ -1512,14 +1511,14 @@ static bool ubifs_release_folio(struct folio *folio, gfp_t unused_gfp_flags)
  */
 static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
 {
-	struct page *page = vmf->page;
+	struct folio *folio = page_folio(vmf->page);
 	struct inode *inode = file_inode(vmf->vma->vm_file);
 	struct ubifs_info *c = inode->i_sb->s_fs_info;
 	struct timespec64 now = current_time(inode);
 	struct ubifs_budget_req req = { .new_page = 1 };
 	int err, update_time;
 
-	dbg_gen("ino %lu, pg %lu, i_size %lld",	inode->i_ino, page->index,
+	dbg_gen("ino %lu, pg %lu, i_size %lld",	inode->i_ino, folio->index,
 		i_size_read(inode));
 	ubifs_assert(c, !c->ro_media && !c->ro_mount);
 
@@ -1527,17 +1526,17 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
 		return VM_FAULT_SIGBUS; /* -EROFS */
 
 	/*
-	 * We have not locked @page so far so we may budget for changing the
-	 * page. Note, we cannot do this after we locked the page, because
+	 * We have not locked @folio so far so we may budget for changing the
+	 * folio. Note, we cannot do this after we locked the folio, because
 	 * budgeting may cause write-back which would cause deadlock.
 	 *
-	 * At the moment we do not know whether the page is dirty or not, so we
-	 * assume that it is not and budget for a new page. We could look at
+	 * At the moment we do not know whether the folio is dirty or not, so we
+	 * assume that it is not and budget for a new folio. We could look at
 	 * the @PG_private flag and figure this out, but we may race with write
-	 * back and the page state may change by the time we lock it, so this
+	 * back and the folio state may change by the time we lock it, so this
 	 * would need additional care. We do not bother with this at the
 	 * moment, although it might be good idea to do. Instead, we allocate
-	 * budget for a new page and amend it later on if the page was in fact
+	 * budget for a new folio and amend it later on if the folio was in fact
 	 * dirty.
 	 *
 	 * The budgeting-related logic of this function is similar to what we
@@ -1560,21 +1559,21 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
 		return VM_FAULT_SIGBUS;
 	}
 
-	lock_page(page);
-	if (unlikely(page->mapping != inode->i_mapping ||
-		     page_offset(page) > i_size_read(inode))) {
-		/* Page got truncated out from underneath us */
+	folio_lock(folio);
+	if (unlikely(folio->mapping != inode->i_mapping ||
+		     folio_pos(folio) >= i_size_read(inode))) {
+		/* Folio got truncated out from underneath us */
 		goto sigbus;
 	}
 
-	if (PagePrivate(page))
+	if (folio->private)
 		release_new_page_budget(c);
 	else {
-		if (!PageChecked(page))
+		if (!folio_test_checked(folio))
 			ubifs_convert_page_budget(c);
-		attach_page_private(page, (void *)1);
+		folio_attach_private(folio, (void *)1);
 		atomic_long_inc(&c->dirty_pg_cnt);
-		__set_page_dirty_nobuffers(page);
+		filemap_dirty_folio(folio->mapping, folio);
 	}
 
 	if (update_time) {
@@ -1590,11 +1589,11 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf)
 			ubifs_release_dirty_inode_budget(c, ui);
 	}
 
-	wait_for_stable_page(page);
+	folio_wait_stable(folio);
 	return VM_FAULT_LOCKED;
 
 sigbus:
-	unlock_page(page);
+	folio_unlock(folio);
 	ubifs_release_budget(c, &req);
 	return VM_FAULT_SIGBUS;
 }
@@ -1648,7 +1647,7 @@ static int ubifs_symlink_getattr(struct mnt_idmap *idmap,
 
 const struct address_space_operations ubifs_file_address_operations = {
 	.read_folio     = ubifs_read_folio,
-	.writepage      = ubifs_writepage,
+	.writepages     = ubifs_writepages,
 	.write_begin    = ubifs_write_begin,
 	.write_end      = ubifs_write_end,
 	.invalidate_folio = ubifs_invalidate_folio,
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 873e6e1c92b5..6ebf3c04ac5f 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -82,8 +82,9 @@ static int valuable(struct ubifs_info *c, const struct ubifs_lprops *lprops)
  */
 static int scan_for_dirty_cb(struct ubifs_info *c,
 			     const struct ubifs_lprops *lprops, int in_tree,
-			     struct scan_data *data)
+			     void *arg)
 {
+	struct scan_data *data = arg;
 	int ret = LPT_SCAN_CONTINUE;
 
 	/* Exclude LEBs that are currently in use */
@@ -166,8 +167,7 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c,
 	data.pick_free = pick_free;
 	data.lnum = -1;
 	data.exclude_index = exclude_index;
-	err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
-				    (ubifs_lpt_scan_callback)scan_for_dirty_cb,
+	err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, scan_for_dirty_cb,
 				    &data);
 	if (err)
 		return ERR_PTR(err);
@@ -349,8 +349,9 @@ out:
  */
 static int scan_for_free_cb(struct ubifs_info *c,
 			    const struct ubifs_lprops *lprops, int in_tree,
-			    struct scan_data *data)
+			    void *arg)
 {
+	struct scan_data *data = arg;
 	int ret = LPT_SCAN_CONTINUE;
 
 	/* Exclude LEBs that are currently in use */
@@ -446,7 +447,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
 	data.pick_free = pick_free;
 	data.lnum = -1;
 	err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
-				    (ubifs_lpt_scan_callback)scan_for_free_cb,
+				    scan_for_free_cb,
 				    &data);
 	if (err)
 		return ERR_PTR(err);
@@ -589,8 +590,9 @@ out:
  */
 static int scan_for_idx_cb(struct ubifs_info *c,
 			   const struct ubifs_lprops *lprops, int in_tree,
-			   struct scan_data *data)
+			   void *arg)
 {
+	struct scan_data *data = arg;
 	int ret = LPT_SCAN_CONTINUE;
 
 	/* Exclude LEBs that are currently in use */
@@ -625,8 +627,7 @@ static const struct ubifs_lprops *scan_for_leb_for_idx(struct ubifs_info *c)
 	int err;
 
 	data.lnum = -1;
-	err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
-				    (ubifs_lpt_scan_callback)scan_for_idx_cb,
+	err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, scan_for_idx_cb,
 				    &data);
 	if (err)
 		return ERR_PTR(err);
@@ -726,11 +727,10 @@ out:
 	return err;
 }
 
-static int cmp_dirty_idx(const struct ubifs_lprops **a,
-			 const struct ubifs_lprops **b)
+static int cmp_dirty_idx(const void *a, const void *b)
 {
-	const struct ubifs_lprops *lpa = *a;
-	const struct ubifs_lprops *lpb = *b;
+	const struct ubifs_lprops *lpa = *(const struct ubifs_lprops **)a;
+	const struct ubifs_lprops *lpb = *(const struct ubifs_lprops **)b;
 
 	return lpa->dirty + lpa->free - lpb->dirty - lpb->free;
 }
@@ -754,7 +754,7 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c)
 	       sizeof(void *) * c->dirty_idx.cnt);
 	/* Sort it so that the dirtiest is now at the end */
 	sort(c->dirty_idx.arr, c->dirty_idx.cnt, sizeof(void *),
-	     (int (*)(const void *, const void *))cmp_dirty_idx, NULL);
+	     cmp_dirty_idx, NULL);
 	dbg_find("found %d dirty index LEBs", c->dirty_idx.cnt);
 	if (c->dirty_idx.cnt)
 		dbg_find("dirtiest index LEB is %d with dirty %d and free %d",
@@ -782,8 +782,9 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c)
  */
 static int scan_dirty_idx_cb(struct ubifs_info *c,
 			   const struct ubifs_lprops *lprops, int in_tree,
-			   struct scan_data *data)
+			   void *arg)
 {
+	struct scan_data *data = arg;
 	int ret = LPT_SCAN_CONTINUE;
 
 	/* Exclude LEBs that are currently in use */
@@ -842,8 +843,7 @@ static int find_dirty_idx_leb(struct ubifs_info *c)
 	if (c->pnodes_have >= c->pnode_cnt)
 		/* All pnodes are in memory, so skip scan */
 		return -ENOSPC;
-	err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum,
-				    (ubifs_lpt_scan_callback)scan_dirty_idx_cb,
+	err = ubifs_lpt_scan_nolock(c, -1, c->lscan_lnum, scan_dirty_idx_cb,
 				    &data);
 	if (err)
 		return err;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index f0a5538c84b0..74aee92433d7 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -293,6 +293,96 @@ static int write_head(struct ubifs_info *c, int jhead, void *buf, int len,
 }
 
 /**
+ * __queue_and_wait - queue a task and wait until the task is waked up.
+ * @c: UBIFS file-system description object
+ *
+ * This function adds current task in queue and waits until the task is waked
+ * up. This function should be called with @c->reserve_space_wq locked.
+ */
+static void __queue_and_wait(struct ubifs_info *c)
+{
+	DEFINE_WAIT(wait);
+
+	__add_wait_queue_entry_tail_exclusive(&c->reserve_space_wq, &wait);
+	set_current_state(TASK_UNINTERRUPTIBLE);
+	spin_unlock(&c->reserve_space_wq.lock);
+
+	schedule();
+	finish_wait(&c->reserve_space_wq, &wait);
+}
+
+/**
+ * wait_for_reservation - try queuing current task to wait until waked up.
+ * @c: UBIFS file-system description object
+ *
+ * This function queues current task to wait until waked up, if queuing is
+ * started(@c->need_wait_space is not %0). Returns %true if current task is
+ * added in queue, otherwise %false is returned.
+ */
+static bool wait_for_reservation(struct ubifs_info *c)
+{
+	if (likely(atomic_read(&c->need_wait_space) == 0))
+		/* Quick path to check whether queuing is started. */
+		return false;
+
+	spin_lock(&c->reserve_space_wq.lock);
+	if (atomic_read(&c->need_wait_space) == 0) {
+		/* Queuing is not started, don't queue current task. */
+		spin_unlock(&c->reserve_space_wq.lock);
+		return false;
+	}
+
+	__queue_and_wait(c);
+	return true;
+}
+
+/**
+ * wake_up_reservation - wake up first task in queue or stop queuing.
+ * @c: UBIFS file-system description object
+ *
+ * This function wakes up the first task in queue if it exists, or stops
+ * queuing if no tasks in queue.
+ */
+static void wake_up_reservation(struct ubifs_info *c)
+{
+	spin_lock(&c->reserve_space_wq.lock);
+	if (waitqueue_active(&c->reserve_space_wq))
+		wake_up_locked(&c->reserve_space_wq);
+	else
+		/*
+		 * Compared with wait_for_reservation(), set @c->need_wait_space
+		 * under the protection of wait queue lock, which can avoid that
+		 * @c->need_wait_space is set to 0 after new task queued.
+		 */
+		atomic_set(&c->need_wait_space, 0);
+	spin_unlock(&c->reserve_space_wq.lock);
+}
+
+/**
+ * wake_up_reservation - add current task in queue or start queuing.
+ * @c: UBIFS file-system description object
+ *
+ * This function starts queuing if queuing is not started, otherwise adds
+ * current task in queue.
+ */
+static void add_or_start_queue(struct ubifs_info *c)
+{
+	spin_lock(&c->reserve_space_wq.lock);
+	if (atomic_cmpxchg(&c->need_wait_space, 0, 1) == 0) {
+		/* Starts queuing, task can go on directly. */
+		spin_unlock(&c->reserve_space_wq.lock);
+		return;
+	}
+
+	/*
+	 * There are at least two tasks have retried more than 32 times
+	 * at certain point, first task has started queuing, just queue
+	 * the left tasks.
+	 */
+	__queue_and_wait(c);
+}
+
+/**
  * make_reservation - reserve journal space.
  * @c: UBIFS file-system description object
  * @jhead: journal head
@@ -311,33 +401,27 @@ static int write_head(struct ubifs_info *c, int jhead, void *buf, int len,
 static int make_reservation(struct ubifs_info *c, int jhead, int len)
 {
 	int err, cmt_retries = 0, nospc_retries = 0;
+	bool blocked = wait_for_reservation(c);
 
 again:
 	down_read(&c->commit_sem);
 	err = reserve_space(c, jhead, len);
-	if (!err)
+	if (!err) {
 		/* c->commit_sem will get released via finish_reservation(). */
-		return 0;
+		goto out_wake_up;
+	}
 	up_read(&c->commit_sem);
 
 	if (err == -ENOSPC) {
 		/*
 		 * GC could not make any progress. We should try to commit
-		 * once because it could make some dirty space and GC would
-		 * make progress, so make the error -EAGAIN so that the below
+		 * because it could make some dirty space and GC would make
+		 * progress, so make the error -EAGAIN so that the below
 		 * will commit and re-try.
 		 */
-		if (nospc_retries++ < 2) {
-			dbg_jnl("no space, retry");
-			err = -EAGAIN;
-		}
-
-		/*
-		 * This means that the budgeting is incorrect. We always have
-		 * to be able to write to the media, because all operations are
-		 * budgeted. Deletions are not budgeted, though, but we reserve
-		 * an extra LEB for them.
-		 */
+		nospc_retries++;
+		dbg_jnl("no space, retry");
+		err = -EAGAIN;
 	}
 
 	if (err != -EAGAIN)
@@ -349,15 +433,37 @@ again:
 	 */
 	if (cmt_retries > 128) {
 		/*
-		 * This should not happen unless the journal size limitations
-		 * are too tough.
+		 * This should not happen unless:
+		 * 1. The journal size limitations are too tough.
+		 * 2. The budgeting is incorrect. We always have to be able to
+		 *    write to the media, because all operations are budgeted.
+		 *    Deletions are not budgeted, though, but we reserve an
+		 *    extra LEB for them.
 		 */
-		ubifs_err(c, "stuck in space allocation");
+		ubifs_err(c, "stuck in space allocation, nospc_retries %d",
+			  nospc_retries);
 		err = -ENOSPC;
 		goto out;
-	} else if (cmt_retries > 32)
-		ubifs_warn(c, "too many space allocation re-tries (%d)",
-			   cmt_retries);
+	} else if (cmt_retries > 32) {
+		/*
+		 * It's almost impossible to happen, unless there are many tasks
+		 * making reservation concurrently and someone task has retried
+		 * gc + commit for many times, generated available space during
+		 * this period are grabbed by other tasks.
+		 * But if it happens, start queuing up all tasks that will make
+		 * space reservation, then there is only one task making space
+		 * reservation at any time, and it can always make success under
+		 * the premise of correct budgeting.
+		 */
+		ubifs_warn(c, "too many space allocation cmt_retries (%d) "
+			   "nospc_retries (%d), start queuing tasks",
+			   cmt_retries, nospc_retries);
+
+		if (!blocked) {
+			blocked = true;
+			add_or_start_queue(c);
+		}
+	}
 
 	dbg_jnl("-EAGAIN, commit and retry (retried %d times)",
 		cmt_retries);
@@ -365,7 +471,7 @@ again:
 
 	err = ubifs_run_commit(c);
 	if (err)
-		return err;
+		goto out_wake_up;
 	goto again;
 
 out:
@@ -380,6 +486,27 @@ out:
 		cmt_retries = dbg_check_lprops(c);
 		up_write(&c->commit_sem);
 	}
+out_wake_up:
+	if (blocked) {
+		/*
+		 * Only tasks that have ever started queuing or ever been queued
+		 * can wake up other queued tasks, which can make sure that
+		 * there is only one task waked up to make space reservation.
+		 * For example:
+		 *      task A          task B           task C
+		 *                 make_reservation  make_reservation
+		 * reserve_space // 0
+		 * wake_up_reservation
+		 *                  atomic_cmpxchg // 0, start queuing
+		 *                  reserve_space
+		 *                                    wait_for_reservation
+		 *                                     __queue_and_wait
+		 *                                      add_wait_queue
+		 *  if (blocked) // false
+		 *  // So that task C won't be waked up to race with task B
+		 */
+		wake_up_reservation(c);
+	}
 	return err;
 }
 
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 6d6cd85c2b4c..a11c3dab7e16 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -1014,8 +1014,9 @@ out:
  */
 static int scan_check_cb(struct ubifs_info *c,
 			 const struct ubifs_lprops *lp, int in_tree,
-			 struct ubifs_lp_stats *lst)
+			 void *arg)
 {
+	struct ubifs_lp_stats *lst = arg;
 	struct ubifs_scan_leb *sleb;
 	struct ubifs_scan_node *snod;
 	int cat, lnum = lp->lnum, is_idx = 0, used = 0, free, dirty, ret;
@@ -1269,8 +1270,7 @@ int dbg_check_lprops(struct ubifs_info *c)
 
 	memset(&lst, 0, sizeof(struct ubifs_lp_stats));
 	err = ubifs_lpt_scan_nolock(c, c->main_first, c->leb_cnt - 1,
-				    (ubifs_lpt_scan_callback)scan_check_cb,
-				    &lst);
+				    scan_check_cb, &lst);
 	if (err && err != -ENOSPC)
 		goto out;
 
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index c4d079328b92..07351fdce722 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -1646,7 +1646,6 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
 		len -= node_len;
 	}
 
-	err = 0;
 out:
 	vfree(buf);
 	return err;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 09e270d6ed02..291583005dd1 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2151,6 +2151,8 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi)
 		mutex_init(&c->bu_mutex);
 		mutex_init(&c->write_reserve_mutex);
 		init_waitqueue_head(&c->cmt_wq);
+		init_waitqueue_head(&c->reserve_space_wq);
+		atomic_set(&c->need_wait_space, 0);
 		c->buds = RB_ROOT;
 		c->old_idx = RB_ROOT;
 		c->size_tree = RB_ROOT;
@@ -2239,13 +2241,14 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 		goto out_umount;
 	}
 
+	generic_set_sb_d_ops(sb);
 	sb->s_root = d_make_root(root);
 	if (!sb->s_root) {
 		err = -ENOMEM;
 		goto out_umount;
 	}
 
-	import_uuid(&sb->s_uuid, c->uuid);
+	super_set_uuid(sb, c->uuid, sizeof(c->uuid));
 
 	mutex_unlock(&c->umount_mutex);
 	return 0;
@@ -2433,8 +2436,8 @@ static int __init ubifs_init(void)
 
 	ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab",
 				sizeof(struct ubifs_inode), 0,
-				SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT |
-				SLAB_ACCOUNT, &inode_slab_ctor);
+				SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
+				&inode_slab_ctor);
 	if (!ubifs_inode_slab)
 		return -ENOMEM;
 
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index f4728e65d1bd..45cacdcd4746 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -3116,14 +3116,7 @@ static void tnc_destroy_cnext(struct ubifs_info *c)
 void ubifs_tnc_close(struct ubifs_info *c)
 {
 	tnc_destroy_cnext(c);
-	if (c->zroot.znode) {
-		long n, freed;
-
-		n = atomic_long_read(&c->clean_zn_cnt);
-		freed = ubifs_destroy_tnc_subtree(c, c->zroot.znode);
-		ubifs_assert(c, freed == n);
-		atomic_long_sub(n, &ubifs_clean_zn_cnt);
-	}
+	ubifs_destroy_tnc_tree(c);
 	kfree(c->gap_lebs);
 	kfree(c->ilebs);
 	destroy_old_idx(c);
diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c
index 4d686e34e64d..d3f8a6aa1f49 100644
--- a/fs/ubifs/tnc_misc.c
+++ b/fs/ubifs/tnc_misc.c
@@ -251,6 +251,28 @@ long ubifs_destroy_tnc_subtree(const struct ubifs_info *c,
 }
 
 /**
+ * ubifs_destroy_tnc_tree - destroy all znodes connected to the TNC tree.
+ * @c: UBIFS file-system description object
+ *
+ * This function destroys the whole TNC tree and updates clean global znode
+ * count.
+ */
+void ubifs_destroy_tnc_tree(struct ubifs_info *c)
+{
+	long n, freed;
+
+	if (!c->zroot.znode)
+		return;
+
+	n = atomic_long_read(&c->clean_zn_cnt);
+	freed = ubifs_destroy_tnc_subtree(c, c->zroot.znode);
+	ubifs_assert(c, freed == n);
+	atomic_long_sub(n, &ubifs_clean_zn_cnt);
+
+	c->zroot.znode = NULL;
+}
+
+/**
  * read_znode - read an indexing node from flash and fill znode.
  * @c: UBIFS file-system description object
  * @zzbr: the zbranch describing the node to read
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 3916dc4f30ca..1f3ea879d93a 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1047,6 +1047,8 @@ struct ubifs_debug_info;
  * @bg_bud_bytes: number of bud bytes when background commit is initiated
  * @old_buds: buds to be released after commit ends
  * @max_bud_cnt: maximum number of buds
+ * @need_wait_space: Non %0 means space reservation tasks need to wait in queue
+ * @reserve_space_wq: wait queue to sleep on if @need_wait_space is not %0
  *
  * @commit_sem: synchronizes committer with other processes
  * @cmt_state: commit state
@@ -1305,6 +1307,8 @@ struct ubifs_info {
 	long long bg_bud_bytes;
 	struct list_head old_buds;
 	int max_bud_cnt;
+	atomic_t need_wait_space;
+	wait_queue_head_t reserve_space_wq;
 
 	struct rw_semaphore commit_sem;
 	int cmt_state;
@@ -1903,6 +1907,7 @@ struct ubifs_znode *ubifs_tnc_postorder_next(const struct ubifs_info *c,
 					     struct ubifs_znode *znode);
 long ubifs_destroy_tnc_subtree(const struct ubifs_info *c,
 			       struct ubifs_znode *zr);
+void ubifs_destroy_tnc_tree(struct ubifs_info *c);
 struct ubifs_znode *ubifs_load_znode(struct ubifs_info *c,
 				     struct ubifs_zbranch *zbr,
 				     struct ubifs_znode *parent, int iip);
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index f6533f93851b..f94f45fe2c91 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -67,7 +67,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
 		pos_valid = true;
 	}
 
-	fname = kmalloc(UDF_NAME_LEN, GFP_NOFS);
+	fname = kmalloc(UDF_NAME_LEN, GFP_KERNEL);
 	if (!fname) {
 		ret = -ENOMEM;
 		goto out;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index d8493449d4c5..2f831a3a91af 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -357,7 +357,7 @@ int udf_expand_file_adinicb(struct inode *inode)
 		return 0;
 	}
 
-	page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
+	page = find_or_create_page(inode->i_mapping, 0, GFP_KERNEL);
 	if (!page)
 		return -ENOMEM;
 
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 1bb6ed948927..1308109fd42d 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -59,7 +59,7 @@ static int udf_fiiter_find_entry(struct inode *dir, const struct qstr *child,
 		child->name[0] == '.' && child->name[1] == '.';
 	int ret;
 
-	fname = kmalloc(UDF_NAME_LEN, GFP_NOFS);
+	fname = kmalloc(UDF_NAME_LEN, GFP_KERNEL);
 	if (!fname)
 		return -ENOMEM;
 
@@ -566,7 +566,7 @@ out:
 static int udf_symlink(struct mnt_idmap *idmap, struct inode *dir,
 		       struct dentry *dentry, const char *symname)
 {
-	struct inode *inode = udf_new_inode(dir, S_IFLNK | 0777);
+	struct inode *inode;
 	struct pathComponent *pc;
 	const char *compstart;
 	struct extent_position epos = {};
@@ -579,17 +579,20 @@ static int udf_symlink(struct mnt_idmap *idmap, struct inode *dir,
 	struct udf_inode_info *iinfo;
 	struct super_block *sb = dir->i_sb;
 
-	if (IS_ERR(inode))
-		return PTR_ERR(inode);
-
-	iinfo = UDF_I(inode);
-	down_write(&iinfo->i_data_sem);
-	name = kmalloc(UDF_NAME_LEN_CS0, GFP_NOFS);
+	name = kmalloc(UDF_NAME_LEN_CS0, GFP_KERNEL);
 	if (!name) {
 		err = -ENOMEM;
-		goto out_no_entry;
+		goto out;
+	}
+
+	inode = udf_new_inode(dir, S_IFLNK | 0777);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out;
 	}
 
+	iinfo = UDF_I(inode);
+	down_write(&iinfo->i_data_sem);
 	inode->i_data.a_ops = &udf_symlink_aops;
 	inode->i_op = &udf_symlink_inode_operations;
 	inode_nohighmem(inode);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 928a04d9d9e0..2217f7ed7a49 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -40,20 +40,20 @@
 #include <linux/slab.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/parser.h>
 #include <linux/stat.h>
 #include <linux/cdrom.h>
 #include <linux/nls.h>
 #include <linux/vfs.h>
 #include <linux/vmalloc.h>
 #include <linux/errno.h>
-#include <linux/mount.h>
 #include <linux/seq_file.h>
 #include <linux/bitmap.h>
 #include <linux/crc-itu-t.h>
 #include <linux/log2.h>
 #include <asm/byteorder.h>
 #include <linux/iversion.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
 
 #include "udf_sb.h"
 #include "udf_i.h"
@@ -91,16 +91,20 @@ enum { UDF_MAX_LINKS = 0xffff };
 #define UDF_MAX_FILESIZE (1ULL << 42)
 
 /* These are the "meat" - everything else is stuffing */
-static int udf_fill_super(struct super_block *, void *, int);
+static int udf_fill_super(struct super_block *sb, struct fs_context *fc);
 static void udf_put_super(struct super_block *);
 static int udf_sync_fs(struct super_block *, int);
-static int udf_remount_fs(struct super_block *, int *, char *);
 static void udf_load_logicalvolint(struct super_block *, struct kernel_extent_ad);
 static void udf_open_lvid(struct super_block *);
 static void udf_close_lvid(struct super_block *);
 static unsigned int udf_count_free(struct super_block *);
 static int udf_statfs(struct dentry *, struct kstatfs *);
 static int udf_show_options(struct seq_file *, struct dentry *);
+static int udf_init_fs_context(struct fs_context *fc);
+static int udf_parse_param(struct fs_context *fc, struct fs_parameter *param);
+static int udf_reconfigure(struct fs_context *fc);
+static void udf_free_fc(struct fs_context *fc);
+static const struct fs_parameter_spec udf_param_spec[];
 
 struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct super_block *sb)
 {
@@ -119,18 +123,25 @@ struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct super_block *sb)
 }
 
 /* UDF filesystem type */
-static struct dentry *udf_mount(struct file_system_type *fs_type,
-		      int flags, const char *dev_name, void *data)
+static int udf_get_tree(struct fs_context *fc)
 {
-	return mount_bdev(fs_type, flags, dev_name, data, udf_fill_super);
+	return get_tree_bdev(fc, udf_fill_super);
 }
 
+static const struct fs_context_operations udf_context_ops = {
+	.parse_param	= udf_parse_param,
+	.get_tree	= udf_get_tree,
+	.reconfigure	= udf_reconfigure,
+	.free		= udf_free_fc,
+};
+
 static struct file_system_type udf_fstype = {
 	.owner		= THIS_MODULE,
 	.name		= "udf",
-	.mount		= udf_mount,
 	.kill_sb	= kill_block_super,
 	.fs_flags	= FS_REQUIRES_DEV,
+	.init_fs_context = udf_init_fs_context,
+	.parameters	= udf_param_spec,
 };
 MODULE_ALIAS_FS("udf");
 
@@ -177,7 +188,6 @@ static int __init init_inodecache(void)
 	udf_inode_cachep = kmem_cache_create("udf_inode_cache",
 					     sizeof(struct udf_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT |
-						 SLAB_MEM_SPREAD |
 						 SLAB_ACCOUNT),
 					     init_once);
 	if (!udf_inode_cachep)
@@ -204,12 +214,10 @@ static const struct super_operations udf_sb_ops = {
 	.put_super	= udf_put_super,
 	.sync_fs	= udf_sync_fs,
 	.statfs		= udf_statfs,
-	.remount_fs	= udf_remount_fs,
 	.show_options	= udf_show_options,
 };
 
 struct udf_options {
-	unsigned char novrs;
 	unsigned int blocksize;
 	unsigned int session;
 	unsigned int lastblock;
@@ -223,6 +231,65 @@ struct udf_options {
 	struct nls_table *nls_map;
 };
 
+/*
+ * UDF has historically preserved prior mount options across
+ * a remount, so copy those here if remounting, otherwise set
+ * initial mount defaults.
+ */
+static void udf_init_options(struct fs_context *fc, struct udf_options *uopt)
+{
+	if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+		struct super_block *sb = fc->root->d_sb;
+		struct udf_sb_info *sbi = UDF_SB(sb);
+
+		uopt->flags = sbi->s_flags;
+		uopt->uid   = sbi->s_uid;
+		uopt->gid   = sbi->s_gid;
+		uopt->umask = sbi->s_umask;
+		uopt->fmode = sbi->s_fmode;
+		uopt->dmode = sbi->s_dmode;
+		uopt->nls_map = NULL;
+	} else {
+		uopt->flags = (1 << UDF_FLAG_USE_AD_IN_ICB) |
+			      (1 << UDF_FLAG_STRICT);
+		/*
+		 * By default we'll use overflow[ug]id when UDF
+		 * inode [ug]id == -1
+		 */
+		uopt->uid = make_kuid(current_user_ns(), overflowuid);
+		uopt->gid = make_kgid(current_user_ns(), overflowgid);
+		uopt->umask = 0;
+		uopt->fmode = UDF_INVALID_MODE;
+		uopt->dmode = UDF_INVALID_MODE;
+		uopt->nls_map = NULL;
+		uopt->session = 0xFFFFFFFF;
+	}
+}
+
+static int udf_init_fs_context(struct fs_context *fc)
+{
+	struct udf_options *uopt;
+
+	uopt = kzalloc(sizeof(*uopt), GFP_KERNEL);
+	if (!uopt)
+		return -ENOMEM;
+
+	udf_init_options(fc, uopt);
+
+	fc->fs_private = uopt;
+	fc->ops = &udf_context_ops;
+
+	return 0;
+}
+
+static void udf_free_fc(struct fs_context *fc)
+{
+	struct udf_options *uopt = fc->fs_private;
+
+	unload_nls(uopt->nls_map);
+	kfree(fc->fs_private);
+}
+
 static int __init init_udf_fs(void)
 {
 	int err;
@@ -358,7 +425,7 @@ static int udf_show_options(struct seq_file *seq, struct dentry *root)
 }
 
 /*
- * udf_parse_options
+ * udf_parse_param
  *
  * PURPOSE
  *	Parse mount options.
@@ -401,12 +468,12 @@ static int udf_show_options(struct seq_file *seq, struct dentry *root)
  *		yield highly unpredictable results.
  *
  * PRE-CONDITIONS
- *	options		Pointer to mount options string.
- *	uopts		Pointer to mount options variable.
+ *	fc		fs_context with pointer to mount options variable.
+ *	param		Pointer to fs_parameter being parsed.
  *
  * POST-CONDITIONS
- *	<return>	1	Mount options parsed okay.
- *	<return>	0	Error parsing mount options.
+ *	<return>	0	Mount options parsed okay.
+ *	<return>	errno	Error parsing mount options.
  *
  * HISTORY
  *	July 1, 1997 - Andrew E. Mileski
@@ -418,229 +485,193 @@ enum {
 	Opt_noadinicb, Opt_adinicb, Opt_shortad, Opt_longad,
 	Opt_gid, Opt_uid, Opt_umask, Opt_session, Opt_lastblock,
 	Opt_anchor, Opt_volume, Opt_partition, Opt_fileset,
-	Opt_rootdir, Opt_utf8, Opt_iocharset,
-	Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore,
-	Opt_fmode, Opt_dmode
-};
-
-static const match_table_t tokens = {
-	{Opt_novrs,	"novrs"},
-	{Opt_nostrict,	"nostrict"},
-	{Opt_bs,	"bs=%u"},
-	{Opt_unhide,	"unhide"},
-	{Opt_undelete,	"undelete"},
-	{Opt_noadinicb,	"noadinicb"},
-	{Opt_adinicb,	"adinicb"},
-	{Opt_shortad,	"shortad"},
-	{Opt_longad,	"longad"},
-	{Opt_uforget,	"uid=forget"},
-	{Opt_uignore,	"uid=ignore"},
-	{Opt_gforget,	"gid=forget"},
-	{Opt_gignore,	"gid=ignore"},
-	{Opt_gid,	"gid=%u"},
-	{Opt_uid,	"uid=%u"},
-	{Opt_umask,	"umask=%o"},
-	{Opt_session,	"session=%u"},
-	{Opt_lastblock,	"lastblock=%u"},
-	{Opt_anchor,	"anchor=%u"},
-	{Opt_volume,	"volume=%u"},
-	{Opt_partition,	"partition=%u"},
-	{Opt_fileset,	"fileset=%u"},
-	{Opt_rootdir,	"rootdir=%u"},
-	{Opt_utf8,	"utf8"},
-	{Opt_iocharset,	"iocharset=%s"},
-	{Opt_fmode,     "mode=%o"},
-	{Opt_dmode,     "dmode=%o"},
-	{Opt_err,	NULL}
+	Opt_rootdir, Opt_utf8, Opt_iocharset, Opt_err, Opt_fmode, Opt_dmode
 };
 
-static int udf_parse_options(char *options, struct udf_options *uopt,
-			     bool remount)
+static const struct fs_parameter_spec udf_param_spec[] = {
+	fsparam_flag	("novrs",		Opt_novrs),
+	fsparam_flag	("nostrict",		Opt_nostrict),
+	fsparam_u32	("bs",			Opt_bs),
+	fsparam_flag	("unhide",		Opt_unhide),
+	fsparam_flag	("undelete",		Opt_undelete),
+	fsparam_flag_no	("adinicb",		Opt_adinicb),
+	fsparam_flag	("shortad",		Opt_shortad),
+	fsparam_flag	("longad",		Opt_longad),
+	fsparam_string	("gid",			Opt_gid),
+	fsparam_string	("uid",			Opt_uid),
+	fsparam_u32	("umask",		Opt_umask),
+	fsparam_u32	("session",		Opt_session),
+	fsparam_u32	("lastblock",		Opt_lastblock),
+	fsparam_u32	("anchor",		Opt_anchor),
+	fsparam_u32	("volume",		Opt_volume),
+	fsparam_u32	("partition",		Opt_partition),
+	fsparam_u32	("fileset",		Opt_fileset),
+	fsparam_u32	("rootdir",		Opt_rootdir),
+	fsparam_flag	("utf8",		Opt_utf8),
+	fsparam_string	("iocharset",		Opt_iocharset),
+	fsparam_u32	("mode",		Opt_fmode),
+	fsparam_u32	("dmode",		Opt_dmode),
+	{}
+ };
+
+static int udf_parse_param(struct fs_context *fc, struct fs_parameter *param)
 {
-	char *p;
-	int option;
 	unsigned int uv;
-
-	uopt->novrs = 0;
-	uopt->session = 0xFFFFFFFF;
-	uopt->lastblock = 0;
-	uopt->anchor = 0;
-
-	if (!options)
-		return 1;
-
-	while ((p = strsep(&options, ",")) != NULL) {
-		substring_t args[MAX_OPT_ARGS];
-		int token;
-		unsigned n;
-		if (!*p)
-			continue;
-
-		token = match_token(p, tokens, args);
-		switch (token) {
-		case Opt_novrs:
-			uopt->novrs = 1;
-			break;
-		case Opt_bs:
-			if (match_int(&args[0], &option))
-				return 0;
-			n = option;
-			if (n != 512 && n != 1024 && n != 2048 && n != 4096)
-				return 0;
-			uopt->blocksize = n;
-			uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET);
-			break;
-		case Opt_unhide:
-			uopt->flags |= (1 << UDF_FLAG_UNHIDE);
-			break;
-		case Opt_undelete:
-			uopt->flags |= (1 << UDF_FLAG_UNDELETE);
-			break;
-		case Opt_noadinicb:
+	unsigned int n;
+	struct udf_options *uopt = fc->fs_private;
+	struct fs_parse_result result;
+	int token;
+	bool remount = (fc->purpose & FS_CONTEXT_FOR_RECONFIGURE);
+
+	token = fs_parse(fc, udf_param_spec, param, &result);
+	if (token < 0)
+		return token;
+
+	switch (token) {
+	case Opt_novrs:
+		uopt->flags |= (1 << UDF_FLAG_NOVRS);
+		break;
+	case Opt_bs:
+		n = result.uint_32;
+		if (n != 512 && n != 1024 && n != 2048 && n != 4096)
+			return -EINVAL;
+		uopt->blocksize = n;
+		uopt->flags |= (1 << UDF_FLAG_BLOCKSIZE_SET);
+		break;
+	case Opt_unhide:
+		uopt->flags |= (1 << UDF_FLAG_UNHIDE);
+		break;
+	case Opt_undelete:
+		uopt->flags |= (1 << UDF_FLAG_UNDELETE);
+		break;
+	case Opt_adinicb:
+		if (result.negated)
 			uopt->flags &= ~(1 << UDF_FLAG_USE_AD_IN_ICB);
-			break;
-		case Opt_adinicb:
+		else
 			uopt->flags |= (1 << UDF_FLAG_USE_AD_IN_ICB);
-			break;
-		case Opt_shortad:
-			uopt->flags |= (1 << UDF_FLAG_USE_SHORT_AD);
-			break;
-		case Opt_longad:
-			uopt->flags &= ~(1 << UDF_FLAG_USE_SHORT_AD);
-			break;
-		case Opt_gid:
-			if (match_uint(args, &uv))
-				return 0;
-			uopt->gid = make_kgid(current_user_ns(), uv);
-			if (!gid_valid(uopt->gid))
-				return 0;
+		break;
+	case Opt_shortad:
+		uopt->flags |= (1 << UDF_FLAG_USE_SHORT_AD);
+		break;
+	case Opt_longad:
+		uopt->flags &= ~(1 << UDF_FLAG_USE_SHORT_AD);
+		break;
+	case Opt_gid:
+		if (kstrtoint(param->string, 10, &uv) == 0) {
+			kgid_t gid = make_kgid(current_user_ns(), uv);
+			if (!gid_valid(gid))
+				return -EINVAL;
+			uopt->gid = gid;
 			uopt->flags |= (1 << UDF_FLAG_GID_SET);
-			break;
-		case Opt_uid:
-			if (match_uint(args, &uv))
-				return 0;
-			uopt->uid = make_kuid(current_user_ns(), uv);
-			if (!uid_valid(uopt->uid))
-				return 0;
+		} else if (!strcmp(param->string, "forget")) {
+			uopt->flags |= (1 << UDF_FLAG_GID_FORGET);
+		} else if (!strcmp(param->string, "ignore")) {
+			/* this option is superseded by gid=<number> */
+			;
+		} else {
+			return -EINVAL;
+		}
+		break;
+	case Opt_uid:
+		if (kstrtoint(param->string, 10, &uv) == 0) {
+			kuid_t uid = make_kuid(current_user_ns(), uv);
+			if (!uid_valid(uid))
+				return -EINVAL;
+			uopt->uid = uid;
 			uopt->flags |= (1 << UDF_FLAG_UID_SET);
-			break;
-		case Opt_umask:
-			if (match_octal(args, &option))
-				return 0;
-			uopt->umask = option;
-			break;
-		case Opt_nostrict:
-			uopt->flags &= ~(1 << UDF_FLAG_STRICT);
-			break;
-		case Opt_session:
-			if (match_int(args, &option))
-				return 0;
-			uopt->session = option;
-			if (!remount)
-				uopt->flags |= (1 << UDF_FLAG_SESSION_SET);
-			break;
-		case Opt_lastblock:
-			if (match_int(args, &option))
-				return 0;
-			uopt->lastblock = option;
-			if (!remount)
-				uopt->flags |= (1 << UDF_FLAG_LASTBLOCK_SET);
-			break;
-		case Opt_anchor:
-			if (match_int(args, &option))
-				return 0;
-			uopt->anchor = option;
-			break;
-		case Opt_volume:
-		case Opt_partition:
-		case Opt_fileset:
-		case Opt_rootdir:
-			/* Ignored (never implemented properly) */
-			break;
-		case Opt_utf8:
-			if (!remount) {
-				unload_nls(uopt->nls_map);
-				uopt->nls_map = NULL;
-			}
-			break;
-		case Opt_iocharset:
-			if (!remount) {
-				unload_nls(uopt->nls_map);
-				uopt->nls_map = NULL;
-			}
-			/* When nls_map is not loaded then UTF-8 is used */
-			if (!remount && strcmp(args[0].from, "utf8") != 0) {
-				uopt->nls_map = load_nls(args[0].from);
-				if (!uopt->nls_map) {
-					pr_err("iocharset %s not found\n",
-						args[0].from);
-					return 0;
-				}
-			}
-			break;
-		case Opt_uforget:
+		} else if (!strcmp(param->string, "forget")) {
 			uopt->flags |= (1 << UDF_FLAG_UID_FORGET);
-			break;
-		case Opt_uignore:
-		case Opt_gignore:
-			/* These options are superseeded by uid=<number> */
-			break;
-		case Opt_gforget:
-			uopt->flags |= (1 << UDF_FLAG_GID_FORGET);
-			break;
-		case Opt_fmode:
-			if (match_octal(args, &option))
-				return 0;
-			uopt->fmode = option & 0777;
-			break;
-		case Opt_dmode:
-			if (match_octal(args, &option))
-				return 0;
-			uopt->dmode = option & 0777;
-			break;
-		default:
-			pr_err("bad mount option \"%s\" or missing value\n", p);
-			return 0;
+		} else if (!strcmp(param->string, "ignore")) {
+			/* this option is superseded by uid=<number> */
+			;
+		} else {
+			return -EINVAL;
+		}
+		break;
+	case Opt_umask:
+		uopt->umask = result.uint_32;
+		break;
+	case Opt_nostrict:
+		uopt->flags &= ~(1 << UDF_FLAG_STRICT);
+		break;
+	case Opt_session:
+		uopt->session = result.uint_32;
+		if (!remount)
+			uopt->flags |= (1 << UDF_FLAG_SESSION_SET);
+		break;
+	case Opt_lastblock:
+		uopt->lastblock = result.uint_32;
+		if (!remount)
+			uopt->flags |= (1 << UDF_FLAG_LASTBLOCK_SET);
+		break;
+	case Opt_anchor:
+		uopt->anchor = result.uint_32;
+		break;
+	case Opt_volume:
+	case Opt_partition:
+	case Opt_fileset:
+	case Opt_rootdir:
+		/* Ignored (never implemented properly) */
+		break;
+	case Opt_utf8:
+		if (!remount) {
+			unload_nls(uopt->nls_map);
+			uopt->nls_map = NULL;
 		}
+		break;
+	case Opt_iocharset:
+		if (!remount) {
+			unload_nls(uopt->nls_map);
+			uopt->nls_map = NULL;
+		}
+		/* When nls_map is not loaded then UTF-8 is used */
+		if (!remount && strcmp(param->string, "utf8") != 0) {
+			uopt->nls_map = load_nls(param->string);
+			if (!uopt->nls_map) {
+				errorf(fc, "iocharset %s not found",
+					param->string);
+				return -EINVAL;;
+			}
+		}
+		break;
+	case Opt_fmode:
+		uopt->fmode = result.uint_32 & 0777;
+		break;
+	case Opt_dmode:
+		uopt->dmode = result.uint_32 & 0777;
+		break;
+	default:
+		return -EINVAL;
 	}
-	return 1;
+	return 0;
 }
 
-static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
+static int udf_reconfigure(struct fs_context *fc)
 {
-	struct udf_options uopt;
+	struct udf_options *uopt = fc->fs_private;
+	struct super_block *sb = fc->root->d_sb;
 	struct udf_sb_info *sbi = UDF_SB(sb);
+	int readonly = fc->sb_flags & SB_RDONLY;
 	int error = 0;
 
-	if (!(*flags & SB_RDONLY) && UDF_QUERY_FLAG(sb, UDF_FLAG_RW_INCOMPAT))
+	if (!readonly && UDF_QUERY_FLAG(sb, UDF_FLAG_RW_INCOMPAT))
 		return -EACCES;
 
 	sync_filesystem(sb);
 
-	uopt.flags = sbi->s_flags;
-	uopt.uid   = sbi->s_uid;
-	uopt.gid   = sbi->s_gid;
-	uopt.umask = sbi->s_umask;
-	uopt.fmode = sbi->s_fmode;
-	uopt.dmode = sbi->s_dmode;
-	uopt.nls_map = NULL;
-
-	if (!udf_parse_options(options, &uopt, true))
-		return -EINVAL;
-
 	write_lock(&sbi->s_cred_lock);
-	sbi->s_flags = uopt.flags;
-	sbi->s_uid   = uopt.uid;
-	sbi->s_gid   = uopt.gid;
-	sbi->s_umask = uopt.umask;
-	sbi->s_fmode = uopt.fmode;
-	sbi->s_dmode = uopt.dmode;
+	sbi->s_flags = uopt->flags;
+	sbi->s_uid   = uopt->uid;
+	sbi->s_gid   = uopt->gid;
+	sbi->s_umask = uopt->umask;
+	sbi->s_fmode = uopt->fmode;
+	sbi->s_dmode = uopt->dmode;
 	write_unlock(&sbi->s_cred_lock);
 
-	if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
+	if (readonly == sb_rdonly(sb))
 		goto out_unlock;
 
-	if (*flags & SB_RDONLY)
+	if (readonly)
 		udf_close_lvid(sb);
 	else
 		udf_open_lvid(sb);
@@ -864,7 +895,7 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
 	int ret;
 	struct timestamp *ts;
 
-	outstr = kmalloc(128, GFP_NOFS);
+	outstr = kmalloc(128, GFP_KERNEL);
 	if (!outstr)
 		return -ENOMEM;
 
@@ -1539,6 +1570,20 @@ out_bh:
 	return ret;
 }
 
+static bool udf_lvid_valid(struct super_block *sb,
+			   struct logicalVolIntegrityDesc *lvid)
+{
+	u32 parts, impuselen;
+
+	parts = le32_to_cpu(lvid->numOfPartitions);
+	impuselen = le32_to_cpu(lvid->lengthOfImpUse);
+	if (parts >= sb->s_blocksize || impuselen >= sb->s_blocksize ||
+	    sizeof(struct logicalVolIntegrityDesc) + impuselen +
+	    2 * parts * sizeof(u32) > sb->s_blocksize)
+		return false;
+	return true;
+}
+
 /*
  * Find the prevailing Logical Volume Integrity Descriptor.
  */
@@ -1549,7 +1594,6 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_
 	struct udf_sb_info *sbi = UDF_SB(sb);
 	struct logicalVolIntegrityDesc *lvid;
 	int indirections = 0;
-	u32 parts, impuselen;
 
 	while (++indirections <= UDF_MAX_LVID_NESTING) {
 		final_bh = NULL;
@@ -1571,32 +1615,27 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_
 		if (!final_bh)
 			return;
 
-		brelse(sbi->s_lvid_bh);
-		sbi->s_lvid_bh = final_bh;
-
 		lvid = (struct logicalVolIntegrityDesc *)final_bh->b_data;
+		if (udf_lvid_valid(sb, lvid)) {
+			brelse(sbi->s_lvid_bh);
+			sbi->s_lvid_bh = final_bh;
+		} else {
+			udf_warn(sb, "Corrupted LVID (parts=%u, impuselen=%u), "
+				 "ignoring.\n",
+				 le32_to_cpu(lvid->numOfPartitions),
+				 le32_to_cpu(lvid->lengthOfImpUse));
+		}
+
 		if (lvid->nextIntegrityExt.extLength == 0)
-			goto check;
+			return;
 
 		loc = leea_to_cpu(lvid->nextIntegrityExt);
 	}
 
 	udf_warn(sb, "Too many LVID indirections (max %u), ignoring.\n",
 		UDF_MAX_LVID_NESTING);
-out_err:
 	brelse(sbi->s_lvid_bh);
 	sbi->s_lvid_bh = NULL;
-	return;
-check:
-	parts = le32_to_cpu(lvid->numOfPartitions);
-	impuselen = le32_to_cpu(lvid->lengthOfImpUse);
-	if (parts >= sb->s_blocksize || impuselen >= sb->s_blocksize ||
-	    sizeof(struct logicalVolIntegrityDesc) + impuselen +
-	    2 * parts * sizeof(u32) > sb->s_blocksize) {
-		udf_warn(sb, "Corrupted LVID (parts=%u, impuselen=%u), "
-			 "ignoring.\n", parts, impuselen);
-		goto out_err;
-	}
 }
 
 /*
@@ -1946,7 +1985,7 @@ static int udf_load_vrs(struct super_block *sb, struct udf_options *uopt,
 		return -EINVAL;
 	}
 	sbi->s_last_block = uopt->lastblock;
-	if (!uopt->novrs) {
+	if (!UDF_QUERY_FLAG(sb, UDF_FLAG_NOVRS)) {
 		/* Check that it is NSR02 compliant */
 		nsr = udf_check_vsd(sb);
 		if (!nsr) {
@@ -2084,23 +2123,15 @@ u64 lvid_get_unique_id(struct super_block *sb)
 	return ret;
 }
 
-static int udf_fill_super(struct super_block *sb, void *options, int silent)
+static int udf_fill_super(struct super_block *sb, struct fs_context *fc)
 {
 	int ret = -EINVAL;
 	struct inode *inode = NULL;
-	struct udf_options uopt;
+	struct udf_options *uopt = fc->fs_private;
 	struct kernel_lb_addr rootdir, fileset;
 	struct udf_sb_info *sbi;
 	bool lvid_open = false;
-
-	uopt.flags = (1 << UDF_FLAG_USE_AD_IN_ICB) | (1 << UDF_FLAG_STRICT);
-	/* By default we'll use overflow[ug]id when UDF inode [ug]id == -1 */
-	uopt.uid = make_kuid(current_user_ns(), overflowuid);
-	uopt.gid = make_kgid(current_user_ns(), overflowgid);
-	uopt.umask = 0;
-	uopt.fmode = UDF_INVALID_MODE;
-	uopt.dmode = UDF_INVALID_MODE;
-	uopt.nls_map = NULL;
+	int silent = fc->sb_flags & SB_SILENT;
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
@@ -2110,25 +2141,23 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 
 	mutex_init(&sbi->s_alloc_mutex);
 
-	if (!udf_parse_options((char *)options, &uopt, false))
-		goto parse_options_failure;
-
 	fileset.logicalBlockNum = 0xFFFFFFFF;
 	fileset.partitionReferenceNum = 0xFFFF;
 
-	sbi->s_flags = uopt.flags;
-	sbi->s_uid = uopt.uid;
-	sbi->s_gid = uopt.gid;
-	sbi->s_umask = uopt.umask;
-	sbi->s_fmode = uopt.fmode;
-	sbi->s_dmode = uopt.dmode;
-	sbi->s_nls_map = uopt.nls_map;
+	sbi->s_flags = uopt->flags;
+	sbi->s_uid = uopt->uid;
+	sbi->s_gid = uopt->gid;
+	sbi->s_umask = uopt->umask;
+	sbi->s_fmode = uopt->fmode;
+	sbi->s_dmode = uopt->dmode;
+	sbi->s_nls_map = uopt->nls_map;
+	uopt->nls_map = NULL;
 	rwlock_init(&sbi->s_cred_lock);
 
-	if (uopt.session == 0xFFFFFFFF)
+	if (uopt->session == 0xFFFFFFFF)
 		sbi->s_session = udf_get_last_session(sb);
 	else
-		sbi->s_session = uopt.session;
+		sbi->s_session = uopt->session;
 
 	udf_debug("Multi-session=%d\n", sbi->s_session);
 
@@ -2139,16 +2168,16 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	sb->s_magic = UDF_SUPER_MAGIC;
 	sb->s_time_gran = 1000;
 
-	if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) {
-		ret = udf_load_vrs(sb, &uopt, silent, &fileset);
+	if (uopt->flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) {
+		ret = udf_load_vrs(sb, uopt, silent, &fileset);
 	} else {
-		uopt.blocksize = bdev_logical_block_size(sb->s_bdev);
-		while (uopt.blocksize <= 4096) {
-			ret = udf_load_vrs(sb, &uopt, silent, &fileset);
+		uopt->blocksize = bdev_logical_block_size(sb->s_bdev);
+		while (uopt->blocksize <= 4096) {
+			ret = udf_load_vrs(sb, uopt, silent, &fileset);
 			if (ret < 0) {
 				if (!silent && ret != -EACCES) {
 					pr_notice("Scanning with blocksize %u failed\n",
-						  uopt.blocksize);
+						  uopt->blocksize);
 				}
 				brelse(sbi->s_lvid_bh);
 				sbi->s_lvid_bh = NULL;
@@ -2161,7 +2190,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 			} else
 				break;
 
-			uopt.blocksize <<= 1;
+			uopt->blocksize <<= 1;
 		}
 	}
 	if (ret < 0) {
@@ -2266,8 +2295,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 
 error_out:
 	iput(sbi->s_vat_inode);
-parse_options_failure:
-	unload_nls(uopt.nls_map);
+	unload_nls(uopt->nls_map);
 	if (lvid_open)
 		udf_close_lvid(sb);
 	brelse(sbi->s_lvid_bh);
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index f9a60bc1abcf..08ec8756b948 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -23,6 +23,7 @@
 #define UDF_FLAG_STRICT			5
 #define UDF_FLAG_UNDELETE		6
 #define UDF_FLAG_UNHIDE			7
+#define UDF_FLAG_NOVRS			8
 #define UDF_FLAG_UID_FORGET     11    /* save -1 for uid to disk */
 #define UDF_FLAG_GID_FORGET     12
 #define UDF_FLAG_UID_SET	13
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index a480810cd4e3..44666afc6209 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1470,8 +1470,7 @@ static int __init init_inodecache(void)
 {
 	ufs_inode_cachep = kmem_cache_create_usercopy("ufs_inode_cache",
 				sizeof(struct ufs_inode_info), 0,
-				(SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
-					SLAB_ACCOUNT),
+				(SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT),
 				offsetof(struct ufs_inode_info, i_u1.i_symlink),
 				sizeof_field(struct ufs_inode_info,
 					i_u1.i_symlink),
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 959551ff9a95..60dcfafdc11a 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -50,45 +50,6 @@ static struct ctl_table vm_userfaultfd_table[] = {
 
 static struct kmem_cache *userfaultfd_ctx_cachep __ro_after_init;
 
-/*
- * Start with fault_pending_wqh and fault_wqh so they're more likely
- * to be in the same cacheline.
- *
- * Locking order:
- *	fd_wqh.lock
- *		fault_pending_wqh.lock
- *			fault_wqh.lock
- *		event_wqh.lock
- *
- * To avoid deadlocks, IRQs must be disabled when taking any of the above locks,
- * since fd_wqh.lock is taken by aio_poll() while it's holding a lock that's
- * also taken in IRQ context.
- */
-struct userfaultfd_ctx {
-	/* waitqueue head for the pending (i.e. not read) userfaults */
-	wait_queue_head_t fault_pending_wqh;
-	/* waitqueue head for the userfaults */
-	wait_queue_head_t fault_wqh;
-	/* waitqueue head for the pseudo fd to wakeup poll/read */
-	wait_queue_head_t fd_wqh;
-	/* waitqueue head for events */
-	wait_queue_head_t event_wqh;
-	/* a refile sequence protected by fault_pending_wqh lock */
-	seqcount_spinlock_t refile_seq;
-	/* pseudo fd refcounting */
-	refcount_t refcount;
-	/* userfaultfd syscall flags */
-	unsigned int flags;
-	/* features requested from the userspace */
-	unsigned int features;
-	/* released */
-	bool released;
-	/* memory mappings are changing because of non-cooperative event */
-	atomic_t mmap_changing;
-	/* mm with one ore more vmas attached to this userfaultfd_ctx */
-	struct mm_struct *mm;
-};
-
 struct userfaultfd_fork_ctx {
 	struct userfaultfd_ctx *orig;
 	struct userfaultfd_ctx *new;
@@ -724,12 +685,15 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
 		ctx->flags = octx->flags;
 		ctx->features = octx->features;
 		ctx->released = false;
+		init_rwsem(&ctx->map_changing_lock);
 		atomic_set(&ctx->mmap_changing, 0);
 		ctx->mm = vma->vm_mm;
 		mmgrab(ctx->mm);
 
 		userfaultfd_ctx_get(octx);
+		down_write(&octx->map_changing_lock);
 		atomic_inc(&octx->mmap_changing);
+		up_write(&octx->map_changing_lock);
 		fctx->orig = octx;
 		fctx->new = ctx;
 		list_add_tail(&fctx->list, fcs);
@@ -776,7 +740,9 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
 	if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
 		vm_ctx->ctx = ctx;
 		userfaultfd_ctx_get(ctx);
+		down_write(&ctx->map_changing_lock);
 		atomic_inc(&ctx->mmap_changing);
+		up_write(&ctx->map_changing_lock);
 	} else {
 		/* Drop uffd context if remap feature not enabled */
 		vma_start_write(vma);
@@ -822,7 +788,9 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
 		return true;
 
 	userfaultfd_ctx_get(ctx);
+	down_write(&ctx->map_changing_lock);
 	atomic_inc(&ctx->mmap_changing);
+	up_write(&ctx->map_changing_lock);
 	mmap_read_unlock(mm);
 
 	msg_init(&ewq.msg);
@@ -864,7 +832,9 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
 		return -ENOMEM;
 
 	userfaultfd_ctx_get(ctx);
+	down_write(&ctx->map_changing_lock);
 	atomic_inc(&ctx->mmap_changing);
+	up_write(&ctx->map_changing_lock);
 	unmap_ctx->ctx = ctx;
 	unmap_ctx->start = start;
 	unmap_ctx->end = end;
@@ -1748,9 +1718,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
 	if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
 		flags |= MFILL_ATOMIC_WP;
 	if (mmget_not_zero(ctx->mm)) {
-		ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
-					uffdio_copy.len, &ctx->mmap_changing,
-					flags);
+		ret = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src,
+					uffdio_copy.len, flags);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
@@ -1800,9 +1769,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
 		goto out;
 
 	if (mmget_not_zero(ctx->mm)) {
-		ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start,
-					   uffdio_zeropage.range.len,
-					   &ctx->mmap_changing);
+		ret = mfill_atomic_zeropage(ctx, uffdio_zeropage.range.start,
+					   uffdio_zeropage.range.len);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
@@ -1857,9 +1825,8 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
 		return -EINVAL;
 
 	if (mmget_not_zero(ctx->mm)) {
-		ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
-					  uffdio_wp.range.len, mode_wp,
-					  &ctx->mmap_changing);
+		ret = mwriteprotect_range(ctx, uffdio_wp.range.start,
+					  uffdio_wp.range.len, mode_wp);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
@@ -1909,9 +1876,8 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
 		flags |= MFILL_ATOMIC_WP;
 
 	if (mmget_not_zero(ctx->mm)) {
-		ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start,
-					    uffdio_continue.range.len,
-					    &ctx->mmap_changing, flags);
+		ret = mfill_atomic_continue(ctx, uffdio_continue.range.start,
+					    uffdio_continue.range.len, flags);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
@@ -1964,9 +1930,8 @@ static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long
 		goto out;
 
 	if (mmget_not_zero(ctx->mm)) {
-		ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start,
-					  uffdio_poison.range.len,
-					  &ctx->mmap_changing, 0);
+		ret = mfill_atomic_poison(ctx, uffdio_poison.range.start,
+					  uffdio_poison.range.len, 0);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
@@ -2040,16 +2005,8 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
 		return -EINVAL;
 
 	if (mmget_not_zero(mm)) {
-		mmap_read_lock(mm);
-
-		/* Re-check after taking mmap_lock */
-		if (likely(!atomic_read(&ctx->mmap_changing)))
-			ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
-					 uffdio_move.len, uffdio_move.mode);
-		else
-			ret = -EINVAL;
-
-		mmap_read_unlock(mm);
+		ret = move_pages(ctx, uffdio_move.dst, uffdio_move.src,
+				 uffdio_move.len, uffdio_move.mode);
 		mmput(mm);
 	} else {
 		return -ESRCH;
@@ -2255,6 +2212,7 @@ static int new_userfaultfd(int flags)
 	ctx->flags = flags;
 	ctx->features = 0;
 	ctx->released = false;
+	init_rwsem(&ctx->map_changing_lock);
 	atomic_set(&ctx->mmap_changing, 0);
 	ctx->mm = current->mm;
 	/* prevent the mm struct to be freed */
diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c
index 2307f8037efc..118dedef8ebe 100644
--- a/fs/vboxsf/file.c
+++ b/fs/vboxsf/file.c
@@ -218,6 +218,7 @@ const struct file_operations vboxsf_reg_fops = {
 	.release = vboxsf_file_release,
 	.fsync = noop_fsync,
 	.splice_read = filemap_splice_read,
+	.setlease = simple_nosetlease,
 };
 
 const struct inode_operations vboxsf_reg_iops = {
diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c
index 1fb8f4df60cb..ffb1d565da39 100644
--- a/fs/vboxsf/super.c
+++ b/fs/vboxsf/super.c
@@ -151,11 +151,11 @@ static int vboxsf_fill_super(struct super_block *sb, struct fs_context *fc)
 		if (!sbi->nls) {
 			vbg_err("vboxsf: Count not load '%s' nls\n", nls_name);
 			err = -EINVAL;
-			goto fail_free;
+			goto fail_destroy_idr;
 		}
 	}
 
-	sbi->bdi_id = ida_simple_get(&vboxsf_bdi_ida, 0, 0, GFP_KERNEL);
+	sbi->bdi_id = ida_alloc(&vboxsf_bdi_ida, GFP_KERNEL);
 	if (sbi->bdi_id < 0) {
 		err = sbi->bdi_id;
 		goto fail_free;
@@ -221,9 +221,10 @@ fail_unmap:
 	vboxsf_unmap_folder(sbi->root);
 fail_free:
 	if (sbi->bdi_id >= 0)
-		ida_simple_remove(&vboxsf_bdi_ida, sbi->bdi_id);
+		ida_free(&vboxsf_bdi_ida, sbi->bdi_id);
 	if (sbi->nls)
 		unload_nls(sbi->nls);
+fail_destroy_idr:
 	idr_destroy(&sbi->ino_idr);
 	kfree(sbi);
 	return err;
@@ -268,7 +269,7 @@ static void vboxsf_put_super(struct super_block *sb)
 
 	vboxsf_unmap_folder(sbi->root);
 	if (sbi->bdi_id >= 0)
-		ida_simple_remove(&vboxsf_bdi_ida, sbi->bdi_id);
+		ida_free(&vboxsf_bdi_ida, sbi->bdi_id);
 	if (sbi->nls)
 		unload_nls(sbi->nls);
 
@@ -339,8 +340,7 @@ static int vboxsf_setup(void)
 	vboxsf_inode_cachep =
 		kmem_cache_create("vboxsf_inode_cache",
 				  sizeof(struct vboxsf_inode), 0,
-				  (SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
-				   SLAB_ACCOUNT),
+				  SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
 				  vboxsf_inode_init_once);
 	if (!vboxsf_inode_cachep) {
 		err = -ENOMEM;
diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c
index 72ac9320e6a3..9515bbf0b54c 100644
--- a/fs/vboxsf/utils.c
+++ b/fs/vboxsf/utils.c
@@ -440,7 +440,6 @@ int vboxsf_nlscpy(struct vboxsf_sbi *sbi, char *name, size_t name_bound_len,
 {
 	const char *in;
 	char *out;
-	size_t out_len;
 	size_t out_bound_len;
 	size_t in_bound_len;
 
@@ -448,7 +447,6 @@ int vboxsf_nlscpy(struct vboxsf_sbi *sbi, char *name, size_t name_bound_len,
 	in_bound_len = utf8_len;
 
 	out = name;
-	out_len = 0;
 	/* Reserve space for terminating 0 */
 	out_bound_len = name_bound_len - 1;
 
@@ -469,7 +467,6 @@ int vboxsf_nlscpy(struct vboxsf_sbi *sbi, char *name, size_t name_bound_len,
 
 		out += nb;
 		out_bound_len -= nb;
-		out_len += nb;
 	}
 
 	*out = 0;
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index a6a6b2749241..b3506f56e180 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -69,7 +69,6 @@ struct fsverity_info {
 	u8 file_digest[FS_VERITY_MAX_DIGEST_SIZE];
 	const struct inode *inode;
 	unsigned long *hash_block_verified;
-	spinlock_t hash_page_init_lock;
 };
 
 #define FS_VERITY_MAX_SIGNATURE_SIZE	(FS_VERITY_MAX_DESCRIPTOR_SIZE - \
diff --git a/fs/verity/measure.c b/fs/verity/measure.c
index bf7a5f4cccaf..3969d54158d1 100644
--- a/fs/verity/measure.c
+++ b/fs/verity/measure.c
@@ -159,9 +159,9 @@ __bpf_kfunc int bpf_get_fsverity_digest(struct file *file, struct bpf_dynptr_ker
 
 __bpf_kfunc_end_defs();
 
-BTF_SET8_START(fsverity_set_ids)
+BTF_KFUNCS_START(fsverity_set_ids)
 BTF_ID_FLAGS(func, bpf_get_fsverity_digest, KF_TRUSTED_ARGS)
-BTF_SET8_END(fsverity_set_ids)
+BTF_KFUNCS_END(fsverity_set_ids)
 
 static int bpf_get_fsverity_digest_filter(const struct bpf_prog *prog, u32 kfunc_id)
 {
diff --git a/fs/verity/open.c b/fs/verity/open.c
index 6c31a871b84b..fdeb95eca3af 100644
--- a/fs/verity/open.c
+++ b/fs/verity/open.c
@@ -239,7 +239,6 @@ struct fsverity_info *fsverity_create_info(const struct inode *inode,
 			err = -ENOMEM;
 			goto fail;
 		}
-		spin_lock_init(&vi->hash_page_init_lock);
 	}
 
 	return vi;
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index 904ccd7e8e16..4fcad0825a12 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -19,7 +19,6 @@ static struct workqueue_struct *fsverity_read_workqueue;
 static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage,
 				   unsigned long hblock_idx)
 {
-	bool verified;
 	unsigned int blocks_per_page;
 	unsigned int i;
 
@@ -43,12 +42,20 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage,
 	 * re-instantiated from the backing storage are re-verified.  To do
 	 * this, we use PG_checked again, but now it doesn't really mean
 	 * "checked".  Instead, now it just serves as an indicator for whether
-	 * the hash page is newly instantiated or not.
+	 * the hash page is newly instantiated or not.  If the page is new, as
+	 * indicated by PG_checked=0, we clear the bitmap bits for the page's
+	 * blocks since they are untrustworthy, then set PG_checked=1.
+	 * Otherwise we return the bitmap bit for the requested block.
 	 *
-	 * The first thread that sees PG_checked=0 must clear the corresponding
-	 * bitmap bits, then set PG_checked=1.  This requires a spinlock.  To
-	 * avoid having to take this spinlock in the common case of
-	 * PG_checked=1, we start with an opportunistic lockless read.
+	 * Multiple threads may execute this code concurrently on the same page.
+	 * This is safe because we use memory barriers to ensure that if a
+	 * thread sees PG_checked=1, then it also sees the associated bitmap
+	 * clearing to have occurred.  Also, all writes and their corresponding
+	 * reads are atomic, and all writes are safe to repeat in the event that
+	 * multiple threads get into the PG_checked=0 section.  (Clearing a
+	 * bitmap bit again at worst causes a hash block to be verified
+	 * redundantly.  That event should be very rare, so it's not worth using
+	 * a lock to avoid.  Setting PG_checked again has no effect.)
 	 */
 	if (PageChecked(hpage)) {
 		/*
@@ -58,24 +65,17 @@ static bool is_hash_block_verified(struct fsverity_info *vi, struct page *hpage,
 		smp_rmb();
 		return test_bit(hblock_idx, vi->hash_block_verified);
 	}
-	spin_lock(&vi->hash_page_init_lock);
-	if (PageChecked(hpage)) {
-		verified = test_bit(hblock_idx, vi->hash_block_verified);
-	} else {
-		blocks_per_page = vi->tree_params.blocks_per_page;
-		hblock_idx = round_down(hblock_idx, blocks_per_page);
-		for (i = 0; i < blocks_per_page; i++)
-			clear_bit(hblock_idx + i, vi->hash_block_verified);
-		/*
-		 * A write memory barrier is needed here to give RELEASE
-		 * semantics to the below SetPageChecked() operation.
-		 */
-		smp_wmb();
-		SetPageChecked(hpage);
-		verified = false;
-	}
-	spin_unlock(&vi->hash_page_init_lock);
-	return verified;
+	blocks_per_page = vi->tree_params.blocks_per_page;
+	hblock_idx = round_down(hblock_idx, blocks_per_page);
+	for (i = 0; i < blocks_per_page; i++)
+		clear_bit(hblock_idx + i, vi->hash_block_verified);
+	/*
+	 * A write memory barrier is needed here to give RELEASE semantics to
+	 * the below SetPageChecked() operation.
+	 */
+	smp_wmb();
+	SetPageChecked(hpage);
+	return false;
 }
 
 /*
diff --git a/fs/xattr.c b/fs/xattr.c
index 09d927603433..f8b643f91a98 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -16,7 +16,6 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/security.h>
-#include <linux/evm.h>
 #include <linux/syscalls.h>
 #include <linux/export.h>
 #include <linux/fsnotify.h>
@@ -552,11 +551,11 @@ __vfs_removexattr_locked(struct mnt_idmap *idmap,
 		goto out;
 
 	error = __vfs_removexattr(idmap, dentry, name);
+	if (error)
+		return error;
 
-	if (!error) {
-		fsnotify_xattr(dentry);
-		evm_inode_post_removexattr(dentry, name);
-	}
+	fsnotify_xattr(dentry);
+	security_inode_post_removexattr(dentry, name);
 
 out:
 	return error;
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 567fb37274d3..d41edd30388b 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -124,12 +124,24 @@ config XFS_DRAIN_INTENTS
 	bool
 	select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL
 
+config XFS_LIVE_HOOKS
+	bool
+	select JUMP_LABEL if HAVE_ARCH_JUMP_LABEL
+
+config XFS_MEMORY_BUFS
+	bool
+
+config XFS_BTREE_IN_MEM
+	bool
+
 config XFS_ONLINE_SCRUB
 	bool "XFS online metadata check support"
 	default n
 	depends on XFS_FS
 	depends on TMPFS && SHMEM
+	select XFS_LIVE_HOOKS
 	select XFS_DRAIN_INTENTS
+	select XFS_MEMORY_BUFS
 	help
 	  If you say Y here you will be able to check metadata on a
 	  mounted XFS filesystem.  This feature is intended to reduce
@@ -164,6 +176,7 @@ config XFS_ONLINE_REPAIR
 	bool "XFS online metadata repair support"
 	default n
 	depends on XFS_FS && XFS_ONLINE_SCRUB
+	select XFS_BTREE_IN_MEM
 	help
 	  If you say Y here you will be able to repair metadata on a
 	  mounted XFS filesystem.  This feature is intended to reduce
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index fbe3cdc79036..76674ad5833e 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -92,8 +92,7 @@ xfs-y				+= xfs_aops.o \
 				   xfs_symlink.o \
 				   xfs_sysfs.o \
 				   xfs_trans.o \
-				   xfs_xattr.o \
-				   kmem.o
+				   xfs_xattr.o
 
 # low-level transaction/log code
 xfs-y				+= xfs_log.o \
@@ -137,6 +136,9 @@ xfs-$(CONFIG_FS_DAX)		+= xfs_notify_failure.o
 endif
 
 xfs-$(CONFIG_XFS_DRAIN_INTENTS)	+= xfs_drain.o
+xfs-$(CONFIG_XFS_LIVE_HOOKS)	+= xfs_hooks.o
+xfs-$(CONFIG_XFS_MEMORY_BUFS)	+= xfs_buf_mem.o
+xfs-$(CONFIG_XFS_BTREE_IN_MEM)	+= libxfs/xfs_btree_mem.o
 
 # online scrub/repair
 ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y)
@@ -159,6 +161,8 @@ xfs-y				+= $(addprefix scrub/, \
 				   health.o \
 				   ialloc.o \
 				   inode.o \
+				   iscan.o \
+				   nlinks.o \
 				   parent.o \
 				   readdir.o \
 				   refcount.o \
@@ -179,6 +183,7 @@ xfs-$(CONFIG_XFS_RT)		+= $(addprefix scrub/, \
 xfs-$(CONFIG_XFS_QUOTA)		+= $(addprefix scrub/, \
 				   dqiterate.o \
 				   quota.o \
+				   quotacheck.o \
 				   )
 
 # online repair
@@ -188,12 +193,17 @@ xfs-y				+= $(addprefix scrub/, \
 				   alloc_repair.o \
 				   bmap_repair.o \
 				   cow_repair.o \
+				   fscounters_repair.o \
 				   ialloc_repair.o \
 				   inode_repair.o \
 				   newbt.o \
+				   nlinks_repair.o \
+				   rcbag_btree.o \
+				   rcbag.o \
 				   reap.o \
 				   refcount_repair.o \
 				   repair.o \
+				   rmap_repair.o \
 				   )
 
 xfs-$(CONFIG_XFS_RT)		+= $(addprefix scrub/, \
@@ -202,6 +212,7 @@ xfs-$(CONFIG_XFS_RT)		+= $(addprefix scrub/, \
 
 xfs-$(CONFIG_XFS_QUOTA)		+= $(addprefix scrub/, \
 				   quota_repair.o \
+				   quotacheck_repair.o \
 				   )
 endif
 endif
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
deleted file mode 100644
index c557a030acfe..000000000000
--- a/fs/xfs/kmem.c
+++ /dev/null
@@ -1,30 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- */
-#include "xfs.h"
-#include "xfs_message.h"
-#include "xfs_trace.h"
-
-void *
-kmem_alloc(size_t size, xfs_km_flags_t flags)
-{
-	int	retries = 0;
-	gfp_t	lflags = kmem_flags_convert(flags);
-	void	*ptr;
-
-	trace_kmem_alloc(size, flags, _RET_IP_);
-
-	do {
-		ptr = kmalloc(size, lflags);
-		if (ptr || (flags & KM_MAYFAIL))
-			return ptr;
-		if (!(++retries % 100))
-			xfs_err(NULL,
-	"%s(%u) possible memory allocation deadlock size %u in %s (mode:0x%x)",
-				current->comm, current->pid,
-				(unsigned int)size, __func__, lflags);
-		memalloc_retry_wait(lflags);
-	} while (1);
-}
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
deleted file mode 100644
index b987dc2c6851..000000000000
--- a/fs/xfs/kmem.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- */
-#ifndef __XFS_SUPPORT_KMEM_H__
-#define __XFS_SUPPORT_KMEM_H__
-
-#include <linux/slab.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/vmalloc.h>
-
-/*
- * General memory allocation interfaces
- */
-
-typedef unsigned __bitwise xfs_km_flags_t;
-#define KM_NOFS		((__force xfs_km_flags_t)0x0004u)
-#define KM_MAYFAIL	((__force xfs_km_flags_t)0x0008u)
-#define KM_ZERO		((__force xfs_km_flags_t)0x0010u)
-#define KM_NOLOCKDEP	((__force xfs_km_flags_t)0x0020u)
-
-/*
- * We use a special process flag to avoid recursive callbacks into
- * the filesystem during transactions.  We will also issue our own
- * warnings, so we explicitly skip any generic ones (silly of us).
- */
-static inline gfp_t
-kmem_flags_convert(xfs_km_flags_t flags)
-{
-	gfp_t	lflags;
-
-	BUG_ON(flags & ~(KM_NOFS | KM_MAYFAIL | KM_ZERO | KM_NOLOCKDEP));
-
-	lflags = GFP_KERNEL | __GFP_NOWARN;
-	if (flags & KM_NOFS)
-		lflags &= ~__GFP_FS;
-
-	/*
-	 * Default page/slab allocator behavior is to retry for ever
-	 * for small allocations. We can override this behavior by using
-	 * __GFP_RETRY_MAYFAIL which will tell the allocator to retry as long
-	 * as it is feasible but rather fail than retry forever for all
-	 * request sizes.
-	 */
-	if (flags & KM_MAYFAIL)
-		lflags |= __GFP_RETRY_MAYFAIL;
-
-	if (flags & KM_ZERO)
-		lflags |= __GFP_ZERO;
-
-	if (flags & KM_NOLOCKDEP)
-		lflags |= __GFP_NOLOCKDEP;
-
-	return lflags;
-}
-
-extern void *kmem_alloc(size_t, xfs_km_flags_t);
-static inline void  kmem_free(const void *ptr)
-{
-	kvfree(ptr);
-}
-
-
-static inline void *
-kmem_zalloc(size_t size, xfs_km_flags_t flags)
-{
-	return kmem_alloc(size, flags | KM_ZERO);
-}
-
-/*
- * Zone interfaces
- */
-static inline struct page *
-kmem_to_page(void *addr)
-{
-	if (is_vmalloc_addr(addr))
-		return vmalloc_to_page(addr);
-	return virt_to_page(addr);
-}
-
-#endif /* __XFS_SUPPORT_KMEM_H__ */
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 39d9525270b7..dc1873f76bff 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -217,6 +217,7 @@ xfs_initialize_perag_data(
 	 */
 	if (fdblocks > sbp->sb_dblocks || ifree > ialloc) {
 		xfs_alert(mp, "AGF corruption. Please run xfs_repair.");
+		xfs_fs_mark_sick(mp, XFS_SICK_FS_COUNTERS);
 		error = -EFSCORRUPTED;
 		goto out;
 	}
@@ -241,7 +242,7 @@ __xfs_free_perag(
 	struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
 
 	ASSERT(!delayed_work_pending(&pag->pag_blockgc_work));
-	kmem_free(pag);
+	kfree(pag);
 }
 
 /*
@@ -263,7 +264,7 @@ xfs_free_perag(
 		xfs_defer_drain_free(&pag->pag_intents_drain);
 
 		cancel_delayed_work_sync(&pag->pag_blockgc_work);
-		xfs_buf_hash_destroy(pag);
+		xfs_buf_cache_destroy(&pag->pag_bcache);
 
 		/* drop the mount's active reference */
 		xfs_perag_rele(pag);
@@ -351,9 +352,9 @@ xfs_free_unused_perag_range(
 		spin_unlock(&mp->m_perag_lock);
 		if (!pag)
 			break;
-		xfs_buf_hash_destroy(pag);
+		xfs_buf_cache_destroy(&pag->pag_bcache);
 		xfs_defer_drain_free(&pag->pag_intents_drain);
-		kmem_free(pag);
+		kfree(pag);
 	}
 }
 
@@ -381,7 +382,7 @@ xfs_initialize_perag(
 			continue;
 		}
 
-		pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
+		pag = kzalloc(sizeof(*pag), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 		if (!pag) {
 			error = -ENOMEM;
 			goto out_unwind_new_pags;
@@ -389,7 +390,7 @@ xfs_initialize_perag(
 		pag->pag_agno = index;
 		pag->pag_mount = mp;
 
-		error = radix_tree_preload(GFP_NOFS);
+		error = radix_tree_preload(GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 		if (error)
 			goto out_free_pag;
 
@@ -416,9 +417,10 @@ xfs_initialize_perag(
 		init_waitqueue_head(&pag->pag_active_wq);
 		pag->pagb_count = 0;
 		pag->pagb_tree = RB_ROOT;
+		xfs_hooks_init(&pag->pag_rmap_update_hooks);
 #endif /* __KERNEL__ */
 
-		error = xfs_buf_hash_init(pag);
+		error = xfs_buf_cache_init(&pag->pag_bcache);
 		if (error)
 			goto out_remove_pag;
 
@@ -453,7 +455,7 @@ out_remove_pag:
 	radix_tree_delete(&mp->m_perag_tree, index);
 	spin_unlock(&mp->m_perag_lock);
 out_free_pag:
-	kmem_free(pag);
+	kfree(pag);
 out_unwind_new_pags:
 	/* unwind any prior newly initialized pags */
 	xfs_free_unused_perag_range(mp, first_initialised, agcount);
@@ -491,7 +493,7 @@ xfs_btroot_init(
 	struct xfs_buf		*bp,
 	struct aghdr_init_data	*id)
 {
-	xfs_btree_init_block(mp, bp, id->type, 0, 0, id->agno);
+	xfs_btree_init_buf(mp, bp, id->bc_ops, 0, 0, id->agno);
 }
 
 /* Finish initializing a free space btree. */
@@ -549,7 +551,7 @@ xfs_freesp_init_recs(
 }
 
 /*
- * Alloc btree root block init functions
+ * bnobt/cntbt btree root block init functions
  */
 static void
 xfs_bnoroot_init(
@@ -557,17 +559,7 @@ xfs_bnoroot_init(
 	struct xfs_buf		*bp,
 	struct aghdr_init_data	*id)
 {
-	xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 0, id->agno);
-	xfs_freesp_init_recs(mp, bp, id);
-}
-
-static void
-xfs_cntroot_init(
-	struct xfs_mount	*mp,
-	struct xfs_buf		*bp,
-	struct aghdr_init_data	*id)
-{
-	xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 0, id->agno);
+	xfs_btree_init_buf(mp, bp, id->bc_ops, 0, 0, id->agno);
 	xfs_freesp_init_recs(mp, bp, id);
 }
 
@@ -583,7 +575,7 @@ xfs_rmaproot_init(
 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
 	struct xfs_rmap_rec	*rrec;
 
-	xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 4, id->agno);
+	xfs_btree_init_buf(mp, bp, id->bc_ops, 0, 4, id->agno);
 
 	/*
 	 * mark the AG header regions as static metadata The BNO
@@ -678,14 +670,13 @@ xfs_agfblock_init(
 	agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
 	agf->agf_seqno = cpu_to_be32(id->agno);
 	agf->agf_length = cpu_to_be32(id->agsize);
-	agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp));
-	agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp));
-	agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1);
-	agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1);
+	agf->agf_bno_root = cpu_to_be32(XFS_BNO_BLOCK(mp));
+	agf->agf_cnt_root = cpu_to_be32(XFS_CNT_BLOCK(mp));
+	agf->agf_bno_level = cpu_to_be32(1);
+	agf->agf_cnt_level = cpu_to_be32(1);
 	if (xfs_has_rmapbt(mp)) {
-		agf->agf_roots[XFS_BTNUM_RMAPi] =
-					cpu_to_be32(XFS_RMAP_BLOCK(mp));
-		agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1);
+		agf->agf_rmap_root = cpu_to_be32(XFS_RMAP_BLOCK(mp));
+		agf->agf_rmap_level = cpu_to_be32(1);
 		agf->agf_rmap_blocks = cpu_to_be32(1);
 	}
 
@@ -796,7 +787,7 @@ struct xfs_aghdr_grow_data {
 	size_t			numblks;
 	const struct xfs_buf_ops *ops;
 	aghdr_init_work_f	work;
-	xfs_btnum_t		type;
+	const struct xfs_btree_ops *bc_ops;
 	bool			need_init;
 };
 
@@ -850,13 +841,15 @@ xfs_ag_init_headers(
 		.numblks = BTOBB(mp->m_sb.sb_blocksize),
 		.ops = &xfs_bnobt_buf_ops,
 		.work = &xfs_bnoroot_init,
+		.bc_ops = &xfs_bnobt_ops,
 		.need_init = true
 	},
 	{ /* CNT root block */
 		.daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)),
 		.numblks = BTOBB(mp->m_sb.sb_blocksize),
 		.ops = &xfs_cntbt_buf_ops,
-		.work = &xfs_cntroot_init,
+		.work = &xfs_bnoroot_init,
+		.bc_ops = &xfs_cntbt_ops,
 		.need_init = true
 	},
 	{ /* INO root block */
@@ -864,7 +857,7 @@ xfs_ag_init_headers(
 		.numblks = BTOBB(mp->m_sb.sb_blocksize),
 		.ops = &xfs_inobt_buf_ops,
 		.work = &xfs_btroot_init,
-		.type = XFS_BTNUM_INO,
+		.bc_ops = &xfs_inobt_ops,
 		.need_init = true
 	},
 	{ /* FINO root block */
@@ -872,7 +865,7 @@ xfs_ag_init_headers(
 		.numblks = BTOBB(mp->m_sb.sb_blocksize),
 		.ops = &xfs_finobt_buf_ops,
 		.work = &xfs_btroot_init,
-		.type = XFS_BTNUM_FINO,
+		.bc_ops = &xfs_finobt_ops,
 		.need_init =  xfs_has_finobt(mp)
 	},
 	{ /* RMAP root block */
@@ -880,6 +873,7 @@ xfs_ag_init_headers(
 		.numblks = BTOBB(mp->m_sb.sb_blocksize),
 		.ops = &xfs_rmapbt_buf_ops,
 		.work = &xfs_rmaproot_init,
+		.bc_ops = &xfs_rmapbt_ops,
 		.need_init = xfs_has_rmapbt(mp)
 	},
 	{ /* REFC root block */
@@ -887,7 +881,7 @@ xfs_ag_init_headers(
 		.numblks = BTOBB(mp->m_sb.sb_blocksize),
 		.ops = &xfs_refcountbt_buf_ops,
 		.work = &xfs_btroot_init,
-		.type = XFS_BTNUM_REFC,
+		.bc_ops = &xfs_refcountbt_ops,
 		.need_init = xfs_has_reflink(mp)
 	},
 	{ /* NULL terminating block */
@@ -905,7 +899,7 @@ xfs_ag_init_headers(
 
 		id->daddr = dp->daddr;
 		id->numblks = dp->numblks;
-		id->type = dp->type;
+		id->bc_ops = dp->bc_ops;
 		error = xfs_ag_init_hdr(mp, id, dp->work, dp->ops);
 		if (error)
 			break;
@@ -950,8 +944,10 @@ xfs_ag_shrink_space(
 	agf = agfbp->b_addr;
 	aglen = be32_to_cpu(agi->agi_length);
 	/* some extra paranoid checks before we shrink the ag */
-	if (XFS_IS_CORRUPT(mp, agf->agf_length != agi->agi_length))
+	if (XFS_IS_CORRUPT(mp, agf->agf_length != agi->agi_length)) {
+		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF);
 		return -EFSCORRUPTED;
+	}
 	if (delta >= aglen)
 		return -EINVAL;
 
@@ -979,14 +975,23 @@ xfs_ag_shrink_space(
 
 	if (error) {
 		/*
-		 * if extent allocation fails, need to roll the transaction to
+		 * If extent allocation fails, need to roll the transaction to
 		 * ensure that the AGFL fixup has been committed anyway.
+		 *
+		 * We need to hold the AGF across the roll to ensure nothing can
+		 * access the AG for allocation until the shrink is fully
+		 * cleaned up. And due to the resetting of the AG block
+		 * reservation space needing to lock the AGI, we also have to
+		 * hold that so we don't get AGI/AGF lock order inversions in
+		 * the error handling path.
 		 */
 		xfs_trans_bhold(*tpp, agfbp);
+		xfs_trans_bhold(*tpp, agibp);
 		err2 = xfs_trans_roll(tpp);
 		if (err2)
 			return err2;
 		xfs_trans_bjoin(*tpp, agfbp);
+		xfs_trans_bjoin(*tpp, agibp);
 		goto resv_init_out;
 	}
 
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 4b343c4fac28..35de09a2516c 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -36,8 +36,9 @@ struct xfs_perag {
 	atomic_t	pag_active_ref;	/* active reference count */
 	wait_queue_head_t pag_active_wq;/* woken active_ref falls to zero */
 	unsigned long	pag_opstate;
-	uint8_t		pagf_levels[XFS_BTNUM_AGF];
-					/* # of levels in bno & cnt btree */
+	uint8_t		pagf_bno_level;	/* # of levels in bno btree */
+	uint8_t		pagf_cnt_level;	/* # of levels in cnt btree */
+	uint8_t		pagf_rmap_level;/* # of levels in rmap btree */
 	uint32_t	pagf_flcount;	/* count of blocks in freelist */
 	xfs_extlen_t	pagf_freeblks;	/* total free blocks */
 	xfs_extlen_t	pagf_longest;	/* longest free space */
@@ -86,8 +87,10 @@ struct xfs_perag {
 	 * Alternate btree heights so that online repair won't trip the write
 	 * verifiers while rebuilding the AG btrees.
 	 */
-	uint8_t		pagf_repair_levels[XFS_BTNUM_AGF];
+	uint8_t		pagf_repair_bno_level;
+	uint8_t		pagf_repair_cnt_level;
 	uint8_t		pagf_repair_refcount_level;
+	uint8_t		pagf_repair_rmap_level;
 #endif
 
 	spinlock_t	pag_state_lock;
@@ -104,9 +107,7 @@ struct xfs_perag {
 	int		pag_ici_reclaimable;	/* reclaimable inodes */
 	unsigned long	pag_ici_reclaim_cursor;	/* reclaim restart point */
 
-	/* buffer cache index */
-	spinlock_t	pag_buf_lock;	/* lock for pag_buf_hash */
-	struct rhashtable pag_buf_hash;
+	struct xfs_buf_cache	pag_bcache;
 
 	/* background prealloc block trimming */
 	struct delayed_work	pag_blockgc_work;
@@ -119,6 +120,9 @@ struct xfs_perag {
 	 * inconsistencies.
 	 */
 	struct xfs_defer_drain	pag_intents_drain;
+
+	/* Hook to feed rmapbt updates to an active online repair. */
+	struct xfs_hooks	pag_rmap_update_hooks;
 #endif /* __KERNEL__ */
 };
 
@@ -331,7 +335,7 @@ struct aghdr_init_data {
 	/* per header data */
 	xfs_daddr_t		daddr;		/* header location */
 	size_t			numblks;	/* size of header */
-	xfs_btnum_t		type;		/* type of btree root block */
+	const struct xfs_btree_ops *bc_ops;	/* btree ops */
 };
 
 int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id);
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 3bd0a33fee0a..9da52e92172a 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -26,6 +26,7 @@
 #include "xfs_ag.h"
 #include "xfs_ag_resv.h"
 #include "xfs_bmap.h"
+#include "xfs_health.h"
 
 struct kmem_cache	*xfs_extfree_item_cache;
 
@@ -150,23 +151,38 @@ xfs_alloc_ag_max_usable(
 	return mp->m_sb.sb_agblocks - blocks;
 }
 
+
+static int
+xfs_alloc_lookup(
+	struct xfs_btree_cur	*cur,
+	xfs_lookup_t		dir,
+	xfs_agblock_t		bno,
+	xfs_extlen_t		len,
+	int			*stat)
+{
+	int			error;
+
+	cur->bc_rec.a.ar_startblock = bno;
+	cur->bc_rec.a.ar_blockcount = len;
+	error = xfs_btree_lookup(cur, dir, stat);
+	if (*stat == 1)
+		cur->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE;
+	else
+		cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
+	return error;
+}
+
 /*
  * Lookup the record equal to [bno, len] in the btree given by cur.
  */
-STATIC int				/* error */
+static inline int				/* error */
 xfs_alloc_lookup_eq(
 	struct xfs_btree_cur	*cur,	/* btree cursor */
 	xfs_agblock_t		bno,	/* starting block of extent */
 	xfs_extlen_t		len,	/* length of extent */
 	int			*stat)	/* success/failure */
 {
-	int			error;
-
-	cur->bc_rec.a.ar_startblock = bno;
-	cur->bc_rec.a.ar_blockcount = len;
-	error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
-	cur->bc_ag.abt.active = (*stat == 1);
-	return error;
+	return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, bno, len, stat);
 }
 
 /*
@@ -180,13 +196,7 @@ xfs_alloc_lookup_ge(
 	xfs_extlen_t		len,	/* length of extent */
 	int			*stat)	/* success/failure */
 {
-	int			error;
-
-	cur->bc_rec.a.ar_startblock = bno;
-	cur->bc_rec.a.ar_blockcount = len;
-	error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
-	cur->bc_ag.abt.active = (*stat == 1);
-	return error;
+	return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, bno, len, stat);
 }
 
 /*
@@ -200,19 +210,14 @@ xfs_alloc_lookup_le(
 	xfs_extlen_t		len,	/* length of extent */
 	int			*stat)	/* success/failure */
 {
-	int			error;
-	cur->bc_rec.a.ar_startblock = bno;
-	cur->bc_rec.a.ar_blockcount = len;
-	error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
-	cur->bc_ag.abt.active = (*stat == 1);
-	return error;
+	return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, bno, len, stat);
 }
 
 static inline bool
 xfs_alloc_cur_active(
 	struct xfs_btree_cur	*cur)
 {
-	return cur && cur->bc_ag.abt.active;
+	return cur && (cur->bc_flags & XFS_BTREE_ALLOCBT_ACTIVE);
 }
 
 /*
@@ -268,12 +273,12 @@ xfs_alloc_complain_bad_rec(
 	struct xfs_mount		*mp = cur->bc_mp;
 
 	xfs_warn(mp,
-		"%s Freespace BTree record corruption in AG %d detected at %pS!",
-		cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size",
-		cur->bc_ag.pag->pag_agno, fa);
+		"%sbt record corruption in AG %d detected at %pS!",
+		cur->bc_ops->name, cur->bc_ag.pag->pag_agno, fa);
 	xfs_warn(mp,
 		"start block 0x%x block count 0x%x", irec->ar_startblock,
 		irec->ar_blockcount);
+	xfs_btree_mark_sick(cur);
 	return -EFSCORRUPTED;
 }
 
@@ -497,14 +502,18 @@ xfs_alloc_fixup_trees(
 		if (XFS_IS_CORRUPT(mp,
 				   i != 1 ||
 				   nfbno1 != fbno ||
-				   nflen1 != flen))
+				   nflen1 != flen)) {
+			xfs_btree_mark_sick(cnt_cur);
 			return -EFSCORRUPTED;
+		}
 #endif
 	} else {
 		if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
 			return error;
-		if (XFS_IS_CORRUPT(mp, i != 1))
+		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cnt_cur);
 			return -EFSCORRUPTED;
+		}
 	}
 	/*
 	 * Look up the record in the by-block tree if necessary.
@@ -516,14 +525,18 @@ xfs_alloc_fixup_trees(
 		if (XFS_IS_CORRUPT(mp,
 				   i != 1 ||
 				   nfbno1 != fbno ||
-				   nflen1 != flen))
+				   nflen1 != flen)) {
+			xfs_btree_mark_sick(bno_cur);
 			return -EFSCORRUPTED;
+		}
 #endif
 	} else {
 		if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
 			return error;
-		if (XFS_IS_CORRUPT(mp, i != 1))
+		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(bno_cur);
 			return -EFSCORRUPTED;
+		}
 	}
 
 #ifdef DEBUG
@@ -536,8 +549,10 @@ xfs_alloc_fixup_trees(
 
 		if (XFS_IS_CORRUPT(mp,
 				   bnoblock->bb_numrecs !=
-				   cntblock->bb_numrecs))
+				   cntblock->bb_numrecs)) {
+			xfs_btree_mark_sick(bno_cur);
 			return -EFSCORRUPTED;
+		}
 	}
 #endif
 
@@ -567,30 +582,40 @@ xfs_alloc_fixup_trees(
 	 */
 	if ((error = xfs_btree_delete(cnt_cur, &i)))
 		return error;
-	if (XFS_IS_CORRUPT(mp, i != 1))
+	if (XFS_IS_CORRUPT(mp, i != 1)) {
+		xfs_btree_mark_sick(cnt_cur);
 		return -EFSCORRUPTED;
+	}
 	/*
 	 * Add new by-size btree entry(s).
 	 */
 	if (nfbno1 != NULLAGBLOCK) {
 		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
 			return error;
-		if (XFS_IS_CORRUPT(mp, i != 0))
+		if (XFS_IS_CORRUPT(mp, i != 0)) {
+			xfs_btree_mark_sick(cnt_cur);
 			return -EFSCORRUPTED;
+		}
 		if ((error = xfs_btree_insert(cnt_cur, &i)))
 			return error;
-		if (XFS_IS_CORRUPT(mp, i != 1))
+		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cnt_cur);
 			return -EFSCORRUPTED;
+		}
 	}
 	if (nfbno2 != NULLAGBLOCK) {
 		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
 			return error;
-		if (XFS_IS_CORRUPT(mp, i != 0))
+		if (XFS_IS_CORRUPT(mp, i != 0)) {
+			xfs_btree_mark_sick(cnt_cur);
 			return -EFSCORRUPTED;
+		}
 		if ((error = xfs_btree_insert(cnt_cur, &i)))
 			return error;
-		if (XFS_IS_CORRUPT(mp, i != 1))
+		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cnt_cur);
 			return -EFSCORRUPTED;
+		}
 	}
 	/*
 	 * Fix up the by-block btree entry(s).
@@ -601,8 +626,10 @@ xfs_alloc_fixup_trees(
 		 */
 		if ((error = xfs_btree_delete(bno_cur, &i)))
 			return error;
-		if (XFS_IS_CORRUPT(mp, i != 1))
+		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(bno_cur);
 			return -EFSCORRUPTED;
+		}
 	} else {
 		/*
 		 * Update the by-block entry to start later|be shorter.
@@ -616,12 +643,16 @@ xfs_alloc_fixup_trees(
 		 */
 		if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
 			return error;
-		if (XFS_IS_CORRUPT(mp, i != 0))
+		if (XFS_IS_CORRUPT(mp, i != 0)) {
+			xfs_btree_mark_sick(bno_cur);
 			return -EFSCORRUPTED;
+		}
 		if ((error = xfs_btree_insert(bno_cur, &i)))
 			return error;
-		if (XFS_IS_CORRUPT(mp, i != 1))
+		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(bno_cur);
 			return -EFSCORRUPTED;
+		}
 	}
 	return 0;
 }
@@ -755,6 +786,8 @@ xfs_alloc_read_agfl(
 			mp, tp, mp->m_ddev_targp,
 			XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)),
 			XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
+	if (xfs_metadata_is_sick(error))
+		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL);
 	if (error)
 		return error;
 	xfs_buf_set_ref(bp, XFS_AGFL_REF);
@@ -776,6 +809,7 @@ xfs_alloc_update_counters(
 	if (unlikely(be32_to_cpu(agf->agf_freeblks) >
 		     be32_to_cpu(agf->agf_length))) {
 		xfs_buf_mark_corrupt(agbp);
+		xfs_ag_mark_sick(agbp->b_pag, XFS_SICK_AG_AGF);
 		return -EFSCORRUPTED;
 	}
 
@@ -828,8 +862,8 @@ xfs_alloc_cur_setup(
 	 * attempt a small allocation.
 	 */
 	if (!acur->cnt)
-		acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
-					args->agbp, args->pag, XFS_BTNUM_CNT);
+		acur->cnt = xfs_cntbt_init_cursor(args->mp, args->tp,
+					args->agbp, args->pag);
 	error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
 	if (error)
 		return error;
@@ -838,11 +872,11 @@ xfs_alloc_cur_setup(
 	 * Allocate the bnobt left and right search cursors.
 	 */
 	if (!acur->bnolt)
-		acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
-					args->agbp, args->pag, XFS_BTNUM_BNO);
+		acur->bnolt = xfs_bnobt_init_cursor(args->mp, args->tp,
+					args->agbp, args->pag);
 	if (!acur->bnogt)
-		acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
-					args->agbp, args->pag, XFS_BTNUM_BNO);
+		acur->bnogt = xfs_bnobt_init_cursor(args->mp, args->tp,
+					args->agbp, args->pag);
 	return i == 1 ? 0 : -ENOSPC;
 }
 
@@ -884,15 +918,17 @@ xfs_alloc_cur_check(
 	bool			busy;
 	unsigned		busy_gen = 0;
 	bool			deactivate = false;
-	bool			isbnobt = cur->bc_btnum == XFS_BTNUM_BNO;
+	bool			isbnobt = xfs_btree_is_bno(cur->bc_ops);
 
 	*new = 0;
 
 	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
 	if (error)
 		return error;
-	if (XFS_IS_CORRUPT(args->mp, i != 1))
+	if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+		xfs_btree_mark_sick(cur);
 		return -EFSCORRUPTED;
+	}
 
 	/*
 	 * Check minlen and deactivate a cntbt cursor if out of acceptable size
@@ -958,9 +994,8 @@ xfs_alloc_cur_check(
 		deactivate = true;
 out:
 	if (deactivate)
-		cur->bc_ag.abt.active = false;
-	trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff,
-				  *new);
+		cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
+	trace_xfs_alloc_cur_check(cur, bno, len, diff, *new);
 	return 0;
 }
 
@@ -1098,6 +1133,7 @@ xfs_alloc_ag_vextent_small(
 		if (error)
 			goto error;
 		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+			xfs_btree_mark_sick(ccur);
 			error = -EFSCORRUPTED;
 			goto error;
 		}
@@ -1132,6 +1168,7 @@ xfs_alloc_ag_vextent_small(
 	*fbnop = args->agbno = fbno;
 	*flenp = args->len = 1;
 	if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
+		xfs_btree_mark_sick(ccur);
 		error = -EFSCORRUPTED;
 		goto error;
 	}
@@ -1197,8 +1234,8 @@ xfs_alloc_ag_vextent_exact(
 	/*
 	 * Allocate/initialize a cursor for the by-number freespace btree.
 	 */
-	bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
-					  args->pag, XFS_BTNUM_BNO);
+	bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp,
+					  args->pag);
 
 	/*
 	 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
@@ -1218,6 +1255,7 @@ xfs_alloc_ag_vextent_exact(
 	if (error)
 		goto error0;
 	if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+		xfs_btree_mark_sick(bno_cur);
 		error = -EFSCORRUPTED;
 		goto error0;
 	}
@@ -1257,8 +1295,8 @@ xfs_alloc_ag_vextent_exact(
 	 * We are allocating agbno for args->len
 	 * Allocate/initialize a cursor for the by-size btree.
 	 */
-	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
-					args->pag, XFS_BTNUM_CNT);
+	cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp,
+					args->pag);
 	ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
 	error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
 				      args->len, XFSA_FIXUP_BNO_OK);
@@ -1330,7 +1368,7 @@ xfs_alloc_walk_iter(
 		if (error)
 			return error;
 		if (i == 0)
-			cur->bc_ag.abt.active = false;
+			cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
 
 		if (count > 0)
 			count--;
@@ -1444,7 +1482,7 @@ xfs_alloc_ag_vextent_locality(
 		if (error)
 			return error;
 		if (i) {
-			acur->cnt->bc_ag.abt.active = true;
+			acur->cnt->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE;
 			fbcur = acur->cnt;
 			fbinc = false;
 		}
@@ -1497,8 +1535,10 @@ xfs_alloc_ag_vextent_lastblock(
 			error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
 			if (error)
 				return error;
-			if (XFS_IS_CORRUPT(args->mp, i != 1))
+			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+				xfs_btree_mark_sick(acur->cnt);
 				return -EFSCORRUPTED;
+			}
 			if (*len >= args->minlen)
 				break;
 			error = xfs_btree_increment(acur->cnt, 0, &i);
@@ -1670,8 +1710,8 @@ restart:
 	/*
 	 * Allocate and initialize a cursor for the by-size btree.
 	 */
-	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
-					args->pag, XFS_BTNUM_CNT);
+	cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp,
+					args->pag);
 	bno_cur = NULL;
 
 	/*
@@ -1710,6 +1750,7 @@ restart:
 			if (error)
 				goto error0;
 			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+				xfs_btree_mark_sick(cnt_cur);
 				error = -EFSCORRUPTED;
 				goto error0;
 			}
@@ -1756,6 +1797,7 @@ restart:
 			   rlen != 0 &&
 			   (rlen > flen ||
 			    rbno + rlen > fbno + flen))) {
+		xfs_btree_mark_sick(cnt_cur);
 		error = -EFSCORRUPTED;
 		goto error0;
 	}
@@ -1778,6 +1820,7 @@ restart:
 					&i)))
 				goto error0;
 			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+				xfs_btree_mark_sick(cnt_cur);
 				error = -EFSCORRUPTED;
 				goto error0;
 			}
@@ -1790,6 +1833,7 @@ restart:
 					   rlen != 0 &&
 					   (rlen > flen ||
 					    rbno + rlen > fbno + flen))) {
+				xfs_btree_mark_sick(cnt_cur);
 				error = -EFSCORRUPTED;
 				goto error0;
 			}
@@ -1806,6 +1850,7 @@ restart:
 				&i)))
 			goto error0;
 		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
+			xfs_btree_mark_sick(cnt_cur);
 			error = -EFSCORRUPTED;
 			goto error0;
 		}
@@ -1844,14 +1889,15 @@ restart:
 
 	rlen = args->len;
 	if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
+		xfs_btree_mark_sick(cnt_cur);
 		error = -EFSCORRUPTED;
 		goto error0;
 	}
 	/*
 	 * Allocate and initialize a cursor for the by-block tree.
 	 */
-	bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
-					args->pag, XFS_BTNUM_BNO);
+	bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp,
+					args->pag);
 	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
 			rbno, rlen, XFSA_FIXUP_CNT_OK)))
 		goto error0;
@@ -1863,6 +1909,7 @@ restart:
 	if (XFS_IS_CORRUPT(args->mp,
 			   args->agbno + args->len >
 			   be32_to_cpu(agf->agf_length))) {
+		xfs_ag_mark_sick(args->pag, XFS_SICK_AG_BNOBT);
 		error = -EFSCORRUPTED;
 		goto error0;
 	}
@@ -1924,7 +1971,7 @@ xfs_free_ag_extent(
 	/*
 	 * Allocate and initialize a cursor for the by-block btree.
 	 */
-	bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_BNO);
+	bno_cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
 	/*
 	 * Look for a neighboring block on the left (lower block numbers)
 	 * that is contiguous with this space.
@@ -1938,6 +1985,7 @@ xfs_free_ag_extent(
 		if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
 			goto error0;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(bno_cur);
 			error = -EFSCORRUPTED;
 			goto error0;
 		}
@@ -1953,6 +2001,7 @@ xfs_free_ag_extent(
 			 * Very bad.
 			 */
 			if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
+				xfs_btree_mark_sick(bno_cur);
 				error = -EFSCORRUPTED;
 				goto error0;
 			}
@@ -1971,6 +2020,7 @@ xfs_free_ag_extent(
 		if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
 			goto error0;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(bno_cur);
 			error = -EFSCORRUPTED;
 			goto error0;
 		}
@@ -1986,6 +2036,7 @@ xfs_free_ag_extent(
 			 * Very bad.
 			 */
 			if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
+				xfs_btree_mark_sick(bno_cur);
 				error = -EFSCORRUPTED;
 				goto error0;
 			}
@@ -1994,7 +2045,7 @@ xfs_free_ag_extent(
 	/*
 	 * Now allocate and initialize a cursor for the by-size tree.
 	 */
-	cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT);
+	cnt_cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
 	/*
 	 * Have both left and right contiguous neighbors.
 	 * Merge all three into a single free block.
@@ -2006,12 +2057,14 @@ xfs_free_ag_extent(
 		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
 			goto error0;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cnt_cur);
 			error = -EFSCORRUPTED;
 			goto error0;
 		}
 		if ((error = xfs_btree_delete(cnt_cur, &i)))
 			goto error0;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cnt_cur);
 			error = -EFSCORRUPTED;
 			goto error0;
 		}
@@ -2021,12 +2074,14 @@ xfs_free_ag_extent(
 		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
 			goto error0;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cnt_cur);
 			error = -EFSCORRUPTED;
 			goto error0;
 		}
 		if ((error = xfs_btree_delete(cnt_cur, &i)))
 			goto error0;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cnt_cur);
 			error = -EFSCORRUPTED;
 			goto error0;
 		}
@@ -2036,6 +2091,7 @@ xfs_free_ag_extent(
 		if ((error = xfs_btree_delete(bno_cur, &i)))
 			goto error0;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(bno_cur);
 			error = -EFSCORRUPTED;
 			goto error0;
 		}
@@ -2045,6 +2101,7 @@ xfs_free_ag_extent(
 		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
 			goto error0;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(bno_cur);
 			error = -EFSCORRUPTED;
 			goto error0;
 		}
@@ -2064,6 +2121,7 @@ xfs_free_ag_extent(
 					   i != 1 ||
 					   xxbno != ltbno ||
 					   xxlen != ltlen)) {
+				xfs_btree_mark_sick(bno_cur);
 				error = -EFSCORRUPTED;
 				goto error0;
 			}
@@ -2088,12 +2146,14 @@ xfs_free_ag_extent(
 		if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
 			goto error0;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cnt_cur);
 			error = -EFSCORRUPTED;
 			goto error0;
 		}
 		if ((error = xfs_btree_delete(cnt_cur, &i)))
 			goto error0;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cnt_cur);
 			error = -EFSCORRUPTED;
 			goto error0;
 		}
@@ -2104,6 +2164,7 @@ xfs_free_ag_extent(
 		if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
 			goto error0;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(bno_cur);
 			error = -EFSCORRUPTED;
 			goto error0;
 		}
@@ -2123,12 +2184,14 @@ xfs_free_ag_extent(
 		if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
 			goto error0;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cnt_cur);
 			error = -EFSCORRUPTED;
 			goto error0;
 		}
 		if ((error = xfs_btree_delete(cnt_cur, &i)))
 			goto error0;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cnt_cur);
 			error = -EFSCORRUPTED;
 			goto error0;
 		}
@@ -2151,6 +2214,7 @@ xfs_free_ag_extent(
 		if ((error = xfs_btree_insert(bno_cur, &i)))
 			goto error0;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(bno_cur);
 			error = -EFSCORRUPTED;
 			goto error0;
 		}
@@ -2163,12 +2227,14 @@ xfs_free_ag_extent(
 	if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
 		goto error0;
 	if (XFS_IS_CORRUPT(mp, i != 0)) {
+		xfs_btree_mark_sick(cnt_cur);
 		error = -EFSCORRUPTED;
 		goto error0;
 	}
 	if ((error = xfs_btree_insert(cnt_cur, &i)))
 		goto error0;
 	if (XFS_IS_CORRUPT(mp, i != 1)) {
+		xfs_btree_mark_sick(cnt_cur);
 		error = -EFSCORRUPTED;
 		goto error0;
 	}
@@ -2267,8 +2333,9 @@ xfs_alloc_min_freelist(
 	struct xfs_perag	*pag)
 {
 	/* AG btrees have at least 1 level. */
-	static const uint8_t	fake_levels[XFS_BTNUM_AGF] = {1, 1, 1};
-	const uint8_t		*levels = pag ? pag->pagf_levels : fake_levels;
+	const unsigned int	bno_level = pag ? pag->pagf_bno_level : 1;
+	const unsigned int	cnt_level = pag ? pag->pagf_cnt_level : 1;
+	const unsigned int	rmap_level = pag ? pag->pagf_rmap_level : 1;
 	unsigned int		min_free;
 
 	ASSERT(mp->m_alloc_maxlevels > 0);
@@ -2295,16 +2362,12 @@ xfs_alloc_min_freelist(
 	 */
 
 	/* space needed by-bno freespace btree */
-	min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
-				       mp->m_alloc_maxlevels) * 2 - 2;
+	min_free = min(bno_level + 1, mp->m_alloc_maxlevels) * 2 - 2;
 	/* space needed by-size freespace btree */
-	min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
-				       mp->m_alloc_maxlevels) * 2 - 2;
+	min_free += min(cnt_level + 1, mp->m_alloc_maxlevels) * 2 - 2;
 	/* space needed reverse mapping used space btree */
 	if (xfs_has_rmapbt(mp))
-		min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
-						mp->m_rmap_maxlevels) * 2 - 2;
-
+		min_free += min(rmap_level + 1, mp->m_rmap_maxlevels) * 2 - 2;
 	return min_free;
 }
 
@@ -2691,13 +2754,14 @@ xfs_exact_minlen_extent_available(
 	xfs_extlen_t		flen;
 	int			error = 0;
 
-	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp,
-					args->pag, XFS_BTNUM_CNT);
+	cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, agbp,
+					args->pag);
 	error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat);
 	if (error)
 		goto out;
 
 	if (*stat == 0) {
+		xfs_btree_mark_sick(cnt_cur);
 		error = -EFSCORRUPTED;
 		goto out;
 	}
@@ -2987,8 +3051,8 @@ xfs_alloc_log_agf(
 		offsetof(xfs_agf_t, agf_versionnum),
 		offsetof(xfs_agf_t, agf_seqno),
 		offsetof(xfs_agf_t, agf_length),
-		offsetof(xfs_agf_t, agf_roots[0]),
-		offsetof(xfs_agf_t, agf_levels[0]),
+		offsetof(xfs_agf_t, agf_bno_root),   /* also cnt/rmap root */
+		offsetof(xfs_agf_t, agf_bno_level),  /* also cnt/rmap levels */
 		offsetof(xfs_agf_t, agf_flfirst),
 		offsetof(xfs_agf_t, agf_fllast),
 		offsetof(xfs_agf_t, agf_flcount),
@@ -3167,12 +3231,10 @@ xfs_agf_verify(
 	    be32_to_cpu(agf->agf_freeblks) > agf_length)
 		return __this_address;
 
-	if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
-	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
-	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) >
-						mp->m_alloc_maxlevels ||
-	    be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) >
-						mp->m_alloc_maxlevels)
+	if (be32_to_cpu(agf->agf_bno_level) < 1 ||
+	    be32_to_cpu(agf->agf_cnt_level) < 1 ||
+	    be32_to_cpu(agf->agf_bno_level) > mp->m_alloc_maxlevels ||
+	    be32_to_cpu(agf->agf_cnt_level) > mp->m_alloc_maxlevels)
 		return __this_address;
 
 	if (xfs_has_lazysbcount(mp) &&
@@ -3183,9 +3245,8 @@ xfs_agf_verify(
 		if (be32_to_cpu(agf->agf_rmap_blocks) > agf_length)
 			return __this_address;
 
-		if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
-		    be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) >
-							mp->m_rmap_maxlevels)
+		if (be32_to_cpu(agf->agf_rmap_level) < 1 ||
+		    be32_to_cpu(agf->agf_rmap_level) > mp->m_rmap_maxlevels)
 			return __this_address;
 	}
 
@@ -3268,6 +3329,8 @@ xfs_read_agf(
 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 			XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)),
 			XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops);
+	if (xfs_metadata_is_sick(error))
+		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF);
 	if (error)
 		return error;
 
@@ -3309,12 +3372,9 @@ xfs_alloc_read_agf(
 		pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
 		pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
 		pag->pagf_longest = be32_to_cpu(agf->agf_longest);
-		pag->pagf_levels[XFS_BTNUM_BNOi] =
-			be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
-		pag->pagf_levels[XFS_BTNUM_CNTi] =
-			be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
-		pag->pagf_levels[XFS_BTNUM_RMAPi] =
-			be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
+		pag->pagf_bno_level = be32_to_cpu(agf->agf_bno_level);
+		pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level);
+		pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level);
 		pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
 		if (xfs_agfl_needs_reset(pag->pag_mount, agf))
 			set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate);
@@ -3343,10 +3403,8 @@ xfs_alloc_read_agf(
 		ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
 		ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
 		ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
-		ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
-		       be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
-		ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
-		       be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
+		ASSERT(pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level));
+		ASSERT(pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level));
 	}
 #endif
 	if (agfbpp)
@@ -3895,17 +3953,23 @@ __xfs_free_extent(
 		return -EIO;
 
 	error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
-	if (error)
+	if (error) {
+		if (xfs_metadata_is_sick(error))
+			xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
 		return error;
+	}
+
 	agf = agbp->b_addr;
 
 	if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
+		xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
 		error = -EFSCORRUPTED;
 		goto err_release;
 	}
 
 	/* validate the extent size is legal now we have the agf locked */
 	if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
+		xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
 		error = -EFSCORRUPTED;
 		goto err_release;
 	}
@@ -3962,7 +4026,7 @@ xfs_alloc_query_range(
 	union xfs_btree_irec			high_brec = { .a = *high_rec };
 	struct xfs_alloc_query_range_info	query = { .priv = priv, .fn = fn };
 
-	ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
+	ASSERT(xfs_btree_is_bno(cur->bc_ops));
 	return xfs_btree_query_range(cur, &low_brec, &high_brec,
 			xfs_alloc_query_range_helper, &query);
 }
@@ -3976,7 +4040,7 @@ xfs_alloc_query_all(
 {
 	struct xfs_alloc_query_range_info	query;
 
-	ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
+	ASSERT(xfs_btree_is_bno(cur->bc_ops));
 	query.priv = priv;
 	query.fn = fn;
 	return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index a7032bf0cd37..6ef5ddd89600 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -16,6 +16,7 @@
 #include "xfs_alloc.h"
 #include "xfs_extent_busy.h"
 #include "xfs_error.h"
+#include "xfs_health.h"
 #include "xfs_trace.h"
 #include "xfs_trans.h"
 #include "xfs_ag.h"
@@ -23,13 +24,22 @@
 static struct kmem_cache	*xfs_allocbt_cur_cache;
 
 STATIC struct xfs_btree_cur *
-xfs_allocbt_dup_cursor(
+xfs_bnobt_dup_cursor(
 	struct xfs_btree_cur	*cur)
 {
-	return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp,
-			cur->bc_ag.agbp, cur->bc_ag.pag, cur->bc_btnum);
+	return xfs_bnobt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ag.agbp,
+			cur->bc_ag.pag);
 }
 
+STATIC struct xfs_btree_cur *
+xfs_cntbt_dup_cursor(
+	struct xfs_btree_cur	*cur)
+{
+	return xfs_cntbt_init_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ag.agbp,
+			cur->bc_ag.pag);
+}
+
+
 STATIC void
 xfs_allocbt_set_root(
 	struct xfs_btree_cur		*cur,
@@ -38,13 +48,18 @@ xfs_allocbt_set_root(
 {
 	struct xfs_buf		*agbp = cur->bc_ag.agbp;
 	struct xfs_agf		*agf = agbp->b_addr;
-	int			btnum = cur->bc_btnum;
 
 	ASSERT(ptr->s != 0);
 
-	agf->agf_roots[btnum] = ptr->s;
-	be32_add_cpu(&agf->agf_levels[btnum], inc);
-	cur->bc_ag.pag->pagf_levels[btnum] += inc;
+	if (xfs_btree_is_bno(cur->bc_ops)) {
+		agf->agf_bno_root = ptr->s;
+		be32_add_cpu(&agf->agf_bno_level, inc);
+		cur->bc_ag.pag->pagf_bno_level += inc;
+	} else {
+		agf->agf_cnt_root = ptr->s;
+		be32_add_cpu(&agf->agf_cnt_level, inc);
+		cur->bc_ag.pag->pagf_cnt_level += inc;
+	}
 
 	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
 }
@@ -116,7 +131,7 @@ xfs_allocbt_update_lastrec(
 	__be32			len;
 	int			numrecs;
 
-	ASSERT(cur->bc_btnum == XFS_BTNUM_CNT);
+	ASSERT(!xfs_btree_is_bno(cur->bc_ops));
 
 	switch (reason) {
 	case LASTREC_UPDATE:
@@ -226,7 +241,10 @@ xfs_allocbt_init_ptr_from_cur(
 
 	ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));
 
-	ptr->s = agf->agf_roots[cur->bc_btnum];
+	if (xfs_btree_is_bno(cur->bc_ops))
+		ptr->s = agf->agf_bno_root;
+	else
+		ptr->s = agf->agf_cnt_root;
 }
 
 STATIC int64_t
@@ -299,13 +317,12 @@ xfs_allocbt_verify(
 	struct xfs_perag	*pag = bp->b_pag;
 	xfs_failaddr_t		fa;
 	unsigned int		level;
-	xfs_btnum_t		btnum = XFS_BTNUM_BNOi;
 
 	if (!xfs_verify_magic(bp, block->bb_magic))
 		return __this_address;
 
 	if (xfs_has_crc(mp)) {
-		fa = xfs_btree_sblock_v5hdr_verify(bp);
+		fa = xfs_btree_agblock_v5hdr_verify(bp);
 		if (fa)
 			return fa;
 	}
@@ -320,26 +337,32 @@ xfs_allocbt_verify(
 	 * against.
 	 */
 	level = be16_to_cpu(block->bb_level);
-	if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC))
-		btnum = XFS_BTNUM_CNTi;
 	if (pag && xfs_perag_initialised_agf(pag)) {
-		unsigned int	maxlevel = pag->pagf_levels[btnum];
+		unsigned int	maxlevel, repair_maxlevel = 0;
 
-#ifdef CONFIG_XFS_ONLINE_REPAIR
 		/*
 		 * Online repair could be rewriting the free space btrees, so
 		 * we'll validate against the larger of either tree while this
 		 * is going on.
 		 */
-		maxlevel = max_t(unsigned int, maxlevel,
-				 pag->pagf_repair_levels[btnum]);
+		if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC)) {
+			maxlevel = pag->pagf_cnt_level;
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+			repair_maxlevel = pag->pagf_repair_cnt_level;
+#endif
+		} else {
+			maxlevel = pag->pagf_bno_level;
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+			repair_maxlevel = pag->pagf_repair_bno_level;
 #endif
-		if (level >= maxlevel)
+		}
+
+		if (level >= max(maxlevel, repair_maxlevel))
 			return __this_address;
 	} else if (level >= mp->m_alloc_maxlevels)
 		return __this_address;
 
-	return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]);
+	return xfs_btree_agblock_verify(bp, mp->m_alloc_mxr[level != 0]);
 }
 
 static void
@@ -348,7 +371,7 @@ xfs_allocbt_read_verify(
 {
 	xfs_failaddr_t	fa;
 
-	if (!xfs_btree_sblock_verify_crc(bp))
+	if (!xfs_btree_agblock_verify_crc(bp))
 		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
 	else {
 		fa = xfs_allocbt_verify(bp);
@@ -372,7 +395,7 @@ xfs_allocbt_write_verify(
 		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
 		return;
 	}
-	xfs_btree_sblock_calc_crc(bp);
+	xfs_btree_agblock_calc_crc(bp);
 
 }
 
@@ -454,11 +477,19 @@ xfs_allocbt_keys_contiguous(
 				 be32_to_cpu(key2->alloc.ar_startblock));
 }
 
-static const struct xfs_btree_ops xfs_bnobt_ops = {
+const struct xfs_btree_ops xfs_bnobt_ops = {
+	.name			= "bno",
+	.type			= XFS_BTREE_TYPE_AG,
+
 	.rec_len		= sizeof(xfs_alloc_rec_t),
 	.key_len		= sizeof(xfs_alloc_key_t),
+	.ptr_len		= XFS_BTREE_SHORT_PTR_LEN,
 
-	.dup_cursor		= xfs_allocbt_dup_cursor,
+	.lru_refs		= XFS_ALLOC_BTREE_REF,
+	.statoff		= XFS_STATS_CALC_INDEX(xs_abtb_2),
+	.sick_mask		= XFS_SICK_AG_BNOBT,
+
+	.dup_cursor		= xfs_bnobt_dup_cursor,
 	.set_root		= xfs_allocbt_set_root,
 	.alloc_block		= xfs_allocbt_alloc_block,
 	.free_block		= xfs_allocbt_free_block,
@@ -477,11 +508,20 @@ static const struct xfs_btree_ops xfs_bnobt_ops = {
 	.keys_contiguous	= xfs_allocbt_keys_contiguous,
 };
 
-static const struct xfs_btree_ops xfs_cntbt_ops = {
+const struct xfs_btree_ops xfs_cntbt_ops = {
+	.name			= "cnt",
+	.type			= XFS_BTREE_TYPE_AG,
+	.geom_flags		= XFS_BTGEO_LASTREC_UPDATE,
+
 	.rec_len		= sizeof(xfs_alloc_rec_t),
 	.key_len		= sizeof(xfs_alloc_key_t),
+	.ptr_len		= XFS_BTREE_SHORT_PTR_LEN,
+
+	.lru_refs		= XFS_ALLOC_BTREE_REF,
+	.statoff		= XFS_STATS_CALC_INDEX(xs_abtc_2),
+	.sick_mask		= XFS_SICK_AG_CNTBT,
 
-	.dup_cursor		= xfs_allocbt_dup_cursor,
+	.dup_cursor		= xfs_cntbt_dup_cursor,
 	.set_root		= xfs_allocbt_set_root,
 	.alloc_block		= xfs_allocbt_alloc_block,
 	.free_block		= xfs_allocbt_free_block,
@@ -500,76 +540,55 @@ static const struct xfs_btree_ops xfs_cntbt_ops = {
 	.keys_contiguous	= NULL, /* not needed right now */
 };
 
-/* Allocate most of a new allocation btree cursor. */
-STATIC struct xfs_btree_cur *
-xfs_allocbt_init_common(
+/*
+ * Allocate a new bnobt cursor.
+ *
+ * For staging cursors tp and agbp are NULL.
+ */
+struct xfs_btree_cur *
+xfs_bnobt_init_cursor(
 	struct xfs_mount	*mp,
 	struct xfs_trans	*tp,
-	struct xfs_perag	*pag,
-	xfs_btnum_t		btnum)
+	struct xfs_buf		*agbp,
+	struct xfs_perag	*pag)
 {
 	struct xfs_btree_cur	*cur;
 
-	ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
-
-	cur = xfs_btree_alloc_cursor(mp, tp, btnum, mp->m_alloc_maxlevels,
-			xfs_allocbt_cur_cache);
-	cur->bc_ag.abt.active = false;
-
-	if (btnum == XFS_BTNUM_CNT) {
-		cur->bc_ops = &xfs_cntbt_ops;
-		cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2);
-		cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
-	} else {
-		cur->bc_ops = &xfs_bnobt_ops;
-		cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
-	}
-
+	cur = xfs_btree_alloc_cursor(mp, tp, &xfs_bnobt_ops,
+			mp->m_alloc_maxlevels, xfs_allocbt_cur_cache);
 	cur->bc_ag.pag = xfs_perag_hold(pag);
+	cur->bc_ag.agbp = agbp;
+	if (agbp) {
+		struct xfs_agf		*agf = agbp->b_addr;
 
-	if (xfs_has_crc(mp))
-		cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
-
+		cur->bc_nlevels = be32_to_cpu(agf->agf_bno_level);
+	}
 	return cur;
 }
 
 /*
- * Allocate a new allocation btree cursor.
+ * Allocate a new cntbt cursor.
+ *
+ * For staging cursors tp and agbp are NULL.
  */
-struct xfs_btree_cur *			/* new alloc btree cursor */
-xfs_allocbt_init_cursor(
-	struct xfs_mount	*mp,		/* file system mount point */
-	struct xfs_trans	*tp,		/* transaction pointer */
-	struct xfs_buf		*agbp,		/* buffer for agf structure */
-	struct xfs_perag	*pag,
-	xfs_btnum_t		btnum)		/* btree identifier */
-{
-	struct xfs_agf		*agf = agbp->b_addr;
-	struct xfs_btree_cur	*cur;
-
-	cur = xfs_allocbt_init_common(mp, tp, pag, btnum);
-	if (btnum == XFS_BTNUM_CNT)
-		cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
-	else
-		cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
-
-	cur->bc_ag.agbp = agbp;
-
-	return cur;
-}
-
-/* Create a free space btree cursor with a fake root for staging. */
 struct xfs_btree_cur *
-xfs_allocbt_stage_cursor(
+xfs_cntbt_init_cursor(
 	struct xfs_mount	*mp,
-	struct xbtree_afakeroot	*afake,
-	struct xfs_perag	*pag,
-	xfs_btnum_t		btnum)
+	struct xfs_trans	*tp,
+	struct xfs_buf		*agbp,
+	struct xfs_perag	*pag)
 {
 	struct xfs_btree_cur	*cur;
 
-	cur = xfs_allocbt_init_common(mp, NULL, pag, btnum);
-	xfs_btree_stage_afakeroot(cur, afake);
+	cur = xfs_btree_alloc_cursor(mp, tp, &xfs_cntbt_ops,
+			mp->m_alloc_maxlevels, xfs_allocbt_cur_cache);
+	cur->bc_ag.pag = xfs_perag_hold(pag);
+	cur->bc_ag.agbp = agbp;
+	if (agbp) {
+		struct xfs_agf		*agf = agbp->b_addr;
+
+		cur->bc_nlevels = be32_to_cpu(agf->agf_cnt_level);
+	}
 	return cur;
 }
 
@@ -588,16 +607,16 @@ xfs_allocbt_commit_staged_btree(
 
 	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
 
-	agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root);
-	agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels);
-	xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
-
-	if (cur->bc_btnum == XFS_BTNUM_BNO) {
-		xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_bnobt_ops);
+	if (xfs_btree_is_bno(cur->bc_ops)) {
+		agf->agf_bno_root = cpu_to_be32(afake->af_root);
+		agf->agf_bno_level = cpu_to_be32(afake->af_levels);
 	} else {
-		cur->bc_flags |= XFS_BTREE_LASTREC_UPDATE;
-		xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_cntbt_ops);
+		agf->agf_cnt_root = cpu_to_be32(afake->af_root);
+		agf->agf_cnt_level = cpu_to_be32(afake->af_levels);
 	}
+	xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+
+	xfs_btree_commit_afakeroot(cur, tp, agbp);
 }
 
 /* Calculate number of records in an alloc btree block. */
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h
index 45df893ef6bb..155b47f231ab 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.h
+++ b/fs/xfs/libxfs/xfs_alloc_btree.h
@@ -47,12 +47,12 @@ struct xbtree_afakeroot;
 		 (maxrecs) * sizeof(xfs_alloc_key_t) + \
 		 ((index) - 1) * sizeof(xfs_alloc_ptr_t)))
 
-extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *mp,
+struct xfs_btree_cur *xfs_bnobt_init_cursor(struct xfs_mount *mp,
 		struct xfs_trans *tp, struct xfs_buf *bp,
-		struct xfs_perag *pag, xfs_btnum_t btnum);
-struct xfs_btree_cur *xfs_allocbt_stage_cursor(struct xfs_mount *mp,
-		struct xbtree_afakeroot *afake, struct xfs_perag *pag,
-		xfs_btnum_t btnum);
+		struct xfs_perag *pag);
+struct xfs_btree_cur *xfs_cntbt_init_cursor(struct xfs_mount *mp,
+		struct xfs_trans *tp, struct xfs_buf *bp,
+		struct xfs_perag *pag);
 extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
 extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp,
 		unsigned long long len);
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index e965a48e7db9..673a4b6d2e8d 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -224,7 +224,7 @@ int
 xfs_attr_get_ilocked(
 	struct xfs_da_args	*args)
 {
-	ASSERT(xfs_isilocked(args->dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(args->dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
 
 	if (!xfs_inode_hasattr(args->dp))
 		return -ENOATTR;
@@ -891,7 +891,8 @@ xfs_attr_defer_add(
 
 	struct xfs_attr_intent	*new;
 
-	new = kmem_cache_zalloc(xfs_attr_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+	new = kmem_cache_zalloc(xfs_attr_intent_cache,
+			GFP_KERNEL | __GFP_NOFAIL);
 	new->xattri_op_flags = op_flags;
 	new->xattri_da_args = args;
 
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 6374bf107242..ac904cc1a97b 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -29,6 +29,7 @@
 #include "xfs_log.h"
 #include "xfs_ag.h"
 #include "xfs_errortag.h"
+#include "xfs_health.h"
 
 
 /*
@@ -879,8 +880,7 @@ xfs_attr_shortform_to_leaf(
 
 	trace_xfs_attr_sf_to_leaf(args);
 
-	tmpbuffer = kmem_alloc(size, 0);
-	ASSERT(tmpbuffer != NULL);
+	tmpbuffer = kmalloc(size, GFP_KERNEL | __GFP_NOFAIL);
 	memcpy(tmpbuffer, ifp->if_data, size);
 	sf = (struct xfs_attr_sf_hdr *)tmpbuffer;
 
@@ -924,7 +924,7 @@ xfs_attr_shortform_to_leaf(
 	}
 	error = 0;
 out:
-	kmem_free(tmpbuffer);
+	kfree(tmpbuffer);
 	return error;
 }
 
@@ -1059,7 +1059,7 @@ xfs_attr3_leaf_to_shortform(
 
 	trace_xfs_attr_leaf_to_sf(args);
 
-	tmpbuffer = kmem_alloc(args->geo->blksize, 0);
+	tmpbuffer = kmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL);
 	if (!tmpbuffer)
 		return -ENOMEM;
 
@@ -1125,7 +1125,7 @@ xfs_attr3_leaf_to_shortform(
 	error = 0;
 
 out:
-	kmem_free(tmpbuffer);
+	kfree(tmpbuffer);
 	return error;
 }
 
@@ -1533,7 +1533,7 @@ xfs_attr3_leaf_compact(
 
 	trace_xfs_attr_leaf_compact(args);
 
-	tmpbuffer = kmem_alloc(args->geo->blksize, 0);
+	tmpbuffer = kmalloc(args->geo->blksize, GFP_KERNEL | __GFP_NOFAIL);
 	memcpy(tmpbuffer, bp->b_addr, args->geo->blksize);
 	memset(bp->b_addr, 0, args->geo->blksize);
 	leaf_src = (xfs_attr_leafblock_t *)tmpbuffer;
@@ -1571,7 +1571,7 @@ xfs_attr3_leaf_compact(
 	 */
 	xfs_trans_log_buf(trans, bp, 0, args->geo->blksize - 1);
 
-	kmem_free(tmpbuffer);
+	kfree(tmpbuffer);
 }
 
 /*
@@ -2250,7 +2250,8 @@ xfs_attr3_leaf_unbalance(
 		struct xfs_attr_leafblock *tmp_leaf;
 		struct xfs_attr3_icleaf_hdr tmphdr;
 
-		tmp_leaf = kmem_zalloc(state->args->geo->blksize, 0);
+		tmp_leaf = kzalloc(state->args->geo->blksize,
+				GFP_KERNEL | __GFP_NOFAIL);
 
 		/*
 		 * Copy the header into the temp leaf so that all the stuff
@@ -2290,7 +2291,7 @@ xfs_attr3_leaf_unbalance(
 		}
 		memcpy(save_leaf, tmp_leaf, state->args->geo->blksize);
 		savehdr = tmphdr; /* struct copy */
-		kmem_free(tmp_leaf);
+		kfree(tmp_leaf);
 	}
 
 	xfs_attr3_leaf_hdr_to_disk(state->args->geo, save_leaf, &savehdr);
@@ -2343,6 +2344,7 @@ xfs_attr3_leaf_lookup_int(
 	entries = xfs_attr3_leaf_entryp(leaf);
 	if (ichdr.count >= args->geo->blksize / 8) {
 		xfs_buf_mark_corrupt(bp);
+		xfs_da_mark_sick(args);
 		return -EFSCORRUPTED;
 	}
 
@@ -2362,10 +2364,12 @@ xfs_attr3_leaf_lookup_int(
 	}
 	if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count))) {
 		xfs_buf_mark_corrupt(bp);
+		xfs_da_mark_sick(args);
 		return -EFSCORRUPTED;
 	}
 	if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval)) {
 		xfs_buf_mark_corrupt(bp);
+		xfs_da_mark_sick(args);
 		return -EFSCORRUPTED;
 	}
 
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index d440393b40eb..ff0412828772 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -22,6 +22,7 @@
 #include "xfs_attr_remote.h"
 #include "xfs_trace.h"
 #include "xfs_error.h"
+#include "xfs_health.h"
 
 #define ATTR_RMTVALUE_MAPSIZE	1	/* # of map entries at once */
 
@@ -276,17 +277,18 @@ xfs_attr3_rmt_hdr_set(
  */
 STATIC int
 xfs_attr_rmtval_copyout(
-	struct xfs_mount *mp,
-	struct xfs_buf	*bp,
-	xfs_ino_t	ino,
-	int		*offset,
-	int		*valuelen,
-	uint8_t		**dst)
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp,
+	struct xfs_inode	*dp,
+	int			*offset,
+	int			*valuelen,
+	uint8_t			**dst)
 {
-	char		*src = bp->b_addr;
-	xfs_daddr_t	bno = xfs_buf_daddr(bp);
-	int		len = BBTOB(bp->b_length);
-	int		blksize = mp->m_attr_geo->blksize;
+	char			*src = bp->b_addr;
+	xfs_ino_t		ino = dp->i_ino;
+	xfs_daddr_t		bno = xfs_buf_daddr(bp);
+	int			len = BBTOB(bp->b_length);
+	int			blksize = mp->m_attr_geo->blksize;
 
 	ASSERT(len >= blksize);
 
@@ -302,6 +304,7 @@ xfs_attr_rmtval_copyout(
 				xfs_alert(mp,
 "remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)",
 					bno, *offset, byte_cnt, ino);
+				xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
 				return -EFSCORRUPTED;
 			}
 			hdr_size = sizeof(struct xfs_attr3_rmt_hdr);
@@ -418,10 +421,12 @@ xfs_attr_rmtval_get(
 			dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
 			error = xfs_buf_read(mp->m_ddev_targp, dblkno, dblkcnt,
 					0, &bp, &xfs_attr3_rmt_buf_ops);
+			if (xfs_metadata_is_sick(error))
+				xfs_dirattr_mark_sick(args->dp, XFS_ATTR_FORK);
 			if (error)
 				return error;
 
-			error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
+			error = xfs_attr_rmtval_copyout(mp, bp, args->dp,
 							&offset, &valuelen,
 							&dst);
 			xfs_buf_relse(bp);
@@ -545,11 +550,13 @@ xfs_attr_rmtval_stale(
 	struct xfs_buf		*bp;
 	int			error;
 
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
 
 	if (XFS_IS_CORRUPT(mp, map->br_startblock == DELAYSTARTBLOCK) ||
-	    XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK))
+	    XFS_IS_CORRUPT(mp, map->br_startblock == HOLESTARTBLOCK)) {
+		xfs_bmap_mark_sick(ip, XFS_ATTR_FORK);
 		return -EFSCORRUPTED;
+	}
 
 	error = xfs_buf_incore(mp->m_ddev_targp,
 			XFS_FSB_TO_DADDR(mp, map->br_startblock),
@@ -659,8 +666,10 @@ xfs_attr_rmtval_invalidate(
 				       blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK);
 		if (error)
 			return error;
-		if (XFS_IS_CORRUPT(args->dp->i_mount, nmap != 1))
+		if (XFS_IS_CORRUPT(args->dp->i_mount, nmap != 1)) {
+			xfs_bmap_mark_sick(args->dp, XFS_ATTR_FORK);
 			return -EFSCORRUPTED;
+		}
 		error = xfs_attr_rmtval_stale(args->dp, &map, XBF_TRYLOCK);
 		if (error)
 			return error;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index f362345467fa..656c95a22f2e 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -36,6 +36,9 @@
 #include "xfs_refcount.h"
 #include "xfs_icache.h"
 #include "xfs_iomap.h"
+#include "xfs_health.h"
+#include "xfs_bmap_item.h"
+#include "xfs_symlink_remote.h"
 
 struct kmem_cache		*xfs_bmap_intent_cache;
 
@@ -225,6 +228,28 @@ xfs_bmap_forkoff_reset(
 	}
 }
 
+static int
+xfs_bmap_read_buf(
+	struct xfs_mount	*mp,		/* file system mount point */
+	struct xfs_trans	*tp,		/* transaction pointer */
+	xfs_fsblock_t		fsbno,		/* file system block number */
+	struct xfs_buf		**bpp)		/* buffer for fsbno */
+{
+	struct xfs_buf		*bp;		/* return value */
+	int			error;
+
+	if (!xfs_verify_fsbno(mp, fsbno))
+		return -EFSCORRUPTED;
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+			XFS_FSB_TO_DADDR(mp, fsbno), mp->m_bsize, 0, &bp,
+			&xfs_bmbt_buf_ops);
+	if (!error) {
+		xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
+		*bpp = bp;
+	}
+	return error;
+}
+
 #ifdef DEBUG
 STATIC struct xfs_buf *
 xfs_bmap_get_bp(
@@ -364,9 +389,9 @@ xfs_bmap_check_leaf_extents(
 		bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
 		if (!bp) {
 			bp_release = 1;
-			error = xfs_btree_read_bufl(mp, NULL, bno, &bp,
-						XFS_BMAP_BTREE_REF,
-						&xfs_bmbt_buf_ops);
+			error = xfs_bmap_read_buf(mp, NULL, bno, &bp);
+			if (xfs_metadata_is_sick(error))
+				xfs_btree_mark_sick(cur);
 			if (error)
 				goto error_norelse;
 		}
@@ -383,6 +408,7 @@ xfs_bmap_check_leaf_extents(
 		pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
 		bno = be64_to_cpu(*pp);
 		if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, bno))) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto error0;
 		}
@@ -450,9 +476,9 @@ xfs_bmap_check_leaf_extents(
 		bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
 		if (!bp) {
 			bp_release = 1;
-			error = xfs_btree_read_bufl(mp, NULL, bno, &bp,
-						XFS_BMAP_BTREE_REF,
-						&xfs_bmbt_buf_ops);
+			error = xfs_bmap_read_buf(mp, NULL, bno, &bp);
+			if (xfs_metadata_is_sick(error))
+				xfs_btree_mark_sick(cur);
 			if (error)
 				goto error_norelse;
 		}
@@ -562,11 +588,14 @@ xfs_bmap_btree_to_extents(
 	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
 	cbno = be64_to_cpu(*pp);
 #ifdef DEBUG
-	if (XFS_IS_CORRUPT(cur->bc_mp, !xfs_btree_check_lptr(cur, cbno, 1)))
+	if (XFS_IS_CORRUPT(cur->bc_mp, !xfs_verify_fsbno(mp, cbno))) {
+		xfs_btree_mark_sick(cur);
 		return -EFSCORRUPTED;
+	}
 #endif
-	error = xfs_btree_read_bufl(mp, tp, cbno, &cbp, XFS_BMAP_BTREE_REF,
-				&xfs_bmbt_buf_ops);
+	error = xfs_bmap_read_buf(mp, tp, cbno, &cbp);
+	if (xfs_metadata_is_sick(error))
+		xfs_btree_mark_sick(cur);
 	if (error)
 		return error;
 	cblock = XFS_BUF_TO_BLOCK(cbp);
@@ -634,14 +663,13 @@ xfs_bmap_extents_to_btree(
 	 * Fill in the root.
 	 */
 	block = ifp->if_broot;
-	xfs_btree_init_block_int(mp, block, XFS_BUF_DADDR_NULL,
-				 XFS_BTNUM_BMAP, 1, 1, ip->i_ino,
-				 XFS_BTREE_LONG_PTRS);
+	xfs_bmbt_init_block(ip, block, NULL, 1, 1);
 	/*
 	 * Need a cursor.  Can't allocate until bb_level is filled in.
 	 */
 	cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
-	cur->bc_ino.flags = wasdel ? XFS_BTCUR_BMBT_WASDEL : 0;
+	if (wasdel)
+		cur->bc_flags |= XFS_BTREE_BMBT_WASDEL;
 	/*
 	 * Convert to a btree with two levels, one record in root.
 	 */
@@ -667,7 +695,7 @@ xfs_bmap_extents_to_btree(
 		goto out_root_realloc;
 	}
 
-	cur->bc_ino.allocated++;
+	cur->bc_bmap.allocated++;
 	ip->i_nblocks++;
 	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
 	error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
@@ -679,11 +707,8 @@ xfs_bmap_extents_to_btree(
 	/*
 	 * Fill in the child block.
 	 */
-	abp->b_ops = &xfs_bmbt_buf_ops;
 	ablock = XFS_BUF_TO_BLOCK(abp);
-	xfs_btree_init_block_int(mp, ablock, xfs_buf_daddr(abp),
-				XFS_BTNUM_BMAP, 0, 0, ip->i_ino,
-				XFS_BTREE_LONG_PTRS);
+	xfs_bmbt_init_block(ip, ablock, abp, 0, 0);
 
 	for_each_xfs_iext(ifp, &icur, &rec) {
 		if (isnullstartblock(rec.br_startblock))
@@ -878,6 +903,7 @@ xfs_bmap_add_attrfork_btree(
 			goto error0;
 		/* must be at least one entry */
 		if (XFS_IS_CORRUPT(mp, stat != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto error0;
 		}
@@ -887,7 +913,7 @@ xfs_bmap_add_attrfork_btree(
 			xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
 			return -ENOSPC;
 		}
-		cur->bc_ino.allocated = 0;
+		cur->bc_bmap.allocated = 0;
 		xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
 	}
 	return 0;
@@ -915,7 +941,7 @@ xfs_bmap_add_attrfork_extents(
 	error = xfs_bmap_extents_to_btree(tp, ip, &cur, 0, flags,
 					  XFS_DATA_FORK);
 	if (cur) {
-		cur->bc_ino.allocated = 0;
+		cur->bc_bmap.allocated = 0;
 		xfs_btree_del_cursor(cur, error);
 	}
 	return error;
@@ -960,6 +986,7 @@ xfs_bmap_add_attrfork_local(
 
 	/* should only be called for types that support local format data */
 	ASSERT(0);
+	xfs_bmap_mark_sick(ip, XFS_ATTR_FORK);
 	return -EFSCORRUPTED;
 }
 
@@ -1143,6 +1170,7 @@ xfs_iread_bmbt_block(
 				(unsigned long long)ip->i_ino);
 		xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, block,
 				sizeof(*block), __this_address);
+		xfs_bmap_mark_sick(ip, whichfork);
 		return -EFSCORRUPTED;
 	}
 
@@ -1158,6 +1186,7 @@ xfs_iread_bmbt_block(
 			xfs_inode_verifier_error(ip, -EFSCORRUPTED,
 					"xfs_iread_extents(2)", frp,
 					sizeof(*frp), fa);
+			xfs_bmap_mark_sick(ip, whichfork);
 			return xfs_bmap_complain_bad_rec(ip, whichfork, fa,
 					&new);
 		}
@@ -1189,7 +1218,7 @@ xfs_iread_extents(
 	if (!xfs_need_iread_extents(ifp))
 		return 0;
 
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
 
 	ir.loaded = 0;
 	xfs_iext_first(ifp, &ir.icur);
@@ -1201,6 +1230,7 @@ xfs_iread_extents(
 		goto out;
 
 	if (XFS_IS_CORRUPT(mp, ir.loaded != ifp->if_nextents)) {
+		xfs_bmap_mark_sick(ip, whichfork);
 		error = -EFSCORRUPTED;
 		goto out;
 	}
@@ -1213,6 +1243,8 @@ xfs_iread_extents(
 	smp_store_release(&ifp->if_needextents, 0);
 	return 0;
 out:
+	if (xfs_metadata_is_sick(error))
+		xfs_bmap_mark_sick(ip, whichfork);
 	xfs_iext_destroy(ifp);
 	return error;
 }
@@ -1292,6 +1324,7 @@ xfs_bmap_last_before(
 		break;
 	default:
 		ASSERT(0);
+		xfs_bmap_mark_sick(ip, whichfork);
 		return -EFSCORRUPTED;
 	}
 
@@ -1388,8 +1421,10 @@ xfs_bmap_last_offset(
 	if (ifp->if_format == XFS_DINODE_FMT_LOCAL)
 		return 0;
 
-	if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ifp)))
+	if (XFS_IS_CORRUPT(ip->i_mount, !xfs_ifork_has_extents(ifp))) {
+		xfs_bmap_mark_sick(ip, whichfork);
 		return -EFSCORRUPTED;
+	}
 
 	error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
 	if (error || is_empty)
@@ -1429,8 +1464,7 @@ xfs_bmap_add_extent_delay_real(
 
 	ASSERT(whichfork != XFS_ATTR_FORK);
 	ASSERT(!isnullstartblock(new->br_startblock));
-	ASSERT(!bma->cur ||
-	       (bma->cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL));
+	ASSERT(!bma->cur || (bma->cur->bc_flags & XFS_BTREE_BMBT_WASDEL));
 
 	XFS_STATS_INC(mp, xs_add_exlist);
 
@@ -1528,6 +1562,7 @@ xfs_bmap_add_extent_delay_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(bma->cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -1535,6 +1570,7 @@ xfs_bmap_add_extent_delay_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(bma->cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -1542,6 +1578,7 @@ xfs_bmap_add_extent_delay_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(bma->cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -1571,6 +1608,7 @@ xfs_bmap_add_extent_delay_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(bma->cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -1604,6 +1642,7 @@ xfs_bmap_add_extent_delay_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(bma->cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -1632,6 +1671,7 @@ xfs_bmap_add_extent_delay_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 0)) {
+				xfs_btree_mark_sick(bma->cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -1639,6 +1679,7 @@ xfs_bmap_add_extent_delay_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(bma->cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -1673,6 +1714,7 @@ xfs_bmap_add_extent_delay_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(bma->cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -1698,6 +1740,7 @@ xfs_bmap_add_extent_delay_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 0)) {
+				xfs_btree_mark_sick(bma->cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -1705,6 +1748,7 @@ xfs_bmap_add_extent_delay_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(bma->cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -1721,7 +1765,7 @@ xfs_bmap_add_extent_delay_real(
 		temp = PREV.br_blockcount - new->br_blockcount;
 		da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
 			startblockval(PREV.br_startblock) -
-			(bma->cur ? bma->cur->bc_ino.allocated : 0));
+			(bma->cur ? bma->cur->bc_bmap.allocated : 0));
 
 		PREV.br_startoff = new_endoff;
 		PREV.br_blockcount = temp;
@@ -1749,6 +1793,7 @@ xfs_bmap_add_extent_delay_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(bma->cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -1785,6 +1830,7 @@ xfs_bmap_add_extent_delay_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 0)) {
+				xfs_btree_mark_sick(bma->cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -1792,6 +1838,7 @@ xfs_bmap_add_extent_delay_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(bma->cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -1808,7 +1855,7 @@ xfs_bmap_add_extent_delay_real(
 		temp = PREV.br_blockcount - new->br_blockcount;
 		da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(bma->ip, temp),
 			startblockval(PREV.br_startblock) -
-			(bma->cur ? bma->cur->bc_ino.allocated : 0));
+			(bma->cur ? bma->cur->bc_bmap.allocated : 0));
 
 		PREV.br_startblock = nullstartblock(da_new);
 		PREV.br_blockcount = temp;
@@ -1871,6 +1918,7 @@ xfs_bmap_add_extent_delay_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 0)) {
+				xfs_btree_mark_sick(bma->cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -1878,6 +1926,7 @@ xfs_bmap_add_extent_delay_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(bma->cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -1929,8 +1978,8 @@ xfs_bmap_add_extent_delay_real(
 		xfs_mod_delalloc(mp, (int64_t)da_new - da_old);
 
 	if (bma->cur) {
-		da_new += bma->cur->bc_ino.allocated;
-		bma->cur->bc_ino.allocated = 0;
+		da_new += bma->cur->bc_bmap.allocated;
+		bma->cur->bc_bmap.allocated = 0;
 	}
 
 	/* adjust for changes in reserved delayed indirect blocks */
@@ -2074,30 +2123,35 @@ xfs_bmap_add_extent_unwritten_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
 			if ((error = xfs_btree_delete(cur, &i)))
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
 			if ((error = xfs_btree_decrement(cur, 0, &i)))
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
 			if ((error = xfs_btree_delete(cur, &i)))
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
 			if ((error = xfs_btree_decrement(cur, 0, &i)))
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -2126,18 +2180,21 @@ xfs_bmap_add_extent_unwritten_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
 			if ((error = xfs_btree_delete(cur, &i)))
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
 			if ((error = xfs_btree_decrement(cur, 0, &i)))
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -2169,18 +2226,21 @@ xfs_bmap_add_extent_unwritten_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
 			if ((error = xfs_btree_delete(cur, &i)))
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
 			if ((error = xfs_btree_decrement(cur, 0, &i)))
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -2207,6 +2267,7 @@ xfs_bmap_add_extent_unwritten_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -2240,6 +2301,7 @@ xfs_bmap_add_extent_unwritten_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -2277,6 +2339,7 @@ xfs_bmap_add_extent_unwritten_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -2287,6 +2350,7 @@ xfs_bmap_add_extent_unwritten_real(
 			if ((error = xfs_btree_insert(cur, &i)))
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -2317,6 +2381,7 @@ xfs_bmap_add_extent_unwritten_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -2353,6 +2418,7 @@ xfs_bmap_add_extent_unwritten_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -2363,12 +2429,14 @@ xfs_bmap_add_extent_unwritten_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 0)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
 			if ((error = xfs_btree_insert(cur, &i)))
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -2405,6 +2473,7 @@ xfs_bmap_add_extent_unwritten_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -2417,6 +2486,7 @@ xfs_bmap_add_extent_unwritten_real(
 			if ((error = xfs_btree_insert(cur, &i)))
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -2429,6 +2499,7 @@ xfs_bmap_add_extent_unwritten_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 0)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -2436,6 +2507,7 @@ xfs_bmap_add_extent_unwritten_real(
 			if ((error = xfs_btree_insert(cur, &i)))
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -2472,7 +2544,7 @@ xfs_bmap_add_extent_unwritten_real(
 
 	/* clear out the allocated field, done with it now in any case. */
 	if (cur) {
-		cur->bc_ino.allocated = 0;
+		cur->bc_bmap.allocated = 0;
 		*curp = cur;
 	}
 
@@ -2651,7 +2723,7 @@ xfs_bmap_add_extent_hole_real(
 	struct xfs_bmbt_irec	old;
 
 	ASSERT(!isnullstartblock(new->br_startblock));
-	ASSERT(!cur || !(cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL));
+	ASSERT(!cur || !(cur->bc_flags & XFS_BTREE_BMBT_WASDEL));
 
 	XFS_STATS_INC(mp, xs_add_exlist);
 
@@ -2721,6 +2793,7 @@ xfs_bmap_add_extent_hole_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -2728,6 +2801,7 @@ xfs_bmap_add_extent_hole_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -2735,6 +2809,7 @@ xfs_bmap_add_extent_hole_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -2764,6 +2839,7 @@ xfs_bmap_add_extent_hole_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -2794,6 +2870,7 @@ xfs_bmap_add_extent_hole_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -2820,6 +2897,7 @@ xfs_bmap_add_extent_hole_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 0)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -2827,6 +2905,7 @@ xfs_bmap_add_extent_hole_real(
 			if (error)
 				goto done;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto done;
 			}
@@ -2853,7 +2932,7 @@ xfs_bmap_add_extent_hole_real(
 
 	/* clear out the allocated field, done with it now in any case. */
 	if (cur)
-		cur->bc_ino.allocated = 0;
+		cur->bc_bmap.allocated = 0;
 
 	xfs_bmap_check_leaf_extents(cur, ip, whichfork);
 done:
@@ -3898,14 +3977,18 @@ xfs_bmapi_read(
 
 	ASSERT(*nmap >= 1);
 	ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_ENTIRE)));
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
 
-	if (WARN_ON_ONCE(!ifp))
+	if (WARN_ON_ONCE(!ifp)) {
+		xfs_bmap_mark_sick(ip, whichfork);
 		return -EFSCORRUPTED;
+	}
 
 	if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
-	    XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT))
+	    XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+		xfs_bmap_mark_sick(ip, whichfork);
 		return -EFSCORRUPTED;
+	}
 
 	if (xfs_is_shutdown(mp))
 		return -EIO;
@@ -4160,9 +4243,8 @@ xfs_bmapi_allocate(
 	 */
 	bma->nallocs++;
 
-	if (bma->cur)
-		bma->cur->bc_ino.flags =
-			bma->wasdel ? XFS_BTCUR_BMBT_WASDEL : 0;
+	if (bma->cur && bma->wasdel)
+		bma->cur->bc_flags |= XFS_BTREE_BMBT_WASDEL;
 
 	bma->got.br_startoff = bma->offset;
 	bma->got.br_startblock = bma->blkno;
@@ -4369,7 +4451,7 @@ xfs_bmapi_write(
 	ASSERT(tp != NULL);
 	ASSERT(len > 0);
 	ASSERT(ifp->if_format != XFS_DINODE_FMT_LOCAL);
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
 	ASSERT(!(flags & XFS_BMAPI_REMAP));
 
 	/* zeroing is for currently only for data extents, not metadata */
@@ -4386,6 +4468,7 @@ xfs_bmapi_write(
 
 	if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
 	    XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+		xfs_bmap_mark_sick(ip, whichfork);
 		return -EFSCORRUPTED;
 	}
 
@@ -4613,9 +4696,11 @@ xfs_bmapi_convert_delalloc(
 	error = -ENOSPC;
 	if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK))
 		goto out_finish;
-	error = -EFSCORRUPTED;
-	if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock)))
+	if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock))) {
+		xfs_bmap_mark_sick(ip, whichfork);
+		error = -EFSCORRUPTED;
 		goto out_finish;
+	}
 
 	XFS_STATS_ADD(mp, xs_xstrat_bytes, XFS_FSB_TO_B(mp, bma.length));
 	XFS_STATS_INC(mp, xs_xstrat_quick);
@@ -4666,7 +4751,7 @@ xfs_bmapi_remap(
 	ifp = xfs_ifork_ptr(ip, whichfork);
 	ASSERT(len > 0);
 	ASSERT(len <= (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN);
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
 	ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC |
 			   XFS_BMAPI_NORMAP)));
 	ASSERT((flags & (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)) !=
@@ -4674,6 +4759,7 @@ xfs_bmapi_remap(
 
 	if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
 	    XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+		xfs_bmap_mark_sick(ip, whichfork);
 		return -EFSCORRUPTED;
 	}
 
@@ -4693,10 +4779,8 @@ xfs_bmapi_remap(
 	ip->i_nblocks += len;
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
-	if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+	if (ifp->if_format == XFS_DINODE_FMT_BTREE)
 		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
-		cur->bc_ino.flags = 0;
-	}
 
 	got.br_startoff = bno;
 	got.br_startblock = startblock;
@@ -4831,7 +4915,7 @@ xfs_bmap_del_extent_delay(
 
 	XFS_STATS_INC(mp, xs_del_exlist);
 
-	isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
+	isrt = xfs_ifork_is_realtime(ip, whichfork);
 	del_endoff = del->br_startoff + del->br_blockcount;
 	got_endoff = got->br_startoff + got->br_blockcount;
 	da_old = startblockval(got->br_startblock);
@@ -5067,7 +5151,7 @@ xfs_bmap_del_extent_real(
 		return -ENOSPC;
 
 	*logflagsp = XFS_ILOG_CORE;
-	if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
+	if (xfs_ifork_is_realtime(ip, whichfork)) {
 		if (!(bflags & XFS_BMAPI_REMAP)) {
 			error = xfs_rtfree_blocks(tp, del->br_startblock,
 					del->br_blockcount);
@@ -5088,8 +5172,10 @@ xfs_bmap_del_extent_real(
 		error = xfs_bmbt_lookup_eq(cur, &got, &i);
 		if (error)
 			return error;
-		if (XFS_IS_CORRUPT(mp, i != 1))
+		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			return -EFSCORRUPTED;
+		}
 	}
 
 	if (got.br_startoff == del->br_startoff)
@@ -5113,8 +5199,10 @@ xfs_bmap_del_extent_real(
 		}
 		if ((error = xfs_btree_delete(cur, &i)))
 			return error;
-		if (XFS_IS_CORRUPT(mp, i != 1))
+		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			return -EFSCORRUPTED;
+		}
 		break;
 	case BMAP_LEFT_FILLING:
 		/*
@@ -5186,8 +5274,10 @@ xfs_bmap_del_extent_real(
 				error = xfs_bmbt_lookup_eq(cur, &got, &i);
 				if (error)
 					return error;
-				if (XFS_IS_CORRUPT(mp, i != 1))
+				if (XFS_IS_CORRUPT(mp, i != 1)) {
+					xfs_btree_mark_sick(cur);
 					return -EFSCORRUPTED;
+				}
 				/*
 				 * Update the btree record back
 				 * to the original value.
@@ -5203,8 +5293,10 @@ xfs_bmap_del_extent_real(
 				*logflagsp = 0;
 				return -ENOSPC;
 			}
-			if (XFS_IS_CORRUPT(mp, i != 1))
+			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				return -EFSCORRUPTED;
+			}
 		} else
 			*logflagsp |= xfs_ilog_fext(whichfork);
 
@@ -5286,12 +5378,14 @@ __xfs_bunmapi(
 	whichfork = xfs_bmapi_whichfork(flags);
 	ASSERT(whichfork != XFS_COW_FORK);
 	ifp = xfs_ifork_ptr(ip, whichfork);
-	if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)))
+	if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp))) {
+		xfs_bmap_mark_sick(ip, whichfork);
 		return -EFSCORRUPTED;
+	}
 	if (xfs_is_shutdown(mp))
 		return -EIO;
 
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
 	ASSERT(len > 0);
 	ASSERT(nexts >= 0);
 
@@ -5304,7 +5398,7 @@ __xfs_bunmapi(
 		return 0;
 	}
 	XFS_STATS_INC(mp, xs_blk_unmap);
-	isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip);
+	isrt = xfs_ifork_is_realtime(ip, whichfork);
 	end = start + len;
 
 	if (!xfs_iext_lookup_extent_before(ip, ifp, &end, &icur, &got)) {
@@ -5317,7 +5411,6 @@ __xfs_bunmapi(
 	if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
 		ASSERT(ifp->if_format == XFS_DINODE_FMT_BTREE);
 		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
-		cur->bc_ino.flags = 0;
 	} else
 		cur = NULL;
 
@@ -5367,7 +5460,7 @@ __xfs_bunmapi(
 		if (del.br_startoff + del.br_blockcount > end + 1)
 			del.br_blockcount = end + 1 - del.br_startoff;
 
-		if (!isrt)
+		if (!isrt || (flags & XFS_BMAPI_REMAP))
 			goto delete;
 
 		mod = xfs_rtb_to_rtxoff(mp,
@@ -5385,7 +5478,7 @@ __xfs_bunmapi(
 				 * This piece is unwritten, or we're not
 				 * using unwritten extents.  Skip over it.
 				 */
-				ASSERT(end >= mod);
+				ASSERT((flags & XFS_BMAPI_REMAP) || end >= mod);
 				end -= mod > del.br_blockcount ?
 					del.br_blockcount : mod;
 				if (end < got.br_startoff &&
@@ -5555,7 +5648,7 @@ error0:
 		xfs_trans_log_inode(tp, ip, logflags);
 	if (cur) {
 		if (!error)
-			cur->bc_ino.allocated = 0;
+			cur->bc_bmap.allocated = 0;
 		xfs_btree_del_cursor(cur, error);
 	}
 	return error;
@@ -5635,8 +5728,7 @@ xfs_bmse_merge(
 
 	blockcount = left->br_blockcount + got->br_blockcount;
 
-	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 	ASSERT(xfs_bmse_can_merge(left, got, shift));
 
 	new = *left;
@@ -5657,21 +5749,27 @@ xfs_bmse_merge(
 	error = xfs_bmbt_lookup_eq(cur, got, &i);
 	if (error)
 		return error;
-	if (XFS_IS_CORRUPT(mp, i != 1))
+	if (XFS_IS_CORRUPT(mp, i != 1)) {
+		xfs_btree_mark_sick(cur);
 		return -EFSCORRUPTED;
+	}
 
 	error = xfs_btree_delete(cur, &i);
 	if (error)
 		return error;
-	if (XFS_IS_CORRUPT(mp, i != 1))
+	if (XFS_IS_CORRUPT(mp, i != 1)) {
+		xfs_btree_mark_sick(cur);
 		return -EFSCORRUPTED;
+	}
 
 	/* lookup and update size of the previous extent */
 	error = xfs_bmbt_lookup_eq(cur, left, &i);
 	if (error)
 		return error;
-	if (XFS_IS_CORRUPT(mp, i != 1))
+	if (XFS_IS_CORRUPT(mp, i != 1)) {
+		xfs_btree_mark_sick(cur);
 		return -EFSCORRUPTED;
+	}
 
 	error = xfs_bmbt_update(cur, &new);
 	if (error)
@@ -5719,8 +5817,10 @@ xfs_bmap_shift_update_extent(
 		error = xfs_bmbt_lookup_eq(cur, &prev, &i);
 		if (error)
 			return error;
-		if (XFS_IS_CORRUPT(mp, i != 1))
+		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			return -EFSCORRUPTED;
+		}
 
 		error = xfs_bmbt_update(cur, got);
 		if (error)
@@ -5758,28 +5858,28 @@ xfs_bmap_collapse_extents(
 
 	if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
 	    XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+		xfs_bmap_mark_sick(ip, whichfork);
 		return -EFSCORRUPTED;
 	}
 
 	if (xfs_is_shutdown(mp))
 		return -EIO;
 
-	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 
 	error = xfs_iread_extents(tp, ip, whichfork);
 	if (error)
 		return error;
 
-	if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+	if (ifp->if_format == XFS_DINODE_FMT_BTREE)
 		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
-		cur->bc_ino.flags = 0;
-	}
 
 	if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, &icur, &got)) {
 		*done = true;
 		goto del_cursor;
 	}
 	if (XFS_IS_CORRUPT(mp, isnullstartblock(got.br_startblock))) {
+		xfs_bmap_mark_sick(ip, whichfork);
 		error = -EFSCORRUPTED;
 		goto del_cursor;
 	}
@@ -5837,7 +5937,7 @@ xfs_bmap_can_insert_extents(
 	int			is_empty;
 	int			error = 0;
 
-	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);
 
 	if (xfs_is_shutdown(ip->i_mount))
 		return -EIO;
@@ -5873,22 +5973,21 @@ xfs_bmap_insert_extents(
 
 	if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
 	    XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+		xfs_bmap_mark_sick(ip, whichfork);
 		return -EFSCORRUPTED;
 	}
 
 	if (xfs_is_shutdown(mp))
 		return -EIO;
 
-	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 
 	error = xfs_iread_extents(tp, ip, whichfork);
 	if (error)
 		return error;
 
-	if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+	if (ifp->if_format == XFS_DINODE_FMT_BTREE)
 		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
-		cur->bc_ino.flags = 0;
-	}
 
 	if (*next_fsb == NULLFSBLOCK) {
 		xfs_iext_last(ifp, &icur);
@@ -5904,11 +6003,13 @@ xfs_bmap_insert_extents(
 		}
 	}
 	if (XFS_IS_CORRUPT(mp, isnullstartblock(got.br_startblock))) {
+		xfs_bmap_mark_sick(ip, whichfork);
 		error = -EFSCORRUPTED;
 		goto del_cursor;
 	}
 
 	if (XFS_IS_CORRUPT(mp, stop_fsb > got.br_startoff)) {
+		xfs_bmap_mark_sick(ip, whichfork);
 		error = -EFSCORRUPTED;
 		goto del_cursor;
 	}
@@ -5976,6 +6077,7 @@ xfs_bmap_split_extent(
 
 	if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(ifp)) ||
 	    XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+		xfs_bmap_mark_sick(ip, whichfork);
 		return -EFSCORRUPTED;
 	}
 
@@ -6002,11 +6104,11 @@ xfs_bmap_split_extent(
 
 	if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
 		cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
-		cur->bc_ino.flags = 0;
 		error = xfs_bmbt_lookup_eq(cur, &got, &i);
 		if (error)
 			goto del_cursor;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto del_cursor;
 		}
@@ -6034,6 +6136,7 @@ xfs_bmap_split_extent(
 		if (error)
 			goto del_cursor;
 		if (XFS_IS_CORRUPT(mp, i != 0)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto del_cursor;
 		}
@@ -6041,6 +6144,7 @@ xfs_bmap_split_extent(
 		if (error)
 			goto del_cursor;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto del_cursor;
 		}
@@ -6060,7 +6164,7 @@ xfs_bmap_split_extent(
 
 del_cursor:
 	if (cur) {
-		cur->bc_ino.allocated = 0;
+		cur->bc_bmap.allocated = 0;
 		xfs_btree_del_cursor(cur, error);
 	}
 
@@ -6069,17 +6173,8 @@ del_cursor:
 	return error;
 }
 
-/* Deferred mapping is only for real extents in the data fork. */
-static bool
-xfs_bmap_is_update_needed(
-	struct xfs_bmbt_irec	*bmap)
-{
-	return  bmap->br_startblock != HOLESTARTBLOCK &&
-		bmap->br_startblock != DELAYSTARTBLOCK;
-}
-
 /* Record a bmap intent. */
-static int
+static inline void
 __xfs_bmap_add(
 	struct xfs_trans		*tp,
 	enum xfs_bmap_intent_type	type,
@@ -6089,25 +6184,19 @@ __xfs_bmap_add(
 {
 	struct xfs_bmap_intent		*bi;
 
-	trace_xfs_bmap_defer(tp->t_mountp,
-			XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock),
-			type,
-			XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock),
-			ip->i_ino, whichfork,
-			bmap->br_startoff,
-			bmap->br_blockcount,
-			bmap->br_state);
+	if ((whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK) ||
+	    bmap->br_startblock == HOLESTARTBLOCK ||
+	    bmap->br_startblock == DELAYSTARTBLOCK)
+		return;
 
-	bi = kmem_cache_alloc(xfs_bmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+	bi = kmem_cache_alloc(xfs_bmap_intent_cache, GFP_KERNEL | __GFP_NOFAIL);
 	INIT_LIST_HEAD(&bi->bi_list);
 	bi->bi_type = type;
 	bi->bi_owner = ip;
 	bi->bi_whichfork = whichfork;
 	bi->bi_bmap = *bmap;
 
-	xfs_bmap_update_get_group(tp->t_mountp, bi);
-	xfs_defer_add(tp, &bi->bi_list, &xfs_bmap_update_defer_type);
-	return 0;
+	xfs_bmap_defer_add(tp, bi);
 }
 
 /* Map an extent into a file. */
@@ -6115,12 +6204,10 @@ void
 xfs_bmap_map_extent(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*ip,
+	int			whichfork,
 	struct xfs_bmbt_irec	*PREV)
 {
-	if (!xfs_bmap_is_update_needed(PREV))
-		return;
-
-	__xfs_bmap_add(tp, XFS_BMAP_MAP, ip, XFS_DATA_FORK, PREV);
+	__xfs_bmap_add(tp, XFS_BMAP_MAP, ip, whichfork, PREV);
 }
 
 /* Unmap an extent out of a file. */
@@ -6128,12 +6215,10 @@ void
 xfs_bmap_unmap_extent(
 	struct xfs_trans	*tp,
 	struct xfs_inode	*ip,
+	int			whichfork,
 	struct xfs_bmbt_irec	*PREV)
 {
-	if (!xfs_bmap_is_update_needed(PREV))
-		return;
-
-	__xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, XFS_DATA_FORK, PREV);
+	__xfs_bmap_add(tp, XFS_BMAP_UNMAP, ip, whichfork, PREV);
 }
 
 /*
@@ -6147,36 +6232,35 @@ xfs_bmap_finish_one(
 {
 	struct xfs_bmbt_irec		*bmap = &bi->bi_bmap;
 	int				error = 0;
+	int				flags = 0;
 
-	ASSERT(tp->t_highest_agno == NULLAGNUMBER);
+	if (bi->bi_whichfork == XFS_ATTR_FORK)
+		flags |= XFS_BMAPI_ATTRFORK;
 
-	trace_xfs_bmap_deferred(tp->t_mountp,
-			XFS_FSB_TO_AGNO(tp->t_mountp, bmap->br_startblock),
-			bi->bi_type,
-			XFS_FSB_TO_AGBNO(tp->t_mountp, bmap->br_startblock),
-			bi->bi_owner->i_ino, bi->bi_whichfork,
-			bmap->br_startoff, bmap->br_blockcount,
-			bmap->br_state);
+	ASSERT(tp->t_highest_agno == NULLAGNUMBER);
 
-	if (WARN_ON_ONCE(bi->bi_whichfork != XFS_DATA_FORK))
-		return -EFSCORRUPTED;
+	trace_xfs_bmap_deferred(bi);
 
-	if (XFS_TEST_ERROR(false, tp->t_mountp,
-			XFS_ERRTAG_BMAP_FINISH_ONE))
+	if (XFS_TEST_ERROR(false, tp->t_mountp, XFS_ERRTAG_BMAP_FINISH_ONE))
 		return -EIO;
 
 	switch (bi->bi_type) {
 	case XFS_BMAP_MAP:
+		if (bi->bi_bmap.br_state == XFS_EXT_UNWRITTEN)
+			flags |= XFS_BMAPI_PREALLOC;
 		error = xfs_bmapi_remap(tp, bi->bi_owner, bmap->br_startoff,
-				bmap->br_blockcount, bmap->br_startblock, 0);
+				bmap->br_blockcount, bmap->br_startblock,
+				flags);
 		bmap->br_blockcount = 0;
 		break;
 	case XFS_BMAP_UNMAP:
 		error = __xfs_bunmapi(tp, bi->bi_owner, bmap->br_startoff,
-				&bmap->br_blockcount, XFS_BMAPI_REMAP, 1);
+				&bmap->br_blockcount, flags | XFS_BMAPI_REMAP,
+				1);
 		break;
 	default:
 		ASSERT(0);
+		xfs_bmap_mark_sick(bi->bi_owner, bi->bi_whichfork);
 		error = -EFSCORRUPTED;
 	}
 
@@ -6257,7 +6341,7 @@ xfs_bunmapi_range(
 	xfs_filblks_t		unmap_len = endoff - startoff + 1;
 	int			error = 0;
 
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
 
 	while (unmap_len > 0) {
 		ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER);
@@ -6274,3 +6358,46 @@ xfs_bunmapi_range(
 out:
 	return error;
 }
+
+struct xfs_bmap_query_range {
+	xfs_bmap_query_range_fn	fn;
+	void			*priv;
+};
+
+/* Format btree record and pass to our callback. */
+STATIC int
+xfs_bmap_query_range_helper(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_rec	*rec,
+	void				*priv)
+{
+	struct xfs_bmap_query_range	*query = priv;
+	struct xfs_bmbt_irec		irec;
+	xfs_failaddr_t			fa;
+
+	xfs_bmbt_disk_get_all(&rec->bmbt, &irec);
+	fa = xfs_bmap_validate_extent(cur->bc_ino.ip, cur->bc_ino.whichfork,
+			&irec);
+	if (fa) {
+		xfs_btree_mark_sick(cur);
+		return xfs_bmap_complain_bad_rec(cur->bc_ino.ip,
+				cur->bc_ino.whichfork, fa, &irec);
+	}
+
+	return query->fn(cur, &irec, query->priv);
+}
+
+/* Find all bmaps. */
+int
+xfs_bmap_query_all(
+	struct xfs_btree_cur		*cur,
+	xfs_bmap_query_range_fn		fn,
+	void				*priv)
+{
+	struct xfs_bmap_query_range	query = {
+		.priv			= priv,
+		.fn			= fn,
+	};
+
+	return xfs_btree_query_all(cur, xfs_bmap_query_range_helper, &query);
+}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index f6b73f1bad5f..f7662595309d 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -232,6 +232,10 @@ enum xfs_bmap_intent_type {
 	XFS_BMAP_UNMAP,
 };
 
+#define XFS_BMAP_INTENT_STRINGS \
+	{ XFS_BMAP_MAP,		"map" }, \
+	{ XFS_BMAP_UNMAP,	"unmap" }
+
 struct xfs_bmap_intent {
 	struct list_head			bi_list;
 	enum xfs_bmap_intent_type		bi_type;
@@ -241,14 +245,11 @@ struct xfs_bmap_intent {
 	struct xfs_bmbt_irec			bi_bmap;
 };
 
-void xfs_bmap_update_get_group(struct xfs_mount *mp,
-		struct xfs_bmap_intent *bi);
-
 int	xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_bmap_intent *bi);
 void	xfs_bmap_map_extent(struct xfs_trans *tp, struct xfs_inode *ip,
-		struct xfs_bmbt_irec *imap);
+		int whichfork, struct xfs_bmbt_irec *imap);
 void	xfs_bmap_unmap_extent(struct xfs_trans *tp, struct xfs_inode *ip,
-		struct xfs_bmbt_irec *imap);
+		int whichfork, struct xfs_bmbt_irec *imap);
 
 static inline uint32_t xfs_bmap_fork_to_state(int whichfork)
 {
@@ -280,4 +281,12 @@ extern struct kmem_cache	*xfs_bmap_intent_cache;
 int __init xfs_bmap_intent_init_cache(void);
 void xfs_bmap_intent_destroy_cache(void);
 
+typedef int (*xfs_bmap_query_range_fn)(
+	struct xfs_btree_cur	*cur,
+	struct xfs_bmbt_irec	*rec,
+	void			*priv);
+
+int xfs_bmap_query_all(struct xfs_btree_cur *cur, xfs_bmap_query_range_fn fn,
+		void *priv);
+
 #endif	/* __XFS_BMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 71f2d50f7823..f5d84dcb58da 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -26,6 +26,22 @@
 
 static struct kmem_cache	*xfs_bmbt_cur_cache;
 
+void
+xfs_bmbt_init_block(
+	struct xfs_inode		*ip,
+	struct xfs_btree_block		*buf,
+	struct xfs_buf			*bp,
+	__u16				level,
+	__u16				numrecs)
+{
+	if (bp)
+		xfs_btree_init_buf(ip->i_mount, bp, &xfs_bmbt_ops, level,
+				numrecs, ip->i_ino);
+	else
+		xfs_btree_init_block(ip->i_mount, buf, &xfs_bmbt_ops, level,
+				numrecs, ip->i_ino);
+}
+
 /*
  * Convert on-disk form of btree root to in-memory form.
  */
@@ -44,9 +60,7 @@ xfs_bmdr_to_bmbt(
 	xfs_bmbt_key_t		*tkp;
 	__be64			*tpp;
 
-	xfs_btree_init_block_int(mp, rblock, XFS_BUF_DADDR_NULL,
-				 XFS_BTNUM_BMAP, 0, 0, ip->i_ino,
-				 XFS_BTREE_LONG_PTRS);
+	xfs_bmbt_init_block(ip, rblock, NULL, 0, 0);
 	rblock->bb_level = dblock->bb_level;
 	ASSERT(be16_to_cpu(rblock->bb_level) > 0);
 	rblock->bb_numrecs = dblock->bb_numrecs;
@@ -171,13 +185,8 @@ xfs_bmbt_dup_cursor(
 
 	new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp,
 			cur->bc_ino.ip, cur->bc_ino.whichfork);
-
-	/*
-	 * Copy the firstblock, dfops, and flags values,
-	 * since init cursor doesn't get them.
-	 */
-	new->bc_ino.flags = cur->bc_ino.flags;
-
+	new->bc_flags |= (cur->bc_flags &
+		(XFS_BTREE_BMBT_INVALID_OWNER | XFS_BTREE_BMBT_WASDEL));
 	return new;
 }
 
@@ -189,10 +198,10 @@ xfs_bmbt_update_cursor(
 	ASSERT((dst->bc_tp->t_highest_agno != NULLAGNUMBER) ||
 	       (dst->bc_ino.ip->i_diflags & XFS_DIFLAG_REALTIME));
 
-	dst->bc_ino.allocated += src->bc_ino.allocated;
+	dst->bc_bmap.allocated += src->bc_bmap.allocated;
 	dst->bc_tp->t_highest_agno = src->bc_tp->t_highest_agno;
 
-	src->bc_ino.allocated = 0;
+	src->bc_bmap.allocated = 0;
 }
 
 STATIC int
@@ -211,7 +220,7 @@ xfs_bmbt_alloc_block(
 	xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_ino.ip->i_ino,
 			cur->bc_ino.whichfork);
 	args.minlen = args.maxlen = args.prod = 1;
-	args.wasdel = cur->bc_ino.flags & XFS_BTCUR_BMBT_WASDEL;
+	args.wasdel = cur->bc_flags & XFS_BTREE_BMBT_WASDEL;
 	if (!args.wasdel && args.tp->t_blk_res == 0)
 		return -ENOSPC;
 
@@ -247,7 +256,7 @@ xfs_bmbt_alloc_block(
 	}
 
 	ASSERT(args.len == 1);
-	cur->bc_ino.allocated++;
+	cur->bc_bmap.allocated++;
 	cur->bc_ino.ip->i_nblocks++;
 	xfs_trans_log_inode(args.tp, cur->bc_ino.ip, XFS_ILOG_CORE);
 	xfs_trans_mod_dquot_byino(args.tp, cur->bc_ino.ip,
@@ -360,14 +369,6 @@ xfs_bmbt_init_rec_from_cur(
 	xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b);
 }
 
-STATIC void
-xfs_bmbt_init_ptr_from_cur(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_ptr	*ptr)
-{
-	ptr->l = 0;
-}
-
 STATIC int64_t
 xfs_bmbt_key_diff(
 	struct xfs_btree_cur		*cur,
@@ -419,7 +420,7 @@ xfs_bmbt_verify(
 		 * XXX: need a better way of verifying the owner here. Right now
 		 * just make sure there has been one set.
 		 */
-		fa = xfs_btree_lblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+		fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
 		if (fa)
 			return fa;
 	}
@@ -435,7 +436,7 @@ xfs_bmbt_verify(
 	if (level > max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]))
 		return __this_address;
 
-	return xfs_btree_lblock_verify(bp, mp->m_bmap_dmxr[level != 0]);
+	return xfs_btree_fsblock_verify(bp, mp->m_bmap_dmxr[level != 0]);
 }
 
 static void
@@ -444,7 +445,7 @@ xfs_bmbt_read_verify(
 {
 	xfs_failaddr_t	fa;
 
-	if (!xfs_btree_lblock_verify_crc(bp))
+	if (!xfs_btree_fsblock_verify_crc(bp))
 		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
 	else {
 		fa = xfs_bmbt_verify(bp);
@@ -468,7 +469,7 @@ xfs_bmbt_write_verify(
 		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
 		return;
 	}
-	xfs_btree_lblock_calc_crc(bp);
+	xfs_btree_fsblock_calc_crc(bp);
 }
 
 const struct xfs_buf_ops xfs_bmbt_buf_ops = {
@@ -515,9 +516,16 @@ xfs_bmbt_keys_contiguous(
 				 be64_to_cpu(key2->bmbt.br_startoff));
 }
 
-static const struct xfs_btree_ops xfs_bmbt_ops = {
+const struct xfs_btree_ops xfs_bmbt_ops = {
+	.name			= "bmap",
+	.type			= XFS_BTREE_TYPE_INODE,
+
 	.rec_len		= sizeof(xfs_bmbt_rec_t),
 	.key_len		= sizeof(xfs_bmbt_key_t),
+	.ptr_len		= XFS_BTREE_LONG_PTR_LEN,
+
+	.lru_refs		= XFS_BMAP_BTREE_REF,
+	.statoff		= XFS_STATS_CALC_INDEX(xs_bmbt_2),
 
 	.dup_cursor		= xfs_bmbt_dup_cursor,
 	.update_cursor		= xfs_bmbt_update_cursor,
@@ -529,7 +537,6 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
 	.init_key_from_rec	= xfs_bmbt_init_key_from_rec,
 	.init_high_key_from_rec	= xfs_bmbt_init_high_key_from_rec,
 	.init_rec_from_cur	= xfs_bmbt_init_rec_from_cur,
-	.init_ptr_from_cur	= xfs_bmbt_init_ptr_from_cur,
 	.key_diff		= xfs_bmbt_key_diff,
 	.diff_two_keys		= xfs_bmbt_diff_two_keys,
 	.buf_ops		= &xfs_bmbt_buf_ops,
@@ -538,35 +545,10 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
 	.keys_contiguous	= xfs_bmbt_keys_contiguous,
 };
 
-static struct xfs_btree_cur *
-xfs_bmbt_init_common(
-	struct xfs_mount	*mp,
-	struct xfs_trans	*tp,
-	struct xfs_inode	*ip,
-	int			whichfork)
-{
-	struct xfs_btree_cur	*cur;
-
-	ASSERT(whichfork != XFS_COW_FORK);
-
-	cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_BMAP,
-			mp->m_bm_maxlevels[whichfork], xfs_bmbt_cur_cache);
-	cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2);
-
-	cur->bc_ops = &xfs_bmbt_ops;
-	cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
-	if (xfs_has_crc(mp))
-		cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
-
-	cur->bc_ino.ip = ip;
-	cur->bc_ino.allocated = 0;
-	cur->bc_ino.flags = 0;
-
-	return cur;
-}
-
 /*
- * Allocate a new bmap btree cursor.
+ * Create a new bmap btree cursor.
+ *
+ * For staging cursors -1 in passed in whichfork.
  */
 struct xfs_btree_cur *
 xfs_bmbt_init_cursor(
@@ -575,15 +557,34 @@ xfs_bmbt_init_cursor(
 	struct xfs_inode	*ip,
 	int			whichfork)
 {
-	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 	struct xfs_btree_cur	*cur;
+	unsigned int		maxlevels;
 
-	cur = xfs_bmbt_init_common(mp, tp, ip, whichfork);
+	ASSERT(whichfork != XFS_COW_FORK);
 
-	cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
-	cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork);
+	/*
+	 * The Data fork always has larger maxlevel, so use that for staging
+	 * cursors.
+	 */
+	switch (whichfork) {
+	case XFS_STAGING_FORK:
+		maxlevels = mp->m_bm_maxlevels[XFS_DATA_FORK];
+		break;
+	default:
+		maxlevels = mp->m_bm_maxlevels[whichfork];
+		break;
+	}
+	cur = xfs_btree_alloc_cursor(mp, tp, &xfs_bmbt_ops, maxlevels,
+			xfs_bmbt_cur_cache);
+	cur->bc_ino.ip = ip;
 	cur->bc_ino.whichfork = whichfork;
+	cur->bc_bmap.allocated = 0;
+	if (whichfork != XFS_STAGING_FORK) {
+		struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
 
+		cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
+		cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork);
+	}
 	return cur;
 }
 
@@ -599,33 +600,6 @@ xfs_bmbt_block_maxrecs(
 }
 
 /*
- * Allocate a new bmap btree cursor for reloading an inode block mapping data
- * structure.  Note that callers can use the staged cursor to reload extents
- * format inode forks if they rebuild the iext tree and commit the staged
- * cursor immediately.
- */
-struct xfs_btree_cur *
-xfs_bmbt_stage_cursor(
-	struct xfs_mount	*mp,
-	struct xfs_inode	*ip,
-	struct xbtree_ifakeroot	*ifake)
-{
-	struct xfs_btree_cur	*cur;
-	struct xfs_btree_ops	*ops;
-
-	/* data fork always has larger maxheight */
-	cur = xfs_bmbt_init_common(mp, NULL, ip, XFS_DATA_FORK);
-	cur->bc_nlevels = ifake->if_levels;
-	cur->bc_ino.forksize = ifake->if_fork_size;
-
-	/* Don't let anyone think we're attached to the real fork yet. */
-	cur->bc_ino.whichfork = -1;
-	xfs_btree_stage_ifakeroot(cur, ifake, &ops);
-	ops->update_cursor = NULL;
-	return cur;
-}
-
-/*
  * Swap in the new inode fork root.  Once we pass this point the newly rebuilt
  * mappings are in place and we have to kill off any old btree blocks.
  */
@@ -665,7 +639,7 @@ xfs_bmbt_commit_staged_btree(
 		break;
 	}
 	xfs_trans_log_inode(tp, cur->bc_ino.ip, flags);
-	xfs_btree_commit_ifakeroot(cur, tp, whichfork, &xfs_bmbt_ops);
+	xfs_btree_commit_ifakeroot(cur, tp, whichfork);
 }
 
 /*
@@ -751,7 +725,7 @@ xfs_bmbt_change_owner(
 	ASSERT(xfs_ifork_ptr(ip, whichfork)->if_format == XFS_DINODE_FMT_BTREE);
 
 	cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
-	cur->bc_ino.flags |= XFS_BTCUR_BMBT_INVALID_OWNER;
+	cur->bc_flags |= XFS_BTREE_BMBT_INVALID_OWNER;
 
 	error = xfs_btree_change_owner(cur, new_owner, buffer_list);
 	xfs_btree_del_cursor(cur, error);
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 151b8491f60e..de1b73f1225c 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -107,8 +107,6 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
 
 extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
 		struct xfs_trans *, struct xfs_inode *, int);
-struct xfs_btree_cur *xfs_bmbt_stage_cursor(struct xfs_mount *mp,
-		struct xfs_inode *ip, struct xbtree_ifakeroot *ifake);
 void xfs_bmbt_commit_staged_btree(struct xfs_btree_cur *cur,
 		struct xfs_trans *tp, int whichfork);
 
@@ -120,4 +118,7 @@ unsigned int xfs_bmbt_maxlevels_ondisk(void);
 int __init xfs_bmbt_init_cur_cache(void);
 void xfs_bmbt_destroy_cur_cache(void);
 
+void xfs_bmbt_init_block(struct xfs_inode *ip, struct xfs_btree_block *buf,
+		struct xfs_buf *bp, __u16 level, __u16 numrecs);
+
 #endif	/* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index ea8d3659df20..d29547572a68 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -27,28 +27,24 @@
 #include "xfs_bmap_btree.h"
 #include "xfs_rmap_btree.h"
 #include "xfs_refcount_btree.h"
+#include "xfs_health.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
 
 /*
  * Btree magic numbers.
  */
-static const uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
-	{ XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
-	  XFS_FIBT_MAGIC, 0 },
-	{ XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC,
-	  XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC,
-	  XFS_REFC_CRC_MAGIC }
-};
-
 uint32_t
 xfs_btree_magic(
-	int			crc,
-	xfs_btnum_t		btnum)
+	struct xfs_mount		*mp,
+	const struct xfs_btree_ops	*ops)
 {
-	uint32_t		magic = xfs_magics[crc][btnum];
+	int				idx = xfs_has_crc(mp) ? 1 : 0;
+	__be32				magic = ops->buf_ops->magic[idx];
 
 	/* Ensure we asked for crc for crc-only magics. */
 	ASSERT(magic != 0);
-	return magic;
+	return be32_to_cpu(magic);
 }
 
 /*
@@ -63,10 +59,8 @@ xfs_btree_magic(
  * bytes.
  */
 static inline xfs_failaddr_t
-xfs_btree_check_lblock_siblings(
+xfs_btree_check_fsblock_siblings(
 	struct xfs_mount	*mp,
-	struct xfs_btree_cur	*cur,
-	int			level,
 	xfs_fsblock_t		fsb,
 	__be64			dsibling)
 {
@@ -78,22 +72,33 @@ xfs_btree_check_lblock_siblings(
 	sibling = be64_to_cpu(dsibling);
 	if (sibling == fsb)
 		return __this_address;
-	if (level >= 0) {
-		if (!xfs_btree_check_lptr(cur, sibling, level + 1))
-			return __this_address;
-	} else {
-		if (!xfs_verify_fsbno(mp, sibling))
-			return __this_address;
-	}
+	if (!xfs_verify_fsbno(mp, sibling))
+		return __this_address;
+	return NULL;
+}
 
+static inline xfs_failaddr_t
+xfs_btree_check_memblock_siblings(
+	struct xfs_buftarg	*btp,
+	xfbno_t			bno,
+	__be64			dsibling)
+{
+	xfbno_t			sibling;
+
+	if (dsibling == cpu_to_be64(NULLFSBLOCK))
+		return NULL;
+
+	sibling = be64_to_cpu(dsibling);
+	if (sibling == bno)
+		return __this_address;
+	if (!xmbuf_verify_daddr(btp, xfbno_to_daddr(sibling)))
+		return __this_address;
 	return NULL;
 }
 
 static inline xfs_failaddr_t
-xfs_btree_check_sblock_siblings(
+xfs_btree_check_agblock_siblings(
 	struct xfs_perag	*pag,
-	struct xfs_btree_cur	*cur,
-	int			level,
 	xfs_agblock_t		agbno,
 	__be32			dsibling)
 {
@@ -105,34 +110,21 @@ xfs_btree_check_sblock_siblings(
 	sibling = be32_to_cpu(dsibling);
 	if (sibling == agbno)
 		return __this_address;
-	if (level >= 0) {
-		if (!xfs_btree_check_sptr(cur, sibling, level + 1))
-			return __this_address;
-	} else {
-		if (!xfs_verify_agbno(pag, sibling))
-			return __this_address;
-	}
+	if (!xfs_verify_agbno(pag, sibling))
+		return __this_address;
 	return NULL;
 }
 
-/*
- * Check a long btree block header.  Return the address of the failing check,
- * or NULL if everything is ok.
- */
-xfs_failaddr_t
-__xfs_btree_check_lblock(
+static xfs_failaddr_t
+__xfs_btree_check_lblock_hdr(
 	struct xfs_btree_cur	*cur,
 	struct xfs_btree_block	*block,
 	int			level,
 	struct xfs_buf		*bp)
 {
 	struct xfs_mount	*mp = cur->bc_mp;
-	xfs_btnum_t		btnum = cur->bc_btnum;
-	int			crc = xfs_has_crc(mp);
-	xfs_failaddr_t		fa;
-	xfs_fsblock_t		fsb = NULLFSBLOCK;
 
-	if (crc) {
+	if (xfs_has_crc(mp)) {
 		if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
 			return __this_address;
 		if (block->bb_u.l.bb_blkno !=
@@ -142,7 +134,7 @@ __xfs_btree_check_lblock(
 			return __this_address;
 	}
 
-	if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
+	if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops))
 		return __this_address;
 	if (be16_to_cpu(block->bb_level) != level)
 		return __this_address;
@@ -150,44 +142,83 @@ __xfs_btree_check_lblock(
 	    cur->bc_ops->get_maxrecs(cur, level))
 		return __this_address;
 
-	if (bp)
-		fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+	return NULL;
+}
+
+/*
+ * Check a long btree block header.  Return the address of the failing check,
+ * or NULL if everything is ok.
+ */
+static xfs_failaddr_t
+__xfs_btree_check_fsblock(
+	struct xfs_btree_cur	*cur,
+	struct xfs_btree_block	*block,
+	int			level,
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = cur->bc_mp;
+	xfs_failaddr_t		fa;
+	xfs_fsblock_t		fsb;
+
+	fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp);
+	if (fa)
+		return fa;
 
-	fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
+	/*
+	 * For inode-rooted btrees, the root block sits in the inode fork.  In
+	 * that case bp is NULL, and the block must not have any siblings.
+	 */
+	if (!bp) {
+		if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK))
+			return __this_address;
+		if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK))
+			return __this_address;
+		return NULL;
+	}
+
+	fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
+	fa = xfs_btree_check_fsblock_siblings(mp, fsb,
 			block->bb_u.l.bb_leftsib);
 	if (!fa)
-		fa = xfs_btree_check_lblock_siblings(mp, cur, level, fsb,
+		fa = xfs_btree_check_fsblock_siblings(mp, fsb,
 				block->bb_u.l.bb_rightsib);
 	return fa;
 }
 
-/* Check a long btree block header. */
-static int
-xfs_btree_check_lblock(
+/*
+ * Check an in-memory btree block header.  Return the address of the failing
+ * check, or NULL if everything is ok.
+ */
+static xfs_failaddr_t
+__xfs_btree_check_memblock(
 	struct xfs_btree_cur	*cur,
 	struct xfs_btree_block	*block,
 	int			level,
 	struct xfs_buf		*bp)
 {
-	struct xfs_mount	*mp = cur->bc_mp;
+	struct xfs_buftarg	*btp = cur->bc_mem.xfbtree->target;
 	xfs_failaddr_t		fa;
+	xfbno_t			bno;
 
-	fa = __xfs_btree_check_lblock(cur, block, level, bp);
-	if (XFS_IS_CORRUPT(mp, fa != NULL) ||
-	    XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK)) {
-		if (bp)
-			trace_xfs_btree_corrupt(bp, _RET_IP_);
-		return -EFSCORRUPTED;
-	}
-	return 0;
+	fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp);
+	if (fa)
+		return fa;
+
+	bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp));
+	fa = xfs_btree_check_memblock_siblings(btp, bno,
+			block->bb_u.l.bb_leftsib);
+	if (!fa)
+		fa = xfs_btree_check_memblock_siblings(btp, bno,
+				block->bb_u.l.bb_rightsib);
+	return fa;
 }
 
 /*
  * Check a short btree block header.  Return the address of the failing check,
  * or NULL if everything is ok.
  */
-xfs_failaddr_t
-__xfs_btree_check_sblock(
+static xfs_failaddr_t
+__xfs_btree_check_agblock(
 	struct xfs_btree_cur	*cur,
 	struct xfs_btree_block	*block,
 	int			level,
@@ -195,20 +226,17 @@ __xfs_btree_check_sblock(
 {
 	struct xfs_mount	*mp = cur->bc_mp;
 	struct xfs_perag	*pag = cur->bc_ag.pag;
-	xfs_btnum_t		btnum = cur->bc_btnum;
-	int			crc = xfs_has_crc(mp);
 	xfs_failaddr_t		fa;
-	xfs_agblock_t		agbno = NULLAGBLOCK;
+	xfs_agblock_t		agbno;
 
-	if (crc) {
+	if (xfs_has_crc(mp)) {
 		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
 			return __this_address;
-		if (block->bb_u.s.bb_blkno !=
-		    cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL))
+		if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
 			return __this_address;
 	}
 
-	if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(crc, btnum))
+	if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops))
 		return __this_address;
 	if (be16_to_cpu(block->bb_level) != level)
 		return __this_address;
@@ -216,36 +244,45 @@ __xfs_btree_check_sblock(
 	    cur->bc_ops->get_maxrecs(cur, level))
 		return __this_address;
 
-	if (bp)
-		agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
-
-	fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno,
+	agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
+	fa = xfs_btree_check_agblock_siblings(pag, agbno,
 			block->bb_u.s.bb_leftsib);
 	if (!fa)
-		fa = xfs_btree_check_sblock_siblings(pag, cur, level, agbno,
+		fa = xfs_btree_check_agblock_siblings(pag, agbno,
 				block->bb_u.s.bb_rightsib);
 	return fa;
 }
 
-/* Check a short btree block header. */
-STATIC int
-xfs_btree_check_sblock(
+/*
+ * Internal btree block check.
+ *
+ * Return NULL if the block is ok or the address of the failed check otherwise.
+ */
+xfs_failaddr_t
+__xfs_btree_check_block(
 	struct xfs_btree_cur	*cur,
 	struct xfs_btree_block	*block,
 	int			level,
 	struct xfs_buf		*bp)
 {
-	struct xfs_mount	*mp = cur->bc_mp;
-	xfs_failaddr_t		fa;
-
-	fa = __xfs_btree_check_sblock(cur, block, level, bp);
-	if (XFS_IS_CORRUPT(mp, fa != NULL) ||
-	    XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BTREE_CHECK_SBLOCK)) {
-		if (bp)
-			trace_xfs_btree_corrupt(bp, _RET_IP_);
-		return -EFSCORRUPTED;
+	switch (cur->bc_ops->type) {
+	case XFS_BTREE_TYPE_MEM:
+		return __xfs_btree_check_memblock(cur, block, level, bp);
+	case XFS_BTREE_TYPE_AG:
+		return __xfs_btree_check_agblock(cur, block, level, bp);
+	case XFS_BTREE_TYPE_INODE:
+		return __xfs_btree_check_fsblock(cur, block, level, bp);
+	default:
+		ASSERT(0);
+		return __this_address;
 	}
-	return 0;
+}
+
+static inline unsigned int xfs_btree_block_errtag(struct xfs_btree_cur *cur)
+{
+	if (cur->bc_ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN)
+		return XFS_ERRTAG_BTREE_CHECK_SBLOCK;
+	return XFS_ERRTAG_BTREE_CHECK_LBLOCK;
 }
 
 /*
@@ -258,34 +295,49 @@ xfs_btree_check_block(
 	int			level,	/* level of the btree block */
 	struct xfs_buf		*bp)	/* buffer containing block, if any */
 {
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-		return xfs_btree_check_lblock(cur, block, level, bp);
-	else
-		return xfs_btree_check_sblock(cur, block, level, bp);
-}
+	struct xfs_mount	*mp = cur->bc_mp;
+	xfs_failaddr_t		fa;
 
-/* Check that this long pointer is valid and points within the fs. */
-bool
-xfs_btree_check_lptr(
-	struct xfs_btree_cur	*cur,
-	xfs_fsblock_t		fsbno,
-	int			level)
-{
-	if (level <= 0)
-		return false;
-	return xfs_verify_fsbno(cur->bc_mp, fsbno);
+	fa = __xfs_btree_check_block(cur, block, level, bp);
+	if (XFS_IS_CORRUPT(mp, fa != NULL) ||
+	    XFS_TEST_ERROR(false, mp, xfs_btree_block_errtag(cur))) {
+		if (bp)
+			trace_xfs_btree_corrupt(bp, _RET_IP_);
+		xfs_btree_mark_sick(cur);
+		return -EFSCORRUPTED;
+	}
+	return 0;
 }
 
-/* Check that this short pointer is valid and points within the AG. */
-bool
-xfs_btree_check_sptr(
-	struct xfs_btree_cur	*cur,
-	xfs_agblock_t		agbno,
-	int			level)
+int
+__xfs_btree_check_ptr(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_ptr	*ptr,
+	int				index,
+	int				level)
 {
 	if (level <= 0)
-		return false;
-	return xfs_verify_agbno(cur->bc_ag.pag, agbno);
+		return -EFSCORRUPTED;
+
+	switch (cur->bc_ops->type) {
+	case XFS_BTREE_TYPE_MEM:
+		if (!xfbtree_verify_bno(cur->bc_mem.xfbtree,
+				be64_to_cpu((&ptr->l)[index])))
+			return -EFSCORRUPTED;
+		break;
+	case XFS_BTREE_TYPE_INODE:
+		if (!xfs_verify_fsbno(cur->bc_mp,
+				be64_to_cpu((&ptr->l)[index])))
+			return -EFSCORRUPTED;
+		break;
+	case XFS_BTREE_TYPE_AG:
+		if (!xfs_verify_agbno(cur->bc_ag.pag,
+				be32_to_cpu((&ptr->s)[index])))
+			return -EFSCORRUPTED;
+		break;
+	}
+
+	return 0;
 }
 
 /*
@@ -299,26 +351,35 @@ xfs_btree_check_ptr(
 	int				index,
 	int				level)
 {
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
-		if (xfs_btree_check_lptr(cur, be64_to_cpu((&ptr->l)[index]),
-				level))
-			return 0;
-		xfs_err(cur->bc_mp,
-"Inode %llu fork %d: Corrupt btree %d pointer at level %d index %d.",
+	int				error;
+
+	error = __xfs_btree_check_ptr(cur, ptr, index, level);
+	if (error) {
+		switch (cur->bc_ops->type) {
+		case XFS_BTREE_TYPE_MEM:
+			xfs_err(cur->bc_mp,
+"In-memory: Corrupt %sbt flags 0x%x pointer at level %d index %d fa %pS.",
+				cur->bc_ops->name, cur->bc_flags, level, index,
+				__this_address);
+			break;
+		case XFS_BTREE_TYPE_INODE:
+			xfs_err(cur->bc_mp,
+"Inode %llu fork %d: Corrupt %sbt pointer at level %d index %d.",
 				cur->bc_ino.ip->i_ino,
-				cur->bc_ino.whichfork, cur->bc_btnum,
+				cur->bc_ino.whichfork, cur->bc_ops->name,
 				level, index);
-	} else {
-		if (xfs_btree_check_sptr(cur, be32_to_cpu((&ptr->s)[index]),
-				level))
-			return 0;
-		xfs_err(cur->bc_mp,
-"AG %u: Corrupt btree %d pointer at level %d index %d.",
-				cur->bc_ag.pag->pag_agno, cur->bc_btnum,
+			break;
+		case XFS_BTREE_TYPE_AG:
+			xfs_err(cur->bc_mp,
+"AG %u: Corrupt %sbt pointer at level %d index %d.",
+				cur->bc_ag.pag->pag_agno, cur->bc_ops->name,
 				level, index);
+			break;
+		}
+		xfs_btree_mark_sick(cur);
 	}
 
-	return -EFSCORRUPTED;
+	return error;
 }
 
 #ifdef DEBUG
@@ -336,7 +397,7 @@ xfs_btree_check_ptr(
  * it to disk.
  */
 void
-xfs_btree_lblock_calc_crc(
+xfs_btree_fsblock_calc_crc(
 	struct xfs_buf		*bp)
 {
 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
@@ -350,7 +411,7 @@ xfs_btree_lblock_calc_crc(
 }
 
 bool
-xfs_btree_lblock_verify_crc(
+xfs_btree_fsblock_verify_crc(
 	struct xfs_buf		*bp)
 {
 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
@@ -374,7 +435,7 @@ xfs_btree_lblock_verify_crc(
  * it to disk.
  */
 void
-xfs_btree_sblock_calc_crc(
+xfs_btree_agblock_calc_crc(
 	struct xfs_buf		*bp)
 {
 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
@@ -388,7 +449,7 @@ xfs_btree_sblock_calc_crc(
 }
 
 bool
-xfs_btree_sblock_verify_crc(
+xfs_btree_agblock_verify_crc(
 	struct xfs_buf		*bp)
 {
 	struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
@@ -410,6 +471,17 @@ xfs_btree_free_block(
 {
 	int			error;
 
+	trace_xfs_btree_free_block(cur, bp);
+
+	/*
+	 * Don't allow block freeing for a staging cursor, because staging
+	 * cursors do not support regular btree modifications.
+	 */
+	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
+		ASSERT(0);
+		return -EFSCORRUPTED;
+	}
+
 	error = cur->bc_ops->free_block(cur, bp);
 	if (!error) {
 		xfs_trans_binval(cur->bc_tp, bp);
@@ -448,33 +520,70 @@ xfs_btree_del_cursor(
 	 * zero, then we should be shut down or on our way to shutdown due to
 	 * cancelling a dirty transaction on error.
 	 */
-	ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 ||
+	ASSERT(!xfs_btree_is_bmap(cur->bc_ops) || cur->bc_bmap.allocated == 0 ||
 	       xfs_is_shutdown(cur->bc_mp) || error != 0);
-	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
-		kmem_free(cur->bc_ops);
-	if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag)
-		xfs_perag_put(cur->bc_ag.pag);
+
+	switch (cur->bc_ops->type) {
+	case XFS_BTREE_TYPE_AG:
+		if (cur->bc_ag.pag)
+			xfs_perag_put(cur->bc_ag.pag);
+		break;
+	case XFS_BTREE_TYPE_INODE:
+		/* nothing to do */
+		break;
+	case XFS_BTREE_TYPE_MEM:
+		if (cur->bc_mem.pag)
+			xfs_perag_put(cur->bc_mem.pag);
+		break;
+	}
+
 	kmem_cache_free(cur->bc_cache, cur);
 }
 
+/* Return the buffer target for this btree's buffer. */
+static inline struct xfs_buftarg *
+xfs_btree_buftarg(
+	struct xfs_btree_cur	*cur)
+{
+	if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM)
+		return cur->bc_mem.xfbtree->target;
+	return cur->bc_mp->m_ddev_targp;
+}
+
+/* Return the block size (in units of 512b sectors) for this btree. */
+static inline unsigned int
+xfs_btree_bbsize(
+	struct xfs_btree_cur	*cur)
+{
+	if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM)
+		return XFBNO_BBSIZE;
+	return cur->bc_mp->m_bsize;
+}
+
 /*
  * Duplicate the btree cursor.
  * Allocate a new one, copy the record, re-get the buffers.
  */
-int					/* error */
+int						/* error */
 xfs_btree_dup_cursor(
-	struct xfs_btree_cur *cur,		/* input cursor */
-	struct xfs_btree_cur **ncur)		/* output cursor */
+	struct xfs_btree_cur	*cur,		/* input cursor */
+	struct xfs_btree_cur	**ncur)		/* output cursor */
 {
-	struct xfs_buf	*bp;		/* btree block's buffer pointer */
-	int		error;		/* error return value */
-	int		i;		/* level number of btree block */
-	xfs_mount_t	*mp;		/* mount structure for filesystem */
-	struct xfs_btree_cur *new;		/* new cursor value */
-	xfs_trans_t	*tp;		/* transaction pointer, can be NULL */
+	struct xfs_mount	*mp = cur->bc_mp;
+	struct xfs_trans	*tp = cur->bc_tp;
+	struct xfs_buf		*bp;
+	struct xfs_btree_cur	*new;
+	int			error;
+	int			i;
 
-	tp = cur->bc_tp;
-	mp = cur->bc_mp;
+	/*
+	 * Don't allow staging cursors to be duplicated because they're supposed
+	 * to be kept private to a single thread.
+	 */
+	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
+		ASSERT(0);
+		return -EFSCORRUPTED;
+	}
 
 	/*
 	 * Allocate a new cursor like the old one.
@@ -494,10 +603,13 @@ xfs_btree_dup_cursor(
 		new->bc_levels[i].ra = cur->bc_levels[i].ra;
 		bp = cur->bc_levels[i].bp;
 		if (bp) {
-			error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
-						   xfs_buf_daddr(bp), mp->m_bsize,
-						   0, &bp,
-						   cur->bc_ops->buf_ops);
+			error = xfs_trans_read_buf(mp, tp,
+					xfs_btree_buftarg(cur),
+					xfs_buf_daddr(bp),
+					xfs_btree_bbsize(cur), 0, &bp,
+					cur->bc_ops->buf_ops);
+			if (xfs_metadata_is_sick(error))
+				xfs_btree_mark_sick(new);
 			if (error) {
 				xfs_btree_del_cursor(new, error);
 				*ncur = NULL;
@@ -539,7 +651,7 @@ xfs_btree_dup_cursor(
  * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
  * inside the btree block is done using indices starting at one, not zero!
  *
- * If XFS_BTREE_OVERLAPPING is set, then this btree supports keys containing
+ * If XFS_BTGEO_OVERLAPPING is set, then this btree supports keys containing
  * overlapping intervals.  In such a tree, records are still sorted lowest to
  * highest and indexed by the smallest key value that refers to the record.
  * However, nodes are different: each pointer has two associated keys -- one
@@ -589,26 +701,17 @@ xfs_btree_dup_cursor(
  */
 static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
 {
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
-		if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
+	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
+		if (xfs_has_crc(cur->bc_mp))
 			return XFS_BTREE_LBLOCK_CRC_LEN;
 		return XFS_BTREE_LBLOCK_LEN;
 	}
-	if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
+	if (xfs_has_crc(cur->bc_mp))
 		return XFS_BTREE_SBLOCK_CRC_LEN;
 	return XFS_BTREE_SBLOCK_LEN;
 }
 
 /*
- * Return size of btree block pointers for this btree instance.
- */
-static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
-{
-	return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
-		sizeof(__be64) : sizeof(__be32);
-}
-
-/*
  * Calculate offset of the n-th record in a btree block.
  */
 STATIC size_t
@@ -655,7 +758,7 @@ xfs_btree_ptr_offset(
 {
 	return xfs_btree_block_len(cur) +
 		cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
-		(n - 1) * xfs_btree_ptr_len(cur);
+		(n - 1) * cur->bc_ops->ptr_len;
 }
 
 /*
@@ -718,7 +821,7 @@ struct xfs_ifork *
 xfs_btree_ifork_ptr(
 	struct xfs_btree_cur	*cur)
 {
-	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+	ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
 
 	if (cur->bc_flags & XFS_BTREE_STAGING)
 		return cur->bc_ino.ifake->if_fork;
@@ -750,8 +853,7 @@ xfs_btree_get_block(
 	int			level,	/* level in btree */
 	struct xfs_buf		**bpp)	/* buffer containing the block */
 {
-	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
-	    (level == cur->bc_nlevels - 1)) {
+	if (xfs_btree_at_iroot(cur, level)) {
 		*bpp = NULL;
 		return xfs_btree_get_iroot(cur);
 	}
@@ -856,95 +958,52 @@ xfs_btree_offsets(
 	}
 }
 
-/*
- * Get a buffer for the block, return it read in.
- * Long-form addressing.
- */
-int
-xfs_btree_read_bufl(
-	struct xfs_mount	*mp,		/* file system mount point */
-	struct xfs_trans	*tp,		/* transaction pointer */
-	xfs_fsblock_t		fsbno,		/* file system block number */
-	struct xfs_buf		**bpp,		/* buffer for fsbno */
-	int			refval,		/* ref count value for buffer */
-	const struct xfs_buf_ops *ops)
-{
-	struct xfs_buf		*bp;		/* return value */
-	xfs_daddr_t		d;		/* real disk block address */
-	int			error;
-
-	if (!xfs_verify_fsbno(mp, fsbno))
-		return -EFSCORRUPTED;
-	d = XFS_FSB_TO_DADDR(mp, fsbno);
-	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
-				   mp->m_bsize, 0, &bp, ops);
-	if (error)
-		return error;
-	if (bp)
-		xfs_buf_set_ref(bp, refval);
-	*bpp = bp;
-	return 0;
-}
-
-/*
- * Read-ahead the block, don't wait for it, don't return a buffer.
- * Long-form addressing.
- */
-/* ARGSUSED */
-void
-xfs_btree_reada_bufl(
-	struct xfs_mount	*mp,		/* file system mount point */
-	xfs_fsblock_t		fsbno,		/* file system block number */
-	xfs_extlen_t		count,		/* count of filesystem blocks */
-	const struct xfs_buf_ops *ops)
+STATIC int
+xfs_btree_readahead_fsblock(
+	struct xfs_btree_cur	*cur,
+	int			lr,
+	struct xfs_btree_block	*block)
 {
-	xfs_daddr_t		d;
+	struct xfs_mount	*mp = cur->bc_mp;
+	xfs_fsblock_t		left = be64_to_cpu(block->bb_u.l.bb_leftsib);
+	xfs_fsblock_t		right = be64_to_cpu(block->bb_u.l.bb_rightsib);
+	int			rval = 0;
 
-	ASSERT(fsbno != NULLFSBLOCK);
-	d = XFS_FSB_TO_DADDR(mp, fsbno);
-	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
-}
+	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
+		xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, left),
+				mp->m_bsize, cur->bc_ops->buf_ops);
+		rval++;
+	}
 
-/*
- * Read-ahead the block, don't wait for it, don't return a buffer.
- * Short-form addressing.
- */
-/* ARGSUSED */
-void
-xfs_btree_reada_bufs(
-	struct xfs_mount	*mp,		/* file system mount point */
-	xfs_agnumber_t		agno,		/* allocation group number */
-	xfs_agblock_t		agbno,		/* allocation group block number */
-	xfs_extlen_t		count,		/* count of filesystem blocks */
-	const struct xfs_buf_ops *ops)
-{
-	xfs_daddr_t		d;
+	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
+		xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, right),
+				mp->m_bsize, cur->bc_ops->buf_ops);
+		rval++;
+	}
 
-	ASSERT(agno != NULLAGNUMBER);
-	ASSERT(agbno != NULLAGBLOCK);
-	d = XFS_AGB_TO_DADDR(mp, agno, agbno);
-	xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
+	return rval;
 }
 
 STATIC int
-xfs_btree_readahead_lblock(
+xfs_btree_readahead_memblock(
 	struct xfs_btree_cur	*cur,
 	int			lr,
 	struct xfs_btree_block	*block)
 {
+	struct xfs_buftarg	*btp = cur->bc_mem.xfbtree->target;
+	xfbno_t			left = be64_to_cpu(block->bb_u.l.bb_leftsib);
+	xfbno_t			right = be64_to_cpu(block->bb_u.l.bb_rightsib);
 	int			rval = 0;
-	xfs_fsblock_t		left = be64_to_cpu(block->bb_u.l.bb_leftsib);
-	xfs_fsblock_t		right = be64_to_cpu(block->bb_u.l.bb_rightsib);
 
 	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
-		xfs_btree_reada_bufl(cur->bc_mp, left, 1,
-				     cur->bc_ops->buf_ops);
+		xfs_buf_readahead(btp, xfbno_to_daddr(left), XFBNO_BBSIZE,
+				cur->bc_ops->buf_ops);
 		rval++;
 	}
 
 	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
-		xfs_btree_reada_bufl(cur->bc_mp, right, 1,
-				     cur->bc_ops->buf_ops);
+		xfs_buf_readahead(btp, xfbno_to_daddr(right), XFBNO_BBSIZE,
+				cur->bc_ops->buf_ops);
 		rval++;
 	}
 
@@ -952,25 +1011,28 @@ xfs_btree_readahead_lblock(
 }
 
 STATIC int
-xfs_btree_readahead_sblock(
+xfs_btree_readahead_agblock(
 	struct xfs_btree_cur	*cur,
 	int			lr,
-	struct xfs_btree_block *block)
+	struct xfs_btree_block	*block)
 {
-	int			rval = 0;
+	struct xfs_mount	*mp = cur->bc_mp;
+	xfs_agnumber_t		agno = cur->bc_ag.pag->pag_agno;
 	xfs_agblock_t		left = be32_to_cpu(block->bb_u.s.bb_leftsib);
 	xfs_agblock_t		right = be32_to_cpu(block->bb_u.s.bb_rightsib);
-
+	int			rval = 0;
 
 	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
-		xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno,
-				     left, 1, cur->bc_ops->buf_ops);
+		xfs_buf_readahead(mp->m_ddev_targp,
+				XFS_AGB_TO_DADDR(mp, agno, left),
+				mp->m_bsize, cur->bc_ops->buf_ops);
 		rval++;
 	}
 
 	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
-		xfs_btree_reada_bufs(cur->bc_mp, cur->bc_ag.pag->pag_agno,
-				     right, 1, cur->bc_ops->buf_ops);
+		xfs_buf_readahead(mp->m_ddev_targp,
+				XFS_AGB_TO_DADDR(mp, agno, right),
+				mp->m_bsize, cur->bc_ops->buf_ops);
 		rval++;
 	}
 
@@ -993,8 +1055,7 @@ xfs_btree_readahead(
 	 * No readahead needed if we are at the root level and the
 	 * btree root is stored in the inode.
 	 */
-	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
-	    (lev == cur->bc_nlevels - 1))
+	if (xfs_btree_at_iroot(cur, lev))
 		return 0;
 
 	if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra)
@@ -1003,9 +1064,17 @@ xfs_btree_readahead(
 	cur->bc_levels[lev].ra |= lr;
 	block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp);
 
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-		return xfs_btree_readahead_lblock(cur, lr, block);
-	return xfs_btree_readahead_sblock(cur, lr, block);
+	switch (cur->bc_ops->type) {
+	case XFS_BTREE_TYPE_AG:
+		return xfs_btree_readahead_agblock(cur, lr, block);
+	case XFS_BTREE_TYPE_INODE:
+		return xfs_btree_readahead_fsblock(cur, lr, block);
+	case XFS_BTREE_TYPE_MEM:
+		return xfs_btree_readahead_memblock(cur, lr, block);
+	default:
+		ASSERT(0);
+		return 0;
+	}
 }
 
 STATIC int
@@ -1014,23 +1083,24 @@ xfs_btree_ptr_to_daddr(
 	const union xfs_btree_ptr	*ptr,
 	xfs_daddr_t			*daddr)
 {
-	xfs_fsblock_t		fsbno;
-	xfs_agblock_t		agbno;
 	int			error;
 
 	error = xfs_btree_check_ptr(cur, ptr, 0, 1);
 	if (error)
 		return error;
 
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
-		fsbno = be64_to_cpu(ptr->l);
-		*daddr = XFS_FSB_TO_DADDR(cur->bc_mp, fsbno);
-	} else {
-		agbno = be32_to_cpu(ptr->s);
+	switch (cur->bc_ops->type) {
+	case XFS_BTREE_TYPE_AG:
 		*daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno,
-				agbno);
+				be32_to_cpu(ptr->s));
+		break;
+	case XFS_BTREE_TYPE_INODE:
+		*daddr = XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
+		break;
+	case XFS_BTREE_TYPE_MEM:
+		*daddr = xfbno_to_daddr(be64_to_cpu(ptr->l));
+		break;
 	}
-
 	return 0;
 }
 
@@ -1050,8 +1120,9 @@ xfs_btree_readahead_ptr(
 
 	if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr))
 		return;
-	xfs_buf_readahead(cur->bc_mp->m_ddev_targp, daddr,
-			  cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
+	xfs_buf_readahead(xfs_btree_buftarg(cur), daddr,
+			xfs_btree_bbsize(cur) * count,
+			cur->bc_ops->buf_ops);
 }
 
 /*
@@ -1072,7 +1143,7 @@ xfs_btree_setbuf(
 	cur->bc_levels[lev].ra = 0;
 
 	b = XFS_BUF_TO_BLOCK(bp);
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
 		if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
 			cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
 		if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
@@ -1090,7 +1161,7 @@ xfs_btree_ptr_is_null(
 	struct xfs_btree_cur		*cur,
 	const union xfs_btree_ptr	*ptr)
 {
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
 		return ptr->l == cpu_to_be64(NULLFSBLOCK);
 	else
 		return ptr->s == cpu_to_be32(NULLAGBLOCK);
@@ -1101,12 +1172,23 @@ xfs_btree_set_ptr_null(
 	struct xfs_btree_cur	*cur,
 	union xfs_btree_ptr	*ptr)
 {
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
 		ptr->l = cpu_to_be64(NULLFSBLOCK);
 	else
 		ptr->s = cpu_to_be32(NULLAGBLOCK);
 }
 
+static inline bool
+xfs_btree_ptrs_equal(
+	struct xfs_btree_cur		*cur,
+	union xfs_btree_ptr		*ptr1,
+	union xfs_btree_ptr		*ptr2)
+{
+	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
+		return ptr1->l == ptr2->l;
+	return ptr1->s == ptr2->s;
+}
+
 /*
  * Get/set/init sibling pointers
  */
@@ -1119,7 +1201,7 @@ xfs_btree_get_sibling(
 {
 	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
 
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
 		if (lr == XFS_BB_RIGHTSIB)
 			ptr->l = block->bb_u.l.bb_rightsib;
 		else
@@ -1141,7 +1223,7 @@ xfs_btree_set_sibling(
 {
 	ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
 
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
 		if (lr == XFS_BB_RIGHTSIB)
 			block->bb_u.l.bb_rightsib = ptr->l;
 		else
@@ -1154,25 +1236,24 @@ xfs_btree_set_sibling(
 	}
 }
 
-void
-xfs_btree_init_block_int(
+static void
+__xfs_btree_init_block(
 	struct xfs_mount	*mp,
 	struct xfs_btree_block	*buf,
+	const struct xfs_btree_ops *ops,
 	xfs_daddr_t		blkno,
-	xfs_btnum_t		btnum,
 	__u16			level,
 	__u16			numrecs,
-	__u64			owner,
-	unsigned int		flags)
+	__u64			owner)
 {
-	int			crc = xfs_has_crc(mp);
-	__u32			magic = xfs_btree_magic(crc, btnum);
+	bool			crc = xfs_has_crc(mp);
+	__u32			magic = xfs_btree_magic(mp, ops);
 
 	buf->bb_magic = cpu_to_be32(magic);
 	buf->bb_level = cpu_to_be16(level);
 	buf->bb_numrecs = cpu_to_be16(numrecs);
 
-	if (flags & XFS_BTREE_LONG_PTRS) {
+	if (ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
 		buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
 		buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
 		if (crc) {
@@ -1183,14 +1264,12 @@ xfs_btree_init_block_int(
 			buf->bb_u.l.bb_lsn = 0;
 		}
 	} else {
-		/* owner is a 32 bit value on short blocks */
-		__u32 __owner = (__u32)owner;
-
 		buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
 		buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
 		if (crc) {
 			buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
-			buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
+			/* owner is a 32 bit value on short blocks */
+			buf->bb_u.s.bb_owner = cpu_to_be32((__u32)owner);
 			uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
 			buf->bb_u.s.bb_lsn = 0;
 		}
@@ -1199,15 +1278,46 @@ xfs_btree_init_block_int(
 
 void
 xfs_btree_init_block(
-	struct xfs_mount *mp,
-	struct xfs_buf	*bp,
-	xfs_btnum_t	btnum,
-	__u16		level,
-	__u16		numrecs,
-	__u64		owner)
+	struct xfs_mount	*mp,
+	struct xfs_btree_block	*block,
+	const struct xfs_btree_ops *ops,
+	__u16			level,
+	__u16			numrecs,
+	__u64			owner)
+{
+	__xfs_btree_init_block(mp, block, ops, XFS_BUF_DADDR_NULL, level,
+			numrecs, owner);
+}
+
+void
+xfs_btree_init_buf(
+	struct xfs_mount		*mp,
+	struct xfs_buf			*bp,
+	const struct xfs_btree_ops	*ops,
+	__u16				level,
+	__u16				numrecs,
+	__u64				owner)
+{
+	__xfs_btree_init_block(mp, XFS_BUF_TO_BLOCK(bp), ops,
+			xfs_buf_daddr(bp), level, numrecs, owner);
+	bp->b_ops = ops->buf_ops;
+}
+
+static inline __u64
+xfs_btree_owner(
+	struct xfs_btree_cur    *cur)
 {
-	xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), xfs_buf_daddr(bp),
-				 btnum, level, numrecs, owner, 0);
+	switch (cur->bc_ops->type) {
+	case XFS_BTREE_TYPE_MEM:
+		return cur->bc_mem.xfbtree->owner;
+	case XFS_BTREE_TYPE_INODE:
+		return cur->bc_ino.ip->i_ino;
+	case XFS_BTREE_TYPE_AG:
+		return cur->bc_ag.pag->pag_agno;
+	default:
+		ASSERT(0);
+		return 0;
+	}
 }
 
 void
@@ -1217,22 +1327,8 @@ xfs_btree_init_block_cur(
 	int			level,
 	int			numrecs)
 {
-	__u64			owner;
-
-	/*
-	 * we can pull the owner from the cursor right now as the different
-	 * owners align directly with the pointer size of the btree. This may
-	 * change in future, but is safe for current users of the generic btree
-	 * code.
-	 */
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-		owner = cur->bc_ino.ip->i_ino;
-	else
-		owner = cur->bc_ag.pag->pag_agno;
-
-	xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp),
-				xfs_buf_daddr(bp), cur->bc_btnum, level,
-				numrecs, owner, cur->bc_flags);
+	xfs_btree_init_buf(cur->bc_mp, bp, cur->bc_ops, level, numrecs,
+			xfs_btree_owner(cur));
 }
 
 /*
@@ -1250,7 +1346,7 @@ xfs_btree_is_lastrec(
 
 	if (level > 0)
 		return 0;
-	if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
+	if (!(cur->bc_ops->geom_flags & XFS_BTGEO_LASTREC_UPDATE))
 		return 0;
 
 	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
@@ -1265,41 +1361,27 @@ xfs_btree_buf_to_ptr(
 	struct xfs_buf		*bp,
 	union xfs_btree_ptr	*ptr)
 {
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
-		ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
-					xfs_buf_daddr(bp)));
-	else {
+	switch (cur->bc_ops->type) {
+	case XFS_BTREE_TYPE_AG:
 		ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
 					xfs_buf_daddr(bp)));
+		break;
+	case XFS_BTREE_TYPE_INODE:
+		ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
+					xfs_buf_daddr(bp)));
+		break;
+	case XFS_BTREE_TYPE_MEM:
+		ptr->l = cpu_to_be64(xfs_daddr_to_xfbno(xfs_buf_daddr(bp)));
+		break;
 	}
 }
 
-STATIC void
+static inline void
 xfs_btree_set_refs(
 	struct xfs_btree_cur	*cur,
 	struct xfs_buf		*bp)
 {
-	switch (cur->bc_btnum) {
-	case XFS_BTNUM_BNO:
-	case XFS_BTNUM_CNT:
-		xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
-		break;
-	case XFS_BTNUM_INO:
-	case XFS_BTNUM_FINO:
-		xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
-		break;
-	case XFS_BTNUM_BMAP:
-		xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
-		break;
-	case XFS_BTNUM_RMAP:
-		xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF);
-		break;
-	case XFS_BTNUM_REFC:
-		xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF);
-		break;
-	default:
-		ASSERT(0);
-	}
+	xfs_buf_set_ref(bp, cur->bc_ops->lru_refs);
 }
 
 int
@@ -1309,15 +1391,14 @@ xfs_btree_get_buf_block(
 	struct xfs_btree_block		**block,
 	struct xfs_buf			**bpp)
 {
-	struct xfs_mount	*mp = cur->bc_mp;
-	xfs_daddr_t		d;
-	int			error;
+	xfs_daddr_t			d;
+	int				error;
 
 	error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
 	if (error)
 		return error;
-	error = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d, mp->m_bsize,
-			0, bpp);
+	error = xfs_trans_get_buf(cur->bc_tp, xfs_btree_buftarg(cur), d,
+			xfs_btree_bbsize(cur), 0, bpp);
 	if (error)
 		return error;
 
@@ -1348,9 +1429,11 @@ xfs_btree_read_buf_block(
 	error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
 	if (error)
 		return error;
-	error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
-				   mp->m_bsize, flags, bpp,
-				   cur->bc_ops->buf_ops);
+	error = xfs_trans_read_buf(mp, cur->bc_tp, xfs_btree_buftarg(cur), d,
+			xfs_btree_bbsize(cur), flags, bpp,
+			cur->bc_ops->buf_ops);
+	if (xfs_metadata_is_sick(error))
+		xfs_btree_mark_sick(cur);
 	if (error)
 		return error;
 
@@ -1398,7 +1481,7 @@ xfs_btree_copy_ptrs(
 	int			numptrs)
 {
 	ASSERT(numptrs >= 0);
-	memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
+	memcpy(dst_ptr, src_ptr, numptrs * cur->bc_ops->ptr_len);
 }
 
 /*
@@ -1454,8 +1537,8 @@ xfs_btree_shift_ptrs(
 	ASSERT(numptrs >= 0);
 	ASSERT(dir == 1 || dir == -1);
 
-	dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
-	memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
+	dst_ptr = (char *)ptr + (dir * cur->bc_ops->ptr_len);
+	memmove(dst_ptr, ptr, numptrs * cur->bc_ops->ptr_len);
 }
 
 /*
@@ -1566,7 +1649,7 @@ xfs_btree_log_block(
 	if (bp) {
 		int nbits;
 
-		if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+		if (xfs_has_crc(cur->bc_mp)) {
 			/*
 			 * We don't log the CRC when updating a btree
 			 * block but instead recreate it during log
@@ -1581,7 +1664,7 @@ xfs_btree_log_block(
 			nbits = XFS_BB_NUM_BITS;
 		}
 		xfs_btree_offsets(fields,
-				  (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
+				  (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) ?
 					loffsets : soffsets,
 				  nbits, &first, &last);
 		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
@@ -1658,9 +1741,10 @@ xfs_btree_increment(
 	 * confused or have the tree root in an inode.
 	 */
 	if (lev == cur->bc_nlevels) {
-		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+		if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
 			goto out0;
 		ASSERT(0);
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto error0;
 	}
@@ -1751,9 +1835,10 @@ xfs_btree_decrement(
 	 * or the root of the tree is in an inode.
 	 */
 	if (lev == cur->bc_nlevels) {
-		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+		if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
 			goto out0;
 		ASSERT(0);
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto error0;
 	}
@@ -1786,6 +1871,33 @@ error0:
 	return error;
 }
 
+/*
+ * Check the btree block owner now that we have the context to know who the
+ * real owner is.
+ */
+static inline xfs_failaddr_t
+xfs_btree_check_block_owner(
+	struct xfs_btree_cur	*cur,
+	struct xfs_btree_block	*block)
+{
+	__u64			owner;
+
+	if (!xfs_has_crc(cur->bc_mp) ||
+	    (cur->bc_flags & XFS_BTREE_BMBT_INVALID_OWNER))
+		return NULL;
+
+	owner = xfs_btree_owner(cur);
+	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
+		if (be64_to_cpu(block->bb_u.l.bb_owner) != owner)
+			return __this_address;
+	} else {
+		if (be32_to_cpu(block->bb_u.s.bb_owner) != owner)
+			return __this_address;
+	}
+
+	return NULL;
+}
+
 int
 xfs_btree_lookup_get_block(
 	struct xfs_btree_cur		*cur,	/* btree cursor */
@@ -1798,8 +1910,7 @@ xfs_btree_lookup_get_block(
 	int			error = 0;
 
 	/* special case the root block if in an inode */
-	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
-	    (level == cur->bc_nlevels - 1)) {
+	if (xfs_btree_at_iroot(cur, level)) {
 		*blkp = xfs_btree_get_iroot(cur);
 		return 0;
 	}
@@ -1824,11 +1935,7 @@ xfs_btree_lookup_get_block(
 		return error;
 
 	/* Check the inode owner since the verifiers don't. */
-	if (xfs_has_crc(cur->bc_mp) &&
-	    !(cur->bc_ino.flags & XFS_BTCUR_BMBT_INVALID_OWNER) &&
-	    (cur->bc_flags & XFS_BTREE_LONG_PTRS) &&
-	    be64_to_cpu((*blkp)->bb_u.l.bb_owner) !=
-			cur->bc_ino.ip->i_ino)
+	if (xfs_btree_check_block_owner(cur, *blkp) != NULL)
 		goto out_bad;
 
 	/* Did we get the level we were looking for? */
@@ -1846,6 +1953,7 @@ out_bad:
 	*blkp = NULL;
 	xfs_buf_mark_corrupt(bp);
 	xfs_trans_brelse(cur->bc_tp, bp);
+	xfs_btree_mark_sick(cur);
 	return -EFSCORRUPTED;
 }
 
@@ -1872,6 +1980,27 @@ xfs_lookup_get_search_key(
 }
 
 /*
+ * Initialize a pointer to the root block.
+ */
+void
+xfs_btree_init_ptr_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr)
+{
+	if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
+		/*
+		 * Inode-rooted btrees call xfs_btree_get_iroot to find the root
+		 * in xfs_btree_lookup_get_block and don't need a pointer here.
+		 */
+		ptr->l = 0;
+	} else if (cur->bc_flags & XFS_BTREE_STAGING) {
+		ptr->s = cpu_to_be32(cur->bc_ag.afake->af_root);
+	} else {
+		cur->bc_ops->init_ptr_from_cur(cur, ptr);
+	}
+}
+
+/*
  * Lookup the record.  The cursor is made to point to it, based on dir.
  * stat is set to 0 if can't find any such record, 1 for success.
  */
@@ -1892,14 +2021,16 @@ xfs_btree_lookup(
 	XFS_BTREE_STATS_INC(cur, lookup);
 
 	/* No such thing as a zero-level tree. */
-	if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0))
+	if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0)) {
+		xfs_btree_mark_sick(cur);
 		return -EFSCORRUPTED;
+	}
 
 	block = NULL;
 	keyno = 0;
 
 	/* initialise start pointer from cursor */
-	cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+	xfs_btree_init_ptr_from_cur(cur, &ptr);
 	pp = &ptr;
 
 	/*
@@ -1936,6 +2067,7 @@ xfs_btree_lookup(
 							XFS_ERRLEVEL_LOW,
 							cur->bc_mp, block,
 							sizeof(*block));
+					xfs_btree_mark_sick(cur);
 					return -EFSCORRUPTED;
 				}
 
@@ -2012,8 +2144,10 @@ xfs_btree_lookup(
 			error = xfs_btree_increment(cur, 0, &i);
 			if (error)
 				goto error0;
-			if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+			if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				return -EFSCORRUPTED;
+			}
 			*stat = 1;
 			return 0;
 		}
@@ -2040,7 +2174,7 @@ xfs_btree_high_key_from_key(
 	struct xfs_btree_cur	*cur,
 	union xfs_btree_key	*key)
 {
-	ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
+	ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING);
 	return (union xfs_btree_key *)((char *)key +
 			(cur->bc_ops->key_len / 2));
 }
@@ -2061,7 +2195,7 @@ xfs_btree_get_leaf_keys(
 	rec = xfs_btree_rec_addr(cur, 1, block);
 	cur->bc_ops->init_key_from_rec(key, rec);
 
-	if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+	if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
 
 		cur->bc_ops->init_high_key_from_rec(&max_hkey, rec);
 		for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
@@ -2088,7 +2222,7 @@ xfs_btree_get_node_keys(
 	union xfs_btree_key	*high;
 	int			n;
 
-	if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+	if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
 		memcpy(key, xfs_btree_key_addr(cur, 1, block),
 				cur->bc_ops->key_len / 2);
 
@@ -2132,7 +2266,7 @@ xfs_btree_needs_key_update(
 	struct xfs_btree_cur	*cur,
 	int			ptr)
 {
-	return (cur->bc_flags & XFS_BTREE_OVERLAPPING) || ptr == 1;
+	return (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) || ptr == 1;
 }
 
 /*
@@ -2156,7 +2290,7 @@ __xfs_btree_updkeys(
 	struct xfs_buf		*bp;
 	int			ptr;
 
-	ASSERT(cur->bc_flags & XFS_BTREE_OVERLAPPING);
+	ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING);
 
 	/* Exit if there aren't any parent levels to update. */
 	if (level + 1 >= cur->bc_nlevels)
@@ -2225,7 +2359,7 @@ xfs_btree_update_keys(
 	ASSERT(level >= 0);
 
 	block = xfs_btree_get_block(cur, level, &bp);
-	if (cur->bc_flags & XFS_BTREE_OVERLAPPING)
+	if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)
 		return __xfs_btree_updkeys(cur, level, block, bp, false);
 
 	/*
@@ -2332,8 +2466,7 @@ xfs_btree_lshift(
 	int			error;		/* error return value */
 	int			i;
 
-	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
-	    level == cur->bc_nlevels - 1)
+	if (xfs_btree_at_iroot(cur, level))
 		goto out0;
 
 	/* Set up variables for this block as "right". */
@@ -2460,12 +2593,13 @@ xfs_btree_lshift(
 	 * Using a temporary cursor, update the parent key values of the
 	 * block on the left.
 	 */
-	if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+	if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
 		error = xfs_btree_dup_cursor(cur, &tcur);
 		if (error)
 			goto error0;
 		i = xfs_btree_firstrec(tcur, level);
 		if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto error0;
 		}
@@ -2527,8 +2661,7 @@ xfs_btree_rshift(
 	int			error;		/* error return value */
 	int			i;		/* loop counter */
 
-	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
-	    (level == cur->bc_nlevels - 1))
+	if (xfs_btree_at_iroot(cur, level))
 		goto out0;
 
 	/* Set up variables for this block as "left". */
@@ -2636,6 +2769,7 @@ xfs_btree_rshift(
 		goto error0;
 	i = xfs_btree_lastrec(tcur, level);
 	if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto error0;
 	}
@@ -2645,7 +2779,7 @@ xfs_btree_rshift(
 		goto error1;
 
 	/* Update the parent high keys of the left block, if needed. */
-	if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+	if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
 		error = xfs_btree_update_keys(cur, level);
 		if (error)
 			goto error1;
@@ -2673,6 +2807,32 @@ error1:
 	return error;
 }
 
+static inline int
+xfs_btree_alloc_block(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_ptr	*hint_block,
+	union xfs_btree_ptr		*new_block,
+	int				*stat)
+{
+	int				error;
+
+	/*
+	 * Don't allow block allocation for a staging cursor, because staging
+	 * cursors do not support regular btree modifications.
+	 *
+	 * Bulk loading uses a separate callback to obtain new blocks from a
+	 * preallocated list, which prevents ENOSPC failures during loading.
+	 */
+	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
+		ASSERT(0);
+		return -EFSCORRUPTED;
+	}
+
+	error = cur->bc_ops->alloc_block(cur, hint_block, new_block, stat);
+	trace_xfs_btree_alloc_block(cur, new_block, *stat, error);
+	return error;
+}
+
 /*
  * Split cur/level block in half.
  * Return new block number and the key to its first
@@ -2716,7 +2876,7 @@ __xfs_btree_split(
 	xfs_btree_buf_to_ptr(cur, lbp, &lptr);
 
 	/* Allocate the new block. If we can't do it, we're toast. Give up. */
-	error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
+	error = xfs_btree_alloc_block(cur, &lptr, &rptr, stat);
 	if (error)
 		goto error0;
 	if (*stat == 0)
@@ -2823,7 +2983,7 @@ __xfs_btree_split(
 	}
 
 	/* Update the parent high keys of the left block, if needed. */
-	if (cur->bc_flags & XFS_BTREE_OVERLAPPING) {
+	if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
 		error = xfs_btree_update_keys(cur, level);
 		if (error)
 			goto error0;
@@ -2941,7 +3101,7 @@ xfs_btree_split(
 	struct xfs_btree_split_args	args;
 	DECLARE_COMPLETION_ONSTACK(done);
 
-	if (cur->bc_btnum != XFS_BTNUM_BMAP ||
+	if (!xfs_btree_is_bmap(cur->bc_ops) ||
 	    cur->bc_tp->t_highest_agno == NULLAGNUMBER)
 		return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
 
@@ -2963,7 +3123,6 @@ xfs_btree_split(
 #define xfs_btree_split	__xfs_btree_split
 #endif /* __KERNEL__ */
 
-
 /*
  * Copy the old inode root contents into a real block and make the
  * broot point to it.
@@ -2988,7 +3147,7 @@ xfs_btree_new_iroot(
 
 	XFS_BTREE_STATS_INC(cur, newroot);
 
-	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+	ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
 
 	level = cur->bc_nlevels - 1;
 
@@ -2996,7 +3155,7 @@ xfs_btree_new_iroot(
 	pp = xfs_btree_ptr_addr(cur, 1, block);
 
 	/* Allocate the new block. If we can't do it, we're toast. Give up. */
-	error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
+	error = xfs_btree_alloc_block(cur, pp, &nptr, stat);
 	if (error)
 		goto error0;
 	if (*stat == 0)
@@ -3014,9 +3173,9 @@ xfs_btree_new_iroot(
 	 * In that case have to also ensure the blkno remains correct
 	 */
 	memcpy(cblock, block, xfs_btree_block_len(cur));
-	if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
+	if (xfs_has_crc(cur->bc_mp)) {
 		__be64 bno = cpu_to_be64(xfs_buf_daddr(cbp));
-		if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
 			cblock->bb_u.l.bb_blkno = bno;
 		else
 			cblock->bb_u.s.bb_blkno = bno;
@@ -3069,6 +3228,21 @@ error0:
 	return error;
 }
 
+static void
+xfs_btree_set_root(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_ptr	*ptr,
+	int				inc)
+{
+	if (cur->bc_flags & XFS_BTREE_STAGING) {
+		/* Update the btree root information for a per-AG fake root. */
+		cur->bc_ag.afake->af_root = be32_to_cpu(ptr->s);
+		cur->bc_ag.afake->af_levels += inc;
+	} else {
+		cur->bc_ops->set_root(cur, ptr, inc);
+	}
+}
+
 /*
  * Allocate a new root block, fill it in.
  */
@@ -3093,10 +3267,10 @@ xfs_btree_new_root(
 	XFS_BTREE_STATS_INC(cur, newroot);
 
 	/* initialise our start point from the cursor */
-	cur->bc_ops->init_ptr_from_cur(cur, &rptr);
+	xfs_btree_init_ptr_from_cur(cur, &rptr);
 
 	/* Allocate the new block. If we can't do it, we're toast. Give up. */
-	error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
+	error = xfs_btree_alloc_block(cur, &rptr, &lptr, stat);
 	if (error)
 		goto error0;
 	if (*stat == 0)
@@ -3109,7 +3283,7 @@ xfs_btree_new_root(
 		goto error0;
 
 	/* Set the root in the holding structure  increasing the level by 1. */
-	cur->bc_ops->set_root(cur, &lptr, 1);
+	xfs_btree_set_root(cur, &lptr, 1);
 
 	/*
 	 * At the previous root level there are now two blocks: the old root,
@@ -3213,8 +3387,7 @@ xfs_btree_make_block_unfull(
 {
 	int			error = 0;
 
-	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
-	    level == cur->bc_nlevels - 1) {
+	if (xfs_btree_at_iroot(cur, level)) {
 		struct xfs_inode *ip = cur->bc_ino.ip;
 
 		if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
@@ -3299,8 +3472,8 @@ xfs_btree_insrec(
 	 * If we have an external root pointer, and we've made it to the
 	 * root level, allocate a new root block and we're done.
 	 */
-	if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
-	    (level >= cur->bc_nlevels)) {
+	if (cur->bc_ops->type != XFS_BTREE_TYPE_INODE &&
+	    level >= cur->bc_nlevels) {
 		error = xfs_btree_new_root(cur, stat);
 		xfs_btree_set_ptr_null(cur, ptrp);
 
@@ -3524,6 +3697,7 @@ xfs_btree_insert(
 		}
 
 		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto error0;
 		}
@@ -3537,7 +3711,8 @@ xfs_btree_insert(
 		if (pcur != cur &&
 		    (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
 			/* Save the state from the cursor before we trash it */
-			if (cur->bc_ops->update_cursor)
+			if (cur->bc_ops->update_cursor &&
+			    !(cur->bc_flags & XFS_BTREE_STAGING))
 				cur->bc_ops->update_cursor(pcur, cur);
 			cur->bc_nlevels = pcur->bc_nlevels;
 			xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
@@ -3586,7 +3761,7 @@ xfs_btree_kill_iroot(
 #endif
 	int			i;
 
-	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+	ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
 	ASSERT(cur->bc_nlevels > 1);
 
 	/*
@@ -3680,7 +3855,7 @@ xfs_btree_kill_root(
 	 * Update the root pointer, decreasing the level by 1 and then
 	 * free the old root.
 	 */
-	cur->bc_ops->set_root(cur, newroot, -1);
+	xfs_btree_set_root(cur, newroot, -1);
 
 	error = xfs_btree_free_block(cur, bp);
 	if (error)
@@ -3822,27 +3997,25 @@ xfs_btree_delrec(
 	 * Try to get rid of the next level down.  If we can't then there's
 	 * nothing left to do.
 	 */
-	if (level == cur->bc_nlevels - 1) {
-		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
-			xfs_iroot_realloc(cur->bc_ino.ip, -1,
-					  cur->bc_ino.whichfork);
+	if (xfs_btree_at_iroot(cur, level)) {
+		xfs_iroot_realloc(cur->bc_ino.ip, -1, cur->bc_ino.whichfork);
 
-			error = xfs_btree_kill_iroot(cur);
-			if (error)
-				goto error0;
+		error = xfs_btree_kill_iroot(cur);
+		if (error)
+			goto error0;
 
-			error = xfs_btree_dec_cursor(cur, level, stat);
-			if (error)
-				goto error0;
-			*stat = 1;
-			return 0;
-		}
+		error = xfs_btree_dec_cursor(cur, level, stat);
+		if (error)
+			goto error0;
+		*stat = 1;
+		return 0;
+	}
 
-		/*
-		 * If this is the root level, and there's only one entry left,
-		 * and it's NOT the leaf level, then we can get rid of this
-		 * level.
-		 */
+	/*
+	 * If this is the root level, and there's only one entry left, and it's
+	 * NOT the leaf level, then we can get rid of this level.
+	 */
+	if (level == cur->bc_nlevels - 1) {
 		if (numrecs == 1 && level > 0) {
 			union xfs_btree_ptr	*pp;
 			/*
@@ -3891,7 +4064,7 @@ xfs_btree_delrec(
 	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
 	xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
 
-	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+	if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
 		/*
 		 * One child of root, need to get a chance to copy its contents
 		 * into the root and delete it. Can't go up to next level,
@@ -3931,6 +4104,7 @@ xfs_btree_delrec(
 		 */
 		i = xfs_btree_lastrec(tcur, level);
 		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto error0;
 		}
@@ -3939,12 +4113,14 @@ xfs_btree_delrec(
 		if (error)
 			goto error0;
 		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto error0;
 		}
 
 		i = xfs_btree_lastrec(tcur, level);
 		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto error0;
 		}
@@ -3992,6 +4168,7 @@ xfs_btree_delrec(
 		if (!xfs_btree_ptr_is_null(cur, &lptr)) {
 			i = xfs_btree_firstrec(tcur, level);
 			if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto error0;
 			}
@@ -4000,6 +4177,7 @@ xfs_btree_delrec(
 			if (error)
 				goto error0;
 			if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto error0;
 			}
@@ -4017,6 +4195,7 @@ xfs_btree_delrec(
 		 */
 		i = xfs_btree_firstrec(tcur, level);
 		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto error0;
 		}
@@ -4026,6 +4205,7 @@ xfs_btree_delrec(
 			goto error0;
 		i = xfs_btree_firstrec(tcur, level);
 		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto error0;
 		}
@@ -4201,8 +4381,8 @@ xfs_btree_delrec(
 	 * If we joined with the right neighbor and there's a level above
 	 * us, increment the cursor at that level.
 	 */
-	else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
-		   (level + 1 < cur->bc_nlevels)) {
+	else if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE ||
+		 level + 1 < cur->bc_nlevels) {
 		error = xfs_btree_increment(cur, level + 1, &i);
 		if (error)
 			goto error0;
@@ -4270,7 +4450,7 @@ xfs_btree_delete(
 	 * If we combined blocks as part of deleting the record, delrec won't
 	 * have updated the parent high keys so we have to do that here.
 	 */
-	if (joined && (cur->bc_flags & XFS_BTREE_OVERLAPPING)) {
+	if (joined && (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) {
 		error = xfs_btree_updkeys_force(cur, 0);
 		if (error)
 			goto error0;
@@ -4344,7 +4524,7 @@ xfs_btree_visit_block(
 {
 	struct xfs_btree_block		*block;
 	struct xfs_buf			*bp;
-	union xfs_btree_ptr		rptr;
+	union xfs_btree_ptr		rptr, bufptr;
 	int				error;
 
 	/* do right sibling readahead */
@@ -4367,15 +4547,12 @@ xfs_btree_visit_block(
 	 * return the same block without checking if the right sibling points
 	 * back to us and creates a cyclic reference in the btree.
 	 */
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
-		if (be64_to_cpu(rptr.l) == XFS_DADDR_TO_FSB(cur->bc_mp,
-							xfs_buf_daddr(bp)))
-			return -EFSCORRUPTED;
-	} else {
-		if (be32_to_cpu(rptr.s) == xfs_daddr_to_agbno(cur->bc_mp,
-							xfs_buf_daddr(bp)))
-			return -EFSCORRUPTED;
+	xfs_btree_buf_to_ptr(cur, bp, &bufptr);
+	if (xfs_btree_ptrs_equal(cur, &rptr, &bufptr)) {
+		xfs_btree_mark_sick(cur);
+		return -EFSCORRUPTED;
 	}
+
 	return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
 }
 
@@ -4393,7 +4570,7 @@ xfs_btree_visit_blocks(
 	struct xfs_btree_block		*block = NULL;
 	int				error = 0;
 
-	cur->bc_ops->init_ptr_from_cur(cur, &lptr);
+	xfs_btree_init_ptr_from_cur(cur, &lptr);
 
 	/* for each level */
 	for (level = cur->bc_nlevels - 1; level >= 0; level--) {
@@ -4471,7 +4648,7 @@ xfs_btree_block_change_owner(
 
 	/* modify the owner */
 	block = xfs_btree_get_block(cur, level, &bp);
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
 		if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
 			return 0;
 		block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
@@ -4489,7 +4666,7 @@ xfs_btree_block_change_owner(
 	 * though, so everything is consistent in memory.
 	 */
 	if (!bp) {
-		ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+		ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
 		ASSERT(level == cur->bc_nlevels - 1);
 		return 0;
 	}
@@ -4523,7 +4700,7 @@ xfs_btree_change_owner(
 
 /* Verify the v5 fields of a long-format btree block. */
 xfs_failaddr_t
-xfs_btree_lblock_v5hdr_verify(
+xfs_btree_fsblock_v5hdr_verify(
 	struct xfs_buf		*bp,
 	uint64_t		owner)
 {
@@ -4544,7 +4721,7 @@ xfs_btree_lblock_v5hdr_verify(
 
 /* Verify a long-format btree block. */
 xfs_failaddr_t
-xfs_btree_lblock_verify(
+xfs_btree_fsblock_verify(
 	struct xfs_buf		*bp,
 	unsigned int		max_recs)
 {
@@ -4553,28 +4730,60 @@ xfs_btree_lblock_verify(
 	xfs_fsblock_t		fsb;
 	xfs_failaddr_t		fa;
 
+	ASSERT(!xfs_buftarg_is_mem(bp->b_target));
+
 	/* numrecs verification */
 	if (be16_to_cpu(block->bb_numrecs) > max_recs)
 		return __this_address;
 
 	/* sibling pointer verification */
 	fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
-	fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb,
+	fa = xfs_btree_check_fsblock_siblings(mp, fsb,
 			block->bb_u.l.bb_leftsib);
 	if (!fa)
-		fa = xfs_btree_check_lblock_siblings(mp, NULL, -1, fsb,
+		fa = xfs_btree_check_fsblock_siblings(mp, fsb,
 				block->bb_u.l.bb_rightsib);
 	return fa;
 }
 
+/* Verify an in-memory btree block. */
+xfs_failaddr_t
+xfs_btree_memblock_verify(
+	struct xfs_buf		*bp,
+	unsigned int		max_recs)
+{
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	struct xfs_buftarg	*btp = bp->b_target;
+	xfs_failaddr_t		fa;
+	xfbno_t			bno;
+
+	ASSERT(xfs_buftarg_is_mem(bp->b_target));
+
+	/* numrecs verification */
+	if (be16_to_cpu(block->bb_numrecs) > max_recs)
+		return __this_address;
+
+	/* sibling pointer verification */
+	bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp));
+	fa = xfs_btree_check_memblock_siblings(btp, bno,
+			block->bb_u.l.bb_leftsib);
+	if (fa)
+		return fa;
+	fa = xfs_btree_check_memblock_siblings(btp, bno,
+			block->bb_u.l.bb_rightsib);
+	if (fa)
+		return fa;
+
+	return NULL;
+}
 /**
- * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format
+ * xfs_btree_agblock_v5hdr_verify() -- verify the v5 fields of a short-format
  *				      btree block
  *
  * @bp: buffer containing the btree block
  */
 xfs_failaddr_t
-xfs_btree_sblock_v5hdr_verify(
+xfs_btree_agblock_v5hdr_verify(
 	struct xfs_buf		*bp)
 {
 	struct xfs_mount	*mp = bp->b_mount;
@@ -4593,13 +4802,13 @@ xfs_btree_sblock_v5hdr_verify(
 }
 
 /**
- * xfs_btree_sblock_verify() -- verify a short-format btree block
+ * xfs_btree_agblock_verify() -- verify a short-format btree block
  *
  * @bp: buffer containing the btree block
  * @max_recs: maximum records allowed in this btree node
  */
 xfs_failaddr_t
-xfs_btree_sblock_verify(
+xfs_btree_agblock_verify(
 	struct xfs_buf		*bp,
 	unsigned int		max_recs)
 {
@@ -4608,16 +4817,18 @@ xfs_btree_sblock_verify(
 	xfs_agblock_t		agbno;
 	xfs_failaddr_t		fa;
 
+	ASSERT(!xfs_buftarg_is_mem(bp->b_target));
+
 	/* numrecs verification */
 	if (be16_to_cpu(block->bb_numrecs) > max_recs)
 		return __this_address;
 
 	/* sibling pointer verification */
 	agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
-	fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno,
+	fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno,
 			block->bb_u.s.bb_leftsib);
 	if (!fa)
-		fa = xfs_btree_check_sblock_siblings(bp->b_pag, NULL, -1, agbno,
+		fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno,
 				block->bb_u.s.bb_rightsib);
 	return fa;
 }
@@ -4815,7 +5026,7 @@ xfs_btree_overlapped_query_range(
 
 	/* Load the root of the btree. */
 	level = cur->bc_nlevels - 1;
-	cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+	xfs_btree_init_ptr_from_cur(cur, &ptr);
 	error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
 	if (error)
 		return error;
@@ -4966,7 +5177,7 @@ xfs_btree_query_range(
 	if (!xfs_btree_keycmp_le(cur, &low_key, &high_key))
 		return -EINVAL;
 
-	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+	if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
 		return xfs_btree_simple_query_range(cur, &low_key,
 				&high_key, fn, priv);
 	return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
@@ -5020,7 +5231,7 @@ xfs_btree_diff_two_ptrs(
 	const union xfs_btree_ptr	*a,
 	const union xfs_btree_ptr	*b)
 {
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
 		return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l);
 	return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
 }
@@ -5074,7 +5285,7 @@ xfs_btree_has_records_helper(
 		key_contig = cur->bc_ops->keys_contiguous(cur, &info->high_key,
 					&rec_key, info->key_mask);
 		if (key_contig == XBTREE_KEY_OVERLAP &&
-				!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+				!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
 			return -EFSCORRUPTED;
 		if (key_contig == XBTREE_KEY_GAP)
 			return -ECANCELED;
@@ -5168,7 +5379,7 @@ xfs_btree_has_more_records(
 		return true;
 
 	/* There are more record blocks. */
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
 		return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK);
 	else
 		return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
@@ -5233,6 +5444,7 @@ xfs_btree_goto_left_edge(
 		return error;
 	if (stat != 0) {
 		ASSERT(0);
+		xfs_btree_mark_sick(cur);
 		return -EFSCORRUPTED;
 	}
 
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index d906324e25c8..f93374278aa1 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -55,15 +55,8 @@ union xfs_btree_rec {
 #define	XFS_LOOKUP_LE	((xfs_lookup_t)XFS_LOOKUP_LEi)
 #define	XFS_LOOKUP_GE	((xfs_lookup_t)XFS_LOOKUP_GEi)
 
-#define	XFS_BTNUM_BNO	((xfs_btnum_t)XFS_BTNUM_BNOi)
-#define	XFS_BTNUM_CNT	((xfs_btnum_t)XFS_BTNUM_CNTi)
-#define	XFS_BTNUM_BMAP	((xfs_btnum_t)XFS_BTNUM_BMAPi)
-#define	XFS_BTNUM_INO	((xfs_btnum_t)XFS_BTNUM_INOi)
-#define	XFS_BTNUM_FINO	((xfs_btnum_t)XFS_BTNUM_FINOi)
-#define	XFS_BTNUM_RMAP	((xfs_btnum_t)XFS_BTNUM_RMAPi)
-#define	XFS_BTNUM_REFC	((xfs_btnum_t)XFS_BTNUM_REFCi)
-
-uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum);
+struct xfs_btree_ops;
+uint32_t xfs_btree_magic(struct xfs_mount *mp, const struct xfs_btree_ops *ops);
 
 /*
  * For logging record fields.
@@ -86,9 +79,11 @@ uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum);
  * Generic stats interface
  */
 #define XFS_BTREE_STATS_INC(cur, stat)	\
-	XFS_STATS_INC_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat)
+	XFS_STATS_INC_OFF((cur)->bc_mp, \
+		(cur)->bc_ops->statoff + __XBTS_ ## stat)
 #define XFS_BTREE_STATS_ADD(cur, stat, val)	\
-	XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val)
+	XFS_STATS_ADD_OFF((cur)->bc_mp, \
+		(cur)->bc_ops->statoff + __XBTS_ ## stat, val)
 
 enum xbtree_key_contig {
 	XBTREE_KEY_GAP = 0,
@@ -111,10 +106,37 @@ static inline enum xbtree_key_contig xbtree_key_contig(uint64_t x, uint64_t y)
 	return XBTREE_KEY_OVERLAP;
 }
 
+#define XFS_BTREE_LONG_PTR_LEN		(sizeof(__be64))
+#define XFS_BTREE_SHORT_PTR_LEN		(sizeof(__be32))
+
+enum xfs_btree_type {
+	XFS_BTREE_TYPE_AG,
+	XFS_BTREE_TYPE_INODE,
+	XFS_BTREE_TYPE_MEM,
+};
+
 struct xfs_btree_ops {
-	/* size of the key and record structures */
-	size_t	key_len;
-	size_t	rec_len;
+	const char		*name;
+
+	/* Type of btree - AG-rooted or inode-rooted */
+	enum xfs_btree_type	type;
+
+	/* XFS_BTGEO_* flags that determine the geometry of the btree */
+	unsigned int		geom_flags;
+
+	/* size of the key, pointer, and record structures */
+	size_t			key_len;
+	size_t			ptr_len;
+	size_t			rec_len;
+
+	/* LRU refcount to set on each btree buffer created */
+	unsigned int		lru_refs;
+
+	/* offset of btree stats array */
+	unsigned int		statoff;
+
+	/* sick mask for health reporting (only for XFS_BTREE_TYPE_AG) */
+	unsigned int		sick_mask;
 
 	/* cursor operations */
 	struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *);
@@ -199,6 +221,10 @@ struct xfs_btree_ops {
 			       const union xfs_btree_key *mask);
 };
 
+/* btree geometry flags */
+#define XFS_BTGEO_LASTREC_UPDATE	(1U << 0) /* track last rec externally */
+#define XFS_BTGEO_OVERLAPPING		(1U << 1) /* overlapping intervals */
+
 /*
  * Reasons for the update_lastrec method to be called.
  */
@@ -215,39 +241,6 @@ union xfs_btree_irec {
 	struct xfs_refcount_irec	rc;
 };
 
-/* Per-AG btree information. */
-struct xfs_btree_cur_ag {
-	struct xfs_perag		*pag;
-	union {
-		struct xfs_buf		*agbp;
-		struct xbtree_afakeroot	*afake;	/* for staging cursor */
-	};
-	union {
-		struct {
-			unsigned int	nr_ops;	/* # record updates */
-			unsigned int	shape_changes;	/* # of extent splits */
-		} refc;
-		struct {
-			bool		active;	/* allocation cursor state */
-		} abt;
-	};
-};
-
-/* Btree-in-inode cursor information */
-struct xfs_btree_cur_ino {
-	struct xfs_inode		*ip;
-	struct xbtree_ifakeroot		*ifake;	/* for staging cursor */
-	int				allocated;
-	short				forksize;
-	char				whichfork;
-	char				flags;
-/* We are converting a delalloc reservation */
-#define	XFS_BTCUR_BMBT_WASDEL		(1 << 0)
-
-/* For extent swap, ignore owner check in verifier */
-#define	XFS_BTCUR_BMBT_INVALID_OWNER	(1 << 1)
-};
-
 struct xfs_btree_level {
 	/* buffer pointer */
 	struct xfs_buf		*bp;
@@ -272,21 +265,38 @@ struct xfs_btree_cur
 	const struct xfs_btree_ops *bc_ops;
 	struct kmem_cache	*bc_cache; /* cursor cache */
 	unsigned int		bc_flags; /* btree features - below */
-	xfs_btnum_t		bc_btnum; /* identifies which btree type */
 	union xfs_btree_irec	bc_rec;	/* current insert/search record value */
 	uint8_t			bc_nlevels; /* number of levels in the tree */
 	uint8_t			bc_maxlevels; /* maximum levels for this btree type */
-	int			bc_statoff; /* offset of btree stats array */
 
-	/*
-	 * Short btree pointers need an agno to be able to turn the pointers
-	 * into physical addresses for IO, so the btree cursor switches between
-	 * bc_ino and bc_ag based on whether XFS_BTREE_LONG_PTRS is set for the
-	 * cursor.
-	 */
+	/* per-type information */
 	union {
-		struct xfs_btree_cur_ag	bc_ag;
-		struct xfs_btree_cur_ino bc_ino;
+		struct {
+			struct xfs_inode	*ip;
+			short			forksize;
+			char			whichfork;
+			struct xbtree_ifakeroot	*ifake;	/* for staging cursor */
+		} bc_ino;
+		struct {
+			struct xfs_perag	*pag;
+			struct xfs_buf		*agbp;
+			struct xbtree_afakeroot	*afake;	/* for staging cursor */
+		} bc_ag;
+		struct {
+			struct xfbtree		*xfbtree;
+			struct xfs_perag	*pag;
+		} bc_mem;
+	};
+
+	/* per-format private data */
+	union {
+		struct {
+			int		allocated;
+		} bc_bmap;	/* bmapbt */
+		struct {
+			unsigned int	nr_ops;		/* # record updates */
+			unsigned int	shape_changes;	/* # of extent splits */
+		} bc_refc;	/* refcountbt */
 	};
 
 	/* Must be at the end of the struct! */
@@ -304,18 +314,22 @@ xfs_btree_cur_sizeof(unsigned int nlevels)
 	return struct_size_t(struct xfs_btree_cur, bc_levels, nlevels);
 }
 
-/* cursor flags */
-#define XFS_BTREE_LONG_PTRS		(1<<0)	/* pointers are 64bits long */
-#define XFS_BTREE_ROOT_IN_INODE		(1<<1)	/* root may be variable size */
-#define XFS_BTREE_LASTREC_UPDATE	(1<<2)	/* track last rec externally */
-#define XFS_BTREE_CRC_BLOCKS		(1<<3)	/* uses extended btree blocks */
-#define XFS_BTREE_OVERLAPPING		(1<<4)	/* overlapping intervals */
+/* cursor state flags */
 /*
  * The root of this btree is a fakeroot structure so that we can stage a btree
  * rebuild without leaving it accessible via primary metadata.  The ops struct
  * is dynamically allocated and must be freed when the cursor is deleted.
  */
-#define XFS_BTREE_STAGING		(1<<5)
+#define XFS_BTREE_STAGING		(1U << 0)
+
+/* We are converting a delalloc reservation (only for bmbt btrees) */
+#define	XFS_BTREE_BMBT_WASDEL		(1U << 1)
+
+/* For extent swap, ignore owner check in verifier (only for bmbt btrees) */
+#define	XFS_BTREE_BMBT_INVALID_OWNER	(1U << 2)
+
+/* Cursor is active (only for allocbt btrees) */
+#define	XFS_BTREE_ALLOCBT_ACTIVE	(1U << 3)
 
 #define	XFS_BTREE_NOERROR	0
 #define	XFS_BTREE_ERROR		1
@@ -325,14 +339,10 @@ xfs_btree_cur_sizeof(unsigned int nlevels)
  */
 #define	XFS_BUF_TO_BLOCK(bp)	((struct xfs_btree_block *)((bp)->b_addr))
 
-/*
- * Internal long and short btree block checks.  They return NULL if the
- * block is ok or the address of the failed check otherwise.
- */
-xfs_failaddr_t __xfs_btree_check_lblock(struct xfs_btree_cur *cur,
-		struct xfs_btree_block *block, int level, struct xfs_buf *bp);
-xfs_failaddr_t __xfs_btree_check_sblock(struct xfs_btree_cur *cur,
+xfs_failaddr_t __xfs_btree_check_block(struct xfs_btree_cur *cur,
 		struct xfs_btree_block *block, int level, struct xfs_buf *bp);
+int __xfs_btree_check_ptr(struct xfs_btree_cur *cur,
+		const union xfs_btree_ptr *ptr, int index, int level);
 
 /*
  * Check that block header is ok.
@@ -345,24 +355,6 @@ xfs_btree_check_block(
 	struct xfs_buf		*bp);	/* buffer containing block, if any */
 
 /*
- * Check that (long) pointer is ok.
- */
-bool					/* error (0 or EFSCORRUPTED) */
-xfs_btree_check_lptr(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_fsblock_t		fsbno,	/* btree block disk address */
-	int			level);	/* btree block level */
-
-/*
- * Check that (short) pointer is ok.
- */
-bool					/* error (0 or EFSCORRUPTED) */
-xfs_btree_check_sptr(
-	struct xfs_btree_cur	*cur,	/* btree cursor */
-	xfs_agblock_t		agbno,	/* btree block disk address */
-	int			level);	/* btree block level */
-
-/*
  * Delete the btree cursor.
  */
 void
@@ -392,63 +384,14 @@ xfs_btree_offsets(
 	int			*last);	/* output: last byte offset */
 
 /*
- * Get a buffer for the block, return it read in.
- * Long-form addressing.
- */
-int					/* error */
-xfs_btree_read_bufl(
-	struct xfs_mount	*mp,	/* file system mount point */
-	struct xfs_trans	*tp,	/* transaction pointer */
-	xfs_fsblock_t		fsbno,	/* file system block number */
-	struct xfs_buf		**bpp,	/* buffer for fsbno */
-	int			refval,	/* ref count value for buffer */
-	const struct xfs_buf_ops *ops);
-
-/*
- * Read-ahead the block, don't wait for it, don't return a buffer.
- * Long-form addressing.
- */
-void					/* error */
-xfs_btree_reada_bufl(
-	struct xfs_mount	*mp,	/* file system mount point */
-	xfs_fsblock_t		fsbno,	/* file system block number */
-	xfs_extlen_t		count,	/* count of filesystem blocks */
-	const struct xfs_buf_ops *ops);
-
-/*
- * Read-ahead the block, don't wait for it, don't return a buffer.
- * Short-form addressing.
- */
-void					/* error */
-xfs_btree_reada_bufs(
-	struct xfs_mount	*mp,	/* file system mount point */
-	xfs_agnumber_t		agno,	/* allocation group number */
-	xfs_agblock_t		agbno,	/* allocation group block number */
-	xfs_extlen_t		count,	/* count of filesystem blocks */
-	const struct xfs_buf_ops *ops);
-
-/*
  * Initialise a new btree block header
  */
-void
-xfs_btree_init_block(
-	struct xfs_mount *mp,
-	struct xfs_buf	*bp,
-	xfs_btnum_t	btnum,
-	__u16		level,
-	__u16		numrecs,
-	__u64		owner);
-
-void
-xfs_btree_init_block_int(
-	struct xfs_mount	*mp,
-	struct xfs_btree_block	*buf,
-	xfs_daddr_t		blkno,
-	xfs_btnum_t		btnum,
-	__u16			level,
-	__u16			numrecs,
-	__u64			owner,
-	unsigned int		flags);
+void xfs_btree_init_buf(struct xfs_mount *mp, struct xfs_buf *bp,
+		const struct xfs_btree_ops *ops, __u16 level, __u16 numrecs,
+		__u64 owner);
+void xfs_btree_init_block(struct xfs_mount *mp,
+		struct xfs_btree_block *buf, const struct xfs_btree_ops *ops,
+		__u16 level, __u16 numrecs, __u64 owner);
 
 /*
  * Common btree core entry points.
@@ -467,10 +410,10 @@ int xfs_btree_change_owner(struct xfs_btree_cur *cur, uint64_t new_owner,
 /*
  * btree block CRC helpers
  */
-void xfs_btree_lblock_calc_crc(struct xfs_buf *);
-bool xfs_btree_lblock_verify_crc(struct xfs_buf *);
-void xfs_btree_sblock_calc_crc(struct xfs_buf *);
-bool xfs_btree_sblock_verify_crc(struct xfs_buf *);
+void xfs_btree_fsblock_calc_crc(struct xfs_buf *);
+bool xfs_btree_fsblock_verify_crc(struct xfs_buf *);
+void xfs_btree_agblock_calc_crc(struct xfs_buf *);
+bool xfs_btree_agblock_verify_crc(struct xfs_buf *);
 
 /*
  * Internal btree helpers also used by xfs_bmap.c.
@@ -510,12 +453,14 @@ static inline int xfs_btree_get_level(const struct xfs_btree_block *block)
 #define	XFS_FILBLKS_MIN(a,b)	min_t(xfs_filblks_t, (a), (b))
 #define	XFS_FILBLKS_MAX(a,b)	max_t(xfs_filblks_t, (a), (b))
 
-xfs_failaddr_t xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp);
-xfs_failaddr_t xfs_btree_sblock_verify(struct xfs_buf *bp,
+xfs_failaddr_t xfs_btree_agblock_v5hdr_verify(struct xfs_buf *bp);
+xfs_failaddr_t xfs_btree_agblock_verify(struct xfs_buf *bp,
 		unsigned int max_recs);
-xfs_failaddr_t xfs_btree_lblock_v5hdr_verify(struct xfs_buf *bp,
+xfs_failaddr_t xfs_btree_fsblock_v5hdr_verify(struct xfs_buf *bp,
 		uint64_t owner);
-xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp,
+xfs_failaddr_t xfs_btree_fsblock_verify(struct xfs_buf *bp,
+		unsigned int max_recs);
+xfs_failaddr_t xfs_btree_memblock_verify(struct xfs_buf *bp,
 		unsigned int max_recs);
 
 unsigned int xfs_btree_compute_maxlevels(const unsigned int *limits,
@@ -690,7 +635,7 @@ xfs_btree_islastblock(
 
 	block = xfs_btree_get_block(cur, level, &bp);
 
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
 		return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
 	return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
 }
@@ -714,21 +659,28 @@ void xfs_btree_copy_ptrs(struct xfs_btree_cur *cur,
 void xfs_btree_copy_keys(struct xfs_btree_cur *cur,
 		union xfs_btree_key *dst_key,
 		const union xfs_btree_key *src_key, int numkeys);
+void xfs_btree_init_ptr_from_cur(struct xfs_btree_cur *cur,
+		union xfs_btree_ptr *ptr);
 
 static inline struct xfs_btree_cur *
 xfs_btree_alloc_cursor(
 	struct xfs_mount	*mp,
 	struct xfs_trans	*tp,
-	xfs_btnum_t		btnum,
+	const struct xfs_btree_ops *ops,
 	uint8_t			maxlevels,
 	struct kmem_cache	*cache)
 {
 	struct xfs_btree_cur	*cur;
 
-	cur = kmem_cache_zalloc(cache, GFP_NOFS | __GFP_NOFAIL);
+	ASSERT(ops->ptr_len == XFS_BTREE_LONG_PTR_LEN ||
+	       ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN);
+
+	/* BMBT allocations can come through from non-transactional context. */
+	cur = kmem_cache_zalloc(cache,
+			GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
+	cur->bc_ops = ops;
 	cur->bc_tp = tp;
 	cur->bc_mp = mp;
-	cur->bc_btnum = btnum;
 	cur->bc_maxlevels = maxlevels;
 	cur->bc_cache = cache;
 
@@ -740,4 +692,14 @@ void xfs_btree_destroy_cur_caches(void);
 
 int xfs_btree_goto_left_edge(struct xfs_btree_cur *cur);
 
+/* Does this level of the cursor point to the inode root (and not a block)? */
+static inline bool
+xfs_btree_at_iroot(
+	const struct xfs_btree_cur	*cur,
+	int				level)
+{
+	return cur->bc_ops->type == XFS_BTREE_TYPE_INODE &&
+	       level == cur->bc_nlevels - 1;
+}
+
 #endif	/* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree_mem.c b/fs/xfs/libxfs/xfs_btree_mem.c
new file mode 100644
index 000000000000..036061fe32cc
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_btree_mem.c
@@ -0,0 +1,347 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_trans.h"
+#include "xfs_btree.h"
+#include "xfs_error.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_ag.h"
+#include "xfs_buf_item.h"
+#include "xfs_trace.h"
+
+/* Set the root of an in-memory btree. */
+void
+xfbtree_set_root(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_ptr	*ptr,
+	int				inc)
+{
+	ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
+
+	cur->bc_mem.xfbtree->root = *ptr;
+	cur->bc_mem.xfbtree->nlevels += inc;
+}
+
+/* Initialize a pointer from the in-memory btree header. */
+void
+xfbtree_init_ptr_from_cur(
+	struct xfs_btree_cur		*cur,
+	union xfs_btree_ptr		*ptr)
+{
+	ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
+
+	*ptr = cur->bc_mem.xfbtree->root;
+}
+
+/* Duplicate an in-memory btree cursor. */
+struct xfs_btree_cur *
+xfbtree_dup_cursor(
+	struct xfs_btree_cur		*cur)
+{
+	struct xfs_btree_cur		*ncur;
+
+	ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
+
+	ncur = xfs_btree_alloc_cursor(cur->bc_mp, cur->bc_tp, cur->bc_ops,
+			cur->bc_maxlevels, cur->bc_cache);
+	ncur->bc_flags = cur->bc_flags;
+	ncur->bc_nlevels = cur->bc_nlevels;
+	ncur->bc_mem.xfbtree = cur->bc_mem.xfbtree;
+
+	if (cur->bc_mem.pag)
+		ncur->bc_mem.pag = xfs_perag_hold(cur->bc_mem.pag);
+
+	return ncur;
+}
+
+/* Close the btree xfile and release all resources. */
+void
+xfbtree_destroy(
+	struct xfbtree		*xfbt)
+{
+	xfs_buftarg_drain(xfbt->target);
+}
+
+/* Compute the number of bytes available for records. */
+static inline unsigned int
+xfbtree_rec_bytes(
+	struct xfs_mount		*mp,
+	const struct xfs_btree_ops	*ops)
+{
+	return XMBUF_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
+}
+
+/* Initialize an empty leaf block as the btree root. */
+STATIC int
+xfbtree_init_leaf_block(
+	struct xfs_mount		*mp,
+	struct xfbtree			*xfbt,
+	const struct xfs_btree_ops	*ops)
+{
+	struct xfs_buf			*bp;
+	xfbno_t				bno = xfbt->highest_bno++;
+	int				error;
+
+	error = xfs_buf_get(xfbt->target, xfbno_to_daddr(bno), XFBNO_BBSIZE,
+			&bp);
+	if (error)
+		return error;
+
+	trace_xfbtree_create_root_buf(xfbt, bp);
+
+	bp->b_ops = ops->buf_ops;
+	xfs_btree_init_buf(mp, bp, ops, 0, 0, xfbt->owner);
+	xfs_buf_relse(bp);
+
+	xfbt->root.l = cpu_to_be64(bno);
+	return 0;
+}
+
+/*
+ * Create an in-memory btree root that can be used with the given xmbuf.
+ * Callers must set xfbt->owner.
+ */
+int
+xfbtree_init(
+	struct xfs_mount		*mp,
+	struct xfbtree			*xfbt,
+	struct xfs_buftarg		*btp,
+	const struct xfs_btree_ops	*ops)
+{
+	unsigned int			blocklen = xfbtree_rec_bytes(mp, ops);
+	unsigned int			keyptr_len;
+	int				error;
+
+	/* Requires a long-format CRC-format btree */
+	if (!xfs_has_crc(mp)) {
+		ASSERT(xfs_has_crc(mp));
+		return -EINVAL;
+	}
+	if (ops->ptr_len != XFS_BTREE_LONG_PTR_LEN) {
+		ASSERT(ops->ptr_len == XFS_BTREE_LONG_PTR_LEN);
+		return -EINVAL;
+	}
+
+	memset(xfbt, 0, sizeof(*xfbt));
+	xfbt->target = btp;
+
+	/* Set up min/maxrecs for this btree. */
+	keyptr_len = ops->key_len + sizeof(__be64);
+	xfbt->maxrecs[0] = blocklen / ops->rec_len;
+	xfbt->maxrecs[1] = blocklen / keyptr_len;
+	xfbt->minrecs[0] = xfbt->maxrecs[0] / 2;
+	xfbt->minrecs[1] = xfbt->maxrecs[1] / 2;
+	xfbt->highest_bno = 0;
+	xfbt->nlevels = 1;
+
+	/* Initialize the empty btree. */
+	error = xfbtree_init_leaf_block(mp, xfbt, ops);
+	if (error)
+		goto err_freesp;
+
+	trace_xfbtree_init(mp, xfbt, ops);
+
+	return 0;
+
+err_freesp:
+	xfs_buftarg_drain(xfbt->target);
+	return error;
+}
+
+/* Allocate a block to our in-memory btree. */
+int
+xfbtree_alloc_block(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_ptr	*start,
+	union xfs_btree_ptr		*new,
+	int				*stat)
+{
+	struct xfbtree			*xfbt = cur->bc_mem.xfbtree;
+	xfbno_t				bno = xfbt->highest_bno++;
+
+	ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
+
+	trace_xfbtree_alloc_block(xfbt, cur, bno);
+
+	/* Fail if the block address exceeds the maximum for the buftarg. */
+	if (!xfbtree_verify_bno(xfbt, bno)) {
+		ASSERT(xfbtree_verify_bno(xfbt, bno));
+		*stat = 0;
+		return 0;
+	}
+
+	new->l = cpu_to_be64(bno);
+	*stat = 1;
+	return 0;
+}
+
+/* Free a block from our in-memory btree. */
+int
+xfbtree_free_block(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp)
+{
+	struct xfbtree		*xfbt = cur->bc_mem.xfbtree;
+	xfs_daddr_t		daddr = xfs_buf_daddr(bp);
+	xfbno_t			bno = xfs_daddr_to_xfbno(daddr);
+
+	ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_MEM);
+
+	trace_xfbtree_free_block(xfbt, cur, bno);
+
+	if (bno + 1 == xfbt->highest_bno)
+		xfbt->highest_bno--;
+
+	return 0;
+}
+
+/* Return the minimum number of records for a btree block. */
+int
+xfbtree_get_minrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	struct xfbtree		*xfbt = cur->bc_mem.xfbtree;
+
+	return xfbt->minrecs[level != 0];
+}
+
+/* Return the maximum number of records for a btree block. */
+int
+xfbtree_get_maxrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	struct xfbtree		*xfbt = cur->bc_mem.xfbtree;
+
+	return xfbt->maxrecs[level != 0];
+}
+
+/* If this log item is a buffer item that came from the xfbtree, return it. */
+static inline struct xfs_buf *
+xfbtree_buf_match(
+	struct xfbtree			*xfbt,
+	const struct xfs_log_item	*lip)
+{
+	const struct xfs_buf_log_item	*bli;
+	struct xfs_buf			*bp;
+
+	if (lip->li_type != XFS_LI_BUF)
+		return NULL;
+
+	bli = container_of(lip, struct xfs_buf_log_item, bli_item);
+	bp = bli->bli_buf;
+	if (bp->b_target != xfbt->target)
+		return NULL;
+
+	return bp;
+}
+
+/*
+ * Commit changes to the incore btree immediately by writing all dirty xfbtree
+ * buffers to the backing xfile.  This detaches all xfbtree buffers from the
+ * transaction, even on failure.  The buffer locks are dropped between the
+ * delwri queue and submit, so the caller must synchronize btree access.
+ *
+ * Normally we'd let the buffers commit with the transaction and get written to
+ * the xfile via the log, but online repair stages ephemeral btrees in memory
+ * and uses the btree_staging functions to write new btrees to disk atomically.
+ * The in-memory btree (and its backing store) are discarded at the end of the
+ * repair phase, which means that xfbtree buffers cannot commit with the rest
+ * of a transaction.
+ *
+ * In other words, online repair only needs the transaction to collect buffer
+ * pointers and to avoid buffer deadlocks, not to guarantee consistency of
+ * updates.
+ */
+int
+xfbtree_trans_commit(
+	struct xfbtree		*xfbt,
+	struct xfs_trans	*tp)
+{
+	struct xfs_log_item	*lip, *n;
+	bool			tp_dirty = false;
+	int			error = 0;
+
+	/*
+	 * For each xfbtree buffer attached to the transaction, write the dirty
+	 * buffers to the xfile and release them.
+	 */
+	list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) {
+		struct xfs_buf	*bp = xfbtree_buf_match(xfbt, lip);
+
+		if (!bp) {
+			if (test_bit(XFS_LI_DIRTY, &lip->li_flags))
+				tp_dirty |= true;
+			continue;
+		}
+
+		trace_xfbtree_trans_commit_buf(xfbt, bp);
+
+		xmbuf_trans_bdetach(tp, bp);
+
+		/*
+		 * If the buffer fails verification, note the failure but
+		 * continue walking the transaction items so that we remove all
+		 * ephemeral btree buffers.
+		 */
+		if (!error)
+			error = xmbuf_finalize(bp);
+
+		xfs_buf_relse(bp);
+	}
+
+	/*
+	 * Reset the transaction's dirty flag to reflect the dirty state of the
+	 * log items that are still attached.
+	 */
+	tp->t_flags = (tp->t_flags & ~XFS_TRANS_DIRTY) |
+			(tp_dirty ? XFS_TRANS_DIRTY : 0);
+
+	return error;
+}
+
+/*
+ * Cancel changes to the incore btree by detaching all the xfbtree buffers.
+ * Changes are not undone, so callers must not access the btree ever again.
+ */
+void
+xfbtree_trans_cancel(
+	struct xfbtree		*xfbt,
+	struct xfs_trans	*tp)
+{
+	struct xfs_log_item	*lip, *n;
+	bool			tp_dirty = false;
+
+	list_for_each_entry_safe(lip, n, &tp->t_items, li_trans) {
+		struct xfs_buf	*bp = xfbtree_buf_match(xfbt, lip);
+
+		if (!bp) {
+			if (test_bit(XFS_LI_DIRTY, &lip->li_flags))
+				tp_dirty |= true;
+			continue;
+		}
+
+		trace_xfbtree_trans_cancel_buf(xfbt, bp);
+
+		xmbuf_trans_bdetach(tp, bp);
+		xfs_buf_relse(bp);
+	}
+
+	/*
+	 * Reset the transaction's dirty flag to reflect the dirty state of the
+	 * log items that are still attached.
+	 */
+	tp->t_flags = (tp->t_flags & ~XFS_TRANS_DIRTY) |
+			(tp_dirty ? XFS_TRANS_DIRTY : 0);
+}
diff --git a/fs/xfs/libxfs/xfs_btree_mem.h b/fs/xfs/libxfs/xfs_btree_mem.h
new file mode 100644
index 000000000000..1c3825786ec8
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_btree_mem.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_BTREE_MEM_H__
+#define __XFS_BTREE_MEM_H__
+
+typedef uint64_t xfbno_t;
+
+#define XFBNO_BLOCKSIZE			(XMBUF_BLOCKSIZE)
+#define XFBNO_BBSHIFT			(XMBUF_BLOCKSHIFT - BBSHIFT)
+#define XFBNO_BBSIZE			(XFBNO_BLOCKSIZE >> BBSHIFT)
+
+static inline xfs_daddr_t xfbno_to_daddr(xfbno_t blkno)
+{
+	return blkno << XFBNO_BBSHIFT;
+}
+
+static inline xfbno_t xfs_daddr_to_xfbno(xfs_daddr_t daddr)
+{
+	return daddr >> XFBNO_BBSHIFT;
+}
+
+struct xfbtree {
+	/* buffer cache target for this in-memory btree */
+	struct xfs_buftarg		*target;
+
+	/* Highest block number that has been written to. */
+	xfbno_t				highest_bno;
+
+	/* Owner of this btree. */
+	unsigned long long		owner;
+
+	/* Btree header */
+	union xfs_btree_ptr		root;
+	unsigned int			nlevels;
+
+	/* Minimum and maximum records per block. */
+	unsigned int			maxrecs[2];
+	unsigned int			minrecs[2];
+};
+
+#ifdef CONFIG_XFS_BTREE_IN_MEM
+static inline bool xfbtree_verify_bno(struct xfbtree *xfbt, xfbno_t bno)
+{
+	return xmbuf_verify_daddr(xfbt->target, xfbno_to_daddr(bno));
+}
+
+void xfbtree_set_root(struct xfs_btree_cur *cur,
+		const union xfs_btree_ptr *ptr, int inc);
+void xfbtree_init_ptr_from_cur(struct xfs_btree_cur *cur,
+		union xfs_btree_ptr *ptr);
+struct xfs_btree_cur *xfbtree_dup_cursor(struct xfs_btree_cur *cur);
+
+int xfbtree_get_minrecs(struct xfs_btree_cur *cur, int level);
+int xfbtree_get_maxrecs(struct xfs_btree_cur *cur, int level);
+
+int xfbtree_alloc_block(struct xfs_btree_cur *cur,
+		const union xfs_btree_ptr *start, union xfs_btree_ptr *ptr,
+		int *stat);
+int xfbtree_free_block(struct xfs_btree_cur *cur, struct xfs_buf *bp);
+
+/* Callers must set xfbt->target and xfbt->owner before calling this */
+int xfbtree_init(struct xfs_mount *mp, struct xfbtree *xfbt,
+		struct xfs_buftarg *btp, const struct xfs_btree_ops *ops);
+void xfbtree_destroy(struct xfbtree *xfbt);
+
+int xfbtree_trans_commit(struct xfbtree *xfbt, struct xfs_trans *tp);
+void xfbtree_trans_cancel(struct xfbtree *xfbt, struct xfs_trans *tp);
+#else
+# define xfbtree_verify_bno(...)	(false)
+#endif /* CONFIG_XFS_BTREE_IN_MEM */
+
+#endif /* __XFS_BTREE_MEM_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c
index e276eba87cb1..694929703152 100644
--- a/fs/xfs/libxfs/xfs_btree_staging.c
+++ b/fs/xfs/libxfs/xfs_btree_staging.c
@@ -39,63 +39,6 @@
  */
 
 /*
- * Don't allow staging cursors to be duplicated because they're supposed to be
- * kept private to a single thread.
- */
-STATIC struct xfs_btree_cur *
-xfs_btree_fakeroot_dup_cursor(
-	struct xfs_btree_cur	*cur)
-{
-	ASSERT(0);
-	return NULL;
-}
-
-/*
- * Don't allow block allocation for a staging cursor, because staging cursors
- * do not support regular btree modifications.
- *
- * Bulk loading uses a separate callback to obtain new blocks from a
- * preallocated list, which prevents ENOSPC failures during loading.
- */
-STATIC int
-xfs_btree_fakeroot_alloc_block(
-	struct xfs_btree_cur		*cur,
-	const union xfs_btree_ptr	*start_bno,
-	union xfs_btree_ptr		*new_bno,
-	int				*stat)
-{
-	ASSERT(0);
-	return -EFSCORRUPTED;
-}
-
-/*
- * Don't allow block freeing for a staging cursor, because staging cursors
- * do not support regular btree modifications.
- */
-STATIC int
-xfs_btree_fakeroot_free_block(
-	struct xfs_btree_cur	*cur,
-	struct xfs_buf		*bp)
-{
-	ASSERT(0);
-	return -EFSCORRUPTED;
-}
-
-/* Initialize a pointer to the root block from the fakeroot. */
-STATIC void
-xfs_btree_fakeroot_init_ptr_from_cur(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_ptr	*ptr)
-{
-	struct xbtree_afakeroot	*afake;
-
-	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
-
-	afake = cur->bc_ag.afake;
-	ptr->s = cpu_to_be32(afake->af_root);
-}
-
-/*
  * Bulk Loading for AG Btrees
  * ==========================
  *
@@ -109,47 +52,20 @@ xfs_btree_fakeroot_init_ptr_from_cur(
  * cursor into a regular btree cursor.
  */
 
-/* Update the btree root information for a per-AG fake root. */
-STATIC void
-xfs_btree_afakeroot_set_root(
-	struct xfs_btree_cur		*cur,
-	const union xfs_btree_ptr	*ptr,
-	int				inc)
-{
-	struct xbtree_afakeroot	*afake = cur->bc_ag.afake;
-
-	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
-	afake->af_root = be32_to_cpu(ptr->s);
-	afake->af_levels += inc;
-}
-
 /*
  * Initialize a AG-rooted btree cursor with the given AG btree fake root.
- * The btree cursor's bc_ops will be overridden as needed to make the staging
- * functionality work.
  */
 void
 xfs_btree_stage_afakeroot(
 	struct xfs_btree_cur		*cur,
 	struct xbtree_afakeroot		*afake)
 {
-	struct xfs_btree_ops		*nops;
-
 	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
-	ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE));
+	ASSERT(cur->bc_ops->type != XFS_BTREE_TYPE_INODE);
 	ASSERT(cur->bc_tp == NULL);
 
-	nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
-	memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
-	nops->alloc_block = xfs_btree_fakeroot_alloc_block;
-	nops->free_block = xfs_btree_fakeroot_free_block;
-	nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
-	nops->set_root = xfs_btree_afakeroot_set_root;
-	nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
-
 	cur->bc_ag.afake = afake;
 	cur->bc_nlevels = afake->af_levels;
-	cur->bc_ops = nops;
 	cur->bc_flags |= XFS_BTREE_STAGING;
 }
 
@@ -163,17 +79,15 @@ void
 xfs_btree_commit_afakeroot(
 	struct xfs_btree_cur		*cur,
 	struct xfs_trans		*tp,
-	struct xfs_buf			*agbp,
-	const struct xfs_btree_ops	*ops)
+	struct xfs_buf			*agbp)
 {
 	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
 	ASSERT(cur->bc_tp == NULL);
 
 	trace_xfs_btree_commit_afakeroot(cur);
 
-	kmem_free((void *)cur->bc_ops);
+	cur->bc_ag.afake = NULL;
 	cur->bc_ag.agbp = agbp;
-	cur->bc_ops = ops;
 	cur->bc_flags &= ~XFS_BTREE_STAGING;
 	cur->bc_tp = tp;
 }
@@ -211,29 +125,16 @@ xfs_btree_commit_afakeroot(
 void
 xfs_btree_stage_ifakeroot(
 	struct xfs_btree_cur		*cur,
-	struct xbtree_ifakeroot		*ifake,
-	struct xfs_btree_ops		**new_ops)
+	struct xbtree_ifakeroot		*ifake)
 {
-	struct xfs_btree_ops		*nops;
-
 	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
-	ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+	ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
 	ASSERT(cur->bc_tp == NULL);
 
-	nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
-	memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
-	nops->alloc_block = xfs_btree_fakeroot_alloc_block;
-	nops->free_block = xfs_btree_fakeroot_free_block;
-	nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
-	nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
-
 	cur->bc_ino.ifake = ifake;
 	cur->bc_nlevels = ifake->if_levels;
-	cur->bc_ops = nops;
+	cur->bc_ino.forksize = ifake->if_fork_size;
 	cur->bc_flags |= XFS_BTREE_STAGING;
-
-	if (new_ops)
-		*new_ops = nops;
 }
 
 /*
@@ -246,18 +147,15 @@ void
 xfs_btree_commit_ifakeroot(
 	struct xfs_btree_cur		*cur,
 	struct xfs_trans		*tp,
-	int				whichfork,
-	const struct xfs_btree_ops	*ops)
+	int				whichfork)
 {
 	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
 	ASSERT(cur->bc_tp == NULL);
 
 	trace_xfs_btree_commit_ifakeroot(cur);
 
-	kmem_free((void *)cur->bc_ops);
 	cur->bc_ino.ifake = NULL;
 	cur->bc_ino.whichfork = whichfork;
-	cur->bc_ops = ops;
 	cur->bc_flags &= ~XFS_BTREE_STAGING;
 	cur->bc_tp = tp;
 }
@@ -397,8 +295,7 @@ xfs_btree_bload_prep_block(
 	struct xfs_btree_block		*new_block;
 	int				ret;
 
-	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
-	    level == cur->bc_nlevels - 1) {
+	if (xfs_btree_at_iroot(cur, level)) {
 		struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);
 		size_t			new_size;
 
@@ -406,14 +303,12 @@ xfs_btree_bload_prep_block(
 
 		/* Allocate a new incore btree root block. */
 		new_size = bbl->iroot_size(cur, level, nr_this_block, priv);
-		ifp->if_broot = kmem_zalloc(new_size, 0);
+		ifp->if_broot = kzalloc(new_size, GFP_KERNEL | __GFP_NOFAIL);
 		ifp->if_broot_bytes = (int)new_size;
 
 		/* Initialize it and send it out. */
-		xfs_btree_init_block_int(cur->bc_mp, ifp->if_broot,
-				XFS_BUF_DADDR_NULL, cur->bc_btnum, level,
-				nr_this_block, cur->bc_ino.ip->i_ino,
-				cur->bc_flags);
+		xfs_btree_init_block(cur->bc_mp, ifp->if_broot, cur->bc_ops,
+				level, nr_this_block, cur->bc_ino.ip->i_ino);
 
 		*bpp = NULL;
 		*blockp = ifp->if_broot;
@@ -704,7 +599,7 @@ xfs_btree_bload_compute_geometry(
 		xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
 				&avg_per_block, &level_blocks, &dontcare64);
 
-		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+		if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
 			/*
 			 * If all the items we want to store at this level
 			 * would fit in the inode root block, then we have our
@@ -763,7 +658,7 @@ xfs_btree_bload_compute_geometry(
 		return -EOVERFLOW;
 
 	bbl->btree_height = cur->bc_nlevels;
-	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+	if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
 		bbl->nr_blocks = nr_blocks - 1;
 	else
 		bbl->nr_blocks = nr_blocks;
@@ -890,7 +785,7 @@ xfs_btree_bload(
 	}
 
 	/* Initialize the new root. */
-	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+	if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
 		ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
 		cur->bc_ino.ifake->if_levels = cur->bc_nlevels;
 		cur->bc_ino.ifake->if_blocks = total_blocks - 1;
diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h
index 055ea43b1e18..0c9c2ffb127a 100644
--- a/fs/xfs/libxfs/xfs_btree_staging.h
+++ b/fs/xfs/libxfs/xfs_btree_staging.h
@@ -22,7 +22,7 @@ struct xbtree_afakeroot {
 void xfs_btree_stage_afakeroot(struct xfs_btree_cur *cur,
 		struct xbtree_afakeroot *afake);
 void xfs_btree_commit_afakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp,
-		struct xfs_buf *agbp, const struct xfs_btree_ops *ops);
+		struct xfs_buf *agbp);
 
 /* Fake root for an inode-rooted btree. */
 struct xbtree_ifakeroot {
@@ -41,10 +41,9 @@ struct xbtree_ifakeroot {
 
 /* Cursor interactions with fake roots for inode-rooted btrees. */
 void xfs_btree_stage_ifakeroot(struct xfs_btree_cur *cur,
-		struct xbtree_ifakeroot *ifake,
-		struct xfs_btree_ops **new_ops);
+		struct xbtree_ifakeroot *ifake);
 void xfs_btree_commit_ifakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp,
-		int whichfork, const struct xfs_btree_ops *ops);
+		int whichfork);
 
 /* Bulk loading of staged btrees. */
 typedef int (*xfs_btree_bload_get_records_fn)(struct xfs_btree_cur *cur,
@@ -76,8 +75,7 @@ struct xfs_btree_bload {
 
 	/*
 	 * This function should return the size of the in-core btree root
-	 * block.  It is only necessary for XFS_BTREE_ROOT_IN_INODE btree
-	 * types.
+	 * block.  It is only necessary for XFS_BTREE_TYPE_INODE btrees.
 	 */
 	xfs_btree_bload_iroot_size_fn	iroot_size;
 
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 5457188bb4de..718d071bb21a 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -23,6 +23,7 @@
 #include "xfs_buf_item.h"
 #include "xfs_log.h"
 #include "xfs_errortag.h"
+#include "xfs_health.h"
 
 /*
  * xfs_da_btree.c
@@ -85,7 +86,8 @@ xfs_da_state_alloc(
 {
 	struct xfs_da_state	*state;
 
-	state = kmem_cache_zalloc(xfs_da_state_cache, GFP_NOFS | __GFP_NOFAIL);
+	state = kmem_cache_zalloc(xfs_da_state_cache,
+			GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
 	state->args = args;
 	state->mp = args->dp->i_mount;
 	return state;
@@ -352,6 +354,8 @@ const struct xfs_buf_ops xfs_da3_node_buf_ops = {
 static int
 xfs_da3_node_set_type(
 	struct xfs_trans	*tp,
+	struct xfs_inode	*dp,
+	int			whichfork,
 	struct xfs_buf		*bp)
 {
 	struct xfs_da_blkinfo	*info = bp->b_addr;
@@ -373,6 +377,7 @@ xfs_da3_node_set_type(
 		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, tp->t_mountp,
 				info, sizeof(*info));
 		xfs_trans_brelse(tp, bp);
+		xfs_dirattr_mark_sick(dp, whichfork);
 		return -EFSCORRUPTED;
 	}
 }
@@ -391,7 +396,7 @@ xfs_da3_node_read(
 			&xfs_da3_node_buf_ops);
 	if (error || !*bpp || !tp)
 		return error;
-	return xfs_da3_node_set_type(tp, *bpp);
+	return xfs_da3_node_set_type(tp, dp, whichfork, *bpp);
 }
 
 int
@@ -408,6 +413,8 @@ xfs_da3_node_read_mapped(
 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, mappedbno,
 			XFS_FSB_TO_BB(mp, xfs_dabuf_nfsb(mp, whichfork)), 0,
 			bpp, &xfs_da3_node_buf_ops);
+	if (xfs_metadata_is_sick(error))
+		xfs_dirattr_mark_sick(dp, whichfork);
 	if (error || !*bpp)
 		return error;
 
@@ -418,7 +425,7 @@ xfs_da3_node_read_mapped(
 
 	if (!tp)
 		return 0;
-	return xfs_da3_node_set_type(tp, *bpp);
+	return xfs_da3_node_set_type(tp, dp, whichfork, *bpp);
 }
 
 /*
@@ -631,6 +638,7 @@ xfs_da3_split(
 	if (node->hdr.info.forw) {
 		if (be32_to_cpu(node->hdr.info.forw) != addblk->blkno) {
 			xfs_buf_mark_corrupt(oldblk->bp);
+			xfs_da_mark_sick(state->args);
 			error = -EFSCORRUPTED;
 			goto out;
 		}
@@ -644,6 +652,7 @@ xfs_da3_split(
 	if (node->hdr.info.back) {
 		if (be32_to_cpu(node->hdr.info.back) != addblk->blkno) {
 			xfs_buf_mark_corrupt(oldblk->bp);
+			xfs_da_mark_sick(state->args);
 			error = -EFSCORRUPTED;
 			goto out;
 		}
@@ -1635,6 +1644,7 @@ xfs_da3_node_lookup_int(
 
 		if (magic != XFS_DA_NODE_MAGIC && magic != XFS_DA3_NODE_MAGIC) {
 			xfs_buf_mark_corrupt(blk->bp);
+			xfs_da_mark_sick(args);
 			return -EFSCORRUPTED;
 		}
 
@@ -1650,6 +1660,7 @@ xfs_da3_node_lookup_int(
 		/* Tree taller than we can handle; bail out! */
 		if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) {
 			xfs_buf_mark_corrupt(blk->bp);
+			xfs_da_mark_sick(args);
 			return -EFSCORRUPTED;
 		}
 
@@ -1658,6 +1669,7 @@ xfs_da3_node_lookup_int(
 			expected_level = nodehdr.level - 1;
 		else if (expected_level != nodehdr.level) {
 			xfs_buf_mark_corrupt(blk->bp);
+			xfs_da_mark_sick(args);
 			return -EFSCORRUPTED;
 		} else
 			expected_level--;
@@ -1709,12 +1721,16 @@ xfs_da3_node_lookup_int(
 		}
 
 		/* We can't point back to the root. */
-		if (XFS_IS_CORRUPT(dp->i_mount, blkno == args->geo->leafblk))
+		if (XFS_IS_CORRUPT(dp->i_mount, blkno == args->geo->leafblk)) {
+			xfs_da_mark_sick(args);
 			return -EFSCORRUPTED;
+		}
 	}
 
-	if (XFS_IS_CORRUPT(dp->i_mount, expected_level != 0))
+	if (XFS_IS_CORRUPT(dp->i_mount, expected_level != 0)) {
+		xfs_da_mark_sick(args);
 		return -EFSCORRUPTED;
+	}
 
 	/*
 	 * A leaf block that ends in the hashval that we are interested in
@@ -1732,6 +1748,7 @@ xfs_da3_node_lookup_int(
 			args->blkno = blk->blkno;
 		} else {
 			ASSERT(0);
+			xfs_da_mark_sick(args);
 			return -EFSCORRUPTED;
 		}
 		if (((retval == -ENOENT) || (retval == -ENOATTR)) &&
@@ -2182,7 +2199,8 @@ xfs_da_grow_inode_int(
 		 * If we didn't get it and the block might work if fragmented,
 		 * try without the CONTIG flag.  Loop until we get it all.
 		 */
-		mapp = kmem_alloc(sizeof(*mapp) * count, 0);
+		mapp = kmalloc(sizeof(*mapp) * count,
+				GFP_KERNEL | __GFP_NOFAIL);
 		for (b = *bno, mapi = 0; b < *bno + count; ) {
 			c = (int)(*bno + count - b);
 			nmap = min(XFS_BMAP_MAX_NMAP, c);
@@ -2219,7 +2237,7 @@ xfs_da_grow_inode_int(
 
 out_free_map:
 	if (mapp != &map)
-		kmem_free(mapp);
+		kfree(mapp);
 	return error;
 }
 
@@ -2297,8 +2315,10 @@ xfs_da3_swap_lastblock(
 	error = xfs_bmap_last_before(tp, dp, &lastoff, w);
 	if (error)
 		return error;
-	if (XFS_IS_CORRUPT(mp, lastoff == 0))
+	if (XFS_IS_CORRUPT(mp, lastoff == 0)) {
+		xfs_da_mark_sick(args);
 		return -EFSCORRUPTED;
+	}
 	/*
 	 * Read the last block in the btree space.
 	 */
@@ -2348,6 +2368,7 @@ xfs_da3_swap_lastblock(
 		if (XFS_IS_CORRUPT(mp,
 				   be32_to_cpu(sib_info->forw) != last_blkno ||
 				   sib_info->magic != dead_info->magic)) {
+			xfs_da_mark_sick(args);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -2368,6 +2389,7 @@ xfs_da3_swap_lastblock(
 		if (XFS_IS_CORRUPT(mp,
 				   be32_to_cpu(sib_info->back) != last_blkno ||
 				   sib_info->magic != dead_info->magic)) {
+			xfs_da_mark_sick(args);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -2390,6 +2412,7 @@ xfs_da3_swap_lastblock(
 		xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node);
 		if (XFS_IS_CORRUPT(mp,
 				   level >= 0 && level != par_hdr.level + 1)) {
+			xfs_da_mark_sick(args);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -2401,6 +2424,7 @@ xfs_da3_swap_lastblock(
 		     entno++)
 			continue;
 		if (XFS_IS_CORRUPT(mp, entno == par_hdr.count)) {
+			xfs_da_mark_sick(args);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -2426,6 +2450,7 @@ xfs_da3_swap_lastblock(
 		xfs_trans_brelse(tp, par_buf);
 		par_buf = NULL;
 		if (XFS_IS_CORRUPT(mp, par_blkno == 0)) {
+			xfs_da_mark_sick(args);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -2435,6 +2460,7 @@ xfs_da3_swap_lastblock(
 		par_node = par_buf->b_addr;
 		xfs_da3_node_hdr_from_disk(dp->i_mount, &par_hdr, par_node);
 		if (XFS_IS_CORRUPT(mp, par_hdr.level != level)) {
+			xfs_da_mark_sick(args);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -2518,7 +2544,8 @@ xfs_dabuf_map(
 	int			error = 0, nirecs, i;
 
 	if (nfsb > 1)
-		irecs = kmem_zalloc(sizeof(irec) * nfsb, KM_NOFS);
+		irecs = kzalloc(sizeof(irec) * nfsb,
+				GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
 
 	nirecs = nfsb;
 	error = xfs_bmapi_read(dp, bno, nfsb, irecs, &nirecs,
@@ -2531,7 +2558,8 @@ xfs_dabuf_map(
 	 * larger one that needs to be free by the caller.
 	 */
 	if (nirecs > 1) {
-		map = kmem_zalloc(nirecs * sizeof(struct xfs_buf_map), KM_NOFS);
+		map = kzalloc(nirecs * sizeof(struct xfs_buf_map),
+				GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
 		if (!map) {
 			error = -ENOMEM;
 			goto out_free_irecs;
@@ -2557,12 +2585,13 @@ xfs_dabuf_map(
 	*nmaps = nirecs;
 out_free_irecs:
 	if (irecs != &irec)
-		kmem_free(irecs);
+		kfree(irecs);
 	return error;
 
 invalid_mapping:
 	/* Caller ok with no mapping. */
 	if (XFS_IS_CORRUPT(mp, !(flags & XFS_DABUF_MAP_HOLE_OK))) {
+		xfs_dirattr_mark_sick(dp, whichfork);
 		error = -EFSCORRUPTED;
 		if (xfs_error_level >= XFS_ERRLEVEL_LOW) {
 			xfs_alert(mp, "%s: bno %u inode %llu",
@@ -2613,7 +2642,7 @@ xfs_da_get_buf(
 
 out_free:
 	if (mapp != &map)
-		kmem_free(mapp);
+		kfree(mapp);
 
 	return error;
 }
@@ -2644,6 +2673,8 @@ xfs_da_read_buf(
 
 	error = xfs_trans_read_buf_map(mp, tp, mp->m_ddev_targp, mapp, nmap, 0,
 			&bp, ops);
+	if (xfs_metadata_is_sick(error))
+		xfs_dirattr_mark_sick(dp, whichfork);
 	if (error)
 		goto out_free;
 
@@ -2654,7 +2685,7 @@ xfs_da_read_buf(
 	*bpp = bp;
 out_free:
 	if (mapp != &map)
-		kmem_free(mapp);
+		kfree(mapp);
 
 	return error;
 }
@@ -2685,7 +2716,7 @@ xfs_da_reada_buf(
 
 out_free:
 	if (mapp != &map)
-		kmem_free(mapp);
+		kfree(mapp);
 
 	return error;
 }
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index 24f9d1461f9a..060e5c96b70f 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -159,6 +159,17 @@ struct xfs_da3_intnode {
 
 #define XFS_DIR3_FT_MAX			9
 
+#define XFS_DIR3_FTYPE_STR \
+	{ XFS_DIR3_FT_UNKNOWN,	"unknown" }, \
+	{ XFS_DIR3_FT_REG_FILE,	"file" }, \
+	{ XFS_DIR3_FT_DIR,	"directory" }, \
+	{ XFS_DIR3_FT_CHRDEV,	"char" }, \
+	{ XFS_DIR3_FT_BLKDEV,	"block" }, \
+	{ XFS_DIR3_FT_FIFO,	"fifo" }, \
+	{ XFS_DIR3_FT_SOCK,	"sock" }, \
+	{ XFS_DIR3_FT_SYMLINK,	"symlink" }, \
+	{ XFS_DIR3_FT_WHT,	"whiteout" }
+
 /*
  * Byte offset in data block and shortform entry.
  */
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 66a17910d021..c13276095cc0 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -819,16 +819,16 @@ xfs_defer_can_append(
 /* Create a new pending item at the end of the transaction list. */
 static inline struct xfs_defer_pending *
 xfs_defer_alloc(
-	struct xfs_trans		*tp,
+	struct list_head		*dfops,
 	const struct xfs_defer_op_type	*ops)
 {
 	struct xfs_defer_pending	*dfp;
 
 	dfp = kmem_cache_zalloc(xfs_defer_pending_cache,
-			GFP_NOFS | __GFP_NOFAIL);
+			GFP_KERNEL | __GFP_NOFAIL);
 	dfp->dfp_ops = ops;
 	INIT_LIST_HEAD(&dfp->dfp_work);
-	list_add_tail(&dfp->dfp_list, &tp->t_dfops);
+	list_add_tail(&dfp->dfp_list, dfops);
 
 	return dfp;
 }
@@ -846,7 +846,7 @@ xfs_defer_add(
 
 	dfp = xfs_defer_find_last(tp, ops);
 	if (!dfp || !xfs_defer_can_append(dfp, ops))
-		dfp = xfs_defer_alloc(tp, ops);
+		dfp = xfs_defer_alloc(&tp->t_dfops, ops);
 
 	xfs_defer_add_item(dfp, li);
 	trace_xfs_defer_add_item(tp->t_mountp, dfp, li);
@@ -870,7 +870,7 @@ xfs_defer_add_barrier(
 	if (dfp)
 		return;
 
-	xfs_defer_alloc(tp, &xfs_barrier_defer_type);
+	xfs_defer_alloc(&tp->t_dfops, &xfs_barrier_defer_type);
 
 	trace_xfs_defer_add_item(tp->t_mountp, dfp, NULL);
 }
@@ -885,14 +885,9 @@ xfs_defer_start_recovery(
 	struct list_head		*r_dfops,
 	const struct xfs_defer_op_type	*ops)
 {
-	struct xfs_defer_pending	*dfp;
+	struct xfs_defer_pending	*dfp = xfs_defer_alloc(r_dfops, ops);
 
-	dfp = kmem_cache_zalloc(xfs_defer_pending_cache,
-			GFP_NOFS | __GFP_NOFAIL);
-	dfp->dfp_ops = ops;
 	dfp->dfp_intent = lip;
-	INIT_LIST_HEAD(&dfp->dfp_work);
-	list_add_tail(&dfp->dfp_list, r_dfops);
 }
 
 /*
@@ -979,7 +974,7 @@ xfs_defer_ops_capture(
 		return ERR_PTR(error);
 
 	/* Create an object to capture the defer ops. */
-	dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS);
+	dfc = kzalloc(sizeof(*dfc), GFP_KERNEL | __GFP_NOFAIL);
 	INIT_LIST_HEAD(&dfc->dfc_list);
 	INIT_LIST_HEAD(&dfc->dfc_dfops);
 
@@ -1011,7 +1006,7 @@ xfs_defer_ops_capture(
 	 * transaction.
 	 */
 	for (i = 0; i < dfc->dfc_held.dr_inos; i++) {
-		ASSERT(xfs_isilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL));
+		xfs_assert_ilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL);
 		ihold(VFS_I(dfc->dfc_held.dr_ip[i]));
 	}
 
@@ -1038,7 +1033,7 @@ xfs_defer_ops_capture_abort(
 	for (i = 0; i < dfc->dfc_held.dr_inos; i++)
 		xfs_irele(dfc->dfc_held.dr_ip[i]);
 
-	kmem_free(dfc);
+	kfree(dfc);
 }
 
 /*
@@ -1114,7 +1109,7 @@ xfs_defer_ops_continue(
 	list_splice_init(&dfc->dfc_dfops, &tp->t_dfops);
 	tp->t_flags |= dfc->dfc_tpflags;
 
-	kmem_free(dfc);
+	kfree(dfc);
 }
 
 /* Release the resources captured and continued during recovery. */
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index a76673281514..4821519efad4 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -18,6 +18,7 @@
 #include "xfs_errortag.h"
 #include "xfs_error.h"
 #include "xfs_trace.h"
+#include "xfs_health.h"
 
 const struct xfs_name xfs_name_dotdot = {
 	.name	= (const unsigned char *)"..",
@@ -25,6 +26,12 @@ const struct xfs_name xfs_name_dotdot = {
 	.type	= XFS_DIR3_FT_DIR,
 };
 
+const struct xfs_name xfs_name_dot = {
+	.name	= (const unsigned char *)".",
+	.len	= 1,
+	.type	= XFS_DIR3_FT_DIR,
+};
+
 /*
  * Convert inode mode to directory entry filetype
  */
@@ -104,13 +111,13 @@ xfs_da_mount(
 	ASSERT(mp->m_sb.sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
 	ASSERT(xfs_dir2_dirblock_bytes(&mp->m_sb) <= XFS_MAX_BLOCKSIZE);
 
-	mp->m_dir_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
-				    KM_MAYFAIL);
-	mp->m_attr_geo = kmem_zalloc(sizeof(struct xfs_da_geometry),
-				     KM_MAYFAIL);
+	mp->m_dir_geo = kzalloc(sizeof(struct xfs_da_geometry),
+				GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+	mp->m_attr_geo = kzalloc(sizeof(struct xfs_da_geometry),
+				GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 	if (!mp->m_dir_geo || !mp->m_attr_geo) {
-		kmem_free(mp->m_dir_geo);
-		kmem_free(mp->m_attr_geo);
+		kfree(mp->m_dir_geo);
+		kfree(mp->m_attr_geo);
 		return -ENOMEM;
 	}
 
@@ -178,8 +185,8 @@ void
 xfs_da_unmount(
 	struct xfs_mount	*mp)
 {
-	kmem_free(mp->m_dir_geo);
-	kmem_free(mp->m_attr_geo);
+	kfree(mp->m_dir_geo);
+	kfree(mp->m_attr_geo);
 }
 
 /*
@@ -236,7 +243,7 @@ xfs_dir_init(
 	if (error)
 		return error;
 
-	args = kmem_zalloc(sizeof(*args), KM_NOFS);
+	args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL);
 	if (!args)
 		return -ENOMEM;
 
@@ -244,7 +251,7 @@ xfs_dir_init(
 	args->dp = dp;
 	args->trans = tp;
 	error = xfs_dir2_sf_create(args, pdp->i_ino);
-	kmem_free(args);
+	kfree(args);
 	return error;
 }
 
@@ -273,7 +280,7 @@ xfs_dir_createname(
 		XFS_STATS_INC(dp->i_mount, xs_dir_create);
 	}
 
-	args = kmem_zalloc(sizeof(*args), KM_NOFS);
+	args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL);
 	if (!args)
 		return -ENOMEM;
 
@@ -313,7 +320,7 @@ xfs_dir_createname(
 		rval = xfs_dir2_node_addname(args);
 
 out_free:
-	kmem_free(args);
+	kfree(args);
 	return rval;
 }
 
@@ -333,7 +340,8 @@ xfs_dir_cilookup_result(
 					!(args->op_flags & XFS_DA_OP_CILOOKUP))
 		return -EEXIST;
 
-	args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL);
+	args->value = kmalloc(len,
+			GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_RETRY_MAYFAIL);
 	if (!args->value)
 		return -ENOMEM;
 
@@ -364,15 +372,8 @@ xfs_dir_lookup(
 	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
 	XFS_STATS_INC(dp->i_mount, xs_dir_lookup);
 
-	/*
-	 * We need to use KM_NOFS here so that lockdep will not throw false
-	 * positive deadlock warnings on a non-transactional lookup path. It is
-	 * safe to recurse into inode recalim in that case, but lockdep can't
-	 * easily be taught about it. Hence KM_NOFS avoids having to add more
-	 * lockdep Doing this avoids having to add a bunch of lockdep class
-	 * annotations into the reclaim path for the ilock.
-	 */
-	args = kmem_zalloc(sizeof(*args), KM_NOFS);
+	args = kzalloc(sizeof(*args),
+			GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
 	args->geo = dp->i_mount->m_dir_geo;
 	args->name = name->name;
 	args->namelen = name->len;
@@ -419,7 +420,7 @@ out_check_rval:
 	}
 out_free:
 	xfs_iunlock(dp, lock_mode);
-	kmem_free(args);
+	kfree(args);
 	return rval;
 }
 
@@ -441,7 +442,7 @@ xfs_dir_removename(
 	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
 	XFS_STATS_INC(dp->i_mount, xs_dir_remove);
 
-	args = kmem_zalloc(sizeof(*args), KM_NOFS);
+	args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL);
 	if (!args)
 		return -ENOMEM;
 
@@ -477,7 +478,7 @@ xfs_dir_removename(
 	else
 		rval = xfs_dir2_node_removename(args);
 out_free:
-	kmem_free(args);
+	kfree(args);
 	return rval;
 }
 
@@ -502,7 +503,7 @@ xfs_dir_replace(
 	if (rval)
 		return rval;
 
-	args = kmem_zalloc(sizeof(*args), KM_NOFS);
+	args = kzalloc(sizeof(*args), GFP_KERNEL | __GFP_NOFAIL);
 	if (!args)
 		return -ENOMEM;
 
@@ -538,7 +539,7 @@ xfs_dir_replace(
 	else
 		rval = xfs_dir2_node_replace(args);
 out_free:
-	kmem_free(args);
+	kfree(args);
 	return rval;
 }
 
@@ -626,8 +627,10 @@ xfs_dir2_isblock(
 		return 0;
 
 	*isblock = true;
-	if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize))
+	if (XFS_IS_CORRUPT(mp, args->dp->i_disk_size != args->geo->blksize)) {
+		xfs_da_mark_sick(args);
 		return -EFSCORRUPTED;
+	}
 	return 0;
 }
 
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index 19af22a16c41..8497d041f316 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -22,6 +22,19 @@ struct xfs_dir3_icfree_hdr;
 struct xfs_dir3_icleaf_hdr;
 
 extern const struct xfs_name	xfs_name_dotdot;
+extern const struct xfs_name	xfs_name_dot;
+
+static inline bool
+xfs_dir2_samename(
+	const struct xfs_name	*n1,
+	const struct xfs_name	*n2)
+{
+	if (n1 == n2)
+		return true;
+	if (n1->len != n2->len)
+		return false;
+	return !memcmp(n1->name, n2->name, n1->len);
+}
 
 /*
  * Convert inode mode to directory entry filetype
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 3c256d4cc40b..a2da007adb46 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -20,6 +20,7 @@
 #include "xfs_error.h"
 #include "xfs_trace.h"
 #include "xfs_log.h"
+#include "xfs_health.h"
 
 /*
  * Local function prototypes.
@@ -152,6 +153,7 @@ xfs_dir3_block_read(
 		__xfs_buf_mark_corrupt(*bpp, fa);
 		xfs_trans_brelse(tp, *bpp);
 		*bpp = NULL;
+		xfs_dirattr_mark_sick(dp, XFS_DATA_FORK);
 		return -EFSCORRUPTED;
 	}
 
@@ -1108,7 +1110,7 @@ xfs_dir2_sf_to_block(
 	 * Copy the directory into a temporary buffer.
 	 * Then pitch the incore inode data so we can make extents.
 	 */
-	sfp = kmem_alloc(ifp->if_bytes, 0);
+	sfp = kmalloc(ifp->if_bytes, GFP_KERNEL | __GFP_NOFAIL);
 	memcpy(sfp, oldsfp, ifp->if_bytes);
 
 	xfs_idata_realloc(dp, -ifp->if_bytes, XFS_DATA_FORK);
@@ -1253,7 +1255,7 @@ xfs_dir2_sf_to_block(
 			sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
 	}
 	/* Done with the temporary buffer */
-	kmem_free(sfp);
+	kfree(sfp);
 	/*
 	 * Sort the leaf entries by hash value.
 	 */
@@ -1268,6 +1270,6 @@ xfs_dir2_sf_to_block(
 	xfs_dir3_data_check(dp, bp);
 	return 0;
 out_free:
-	kmem_free(sfp);
+	kfree(sfp);
 	return error;
 }
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index dbcf58979a59..7a6d965bea71 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -18,6 +18,7 @@
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_log.h"
+#include "xfs_health.h"
 
 static xfs_failaddr_t xfs_dir2_data_freefind_verify(
 		struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf,
@@ -433,6 +434,7 @@ xfs_dir3_data_read(
 		__xfs_buf_mark_corrupt(*bpp, fa);
 		xfs_trans_brelse(tp, *bpp);
 		*bpp = NULL;
+		xfs_dirattr_mark_sick(dp, XFS_DATA_FORK);
 		return -EFSCORRUPTED;
 	}
 
@@ -1198,6 +1200,7 @@ xfs_dir2_data_use_free(
 corrupt:
 	xfs_corruption_error(__func__, XFS_ERRLEVEL_LOW, args->dp->i_mount,
 			hdr, sizeof(*hdr), __FILE__, __LINE__, fa);
+	xfs_da_mark_sick(args);
 	return -EFSCORRUPTED;
 }
 
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index cb9e950a911d..08dda5ce9d91 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -19,6 +19,7 @@
 #include "xfs_trace.h"
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
+#include "xfs_health.h"
 
 /*
  * Local function declarations.
@@ -1393,8 +1394,10 @@ xfs_dir2_leaf_removename(
 	bestsp = xfs_dir2_leaf_bests_p(ltp);
 	if (be16_to_cpu(bestsp[db]) != oldbest) {
 		xfs_buf_mark_corrupt(lbp);
+		xfs_da_mark_sick(args);
 		return -EFSCORRUPTED;
 	}
+
 	/*
 	 * Mark the former data entry unused.
 	 */
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 7a03aeb9f4c9..be0b8834028c 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -20,6 +20,7 @@
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_log.h"
+#include "xfs_health.h"
 
 /*
  * Function declarations.
@@ -231,6 +232,7 @@ __xfs_dir3_free_read(
 		__xfs_buf_mark_corrupt(*bpp, fa);
 		xfs_trans_brelse(tp, *bpp);
 		*bpp = NULL;
+		xfs_dirattr_mark_sick(dp, XFS_DATA_FORK);
 		return -EFSCORRUPTED;
 	}
 
@@ -443,6 +445,7 @@ xfs_dir2_leaf_to_node(
 	if (be32_to_cpu(ltp->bestcount) >
 				(uint)dp->i_disk_size / args->geo->blksize) {
 		xfs_buf_mark_corrupt(lbp);
+		xfs_da_mark_sick(args);
 		return -EFSCORRUPTED;
 	}
 
@@ -517,6 +520,7 @@ xfs_dir2_leafn_add(
 	 */
 	if (index < 0) {
 		xfs_buf_mark_corrupt(bp);
+		xfs_da_mark_sick(args);
 		return -EFSCORRUPTED;
 	}
 
@@ -736,6 +740,7 @@ xfs_dir2_leafn_lookup_for_addname(
 					   cpu_to_be16(NULLDATAOFF))) {
 				if (curfdb != newfdb)
 					xfs_trans_brelse(tp, curbp);
+				xfs_da_mark_sick(args);
 				return -EFSCORRUPTED;
 			}
 			curfdb = newfdb;
@@ -804,6 +809,7 @@ xfs_dir2_leafn_lookup_for_entry(
 	xfs_dir3_leaf_check(dp, bp);
 	if (leafhdr.count <= 0) {
 		xfs_buf_mark_corrupt(bp);
+		xfs_da_mark_sick(args);
 		return -EFSCORRUPTED;
 	}
 
@@ -1739,6 +1745,7 @@ xfs_dir2_node_add_datablk(
 			} else {
 				xfs_alert(mp, " ... fblk is NULL");
 			}
+			xfs_da_mark_sick(args);
 			return -EFSCORRUPTED;
 		}
 
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index e1f83fc7b6ad..17a20384c8b7 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -276,7 +276,7 @@ xfs_dir2_block_to_sf(
 	 * format the data into.  Once we have formatted the data, we can free
 	 * the block and copy the formatted data into the inode literal area.
 	 */
-	sfp = kmem_alloc(mp->m_sb.sb_inodesize, 0);
+	sfp = kmalloc(mp->m_sb.sb_inodesize, GFP_KERNEL | __GFP_NOFAIL);
 	memcpy(sfp, sfhp, xfs_dir2_sf_hdr_size(sfhp->i8count));
 
 	/*
@@ -350,7 +350,7 @@ xfs_dir2_block_to_sf(
 	xfs_dir2_sf_check(args);
 out:
 	xfs_trans_log_inode(args->trans, dp, logflags);
-	kmem_free(sfp);
+	kfree(sfp);
 	return error;
 }
 
@@ -524,7 +524,7 @@ xfs_dir2_sf_addname_hard(
 	 * Copy the old directory to the stack buffer.
 	 */
 	old_isize = (int)dp->i_disk_size;
-	buf = kmem_alloc(old_isize, 0);
+	buf = kmalloc(old_isize, GFP_KERNEL | __GFP_NOFAIL);
 	oldsfp = (xfs_dir2_sf_hdr_t *)buf;
 	memcpy(oldsfp, dp->i_df.if_data, old_isize);
 	/*
@@ -576,7 +576,7 @@ xfs_dir2_sf_addname_hard(
 		sfep = xfs_dir2_sf_nextentry(mp, sfp, sfep);
 		memcpy(sfep, oldsfep, old_isize - nbytes);
 	}
-	kmem_free(buf);
+	kfree(buf);
 	dp->i_disk_size = new_isize;
 	xfs_dir2_sf_check(args);
 }
@@ -1151,7 +1151,7 @@ xfs_dir2_sf_toino4(
 	 * Don't want xfs_idata_realloc copying the data here.
 	 */
 	oldsize = dp->i_df.if_bytes;
-	buf = kmem_alloc(oldsize, 0);
+	buf = kmalloc(oldsize, GFP_KERNEL | __GFP_NOFAIL);
 	ASSERT(oldsfp->i8count == 1);
 	memcpy(buf, oldsfp, oldsize);
 	/*
@@ -1190,7 +1190,7 @@ xfs_dir2_sf_toino4(
 	/*
 	 * Clean up the inode.
 	 */
-	kmem_free(buf);
+	kfree(buf);
 	dp->i_disk_size = newsize;
 	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
 }
@@ -1223,7 +1223,7 @@ xfs_dir2_sf_toino8(
 	 * Don't want xfs_idata_realloc copying the data here.
 	 */
 	oldsize = dp->i_df.if_bytes;
-	buf = kmem_alloc(oldsize, 0);
+	buf = kmalloc(oldsize, GFP_KERNEL | __GFP_NOFAIL);
 	ASSERT(oldsfp->i8count == 0);
 	memcpy(buf, oldsfp, oldsize);
 	/*
@@ -1262,7 +1262,7 @@ xfs_dir2_sf_toino8(
 	/*
 	 * Clean up the inode.
 	 */
-	kmem_free(buf);
+	kfree(buf);
 	dp->i_disk_size = newsize;
 	xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_DDATA);
 }
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 382ab1e71c0b..2b2f9050fbfb 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -477,15 +477,9 @@ xfs_is_quota_inode(struct xfs_sb *sbp, xfs_ino_t ino)
 #define	XFS_AGI_GOOD_VERSION(v)	((v) == XFS_AGI_VERSION)
 
 /*
- * Btree number 0 is bno, 1 is cnt, 2 is rmap. This value gives the size of the
- * arrays below.
- */
-#define	XFS_BTNUM_AGF	((int)XFS_BTNUM_RMAPi + 1)
-
-/*
- * The second word of agf_levels in the first a.g. overlaps the EFS
- * superblock's magic number.  Since the magic numbers valid for EFS
- * are > 64k, our value cannot be confused for an EFS superblock's.
+ * agf_cnt_level in the first AGF overlaps the EFS superblock's magic number.
+ * Since the magic numbers valid for EFS are > 64k, our value cannot be confused
+ * for an EFS superblock.
  */
 
 typedef struct xfs_agf {
@@ -499,8 +493,13 @@ typedef struct xfs_agf {
 	/*
 	 * Freespace and rmap information
 	 */
-	__be32		agf_roots[XFS_BTNUM_AGF];	/* root blocks */
-	__be32		agf_levels[XFS_BTNUM_AGF];	/* btree levels */
+	__be32		agf_bno_root;	/* bnobt root block */
+	__be32		agf_cnt_root;	/* cntbt root block */
+	__be32		agf_rmap_root;	/* rmapbt root block */
+
+	__be32		agf_bno_level;	/* bnobt btree levels */
+	__be32		agf_cnt_level;	/* cntbt btree levels */
+	__be32		agf_rmap_level;	/* rmapbt btree levels */
 
 	__be32		agf_flfirst;	/* first freelist block's index */
 	__be32		agf_fllast;	/* last freelist block's index */
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 6360073865db..ca1b17d01437 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -195,6 +195,8 @@ struct xfs_fsop_geom {
 #define XFS_FSOP_GEOM_SICK_PQUOTA	(1 << 3)  /* project quota */
 #define XFS_FSOP_GEOM_SICK_RT_BITMAP	(1 << 4)  /* realtime bitmap */
 #define XFS_FSOP_GEOM_SICK_RT_SUMMARY	(1 << 5)  /* realtime summary */
+#define XFS_FSOP_GEOM_SICK_QUOTACHECK	(1 << 6)  /* quota counts */
+#define XFS_FSOP_GEOM_SICK_NLINKS	(1 << 7)  /* inode link counts */
 
 /* Output for XFS_FS_COUNTS */
 typedef struct xfs_fsop_counts {
@@ -292,6 +294,7 @@ struct xfs_ag_geometry {
 #define XFS_AG_GEOM_SICK_FINOBT	(1 << 7)  /* free inode index */
 #define XFS_AG_GEOM_SICK_RMAPBT	(1 << 8)  /* reverse mappings */
 #define XFS_AG_GEOM_SICK_REFCNTBT (1 << 9)  /* reference counts */
+#define XFS_AG_GEOM_SICK_INODES	(1 << 10) /* bad inodes were seen */
 
 /*
  * Structures for XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG & XFS_IOC_FSGROWFSRT
@@ -709,9 +712,12 @@ struct xfs_scrub_metadata {
 #define XFS_SCRUB_TYPE_GQUOTA	22	/* group quotas */
 #define XFS_SCRUB_TYPE_PQUOTA	23	/* project quotas */
 #define XFS_SCRUB_TYPE_FSCOUNTERS 24	/* fs summary counters */
+#define XFS_SCRUB_TYPE_QUOTACHECK 25	/* quota counters */
+#define XFS_SCRUB_TYPE_NLINKS	26	/* inode link counts */
+#define XFS_SCRUB_TYPE_HEALTHY	27	/* everything checked out ok */
 
 /* Number of scrub subcommands. */
-#define XFS_SCRUB_TYPE_NR	25
+#define XFS_SCRUB_TYPE_NR	28
 
 /* i: Repair this metadata. */
 #define XFS_SCRUB_IFLAG_REPAIR		(1u << 0)
diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
index 6296993ff8f3..3c64b5f9bd68 100644
--- a/fs/xfs/libxfs/xfs_health.h
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -26,21 +26,40 @@
  * and the "sick" field tells us if that piece was found to need repairs.
  * Therefore we can conclude that for a given sick flag value:
  *
- *  - checked && sick  => metadata needs repair
- *  - checked && !sick => metadata is ok
- *  - !checked         => has not been examined since mount
+ *  - checked && sick   => metadata needs repair
+ *  - checked && !sick  => metadata is ok
+ *  - !checked && sick  => errors have been observed during normal operation,
+ *                         but the metadata has not been checked thoroughly
+ *  - !checked && !sick => has not been examined since mount
+ *
+ * Evidence of health problems can be sorted into three basic categories:
+ *
+ * a) Primary evidence, which signals that something is defective within the
+ *    general grouping of metadata.
+ *
+ * b) Secondary evidence, which are side effects of primary problem but are
+ *    not themselves problems.  These can be forgotten when the primary
+ *    health problems are addressed.
+ *
+ * c) Indirect evidence, which points to something being wrong in another
+ *    group, but we had to release resources and this is all that's left of
+ *    that state.
  */
 
 struct xfs_mount;
 struct xfs_perag;
 struct xfs_inode;
 struct xfs_fsop_geom;
+struct xfs_btree_cur;
+struct xfs_da_args;
 
 /* Observable health issues for metadata spanning the entire filesystem. */
 #define XFS_SICK_FS_COUNTERS	(1 << 0)  /* summary counters */
 #define XFS_SICK_FS_UQUOTA	(1 << 1)  /* user quota */
 #define XFS_SICK_FS_GQUOTA	(1 << 2)  /* group quota */
 #define XFS_SICK_FS_PQUOTA	(1 << 3)  /* project quota */
+#define XFS_SICK_FS_QUOTACHECK	(1 << 4)  /* quota counts */
+#define XFS_SICK_FS_NLINKS	(1 << 5)  /* inode link counts */
 
 /* Observable health issues for realtime volume metadata. */
 #define XFS_SICK_RT_BITMAP	(1 << 0)  /* realtime bitmap */
@@ -57,6 +76,7 @@ struct xfs_fsop_geom;
 #define XFS_SICK_AG_FINOBT	(1 << 7)  /* free inode index */
 #define XFS_SICK_AG_RMAPBT	(1 << 8)  /* reverse mappings */
 #define XFS_SICK_AG_REFCNTBT	(1 << 9)  /* reference counts */
+#define XFS_SICK_AG_INODES	(1 << 10) /* inactivated bad inodes */
 
 /* Observable health issues for inode metadata. */
 #define XFS_SICK_INO_CORE	(1 << 0)  /* inode core */
@@ -73,11 +93,16 @@ struct xfs_fsop_geom;
 #define XFS_SICK_INO_DIR_ZAPPED		(1 << 10) /* directory erased */
 #define XFS_SICK_INO_SYMLINK_ZAPPED	(1 << 11) /* symlink erased */
 
+/* Don't propagate sick status to ag health summary during inactivation */
+#define XFS_SICK_INO_FORGET	(1 << 12)
+
 /* Primary evidence of health problems in a given group. */
 #define XFS_SICK_FS_PRIMARY	(XFS_SICK_FS_COUNTERS | \
 				 XFS_SICK_FS_UQUOTA | \
 				 XFS_SICK_FS_GQUOTA | \
-				 XFS_SICK_FS_PQUOTA)
+				 XFS_SICK_FS_PQUOTA | \
+				 XFS_SICK_FS_QUOTACHECK | \
+				 XFS_SICK_FS_NLINKS)
 
 #define XFS_SICK_RT_PRIMARY	(XFS_SICK_RT_BITMAP | \
 				 XFS_SICK_RT_SUMMARY)
@@ -107,29 +132,86 @@ struct xfs_fsop_geom;
 				 XFS_SICK_INO_DIR_ZAPPED | \
 				 XFS_SICK_INO_SYMLINK_ZAPPED)
 
-/* These functions must be provided by the xfs implementation. */
+/* Secondary state related to (but not primary evidence of) health problems. */
+#define XFS_SICK_FS_SECONDARY	(0)
+#define XFS_SICK_RT_SECONDARY	(0)
+#define XFS_SICK_AG_SECONDARY	(0)
+#define XFS_SICK_INO_SECONDARY	(XFS_SICK_INO_FORGET)
+
+/* Evidence of health problems elsewhere. */
+#define XFS_SICK_FS_INDIRECT	(0)
+#define XFS_SICK_RT_INDIRECT	(0)
+#define XFS_SICK_AG_INDIRECT	(XFS_SICK_AG_INODES)
+#define XFS_SICK_INO_INDIRECT	(0)
+
+/* All health masks. */
+#define XFS_SICK_FS_ALL	(XFS_SICK_FS_PRIMARY | \
+				 XFS_SICK_FS_SECONDARY | \
+				 XFS_SICK_FS_INDIRECT)
+
+#define XFS_SICK_RT_ALL	(XFS_SICK_RT_PRIMARY | \
+				 XFS_SICK_RT_SECONDARY | \
+				 XFS_SICK_RT_INDIRECT)
+
+#define XFS_SICK_AG_ALL	(XFS_SICK_AG_PRIMARY | \
+				 XFS_SICK_AG_SECONDARY | \
+				 XFS_SICK_AG_INDIRECT)
+
+#define XFS_SICK_INO_ALL	(XFS_SICK_INO_PRIMARY | \
+				 XFS_SICK_INO_SECONDARY | \
+				 XFS_SICK_INO_INDIRECT | \
+				 XFS_SICK_INO_ZAPPED)
+
+/*
+ * These functions must be provided by the xfs implementation.  Function
+ * behavior with respect to the first argument should be as follows:
+ *
+ * xfs_*_mark_sick:        Set the sick flags and do not set checked flags.
+ *                         Runtime code should call this upon encountering
+ *                         a corruption.
+ *
+ * xfs_*_mark_corrupt:     Set the sick and checked flags simultaneously.
+ *                         Fsck tools should call this when corruption is
+ *                         found.
+ *
+ * xfs_*_mark_healthy:     Clear the sick flags and set the checked flags.
+ *                         Fsck tools should call this after correcting errors.
+ *
+ * xfs_*_measure_sickness: Return the sick and check status in the provided
+ *                         out parameters.
+ */
 
 void xfs_fs_mark_sick(struct xfs_mount *mp, unsigned int mask);
+void xfs_fs_mark_corrupt(struct xfs_mount *mp, unsigned int mask);
 void xfs_fs_mark_healthy(struct xfs_mount *mp, unsigned int mask);
 void xfs_fs_measure_sickness(struct xfs_mount *mp, unsigned int *sick,
 		unsigned int *checked);
 
 void xfs_rt_mark_sick(struct xfs_mount *mp, unsigned int mask);
+void xfs_rt_mark_corrupt(struct xfs_mount *mp, unsigned int mask);
 void xfs_rt_mark_healthy(struct xfs_mount *mp, unsigned int mask);
 void xfs_rt_measure_sickness(struct xfs_mount *mp, unsigned int *sick,
 		unsigned int *checked);
 
+void xfs_agno_mark_sick(struct xfs_mount *mp, xfs_agnumber_t agno,
+		unsigned int mask);
 void xfs_ag_mark_sick(struct xfs_perag *pag, unsigned int mask);
+void xfs_ag_mark_corrupt(struct xfs_perag *pag, unsigned int mask);
 void xfs_ag_mark_healthy(struct xfs_perag *pag, unsigned int mask);
 void xfs_ag_measure_sickness(struct xfs_perag *pag, unsigned int *sick,
 		unsigned int *checked);
 
 void xfs_inode_mark_sick(struct xfs_inode *ip, unsigned int mask);
+void xfs_inode_mark_corrupt(struct xfs_inode *ip, unsigned int mask);
 void xfs_inode_mark_healthy(struct xfs_inode *ip, unsigned int mask);
 void xfs_inode_measure_sickness(struct xfs_inode *ip, unsigned int *sick,
 		unsigned int *checked);
 
 void xfs_health_unmount(struct xfs_mount *mp);
+void xfs_bmap_mark_sick(struct xfs_inode *ip, int whichfork);
+void xfs_btree_mark_sick(struct xfs_btree_cur *cur);
+void xfs_dirattr_mark_sick(struct xfs_inode *ip, int whichfork);
+void xfs_da_mark_sick(struct xfs_da_args *args);
 
 /* Now some helpers. */
 
@@ -197,4 +279,7 @@ void xfs_fsop_geom_health(struct xfs_mount *mp, struct xfs_fsop_geom *geo);
 void xfs_ag_geom_health(struct xfs_perag *pag, struct xfs_ag_geometry *ageo);
 void xfs_bulkstat_health(struct xfs_inode *ip, struct xfs_bulkstat *bs);
 
+#define xfs_metadata_is_sick(error) \
+	(unlikely((error) == -EFSCORRUPTED || (error) == -EFSBADCRC))
+
 #endif	/* __XFS_HEALTH_H__ */
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 2361a22035b0..e5ac3e5430c4 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -27,6 +27,7 @@
 #include "xfs_log.h"
 #include "xfs_rmap.h"
 #include "xfs_ag.h"
+#include "xfs_health.h"
 
 /*
  * Lookup a record by ino in the btree given by cur.
@@ -140,13 +141,13 @@ xfs_inobt_complain_bad_rec(
 	struct xfs_mount		*mp = cur->bc_mp;
 
 	xfs_warn(mp,
-		"%s Inode BTree record corruption in AG %d detected at %pS!",
-		cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free",
-		cur->bc_ag.pag->pag_agno, fa);
+		"%sbt record corruption in AG %d detected at %pS!",
+		cur->bc_ops->name, cur->bc_ag.pag->pag_agno, fa);
 	xfs_warn(mp,
 "start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x",
 		irec->ir_startino, irec->ir_count, irec->ir_freecount,
 		irec->ir_free, irec->ir_holemask);
+	xfs_btree_mark_sick(cur);
 	return -EFSCORRUPTED;
 }
 
@@ -205,14 +206,17 @@ xfs_inobt_insert(
 	struct xfs_buf		*agbp,
 	xfs_agino_t		newino,
 	xfs_agino_t		newlen,
-	xfs_btnum_t		btnum)
+	bool			is_finobt)
 {
 	struct xfs_btree_cur	*cur;
 	xfs_agino_t		thisino;
 	int			i;
 	int			error;
 
-	cur = xfs_inobt_init_cursor(pag, tp, agbp, btnum);
+	if (is_finobt)
+		cur = xfs_finobt_init_cursor(pag, tp, agbp);
+	else
+		cur = xfs_inobt_init_cursor(pag, tp, agbp);
 
 	for (thisino = newino;
 	     thisino < newino + newlen;
@@ -528,16 +532,14 @@ __xfs_inobt_rec_merge(
 }
 
 /*
- * Insert a new sparse inode chunk into the associated inode btree. The inode
- * record for the sparse chunk is pre-aligned to a startino that should match
- * any pre-existing sparse inode record in the tree. This allows sparse chunks
- * to fill over time.
+ * Insert a new sparse inode chunk into the associated inode allocation btree.
+ * The inode record for the sparse chunk is pre-aligned to a startino that
+ * should match any pre-existing sparse inode record in the tree. This allows
+ * sparse chunks to fill over time.
  *
- * This function supports two modes of handling preexisting records depending on
- * the merge flag. If merge is true, the provided record is merged with the
+ * If no preexisting record exists, the provided record is inserted.
+ * If there is a preexisting record, the provided record is merged with the
  * existing record and updated in place. The merged record is returned in nrec.
- * If merge is false, an existing record is replaced with the provided record.
- * If no preexisting record exists, the provided record is always inserted.
  *
  * It is considered corruption if a merge is requested and not possible. Given
  * the sparse inode alignment constraints, this should never happen.
@@ -547,9 +549,7 @@ xfs_inobt_insert_sprec(
 	struct xfs_perag		*pag,
 	struct xfs_trans		*tp,
 	struct xfs_buf			*agbp,
-	int				btnum,
-	struct xfs_inobt_rec_incore	*nrec,	/* in/out: new/merged rec. */
-	bool				merge)	/* merge or replace */
+	struct xfs_inobt_rec_incore	*nrec)	/* in/out: new/merged rec. */
 {
 	struct xfs_mount		*mp = pag->pag_mount;
 	struct xfs_btree_cur		*cur;
@@ -557,7 +557,7 @@ xfs_inobt_insert_sprec(
 	int				i;
 	struct xfs_inobt_rec_incore	rec;
 
-	cur = xfs_inobt_init_cursor(pag, tp, agbp, btnum);
+	cur = xfs_inobt_init_cursor(pag, tp, agbp);
 
 	/* the new record is pre-aligned so we know where to look */
 	error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
@@ -571,6 +571,7 @@ xfs_inobt_insert_sprec(
 		if (error)
 			goto error;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto error;
 		}
@@ -579,45 +580,45 @@ xfs_inobt_insert_sprec(
 	}
 
 	/*
-	 * A record exists at this startino. Merge or replace the record
-	 * depending on what we've been asked to do.
+	 * A record exists at this startino.  Merge the records.
 	 */
-	if (merge) {
-		error = xfs_inobt_get_rec(cur, &rec, &i);
-		if (error)
-			goto error;
-		if (XFS_IS_CORRUPT(mp, i != 1)) {
-			error = -EFSCORRUPTED;
-			goto error;
-		}
-		if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) {
-			error = -EFSCORRUPTED;
-			goto error;
-		}
+	error = xfs_inobt_get_rec(cur, &rec, &i);
+	if (error)
+		goto error;
+	if (XFS_IS_CORRUPT(mp, i != 1)) {
+		xfs_btree_mark_sick(cur);
+		error = -EFSCORRUPTED;
+		goto error;
+	}
+	if (XFS_IS_CORRUPT(mp, rec.ir_startino != nrec->ir_startino)) {
+		xfs_btree_mark_sick(cur);
+		error = -EFSCORRUPTED;
+		goto error;
+	}
 
-		/*
-		 * This should never fail. If we have coexisting records that
-		 * cannot merge, something is seriously wrong.
-		 */
-		if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) {
-			error = -EFSCORRUPTED;
-			goto error;
-		}
+	/*
+	 * This should never fail. If we have coexisting records that
+	 * cannot merge, something is seriously wrong.
+	 */
+	if (XFS_IS_CORRUPT(mp, !__xfs_inobt_can_merge(nrec, &rec))) {
+		xfs_btree_mark_sick(cur);
+		error = -EFSCORRUPTED;
+		goto error;
+	}
 
-		trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino,
-					 rec.ir_holemask, nrec->ir_startino,
-					 nrec->ir_holemask);
+	trace_xfs_irec_merge_pre(mp, pag->pag_agno, rec.ir_startino,
+				 rec.ir_holemask, nrec->ir_startino,
+				 nrec->ir_holemask);
 
-		/* merge to nrec to output the updated record */
-		__xfs_inobt_rec_merge(nrec, &rec);
+	/* merge to nrec to output the updated record */
+	__xfs_inobt_rec_merge(nrec, &rec);
 
-		trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino,
-					  nrec->ir_holemask);
+	trace_xfs_irec_merge_post(mp, pag->pag_agno, nrec->ir_startino,
+				  nrec->ir_holemask);
 
-		error = xfs_inobt_rec_check_count(mp, nrec);
-		if (error)
-			goto error;
-	}
+	error = xfs_inobt_rec_check_count(mp, nrec);
+	if (error)
+		goto error;
 
 	error = xfs_inobt_update(cur, nrec);
 	if (error)
@@ -632,6 +633,59 @@ error:
 }
 
 /*
+ * Insert a new sparse inode chunk into the free inode btree. The inode
+ * record for the sparse chunk is pre-aligned to a startino that should match
+ * any pre-existing sparse inode record in the tree. This allows sparse chunks
+ * to fill over time.
+ *
+ * The new record is always inserted, overwriting a pre-existing record if
+ * there is one.
+ */
+STATIC int
+xfs_finobt_insert_sprec(
+	struct xfs_perag		*pag,
+	struct xfs_trans		*tp,
+	struct xfs_buf			*agbp,
+	struct xfs_inobt_rec_incore	*nrec)	/* in/out: new rec. */
+{
+	struct xfs_mount		*mp = pag->pag_mount;
+	struct xfs_btree_cur		*cur;
+	int				error;
+	int				i;
+
+	cur = xfs_finobt_init_cursor(pag, tp, agbp);
+
+	/* the new record is pre-aligned so we know where to look */
+	error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
+	if (error)
+		goto error;
+	/* if nothing there, insert a new record and return */
+	if (i == 0) {
+		error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
+					     nrec->ir_count, nrec->ir_freecount,
+					     nrec->ir_free, &i);
+		if (error)
+			goto error;
+		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
+			error = -EFSCORRUPTED;
+			goto error;
+		}
+	} else {
+		error = xfs_inobt_update(cur, nrec);
+		if (error)
+			goto error;
+	}
+
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	return 0;
+error:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	return error;
+}
+
+
+/*
  * Allocate new inodes in the allocation group specified by agbp.  Returns 0 if
  * inodes were allocated in this AG; -EAGAIN if there was no space in this AG so
  * the caller knows it can try another AG, a hard -ENOSPC when over the maximum
@@ -857,8 +911,7 @@ sparse_alloc:
 		 * if necessary. If a merge does occur, rec is updated to the
 		 * merged record.
 		 */
-		error = xfs_inobt_insert_sprec(pag, tp, agbp,
-				XFS_BTNUM_INO, &rec, true);
+		error = xfs_inobt_insert_sprec(pag, tp, agbp, &rec);
 		if (error == -EFSCORRUPTED) {
 			xfs_alert(args.mp,
 	"invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
@@ -882,21 +935,19 @@ sparse_alloc:
 		 * existing record with this one.
 		 */
 		if (xfs_has_finobt(args.mp)) {
-			error = xfs_inobt_insert_sprec(pag, tp, agbp,
-				       XFS_BTNUM_FINO, &rec, false);
+			error = xfs_finobt_insert_sprec(pag, tp, agbp, &rec);
 			if (error)
 				return error;
 		}
 	} else {
 		/* full chunk - insert new records to both btrees */
-		error = xfs_inobt_insert(pag, tp, agbp, newino, newlen,
-					 XFS_BTNUM_INO);
+		error = xfs_inobt_insert(pag, tp, agbp, newino, newlen, false);
 		if (error)
 			return error;
 
 		if (xfs_has_finobt(args.mp)) {
 			error = xfs_inobt_insert(pag, tp, agbp, newino,
-						 newlen, XFS_BTNUM_FINO);
+						 newlen, true);
 			if (error)
 				return error;
 		}
@@ -949,8 +1000,10 @@ xfs_ialloc_next_rec(
 		error = xfs_inobt_get_rec(cur, rec, &i);
 		if (error)
 			return error;
-		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			return -EFSCORRUPTED;
+		}
 	}
 
 	return 0;
@@ -974,8 +1027,10 @@ xfs_ialloc_get_rec(
 		error = xfs_inobt_get_rec(cur, rec, &i);
 		if (error)
 			return error;
-		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			return -EFSCORRUPTED;
+		}
 	}
 
 	return 0;
@@ -1030,7 +1085,7 @@ xfs_dialloc_ag_inobt(
 	ASSERT(pag->pagi_freecount > 0);
 
  restart_pagno:
-	cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO);
+	cur = xfs_inobt_init_cursor(pag, tp, agbp);
 	/*
 	 * If pagino is 0 (this is the root inode allocation) use newino.
 	 * This must work because we've just allocated some.
@@ -1053,6 +1108,7 @@ xfs_dialloc_ag_inobt(
 		if (error)
 			goto error0;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto error0;
 		}
@@ -1061,6 +1117,7 @@ xfs_dialloc_ag_inobt(
 		if (error)
 			goto error0;
 		if (XFS_IS_CORRUPT(mp, j != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto error0;
 		}
@@ -1219,6 +1276,7 @@ xfs_dialloc_ag_inobt(
 	if (error)
 		goto error0;
 	if (XFS_IS_CORRUPT(mp, i != 1)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto error0;
 	}
@@ -1228,6 +1286,7 @@ xfs_dialloc_ag_inobt(
 		if (error)
 			goto error0;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto error0;
 		}
@@ -1237,6 +1296,7 @@ xfs_dialloc_ag_inobt(
 		if (error)
 			goto error0;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto error0;
 		}
@@ -1297,8 +1357,10 @@ xfs_dialloc_ag_finobt_near(
 		error = xfs_inobt_get_rec(lcur, rec, &i);
 		if (error)
 			return error;
-		if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1))
+		if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1)) {
+			xfs_btree_mark_sick(lcur);
 			return -EFSCORRUPTED;
+		}
 
 		/*
 		 * See if we've landed in the parent inode record. The finobt
@@ -1322,12 +1384,14 @@ xfs_dialloc_ag_finobt_near(
 		if (error)
 			goto error_rcur;
 		if (XFS_IS_CORRUPT(lcur->bc_mp, j != 1)) {
+			xfs_btree_mark_sick(lcur);
 			error = -EFSCORRUPTED;
 			goto error_rcur;
 		}
 	}
 
 	if (XFS_IS_CORRUPT(lcur->bc_mp, i != 1 && j != 1)) {
+		xfs_btree_mark_sick(lcur);
 		error = -EFSCORRUPTED;
 		goto error_rcur;
 	}
@@ -1383,8 +1447,10 @@ xfs_dialloc_ag_finobt_newino(
 			error = xfs_inobt_get_rec(cur, rec, &i);
 			if (error)
 				return error;
-			if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+			if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				return -EFSCORRUPTED;
+			}
 			return 0;
 		}
 	}
@@ -1395,14 +1461,18 @@ xfs_dialloc_ag_finobt_newino(
 	error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
 	if (error)
 		return error;
-	if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+	if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+		xfs_btree_mark_sick(cur);
 		return -EFSCORRUPTED;
+	}
 
 	error = xfs_inobt_get_rec(cur, rec, &i);
 	if (error)
 		return error;
-	if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+	if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+		xfs_btree_mark_sick(cur);
 		return -EFSCORRUPTED;
+	}
 
 	return 0;
 }
@@ -1424,14 +1494,18 @@ xfs_dialloc_ag_update_inobt(
 	error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
 	if (error)
 		return error;
-	if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+	if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+		xfs_btree_mark_sick(cur);
 		return -EFSCORRUPTED;
+	}
 
 	error = xfs_inobt_get_rec(cur, &rec, &i);
 	if (error)
 		return error;
-	if (XFS_IS_CORRUPT(cur->bc_mp, i != 1))
+	if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+		xfs_btree_mark_sick(cur);
 		return -EFSCORRUPTED;
+	}
 	ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
 				   XFS_INODES_PER_CHUNK) == 0);
 
@@ -1440,8 +1514,10 @@ xfs_dialloc_ag_update_inobt(
 
 	if (XFS_IS_CORRUPT(cur->bc_mp,
 			   rec.ir_free != frec->ir_free ||
-			   rec.ir_freecount != frec->ir_freecount))
+			   rec.ir_freecount != frec->ir_freecount)) {
+		xfs_btree_mark_sick(cur);
 		return -EFSCORRUPTED;
+	}
 
 	return xfs_inobt_update(cur, &rec);
 }
@@ -1483,7 +1559,7 @@ xfs_dialloc_ag(
 	if (!pagino)
 		pagino = be32_to_cpu(agi->agi_newino);
 
-	cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_FINO);
+	cur = xfs_finobt_init_cursor(pag, tp, agbp);
 
 	error = xfs_check_agi_freecount(cur);
 	if (error)
@@ -1526,7 +1602,7 @@ xfs_dialloc_ag(
 	 * the original freecount. If all is well, make the equivalent update to
 	 * the inobt using the finobt record and offset information.
 	 */
-	icur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO);
+	icur = xfs_inobt_init_cursor(pag, tp, agbp);
 
 	error = xfs_check_agi_freecount(icur);
 	if (error)
@@ -1943,7 +2019,7 @@ xfs_difree_inobt(
 	/*
 	 * Initialize the cursor.
 	 */
-	cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO);
+	cur = xfs_inobt_init_cursor(pag, tp, agbp);
 
 	error = xfs_check_agi_freecount(cur);
 	if (error)
@@ -1958,6 +2034,7 @@ xfs_difree_inobt(
 		goto error0;
 	}
 	if (XFS_IS_CORRUPT(mp, i != 1)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto error0;
 	}
@@ -1968,6 +2045,7 @@ xfs_difree_inobt(
 		goto error0;
 	}
 	if (XFS_IS_CORRUPT(mp, i != 1)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto error0;
 	}
@@ -2068,7 +2146,7 @@ xfs_difree_finobt(
 	int				error;
 	int				i;
 
-	cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_FINO);
+	cur = xfs_finobt_init_cursor(pag, tp, agbp);
 
 	error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
 	if (error)
@@ -2080,6 +2158,7 @@ xfs_difree_finobt(
 		 * something is out of sync.
 		 */
 		if (XFS_IS_CORRUPT(mp, ibtrec->ir_freecount != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto error;
 		}
@@ -2106,6 +2185,7 @@ xfs_difree_finobt(
 	if (error)
 		goto error;
 	if (XFS_IS_CORRUPT(mp, i != 1)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto error;
 	}
@@ -2116,6 +2196,7 @@ xfs_difree_finobt(
 	if (XFS_IS_CORRUPT(mp,
 			   rec.ir_free != ibtrec->ir_free ||
 			   rec.ir_freecount != ibtrec->ir_freecount)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto error;
 	}
@@ -2265,7 +2346,7 @@ xfs_imap_lookup(
 	 * we have a record, we need to ensure it contains the inode number
 	 * we are looking up.
 	 */
-	cur = xfs_inobt_init_cursor(pag, tp, agbp, XFS_BTNUM_INO);
+	cur = xfs_inobt_init_cursor(pag, tp, agbp);
 	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
 	if (!error) {
 		if (i)
@@ -2604,6 +2685,8 @@ xfs_read_agi(
 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 			XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGI_DADDR(mp)),
 			XFS_FSS_TO_BB(mp, 1), 0, agibpp, &xfs_agi_buf_ops);
+	if (xfs_metadata_is_sick(error))
+		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
 	if (error)
 		return error;
 	if (tp)
@@ -2765,7 +2848,7 @@ xfs_ialloc_count_inodes(
 	struct xfs_ialloc_count_inodes	ci = {0};
 	int				error;
 
-	ASSERT(cur->bc_btnum == XFS_BTNUM_INO);
+	ASSERT(xfs_btree_is_ino(cur->bc_ops));
 	error = xfs_btree_query_all(cur, xfs_ialloc_count_inodes_rec, &ci);
 	if (error)
 		return error;
@@ -2982,7 +3065,7 @@ xfs_ialloc_check_shrink(
 	if (!xfs_has_sparseinodes(pag->pag_mount))
 		return 0;
 
-	cur = xfs_inobt_init_cursor(pag, tp, agibp, XFS_BTNUM_INO);
+	cur = xfs_inobt_init_cursor(pag, tp, agibp);
 
 	/* Look up the inobt record that would correspond to the new EOFS. */
 	agino = XFS_AGB_TO_AGINO(pag->pag_mount, new_length);
@@ -2995,6 +3078,7 @@ xfs_ialloc_check_shrink(
 		goto out;
 
 	if (!has) {
+		xfs_ag_mark_sick(pag, XFS_SICK_AG_INOBT);
 		error = -EFSCORRUPTED;
 		goto out;
 	}
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 42a5e1f227a0..cc661fca6ff5 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -17,6 +17,7 @@
 #include "xfs_ialloc_btree.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
+#include "xfs_health.h"
 #include "xfs_trace.h"
 #include "xfs_trans.h"
 #include "xfs_rmap.h"
@@ -37,7 +38,15 @@ xfs_inobt_dup_cursor(
 	struct xfs_btree_cur	*cur)
 {
 	return xfs_inobt_init_cursor(cur->bc_ag.pag, cur->bc_tp,
-			cur->bc_ag.agbp, cur->bc_btnum);
+			cur->bc_ag.agbp);
+}
+
+STATIC struct xfs_btree_cur *
+xfs_finobt_dup_cursor(
+	struct xfs_btree_cur	*cur)
+{
+	return xfs_finobt_init_cursor(cur->bc_ag.pag, cur->bc_tp,
+			cur->bc_ag.agbp);
 }
 
 STATIC void
@@ -81,9 +90,9 @@ xfs_inobt_mod_blockcount(
 	if (!xfs_has_inobtcounts(cur->bc_mp))
 		return;
 
-	if (cur->bc_btnum == XFS_BTNUM_FINO)
+	if (xfs_btree_is_fino(cur->bc_ops))
 		be32_add_cpu(&agi->agi_fblocks, howmuch);
-	else if (cur->bc_btnum == XFS_BTNUM_INO)
+	else
 		be32_add_cpu(&agi->agi_iblocks, howmuch);
 	xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_IBLOCKS);
 }
@@ -300,7 +309,7 @@ xfs_inobt_verify(
 	 * xfs_perag_initialised_agi(pag)) if we ever do.
 	 */
 	if (xfs_has_crc(mp)) {
-		fa = xfs_btree_sblock_v5hdr_verify(bp);
+		fa = xfs_btree_agblock_v5hdr_verify(bp);
 		if (fa)
 			return fa;
 	}
@@ -310,7 +319,7 @@ xfs_inobt_verify(
 	if (level >= M_IGEO(mp)->inobt_maxlevels)
 		return __this_address;
 
-	return xfs_btree_sblock_verify(bp,
+	return xfs_btree_agblock_verify(bp,
 			M_IGEO(mp)->inobt_mxr[level != 0]);
 }
 
@@ -320,7 +329,7 @@ xfs_inobt_read_verify(
 {
 	xfs_failaddr_t	fa;
 
-	if (!xfs_btree_sblock_verify_crc(bp))
+	if (!xfs_btree_agblock_verify_crc(bp))
 		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
 	else {
 		fa = xfs_inobt_verify(bp);
@@ -344,7 +353,7 @@ xfs_inobt_write_verify(
 		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
 		return;
 	}
-	xfs_btree_sblock_calc_crc(bp);
+	xfs_btree_agblock_calc_crc(bp);
 
 }
 
@@ -398,9 +407,17 @@ xfs_inobt_keys_contiguous(
 				 be32_to_cpu(key2->inobt.ir_startino));
 }
 
-static const struct xfs_btree_ops xfs_inobt_ops = {
+const struct xfs_btree_ops xfs_inobt_ops = {
+	.name			= "ino",
+	.type			= XFS_BTREE_TYPE_AG,
+
 	.rec_len		= sizeof(xfs_inobt_rec_t),
 	.key_len		= sizeof(xfs_inobt_key_t),
+	.ptr_len		= XFS_BTREE_SHORT_PTR_LEN,
+
+	.lru_refs		= XFS_INO_BTREE_REF,
+	.statoff		= XFS_STATS_CALC_INDEX(xs_ibt_2),
+	.sick_mask		= XFS_SICK_AG_INOBT,
 
 	.dup_cursor		= xfs_inobt_dup_cursor,
 	.set_root		= xfs_inobt_set_root,
@@ -420,11 +437,19 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
 	.keys_contiguous	= xfs_inobt_keys_contiguous,
 };
 
-static const struct xfs_btree_ops xfs_finobt_ops = {
+const struct xfs_btree_ops xfs_finobt_ops = {
+	.name			= "fino",
+	.type			= XFS_BTREE_TYPE_AG,
+
 	.rec_len		= sizeof(xfs_inobt_rec_t),
 	.key_len		= sizeof(xfs_inobt_key_t),
+	.ptr_len		= XFS_BTREE_SHORT_PTR_LEN,
 
-	.dup_cursor		= xfs_inobt_dup_cursor,
+	.lru_refs		= XFS_INO_BTREE_REF,
+	.statoff		= XFS_STATS_CALC_INDEX(xs_fibt_2),
+	.sick_mask		= XFS_SICK_AG_FINOBT,
+
+	.dup_cursor		= xfs_finobt_dup_cursor,
 	.set_root		= xfs_finobt_set_root,
 	.alloc_block		= xfs_finobt_alloc_block,
 	.free_block		= xfs_finobt_free_block,
@@ -443,65 +468,54 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
 };
 
 /*
- * Initialize a new inode btree cursor.
+ * Create an inode btree cursor.
+ *
+ * For staging cursors tp and agbp are NULL.
  */
-static struct xfs_btree_cur *
-xfs_inobt_init_common(
+struct xfs_btree_cur *
+xfs_inobt_init_cursor(
 	struct xfs_perag	*pag,
-	struct xfs_trans	*tp,		/* transaction pointer */
-	xfs_btnum_t		btnum)		/* ialloc or free ino btree */
+	struct xfs_trans	*tp,
+	struct xfs_buf		*agbp)
 {
 	struct xfs_mount	*mp = pag->pag_mount;
 	struct xfs_btree_cur	*cur;
 
-	cur = xfs_btree_alloc_cursor(mp, tp, btnum,
+	cur = xfs_btree_alloc_cursor(mp, tp, &xfs_inobt_ops,
 			M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache);
-	if (btnum == XFS_BTNUM_INO) {
-		cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2);
-		cur->bc_ops = &xfs_inobt_ops;
-	} else {
-		cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_fibt_2);
-		cur->bc_ops = &xfs_finobt_ops;
-	}
-
-	if (xfs_has_crc(mp))
-		cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
-
 	cur->bc_ag.pag = xfs_perag_hold(pag);
+	cur->bc_ag.agbp = agbp;
+	if (agbp) {
+		struct xfs_agi		*agi = agbp->b_addr;
+
+		cur->bc_nlevels = be32_to_cpu(agi->agi_level);
+	}
 	return cur;
 }
 
-/* Create an inode btree cursor. */
+/*
+ * Create a free inode btree cursor.
+ *
+ * For staging cursors tp and agbp are NULL.
+ */
 struct xfs_btree_cur *
-xfs_inobt_init_cursor(
+xfs_finobt_init_cursor(
 	struct xfs_perag	*pag,
 	struct xfs_trans	*tp,
-	struct xfs_buf		*agbp,
-	xfs_btnum_t		btnum)
+	struct xfs_buf		*agbp)
 {
+	struct xfs_mount	*mp = pag->pag_mount;
 	struct xfs_btree_cur	*cur;
-	struct xfs_agi		*agi = agbp->b_addr;
 
-	cur = xfs_inobt_init_common(pag, tp, btnum);
-	if (btnum == XFS_BTNUM_INO)
-		cur->bc_nlevels = be32_to_cpu(agi->agi_level);
-	else
-		cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
+	cur = xfs_btree_alloc_cursor(mp, tp, &xfs_finobt_ops,
+			M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache);
+	cur->bc_ag.pag = xfs_perag_hold(pag);
 	cur->bc_ag.agbp = agbp;
-	return cur;
-}
+	if (agbp) {
+		struct xfs_agi		*agi = agbp->b_addr;
 
-/* Create an inode btree cursor with a fake root for staging. */
-struct xfs_btree_cur *
-xfs_inobt_stage_cursor(
-	struct xfs_perag	*pag,
-	struct xbtree_afakeroot	*afake,
-	xfs_btnum_t		btnum)
-{
-	struct xfs_btree_cur	*cur;
-
-	cur = xfs_inobt_init_common(pag, NULL, btnum);
-	xfs_btree_stage_afakeroot(cur, afake);
+		cur->bc_nlevels = be32_to_cpu(agi->agi_free_level);
+	}
 	return cur;
 }
 
@@ -521,7 +535,7 @@ xfs_inobt_commit_staged_btree(
 
 	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
 
-	if (cur->bc_btnum == XFS_BTNUM_INO) {
+	if (xfs_btree_is_ino(cur->bc_ops)) {
 		fields = XFS_AGI_ROOT | XFS_AGI_LEVEL;
 		agi->agi_root = cpu_to_be32(afake->af_root);
 		agi->agi_level = cpu_to_be32(afake->af_levels);
@@ -530,7 +544,7 @@ xfs_inobt_commit_staged_btree(
 			fields |= XFS_AGI_IBLOCKS;
 		}
 		xfs_ialloc_log_agi(tp, agbp, fields);
-		xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_inobt_ops);
+		xfs_btree_commit_afakeroot(cur, tp, agbp);
 	} else {
 		fields = XFS_AGI_FREE_ROOT | XFS_AGI_FREE_LEVEL;
 		agi->agi_free_root = cpu_to_be32(afake->af_root);
@@ -540,7 +554,7 @@ xfs_inobt_commit_staged_btree(
 			fields |= XFS_AGI_IBLOCKS;
 		}
 		xfs_ialloc_log_agi(tp, agbp, fields);
-		xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_finobt_ops);
+		xfs_btree_commit_afakeroot(cur, tp, agbp);
 	}
 }
 
@@ -721,45 +735,21 @@ xfs_inobt_max_size(
 					XFS_INODES_PER_CHUNK);
 }
 
-/* Read AGI and create inobt cursor. */
-int
-xfs_inobt_cur(
-	struct xfs_perag	*pag,
-	struct xfs_trans	*tp,
-	xfs_btnum_t		which,
-	struct xfs_btree_cur	**curpp,
-	struct xfs_buf		**agi_bpp)
-{
-	struct xfs_btree_cur	*cur;
-	int			error;
-
-	ASSERT(*agi_bpp == NULL);
-	ASSERT(*curpp == NULL);
-
-	error = xfs_ialloc_read_agi(pag, tp, agi_bpp);
-	if (error)
-		return error;
-
-	cur = xfs_inobt_init_cursor(pag, tp, *agi_bpp, which);
-	*curpp = cur;
-	return 0;
-}
-
 static int
-xfs_inobt_count_blocks(
+xfs_finobt_count_blocks(
 	struct xfs_perag	*pag,
 	struct xfs_trans	*tp,
-	xfs_btnum_t		btnum,
 	xfs_extlen_t		*tree_blocks)
 {
 	struct xfs_buf		*agbp = NULL;
-	struct xfs_btree_cur	*cur = NULL;
+	struct xfs_btree_cur	*cur;
 	int			error;
 
-	error = xfs_inobt_cur(pag, tp, btnum, &cur, &agbp);
+	error = xfs_ialloc_read_agi(pag, tp, &agbp);
 	if (error)
 		return error;
 
+	cur = xfs_inobt_init_cursor(pag, tp, agbp);
 	error = xfs_btree_count_blocks(cur, tree_blocks);
 	xfs_btree_del_cursor(cur, error);
 	xfs_trans_brelse(tp, agbp);
@@ -807,8 +797,7 @@ xfs_finobt_calc_reserves(
 	if (xfs_has_inobtcounts(pag->pag_mount))
 		error = xfs_finobt_read_blocks(pag, tp, &tree_len);
 	else
-		error = xfs_inobt_count_blocks(pag, tp, XFS_BTNUM_FINO,
-				&tree_len);
+		error = xfs_finobt_count_blocks(pag, tp, &tree_len);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
index 3262c3fe5ebe..6472ec1ecbb4 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.h
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
@@ -46,10 +46,10 @@ struct xfs_perag;
 		 (maxrecs) * sizeof(xfs_inobt_key_t) + \
 		 ((index) - 1) * sizeof(xfs_inobt_ptr_t)))
 
-extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_perag *pag,
-		struct xfs_trans *tp, struct xfs_buf *agbp, xfs_btnum_t btnum);
-struct xfs_btree_cur *xfs_inobt_stage_cursor(struct xfs_perag *pag,
-		struct xbtree_afakeroot *afake, xfs_btnum_t btnum);
+struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_perag *pag,
+		struct xfs_trans *tp, struct xfs_buf *agbp);
+struct xfs_btree_cur *xfs_finobt_init_cursor(struct xfs_perag *pag,
+		struct xfs_trans *tp, struct xfs_buf *agbp);
 extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
 
 /* ir_holemask to inode allocation bitmap conversion */
@@ -66,9 +66,6 @@ int xfs_finobt_calc_reserves(struct xfs_perag *perag, struct xfs_trans *tp,
 		xfs_extlen_t *ask, xfs_extlen_t *used);
 extern xfs_extlen_t xfs_iallocbt_calc_size(struct xfs_mount *mp,
 		unsigned long long len);
-int xfs_inobt_cur(struct xfs_perag *pag, struct xfs_trans *tp,
-		xfs_btnum_t btnum, struct xfs_btree_cur **curpp,
-		struct xfs_buf **agi_bpp);
 
 void xfs_inobt_commit_staged_btree(struct xfs_btree_cur *cur,
 		struct xfs_trans *tp, struct xfs_buf *agbp);
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
index f4e6b200cdf8..8796f2b3e534 100644
--- a/fs/xfs/libxfs/xfs_iext_tree.c
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -394,11 +394,18 @@ xfs_iext_leaf_key(
 	return leaf->recs[n].lo & XFS_IEXT_STARTOFF_MASK;
 }
 
+static inline void *
+xfs_iext_alloc_node(
+	int	size)
+{
+	return kzalloc(size, GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
+}
+
 static void
 xfs_iext_grow(
 	struct xfs_ifork	*ifp)
 {
-	struct xfs_iext_node	*node = kmem_zalloc(NODE_SIZE, KM_NOFS);
+	struct xfs_iext_node	*node = xfs_iext_alloc_node(NODE_SIZE);
 	int			i;
 
 	if (ifp->if_height == 1) {
@@ -454,7 +461,7 @@ xfs_iext_split_node(
 	int			*nr_entries)
 {
 	struct xfs_iext_node	*node = *nodep;
-	struct xfs_iext_node	*new = kmem_zalloc(NODE_SIZE, KM_NOFS);
+	struct xfs_iext_node	*new = xfs_iext_alloc_node(NODE_SIZE);
 	const int		nr_move = KEYS_PER_NODE / 2;
 	int			nr_keep = nr_move + (KEYS_PER_NODE & 1);
 	int			i = 0;
@@ -542,7 +549,7 @@ xfs_iext_split_leaf(
 	int			*nr_entries)
 {
 	struct xfs_iext_leaf	*leaf = cur->leaf;
-	struct xfs_iext_leaf	*new = kmem_zalloc(NODE_SIZE, KM_NOFS);
+	struct xfs_iext_leaf	*new = xfs_iext_alloc_node(NODE_SIZE);
 	const int		nr_move = RECS_PER_LEAF / 2;
 	int			nr_keep = nr_move + (RECS_PER_LEAF & 1);
 	int			i;
@@ -583,7 +590,7 @@ xfs_iext_alloc_root(
 {
 	ASSERT(ifp->if_bytes == 0);
 
-	ifp->if_data = kmem_zalloc(sizeof(struct xfs_iext_rec), KM_NOFS);
+	ifp->if_data = xfs_iext_alloc_node(sizeof(struct xfs_iext_rec));
 	ifp->if_height = 1;
 
 	/* now that we have a node step into it */
@@ -603,7 +610,8 @@ xfs_iext_realloc_root(
 	if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF)
 		new_size = NODE_SIZE;
 
-	new = krealloc(ifp->if_data, new_size, GFP_NOFS | __GFP_NOFAIL);
+	new = krealloc(ifp->if_data, new_size,
+			GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
 	memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes);
 	ifp->if_data = new;
 	cur->leaf = new;
@@ -743,7 +751,7 @@ xfs_iext_remove_node(
 again:
 	ASSERT(node->ptrs[pos]);
 	ASSERT(node->ptrs[pos] == victim);
-	kmem_free(victim);
+	kfree(victim);
 
 	nr_entries = xfs_iext_node_nr_entries(node, pos) - 1;
 	offset = node->keys[0];
@@ -789,7 +797,7 @@ again:
 		ASSERT(node == ifp->if_data);
 		ifp->if_data = node->ptrs[0];
 		ifp->if_height--;
-		kmem_free(node);
+		kfree(node);
 	}
 }
 
@@ -863,7 +871,7 @@ xfs_iext_free_last_leaf(
 	struct xfs_ifork	*ifp)
 {
 	ifp->if_height--;
-	kmem_free(ifp->if_data);
+	kfree(ifp->if_data);
 	ifp->if_data = NULL;
 }
 
@@ -1044,7 +1052,7 @@ xfs_iext_destroy_node(
 		}
 	}
 
-	kmem_free(node);
+	kfree(node);
 }
 
 void
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 137a65bda95d..d0dcce462bf4 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -18,6 +18,7 @@
 #include "xfs_trans.h"
 #include "xfs_ialloc.h"
 #include "xfs_dir2.h"
+#include "xfs_health.h"
 
 #include <linux/iversion.h>
 
@@ -132,9 +133,14 @@ xfs_imap_to_bp(
 	struct xfs_imap		*imap,
 	struct xfs_buf		**bpp)
 {
-	return xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
-				   imap->im_len, XBF_UNMAPPED, bpp,
-				   &xfs_inode_buf_ops);
+	int			error;
+
+	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
+			imap->im_len, XBF_UNMAPPED, bpp, &xfs_inode_buf_ops);
+	if (xfs_metadata_is_sick(error))
+		xfs_agno_mark_sick(mp, xfs_daddr_to_agno(mp, imap->im_blkno),
+				XFS_SICK_AG_INODES);
+	return error;
 }
 
 static inline struct timespec64 xfs_inode_decode_bigtime(uint64_t ts)
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index f4569e18a8d0..7d660a973909 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -25,6 +25,8 @@
 #include "xfs_attr_leaf.h"
 #include "xfs_types.h"
 #include "xfs_errortag.h"
+#include "xfs_health.h"
+#include "xfs_symlink_remote.h"
 
 struct kmem_cache *xfs_ifork_cache;
 
@@ -50,7 +52,8 @@ xfs_init_local_fork(
 		mem_size++;
 
 	if (size) {
-		char *new_data = kmem_alloc(mem_size, KM_NOFS);
+		char *new_data = kmalloc(mem_size,
+				GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
 
 		memcpy(new_data, data, size);
 		if (zero_terminate)
@@ -77,7 +80,7 @@ xfs_iformat_local(
 	/*
 	 * If the size is unreasonable, then something
 	 * is wrong and we just bail out rather than crash in
-	 * kmem_alloc() or memcpy() below.
+	 * kmalloc() or memcpy() below.
 	 */
 	if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
 		xfs_warn(ip->i_mount,
@@ -87,6 +90,7 @@ xfs_iformat_local(
 		xfs_inode_verifier_error(ip, -EFSCORRUPTED,
 				"xfs_iformat_local", dip, sizeof(*dip),
 				__this_address);
+		xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
 		return -EFSCORRUPTED;
 	}
 
@@ -116,7 +120,7 @@ xfs_iformat_extents(
 
 	/*
 	 * If the number of extents is unreasonable, then something is wrong and
-	 * we just bail out rather than crash in kmem_alloc() or memcpy() below.
+	 * we just bail out rather than crash in kmalloc() or memcpy() below.
 	 */
 	if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) {
 		xfs_warn(ip->i_mount, "corrupt inode %llu ((a)extents = %llu).",
@@ -124,6 +128,7 @@ xfs_iformat_extents(
 		xfs_inode_verifier_error(ip, -EFSCORRUPTED,
 				"xfs_iformat_extents(1)", dip, sizeof(*dip),
 				__this_address);
+		xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
 		return -EFSCORRUPTED;
 	}
 
@@ -143,6 +148,7 @@ xfs_iformat_extents(
 				xfs_inode_verifier_error(ip, -EFSCORRUPTED,
 						"xfs_iformat_extents(2)",
 						dp, sizeof(*dp), fa);
+				xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
 				return xfs_bmap_complain_bad_rec(ip, whichfork,
 						fa, &new);
 			}
@@ -201,11 +207,13 @@ xfs_iformat_btree(
 		xfs_inode_verifier_error(ip, -EFSCORRUPTED,
 				"xfs_iformat_btree", dfp, size,
 				__this_address);
+		xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
 		return -EFSCORRUPTED;
 	}
 
 	ifp->if_broot_bytes = size;
-	ifp->if_broot = kmem_alloc(size, KM_NOFS);
+	ifp->if_broot = kmalloc(size,
+				GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
 	ASSERT(ifp->if_broot != NULL);
 	/*
 	 * Copy and convert from the on-disk structure
@@ -265,12 +273,14 @@ xfs_iformat_data_fork(
 		default:
 			xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
 					dip, sizeof(*dip), __this_address);
+			xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
 			return -EFSCORRUPTED;
 		}
 		break;
 	default:
 		xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
 				sizeof(*dip), __this_address);
+		xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
 		return -EFSCORRUPTED;
 	}
 }
@@ -342,6 +352,7 @@ xfs_iformat_attr_fork(
 	default:
 		xfs_inode_verifier_error(ip, error, __func__, dip,
 				sizeof(*dip), __this_address);
+		xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
 		error = -EFSCORRUPTED;
 		break;
 	}
@@ -399,7 +410,8 @@ xfs_iroot_realloc(
 		 */
 		if (ifp->if_broot_bytes == 0) {
 			new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, rec_diff);
-			ifp->if_broot = kmem_alloc(new_size, KM_NOFS);
+			ifp->if_broot = kmalloc(new_size,
+						GFP_KERNEL | __GFP_NOFAIL);
 			ifp->if_broot_bytes = (int)new_size;
 			return;
 		}
@@ -414,7 +426,7 @@ xfs_iroot_realloc(
 		new_max = cur_max + rec_diff;
 		new_size = XFS_BMAP_BROOT_SPACE_CALC(mp, new_max);
 		ifp->if_broot = krealloc(ifp->if_broot, new_size,
-					 GFP_NOFS | __GFP_NOFAIL);
+					 GFP_KERNEL | __GFP_NOFAIL);
 		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
 						     ifp->if_broot_bytes);
 		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
@@ -440,7 +452,7 @@ xfs_iroot_realloc(
 	else
 		new_size = 0;
 	if (new_size > 0) {
-		new_broot = kmem_alloc(new_size, KM_NOFS);
+		new_broot = kmalloc(new_size, GFP_KERNEL | __GFP_NOFAIL);
 		/*
 		 * First copy over the btree block header.
 		 */
@@ -470,7 +482,7 @@ xfs_iroot_realloc(
 						     (int)new_size);
 		memcpy(np, op, new_max * (uint)sizeof(xfs_fsblock_t));
 	}
-	kmem_free(ifp->if_broot);
+	kfree(ifp->if_broot);
 	ifp->if_broot = new_broot;
 	ifp->if_broot_bytes = (int)new_size;
 	if (ifp->if_broot)
@@ -488,7 +500,7 @@ xfs_iroot_realloc(
  *
  * If the amount of space needed has decreased below the size of the
  * inline buffer, then switch to using the inline buffer.  Otherwise,
- * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
+ * use krealloc() or kmalloc() to adjust the size of the buffer
  * to what is needed.
  *
  * ip -- the inode whose if_data area is changing
@@ -509,7 +521,7 @@ xfs_idata_realloc(
 
 	if (byte_diff) {
 		ifp->if_data = krealloc(ifp->if_data, new_size,
-					GFP_NOFS | __GFP_NOFAIL);
+					GFP_KERNEL | __GFP_NOFAIL);
 		if (new_size == 0)
 			ifp->if_data = NULL;
 		ifp->if_bytes = new_size;
@@ -524,13 +536,13 @@ xfs_idestroy_fork(
 	struct xfs_ifork	*ifp)
 {
 	if (ifp->if_broot != NULL) {
-		kmem_free(ifp->if_broot);
+		kfree(ifp->if_broot);
 		ifp->if_broot = NULL;
 	}
 
 	switch (ifp->if_format) {
 	case XFS_DINODE_FMT_LOCAL:
-		kmem_free(ifp->if_data);
+		kfree(ifp->if_data);
 		ifp->if_data = NULL;
 		break;
 	case XFS_DINODE_FMT_EXTENTS:
@@ -562,7 +574,7 @@ xfs_iextents_copy(
 	struct xfs_bmbt_irec	rec;
 	int64_t			copied = 0;
 
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED));
+	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED);
 	ASSERT(ifp->if_bytes > 0);
 
 	for_each_xfs_iext(ifp, &icur, &rec) {
@@ -689,7 +701,7 @@ xfs_ifork_init_cow(
 		return;
 
 	ip->i_cowfp = kmem_cache_zalloc(xfs_ifork_cache,
-				       GFP_NOFS | __GFP_NOFAIL);
+				GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
 	ip->i_cowfp->if_format = XFS_DINODE_FMT_EXTENTS;
 }
 
@@ -802,3 +814,12 @@ xfs_iext_count_upgrade(
 
 	return 0;
 }
+
+/* Decide if a file mapping is on the realtime device or not. */
+bool
+xfs_ifork_is_realtime(
+	struct xfs_inode	*ip,
+	int			whichfork)
+{
+	return XFS_IS_REALTIME_INODE(ip) && whichfork != XFS_ATTR_FORK;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 96303249d28a..bd53eb951b65 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -260,6 +260,7 @@ int xfs_iext_count_may_overflow(struct xfs_inode *ip, int whichfork,
 		int nr_to_add);
 int xfs_iext_count_upgrade(struct xfs_trans *tp, struct xfs_inode *ip,
 		uint nr_to_add);
+bool xfs_ifork_is_realtime(struct xfs_inode *ip, int whichfork);
 
 /* returns true if the fork has extents but they are not read in yet. */
 static inline bool xfs_need_iread_extents(const struct xfs_ifork *ifp)
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 269573c82808..16872972e1e9 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -838,10 +838,12 @@ struct xfs_cud_log_format {
 
 #define XFS_BMAP_EXTENT_ATTR_FORK	(1U << 31)
 #define XFS_BMAP_EXTENT_UNWRITTEN	(1U << 30)
+#define XFS_BMAP_EXTENT_REALTIME	(1U << 29)
 
 #define XFS_BMAP_EXTENT_FLAGS		(XFS_BMAP_EXTENT_TYPE_MASK | \
 					 XFS_BMAP_EXTENT_ATTR_FORK | \
-					 XFS_BMAP_EXTENT_UNWRITTEN)
+					 XFS_BMAP_EXTENT_UNWRITTEN | \
+					 XFS_BMAP_EXTENT_REALTIME)
 
 /*
  * This is the structure used to lay out an bui log item in the
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 6709a7f8bad5..511c912d515c 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -23,6 +23,7 @@
 #include "xfs_refcount.h"
 #include "xfs_rmap.h"
 #include "xfs_ag.h"
+#include "xfs_health.h"
 
 struct kmem_cache	*xfs_refcount_intent_cache;
 
@@ -156,6 +157,7 @@ xfs_refcount_complain_bad_rec(
 	xfs_warn(mp,
 		"Start block 0x%x, block count 0x%x, references 0x%x",
 		irec->rc_startblock, irec->rc_blockcount, irec->rc_refcount);
+	xfs_btree_mark_sick(cur);
 	return -EFSCORRUPTED;
 }
 
@@ -238,6 +240,7 @@ xfs_refcount_insert(
 	if (error)
 		goto out_error;
 	if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto out_error;
 	}
@@ -268,12 +271,14 @@ xfs_refcount_delete(
 	if (error)
 		goto out_error;
 	if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto out_error;
 	}
 	trace_xfs_refcount_delete(cur->bc_mp, cur->bc_ag.pag->pag_agno, &irec);
 	error = xfs_btree_delete(cur, i);
 	if (XFS_IS_CORRUPT(cur->bc_mp, *i != 1)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto out_error;
 	}
@@ -398,6 +403,7 @@ xfs_refcount_split_extent(
 	if (error)
 		goto out_error;
 	if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto out_error;
 	}
@@ -425,6 +431,7 @@ xfs_refcount_split_extent(
 	if (error)
 		goto out_error;
 	if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto out_error;
 	}
@@ -470,6 +477,7 @@ xfs_refcount_merge_center_extents(
 	if (error)
 		goto out_error;
 	if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto out_error;
 	}
@@ -478,6 +486,7 @@ xfs_refcount_merge_center_extents(
 	if (error)
 		goto out_error;
 	if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto out_error;
 	}
@@ -487,6 +496,7 @@ xfs_refcount_merge_center_extents(
 		if (error)
 			goto out_error;
 		if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
@@ -498,6 +508,7 @@ xfs_refcount_merge_center_extents(
 	if (error)
 		goto out_error;
 	if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto out_error;
 	}
@@ -542,6 +553,7 @@ xfs_refcount_merge_left_extent(
 		if (error)
 			goto out_error;
 		if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
@@ -550,6 +562,7 @@ xfs_refcount_merge_left_extent(
 		if (error)
 			goto out_error;
 		if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
@@ -561,6 +574,7 @@ xfs_refcount_merge_left_extent(
 	if (error)
 		goto out_error;
 	if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto out_error;
 	}
@@ -608,6 +622,7 @@ xfs_refcount_merge_right_extent(
 		if (error)
 			goto out_error;
 		if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
@@ -616,6 +631,7 @@ xfs_refcount_merge_right_extent(
 		if (error)
 			goto out_error;
 		if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
@@ -627,6 +643,7 @@ xfs_refcount_merge_right_extent(
 	if (error)
 		goto out_error;
 	if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto out_error;
 	}
@@ -674,6 +691,7 @@ xfs_refcount_find_left_extents(
 	if (error)
 		goto out_error;
 	if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto out_error;
 	}
@@ -693,6 +711,7 @@ xfs_refcount_find_left_extents(
 		if (error)
 			goto out_error;
 		if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
@@ -767,6 +786,7 @@ xfs_refcount_find_right_extents(
 	if (error)
 		goto out_error;
 	if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto out_error;
 	}
@@ -786,6 +806,7 @@ xfs_refcount_find_right_extents(
 		if (error)
 			goto out_error;
 		if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
@@ -1056,7 +1077,7 @@ xfs_refcount_still_have_space(
 	 * to handle each of the shape changes to the refcount btree.
 	 */
 	overhead = xfs_allocfree_block_count(cur->bc_mp,
-				cur->bc_ag.refc.shape_changes);
+				cur->bc_refc.shape_changes);
 	overhead += cur->bc_mp->m_refc_maxlevels;
 	overhead *= cur->bc_mp->m_sb.sb_blocksize;
 
@@ -1064,17 +1085,17 @@ xfs_refcount_still_have_space(
 	 * Only allow 2 refcount extent updates per transaction if the
 	 * refcount continue update "error" has been injected.
 	 */
-	if (cur->bc_ag.refc.nr_ops > 2 &&
+	if (cur->bc_refc.nr_ops > 2 &&
 	    XFS_TEST_ERROR(false, cur->bc_mp,
 			XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE))
 		return false;
 
-	if (cur->bc_ag.refc.nr_ops == 0)
+	if (cur->bc_refc.nr_ops == 0)
 		return true;
 	else if (overhead > cur->bc_tp->t_log_res)
 		return false;
-	return  cur->bc_tp->t_log_res - overhead >
-		cur->bc_ag.refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
+	return cur->bc_tp->t_log_res - overhead >
+		cur->bc_refc.nr_ops * XFS_REFCOUNT_ITEM_OVERHEAD;
 }
 
 /*
@@ -1134,7 +1155,7 @@ xfs_refcount_adjust_extents(
 			 * Either cover the hole (increment) or
 			 * delete the range (decrement).
 			 */
-			cur->bc_ag.refc.nr_ops++;
+			cur->bc_refc.nr_ops++;
 			if (tmp.rc_refcount) {
 				error = xfs_refcount_insert(cur, &tmp,
 						&found_tmp);
@@ -1142,6 +1163,7 @@ xfs_refcount_adjust_extents(
 					goto out_error;
 				if (XFS_IS_CORRUPT(cur->bc_mp,
 						   found_tmp != 1)) {
+					xfs_btree_mark_sick(cur);
 					error = -EFSCORRUPTED;
 					goto out_error;
 				}
@@ -1180,6 +1202,7 @@ xfs_refcount_adjust_extents(
 		 */
 		if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount == 0) ||
 		    XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount > *aglen)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
@@ -1193,7 +1216,7 @@ xfs_refcount_adjust_extents(
 		ext.rc_refcount += adj;
 		trace_xfs_refcount_modify_extent(cur->bc_mp,
 				cur->bc_ag.pag->pag_agno, &ext);
-		cur->bc_ag.refc.nr_ops++;
+		cur->bc_refc.nr_ops++;
 		if (ext.rc_refcount > 1) {
 			error = xfs_refcount_update(cur, &ext);
 			if (error)
@@ -1203,6 +1226,7 @@ xfs_refcount_adjust_extents(
 			if (error)
 				goto out_error;
 			if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto out_error;
 			}
@@ -1281,7 +1305,7 @@ xfs_refcount_adjust(
 	if (shape_changed)
 		shape_changes++;
 	if (shape_changes)
-		cur->bc_ag.refc.shape_changes++;
+		cur->bc_refc.shape_changes++;
 
 	/* Now that we've taken care of the ends, adjust the middle extents */
 	error = xfs_refcount_adjust_extents(cur, agbno, aglen, adj);
@@ -1327,8 +1351,10 @@ xfs_refcount_continue_op(
 	struct xfs_perag		*pag = cur->bc_ag.pag;
 
 	if (XFS_IS_CORRUPT(mp, !xfs_verify_agbext(pag, new_agbno,
-					ri->ri_blockcount)))
+					ri->ri_blockcount))) {
+		xfs_btree_mark_sick(cur);
 		return -EFSCORRUPTED;
+	}
 
 	ri->ri_startblock = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno);
 
@@ -1374,8 +1400,8 @@ xfs_refcount_finish_one(
 	 */
 	rcur = *pcur;
 	if (rcur != NULL && rcur->bc_ag.pag != ri->ri_pag) {
-		nr_ops = rcur->bc_ag.refc.nr_ops;
-		shape_changes = rcur->bc_ag.refc.shape_changes;
+		nr_ops = rcur->bc_refc.nr_ops;
+		shape_changes = rcur->bc_refc.shape_changes;
 		xfs_refcount_finish_one_cleanup(tp, rcur, 0);
 		rcur = NULL;
 		*pcur = NULL;
@@ -1387,8 +1413,8 @@ xfs_refcount_finish_one(
 			return error;
 
 		rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, ri->ri_pag);
-		rcur->bc_ag.refc.nr_ops = nr_ops;
-		rcur->bc_ag.refc.shape_changes = shape_changes;
+		rcur->bc_refc.nr_ops = nr_ops;
+		rcur->bc_refc.shape_changes = shape_changes;
 	}
 	*pcur = rcur;
 
@@ -1449,7 +1475,7 @@ __xfs_refcount_add(
 			blockcount);
 
 	ri = kmem_cache_alloc(xfs_refcount_intent_cache,
-			GFP_NOFS | __GFP_NOFAIL);
+			GFP_KERNEL | __GFP_NOFAIL);
 	INIT_LIST_HEAD(&ri->ri_list);
 	ri->ri_type = type;
 	ri->ri_startblock = startblock;
@@ -1535,6 +1561,7 @@ xfs_refcount_find_shared(
 	if (error)
 		goto out_error;
 	if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto out_error;
 	}
@@ -1552,6 +1579,7 @@ xfs_refcount_find_shared(
 		if (error)
 			goto out_error;
 		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
@@ -1585,6 +1613,7 @@ xfs_refcount_find_shared(
 		if (error)
 			goto out_error;
 		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
@@ -1682,6 +1711,7 @@ xfs_refcount_adjust_cow_extents(
 		goto out_error;
 	if (XFS_IS_CORRUPT(cur->bc_mp, found_rec &&
 				ext.rc_domain != XFS_REFC_DOMAIN_COW)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto out_error;
 	}
@@ -1697,6 +1727,7 @@ xfs_refcount_adjust_cow_extents(
 		/* Adding a CoW reservation, there should be nothing here. */
 		if (XFS_IS_CORRUPT(cur->bc_mp,
 				   agbno + aglen > ext.rc_startblock)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
@@ -1714,6 +1745,7 @@ xfs_refcount_adjust_cow_extents(
 		if (error)
 			goto out_error;
 		if (XFS_IS_CORRUPT(cur->bc_mp, found_tmp != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
@@ -1721,14 +1753,17 @@ xfs_refcount_adjust_cow_extents(
 	case XFS_REFCOUNT_ADJUST_COW_FREE:
 		/* Removing a CoW reservation, there should be one extent. */
 		if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_startblock != agbno)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
 		if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_blockcount != aglen)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
 		if (XFS_IS_CORRUPT(cur->bc_mp, ext.rc_refcount != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
@@ -1740,6 +1775,7 @@ xfs_refcount_adjust_cow_extents(
 		if (error)
 			goto out_error;
 		if (XFS_IS_CORRUPT(cur->bc_mp, found_rec != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
@@ -1889,8 +1925,10 @@ xfs_refcount_recover_extent(
 	struct xfs_refcount_recovery	*rr;
 
 	if (XFS_IS_CORRUPT(cur->bc_mp,
-			   be32_to_cpu(rec->refc.rc_refcount) != 1))
+			   be32_to_cpu(rec->refc.rc_refcount) != 1)) {
+		xfs_btree_mark_sick(cur);
 		return -EFSCORRUPTED;
+	}
 
 	rr = kmalloc(sizeof(struct xfs_refcount_recovery),
 			GFP_KERNEL | __GFP_NOFAIL);
@@ -1900,6 +1938,7 @@ xfs_refcount_recover_extent(
 	if (xfs_refcount_check_irec(cur->bc_ag.pag, &rr->rr_rrec) != NULL ||
 	    XFS_IS_CORRUPT(cur->bc_mp,
 			   rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) {
+		xfs_btree_mark_sick(cur);
 		kfree(rr);
 		return -EFSCORRUPTED;
 	}
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 0d80bd99147c..ca59f6c89f3e 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -16,6 +16,7 @@
 #include "xfs_refcount.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
+#include "xfs_health.h"
 #include "xfs_trace.h"
 #include "xfs_trans.h"
 #include "xfs_bit.h"
@@ -77,8 +78,6 @@ xfs_refcountbt_alloc_block(
 					xfs_refc_block(args.mp)));
 	if (error)
 		goto out_error;
-	trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_ag.pag->pag_agno,
-			args.agbno, 1);
 	if (args.fsbno == NULLFSBLOCK) {
 		*stat = 0;
 		return 0;
@@ -107,8 +106,6 @@ xfs_refcountbt_free_block(
 	struct xfs_agf		*agf = agbp->b_addr;
 	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
 
-	trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_ag.pag->pag_agno,
-			XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1);
 	be32_add_cpu(&agf->agf_refcount_blocks, -1);
 	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
 	return xfs_free_extent_later(cur->bc_tp, fsbno, 1,
@@ -220,7 +217,7 @@ xfs_refcountbt_verify(
 
 	if (!xfs_has_reflink(mp))
 		return __this_address;
-	fa = xfs_btree_sblock_v5hdr_verify(bp);
+	fa = xfs_btree_agblock_v5hdr_verify(bp);
 	if (fa)
 		return fa;
 
@@ -242,7 +239,7 @@ xfs_refcountbt_verify(
 	} else if (level >= mp->m_refc_maxlevels)
 		return __this_address;
 
-	return xfs_btree_sblock_verify(bp, mp->m_refc_mxr[level != 0]);
+	return xfs_btree_agblock_verify(bp, mp->m_refc_mxr[level != 0]);
 }
 
 STATIC void
@@ -251,7 +248,7 @@ xfs_refcountbt_read_verify(
 {
 	xfs_failaddr_t	fa;
 
-	if (!xfs_btree_sblock_verify_crc(bp))
+	if (!xfs_btree_agblock_verify_crc(bp))
 		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
 	else {
 		fa = xfs_refcountbt_verify(bp);
@@ -275,7 +272,7 @@ xfs_refcountbt_write_verify(
 		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
 		return;
 	}
-	xfs_btree_sblock_calc_crc(bp);
+	xfs_btree_agblock_calc_crc(bp);
 
 }
 
@@ -321,9 +318,17 @@ xfs_refcountbt_keys_contiguous(
 				 be32_to_cpu(key2->refc.rc_startblock));
 }
 
-static const struct xfs_btree_ops xfs_refcountbt_ops = {
+const struct xfs_btree_ops xfs_refcountbt_ops = {
+	.name			= "refcount",
+	.type			= XFS_BTREE_TYPE_AG,
+
 	.rec_len		= sizeof(struct xfs_refcount_rec),
 	.key_len		= sizeof(struct xfs_refcount_key),
+	.ptr_len		= XFS_BTREE_SHORT_PTR_LEN,
+
+	.lru_refs		= XFS_REFC_BTREE_REF,
+	.statoff		= XFS_STATS_CALC_INDEX(xs_refcbt_2),
+	.sick_mask		= XFS_SICK_AG_REFCNTBT,
 
 	.dup_cursor		= xfs_refcountbt_dup_cursor,
 	.set_root		= xfs_refcountbt_set_root,
@@ -344,59 +349,32 @@ static const struct xfs_btree_ops xfs_refcountbt_ops = {
 };
 
 /*
- * Initialize a new refcount btree cursor.
+ * Create a new refcount btree cursor.
+ *
+ * For staging cursors tp and agbp are NULL.
  */
-static struct xfs_btree_cur *
-xfs_refcountbt_init_common(
+struct xfs_btree_cur *
+xfs_refcountbt_init_cursor(
 	struct xfs_mount	*mp,
 	struct xfs_trans	*tp,
+	struct xfs_buf		*agbp,
 	struct xfs_perag	*pag)
 {
 	struct xfs_btree_cur	*cur;
 
 	ASSERT(pag->pag_agno < mp->m_sb.sb_agcount);
 
-	cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_REFC,
+	cur = xfs_btree_alloc_cursor(mp, tp, &xfs_refcountbt_ops,
 			mp->m_refc_maxlevels, xfs_refcountbt_cur_cache);
-	cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2);
-
-	cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
-
 	cur->bc_ag.pag = xfs_perag_hold(pag);
-	cur->bc_ag.refc.nr_ops = 0;
-	cur->bc_ag.refc.shape_changes = 0;
-	cur->bc_ops = &xfs_refcountbt_ops;
-	return cur;
-}
-
-/* Create a btree cursor. */
-struct xfs_btree_cur *
-xfs_refcountbt_init_cursor(
-	struct xfs_mount	*mp,
-	struct xfs_trans	*tp,
-	struct xfs_buf		*agbp,
-	struct xfs_perag	*pag)
-{
-	struct xfs_agf		*agf = agbp->b_addr;
-	struct xfs_btree_cur	*cur;
-
-	cur = xfs_refcountbt_init_common(mp, tp, pag);
-	cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level);
+	cur->bc_refc.nr_ops = 0;
+	cur->bc_refc.shape_changes = 0;
 	cur->bc_ag.agbp = agbp;
-	return cur;
-}
+	if (agbp) {
+		struct xfs_agf		*agf = agbp->b_addr;
 
-/* Create a btree cursor with a fake root for staging. */
-struct xfs_btree_cur *
-xfs_refcountbt_stage_cursor(
-	struct xfs_mount	*mp,
-	struct xbtree_afakeroot	*afake,
-	struct xfs_perag	*pag)
-{
-	struct xfs_btree_cur	*cur;
-
-	cur = xfs_refcountbt_init_common(mp, NULL, pag);
-	xfs_btree_stage_afakeroot(cur, afake);
+		cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level);
+	}
 	return cur;
 }
 
@@ -421,7 +399,7 @@ xfs_refcountbt_commit_staged_btree(
 	xfs_alloc_log_agf(tp, agbp, XFS_AGF_REFCOUNT_BLOCKS |
 				    XFS_AGF_REFCOUNT_ROOT |
 				    XFS_AGF_REFCOUNT_LEVEL);
-	xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_refcountbt_ops);
+	xfs_btree_commit_afakeroot(cur, tp, agbp);
 }
 
 /* Calculate number of records in a refcount btree block. */
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h
index d66b37259bed..1e0ab25f6c68 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.h
+++ b/fs/xfs/libxfs/xfs_refcount_btree.h
@@ -48,8 +48,6 @@ struct xbtree_afakeroot;
 extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp,
 		struct xfs_trans *tp, struct xfs_buf *agbp,
 		struct xfs_perag *pag);
-struct xfs_btree_cur *xfs_refcountbt_stage_cursor(struct xfs_mount *mp,
-		struct xbtree_afakeroot *afake, struct xfs_perag *pag);
 extern int xfs_refcountbt_maxrecs(int blocklen, bool leaf);
 extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp);
 
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 76bf7f48cb5a..ef16f6f9cef6 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -23,6 +23,7 @@
 #include "xfs_error.h"
 #include "xfs_inode.h"
 #include "xfs_ag.h"
+#include "xfs_health.h"
 
 struct kmem_cache	*xfs_rmap_intent_cache;
 
@@ -56,8 +57,10 @@ xfs_rmap_lookup_le(
 	error = xfs_rmap_get_rec(cur, irec, &get_stat);
 	if (error)
 		return error;
-	if (!get_stat)
+	if (!get_stat) {
+		xfs_btree_mark_sick(cur);
 		return -EFSCORRUPTED;
+	}
 
 	return 0;
 }
@@ -132,6 +135,7 @@ xfs_rmap_insert(
 	if (error)
 		goto done;
 	if (XFS_IS_CORRUPT(rcur->bc_mp, i != 0)) {
+		xfs_btree_mark_sick(rcur);
 		error = -EFSCORRUPTED;
 		goto done;
 	}
@@ -145,6 +149,7 @@ xfs_rmap_insert(
 	if (error)
 		goto done;
 	if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) {
+		xfs_btree_mark_sick(rcur);
 		error = -EFSCORRUPTED;
 		goto done;
 	}
@@ -174,6 +179,7 @@ xfs_rmap_delete(
 	if (error)
 		goto done;
 	if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) {
+		xfs_btree_mark_sick(rcur);
 		error = -EFSCORRUPTED;
 		goto done;
 	}
@@ -182,6 +188,7 @@ xfs_rmap_delete(
 	if (error)
 		goto done;
 	if (XFS_IS_CORRUPT(rcur->bc_mp, i != 1)) {
+		xfs_btree_mark_sick(rcur);
 		error = -EFSCORRUPTED;
 		goto done;
 	}
@@ -208,10 +215,10 @@ xfs_rmap_btrec_to_irec(
 /* Simple checks for rmap records. */
 xfs_failaddr_t
 xfs_rmap_check_irec(
-	struct xfs_btree_cur		*cur,
+	struct xfs_perag		*pag,
 	const struct xfs_rmap_irec	*irec)
 {
-	struct xfs_mount		*mp = cur->bc_mp;
+	struct xfs_mount		*mp = pag->pag_mount;
 	bool				is_inode;
 	bool				is_unwritten;
 	bool				is_bmbt;
@@ -226,8 +233,8 @@ xfs_rmap_check_irec(
 			return __this_address;
 	} else {
 		/* check for valid extent range, including overflow */
-		if (!xfs_verify_agbext(cur->bc_ag.pag, irec->rm_startblock,
-						       irec->rm_blockcount))
+		if (!xfs_verify_agbext(pag, irec->rm_startblock,
+					    irec->rm_blockcount))
 			return __this_address;
 	}
 
@@ -262,6 +269,16 @@ xfs_rmap_check_irec(
 	return NULL;
 }
 
+static inline xfs_failaddr_t
+xfs_rmap_check_btrec(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*irec)
+{
+	if (xfs_btree_is_mem_rmap(cur->bc_ops))
+		return xfs_rmap_check_irec(cur->bc_mem.pag, irec);
+	return xfs_rmap_check_irec(cur->bc_ag.pag, irec);
+}
+
 static inline int
 xfs_rmap_complain_bad_rec(
 	struct xfs_btree_cur		*cur,
@@ -270,13 +287,18 @@ xfs_rmap_complain_bad_rec(
 {
 	struct xfs_mount		*mp = cur->bc_mp;
 
-	xfs_warn(mp,
-		"Reverse Mapping BTree record corruption in AG %d detected at %pS!",
-		cur->bc_ag.pag->pag_agno, fa);
+	if (xfs_btree_is_mem_rmap(cur->bc_ops))
+		xfs_warn(mp,
+ "In-Memory Reverse Mapping BTree record corruption detected at %pS!", fa);
+	else
+		xfs_warn(mp,
+ "Reverse Mapping BTree record corruption in AG %d detected at %pS!",
+			cur->bc_ag.pag->pag_agno, fa);
 	xfs_warn(mp,
 		"Owner 0x%llx, flags 0x%x, start block 0x%x block count 0x%x",
 		irec->rm_owner, irec->rm_flags, irec->rm_startblock,
 		irec->rm_blockcount);
+	xfs_btree_mark_sick(cur);
 	return -EFSCORRUPTED;
 }
 
@@ -299,7 +321,7 @@ xfs_rmap_get_rec(
 
 	fa = xfs_rmap_btrec_to_irec(rec, irec);
 	if (!fa)
-		fa = xfs_rmap_check_irec(cur, irec);
+		fa = xfs_rmap_check_btrec(cur, irec);
 	if (fa)
 		return xfs_rmap_complain_bad_rec(cur, fa, irec);
 
@@ -512,7 +534,7 @@ xfs_rmap_lookup_le_range(
  */
 static int
 xfs_rmap_free_check_owner(
-	struct xfs_mount	*mp,
+	struct xfs_btree_cur	*cur,
 	uint64_t		ltoff,
 	struct xfs_rmap_irec	*rec,
 	xfs_filblks_t		len,
@@ -520,6 +542,7 @@ xfs_rmap_free_check_owner(
 	uint64_t		offset,
 	unsigned int		flags)
 {
+	struct xfs_mount	*mp = cur->bc_mp;
 	int			error = 0;
 
 	if (owner == XFS_RMAP_OWN_UNKNOWN)
@@ -529,12 +552,14 @@ xfs_rmap_free_check_owner(
 	if (XFS_IS_CORRUPT(mp,
 			   (flags & XFS_RMAP_UNWRITTEN) !=
 			   (rec->rm_flags & XFS_RMAP_UNWRITTEN))) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto out;
 	}
 
 	/* Make sure the owner matches what we expect to find in the tree. */
 	if (XFS_IS_CORRUPT(mp, owner != rec->rm_owner)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto out;
 	}
@@ -546,16 +571,19 @@ xfs_rmap_free_check_owner(
 	if (flags & XFS_RMAP_BMBT_BLOCK) {
 		if (XFS_IS_CORRUPT(mp,
 				   !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out;
 		}
 	} else {
 		if (XFS_IS_CORRUPT(mp, rec->rm_offset > offset)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out;
 		}
 		if (XFS_IS_CORRUPT(mp,
 				   offset + len > ltoff + rec->rm_blockcount)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out;
 		}
@@ -618,6 +646,7 @@ xfs_rmap_unmap(
 	if (error)
 		goto out_error;
 	if (XFS_IS_CORRUPT(mp, i != 1)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto out_error;
 	}
@@ -639,6 +668,7 @@ xfs_rmap_unmap(
 		if (XFS_IS_CORRUPT(mp,
 				   bno <
 				   ltrec.rm_startblock + ltrec.rm_blockcount)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
@@ -665,6 +695,7 @@ xfs_rmap_unmap(
 		if (error)
 			goto out_error;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
@@ -677,12 +708,13 @@ xfs_rmap_unmap(
 			   ltrec.rm_startblock > bno ||
 			   ltrec.rm_startblock + ltrec.rm_blockcount <
 			   bno + len)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto out_error;
 	}
 
 	/* Check owner information. */
-	error = xfs_rmap_free_check_owner(mp, ltoff, &ltrec, len, owner,
+	error = xfs_rmap_free_check_owner(cur, ltoff, &ltrec, len, owner,
 			offset, flags);
 	if (error)
 		goto out_error;
@@ -697,6 +729,7 @@ xfs_rmap_unmap(
 		if (error)
 			goto out_error;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
@@ -788,6 +821,86 @@ out_error:
 	return error;
 }
 
+#ifdef CONFIG_XFS_LIVE_HOOKS
+/*
+ * Use a static key here to reduce the overhead of rmapbt live updates.  If
+ * the compiler supports jump labels, the static branch will be replaced by a
+ * nop sled when there are no hook users.  Online fsck is currently the only
+ * caller, so this is a reasonable tradeoff.
+ *
+ * Note: Patching the kernel code requires taking the cpu hotplug lock.  Other
+ * parts of the kernel allocate memory with that lock held, which means that
+ * XFS callers cannot hold any locks that might be used by memory reclaim or
+ * writeback when calling the static_branch_{inc,dec} functions.
+ */
+DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_rmap_hooks_switch);
+
+void
+xfs_rmap_hook_disable(void)
+{
+	xfs_hooks_switch_off(&xfs_rmap_hooks_switch);
+}
+
+void
+xfs_rmap_hook_enable(void)
+{
+	xfs_hooks_switch_on(&xfs_rmap_hooks_switch);
+}
+
+/* Call downstream hooks for a reverse mapping update. */
+static inline void
+xfs_rmap_update_hook(
+	struct xfs_trans		*tp,
+	struct xfs_perag		*pag,
+	enum xfs_rmap_intent_type	op,
+	xfs_agblock_t			startblock,
+	xfs_extlen_t			blockcount,
+	bool				unwritten,
+	const struct xfs_owner_info	*oinfo)
+{
+	if (xfs_hooks_switched_on(&xfs_rmap_hooks_switch)) {
+		struct xfs_rmap_update_params	p = {
+			.startblock	= startblock,
+			.blockcount	= blockcount,
+			.unwritten	= unwritten,
+			.oinfo		= *oinfo, /* struct copy */
+		};
+
+		if (pag)
+			xfs_hooks_call(&pag->pag_rmap_update_hooks, op, &p);
+	}
+}
+
+/* Call the specified function during a reverse mapping update. */
+int
+xfs_rmap_hook_add(
+	struct xfs_perag	*pag,
+	struct xfs_rmap_hook	*hook)
+{
+	return xfs_hooks_add(&pag->pag_rmap_update_hooks, &hook->rmap_hook);
+}
+
+/* Stop calling the specified function during a reverse mapping update. */
+void
+xfs_rmap_hook_del(
+	struct xfs_perag	*pag,
+	struct xfs_rmap_hook	*hook)
+{
+	xfs_hooks_del(&pag->pag_rmap_update_hooks, &hook->rmap_hook);
+}
+
+/* Configure rmap update hook functions. */
+void
+xfs_rmap_hook_setup(
+	struct xfs_rmap_hook	*hook,
+	notifier_fn_t		mod_fn)
+{
+	xfs_hook_setup(&hook->rmap_hook, mod_fn);
+}
+#else
+# define xfs_rmap_update_hook(t, p, o, s, b, u, oi)	do { } while (0)
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
 /*
  * Remove a reference to an extent in the rmap btree.
  */
@@ -808,7 +921,7 @@ xfs_rmap_free(
 		return 0;
 
 	cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
-
+	xfs_rmap_update_hook(tp, pag, XFS_RMAP_UNMAP, bno, len, false, oinfo);
 	error = xfs_rmap_unmap(cur, bno, len, false, oinfo);
 
 	xfs_btree_del_cursor(cur, error);
@@ -900,6 +1013,7 @@ xfs_rmap_map(
 	if (XFS_IS_CORRUPT(mp,
 			   have_lt != 0 &&
 			   ltrec.rm_startblock + ltrec.rm_blockcount > bno)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto out_error;
 	}
@@ -917,10 +1031,12 @@ xfs_rmap_map(
 		if (error)
 			goto out_error;
 		if (XFS_IS_CORRUPT(mp, have_gt != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
 		if (XFS_IS_CORRUPT(mp, bno + len > gtrec.rm_startblock)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
@@ -974,6 +1090,7 @@ xfs_rmap_map(
 			if (error)
 				goto out_error;
 			if (XFS_IS_CORRUPT(mp, i != 1)) {
+				xfs_btree_mark_sick(cur);
 				error = -EFSCORRUPTED;
 				goto out_error;
 			}
@@ -1021,6 +1138,7 @@ xfs_rmap_map(
 		if (error)
 			goto out_error;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
@@ -1055,6 +1173,7 @@ xfs_rmap_alloc(
 		return 0;
 
 	cur = xfs_rmapbt_init_cursor(mp, tp, agbp, pag);
+	xfs_rmap_update_hook(tp, pag, XFS_RMAP_MAP, bno, len, false, oinfo);
 	error = xfs_rmap_map(cur, bno, len, false, oinfo);
 
 	xfs_btree_del_cursor(cur, error);
@@ -1116,6 +1235,7 @@ xfs_rmap_convert(
 	if (error)
 		goto done;
 	if (XFS_IS_CORRUPT(mp, i != 1)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto done;
 	}
@@ -1153,12 +1273,14 @@ xfs_rmap_convert(
 		if (error)
 			goto done;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
 		if (XFS_IS_CORRUPT(mp,
 				   LEFT.rm_startblock + LEFT.rm_blockcount >
 				   bno)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -1181,6 +1303,7 @@ xfs_rmap_convert(
 	if (error)
 		goto done;
 	if (XFS_IS_CORRUPT(mp, i != 1)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto done;
 	}
@@ -1193,10 +1316,12 @@ xfs_rmap_convert(
 		if (error)
 			goto done;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
 		if (XFS_IS_CORRUPT(mp, bno + len > RIGHT.rm_startblock)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -1227,6 +1352,7 @@ xfs_rmap_convert(
 	if (error)
 		goto done;
 	if (XFS_IS_CORRUPT(mp, i != 1)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto done;
 	}
@@ -1246,6 +1372,7 @@ xfs_rmap_convert(
 		if (error)
 			goto done;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -1257,6 +1384,7 @@ xfs_rmap_convert(
 		if (error)
 			goto done;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -1264,6 +1392,7 @@ xfs_rmap_convert(
 		if (error)
 			goto done;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -1275,6 +1404,7 @@ xfs_rmap_convert(
 		if (error)
 			goto done;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -1282,6 +1412,7 @@ xfs_rmap_convert(
 		if (error)
 			goto done;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -1305,6 +1436,7 @@ xfs_rmap_convert(
 		if (error)
 			goto done;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -1312,6 +1444,7 @@ xfs_rmap_convert(
 		if (error)
 			goto done;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -1331,6 +1464,7 @@ xfs_rmap_convert(
 		if (error)
 			goto done;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -1342,6 +1476,7 @@ xfs_rmap_convert(
 		if (error)
 			goto done;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -1349,6 +1484,7 @@ xfs_rmap_convert(
 		if (error)
 			goto done;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -1419,6 +1555,7 @@ xfs_rmap_convert(
 		if (error)
 			goto done;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -1461,6 +1598,7 @@ xfs_rmap_convert(
 		if (error)
 			goto done;
 		if (XFS_IS_CORRUPT(mp, i != 0)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -1476,6 +1614,7 @@ xfs_rmap_convert(
 		if (error)
 			goto done;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -1509,6 +1648,7 @@ xfs_rmap_convert(
 		if (error)
 			goto done;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -1522,6 +1662,7 @@ xfs_rmap_convert(
 		if (error)
 			goto done;
 		if (XFS_IS_CORRUPT(mp, i != 0)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -1534,6 +1675,7 @@ xfs_rmap_convert(
 		if (error)
 			goto done;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -1606,6 +1748,7 @@ xfs_rmap_convert_shared(
 	if (error)
 		goto done;
 	if (XFS_IS_CORRUPT(mp, i != 1)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto done;
 	}
@@ -1634,6 +1777,7 @@ xfs_rmap_convert_shared(
 		if (XFS_IS_CORRUPT(mp,
 				   LEFT.rm_startblock + LEFT.rm_blockcount >
 				   bno)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -1652,10 +1796,12 @@ xfs_rmap_convert_shared(
 		if (error)
 			goto done;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
 		if (XFS_IS_CORRUPT(mp, bno + len > RIGHT.rm_startblock)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -1706,6 +1852,7 @@ xfs_rmap_convert_shared(
 		if (error)
 			goto done;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -1732,6 +1879,7 @@ xfs_rmap_convert_shared(
 		if (error)
 			goto done;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -1758,6 +1906,7 @@ xfs_rmap_convert_shared(
 		if (error)
 			goto done;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -1781,6 +1930,7 @@ xfs_rmap_convert_shared(
 		if (error)
 			goto done;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -1816,6 +1966,7 @@ xfs_rmap_convert_shared(
 		if (error)
 			goto done;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -1861,6 +2012,7 @@ xfs_rmap_convert_shared(
 		if (error)
 			goto done;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -1896,6 +2048,7 @@ xfs_rmap_convert_shared(
 		if (error)
 			goto done;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -1934,6 +2087,7 @@ xfs_rmap_convert_shared(
 		if (error)
 			goto done;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto done;
 		}
@@ -2023,6 +2177,7 @@ xfs_rmap_unmap_shared(
 	if (error)
 		goto out_error;
 	if (XFS_IS_CORRUPT(mp, i != 1)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto out_error;
 	}
@@ -2033,12 +2188,14 @@ xfs_rmap_unmap_shared(
 			   ltrec.rm_startblock > bno ||
 			   ltrec.rm_startblock + ltrec.rm_blockcount <
 			   bno + len)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto out_error;
 	}
 
 	/* Make sure the owner matches what we expect to find in the tree. */
 	if (XFS_IS_CORRUPT(mp, owner != ltrec.rm_owner)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto out_error;
 	}
@@ -2047,16 +2204,19 @@ xfs_rmap_unmap_shared(
 	if (XFS_IS_CORRUPT(mp,
 			   (flags & XFS_RMAP_UNWRITTEN) !=
 			   (ltrec.rm_flags & XFS_RMAP_UNWRITTEN))) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto out_error;
 	}
 
 	/* Check the offset. */
 	if (XFS_IS_CORRUPT(mp, ltrec.rm_offset > offset)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto out_error;
 	}
 	if (XFS_IS_CORRUPT(mp, offset > ltoff + ltrec.rm_blockcount)) {
+		xfs_btree_mark_sick(cur);
 		error = -EFSCORRUPTED;
 		goto out_error;
 	}
@@ -2113,6 +2273,7 @@ xfs_rmap_unmap_shared(
 		if (error)
 			goto out_error;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
@@ -2142,6 +2303,7 @@ xfs_rmap_unmap_shared(
 		if (error)
 			goto out_error;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
@@ -2221,6 +2383,7 @@ xfs_rmap_map_shared(
 		if (error)
 			goto out_error;
 		if (XFS_IS_CORRUPT(mp, have_gt != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
@@ -2273,6 +2436,7 @@ xfs_rmap_map_shared(
 		if (error)
 			goto out_error;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out_error;
 		}
@@ -2335,15 +2499,12 @@ xfs_rmap_map_raw(
 {
 	struct xfs_owner_info	oinfo;
 
-	oinfo.oi_owner = rmap->rm_owner;
-	oinfo.oi_offset = rmap->rm_offset;
-	oinfo.oi_flags = 0;
-	if (rmap->rm_flags & XFS_RMAP_ATTR_FORK)
-		oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
-	if (rmap->rm_flags & XFS_RMAP_BMBT_BLOCK)
-		oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
+	xfs_owner_info_pack(&oinfo, rmap->rm_owner, rmap->rm_offset,
+			rmap->rm_flags);
 
-	if (rmap->rm_flags || XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner))
+	if ((rmap->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK |
+			       XFS_RMAP_UNWRITTEN)) ||
+	    XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner))
 		return xfs_rmap_map(cur, rmap->rm_startblock,
 				rmap->rm_blockcount,
 				rmap->rm_flags & XFS_RMAP_UNWRITTEN,
@@ -2373,7 +2534,7 @@ xfs_rmap_query_range_helper(
 
 	fa = xfs_rmap_btrec_to_irec(rec, &irec);
 	if (!fa)
-		fa = xfs_rmap_check_irec(cur, &irec);
+		fa = xfs_rmap_check_btrec(cur, &irec);
 	if (fa)
 		return xfs_rmap_complain_bad_rec(cur, fa, &irec);
 
@@ -2428,6 +2589,38 @@ xfs_rmap_finish_one_cleanup(
 		xfs_trans_brelse(tp, agbp);
 }
 
+/* Commit an rmap operation into the ondisk tree. */
+int
+__xfs_rmap_finish_intent(
+	struct xfs_btree_cur		*rcur,
+	enum xfs_rmap_intent_type	op,
+	xfs_agblock_t			bno,
+	xfs_extlen_t			len,
+	const struct xfs_owner_info	*oinfo,
+	bool				unwritten)
+{
+	switch (op) {
+	case XFS_RMAP_ALLOC:
+	case XFS_RMAP_MAP:
+		return xfs_rmap_map(rcur, bno, len, unwritten, oinfo);
+	case XFS_RMAP_MAP_SHARED:
+		return xfs_rmap_map_shared(rcur, bno, len, unwritten, oinfo);
+	case XFS_RMAP_FREE:
+	case XFS_RMAP_UNMAP:
+		return xfs_rmap_unmap(rcur, bno, len, unwritten, oinfo);
+	case XFS_RMAP_UNMAP_SHARED:
+		return xfs_rmap_unmap_shared(rcur, bno, len, unwritten, oinfo);
+	case XFS_RMAP_CONVERT:
+		return xfs_rmap_convert(rcur, bno, len, !unwritten, oinfo);
+	case XFS_RMAP_CONVERT_SHARED:
+		return xfs_rmap_convert_shared(rcur, bno, len, !unwritten,
+				oinfo);
+	default:
+		ASSERT(0);
+		return -EFSCORRUPTED;
+	}
+}
+
 /*
  * Process one of the deferred rmap operations.  We pass back the
  * btree cursor to maintain our lock on the rmapbt between calls.
@@ -2476,10 +2669,14 @@ xfs_rmap_finish_one(
 		 * allocate blocks.
 		 */
 		error = xfs_free_extent_fix_freelist(tp, ri->ri_pag, &agbp);
-		if (error)
+		if (error) {
+			xfs_ag_mark_sick(ri->ri_pag, XFS_SICK_AG_AGFL);
 			return error;
-		if (XFS_IS_CORRUPT(tp->t_mountp, !agbp))
+		}
+		if (XFS_IS_CORRUPT(tp->t_mountp, !agbp)) {
+			xfs_ag_mark_sick(ri->ri_pag, XFS_SICK_AG_AGFL);
 			return -EFSCORRUPTED;
+		}
 
 		rcur = xfs_rmapbt_init_cursor(mp, tp, agbp, ri->ri_pag);
 	}
@@ -2490,39 +2687,14 @@ xfs_rmap_finish_one(
 	unwritten = ri->ri_bmap.br_state == XFS_EXT_UNWRITTEN;
 	bno = XFS_FSB_TO_AGBNO(rcur->bc_mp, ri->ri_bmap.br_startblock);
 
-	switch (ri->ri_type) {
-	case XFS_RMAP_ALLOC:
-	case XFS_RMAP_MAP:
-		error = xfs_rmap_map(rcur, bno, ri->ri_bmap.br_blockcount,
-				unwritten, &oinfo);
-		break;
-	case XFS_RMAP_MAP_SHARED:
-		error = xfs_rmap_map_shared(rcur, bno,
-				ri->ri_bmap.br_blockcount, unwritten, &oinfo);
-		break;
-	case XFS_RMAP_FREE:
-	case XFS_RMAP_UNMAP:
-		error = xfs_rmap_unmap(rcur, bno, ri->ri_bmap.br_blockcount,
-				unwritten, &oinfo);
-		break;
-	case XFS_RMAP_UNMAP_SHARED:
-		error = xfs_rmap_unmap_shared(rcur, bno,
-				ri->ri_bmap.br_blockcount, unwritten, &oinfo);
-		break;
-	case XFS_RMAP_CONVERT:
-		error = xfs_rmap_convert(rcur, bno, ri->ri_bmap.br_blockcount,
-				!unwritten, &oinfo);
-		break;
-	case XFS_RMAP_CONVERT_SHARED:
-		error = xfs_rmap_convert_shared(rcur, bno,
-				ri->ri_bmap.br_blockcount, !unwritten, &oinfo);
-		break;
-	default:
-		ASSERT(0);
-		error = -EFSCORRUPTED;
-	}
+	error = __xfs_rmap_finish_intent(rcur, ri->ri_type, bno,
+			ri->ri_bmap.br_blockcount, &oinfo, unwritten);
+	if (error)
+		return error;
 
-	return error;
+	xfs_rmap_update_hook(tp, ri->ri_pag, ri->ri_type, bno,
+			ri->ri_bmap.br_blockcount, unwritten, &oinfo);
+	return 0;
 }
 
 /*
@@ -2559,7 +2731,7 @@ __xfs_rmap_add(
 			bmap->br_blockcount,
 			bmap->br_state);
 
-	ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+	ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_KERNEL | __GFP_NOFAIL);
 	INIT_LIST_HEAD(&ri->ri_list);
 	ri->ri_type = type;
 	ri->ri_owner = owner;
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 3c98d9d50afb..9d01fe689497 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -186,6 +186,10 @@ void xfs_rmap_finish_one_cleanup(struct xfs_trans *tp,
 		struct xfs_btree_cur *rcur, int error);
 int xfs_rmap_finish_one(struct xfs_trans *tp, struct xfs_rmap_intent *ri,
 		struct xfs_btree_cur **pcur);
+int __xfs_rmap_finish_intent(struct xfs_btree_cur *rcur,
+		enum xfs_rmap_intent_type op, xfs_agblock_t bno,
+		xfs_extlen_t len, const struct xfs_owner_info *oinfo,
+		bool unwritten);
 
 int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno,
 		uint64_t owner, uint64_t offset, unsigned int flags,
@@ -195,7 +199,7 @@ int xfs_rmap_compare(const struct xfs_rmap_irec *a,
 union xfs_btree_rec;
 xfs_failaddr_t xfs_rmap_btrec_to_irec(const union xfs_btree_rec *rec,
 		struct xfs_rmap_irec *irec);
-xfs_failaddr_t xfs_rmap_check_irec(struct xfs_btree_cur *cur,
+xfs_failaddr_t xfs_rmap_check_irec(struct xfs_perag *pag,
 		const struct xfs_rmap_irec *irec);
 
 int xfs_rmap_has_records(struct xfs_btree_cur *cur, xfs_agblock_t bno,
@@ -235,4 +239,29 @@ extern struct kmem_cache	*xfs_rmap_intent_cache;
 int __init xfs_rmap_intent_init_cache(void);
 void xfs_rmap_intent_destroy_cache(void);
 
+/*
+ * Parameters for tracking reverse mapping changes.  The hook function arg
+ * parameter is enum xfs_rmap_intent_type, and the rest is below.
+ */
+struct xfs_rmap_update_params {
+	xfs_agblock_t			startblock;
+	xfs_extlen_t			blockcount;
+	struct xfs_owner_info		oinfo;
+	bool				unwritten;
+};
+
+#ifdef CONFIG_XFS_LIVE_HOOKS
+
+struct xfs_rmap_hook {
+	struct xfs_hook			rmap_hook;
+};
+
+void xfs_rmap_hook_disable(void);
+void xfs_rmap_hook_enable(void);
+
+int xfs_rmap_hook_add(struct xfs_perag *pag, struct xfs_rmap_hook *hook);
+void xfs_rmap_hook_del(struct xfs_perag *pag, struct xfs_rmap_hook *hook);
+void xfs_rmap_hook_setup(struct xfs_rmap_hook *hook, notifier_fn_t mod_fn);
+#endif
+
 #endif	/* __XFS_RMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index 6c81b20e97d2..9e759efa81cc 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -16,11 +16,14 @@
 #include "xfs_btree_staging.h"
 #include "xfs_rmap.h"
 #include "xfs_rmap_btree.h"
+#include "xfs_health.h"
 #include "xfs_trace.h"
 #include "xfs_error.h"
 #include "xfs_extent_busy.h"
 #include "xfs_ag.h"
 #include "xfs_ag_resv.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
 
 static struct kmem_cache	*xfs_rmapbt_cur_cache;
 
@@ -65,13 +68,12 @@ xfs_rmapbt_set_root(
 {
 	struct xfs_buf		*agbp = cur->bc_ag.agbp;
 	struct xfs_agf		*agf = agbp->b_addr;
-	int			btnum = cur->bc_btnum;
 
 	ASSERT(ptr->s != 0);
 
-	agf->agf_roots[btnum] = ptr->s;
-	be32_add_cpu(&agf->agf_levels[btnum], inc);
-	cur->bc_ag.pag->pagf_levels[btnum] += inc;
+	agf->agf_rmap_root = ptr->s;
+	be32_add_cpu(&agf->agf_rmap_level, inc);
+	cur->bc_ag.pag->pagf_rmap_level += inc;
 
 	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
 }
@@ -94,8 +96,6 @@ xfs_rmapbt_alloc_block(
 				       &bno, 1);
 	if (error)
 		return error;
-
-	trace_xfs_rmapbt_alloc_block(cur->bc_mp, pag->pag_agno, bno, 1);
 	if (bno == NULLAGBLOCK) {
 		*stat = 0;
 		return 0;
@@ -125,8 +125,6 @@ xfs_rmapbt_free_block(
 	int			error;
 
 	bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp));
-	trace_xfs_rmapbt_free_block(cur->bc_mp, pag->pag_agno,
-			bno, 1);
 	be32_add_cpu(&agf->agf_rmap_blocks, -1);
 	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
 	error = xfs_alloc_put_freelist(pag, cur->bc_tp, agbp, NULL, bno, 1);
@@ -226,7 +224,7 @@ xfs_rmapbt_init_ptr_from_cur(
 
 	ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));
 
-	ptr->s = agf->agf_roots[cur->bc_btnum];
+	ptr->s = agf->agf_rmap_root;
 }
 
 /*
@@ -340,18 +338,29 @@ xfs_rmapbt_verify(
 
 	if (!xfs_has_rmapbt(mp))
 		return __this_address;
-	fa = xfs_btree_sblock_v5hdr_verify(bp);
+	fa = xfs_btree_agblock_v5hdr_verify(bp);
 	if (fa)
 		return fa;
 
 	level = be16_to_cpu(block->bb_level);
 	if (pag && xfs_perag_initialised_agf(pag)) {
-		if (level >= pag->pagf_levels[XFS_BTNUM_RMAPi])
+		unsigned int	maxlevel = pag->pagf_rmap_level;
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+		/*
+		 * Online repair could be rewriting the free space btrees, so
+		 * we'll validate against the larger of either tree while this
+		 * is going on.
+		 */
+		maxlevel = max_t(unsigned int, maxlevel,
+				pag->pagf_repair_rmap_level);
+#endif
+		if (level >= maxlevel)
 			return __this_address;
 	} else if (level >= mp->m_rmap_maxlevels)
 		return __this_address;
 
-	return xfs_btree_sblock_verify(bp, mp->m_rmap_mxr[level != 0]);
+	return xfs_btree_agblock_verify(bp, mp->m_rmap_mxr[level != 0]);
 }
 
 static void
@@ -360,7 +369,7 @@ xfs_rmapbt_read_verify(
 {
 	xfs_failaddr_t	fa;
 
-	if (!xfs_btree_sblock_verify_crc(bp))
+	if (!xfs_btree_agblock_verify_crc(bp))
 		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
 	else {
 		fa = xfs_rmapbt_verify(bp);
@@ -384,7 +393,7 @@ xfs_rmapbt_write_verify(
 		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
 		return;
 	}
-	xfs_btree_sblock_calc_crc(bp);
+	xfs_btree_agblock_calc_crc(bp);
 
 }
 
@@ -476,9 +485,19 @@ xfs_rmapbt_keys_contiguous(
 				 be32_to_cpu(key2->rmap.rm_startblock));
 }
 
-static const struct xfs_btree_ops xfs_rmapbt_ops = {
+const struct xfs_btree_ops xfs_rmapbt_ops = {
+	.name			= "rmap",
+	.type			= XFS_BTREE_TYPE_AG,
+	.geom_flags		= XFS_BTGEO_OVERLAPPING,
+
 	.rec_len		= sizeof(struct xfs_rmap_rec),
+	/* Overlapping btree; 2 keys per pointer. */
 	.key_len		= 2 * sizeof(struct xfs_rmap_key),
+	.ptr_len		= XFS_BTREE_SHORT_PTR_LEN,
+
+	.lru_refs		= XFS_RMAP_BTREE_REF,
+	.statoff		= XFS_STATS_CALC_INDEX(xs_rmap_2),
+	.sick_mask		= XFS_SICK_AG_RMAPBT,
 
 	.dup_cursor		= xfs_rmapbt_dup_cursor,
 	.set_root		= xfs_rmapbt_set_root,
@@ -498,55 +517,176 @@ static const struct xfs_btree_ops xfs_rmapbt_ops = {
 	.keys_contiguous	= xfs_rmapbt_keys_contiguous,
 };
 
-static struct xfs_btree_cur *
-xfs_rmapbt_init_common(
+/*
+ * Create a new reverse mapping btree cursor.
+ *
+ * For staging cursors tp and agbp are NULL.
+ */
+struct xfs_btree_cur *
+xfs_rmapbt_init_cursor(
 	struct xfs_mount	*mp,
 	struct xfs_trans	*tp,
+	struct xfs_buf		*agbp,
 	struct xfs_perag	*pag)
 {
 	struct xfs_btree_cur	*cur;
 
-	/* Overlapping btree; 2 keys per pointer. */
-	cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RMAP,
+	cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_ops,
 			mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache);
-	cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING;
-	cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2);
-	cur->bc_ops = &xfs_rmapbt_ops;
-
 	cur->bc_ag.pag = xfs_perag_hold(pag);
+	cur->bc_ag.agbp = agbp;
+	if (agbp) {
+		struct xfs_agf		*agf = agbp->b_addr;
+
+		cur->bc_nlevels = be32_to_cpu(agf->agf_rmap_level);
+	}
 	return cur;
 }
 
-/* Create a new reverse mapping btree cursor. */
+#ifdef CONFIG_XFS_BTREE_IN_MEM
+static inline unsigned int
+xfs_rmapbt_mem_block_maxrecs(
+	unsigned int		blocklen,
+	bool			leaf)
+{
+	if (leaf)
+		return blocklen / sizeof(struct xfs_rmap_rec);
+	return blocklen /
+		(2 * sizeof(struct xfs_rmap_key) + sizeof(__be64));
+}
+
+/*
+ * Validate an in-memory rmap btree block.  Callers are allowed to generate an
+ * in-memory btree even if the ondisk feature is not enabled.
+ */
+static xfs_failaddr_t
+xfs_rmapbt_mem_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	xfs_failaddr_t		fa;
+	unsigned int		level;
+	unsigned int		maxrecs;
+
+	if (!xfs_verify_magic(bp, block->bb_magic))
+		return __this_address;
+
+	fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+	if (fa)
+		return fa;
+
+	level = be16_to_cpu(block->bb_level);
+	if (level >= xfs_rmapbt_maxlevels_ondisk())
+		return __this_address;
+
+	maxrecs = xfs_rmapbt_mem_block_maxrecs(
+			XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN, level == 0);
+	return xfs_btree_memblock_verify(bp, maxrecs);
+}
+
+static void
+xfs_rmapbt_mem_rw_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_failaddr_t	fa = xfs_rmapbt_mem_verify(bp);
+
+	if (fa)
+		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+}
+
+/* skip crc checks on in-memory btrees to save time */
+static const struct xfs_buf_ops xfs_rmapbt_mem_buf_ops = {
+	.name			= "xfs_rmapbt_mem",
+	.magic			= { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
+	.verify_read		= xfs_rmapbt_mem_rw_verify,
+	.verify_write		= xfs_rmapbt_mem_rw_verify,
+	.verify_struct		= xfs_rmapbt_mem_verify,
+};
+
+const struct xfs_btree_ops xfs_rmapbt_mem_ops = {
+	.name			= "mem_rmap",
+	.type			= XFS_BTREE_TYPE_MEM,
+	.geom_flags		= XFS_BTGEO_OVERLAPPING,
+
+	.rec_len		= sizeof(struct xfs_rmap_rec),
+	/* Overlapping btree; 2 keys per pointer. */
+	.key_len		= 2 * sizeof(struct xfs_rmap_key),
+	.ptr_len		= XFS_BTREE_LONG_PTR_LEN,
+
+	.lru_refs		= XFS_RMAP_BTREE_REF,
+	.statoff		= XFS_STATS_CALC_INDEX(xs_rmap_mem_2),
+
+	.dup_cursor		= xfbtree_dup_cursor,
+	.set_root		= xfbtree_set_root,
+	.alloc_block		= xfbtree_alloc_block,
+	.free_block		= xfbtree_free_block,
+	.get_minrecs		= xfbtree_get_minrecs,
+	.get_maxrecs		= xfbtree_get_maxrecs,
+	.init_key_from_rec	= xfs_rmapbt_init_key_from_rec,
+	.init_high_key_from_rec	= xfs_rmapbt_init_high_key_from_rec,
+	.init_rec_from_cur	= xfs_rmapbt_init_rec_from_cur,
+	.init_ptr_from_cur	= xfbtree_init_ptr_from_cur,
+	.key_diff		= xfs_rmapbt_key_diff,
+	.buf_ops		= &xfs_rmapbt_mem_buf_ops,
+	.diff_two_keys		= xfs_rmapbt_diff_two_keys,
+	.keys_inorder		= xfs_rmapbt_keys_inorder,
+	.recs_inorder		= xfs_rmapbt_recs_inorder,
+	.keys_contiguous	= xfs_rmapbt_keys_contiguous,
+};
+
+/* Create a cursor for an in-memory btree. */
 struct xfs_btree_cur *
-xfs_rmapbt_init_cursor(
-	struct xfs_mount	*mp,
+xfs_rmapbt_mem_cursor(
+	struct xfs_perag	*pag,
 	struct xfs_trans	*tp,
-	struct xfs_buf		*agbp,
-	struct xfs_perag	*pag)
+	struct xfbtree		*xfbt)
 {
-	struct xfs_agf		*agf = agbp->b_addr;
 	struct xfs_btree_cur	*cur;
+	struct xfs_mount	*mp = pag->pag_mount;
 
-	cur = xfs_rmapbt_init_common(mp, tp, pag);
-	cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
-	cur->bc_ag.agbp = agbp;
+	cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_mem_ops,
+			xfs_rmapbt_maxlevels_ondisk(), xfs_rmapbt_cur_cache);
+	cur->bc_mem.xfbtree = xfbt;
+	cur->bc_nlevels = xfbt->nlevels;
+
+	cur->bc_mem.pag = xfs_perag_hold(pag);
 	return cur;
 }
 
-/* Create a new reverse mapping btree cursor with a fake root for staging. */
-struct xfs_btree_cur *
-xfs_rmapbt_stage_cursor(
+/* Create an in-memory rmap btree. */
+int
+xfs_rmapbt_mem_init(
 	struct xfs_mount	*mp,
-	struct xbtree_afakeroot	*afake,
-	struct xfs_perag	*pag)
+	struct xfbtree		*xfbt,
+	struct xfs_buftarg	*btp,
+	xfs_agnumber_t		agno)
 {
-	struct xfs_btree_cur	*cur;
+	xfbt->owner = agno;
+	return xfbtree_init(mp, xfbt, btp, &xfs_rmapbt_mem_ops);
+}
 
-	cur = xfs_rmapbt_init_common(mp, NULL, pag);
-	xfs_btree_stage_afakeroot(cur, afake);
-	return cur;
+/* Compute the max possible height for reverse mapping btrees in memory. */
+static unsigned int
+xfs_rmapbt_mem_maxlevels(void)
+{
+	unsigned int		minrecs[2];
+	unsigned int		blocklen;
+
+	blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
+
+	minrecs[0] = xfs_rmapbt_mem_block_maxrecs(blocklen, true) / 2;
+	minrecs[1] = xfs_rmapbt_mem_block_maxrecs(blocklen, false) / 2;
+
+	/*
+	 * How tall can an in-memory rmap btree become if we filled the entire
+	 * AG with rmap records?
+	 */
+	return xfs_btree_compute_maxlevels(minrecs,
+			XFS_MAX_AG_BYTES / sizeof(struct xfs_rmap_rec));
 }
+#else
+# define xfs_rmapbt_mem_maxlevels()	(0)
+#endif /* CONFIG_XFS_BTREE_IN_MEM */
 
 /*
  * Install a new reverse mapping btree root.  Caller is responsible for
@@ -563,12 +703,12 @@ xfs_rmapbt_commit_staged_btree(
 
 	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
 
-	agf->agf_roots[cur->bc_btnum] = cpu_to_be32(afake->af_root);
-	agf->agf_levels[cur->bc_btnum] = cpu_to_be32(afake->af_levels);
+	agf->agf_rmap_root = cpu_to_be32(afake->af_root);
+	agf->agf_rmap_level = cpu_to_be32(afake->af_levels);
 	agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks);
 	xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS |
 				    XFS_AGF_RMAP_BLOCKS);
-	xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_rmapbt_ops);
+	xfs_btree_commit_afakeroot(cur, tp, agbp);
 }
 
 /* Calculate number of records in a reverse mapping btree block. */
@@ -618,7 +758,8 @@ xfs_rmapbt_maxlevels_ondisk(void)
 	 * like if it consumes almost all the blocks in the AG due to maximal
 	 * sharing factor.
 	 */
-	return xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS);
+	return max(xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS),
+		   xfs_rmapbt_mem_maxlevels());
 }
 
 /* Compute the maximum height of an rmap btree. */
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h
index 3244715dd111..eb90d89e8086 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rmap_btree.h
@@ -10,6 +10,7 @@ struct xfs_buf;
 struct xfs_btree_cur;
 struct xfs_mount;
 struct xbtree_afakeroot;
+struct xfbtree;
 
 /* rmaps only exist on crc enabled filesystems */
 #define XFS_RMAP_BLOCK_LEN	XFS_BTREE_SBLOCK_CRC_LEN
@@ -44,8 +45,6 @@ struct xbtree_afakeroot;
 struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp,
 				struct xfs_trans *tp, struct xfs_buf *bp,
 				struct xfs_perag *pag);
-struct xfs_btree_cur *xfs_rmapbt_stage_cursor(struct xfs_mount *mp,
-		struct xbtree_afakeroot *afake, struct xfs_perag *pag);
 void xfs_rmapbt_commit_staged_btree(struct xfs_btree_cur *cur,
 		struct xfs_trans *tp, struct xfs_buf *agbp);
 int xfs_rmapbt_maxrecs(int blocklen, int leaf);
@@ -64,4 +63,9 @@ unsigned int xfs_rmapbt_maxlevels_ondisk(void);
 int __init xfs_rmapbt_init_cur_cache(void);
 void xfs_rmapbt_destroy_cur_cache(void);
 
+struct xfs_btree_cur *xfs_rmapbt_mem_cursor(struct xfs_perag *pag,
+		struct xfs_trans *tp, struct xfbtree *xfbtree);
+int xfs_rmapbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree,
+		struct xfs_buftarg *btp, xfs_agnumber_t agno);
+
 #endif /* __XFS_RMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index e31663cb7b43..f246d6dbf4ec 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -17,6 +17,7 @@
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
 #include "xfs_rtbitmap.h"
+#include "xfs_health.h"
 
 /*
  * Realtime allocator bitmap functions shared with userspace.
@@ -115,13 +116,19 @@ xfs_rtbuf_get(
 	if (error)
 		return error;
 
-	if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_written_extent(&map)))
+	if (XFS_IS_CORRUPT(mp, nmap == 0 || !xfs_bmap_is_written_extent(&map))) {
+		xfs_rt_mark_sick(mp, issum ? XFS_SICK_RT_SUMMARY :
+					     XFS_SICK_RT_BITMAP);
 		return -EFSCORRUPTED;
+	}
 
 	ASSERT(map.br_startblock != NULLFSBLOCK);
 	error = xfs_trans_read_buf(mp, args->tp, mp->m_ddev_targp,
 				   XFS_FSB_TO_DADDR(mp, map.br_startblock),
 				   mp->m_bsize, 0, &bp, &xfs_rtbuf_ops);
+	if (xfs_metadata_is_sick(error))
+		xfs_rt_mark_sick(mp, issum ? XFS_SICK_RT_SUMMARY :
+					     XFS_SICK_RT_BITMAP);
 	if (error)
 		return error;
 
@@ -934,7 +941,7 @@ xfs_rtfree_extent(
 	struct timespec64	atime;
 
 	ASSERT(mp->m_rbmip->i_itemp != NULL);
-	ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(mp->m_rbmip, XFS_ILOCK_EXCL);
 
 	error = xfs_rtcheck_alloc_range(&args, start, len);
 	if (error)
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 5bb6e2bd6dee..73a4b895de67 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -530,7 +530,8 @@ xfs_validate_sb_common(
 	}
 
 	if (!xfs_validate_stripe_geometry(mp, XFS_FSB_TO_B(mp, sbp->sb_unit),
-			XFS_FSB_TO_B(mp, sbp->sb_width), 0, false))
+			XFS_FSB_TO_B(mp, sbp->sb_width), 0,
+			xfs_buf_daddr(bp) == XFS_SB_DADDR, false))
 		return -EFSCORRUPTED;
 
 	/*
@@ -1290,6 +1291,8 @@ xfs_sb_read_secondary(
 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
 			XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
 			XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops);
+	if (xfs_metadata_is_sick(error))
+		xfs_agno_mark_sick(mp, agno, XFS_SICK_AG_SB);
 	if (error)
 		return error;
 	xfs_buf_set_ref(bp, XFS_SSB_REF);
@@ -1321,8 +1324,10 @@ xfs_sb_get_secondary(
 }
 
 /*
- * sunit, swidth, sectorsize(optional with 0) should be all in bytes,
- * so users won't be confused by values in error messages.
+ * sunit, swidth, sectorsize(optional with 0) should be all in bytes, so users
+ * won't be confused by values in error messages.  This function returns false
+ * if the stripe geometry is invalid and the caller is unable to repair the
+ * stripe configuration later in the mount process.
  */
 bool
 xfs_validate_stripe_geometry(
@@ -1330,20 +1335,21 @@ xfs_validate_stripe_geometry(
 	__s64			sunit,
 	__s64			swidth,
 	int			sectorsize,
+	bool			may_repair,
 	bool			silent)
 {
 	if (swidth > INT_MAX) {
 		if (!silent)
 			xfs_notice(mp,
 "stripe width (%lld) is too large", swidth);
-		return false;
+		goto check_override;
 	}
 
 	if (sunit > swidth) {
 		if (!silent)
 			xfs_notice(mp,
 "stripe unit (%lld) is larger than the stripe width (%lld)", sunit, swidth);
-		return false;
+		goto check_override;
 	}
 
 	if (sectorsize && (int)sunit % sectorsize) {
@@ -1351,21 +1357,21 @@ xfs_validate_stripe_geometry(
 			xfs_notice(mp,
 "stripe unit (%lld) must be a multiple of the sector size (%d)",
 				   sunit, sectorsize);
-		return false;
+		goto check_override;
 	}
 
 	if (sunit && !swidth) {
 		if (!silent)
 			xfs_notice(mp,
 "invalid stripe unit (%lld) and stripe width of 0", sunit);
-		return false;
+		goto check_override;
 	}
 
 	if (!sunit && swidth) {
 		if (!silent)
 			xfs_notice(mp,
 "invalid stripe width (%lld) and stripe unit of 0", swidth);
-		return false;
+		goto check_override;
 	}
 
 	if (sunit && (int)swidth % (int)sunit) {
@@ -1373,9 +1379,27 @@ xfs_validate_stripe_geometry(
 			xfs_notice(mp,
 "stripe width (%lld) must be a multiple of the stripe unit (%lld)",
 				   swidth, sunit);
-		return false;
+		goto check_override;
 	}
 	return true;
+
+check_override:
+	if (!may_repair)
+		return false;
+	/*
+	 * During mount, mp->m_dalign will not be set unless the sunit mount
+	 * option was set. If it was set, ignore the bad stripe alignment values
+	 * and allow the validation and overwrite later in the mount process to
+	 * attempt to overwrite the bad stripe alignment values with the values
+	 * supplied by mount options.
+	 */
+	if (!mp->m_dalign)
+		return false;
+	if (!silent)
+		xfs_notice(mp,
+"Will try to correct with specified mount options sunit (%d) and swidth (%d)",
+			BBTOB(mp->m_dalign), BBTOB(mp->m_swidth));
+	return true;
 }
 
 /*
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 2e8e8d63d4eb..37b1ed1bc209 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -35,8 +35,9 @@ extern int	xfs_sb_get_secondary(struct xfs_mount *mp,
 				struct xfs_trans *tp, xfs_agnumber_t agno,
 				struct xfs_buf **bpp);
 
-extern bool	xfs_validate_stripe_geometry(struct xfs_mount *mp,
-		__s64 sunit, __s64 swidth, int sectorsize, bool silent);
+bool	xfs_validate_stripe_geometry(struct xfs_mount *mp,
+		__s64 sunit, __s64 swidth, int sectorsize, bool may_repair,
+		bool silent);
 
 uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents);
 
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 4220d3584c1b..dfd61fa8332e 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -43,6 +43,60 @@ extern const struct xfs_buf_ops xfs_sb_buf_ops;
 extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
 extern const struct xfs_buf_ops xfs_symlink_buf_ops;
 
+/* btree ops */
+extern const struct xfs_btree_ops xfs_bnobt_ops;
+extern const struct xfs_btree_ops xfs_cntbt_ops;
+extern const struct xfs_btree_ops xfs_inobt_ops;
+extern const struct xfs_btree_ops xfs_finobt_ops;
+extern const struct xfs_btree_ops xfs_bmbt_ops;
+extern const struct xfs_btree_ops xfs_refcountbt_ops;
+extern const struct xfs_btree_ops xfs_rmapbt_ops;
+extern const struct xfs_btree_ops xfs_rmapbt_mem_ops;
+
+static inline bool xfs_btree_is_bno(const struct xfs_btree_ops *ops)
+{
+	return ops == &xfs_bnobt_ops;
+}
+
+static inline bool xfs_btree_is_cnt(const struct xfs_btree_ops *ops)
+{
+	return ops == &xfs_cntbt_ops;
+}
+
+static inline bool xfs_btree_is_bmap(const struct xfs_btree_ops *ops)
+{
+	return ops == &xfs_bmbt_ops;
+}
+
+static inline bool xfs_btree_is_ino(const struct xfs_btree_ops *ops)
+{
+	return ops == &xfs_inobt_ops;
+}
+
+static inline bool xfs_btree_is_fino(const struct xfs_btree_ops *ops)
+{
+	return ops == &xfs_finobt_ops;
+}
+
+static inline bool xfs_btree_is_refcount(const struct xfs_btree_ops *ops)
+{
+	return ops == &xfs_refcountbt_ops;
+}
+
+static inline bool xfs_btree_is_rmap(const struct xfs_btree_ops *ops)
+{
+	return ops == &xfs_rmapbt_ops;
+}
+
+#ifdef CONFIG_XFS_BTREE_IN_MEM
+static inline bool xfs_btree_is_mem_rmap(const struct xfs_btree_ops *ops)
+{
+	return ops == &xfs_rmapbt_mem_ops;
+}
+#else
+# define xfs_btree_is_mem_rmap(...)	(false)
+#endif
+
 /* log size calculation functions */
 int	xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes);
 int	xfs_log_calc_minimum_size(struct xfs_mount *);
@@ -128,19 +182,6 @@ void	xfs_log_get_max_trans_res(struct xfs_mount *mp,
 #define	XFS_ICHGTIME_CHG	0x2	/* inode field change timestamp */
 #define	XFS_ICHGTIME_CREATE	0x4	/* inode create timestamp */
 
-
-/*
- * Symlink decoding/encoding functions
- */
-int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
-int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
-			uint32_t size, struct xfs_buf *bp);
-bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset,
-			uint32_t size, struct xfs_buf *bp);
-void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
-				 struct xfs_inode *ip, struct xfs_ifork *ifp);
-xfs_failaddr_t xfs_symlink_shortform_verify(void *sfp, int64_t size);
-
 /* Computed inode geometry for the filesystem. */
 struct xfs_ino_geometry {
 	/* Maximum inode count in this filesystem. */
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index 160aa20aa441..ffb1317a9212 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -16,7 +16,10 @@
 #include "xfs_trans.h"
 #include "xfs_buf_item.h"
 #include "xfs_log.h"
-
+#include "xfs_symlink_remote.h"
+#include "xfs_bit.h"
+#include "xfs_bmap.h"
+#include "xfs_health.h"
 
 /*
  * Each contiguous block has a header, so it is not just a simple pathlen
@@ -227,3 +230,153 @@ xfs_symlink_shortform_verify(
 		return __this_address;
 	return NULL;
 }
+
+/* Read a remote symlink target into the buffer. */
+int
+xfs_symlink_remote_read(
+	struct xfs_inode	*ip,
+	char			*link)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_bmbt_irec	mval[XFS_SYMLINK_MAPS];
+	struct xfs_buf		*bp;
+	xfs_daddr_t		d;
+	char			*cur_chunk;
+	int			pathlen = ip->i_disk_size;
+	int			nmaps = XFS_SYMLINK_MAPS;
+	int			byte_cnt;
+	int			n;
+	int			error = 0;
+	int			fsblocks = 0;
+	int			offset;
+
+	xfs_assert_ilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
+
+	fsblocks = xfs_symlink_blocks(mp, pathlen);
+	error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0);
+	if (error)
+		goto out;
+
+	offset = 0;
+	for (n = 0; n < nmaps; n++) {
+		d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
+		byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
+
+		error = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0,
+				&bp, &xfs_symlink_buf_ops);
+		if (xfs_metadata_is_sick(error))
+			xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK);
+		if (error)
+			return error;
+		byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
+		if (pathlen < byte_cnt)
+			byte_cnt = pathlen;
+
+		cur_chunk = bp->b_addr;
+		if (xfs_has_crc(mp)) {
+			if (!xfs_symlink_hdr_ok(ip->i_ino, offset,
+							byte_cnt, bp)) {
+				xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK);
+				error = -EFSCORRUPTED;
+				xfs_alert(mp,
+"symlink header does not match required off/len/owner (0x%x/0x%x,0x%llx)",
+					offset, byte_cnt, ip->i_ino);
+				xfs_buf_relse(bp);
+				goto out;
+
+			}
+
+			cur_chunk += sizeof(struct xfs_dsymlink_hdr);
+		}
+
+		memcpy(link + offset, cur_chunk, byte_cnt);
+
+		pathlen -= byte_cnt;
+		offset += byte_cnt;
+
+		xfs_buf_relse(bp);
+	}
+	ASSERT(pathlen == 0);
+
+	link[ip->i_disk_size] = '\0';
+	error = 0;
+
+ out:
+	return error;
+}
+
+/* Write the symlink target into the inode. */
+int
+xfs_symlink_write_target(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	const char		*target_path,
+	int			pathlen,
+	xfs_fsblock_t		fs_blocks,
+	uint			resblks)
+{
+	struct xfs_bmbt_irec	mval[XFS_SYMLINK_MAPS];
+	struct xfs_mount	*mp = tp->t_mountp;
+	const char		*cur_chunk;
+	struct xfs_buf		*bp;
+	xfs_daddr_t		d;
+	int			byte_cnt;
+	int			nmaps;
+	int			offset = 0;
+	int			n;
+	int			error;
+
+	/*
+	 * If the symlink will fit into the inode, write it inline.
+	 */
+	if (pathlen <= xfs_inode_data_fork_size(ip)) {
+		xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen);
+
+		ip->i_disk_size = pathlen;
+		ip->i_df.if_format = XFS_DINODE_FMT_LOCAL;
+		xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
+		return 0;
+	}
+
+	nmaps = XFS_SYMLINK_MAPS;
+	error = xfs_bmapi_write(tp, ip, 0, fs_blocks, XFS_BMAPI_METADATA,
+			resblks, mval, &nmaps);
+	if (error)
+		return error;
+
+	ip->i_disk_size = pathlen;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+
+	cur_chunk = target_path;
+	offset = 0;
+	for (n = 0; n < nmaps; n++) {
+		char	*buf;
+
+		d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
+		byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
+		error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
+				BTOBB(byte_cnt), 0, &bp);
+		if (error)
+			return error;
+		bp->b_ops = &xfs_symlink_buf_ops;
+
+		byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
+		byte_cnt = min(byte_cnt, pathlen);
+
+		buf = bp->b_addr;
+		buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset, byte_cnt,
+				bp);
+
+		memcpy(buf, cur_chunk, byte_cnt);
+
+		cur_chunk += byte_cnt;
+		pathlen -= byte_cnt;
+		offset += byte_cnt;
+
+		xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF);
+		xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) -
+						(char *)bp->b_addr);
+	}
+	ASSERT(pathlen == 0);
+	return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.h b/fs/xfs/libxfs/xfs_symlink_remote.h
new file mode 100644
index 000000000000..a63bd38ae4fa
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_symlink_remote.h
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * Copyright (c) 2013 Red Hat, Inc.
+ * All Rights Reserved.
+ */
+#ifndef __XFS_SYMLINK_REMOTE_H
+#define __XFS_SYMLINK_REMOTE_H
+
+/*
+ * Symlink decoding/encoding functions
+ */
+int xfs_symlink_blocks(struct xfs_mount *mp, int pathlen);
+int xfs_symlink_hdr_set(struct xfs_mount *mp, xfs_ino_t ino, uint32_t offset,
+			uint32_t size, struct xfs_buf *bp);
+bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset,
+			uint32_t size, struct xfs_buf *bp);
+void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
+				 struct xfs_inode *ip, struct xfs_ifork *ifp);
+xfs_failaddr_t xfs_symlink_shortform_verify(void *sfp, int64_t size);
+int xfs_symlink_remote_read(struct xfs_inode *ip, char *link);
+int xfs_symlink_write_target(struct xfs_trans *tp, struct xfs_inode *ip,
+		const char *target_path, int pathlen, xfs_fsblock_t fs_blocks,
+		uint resblks);
+
+#endif /* __XFS_SYMLINK_REMOTE_H */
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index 70e97ea6eee7..69fc5b981352 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -31,7 +31,7 @@ xfs_trans_ijoin(
 {
 	struct xfs_inode_log_item *iip;
 
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
 	if (ip->i_itemp == NULL)
 		xfs_inode_item_init(ip, ip->i_mount);
 	iip = ip->i_itemp;
@@ -60,7 +60,7 @@ xfs_trans_ichgtime(
 	struct timespec64	tv;
 
 	ASSERT(tp);
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
 
 	tv = current_time(inode);
 
@@ -90,7 +90,7 @@ xfs_trans_log_inode(
 	struct inode		*inode = VFS_I(ip);
 
 	ASSERT(iip);
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
 	ASSERT(!xfs_iflags_test(ip, XFS_ISTALE));
 
 	tp->t_flags |= XFS_TRANS_DIRTY;
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 62e02d5380ad..76eb9e328835 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -80,11 +80,13 @@ typedef void *		xfs_failaddr_t;
 /*
  * Inode fork identifiers.
  */
-#define	XFS_DATA_FORK	0
-#define	XFS_ATTR_FORK	1
-#define	XFS_COW_FORK	2
+#define XFS_STAGING_FORK	(-1)	/* fake fork for staging a btree */
+#define	XFS_DATA_FORK		(0)
+#define	XFS_ATTR_FORK		(1)
+#define	XFS_COW_FORK		(2)
 
 #define XFS_WHICHFORK_STRINGS \
+	{ XFS_STAGING_FORK, 	"staging" }, \
 	{ XFS_DATA_FORK, 	"data" }, \
 	{ XFS_ATTR_FORK,	"attr" }, \
 	{ XFS_COW_FORK,		"cow" }
@@ -114,24 +116,6 @@ typedef enum {
 	{ XFS_LOOKUP_LEi,	"le" }, \
 	{ XFS_LOOKUP_GEi,	"ge" }
 
-/*
- * This enum is used in string mapping in xfs_trace.h and scrub/trace.h;
- * please keep the TRACE_DEFINE_ENUMs for it up to date.
- */
-typedef enum {
-	XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi,
-	XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_REFCi, XFS_BTNUM_MAX
-} xfs_btnum_t;
-
-#define XFS_BTNUM_STRINGS \
-	{ XFS_BTNUM_BNOi,	"bnobt" }, \
-	{ XFS_BTNUM_CNTi,	"cntbt" }, \
-	{ XFS_BTNUM_RMAPi,	"rmapbt" }, \
-	{ XFS_BTNUM_BMAPi,	"bmbt" }, \
-	{ XFS_BTNUM_INOi,	"inobt" }, \
-	{ XFS_BTNUM_FINOi,	"finobt" }, \
-	{ XFS_BTNUM_REFCi,	"refcbt" }
-
 struct xfs_name {
 	const unsigned char	*name;
 	int			len;
diff --git a/fs/xfs/mrlock.h b/fs/xfs/mrlock.h
deleted file mode 100644
index 79155eec341b..000000000000
--- a/fs/xfs/mrlock.h
+++ /dev/null
@@ -1,78 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- */
-#ifndef __XFS_SUPPORT_MRLOCK_H__
-#define __XFS_SUPPORT_MRLOCK_H__
-
-#include <linux/rwsem.h>
-
-typedef struct {
-	struct rw_semaphore	mr_lock;
-#if defined(DEBUG) || defined(XFS_WARN)
-	int			mr_writer;
-#endif
-} mrlock_t;
-
-#if defined(DEBUG) || defined(XFS_WARN)
-#define mrinit(mrp, name)	\
-	do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0)
-#else
-#define mrinit(mrp, name)	\
-	do { init_rwsem(&(mrp)->mr_lock); } while (0)
-#endif
-
-#define mrlock_init(mrp, t,n,s)	mrinit(mrp, n)
-#define mrfree(mrp)		do { } while (0)
-
-static inline void mraccess_nested(mrlock_t *mrp, int subclass)
-{
-	down_read_nested(&mrp->mr_lock, subclass);
-}
-
-static inline void mrupdate_nested(mrlock_t *mrp, int subclass)
-{
-	down_write_nested(&mrp->mr_lock, subclass);
-#if defined(DEBUG) || defined(XFS_WARN)
-	mrp->mr_writer = 1;
-#endif
-}
-
-static inline int mrtryaccess(mrlock_t *mrp)
-{
-	return down_read_trylock(&mrp->mr_lock);
-}
-
-static inline int mrtryupdate(mrlock_t *mrp)
-{
-	if (!down_write_trylock(&mrp->mr_lock))
-		return 0;
-#if defined(DEBUG) || defined(XFS_WARN)
-	mrp->mr_writer = 1;
-#endif
-	return 1;
-}
-
-static inline void mrunlock_excl(mrlock_t *mrp)
-{
-#if defined(DEBUG) || defined(XFS_WARN)
-	mrp->mr_writer = 0;
-#endif
-	up_write(&mrp->mr_lock);
-}
-
-static inline void mrunlock_shared(mrlock_t *mrp)
-{
-	up_read(&mrp->mr_lock);
-}
-
-static inline void mrdemote(mrlock_t *mrp)
-{
-#if defined(DEBUG) || defined(XFS_WARN)
-	mrp->mr_writer = 0;
-#endif
-	downgrade_write(&mrp->mr_lock);
-}
-
-#endif /* __XFS_SUPPORT_MRLOCK_H__ */
diff --git a/fs/xfs/scrub/agb_bitmap.h b/fs/xfs/scrub/agb_bitmap.h
index ed08f76ff4f3..e488e1f4f63d 100644
--- a/fs/xfs/scrub/agb_bitmap.h
+++ b/fs/xfs/scrub/agb_bitmap.h
@@ -65,4 +65,9 @@ int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap,
 int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap,
 		struct xfs_btree_cur *cur);
 
+static inline uint32_t xagb_bitmap_count_set_regions(struct xagb_bitmap *b)
+{
+	return xbitmap32_count_set_regions(&b->agbitmap);
+}
+
 #endif	/* __XFS_SCRUB_AGB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c
index 6c6e5eba42c8..e954f07679dd 100644
--- a/fs/xfs/scrub/agheader.c
+++ b/fs/xfs/scrub/agheader.c
@@ -556,28 +556,28 @@ xchk_agf(
 		xchk_block_set_corrupt(sc, sc->sa.agf_bp);
 
 	/* Check the AGF btree roots and levels */
-	agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]);
+	agbno = be32_to_cpu(agf->agf_bno_root);
 	if (!xfs_verify_agbno(pag, agbno))
 		xchk_block_set_corrupt(sc, sc->sa.agf_bp);
 
-	agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]);
+	agbno = be32_to_cpu(agf->agf_cnt_root);
 	if (!xfs_verify_agbno(pag, agbno))
 		xchk_block_set_corrupt(sc, sc->sa.agf_bp);
 
-	level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
+	level = be32_to_cpu(agf->agf_bno_level);
 	if (level <= 0 || level > mp->m_alloc_maxlevels)
 		xchk_block_set_corrupt(sc, sc->sa.agf_bp);
 
-	level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
+	level = be32_to_cpu(agf->agf_cnt_level);
 	if (level <= 0 || level > mp->m_alloc_maxlevels)
 		xchk_block_set_corrupt(sc, sc->sa.agf_bp);
 
 	if (xfs_has_rmapbt(mp)) {
-		agbno = be32_to_cpu(agf->agf_roots[XFS_BTNUM_RMAP]);
+		agbno = be32_to_cpu(agf->agf_rmap_root);
 		if (!xfs_verify_agbno(pag, agbno))
 			xchk_block_set_corrupt(sc, sc->sa.agf_bp);
 
-		level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]);
+		level = be32_to_cpu(agf->agf_rmap_level);
 		if (level <= 0 || level > mp->m_rmap_maxlevels)
 			xchk_block_set_corrupt(sc, sc->sa.agf_bp);
 	}
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index 26bd1ff68f1b..427054b65b23 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -174,8 +174,7 @@ xrep_agf_find_btrees(
 	 * We relied on the rmapbt to reconstruct the AGF.  If we get a
 	 * different root then something's seriously wrong.
 	 */
-	if (fab[XREP_AGF_RMAPBT].root !=
-	    be32_to_cpu(old_agf->agf_roots[XFS_BTNUM_RMAPi]))
+	if (fab[XREP_AGF_RMAPBT].root != be32_to_cpu(old_agf->agf_rmap_root))
 		return -EFSCORRUPTED;
 
 	/* We must find the refcountbt root if that feature is enabled. */
@@ -224,20 +223,14 @@ xrep_agf_set_roots(
 	struct xfs_agf			*agf,
 	struct xrep_find_ag_btree	*fab)
 {
-	agf->agf_roots[XFS_BTNUM_BNOi] =
-			cpu_to_be32(fab[XREP_AGF_BNOBT].root);
-	agf->agf_levels[XFS_BTNUM_BNOi] =
-			cpu_to_be32(fab[XREP_AGF_BNOBT].height);
+	agf->agf_bno_root = cpu_to_be32(fab[XREP_AGF_BNOBT].root);
+	agf->agf_bno_level = cpu_to_be32(fab[XREP_AGF_BNOBT].height);
 
-	agf->agf_roots[XFS_BTNUM_CNTi] =
-			cpu_to_be32(fab[XREP_AGF_CNTBT].root);
-	agf->agf_levels[XFS_BTNUM_CNTi] =
-			cpu_to_be32(fab[XREP_AGF_CNTBT].height);
+	agf->agf_cnt_root = cpu_to_be32(fab[XREP_AGF_CNTBT].root);
+	agf->agf_cnt_level = cpu_to_be32(fab[XREP_AGF_CNTBT].height);
 
-	agf->agf_roots[XFS_BTNUM_RMAPi] =
-			cpu_to_be32(fab[XREP_AGF_RMAPBT].root);
-	agf->agf_levels[XFS_BTNUM_RMAPi] =
-			cpu_to_be32(fab[XREP_AGF_RMAPBT].height);
+	agf->agf_rmap_root = cpu_to_be32(fab[XREP_AGF_RMAPBT].root);
+	agf->agf_rmap_level = cpu_to_be32(fab[XREP_AGF_RMAPBT].height);
 
 	if (xfs_has_reflink(sc->mp)) {
 		agf->agf_refcount_root =
@@ -262,8 +255,7 @@ xrep_agf_calc_from_btrees(
 	int			error;
 
 	/* Update the AGF counters from the bnobt. */
-	cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
-			sc->sa.pag, XFS_BTNUM_BNO);
+	cur = xfs_bnobt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
 	error = xfs_alloc_query_all(cur, xrep_agf_walk_allocbt, &raa);
 	if (error)
 		goto err;
@@ -276,8 +268,7 @@ xrep_agf_calc_from_btrees(
 	agf->agf_longest = cpu_to_be32(raa.longest);
 
 	/* Update the AGF counters from the cntbt. */
-	cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
-			sc->sa.pag, XFS_BTNUM_CNT);
+	cur = xfs_cntbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
 	error = xfs_btree_count_blocks(cur, &blocks);
 	if (error)
 		goto err;
@@ -333,12 +324,9 @@ xrep_agf_commit_new(
 	pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
 	pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
 	pag->pagf_longest = be32_to_cpu(agf->agf_longest);
-	pag->pagf_levels[XFS_BTNUM_BNOi] =
-			be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
-	pag->pagf_levels[XFS_BTNUM_CNTi] =
-			be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
-	pag->pagf_levels[XFS_BTNUM_RMAPi] =
-			be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
+	pag->pagf_bno_level = be32_to_cpu(agf->agf_bno_level);
+	pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level);
+	pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level);
 	pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
 	set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
 
@@ -559,16 +547,14 @@ xrep_agfl_collect_blocks(
 		goto out_bmp;
 
 	/* Find all blocks currently being used by the bnobt. */
-	cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
-			sc->sa.pag, XFS_BTNUM_BNO);
+	cur = xfs_bnobt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
 	error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur);
 	xfs_btree_del_cursor(cur, error);
 	if (error)
 		goto out_bmp;
 
 	/* Find all blocks currently being used by the cntbt. */
-	cur = xfs_allocbt_init_cursor(mp, sc->tp, agf_bp,
-			sc->sa.pag, XFS_BTNUM_CNT);
+	cur = xfs_cntbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
 	error = xagb_bitmap_set_btblocks(&ra.agmetablocks, cur);
 	xfs_btree_del_cursor(cur, error);
 	if (error)
@@ -908,7 +894,7 @@ xrep_agi_calc_from_btrees(
 	xfs_agino_t		freecount;
 	int			error;
 
-	cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp, XFS_BTNUM_INO);
+	cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp);
 	error = xfs_ialloc_count_inodes(cur, &count, &freecount);
 	if (error)
 		goto err;
@@ -928,8 +914,7 @@ xrep_agi_calc_from_btrees(
 	if (xfs_has_finobt(mp) && xfs_has_inobtcounts(mp)) {
 		xfs_agblock_t	blocks;
 
-		cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, agi_bp,
-				XFS_BTNUM_FINO);
+		cur = xfs_finobt_init_cursor(sc->sa.pag, sc->tp, agi_bp);
 		error = xfs_btree_count_blocks(cur, &blocks);
 		if (error)
 			goto err;
diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
index 45edda096869..d421b253923e 100644
--- a/fs/xfs/scrub/alloc_repair.c
+++ b/fs/xfs/scrub/alloc_repair.c
@@ -687,8 +687,8 @@ xrep_abt_reset_counters(
 	 * height values before re-initializing the perag info from the updated
 	 * AGF to capture all the new values.
 	 */
-	pag->pagf_repair_levels[XFS_BTNUM_BNOi] = pag->pagf_levels[XFS_BTNUM_BNOi];
-	pag->pagf_repair_levels[XFS_BTNUM_CNTi] = pag->pagf_levels[XFS_BTNUM_CNTi];
+	pag->pagf_repair_bno_level = pag->pagf_bno_level;
+	pag->pagf_repair_cnt_level = pag->pagf_cnt_level;
 
 	/* Reinitialize with the values we just logged. */
 	return xrep_reinit_pagf(sc);
@@ -735,10 +735,11 @@ xrep_abt_build_new_trees(
 	ra->new_cntbt.bload.claim_block = xrep_abt_claim_block;
 
 	/* Allocate cursors for the staged btrees. */
-	bno_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_bnobt.afake,
-			pag, XFS_BTNUM_BNO);
-	cnt_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_cntbt.afake,
-			pag, XFS_BTNUM_CNT);
+	bno_cur = xfs_bnobt_init_cursor(sc->mp, NULL, NULL, pag);
+	xfs_btree_stage_afakeroot(bno_cur, &ra->new_bnobt.afake);
+
+	cnt_cur = xfs_cntbt_init_cursor(sc->mp, NULL, NULL, pag);
+	xfs_btree_stage_afakeroot(cnt_cur, &ra->new_cntbt.afake);
 
 	/* Last chance to abort before we start committing fixes. */
 	if (xchk_should_terminate(sc, &error))
@@ -765,10 +766,8 @@ xrep_abt_build_new_trees(
 	 * height so that we don't trip the verifiers when writing the new
 	 * btree blocks to disk.
 	 */
-	pag->pagf_repair_levels[XFS_BTNUM_BNOi] =
-					ra->new_bnobt.bload.btree_height;
-	pag->pagf_repair_levels[XFS_BTNUM_CNTi] =
-					ra->new_cntbt.bload.btree_height;
+	pag->pagf_repair_bno_level = ra->new_bnobt.bload.btree_height;
+	pag->pagf_repair_cnt_level = ra->new_cntbt.bload.btree_height;
 
 	/* Load the free space by length tree. */
 	ra->array_cur = XFARRAY_CURSOR_INIT;
@@ -807,8 +806,8 @@ xrep_abt_build_new_trees(
 	return xrep_roll_ag_trans(sc);
 
 err_levels:
-	pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0;
-	pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0;
+	pag->pagf_repair_bno_level = 0;
+	pag->pagf_repair_cnt_level = 0;
 err_cur:
 	xfs_btree_del_cursor(cnt_cur, error);
 	xfs_btree_del_cursor(bno_cur, error);
@@ -838,8 +837,8 @@ xrep_abt_remove_old_trees(
 	 * Now that we've zapped all the old allocbt blocks we can turn off
 	 * the alternate height mechanism.
 	 */
-	pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0;
-	pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0;
+	pag->pagf_repair_bno_level = 0;
+	pag->pagf_repair_cnt_level = 0;
 	return 0;
 }
 
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index 1449bb5262d9..0cb8d43912a8 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -566,3 +566,17 @@ xbitmap32_test(
 	*len = bn->bn_start - start;
 	return false;
 }
+
+/* Count the number of set regions in this bitmap. */
+uint32_t
+xbitmap32_count_set_regions(
+	struct xbitmap32	*bitmap)
+{
+	struct xbitmap32_node	*bn;
+	uint32_t		nr = 0;
+
+	for_each_xbitmap32_extent(bn, bitmap)
+		nr++;
+
+	return nr;
+}
diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h
index 2df8911606d6..710c1ac5e323 100644
--- a/fs/xfs/scrub/bitmap.h
+++ b/fs/xfs/scrub/bitmap.h
@@ -62,4 +62,6 @@ int xbitmap32_walk(struct xbitmap32 *bitmap, xbitmap32_walk_fn fn,
 bool xbitmap32_empty(struct xbitmap32 *bitmap);
 bool xbitmap32_test(struct xbitmap32 *bitmap, uint32_t start, uint32_t *len);
 
+uint32_t xbitmap32_count_set_regions(struct xbitmap32 *bitmap);
+
 #endif	/* __XFS_SCRUB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index b169cddde6da..24a15bf784f1 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -924,7 +924,7 @@ xchk_bmap(
 	if (!ifp)
 		return -ENOENT;
 
-	info.is_rt = whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip);
+	info.is_rt = xfs_ifork_is_realtime(ip, whichfork);
 	info.whichfork = whichfork;
 	info.is_shared = whichfork == XFS_DATA_FORK && xfs_is_reflink_inode(ip);
 	info.sc = sc;
diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c
index a4bb89fdd510..1e656fab5e41 100644
--- a/fs/xfs/scrub/bmap_repair.c
+++ b/fs/xfs/scrub/bmap_repair.c
@@ -639,7 +639,13 @@ xrep_bmap_build_new_fork(
 	rb->new_bmapbt.bload.get_records = xrep_bmap_get_records;
 	rb->new_bmapbt.bload.claim_block = xrep_bmap_claim_block;
 	rb->new_bmapbt.bload.iroot_size = xrep_bmap_iroot_size;
-	bmap_cur = xfs_bmbt_stage_cursor(sc->mp, sc->ip, ifake);
+
+	/*
+	 * Allocate a new bmap btree cursor for reloading an inode block mapping
+	 * data structure.
+	 */
+	bmap_cur = xfs_bmbt_init_cursor(sc->mp, NULL, sc->ip, XFS_STAGING_FORK);
+	xfs_btree_stage_ifakeroot(bmap_cur, ifake);
 
 	/*
 	 * Figure out the size and format of the new fork, then fill it with
diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c
index 1935b9ce1885..fe678a0438bc 100644
--- a/fs/xfs/scrub/btree.c
+++ b/fs/xfs/scrub/btree.c
@@ -47,7 +47,7 @@ __xchk_btree_process_error(
 		*error = 0;
 		fallthrough;
 	default:
-		if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+		if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
 			trace_xchk_ifork_btree_op_error(sc, cur, level,
 					*error, ret_ip);
 		else
@@ -91,7 +91,7 @@ __xchk_btree_set_corrupt(
 {
 	sc->sm->sm_flags |= errflag;
 
-	if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+	if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
 		trace_xchk_ifork_btree_error(sc, cur, level,
 				ret_ip);
 	else
@@ -168,7 +168,7 @@ xchk_btree_rec(
 	if (xfs_btree_keycmp_lt(cur, &key, keyp))
 		xchk_btree_set_corrupt(bs->sc, cur, 1);
 
-	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+	if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
 		return;
 
 	/* Is high_key(rec) no larger than the parent high key? */
@@ -215,7 +215,7 @@ xchk_btree_key(
 	if (xfs_btree_keycmp_lt(cur, key, keyp))
 		xchk_btree_set_corrupt(bs->sc, cur, level);
 
-	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+	if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
 		return;
 
 	/* Is this block's high key no larger than the parent high key? */
@@ -236,22 +236,18 @@ xchk_btree_ptr_ok(
 	int			level,
 	union xfs_btree_ptr	*ptr)
 {
-	bool			res;
-
 	/* A btree rooted in an inode has no block pointer to the root. */
-	if ((bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+	if (bs->cur->bc_ops->type == XFS_BTREE_TYPE_INODE &&
 	    level == bs->cur->bc_nlevels)
 		return true;
 
 	/* Otherwise, check the pointers. */
-	if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
-		res = xfs_btree_check_lptr(bs->cur, be64_to_cpu(ptr->l), level);
-	else
-		res = xfs_btree_check_sptr(bs->cur, be32_to_cpu(ptr->s), level);
-	if (!res)
+	if (__xfs_btree_check_ptr(bs->cur, ptr, 0, level)) {
 		xchk_btree_set_corrupt(bs->sc, bs->cur, level);
+		return false;
+	}
 
-	return res;
+	return true;
 }
 
 /* Check that a btree block's sibling matches what we expect it. */
@@ -374,18 +370,21 @@ xchk_btree_check_block_owner(
 {
 	xfs_agnumber_t		agno;
 	xfs_agblock_t		agbno;
-	xfs_btnum_t		btnum;
 	bool			init_sa;
 	int			error = 0;
 
 	if (!bs->cur)
 		return 0;
 
-	btnum = bs->cur->bc_btnum;
 	agno = xfs_daddr_to_agno(bs->cur->bc_mp, daddr);
 	agbno = xfs_daddr_to_agbno(bs->cur->bc_mp, daddr);
 
-	init_sa = bs->cur->bc_flags & XFS_BTREE_LONG_PTRS;
+	/*
+	 * If the btree being examined is not itself a per-AG btree, initialize
+	 * sc->sa so that we can check for the presence of an ownership record
+	 * in the rmap btree for the AG containing the block.
+	 */
+	init_sa = bs->cur->bc_ops->type != XFS_BTREE_TYPE_AG;
 	if (init_sa) {
 		error = xchk_ag_init_existing(bs->sc, agno, &bs->sc->sa);
 		if (!xchk_btree_xref_process_error(bs->sc, bs->cur,
@@ -399,11 +398,11 @@ xchk_btree_check_block_owner(
 	 * have to nullify it (to shut down further block owner checks) if
 	 * self-xref encounters problems.
 	 */
-	if (!bs->sc->sa.bno_cur && btnum == XFS_BTNUM_BNO)
+	if (!bs->sc->sa.bno_cur && xfs_btree_is_bno(bs->cur->bc_ops))
 		bs->cur = NULL;
 
 	xchk_xref_is_only_owned_by(bs->sc, agbno, 1, bs->oinfo);
-	if (!bs->sc->sa.rmap_cur && btnum == XFS_BTNUM_RMAP)
+	if (!bs->sc->sa.rmap_cur && xfs_btree_is_rmap(bs->cur->bc_ops))
 		bs->cur = NULL;
 
 out_free:
@@ -429,7 +428,7 @@ xchk_btree_check_owner(
 	 * up.
 	 */
 	if (bp == NULL) {
-		if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
+		if (cur->bc_ops->type != XFS_BTREE_TYPE_INODE)
 			xchk_btree_set_corrupt(bs->sc, bs->cur, level);
 		return 0;
 	}
@@ -442,7 +441,7 @@ xchk_btree_check_owner(
 	 * duplicate cursors.  Therefore, save the buffer daddr for
 	 * later scanning.
 	 */
-	if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) {
+	if (xfs_btree_is_bno(cur->bc_ops) || xfs_btree_is_rmap(cur->bc_ops)) {
 		struct check_owner	*co;
 
 		co = kmalloc(sizeof(struct check_owner), XCHK_GFP_FLAGS);
@@ -475,7 +474,7 @@ xchk_btree_check_iroot_minrecs(
 	 * existing filesystems, so instead we disable the check for data fork
 	 * bmap btrees when there's an attr fork.
 	 */
-	if (bs->cur->bc_btnum == XFS_BTNUM_BMAP &&
+	if (xfs_btree_is_bmap(bs->cur->bc_ops) &&
 	    bs->cur->bc_ino.whichfork == XFS_DATA_FORK &&
 	    xfs_inode_has_attr_fork(bs->sc->ip))
 		return false;
@@ -508,7 +507,7 @@ xchk_btree_check_minrecs(
 	 * child block might be less than the standard minrecs, but that's ok
 	 * provided that there's only one direct child of the root.
 	 */
-	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+	if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE &&
 	    level == cur->bc_nlevels - 2) {
 		struct xfs_btree_block	*root_block;
 		struct xfs_buf		*root_bp;
@@ -562,7 +561,7 @@ xchk_btree_block_check_keys(
 		return;
 	}
 
-	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+	if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
 		return;
 
 	/* Make sure the high key of this block matches the parent. */
@@ -585,7 +584,6 @@ xchk_btree_get_block(
 	struct xfs_btree_block	**pblock,
 	struct xfs_buf		**pbp)
 {
-	xfs_failaddr_t		failed_at;
 	int			error;
 
 	*pblock = NULL;
@@ -597,13 +595,7 @@ xchk_btree_get_block(
 		return error;
 
 	xfs_btree_get_block(bs->cur, level, pbp);
-	if (bs->cur->bc_flags & XFS_BTREE_LONG_PTRS)
-		failed_at = __xfs_btree_check_lblock(bs->cur, *pblock,
-				level, *pbp);
-	else
-		failed_at = __xfs_btree_check_sblock(bs->cur, *pblock,
-				 level, *pbp);
-	if (failed_at) {
+	if (__xfs_btree_check_block(bs->cur, *pblock, level, *pbp)) {
 		xchk_btree_set_corrupt(bs->sc, bs->cur, level);
 		return 0;
 	}
@@ -664,7 +656,7 @@ xchk_btree_block_keys(
 	if (xfs_btree_keycmp_ne(cur, &block_keys, parent_keys))
 		xchk_btree_set_corrupt(bs->sc, cur, 1);
 
-	if (!(cur->bc_flags & XFS_BTREE_OVERLAPPING))
+	if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
 		return;
 
 	/* Get high keys */
@@ -728,7 +720,7 @@ xchk_btree(
 	 * error codes for us.
 	 */
 	level = cur->bc_nlevels - 1;
-	cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+	xfs_btree_init_ptr_from_cur(cur, &ptr);
 	if (!xchk_btree_ptr_ok(bs, cur->bc_nlevels, &ptr))
 		goto out;
 	error = xchk_btree_get_block(bs, level, &ptr, &block, &bp);
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index 81f2b96bb5a7..47a20cf5205f 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -29,6 +29,8 @@
 #include "xfs_attr.h"
 #include "xfs_reflink.h"
 #include "xfs_ag.h"
+#include "xfs_error.h"
+#include "xfs_quota.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -82,6 +84,15 @@ __xchk_process_error(
 				sc->ip ? sc->ip : XFS_I(file_inode(sc->file)),
 				sc->sm, *error);
 		break;
+	case -ECANCELED:
+		/*
+		 * ECANCELED here means that the caller set one of the scrub
+		 * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
+		 * quickly.  Set error to zero and do not continue.
+		 */
+		trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
+		*error = 0;
+		break;
 	case -EFSBADCRC:
 	case -EFSCORRUPTED:
 		/* Note the badness but don't abort. */
@@ -89,8 +100,7 @@ __xchk_process_error(
 		*error = 0;
 		fallthrough;
 	default:
-		trace_xchk_op_error(sc, agno, bno, *error,
-				ret_ip);
+		trace_xchk_op_error(sc, agno, bno, *error, ret_ip);
 		break;
 	}
 	return false;
@@ -136,6 +146,16 @@ __xchk_fblock_process_error(
 		/* Used to restart an op with deadlock avoidance. */
 		trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
 		break;
+	case -ECANCELED:
+		/*
+		 * ECANCELED here means that the caller set one of the scrub
+		 * outcome flags (corrupt, xfail, xcorrupt) and wants to exit
+		 * quickly.  Set error to zero and do not continue.
+		 */
+		trace_xchk_file_op_error(sc, whichfork, offset, *error,
+				ret_ip);
+		*error = 0;
+		break;
 	case -EFSBADCRC:
 	case -EFSCORRUPTED:
 		/* Note the badness but don't abort. */
@@ -227,6 +247,19 @@ xchk_block_set_corrupt(
 	trace_xchk_block_error(sc, xfs_buf_daddr(bp), __return_address);
 }
 
+#ifdef CONFIG_XFS_QUOTA
+/* Record a corrupt quota counter. */
+void
+xchk_qcheck_set_corrupt(
+	struct xfs_scrub	*sc,
+	unsigned int		dqtype,
+	xfs_dqid_t		id)
+{
+	sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT;
+	trace_xchk_qcheck_error(sc, dqtype, id, __return_address);
+}
+#endif
+
 /* Record a corruption while cross-referencing. */
 void
 xchk_block_xref_set_corrupt(
@@ -427,7 +460,7 @@ xchk_perag_read_headers(
  * Grab the AG headers for the attached perag structure and wait for pending
  * intents to drain.
  */
-static int
+int
 xchk_perag_drain_and_lock(
 	struct xfs_scrub	*sc)
 {
@@ -555,46 +588,50 @@ xchk_ag_btcur_init(
 {
 	struct xfs_mount	*mp = sc->mp;
 
-	if (sa->agf_bp &&
-	    xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_BNO)) {
+	if (sa->agf_bp) {
 		/* Set up a bnobt cursor for cross-referencing. */
-		sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
-				sa->pag, XFS_BTNUM_BNO);
-	}
+		sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp,
+				sa->pag);
+		xchk_ag_btree_del_cursor_if_sick(sc, &sa->bno_cur,
+				XFS_SCRUB_TYPE_BNOBT);
 
-	if (sa->agf_bp &&
-	    xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_CNT)) {
 		/* Set up a cntbt cursor for cross-referencing. */
-		sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
-				sa->pag, XFS_BTNUM_CNT);
-	}
-
-	/* Set up a inobt cursor for cross-referencing. */
-	if (sa->agi_bp &&
-	    xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_INO)) {
-		sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp,
-				XFS_BTNUM_INO);
-	}
-
-	/* Set up a finobt cursor for cross-referencing. */
-	if (sa->agi_bp && xfs_has_finobt(mp) &&
-	    xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_FINO)) {
-		sa->fino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp, sa->agi_bp,
-				XFS_BTNUM_FINO);
-	}
-
-	/* Set up a rmapbt cursor for cross-referencing. */
-	if (sa->agf_bp && xfs_has_rmapbt(mp) &&
-	    xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_RMAP)) {
-		sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp,
+		sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp,
 				sa->pag);
+		xchk_ag_btree_del_cursor_if_sick(sc, &sa->cnt_cur,
+				XFS_SCRUB_TYPE_CNTBT);
+
+		/* Set up a rmapbt cursor for cross-referencing. */
+		if (xfs_has_rmapbt(mp)) {
+			sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp,
+					sa->agf_bp, sa->pag);
+			xchk_ag_btree_del_cursor_if_sick(sc, &sa->rmap_cur,
+					XFS_SCRUB_TYPE_RMAPBT);
+		}
+
+		/* Set up a refcountbt cursor for cross-referencing. */
+		if (xfs_has_reflink(mp)) {
+			sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
+					sa->agf_bp, sa->pag);
+			xchk_ag_btree_del_cursor_if_sick(sc, &sa->refc_cur,
+					XFS_SCRUB_TYPE_REFCNTBT);
+		}
 	}
 
-	/* Set up a refcountbt cursor for cross-referencing. */
-	if (sa->agf_bp && xfs_has_reflink(mp) &&
-	    xchk_ag_btree_healthy_enough(sc, sa->pag, XFS_BTNUM_REFC)) {
-		sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
-				sa->agf_bp, sa->pag);
+	if (sa->agi_bp) {
+		/* Set up a inobt cursor for cross-referencing. */
+		sa->ino_cur = xfs_inobt_init_cursor(sa->pag, sc->tp,
+				sa->agi_bp);
+		xchk_ag_btree_del_cursor_if_sick(sc, &sa->ino_cur,
+				XFS_SCRUB_TYPE_INOBT);
+
+		/* Set up a finobt cursor for cross-referencing. */
+		if (xfs_has_finobt(mp)) {
+			sa->fino_cur = xfs_finobt_init_cursor(sa->pag, sc->tp,
+					sa->agi_bp);
+			xchk_ag_btree_del_cursor_if_sick(sc, &sa->fino_cur,
+					XFS_SCRUB_TYPE_FINOBT);
+		}
 	}
 }
 
@@ -653,6 +690,13 @@ xchk_trans_cancel(
 	sc->tp = NULL;
 }
 
+int
+xchk_trans_alloc_empty(
+	struct xfs_scrub	*sc)
+{
+	return xfs_trans_alloc_empty(sc->mp, &sc->tp);
+}
+
 /*
  * Grab an empty transaction so that we can re-grab locked buffers if
  * one of our btrees turns out to be cyclic.
@@ -672,7 +716,7 @@ xchk_trans_alloc(
 		return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate,
 				resblks, 0, 0, &sc->tp);
 
-	return xfs_trans_alloc_empty(sc->mp, &sc->tp);
+	return xchk_trans_alloc_empty(sc);
 }
 
 /* Set us up with a transaction and an empty context. */
@@ -1000,9 +1044,7 @@ xchk_irele(
 	struct xfs_scrub	*sc,
 	struct xfs_inode	*ip)
 {
-	if (current->journal_info != NULL) {
-		ASSERT(current->journal_info == sc->tp);
-
+	if (sc->tp) {
 		/*
 		 * If we are in a transaction, we /cannot/ drop the inode
 		 * ourselves, because the VFS will trigger writeback, which
@@ -1259,6 +1301,15 @@ xchk_fsgates_enable(
 	if (scrub_fsgates & XCHK_FSGATES_DRAIN)
 		xfs_drain_wait_enable();
 
+	if (scrub_fsgates & XCHK_FSGATES_QUOTA)
+		xfs_dqtrx_hook_enable();
+
+	if (scrub_fsgates & XCHK_FSGATES_DIRENTS)
+		xfs_dir_hook_enable();
+
+	if (scrub_fsgates & XCHK_FSGATES_RMAP)
+		xfs_rmap_hook_enable();
+
 	sc->flags |= scrub_fsgates;
 }
 
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index da09580b454a..89f7bbec887e 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -32,6 +32,7 @@ xchk_should_terminate(
 }
 
 int xchk_trans_alloc(struct xfs_scrub *sc, uint resblks);
+int xchk_trans_alloc_empty(struct xfs_scrub *sc);
 void xchk_trans_cancel(struct xfs_scrub *sc);
 
 bool xchk_process_error(struct xfs_scrub *sc, xfs_agnumber_t agno,
@@ -54,6 +55,10 @@ void xchk_block_set_corrupt(struct xfs_scrub *sc,
 void xchk_ino_set_corrupt(struct xfs_scrub *sc, xfs_ino_t ino);
 void xchk_fblock_set_corrupt(struct xfs_scrub *sc, int whichfork,
 		xfs_fileoff_t offset);
+#ifdef CONFIG_XFS_QUOTA
+void xchk_qcheck_set_corrupt(struct xfs_scrub *sc, unsigned int dqtype,
+		xfs_dqid_t id);
+#endif
 
 void xchk_block_xref_set_corrupt(struct xfs_scrub *sc,
 		struct xfs_buf *bp);
@@ -105,6 +110,7 @@ xchk_setup_rtsummary(struct xfs_scrub *sc)
 #ifdef CONFIG_XFS_QUOTA
 int xchk_ino_dqattach(struct xfs_scrub *sc);
 int xchk_setup_quota(struct xfs_scrub *sc);
+int xchk_setup_quotacheck(struct xfs_scrub *sc);
 #else
 static inline int
 xchk_ino_dqattach(struct xfs_scrub *sc)
@@ -116,12 +122,19 @@ xchk_setup_quota(struct xfs_scrub *sc)
 {
 	return -ENOENT;
 }
+static inline int
+xchk_setup_quotacheck(struct xfs_scrub *sc)
+{
+	return -ENOENT;
+}
 #endif
 int xchk_setup_fscounters(struct xfs_scrub *sc);
+int xchk_setup_nlinks(struct xfs_scrub *sc);
 
 void xchk_ag_free(struct xfs_scrub *sc, struct xchk_ag *sa);
 int xchk_ag_init(struct xfs_scrub *sc, xfs_agnumber_t agno,
 		struct xchk_ag *sa);
+int xchk_perag_drain_and_lock(struct xfs_scrub *sc);
 
 /*
  * Grab all AG resources, treating the inability to grab the perag structure as
diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c
index 1e82c727af8e..4de3f0f40f48 100644
--- a/fs/xfs/scrub/cow_repair.c
+++ b/fs/xfs/scrub/cow_repair.c
@@ -609,6 +609,6 @@ xrep_bmap_cow(
 out_bitmap:
 	xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks);
 	xoff_bitmap_destroy(&xc->bad_fileoffs);
-	kmem_free(xc);
+	kfree(xc);
 	return error;
 }
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index d86ab51af928..076a310b8eb0 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -93,11 +93,11 @@ xchk_dir_actor(
 		return -ECANCELED;
 	}
 
-	if (!strncmp(".", name->name, name->len)) {
+	if (xfs_dir2_samename(name, &xfs_name_dot)) {
 		/* If this is "." then check that the inum matches the dir. */
 		if (ino != dp->i_ino)
 			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
-	} else if (!strncmp("..", name->name, name->len)) {
+	} else if (xfs_dir2_samename(name, &xfs_name_dotdot)) {
 		/*
 		 * If this is ".." in the root inode, check that the inum
 		 * matches this dir.
diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
index 5799e9a94f1f..d310737c8823 100644
--- a/fs/xfs/scrub/fscounters.c
+++ b/fs/xfs/scrub/fscounters.c
@@ -22,6 +22,7 @@
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
+#include "scrub/fscounters.h"
 
 /*
  * FS Summary Counters
@@ -48,17 +49,6 @@
  * our tolerance for mismatch between expected and actual counter values.
  */
 
-struct xchk_fscounters {
-	struct xfs_scrub	*sc;
-	uint64_t		icount;
-	uint64_t		ifree;
-	uint64_t		fdblocks;
-	uint64_t		frextents;
-	unsigned long long	icount_min;
-	unsigned long long	icount_max;
-	bool			frozen;
-};
-
 /*
  * Since the expected value computation is lockless but only browses incore
  * values, the percpu counters should be fairly close to each other.  However,
@@ -235,14 +225,19 @@ xchk_setup_fscounters(
 	 * Pause all writer activity in the filesystem while we're scrubbing to
 	 * reduce the likelihood of background perturbations to the counters
 	 * throwing off our calculations.
+	 *
+	 * If we're repairing, we need to prevent any other thread from
+	 * changing the global fs summary counters while we're repairing them.
+	 * This requires the fs to be frozen, which will disable background
+	 * reclaim and purge all inactive inodes.
 	 */
-	if (sc->flags & XCHK_TRY_HARDER) {
+	if ((sc->flags & XCHK_TRY_HARDER) || xchk_could_repair(sc)) {
 		error = xchk_fscounters_freeze(sc);
 		if (error)
 			return error;
 	}
 
-	return xfs_trans_alloc_empty(sc->mp, &sc->tp);
+	return xchk_trans_alloc_empty(sc);
 }
 
 /*
@@ -254,7 +249,9 @@ xchk_setup_fscounters(
  * set the INCOMPLETE flag even when a negative errno is returned.  This care
  * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
  * ECANCELED) that are absorbed into a scrub state flag update by
- * xchk_*_process_error.
+ * xchk_*_process_error.  Scrub and repair share the same incore data
+ * structures, so the INCOMPLETE flag is critical to prevent a repair based on
+ * insufficient information.
  */
 
 /* Count free space btree blocks manually for pre-lazysbcount filesystems. */
@@ -482,6 +479,10 @@ xchk_fscount_within_range(
 	if (curr_value == expected)
 		return true;
 
+	/* We require exact matches when repair is running. */
+	if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+		return false;
+
 	min_value = min(old_value, curr_value);
 	max_value = max(old_value, curr_value);
 
diff --git a/fs/xfs/scrub/fscounters.h b/fs/xfs/scrub/fscounters.h
new file mode 100644
index 000000000000..461a13d25f4b
--- /dev/null
+++ b/fs/xfs/scrub/fscounters.h
@@ -0,0 +1,20 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_FSCOUNTERS_H__
+#define __XFS_SCRUB_FSCOUNTERS_H__
+
+struct xchk_fscounters {
+	struct xfs_scrub	*sc;
+	uint64_t		icount;
+	uint64_t		ifree;
+	uint64_t		fdblocks;
+	uint64_t		frextents;
+	unsigned long long	icount_min;
+	unsigned long long	icount_max;
+	bool			frozen;
+};
+
+#endif /* __XFS_SCRUB_FSCOUNTERS_H__ */
diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c
new file mode 100644
index 000000000000..94cdb852bee4
--- /dev/null
+++ b/fs/xfs/scrub/fscounters_repair.c
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "xfs_health.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/fscounters.h"
+
+/*
+ * FS Summary Counters
+ * ===================
+ *
+ * We correct errors in the filesystem summary counters by setting them to the
+ * values computed during the obligatory scrub phase.  However, we must be
+ * careful not to allow any other thread to change the counters while we're
+ * computing and setting new values.  To achieve this, we freeze the
+ * filesystem for the whole operation if the REPAIR flag is set.  The checking
+ * function is stricter when we've frozen the fs.
+ */
+
+/*
+ * Reset the superblock counters.  Caller is responsible for freezing the
+ * filesystem during the calculation and reset phases.
+ */
+int
+xrep_fscounters(
+	struct xfs_scrub	*sc)
+{
+	struct xfs_mount	*mp = sc->mp;
+	struct xchk_fscounters	*fsc = sc->buf;
+
+	/*
+	 * Reinitialize the in-core counters from what we computed.  We froze
+	 * the filesystem, so there shouldn't be anyone else trying to modify
+	 * these counters.
+	 */
+	if (!fsc->frozen) {
+		ASSERT(fsc->frozen);
+		return -EFSCORRUPTED;
+	}
+
+	trace_xrep_reset_counters(mp, fsc);
+
+	percpu_counter_set(&mp->m_icount, fsc->icount);
+	percpu_counter_set(&mp->m_ifree, fsc->ifree);
+	percpu_counter_set(&mp->m_fdblocks, fsc->fdblocks);
+	percpu_counter_set(&mp->m_frextents, fsc->frextents);
+	mp->m_sb.sb_frextents = fsc->frextents;
+
+	return 0;
+}
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index 531006910ca9..9020a6bef7f1 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -14,6 +14,7 @@
 #include "xfs_health.h"
 #include "scrub/scrub.h"
 #include "scrub/health.h"
+#include "scrub/common.h"
 
 /*
  * Scrub and In-Core Filesystem Health Assessments
@@ -105,6 +106,8 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = {
 	[XFS_SCRUB_TYPE_GQUOTA]		= { XHG_FS,  XFS_SICK_FS_GQUOTA },
 	[XFS_SCRUB_TYPE_PQUOTA]		= { XHG_FS,  XFS_SICK_FS_PQUOTA },
 	[XFS_SCRUB_TYPE_FSCOUNTERS]	= { XHG_FS,  XFS_SICK_FS_COUNTERS },
+	[XFS_SCRUB_TYPE_QUOTACHECK]	= { XHG_FS,  XFS_SICK_FS_QUOTACHECK },
+	[XFS_SCRUB_TYPE_NLINKS]		= { XHG_FS,  XFS_SICK_FS_NLINKS },
 };
 
 /* Return the health status mask for this scrub type. */
@@ -148,6 +151,24 @@ xchk_file_looks_zapped(
 }
 
 /*
+ * Scrub gave the filesystem a clean bill of health, so clear all the indirect
+ * markers of past problems (at least for the fs and ags) so that we can be
+ * healthy again.
+ */
+STATIC void
+xchk_mark_all_healthy(
+	struct xfs_mount	*mp)
+{
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		agno;
+
+	xfs_fs_mark_healthy(mp, XFS_SICK_FS_INDIRECT);
+	xfs_rt_mark_healthy(mp, XFS_SICK_RT_INDIRECT);
+	for_each_perag(mp, agno, pag)
+		xfs_ag_mark_healthy(pag, XFS_SICK_AG_INDIRECT);
+}
+
+/*
  * Update filesystem health assessments based on what we found and did.
  *
  * If the scrubber finds errors, we mark sick whatever's mentioned in
@@ -164,6 +185,18 @@ xchk_update_health(
 	struct xfs_perag	*pag;
 	bool			bad;
 
+	/*
+	 * The HEALTHY scrub type is a request from userspace to clear all the
+	 * indirect flags after a clean scan of the entire filesystem.  As such
+	 * there's no sick flag defined for it, so we branch here ahead of the
+	 * mask check.
+	 */
+	if (sc->sm->sm_type == XFS_SCRUB_TYPE_HEALTHY &&
+	    !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
+		xchk_mark_all_healthy(sc->mp);
+		return;
+	}
+
 	if (!sc->sick_mask)
 		return;
 
@@ -173,7 +206,7 @@ xchk_update_health(
 	case XHG_AG:
 		pag = xfs_perag_get(sc->mp, sc->sm->sm_agno);
 		if (bad)
-			xfs_ag_mark_sick(pag, sc->sick_mask);
+			xfs_ag_mark_corrupt(pag, sc->sick_mask);
 		else
 			xfs_ag_mark_healthy(pag, sc->sick_mask);
 		xfs_perag_put(pag);
@@ -181,20 +214,30 @@ xchk_update_health(
 	case XHG_INO:
 		if (!sc->ip)
 			return;
-		if (bad)
-			xfs_inode_mark_sick(sc->ip, sc->sick_mask);
-		else
+		if (bad) {
+			unsigned int	mask = sc->sick_mask;
+
+			/*
+			 * If we're coming in for repairs then we don't want
+			 * sickness flags to propagate to the incore health
+			 * status if the inode gets inactivated before we can
+			 * fix it.
+			 */
+			if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)
+				mask |= XFS_SICK_INO_FORGET;
+			xfs_inode_mark_corrupt(sc->ip, mask);
+		} else
 			xfs_inode_mark_healthy(sc->ip, sc->sick_mask);
 		break;
 	case XHG_FS:
 		if (bad)
-			xfs_fs_mark_sick(sc->mp, sc->sick_mask);
+			xfs_fs_mark_corrupt(sc->mp, sc->sick_mask);
 		else
 			xfs_fs_mark_healthy(sc->mp, sc->sick_mask);
 		break;
 	case XHG_RT:
 		if (bad)
-			xfs_rt_mark_sick(sc->mp, sc->sick_mask);
+			xfs_rt_mark_corrupt(sc->mp, sc->sick_mask);
 		else
 			xfs_rt_mark_healthy(sc->mp, sc->sick_mask);
 		break;
@@ -205,13 +248,13 @@ xchk_update_health(
 }
 
 /* Is the given per-AG btree healthy enough for scanning? */
-bool
-xchk_ag_btree_healthy_enough(
+void
+xchk_ag_btree_del_cursor_if_sick(
 	struct xfs_scrub	*sc,
-	struct xfs_perag	*pag,
-	xfs_btnum_t		btnum)
+	struct xfs_btree_cur	**curp,
+	unsigned int		sm_type)
 {
-	unsigned int		mask = 0;
+	unsigned int		mask = (*curp)->bc_ops->sick_mask;
 
 	/*
 	 * We always want the cursor if it's the same type as whatever we're
@@ -220,41 +263,8 @@ xchk_ag_btree_healthy_enough(
 	 * Otherwise, we're only interested in the btree for cross-referencing.
 	 * If we know the btree is bad then don't bother, just set XFAIL.
 	 */
-	switch (btnum) {
-	case XFS_BTNUM_BNO:
-		if (sc->sm->sm_type == XFS_SCRUB_TYPE_BNOBT)
-			return true;
-		mask = XFS_SICK_AG_BNOBT;
-		break;
-	case XFS_BTNUM_CNT:
-		if (sc->sm->sm_type == XFS_SCRUB_TYPE_CNTBT)
-			return true;
-		mask = XFS_SICK_AG_CNTBT;
-		break;
-	case XFS_BTNUM_INO:
-		if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT)
-			return true;
-		mask = XFS_SICK_AG_INOBT;
-		break;
-	case XFS_BTNUM_FINO:
-		if (sc->sm->sm_type == XFS_SCRUB_TYPE_FINOBT)
-			return true;
-		mask = XFS_SICK_AG_FINOBT;
-		break;
-	case XFS_BTNUM_RMAP:
-		if (sc->sm->sm_type == XFS_SCRUB_TYPE_RMAPBT)
-			return true;
-		mask = XFS_SICK_AG_RMAPBT;
-		break;
-	case XFS_BTNUM_REFC:
-		if (sc->sm->sm_type == XFS_SCRUB_TYPE_REFCNTBT)
-			return true;
-		mask = XFS_SICK_AG_REFCNTBT;
-		break;
-	default:
-		ASSERT(0);
-		return true;
-	}
+	if (sc->sm->sm_type == sm_type)
+		return;
 
 	/*
 	 * If we just repaired some AG metadata, sc->sick_mask will reflect all
@@ -266,10 +276,42 @@ xchk_ag_btree_healthy_enough(
 	    type_to_health_flag[sc->sm->sm_type].group == XHG_AG)
 		mask &= ~sc->sick_mask;
 
-	if (xfs_ag_has_sickness(pag, mask)) {
+	if (xfs_ag_has_sickness((*curp)->bc_ag.pag, mask)) {
 		sc->sm->sm_flags |= XFS_SCRUB_OFLAG_XFAIL;
-		return false;
+		xfs_btree_del_cursor(*curp, XFS_BTREE_NOERROR);
+		*curp = NULL;
+	}
+}
+
+/*
+ * Quick scan to double-check that there isn't any evidence of lingering
+ * primary health problems.  If we're still clear, then the health update will
+ * take care of clearing the indirect evidence.
+ */
+int
+xchk_health_record(
+	struct xfs_scrub	*sc)
+{
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		agno;
+
+	unsigned int		sick;
+	unsigned int		checked;
+
+	xfs_fs_measure_sickness(mp, &sick, &checked);
+	if (sick & XFS_SICK_FS_PRIMARY)
+		xchk_set_corrupt(sc);
+
+	xfs_rt_measure_sickness(mp, &sick, &checked);
+	if (sick & XFS_SICK_RT_PRIMARY)
+		xchk_set_corrupt(sc);
+
+	for_each_perag(mp, agno, pag) {
+		xfs_ag_measure_sickness(pag, &sick, &checked);
+		if (sick & XFS_SICK_AG_PRIMARY)
+			xchk_set_corrupt(sc);
 	}
 
-	return true;
+	return 0;
 }
diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h
index a731b2467399..63fc426eb5ae 100644
--- a/fs/xfs/scrub/health.h
+++ b/fs/xfs/scrub/health.h
@@ -8,9 +8,10 @@
 
 unsigned int xchk_health_mask_for_scrub_type(__u32 scrub_type);
 void xchk_update_health(struct xfs_scrub *sc);
-bool xchk_ag_btree_healthy_enough(struct xfs_scrub *sc, struct xfs_perag *pag,
-		xfs_btnum_t btnum);
+void xchk_ag_btree_del_cursor_if_sick(struct xfs_scrub *sc,
+		struct xfs_btree_cur **curp, unsigned int sm_type);
 void xchk_mark_healthy_if_clean(struct xfs_scrub *sc, unsigned int mask);
 bool xchk_file_looks_zapped(struct xfs_scrub *sc, unsigned int mask);
+int xchk_health_record(struct xfs_scrub *sc);
 
 #endif /* __XFS_SCRUB_HEALTH_H__ */
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index a720fc62262a..750d7b0cd25a 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -76,7 +76,7 @@ xchk_inobt_xref_finobt(
 	int			has_record;
 	int			error;
 
-	ASSERT(cur->bc_btnum == XFS_BTNUM_FINO);
+	ASSERT(xfs_btree_is_fino(cur->bc_ops));
 
 	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record);
 	if (error)
@@ -179,7 +179,7 @@ xchk_finobt_xref_inobt(
 	int			has_record;
 	int			error;
 
-	ASSERT(cur->bc_btnum == XFS_BTNUM_INO);
+	ASSERT(xfs_btree_is_ino(cur->bc_ops));
 
 	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_record);
 	if (error)
@@ -514,7 +514,7 @@ xchk_iallocbt_rec_alignment(
 	 * Otherwise, we expect that the finobt record is aligned to the
 	 * cluster alignment as told by the superblock.
 	 */
-	if (bs->cur->bc_btnum == XFS_BTNUM_FINO) {
+	if (xfs_btree_is_fino(bs->cur->bc_ops)) {
 		unsigned int	imask;
 
 		imask = min_t(unsigned int, XFS_INODES_PER_CHUNK,
@@ -649,8 +649,7 @@ out:
  */
 STATIC void
 xchk_iallocbt_xref_rmap_btreeblks(
-	struct xfs_scrub	*sc,
-	int			which)
+	struct xfs_scrub	*sc)
 {
 	xfs_filblks_t		blocks;
 	xfs_extlen_t		inobt_blocks = 0;
@@ -688,7 +687,6 @@ xchk_iallocbt_xref_rmap_btreeblks(
 STATIC void
 xchk_iallocbt_xref_rmap_inodes(
 	struct xfs_scrub	*sc,
-	int			which,
 	unsigned long long	inodes)
 {
 	xfs_filblks_t		blocks;
@@ -719,17 +717,14 @@ xchk_iallocbt(
 		.next_startino	= NULLAGINO,
 		.next_cluster_ino = NULLAGINO,
 	};
-	xfs_btnum_t		which;
 	int			error;
 
 	switch (sc->sm->sm_type) {
 	case XFS_SCRUB_TYPE_INOBT:
 		cur = sc->sa.ino_cur;
-		which = XFS_BTNUM_INO;
 		break;
 	case XFS_SCRUB_TYPE_FINOBT:
 		cur = sc->sa.fino_cur;
-		which = XFS_BTNUM_FINO;
 		break;
 	default:
 		ASSERT(0);
@@ -741,7 +736,7 @@ xchk_iallocbt(
 	if (error)
 		return error;
 
-	xchk_iallocbt_xref_rmap_btreeblks(sc, which);
+	xchk_iallocbt_xref_rmap_btreeblks(sc);
 
 	/*
 	 * If we're scrubbing the inode btree, inode_blocks is the number of
@@ -750,9 +745,8 @@ xchk_iallocbt(
 	 * knows about.  We can't do this for the finobt since it only points
 	 * to inode chunks with free inodes.
 	 */
-	if (which == XFS_BTNUM_INO)
-		xchk_iallocbt_xref_rmap_inodes(sc, which, iabt.inodes);
-
+	if (sc->sm->sm_type == XFS_SCRUB_TYPE_INOBT)
+		xchk_iallocbt_xref_rmap_inodes(sc, iabt.inodes);
 	return error;
 }
 
diff --git a/fs/xfs/scrub/ialloc_repair.c b/fs/xfs/scrub/ialloc_repair.c
index b3f7182dd2f5..a00ec7ae1792 100644
--- a/fs/xfs/scrub/ialloc_repair.c
+++ b/fs/xfs/scrub/ialloc_repair.c
@@ -369,7 +369,7 @@ xrep_ibt_check_inode_ext(
 	 * On a sparse inode fs, this cluster could be part of a sparse chunk.
 	 * Sparse clusters must be aligned to sparse chunk alignment.
 	 */
-	if (xfs_has_sparseinodes(mp) &&
+	if (xfs_has_sparseinodes(mp) && mp->m_sb.sb_spino_align &&
 	    (!IS_ALIGNED(agbno, mp->m_sb.sb_spino_align) ||
 	     !IS_ALIGNED(agbno + len, mp->m_sb.sb_spino_align)))
 		return -EFSCORRUPTED;
@@ -663,8 +663,8 @@ xrep_ibt_build_new_trees(
 	ri->new_inobt.bload.claim_block = xrep_ibt_claim_block;
 	ri->new_inobt.bload.get_records = xrep_ibt_get_records;
 
-	ino_cur = xfs_inobt_stage_cursor(sc->sa.pag, &ri->new_inobt.afake,
-			XFS_BTNUM_INO);
+	ino_cur = xfs_inobt_init_cursor(sc->sa.pag, NULL, NULL);
+	xfs_btree_stage_afakeroot(ino_cur, &ri->new_inobt.afake);
 	error = xfs_btree_bload_compute_geometry(ino_cur, &ri->new_inobt.bload,
 			xfarray_length(ri->inode_records));
 	if (error)
@@ -684,8 +684,8 @@ xrep_ibt_build_new_trees(
 		ri->new_finobt.bload.claim_block = xrep_fibt_claim_block;
 		ri->new_finobt.bload.get_records = xrep_fibt_get_records;
 
-		fino_cur = xfs_inobt_stage_cursor(sc->sa.pag,
-				&ri->new_finobt.afake, XFS_BTNUM_FINO);
+		fino_cur = xfs_finobt_init_cursor(sc->sa.pag, NULL, NULL);
+		xfs_btree_stage_afakeroot(fino_cur, &ri->new_finobt.afake);
 		error = xfs_btree_bload_compute_geometry(fino_cur,
 				&ri->new_finobt.bload, ri->finobt_recs);
 		if (error)
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
index 0ca62d59f84a..eab380e95ef4 100644
--- a/fs/xfs/scrub/inode_repair.c
+++ b/fs/xfs/scrub/inode_repair.c
@@ -37,12 +37,15 @@
 #include "xfs_attr_leaf.h"
 #include "xfs_log_priv.h"
 #include "xfs_health.h"
+#include "xfs_symlink_remote.h"
 #include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/btree.h"
 #include "scrub/trace.h"
 #include "scrub/repair.h"
+#include "scrub/iscan.h"
+#include "scrub/readdir.h"
 
 /*
  * Inode Record Repair
@@ -126,6 +129,10 @@ struct xrep_inode {
 
 	/* Must we remove all access from this file? */
 	bool			zap_acls;
+
+	/* Inode scanner to see if we can find the ftype from dirents */
+	struct xchk_iscan	ftype_iscan;
+	uint8_t			alleged_ftype;
 };
 
 /*
@@ -227,26 +234,233 @@ xrep_dinode_header(
 	dip->di_gen = cpu_to_be32(sc->sm->sm_gen);
 }
 
-/* Turn di_mode into /something/ recognizable. */
-STATIC void
+/*
+ * If this directory entry points to the scrub target inode, then the directory
+ * we're scanning is the parent of the scrub target inode.
+ */
+STATIC int
+xrep_dinode_findmode_dirent(
+	struct xfs_scrub		*sc,
+	struct xfs_inode		*dp,
+	xfs_dir2_dataptr_t		dapos,
+	const struct xfs_name		*name,
+	xfs_ino_t			ino,
+	void				*priv)
+{
+	struct xrep_inode		*ri = priv;
+	int				error = 0;
+
+	if (xchk_should_terminate(ri->sc, &error))
+		return error;
+
+	if (ino != sc->sm->sm_ino)
+		return 0;
+
+	/* Ignore garbage directory entry names. */
+	if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len))
+		return -EFSCORRUPTED;
+
+	/* Don't pick up dot or dotdot entries; we only want child dirents. */
+	if (xfs_dir2_samename(name, &xfs_name_dotdot) ||
+	    xfs_dir2_samename(name, &xfs_name_dot))
+		return 0;
+
+	/*
+	 * Uhoh, more than one parent for this inode and they don't agree on
+	 * the file type?
+	 */
+	if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN &&
+	    ri->alleged_ftype != name->type) {
+		trace_xrep_dinode_findmode_dirent_inval(ri->sc, dp, name->type,
+				ri->alleged_ftype);
+		return -EFSCORRUPTED;
+	}
+
+	/* We found a potential parent; remember the ftype. */
+	trace_xrep_dinode_findmode_dirent(ri->sc, dp, name->type);
+	ri->alleged_ftype = name->type;
+	return 0;
+}
+
+/*
+ * If this is a directory, walk the dirents looking for any that point to the
+ * scrub target inode.
+ */
+STATIC int
+xrep_dinode_findmode_walk_directory(
+	struct xrep_inode	*ri,
+	struct xfs_inode	*dp)
+{
+	struct xfs_scrub	*sc = ri->sc;
+	unsigned int		lock_mode;
+	int			error = 0;
+
+	/*
+	 * Scan the directory to see if there it contains an entry pointing to
+	 * the directory that we are repairing.
+	 */
+	lock_mode = xfs_ilock_data_map_shared(dp);
+
+	/*
+	 * If this directory is known to be sick, we cannot scan it reliably
+	 * and must abort.
+	 */
+	if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE |
+				       XFS_SICK_INO_BMBTD |
+				       XFS_SICK_INO_DIR)) {
+		error = -EFSCORRUPTED;
+		goto out_unlock;
+	}
+
+	/*
+	 * We cannot complete our parent pointer scan if a directory looks as
+	 * though it has been zapped by the inode record repair code.
+	 */
+	if (xchk_dir_looks_zapped(dp)) {
+		error = -EBUSY;
+		goto out_unlock;
+	}
+
+	error = xchk_dir_walk(sc, dp, xrep_dinode_findmode_dirent, ri);
+	if (error)
+		goto out_unlock;
+
+out_unlock:
+	xfs_iunlock(dp, lock_mode);
+	return error;
+}
+
+/*
+ * Try to find the mode of the inode being repaired by looking for directories
+ * that point down to this file.
+ */
+STATIC int
+xrep_dinode_find_mode(
+	struct xrep_inode	*ri,
+	uint16_t		*mode)
+{
+	struct xfs_scrub	*sc = ri->sc;
+	struct xfs_inode	*dp;
+	int			error;
+
+	/* No ftype means we have no other metadata to consult. */
+	if (!xfs_has_ftype(sc->mp)) {
+		*mode = S_IFREG;
+		return 0;
+	}
+
+	/*
+	 * Scan all directories for parents that might point down to this
+	 * inode.  Skip the inode being repaired during the scan since it
+	 * cannot be its own parent.  Note that we still hold the AGI locked
+	 * so there's a real possibility that _iscan_iter can return EBUSY.
+	 */
+	xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan);
+	ri->ftype_iscan.skip_ino = sc->sm->sm_ino;
+	ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN;
+	while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) {
+		if (S_ISDIR(VFS_I(dp)->i_mode))
+			error = xrep_dinode_findmode_walk_directory(ri, dp);
+		xchk_iscan_mark_visited(&ri->ftype_iscan, dp);
+		xchk_irele(sc, dp);
+		if (error < 0)
+			break;
+		if (xchk_should_terminate(sc, &error))
+			break;
+	}
+	xchk_iscan_iter_finish(&ri->ftype_iscan);
+	xchk_iscan_teardown(&ri->ftype_iscan);
+
+	if (error == -EBUSY) {
+		if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN) {
+			/*
+			 * If we got an EBUSY after finding at least one
+			 * dirent, that means the scan found an inode on the
+			 * inactivation list and could not open it.  Accept the
+			 * alleged ftype and install a new mode below.
+			 */
+			error = 0;
+		} else if (!(sc->flags & XCHK_TRY_HARDER)) {
+			/*
+			 * Otherwise, retry the operation one time to see if
+			 * the reason for the delay is an inode from the same
+			 * cluster buffer waiting on the inactivation list.
+			 */
+			error = -EDEADLOCK;
+		}
+	}
+	if (error)
+		return error;
+
+	/*
+	 * Convert the discovered ftype into the file mode.  If all else fails,
+	 * return S_IFREG.
+	 */
+	switch (ri->alleged_ftype) {
+	case XFS_DIR3_FT_DIR:
+		*mode = S_IFDIR;
+		break;
+	case XFS_DIR3_FT_WHT:
+	case XFS_DIR3_FT_CHRDEV:
+		*mode = S_IFCHR;
+		break;
+	case XFS_DIR3_FT_BLKDEV:
+		*mode = S_IFBLK;
+		break;
+	case XFS_DIR3_FT_FIFO:
+		*mode = S_IFIFO;
+		break;
+	case XFS_DIR3_FT_SOCK:
+		*mode = S_IFSOCK;
+		break;
+	case XFS_DIR3_FT_SYMLINK:
+		*mode = S_IFLNK;
+		break;
+	default:
+		*mode = S_IFREG;
+		break;
+	}
+	return 0;
+}
+
+/* Turn di_mode into /something/ recognizable.  Returns true if we succeed. */
+STATIC int
 xrep_dinode_mode(
 	struct xrep_inode	*ri,
 	struct xfs_dinode	*dip)
 {
 	struct xfs_scrub	*sc = ri->sc;
 	uint16_t		mode = be16_to_cpu(dip->di_mode);
+	int			error;
 
 	trace_xrep_dinode_mode(sc, dip);
 
 	if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN)
-		return;
+		return 0;
+
+	/* Try to fix the mode.  If we cannot, then leave everything alone. */
+	error = xrep_dinode_find_mode(ri, &mode);
+	switch (error) {
+	case -EINTR:
+	case -EBUSY:
+	case -EDEADLOCK:
+		/* temporary failure or fatal signal */
+		return error;
+	case 0:
+		/* found mode */
+		break;
+	default:
+		/* some other error, assume S_IFREG */
+		mode = S_IFREG;
+		break;
+	}
 
 	/* bad mode, so we set it to a file that only root can read */
-	mode = S_IFREG;
 	dip->di_mode = cpu_to_be16(mode);
 	dip->di_uid = 0;
 	dip->di_gid = 0;
 	ri->zap_acls = true;
+	return 0;
 }
 
 /* Fix any conflicting flags that the verifiers complain about. */
@@ -1107,12 +1321,15 @@ xrep_dinode_core(
 	/* Fix everything the verifier will complain about. */
 	dip = xfs_buf_offset(bp, ri->imap.im_boffset);
 	xrep_dinode_header(sc, dip);
-	xrep_dinode_mode(ri, dip);
+	iget_error = xrep_dinode_mode(ri, dip);
+	if (iget_error)
+		goto write;
 	xrep_dinode_flags(sc, dip, ri->rt_extents > 0);
 	xrep_dinode_size(ri, dip);
 	xrep_dinode_extsize_hints(sc, dip);
 	xrep_dinode_zap_forks(ri, dip);
 
+write:
 	/* Write out the inode. */
 	trace_xrep_dinode_fixed(sc, dip);
 	xfs_dinode_calc_crc(sc->mp, dip);
@@ -1128,7 +1345,8 @@ xrep_dinode_core(
 	 * accessing the inode.  If iget fails, we still need to commit the
 	 * changes.
 	 */
-	iget_error = xchk_iget(sc, ino, &sc->ip);
+	if (!iget_error)
+		iget_error = xchk_iget(sc, ino, &sc->ip);
 	if (!iget_error)
 		xchk_ilock(sc, XFS_IOLOCK_EXCL);
 
@@ -1496,6 +1714,13 @@ xrep_inode(
 		ASSERT(ri != NULL);
 
 		error = xrep_dinode_problems(ri);
+		if (error == -EBUSY) {
+			/*
+			 * Directory scan to recover inode mode encountered a
+			 * busy inode, so we did not continue repairing things.
+			 */
+			return 0;
+		}
 		if (error)
 			return error;
 
diff --git a/fs/xfs/scrub/iscan.c b/fs/xfs/scrub/iscan.c
new file mode 100644
index 000000000000..ec3478bc505e
--- /dev/null
+++ b/fs/xfs/scrub/iscan.c
@@ -0,0 +1,767 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_ag.h"
+#include "xfs_error.h"
+#include "xfs_bit.h"
+#include "xfs_icache.h"
+#include "scrub/scrub.h"
+#include "scrub/iscan.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+
+/*
+ * Live File Scan
+ * ==============
+ *
+ * Live file scans walk every inode in a live filesystem.  This is more or
+ * less like a regular iwalk, except that when we're advancing the scan cursor,
+ * we must ensure that inodes cannot be added or deleted anywhere between the
+ * old cursor value and the new cursor value.  If we're advancing the cursor
+ * by one inode, the caller must hold that inode; if we're finding the next
+ * inode to scan, we must grab the AGI and hold it until we've updated the
+ * scan cursor.
+ *
+ * Callers are expected to use this code to scan all files in the filesystem to
+ * construct a new metadata index of some kind.  The scan races against other
+ * live updates, which means there must be a provision to update the new index
+ * when updates are made to inodes that already been scanned.  The iscan lock
+ * can be used in live update hook code to stop the scan and protect this data
+ * structure.
+ *
+ * To keep the new index up to date with other metadata updates being made to
+ * the live filesystem, it is assumed that the caller will add hooks as needed
+ * to be notified when a metadata update occurs.  The inode scanner must tell
+ * the hook code when an inode has been visited with xchk_iscan_mark_visit.
+ * Hook functions can use xchk_iscan_want_live_update to decide if the
+ * scanner's observations must be updated.
+ */
+
+/*
+ * If the inobt record @rec covers @iscan->skip_ino, mark the inode free so
+ * that the scan ignores that inode.
+ */
+STATIC void
+xchk_iscan_mask_skipino(
+	struct xchk_iscan	*iscan,
+	struct xfs_perag	*pag,
+	struct xfs_inobt_rec_incore	*rec,
+	xfs_agino_t		lastrecino)
+{
+	struct xfs_scrub	*sc = iscan->sc;
+	struct xfs_mount	*mp = sc->mp;
+	xfs_agnumber_t		skip_agno = XFS_INO_TO_AGNO(mp, iscan->skip_ino);
+	xfs_agnumber_t		skip_agino = XFS_INO_TO_AGINO(mp, iscan->skip_ino);
+
+	if (pag->pag_agno != skip_agno)
+		return;
+	if (skip_agino < rec->ir_startino)
+		return;
+	if (skip_agino > lastrecino)
+		return;
+
+	rec->ir_free |= xfs_inobt_maskn(skip_agino - rec->ir_startino, 1);
+}
+
+/*
+ * Set *cursor to the next allocated inode after whatever it's set to now.
+ * If there are no more inodes in this AG, cursor is set to NULLAGINO.
+ */
+STATIC int
+xchk_iscan_find_next(
+	struct xchk_iscan	*iscan,
+	struct xfs_buf		*agi_bp,
+	struct xfs_perag	*pag,
+	xfs_inofree_t		*allocmaskp,
+	xfs_agino_t		*cursor,
+	uint8_t			*nr_inodesp)
+{
+	struct xfs_scrub	*sc = iscan->sc;
+	struct xfs_inobt_rec_incore	rec;
+	struct xfs_btree_cur	*cur;
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_trans	*tp = sc->tp;
+	xfs_agnumber_t		agno = pag->pag_agno;
+	xfs_agino_t		lastino = NULLAGINO;
+	xfs_agino_t		first, last;
+	xfs_agino_t		agino = *cursor;
+	int			has_rec;
+	int			error;
+
+	/* If the cursor is beyond the end of this AG, move to the next one. */
+	xfs_agino_range(mp, agno, &first, &last);
+	if (agino > last) {
+		*cursor = NULLAGINO;
+		return 0;
+	}
+
+	/*
+	 * Look up the inode chunk for the current cursor position.  If there
+	 * is no chunk here, we want the next one.
+	 */
+	cur = xfs_inobt_init_cursor(pag, tp, agi_bp);
+	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &has_rec);
+	if (!error && !has_rec)
+		error = xfs_btree_increment(cur, 0, &has_rec);
+	for (; !error; error = xfs_btree_increment(cur, 0, &has_rec)) {
+		xfs_inofree_t	allocmask;
+
+		/*
+		 * If we've run out of inobt records in this AG, move the
+		 * cursor on to the next AG and exit.  The caller can try
+		 * again with the next AG.
+		 */
+		if (!has_rec) {
+			*cursor = NULLAGINO;
+			break;
+		}
+
+		error = xfs_inobt_get_rec(cur, &rec, &has_rec);
+		if (error)
+			break;
+		if (!has_rec) {
+			error = -EFSCORRUPTED;
+			break;
+		}
+
+		/* Make sure that we always move forward. */
+		if (lastino != NULLAGINO &&
+		    XFS_IS_CORRUPT(mp, lastino >= rec.ir_startino)) {
+			error = -EFSCORRUPTED;
+			break;
+		}
+		lastino = rec.ir_startino + XFS_INODES_PER_CHUNK - 1;
+
+		/*
+		 * If this record only covers inodes that come before the
+		 * cursor, advance to the next record.
+		 */
+		if (rec.ir_startino + XFS_INODES_PER_CHUNK <= agino)
+			continue;
+
+		if (iscan->skip_ino)
+			xchk_iscan_mask_skipino(iscan, pag, &rec, lastino);
+
+		/*
+		 * If the incoming lookup put us in the middle of an inobt
+		 * record, mark it and the previous inodes "free" so that the
+		 * search for allocated inodes will start at the cursor.
+		 * We don't care about ir_freecount here.
+		 */
+		if (agino >= rec.ir_startino)
+			rec.ir_free |= xfs_inobt_maskn(0,
+						agino + 1 - rec.ir_startino);
+
+		/*
+		 * If there are allocated inodes in this chunk, find them
+		 * and update the scan cursor.
+		 */
+		allocmask = ~rec.ir_free;
+		if (hweight64(allocmask) > 0) {
+			int	next = xfs_lowbit64(allocmask);
+
+			ASSERT(next >= 0);
+			*cursor = rec.ir_startino + next;
+			*allocmaskp = allocmask >> next;
+			*nr_inodesp = XFS_INODES_PER_CHUNK - next;
+			break;
+		}
+	}
+
+	xfs_btree_del_cursor(cur, error);
+	return error;
+}
+
+/*
+ * Advance both the scan and the visited cursors.
+ *
+ * The inumber address space for a given filesystem is sparse, which means that
+ * the scan cursor can jump a long ways in a single iter() call.  There are no
+ * inodes in these sparse areas, so we must move the visited cursor forward at
+ * the same time so that the scan user can receive live updates for inodes that
+ * may get created once we release the AGI buffer.
+ */
+static inline void
+xchk_iscan_move_cursor(
+	struct xchk_iscan	*iscan,
+	xfs_agnumber_t		agno,
+	xfs_agino_t		agino)
+{
+	struct xfs_scrub	*sc = iscan->sc;
+	struct xfs_mount	*mp = sc->mp;
+	xfs_ino_t		cursor, visited;
+
+	BUILD_BUG_ON(XFS_MAXINUMBER == NULLFSINO);
+
+	/*
+	 * Special-case ino == 0 here so that we never set visited_ino to
+	 * NULLFSINO when wrapping around EOFS, for that will let through all
+	 * live updates.
+	 */
+	cursor = XFS_AGINO_TO_INO(mp, agno, agino);
+	if (cursor == 0)
+		visited = XFS_MAXINUMBER;
+	else
+		visited = cursor - 1;
+
+	mutex_lock(&iscan->lock);
+	iscan->cursor_ino = cursor;
+	iscan->__visited_ino = visited;
+	trace_xchk_iscan_move_cursor(iscan);
+	mutex_unlock(&iscan->lock);
+}
+
+/*
+ * Prepare to return agno/agino to the iscan caller by moving the lastino
+ * cursor to the previous inode.  Do this while we still hold the AGI so that
+ * no other threads can create or delete inodes in this AG.
+ */
+static inline void
+xchk_iscan_finish(
+	struct xchk_iscan	*iscan)
+{
+	mutex_lock(&iscan->lock);
+	iscan->cursor_ino = NULLFSINO;
+
+	/* All live updates will be applied from now on */
+	iscan->__visited_ino = NULLFSINO;
+
+	mutex_unlock(&iscan->lock);
+}
+
+/*
+ * Advance ino to the next inode that the inobt thinks is allocated, being
+ * careful to jump to the next AG if we've reached the right end of this AG's
+ * inode btree.  Advancing ino effectively means that we've pushed the inode
+ * scan forward, so set the iscan cursor to (ino - 1) so that our live update
+ * predicates will track inode allocations in that part of the inode number
+ * key space once we release the AGI buffer.
+ *
+ * Returns 1 if there's a new inode to examine, 0 if we've run out of inodes,
+ * -ECANCELED if the live scan aborted, or the usual negative errno.
+ */
+STATIC int
+xchk_iscan_advance(
+	struct xchk_iscan	*iscan,
+	struct xfs_perag	**pagp,
+	struct xfs_buf		**agi_bpp,
+	xfs_inofree_t		*allocmaskp,
+	uint8_t			*nr_inodesp)
+{
+	struct xfs_scrub	*sc = iscan->sc;
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_buf		*agi_bp;
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		agno;
+	xfs_agino_t		agino;
+	int			ret;
+
+	ASSERT(iscan->cursor_ino >= iscan->__visited_ino);
+
+	do {
+		if (xchk_iscan_aborted(iscan))
+			return -ECANCELED;
+
+		agno = XFS_INO_TO_AGNO(mp, iscan->cursor_ino);
+		pag = xfs_perag_get(mp, agno);
+		if (!pag)
+			return -ECANCELED;
+
+		ret = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp);
+		if (ret)
+			goto out_pag;
+
+		agino = XFS_INO_TO_AGINO(mp, iscan->cursor_ino);
+		ret = xchk_iscan_find_next(iscan, agi_bp, pag, allocmaskp,
+				&agino, nr_inodesp);
+		if (ret)
+			goto out_buf;
+
+		if (agino != NULLAGINO) {
+			/*
+			 * Found the next inode in this AG, so return it along
+			 * with the AGI buffer and the perag structure to
+			 * ensure it cannot go away.
+			 */
+			xchk_iscan_move_cursor(iscan, agno, agino);
+			*agi_bpp = agi_bp;
+			*pagp = pag;
+			return 1;
+		}
+
+		/*
+		 * Did not find any more inodes in this AG, move on to the next
+		 * AG.
+		 */
+		agno = (agno + 1) % mp->m_sb.sb_agcount;
+		xchk_iscan_move_cursor(iscan, agno, 0);
+		xfs_trans_brelse(sc->tp, agi_bp);
+		xfs_perag_put(pag);
+
+		trace_xchk_iscan_advance_ag(iscan);
+	} while (iscan->cursor_ino != iscan->scan_start_ino);
+
+	xchk_iscan_finish(iscan);
+	return 0;
+
+out_buf:
+	xfs_trans_brelse(sc->tp, agi_bp);
+out_pag:
+	xfs_perag_put(pag);
+	return ret;
+}
+
+/*
+ * Grabbing the inode failed, so we need to back up the scan and ask the caller
+ * to try to _advance the scan again.  Returns -EBUSY if we've run out of retry
+ * opportunities, -ECANCELED if the process has a fatal signal pending, or
+ * -EAGAIN if we should try again.
+ */
+STATIC int
+xchk_iscan_iget_retry(
+	struct xchk_iscan	*iscan,
+	bool			wait)
+{
+	ASSERT(iscan->cursor_ino == iscan->__visited_ino + 1);
+
+	if (!iscan->iget_timeout ||
+	    time_is_before_jiffies(iscan->__iget_deadline))
+		return -EBUSY;
+
+	if (wait) {
+		unsigned long	relax;
+
+		/*
+		 * Sleep for a period of time to let the rest of the system
+		 * catch up.  If we return early, someone sent a kill signal to
+		 * the calling process.
+		 */
+		relax = msecs_to_jiffies(iscan->iget_retry_delay);
+		trace_xchk_iscan_iget_retry_wait(iscan);
+
+		if (schedule_timeout_killable(relax) ||
+		    xchk_iscan_aborted(iscan))
+			return -ECANCELED;
+	}
+
+	iscan->cursor_ino--;
+	return -EAGAIN;
+}
+
+/*
+ * Grab an inode as part of an inode scan.  While scanning this inode, the
+ * caller must ensure that no other threads can modify the inode until a call
+ * to xchk_iscan_visit succeeds.
+ *
+ * Returns the number of incore inodes grabbed; -EAGAIN if the caller should
+ * call again xchk_iscan_advance; -EBUSY if we couldn't grab an inode;
+ * -ECANCELED if there's a fatal signal pending; or some other negative errno.
+ */
+STATIC int
+xchk_iscan_iget(
+	struct xchk_iscan	*iscan,
+	struct xfs_perag	*pag,
+	struct xfs_buf		*agi_bp,
+	xfs_inofree_t		allocmask,
+	uint8_t			nr_inodes)
+{
+	struct xfs_scrub	*sc = iscan->sc;
+	struct xfs_mount	*mp = sc->mp;
+	xfs_ino_t		ino = iscan->cursor_ino;
+	unsigned int		idx = 0;
+	unsigned int		i;
+	int			error;
+
+	ASSERT(iscan->__inodes[0] == NULL);
+
+	/* Fill the first slot in the inode array. */
+	error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0,
+			&iscan->__inodes[idx]);
+
+	trace_xchk_iscan_iget(iscan, error);
+
+	if (error == -ENOENT || error == -EAGAIN) {
+		xfs_trans_brelse(sc->tp, agi_bp);
+		xfs_perag_put(pag);
+
+		/*
+		 * It's possible that this inode has lost all of its links but
+		 * hasn't yet been inactivated.  If we don't have a transaction
+		 * or it's not writable, flush the inodegc workers and wait.
+		 */
+		xfs_inodegc_flush(mp);
+		return xchk_iscan_iget_retry(iscan, true);
+	}
+
+	if (error == -EINVAL) {
+		xfs_trans_brelse(sc->tp, agi_bp);
+		xfs_perag_put(pag);
+
+		/*
+		 * We thought the inode was allocated, but the inode btree
+		 * lookup failed, which means that it was freed since the last
+		 * time we advanced the cursor.  Back up and try again.  This
+		 * should never happen since still hold the AGI buffer from the
+		 * inobt check, but we need to be careful about infinite loops.
+		 */
+		return xchk_iscan_iget_retry(iscan, false);
+	}
+
+	if (error) {
+		xfs_trans_brelse(sc->tp, agi_bp);
+		xfs_perag_put(pag);
+		return error;
+	}
+	idx++;
+	ino++;
+	allocmask >>= 1;
+
+	/*
+	 * Now that we've filled the first slot in __inodes, try to fill the
+	 * rest of the batch with consecutively ordered inodes.  to reduce the
+	 * number of _iter calls.  Make a bitmap of unallocated inodes from the
+	 * zeroes in the inuse bitmap; these inodes will not be scanned, but
+	 * the _want_live_update predicate will pass through all live updates.
+	 *
+	 * If we can't iget an allocated inode, stop and return what we have.
+	 */
+	mutex_lock(&iscan->lock);
+	iscan->__batch_ino = ino - 1;
+	iscan->__skipped_inomask = 0;
+	mutex_unlock(&iscan->lock);
+
+	for (i = 1; i < nr_inodes; i++, ino++, allocmask >>= 1) {
+		if (!(allocmask & 1)) {
+			ASSERT(!(iscan->__skipped_inomask & (1ULL << i)));
+
+			mutex_lock(&iscan->lock);
+			iscan->cursor_ino = ino;
+			iscan->__skipped_inomask |= (1ULL << i);
+			mutex_unlock(&iscan->lock);
+			continue;
+		}
+
+		ASSERT(iscan->__inodes[idx] == NULL);
+
+		error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0,
+				&iscan->__inodes[idx]);
+		if (error)
+			break;
+
+		mutex_lock(&iscan->lock);
+		iscan->cursor_ino = ino;
+		mutex_unlock(&iscan->lock);
+		idx++;
+	}
+
+	trace_xchk_iscan_iget_batch(sc->mp, iscan, nr_inodes, idx);
+	xfs_trans_brelse(sc->tp, agi_bp);
+	xfs_perag_put(pag);
+	return idx;
+}
+
+/*
+ * Advance the visit cursor to reflect skipped inodes beyond whatever we
+ * scanned.
+ */
+STATIC void
+xchk_iscan_finish_batch(
+	struct xchk_iscan	*iscan)
+{
+	xfs_ino_t		highest_skipped;
+
+	mutex_lock(&iscan->lock);
+
+	if (iscan->__batch_ino != NULLFSINO) {
+		highest_skipped = iscan->__batch_ino +
+					xfs_highbit64(iscan->__skipped_inomask);
+		iscan->__visited_ino = max(iscan->__visited_ino,
+					   highest_skipped);
+
+		trace_xchk_iscan_skip(iscan);
+	}
+
+	iscan->__batch_ino = NULLFSINO;
+	iscan->__skipped_inomask = 0;
+
+	mutex_unlock(&iscan->lock);
+}
+
+/*
+ * Advance the inode scan cursor to the next allocated inode and return up to
+ * 64 consecutive allocated inodes starting with the cursor position.
+ */
+STATIC int
+xchk_iscan_iter_batch(
+	struct xchk_iscan	*iscan)
+{
+	struct xfs_scrub	*sc = iscan->sc;
+	int			ret;
+
+	xchk_iscan_finish_batch(iscan);
+
+	if (iscan->iget_timeout)
+		iscan->__iget_deadline = jiffies +
+					 msecs_to_jiffies(iscan->iget_timeout);
+
+	do {
+		struct xfs_buf	*agi_bp = NULL;
+		struct xfs_perag *pag = NULL;
+		xfs_inofree_t	allocmask = 0;
+		uint8_t		nr_inodes = 0;
+
+		ret = xchk_iscan_advance(iscan, &pag, &agi_bp, &allocmask,
+				&nr_inodes);
+		if (ret != 1)
+			return ret;
+
+		if (xchk_iscan_aborted(iscan)) {
+			xfs_trans_brelse(sc->tp, agi_bp);
+			xfs_perag_put(pag);
+			ret = -ECANCELED;
+			break;
+		}
+
+		ret = xchk_iscan_iget(iscan, pag, agi_bp, allocmask, nr_inodes);
+	} while (ret == -EAGAIN);
+
+	return ret;
+}
+
+/*
+ * Advance the inode scan cursor to the next allocated inode and return the
+ * incore inode structure associated with it.
+ *
+ * Returns 1 if there's a new inode to examine, 0 if we've run out of inodes,
+ * -ECANCELED if the live scan aborted, -EBUSY if the incore inode could not be
+ * grabbed, or the usual negative errno.
+ *
+ * If the function returns -EBUSY and the caller can handle skipping an inode,
+ * it may call this function again to continue the scan with the next allocated
+ * inode.
+ */
+int
+xchk_iscan_iter(
+	struct xchk_iscan	*iscan,
+	struct xfs_inode	**ipp)
+{
+	unsigned int		i;
+	int			error;
+
+	/* Find a cached inode, or go get another batch. */
+	for (i = 0; i < XFS_INODES_PER_CHUNK; i++) {
+		if (iscan->__inodes[i])
+			goto foundit;
+	}
+
+	error = xchk_iscan_iter_batch(iscan);
+	if (error <= 0)
+		return error;
+
+	ASSERT(iscan->__inodes[0] != NULL);
+	i = 0;
+
+foundit:
+	/* Give the caller our reference. */
+	*ipp = iscan->__inodes[i];
+	iscan->__inodes[i] = NULL;
+	return 1;
+}
+
+/* Clean up an xfs_iscan_iter call by dropping any inodes that we still hold. */
+void
+xchk_iscan_iter_finish(
+	struct xchk_iscan	*iscan)
+{
+	struct xfs_scrub	*sc = iscan->sc;
+	unsigned int		i;
+
+	for (i = 0; i < XFS_INODES_PER_CHUNK; i++) {
+		if (iscan->__inodes[i]) {
+			xchk_irele(sc, iscan->__inodes[i]);
+			iscan->__inodes[i] = NULL;
+		}
+	}
+}
+
+/* Mark this inode scan finished and release resources. */
+void
+xchk_iscan_teardown(
+	struct xchk_iscan	*iscan)
+{
+	xchk_iscan_iter_finish(iscan);
+	xchk_iscan_finish(iscan);
+	mutex_destroy(&iscan->lock);
+}
+
+/* Pick an AG from which to start a scan. */
+static inline xfs_ino_t
+xchk_iscan_rotor(
+	struct xfs_mount	*mp)
+{
+	static atomic_t		agi_rotor;
+	unsigned int		r = atomic_inc_return(&agi_rotor) - 1;
+
+	/*
+	 * Rotoring *backwards* through the AGs, so we add one here before
+	 * subtracting from the agcount to arrive at an AG number.
+	 */
+	r = (r % mp->m_sb.sb_agcount) + 1;
+
+	return XFS_AGINO_TO_INO(mp, mp->m_sb.sb_agcount - r, 0);
+}
+
+/*
+ * Set ourselves up to start an inode scan.  If the @iget_timeout and
+ * @iget_retry_delay parameters are set, the scan will try to iget each inode
+ * for @iget_timeout milliseconds.  If an iget call indicates that the inode is
+ * waiting to be inactivated, the CPU will relax for @iget_retry_delay
+ * milliseconds after pushing the inactivation workers.
+ */
+void
+xchk_iscan_start(
+	struct xfs_scrub	*sc,
+	unsigned int		iget_timeout,
+	unsigned int		iget_retry_delay,
+	struct xchk_iscan	*iscan)
+{
+	xfs_ino_t		start_ino;
+
+	start_ino = xchk_iscan_rotor(sc->mp);
+
+	iscan->__batch_ino = NULLFSINO;
+	iscan->__skipped_inomask = 0;
+
+	iscan->sc = sc;
+	clear_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate);
+	iscan->iget_timeout = iget_timeout;
+	iscan->iget_retry_delay = iget_retry_delay;
+	iscan->__visited_ino = start_ino;
+	iscan->cursor_ino = start_ino;
+	iscan->scan_start_ino = start_ino;
+	mutex_init(&iscan->lock);
+	memset(iscan->__inodes, 0, sizeof(iscan->__inodes));
+
+	trace_xchk_iscan_start(iscan, start_ino);
+}
+
+/*
+ * Mark this inode as having been visited.  Callers must hold a sufficiently
+ * exclusive lock on the inode to prevent concurrent modifications.
+ */
+void
+xchk_iscan_mark_visited(
+	struct xchk_iscan	*iscan,
+	struct xfs_inode	*ip)
+{
+	mutex_lock(&iscan->lock);
+	iscan->__visited_ino = ip->i_ino;
+	trace_xchk_iscan_visit(iscan);
+	mutex_unlock(&iscan->lock);
+}
+
+/*
+ * Did we skip this inode because it wasn't allocated when we loaded the batch?
+ * If so, it is newly allocated and will not be scanned.  All live updates to
+ * this inode must be passed to the caller to maintain scan correctness.
+ */
+static inline bool
+xchk_iscan_skipped(
+	const struct xchk_iscan	*iscan,
+	xfs_ino_t		ino)
+{
+	if (iscan->__batch_ino == NULLFSINO)
+		return false;
+	if (ino < iscan->__batch_ino)
+		return false;
+	if (ino >= iscan->__batch_ino + XFS_INODES_PER_CHUNK)
+		return false;
+
+	return iscan->__skipped_inomask & (1ULL << (ino - iscan->__batch_ino));
+}
+
+/*
+ * Do we need a live update for this inode?  This is true if the scanner thread
+ * has visited this inode and the scan hasn't been aborted due to errors.
+ * Callers must hold a sufficiently exclusive lock on the inode to prevent
+ * scanners from reading any inode metadata.
+ */
+bool
+xchk_iscan_want_live_update(
+	struct xchk_iscan	*iscan,
+	xfs_ino_t		ino)
+{
+	bool			ret = false;
+
+	if (xchk_iscan_aborted(iscan))
+		return false;
+
+	mutex_lock(&iscan->lock);
+
+	trace_xchk_iscan_want_live_update(iscan, ino);
+
+	/* Scan is finished, caller should receive all updates. */
+	if (iscan->__visited_ino == NULLFSINO) {
+		ret = true;
+		goto unlock;
+	}
+
+	/*
+	 * No inodes have been visited yet, so the visited cursor points at the
+	 * start of the scan range.  The caller should not receive any updates.
+	 */
+	if (iscan->scan_start_ino == iscan->__visited_ino) {
+		ret = false;
+		goto unlock;
+	}
+
+	/*
+	 * This inode was not allocated at the time of the iscan batch.
+	 * The caller should receive all updates.
+	 */
+	if (xchk_iscan_skipped(iscan, ino)) {
+		ret = true;
+		goto unlock;
+	}
+
+	/*
+	 * The visited cursor hasn't yet wrapped around the end of the FS.  If
+	 * @ino is inside the starred range, the caller should receive updates:
+	 *
+	 * 0 ------------ S ************ V ------------ EOFS
+	 */
+	if (iscan->scan_start_ino <= iscan->__visited_ino) {
+		if (ino >= iscan->scan_start_ino &&
+		    ino <= iscan->__visited_ino)
+			ret = true;
+
+		goto unlock;
+	}
+
+	/*
+	 * The visited cursor wrapped around the end of the FS.  If @ino is
+	 * inside the starred range, the caller should receive updates:
+	 *
+	 * 0 ************ V ------------ S ************ EOFS
+	 */
+	if (ino >= iscan->scan_start_ino || ino <= iscan->__visited_ino)
+		ret = true;
+
+unlock:
+	mutex_unlock(&iscan->lock);
+	return ret;
+}
diff --git a/fs/xfs/scrub/iscan.h b/fs/xfs/scrub/iscan.h
new file mode 100644
index 000000000000..71f657552dfa
--- /dev/null
+++ b/fs/xfs/scrub/iscan.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_ISCAN_H__
+#define __XFS_SCRUB_ISCAN_H__
+
+struct xchk_iscan {
+	struct xfs_scrub	*sc;
+
+	/* Lock to protect the scan cursor. */
+	struct mutex		lock;
+
+	/*
+	 * This is the first inode in the inumber address space that we
+	 * examined.  When the scan wraps around back to here, the scan is
+	 * finished.
+	 */
+	xfs_ino_t		scan_start_ino;
+
+	/* This is the inode that will be examined next. */
+	xfs_ino_t		cursor_ino;
+
+	/* If nonzero and non-NULL, skip this inode when scanning. */
+	xfs_ino_t		skip_ino;
+
+	/*
+	 * This is the last inode that we've successfully scanned, either
+	 * because the caller scanned it, or we moved the cursor past an empty
+	 * part of the inode address space.  Scan callers should only use the
+	 * xchk_iscan_visit function to modify this.
+	 */
+	xfs_ino_t		__visited_ino;
+
+	/* Operational state of the livescan. */
+	unsigned long		__opstate;
+
+	/* Give up on iterating @cursor_ino if we can't iget it by this time. */
+	unsigned long		__iget_deadline;
+
+	/* Amount of time (in ms) that we will try to iget an inode. */
+	unsigned int		iget_timeout;
+
+	/* Wait this many ms to retry an iget. */
+	unsigned int		iget_retry_delay;
+
+	/*
+	 * The scan grabs batches of inodes and stashes them here before
+	 * handing them out with _iter.  Unallocated inodes are set in the
+	 * mask so that all updates to that inode are selected for live
+	 * update propagation.
+	 */
+	xfs_ino_t		__batch_ino;
+	xfs_inofree_t		__skipped_inomask;
+	struct xfs_inode	*__inodes[XFS_INODES_PER_CHUNK];
+};
+
+/* Set if the scan has been aborted due to some event in the fs. */
+#define XCHK_ISCAN_OPSTATE_ABORTED	(1)
+
+static inline bool
+xchk_iscan_aborted(const struct xchk_iscan *iscan)
+{
+	return test_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate);
+}
+
+static inline void
+xchk_iscan_abort(struct xchk_iscan *iscan)
+{
+	set_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate);
+}
+
+void xchk_iscan_start(struct xfs_scrub *sc, unsigned int iget_timeout,
+		unsigned int iget_retry_delay, struct xchk_iscan *iscan);
+void xchk_iscan_teardown(struct xchk_iscan *iscan);
+
+int xchk_iscan_iter(struct xchk_iscan *iscan, struct xfs_inode **ipp);
+void xchk_iscan_iter_finish(struct xchk_iscan *iscan);
+
+void xchk_iscan_mark_visited(struct xchk_iscan *iscan, struct xfs_inode *ip);
+bool xchk_iscan_want_live_update(struct xchk_iscan *iscan, xfs_ino_t ino);
+
+#endif /* __XFS_SCRUB_ISCAN_H__ */
diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c
index bb6d980b4fcd..4a0271123d94 100644
--- a/fs/xfs/scrub/newbt.c
+++ b/fs/xfs/scrub/newbt.c
@@ -239,7 +239,11 @@ xrep_newbt_alloc_ag_blocks(
 
 		xrep_newbt_validate_ag_alloc_hint(xnr);
 
-		error = xfs_alloc_vextent_near_bno(&args, xnr->alloc_hint);
+		if (xnr->alloc_vextent)
+			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
+		else
+			error = xfs_alloc_vextent_near_bno(&args,
+					xnr->alloc_hint);
 		if (error)
 			return error;
 		if (args.fsbno == NULLFSBLOCK)
@@ -309,7 +313,11 @@ xrep_newbt_alloc_file_blocks(
 
 		xrep_newbt_validate_file_alloc_hint(xnr);
 
-		error = xfs_alloc_vextent_start_ag(&args, xnr->alloc_hint);
+		if (xnr->alloc_vextent)
+			error = xnr->alloc_vextent(sc, &args, xnr->alloc_hint);
+		else
+			error = xfs_alloc_vextent_start_ag(&args,
+					xnr->alloc_hint);
 		if (error)
 			return error;
 		if (args.fsbno == NULLFSBLOCK)
@@ -535,7 +543,7 @@ xrep_newbt_claim_block(
 	trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1,
 			xnr->oinfo.oi_owner);
 
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
 		ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno,
 								agbno));
 	else
diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h
index 89f8e3970b1f..3d804d31af24 100644
--- a/fs/xfs/scrub/newbt.h
+++ b/fs/xfs/scrub/newbt.h
@@ -6,6 +6,8 @@
 #ifndef __XFS_SCRUB_NEWBT_H__
 #define __XFS_SCRUB_NEWBT_H__
 
+struct xfs_alloc_arg;
+
 struct xrep_newbt_resv {
 	/* Link to list of extents that we've reserved. */
 	struct list_head	list;
@@ -28,6 +30,11 @@ struct xrep_newbt_resv {
 struct xrep_newbt {
 	struct xfs_scrub	*sc;
 
+	/* Custom allocation function, or NULL for xfs_alloc_vextent */
+	int			(*alloc_vextent)(struct xfs_scrub *sc,
+						 struct xfs_alloc_arg *args,
+						 xfs_fsblock_t alloc_hint);
+
 	/* List of extents that we've reserved. */
 	struct list_head	resv_list;
 
diff --git a/fs/xfs/scrub/nlinks.c b/fs/xfs/scrub/nlinks.c
new file mode 100644
index 000000000000..8a7d9557897c
--- /dev/null
+++ b/fs/xfs/scrub/nlinks.c
@@ -0,0 +1,930 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_iwalk.h"
+#include "xfs_ialloc.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_ag.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/nlinks.h"
+#include "scrub/trace.h"
+#include "scrub/readdir.h"
+
+/*
+ * Live Inode Link Count Checking
+ * ==============================
+ *
+ * Inode link counts are "summary" metadata, in the sense that they are
+ * computed as the number of directory entries referencing each file on the
+ * filesystem.  Therefore, we compute the correct link counts by creating a
+ * shadow link count structure and walking every inode.
+ */
+
+/* Set us up to scrub inode link counts. */
+int
+xchk_setup_nlinks(
+	struct xfs_scrub	*sc)
+{
+	xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
+
+	sc->buf = kzalloc(sizeof(struct xchk_nlink_ctrs), XCHK_GFP_FLAGS);
+	if (!sc->buf)
+		return -ENOMEM;
+
+	return xchk_setup_fs(sc);
+}
+
+/*
+ * Part 1: Collecting file link counts.  For each file, we create a shadow link
+ * counting structure, then walk the entire directory tree, incrementing parent
+ * and child link counts for each directory entry seen.
+ *
+ * To avoid false corruption reports in part 2, any failure in this part must
+ * set the INCOMPLETE flag even when a negative errno is returned.  This care
+ * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
+ * ECANCELED) that are absorbed into a scrub state flag update by
+ * xchk_*_process_error.  Scrub and repair share the same incore data
+ * structures, so the INCOMPLETE flag is critical to prevent a repair based on
+ * insufficient information.
+ *
+ * Because we are scanning a live filesystem, it's possible that another thread
+ * will try to update the link counts for an inode that we've already scanned.
+ * This will cause our counts to be incorrect.  Therefore, we hook all
+ * directory entry updates because that is when link count updates occur.  By
+ * shadowing transaction updates in this manner, live nlink check can ensure by
+ * locking the inode and the shadow structure that its own copies are not out
+ * of date.  Because the hook code runs in a different process context from the
+ * scrub code and the scrub state flags are not accessed atomically, failures
+ * in the hook code must abort the iscan and the scrubber must notice the
+ * aborted scan and set the incomplete flag.
+ *
+ * Note that we use jump labels and srcu notifier hooks to minimize the
+ * overhead when live nlinks is /not/ running.  Locking order for nlink
+ * observations is inode ILOCK -> iscan_lock/xchk_nlink_ctrs lock.
+ */
+
+/*
+ * Add a delta to an nlink counter, clamping the value to U32_MAX.  Because
+ * XFS_MAXLINK < U32_MAX, the checking code will produce the correct results
+ * even if we lose some precision.
+ */
+static inline void
+careful_add(
+	xfs_nlink_t	*nlinkp,
+	int		delta)
+{
+	uint64_t	new_value = (uint64_t)(*nlinkp) + delta;
+
+	BUILD_BUG_ON(XFS_MAXLINK > U32_MAX);
+	*nlinkp = min_t(uint64_t, new_value, U32_MAX);
+}
+
+/* Update incore link count information.  Caller must hold the nlinks lock. */
+STATIC int
+xchk_nlinks_update_incore(
+	struct xchk_nlink_ctrs	*xnc,
+	xfs_ino_t		ino,
+	int			parents_delta,
+	int			backrefs_delta,
+	int			children_delta)
+{
+	struct xchk_nlink	nl;
+	int			error;
+
+	if (!xnc->nlinks)
+		return 0;
+
+	error = xfarray_load_sparse(xnc->nlinks, ino, &nl);
+	if (error)
+		return error;
+
+	trace_xchk_nlinks_update_incore(xnc->sc->mp, ino, &nl, parents_delta,
+			backrefs_delta, children_delta);
+
+	careful_add(&nl.parents, parents_delta);
+	careful_add(&nl.backrefs, backrefs_delta);
+	careful_add(&nl.children, children_delta);
+
+	nl.flags |= XCHK_NLINK_WRITTEN;
+	error = xfarray_store(xnc->nlinks, ino, &nl);
+	if (error == -EFBIG) {
+		/*
+		 * EFBIG means we tried to store data at too high a byte offset
+		 * in the sparse array.  IOWs, we cannot complete the check and
+		 * must notify userspace that the check was incomplete.
+		 */
+		error = -ECANCELED;
+	}
+	return error;
+}
+
+/*
+ * Apply a link count change from the regular filesystem into our shadow link
+ * count structure based on a directory update in progress.
+ */
+STATIC int
+xchk_nlinks_live_update(
+	struct notifier_block		*nb,
+	unsigned long			action,
+	void				*data)
+{
+	struct xfs_dir_update_params	*p = data;
+	struct xchk_nlink_ctrs		*xnc;
+	int				error;
+
+	xnc = container_of(nb, struct xchk_nlink_ctrs, dhook.dirent_hook.nb);
+
+	trace_xchk_nlinks_live_update(xnc->sc->mp, p->dp, action, p->ip->i_ino,
+			p->delta, p->name->name, p->name->len);
+
+	/*
+	 * If we've already scanned @dp, update the number of parents that link
+	 * to @ip.  If @ip is a subdirectory, update the number of child links
+	 * going out of @dp.
+	 */
+	if (xchk_iscan_want_live_update(&xnc->collect_iscan, p->dp->i_ino)) {
+		mutex_lock(&xnc->lock);
+		error = xchk_nlinks_update_incore(xnc, p->ip->i_ino, p->delta,
+				0, 0);
+		if (!error && S_ISDIR(VFS_IC(p->ip)->i_mode))
+			error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0,
+					0, p->delta);
+		mutex_unlock(&xnc->lock);
+		if (error)
+			goto out_abort;
+	}
+
+	/*
+	 * If @ip is a subdirectory and we've already scanned it, update the
+	 * number of backrefs pointing to @dp.
+	 */
+	if (S_ISDIR(VFS_IC(p->ip)->i_mode) &&
+	    xchk_iscan_want_live_update(&xnc->collect_iscan, p->ip->i_ino)) {
+		mutex_lock(&xnc->lock);
+		error = xchk_nlinks_update_incore(xnc, p->dp->i_ino, 0,
+				p->delta, 0);
+		mutex_unlock(&xnc->lock);
+		if (error)
+			goto out_abort;
+	}
+
+	return NOTIFY_DONE;
+
+out_abort:
+	xchk_iscan_abort(&xnc->collect_iscan);
+	return NOTIFY_DONE;
+}
+
+/* Bump the observed link count for the inode referenced by this entry. */
+STATIC int
+xchk_nlinks_collect_dirent(
+	struct xfs_scrub	*sc,
+	struct xfs_inode	*dp,
+	xfs_dir2_dataptr_t	dapos,
+	const struct xfs_name	*name,
+	xfs_ino_t		ino,
+	void			*priv)
+{
+	struct xchk_nlink_ctrs	*xnc = priv;
+	bool			dot = false, dotdot = false;
+	int			error;
+
+	/* Does this name make sense? */
+	if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len)) {
+		error = -ECANCELED;
+		goto out_abort;
+	}
+
+	if (name->len == 1 && name->name[0] == '.')
+		dot = true;
+	else if (name->len == 2 && name->name[0] == '.' &&
+				   name->name[1] == '.')
+		dotdot = true;
+
+	/* Don't accept a '.' entry that points somewhere else. */
+	if (dot && ino != dp->i_ino) {
+		error = -ECANCELED;
+		goto out_abort;
+	}
+
+	/* Don't accept an invalid inode number. */
+	if (!xfs_verify_dir_ino(sc->mp, ino)) {
+		error = -ECANCELED;
+		goto out_abort;
+	}
+
+	/* Update the shadow link counts if we haven't already failed. */
+
+	if (xchk_iscan_aborted(&xnc->collect_iscan)) {
+		error = -ECANCELED;
+		goto out_incomplete;
+	}
+
+	trace_xchk_nlinks_collect_dirent(sc->mp, dp, ino, name);
+
+	mutex_lock(&xnc->lock);
+
+	/*
+	 * If this is a dotdot entry, it is a back link from dp to ino.  How
+	 * we handle this depends on whether or not dp is the root directory.
+	 *
+	 * The root directory is its own parent, so we pretend the dotdot entry
+	 * establishes the "parent" of the root directory.  Increment the
+	 * number of parents of the root directory.
+	 *
+	 * Otherwise, increment the number of backrefs pointing back to ino.
+	 */
+	if (dotdot) {
+		if (dp == sc->mp->m_rootip)
+			error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
+		else
+			error = xchk_nlinks_update_incore(xnc, ino, 0, 1, 0);
+		if (error)
+			goto out_unlock;
+	}
+
+	/*
+	 * If this dirent is a forward link from dp to ino, increment the
+	 * number of parents linking into ino.
+	 */
+	if (!dot && !dotdot) {
+		error = xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
+		if (error)
+			goto out_unlock;
+	}
+
+	/*
+	 * If this dirent is a forward link to a subdirectory, increment the
+	 * number of child links of dp.
+	 */
+	if (!dot && !dotdot && name->type == XFS_DIR3_FT_DIR) {
+		error = xchk_nlinks_update_incore(xnc, dp->i_ino, 0, 0, 1);
+		if (error)
+			goto out_unlock;
+	}
+
+	mutex_unlock(&xnc->lock);
+	return 0;
+
+out_unlock:
+	mutex_unlock(&xnc->lock);
+out_abort:
+	xchk_iscan_abort(&xnc->collect_iscan);
+out_incomplete:
+	xchk_set_incomplete(sc);
+	return error;
+}
+
+/* Walk a directory to bump the observed link counts of the children. */
+STATIC int
+xchk_nlinks_collect_dir(
+	struct xchk_nlink_ctrs	*xnc,
+	struct xfs_inode	*dp)
+{
+	struct xfs_scrub	*sc = xnc->sc;
+	unsigned int		lock_mode;
+	int			error = 0;
+
+	/* Prevent anyone from changing this directory while we walk it. */
+	xfs_ilock(dp, XFS_IOLOCK_SHARED);
+	lock_mode = xfs_ilock_data_map_shared(dp);
+
+	/*
+	 * The dotdot entry of an unlinked directory still points to the last
+	 * parent, but the parent no longer links to this directory.  Skip the
+	 * directory to avoid overcounting.
+	 */
+	if (VFS_I(dp)->i_nlink == 0)
+		goto out_unlock;
+
+	/*
+	 * We cannot count file links if the directory looks as though it has
+	 * been zapped by the inode record repair code.
+	 */
+	if (xchk_dir_looks_zapped(dp)) {
+		error = -EBUSY;
+		goto out_abort;
+	}
+
+	error = xchk_dir_walk(sc, dp, xchk_nlinks_collect_dirent, xnc);
+	if (error == -ECANCELED) {
+		error = 0;
+		goto out_unlock;
+	}
+	if (error)
+		goto out_abort;
+
+	xchk_iscan_mark_visited(&xnc->collect_iscan, dp);
+	goto out_unlock;
+
+out_abort:
+	xchk_set_incomplete(sc);
+	xchk_iscan_abort(&xnc->collect_iscan);
+out_unlock:
+	xfs_iunlock(dp, lock_mode);
+	xfs_iunlock(dp, XFS_IOLOCK_SHARED);
+	return error;
+}
+
+/* If this looks like a valid pointer, count it. */
+static inline int
+xchk_nlinks_collect_metafile(
+	struct xchk_nlink_ctrs	*xnc,
+	xfs_ino_t		ino)
+{
+	if (!xfs_verify_ino(xnc->sc->mp, ino))
+		return 0;
+
+	trace_xchk_nlinks_collect_metafile(xnc->sc->mp, ino);
+	return xchk_nlinks_update_incore(xnc, ino, 1, 0, 0);
+}
+
+/* Bump the link counts of metadata files rooted in the superblock. */
+STATIC int
+xchk_nlinks_collect_metafiles(
+	struct xchk_nlink_ctrs	*xnc)
+{
+	struct xfs_mount	*mp = xnc->sc->mp;
+	int			error = -ECANCELED;
+
+
+	if (xchk_iscan_aborted(&xnc->collect_iscan))
+		goto out_incomplete;
+
+	mutex_lock(&xnc->lock);
+	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rbmino);
+	if (error)
+		goto out_abort;
+
+	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_rsumino);
+	if (error)
+		goto out_abort;
+
+	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_uquotino);
+	if (error)
+		goto out_abort;
+
+	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_gquotino);
+	if (error)
+		goto out_abort;
+
+	error = xchk_nlinks_collect_metafile(xnc, mp->m_sb.sb_pquotino);
+	if (error)
+		goto out_abort;
+	mutex_unlock(&xnc->lock);
+
+	return 0;
+
+out_abort:
+	mutex_unlock(&xnc->lock);
+	xchk_iscan_abort(&xnc->collect_iscan);
+out_incomplete:
+	xchk_set_incomplete(xnc->sc);
+	return error;
+}
+
+/* Advance the collection scan cursor for this non-directory file. */
+static inline int
+xchk_nlinks_collect_file(
+	struct xchk_nlink_ctrs	*xnc,
+	struct xfs_inode	*ip)
+{
+	xfs_ilock(ip, XFS_IOLOCK_SHARED);
+	xchk_iscan_mark_visited(&xnc->collect_iscan, ip);
+	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+	return 0;
+}
+
+/* Walk all directories and count inode links. */
+STATIC int
+xchk_nlinks_collect(
+	struct xchk_nlink_ctrs	*xnc)
+{
+	struct xfs_scrub	*sc = xnc->sc;
+	struct xfs_inode	*ip;
+	int			error;
+
+	/* Count the rt and quota files that are rooted in the superblock. */
+	error = xchk_nlinks_collect_metafiles(xnc);
+	if (error)
+		return error;
+
+	/*
+	 * Set up for a potentially lengthy filesystem scan by reducing our
+	 * transaction resource usage for the duration.  Specifically:
+	 *
+	 * Cancel the transaction to release the log grant space while we scan
+	 * the filesystem.
+	 *
+	 * Create a new empty transaction to eliminate the possibility of the
+	 * inode scan deadlocking on cyclical metadata.
+	 *
+	 * We pass the empty transaction to the file scanning function to avoid
+	 * repeatedly cycling empty transactions.  This can be done even though
+	 * we take the IOLOCK to quiesce the file because empty transactions
+	 * do not take sb_internal.
+	 */
+	xchk_trans_cancel(sc);
+	error = xchk_trans_alloc_empty(sc);
+	if (error)
+		return error;
+
+	while ((error = xchk_iscan_iter(&xnc->collect_iscan, &ip)) == 1) {
+		if (S_ISDIR(VFS_I(ip)->i_mode))
+			error = xchk_nlinks_collect_dir(xnc, ip);
+		else
+			error = xchk_nlinks_collect_file(xnc, ip);
+		xchk_irele(sc, ip);
+		if (error)
+			break;
+
+		if (xchk_should_terminate(sc, &error))
+			break;
+	}
+	xchk_iscan_iter_finish(&xnc->collect_iscan);
+	if (error) {
+		xchk_set_incomplete(sc);
+		/*
+		 * If we couldn't grab an inode that was busy with a state
+		 * change, change the error code so that we exit to userspace
+		 * as quickly as possible.
+		 */
+		if (error == -EBUSY)
+			return -ECANCELED;
+		return error;
+	}
+
+	/*
+	 * Switch out for a real transaction in preparation for building a new
+	 * tree.
+	 */
+	xchk_trans_cancel(sc);
+	return xchk_setup_fs(sc);
+}
+
+/*
+ * Part 2: Comparing file link counters.  Walk each inode and compare the link
+ * counts against our shadow information; and then walk each shadow link count
+ * structure (that wasn't covered in the first part), comparing it against the
+ * file.
+ */
+
+/* Read the observed link count for comparison with the actual inode. */
+STATIC int
+xchk_nlinks_comparison_read(
+	struct xchk_nlink_ctrs	*xnc,
+	xfs_ino_t		ino,
+	struct xchk_nlink	*obs)
+{
+	struct xchk_nlink	nl;
+	int			error;
+
+	error = xfarray_load_sparse(xnc->nlinks, ino, &nl);
+	if (error)
+		return error;
+
+	nl.flags |= (XCHK_NLINK_COMPARE_SCANNED | XCHK_NLINK_WRITTEN);
+
+	error = xfarray_store(xnc->nlinks, ino, &nl);
+	if (error == -EFBIG) {
+		/*
+		 * EFBIG means we tried to store data at too high a byte offset
+		 * in the sparse array.  IOWs, we cannot complete the check and
+		 * must notify userspace that the check was incomplete.  This
+		 * shouldn't really happen outside of the collection phase.
+		 */
+		xchk_set_incomplete(xnc->sc);
+		return -ECANCELED;
+	}
+	if (error)
+		return error;
+
+	/* Copy the counters, but do not expose the internal state. */
+	obs->parents = nl.parents;
+	obs->backrefs = nl.backrefs;
+	obs->children = nl.children;
+	obs->flags = 0;
+	return 0;
+}
+
+/* Check our link count against an inode. */
+STATIC int
+xchk_nlinks_compare_inode(
+	struct xchk_nlink_ctrs	*xnc,
+	struct xfs_inode	*ip)
+{
+	struct xchk_nlink	obs;
+	struct xfs_scrub	*sc = xnc->sc;
+	uint64_t		total_links;
+	unsigned int		actual_nlink;
+	int			error;
+
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+	mutex_lock(&xnc->lock);
+
+	if (xchk_iscan_aborted(&xnc->collect_iscan)) {
+		xchk_set_incomplete(xnc->sc);
+		error = -ECANCELED;
+		goto out_scanlock;
+	}
+
+	error = xchk_nlinks_comparison_read(xnc, ip->i_ino, &obs);
+	if (error)
+		goto out_scanlock;
+
+	/*
+	 * If we don't have ftype to get an accurate count of the subdirectory
+	 * entries in this directory, take advantage of the fact that on a
+	 * consistent ftype=0 filesystem, the number of subdirectory
+	 * backreferences (dotdot entries) pointing towards this directory
+	 * should be equal to the number of subdirectory entries in the
+	 * directory.
+	 */
+	if (!xfs_has_ftype(sc->mp) && S_ISDIR(VFS_I(ip)->i_mode))
+		obs.children = obs.backrefs;
+
+	total_links = xchk_nlink_total(ip, &obs);
+	actual_nlink = VFS_I(ip)->i_nlink;
+
+	trace_xchk_nlinks_compare_inode(sc->mp, ip, &obs);
+
+	/*
+	 * If we found so many parents that we'd overflow i_nlink, we must flag
+	 * this as a corruption.  The VFS won't let users increase the link
+	 * count, but it will let them decrease it.
+	 */
+	if (total_links > XFS_MAXLINK) {
+		xchk_ino_set_corrupt(sc, ip->i_ino);
+		goto out_corrupt;
+	}
+
+	/* Link counts should match. */
+	if (total_links != actual_nlink) {
+		xchk_ino_set_corrupt(sc, ip->i_ino);
+		goto out_corrupt;
+	}
+
+	if (S_ISDIR(VFS_I(ip)->i_mode) && actual_nlink > 0) {
+		/*
+		 * The collection phase ignores directories with zero link
+		 * count, so we ignore them here too.
+		 *
+		 * The number of subdirectory backreferences (dotdot entries)
+		 * pointing towards this directory should be equal to the
+		 * number of subdirectory entries in the directory.
+		 */
+		if (obs.children != obs.backrefs)
+			xchk_ino_xref_set_corrupt(sc, ip->i_ino);
+	} else {
+		/*
+		 * Non-directories and unlinked directories should not have
+		 * back references.
+		 */
+		if (obs.backrefs != 0) {
+			xchk_ino_set_corrupt(sc, ip->i_ino);
+			goto out_corrupt;
+		}
+
+		/*
+		 * Non-directories and unlinked directories should not have
+		 * children.
+		 */
+		if (obs.children != 0) {
+			xchk_ino_set_corrupt(sc, ip->i_ino);
+			goto out_corrupt;
+		}
+	}
+
+	if (ip == sc->mp->m_rootip) {
+		/*
+		 * For the root of a directory tree, both the '.' and '..'
+		 * entries should point to the root directory.  The dotdot
+		 * entry is counted as a parent of the root /and/ a backref of
+		 * the root directory.
+		 */
+		if (obs.parents != 1) {
+			xchk_ino_set_corrupt(sc, ip->i_ino);
+			goto out_corrupt;
+		}
+	} else if (actual_nlink > 0) {
+		/*
+		 * Linked files that are not the root directory should have at
+		 * least one parent.
+		 */
+		if (obs.parents == 0) {
+			xchk_ino_set_corrupt(sc, ip->i_ino);
+			goto out_corrupt;
+		}
+	}
+
+out_corrupt:
+	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		error = -ECANCELED;
+out_scanlock:
+	mutex_unlock(&xnc->lock);
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	return error;
+}
+
+/*
+ * Check our link count against an inode that wasn't checked previously.  This
+ * is intended to catch directories with dangling links, though we could be
+ * racing with inode allocation in other threads.
+ */
+STATIC int
+xchk_nlinks_compare_inum(
+	struct xchk_nlink_ctrs	*xnc,
+	xfs_ino_t		ino)
+{
+	struct xchk_nlink	obs;
+	struct xfs_mount	*mp = xnc->sc->mp;
+	struct xfs_trans	*tp = xnc->sc->tp;
+	struct xfs_buf		*agi_bp;
+	struct xfs_inode	*ip;
+	int			error;
+
+	/*
+	 * The first iget failed, so try again with the variant that returns
+	 * either an incore inode or the AGI buffer.  If the function returns
+	 * EINVAL/ENOENT, it should have passed us the AGI buffer so that we
+	 * can guarantee that the inode won't be allocated while we check for
+	 * a zero link count in the observed link count data.
+	 */
+	error = xchk_iget_agi(xnc->sc, ino, &agi_bp, &ip);
+	if (!error) {
+		/* Actually got an inode, so use the inode compare. */
+		error = xchk_nlinks_compare_inode(xnc, ip);
+		xchk_irele(xnc->sc, ip);
+		return error;
+	}
+	if (error == -ENOENT || error == -EINVAL) {
+		/* No inode was found.  Check for zero link count below. */
+		error = 0;
+	}
+	if (error)
+		goto out_agi;
+
+	/* Ensure that we have protected against inode allocation/freeing. */
+	if (agi_bp == NULL) {
+		ASSERT(agi_bp != NULL);
+		xchk_set_incomplete(xnc->sc);
+		return -ECANCELED;
+	}
+
+	if (xchk_iscan_aborted(&xnc->collect_iscan)) {
+		xchk_set_incomplete(xnc->sc);
+		error = -ECANCELED;
+		goto out_agi;
+	}
+
+	mutex_lock(&xnc->lock);
+	error = xchk_nlinks_comparison_read(xnc, ino, &obs);
+	if (error)
+		goto out_scanlock;
+
+	trace_xchk_nlinks_check_zero(mp, ino, &obs);
+
+	/*
+	 * If we can't grab the inode, the link count had better be zero.  We
+	 * still hold the AGI to prevent inode allocation/freeing.
+	 */
+	if (xchk_nlink_total(NULL, &obs) != 0) {
+		xchk_ino_set_corrupt(xnc->sc, ino);
+		error = -ECANCELED;
+	}
+
+out_scanlock:
+	mutex_unlock(&xnc->lock);
+out_agi:
+	if (agi_bp)
+		xfs_trans_brelse(tp, agi_bp);
+	return error;
+}
+
+/*
+ * Try to visit every inode in the filesystem to compare the link count.  Move
+ * on if we can't grab an inode, since we'll revisit unchecked nlink records in
+ * the second part.
+ */
+static int
+xchk_nlinks_compare_iter(
+	struct xchk_nlink_ctrs	*xnc,
+	struct xfs_inode	**ipp)
+{
+	int			error;
+
+	do {
+		error = xchk_iscan_iter(&xnc->compare_iscan, ipp);
+	} while (error == -EBUSY);
+
+	return error;
+}
+
+/* Compare the link counts we observed against the live information. */
+STATIC int
+xchk_nlinks_compare(
+	struct xchk_nlink_ctrs	*xnc)
+{
+	struct xchk_nlink	nl;
+	struct xfs_scrub	*sc = xnc->sc;
+	struct xfs_inode	*ip;
+	xfarray_idx_t		cur = XFARRAY_CURSOR_INIT;
+	int			error;
+
+	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return 0;
+
+	/*
+	 * Create a new empty transaction so that we can advance the iscan
+	 * cursor without deadlocking if the inobt has a cycle and push on the
+	 * inactivation workqueue.
+	 */
+	xchk_trans_cancel(sc);
+	error = xchk_trans_alloc_empty(sc);
+	if (error)
+		return error;
+
+	/*
+	 * Use the inobt to walk all allocated inodes to compare the link
+	 * counts.  Inodes skipped by _compare_iter will be tried again in the
+	 * next phase of the scan.
+	 */
+	xchk_iscan_start(sc, 0, 0, &xnc->compare_iscan);
+	while ((error = xchk_nlinks_compare_iter(xnc, &ip)) == 1) {
+		error = xchk_nlinks_compare_inode(xnc, ip);
+		xchk_iscan_mark_visited(&xnc->compare_iscan, ip);
+		xchk_irele(sc, ip);
+		if (error)
+			break;
+
+		if (xchk_should_terminate(sc, &error))
+			break;
+	}
+	xchk_iscan_iter_finish(&xnc->compare_iscan);
+	xchk_iscan_teardown(&xnc->compare_iscan);
+	if (error)
+		return error;
+
+	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return 0;
+
+	/*
+	 * Walk all the non-null nlink observations that weren't checked in the
+	 * previous step.
+	 */
+	mutex_lock(&xnc->lock);
+	while ((error = xfarray_iter(xnc->nlinks, &cur, &nl)) == 1) {
+		xfs_ino_t	ino = cur - 1;
+
+		if (nl.flags & XCHK_NLINK_COMPARE_SCANNED)
+			continue;
+
+		mutex_unlock(&xnc->lock);
+
+		error = xchk_nlinks_compare_inum(xnc, ino);
+		if (error)
+			return error;
+
+		if (xchk_should_terminate(xnc->sc, &error))
+			return error;
+
+		mutex_lock(&xnc->lock);
+	}
+	mutex_unlock(&xnc->lock);
+
+	return error;
+}
+
+/* Tear down everything associated with a nlinks check. */
+static void
+xchk_nlinks_teardown_scan(
+	void			*priv)
+{
+	struct xchk_nlink_ctrs	*xnc = priv;
+
+	/* Discourage any hook functions that might be running. */
+	xchk_iscan_abort(&xnc->collect_iscan);
+
+	xfs_dir_hook_del(xnc->sc->mp, &xnc->dhook);
+
+	xfarray_destroy(xnc->nlinks);
+	xnc->nlinks = NULL;
+
+	xchk_iscan_teardown(&xnc->collect_iscan);
+	mutex_destroy(&xnc->lock);
+	xnc->sc = NULL;
+}
+
+/*
+ * Scan all inodes in the entire filesystem to generate link count data.  If
+ * the scan is successful, the counts will be left alive for a repair.  If any
+ * error occurs, we'll tear everything down.
+ */
+STATIC int
+xchk_nlinks_setup_scan(
+	struct xfs_scrub	*sc,
+	struct xchk_nlink_ctrs	*xnc)
+{
+	struct xfs_mount	*mp = sc->mp;
+	char			*descr;
+	unsigned long long	max_inos;
+	xfs_agnumber_t		last_agno = mp->m_sb.sb_agcount - 1;
+	xfs_agino_t		first_agino, last_agino;
+	int			error;
+
+	ASSERT(xnc->sc == NULL);
+	xnc->sc = sc;
+
+	mutex_init(&xnc->lock);
+
+	/* Retry iget every tenth of a second for up to 30 seconds. */
+	xchk_iscan_start(sc, 30000, 100, &xnc->collect_iscan);
+
+	/*
+	 * Set up enough space to store an nlink record for the highest
+	 * possible inode number in this system.
+	 */
+	xfs_agino_range(mp, last_agno, &first_agino, &last_agino);
+	max_inos = XFS_AGINO_TO_INO(mp, last_agno, last_agino) + 1;
+	descr = xchk_xfile_descr(sc, "file link counts");
+	error = xfarray_create(descr, min(XFS_MAXINUMBER + 1, max_inos),
+			sizeof(struct xchk_nlink), &xnc->nlinks);
+	kfree(descr);
+	if (error)
+		goto out_teardown;
+
+	/*
+	 * Hook into the directory entry code so that we can capture updates to
+	 * file link counts.  The hook only triggers for inodes that were
+	 * already scanned, and the scanner thread takes each inode's ILOCK,
+	 * which means that any in-progress inode updates will finish before we
+	 * can scan the inode.
+	 */
+	ASSERT(sc->flags & XCHK_FSGATES_DIRENTS);
+	xfs_dir_hook_setup(&xnc->dhook, xchk_nlinks_live_update);
+	error = xfs_dir_hook_add(mp, &xnc->dhook);
+	if (error)
+		goto out_teardown;
+
+	/* Use deferred cleanup to pass the inode link count data to repair. */
+	sc->buf_cleanup = xchk_nlinks_teardown_scan;
+	return 0;
+
+out_teardown:
+	xchk_nlinks_teardown_scan(xnc);
+	return error;
+}
+
+/* Scrub the link count of all inodes on the filesystem. */
+int
+xchk_nlinks(
+	struct xfs_scrub	*sc)
+{
+	struct xchk_nlink_ctrs	*xnc = sc->buf;
+	int			error = 0;
+
+	/* Set ourselves up to check link counts on the live filesystem. */
+	error = xchk_nlinks_setup_scan(sc, xnc);
+	if (error)
+		return error;
+
+	/* Walk all inodes, picking up link count information. */
+	error = xchk_nlinks_collect(xnc);
+	if (!xchk_xref_process_error(sc, 0, 0, &error))
+		return error;
+
+	/* Fail fast if we're not playing with a full dataset. */
+	if (xchk_iscan_aborted(&xnc->collect_iscan))
+		xchk_set_incomplete(sc);
+	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
+		return 0;
+
+	/* Compare link counts. */
+	error = xchk_nlinks_compare(xnc);
+	if (!xchk_xref_process_error(sc, 0, 0, &error))
+		return error;
+
+	/* Check one last time for an incomplete dataset. */
+	if (xchk_iscan_aborted(&xnc->collect_iscan))
+		xchk_set_incomplete(sc);
+
+	return 0;
+}
diff --git a/fs/xfs/scrub/nlinks.h b/fs/xfs/scrub/nlinks.h
new file mode 100644
index 000000000000..a950f3daf204
--- /dev/null
+++ b/fs/xfs/scrub/nlinks.h
@@ -0,0 +1,102 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_NLINKS_H__
+#define __XFS_SCRUB_NLINKS_H__
+
+/* Live link count control structure. */
+struct xchk_nlink_ctrs {
+	struct xfs_scrub	*sc;
+
+	/* Shadow link count data and its mutex. */
+	struct xfarray		*nlinks;
+	struct mutex		lock;
+
+	/*
+	 * The collection step uses a separate iscan context from the compare
+	 * step because the collection iscan coordinates live updates to the
+	 * observation data while this scanner is running.  The compare iscan
+	 * is secondary and can be reinitialized as needed.
+	 */
+	struct xchk_iscan	collect_iscan;
+	struct xchk_iscan	compare_iscan;
+
+	/*
+	 * Hook into directory updates so that we can receive live updates
+	 * from other writer threads.
+	 */
+	struct xfs_dir_hook	dhook;
+};
+
+/*
+ * In-core link counts for a given inode in the filesystem.
+ *
+ * For an empty rootdir, the directory entries and the field to which they are
+ * accounted are as follows:
+ *
+ * Root directory:
+ *
+ * . points to self		(root.child)
+ * .. points to self		(root.parent)
+ * f1 points to a child file	(f1.parent)
+ * d1 points to a child dir	(d1.parent, root.child)
+ *
+ * Subdirectory d1:
+ *
+ * . points to self		(d1.child)
+ * .. points to root dir	(root.backref)
+ * f2 points to child file	(f2.parent)
+ * f3 points to root.f1		(f1.parent)
+ *
+ * root.nlink == 3 (root.dot, root.dotdot, root.d1)
+ * d1.nlink == 2 (root.d1, d1.dot)
+ * f1.nlink == 2 (root.f1, d1.f3)
+ * f2.nlink == 1 (d1.f2)
+ */
+struct xchk_nlink {
+	/* Count of forward links from parent directories to this file. */
+	xfs_nlink_t		parents;
+
+	/*
+	 * Count of back links to this parent directory from child
+	 * subdirectories.
+	 */
+	xfs_nlink_t		backrefs;
+
+	/*
+	 * Count of forward links from this directory to all child files and
+	 * the number of dot entries.  Should be zero for non-directories.
+	 */
+	xfs_nlink_t		children;
+
+	/* Record state flags */
+	unsigned int		flags;
+};
+
+/*
+ * This incore link count has been written at least once.  We never want to
+ * store an xchk_nlink that looks uninitialized.
+ */
+#define XCHK_NLINK_WRITTEN		(1U << 0)
+
+/* Already checked this link count record. */
+#define XCHK_NLINK_COMPARE_SCANNED	(1U << 1)
+
+/* Already made a repair with this link count record. */
+#define XREP_NLINK_DIRTY		(1U << 2)
+
+/* Compute total link count, using large enough variables to detect overflow. */
+static inline uint64_t
+xchk_nlink_total(struct xfs_inode *ip, const struct xchk_nlink *live)
+{
+	uint64_t	ret = live->parents;
+
+	/* Add one link count for the dot entry of any linked directory. */
+	if (ip && S_ISDIR(VFS_I(ip)->i_mode) && VFS_I(ip)->i_nlink)
+		ret++;
+	return ret + live->children;
+}
+
+#endif /* __XFS_SCRUB_NLINKS_H__ */
diff --git a/fs/xfs/scrub/nlinks_repair.c b/fs/xfs/scrub/nlinks_repair.c
new file mode 100644
index 000000000000..b87618322f55
--- /dev/null
+++ b/fs/xfs/scrub/nlinks_repair.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2021-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_bmap_util.h"
+#include "xfs_iwalk.h"
+#include "xfs_ialloc.h"
+#include "xfs_sb.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/nlinks.h"
+#include "scrub/trace.h"
+
+/*
+ * Live Inode Link Count Repair
+ * ============================
+ *
+ * Use the live inode link count information that we collected to replace the
+ * nlink values of the incore inodes.  A scrub->repair cycle should have left
+ * the live data and hooks active, so this is safe so long as we make sure the
+ * inode is locked.
+ */
+
+/*
+ * Correct the link count of the given inode.  Because we have to grab locks
+ * and resources in a certain order, it's possible that this will be a no-op.
+ */
+STATIC int
+xrep_nlinks_repair_inode(
+	struct xchk_nlink_ctrs	*xnc)
+{
+	struct xchk_nlink	obs;
+	struct xfs_scrub	*sc = xnc->sc;
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_inode	*ip = sc->ip;
+	uint64_t		total_links;
+	uint64_t		actual_nlink;
+	bool			dirty = false;
+	int			error;
+
+	xchk_ilock(sc, XFS_IOLOCK_EXCL);
+
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_link, 0, 0, 0, &sc->tp);
+	if (error)
+		return error;
+
+	xchk_ilock(sc, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(sc->tp, ip, 0);
+
+	mutex_lock(&xnc->lock);
+
+	if (xchk_iscan_aborted(&xnc->collect_iscan)) {
+		error = -ECANCELED;
+		goto out_scanlock;
+	}
+
+	error = xfarray_load_sparse(xnc->nlinks, ip->i_ino, &obs);
+	if (error)
+		goto out_scanlock;
+
+	/*
+	 * We're done accessing the shared scan data, so we can drop the lock.
+	 * We still hold @ip's ILOCK, so its link count cannot change.
+	 */
+	mutex_unlock(&xnc->lock);
+
+	total_links = xchk_nlink_total(ip, &obs);
+	actual_nlink = VFS_I(ip)->i_nlink;
+
+	/*
+	 * Non-directories cannot have directories pointing up to them.
+	 *
+	 * We previously set error to zero, but set it again because one static
+	 * checker author fears that programmers will fail to maintain this
+	 * invariant and built their tool to flag this as a security risk.  A
+	 * different tool author made their bot complain about the redundant
+	 * store.  This is a never-ending and stupid battle; both tools missed
+	 * *actual bugs* elsewhere; and I no longer care.
+	 */
+	if (!S_ISDIR(VFS_I(ip)->i_mode) && obs.children != 0) {
+		trace_xrep_nlinks_unfixable_inode(mp, ip, &obs);
+		error = 0;
+		goto out_trans;
+	}
+
+	/*
+	 * We did not find any links to this inode.  If the inode agrees, we
+	 * have nothing further to do.  If not, the inode has a nonzero link
+	 * count and we don't have anywhere to graft the child onto.  Dropping
+	 * a live inode's link count to zero can cause unexpected shutdowns in
+	 * inactivation, so leave it alone.
+	 */
+	if (total_links == 0) {
+		if (actual_nlink != 0)
+			trace_xrep_nlinks_unfixable_inode(mp, ip, &obs);
+		goto out_trans;
+	}
+
+	/* Commit the new link count if it changed. */
+	if (total_links != actual_nlink) {
+		if (total_links > XFS_MAXLINK) {
+			trace_xrep_nlinks_unfixable_inode(mp, ip, &obs);
+			goto out_trans;
+		}
+
+		trace_xrep_nlinks_update_inode(mp, ip, &obs);
+
+		set_nlink(VFS_I(ip), total_links);
+		dirty = true;
+	}
+
+	if (!dirty) {
+		error = 0;
+		goto out_trans;
+	}
+
+	xfs_trans_log_inode(sc->tp, ip, XFS_ILOG_CORE);
+
+	error = xrep_trans_commit(sc);
+	xchk_iunlock(sc, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	return error;
+
+out_scanlock:
+	mutex_unlock(&xnc->lock);
+out_trans:
+	xchk_trans_cancel(sc);
+	xchk_iunlock(sc, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	return error;
+}
+
+/*
+ * Try to visit every inode in the filesystem for repairs.  Move on if we can't
+ * grab an inode, since we're still making forward progress.
+ */
+static int
+xrep_nlinks_iter(
+	struct xchk_nlink_ctrs	*xnc,
+	struct xfs_inode	**ipp)
+{
+	int			error;
+
+	do {
+		error = xchk_iscan_iter(&xnc->compare_iscan, ipp);
+	} while (error == -EBUSY);
+
+	return error;
+}
+
+/* Commit the new inode link counters. */
+int
+xrep_nlinks(
+	struct xfs_scrub	*sc)
+{
+	struct xchk_nlink_ctrs	*xnc = sc->buf;
+	int			error;
+
+	/*
+	 * We need ftype for an accurate count of the number of child
+	 * subdirectory links.  Child subdirectories with a back link (dotdot
+	 * entry) but no forward link are unfixable, so we cannot repair the
+	 * link count of the parent directory based on the back link count
+	 * alone.  Filesystems without ftype support are rare (old V4) so we
+	 * just skip out here.
+	 */
+	if (!xfs_has_ftype(sc->mp))
+		return -EOPNOTSUPP;
+
+	/*
+	 * Use the inobt to walk all allocated inodes to compare and fix the
+	 * link counts.  Retry iget every tenth of a second for up to 30
+	 * seconds -- even if repair misses a few inodes, we still try to fix
+	 * as many of them as we can.
+	 */
+	xchk_iscan_start(sc, 30000, 100, &xnc->compare_iscan);
+	ASSERT(sc->ip == NULL);
+
+	while ((error = xrep_nlinks_iter(xnc, &sc->ip)) == 1) {
+		/*
+		 * Commit the scrub transaction so that we can create repair
+		 * transactions with the correct reservations.
+		 */
+		xchk_trans_cancel(sc);
+
+		error = xrep_nlinks_repair_inode(xnc);
+		xchk_iscan_mark_visited(&xnc->compare_iscan, sc->ip);
+		xchk_irele(sc, sc->ip);
+		sc->ip = NULL;
+		if (error)
+			break;
+
+		if (xchk_should_terminate(sc, &error))
+			break;
+
+		/*
+		 * Create a new empty transaction so that we can advance the
+		 * iscan cursor without deadlocking if the inobt has a cycle.
+		 * We can only push the inactivation workqueues with an empty
+		 * transaction.
+		 */
+		error = xchk_trans_alloc_empty(sc);
+		if (error)
+			break;
+	}
+	xchk_iscan_iter_finish(&xnc->compare_iscan);
+	xchk_iscan_teardown(&xnc->compare_iscan);
+
+	return error;
+}
diff --git a/fs/xfs/scrub/quotacheck.c b/fs/xfs/scrub/quotacheck.c
new file mode 100644
index 000000000000..c77eb2de8df7
--- /dev/null
+++ b/fs/xfs/scrub/quotacheck.c
@@ -0,0 +1,867 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_icache.h"
+#include "xfs_bmap_util.h"
+#include "xfs_ialloc.h"
+#include "xfs_ag.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/quota.h"
+#include "scrub/quotacheck.h"
+#include "scrub/trace.h"
+
+/*
+ * Live Quotacheck
+ * ===============
+ *
+ * Quota counters are "summary" metadata, in the sense that they are computed
+ * as the summation of the block usage counts for every file on the filesystem.
+ * Therefore, we compute the correct icount, bcount, and rtbcount values by
+ * creating a shadow quota counter structure and walking every inode.
+ */
+
+/* Track the quota deltas for a dquot in a transaction. */
+struct xqcheck_dqtrx {
+	xfs_dqtype_t		q_type;
+	xfs_dqid_t		q_id;
+
+	int64_t			icount_delta;
+
+	int64_t			bcount_delta;
+	int64_t			delbcnt_delta;
+
+	int64_t			rtbcount_delta;
+	int64_t			delrtb_delta;
+};
+
+#define XQCHECK_MAX_NR_DQTRXS	(XFS_QM_TRANS_DQTYPES * XFS_QM_TRANS_MAXDQS)
+
+/*
+ * Track the quota deltas for all dquots attached to a transaction if the
+ * quota deltas are being applied to an inode that we already scanned.
+ */
+struct xqcheck_dqacct {
+	struct rhash_head	hash;
+	uintptr_t		tx_id;
+	struct xqcheck_dqtrx	dqtrx[XQCHECK_MAX_NR_DQTRXS];
+	unsigned int		refcount;
+};
+
+/* Free a shadow dquot accounting structure. */
+static void
+xqcheck_dqacct_free(
+	void			*ptr,
+	void			*arg)
+{
+	struct xqcheck_dqacct	*dqa = ptr;
+
+	kfree(dqa);
+}
+
+/* Set us up to scrub quota counters. */
+int
+xchk_setup_quotacheck(
+	struct xfs_scrub	*sc)
+{
+	if (!XFS_IS_QUOTA_ON(sc->mp))
+		return -ENOENT;
+
+	xchk_fsgates_enable(sc, XCHK_FSGATES_QUOTA);
+
+	sc->buf = kzalloc(sizeof(struct xqcheck), XCHK_GFP_FLAGS);
+	if (!sc->buf)
+		return -ENOMEM;
+
+	return xchk_setup_fs(sc);
+}
+
+/*
+ * Part 1: Collecting dquot resource usage counts.  For each xfs_dquot attached
+ * to each inode, we create a shadow dquot, and compute the inode count and add
+ * the data/rt block usage from what we see.
+ *
+ * To avoid false corruption reports in part 2, any failure in this part must
+ * set the INCOMPLETE flag even when a negative errno is returned.  This care
+ * must be taken with certain errno values (i.e. EFSBADCRC, EFSCORRUPTED,
+ * ECANCELED) that are absorbed into a scrub state flag update by
+ * xchk_*_process_error.  Scrub and repair share the same incore data
+ * structures, so the INCOMPLETE flag is critical to prevent a repair based on
+ * insufficient information.
+ *
+ * Because we are scanning a live filesystem, it's possible that another thread
+ * will try to update the quota counters for an inode that we've already
+ * scanned.  This will cause our counts to be incorrect.  Therefore, we hook
+ * the live transaction code in two places: (1) when the callers update the
+ * per-transaction dqtrx structure to log quota counter updates; and (2) when
+ * transaction commit actually logs those updates to the incore dquot.  By
+ * shadowing transaction updates in this manner, live quotacheck can ensure
+ * by locking the dquot and the shadow structure that its own copies are not
+ * out of date.  Because the hook code runs in a different process context from
+ * the scrub code and the scrub state flags are not accessed atomically,
+ * failures in the hook code must abort the iscan and the scrubber must notice
+ * the aborted scan and set the incomplete flag.
+ *
+ * Note that we use srcu notifier hooks to minimize the overhead when live
+ * quotacheck is /not/ running.
+ */
+
+/* Update an incore dquot counter information from a live update. */
+static int
+xqcheck_update_incore_counts(
+	struct xqcheck		*xqc,
+	struct xfarray		*counts,
+	xfs_dqid_t		id,
+	int64_t			inodes,
+	int64_t			nblks,
+	int64_t			rtblks)
+{
+	struct xqcheck_dquot	xcdq;
+	int			error;
+
+	error = xfarray_load_sparse(counts, id, &xcdq);
+	if (error)
+		return error;
+
+	xcdq.flags |= XQCHECK_DQUOT_WRITTEN;
+	xcdq.icount += inodes;
+	xcdq.bcount += nblks;
+	xcdq.rtbcount += rtblks;
+
+	error = xfarray_store(counts, id, &xcdq);
+	if (error == -EFBIG) {
+		/*
+		 * EFBIG means we tried to store data at too high a byte offset
+		 * in the sparse array.  IOWs, we cannot complete the check and
+		 * must notify userspace that the check was incomplete.
+		 */
+		error = -ECANCELED;
+	}
+	return error;
+}
+
+/* Decide if this is the shadow dquot accounting structure for a transaction. */
+static int
+xqcheck_dqacct_obj_cmpfn(
+	struct rhashtable_compare_arg	*arg,
+	const void			*obj)
+{
+	const uintptr_t			*tx_idp = arg->key;
+	const struct xqcheck_dqacct	*dqa = obj;
+
+	if (dqa->tx_id != *tx_idp)
+		return 1;
+	return 0;
+}
+
+static const struct rhashtable_params xqcheck_dqacct_hash_params = {
+	.min_size		= 32,
+	.key_len		= sizeof(uintptr_t),
+	.key_offset		= offsetof(struct xqcheck_dqacct, tx_id),
+	.head_offset		= offsetof(struct xqcheck_dqacct, hash),
+	.automatic_shrinking	= true,
+	.obj_cmpfn		= xqcheck_dqacct_obj_cmpfn,
+};
+
+/* Find a shadow dqtrx slot for the given dquot. */
+STATIC struct xqcheck_dqtrx *
+xqcheck_get_dqtrx(
+	struct xqcheck_dqacct	*dqa,
+	xfs_dqtype_t		q_type,
+	xfs_dqid_t		q_id)
+{
+	int			i;
+
+	for (i = 0; i < XQCHECK_MAX_NR_DQTRXS; i++) {
+		if (dqa->dqtrx[i].q_type == 0 ||
+		    (dqa->dqtrx[i].q_type == q_type &&
+		     dqa->dqtrx[i].q_id == q_id))
+			return &dqa->dqtrx[i];
+	}
+
+	return NULL;
+}
+
+/*
+ * Create and fill out a quota delta tracking structure to shadow the updates
+ * going on in the regular quota code.
+ */
+static int
+xqcheck_mod_live_ino_dqtrx(
+	struct notifier_block		*nb,
+	unsigned long			action,
+	void				*data)
+{
+	struct xfs_mod_ino_dqtrx_params *p = data;
+	struct xqcheck			*xqc;
+	struct xqcheck_dqacct		*dqa;
+	struct xqcheck_dqtrx		*dqtrx;
+	int				error;
+
+	xqc = container_of(nb, struct xqcheck, qhook.mod_hook.nb);
+
+	/* Skip quota reservation fields. */
+	switch (action) {
+	case XFS_TRANS_DQ_BCOUNT:
+	case XFS_TRANS_DQ_DELBCOUNT:
+	case XFS_TRANS_DQ_ICOUNT:
+	case XFS_TRANS_DQ_RTBCOUNT:
+	case XFS_TRANS_DQ_DELRTBCOUNT:
+		break;
+	default:
+		return NOTIFY_DONE;
+	}
+
+	/* Ignore dqtrx updates for quota types we don't care about. */
+	switch (p->q_type) {
+	case XFS_DQTYPE_USER:
+		if (!xqc->ucounts)
+			return NOTIFY_DONE;
+		break;
+	case XFS_DQTYPE_GROUP:
+		if (!xqc->gcounts)
+			return NOTIFY_DONE;
+		break;
+	case XFS_DQTYPE_PROJ:
+		if (!xqc->pcounts)
+			return NOTIFY_DONE;
+		break;
+	default:
+		return NOTIFY_DONE;
+	}
+
+	/* Skip inodes that haven't been scanned yet. */
+	if (!xchk_iscan_want_live_update(&xqc->iscan, p->ino))
+		return NOTIFY_DONE;
+
+	/* Make a shadow quota accounting tracker for this transaction. */
+	mutex_lock(&xqc->lock);
+	dqa = rhashtable_lookup_fast(&xqc->shadow_dquot_acct, &p->tx_id,
+			xqcheck_dqacct_hash_params);
+	if (!dqa) {
+		dqa = kzalloc(sizeof(struct xqcheck_dqacct), XCHK_GFP_FLAGS);
+		if (!dqa)
+			goto out_abort;
+
+		dqa->tx_id = p->tx_id;
+		error = rhashtable_insert_fast(&xqc->shadow_dquot_acct,
+				&dqa->hash, xqcheck_dqacct_hash_params);
+		if (error)
+			goto out_abort;
+	}
+
+	/* Find the shadow dqtrx (or an empty slot) here. */
+	dqtrx = xqcheck_get_dqtrx(dqa, p->q_type, p->q_id);
+	if (!dqtrx)
+		goto out_abort;
+	if (dqtrx->q_type == 0) {
+		dqtrx->q_type = p->q_type;
+		dqtrx->q_id = p->q_id;
+		dqa->refcount++;
+	}
+
+	/* Update counter */
+	switch (action) {
+	case XFS_TRANS_DQ_BCOUNT:
+		dqtrx->bcount_delta += p->delta;
+		break;
+	case XFS_TRANS_DQ_DELBCOUNT:
+		dqtrx->delbcnt_delta += p->delta;
+		break;
+	case XFS_TRANS_DQ_ICOUNT:
+		dqtrx->icount_delta += p->delta;
+		break;
+	case XFS_TRANS_DQ_RTBCOUNT:
+		dqtrx->rtbcount_delta += p->delta;
+		break;
+	case XFS_TRANS_DQ_DELRTBCOUNT:
+		dqtrx->delrtb_delta += p->delta;
+		break;
+	}
+
+	mutex_unlock(&xqc->lock);
+	return NOTIFY_DONE;
+
+out_abort:
+	xchk_iscan_abort(&xqc->iscan);
+	mutex_unlock(&xqc->lock);
+	return NOTIFY_DONE;
+}
+
+/*
+ * Apply the transaction quota deltas to our shadow quota accounting info when
+ * the regular quota code are doing the same.
+ */
+static int
+xqcheck_apply_live_dqtrx(
+	struct notifier_block		*nb,
+	unsigned long			action,
+	void				*data)
+{
+	struct xfs_apply_dqtrx_params	*p = data;
+	struct xqcheck			*xqc;
+	struct xqcheck_dqacct		*dqa;
+	struct xqcheck_dqtrx		*dqtrx;
+	struct xfarray			*counts;
+	int				error;
+
+	xqc = container_of(nb, struct xqcheck, qhook.apply_hook.nb);
+
+	/* Map the dquot type to an incore counter object. */
+	switch (p->q_type) {
+	case XFS_DQTYPE_USER:
+		counts = xqc->ucounts;
+		break;
+	case XFS_DQTYPE_GROUP:
+		counts = xqc->gcounts;
+		break;
+	case XFS_DQTYPE_PROJ:
+		counts = xqc->pcounts;
+		break;
+	default:
+		return NOTIFY_DONE;
+	}
+
+	if (xchk_iscan_aborted(&xqc->iscan) || counts == NULL)
+		return NOTIFY_DONE;
+
+	/*
+	 * Find the shadow dqtrx for this transaction and dquot, if any deltas
+	 * need to be applied here.  If not, we're finished early.
+	 */
+	mutex_lock(&xqc->lock);
+	dqa = rhashtable_lookup_fast(&xqc->shadow_dquot_acct, &p->tx_id,
+			xqcheck_dqacct_hash_params);
+	if (!dqa)
+		goto out_unlock;
+	dqtrx = xqcheck_get_dqtrx(dqa, p->q_type, p->q_id);
+	if (!dqtrx || dqtrx->q_type == 0)
+		goto out_unlock;
+
+	/* Update our shadow dquot if we're committing. */
+	if (action == XFS_APPLY_DQTRX_COMMIT) {
+		error = xqcheck_update_incore_counts(xqc, counts, p->q_id,
+				dqtrx->icount_delta,
+				dqtrx->bcount_delta + dqtrx->delbcnt_delta,
+				dqtrx->rtbcount_delta + dqtrx->delrtb_delta);
+		if (error)
+			goto out_abort;
+	}
+
+	/* Free the shadow accounting structure if that was the last user. */
+	dqa->refcount--;
+	if (dqa->refcount == 0) {
+		error = rhashtable_remove_fast(&xqc->shadow_dquot_acct,
+				&dqa->hash, xqcheck_dqacct_hash_params);
+		if (error)
+			goto out_abort;
+		xqcheck_dqacct_free(dqa, NULL);
+	}
+
+	mutex_unlock(&xqc->lock);
+	return NOTIFY_DONE;
+
+out_abort:
+	xchk_iscan_abort(&xqc->iscan);
+out_unlock:
+	mutex_unlock(&xqc->lock);
+	return NOTIFY_DONE;
+}
+
+/* Record this inode's quota usage in our shadow quota counter data. */
+STATIC int
+xqcheck_collect_inode(
+	struct xqcheck		*xqc,
+	struct xfs_inode	*ip)
+{
+	struct xfs_trans	*tp = xqc->sc->tp;
+	xfs_filblks_t		nblks, rtblks;
+	uint			ilock_flags = 0;
+	xfs_dqid_t		id;
+	bool			isreg = S_ISREG(VFS_I(ip)->i_mode);
+	int			error = 0;
+
+	if (xfs_is_quota_inode(&tp->t_mountp->m_sb, ip->i_ino)) {
+		/*
+		 * Quota files are never counted towards quota, so we do not
+		 * need to take the lock.
+		 */
+		xchk_iscan_mark_visited(&xqc->iscan, ip);
+		return 0;
+	}
+
+	/* Figure out the data / rt device block counts. */
+	xfs_ilock(ip, XFS_IOLOCK_SHARED);
+	if (isreg)
+		xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+	if (XFS_IS_REALTIME_INODE(ip)) {
+		/*
+		 * Read in the data fork for rt files so that _count_blocks
+		 * can count the number of blocks allocated from the rt volume.
+		 * Inodes do not track that separately.
+		 */
+		ilock_flags = xfs_ilock_data_map_shared(ip);
+		error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
+		if (error)
+			goto out_abort;
+	} else {
+		ilock_flags = XFS_ILOCK_SHARED;
+		xfs_ilock(ip, XFS_ILOCK_SHARED);
+	}
+	xfs_inode_count_blocks(tp, ip, &nblks, &rtblks);
+
+	if (xchk_iscan_aborted(&xqc->iscan)) {
+		error = -ECANCELED;
+		goto out_incomplete;
+	}
+
+	/* Update the shadow dquot counters. */
+	mutex_lock(&xqc->lock);
+	if (xqc->ucounts) {
+		id = xfs_qm_id_for_quotatype(ip, XFS_DQTYPE_USER);
+		error = xqcheck_update_incore_counts(xqc, xqc->ucounts, id, 1,
+				nblks, rtblks);
+		if (error)
+			goto out_mutex;
+	}
+
+	if (xqc->gcounts) {
+		id = xfs_qm_id_for_quotatype(ip, XFS_DQTYPE_GROUP);
+		error = xqcheck_update_incore_counts(xqc, xqc->gcounts, id, 1,
+				nblks, rtblks);
+		if (error)
+			goto out_mutex;
+	}
+
+	if (xqc->pcounts) {
+		id = xfs_qm_id_for_quotatype(ip, XFS_DQTYPE_PROJ);
+		error = xqcheck_update_incore_counts(xqc, xqc->pcounts, id, 1,
+				nblks, rtblks);
+		if (error)
+			goto out_mutex;
+	}
+	mutex_unlock(&xqc->lock);
+
+	xchk_iscan_mark_visited(&xqc->iscan, ip);
+	goto out_ilock;
+
+out_mutex:
+	mutex_unlock(&xqc->lock);
+out_abort:
+	xchk_iscan_abort(&xqc->iscan);
+out_incomplete:
+	xchk_set_incomplete(xqc->sc);
+out_ilock:
+	xfs_iunlock(ip, ilock_flags);
+	if (isreg)
+		xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+	return error;
+}
+
+/* Walk all the allocated inodes and run a quota scan on them. */
+STATIC int
+xqcheck_collect_counts(
+	struct xqcheck		*xqc)
+{
+	struct xfs_scrub	*sc = xqc->sc;
+	struct xfs_inode	*ip;
+	int			error;
+
+	/*
+	 * Set up for a potentially lengthy filesystem scan by reducing our
+	 * transaction resource usage for the duration.  Specifically:
+	 *
+	 * Cancel the transaction to release the log grant space while we scan
+	 * the filesystem.
+	 *
+	 * Create a new empty transaction to eliminate the possibility of the
+	 * inode scan deadlocking on cyclical metadata.
+	 *
+	 * We pass the empty transaction to the file scanning function to avoid
+	 * repeatedly cycling empty transactions.  This can be done without
+	 * risk of deadlock between sb_internal and the IOLOCK (we take the
+	 * IOLOCK to quiesce the file before scanning) because empty
+	 * transactions do not take sb_internal.
+	 */
+	xchk_trans_cancel(sc);
+	error = xchk_trans_alloc_empty(sc);
+	if (error)
+		return error;
+
+	while ((error = xchk_iscan_iter(&xqc->iscan, &ip)) == 1) {
+		error = xqcheck_collect_inode(xqc, ip);
+		xchk_irele(sc, ip);
+		if (error)
+			break;
+
+		if (xchk_should_terminate(sc, &error))
+			break;
+	}
+	xchk_iscan_iter_finish(&xqc->iscan);
+	if (error) {
+		xchk_set_incomplete(sc);
+		/*
+		 * If we couldn't grab an inode that was busy with a state
+		 * change, change the error code so that we exit to userspace
+		 * as quickly as possible.
+		 */
+		if (error == -EBUSY)
+			return -ECANCELED;
+		return error;
+	}
+
+	/*
+	 * Switch out for a real transaction in preparation for building a new
+	 * tree.
+	 */
+	xchk_trans_cancel(sc);
+	return xchk_setup_fs(sc);
+}
+
+/*
+ * Part 2: Comparing dquot resource counters.  Walk each xfs_dquot, comparing
+ * the resource usage counters against our shadow dquots; and then walk each
+ * shadow dquot (that wasn't covered in the first part), comparing it against
+ * the xfs_dquot.
+ */
+
+/*
+ * Check the dquot data against what we observed.  Caller must hold the dquot
+ * lock.
+ */
+STATIC int
+xqcheck_compare_dquot(
+	struct xqcheck		*xqc,
+	xfs_dqtype_t		dqtype,
+	struct xfs_dquot	*dq)
+{
+	struct xqcheck_dquot	xcdq;
+	struct xfarray		*counts = xqcheck_counters_for(xqc, dqtype);
+	int			error;
+
+	if (xchk_iscan_aborted(&xqc->iscan)) {
+		xchk_set_incomplete(xqc->sc);
+		return -ECANCELED;
+	}
+
+	mutex_lock(&xqc->lock);
+	error = xfarray_load_sparse(counts, dq->q_id, &xcdq);
+	if (error)
+		goto out_unlock;
+
+	if (xcdq.icount != dq->q_ino.count)
+		xchk_qcheck_set_corrupt(xqc->sc, dqtype, dq->q_id);
+
+	if (xcdq.bcount != dq->q_blk.count)
+		xchk_qcheck_set_corrupt(xqc->sc, dqtype, dq->q_id);
+
+	if (xcdq.rtbcount != dq->q_rtb.count)
+		xchk_qcheck_set_corrupt(xqc->sc, dqtype, dq->q_id);
+
+	xcdq.flags |= (XQCHECK_DQUOT_COMPARE_SCANNED | XQCHECK_DQUOT_WRITTEN);
+	error = xfarray_store(counts, dq->q_id, &xcdq);
+	if (error == -EFBIG) {
+		/*
+		 * EFBIG means we tried to store data at too high a byte offset
+		 * in the sparse array.  IOWs, we cannot complete the check and
+		 * must notify userspace that the check was incomplete.  This
+		 * should never happen outside of the collection phase.
+		 */
+		xchk_set_incomplete(xqc->sc);
+		error = -ECANCELED;
+	}
+	mutex_unlock(&xqc->lock);
+	if (error)
+		return error;
+
+	if (xqc->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return -ECANCELED;
+
+	return 0;
+
+out_unlock:
+	mutex_unlock(&xqc->lock);
+	return error;
+}
+
+/*
+ * Walk all the observed dquots, and make sure there's a matching incore
+ * dquot and that its counts match ours.
+ */
+STATIC int
+xqcheck_walk_observations(
+	struct xqcheck		*xqc,
+	xfs_dqtype_t		dqtype)
+{
+	struct xqcheck_dquot	xcdq;
+	struct xfs_dquot	*dq;
+	struct xfarray		*counts = xqcheck_counters_for(xqc, dqtype);
+	xfarray_idx_t		cur = XFARRAY_CURSOR_INIT;
+	int			error;
+
+	mutex_lock(&xqc->lock);
+	while ((error = xfarray_iter(counts, &cur, &xcdq)) == 1) {
+		xfs_dqid_t	id = cur - 1;
+
+		if (xcdq.flags & XQCHECK_DQUOT_COMPARE_SCANNED)
+			continue;
+
+		mutex_unlock(&xqc->lock);
+
+		error = xfs_qm_dqget(xqc->sc->mp, id, dqtype, false, &dq);
+		if (error == -ENOENT) {
+			xchk_qcheck_set_corrupt(xqc->sc, dqtype, id);
+			return 0;
+		}
+		if (error)
+			return error;
+
+		error = xqcheck_compare_dquot(xqc, dqtype, dq);
+		xfs_qm_dqput(dq);
+		if (error)
+			return error;
+
+		if (xchk_should_terminate(xqc->sc, &error))
+			return error;
+
+		mutex_lock(&xqc->lock);
+	}
+	mutex_unlock(&xqc->lock);
+
+	return error;
+}
+
+/* Compare the quota counters we observed against the live dquots. */
+STATIC int
+xqcheck_compare_dqtype(
+	struct xqcheck		*xqc,
+	xfs_dqtype_t		dqtype)
+{
+	struct xchk_dqiter	cursor = { };
+	struct xfs_scrub	*sc = xqc->sc;
+	struct xfs_dquot	*dq;
+	int			error;
+
+	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
+		return 0;
+
+	/* If the quota CHKD flag is cleared, we need to repair this quota. */
+	if (!(xfs_quota_chkd_flag(dqtype) & sc->mp->m_qflags)) {
+		xchk_qcheck_set_corrupt(xqc->sc, dqtype, 0);
+		return 0;
+	}
+
+	/* Compare what we observed against the actual dquots. */
+	xchk_dqiter_init(&cursor, sc, dqtype);
+	while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) {
+		error = xqcheck_compare_dquot(xqc, dqtype, dq);
+		xfs_qm_dqput(dq);
+		if (error)
+			break;
+	}
+	if (error)
+		return error;
+
+	/* Walk all the observed dquots and compare to the incore ones. */
+	return xqcheck_walk_observations(xqc, dqtype);
+}
+
+/* Tear down everything associated with a quotacheck. */
+static void
+xqcheck_teardown_scan(
+	void			*priv)
+{
+	struct xqcheck		*xqc = priv;
+	struct xfs_quotainfo	*qi = xqc->sc->mp->m_quotainfo;
+
+	/* Discourage any hook functions that might be running. */
+	xchk_iscan_abort(&xqc->iscan);
+
+	/*
+	 * As noted above, the apply hook is responsible for cleaning up the
+	 * shadow dquot accounting data when a transaction completes.  The mod
+	 * hook must be removed before the apply hook so that we don't
+	 * mistakenly leave an active shadow account for the mod hook to get
+	 * its hands on.  No hooks should be running after these functions
+	 * return.
+	 */
+	xfs_dqtrx_hook_del(qi, &xqc->qhook);
+
+	if (xqc->shadow_dquot_acct.key_len) {
+		rhashtable_free_and_destroy(&xqc->shadow_dquot_acct,
+				xqcheck_dqacct_free, NULL);
+		xqc->shadow_dquot_acct.key_len = 0;
+	}
+
+	if (xqc->pcounts) {
+		xfarray_destroy(xqc->pcounts);
+		xqc->pcounts = NULL;
+	}
+
+	if (xqc->gcounts) {
+		xfarray_destroy(xqc->gcounts);
+		xqc->gcounts = NULL;
+	}
+
+	if (xqc->ucounts) {
+		xfarray_destroy(xqc->ucounts);
+		xqc->ucounts = NULL;
+	}
+
+	xchk_iscan_teardown(&xqc->iscan);
+	mutex_destroy(&xqc->lock);
+	xqc->sc = NULL;
+}
+
+/*
+ * Scan all inodes in the entire filesystem to generate quota counter data.
+ * If the scan is successful, the quota data will be left alive for a repair.
+ * If any error occurs, we'll tear everything down.
+ */
+STATIC int
+xqcheck_setup_scan(
+	struct xfs_scrub	*sc,
+	struct xqcheck		*xqc)
+{
+	char			*descr;
+	struct xfs_quotainfo	*qi = sc->mp->m_quotainfo;
+	unsigned long long	max_dquots = XFS_DQ_ID_MAX + 1ULL;
+	int			error;
+
+	ASSERT(xqc->sc == NULL);
+	xqc->sc = sc;
+
+	mutex_init(&xqc->lock);
+
+	/* Retry iget every tenth of a second for up to 30 seconds. */
+	xchk_iscan_start(sc, 30000, 100, &xqc->iscan);
+
+	error = -ENOMEM;
+	if (xfs_this_quota_on(sc->mp, XFS_DQTYPE_USER)) {
+		descr = xchk_xfile_descr(sc, "user dquot records");
+		error = xfarray_create(descr, max_dquots,
+				sizeof(struct xqcheck_dquot), &xqc->ucounts);
+		kfree(descr);
+		if (error)
+			goto out_teardown;
+	}
+
+	if (xfs_this_quota_on(sc->mp, XFS_DQTYPE_GROUP)) {
+		descr = xchk_xfile_descr(sc, "group dquot records");
+		error = xfarray_create(descr, max_dquots,
+				sizeof(struct xqcheck_dquot), &xqc->gcounts);
+		kfree(descr);
+		if (error)
+			goto out_teardown;
+	}
+
+	if (xfs_this_quota_on(sc->mp, XFS_DQTYPE_PROJ)) {
+		descr = xchk_xfile_descr(sc, "project dquot records");
+		error = xfarray_create(descr, max_dquots,
+				sizeof(struct xqcheck_dquot), &xqc->pcounts);
+		kfree(descr);
+		if (error)
+			goto out_teardown;
+	}
+
+	/*
+	 * Set up hash table to map transactions to our internal shadow dqtrx
+	 * structures.
+	 */
+	error = rhashtable_init(&xqc->shadow_dquot_acct,
+			&xqcheck_dqacct_hash_params);
+	if (error)
+		goto out_teardown;
+
+	/*
+	 * Hook into the quota code.  The hook only triggers for inodes that
+	 * were already scanned, and the scanner thread takes each inode's
+	 * ILOCK, which means that any in-progress inode updates will finish
+	 * before we can scan the inode.
+	 *
+	 * The apply hook (which removes the shadow dquot accounting struct)
+	 * must be installed before the mod hook so that we never fail to catch
+	 * the end of a quota update sequence and leave stale shadow data.
+	 */
+	ASSERT(sc->flags & XCHK_FSGATES_QUOTA);
+	xfs_dqtrx_hook_setup(&xqc->qhook, xqcheck_mod_live_ino_dqtrx,
+			xqcheck_apply_live_dqtrx);
+
+	error = xfs_dqtrx_hook_add(qi, &xqc->qhook);
+	if (error)
+		goto out_teardown;
+
+	/* Use deferred cleanup to pass the quota count data to repair. */
+	sc->buf_cleanup = xqcheck_teardown_scan;
+	return 0;
+
+out_teardown:
+	xqcheck_teardown_scan(xqc);
+	return error;
+}
+
+/* Scrub all counters for a given quota type. */
+int
+xchk_quotacheck(
+	struct xfs_scrub	*sc)
+{
+	struct xqcheck		*xqc = sc->buf;
+	int			error = 0;
+
+	/* Check quota counters on the live filesystem. */
+	error = xqcheck_setup_scan(sc, xqc);
+	if (error)
+		return error;
+
+	/* Walk all inodes, picking up quota information. */
+	error = xqcheck_collect_counts(xqc);
+	if (!xchk_xref_process_error(sc, 0, 0, &error))
+		return error;
+
+	/* Fail fast if we're not playing with a full dataset. */
+	if (xchk_iscan_aborted(&xqc->iscan))
+		xchk_set_incomplete(sc);
+	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
+		return 0;
+
+	/* Compare quota counters. */
+	if (xqc->ucounts) {
+		error = xqcheck_compare_dqtype(xqc, XFS_DQTYPE_USER);
+		if (!xchk_xref_process_error(sc, 0, 0, &error))
+			return error;
+	}
+	if (xqc->gcounts) {
+		error = xqcheck_compare_dqtype(xqc, XFS_DQTYPE_GROUP);
+		if (!xchk_xref_process_error(sc, 0, 0, &error))
+			return error;
+	}
+	if (xqc->pcounts) {
+		error = xqcheck_compare_dqtype(xqc, XFS_DQTYPE_PROJ);
+		if (!xchk_xref_process_error(sc, 0, 0, &error))
+			return error;
+	}
+
+	/* Check one last time for an incomplete dataset. */
+	if (xchk_iscan_aborted(&xqc->iscan))
+		xchk_set_incomplete(sc);
+
+	return 0;
+}
diff --git a/fs/xfs/scrub/quotacheck.h b/fs/xfs/scrub/quotacheck.h
new file mode 100644
index 000000000000..4ea5f249c978
--- /dev/null
+++ b/fs/xfs/scrub/quotacheck.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_QUOTACHECK_H__
+#define __XFS_SCRUB_QUOTACHECK_H__
+
+/* Quota counters for live quotacheck. */
+struct xqcheck_dquot {
+	/* block usage count */
+	int64_t			bcount;
+
+	/* inode usage count */
+	int64_t			icount;
+
+	/* realtime block usage count */
+	int64_t			rtbcount;
+
+	/* Record state */
+	unsigned int		flags;
+};
+
+/*
+ * This incore dquot record has been written at least once.  We never want to
+ * store an xqcheck_dquot that looks uninitialized.
+ */
+#define XQCHECK_DQUOT_WRITTEN		(1U << 0)
+
+/* Already checked this dquot. */
+#define XQCHECK_DQUOT_COMPARE_SCANNED	(1U << 1)
+
+/* Already repaired this dquot. */
+#define XQCHECK_DQUOT_REPAIR_SCANNED	(1U << 2)
+
+/* Live quotacheck control structure. */
+struct xqcheck {
+	struct xfs_scrub	*sc;
+
+	/* Shadow dquot counter data. */
+	struct xfarray		*ucounts;
+	struct xfarray		*gcounts;
+	struct xfarray		*pcounts;
+
+	/* Lock protecting quotacheck count observations */
+	struct mutex		lock;
+
+	struct xchk_iscan	iscan;
+
+	/* Hooks into the quota code. */
+	struct xfs_dqtrx_hook	qhook;
+
+	/* Shadow quota delta tracking structure. */
+	struct rhashtable	shadow_dquot_acct;
+};
+
+/* Return the incore counter array for a given quota type. */
+static inline struct xfarray *
+xqcheck_counters_for(
+	struct xqcheck		*xqc,
+	xfs_dqtype_t		dqtype)
+{
+	switch (dqtype) {
+	case XFS_DQTYPE_USER:
+		return xqc->ucounts;
+	case XFS_DQTYPE_GROUP:
+		return xqc->gcounts;
+	case XFS_DQTYPE_PROJ:
+		return xqc->pcounts;
+	}
+
+	ASSERT(0);
+	return NULL;
+}
+
+#endif /* __XFS_SCRUB_QUOTACHECK_H__ */
diff --git a/fs/xfs/scrub/quotacheck_repair.c b/fs/xfs/scrub/quotacheck_repair.c
new file mode 100644
index 000000000000..dd8554c755b5
--- /dev/null
+++ b/fs/xfs/scrub/quotacheck_repair.c
@@ -0,0 +1,261 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_icache.h"
+#include "xfs_bmap_util.h"
+#include "xfs_iwalk.h"
+#include "xfs_ialloc.h"
+#include "xfs_sb.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/quota.h"
+#include "scrub/quotacheck.h"
+#include "scrub/trace.h"
+
+/*
+ * Live Quotacheck Repair
+ * ======================
+ *
+ * Use the live quota counter information that we collected to replace the
+ * counter values in the incore dquots.  A scrub->repair cycle should have left
+ * the live data and hooks active, so this is safe so long as we make sure the
+ * dquot is locked.
+ */
+
+/* Commit new counters to a dquot. */
+static int
+xqcheck_commit_dquot(
+	struct xqcheck		*xqc,
+	xfs_dqtype_t		dqtype,
+	struct xfs_dquot	*dq)
+{
+	struct xqcheck_dquot	xcdq;
+	struct xfarray		*counts = xqcheck_counters_for(xqc, dqtype);
+	int64_t			delta;
+	bool			dirty = false;
+	int			error = 0;
+
+	/* Unlock the dquot just long enough to allocate a transaction. */
+	xfs_dqunlock(dq);
+	error = xchk_trans_alloc(xqc->sc, 0);
+	xfs_dqlock(dq);
+	if (error)
+		return error;
+
+	xfs_trans_dqjoin(xqc->sc->tp, dq);
+
+	if (xchk_iscan_aborted(&xqc->iscan)) {
+		error = -ECANCELED;
+		goto out_cancel;
+	}
+
+	mutex_lock(&xqc->lock);
+	error = xfarray_load_sparse(counts, dq->q_id, &xcdq);
+	if (error)
+		goto out_unlock;
+
+	/* Adjust counters as needed. */
+	delta = (int64_t)xcdq.icount - dq->q_ino.count;
+	if (delta) {
+		dq->q_ino.reserved += delta;
+		dq->q_ino.count += delta;
+		dirty = true;
+	}
+
+	delta = (int64_t)xcdq.bcount - dq->q_blk.count;
+	if (delta) {
+		dq->q_blk.reserved += delta;
+		dq->q_blk.count += delta;
+		dirty = true;
+	}
+
+	delta = (int64_t)xcdq.rtbcount - dq->q_rtb.count;
+	if (delta) {
+		dq->q_rtb.reserved += delta;
+		dq->q_rtb.count += delta;
+		dirty = true;
+	}
+
+	xcdq.flags |= (XQCHECK_DQUOT_REPAIR_SCANNED | XQCHECK_DQUOT_WRITTEN);
+	error = xfarray_store(counts, dq->q_id, &xcdq);
+	if (error == -EFBIG) {
+		/*
+		 * EFBIG means we tried to store data at too high a byte offset
+		 * in the sparse array.  IOWs, we cannot complete the repair
+		 * and must cancel the whole operation.  This should never
+		 * happen, but we need to catch it anyway.
+		 */
+		error = -ECANCELED;
+	}
+	mutex_unlock(&xqc->lock);
+	if (error || !dirty)
+		goto out_cancel;
+
+	trace_xrep_quotacheck_dquot(xqc->sc->mp, dq->q_type, dq->q_id);
+
+	/* Commit the dirty dquot to disk. */
+	dq->q_flags |= XFS_DQFLAG_DIRTY;
+	if (dq->q_id)
+		xfs_qm_adjust_dqtimers(dq);
+	xfs_trans_log_dquot(xqc->sc->tp, dq);
+
+	/*
+	 * Transaction commit unlocks the dquot, so we must re-lock it so that
+	 * the caller can put the reference (which apparently requires a locked
+	 * dquot).
+	 */
+	error = xrep_trans_commit(xqc->sc);
+	xfs_dqlock(dq);
+	return error;
+
+out_unlock:
+	mutex_unlock(&xqc->lock);
+out_cancel:
+	xchk_trans_cancel(xqc->sc);
+
+	/* Re-lock the dquot so the caller can put the reference. */
+	xfs_dqlock(dq);
+	return error;
+}
+
+/* Commit new quota counters for a particular quota type. */
+STATIC int
+xqcheck_commit_dqtype(
+	struct xqcheck		*xqc,
+	unsigned int		dqtype)
+{
+	struct xchk_dqiter	cursor = { };
+	struct xqcheck_dquot	xcdq;
+	struct xfs_scrub	*sc = xqc->sc;
+	struct xfs_mount	*mp = sc->mp;
+	struct xfarray		*counts = xqcheck_counters_for(xqc, dqtype);
+	struct xfs_dquot	*dq;
+	xfarray_idx_t		cur = XFARRAY_CURSOR_INIT;
+	int			error;
+
+	/*
+	 * Update the counters of every dquot that the quota file knows about.
+	 */
+	xchk_dqiter_init(&cursor, sc, dqtype);
+	while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) {
+		error = xqcheck_commit_dquot(xqc, dqtype, dq);
+		xfs_qm_dqput(dq);
+		if (error)
+			break;
+	}
+	if (error)
+		return error;
+
+	/*
+	 * Make a second pass to deal with the dquots that we know about but
+	 * the quota file previously did not know about.
+	 */
+	mutex_lock(&xqc->lock);
+	while ((error = xfarray_iter(counts, &cur, &xcdq)) == 1) {
+		xfs_dqid_t	id = cur - 1;
+
+		if (xcdq.flags & XQCHECK_DQUOT_REPAIR_SCANNED)
+			continue;
+
+		mutex_unlock(&xqc->lock);
+
+		/*
+		 * Grab the dquot, allowing for dquot block allocation in a
+		 * separate transaction.  We committed the scrub transaction
+		 * in a previous step, so we will not be creating nested
+		 * transactions here.
+		 */
+		error = xfs_qm_dqget(mp, id, dqtype, true, &dq);
+		if (error)
+			return error;
+
+		error = xqcheck_commit_dquot(xqc, dqtype, dq);
+		xfs_qm_dqput(dq);
+		if (error)
+			return error;
+
+		mutex_lock(&xqc->lock);
+	}
+	mutex_unlock(&xqc->lock);
+
+	return error;
+}
+
+/* Figure out quota CHKD flags for the running quota types. */
+static inline unsigned int
+xqcheck_chkd_flags(
+	struct xfs_mount	*mp)
+{
+	unsigned int		ret = 0;
+
+	if (XFS_IS_UQUOTA_ON(mp))
+		ret |= XFS_UQUOTA_CHKD;
+	if (XFS_IS_GQUOTA_ON(mp))
+		ret |= XFS_GQUOTA_CHKD;
+	if (XFS_IS_PQUOTA_ON(mp))
+		ret |= XFS_PQUOTA_CHKD;
+	return ret;
+}
+
+/* Commit the new dquot counters. */
+int
+xrep_quotacheck(
+	struct xfs_scrub	*sc)
+{
+	struct xqcheck		*xqc = sc->buf;
+	unsigned int		qflags = xqcheck_chkd_flags(sc->mp);
+	int			error;
+
+	/*
+	 * Clear the CHKD flag for the running quota types and commit the scrub
+	 * transaction so that we can allocate new quota block mappings if we
+	 * have to.  If we crash after this point, the sb still has the CHKD
+	 * flags cleared, so mount quotacheck will fix all of this up.
+	 */
+	xrep_update_qflags(sc, qflags, 0);
+	error = xrep_trans_commit(sc);
+	if (error)
+		return error;
+
+	/* Commit the new counters to the dquots. */
+	if (xqc->ucounts) {
+		error = xqcheck_commit_dqtype(xqc, XFS_DQTYPE_USER);
+		if (error)
+			return error;
+	}
+	if (xqc->gcounts) {
+		error = xqcheck_commit_dqtype(xqc, XFS_DQTYPE_GROUP);
+		if (error)
+			return error;
+	}
+	if (xqc->pcounts) {
+		error = xqcheck_commit_dqtype(xqc, XFS_DQTYPE_PROJ);
+		if (error)
+			return error;
+	}
+
+	/* Set the CHKD flags now that we've fixed quota counts. */
+	error = xchk_trans_alloc(sc, 0);
+	if (error)
+		return error;
+
+	xrep_update_qflags(sc, 0, qflags);
+	return xrep_trans_commit(sc);
+}
diff --git a/fs/xfs/scrub/rcbag.c b/fs/xfs/scrub/rcbag.c
new file mode 100644
index 000000000000..e1e52bc20713
--- /dev/null
+++ b/fs/xfs/scrub/rcbag.c
@@ -0,0 +1,307 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_error.h"
+#include "scrub/scrub.h"
+#include "scrub/rcbag_btree.h"
+#include "scrub/rcbag.h"
+#include "scrub/trace.h"
+
+struct rcbag {
+	struct xfs_mount	*mp;
+	struct xfbtree		xfbtree;
+	uint64_t		nr_items;
+};
+
+int
+rcbag_init(
+	struct xfs_mount	*mp,
+	struct xfs_buftarg	*btp,
+	struct rcbag		**bagp)
+{
+	struct rcbag		*bag;
+	int			error;
+
+	bag = kzalloc(sizeof(struct rcbag), XCHK_GFP_FLAGS);
+	if (!bag)
+		return -ENOMEM;
+
+	bag->nr_items = 0;
+	bag->mp = mp;
+
+	error = rcbagbt_mem_init(mp, &bag->xfbtree, btp);
+	if (error)
+		goto out_bag;
+
+	*bagp = bag;
+	return 0;
+
+out_bag:
+	kfree(bag);
+	return error;
+}
+
+void
+rcbag_free(
+	struct rcbag		**bagp)
+{
+	struct rcbag		*bag = *bagp;
+
+	xfbtree_destroy(&bag->xfbtree);
+	kfree(bag);
+	*bagp = NULL;
+}
+
+/* Track an rmap in the refcount bag. */
+int
+rcbag_add(
+	struct rcbag			*bag,
+	struct xfs_trans		*tp,
+	const struct xfs_rmap_irec	*rmap)
+{
+	struct rcbag_rec		bagrec;
+	struct xfs_mount		*mp = bag->mp;
+	struct xfs_btree_cur		*cur;
+	int				has;
+	int				error;
+
+	cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree);
+	error = rcbagbt_lookup_eq(cur, rmap, &has);
+	if (error)
+		goto out_cur;
+
+	if (has) {
+		error = rcbagbt_get_rec(cur, &bagrec, &has);
+		if (error)
+			goto out_cur;
+		if (!has) {
+			error = -EFSCORRUPTED;
+			goto out_cur;
+		}
+
+		bagrec.rbg_refcount++;
+		error = rcbagbt_update(cur, &bagrec);
+		if (error)
+			goto out_cur;
+	} else {
+		bagrec.rbg_startblock = rmap->rm_startblock;
+		bagrec.rbg_blockcount = rmap->rm_blockcount;
+		bagrec.rbg_refcount = 1;
+
+		error = rcbagbt_insert(cur, &bagrec, &has);
+		if (error)
+			goto out_cur;
+		if (!has) {
+			error = -EFSCORRUPTED;
+			goto out_cur;
+		}
+	}
+
+	xfs_btree_del_cursor(cur, 0);
+
+	error = xfbtree_trans_commit(&bag->xfbtree, tp);
+	if (error)
+		return error;
+
+	bag->nr_items++;
+	return 0;
+
+out_cur:
+	xfs_btree_del_cursor(cur, error);
+	xfbtree_trans_cancel(&bag->xfbtree, tp);
+	return error;
+}
+
+/* Return the number of records in the bag. */
+uint64_t
+rcbag_count(
+	const struct rcbag	*rcbag)
+{
+	return rcbag->nr_items;
+}
+
+static inline uint32_t rcbag_rec_next_bno(const struct rcbag_rec *r)
+{
+	return r->rbg_startblock + r->rbg_blockcount;
+}
+
+/*
+ * Find the next block where the refcount changes, given the next rmap we
+ * looked at and the ones we're already tracking.
+ */
+int
+rcbag_next_edge(
+	struct rcbag			*bag,
+	struct xfs_trans		*tp,
+	const struct xfs_rmap_irec	*next_rmap,
+	bool				next_valid,
+	uint32_t			*next_bnop)
+{
+	struct rcbag_rec		bagrec;
+	struct xfs_mount		*mp = bag->mp;
+	struct xfs_btree_cur		*cur;
+	uint32_t			next_bno = NULLAGBLOCK;
+	int				has;
+	int				error;
+
+	if (next_valid)
+		next_bno = next_rmap->rm_startblock;
+
+	cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree);
+	error = xfs_btree_goto_left_edge(cur);
+	if (error)
+		goto out_cur;
+
+	while (true) {
+		error = xfs_btree_increment(cur, 0, &has);
+		if (error)
+			goto out_cur;
+		if (!has)
+			break;
+
+		error = rcbagbt_get_rec(cur, &bagrec, &has);
+		if (error)
+			goto out_cur;
+		if (!has) {
+			error = -EFSCORRUPTED;
+			goto out_cur;
+		}
+
+		next_bno = min(next_bno, rcbag_rec_next_bno(&bagrec));
+	}
+
+	/*
+	 * We should have found /something/ because either next_rrm is the next
+	 * interesting rmap to look at after emitting this refcount extent, or
+	 * there are other rmaps in rmap_bag contributing to the current
+	 * sharing count.  But if something is seriously wrong, bail out.
+	 */
+	if (next_bno == NULLAGBLOCK) {
+		error = -EFSCORRUPTED;
+		goto out_cur;
+	}
+
+	xfs_btree_del_cursor(cur, 0);
+
+	*next_bnop = next_bno;
+	return 0;
+
+out_cur:
+	xfs_btree_del_cursor(cur, error);
+	return error;
+}
+
+/* Pop all refcount bag records that end at next_bno */
+int
+rcbag_remove_ending_at(
+	struct rcbag		*bag,
+	struct xfs_trans	*tp,
+	uint32_t		next_bno)
+{
+	struct rcbag_rec	bagrec;
+	struct xfs_mount	*mp = bag->mp;
+	struct xfs_btree_cur	*cur;
+	int			has;
+	int			error;
+
+	/* go to the right edge of the tree */
+	cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree);
+	memset(&cur->bc_rec, 0xFF, sizeof(cur->bc_rec));
+	error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, &has);
+	if (error)
+		goto out_cur;
+
+	while (true) {
+		error = xfs_btree_decrement(cur, 0, &has);
+		if (error)
+			goto out_cur;
+		if (!has)
+			break;
+
+		error = rcbagbt_get_rec(cur, &bagrec, &has);
+		if (error)
+			goto out_cur;
+		if (!has) {
+			error = -EFSCORRUPTED;
+			goto out_cur;
+		}
+
+		if (rcbag_rec_next_bno(&bagrec) != next_bno)
+			continue;
+
+		error = xfs_btree_delete(cur, &has);
+		if (error)
+			goto out_cur;
+		if (!has) {
+			error = -EFSCORRUPTED;
+			goto out_cur;
+		}
+
+		bag->nr_items -= bagrec.rbg_refcount;
+	}
+
+	xfs_btree_del_cursor(cur, 0);
+	return xfbtree_trans_commit(&bag->xfbtree, tp);
+out_cur:
+	xfs_btree_del_cursor(cur, error);
+	xfbtree_trans_cancel(&bag->xfbtree, tp);
+	return error;
+}
+
+/* Dump the rcbag. */
+void
+rcbag_dump(
+	struct rcbag			*bag,
+	struct xfs_trans		*tp)
+{
+	struct rcbag_rec		bagrec;
+	struct xfs_mount		*mp = bag->mp;
+	struct xfs_btree_cur		*cur;
+	unsigned long long		nr = 0;
+	int				has;
+	int				error;
+
+	cur = rcbagbt_mem_cursor(mp, tp, &bag->xfbtree);
+	error = xfs_btree_goto_left_edge(cur);
+	if (error)
+		goto out_cur;
+
+	while (true) {
+		error = xfs_btree_increment(cur, 0, &has);
+		if (error)
+			goto out_cur;
+		if (!has)
+			break;
+
+		error = rcbagbt_get_rec(cur, &bagrec, &has);
+		if (error)
+			goto out_cur;
+		if (!has) {
+			error = -EFSCORRUPTED;
+			goto out_cur;
+		}
+
+		xfs_err(bag->mp, "[%llu]: bno 0x%x fsbcount 0x%x refcount 0x%llx\n",
+				nr++,
+				(unsigned int)bagrec.rbg_startblock,
+				(unsigned int)bagrec.rbg_blockcount,
+				(unsigned long long)bagrec.rbg_refcount);
+	}
+
+out_cur:
+	xfs_btree_del_cursor(cur, error);
+}
diff --git a/fs/xfs/scrub/rcbag.h b/fs/xfs/scrub/rcbag.h
new file mode 100644
index 000000000000..e29ef788ba72
--- /dev/null
+++ b/fs/xfs/scrub/rcbag.h
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_RCBAG_H__
+#define __XFS_SCRUB_RCBAG_H__
+
+struct xfs_mount;
+struct rcbag;
+struct xfs_buftarg;
+
+int rcbag_init(struct xfs_mount *mp, struct xfs_buftarg *btp,
+		struct rcbag **bagp);
+void rcbag_free(struct rcbag **bagp);
+int rcbag_add(struct rcbag *bag, struct xfs_trans *tp,
+		const struct xfs_rmap_irec *rmap);
+uint64_t rcbag_count(const struct rcbag *bag);
+
+int rcbag_next_edge(struct rcbag *bag, struct xfs_trans *tp,
+		const struct xfs_rmap_irec *next_rmap, bool next_valid,
+		uint32_t *next_bnop);
+int rcbag_remove_ending_at(struct rcbag *bag, struct xfs_trans *tp,
+		uint32_t next_bno);
+
+void rcbag_dump(struct rcbag *bag, struct xfs_trans *tp);
+
+#endif /* __XFS_SCRUB_RCBAG_H__ */
diff --git a/fs/xfs/scrub/rcbag_btree.c b/fs/xfs/scrub/rcbag_btree.c
new file mode 100644
index 000000000000..709356dc6256
--- /dev/null
+++ b/fs/xfs/scrub/rcbag_btree.c
@@ -0,0 +1,370 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_error.h"
+#include "scrub/rcbag_btree.h"
+#include "scrub/trace.h"
+
+static struct kmem_cache	*rcbagbt_cur_cache;
+
+STATIC void
+rcbagbt_init_key_from_rec(
+	union xfs_btree_key		*key,
+	const union xfs_btree_rec	*rec)
+{
+	struct rcbag_key	*bag_key = (struct rcbag_key *)key;
+	const struct rcbag_rec	*bag_rec = (const struct rcbag_rec *)rec;
+
+	BUILD_BUG_ON(sizeof(struct rcbag_key) > sizeof(union xfs_btree_key));
+	BUILD_BUG_ON(sizeof(struct rcbag_rec) > sizeof(union xfs_btree_rec));
+
+	bag_key->rbg_startblock = bag_rec->rbg_startblock;
+	bag_key->rbg_blockcount = bag_rec->rbg_blockcount;
+}
+
+STATIC void
+rcbagbt_init_rec_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*rec)
+{
+	struct rcbag_rec	*bag_rec = (struct rcbag_rec *)rec;
+	struct rcbag_rec	*bag_irec = (struct rcbag_rec *)&cur->bc_rec;
+
+	bag_rec->rbg_startblock = bag_irec->rbg_startblock;
+	bag_rec->rbg_blockcount = bag_irec->rbg_blockcount;
+	bag_rec->rbg_refcount = bag_irec->rbg_refcount;
+}
+
+STATIC int64_t
+rcbagbt_key_diff(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*key)
+{
+	struct rcbag_rec		*rec = (struct rcbag_rec *)&cur->bc_rec;
+	const struct rcbag_key		*kp = (const struct rcbag_key *)key;
+
+	if (kp->rbg_startblock > rec->rbg_startblock)
+		return 1;
+	if (kp->rbg_startblock < rec->rbg_startblock)
+		return -1;
+
+	if (kp->rbg_blockcount > rec->rbg_blockcount)
+		return 1;
+	if (kp->rbg_blockcount < rec->rbg_blockcount)
+		return -1;
+
+	return 0;
+}
+
+STATIC int64_t
+rcbagbt_diff_two_keys(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*k1,
+	const union xfs_btree_key	*k2,
+	const union xfs_btree_key	*mask)
+{
+	const struct rcbag_key		*kp1 = (const struct rcbag_key *)k1;
+	const struct rcbag_key		*kp2 = (const struct rcbag_key *)k2;
+
+	ASSERT(mask == NULL);
+
+	if (kp1->rbg_startblock > kp2->rbg_startblock)
+		return 1;
+	if (kp1->rbg_startblock < kp2->rbg_startblock)
+		return -1;
+
+	if (kp1->rbg_blockcount > kp2->rbg_blockcount)
+		return 1;
+	if (kp1->rbg_blockcount < kp2->rbg_blockcount)
+		return -1;
+
+	return 0;
+}
+
+STATIC int
+rcbagbt_keys_inorder(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_key	*k1,
+	const union xfs_btree_key	*k2)
+{
+	const struct rcbag_key		*kp1 = (const struct rcbag_key *)k1;
+	const struct rcbag_key		*kp2 = (const struct rcbag_key *)k2;
+
+	if (kp1->rbg_startblock > kp2->rbg_startblock)
+		return 0;
+	if (kp1->rbg_startblock < kp2->rbg_startblock)
+		return 1;
+
+	if (kp1->rbg_blockcount > kp2->rbg_blockcount)
+		return 0;
+	if (kp1->rbg_blockcount < kp2->rbg_blockcount)
+		return 1;
+
+	return 0;
+}
+
+STATIC int
+rcbagbt_recs_inorder(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_rec	*r1,
+	const union xfs_btree_rec	*r2)
+{
+	const struct rcbag_rec		*rp1 = (const struct rcbag_rec *)r1;
+	const struct rcbag_rec		*rp2 = (const struct rcbag_rec *)r2;
+
+	if (rp1->rbg_startblock > rp2->rbg_startblock)
+		return 0;
+	if (rp1->rbg_startblock < rp2->rbg_startblock)
+		return 1;
+
+	if (rp1->rbg_blockcount > rp2->rbg_blockcount)
+		return 0;
+	if (rp1->rbg_blockcount < rp2->rbg_blockcount)
+		return 1;
+
+	return 0;
+}
+
+static xfs_failaddr_t
+rcbagbt_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_mount;
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	xfs_failaddr_t		fa;
+	unsigned int		level;
+	unsigned int		maxrecs;
+
+	if (!xfs_verify_magic(bp, block->bb_magic))
+		return __this_address;
+
+	fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
+	if (fa)
+		return fa;
+
+	level = be16_to_cpu(block->bb_level);
+	if (level >= rcbagbt_maxlevels_possible())
+		return __this_address;
+
+	maxrecs = rcbagbt_maxrecs(mp, XFBNO_BLOCKSIZE, level == 0);
+	return xfs_btree_memblock_verify(bp, maxrecs);
+}
+
+static void
+rcbagbt_rw_verify(
+	struct xfs_buf	*bp)
+{
+	xfs_failaddr_t	fa = rcbagbt_verify(bp);
+
+	if (fa)
+		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
+}
+
+/* skip crc checks on in-memory btrees to save time */
+static const struct xfs_buf_ops rcbagbt_mem_buf_ops = {
+	.name			= "rcbagbt_mem",
+	.magic			= { 0, cpu_to_be32(RCBAG_MAGIC) },
+	.verify_read		= rcbagbt_rw_verify,
+	.verify_write		= rcbagbt_rw_verify,
+	.verify_struct		= rcbagbt_verify,
+};
+
+static const struct xfs_btree_ops rcbagbt_mem_ops = {
+	.name			= "rcbag",
+	.type			= XFS_BTREE_TYPE_MEM,
+
+	.rec_len		= sizeof(struct rcbag_rec),
+	.key_len		= sizeof(struct rcbag_key),
+	.ptr_len		= XFS_BTREE_LONG_PTR_LEN,
+
+	.lru_refs		= 1,
+	.statoff		= XFS_STATS_CALC_INDEX(xs_rcbag_2),
+
+	.dup_cursor		= xfbtree_dup_cursor,
+	.set_root		= xfbtree_set_root,
+	.alloc_block		= xfbtree_alloc_block,
+	.free_block		= xfbtree_free_block,
+	.get_minrecs		= xfbtree_get_minrecs,
+	.get_maxrecs		= xfbtree_get_maxrecs,
+	.init_key_from_rec	= rcbagbt_init_key_from_rec,
+	.init_rec_from_cur	= rcbagbt_init_rec_from_cur,
+	.init_ptr_from_cur	= xfbtree_init_ptr_from_cur,
+	.key_diff		= rcbagbt_key_diff,
+	.buf_ops		= &rcbagbt_mem_buf_ops,
+	.diff_two_keys		= rcbagbt_diff_two_keys,
+	.keys_inorder		= rcbagbt_keys_inorder,
+	.recs_inorder		= rcbagbt_recs_inorder,
+};
+
+/* Create a cursor for an in-memory btree. */
+struct xfs_btree_cur *
+rcbagbt_mem_cursor(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfbtree		*xfbtree)
+{
+	struct xfs_btree_cur	*cur;
+
+	cur = xfs_btree_alloc_cursor(mp, tp, &rcbagbt_mem_ops,
+			rcbagbt_maxlevels_possible(), rcbagbt_cur_cache);
+
+	cur->bc_mem.xfbtree = xfbtree;
+	cur->bc_nlevels = xfbtree->nlevels;
+	return cur;
+}
+
+/* Create an in-memory refcount bag btree. */
+int
+rcbagbt_mem_init(
+	struct xfs_mount	*mp,
+	struct xfbtree		*xfbt,
+	struct xfs_buftarg	*btp)
+{
+	xfbt->owner = 0;
+	return xfbtree_init(mp, xfbt, btp, &rcbagbt_mem_ops);
+}
+
+/* Calculate number of records in a refcount bag btree block. */
+static inline unsigned int
+rcbagbt_block_maxrecs(
+	unsigned int		blocklen,
+	bool			leaf)
+{
+	if (leaf)
+		return blocklen / sizeof(struct rcbag_rec);
+	return blocklen /
+		(sizeof(struct rcbag_key) + sizeof(rcbag_ptr_t));
+}
+
+/*
+ * Calculate number of records in an refcount bag btree block.
+ */
+unsigned int
+rcbagbt_maxrecs(
+	struct xfs_mount	*mp,
+	unsigned int		blocklen,
+	bool			leaf)
+{
+	blocklen -= RCBAG_BLOCK_LEN;
+	return rcbagbt_block_maxrecs(blocklen, leaf);
+}
+
+/* Compute the max possible height for refcount bag btrees. */
+unsigned int
+rcbagbt_maxlevels_possible(void)
+{
+	unsigned int		minrecs[2];
+	unsigned int		blocklen;
+
+	blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
+
+	minrecs[0] = rcbagbt_block_maxrecs(blocklen, true) / 2;
+	minrecs[1] = rcbagbt_block_maxrecs(blocklen, false) / 2;
+
+	return xfs_btree_space_to_height(minrecs, ULLONG_MAX);
+}
+
+/* Calculate the refcount bag btree size for some records. */
+unsigned long long
+rcbagbt_calc_size(
+	unsigned long long	nr_records)
+{
+	unsigned int		minrecs[2];
+	unsigned int		blocklen;
+
+	blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
+
+	minrecs[0] = rcbagbt_block_maxrecs(blocklen, true) / 2;
+	minrecs[1] = rcbagbt_block_maxrecs(blocklen, false) / 2;
+
+	return xfs_btree_calc_size(minrecs, nr_records);
+}
+
+int __init
+rcbagbt_init_cur_cache(void)
+{
+	rcbagbt_cur_cache = kmem_cache_create("xfs_rcbagbt_cur",
+			xfs_btree_cur_sizeof(rcbagbt_maxlevels_possible()),
+			0, 0, NULL);
+
+	if (!rcbagbt_cur_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void
+rcbagbt_destroy_cur_cache(void)
+{
+	kmem_cache_destroy(rcbagbt_cur_cache);
+	rcbagbt_cur_cache = NULL;
+}
+
+/* Look up the refcount bag record corresponding to this reverse mapping. */
+int
+rcbagbt_lookup_eq(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rmap,
+	int				*success)
+{
+	struct rcbag_rec		*rec = (struct rcbag_rec *)&cur->bc_rec;
+
+	rec->rbg_startblock = rmap->rm_startblock;
+	rec->rbg_blockcount = rmap->rm_blockcount;
+
+	return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, success);
+}
+
+/* Get the data from the pointed-to record. */
+int
+rcbagbt_get_rec(
+	struct xfs_btree_cur	*cur,
+	struct rcbag_rec	*rec,
+	int			*has)
+{
+	union xfs_btree_rec	*btrec;
+	int			error;
+
+	error = xfs_btree_get_rec(cur, &btrec, has);
+	if (error || !(*has))
+		return error;
+
+	memcpy(rec, btrec, sizeof(struct rcbag_rec));
+	return 0;
+}
+
+/* Update the record referred to by cur to the value given. */
+int
+rcbagbt_update(
+	struct xfs_btree_cur	*cur,
+	const struct rcbag_rec	*rec)
+{
+	union xfs_btree_rec	btrec;
+
+	memcpy(&btrec, rec, sizeof(struct rcbag_rec));
+	return xfs_btree_update(cur, &btrec);
+}
+
+/* Update the record referred to by cur to the value given. */
+int
+rcbagbt_insert(
+	struct xfs_btree_cur	*cur,
+	const struct rcbag_rec	*rec,
+	int			*success)
+{
+	struct rcbag_rec	*btrec = (struct rcbag_rec *)&cur->bc_rec;
+
+	memcpy(btrec, rec, sizeof(struct rcbag_rec));
+	return xfs_btree_insert(cur, success);
+}
diff --git a/fs/xfs/scrub/rcbag_btree.h b/fs/xfs/scrub/rcbag_btree.h
new file mode 100644
index 000000000000..03cadb032552
--- /dev/null
+++ b/fs/xfs/scrub/rcbag_btree.h
@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_RCBAG_BTREE_H__
+#define __XFS_SCRUB_RCBAG_BTREE_H__
+
+#ifdef CONFIG_XFS_BTREE_IN_MEM
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+
+#define RCBAG_MAGIC	0x74826671	/* 'JRBG' */
+
+struct rcbag_key {
+	uint32_t	rbg_startblock;
+	uint32_t	rbg_blockcount;
+};
+
+struct rcbag_rec {
+	uint32_t	rbg_startblock;
+	uint32_t	rbg_blockcount;
+	uint64_t	rbg_refcount;
+};
+
+typedef __be64 rcbag_ptr_t;
+
+/* reflinks only exist on crc enabled filesystems */
+#define RCBAG_BLOCK_LEN	XFS_BTREE_LBLOCK_CRC_LEN
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define RCBAG_REC_ADDR(block, index) \
+	((struct rcbag_rec *) \
+		((char *)(block) + RCBAG_BLOCK_LEN + \
+		 (((index) - 1) * sizeof(struct rcbag_rec))))
+
+#define RCBAG_KEY_ADDR(block, index) \
+	((struct rcbag_key *) \
+		((char *)(block) + RCBAG_BLOCK_LEN + \
+		 ((index) - 1) * sizeof(struct rcbag_key)))
+
+#define RCBAG_PTR_ADDR(block, index, maxrecs) \
+	((rcbag_ptr_t *) \
+		((char *)(block) + RCBAG_BLOCK_LEN + \
+		 (maxrecs) * sizeof(struct rcbag_key) + \
+		 ((index) - 1) * sizeof(rcbag_ptr_t)))
+
+unsigned int rcbagbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen,
+		bool leaf);
+
+unsigned long long rcbagbt_calc_size(unsigned long long nr_records);
+
+unsigned int rcbagbt_maxlevels_possible(void);
+
+int __init rcbagbt_init_cur_cache(void);
+void rcbagbt_destroy_cur_cache(void);
+
+struct xfs_btree_cur *rcbagbt_mem_cursor(struct xfs_mount *mp,
+		struct xfs_trans *tp, struct xfbtree *xfbtree);
+int rcbagbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree,
+		struct xfs_buftarg *btp);
+
+int rcbagbt_lookup_eq(struct xfs_btree_cur *cur,
+		const struct xfs_rmap_irec *rmap, int *success);
+int rcbagbt_get_rec(struct xfs_btree_cur *cur, struct rcbag_rec *rec, int *has);
+int rcbagbt_update(struct xfs_btree_cur *cur, const struct rcbag_rec *rec);
+int rcbagbt_insert(struct xfs_btree_cur *cur, const struct rcbag_rec *rec,
+		int *success);
+
+#else
+# define rcbagbt_init_cur_cache()		0
+# define rcbagbt_destroy_cur_cache()		((void)0)
+#endif /* CONFIG_XFS_BTREE_IN_MEM */
+
+#endif /* __XFS_SCRUB_RCBAG_BTREE_H__ */
diff --git a/fs/xfs/scrub/readdir.c b/fs/xfs/scrub/readdir.c
index 16462332c897..dfdcb96b6c16 100644
--- a/fs/xfs/scrub/readdir.c
+++ b/fs/xfs/scrub/readdir.c
@@ -281,7 +281,7 @@ xchk_dir_walk(
 		return -EIO;
 
 	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
-	ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
 
 	if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL)
 		return xchk_dir_walk_sf(sc, dp, dirent_fn, priv);
@@ -332,7 +332,7 @@ xchk_dir_lookup(
 		return -EIO;
 
 	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
-	ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
 
 	if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
 		error = xfs_dir2_sf_lookup(&args);
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
index f99eca799809..0252a3b5b65a 100644
--- a/fs/xfs/scrub/reap.c
+++ b/fs/xfs/scrub/reap.c
@@ -114,7 +114,7 @@ xreap_put_freelist(
 	int			error;
 
 	/* Make sure there's space on the freelist. */
-	error = xrep_fix_freelist(sc, true);
+	error = xrep_fix_freelist(sc, 0);
 	if (error)
 		return error;
 
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index bf22f245bbfa..d0c7d4a29c0f 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -7,8 +7,10 @@
 #include "xfs_fs.h"
 #include "xfs_shared.h"
 #include "xfs_format.h"
+#include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
 #include "xfs_mount.h"
+#include "xfs_trans.h"
 #include "xfs_ag.h"
 #include "xfs_btree.h"
 #include "xfs_rmap.h"
@@ -17,6 +19,7 @@
 #include "scrub/common.h"
 #include "scrub/btree.h"
 #include "scrub/trace.h"
+#include "scrub/repair.h"
 
 /*
  * Set us up to scrub reference count btrees.
@@ -27,6 +30,15 @@ xchk_setup_ag_refcountbt(
 {
 	if (xchk_need_intent_drain(sc))
 		xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
+
+	if (xchk_could_repair(sc)) {
+		int		error;
+
+		error = xrep_setup_ag_refcountbt(sc);
+		if (error)
+			return error;
+	}
+
 	return xchk_setup_ag_btree(sc, false);
 }
 
diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c
index f38fccc42a20..a00d7ce7ae5b 100644
--- a/fs/xfs/scrub/refcount_repair.c
+++ b/fs/xfs/scrub/refcount_repair.c
@@ -25,6 +25,7 @@
 #include "xfs_refcount_btree.h"
 #include "xfs_error.h"
 #include "xfs_ag.h"
+#include "xfs_health.h"
 #include "scrub/xfs_scrub.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
@@ -37,6 +38,7 @@
 #include "scrub/xfarray.h"
 #include "scrub/newbt.h"
 #include "scrub/reap.h"
+#include "scrub/rcbag.h"
 
 /*
  * Rebuilding the Reference Count Btree
@@ -97,12 +99,6 @@
  * insert all the records.
  */
 
-/* The only parts of the rmap that we care about for computing refcounts. */
-struct xrep_refc_rmap {
-	xfs_agblock_t		startblock;
-	xfs_extlen_t		blockcount;
-} __packed;
-
 struct xrep_refc {
 	/* refcount extents */
 	struct xfarray		*refcount_records;
@@ -122,6 +118,20 @@ struct xrep_refc {
 	xfs_extlen_t		btblocks;
 };
 
+/* Set us up to repair refcount btrees. */
+int
+xrep_setup_ag_refcountbt(
+	struct xfs_scrub	*sc)
+{
+	char			*descr;
+	int			error;
+
+	descr = xchk_xfile_ag_descr(sc, "rmap record bag");
+	error = xrep_setup_xfbtree(sc, descr);
+	kfree(descr);
+	return error;
+}
+
 /* Check for any obvious conflicts with this shared/CoW staging extent. */
 STATIC int
 xrep_refc_check_ext(
@@ -223,10 +233,9 @@ xrep_refc_rmap_shareable(
 STATIC int
 xrep_refc_walk_rmaps(
 	struct xrep_refc	*rr,
-	struct xrep_refc_rmap	*rrm,
+	struct xfs_rmap_irec	*rmap,
 	bool			*have_rec)
 {
-	struct xfs_rmap_irec	rmap;
 	struct xfs_btree_cur	*cur = rr->sc->sa.rmap_cur;
 	struct xfs_mount	*mp = cur->bc_mp;
 	int			have_gt;
@@ -250,29 +259,30 @@ xrep_refc_walk_rmaps(
 		if (!have_gt)
 			return 0;
 
-		error = xfs_rmap_get_rec(cur, &rmap, &have_gt);
+		error = xfs_rmap_get_rec(cur, rmap, &have_gt);
 		if (error)
 			return error;
-		if (XFS_IS_CORRUPT(mp, !have_gt))
+		if (XFS_IS_CORRUPT(mp, !have_gt)) {
+			xfs_btree_mark_sick(cur);
 			return -EFSCORRUPTED;
+		}
 
-		if (rmap.rm_owner == XFS_RMAP_OWN_COW) {
-			error = xrep_refc_stash_cow(rr, rmap.rm_startblock,
-					rmap.rm_blockcount);
+		if (rmap->rm_owner == XFS_RMAP_OWN_COW) {
+			error = xrep_refc_stash_cow(rr, rmap->rm_startblock,
+					rmap->rm_blockcount);
 			if (error)
 				return error;
-		} else if (rmap.rm_owner == XFS_RMAP_OWN_REFC) {
+		} else if (rmap->rm_owner == XFS_RMAP_OWN_REFC) {
 			/* refcountbt block, dump it when we're done. */
-			rr->btblocks += rmap.rm_blockcount;
+			rr->btblocks += rmap->rm_blockcount;
 			error = xagb_bitmap_set(&rr->old_refcountbt_blocks,
-					rmap.rm_startblock, rmap.rm_blockcount);
+					rmap->rm_startblock,
+					rmap->rm_blockcount);
 			if (error)
 				return error;
 		}
-	} while (!xrep_refc_rmap_shareable(mp, &rmap));
+	} while (!xrep_refc_rmap_shareable(mp, rmap));
 
-	rrm->startblock = rmap.rm_startblock;
-	rrm->blockcount = rmap.rm_blockcount;
 	*have_rec = true;
 	return 0;
 }
@@ -354,45 +364,6 @@ xrep_refc_sort_records(
 	return error;
 }
 
-#define RRM_NEXT(r)	((r).startblock + (r).blockcount)
-/*
- * Find the next block where the refcount changes, given the next rmap we
- * looked at and the ones we're already tracking.
- */
-static inline int
-xrep_refc_next_edge(
-	struct xfarray		*rmap_bag,
-	struct xrep_refc_rmap	*next_rrm,
-	bool			next_valid,
-	xfs_agblock_t		*nbnop)
-{
-	struct xrep_refc_rmap	rrm;
-	xfarray_idx_t		array_cur = XFARRAY_CURSOR_INIT;
-	xfs_agblock_t		nbno = NULLAGBLOCK;
-	int			error;
-
-	if (next_valid)
-		nbno = next_rrm->startblock;
-
-	while ((error = xfarray_iter(rmap_bag, &array_cur, &rrm)) == 1)
-		nbno = min_t(xfs_agblock_t, nbno, RRM_NEXT(rrm));
-
-	if (error)
-		return error;
-
-	/*
-	 * We should have found /something/ because either next_rrm is the next
-	 * interesting rmap to look at after emitting this refcount extent, or
-	 * there are other rmaps in rmap_bag contributing to the current
-	 * sharing count.  But if something is seriously wrong, bail out.
-	 */
-	if (nbno == NULLAGBLOCK)
-		return -EFSCORRUPTED;
-
-	*nbnop = nbno;
-	return 0;
-}
-
 /*
  * Walk forward through the rmap btree to collect all rmaps starting at
  * @bno in @rmap_bag.  These represent the file(s) that share ownership of
@@ -402,22 +373,21 @@ xrep_refc_next_edge(
 static int
 xrep_refc_push_rmaps_at(
 	struct xrep_refc	*rr,
-	struct xfarray		*rmap_bag,
+	struct rcbag		*rcstack,
 	xfs_agblock_t		bno,
-	struct xrep_refc_rmap	*rrm,
-	bool			*have,
-	uint64_t		*stack_sz)
+	struct xfs_rmap_irec	*rmap,
+	bool			*have)
 {
 	struct xfs_scrub	*sc = rr->sc;
 	int			have_gt;
 	int			error;
 
-	while (*have && rrm->startblock == bno) {
-		error = xfarray_store_anywhere(rmap_bag, rrm);
+	while (*have && rmap->rm_startblock == bno) {
+		error = rcbag_add(rcstack, rr->sc->tp, rmap);
 		if (error)
 			return error;
-		(*stack_sz)++;
-		error = xrep_refc_walk_rmaps(rr, rrm, have);
+
+		error = xrep_refc_walk_rmaps(rr, rmap, have);
 		if (error)
 			return error;
 	}
@@ -425,8 +395,10 @@ xrep_refc_push_rmaps_at(
 	error = xfs_btree_decrement(sc->sa.rmap_cur, 0, &have_gt);
 	if (error)
 		return error;
-	if (XFS_IS_CORRUPT(sc->mp, !have_gt))
+	if (XFS_IS_CORRUPT(sc->mp, !have_gt)) {
+		xfs_btree_mark_sick(sc->sa.rmap_cur);
 		return -EFSCORRUPTED;
+	}
 
 	return 0;
 }
@@ -436,12 +408,9 @@ STATIC int
 xrep_refc_find_refcounts(
 	struct xrep_refc	*rr)
 {
-	struct xrep_refc_rmap	rrm;
 	struct xfs_scrub	*sc = rr->sc;
-	struct xfarray		*rmap_bag;
-	char			*descr;
-	uint64_t		old_stack_sz;
-	uint64_t		stack_sz = 0;
+	struct rcbag		*rcstack;
+	uint64_t		old_stack_height;
 	xfs_agblock_t		sbno;
 	xfs_agblock_t		cbno;
 	xfs_agblock_t		nbno;
@@ -451,14 +420,11 @@ xrep_refc_find_refcounts(
 	xrep_ag_btcur_init(sc, &sc->sa);
 
 	/*
-	 * Set up a sparse array to store all the rmap records that we're
-	 * tracking to generate a reference count record.  If this exceeds
+	 * Set up a bag to store all the rmap records that we're tracking to
+	 * generate a reference count record.  If the size of the bag exceeds
 	 * MAXREFCOUNT, we clamp rc_refcount.
 	 */
-	descr = xchk_xfile_ag_descr(sc, "rmap record bag");
-	error = xfarray_create(descr, 0, sizeof(struct xrep_refc_rmap),
-			&rmap_bag);
-	kfree(descr);
+	error = rcbag_init(sc->mp, sc->xmbtp, &rcstack);
 	if (error)
 		goto out_cur;
 
@@ -469,62 +435,54 @@ xrep_refc_find_refcounts(
 
 	/* Process reverse mappings into refcount data. */
 	while (xfs_btree_has_more_records(sc->sa.rmap_cur)) {
+		struct xfs_rmap_irec	rmap;
+
 		/* Push all rmaps with pblk == sbno onto the stack */
-		error = xrep_refc_walk_rmaps(rr, &rrm, &have);
+		error = xrep_refc_walk_rmaps(rr, &rmap, &have);
 		if (error)
 			goto out_bag;
 		if (!have)
 			break;
-		sbno = cbno = rrm.startblock;
-		error = xrep_refc_push_rmaps_at(rr, rmap_bag, sbno,
-					&rrm, &have, &stack_sz);
+		sbno = cbno = rmap.rm_startblock;
+		error = xrep_refc_push_rmaps_at(rr, rcstack, sbno, &rmap,
+				&have);
 		if (error)
 			goto out_bag;
 
 		/* Set nbno to the bno of the next refcount change */
-		error = xrep_refc_next_edge(rmap_bag, &rrm, have, &nbno);
+		error = rcbag_next_edge(rcstack, sc->tp, &rmap, have, &nbno);
 		if (error)
 			goto out_bag;
 
 		ASSERT(nbno > sbno);
-		old_stack_sz = stack_sz;
+		old_stack_height = rcbag_count(rcstack);
 
 		/* While stack isn't empty... */
-		while (stack_sz) {
-			xfarray_idx_t	array_cur = XFARRAY_CURSOR_INIT;
-
+		while (rcbag_count(rcstack) > 0) {
 			/* Pop all rmaps that end at nbno */
-			while ((error = xfarray_iter(rmap_bag, &array_cur,
-								&rrm)) == 1) {
-				if (RRM_NEXT(rrm) != nbno)
-					continue;
-				error = xfarray_unset(rmap_bag, array_cur - 1);
-				if (error)
-					goto out_bag;
-				stack_sz--;
-			}
+			error = rcbag_remove_ending_at(rcstack, sc->tp, nbno);
 			if (error)
 				goto out_bag;
 
 			/* Push array items that start at nbno */
-			error = xrep_refc_walk_rmaps(rr, &rrm, &have);
+			error = xrep_refc_walk_rmaps(rr, &rmap, &have);
 			if (error)
 				goto out_bag;
 			if (have) {
-				error = xrep_refc_push_rmaps_at(rr, rmap_bag,
-						nbno, &rrm, &have, &stack_sz);
+				error = xrep_refc_push_rmaps_at(rr, rcstack,
+						nbno, &rmap, &have);
 				if (error)
 					goto out_bag;
 			}
 
 			/* Emit refcount if necessary */
 			ASSERT(nbno > cbno);
-			if (stack_sz != old_stack_sz) {
-				if (old_stack_sz > 1) {
+			if (rcbag_count(rcstack) != old_stack_height) {
+				if (old_stack_height > 1) {
 					error = xrep_refc_stash(rr,
 							XFS_REFC_DOMAIN_SHARED,
 							cbno, nbno - cbno,
-							old_stack_sz);
+							old_stack_height);
 					if (error)
 						goto out_bag;
 				}
@@ -532,13 +490,13 @@ xrep_refc_find_refcounts(
 			}
 
 			/* Stack empty, go find the next rmap */
-			if (stack_sz == 0)
+			if (rcbag_count(rcstack) == 0)
 				break;
-			old_stack_sz = stack_sz;
+			old_stack_height = rcbag_count(rcstack);
 			sbno = nbno;
 
 			/* Set nbno to the bno of the next refcount change */
-			error = xrep_refc_next_edge(rmap_bag, &rrm, have,
+			error = rcbag_next_edge(rcstack, sc->tp, &rmap, have,
 					&nbno);
 			if (error)
 				goto out_bag;
@@ -547,14 +505,13 @@ xrep_refc_find_refcounts(
 		}
 	}
 
-	ASSERT(stack_sz == 0);
+	ASSERT(rcbag_count(rcstack) == 0);
 out_bag:
-	xfarray_destroy(rmap_bag);
+	rcbag_free(&rcstack);
 out_cur:
 	xchk_ag_btcur_free(&sc->sa);
 	return error;
 }
-#undef RRM_NEXT
 
 /* Retrieve refcountbt data for bulk load. */
 STATIC int
@@ -653,8 +610,8 @@ xrep_refc_build_new_tree(
 	rr->new_btree.bload.claim_block = xrep_refc_claim_block;
 
 	/* Compute how many blocks we'll need. */
-	refc_cur = xfs_refcountbt_stage_cursor(sc->mp, &rr->new_btree.afake,
-			pag);
+	refc_cur = xfs_refcountbt_init_cursor(sc->mp, NULL, NULL, pag);
+	xfs_btree_stage_afakeroot(refc_cur, &rr->new_btree.afake);
 	error = xfs_btree_bload_compute_geometry(refc_cur,
 			&rr->new_btree.bload,
 			xfarray_length(rr->refcount_records));
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 745d5b8f405a..f43dce771cdd 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -30,12 +30,15 @@
 #include "xfs_errortag.h"
 #include "xfs_error.h"
 #include "xfs_reflink.h"
+#include "xfs_health.h"
+#include "xfs_buf_mem.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
 #include "scrub/repair.h"
 #include "scrub/bitmap.h"
 #include "scrub/stats.h"
+#include "scrub/xfile.h"
 
 /*
  * Attempt to repair some metadata, if the metadata is corrupt and userspace
@@ -400,7 +403,7 @@ xrep_calc_ag_resblks(
 int
 xrep_fix_freelist(
 	struct xfs_scrub	*sc,
-	bool			can_shrink)
+	int			alloc_flags)
 {
 	struct xfs_alloc_arg	args = {0};
 
@@ -410,8 +413,7 @@ xrep_fix_freelist(
 	args.alignment = 1;
 	args.pag = sc->sa.pag;
 
-	return xfs_alloc_fix_freelist(&args,
-			can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK);
+	return xfs_alloc_fix_freelist(&args, alloc_flags);
 }
 
 /*
@@ -687,6 +689,44 @@ xrep_find_ag_btree_roots(
 }
 
 #ifdef CONFIG_XFS_QUOTA
+/* Update some quota flags in the superblock. */
+void
+xrep_update_qflags(
+	struct xfs_scrub	*sc,
+	unsigned int		clear_flags,
+	unsigned int		set_flags)
+{
+	struct xfs_mount	*mp = sc->mp;
+	struct xfs_buf		*bp;
+
+	mutex_lock(&mp->m_quotainfo->qi_quotaofflock);
+	if ((mp->m_qflags & clear_flags) == 0 &&
+	    (mp->m_qflags & set_flags) == set_flags)
+		goto no_update;
+
+	mp->m_qflags &= ~clear_flags;
+	mp->m_qflags |= set_flags;
+
+	spin_lock(&mp->m_sb_lock);
+	mp->m_sb.sb_qflags &= ~clear_flags;
+	mp->m_sb.sb_qflags |= set_flags;
+	spin_unlock(&mp->m_sb_lock);
+
+	/*
+	 * Update the quota flags in the ondisk superblock without touching
+	 * the summary counters.  We have not quiesced inode chunk allocation,
+	 * so we cannot coordinate with updates to the icount and ifree percpu
+	 * counters.
+	 */
+	bp = xfs_trans_getsb(sc->tp);
+	xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
+	xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
+	xfs_trans_log_buf(sc->tp, bp, 0, sizeof(struct xfs_dsb) - 1);
+
+no_update:
+	mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
+}
+
 /* Force a quotacheck the next time we mount. */
 void
 xrep_force_quotacheck(
@@ -699,13 +739,7 @@ xrep_force_quotacheck(
 	if (!(flag & sc->mp->m_qflags))
 		return;
 
-	mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock);
-	sc->mp->m_qflags &= ~flag;
-	spin_lock(&sc->mp->m_sb_lock);
-	sc->mp->m_sb.sb_qflags &= ~flag;
-	spin_unlock(&sc->mp->m_sb_lock);
-	xfs_log_sb(sc->tp);
-	mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
+	xrep_update_qflags(sc, flag, 0);
 }
 
 /*
@@ -799,20 +833,20 @@ xrep_ag_btcur_init(
 	/* Set up a bnobt cursor for cross-referencing. */
 	if (sc->sm->sm_type != XFS_SCRUB_TYPE_BNOBT &&
 	    sc->sm->sm_type != XFS_SCRUB_TYPE_CNTBT) {
-		sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
-				sc->sa.pag, XFS_BTNUM_BNO);
-		sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
-				sc->sa.pag, XFS_BTNUM_CNT);
+		sa->bno_cur = xfs_bnobt_init_cursor(mp, sc->tp, sa->agf_bp,
+				sc->sa.pag);
+		sa->cnt_cur = xfs_cntbt_init_cursor(mp, sc->tp, sa->agf_bp,
+				sc->sa.pag);
 	}
 
 	/* Set up a inobt cursor for cross-referencing. */
 	if (sc->sm->sm_type != XFS_SCRUB_TYPE_INOBT &&
 	    sc->sm->sm_type != XFS_SCRUB_TYPE_FINOBT) {
 		sa->ino_cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp,
-				sa->agi_bp, XFS_BTNUM_INO);
+				sa->agi_bp);
 		if (xfs_has_finobt(mp))
-			sa->fino_cur = xfs_inobt_init_cursor(sc->sa.pag,
-					sc->tp, sa->agi_bp, XFS_BTNUM_FINO);
+			sa->fino_cur = xfs_finobt_init_cursor(sc->sa.pag,
+					sc->tp, sa->agi_bp);
 	}
 
 	/* Set up a rmapbt cursor for cross-referencing. */
@@ -1115,3 +1149,55 @@ xrep_metadata_inode_forks(
 
 	return 0;
 }
+
+/*
+ * Set up an in-memory buffer cache so that we can use the xfbtree.  Allocating
+ * a shmem file might take loks, so we cannot be in transaction context.  Park
+ * our resources in the scrub context and let the teardown function take care
+ * of them at the right time.
+ */
+int
+xrep_setup_xfbtree(
+	struct xfs_scrub	*sc,
+	const char		*descr)
+{
+	ASSERT(sc->tp == NULL);
+
+	return xmbuf_alloc(sc->mp, descr, &sc->xmbtp);
+}
+
+/*
+ * Create a dummy transaction for use in a live update hook function.  This
+ * function MUST NOT be called from regular repair code because the current
+ * process' transaction is saved via the cookie.
+ */
+int
+xrep_trans_alloc_hook_dummy(
+	struct xfs_mount	*mp,
+	void			**cookiep,
+	struct xfs_trans	**tpp)
+{
+	int			error;
+
+	*cookiep = current->journal_info;
+	current->journal_info = NULL;
+
+	error = xfs_trans_alloc_empty(mp, tpp);
+	if (!error)
+		return 0;
+
+	current->journal_info = *cookiep;
+	*cookiep = NULL;
+	return error;
+}
+
+/* Cancel a dummy transaction used by a live update hook function. */
+void
+xrep_trans_cancel_hook_dummy(
+	void			**cookiep,
+	struct xfs_trans	*tp)
+{
+	xfs_trans_cancel(tp);
+	current->journal_info = *cookiep;
+	*cookiep = NULL;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 17114327e6fa..ce082d941459 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -51,7 +51,7 @@ struct xbitmap;
 struct xagb_bitmap;
 struct xfsb_bitmap;
 
-int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink);
+int xrep_fix_freelist(struct xfs_scrub *sc, int alloc_flags);
 
 struct xrep_find_ag_btree {
 	/* in: rmap owner of the btree we're looking for */
@@ -72,6 +72,8 @@ int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp,
 		struct xrep_find_ag_btree *btree_info, struct xfs_buf *agfl_bp);
 
 #ifdef CONFIG_XFS_QUOTA
+void xrep_update_qflags(struct xfs_scrub *sc, unsigned int clear_flags,
+		unsigned int set_flags);
 void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type);
 int xrep_ino_dqattach(struct xfs_scrub *sc);
 #else
@@ -79,11 +81,15 @@ int xrep_ino_dqattach(struct xfs_scrub *sc);
 # define xrep_ino_dqattach(sc)			(0)
 #endif /* CONFIG_XFS_QUOTA */
 
+int xrep_setup_xfbtree(struct xfs_scrub *sc, const char *descr);
+
 int xrep_ino_ensure_extent_count(struct xfs_scrub *sc, int whichfork,
 		xfs_extnum_t nextents);
 int xrep_reset_perag_resv(struct xfs_scrub *sc);
 int xrep_bmap(struct xfs_scrub *sc, int whichfork, bool allow_unwritten);
 int xrep_metadata_inode_forks(struct xfs_scrub *sc);
+int xrep_setup_ag_rmapbt(struct xfs_scrub *sc);
+int xrep_setup_ag_refcountbt(struct xfs_scrub *sc);
 
 /* Repair setup functions */
 int xrep_setup_ag_allocbt(struct xfs_scrub *sc);
@@ -109,11 +115,14 @@ int xrep_agfl(struct xfs_scrub *sc);
 int xrep_agi(struct xfs_scrub *sc);
 int xrep_allocbt(struct xfs_scrub *sc);
 int xrep_iallocbt(struct xfs_scrub *sc);
+int xrep_rmapbt(struct xfs_scrub *sc);
 int xrep_refcountbt(struct xfs_scrub *sc);
 int xrep_inode(struct xfs_scrub *sc);
 int xrep_bmap_data(struct xfs_scrub *sc);
 int xrep_bmap_attr(struct xfs_scrub *sc);
 int xrep_bmap_cow(struct xfs_scrub *sc);
+int xrep_nlinks(struct xfs_scrub *sc);
+int xrep_fscounters(struct xfs_scrub *sc);
 
 #ifdef CONFIG_XFS_RT
 int xrep_rtbitmap(struct xfs_scrub *sc);
@@ -123,13 +132,19 @@ int xrep_rtbitmap(struct xfs_scrub *sc);
 
 #ifdef CONFIG_XFS_QUOTA
 int xrep_quota(struct xfs_scrub *sc);
+int xrep_quotacheck(struct xfs_scrub *sc);
 #else
 # define xrep_quota			xrep_notsupported
+# define xrep_quotacheck		xrep_notsupported
 #endif /* CONFIG_XFS_QUOTA */
 
 int xrep_reinit_pagf(struct xfs_scrub *sc);
 int xrep_reinit_pagi(struct xfs_scrub *sc);
 
+int xrep_trans_alloc_hook_dummy(struct xfs_mount *mp, void **cookiep,
+		struct xfs_trans **tpp);
+void xrep_trans_cancel_hook_dummy(void **cookiep, struct xfs_trans *tp);
+
 #else
 
 #define xrep_ino_dqattach(sc)	(0)
@@ -171,6 +186,8 @@ xrep_setup_nothing(
 	return 0;
 }
 #define xrep_setup_ag_allocbt		xrep_setup_nothing
+#define xrep_setup_ag_rmapbt		xrep_setup_nothing
+#define xrep_setup_ag_refcountbt	xrep_setup_nothing
 
 #define xrep_setup_inode(sc, imap)	((void)0)
 
@@ -184,6 +201,7 @@ xrep_setup_nothing(
 #define xrep_agi			xrep_notsupported
 #define xrep_allocbt			xrep_notsupported
 #define xrep_iallocbt			xrep_notsupported
+#define xrep_rmapbt			xrep_notsupported
 #define xrep_refcountbt			xrep_notsupported
 #define xrep_inode			xrep_notsupported
 #define xrep_bmap_data			xrep_notsupported
@@ -191,6 +209,9 @@ xrep_setup_nothing(
 #define xrep_bmap_cow			xrep_notsupported
 #define xrep_rtbitmap			xrep_notsupported
 #define xrep_quota			xrep_notsupported
+#define xrep_quotacheck			xrep_notsupported
+#define xrep_nlinks			xrep_notsupported
+#define xrep_fscounters			xrep_notsupported
 
 #endif /* CONFIG_XFS_ONLINE_REPAIR */
 
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
index c99d1714f283..ba5bbc3fb754 100644
--- a/fs/xfs/scrub/rmap.c
+++ b/fs/xfs/scrub/rmap.c
@@ -25,6 +25,7 @@
 #include "scrub/btree.h"
 #include "scrub/bitmap.h"
 #include "scrub/agb_bitmap.h"
+#include "scrub/repair.h"
 
 /*
  * Set us up to scrub reverse mapping btrees.
@@ -36,6 +37,14 @@ xchk_setup_ag_rmapbt(
 	if (xchk_need_intent_drain(sc))
 		xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
 
+	if (xchk_could_repair(sc)) {
+		int		error;
+
+		error = xrep_setup_ag_rmapbt(sc);
+		if (error)
+			return error;
+	}
+
 	return xchk_setup_ag_btree(sc, false);
 }
 
@@ -349,7 +358,7 @@ xchk_rmapbt_rec(
 	struct xfs_rmap_irec	irec;
 
 	if (xfs_rmap_btrec_to_irec(rec, &irec) != NULL ||
-	    xfs_rmap_check_irec(bs->cur, &irec) != NULL) {
+	    xfs_rmap_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) {
 		xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
 		return 0;
 	}
@@ -412,8 +421,8 @@ xchk_rmapbt_walk_ag_metadata(
 	/* OWN_AG: bnobt, cntbt, rmapbt, and AGFL */
 	cur = sc->sa.bno_cur;
 	if (!cur)
-		cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
-				sc->sa.pag, XFS_BTNUM_BNO);
+		cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+				sc->sa.pag);
 	error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur);
 	if (cur != sc->sa.bno_cur)
 		xfs_btree_del_cursor(cur, error);
@@ -422,8 +431,8 @@ xchk_rmapbt_walk_ag_metadata(
 
 	cur = sc->sa.cnt_cur;
 	if (!cur)
-		cur = xfs_allocbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
-				sc->sa.pag, XFS_BTNUM_CNT);
+		cur = xfs_cntbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+				sc->sa.pag);
 	error = xagb_bitmap_set_btblocks(&cr->ag_owned, cur);
 	if (cur != sc->sa.cnt_cur)
 		xfs_btree_del_cursor(cur, error);
@@ -447,8 +456,7 @@ xchk_rmapbt_walk_ag_metadata(
 	/* OWN_INOBT: inobt, finobt */
 	cur = sc->sa.ino_cur;
 	if (!cur)
-		cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, sc->sa.agi_bp,
-				XFS_BTNUM_INO);
+		cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp, sc->sa.agi_bp);
 	error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur);
 	if (cur != sc->sa.ino_cur)
 		xfs_btree_del_cursor(cur, error);
@@ -458,8 +466,8 @@ xchk_rmapbt_walk_ag_metadata(
 	if (xfs_has_finobt(sc->mp)) {
 		cur = sc->sa.fino_cur;
 		if (!cur)
-			cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp,
-					sc->sa.agi_bp, XFS_BTNUM_FINO);
+			cur = xfs_finobt_init_cursor(sc->sa.pag, sc->tp,
+					sc->sa.agi_bp);
 		error = xagb_bitmap_set_btblocks(&cr->inobt_owned, cur);
 		if (cur != sc->sa.fino_cur)
 			xfs_btree_del_cursor(cur, error);
diff --git a/fs/xfs/scrub/rmap_repair.c b/fs/xfs/scrub/rmap_repair.c
new file mode 100644
index 000000000000..e8e07b683eab
--- /dev/null
+++ b/fs/xfs/scrub/rmap_repair.c
@@ -0,0 +1,1697 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2018-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_ag.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/agb_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/iscan.h"
+#include "scrub/newbt.h"
+#include "scrub/reap.h"
+
+/*
+ * Reverse Mapping Btree Repair
+ * ============================
+ *
+ * This is the most involved of all the AG space btree rebuilds.  Everywhere
+ * else in XFS we lock inodes and then AG data structures, but generating the
+ * list of rmap records requires that we be able to scan both block mapping
+ * btrees of every inode in the filesystem to see if it owns any extents in
+ * this AG.  We can't tolerate any inode updates while we do this, so we
+ * freeze the filesystem to lock everyone else out, and grant ourselves
+ * special privileges to run transactions with regular background reclamation
+ * turned off.
+ *
+ * We also have to be very careful not to allow inode reclaim to start a
+ * transaction because all transactions (other than our own) will block.
+ * Deferred inode inactivation helps us out there.
+ *
+ * I) Reverse mappings for all non-space metadata and file data are collected
+ * according to the following algorithm:
+ *
+ * 1. For each fork of each inode:
+ * 1.1. Create a bitmap BMBIT to track bmbt blocks if necessary.
+ * 1.2. If the incore extent map isn't loaded, walk the bmbt to accumulate
+ *      bmaps into rmap records (see 1.1.4).  Set bits in BMBIT for each btree
+ *      block.
+ * 1.3. If the incore extent map is loaded but the fork is in btree format,
+ *      just visit the bmbt blocks to set the corresponding BMBIT areas.
+ * 1.4. From the incore extent map, accumulate each bmap that falls into our
+ *      target AG.  Remember, multiple bmap records can map to a single rmap
+ *      record, so we cannot simply emit rmap records 1:1.
+ * 1.5. Emit rmap records for each extent in BMBIT and free it.
+ * 2. Create bitmaps INOBIT and ICHUNKBIT.
+ * 3. For each record in the inobt, set the corresponding areas in ICHUNKBIT,
+ *    and set bits in INOBIT for each btree block.  If the inobt has no records
+ *    at all, we must be careful to record its root in INOBIT.
+ * 4. For each block in the finobt, set the corresponding INOBIT area.
+ * 5. Emit rmap records for each extent in INOBIT and ICHUNKBIT and free them.
+ * 6. Create bitmaps REFCBIT and COWBIT.
+ * 7. For each CoW staging extent in the refcountbt, set the corresponding
+ *    areas in COWBIT.
+ * 8. For each block in the refcountbt, set the corresponding REFCBIT area.
+ * 9. Emit rmap records for each extent in REFCBIT and COWBIT and free them.
+ * A. Emit rmap for the AG headers.
+ * B. Emit rmap for the log, if there is one.
+ *
+ * II) The rmapbt shape and space metadata rmaps are computed as follows:
+ *
+ * 1. Count the rmaps collected in the previous step. (= NR)
+ * 2. Estimate the number of rmapbt blocks needed to store NR records. (= RMB)
+ * 3. Reserve RMB blocks through the newbt using the allocator in normap mode.
+ * 4. Create bitmap AGBIT.
+ * 5. For each reservation in the newbt, set the corresponding areas in AGBIT.
+ * 6. For each block in the AGFL, bnobt, and cntbt, set the bits in AGBIT.
+ * 7. Count the extents in AGBIT. (= AGNR)
+ * 8. Estimate the number of rmapbt blocks needed for NR + AGNR rmaps. (= RMB')
+ * 9. If RMB' >= RMB, reserve RMB' - RMB more newbt blocks, set RMB = RMB',
+ *    and clear AGBIT.  Go to step 5.
+ * A. Emit rmaps for each extent in AGBIT.
+ *
+ * III) The rmapbt is constructed and set in place as follows:
+ *
+ * 1. Sort the rmap records.
+ * 2. Bulk load the rmaps.
+ *
+ * IV) Reap the old btree blocks.
+ *
+ * 1. Create a bitmap OLDRMBIT.
+ * 2. For each gap in the new rmapbt, set the corresponding areas of OLDRMBIT.
+ * 3. For each extent in the bnobt, clear the corresponding parts of OLDRMBIT.
+ * 4. Reap the extents corresponding to the set areas in OLDRMBIT.  These are
+ *    the parts of the AG that the rmap didn't find during its scan of the
+ *    primary metadata and aren't known to be in the free space, which implies
+ *    that they were the old rmapbt blocks.
+ * 5. Commit.
+ *
+ * We use the 'xrep_rmap' prefix for all the rmap functions.
+ */
+
+/* Context for collecting rmaps */
+struct xrep_rmap {
+	/* new rmapbt information */
+	struct xrep_newbt	new_btree;
+
+	/* lock for the xfbtree and xfile */
+	struct mutex		lock;
+
+	/* rmap records generated from primary metadata */
+	struct xfbtree		rmap_btree;
+
+	struct xfs_scrub	*sc;
+
+	/* in-memory btree cursor for the xfs_btree_bload iteration */
+	struct xfs_btree_cur	*mcur;
+
+	/* Hooks into rmap update code. */
+	struct xfs_rmap_hook	rhook;
+
+	/* inode scan cursor */
+	struct xchk_iscan	iscan;
+
+	/* Number of non-freespace records found. */
+	unsigned long long	nr_records;
+
+	/* bnobt/cntbt contribution to btreeblks */
+	xfs_agblock_t		freesp_btblocks;
+
+	/* old agf_rmap_blocks counter */
+	unsigned int		old_rmapbt_fsbcount;
+};
+
+/* Set us up to repair reverse mapping btrees. */
+int
+xrep_setup_ag_rmapbt(
+	struct xfs_scrub	*sc)
+{
+	struct xrep_rmap	*rr;
+	char			*descr;
+	int			error;
+
+	xchk_fsgates_enable(sc, XCHK_FSGATES_RMAP);
+
+	descr = xchk_xfile_ag_descr(sc, "reverse mapping records");
+	error = xrep_setup_xfbtree(sc, descr);
+	kfree(descr);
+	if (error)
+		return error;
+
+	rr = kzalloc(sizeof(struct xrep_rmap), XCHK_GFP_FLAGS);
+	if (!rr)
+		return -ENOMEM;
+
+	rr->sc = sc;
+	sc->buf = rr;
+	return 0;
+}
+
+/* Make sure there's nothing funny about this mapping. */
+STATIC int
+xrep_rmap_check_mapping(
+	struct xfs_scrub	*sc,
+	const struct xfs_rmap_irec *rec)
+{
+	enum xbtree_recpacking	outcome;
+	int			error;
+
+	if (xfs_rmap_check_irec(sc->sa.pag, rec) != NULL)
+		return -EFSCORRUPTED;
+
+	/* Make sure this isn't free space. */
+	error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock,
+			rec->rm_blockcount, &outcome);
+	if (error)
+		return error;
+	if (outcome != XBTREE_RECPACKING_EMPTY)
+		return -EFSCORRUPTED;
+
+	return 0;
+}
+
+/* Store a reverse-mapping record. */
+static inline int
+xrep_rmap_stash(
+	struct xrep_rmap	*rr,
+	xfs_agblock_t		startblock,
+	xfs_extlen_t		blockcount,
+	uint64_t		owner,
+	uint64_t		offset,
+	unsigned int		flags)
+{
+	struct xfs_rmap_irec	rmap = {
+		.rm_startblock	= startblock,
+		.rm_blockcount	= blockcount,
+		.rm_owner	= owner,
+		.rm_offset	= offset,
+		.rm_flags	= flags,
+	};
+	struct xfs_scrub	*sc = rr->sc;
+	struct xfs_btree_cur	*mcur;
+	int			error = 0;
+
+	if (xchk_should_terminate(sc, &error))
+		return error;
+
+	if (xchk_iscan_aborted(&rr->iscan))
+		return -EFSCORRUPTED;
+
+	trace_xrep_rmap_found(sc->mp, sc->sa.pag->pag_agno, &rmap);
+
+	mutex_lock(&rr->lock);
+	mcur = xfs_rmapbt_mem_cursor(sc->sa.pag, sc->tp, &rr->rmap_btree);
+	error = xfs_rmap_map_raw(mcur, &rmap);
+	xfs_btree_del_cursor(mcur, error);
+	if (error)
+		goto out_cancel;
+
+	error = xfbtree_trans_commit(&rr->rmap_btree, sc->tp);
+	if (error)
+		goto out_abort;
+
+	mutex_unlock(&rr->lock);
+	return 0;
+
+out_cancel:
+	xfbtree_trans_cancel(&rr->rmap_btree, sc->tp);
+out_abort:
+	xchk_iscan_abort(&rr->iscan);
+	mutex_unlock(&rr->lock);
+	return error;
+}
+
+struct xrep_rmap_stash_run {
+	struct xrep_rmap	*rr;
+	uint64_t		owner;
+	unsigned int		rmap_flags;
+};
+
+static int
+xrep_rmap_stash_run(
+	uint32_t			start,
+	uint32_t			len,
+	void				*priv)
+{
+	struct xrep_rmap_stash_run	*rsr = priv;
+	struct xrep_rmap		*rr = rsr->rr;
+
+	return xrep_rmap_stash(rr, start, len, rsr->owner, 0, rsr->rmap_flags);
+}
+
+/*
+ * Emit rmaps for every extent of bits set in the bitmap.  Caller must ensure
+ * that the ranges are in units of FS blocks.
+ */
+STATIC int
+xrep_rmap_stash_bitmap(
+	struct xrep_rmap		*rr,
+	struct xagb_bitmap		*bitmap,
+	const struct xfs_owner_info	*oinfo)
+{
+	struct xrep_rmap_stash_run	rsr = {
+		.rr			= rr,
+		.owner			= oinfo->oi_owner,
+		.rmap_flags		= 0,
+	};
+
+	if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
+		rsr.rmap_flags |= XFS_RMAP_ATTR_FORK;
+	if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
+		rsr.rmap_flags |= XFS_RMAP_BMBT_BLOCK;
+
+	return xagb_bitmap_walk(bitmap, xrep_rmap_stash_run, &rsr);
+}
+
+/* Section (I): Finding all file and bmbt extents. */
+
+/* Context for accumulating rmaps for an inode fork. */
+struct xrep_rmap_ifork {
+	/*
+	 * Accumulate rmap data here to turn multiple adjacent bmaps into a
+	 * single rmap.
+	 */
+	struct xfs_rmap_irec	accum;
+
+	/* Bitmap of bmbt blocks in this AG. */
+	struct xagb_bitmap	bmbt_blocks;
+
+	struct xrep_rmap	*rr;
+
+	/* Which inode fork? */
+	int			whichfork;
+};
+
+/* Stash an rmap that we accumulated while walking an inode fork. */
+STATIC int
+xrep_rmap_stash_accumulated(
+	struct xrep_rmap_ifork	*rf)
+{
+	if (rf->accum.rm_blockcount == 0)
+		return 0;
+
+	return xrep_rmap_stash(rf->rr, rf->accum.rm_startblock,
+			rf->accum.rm_blockcount, rf->accum.rm_owner,
+			rf->accum.rm_offset, rf->accum.rm_flags);
+}
+
+/* Accumulate a bmbt record. */
+STATIC int
+xrep_rmap_visit_bmbt(
+	struct xfs_btree_cur	*cur,
+	struct xfs_bmbt_irec	*rec,
+	void			*priv)
+{
+	struct xrep_rmap_ifork	*rf = priv;
+	struct xfs_mount	*mp = rf->rr->sc->mp;
+	struct xfs_rmap_irec	*accum = &rf->accum;
+	xfs_agblock_t		agbno;
+	unsigned int		rmap_flags = 0;
+	int			error;
+
+	if (XFS_FSB_TO_AGNO(mp, rec->br_startblock) !=
+			rf->rr->sc->sa.pag->pag_agno)
+		return 0;
+
+	agbno = XFS_FSB_TO_AGBNO(mp, rec->br_startblock);
+	if (rf->whichfork == XFS_ATTR_FORK)
+		rmap_flags |= XFS_RMAP_ATTR_FORK;
+	if (rec->br_state == XFS_EXT_UNWRITTEN)
+		rmap_flags |= XFS_RMAP_UNWRITTEN;
+
+	/* If this bmap is adjacent to the previous one, just add it. */
+	if (accum->rm_blockcount > 0 &&
+	    rec->br_startoff == accum->rm_offset + accum->rm_blockcount &&
+	    agbno == accum->rm_startblock + accum->rm_blockcount &&
+	    rmap_flags == accum->rm_flags) {
+		accum->rm_blockcount += rec->br_blockcount;
+		return 0;
+	}
+
+	/* Otherwise stash the old rmap and start accumulating a new one. */
+	error = xrep_rmap_stash_accumulated(rf);
+	if (error)
+		return error;
+
+	accum->rm_startblock = agbno;
+	accum->rm_blockcount = rec->br_blockcount;
+	accum->rm_offset = rec->br_startoff;
+	accum->rm_flags = rmap_flags;
+	return 0;
+}
+
+/* Add a btree block to the bitmap. */
+STATIC int
+xrep_rmap_visit_iroot_btree_block(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	void			*priv)
+{
+	struct xrep_rmap_ifork	*rf = priv;
+	struct xfs_buf		*bp;
+	xfs_fsblock_t		fsbno;
+	xfs_agblock_t		agbno;
+
+	xfs_btree_get_block(cur, level, &bp);
+	if (!bp)
+		return 0;
+
+	fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
+	if (XFS_FSB_TO_AGNO(cur->bc_mp, fsbno) != rf->rr->sc->sa.pag->pag_agno)
+		return 0;
+
+	agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+	return xagb_bitmap_set(&rf->bmbt_blocks, agbno, 1);
+}
+
+/*
+ * Iterate a metadata btree rooted in an inode to collect rmap records for
+ * anything in this fork that matches the AG.
+ */
+STATIC int
+xrep_rmap_scan_iroot_btree(
+	struct xrep_rmap_ifork	*rf,
+	struct xfs_btree_cur	*cur)
+{
+	struct xfs_owner_info	oinfo;
+	struct xrep_rmap	*rr = rf->rr;
+	int			error;
+
+	xagb_bitmap_init(&rf->bmbt_blocks);
+
+	/* Record all the blocks in the btree itself. */
+	error = xfs_btree_visit_blocks(cur, xrep_rmap_visit_iroot_btree_block,
+			XFS_BTREE_VISIT_ALL, rf);
+	if (error)
+		goto out;
+
+	/* Emit rmaps for the btree blocks. */
+	xfs_rmap_ino_bmbt_owner(&oinfo, rf->accum.rm_owner, rf->whichfork);
+	error = xrep_rmap_stash_bitmap(rr, &rf->bmbt_blocks, &oinfo);
+	if (error)
+		goto out;
+
+	/* Stash any remaining accumulated rmaps. */
+	error = xrep_rmap_stash_accumulated(rf);
+out:
+	xagb_bitmap_destroy(&rf->bmbt_blocks);
+	return error;
+}
+
+static inline bool
+is_rt_data_fork(
+	struct xfs_inode	*ip,
+	int			whichfork)
+{
+	return XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK;
+}
+
+/*
+ * Iterate the block mapping btree to collect rmap records for anything in this
+ * fork that matches the AG.  Sets @mappings_done to true if we've scanned the
+ * block mappings in this fork.
+ */
+STATIC int
+xrep_rmap_scan_bmbt(
+	struct xrep_rmap_ifork	*rf,
+	struct xfs_inode	*ip,
+	bool			*mappings_done)
+{
+	struct xrep_rmap	*rr = rf->rr;
+	struct xfs_btree_cur	*cur;
+	struct xfs_ifork	*ifp;
+	int			error;
+
+	*mappings_done = false;
+	ifp = xfs_ifork_ptr(ip, rf->whichfork);
+	cur = xfs_bmbt_init_cursor(rr->sc->mp, rr->sc->tp, ip, rf->whichfork);
+
+	if (!xfs_ifork_is_realtime(ip, rf->whichfork) &&
+	    xfs_need_iread_extents(ifp)) {
+		/*
+		 * If the incore extent cache isn't loaded, scan the bmbt for
+		 * mapping records.  This avoids loading the incore extent
+		 * tree, which will increase memory pressure at a time when
+		 * we're trying to run as quickly as we possibly can.  Ignore
+		 * realtime extents.
+		 */
+		error = xfs_bmap_query_all(cur, xrep_rmap_visit_bmbt, rf);
+		if (error)
+			goto out_cur;
+
+		*mappings_done = true;
+	}
+
+	/* Scan for the bmbt blocks, which always live on the data device. */
+	error = xrep_rmap_scan_iroot_btree(rf, cur);
+out_cur:
+	xfs_btree_del_cursor(cur, error);
+	return error;
+}
+
+/*
+ * Iterate the in-core extent cache to collect rmap records for anything in
+ * this fork that matches the AG.
+ */
+STATIC int
+xrep_rmap_scan_iext(
+	struct xrep_rmap_ifork	*rf,
+	struct xfs_ifork	*ifp)
+{
+	struct xfs_bmbt_irec	rec;
+	struct xfs_iext_cursor	icur;
+	int			error;
+
+	for_each_xfs_iext(ifp, &icur, &rec) {
+		if (isnullstartblock(rec.br_startblock))
+			continue;
+		error = xrep_rmap_visit_bmbt(NULL, &rec, rf);
+		if (error)
+			return error;
+	}
+
+	return xrep_rmap_stash_accumulated(rf);
+}
+
+/* Find all the extents from a given AG in an inode fork. */
+STATIC int
+xrep_rmap_scan_ifork(
+	struct xrep_rmap	*rr,
+	struct xfs_inode	*ip,
+	int			whichfork)
+{
+	struct xrep_rmap_ifork	rf = {
+		.accum		= { .rm_owner = ip->i_ino, },
+		.rr		= rr,
+		.whichfork	= whichfork,
+	};
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
+	int			error = 0;
+
+	if (!ifp)
+		return 0;
+
+	if (ifp->if_format == XFS_DINODE_FMT_BTREE) {
+		bool		mappings_done;
+
+		/*
+		 * Scan the bmap btree for data device mappings.  This includes
+		 * the btree blocks themselves, even if this is a realtime
+		 * file.
+		 */
+		error = xrep_rmap_scan_bmbt(&rf, ip, &mappings_done);
+		if (error || mappings_done)
+			return error;
+	} else if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) {
+		return 0;
+	}
+
+	/* Scan incore extent cache if this isn't a realtime file. */
+	if (xfs_ifork_is_realtime(ip, whichfork))
+		return 0;
+
+	return xrep_rmap_scan_iext(&rf, ifp);
+}
+
+/*
+ * Take ILOCK on a file that we want to scan.
+ *
+ * Select ILOCK_EXCL if the file has an unloaded data bmbt or has an unloaded
+ * attr bmbt.  Otherwise, take ILOCK_SHARED.
+ */
+static inline unsigned int
+xrep_rmap_scan_ilock(
+	struct xfs_inode	*ip)
+{
+	uint			lock_mode = XFS_ILOCK_SHARED;
+
+	if (xfs_need_iread_extents(&ip->i_df)) {
+		lock_mode = XFS_ILOCK_EXCL;
+		goto lock;
+	}
+
+	if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
+		lock_mode = XFS_ILOCK_EXCL;
+
+lock:
+	xfs_ilock(ip, lock_mode);
+	return lock_mode;
+}
+
+/* Record reverse mappings for a file. */
+STATIC int
+xrep_rmap_scan_inode(
+	struct xrep_rmap	*rr,
+	struct xfs_inode	*ip)
+{
+	unsigned int		lock_mode = 0;
+	int			error;
+
+	/*
+	 * Directory updates (create/link/unlink/rename) drop the directory's
+	 * ILOCK before finishing any rmapbt updates associated with directory
+	 * shape changes.  For this scan to coordinate correctly with the live
+	 * update hook, we must take the only lock (i_rwsem) that is held all
+	 * the way to dir op completion.  This will get fixed by the parent
+	 * pointer patchset.
+	 */
+	if (S_ISDIR(VFS_I(ip)->i_mode)) {
+		lock_mode = XFS_IOLOCK_SHARED;
+		xfs_ilock(ip, lock_mode);
+	}
+	lock_mode |= xrep_rmap_scan_ilock(ip);
+
+	/* Check the data fork. */
+	error = xrep_rmap_scan_ifork(rr, ip, XFS_DATA_FORK);
+	if (error)
+		goto out_unlock;
+
+	/* Check the attr fork. */
+	error = xrep_rmap_scan_ifork(rr, ip, XFS_ATTR_FORK);
+	if (error)
+		goto out_unlock;
+
+	/* COW fork extents are "owned" by the refcount btree. */
+
+	xchk_iscan_mark_visited(&rr->iscan, ip);
+out_unlock:
+	xfs_iunlock(ip, lock_mode);
+	return error;
+}
+
+/* Section (I): Find all AG metadata extents except for free space metadata. */
+
+struct xrep_rmap_inodes {
+	struct xrep_rmap	*rr;
+	struct xagb_bitmap	inobt_blocks;	/* INOBIT */
+	struct xagb_bitmap	ichunk_blocks;	/* ICHUNKBIT */
+};
+
+/* Record inode btree rmaps. */
+STATIC int
+xrep_rmap_walk_inobt(
+	struct xfs_btree_cur		*cur,
+	const union xfs_btree_rec	*rec,
+	void				*priv)
+{
+	struct xfs_inobt_rec_incore	irec;
+	struct xrep_rmap_inodes		*ri = priv;
+	struct xfs_mount		*mp = cur->bc_mp;
+	xfs_agblock_t			agbno;
+	xfs_extlen_t			aglen;
+	xfs_agino_t			agino;
+	xfs_agino_t			iperhole;
+	unsigned int			i;
+	int				error;
+
+	/* Record the inobt blocks. */
+	error = xagb_bitmap_set_btcur_path(&ri->inobt_blocks, cur);
+	if (error)
+		return error;
+
+	xfs_inobt_btrec_to_irec(mp, rec, &irec);
+	if (xfs_inobt_check_irec(cur->bc_ag.pag, &irec) != NULL)
+		return -EFSCORRUPTED;
+
+	agino = irec.ir_startino;
+
+	/* Record a non-sparse inode chunk. */
+	if (!xfs_inobt_issparse(irec.ir_holemask)) {
+		agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+		aglen = max_t(xfs_extlen_t, 1,
+				XFS_INODES_PER_CHUNK / mp->m_sb.sb_inopblock);
+
+		return xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
+	}
+
+	/* Iterate each chunk. */
+	iperhole = max_t(xfs_agino_t, mp->m_sb.sb_inopblock,
+			XFS_INODES_PER_HOLEMASK_BIT);
+	aglen = iperhole / mp->m_sb.sb_inopblock;
+	for (i = 0, agino = irec.ir_startino;
+	     i < XFS_INOBT_HOLEMASK_BITS;
+	     i += iperhole / XFS_INODES_PER_HOLEMASK_BIT, agino += iperhole) {
+		/* Skip holes. */
+		if (irec.ir_holemask & (1 << i))
+			continue;
+
+		/* Record the inode chunk otherwise. */
+		agbno = XFS_AGINO_TO_AGBNO(mp, agino);
+		error = xagb_bitmap_set(&ri->ichunk_blocks, agbno, aglen);
+		if (error)
+			return error;
+	}
+
+	return 0;
+}
+
+/* Collect rmaps for the blocks containing inode btrees and the inode chunks. */
+STATIC int
+xrep_rmap_find_inode_rmaps(
+	struct xrep_rmap	*rr)
+{
+	struct xrep_rmap_inodes	ri = {
+		.rr		= rr,
+	};
+	struct xfs_scrub	*sc = rr->sc;
+	int			error;
+
+	xagb_bitmap_init(&ri.inobt_blocks);
+	xagb_bitmap_init(&ri.ichunk_blocks);
+
+	/*
+	 * Iterate every record in the inobt so we can capture all the inode
+	 * chunks and the blocks in the inobt itself.
+	 */
+	error = xfs_btree_query_all(sc->sa.ino_cur, xrep_rmap_walk_inobt, &ri);
+	if (error)
+		goto out_bitmap;
+
+	/*
+	 * Note that if there are zero records in the inobt then query_all does
+	 * nothing and we have to account the empty inobt root manually.
+	 */
+	if (xagb_bitmap_empty(&ri.ichunk_blocks)) {
+		struct xfs_agi	*agi = sc->sa.agi_bp->b_addr;
+
+		error = xagb_bitmap_set(&ri.inobt_blocks,
+				be32_to_cpu(agi->agi_root), 1);
+		if (error)
+			goto out_bitmap;
+	}
+
+	/* Scan the finobt too. */
+	if (xfs_has_finobt(sc->mp)) {
+		error = xagb_bitmap_set_btblocks(&ri.inobt_blocks,
+				sc->sa.fino_cur);
+		if (error)
+			goto out_bitmap;
+	}
+
+	/* Generate rmaps for everything. */
+	error = xrep_rmap_stash_bitmap(rr, &ri.inobt_blocks,
+			&XFS_RMAP_OINFO_INOBT);
+	if (error)
+		goto out_bitmap;
+	error = xrep_rmap_stash_bitmap(rr, &ri.ichunk_blocks,
+			&XFS_RMAP_OINFO_INODES);
+
+out_bitmap:
+	xagb_bitmap_destroy(&ri.inobt_blocks);
+	xagb_bitmap_destroy(&ri.ichunk_blocks);
+	return error;
+}
+
+/* Record a CoW staging extent. */
+STATIC int
+xrep_rmap_walk_cowblocks(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_refcount_irec	*irec,
+	void				*priv)
+{
+	struct xagb_bitmap		*bitmap = priv;
+
+	if (!xfs_refcount_check_domain(irec) ||
+	    irec->rc_domain != XFS_REFC_DOMAIN_COW)
+		return -EFSCORRUPTED;
+
+	return xagb_bitmap_set(bitmap, irec->rc_startblock, irec->rc_blockcount);
+}
+
+/*
+ * Collect rmaps for the blocks containing the refcount btree, and all CoW
+ * staging extents.
+ */
+STATIC int
+xrep_rmap_find_refcount_rmaps(
+	struct xrep_rmap	*rr)
+{
+	struct xagb_bitmap	refcountbt_blocks;	/* REFCBIT */
+	struct xagb_bitmap	cow_blocks;		/* COWBIT */
+	struct xfs_refcount_irec low = {
+		.rc_startblock	= 0,
+		.rc_domain	= XFS_REFC_DOMAIN_COW,
+	};
+	struct xfs_refcount_irec high = {
+		.rc_startblock	= -1U,
+		.rc_domain	= XFS_REFC_DOMAIN_COW,
+	};
+	struct xfs_scrub	*sc = rr->sc;
+	int			error;
+
+	if (!xfs_has_reflink(sc->mp))
+		return 0;
+
+	xagb_bitmap_init(&refcountbt_blocks);
+	xagb_bitmap_init(&cow_blocks);
+
+	/* refcountbt */
+	error = xagb_bitmap_set_btblocks(&refcountbt_blocks, sc->sa.refc_cur);
+	if (error)
+		goto out_bitmap;
+
+	/* Collect rmaps for CoW staging extents. */
+	error = xfs_refcount_query_range(sc->sa.refc_cur, &low, &high,
+			xrep_rmap_walk_cowblocks, &cow_blocks);
+	if (error)
+		goto out_bitmap;
+
+	/* Generate rmaps for everything. */
+	error = xrep_rmap_stash_bitmap(rr, &cow_blocks, &XFS_RMAP_OINFO_COW);
+	if (error)
+		goto out_bitmap;
+	error = xrep_rmap_stash_bitmap(rr, &refcountbt_blocks,
+			&XFS_RMAP_OINFO_REFC);
+
+out_bitmap:
+	xagb_bitmap_destroy(&cow_blocks);
+	xagb_bitmap_destroy(&refcountbt_blocks);
+	return error;
+}
+
+/* Generate rmaps for the AG headers (AGI/AGF/AGFL) */
+STATIC int
+xrep_rmap_find_agheader_rmaps(
+	struct xrep_rmap	*rr)
+{
+	struct xfs_scrub	*sc = rr->sc;
+
+	/* Create a record for the AG sb->agfl. */
+	return xrep_rmap_stash(rr, XFS_SB_BLOCK(sc->mp),
+			XFS_AGFL_BLOCK(sc->mp) - XFS_SB_BLOCK(sc->mp) + 1,
+			XFS_RMAP_OWN_FS, 0, 0);
+}
+
+/* Generate rmaps for the log, if it's in this AG. */
+STATIC int
+xrep_rmap_find_log_rmaps(
+	struct xrep_rmap	*rr)
+{
+	struct xfs_scrub	*sc = rr->sc;
+
+	if (!xfs_ag_contains_log(sc->mp, sc->sa.pag->pag_agno))
+		return 0;
+
+	return xrep_rmap_stash(rr,
+			XFS_FSB_TO_AGBNO(sc->mp, sc->mp->m_sb.sb_logstart),
+			sc->mp->m_sb.sb_logblocks, XFS_RMAP_OWN_LOG, 0, 0);
+}
+
+/* Check and count all the records that we gathered. */
+STATIC int
+xrep_rmap_check_record(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec,
+	void				*priv)
+{
+	struct xrep_rmap		*rr = priv;
+	int				error;
+
+	error = xrep_rmap_check_mapping(rr->sc, rec);
+	if (error)
+		return error;
+
+	rr->nr_records++;
+	return 0;
+}
+
+/*
+ * Generate all the reverse-mappings for this AG, a list of the old rmapbt
+ * blocks, and the new btreeblks count.  Figure out if we have enough free
+ * space to reconstruct the inode btrees.  The caller must clean up the lists
+ * if anything goes wrong.  This implements section (I) above.
+ */
+STATIC int
+xrep_rmap_find_rmaps(
+	struct xrep_rmap	*rr)
+{
+	struct xfs_scrub	*sc = rr->sc;
+	struct xchk_ag		*sa = &sc->sa;
+	struct xfs_inode	*ip;
+	struct xfs_btree_cur	*mcur;
+	int			error;
+
+	/* Find all the per-AG metadata. */
+	xrep_ag_btcur_init(sc, &sc->sa);
+
+	error = xrep_rmap_find_inode_rmaps(rr);
+	if (error)
+		goto end_agscan;
+
+	error = xrep_rmap_find_refcount_rmaps(rr);
+	if (error)
+		goto end_agscan;
+
+	error = xrep_rmap_find_agheader_rmaps(rr);
+	if (error)
+		goto end_agscan;
+
+	error = xrep_rmap_find_log_rmaps(rr);
+end_agscan:
+	xchk_ag_btcur_free(&sc->sa);
+	if (error)
+		return error;
+
+	/*
+	 * Set up for a potentially lengthy filesystem scan by reducing our
+	 * transaction resource usage for the duration.  Specifically:
+	 *
+	 * Unlock the AG header buffers and cancel the transaction to release
+	 * the log grant space while we scan the filesystem.
+	 *
+	 * Create a new empty transaction to eliminate the possibility of the
+	 * inode scan deadlocking on cyclical metadata.
+	 *
+	 * We pass the empty transaction to the file scanning function to avoid
+	 * repeatedly cycling empty transactions.  This can be done even though
+	 * we take the IOLOCK to quiesce the file because empty transactions
+	 * do not take sb_internal.
+	 */
+	sa->agf_bp = NULL;
+	sa->agi_bp = NULL;
+	xchk_trans_cancel(sc);
+	error = xchk_trans_alloc_empty(sc);
+	if (error)
+		return error;
+
+	/* Iterate all AGs for inodes rmaps. */
+	while ((error = xchk_iscan_iter(&rr->iscan, &ip)) == 1) {
+		error = xrep_rmap_scan_inode(rr, ip);
+		xchk_irele(sc, ip);
+		if (error)
+			break;
+
+		if (xchk_should_terminate(sc, &error))
+			break;
+	}
+	xchk_iscan_iter_finish(&rr->iscan);
+	if (error)
+		return error;
+
+	/*
+	 * Switch out for a real transaction and lock the AG headers in
+	 * preparation for building a new tree.
+	 */
+	xchk_trans_cancel(sc);
+	error = xchk_setup_fs(sc);
+	if (error)
+		return error;
+	error = xchk_perag_drain_and_lock(sc);
+	if (error)
+		return error;
+
+	/*
+	 * If a hook failed to update the in-memory btree, we lack the data to
+	 * continue the repair.
+	 */
+	if (xchk_iscan_aborted(&rr->iscan))
+		return -EFSCORRUPTED;
+
+	/*
+	 * Now that we have everything locked again, we need to count the
+	 * number of rmap records stashed in the btree.  This should reflect
+	 * all actively-owned space in the filesystem.  At the same time, check
+	 * all our records before we start building a new btree, which requires
+	 * a bnobt cursor.
+	 */
+	mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree);
+	sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+			sc->sa.pag);
+
+	rr->nr_records = 0;
+	error = xfs_rmap_query_all(mcur, xrep_rmap_check_record, rr);
+
+	xfs_btree_del_cursor(sc->sa.bno_cur, error);
+	sc->sa.bno_cur = NULL;
+	xfs_btree_del_cursor(mcur, error);
+
+	return error;
+}
+
+/* Section (II): Reserving space for new rmapbt and setting free space bitmap */
+
+struct xrep_rmap_agfl {
+	struct xagb_bitmap	*bitmap;
+	xfs_agnumber_t		agno;
+};
+
+/* Add an AGFL block to the rmap list. */
+STATIC int
+xrep_rmap_walk_agfl(
+	struct xfs_mount	*mp,
+	xfs_agblock_t		agbno,
+	void			*priv)
+{
+	struct xrep_rmap_agfl	*ra = priv;
+
+	return xagb_bitmap_set(ra->bitmap, agbno, 1);
+}
+
+/*
+ * Run one round of reserving space for the new rmapbt and recomputing the
+ * number of blocks needed to store the previously observed rmapbt records and
+ * the ones we'll create for the free space metadata.  When we don't need more
+ * blocks, return a bitmap of OWN_AG extents in @freesp_blocks and set @done to
+ * true.
+ */
+STATIC int
+xrep_rmap_try_reserve(
+	struct xrep_rmap	*rr,
+	struct xfs_btree_cur	*rmap_cur,
+	struct xagb_bitmap	*freesp_blocks,
+	uint64_t		*blocks_reserved,
+	bool			*done)
+{
+	struct xrep_rmap_agfl	ra = {
+		.bitmap		= freesp_blocks,
+		.agno		= rr->sc->sa.pag->pag_agno,
+	};
+	struct xfs_scrub	*sc = rr->sc;
+	struct xrep_newbt_resv	*resv, *n;
+	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
+	struct xfs_buf		*agfl_bp;
+	uint64_t		nr_blocks;	/* RMB */
+	uint64_t		freesp_records;
+	int			error;
+
+	/*
+	 * We're going to recompute new_btree.bload.nr_blocks at the end of
+	 * this function to reflect however many btree blocks we need to store
+	 * all the rmap records (including the ones that reflect the changes we
+	 * made to support the new rmapbt blocks), so we save the old value
+	 * here so we can decide if we've reserved enough blocks.
+	 */
+	nr_blocks = rr->new_btree.bload.nr_blocks;
+
+	/*
+	 * Make sure we've reserved enough space for the new btree.  This can
+	 * change the shape of the free space btrees, which can cause secondary
+	 * interactions with the rmap records because all three space btrees
+	 * have the same rmap owner.  We'll account for all that below.
+	 */
+	error = xrep_newbt_alloc_blocks(&rr->new_btree,
+			nr_blocks - *blocks_reserved);
+	if (error)
+		return error;
+
+	*blocks_reserved = rr->new_btree.bload.nr_blocks;
+
+	/* Clear everything in the bitmap. */
+	xagb_bitmap_destroy(freesp_blocks);
+
+	/* Set all the bnobt blocks in the bitmap. */
+	sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+			sc->sa.pag);
+	error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.bno_cur);
+	xfs_btree_del_cursor(sc->sa.bno_cur, error);
+	sc->sa.bno_cur = NULL;
+	if (error)
+		return error;
+
+	/* Set all the cntbt blocks in the bitmap. */
+	sc->sa.cnt_cur = xfs_cntbt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+			sc->sa.pag);
+	error = xagb_bitmap_set_btblocks(freesp_blocks, sc->sa.cnt_cur);
+	xfs_btree_del_cursor(sc->sa.cnt_cur, error);
+	sc->sa.cnt_cur = NULL;
+	if (error)
+		return error;
+
+	/* Record our new btreeblks value. */
+	rr->freesp_btblocks = xagb_bitmap_hweight(freesp_blocks) - 2;
+
+	/* Set all the new rmapbt blocks in the bitmap. */
+	list_for_each_entry_safe(resv, n, &rr->new_btree.resv_list, list) {
+		error = xagb_bitmap_set(freesp_blocks, resv->agbno, resv->len);
+		if (error)
+			return error;
+	}
+
+	/* Set all the AGFL blocks in the bitmap. */
+	error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
+	if (error)
+		return error;
+
+	error = xfs_agfl_walk(sc->mp, agf, agfl_bp, xrep_rmap_walk_agfl, &ra);
+	if (error)
+		return error;
+
+	/* Count the extents in the bitmap. */
+	freesp_records = xagb_bitmap_count_set_regions(freesp_blocks);
+
+	/* Compute how many blocks we'll need for all the rmaps. */
+	error = xfs_btree_bload_compute_geometry(rmap_cur,
+			&rr->new_btree.bload, rr->nr_records + freesp_records);
+	if (error)
+		return error;
+
+	/* We're done when we don't need more blocks. */
+	*done = nr_blocks >= rr->new_btree.bload.nr_blocks;
+	return 0;
+}
+
+/*
+ * Iteratively reserve space for rmap btree while recording OWN_AG rmaps for
+ * the free space metadata.  This implements section (II) above.
+ */
+STATIC int
+xrep_rmap_reserve_space(
+	struct xrep_rmap	*rr,
+	struct xfs_btree_cur	*rmap_cur)
+{
+	struct xagb_bitmap	freesp_blocks;	/* AGBIT */
+	uint64_t		blocks_reserved = 0;
+	bool			done = false;
+	int			error;
+
+	/* Compute how many blocks we'll need for the rmaps collected so far. */
+	error = xfs_btree_bload_compute_geometry(rmap_cur,
+			&rr->new_btree.bload, rr->nr_records);
+	if (error)
+		return error;
+
+	/* Last chance to abort before we start committing fixes. */
+	if (xchk_should_terminate(rr->sc, &error))
+		return error;
+
+	xagb_bitmap_init(&freesp_blocks);
+
+	/*
+	 * Iteratively reserve space for the new rmapbt and recompute the
+	 * number of blocks needed to store the previously observed rmapbt
+	 * records and the ones we'll create for the free space metadata.
+	 * Finish when we don't need more blocks.
+	 */
+	do {
+		error = xrep_rmap_try_reserve(rr, rmap_cur, &freesp_blocks,
+				&blocks_reserved, &done);
+		if (error)
+			goto out_bitmap;
+	} while (!done);
+
+	/* Emit rmaps for everything in the free space bitmap. */
+	xrep_ag_btcur_init(rr->sc, &rr->sc->sa);
+	error = xrep_rmap_stash_bitmap(rr, &freesp_blocks, &XFS_RMAP_OINFO_AG);
+	xchk_ag_btcur_free(&rr->sc->sa);
+
+out_bitmap:
+	xagb_bitmap_destroy(&freesp_blocks);
+	return error;
+}
+
+/* Section (III): Building the new rmap btree. */
+
+/* Update the AGF counters. */
+STATIC int
+xrep_rmap_reset_counters(
+	struct xrep_rmap	*rr)
+{
+	struct xfs_scrub	*sc = rr->sc;
+	struct xfs_perag	*pag = sc->sa.pag;
+	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
+	xfs_agblock_t		rmap_btblocks;
+
+	/*
+	 * The AGF header contains extra information related to the reverse
+	 * mapping btree, so we must update those fields here.
+	 */
+	rmap_btblocks = rr->new_btree.afake.af_blocks - 1;
+	agf->agf_btreeblks = cpu_to_be32(rr->freesp_btblocks + rmap_btblocks);
+	xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS);
+
+	/*
+	 * After we commit the new btree to disk, it is possible that the
+	 * process to reap the old btree blocks will race with the AIL trying
+	 * to checkpoint the old btree blocks into the filesystem.  If the new
+	 * tree is shorter than the old one, the rmapbt write verifier will
+	 * fail and the AIL will shut down the filesystem.
+	 *
+	 * To avoid this, save the old incore btree height values as the alt
+	 * height values before re-initializing the perag info from the updated
+	 * AGF to capture all the new values.
+	 */
+	pag->pagf_repair_rmap_level = pag->pagf_rmap_level;
+
+	/* Reinitialize with the values we just logged. */
+	return xrep_reinit_pagf(sc);
+}
+
+/* Retrieve rmapbt data for bulk load. */
+STATIC int
+xrep_rmap_get_records(
+	struct xfs_btree_cur	*cur,
+	unsigned int		idx,
+	struct xfs_btree_block	*block,
+	unsigned int		nr_wanted,
+	void			*priv)
+{
+	struct xrep_rmap	*rr = priv;
+	union xfs_btree_rec	*block_rec;
+	unsigned int		loaded;
+	int			error;
+
+	for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
+		int		stat = 0;
+
+		error = xfs_btree_increment(rr->mcur, 0, &stat);
+		if (error)
+			return error;
+		if (!stat)
+			return -EFSCORRUPTED;
+
+		error = xfs_rmap_get_rec(rr->mcur, &cur->bc_rec.r, &stat);
+		if (error)
+			return error;
+		if (!stat)
+			return -EFSCORRUPTED;
+
+		block_rec = xfs_btree_rec_addr(cur, idx, block);
+		cur->bc_ops->init_rec_from_cur(cur, block_rec);
+	}
+
+	return loaded;
+}
+
+/* Feed one of the new btree blocks to the bulk loader. */
+STATIC int
+xrep_rmap_claim_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr,
+	void			*priv)
+{
+	struct xrep_rmap        *rr = priv;
+
+	return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
+}
+
+/* Custom allocation function for new rmap btrees. */
+STATIC int
+xrep_rmap_alloc_vextent(
+	struct xfs_scrub	*sc,
+	struct xfs_alloc_arg	*args,
+	xfs_fsblock_t		alloc_hint)
+{
+	int			error;
+
+	/*
+	 * We don't want an rmap update on the allocation, since we iteratively
+	 * compute the OWN_AG records /after/ allocating blocks for the records
+	 * that we already know we need to store.  Therefore, fix the freelist
+	 * with the NORMAP flag set so that we don't also try to create an rmap
+	 * for new AGFL blocks.
+	 */
+	error = xrep_fix_freelist(sc, XFS_ALLOC_FLAG_NORMAP);
+	if (error)
+		return error;
+
+	/*
+	 * If xrep_fix_freelist fixed the freelist by moving blocks from the
+	 * free space btrees or by removing blocks from the AGFL and queueing
+	 * an EFI to free the block, the transaction will be dirty.  This
+	 * second case is of interest to us.
+	 *
+	 * Later on, we will need to compare gaps in the new recordset against
+	 * the block usage of all OWN_AG owners in order to free the old
+	 * btree's blocks, which means that we can't have EFIs for former AGFL
+	 * blocks attached to the repair transaction when we commit the new
+	 * btree.
+	 *
+	 * xrep_newbt_alloc_blocks guarantees this for us by calling
+	 * xrep_defer_finish to commit anything that fix_freelist may have
+	 * added to the transaction.
+	 */
+	return xfs_alloc_vextent_near_bno(args, alloc_hint);
+}
+
+
+/* Count the records in this btree. */
+STATIC int
+xrep_rmap_count_records(
+	struct xfs_btree_cur	*cur,
+	unsigned long long	*nr)
+{
+	int			running = 1;
+	int			error;
+
+	*nr = 0;
+
+	error = xfs_btree_goto_left_edge(cur);
+	if (error)
+		return error;
+
+	while (running && !(error = xfs_btree_increment(cur, 0, &running))) {
+		if (running)
+			(*nr)++;
+	}
+
+	return error;
+}
+/*
+ * Use the collected rmap information to stage a new rmap btree.  If this is
+ * successful we'll return with the new btree root information logged to the
+ * repair transaction but not yet committed.  This implements section (III)
+ * above.
+ */
+STATIC int
+xrep_rmap_build_new_tree(
+	struct xrep_rmap	*rr)
+{
+	struct xfs_scrub	*sc = rr->sc;
+	struct xfs_perag	*pag = sc->sa.pag;
+	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
+	struct xfs_btree_cur	*rmap_cur;
+	xfs_fsblock_t		fsbno;
+	int			error;
+
+	/*
+	 * Preserve the old rmapbt block count so that we can adjust the
+	 * per-AG rmapbt reservation after we commit the new btree root and
+	 * want to dispose of the old btree blocks.
+	 */
+	rr->old_rmapbt_fsbcount = be32_to_cpu(agf->agf_rmap_blocks);
+
+	/*
+	 * Prepare to construct the new btree by reserving disk space for the
+	 * new btree and setting up all the accounting information we'll need
+	 * to root the new btree while it's under construction and before we
+	 * attach it to the AG header.  The new blocks are accounted to the
+	 * rmapbt per-AG reservation, which we will adjust further after
+	 * committing the new btree.
+	 */
+	fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, XFS_RMAP_BLOCK(sc->mp));
+	xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_SKIP_UPDATE,
+			fsbno, XFS_AG_RESV_RMAPBT);
+	rr->new_btree.bload.get_records = xrep_rmap_get_records;
+	rr->new_btree.bload.claim_block = xrep_rmap_claim_block;
+	rr->new_btree.alloc_vextent = xrep_rmap_alloc_vextent;
+	rmap_cur = xfs_rmapbt_init_cursor(sc->mp, NULL, NULL, pag);
+	xfs_btree_stage_afakeroot(rmap_cur, &rr->new_btree.afake);
+
+	/*
+	 * Initialize @rr->new_btree, reserve space for the new rmapbt,
+	 * and compute OWN_AG rmaps.
+	 */
+	error = xrep_rmap_reserve_space(rr, rmap_cur);
+	if (error)
+		goto err_cur;
+
+	/*
+	 * Count the rmapbt records again, because the space reservation
+	 * for the rmapbt itself probably added more records to the btree.
+	 */
+	rr->mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL,
+			&rr->rmap_btree);
+
+	error = xrep_rmap_count_records(rr->mcur, &rr->nr_records);
+	if (error)
+		goto err_mcur;
+
+	/*
+	 * Due to btree slack factors, it's possible for a new btree to be one
+	 * level taller than the old btree.  Update the incore btree height so
+	 * that we don't trip the verifiers when writing the new btree blocks
+	 * to disk.
+	 */
+	pag->pagf_repair_rmap_level = rr->new_btree.bload.btree_height;
+
+	/*
+	 * Move the cursor to the left edge of the tree so that the first
+	 * increment in ->get_records positions us at the first record.
+	 */
+	error = xfs_btree_goto_left_edge(rr->mcur);
+	if (error)
+		goto err_level;
+
+	/* Add all observed rmap records. */
+	error = xfs_btree_bload(rmap_cur, &rr->new_btree.bload, rr);
+	if (error)
+		goto err_level;
+
+	/*
+	 * Install the new btree in the AG header.  After this point the old
+	 * btree is no longer accessible and the new tree is live.
+	 */
+	xfs_rmapbt_commit_staged_btree(rmap_cur, sc->tp, sc->sa.agf_bp);
+	xfs_btree_del_cursor(rmap_cur, 0);
+	xfs_btree_del_cursor(rr->mcur, 0);
+	rr->mcur = NULL;
+
+	/*
+	 * Now that we've written the new btree to disk, we don't need to keep
+	 * updating the in-memory btree.  Abort the scan to stop live updates.
+	 */
+	xchk_iscan_abort(&rr->iscan);
+
+	/*
+	 * The newly committed rmap recordset includes mappings for the blocks
+	 * that we reserved to build the new btree.  If there is excess space
+	 * reservation to be freed, the corresponding rmap records must also be
+	 * removed.
+	 */
+	rr->new_btree.oinfo = XFS_RMAP_OINFO_AG;
+
+	/* Reset the AGF counters now that we've changed the btree shape. */
+	error = xrep_rmap_reset_counters(rr);
+	if (error)
+		goto err_newbt;
+
+	/* Dispose of any unused blocks and the accounting information. */
+	error = xrep_newbt_commit(&rr->new_btree);
+	if (error)
+		return error;
+
+	return xrep_roll_ag_trans(sc);
+
+err_level:
+	pag->pagf_repair_rmap_level = 0;
+err_mcur:
+	xfs_btree_del_cursor(rr->mcur, error);
+err_cur:
+	xfs_btree_del_cursor(rmap_cur, error);
+err_newbt:
+	xrep_newbt_cancel(&rr->new_btree);
+	return error;
+}
+
+/* Section (IV): Reaping the old btree. */
+
+struct xrep_rmap_find_gaps {
+	struct xagb_bitmap	rmap_gaps;
+	xfs_agblock_t		next_agbno;
+};
+
+/* Subtract each free extent in the bnobt from the rmap gaps. */
+STATIC int
+xrep_rmap_find_freesp(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_alloc_rec_incore *rec,
+	void				*priv)
+{
+	struct xrep_rmap_find_gaps	*rfg = priv;
+
+	return xagb_bitmap_clear(&rfg->rmap_gaps, rec->ar_startblock,
+			rec->ar_blockcount);
+}
+
+/* Record the free space we find, as part of cleaning out the btree. */
+STATIC int
+xrep_rmap_find_gaps(
+	struct xfs_btree_cur		*cur,
+	const struct xfs_rmap_irec	*rec,
+	void				*priv)
+{
+	struct xrep_rmap_find_gaps	*rfg = priv;
+	int				error;
+
+	if (rec->rm_startblock > rfg->next_agbno) {
+		error = xagb_bitmap_set(&rfg->rmap_gaps, rfg->next_agbno,
+				rec->rm_startblock - rfg->next_agbno);
+		if (error)
+			return error;
+	}
+
+	rfg->next_agbno = max_t(xfs_agblock_t, rfg->next_agbno,
+				rec->rm_startblock + rec->rm_blockcount);
+	return 0;
+}
+
+/*
+ * Reap the old rmapbt blocks.  Now that the rmapbt is fully rebuilt, we make
+ * a list of gaps in the rmap records and a list of the extents mentioned in
+ * the bnobt.  Any block that's in the new rmapbt gap list but not mentioned
+ * in the bnobt is a block from the old rmapbt and can be removed.
+ */
+STATIC int
+xrep_rmap_remove_old_tree(
+	struct xrep_rmap	*rr)
+{
+	struct xrep_rmap_find_gaps rfg = {
+		.next_agbno	= 0,
+	};
+	struct xfs_scrub	*sc = rr->sc;
+	struct xfs_agf		*agf = sc->sa.agf_bp->b_addr;
+	struct xfs_perag	*pag = sc->sa.pag;
+	struct xfs_btree_cur	*mcur;
+	xfs_agblock_t		agend;
+	int			error;
+
+	xagb_bitmap_init(&rfg.rmap_gaps);
+
+	/* Compute free space from the new rmapbt. */
+	mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, NULL, &rr->rmap_btree);
+
+	error = xfs_rmap_query_all(mcur, xrep_rmap_find_gaps, &rfg);
+	xfs_btree_del_cursor(mcur, error);
+	if (error)
+		goto out_bitmap;
+
+	/* Insert a record for space between the last rmap and EOAG. */
+	agend = be32_to_cpu(agf->agf_length);
+	if (rfg.next_agbno < agend) {
+		error = xagb_bitmap_set(&rfg.rmap_gaps, rfg.next_agbno,
+				agend - rfg.next_agbno);
+		if (error)
+			goto out_bitmap;
+	}
+
+	/* Compute free space from the existing bnobt. */
+	sc->sa.bno_cur = xfs_bnobt_init_cursor(sc->mp, sc->tp, sc->sa.agf_bp,
+			sc->sa.pag);
+	error = xfs_alloc_query_all(sc->sa.bno_cur, xrep_rmap_find_freesp,
+			&rfg);
+	xfs_btree_del_cursor(sc->sa.bno_cur, error);
+	sc->sa.bno_cur = NULL;
+	if (error)
+		goto out_bitmap;
+
+	/*
+	 * Free the "free" blocks that the new rmapbt knows about but the bnobt
+	 * doesn't--these are the old rmapbt blocks.  Credit the old rmapbt
+	 * block usage count back to the per-AG rmapbt reservation (and not
+	 * fdblocks, since the rmap btree lives in free space) to keep the
+	 * reservation and free space accounting correct.
+	 */
+	error = xrep_reap_agblocks(sc, &rfg.rmap_gaps,
+			&XFS_RMAP_OINFO_ANY_OWNER, XFS_AG_RESV_RMAPBT);
+	if (error)
+		goto out_bitmap;
+
+	/*
+	 * Now that we've zapped all the old rmapbt blocks we can turn off
+	 * the alternate height mechanism and reset the per-AG space
+	 * reservation.
+	 */
+	pag->pagf_repair_rmap_level = 0;
+	sc->flags |= XREP_RESET_PERAG_RESV;
+out_bitmap:
+	xagb_bitmap_destroy(&rfg.rmap_gaps);
+	return error;
+}
+
+static inline bool
+xrep_rmapbt_want_live_update(
+	struct xchk_iscan		*iscan,
+	const struct xfs_owner_info	*oi)
+{
+	if (xchk_iscan_aborted(iscan))
+		return false;
+
+	/*
+	 * Before unlocking the AG header to perform the inode scan, we
+	 * recorded reverse mappings for all AG metadata except for the OWN_AG
+	 * metadata.  IOWs, the in-memory btree knows about the AG headers, the
+	 * two inode btrees, the CoW staging extents, and the refcount btrees.
+	 * For these types of metadata, we need to record the live updates in
+	 * the in-memory rmap btree.
+	 *
+	 * However, we do not scan the free space btrees or the AGFL until we
+	 * have re-locked the AGF and are ready to reserve space for the new
+	 * rmap btree, so we do not want live updates for OWN_AG metadata.
+	 */
+	if (XFS_RMAP_NON_INODE_OWNER(oi->oi_owner))
+		return oi->oi_owner != XFS_RMAP_OWN_AG;
+
+	/* Ignore updates to files that the scanner hasn't visited yet. */
+	return xchk_iscan_want_live_update(iscan, oi->oi_owner);
+}
+
+/*
+ * Apply a rmapbt update from the regular filesystem into our shadow btree.
+ * We're running from the thread that owns the AGF buffer and is generating
+ * the update, so we must be careful about which parts of the struct xrep_rmap
+ * that we change.
+ */
+static int
+xrep_rmapbt_live_update(
+	struct notifier_block		*nb,
+	unsigned long			action,
+	void				*data)
+{
+	struct xfs_rmap_update_params	*p = data;
+	struct xrep_rmap		*rr;
+	struct xfs_mount		*mp;
+	struct xfs_btree_cur		*mcur;
+	struct xfs_trans		*tp;
+	void				*txcookie;
+	int				error;
+
+	rr = container_of(nb, struct xrep_rmap, rhook.rmap_hook.nb);
+	mp = rr->sc->mp;
+
+	if (!xrep_rmapbt_want_live_update(&rr->iscan, &p->oinfo))
+		goto out_unlock;
+
+	trace_xrep_rmap_live_update(mp, rr->sc->sa.pag->pag_agno, action, p);
+
+	error = xrep_trans_alloc_hook_dummy(mp, &txcookie, &tp);
+	if (error)
+		goto out_abort;
+
+	mutex_lock(&rr->lock);
+	mcur = xfs_rmapbt_mem_cursor(rr->sc->sa.pag, tp, &rr->rmap_btree);
+	error = __xfs_rmap_finish_intent(mcur, action, p->startblock,
+			p->blockcount, &p->oinfo, p->unwritten);
+	xfs_btree_del_cursor(mcur, error);
+	if (error)
+		goto out_cancel;
+
+	error = xfbtree_trans_commit(&rr->rmap_btree, tp);
+	if (error)
+		goto out_cancel;
+
+	xrep_trans_cancel_hook_dummy(&txcookie, tp);
+	mutex_unlock(&rr->lock);
+	return NOTIFY_DONE;
+
+out_cancel:
+	xfbtree_trans_cancel(&rr->rmap_btree, tp);
+	xrep_trans_cancel_hook_dummy(&txcookie, tp);
+out_abort:
+	mutex_unlock(&rr->lock);
+	xchk_iscan_abort(&rr->iscan);
+out_unlock:
+	return NOTIFY_DONE;
+}
+
+/* Set up the filesystem scan components. */
+STATIC int
+xrep_rmap_setup_scan(
+	struct xrep_rmap	*rr)
+{
+	struct xfs_scrub	*sc = rr->sc;
+	int			error;
+
+	mutex_init(&rr->lock);
+
+	/* Set up in-memory rmap btree */
+	error = xfs_rmapbt_mem_init(sc->mp, &rr->rmap_btree, sc->xmbtp,
+			sc->sa.pag->pag_agno);
+	if (error)
+		goto out_mutex;
+
+	/* Retry iget every tenth of a second for up to 30 seconds. */
+	xchk_iscan_start(sc, 30000, 100, &rr->iscan);
+
+	/*
+	 * Hook into live rmap operations so that we can update our in-memory
+	 * btree to reflect live changes on the filesystem.  Since we drop the
+	 * AGF buffer to scan all the inodes, we need this piece to avoid
+	 * installing a stale btree.
+	 */
+	ASSERT(sc->flags & XCHK_FSGATES_RMAP);
+	xfs_rmap_hook_setup(&rr->rhook, xrep_rmapbt_live_update);
+	error = xfs_rmap_hook_add(sc->sa.pag, &rr->rhook);
+	if (error)
+		goto out_iscan;
+	return 0;
+
+out_iscan:
+	xchk_iscan_teardown(&rr->iscan);
+	xfbtree_destroy(&rr->rmap_btree);
+out_mutex:
+	mutex_destroy(&rr->lock);
+	return error;
+}
+
+/* Tear down scan components. */
+STATIC void
+xrep_rmap_teardown(
+	struct xrep_rmap	*rr)
+{
+	struct xfs_scrub	*sc = rr->sc;
+
+	xchk_iscan_abort(&rr->iscan);
+	xfs_rmap_hook_del(sc->sa.pag, &rr->rhook);
+	xchk_iscan_teardown(&rr->iscan);
+	xfbtree_destroy(&rr->rmap_btree);
+	mutex_destroy(&rr->lock);
+}
+
+/* Repair the rmap btree for some AG. */
+int
+xrep_rmapbt(
+	struct xfs_scrub	*sc)
+{
+	struct xrep_rmap	*rr = sc->buf;
+	int			error;
+
+	error = xrep_rmap_setup_scan(rr);
+	if (error)
+		return error;
+
+	/*
+	 * Collect rmaps for everything in this AG that isn't space metadata.
+	 * These rmaps won't change even as we try to allocate blocks.
+	 */
+	error = xrep_rmap_find_rmaps(rr);
+	if (error)
+		goto out_records;
+
+	/* Rebuild the rmap information. */
+	error = xrep_rmap_build_new_tree(rr);
+	if (error)
+		goto out_records;
+
+	/* Kill the old tree. */
+	error = xrep_rmap_remove_old_tree(rr);
+	if (error)
+		goto out_records;
+
+out_records:
+	xrep_rmap_teardown(rr);
+	return error;
+}
diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c
index b1ff4f33324a..5055092bd9e8 100644
--- a/fs/xfs/scrub/rtsummary.c
+++ b/fs/xfs/scrub/rtsummary.c
@@ -119,7 +119,7 @@ xfsum_load(
 	xfs_rtsumoff_t		sumoff,
 	union xfs_suminfo_raw	*rawinfo)
 {
-	return xfile_obj_load(sc->xfile, rawinfo,
+	return xfile_load(sc->xfile, rawinfo,
 			sizeof(union xfs_suminfo_raw),
 			sumoff << XFS_WORDLOG);
 }
@@ -130,7 +130,7 @@ xfsum_store(
 	xfs_rtsumoff_t		sumoff,
 	const union xfs_suminfo_raw rawinfo)
 {
-	return xfile_obj_store(sc->xfile, &rawinfo,
+	return xfile_store(sc->xfile, &rawinfo,
 			sizeof(union xfs_suminfo_raw),
 			sumoff << XFS_WORDLOG);
 }
@@ -142,7 +142,7 @@ xfsum_copyout(
 	union xfs_suminfo_raw	*rawinfo,
 	unsigned int		nr_words)
 {
-	return xfile_obj_load(sc->xfile, rawinfo, nr_words << XFS_WORDLOG,
+	return xfile_load(sc->xfile, rawinfo, nr_words << XFS_WORDLOG,
 			sumoff << XFS_WORDLOG);
 }
 
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index caf324c2b991..20fac9723c08 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -15,6 +15,8 @@
 #include "xfs_quota.h"
 #include "xfs_qm.h"
 #include "xfs_scrub.h"
+#include "xfs_buf_mem.h"
+#include "xfs_rmap.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/trace.h"
@@ -157,6 +159,15 @@ xchk_fsgates_disable(
 	if (sc->flags & XCHK_FSGATES_DRAIN)
 		xfs_drain_wait_disable();
 
+	if (sc->flags & XCHK_FSGATES_QUOTA)
+		xfs_dqtrx_hook_disable();
+
+	if (sc->flags & XCHK_FSGATES_DIRENTS)
+		xfs_dir_hook_disable();
+
+	if (sc->flags & XCHK_FSGATES_RMAP)
+		xfs_rmap_hook_disable();
+
 	sc->flags &= ~XCHK_FSGATES_ALL;
 }
 
@@ -184,6 +195,10 @@ xchk_teardown(
 		sc->flags &= ~XCHK_HAVE_FREEZE_PROT;
 		mnt_drop_write_file(sc->file);
 	}
+	if (sc->xmbtp) {
+		xmbuf_free(sc->xmbtp);
+		sc->xmbtp = NULL;
+	}
 	if (sc->xfile) {
 		xfile_destroy(sc->xfile);
 		sc->xfile = NULL;
@@ -267,7 +282,7 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
 		.setup	= xchk_setup_ag_rmapbt,
 		.scrub	= xchk_rmapbt,
 		.has	= xfs_has_rmapbt,
-		.repair	= xrep_notsupported,
+		.repair	= xrep_rmapbt,
 	},
 	[XFS_SCRUB_TYPE_REFCNTBT] = {	/* refcountbt */
 		.type	= ST_PERAG,
@@ -358,7 +373,25 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
 		.type	= ST_FS,
 		.setup	= xchk_setup_fscounters,
 		.scrub	= xchk_fscounters,
-		.repair	= xrep_notsupported,
+		.repair	= xrep_fscounters,
+	},
+	[XFS_SCRUB_TYPE_QUOTACHECK] = {	/* quota counters */
+		.type	= ST_FS,
+		.setup	= xchk_setup_quotacheck,
+		.scrub	= xchk_quotacheck,
+		.repair	= xrep_quotacheck,
+	},
+	[XFS_SCRUB_TYPE_NLINKS] = {	/* inode link counts */
+		.type	= ST_FS,
+		.setup	= xchk_setup_nlinks,
+		.scrub	= xchk_nlinks,
+		.repair	= xrep_nlinks,
+	},
+	[XFS_SCRUB_TYPE_HEALTHY] = {	/* fs healthy; clean all reminders */
+		.type	= ST_FS,
+		.setup	= xchk_setup_fs,
+		.scrub	= xchk_health_record,
+		.repair = xrep_notsupported,
 	},
 };
 
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 7fc50654c4fe..9ad65b604fe1 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -99,6 +99,9 @@ struct xfs_scrub {
 	/* xfile used by the scrubbers; freed at teardown. */
 	struct xfile			*xfile;
 
+	/* buffer target for in-memory btrees; also freed at teardown. */
+	struct xfs_buftarg		*xmbtp;
+
 	/* Lock flags for @ip. */
 	uint				ilock_flags;
 
@@ -121,6 +124,9 @@ struct xfs_scrub {
 #define XCHK_HAVE_FREEZE_PROT	(1U << 1)  /* do we have freeze protection? */
 #define XCHK_FSGATES_DRAIN	(1U << 2)  /* defer ops draining enabled */
 #define XCHK_NEED_DRAIN		(1U << 3)  /* scrub needs to drain defer ops */
+#define XCHK_FSGATES_QUOTA	(1U << 4)  /* quota live update enabled */
+#define XCHK_FSGATES_DIRENTS	(1U << 5)  /* directory live update enabled */
+#define XCHK_FSGATES_RMAP	(1U << 6)  /* rmapbt live update enabled */
 #define XREP_RESET_PERAG_RESV	(1U << 30) /* must reset AG space reservation */
 #define XREP_ALREADY_FIXED	(1U << 31) /* checking our repair work */
 
@@ -130,7 +136,10 @@ struct xfs_scrub {
  * features are gated off via dynamic code patching, which is why the state
  * must be enabled during scrub setup and can only be torn down afterwards.
  */
-#define XCHK_FSGATES_ALL	(XCHK_FSGATES_DRAIN)
+#define XCHK_FSGATES_ALL	(XCHK_FSGATES_DRAIN | \
+				 XCHK_FSGATES_QUOTA | \
+				 XCHK_FSGATES_DIRENTS | \
+				 XCHK_FSGATES_RMAP)
 
 /* Metadata scrubbers */
 int xchk_tester(struct xfs_scrub *sc);
@@ -167,14 +176,21 @@ xchk_rtsummary(struct xfs_scrub *sc)
 #endif
 #ifdef CONFIG_XFS_QUOTA
 int xchk_quota(struct xfs_scrub *sc);
+int xchk_quotacheck(struct xfs_scrub *sc);
 #else
 static inline int
 xchk_quota(struct xfs_scrub *sc)
 {
 	return -ENOENT;
 }
+static inline int
+xchk_quotacheck(struct xfs_scrub *sc)
+{
+	return -ENOENT;
+}
 #endif
 int xchk_fscounters(struct xfs_scrub *sc);
+int xchk_nlinks(struct xfs_scrub *sc);
 
 /* cross-referencing helpers */
 void xchk_xref_is_used_space(struct xfs_scrub *sc, xfs_agblock_t agbno,
diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c
index cd91db4a5548..42cafbed94ac 100644
--- a/fs/xfs/scrub/stats.c
+++ b/fs/xfs/scrub/stats.c
@@ -77,6 +77,8 @@ static const char *name_map[XFS_SCRUB_TYPE_NR] = {
 	[XFS_SCRUB_TYPE_GQUOTA]		= "grpquota",
 	[XFS_SCRUB_TYPE_PQUOTA]		= "prjquota",
 	[XFS_SCRUB_TYPE_FSCOUNTERS]	= "fscounters",
+	[XFS_SCRUB_TYPE_QUOTACHECK]	= "quotacheck",
+	[XFS_SCRUB_TYPE_NLINKS]		= "nlinks",
 };
 
 /* Format the scrub stats into a text buffer, similar to pcp style. */
@@ -329,9 +331,9 @@ xchk_stats_register(
 	if (!cs->cs_debugfs)
 		return;
 
-	debugfs_create_file("stats", 0644, cs->cs_debugfs, cs,
+	debugfs_create_file("stats", 0444, cs->cs_debugfs, cs,
 			&scrub_stats_fops);
-	debugfs_create_file("clear_stats", 0400, cs->cs_debugfs, cs,
+	debugfs_create_file("clear_stats", 0200, cs->cs_debugfs, cs,
 			&clear_scrub_stats_fops);
 }
 
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index ddff86713df3..d77d8a9598f6 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -13,6 +13,7 @@
 #include "xfs_inode.h"
 #include "xfs_symlink.h"
 #include "xfs_health.h"
+#include "xfs_symlink_remote.h"
 #include "scrub/scrub.h"
 #include "scrub/common.h"
 #include "scrub/health.h"
@@ -67,7 +68,7 @@ xchk_symlink(
 	}
 
 	/* Remote symlink; must read the contents. */
-	error = xfs_readlink_bmap_ilocked(sc->ip, sc->buf);
+	error = xfs_symlink_remote_read(sc->ip, sc->buf);
 	if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
 		return error;
 	if (strnlen(sc->buf, XFS_SYMLINK_MAXLEN) < len)
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index d0e24ffaf754..3dd281d6d185 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -16,10 +16,16 @@
 #include "xfs_rtbitmap.h"
 #include "xfs_quota.h"
 #include "xfs_quota_defs.h"
+#include "xfs_da_format.h"
+#include "xfs_dir2.h"
+#include "xfs_rmap.h"
 #include "scrub/scrub.h"
 #include "scrub/xfile.h"
 #include "scrub/xfarray.h"
 #include "scrub/quota.h"
+#include "scrub/iscan.h"
+#include "scrub/nlinks.h"
+#include "scrub/fscounters.h"
 
 /* Figure out which block the btree cursor was pointing to. */
 static inline xfs_fsblock_t
@@ -32,7 +38,7 @@ xchk_btree_cur_fsbno(
 				xfs_buf_daddr(cur->bc_levels[level].bp));
 
 	if (level == cur->bc_nlevels - 1 &&
-	    (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE))
+	    cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
 		return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_ino.ip->i_ino);
 
 	return NULLFSBLOCK;
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 6bbb4e8639dc..5b294be52c55 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -15,11 +15,17 @@
 
 #include <linux/tracepoint.h>
 #include "xfs_bit.h"
+#include "xfs_quota_defs.h"
 
+struct xfs_scrub;
 struct xfile;
 struct xfarray;
 struct xfarray_sortinfo;
 struct xchk_dqiter;
+struct xchk_iscan;
+struct xchk_nlink;
+struct xchk_fscounters;
+struct xfs_rmap_update_params;
 
 /*
  * ftrace's __print_symbolic requires that all enum values be wrapped in the
@@ -27,14 +33,6 @@ struct xchk_dqiter;
  * ring buffer.  Somehow this was only worth mentioning in the ftrace sample
  * code.
  */
-TRACE_DEFINE_ENUM(XFS_BTNUM_BNOi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_CNTi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_BMAPi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_INOi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi);
-
 TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_SHARED);
 TRACE_DEFINE_ENUM(XFS_REFC_DOMAIN_COW);
 
@@ -63,6 +61,9 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_UQUOTA);
 TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_GQUOTA);
 TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PQUOTA);
 TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_QUOTACHECK);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_NLINKS);
+TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_HEALTHY);
 
 #define XFS_SCRUB_TYPE_STRINGS \
 	{ XFS_SCRUB_TYPE_PROBE,		"probe" }, \
@@ -89,7 +90,10 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
 	{ XFS_SCRUB_TYPE_UQUOTA,	"usrquota" }, \
 	{ XFS_SCRUB_TYPE_GQUOTA,	"grpquota" }, \
 	{ XFS_SCRUB_TYPE_PQUOTA,	"prjquota" }, \
-	{ XFS_SCRUB_TYPE_FSCOUNTERS,	"fscounters" }
+	{ XFS_SCRUB_TYPE_FSCOUNTERS,	"fscounters" }, \
+	{ XFS_SCRUB_TYPE_QUOTACHECK,	"quotacheck" }, \
+	{ XFS_SCRUB_TYPE_NLINKS,	"nlinks" }, \
+	{ XFS_SCRUB_TYPE_HEALTHY,	"healthy" }
 
 #define XFS_SCRUB_FLAG_STRINGS \
 	{ XFS_SCRUB_IFLAG_REPAIR,		"repair" }, \
@@ -107,9 +111,21 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
 	{ XCHK_HAVE_FREEZE_PROT,		"nofreeze" }, \
 	{ XCHK_FSGATES_DRAIN,			"fsgates_drain" }, \
 	{ XCHK_NEED_DRAIN,			"need_drain" }, \
+	{ XCHK_FSGATES_QUOTA,			"fsgates_quota" }, \
+	{ XCHK_FSGATES_DIRENTS,			"fsgates_dirents" }, \
+	{ XCHK_FSGATES_RMAP,			"fsgates_rmap" }, \
 	{ XREP_RESET_PERAG_RESV,		"reset_perag_resv" }, \
 	{ XREP_ALREADY_FIXED,			"already_fixed" }
 
+TRACE_DEFINE_ENUM(XFS_RMAP_MAP);
+TRACE_DEFINE_ENUM(XFS_RMAP_MAP_SHARED);
+TRACE_DEFINE_ENUM(XFS_RMAP_UNMAP);
+TRACE_DEFINE_ENUM(XFS_RMAP_UNMAP_SHARED);
+TRACE_DEFINE_ENUM(XFS_RMAP_CONVERT);
+TRACE_DEFINE_ENUM(XFS_RMAP_CONVERT_SHARED);
+TRACE_DEFINE_ENUM(XFS_RMAP_ALLOC);
+TRACE_DEFINE_ENUM(XFS_RMAP_FREE);
+
 DECLARE_EVENT_CLASS(xchk_class,
 	TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm,
 		 int error),
@@ -395,6 +411,29 @@ DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_revalidate_bmap);
 DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_bmap);
 DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_incore);
 DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter);
+
+TRACE_EVENT(xchk_qcheck_error,
+	TP_PROTO(struct xfs_scrub *sc, xfs_dqtype_t dqtype, xfs_dqid_t id,
+		 void *ret_ip),
+	TP_ARGS(sc, dqtype, id, ret_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_dqtype_t, dqtype)
+		__field(xfs_dqid_t, id)
+		__field(void *, ret_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = sc->mp->m_super->s_dev;
+		__entry->dqtype = dqtype;
+		__entry->id = id;
+		__entry->ret_ip = ret_ip;
+	),
+	TP_printk("dev %d:%d dquot type %s id 0x%x ret_ip %pS",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __print_symbolic(__entry->dqtype, XFS_DQTYPE_STRINGS),
+		  __entry->id,
+		  __entry->ret_ip)
+);
 #endif /* CONFIG_XFS_QUOTA */
 
 TRACE_EVENT(xchk_incomplete,
@@ -423,7 +462,7 @@ TRACE_EVENT(xchk_btree_op_error,
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(unsigned int, type)
-		__field(xfs_btnum_t, btnum)
+		__string(name, cur->bc_ops->name)
 		__field(int, level)
 		__field(xfs_agnumber_t, agno)
 		__field(xfs_agblock_t, bno)
@@ -436,7 +475,7 @@ TRACE_EVENT(xchk_btree_op_error,
 
 		__entry->dev = sc->mp->m_super->s_dev;
 		__entry->type = sc->sm->sm_type;
-		__entry->btnum = cur->bc_btnum;
+		__assign_str(name, cur->bc_ops->name);
 		__entry->level = level;
 		__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
 		__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
@@ -444,10 +483,10 @@ TRACE_EVENT(xchk_btree_op_error,
 		__entry->error = error;
 		__entry->ret_ip = ret_ip;
 	),
-	TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS",
+	TP_printk("dev %d:%d type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
-		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+		  __get_str(name),
 		  __entry->level,
 		  __entry->ptr,
 		  __entry->agno,
@@ -465,7 +504,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error,
 		__field(xfs_ino_t, ino)
 		__field(int, whichfork)
 		__field(unsigned int, type)
-		__field(xfs_btnum_t, btnum)
+		__string(name, cur->bc_ops->name)
 		__field(int, level)
 		__field(int, ptr)
 		__field(xfs_agnumber_t, agno)
@@ -479,7 +518,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error,
 		__entry->ino = sc->ip->i_ino;
 		__entry->whichfork = cur->bc_ino.whichfork;
 		__entry->type = sc->sm->sm_type;
-		__entry->btnum = cur->bc_btnum;
+		__assign_str(name, cur->bc_ops->name);
 		__entry->level = level;
 		__entry->ptr = cur->bc_levels[level].ptr;
 		__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
@@ -487,12 +526,12 @@ TRACE_EVENT(xchk_ifork_btree_op_error,
 		__entry->error = error;
 		__entry->ret_ip = ret_ip;
 	),
-	TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS",
+	TP_printk("dev %d:%d ino 0x%llx fork %s type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x error %d ret_ip %pS",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
 		  __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
-		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+		  __get_str(name),
 		  __entry->level,
 		  __entry->ptr,
 		  __entry->agno,
@@ -508,7 +547,7 @@ TRACE_EVENT(xchk_btree_error,
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(unsigned int, type)
-		__field(xfs_btnum_t, btnum)
+		__string(name, cur->bc_ops->name)
 		__field(int, level)
 		__field(xfs_agnumber_t, agno)
 		__field(xfs_agblock_t, bno)
@@ -519,17 +558,17 @@ TRACE_EVENT(xchk_btree_error,
 		xfs_fsblock_t fsbno = xchk_btree_cur_fsbno(cur, level);
 		__entry->dev = sc->mp->m_super->s_dev;
 		__entry->type = sc->sm->sm_type;
-		__entry->btnum = cur->bc_btnum;
+		__assign_str(name, cur->bc_ops->name);
 		__entry->level = level;
 		__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
 		__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
 		__entry->ptr = cur->bc_levels[level].ptr;
 		__entry->ret_ip = ret_ip;
 	),
-	TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
+	TP_printk("dev %d:%d type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
-		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+		  __get_str(name),
 		  __entry->level,
 		  __entry->ptr,
 		  __entry->agno,
@@ -546,7 +585,7 @@ TRACE_EVENT(xchk_ifork_btree_error,
 		__field(xfs_ino_t, ino)
 		__field(int, whichfork)
 		__field(unsigned int, type)
-		__field(xfs_btnum_t, btnum)
+		__string(name, cur->bc_ops->name)
 		__field(int, level)
 		__field(xfs_agnumber_t, agno)
 		__field(xfs_agblock_t, bno)
@@ -559,19 +598,19 @@ TRACE_EVENT(xchk_ifork_btree_error,
 		__entry->ino = sc->ip->i_ino;
 		__entry->whichfork = cur->bc_ino.whichfork;
 		__entry->type = sc->sm->sm_type;
-		__entry->btnum = cur->bc_btnum;
+		__assign_str(name, cur->bc_ops->name);
 		__entry->level = level;
 		__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
 		__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
 		__entry->ptr = cur->bc_levels[level].ptr;
 		__entry->ret_ip = ret_ip;
 	),
-	TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
+	TP_printk("dev %d:%d ino 0x%llx fork %s type %s %sbt level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->ino,
 		  __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
 		  __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
-		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+		  __get_str(name),
 		  __entry->level,
 		  __entry->ptr,
 		  __entry->agno,
@@ -586,7 +625,7 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class,
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(int, type)
-		__field(xfs_btnum_t, btnum)
+		__string(name, cur->bc_ops->name)
 		__field(xfs_agnumber_t, agno)
 		__field(xfs_agblock_t, bno)
 		__field(int, level)
@@ -598,17 +637,17 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class,
 
 		__entry->dev = sc->mp->m_super->s_dev;
 		__entry->type = sc->sm->sm_type;
-		__entry->btnum = cur->bc_btnum;
+		__assign_str(name, cur->bc_ops->name);
 		__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno);
 		__entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
 		__entry->level = level;
 		__entry->nlevels = cur->bc_nlevels;
 		__entry->ptr = cur->bc_levels[level].ptr;
 	),
-	TP_printk("dev %d:%d type %s btree %s agno 0x%x agbno 0x%x level %d nlevels %d ptr %d",
+	TP_printk("dev %d:%d type %s %sbt agno 0x%x agbno 0x%x level %d nlevels %d ptr %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS),
-		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+		  __get_str(name),
 		  __entry->agno,
 		  __entry->bno,
 		  __entry->level,
@@ -861,18 +900,11 @@ TRACE_EVENT(xfile_destroy,
 		__field(loff_t, size)
 	),
 	TP_fast_assign(
-		struct xfile_stat	statbuf;
-		int			ret;
+		struct inode		*inode = file_inode(xf->file);
 
-		ret = xfile_stat(xf, &statbuf);
-		if (!ret) {
-			__entry->bytes = statbuf.bytes;
-			__entry->size = statbuf.size;
-		} else {
-			__entry->bytes = -1;
-			__entry->size = -1;
-		}
-		__entry->ino = file_inode(xf->file)->i_ino;
+		__entry->ino = inode->i_ino;
+		__entry->bytes = inode->i_blocks << SECTOR_SHIFT;
+		__entry->size = i_size_read(inode);
 	),
 	TP_printk("xfino 0x%lx mem_bytes 0x%llx isize 0x%llx",
 		  __entry->ino,
@@ -891,19 +923,12 @@ DECLARE_EVENT_CLASS(xfile_class,
 		__field(unsigned long long, bytecount)
 	),
 	TP_fast_assign(
-		struct xfile_stat	statbuf;
-		int			ret;
+		struct inode		*inode = file_inode(xf->file);
 
-		ret = xfile_stat(xf, &statbuf);
-		if (!ret) {
-			__entry->bytes_used = statbuf.bytes;
-			__entry->size = statbuf.size;
-		} else {
-			__entry->bytes_used = -1;
-			__entry->size = -1;
-		}
-		__entry->ino = file_inode(xf->file)->i_ino;
+		__entry->ino = inode->i_ino;
+		__entry->bytes_used = inode->i_blocks << SECTOR_SHIFT;
 		__entry->pos = pos;
+		__entry->size = i_size_read(inode);
 		__entry->bytecount = bytecount;
 	),
 	TP_printk("xfino 0x%lx mem_bytes 0x%llx pos 0x%llx bytecount 0x%llx isize 0x%llx",
@@ -917,11 +942,11 @@ DECLARE_EVENT_CLASS(xfile_class,
 DEFINE_EVENT(xfile_class, name, \
 	TP_PROTO(struct xfile *xf, loff_t pos, unsigned long long bytecount), \
 	TP_ARGS(xf, pos, bytecount))
-DEFINE_XFILE_EVENT(xfile_pread);
-DEFINE_XFILE_EVENT(xfile_pwrite);
+DEFINE_XFILE_EVENT(xfile_load);
+DEFINE_XFILE_EVENT(xfile_store);
 DEFINE_XFILE_EVENT(xfile_seek_data);
-DEFINE_XFILE_EVENT(xfile_get_page);
-DEFINE_XFILE_EVENT(xfile_put_page);
+DEFINE_XFILE_EVENT(xfile_get_folio);
+DEFINE_XFILE_EVENT(xfile_put_folio);
 
 TRACE_EVENT(xfarray_create,
 	TP_PROTO(struct xfarray *xfa, unsigned long long required_capacity),
@@ -968,7 +993,7 @@ TRACE_EVENT(xfarray_isort,
 		  __entry->hi - __entry->lo)
 );
 
-TRACE_EVENT(xfarray_pagesort,
+TRACE_EVENT(xfarray_foliosort,
 	TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi),
 	TP_ARGS(si, lo, hi),
 	TP_STRUCT__entry(
@@ -1039,6 +1064,47 @@ TRACE_EVENT(xfarray_sort,
 		  __entry->bytes)
 );
 
+TRACE_EVENT(xfarray_sort_scan,
+	TP_PROTO(struct xfarray_sortinfo *si, unsigned long long idx),
+	TP_ARGS(si, idx),
+	TP_STRUCT__entry(
+		__field(unsigned long, ino)
+		__field(unsigned long long, nr)
+		__field(size_t, obj_size)
+		__field(unsigned long long, idx)
+		__field(unsigned long long, folio_pos)
+		__field(unsigned long, folio_bytes)
+		__field(unsigned long long, first_idx)
+		__field(unsigned long long, last_idx)
+	),
+	TP_fast_assign(
+		__entry->nr = si->array->nr;
+		__entry->obj_size = si->array->obj_size;
+		__entry->ino = file_inode(si->array->xfile->file)->i_ino;
+		__entry->idx = idx;
+		if (si->folio) {
+			__entry->folio_pos = folio_pos(si->folio);
+			__entry->folio_bytes = folio_size(si->folio);
+			__entry->first_idx = si->first_folio_idx;
+			__entry->last_idx = si->last_folio_idx;
+		} else {
+			__entry->folio_pos = 0;
+			__entry->folio_bytes = 0;
+			__entry->first_idx = 0;
+			__entry->last_idx = 0;
+		}
+	),
+	TP_printk("xfino 0x%lx nr %llu objsz %zu idx %llu folio_pos 0x%llx folio_bytes 0x%lx first_idx %llu last_idx %llu",
+		  __entry->ino,
+		  __entry->nr,
+		  __entry->obj_size,
+		  __entry->idx,
+		  __entry->folio_pos,
+		  __entry->folio_bytes,
+		  __entry->first_idx,
+		  __entry->last_idx)
+);
+
 TRACE_EVENT(xfarray_sort_stats,
 	TP_PROTO(struct xfarray_sortinfo *si, int error),
 	TP_ARGS(si, error),
@@ -1119,6 +1185,323 @@ TRACE_EVENT(xchk_rtsum_record_free,
 );
 #endif /* CONFIG_XFS_RT */
 
+DECLARE_EVENT_CLASS(xchk_iscan_class,
+	TP_PROTO(struct xchk_iscan *iscan),
+	TP_ARGS(iscan),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, cursor)
+		__field(xfs_ino_t, visited)
+	),
+	TP_fast_assign(
+		__entry->dev = iscan->sc->mp->m_super->s_dev;
+		__entry->cursor = iscan->cursor_ino;
+		__entry->visited = iscan->__visited_ino;
+	),
+	TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->cursor,
+		  __entry->visited)
+)
+#define DEFINE_ISCAN_EVENT(name) \
+DEFINE_EVENT(xchk_iscan_class, name, \
+	TP_PROTO(struct xchk_iscan *iscan), \
+	TP_ARGS(iscan))
+DEFINE_ISCAN_EVENT(xchk_iscan_move_cursor);
+DEFINE_ISCAN_EVENT(xchk_iscan_visit);
+DEFINE_ISCAN_EVENT(xchk_iscan_skip);
+DEFINE_ISCAN_EVENT(xchk_iscan_advance_ag);
+
+DECLARE_EVENT_CLASS(xchk_iscan_ino_class,
+	TP_PROTO(struct xchk_iscan *iscan, xfs_ino_t ino),
+	TP_ARGS(iscan, ino),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, startino)
+		__field(xfs_ino_t, cursor)
+		__field(xfs_ino_t, visited)
+		__field(xfs_ino_t, ino)
+	),
+	TP_fast_assign(
+		__entry->dev = iscan->sc->mp->m_super->s_dev;
+		__entry->startino = iscan->scan_start_ino;
+		__entry->cursor = iscan->cursor_ino;
+		__entry->visited = iscan->__visited_ino;
+		__entry->ino = ino;
+	),
+	TP_printk("dev %d:%d iscan start 0x%llx cursor 0x%llx visited 0x%llx ino 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->startino,
+		  __entry->cursor,
+		  __entry->visited,
+		  __entry->ino)
+)
+#define DEFINE_ISCAN_INO_EVENT(name) \
+DEFINE_EVENT(xchk_iscan_ino_class, name, \
+	TP_PROTO(struct xchk_iscan *iscan, xfs_ino_t ino), \
+	TP_ARGS(iscan, ino))
+DEFINE_ISCAN_INO_EVENT(xchk_iscan_want_live_update);
+DEFINE_ISCAN_INO_EVENT(xchk_iscan_start);
+
+TRACE_EVENT(xchk_iscan_iget,
+	TP_PROTO(struct xchk_iscan *iscan, int error),
+	TP_ARGS(iscan, error),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, cursor)
+		__field(xfs_ino_t, visited)
+		__field(int, error)
+	),
+	TP_fast_assign(
+		__entry->dev = iscan->sc->mp->m_super->s_dev;
+		__entry->cursor = iscan->cursor_ino;
+		__entry->visited = iscan->__visited_ino;
+		__entry->error = error;
+	),
+	TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx error %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->cursor,
+		  __entry->visited,
+		  __entry->error)
+);
+
+TRACE_EVENT(xchk_iscan_iget_batch,
+	TP_PROTO(struct xfs_mount *mp, struct xchk_iscan *iscan,
+		 unsigned int nr, unsigned int avail),
+	TP_ARGS(mp, iscan, nr, avail),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, cursor)
+		__field(xfs_ino_t, visited)
+		__field(unsigned int, nr)
+		__field(unsigned int, avail)
+		__field(unsigned int, unavail)
+		__field(xfs_ino_t, batch_ino)
+		__field(unsigned long long, skipmask)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->cursor = iscan->cursor_ino;
+		__entry->visited = iscan->__visited_ino;
+		__entry->nr = nr;
+		__entry->avail = avail;
+		__entry->unavail = hweight64(iscan->__skipped_inomask);
+		__entry->batch_ino = iscan->__batch_ino;
+		__entry->skipmask = iscan->__skipped_inomask;
+	),
+	TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx batchino 0x%llx skipmask 0x%llx nr %u avail %u unavail %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->cursor,
+		  __entry->visited,
+		  __entry->batch_ino,
+		  __entry->skipmask,
+		  __entry->nr,
+		  __entry->avail,
+		  __entry->unavail)
+);
+
+TRACE_EVENT(xchk_iscan_iget_retry_wait,
+	TP_PROTO(struct xchk_iscan *iscan),
+	TP_ARGS(iscan),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, cursor)
+		__field(xfs_ino_t, visited)
+		__field(unsigned int, retry_delay)
+		__field(unsigned long, remaining)
+		__field(unsigned int, iget_timeout)
+	),
+	TP_fast_assign(
+		__entry->dev = iscan->sc->mp->m_super->s_dev;
+		__entry->cursor = iscan->cursor_ino;
+		__entry->visited = iscan->__visited_ino;
+		__entry->retry_delay = iscan->iget_retry_delay;
+		__entry->remaining = jiffies_to_msecs(iscan->__iget_deadline - jiffies);
+		__entry->iget_timeout = iscan->iget_timeout;
+	),
+	TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx remaining %lu timeout %u delay %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->cursor,
+		  __entry->visited,
+		  __entry->remaining,
+		  __entry->iget_timeout,
+		  __entry->retry_delay)
+);
+
+TRACE_EVENT(xchk_nlinks_collect_dirent,
+	TP_PROTO(struct xfs_mount *mp, struct xfs_inode *dp,
+		 xfs_ino_t ino, const struct xfs_name *name),
+	TP_ARGS(mp, dp, ino, name),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, dir)
+		__field(xfs_ino_t, ino)
+		__field(unsigned int, namelen)
+		__dynamic_array(char, name, name->len)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->dir = dp->i_ino;
+		__entry->ino = ino;
+		__entry->namelen = name->len;
+		memcpy(__get_str(name), name->name, name->len);
+	),
+	TP_printk("dev %d:%d dir 0x%llx -> ino 0x%llx name '%.*s'",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->dir,
+		  __entry->ino,
+		  __entry->namelen,
+		  __get_str(name))
+);
+
+TRACE_EVENT(xchk_nlinks_collect_metafile,
+	TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino),
+	TP_ARGS(mp, ino),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->ino = ino;
+	),
+	TP_printk("dev %d:%d ino 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino)
+);
+
+TRACE_EVENT(xchk_nlinks_live_update,
+	TP_PROTO(struct xfs_mount *mp, const struct xfs_inode *dp,
+		 int action, xfs_ino_t ino, int delta,
+		 const char *name, unsigned int namelen),
+	TP_ARGS(mp, dp, action, ino, delta, name, namelen),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, dir)
+		__field(int, action)
+		__field(xfs_ino_t, ino)
+		__field(int, delta)
+		__field(unsigned int, namelen)
+		__dynamic_array(char, name, namelen)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->dir = dp ? dp->i_ino : NULLFSINO;
+		__entry->action = action;
+		__entry->ino = ino;
+		__entry->delta = delta;
+		__entry->namelen = namelen;
+		memcpy(__get_str(name), name, namelen);
+	),
+	TP_printk("dev %d:%d dir 0x%llx ino 0x%llx nlink_delta %d name '%.*s'",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->dir,
+		  __entry->ino,
+		  __entry->delta,
+		  __entry->namelen,
+		  __get_str(name))
+);
+
+TRACE_EVENT(xchk_nlinks_check_zero,
+	TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino,
+		 const struct xchk_nlink *live),
+	TP_ARGS(mp, ino, live),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_nlink_t, parents)
+		__field(xfs_nlink_t, backrefs)
+		__field(xfs_nlink_t, children)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->ino = ino;
+		__entry->parents = live->parents;
+		__entry->backrefs = live->backrefs;
+		__entry->children = live->children;
+	),
+	TP_printk("dev %d:%d ino 0x%llx parents %u backrefs %u children %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->parents,
+		  __entry->backrefs,
+		  __entry->children)
+);
+
+TRACE_EVENT(xchk_nlinks_update_incore,
+	TP_PROTO(struct xfs_mount *mp, xfs_ino_t ino,
+		 const struct xchk_nlink *live, int parents_delta,
+		 int backrefs_delta, int children_delta),
+	TP_ARGS(mp, ino, live, parents_delta, backrefs_delta, children_delta),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_nlink_t, parents)
+		__field(xfs_nlink_t, backrefs)
+		__field(xfs_nlink_t, children)
+		__field(int, parents_delta)
+		__field(int, backrefs_delta)
+		__field(int, children_delta)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->ino = ino;
+		__entry->parents = live->parents;
+		__entry->backrefs = live->backrefs;
+		__entry->children = live->children;
+		__entry->parents_delta = parents_delta;
+		__entry->backrefs_delta = backrefs_delta;
+		__entry->children_delta = children_delta;
+	),
+	TP_printk("dev %d:%d ino 0x%llx parents %d:%u backrefs %d:%u children %d:%u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->parents_delta,
+		  __entry->parents,
+		  __entry->backrefs_delta,
+		  __entry->backrefs,
+		  __entry->children_delta,
+		  __entry->children)
+);
+
+DECLARE_EVENT_CLASS(xchk_nlinks_diff_class,
+	TP_PROTO(struct xfs_mount *mp, struct xfs_inode *ip,
+		 const struct xchk_nlink *live),
+	TP_ARGS(mp, ip, live),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(uint8_t, ftype)
+		__field(xfs_nlink_t, nlink)
+		__field(xfs_nlink_t, parents)
+		__field(xfs_nlink_t, backrefs)
+		__field(xfs_nlink_t, children)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->ftype = xfs_mode_to_ftype(VFS_I(ip)->i_mode);
+		__entry->nlink = VFS_I(ip)->i_nlink;
+		__entry->parents = live->parents;
+		__entry->backrefs = live->backrefs;
+		__entry->children = live->children;
+	),
+	TP_printk("dev %d:%d ino 0x%llx ftype %s nlink %u parents %u backrefs %u children %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR),
+		  __entry->nlink,
+		  __entry->parents,
+		  __entry->backrefs,
+		  __entry->children)
+);
+#define DEFINE_SCRUB_NLINKS_DIFF_EVENT(name) \
+DEFINE_EVENT(xchk_nlinks_diff_class, name, \
+	TP_PROTO(struct xfs_mount *mp, struct xfs_inode *ip, \
+		 const struct xchk_nlink *live), \
+	TP_ARGS(mp, ip, live))
+DEFINE_SCRUB_NLINKS_DIFF_EVENT(xchk_nlinks_compare_inode);
+
 /* repair tracepoints */
 #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
 
@@ -1223,7 +1606,6 @@ DEFINE_EVENT(xrep_rmap_class, name, \
 		 uint64_t owner, uint64_t offset, unsigned int flags), \
 	TP_ARGS(mp, agno, agbno, len, owner, offset, flags))
 DEFINE_REPAIR_RMAP_EVENT(xrep_ibt_walk_rmap);
-DEFINE_REPAIR_RMAP_EVENT(xrep_rmap_extent_fn);
 DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_walk_rmap);
 
 TRACE_EVENT(xrep_abt_found,
@@ -1341,6 +1723,38 @@ TRACE_EVENT(xrep_bmap_found,
 		  __entry->state)
 );
 
+TRACE_EVENT(xrep_rmap_found,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 const struct xfs_rmap_irec *rec),
+	TP_ARGS(mp, agno, rec),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, agbno)
+		__field(xfs_extlen_t, len)
+		__field(uint64_t, owner)
+		__field(uint64_t, offset)
+		__field(unsigned int, flags)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->agbno = rec->rm_startblock;
+		__entry->len = rec->rm_blockcount;
+		__entry->owner = rec->rm_owner;
+		__entry->offset = rec->rm_offset;
+		__entry->flags = rec->rm_flags;
+	),
+	TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->agbno,
+		  __entry->len,
+		  __entry->owner,
+		  __entry->offset,
+		  __entry->flags)
+);
+
 TRACE_EVENT(xrep_findroot_block,
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
 		 uint32_t magic, uint16_t level),
@@ -1425,16 +1839,28 @@ TRACE_EVENT(xrep_calc_ag_resblks_btsize,
 		  __entry->refcbt_sz)
 )
 TRACE_EVENT(xrep_reset_counters,
-	TP_PROTO(struct xfs_mount *mp),
-	TP_ARGS(mp),
+	TP_PROTO(struct xfs_mount *mp, struct xchk_fscounters *fsc),
+	TP_ARGS(mp, fsc),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
+		__field(uint64_t, icount)
+		__field(uint64_t, ifree)
+		__field(uint64_t, fdblocks)
+		__field(uint64_t, frextents)
 	),
 	TP_fast_assign(
 		__entry->dev = mp->m_super->s_dev;
+		__entry->icount = fsc->icount;
+		__entry->ifree = fsc->ifree;
+		__entry->fdblocks = fsc->fdblocks;
+		__entry->frextents = fsc->frextents;
 	),
-	TP_printk("dev %d:%d",
-		  MAJOR(__entry->dev), MINOR(__entry->dev))
+	TP_printk("dev %d:%d icount %llu ifree %llu fdblocks %llu frextents %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->icount,
+		  __entry->ifree,
+		  __entry->fdblocks,
+		  __entry->frextents)
 )
 
 DECLARE_EVENT_CLASS(xrep_newbt_extent_class,
@@ -1645,6 +2071,55 @@ TRACE_EVENT(xrep_dinode_count_rmaps,
 		  __entry->attr_extents)
 );
 
+TRACE_EVENT(xrep_dinode_findmode_dirent,
+	TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *dp,
+		 unsigned int ftype),
+	TP_ARGS(sc, dp, ftype),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_ino_t, parent_ino)
+		__field(unsigned int, ftype)
+	),
+	TP_fast_assign(
+		__entry->dev = sc->mp->m_super->s_dev;
+		__entry->ino = sc->sm->sm_ino;
+		__entry->parent_ino = dp->i_ino;
+		__entry->ftype = ftype;
+	),
+	TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx ftype '%s'",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->parent_ino,
+		  __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR))
+);
+
+TRACE_EVENT(xrep_dinode_findmode_dirent_inval,
+	TP_PROTO(struct xfs_scrub *sc, struct xfs_inode *dp,
+		 unsigned int ftype, unsigned int found_ftype),
+	TP_ARGS(sc, dp, ftype, found_ftype),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_ino_t, parent_ino)
+		__field(unsigned int, ftype)
+		__field(unsigned int, found_ftype)
+	),
+	TP_fast_assign(
+		__entry->dev = sc->mp->m_super->s_dev;
+		__entry->ino = sc->sm->sm_ino;
+		__entry->parent_ino = dp->i_ino;
+		__entry->ftype = ftype;
+		__entry->found_ftype = found_ftype;
+	),
+	TP_printk("dev %d:%d ino 0x%llx parent_ino 0x%llx ftype '%s' found_ftype '%s'",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->parent_ino,
+		  __print_symbolic(__entry->ftype, XFS_DIR3_FTYPE_STR),
+		  __print_symbolic(__entry->found_ftype, XFS_DIR3_FTYPE_STR))
+);
+
 TRACE_EVENT(xrep_cow_mark_file_range,
 	TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t startblock,
 		 xfs_fileoff_t startoff, xfs_filblks_t blockcount),
@@ -1756,8 +2231,48 @@ DEFINE_EVENT(xrep_dquot_class, name, \
 DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item);
 DEFINE_XREP_DQUOT_EVENT(xrep_disk_dquot);
 DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item_fill_bmap_hole);
+DEFINE_XREP_DQUOT_EVENT(xrep_quotacheck_dquot);
 #endif /* CONFIG_XFS_QUOTA */
 
+DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_update_inode);
+DEFINE_SCRUB_NLINKS_DIFF_EVENT(xrep_nlinks_unfixable_inode);
+
+TRACE_EVENT(xrep_rmap_live_update,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, unsigned int op,
+		 const struct xfs_rmap_update_params *p),
+	TP_ARGS(mp, agno, op, p),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(unsigned int, op)
+		__field(xfs_agblock_t, agbno)
+		__field(xfs_extlen_t, len)
+		__field(uint64_t, owner)
+		__field(uint64_t, offset)
+		__field(unsigned int, flags)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->op = op;
+		__entry->agbno = p->startblock;
+		__entry->len = p->blockcount;
+		xfs_owner_info_unpack(&p->oinfo, &__entry->owner,
+				&__entry->offset, &__entry->flags);
+		if (p->unwritten)
+			__entry->flags |= XFS_RMAP_UNWRITTEN;
+	),
+	TP_printk("dev %d:%d agno 0x%x op %d agbno 0x%x fsbcount 0x%x owner 0x%llx fileoff 0x%llx flags 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->op,
+		  __entry->agbno,
+		  __entry->len,
+		  __entry->owner,
+		  __entry->offset,
+		  __entry->flags)
+);
+
 #endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
 
 #endif /* _TRACE_XFS_SCRUB_TRACE_H */
diff --git a/fs/xfs/scrub/xfarray.c b/fs/xfs/scrub/xfarray.c
index f0f532c10a5a..17c982a4821d 100644
--- a/fs/xfs/scrub/xfarray.c
+++ b/fs/xfs/scrub/xfarray.c
@@ -16,7 +16,7 @@
  * Large Arrays of Fixed-Size Records
  * ==================================
  *
- * This memory array uses an xfile (which itself is a memfd "file") to store
+ * This memory array uses an xfile (which itself is a shmem file) to store
  * large numbers of fixed-size records in memory that can be paged out.  This
  * puts less stress on the memory reclaim algorithms during an online repair
  * because we don't have to pin so much memory.  However, array access is less
@@ -136,7 +136,7 @@ xfarray_load(
 	if (idx >= array->nr)
 		return -ENODATA;
 
-	return xfile_obj_load(array->xfile, ptr, array->obj_size,
+	return xfile_load(array->xfile, ptr, array->obj_size,
 			xfarray_pos(array, idx));
 }
 
@@ -152,7 +152,7 @@ xfarray_is_unset(
 	if (array->unset_slots == 0)
 		return false;
 
-	error = xfile_obj_load(array->xfile, temp, array->obj_size, pos);
+	error = xfile_load(array->xfile, temp, array->obj_size, pos);
 	if (!error && xfarray_element_is_null(array, temp))
 		return true;
 
@@ -184,7 +184,7 @@ xfarray_unset(
 		return 0;
 
 	memset(temp, 0, array->obj_size);
-	error = xfile_obj_store(array->xfile, temp, array->obj_size, pos);
+	error = xfile_store(array->xfile, temp, array->obj_size, pos);
 	if (error)
 		return error;
 
@@ -209,7 +209,7 @@ xfarray_store(
 
 	ASSERT(!xfarray_element_is_null(array, ptr));
 
-	ret = xfile_obj_store(array->xfile, ptr, array->obj_size,
+	ret = xfile_store(array->xfile, ptr, array->obj_size,
 			xfarray_pos(array, idx));
 	if (ret)
 		return ret;
@@ -245,12 +245,12 @@ xfarray_store_anywhere(
 	for (pos = 0;
 	     pos < endpos && array->unset_slots > 0;
 	     pos += array->obj_size) {
-		error = xfile_obj_load(array->xfile, temp, array->obj_size,
+		error = xfile_load(array->xfile, temp, array->obj_size,
 				pos);
 		if (error || !xfarray_element_is_null(array, temp))
 			continue;
 
-		error = xfile_obj_store(array->xfile, ptr, array->obj_size,
+		error = xfile_store(array->xfile, ptr, array->obj_size,
 				pos);
 		if (error)
 			return error;
@@ -552,7 +552,7 @@ xfarray_isort(
 	trace_xfarray_isort(si, lo, hi);
 
 	xfarray_sort_bump_loads(si);
-	error = xfile_obj_load(si->array->xfile, scratch, len, lo_pos);
+	error = xfile_load(si->array->xfile, scratch, len, lo_pos);
 	if (error)
 		return error;
 
@@ -560,88 +560,45 @@ xfarray_isort(
 	sort(scratch, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL);
 
 	xfarray_sort_bump_stores(si);
-	return xfile_obj_store(si->array->xfile, scratch, len, lo_pos);
+	return xfile_store(si->array->xfile, scratch, len, lo_pos);
 }
 
-/* Grab a page for sorting records. */
-static inline int
-xfarray_sort_get_page(
-	struct xfarray_sortinfo	*si,
-	loff_t			pos,
-	uint64_t		len)
-{
-	int			error;
-
-	error = xfile_get_page(si->array->xfile, pos, len, &si->xfpage);
-	if (error)
-		return error;
-
-	/*
-	 * xfile pages must never be mapped into userspace, so we skip the
-	 * dcache flush when mapping the page.
-	 */
-	si->page_kaddr = kmap_local_page(si->xfpage.page);
-	return 0;
-}
-
-/* Release a page we grabbed for sorting records. */
-static inline int
-xfarray_sort_put_page(
-	struct xfarray_sortinfo	*si)
-{
-	if (!si->page_kaddr)
-		return 0;
-
-	kunmap_local(si->page_kaddr);
-	si->page_kaddr = NULL;
-
-	return xfile_put_page(si->array->xfile, &si->xfpage);
-}
-
-/* Decide if these records are eligible for in-page sorting. */
-static inline bool
-xfarray_want_pagesort(
-	struct xfarray_sortinfo	*si,
-	xfarray_idx_t		lo,
-	xfarray_idx_t		hi)
-{
-	pgoff_t			lo_page;
-	pgoff_t			hi_page;
-	loff_t			end_pos;
-
-	/* We can only map one page at a time. */
-	lo_page = xfarray_pos(si->array, lo) >> PAGE_SHIFT;
-	end_pos = xfarray_pos(si->array, hi) + si->array->obj_size - 1;
-	hi_page = end_pos >> PAGE_SHIFT;
-
-	return lo_page == hi_page;
-}
-
-/* Sort a bunch of records that all live in the same memory page. */
+/*
+ * Sort the records from lo to hi (inclusive) if they are all backed by the
+ * same memory folio.  Returns 1 if it sorted, 0 if it did not, or a negative
+ * errno.
+ */
 STATIC int
-xfarray_pagesort(
+xfarray_foliosort(
 	struct xfarray_sortinfo	*si,
 	xfarray_idx_t		lo,
 	xfarray_idx_t		hi)
 {
+	struct folio		*folio;
 	void			*startp;
 	loff_t			lo_pos = xfarray_pos(si->array, lo);
-	uint64_t		len = xfarray_pos(si->array, hi - lo);
-	int			error = 0;
+	uint64_t		len = xfarray_pos(si->array, hi - lo + 1);
 
-	trace_xfarray_pagesort(si, lo, hi);
+	/* No single folio could back this many records. */
+	if (len > XFILE_MAX_FOLIO_SIZE)
+		return 0;
 
 	xfarray_sort_bump_loads(si);
-	error = xfarray_sort_get_page(si, lo_pos, len);
-	if (error)
-		return error;
+	folio = xfile_get_folio(si->array->xfile, lo_pos, len, XFILE_ALLOC);
+	if (IS_ERR(folio))
+		return PTR_ERR(folio);
+	if (!folio)
+		return 0;
+
+	trace_xfarray_foliosort(si, lo, hi);
 
 	xfarray_sort_bump_heapsorts(si);
-	startp = si->page_kaddr + offset_in_page(lo_pos);
+	startp = folio_address(folio) + offset_in_folio(folio, lo_pos);
 	sort(startp, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL);
 
 	xfarray_sort_bump_stores(si);
-	return xfarray_sort_put_page(si);
+	xfile_put_folio(si->array->xfile, folio);
+	return 1;
 }
 
 /* Return a pointer to the xfarray pivot record within the sortinfo struct. */
@@ -829,63 +786,78 @@ xfarray_qsort_push(
 	return 0;
 }
 
+static inline void
+xfarray_sort_scan_done(
+	struct xfarray_sortinfo	*si)
+{
+	if (si->folio)
+		xfile_put_folio(si->array->xfile, si->folio);
+	si->folio = NULL;
+}
+
 /*
- * Load an element from the array into the first scratchpad and cache the page,
- * if possible.
+ * Cache the folio backing the start of the given array element.  If the array
+ * element is contained entirely within the folio, return a pointer to the
+ * cached folio.  Otherwise, load the element into the scratchpad and return a
+ * pointer to the scratchpad.
  */
 static inline int
-xfarray_sort_load_cached(
+xfarray_sort_scan(
 	struct xfarray_sortinfo	*si,
 	xfarray_idx_t		idx,
-	void			*ptr)
+	void			**ptrp)
 {
 	loff_t			idx_pos = xfarray_pos(si->array, idx);
-	pgoff_t			startpage;
-	pgoff_t			endpage;
 	int			error = 0;
 
-	/*
-	 * If this load would split a page, release the cached page, if any,
-	 * and perform a traditional read.
-	 */
-	startpage = idx_pos >> PAGE_SHIFT;
-	endpage = (idx_pos + si->array->obj_size - 1) >> PAGE_SHIFT;
-	if (startpage != endpage) {
-		error = xfarray_sort_put_page(si);
-		if (error)
-			return error;
+	if (xfarray_sort_terminated(si, &error))
+		return error;
 
-		if (xfarray_sort_terminated(si, &error))
-			return error;
+	trace_xfarray_sort_scan(si, idx);
 
-		return xfile_obj_load(si->array->xfile, ptr,
-				si->array->obj_size, idx_pos);
-	}
+	/* If the cached folio doesn't cover this index, release it. */
+	if (si->folio &&
+	    (idx < si->first_folio_idx || idx > si->last_folio_idx))
+		xfarray_sort_scan_done(si);
 
-	/* If the cached page is not the one we want, release it. */
-	if (xfile_page_cached(&si->xfpage) &&
-	    xfile_page_index(&si->xfpage) != startpage) {
-		error = xfarray_sort_put_page(si);
-		if (error)
-			return error;
+	/* Grab the first folio that backs this array element. */
+	if (!si->folio) {
+		loff_t		next_pos;
+
+		si->folio = xfile_get_folio(si->array->xfile, idx_pos,
+				si->array->obj_size, XFILE_ALLOC);
+		if (IS_ERR(si->folio))
+			return PTR_ERR(si->folio);
+
+		si->first_folio_idx = xfarray_idx(si->array,
+				folio_pos(si->folio) + si->array->obj_size - 1);
+
+		next_pos = folio_pos(si->folio) + folio_size(si->folio);
+		si->last_folio_idx = xfarray_idx(si->array, next_pos - 1);
+		if (xfarray_pos(si->array, si->last_folio_idx + 1) > next_pos)
+			si->last_folio_idx--;
+
+		trace_xfarray_sort_scan(si, idx);
 	}
 
 	/*
-	 * If we don't have a cached page (and we know the load is contained
-	 * in a single page) then grab it.
+	 * If this folio still doesn't cover the desired element, it must cross
+	 * a folio boundary.  Read into the scratchpad and we're done.
 	 */
-	if (!xfile_page_cached(&si->xfpage)) {
-		if (xfarray_sort_terminated(si, &error))
-			return error;
+	if (idx < si->first_folio_idx || idx > si->last_folio_idx) {
+		void		*temp = xfarray_scratch(si->array);
 
-		error = xfarray_sort_get_page(si, startpage << PAGE_SHIFT,
-				PAGE_SIZE);
+		error = xfile_load(si->array->xfile, temp, si->array->obj_size,
+				idx_pos);
 		if (error)
 			return error;
+
+		*ptrp = temp;
+		return 0;
 	}
 
-	memcpy(ptr, si->page_kaddr + offset_in_page(idx_pos),
-			si->array->obj_size);
+	/* Otherwise return a pointer to the array element in the folio. */
+	*ptrp = folio_address(si->folio) + offset_in_folio(si->folio, idx_pos);
 	return 0;
 }
 
@@ -952,6 +924,8 @@ xfarray_sort(
 	pivot = xfarray_sortinfo_pivot(si);
 
 	while (si->stack_depth >= 0) {
+		int		ret;
+
 		lo = si_lo[si->stack_depth];
 		hi = si_hi[si->stack_depth];
 
@@ -964,13 +938,13 @@ xfarray_sort(
 		}
 
 		/*
-		 * If directly mapping the page and sorting can solve our
+		 * If directly mapping the folio and sorting can solve our
 		 * problems, we're done.
 		 */
-		if (xfarray_want_pagesort(si, lo, hi)) {
-			error = xfarray_pagesort(si, lo, hi);
-			if (error)
-				goto out_free;
+		ret = xfarray_foliosort(si, lo, hi);
+		if (ret < 0)
+			goto out_free;
+		if (ret == 1) {
 			si->stack_depth--;
 			continue;
 		}
@@ -995,25 +969,24 @@ xfarray_sort(
 		 * than the pivot is on the right side of the range.
 		 */
 		while (lo < hi) {
+			void	*p;
+
 			/*
 			 * Decrement hi until it finds an a[hi] less than the
 			 * pivot value.
 			 */
-			error = xfarray_sort_load_cached(si, hi, scratch);
+			error = xfarray_sort_scan(si, hi, &p);
 			if (error)
 				goto out_free;
-			while (xfarray_sort_cmp(si, scratch, pivot) >= 0 &&
-								lo < hi) {
+			while (xfarray_sort_cmp(si, p, pivot) >= 0 && lo < hi) {
 				hi--;
-				error = xfarray_sort_load_cached(si, hi,
-						scratch);
+				error = xfarray_sort_scan(si, hi, &p);
 				if (error)
 					goto out_free;
 			}
-			error = xfarray_sort_put_page(si);
-			if (error)
-				goto out_free;
-
+			if (p != scratch)
+				memcpy(scratch, p, si->array->obj_size);
+			xfarray_sort_scan_done(si);
 			if (xfarray_sort_terminated(si, &error))
 				goto out_free;
 
@@ -1028,21 +1001,18 @@ xfarray_sort(
 			 * Increment lo until it finds an a[lo] greater than
 			 * the pivot value.
 			 */
-			error = xfarray_sort_load_cached(si, lo, scratch);
+			error = xfarray_sort_scan(si, lo, &p);
 			if (error)
 				goto out_free;
-			while (xfarray_sort_cmp(si, scratch, pivot) <= 0 &&
-								lo < hi) {
+			while (xfarray_sort_cmp(si, p, pivot) <= 0 && lo < hi) {
 				lo++;
-				error = xfarray_sort_load_cached(si, lo,
-						scratch);
+				error = xfarray_sort_scan(si, lo, &p);
 				if (error)
 					goto out_free;
 			}
-			error = xfarray_sort_put_page(si);
-			if (error)
-				goto out_free;
-
+			if (p != scratch)
+				memcpy(scratch, p, si->array->obj_size);
+			xfarray_sort_scan_done(si);
 			if (xfarray_sort_terminated(si, &error))
 				goto out_free;
 
diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h
index 62b9c506fdd1..acb2f94c56c1 100644
--- a/fs/xfs/scrub/xfarray.h
+++ b/fs/xfs/scrub/xfarray.h
@@ -45,6 +45,25 @@ int xfarray_store(struct xfarray *array, xfarray_idx_t idx, const void *ptr);
 int xfarray_store_anywhere(struct xfarray *array, const void *ptr);
 bool xfarray_element_is_null(struct xfarray *array, const void *ptr);
 
+/*
+ * Load an array element, but zero the buffer if there's no data because we
+ * haven't stored to that array element yet.
+ */
+static inline int
+xfarray_load_sparse(
+	struct xfarray	*array,
+	uint64_t	idx,
+	void		*rec)
+{
+	int		error = xfarray_load(array, idx, rec);
+
+	if (error == -ENODATA) {
+		memset(rec, 0, array->obj_size);
+		return 0;
+	}
+	return error;
+}
+
 /* Append an element to the array. */
 static inline int xfarray_append(struct xfarray *array, const void *ptr)
 {
@@ -105,9 +124,14 @@ struct xfarray_sortinfo {
 	/* XFARRAY_SORT_* flags; see below. */
 	unsigned int		flags;
 
-	/* Cache a page here for faster access. */
-	struct xfile_page	xfpage;
-	void			*page_kaddr;
+	/* Cache a folio here for faster scanning for pivots */
+	struct folio		*folio;
+
+	/* First array index in folio that is completely readable */
+	xfarray_idx_t		first_folio_idx;
+
+	/* Last array index in folio that is completely readable */
+	xfarray_idx_t		last_folio_idx;
 
 #ifdef DEBUG
 	/* Performance statistics. */
diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c
index 090c3ead43fd..8cdd863db585 100644
--- a/fs/xfs/scrub/xfile.c
+++ b/fs/xfs/scrub/xfile.c
@@ -34,13 +34,6 @@
  * xfiles assume that the caller will handle all required concurrency
  * management; standard vfs locks (freezer and inode) are not taken.  Reads
  * and writes are satisfied directly from the page cache.
- *
- * NOTE: The current shmemfs implementation has a quirk that in-kernel reads
- * of a hole cause a page to be mapped into the file.  If you are going to
- * create a sparse xfile, please be careful about reading from uninitialized
- * parts of the file.  These pages are !Uptodate and will eventually be
- * reclaimed if not written, but in the short term this boosts memory
- * consumption.
  */
 
 /*
@@ -62,38 +55,27 @@ xfile_create(
 {
 	struct inode		*inode;
 	struct xfile		*xf;
-	int			error = -ENOMEM;
+	int			error;
 
 	xf = kmalloc(sizeof(struct xfile), XCHK_GFP_FLAGS);
 	if (!xf)
 		return -ENOMEM;
 
-	xf->file = shmem_file_setup(description, isize, 0);
-	if (!xf->file)
-		goto out_xfile;
+	xf->file = shmem_kernel_file_setup(description, isize, VM_NORESERVE);
 	if (IS_ERR(xf->file)) {
 		error = PTR_ERR(xf->file);
 		goto out_xfile;
 	}
 
-	/*
-	 * We want a large sparse file that we can pread, pwrite, and seek.
-	 * xfile users are responsible for keeping the xfile hidden away from
-	 * all other callers, so we skip timestamp updates and security checks.
-	 * Make the inode only accessible by root, just in case the xfile ever
-	 * escapes.
-	 */
-	xf->file->f_mode |= FMODE_PREAD | FMODE_PWRITE | FMODE_NOCMTIME |
-			    FMODE_LSEEK;
-	xf->file->f_flags |= O_RDWR | O_LARGEFILE | O_NOATIME;
 	inode = file_inode(xf->file);
-	inode->i_flags |= S_PRIVATE | S_NOCMTIME | S_NOATIME;
-	inode->i_mode &= ~0177;
-	inode->i_uid = GLOBAL_ROOT_UID;
-	inode->i_gid = GLOBAL_ROOT_GID;
-
 	lockdep_set_class(&inode->i_rwsem, &xfile_i_mutex_key);
 
+	/*
+	 * We don't want to bother with kmapping data during repair, so don't
+	 * allow highmem pages to back this mapping.
+	 */
+	mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
+
 	trace_xfile_create(xf);
 
 	*xfilep = xf;
@@ -118,164 +100,128 @@ xfile_destroy(
 }
 
 /*
- * Read a memory object directly from the xfile's page cache.  Unlike regular
- * pread, we return -E2BIG and -EFBIG for reads that are too large or at too
- * high an offset, instead of truncating the read.  Otherwise, we return
- * bytes read or an error code, like regular pread.
+ * Load an object.  Since we're treating this file as "memory", any error or
+ * short IO is treated as a failure to allocate memory.
  */
-ssize_t
-xfile_pread(
+int
+xfile_load(
 	struct xfile		*xf,
 	void			*buf,
 	size_t			count,
 	loff_t			pos)
 {
 	struct inode		*inode = file_inode(xf->file);
-	struct address_space	*mapping = inode->i_mapping;
-	struct page		*page = NULL;
-	ssize_t			read = 0;
 	unsigned int		pflags;
-	int			error = 0;
 
 	if (count > MAX_RW_COUNT)
-		return -E2BIG;
+		return -ENOMEM;
 	if (inode->i_sb->s_maxbytes - pos < count)
-		return -EFBIG;
+		return -ENOMEM;
 
-	trace_xfile_pread(xf, pos, count);
+	trace_xfile_load(xf, pos, count);
 
 	pflags = memalloc_nofs_save();
 	while (count > 0) {
-		void		*p, *kaddr;
+		struct folio	*folio;
 		unsigned int	len;
+		unsigned int	offset;
 
-		len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));
-
-		/*
-		 * In-kernel reads of a shmem file cause it to allocate a page
-		 * if the mapping shows a hole.  Therefore, if we hit ENOMEM
-		 * we can continue by zeroing the caller's buffer.
-		 */
-		page = shmem_read_mapping_page_gfp(mapping, pos >> PAGE_SHIFT,
-				__GFP_NOWARN);
-		if (IS_ERR(page)) {
-			error = PTR_ERR(page);
-			if (error != -ENOMEM)
-				break;
-
-			memset(buf, 0, len);
-			goto advance;
-		}
-
-		if (PageUptodate(page)) {
+		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
+				SGP_READ) < 0)
+			break;
+		if (!folio) {
 			/*
-			 * xfile pages must never be mapped into userspace, so
-			 * we skip the dcache flush.
+			 * No data stored at this offset, just zero the output
+			 * buffer until the next page boundary.
 			 */
-			kaddr = kmap_local_page(page);
-			p = kaddr + offset_in_page(pos);
-			memcpy(buf, p, len);
-			kunmap_local(kaddr);
-		} else {
+			len = min_t(ssize_t, count,
+				PAGE_SIZE - offset_in_page(pos));
 			memset(buf, 0, len);
-		}
-		put_page(page);
+		} else {
+			if (filemap_check_wb_err(inode->i_mapping, 0)) {
+				folio_unlock(folio);
+				folio_put(folio);
+				break;
+			}
+
+			offset = offset_in_folio(folio, pos);
+			len = min_t(ssize_t, count, folio_size(folio) - offset);
+			memcpy(buf, folio_address(folio) + offset, len);
 
-advance:
+			folio_unlock(folio);
+			folio_put(folio);
+		}
 		count -= len;
 		pos += len;
 		buf += len;
-		read += len;
 	}
 	memalloc_nofs_restore(pflags);
 
-	if (read > 0)
-		return read;
-	return error;
+	if (count)
+		return -ENOMEM;
+	return 0;
 }
 
 /*
- * Write a memory object directly to the xfile's page cache.  Unlike regular
- * pwrite, we return -E2BIG and -EFBIG for writes that are too large or at too
- * high an offset, instead of truncating the write.  Otherwise, we return
- * bytes written or an error code, like regular pwrite.
+ * Store an object.  Since we're treating this file as "memory", any error or
+ * short IO is treated as a failure to allocate memory.
  */
-ssize_t
-xfile_pwrite(
+int
+xfile_store(
 	struct xfile		*xf,
 	const void		*buf,
 	size_t			count,
 	loff_t			pos)
 {
 	struct inode		*inode = file_inode(xf->file);
-	struct address_space	*mapping = inode->i_mapping;
-	const struct address_space_operations *aops = mapping->a_ops;
-	struct page		*page = NULL;
-	ssize_t			written = 0;
 	unsigned int		pflags;
-	int			error = 0;
 
 	if (count > MAX_RW_COUNT)
-		return -E2BIG;
+		return -ENOMEM;
 	if (inode->i_sb->s_maxbytes - pos < count)
-		return -EFBIG;
+		return -ENOMEM;
 
-	trace_xfile_pwrite(xf, pos, count);
+	trace_xfile_store(xf, pos, count);
+
+	/*
+	 * Increase the file size first so that shmem_get_folio(..., SGP_CACHE),
+	 * actually allocates a folio instead of erroring out.
+	 */
+	if (pos + count > i_size_read(inode))
+		i_size_write(inode, pos + count);
 
 	pflags = memalloc_nofs_save();
 	while (count > 0) {
-		void		*fsdata = NULL;
-		void		*p, *kaddr;
+		struct folio	*folio;
 		unsigned int	len;
-		int		ret;
-
-		len = min_t(ssize_t, count, PAGE_SIZE - offset_in_page(pos));
-
-		/*
-		 * We call write_begin directly here to avoid all the freezer
-		 * protection lock-taking that happens in the normal path.
-		 * shmem doesn't support fs freeze, but lockdep doesn't know
-		 * that and will trip over that.
-		 */
-		error = aops->write_begin(NULL, mapping, pos, len, &page,
-				&fsdata);
-		if (error)
-			break;
+		unsigned int	offset;
 
-		/*
-		 * xfile pages must never be mapped into userspace, so we skip
-		 * the dcache flush.  If the page is not uptodate, zero it
-		 * before writing data.
-		 */
-		kaddr = kmap_local_page(page);
-		if (!PageUptodate(page)) {
-			memset(kaddr, 0, PAGE_SIZE);
-			SetPageUptodate(page);
-		}
-		p = kaddr + offset_in_page(pos);
-		memcpy(p, buf, len);
-		kunmap_local(kaddr);
-
-		ret = aops->write_end(NULL, mapping, pos, len, len, page,
-				fsdata);
-		if (ret < 0) {
-			error = ret;
+		if (shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
+				SGP_CACHE) < 0)
+			break;
+		if (filemap_check_wb_err(inode->i_mapping, 0)) {
+			folio_unlock(folio);
+			folio_put(folio);
 			break;
 		}
 
-		written += ret;
-		if (ret != len)
-			break;
+		offset = offset_in_folio(folio, pos);
+		len = min_t(ssize_t, count, folio_size(folio) - offset);
+		memcpy(folio_address(folio) + offset, buf, len);
+
+		folio_mark_dirty(folio);
+		folio_unlock(folio);
+		folio_put(folio);
 
-		count -= ret;
-		pos += ret;
-		buf += ret;
+		count -= len;
+		pos += len;
+		buf += len;
 	}
 	memalloc_nofs_restore(pflags);
 
-	if (written > 0)
-		return written;
-	return error;
+	if (count)
+		return -ENOMEM;
+	return 0;
 }
 
 /* Find the next written area in the xfile data for a given offset. */
@@ -291,129 +237,76 @@ xfile_seek_data(
 	return ret;
 }
 
-/* Query stat information for an xfile. */
-int
-xfile_stat(
-	struct xfile		*xf,
-	struct xfile_stat	*statbuf)
-{
-	struct kstat		ks;
-	int			error;
-
-	error = vfs_getattr_nosec(&xf->file->f_path, &ks,
-			STATX_SIZE | STATX_BLOCKS, AT_STATX_DONT_SYNC);
-	if (error)
-		return error;
-
-	statbuf->size = ks.size;
-	statbuf->bytes = ks.blocks << SECTOR_SHIFT;
-	return 0;
-}
-
 /*
- * Grab the (locked) page for a memory object.  The object cannot span a page
- * boundary.  Returns 0 (and a locked page) if successful, -ENOTBLK if we
- * cannot grab the page, or the usual negative errno.
+ * Grab the (locked) folio for a memory object.  The object cannot span a folio
+ * boundary.  Returns the locked folio if successful, NULL if there was no
+ * folio or it didn't cover the range requested, or an ERR_PTR on failure.
  */
-int
-xfile_get_page(
+struct folio *
+xfile_get_folio(
 	struct xfile		*xf,
 	loff_t			pos,
-	unsigned int		len,
-	struct xfile_page	*xfpage)
+	size_t			len,
+	unsigned int		flags)
 {
 	struct inode		*inode = file_inode(xf->file);
-	struct address_space	*mapping = inode->i_mapping;
-	const struct address_space_operations *aops = mapping->a_ops;
-	struct page		*page = NULL;
-	void			*fsdata = NULL;
-	loff_t			key = round_down(pos, PAGE_SIZE);
+	struct folio		*folio = NULL;
 	unsigned int		pflags;
 	int			error;
 
 	if (inode->i_sb->s_maxbytes - pos < len)
-		return -ENOMEM;
-	if (len > PAGE_SIZE - offset_in_page(pos))
-		return -ENOTBLK;
-
-	trace_xfile_get_page(xf, pos, len);
+		return ERR_PTR(-ENOMEM);
 
-	pflags = memalloc_nofs_save();
+	trace_xfile_get_folio(xf, pos, len);
 
 	/*
-	 * We call write_begin directly here to avoid all the freezer
-	 * protection lock-taking that happens in the normal path.  shmem
-	 * doesn't support fs freeze, but lockdep doesn't know that and will
-	 * trip over that.
+	 * Increase the file size first so that shmem_get_folio(..., SGP_CACHE),
+	 * actually allocates a folio instead of erroring out.
 	 */
-	error = aops->write_begin(NULL, mapping, key, PAGE_SIZE, &page,
-			&fsdata);
+	if ((flags & XFILE_ALLOC) && pos + len > i_size_read(inode))
+		i_size_write(inode, pos + len);
+
+	pflags = memalloc_nofs_save();
+	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio,
+			(flags & XFILE_ALLOC) ? SGP_CACHE : SGP_READ);
+	memalloc_nofs_restore(pflags);
 	if (error)
-		goto out_pflags;
+		return ERR_PTR(error);
 
-	/* We got the page, so make sure we push out EOF. */
-	if (i_size_read(inode) < pos + len)
-		i_size_write(inode, pos + len);
+	if (!folio)
+		return NULL;
 
-	/*
-	 * If the page isn't up to date, fill it with zeroes before we hand it
-	 * to the caller and make sure the backing store will hold on to them.
-	 */
-	if (!PageUptodate(page)) {
-		void	*kaddr;
+	if (len > folio_size(folio) - offset_in_folio(folio, pos)) {
+		folio_unlock(folio);
+		folio_put(folio);
+		return NULL;
+	}
 
-		kaddr = kmap_local_page(page);
-		memset(kaddr, 0, PAGE_SIZE);
-		kunmap_local(kaddr);
-		SetPageUptodate(page);
+	if (filemap_check_wb_err(inode->i_mapping, 0)) {
+		folio_unlock(folio);
+		folio_put(folio);
+		return ERR_PTR(-EIO);
 	}
 
 	/*
-	 * Mark each page dirty so that the contents are written to some
-	 * backing store when we drop this buffer, and take an extra reference
-	 * to prevent the xfile page from being swapped or removed from the
-	 * page cache by reclaim if the caller unlocks the page.
+	 * Mark the folio dirty so that it won't be reclaimed once we drop the
+	 * (potentially last) reference in xfile_put_folio.
 	 */
-	set_page_dirty(page);
-	get_page(page);
-
-	xfpage->page = page;
-	xfpage->fsdata = fsdata;
-	xfpage->pos = key;
-out_pflags:
-	memalloc_nofs_restore(pflags);
-	return error;
+	if (flags & XFILE_ALLOC)
+		folio_set_dirty(folio);
+	return folio;
 }
 
 /*
- * Release the (locked) page for a memory object.  Returns 0 or a negative
- * errno.
+ * Release the (locked) folio for a memory object.
  */
-int
-xfile_put_page(
+void
+xfile_put_folio(
 	struct xfile		*xf,
-	struct xfile_page	*xfpage)
+	struct folio		*folio)
 {
-	struct inode		*inode = file_inode(xf->file);
-	struct address_space	*mapping = inode->i_mapping;
-	const struct address_space_operations *aops = mapping->a_ops;
-	unsigned int		pflags;
-	int			ret;
-
-	trace_xfile_put_page(xf, xfpage->pos, PAGE_SIZE);
-
-	/* Give back the reference that we took in xfile_get_page. */
-	put_page(xfpage->page);
+	trace_xfile_put_folio(xf, folio_pos(folio), folio_size(folio));
 
-	pflags = memalloc_nofs_save();
-	ret = aops->write_end(NULL, mapping, xfpage->pos, PAGE_SIZE, PAGE_SIZE,
-			xfpage->page, xfpage->fsdata);
-	memalloc_nofs_restore(pflags);
-	memset(xfpage, 0, sizeof(struct xfile_page));
-
-	if (ret < 0)
-		return ret;
-	if (ret != PAGE_SIZE)
-		return -EIO;
-	return 0;
+	folio_unlock(folio);
+	folio_put(folio);
 }
diff --git a/fs/xfs/scrub/xfile.h b/fs/xfs/scrub/xfile.h
index d56643b0f429..76d78dba7e34 100644
--- a/fs/xfs/scrub/xfile.h
+++ b/fs/xfs/scrub/xfile.h
@@ -6,22 +6,6 @@
 #ifndef __XFS_SCRUB_XFILE_H__
 #define __XFS_SCRUB_XFILE_H__
 
-struct xfile_page {
-	struct page		*page;
-	void			*fsdata;
-	loff_t			pos;
-};
-
-static inline bool xfile_page_cached(const struct xfile_page *xfpage)
-{
-	return xfpage->page != NULL;
-}
-
-static inline pgoff_t xfile_page_index(const struct xfile_page *xfpage)
-{
-	return xfpage->page->index;
-}
-
 struct xfile {
 	struct file		*file;
 };
@@ -29,49 +13,17 @@ struct xfile {
 int xfile_create(const char *description, loff_t isize, struct xfile **xfilep);
 void xfile_destroy(struct xfile *xf);
 
-ssize_t xfile_pread(struct xfile *xf, void *buf, size_t count, loff_t pos);
-ssize_t xfile_pwrite(struct xfile *xf, const void *buf, size_t count,
+int xfile_load(struct xfile *xf, void *buf, size_t count, loff_t pos);
+int xfile_store(struct xfile *xf, const void *buf, size_t count,
 		loff_t pos);
 
-/*
- * Load an object.  Since we're treating this file as "memory", any error or
- * short IO is treated as a failure to allocate memory.
- */
-static inline int
-xfile_obj_load(struct xfile *xf, void *buf, size_t count, loff_t pos)
-{
-	ssize_t	ret = xfile_pread(xf, buf, count, pos);
-
-	if (ret < 0 || ret != count)
-		return -ENOMEM;
-	return 0;
-}
-
-/*
- * Store an object.  Since we're treating this file as "memory", any error or
- * short IO is treated as a failure to allocate memory.
- */
-static inline int
-xfile_obj_store(struct xfile *xf, const void *buf, size_t count, loff_t pos)
-{
-	ssize_t	ret = xfile_pwrite(xf, buf, count, pos);
-
-	if (ret < 0 || ret != count)
-		return -ENOMEM;
-	return 0;
-}
-
 loff_t xfile_seek_data(struct xfile *xf, loff_t pos);
 
-struct xfile_stat {
-	loff_t			size;
-	unsigned long long	bytes;
-};
-
-int xfile_stat(struct xfile *xf, struct xfile_stat *statbuf);
+#define XFILE_MAX_FOLIO_SIZE	(PAGE_SIZE << MAX_PAGECACHE_ORDER)
 
-int xfile_get_page(struct xfile *xf, loff_t offset, unsigned int len,
-		struct xfile_page *xbuf);
-int xfile_put_page(struct xfile *xf, struct xfile_page *xbuf);
+#define XFILE_ALLOC		(1 << 0) /* allocate folio if not present */
+struct folio *xfile_get_folio(struct xfile *xf, loff_t offset, size_t len,
+		unsigned int flags);
+void xfile_put_folio(struct xfile *xf, struct folio *folio);
 
 #endif /* __XFS_SCRUB_XFILE_H__ */
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 6b840301817a..4bf69c9c088e 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -167,7 +167,7 @@ xfs_get_acl(struct inode *inode, int type, bool rcu)
 		acl = ERR_PTR(error);
 	}
 
-	kmem_free(args.value);
+	kvfree(args.value);
 	return acl;
 }
 
@@ -204,7 +204,7 @@ __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 	}
 
 	error = xfs_attr_change(&args);
-	kmem_free(args.value);
+	kvfree(args.value);
 
 	/*
 	 * If the attribute didn't exist to start with that's fine.
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 813f85156b0c..3f428620ebf2 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -112,7 +112,7 @@ xfs_end_ioend(
 	 * longer dirty. If we don't remove delalloc blocks here, they become
 	 * stale and can corrupt free space accounting on unmount.
 	 */
-	error = blk_status_to_errno(ioend->io_bio->bi_status);
+	error = blk_status_to_errno(ioend->io_bio.bi_status);
 	if (unlikely(error)) {
 		if (ioend->io_flags & IOMAP_F_SHARED) {
 			xfs_reflink_cancel_cow_range(ip, offset, size, true);
@@ -179,7 +179,7 @@ STATIC void
 xfs_end_bio(
 	struct bio		*bio)
 {
-	struct iomap_ioend	*ioend = bio->bi_private;
+	struct iomap_ioend	*ioend = iomap_ioend_from_bio(bio);
 	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
 	unsigned long		flags;
 
@@ -276,7 +276,8 @@ static int
 xfs_map_blocks(
 	struct iomap_writepage_ctx *wpc,
 	struct inode		*inode,
-	loff_t			offset)
+	loff_t			offset,
+	unsigned int		len)
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 	struct xfs_mount	*mp = ip->i_mount;
@@ -444,7 +445,7 @@ xfs_prepare_ioend(
 	/* send ioends that might require a transaction to the completion wq */
 	if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
 	    (ioend->io_flags & IOMAP_F_SHARED))
-		ioend->io_bio->bi_end_io = xfs_end_bio;
+		ioend->io_bio.bi_end_io = xfs_end_bio;
 	return status;
 }
 
@@ -502,13 +503,6 @@ xfs_vm_writepages(
 {
 	struct xfs_writepage_ctx wpc = { };
 
-	/*
-	 * Writing back data in a transaction context can result in recursive
-	 * transactions. This is bad, so issue a warning and get out of here.
-	 */
-	if (WARN_ON_ONCE(current->journal_info))
-		return 0;
-
 	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
 	return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
 }
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index 89c7a9f4f930..24fb12986a56 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -23,6 +23,7 @@
 #include "xfs_quota.h"
 #include "xfs_dir2.h"
 #include "xfs_error.h"
+#include "xfs_health.h"
 
 /*
  * Invalidate any incore buffers associated with this remote attribute value
@@ -147,6 +148,7 @@ xfs_attr3_node_inactive(
 	if (level > XFS_DA_NODE_MAXDEPTH) {
 		xfs_buf_mark_corrupt(bp);
 		xfs_trans_brelse(*trans, bp);	/* no locks for later trans */
+		xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
 		return -EFSCORRUPTED;
 	}
 
@@ -197,6 +199,7 @@ xfs_attr3_node_inactive(
 		default:
 			xfs_buf_mark_corrupt(child_bp);
 			xfs_trans_brelse(*trans, child_bp);
+			xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
 			error = -EFSCORRUPTED;
 			break;
 		}
@@ -286,6 +289,7 @@ xfs_attr3_root_inactive(
 		error = xfs_attr3_leaf_inactive(trans, dp, bp);
 		break;
 	default:
+		xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
 		error = -EFSCORRUPTED;
 		xfs_buf_mark_corrupt(bp);
 		xfs_trans_brelse(*trans, bp);
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
index 9e02111bd890..9b4c61e1c22e 100644
--- a/fs/xfs/xfs_attr_item.c
+++ b/fs/xfs/xfs_attr_item.c
@@ -108,7 +108,7 @@ STATIC void
 xfs_attri_item_free(
 	struct xfs_attri_log_item	*attrip)
 {
-	kmem_free(attrip->attri_item.li_lv_shadow);
+	kvfree(attrip->attri_item.li_lv_shadow);
 	xfs_attri_log_nameval_put(attrip->attri_nameval);
 	kmem_cache_free(xfs_attri_cache, attrip);
 }
@@ -226,7 +226,7 @@ xfs_attri_init(
 {
 	struct xfs_attri_log_item	*attrip;
 
-	attrip = kmem_cache_zalloc(xfs_attri_cache, GFP_NOFS | __GFP_NOFAIL);
+	attrip = kmem_cache_zalloc(xfs_attri_cache, GFP_KERNEL | __GFP_NOFAIL);
 
 	/*
 	 * Grab an extra reference to the name/value buffer for this log item.
@@ -251,7 +251,7 @@ static inline struct xfs_attrd_log_item *ATTRD_ITEM(struct xfs_log_item *lip)
 STATIC void
 xfs_attrd_item_free(struct xfs_attrd_log_item *attrdp)
 {
-	kmem_free(attrdp->attrd_item.li_lv_shadow);
+	kvfree(attrdp->attrd_item.li_lv_shadow);
 	kmem_cache_free(xfs_attrd_cache, attrdp);
 }
 
@@ -386,11 +386,16 @@ xfs_attr_free_item(
 		xfs_da_state_free(attr->xattri_da_state);
 	xfs_attri_log_nameval_put(attr->xattri_nameval);
 	if (attr->xattri_da_args->op_flags & XFS_DA_OP_RECOVERY)
-		kmem_free(attr);
+		kfree(attr);
 	else
 		kmem_cache_free(xfs_attr_intent_cache, attr);
 }
 
+static inline struct xfs_attr_intent *attri_entry(const struct list_head *e)
+{
+	return list_entry(e, struct xfs_attr_intent, xattri_list);
+}
+
 /* Process an attr. */
 STATIC int
 xfs_attr_finish_item(
@@ -399,11 +404,10 @@ xfs_attr_finish_item(
 	struct list_head		*item,
 	struct xfs_btree_cur		**state)
 {
-	struct xfs_attr_intent		*attr;
+	struct xfs_attr_intent		*attr = attri_entry(item);
 	struct xfs_da_args		*args;
 	int				error;
 
-	attr = container_of(item, struct xfs_attr_intent, xattri_list);
 	args = attr->xattri_da_args;
 
 	/* Reset trans after EAGAIN cycle since the transaction is new */
@@ -443,9 +447,8 @@ STATIC void
 xfs_attr_cancel_item(
 	struct list_head		*item)
 {
-	struct xfs_attr_intent		*attr;
+	struct xfs_attr_intent		*attr = attri_entry(item);
 
-	attr = container_of(item, struct xfs_attr_intent, xattri_list);
 	xfs_attr_free_item(attr);
 }
 
@@ -512,8 +515,8 @@ xfs_attri_recover_work(
 	if (error)
 		return ERR_PTR(error);
 
-	attr = kmem_zalloc(sizeof(struct xfs_attr_intent) +
-			   sizeof(struct xfs_da_args), KM_NOFS);
+	attr = kzalloc(sizeof(struct xfs_attr_intent) +
+			sizeof(struct xfs_da_args), GFP_KERNEL | __GFP_NOFAIL);
 	args = (struct xfs_da_args *)(attr + 1);
 
 	attr->xattri_da_args = args;
@@ -666,7 +669,7 @@ xfs_attr_create_done(
 
 	attrip = ATTRI_ITEM(intent);
 
-	attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL);
+	attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_KERNEL | __GFP_NOFAIL);
 
 	xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD,
 			  &xfs_attrd_item_ops);
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index e368ad671e26..a6819a642cc0 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -22,6 +22,7 @@
 #include "xfs_error.h"
 #include "xfs_trace.h"
 #include "xfs_dir2.h"
+#include "xfs_health.h"
 
 STATIC int
 xfs_attr_shortform_compare(const void *a, const void *b)
@@ -82,8 +83,10 @@ xfs_attr_shortform_list(
 		for (i = 0, sfe = xfs_attr_sf_firstentry(sf); i < sf->count; i++) {
 			if (XFS_IS_CORRUPT(context->dp->i_mount,
 					   !xfs_attr_namecheck(sfe->nameval,
-							       sfe->namelen)))
+							       sfe->namelen))) {
+				xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK);
 				return -EFSCORRUPTED;
+			}
 			context->put_listent(context,
 					     sfe->flags,
 					     sfe->nameval,
@@ -109,7 +112,7 @@ xfs_attr_shortform_list(
 	 * It didn't all fit, so we have to sort everything on hashval.
 	 */
 	sbsize = sf->count * sizeof(*sbuf);
-	sbp = sbuf = kmem_alloc(sbsize, KM_NOFS);
+	sbp = sbuf = kmalloc(sbsize, GFP_KERNEL | __GFP_NOFAIL);
 
 	/*
 	 * Scan the attribute list for the rest of the entries, storing
@@ -124,7 +127,8 @@ xfs_attr_shortform_list(
 					     XFS_ERRLEVEL_LOW,
 					     context->dp->i_mount, sfe,
 					     sizeof(*sfe));
-			kmem_free(sbuf);
+			kfree(sbuf);
+			xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
 			return -EFSCORRUPTED;
 		}
 
@@ -175,6 +179,7 @@ xfs_attr_shortform_list(
 		if (XFS_IS_CORRUPT(context->dp->i_mount,
 				   !xfs_attr_namecheck(sbp->name,
 						       sbp->namelen))) {
+			xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK);
 			error = -EFSCORRUPTED;
 			goto out;
 		}
@@ -188,7 +193,7 @@ xfs_attr_shortform_list(
 		cursor->offset++;
 	}
 out:
-	kmem_free(sbuf);
+	kfree(sbuf);
 	return error;
 }
 
@@ -262,8 +267,10 @@ xfs_attr_node_list_lookup(
 			return 0;
 
 		/* We can't point back to the root. */
-		if (XFS_IS_CORRUPT(mp, cursor->blkno == 0))
+		if (XFS_IS_CORRUPT(mp, cursor->blkno == 0)) {
+			xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
 			return -EFSCORRUPTED;
+		}
 	}
 
 	if (expected_level != 0)
@@ -275,6 +282,7 @@ xfs_attr_node_list_lookup(
 out_corruptbuf:
 	xfs_buf_mark_corrupt(bp);
 	xfs_trans_brelse(tp, bp);
+	xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
 	return -EFSCORRUPTED;
 }
 
@@ -304,6 +312,8 @@ xfs_attr_node_list(
 	if (cursor->blkno > 0) {
 		error = xfs_da3_node_read(context->tp, dp, cursor->blkno, &bp,
 				XFS_ATTR_FORK);
+		if (xfs_metadata_is_sick(error))
+			xfs_dirattr_mark_sick(dp, XFS_ATTR_FORK);
 		if ((error != 0) && (error != -EFSCORRUPTED))
 			return error;
 		if (bp) {
@@ -464,8 +474,10 @@ xfs_attr3_leaf_list_int(
 		}
 
 		if (XFS_IS_CORRUPT(context->dp->i_mount,
-				   !xfs_attr_namecheck(name, namelen)))
+				   !xfs_attr_namecheck(name, namelen))) {
+			xfs_dirattr_mark_sick(context->dp, XFS_ATTR_FORK);
 			return -EFSCORRUPTED;
+		}
 		context->put_listent(context, entry->flags,
 					      name, namelen, valuelen);
 		if (context->seen_enough)
@@ -504,7 +516,7 @@ xfs_attr_list_ilocked(
 {
 	struct xfs_inode		*dp = context->dp;
 
-	ASSERT(xfs_isilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(dp, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL);
 
 	/*
 	 * Decide on what work routines to call based on the inode size.
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 52fb8a148b7d..d27859a684aa 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -25,6 +25,7 @@
 #include "xfs_log_priv.h"
 #include "xfs_log_recover.h"
 #include "xfs_ag.h"
+#include "xfs_trace.h"
 
 struct kmem_cache	*xfs_bui_cache;
 struct kmem_cache	*xfs_bud_cache;
@@ -40,7 +41,7 @@ STATIC void
 xfs_bui_item_free(
 	struct xfs_bui_log_item	*buip)
 {
-	kmem_free(buip->bui_item.li_lv_shadow);
+	kvfree(buip->bui_item.li_lv_shadow);
 	kmem_cache_free(xfs_bui_cache, buip);
 }
 
@@ -201,7 +202,7 @@ xfs_bud_item_release(
 	struct xfs_bud_log_item	*budp = BUD_ITEM(lip);
 
 	xfs_bui_release(budp->bud_buip);
-	kmem_free(budp->bud_item.li_lv_shadow);
+	kvfree(budp->bud_item.li_lv_shadow);
 	kmem_cache_free(xfs_bud_cache, budp);
 }
 
@@ -221,6 +222,11 @@ static const struct xfs_item_ops xfs_bud_item_ops = {
 	.iop_intent	= xfs_bud_item_intent,
 };
 
+static inline struct xfs_bmap_intent *bi_entry(const struct list_head *e)
+{
+	return list_entry(e, struct xfs_bmap_intent, bi_list);
+}
+
 /* Sort bmap intents by inode. */
 static int
 xfs_bmap_update_diff_items(
@@ -228,37 +234,12 @@ xfs_bmap_update_diff_items(
 	const struct list_head		*a,
 	const struct list_head		*b)
 {
-	struct xfs_bmap_intent		*ba;
-	struct xfs_bmap_intent		*bb;
+	struct xfs_bmap_intent		*ba = bi_entry(a);
+	struct xfs_bmap_intent		*bb = bi_entry(b);
 
-	ba = container_of(a, struct xfs_bmap_intent, bi_list);
-	bb = container_of(b, struct xfs_bmap_intent, bi_list);
 	return ba->bi_owner->i_ino - bb->bi_owner->i_ino;
 }
 
-/* Set the map extent flags for this mapping. */
-static void
-xfs_trans_set_bmap_flags(
-	struct xfs_map_extent		*map,
-	enum xfs_bmap_intent_type	type,
-	int				whichfork,
-	xfs_exntst_t			state)
-{
-	map->me_flags = 0;
-	switch (type) {
-	case XFS_BMAP_MAP:
-	case XFS_BMAP_UNMAP:
-		map->me_flags = type;
-		break;
-	default:
-		ASSERT(0);
-	}
-	if (state == XFS_EXT_UNWRITTEN)
-		map->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN;
-	if (whichfork == XFS_ATTR_FORK)
-		map->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK;
-}
-
 /* Log bmap updates in the intent item. */
 STATIC void
 xfs_bmap_update_log_item(
@@ -281,8 +262,21 @@ xfs_bmap_update_log_item(
 	map->me_startblock = bi->bi_bmap.br_startblock;
 	map->me_startoff = bi->bi_bmap.br_startoff;
 	map->me_len = bi->bi_bmap.br_blockcount;
-	xfs_trans_set_bmap_flags(map, bi->bi_type, bi->bi_whichfork,
-			bi->bi_bmap.br_state);
+
+	switch (bi->bi_type) {
+	case XFS_BMAP_MAP:
+	case XFS_BMAP_UNMAP:
+		map->me_flags = bi->bi_type;
+		break;
+	default:
+		ASSERT(0);
+	}
+	if (bi->bi_bmap.br_state == XFS_EXT_UNWRITTEN)
+		map->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN;
+	if (bi->bi_whichfork == XFS_ATTR_FORK)
+		map->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK;
+	if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork))
+		map->me_flags |= XFS_BMAP_EXTENT_REALTIME;
 }
 
 static struct xfs_log_item *
@@ -325,13 +319,16 @@ xfs_bmap_update_create_done(
 }
 
 /* Take a passive ref to the AG containing the space we're mapping. */
-void
+static inline void
 xfs_bmap_update_get_group(
 	struct xfs_mount	*mp,
 	struct xfs_bmap_intent	*bi)
 {
 	xfs_agnumber_t		agno;
 
+	if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork))
+		return;
+
 	agno = XFS_FSB_TO_AGNO(mp, bi->bi_bmap.br_startblock);
 
 	/*
@@ -344,14 +341,40 @@ xfs_bmap_update_get_group(
 	bi->bi_pag = xfs_perag_intent_get(mp, agno);
 }
 
+/* Add this deferred BUI to the transaction. */
+void
+xfs_bmap_defer_add(
+	struct xfs_trans	*tp,
+	struct xfs_bmap_intent	*bi)
+{
+	trace_xfs_bmap_defer(bi);
+
+	xfs_bmap_update_get_group(tp->t_mountp, bi);
+	xfs_defer_add(tp, &bi->bi_list, &xfs_bmap_update_defer_type);
+}
+
 /* Release a passive AG ref after finishing mapping work. */
 static inline void
 xfs_bmap_update_put_group(
 	struct xfs_bmap_intent	*bi)
 {
+	if (xfs_ifork_is_realtime(bi->bi_owner, bi->bi_whichfork))
+		return;
+
 	xfs_perag_intent_put(bi->bi_pag);
 }
 
+/* Cancel a deferred bmap update. */
+STATIC void
+xfs_bmap_update_cancel_item(
+	struct list_head		*item)
+{
+	struct xfs_bmap_intent		*bi = bi_entry(item);
+
+	xfs_bmap_update_put_group(bi);
+	kmem_cache_free(xfs_bmap_intent_cache, bi);
+}
+
 /* Process a deferred bmap update. */
 STATIC int
 xfs_bmap_update_finish_item(
@@ -360,19 +383,16 @@ xfs_bmap_update_finish_item(
 	struct list_head		*item,
 	struct xfs_btree_cur		**state)
 {
-	struct xfs_bmap_intent		*bi;
+	struct xfs_bmap_intent		*bi = bi_entry(item);
 	int				error;
 
-	bi = container_of(item, struct xfs_bmap_intent, bi_list);
-
 	error = xfs_bmap_finish_one(tp, bi);
 	if (!error && bi->bi_bmap.br_blockcount > 0) {
 		ASSERT(bi->bi_type == XFS_BMAP_UNMAP);
 		return -EAGAIN;
 	}
 
-	xfs_bmap_update_put_group(bi);
-	kmem_cache_free(xfs_bmap_intent_cache, bi);
+	xfs_bmap_update_cancel_item(item);
 	return error;
 }
 
@@ -384,19 +404,6 @@ xfs_bmap_update_abort_intent(
 	xfs_bui_release(BUI_ITEM(intent));
 }
 
-/* Cancel a deferred bmap update. */
-STATIC void
-xfs_bmap_update_cancel_item(
-	struct list_head		*item)
-{
-	struct xfs_bmap_intent		*bi;
-
-	bi = container_of(item, struct xfs_bmap_intent, bi_list);
-
-	xfs_bmap_update_put_group(bi);
-	kmem_cache_free(xfs_bmap_intent_cache, bi);
-}
-
 /* Is this recovered BUI ok? */
 static inline bool
 xfs_bui_validate(
@@ -428,6 +435,9 @@ xfs_bui_validate(
 	if (!xfs_verify_fileext(mp, map->me_startoff, map->me_len))
 		return false;
 
+	if (map->me_flags & XFS_BMAP_EXTENT_REALTIME)
+		return xfs_verify_rtbext(mp, map->me_startblock, map->me_len);
+
 	return xfs_verify_fsbext(mp, map->me_startblock, map->me_len);
 }
 
@@ -445,7 +455,8 @@ xfs_bui_recover_work(
 	if (error)
 		return ERR_PTR(error);
 
-	bi = kmem_cache_zalloc(xfs_bmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+	bi = kmem_cache_zalloc(xfs_bmap_intent_cache,
+			GFP_KERNEL | __GFP_NOFAIL);
 	bi->bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ?
 			XFS_ATTR_FORK : XFS_DATA_FORK;
 	bi->bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK;
@@ -502,6 +513,12 @@ xfs_bmap_recover_work(
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
 	xfs_trans_ijoin(tp, ip, 0);
 
+	if (!!(map->me_flags & XFS_BMAP_EXTENT_REALTIME) !=
+	    xfs_ifork_is_realtime(ip, work->bi_whichfork)) {
+		error = -EFSCORRUPTED;
+		goto err_cancel;
+	}
+
 	if (work->bi_type == XFS_BMAP_MAP)
 		iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT;
 	else
diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h
index 3fafd3881a0b..6fee6a508343 100644
--- a/fs/xfs/xfs_bmap_item.h
+++ b/fs/xfs/xfs_bmap_item.h
@@ -68,4 +68,8 @@ struct xfs_bud_log_item {
 extern struct kmem_cache	*xfs_bui_cache;
 extern struct kmem_cache	*xfs_bud_cache;
 
+struct xfs_bmap_intent;
+
+void xfs_bmap_defer_add(struct xfs_trans *tp, struct xfs_bmap_intent *bi);
+
 #endif	/* __XFS_BMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index c2531c28905c..19e11d1da660 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -66,7 +66,7 @@ xfs_zero_extent(
 	return blkdev_issue_zeroout(target->bt_bdev,
 		block << (mp->m_super->s_blocksize_bits - 9),
 		count_fsb << (mp->m_super->s_blocksize_bits - 9),
-		GFP_NOFS, 0);
+		GFP_KERNEL, 0);
 }
 
 /*
@@ -508,8 +508,8 @@ xfs_can_free_eofblocks(
 	 * Caller must either hold the exclusive io lock; or be inactivating
 	 * the inode, which guarantees there are no other users of the inode.
 	 */
-	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL) ||
-	       (VFS_I(ip)->i_state & I_FREEING));
+	if (!(VFS_I(ip)->i_state & I_FREEING))
+		xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);
 
 	/* prealloc/delalloc exists only on regular files */
 	if (!S_ISREG(VFS_I(ip)->i_mode))
@@ -965,8 +965,7 @@ xfs_collapse_file_space(
 	xfs_fileoff_t		shift_fsb = XFS_B_TO_FSB(mp, len);
 	bool			done = false;
 
-	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
-	ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
 
 	trace_xfs_collapse_file_space(ip);
 
@@ -1035,8 +1034,7 @@ xfs_insert_file_space(
 	xfs_fileoff_t		shift_fsb = XFS_B_TO_FSB(mp, len);
 	bool			done = false;
 
-	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
-	ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
 
 	trace_xfs_insert_file_space(ip);
 
@@ -1307,16 +1305,16 @@ xfs_swap_extent_rmap(
 			}
 
 			/* Remove the mapping from the donor file. */
-			xfs_bmap_unmap_extent(tp, tip, &uirec);
+			xfs_bmap_unmap_extent(tp, tip, XFS_DATA_FORK, &uirec);
 
 			/* Remove the mapping from the source file. */
-			xfs_bmap_unmap_extent(tp, ip, &irec);
+			xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &irec);
 
 			/* Map the donor file's blocks into the source file. */
-			xfs_bmap_map_extent(tp, ip, &uirec);
+			xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &uirec);
 
 			/* Map the source file's blocks into the donor file. */
-			xfs_bmap_map_extent(tp, tip, &irec);
+			xfs_bmap_map_extent(tp, tip, XFS_DATA_FORK, &irec);
 
 			error = xfs_defer_finish(tpp);
 			tp = *tpp;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 8e5bd50d29fe..f0fa02264eda 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -21,6 +21,7 @@
 #include "xfs_errortag.h"
 #include "xfs_error.h"
 #include "xfs_ag.h"
+#include "xfs_buf_mem.h"
 
 struct kmem_cache *xfs_buf_cache;
 
@@ -60,6 +61,11 @@ xfs_buf_submit(
 	return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC));
 }
 
+static inline bool xfs_buf_is_uncached(struct xfs_buf *bp)
+{
+	return bp->b_rhash_key == XFS_BUF_DADDR_NULL;
+}
+
 static inline int
 xfs_buf_is_vmapped(
 	struct xfs_buf	*bp)
@@ -189,8 +195,8 @@ xfs_buf_get_maps(
 		return 0;
 	}
 
-	bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
-				KM_NOFS);
+	bp->b_maps = kzalloc(map_count * sizeof(struct xfs_buf_map),
+			GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
 	if (!bp->b_maps)
 		return -ENOMEM;
 	return 0;
@@ -204,7 +210,7 @@ xfs_buf_free_maps(
 	struct xfs_buf	*bp)
 {
 	if (bp->b_maps != &bp->__b_map) {
-		kmem_free(bp->b_maps);
+		kfree(bp->b_maps);
 		bp->b_maps = NULL;
 	}
 }
@@ -222,7 +228,8 @@ _xfs_buf_alloc(
 	int			i;
 
 	*bpp = NULL;
-	bp = kmem_cache_zalloc(xfs_buf_cache, GFP_NOFS | __GFP_NOFAIL);
+	bp = kmem_cache_zalloc(xfs_buf_cache,
+			GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL);
 
 	/*
 	 * We don't want certain flags to appear in b_flags unless they are
@@ -289,7 +296,7 @@ xfs_buf_free_pages(
 	mm_account_reclaimed_pages(bp->b_page_count);
 
 	if (bp->b_pages != bp->b_page_array)
-		kmem_free(bp->b_pages);
+		kfree(bp->b_pages);
 	bp->b_pages = NULL;
 	bp->b_flags &= ~_XBF_PAGES;
 }
@@ -312,10 +319,12 @@ xfs_buf_free(
 
 	ASSERT(list_empty(&bp->b_lru));
 
-	if (bp->b_flags & _XBF_PAGES)
+	if (xfs_buftarg_is_mem(bp->b_target))
+		xmbuf_unmap_page(bp);
+	else if (bp->b_flags & _XBF_PAGES)
 		xfs_buf_free_pages(bp);
 	else if (bp->b_flags & _XBF_KMEM)
-		kmem_free(bp->b_addr);
+		kfree(bp->b_addr);
 
 	call_rcu(&bp->b_rcu, xfs_buf_free_callback);
 }
@@ -325,21 +334,21 @@ xfs_buf_alloc_kmem(
 	struct xfs_buf	*bp,
 	xfs_buf_flags_t	flags)
 {
-	xfs_km_flags_t	kmflag_mask = KM_NOFS;
+	gfp_t		gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL;
 	size_t		size = BBTOB(bp->b_length);
 
 	/* Assure zeroed buffer for non-read cases. */
 	if (!(flags & XBF_READ))
-		kmflag_mask |= KM_ZERO;
+		gfp_mask |= __GFP_ZERO;
 
-	bp->b_addr = kmem_alloc(size, kmflag_mask);
+	bp->b_addr = kmalloc(size, gfp_mask);
 	if (!bp->b_addr)
 		return -ENOMEM;
 
 	if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
 	    ((unsigned long)bp->b_addr & PAGE_MASK)) {
 		/* b_addr spans two pages - use alloc_page instead */
-		kmem_free(bp->b_addr);
+		kfree(bp->b_addr);
 		bp->b_addr = NULL;
 		return -ENOMEM;
 	}
@@ -356,13 +365,11 @@ xfs_buf_alloc_pages(
 	struct xfs_buf	*bp,
 	xfs_buf_flags_t	flags)
 {
-	gfp_t		gfp_mask = __GFP_NOWARN;
+	gfp_t		gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN;
 	long		filled = 0;
 
 	if (flags & XBF_READ_AHEAD)
 		gfp_mask |= __GFP_NORETRY;
-	else
-		gfp_mask |= GFP_NOFS;
 
 	/* Make sure that we have a page list */
 	bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE);
@@ -429,11 +436,18 @@ _xfs_buf_map_pages(
 
 		/*
 		 * vm_map_ram() will allocate auxiliary structures (e.g.
-		 * pagetables) with GFP_KERNEL, yet we are likely to be under
-		 * GFP_NOFS context here. Hence we need to tell memory reclaim
-		 * that we are in such a context via PF_MEMALLOC_NOFS to prevent
-		 * memory reclaim re-entering the filesystem here and
-		 * potentially deadlocking.
+		 * pagetables) with GFP_KERNEL, yet we often under a scoped nofs
+		 * context here. Mixing GFP_KERNEL with GFP_NOFS allocations
+		 * from the same call site that can be run from both above and
+		 * below memory reclaim causes lockdep false positives. Hence we
+		 * always need to force this allocation to nofs context because
+		 * we can't pass __GFP_NOLOCKDEP down to auxillary structures to
+		 * prevent false positive lockdep reports.
+		 *
+		 * XXX(dgc): I think dquot reclaim is the only place we can get
+		 * to this function from memory reclaim context now. If we fix
+		 * that like we've fixed inode reclaim to avoid writeback from
+		 * reclaim, this nofs wrapping can go away.
 		 */
 		nofs_flag = memalloc_nofs_save();
 		do {
@@ -499,18 +513,18 @@ static const struct rhashtable_params xfs_buf_hash_params = {
 };
 
 int
-xfs_buf_hash_init(
-	struct xfs_perag	*pag)
+xfs_buf_cache_init(
+	struct xfs_buf_cache	*bch)
 {
-	spin_lock_init(&pag->pag_buf_lock);
-	return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params);
+	spin_lock_init(&bch->bc_lock);
+	return rhashtable_init(&bch->bc_hash, &xfs_buf_hash_params);
 }
 
 void
-xfs_buf_hash_destroy(
-	struct xfs_perag	*pag)
+xfs_buf_cache_destroy(
+	struct xfs_buf_cache	*bch)
 {
-	rhashtable_destroy(&pag->pag_buf_hash);
+	rhashtable_destroy(&bch->bc_hash);
 }
 
 static int
@@ -573,7 +587,7 @@ xfs_buf_find_lock(
 
 static inline int
 xfs_buf_lookup(
-	struct xfs_perag	*pag,
+	struct xfs_buf_cache	*bch,
 	struct xfs_buf_map	*map,
 	xfs_buf_flags_t		flags,
 	struct xfs_buf		**bpp)
@@ -582,7 +596,7 @@ xfs_buf_lookup(
 	int			error;
 
 	rcu_read_lock();
-	bp = rhashtable_lookup(&pag->pag_buf_hash, map, xfs_buf_hash_params);
+	bp = rhashtable_lookup(&bch->bc_hash, map, xfs_buf_hash_params);
 	if (!bp || !atomic_inc_not_zero(&bp->b_hold)) {
 		rcu_read_unlock();
 		return -ENOENT;
@@ -607,6 +621,7 @@ xfs_buf_lookup(
 static int
 xfs_buf_find_insert(
 	struct xfs_buftarg	*btp,
+	struct xfs_buf_cache	*bch,
 	struct xfs_perag	*pag,
 	struct xfs_buf_map	*cmap,
 	struct xfs_buf_map	*map,
@@ -622,31 +637,33 @@ xfs_buf_find_insert(
 	if (error)
 		goto out_drop_pag;
 
-	/*
-	 * For buffers that fit entirely within a single page, first attempt to
-	 * allocate the memory from the heap to minimise memory usage. If we
-	 * can't get heap memory for these small buffers, we fall back to using
-	 * the page allocator.
-	 */
-	if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
-	    xfs_buf_alloc_kmem(new_bp, flags) < 0) {
+	if (xfs_buftarg_is_mem(new_bp->b_target)) {
+		error = xmbuf_map_page(new_bp);
+	} else if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
+		   xfs_buf_alloc_kmem(new_bp, flags) < 0) {
+		/*
+		 * For buffers that fit entirely within a single page, first
+		 * attempt to allocate the memory from the heap to minimise
+		 * memory usage. If we can't get heap memory for these small
+		 * buffers, we fall back to using the page allocator.
+		 */
 		error = xfs_buf_alloc_pages(new_bp, flags);
-		if (error)
-			goto out_free_buf;
 	}
+	if (error)
+		goto out_free_buf;
 
-	spin_lock(&pag->pag_buf_lock);
-	bp = rhashtable_lookup_get_insert_fast(&pag->pag_buf_hash,
+	spin_lock(&bch->bc_lock);
+	bp = rhashtable_lookup_get_insert_fast(&bch->bc_hash,
 			&new_bp->b_rhash_head, xfs_buf_hash_params);
 	if (IS_ERR(bp)) {
 		error = PTR_ERR(bp);
-		spin_unlock(&pag->pag_buf_lock);
+		spin_unlock(&bch->bc_lock);
 		goto out_free_buf;
 	}
 	if (bp) {
 		/* found an existing buffer */
 		atomic_inc(&bp->b_hold);
-		spin_unlock(&pag->pag_buf_lock);
+		spin_unlock(&bch->bc_lock);
 		error = xfs_buf_find_lock(bp, flags);
 		if (error)
 			xfs_buf_rele(bp);
@@ -657,17 +674,40 @@ xfs_buf_find_insert(
 
 	/* The new buffer keeps the perag reference until it is freed. */
 	new_bp->b_pag = pag;
-	spin_unlock(&pag->pag_buf_lock);
+	spin_unlock(&bch->bc_lock);
 	*bpp = new_bp;
 	return 0;
 
 out_free_buf:
 	xfs_buf_free(new_bp);
 out_drop_pag:
-	xfs_perag_put(pag);
+	if (pag)
+		xfs_perag_put(pag);
 	return error;
 }
 
+static inline struct xfs_perag *
+xfs_buftarg_get_pag(
+	struct xfs_buftarg		*btp,
+	const struct xfs_buf_map	*map)
+{
+	struct xfs_mount		*mp = btp->bt_mount;
+
+	if (xfs_buftarg_is_mem(btp))
+		return NULL;
+	return xfs_perag_get(mp, xfs_daddr_to_agno(mp, map->bm_bn));
+}
+
+static inline struct xfs_buf_cache *
+xfs_buftarg_buf_cache(
+	struct xfs_buftarg		*btp,
+	struct xfs_perag		*pag)
+{
+	if (pag)
+		return &pag->pag_bcache;
+	return btp->bt_cache;
+}
+
 /*
  * Assembles a buffer covering the specified range. The code is optimised for
  * cache hits, as metadata intensive workloads will see 3 orders of magnitude
@@ -681,6 +721,7 @@ xfs_buf_get_map(
 	xfs_buf_flags_t		flags,
 	struct xfs_buf		**bpp)
 {
+	struct xfs_buf_cache	*bch;
 	struct xfs_perag	*pag;
 	struct xfs_buf		*bp = NULL;
 	struct xfs_buf_map	cmap = { .bm_bn = map[0].bm_bn };
@@ -696,10 +737,10 @@ xfs_buf_get_map(
 	if (error)
 		return error;
 
-	pag = xfs_perag_get(btp->bt_mount,
-			    xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
+	pag = xfs_buftarg_get_pag(btp, &cmap);
+	bch = xfs_buftarg_buf_cache(btp, pag);
 
-	error = xfs_buf_lookup(pag, &cmap, flags, &bp);
+	error = xfs_buf_lookup(bch, &cmap, flags, &bp);
 	if (error && error != -ENOENT)
 		goto out_put_perag;
 
@@ -711,13 +752,14 @@ xfs_buf_get_map(
 			goto out_put_perag;
 
 		/* xfs_buf_find_insert() consumes the perag reference. */
-		error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps,
+		error = xfs_buf_find_insert(btp, bch, pag, &cmap, map, nmaps,
 				flags, &bp);
 		if (error)
 			return error;
 	} else {
 		XFS_STATS_INC(btp->bt_mount, xb_get_locked);
-		xfs_perag_put(pag);
+		if (pag)
+			xfs_perag_put(pag);
 	}
 
 	/* We do not hold a perag reference anymore. */
@@ -745,7 +787,8 @@ xfs_buf_get_map(
 	return 0;
 
 out_put_perag:
-	xfs_perag_put(pag);
+	if (pag)
+		xfs_perag_put(pag);
 	return error;
 }
 
@@ -892,6 +935,13 @@ xfs_buf_readahead_map(
 {
 	struct xfs_buf		*bp;
 
+	/*
+	 * Currently we don't have a good means or justification for performing
+	 * xmbuf_map_page asynchronously, so we don't do readahead.
+	 */
+	if (xfs_buftarg_is_mem(target))
+		return;
+
 	xfs_buf_read_map(target, map, nmaps,
 		     XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
 		     __this_address);
@@ -957,7 +1007,10 @@ xfs_buf_get_uncached(
 	if (error)
 		return error;
 
-	error = xfs_buf_alloc_pages(bp, flags);
+	if (xfs_buftarg_is_mem(bp->b_target))
+		error = xmbuf_map_page(bp);
+	else
+		error = xfs_buf_alloc_pages(bp, flags);
 	if (error)
 		goto fail_free_buf;
 
@@ -990,29 +1043,29 @@ xfs_buf_hold(
 	atomic_inc(&bp->b_hold);
 }
 
-/*
- * Release a hold on the specified buffer. If the hold count is 1, the buffer is
- * placed on LRU or freed (depending on b_lru_ref).
- */
-void
-xfs_buf_rele(
+static void
+xfs_buf_rele_uncached(
+	struct xfs_buf		*bp)
+{
+	ASSERT(list_empty(&bp->b_lru));
+	if (atomic_dec_and_test(&bp->b_hold)) {
+		xfs_buf_ioacct_dec(bp);
+		xfs_buf_free(bp);
+	}
+}
+
+static void
+xfs_buf_rele_cached(
 	struct xfs_buf		*bp)
 {
+	struct xfs_buftarg	*btp = bp->b_target;
 	struct xfs_perag	*pag = bp->b_pag;
+	struct xfs_buf_cache	*bch = xfs_buftarg_buf_cache(btp, pag);
 	bool			release;
 	bool			freebuf = false;
 
 	trace_xfs_buf_rele(bp, _RET_IP_);
 
-	if (!pag) {
-		ASSERT(list_empty(&bp->b_lru));
-		if (atomic_dec_and_test(&bp->b_hold)) {
-			xfs_buf_ioacct_dec(bp);
-			xfs_buf_free(bp);
-		}
-		return;
-	}
-
 	ASSERT(atomic_read(&bp->b_hold) > 0);
 
 	/*
@@ -1026,7 +1079,7 @@ xfs_buf_rele(
 	 * leading to a use-after-free scenario.
 	 */
 	spin_lock(&bp->b_lock);
-	release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
+	release = atomic_dec_and_lock(&bp->b_hold, &bch->bc_lock);
 	if (!release) {
 		/*
 		 * Drop the in-flight state if the buffer is already on the LRU
@@ -1047,11 +1100,11 @@ xfs_buf_rele(
 		 * buffer for the LRU and clear the (now stale) dispose list
 		 * state flag
 		 */
-		if (list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru)) {
+		if (list_lru_add_obj(&btp->bt_lru, &bp->b_lru)) {
 			bp->b_state &= ~XFS_BSTATE_DISPOSE;
 			atomic_inc(&bp->b_hold);
 		}
-		spin_unlock(&pag->pag_buf_lock);
+		spin_unlock(&bch->bc_lock);
 	} else {
 		/*
 		 * most of the time buffers will already be removed from the
@@ -1060,16 +1113,17 @@ xfs_buf_rele(
 		 * was on was the disposal list
 		 */
 		if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
-			list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
+			list_lru_del_obj(&btp->bt_lru, &bp->b_lru);
 		} else {
 			ASSERT(list_empty(&bp->b_lru));
 		}
 
 		ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
-		rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head,
-				       xfs_buf_hash_params);
-		spin_unlock(&pag->pag_buf_lock);
-		xfs_perag_put(pag);
+		rhashtable_remove_fast(&bch->bc_hash, &bp->b_rhash_head,
+				xfs_buf_hash_params);
+		spin_unlock(&bch->bc_lock);
+		if (pag)
+			xfs_perag_put(pag);
 		freebuf = true;
 	}
 
@@ -1080,6 +1134,19 @@ out_unlock:
 		xfs_buf_free(bp);
 }
 
+/*
+ * Release a hold on the specified buffer.
+ */
+void
+xfs_buf_rele(
+	struct xfs_buf		*bp)
+{
+	trace_xfs_buf_rele(bp, _RET_IP_);
+	if (xfs_buf_is_uncached(bp))
+		xfs_buf_rele_uncached(bp);
+	else
+		xfs_buf_rele_cached(bp);
+}
 
 /*
  *	Lock a buffer object, if it is not already locked.
@@ -1585,6 +1652,12 @@ _xfs_buf_ioapply(
 	/* we only use the buffer cache for meta-data */
 	op |= REQ_META;
 
+	/* in-memory targets are directly mapped, no IO required. */
+	if (xfs_buftarg_is_mem(bp->b_target)) {
+		xfs_buf_ioend(bp);
+		return;
+	}
+
 	/*
 	 * Walk all the vectors issuing IO on them. Set up the initial offset
 	 * into the buffer and the desired IO size before we start -
@@ -1940,25 +2013,30 @@ xfs_buftarg_shrink_count(
 }
 
 void
-xfs_free_buftarg(
+xfs_destroy_buftarg(
 	struct xfs_buftarg	*btp)
 {
 	shrinker_free(btp->bt_shrinker);
 	ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
 	percpu_counter_destroy(&btp->bt_io_count);
 	list_lru_destroy(&btp->bt_lru);
+}
 
+void
+xfs_free_buftarg(
+	struct xfs_buftarg	*btp)
+{
+	xfs_destroy_buftarg(btp);
 	fs_put_dax(btp->bt_daxdev, btp->bt_mount);
 	/* the main block device is closed by kill_block_super */
 	if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev)
-		bdev_release(btp->bt_bdev_handle);
-
-	kmem_free(btp);
+		bdev_fput(btp->bt_bdev_file);
+	kfree(btp);
 }
 
 int
 xfs_setsize_buftarg(
-	xfs_buftarg_t		*btp,
+	struct xfs_buftarg	*btp,
 	unsigned int		sectorsize)
 {
 	/* Set up metadata sector size info */
@@ -1972,80 +2050,82 @@ xfs_setsize_buftarg(
 		return -EINVAL;
 	}
 
-	/* Set up device logical sector size mask */
-	btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
-	btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;
-
 	return 0;
 }
 
-/*
- * When allocating the initial buffer target we have not yet
- * read in the superblock, so don't know what sized sectors
- * are being used at this early stage.  Play safe.
- */
-STATIC int
-xfs_setsize_buftarg_early(
-	xfs_buftarg_t		*btp)
+int
+xfs_init_buftarg(
+	struct xfs_buftarg		*btp,
+	size_t				logical_sectorsize,
+	const char			*descr)
 {
-	return xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev));
+	/* Set up device logical sector size mask */
+	btp->bt_logical_sectorsize = logical_sectorsize;
+	btp->bt_logical_sectormask = logical_sectorsize - 1;
+
+	/*
+	 * Buffer IO error rate limiting. Limit it to no more than 10 messages
+	 * per 30 seconds so as to not spam logs too much on repeated errors.
+	 */
+	ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
+			     DEFAULT_RATELIMIT_BURST);
+
+	if (list_lru_init(&btp->bt_lru))
+		return -ENOMEM;
+	if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
+		goto out_destroy_lru;
+
+	btp->bt_shrinker =
+		shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s", descr);
+	if (!btp->bt_shrinker)
+		goto out_destroy_io_count;
+	btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count;
+	btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan;
+	btp->bt_shrinker->private_data = btp;
+	shrinker_register(btp->bt_shrinker);
+	return 0;
+
+out_destroy_io_count:
+	percpu_counter_destroy(&btp->bt_io_count);
+out_destroy_lru:
+	list_lru_destroy(&btp->bt_lru);
+	return -ENOMEM;
 }
 
 struct xfs_buftarg *
 xfs_alloc_buftarg(
 	struct xfs_mount	*mp,
-	struct bdev_handle	*bdev_handle)
+	struct file		*bdev_file)
 {
-	xfs_buftarg_t		*btp;
+	struct xfs_buftarg	*btp;
 	const struct dax_holder_operations *ops = NULL;
 
 #if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE)
 	ops = &xfs_dax_holder_operations;
 #endif
-	btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
+	btp = kzalloc(sizeof(*btp), GFP_KERNEL | __GFP_NOFAIL);
 
 	btp->bt_mount = mp;
-	btp->bt_bdev_handle = bdev_handle;
-	btp->bt_dev = bdev_handle->bdev->bd_dev;
-	btp->bt_bdev = bdev_handle->bdev;
+	btp->bt_bdev_file = bdev_file;
+	btp->bt_bdev = file_bdev(bdev_file);
+	btp->bt_dev = btp->bt_bdev->bd_dev;
 	btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off,
 					    mp, ops);
 
 	/*
-	 * Buffer IO error rate limiting. Limit it to no more than 10 messages
-	 * per 30 seconds so as to not spam logs too much on repeated errors.
+	 * When allocating the buftargs we have not yet read the super block and
+	 * thus don't know the file system sector size yet.
 	 */
-	ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
-			     DEFAULT_RATELIMIT_BURST);
-
-	if (xfs_setsize_buftarg_early(btp))
+	if (xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev)))
 		goto error_free;
-
-	if (list_lru_init(&btp->bt_lru))
+	if (xfs_init_buftarg(btp, bdev_logical_block_size(btp->bt_bdev),
+			mp->m_super->s_id))
 		goto error_free;
 
-	if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
-		goto error_lru;
-
-	btp->bt_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s",
-					  mp->m_super->s_id);
-	if (!btp->bt_shrinker)
-		goto error_pcpu;
-
-	btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count;
-	btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan;
-	btp->bt_shrinker->private_data = btp;
-
-	shrinker_register(btp->bt_shrinker);
-
 	return btp;
 
-error_pcpu:
-	percpu_counter_destroy(&btp->bt_io_count);
-error_lru:
-	list_lru_destroy(&btp->bt_lru);
 error_free:
-	kmem_free(btp);
+	kfree(btp);
 	return NULL;
 }
 
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index b470de08a46c..b1580644501f 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -83,6 +83,14 @@ typedef unsigned int xfs_buf_flags_t;
 #define XFS_BSTATE_DISPOSE	 (1 << 0)	/* buffer being discarded */
 #define XFS_BSTATE_IN_FLIGHT	 (1 << 1)	/* I/O in flight */
 
+struct xfs_buf_cache {
+	spinlock_t		bc_lock;
+	struct rhashtable	bc_hash;
+};
+
+int xfs_buf_cache_init(struct xfs_buf_cache *bch);
+void xfs_buf_cache_destroy(struct xfs_buf_cache *bch);
+
 /*
  * The xfs_buftarg contains 2 notions of "sector size" -
  *
@@ -96,11 +104,12 @@ typedef unsigned int xfs_buf_flags_t;
  * The latter is derived from the underlying device, and controls direct IO
  * alignment constraints.
  */
-typedef struct xfs_buftarg {
+struct xfs_buftarg {
 	dev_t			bt_dev;
-	struct bdev_handle	*bt_bdev_handle;
+	struct file		*bt_bdev_file;
 	struct block_device	*bt_bdev;
 	struct dax_device	*bt_daxdev;
+	struct file		*bt_file;
 	u64			bt_dax_part_off;
 	struct xfs_mount	*bt_mount;
 	unsigned int		bt_meta_sectorsize;
@@ -114,7 +123,10 @@ typedef struct xfs_buftarg {
 
 	struct percpu_counter	bt_io_count;
 	struct ratelimit_state	bt_ioerror_rl;
-} xfs_buftarg_t;
+
+	/* built-in cache, if we're not using the perag one */
+	struct xfs_buf_cache	bt_cache[];
+};
 
 #define XB_PAGES	2
 
@@ -366,7 +378,7 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset)
  *	Handling of buftargs.
  */
 struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp,
-		struct bdev_handle *bdev_handle);
+		struct file *bdev_file);
 extern void xfs_free_buftarg(struct xfs_buftarg *);
 extern void xfs_buftarg_wait(struct xfs_buftarg *);
 extern void xfs_buftarg_drain(struct xfs_buftarg *);
@@ -379,4 +391,9 @@ int xfs_buf_reverify(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
 bool xfs_verify_magic(struct xfs_buf *bp, __be32 dmagic);
 bool xfs_verify_magic16(struct xfs_buf *bp, __be16 dmagic);
 
+/* for xfs_buf_mem.c only: */
+int xfs_init_buftarg(struct xfs_buftarg *btp, size_t logical_sectorsize,
+		const char *descr);
+void xfs_destroy_buftarg(struct xfs_buftarg *btp);
+
 #endif	/* __XFS_BUF_H__ */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 023d4e0385dd..43031842341a 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -805,8 +805,8 @@ xfs_buf_item_get_format(
 		return;
 	}
 
-	bip->bli_formats = kmem_zalloc(count * sizeof(struct xfs_buf_log_format),
-				0);
+	bip->bli_formats = kzalloc(count * sizeof(struct xfs_buf_log_format),
+				GFP_KERNEL | __GFP_NOFAIL);
 }
 
 STATIC void
@@ -814,7 +814,7 @@ xfs_buf_item_free_format(
 	struct xfs_buf_log_item	*bip)
 {
 	if (bip->bli_formats != &bip->__bli_format) {
-		kmem_free(bip->bli_formats);
+		kfree(bip->bli_formats);
 		bip->bli_formats = NULL;
 	}
 }
@@ -1044,7 +1044,7 @@ xfs_buf_item_free(
 	struct xfs_buf_log_item	*bip)
 {
 	xfs_buf_item_free_format(bip);
-	kmem_free(bip->bli_item.li_lv_shadow);
+	kvfree(bip->bli_item.li_lv_shadow);
 	kmem_cache_free(xfs_buf_item_cache, bip);
 }
 
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index 43167f543afc..09e893cf563c 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -85,7 +85,7 @@ xlog_add_buffer_cancelled(
 		return false;
 	}
 
-	bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), 0);
+	bcp = kmalloc(sizeof(struct xfs_buf_cancel), GFP_KERNEL | __GFP_NOFAIL);
 	bcp->bc_blkno = blkno;
 	bcp->bc_len = len;
 	bcp->bc_refcount = 1;
@@ -129,7 +129,7 @@ xlog_put_buffer_cancelled(
 
 	if (--bcp->bc_refcount == 0) {
 		list_del(&bcp->bc_list);
-		kmem_free(bcp);
+		kfree(bcp);
 	}
 	return true;
 }
@@ -1062,10 +1062,10 @@ xlog_free_buf_cancel_table(
 				&log->l_buf_cancel_table[i],
 				struct xfs_buf_cancel, bc_list))) {
 			list_del(&bc->bc_list);
-			kmem_free(bc);
+			kfree(bc);
 		}
 	}
 
-	kmem_free(log->l_buf_cancel_table);
+	kfree(log->l_buf_cancel_table);
 	log->l_buf_cancel_table = NULL;
 }
diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c
new file mode 100644
index 000000000000..9bb2d24de709
--- /dev/null
+++ b/fs/xfs/xfs_buf_mem.c
@@ -0,0 +1,270 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2023-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_buf.h"
+#include "xfs_buf_mem.h"
+#include "xfs_trace.h"
+#include <linux/shmem_fs.h>
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_buf_item.h"
+#include "xfs_error.h"
+
+/*
+ * Buffer Cache for In-Memory Files
+ * ================================
+ *
+ * Online fsck wants to create ephemeral ordered recordsets.  The existing
+ * btree infrastructure can do this, but we need the buffer cache to target
+ * memory instead of block devices.
+ *
+ * When CONFIG_TMPFS=y, shmemfs is enough of a filesystem to meet those
+ * requirements.  Therefore, the xmbuf mechanism uses an unlinked shmem file to
+ * store our staging data.  This file is not installed in the file descriptor
+ * table so that user programs cannot access the data, which means that the
+ * xmbuf must be freed with xmbuf_destroy.
+ *
+ * xmbufs assume that the caller will handle all required concurrency
+ * management; standard vfs locks (freezer and inode) are not taken.  Reads
+ * and writes are satisfied directly from the page cache.
+ *
+ * The only supported block size is PAGE_SIZE, and we cannot use highmem.
+ */
+
+/*
+ * shmem files used to back an in-memory buffer cache must not be exposed to
+ * userspace.  Upper layers must coordinate access to the one handle returned
+ * by the constructor, so establish a separate lock class for xmbufs to avoid
+ * confusing lockdep.
+ */
+static struct lock_class_key xmbuf_i_mutex_key;
+
+/*
+ * Allocate a buffer cache target for a memory-backed file and set up the
+ * buffer target.
+ */
+int
+xmbuf_alloc(
+	struct xfs_mount	*mp,
+	const char		*descr,
+	struct xfs_buftarg	**btpp)
+{
+	struct file		*file;
+	struct inode		*inode;
+	struct xfs_buftarg	*btp;
+	int			error;
+
+	btp = kzalloc(struct_size(btp, bt_cache, 1), GFP_KERNEL);
+	if (!btp)
+		return -ENOMEM;
+
+	file = shmem_kernel_file_setup(descr, 0, 0);
+	if (IS_ERR(file)) {
+		error = PTR_ERR(file);
+		goto out_free_btp;
+	}
+	inode = file_inode(file);
+
+	/* private file, private locking */
+	lockdep_set_class(&inode->i_rwsem, &xmbuf_i_mutex_key);
+
+	/*
+	 * We don't want to bother with kmapping data during repair, so don't
+	 * allow highmem pages to back this mapping.
+	 */
+	mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
+
+	/* ensure all writes are below EOF to avoid pagecache zeroing */
+	i_size_write(inode, inode->i_sb->s_maxbytes);
+
+	error = xfs_buf_cache_init(btp->bt_cache);
+	if (error)
+		goto out_file;
+
+	/* Initialize buffer target */
+	btp->bt_mount = mp;
+	btp->bt_dev = (dev_t)-1U;
+	btp->bt_bdev = NULL; /* in-memory buftargs have no bdev */
+	btp->bt_file = file;
+	btp->bt_meta_sectorsize = XMBUF_BLOCKSIZE;
+	btp->bt_meta_sectormask = XMBUF_BLOCKSIZE - 1;
+
+	error = xfs_init_buftarg(btp, XMBUF_BLOCKSIZE, descr);
+	if (error)
+		goto out_bcache;
+
+	trace_xmbuf_create(btp);
+
+	*btpp = btp;
+	return 0;
+
+out_bcache:
+	xfs_buf_cache_destroy(btp->bt_cache);
+out_file:
+	fput(file);
+out_free_btp:
+	kfree(btp);
+	return error;
+}
+
+/* Free a buffer cache target for a memory-backed buffer cache. */
+void
+xmbuf_free(
+	struct xfs_buftarg	*btp)
+{
+	ASSERT(xfs_buftarg_is_mem(btp));
+	ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
+
+	trace_xmbuf_free(btp);
+
+	xfs_destroy_buftarg(btp);
+	xfs_buf_cache_destroy(btp->bt_cache);
+	fput(btp->bt_file);
+	kfree(btp);
+}
+
+/* Directly map a shmem page into the buffer cache. */
+int
+xmbuf_map_page(
+	struct xfs_buf		*bp)
+{
+	struct inode		*inode = file_inode(bp->b_target->bt_file);
+	struct folio		*folio = NULL;
+	struct page		*page;
+	loff_t                  pos = BBTOB(xfs_buf_daddr(bp));
+	int			error;
+
+	ASSERT(xfs_buftarg_is_mem(bp->b_target));
+
+	if (bp->b_map_count != 1)
+		return -ENOMEM;
+	if (BBTOB(bp->b_length) != XMBUF_BLOCKSIZE)
+		return -ENOMEM;
+	if (offset_in_page(pos) != 0) {
+		ASSERT(offset_in_page(pos));
+		return -ENOMEM;
+	}
+
+	error = shmem_get_folio(inode, pos >> PAGE_SHIFT, &folio, SGP_CACHE);
+	if (error)
+		return error;
+
+	if (filemap_check_wb_err(inode->i_mapping, 0)) {
+		folio_unlock(folio);
+		folio_put(folio);
+		return -EIO;
+	}
+
+	page = folio_file_page(folio, pos >> PAGE_SHIFT);
+
+	/*
+	 * Mark the page dirty so that it won't be reclaimed once we drop the
+	 * (potentially last) reference in xmbuf_unmap_page.
+	 */
+	set_page_dirty(page);
+	unlock_page(page);
+
+	bp->b_addr = page_address(page);
+	bp->b_pages = bp->b_page_array;
+	bp->b_pages[0] = page;
+	bp->b_page_count = 1;
+	return 0;
+}
+
+/* Unmap a shmem page that was mapped into the buffer cache. */
+void
+xmbuf_unmap_page(
+	struct xfs_buf		*bp)
+{
+	struct page		*page = bp->b_pages[0];
+
+	ASSERT(xfs_buftarg_is_mem(bp->b_target));
+
+	put_page(page);
+
+	bp->b_addr = NULL;
+	bp->b_pages[0] = NULL;
+	bp->b_pages = NULL;
+	bp->b_page_count = 0;
+}
+
+/* Is this a valid daddr within the buftarg? */
+bool
+xmbuf_verify_daddr(
+	struct xfs_buftarg	*btp,
+	xfs_daddr_t		daddr)
+{
+	struct inode		*inode = file_inode(btp->bt_file);
+
+	ASSERT(xfs_buftarg_is_mem(btp));
+
+	return daddr < (inode->i_sb->s_maxbytes >> BBSHIFT);
+}
+
+/* Discard the page backing this buffer. */
+static void
+xmbuf_stale(
+	struct xfs_buf		*bp)
+{
+	struct inode		*inode = file_inode(bp->b_target->bt_file);
+	loff_t			pos;
+
+	ASSERT(xfs_buftarg_is_mem(bp->b_target));
+
+	pos = BBTOB(xfs_buf_daddr(bp));
+	shmem_truncate_range(inode, pos, pos + BBTOB(bp->b_length) - 1);
+}
+
+/*
+ * Finalize a buffer -- discard the backing page if it's stale, or run the
+ * write verifier to detect problems.
+ */
+int
+xmbuf_finalize(
+	struct xfs_buf		*bp)
+{
+	xfs_failaddr_t		fa;
+	int			error = 0;
+
+	if (bp->b_flags & XBF_STALE) {
+		xmbuf_stale(bp);
+		return 0;
+	}
+
+	/*
+	 * Although this btree is ephemeral, validate the buffer structure so
+	 * that we can detect memory corruption errors and software bugs.
+	 */
+	fa = bp->b_ops->verify_struct(bp);
+	if (fa) {
+		error = -EFSCORRUPTED;
+		xfs_verifier_error(bp, error, fa);
+	}
+
+	return error;
+}
+
+/*
+ * Detach this xmbuf buffer from the transaction by any means necessary.
+ * All buffers are direct-mapped, so they do not need bwrite.
+ */
+void
+xmbuf_trans_bdetach(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*bp)
+{
+	struct xfs_buf_log_item	*bli = bp->b_log_item;
+
+	ASSERT(bli != NULL);
+
+	bli->bli_flags &= ~(XFS_BLI_DIRTY | XFS_BLI_ORDERED |
+			    XFS_BLI_LOGGED | XFS_BLI_STALE);
+	clear_bit(XFS_LI_DIRTY, &bli->bli_item.li_flags);
+
+	while (bp->b_log_item != NULL)
+		xfs_trans_bdetach(tp, bp);
+}
diff --git a/fs/xfs/xfs_buf_mem.h b/fs/xfs/xfs_buf_mem.h
new file mode 100644
index 000000000000..eed4a7b63232
--- /dev/null
+++ b/fs/xfs/xfs_buf_mem.h
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2023-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_BUF_MEM_H__
+#define __XFS_BUF_MEM_H__
+
+#define XMBUF_BLOCKSIZE			(PAGE_SIZE)
+#define XMBUF_BLOCKSHIFT		(PAGE_SHIFT)
+
+#ifdef CONFIG_XFS_MEMORY_BUFS
+static inline bool xfs_buftarg_is_mem(const struct xfs_buftarg *btp)
+{
+	return btp->bt_bdev == NULL;
+}
+
+int xmbuf_alloc(struct xfs_mount *mp, const char *descr,
+		struct xfs_buftarg **btpp);
+void xmbuf_free(struct xfs_buftarg *btp);
+
+int xmbuf_map_page(struct xfs_buf *bp);
+void xmbuf_unmap_page(struct xfs_buf *bp);
+bool xmbuf_verify_daddr(struct xfs_buftarg *btp, xfs_daddr_t daddr);
+void xmbuf_trans_bdetach(struct xfs_trans *tp, struct xfs_buf *bp);
+int xmbuf_finalize(struct xfs_buf *bp);
+#else
+# define xfs_buftarg_is_mem(...)	(false)
+# define xmbuf_map_page(...)		(-ENOMEM)
+# define xmbuf_unmap_page(...)		((void)0)
+# define xmbuf_verify_daddr(...)	(false)
+#endif /* CONFIG_XFS_MEMORY_BUFS */
+
+#endif /* __XFS_BUF_MEM_H__ */
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index cc6dc56f455d..cf9296b7e06f 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -118,8 +118,10 @@ xfs_dir2_sf_getdents(
 		ctx->pos = off & 0x7fffffff;
 		if (XFS_IS_CORRUPT(dp->i_mount,
 				   !xfs_dir2_namecheck(sfep->name,
-						       sfep->namelen)))
+						       sfep->namelen))) {
+			xfs_dirattr_mark_sick(dp, XFS_DATA_FORK);
 			return -EFSCORRUPTED;
+		}
 		if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino,
 			    xfs_dir3_get_dtype(mp, filetype)))
 			return 0;
@@ -211,6 +213,7 @@ xfs_dir2_block_getdents(
 		if (XFS_IS_CORRUPT(dp->i_mount,
 				   !xfs_dir2_namecheck(dep->name,
 						       dep->namelen))) {
+			xfs_dirattr_mark_sick(dp, XFS_DATA_FORK);
 			error = -EFSCORRUPTED;
 			goto out_rele;
 		}
@@ -465,6 +468,7 @@ xfs_dir2_leaf_getdents(
 		if (XFS_IS_CORRUPT(dp->i_mount,
 				   !xfs_dir2_namecheck(dep->name,
 						       dep->namelen))) {
+			xfs_dirattr_mark_sick(dp, XFS_DATA_FORK);
 			error = -EFSCORRUPTED;
 			break;
 		}
@@ -522,7 +526,7 @@ xfs_readdir(
 		return -EIO;
 
 	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
-	ASSERT(xfs_isilocked(dp, XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+	xfs_assert_ilocked(dp, XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL);
 	XFS_STATS_INC(dp->i_mount, xs_dir_getdents);
 
 	args.dp = dp;
diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index d5787991bb5b..268bb734dc0a 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -8,6 +8,7 @@
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
+#include "xfs_trans.h"
 #include "xfs_mount.h"
 #include "xfs_btree.h"
 #include "xfs_alloc_btree.h"
@@ -18,6 +19,7 @@
 #include "xfs_trace.h"
 #include "xfs_log.h"
 #include "xfs_ag.h"
+#include "xfs_health.h"
 
 /*
  * Notes on an efficient, low latency fstrim algorithm
@@ -79,7 +81,7 @@ xfs_discard_endio_work(
 		container_of(work, struct xfs_busy_extents, endio_work);
 
 	xfs_extent_busy_clear(extents->mount, &extents->extent_list, false);
-	kmem_free(extents->owner);
+	kfree(extents->owner);
 }
 
 /*
@@ -120,7 +122,7 @@ xfs_discard_extents(
 		error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
 				XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
 				XFS_FSB_TO_BB(mp, busyp->length),
-				GFP_NOFS, &bio);
+				GFP_KERNEL, &bio);
 		if (error && error != -EOPNOTSUPP) {
 			xfs_info(mp,
 	 "discard failed for extent [0x%llx,%u], error %d",
@@ -155,6 +157,7 @@ xfs_trim_gather_extents(
 	uint64_t		*blocks_trimmed)
 {
 	struct xfs_mount	*mp = pag->pag_mount;
+	struct xfs_trans	*tp;
 	struct xfs_btree_cur	*cur;
 	struct xfs_buf		*agbp;
 	int			error;
@@ -168,11 +171,15 @@ xfs_trim_gather_extents(
 	 */
 	xfs_log_force(mp, XFS_LOG_SYNC);
 
-	error = xfs_alloc_read_agf(pag, NULL, 0, &agbp);
+	error = xfs_trans_alloc_empty(mp, &tp);
 	if (error)
 		return error;
 
-	cur = xfs_allocbt_init_cursor(mp, NULL, agbp, pag, XFS_BTNUM_CNT);
+	error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
+	if (error)
+		goto out_trans_cancel;
+
+	cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
 
 	/*
 	 * Look up the extent length requested in the AGF and start with it.
@@ -204,6 +211,7 @@ xfs_trim_gather_extents(
 		if (error)
 			break;
 		if (XFS_IS_CORRUPT(mp, i != 1)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			break;
 		}
@@ -279,7 +287,8 @@ next_extent:
 		xfs_extent_busy_clear(mp, &extents->extent_list, false);
 out_del_cursor:
 	xfs_btree_del_cursor(cur, error);
-	xfs_buf_relse(agbp);
+out_trans_cancel:
+	xfs_trans_cancel(tp);
 	return error;
 }
 
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index b4f20d9c8f98..c98cb468c357 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -24,6 +24,7 @@
 #include "xfs_log.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_error.h"
+#include "xfs_health.h"
 
 /*
  * Lock order:
@@ -44,6 +45,29 @@ static struct kmem_cache	*xfs_dquot_cache;
 static struct lock_class_key xfs_dquot_group_class;
 static struct lock_class_key xfs_dquot_project_class;
 
+/* Record observations of quota corruption with the health tracking system. */
+static void
+xfs_dquot_mark_sick(
+	struct xfs_dquot	*dqp)
+{
+	struct xfs_mount	*mp = dqp->q_mount;
+
+	switch (dqp->q_type) {
+	case XFS_DQTYPE_USER:
+		xfs_fs_mark_sick(mp, XFS_SICK_FS_UQUOTA);
+		break;
+	case XFS_DQTYPE_GROUP:
+		xfs_fs_mark_sick(mp, XFS_SICK_FS_GQUOTA);
+		break;
+	case XFS_DQTYPE_PROJ:
+		xfs_fs_mark_sick(mp, XFS_SICK_FS_PQUOTA);
+		break;
+	default:
+		ASSERT(0);
+		break;
+	}
+}
+
 /*
  * This is called to free all the memory associated with a dquot
  */
@@ -53,7 +77,7 @@ xfs_qm_dqdestroy(
 {
 	ASSERT(list_empty(&dqp->q_lru));
 
-	kmem_free(dqp->q_logitem.qli_item.li_lv_shadow);
+	kvfree(dqp->q_logitem.qli_item.li_lv_shadow);
 	mutex_destroy(&dqp->q_qlock);
 
 	XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot);
@@ -451,6 +475,8 @@ xfs_dquot_disk_read(
 	error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
 			mp->m_quotainfo->qi_dqchunklen, 0, &bp,
 			&xfs_dquot_buf_ops);
+	if (xfs_metadata_is_sick(error))
+		xfs_dquot_mark_sick(dqp);
 	if (error) {
 		ASSERT(bp == NULL);
 		return error;
@@ -574,6 +600,7 @@ xfs_dquot_from_disk(
 			  "Metadata corruption detected at %pS, quota %u",
 			  __this_address, dqp->q_id);
 		xfs_alert(bp->b_mount, "Unmount and run xfs_repair");
+		xfs_dquot_mark_sick(dqp);
 		return -EFSCORRUPTED;
 	}
 
@@ -784,6 +811,12 @@ restart:
  * caller should throw away the dquot and start over.  Otherwise, the dquot
  * is returned locked (and held by the cache) as if there had been a cache
  * hit.
+ *
+ * The insert needs to be done under memalloc_nofs context because the radix
+ * tree can do memory allocation during insert. The qi->qi_tree_lock is taken in
+ * memory reclaim when freeing unused dquots, so we cannot have the radix tree
+ * node allocation recursing into filesystem reclaim whilst we hold the
+ * qi_tree_lock.
  */
 static int
 xfs_qm_dqget_cache_insert(
@@ -793,25 +826,27 @@ xfs_qm_dqget_cache_insert(
 	xfs_dqid_t		id,
 	struct xfs_dquot	*dqp)
 {
+	unsigned int		nofs_flags;
 	int			error;
 
+	nofs_flags = memalloc_nofs_save();
 	mutex_lock(&qi->qi_tree_lock);
 	error = radix_tree_insert(tree, id, dqp);
 	if (unlikely(error)) {
 		/* Duplicate found!  Caller must try again. */
-		mutex_unlock(&qi->qi_tree_lock);
 		trace_xfs_dqget_dup(dqp);
-		return error;
+		goto out_unlock;
 	}
 
 	/* Return a locked dquot to the caller, with a reference taken. */
 	xfs_dqlock(dqp);
 	dqp->q_nrefs = 1;
-
 	qi->qi_dquots++;
-	mutex_unlock(&qi->qi_tree_lock);
 
-	return 0;
+out_unlock:
+	mutex_unlock(&qi->qi_tree_lock);
+	memalloc_nofs_restore(nofs_flags);
+	return error;
 }
 
 /* Check our input parameters. */
@@ -950,7 +985,7 @@ xfs_qm_dqget_inode(
 	if (error)
 		return error;
 
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
 	ASSERT(xfs_inode_dquot(ip, type) == NULL);
 
 	id = xfs_qm_id_for_quotatype(ip, type);
@@ -1007,7 +1042,7 @@ restart:
 	}
 
 dqret:
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
 	trace_xfs_dqget_miss(dqp);
 	*O_dqpp = dqp;
 	return 0;
@@ -1238,6 +1273,8 @@ xfs_qm_dqflush(
 				   &bp, &xfs_dquot_buf_ops);
 	if (error == -EAGAIN)
 		goto out_unlock;
+	if (xfs_metadata_is_sick(error))
+		xfs_dquot_mark_sick(dqp);
 	if (error)
 		goto out_abort;
 
@@ -1246,6 +1283,7 @@ xfs_qm_dqflush(
 		xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS",
 				dqp->q_id, fa);
 		xfs_buf_relse(bp);
+		xfs_dquot_mark_sick(dqp);
 		error = -EFSCORRUPTED;
 		goto out_abort;
 	}
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index b2cbbba3e15a..7ad0e92c6b5b 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -240,15 +240,15 @@ xfs_errortag_init(
 {
 	int ret;
 
-	mp->m_errortag = kmem_zalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX,
-			KM_MAYFAIL);
+	mp->m_errortag = kzalloc(sizeof(unsigned int) * XFS_ERRTAG_MAX,
+				GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 	if (!mp->m_errortag)
 		return -ENOMEM;
 
 	ret = xfs_sysfs_init(&mp->m_errortag_kobj, &xfs_errortag_ktype,
 				&mp->m_kobj, "errortag");
 	if (ret)
-		kmem_free(mp->m_errortag);
+		kfree(mp->m_errortag);
 	return ret;
 }
 
@@ -257,7 +257,7 @@ xfs_errortag_del(
 	struct xfs_mount	*mp)
 {
 	xfs_sysfs_del(&mp->m_errortag_kobj);
-	kmem_free(mp->m_errortag);
+	kfree(mp->m_errortag);
 }
 
 static bool
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 2ccde32c9a9e..56cfa1498571 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -32,7 +32,8 @@ xfs_extent_busy_insert_list(
 	struct rb_node		**rbp;
 	struct rb_node		*parent = NULL;
 
-	new = kmem_zalloc(sizeof(struct xfs_extent_busy), 0);
+	new = kzalloc(sizeof(struct xfs_extent_busy),
+			GFP_KERNEL | __GFP_NOFAIL);
 	new->agno = pag->pag_agno;
 	new->bno = bno;
 	new->length = len;
@@ -530,7 +531,7 @@ xfs_extent_busy_clear_one(
 	}
 
 	list_del_init(&busyp->list);
-	kmem_free(busyp);
+	kfree(busyp);
 }
 
 static void
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 1d1185fca6a5..8c382f092332 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -40,9 +40,9 @@ STATIC void
 xfs_efi_item_free(
 	struct xfs_efi_log_item	*efip)
 {
-	kmem_free(efip->efi_item.li_lv_shadow);
+	kvfree(efip->efi_item.li_lv_shadow);
 	if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS)
-		kmem_free(efip);
+		kfree(efip);
 	else
 		kmem_cache_free(xfs_efi_cache, efip);
 }
@@ -229,9 +229,9 @@ static inline struct xfs_efd_log_item *EFD_ITEM(struct xfs_log_item *lip)
 STATIC void
 xfs_efd_item_free(struct xfs_efd_log_item *efdp)
 {
-	kmem_free(efdp->efd_item.li_lv_shadow);
+	kvfree(efdp->efd_item.li_lv_shadow);
 	if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS)
-		kmem_free(efdp);
+		kfree(efdp);
 	else
 		kmem_cache_free(xfs_efd_cache, efdp);
 }
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index e33e5e13b95f..632653e00906 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -879,7 +879,7 @@ xfs_break_dax_layouts(
 {
 	struct page		*page;
 
-	ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));
+	xfs_assert_ilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL);
 
 	page = dax_layout_busy_page(inode->i_mapping);
 	if (!page)
@@ -900,7 +900,7 @@ xfs_break_layouts(
 	bool			retry;
 	int			error;
 
-	ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
+	xfs_assert_ilocked(XFS_I(inode), XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL);
 
 	do {
 		retry = false;
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 2fc98d313708..e3aaa0555597 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -44,7 +44,7 @@ xfs_fstrm_free_func(
 	atomic_dec(&pag->pagf_fstrms);
 	xfs_perag_rele(pag);
 
-	kmem_free(item);
+	kfree(item);
 }
 
 /*
@@ -313,7 +313,7 @@ xfs_filestream_create_association(
 	 * we return a referenced AG, the allocation can still go ahead just
 	 * fine.
 	 */
-	item = kmem_alloc(sizeof(*item), KM_MAYFAIL);
+	item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 	if (!item)
 		goto out_put_fstrms;
 
@@ -326,7 +326,7 @@ xfs_filestream_create_association(
 
 out_free_item:
 	xfs_perag_rele(item->pag);
-	kmem_free(item);
+	kfree(item);
 out_put_fstrms:
 	atomic_dec(&args->pag->pagf_fstrms);
 	return 0;
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 5a72217f5feb..de59eec74765 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -763,8 +763,8 @@ xfs_getfsmap_datadev_bnobt_query(
 		return xfs_getfsmap_datadev_bnobt_helper(*curpp, &key[1], info);
 
 	/* Allocate cursor for this AG and query_range it. */
-	*curpp = xfs_allocbt_init_cursor(tp->t_mountp, tp, info->agf_bp,
-			info->pag, XFS_BTNUM_BNO);
+	*curpp = xfs_bnobt_init_cursor(tp->t_mountp, tp, info->agf_bp,
+			info->pag);
 	key->ar_startblock = info->low.rm_startblock;
 	key[1].ar_startblock = info->high.rm_startblock;
 	return xfs_alloc_query_range(*curpp, key, &key[1],
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index 9a57afee9338..b39f959146bc 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -14,6 +14,10 @@
 #include "xfs_trace.h"
 #include "xfs_health.h"
 #include "xfs_ag.h"
+#include "xfs_btree.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_quota_defs.h"
 
 /*
  * Warn about metadata corruption that we detected but haven't fixed, and
@@ -93,11 +97,25 @@ xfs_fs_mark_sick(
 	struct xfs_mount	*mp,
 	unsigned int		mask)
 {
-	ASSERT(!(mask & ~XFS_SICK_FS_PRIMARY));
+	ASSERT(!(mask & ~XFS_SICK_FS_ALL));
 	trace_xfs_fs_mark_sick(mp, mask);
 
 	spin_lock(&mp->m_sb_lock);
 	mp->m_fs_sick |= mask;
+	spin_unlock(&mp->m_sb_lock);
+}
+
+/* Mark per-fs metadata as having been checked and found unhealthy by fsck. */
+void
+xfs_fs_mark_corrupt(
+	struct xfs_mount	*mp,
+	unsigned int		mask)
+{
+	ASSERT(!(mask & ~XFS_SICK_FS_ALL));
+	trace_xfs_fs_mark_corrupt(mp, mask);
+
+	spin_lock(&mp->m_sb_lock);
+	mp->m_fs_sick |= mask;
 	mp->m_fs_checked |= mask;
 	spin_unlock(&mp->m_sb_lock);
 }
@@ -108,11 +126,13 @@ xfs_fs_mark_healthy(
 	struct xfs_mount	*mp,
 	unsigned int		mask)
 {
-	ASSERT(!(mask & ~XFS_SICK_FS_PRIMARY));
+	ASSERT(!(mask & ~XFS_SICK_FS_ALL));
 	trace_xfs_fs_mark_healthy(mp, mask);
 
 	spin_lock(&mp->m_sb_lock);
 	mp->m_fs_sick &= ~mask;
+	if (!(mp->m_fs_sick & XFS_SICK_FS_PRIMARY))
+		mp->m_fs_sick &= ~XFS_SICK_FS_SECONDARY;
 	mp->m_fs_checked |= mask;
 	spin_unlock(&mp->m_sb_lock);
 }
@@ -136,11 +156,25 @@ xfs_rt_mark_sick(
 	struct xfs_mount	*mp,
 	unsigned int		mask)
 {
-	ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY));
+	ASSERT(!(mask & ~XFS_SICK_RT_ALL));
 	trace_xfs_rt_mark_sick(mp, mask);
 
 	spin_lock(&mp->m_sb_lock);
 	mp->m_rt_sick |= mask;
+	spin_unlock(&mp->m_sb_lock);
+}
+
+/* Mark realtime metadata as having been checked and found unhealthy by fsck. */
+void
+xfs_rt_mark_corrupt(
+	struct xfs_mount	*mp,
+	unsigned int		mask)
+{
+	ASSERT(!(mask & ~XFS_SICK_RT_ALL));
+	trace_xfs_rt_mark_corrupt(mp, mask);
+
+	spin_lock(&mp->m_sb_lock);
+	mp->m_rt_sick |= mask;
 	mp->m_rt_checked |= mask;
 	spin_unlock(&mp->m_sb_lock);
 }
@@ -151,11 +185,13 @@ xfs_rt_mark_healthy(
 	struct xfs_mount	*mp,
 	unsigned int		mask)
 {
-	ASSERT(!(mask & ~XFS_SICK_RT_PRIMARY));
+	ASSERT(!(mask & ~XFS_SICK_RT_ALL));
 	trace_xfs_rt_mark_healthy(mp, mask);
 
 	spin_lock(&mp->m_sb_lock);
 	mp->m_rt_sick &= ~mask;
+	if (!(mp->m_rt_sick & XFS_SICK_RT_PRIMARY))
+		mp->m_rt_sick &= ~XFS_SICK_RT_SECONDARY;
 	mp->m_rt_checked |= mask;
 	spin_unlock(&mp->m_sb_lock);
 }
@@ -173,17 +209,48 @@ xfs_rt_measure_sickness(
 	spin_unlock(&mp->m_sb_lock);
 }
 
+/* Mark unhealthy per-ag metadata given a raw AG number. */
+void
+xfs_agno_mark_sick(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	unsigned int		mask)
+{
+	struct xfs_perag	*pag = xfs_perag_get(mp, agno);
+
+	/* per-ag structure not set up yet? */
+	if (!pag)
+		return;
+
+	xfs_ag_mark_sick(pag, mask);
+	xfs_perag_put(pag);
+}
+
 /* Mark unhealthy per-ag metadata. */
 void
 xfs_ag_mark_sick(
 	struct xfs_perag	*pag,
 	unsigned int		mask)
 {
-	ASSERT(!(mask & ~XFS_SICK_AG_PRIMARY));
+	ASSERT(!(mask & ~XFS_SICK_AG_ALL));
 	trace_xfs_ag_mark_sick(pag->pag_mount, pag->pag_agno, mask);
 
 	spin_lock(&pag->pag_state_lock);
 	pag->pag_sick |= mask;
+	spin_unlock(&pag->pag_state_lock);
+}
+
+/* Mark per-ag metadata as having been checked and found unhealthy by fsck. */
+void
+xfs_ag_mark_corrupt(
+	struct xfs_perag	*pag,
+	unsigned int		mask)
+{
+	ASSERT(!(mask & ~XFS_SICK_AG_ALL));
+	trace_xfs_ag_mark_corrupt(pag->pag_mount, pag->pag_agno, mask);
+
+	spin_lock(&pag->pag_state_lock);
+	pag->pag_sick |= mask;
 	pag->pag_checked |= mask;
 	spin_unlock(&pag->pag_state_lock);
 }
@@ -194,11 +261,13 @@ xfs_ag_mark_healthy(
 	struct xfs_perag	*pag,
 	unsigned int		mask)
 {
-	ASSERT(!(mask & ~XFS_SICK_AG_PRIMARY));
+	ASSERT(!(mask & ~XFS_SICK_AG_ALL));
 	trace_xfs_ag_mark_healthy(pag->pag_mount, pag->pag_agno, mask);
 
 	spin_lock(&pag->pag_state_lock);
 	pag->pag_sick &= ~mask;
+	if (!(pag->pag_sick & XFS_SICK_AG_PRIMARY))
+		pag->pag_sick &= ~XFS_SICK_AG_SECONDARY;
 	pag->pag_checked |= mask;
 	spin_unlock(&pag->pag_state_lock);
 }
@@ -222,11 +291,34 @@ xfs_inode_mark_sick(
 	struct xfs_inode	*ip,
 	unsigned int		mask)
 {
-	ASSERT(!(mask & ~(XFS_SICK_INO_PRIMARY | XFS_SICK_INO_ZAPPED)));
+	ASSERT(!(mask & ~XFS_SICK_INO_ALL));
 	trace_xfs_inode_mark_sick(ip, mask);
 
 	spin_lock(&ip->i_flags_lock);
 	ip->i_sick |= mask;
+	spin_unlock(&ip->i_flags_lock);
+
+	/*
+	 * Keep this inode around so we don't lose the sickness report.  Scrub
+	 * grabs inodes with DONTCACHE assuming that most inode are ok, which
+	 * is not the case here.
+	 */
+	spin_lock(&VFS_I(ip)->i_lock);
+	VFS_I(ip)->i_state &= ~I_DONTCACHE;
+	spin_unlock(&VFS_I(ip)->i_lock);
+}
+
+/* Mark inode metadata as having been checked and found unhealthy by fsck. */
+void
+xfs_inode_mark_corrupt(
+	struct xfs_inode	*ip,
+	unsigned int		mask)
+{
+	ASSERT(!(mask & ~XFS_SICK_INO_ALL));
+	trace_xfs_inode_mark_corrupt(ip, mask);
+
+	spin_lock(&ip->i_flags_lock);
+	ip->i_sick |= mask;
 	ip->i_checked |= mask;
 	spin_unlock(&ip->i_flags_lock);
 
@@ -246,11 +338,13 @@ xfs_inode_mark_healthy(
 	struct xfs_inode	*ip,
 	unsigned int		mask)
 {
-	ASSERT(!(mask & ~(XFS_SICK_INO_PRIMARY | XFS_SICK_INO_ZAPPED)));
+	ASSERT(!(mask & ~XFS_SICK_INO_ALL));
 	trace_xfs_inode_mark_healthy(ip, mask);
 
 	spin_lock(&ip->i_flags_lock);
 	ip->i_sick &= ~mask;
+	if (!(ip->i_sick & XFS_SICK_INO_PRIMARY))
+		ip->i_sick &= ~XFS_SICK_INO_SECONDARY;
 	ip->i_checked |= mask;
 	spin_unlock(&ip->i_flags_lock);
 }
@@ -280,6 +374,8 @@ static const struct ioctl_sick_map fs_map[] = {
 	{ XFS_SICK_FS_UQUOTA,	XFS_FSOP_GEOM_SICK_UQUOTA },
 	{ XFS_SICK_FS_GQUOTA,	XFS_FSOP_GEOM_SICK_GQUOTA },
 	{ XFS_SICK_FS_PQUOTA,	XFS_FSOP_GEOM_SICK_PQUOTA },
+	{ XFS_SICK_FS_QUOTACHECK, XFS_FSOP_GEOM_SICK_QUOTACHECK },
+	{ XFS_SICK_FS_NLINKS,	XFS_FSOP_GEOM_SICK_NLINKS },
 	{ 0, 0 },
 };
 
@@ -335,6 +431,7 @@ static const struct ioctl_sick_map ag_map[] = {
 	{ XFS_SICK_AG_FINOBT,	XFS_AG_GEOM_SICK_FINOBT },
 	{ XFS_SICK_AG_RMAPBT,	XFS_AG_GEOM_SICK_RMAPBT },
 	{ XFS_SICK_AG_REFCNTBT,	XFS_AG_GEOM_SICK_REFCNTBT },
+	{ XFS_SICK_AG_INODES,	XFS_AG_GEOM_SICK_INODES },
 	{ 0, 0 },
 };
 
@@ -397,3 +494,92 @@ xfs_bulkstat_health(
 			bs->bs_sick |= m->ioctl_mask;
 	}
 }
+
+/* Mark a block mapping sick. */
+void
+xfs_bmap_mark_sick(
+	struct xfs_inode	*ip,
+	int			whichfork)
+{
+	unsigned int		mask;
+
+	switch (whichfork) {
+	case XFS_DATA_FORK:
+		mask = XFS_SICK_INO_BMBTD;
+		break;
+	case XFS_ATTR_FORK:
+		mask = XFS_SICK_INO_BMBTA;
+		break;
+	case XFS_COW_FORK:
+		mask = XFS_SICK_INO_BMBTC;
+		break;
+	default:
+		ASSERT(0);
+		return;
+	}
+
+	xfs_inode_mark_sick(ip, mask);
+}
+
+/* Record observations of btree corruption with the health tracking system. */
+void
+xfs_btree_mark_sick(
+	struct xfs_btree_cur		*cur)
+{
+	switch (cur->bc_ops->type) {
+	case XFS_BTREE_TYPE_MEM:
+		/* no health state tracking for ephemeral btrees */
+		return;
+	case XFS_BTREE_TYPE_AG:
+		ASSERT(cur->bc_ops->sick_mask);
+		xfs_ag_mark_sick(cur->bc_ag.pag, cur->bc_ops->sick_mask);
+		return;
+	case XFS_BTREE_TYPE_INODE:
+		if (xfs_btree_is_bmap(cur->bc_ops)) {
+			xfs_bmap_mark_sick(cur->bc_ino.ip,
+					   cur->bc_ino.whichfork);
+			return;
+		}
+		fallthrough;
+	default:
+		ASSERT(0);
+		return;
+	}
+}
+
+/*
+ * Record observations of dir/attr btree corruption with the health tracking
+ * system.
+ */
+void
+xfs_dirattr_mark_sick(
+	struct xfs_inode	*ip,
+	int			whichfork)
+{
+	unsigned int		mask;
+
+	switch (whichfork) {
+	case XFS_DATA_FORK:
+		mask = XFS_SICK_INO_DIR;
+		break;
+	case XFS_ATTR_FORK:
+		mask = XFS_SICK_INO_XATTR;
+		break;
+	default:
+		ASSERT(0);
+		return;
+	}
+
+	xfs_inode_mark_sick(ip, mask);
+}
+
+/*
+ * Record observations of dir/attr btree corruption with the health tracking
+ * system.
+ */
+void
+xfs_da_mark_sick(
+	struct xfs_da_args	*args)
+{
+	xfs_dirattr_mark_sick(args->dp, args->whichfork);
+}
diff --git a/fs/xfs/xfs_hooks.c b/fs/xfs/xfs_hooks.c
new file mode 100644
index 000000000000..a58d1de2d37d
--- /dev/null
+++ b/fs/xfs/xfs_hooks.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_ag.h"
+#include "xfs_trace.h"
+
+/* Initialize a notifier chain. */
+void
+xfs_hooks_init(
+	struct xfs_hooks	*chain)
+{
+	BLOCKING_INIT_NOTIFIER_HEAD(&chain->head);
+}
+
+/* Make it so a function gets called whenever we hit a certain hook point. */
+int
+xfs_hooks_add(
+	struct xfs_hooks	*chain,
+	struct xfs_hook		*hook)
+{
+	ASSERT(hook->nb.notifier_call != NULL);
+	BUILD_BUG_ON(offsetof(struct xfs_hook, nb) != 0);
+
+	return blocking_notifier_chain_register(&chain->head, &hook->nb);
+}
+
+/* Remove a previously installed hook. */
+void
+xfs_hooks_del(
+	struct xfs_hooks	*chain,
+	struct xfs_hook		*hook)
+{
+	blocking_notifier_chain_unregister(&chain->head, &hook->nb);
+}
+
+/* Call a hook.  Returns the NOTIFY_* value returned by the last hook. */
+int
+xfs_hooks_call(
+	struct xfs_hooks	*chain,
+	unsigned long		val,
+	void			*priv)
+{
+	return blocking_notifier_call_chain(&chain->head, val, priv);
+}
diff --git a/fs/xfs/xfs_hooks.h b/fs/xfs/xfs_hooks.h
new file mode 100644
index 000000000000..60b8a5831536
--- /dev/null
+++ b/fs/xfs/xfs_hooks.h
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (c) 2022-2024 Oracle.  All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef XFS_HOOKS_H_
+#define XFS_HOOKS_H_
+
+#ifdef CONFIG_XFS_LIVE_HOOKS
+struct xfs_hooks {
+	struct blocking_notifier_head	head;
+};
+
+/*
+ * If jump labels are enabled in Kconfig, the static key uses nop sleds and
+ * code patching to eliminate the overhead of taking the rwsem in
+ * blocking_notifier_call_chain when there are no hooks configured.  If not,
+ * the static key per-call overhead is an atomic read.  Most arches that can
+ * handle XFS also support jump labels.
+ *
+ * Note: Patching the kernel code requires taking the cpu hotplug lock.  Other
+ * parts of the kernel allocate memory with that lock held, which means that
+ * XFS callers cannot hold any locks that might be used by memory reclaim or
+ * writeback when calling the static_branch_{inc,dec} functions.
+ */
+# define DEFINE_STATIC_XFS_HOOK_SWITCH(name) \
+	static DEFINE_STATIC_KEY_FALSE(name)
+# define xfs_hooks_switch_on(name)	static_branch_inc(name)
+# define xfs_hooks_switch_off(name)	static_branch_dec(name)
+# define xfs_hooks_switched_on(name)	static_branch_unlikely(name)
+
+struct xfs_hook {
+	/* This must come at the start of the structure. */
+	struct notifier_block		nb;
+};
+
+typedef	int (*xfs_hook_fn_t)(struct xfs_hook *hook, unsigned long action,
+		void *data);
+
+void xfs_hooks_init(struct xfs_hooks *chain);
+int xfs_hooks_add(struct xfs_hooks *chain, struct xfs_hook *hook);
+void xfs_hooks_del(struct xfs_hooks *chain, struct xfs_hook *hook);
+int xfs_hooks_call(struct xfs_hooks *chain, unsigned long action,
+		void *priv);
+
+static inline void xfs_hook_setup(struct xfs_hook *hook, notifier_fn_t fn)
+{
+	hook->nb.notifier_call = fn;
+	hook->nb.priority = 0;
+}
+
+#else
+
+struct xfs_hooks { /* empty */ };
+
+# define DEFINE_STATIC_XFS_HOOK_SWITCH(name)
+# define xfs_hooks_switch_on(name)		((void)0)
+# define xfs_hooks_switch_off(name)		((void)0)
+# define xfs_hooks_switched_on(name)		(false)
+
+# define xfs_hooks_init(chain)			((void)0)
+# define xfs_hooks_call(chain, val, priv)	(NOTIFY_DONE)
+#endif
+
+#endif /* XFS_HOOKS_H_ */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index dba514a2c84d..74f1812b03cb 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -24,6 +24,7 @@
 #include "xfs_ialloc.h"
 #include "xfs_ag.h"
 #include "xfs_log_priv.h"
+#include "xfs_health.h"
 
 #include <linux/iversion.h>
 
@@ -415,6 +416,9 @@ xfs_iget_check_free_state(
 			xfs_warn(ip->i_mount,
 "Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
 				ip->i_ino, VFS_I(ip)->i_mode);
+			xfs_agno_mark_sick(ip->i_mount,
+					XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+					XFS_SICK_AG_INOBT);
 			return -EFSCORRUPTED;
 		}
 
@@ -422,6 +426,9 @@ xfs_iget_check_free_state(
 			xfs_warn(ip->i_mount,
 "Corruption detected! Free inode 0x%llx has blocks allocated!",
 				ip->i_ino);
+			xfs_agno_mark_sick(ip->i_mount,
+					XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+					XFS_SICK_AG_INOBT);
 			return -EFSCORRUPTED;
 		}
 		return 0;
@@ -640,6 +647,8 @@ xfs_iget_cache_miss(
 				xfs_buf_offset(bp, ip->i_imap.im_boffset));
 		if (!error)
 			xfs_buf_set_ref(bp, XFS_INO_REF);
+		else
+			xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
 		xfs_trans_brelse(tp, bp);
 
 		if (error)
@@ -659,10 +668,9 @@ xfs_iget_cache_miss(
 	/*
 	 * Preload the radix tree so we can insert safely under the
 	 * write spinlock. Note that we cannot sleep inside the preload
-	 * region. Since we can be called from transaction context, don't
-	 * recurse into the file system.
+	 * region.
 	 */
-	if (radix_tree_preload(GFP_NOFS)) {
+	if (radix_tree_preload(GFP_KERNEL | __GFP_NOLOCKDEP)) {
 		error = -EAGAIN;
 		goto out_destroy;
 	}
@@ -2031,8 +2039,10 @@ xfs_inodegc_want_queue_work(
  *  - Memory shrinkers queued the inactivation worker and it hasn't finished.
  *  - The queue depth exceeds the maximum allowable percpu backlog.
  *
- * Note: If the current thread is running a transaction, we don't ever want to
- * wait for other transactions because that could introduce a deadlock.
+ * Note: If we are in a NOFS context here (e.g. current thread is running a
+ * transaction) the we don't want to block here as inodegc progress may require
+ * filesystem resources we hold to make progress and that could result in a
+ * deadlock. Hence we skip out of here if we are in a scoped NOFS context.
  */
 static inline bool
 xfs_inodegc_want_flush_work(
@@ -2040,7 +2050,7 @@ xfs_inodegc_want_flush_work(
 	unsigned int		items,
 	unsigned int		shrinker_hits)
 {
-	if (current->journal_info)
+	if (current->flags & PF_MEMALLOC_NOFS)
 		return false;
 
 	if (shrinker_hits > 0)
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index b05314d48176..4345db501714 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -63,7 +63,7 @@ STATIC void
 xfs_icreate_item_release(
 	struct xfs_log_item	*lip)
 {
-	kmem_free(ICR_ITEM(lip)->ic_item.li_lv_shadow);
+	kvfree(ICR_ITEM(lip)->ic_item.li_lv_shadow);
 	kmem_cache_free(xfs_icreate_cache, ICR_ITEM(lip));
 }
 
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 1fd94958aa97..d55b42b2480d 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -203,9 +203,9 @@ xfs_ilock(
 	}
 
 	if (lock_flags & XFS_ILOCK_EXCL)
-		mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+		down_write_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
 	else if (lock_flags & XFS_ILOCK_SHARED)
-		mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+		down_read_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
 }
 
 /*
@@ -246,10 +246,10 @@ xfs_ilock_nowait(
 	}
 
 	if (lock_flags & XFS_ILOCK_EXCL) {
-		if (!mrtryupdate(&ip->i_lock))
+		if (!down_write_trylock(&ip->i_lock))
 			goto out_undo_mmaplock;
 	} else if (lock_flags & XFS_ILOCK_SHARED) {
-		if (!mrtryaccess(&ip->i_lock))
+		if (!down_read_trylock(&ip->i_lock))
 			goto out_undo_mmaplock;
 	}
 	return 1;
@@ -298,9 +298,9 @@ xfs_iunlock(
 		up_read(&VFS_I(ip)->i_mapping->invalidate_lock);
 
 	if (lock_flags & XFS_ILOCK_EXCL)
-		mrunlock_excl(&ip->i_lock);
+		up_write(&ip->i_lock);
 	else if (lock_flags & XFS_ILOCK_SHARED)
-		mrunlock_shared(&ip->i_lock);
+		up_read(&ip->i_lock);
 
 	trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
 }
@@ -319,7 +319,7 @@ xfs_ilock_demote(
 		~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
 
 	if (lock_flags & XFS_ILOCK_EXCL)
-		mrdemote(&ip->i_lock);
+		downgrade_write(&ip->i_lock);
 	if (lock_flags & XFS_MMAPLOCK_EXCL)
 		downgrade_write(&VFS_I(ip)->i_mapping->invalidate_lock);
 	if (lock_flags & XFS_IOLOCK_EXCL)
@@ -328,52 +328,30 @@ xfs_ilock_demote(
 	trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
 }
 
-#if defined(DEBUG) || defined(XFS_WARN)
-static inline bool
-__xfs_rwsem_islocked(
-	struct rw_semaphore	*rwsem,
-	bool			shared)
-{
-	if (!debug_locks)
-		return rwsem_is_locked(rwsem);
-
-	if (!shared)
-		return lockdep_is_held_type(rwsem, 0);
-
-	/*
-	 * We are checking that the lock is held at least in shared
-	 * mode but don't care that it might be held exclusively
-	 * (i.e. shared | excl). Hence we check if the lock is held
-	 * in any mode rather than an explicit shared mode.
-	 */
-	return lockdep_is_held_type(rwsem, -1);
-}
-
-bool
-xfs_isilocked(
+void
+xfs_assert_ilocked(
 	struct xfs_inode	*ip,
 	uint			lock_flags)
 {
-	if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
-		if (!(lock_flags & XFS_ILOCK_SHARED))
-			return !!ip->i_lock.mr_writer;
-		return rwsem_is_locked(&ip->i_lock.mr_lock);
-	}
-
-	if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) {
-		return __xfs_rwsem_islocked(&VFS_I(ip)->i_mapping->invalidate_lock,
-				(lock_flags & XFS_MMAPLOCK_SHARED));
-	}
+	/*
+	 * Sometimes we assert the ILOCK is held exclusively, but we're in
+	 * a workqueue, so lockdep doesn't know we're the owner.
+	 */
+	if (lock_flags & XFS_ILOCK_SHARED)
+		rwsem_assert_held(&ip->i_lock);
+	else if (lock_flags & XFS_ILOCK_EXCL)
+		rwsem_assert_held_write_nolockdep(&ip->i_lock);
 
-	if (lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) {
-		return __xfs_rwsem_islocked(&VFS_I(ip)->i_rwsem,
-				(lock_flags & XFS_IOLOCK_SHARED));
-	}
+	if (lock_flags & XFS_MMAPLOCK_SHARED)
+		rwsem_assert_held(&VFS_I(ip)->i_mapping->invalidate_lock);
+	else if (lock_flags & XFS_MMAPLOCK_EXCL)
+		rwsem_assert_held_write(&VFS_I(ip)->i_mapping->invalidate_lock);
 
-	ASSERT(0);
-	return false;
+	if (lock_flags & XFS_IOLOCK_SHARED)
+		rwsem_assert_held(&VFS_I(ip)->i_rwsem);
+	else if (lock_flags & XFS_IOLOCK_EXCL)
+		rwsem_assert_held_write(&VFS_I(ip)->i_rwsem);
 }
-#endif
 
 /*
  * xfs_lockdep_subclass_ok() is only used in an ASSERT, so is only called when
@@ -671,7 +649,7 @@ xfs_lookup(
 
 out_free_name:
 	if (ci_name)
-		kmem_free(ci_name->name);
+		kfree(ci_name->name);
 out_unlock:
 	*ipp = NULL;
 	return error;
@@ -802,6 +780,8 @@ xfs_init_new_inode(
 	 */
 	if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) {
 		xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino);
+		xfs_agno_mark_sick(mp, XFS_INO_TO_AGNO(mp, ino),
+				XFS_SICK_AG_INOBT);
 		return -EFSCORRUPTED;
 	}
 
@@ -947,6 +927,81 @@ xfs_bumplink(
 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 }
 
+#ifdef CONFIG_XFS_LIVE_HOOKS
+/*
+ * Use a static key here to reduce the overhead of directory live update hooks.
+ * If the compiler supports jump labels, the static branch will be replaced by
+ * a nop sled when there are no hook users.  Online fsck is currently the only
+ * caller, so this is a reasonable tradeoff.
+ *
+ * Note: Patching the kernel code requires taking the cpu hotplug lock.  Other
+ * parts of the kernel allocate memory with that lock held, which means that
+ * XFS callers cannot hold any locks that might be used by memory reclaim or
+ * writeback when calling the static_branch_{inc,dec} functions.
+ */
+DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dir_hooks_switch);
+
+void
+xfs_dir_hook_disable(void)
+{
+	xfs_hooks_switch_off(&xfs_dir_hooks_switch);
+}
+
+void
+xfs_dir_hook_enable(void)
+{
+	xfs_hooks_switch_on(&xfs_dir_hooks_switch);
+}
+
+/* Call hooks for a directory update relating to a child dirent update. */
+inline void
+xfs_dir_update_hook(
+	struct xfs_inode		*dp,
+	struct xfs_inode		*ip,
+	int				delta,
+	const struct xfs_name		*name)
+{
+	if (xfs_hooks_switched_on(&xfs_dir_hooks_switch)) {
+		struct xfs_dir_update_params	p = {
+			.dp		= dp,
+			.ip		= ip,
+			.delta		= delta,
+			.name		= name,
+		};
+		struct xfs_mount	*mp = ip->i_mount;
+
+		xfs_hooks_call(&mp->m_dir_update_hooks, 0, &p);
+	}
+}
+
+/* Call the specified function during a directory update. */
+int
+xfs_dir_hook_add(
+	struct xfs_mount	*mp,
+	struct xfs_dir_hook	*hook)
+{
+	return xfs_hooks_add(&mp->m_dir_update_hooks, &hook->dirent_hook);
+}
+
+/* Stop calling the specified function during a directory update. */
+void
+xfs_dir_hook_del(
+	struct xfs_mount	*mp,
+	struct xfs_dir_hook	*hook)
+{
+	xfs_hooks_del(&mp->m_dir_update_hooks, &hook->dirent_hook);
+}
+
+/* Configure directory update hook functions. */
+void
+xfs_dir_hook_setup(
+	struct xfs_dir_hook	*hook,
+	notifier_fn_t		mod_fn)
+{
+	xfs_hook_setup(&hook->dirent_hook, mod_fn);
+}
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
 int
 xfs_create(
 	struct mnt_idmap	*idmap,
@@ -1058,6 +1113,12 @@ xfs_create(
 	}
 
 	/*
+	 * Create ip with a reference from dp, and add '.' and '..' references
+	 * if it's a directory.
+	 */
+	xfs_dir_update_hook(dp, ip, 1, name);
+
+	/*
 	 * If this is a synchronous mount, make sure that the
 	 * create transaction goes to disk before returning to
 	 * the user.
@@ -1240,8 +1301,19 @@ xfs_link(
 	 */
 	if (unlikely((tdp->i_diflags & XFS_DIFLAG_PROJINHERIT) &&
 		     tdp->i_projid != sip->i_projid)) {
-		error = -EXDEV;
-		goto error_return;
+		/*
+		 * Project quota setup skips special files which can
+		 * leave inodes in a PROJINHERIT directory without a
+		 * project ID set. We need to allow links to be made
+		 * to these "project-less" inodes because userspace
+		 * expects them to succeed after project ID setup,
+		 * but everything else should be rejected.
+		 */
+		if (!special_file(VFS_I(sip)->i_mode) ||
+		    sip->i_projid != 0) {
+			error = -EXDEV;
+			goto error_return;
+		}
 	}
 
 	if (!resblks) {
@@ -1271,6 +1343,7 @@ xfs_link(
 	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
 
 	xfs_bumplink(tp, sip);
+	xfs_dir_update_hook(tdp, sip, 1, target_name);
 
 	/*
 	 * If this is a synchronous mount, make sure that the
@@ -1342,9 +1415,9 @@ xfs_itruncate_extents_flags(
 	xfs_fileoff_t		first_unmap_block;
 	int			error = 0;
 
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-	ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
-	       xfs_isilocked(ip, XFS_IOLOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
+	if (atomic_read(&VFS_I(ip)->i_count))
+		xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL);
 	ASSERT(new_size <= XFS_ISIZE(ip));
 	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
 	ASSERT(ip->i_itemp != NULL);
@@ -1596,7 +1669,7 @@ xfs_inactive_ifree(
 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 
 	error = xfs_ifree(tp, ip);
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
 	if (error) {
 		/*
 		 * If we fail to free the inode, shut down.  The cancel
@@ -1677,6 +1750,39 @@ xfs_inode_needs_inactive(
 }
 
 /*
+ * Save health status somewhere, if we're dumping an inode with uncorrected
+ * errors and online repair isn't running.
+ */
+static inline void
+xfs_inactive_health(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_perag	*pag;
+	unsigned int		sick;
+	unsigned int		checked;
+
+	xfs_inode_measure_sickness(ip, &sick, &checked);
+	if (!sick)
+		return;
+
+	trace_xfs_inode_unfixed_corruption(ip, sick);
+
+	if (sick & XFS_SICK_INO_FORGET)
+		return;
+
+	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+	if (!pag) {
+		/* There had better still be a perag structure! */
+		ASSERT(0);
+		return;
+	}
+
+	xfs_ag_mark_sick(pag, XFS_SICK_AG_INODES);
+	xfs_perag_put(pag);
+}
+
+/*
  * xfs_inactive
  *
  * This is called when the vnode reference count for the vnode
@@ -1704,6 +1810,8 @@ xfs_inactive(
 	mp = ip->i_mount;
 	ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));
 
+	xfs_inactive_health(ip);
+
 	/*
 	 * If this is a read-only mount, don't do this (would generate I/O)
 	 * unless we're in log recovery and cleaning the iunlinked list.
@@ -1910,6 +2018,7 @@ xfs_iunlink_update_bucket(
 	 */
 	if (old_value == new_agino) {
 		xfs_buf_mark_corrupt(agibp);
+		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
 		return -EFSCORRUPTED;
 	}
 
@@ -1959,11 +2068,14 @@ xfs_iunlink_reload_next(
 	 */
 	ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino);
 	error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip);
-	if (error)
+	if (error) {
+		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
 		return error;
+	}
 
 	/* If this is not an unlinked inode, something is very wrong. */
 	if (VFS_I(next_ip)->i_nlink != 0) {
+		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
 		error = -EFSCORRUPTED;
 		goto rele;
 	}
@@ -2001,6 +2113,7 @@ xfs_iunlink_insert_inode(
 	if (next_agino == agino ||
 	    !xfs_verify_agino_or_null(pag, next_agino)) {
 		xfs_buf_mark_corrupt(agibp);
+		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
 		return -EFSCORRUPTED;
 	}
 
@@ -2088,6 +2201,7 @@ xfs_iunlink_remove_inode(
 	if (!xfs_verify_agino(pag, head_agino)) {
 		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
 				agi, sizeof(*agi));
+		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGI);
 		return -EFSCORRUPTED;
 	}
 
@@ -2116,8 +2230,10 @@ xfs_iunlink_remove_inode(
 		struct xfs_inode	*prev_ip;
 
 		prev_ip = xfs_iunlink_lookup(pag, ip->i_prev_unlinked);
-		if (!prev_ip)
+		if (!prev_ip) {
+			xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
 			return -EFSCORRUPTED;
+		}
 
 		error = xfs_iunlink_log_inode(tp, prev_ip, pag,
 				ip->i_next_unlinked);
@@ -2350,7 +2466,7 @@ xfs_ifree(
 	struct xfs_inode_log_item *iip = ip->i_itemp;
 	int			error;
 
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
 	ASSERT(VFS_I(ip)->i_nlink == 0);
 	ASSERT(ip->i_df.if_nextents == 0);
 	ASSERT(ip->i_disk_size == 0 || !S_ISREG(VFS_I(ip)->i_mode));
@@ -2378,7 +2494,7 @@ xfs_ifree(
 	 * already been freed by xfs_attr_inactive.
 	 */
 	if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
-		kmem_free(ip->i_df.if_data);
+		kfree(ip->i_df.if_data);
 		ip->i_df.if_data = NULL;
 		ip->i_df.if_bytes = 0;
 	}
@@ -2419,7 +2535,7 @@ static void
 xfs_iunpin(
 	struct xfs_inode	*ip)
 {
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED);
 
 	trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
 
@@ -2585,6 +2701,12 @@ xfs_remove(
 	}
 
 	/*
+	 * Drop the link from dp to ip, and if ip was a directory, remove the
+	 * '.' and '..' references since we freed the directory.
+	 */
+	xfs_dir_update_hook(dp, ip, -1, name);
+
+	/*
 	 * If this is a synchronous mount, make sure that the
 	 * remove transaction goes to disk before returning to
 	 * the user.
@@ -2774,6 +2896,20 @@ xfs_cross_rename(
 	}
 	xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
+
+	/*
+	 * Inform our hook clients that we've finished an exchange operation as
+	 * follows: removed the source and target files from their directories;
+	 * added the target to the source directory; and added the source to
+	 * the target directory.  All inodes are locked, so it's ok to model a
+	 * rename this way so long as we say we deleted entries before we add
+	 * new ones.
+	 */
+	xfs_dir_update_hook(dp1, ip1, -1, name1);
+	xfs_dir_update_hook(dp2, ip2, -1, name2);
+	xfs_dir_update_hook(dp1, ip2, 1, name1);
+	xfs_dir_update_hook(dp2, ip1, 1, name2);
+
 	return xfs_finish_rename(tp);
 
 out_trans_abort:
@@ -3157,6 +3293,21 @@ retry:
 	if (new_parent)
 		xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
 
+	/*
+	 * Inform our hook clients that we've finished a rename operation as
+	 * follows: removed the source and target files from their directories;
+	 * that we've added the source to the target directory; and finally
+	 * that we've added the whiteout, if there was one.  All inodes are
+	 * locked, so it's ok to model a rename this way so long as we say we
+	 * deleted entries before we add new ones.
+	 */
+	if (target_ip)
+		xfs_dir_update_hook(target_dp, target_ip, -1, target_name);
+	xfs_dir_update_hook(src_dp, src_ip, -1, src_name);
+	xfs_dir_update_hook(target_dp, src_ip, 1, target_name);
+	if (wip)
+		xfs_dir_update_hook(src_dp, wip, 1, src_name);
+
 	error = xfs_finish_rename(tp);
 	if (wip)
 		xfs_irele(wip);
@@ -3182,7 +3333,7 @@ xfs_iflush(
 	struct xfs_mount	*mp = ip->i_mount;
 	int			error;
 
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
+	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED);
 	ASSERT(xfs_iflags_test(ip, XFS_IFLUSHING));
 	ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
 	       ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
@@ -3317,6 +3468,8 @@ flush_out:
 
 	/* generate the checksum. */
 	xfs_dinode_calc_crc(mp, dip);
+	if (error)
+		xfs_inode_mark_sick(ip, XFS_SICK_INO_CORE);
 	return error;
 }
 
@@ -3777,3 +3930,19 @@ xfs_ifork_zapped(
 		return false;
 	}
 }
+
+/* Compute the number of data and realtime blocks used by a file. */
+void
+xfs_inode_count_blocks(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	xfs_filblks_t		*dblocks,
+	xfs_filblks_t		*rblocks)
+{
+	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+
+	*rblocks = 0;
+	if (XFS_IS_REALTIME_INODE(ip))
+		xfs_bmap_count_leaves(ifp, rblocks);
+	*dblocks = ip->i_nblocks - *rblocks;
+}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 97f63bacd4c2..ab46ffb3ac19 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -39,7 +39,7 @@ typedef struct xfs_inode {
 
 	/* Transaction and locking information. */
 	struct xfs_inode_log_item *i_itemp;	/* logging information */
-	mrlock_t		i_lock;		/* inode lock */
+	struct rw_semaphore	i_lock;		/* inode lock */
 	atomic_t		i_pincount;	/* inode pin count */
 	struct llist_node	i_gclist;	/* deferred inactivation list */
 
@@ -171,6 +171,12 @@ static inline struct inode *VFS_I(struct xfs_inode *ip)
 	return &ip->i_vnode;
 }
 
+/* convert from const xfs inode to const vfs inode */
+static inline const struct inode *VFS_IC(const struct xfs_inode *ip)
+{
+	return &ip->i_vnode;
+}
+
 /*
  * For regular files we only update the on-disk filesize when actually
  * writing data back to disk.  Until then only the copy in the VFS inode
@@ -523,7 +529,7 @@ void		xfs_ilock(xfs_inode_t *, uint);
 int		xfs_ilock_nowait(xfs_inode_t *, uint);
 void		xfs_iunlock(xfs_inode_t *, uint);
 void		xfs_ilock_demote(xfs_inode_t *, uint);
-bool		xfs_isilocked(struct xfs_inode *, uint);
+void		xfs_assert_ilocked(struct xfs_inode *, uint);
 uint		xfs_ilock_data_map_shared(struct xfs_inode *);
 uint		xfs_ilock_attr_map_shared(struct xfs_inode *);
 
@@ -623,5 +629,32 @@ int xfs_inode_reload_unlinked_bucket(struct xfs_trans *tp, struct xfs_inode *ip)
 int xfs_inode_reload_unlinked(struct xfs_inode *ip);
 
 bool xfs_ifork_zapped(const struct xfs_inode *ip, int whichfork);
+void xfs_inode_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
+		xfs_filblks_t *dblocks, xfs_filblks_t *rblocks);
+
+struct xfs_dir_update_params {
+	const struct xfs_inode	*dp;
+	const struct xfs_inode	*ip;
+	const struct xfs_name	*name;
+	int			delta;
+};
+
+#ifdef CONFIG_XFS_LIVE_HOOKS
+void xfs_dir_update_hook(struct xfs_inode *dp, struct xfs_inode *ip,
+		int delta, const struct xfs_name *name);
+
+struct xfs_dir_hook {
+	struct xfs_hook		dirent_hook;
+};
+
+void xfs_dir_hook_disable(void);
+void xfs_dir_hook_enable(void);
+
+int xfs_dir_hook_add(struct xfs_mount *mp, struct xfs_dir_hook *hook);
+void xfs_dir_hook_del(struct xfs_mount *mp, struct xfs_dir_hook *hook);
+void xfs_dir_hook_setup(struct xfs_dir_hook *hook, notifier_fn_t mod_fn);
+#else
+# define xfs_dir_update_hook(dp, ip, delta, name)	((void)0)
+#endif /* CONFIG_XFS_LIVE_HOOKS */
 
 #endif	/* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 0aee97ba0be8..f28d653300d1 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -650,7 +650,7 @@ xfs_inode_item_pin(
 {
 	struct xfs_inode	*ip = INODE_ITEM(lip)->ili_inode;
 
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
 	ASSERT(lip->li_buf);
 
 	trace_xfs_inode_pin(ip, _RET_IP_);
@@ -756,7 +756,7 @@ xfs_inode_item_release(
 	unsigned short		lock_flags;
 
 	ASSERT(ip->i_itemp != NULL);
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
 
 	lock_flags = iip->ili_lock_flags;
 	iip->ili_lock_flags = 0;
@@ -856,7 +856,7 @@ xfs_inode_item_destroy(
 	ASSERT(iip->ili_item.li_buf == NULL);
 
 	ip->i_itemp = NULL;
-	kmem_free(iip->ili_item.li_lv_shadow);
+	kvfree(iip->ili_item.li_lv_shadow);
 	kmem_cache_free(xfs_ili_cache, iip);
 }
 
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index 144198a6b270..dbdab4ce7c44 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -291,7 +291,8 @@ xlog_recover_inode_commit_pass2(
 	if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
 		in_f = item->ri_buf[0].i_addr;
 	} else {
-		in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0);
+		in_f = kmalloc(sizeof(struct xfs_inode_log_format),
+				GFP_KERNEL | __GFP_NOFAIL);
 		need_free = 1;
 		error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
 		if (error)
@@ -553,7 +554,7 @@ out_release:
 	xfs_buf_relse(bp);
 error:
 	if (need_free)
-		kmem_free(in_f);
+		kfree(in_f);
 	return error;
 }
 
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index f02b6e558af5..d0e2cec6210d 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -435,7 +435,7 @@ xfs_ioc_attr_list(
 	    copy_to_user(ucursor, &context.cursor, sizeof(context.cursor)))
 		error = -EFAULT;
 out_free:
-	kmem_free(buffer);
+	kvfree(buffer);
 	return error;
 }
 
@@ -493,7 +493,7 @@ xfs_attrmulti_attr_get(
 		error = -EFAULT;
 
 out_kfree:
-	kmem_free(args.value);
+	kvfree(args.value);
 	return error;
 }
 
@@ -1506,7 +1506,7 @@ xfs_ioc_getbmap(
 
 	error = 0;
 out_free_buf:
-	kmem_free(buf);
+	kvfree(buf);
 	return error;
 }
 
@@ -1636,7 +1636,7 @@ xfs_ioc_getfsmap(
 	}
 
 out_free:
-	kmem_free(recs);
+	kvfree(recs);
 	return error;
 }
 
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 18c8f168b153..4087af7f3c9f 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -27,6 +27,7 @@
 #include "xfs_dquot_item.h"
 #include "xfs_dquot.h"
 #include "xfs_reflink.h"
+#include "xfs_health.h"
 
 #define XFS_ALLOC_ALIGN(mp, off) \
 	(((off) >> mp->m_allocsize_log) << mp->m_allocsize_log)
@@ -45,6 +46,7 @@ xfs_alert_fsblock_zero(
 		(unsigned long long)imap->br_startoff,
 		(unsigned long long)imap->br_blockcount,
 		imap->br_state);
+	xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
 	return -EFSCORRUPTED;
 }
 
@@ -99,8 +101,10 @@ xfs_bmbt_to_iomap(
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
 
-	if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
+	if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) {
+		xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
 		return xfs_alert_fsblock_zero(ip, imap);
+	}
 
 	if (imap->br_startblock == HOLESTARTBLOCK) {
 		iomap->addr = IOMAP_NULL_ADDR;
@@ -325,8 +329,10 @@ xfs_iomap_write_direct(
 		goto out_unlock;
 	}
 
-	if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
+	if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) {
+		xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
 		error = xfs_alert_fsblock_zero(ip, imap);
+	}
 
 out_unlock:
 	*seq = xfs_iomap_inode_sequence(ip, 0);
@@ -639,8 +645,10 @@ xfs_iomap_write_unwritten(
 		if (error)
 			return error;
 
-		if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock)))
+		if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock))) {
+			xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
 			return xfs_alert_fsblock_zero(ip, &imap);
+		}
 
 		if ((numblks_fsb = imap.br_blockcount) == 0) {
 			/*
@@ -986,6 +994,7 @@ xfs_buffered_write_iomap_begin(
 
 	if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
 	    XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
+		xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
 		error = -EFSCORRUPTED;
 		goto out_unlock;
 	}
@@ -1323,7 +1332,7 @@ xfs_seek_iomap_begin(
 	if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
 		if (data_fsb < cow_fsb + cmap.br_blockcount)
 			end_fsb = min(end_fsb, data_fsb);
-		xfs_trim_extent(&cmap, offset_fsb, end_fsb);
+		xfs_trim_extent(&cmap, offset_fsb, end_fsb - offset_fsb);
 		seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
 		error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
 				IOMAP_F_SHARED, seq);
@@ -1348,7 +1357,7 @@ xfs_seek_iomap_begin(
 	imap.br_state = XFS_EXT_NORM;
 done:
 	seq = xfs_iomap_inode_sequence(ip, 0);
-	xfs_trim_extent(&imap, offset_fsb, end_fsb);
+	xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
 	error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
 out_unlock:
 	xfs_iunlock(ip, lockmode);
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index a0d77f5f512e..66f8c47642e8 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -346,7 +346,7 @@ xfs_vn_ci_lookup(
 	dname.name = ci_name.name;
 	dname.len = ci_name.len;
 	dentry = d_add_ci(dentry, VFS_I(ip), &dname);
-	kmem_free(ci_name.name);
+	kfree(ci_name.name);
 	return dentry;
 }
 
@@ -796,8 +796,7 @@ xfs_setattr_size(
 	uint			lock_flags = 0;
 	bool			did_zeroing = false;
 
-	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
-	ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);
 	ASSERT(S_ISREG(inode->i_mode));
 	ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
 		ATTR_MTIME_SET|ATTR_TIMES_SET)) == 0);
@@ -1285,9 +1284,9 @@ xfs_setup_inode(
 		 */
 		lockdep_set_class(&inode->i_rwsem,
 				  &inode->i_sb->s_type->i_mutex_dir_key);
-		lockdep_set_class(&ip->i_lock.mr_lock, &xfs_dir_ilock_class);
+		lockdep_set_class(&ip->i_lock, &xfs_dir_ilock_class);
 	} else {
-		lockdep_set_class(&ip->i_lock.mr_lock, &xfs_nondir_ilock_class);
+		lockdep_set_class(&ip->i_lock, &xfs_nondir_ilock_class);
 	}
 
 	/*
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 14462614fcc8..95fc31b9f87d 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -197,8 +197,8 @@ xfs_bulkstat_one(
 
 	ASSERT(breq->icount == 1);
 
-	bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
-			KM_MAYFAIL);
+	bc.buf = kzalloc(sizeof(struct xfs_bulkstat),
+			GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 	if (!bc.buf)
 		return -ENOMEM;
 
@@ -214,7 +214,7 @@ xfs_bulkstat_one(
 			breq->startino, &bc);
 	xfs_trans_cancel(tp);
 out:
-	kmem_free(bc.buf);
+	kfree(bc.buf);
 
 	/*
 	 * If we reported one inode to userspace then we abort because we hit
@@ -289,8 +289,8 @@ xfs_bulkstat(
 	if (xfs_bulkstat_already_done(breq->mp, breq->startino))
 		return 0;
 
-	bc.buf = kmem_zalloc(sizeof(struct xfs_bulkstat),
-			KM_MAYFAIL);
+	bc.buf = kzalloc(sizeof(struct xfs_bulkstat),
+			GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 	if (!bc.buf)
 		return -ENOMEM;
 
@@ -309,7 +309,7 @@ xfs_bulkstat(
 			xfs_bulkstat_iwalk, breq->icount, &bc);
 	xfs_trans_cancel(tp);
 out:
-	kmem_free(bc.buf);
+	kfree(bc.buf);
 
 	/*
 	 * We found some inodes, so clear the error status and return them.
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index b3275e8d47b6..01b55f03a102 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -22,6 +22,7 @@
 #include "xfs_trans.h"
 #include "xfs_pwork.h"
 #include "xfs_ag.h"
+#include "xfs_bit.h"
 
 /*
  * Walking Inodes in the Filesystem
@@ -99,6 +100,7 @@ xfs_iwalk_ichunk_ra(
 	struct xfs_inobt_rec_incore	*irec)
 {
 	struct xfs_ino_geometry		*igeo = M_IGEO(mp);
+	xfs_agnumber_t			agno = pag->pag_agno;
 	xfs_agblock_t			agbno;
 	struct blk_plug			plug;
 	int				i;	/* inode chunk index */
@@ -111,8 +113,9 @@ xfs_iwalk_ichunk_ra(
 
 		imask = xfs_inobt_maskn(i, igeo->inodes_per_cluster);
 		if (imask & ~irec->ir_free) {
-			xfs_btree_reada_bufs(mp, pag->pag_agno, agbno,
-					igeo->blocks_per_cluster,
+			xfs_buf_readahead(mp->m_ddev_targp,
+					XFS_AGB_TO_DADDR(mp, agno, agbno),
+					igeo->blocks_per_cluster * mp->m_bsize,
 					&xfs_inode_buf_ops);
 		}
 		agbno += igeo->blocks_per_cluster;
@@ -131,21 +134,11 @@ xfs_iwalk_adjust_start(
 	struct xfs_inobt_rec_incore	*irec)	/* btree record */
 {
 	int				idx;	/* index into inode chunk */
-	int				i;
 
 	idx = agino - irec->ir_startino;
 
-	/*
-	 * We got a right chunk with some left inodes allocated at it.  Grab
-	 * the chunk record.  Mark all the uninteresting inodes free because
-	 * they're before our start point.
-	 */
-	for (i = 0; i < idx; i++) {
-		if (XFS_INOBT_MASK(i) & ~irec->ir_free)
-			irec->ir_freecount++;
-	}
-
 	irec->ir_free |= xfs_inobt_maskn(0, idx);
+	irec->ir_freecount = hweight64(irec->ir_free);
 }
 
 /* Allocate memory for a walk. */
@@ -160,7 +153,7 @@ xfs_iwalk_alloc(
 
 	/* Allocate a prefetch buffer for inobt records. */
 	size = iwag->sz_recs * sizeof(struct xfs_inobt_rec_incore);
-	iwag->recs = kmem_alloc(size, KM_MAYFAIL);
+	iwag->recs = kmalloc(size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 	if (iwag->recs == NULL)
 		return -ENOMEM;
 
@@ -172,7 +165,7 @@ STATIC void
 xfs_iwalk_free(
 	struct xfs_iwalk_ag	*iwag)
 {
-	kmem_free(iwag->recs);
+	kfree(iwag->recs);
 	iwag->recs = NULL;
 }
 
@@ -275,9 +268,10 @@ xfs_iwalk_ag_start(
 
 	/* Set up a fresh cursor and empty the inobt cache. */
 	iwag->nr_recs = 0;
-	error = xfs_inobt_cur(pag, tp, XFS_BTNUM_INO, curpp, agi_bpp);
+	error = xfs_ialloc_read_agi(pag, tp, agi_bpp);
 	if (error)
 		return error;
+	*curpp = xfs_inobt_init_cursor(pag, tp, *agi_bpp);
 
 	/* Starting at the beginning of the AG?  That's easy! */
 	if (agino == 0)
@@ -306,8 +300,10 @@ xfs_iwalk_ag_start(
 	error = xfs_inobt_get_rec(*curpp, irec, has_more);
 	if (error)
 		return error;
-	if (XFS_IS_CORRUPT(mp, *has_more != 1))
+	if (XFS_IS_CORRUPT(mp, *has_more != 1)) {
+		xfs_btree_mark_sick(*curpp);
 		return -EFSCORRUPTED;
+	}
 
 	iwag->lastino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
 				irec->ir_startino + XFS_INODES_PER_CHUNK - 1);
@@ -390,11 +386,10 @@ xfs_iwalk_run_callbacks(
 	}
 
 	/* ...and recreate the cursor just past where we left off. */
-	error = xfs_inobt_cur(iwag->pag, iwag->tp, XFS_BTNUM_INO, curpp,
-			agi_bpp);
+	error = xfs_ialloc_read_agi(iwag->pag, iwag->tp, agi_bpp);
 	if (error)
 		return error;
-
+	*curpp = xfs_inobt_init_cursor(iwag->pag, iwag->tp, *agi_bpp);
 	return xfs_inobt_lookup(*curpp, next_agino, XFS_LOOKUP_GE, has_more);
 }
 
@@ -434,6 +429,7 @@ xfs_iwalk_ag(
 		rec_fsino = XFS_AGINO_TO_INO(mp, pag->pag_agno, irec->ir_startino);
 		if (iwag->lastino != NULLFSINO &&
 		    XFS_IS_CORRUPT(mp, iwag->lastino >= rec_fsino)) {
+			xfs_btree_mark_sick(cur);
 			error = -EFSCORRUPTED;
 			goto out;
 		}
@@ -627,7 +623,7 @@ xfs_iwalk_ag_work(
 	xfs_iwalk_free(iwag);
 out:
 	xfs_perag_put(iwag->pag);
-	kmem_free(iwag);
+	kfree(iwag);
 	return error;
 }
 
@@ -663,7 +659,8 @@ xfs_iwalk_threaded(
 		if (xfs_pwork_ctl_want_abort(&pctl))
 			break;
 
-		iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0);
+		iwag = kzalloc(sizeof(struct xfs_iwalk_ag),
+				GFP_KERNEL | __GFP_NOFAIL);
 		iwag->mp = mp;
 
 		/*
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index d7873e0360f0..8f07c9f6157f 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -21,15 +21,13 @@ typedef __u32			xfs_nlink_t;
 
 #include "xfs_types.h"
 
-#include "kmem.h"
-#include "mrlock.h"
-
 #include <linux/semaphore.h>
 #include <linux/mm.h>
 #include <linux/sched/mm.h>
 #include <linux/kernel.h>
 #include <linux/blkdev.h>
 #include <linux/slab.h>
+#include <linux/vmalloc.h>
 #include <linux/crc32c.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
@@ -51,6 +49,7 @@ typedef __u32			xfs_nlink_t;
 #include <linux/notifier.h>
 #include <linux/delay.h>
 #include <linux/log2.h>
+#include <linux/rwsem.h>
 #include <linux/spinlock.h>
 #include <linux/random.h>
 #include <linux/ctype.h>
@@ -82,6 +81,7 @@ typedef __u32			xfs_nlink_t;
 #include "xfs_buf.h"
 #include "xfs_message.h"
 #include "xfs_drain.h"
+#include "xfs_hooks.h"
 
 #ifdef __BIG_ENDIAN
 #define XFS_NATIVE_HOST 1
@@ -269,4 +269,15 @@ int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count,
 # define PTR_FMT "%p"
 #endif
 
+/*
+ * Helper for IO routines to grab backing pages from allocated kernel memory.
+ */
+static inline struct page *
+kmem_to_page(void *addr)
+{
+	if (is_vmalloc_addr(addr))
+		return vmalloc_to_page(addr);
+	return virt_to_page(addr);
+}
+
 #endif /* __XFS_LINUX__ */
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index a1650fc81382..5004f23d344e 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -633,14 +633,14 @@ xlog_state_release_iclog(
  */
 int
 xfs_log_mount(
-	xfs_mount_t	*mp,
-	xfs_buftarg_t	*log_target,
-	xfs_daddr_t	blk_offset,
-	int		num_bblks)
+	xfs_mount_t		*mp,
+	struct xfs_buftarg	*log_target,
+	xfs_daddr_t		blk_offset,
+	int			num_bblks)
 {
-	struct xlog	*log;
-	int		error = 0;
-	int		min_logfsbs;
+	struct xlog		*log;
+	int			error = 0;
+	int			min_logfsbs;
 
 	if (!xfs_has_norecovery(mp)) {
 		xfs_notice(mp, "Mounting V%d Filesystem %pU",
@@ -1528,7 +1528,7 @@ xlog_alloc_log(
 	int			error = -ENOMEM;
 	uint			log2_size = 0;
 
-	log = kmem_zalloc(sizeof(struct xlog), KM_MAYFAIL);
+	log = kzalloc(sizeof(struct xlog), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 	if (!log) {
 		xfs_warn(mp, "Log allocation failed: No memory!");
 		goto out;
@@ -1605,7 +1605,8 @@ xlog_alloc_log(
 		size_t bvec_size = howmany(log->l_iclog_size, PAGE_SIZE) *
 				sizeof(struct bio_vec);
 
-		iclog = kmem_zalloc(sizeof(*iclog) + bvec_size, KM_MAYFAIL);
+		iclog = kzalloc(sizeof(*iclog) + bvec_size,
+				GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 		if (!iclog)
 			goto out_free_iclog;
 
@@ -1661,13 +1662,13 @@ out_destroy_workqueue:
 out_free_iclog:
 	for (iclog = log->l_iclog; iclog; iclog = prev_iclog) {
 		prev_iclog = iclog->ic_next;
-		kmem_free(iclog->ic_data);
-		kmem_free(iclog);
+		kvfree(iclog->ic_data);
+		kfree(iclog);
 		if (prev_iclog == log->l_iclog)
 			break;
 	}
 out_free_log:
-	kmem_free(log);
+	kfree(log);
 out:
 	return ERR_PTR(error);
 }	/* xlog_alloc_log */
@@ -2118,14 +2119,14 @@ xlog_dealloc_log(
 	iclog = log->l_iclog;
 	for (i = 0; i < log->l_iclog_bufs; i++) {
 		next_iclog = iclog->ic_next;
-		kmem_free(iclog->ic_data);
-		kmem_free(iclog);
+		kvfree(iclog->ic_data);
+		kfree(iclog);
 		iclog = next_iclog;
 	}
 
 	log->l_mp->m_log = NULL;
 	destroy_workqueue(log->l_ioend_workqueue);
-	kmem_free(log);
+	kfree(log);
 }
 
 /*
@@ -3517,7 +3518,8 @@ xlog_ticket_alloc(
 	struct xlog_ticket	*tic;
 	int			unit_res;
 
-	tic = kmem_cache_zalloc(xfs_log_ticket_cache, GFP_NOFS | __GFP_NOFAIL);
+	tic = kmem_cache_zalloc(xfs_log_ticket_cache,
+			GFP_KERNEL | __GFP_NOFAIL);
 
 	unit_res = xlog_calc_unit_res(log, unit_bytes, &tic->t_iclog_hdrs);
 
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 67a99d94701e..73f5b7f628f4 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -100,7 +100,7 @@ xlog_cil_ctx_alloc(void)
 {
 	struct xfs_cil_ctx	*ctx;
 
-	ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS);
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL | __GFP_NOFAIL);
 	INIT_LIST_HEAD(&ctx->committing);
 	INIT_LIST_HEAD(&ctx->busy_extents.extent_list);
 	INIT_LIST_HEAD(&ctx->log_items);
@@ -339,7 +339,7 @@ xlog_cil_alloc_shadow_bufs(
 			 * the buffer, only the log vector header and the iovec
 			 * storage.
 			 */
-			kmem_free(lip->li_lv_shadow);
+			kvfree(lip->li_lv_shadow);
 			lv = xlog_kvmalloc(buf_size);
 
 			memset(lv, 0, xlog_cil_iovec_space(niovecs));
@@ -703,7 +703,7 @@ xlog_cil_free_logvec(
 	while (!list_empty(lv_chain)) {
 		lv = list_first_entry(lv_chain, struct xfs_log_vec, lv_list);
 		list_del_init(&lv->lv_list);
-		kmem_free(lv);
+		kvfree(lv);
 	}
 }
 
@@ -753,7 +753,7 @@ xlog_cil_committed(
 		return;
 	}
 
-	kmem_free(ctx);
+	kfree(ctx);
 }
 
 void
@@ -1116,11 +1116,18 @@ xlog_cil_cleanup_whiteouts(
  * same sequence twice.  If we get a race between multiple pushes for the same
  * sequence they will block on the first one and then abort, hence avoiding
  * needless pushes.
+ *
+ * This runs from a workqueue so it does not inherent any specific memory
+ * allocation context. However, we do not want to block on memory reclaim
+ * recursing back into the filesystem because this push may have been triggered
+ * by memory reclaim itself. Hence we really need to run under full GFP_NOFS
+ * contraints here.
  */
 static void
 xlog_cil_push_work(
 	struct work_struct	*work)
 {
+	unsigned int		nofs_flags = memalloc_nofs_save();
 	struct xfs_cil_ctx	*ctx =
 		container_of(work, struct xfs_cil_ctx, push_work);
 	struct xfs_cil		*cil = ctx->cil;
@@ -1334,12 +1341,14 @@ xlog_cil_push_work(
 	spin_unlock(&log->l_icloglock);
 	xlog_cil_cleanup_whiteouts(&whiteouts);
 	xfs_log_ticket_ungrant(log, ticket);
+	memalloc_nofs_restore(nofs_flags);
 	return;
 
 out_skip:
 	up_write(&cil->xc_ctx_lock);
 	xfs_log_ticket_put(new_ctx->ticket);
-	kmem_free(new_ctx);
+	kfree(new_ctx);
+	memalloc_nofs_restore(nofs_flags);
 	return;
 
 out_abort_free_ticket:
@@ -1348,6 +1357,7 @@ out_abort_free_ticket:
 	if (!ctx->commit_iclog) {
 		xfs_log_ticket_ungrant(log, ctx->ticket);
 		xlog_cil_committed(ctx);
+		memalloc_nofs_restore(nofs_flags);
 		return;
 	}
 	spin_lock(&log->l_icloglock);
@@ -1356,6 +1366,7 @@ out_abort_free_ticket:
 	/* Not safe to reference ctx now! */
 	spin_unlock(&log->l_icloglock);
 	xfs_log_ticket_ungrant(log, ticket);
+	memalloc_nofs_restore(nofs_flags);
 }
 
 /*
@@ -1533,7 +1544,7 @@ xlog_cil_process_intents(
 		set_bit(XFS_LI_WHITEOUT, &ilip->li_flags);
 		trace_xfs_cil_whiteout_mark(ilip);
 		len += ilip->li_lv->lv_bytes;
-		kmem_free(ilip->li_lv);
+		kvfree(ilip->li_lv);
 		ilip->li_lv = NULL;
 
 		xfs_trans_del_item(lip);
@@ -1747,7 +1758,7 @@ xlog_cil_init(
 	struct xlog_cil_pcp	*cilpcp;
 	int			cpu;
 
-	cil = kmem_zalloc(sizeof(*cil), KM_MAYFAIL);
+	cil = kzalloc(sizeof(*cil), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 	if (!cil)
 		return -ENOMEM;
 	/*
@@ -1786,7 +1797,7 @@ xlog_cil_init(
 out_destroy_wq:
 	destroy_workqueue(cil->xc_push_wq);
 out_destroy_cil:
-	kmem_free(cil);
+	kfree(cil);
 	return -ENOMEM;
 }
 
@@ -1799,12 +1810,12 @@ xlog_cil_destroy(
 	if (cil->xc_ctx) {
 		if (cil->xc_ctx->ticket)
 			xfs_log_ticket_put(cil->xc_ctx->ticket);
-		kmem_free(cil->xc_ctx);
+		kfree(cil->xc_ctx);
 	}
 
 	ASSERT(test_bit(XLOG_CIL_EMPTY, &cil->xc_flags));
 	free_percpu(cil->xc_pcp);
 	destroy_workqueue(cil->xc_push_wq);
-	kmem_free(cil);
+	kfree(cil);
 }
 
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 1251c81e55f9..13f1d2e91540 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -361,7 +361,7 @@ xlog_find_verify_cycle(
 	*new_blk = -1;
 
 out:
-	kmem_free(buffer);
+	kvfree(buffer);
 	return error;
 }
 
@@ -477,7 +477,7 @@ xlog_find_verify_log_record(
 		*last_blk = i;
 
 out:
-	kmem_free(buffer);
+	kvfree(buffer);
 	return error;
 }
 
@@ -731,7 +731,7 @@ validate_head:
 			goto out_free_buffer;
 	}
 
-	kmem_free(buffer);
+	kvfree(buffer);
 	if (head_blk == log_bbnum)
 		*return_head_blk = 0;
 	else
@@ -745,7 +745,7 @@ validate_head:
 	return 0;
 
 out_free_buffer:
-	kmem_free(buffer);
+	kvfree(buffer);
 	if (error)
 		xfs_warn(log->l_mp, "failed to find log head");
 	return error;
@@ -999,7 +999,7 @@ xlog_verify_tail(
 		"Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
 			 orig_tail, *tail_blk);
 out:
-	kmem_free(buffer);
+	kvfree(buffer);
 	return error;
 }
 
@@ -1046,7 +1046,7 @@ xlog_verify_head(
 	error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
 				      XLOG_MAX_ICLOGS, tmp_buffer,
 				      &tmp_rhead_blk, &tmp_rhead, &tmp_wrapped);
-	kmem_free(tmp_buffer);
+	kvfree(tmp_buffer);
 	if (error < 0)
 		return error;
 
@@ -1365,7 +1365,7 @@ xlog_find_tail(
 		error = xlog_clear_stale_blocks(log, tail_lsn);
 
 done:
-	kmem_free(buffer);
+	kvfree(buffer);
 
 	if (error)
 		xfs_warn(log->l_mp, "failed to locate log tail");
@@ -1399,6 +1399,7 @@ xlog_find_zeroed(
 	xfs_daddr_t	new_blk, last_blk, start_blk;
 	xfs_daddr_t     num_scan_bblks;
 	int	        error, log_bbnum = log->l_logBBsize;
+	int		ret = 1;
 
 	*blk_no = 0;
 
@@ -1413,8 +1414,7 @@ xlog_find_zeroed(
 	first_cycle = xlog_get_cycle(offset);
 	if (first_cycle == 0) {		/* completely zeroed log */
 		*blk_no = 0;
-		kmem_free(buffer);
-		return 1;
+		goto out_free_buffer;
 	}
 
 	/* check partially zeroed log */
@@ -1424,8 +1424,8 @@ xlog_find_zeroed(
 
 	last_cycle = xlog_get_cycle(offset);
 	if (last_cycle != 0) {		/* log completely written to */
-		kmem_free(buffer);
-		return 0;
+		ret = 0;
+		goto out_free_buffer;
 	}
 
 	/* we have a partially zeroed log */
@@ -1471,10 +1471,10 @@ xlog_find_zeroed(
 
 	*blk_no = last_blk;
 out_free_buffer:
-	kmem_free(buffer);
+	kvfree(buffer);
 	if (error)
 		return error;
-	return 1;
+	return ret;
 }
 
 /*
@@ -1583,7 +1583,7 @@ xlog_write_log_records(
 	}
 
 out_free_buffer:
-	kmem_free(buffer);
+	kvfree(buffer);
 	return error;
 }
 
@@ -2057,7 +2057,8 @@ xlog_recover_add_item(
 {
 	struct xlog_recover_item *item;
 
-	item = kmem_zalloc(sizeof(struct xlog_recover_item), 0);
+	item = kzalloc(sizeof(struct xlog_recover_item),
+			GFP_KERNEL | __GFP_NOFAIL);
 	INIT_LIST_HEAD(&item->ri_list);
 	list_add_tail(&item->ri_list, head);
 }
@@ -2160,7 +2161,7 @@ xlog_recover_add_to_trans(
 		return 0;
 	}
 
-	ptr = kmem_alloc(len, 0);
+	ptr = xlog_kvmalloc(len);
 	memcpy(ptr, dp, len);
 	in_f = (struct xfs_inode_log_format *)ptr;
 
@@ -2182,14 +2183,13 @@ xlog_recover_add_to_trans(
 		"bad number of regions (%d) in inode log format",
 				  in_f->ilf_size);
 			ASSERT(0);
-			kmem_free(ptr);
+			kvfree(ptr);
 			return -EFSCORRUPTED;
 		}
 
 		item->ri_total = in_f->ilf_size;
-		item->ri_buf =
-			kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
-				    0);
+		item->ri_buf = kzalloc(item->ri_total * sizeof(xfs_log_iovec_t),
+				GFP_KERNEL | __GFP_NOFAIL);
 	}
 
 	if (item->ri_total <= item->ri_cnt) {
@@ -2197,7 +2197,7 @@ xlog_recover_add_to_trans(
 	"log item region count (%d) overflowed size (%d)",
 				item->ri_cnt, item->ri_total);
 		ASSERT(0);
-		kmem_free(ptr);
+		kvfree(ptr);
 		return -EFSCORRUPTED;
 	}
 
@@ -2227,13 +2227,13 @@ xlog_recover_free_trans(
 		/* Free the regions in the item. */
 		list_del(&item->ri_list);
 		for (i = 0; i < item->ri_cnt; i++)
-			kmem_free(item->ri_buf[i].i_addr);
+			kvfree(item->ri_buf[i].i_addr);
 		/* Free the item itself */
-		kmem_free(item->ri_buf);
-		kmem_free(item);
+		kfree(item->ri_buf);
+		kfree(item);
 	}
 	/* Free the transaction recover structure */
-	kmem_free(trans);
+	kfree(trans);
 }
 
 /*
@@ -2332,7 +2332,7 @@ xlog_recover_ophdr_to_trans(
 	 * This is a new transaction so allocate a new recovery container to
 	 * hold the recovery ops that will follow.
 	 */
-	trans = kmem_zalloc(sizeof(struct xlog_recover), 0);
+	trans = kzalloc(sizeof(struct xlog_recover), GFP_KERNEL | __GFP_NOFAIL);
 	trans->r_log_tid = tid;
 	trans->r_lsn = be64_to_cpu(rhead->h_lsn);
 	INIT_LIST_HEAD(&trans->r_itemq);
@@ -3024,7 +3024,7 @@ xlog_do_recovery_pass(
 
 		hblks = xlog_logrec_hblks(log, rhead);
 		if (hblks != 1) {
-			kmem_free(hbp);
+			kvfree(hbp);
 			hbp = xlog_alloc_buffer(log, hblks);
 		}
 	} else {
@@ -3038,7 +3038,7 @@ xlog_do_recovery_pass(
 		return -ENOMEM;
 	dbp = xlog_alloc_buffer(log, BTOBB(h_size));
 	if (!dbp) {
-		kmem_free(hbp);
+		kvfree(hbp);
 		return -ENOMEM;
 	}
 
@@ -3199,16 +3199,33 @@ xlog_do_recovery_pass(
 	}
 
  bread_err2:
-	kmem_free(dbp);
+	kvfree(dbp);
  bread_err1:
-	kmem_free(hbp);
+	kvfree(hbp);
 
 	/*
-	 * Submit buffers that have been added from the last record processed,
-	 * regardless of error status.
+	 * Submit buffers that have been dirtied by the last record recovered.
 	 */
-	if (!list_empty(&buffer_list))
+	if (!list_empty(&buffer_list)) {
+		if (error) {
+			/*
+			 * If there has been an item recovery error then we
+			 * cannot allow partial checkpoint writeback to
+			 * occur.  We might have multiple checkpoints with the
+			 * same start LSN in this buffer list, and partial
+			 * writeback of a checkpoint in this situation can
+			 * prevent future recovery of all the changes in the
+			 * checkpoints at this start LSN.
+			 *
+			 * Note: Shutting down the filesystem will result in the
+			 * delwri submission marking all the buffers stale,
+			 * completing them and cleaning up _XBF_LOGRECOVERY
+			 * state without doing any IO.
+			 */
+			xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+		}
 		error2 = xfs_buf_delwri_submit(&buffer_list);
+	}
 
 	if (error && first_bad)
 		*first_bad = rhead_blk;
@@ -3443,12 +3460,19 @@ xlog_recover(
  * part of recovery so that the root and real-time bitmap inodes can be read in
  * from disk in between the two stages.  This is necessary so that we can free
  * space in the real-time portion of the file system.
+ *
+ * We run this whole process under GFP_NOFS allocation context. We do a
+ * combination of non-transactional and transactional work, yet we really don't
+ * want to recurse into the filesystem from direct reclaim during any of this
+ * processing. This allows all the recovery code run here not to care about the
+ * memory allocation context it is running in.
  */
 int
 xlog_recover_finish(
 	struct xlog	*log)
 {
-	int	error;
+	unsigned int	nofs_flags = memalloc_nofs_save();
+	int		error;
 
 	error = xlog_recover_process_intents(log);
 	if (error) {
@@ -3462,7 +3486,7 @@ xlog_recover_finish(
 		xlog_recover_cancel_intents(log);
 		xfs_alert(log->l_mp, "Failed to recover intents");
 		xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
-		return error;
+		goto out_error;
 	}
 
 	/*
@@ -3483,7 +3507,7 @@ xlog_recover_finish(
 		if (error < 0) {
 			xfs_alert(log->l_mp,
 	"Failed to clear log incompat features on recovery");
-			return error;
+			goto out_error;
 		}
 	}
 
@@ -3508,9 +3532,13 @@ xlog_recover_finish(
 		 * and AIL.
 		 */
 		xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
+		error = 0;
+		goto out_error;
 	}
 
-	return 0;
+out_error:
+	memalloc_nofs_restore(nofs_flags);
+	return error;
 }
 
 void
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index aabb25dc3efa..df370eb5dc15 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -45,7 +45,7 @@ xfs_uuid_table_free(void)
 {
 	if (xfs_uuid_table_size == 0)
 		return;
-	kmem_free(xfs_uuid_table);
+	kfree(xfs_uuid_table);
 	xfs_uuid_table = NULL;
 	xfs_uuid_table_size = 0;
 }
@@ -62,7 +62,7 @@ xfs_uuid_mount(
 	int			hole, i;
 
 	/* Publish UUID in struct super_block */
-	uuid_copy(&mp->m_super->s_uuid, uuid);
+	super_set_uuid(mp->m_super, uuid->b, sizeof(*uuid));
 
 	if (xfs_has_nouuid(mp))
 		return 0;
@@ -706,6 +706,8 @@ xfs_mountfs(
 	/* enable fail_at_unmount as default */
 	mp->m_fail_unmount = true;
 
+	super_set_sysfs_name_id(mp->m_super);
+
 	error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype,
 			       NULL, mp->m_super->s_id);
 	if (error)
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 503fe3c7edbf..e880aa48de68 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -94,9 +94,9 @@ typedef struct xfs_mount {
 	struct xfs_inode	*m_rsumip;	/* pointer to summary inode */
 	struct xfs_inode	*m_rootip;	/* pointer to root directory */
 	struct xfs_quotainfo	*m_quotainfo;	/* disk quota information */
-	xfs_buftarg_t		*m_ddev_targp;	/* saves taking the address */
-	xfs_buftarg_t		*m_logdev_targp;/* ptr to log device */
-	xfs_buftarg_t		*m_rtdev_targp;	/* ptr to rt device */
+	struct xfs_buftarg	*m_ddev_targp;	/* data device */
+	struct xfs_buftarg	*m_logdev_targp;/* log device */
+	struct xfs_buftarg	*m_rtdev_targp;	/* rt device */
 	void __percpu		*m_inodegc;	/* percpu inodegc structures */
 
 	/*
@@ -252,6 +252,9 @@ typedef struct xfs_mount {
 
 	/* cpus that have inodes queued for inactivation */
 	struct cpumask		m_inodegc_cpumask;
+
+	/* Hook to feed dirent updates to an active online repair. */
+	struct xfs_hooks	m_dir_update_hooks;
 } xfs_mount_t;
 
 #define M_IGEO(mp)		(&(mp)->m_ino_geo)
@@ -502,9 +505,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
 	return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks);
 }
 
-int xfs_buf_hash_init(struct xfs_perag *pag);
-void xfs_buf_hash_destroy(struct xfs_perag *pag);
-
 extern void	xfs_uuid_table_free(void);
 extern uint64_t xfs_default_resblks(xfs_mount_t *mp);
 extern int	xfs_mountfs(xfs_mount_t *mp);
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c
index f85e3b07ab44..7443debaffd6 100644
--- a/fs/xfs/xfs_mru_cache.c
+++ b/fs/xfs/xfs_mru_cache.c
@@ -333,13 +333,14 @@ xfs_mru_cache_create(
 	if (!(grp_time = msecs_to_jiffies(lifetime_ms) / grp_count))
 		return -EINVAL;
 
-	if (!(mru = kmem_zalloc(sizeof(*mru), 0)))
+	mru = kzalloc(sizeof(*mru), GFP_KERNEL | __GFP_NOFAIL);
+	if (!mru)
 		return -ENOMEM;
 
 	/* An extra list is needed to avoid reaping up to a grp_time early. */
 	mru->grp_count = grp_count + 1;
-	mru->lists = kmem_zalloc(mru->grp_count * sizeof(*mru->lists), 0);
-
+	mru->lists = kzalloc(mru->grp_count * sizeof(*mru->lists),
+				GFP_KERNEL | __GFP_NOFAIL);
 	if (!mru->lists) {
 		err = -ENOMEM;
 		goto exit;
@@ -364,9 +365,9 @@ xfs_mru_cache_create(
 
 exit:
 	if (err && mru && mru->lists)
-		kmem_free(mru->lists);
+		kfree(mru->lists);
 	if (err && mru)
-		kmem_free(mru);
+		kfree(mru);
 
 	return err;
 }
@@ -406,8 +407,8 @@ xfs_mru_cache_destroy(
 
 	xfs_mru_cache_flush(mru);
 
-	kmem_free(mru->lists);
-	kmem_free(mru);
+	kfree(mru->lists);
+	kfree(mru);
 }
 
 /*
@@ -427,7 +428,7 @@ xfs_mru_cache_insert(
 	if (!mru || !mru->lists)
 		return -EINVAL;
 
-	if (radix_tree_preload(GFP_NOFS))
+	if (radix_tree_preload(GFP_KERNEL))
 		return -ENOMEM;
 
 	INIT_LIST_HEAD(&elem->list_node);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 67d0a8564ff3..0f4cf4170c35 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -26,6 +26,7 @@
 #include "xfs_ag.h"
 #include "xfs_ialloc.h"
 #include "xfs_log_priv.h"
+#include "xfs_health.h"
 
 /*
  * The global quota manager. There is only one of these for the entire
@@ -254,7 +255,7 @@ xfs_qm_dqattach_one(
 	struct xfs_dquot	*dqp;
 	int			error;
 
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
 	error = 0;
 
 	/*
@@ -322,7 +323,7 @@ xfs_qm_dqattach_locked(
 	if (!xfs_qm_need_dqattach(ip))
 		return 0;
 
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
 
 	if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) {
 		error = xfs_qm_dqattach_one(ip, XFS_DQTYPE_USER,
@@ -353,7 +354,7 @@ done:
 	 * Don't worry about the dquots that we may have attached before any
 	 * error - they'll get detached later if it has not already been done.
 	 */
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
 	return error;
 }
 
@@ -628,7 +629,8 @@ xfs_qm_init_quotainfo(
 
 	ASSERT(XFS_IS_QUOTA_ON(mp));
 
-	qinf = mp->m_quotainfo = kmem_zalloc(sizeof(struct xfs_quotainfo), 0);
+	qinf = mp->m_quotainfo = kzalloc(sizeof(struct xfs_quotainfo),
+					GFP_KERNEL | __GFP_NOFAIL);
 
 	error = list_lru_init(&qinf->qi_lru);
 	if (error)
@@ -642,9 +644,9 @@ xfs_qm_init_quotainfo(
 	if (error)
 		goto out_free_lru;
 
-	INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS);
-	INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS);
-	INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_NOFS);
+	INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_KERNEL);
+	INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_KERNEL);
+	INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_KERNEL);
 	mutex_init(&qinf->qi_tree_lock);
 
 	/* mutex used to serialize quotaoffs */
@@ -691,6 +693,9 @@ xfs_qm_init_quotainfo(
 
 	shrinker_register(qinf->qi_shrinker);
 
+	xfs_hooks_init(&qinf->qi_mod_ino_dqtrx_hooks);
+	xfs_hooks_init(&qinf->qi_apply_dqtrx_hooks);
+
 	return 0;
 
 out_free_inos:
@@ -700,7 +705,7 @@ out_free_inos:
 out_free_lru:
 	list_lru_destroy(&qinf->qi_lru);
 out_free_qinf:
-	kmem_free(qinf);
+	kfree(qinf);
 	mp->m_quotainfo = NULL;
 	return error;
 }
@@ -724,7 +729,7 @@ xfs_qm_destroy_quotainfo(
 	xfs_qm_destroy_quotainos(qi);
 	mutex_destroy(&qi->qi_tree_lock);
 	mutex_destroy(&qi->qi_quotaofflock);
-	kmem_free(qi);
+	kfree(qi);
 	mp->m_quotainfo = NULL;
 }
 
@@ -758,14 +763,18 @@ xfs_qm_qino_alloc(
 			     (mp->m_sb.sb_gquotino != NULLFSINO)) {
 			ino = mp->m_sb.sb_gquotino;
 			if (XFS_IS_CORRUPT(mp,
-					   mp->m_sb.sb_pquotino != NULLFSINO))
+					   mp->m_sb.sb_pquotino != NULLFSINO)) {
+				xfs_fs_mark_sick(mp, XFS_SICK_FS_PQUOTA);
 				return -EFSCORRUPTED;
+			}
 		} else if ((flags & XFS_QMOPT_GQUOTA) &&
 			     (mp->m_sb.sb_pquotino != NULLFSINO)) {
 			ino = mp->m_sb.sb_pquotino;
 			if (XFS_IS_CORRUPT(mp,
-					   mp->m_sb.sb_gquotino != NULLFSINO))
+					   mp->m_sb.sb_gquotino != NULLFSINO)) {
+				xfs_fs_mark_sick(mp, XFS_SICK_FS_GQUOTA);
 				return -EFSCORRUPTED;
+			}
 		}
 		if (ino != NULLFSINO) {
 			error = xfs_iget(mp, NULL, ino, 0, 0, ipp);
@@ -996,7 +1005,8 @@ xfs_qm_reset_dqcounts_buf(
 	if (qip->i_nblocks == 0)
 		return 0;
 
-	map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), 0);
+	map = kmalloc(XFS_DQITER_MAP_SIZE * sizeof(*map),
+			GFP_KERNEL | __GFP_NOFAIL);
 
 	lblkno = 0;
 	maxlblkcnt = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
@@ -1058,7 +1068,7 @@ xfs_qm_reset_dqcounts_buf(
 	} while (nmaps > 0);
 
 out:
-	kmem_free(map);
+	kfree(map);
 	return error;
 }
 
@@ -1406,8 +1416,12 @@ error_return:
 			xfs_warn(mp,
 				"Quotacheck: Failed to reset quota flags.");
 		}
-	} else
+		xfs_fs_mark_sick(mp, XFS_SICK_FS_QUOTACHECK);
+	} else {
 		xfs_notice(mp, "Quotacheck: Done.");
+		xfs_fs_mark_healthy(mp, XFS_SICK_FS_QUOTACHECK);
+	}
+
 	return error;
 
 error_purge:
@@ -1809,7 +1823,7 @@ xfs_qm_vop_chown(
 				 XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT;
 
 
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
 	ASSERT(XFS_IS_QUOTA_ON(ip->i_mount));
 
 	/* old dquot */
@@ -1817,12 +1831,12 @@ xfs_qm_vop_chown(
 	ASSERT(prevdq);
 	ASSERT(prevdq != newdq);
 
-	xfs_trans_mod_dquot(tp, prevdq, bfield, -(ip->i_nblocks));
-	xfs_trans_mod_dquot(tp, prevdq, XFS_TRANS_DQ_ICOUNT, -1);
+	xfs_trans_mod_ino_dquot(tp, ip, prevdq, bfield, -(ip->i_nblocks));
+	xfs_trans_mod_ino_dquot(tp, ip, prevdq, XFS_TRANS_DQ_ICOUNT, -1);
 
 	/* the sparkling new dquot */
-	xfs_trans_mod_dquot(tp, newdq, bfield, ip->i_nblocks);
-	xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1);
+	xfs_trans_mod_ino_dquot(tp, ip, newdq, bfield, ip->i_nblocks);
+	xfs_trans_mod_ino_dquot(tp, ip, newdq, XFS_TRANS_DQ_ICOUNT, 1);
 
 	/*
 	 * Back when we made quota reservations for the chown, we reserved the
@@ -1897,29 +1911,28 @@ xfs_qm_vop_create_dqattach(
 	if (!XFS_IS_QUOTA_ON(mp))
 		return;
 
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
 
 	if (udqp && XFS_IS_UQUOTA_ON(mp)) {
 		ASSERT(ip->i_udquot == NULL);
 		ASSERT(i_uid_read(VFS_I(ip)) == udqp->q_id);
 
 		ip->i_udquot = xfs_qm_dqhold(udqp);
-		xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
 	}
 	if (gdqp && XFS_IS_GQUOTA_ON(mp)) {
 		ASSERT(ip->i_gdquot == NULL);
 		ASSERT(i_gid_read(VFS_I(ip)) == gdqp->q_id);
 
 		ip->i_gdquot = xfs_qm_dqhold(gdqp);
-		xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
 	}
 	if (pdqp && XFS_IS_PQUOTA_ON(mp)) {
 		ASSERT(ip->i_pdquot == NULL);
 		ASSERT(ip->i_projid == pdqp->q_id);
 
 		ip->i_pdquot = xfs_qm_dqhold(pdqp);
-		xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1);
 	}
+
+	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, 1);
 }
 
 /* Decide if this inode's dquot is near an enforcement boundary. */
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index d5c9fc4ba591..f5993012bf98 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -68,6 +68,10 @@ struct xfs_quotainfo {
 	/* Minimum and maximum quota expiration timestamp values. */
 	time64_t		qi_expiry_min;
 	time64_t		qi_expiry_max;
+
+	/* Hook to feed quota counter updates to an active online repair. */
+	struct xfs_hooks	qi_mod_ino_dqtrx_hooks;
+	struct xfs_hooks	qi_apply_dqtrx_hooks;
 };
 
 static inline struct radix_tree_root *
@@ -104,6 +108,18 @@ xfs_quota_inode(struct xfs_mount *mp, xfs_dqtype_t type)
 	return NULL;
 }
 
+/*
+ * Parameters for tracking dqtrx changes on behalf of an inode.  The hook
+ * function arg parameter is the field being updated.
+ */
+struct xfs_mod_ino_dqtrx_params {
+	uintptr_t		tx_id;
+	xfs_ino_t		ino;
+	xfs_dqtype_t		q_type;
+	xfs_dqid_t		q_id;
+	int64_t			delta;
+};
+
 extern void	xfs_trans_mod_dquot(struct xfs_trans *tp, struct xfs_dquot *dqp,
 				    uint field, int64_t delta);
 extern void	xfs_trans_dqjoin(struct xfs_trans *, struct xfs_dquot *);
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index b77673dd0558..271c1021c733 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -9,6 +9,7 @@
 #include "xfs_format.h"
 #include "xfs_log_format.h"
 #include "xfs_trans_resv.h"
+#include "xfs_mount.h"
 #include "xfs_quota.h"
 #include "xfs_mount.h"
 #include "xfs_inode.h"
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index e0d56489f3b2..85a4ae1a17f6 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -74,6 +74,22 @@ struct xfs_dqtrx {
 	int64_t		qt_icount_delta;  /* dquot inode count changes */
 };
 
+enum xfs_apply_dqtrx_type {
+	XFS_APPLY_DQTRX_COMMIT = 0,
+	XFS_APPLY_DQTRX_UNRESERVE,
+};
+
+/*
+ * Parameters for applying dqtrx changes to a dquot.  The hook function arg
+ * parameter is enum xfs_apply_dqtrx_type.
+ */
+struct xfs_apply_dqtrx_params {
+	uintptr_t		tx_id;
+	xfs_ino_t		ino;
+	xfs_dqtype_t		q_type;
+	xfs_dqid_t		q_id;
+};
+
 #ifdef CONFIG_XFS_QUOTA
 extern void xfs_trans_dup_dqinfo(struct xfs_trans *, struct xfs_trans *);
 extern void xfs_trans_free_dqinfo(struct xfs_trans *);
@@ -114,6 +130,30 @@ xfs_quota_reserve_blkres(struct xfs_inode *ip, int64_t blocks)
 	return xfs_trans_reserve_quota_nblks(NULL, ip, blocks, 0, false);
 }
 bool xfs_inode_near_dquot_enforcement(struct xfs_inode *ip, xfs_dqtype_t type);
+
+# ifdef CONFIG_XFS_LIVE_HOOKS
+void xfs_trans_mod_ino_dquot(struct xfs_trans *tp, struct xfs_inode *ip,
+		struct xfs_dquot *dqp, unsigned int field, int64_t delta);
+
+struct xfs_quotainfo;
+
+struct xfs_dqtrx_hook {
+	struct xfs_hook		mod_hook;
+	struct xfs_hook		apply_hook;
+};
+
+void xfs_dqtrx_hook_disable(void);
+void xfs_dqtrx_hook_enable(void);
+
+int xfs_dqtrx_hook_add(struct xfs_quotainfo *qi, struct xfs_dqtrx_hook *hook);
+void xfs_dqtrx_hook_del(struct xfs_quotainfo *qi, struct xfs_dqtrx_hook *hook);
+void xfs_dqtrx_hook_setup(struct xfs_dqtrx_hook *hook, notifier_fn_t mod_fn,
+		notifier_fn_t apply_fn);
+# else
+#  define xfs_trans_mod_ino_dquot(tp, ip, dqp, field, delta) \
+		xfs_trans_mod_dquot((tp), (dqp), (field), (delta))
+# endif /* CONFIG_XFS_LIVE_HOOKS */
+
 #else
 static inline int
 xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid,
@@ -173,6 +213,12 @@ xfs_trans_reserve_quota_icreate(struct xfs_trans *tp, struct xfs_dquot *udqp,
 #define xfs_qm_unmount(mp)
 #define xfs_qm_unmount_quotas(mp)
 #define xfs_inode_near_dquot_enforcement(ip, type)			(false)
+
+# ifdef CONFIG_XFS_LIVE_HOOKS
+#  define xfs_dqtrx_hook_enable()		((void)0)
+#  define xfs_dqtrx_hook_disable()		((void)0)
+# endif /* CONFIG_XFS_LIVE_HOOKS */
+
 #endif /* CONFIG_XFS_QUOTA */
 
 static inline int
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 20ad8086da60..14919b33e4fe 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -36,9 +36,9 @@ STATIC void
 xfs_cui_item_free(
 	struct xfs_cui_log_item	*cuip)
 {
-	kmem_free(cuip->cui_item.li_lv_shadow);
+	kvfree(cuip->cui_item.li_lv_shadow);
 	if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS)
-		kmem_free(cuip);
+		kfree(cuip);
 	else
 		kmem_cache_free(xfs_cui_cache, cuip);
 }
@@ -143,8 +143,8 @@ xfs_cui_init(
 
 	ASSERT(nextents > 0);
 	if (nextents > XFS_CUI_MAX_FAST_EXTENTS)
-		cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents),
-				0);
+		cuip = kzalloc(xfs_cui_log_item_sizeof(nextents),
+				GFP_KERNEL | __GFP_NOFAIL);
 	else
 		cuip = kmem_cache_zalloc(xfs_cui_cache,
 					 GFP_KERNEL | __GFP_NOFAIL);
@@ -207,7 +207,7 @@ xfs_cud_item_release(
 	struct xfs_cud_log_item	*cudp = CUD_ITEM(lip);
 
 	xfs_cui_release(cudp->cud_cuip);
-	kmem_free(cudp->cud_item.li_lv_shadow);
+	kvfree(cudp->cud_item.li_lv_shadow);
 	kmem_cache_free(xfs_cud_cache, cudp);
 }
 
@@ -425,7 +425,7 @@ xfs_cui_recover_work(
 	struct xfs_refcount_intent	*ri;
 
 	ri = kmem_cache_alloc(xfs_refcount_intent_cache,
-			GFP_NOFS | __GFP_NOFAIL);
+			GFP_KERNEL | __GFP_NOFAIL);
 	ri->ri_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK;
 	ri->ri_startblock = pmap->pe_startblock;
 	ri->ri_blockcount = pmap->pe_len;
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index d5ca8bcae65b..7da0e8f961d3 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -29,6 +29,7 @@
 #include "xfs_iomap.h"
 #include "xfs_ag.h"
 #include "xfs_ag_resv.h"
+#include "xfs_health.h"
 
 /*
  * Copy on Write of Shared Blocks
@@ -527,7 +528,7 @@ xfs_reflink_allocate_cow(
 	int			error;
 	bool			found;
 
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
 	if (!ip->i_cowfp) {
 		ASSERT(!xfs_is_reflink_inode(ip));
 		xfs_ifork_init_cow(ip);
@@ -805,7 +806,7 @@ xfs_reflink_end_cow_extent(
 		 * If the extent we're remapping is backed by storage (written
 		 * or not), unmap the extent and drop its refcount.
 		 */
-		xfs_bmap_unmap_extent(tp, ip, &data);
+		xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data);
 		xfs_refcount_decrease_extent(tp, &data);
 		xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
 				-data.br_blockcount);
@@ -829,7 +830,7 @@ xfs_reflink_end_cow_extent(
 	xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);
 
 	/* Map the new blocks into the data fork. */
-	xfs_bmap_map_extent(tp, ip, &del);
+	xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, &del);
 
 	/* Charge this new data fork mapping to the on-disk quota. */
 	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
@@ -1227,8 +1228,10 @@ xfs_reflink_remap_extent(
 	 * extent if they're both holes or both the same physical extent.
 	 */
 	if (dmap->br_startblock == smap.br_startblock) {
-		if (dmap->br_state != smap.br_state)
+		if (dmap->br_state != smap.br_state) {
+			xfs_bmap_mark_sick(ip, XFS_DATA_FORK);
 			error = -EFSCORRUPTED;
+		}
 		goto out_cancel;
 	}
 
@@ -1291,7 +1294,7 @@ xfs_reflink_remap_extent(
 		 * If the extent we're unmapping is backed by storage (written
 		 * or not), unmap the extent and drop its refcount.
 		 */
-		xfs_bmap_unmap_extent(tp, ip, &smap);
+		xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &smap);
 		xfs_refcount_decrease_extent(tp, &smap);
 		qdelta -= smap.br_blockcount;
 	} else if (smap.br_startblock == DELAYSTARTBLOCK) {
@@ -1316,7 +1319,7 @@ xfs_reflink_remap_extent(
 	 */
 	if (dmap_written) {
 		xfs_refcount_increase_extent(tp, dmap);
-		xfs_bmap_map_extent(tp, ip, dmap);
+		xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, dmap);
 		qdelta += dmap->br_blockcount;
 	}
 
@@ -1391,6 +1394,7 @@ xfs_reflink_remap_blocks(
 		ASSERT(nimaps == 1 && imap.br_startoff == srcoff);
 		if (imap.br_startblock == DELAYSTARTBLOCK) {
 			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+			xfs_bmap_mark_sick(src, XFS_DATA_FORK);
 			error = -EFSCORRUPTED;
 			break;
 		}
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 79ad0087aeca..e473124e29cc 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -36,9 +36,9 @@ STATIC void
 xfs_rui_item_free(
 	struct xfs_rui_log_item	*ruip)
 {
-	kmem_free(ruip->rui_item.li_lv_shadow);
+	kvfree(ruip->rui_item.li_lv_shadow);
 	if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS)
-		kmem_free(ruip);
+		kfree(ruip);
 	else
 		kmem_cache_free(xfs_rui_cache, ruip);
 }
@@ -142,7 +142,8 @@ xfs_rui_init(
 
 	ASSERT(nextents > 0);
 	if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
-		ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0);
+		ruip = kzalloc(xfs_rui_log_item_sizeof(nextents),
+				GFP_KERNEL | __GFP_NOFAIL);
 	else
 		ruip = kmem_cache_zalloc(xfs_rui_cache,
 					 GFP_KERNEL | __GFP_NOFAIL);
@@ -205,7 +206,7 @@ xfs_rud_item_release(
 	struct xfs_rud_log_item	*rudp = RUD_ITEM(lip);
 
 	xfs_rui_release(rudp->rud_ruip);
-	kmem_free(rudp->rud_item.li_lv_shadow);
+	kvfree(rudp->rud_item.li_lv_shadow);
 	kmem_cache_free(xfs_rud_cache, rudp);
 }
 
@@ -454,7 +455,7 @@ xfs_rui_recover_work(
 {
 	struct xfs_rmap_intent		*ri;
 
-	ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+	ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_KERNEL | __GFP_NOFAIL);
 
 	switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) {
 	case XFS_RMAP_EXTENT_MAP:
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 8649d981a097..e66f9bd5de5c 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -22,6 +22,8 @@
 #include "xfs_sb.h"
 #include "xfs_rtbitmap.h"
 #include "xfs_quota.h"
+#include "xfs_log_priv.h"
+#include "xfs_health.h"
 
 /*
  * Return whether there are any free extents in the size range given
@@ -903,7 +905,7 @@ xfs_growfs_rt(
 	/*
 	 * Allocate a new (fake) mount/sb.
 	 */
-	nmp = kmem_alloc(sizeof(*nmp), 0);
+	nmp = kmalloc(sizeof(*nmp), GFP_KERNEL | __GFP_NOFAIL);
 	/*
 	 * Loop over the bitmap blocks.
 	 * We will do everything one bitmap block at a time.
@@ -1050,7 +1052,7 @@ out_free:
 	/*
 	 * Free the fake mp structure.
 	 */
-	kmem_free(nmp);
+	kfree(nmp);
 
 	/*
 	 * If we had to allocate a new rsum_cache, we either need to free the
@@ -1059,10 +1061,10 @@ out_free:
 	 */
 	if (rsum_cache != mp->m_rsum_cache) {
 		if (error) {
-			kmem_free(mp->m_rsum_cache);
+			kvfree(mp->m_rsum_cache);
 			mp->m_rsum_cache = rsum_cache;
 		} else {
-			kmem_free(rsum_cache);
+			kvfree(rsum_cache);
 		}
 	}
 
@@ -1202,6 +1204,8 @@ xfs_rtmount_inodes(
 
 	sbp = &mp->m_sb;
 	error = xfs_iget(mp, NULL, sbp->sb_rbmino, 0, 0, &mp->m_rbmip);
+	if (xfs_metadata_is_sick(error))
+		xfs_rt_mark_sick(mp, XFS_SICK_RT_BITMAP);
 	if (error)
 		return error;
 	ASSERT(mp->m_rbmip != NULL);
@@ -1211,6 +1215,8 @@ xfs_rtmount_inodes(
 		goto out_rele_bitmap;
 
 	error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip);
+	if (xfs_metadata_is_sick(error))
+		xfs_rt_mark_sick(mp, XFS_SICK_RT_SUMMARY);
 	if (error)
 		goto out_rele_bitmap;
 	ASSERT(mp->m_rsumip != NULL);
@@ -1233,7 +1239,7 @@ void
 xfs_rtunmount_inodes(
 	struct xfs_mount	*mp)
 {
-	kmem_free(mp->m_rsum_cache);
+	kvfree(mp->m_rsum_cache);
 	if (mp->m_rbmip)
 		xfs_irele(mp->m_rbmip);
 	if (mp->m_rsumip)
@@ -1260,7 +1266,7 @@ xfs_rtpick_extent(
 	uint64_t		seq;		/* sequence number of file creation */
 	struct timespec64	ts;		/* timespec in inode */
 
-	ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(mp->m_rbmip, XFS_ILOCK_EXCL);
 
 	ts = inode_get_atime(VFS_I(mp->m_rbmip));
 	if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) {
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 90a77cd3ebad..ed97d72caa66 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -50,7 +50,9 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
 		{ "ibt2",		xfsstats_offset(xs_fibt_2)	},
 		{ "fibt2",		xfsstats_offset(xs_rmap_2)	},
 		{ "rmapbt",		xfsstats_offset(xs_refcbt_2)	},
-		{ "refcntbt",		xfsstats_offset(xs_qm_dqreclaims)},
+		{ "refcntbt",		xfsstats_offset(xs_rmap_mem_2)	},
+		{ "rmapbt_mem",		xfsstats_offset(xs_rcbag_2)	},
+		{ "rcbagbt",		xfsstats_offset(xs_qm_dqreclaims)},
 		/* we print both series of quota information together */
 		{ "qm",			xfsstats_offset(xs_xstrat_bytes)},
 	};
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 43ffba74f045..a61fb56ed2e6 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -125,6 +125,8 @@ struct __xfsstats {
 	uint32_t		xs_fibt_2[__XBTS_MAX];
 	uint32_t		xs_rmap_2[__XBTS_MAX];
 	uint32_t		xs_refcbt_2[__XBTS_MAX];
+	uint32_t		xs_rmap_mem_2[__XBTS_MAX];
+	uint32_t		xs_rcbag_2[__XBTS_MAX];
 	uint32_t		xs_qm_dqreclaims;
 	uint32_t		xs_qm_dqreclaim_misses;
 	uint32_t		xs_qm_dquot_dups;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 98401de832ee..bce020374c5e 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -44,6 +44,7 @@
 #include "xfs_dahash_test.h"
 #include "xfs_rtbitmap.h"
 #include "scrub/stats.h"
+#include "scrub/rcbag_btree.h"
 
 #include <linux/magic.h>
 #include <linux/fs_context.h>
@@ -361,16 +362,16 @@ STATIC int
 xfs_blkdev_get(
 	xfs_mount_t		*mp,
 	const char		*name,
-	struct bdev_handle	**handlep)
+	struct file		**bdev_filep)
 {
 	int			error = 0;
 
-	*handlep = bdev_open_by_path(name,
+	*bdev_filep = bdev_file_open_by_path(name,
 		BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES,
 		mp->m_super, &fs_holder_ops);
-	if (IS_ERR(*handlep)) {
-		error = PTR_ERR(*handlep);
-		*handlep = NULL;
+	if (IS_ERR(*bdev_filep)) {
+		error = PTR_ERR(*bdev_filep);
+		*bdev_filep = NULL;
 		xfs_warn(mp, "Invalid device [%s], error=%d", name, error);
 	}
 
@@ -435,26 +436,26 @@ xfs_open_devices(
 {
 	struct super_block	*sb = mp->m_super;
 	struct block_device	*ddev = sb->s_bdev;
-	struct bdev_handle	*logdev_handle = NULL, *rtdev_handle = NULL;
+	struct file		*logdev_file = NULL, *rtdev_file = NULL;
 	int			error;
 
 	/*
 	 * Open real time and log devices - order is important.
 	 */
 	if (mp->m_logname) {
-		error = xfs_blkdev_get(mp, mp->m_logname, &logdev_handle);
+		error = xfs_blkdev_get(mp, mp->m_logname, &logdev_file);
 		if (error)
 			return error;
 	}
 
 	if (mp->m_rtname) {
-		error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev_handle);
+		error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev_file);
 		if (error)
 			goto out_close_logdev;
 
-		if (rtdev_handle->bdev == ddev ||
-		    (logdev_handle &&
-		     rtdev_handle->bdev == logdev_handle->bdev)) {
+		if (file_bdev(rtdev_file) == ddev ||
+		    (logdev_file &&
+		     file_bdev(rtdev_file) == file_bdev(logdev_file))) {
 			xfs_warn(mp,
 	"Cannot mount filesystem with identical rtdev and ddev/logdev.");
 			error = -EINVAL;
@@ -466,25 +467,25 @@ xfs_open_devices(
 	 * Setup xfs_mount buffer target pointers
 	 */
 	error = -ENOMEM;
-	mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_handle);
+	mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_file);
 	if (!mp->m_ddev_targp)
 		goto out_close_rtdev;
 
-	if (rtdev_handle) {
-		mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_handle);
+	if (rtdev_file) {
+		mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_file);
 		if (!mp->m_rtdev_targp)
 			goto out_free_ddev_targ;
 	}
 
-	if (logdev_handle && logdev_handle->bdev != ddev) {
-		mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_handle);
+	if (logdev_file && file_bdev(logdev_file) != ddev) {
+		mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_file);
 		if (!mp->m_logdev_targp)
 			goto out_free_rtdev_targ;
 	} else {
 		mp->m_logdev_targp = mp->m_ddev_targp;
 		/* Handle won't be used, drop it */
-		if (logdev_handle)
-			bdev_release(logdev_handle);
+		if (logdev_file)
+			bdev_fput(logdev_file);
 	}
 
 	return 0;
@@ -495,11 +496,11 @@ xfs_open_devices(
  out_free_ddev_targ:
 	xfs_free_buftarg(mp->m_ddev_targp);
  out_close_rtdev:
-	 if (rtdev_handle)
-		bdev_release(rtdev_handle);
+	 if (rtdev_file)
+		bdev_fput(rtdev_file);
  out_close_logdev:
-	if (logdev_handle)
-		bdev_release(logdev_handle);
+	if (logdev_file)
+		bdev_fput(logdev_file);
 	return error;
 }
 
@@ -715,9 +716,7 @@ xfs_fs_inode_init_once(
 	/* xfs inode */
 	atomic_set(&ip->i_pincount, 0);
 	spin_lock_init(&ip->i_flags_lock);
-
-	mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
-		     "xfsino", ip->i_ino);
+	init_rwsem(&ip->i_lock);
 }
 
 /*
@@ -760,7 +759,7 @@ xfs_mount_free(
 	debugfs_remove(mp->m_debugfs);
 	kfree(mp->m_rtname);
 	kfree(mp->m_logname);
-	kmem_free(mp);
+	kfree(mp);
 }
 
 STATIC int
@@ -1986,7 +1985,7 @@ static int xfs_init_fs_context(
 {
 	struct xfs_mount	*mp;
 
-	mp = kmem_alloc(sizeof(struct xfs_mount), KM_ZERO);
+	mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL | __GFP_NOFAIL);
 	if (!mp)
 		return -ENOMEM;
 
@@ -2012,6 +2011,8 @@ static int xfs_init_fs_context(
 	mp->m_logbsize = -1;
 	mp->m_allocsize_log = 16; /* 64k */
 
+	xfs_hooks_init(&mp->m_dir_update_hooks);
+
 	fc->s_fs_info = mp;
 	fc->ops = &xfs_context_ops;
 
@@ -2043,8 +2044,7 @@ xfs_init_caches(void)
 
 	xfs_buf_cache = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
 					 SLAB_HWCACHE_ALIGN |
-					 SLAB_RECLAIM_ACCOUNT |
-					 SLAB_MEM_SPREAD,
+					 SLAB_RECLAIM_ACCOUNT,
 					 NULL);
 	if (!xfs_buf_cache)
 		goto out;
@@ -2059,10 +2059,14 @@ xfs_init_caches(void)
 	if (error)
 		goto out_destroy_log_ticket_cache;
 
-	error = xfs_defer_init_item_caches();
+	error = rcbagbt_init_cur_cache();
 	if (error)
 		goto out_destroy_btree_cur_cache;
 
+	error = xfs_defer_init_item_caches();
+	if (error)
+		goto out_destroy_rcbagbt_cur_cache;
+
 	xfs_da_state_cache = kmem_cache_create("xfs_da_state",
 					      sizeof(struct xfs_da_state),
 					      0, 0, NULL);
@@ -2109,14 +2113,14 @@ xfs_init_caches(void)
 					   sizeof(struct xfs_inode), 0,
 					   (SLAB_HWCACHE_ALIGN |
 					    SLAB_RECLAIM_ACCOUNT |
-					    SLAB_MEM_SPREAD | SLAB_ACCOUNT),
+					    SLAB_ACCOUNT),
 					   xfs_fs_inode_init_once);
 	if (!xfs_inode_cache)
 		goto out_destroy_efi_cache;
 
 	xfs_ili_cache = kmem_cache_create("xfs_ili",
 					 sizeof(struct xfs_inode_log_item), 0,
-					 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+					 SLAB_RECLAIM_ACCOUNT,
 					 NULL);
 	if (!xfs_ili_cache)
 		goto out_destroy_inode_cache;
@@ -2219,6 +2223,8 @@ xfs_init_caches(void)
 	kmem_cache_destroy(xfs_da_state_cache);
  out_destroy_defer_item_cache:
 	xfs_defer_destroy_item_caches();
+ out_destroy_rcbagbt_cur_cache:
+	rcbagbt_destroy_cur_cache();
  out_destroy_btree_cur_cache:
 	xfs_btree_destroy_cur_caches();
  out_destroy_log_ticket_cache:
@@ -2256,6 +2262,7 @@ xfs_destroy_caches(void)
 	kmem_cache_destroy(xfs_ifork_cache);
 	kmem_cache_destroy(xfs_da_state_cache);
 	xfs_defer_destroy_item_caches();
+	rcbagbt_destroy_cur_cache();
 	xfs_btree_destroy_cur_caches();
 	kmem_cache_destroy(xfs_log_ticket_cache);
 	kmem_cache_destroy(xfs_buf_cache);
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 92974a4414c8..3e376d24c7c1 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -24,77 +24,7 @@
 #include "xfs_ialloc.h"
 #include "xfs_error.h"
 #include "xfs_health.h"
-
-/* ----- Kernel only functions below ----- */
-int
-xfs_readlink_bmap_ilocked(
-	struct xfs_inode	*ip,
-	char			*link)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_bmbt_irec	mval[XFS_SYMLINK_MAPS];
-	struct xfs_buf		*bp;
-	xfs_daddr_t		d;
-	char			*cur_chunk;
-	int			pathlen = ip->i_disk_size;
-	int			nmaps = XFS_SYMLINK_MAPS;
-	int			byte_cnt;
-	int			n;
-	int			error = 0;
-	int			fsblocks = 0;
-	int			offset;
-
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
-
-	fsblocks = xfs_symlink_blocks(mp, pathlen);
-	error = xfs_bmapi_read(ip, 0, fsblocks, mval, &nmaps, 0);
-	if (error)
-		goto out;
-
-	offset = 0;
-	for (n = 0; n < nmaps; n++) {
-		d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
-		byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
-
-		error = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0,
-				&bp, &xfs_symlink_buf_ops);
-		if (error)
-			return error;
-		byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
-		if (pathlen < byte_cnt)
-			byte_cnt = pathlen;
-
-		cur_chunk = bp->b_addr;
-		if (xfs_has_crc(mp)) {
-			if (!xfs_symlink_hdr_ok(ip->i_ino, offset,
-							byte_cnt, bp)) {
-				error = -EFSCORRUPTED;
-				xfs_alert(mp,
-"symlink header does not match required off/len/owner (0x%x/Ox%x,0x%llx)",
-					offset, byte_cnt, ip->i_ino);
-				xfs_buf_relse(bp);
-				goto out;
-
-			}
-
-			cur_chunk += sizeof(struct xfs_dsymlink_hdr);
-		}
-
-		memcpy(link + offset, cur_chunk, byte_cnt);
-
-		pathlen -= byte_cnt;
-		offset += byte_cnt;
-
-		xfs_buf_relse(bp);
-	}
-	ASSERT(pathlen == 0);
-
-	link[ip->i_disk_size] = '\0';
-	error = 0;
-
- out:
-	return error;
-}
+#include "xfs_symlink_remote.h"
 
 int
 xfs_readlink(
@@ -103,7 +33,7 @@ xfs_readlink(
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	xfs_fsize_t		pathlen;
-	int			error = -EFSCORRUPTED;
+	int			error;
 
 	trace_xfs_readlink(ip);
 
@@ -116,14 +46,14 @@ xfs_readlink(
 
 	pathlen = ip->i_disk_size;
 	if (!pathlen)
-		goto out;
+		goto out_corrupt;
 
 	if (pathlen < 0 || pathlen > XFS_SYMLINK_MAXLEN) {
 		xfs_alert(mp, "%s: inode (%llu) bad symlink length (%lld)",
 			 __func__, (unsigned long long) ip->i_ino,
 			 (long long) pathlen);
 		ASSERT(0);
-		goto out;
+		goto out_corrupt;
 	}
 
 	if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
@@ -132,17 +62,20 @@ xfs_readlink(
 		 * if if_data is junk.
 		 */
 		if (XFS_IS_CORRUPT(ip->i_mount, !ip->i_df.if_data))
-			goto out;
+			goto out_corrupt;
 
 		memcpy(link, ip->i_df.if_data, pathlen + 1);
 		error = 0;
 	} else {
-		error = xfs_readlink_bmap_ilocked(ip, link);
+		error = xfs_symlink_remote_read(ip, link);
 	}
 
- out:
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 	return error;
+ out_corrupt:
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK);
+	return -EFSCORRUPTED;
 }
 
 int
@@ -160,15 +93,7 @@ xfs_symlink(
 	int			error = 0;
 	int			pathlen;
 	bool                    unlock_dp_on_error = false;
-	xfs_fileoff_t		first_fsb;
 	xfs_filblks_t		fs_blocks;
-	int			nmaps;
-	struct xfs_bmbt_irec	mval[XFS_SYMLINK_MAPS];
-	xfs_daddr_t		d;
-	const char		*cur_chunk;
-	int			byte_cnt;
-	int			n;
-	struct xfs_buf		*bp;
 	prid_t			prid;
 	struct xfs_dquot	*udqp = NULL;
 	struct xfs_dquot	*gdqp = NULL;
@@ -256,62 +181,11 @@ xfs_symlink(
 	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
 
 	resblks -= XFS_IALLOC_SPACE_RES(mp);
-	/*
-	 * If the symlink will fit into the inode, write it inline.
-	 */
-	if (pathlen <= xfs_inode_data_fork_size(ip)) {
-		xfs_init_local_fork(ip, XFS_DATA_FORK, target_path, pathlen);
-
-		ip->i_disk_size = pathlen;
-		ip->i_df.if_format = XFS_DINODE_FMT_LOCAL;
-		xfs_trans_log_inode(tp, ip, XFS_ILOG_DDATA | XFS_ILOG_CORE);
-	} else {
-		int	offset;
-
-		first_fsb = 0;
-		nmaps = XFS_SYMLINK_MAPS;
-
-		error = xfs_bmapi_write(tp, ip, first_fsb, fs_blocks,
-				  XFS_BMAPI_METADATA, resblks, mval, &nmaps);
-		if (error)
-			goto out_trans_cancel;
-
-		resblks -= fs_blocks;
-		ip->i_disk_size = pathlen;
-		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
-		cur_chunk = target_path;
-		offset = 0;
-		for (n = 0; n < nmaps; n++) {
-			char	*buf;
-
-			d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
-			byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
-			error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
-					       BTOBB(byte_cnt), 0, &bp);
-			if (error)
-				goto out_trans_cancel;
-			bp->b_ops = &xfs_symlink_buf_ops;
-
-			byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt);
-			byte_cnt = min(byte_cnt, pathlen);
-
-			buf = bp->b_addr;
-			buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset,
-						   byte_cnt, bp);
-
-			memcpy(buf, cur_chunk, byte_cnt);
-
-			cur_chunk += byte_cnt;
-			pathlen -= byte_cnt;
-			offset += byte_cnt;
-
-			xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF);
-			xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) -
-							(char *)bp->b_addr);
-		}
-		ASSERT(pathlen == 0);
-	}
+	error = xfs_symlink_write_target(tp, ip, target_path, pathlen,
+			fs_blocks, resblks);
+	if (error)
+		goto out_trans_cancel;
+	resblks -= fs_blocks;
 	i_size_write(VFS_I(ip), ip->i_disk_size);
 
 	/*
@@ -322,6 +196,7 @@ xfs_symlink(
 		goto out_trans_cancel;
 	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
 	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
+	xfs_dir_update_hook(dp, ip, 1, link_name);
 
 	/*
 	 * If this is a synchronous mount, make sure that the
@@ -496,6 +371,7 @@ xfs_inactive_symlink(
 			 __func__, (unsigned long long)ip->i_ino, pathlen);
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		ASSERT(0);
+		xfs_inode_mark_sick(ip, XFS_SICK_INO_SYMLINK);
 		return -EFSCORRUPTED;
 	}
 
diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h
index d1ca1ce62a93..0d29a50e66fd 100644
--- a/fs/xfs/xfs_symlink.h
+++ b/fs/xfs/xfs_symlink.h
@@ -10,7 +10,6 @@
 int xfs_symlink(struct mnt_idmap *idmap, struct xfs_inode *dp,
 		struct xfs_name *link_name, const char *target_path,
 		umode_t mode, struct xfs_inode **ipp);
-int xfs_readlink_bmap_ilocked(struct xfs_inode *ip, char *link);
 int xfs_readlink(struct xfs_inode *ip, char *link);
 int xfs_inactive_symlink(struct xfs_inode *ip);
 
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 17485666b672..d2391eec37fe 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -193,7 +193,6 @@ always_cow_show(
 }
 XFS_SYSFS_ATTR_RW(always_cow);
 
-#ifdef DEBUG
 /*
  * Override how many threads the parallel work queue is allowed to create.
  * This has to be a debug-only global (instead of an errortag) because one of
@@ -260,7 +259,6 @@ larp_show(
 	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.larp);
 }
 XFS_SYSFS_ATTR_RW(larp);
-#endif /* DEBUG */
 
 STATIC ssize_t
 bload_leaf_slack_store(
@@ -319,10 +317,8 @@ static struct attribute *xfs_dbg_attrs[] = {
 	ATTR_LIST(log_recovery_delay),
 	ATTR_LIST(mount_delay),
 	ATTR_LIST(always_cow),
-#ifdef DEBUG
 	ATTR_LIST(pwork_threads),
 	ATTR_LIST(larp),
-#endif
 	ATTR_LIST(bload_leaf_slack),
 	ATTR_LIST(bload_node_slack),
 	NULL,
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 8a5dc1538aa8..1a963382e5e9 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -36,6 +36,9 @@
 #include "xfs_error.h"
 #include <linux/iomap.h>
 #include "xfs_iomap.h"
+#include "xfs_buf_mem.h"
+#include "xfs_btree_mem.h"
+#include "xfs_bmap.h"
 
 /*
  * We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 0984a1c884c7..aea97fc074f8 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -79,6 +79,9 @@ union xfs_btree_ptr;
 struct xfs_dqtrx;
 struct xfs_icwalk;
 struct xfs_perag;
+struct xfbtree;
+struct xfs_btree_ops;
+struct xfs_bmap_intent;
 
 #define XFS_ATTR_FILTER_FLAGS \
 	{ XFS_ATTR_ROOT,	"ROOT" }, \
@@ -640,6 +643,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_read_buf_recur);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_log_buf);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_brelse);
+DEFINE_BUF_ITEM_EVENT(xfs_trans_bdetach);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
@@ -1710,12 +1714,10 @@ DECLARE_EVENT_CLASS(xfs_agf_class,
 		__entry->agno = be32_to_cpu(agf->agf_seqno),
 		__entry->flags = flags;
 		__entry->length = be32_to_cpu(agf->agf_length),
-		__entry->bno_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_BNO]),
-		__entry->cnt_root = be32_to_cpu(agf->agf_roots[XFS_BTNUM_CNT]),
-		__entry->bno_level =
-				be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]),
-		__entry->cnt_level =
-				be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]),
+		__entry->bno_root = be32_to_cpu(agf->agf_bno_root),
+		__entry->cnt_root = be32_to_cpu(agf->agf_cnt_root),
+		__entry->bno_level = be32_to_cpu(agf->agf_bno_level),
+		__entry->cnt_level = be32_to_cpu(agf->agf_cnt_level),
 		__entry->flfirst = be32_to_cpu(agf->agf_flfirst),
 		__entry->fllast = be32_to_cpu(agf->agf_fllast),
 		__entry->flcount = be32_to_cpu(agf->agf_flcount),
@@ -1890,28 +1892,28 @@ DEFINE_ALLOC_EVENT(xfs_alloc_vextent_near_bno);
 DEFINE_ALLOC_EVENT(xfs_alloc_vextent_finish);
 
 TRACE_EVENT(xfs_alloc_cur_check,
-	TP_PROTO(struct xfs_mount *mp, xfs_btnum_t btnum, xfs_agblock_t bno,
+	TP_PROTO(struct xfs_btree_cur *cur, xfs_agblock_t bno,
 		 xfs_extlen_t len, xfs_extlen_t diff, bool new),
-	TP_ARGS(mp, btnum, bno, len, diff, new),
+	TP_ARGS(cur, bno, len, diff, new),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
-		__field(xfs_btnum_t, btnum)
+		__string(name, cur->bc_ops->name)
 		__field(xfs_agblock_t, bno)
 		__field(xfs_extlen_t, len)
 		__field(xfs_extlen_t, diff)
 		__field(bool, new)
 	),
 	TP_fast_assign(
-		__entry->dev = mp->m_super->s_dev;
-		__entry->btnum = btnum;
+		__entry->dev = cur->bc_mp->m_super->s_dev;
+		__assign_str(name, cur->bc_ops->name);
 		__entry->bno = bno;
 		__entry->len = len;
 		__entry->diff = diff;
 		__entry->new = new;
 	),
-	TP_printk("dev %d:%d btree %s agbno 0x%x fsbcount 0x%x diff 0x%x new %d",
+	TP_printk("dev %d:%d %sbt agbno 0x%x fsbcount 0x%x diff 0x%x new %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+		  __get_str(name),
 		  __entry->bno, __entry->len, __entry->diff, __entry->new)
 )
 
@@ -2452,21 +2454,12 @@ DEFINE_DISCARD_EVENT(xfs_discard_toosmall);
 DEFINE_DISCARD_EVENT(xfs_discard_exclude);
 DEFINE_DISCARD_EVENT(xfs_discard_busy);
 
-/* btree cursor events */
-TRACE_DEFINE_ENUM(XFS_BTNUM_BNOi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_CNTi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_BMAPi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_INOi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_FINOi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_RMAPi);
-TRACE_DEFINE_ENUM(XFS_BTNUM_REFCi);
-
 DECLARE_EVENT_CLASS(xfs_btree_cur_class,
 	TP_PROTO(struct xfs_btree_cur *cur, int level, struct xfs_buf *bp),
 	TP_ARGS(cur, level, bp),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
-		__field(xfs_btnum_t, btnum)
+		__string(name, cur->bc_ops->name)
 		__field(int, level)
 		__field(int, nlevels)
 		__field(int, ptr)
@@ -2474,15 +2467,15 @@ DECLARE_EVENT_CLASS(xfs_btree_cur_class,
 	),
 	TP_fast_assign(
 		__entry->dev = cur->bc_mp->m_super->s_dev;
-		__entry->btnum = cur->bc_btnum;
+		__assign_str(name, cur->bc_ops->name);
 		__entry->level = level;
 		__entry->nlevels = cur->bc_nlevels;
 		__entry->ptr = cur->bc_levels[level].ptr;
 		__entry->daddr = bp ? xfs_buf_daddr(bp) : -1;
 	),
-	TP_printk("dev %d:%d btree %s level %d/%d ptr %d daddr 0x%llx",
+	TP_printk("dev %d:%d %sbt level %d/%d ptr %d daddr 0x%llx",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+		  __get_str(name),
 		  __entry->level,
 		  __entry->nlevels,
 		  __entry->ptr,
@@ -2496,6 +2489,90 @@ DEFINE_EVENT(xfs_btree_cur_class, name, \
 DEFINE_BTREE_CUR_EVENT(xfs_btree_updkeys);
 DEFINE_BTREE_CUR_EVENT(xfs_btree_overlapped_query_range);
 
+TRACE_EVENT(xfs_btree_alloc_block,
+	TP_PROTO(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr, int stat,
+		 int error),
+	TP_ARGS(cur, ptr, stat, error),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_ino_t, ino)
+		__string(name, cur->bc_ops->name)
+		__field(int, error)
+		__field(xfs_agblock_t, agbno)
+	),
+	TP_fast_assign(
+		__entry->dev = cur->bc_mp->m_super->s_dev;
+		switch (cur->bc_ops->type) {
+		case XFS_BTREE_TYPE_INODE:
+			__entry->agno = 0;
+			__entry->ino = cur->bc_ino.ip->i_ino;
+			break;
+		case XFS_BTREE_TYPE_AG:
+			__entry->agno = cur->bc_ag.pag->pag_agno;
+			__entry->ino = 0;
+			break;
+		case XFS_BTREE_TYPE_MEM:
+			__entry->agno = 0;
+			__entry->ino = 0;
+			break;
+		}
+		__assign_str(name, cur->bc_ops->name);
+		__entry->error = error;
+		if (!error && stat) {
+			if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
+				xfs_fsblock_t	fsb = be64_to_cpu(ptr->l);
+
+				__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp,
+								fsb);
+				__entry->agbno = XFS_FSB_TO_AGBNO(cur->bc_mp,
+								fsb);
+			} else {
+				__entry->agbno = be32_to_cpu(ptr->s);
+			}
+		} else {
+			__entry->agbno = NULLAGBLOCK;
+		}
+	),
+	TP_printk("dev %d:%d %sbt agno 0x%x ino 0x%llx agbno 0x%x error %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __get_str(name),
+		  __entry->agno,
+		  __entry->ino,
+		  __entry->agbno,
+		  __entry->error)
+);
+
+TRACE_EVENT(xfs_btree_free_block,
+	TP_PROTO(struct xfs_btree_cur *cur, struct xfs_buf *bp),
+	TP_ARGS(cur, bp),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_ino_t, ino)
+		__string(name, cur->bc_ops->name)
+		__field(xfs_agblock_t, agbno)
+	),
+	TP_fast_assign(
+		__entry->dev = cur->bc_mp->m_super->s_dev;
+		__entry->agno = xfs_daddr_to_agno(cur->bc_mp,
+							xfs_buf_daddr(bp));
+		if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
+			__entry->ino = cur->bc_ino.ip->i_ino;
+		else
+			__entry->ino = 0;
+		__assign_str(name, cur->bc_ops->name);
+		__entry->agbno = xfs_daddr_to_agbno(cur->bc_mp,
+							xfs_buf_daddr(bp));
+	),
+	TP_printk("dev %d:%d %sbt agno 0x%x ino 0x%llx agbno 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __get_str(name),
+		  __entry->agno,
+		  __entry->ino,
+		  __entry->agbno)
+);
+
 /* deferred ops */
 struct xfs_defer_pending;
 
@@ -2579,7 +2656,25 @@ DEFINE_EVENT(xfs_defer_pending_class, name, \
 	TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp), \
 	TP_ARGS(mp, dfp))
 
-DECLARE_EVENT_CLASS(xfs_phys_extent_deferred_class,
+DEFINE_DEFER_EVENT(xfs_defer_cancel);
+DEFINE_DEFER_EVENT(xfs_defer_trans_roll);
+DEFINE_DEFER_EVENT(xfs_defer_trans_abort);
+DEFINE_DEFER_EVENT(xfs_defer_finish);
+DEFINE_DEFER_EVENT(xfs_defer_finish_done);
+
+DEFINE_DEFER_ERROR_EVENT(xfs_defer_trans_roll_error);
+DEFINE_DEFER_ERROR_EVENT(xfs_defer_finish_error);
+
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_create_intent);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_isolate_paused);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_pause);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_unpause);
+
+DECLARE_EVENT_CLASS(xfs_free_extent_deferred_class,
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
 		 int type, xfs_agblock_t agbno, xfs_extlen_t len),
 	TP_ARGS(mp, agno, type, agbno, len),
@@ -2604,92 +2699,17 @@ DECLARE_EVENT_CLASS(xfs_phys_extent_deferred_class,
 		  __entry->agbno,
 		  __entry->len)
 );
-#define DEFINE_PHYS_EXTENT_DEFERRED_EVENT(name) \
-DEFINE_EVENT(xfs_phys_extent_deferred_class, name, \
+#define DEFINE_FREE_EXTENT_DEFERRED_EVENT(name) \
+DEFINE_EVENT(xfs_free_extent_deferred_class, name, \
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
 		 int type, \
 		 xfs_agblock_t bno, \
 		 xfs_extlen_t len), \
 	TP_ARGS(mp, agno, type, bno, len))
-
-DECLARE_EVENT_CLASS(xfs_map_extent_deferred_class,
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
-		 int op,
-		 xfs_agblock_t agbno,
-		 xfs_ino_t ino,
-		 int whichfork,
-		 xfs_fileoff_t offset,
-		 xfs_filblks_t len,
-		 xfs_exntst_t state),
-	TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state),
-	TP_STRUCT__entry(
-		__field(dev_t, dev)
-		__field(xfs_agnumber_t, agno)
-		__field(xfs_ino_t, ino)
-		__field(xfs_agblock_t, agbno)
-		__field(int, whichfork)
-		__field(xfs_fileoff_t, l_loff)
-		__field(xfs_filblks_t, l_len)
-		__field(xfs_exntst_t, l_state)
-		__field(int, op)
-	),
-	TP_fast_assign(
-		__entry->dev = mp->m_super->s_dev;
-		__entry->agno = agno;
-		__entry->ino = ino;
-		__entry->agbno = agbno;
-		__entry->whichfork = whichfork;
-		__entry->l_loff = offset;
-		__entry->l_len = len;
-		__entry->l_state = state;
-		__entry->op = op;
-	),
-	TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->op,
-		  __entry->agno,
-		  __entry->agbno,
-		  __entry->ino,
-		  __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
-		  __entry->l_loff,
-		  __entry->l_len,
-		  __entry->l_state)
-);
-#define DEFINE_MAP_EXTENT_DEFERRED_EVENT(name) \
-DEFINE_EVENT(xfs_map_extent_deferred_class, name, \
-	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
-		 int op, \
-		 xfs_agblock_t agbno, \
-		 xfs_ino_t ino, \
-		 int whichfork, \
-		 xfs_fileoff_t offset, \
-		 xfs_filblks_t len, \
-		 xfs_exntst_t state), \
-	TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state))
-
-DEFINE_DEFER_EVENT(xfs_defer_cancel);
-DEFINE_DEFER_EVENT(xfs_defer_trans_roll);
-DEFINE_DEFER_EVENT(xfs_defer_trans_abort);
-DEFINE_DEFER_EVENT(xfs_defer_finish);
-DEFINE_DEFER_EVENT(xfs_defer_finish_done);
-
-DEFINE_DEFER_ERROR_EVENT(xfs_defer_trans_roll_error);
-DEFINE_DEFER_ERROR_EVENT(xfs_defer_finish_error);
-
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_create_intent);
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list);
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish);
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort);
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent);
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_isolate_paused);
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_pause);
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_unpause);
-
-#define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
-DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer);
-DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_deferred);
-DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_defer);
-DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_deferred);
+DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_bmap_free_defer);
+DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_bmap_free_deferred);
+DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_agfl_free_defer);
+DEFINE_FREE_EXTENT_DEFERRED_EVENT(xfs_agfl_free_deferred);
 
 DECLARE_EVENT_CLASS(xfs_defer_pending_item_class,
 	TP_PROTO(struct xfs_mount *mp, struct xfs_defer_pending *dfp,
@@ -2854,12 +2874,63 @@ DEFINE_EVENT(xfs_rmapbt_class, name, \
 		 uint64_t owner, uint64_t offset, unsigned int flags), \
 	TP_ARGS(mp, agno, agbno, len, owner, offset, flags))
 
-#define DEFINE_RMAP_DEFERRED_EVENT DEFINE_MAP_EXTENT_DEFERRED_EVENT
+DECLARE_EVENT_CLASS(xfs_rmap_deferred_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 int op,
+		 xfs_agblock_t agbno,
+		 xfs_ino_t ino,
+		 int whichfork,
+		 xfs_fileoff_t offset,
+		 xfs_filblks_t len,
+		 xfs_exntst_t state),
+	TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_ino_t, ino)
+		__field(xfs_agblock_t, agbno)
+		__field(int, whichfork)
+		__field(xfs_fileoff_t, l_loff)
+		__field(xfs_filblks_t, l_len)
+		__field(xfs_exntst_t, l_state)
+		__field(int, op)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->ino = ino;
+		__entry->agbno = agbno;
+		__entry->whichfork = whichfork;
+		__entry->l_loff = offset;
+		__entry->l_len = len;
+		__entry->l_state = state;
+		__entry->op = op;
+	),
+	TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x owner 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->op,
+		  __entry->agno,
+		  __entry->agbno,
+		  __entry->ino,
+		  __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
+		  __entry->l_loff,
+		  __entry->l_len,
+		  __entry->l_state)
+);
+#define DEFINE_RMAP_DEFERRED_EVENT(name) \
+DEFINE_EVENT(xfs_rmap_deferred_class, name, \
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+		 int op, \
+		 xfs_agblock_t agbno, \
+		 xfs_ino_t ino, \
+		 int whichfork, \
+		 xfs_fileoff_t offset, \
+		 xfs_filblks_t len, \
+		 xfs_exntst_t state), \
+	TP_ARGS(mp, agno, op, agbno, ino, whichfork, offset, len, state))
 DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_defer);
 DEFINE_RMAP_DEFERRED_EVENT(xfs_rmap_deferred);
 
-DEFINE_BUSY_EVENT(xfs_rmapbt_alloc_block);
-DEFINE_BUSY_EVENT(xfs_rmapbt_free_block);
 DEFINE_RMAPBT_EVENT(xfs_rmap_update);
 DEFINE_RMAPBT_EVENT(xfs_rmap_insert);
 DEFINE_RMAPBT_EVENT(xfs_rmap_delete);
@@ -2876,7 +2947,66 @@ DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result);
 DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result);
 
 /* deferred bmbt updates */
-#define DEFINE_BMAP_DEFERRED_EVENT	DEFINE_RMAP_DEFERRED_EVENT
+TRACE_DEFINE_ENUM(XFS_BMAP_MAP);
+TRACE_DEFINE_ENUM(XFS_BMAP_UNMAP);
+
+DECLARE_EVENT_CLASS(xfs_bmap_deferred_class,
+	TP_PROTO(struct xfs_bmap_intent *bi),
+	TP_ARGS(bi),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(dev_t, opdev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_ino_t, ino)
+		__field(xfs_agblock_t, agbno)
+		__field(xfs_fsblock_t, rtbno)
+		__field(int, whichfork)
+		__field(xfs_fileoff_t, l_loff)
+		__field(xfs_filblks_t, l_len)
+		__field(xfs_exntst_t, l_state)
+		__field(int, op)
+	),
+	TP_fast_assign(
+		struct xfs_inode	*ip = bi->bi_owner;
+
+		__entry->dev = ip->i_mount->m_super->s_dev;
+		if (xfs_ifork_is_realtime(ip, bi->bi_whichfork)) {
+			__entry->agno = 0;
+			__entry->agbno = 0;
+			__entry->rtbno = bi->bi_bmap.br_startblock;
+			__entry->opdev = ip->i_mount->m_rtdev_targp->bt_dev;
+		} else {
+			__entry->agno = XFS_FSB_TO_AGNO(ip->i_mount,
+						bi->bi_bmap.br_startblock);
+			__entry->agbno = XFS_FSB_TO_AGBNO(ip->i_mount,
+						bi->bi_bmap.br_startblock);
+			__entry->rtbno = 0;
+			__entry->opdev = __entry->dev;
+		}
+		__entry->ino = ip->i_ino;
+		__entry->whichfork = bi->bi_whichfork;
+		__entry->l_loff = bi->bi_bmap.br_startoff;
+		__entry->l_len = bi->bi_bmap.br_blockcount;
+		__entry->l_state = bi->bi_bmap.br_state;
+		__entry->op = bi->bi_type;
+	),
+	TP_printk("dev %d:%d op %s opdev %d:%d ino 0x%llx agno 0x%x agbno 0x%x rtbno 0x%llx %s fileoff 0x%llx fsbcount 0x%llx state %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __print_symbolic(__entry->op, XFS_BMAP_INTENT_STRINGS),
+		  MAJOR(__entry->opdev), MINOR(__entry->opdev),
+		  __entry->ino,
+		  __entry->agno,
+		  __entry->agbno,
+		  __entry->rtbno,
+		  __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
+		  __entry->l_loff,
+		  __entry->l_len,
+		  __entry->l_state)
+);
+#define DEFINE_BMAP_DEFERRED_EVENT(name) \
+DEFINE_EVENT(xfs_bmap_deferred_class, name, \
+	TP_PROTO(struct xfs_bmap_intent *bi), \
+	TP_ARGS(bi))
 DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_defer);
 DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_deferred);
 
@@ -3217,8 +3347,6 @@ DEFINE_EVENT(xfs_refcount_triple_extent_class, name, \
 	TP_ARGS(mp, agno, i1, i2, i3))
 
 /* refcount btree tracepoints */
-DEFINE_BUSY_EVENT(xfs_refcountbt_alloc_block);
-DEFINE_BUSY_EVENT(xfs_refcountbt_free_block);
 DEFINE_AG_BTREE_LOOKUP_EVENT(xfs_refcount_lookup);
 DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_get);
 DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_update);
@@ -3255,7 +3383,39 @@ DEFINE_AG_ERROR_EVENT(xfs_refcount_find_right_extent_error);
 DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared);
 DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared_result);
 DEFINE_AG_ERROR_EVENT(xfs_refcount_find_shared_error);
-#define DEFINE_REFCOUNT_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
+
+DECLARE_EVENT_CLASS(xfs_refcount_deferred_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 int type, xfs_agblock_t agbno, xfs_extlen_t len),
+	TP_ARGS(mp, agno, type, agbno, len),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(int, type)
+		__field(xfs_agblock_t, agbno)
+		__field(xfs_extlen_t, len)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->type = type;
+		__entry->agbno = agbno;
+		__entry->len = len;
+	),
+	TP_printk("dev %d:%d op %d agno 0x%x agbno 0x%x fsbcount 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->type,
+		  __entry->agno,
+		  __entry->agbno,
+		  __entry->len)
+);
+#define DEFINE_REFCOUNT_DEFERRED_EVENT(name) \
+DEFINE_EVENT(xfs_refcount_deferred_class, name, \
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+		 int type, \
+		 xfs_agblock_t bno, \
+		 xfs_extlen_t len), \
+	TP_ARGS(mp, agno, type, bno, len))
 DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_defer);
 DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_deferred);
 
@@ -3926,9 +4086,11 @@ DEFINE_EVENT(xfs_fs_corrupt_class, name,	\
 	TP_PROTO(struct xfs_mount *mp, unsigned int flags), \
 	TP_ARGS(mp, flags))
 DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_sick);
+DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_corrupt);
 DEFINE_FS_CORRUPT_EVENT(xfs_fs_mark_healthy);
 DEFINE_FS_CORRUPT_EVENT(xfs_fs_unfixed_corruption);
 DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_sick);
+DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_corrupt);
 DEFINE_FS_CORRUPT_EVENT(xfs_rt_mark_healthy);
 DEFINE_FS_CORRUPT_EVENT(xfs_rt_unfixed_corruption);
 
@@ -3955,6 +4117,7 @@ DEFINE_EVENT(xfs_ag_corrupt_class, name,	\
 		 unsigned int flags), \
 	TP_ARGS(mp, agno, flags))
 DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_sick);
+DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_corrupt);
 DEFINE_AG_CORRUPT_EVENT(xfs_ag_mark_healthy);
 DEFINE_AG_CORRUPT_EVENT(xfs_ag_unfixed_corruption);
 
@@ -3980,7 +4143,9 @@ DEFINE_EVENT(xfs_inode_corrupt_class, name,	\
 	TP_PROTO(struct xfs_inode *ip, unsigned int flags), \
 	TP_ARGS(ip, flags))
 DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_sick);
+DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_corrupt);
 DEFINE_INODE_CORRUPT_EVENT(xfs_inode_mark_healthy);
+DEFINE_INODE_CORRUPT_EVENT(xfs_inode_unfixed_corruption);
 
 TRACE_EVENT(xfs_iwalk_ag,
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
@@ -4040,31 +4205,6 @@ TRACE_EVENT(xfs_pwork_init,
 		  __entry->nr_threads, __entry->pid)
 )
 
-DECLARE_EVENT_CLASS(xfs_kmem_class,
-	TP_PROTO(ssize_t size, int flags, unsigned long caller_ip),
-	TP_ARGS(size, flags, caller_ip),
-	TP_STRUCT__entry(
-		__field(ssize_t, size)
-		__field(int, flags)
-		__field(unsigned long, caller_ip)
-	),
-	TP_fast_assign(
-		__entry->size = size;
-		__entry->flags = flags;
-		__entry->caller_ip = caller_ip;
-	),
-	TP_printk("size %zd flags 0x%x caller %pS",
-		  __entry->size,
-		  __entry->flags,
-		  (char *)__entry->caller_ip)
-)
-
-#define DEFINE_KMEM_EVENT(name) \
-DEFINE_EVENT(xfs_kmem_class, name, \
-	TP_PROTO(ssize_t size, int flags, unsigned long caller_ip), \
-	TP_ARGS(size, flags, caller_ip))
-DEFINE_KMEM_EVENT(kmem_alloc);
-
 TRACE_EVENT(xfs_check_new_dalign,
 	TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino),
 	TP_ARGS(mp, new_dalign, calc_rootino),
@@ -4091,7 +4231,7 @@ TRACE_EVENT(xfs_btree_commit_afakeroot,
 	TP_ARGS(cur),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
-		__field(xfs_btnum_t, btnum)
+		__string(name, cur->bc_ops->name)
 		__field(xfs_agnumber_t, agno)
 		__field(xfs_agblock_t, agbno)
 		__field(unsigned int, levels)
@@ -4099,15 +4239,15 @@ TRACE_EVENT(xfs_btree_commit_afakeroot,
 	),
 	TP_fast_assign(
 		__entry->dev = cur->bc_mp->m_super->s_dev;
-		__entry->btnum = cur->bc_btnum;
+		__assign_str(name, cur->bc_ops->name);
 		__entry->agno = cur->bc_ag.pag->pag_agno;
 		__entry->agbno = cur->bc_ag.afake->af_root;
 		__entry->levels = cur->bc_ag.afake->af_levels;
 		__entry->blocks = cur->bc_ag.afake->af_blocks;
 	),
-	TP_printk("dev %d:%d btree %s agno 0x%x levels %u blocks %u root %u",
+	TP_printk("dev %d:%d %sbt agno 0x%x levels %u blocks %u root %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+		  __get_str(name),
 		  __entry->agno,
 		  __entry->levels,
 		  __entry->blocks,
@@ -4119,7 +4259,7 @@ TRACE_EVENT(xfs_btree_commit_ifakeroot,
 	TP_ARGS(cur),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
-		__field(xfs_btnum_t, btnum)
+		__string(name, cur->bc_ops->name)
 		__field(xfs_agnumber_t, agno)
 		__field(xfs_agino_t, agino)
 		__field(unsigned int, levels)
@@ -4128,7 +4268,7 @@ TRACE_EVENT(xfs_btree_commit_ifakeroot,
 	),
 	TP_fast_assign(
 		__entry->dev = cur->bc_mp->m_super->s_dev;
-		__entry->btnum = cur->bc_btnum;
+		__assign_str(name, cur->bc_ops->name);
 		__entry->agno = XFS_INO_TO_AGNO(cur->bc_mp,
 					cur->bc_ino.ip->i_ino);
 		__entry->agino = XFS_INO_TO_AGINO(cur->bc_mp,
@@ -4137,9 +4277,9 @@ TRACE_EVENT(xfs_btree_commit_ifakeroot,
 		__entry->blocks = cur->bc_ino.ifake->if_blocks;
 		__entry->whichfork = cur->bc_ino.whichfork;
 	),
-	TP_printk("dev %d:%d btree %s agno 0x%x agino 0x%x whichfork %s levels %u blocks %u",
+	TP_printk("dev %d:%d %sbt agno 0x%x agino 0x%x whichfork %s levels %u blocks %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+		  __get_str(name),
 		  __entry->agno,
 		  __entry->agino,
 		  __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
@@ -4156,7 +4296,7 @@ TRACE_EVENT(xfs_btree_bload_level_geometry,
 		blocks_with_extra),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
-		__field(xfs_btnum_t, btnum)
+		__string(name, cur->bc_ops->name)
 		__field(unsigned int, level)
 		__field(unsigned int, nlevels)
 		__field(uint64_t, nr_this_level)
@@ -4167,7 +4307,7 @@ TRACE_EVENT(xfs_btree_bload_level_geometry,
 	),
 	TP_fast_assign(
 		__entry->dev = cur->bc_mp->m_super->s_dev;
-		__entry->btnum = cur->bc_btnum;
+		__assign_str(name, cur->bc_ops->name);
 		__entry->level = level;
 		__entry->nlevels = cur->bc_nlevels;
 		__entry->nr_this_level = nr_this_level;
@@ -4176,9 +4316,9 @@ TRACE_EVENT(xfs_btree_bload_level_geometry,
 		__entry->blocks = blocks;
 		__entry->blocks_with_extra = blocks_with_extra;
 	),
-	TP_printk("dev %d:%d btree %s level %u/%u nr_this_level %llu nr_per_block %u desired_npb %u blocks %llu blocks_with_extra %llu",
+	TP_printk("dev %d:%d %sbt level %u/%u nr_this_level %llu nr_per_block %u desired_npb %u blocks %llu blocks_with_extra %llu",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+		  __get_str(name),
 		  __entry->level,
 		  __entry->nlevels,
 		  __entry->nr_this_level,
@@ -4195,7 +4335,7 @@ TRACE_EVENT(xfs_btree_bload_block,
 	TP_ARGS(cur, level, block_idx, nr_blocks, ptr, nr_records),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
-		__field(xfs_btnum_t, btnum)
+		__string(name, cur->bc_ops->name)
 		__field(unsigned int, level)
 		__field(unsigned long long, block_idx)
 		__field(unsigned long long, nr_blocks)
@@ -4205,11 +4345,11 @@ TRACE_EVENT(xfs_btree_bload_block,
 	),
 	TP_fast_assign(
 		__entry->dev = cur->bc_mp->m_super->s_dev;
-		__entry->btnum = cur->bc_btnum;
+		__assign_str(name, cur->bc_ops->name);
 		__entry->level = level;
 		__entry->block_idx = block_idx;
 		__entry->nr_blocks = nr_blocks;
-		if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+		if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
 			xfs_fsblock_t	fsb = be64_to_cpu(ptr->l);
 
 			__entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsb);
@@ -4220,9 +4360,9 @@ TRACE_EVENT(xfs_btree_bload_block,
 		}
 		__entry->nr_records = nr_records;
 	),
-	TP_printk("dev %d:%d btree %s level %u block %llu/%llu agno 0x%x agbno 0x%x recs %u",
+	TP_printk("dev %d:%d %sbt level %u block %llu/%llu agno 0x%x agbno 0x%x recs %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __print_symbolic(__entry->btnum, XFS_BTNUM_STRINGS),
+		  __get_str(name),
 		  __entry->level,
 		  __entry->block_idx,
 		  __entry->nr_blocks,
@@ -4472,6 +4612,164 @@ DEFINE_PERAG_INTENTS_EVENT(xfs_perag_wait_intents);
 
 #endif /* CONFIG_XFS_DRAIN_INTENTS */
 
+#ifdef CONFIG_XFS_MEMORY_BUFS
+TRACE_EVENT(xmbuf_create,
+	TP_PROTO(struct xfs_buftarg *btp),
+	TP_ARGS(btp),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned long, ino)
+		__array(char, pathname, 256)
+	),
+	TP_fast_assign(
+		char		pathname[257];
+		char		*path;
+		struct file	*file = btp->bt_file;
+
+		__entry->dev = btp->bt_mount->m_super->s_dev;
+		__entry->ino = file_inode(file)->i_ino;
+		memset(pathname, 0, sizeof(pathname));
+		path = file_path(file, pathname, sizeof(pathname) - 1);
+		if (IS_ERR(path))
+			path = "(unknown)";
+		strncpy(__entry->pathname, path, sizeof(__entry->pathname));
+	),
+	TP_printk("dev %d:%d xmino 0x%lx path '%s'",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->pathname)
+);
+
+TRACE_EVENT(xmbuf_free,
+	TP_PROTO(struct xfs_buftarg *btp),
+	TP_ARGS(btp),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned long, ino)
+		__field(unsigned long long, bytes)
+		__field(loff_t, size)
+	),
+	TP_fast_assign(
+		struct file	*file = btp->bt_file;
+		struct inode	*inode = file_inode(file);
+
+		__entry->dev = btp->bt_mount->m_super->s_dev;
+		__entry->size = i_size_read(inode);
+		__entry->bytes = (inode->i_blocks << SECTOR_SHIFT) + inode->i_bytes;
+		__entry->ino = inode->i_ino;
+	),
+	TP_printk("dev %d:%d xmino 0x%lx mem_bytes 0x%llx isize 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->bytes,
+		  __entry->size)
+);
+#endif /* CONFIG_XFS_MEMORY_BUFS */
+
+#ifdef CONFIG_XFS_BTREE_IN_MEM
+TRACE_EVENT(xfbtree_init,
+	TP_PROTO(struct xfs_mount *mp, struct xfbtree *xfbt,
+		 const struct xfs_btree_ops *ops),
+	TP_ARGS(mp, xfbt, ops),
+	TP_STRUCT__entry(
+		__field(const void *, btree_ops)
+		__field(unsigned long, xfino)
+		__field(unsigned int, leaf_mxr)
+		__field(unsigned int, leaf_mnr)
+		__field(unsigned int, node_mxr)
+		__field(unsigned int, node_mnr)
+		__field(unsigned long long, owner)
+	),
+	TP_fast_assign(
+		__entry->btree_ops = ops;
+		__entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
+		__entry->leaf_mxr = xfbt->maxrecs[0];
+		__entry->node_mxr = xfbt->maxrecs[1];
+		__entry->leaf_mnr = xfbt->minrecs[0];
+		__entry->node_mnr = xfbt->minrecs[1];
+		__entry->owner = xfbt->owner;
+	),
+	TP_printk("xfino 0x%lx btree_ops %pS owner 0x%llx leaf_mxr %u leaf_mnr %u node_mxr %u node_mnr %u",
+		  __entry->xfino,
+		  __entry->btree_ops,
+		  __entry->owner,
+		  __entry->leaf_mxr,
+		  __entry->leaf_mnr,
+		  __entry->node_mxr,
+		  __entry->node_mnr)
+);
+
+DECLARE_EVENT_CLASS(xfbtree_buf_class,
+	TP_PROTO(struct xfbtree *xfbt, struct xfs_buf *bp),
+	TP_ARGS(xfbt, bp),
+	TP_STRUCT__entry(
+		__field(unsigned long, xfino)
+		__field(xfs_daddr_t, bno)
+		__field(int, nblks)
+		__field(int, hold)
+		__field(int, pincount)
+		__field(unsigned int, lockval)
+		__field(unsigned int, flags)
+	),
+	TP_fast_assign(
+		__entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
+		__entry->bno = xfs_buf_daddr(bp);
+		__entry->nblks = bp->b_length;
+		__entry->hold = atomic_read(&bp->b_hold);
+		__entry->pincount = atomic_read(&bp->b_pin_count);
+		__entry->lockval = bp->b_sema.count;
+		__entry->flags = bp->b_flags;
+	),
+	TP_printk("xfino 0x%lx daddr 0x%llx bbcount 0x%x hold %d pincount %d lock %d flags %s",
+		  __entry->xfino,
+		  (unsigned long long)__entry->bno,
+		  __entry->nblks,
+		  __entry->hold,
+		  __entry->pincount,
+		  __entry->lockval,
+		  __print_flags(__entry->flags, "|", XFS_BUF_FLAGS))
+)
+
+#define DEFINE_XFBTREE_BUF_EVENT(name) \
+DEFINE_EVENT(xfbtree_buf_class, name, \
+	TP_PROTO(struct xfbtree *xfbt, struct xfs_buf *bp), \
+	TP_ARGS(xfbt, bp))
+DEFINE_XFBTREE_BUF_EVENT(xfbtree_create_root_buf);
+DEFINE_XFBTREE_BUF_EVENT(xfbtree_trans_commit_buf);
+DEFINE_XFBTREE_BUF_EVENT(xfbtree_trans_cancel_buf);
+
+DECLARE_EVENT_CLASS(xfbtree_freesp_class,
+	TP_PROTO(struct xfbtree *xfbt, struct xfs_btree_cur *cur,
+		 xfs_fileoff_t fileoff),
+	TP_ARGS(xfbt, cur, fileoff),
+	TP_STRUCT__entry(
+		__field(unsigned long, xfino)
+		__string(btname, cur->bc_ops->name)
+		__field(int, nlevels)
+		__field(xfs_fileoff_t, fileoff)
+	),
+	TP_fast_assign(
+		__entry->xfino = file_inode(xfbt->target->bt_file)->i_ino;
+		__assign_str(btname, cur->bc_ops->name);
+		__entry->nlevels = cur->bc_nlevels;
+		__entry->fileoff = fileoff;
+	),
+	TP_printk("xfino 0x%lx %sbt nlevels %d fileoff 0x%llx",
+		  __entry->xfino,
+		  __get_str(btname),
+		  __entry->nlevels,
+		  (unsigned long long)__entry->fileoff)
+)
+
+#define DEFINE_XFBTREE_FREESP_EVENT(name) \
+DEFINE_EVENT(xfbtree_freesp_class, name, \
+	TP_PROTO(struct xfbtree *xfbt, struct xfs_btree_cur *cur, \
+		 xfs_fileoff_t fileoff), \
+	TP_ARGS(xfbt, cur, fileoff))
+DEFINE_XFBTREE_FREESP_EVENT(xfbtree_alloc_block);
+DEFINE_XFBTREE_FREESP_EVENT(xfbtree_free_block);
+#endif /* CONFIG_XFS_BTREE_IN_MEM */
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 12d45e93f07d..7350640059cc 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1273,7 +1273,7 @@ xfs_trans_reserve_more_inode(
 	unsigned int		rtx = xfs_extlen_to_rtxlen(mp, rblocks);
 	int			error;
 
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
 
 	error = xfs_trans_reserve(tp, &resv, dblocks, rtx);
 	if (error)
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 08ce757c7454..1636663707dc 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -215,6 +215,7 @@ struct xfs_buf	*xfs_trans_getsb(struct xfs_trans *);
 
 void		xfs_trans_brelse(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_bjoin(xfs_trans_t *, struct xfs_buf *);
+void		xfs_trans_bdetach(struct xfs_trans *tp, struct xfs_buf *bp);
 void		xfs_trans_bhold(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *);
 void		xfs_trans_binval(xfs_trans_t *, struct xfs_buf *);
@@ -267,19 +268,14 @@ static inline void
 xfs_trans_set_context(
 	struct xfs_trans	*tp)
 {
-	ASSERT(current->journal_info == NULL);
 	tp->t_pflags = memalloc_nofs_save();
-	current->journal_info = tp;
 }
 
 static inline void
 xfs_trans_clear_context(
 	struct xfs_trans	*tp)
 {
-	if (current->journal_info == tp) {
-		memalloc_nofs_restore(tp->t_pflags);
-		current->journal_info = NULL;
-	}
+	memalloc_nofs_restore(tp->t_pflags);
 }
 
 static inline void
@@ -287,10 +283,8 @@ xfs_trans_switch_context(
 	struct xfs_trans	*old_tp,
 	struct xfs_trans	*new_tp)
 {
-	ASSERT(current->journal_info == old_tp);
 	new_tp->t_pflags = old_tp->t_pflags;
 	old_tp->t_pflags = 0;
-	current->journal_info = new_tp;
 }
 
 #endif	/* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 1098452e7f95..e4c343096f95 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -901,7 +901,8 @@ xfs_trans_ail_init(
 {
 	struct xfs_ail	*ailp;
 
-	ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL);
+	ailp = kzalloc(sizeof(struct xfs_ail),
+			GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 	if (!ailp)
 		return -ENOMEM;
 
@@ -921,7 +922,7 @@ xfs_trans_ail_init(
 	return 0;
 
 out_free_ailp:
-	kmem_free(ailp);
+	kfree(ailp);
 	return -ENOMEM;
 }
 
@@ -932,5 +933,5 @@ xfs_trans_ail_destroy(
 	struct xfs_ail	*ailp = mp->m_ail;
 
 	kthread_stop(ailp->ail_task);
-	kmem_free(ailp);
+	kfree(ailp);
 }
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 6549e50d852c..e28ab74af4f0 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -393,6 +393,48 @@ xfs_trans_brelse(
 }
 
 /*
+ * Forcibly detach a buffer previously joined to the transaction.  The caller
+ * will retain its locked reference to the buffer after this function returns.
+ * The buffer must be completely clean and must not be held to the transaction.
+ */
+void
+xfs_trans_bdetach(
+	struct xfs_trans	*tp,
+	struct xfs_buf		*bp)
+{
+	struct xfs_buf_log_item	*bip = bp->b_log_item;
+
+	ASSERT(tp != NULL);
+	ASSERT(bp->b_transp == tp);
+	ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
+	ASSERT(atomic_read(&bip->bli_refcount) > 0);
+
+	trace_xfs_trans_bdetach(bip);
+
+	/*
+	 * Erase all recursion count, since we're removing this buffer from the
+	 * transaction.
+	 */
+	bip->bli_recur = 0;
+
+	/*
+	 * The buffer must be completely clean.  Specifically, it had better
+	 * not be dirty, stale, logged, ordered, or held to the transaction.
+	 */
+	ASSERT(!test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags));
+	ASSERT(!(bip->bli_flags & XFS_BLI_DIRTY));
+	ASSERT(!(bip->bli_flags & XFS_BLI_HOLD));
+	ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
+	ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED));
+	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
+
+	/* Unlink the log item from the transaction and drop the log item. */
+	xfs_trans_del_item(&bip->bli_item);
+	xfs_buf_item_put(bip);
+	bp->b_transp = NULL;
+}
+
+/*
  * Mark the buffer as not needing to be unlocked when the buf item's
  * iop_committing() routine is called.  The buffer must already be locked
  * and associated with the given transaction.
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index aa00cf67ad72..577b535a595c 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -17,6 +17,7 @@
 #include "xfs_qm.h"
 #include "xfs_trace.h"
 #include "xfs_error.h"
+#include "xfs_health.h"
 
 STATIC void	xfs_trans_alloc_dqinfo(xfs_trans_t *);
 
@@ -120,6 +121,116 @@ xfs_trans_dup_dqinfo(
 	}
 }
 
+#ifdef CONFIG_XFS_LIVE_HOOKS
+/*
+ * Use a static key here to reduce the overhead of quota live updates.  If the
+ * compiler supports jump labels, the static branch will be replaced by a nop
+ * sled when there are no hook users.  Online fsck is currently the only
+ * caller, so this is a reasonable tradeoff.
+ *
+ * Note: Patching the kernel code requires taking the cpu hotplug lock.  Other
+ * parts of the kernel allocate memory with that lock held, which means that
+ * XFS callers cannot hold any locks that might be used by memory reclaim or
+ * writeback when calling the static_branch_{inc,dec} functions.
+ */
+DEFINE_STATIC_XFS_HOOK_SWITCH(xfs_dqtrx_hooks_switch);
+
+void
+xfs_dqtrx_hook_disable(void)
+{
+	xfs_hooks_switch_off(&xfs_dqtrx_hooks_switch);
+}
+
+void
+xfs_dqtrx_hook_enable(void)
+{
+	xfs_hooks_switch_on(&xfs_dqtrx_hooks_switch);
+}
+
+/* Schedule a transactional dquot update on behalf of an inode. */
+void
+xfs_trans_mod_ino_dquot(
+	struct xfs_trans		*tp,
+	struct xfs_inode		*ip,
+	struct xfs_dquot		*dqp,
+	unsigned int			field,
+	int64_t				delta)
+{
+	xfs_trans_mod_dquot(tp, dqp, field, delta);
+
+	if (xfs_hooks_switched_on(&xfs_dqtrx_hooks_switch)) {
+		struct xfs_mod_ino_dqtrx_params	p = {
+			.tx_id		= (uintptr_t)tp,
+			.ino		= ip->i_ino,
+			.q_type		= xfs_dquot_type(dqp),
+			.q_id		= dqp->q_id,
+			.delta		= delta
+		};
+		struct xfs_quotainfo	*qi = tp->t_mountp->m_quotainfo;
+
+		xfs_hooks_call(&qi->qi_mod_ino_dqtrx_hooks, field, &p);
+	}
+}
+
+/* Call the specified functions during a dquot counter update. */
+int
+xfs_dqtrx_hook_add(
+	struct xfs_quotainfo	*qi,
+	struct xfs_dqtrx_hook	*hook)
+{
+	int			error;
+
+	/*
+	 * Transactional dquot updates first call the mod hook when changes
+	 * are attached to the transaction and then call the apply hook when
+	 * those changes are committed (or canceled).
+	 *
+	 * The apply hook must be installed before the mod hook so that we
+	 * never fail to catch the end of a quota update sequence.
+	 */
+	error = xfs_hooks_add(&qi->qi_apply_dqtrx_hooks, &hook->apply_hook);
+	if (error)
+		goto out;
+
+	error = xfs_hooks_add(&qi->qi_mod_ino_dqtrx_hooks, &hook->mod_hook);
+	if (error)
+		goto out_apply;
+
+	return 0;
+
+out_apply:
+	xfs_hooks_del(&qi->qi_apply_dqtrx_hooks, &hook->apply_hook);
+out:
+	return error;
+}
+
+/* Stop calling the specified function during a dquot counter update. */
+void
+xfs_dqtrx_hook_del(
+	struct xfs_quotainfo	*qi,
+	struct xfs_dqtrx_hook	*hook)
+{
+	/*
+	 * The mod hook must be removed before apply hook to avoid giving the
+	 * hook consumer with an incomplete update.  No hooks should be running
+	 * after these functions return.
+	 */
+	xfs_hooks_del(&qi->qi_mod_ino_dqtrx_hooks, &hook->mod_hook);
+	xfs_hooks_del(&qi->qi_apply_dqtrx_hooks, &hook->apply_hook);
+}
+
+/* Configure dquot update hook functions. */
+void
+xfs_dqtrx_hook_setup(
+	struct xfs_dqtrx_hook	*hook,
+	notifier_fn_t		mod_fn,
+	notifier_fn_t		apply_fn)
+{
+	xfs_hook_setup(&hook->mod_hook, mod_fn);
+	xfs_hook_setup(&hook->apply_hook, apply_fn);
+}
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
 /*
  * Wrap around mod_dquot to account for both user and group quotas.
  */
@@ -137,11 +248,11 @@ xfs_trans_mod_dquot_byino(
 		return;
 
 	if (XFS_IS_UQUOTA_ON(mp) && ip->i_udquot)
-		(void) xfs_trans_mod_dquot(tp, ip->i_udquot, field, delta);
+		xfs_trans_mod_ino_dquot(tp, ip, ip->i_udquot, field, delta);
 	if (XFS_IS_GQUOTA_ON(mp) && ip->i_gdquot)
-		(void) xfs_trans_mod_dquot(tp, ip->i_gdquot, field, delta);
+		xfs_trans_mod_ino_dquot(tp, ip, ip->i_gdquot, field, delta);
 	if (XFS_IS_PQUOTA_ON(mp) && ip->i_pdquot)
-		(void) xfs_trans_mod_dquot(tp, ip->i_pdquot, field, delta);
+		xfs_trans_mod_ino_dquot(tp, ip, ip->i_pdquot, field, delta);
 }
 
 STATIC struct xfs_dqtrx *
@@ -321,6 +432,29 @@ xfs_apply_quota_reservation_deltas(
 	}
 }
 
+#ifdef CONFIG_XFS_LIVE_HOOKS
+/* Call downstream hooks now that it's time to apply dquot deltas. */
+static inline void
+xfs_trans_apply_dquot_deltas_hook(
+	struct xfs_trans		*tp,
+	struct xfs_dquot		*dqp)
+{
+	if (xfs_hooks_switched_on(&xfs_dqtrx_hooks_switch)) {
+		struct xfs_apply_dqtrx_params	p = {
+			.tx_id		= (uintptr_t)tp,
+			.q_type		= xfs_dquot_type(dqp),
+			.q_id		= dqp->q_id,
+		};
+		struct xfs_quotainfo	*qi = tp->t_mountp->m_quotainfo;
+
+		xfs_hooks_call(&qi->qi_apply_dqtrx_hooks,
+				XFS_APPLY_DQTRX_COMMIT, &p);
+	}
+}
+#else
+# define xfs_trans_apply_dquot_deltas_hook(tp, dqp)	((void)0)
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
 /*
  * Called by xfs_trans_commit() and similar in spirit to
  * xfs_trans_apply_sb_deltas().
@@ -366,6 +500,8 @@ xfs_trans_apply_dquot_deltas(
 
 			ASSERT(XFS_DQ_IS_LOCKED(dqp));
 
+			xfs_trans_apply_dquot_deltas_hook(tp, dqp);
+
 			/*
 			 * adjust the actual number of blocks used
 			 */
@@ -465,6 +601,29 @@ xfs_trans_apply_dquot_deltas(
 	}
 }
 
+#ifdef CONFIG_XFS_LIVE_HOOKS
+/* Call downstream hooks now that it's time to cancel dquot deltas. */
+static inline void
+xfs_trans_unreserve_and_mod_dquots_hook(
+	struct xfs_trans		*tp,
+	struct xfs_dquot		*dqp)
+{
+	if (xfs_hooks_switched_on(&xfs_dqtrx_hooks_switch)) {
+		struct xfs_apply_dqtrx_params	p = {
+			.tx_id		= (uintptr_t)tp,
+			.q_type		= xfs_dquot_type(dqp),
+			.q_id		= dqp->q_id,
+		};
+		struct xfs_quotainfo	*qi = tp->t_mountp->m_quotainfo;
+
+		xfs_hooks_call(&qi->qi_apply_dqtrx_hooks,
+				XFS_APPLY_DQTRX_UNRESERVE, &p);
+	}
+}
+#else
+# define xfs_trans_unreserve_and_mod_dquots_hook(tp, dqp)	((void)0)
+#endif /* CONFIG_XFS_LIVE_HOOKS */
+
 /*
  * Release the reservations, and adjust the dquots accordingly.
  * This is called only when the transaction is being aborted. If by
@@ -495,6 +654,9 @@ xfs_trans_unreserve_and_mod_dquots(
 			 */
 			if ((dqp = qtrx->qt_dquot) == NULL)
 				break;
+
+			xfs_trans_unreserve_and_mod_dquots_hook(tp, dqp);
+
 			/*
 			 * Unreserve the original reservation. We don't care
 			 * about the number of blocks used field, or deltas.
@@ -706,6 +868,7 @@ error_return:
 error_corrupt:
 	xfs_dqunlock(dqp);
 	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+	xfs_fs_mark_sick(mp, XFS_SICK_FS_QUOTACHECK);
 	return -EFSCORRUPTED;
 }
 
@@ -796,7 +959,7 @@ xfs_trans_reserve_quota_nblks(
 		return 0;
 
 	ASSERT(!xfs_is_quota_inode(&mp->m_sb, ip->i_ino));
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	xfs_assert_ilocked(ip, XFS_ILOCK_EXCL);
 
 	if (force)
 		qflags |= XFS_QMOPT_FORCE_RES;
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index dba5dcb62bef..3b103715acc9 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -125,7 +125,8 @@ static void zonefs_readahead(struct readahead_control *rac)
  * which implies that the page range can only be within the fixed inode size.
  */
 static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
-				   struct inode *inode, loff_t offset)
+				   struct inode *inode, loff_t offset,
+				   unsigned int len)
 {
 	struct zonefs_zone *z = zonefs_inode_zone(inode);
 
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index b6e8e7c96251..964fa7f24003 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -15,12 +15,13 @@
 #include <linux/writeback.h>
 #include <linux/quotaops.h>
 #include <linux/seq_file.h>
-#include <linux/parser.h>
 #include <linux/uio.h>
 #include <linux/mman.h>
 #include <linux/sched/mm.h>
 #include <linux/crc32.h>
 #include <linux/task_io_accounting_ops.h>
+#include <linux/fs_parser.h>
+#include <linux/fs_context.h>
 
 #include "zonefs.h"
 
@@ -113,7 +114,7 @@ static int zonefs_zone_mgmt(struct super_block *sb,
 
 	trace_zonefs_zone_mgmt(sb, z, op);
 	ret = blkdev_zone_mgmt(sb->s_bdev, op, z->z_sector,
-			       z->z_size >> SECTOR_SHIFT, GFP_NOFS);
+			       z->z_size >> SECTOR_SHIFT);
 	if (ret) {
 		zonefs_err(sb,
 			   "Zone management operation %s at %llu failed %d\n",
@@ -470,58 +471,47 @@ static int zonefs_statfs(struct dentry *dentry, struct kstatfs *buf)
 }
 
 enum {
-	Opt_errors_ro, Opt_errors_zro, Opt_errors_zol, Opt_errors_repair,
-	Opt_explicit_open, Opt_err,
+	Opt_errors, Opt_explicit_open,
 };
 
-static const match_table_t tokens = {
-	{ Opt_errors_ro,	"errors=remount-ro"},
-	{ Opt_errors_zro,	"errors=zone-ro"},
-	{ Opt_errors_zol,	"errors=zone-offline"},
-	{ Opt_errors_repair,	"errors=repair"},
-	{ Opt_explicit_open,	"explicit-open" },
-	{ Opt_err,		NULL}
+struct zonefs_context {
+	unsigned long s_mount_opts;
 };
 
-static int zonefs_parse_options(struct super_block *sb, char *options)
-{
-	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
-	substring_t args[MAX_OPT_ARGS];
-	char *p;
-
-	if (!options)
-		return 0;
-
-	while ((p = strsep(&options, ",")) != NULL) {
-		int token;
+static const struct constant_table zonefs_param_errors[] = {
+	{"remount-ro",		ZONEFS_MNTOPT_ERRORS_RO},
+	{"zone-ro",		ZONEFS_MNTOPT_ERRORS_ZRO},
+	{"zone-offline",	ZONEFS_MNTOPT_ERRORS_ZOL},
+	{"repair", 		ZONEFS_MNTOPT_ERRORS_REPAIR},
+	{}
+};
 
-		if (!*p)
-			continue;
+static const struct fs_parameter_spec zonefs_param_spec[] = {
+	fsparam_enum	("errors",		Opt_errors, zonefs_param_errors),
+	fsparam_flag	("explicit-open",	Opt_explicit_open),
+	{}
+};
 
-		token = match_token(p, tokens, args);
-		switch (token) {
-		case Opt_errors_ro:
-			sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
-			sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_RO;
-			break;
-		case Opt_errors_zro:
-			sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
-			sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_ZRO;
-			break;
-		case Opt_errors_zol:
-			sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
-			sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_ZOL;
-			break;
-		case Opt_errors_repair:
-			sbi->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
-			sbi->s_mount_opts |= ZONEFS_MNTOPT_ERRORS_REPAIR;
-			break;
-		case Opt_explicit_open:
-			sbi->s_mount_opts |= ZONEFS_MNTOPT_EXPLICIT_OPEN;
-			break;
-		default:
-			return -EINVAL;
-		}
+static int zonefs_parse_param(struct fs_context *fc, struct fs_parameter *param)
+{
+	struct zonefs_context *ctx = fc->fs_private;
+	struct fs_parse_result result;
+	int opt;
+
+	opt = fs_parse(fc, zonefs_param_spec, param, &result);
+	if (opt < 0)
+		return opt;
+
+	switch (opt) {
+	case Opt_errors:
+		ctx->s_mount_opts &= ~ZONEFS_MNTOPT_ERRORS_MASK;
+		ctx->s_mount_opts |= result.uint_32;
+		break;
+	case Opt_explicit_open:
+		ctx->s_mount_opts |= ZONEFS_MNTOPT_EXPLICIT_OPEN;
+		break;
+	default:
+		return -EINVAL;
 	}
 
 	return 0;
@@ -543,13 +533,6 @@ static int zonefs_show_options(struct seq_file *seq, struct dentry *root)
 	return 0;
 }
 
-static int zonefs_remount(struct super_block *sb, int *flags, char *data)
-{
-	sync_filesystem(sb);
-
-	return zonefs_parse_options(sb, data);
-}
-
 static int zonefs_inode_setattr(struct mnt_idmap *idmap,
 				struct dentry *dentry, struct iattr *iattr)
 {
@@ -1065,7 +1048,7 @@ static int zonefs_init_zgroup(struct super_block *sb,
 	zonefs_info(sb, "Zone group \"%s\" has %u file%s\n",
 		    zonefs_zgroup_name(ztype),
 		    zgroup->g_nr_zones,
-		    zgroup->g_nr_zones > 1 ? "s" : "");
+		    str_plural(zgroup->g_nr_zones));
 
 	return 0;
 }
@@ -1205,7 +1188,6 @@ static const struct super_operations zonefs_sops = {
 	.alloc_inode	= zonefs_alloc_inode,
 	.free_inode	= zonefs_free_inode,
 	.statfs		= zonefs_statfs,
-	.remount_fs	= zonefs_remount,
 	.show_options	= zonefs_show_options,
 };
 
@@ -1250,9 +1232,10 @@ static void zonefs_release_zgroup_inodes(struct super_block *sb)
  * sub-directories and files according to the device zone configuration and
  * format options.
  */
-static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
+static int zonefs_fill_super(struct super_block *sb, struct fs_context *fc)
 {
 	struct zonefs_sb_info *sbi;
+	struct zonefs_context *ctx = fc->fs_private;
 	struct inode *inode;
 	enum zonefs_ztype ztype;
 	int ret;
@@ -1289,7 +1272,7 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_uid = GLOBAL_ROOT_UID;
 	sbi->s_gid = GLOBAL_ROOT_GID;
 	sbi->s_perm = 0640;
-	sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO;
+	sbi->s_mount_opts = ctx->s_mount_opts;
 
 	atomic_set(&sbi->s_wro_seq_files, 0);
 	sbi->s_max_wro_seq_files = bdev_max_open_zones(sb->s_bdev);
@@ -1300,10 +1283,6 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent)
 	if (ret)
 		return ret;
 
-	ret = zonefs_parse_options(sb, data);
-	if (ret)
-		return ret;
-
 	zonefs_info(sb, "Mounting %u zones", bdev_nr_zones(sb->s_bdev));
 
 	if (!sbi->s_max_wro_seq_files &&
@@ -1364,12 +1343,6 @@ cleanup:
 	return ret;
 }
 
-static struct dentry *zonefs_mount(struct file_system_type *fs_type,
-				   int flags, const char *dev_name, void *data)
-{
-	return mount_bdev(fs_type, flags, dev_name, data, zonefs_fill_super);
-}
-
 static void zonefs_kill_super(struct super_block *sb)
 {
 	struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
@@ -1384,22 +1357,72 @@ static void zonefs_kill_super(struct super_block *sb)
 	kfree(sbi);
 }
 
+static void zonefs_free_fc(struct fs_context *fc)
+{
+	struct zonefs_context *ctx = fc->fs_private;
+
+	kfree(ctx);
+}
+
+static int zonefs_get_tree(struct fs_context *fc)
+{
+	return get_tree_bdev(fc, zonefs_fill_super);
+}
+
+static int zonefs_reconfigure(struct fs_context *fc)
+{
+	struct zonefs_context *ctx = fc->fs_private;
+	struct super_block *sb = fc->root->d_sb;
+	struct zonefs_sb_info *sbi = sb->s_fs_info;
+
+	sync_filesystem(fc->root->d_sb);
+	/* Copy new options from ctx into sbi. */
+	sbi->s_mount_opts = ctx->s_mount_opts;
+
+	return 0;
+}
+
+static const struct fs_context_operations zonefs_context_ops = {
+	.parse_param    = zonefs_parse_param,
+	.get_tree       = zonefs_get_tree,
+	.reconfigure	= zonefs_reconfigure,
+	.free           = zonefs_free_fc,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+static int zonefs_init_fs_context(struct fs_context *fc)
+{
+	struct zonefs_context *ctx;
+
+	ctx = kzalloc(sizeof(struct zonefs_context), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+	ctx->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO;
+	fc->ops = &zonefs_context_ops;
+	fc->fs_private = ctx;
+
+	return 0;
+}
+
 /*
  * File system definition and registration.
  */
 static struct file_system_type zonefs_type = {
-	.owner		= THIS_MODULE,
-	.name		= "zonefs",
-	.mount		= zonefs_mount,
-	.kill_sb	= zonefs_kill_super,
-	.fs_flags	= FS_REQUIRES_DEV,
+	.owner			= THIS_MODULE,
+	.name			= "zonefs",
+	.kill_sb		= zonefs_kill_super,
+	.fs_flags		= FS_REQUIRES_DEV,
+	.init_fs_context	= zonefs_init_fs_context,
+	.parameters		= zonefs_param_spec,
 };
 
 static int __init zonefs_init_inodecache(void)
 {
 	zonefs_inode_cachep = kmem_cache_create("zonefs_inode_cache",
 			sizeof(struct zonefs_inode_info), 0,
-			(SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT),
+			SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
 			NULL);
 	if (zonefs_inode_cachep == NULL)
 		return -ENOMEM;