From 005c5db8fd7b2c9fc0b8055fa824ecaa7b15d1bf Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Tue, 28 Mar 2017 14:51:10 -0700
Subject: xfs: rework the inline directory verifiers

The inline directory verifiers should be called on the inode fork data,
which means after iformat_local on the read side, and prior to
ifork_flush on the write side.  This makes the fork verifier more
consistent with the way buffer verifiers work -- i.e. they will operate
on the memory buffer that the code will be reading and writing directly.

Furthermore, revise the verifier function to return -EFSCORRUPTED so
that we don't flood the logs with corruption messages and assert
notices.  This has been a particular problem with xfs/348, which
triggers the XFS_WANT_CORRUPTED_RETURN assertions, which halts the
kernel when CONFIG_XFS_DEBUG=y.  Disk corruption isn't supposed to do
that, at least not in a verifier.

Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
v2: get the inode d_ops the proper way
v3: describe the bug that this patch fixes; no code changes
---
 fs/xfs/libxfs/xfs_dir2_priv.h  |  3 +-
 fs/xfs/libxfs/xfs_dir2_sf.c    | 63 +++++++++++++++++++++++++++---------------
 fs/xfs/libxfs/xfs_inode_fork.c | 35 +++++++++--------------
 fs/xfs/libxfs/xfs_inode_fork.h |  2 +-
 fs/xfs/xfs_inode.c             | 19 +++++++------
 5 files changed, 66 insertions(+), 56 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
index eb00bc133bca..39f8604f764e 100644
--- a/fs/xfs/libxfs/xfs_dir2_priv.h
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -125,8 +125,7 @@ extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
 extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
 extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
 extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
-extern int xfs_dir2_sf_verify(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *sfp,
-		int size);
+extern int xfs_dir2_sf_verify(struct xfs_inode *ip);
 
 /* xfs_dir2_readdir.c */
 extern int xfs_readdir(struct xfs_inode *dp, struct dir_context *ctx,
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 96b45cd6c63f..e84af093b2ab 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -632,36 +632,49 @@ xfs_dir2_sf_check(
 /* Verify the consistency of an inline directory. */
 int
 xfs_dir2_sf_verify(
-	struct xfs_mount		*mp,
-	struct xfs_dir2_sf_hdr		*sfp,
-	int				size)
+	struct xfs_inode		*ip)
 {
+	struct xfs_mount		*mp = ip->i_mount;
+	struct xfs_dir2_sf_hdr		*sfp;
 	struct xfs_dir2_sf_entry	*sfep;
 	struct xfs_dir2_sf_entry	*next_sfep;
 	char				*endp;
 	const struct xfs_dir_ops	*dops;
+	struct xfs_ifork		*ifp;
 	xfs_ino_t			ino;
 	int				i;
 	int				i8count;
 	int				offset;
+	int				size;
+	int				error;
 	__uint8_t			filetype;
 
+	ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_LOCAL);
+	/*
+	 * xfs_iread calls us before xfs_setup_inode sets up ip->d_ops,
+	 * so we can only trust the mountpoint to have the right pointer.
+	 */
 	dops = xfs_dir_get_ops(mp, NULL);
 
+	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	sfp = (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data;
+	size = ifp->if_bytes;
+
 	/*
 	 * Give up if the directory is way too short.
 	 */
-	XFS_WANT_CORRUPTED_RETURN(mp, size >
-			offsetof(struct xfs_dir2_sf_hdr, parent));
-	XFS_WANT_CORRUPTED_RETURN(mp, size >=
-			xfs_dir2_sf_hdr_size(sfp->i8count));
+	if (size <= offsetof(struct xfs_dir2_sf_hdr, parent) ||
+	    size < xfs_dir2_sf_hdr_size(sfp->i8count))
+		return -EFSCORRUPTED;
 
 	endp = (char *)sfp + size;
 
 	/* Check .. entry */
 	ino = dops->sf_get_parent_ino(sfp);
 	i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
-	XFS_WANT_CORRUPTED_RETURN(mp, !xfs_dir_ino_validate(mp, ino));
+	error = xfs_dir_ino_validate(mp, ino);
+	if (error)
+		return error;
 	offset = dops->data_first_offset;
 
 	/* Check all reported entries */
@@ -672,12 +685,12 @@ xfs_dir2_sf_verify(
 		 * Check the fixed-offset parts of the structure are
 		 * within the data buffer.
 		 */
-		XFS_WANT_CORRUPTED_RETURN(mp,
-				((char *)sfep + sizeof(*sfep)) < endp);
+		if (((char *)sfep + sizeof(*sfep)) >= endp)
+			return -EFSCORRUPTED;
 
 		/* Don't allow names with known bad length. */
-		XFS_WANT_CORRUPTED_RETURN(mp, sfep->namelen > 0);
-		XFS_WANT_CORRUPTED_RETURN(mp, sfep->namelen < MAXNAMELEN);
+		if (sfep->namelen == 0)
+			return -EFSCORRUPTED;
 
 		/*
 		 * Check that the variable-length part of the structure is
@@ -685,33 +698,39 @@ xfs_dir2_sf_verify(
 		 * name component, so nextentry is an acceptable test.
 		 */
 		next_sfep = dops->sf_nextentry(sfp, sfep);
-		XFS_WANT_CORRUPTED_RETURN(mp, endp >= (char *)next_sfep);
+		if (endp < (char *)next_sfep)
+			return -EFSCORRUPTED;
 
 		/* Check that the offsets always increase. */
-		XFS_WANT_CORRUPTED_RETURN(mp,
-				xfs_dir2_sf_get_offset(sfep) >= offset);
+		if (xfs_dir2_sf_get_offset(sfep) < offset)
+			return -EFSCORRUPTED;
 
 		/* Check the inode number. */
 		ino = dops->sf_get_ino(sfp, sfep);
 		i8count += ino > XFS_DIR2_MAX_SHORT_INUM;
-		XFS_WANT_CORRUPTED_RETURN(mp, !xfs_dir_ino_validate(mp, ino));
+		error = xfs_dir_ino_validate(mp, ino);
+		if (error)
+			return error;
 
 		/* Check the file type. */
 		filetype = dops->sf_get_ftype(sfep);
-		XFS_WANT_CORRUPTED_RETURN(mp, filetype < XFS_DIR3_FT_MAX);
+		if (filetype >= XFS_DIR3_FT_MAX)
+			return -EFSCORRUPTED;
 
 		offset = xfs_dir2_sf_get_offset(sfep) +
 				dops->data_entsize(sfep->namelen);
 
 		sfep = next_sfep;
 	}
-	XFS_WANT_CORRUPTED_RETURN(mp, i8count == sfp->i8count);
-	XFS_WANT_CORRUPTED_RETURN(mp, (void *)sfep == (void *)endp);
+	if (i8count != sfp->i8count)
+		return -EFSCORRUPTED;
+	if ((void *)sfep != (void *)endp)
+		return -EFSCORRUPTED;
 
 	/* Make sure this whole thing ought to be in local format. */
-	XFS_WANT_CORRUPTED_RETURN(mp, offset +
-	       (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
-	       (uint)sizeof(xfs_dir2_block_tail_t) <= mp->m_dir_geo->blksize);
+	if (offset + (sfp->count + 2) * (uint)sizeof(xfs_dir2_leaf_entry_t) +
+	    (uint)sizeof(xfs_dir2_block_tail_t) > mp->m_dir_geo->blksize)
+		return -EFSCORRUPTED;
 
 	return 0;
 }
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 9653e964eda4..8a37efe04de3 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -212,6 +212,16 @@ xfs_iformat_fork(
 	if (error)
 		return error;
 
+	/* Check inline dir contents. */
+	if (S_ISDIR(VFS_I(ip)->i_mode) &&
+	    dip->di_format == XFS_DINODE_FMT_LOCAL) {
+		error = xfs_dir2_sf_verify(ip);
+		if (error) {
+			xfs_idestroy_fork(ip, XFS_DATA_FORK);
+			return error;
+		}
+	}
+
 	if (xfs_is_reflink_inode(ip)) {
 		ASSERT(ip->i_cowfp == NULL);
 		xfs_ifork_init_cow(ip);
@@ -322,8 +332,6 @@ xfs_iformat_local(
 	int		whichfork,
 	int		size)
 {
-	int		error;
-
 	/*
 	 * If the size is unreasonable, then something
 	 * is wrong and we just bail out rather than crash in
@@ -339,14 +347,6 @@ xfs_iformat_local(
 		return -EFSCORRUPTED;
 	}
 
-	if (S_ISDIR(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK) {
-		error = xfs_dir2_sf_verify(ip->i_mount,
-				(struct xfs_dir2_sf_hdr *)XFS_DFORK_DPTR(dip),
-				size);
-		if (error)
-			return error;
-	}
-
 	xfs_init_local_fork(ip, whichfork, XFS_DFORK_PTR(dip, whichfork), size);
 	return 0;
 }
@@ -867,7 +867,7 @@ xfs_iextents_copy(
  * In these cases, the format always takes precedence, because the
  * format indicates the current state of the fork.
  */
-int
+void
 xfs_iflush_fork(
 	xfs_inode_t		*ip,
 	xfs_dinode_t		*dip,
@@ -877,7 +877,6 @@ xfs_iflush_fork(
 	char			*cp;
 	xfs_ifork_t		*ifp;
 	xfs_mount_t		*mp;
-	int			error;
 	static const short	brootflag[2] =
 		{ XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
 	static const short	dataflag[2] =
@@ -886,7 +885,7 @@ xfs_iflush_fork(
 		{ XFS_ILOG_DEXT, XFS_ILOG_AEXT };
 
 	if (!iip)
-		return 0;
+		return;
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	/*
 	 * This can happen if we gave up in iformat in an error path,
@@ -894,19 +893,12 @@ xfs_iflush_fork(
 	 */
 	if (!ifp) {
 		ASSERT(whichfork == XFS_ATTR_FORK);
-		return 0;
+		return;
 	}
 	cp = XFS_DFORK_PTR(dip, whichfork);
 	mp = ip->i_mount;
 	switch (XFS_IFORK_FORMAT(ip, whichfork)) {
 	case XFS_DINODE_FMT_LOCAL:
-		if (S_ISDIR(VFS_I(ip)->i_mode) && whichfork == XFS_DATA_FORK) {
-			error = xfs_dir2_sf_verify(mp,
-					(struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data,
-					ifp->if_bytes);
-			if (error)
-				return error;
-		}
 		if ((iip->ili_fields & dataflag[whichfork]) &&
 		    (ifp->if_bytes > 0)) {
 			ASSERT(ifp->if_u1.if_data != NULL);
@@ -959,7 +951,6 @@ xfs_iflush_fork(
 		ASSERT(0);
 		break;
 	}
-	return 0;
 }
 
 /*
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 132dc59fdde6..7fb8365326d1 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -140,7 +140,7 @@ typedef struct xfs_ifork {
 struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state);
 
 int		xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *);
-int		xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
+void		xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
 				struct xfs_inode_log_item *, int);
 void		xfs_idestroy_fork(struct xfs_inode *, int);
 void		xfs_idata_realloc(struct xfs_inode *, int, int);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c7fe2c2123ab..7605d8396596 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -50,6 +50,7 @@
 #include "xfs_log.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_reflink.h"
+#include "xfs_dir2_priv.h"
 
 kmem_zone_t *xfs_inode_zone;
 
@@ -3475,7 +3476,6 @@ xfs_iflush_int(
 	struct xfs_inode_log_item *iip = ip->i_itemp;
 	struct xfs_dinode	*dip;
 	struct xfs_mount	*mp = ip->i_mount;
-	int			error;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
 	ASSERT(xfs_isiflocked(ip));
@@ -3547,6 +3547,12 @@ xfs_iflush_int(
 	if (ip->i_d.di_version < 3)
 		ip->i_d.di_flushiter++;
 
+	/* Check the inline directory data. */
+	if (S_ISDIR(VFS_I(ip)->i_mode) &&
+	    ip->i_d.di_format == XFS_DINODE_FMT_LOCAL &&
+	    xfs_dir2_sf_verify(ip))
+		goto corrupt_out;
+
 	/*
 	 * Copy the dirty parts of the inode into the on-disk inode.  We always
 	 * copy out the core of the inode, because if the inode is dirty at all
@@ -3558,14 +3564,9 @@ xfs_iflush_int(
 	if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
 		ip->i_d.di_flushiter = 0;
 
-	error = xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
-	if (error)
-		return error;
-	if (XFS_IFORK_Q(ip)) {
-		error = xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
-		if (error)
-			return error;
-	}
+	xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
+	if (XFS_IFORK_Q(ip))
+		xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
 	xfs_inobp_check(mp, bp);
 
 	/*
-- 
cgit v1.2.3


From 105664df51648d6376bda2895a890e786d0426ee Mon Sep 17 00:00:00 2001
From: Calvin Owens <calvinowens@fb.com>
Date: Fri, 31 Mar 2017 07:57:55 -0700
Subject: xfs: Honor FALLOC_FL_KEEP_SIZE when punching ends of files

When punching past EOF on XFS, fallocate(mode=PUNCH_HOLE|KEEP_SIZE) will
round the file size up to the nearest multiple of PAGE_SIZE:

  calvinow@vm-disks/generic-xfs-1 ~$ dd if=/dev/urandom of=test bs=2048 count=1
  calvinow@vm-disks/generic-xfs-1 ~$ stat test
    Size: 2048            Blocks: 8          IO Block: 4096   regular file
  calvinow@vm-disks/generic-xfs-1 ~$ fallocate -n -l 2048 -o 2048 -p test
  calvinow@vm-disks/generic-xfs-1 ~$ stat test
    Size: 4096            Blocks: 8          IO Block: 4096   regular file

Commit 3c2bdc912a1cc050 ("xfs: kill xfs_zero_remaining_bytes") replaced
xfs_zero_remaining_bytes() with calls to iomap helpers. The new helpers
don't enforce that [pos,offset) lies strictly on [0,i_size) when being
called from xfs_free_file_space(), so by "leaking" these ranges into
xfs_zero_range() we get this buggy behavior.

Fix this by reintroducing the checks xfs_zero_remaining_bytes() did
against i_size at the bottom of xfs_free_file_space().

Reported-by: Aaron Gao <gzh@fb.com>
Fixes: 3c2bdc912a1cc050 ("xfs: kill xfs_zero_remaining_bytes")
Cc: Christoph Hellwig <hch@lst.de>
Cc: Brian Foster <bfoster@redhat.com>
Cc: <stable@vger.kernel.org> # 4.8+
Signed-off-by: Calvin Owens <calvinowens@fb.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_bmap_util.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 8b75dcea5966..828532ce0adc 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1311,8 +1311,16 @@ xfs_free_file_space(
 	/*
 	 * Now that we've unmap all full blocks we'll have to zero out any
 	 * partial block at the beginning and/or end.  xfs_zero_range is
-	 * smart enough to skip any holes, including those we just created.
+	 * smart enough to skip any holes, including those we just created,
+	 * but we must take care not to zero beyond EOF and enlarge i_size.
 	 */
+
+	if (offset >= XFS_ISIZE(ip))
+		return 0;
+
+	if (offset + len > XFS_ISIZE(ip))
+		len = XFS_ISIZE(ip) - offset;
+
 	return xfs_zero_range(ip, offset, len, NULL);
 }
 
-- 
cgit v1.2.3


From f1c0e20243baca1194fccfdf0111831bee12647c Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Mon, 3 Apr 2017 15:17:53 -0700
Subject: xfs: fix kernel memory exposure problems

Fix a memory exposure problems in inumbers where we allocate an array of
structures with holes, fail to zero the holes, then blindly copy the
kernel memory contents (junk and all) into userspace.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_itable.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 2a6d9b1558e0..26d67ce3c18d 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -583,7 +583,7 @@ xfs_inumbers(
 		return error;
 
 	bcount = MIN(left, (int)(PAGE_SIZE / sizeof(*buffer)));
-	buffer = kmem_alloc(bcount * sizeof(*buffer), KM_SLEEP);
+	buffer = kmem_zalloc(bcount * sizeof(*buffer), KM_SLEEP);
 	do {
 		struct xfs_inobt_rec_incore	r;
 		int				stat;
-- 
cgit v1.2.3


From 696a562072e3c14bcd13ae5acc19cdf27679e865 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Tue, 28 Mar 2017 14:51:44 -0700
Subject: xfs: use dedicated log worker wq to avoid deadlock with cil wq

The log covering background task used to be part of the xfssyncd
workqueue. That workqueue was removed as of commit 5889608df ("xfs:
syncd workqueue is no more") and the associated work item scheduled
to the xfs-log wq. The latter is used for log buffer I/O completion.

Since xfs_log_worker() can invoke a log flush, a deadlock is
possible between the xfs-log and xfs-cil workqueues. Consider the
following codepath from xfs_log_worker():

xfs_log_worker()
  xfs_log_force()
    _xfs_log_force()
      xlog_cil_force()
        xlog_cil_force_lsn()
          xlog_cil_push_now()
            flush_work()

The above is in xfs-log wq context and blocked waiting on the
completion of an xfs-cil work item. Concurrently, the cil push in
progress can end up blocked here:

xlog_cil_push_work()
  xlog_cil_push()
    xlog_write()
      xlog_state_get_iclog_space()
        xlog_wait(&log->l_flush_wait, ...)

The above is in xfs-cil context waiting on log buffer I/O
completion, which executes in xfs-log wq context. In this scenario
both workqueues are deadlocked waiting on eachother.

Add a new workqueue specifically for the high level log covering and
ail pushing worker, as was the case prior to commit 5889608df.

Diagnosed-by: David Jeffery <djeffery@redhat.com>
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_log.c   | 2 +-
 fs/xfs/xfs_mount.h | 1 +
 fs/xfs/xfs_super.c | 8 ++++++++
 3 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index b1469f0a91a6..bb58cd1873c9 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1293,7 +1293,7 @@ void
 xfs_log_work_queue(
 	struct xfs_mount        *mp)
 {
-	queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work,
+	queue_delayed_work(mp->m_sync_workqueue, &mp->m_log->l_work,
 				msecs_to_jiffies(xfs_syncd_centisecs * 10));
 }
 
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 6db6fd6b82b0..22b2185e93a0 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -183,6 +183,7 @@ typedef struct xfs_mount {
 	struct workqueue_struct	*m_reclaim_workqueue;
 	struct workqueue_struct	*m_log_workqueue;
 	struct workqueue_struct *m_eofblocks_workqueue;
+	struct workqueue_struct	*m_sync_workqueue;
 
 	/*
 	 * Generation of the filesysyem layout.  This is incremented by each
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 685c042a120f..47d239dcf3f4 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -877,8 +877,15 @@ xfs_init_mount_workqueues(
 	if (!mp->m_eofblocks_workqueue)
 		goto out_destroy_log;
 
+	mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", WQ_FREEZABLE, 0,
+					       mp->m_fsname);
+	if (!mp->m_sync_workqueue)
+		goto out_destroy_eofb;
+
 	return 0;
 
+out_destroy_eofb:
+	destroy_workqueue(mp->m_eofblocks_workqueue);
 out_destroy_log:
 	destroy_workqueue(mp->m_log_workqueue);
 out_destroy_reclaim:
@@ -899,6 +906,7 @@ STATIC void
 xfs_destroy_mount_workqueues(
 	struct xfs_mount	*mp)
 {
+	destroy_workqueue(mp->m_sync_workqueue);
 	destroy_workqueue(mp->m_eofblocks_workqueue);
 	destroy_workqueue(mp->m_log_workqueue);
 	destroy_workqueue(mp->m_reclaim_workqueue);
-- 
cgit v1.2.3


From 9c4f29d39168bb9363189f0d54ee1202372e7d9b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 28 Mar 2017 14:53:35 -0700
Subject: xfs: factor out a xfs_bmap_is_real_extent helper

This checks for all the non-normal extent types, including handling both
encodings of delayed allocations.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_bmap.h | 12 ++++++++++++
 fs/xfs/xfs_aops.c        |  4 +---
 fs/xfs/xfs_bmap_util.c   |  7 +++----
 fs/xfs/xfs_reflink.c     | 21 ++++-----------------
 4 files changed, 20 insertions(+), 24 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index cdef87db5262..a6e612cabe15 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -171,6 +171,18 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags)
 	{ BMAP_COWFORK,		"COW" }
 
 
+/*
+ * Return true if the extent is a real, allocated extent, or false if it is  a
+ * delayed allocation, and unwritten extent or a hole.
+ */
+static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
+{
+	return irec->br_state != XFS_EXT_UNWRITTEN &&
+		irec->br_startblock != HOLESTARTBLOCK &&
+		irec->br_startblock != DELAYSTARTBLOCK &&
+		!isnullstartblock(irec->br_startblock);
+}
+
 /*
  * This macro is used to determine how many extents will be shifted
  * in one write transaction. We could require two splits,
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 61494295d92f..8567b385fff4 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1276,9 +1276,7 @@ xfs_get_blocks(
 	 * For unwritten extents do not report a disk address in the buffered
 	 * read case (treat as if we're reading into a hole).
 	 */
-	if (imap.br_startblock != HOLESTARTBLOCK &&
-	    imap.br_startblock != DELAYSTARTBLOCK &&
-	    !ISUNWRITTEN(&imap))
+	if (xfs_bmap_is_real_extent(&imap))
 		xfs_map_buffer(inode, bh_result, &imap, offset);
 
 	/*
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 828532ce0adc..4d1920e594b0 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -448,10 +448,9 @@ xfs_getbmap_adjust_shared(
 	next_map->br_blockcount = 0;
 
 	/* Only written data blocks can be shared. */
-	if (!xfs_is_reflink_inode(ip) || whichfork != XFS_DATA_FORK ||
-	    map->br_startblock == DELAYSTARTBLOCK ||
-	    map->br_startblock == HOLESTARTBLOCK ||
-	    ISUNWRITTEN(map))
+	if (!xfs_is_reflink_inode(ip) ||
+	    whichfork != XFS_DATA_FORK ||
+	    !xfs_bmap_is_real_extent(map))
 		return 0;
 
 	agno = XFS_FSB_TO_AGNO(mp, map->br_startblock);
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 4a84c5ea266d..c0f3754caca2 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -206,11 +206,7 @@ xfs_reflink_trim_around_shared(
 	int			error = 0;
 
 	/* Holes, unwritten, and delalloc extents cannot be shared */
-	if (!xfs_is_reflink_inode(ip) ||
-	    ISUNWRITTEN(irec) ||
-	    irec->br_startblock == HOLESTARTBLOCK ||
-	    irec->br_startblock == DELAYSTARTBLOCK ||
-	    isnullstartblock(irec->br_startblock)) {
+	if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
 		*shared = false;
 		return 0;
 	}
@@ -1045,12 +1041,12 @@ xfs_reflink_remap_extent(
 	xfs_off_t		new_isize)
 {
 	struct xfs_mount	*mp = ip->i_mount;
+	bool			real_extent = xfs_bmap_is_real_extent(irec);
 	struct xfs_trans	*tp;
 	xfs_fsblock_t		firstfsb;
 	unsigned int		resblks;
 	struct xfs_defer_ops	dfops;
 	struct xfs_bmbt_irec	uirec;
-	bool			real_extent;
 	xfs_filblks_t		rlen;
 	xfs_filblks_t		unmap_len;
 	xfs_off_t		newlen;
@@ -1059,11 +1055,6 @@ xfs_reflink_remap_extent(
 	unmap_len = irec->br_startoff + irec->br_blockcount - destoff;
 	trace_xfs_reflink_punch_range(ip, destoff, unmap_len);
 
-	/* Only remap normal extents. */
-	real_extent =  (irec->br_startblock != HOLESTARTBLOCK &&
-			irec->br_startblock != DELAYSTARTBLOCK &&
-			!ISUNWRITTEN(irec));
-
 	/* No reflinking if we're low on space */
 	if (real_extent) {
 		error = xfs_reflink_ag_has_free_space(mp,
@@ -1359,9 +1350,7 @@ xfs_reflink_dirty_extents(
 			goto out;
 		if (nmaps == 0)
 			break;
-		if (map[0].br_startblock == HOLESTARTBLOCK ||
-		    map[0].br_startblock == DELAYSTARTBLOCK ||
-		    ISUNWRITTEN(&map[0]))
+		if (!xfs_bmap_is_real_extent(&map[0]))
 			goto next;
 
 		map[1] = map[0];
@@ -1435,9 +1424,7 @@ xfs_reflink_clear_inode_flag(
 			return error;
 		if (nmaps == 0)
 			break;
-		if (map.br_startblock == HOLESTARTBLOCK ||
-		    map.br_startblock == DELAYSTARTBLOCK ||
-		    ISUNWRITTEN(&map))
+		if (!xfs_bmap_is_real_extent(&map))
 			goto next;
 
 		agno = XFS_FSB_TO_AGNO(mp, map.br_startblock);
-- 
cgit v1.2.3


From 63fbb4c18d6b04d5f376326395cddf6c2de2c965 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 28 Mar 2017 14:53:36 -0700
Subject: xfs: remove the ISUNWRITTEN macro

Opencoding the trivial checks makes it much easier to read (and grep..).

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_bmap_btree.h | 1 -
 fs/xfs/xfs_aops.c              | 4 ++--
 fs/xfs/xfs_iomap.c             | 4 ++--
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 819a8a4dee95..90347a99c6d2 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -30,7 +30,6 @@ struct xfs_trans;
 #define XFS_EXTFMT_INODE(x)	\
 	(xfs_sb_version_hasextflgbit(&((x)->i_mount->m_sb)) ? \
 		XFS_EXTFMT_HASSTATE : XFS_EXTFMT_NOSTATE)
-#define ISUNWRITTEN(x)	((x)->br_state == XFS_EXT_UNWRITTEN)
 
 /*
  * Btree block header size depends on a superblock flag.
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 8567b385fff4..eef453adbc06 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1261,8 +1261,8 @@ xfs_get_blocks(
 
 	if (nimaps) {
 		trace_xfs_get_blocks_found(ip, offset, size,
-				ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
-						   : XFS_IO_OVERWRITE, &imap);
+			imap.br_state == XFS_EXT_UNWRITTEN ?
+				XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, &imap);
 		xfs_iunlock(ip, lockmode);
 	} else {
 		trace_xfs_get_blocks_notfound(ip, offset, size);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 288ee5b840d7..009f8243dddc 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -240,7 +240,7 @@ xfs_iomap_write_direct(
 	 */
 	if (IS_DAX(VFS_I(ip))) {
 		bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
-		if (ISUNWRITTEN(imap)) {
+		if (imap->br_state == XFS_EXT_UNWRITTEN) {
 			tflags |= XFS_TRANS_RESERVE;
 			resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
 		}
@@ -945,7 +945,7 @@ static inline bool imap_needs_alloc(struct inode *inode,
 	return !nimaps ||
 		imap->br_startblock == HOLESTARTBLOCK ||
 		imap->br_startblock == DELAYSTARTBLOCK ||
-		(IS_DAX(inode) && ISUNWRITTEN(imap));
+		(IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN);
 }
 
 static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags)
-- 
cgit v1.2.3


From bc593eebfd66838a3022ac92782665fbcafc412e Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 28 Mar 2017 14:54:29 -0700
Subject: xfs: fix up inode validation failure message

"xfs_iread: validation failed for inode 96 failed"

One "failed" seems like enough.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Alex Elder <elder@linaro.org>
Reviewed-by: Bill O'Donnell <billodo@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_inode_buf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index d93f9d918cfc..09c3d1aecef2 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -508,7 +508,7 @@ xfs_iread(
 
 	/* even unallocated inodes are verified */
 	if (!xfs_dinode_verify(mp, ip->i_ino, dip)) {
-		xfs_alert(mp, "%s: validation failed for inode %lld failed",
+		xfs_alert(mp, "%s: validation failed for inode %lld",
 				__func__, ip->i_ino);
 
 		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
-- 
cgit v1.2.3


From 422e5b53ed83dd71c1eac276c6ec7f7c6e81ce8d Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Tue, 28 Mar 2017 14:55:15 -0700
Subject: xfs: Remove obsolete declaration of xfs_buf_get_empty

This function has been removed ever since at least 3.12-era. No need to
keep its declaration in the header so nuke it.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_buf.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 3c867e5a63e1..e1bc1af743fa 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -291,7 +291,6 @@ xfs_buf_readahead(
 	return xfs_buf_readahead_map(target, &map, 1, ops);
 }
 
-struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks);
 void xfs_buf_set_empty(struct xfs_buf *bp, size_t numblks);
 int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length);
 
-- 
cgit v1.2.3


From be6324c00c4d1e0e665f03ed1fc18863a88da119 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Mon, 3 Apr 2017 15:17:57 -0700
Subject: xfs: fix over-copying of getbmap parameters from userspace

In xfs_ioc_getbmap, we should only copy the fields of struct getbmap
from userspace, or else we end up copying random stack contents into the
kernel.  struct getbmap is a strict subset of getbmapx, so a partial
structure copy should work fine.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_ioctl.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 2fd7fdf5438f..6d30b06e79bc 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1543,10 +1543,11 @@ xfs_ioc_getbmap(
 	unsigned int		cmd,
 	void			__user *arg)
 {
-	struct getbmapx		bmx;
+	struct getbmapx		bmx = { 0 };
 	int			error;
 
-	if (copy_from_user(&bmx, arg, sizeof(struct getbmapx)))
+	/* struct getbmap is a strict subset of struct getbmapx. */
+	if (copy_from_user(&bmx, arg, offsetof(struct getbmapx, bmv_iflags)))
 		return -EFAULT;
 
 	if (bmx.bmv_count < 2)
-- 
cgit v1.2.3


From 80c9f490f344be7999f57fc31a8ed956f8c65f3b Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Tue, 28 Mar 2017 14:56:34 -0700
Subject: vfs: add common GETFSMAP ioctl definitions

Add the GETFSMAP headers to the VFS kernel headers

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 include/uapi/linux/fsmap.h | 112 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 112 insertions(+)
 create mode 100644 include/uapi/linux/fsmap.h

diff --git a/include/uapi/linux/fsmap.h b/include/uapi/linux/fsmap.h
new file mode 100644
index 000000000000..7e8e5f0bd6d2
--- /dev/null
+++ b/include/uapi/linux/fsmap.h
@@ -0,0 +1,112 @@
+/*
+ * FS_IOC_GETFSMAP ioctl infrastructure.
+ *
+ * Copyright (C) 2017 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ */
+#ifndef _LINUX_FSMAP_H
+#define _LINUX_FSMAP_H
+
+#include <linux/types.h>
+
+/*
+ *	Structure for FS_IOC_GETFSMAP.
+ *
+ *	The memory layout for this call are the scalar values defined in
+ *	struct fsmap_head, followed by two struct fsmap that describe
+ *	the lower and upper bound of mappings to return, followed by an
+ *	array of struct fsmap mappings.
+ *
+ *	fmh_iflags control the output of the call, whereas fmh_oflags report
+ *	on the overall record output.  fmh_count should be set to the
+ *	length of the fmh_recs array, and fmh_entries will be set to the
+ *	number of entries filled out during each call.  If fmh_count is
+ *	zero, the number of reverse mappings will be returned in
+ *	fmh_entries, though no mappings will be returned.  fmh_reserved
+ *	must be set to zero.
+ *
+ *	The two elements in the fmh_keys array are used to constrain the
+ *	output.  The first element in the array should represent the
+ *	lowest disk mapping ("low key") that the user wants to learn
+ *	about.  If this value is all zeroes, the filesystem will return
+ *	the first entry it knows about.  For a subsequent call, the
+ *	contents of fsmap_head.fmh_recs[fsmap_head.fmh_count - 1] should be
+ *	copied into fmh_keys[0] to have the kernel start where it left off.
+ *
+ *	The second element in the fmh_keys array should represent the
+ *	highest disk mapping ("high key") that the user wants to learn
+ *	about.  If this value is all ones, the filesystem will not stop
+ *	until it runs out of mapping to return or runs out of space in
+ *	fmh_recs.
+ *
+ *	fmr_device can be either a 32-bit cookie representing a device, or
+ *	a 32-bit dev_t if the FMH_OF_DEV_T flag is set.  fmr_physical,
+ *	fmr_offset, and fmr_length are expressed in units of bytes.
+ *	fmr_owner is either an inode number, or a special value if
+ *	FMR_OF_SPECIAL_OWNER is set in fmr_flags.
+ */
+struct fsmap {
+	__u32		fmr_device;	/* device id */
+	__u32		fmr_flags;	/* mapping flags */
+	__u64		fmr_physical;	/* device offset of segment */
+	__u64		fmr_owner;	/* owner id */
+	__u64		fmr_offset;	/* file offset of segment */
+	__u64		fmr_length;	/* length of segment */
+	__u64		fmr_reserved[3];	/* must be zero */
+};
+
+struct fsmap_head {
+	__u32		fmh_iflags;	/* control flags */
+	__u32		fmh_oflags;	/* output flags */
+	__u32		fmh_count;	/* # of entries in array incl. input */
+	__u32		fmh_entries;	/* # of entries filled in (output). */
+	__u64		fmh_reserved[6];	/* must be zero */
+
+	struct fsmap	fmh_keys[2];	/* low and high keys for the mapping search */
+	struct fsmap	fmh_recs[];	/* returned records */
+};
+
+/* Size of an fsmap_head with room for nr records. */
+static inline size_t
+fsmap_sizeof(
+	unsigned int	nr)
+{
+	return sizeof(struct fsmap_head) + nr * sizeof(struct fsmap);
+}
+
+/* Start the next fsmap query at the end of the current query results. */
+static inline void
+fsmap_advance(
+	struct fsmap_head	*head)
+{
+	head->fmh_keys[0] = head->fmh_recs[head->fmh_entries - 1];
+}
+
+/*	fmh_iflags values - set by FS_IOC_GETFSMAP caller in the header. */
+/* no flags defined yet */
+#define FMH_IF_VALID		0
+
+/*	fmh_oflags values - returned in the header segment only. */
+#define FMH_OF_DEV_T		0x1	/* fmr_device values will be dev_t */
+
+/*	fmr_flags values - returned for each non-header segment */
+#define FMR_OF_PREALLOC		0x1	/* segment = unwritten pre-allocation */
+#define FMR_OF_ATTR_FORK	0x2	/* segment = attribute fork */
+#define FMR_OF_EXTENT_MAP	0x4	/* segment = extent map */
+#define FMR_OF_SHARED		0x8	/* segment = shared with another file */
+#define FMR_OF_SPECIAL_OWNER	0x10	/* owner is a special value */
+#define FMR_OF_LAST		0x20	/* segment is the last in the FS */
+
+/* Each FS gets to define its own special owner codes. */
+#define FMR_OWNER(type, code)	(((__u64)type << 32) | \
+				 ((__u64)code & 0xFFFFFFFFULL))
+#define FMR_OWNER_TYPE(owner)	((__u32)((__u64)owner >> 32))
+#define FMR_OWNER_CODE(owner)	((__u32)(((__u64)owner & 0xFFFFFFFFULL)))
+#define FMR_OWN_FREE		FMR_OWNER(0, 1) /* free space */
+#define FMR_OWN_UNKNOWN		FMR_OWNER(0, 2) /* unknown owner */
+#define FMR_OWN_METADATA	FMR_OWNER(0, 3) /* metadata */
+
+#define FS_IOC_GETFSMAP		_IOWR('X', 59, struct fsmap_head)
+
+#endif /* _LINUX_FSMAP_H */
-- 
cgit v1.2.3


From 08438b1e386b7899e8a2ffef59d6ebf29aac530f Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Tue, 28 Mar 2017 14:56:34 -0700
Subject: xfs: plumb in needed functions for range querying of the freespace
 btrees

Plumb in the pieces (init_high_key, diff_two_keys) necessary to call
query_range on the free space btrees.  Remove the debugging asserts
so that we can make queries starting from block 0.

While we're at it, merge the redundant "if (btnum ==" hunks.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/libxfs/xfs_alloc_btree.c | 172 ++++++++++++++++++++++++++++++----------
 1 file changed, 129 insertions(+), 43 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index efb467b10a71..e1fcfe7f0a9a 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -205,19 +205,37 @@ xfs_allocbt_init_key_from_rec(
 	union xfs_btree_key	*key,
 	union xfs_btree_rec	*rec)
 {
-	ASSERT(rec->alloc.ar_startblock != 0);
-
 	key->alloc.ar_startblock = rec->alloc.ar_startblock;
 	key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
 }
 
+STATIC void
+xfs_bnobt_init_high_key_from_rec(
+	union xfs_btree_key	*key,
+	union xfs_btree_rec	*rec)
+{
+	__u32			x;
+
+	x = be32_to_cpu(rec->alloc.ar_startblock);
+	x += be32_to_cpu(rec->alloc.ar_blockcount) - 1;
+	key->alloc.ar_startblock = cpu_to_be32(x);
+	key->alloc.ar_blockcount = 0;
+}
+
+STATIC void
+xfs_cntbt_init_high_key_from_rec(
+	union xfs_btree_key	*key,
+	union xfs_btree_rec	*rec)
+{
+	key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
+	key->alloc.ar_startblock = 0;
+}
+
 STATIC void
 xfs_allocbt_init_rec_from_cur(
 	struct xfs_btree_cur	*cur,
 	union xfs_btree_rec	*rec)
 {
-	ASSERT(cur->bc_rec.a.ar_startblock != 0);
-
 	rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock);
 	rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount);
 }
@@ -236,18 +254,24 @@ xfs_allocbt_init_ptr_from_cur(
 }
 
 STATIC __int64_t
-xfs_allocbt_key_diff(
+xfs_bnobt_key_diff(
 	struct xfs_btree_cur	*cur,
 	union xfs_btree_key	*key)
 {
 	xfs_alloc_rec_incore_t	*rec = &cur->bc_rec.a;
 	xfs_alloc_key_t		*kp = &key->alloc;
-	__int64_t		diff;
 
-	if (cur->bc_btnum == XFS_BTNUM_BNO) {
-		return (__int64_t)be32_to_cpu(kp->ar_startblock) -
-				rec->ar_startblock;
-	}
+	return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
+}
+
+STATIC __int64_t
+xfs_cntbt_key_diff(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*key)
+{
+	xfs_alloc_rec_incore_t	*rec = &cur->bc_rec.a;
+	xfs_alloc_key_t		*kp = &key->alloc;
+	__int64_t		diff;
 
 	diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount;
 	if (diff)
@@ -256,6 +280,33 @@ xfs_allocbt_key_diff(
 	return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
 }
 
+STATIC __int64_t
+xfs_bnobt_diff_two_keys(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*k1,
+	union xfs_btree_key	*k2)
+{
+	return (__int64_t)be32_to_cpu(k1->alloc.ar_startblock) -
+			  be32_to_cpu(k2->alloc.ar_startblock);
+}
+
+STATIC __int64_t
+xfs_cntbt_diff_two_keys(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*k1,
+	union xfs_btree_key	*k2)
+{
+	__int64_t		diff;
+
+	diff =  be32_to_cpu(k1->alloc.ar_blockcount) -
+		be32_to_cpu(k2->alloc.ar_blockcount);
+	if (diff)
+		return diff;
+
+	return  be32_to_cpu(k1->alloc.ar_startblock) -
+		be32_to_cpu(k2->alloc.ar_startblock);
+}
+
 static bool
 xfs_allocbt_verify(
 	struct xfs_buf		*bp)
@@ -346,44 +397,54 @@ const struct xfs_buf_ops xfs_allocbt_buf_ops = {
 
 #if defined(DEBUG) || defined(XFS_WARN)
 STATIC int
-xfs_allocbt_keys_inorder(
+xfs_bnobt_keys_inorder(
 	struct xfs_btree_cur	*cur,
 	union xfs_btree_key	*k1,
 	union xfs_btree_key	*k2)
 {
-	if (cur->bc_btnum == XFS_BTNUM_BNO) {
-		return be32_to_cpu(k1->alloc.ar_startblock) <
-		       be32_to_cpu(k2->alloc.ar_startblock);
-	} else {
-		return be32_to_cpu(k1->alloc.ar_blockcount) <
-			be32_to_cpu(k2->alloc.ar_blockcount) ||
-			(k1->alloc.ar_blockcount == k2->alloc.ar_blockcount &&
-			 be32_to_cpu(k1->alloc.ar_startblock) <
-			 be32_to_cpu(k2->alloc.ar_startblock));
-	}
+	return be32_to_cpu(k1->alloc.ar_startblock) <
+	       be32_to_cpu(k2->alloc.ar_startblock);
 }
 
 STATIC int
-xfs_allocbt_recs_inorder(
+xfs_bnobt_recs_inorder(
 	struct xfs_btree_cur	*cur,
 	union xfs_btree_rec	*r1,
 	union xfs_btree_rec	*r2)
 {
-	if (cur->bc_btnum == XFS_BTNUM_BNO) {
-		return be32_to_cpu(r1->alloc.ar_startblock) +
-			be32_to_cpu(r1->alloc.ar_blockcount) <=
-			be32_to_cpu(r2->alloc.ar_startblock);
-	} else {
-		return be32_to_cpu(r1->alloc.ar_blockcount) <
-			be32_to_cpu(r2->alloc.ar_blockcount) ||
-			(r1->alloc.ar_blockcount == r2->alloc.ar_blockcount &&
-			 be32_to_cpu(r1->alloc.ar_startblock) <
-			 be32_to_cpu(r2->alloc.ar_startblock));
-	}
+	return be32_to_cpu(r1->alloc.ar_startblock) +
+		be32_to_cpu(r1->alloc.ar_blockcount) <=
+		be32_to_cpu(r2->alloc.ar_startblock);
+}
+
+STATIC int
+xfs_cntbt_keys_inorder(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*k1,
+	union xfs_btree_key	*k2)
+{
+	return be32_to_cpu(k1->alloc.ar_blockcount) <
+		be32_to_cpu(k2->alloc.ar_blockcount) ||
+		(k1->alloc.ar_blockcount == k2->alloc.ar_blockcount &&
+		 be32_to_cpu(k1->alloc.ar_startblock) <
+		 be32_to_cpu(k2->alloc.ar_startblock));
 }
-#endif	/* DEBUG */
 
-static const struct xfs_btree_ops xfs_allocbt_ops = {
+STATIC int
+xfs_cntbt_recs_inorder(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*r1,
+	union xfs_btree_rec	*r2)
+{
+	return be32_to_cpu(r1->alloc.ar_blockcount) <
+		be32_to_cpu(r2->alloc.ar_blockcount) ||
+		(r1->alloc.ar_blockcount == r2->alloc.ar_blockcount &&
+		 be32_to_cpu(r1->alloc.ar_startblock) <
+		 be32_to_cpu(r2->alloc.ar_startblock));
+}
+#endif /* DEBUG */
+
+static const struct xfs_btree_ops xfs_bnobt_ops = {
 	.rec_len		= sizeof(xfs_alloc_rec_t),
 	.key_len		= sizeof(xfs_alloc_key_t),
 
@@ -395,13 +456,39 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
 	.get_minrecs		= xfs_allocbt_get_minrecs,
 	.get_maxrecs		= xfs_allocbt_get_maxrecs,
 	.init_key_from_rec	= xfs_allocbt_init_key_from_rec,
+	.init_high_key_from_rec	= xfs_bnobt_init_high_key_from_rec,
 	.init_rec_from_cur	= xfs_allocbt_init_rec_from_cur,
 	.init_ptr_from_cur	= xfs_allocbt_init_ptr_from_cur,
-	.key_diff		= xfs_allocbt_key_diff,
+	.key_diff		= xfs_bnobt_key_diff,
 	.buf_ops		= &xfs_allocbt_buf_ops,
+	.diff_two_keys		= xfs_bnobt_diff_two_keys,
 #if defined(DEBUG) || defined(XFS_WARN)
-	.keys_inorder		= xfs_allocbt_keys_inorder,
-	.recs_inorder		= xfs_allocbt_recs_inorder,
+	.keys_inorder		= xfs_bnobt_keys_inorder,
+	.recs_inorder		= xfs_bnobt_recs_inorder,
+#endif
+};
+
+static const struct xfs_btree_ops xfs_cntbt_ops = {
+	.rec_len		= sizeof(xfs_alloc_rec_t),
+	.key_len		= sizeof(xfs_alloc_key_t),
+
+	.dup_cursor		= xfs_allocbt_dup_cursor,
+	.set_root		= xfs_allocbt_set_root,
+	.alloc_block		= xfs_allocbt_alloc_block,
+	.free_block		= xfs_allocbt_free_block,
+	.update_lastrec		= xfs_allocbt_update_lastrec,
+	.get_minrecs		= xfs_allocbt_get_minrecs,
+	.get_maxrecs		= xfs_allocbt_get_maxrecs,
+	.init_key_from_rec	= xfs_allocbt_init_key_from_rec,
+	.init_high_key_from_rec	= xfs_cntbt_init_high_key_from_rec,
+	.init_rec_from_cur	= xfs_allocbt_init_rec_from_cur,
+	.init_ptr_from_cur	= xfs_allocbt_init_ptr_from_cur,
+	.key_diff		= xfs_cntbt_key_diff,
+	.buf_ops		= &xfs_allocbt_buf_ops,
+	.diff_two_keys		= xfs_cntbt_diff_two_keys,
+#if defined(DEBUG) || defined(XFS_WARN)
+	.keys_inorder		= xfs_cntbt_keys_inorder,
+	.recs_inorder		= xfs_cntbt_recs_inorder,
 #endif
 };
 
@@ -427,16 +514,15 @@ xfs_allocbt_init_cursor(
 	cur->bc_mp = mp;
 	cur->bc_btnum = btnum;
 	cur->bc_blocklog = mp->m_sb.sb_blocklog;
-	cur->bc_ops = &xfs_allocbt_ops;
-	if (btnum == XFS_BTNUM_BNO)
-		cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
-	else
-		cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2);
 
 	if (btnum == XFS_BTNUM_CNT) {
+		cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2);
+		cur->bc_ops = &xfs_cntbt_ops;
 		cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]);
 		cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
 	} else {
+		cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2);
+		cur->bc_ops = &xfs_bnobt_ops;
 		cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]);
 	}
 
-- 
cgit v1.2.3


From 2d520bfaa28b60401fbfe581f485a0bb952d881c Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Tue, 28 Mar 2017 14:56:35 -0700
Subject: xfs: provide a query_range function for freespace btrees

Implement a query_range function for the bnobt and cntbt.  This will
be used for getfsmap fallback if there is no rmapbt and by the online
scrub and repair code.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/libxfs/xfs_alloc.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/libxfs/xfs_alloc.h | 10 ++++++++++
 2 files changed, 52 insertions(+)

diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 369adcc18c02..f65b8d49ec16 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2868,3 +2868,45 @@ err:
 	xfs_trans_brelse(tp, agbp);
 	return error;
 }
+
+struct xfs_alloc_query_range_info {
+	xfs_alloc_query_range_fn	fn;
+	void				*priv;
+};
+
+/* Format btree record and pass to our callback. */
+STATIC int
+xfs_alloc_query_range_helper(
+	struct xfs_btree_cur		*cur,
+	union xfs_btree_rec		*rec,
+	void				*priv)
+{
+	struct xfs_alloc_query_range_info	*query = priv;
+	struct xfs_alloc_rec_incore		irec;
+
+	irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
+	irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
+	return query->fn(cur, &irec, query->priv);
+}
+
+/* Find all free space within a given range of blocks. */
+int
+xfs_alloc_query_range(
+	struct xfs_btree_cur			*cur,
+	struct xfs_alloc_rec_incore		*low_rec,
+	struct xfs_alloc_rec_incore		*high_rec,
+	xfs_alloc_query_range_fn		fn,
+	void					*priv)
+{
+	union xfs_btree_irec			low_brec;
+	union xfs_btree_irec			high_brec;
+	struct xfs_alloc_query_range_info	query;
+
+	ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
+	low_brec.a = *low_rec;
+	high_brec.a = *high_rec;
+	query.priv = priv;
+	query.fn = fn;
+	return xfs_btree_query_range(cur, &low_brec, &high_brec,
+			xfs_alloc_query_range_helper, &query);
+}
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 2a8d0fa6fbbe..6c2643c4abd3 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -219,4 +219,14 @@ int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno,
 
 xfs_extlen_t xfs_prealloc_blocks(struct xfs_mount *mp);
 
+typedef int (*xfs_alloc_query_range_fn)(
+	struct xfs_btree_cur		*cur,
+	struct xfs_alloc_rec_incore	*rec,
+	void				*priv);
+
+int xfs_alloc_query_range(struct xfs_btree_cur *cur,
+		struct xfs_alloc_rec_incore *low_rec,
+		struct xfs_alloc_rec_incore *high_rec,
+		xfs_alloc_query_range_fn fn, void *priv);
+
 #endif	/* __XFS_ALLOC_H__ */
-- 
cgit v1.2.3


From e9a2599a249ed7d31771985aea0e761f5680de64 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Tue, 28 Mar 2017 14:56:35 -0700
Subject: xfs: create a function to query all records in a btree

Create a helper function that will query all records in a btree.
This will be used by the online repair functions to examine every
record in a btree to rebuild a second btree.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/libxfs/xfs_alloc.c | 15 +++++++++++++++
 fs/xfs/libxfs/xfs_alloc.h |  2 ++
 fs/xfs/libxfs/xfs_btree.c | 15 +++++++++++++++
 fs/xfs/libxfs/xfs_btree.h |  2 ++
 fs/xfs/libxfs/xfs_rmap.c  | 28 +++++++++++++++++++++-------
 fs/xfs/libxfs/xfs_rmap.h  |  2 ++
 6 files changed, 57 insertions(+), 7 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index f65b8d49ec16..7486401ccbd3 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2910,3 +2910,18 @@ xfs_alloc_query_range(
 	return xfs_btree_query_range(cur, &low_brec, &high_brec,
 			xfs_alloc_query_range_helper, &query);
 }
+
+/* Find all free space records. */
+int
+xfs_alloc_query_all(
+	struct xfs_btree_cur			*cur,
+	xfs_alloc_query_range_fn		fn,
+	void					*priv)
+{
+	struct xfs_alloc_query_range_info	query;
+
+	ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
+	query.priv = priv;
+	query.fn = fn;
+	return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
+}
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 6c2643c4abd3..77d9c27330ab 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -228,5 +228,7 @@ int xfs_alloc_query_range(struct xfs_btree_cur *cur,
 		struct xfs_alloc_rec_incore *low_rec,
 		struct xfs_alloc_rec_incore *high_rec,
 		xfs_alloc_query_range_fn fn, void *priv);
+int xfs_alloc_query_all(struct xfs_btree_cur *cur, xfs_alloc_query_range_fn fn,
+		void *priv);
 
 #endif	/* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index c3decedc9455..92aa20d56553 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -4842,6 +4842,21 @@ xfs_btree_query_range(
 			fn, priv);
 }
 
+/* Query a btree for all records. */
+int
+xfs_btree_query_all(
+	struct xfs_btree_cur		*cur,
+	xfs_btree_query_range_fn	fn,
+	void				*priv)
+{
+	union xfs_btree_irec		low_rec;
+	union xfs_btree_irec		high_rec;
+
+	memset(&low_rec, 0, sizeof(low_rec));
+	memset(&high_rec, 0xFF, sizeof(high_rec));
+	return xfs_btree_query_range(cur, &low_rec, &high_rec, fn, priv);
+}
+
 /*
  * Calculate the number of blocks needed to store a given number of records
  * in a short-format (per-AG metadata) btree.
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 4bb62580a7fd..27bed08261c5 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -496,6 +496,8 @@ typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur,
 int xfs_btree_query_range(struct xfs_btree_cur *cur,
 		union xfs_btree_irec *low_rec, union xfs_btree_irec *high_rec,
 		xfs_btree_query_range_fn fn, void *priv);
+int xfs_btree_query_all(struct xfs_btree_cur *cur, xfs_btree_query_range_fn fn,
+		void *priv);
 
 typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level,
 		void *data);
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 3a8cc7139912..384055668374 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -2001,14 +2001,14 @@ xfs_rmap_query_range_helper(
 /* Find all rmaps between two keys. */
 int
 xfs_rmap_query_range(
-	struct xfs_btree_cur		*cur,
-	struct xfs_rmap_irec		*low_rec,
-	struct xfs_rmap_irec		*high_rec,
-	xfs_rmap_query_range_fn	fn,
-	void				*priv)
+	struct xfs_btree_cur			*cur,
+	struct xfs_rmap_irec			*low_rec,
+	struct xfs_rmap_irec			*high_rec,
+	xfs_rmap_query_range_fn			fn,
+	void					*priv)
 {
-	union xfs_btree_irec		low_brec;
-	union xfs_btree_irec		high_brec;
+	union xfs_btree_irec			low_brec;
+	union xfs_btree_irec			high_brec;
 	struct xfs_rmap_query_range_info	query;
 
 	low_brec.r = *low_rec;
@@ -2019,6 +2019,20 @@ xfs_rmap_query_range(
 			xfs_rmap_query_range_helper, &query);
 }
 
+/* Find all rmaps. */
+int
+xfs_rmap_query_all(
+	struct xfs_btree_cur			*cur,
+	xfs_rmap_query_range_fn			fn,
+	void					*priv)
+{
+	struct xfs_rmap_query_range_info	query;
+
+	query.priv = priv;
+	query.fn = fn;
+	return xfs_btree_query_all(cur, xfs_rmap_query_range_helper, &query);
+}
+
 /* Clean up after calling xfs_rmap_finish_one. */
 void
 xfs_rmap_finish_one_cleanup(
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 789930599339..faf2c1a2e63b 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -162,6 +162,8 @@ typedef int (*xfs_rmap_query_range_fn)(
 int xfs_rmap_query_range(struct xfs_btree_cur *cur,
 		struct xfs_rmap_irec *low_rec, struct xfs_rmap_irec *high_rec,
 		xfs_rmap_query_range_fn fn, void *priv);
+int xfs_rmap_query_all(struct xfs_btree_cur *cur, xfs_rmap_query_range_fn fn,
+		void *priv);
 
 enum xfs_rmap_intent_type {
 	XFS_RMAP_MAP,
-- 
cgit v1.2.3


From fb3c3de2f65c007f3ee50538ea131f5c4603c7bc Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Tue, 28 Mar 2017 14:56:36 -0700
Subject: xfs: add a couple of queries to iterate free extents in the rtbitmap

Add _query_range and _query_all functions to the realtime bitmap
allocator.  These two functions are similar in usage to the btree
functions with the same name and will be used for getfsmap and scrub.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/libxfs/xfs_rtbitmap.c | 70 ++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_rtalloc.h         | 22 ++++++++++++--
 2 files changed, 90 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index ea45584a9913..e47b99e59f60 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -1016,3 +1016,73 @@ xfs_rtfree_extent(
 	}
 	return 0;
 }
+
+/* Find all the free records within a given range. */
+int
+xfs_rtalloc_query_range(
+	struct xfs_trans		*tp,
+	struct xfs_rtalloc_rec		*low_rec,
+	struct xfs_rtalloc_rec		*high_rec,
+	xfs_rtalloc_query_range_fn	fn,
+	void				*priv)
+{
+	struct xfs_rtalloc_rec		rec;
+	struct xfs_mount		*mp = tp->t_mountp;
+	xfs_rtblock_t			rtstart;
+	xfs_rtblock_t			rtend;
+	xfs_rtblock_t			rem;
+	int				is_free;
+	int				error = 0;
+
+	if (low_rec->ar_startblock > high_rec->ar_startblock)
+		return -EINVAL;
+	else if (low_rec->ar_startblock == high_rec->ar_startblock)
+		return 0;
+
+	/* Iterate the bitmap, looking for discrepancies. */
+	rtstart = low_rec->ar_startblock;
+	rem = high_rec->ar_startblock - rtstart;
+	while (rem) {
+		/* Is the first block free? */
+		error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend,
+				&is_free);
+		if (error)
+			break;
+
+		/* How long does the extent go for? */
+		error = xfs_rtfind_forw(mp, tp, rtstart,
+				high_rec->ar_startblock - 1, &rtend);
+		if (error)
+			break;
+
+		if (is_free) {
+			rec.ar_startblock = rtstart;
+			rec.ar_blockcount = rtend - rtstart + 1;
+
+			error = fn(tp, &rec, priv);
+			if (error)
+				break;
+		}
+
+		rem -= rtend - rtstart + 1;
+		rtstart = rtend + 1;
+	}
+
+	return error;
+}
+
+/* Find all the free records. */
+int
+xfs_rtalloc_query_all(
+	struct xfs_trans		*tp,
+	xfs_rtalloc_query_range_fn	fn,
+	void				*priv)
+{
+	struct xfs_rtalloc_rec		keys[2];
+
+	keys[0].ar_startblock = 0;
+	keys[1].ar_startblock = tp->t_mountp->m_sb.sb_rblocks;
+	keys[0].ar_blockcount = keys[1].ar_blockcount = 0;
+
+	return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv);
+}
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index 51dd3c726608..f13133e6f19f 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -23,6 +23,16 @@
 struct xfs_mount;
 struct xfs_trans;
 
+struct xfs_rtalloc_rec {
+	xfs_rtblock_t		ar_startblock;
+	xfs_rtblock_t		ar_blockcount;
+};
+
+typedef int (*xfs_rtalloc_query_range_fn)(
+	struct xfs_trans	*tp,
+	struct xfs_rtalloc_rec	*rec,
+	void			*priv);
+
 #ifdef CONFIG_XFS_RT
 /*
  * Function prototypes for exported functions.
@@ -118,13 +128,21 @@ int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log,
 int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp,
 		     xfs_rtblock_t start, xfs_extlen_t len,
 		     struct xfs_buf **rbpp, xfs_fsblock_t *rsb);
-
-
+int xfs_rtalloc_query_range(struct xfs_trans *tp,
+			    struct xfs_rtalloc_rec *low_rec,
+			    struct xfs_rtalloc_rec *high_rec,
+			    xfs_rtalloc_query_range_fn fn,
+			    void *priv);
+int xfs_rtalloc_query_all(struct xfs_trans *tp,
+			  xfs_rtalloc_query_range_fn fn,
+			  void *priv);
 #else
 # define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb)    (ENOSYS)
 # define xfs_rtfree_extent(t,b,l)                       (ENOSYS)
 # define xfs_rtpick_extent(m,t,l,rb)                    (ENOSYS)
 # define xfs_growfs_rt(mp,in)                           (ENOSYS)
+# define xfs_rtalloc_query_range(t,l,h,f,p)             (ENOSYS)
+# define xfs_rtalloc_query_all(t,f,p)                   (ENOSYS)
 static inline int		/* error */
 xfs_rtmount_init(
 	xfs_mount_t	*mp)	/* file system mount structure */
-- 
cgit v1.2.3


From e89c041338ed6ef2694e6465ca1ba033e0a2978c Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Tue, 28 Mar 2017 14:56:37 -0700
Subject: xfs: implement the GETFSMAP ioctl

Introduce a new ioctl that uses the reverse mapping btree to return
information about the physical layout of the filesystem.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/Makefile          |   1 +
 fs/xfs/libxfs/xfs_fs.h   |  13 +
 fs/xfs/libxfs/xfs_rmap.c |  28 ++
 fs/xfs/libxfs/xfs_rmap.h |   2 +
 fs/xfs/xfs_fsmap.c       | 756 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/xfs/xfs_fsmap.h       |  53 ++++
 fs/xfs/xfs_ioctl.c       |  84 ++++++
 fs/xfs/xfs_ioctl32.c     |   2 +
 fs/xfs/xfs_trace.c       |   1 +
 fs/xfs/xfs_trace.h       |  84 ++++++
 fs/xfs/xfs_trans.c       |  22 ++
 fs/xfs/xfs_trans.h       |   2 +
 12 files changed, 1048 insertions(+)
 create mode 100644 fs/xfs/xfs_fsmap.c
 create mode 100644 fs/xfs/xfs_fsmap.h

diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 26ef1958b65b..5c90f82b8f6b 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -79,6 +79,7 @@ xfs-y				+= xfs_aops.o \
 				   xfs_extent_busy.o \
 				   xfs_file.o \
 				   xfs_filestream.o \
+				   xfs_fsmap.o \
 				   xfs_fsops.o \
 				   xfs_globals.o \
 				   xfs_icache.o \
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index b72dc821d78b..095bdf049a3f 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -92,6 +92,18 @@ struct getbmapx {
 #define BMV_OF_LAST		0x4	/* segment is the last in the file */
 #define BMV_OF_SHARED		0x8	/* segment shared with another file */
 
+/*	fmr_owner special values for FS_IOC_GETFSMAP */
+#define XFS_FMR_OWN_FREE	FMR_OWN_FREE      /* free space */
+#define XFS_FMR_OWN_UNKNOWN	FMR_OWN_UNKNOWN   /* unknown owner */
+#define XFS_FMR_OWN_FS		FMR_OWNER('X', 1) /* static fs metadata */
+#define XFS_FMR_OWN_LOG		FMR_OWNER('X', 2) /* journalling log */
+#define XFS_FMR_OWN_AG		FMR_OWNER('X', 3) /* per-AG metadata */
+#define XFS_FMR_OWN_INOBT	FMR_OWNER('X', 4) /* inode btree blocks */
+#define XFS_FMR_OWN_INODES	FMR_OWNER('X', 5) /* inodes */
+#define XFS_FMR_OWN_REFC	FMR_OWNER('X', 6) /* refcount tree */
+#define XFS_FMR_OWN_COW		FMR_OWNER('X', 7) /* cow staging */
+#define XFS_FMR_OWN_DEFECTIVE	FMR_OWNER('X', 8) /* bad blocks */
+
 /*
  * Structure for XFS_IOC_FSSETDM.
  * For use by backup and restore programs to set the XFS on-disk inode
@@ -502,6 +514,7 @@ typedef struct xfs_swapext
 #define XFS_IOC_GETBMAPX	_IOWR('X', 56, struct getbmap)
 #define XFS_IOC_ZERO_RANGE	_IOW ('X', 57, struct xfs_flock64)
 #define XFS_IOC_FREE_EOFBLOCKS	_IOR ('X', 58, struct xfs_fs_eofblocks)
+/*	XFS_IOC_GETFSMAP ------ hoisted 59         */
 
 /*
  * ioctl commands that replace IRIX syssgi()'s
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 384055668374..06cfb93c2ef9 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -2305,3 +2305,31 @@ xfs_rmap_free_extent(
 	return __xfs_rmap_add(mp, dfops, XFS_RMAP_FREE, owner,
 			XFS_DATA_FORK, &bmap);
 }
+
+/* Compare rmap records.  Returns -1 if a < b, 1 if a > b, and 0 if equal. */
+int
+xfs_rmap_compare(
+	const struct xfs_rmap_irec	*a,
+	const struct xfs_rmap_irec	*b)
+{
+	__u64				oa;
+	__u64				ob;
+
+	oa = xfs_rmap_irec_offset_pack(a);
+	ob = xfs_rmap_irec_offset_pack(b);
+
+	if (a->rm_startblock < b->rm_startblock)
+		return -1;
+	else if (a->rm_startblock > b->rm_startblock)
+		return 1;
+	else if (a->rm_owner < b->rm_owner)
+		return -1;
+	else if (a->rm_owner > b->rm_owner)
+		return 1;
+	else if (oa < ob)
+		return -1;
+	else if (oa > ob)
+		return 1;
+	else
+		return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index faf2c1a2e63b..98f908fea103 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -214,5 +214,7 @@ int xfs_rmap_find_left_neighbor(struct xfs_btree_cur *cur, xfs_agblock_t bno,
 int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno,
 		uint64_t owner, uint64_t offset, unsigned int flags,
 		struct xfs_rmap_irec *irec, int	*stat);
+int xfs_rmap_compare(const struct xfs_rmap_irec *a,
+		const struct xfs_rmap_irec *b);
 
 #endif	/* __XFS_RMAP_H__ */
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
new file mode 100644
index 000000000000..24b3e53379bf
--- /dev/null
+++ b/fs/xfs/xfs_fsmap.c
@@ -0,0 +1,756 @@
+/*
+ * Copyright (C) 2017 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_error.h"
+#include "xfs_btree.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_rmap.h"
+#include "xfs_alloc.h"
+#include "xfs_bit.h"
+#include <linux/fsmap.h>
+#include "xfs_fsmap.h"
+#include "xfs_refcount.h"
+#include "xfs_refcount_btree.h"
+
+/* Convert an xfs_fsmap to an fsmap. */
+void
+xfs_fsmap_from_internal(
+	struct fsmap		*dest,
+	struct xfs_fsmap	*src)
+{
+	dest->fmr_device = src->fmr_device;
+	dest->fmr_flags = src->fmr_flags;
+	dest->fmr_physical = BBTOB(src->fmr_physical);
+	dest->fmr_owner = src->fmr_owner;
+	dest->fmr_offset = BBTOB(src->fmr_offset);
+	dest->fmr_length = BBTOB(src->fmr_length);
+	dest->fmr_reserved[0] = 0;
+	dest->fmr_reserved[1] = 0;
+	dest->fmr_reserved[2] = 0;
+}
+
+/* Convert an fsmap to an xfs_fsmap. */
+void
+xfs_fsmap_to_internal(
+	struct xfs_fsmap	*dest,
+	struct fsmap		*src)
+{
+	dest->fmr_device = src->fmr_device;
+	dest->fmr_flags = src->fmr_flags;
+	dest->fmr_physical = BTOBBT(src->fmr_physical);
+	dest->fmr_owner = src->fmr_owner;
+	dest->fmr_offset = BTOBBT(src->fmr_offset);
+	dest->fmr_length = BTOBBT(src->fmr_length);
+}
+
+/* Convert an fsmap owner into an rmapbt owner. */
+static int
+xfs_fsmap_owner_to_rmap(
+	struct xfs_rmap_irec	*dest,
+	struct xfs_fsmap	*src)
+{
+	if (!(src->fmr_flags & FMR_OF_SPECIAL_OWNER)) {
+		dest->rm_owner = src->fmr_owner;
+		return 0;
+	}
+
+	switch (src->fmr_owner) {
+	case 0:			/* "lowest owner id possible" */
+	case -1ULL:		/* "highest owner id possible" */
+		dest->rm_owner = 0;
+		break;
+	case XFS_FMR_OWN_FREE:
+		dest->rm_owner = XFS_RMAP_OWN_NULL;
+		break;
+	case XFS_FMR_OWN_UNKNOWN:
+		dest->rm_owner = XFS_RMAP_OWN_UNKNOWN;
+		break;
+	case XFS_FMR_OWN_FS:
+		dest->rm_owner = XFS_RMAP_OWN_FS;
+		break;
+	case XFS_FMR_OWN_LOG:
+		dest->rm_owner = XFS_RMAP_OWN_LOG;
+		break;
+	case XFS_FMR_OWN_AG:
+		dest->rm_owner = XFS_RMAP_OWN_AG;
+		break;
+	case XFS_FMR_OWN_INOBT:
+		dest->rm_owner = XFS_RMAP_OWN_INOBT;
+		break;
+	case XFS_FMR_OWN_INODES:
+		dest->rm_owner = XFS_RMAP_OWN_INODES;
+		break;
+	case XFS_FMR_OWN_REFC:
+		dest->rm_owner = XFS_RMAP_OWN_REFC;
+		break;
+	case XFS_FMR_OWN_COW:
+		dest->rm_owner = XFS_RMAP_OWN_COW;
+		break;
+	case XFS_FMR_OWN_DEFECTIVE:	/* not implemented */
+		/* fall through */
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/* Convert an rmapbt owner into an fsmap owner. */
+static int
+xfs_fsmap_owner_from_rmap(
+	struct xfs_fsmap	*dest,
+	struct xfs_rmap_irec	*src)
+{
+	dest->fmr_flags = 0;
+	if (!XFS_RMAP_NON_INODE_OWNER(src->rm_owner)) {
+		dest->fmr_owner = src->rm_owner;
+		return 0;
+	}
+	dest->fmr_flags |= FMR_OF_SPECIAL_OWNER;
+
+	switch (src->rm_owner) {
+	case XFS_RMAP_OWN_FS:
+		dest->fmr_owner = XFS_FMR_OWN_FS;
+		break;
+	case XFS_RMAP_OWN_LOG:
+		dest->fmr_owner = XFS_FMR_OWN_LOG;
+		break;
+	case XFS_RMAP_OWN_AG:
+		dest->fmr_owner = XFS_FMR_OWN_AG;
+		break;
+	case XFS_RMAP_OWN_INOBT:
+		dest->fmr_owner = XFS_FMR_OWN_INOBT;
+		break;
+	case XFS_RMAP_OWN_INODES:
+		dest->fmr_owner = XFS_FMR_OWN_INODES;
+		break;
+	case XFS_RMAP_OWN_REFC:
+		dest->fmr_owner = XFS_FMR_OWN_REFC;
+		break;
+	case XFS_RMAP_OWN_COW:
+		dest->fmr_owner = XFS_FMR_OWN_COW;
+		break;
+	default:
+		return -EFSCORRUPTED;
+	}
+	return 0;
+}
+
+/* getfsmap query state */
+struct xfs_getfsmap_info {
+	struct xfs_fsmap_head	*head;
+	xfs_fsmap_format_t	formatter;	/* formatting fn */
+	void			*format_arg;	/* format buffer */
+	struct xfs_buf		*agf_bp;	/* AGF, for refcount queries */
+	xfs_daddr_t		next_daddr;	/* next daddr we expect */
+	u64			missing_owner;	/* owner of holes */
+	u32			dev;		/* device id */
+	xfs_agnumber_t		agno;		/* AG number, if applicable */
+	struct xfs_rmap_irec	low;		/* low rmap key */
+	struct xfs_rmap_irec	high;		/* high rmap key */
+	bool			last;		/* last extent? */
+};
+
+/* Associate a device with a getfsmap handler. */
+struct xfs_getfsmap_dev {
+	u32			dev;
+	int			(*fn)(struct xfs_trans *tp,
+				      struct xfs_fsmap *keys,
+				      struct xfs_getfsmap_info *info);
+};
+
+/* Compare two getfsmap device handlers. */
+static int
+xfs_getfsmap_dev_compare(
+	const void			*p1,
+	const void			*p2)
+{
+	const struct xfs_getfsmap_dev	*d1 = p1;
+	const struct xfs_getfsmap_dev	*d2 = p2;
+
+	return d1->dev - d2->dev;
+}
+
+/* Decide if this mapping is shared. */
+STATIC int
+xfs_getfsmap_is_shared(
+	struct xfs_trans		*tp,
+	struct xfs_getfsmap_info	*info,
+	struct xfs_rmap_irec		*rec,
+	bool				*stat)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_btree_cur		*cur;
+	xfs_agblock_t			fbno;
+	xfs_extlen_t			flen;
+	int				error;
+
+	*stat = false;
+	if (!xfs_sb_version_hasreflink(&mp->m_sb))
+		return 0;
+	/* rt files will have agno set to NULLAGNUMBER */
+	if (info->agno == NULLAGNUMBER)
+		return 0;
+
+	/* Are there any shared blocks here? */
+	flen = 0;
+	cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp,
+			info->agno, NULL);
+
+	error = xfs_refcount_find_shared(cur, rec->rm_startblock,
+			rec->rm_blockcount, &fbno, &flen, false);
+
+	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	if (error)
+		return error;
+
+	*stat = flen > 0;
+	return 0;
+}
+
+/*
+ * Format a reverse mapping for getfsmap, having translated rm_startblock
+ * into the appropriate daddr units.
+ */
+STATIC int
+xfs_getfsmap_helper(
+	struct xfs_trans		*tp,
+	struct xfs_getfsmap_info	*info,
+	struct xfs_rmap_irec		*rec,
+	xfs_daddr_t			rec_daddr)
+{
+	struct xfs_fsmap		fmr;
+	struct xfs_mount		*mp = tp->t_mountp;
+	bool				shared;
+	int				error;
+
+	if (fatal_signal_pending(current))
+		return -EINTR;
+
+	/*
+	 * Filter out records that start before our startpoint, if the
+	 * caller requested that.
+	 */
+	if (xfs_rmap_compare(rec, &info->low) < 0) {
+		rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
+		if (info->next_daddr < rec_daddr)
+			info->next_daddr = rec_daddr;
+		return XFS_BTREE_QUERY_RANGE_CONTINUE;
+	}
+
+	/* Are we just counting mappings? */
+	if (info->head->fmh_count == 0) {
+		if (rec_daddr > info->next_daddr)
+			info->head->fmh_entries++;
+
+		if (info->last)
+			return XFS_BTREE_QUERY_RANGE_CONTINUE;
+
+		info->head->fmh_entries++;
+
+		rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
+		if (info->next_daddr < rec_daddr)
+			info->next_daddr = rec_daddr;
+		return XFS_BTREE_QUERY_RANGE_CONTINUE;
+	}
+
+	/*
+	 * If the record starts past the last physical block we saw,
+	 * then we've found a gap.  Report the gap as being owned by
+	 * whatever the caller specified is the missing owner.
+	 */
+	if (rec_daddr > info->next_daddr) {
+		if (info->head->fmh_entries >= info->head->fmh_count)
+			return XFS_BTREE_QUERY_RANGE_ABORT;
+
+		fmr.fmr_device = info->dev;
+		fmr.fmr_physical = info->next_daddr;
+		fmr.fmr_owner = info->missing_owner;
+		fmr.fmr_offset = 0;
+		fmr.fmr_length = rec_daddr - info->next_daddr;
+		fmr.fmr_flags = FMR_OF_SPECIAL_OWNER;
+		error = info->formatter(&fmr, info->format_arg);
+		if (error)
+			return error;
+		info->head->fmh_entries++;
+	}
+
+	if (info->last)
+		goto out;
+
+	/* Fill out the extent we found */
+	if (info->head->fmh_entries >= info->head->fmh_count)
+		return XFS_BTREE_QUERY_RANGE_ABORT;
+
+	trace_xfs_fsmap_mapping(mp, info->dev, info->agno, rec);
+
+	fmr.fmr_device = info->dev;
+	fmr.fmr_physical = rec_daddr;
+	error = xfs_fsmap_owner_from_rmap(&fmr, rec);
+	if (error)
+		return error;
+	fmr.fmr_offset = XFS_FSB_TO_BB(mp, rec->rm_offset);
+	fmr.fmr_length = XFS_FSB_TO_BB(mp, rec->rm_blockcount);
+	if (rec->rm_flags & XFS_RMAP_UNWRITTEN)
+		fmr.fmr_flags |= FMR_OF_PREALLOC;
+	if (rec->rm_flags & XFS_RMAP_ATTR_FORK)
+		fmr.fmr_flags |= FMR_OF_ATTR_FORK;
+	if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK)
+		fmr.fmr_flags |= FMR_OF_EXTENT_MAP;
+	if (fmr.fmr_flags == 0) {
+		error = xfs_getfsmap_is_shared(tp, info, rec, &shared);
+		if (error)
+			return error;
+		if (shared)
+			fmr.fmr_flags |= FMR_OF_SHARED;
+	}
+	error = info->formatter(&fmr, info->format_arg);
+	if (error)
+		return error;
+	info->head->fmh_entries++;
+
+out:
+	rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount);
+	if (info->next_daddr < rec_daddr)
+		info->next_daddr = rec_daddr;
+	return XFS_BTREE_QUERY_RANGE_CONTINUE;
+}
+
+/* Transform a rmapbt irec into a fsmap */
+STATIC int
+xfs_getfsmap_datadev_helper(
+	struct xfs_btree_cur		*cur,
+	struct xfs_rmap_irec		*rec,
+	void				*priv)
+{
+	struct xfs_mount		*mp = cur->bc_mp;
+	struct xfs_getfsmap_info	*info = priv;
+	xfs_fsblock_t			fsb;
+	xfs_daddr_t			rec_daddr;
+
+	fsb = XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, rec->rm_startblock);
+	rec_daddr = XFS_FSB_TO_DADDR(mp, fsb);
+
+	return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr);
+}
+
+/* Set rmap flags based on the getfsmap flags */
+static void
+xfs_getfsmap_set_irec_flags(
+	struct xfs_rmap_irec	*irec,
+	struct xfs_fsmap	*fmr)
+{
+	irec->rm_flags = 0;
+	if (fmr->fmr_flags & FMR_OF_ATTR_FORK)
+		irec->rm_flags |= XFS_RMAP_ATTR_FORK;
+	if (fmr->fmr_flags & FMR_OF_EXTENT_MAP)
+		irec->rm_flags |= XFS_RMAP_BMBT_BLOCK;
+	if (fmr->fmr_flags & FMR_OF_PREALLOC)
+		irec->rm_flags |= XFS_RMAP_UNWRITTEN;
+}
+
+/* Execute a getfsmap query against the log device. */
+STATIC int
+xfs_getfsmap_logdev(
+	struct xfs_trans		*tp,
+	struct xfs_fsmap		*keys,
+	struct xfs_getfsmap_info	*info)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_rmap_irec		rmap;
+	int				error;
+
+	/* Set up search keys */
+	info->low.rm_startblock = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical);
+	info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
+	error = xfs_fsmap_owner_to_rmap(&info->low, keys);
+	if (error)
+		return error;
+	info->low.rm_blockcount = 0;
+	xfs_getfsmap_set_irec_flags(&info->low, &keys[0]);
+
+	error = xfs_fsmap_owner_to_rmap(&info->high, keys + 1);
+	if (error)
+		return error;
+	info->high.rm_startblock = -1U;
+	info->high.rm_owner = ULLONG_MAX;
+	info->high.rm_offset = ULLONG_MAX;
+	info->high.rm_blockcount = 0;
+	info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS;
+	info->missing_owner = XFS_FMR_OWN_FREE;
+
+	trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low);
+	trace_xfs_fsmap_high_key(mp, info->dev, info->agno, &info->high);
+
+	if (keys[0].fmr_physical > 0)
+		return 0;
+
+	/* Fabricate an rmap entry for the external log device. */
+	rmap.rm_startblock = 0;
+	rmap.rm_blockcount = mp->m_sb.sb_logblocks;
+	rmap.rm_owner = XFS_RMAP_OWN_LOG;
+	rmap.rm_offset = 0;
+	rmap.rm_flags = 0;
+
+	return xfs_getfsmap_helper(tp, info, &rmap, 0);
+}
+
+/* Execute a getfsmap query against the regular data device. */
+STATIC int
+__xfs_getfsmap_datadev(
+	struct xfs_trans		*tp,
+	struct xfs_fsmap		*keys,
+	struct xfs_getfsmap_info	*info,
+	int				(*query_fn)(struct xfs_trans *,
+						    struct xfs_getfsmap_info *,
+						    struct xfs_btree_cur **,
+						    void *),
+	void				*priv)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_btree_cur		*bt_cur = NULL;
+	xfs_fsblock_t			start_fsb;
+	xfs_fsblock_t			end_fsb;
+	xfs_agnumber_t			start_ag;
+	xfs_agnumber_t			end_ag;
+	xfs_daddr_t			eofs;
+	int				error = 0;
+
+	eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
+	if (keys[0].fmr_physical >= eofs)
+		return 0;
+	if (keys[1].fmr_physical >= eofs)
+		keys[1].fmr_physical = eofs - 1;
+	start_fsb = XFS_DADDR_TO_FSB(mp, keys[0].fmr_physical);
+	end_fsb = XFS_DADDR_TO_FSB(mp, keys[1].fmr_physical);
+
+	/*
+	 * Convert the fsmap low/high keys to AG based keys.  Initialize
+	 * low to the fsmap low key and max out the high key to the end
+	 * of the AG.
+	 */
+	info->low.rm_startblock = XFS_FSB_TO_AGBNO(mp, start_fsb);
+	info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
+	error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]);
+	if (error)
+		return error;
+	info->low.rm_blockcount = 0;
+	xfs_getfsmap_set_irec_flags(&info->low, &keys[0]);
+
+	info->high.rm_startblock = -1U;
+	info->high.rm_owner = ULLONG_MAX;
+	info->high.rm_offset = ULLONG_MAX;
+	info->high.rm_blockcount = 0;
+	info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS;
+
+	start_ag = XFS_FSB_TO_AGNO(mp, start_fsb);
+	end_ag = XFS_FSB_TO_AGNO(mp, end_fsb);
+
+	/* Query each AG */
+	for (info->agno = start_ag; info->agno <= end_ag; info->agno++) {
+		/*
+		 * Set the AG high key from the fsmap high key if this
+		 * is the last AG that we're querying.
+		 */
+		if (info->agno == end_ag) {
+			info->high.rm_startblock = XFS_FSB_TO_AGBNO(mp,
+					end_fsb);
+			info->high.rm_offset = XFS_BB_TO_FSBT(mp,
+					keys[1].fmr_offset);
+			error = xfs_fsmap_owner_to_rmap(&info->high, &keys[1]);
+			if (error)
+				goto err;
+			xfs_getfsmap_set_irec_flags(&info->high, &keys[1]);
+		}
+
+		if (bt_cur) {
+			xfs_btree_del_cursor(bt_cur, XFS_BTREE_NOERROR);
+			bt_cur = NULL;
+			xfs_trans_brelse(tp, info->agf_bp);
+			info->agf_bp = NULL;
+		}
+
+		error = xfs_alloc_read_agf(mp, tp, info->agno, 0,
+				&info->agf_bp);
+		if (error)
+			goto err;
+
+		trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low);
+		trace_xfs_fsmap_high_key(mp, info->dev, info->agno,
+				&info->high);
+
+		error = query_fn(tp, info, &bt_cur, priv);
+		if (error)
+			goto err;
+
+		/*
+		 * Set the AG low key to the start of the AG prior to
+		 * moving on to the next AG.
+		 */
+		if (info->agno == start_ag) {
+			info->low.rm_startblock = 0;
+			info->low.rm_owner = 0;
+			info->low.rm_offset = 0;
+			info->low.rm_flags = 0;
+		}
+	}
+
+	/* Report any gap at the end of the AG */
+	info->last = true;
+	error = query_fn(tp, info, &bt_cur, priv);
+	if (error)
+		goto err;
+
+err:
+	if (bt_cur)
+		xfs_btree_del_cursor(bt_cur, error < 0 ? XFS_BTREE_ERROR :
+							 XFS_BTREE_NOERROR);
+	if (info->agf_bp) {
+		xfs_trans_brelse(tp, info->agf_bp);
+		info->agf_bp = NULL;
+	}
+
+	return error;
+}
+
+/* Actually query the rmap btree. */
+STATIC int
+xfs_getfsmap_datadev_rmapbt_query(
+	struct xfs_trans		*tp,
+	struct xfs_getfsmap_info	*info,
+	struct xfs_btree_cur		**curpp,
+	void				*priv)
+{
+	/* Report any gap at the end of the last AG. */
+	if (info->last)
+		return xfs_getfsmap_datadev_helper(*curpp, &info->high, info);
+
+	/* Allocate cursor for this AG and query_range it. */
+	*curpp = xfs_rmapbt_init_cursor(tp->t_mountp, tp, info->agf_bp,
+			info->agno);
+	return xfs_rmap_query_range(*curpp, &info->low, &info->high,
+			xfs_getfsmap_datadev_helper, info);
+}
+
+/* Execute a getfsmap query against the regular data device rmapbt. */
+STATIC int
+xfs_getfsmap_datadev_rmapbt(
+	struct xfs_trans		*tp,
+	struct xfs_fsmap		*keys,
+	struct xfs_getfsmap_info	*info)
+{
+	info->missing_owner = XFS_FMR_OWN_FREE;
+	return __xfs_getfsmap_datadev(tp, keys, info,
+			xfs_getfsmap_datadev_rmapbt_query, NULL);
+}
+
+/* Do we recognize the device? */
+STATIC bool
+xfs_getfsmap_is_valid_device(
+	struct xfs_mount	*mp,
+	struct xfs_fsmap	*fm)
+{
+	if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX ||
+	    fm->fmr_device == new_encode_dev(mp->m_ddev_targp->bt_dev))
+		return true;
+	if (mp->m_logdev_targp &&
+	    fm->fmr_device == new_encode_dev(mp->m_logdev_targp->bt_dev))
+		return true;
+	return false;
+}
+
+/* Ensure that the low key is less than the high key. */
+STATIC bool
+xfs_getfsmap_check_keys(
+	struct xfs_fsmap		*low_key,
+	struct xfs_fsmap		*high_key)
+{
+	if (low_key->fmr_device > high_key->fmr_device)
+		return false;
+	if (low_key->fmr_device < high_key->fmr_device)
+		return true;
+
+	if (low_key->fmr_physical > high_key->fmr_physical)
+		return false;
+	if (low_key->fmr_physical < high_key->fmr_physical)
+		return true;
+
+	if (low_key->fmr_owner > high_key->fmr_owner)
+		return false;
+	if (low_key->fmr_owner < high_key->fmr_owner)
+		return true;
+
+	if (low_key->fmr_offset > high_key->fmr_offset)
+		return false;
+	if (low_key->fmr_offset < high_key->fmr_offset)
+		return true;
+
+	return false;
+}
+
+#define XFS_GETFSMAP_DEVS	2
+/*
+ * Get filesystem's extents as described in head, and format for
+ * output.  Calls formatter to fill the user's buffer until all
+ * extents are mapped, until the passed-in head->fmh_count slots have
+ * been filled, or until the formatter short-circuits the loop, if it
+ * is tracking filled-in extents on its own.
+ *
+ * Key to Confusion
+ * ----------------
+ * There are multiple levels of keys and counters at work here:
+ * xfs_fsmap_head.fmh_keys	-- low and high fsmap keys passed in;
+ * 				   these reflect fs-wide sector addrs.
+ * dkeys			-- fmh_keys used to query each device;
+ * 				   these are fmh_keys but w/ the low key
+ * 				   bumped up by fmr_length.
+ * xfs_getfsmap_info.next_daddr	-- next disk addr we expect to see; this
+ *				   is how we detect gaps in the fsmap
+				   records and report them.
+ * xfs_getfsmap_info.low/high	-- per-AG low/high keys computed from
+ * 				   dkeys; used to query the metadata.
+ */
+int
+xfs_getfsmap(
+	struct xfs_mount		*mp,
+	struct xfs_fsmap_head		*head,
+	xfs_fsmap_format_t		formatter,
+	void				*arg)
+{
+	struct xfs_trans		*tp = NULL;
+	struct xfs_fsmap		dkeys[2];	/* per-dev keys */
+	struct xfs_getfsmap_dev		handlers[XFS_GETFSMAP_DEVS];
+	struct xfs_getfsmap_info	info = {0};
+	int				i;
+	int				error = 0;
+
+	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+		return -EOPNOTSUPP;
+	if (head->fmh_iflags & ~FMH_IF_VALID)
+		return -EINVAL;
+	if (!xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[0]) ||
+	    !xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[1]))
+		return -EINVAL;
+
+	head->fmh_entries = 0;
+
+	/* Set up our device handlers. */
+	memset(handlers, 0, sizeof(handlers));
+	handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev);
+	handlers[0].fn = xfs_getfsmap_datadev_rmapbt;
+	if (mp->m_logdev_targp != mp->m_ddev_targp) {
+		handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev);
+		handlers[1].fn = xfs_getfsmap_logdev;
+	}
+
+	xfs_sort(handlers, XFS_GETFSMAP_DEVS, sizeof(struct xfs_getfsmap_dev),
+			xfs_getfsmap_dev_compare);
+
+	/*
+	 * To continue where we left off, we allow userspace to use the
+	 * last mapping from a previous call as the low key of the next.
+	 * This is identified by a non-zero length in the low key. We
+	 * have to increment the low key in this scenario to ensure we
+	 * don't return the same mapping again, and instead return the
+	 * very next mapping.
+	 *
+	 * If the low key mapping refers to file data, the same physical
+	 * blocks could be mapped to several other files/offsets.
+	 * According to rmapbt record ordering, the minimal next
+	 * possible record for the block range is the next starting
+	 * offset in the same inode. Therefore, bump the file offset to
+	 * continue the search appropriately.  For all other low key
+	 * mapping types (attr blocks, metadata), bump the physical
+	 * offset as there can be no other mapping for the same physical
+	 * block range.
+	 */
+	dkeys[0] = head->fmh_keys[0];
+	if (dkeys[0].fmr_flags & (FMR_OF_SPECIAL_OWNER | FMR_OF_EXTENT_MAP)) {
+		dkeys[0].fmr_physical += dkeys[0].fmr_length;
+		dkeys[0].fmr_owner = 0;
+		if (dkeys[0].fmr_offset)
+			return -EINVAL;
+	} else
+		dkeys[0].fmr_offset += dkeys[0].fmr_length;
+	dkeys[0].fmr_length = 0;
+	memset(&dkeys[1], 0xFF, sizeof(struct xfs_fsmap));
+
+	if (!xfs_getfsmap_check_keys(dkeys, &head->fmh_keys[1]))
+		return -EINVAL;
+
+	info.next_daddr = head->fmh_keys[0].fmr_physical +
+			  head->fmh_keys[0].fmr_length;
+	info.formatter = formatter;
+	info.format_arg = arg;
+	info.head = head;
+
+	/* For each device we support... */
+	for (i = 0; i < XFS_GETFSMAP_DEVS; i++) {
+		/* Is this device within the range the user asked for? */
+		if (!handlers[i].fn)
+			continue;
+		if (head->fmh_keys[0].fmr_device > handlers[i].dev)
+			continue;
+		if (head->fmh_keys[1].fmr_device < handlers[i].dev)
+			break;
+
+		/*
+		 * If this device number matches the high key, we have
+		 * to pass the high key to the handler to limit the
+		 * query results.  If the device number exceeds the
+		 * low key, zero out the low key so that we get
+		 * everything from the beginning.
+		 */
+		if (handlers[i].dev == head->fmh_keys[1].fmr_device)
+			dkeys[1] = head->fmh_keys[1];
+		if (handlers[i].dev > head->fmh_keys[0].fmr_device)
+			memset(&dkeys[0], 0, sizeof(struct xfs_fsmap));
+
+		error = xfs_trans_alloc_empty(mp, &tp);
+		if (error)
+			break;
+
+		info.dev = handlers[i].dev;
+		info.last = false;
+		info.agno = NULLAGNUMBER;
+		error = handlers[i].fn(tp, dkeys, &info);
+		if (error)
+			break;
+		xfs_trans_cancel(tp);
+		tp = NULL;
+		info.next_daddr = 0;
+	}
+
+	if (tp)
+		xfs_trans_cancel(tp);
+	head->fmh_oflags = FMH_OF_DEV_T;
+	return error;
+}
diff --git a/fs/xfs/xfs_fsmap.h b/fs/xfs/xfs_fsmap.h
new file mode 100644
index 000000000000..0b9bf822595c
--- /dev/null
+++ b/fs/xfs/xfs_fsmap.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2017 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#ifndef __XFS_FSMAP_H__
+#define __XFS_FSMAP_H__
+
+struct fsmap;
+
+/* internal fsmap representation */
+struct xfs_fsmap {
+	dev_t		fmr_device;	/* device id */
+	uint32_t	fmr_flags;	/* mapping flags */
+	uint64_t	fmr_physical;	/* device offset of segment */
+	uint64_t	fmr_owner;	/* owner id */
+	xfs_fileoff_t	fmr_offset;	/* file offset of segment */
+	xfs_filblks_t	fmr_length;	/* length of segment, blocks */
+};
+
+struct xfs_fsmap_head {
+	uint32_t	fmh_iflags;	/* control flags */
+	uint32_t	fmh_oflags;	/* output flags */
+	unsigned int	fmh_count;	/* # of entries in array incl. input */
+	unsigned int	fmh_entries;	/* # of entries filled in (output). */
+
+	struct xfs_fsmap fmh_keys[2];	/* low and high keys */
+};
+
+void xfs_fsmap_from_internal(struct fsmap *dest, struct xfs_fsmap *src);
+void xfs_fsmap_to_internal(struct xfs_fsmap *dest, struct fsmap *src);
+
+/* fsmap to userspace formatter - copy to user & advance pointer */
+typedef int (*xfs_fsmap_format_t)(struct xfs_fsmap *, void *);
+
+int xfs_getfsmap(struct xfs_mount *mp, struct xfs_fsmap_head *head,
+		xfs_fsmap_format_t formatter, void *arg);
+
+#endif /* __XFS_FSMAP_H__ */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 6d30b06e79bc..fee6b4a3a83d 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -41,6 +41,9 @@
 #include "xfs_trans.h"
 #include "xfs_pnfs.h"
 #include "xfs_acl.h"
+#include "xfs_btree.h"
+#include <linux/fsmap.h>
+#include "xfs_fsmap.h"
 
 #include <linux/capability.h>
 #include <linux/cred.h>
@@ -1609,6 +1612,84 @@ xfs_ioc_getbmapx(
 	return 0;
 }
 
+struct getfsmap_info {
+	struct xfs_mount	*mp;
+	struct fsmap __user	*data;
+	__u32			last_flags;
+};
+
+STATIC int
+xfs_getfsmap_format(struct xfs_fsmap *xfm, void *priv)
+{
+	struct getfsmap_info	*info = priv;
+	struct fsmap		fm;
+
+	trace_xfs_getfsmap_mapping(info->mp, xfm);
+
+	info->last_flags = xfm->fmr_flags;
+	xfs_fsmap_from_internal(&fm, xfm);
+	if (copy_to_user(info->data, &fm, sizeof(struct fsmap)))
+		return -EFAULT;
+
+	info->data++;
+	return 0;
+}
+
+STATIC int
+xfs_ioc_getfsmap(
+	struct xfs_inode	*ip,
+	void			__user *arg)
+{
+	struct getfsmap_info	info = {0};
+	struct xfs_fsmap_head	xhead = {0};
+	struct fsmap_head	head;
+	bool			aborted = false;
+	int			error;
+
+	if (copy_from_user(&head, arg, sizeof(struct fsmap_head)))
+		return -EFAULT;
+	if (memchr_inv(head.fmh_reserved, 0, sizeof(head.fmh_reserved)) ||
+	    memchr_inv(head.fmh_keys[0].fmr_reserved, 0,
+		       sizeof(head.fmh_keys[0].fmr_reserved)) ||
+	    memchr_inv(head.fmh_keys[1].fmr_reserved, 0,
+		       sizeof(head.fmh_keys[1].fmr_reserved)))
+		return -EINVAL;
+
+	xhead.fmh_iflags = head.fmh_iflags;
+	xhead.fmh_count = head.fmh_count;
+	xfs_fsmap_to_internal(&xhead.fmh_keys[0], &head.fmh_keys[0]);
+	xfs_fsmap_to_internal(&xhead.fmh_keys[1], &head.fmh_keys[1]);
+
+	trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]);
+	trace_xfs_getfsmap_high_key(ip->i_mount, &xhead.fmh_keys[1]);
+
+	info.mp = ip->i_mount;
+	info.data = ((__force struct fsmap_head *)arg)->fmh_recs;
+	error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info);
+	if (error == XFS_BTREE_QUERY_RANGE_ABORT) {
+		error = 0;
+		aborted = true;
+	} else if (error)
+		return error;
+
+	/* If we didn't abort, set the "last" flag in the last fmx */
+	if (!aborted && xhead.fmh_entries) {
+		info.data--;
+		info.last_flags |= FMR_OF_LAST;
+		if (copy_to_user(&info.data->fmr_flags, &info.last_flags,
+				sizeof(info.last_flags)))
+			return -EFAULT;
+	}
+
+	/* copy back header */
+	head.fmh_entries = xhead.fmh_entries;
+	head.fmh_oflags = xhead.fmh_oflags;
+	if (copy_to_user(arg, &head, sizeof(struct fsmap_head)))
+		return -EFAULT;
+
+	return 0;
+}
+
 int
 xfs_ioc_swapext(
 	xfs_swapext_t	*sxp)
@@ -1789,6 +1870,9 @@ xfs_file_ioctl(
 	case XFS_IOC_GETBMAPX:
 		return xfs_ioc_getbmapx(ip, arg);
 
+	case FS_IOC_GETFSMAP:
+		return xfs_ioc_getfsmap(ip, arg);
+
 	case XFS_IOC_FD_TO_HANDLE:
 	case XFS_IOC_PATH_TO_HANDLE:
 	case XFS_IOC_PATH_TO_FSHANDLE: {
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index 7c49938c5aed..fa0bc4d46065 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -20,6 +20,7 @@
 #include <linux/mount.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
+#include <linux/fsmap.h>
 #include "xfs.h"
 #include "xfs_fs.h"
 #include "xfs_format.h"
@@ -554,6 +555,7 @@ xfs_file_compat_ioctl(
 	case XFS_IOC_GOINGDOWN:
 	case XFS_IOC_ERROR_INJECTION:
 	case XFS_IOC_ERROR_CLEARALL:
+	case FS_IOC_GETFSMAP:
 		return xfs_file_ioctl(filp, cmd, p);
 #ifndef BROKEN_X86_ALIGNMENT
 	/* These are handled fine if no alignment issues */
diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c
index 7f17ae6d709a..5d95fe348294 100644
--- a/fs/xfs/xfs_trace.c
+++ b/fs/xfs/xfs_trace.c
@@ -47,6 +47,7 @@
 #include "xfs_inode_item.h"
 #include "xfs_bmap_btree.h"
 #include "xfs_filestream.h"
+#include "xfs_fsmap.h"
 
 /*
  * We include this last to have the helpers above available for the trace
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 383ac227ce2c..4f96dc953fbe 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -40,6 +40,8 @@ struct xfs_inode_log_format;
 struct xfs_bmbt_irec;
 struct xfs_btree_cur;
 struct xfs_refcount_irec;
+struct xfs_fsmap;
+struct xfs_rmap_irec;
 
 DECLARE_EVENT_CLASS(xfs_attr_list_class,
 	TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -3267,6 +3269,88 @@ DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap);
 DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap_piece);
 DEFINE_INODE_ERROR_EVENT(xfs_swap_extent_rmap_error);
 
+/* fsmap traces */
+DECLARE_EVENT_CLASS(xfs_fsmap_class,
+	TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno,
+		 struct xfs_rmap_irec *rmap),
+	TP_ARGS(mp, keydev, agno, rmap),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(dev_t, keydev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_fsblock_t, bno)
+		__field(xfs_filblks_t, len)
+		__field(__uint64_t, owner)
+		__field(__uint64_t, offset)
+		__field(unsigned int, flags)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->keydev = new_decode_dev(keydev);
+		__entry->agno = agno;
+		__entry->bno = rmap->rm_startblock;
+		__entry->len = rmap->rm_blockcount;
+		__entry->owner = rmap->rm_owner;
+		__entry->offset = rmap->rm_offset;
+		__entry->flags = rmap->rm_flags;
+	),
+	TP_printk("dev %d:%d keydev %d:%d agno %u bno %llu len %llu owner %lld offset %llu flags 0x%x\n",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->keydev), MINOR(__entry->keydev),
+		  __entry->agno,
+		  __entry->bno,
+		  __entry->len,
+		  __entry->owner,
+		  __entry->offset,
+		  __entry->flags)
+)
+#define DEFINE_FSMAP_EVENT(name) \
+DEFINE_EVENT(xfs_fsmap_class, name, \
+	TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno, \
+		 struct xfs_rmap_irec *rmap), \
+	TP_ARGS(mp, keydev, agno, rmap))
+DEFINE_FSMAP_EVENT(xfs_fsmap_low_key);
+DEFINE_FSMAP_EVENT(xfs_fsmap_high_key);
+DEFINE_FSMAP_EVENT(xfs_fsmap_mapping);
+
+DECLARE_EVENT_CLASS(xfs_getfsmap_class,
+	TP_PROTO(struct xfs_mount *mp, struct xfs_fsmap *fsmap),
+	TP_ARGS(mp, fsmap),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(dev_t, keydev)
+		__field(xfs_daddr_t, block)
+		__field(xfs_daddr_t, len)
+		__field(__uint64_t, owner)
+		__field(__uint64_t, offset)
+		__field(__uint64_t, flags)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->keydev = new_decode_dev(fsmap->fmr_device);
+		__entry->block = fsmap->fmr_physical;
+		__entry->len = fsmap->fmr_length;
+		__entry->owner = fsmap->fmr_owner;
+		__entry->offset = fsmap->fmr_offset;
+		__entry->flags = fsmap->fmr_flags;
+	),
+	TP_printk("dev %d:%d keydev %d:%d block %llu len %llu owner %lld offset %llu flags 0x%llx\n",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  MAJOR(__entry->keydev), MINOR(__entry->keydev),
+		  __entry->block,
+		  __entry->len,
+		  __entry->owner,
+		  __entry->offset,
+		  __entry->flags)
+)
+#define DEFINE_GETFSMAP_EVENT(name) \
+DEFINE_EVENT(xfs_getfsmap_class, name, \
+	TP_PROTO(struct xfs_mount *mp, struct xfs_fsmap *fsmap), \
+	TP_ARGS(mp, fsmap))
+DEFINE_GETFSMAP_EVENT(xfs_getfsmap_low_key);
+DEFINE_GETFSMAP_EVENT(xfs_getfsmap_high_key);
+DEFINE_GETFSMAP_EVENT(xfs_getfsmap_mapping);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 70f42ea86dfb..a280e126491f 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -262,6 +262,28 @@ xfs_trans_alloc(
 	return 0;
 }
 
+/*
+ * Create an empty transaction with no reservation.  This is a defensive
+ * mechanism for routines that query metadata without actually modifying
+ * them -- if the metadata being queried is somehow cross-linked (think a
+ * btree block pointer that points higher in the tree), we risk deadlock.
+ * However, blocks grabbed as part of a transaction can be re-grabbed.
+ * The verifiers will notice the corrupt block and the operation will fail
+ * back to userspace without deadlocking.
+ *
+ * Note the zero-length reservation; this transaction MUST be cancelled
+ * without any dirty data.
+ */
+int
+xfs_trans_alloc_empty(
+	struct xfs_mount		*mp,
+	struct xfs_trans		**tpp)
+{
+	struct xfs_trans_res		resv = {0};
+
+	return xfs_trans_alloc(mp, &resv, 0, 0, XFS_TRANS_NO_WRITECOUNT, tpp);
+}
+
 /*
  * Record the indicated change to the given field for application
  * to the file system's superblock when the transaction commits.
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 1646f659b60f..2a9292df6640 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -158,6 +158,8 @@ typedef struct xfs_trans {
 int		xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp,
 			uint blocks, uint rtextents, uint flags,
 			struct xfs_trans **tpp);
+int		xfs_trans_alloc_empty(struct xfs_mount *mp,
+			struct xfs_trans **tpp);
 void		xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
 
 struct xfs_buf	*xfs_trans_get_buf_map(struct xfs_trans *tp,
-- 
cgit v1.2.3


From a1cae7283d4627a96d092ee7a1f821eb4d9ce7f9 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Tue, 28 Mar 2017 14:56:37 -0700
Subject: xfs: have getfsmap fall back to the freesp btrees when rmap is not
 present

If the reverse-mapping btree isn't available, fall back to the
free space btrees to provide partial reverse mapping information.
The online scrub tool can make use of even partial information to
speed up the data block scan.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/xfs_fsmap.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 69 insertions(+), 3 deletions(-)

diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 24b3e53379bf..edd75c67919a 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -40,6 +40,7 @@
 #include "xfs_fsmap.h"
 #include "xfs_refcount.h"
 #include "xfs_refcount_btree.h"
+#include "xfs_alloc_btree.h"
 
 /* Convert an xfs_fsmap to an fsmap. */
 void
@@ -158,6 +159,9 @@ xfs_fsmap_owner_from_rmap(
 	case XFS_RMAP_OWN_COW:
 		dest->fmr_owner = XFS_FMR_OWN_COW;
 		break;
+	case XFS_RMAP_OWN_NULL:	/* "free" */
+		dest->fmr_owner = XFS_FMR_OWN_FREE;
+		break;
 	default:
 		return -EFSCORRUPTED;
 	}
@@ -362,6 +366,30 @@ xfs_getfsmap_datadev_helper(
 	return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr);
 }
 
+/* Transform a bnobt irec into a fsmap */
+STATIC int
+xfs_getfsmap_datadev_bnobt_helper(
+	struct xfs_btree_cur		*cur,
+	struct xfs_alloc_rec_incore	*rec,
+	void				*priv)
+{
+	struct xfs_mount		*mp = cur->bc_mp;
+	struct xfs_getfsmap_info	*info = priv;
+	struct xfs_rmap_irec		irec;
+	xfs_daddr_t			rec_daddr;
+
+	rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_private.a.agno,
+			rec->ar_startblock);
+
+	irec.rm_startblock = rec->ar_startblock;
+	irec.rm_blockcount = rec->ar_blockcount;
+	irec.rm_owner = XFS_RMAP_OWN_NULL;	/* "free" */
+	irec.rm_offset = 0;
+	irec.rm_flags = 0;
+
+	return xfs_getfsmap_helper(cur->bc_tp, info, &irec, rec_daddr);
+}
+
 /* Set rmap flags based on the getfsmap flags */
 static void
 xfs_getfsmap_set_irec_flags(
@@ -572,6 +600,43 @@ xfs_getfsmap_datadev_rmapbt(
 			xfs_getfsmap_datadev_rmapbt_query, NULL);
 }
 
+/* Actually query the bno btree. */
+STATIC int
+xfs_getfsmap_datadev_bnobt_query(
+	struct xfs_trans		*tp,
+	struct xfs_getfsmap_info	*info,
+	struct xfs_btree_cur		**curpp,
+	void				*priv)
+{
+	struct xfs_alloc_rec_incore	*key = priv;
+
+	/* Report any gap at the end of the last AG. */
+	if (info->last)
+		return xfs_getfsmap_datadev_bnobt_helper(*curpp, &key[1], info);
+
+	/* Allocate cursor for this AG and query_range it. */
+	*curpp = xfs_allocbt_init_cursor(tp->t_mountp, tp, info->agf_bp,
+			info->agno, XFS_BTNUM_BNO);
+	key->ar_startblock = info->low.rm_startblock;
+	key[1].ar_startblock = info->high.rm_startblock;
+	return xfs_alloc_query_range(*curpp, key, &key[1],
+			xfs_getfsmap_datadev_bnobt_helper, info);
+}
+
+/* Execute a getfsmap query against the regular data device's bnobt. */
+STATIC int
+xfs_getfsmap_datadev_bnobt(
+	struct xfs_trans		*tp,
+	struct xfs_fsmap		*keys,
+	struct xfs_getfsmap_info	*info)
+{
+	struct xfs_alloc_rec_incore	akeys[2];
+
+	info->missing_owner = XFS_FMR_OWN_UNKNOWN;
+	return __xfs_getfsmap_datadev(tp, keys, info,
+			xfs_getfsmap_datadev_bnobt_query, &akeys[0]);
+}
+
 /* Do we recognize the device? */
 STATIC bool
 xfs_getfsmap_is_valid_device(
@@ -652,8 +717,6 @@ xfs_getfsmap(
 	int				i;
 	int				error = 0;
 
-	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
-		return -EOPNOTSUPP;
 	if (head->fmh_iflags & ~FMH_IF_VALID)
 		return -EINVAL;
 	if (!xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[0]) ||
@@ -665,7 +728,10 @@ xfs_getfsmap(
 	/* Set up our device handlers. */
 	memset(handlers, 0, sizeof(handlers));
 	handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev);
-	handlers[0].fn = xfs_getfsmap_datadev_rmapbt;
+	if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+		handlers[0].fn = xfs_getfsmap_datadev_rmapbt;
+	else
+		handlers[0].fn = xfs_getfsmap_datadev_bnobt;
 	if (mp->m_logdev_targp != mp->m_ddev_targp) {
 		handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev);
 		handlers[1].fn = xfs_getfsmap_logdev;
-- 
cgit v1.2.3


From 4c934c7dd60cc60af056b8cc62c39d10ce7ca1c8 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Tue, 28 Mar 2017 14:56:38 -0700
Subject: xfs: report realtime space information via the rtbitmap

Use the realtime bitmap to return free space information via getfsmap.
Eventually this will be superseded by the realtime rmapbt code.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
---
 fs/xfs/xfs_fsmap.c | 120 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 119 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index edd75c67919a..48ce356da9e9 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -41,6 +41,7 @@
 #include "xfs_refcount.h"
 #include "xfs_refcount_btree.h"
 #include "xfs_alloc_btree.h"
+#include "xfs_rtalloc.h"
 
 /* Convert an xfs_fsmap to an fsmap. */
 void
@@ -366,6 +367,29 @@ xfs_getfsmap_datadev_helper(
 	return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr);
 }
 
+/* Transform a rtbitmap "record" into a fsmap */
+STATIC int
+xfs_getfsmap_rtdev_rtbitmap_helper(
+	struct xfs_trans		*tp,
+	struct xfs_rtalloc_rec		*rec,
+	void				*priv)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_getfsmap_info	*info = priv;
+	struct xfs_rmap_irec		irec;
+	xfs_daddr_t			rec_daddr;
+
+	rec_daddr = XFS_FSB_TO_BB(mp, rec->ar_startblock);
+
+	irec.rm_startblock = rec->ar_startblock;
+	irec.rm_blockcount = rec->ar_blockcount;
+	irec.rm_owner = XFS_RMAP_OWN_NULL;	/* "free" */
+	irec.rm_offset = 0;
+	irec.rm_flags = 0;
+
+	return xfs_getfsmap_helper(tp, info, &irec, rec_daddr);
+}
+
 /* Transform a bnobt irec into a fsmap */
 STATIC int
 xfs_getfsmap_datadev_bnobt_helper(
@@ -451,6 +475,93 @@ xfs_getfsmap_logdev(
 	return xfs_getfsmap_helper(tp, info, &rmap, 0);
 }
 
+/* Execute a getfsmap query against the realtime device. */
+STATIC int
+__xfs_getfsmap_rtdev(
+	struct xfs_trans		*tp,
+	struct xfs_fsmap		*keys,
+	int				(*query_fn)(struct xfs_trans *,
+						    struct xfs_getfsmap_info *),
+	struct xfs_getfsmap_info	*info)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	xfs_fsblock_t			start_fsb;
+	xfs_fsblock_t			end_fsb;
+	xfs_daddr_t			eofs;
+	int				error = 0;
+
+	eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks);
+	if (keys[0].fmr_physical >= eofs)
+		return 0;
+	if (keys[1].fmr_physical >= eofs)
+		keys[1].fmr_physical = eofs - 1;
+	start_fsb = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical);
+	end_fsb = XFS_BB_TO_FSB(mp, keys[1].fmr_physical);
+
+	/* Set up search keys */
+	info->low.rm_startblock = start_fsb;
+	error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]);
+	if (error)
+		return error;
+	info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset);
+	info->low.rm_blockcount = 0;
+	xfs_getfsmap_set_irec_flags(&info->low, &keys[0]);
+
+	info->high.rm_startblock = end_fsb;
+	error = xfs_fsmap_owner_to_rmap(&info->high, &keys[1]);
+	if (error)
+		return error;
+	info->high.rm_offset = XFS_BB_TO_FSBT(mp, keys[1].fmr_offset);
+	info->high.rm_blockcount = 0;
+	xfs_getfsmap_set_irec_flags(&info->high, &keys[1]);
+
+	trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low);
+	trace_xfs_fsmap_high_key(mp, info->dev, info->agno, &info->high);
+
+	return query_fn(tp, info);
+}
+
+/* Actually query the realtime bitmap. */
+STATIC int
+xfs_getfsmap_rtdev_rtbitmap_query(
+	struct xfs_trans		*tp,
+	struct xfs_getfsmap_info	*info)
+{
+	struct xfs_rtalloc_rec		alow;
+	struct xfs_rtalloc_rec		ahigh;
+	int				error;
+
+	xfs_ilock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED);
+
+	alow.ar_startblock = info->low.rm_startblock;
+	ahigh.ar_startblock = info->high.rm_startblock;
+	error = xfs_rtalloc_query_range(tp, &alow, &ahigh,
+			xfs_getfsmap_rtdev_rtbitmap_helper, info);
+	if (error)
+		goto err;
+
+	/* Report any gaps at the end of the rtbitmap */
+	info->last = true;
+	error = xfs_getfsmap_rtdev_rtbitmap_helper(tp, &ahigh, info);
+	if (error)
+		goto err;
+err:
+	xfs_iunlock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED);
+	return error;
+}
+
+/* Execute a getfsmap query against the realtime device rtbitmap. */
+STATIC int
+xfs_getfsmap_rtdev_rtbitmap(
+	struct xfs_trans		*tp,
+	struct xfs_fsmap		*keys,
+	struct xfs_getfsmap_info	*info)
+{
+	info->missing_owner = XFS_FMR_OWN_UNKNOWN;
+	return __xfs_getfsmap_rtdev(tp, keys, xfs_getfsmap_rtdev_rtbitmap_query,
+			info);
+}
+
 /* Execute a getfsmap query against the regular data device. */
 STATIC int
 __xfs_getfsmap_datadev(
@@ -649,6 +760,9 @@ xfs_getfsmap_is_valid_device(
 	if (mp->m_logdev_targp &&
 	    fm->fmr_device == new_encode_dev(mp->m_logdev_targp->bt_dev))
 		return true;
+	if (mp->m_rtdev_targp &&
+	    fm->fmr_device == new_encode_dev(mp->m_rtdev_targp->bt_dev))
+		return true;
 	return false;
 }
 
@@ -681,7 +795,7 @@ xfs_getfsmap_check_keys(
 	return false;
 }
 
-#define XFS_GETFSMAP_DEVS	2
+#define XFS_GETFSMAP_DEVS	3
 /*
  * Get filesystem's extents as described in head, and format for
  * output.  Calls formatter to fill the user's buffer until all
@@ -736,6 +850,10 @@ xfs_getfsmap(
 		handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev);
 		handlers[1].fn = xfs_getfsmap_logdev;
 	}
+	if (mp->m_rtdev_targp) {
+		handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev);
+		handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap;
+	}
 
 	xfs_sort(handlers, XFS_GETFSMAP_DEVS, sizeof(struct xfs_getfsmap_dev),
 			xfs_getfsmap_dev_compare);
-- 
cgit v1.2.3


From 254133f5d0c2df8476495d331f0fa69ff1a374c1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 6 Apr 2017 16:00:11 -0700
Subject: xfs: fold __xfs_trans_roll into xfs_trans_roll

No one cares about the low-level helper anymore.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_trans.c | 17 ++---------------
 fs/xfs/xfs_trans.h |  1 -
 2 files changed, 2 insertions(+), 16 deletions(-)

diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index a280e126491f..be86e4ed23d5 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1034,17 +1034,14 @@ xfs_trans_cancel(
  * chunk we've been working on and get a new transaction to continue.
  */
 int
-__xfs_trans_roll(
+xfs_trans_roll(
 	struct xfs_trans	**tpp,
-	struct xfs_inode	*dp,
-	int			*committed)
+	struct xfs_inode	*dp)
 {
 	struct xfs_trans	*trans;
 	struct xfs_trans_res	tres;
 	int			error;
 
-	*committed = 0;
-
 	/*
 	 * Ensure that the inode is always logged.
 	 */
@@ -1070,7 +1067,6 @@ __xfs_trans_roll(
 	if (error)
 		return error;
 
-	*committed = 1;
 	trans = *tpp;
 
 	/*
@@ -1093,12 +1089,3 @@ __xfs_trans_roll(
 		xfs_trans_ijoin(trans, dp, 0);
 	return 0;
 }
-
-int
-xfs_trans_roll(
-	struct xfs_trans	**tpp,
-	struct xfs_inode	*dp)
-{
-	int			committed;
-	return __xfs_trans_roll(tpp, dp, &committed);
-}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 2a9292df6640..a07acbf0bd8a 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -228,7 +228,6 @@ int		xfs_trans_free_extent(struct xfs_trans *,
 				      struct xfs_efd_log_item *, xfs_fsblock_t,
 				      xfs_extlen_t, struct xfs_owner_info *);
 int		xfs_trans_commit(struct xfs_trans *);
-int		__xfs_trans_roll(struct xfs_trans **, struct xfs_inode *, int *);
 int		xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
 void		xfs_trans_cancel(xfs_trans_t *);
 int		xfs_trans_ail_init(struct xfs_mount *);
-- 
cgit v1.2.3


From 84358536dc355a9c8978ee425f87e116186bed16 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 6 Apr 2017 16:00:39 -0700
Subject: xfs: actually report xattr extents via iomap

Apparently FIEMAP for xattrs has been broken since we switched to
the iomap backend because of an incorrect check for xattr presence.
Also fix the broken locking.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_iomap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 009f8243dddc..b2f0901bb517 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -1170,10 +1170,10 @@ xfs_xattr_iomap_begin(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	lockmode = xfs_ilock_data_map_shared(ip);
+	lockmode = xfs_ilock_attr_map_shared(ip);
 
 	/* if there are no attribute fork or extents, return ENOENT */
-	if (XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) {
+	if (!XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) {
 		error = -ENOENT;
 		goto out_unlock;
 	}
-- 
cgit v1.2.3


From d956f813b6e731ef82699a18e468e37d59a1366d Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Thu, 6 Apr 2017 16:01:47 -0700
Subject: xfs: simplify xfs_calc_dquots_per_chunk

ndquots is a 32-bit value, and we don't care
about the remainder; there is no reason to use do_div
here, it seems to be the result of a decade+ historical
accident.

Worse, the do_div implementation in userspace breaks
when fed a 32-bit dividend, so we commented it out there
in any case.

Change to simple division, and then we can change
userspace to match, and mandate a 64-bit dividend in
the do_div() in userspace as well.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
---
 fs/xfs/libxfs/xfs_dquot_buf.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index ac9a003dd29a..747085b4ef44 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -35,13 +35,8 @@ int
 xfs_calc_dquots_per_chunk(
 	unsigned int		nbblks)	/* basic block units */
 {
-	unsigned int	ndquots;
-
 	ASSERT(nbblks > 0);
-	ndquots = BBTOB(nbblks);
-	do_div(ndquots, sizeof(xfs_dqblk_t));
-
-	return ndquots;
+	return BBTOB(nbblks) / sizeof(xfs_dqblk_t);
 }
 
 /*
-- 
cgit v1.2.3


From 5146d0b7627900ccbb30049246cdca232a79abf5 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Tue, 11 Apr 2017 10:45:17 -0700
Subject: xfs: remove custom do_div implementations

Long ago, all this gunk was added with a lament about problems
with gcc's do_div, and a fun recommendation in the changelog:

 egcs-2.91.66 is the recommended compiler version for building XFS.

All this special stuff was needed to work around an old gcc bug,
apparently, and it's been there ever since.

There should be no need for this anymore, so remove it.

Remove the special 32-bit xfs_do_mod as well; just let the
kernel's do_div() handle all this.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_linux.h | 85 ------------------------------------------------------
 1 file changed, 85 deletions(-)

diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 592fdf7111cb..044fb0e15390 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -212,88 +212,6 @@ static inline kgid_t xfs_gid_to_kgid(__uint32_t gid)
 #define xfs_sort(a,n,s,fn)	sort(a,n,s,fn,NULL)
 #define xfs_stack_trace()	dump_stack()
 
-
-/* Move the kernel do_div definition off to one side */
-
-#if defined __i386__
-/* For ia32 we need to pull some tricks to get past various versions
- * of the compiler which do not like us using do_div in the middle
- * of large functions.
- */
-static inline __u32 xfs_do_div(void *a, __u32 b, int n)
-{
-	__u32	mod;
-
-	switch (n) {
-		case 4:
-			mod = *(__u32 *)a % b;
-			*(__u32 *)a = *(__u32 *)a / b;
-			return mod;
-		case 8:
-			{
-			unsigned long __upper, __low, __high, __mod;
-			__u64	c = *(__u64 *)a;
-			__upper = __high = c >> 32;
-			__low = c;
-			if (__high) {
-				__upper = __high % (b);
-				__high = __high / (b);
-			}
-			asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
-			asm("":"=A" (c):"a" (__low),"d" (__high));
-			*(__u64 *)a = c;
-			return __mod;
-			}
-	}
-
-	/* NOTREACHED */
-	return 0;
-}
-
-/* Side effect free 64 bit mod operation */
-static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
-{
-	switch (n) {
-		case 4:
-			return *(__u32 *)a % b;
-		case 8:
-			{
-			unsigned long __upper, __low, __high, __mod;
-			__u64	c = *(__u64 *)a;
-			__upper = __high = c >> 32;
-			__low = c;
-			if (__high) {
-				__upper = __high % (b);
-				__high = __high / (b);
-			}
-			asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper));
-			asm("":"=A" (c):"a" (__low),"d" (__high));
-			return __mod;
-			}
-	}
-
-	/* NOTREACHED */
-	return 0;
-}
-#else
-static inline __u32 xfs_do_div(void *a, __u32 b, int n)
-{
-	__u32	mod;
-
-	switch (n) {
-		case 4:
-			mod = *(__u32 *)a % b;
-			*(__u32 *)a = *(__u32 *)a / b;
-			return mod;
-		case 8:
-			mod = do_div(*(__u64 *)a, b);
-			return mod;
-	}
-
-	/* NOTREACHED */
-	return 0;
-}
-
 /* Side effect free 64 bit mod operation */
 static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
 {
@@ -310,10 +228,7 @@ static inline __u32 xfs_do_mod(void *a, __u32 b, int n)
 	/* NOTREACHED */
 	return 0;
 }
-#endif
 
-#undef do_div
-#define do_div(a, b)	xfs_do_div(&(a), (b), sizeof(a))
 #define do_mod(a, b)	xfs_do_mod(&(a), (b), sizeof(a))
 
 static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y)
-- 
cgit v1.2.3


From 3b4683c294095b5f777c03307ef8c60f47320e12 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Tue, 11 Apr 2017 10:50:05 -0700
Subject: xfs: drop iolock from reclaim context to appease lockdep

Lockdep complains about use of the iolock in inode reclaim context
because it doesn't understand that reclaim has the last reference to
the inode, and thus an iolock->reclaim->iolock deadlock is not
possible.

The iolock is technically not necessary in xfs_inactive() and was
only added to appease an assert in xfs_free_eofblocks(), which can
be called from other non-reclaim contexts. Therefore, just kill the
assert and drop the use of the iolock from reclaim context to quiet
lockdep.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_bmap_util.c | 8 +++-----
 fs/xfs/xfs_inode.c     | 9 +++++----
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 4d1920e594b0..de94798f1c1b 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -903,9 +903,9 @@ xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
 }
 
 /*
- * This is called by xfs_inactive to free any blocks beyond eof
- * when the link count isn't zero and by xfs_dm_punch_hole() when
- * punching a hole to EOF.
+ * This is called to free any blocks beyond eof. The caller must hold
+ * IOLOCK_EXCL unless we are in the inode reclaim path and have the only
+ * reference to the inode.
  */
 int
 xfs_free_eofblocks(
@@ -920,8 +920,6 @@ xfs_free_eofblocks(
 	struct xfs_bmbt_irec	imap;
 	struct xfs_mount	*mp = ip->i_mount;
 
-	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
-
 	/*
 	 * Figure out if there are any blocks beyond the end
 	 * of the file.  If not, then there is nothing to do.
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 7605d8396596..ec9826c56500 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1906,12 +1906,13 @@ xfs_inactive(
 		 * force is true because we are evicting an inode from the
 		 * cache. Post-eof blocks must be freed, lest we end up with
 		 * broken free space accounting.
+		 *
+		 * Note: don't bother with iolock here since lockdep complains
+		 * about acquiring it in reclaim context. We have the only
+		 * reference to the inode at this point anyways.
 		 */
-		if (xfs_can_free_eofblocks(ip, true)) {
-			xfs_ilock(ip, XFS_IOLOCK_EXCL);
+		if (xfs_can_free_eofblocks(ip, true))
 			xfs_free_eofblocks(ip);
-			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-		}
 
 		return;
 	}
-- 
cgit v1.2.3


From 52813fb13ff90bd9c39a93446cbf1103c290b6e9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 11 Apr 2017 16:45:52 -0700
Subject: xfs: fix integer truncation in xfs_bmap_remap_alloc

bno should be a xfs_fsblock_t, which is 64-bit wides instead of a
xfs_aglock_t, which truncates the value to 32 bits.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_bmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 9bd104f32908..2a426d127e05 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3863,7 +3863,7 @@ xfs_bmap_remap_alloc(
 {
 	struct xfs_trans	*tp = ap->tp;
 	struct xfs_mount	*mp = tp->t_mountp;
-	xfs_agblock_t		bno;
+	xfs_fsblock_t		bno;
 	struct xfs_alloc_arg	args;
 	int			error;
 
-- 
cgit v1.2.3


From 39e07daa46e34c724ad33f903d166a0a62c20900 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 11 Apr 2017 16:45:53 -0700
Subject: xfs: remove attr fork handling in xfs_bmap_finish_one

We never do COW operations for the attr fork, so don't pretend we handle
them.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_bmap.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 2a426d127e05..b95e66fd0935 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -6491,7 +6491,6 @@ xfs_bmap_finish_one(
 	struct xfs_bmbt_irec		bmap;
 	int				nimaps = 1;
 	xfs_fsblock_t			firstfsb;
-	int				flags = XFS_BMAPI_REMAP;
 	int				done;
 	int				error = 0;
 
@@ -6505,10 +6504,8 @@ xfs_bmap_finish_one(
 			XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
 			ip->i_ino, whichfork, startoff, blockcount, state);
 
-	if (whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK)
+	if (WARN_ON_ONCE(whichfork != XFS_DATA_FORK))
 		return -EFSCORRUPTED;
-	if (whichfork == XFS_ATTR_FORK)
-		flags |= XFS_BMAPI_ATTRFORK;
 
 	if (XFS_TEST_ERROR(false, tp->t_mountp,
 			XFS_ERRTAG_BMAP_FINISH_ONE,
@@ -6519,13 +6516,13 @@ xfs_bmap_finish_one(
 	case XFS_BMAP_MAP:
 		firstfsb = bmap.br_startblock;
 		error = xfs_bmapi_write(tp, ip, bmap.br_startoff,
-					bmap.br_blockcount, flags, &firstfsb,
+					bmap.br_blockcount, XFS_BMAPI_REMAP, &firstfsb,
 					bmap.br_blockcount, &bmap, &nimaps,
 					dfops);
 		break;
 	case XFS_BMAP_UNMAP:
 		error = xfs_bunmapi(tp, ip, bmap.br_startoff,
-				bmap.br_blockcount, flags, 1, &firstfsb,
+				bmap.br_blockcount, XFS_BMAPI_REMAP, 1, &firstfsb,
 				dfops, &done);
 		ASSERT(done);
 		break;
-- 
cgit v1.2.3


From 6d04558f9fa9d16c4aba7243030f22ef0c1bbf32 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 11 Apr 2017 16:45:54 -0700
Subject: xfs: pass individual arguments to xfs_bmap_add_extent_hole_real

For the reflink case we'd much rather pass the required arguments than
faking up a struct xfs_bmalloca.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_bmap.c | 123 ++++++++++++++++++++++++-----------------------
 1 file changed, 64 insertions(+), 59 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index b95e66fd0935..f32b2dab16a4 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -2879,27 +2879,30 @@ xfs_bmap_add_extent_hole_delay(
  */
 STATIC int				/* error */
 xfs_bmap_add_extent_hole_real(
-	struct xfs_bmalloca	*bma,
-	int			whichfork)
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	int			whichfork,
+	xfs_extnum_t		*idx,
+	struct xfs_btree_cur	**curp,
+	struct xfs_bmbt_irec	*new,
+	xfs_fsblock_t		*first,
+	struct xfs_defer_ops	*dfops,
+	int			*logflagsp)
 {
-	struct xfs_bmbt_irec	*new = &bma->got;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_btree_cur	*cur = *curp;
 	int			error;	/* error return value */
 	int			i;	/* temp state */
-	xfs_ifork_t		*ifp;	/* inode fork pointer */
 	xfs_bmbt_irec_t		left;	/* left neighbor extent entry */
 	xfs_bmbt_irec_t		right;	/* right neighbor extent entry */
 	int			rval=0;	/* return value (logging flags) */
 	int			state;	/* state bits, accessed thru macros */
-	struct xfs_mount	*mp;
 
-	mp = bma->ip->i_mount;
-	ifp = XFS_IFORK_PTR(bma->ip, whichfork);
-
-	ASSERT(bma->idx >= 0);
-	ASSERT(bma->idx <= xfs_iext_count(ifp));
+	ASSERT(*idx >= 0);
+	ASSERT(*idx <= xfs_iext_count(ifp));
 	ASSERT(!isnullstartblock(new->br_startblock));
-	ASSERT(!bma->cur ||
-	       !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+	ASSERT(!cur || !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
 
 	XFS_STATS_INC(mp, xs_add_exlist);
 
@@ -2912,9 +2915,9 @@ xfs_bmap_add_extent_hole_real(
 	/*
 	 * Check and set flags if this segment has a left neighbor.
 	 */
-	if (bma->idx > 0) {
+	if (*idx > 0) {
 		state |= BMAP_LEFT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &left);
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left);
 		if (isnullstartblock(left.br_startblock))
 			state |= BMAP_LEFT_DELAY;
 	}
@@ -2923,9 +2926,9 @@ xfs_bmap_add_extent_hole_real(
 	 * Check and set flags if this segment has a current value.
 	 * Not true if we're inserting into the "hole" at eof.
 	 */
-	if (bma->idx < xfs_iext_count(ifp)) {
+	if (*idx < xfs_iext_count(ifp)) {
 		state |= BMAP_RIGHT_VALID;
-		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right);
+		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
 		if (isnullstartblock(right.br_startblock))
 			state |= BMAP_RIGHT_DELAY;
 	}
@@ -2962,36 +2965,36 @@ xfs_bmap_add_extent_hole_real(
 		 * left and on the right.
 		 * Merge all three into a single extent record.
 		 */
-		--bma->idx;
-		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
+		--*idx;
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
 			left.br_blockcount + new->br_blockcount +
 			right.br_blockcount);
-		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
-		xfs_iext_remove(bma->ip, bma->idx + 1, 1, state);
+		xfs_iext_remove(ip, *idx + 1, 1, state);
 
-		XFS_IFORK_NEXT_SET(bma->ip, whichfork,
-			XFS_IFORK_NEXTENTS(bma->ip, whichfork) - 1);
-		if (bma->cur == NULL) {
+		XFS_IFORK_NEXT_SET(ip, whichfork,
+			XFS_IFORK_NEXTENTS(ip, whichfork) - 1);
+		if (cur == NULL) {
 			rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
 		} else {
 			rval = XFS_ILOG_CORE;
-			error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff,
+			error = xfs_bmbt_lookup_eq(cur, right.br_startoff,
 					right.br_startblock, right.br_blockcount,
 					&i);
 			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
-			error = xfs_btree_delete(bma->cur, &i);
+			error = xfs_btree_delete(cur, &i);
 			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
-			error = xfs_btree_decrement(bma->cur, 0, &i);
+			error = xfs_btree_decrement(cur, 0, &i);
 			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
-			error = xfs_bmbt_update(bma->cur, left.br_startoff,
+			error = xfs_bmbt_update(cur, left.br_startoff,
 					left.br_startblock,
 					left.br_blockcount +
 						new->br_blockcount +
@@ -3008,23 +3011,23 @@ xfs_bmap_add_extent_hole_real(
 		 * on the left.
 		 * Merge the new allocation with the left neighbor.
 		 */
-		--bma->idx;
-		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
-		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx),
+		--*idx;
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx),
 			left.br_blockcount + new->br_blockcount);
-		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
-		if (bma->cur == NULL) {
+		if (cur == NULL) {
 			rval = xfs_ilog_fext(whichfork);
 		} else {
 			rval = 0;
-			error = xfs_bmbt_lookup_eq(bma->cur, left.br_startoff,
+			error = xfs_bmbt_lookup_eq(cur, left.br_startoff,
 					left.br_startblock, left.br_blockcount,
 					&i);
 			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
-			error = xfs_bmbt_update(bma->cur, left.br_startoff,
+			error = xfs_bmbt_update(cur, left.br_startoff,
 					left.br_startblock,
 					left.br_blockcount +
 						new->br_blockcount,
@@ -3040,25 +3043,25 @@ xfs_bmap_add_extent_hole_real(
 		 * on the right.
 		 * Merge the new allocation with the right neighbor.
 		 */
-		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
-		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx),
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
+		xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx),
 			new->br_startoff, new->br_startblock,
 			new->br_blockcount + right.br_blockcount,
 			right.br_state);
-		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
+		trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_);
 
-		if (bma->cur == NULL) {
+		if (cur == NULL) {
 			rval = xfs_ilog_fext(whichfork);
 		} else {
 			rval = 0;
-			error = xfs_bmbt_lookup_eq(bma->cur,
+			error = xfs_bmbt_lookup_eq(cur,
 					right.br_startoff,
 					right.br_startblock,
 					right.br_blockcount, &i);
 			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
-			error = xfs_bmbt_update(bma->cur, new->br_startoff,
+			error = xfs_bmbt_update(cur, new->br_startoff,
 					new->br_startblock,
 					new->br_blockcount +
 						right.br_blockcount,
@@ -3074,22 +3077,22 @@ xfs_bmap_add_extent_hole_real(
 		 * real allocation.
 		 * Insert a new entry.
 		 */
-		xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
-		XFS_IFORK_NEXT_SET(bma->ip, whichfork,
-			XFS_IFORK_NEXTENTS(bma->ip, whichfork) + 1);
-		if (bma->cur == NULL) {
+		xfs_iext_insert(ip, *idx, 1, new, state);
+		XFS_IFORK_NEXT_SET(ip, whichfork,
+			XFS_IFORK_NEXTENTS(ip, whichfork) + 1);
+		if (cur == NULL) {
 			rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork);
 		} else {
 			rval = XFS_ILOG_CORE;
-			error = xfs_bmbt_lookup_eq(bma->cur,
+			error = xfs_bmbt_lookup_eq(cur,
 					new->br_startoff,
 					new->br_startblock,
 					new->br_blockcount, &i);
 			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done);
-			bma->cur->bc_rec.b.br_state = new->br_state;
-			error = xfs_btree_insert(bma->cur, &i);
+			cur->bc_rec.b.br_state = new->br_state;
+			error = xfs_btree_insert(cur, &i);
 			if (error)
 				goto done;
 			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
@@ -3098,30 +3101,30 @@ xfs_bmap_add_extent_hole_real(
 	}
 
 	/* add reverse mapping */
-	error = xfs_rmap_map_extent(mp, bma->dfops, bma->ip, whichfork, new);
+	error = xfs_rmap_map_extent(mp, dfops, ip, whichfork, new);
 	if (error)
 		goto done;
 
 	/* convert to a btree if necessary */
-	if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
+	if (xfs_bmap_needs_btree(ip, whichfork)) {
 		int	tmp_logflags;	/* partial log flag return val */
 
-		ASSERT(bma->cur == NULL);
-		error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
-				bma->firstblock, bma->dfops, &bma->cur,
+		ASSERT(cur == NULL);
+		error = xfs_bmap_extents_to_btree(tp, ip, first, dfops, curp,
 				0, &tmp_logflags, whichfork);
-		bma->logflags |= tmp_logflags;
+		*logflagsp |= tmp_logflags;
+		cur = *curp;
 		if (error)
 			goto done;
 	}
 
 	/* clear out the allocated field, done with it now in any case. */
-	if (bma->cur)
-		bma->cur->bc_private.b.allocated = 0;
+	if (cur)
+		cur->bc_private.b.allocated = 0;
 
-	xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
+	xfs_bmap_check_leaf_extents(cur, ip, whichfork);
 done:
-	bma->logflags |= rval;
+	*logflagsp |= rval;
 	return error;
 }
 
@@ -4386,7 +4389,9 @@ xfs_bmapi_allocate(
 	if (bma->wasdel)
 		error = xfs_bmap_add_extent_delay_real(bma, whichfork);
 	else
-		error = xfs_bmap_add_extent_hole_real(bma, whichfork);
+		error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip,
+				whichfork, &bma->idx, &bma->cur, &bma->got,
+				bma->firstblock, bma->dfops, &bma->logflags);
 
 	bma->logflags |= tmp_logflags;
 	if (error)
-- 
cgit v1.2.3


From 6ebd5a4413e2afd1b1129135e1cf4a84092550e2 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 11 Apr 2017 16:45:55 -0700
Subject: xfs: introduce xfs_bmapi_remap

Add a new helper to be used for reflink extent list additions instead of
funneling them through xfs_bmapi_write and overloading the firstblock
member in struct xfs_bmalloca and struct xfs_alloc_args.

With some small changes to xfs_bmap_remap_alloc this also means we do
not need a xfs_bmalloca structure for this case at all.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_bmap.c | 157 +++++++++++++++++++++++++++++++++--------------
 1 file changed, 112 insertions(+), 45 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index f32b2dab16a4..a1d9086ac737 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3862,11 +3862,12 @@ xfs_bmap_btalloc(
  */
 STATIC int
 xfs_bmap_remap_alloc(
-	struct xfs_bmalloca	*ap)
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	xfs_fsblock_t		startblock,
+	xfs_extlen_t		length)
 {
-	struct xfs_trans	*tp = ap->tp;
 	struct xfs_mount	*mp = tp->t_mountp;
-	xfs_fsblock_t		bno;
 	struct xfs_alloc_arg	args;
 	int			error;
 
@@ -3875,24 +3876,24 @@ xfs_bmap_remap_alloc(
 	 * and handle a silent filesystem corruption rather than crashing.
 	 */
 	memset(&args, 0, sizeof(struct xfs_alloc_arg));
-	args.tp = ap->tp;
-	args.mp = ap->tp->t_mountp;
-	bno = *ap->firstblock;
-	args.agno = XFS_FSB_TO_AGNO(mp, bno);
-	args.agbno = XFS_FSB_TO_AGBNO(mp, bno);
+	args.tp = tp;
+	args.mp = mp;
+	args.agno = XFS_FSB_TO_AGNO(mp, startblock);
+	args.agbno = XFS_FSB_TO_AGBNO(mp, startblock);
+
 	if (args.agno >= mp->m_sb.sb_agcount ||
 	    args.agbno >= mp->m_sb.sb_agblocks)
 		return -EFSCORRUPTED;
 
 	/* "Allocate" the extent from the range we passed in. */
-	trace_xfs_bmap_remap_alloc(ap->ip, *ap->firstblock, ap->length);
-	ap->blkno = bno;
-	ap->ip->i_d.di_nblocks += ap->length;
-	xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
+	trace_xfs_bmap_remap_alloc(ip, startblock, length);
+
+	ip->i_d.di_nblocks += length;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
 	/* Fix the freelist, like a real allocator does. */
-	args.datatype = ap->datatype;
-	args.pag = xfs_perag_get(args.mp, args.agno);
+	args.datatype = XFS_ALLOC_USERDATA | XFS_ALLOC_NOBUSY;
+	args.pag = xfs_perag_get(mp, args.agno);
 	ASSERT(args.pag);
 
 	/*
@@ -3905,7 +3906,7 @@ xfs_bmap_remap_alloc(
 	error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
 	xfs_perag_put(args.pag);
 	if (error)
-		trace_xfs_bmap_remap_alloc_error(ap->ip, error, _RET_IP_);
+		trace_xfs_bmap_remap_alloc_error(ip, error, _RET_IP_);
 	return error;
 }
 
@@ -3917,8 +3918,6 @@ STATIC int
 xfs_bmap_alloc(
 	struct xfs_bmalloca	*ap)	/* bmap alloc argument struct */
 {
-	if (ap->flags & XFS_BMAPI_REMAP)
-		return xfs_bmap_remap_alloc(ap);
 	if (XFS_IS_REALTIME_INODE(ap->ip) &&
 	    xfs_alloc_is_userdata(ap->datatype))
 		return xfs_bmap_rtalloc(ap);
@@ -4554,9 +4553,7 @@ xfs_bmapi_write(
 	ASSERT(len > 0);
 	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
-	ASSERT(!(flags & XFS_BMAPI_REMAP) || whichfork == XFS_DATA_FORK);
-	ASSERT(!(flags & XFS_BMAPI_PREALLOC) || !(flags & XFS_BMAPI_REMAP));
-	ASSERT(!(flags & XFS_BMAPI_CONVERT) || !(flags & XFS_BMAPI_REMAP));
+	ASSERT(!(flags & XFS_BMAPI_REMAP));
 
 	/* zeroing is for currently only for data extents, not metadata */
 	ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) !=
@@ -4640,13 +4637,8 @@ xfs_bmapi_write(
 			} else {
 				need_alloc = true;
 			}
-		} else {
-			/*
-			 * Make sure we only reflink into a hole.
-			 */
-			ASSERT(!(flags & XFS_BMAPI_REMAP));
-			if (isnullstartblock(bma.got.br_startblock))
-				wasdelay = true;
+		} else if (isnullstartblock(bma.got.br_startblock)) {
+			wasdelay = true;
 		}
 
 		/*
@@ -4775,6 +4767,94 @@ error0:
 	return error;
 }
 
+static int
+xfs_bmapi_remap(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		bno,
+	xfs_filblks_t		len,
+	xfs_fsblock_t		startblock,
+	struct xfs_defer_ops	*dfops)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	struct xfs_btree_cur	*cur = NULL;
+	xfs_fsblock_t		firstblock = NULLFSBLOCK;
+	struct xfs_bmbt_irec	got;
+	xfs_extnum_t		idx;
+	int			logflags = 0, error;
+
+	ASSERT(len > 0);
+	ASSERT(len <= (xfs_filblks_t)MAXEXTLEN);
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+	if (unlikely(XFS_TEST_ERROR(
+	    (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS &&
+	     XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE),
+	     mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) {
+		XFS_ERROR_REPORT("xfs_bmapi_remap", XFS_ERRLEVEL_LOW, mp);
+		return -EFSCORRUPTED;
+	}
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
+		if (error)
+			return error;
+	}
+
+	if (xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got)) {
+		/* make sure we only reflink into a hole. */
+		ASSERT(got.br_startoff > bno);
+		ASSERT(got.br_startoff - bno >= len);
+	}
+
+	error = xfs_bmap_remap_alloc(tp, ip, startblock, len);
+	if (error)
+		goto error0;
+
+	if (ifp->if_flags & XFS_IFBROOT) {
+		cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
+		cur->bc_private.b.firstblock = firstblock;
+		cur->bc_private.b.dfops = dfops;
+		cur->bc_private.b.flags = 0;
+	}
+
+	got.br_startoff = bno;
+	got.br_startblock = startblock;
+	got.br_blockcount = len;
+	got.br_state = XFS_EXT_NORM;
+
+	error = xfs_bmap_add_extent_hole_real(tp, ip, XFS_DATA_FORK, &idx, &cur,
+			&got, &firstblock, dfops, &logflags);
+	if (error)
+		goto error0;
+
+	if (xfs_bmap_wants_extents(ip, XFS_DATA_FORK)) {
+		int		tmp_logflags = 0;
+
+		error = xfs_bmap_btree_to_extents(tp, ip, cur,
+			&tmp_logflags, XFS_DATA_FORK);
+		logflags |= tmp_logflags;
+	}
+
+error0:
+	if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS)
+		logflags &= ~XFS_ILOG_DEXT;
+	else if (ip->i_d.di_format != XFS_DINODE_FMT_BTREE)
+		logflags &= ~XFS_ILOG_DBROOT;
+
+	if (logflags)
+		xfs_trans_log_inode(tp, ip, logflags);
+	if (cur) {
+		xfs_btree_del_cursor(cur,
+				error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	}
+	return error;
+}
+
 /*
  * When a delalloc extent is split (e.g., due to a hole punch), the original
  * indlen reservation must be shared across the two new extents that are left
@@ -6493,16 +6573,7 @@ xfs_bmap_finish_one(
 	xfs_filblks_t			blockcount,
 	xfs_exntst_t			state)
 {
-	struct xfs_bmbt_irec		bmap;
-	int				nimaps = 1;
-	xfs_fsblock_t			firstfsb;
-	int				done;
-	int				error = 0;
-
-	bmap.br_startblock = startblock;
-	bmap.br_startoff = startoff;
-	bmap.br_blockcount = blockcount;
-	bmap.br_state = state;
+	int				error = 0, done;
 
 	trace_xfs_bmap_deferred(tp->t_mountp,
 			XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type,
@@ -6519,16 +6590,12 @@ xfs_bmap_finish_one(
 
 	switch (type) {
 	case XFS_BMAP_MAP:
-		firstfsb = bmap.br_startblock;
-		error = xfs_bmapi_write(tp, ip, bmap.br_startoff,
-					bmap.br_blockcount, XFS_BMAPI_REMAP, &firstfsb,
-					bmap.br_blockcount, &bmap, &nimaps,
-					dfops);
+		error = xfs_bmapi_remap(tp, ip, startoff, blockcount,
+				startblock, dfops);
 		break;
 	case XFS_BMAP_UNMAP:
-		error = xfs_bunmapi(tp, ip, bmap.br_startoff,
-				bmap.br_blockcount, XFS_BMAPI_REMAP, 1, &firstfsb,
-				dfops, &done);
+		error = xfs_bunmapi(tp, ip, startoff, blockcount,
+				XFS_BMAPI_REMAP, 1, &startblock, dfops, &done);
 		ASSERT(done);
 		break;
 	default:
-- 
cgit v1.2.3


From bf8eadbacb24e321c99bbdd901589942712810d1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 11 Apr 2017 16:45:56 -0700
Subject: xfs: remove xfs_bmap_remap_alloc

The main thing that xfs_bmap_remap_alloc does is fixing the AGFL, similar
to what we do in the space allocator.  But the reflink code doesn't touch
the allocation btree unlike the normal space allocator, so we couldn't
care less about the state of the AGFL.

So remove xfs_bmap_remap_alloc and just handle the di_nblocks update in
the caller.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_bmap.c | 60 ++----------------------------------------------
 fs/xfs/xfs_trace.h       | 25 --------------------
 2 files changed, 2 insertions(+), 83 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index a1d9086ac737..9f55e5185d56 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3855,61 +3855,6 @@ xfs_bmap_btalloc(
 	return 0;
 }
 
-/*
- * For a remap operation, just "allocate" an extent at the address that the
- * caller passed in, and ensure that the AGFL is the right size.  The caller
- * will then map the "allocated" extent into the file somewhere.
- */
-STATIC int
-xfs_bmap_remap_alloc(
-	struct xfs_trans	*tp,
-	struct xfs_inode	*ip,
-	xfs_fsblock_t		startblock,
-	xfs_extlen_t		length)
-{
-	struct xfs_mount	*mp = tp->t_mountp;
-	struct xfs_alloc_arg	args;
-	int			error;
-
-	/*
-	 * validate that the block number is legal - the enables us to detect
-	 * and handle a silent filesystem corruption rather than crashing.
-	 */
-	memset(&args, 0, sizeof(struct xfs_alloc_arg));
-	args.tp = tp;
-	args.mp = mp;
-	args.agno = XFS_FSB_TO_AGNO(mp, startblock);
-	args.agbno = XFS_FSB_TO_AGBNO(mp, startblock);
-
-	if (args.agno >= mp->m_sb.sb_agcount ||
-	    args.agbno >= mp->m_sb.sb_agblocks)
-		return -EFSCORRUPTED;
-
-	/* "Allocate" the extent from the range we passed in. */
-	trace_xfs_bmap_remap_alloc(ip, startblock, length);
-
-	ip->i_d.di_nblocks += length;
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
-	/* Fix the freelist, like a real allocator does. */
-	args.datatype = XFS_ALLOC_USERDATA | XFS_ALLOC_NOBUSY;
-	args.pag = xfs_perag_get(mp, args.agno);
-	ASSERT(args.pag);
-
-	/*
-	 * The freelist fixing code will decline the allocation if
-	 * the size and shape of the free space doesn't allow for
-	 * allocating the extent and updating all the metadata that
-	 * happens during an allocation.  We're remapping, not
-	 * allocating, so skip that check by pretending to be freeing.
-	 */
-	error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
-	xfs_perag_put(args.pag);
-	if (error)
-		trace_xfs_bmap_remap_alloc_error(ip, error, _RET_IP_);
-	return error;
-}
-
 /*
  * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
  * It figures out where to ask the underlying allocator to put the new extent.
@@ -4811,9 +4756,8 @@ xfs_bmapi_remap(
 		ASSERT(got.br_startoff - bno >= len);
 	}
 
-	error = xfs_bmap_remap_alloc(tp, ip, startblock, len);
-	if (error)
-		goto error0;
+	ip->i_d.di_nblocks += len;
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
 	if (ifp->if_flags & XFS_IFBROOT) {
 		cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 4f96dc953fbe..cba10daf8391 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -3003,31 +3003,6 @@ DEFINE_EVENT(xfs_inode_error_class, name, \
 		 unsigned long caller_ip), \
 	TP_ARGS(ip, error, caller_ip))
 
-/* reflink allocator */
-TRACE_EVENT(xfs_bmap_remap_alloc,
-	TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t fsbno,
-		 xfs_extlen_t len),
-	TP_ARGS(ip, fsbno, len),
-	TP_STRUCT__entry(
-		__field(dev_t, dev)
-		__field(xfs_ino_t, ino)
-		__field(xfs_fsblock_t, fsbno)
-		__field(xfs_extlen_t, len)
-	),
-	TP_fast_assign(
-		__entry->dev = VFS_I(ip)->i_sb->s_dev;
-		__entry->ino = ip->i_ino;
-		__entry->fsbno = fsbno;
-		__entry->len = len;
-	),
-	TP_printk("dev %d:%d ino 0x%llx fsbno 0x%llx len %x",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->ino,
-		  __entry->fsbno,
-		  __entry->len)
-);
-DEFINE_INODE_ERROR_EVENT(xfs_bmap_remap_alloc_error);
-
 /* reflink tracepoint classes */
 
 /* two-file io tracepoint class */
-- 
cgit v1.2.3


From 7590632a33ef2d264665576d3d54e50f906fa758 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 11 Apr 2017 16:45:57 -0700
Subject: xfs: remove bmap block allocation retries

Now that reflink operations don't set the firstblock value we don't
need the workarounds for non-NULL firstblock values without a prior
allocation.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_bmap.c       | 29 -----------------------------
 fs/xfs/libxfs/xfs_bmap_btree.c | 17 -----------------
 2 files changed, 46 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 9f55e5185d56..7f42f6067eb5 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -764,7 +764,6 @@ xfs_bmap_extents_to_btree(
 		args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
 	} else if (dfops->dop_low) {
 		args.type = XFS_ALLOCTYPE_START_BNO;
-try_another_ag:
 		args.fsbno = *firstblock;
 	} else {
 		args.type = XFS_ALLOCTYPE_NEAR_BNO;
@@ -779,20 +778,6 @@ try_another_ag:
 		return error;
 	}
 
-	/*
-	 * During a CoW operation, the allocation and bmbt updates occur in
-	 * different transactions.  The mapping code tries to put new bmbt
-	 * blocks near extents being mapped, but the only way to guarantee this
-	 * is if the alloc and the mapping happen in a single transaction that
-	 * has a block reservation.  That isn't the case here, so if we run out
-	 * of space we'll try again with another AG.
-	 */
-	if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) &&
-	    args.fsbno == NULLFSBLOCK &&
-	    args.type == XFS_ALLOCTYPE_NEAR_BNO) {
-		args.type = XFS_ALLOCTYPE_FIRST_AG;
-		goto try_another_ag;
-	}
 	if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) {
 		xfs_iroot_realloc(ip, -1, whichfork);
 		xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
@@ -925,7 +910,6 @@ xfs_bmap_local_to_extents(
 	 * file currently fits in an inode.
 	 */
 	if (*firstblock == NULLFSBLOCK) {
-try_another_ag:
 		args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
 		args.type = XFS_ALLOCTYPE_START_BNO;
 	} else {
@@ -938,19 +922,6 @@ try_another_ag:
 	if (error)
 		goto done;
 
-	/*
-	 * During a CoW operation, the allocation and bmbt updates occur in
-	 * different transactions.  The mapping code tries to put new bmbt
-	 * blocks near extents being mapped, but the only way to guarantee this
-	 * is if the alloc and the mapping happen in a single transaction that
-	 * has a block reservation.  That isn't the case here, so if we run out
-	 * of space we'll try again with another AG.
-	 */
-	if (xfs_sb_version_hasreflink(&ip->i_mount->m_sb) &&
-	    args.fsbno == NULLFSBLOCK &&
-	    args.type == XFS_ALLOCTYPE_NEAR_BNO) {
-		goto try_another_ag;
-	}
 	/* Can't fail, the space was reserved. */
 	ASSERT(args.fsbno != NULLFSBLOCK);
 	ASSERT(args.len == 1);
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index fd55db479385..3e17ceda038c 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -448,7 +448,6 @@ xfs_bmbt_alloc_block(
 	if (args.fsbno == NULLFSBLOCK) {
 		args.fsbno = be64_to_cpu(start->l);
 		args.type = XFS_ALLOCTYPE_START_BNO;
-try_another_ag:
 		/*
 		 * Make sure there is sufficient room left in the AG to
 		 * complete a full tree split for an extent insert.  If
@@ -477,22 +476,6 @@ try_another_ag:
 	if (error)
 		goto error0;
 
-	/*
-	 * During a CoW operation, the allocation and bmbt updates occur in
-	 * different transactions.  The mapping code tries to put new bmbt
-	 * blocks near extents being mapped, but the only way to guarantee this
-	 * is if the alloc and the mapping happen in a single transaction that
-	 * has a block reservation.  That isn't the case here, so if we run out
-	 * of space we'll try again with another AG.
-	 */
-	if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) &&
-	    args.fsbno == NULLFSBLOCK &&
-	    args.type == XFS_ALLOCTYPE_NEAR_BNO) {
-		args.fsbno = cur->bc_private.b.firstblock;
-		args.type = XFS_ALLOCTYPE_FIRST_AG;
-		goto try_another_ag;
-	}
-
 	if (args.fsbno == NULLFSBLOCK && args.minleft) {
 		/*
 		 * Could not find an AG with enough free space to satisfy
-- 
cgit v1.2.3


From a008c31c7ef9a4106dbadf21b3bcb7e89826a5d7 Mon Sep 17 00:00:00 2001
From: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Date: Wed, 12 Apr 2017 11:03:20 -0700
Subject: iomap_dio_rw: Prevent reading file data beyond iomap_dio->i_size

On a ppc64 machine executing overlayfs/019 with xfs as the lower and
upper filesystem causes the following call trace,

WARNING: CPU: 2 PID: 8034 at /root/repos/linux/fs/iomap.c:765 .iomap_dio_actor+0xcc/0x420
Modules linked in:
CPU: 2 PID: 8034 Comm: fsstress Tainted: G             L  4.11.0-rc5-next-20170405 #100
task: c000000631314880 task.stack: c0000003915d4000
NIP: c00000000035a72c LR: c00000000035a6f4 CTR: c00000000035a660
REGS: c0000003915d7570 TRAP: 0700   Tainted: G             L   (4.11.0-rc5-next-20170405)
MSR: 800000000282b032 <SF,VEC,VSX,EE,FP,ME,IR,DR,RI>
  CR: 24004284  XER: 00000000
CFAR: c0000000006f7190 SOFTE: 1
GPR00: c00000000035a6f4 c0000003915d77f0 c0000000015a3f00 000000007c22f600
GPR04: 000000000022d000 0000000000002600 c0000003b2d56360 c0000003915d7960
GPR08: c0000003915d7cd0 0000000000000002 0000000000002600 c000000000521cc0
GPR12: 0000000024004284 c00000000fd80a00 000000004b04ae64 ffffffffffffffff
GPR16: 000000001000ca70 0000000000000000 c0000003b2d56380 c00000000153d2b8
GPR20: 0000000000000010 c0000003bc87bac8 0000000000223000 000000000022f5ff
GPR24: c0000003b2d56360 000000000000000c 0000000000002600 000000000022d000
GPR28: 0000000000000000 c0000003915d7960 c0000003b2d56360 00000000000001ff
NIP [c00000000035a72c] .iomap_dio_actor+0xcc/0x420
LR [c00000000035a6f4] .iomap_dio_actor+0x94/0x420
Call Trace:
[c0000003915d77f0] [c00000000035a6f4] .iomap_dio_actor+0x94/0x420 (unreliable)
[c0000003915d78f0] [c00000000035b9f4] .iomap_apply+0xf4/0x1f0
[c0000003915d79d0] [c00000000035c320] .iomap_dio_rw+0x230/0x420
[c0000003915d7ae0] [c000000000512a14] .xfs_file_dio_aio_read+0x84/0x160
[c0000003915d7b80] [c000000000512d24] .xfs_file_read_iter+0x104/0x130
[c0000003915d7c10] [c0000000002d6234] .__vfs_read+0x114/0x1a0
[c0000003915d7cf0] [c0000000002d7a8c] .vfs_read+0xac/0x1a0
[c0000003915d7d90] [c0000000002d96b8] .SyS_read+0x58/0x100
[c0000003915d7e30] [c00000000000b8e0] system_call+0x38/0xfc
Instruction dump:
78630020 7f831b78 7ffc07b4 7c7ce039 40820360 a13d0018 2f890003 419e0288
2f890004 419e00a0 2f890001 419e02a8 <0fe00000> 3b80fffb 38210100 7f83e378

The above problem can also be recreated on a regular xfs filesystem
using the command,

$ fsstress -d /mnt -l 1000 -n 1000 -p 1000

The reason for the call trace is,
1. When 'reserving' blocks for delayed allocation , XFS reserves more
   blocks (i.e. past file's current EOF) than required. This is done
   because XFS assumes that userspace might write more data and hence
   'reserving' more blocks might lead to the file's new data being
   stored contiguously on disk.
2. The in-memory 'struct xfs_bmbt_irec' mapping the file's last extent would
   then cover the prealloc-ed EOF blocks in addition to the regular blocks.
3. When flushing the dirty blocks to disk, we only flush data till the
   file's EOF. But before writing out the dirty data, we allocate blocks
   on the disk for holding the file's new data. This allocation includes
   the blocks that are part of the 'prealloc EOF blocks'.
4. Later, when the last reference to the inode is being closed, XFS frees the
   unused 'prealloc EOF blocks' in xfs_inactive().

In step 3 above, When allocating space on disk for the delayed allocation
range, the space allocator might sometimes allocate less blocks than
required. If such an allocation ends right at the current EOF of the
file, We will not be able to clear the "delayed allocation" flag for the
'prealloc EOF blocks', since we won't have dirty buffer heads associated
with that range of the file.

In such a situation if a Direct I/O read operation is performed on file
range [X, Y] (where X < EOF and Y > EOF), we flush dirty data in the
range [X, Y] and invalidate page cache for that range (Refer to
iomap_dio_rw()). Later for performing the Direct I/O read, XFS obtains
the extent items (which are still cached in memory) for the file
range. When doing so we are not supposed to get an extent item with
IOMAP_DELALLOC flag set, since the previous "flush" operation should
have converted any delayed allocation data in the range [X, Y]. Hence we
end up hitting a WARN_ON_ONCE(1) statement in iomap_dio_actor().

This commit fixes the bug by preventing the read operation from going
beyond iomap_dio->i_size.

Reported-by: Santhosh G <santhog4@linux.vnet.ibm.com>
Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/iomap.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/iomap.c b/fs/iomap.c
index 141c3cd55a8b..0ca8b0fbbdad 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -911,6 +911,9 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
 			break;
 		}
 		pos += ret;
+
+		if (iov_iter_rw(iter) == READ && pos >= dio->i_size)
+			break;
 	} while ((count = iov_iter_count(iter)) > 0);
 	blk_finish_plug(&plug);
 
-- 
cgit v1.2.3


From 023cc840b40fad95c6fe26fff1d380a8c9d45939 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Thu, 13 Apr 2017 15:15:47 -0700
Subject: xfs: handle array index overrun in xfs_dir2_leaf_readbuf()

Carlos had a case where "find" seemed to start spinning
forever and never return.

This was on a filesystem with non-default multi-fsb (8k)
directory blocks, and a fragmented directory with extents
like this:

0:[0,133646,2,0]
1:[2,195888,1,0]
2:[3,195890,1,0]
3:[4,195892,1,0]
4:[5,195894,1,0]
5:[6,195896,1,0]
6:[7,195898,1,0]
7:[8,195900,1,0]
8:[9,195902,1,0]
9:[10,195908,1,0]
10:[11,195910,1,0]
11:[12,195912,1,0]
12:[13,195914,1,0]
...

i.e. the first extent is a contiguous 2-fsb dir block, but
after that it is fragmented into 1 block extents.

At the top of the readdir path, we allocate a mapping array
which (for this filesystem geometry) can hold 10 extents; see
the assignment to map_info->map_size.  During readdir, we are
therefore able to map extents 0 through 9 above into the array
for readahead purposes.  If we count by 2, we see that the last
mapped index (9) is the first block of a 2-fsb directory block.

At the end of xfs_dir2_leaf_readbuf() we have 2 loops to fill
more readahead; the outer loop assumes one full dir block is
processed each loop iteration, and an inner loop that ensures
that this is so by advancing to the next extent until a full
directory block is mapped.

The problem is that this inner loop may step past the last
extent in the mapping array as it tries to reach the end of
the directory block.  This will read garbage for the extent
length, and as a result the loop control variable 'j' may
become corrupted and never fail the loop conditional.

The number of valid mappings we have in our array is stored
in map->map_valid, so stop this inner loop based on that limit.

There is an ASSERT at the top of the outer loop for this
same condition, but we never made it out of the inner loop,
so the ASSERT never fired.

Huge appreciation for Carlos for debugging and isolating
the problem.

Debugged-and-analyzed-by: Carlos Maiolino <cmaiolino@redhat.com>
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Tested-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>
Reviewed-by: Bill O'Donnell <billodo@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_dir2_readdir.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index ad9396e516f6..c45de729c74e 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -394,6 +394,7 @@ xfs_dir2_leaf_readbuf(
 
 	/*
 	 * Do we need more readahead?
+	 * Each loop tries to process 1 full dir blk; last may be partial.
 	 */
 	blk_start_plug(&plug);
 	for (mip->ra_index = mip->ra_offset = i = 0;
@@ -425,9 +426,14 @@ xfs_dir2_leaf_readbuf(
 		}
 
 		/*
-		 * Advance offset through the mapping table.
+		 * Advance offset through the mapping table, processing a full
+		 * dir block even if it is fragmented into several extents.
+		 * But stop if we have consumed all valid mappings, even if
+		 * it's not yet a full directory block.
 		 */
-		for (j = 0; j < geo->fsbcount; j += length ) {
+		for (j = 0;
+		     j < geo->fsbcount && mip->ra_index < mip->map_valid;
+		     j += length ) {
 			/*
 			 * The rest of this extent but not more than a dir
 			 * block.
-- 
cgit v1.2.3


From cb52ee334a45ae6c78a3999e4b473c43ddc528f4 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Thu, 20 Apr 2017 08:06:47 -0700
Subject: xfs: prevent multi-fsb dir readahead from reading random blocks

Directory block readahead uses a complex iteration mechanism to map
between high-level directory blocks and underlying physical extents.
This mechanism attempts to traverse the higher-level dir blocks in a
manner that handles multi-fsb directory blocks and simultaneously
maintains a reference to the corresponding physical blocks.

This logic doesn't handle certain (discontiguous) physical extent
layouts correctly with multi-fsb directory blocks. For example,
consider the case of a 4k FSB filesystem with a 2 FSB (8k) directory
block size and a directory with the following extent layout:

 EXT: FILE-OFFSET      BLOCK-RANGE      AG AG-OFFSET        TOTAL
   0: [0..7]:          88..95            0 (88..95)             8
   1: [8..15]:         80..87            0 (80..87)             8
   2: [16..39]:        168..191          0 (168..191)          24
   3: [40..63]:        5242952..5242975  1 (72..95)            24

Directory block 0 spans physical extents 0 and 1, dirblk 1 lies
entirely within extent 2 and dirblk 2 spans extents 2 and 3. Because
extent 2 is larger than the directory block size, the readahead code
erroneously assumes the block is contiguous and issues a readahead
based on the physical mapping of the first fsb of the dirblk. This
results in read verifier failure and a spurious corruption or crc
failure, depending on the filesystem format.

Further, the subsequent readahead code responsible for walking
through the physical table doesn't correctly advance the physical
block reference for dirblk 2. Instead of advancing two physical
filesystem blocks, the first iteration of the loop advances 1 block
(correctly), but the subsequent iteration advances 2 more physical
blocks because the next physical extent (extent 3, above) happens to
cover more than dirblk 2. At this point, the higher-level directory
block walking is completely off the rails of the actual physical
layout of the directory for the respective mapping table.

Update the contiguous dirblock logic to consider the current offset
in the physical extent to avoid issuing directory readahead to
unrelated blocks. Also, update the mapping table advancing code to
consider the current offset within the current dirblock to avoid
advancing the mapping reference too far beyond the dirblock.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_dir2_readdir.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index c45de729c74e..20b7a5c6eb2f 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -405,7 +405,8 @@ xfs_dir2_leaf_readbuf(
 		 * Read-ahead a contiguous directory block.
 		 */
 		if (i > mip->ra_current &&
-		    map[mip->ra_index].br_blockcount >= geo->fsbcount) {
+		    (map[mip->ra_index].br_blockcount - mip->ra_offset) >=
+		    geo->fsbcount) {
 			xfs_dir3_data_readahead(dp,
 				map[mip->ra_index].br_startoff + mip->ra_offset,
 				XFS_FSB_TO_DADDR(dp->i_mount,
@@ -438,7 +439,7 @@ xfs_dir2_leaf_readbuf(
 			 * The rest of this extent but not more than a dir
 			 * block.
 			 */
-			length = min_t(int, geo->fsbcount,
+			length = min_t(int, geo->fsbcount - j,
 					map[mip->ra_index].br_blockcount -
 							mip->ra_offset);
 			mip->ra_offset += length;
-- 
cgit v1.2.3


From 42bf9dba40086e3d18dc29eb379f13df219d3f22 Mon Sep 17 00:00:00 2001
From: Hou Tao <houtao1@huawei.com>
Date: Fri, 14 Apr 2017 11:43:27 -0700
Subject: xfs: remove the trailing newline used in the fmt parameter of
 TP_printk

The trailing newlines wil lead to extra newlines in the trace file
which looks like the following output, so remove them.
>kworker/4:1H-1508  [004] .... 47879.101608: xfs_discard_extent: dev 8:0
>
>kworker/u16:2-238  [004] .... 47879.101725: xfs_extent_busy_clear: dev 8:0

Signed-off-by: Hou Tao <houtao1@huawei.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
[darrick: fix the getfsmap tracepoints too]
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_trace.h | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index cba10daf8391..09da3c18850b 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -2192,7 +2192,7 @@ DECLARE_EVENT_CLASS(xfs_discard_class,
 		__entry->agbno = agbno;
 		__entry->len = len;
 	),
-	TP_printk("dev %d:%d agno %u agbno %u len %u\n",
+	TP_printk("dev %d:%d agno %u agbno %u len %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->agno,
 		  __entry->agbno,
@@ -2264,7 +2264,7 @@ DECLARE_EVENT_CLASS(xfs_defer_class,
 		__entry->committed = dop->dop_committed;
 		__entry->low = dop->dop_low;
 	),
-	TP_printk("dev %d:%d ops %p committed %d low %d\n",
+	TP_printk("dev %d:%d ops %p committed %d low %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->dop,
 		  __entry->committed,
@@ -2292,7 +2292,7 @@ DECLARE_EVENT_CLASS(xfs_defer_error_class,
 		__entry->low = dop->dop_low;
 		__entry->error = error;
 	),
-	TP_printk("dev %d:%d ops %p committed %d low %d err %d\n",
+	TP_printk("dev %d:%d ops %p committed %d low %d err %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->dop,
 		  __entry->committed,
@@ -2321,7 +2321,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class,
 		__entry->committed = dfp->dfp_done != NULL;
 		__entry->nr = dfp->dfp_count;
 	),
-	TP_printk("dev %d:%d optype %d intent %p committed %d nr %d\n",
+	TP_printk("dev %d:%d optype %d intent %p committed %d nr %d",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->type,
 		  __entry->intent,
@@ -2616,7 +2616,8 @@ DECLARE_EVENT_CLASS(xfs_ag_resv_class,
 		__entry->asked = r ? r->ar_asked : 0;
 		__entry->len = len;
 	),
-	TP_printk("dev %d:%d agno %u resv %d freeblks %u flcount %u resv %u ask %u len %u\n",
+	TP_printk("dev %d:%d agno %u resv %d freeblks %u flcount %u "
+		  "resv %u ask %u len %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->agno,
 		  __entry->resv,
@@ -2669,7 +2670,7 @@ DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class,
 		__entry->agbno = agbno;
 		__entry->dir = dir;
 	),
-	TP_printk("dev %d:%d agno %u agbno %u cmp %s(%d)\n",
+	TP_printk("dev %d:%d agno %u agbno %u cmp %s(%d)",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->agno,
 		  __entry->agbno,
@@ -2702,7 +2703,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class,
 		__entry->blockcount = irec->rc_blockcount;
 		__entry->refcount = irec->rc_refcount;
 	),
-	TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u\n",
+	TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->agno,
 		  __entry->startblock,
@@ -2737,7 +2738,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class,
 		__entry->refcount = irec->rc_refcount;
 		__entry->agbno = agbno;
 	),
-	TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u @ agbno %u\n",
+	TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u @ agbno %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->agno,
 		  __entry->startblock,
@@ -2778,7 +2779,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
 		__entry->i2_refcount = i2->rc_refcount;
 	),
 	TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- "
-		  "agbno %u len %u refcount %u\n",
+		  "agbno %u len %u refcount %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->agno,
 		  __entry->i1_startblock,
@@ -2824,7 +2825,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
 		__entry->agbno = agbno;
 	),
 	TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- "
-		  "agbno %u len %u refcount %u @ agbno %u\n",
+		  "agbno %u len %u refcount %u @ agbno %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->agno,
 		  __entry->i1_startblock,
@@ -2877,7 +2878,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
 	),
 	TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- "
 		  "agbno %u len %u refcount %u -- "
-		  "agbno %u len %u refcount %u\n",
+		  "agbno %u len %u refcount %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->agno,
 		  __entry->i1_startblock,
@@ -3204,7 +3205,7 @@ TRACE_EVENT(xfs_ioctl_clone,
 	),
 	TP_printk("dev %d:%d "
 		  "ino 0x%lx isize 0x%llx -> "
-		  "ino 0x%lx isize 0x%llx\n",
+		  "ino 0x%lx isize 0x%llx",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->src_ino,
 		  __entry->src_isize,
@@ -3269,7 +3270,7 @@ DECLARE_EVENT_CLASS(xfs_fsmap_class,
 		__entry->offset = rmap->rm_offset;
 		__entry->flags = rmap->rm_flags;
 	),
-	TP_printk("dev %d:%d keydev %d:%d agno %u bno %llu len %llu owner %lld offset %llu flags 0x%x\n",
+	TP_printk("dev %d:%d keydev %d:%d agno %u bno %llu len %llu owner %lld offset %llu flags 0x%x",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  MAJOR(__entry->keydev), MINOR(__entry->keydev),
 		  __entry->agno,
@@ -3309,7 +3310,7 @@ DECLARE_EVENT_CLASS(xfs_getfsmap_class,
 		__entry->offset = fsmap->fmr_offset;
 		__entry->flags = fsmap->fmr_flags;
 	),
-	TP_printk("dev %d:%d keydev %d:%d block %llu len %llu owner %lld offset %llu flags 0x%llx\n",
+	TP_printk("dev %d:%d keydev %d:%d block %llu len %llu owner %lld offset %llu flags 0x%llx",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  MAJOR(__entry->keydev), MINOR(__entry->keydev),
 		  __entry->block,
-- 
cgit v1.2.3


From 90115407c5847828d82af9bc139f690600a36219 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Wed, 19 Apr 2017 12:55:57 -0700
Subject: xfs: remove use of do_div with 32-bit dividend in quota

The kbuild test robot caught this; in debug code we have another
caller of do_div with a 32-bit dividend (j) which is caught now
that we are using the kernel-supplied do_div.

None of the values used here are 64-bit; just use simple division.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_qm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index b669b123287b..4ac5b187534e 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -851,8 +851,8 @@ xfs_qm_reset_dqcounts(
 	 * started afresh by xfs_qm_quotacheck.
 	 */
 #ifdef DEBUG
-	j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB);
-	do_div(j, sizeof(xfs_dqblk_t));
+	j = (int)XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) /
+		sizeof(xfs_dqblk_t);
 	ASSERT(mp->m_quotainfo->qi_dqperchunk == j);
 #endif
 	dqb = bp->b_addr;
-- 
cgit v1.2.3


From 4f1adf3373f072246c14119b2aa6dfb4d6510a43 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Wed, 19 Apr 2017 15:19:32 -0700
Subject: xfs: more do_div cleanups

On some architectures do_div does the pointer compare
trick to make sure that we've sent it an unsigned 64-bit
number.  (Why unsigned?  I don't know.)

Fix up the few places that squawk about this; in
xfs_bmap_wants_extents() we just used a bare int64_t so change
that to unsigned.

In xfs_adjust_extent_unmap_boundaries() all we wanted was the
mod, and we have an xfs-specific function to handle that w/o
side effects, which includes proper casting for do_div.

In xfs_daddr_to_ag[b]no, we were using the wrong type anyway;
XFS_BB_TO_FSBT returns a block in the filesystem, so use
xfs_rfsblock_t not xfs_daddr_t, and gain the unsignedness
from that type as a bonus.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_bmap.c | 2 +-
 fs/xfs/xfs_bmap_util.c   | 5 +----
 fs/xfs/xfs_mount.h       | 4 ++--
 3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 7f42f6067eb5..0fdff08145c1 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -4887,7 +4887,7 @@ xfs_bmap_del_extent_delay(
 	ASSERT(got_endoff >= del_endoff);
 
 	if (isrt) {
-		int64_t rtexts = XFS_FSB_TO_B(mp, del->br_blockcount);
+		uint64_t rtexts = XFS_FSB_TO_B(mp, del->br_blockcount);
 
 		do_div(rtexts, mp->m_sb.sb_rextsize);
 		xfs_mod_frextents(mp, rtexts);
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index de94798f1c1b..7ac80a1facf2 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1206,11 +1206,8 @@ xfs_adjust_extent_unmap_boundaries(
 		return error;
 
 	if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
-		xfs_daddr_t	block;
-
 		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-		block = imap.br_startblock;
-		mod = do_div(block, mp->m_sb.sb_rextsize);
+		mod = do_mod(imap.br_startblock, mp->m_sb.sb_rextsize);
 		if (mod)
 			*startoffset_fsb += mp->m_sb.sb_rextsize - mod;
 	}
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 22b2185e93a0..9fa312a41c93 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -313,7 +313,7 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
 static inline xfs_agnumber_t
 xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d)
 {
-	xfs_daddr_t ld = XFS_BB_TO_FSBT(mp, d);
+	xfs_rfsblock_t ld = XFS_BB_TO_FSBT(mp, d);
 	do_div(ld, mp->m_sb.sb_agblocks);
 	return (xfs_agnumber_t) ld;
 }
@@ -321,7 +321,7 @@ xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d)
 static inline xfs_agblock_t
 xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
 {
-	xfs_daddr_t ld = XFS_BB_TO_FSBT(mp, d);
+	xfs_rfsblock_t ld = XFS_BB_TO_FSBT(mp, d);
 	return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks);
 }
 
-- 
cgit v1.2.3


From 895e9bfc9e36c8e39ea4d2129dc2cbde1aafec99 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 19 Apr 2017 15:19:45 -0700
Subject: xfs: remove the unused XFS_MAXLINK_1 define

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_format.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 6b7579e7b60a..d114ed80a076 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -930,10 +930,8 @@ static inline uint xfs_dinode_size(int version)
 /*
  * The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
  * Since the pathconf interface is signed, we use 2^31 - 1 instead.
- * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX.
  */
 #define	XFS_MAXLINK		((1U << 31) - 1U)
-#define	XFS_MAXLINK_1		65535U
 
 /*
  * Values for di_format
-- 
cgit v1.2.3


From 37f7f9bbf3b914e94f81426f6f59a3f97f4dc562 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 19 Apr 2017 15:19:49 -0700
Subject: xfs: remove unused values from xfs_exntst_t

We only ever use the normal and unwritten states.  And the actual
ondisk format (this enum isn't despite being in xfs_format.h) only
has space for the unwritten bit anyway.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_format.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index d114ed80a076..8425884668a8 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1588,7 +1588,6 @@ typedef enum {
  */
 typedef enum {
 	XFS_EXT_NORM, XFS_EXT_UNWRITTEN,
-	XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID
 } xfs_exntst_t;
 
 /*
-- 
cgit v1.2.3


From 0c1d9e4a61590c2a4d657d1deddd1674f1565097 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 20 Apr 2017 09:42:48 -0700
Subject: xfs: simplify validation of the unwritten extent bit

XFS only supports the unwritten extent bit in the data fork, and only if
the file system has a version 5 superblock or the unwritten extent
feature bit.

We currently have two routines that validate the invariant:
xfs_check_nostate_extents which return -EFSCORRUPTED when it's not met,
and xfs_validate_extent that triggers and assert in debug build.

Both of them iterate over all extents of an inode fork when called,
which isn't very efficient.

This patch instead adds a new helper that verifies the invariant one
extent at a time, and calls it from the places where we iterate over
all extents to converted them from or two the in-memory format.  The
callers then return -EFSCORRUPTED when reading invalid extents from
disk, or trigger an assert when writing them to disk.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/libxfs/xfs_bmap.c       | 16 +-------
 fs/xfs/libxfs/xfs_bmap.h       |  2 -
 fs/xfs/libxfs/xfs_bmap_btree.c | 26 ------------
 fs/xfs/libxfs/xfs_bmap_btree.h | 21 ++++++----
 fs/xfs/libxfs/xfs_format.h     |  8 ----
 fs/xfs/libxfs/xfs_inode_fork.c | 90 ++++++++++++------------------------------
 6 files changed, 41 insertions(+), 122 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 0fdff08145c1..f02eb7673392 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -1231,7 +1231,6 @@ xfs_bmap_read_extents(
 	xfs_fsblock_t		bno;	/* block # of "block" */
 	xfs_buf_t		*bp;	/* buffer for "block" */
 	int			error;	/* error return value */
-	xfs_exntfmt_t		exntf;	/* XFS_EXTFMT_NOSTATE, if checking */
 	xfs_extnum_t		i, j;	/* index into the extents list */
 	xfs_ifork_t		*ifp;	/* fork structure */
 	int			level;	/* btree level, for checking */
@@ -1242,8 +1241,6 @@ xfs_bmap_read_extents(
 
 	mp = ip->i_mount;
 	ifp = XFS_IFORK_PTR(ip, whichfork);
-	exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE :
-					XFS_EXTFMT_INODE(ip);
 	block = ifp->if_broot;
 	/*
 	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
@@ -1311,18 +1308,9 @@ xfs_bmap_read_extents(
 			xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
 			trp->l0 = be64_to_cpu(frp->l0);
 			trp->l1 = be64_to_cpu(frp->l1);
-		}
-		if (exntf == XFS_EXTFMT_NOSTATE) {
-			/*
-			 * Check all attribute bmap btree records and
-			 * any "older" data bmap btree records for a
-			 * set bit in the "extent flag" position.
-			 */
-			if (unlikely(xfs_check_nostate_extents(ifp,
-					start, num_recs))) {
+			if (!xfs_bmbt_validate_extent(mp, whichfork, trp)) {
 				XFS_ERROR_REPORT("xfs_bmap_read_extents(2)",
-						 XFS_ERRLEVEL_LOW,
-						 ip->i_mount);
+						 XFS_ERRLEVEL_LOW, mp);
 				goto error0;
 			}
 		}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index a6e612cabe15..c35a14fa1527 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -244,8 +244,6 @@ int	xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork,
 		struct xfs_bmbt_irec *del);
 void	xfs_bmap_del_extent_cow(struct xfs_inode *ip, xfs_extnum_t *idx,
 		struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del);
-int	xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
-		xfs_extnum_t num);
 uint	xfs_default_attroffset(struct xfs_inode *ip);
 int	xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip,
 		xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 3e17ceda038c..6cba69aff077 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -366,32 +366,6 @@ xfs_bmbt_to_bmdr(
 	memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
 }
 
-/*
- * Check extent records, which have just been read, for
- * any bit in the extent flag field. ASSERT on debug
- * kernels, as this condition should not occur.
- * Return an error condition (1) if any flags found,
- * otherwise return 0.
- */
-
-int
-xfs_check_nostate_extents(
-	xfs_ifork_t		*ifp,
-	xfs_extnum_t		idx,
-	xfs_extnum_t		num)
-{
-	for (; num > 0; num--, idx++) {
-		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx);
-		if ((ep->l0 >>
-		     (64 - BMBT_EXNTFLAG_BITLEN)) != 0) {
-			ASSERT(0);
-			return 1;
-		}
-	}
-	return 0;
-}
-
-
 STATIC struct xfs_btree_cur *
 xfs_bmbt_dup_cursor(
 	struct xfs_btree_cur	*cur)
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 90347a99c6d2..9da5a8d4f184 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -24,13 +24,6 @@ struct xfs_mount;
 struct xfs_inode;
 struct xfs_trans;
 
-/*
- * Extent state and extent format macros.
- */
-#define XFS_EXTFMT_INODE(x)	\
-	(xfs_sb_version_hasextflgbit(&((x)->i_mount->m_sb)) ? \
-		XFS_EXTFMT_HASSTATE : XFS_EXTFMT_NOSTATE)
-
 /*
  * Btree block header size depends on a superblock flag.
  */
@@ -139,4 +132,18 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
 extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
 		struct xfs_trans *, struct xfs_inode *, int);
 
+/*
+ * Check that the extent does not contain an invalid unwritten extent flag.
+ */
+static inline bool xfs_bmbt_validate_extent(struct xfs_mount *mp, int whichfork,
+		struct xfs_bmbt_rec_host *ep)
+{
+	if (ep->l0 >> (64 - BMBT_EXNTFLAG_BITLEN) == 0)
+		return true;
+	if (whichfork == XFS_DATA_FORK &&
+	    xfs_sb_version_hasextflgbit(&mp->m_sb))
+		return true;
+	return false;
+}
+
 #endif	/* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 8425884668a8..a1dccd8d96bc 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1575,14 +1575,6 @@ static inline xfs_filblks_t startblockval(xfs_fsblock_t x)
 	return (xfs_filblks_t)((x) & ~STARTBLOCKMASK);
 }
 
-/*
- * Possible extent formats.
- */
-typedef enum {
-	XFS_EXTFMT_NOSTATE = 0,
-	XFS_EXTFMT_HASSTATE
-} xfs_exntfmt_t;
-
 /*
  * Possible extent states.
  */
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 8a37efe04de3..0e80f34fe97c 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -42,35 +42,6 @@ STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
 STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
 STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
 
-#ifdef DEBUG
-/*
- * Make sure that the extents in the given memory buffer
- * are valid.
- */
-void
-xfs_validate_extents(
-	xfs_ifork_t		*ifp,
-	int			nrecs,
-	xfs_exntfmt_t		fmt)
-{
-	xfs_bmbt_irec_t		irec;
-	xfs_bmbt_rec_host_t	rec;
-	int			i;
-
-	for (i = 0; i < nrecs; i++) {
-		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
-		rec.l0 = get_unaligned(&ep->l0);
-		rec.l1 = get_unaligned(&ep->l1);
-		xfs_bmbt_get_all(&rec, &irec);
-		if (fmt == XFS_EXTFMT_NOSTATE)
-			ASSERT(irec.br_state == XFS_EXT_NORM);
-	}
-}
-#else /* DEBUG */
-#define xfs_validate_extents(ifp, nrecs, fmt)
-#endif /* DEBUG */
-
-
 /*
  * Move inode type and inode format specific information from the
  * on-disk inode to the in-core inode.  For fifos, devs, and sockets
@@ -352,40 +323,33 @@ xfs_iformat_local(
 }
 
 /*
- * The file consists of a set of extents all
- * of which fit into the on-disk inode.
- * If there are few enough extents to fit into
- * the if_inline_ext, then copy them there.
- * Otherwise allocate a buffer for them and copy
- * them into it.  Either way, set if_extents
- * to point at the extents.
+ * The file consists of a set of extents all of which fit into the on-disk
+ * inode.  If there are few enough extents to fit into the if_inline_ext, then
+ * copy them there.  Otherwise allocate a buffer for them and copy them into it.
+ * Either way, set if_extents to point at the extents.
  */
 STATIC int
 xfs_iformat_extents(
-	xfs_inode_t	*ip,
-	xfs_dinode_t	*dip,
-	int		whichfork)
+	struct xfs_inode	*ip,
+	struct xfs_dinode	*dip,
+	int			whichfork)
 {
-	xfs_bmbt_rec_t	*dp;
-	xfs_ifork_t	*ifp;
-	int		nex;
-	int		size;
-	int		i;
-
-	ifp = XFS_IFORK_PTR(ip, whichfork);
-	nex = XFS_DFORK_NEXTENTS(dip, whichfork);
-	size = nex * (uint)sizeof(xfs_bmbt_rec_t);
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
+	int			nex = XFS_DFORK_NEXTENTS(dip, whichfork);
+	int			size = nex * sizeof(xfs_bmbt_rec_t);
+	struct xfs_bmbt_rec	*dp;
+	int			i;
 
 	/*
-	 * If the number of extents is unreasonable, then something
-	 * is wrong and we just bail out rather than crash in
-	 * kmem_alloc() or memcpy() below.
+	 * If the number of extents is unreasonable, then something is wrong and
+	 * we just bail out rather than crash in kmem_alloc() or memcpy() below.
 	 */
-	if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
+	if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) {
 		xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
 			(unsigned long long) ip->i_ino, nex);
 		XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
-				     ip->i_mount, dip);
+				     mp, dip);
 		return -EFSCORRUPTED;
 	}
 
@@ -400,22 +364,17 @@ xfs_iformat_extents(
 	ifp->if_bytes = size;
 	if (size) {
 		dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
-		xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
 		for (i = 0; i < nex; i++, dp++) {
 			xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
 			ep->l0 = get_unaligned_be64(&dp->l0);
 			ep->l1 = get_unaligned_be64(&dp->l1);
+			if (!xfs_bmbt_validate_extent(mp, whichfork, ep)) {
+				XFS_ERROR_REPORT("xfs_iformat_extents(2)",
+						 XFS_ERRLEVEL_LOW, mp);
+				return -EFSCORRUPTED;
+			}
 		}
 		XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
-		if (whichfork != XFS_DATA_FORK ||
-			XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
-				if (unlikely(xfs_check_nostate_extents(
-				    ifp, 0, nex))) {
-					XFS_ERROR_REPORT("xfs_iformat_extents(2)",
-							 XFS_ERRLEVEL_LOW,
-							 ip->i_mount);
-					return -EFSCORRUPTED;
-				}
 	}
 	ifp->if_flags |= XFS_IFEXTENTS;
 	return 0;
@@ -518,7 +477,6 @@ xfs_iread_extents(
 		xfs_iext_destroy(ifp);
 		return error;
 	}
-	xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
 	ifp->if_flags |= XFS_IFEXTENTS;
 	return 0;
 }
@@ -837,6 +795,9 @@ xfs_iextents_copy(
 	copied = 0;
 	for (i = 0; i < nrecs; i++) {
 		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
+
+		ASSERT(xfs_bmbt_validate_extent(ip->i_mount, whichfork, ep));
+
 		start_block = xfs_bmbt_get_startblock(ep);
 		if (isnullstartblock(start_block)) {
 			/*
@@ -852,7 +813,6 @@ xfs_iextents_copy(
 		copied++;
 	}
 	ASSERT(copied != 0);
-	xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));
 
 	return (copied * (uint)sizeof(xfs_bmbt_rec_t));
 }
-- 
cgit v1.2.3


From fad5656b2247244c9fe4f30d63e4c190c5725840 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 21 Apr 2017 11:24:39 -0700
Subject: xfs: use NULL instead of 0 to initialize a pointer in xfs_getfsmap

Found by sparse.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_fsmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 48ce356da9e9..3683819887a5 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -827,7 +827,7 @@ xfs_getfsmap(
 	struct xfs_trans		*tp = NULL;
 	struct xfs_fsmap		dkeys[2];	/* per-dev keys */
 	struct xfs_getfsmap_dev		handlers[XFS_GETFSMAP_DEVS];
-	struct xfs_getfsmap_info	info = {0};
+	struct xfs_getfsmap_info	info = { NULL };
 	int				i;
 	int				error = 0;
 
-- 
cgit v1.2.3


From ef2b67ecf8ddbcb30f98f7ce6876872fb3e925f6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 21 Apr 2017 11:24:40 -0700
Subject: xfs: use NULL instead of 0 to initialize a pointer in
 xfs_ioc_getfsmap

Found by sparse.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index fee6b4a3a83d..0f8bed9a7e4c 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1640,7 +1640,7 @@ xfs_ioc_getfsmap(
 	struct xfs_inode	*ip,
 	void			__user *arg)
 {
-	struct getfsmap_info	info = {0};
+	struct getfsmap_info	info = { NULL };
 	struct xfs_fsmap_head	xhead = {0};
 	struct fsmap_head	head;
 	bool			aborted = false;
-- 
cgit v1.2.3


From e2a641922a3592b5ea226624d5abeb13eb49622c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 21 Apr 2017 11:24:40 -0700
Subject: xfs: corruption needs to respect endianess too!

At least if we want to be able to recognize the pattern.  Add a missing
byte swap to the corruption injection case in xlog_sync.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_log.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index bb58cd1873c9..3731f13f63e9 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1852,7 +1852,7 @@ xlog_sync(
 	 */
 	if (log->l_badcrc_factor &&
 	    (prandom_u32() % log->l_badcrc_factor == 0)) {
-		iclog->ic_header.h_crc &= 0xAAAAAAAA;
+		iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA);
 		iclog->ic_state |= XLOG_STATE_IOABORT;
 		xfs_warn(log->l_mp,
 	"Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.",
-- 
cgit v1.2.3


From 9d17e14cc052ecebb572d4546c7deeddc0c6e040 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 21 Apr 2017 11:24:41 -0700
Subject: xfs: fix __user annotations for xfs_ioc_getfsmap

By passing the whole fsmap_head structure and an index we can get the
user point annotations right for the embedded variable sized array
in struct fsmap_head.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
[darrick: change idx to unsigned int]
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_ioctl.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 0f8bed9a7e4c..eee8b0f22d75 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1614,7 +1614,8 @@ xfs_ioc_getbmapx(
 
 struct getfsmap_info {
 	struct xfs_mount	*mp;
-	struct fsmap __user	*data;
+	struct fsmap_head __user *data;
+	unsigned int		idx;
 	__u32			last_flags;
 };
 
@@ -1628,17 +1629,17 @@ xfs_getfsmap_format(struct xfs_fsmap *xfm, void *priv)
 
 	info->last_flags = xfm->fmr_flags;
 	xfs_fsmap_from_internal(&fm, xfm);
-	if (copy_to_user(info->data, &fm, sizeof(struct fsmap)))
+	if (copy_to_user(&info->data->fmh_recs[info->idx++], &fm,
+			sizeof(struct fsmap)))
 		return -EFAULT;
 
-	info->data++;
 	return 0;
 }
 
 STATIC int
 xfs_ioc_getfsmap(
 	struct xfs_inode	*ip,
-	void			__user *arg)
+	struct fsmap_head	__user *arg)
 {
 	struct getfsmap_info	info = { NULL };
 	struct xfs_fsmap_head	xhead = {0};
@@ -1664,7 +1665,7 @@ xfs_ioc_getfsmap(
 	trace_xfs_getfsmap_high_key(ip->i_mount, &xhead.fmh_keys[1]);
 
 	info.mp = ip->i_mount;
-	info.data = ((__force struct fsmap_head *)arg)->fmh_recs;
+	info.data = arg;
 	error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info);
 	if (error == XFS_BTREE_QUERY_RANGE_ABORT) {
 		error = 0;
@@ -1674,10 +1675,9 @@ xfs_ioc_getfsmap(
 
 	/* If we didn't abort, set the "last" flag in the last fmx */
 	if (!aborted && xhead.fmh_entries) {
-		info.data--;
 		info.last_flags |= FMR_OF_LAST;
-		if (copy_to_user(&info.data->fmr_flags, &info.last_flags,
-				sizeof(info.last_flags)))
+		if (copy_to_user(&info.data->fmh_recs[info.idx - 1].fmr_flags,
+				&info.last_flags, sizeof(info.last_flags)))
 			return -EFAULT;
 	}
 
-- 
cgit v1.2.3


From 12e4a381c5cefa4bf54547cb51d98241a54e29f3 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Sun, 23 Apr 2017 10:45:21 -0700
Subject: xfs: fix getfsmap userspace memory corruption while setting OF_LAST

At the end of a getfsmap call, we will set FMR_OF_LAST in the last
struct fsmap that was handed in by userspace if we've truly run out of
space mapping record (as opposed to simply running out of space in the
user array).  Unfortunately, fmh_entries is the wrong check for whether
or not we've filled out anything in the user array because the ioctl
provides that fmh_count==0 sets fmh_entries without filling out the user
array.  Therefore we end up writing things into user memory areas that we
weren't given, and kaboom.

Since Christoph amended the getfsmap structure to track the number of
fsmap entries we've actually filled out, use that as part of deciding if
we have to set the OF_LAST flag.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index eee8b0f22d75..6190697603c9 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1674,7 +1674,7 @@ xfs_ioc_getfsmap(
 		return error;
 
 	/* If we didn't abort, set the "last" flag in the last fmx */
-	if (!aborted && xhead.fmh_entries) {
+	if (!aborted && info.idx) {
 		info.last_flags |= FMR_OF_LAST;
 		if (copy_to_user(&info.data->fmh_recs[info.idx - 1].fmr_flags,
 				&info.last_flags, sizeof(info.last_flags)))
-- 
cgit v1.2.3


From 3f88a15ae0f15f8dec3b10a55d02d1c1defa73c6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 21 Apr 2017 11:24:42 -0700
Subject: xfs: don't use bool values in trace buffers

Using bool values produces sparse warnings of this form:

fs/xfs/./xfs_trace.h:2252:1: warning: odd constant _Bool cast (ffffffffffffffff becomes 1)
fs/xfs/./xfs_trace.h:2252:1: warning: odd constant _Bool cast (ffffffffffffffff becomes 1)
fs/xfs/./xfs_trace.h:2278:1: warning: odd constant _Bool cast (ffffffffffffffff becomes 1)
fs/xfs/./xfs_trace.h:2278:1: warning: odd constant _Bool cast (ffffffffffffffff becomes 1)
fs/xfs/./xfs_trace.h:2307:1: warning: odd constant _Bool cast (ffffffffffffffff becomes 1)

Just use a char instead to fix those up.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_trace.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 09da3c18850b..7c5a16528d8b 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -2255,8 +2255,8 @@ DECLARE_EVENT_CLASS(xfs_defer_class,
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(void *, dop)
-		__field(bool, committed)
-		__field(bool, low)
+		__field(char, committed)
+		__field(char, low)
 	),
 	TP_fast_assign(
 		__entry->dev = mp ? mp->m_super->s_dev : 0;
@@ -2281,8 +2281,8 @@ DECLARE_EVENT_CLASS(xfs_defer_error_class,
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(void *, dop)
-		__field(bool, committed)
-		__field(bool, low)
+		__field(char, committed)
+		__field(char, low)
 		__field(int, error)
 	),
 	TP_fast_assign(
@@ -2311,7 +2311,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class,
 		__field(dev_t, dev)
 		__field(int, type)
 		__field(void *, intent)
-		__field(bool, committed)
+		__field(char, committed)
 		__field(int, nr)
 	),
 	TP_fast_assign(
-- 
cgit v1.2.3


From 27af1bbf524459962d1477a38ac6e0b7f79aaecc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 21 Apr 2017 11:24:42 -0700
Subject: xfs: remove xfs_trans_ail_delete_bulk

xfs_iflush_done uses an on-stack variable length array to pass the log
items to be deleted to xfs_trans_ail_delete_bulk.  On-stack VLAs are a
nasty gcc extension that can lead to unbounded stack allocations, but
fortunately we can easily avoid them by simply open coding
xfs_trans_ail_delete_bulk in xfs_iflush_done, which is the only caller
of it except for the single-item xfs_trans_ail_delete.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_inode_item.c | 29 +++++++++++---------
 fs/xfs/xfs_trans_ail.c  | 71 ++++++++++++++++++++++++-------------------------
 fs/xfs/xfs_trans_priv.h | 15 +++--------
 3 files changed, 55 insertions(+), 60 deletions(-)

diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index d90e7811ccdd..08cb7d1a4a3a 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -731,22 +731,27 @@ xfs_iflush_done(
 	 * holding the lock before removing the inode from the AIL.
 	 */
 	if (need_ail) {
-		struct xfs_log_item *log_items[need_ail];
-		int i = 0;
+		bool			mlip_changed = false;
+
+		/* this is an opencoded batch version of xfs_trans_ail_delete */
 		spin_lock(&ailp->xa_lock);
 		for (blip = lip; blip; blip = blip->li_bio_list) {
-			iip = INODE_ITEM(blip);
-			if (iip->ili_logged &&
-			    blip->li_lsn == iip->ili_flush_lsn) {
-				log_items[i++] = blip;
-			}
-			ASSERT(i <= need_ail);
+			if (INODE_ITEM(blip)->ili_logged &&
+			    blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn)
+				mlip_changed |= xfs_ail_delete_one(ailp, blip);
 		}
-		/* xfs_trans_ail_delete_bulk() drops the AIL lock. */
-		xfs_trans_ail_delete_bulk(ailp, log_items, i,
-					  SHUTDOWN_CORRUPT_INCORE);
-	}
 
+		if (mlip_changed) {
+			if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount))
+				xlog_assign_tail_lsn_locked(ailp->xa_mount);
+			if (list_empty(&ailp->xa_ail))
+				wake_up_all(&ailp->xa_empty);
+		}
+		spin_unlock(&ailp->xa_lock);
+
+		if (mlip_changed)
+			xfs_log_space_wake(ailp->xa_mount);
+	}
 
 	/*
 	 * clean up and unlock the flush lock now we are done. We can clear the
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index d6c9c3e9e02b..9056c0f34a3c 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -684,8 +684,23 @@ xfs_trans_ail_update_bulk(
 	}
 }
 
-/*
- * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL
+bool
+xfs_ail_delete_one(
+	struct xfs_ail		*ailp,
+	struct xfs_log_item 	*lip)
+{
+	struct xfs_log_item	*mlip = xfs_ail_min(ailp);
+
+	trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
+	xfs_ail_delete(ailp, lip);
+	lip->li_flags &= ~XFS_LI_IN_AIL;
+	lip->li_lsn = 0;
+
+	return mlip == lip;
+}
+
+/**
+ * Remove a log items from the AIL
  *
  * @xfs_trans_ail_delete_bulk takes an array of log items that all need to
  * removed from the AIL. The caller is already holding the AIL lock, and done
@@ -706,52 +721,36 @@ xfs_trans_ail_update_bulk(
  * before returning.
  */
 void
-xfs_trans_ail_delete_bulk(
+xfs_trans_ail_delete(
 	struct xfs_ail		*ailp,
-	struct xfs_log_item	**log_items,
-	int			nr_items,
+	struct xfs_log_item	*lip,
 	int			shutdown_type) __releases(ailp->xa_lock)
 {
-	xfs_log_item_t		*mlip;
-	int			mlip_changed = 0;
-	int			i;
+	struct xfs_mount	*mp = ailp->xa_mount;
+	bool			mlip_changed;
 
-	mlip = xfs_ail_min(ailp);
-
-	for (i = 0; i < nr_items; i++) {
-		struct xfs_log_item *lip = log_items[i];
-		if (!(lip->li_flags & XFS_LI_IN_AIL)) {
-			struct xfs_mount	*mp = ailp->xa_mount;
-
-			spin_unlock(&ailp->xa_lock);
-			if (!XFS_FORCED_SHUTDOWN(mp)) {
-				xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
-		"%s: attempting to delete a log item that is not in the AIL",
-						__func__);
-				xfs_force_shutdown(mp, shutdown_type);
-			}
-			return;
+	if (!(lip->li_flags & XFS_LI_IN_AIL)) {
+		spin_unlock(&ailp->xa_lock);
+		if (!XFS_FORCED_SHUTDOWN(mp)) {
+			xfs_alert_tag(mp, XFS_PTAG_AILDELETE,
+	"%s: attempting to delete a log item that is not in the AIL",
+					__func__);
+			xfs_force_shutdown(mp, shutdown_type);
 		}
-
-		trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
-		xfs_ail_delete(ailp, lip);
-		lip->li_flags &= ~XFS_LI_IN_AIL;
-		lip->li_lsn = 0;
-		if (mlip == lip)
-			mlip_changed = 1;
+		return;
 	}
 
+	mlip_changed = xfs_ail_delete_one(ailp, lip);
 	if (mlip_changed) {
-		if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount))
-			xlog_assign_tail_lsn_locked(ailp->xa_mount);
+		if (!XFS_FORCED_SHUTDOWN(mp))
+			xlog_assign_tail_lsn_locked(mp);
 		if (list_empty(&ailp->xa_ail))
 			wake_up_all(&ailp->xa_empty);
-		spin_unlock(&ailp->xa_lock);
+	}
 
+	spin_unlock(&ailp->xa_lock);
+	if (mlip_changed)
 		xfs_log_space_wake(ailp->xa_mount);
-	} else {
-		spin_unlock(&ailp->xa_lock);
-	}
 }
 
 int
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 49931b72da8a..d91706c56c63 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -106,18 +106,9 @@ xfs_trans_ail_update(
 	xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn);
 }
 
-void	xfs_trans_ail_delete_bulk(struct xfs_ail *ailp,
-				struct xfs_log_item **log_items, int nr_items,
-				int shutdown_type)
-				__releases(ailp->xa_lock);
-static inline void
-xfs_trans_ail_delete(
-	struct xfs_ail	*ailp,
-	xfs_log_item_t	*lip,
-	int		shutdown_type) __releases(ailp->xa_lock)
-{
-	xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type);
-}
+bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip);
+void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip,
+		int shutdown_type) __releases(ailp->xa_lock);
 
 static inline void
 xfs_trans_ail_remove(
-- 
cgit v1.2.3


From 20e8a063786050083fe05b4f45be338c60b49126 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Fri, 21 Apr 2017 12:40:44 -0700
Subject: xfs: fix up quotacheck buffer list error handling

The quotacheck error handling of the delwri buffer list assumes the
resident buffers are locked and doesn't clear the _XBF_DELWRI_Q flag
on the buffers that are dequeued. This can lead to assert failures
on buffer release and possibly other locking problems.

Move this code to a delwri queue cancel helper function to
encapsulate the logic required to properly release buffers from a
delwri queue. Update the helper to clear the delwri queue flag and
call it from quotacheck.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_buf.c | 24 ++++++++++++++++++++++++
 fs/xfs/xfs_buf.h |  1 +
 fs/xfs/xfs_qm.c  |  7 +------
 3 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index b6208728ba39..ba036c11be94 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1079,6 +1079,8 @@ void
 xfs_buf_unlock(
 	struct xfs_buf		*bp)
 {
+	ASSERT(xfs_buf_islocked(bp));
+
 	XB_CLEAR_OWNER(bp);
 	up(&bp->b_sema);
 
@@ -1814,6 +1816,28 @@ error:
 	return NULL;
 }
 
+/*
+ * Cancel a delayed write list.
+ *
+ * Remove each buffer from the list, clear the delwri queue flag and drop the
+ * associated buffer reference.
+ */
+void
+xfs_buf_delwri_cancel(
+	struct list_head	*list)
+{
+	struct xfs_buf		*bp;
+
+	while (!list_empty(list)) {
+		bp = list_first_entry(list, struct xfs_buf, b_list);
+
+		xfs_buf_lock(bp);
+		bp->b_flags &= ~_XBF_DELWRI_Q;
+		list_del_init(&bp->b_list);
+		xfs_buf_relse(bp);
+	}
+}
+
 /*
  * Add a buffer to the delayed write list.
  *
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index e1bc1af743fa..8d1d44f87ce9 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -329,6 +329,7 @@ extern void *xfs_buf_offset(struct xfs_buf *, size_t);
 extern void xfs_buf_stale(struct xfs_buf *bp);
 
 /* Delayed Write Buffer Routines */
+extern void xfs_buf_delwri_cancel(struct list_head *);
 extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
 extern int xfs_buf_delwri_submit(struct list_head *);
 extern int xfs_buf_delwri_submit_nowait(struct list_head *);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 4ac5b187534e..5fe6e70b88ef 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1384,12 +1384,7 @@ xfs_qm_quotacheck(
 	mp->m_qflags |= flags;
 
  error_return:
-	while (!list_empty(&buffer_list)) {
-		struct xfs_buf *bp =
-			list_first_entry(&buffer_list, struct xfs_buf, b_list);
-		list_del_init(&bp->b_list);
-		xfs_buf_relse(bp);
-	}
+	xfs_buf_delwri_cancel(&buffer_list);
 
 	if (error) {
 		xfs_warn(mp,
-- 
cgit v1.2.3


From c4cf1acdb186436b88cc9aeaf08322bc49fec183 Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Thu, 20 Apr 2017 15:09:05 -0700
Subject: xfs: better log intent item refcount checking

Use ASSERTs on the log intent item refcounts so that we fail noisily if
anyone tries to double-free the item.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_bmap_item.c     | 1 +
 fs/xfs/xfs_extfree_item.c  | 1 +
 fs/xfs/xfs_refcount_item.c | 1 +
 fs/xfs/xfs_rmap_item.c     | 1 +
 4 files changed, 4 insertions(+)

diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 9bf57c76623b..2d7017373e20 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -215,6 +215,7 @@ void
 xfs_bui_release(
 	struct xfs_bui_log_item	*buip)
 {
+	ASSERT(atomic_read(&buip->bui_refcount) > 0);
 	if (atomic_dec_and_test(&buip->bui_refcount)) {
 		xfs_trans_ail_remove(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR);
 		xfs_bui_item_free(buip);
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index d7bc14906af8..44f8c5451210 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -290,6 +290,7 @@ void
 xfs_efi_release(
 	struct xfs_efi_log_item	*efip)
 {
+	ASSERT(atomic_read(&efip->efi_refcount) > 0);
 	if (atomic_dec_and_test(&efip->efi_refcount)) {
 		xfs_trans_ail_remove(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR);
 		xfs_efi_item_free(efip);
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 6e4c7446c3d4..96fe209b5eb6 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -221,6 +221,7 @@ void
 xfs_cui_release(
 	struct xfs_cui_log_item	*cuip)
 {
+	ASSERT(atomic_read(&cuip->cui_refcount) > 0);
 	if (atomic_dec_and_test(&cuip->cui_refcount)) {
 		xfs_trans_ail_remove(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR);
 		xfs_cui_item_free(cuip);
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 73c827831551..f3b139c9aa16 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -243,6 +243,7 @@ void
 xfs_rui_release(
 	struct xfs_rui_log_item	*ruip)
 {
+	ASSERT(atomic_read(&ruip->rui_refcount) > 0);
 	if (atomic_dec_and_test(&ruip->rui_refcount)) {
 		xfs_trans_ail_remove(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR);
 		xfs_rui_item_free(ruip);
-- 
cgit v1.2.3


From 3c3781951c9a155a56e5eed567349118374cc315 Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Thu, 27 Apr 2017 08:59:36 -0700
Subject: xfs: Allow user to kill fstrim process

fstrim can take really long time on big, slow device or on file system
with a lots of allocation groups. Currently there is no way for the user
to cancell the operation. This patch makes it possible for the user to
kill fstrim pocess by adding the check for fatal_signal_pending() in
xfs_trim_extents().

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Reported-by: Zdenek Kabelac <zkabelac@redhat.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_discard.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c
index d796ffac7296..6a05d278da64 100644
--- a/fs/xfs/xfs_discard.c
+++ b/fs/xfs/xfs_discard.c
@@ -132,6 +132,11 @@ next_extent:
 		error = xfs_btree_decrement(cur, 0, &i);
 		if (error)
 			goto out_del_cursor;
+
+		if (fatal_signal_pending(current)) {
+			error = -ERESTARTSYS;
+			goto out_del_cursor;
+		}
 	}
 
 out_del_cursor:
@@ -196,8 +201,11 @@ xfs_ioc_trim(
 	for (agno = start_agno; agno <= end_agno; agno++) {
 		error = xfs_trim_extents(mp, agno, start, end, minlen,
 					  &blocks_trimmed);
-		if (error)
+		if (error) {
 			last_error = error;
+			if (error == -ERESTARTSYS)
+				break;
+		}
 	}
 
 	if (last_error)
-- 
cgit v1.2.3


From 8f720d9f892e0e223dab8400f03130bc208c72e7 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@gmail.com>
Date: Fri, 28 Apr 2017 08:10:53 -0700
Subject: xfs: publish UUID in struct super_block

Copy the uuid of the filesystem to struct super_block s_uuid field,
as several other filesystems already do.  Copy regardless of the nouuid
mount option, because other filesystems also do not guaranty uniqueness
of the s_uuid field in super_block struct.

Signed-off-by: Amir Goldstein <amir73il@gmail.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_mount.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 688ebff1f663..2eaf81859166 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -73,6 +73,10 @@ xfs_uuid_mount(
 	uuid_t			*uuid = &mp->m_sb.sb_uuid;
 	int			hole, i;
 
+	/* Publish UUID in struct super_block */
+	BUILD_BUG_ON(sizeof(mp->m_super->s_uuid) != sizeof(uuid_t));
+	memcpy(&mp->m_super->s_uuid, uuid, sizeof(uuid_t));
+
 	if (mp->m_flags & XFS_MOUNT_NOUUID)
 		return 0;
 
-- 
cgit v1.2.3


From 756baca27fff3ecaeab9dbc7a5ee35a1d7bc0c7f Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Wed, 26 Apr 2017 08:30:39 -0700
Subject: xfs: support ability to wait on new inodes

Inodes that are inserted into the perag tree but still under
construction are flagged with the XFS_INEW bit. Most contexts either
skip such inodes when they are encountered or have the ability to
handle them.

The runtime quotaoff sequence introduces a context that must wait
for construction of such inodes to correctly ensure that all dquots
in the fs are released. In anticipation of this, support the ability
to wait on new inodes. Wake the appropriate bit when XFS_INEW is
cleared.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_icache.c | 5 ++++-
 fs/xfs/xfs_inode.h  | 4 +++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 3531f8f72fa5..25f4f4595821 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -366,14 +366,17 @@ xfs_iget_cache_hit(
 
 		error = xfs_reinit_inode(mp, inode);
 		if (error) {
+			bool wake;
 			/*
 			 * Re-initializing the inode failed, and we are in deep
 			 * trouble.  Try to re-add it to the reclaim list.
 			 */
 			rcu_read_lock();
 			spin_lock(&ip->i_flags_lock);
-
+			wake = !!__xfs_iflags_test(ip, XFS_INEW);
 			ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
+			if (wake)
+				wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
 			ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
 			trace_xfs_iget_reclaim_fail(ip);
 			goto out_error;
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 10dcf27b4c85..10e89fcb49d7 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -216,7 +216,8 @@ static inline bool xfs_is_reflink_inode(struct xfs_inode *ip)
 #define XFS_IRECLAIM		(1 << 0) /* started reclaiming this inode */
 #define XFS_ISTALE		(1 << 1) /* inode has been staled */
 #define XFS_IRECLAIMABLE	(1 << 2) /* inode can be reclaimed */
-#define XFS_INEW		(1 << 3) /* inode has just been allocated */
+#define __XFS_INEW_BIT		3	 /* inode has just been allocated */
+#define XFS_INEW		(1 << __XFS_INEW_BIT)
 #define XFS_ITRUNCATED		(1 << 5) /* truncated down so flush-on-close */
 #define XFS_IDIRTY_RELEASE	(1 << 6) /* dirty release already seen */
 #define __XFS_IFLOCK_BIT	7	 /* inode is being flushed right now */
@@ -464,6 +465,7 @@ static inline void xfs_finish_inode_setup(struct xfs_inode *ip)
 	xfs_iflags_clear(ip, XFS_INEW);
 	barrier();
 	unlock_new_inode(VFS_I(ip));
+	wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
 }
 
 static inline void xfs_setup_existing_inode(struct xfs_inode *ip)
-- 
cgit v1.2.3


From ae2c4ac2dd39b23a87ddb14ceddc3f2872c6aef5 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Wed, 26 Apr 2017 08:30:39 -0700
Subject: xfs: update ag iterator to support wait on new inodes

The AG inode iterator currently skips new inodes as such inodes are
inserted into the inode radix tree before they are fully
constructed. Certain contexts require the ability to wait on the
construction of new inodes, however. The fs-wide dquot release from
the quotaoff sequence is an example of this.

Update the AG inode iterator to support the ability to wait on
inodes flagged with XFS_INEW upon request. Create a new
xfs_inode_ag_iterator_flags() interface and support a set of
iteration flags to modify the iteration behavior. When the
XFS_AGITER_INEW_WAIT flag is set, include XFS_INEW flags in the
radix tree inode lookup and wait on them before the callback is
executed.

Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_icache.c | 53 +++++++++++++++++++++++++++++++++++++++++++++--------
 fs/xfs/xfs_icache.h |  8 ++++++++
 2 files changed, 53 insertions(+), 8 deletions(-)

diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 25f4f4595821..f61c84f8e31a 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -262,6 +262,22 @@ xfs_inode_clear_reclaim_tag(
 	xfs_perag_clear_reclaim_tag(pag);
 }
 
+static void
+xfs_inew_wait(
+	struct xfs_inode	*ip)
+{
+	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT);
+	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT);
+
+	do {
+		prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+		if (!xfs_iflags_test(ip, XFS_INEW))
+			break;
+		schedule();
+	} while (true);
+	finish_wait(wq, &wait.wait);
+}
+
 /*
  * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
  * part of the structure. This is made more complex by the fact we store
@@ -626,9 +642,11 @@ out_error_or_again:
 
 STATIC int
 xfs_inode_ag_walk_grab(
-	struct xfs_inode	*ip)
+	struct xfs_inode	*ip,
+	int			flags)
 {
 	struct inode		*inode = VFS_I(ip);
+	bool			newinos = !!(flags & XFS_AGITER_INEW_WAIT);
 
 	ASSERT(rcu_read_lock_held());
 
@@ -646,7 +664,8 @@ xfs_inode_ag_walk_grab(
 		goto out_unlock_noent;
 
 	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
-	if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+	if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) ||
+	    __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM))
 		goto out_unlock_noent;
 	spin_unlock(&ip->i_flags_lock);
 
@@ -674,7 +693,8 @@ xfs_inode_ag_walk(
 					   void *args),
 	int			flags,
 	void			*args,
-	int			tag)
+	int			tag,
+	int			iter_flags)
 {
 	uint32_t		first_index;
 	int			last_error = 0;
@@ -716,7 +736,7 @@ restart:
 		for (i = 0; i < nr_found; i++) {
 			struct xfs_inode *ip = batch[i];
 
-			if (done || xfs_inode_ag_walk_grab(ip))
+			if (done || xfs_inode_ag_walk_grab(ip, iter_flags))
 				batch[i] = NULL;
 
 			/*
@@ -744,6 +764,9 @@ restart:
 		for (i = 0; i < nr_found; i++) {
 			if (!batch[i])
 				continue;
+			if ((iter_flags & XFS_AGITER_INEW_WAIT) &&
+			    xfs_iflags_test(batch[i], XFS_INEW))
+				xfs_inew_wait(batch[i]);
 			error = execute(batch[i], flags, args);
 			IRELE(batch[i]);
 			if (error == -EAGAIN) {
@@ -823,12 +846,13 @@ xfs_cowblocks_worker(
 }
 
 int
-xfs_inode_ag_iterator(
+xfs_inode_ag_iterator_flags(
 	struct xfs_mount	*mp,
 	int			(*execute)(struct xfs_inode *ip, int flags,
 					   void *args),
 	int			flags,
-	void			*args)
+	void			*args,
+	int			iter_flags)
 {
 	struct xfs_perag	*pag;
 	int			error = 0;
@@ -838,7 +862,8 @@ xfs_inode_ag_iterator(
 	ag = 0;
 	while ((pag = xfs_perag_get(mp, ag))) {
 		ag = pag->pag_agno + 1;
-		error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1);
+		error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1,
+					  iter_flags);
 		xfs_perag_put(pag);
 		if (error) {
 			last_error = error;
@@ -849,6 +874,17 @@ xfs_inode_ag_iterator(
 	return last_error;
 }
 
+int
+xfs_inode_ag_iterator(
+	struct xfs_mount	*mp,
+	int			(*execute)(struct xfs_inode *ip, int flags,
+					   void *args),
+	int			flags,
+	void			*args)
+{
+	return xfs_inode_ag_iterator_flags(mp, execute, flags, args, 0);
+}
+
 int
 xfs_inode_ag_iterator_tag(
 	struct xfs_mount	*mp,
@@ -866,7 +902,8 @@ xfs_inode_ag_iterator_tag(
 	ag = 0;
 	while ((pag = xfs_perag_get_tag(mp, ag, tag))) {
 		ag = pag->pag_agno + 1;
-		error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag);
+		error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag,
+					  0);
 		xfs_perag_put(pag);
 		if (error) {
 			last_error = error;
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 8a7c849b4dea..9183f77958ef 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -48,6 +48,11 @@ struct xfs_eofblocks {
 #define XFS_IGET_UNTRUSTED	0x2
 #define XFS_IGET_DONTCACHE	0x4
 
+/*
+ * flags for AG inode iterator
+ */
+#define XFS_AGITER_INEW_WAIT	0x1	/* wait on new inodes */
+
 int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
 	     uint flags, uint lock_flags, xfs_inode_t **ipp);
 
@@ -79,6 +84,9 @@ void xfs_cowblocks_worker(struct work_struct *);
 int xfs_inode_ag_iterator(struct xfs_mount *mp,
 	int (*execute)(struct xfs_inode *ip, int flags, void *args),
 	int flags, void *args);
+int xfs_inode_ag_iterator_flags(struct xfs_mount *mp,
+	int (*execute)(struct xfs_inode *ip, int flags, void *args),
+	int flags, void *args, int iter_flags);
 int xfs_inode_ag_iterator_tag(struct xfs_mount *mp,
 	int (*execute)(struct xfs_inode *ip, int flags, void *args),
 	int flags, void *args, int tag);
-- 
cgit v1.2.3


From e20c8a517f259cb4d258e10b0cd5d4b30d4167a0 Mon Sep 17 00:00:00 2001
From: Brian Foster <bfoster@redhat.com>
Date: Wed, 26 Apr 2017 08:30:40 -0700
Subject: xfs: wait on new inodes during quotaoff dquot release

The quotaoff operation has a race with inode allocation that results
in a livelock. An inode allocation that occurs before the quota
status flags are updated acquires the appropriate dquots for the
inode via xfs_qm_vop_dqalloc(). It then inserts the XFS_INEW inode
into the perag radix tree, sometime later attaches the dquots to the
inode and finally clears the XFS_INEW flag. Quotaoff expects to
release the dquots from all inodes in the filesystem via
xfs_qm_dqrele_all_inodes(). This invokes the AG inode iterator,
which skips inodes in the XFS_INEW state because they are not fully
constructed. If the scan occurs after dquots have been attached to
an inode, but before XFS_INEW is cleared, the newly allocated inode
will continue to hold a reference to the applicable dquots. When
quotaoff invokes xfs_qm_dqpurge_all(), the reference count of those
dquot(s) remain elevated and the dqpurge scan spins indefinitely.

To address this problem, update the xfs_qm_dqrele_all_inodes() scan
to wait on inodes marked on the XFS_INEW state. We wait on the
inodes explicitly rather than skip and retry to avoid continuous
retry loops due to a parallel inode allocation workload. Since
quotaoff updates the quota state flags and uses a synchronous
transaction before the dqrele scan, and dquots are attached to
inodes after radix tree insertion iff quota is enabled, one INEW
waiting pass through the AG guarantees that the scan has processed
all inodes that could possibly hold dquot references.

Reported-by: Eryu Guan <eguan@redhat.com>
Signed-off-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_qm_syscalls.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 475a3882a81f..9cb5c381b01c 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -759,5 +759,6 @@ xfs_qm_dqrele_all_inodes(
 	uint		 flags)
 {
 	ASSERT(mp->m_quotainfo);
-	xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL);
+	xfs_inode_ag_iterator_flags(mp, xfs_dqrele_inode, flags, NULL,
+				    XFS_AGITER_INEW_WAIT);
 }
-- 
cgit v1.2.3


From fe0be23e68200573de027de9b8cc2b27e7fce35e Mon Sep 17 00:00:00 2001
From: "Darrick J. Wong" <darrick.wong@oracle.com>
Date: Wed, 12 Apr 2017 12:26:07 -0700
Subject: xfs: reserve enough blocks to handle btree splits when remapping

In xfs_reflink_end_cow, we erroneously reserve only enough blocks to
handle adding 1 extent.  This is problematic if we fragment free space,
have to do CoW, and then have to perform multiple bmap btree expansions.
Furthermore, the BUI recovery routine doesn't reserve /any/ blocks to
handle btree splits, so log recovery fails after our first error causes
the filesystem to go down.

Therefore, refactor the transaction block reservation macros until we
have a macro that works for our deferred (re)mapping activities, and fix
both problems by using that macro.

With 1k blocks we can hit this fairly often in g/187 if the scratch fs
is big enough.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_trans_space.h | 23 +++++++++++++++++------
 fs/xfs/xfs_bmap_item.c          |  5 ++++-
 fs/xfs/xfs_reflink.c            | 18 ++++++++++++++++--
 3 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index 7917f6e44286..d787c677d2a3 100644
--- a/fs/xfs/libxfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -21,8 +21,20 @@
 /*
  * Components of space reservations.
  */
+
+/* Worst case number of rmaps that can be held in a block. */
 #define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)    \
 		(((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0]))
+
+/* Adding one rmap could split every level up to the top of the tree. */
+#define XFS_RMAPADD_SPACE_RES(mp) ((mp)->m_rmap_maxlevels)
+
+/* Blocks we might need to add "b" rmaps to a tree. */
+#define XFS_NRMAPADD_SPACE_RES(mp, b)\
+	(((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \
+	  XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \
+	  XFS_RMAPADD_SPACE_RES(mp))
+
 #define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)    \
 		(((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0]))
 #define	XFS_EXTENTADD_SPACE_RES(mp,w)	(XFS_BM_MAXLEVELS(mp,w) - 1)
@@ -30,13 +42,12 @@
 	(((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
 	  XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
 	  XFS_EXTENTADD_SPACE_RES(mp,w))
+
+/* Blocks we might need to add "b" mappings & rmappings to a file. */
 #define XFS_SWAP_RMAP_SPACE_RES(mp,b,w)\
-	(((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
-	  XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
-	  XFS_EXTENTADD_SPACE_RES(mp,w) + \
-	 ((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \
-	  XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \
-	  (mp)->m_rmap_maxlevels)
+	(XFS_NEXTENTADD_SPACE_RES((mp), (b), (w)) + \
+	 XFS_NRMAPADD_SPACE_RES((mp), (b)))
+
 #define	XFS_DAENTER_1B(mp,w)	\
 	((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1)
 #define	XFS_DAENTER_DBS(mp,w)	\
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 2d7017373e20..d419d23fa214 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -34,6 +34,8 @@
 #include "xfs_bmap.h"
 #include "xfs_icache.h"
 #include "xfs_trace.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
 
 
 kmem_zone_t	*xfs_bui_zone;
@@ -447,7 +449,8 @@ xfs_bui_recover(
 		return -EIO;
 	}
 
-	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
+			XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp);
 	if (error)
 		return error;
 	budp = xfs_trans_get_bud(tp, buip);
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index c0f3754caca2..ffe6fe7a7eb5 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -705,8 +705,22 @@ xfs_reflink_end_cow(
 	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
 	end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
 
-	/* Start a rolling transaction to switch the mappings */
-	resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
+	/*
+	 * Start a rolling transaction to switch the mappings.  We're
+	 * unlikely ever to have to remap 16T worth of single-block
+	 * extents, so just cap the worst case extent count to 2^32-1.
+	 * Stick a warning in just in case, and avoid 64-bit division.
+	 */
+	BUILD_BUG_ON(MAX_RW_COUNT > UINT_MAX);
+	if (end_fsb - offset_fsb > UINT_MAX) {
+		error = -EFSCORRUPTED;
+		xfs_force_shutdown(ip->i_mount, SHUTDOWN_CORRUPT_INCORE);
+		ASSERT(0);
+		goto out;
+	}
+	resblks = XFS_NEXTENTADD_SPACE_RES(ip->i_mount,
+			(unsigned int)(end_fsb - offset_fsb),
+			XFS_DATA_FORK);
 	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
 			resblks, 0, 0, &tp);
 	if (error)
-- 
cgit v1.2.3


From 161f55efba5ddccc690139fae9373cafc3447a97 Mon Sep 17 00:00:00 2001
From: Eryu Guan <eguan@redhat.com>
Date: Tue, 2 May 2017 13:54:47 -0700
Subject: xfs: fix use-after-free in xfs_finish_page_writeback

Commit 28b783e47ad7 ("xfs: bufferhead chains are invalid after
end_page_writeback") fixed one use-after-free issue by
pre-calculating the loop conditionals before calling bh->b_end_io()
in the end_io processing loop, but it assigned 'next' pointer before
checking end offset boundary & breaking the loop, at which point the
bh might be freed already, and caused use-after-free.

This is caught by KASAN when running fstests generic/127 on sub-page
block size XFS.

[ 2517.244502] run fstests generic/127 at 2017-04-27 07:30:50
[ 2747.868840] ==================================================================
[ 2747.876949] BUG: KASAN: use-after-free in xfs_destroy_ioend+0x3d3/0x4e0 [xfs] at addr ffff8801395ae698
...
[ 2747.918245] Call Trace:
[ 2747.920975]  dump_stack+0x63/0x84
[ 2747.924673]  kasan_object_err+0x21/0x70
[ 2747.928950]  kasan_report+0x271/0x530
[ 2747.933064]  ? xfs_destroy_ioend+0x3d3/0x4e0 [xfs]
[ 2747.938409]  ? end_page_writeback+0xce/0x110
[ 2747.943171]  __asan_report_load8_noabort+0x19/0x20
[ 2747.948545]  xfs_destroy_ioend+0x3d3/0x4e0 [xfs]
[ 2747.953724]  xfs_end_io+0x1af/0x2b0 [xfs]
[ 2747.958197]  process_one_work+0x5ff/0x1000
[ 2747.962766]  worker_thread+0xe4/0x10e0
[ 2747.966946]  kthread+0x2d3/0x3d0
[ 2747.970546]  ? process_one_work+0x1000/0x1000
[ 2747.975405]  ? kthread_create_on_node+0xc0/0xc0
[ 2747.980457]  ? syscall_return_slowpath+0xe6/0x140
[ 2747.985706]  ? do_page_fault+0x30/0x80
[ 2747.989887]  ret_from_fork+0x2c/0x40
[ 2747.993874] Object at ffff8801395ae690, in cache buffer_head size: 104
[ 2748.001155] Allocated:
[ 2748.003782] PID = 8327
[ 2748.006411]  save_stack_trace+0x1b/0x20
[ 2748.010688]  save_stack+0x46/0xd0
[ 2748.014383]  kasan_kmalloc+0xad/0xe0
[ 2748.018370]  kasan_slab_alloc+0x12/0x20
[ 2748.022648]  kmem_cache_alloc+0xb8/0x1b0
[ 2748.027024]  alloc_buffer_head+0x22/0xc0
[ 2748.031399]  alloc_page_buffers+0xd1/0x250
[ 2748.035968]  create_empty_buffers+0x30/0x410
[ 2748.040730]  create_page_buffers+0x120/0x1b0
[ 2748.045493]  __block_write_begin_int+0x17a/0x1800
[ 2748.050740]  iomap_write_begin+0x100/0x2f0
[ 2748.055308]  iomap_zero_range_actor+0x253/0x5c0
[ 2748.060362]  iomap_apply+0x157/0x270
[ 2748.064347]  iomap_zero_range+0x5a/0x80
[ 2748.068624]  iomap_truncate_page+0x6b/0xa0
[ 2748.073227]  xfs_setattr_size+0x1f7/0xa10 [xfs]
[ 2748.078312]  xfs_vn_setattr_size+0x68/0x140 [xfs]
[ 2748.083589]  xfs_file_fallocate+0x4ac/0x820 [xfs]
[ 2748.088838]  vfs_fallocate+0x2cf/0x780
[ 2748.093021]  SyS_fallocate+0x48/0x80
[ 2748.097006]  do_syscall_64+0x18a/0x430
[ 2748.101186]  return_from_SYSCALL_64+0x0/0x6a
[ 2748.105948] Freed:
[ 2748.108189] PID = 8327
[ 2748.110816]  save_stack_trace+0x1b/0x20
[ 2748.115093]  save_stack+0x46/0xd0
[ 2748.118788]  kasan_slab_free+0x73/0xc0
[ 2748.122969]  kmem_cache_free+0x7a/0x200
[ 2748.127247]  free_buffer_head+0x41/0x80
[ 2748.131524]  try_to_free_buffers+0x178/0x250
[ 2748.136316]  xfs_vm_releasepage+0x2e9/0x3d0 [xfs]
[ 2748.141563]  try_to_release_page+0x100/0x180
[ 2748.146325]  invalidate_inode_pages2_range+0x7da/0xcf0
[ 2748.152087]  xfs_shift_file_space+0x37d/0x6e0 [xfs]
[ 2748.157557]  xfs_collapse_file_space+0x49/0x120 [xfs]
[ 2748.163223]  xfs_file_fallocate+0x2a7/0x820 [xfs]
[ 2748.168462]  vfs_fallocate+0x2cf/0x780
[ 2748.172642]  SyS_fallocate+0x48/0x80
[ 2748.176629]  do_syscall_64+0x18a/0x430
[ 2748.180810]  return_from_SYSCALL_64+0x0/0x6a

Fixed it by checking on offset against end & breaking out first,
dereference bh only if there're still bufferheads to process.

Signed-off-by: Eryu Guan <eguan@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_aops.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index eef453adbc06..0debaeb70f2a 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -111,11 +111,11 @@ xfs_finish_page_writeback(
 
 	bsize = bh->b_size;
 	do {
+		if (off > end)
+			break;
 		next = bh->b_this_page;
 		if (off < bvec->bv_offset)
 			goto next_bh;
-		if (off > end)
-			break;
 		bh->b_end_io(bh, !error);
 next_bh:
 		off += bsize;
-- 
cgit v1.2.3