summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorYingping Lu <yingping@sgi.com>2006-06-09 14:55:18 +1000
committerNathan Scott <nathans@sgi.com>2006-06-09 14:55:18 +1000
commitd210a28cd851082cec9b282443f8cc0e6fc09830 (patch)
tree77b8c843d4cb7e6095b607570c5fd16702e50592
parentd3446eac3f50dade2f09ed212b112609ee78fb33 (diff)
downloadlinux-d210a28cd851082cec9b282443f8cc0e6fc09830.tar.gz
linux-d210a28cd851082cec9b282443f8cc0e6fc09830.tar.bz2
linux-d210a28cd851082cec9b282443f8cc0e6fc09830.zip
[XFS] In actual allocation of file system blocks and freeing extents, the
transaction within each such operation may involve multiple locking of AGF buffer. While the freeing extent function has sorted the extents based on AGF number before entering into transaction, however, when the file system space is very limited, the allocation of space would try every AGF to get space allocated, this could potentially cause out-of-order locking, thus deadlock could happen. This fix mitigates the scarce space for allocation by setting aside a few blocks without reservation, and avoid deadlock by maintaining ascending order of AGF locking. SGI-PV: 947395 SGI-Modid: xfs-linux-melb:xfs-kern:210801a Signed-off-by: Yingping Lu <yingping@sgi.com> Signed-off-by: Nathan Scott <nathans@sgi.com>
-rw-r--r--fs/xfs/xfs_alloc.c29
-rw-r--r--fs/xfs/xfs_alloc.h2
-rw-r--r--fs/xfs/xfs_bmap.c5
-rw-r--r--fs/xfs/xfs_bmap_btree.c10
-rw-r--r--fs/xfs/xfs_mount.c24
5 files changed, 55 insertions, 15 deletions
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 8558226281c4..22af489d3f34 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -1862,7 +1862,7 @@ xfs_alloc_fix_freelist(
(pag->pagf_longest - delta) :
(pag->pagf_flcount > 0 || pag->pagf_longest > 0);
if (args->minlen + args->alignment + args->minalignslop - 1 > longest ||
- (args->minleft &&
+ (!(flags & XFS_ALLOC_FLAG_FREEING) &&
(int)(pag->pagf_freeblks + pag->pagf_flcount -
need - args->total) <
(int)args->minleft)) {
@@ -1898,7 +1898,7 @@ xfs_alloc_fix_freelist(
longest = (longest > delta) ? (longest - delta) :
(be32_to_cpu(agf->agf_flcount) > 0 || longest > 0);
if (args->minlen + args->alignment + args->minalignslop - 1 > longest ||
- (args->minleft &&
+ (!(flags & XFS_ALLOC_FLAG_FREEING) &&
(int)(be32_to_cpu(agf->agf_freeblks) +
be32_to_cpu(agf->agf_flcount) - need - args->total) <
(int)args->minleft)) {
@@ -1951,8 +1951,14 @@ xfs_alloc_fix_freelist(
* the restrictions correctly. Can happen for free calls
* on a completely full ag.
*/
- if (targs.agbno == NULLAGBLOCK)
+ if (targs.agbno == NULLAGBLOCK) {
+ if (!(flags & XFS_ALLOC_FLAG_FREEING)) {
+ xfs_trans_brelse(tp, agflbp);
+ args->agbp = NULL;
+ return 0;
+ }
break;
+ }
/*
* Put each allocated block on the list.
*/
@@ -2360,8 +2366,19 @@ xfs_alloc_vextent(
if (args->agno == sagno &&
type == XFS_ALLOCTYPE_START_BNO)
args->type = XFS_ALLOCTYPE_THIS_AG;
- if (++(args->agno) == mp->m_sb.sb_agcount)
- args->agno = 0;
+ /*
+ * For the first allocation, we can try any AG to get
+ * space. However, if we already have allocated a
+ * block, we don't want to try AGs whose number is below
+ * sagno. Otherwise, we may end up with out-of-order
+ * locking of AGF, which might cause deadlock.
+ */
+ if (++(args->agno) == mp->m_sb.sb_agcount) {
+ if (args->firstblock != NULLFSBLOCK)
+ args->agno = sagno;
+ else
+ args->agno = 0;
+ }
/*
* Reached the starting a.g., must either be done
* or switch to non-trylock mode.
@@ -2443,7 +2460,7 @@ xfs_free_extent(
args.minlen = args.minleft = args.minalignslop = 0;
down_read(&args.mp->m_peraglock);
args.pag = &args.mp->m_perag[args.agno];
- if ((error = xfs_alloc_fix_freelist(&args, 0)))
+ if ((error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING)))
goto error0;
#ifdef DEBUG
ASSERT(args.agbp != NULL);
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 2d1f8928b267..650591f999ae 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -41,6 +41,7 @@ typedef enum xfs_alloctype
* Flags for xfs_alloc_fix_freelist.
*/
#define XFS_ALLOC_FLAG_TRYLOCK 0x00000001 /* use trylock for buffer locking */
+#define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents*/
/*
* Argument structure for xfs_alloc routines.
@@ -70,6 +71,7 @@ typedef struct xfs_alloc_arg {
char wasfromfl; /* set if allocation is from freelist */
char isfl; /* set if is freelist blocks - !acctg */
char userdata; /* set if this is user data */
+ xfs_fsblock_t firstblock; /* io first block allocated */
} xfs_alloc_arg_t;
/*
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 890ad3528174..ad595dbefe16 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2762,6 +2762,7 @@ xfs_bmap_btalloc(
args.mp = mp;
args.fsbno = ap->rval;
args.maxlen = MIN(ap->alen, mp->m_sb.sb_agblocks);
+ args.firstblock = ap->firstblock;
blen = 0;
if (nullfb) {
args.type = XFS_ALLOCTYPE_START_BNO;
@@ -2821,7 +2822,7 @@ xfs_bmap_btalloc(
else
args.minlen = ap->alen;
} else if (ap->low) {
- args.type = XFS_ALLOCTYPE_FIRST_AG;
+ args.type = XFS_ALLOCTYPE_START_BNO;
args.total = args.minlen = ap->minlen;
} else {
args.type = XFS_ALLOCTYPE_NEAR_BNO;
@@ -3452,6 +3453,7 @@ xfs_bmap_extents_to_btree(
XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
args.tp = tp;
args.mp = mp;
+ args.firstblock = *firstblock;
if (*firstblock == NULLFSBLOCK) {
args.type = XFS_ALLOCTYPE_START_BNO;
args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
@@ -3587,6 +3589,7 @@ xfs_bmap_local_to_extents(
args.tp = tp;
args.mp = ip->i_mount;
+ args.firstblock = *firstblock;
ASSERT((ifp->if_flags &
(XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE);
/*
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index bea44709afbe..3b6dfc9b53af 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -1569,12 +1569,11 @@ xfs_bmbt_split(
lbno = XFS_DADDR_TO_FSB(args.mp, XFS_BUF_ADDR(lbp));
left = XFS_BUF_TO_BMBT_BLOCK(lbp);
args.fsbno = cur->bc_private.b.firstblock;
+ args.firstblock = args.fsbno;
if (args.fsbno == NULLFSBLOCK) {
args.fsbno = lbno;
args.type = XFS_ALLOCTYPE_START_BNO;
- } else if (cur->bc_private.b.flist->xbf_low)
- args.type = XFS_ALLOCTYPE_FIRST_AG;
- else
+ } else
args.type = XFS_ALLOCTYPE_NEAR_BNO;
args.mod = args.minleft = args.alignment = args.total = args.isfl =
args.userdata = args.minalignslop = 0;
@@ -2356,6 +2355,7 @@ xfs_bmbt_newroot(
args.userdata = args.minalignslop = 0;
args.minlen = args.maxlen = args.prod = 1;
args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
+ args.firstblock = args.fsbno;
if (args.fsbno == NULLFSBLOCK) {
#ifdef DEBUG
if ((error = xfs_btree_check_lptr(cur, INT_GET(*pp, ARCH_CONVERT), level))) {
@@ -2365,9 +2365,7 @@ xfs_bmbt_newroot(
#endif
args.fsbno = INT_GET(*pp, ARCH_CONVERT);
args.type = XFS_ALLOCTYPE_START_BNO;
- } else if (args.wasdel)
- args.type = XFS_ALLOCTYPE_FIRST_AG;
- else
+ } else
args.type = XFS_ALLOCTYPE_NEAR_BNO;
if ((error = xfs_alloc_vextent(&args))) {
XFS_BMBT_TRACE_CURSOR(cur, ERROR);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index c0b1c2906880..4b7be49cc4de 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1254,6 +1254,26 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
xfs_trans_log_buf(tp, bp, first, last);
}
+
+/*
+ * In order to avoid ENOSPC-related deadlock caused by
+ * out-of-order locking of AGF buffer (PV 947395), we place
+ * constraints on the relationship among actual allocations for
+ * data blocks, freelist blocks, and potential file data bmap
+ * btree blocks. However, these restrictions may result in no
+ * actual space allocated for a delayed extent, for example, a data
+ * block in a certain AG is allocated but there is no additional
+ * block for the additional bmap btree block due to a split of the
+ * bmap btree of the file. The result of this may lead to an
+ * infinite loop in xfssyncd when the file gets flushed to disk and
+ * all delayed extents need to be actually allocated. To get around
+ * this, we explicitly set aside a few blocks which will not be
+ * reserved in delayed allocation. Considering the minimum number of
+ * needed freelist blocks is 4 fsbs, a potential split of file's bmap
+ * btree requires 1 fsb, so we set the number of set-aside blocks to 8.
+*/
+#define SET_ASIDE_BLOCKS 8
+
/*
* xfs_mod_incore_sb_unlocked() is a utility routine common used to apply
* a delta to a specified field in the in-core superblock. Simply
@@ -1298,7 +1318,7 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field,
return 0;
case XFS_SBS_FDBLOCKS:
- lcounter = (long long)mp->m_sb.sb_fdblocks;
+ lcounter = (long long)mp->m_sb.sb_fdblocks - SET_ASIDE_BLOCKS;
res_used = (long long)(mp->m_resblks - mp->m_resblks_avail);
if (delta > 0) { /* Putting blocks back */
@@ -1332,7 +1352,7 @@ xfs_mod_incore_sb_unlocked(xfs_mount_t *mp, xfs_sb_field_t field,
}
}
- mp->m_sb.sb_fdblocks = lcounter;
+ mp->m_sb.sb_fdblocks = lcounter + SET_ASIDE_BLOCKS;
return 0;
case XFS_SBS_FREXTENTS:
lcounter = (long long)mp->m_sb.sb_frextents;