From 4b4fa25ced2d719a06a3a63009bea1cf1fbedd55 Mon Sep 17 00:00:00 2001 From: Mandy Kirkconnell Date: Fri, 31 Mar 2006 13:03:58 +1000 Subject: [XFS] Cleanup comment to remove reference to obsoleted function xfs_bmap_do_search_extents(). SGI-PV: 951415 SGI-Modid: xfs-linux-melb:xfs-kern:208491a Signed-off-by: Mandy Kirkconnell Signed-off-by: Nathan Scott --- fs/xfs/xfs_bmap.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index f83399c89ce3..8e0d73d9ccc4 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -353,10 +353,11 @@ xfs_check_nostate_extents( xfs_extnum_t num); /* - * Call xfs_bmap_do_search_extents() to search for the extent - * record containing block bno. If in multi-level in-core extent - * allocation mode, find and extract the target extent buffer, - * otherwise just use the direct extent list. + * Search the extent records for the entry containing block bno. + * If bno lies in a hole, point to the next entry. If bno lies + * past eof, *eofp will be set, and *prevp will contain the last + * entry (null if none). Else, *lastxp will be set to the index + * of the found entry; *gotp will contain the entry. */ xfs_bmbt_rec_t * xfs_bmap_search_multi_extents(struct xfs_ifork *, xfs_fileoff_t, int *, -- cgit v1.2.3 From 764d1f89a5f2b914bc13b1b8b8920a600a5fba10 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Fri, 31 Mar 2006 13:04:17 +1000 Subject: [XFS] Implement the silent parameter to fill_super, previously ignored. 
SGI-PV: 951299 SGI-Modid: xfs-linux-melb:xfs-kern:25632a Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_super.c | 11 +++---- fs/xfs/xfs_clnt.h | 1 + fs/xfs/xfs_error.h | 3 ++ fs/xfs/xfs_mount.c | 71 +++++++++++++++++++------------------------- fs/xfs/xfs_mount.h | 5 +++- fs/xfs/xfs_vfsops.c | 5 +++- 6 files changed, 49 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 1884300417e3..68f4793e8a11 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -67,7 +67,8 @@ mempool_t *xfs_ioend_pool; STATIC struct xfs_mount_args * xfs_args_allocate( - struct super_block *sb) + struct super_block *sb, + int silent) { struct xfs_mount_args *args; @@ -80,8 +81,8 @@ xfs_args_allocate( args->flags |= XFSMNT_DIRSYNC; if (sb->s_flags & MS_SYNCHRONOUS) args->flags |= XFSMNT_WSYNC; - - /* Default to 32 bit inodes on Linux all the time */ + if (silent) + args->flags |= XFSMNT_QUIET; args->flags |= XFSMNT_32BITINODES; return args; @@ -719,7 +720,7 @@ xfs_fs_remount( char *options) { vfs_t *vfsp = vfs_from_sb(sb); - struct xfs_mount_args *args = xfs_args_allocate(sb); + struct xfs_mount_args *args = xfs_args_allocate(sb, 0); int error; VFS_PARSEARGS(vfsp, options, args, 1, error); @@ -825,7 +826,7 @@ xfs_fs_fill_super( { vnode_t *rootvp; struct vfs *vfsp = vfs_allocate(sb); - struct xfs_mount_args *args = xfs_args_allocate(sb); + struct xfs_mount_args *args = xfs_args_allocate(sb, silent); struct kstatfs statvfs; int error, error2; diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h index 022fff62085b..5b7eb81453be 100644 --- a/fs/xfs/xfs_clnt.h +++ b/fs/xfs/xfs_clnt.h @@ -68,6 +68,7 @@ struct xfs_mount_args { * enforcement */ #define XFSMNT_PQUOTAENF 0x00000040 /* IRIX project quota limit * enforcement */ +#define XFSMNT_QUIET 0x00000080 /* don't report mount errors */ #define XFSMNT_NOALIGN 0x00000200 /* don't allocate at * stripe boundaries*/ #define XFSMNT_RETERR 0x00000400 /* 
return error to user */ diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 26b8e709a569..bc43163456ef 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -186,4 +186,7 @@ extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...); #define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \ xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args) +#define xfs_fs_mount_cmn_err(f, fmt, args...) \ + (((f) & XFS_MFSI_QUIET) ? (void)0 : cmn_err(CE_WARN, "XFS: " fmt, ## args)) /* warn unless XFS_MFSI_QUIET is set in f */ + #endif /* __XFS_ERROR_H__ */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 72e7e78bfff8..049fabb7f7e0 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -213,7 +213,8 @@ xfs_mount_free( STATIC int xfs_mount_validate_sb( xfs_mount_t *mp, - xfs_sb_t *sbp) + xfs_sb_t *sbp, + int flags) { /* * If the log device and data device have the @@ -223,33 +224,29 @@ xfs_mount_validate_sb( * a volume filesystem in a non-volume manner. */ if (sbp->sb_magicnum != XFS_SB_MAGIC) { - cmn_err(CE_WARN, "XFS: bad magic number"); + xfs_fs_mount_cmn_err(flags, "bad magic number"); return XFS_ERROR(EWRONGFS); } if (!XFS_SB_GOOD_VERSION(sbp)) { - cmn_err(CE_WARN, "XFS: bad version"); + xfs_fs_mount_cmn_err(flags, "bad version"); return XFS_ERROR(EWRONGFS); } if (unlikely( sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { - cmn_err(CE_WARN, - "XFS: filesystem is marked as having an external log; " - "specify logdev on the\nmount command line."); - XFS_CORRUPTION_ERROR("xfs_mount_validate_sb(1)", - XFS_ERRLEVEL_HIGH, mp, sbp); - return XFS_ERROR(EFSCORRUPTED); + xfs_fs_mount_cmn_err(flags, + "filesystem is marked as having an external log; " + "specify logdev on the\nmount command line."); + return XFS_ERROR(EINVAL); } if (unlikely( sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { - cmn_err(CE_WARN, - "XFS: filesystem is marked as having an internal log; " - "don't specify logdev on\nthe mount command line."); - 
XFS_CORRUPTION_ERROR("xfs_mount_validate_sb(2)", - XFS_ERRLEVEL_HIGH, mp, sbp); - return XFS_ERROR(EFSCORRUPTED); + xfs_fs_mount_cmn_err(flags, + "filesystem is marked as having an internal log; " + "do not specify logdev on\nthe mount command line."); + return XFS_ERROR(EINVAL); } /* @@ -274,9 +271,7 @@ xfs_mount_validate_sb( (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || (sbp->sb_imax_pct > 100 || sbp->sb_imax_pct < 1))) { - cmn_err(CE_WARN, "XFS: SB sanity check 1 failed"); - XFS_CORRUPTION_ERROR("xfs_mount_validate_sb(3)", - XFS_ERRLEVEL_LOW, mp, sbp); + xfs_fs_mount_cmn_err(flags, "SB sanity check 1 failed"); return XFS_ERROR(EFSCORRUPTED); } @@ -289,9 +284,7 @@ xfs_mount_validate_sb( (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks || sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) * sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) { - cmn_err(CE_WARN, "XFS: SB sanity check 2 failed"); - XFS_ERROR_REPORT("xfs_mount_validate_sb(4)", - XFS_ERRLEVEL_LOW, mp); + xfs_fs_mount_cmn_err(flags, "SB sanity check 2 failed"); return XFS_ERROR(EFSCORRUPTED); } @@ -307,15 +300,13 @@ xfs_mount_validate_sb( (sbp->sb_dblocks << (sbp->sb_blocklog - BBSHIFT)) > UINT_MAX || (sbp->sb_rblocks << (sbp->sb_blocklog - BBSHIFT)) > UINT_MAX)) { #endif - cmn_err(CE_WARN, - "XFS: File system is too large to be mounted on this system."); + xfs_fs_mount_cmn_err(flags, + "file system too large to be mounted on this system."); return XFS_ERROR(E2BIG); } if (unlikely(sbp->sb_inprogress)) { - cmn_err(CE_WARN, "XFS: file system busy"); - XFS_ERROR_REPORT("xfs_mount_validate_sb(5)", - XFS_ERRLEVEL_LOW, mp); + xfs_fs_mount_cmn_err(flags, "file system busy"); return XFS_ERROR(EFSCORRUPTED); } @@ -323,8 +314,8 @@ xfs_mount_validate_sb( * Version 1 directory format has never worked on Linux. 
*/ if (unlikely(!XFS_SB_VERSION_HASDIRV2(sbp))) { - cmn_err(CE_WARN, - "XFS: Attempted to mount file system using version 1 directory format"); + xfs_fs_mount_cmn_err(flags, + "file system using version 1 directory format"); return XFS_ERROR(ENOSYS); } @@ -332,11 +323,11 @@ xfs_mount_validate_sb( * Until this is fixed only page-sized or smaller data blocks work. */ if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { - cmn_err(CE_WARN, - "XFS: Attempted to mount file system with blocksize %d bytes", + xfs_fs_mount_cmn_err(flags, + "file system with blocksize %d bytes", sbp->sb_blocksize); - cmn_err(CE_WARN, - "XFS: Only page-sized (%ld) or less blocksizes currently work.", + xfs_fs_mount_cmn_err(flags, + "only pagesize (%ld) or less will currently work.", PAGE_SIZE); return XFS_ERROR(ENOSYS); } @@ -484,7 +475,7 @@ xfs_xlatesb( * Does the initial read of the superblock. */ int -xfs_readsb(xfs_mount_t *mp) +xfs_readsb(xfs_mount_t *mp, int flags) { unsigned int sector_size; unsigned int extra_flags; @@ -506,7 +497,7 @@ xfs_readsb(xfs_mount_t *mp) bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size), extra_flags); if (!bp || XFS_BUF_ISERROR(bp)) { - cmn_err(CE_WARN, "XFS: SB read failed"); + xfs_fs_mount_cmn_err(flags, "SB read failed"); error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM; goto fail; } @@ -520,9 +511,9 @@ xfs_readsb(xfs_mount_t *mp) sbp = XFS_BUF_TO_SBP(bp); xfs_xlatesb(XFS_BUF_PTR(bp), &(mp->m_sb), 1, XFS_SB_ALL_BITS); - error = xfs_mount_validate_sb(mp, &(mp->m_sb)); + error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); if (error) { - cmn_err(CE_WARN, "XFS: SB validate failed"); + xfs_fs_mount_cmn_err(flags, "SB validate failed"); goto fail; } @@ -530,8 +521,8 @@ xfs_readsb(xfs_mount_t *mp) * We must be able to do sector-sized and sector-aligned IO. 
*/ if (sector_size > mp->m_sb.sb_sectsize) { - cmn_err(CE_WARN, - "XFS: device supports only %u byte sectors (not %u)", + xfs_fs_mount_cmn_err(flags, + "device supports only %u byte sectors (not %u)", sector_size, mp->m_sb.sb_sectsize); error = ENOSYS; goto fail; @@ -548,7 +539,7 @@ xfs_readsb(xfs_mount_t *mp) bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size), extra_flags); if (!bp || XFS_BUF_ISERROR(bp)) { - cmn_err(CE_WARN, "XFS: SB re-read failed"); + xfs_fs_mount_cmn_err(flags, "SB re-read failed"); error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM; goto fail; } @@ -678,7 +669,7 @@ xfs_mountfs( int error = 0; if (mp->m_sb_bp == NULL) { - if ((error = xfs_readsb(mp))) { + if ((error = xfs_readsb(mp, mfsi_flags))) { return error; } } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 66cbee79864e..668ad23fd37c 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -510,9 +510,12 @@ xfs_preferred_iosize(xfs_mount_t *mp) */ #define XFS_MFSI_SECOND 0x01 /* Secondary mount -- skip stuff */ #define XFS_MFSI_CLIENT 0x02 /* Is a client -- skip lots of stuff */ +/* XFS_MFSI_RRINODES */ #define XFS_MFSI_NOUNLINK 0x08 /* Skip unlinked inode processing in */ /* log recovery */ #define XFS_MFSI_NO_QUOTACHECK 0x10 /* Skip quotacheck processing */ +/* XFS_MFSI_CONVERT_SUNIT */ +#define XFS_MFSI_QUIET 0x40 /* Be silent if mount errors found */ /* * Macros for getting from mount to vfs and back. 
@@ -581,7 +584,7 @@ extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t, extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, uint, int); extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); -extern int xfs_readsb(xfs_mount_t *mp); +extern int xfs_readsb(xfs_mount_t *, int); extern void xfs_freesb(xfs_mount_t *); extern void xfs_do_force_shutdown(bhv_desc_t *, int, char *, int); extern int xfs_syncsub(xfs_mount_t *, int, int, int *); diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index 504d2a80747a..89020c15d88a 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -442,6 +442,9 @@ xfs_mount( p = vfs_bhv_lookup(vfsp, VFS_POSITION_IO); mp->m_io_ops = p ? *(xfs_ioops_t *) vfs_bhv_custom(p) : xfs_iocore_xfs; + if (args->flags & XFSMNT_QUIET) + flags |= XFS_MFSI_QUIET; + /* * Open real time and log devices - order is important. */ @@ -492,7 +495,7 @@ xfs_mount( error = xfs_start_flags(vfsp, args, mp); if (error) goto error1; - error = xfs_readsb(mp); + error = xfs_readsb(mp, flags); if (error) goto error1; error = xfs_finish_flags(vfsp, args, mp); -- cgit v1.2.3 From 9a2a7de268f67fea0c450ed3e99a2d31f43d7166 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Fri, 31 Mar 2006 13:04:49 +1000 Subject: [XFS] Make project quota enforcement return an error code consistent with its use. 
SGI-PV: 951300 SGI-Modid: xfs-linux-melb:xfs-kern:25633a Signed-off-by: Nathan Scott --- fs/xfs/quota/xfs_qm.c | 17 ++++++----- fs/xfs/quota/xfs_trans_dquot.c | 68 ++++++++++++++++++++++-------------------- fs/xfs/xfs_bmap.c | 11 ++++--- fs/xfs/xfs_quota.h | 5 ++-- 4 files changed, 54 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 73c1e5e80c07..7fb5eca9bd50 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -2624,7 +2624,7 @@ xfs_qm_vop_chown_reserve( { int error; xfs_mount_t *mp; - uint delblks, blkflags; + uint delblks, blkflags, prjflags = 0; xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq; ASSERT(XFS_ISLOCKED_INODE(ip)); @@ -2650,10 +2650,13 @@ xfs_qm_vop_chown_reserve( } } if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) { - if ((XFS_IS_GQUOTA_ON(ip->i_mount) && - ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id)) || - (XFS_IS_PQUOTA_ON(ip->i_mount) && - ip->i_d.di_projid != be32_to_cpu(gdqp->q_core.d_id))) { + if (XFS_IS_PQUOTA_ON(ip->i_mount) && + ip->i_d.di_projid != be32_to_cpu(gdqp->q_core.d_id)) + prjflags = XFS_QMOPT_ENOSPC; + + if (prjflags || + (XFS_IS_GQUOTA_ON(ip->i_mount) && + ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id))) { delblksgdq = gdqp; if (delblks) { ASSERT(ip->i_gdquot); @@ -2664,7 +2667,7 @@ xfs_qm_vop_chown_reserve( if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, delblksudq, delblksgdq, ip->i_d.di_nblocks, 1, - flags | blkflags))) + flags | blkflags | prjflags))) return (error); /* @@ -2681,7 +2684,7 @@ xfs_qm_vop_chown_reserve( ASSERT(unresudq || unresgdq); if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0, - flags | blkflags))) + flags | blkflags | prjflags))) return (error); xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0, diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c index 
d8e131ec0aa8..9168918db252 100644 --- a/fs/xfs/quota/xfs_trans_dquot.c +++ b/fs/xfs/quota/xfs_trans_dquot.c @@ -595,12 +595,19 @@ xfs_trans_unreserve_and_mod_dquots( } } +STATIC int +xfs_quota_error(uint flags) +{ + if (flags & XFS_QMOPT_ENOSPC) + return ENOSPC; + return EDQUOT; +} + /* * This reserves disk blocks and inodes against a dquot. * Flags indicate if the dquot is to be locked here and also * if the blk reservation is for RT or regular blocks. * Sending in XFS_QMOPT_FORCE_RES flag skips the quota check. - * Returns EDQUOT if quota is exceeded. */ STATIC int xfs_trans_dqresv( @@ -666,19 +673,15 @@ xfs_trans_dqresv( */ if (hardlimit > 0ULL && (hardlimit <= nblks + *resbcountp)) { - error = EDQUOT; + error = xfs_quota_error(flags); goto error_return; } if (softlimit > 0ULL && (softlimit <= nblks + *resbcountp)) { - /* - * If timer or warnings has expired, - * return EDQUOT - */ if ((timer != 0 && get_seconds() > timer) || (warns != 0 && warns >= warnlimit)) { - error = EDQUOT; + error = xfs_quota_error(flags); goto error_return; } } @@ -695,16 +698,12 @@ xfs_trans_dqresv( if (!softlimit) softlimit = q->qi_isoftlimit; if (hardlimit > 0ULL && count >= hardlimit) { - error = EDQUOT; + error = xfs_quota_error(flags); goto error_return; } else if (softlimit > 0ULL && count >= softlimit) { - /* - * If timer or warnings has expired, - * return EDQUOT - */ if ((timer != 0 && get_seconds() > timer) || (warns != 0 && warns >= warnlimit)) { - error = EDQUOT; + error = xfs_quota_error(flags); goto error_return; } } @@ -751,13 +750,14 @@ error_return: /* - * Given a dquot(s), make disk block and/or inode reservations against them. + * Given dquot(s), make disk block and/or inode reservations against them. * The fact that this does the reservation against both the usr and - * grp quotas is important, because this follows a both-or-nothing + * grp/prj quotas is important, because this follows a both-or-nothing * approach. 
* * flags = XFS_QMOPT_DQLOCK indicate if dquot(s) need to be locked. * XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown. + * XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT. Used by pquota. * XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks * XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks * dquots are unlocked on return, if they were not locked by caller. @@ -772,25 +772,27 @@ xfs_trans_reserve_quota_bydquots( long ninos, uint flags) { - int resvd; + int resvd = 0, error; - if (! XFS_IS_QUOTA_ON(mp)) - return (0); + if (!XFS_IS_QUOTA_ON(mp)) + return 0; if (tp && tp->t_dqinfo == NULL) xfs_trans_alloc_dqinfo(tp); ASSERT(flags & XFS_QMOPT_RESBLK_MASK); - resvd = 0; if (udqp) { - if (xfs_trans_dqresv(tp, mp, udqp, nblks, ninos, flags)) - return (EDQUOT); + error = xfs_trans_dqresv(tp, mp, udqp, nblks, ninos, + (flags & ~XFS_QMOPT_ENOSPC)); + if (error) + return error; resvd = 1; } if (gdqp) { - if (xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags)) { + error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags); + if (error) { /* * can't do it, so backout previous reservation */ @@ -799,14 +801,14 @@ xfs_trans_reserve_quota_bydquots( xfs_trans_dqresv(tp, mp, udqp, -nblks, -ninos, flags); } - return (EDQUOT); + return error; } } /* * Didn't change anything critical, so, no need to log */ - return (0); + return 0; } @@ -814,8 +816,6 @@ xfs_trans_reserve_quota_bydquots( * Lock the dquot and change the reservation if we can. * This doesn't change the actual usage, just the reservation. * The inode sent in is locked. 
- * - * Returns 0 on success, EDQUOT or other errors otherwise */ STATIC int xfs_trans_reserve_quota_nblks( @@ -824,20 +824,24 @@ xfs_trans_reserve_quota_nblks( xfs_inode_t *ip, long nblks, long ninos, - uint type) + uint flags) { int error; if (!XFS_IS_QUOTA_ON(mp)) - return (0); + return 0; + if (XFS_IS_PQUOTA_ON(mp)) + flags |= XFS_QMOPT_ENOSPC; ASSERT(ip->i_ino != mp->m_sb.sb_uquotino); ASSERT(ip->i_ino != mp->m_sb.sb_gquotino); ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount)); - ASSERT((type & ~XFS_QMOPT_FORCE_RES) == XFS_TRANS_DQ_RES_RTBLKS || - (type & ~XFS_QMOPT_FORCE_RES) == XFS_TRANS_DQ_RES_BLKS); + ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == + XFS_TRANS_DQ_RES_RTBLKS || + (flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == + XFS_TRANS_DQ_RES_BLKS); /* * Reserve nblks against these dquots, with trans as the mediator. @@ -845,8 +849,8 @@ xfs_trans_reserve_quota_nblks( error = xfs_trans_reserve_quota_bydquots(tp, mp, ip->i_udquot, ip->i_gdquot, nblks, ninos, - type); - return (error); + flags); + return error; } /* diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index d384e489705f..26939d364bc4 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4719,18 +4719,17 @@ xfs_bmapi( /* * Make a transaction-less quota reservation for * delayed allocation blocks. This number gets - * adjusted later. - * We return EDQUOT if we haven't allocated - * blks already inside this loop; + * adjusted later. We return if we haven't + * allocated blocks already inside this loop. */ - if (XFS_TRANS_RESERVE_QUOTA_NBLKS( + if ((error = XFS_TRANS_RESERVE_QUOTA_NBLKS( mp, NULL, ip, (long)alen, 0, rt ? 
XFS_QMOPT_RES_RTBLKS : - XFS_QMOPT_RES_REGBLKS)) { + XFS_QMOPT_RES_REGBLKS))) { if (n == 0) { *nmap = 0; ASSERT(cur == NULL); - return XFS_ERROR(EDQUOT); + return error; } break; } diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 4f6a034de7f7..7fbef974bce6 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -196,10 +196,11 @@ typedef struct xfs_qoff_logformat { #define XFS_QMOPT_QUOTAOFF 0x0000080 /* quotas are being turned off */ #define XFS_QMOPT_UMOUNTING 0x0000100 /* filesys is being unmounted */ #define XFS_QMOPT_DOLOG 0x0000200 /* log buf changes (in quotacheck) */ -#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if necessary */ +#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ #define XFS_QMOPT_ILOCKED 0x0000800 /* inode is already locked (excl) */ -#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot, if damaged. */ +#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ #define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ +#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */ /* * flags to xfs_trans_mod_dquot to indicate which field needs to be -- cgit v1.2.3 From 3bbcc8e3976f8bba2fd607c8850d7dfe7e332fda Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Fri, 31 Mar 2006 13:04:56 +1000 Subject: [XFS] Reenable write barriers by default. 
SGI-PV: 912426 SGI-Modid: xfs-linux-melb:xfs-kern:25634a Signed-off-by: Nathan Scott --- fs/xfs/xfs_vfsops.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index 89020c15d88a..f0e09ca14139 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -1700,8 +1700,9 @@ xfs_parseargs( int dsunit, dswidth, vol_dsunit, vol_dswidth; int iosize; - args->flags2 |= XFSMNT2_COMPAT_IOSIZE; args->flags |= XFSMNT_IDELETE; + args->flags |= XFSMNT_BARRIER; + args->flags2 |= XFSMNT2_COMPAT_IOSIZE; if (!options) goto done; @@ -1950,8 +1951,6 @@ xfs_showargs( seq_printf(m, "," MNTOPT_IKEEP); if (!(mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE)) seq_printf(m, "," MNTOPT_LARGEIO); - if (mp->m_flags & XFS_MOUNT_BARRIER) - seq_printf(m, "," MNTOPT_BARRIER); if (!(vfsp->vfs_flag & VFS_32BITINODES)) seq_printf(m, "," MNTOPT_64BITINODE); -- cgit v1.2.3 From 1b895840ce93fd2d150a86c800a3085eaab4eb9e Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Fri, 31 Mar 2006 13:08:59 +1000 Subject: [XFS] Provide XFS support for the splice syscall. 
Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_file.c | 113 ++++++++++++++++++++++++++++------------ fs/xfs/linux-2.6/xfs_linux.h | 1 + fs/xfs/linux-2.6/xfs_lrw.c | 120 +++++++++++++++++++++++++++++++++---------- fs/xfs/linux-2.6/xfs_lrw.h | 11 +++- fs/xfs/linux-2.6/xfs_vnode.h | 12 +++++ fs/xfs/xfs_vnodeops.c | 4 ++ 6 files changed, 199 insertions(+), 62 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 85997b1205f5..ae4c4754ed31 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -69,7 +69,6 @@ __xfs_file_read( return rval; } - STATIC ssize_t xfs_file_aio_read( struct kiocb *iocb, @@ -90,7 +89,6 @@ xfs_file_aio_read_invis( return __xfs_file_read(iocb, buf, IO_ISAIO|IO_INVIS, count, pos); } - STATIC inline ssize_t __xfs_file_write( struct kiocb *iocb, @@ -113,7 +111,6 @@ __xfs_file_write( return rval; } - STATIC ssize_t xfs_file_aio_write( struct kiocb *iocb, @@ -134,7 +131,6 @@ xfs_file_aio_write_invis( return __xfs_file_write(iocb, buf, IO_ISAIO|IO_INVIS, count, pos); } - STATIC inline ssize_t __xfs_file_readv( struct file *file, @@ -179,7 +175,6 @@ xfs_file_readv_invis( return __xfs_file_readv(file, iov, IO_INVIS, nr_segs, ppos); } - STATIC inline ssize_t __xfs_file_writev( struct file *file, @@ -204,7 +199,6 @@ __xfs_file_writev( return rval; } - STATIC ssize_t xfs_file_writev( struct file *file, @@ -228,7 +222,7 @@ xfs_file_writev_invis( STATIC ssize_t xfs_file_sendfile( struct file *filp, - loff_t *ppos, + loff_t *pos, size_t count, read_actor_t actor, void *target) @@ -236,10 +230,80 @@ xfs_file_sendfile( vnode_t *vp = vn_from_inode(filp->f_dentry->d_inode); ssize_t rval; - VOP_SENDFILE(vp, filp, ppos, 0, count, actor, target, NULL, rval); + VOP_SENDFILE(vp, filp, pos, 0, count, actor, target, NULL, rval); return rval; } +STATIC ssize_t +xfs_file_sendfile_invis( + struct file *filp, + loff_t *pos, + size_t count, + read_actor_t actor, + void *target) +{ + vnode_t *vp = 
vn_from_inode(filp->f_dentry->d_inode); + ssize_t rval; + + VOP_SENDFILE(vp, filp, pos, IO_INVIS, count, actor, target, NULL, rval); + return rval; +} + +STATIC ssize_t +xfs_file_splice_read( + struct file *infilp, + struct inode *pipe, + size_t len, + unsigned int flags) +{ + vnode_t *vp = vn_from_inode(infilp->f_dentry->d_inode); + ssize_t rval; + + VOP_SPLICE_READ(vp, infilp, pipe, len, flags, 0, NULL, rval); + return rval; +} + +STATIC ssize_t +xfs_file_splice_read_invis( + struct file *infilp, + struct inode *pipe, + size_t len, + unsigned int flags) +{ + vnode_t *vp = vn_from_inode(infilp->f_dentry->d_inode); + ssize_t rval; + + VOP_SPLICE_READ(vp, infilp, pipe, len, flags, IO_INVIS, NULL, rval); + return rval; +} + +STATIC ssize_t +xfs_file_splice_write( + struct inode *pipe, + struct file *outfilp, + size_t len, + unsigned int flags) +{ + vnode_t *vp = vn_from_inode(outfilp->f_dentry->d_inode); + ssize_t rval; + + VOP_SPLICE_WRITE(vp, pipe, outfilp, len, flags, 0, NULL, rval); + return rval; +} + +STATIC ssize_t +xfs_file_splice_write_invis( + struct inode *pipe, + struct file *outfilp, + size_t len, + unsigned int flags) +{ + vnode_t *vp = vn_from_inode(outfilp->f_dentry->d_inode); + ssize_t rval; + + VOP_SPLICE_WRITE(vp, pipe, outfilp, len, flags, IO_INVIS, NULL, rval); + return rval; +} STATIC int xfs_file_open( @@ -251,13 +315,10 @@ xfs_file_open( if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) return -EFBIG; - - ASSERT(vp); VOP_OPEN(vp, NULL, error); return -error; } - STATIC int xfs_file_release( struct inode *inode, @@ -271,7 +332,6 @@ xfs_file_release( return -error; } - STATIC int xfs_file_fsync( struct file *filp, @@ -285,21 +345,11 @@ xfs_file_fsync( if (datasync) flags |= FSYNC_DATA; - - ASSERT(vp); VOP_FSYNC(vp, flags, NULL, (xfs_off_t)0, (xfs_off_t)-1, error); return -error; } -/* - * xfs_file_readdir maps to VOP_READDIR(). - * We need to build a uio, cred, ... 
- */ - -#define nextdp(dp) ((struct xfs_dirent *)((char *)(dp) + (dp)->d_reclen)) - #ifdef CONFIG_XFS_DMAPI - STATIC struct page * xfs_vm_nopage( struct vm_area_struct *area, @@ -319,10 +369,8 @@ xfs_vm_nopage( return filemap_nopage(area, address, type); } - #endif /* CONFIG_XFS_DMAPI */ - STATIC int xfs_file_readdir( struct file *filp, @@ -330,7 +378,7 @@ xfs_file_readdir( filldir_t filldir) { int error = 0; - vnode_t *vp; + vnode_t *vp = vn_from_inode(filp->f_dentry->d_inode); uio_t uio; iovec_t iov; int eof = 0; @@ -340,9 +388,6 @@ xfs_file_readdir( xfs_off_t start_offset, curr_offset; xfs_dirent_t *dbp = NULL; - vp = vn_from_inode(filp->f_dentry->d_inode); - ASSERT(vp); - /* Try fairly hard to get memory */ do { if ((read_buf = (caddr_t)kmalloc(rlen, GFP_KERNEL))) @@ -387,7 +432,7 @@ xfs_file_readdir( } size -= dbp->d_reclen; curr_offset = (loff_t)dbp->d_off /* & 0x7fffffff */; - dbp = nextdp(dbp); + dbp = (xfs_dirent_t *)((char *)dbp + dbp->d_reclen); } } done: @@ -402,7 +447,6 @@ done: return -error; } - STATIC int xfs_file_mmap( struct file *filp, @@ -457,11 +501,10 @@ xfs_file_ioctl_invis( unsigned int cmd, unsigned long arg) { - int error; struct inode *inode = filp->f_dentry->d_inode; vnode_t *vp = vn_from_inode(inode); + int error; - ASSERT(vp); VOP_IOCTL(vp, inode, filp, IO_INVIS, cmd, (void __user *)arg, error); VMODIFY(vp); @@ -537,6 +580,8 @@ const struct file_operations xfs_file_operations = { .aio_read = xfs_file_aio_read, .aio_write = xfs_file_aio_write, .sendfile = xfs_file_sendfile, + .splice_read = xfs_file_splice_read, + .splice_write = xfs_file_splice_write, .unlocked_ioctl = xfs_file_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_ioctl, @@ -558,7 +603,9 @@ const struct file_operations xfs_invis_file_operations = { .writev = xfs_file_writev_invis, .aio_read = xfs_file_aio_read_invis, .aio_write = xfs_file_aio_write_invis, - .sendfile = xfs_file_sendfile, + .sendfile = xfs_file_sendfile_invis, + .splice_read = 
xfs_file_splice_read_invis, + .splice_write = xfs_file_splice_write_invis, .unlocked_ioctl = xfs_file_ioctl_invis, #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_invis_ioctl, diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 1fe09f2d6519..e9fe43d74768 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -103,6 +103,7 @@ */ #undef HAVE_REFCACHE /* reference cache not needed for NFS in 2.6 */ #define HAVE_SENDFILE /* sendfile(2) exists in 2.6, but not in 2.4 */ +#define HAVE_SPLICE /* a splice(2) exists in 2.6, but not in 2.4 */ #ifdef CONFIG_SMP #define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ #else diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 84ddf1893894..90cd314acbaa 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -301,36 +301,23 @@ xfs_sendfile( void *target, cred_t *credp) { + xfs_inode_t *ip = XFS_BHVTOI(bdp); + xfs_mount_t *mp = ip->i_mount; ssize_t ret; - xfs_fsize_t n; - xfs_inode_t *ip; - xfs_mount_t *mp; - vnode_t *vp; - - ip = XFS_BHVTOI(bdp); - vp = BHV_TO_VNODE(bdp); - mp = ip->i_mount; XFS_STATS_INC(xs_read_calls); - - n = XFS_MAXIOFFSET(mp) - *offset; - if ((n <= 0) || (count == 0)) - return 0; - - if (n < count) - count = n; - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) && + if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_READ) && (!(ioflags & IO_INVIS))) { vrwlock_t locktype = VRWLOCK_READ; int error; - error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), *offset, count, + error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), + *offset, count, FILP_DELAY_FLAG(filp), &locktype); if (error) { xfs_iunlock(ip, XFS_IOLOCK_SHARED); @@ -340,12 +327,96 @@ xfs_sendfile( xfs_rw_enter_trace(XFS_SENDFILE_ENTER, &ip->i_iocore, (void *)(unsigned long)target, count, 
*offset, ioflags); ret = generic_file_sendfile(filp, offset, count, actor, target); + if (ret > 0) + XFS_STATS_ADD(xs_read_bytes, ret); xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return ret; +} +ssize_t +xfs_splice_read( + bhv_desc_t *bdp, + struct file *infilp, + struct inode *pipe, + size_t count, + int flags, + int ioflags, + cred_t *credp) +{ + xfs_inode_t *ip = XFS_BHVTOI(bdp); + xfs_mount_t *mp = ip->i_mount; + ssize_t ret; + + XFS_STATS_INC(xs_read_calls); + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + xfs_ilock(ip, XFS_IOLOCK_SHARED); + + if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_READ) && + (!(ioflags & IO_INVIS))) { + vrwlock_t locktype = VRWLOCK_READ; + int error; + + error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), + infilp->f_pos, count, + FILP_DELAY_FLAG(infilp), &locktype); + if (error) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return -error; + } + } + xfs_rw_enter_trace(XFS_SPLICE_READ_ENTER, &ip->i_iocore, + pipe, count, infilp->f_pos, ioflags); + ret = generic_file_splice_read(infilp, pipe, count, flags); if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return ret; +} + +ssize_t +xfs_splice_write( + bhv_desc_t *bdp, + struct inode *pipe, + struct file *outfilp, + size_t count, + int flags, + int ioflags, + cred_t *credp) +{ + xfs_inode_t *ip = XFS_BHVTOI(bdp); + xfs_mount_t *mp = ip->i_mount; + ssize_t ret; + + XFS_STATS_INC(xs_write_calls); + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_WRITE) && + (!(ioflags & IO_INVIS))) { + vrwlock_t locktype = VRWLOCK_WRITE; + int error; + + error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, BHV_TO_VNODE(bdp), + outfilp->f_pos, count, + FILP_DELAY_FLAG(outfilp), &locktype); + if (error) { + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return -error; + } + } + xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, &ip->i_iocore, + pipe, count, outfilp->f_pos, 
ioflags); + ret = generic_file_splice_write(pipe, outfilp, count, flags); + if (ret > 0) + XFS_STATS_ADD(xs_write_bytes, ret); + + xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } @@ -363,7 +434,7 @@ xfs_zero_last_block( xfs_fsize_t end_size) { xfs_fileoff_t last_fsb; - xfs_mount_t *mp; + xfs_mount_t *mp = io->io_mount; int nimaps; int zero_offset; int zero_len; @@ -373,8 +444,6 @@ xfs_zero_last_block( ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0); - mp = io->io_mount; - zero_offset = XFS_B_FSB_OFFSET(mp, isize); if (zero_offset == 0) { /* @@ -405,10 +474,9 @@ xfs_zero_last_block( * don't deadlock when the buffer cache calls back to us. */ XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD); - loff = XFS_FSB_TO_B(mp, last_fsb); + loff = XFS_FSB_TO_B(mp, last_fsb); zero_len = mp->m_sb.sb_blocksize - zero_offset; - error = xfs_iozero(ip, loff + zero_offset, zero_len, end_size); XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); @@ -441,7 +509,7 @@ xfs_zero_eof( xfs_fileoff_t zero_count_fsb; xfs_fileoff_t last_fsb; xfs_extlen_t buf_len_fsb; - xfs_mount_t *mp; + xfs_mount_t *mp = io->io_mount; int nimaps; int error = 0; xfs_bmbt_irec_t imap; @@ -450,8 +518,6 @@ xfs_zero_eof( ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); ASSERT(offset > isize); - mp = io->io_mount; - /* * First handle zeroing the block on which isize resides. * We only zero a part of that block so it is handled specially. 
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h index 38864a88d42d..eaa5659713fb 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.h +++ b/fs/xfs/linux-2.6/xfs_lrw.h @@ -60,6 +60,8 @@ struct xfs_iomap; #define XFS_IOMAP_ALLOC_ENTER 25 #define XFS_IOMAP_ALLOC_MAP 26 #define XFS_IOMAP_UNWRITTEN 27 +#define XFS_SPLICE_READ_ENTER 28 +#define XFS_SPLICE_WRITE_ENTER 29 extern void xfs_rw_enter_trace(int, struct xfs_iocore *, void *, size_t, loff_t, int); extern void xfs_inval_cached_trace(struct xfs_iocore *, @@ -78,6 +80,7 @@ extern int xfs_bmap(struct bhv_desc *, xfs_off_t, ssize_t, int, struct xfs_iomap *, int *); extern int xfsbdstrat(struct xfs_mount *, struct xfs_buf *); extern int xfs_bdstrat_cb(struct xfs_buf *); +extern int xfs_dev_is_read_only(struct xfs_mount *, char *); extern int xfs_zero_eof(struct vnode *, struct xfs_iocore *, xfs_off_t, xfs_fsize_t, xfs_fsize_t); @@ -90,7 +93,11 @@ extern ssize_t xfs_write(struct bhv_desc *, struct kiocb *, extern ssize_t xfs_sendfile(struct bhv_desc *, struct file *, loff_t *, int, size_t, read_actor_t, void *, struct cred *); - -extern int xfs_dev_is_read_only(struct xfs_mount *, char *); +extern ssize_t xfs_splice_read(struct bhv_desc *, struct file *, + struct inode *, size_t, int, int, + struct cred *); +extern ssize_t xfs_splice_write(struct bhv_desc *, struct inode *, + struct file *, size_t, int, int, + struct cred *); #endif /* __XFS_LRW_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index 06f5845e9568..6f1c79a28f8b 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -173,6 +173,12 @@ typedef ssize_t (*vop_write_t)(bhv_desc_t *, struct kiocb *, typedef ssize_t (*vop_sendfile_t)(bhv_desc_t *, struct file *, loff_t *, int, size_t, read_actor_t, void *, struct cred *); +typedef ssize_t (*vop_splice_read_t)(bhv_desc_t *, struct file *, + struct inode *, size_t, int, int, + struct cred *); +typedef ssize_t (*vop_splice_write_t)(bhv_desc_t *, struct 
inode *, + struct file *, size_t, int, int, + struct cred *); typedef int (*vop_ioctl_t)(bhv_desc_t *, struct inode *, struct file *, int, unsigned int, void __user *); typedef int (*vop_getattr_t)(bhv_desc_t *, struct vattr *, int, @@ -231,6 +237,8 @@ typedef struct vnodeops { vop_read_t vop_read; vop_write_t vop_write; vop_sendfile_t vop_sendfile; + vop_splice_read_t vop_splice_read; + vop_splice_write_t vop_splice_write; vop_ioctl_t vop_ioctl; vop_getattr_t vop_getattr; vop_setattr_t vop_setattr; @@ -276,6 +284,10 @@ typedef struct vnodeops { rv = _VOP_(vop_write, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr) #define VOP_SENDFILE(vp,f,off,ioflags,cnt,act,targ,cr,rv) \ rv = _VOP_(vop_sendfile, vp)((vp)->v_fbhv,f,off,ioflags,cnt,act,targ,cr) +#define VOP_SPLICE_READ(vp,f,pipe,cnt,fl,iofl,cr,rv) \ + rv = _VOP_(vop_splice_read, vp)((vp)->v_fbhv,f,pipe,cnt,fl,iofl,cr) +#define VOP_SPLICE_WRITE(vp,f,pipe,cnt,fl,iofl,cr,rv) \ + rv = _VOP_(vop_splice_write, vp)((vp)->v_fbhv,f,pipe,cnt,fl,iofl,cr) #define VOP_BMAP(vp,of,sz,rw,b,n,rv) \ rv = _VOP_(vop_bmap, vp)((vp)->v_fbhv,of,sz,rw,b,n) #define VOP_OPEN(vp, cr, rv) \ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index de49601919c1..fa71b305ba5c 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -4648,6 +4648,10 @@ vnodeops_t xfs_vnodeops = { .vop_read = xfs_read, #ifdef HAVE_SENDFILE .vop_sendfile = xfs_sendfile, +#endif +#ifdef HAVE_SPLICE + .vop_splice_read = xfs_splice_read, + .vop_splice_write = xfs_splice_write, #endif .vop_write = xfs_write, .vop_ioctl = xfs_ioctl, -- cgit v1.2.3 From d4569d2e6949a63851032b40c811913d4a6f85f5 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Sat, 1 Apr 2006 01:10:13 +0200 Subject: BUG_ON() Conversion in fs/direct-io.c this changes if() BUG(); constructs to BUG_ON() which is cleaner and can better optimized away Signed-off-by: Eric Sesterhenn Signed-off-by: Adrian Bunk --- fs/direct-io.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) 
(limited to 'fs') diff --git a/fs/direct-io.c b/fs/direct-io.c index 910a8ed74b5d..b05d1b218776 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -929,8 +929,7 @@ do_holes: block_in_page += this_chunk_blocks; dio->blocks_available -= this_chunk_blocks; next_block: - if (dio->block_in_file > dio->final_block_in_request) - BUG(); + BUG_ON(dio->block_in_file > dio->final_block_in_request); if (dio->block_in_file == dio->final_block_in_request) break; } -- cgit v1.2.3 From 7dddb12c63553db850365cfd066a00416aa8c6cb Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Sat, 1 Apr 2006 01:13:38 +0200 Subject: BUG_ON() Conversion in fs/exec.c this changes if() BUG(); constructs to BUG_ON() which is cleaner and can better optimized away Signed-off-by: Eric Sesterhenn Signed-off-by: Adrian Bunk --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 950ebd43cdc3..0291a68a3626 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -561,7 +561,7 @@ static int exec_mmap(struct mm_struct *mm) arch_pick_mmap_layout(mm); if (old_mm) { up_read(&old_mm->mmap_sem); - if (active_mm != old_mm) BUG(); + BUG_ON(active_mm != old_mm); mmput(old_mm); return 0; } -- cgit v1.2.3 From 0bf3ba538a150f8430104a50e88c1449e8fa1fe6 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Sat, 1 Apr 2006 01:14:43 +0200 Subject: BUG_ON() Conversion in fs/hfsplus/ this changes if() BUG(); constructs to BUG_ON() which is cleaner, contains unlikely() and can better optimized away. 
Signed-off-by: Eric Sesterhenn Signed-off-by: Adrian Bunk --- fs/hfsplus/bnode.c | 6 ++---- fs/hfsplus/btree.c | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 8f07e8fbd03d..746abc9ecf70 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -466,8 +466,7 @@ void hfs_bnode_unhash(struct hfs_bnode *node) for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)]; *p && *p != node; p = &(*p)->next_hash) ; - if (!*p) - BUG(); + BUG_ON(!*p); *p = node->next_hash; node->tree->node_hash_cnt--; } @@ -622,8 +621,7 @@ void hfs_bnode_put(struct hfs_bnode *node) dprint(DBG_BNODE_REFS, "put_node(%d:%d): %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); - if (!atomic_read(&node->refcnt)) - BUG(); + BUG_ON(!atomic_read(&node->refcnt)); if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock)) return; for (i = 0; i < tree->pages_per_bnode; i++) { diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index a67edfa34e9e..effa8991999c 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -269,8 +269,7 @@ void hfs_bmap_free(struct hfs_bnode *node) u8 *data, byte, m; dprint(DBG_BNODE_MOD, "btree_free_node: %u\n", node->this); - if (!node->this) - BUG(); + BUG_ON(!node->this); tree = node->tree; nidx = node->this; node = hfs_bnode_find(tree, 0); -- cgit v1.2.3 From 4b4d1cc7336b29f766d4e59d1ed2c627443a694a Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Sat, 1 Apr 2006 01:15:35 +0200 Subject: BUG_ON() Conversion in fs/jffs2/ this changes if() BUG(); constructs to BUG_ON() which is cleaner, contains unlikely() and can better optimized away. 
Signed-off-by: Eric Sesterhenn Signed-off-by: Adrian Bunk --- fs/jffs2/background.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c index 7b77a9541125..ff2a872e80e7 100644 --- a/fs/jffs2/background.c +++ b/fs/jffs2/background.c @@ -35,8 +35,7 @@ int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c) pid_t pid; int ret = 0; - if (c->gc_task) - BUG(); + BUG_ON(c->gc_task); init_completion(&c->gc_thread_start); init_completion(&c->gc_thread_exit); -- cgit v1.2.3 From 5df0d312413d920628f149421d7b0a3994684620 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Sat, 1 Apr 2006 01:16:26 +0200 Subject: BUG_ON() Conversion in fs/smbfs/ this changes if() BUG(); constructs to BUG_ON() which is cleaner, contains unlikely() and can better optimized away. Signed-off-by: Eric Sesterhenn Signed-off-by: Adrian Bunk --- fs/smbfs/file.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c index c56bd99a9701..ed9a24d19d7d 100644 --- a/fs/smbfs/file.c +++ b/fs/smbfs/file.c @@ -178,11 +178,9 @@ smb_writepage(struct page *page, struct writeback_control *wbc) unsigned offset = PAGE_CACHE_SIZE; int err; - if (!mapping) - BUG(); + BUG_ON(!mapping); inode = mapping->host; - if (!inode) - BUG(); + BUG_ON(!inode); end_index = inode->i_size >> PAGE_CACHE_SHIFT; -- cgit v1.2.3 From 99cee0cd7560fc4e7f3646ee18d90e328bd1cb32 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Sat, 1 Apr 2006 01:18:38 +0200 Subject: BUG_ON() Conversion in fs/sysfs/ this changes if() BUG(); constructs to BUG_ON() which is cleaner, contains unlikely() and can better optimized away. 
Signed-off-by: Eric Sesterhenn Signed-off-by: Adrian Bunk --- fs/sysfs/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 4c29ac41ac3e..f0b347bd12ca 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -175,8 +175,7 @@ const unsigned char * sysfs_get_name(struct sysfs_dirent *sd) struct bin_attribute * bin_attr; struct sysfs_symlink * sl; - if (!sd || !sd->s_element) - BUG(); + BUG_ON(!sd || !sd->s_element); switch (sd->s_type) { case SYSFS_DIR: -- cgit v1.2.3 From 8abf6a4707cfb95ca552b882959c6f8ff9924270 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Sun, 2 Apr 2006 13:36:13 +0200 Subject: BUG_ON() Conversion in fs/dquot.c this changes if() BUG(); constructs to BUG_ON() which is cleaner and can better optimized away Signed-off-by: Eric Sesterhenn Signed-off-by: Adrian Bunk --- fs/dquot.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/dquot.c b/fs/dquot.c index 6b3886920939..81d87a413c68 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -590,8 +590,7 @@ we_slept: atomic_dec(&dquot->dq_count); #ifdef __DQUOT_PARANOIA /* sanity check */ - if (!list_empty(&dquot->dq_free)) - BUG(); + BUG_ON(!list_empty(&dquot->dq_free)); #endif put_dquot_last(dquot); spin_unlock(&dq_list_lock); @@ -666,8 +665,7 @@ we_slept: return NODQUOT; } #ifdef __DQUOT_PARANOIA - if (!dquot->dq_sb) /* Has somebody invalidated entry under us? */ - BUG(); + BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? 
*/ #endif return dquot; -- cgit v1.2.3 From f6298aab2ebaa61de39931595f125bc1968905cc Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Sun, 2 Apr 2006 13:37:19 +0200 Subject: BUG_ON() Conversion in fs/fcntl.c this changes if() BUG(); constructs to BUG_ON() which is cleaner and can better optimized away Signed-off-by: Eric Sesterhenn Signed-off-by: Adrian Bunk --- fs/fcntl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fcntl.c b/fs/fcntl.c index 2a2479196f96..d35cbc6bc112 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -453,8 +453,7 @@ static void send_sigio_to_task(struct task_struct *p, /* Make sure we are called with one of the POLL_* reasons, otherwise we could leak kernel stack into userspace. */ - if ((reason & __SI_MASK) != __SI_POLL) - BUG(); + BUG_ON((reason & __SI_MASK) != __SI_POLL); if (reason - POLL_IN >= NSIGPOLL) si.si_band = ~0L; else -- cgit v1.2.3 From b7542f8c7eb40efb967a558c5be90fe5f939c3ef Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Sun, 2 Apr 2006 13:38:18 +0200 Subject: BUG_ON() Conversion in fs/inode.c this changes if() BUG(); constructs to BUG_ON() which is cleaner and can better optimized away Signed-off-by: Eric Sesterhenn Signed-off-by: Adrian Bunk --- fs/inode.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index 32b7c3375021..3a2446a27d2c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -172,8 +172,7 @@ static struct inode *alloc_inode(struct super_block *sb) void destroy_inode(struct inode *inode) { - if (inode_has_buffers(inode)) - BUG(); + BUG_ON(inode_has_buffers(inode)); security_inode_free(inode); if (inode->i_sb->s_op->destroy_inode) inode->i_sb->s_op->destroy_inode(inode); @@ -249,12 +248,9 @@ void clear_inode(struct inode *inode) might_sleep(); invalidate_inode_buffers(inode); - if (inode->i_data.nrpages) - BUG(); - if (!(inode->i_state & I_FREEING)) - BUG(); - if (inode->i_state & I_CLEAR) - BUG(); + 
BUG_ON(inode->i_data.nrpages); + BUG_ON(!(inode->i_state & I_FREEING)); + BUG_ON(inode->i_state & I_CLEAR); wait_on_inode(inode); DQUOT_DROP(inode); if (inode->i_sb && inode->i_sb->s_op->clear_inode) @@ -1054,8 +1050,7 @@ void generic_delete_inode(struct inode *inode) hlist_del_init(&inode->i_hash); spin_unlock(&inode_lock); wake_up_inode(inode); - if (inode->i_state != I_CLEAR) - BUG(); + BUG_ON(inode->i_state != I_CLEAR); destroy_inode(inode); } -- cgit v1.2.3 From d6735bfcc998863dab89dacca2aed20932b6bc21 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Sun, 2 Apr 2006 13:39:21 +0200 Subject: BUG_ON() Conversion in fs/sysv/ this changes if() BUG(); constructs to BUG_ON() which is cleaner, contains unlikely() and can better optimized away. Signed-off-by: Eric Sesterhenn Signed-off-by: Adrian Bunk --- fs/sysv/dir.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 8c66e9270dd6..d7074341ee87 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -253,8 +253,7 @@ int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page) lock_page(page); err = mapping->a_ops->prepare_write(NULL, page, from, to); - if (err) - BUG(); + BUG_ON(err); de->inode = 0; err = dir_commit_chunk(page, from, to); dir_put_page(page); @@ -353,8 +352,7 @@ void sysv_set_link(struct sysv_dir_entry *de, struct page *page, lock_page(page); err = page->mapping->a_ops->prepare_write(NULL, page, from, to); - if (err) - BUG(); + BUG_ON(err); de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); err = dir_commit_chunk(page, from, to); dir_put_page(page); -- cgit v1.2.3 From 2c2111c2bd821d3e7cf5a6a37a112a620fd947a3 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Sun, 2 Apr 2006 13:40:13 +0200 Subject: BUG_ON() Conversion in fs/udf/ this changes if() BUG(); constructs to BUG_ON() which is cleaner, contains unlikely() and can better optimized away. 
Signed-off-by: Eric Sesterhenn Signed-off-by: Adrian Bunk --- fs/udf/inode.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 81e0e8459af1..2983afd5e7fd 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -312,12 +312,10 @@ static int udf_get_block(struct inode *inode, sector_t block, struct buffer_head err = 0; bh = inode_getblk(inode, block, &err, &phys, &new); - if (bh) - BUG(); + BUG_ON(bh); if (err) goto abort; - if (!phys) - BUG(); + BUG_ON(!phys); if (new) set_buffer_new(bh_result); -- cgit v1.2.3 From 7ec70738097af9dfd25d5f83e9b27a532f462912 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Sun, 2 Apr 2006 13:41:02 +0200 Subject: BUG_ON() Conversion in fs/freevxfs/ this changes if() BUG(); constructs to BUG_ON() which is cleaner, contains unlikely() and can better optimized away. Signed-off-by: Eric Sesterhenn Signed-off-by: Adrian Bunk --- fs/freevxfs/vxfs_olt.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/freevxfs/vxfs_olt.c b/fs/freevxfs/vxfs_olt.c index 76a0708ae978..049500847903 100644 --- a/fs/freevxfs/vxfs_olt.c +++ b/fs/freevxfs/vxfs_olt.c @@ -42,24 +42,21 @@ static inline void vxfs_get_fshead(struct vxfs_oltfshead *fshp, struct vxfs_sb_info *infp) { - if (infp->vsi_fshino) - BUG(); + BUG_ON(infp->vsi_fshino); infp->vsi_fshino = fshp->olt_fsino[0]; } static inline void vxfs_get_ilist(struct vxfs_oltilist *ilistp, struct vxfs_sb_info *infp) { - if (infp->vsi_iext) - BUG(); + BUG_ON(infp->vsi_iext); infp->vsi_iext = ilistp->olt_iext[0]; } static inline u_long vxfs_oblock(struct super_block *sbp, daddr_t block, u_long bsize) { - if (sbp->s_blocksize % bsize) - BUG(); + BUG_ON(sbp->s_blocksize % bsize); return (block * (sbp->s_blocksize / bsize)); } -- cgit v1.2.3 From a580290c3e64bb695158a090d02d1232d9609311 Mon Sep 17 00:00:00 2001 From: Martin Waitz Date: Sun, 2 Apr 2006 13:59:55 +0200 Subject: Documentation: fix minor 
kernel-doc warnings This patch updates the comments to match the actual code. Signed-off-by: Martin Waitz Signed-off-by: Adrian Bunk --- fs/sysfs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index f26880a4785e..6cfdc9a87772 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -50,7 +50,7 @@ static struct sysfs_dirent * sysfs_new_dirent(struct sysfs_dirent * parent_sd, return sd; } -/** +/* * * Return -EEXIST if there is already a sysfs element with the same name for * the same parent. -- cgit v1.2.3 From 29e350944fdc2dfca102500790d8ad6d6ff4f69d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 2 Apr 2006 12:46:35 -0700 Subject: splice: add SPLICE_F_NONBLOCK flag It doesn't make the splice itself necessarily nonblocking (because the actual file descriptors that are spliced from/to may block unless they have the O_NONBLOCK flag set), but it makes the splice pipe operations nonblocking. Signed-off-by: Linus Torvalds --- fs/splice.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 7c2bbf18d7a7..6081cf7d2d1b 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -106,7 +106,7 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = { static ssize_t move_to_pipe(struct inode *inode, struct page **pages, int nr_pages, unsigned long offset, - unsigned long len) + unsigned long len, unsigned int flags) { struct pipe_inode_info *info; int ret, do_wakeup, i; @@ -159,6 +159,12 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, break; } + if (flags & SPLICE_F_NONBLOCK) { + if (!ret) + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { if (!ret) ret = -ERESTARTSYS; @@ -191,7 +197,7 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, } static int __generic_file_splice_read(struct file *in, struct inode *pipe, - size_t len) + size_t len, unsigned int flags) 
{ struct address_space *mapping = in->f_mapping; unsigned int offset, nr_pages; @@ -279,7 +285,7 @@ static int __generic_file_splice_read(struct file *in, struct inode *pipe, * Now we splice them into the pipe.. */ splice_them: - return move_to_pipe(pipe, pages, i, offset, len); + return move_to_pipe(pipe, pages, i, offset, len, flags); } ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, @@ -291,7 +297,7 @@ ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, ret = 0; spliced = 0; while (len) { - ret = __generic_file_splice_read(in, pipe, len); + ret = __generic_file_splice_read(in, pipe, len, flags); if (ret <= 0) break; @@ -299,6 +305,11 @@ ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, in->f_pos += ret; len -= ret; spliced += ret; + + if (!(flags & SPLICE_F_NONBLOCK)) + continue; + ret = -EAGAIN; + break; } if (spliced) @@ -527,6 +538,12 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, break; } + if (flags & SPLICE_F_NONBLOCK) { + if (!ret) + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { if (!ret) ret = -ERESTARTSYS; -- cgit v1.2.3 From 6e0dd741a89be35defa05bd79f4211c5a2762825 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 31 Mar 2006 15:37:06 -0800 Subject: [PATCH] sysfs: zero terminate sysfs write buffers No one should be writing a PAGE_SIZE worth of data to a normal sysfs file, so properly terminate the buffer. Thanks to Al Viro for pointing out my stupidity here.
Signed-off-by: Greg Kroah-Hartman Signed-off-by: Linus Torvalds --- fs/sysfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 830f76fa098c..f1cb1ddde511 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -183,7 +183,7 @@ fill_write_buffer(struct sysfs_buffer * buffer, const char __user * buf, size_t return -ENOMEM; if (count >= PAGE_SIZE) - count = PAGE_SIZE; + count = PAGE_SIZE - 1; error = copy_from_user(buffer->page,buf,count); buffer->needs_read_fill = 1; return error ? -EFAULT : count; -- cgit v1.2.3 From 53cd9ae886273d6c2b8ba4aa63d6cd6b1217b57f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 2 Apr 2006 23:04:21 +0200 Subject: [PATCH] splice: fix shadow[] filling logic Clear the entire range, and don't increment pidx or we keep filling the same position again and again. Thanks to KAMEZAWA Hiroyuki. Signed-off-by: Jens Axboe --- fs/splice.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 6081cf7d2d1b..a555d0a83fe9 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -237,9 +237,9 @@ static int __generic_file_splice_read(struct file *in, struct inode *pipe, * fill shadow[] with pages at the right locations, so we only * have to fill holes */ - memset(shadow, 0, i * sizeof(struct page *)); - for (j = 0, pidx = index; j < i; pidx++, j++) - shadow[pages[j]->index - pidx] = pages[j]; + memset(shadow, 0, nr_pages * sizeof(struct page *)); + for (j = 0; j < i; j++) + shadow[pages[j]->index - index] = pages[j]; /* * now fill in the holes -- cgit v1.2.3 From 4f6f0bd2ffa4e31c3524f5e65c84a29b6ab73307 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 2 Apr 2006 23:04:46 +0200 Subject: [PATCH] splice: improve writeback and clean up page stealing By cleaning up the writeback logic (killing write_one_page() and the manual set_page_dirty()), we can get rid of ->stolen inside the pipe_buffer and just keep it local in 
pipe_to_file(). This also adds dirty page balancing logic and O_SYNC handling. Signed-off-by: Jens Axboe --- fs/pipe.c | 1 - fs/splice.c | 64 +++++++++++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 48 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 109a102c150d..5093408746b8 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -124,7 +124,6 @@ static void anon_pipe_buf_unmap(struct pipe_inode_info *info, struct pipe_buffer static int anon_pipe_buf_steal(struct pipe_inode_info *info, struct pipe_buffer *buf) { - buf->stolen = 1; return 0; } diff --git a/fs/splice.c b/fs/splice.c index a555d0a83fe9..07f4d863c2d4 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -22,7 +22,10 @@ #include #include #include +#include +#include #include +#include /* * Passed to the actors @@ -38,11 +41,15 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, struct pipe_buffer *buf) { struct page *page = buf->page; + struct address_space *mapping = page_mapping(page); WARN_ON(!PageLocked(page)); WARN_ON(!PageUptodate(page)); - if (!remove_mapping(page_mapping(page), page)) + if (PagePrivate(page)) + try_to_release_page(page, mapping_gfp_mask(mapping)); + + if (!remove_mapping(mapping, page)) return 1; if (PageLRU(page)) { @@ -55,7 +62,6 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, spin_unlock_irq(&zone->lru_lock); } - buf->stolen = 1; return 0; } @@ -64,7 +70,6 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *info, { page_cache_release(buf->page); buf->page = NULL; - buf->stolen = 0; } static void *page_cache_pipe_buf_map(struct file *file, @@ -91,8 +96,7 @@ static void *page_cache_pipe_buf_map(struct file *file, static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info, struct pipe_buffer *buf) { - if (!buf->stolen) - unlock_page(buf->page); + unlock_page(buf->page); kunmap(buf->page); } @@ -319,7 +323,8 @@ ssize_t generic_file_splice_read(struct file *in, struct inode 
*pipe, } /* - * Send 'len' bytes to socket from 'file' at position 'pos' using sendpage(). + * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' + * using sendpage(). */ static int pipe_to_sendpage(struct pipe_inode_info *info, struct pipe_buffer *buf, struct splice_desc *sd) @@ -379,7 +384,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, struct page *page; pgoff_t index; char *src; - int ret; + int ret, stolen; /* * after this, page will be locked and unmapped @@ -390,6 +395,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, index = sd->pos >> PAGE_CACHE_SHIFT; offset = sd->pos & ~PAGE_CACHE_MASK; + stolen = 0; /* * reuse buf page, if SPLICE_F_MOVE is set @@ -399,6 +405,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, goto find_page; page = buf->page; + stolen = 1; if (add_to_page_cache_lru(page, mapping, index, mapping_gfp_mask(mapping))) goto find_page; @@ -443,10 +450,13 @@ find_page: } ret = mapping->a_ops->prepare_write(file, page, 0, sd->len); - if (ret) + if (ret == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto find_page; + } else if (ret) goto out; - if (!buf->stolen) { + if (!stolen) { char *dst = kmap_atomic(page, KM_USER0); memcpy(dst + offset, src + buf->offset, sd->len); @@ -455,16 +465,18 @@ find_page: } ret = mapping->a_ops->commit_write(file, page, 0, sd->len); - if (ret < 0) + if (ret == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto find_page; + } else if (ret) goto out; - set_page_dirty(page); - ret = write_one_page(page, 0); + balance_dirty_pages_ratelimited(mapping); out: - if (ret < 0) - unlock_page(page); - if (!buf->stolen) + if (!stolen) { page_cache_release(page); + unlock_page(page); + } buf->ops->unmap(info, buf); return ret; } @@ -576,7 +588,27 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, ssize_t generic_file_splice_write(struct inode *inode, struct file *out, size_t 
len, unsigned int flags) { - return move_from_pipe(inode, out, len, flags, pipe_to_file); + struct address_space *mapping = out->f_mapping; + ssize_t ret = move_from_pipe(inode, out, len, flags, pipe_to_file); + + /* + * if file or inode is SYNC and we actually wrote some data, sync it + */ + if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(mapping->host)) + && ret > 0) { + struct inode *inode = mapping->host; + int err; + + mutex_lock(&inode->i_mutex); + err = generic_osync_inode(mapping->host, mapping, + OSYNC_METADATA|OSYNC_DATA); + mutex_unlock(&inode->i_mutex); + + if (err) + ret = err; + } + + return ret; } ssize_t generic_splice_sendpage(struct inode *inode, struct file *out, -- cgit v1.2.3 From 83f9135bddffded9f1716519b6c147bcf046c87e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 2 Apr 2006 23:05:09 +0200 Subject: [PATCH] splice: add comments documenting more of the code Hopefully this will make Andrew a little more happy. Signed-off-by: Jens Axboe --- fs/splice.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 66 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 07f4d863c2d4..34591924c783 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -37,6 +37,12 @@ struct splice_desc { loff_t pos; /* file position */ }; +/* + * Attempt to steal a page from a pipe buffer. This should perhaps go into + * a vm helper function, it's already simplified quite a bit by the + * addition of remove_mapping(). If success is returned, the caller may + * attempt to reuse this page for another destination. + */ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, struct pipe_buffer *buf) { @@ -108,6 +114,10 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = { .steal = page_cache_pipe_buf_steal, }; +/* + * Pipe output worker. This sets up our pipe format with the page cache + * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). 
+ */ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, int nr_pages, unsigned long offset, unsigned long len, unsigned int flags) @@ -292,6 +302,16 @@ splice_them: return move_to_pipe(pipe, pages, i, offset, len, flags); } +/** + * generic_file_splice_read - splice data from file to a pipe + * @in: file to splice from + * @pipe: pipe to splice to + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Will read pages from given file and fill them into a pipe. + * + */ ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, size_t len, unsigned int flags) { @@ -370,10 +390,12 @@ static int pipe_to_sendpage(struct pipe_inode_info *info, * - Destination page does not exist, we can add the pipe page to * the page cache and avoid the copy. * - * For now we just do the slower thing and always copy pages over, it's - * easier than migrating pages from the pipe to the target file. For the - * case of doing file | file splicing, the migrate approach had some LRU - * nastiness... + * If asked to move pages to the output file (SPLICE_F_MOVE is set in + * sd->flags), we attempt to migrate pages from the pipe to the output + * file address space page cache. This is possible if no one else has + * the pipe page referenced outside of the pipe and page cache. If + * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create + * a new page in the output file page cache and fill/dirty that. */ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, struct splice_desc *sd) @@ -401,6 +423,10 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, * reuse buf page, if SPLICE_F_MOVE is set */ if (sd->flags & SPLICE_F_MOVE) { + /* + * If steal succeeds, buf->page is now pruned from the vm + * side (LRU and page cache) and we can reuse it. 
+ */ if (buf->ops->steal(info, buf)) goto find_page; @@ -484,6 +510,11 @@ out: typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, struct splice_desc *); +/* + * Pipe input worker. Most of this logic works like a regular pipe, the + * key here is the 'actor' worker passed in that actually moves the data + * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. + */ static ssize_t move_from_pipe(struct inode *inode, struct file *out, size_t len, unsigned int flags, splice_actor *actor) @@ -585,6 +616,17 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, } +/** + * generic_file_splice_write - splice data from a pipe to a file + * @inode: pipe inode + * @out: file to write to + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Will either move or copy pages (determined by @flags options) from + * the given pipe inode to the given file. + * + */ ssize_t generic_file_splice_write(struct inode *inode, struct file *out, size_t len, unsigned int flags) { @@ -611,6 +653,17 @@ ssize_t generic_file_splice_write(struct inode *inode, struct file *out, return ret; } +/** + * generic_splice_sendpage - splice data from a pipe to a socket + * @inode: pipe inode + * @out: socket to write to + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Will send @len bytes from the pipe to a network socket. No data copying + * is involved. + * + */ ssize_t generic_splice_sendpage(struct inode *inode, struct file *out, size_t len, unsigned int flags) { @@ -620,6 +673,9 @@ ssize_t generic_splice_sendpage(struct inode *inode, struct file *out, EXPORT_SYMBOL(generic_file_splice_write); EXPORT_SYMBOL(generic_file_splice_read); +/* + * Attempt to initiate a splice from pipe to file. 
+ */ static long do_splice_from(struct inode *pipe, struct file *out, size_t len, unsigned int flags) { @@ -640,6 +696,9 @@ static long do_splice_from(struct inode *pipe, struct file *out, size_t len, return out->f_op->splice_write(pipe, out, len, flags); } +/* + * Attempt to initiate a splice from a file to a pipe. + */ static long do_splice_to(struct file *in, struct inode *pipe, size_t len, unsigned int flags) { @@ -668,6 +727,9 @@ static long do_splice_to(struct file *in, struct inode *pipe, size_t len, return in->f_op->splice_read(in, pipe, len, flags); } +/* + * Determine where to splice to/from. + */ static long do_splice(struct file *in, struct file *out, size_t len, unsigned int flags) { -- cgit v1.2.3 From b2b39fa478db6db89b7ccafb0649973845b0eb75 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 2 Apr 2006 23:05:41 +0200 Subject: [PATCH] splice: add a SPLICE_F_MORE flag This lets userspace indicate whether more data will be coming in a subsequent splice call. Signed-off-by: Jens Axboe --- fs/splice.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 34591924c783..e84544558946 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -354,6 +354,7 @@ static int pipe_to_sendpage(struct pipe_inode_info *info, unsigned int offset; ssize_t ret; void *ptr; + int more; /* * sub-optimal, but we are limited by the pipe ->map. 
we don't @@ -366,9 +367,9 @@ static int pipe_to_sendpage(struct pipe_inode_info *info, return PTR_ERR(ptr); offset = pos & ~PAGE_CACHE_MASK; + more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; - ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos, - sd->len < sd->total_len); + ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,more); buf->ops->unmap(info, buf); if (ret == sd->len) -- cgit v1.2.3 From 059a8f3734a66cb2c41731083b0cbf836721485b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 2 Apr 2006 23:06:05 +0200 Subject: [PATCH] splice: export generic_splice_sendpage Forgot that one, thanks Jeff. Also move the other EXPORT_SYMBOL to right below the functions. Signed-off-by: Jens Axboe --- fs/splice.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index e84544558946..7c2abd4504d7 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -342,6 +342,8 @@ ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, return ret; } +EXPORT_SYMBOL(generic_file_splice_read); + /* * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' * using sendpage(). @@ -654,6 +656,8 @@ ssize_t generic_file_splice_write(struct inode *inode, struct file *out, return ret; } +EXPORT_SYMBOL(generic_file_splice_write); + /** * generic_splice_sendpage - splice data from a pipe to a socket * @inode: pipe inode @@ -671,8 +675,7 @@ ssize_t generic_splice_sendpage(struct inode *inode, struct file *out, return move_from_pipe(inode, out, len, flags, pipe_to_sendpage); } -EXPORT_SYMBOL(generic_file_splice_write); -EXPORT_SYMBOL(generic_file_splice_read); +EXPORT_SYMBOL(generic_splice_sendpage); /* * Attempt to initiate a splice from pipe to file. 
-- cgit v1.2.3 From ad8d6f0a783ffa2ff9b0cf09910b889715772201 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 2 Apr 2006 23:10:32 +0200 Subject: [PATCH] splice: page stealing needs to wait_on_page_writeback() Thanks to Andrew for the good explanation of why this is so. akpm writes: If a page is under writeback and we remove it from pagecache, it's still going to get written to disk. But the VFS no longer knows about that page, nor that this page is about to modify disk blocks. So there might be scenarios in which those blocks-which-are-about-to-be-written-to get reused for something else. When writeback completes, it'll scribble on those blocks. This won't happen in ext2/ext3-style filesystems in normal mode because the page has buffers and try_to_release_page() will fail. But ext2 in nobh mode doesn't attach buffers at all - it just sticks the page in a BIO, finds some new blocks, points the BIO at those blocks and lets it rip. While that write IO's in flight, someone could truncate the file. Truncate won't block on the writeout because the page isn't in pagecache any more. So truncate will then free the blocks from the file under the page's feet. Then something else can reallocate those blocks. Then write data to them. Now, the original write completes, corrupting the filesystem. Signed-off-by: Jens Axboe --- fs/splice.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 7c2abd4504d7..b5fb2f3e3ac6 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -52,6 +52,15 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, WARN_ON(!PageLocked(page)); WARN_ON(!PageUptodate(page)); + /* + * At least for ext2 with nobh option, we need to wait on writeback + * completing on this page, since we'll remove it from the pagecache. + * Otherwise truncate wont wait on the page, allowing the disk + * blocks to be reused by someone else before we actually wrote our + * data to them. fs corruption ensues. 
+ */ + wait_on_page_writeback(page); + if (PagePrivate(page)) try_to_release_page(page, mapping_gfp_mask(mapping)); -- cgit v1.2.3 From 3e7ee3e7b36fa4e2d88d8fb0a2577be95fc4636d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 2 Apr 2006 23:11:04 +0200 Subject: [PATCH] splice: fix page stealing LRU handling. Originally from Nick Piggin, just adapted to the newer branch. You can't check PageLRU without holding zone->lru_lock. The page release code can get away with it only because the page refcount is 0 at that point. Also, you can't reliably remove pages from the LRU unless the refcount is 0. Ever. Signed-off-by: Nick Piggin Signed-off-by: Jens Axboe --- fs/pipe.c | 3 +++ fs/splice.c | 30 +++++++++++------------------- 2 files changed, 14 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 5093408746b8..795df987cd38 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -95,6 +95,8 @@ static void anon_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buff { struct page *page = buf->page; + buf->flags &= ~PIPE_BUF_FLAG_STOLEN; + /* * If nobody else uses this page, and we don't already have a * temporary page, let's keep track of it as a one-deep @@ -124,6 +126,7 @@ static void anon_pipe_buf_unmap(struct pipe_inode_info *info, struct pipe_buffer static int anon_pipe_buf_steal(struct pipe_inode_info *info, struct pipe_buffer *buf) { + buf->flags |= PIPE_BUF_FLAG_STOLEN; return 0; } diff --git a/fs/splice.c b/fs/splice.c index b5fb2f3e3ac6..bfa42a277bb8 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -67,16 +67,7 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, if (!remove_mapping(mapping, page)) return 1; - if (PageLRU(page)) { - struct zone *zone = page_zone(page); - - spin_lock_irq(&zone->lru_lock); - BUG_ON(!PageLRU(page)); - __ClearPageLRU(page); - del_page_from_lru(zone, page); - spin_unlock_irq(&zone->lru_lock); - } - + buf->flags |= PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU; return 0; } @@ -85,6 +76,7 @@ 
static void page_cache_pipe_buf_release(struct pipe_inode_info *info, { page_cache_release(buf->page); buf->page = NULL; + buf->flags &= ~(PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU); } static void *page_cache_pipe_buf_map(struct file *file, @@ -414,11 +406,12 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, { struct file *file = sd->file; struct address_space *mapping = file->f_mapping; + gfp_t gfp_mask = mapping_gfp_mask(mapping); unsigned int offset; struct page *page; pgoff_t index; char *src; - int ret, stolen; + int ret; /* * after this, page will be locked and unmapped @@ -429,7 +422,6 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, index = sd->pos >> PAGE_CACHE_SHIFT; offset = sd->pos & ~PAGE_CACHE_MASK; - stolen = 0; /* * reuse buf page, if SPLICE_F_MOVE is set @@ -443,15 +435,15 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, goto find_page; page = buf->page; - stolen = 1; - if (add_to_page_cache_lru(page, mapping, index, - mapping_gfp_mask(mapping))) + if (add_to_page_cache(page, mapping, index, gfp_mask)) goto find_page; + + if (!(buf->flags & PIPE_BUF_FLAG_LRU)) + lru_cache_add(page); } else { find_page: ret = -ENOMEM; - page = find_or_create_page(mapping, index, - mapping_gfp_mask(mapping)); + page = find_or_create_page(mapping, index, gfp_mask); if (!page) goto out; @@ -494,7 +486,7 @@ find_page: } else if (ret) goto out; - if (!stolen) { + if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { char *dst = kmap_atomic(page, KM_USER0); memcpy(dst + offset, src + buf->offset, sd->len); @@ -511,7 +503,7 @@ find_page: balance_dirty_pages_ratelimited(mapping); out: - if (!stolen) { + if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { page_cache_release(page); unlock_page(page); } -- cgit v1.2.3 From ab0920ce7ebb6d60063c793f227ae198a492251b Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Thu, 16 Mar 2006 15:06:37 -0800 Subject: ocfs2: multi node truncate fix Fix 
ocfs2_truncate_file() so that it forces a truncate_inode_pages() on all interested nodes in all cases of a truncate(), not just allocation change. Signed-off-by: Mark Fasheh --- fs/ocfs2/file.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 34e903a6a46b..581eb451a41a 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -260,6 +260,17 @@ static int ocfs2_truncate_file(struct inode *inode, if (new_i_size == le64_to_cpu(fe->i_size)) goto bail; + /* This forces other nodes to sync and drop their pages. Do + * this even if we have a truncate without allocation change - + * ocfs2 cluster sizes can be much greater than page size, so + * we have to truncate them anyway. */ + status = ocfs2_data_lock(inode, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + ocfs2_data_unlock(inode, 1); + if (le32_to_cpu(fe->i_clusters) == ocfs2_clusters_for_bytes(osb->sb, new_i_size)) { mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n", @@ -272,14 +283,6 @@ static int ocfs2_truncate_file(struct inode *inode, goto bail; } - /* This forces other nodes to sync and drop their pages */ - status = ocfs2_data_lock(inode, 1); - if (status < 0) { - mlog_errno(status); - goto bail; - } - ocfs2_data_unlock(inode, 1); - /* alright, we're going to need to do a full blown alloc size * change. Orphan the inode so that recovery can complete the * truncate if necessary. This does the task of marking -- cgit v1.2.3 From 1f7bc828e30fe3e23ea0968b9595ad20e2785978 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 29 Mar 2006 10:33:35 -0800 Subject: ocfs2: remove an overly aggressive BUG() in dlmfs Don't BUG() user_dlm_unblock_lock() on the absence of the USER_LOCK_BLOCKED flag - this turns out to be a valid case. Make some of the related BUG() statements print more useful information. 
Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/userdlm.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c index c3764f4744ee..bac4615965f6 100644 --- a/fs/ocfs2/dlm/userdlm.c +++ b/fs/ocfs2/dlm/userdlm.c @@ -268,13 +268,26 @@ static void user_dlm_unblock_lock(void *opaque) spin_lock(&lockres->l_lock); - BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED)); - BUG_ON(!(lockres->l_flags & USER_LOCK_QUEUED)); + mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED), + "Lockres %s, flags 0x%x\n", + lockres->l_name, lockres->l_flags); - /* notice that we don't clear USER_LOCK_BLOCKED here. That's - * for user_ast to do. */ + /* notice that we don't clear USER_LOCK_BLOCKED here. If it's + * set, we want user_ast clear it. */ lockres->l_flags &= ~USER_LOCK_QUEUED; + /* It's valid to get here and no longer be blocked - if we get + * several basts in a row, we might be queued by the first + * one, the unblock thread might run and clear the queued + * flag, and finally we might get another bast which re-queues + * us before our ast for the downconvert is called. 
*/ + if (!(lockres->l_flags & USER_LOCK_BLOCKED)) { + mlog(0, "Lockres %s, flags 0x%x: queued but not blocking\n", + lockres->l_name, lockres->l_flags); + spin_unlock(&lockres->l_lock); + goto drop_ref; + } + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { mlog(0, "lock is in teardown so we do nothing\n"); spin_unlock(&lockres->l_lock); -- cgit v1.2.3 From cc6eb725955efb026007e1d7da8fe5383981afd2 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 29 Mar 2006 10:34:21 -0800 Subject: ocfs2: catch an invalid ast case in dlmfs Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/userdlm.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c index bac4615965f6..d0f1027a3853 100644 --- a/fs/ocfs2/dlm/userdlm.c +++ b/fs/ocfs2/dlm/userdlm.c @@ -139,6 +139,10 @@ static void user_ast(void *opaque) return; } + mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE, + "Lockres %s, requested ivmode. flags 0x%x\n", + lockres->l_name, lockres->l_flags); + /* we're downconverting. */ if (lockres->l_requested < lockres->l_level) { if (lockres->l_requested <= -- cgit v1.2.3 From f43e6918c0e3906fd4483316f6a1a07bba615908 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 29 Mar 2006 18:24:12 -0800 Subject: ocfs2: Handle the DLM_CANCELGRANT case in user_unlock_ast() Remove the code which attempted to catch it via dlmunlock() return status - this never happens there. 
Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/userdlm.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c index d0f1027a3853..808ec0527c75 100644 --- a/fs/ocfs2/dlm/userdlm.c +++ b/fs/ocfs2/dlm/userdlm.c @@ -233,23 +233,38 @@ static void user_unlock_ast(void *opaque, enum dlm_status status) mlog(0, "UNLOCK AST called on lock %s\n", lockres->l_name); - if (status != DLM_NORMAL) + if (status != DLM_NORMAL && status != DLM_CANCELGRANT) mlog(ML_ERROR, "Dlm returns status %d\n", status); spin_lock(&lockres->l_lock); if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) lockres->l_level = LKM_IVMODE; - else { + else if (status == DLM_CANCELGRANT) { + mlog(0, "Lock %s, cancel fails, flags 0x%x\n", + lockres->l_name, lockres->l_flags); + /* We tried to cancel a convert request, but it was + * already granted. Don't clear the busy flag - the + * ast should've done this already. */ + BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); + lockres->l_flags &= ~USER_LOCK_IN_CANCEL; + goto out_noclear; + } else { + BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); + /* Cancel succeeded, we want to re-queue */ + mlog(0, "Lock %s, cancel succeeds, flags 0x%x\n", + lockres->l_name, lockres->l_flags); lockres->l_requested = LKM_IVMODE; /* cancel an * upconvert * request. */ lockres->l_flags &= ~USER_LOCK_IN_CANCEL; /* we want the unblock thread to look at it again * now. 
*/ - __user_dlm_queue_lockres(lockres); + if (lockres->l_flags & USER_LOCK_BLOCKED) + __user_dlm_queue_lockres(lockres); } lockres->l_flags &= ~USER_LOCK_BUSY; +out_noclear: spin_unlock(&lockres->l_lock); wake_up(&lockres->l_event); @@ -299,7 +314,9 @@ static void user_dlm_unblock_lock(void *opaque) } if (lockres->l_flags & USER_LOCK_BUSY) { - mlog(0, "BUSY flag detected...\n"); + mlog(0, "Cancel lock %s, flags 0x%x\n", + lockres->l_name, lockres->l_flags); + if (lockres->l_flags & USER_LOCK_IN_CANCEL) { spin_unlock(&lockres->l_lock); goto drop_ref; @@ -313,14 +330,7 @@ static void user_dlm_unblock_lock(void *opaque) LKM_CANCEL, user_unlock_ast, lockres); - if (status == DLM_CANCELGRANT) { - /* If we got this, then the ast was fired - * before we could cancel. We cleanup our - * state, and restart the function. */ - spin_lock(&lockres->l_lock); - lockres->l_flags &= ~USER_LOCK_IN_CANCEL; - spin_unlock(&lockres->l_lock); - } else if (status != DLM_NORMAL) + if (status != DLM_NORMAL) user_log_dlm_error("dlmunlock", status, lockres); goto drop_ref; } -- cgit v1.2.3 From 2cd9888590c52ac7592e3607d0a3174ccd57ef86 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Wed, 29 Mar 2006 16:49:13 -0800 Subject: ocfs2: test and set teardown flag early in user_dlm_destroy_lock() Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/userdlm.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c index 808ec0527c75..74ca4e5f9765 100644 --- a/fs/ocfs2/dlm/userdlm.c +++ b/fs/ocfs2/dlm/userdlm.c @@ -237,9 +237,13 @@ static void user_unlock_ast(void *opaque, enum dlm_status status) mlog(ML_ERROR, "Dlm returns status %d\n", status); spin_lock(&lockres->l_lock); - if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) + /* The teardown flag gets set early during the unlock process, + * so test the cancel flag to make sure that this ast isn't + * for a concurrent cancel. 
*/ + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN + && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) { lockres->l_level = LKM_IVMODE; - else if (status == DLM_CANCELGRANT) { + } else if (status == DLM_CANCELGRANT) { mlog(0, "Lock %s, cancel fails, flags 0x%x\n", lockres->l_name, lockres->l_flags); /* We tried to cancel a convert request, but it was @@ -608,6 +612,14 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) mlog(0, "asked to destroy %s\n", lockres->l_name); spin_lock(&lockres->l_lock); + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { + mlog(0, "Lock is already torn down\n"); + spin_unlock(&lockres->l_lock); + return 0; + } + + lockres->l_flags |= USER_LOCK_IN_TEARDOWN; + while (lockres->l_flags & USER_LOCK_BUSY) { spin_unlock(&lockres->l_lock); @@ -633,7 +645,6 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) lockres->l_flags &= ~USER_LOCK_ATTACHED; lockres->l_flags |= USER_LOCK_BUSY; - lockres->l_flags |= USER_LOCK_IN_TEARDOWN; spin_unlock(&lockres->l_lock); mlog(0, "unlocking lockres %s\n", lockres->l_name); -- cgit v1.2.3 From a9e2ae39170d01937725e1fff2e606baaa71346c Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Fri, 24 Mar 2006 14:20:17 -0800 Subject: ocfs2: Better I/O error handling in heartbeat Propagate errors received in o2hb_bio_end_io() back to the heartbeat thread so it can skip re-arming the timer. 
Signed-off-by: Mark Fasheh --- fs/ocfs2/cluster/heartbeat.c | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index bff0f0d06867..21f38accd039 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -153,6 +153,7 @@ struct o2hb_region { struct o2hb_bio_wait_ctxt { atomic_t wc_num_reqs; struct completion wc_io_complete; + int wc_error; }; static void o2hb_write_timeout(void *arg) @@ -186,6 +187,7 @@ static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc, { atomic_set(&wc->wc_num_reqs, num_ios); init_completion(&wc->wc_io_complete); + wc->wc_error = 0; } /* Used in error paths too */ @@ -218,8 +220,10 @@ static int o2hb_bio_end_io(struct bio *bio, { struct o2hb_bio_wait_ctxt *wc = bio->bi_private; - if (error) + if (error) { mlog(ML_ERROR, "IO Error %d\n", error); + wc->wc_error = error; + } if (bio->bi_size) return 1; @@ -390,6 +394,8 @@ static int o2hb_read_slots(struct o2hb_region *reg, bail_and_wait: o2hb_wait_on_io(reg, &wc); + if (wc.wc_error && !status) + status = wc.wc_error; if (bios) { for(i = 0; i < num_bios; i++) @@ -790,20 +796,24 @@ static int o2hb_highest_node(unsigned long *nodes, return highest; } -static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) +static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) { int i, ret, highest_node, change = 0; unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; struct bio *write_bio; struct o2hb_bio_wait_ctxt write_wc; - if (o2nm_configured_node_map(configured_nodes, sizeof(configured_nodes))) - return; + ret = o2nm_configured_node_map(configured_nodes, + sizeof(configured_nodes)); + if (ret) { + mlog_errno(ret); + return ret; + } highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); if (highest_node >= O2NM_MAX_NODES) { mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); - return; + return 
-EINVAL; } /* No sense in reading the slots of nodes that don't exist @@ -813,7 +823,7 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) ret = o2hb_read_slots(reg, highest_node + 1); if (ret < 0) { mlog_errno(ret); - return; + return ret; } /* With an up to date view of the slots, we can check that no @@ -831,7 +841,7 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) ret = o2hb_issue_node_write(reg, &write_bio, &write_wc); if (ret < 0) { mlog_errno(ret); - return; + return ret; } i = -1; @@ -847,6 +857,15 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) */ o2hb_wait_on_io(reg, &write_wc); bio_put(write_bio); + if (write_wc.wc_error) { + /* Do not re-arm the write timeout on I/O error - we + * can't be sure that the new block ever made it to + * disk */ + mlog(ML_ERROR, "Write error %d on device \"%s\"\n", + write_wc.wc_error, reg->hr_dev_name); + return write_wc.wc_error; + } + o2hb_arm_write_timeout(reg); /* let the person who launched us know when things are steady */ @@ -854,6 +873,8 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) if (atomic_dec_and_test(&reg->hr_steady_iterations)) wake_up(&o2hb_steady_queue); } + + return 0; } /* Subtract b from a, storing the result in a. a *must* have a larger @@ -913,7 +934,10 @@ static int o2hb_thread(void *data) * likely to time itself out. */ do_gettimeofday(&before_hb); - o2hb_do_disk_heartbeat(reg); + i = 0; + do { + ret = o2hb_do_disk_heartbeat(reg); + } while (ret && ++i < 2); do_gettimeofday(&after_hb); elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); -- cgit v1.2.3 From c7f21e4f5a3d4e378e4d453b2be209dcfd1bb964 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Apr 2006 09:01:01 +0200 Subject: [PATCH] splice: mark the io page as accessed We should do that, since we do the LRU manipulation ourselves now. Suggested by Nick Piggin. 
Signed-off-by: Jens Axboe --- fs/splice.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index bfa42a277bb8..b450acdff397 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -501,6 +501,7 @@ find_page: } else if (ret) goto out; + mark_page_accessed(page); balance_dirty_pages_ratelimited(mapping); out: if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { -- cgit v1.2.3 From 9aefe431f5a000884db7ae74ac208de814fe5913 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 10 Apr 2006 09:02:40 +0200 Subject: [PATCH] splice: potential !page dereference We can get to out: with a NULL page, which we probably don't want to be calling page_cache_release() on. Signed-off-by: Dave Jones Signed-off-by: Jens Axboe --- fs/splice.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index b450acdff397..26f5f7ecee5c 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -445,7 +445,7 @@ find_page: ret = -ENOMEM; page = find_or_create_page(mapping, index, gfp_mask); if (!page) - goto out; + goto out_nomem; /* * If the page is uptodate, it is also locked. If it isn't @@ -508,6 +508,7 @@ out: page_cache_release(page); unlock_page(page); } +out_nomem: buf->ops->unmap(info, buf); return ret; } -- cgit v1.2.3 From c0bd1f650bd06a43435808d44f1e9520ea806206 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Apr 2006 09:03:32 +0200 Subject: [PATCH] splice: only call wake_up_interruptible() when we really have to __wake_up_common() is pretty heavy in the kernel profiles, this brings it down to a more acceptable level. 
Signed-off-by: Jens Axboe --- fs/splice.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 26f5f7ecee5c..9f796b1034d1 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -187,7 +187,9 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, } if (do_wakeup) { - wake_up_interruptible_sync(PIPE_WAIT(*inode)); + smp_mb(); + if (waitqueue_active(PIPE_WAIT(*inode))) + wake_up_interruptible_sync(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); do_wakeup = 0; @@ -201,7 +203,9 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, mutex_unlock(PIPE_MUTEX(*inode)); if (do_wakeup) { - wake_up_interruptible(PIPE_WAIT(*inode)); + smp_mb(); + if (waitqueue_active(PIPE_WAIT(*inode))) + wake_up_interruptible(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); } @@ -600,7 +604,9 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, } if (do_wakeup) { - wake_up_interruptible_sync(PIPE_WAIT(*inode)); + smp_mb(); + if (waitqueue_active(PIPE_WAIT(*inode))) + wake_up_interruptible_sync(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_WRITERS(*inode),SIGIO,POLL_OUT); do_wakeup = 0; } @@ -611,7 +617,9 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, mutex_unlock(PIPE_MUTEX(*inode)); if (do_wakeup) { - wake_up_interruptible(PIPE_WAIT(*inode)); + smp_mb(); + if (waitqueue_active(PIPE_WAIT(*inode))) + wake_up_interruptible(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); } -- cgit v1.2.3 From 16c523ddabcce5d3d817f4a2491d628f84dfaaa1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Apr 2006 09:03:58 +0200 Subject: [PATCH] splice: cleanup __generic_file_splice_read() The whole shadow/pages logic got overly complex, and this simpler approach is actually faster in testing. 
Signed-off-by: Jens Axboe --- fs/splice.c | 59 ++++++++++------------------------------------------------- 1 file changed, 10 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 9f796b1034d1..8b5efcc906dc 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -220,10 +220,10 @@ static int __generic_file_splice_read(struct file *in, struct inode *pipe, { struct address_space *mapping = in->f_mapping; unsigned int offset, nr_pages; - struct page *pages[PIPE_BUFFERS], *shadow[PIPE_BUFFERS]; + struct page *pages[PIPE_BUFFERS]; struct page *page; - pgoff_t index, pidx; - int i, j; + pgoff_t index; + int i; index = in->f_pos >> PAGE_CACHE_SHIFT; offset = in->f_pos & ~PAGE_CACHE_MASK; @@ -237,42 +237,14 @@ static int __generic_file_splice_read(struct file *in, struct inode *pipe, */ do_page_cache_readahead(mapping, in, index, nr_pages); - /* - * Get as many pages from the page cache as possible.. - * Start IO on the page cache entries we create (we - * can assume that any pre-existing ones we find have - * already had IO started on them). 
- */ - i = find_get_pages(mapping, index, nr_pages, pages); - - /* - * common case - we found all pages and they are contiguous, - * kick them off - */ - if (i && (pages[i - 1]->index == index + i - 1)) - goto splice_them; - - /* - * fill shadow[] with pages at the right locations, so we only - * have to fill holes - */ - memset(shadow, 0, nr_pages * sizeof(struct page *)); - for (j = 0; j < i; j++) - shadow[pages[j]->index - index] = pages[j]; - /* * now fill in the holes */ - for (i = 0, pidx = index; i < nr_pages; pidx++, i++) { - int error; - - if (shadow[i]) - continue; - + for (i = 0; i < nr_pages; i++, index++) { /* * no page there, look one up / create it */ - page = find_or_create_page(mapping, pidx, + page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping)); if (!page) break; @@ -280,31 +252,20 @@ static int __generic_file_splice_read(struct file *in, struct inode *pipe, if (PageUptodate(page)) unlock_page(page); else { - error = mapping->a_ops->readpage(in, page); + int error = mapping->a_ops->readpage(in, page); if (unlikely(error)) { page_cache_release(page); break; } } - shadow[i] = page; - } - - if (!i) { - for (i = 0; i < nr_pages; i++) { - if (shadow[i]) - page_cache_release(shadow[i]); - } - return 0; + pages[i] = page; } - memcpy(pages, shadow, i * sizeof(struct page *)); + if (i) + return move_to_pipe(pipe, pages, i, offset, len, flags); - /* - * Now we splice them into the pipe.. - */ -splice_them: - return move_to_pipe(pipe, pages, i, offset, len, flags); + return 0; } /** -- cgit v1.2.3 From 49d0b21be21efc07526d637e0ae935019667e532 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Apr 2006 09:04:41 +0200 Subject: [PATCH] splice: optimize the splice buffer mapping We don't really need to lock down the pages, just make sure they are uptodate. 
Signed-off-by: Jens Axboe --- fs/splice.c | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 8b5efcc906dc..50c43a1e0923 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -84,26 +84,43 @@ static void *page_cache_pipe_buf_map(struct file *file, struct pipe_buffer *buf) { struct page *page = buf->page; - - lock_page(page); + int err; if (!PageUptodate(page)) { - unlock_page(page); - return ERR_PTR(-EIO); - } + lock_page(page); + + /* + * Page got truncated/unhashed. This will cause a 0-byte + * splice, if this is the first page + */ + if (!page->mapping) { + err = -ENODATA; + goto error; + } + + /* + * uh oh, read-error from disk + */ + if (!PageUptodate(page)) { + err = -EIO; + goto error; + } - if (!page->mapping) { + /* + * page is ok afterall, fall through to mapping + */ unlock_page(page); - return ERR_PTR(-ENODATA); } - return kmap(buf->page); + return kmap(page); +error: + unlock_page(page); + return ERR_PTR(err); } static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info, struct pipe_buffer *buf) { - unlock_page(buf->page); kunmap(buf->page); } @@ -379,7 +396,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, int ret; /* - * after this, page will be locked and unmapped + * make sure the data in this buffer is uptodate */ src = buf->ops->map(file, info, buf); if (IS_ERR(src)) @@ -399,6 +416,9 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, if (buf->ops->steal(info, buf)) goto find_page; + /* + * this will also set the page locked + */ page = buf->page; if (add_to_page_cache(page, mapping, index, gfp_mask)) goto find_page; -- cgit v1.2.3 From 0b749ce3802428007a37870eb51ba3c0bdf90857 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Apr 2006 09:05:04 +0200 Subject: [PATCH] splice: be smarter about calling do_page_cache_readahead() We don't want to call into the read-ahead logic 
unless we are at the start of a page, _or_ we have multiple pages to read. Signed-off-by: Jens Axboe --- fs/splice.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 50c43a1e0923..9bfd6af0cf45 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -250,9 +250,12 @@ static int __generic_file_splice_read(struct file *in, struct inode *pipe, nr_pages = PIPE_BUFFERS; /* - * initiate read-ahead on this page range + * initiate read-ahead on this page range. however, don't call into + * read-ahead if this is a non-zero offset (we are likely doing small + * chunk splice and the page is already there) for a single page. */ - do_page_cache_readahead(mapping, in, index, nr_pages); + if (!offset || nr_pages > 1) + do_page_cache_readahead(mapping, in, index, nr_pages); /* * now fill in the holes -- cgit v1.2.3 From 3a326a2ce88e71d00ac0d133e314a3342a7709f8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 10 Apr 2006 15:18:35 +0200 Subject: [PATCH] introduce a "kernel-internal pipe object" abstraction separate out the 'internal pipe object' abstraction, and make it usable to splice. This cleans up and fixes several aspects of the internal splice APIs and the pipe code: - pipes: the allocation and freeing of pipe_inode_info is now more symmetric and more streamlined with existing kernel practices. - splice: small micro-optimization: less pointer dereferencing in splice methods Signed-off-by: Ingo Molnar Update XFS for the ->splice_read/->splice_write changes. 
Signed-off-by: Jens Axboe --- fs/fifo.c | 12 +++-- fs/pipe.c | 51 +++++++++--------- fs/splice.c | 122 ++++++++++++++++++++++--------------------- fs/xfs/linux-2.6/xfs_file.c | 8 +-- fs/xfs/linux-2.6/xfs_lrw.c | 4 +- fs/xfs/linux-2.6/xfs_lrw.h | 4 +- fs/xfs/linux-2.6/xfs_vnode.h | 4 +- 7 files changed, 106 insertions(+), 99 deletions(-) (limited to 'fs') diff --git a/fs/fifo.c b/fs/fifo.c index 889f722ee36d..b16e2f597d61 100644 --- a/fs/fifo.c +++ b/fs/fifo.c @@ -15,12 +15,13 @@ #include #include -static void wait_for_partner(struct inode* inode, unsigned int* cnt) +static void wait_for_partner(struct inode* inode, unsigned int *cnt) { int cur = *cnt; - while(cur == *cnt) { - pipe_wait(inode); - if(signal_pending(current)) + + while (cur == *cnt) { + pipe_wait(inode->i_pipe); + if (signal_pending(current)) break; } } @@ -37,7 +38,8 @@ static int fifo_open(struct inode *inode, struct file *filp) mutex_lock(PIPE_MUTEX(*inode)); if (!inode->i_pipe) { ret = -ENOMEM; - if(!pipe_new(inode)) + inode->i_pipe = alloc_pipe_info(inode); + if (!inode->i_pipe) goto err_nocleanup; } filp->f_version = 0; diff --git a/fs/pipe.c b/fs/pipe.c index 795df987cd38..705b48692627 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -36,7 +36,7 @@ */ /* Drop the inode semaphore and wait for a pipe event, atomically */ -void pipe_wait(struct inode * inode) +void pipe_wait(struct pipe_inode_info *pipe) { DEFINE_WAIT(wait); @@ -44,11 +44,13 @@ void pipe_wait(struct inode * inode) * Pipes are system-local resources, so sleeping on them * is considered a noninteractive wait: */ - prepare_to_wait(PIPE_WAIT(*inode), &wait, TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE); - mutex_unlock(PIPE_MUTEX(*inode)); + prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); schedule(); - finish_wait(PIPE_WAIT(*inode), &wait); - mutex_lock(PIPE_MUTEX(*inode)); + finish_wait(&pipe->wait, &wait); + if (pipe->inode) + mutex_lock(&pipe->inode->i_mutex); 
} static int @@ -223,7 +225,7 @@ pipe_readv(struct file *filp, const struct iovec *_iov, wake_up_interruptible_sync(PIPE_WAIT(*inode)); kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); } - pipe_wait(inode); + pipe_wait(inode->i_pipe); } mutex_unlock(PIPE_MUTEX(*inode)); /* Signal writers asynchronously that there is more room. */ @@ -370,7 +372,7 @@ pipe_writev(struct file *filp, const struct iovec *_iov, do_wakeup = 0; } PIPE_WAITING_WRITERS(*inode)++; - pipe_wait(inode); + pipe_wait(inode->i_pipe); PIPE_WAITING_WRITERS(*inode)--; } out: @@ -675,6 +677,20 @@ static struct file_operations rdwr_pipe_fops = { .fasync = pipe_rdwr_fasync, }; +struct pipe_inode_info * alloc_pipe_info(struct inode *inode) +{ + struct pipe_inode_info *info; + + info = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); + if (info) { + init_waitqueue_head(&info->wait); + info->r_counter = info->w_counter = 1; + info->inode = inode; + } + + return info; +} + void free_pipe_info(struct inode *inode) { int i; @@ -691,23 +707,6 @@ void free_pipe_info(struct inode *inode) kfree(info); } -struct inode* pipe_new(struct inode* inode) -{ - struct pipe_inode_info *info; - - info = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); - if (!info) - goto fail_page; - inode->i_pipe = info; - - init_waitqueue_head(PIPE_WAIT(*inode)); - PIPE_RCOUNTER(*inode) = PIPE_WCOUNTER(*inode) = 1; - - return inode; -fail_page: - return NULL; -} - static struct vfsmount *pipe_mnt __read_mostly; static int pipefs_delete_dentry(struct dentry *dentry) { @@ -724,8 +723,10 @@ static struct inode * get_pipe_inode(void) if (!inode) goto fail_inode; - if(!pipe_new(inode)) + inode->i_pipe = alloc_pipe_info(inode); + if (!inode->i_pipe) goto fail_iput; + PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 1; inode->i_fop = &rdwr_pipe_fops; diff --git a/fs/splice.c b/fs/splice.c index 9bfd6af0cf45..ed91a62402e0 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -136,34 +136,33 @@ static struct pipe_buf_operations 
page_cache_pipe_buf_ops = { * Pipe output worker. This sets up our pipe format with the page cache * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). */ -static ssize_t move_to_pipe(struct inode *inode, struct page **pages, +static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, int nr_pages, unsigned long offset, unsigned long len, unsigned int flags) { - struct pipe_inode_info *info; int ret, do_wakeup, i; ret = 0; do_wakeup = 0; i = 0; - mutex_lock(PIPE_MUTEX(*inode)); + if (pipe->inode) + mutex_lock(&pipe->inode->i_mutex); - info = inode->i_pipe; for (;;) { int bufs; - if (!PIPE_READERS(*inode)) { + if (!pipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } - bufs = info->nrbufs; + bufs = pipe->nrbufs; if (bufs < PIPE_BUFFERS) { - int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS - 1); - struct pipe_buffer *buf = info->bufs + newbuf; + int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS - 1); + struct pipe_buffer *buf = pipe->bufs + newbuf; struct page *page = pages[i++]; unsigned long this_len; @@ -175,7 +174,7 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, buf->offset = offset; buf->len = this_len; buf->ops = &page_cache_pipe_buf_ops; - info->nrbufs = ++bufs; + pipe->nrbufs = ++bufs; do_wakeup = 1; ret += this_len; @@ -205,25 +204,25 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, if (do_wakeup) { smp_mb(); - if (waitqueue_active(PIPE_WAIT(*inode))) - wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, - POLL_IN); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); do_wakeup = 0; } - PIPE_WAITING_WRITERS(*inode)++; - pipe_wait(inode); - PIPE_WAITING_WRITERS(*inode)--; + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; } - mutex_unlock(PIPE_MUTEX(*inode)); + if (pipe->inode) + 
mutex_unlock(&pipe->inode->i_mutex); if (do_wakeup) { smp_mb(); - if (waitqueue_active(PIPE_WAIT(*inode))) - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } while (i < nr_pages) @@ -232,8 +231,9 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, return ret; } -static int __generic_file_splice_read(struct file *in, struct inode *pipe, - size_t len, unsigned int flags) +static int +__generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, + size_t len, unsigned int flags) { struct address_space *mapping = in->f_mapping; unsigned int offset, nr_pages; @@ -298,7 +298,7 @@ static int __generic_file_splice_read(struct file *in, struct inode *pipe, * Will read pages from given file and fill them into a pipe. * */ -ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, +ssize_t generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { ssize_t spliced; @@ -306,6 +306,7 @@ ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, ret = 0; spliced = 0; + while (len) { ret = __generic_file_splice_read(in, pipe, len, flags); @@ -509,11 +510,10 @@ typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, * key here is the 'actor' worker passed in that actually moves the data * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. 
*/ -static ssize_t move_from_pipe(struct inode *inode, struct file *out, +static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, size_t len, unsigned int flags, splice_actor *actor) { - struct pipe_inode_info *info; int ret, do_wakeup, err; struct splice_desc sd; @@ -525,22 +525,22 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, sd.file = out; sd.pos = out->f_pos; - mutex_lock(PIPE_MUTEX(*inode)); + if (pipe->inode) + mutex_lock(&pipe->inode->i_mutex); - info = inode->i_pipe; for (;;) { - int bufs = info->nrbufs; + int bufs = pipe->nrbufs; if (bufs) { - int curbuf = info->curbuf; - struct pipe_buffer *buf = info->bufs + curbuf; + int curbuf = pipe->curbuf; + struct pipe_buffer *buf = pipe->bufs + curbuf; struct pipe_buf_operations *ops = buf->ops; sd.len = buf->len; if (sd.len > sd.total_len) sd.len = sd.total_len; - err = actor(info, buf, &sd); + err = actor(pipe, buf, &sd); if (err) { if (!ret && err != -ENODATA) ret = err; @@ -553,10 +553,10 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, buf->len -= sd.len; if (!buf->len) { buf->ops = NULL; - ops->release(info, buf); + ops->release(pipe, buf); curbuf = (curbuf + 1) & (PIPE_BUFFERS - 1); - info->curbuf = curbuf; - info->nrbufs = --bufs; + pipe->curbuf = curbuf; + pipe->nrbufs = --bufs; do_wakeup = 1; } @@ -568,9 +568,9 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, if (bufs) continue; - if (!PIPE_WRITERS(*inode)) + if (!pipe->writers) break; - if (!PIPE_WAITING_WRITERS(*inode)) { + if (!pipe->waiting_writers) { if (ret) break; } @@ -589,22 +589,23 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, if (do_wakeup) { smp_mb(); - if (waitqueue_active(PIPE_WAIT(*inode))) - wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode),SIGIO,POLL_OUT); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, 
POLL_OUT); do_wakeup = 0; } - pipe_wait(inode); + pipe_wait(pipe); } - mutex_unlock(PIPE_MUTEX(*inode)); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); if (do_wakeup) { smp_mb(); - if (waitqueue_active(PIPE_WAIT(*inode))) - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } mutex_lock(&out->f_mapping->host->i_mutex); @@ -616,7 +617,7 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, /** * generic_file_splice_write - splice data from a pipe to a file - * @inode: pipe inode + * @pipe: pipe info * @out: file to write to * @len: number of bytes to splice * @flags: splice modifier flags @@ -625,11 +626,14 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, * the given pipe inode to the given file. * */ -ssize_t generic_file_splice_write(struct inode *inode, struct file *out, - size_t len, unsigned int flags) +ssize_t +generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, + size_t len, unsigned int flags) { struct address_space *mapping = out->f_mapping; - ssize_t ret = move_from_pipe(inode, out, len, flags, pipe_to_file); + ssize_t ret; + + ret = move_from_pipe(pipe, out, len, flags, pipe_to_file); /* * if file or inode is SYNC and we actually wrote some data, sync it @@ -664,10 +668,10 @@ EXPORT_SYMBOL(generic_file_splice_write); * is involved. * */ -ssize_t generic_splice_sendpage(struct inode *inode, struct file *out, +ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, size_t len, unsigned int flags) { - return move_from_pipe(inode, out, len, flags, pipe_to_sendpage); + return move_from_pipe(pipe, out, len, flags, pipe_to_sendpage); } EXPORT_SYMBOL(generic_splice_sendpage); @@ -675,8 +679,8 @@ EXPORT_SYMBOL(generic_splice_sendpage); /* * Attempt to initiate a splice from pipe to file. 
*/ -static long do_splice_from(struct inode *pipe, struct file *out, size_t len, - unsigned int flags) +static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, + size_t len, unsigned int flags) { loff_t pos; int ret; @@ -698,8 +702,8 @@ static long do_splice_from(struct inode *pipe, struct file *out, size_t len, /* * Attempt to initiate a splice from a file to a pipe. */ -static long do_splice_to(struct file *in, struct inode *pipe, size_t len, - unsigned int flags) +static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, + size_t len, unsigned int flags) { loff_t pos, isize, left; int ret; @@ -732,14 +736,14 @@ static long do_splice_to(struct file *in, struct inode *pipe, size_t len, static long do_splice(struct file *in, struct file *out, size_t len, unsigned int flags) { - struct inode *pipe; + struct pipe_inode_info *pipe; - pipe = in->f_dentry->d_inode; - if (pipe->i_pipe) + pipe = in->f_dentry->d_inode->i_pipe; + if (pipe) return do_splice_from(pipe, out, len, flags); - pipe = out->f_dentry->d_inode; - if (pipe->i_pipe) + pipe = out->f_dentry->d_inode->i_pipe; + if (pipe) return do_splice_to(in, pipe, len, flags); return -EINVAL; diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index ae4c4754ed31..269721af02f3 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -252,7 +252,7 @@ xfs_file_sendfile_invis( STATIC ssize_t xfs_file_splice_read( struct file *infilp, - struct inode *pipe, + struct pipe_inode_info *pipe, size_t len, unsigned int flags) { @@ -266,7 +266,7 @@ xfs_file_splice_read( STATIC ssize_t xfs_file_splice_read_invis( struct file *infilp, - struct inode *pipe, + struct pipe_inode_info *pipe, size_t len, unsigned int flags) { @@ -279,7 +279,7 @@ xfs_file_splice_read_invis( STATIC ssize_t xfs_file_splice_write( - struct inode *pipe, + struct pipe_inode_info *pipe, struct file *outfilp, size_t len, unsigned int flags) @@ -293,7 +293,7 @@ xfs_file_splice_write( STATIC 
ssize_t xfs_file_splice_write_invis( - struct inode *pipe, + struct pipe_inode_info *pipe, struct file *outfilp, size_t len, unsigned int flags) diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 90cd314acbaa..74a52937f208 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -338,7 +338,7 @@ ssize_t xfs_splice_read( bhv_desc_t *bdp, struct file *infilp, - struct inode *pipe, + struct pipe_inode_info *pipe, size_t count, int flags, int ioflags, @@ -380,7 +380,7 @@ xfs_splice_read( ssize_t xfs_splice_write( bhv_desc_t *bdp, - struct inode *pipe, + struct pipe_inode_info *pipe, struct file *outfilp, size_t count, int flags, diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h index eaa5659713fb..55c689a86ad2 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.h +++ b/fs/xfs/linux-2.6/xfs_lrw.h @@ -94,9 +94,9 @@ extern ssize_t xfs_sendfile(struct bhv_desc *, struct file *, loff_t *, int, size_t, read_actor_t, void *, struct cred *); extern ssize_t xfs_splice_read(struct bhv_desc *, struct file *, - struct inode *, size_t, int, int, + struct pipe_inode_info *, size_t, int, int, struct cred *); -extern ssize_t xfs_splice_write(struct bhv_desc *, struct inode *, +extern ssize_t xfs_splice_write(struct bhv_desc *, struct pipe_inode_info *, struct file *, size_t, int, int, struct cred *); diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index 6f1c79a28f8b..88b09f186289 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -174,9 +174,9 @@ typedef ssize_t (*vop_sendfile_t)(bhv_desc_t *, struct file *, loff_t *, int, size_t, read_actor_t, void *, struct cred *); typedef ssize_t (*vop_splice_read_t)(bhv_desc_t *, struct file *, - struct inode *, size_t, int, int, + struct pipe_inode_info *, size_t, int, int, struct cred *); -typedef ssize_t (*vop_splice_write_t)(bhv_desc_t *, struct inode *, +typedef ssize_t (*vop_splice_write_t)(bhv_desc_t *, struct pipe_inode_info *, struct 
file *, size_t, int, int, struct cred *); typedef int (*vop_ioctl_t)(bhv_desc_t *, struct inode *, struct file *, -- cgit v1.2.3 From 529565dcb1581c9a1e3f6df1c1763ca3e0f0d512 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 10 Apr 2006 15:18:58 +0200 Subject: [PATCH] splice: add optional input and output offsets add optional input and output offsets to sys_splice(), for seekable file descriptors: asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, int fd_out, loff_t __user *off_out, size_t len, unsigned int flags); semantics are straightforward: f_pos will be updated with the offset provided by user-space, before the splice transfer is about to begin. Providing a NULL offset pointer means the existing f_pos will be used (and updated in situ). Providing an offset for a pipe results in -ESPIPE. Providing an invalid offset pointer results in -EFAULT. Signed-off-by: Ingo Molnar Signed-off-by: Jens Axboe --- fs/splice.c | 54 +++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index ed91a62402e0..a5326127aad5 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -680,7 +680,8 @@ EXPORT_SYMBOL(generic_splice_sendpage); * Attempt to initiate a splice from pipe to file. */ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, - size_t len, unsigned int flags) + loff_t __user *off_out, size_t len, + unsigned int flags) { loff_t pos; int ret; @@ -691,7 +692,11 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, if (!(out->f_mode & FMODE_WRITE)) return -EBADF; + if (off_out && copy_from_user(&out->f_pos, off_out, sizeof(loff_t))) + return -EFAULT; + pos = out->f_pos; + ret = rw_verify_area(WRITE, out, &pos, len); if (unlikely(ret < 0)) return ret; @@ -702,8 +707,9 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, /* * Attempt to initiate a splice from a file to a pipe. 
*/ -static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, - size_t len, unsigned int flags) +static long do_splice_to(struct file *in, loff_t __user *off_in, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { loff_t pos, isize, left; int ret; @@ -714,7 +720,11 @@ static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, if (!(in->f_mode & FMODE_READ)) return -EBADF; + if (off_in && copy_from_user(&in->f_pos, off_in, sizeof(loff_t))) + return -EFAULT; + pos = in->f_pos; + ret = rw_verify_area(READ, in, &pos, len); if (unlikely(ret < 0)) return ret; @@ -733,23 +743,39 @@ static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, /* * Determine where to splice to/from. */ -static long do_splice(struct file *in, struct file *out, size_t len, - unsigned int flags) +static long do_splice(struct file *in, loff_t __user *off_in, + struct file *out, loff_t __user *off_out, + size_t len, unsigned int flags) { struct pipe_inode_info *pipe; + if (off_out && out->f_op->llseek == no_llseek) + return -EINVAL; + if (off_in && in->f_op->llseek == no_llseek) + return -EINVAL; + pipe = in->f_dentry->d_inode->i_pipe; - if (pipe) - return do_splice_from(pipe, out, len, flags); + if (pipe) { + if (off_in) + return -ESPIPE; + + return do_splice_from(pipe, out, off_out, len, flags); + } pipe = out->f_dentry->d_inode->i_pipe; - if (pipe) - return do_splice_to(in, pipe, len, flags); + if (pipe) { + if (off_out) + return -ESPIPE; + + return do_splice_to(in, off_in, pipe, len, flags); + } return -EINVAL; } -asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags) +asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, + int fd_out, loff_t __user *off_out, + size_t len, unsigned int flags) { long error; struct file *in, *out; @@ -759,13 +785,15 @@ asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags) return 0; error = -EBADF; - in = fget_light(fdin, &fput_in); + in = 
fget_light(fd_in, &fput_in); if (in) { if (in->f_mode & FMODE_READ) { - out = fget_light(fdout, &fput_out); + out = fget_light(fd_out, &fput_out); if (out) { if (out->f_mode & FMODE_WRITE) - error = do_splice(in, out, len, flags); + error = do_splice(in, off_in, + out, off_out, + len, flags); fput_light(out, fput_out); } } -- cgit v1.2.3 From cbca692c246874a3cc1b5a9b694add4c39e8bc18 Mon Sep 17 00:00:00 2001 From: Eric Sesterhenn Date: Thu, 23 Mar 2006 00:36:54 +0100 Subject: [PATCH] Bogus NULL pointer check in fs/configfs/dir.c We check the "group" pointer after we dereference it. This check is bogus, as it cannot be NULL coming in. Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/configfs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 8ed9b06a9828..5638c8f9362f 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -504,7 +504,7 @@ static int populate_groups(struct config_group *group) int ret = 0; int i; - if (group && group->default_groups) { + if (group->default_groups) { /* FYI, we're faking mkdir here * I'm not sure we need this semaphore, as we're called * from our parent's mkdir. That holds our parent's -- cgit v1.2.3 From 65714b918415e06c92426f6544b2296dae694590 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sun, 26 Mar 2006 14:25:52 +0200 Subject: [PATCH] CONFIGFS_FS must depend on SYSFS This patch fixes a compile error with CONFIG_SYSFS=n Configfs is creating, as a matter of policy, the /sys/kernel/config mountpoint. This means it requires CONFIG_SYSFS.
Signed-off-by: Adrian Bunk Signed-off-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index e207be68d4ca..97f317413122 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -861,7 +861,7 @@ config RAMFS config CONFIGFS_FS tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)" - depends on EXPERIMENTAL + depends on SYSFS && EXPERIMENTAL help configfs is a ram-based filesystem that provides the converse of sysfs's functionality. Where sysfs is a filesystem-based -- cgit v1.2.3 From de12a7878c11f3b282d640888aa635e0711d0b5e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 10 Apr 2006 17:16:49 -0600 Subject: [PATCH] de_thread: Don't confuse users do_each_thread. Oleg Nesterov spotted two interesting bugs with the current de_thread code. The simplest is a long standing double decrement of __get_cpu_var(process_counts) in __unhash_process. Caused by two processes exiting when only one was created. The other is that since we no longer detach from the thread_group list it is possible for do_each_thread when run under the tasklist_lock to see the same task_struct twice. Once on the task list as a thread_group_leader, and once on the thread list of another thread. The double appearance in do_each_thread can cause a double increment of mm_core_waiters in zap_threads resulting in problems later on in coredump_wait. To remedy those two problems this patch takes the simple approach of changing the old thread group leader into a child thread. The only routine in release_task that cares is __unhash_process, and it can be trivially seen that we handle cleaning up a thread group leader properly. Since de_thread doesn't change the pid of the exiting leader process and instead shares it with the new leader process. I change thread_group_leader to recognize group leadership based on the group_leader field and not based on pids. 
This should also be slightly cheaper than the existing thread_group_leader macro. I performed a quick audit and I couldn't see any user of thread_group_leader that cared about the difference. Signed-off-by: Eric W. Biederman Signed-off-by: Linus Torvalds --- fs/exec.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 0291a68a3626..4d38ad0b70d6 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -723,7 +723,12 @@ static int de_thread(struct task_struct *tsk) current->parent = current->real_parent = leader->real_parent; leader->parent = leader->real_parent = child_reaper; current->group_leader = current; - leader->group_leader = leader; + leader->group_leader = current; + + /* Reduce leader to a thread */ + detach_pid(leader, PIDTYPE_PGID); + detach_pid(leader, PIDTYPE_SID); + list_del_init(&leader->tasks); add_parent(current); add_parent(leader); -- cgit v1.2.3 From e50bd16fe49689bc5fb54fca5ed8b568dfba65c6 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Tue, 11 Apr 2006 15:10:45 +1000 Subject: [XFS] Fix superblock validation regression for the zero imaxpct case. Thanks to kjamieson for noticing.
SGI-PV: 951661 SGI-Modid: xfs-linux-melb:xfs-kern:25675a Signed-off-by: Nathan Scott --- fs/xfs/xfs_mount.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 049fabb7f7e0..c0b1c2906880 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -270,7 +270,7 @@ xfs_mount_validate_sb( (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || - (sbp->sb_imax_pct > 100 || sbp->sb_imax_pct < 1))) { + (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) { xfs_fs_mount_cmn_err(flags, "SB sanity check 1 failed"); return XFS_ERROR(EFSCORRUPTED); } -- cgit v1.2.3 From 8272145c05c6d01a34f5114357c5e8093fb66472 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Tue, 11 Apr 2006 15:10:55 +1000 Subject: [XFS] Fix a writepage regression where we accidentally stopped honouring nonblock mode with the new IO path code (since 2.6.16). SGI-PV: 951662 SGI-Modid: xfs-linux-melb:xfs-kern:25676a Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_aops.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 6cbbd165c60d..4d191ef39b67 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -870,12 +870,14 @@ xfs_page_state_convert( pgoff_t end_index, last_index, tlast; ssize_t size, len; int flags, err, iomap_valid = 0, uptodate = 1; - int page_dirty, count = 0, trylock_flag = 0; + int page_dirty, count = 0; + int trylock = 0; int all_bh = unmapped; - /* wait for other IO threads? */ - if (startio && (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)) - trylock_flag |= BMAPI_TRYLOCK; + if (startio) { + if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking) + trylock |= BMAPI_TRYLOCK; + } /* Is this page beyond the end of the file? 
*/ offset = i_size_read(inode); @@ -956,15 +958,13 @@ xfs_page_state_convert( if (buffer_unwritten(bh)) { type = IOMAP_UNWRITTEN; - flags = BMAPI_WRITE|BMAPI_IGNSTATE; + flags = BMAPI_WRITE | BMAPI_IGNSTATE; } else if (buffer_delay(bh)) { type = IOMAP_DELAY; - flags = BMAPI_ALLOCATE; - if (!startio) - flags |= trylock_flag; + flags = BMAPI_ALLOCATE | trylock; } else { type = IOMAP_NEW; - flags = BMAPI_WRITE|BMAPI_MMAP; + flags = BMAPI_WRITE | BMAPI_MMAP; } if (!iomap_valid) { -- cgit v1.2.3 From 1fc5d959d88a5f77aa7e4435f6c9d0e2d2236704 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Tue, 11 Apr 2006 15:11:12 +1000 Subject: [XFS] Fix inode reclaim scalability regression. When a filesystem has millions of inodes cached and has sparse cluster population, removing inodes from the cluster hash consumes excessive amounts of CPU time. Reduce the CPU cost by making removal O(1) via use of a double linked list for the hash chains. SGI-PV: 951551 SGI-Modid: xfs-linux-melb:xfs-kern:25683a Signed-off-by: David Chinner Signed-off-by: Nathan Scott --- fs/xfs/xfs_iget.c | 29 ++++++++++++----------------- fs/xfs/xfs_inode.h | 1 + 2 files changed, 13 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index bb33113eef9f..b53854325266 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -421,7 +421,10 @@ finish_inode: ip->i_chash = chlnew; chlnew->chl_ip = ip; chlnew->chl_blkno = ip->i_blkno; + if (ch->ch_list) + ch->ch_list->chl_prev = chlnew; chlnew->chl_next = ch->ch_list; + chlnew->chl_prev = NULL; ch->ch_list = chlnew; chlnew = NULL; } @@ -723,23 +726,15 @@ xfs_iextract( ASSERT(ip->i_cnext == ip && ip->i_cprev == ip); ASSERT(ip->i_chash != NULL); chm=NULL; - for (chl = ch->ch_list; chl != NULL; chl = chl->chl_next) { - if (chl->chl_blkno == ip->i_blkno) { - if (chm == NULL) { - /* first item on the list */ - ch->ch_list = chl->chl_next; - } else { - chm->chl_next = chl->chl_next; - } - kmem_zone_free(xfs_chashlist_zone, 
chl); - break; - } else { - ASSERT(chl->chl_ip != ip); - chm = chl; - } - } - ASSERT_ALWAYS(chl != NULL); - } else { + chl = ip->i_chash; + if (chl->chl_prev) + chl->chl_prev->chl_next = chl->chl_next; + else + ch->ch_list = chl->chl_next; + if (chl->chl_next) + chl->chl_next->chl_prev = chl->chl_prev; + kmem_zone_free(xfs_chashlist_zone, chl); + } else { /* delete one inode from a non-empty list */ iq = ip->i_cnext; iq->i_cprev = ip->i_cprev; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 39ef9c36ea55..3b544db1790b 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -189,6 +189,7 @@ typedef struct xfs_ihash { */ typedef struct xfs_chashlist { struct xfs_chashlist *chl_next; + struct xfs_chashlist *chl_prev; struct xfs_inode *chl_ip; xfs_daddr_t chl_blkno; /* starting block number of * the cluster */ -- cgit v1.2.3 From 58829e490ee805f1c8b3009abc90e2a1a7a0d278 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Tue, 11 Apr 2006 15:11:20 +1000 Subject: [XFS] Fix an inode use-after-free during an unpin. When reclaiming inodes that have been unlinked, we may need to execute transactions during reclaim. By the time the transaction has hit the disk, the linux inode and xfs vnode may already have been freed so we can't reference them safely. Use the known xfs inode state to determine if it is safe to reference the vnode and linux inode during the unpin operation.
SGI-PV: 946321 SGI-Modid: xfs-linux-melb:xfs-kern:25687a Signed-off-by: David Chinner Signed-off-by: Nathan Scott --- fs/xfs/xfs_inode.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 48146bdc6bdd..94b60dd03801 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2732,16 +2732,29 @@ xfs_iunpin( ASSERT(atomic_read(&ip->i_pincount) > 0); if (atomic_dec_and_test(&ip->i_pincount)) { - vnode_t *vp = XFS_ITOV_NULL(ip); + /* + * If the inode is currently being reclaimed, the + * linux inode _and_ the xfs vnode may have been + * freed so we cannot reference either of them safely. + * Hence we should not try to do anything to them + * if the xfs inode is currently in the reclaim + * path. + * + * However, we still need to issue the unpin wakeup + * call as the inode reclaim may be blocked waiting for + * the inode to become unpinned. + */ + if (!(ip->i_flags & (XFS_IRECLAIM|XFS_IRECLAIMABLE))) { + vnode_t *vp = XFS_ITOV_NULL(ip); - /* make sync come back and flush this inode */ - if (vp) { - struct inode *inode = vn_to_inode(vp); + /* make sync come back and flush this inode */ + if (vp) { + struct inode *inode = vn_to_inode(vp); - if (!(inode->i_state & I_NEW)) - mark_inode_dirty_sync(inode); + if (!(inode->i_state & I_NEW)) + mark_inode_dirty_sync(inode); + } } - wake_up(&ip->i_ipin_wait); } } -- cgit v1.2.3 From 8c0b5113a55c698f3190ec85925815640f1c2049 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Tue, 11 Apr 2006 15:12:45 +1000 Subject: [XFS] Fix utime(2) in the case that no times parameter was passed in. 
SGI-PV: 949858 SGI-Modid: xfs-linux-melb:xfs-kern:25717a Signed-off-by: Jes Sorensen Signed-off-by: Nathan Scott --- fs/xfs/linux-2.6/xfs_iops.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 149237304fb6..2e2e275c786f 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -673,8 +673,7 @@ xfs_vn_setattr( if (ia_valid & ATTR_ATIME) { vattr.va_mask |= XFS_AT_ATIME; vattr.va_atime = attr->ia_atime; - if (ia_valid & ATTR_ATIME_SET) - inode->i_atime = attr->ia_atime; + inode->i_atime = attr->ia_atime; } if (ia_valid & ATTR_MTIME) { vattr.va_mask |= XFS_AT_MTIME; -- cgit v1.2.3 From 019ff2d57b0bbe77d1eca19f5b634e5e7ff2a0b8 Mon Sep 17 00:00:00 2001 From: Nathan Scott Date: Tue, 11 Apr 2006 15:45:05 +1000 Subject: [XFS] Fix a problem in aligning inode allocations to stripe unit boundaries. SGI-PV: 951862 SGI-Modid: xfs-linux-melb:xfs-kern:25726a Signed-off-by: Nathan Scott --- fs/xfs/xfs_ialloc.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 4eeb856183b1..deddbd03c166 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -158,9 +158,10 @@ xfs_ialloc_ag_alloc( */ agi = XFS_BUF_TO_AGI(agbp); newino = be32_to_cpu(agi->agi_newino); - if(likely(newino != NULLAGINO)) { - args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + - XFS_IALLOC_BLOCKS(args.mp); + args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + + XFS_IALLOC_BLOCKS(args.mp); + if (likely(newino != NULLAGINO && + (args.agbno < be32_to_cpu(agi->agi_length)))) { args.fsbno = XFS_AGB_TO_FSB(args.mp, be32_to_cpu(agi->agi_seqno), args.agbno); args.type = XFS_ALLOCTYPE_THIS_BNO; @@ -182,8 +183,8 @@ xfs_ialloc_ag_alloc( * Set the alignment for the allocation. * If stripe alignment is turned on then align at stripe unit * boundary. 
- * If the cluster size is smaller than a filesystem block - * then we're doing I/O for inodes in filesystem block size + * If the cluster size is smaller than a filesystem block + * then we're doing I/O for inodes in filesystem block size * pieces, so don't need alignment anyway. */ isaligned = 0; @@ -192,7 +193,7 @@ xfs_ialloc_ag_alloc( args.alignment = args.mp->m_dalign; isaligned = 1; } else if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) && - args.mp->m_sb.sb_inoalignmt >= + args.mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp))) args.alignment = args.mp->m_sb.sb_inoalignmt; @@ -220,7 +221,7 @@ xfs_ialloc_ag_alloc( if ((error = xfs_alloc_vextent(&args))) return error; } - + /* * If stripe alignment is turned on, then try again with cluster * alignment. -- cgit v1.2.3 From b92ce55893745e011edae70830b8bc863be881f9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Apr 2006 13:52:07 +0200 Subject: [PATCH] splice: add direct fd <-> fd splicing support It's more efficient for sendfile() emulation. Basically we cache an internal private pipe and just use that as the intermediate area for pages. Direct splicing is not available from sys_splice(), it is only meant to be used for sendfile() emulation. Additional patch from Ingo Molnar to avoid the PIPE_BUFFERS loop at exit for the normal fast path. 
Signed-off-by: Jens Axboe --- fs/pipe.c | 10 ++-- fs/splice.c | 148 ++++++++++++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 137 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 705b48692627..036536f072c9 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -691,12 +691,10 @@ struct pipe_inode_info * alloc_pipe_info(struct inode *inode) return info; } -void free_pipe_info(struct inode *inode) +void __free_pipe_info(struct pipe_inode_info *info) { int i; - struct pipe_inode_info *info = inode->i_pipe; - inode->i_pipe = NULL; for (i = 0; i < PIPE_BUFFERS; i++) { struct pipe_buffer *buf = info->bufs + i; if (buf->ops) @@ -707,6 +705,12 @@ void free_pipe_info(struct inode *inode) kfree(info); } +void free_pipe_info(struct inode *inode) +{ + __free_pipe_info(inode->i_pipe); + inode->i_pipe = NULL; +} + static struct vfsmount *pipe_mnt __read_mostly; static int pipefs_delete_dentry(struct dentry *dentry) { diff --git a/fs/splice.c b/fs/splice.c index a5326127aad5..c47b561edac0 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -680,8 +680,7 @@ EXPORT_SYMBOL(generic_splice_sendpage); * Attempt to initiate a splice from pipe to file. */ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, - loff_t __user *off_out, size_t len, - unsigned int flags) + size_t len, unsigned int flags) { loff_t pos; int ret; @@ -692,9 +691,6 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, if (!(out->f_mode & FMODE_WRITE)) return -EBADF; - if (off_out && copy_from_user(&out->f_pos, off_out, sizeof(loff_t))) - return -EFAULT; - pos = out->f_pos; ret = rw_verify_area(WRITE, out, &pos, len); @@ -707,9 +703,8 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, /* * Attempt to initiate a splice from a file to a pipe. 
*/ -static long do_splice_to(struct file *in, loff_t __user *off_in, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) +static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, + size_t len, unsigned int flags) { loff_t pos, isize, left; int ret; @@ -720,9 +715,6 @@ static long do_splice_to(struct file *in, loff_t __user *off_in, if (!(in->f_mode & FMODE_READ)) return -EBADF; - if (off_in && copy_from_user(&in->f_pos, off_in, sizeof(loff_t))) - return -EFAULT; - pos = in->f_pos; ret = rw_verify_area(READ, in, &pos, len); @@ -740,6 +732,118 @@ static long do_splice_to(struct file *in, loff_t __user *off_in, return in->f_op->splice_read(in, pipe, len, flags); } +long do_splice_direct(struct file *in, struct file *out, size_t len, + unsigned int flags) +{ + struct pipe_inode_info *pipe; + long ret, bytes; + umode_t i_mode; + int i; + + /* + * We require the input being a regular file, as we don't want to + * randomly drop data for eg socket -> socket splicing. Use the + * piped splicing for that! + */ + i_mode = in->f_dentry->d_inode->i_mode; + if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode))) + return -EINVAL; + + /* + * neither in nor out is a pipe, setup an internal pipe attached to + * 'out' and transfer the wanted data from 'in' to 'out' through that + */ + pipe = current->splice_pipe; + if (!pipe) { + pipe = alloc_pipe_info(NULL); + if (!pipe) + return -ENOMEM; + + /* + * We don't have an immediate reader, but we'll read the stuff + * out of the pipe right after the move_to_pipe(). So set + * PIPE_READERS appropriately. 
+ */ + pipe->readers = 1; + + current->splice_pipe = pipe; + } + + /* + * do the splice + */ + ret = 0; + bytes = 0; + + while (len) { + size_t read_len, max_read_len; + + /* + * Do at most PIPE_BUFFERS pages worth of transfer: + */ + max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE)); + + ret = do_splice_to(in, pipe, max_read_len, flags); + if (unlikely(ret < 0)) + goto out_release; + + read_len = ret; + + /* + * NOTE: nonblocking mode only applies to the input. We + * must not do the output in nonblocking mode as then we + * could get stuck data in the internal pipe: + */ + ret = do_splice_from(pipe, out, read_len, + flags & ~SPLICE_F_NONBLOCK); + if (unlikely(ret < 0)) + goto out_release; + + bytes += ret; + len -= ret; + + /* + * In nonblocking mode, if we got back a short read then + * that was due to either an IO error or due to the + * pagecache entry not being there. In the IO error case + * the _next_ splice attempt will produce a clean IO error + * return value (not a short read), so in both cases it's + * correct to break out of the loop here: + */ + if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len)) + break; + } + + pipe->nrbufs = pipe->curbuf = 0; + + return bytes; + +out_release: + /* + * If we did an incomplete transfer we must release + * the pipe buffers in question: + */ + for (i = 0; i < PIPE_BUFFERS; i++) { + struct pipe_buffer *buf = pipe->bufs + i; + + if (buf->ops) { + buf->ops->release(pipe, buf); + buf->ops = NULL; + } + } + pipe->nrbufs = pipe->curbuf = 0; + + /* + * If we transferred some data, return the number of bytes: + */ + if (bytes > 0) + return bytes; + + return ret; +} + +EXPORT_SYMBOL(do_splice_direct); + /* * Determine where to splice to/from. 
*/ @@ -749,25 +853,33 @@ static long do_splice(struct file *in, loff_t __user *off_in, { struct pipe_inode_info *pipe; - if (off_out && out->f_op->llseek == no_llseek) - return -EINVAL; - if (off_in && in->f_op->llseek == no_llseek) - return -EINVAL; - pipe = in->f_dentry->d_inode->i_pipe; if (pipe) { if (off_in) return -ESPIPE; + if (off_out) { + if (out->f_op->llseek == no_llseek) + return -EINVAL; + if (copy_from_user(&out->f_pos, off_out, + sizeof(loff_t))) + return -EFAULT; + } - return do_splice_from(pipe, out, off_out, len, flags); + return do_splice_from(pipe, out, len, flags); } pipe = out->f_dentry->d_inode->i_pipe; if (pipe) { if (off_out) return -ESPIPE; + if (off_in) { + if (in->f_op->llseek == no_llseek) + return -EINVAL; + if (copy_from_user(&in->f_pos, off_in, sizeof(loff_t))) + return -EFAULT; + } - return do_splice_to(in, off_in, pipe, len, flags); + return do_splice_to(in, pipe, len, flags); } return -EINVAL; -- cgit v1.2.3 From 7480a90435673b4c717b6caf1350ec577d5f1adf Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Apr 2006 13:52:47 +0200 Subject: [PATCH] splice: speedup __generic_file_splice_read Using find_get_page() is a lot faster than find_or_create_page(). This gets splice a lot closer to sendfile() for fd -> socket transfers. 
Signed-off-by: Jens Axboe --- fs/splice.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index c47b561edac0..e30743c2c06a 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -240,7 +240,7 @@ __generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, struct page *pages[PIPE_BUFFERS]; struct page *page; pgoff_t index; - int i; + int i, error; index = in->f_pos >> PAGE_CACHE_SHIFT; offset = in->f_pos & ~PAGE_CACHE_MASK; @@ -260,32 +260,84 @@ __generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, /* * now fill in the holes */ + error = 0; for (i = 0; i < nr_pages; i++, index++) { +find_page: /* - * no page there, look one up / create it + * lookup the page for this index */ - page = find_or_create_page(mapping, index, - mapping_gfp_mask(mapping)); - if (!page) - break; + page = find_get_page(mapping, index); + if (!page) { + /* + * If in nonblock mode then dont block on + * readpage (we've kicked readahead so there + * will be asynchronous progress): + */ + if (flags & SPLICE_F_NONBLOCK) + break; + + /* + * page didn't exist, allocate one + */ + page = page_cache_alloc_cold(mapping); + if (!page) + break; + + error = add_to_page_cache_lru(page, mapping, index, + mapping_gfp_mask(mapping)); + if (unlikely(error)) { + page_cache_release(page); + break; + } - if (PageUptodate(page)) - unlock_page(page); - else { - int error = mapping->a_ops->readpage(in, page); + goto readpage; + } + + /* + * If the page isn't uptodate, we may need to start io on it + */ + if (!PageUptodate(page)) { + lock_page(page); + + /* + * page was truncated, stop here. 
if this isn't the + * first page, we'll just complete what we already + * added + */ + if (!page->mapping) { + unlock_page(page); + page_cache_release(page); + break; + } + /* + * page was already under io and is now done, great + */ + if (PageUptodate(page)) { + unlock_page(page); + goto fill_it; + } + +readpage: + /* + * need to read in the page + */ + error = mapping->a_ops->readpage(in, page); if (unlikely(error)) { page_cache_release(page); + if (error == AOP_TRUNCATED_PAGE) + goto find_page; break; } } +fill_it: pages[i] = page; } if (i) return move_to_pipe(pipe, pages, i, offset, len, flags); - return 0; + return error; } /** -- cgit v1.2.3 From 9aeedfc4712ed58d9f7ae41596185c72b8dc97e8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Apr 2006 13:53:10 +0200 Subject: [PATCH] get rid of the PIPE_*() macros get rid of the PIPE_*() macros. Scripted transformation. Signed-off-by: Ingo Molnar Signed-off-by: Jens Axboe --- fs/fifo.c | 48 ++++++++++++++--------------- fs/pipe.c | 104 +++++++++++++++++++++++++++++++------------------------------- 2 files changed, 76 insertions(+), 76 deletions(-) (limited to 'fs') diff --git a/fs/fifo.c b/fs/fifo.c index b16e2f597d61..2c27f56d7304 100644 --- a/fs/fifo.c +++ b/fs/fifo.c @@ -28,14 +28,14 @@ static void wait_for_partner(struct inode* inode, unsigned int *cnt) static void wake_up_partner(struct inode* inode) { - wake_up_interruptible(PIPE_WAIT(*inode)); + wake_up_interruptible(&inode->i_pipe->wait); } static int fifo_open(struct inode *inode, struct file *filp) { int ret; - mutex_lock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); if (!inode->i_pipe) { ret = -ENOMEM; inode->i_pipe = alloc_pipe_info(inode); @@ -55,18 +55,18 @@ static int fifo_open(struct inode *inode, struct file *filp) * opened, even when there is no process writing the FIFO. 
*/ filp->f_op = &read_fifo_fops; - PIPE_RCOUNTER(*inode)++; - if (PIPE_READERS(*inode)++ == 0) + inode->i_pipe->r_counter++; + if (inode->i_pipe->readers++ == 0) wake_up_partner(inode); - if (!PIPE_WRITERS(*inode)) { + if (!inode->i_pipe->writers) { if ((filp->f_flags & O_NONBLOCK)) { /* suppress POLLHUP until we have * seen a writer */ - filp->f_version = PIPE_WCOUNTER(*inode); + filp->f_version = inode->i_pipe->w_counter; } else { - wait_for_partner(inode, &PIPE_WCOUNTER(*inode)); + wait_for_partner(inode, &inode->i_pipe->w_counter); if(signal_pending(current)) goto err_rd; } @@ -80,16 +80,16 @@ static int fifo_open(struct inode *inode, struct file *filp) * errno=ENXIO when there is no process reading the FIFO. */ ret = -ENXIO; - if ((filp->f_flags & O_NONBLOCK) && !PIPE_READERS(*inode)) + if ((filp->f_flags & O_NONBLOCK) && !inode->i_pipe->readers) goto err; filp->f_op = &write_fifo_fops; - PIPE_WCOUNTER(*inode)++; - if (!PIPE_WRITERS(*inode)++) + inode->i_pipe->w_counter++; + if (!inode->i_pipe->writers++) wake_up_partner(inode); - if (!PIPE_READERS(*inode)) { - wait_for_partner(inode, &PIPE_RCOUNTER(*inode)); + if (!inode->i_pipe->readers) { + wait_for_partner(inode, &inode->i_pipe->r_counter); if (signal_pending(current)) goto err_wr; } @@ -104,11 +104,11 @@ static int fifo_open(struct inode *inode, struct file *filp) */ filp->f_op = &rdwr_fifo_fops; - PIPE_READERS(*inode)++; - PIPE_WRITERS(*inode)++; - PIPE_RCOUNTER(*inode)++; - PIPE_WCOUNTER(*inode)++; - if (PIPE_READERS(*inode) == 1 || PIPE_WRITERS(*inode) == 1) + inode->i_pipe->readers++; + inode->i_pipe->writers++; + inode->i_pipe->r_counter++; + inode->i_pipe->w_counter++; + if (inode->i_pipe->readers == 1 || inode->i_pipe->writers == 1) wake_up_partner(inode); break; @@ -118,27 +118,27 @@ static int fifo_open(struct inode *inode, struct file *filp) } /* Ok! 
*/ - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); return 0; err_rd: - if (!--PIPE_READERS(*inode)) - wake_up_interruptible(PIPE_WAIT(*inode)); + if (!--inode->i_pipe->readers) + wake_up_interruptible(&inode->i_pipe->wait); ret = -ERESTARTSYS; goto err; err_wr: - if (!--PIPE_WRITERS(*inode)) - wake_up_interruptible(PIPE_WAIT(*inode)); + if (!--inode->i_pipe->writers) + wake_up_interruptible(&inode->i_pipe->wait); ret = -ERESTARTSYS; goto err; err: - if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode)) + if (!inode->i_pipe->readers && !inode->i_pipe->writers) free_pipe_info(inode); err_nocleanup: - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); return ret; } diff --git a/fs/pipe.c b/fs/pipe.c index 036536f072c9..0602fc9f7eba 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -158,7 +158,7 @@ pipe_readv(struct file *filp, const struct iovec *_iov, do_wakeup = 0; ret = 0; - mutex_lock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); info = inode->i_pipe; for (;;) { int bufs = info->nrbufs; @@ -202,9 +202,9 @@ pipe_readv(struct file *filp, const struct iovec *_iov, } if (bufs) /* More to do? */ continue; - if (!PIPE_WRITERS(*inode)) + if (!inode->i_pipe->writers) break; - if (!PIPE_WAITING_WRITERS(*inode)) { + if (!inode->i_pipe->waiting_writers) { /* syscall merging: Usually we must not sleep * if O_NONBLOCK is set, or if we got some data. * But if a writer sleeps in kernel space, then @@ -222,16 +222,16 @@ pipe_readv(struct file *filp, const struct iovec *_iov, break; } if (do_wakeup) { - wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); + wake_up_interruptible_sync(&inode->i_pipe->wait); + kill_fasync(&inode->i_pipe->fasync_writers, SIGIO, POLL_OUT); } pipe_wait(inode->i_pipe); } - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); /* Signal writers asynchronously that there is more room. 
*/ if (do_wakeup) { - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); + wake_up_interruptible(&inode->i_pipe->wait); + kill_fasync(&inode->i_pipe->fasync_writers, SIGIO, POLL_OUT); } if (ret > 0) file_accessed(filp); @@ -264,10 +264,10 @@ pipe_writev(struct file *filp, const struct iovec *_iov, do_wakeup = 0; ret = 0; - mutex_lock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); info = inode->i_pipe; - if (!PIPE_READERS(*inode)) { + if (!inode->i_pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; @@ -306,7 +306,7 @@ pipe_writev(struct file *filp, const struct iovec *_iov, for (;;) { int bufs; - if (!PIPE_READERS(*inode)) { + if (!inode->i_pipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; @@ -367,19 +367,19 @@ pipe_writev(struct file *filp, const struct iovec *_iov, break; } if (do_wakeup) { - wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); + wake_up_interruptible_sync(&inode->i_pipe->wait); + kill_fasync(&inode->i_pipe->fasync_readers, SIGIO, POLL_IN); do_wakeup = 0; } - PIPE_WAITING_WRITERS(*inode)++; + inode->i_pipe->waiting_writers++; pipe_wait(inode->i_pipe); - PIPE_WAITING_WRITERS(*inode)--; + inode->i_pipe->waiting_writers--; } out: - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); if (do_wakeup) { - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); + wake_up_interruptible(&inode->i_pipe->wait); + kill_fasync(&inode->i_pipe->fasync_readers, SIGIO, POLL_IN); } if (ret > 0) file_update_time(filp); @@ -416,7 +416,7 @@ pipe_ioctl(struct inode *pino, struct file *filp, switch (cmd) { case FIONREAD: - mutex_lock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); info = inode->i_pipe; count = 0; buf = info->curbuf; @@ -425,7 +425,7 @@ pipe_ioctl(struct inode *pino, struct file *filp, count += info->bufs[buf].len; buf = (buf+1) & 
(PIPE_BUFFERS-1); } - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); return put_user(count, (int __user *)arg); default: return -EINVAL; @@ -441,14 +441,14 @@ pipe_poll(struct file *filp, poll_table *wait) struct pipe_inode_info *info = inode->i_pipe; int nrbufs; - poll_wait(filp, PIPE_WAIT(*inode), wait); + poll_wait(filp, &inode->i_pipe->wait, wait); /* Reading only -- no need for acquiring the semaphore. */ nrbufs = info->nrbufs; mask = 0; if (filp->f_mode & FMODE_READ) { mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0; - if (!PIPE_WRITERS(*inode) && filp->f_version != PIPE_WCOUNTER(*inode)) + if (!inode->i_pipe->writers && filp->f_version != inode->i_pipe->w_counter) mask |= POLLHUP; } @@ -458,7 +458,7 @@ pipe_poll(struct file *filp, poll_table *wait) * Most Unices do not set POLLERR for FIFOs but on Linux they * behave exactly like pipes for poll(). */ - if (!PIPE_READERS(*inode)) + if (!inode->i_pipe->readers) mask |= POLLERR; } @@ -468,17 +468,17 @@ pipe_poll(struct file *filp, poll_table *wait) static int pipe_release(struct inode *inode, int decr, int decw) { - mutex_lock(PIPE_MUTEX(*inode)); - PIPE_READERS(*inode) -= decr; - PIPE_WRITERS(*inode) -= decw; - if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode)) { + mutex_lock(&inode->i_mutex); + inode->i_pipe->readers -= decr; + inode->i_pipe->writers -= decw; + if (!inode->i_pipe->readers && !inode->i_pipe->writers) { free_pipe_info(inode); } else { - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); + wake_up_interruptible(&inode->i_pipe->wait); + kill_fasync(&inode->i_pipe->fasync_readers, SIGIO, POLL_IN); + kill_fasync(&inode->i_pipe->fasync_writers, SIGIO, POLL_OUT); } - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); return 0; } @@ -489,9 +489,9 @@ pipe_read_fasync(int fd, struct file *filp, int on) struct inode *inode = filp->f_dentry->d_inode; int 
retval; - mutex_lock(PIPE_MUTEX(*inode)); - retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode)); - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers); + mutex_unlock(&inode->i_mutex); if (retval < 0) return retval; @@ -506,9 +506,9 @@ pipe_write_fasync(int fd, struct file *filp, int on) struct inode *inode = filp->f_dentry->d_inode; int retval; - mutex_lock(PIPE_MUTEX(*inode)); - retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode)); - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers); + mutex_unlock(&inode->i_mutex); if (retval < 0) return retval; @@ -523,14 +523,14 @@ pipe_rdwr_fasync(int fd, struct file *filp, int on) struct inode *inode = filp->f_dentry->d_inode; int retval; - mutex_lock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode)); + retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers); if (retval >= 0) - retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode)); + retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers); - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); if (retval < 0) return retval; @@ -569,9 +569,9 @@ pipe_read_open(struct inode *inode, struct file *filp) { /* We could have perhaps used atomic_t, but this and friends below are the only places. So it doesn't seem worthwhile. 
*/ - mutex_lock(PIPE_MUTEX(*inode)); - PIPE_READERS(*inode)++; - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); + inode->i_pipe->readers++; + mutex_unlock(&inode->i_mutex); return 0; } @@ -579,9 +579,9 @@ pipe_read_open(struct inode *inode, struct file *filp) static int pipe_write_open(struct inode *inode, struct file *filp) { - mutex_lock(PIPE_MUTEX(*inode)); - PIPE_WRITERS(*inode)++; - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); + inode->i_pipe->writers++; + mutex_unlock(&inode->i_mutex); return 0; } @@ -589,12 +589,12 @@ pipe_write_open(struct inode *inode, struct file *filp) static int pipe_rdwr_open(struct inode *inode, struct file *filp) { - mutex_lock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); if (filp->f_mode & FMODE_READ) - PIPE_READERS(*inode)++; + inode->i_pipe->readers++; if (filp->f_mode & FMODE_WRITE) - PIPE_WRITERS(*inode)++; - mutex_unlock(PIPE_MUTEX(*inode)); + inode->i_pipe->writers++; + mutex_unlock(&inode->i_mutex); return 0; } @@ -731,7 +731,7 @@ static struct inode * get_pipe_inode(void) if (!inode->i_pipe) goto fail_iput; - PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 1; + inode->i_pipe->readers = inode->i_pipe->writers = 1; inode->i_fop = &rdwr_pipe_fops; /* -- cgit v1.2.3 From 923f4f23940d2361e8d5c4245982163a8e9d1c91 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Apr 2006 13:53:33 +0200 Subject: [PATCH] pipe.c/fifo.c code cleanups more code cleanups after the macro conversion: - standardize on 'struct pipe_inode_info *pipe' variable names - introduce 'pipe' temporaries to reduce mass inode->i_pipe dereferencing Signed-off-by: Ingo Molnar Signed-off-by: Jens Axboe --- fs/fifo.c | 49 ++++++++++--------- fs/pipe.c | 163 ++++++++++++++++++++++++++++++++------------------------------ 2 files changed, 111 insertions(+), 101 deletions(-) (limited to 'fs') diff --git a/fs/fifo.c b/fs/fifo.c index 2c27f56d7304..49035b174b48 100644 --- a/fs/fifo.c +++ b/fs/fifo.c @@ -33,14 +33,17 @@ static 
void wake_up_partner(struct inode* inode) static int fifo_open(struct inode *inode, struct file *filp) { + struct pipe_inode_info *pipe; int ret; mutex_lock(&inode->i_mutex); - if (!inode->i_pipe) { + pipe = inode->i_pipe; + if (!pipe) { ret = -ENOMEM; - inode->i_pipe = alloc_pipe_info(inode); - if (!inode->i_pipe) + pipe = alloc_pipe_info(inode); + if (!pipe) goto err_nocleanup; + inode->i_pipe = pipe; } filp->f_version = 0; @@ -55,18 +58,18 @@ static int fifo_open(struct inode *inode, struct file *filp) * opened, even when there is no process writing the FIFO. */ filp->f_op = &read_fifo_fops; - inode->i_pipe->r_counter++; - if (inode->i_pipe->readers++ == 0) + pipe->r_counter++; + if (pipe->readers++ == 0) wake_up_partner(inode); - if (!inode->i_pipe->writers) { + if (!pipe->writers) { if ((filp->f_flags & O_NONBLOCK)) { /* suppress POLLHUP until we have * seen a writer */ - filp->f_version = inode->i_pipe->w_counter; + filp->f_version = pipe->w_counter; } else { - wait_for_partner(inode, &inode->i_pipe->w_counter); + wait_for_partner(inode, &pipe->w_counter); if(signal_pending(current)) goto err_rd; } @@ -80,16 +83,16 @@ static int fifo_open(struct inode *inode, struct file *filp) * errno=ENXIO when there is no process reading the FIFO. 
*/ ret = -ENXIO; - if ((filp->f_flags & O_NONBLOCK) && !inode->i_pipe->readers) + if ((filp->f_flags & O_NONBLOCK) && !pipe->readers) goto err; filp->f_op = &write_fifo_fops; - inode->i_pipe->w_counter++; - if (!inode->i_pipe->writers++) + pipe->w_counter++; + if (!pipe->writers++) wake_up_partner(inode); - if (!inode->i_pipe->readers) { - wait_for_partner(inode, &inode->i_pipe->r_counter); + if (!pipe->readers) { + wait_for_partner(inode, &pipe->r_counter); if (signal_pending(current)) goto err_wr; } @@ -104,11 +107,11 @@ static int fifo_open(struct inode *inode, struct file *filp) */ filp->f_op = &rdwr_fifo_fops; - inode->i_pipe->readers++; - inode->i_pipe->writers++; - inode->i_pipe->r_counter++; - inode->i_pipe->w_counter++; - if (inode->i_pipe->readers == 1 || inode->i_pipe->writers == 1) + pipe->readers++; + pipe->writers++; + pipe->r_counter++; + pipe->w_counter++; + if (pipe->readers == 1 || pipe->writers == 1) wake_up_partner(inode); break; @@ -122,19 +125,19 @@ static int fifo_open(struct inode *inode, struct file *filp) return 0; err_rd: - if (!--inode->i_pipe->readers) - wake_up_interruptible(&inode->i_pipe->wait); + if (!--pipe->readers) + wake_up_interruptible(&pipe->wait); ret = -ERESTARTSYS; goto err; err_wr: - if (!--inode->i_pipe->writers) - wake_up_interruptible(&inode->i_pipe->wait); + if (!--pipe->writers) + wake_up_interruptible(&pipe->wait); ret = -ERESTARTSYS; goto err; err: - if (!inode->i_pipe->readers && !inode->i_pipe->writers) + if (!pipe->readers && !pipe->writers) free_pipe_info(inode); err_nocleanup: diff --git a/fs/pipe.c b/fs/pipe.c index 0602fc9f7eba..b941e1951eac 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -93,7 +93,7 @@ pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len) return 0; } -static void anon_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buffer *buf) +static void anon_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; @@ -104,8 
+104,8 @@ static void anon_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buff * temporary page, let's keep track of it as a one-deep * allocation cache */ - if (page_count(page) == 1 && !info->tmp_page) { - info->tmp_page = page; + if (page_count(page) == 1 && !pipe->tmp_page) { + pipe->tmp_page = page; return; } @@ -115,17 +115,17 @@ static void anon_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buff page_cache_release(page); } -static void *anon_pipe_buf_map(struct file *file, struct pipe_inode_info *info, struct pipe_buffer *buf) +static void *anon_pipe_buf_map(struct file *file, struct pipe_inode_info *pipe, struct pipe_buffer *buf) { return kmap(buf->page); } -static void anon_pipe_buf_unmap(struct pipe_inode_info *info, struct pipe_buffer *buf) +static void anon_pipe_buf_unmap(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { kunmap(buf->page); } -static int anon_pipe_buf_steal(struct pipe_inode_info *info, +static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { buf->flags |= PIPE_BUF_FLAG_STOLEN; @@ -145,7 +145,7 @@ pipe_readv(struct file *filp, const struct iovec *_iov, unsigned long nr_segs, loff_t *ppos) { struct inode *inode = filp->f_dentry->d_inode; - struct pipe_inode_info *info; + struct pipe_inode_info *pipe; int do_wakeup; ssize_t ret; struct iovec *iov = (struct iovec *)_iov; @@ -159,12 +159,12 @@ pipe_readv(struct file *filp, const struct iovec *_iov, do_wakeup = 0; ret = 0; mutex_lock(&inode->i_mutex); - info = inode->i_pipe; + pipe = inode->i_pipe; for (;;) { - int bufs = info->nrbufs; + int bufs = pipe->nrbufs; if (bufs) { - int curbuf = info->curbuf; - struct pipe_buffer *buf = info->bufs + curbuf; + int curbuf = pipe->curbuf; + struct pipe_buffer *buf = pipe->bufs + curbuf; struct pipe_buf_operations *ops = buf->ops; void *addr; size_t chars = buf->len; @@ -173,14 +173,14 @@ pipe_readv(struct file *filp, const struct iovec *_iov, if (chars > total_len) chars = total_len; - 
addr = ops->map(filp, info, buf); + addr = ops->map(filp, pipe, buf); if (IS_ERR(addr)) { if (!ret) ret = PTR_ERR(addr); break; } error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars); - ops->unmap(info, buf); + ops->unmap(pipe, buf); if (unlikely(error)) { if (!ret) ret = -EFAULT; break; @@ -190,10 +190,10 @@ pipe_readv(struct file *filp, const struct iovec *_iov, buf->len -= chars; if (!buf->len) { buf->ops = NULL; - ops->release(info, buf); + ops->release(pipe, buf); curbuf = (curbuf + 1) & (PIPE_BUFFERS-1); - info->curbuf = curbuf; - info->nrbufs = --bufs; + pipe->curbuf = curbuf; + pipe->nrbufs = --bufs; do_wakeup = 1; } total_len -= chars; @@ -202,9 +202,9 @@ pipe_readv(struct file *filp, const struct iovec *_iov, } if (bufs) /* More to do? */ continue; - if (!inode->i_pipe->writers) + if (!pipe->writers) break; - if (!inode->i_pipe->waiting_writers) { + if (!pipe->waiting_writers) { /* syscall merging: Usually we must not sleep * if O_NONBLOCK is set, or if we got some data. * But if a writer sleeps in kernel space, then @@ -222,16 +222,16 @@ pipe_readv(struct file *filp, const struct iovec *_iov, break; } if (do_wakeup) { - wake_up_interruptible_sync(&inode->i_pipe->wait); - kill_fasync(&inode->i_pipe->fasync_writers, SIGIO, POLL_OUT); + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - pipe_wait(inode->i_pipe); + pipe_wait(pipe); } mutex_unlock(&inode->i_mutex); /* Signal writers asynchronously that there is more room. 
*/ if (do_wakeup) { - wake_up_interruptible(&inode->i_pipe->wait); - kill_fasync(&inode->i_pipe->fasync_writers, SIGIO, POLL_OUT); + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } if (ret > 0) file_accessed(filp); @@ -250,7 +250,7 @@ pipe_writev(struct file *filp, const struct iovec *_iov, unsigned long nr_segs, loff_t *ppos) { struct inode *inode = filp->f_dentry->d_inode; - struct pipe_inode_info *info; + struct pipe_inode_info *pipe; ssize_t ret; int do_wakeup; struct iovec *iov = (struct iovec *)_iov; @@ -265,9 +265,9 @@ pipe_writev(struct file *filp, const struct iovec *_iov, do_wakeup = 0; ret = 0; mutex_lock(&inode->i_mutex); - info = inode->i_pipe; + pipe = inode->i_pipe; - if (!inode->i_pipe->readers) { + if (!pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; @@ -275,23 +275,23 @@ pipe_writev(struct file *filp, const struct iovec *_iov, /* We try to merge small writes */ chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ - if (info->nrbufs && chars != 0) { - int lastbuf = (info->curbuf + info->nrbufs - 1) & (PIPE_BUFFERS-1); - struct pipe_buffer *buf = info->bufs + lastbuf; + if (pipe->nrbufs && chars != 0) { + int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & (PIPE_BUFFERS-1); + struct pipe_buffer *buf = pipe->bufs + lastbuf; struct pipe_buf_operations *ops = buf->ops; int offset = buf->offset + buf->len; if (ops->can_merge && offset + chars <= PAGE_SIZE) { void *addr; int error; - addr = ops->map(filp, info, buf); + addr = ops->map(filp, pipe, buf); if (IS_ERR(addr)) { error = PTR_ERR(addr); goto out; } error = pipe_iov_copy_from_user(offset + addr, iov, chars); - ops->unmap(info, buf); + ops->unmap(pipe, buf); ret = error; do_wakeup = 1; if (error) @@ -306,16 +306,16 @@ pipe_writev(struct file *filp, const struct iovec *_iov, for (;;) { int bufs; - if (!inode->i_pipe->readers) { + if (!pipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } - bufs 
= info->nrbufs; + bufs = pipe->nrbufs; if (bufs < PIPE_BUFFERS) { - int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS-1); - struct pipe_buffer *buf = info->bufs + newbuf; - struct page *page = info->tmp_page; + int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); + struct pipe_buffer *buf = pipe->bufs + newbuf; + struct page *page = pipe->tmp_page; int error; if (!page) { @@ -324,7 +324,7 @@ pipe_writev(struct file *filp, const struct iovec *_iov, ret = ret ? : -ENOMEM; break; } - info->tmp_page = page; + pipe->tmp_page = page; } /* Always wakeup, even if the copy fails. Otherwise * we lock up (O_NONBLOCK-)readers that sleep due to @@ -349,8 +349,8 @@ pipe_writev(struct file *filp, const struct iovec *_iov, buf->ops = &anon_pipe_buf_ops; buf->offset = 0; buf->len = chars; - info->nrbufs = ++bufs; - info->tmp_page = NULL; + pipe->nrbufs = ++bufs; + pipe->tmp_page = NULL; total_len -= chars; if (!total_len) @@ -367,19 +367,19 @@ pipe_writev(struct file *filp, const struct iovec *_iov, break; } if (do_wakeup) { - wake_up_interruptible_sync(&inode->i_pipe->wait); - kill_fasync(&inode->i_pipe->fasync_readers, SIGIO, POLL_IN); + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); do_wakeup = 0; } - inode->i_pipe->waiting_writers++; - pipe_wait(inode->i_pipe); - inode->i_pipe->waiting_writers--; + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; } out: mutex_unlock(&inode->i_mutex); if (do_wakeup) { - wake_up_interruptible(&inode->i_pipe->wait); - kill_fasync(&inode->i_pipe->fasync_readers, SIGIO, POLL_IN); + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } if (ret > 0) file_update_time(filp); @@ -411,21 +411,22 @@ pipe_ioctl(struct inode *pino, struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = filp->f_dentry->d_inode; - struct pipe_inode_info *info; + struct pipe_inode_info *pipe; int count, buf, nrbufs; switch (cmd) { case 
FIONREAD: mutex_lock(&inode->i_mutex); - info = inode->i_pipe; + pipe = inode->i_pipe; count = 0; - buf = info->curbuf; - nrbufs = info->nrbufs; + buf = pipe->curbuf; + nrbufs = pipe->nrbufs; while (--nrbufs >= 0) { - count += info->bufs[buf].len; + count += pipe->bufs[buf].len; buf = (buf+1) & (PIPE_BUFFERS-1); } mutex_unlock(&inode->i_mutex); + return put_user(count, (int __user *)arg); default: return -EINVAL; @@ -438,17 +439,17 @@ pipe_poll(struct file *filp, poll_table *wait) { unsigned int mask; struct inode *inode = filp->f_dentry->d_inode; - struct pipe_inode_info *info = inode->i_pipe; + struct pipe_inode_info *pipe = inode->i_pipe; int nrbufs; - poll_wait(filp, &inode->i_pipe->wait, wait); + poll_wait(filp, &pipe->wait, wait); /* Reading only -- no need for acquiring the semaphore. */ - nrbufs = info->nrbufs; + nrbufs = pipe->nrbufs; mask = 0; if (filp->f_mode & FMODE_READ) { mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0; - if (!inode->i_pipe->writers && filp->f_version != inode->i_pipe->w_counter) + if (!pipe->writers && filp->f_version != pipe->w_counter) mask |= POLLHUP; } @@ -458,7 +459,7 @@ pipe_poll(struct file *filp, poll_table *wait) * Most Unices do not set POLLERR for FIFOs but on Linux they * behave exactly like pipes for poll(). 
*/ - if (!inode->i_pipe->readers) + if (!pipe->readers) mask |= POLLERR; } @@ -468,15 +469,18 @@ pipe_poll(struct file *filp, poll_table *wait) static int pipe_release(struct inode *inode, int decr, int decw) { + struct pipe_inode_info *pipe; + mutex_lock(&inode->i_mutex); - inode->i_pipe->readers -= decr; - inode->i_pipe->writers -= decw; - if (!inode->i_pipe->readers && !inode->i_pipe->writers) { + pipe = inode->i_pipe; + pipe->readers -= decr; + pipe->writers -= decw; + if (!pipe->readers && !pipe->writers) { free_pipe_info(inode); } else { - wake_up_interruptible(&inode->i_pipe->wait); - kill_fasync(&inode->i_pipe->fasync_readers, SIGIO, POLL_IN); - kill_fasync(&inode->i_pipe->fasync_writers, SIGIO, POLL_OUT); + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } mutex_unlock(&inode->i_mutex); @@ -679,30 +683,30 @@ static struct file_operations rdwr_pipe_fops = { struct pipe_inode_info * alloc_pipe_info(struct inode *inode) { - struct pipe_inode_info *info; + struct pipe_inode_info *pipe; - info = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); - if (info) { - init_waitqueue_head(&info->wait); - info->r_counter = info->w_counter = 1; - info->inode = inode; + pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); + if (pipe) { + init_waitqueue_head(&pipe->wait); + pipe->r_counter = pipe->w_counter = 1; + pipe->inode = inode; } - return info; + return pipe; } -void __free_pipe_info(struct pipe_inode_info *info) +void __free_pipe_info(struct pipe_inode_info *pipe) { int i; for (i = 0; i < PIPE_BUFFERS; i++) { - struct pipe_buffer *buf = info->bufs + i; + struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) - buf->ops->release(info, buf); + buf->ops->release(pipe, buf); } - if (info->tmp_page) - __free_page(info->tmp_page); - kfree(info); + if (pipe->tmp_page) + __free_page(pipe->tmp_page); + kfree(pipe); } void free_pipe_info(struct inode *inode) @@ -723,15 
+727,17 @@ static struct dentry_operations pipefs_dentry_operations = { static struct inode * get_pipe_inode(void) { struct inode *inode = new_inode(pipe_mnt->mnt_sb); + struct pipe_inode_info *pipe; if (!inode) goto fail_inode; - inode->i_pipe = alloc_pipe_info(inode); - if (!inode->i_pipe) + pipe = alloc_pipe_info(inode); + if (!pipe) goto fail_iput; + inode->i_pipe = pipe; - inode->i_pipe->readers = inode->i_pipe->writers = 1; + pipe->readers = pipe->writers = 1; inode->i_fop = &rdwr_pipe_fops; /* @@ -746,6 +752,7 @@ static struct inode * get_pipe_inode(void) inode->i_gid = current->fsgid; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_blksize = PAGE_SIZE; + return inode; fail_iput: -- cgit v1.2.3 From 6f767b0425f5902e4817648632230b512e81c963 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Apr 2006 13:53:56 +0200 Subject: [PATCH] splice: speedups and optimizations - Kill the local variables that cache ->nrbufs, they just take up space. - Only set do_wakeup for a real pipe. This is a big win for direct splicing. - Kill i_mutex lock around ->f_pos update, regular io paths don't do this either. 
Signed-off-by: Jens Axboe --- fs/splice.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index e30743c2c06a..36bc262dfbd5 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -150,8 +150,6 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, mutex_lock(&pipe->inode->i_mutex); for (;;) { - int bufs; - if (!pipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) @@ -159,9 +157,8 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, break; } - bufs = pipe->nrbufs; - if (bufs < PIPE_BUFFERS) { - int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS - 1); + if (pipe->nrbufs < PIPE_BUFFERS) { + int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); struct pipe_buffer *buf = pipe->bufs + newbuf; struct page *page = pages[i++]; unsigned long this_len; @@ -174,8 +171,9 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, buf->offset = offset; buf->len = this_len; buf->ops = &page_cache_pipe_buf_ops; - pipe->nrbufs = ++bufs; - do_wakeup = 1; + pipe->nrbufs++; + if (pipe->inode) + do_wakeup = 1; ret += this_len; len -= this_len; @@ -184,7 +182,7 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, break; if (!len) break; - if (bufs < PIPE_BUFFERS) + if (pipe->nrbufs < PIPE_BUFFERS) continue; break; @@ -581,11 +579,8 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, mutex_lock(&pipe->inode->i_mutex); for (;;) { - int bufs = pipe->nrbufs; - - if (bufs) { - int curbuf = pipe->curbuf; - struct pipe_buffer *buf = pipe->bufs + curbuf; + if (pipe->nrbufs) { + struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; struct pipe_buf_operations *ops = buf->ops; sd.len = buf->len; @@ -606,10 +601,10 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, if (!buf->len) { buf->ops = NULL; ops->release(pipe, buf); - curbuf = 
(curbuf + 1) & (PIPE_BUFFERS - 1); - pipe->curbuf = curbuf; - pipe->nrbufs = --bufs; - do_wakeup = 1; + pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); + pipe->nrbufs--; + if (pipe->inode) + do_wakeup = 1; } sd.pos += sd.len; @@ -618,7 +613,7 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, break; } - if (bufs) + if (pipe->nrbufs) continue; if (!pipe->writers) break; @@ -660,9 +655,7 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - mutex_lock(&out->f_mapping->host->i_mutex); out->f_pos = sd.pos; - mutex_unlock(&out->f_mapping->host->i_mutex); return ret; } -- cgit v1.2.3 From 49570e9b29a3d78950b5eba6b73bdcca955f0877 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Apr 2006 13:56:09 +0200 Subject: [PATCH] splice: unlikely() optimizations Also corrects a few comments. Patch mainly from Ingo, changes by me. Signed-off-by: Ingo Molnar Signed-off-by: Jens Axboe --- fs/read_write.c | 2 +- fs/splice.c | 15 +++++++-------- 2 files changed, 8 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/read_write.c b/fs/read_write.c index 6256ca81a718..5bc0e9234f9d 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -202,7 +202,7 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count goto Einval; inode = file->f_dentry->d_inode; - if (inode->i_flock && MANDATORY_LOCK(inode)) { + if (unlikely(inode->i_flock && MANDATORY_LOCK(inode))) { int retval = locks_mandatory_area( read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, inode, file, pos, count); diff --git a/fs/splice.c b/fs/splice.c index 36bc262dfbd5..77b026baff7d 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -346,7 +346,6 @@ fill_it: * @flags: splice modifier flags * * Will read pages from given file and fill them into a pipe. 
- * */ ssize_t generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, size_t len, unsigned int flags) @@ -690,7 +689,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, mutex_lock(&inode->i_mutex); err = generic_osync_inode(mapping->host, mapping, - OSYNC_METADATA|OSYNC_DATA); + OSYNC_METADATA|OSYNC_DATA); mutex_unlock(&inode->i_mutex); if (err) @@ -730,10 +729,10 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, loff_t pos; int ret; - if (!out->f_op || !out->f_op->splice_write) + if (unlikely(!out->f_op || !out->f_op->splice_write)) return -EINVAL; - if (!(out->f_mode & FMODE_WRITE)) + if (unlikely(!(out->f_mode & FMODE_WRITE))) return -EBADF; pos = out->f_pos; @@ -754,10 +753,10 @@ static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, loff_t pos, isize, left; int ret; - if (!in->f_op || !in->f_op->splice_read) + if (unlikely(!in->f_op || !in->f_op->splice_read)) return -EINVAL; - if (!(in->f_mode & FMODE_READ)) + if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; pos = in->f_pos; @@ -771,7 +770,7 @@ static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, return 0; left = isize - in->f_pos; - if (left < len) + if (unlikely(left < len)) len = left; return in->f_op->splice_read(in, pipe, len, flags); @@ -799,7 +798,7 @@ long do_splice_direct(struct file *in, struct file *out, size_t len, * 'out' and transfer the wanted data from 'in' to 'out' through that */ pipe = current->splice_pipe; - if (!pipe) { + if (unlikely(!pipe)) { pipe = alloc_pipe_info(NULL); if (!pipe) return -ENOMEM; -- cgit v1.2.3 From c2058e0611c2fb4a00e63db61ef3a638cf760a98 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Apr 2006 13:56:34 +0200 Subject: [PATCH] splice: add Ingo as addition copyright holder The comment is also somewhat out of date, correct that as well. 
Signed-off-by: Jens Axboe --- fs/splice.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 77b026baff7d..48b4382cc114 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -9,11 +9,12 @@ * that transfers data buffers to or from a pipe buffer. * * Named by Larry McVoy, original implementation from Linus, extended by - * Jens to support splicing to files and fixing the initial implementation - * bugs. + * Jens to support splicing to files, network, direct splicing, etc and + * fixing lots of bugs. * - * Copyright (C) 2005 Jens Axboe - * Copyright (C) 2005 Linus Torvalds + * Copyright (C) 2005-2006 Jens Axboe + * Copyright (C) 2005-2006 Linus Torvalds + * Copyright (C) 2006 Ingo Molnar * */ #include -- cgit v1.2.3 From 73d62d83ec3627782ba6f55defc76f3ffbef46ee Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Apr 2006 13:57:21 +0200 Subject: [PATCH] splice: comment styles - capitalize consistently - end sentences in one way or another - update comment text to match the implementation Signed-off-by: Ingo Molnar Signed-off-by: Jens Axboe --- fs/splice.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 48b4382cc114..e50a460239dd 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -92,7 +92,7 @@ static void *page_cache_pipe_buf_map(struct file *file, /* * Page got truncated/unhashed. This will cause a 0-byte - * splice, if this is the first page + * splice, if this is the first page. */ if (!page->mapping) { err = -ENODATA; @@ -100,7 +100,7 @@ static void *page_cache_pipe_buf_map(struct file *file, } /* - * uh oh, read-error from disk + * Uh oh, read-error from disk. */ if (!PageUptodate(page)) { err = -EIO; @@ -108,7 +108,7 @@ static void *page_cache_pipe_buf_map(struct file *file, } /* - * page is ok afterall, fall through to mapping + * Page is ok afterall, fall through to mapping. 
*/ unlock_page(page); } @@ -249,7 +249,7 @@ __generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, nr_pages = PIPE_BUFFERS; /* - * initiate read-ahead on this page range. however, don't call into + * Initiate read-ahead on this page range. however, don't call into * read-ahead if this is a non-zero offset (we are likely doing small * chunk splice and the page is already there) for a single page. */ @@ -257,7 +257,7 @@ __generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, do_page_cache_readahead(mapping, in, index, nr_pages); /* - * now fill in the holes + * Now fill in the holes: */ error = 0; for (i = 0; i < nr_pages; i++, index++) { @@ -396,10 +396,10 @@ static int pipe_to_sendpage(struct pipe_inode_info *info, int more; /* - * sub-optimal, but we are limited by the pipe ->map. we don't + * Sub-optimal, but we are limited by the pipe ->map. We don't * need a kmap'ed buffer here, we just want to make sure we * have the page pinned if the pipe page originates from the - * page cache + * page cache. */ ptr = buf->ops->map(file, info, buf); if (IS_ERR(ptr)) @@ -460,7 +460,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, offset = sd->pos & ~PAGE_CACHE_MASK; /* - * reuse buf page, if SPLICE_F_MOVE is set + * Reuse buf page, if SPLICE_F_MOVE is set. */ if (sd->flags & SPLICE_F_MOVE) { /* @@ -501,7 +501,7 @@ find_page: if (!PageUptodate(page)) { /* - * page got invalidated, repeat + * Page got invalidated, repeat. 
*/ if (!page->mapping) { unlock_page(page); @@ -598,6 +598,7 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, ret += sd.len; buf->offset += sd.len; buf->len -= sd.len; + if (!buf->len) { buf->ops = NULL; ops->release(pipe, buf); @@ -681,7 +682,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, ret = move_from_pipe(pipe, out, len, flags, pipe_to_file); /* - * if file or inode is SYNC and we actually wrote some data, sync it + * If file or inode is SYNC and we actually wrote some data, sync it. */ if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(mapping->host)) && ret > 0) { @@ -815,7 +816,7 @@ long do_splice_direct(struct file *in, struct file *out, size_t len, } /* - * do the splice + * Do the splice. */ ret = 0; bytes = 0; -- cgit v1.2.3 From 341b446bc5aa36d1d5b8159c1e66716b5d89024d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Apr 2006 13:57:45 +0200 Subject: [PATCH] another round of fs/pipe.c cleanups make pipe.c a bit more readable and hackable. 
Signed-off-by: Ingo Molnar Signed-off-by: Jens Axboe --- fs/pipe.c | 76 ++++++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 48 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index b941e1951eac..e984beb93a0e 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -44,7 +44,8 @@ void pipe_wait(struct pipe_inode_info *pipe) * Pipes are system-local resources, so sleeping on them * is considered a noninteractive wait: */ - prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE); + prepare_to_wait(&pipe->wait, &wait, + TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE); if (pipe->inode) mutex_unlock(&pipe->inode->i_mutex); schedule(); @@ -93,7 +94,8 @@ pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len) return 0; } -static void anon_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) +static void anon_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { struct page *page = buf->page; @@ -102,25 +104,22 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buff /* * If nobody else uses this page, and we don't already have a * temporary page, let's keep track of it as a one-deep - * allocation cache + * allocation cache. 
(Otherwise just release our reference to it) */ - if (page_count(page) == 1 && !pipe->tmp_page) { + if (page_count(page) == 1 && !pipe->tmp_page) pipe->tmp_page = page; - return; - } - - /* - * Otherwise just release our reference to it - */ - page_cache_release(page); + else + page_cache_release(page); } -static void *anon_pipe_buf_map(struct file *file, struct pipe_inode_info *pipe, struct pipe_buffer *buf) +static void * anon_pipe_buf_map(struct file *file, struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { return kmap(buf->page); } -static void anon_pipe_buf_unmap(struct pipe_inode_info *pipe, struct pipe_buffer *buf) +static void anon_pipe_buf_unmap(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { kunmap(buf->page); } @@ -182,7 +181,8 @@ pipe_readv(struct file *filp, const struct iovec *_iov, error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars); ops->unmap(pipe, buf); if (unlikely(error)) { - if (!ret) ret = -EFAULT; + if (!ret) + ret = -EFAULT; break; } ret += chars; @@ -218,7 +218,8 @@ pipe_readv(struct file *filp, const struct iovec *_iov, } } if (signal_pending(current)) { - if (!ret) ret = -ERESTARTSYS; + if (!ret) + ret = -ERESTARTSYS; break; } if (do_wakeup) { @@ -228,7 +229,8 @@ pipe_readv(struct file *filp, const struct iovec *_iov, pipe_wait(pipe); } mutex_unlock(&inode->i_mutex); - /* Signal writers asynchronously that there is more room. */ + + /* Signal writers asynchronously that there is more room. 
*/ if (do_wakeup) { wake_up_interruptible(&pipe->wait); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); @@ -242,6 +244,7 @@ static ssize_t pipe_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { struct iovec iov = { .iov_base = buf, .iov_len = count }; + return pipe_readv(filp, &iov, 1, ppos); } @@ -276,10 +279,12 @@ pipe_writev(struct file *filp, const struct iovec *_iov, /* We try to merge small writes */ chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ if (pipe->nrbufs && chars != 0) { - int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & (PIPE_BUFFERS-1); + int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & + (PIPE_BUFFERS-1); struct pipe_buffer *buf = pipe->bufs + lastbuf; struct pipe_buf_operations *ops = buf->ops; int offset = buf->offset + buf->len; + if (ops->can_merge && offset + chars <= PAGE_SIZE) { void *addr; int error; @@ -306,9 +311,11 @@ pipe_writev(struct file *filp, const struct iovec *_iov, for (;;) { int bufs; + if (!pipe->readers) { send_sig(SIGPIPE, current, 0); - if (!ret) ret = -EPIPE; + if (!ret) + ret = -EPIPE; break; } bufs = pipe->nrbufs; @@ -326,7 +333,7 @@ pipe_writev(struct file *filp, const struct iovec *_iov, } pipe->tmp_page = page; } - /* Always wakeup, even if the copy fails. Otherwise + /* Always wake up, even if the copy fails. Otherwise * we lock up (O_NONBLOCK-)readers that sleep due to * syscall merging. * FIXME! Is this really true? 
@@ -339,7 +346,8 @@ pipe_writev(struct file *filp, const struct iovec *_iov, error = pipe_iov_copy_from_user(kmap(page), iov, chars); kunmap(page); if (unlikely(error)) { - if (!ret) ret = -EFAULT; + if (!ret) + ret = -EFAULT; break; } ret += chars; @@ -359,11 +367,13 @@ pipe_writev(struct file *filp, const struct iovec *_iov, if (bufs < PIPE_BUFFERS) continue; if (filp->f_flags & O_NONBLOCK) { - if (!ret) ret = -EAGAIN; + if (!ret) + ret = -EAGAIN; break; } if (signal_pending(current)) { - if (!ret) ret = -ERESTARTSYS; + if (!ret) + ret = -ERESTARTSYS; break; } if (do_wakeup) { @@ -391,6 +401,7 @@ pipe_write(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) { struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; + return pipe_writev(filp, &iov, 1, ppos); } @@ -401,7 +412,8 @@ bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos) } static ssize_t -bad_pipe_w(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) +bad_pipe_w(struct file *filp, const char __user *buf, size_t count, + loff_t *ppos) { return -EBADF; } @@ -475,6 +487,7 @@ pipe_release(struct inode *inode, int decr, int decw) pipe = inode->i_pipe; pipe->readers -= decr; pipe->writers -= decw; + if (!pipe->readers && !pipe->writers) { free_pipe_info(inode); } else { @@ -525,14 +538,15 @@ static int pipe_rdwr_fasync(int fd, struct file *filp, int on) { struct inode *inode = filp->f_dentry->d_inode; + struct pipe_inode_info *pipe = inode->i_pipe; int retval; mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers); + retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); if (retval >= 0) - retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers); + retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); mutex_unlock(&inode->i_mutex); @@ -720,6 +734,7 @@ static int pipefs_delete_dentry(struct dentry *dentry) { return 1; } + static struct dentry_operations 
pipefs_dentry_operations = { .d_delete = pipefs_delete_dentry, }; @@ -757,6 +772,7 @@ static struct inode * get_pipe_inode(void) fail_iput: iput(inode); + fail_inode: return NULL; } @@ -769,7 +785,7 @@ int do_pipe(int *fd) struct inode * inode; struct file *f1, *f2; int error; - int i,j; + int i, j; error = -ENFILE; f1 = get_empty_filp(); @@ -802,6 +818,7 @@ int do_pipe(int *fd) dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &this); if (!dentry) goto close_f12_inode_i_j; + dentry->d_op = &pipefs_dentry_operations; d_add(dentry, inode); f1->f_vfsmnt = f2->f_vfsmnt = mntget(mntget(pipe_mnt)); @@ -825,6 +842,7 @@ int do_pipe(int *fd) fd_install(j, f2); fd[0] = i; fd[1] = j; + return 0; close_f12_inode_i_j: @@ -849,8 +867,9 @@ no_files: * d_name - pipe: will go nicely and kill the special-casing in procfs. */ -static struct super_block *pipefs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static struct super_block * +pipefs_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) { return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC); } @@ -864,6 +883,7 @@ static struct file_system_type pipe_fs_type = { static int __init init_pipe_fs(void) { int err = register_filesystem(&pipe_fs_type); + if (!err) { pipe_mnt = kern_mount(&pipe_fs_type); if (IS_ERR(pipe_mnt)) { -- cgit v1.2.3 From 29ff2db55196717e2e67e0f04adc833ee7edd491 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 10 Apr 2006 22:52:46 -0700 Subject: [PATCH] select() warning fixes fs/select.c: In function `core_sys_select': fs/select.c:339: warning: assignment from incompatible pointer type fs/select.c:376: warning: comparison of distinct pointer types lacks a cast By using a void* we can remove lots of casts rather than adding more. 
Cc: Jes Sorensen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/select.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/select.c b/fs/select.c index 071660fa7b01..fce0fd1bb1d1 100644 --- a/fs/select.c +++ b/fs/select.c @@ -310,7 +310,7 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, s64 *timeout) { fd_set_bits fds; - char *bits; + void *bits; int ret, size, max_fdset; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ @@ -341,12 +341,12 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, bits = kmalloc(6 * size, GFP_KERNEL); if (!bits) goto out_nofds; - fds.in = (unsigned long *) bits; - fds.out = (unsigned long *) (bits + size); - fds.ex = (unsigned long *) (bits + 2*size); - fds.res_in = (unsigned long *) (bits + 3*size); - fds.res_out = (unsigned long *) (bits + 4*size); - fds.res_ex = (unsigned long *) (bits + 5*size); + fds.in = bits; + fds.out = bits + size; + fds.ex = bits + 2*size; + fds.res_in = bits + 3*size; + fds.res_out = bits + 4*size; + fds.res_ex = bits + 5*size; if ((ret = get_fd_set(n, inp, fds.in)) || (ret = get_fd_set(n, outp, fds.out)) || -- cgit v1.2.3 From 7b04d7170e9af805cac19f97b28fff10db897893 Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Mon, 10 Apr 2006 22:53:27 -0700 Subject: [PATCH] Add GFP_NOWAIT Introduce GFP_NOWAIT, as an alias for GFP_ATOMIC & ~__GFP_HIGH. This also changes XFS, which is the only in-tree user of this idiom that I could find. The XFS piece is compile-tested only. 
Signed-off-by: Jeff Dike Acked-by: Nathan Scott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/xfs/linux-2.6/xfs_buf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 9fb0312665ca..26fed0756f01 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -182,7 +182,7 @@ free_address( { a_list_t *aentry; - aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH); + aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT); if (likely(aentry)) { spin_lock(&as_lock); aentry->next = as_free_head; -- cgit v1.2.3 From 5246d0503130fa58904c8beb987fcf93b96d8ab6 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 10 Apr 2006 22:53:57 -0700 Subject: [PATCH] sync_file_range(): use unsigned for flags Ulrich suggested that the `flags' arg to sync_file_range() become unsigned. Cc: Ulrich Drepper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/sync.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/sync.c b/fs/sync.c index 8616006d2094..aab5ffe77e9f 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -61,7 +61,7 @@ * will be available after a crash. 
*/ asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes, - int flags) + unsigned int flags) { int ret; struct file *file; @@ -126,7 +126,7 @@ out: * `endbyte' is inclusive */ int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, - int flags) + unsigned int flags) { int ret; struct address_space *mapping; -- cgit v1.2.3 From f6422f17d3a480f21917a3895e2a46b968f56a08 Mon Sep 17 00:00:00 2001 From: Herbert Poetzl Date: Mon, 10 Apr 2006 22:54:03 -0700 Subject: [PATCH] vfs: propagate mnt_flags into do_loopback/vfsmount The mnt_flags are propagated into do_loopback(), so that they can be stored with the vfsmount Signed-off-by: Herbert Poetzl Acked-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namespace.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index bf478addb852..2c5f1f80bdc2 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -899,11 +899,13 @@ static int do_change_type(struct nameidata *nd, int flag) /* * do loopback mount. 
*/ -static int do_loopback(struct nameidata *nd, char *old_name, int recurse) +static int do_loopback(struct nameidata *nd, char *old_name, unsigned long flags, int mnt_flags) { struct nameidata old_nd; struct vfsmount *mnt = NULL; + int recurse = flags & MS_REC; int err = mount_is_safe(nd); + if (err) return err; if (!old_name || !*old_name) @@ -937,6 +939,7 @@ static int do_loopback(struct nameidata *nd, char *old_name, int recurse) spin_unlock(&vfsmount_lock); release_mounts(&umount_list); } + mnt->mnt_flags = mnt_flags; out: up_write(&namespace_sem); @@ -1350,7 +1353,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, data_page); else if (flags & MS_BIND) - retval = do_loopback(&nd, dev_name, flags & MS_REC); + retval = do_loopback(&nd, dev_name, flags, mnt_flags); else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) retval = do_change_type(&nd, flags); else if (flags & MS_MOVE) -- cgit v1.2.3 From 00fbc6dfe7c4487f812829bff79c3121c8fd3bca Mon Sep 17 00:00:00 2001 From: Eric Van Hensbergen Date: Mon, 10 Apr 2006 22:54:06 -0700 Subject: [PATCH] 9p: handle sget() failure Handle a failing sget() in v9fs_get_sb(). 
Signed-off-by: Christoph Hellwig Signed-off-by: Eric Van Hensbergen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_super.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index b0a0ae509c00..61c599b4a1e3 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -127,12 +127,13 @@ static struct super_block *v9fs_get_sb(struct file_system_type if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) { dprintk(DEBUG_ERROR, "problem initiating session\n"); - kfree(v9ses); - return ERR_PTR(newfid); + sb = ERR_PTR(newfid); + goto out_free_session; } sb = sget(fs_type, NULL, v9fs_set_super, v9ses); - + if (IS_ERR(sb)) + goto out_close_session; v9fs_fill_super(sb, v9ses, flags); inode = v9fs_get_inode(sb, S_IFDIR | mode); @@ -185,6 +186,12 @@ static struct super_block *v9fs_get_sb(struct file_system_type return sb; +out_close_session: + v9fs_session_close(v9ses); +out_free_session: + kfree(v9ses); + return sb; + put_back_sb: /* deactivate_super calls v9fs_kill_super which will frees the rest */ up_write(&sb->s_umount); -- cgit v1.2.3 From b04eb6aa08ecc3e24df2f78ebc486011ebd74feb Mon Sep 17 00:00:00 2001 From: Mitchell Blank Jr Date: Mon, 10 Apr 2006 22:54:08 -0700 Subject: [PATCH] select: don't overflow if (SELECT_STACK_ALLOC % sizeof(long) != 0) If SELECT_STACK_ALLOC is not a multiple of sizeof(long) then stack_fds[] would be shorter than SELECT_STACK_ALLOC bytes and could overflow later in the function. Fixed by simply rearranging the test later to work on sizeof(stack_fds) Currently SELECT_STACK_ALLOC is 256 so this doesn't happen, but it's nasty to have things like this hidden in the code. What if later someone decides to change SELECT_STACK_ALLOC to 300? 
Signed-off-by: Mitchell Blank Jr Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/select.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/select.c b/fs/select.c index fce0fd1bb1d1..a8109baa5e46 100644 --- a/fs/select.c +++ b/fs/select.c @@ -311,7 +311,8 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, { fd_set_bits fds; void *bits; - int ret, size, max_fdset; + int ret, max_fdset; + unsigned int size; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; @@ -333,14 +334,15 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, * since we used fdset we need to allocate memory in units of * long-words. */ - ret = -ENOMEM; size = FDS_BYTES(n); - if (6*size < SELECT_STACK_ALLOC) - bits = stack_fds; - else + bits = stack_fds; + if (size > sizeof(stack_fds) / 6) { + /* Not enough space in on-stack array; must use kmalloc */ + ret = -ENOMEM; bits = kmalloc(6 * size, GFP_KERNEL); - if (!bits) - goto out_nofds; + if (!bits) + goto out_nofds; + } fds.in = bits; fds.out = bits + size; fds.ex = bits + 2*size; -- cgit v1.2.3 From 80e8ff634169be3fc2ac48f258cc7638e898cd46 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 10 Apr 2006 22:54:10 -0700 Subject: [PATCH] kdump proc vmcore size oveflow fix A couple of /proc/vmcore data structures overflow with 32bit systems having memory more than 4G. This patch fixes those. 
Signed-off-by: Ken'ichi Ohmichi Signed-off-by: Vivek Goyal Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 7efa73d44c9a..20d4b2237fce 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -103,8 +103,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) { ssize_t acc = 0, tmp; - size_t tsz, nr_bytes; - u64 start; + size_t tsz; + u64 start, nr_bytes; struct vmcore *curr_m = NULL; if (buflen == 0 || *fpos >= vmcore_size) -- cgit v1.2.3 From 2395140ee2bffe38b1c8a59318f62882b797f5e6 Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Mon, 10 Apr 2006 22:54:12 -0700 Subject: [PATCH] uniform POLLRDHUP handling between epoll and poll/select As reported by Michael Kerrisk, POLLRDHUP handling was not consistent between epoll and poll/select, since in epoll it was unmaskable. This patch brings uniformity in POLLRDHUP handling. 
Signed-off-by: Davide Libenzi Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 242fe1a66ce5..1b4491cdd115 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -599,7 +599,7 @@ sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event) switch (op) { case EPOLL_CTL_ADD: if (!epi) { - epds.events |= POLLERR | POLLHUP | POLLRDHUP; + epds.events |= POLLERR | POLLHUP; error = ep_insert(ep, &epds, tfile, fd); } else @@ -613,7 +613,7 @@ sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event) break; case EPOLL_CTL_MOD: if (epi) { - epds.events |= POLLERR | POLLHUP | POLLRDHUP; + epds.events |= POLLERR | POLLHUP; error = ep_modify(ep, epi, &epds); } else error = -ENOENT; -- cgit v1.2.3 From f5e902817fee1589badca1284f49eecc0ef0c200 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Mon, 10 Apr 2006 22:54:16 -0700 Subject: [PATCH] process accounting: take original leader's start_time in non-leader exec The only record we have of the real-time age of a process, regardless of execs it's done, is start_time. When a non-leader thread execs, the original start_time of the process is lost. Things looking at the real-time age of the process are fooled, for example the process accounting record when the process finally dies. This change makes the oldest start_time stick around with the process after a non-leader exec. This way the association between PID and start_time is kept constant, which seems correct to me.
Signed-off-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 4d38ad0b70d6..3234a0c32d54 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -678,6 +678,18 @@ static int de_thread(struct task_struct *tsk) while (leader->exit_state != EXIT_ZOMBIE) yield(); + /* + * The only record we have of the real-time age of a + * process, regardless of execs it's done, is start_time. + * All the past CPU time is accumulated in signal_struct + * from sister threads now dead. But in this non-leader + * exec, nothing survives from the original leader thread, + * whose birth marks the true age of this process now. + * When we take on its identity by switching to its PID, we + * also take its birthdate (always earlier than our own). + */ + current->start_time = leader->start_time; + spin_lock(&leader->proc_lock); spin_lock(&current->proc_lock); proc_dentry1 = proc_pid_unhash(current); -- cgit v1.2.3 From 68250ba5df4c9d00d3064a0ba9a894035436916b Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Mon, 10 Apr 2006 22:54:30 -0700 Subject: [PATCH] kdump: enable CONFIG_PROC_VMCORE by default Everybody seems to be using /proc/vmcore as a method to access the kernel crash dump. Hence probably it makes sense to enable CONFIG_PROC_VMCORE by default if CONFIG_CRASH_DUMP is selected. This makes kdump configuration further easier for a user. Signed-off-by: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 97f317413122..2524629dc835 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -799,6 +799,7 @@ config PROC_KCORE config PROC_VMCORE bool "/proc/vmcore support (EXPERIMENTAL)" depends on PROC_FS && EXPERIMENTAL && CRASH_DUMP + default y help Exports the dump image of crashed kernel in ELF format.
-- cgit v1.2.3 From 091e881d0e55496d8887b61446ae1c598b0995b6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 10 Apr 2006 22:54:31 -0700 Subject: [PATCH] inotify: check for NULL inode in inotify_d_instantiate The spufs file system creates files in a directory before instantiating the directory itself, which causes a NULL pointer access in inotify_d_instantiate since c32ccd87bfd1414b0aabfcd8dbc7539ad23bcbaa. I'd like to keep this behavior since it means that the user will not have access to files in the directory before I know that I succeed in creating everything in it. This patch adds a simple check for the inode to keep that working. Signed-off-by: Arnd Bergmann Acked-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inotify.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/inotify.c b/fs/inotify.c index 367c487c014b..1f50302849c5 100644 --- a/fs/inotify.c +++ b/fs/inotify.c @@ -538,7 +538,7 @@ void inotify_d_instantiate(struct dentry *entry, struct inode *inode) WARN_ON(entry->d_flags & DCACHE_INOTIFY_PARENT_WATCHED); spin_lock(&entry->d_lock); parent = entry->d_parent; - if (inotify_inode_watched(parent->d_inode)) + if (parent->d_inode && inotify_inode_watched(parent->d_inode)) entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED; spin_unlock(&entry->d_lock); } -- cgit v1.2.3 From 389ed39b9711bbe5210d5e118e1f1af36ca88b7c Mon Sep 17 00:00:00 2001 From: "Ananiev, Leonid I" Date: Mon, 10 Apr 2006 22:54:38 -0700 Subject: [PATCH] ext3: Fix missed mutex unlock Missed unlock_super() call is added in error condition code path.
Signed-off-by: Leonid Ananiev Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/resize.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 1041dab6de2f..14f5f6ea3e72 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -974,6 +974,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { ext3_warning(sb, __FUNCTION__, "multiple resizers run on filesystem!"); + unlock_super(sb); err = -EBUSY; goto exit_put; } -- cgit v1.2.3 From d3406ffa4af8af1d7c14cff06e003eb0a557d4ad Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 10 Apr 2006 22:54:49 -0700 Subject: [PATCH] fuse: fix oops in fuse_send_readpages() During heavy parallel filesystem activity it was possible to Oops the kernel. The reason is that read_cache_pages() could skip pages which have already been inserted into the cache by another task. Occasionally this may result in zero pages actually being sent, while fuse_send_readpages() relies on at least one page being in the request. So check this corner case and just free the request instead of trying to send it. Reported and tested by Konstantin Isakov. 
Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/file.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 975f2697e866..3ac39c0288de 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -397,8 +397,12 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, return -EINTR; err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); - if (!err) - fuse_send_readpages(data.req, file, inode); + if (!err) { + if (data.req->num_pages) + fuse_send_readpages(data.req, file, inode); + else + fuse_put_request(fc, data.req); + } return err; } -- cgit v1.2.3 From 7025d9ad10a38dadef8b286e0092731c2d3cdc53 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 10 Apr 2006 22:54:50 -0700 Subject: [PATCH] fuse: fix fuse_dev_poll() return value fuse_dev_poll() returned an error value instead of a poll mask. Luckily (or unluckily) -ENODEV does contain the POLLERR bit. There's also a race if filesystem is unmounted between fuse_get_conn() and spin_lock(), in which case this event will be missed by poll(). 
Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 23d1f52eb1b8..b2e8613a26d8 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -804,17 +804,18 @@ static ssize_t fuse_dev_write(struct file *file, const char __user *buf, static unsigned fuse_dev_poll(struct file *file, poll_table *wait) { - struct fuse_conn *fc = fuse_get_conn(file); unsigned mask = POLLOUT | POLLWRNORM; - + struct fuse_conn *fc = fuse_get_conn(file); if (!fc) - return -ENODEV; + return POLLERR; poll_wait(file, &fc->waitq, wait); spin_lock(&fuse_lock); - if (!list_empty(&fc->pending)) - mask |= POLLIN | POLLRDNORM; + if (!fc->connected) + mask = POLLERR; + else if (!list_empty(&fc->pending)) + mask |= POLLIN | POLLRDNORM; spin_unlock(&fuse_lock); return mask; -- cgit v1.2.3 From 385a17bfc3cb035333c8a91eddc78a6e04c4625e Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Mon, 10 Apr 2006 22:54:52 -0700 Subject: [PATCH] fuse: add O_ASYNC support to FUSE device This adds asynchronous notification to FUSE - a FUSE server can request O_ASYNC on a /dev/fuse file descriptor and receive SIGIO when there is input available. One subtlety - fuse_dev_fasync, which is called when O_ASYNC is requested, does no locking, unlike the other methods. I think it's unnecessary, as the fuse_conn.fasync list is manipulated only by fasync_helper and kill_fasync, which provide their own locking. It would also be wrong to use the fuse_lock, as it's a spin lock and fasync_helper can sleep. My one concern with this is the fuse_conn going away underneath fuse_dev_fasync - sys_fcntl takes a reference on the file struct, so this seems not to be a problem.
Signed-off-by: Jeff Dike Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 17 ++++++++++++++++- fs/fuse/fuse_i.h | 3 +++ fs/fuse/inode.c | 1 + 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b2e8613a26d8..438770f8867f 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -317,6 +317,7 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req) list_add_tail(&req->list, &fc->pending); req->state = FUSE_REQ_PENDING; wake_up(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); } /* @@ -901,6 +902,7 @@ void fuse_abort_conn(struct fuse_conn *fc) end_requests(fc, &fc->pending); end_requests(fc, &fc->processing); wake_up_all(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); } spin_unlock(&fuse_lock); } @@ -917,12 +919,24 @@ static int fuse_dev_release(struct inode *inode, struct file *file) end_requests(fc, &fc->processing); } spin_unlock(&fuse_lock); - if (fc) + if (fc) { + fasync_helper(-1, file, 0, &fc->fasync); kobject_put(&fc->kobj); + } return 0; } +static int fuse_dev_fasync(int fd, struct file *file, int on) +{ + struct fuse_conn *fc = fuse_get_conn(file); + if (!fc) + return -ENODEV; + + /* No locking - fasync_helper does its own locking */ + return fasync_helper(fd, file, on, &fc->fasync); +} + const struct file_operations fuse_dev_operations = { .owner = THIS_MODULE, .llseek = no_llseek, @@ -932,6 +946,7 @@ const struct file_operations fuse_dev_operations = { .writev = fuse_dev_writev, .poll = fuse_dev_poll, .release = fuse_dev_release, + .fasync = fuse_dev_fasync, }; static struct miscdevice fuse_miscdevice = { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index a16a04fcf41e..e5cb46b78437 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -318,6 +318,9 @@ struct fuse_conn { /** kobject */ struct kobject kobj; + + /** O_ASYNC requests */ + struct fasync_struct *fasync; }; static inline struct fuse_conn 
*get_fuse_conn_super(struct super_block *sb) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 879e6fba9480..78700cbb9cdf 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -216,6 +216,7 @@ static void fuse_put_super(struct super_block *sb) spin_unlock(&fuse_lock); up_write(&fc->sbput_sem); /* Flush all readers on this fs */ + kill_fasync(&fc->fasync, SIGIO, POLL_IN); wake_up_all(&fc->waitq); kobject_del(&fc->kobj); kobject_put(&fc->kobj); -- cgit v1.2.3 From e5ac1d1e70a8c19a65a959d73650203df7a2e168 Mon Sep 17 00:00:00 2001 From: Jeff Dike Date: Mon, 10 Apr 2006 22:54:53 -0700 Subject: [PATCH] fuse: add O_NONBLOCK support to FUSE device I don't like duplicating the connected and list_empty tests in fuse_dev_readv, but this seemed cleaner than adding the f_flags test to request_wait. Signed-off-by: Jeff Dike Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 438770f8867f..75c6e9166c39 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -619,6 +619,12 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, err = -EPERM; if (!fc) goto err_unlock; + + err = -EAGAIN; + if ((file->f_flags & O_NONBLOCK) && fc->connected && + list_empty(&fc->pending)) + goto err_unlock; + request_wait(fc); err = -ENODEV; if (!fc->connected) -- cgit v1.2.3 From 0720b315976447cba3f0c3e211223b8cb82b0f93 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 10 Apr 2006 22:54:55 -0700 Subject: [PATCH] fuse: simplify locking This is in preparation for removing the global spinlock in favor of a per-mount one. The only critical part is the interaction between fuse_dev_release() and fuse_fill_super(): fuse_dev_release() must see the assignment to file->private_data, otherwise it will leak the reference to fuse_conn. 
This is ensured by the fput() operation, which will synchronize the assignment with other CPU's that may do a final fput() soon after this. Also redundant locking is removed from fuse_fill_super(), where exclusion is already ensured by the BKL held for this function by the VFS. Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 31 ++++++++++------------------ fs/fuse/inode.c | 64 ++++++++++++++++++++------------------------------------- 2 files changed, 33 insertions(+), 62 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 75c6e9166c39..c510533c6849 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -23,13 +23,11 @@ static kmem_cache_t *fuse_req_cachep; static struct fuse_conn *fuse_get_conn(struct file *file) { - struct fuse_conn *fc; - spin_lock(&fuse_lock); - fc = file->private_data; - if (fc && !fc->connected) - fc = NULL; - spin_unlock(&fuse_lock); - return fc; + /* + * Lockless access is OK, because file->private data is set + * once during mount and is valid until the file is released. 
+ */ + return file->private_data; } static void fuse_request_init(struct fuse_req *req) @@ -607,19 +605,16 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, unsigned long nr_segs, loff_t *off) { int err; - struct fuse_conn *fc; struct fuse_req *req; struct fuse_in *in; struct fuse_copy_state cs; unsigned reqsize; + struct fuse_conn *fc = fuse_get_conn(file); + if (!fc) + return -EPERM; restart: spin_lock(&fuse_lock); - fc = file->private_data; - err = -EPERM; - if (!fc) - goto err_unlock; - err = -EAGAIN; if ((file->f_flags & O_NONBLOCK) && fc->connected && list_empty(&fc->pending)) @@ -915,17 +910,13 @@ void fuse_abort_conn(struct fuse_conn *fc) static int fuse_dev_release(struct inode *inode, struct file *file) { - struct fuse_conn *fc; - - spin_lock(&fuse_lock); - fc = file->private_data; + struct fuse_conn *fc = fuse_get_conn(file); if (fc) { + spin_lock(&fuse_lock); fc->connected = 0; end_requests(fc, &fc->pending); end_requests(fc, &fc->processing); - } - spin_unlock(&fuse_lock); - if (fc) { + spin_unlock(&fuse_lock); fasync_helper(-1, file, 0, &fc->fasync); kobject_put(&fc->kobj); } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 78700cbb9cdf..620579a69107 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -414,37 +414,6 @@ static struct fuse_conn *new_conn(void) return fc; } -static struct fuse_conn *get_conn(struct file *file, struct super_block *sb) -{ - struct fuse_conn *fc; - int err; - - err = -EINVAL; - if (file->f_op != &fuse_dev_operations) - goto out_err; - - err = -ENOMEM; - fc = new_conn(); - if (!fc) - goto out_err; - - spin_lock(&fuse_lock); - err = -EINVAL; - if (file->private_data) - goto out_unlock; - - kobject_get(&fc->kobj); - file->private_data = fc; - spin_unlock(&fuse_lock); - return fc; - - out_unlock: - spin_unlock(&fuse_lock); - kobject_put(&fc->kobj); - out_err: - return ERR_PTR(err); -} - static struct inode *get_root_inode(struct super_block *sb, unsigned mode) { struct fuse_attr attr; @@ -526,12 
+495,9 @@ static void fuse_send_init(struct fuse_conn *fc) static unsigned long long conn_id(void) { + /* BKL is held for ->get_sb() */ static unsigned long long ctr = 1; - unsigned long long val; - spin_lock(&fuse_lock); - val = ctr++; - spin_unlock(&fuse_lock); - return val; + return ctr++; } static int fuse_fill_super(struct super_block *sb, void *data, int silent) @@ -556,10 +522,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (!file) return -EINVAL; - fc = get_conn(file, sb); - fput(file); - if (IS_ERR(fc)) - return PTR_ERR(fc); + if (file->f_op != &fuse_dev_operations) + return -EINVAL; + + /* Setting file->private_data can't race with other mount() + instances, since BKL is held for ->get_sb() */ + if (file->private_data) + return -EINVAL; + + fc = new_conn(); + if (!fc) + return -ENOMEM; fc->flags = d.flags; fc->user_id = d.user_id; @@ -589,10 +562,16 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) goto err_put_root; sb->s_root = root_dentry; - spin_lock(&fuse_lock); fc->mounted = 1; fc->connected = 1; - spin_unlock(&fuse_lock); + kobject_get(&fc->kobj); + file->private_data = fc; + /* + * atomic_dec_and_test() in fput() provides the necessary + * memory barrier for file->private_data to be visible on all + * CPUs after this + */ + fput(file); fuse_send_init(fc); @@ -601,6 +580,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) err_put_root: dput(root_dentry); err: + fput(file); kobject_put(&fc->kobj); return err; } -- cgit v1.2.3 From d713311464bcca73c990d1a1b5c9467eae87f5b4 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 10 Apr 2006 22:54:55 -0700 Subject: [PATCH] fuse: use a per-mount spinlock Remove the global spinlock in favor of a per-mount one. This patch is basically find & replace. The difficult part has already been done by the previous patch. 
Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 122 ++++++++++++++++++++++++++++--------------------------- fs/fuse/fuse_i.h | 24 +++-------- fs/fuse/inode.c | 12 +++--- 3 files changed, 74 insertions(+), 84 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c510533c6849..63d2cf43b5e3 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi + Copyright (C) 2001-2006 Miklos Szeredi This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -94,11 +94,11 @@ static struct fuse_req *do_get_request(struct fuse_conn *fc) { struct fuse_req *req; - spin_lock(&fuse_lock); + spin_lock(&fc->lock); BUG_ON(list_empty(&fc->unused_list)); req = list_entry(fc->unused_list.next, struct fuse_req, list); list_del_init(&req->list); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); fuse_request_init(req); req->preallocated = 1; req->in.h.uid = current->fsuid; @@ -124,7 +124,7 @@ struct fuse_req *fuse_get_request(struct fuse_conn *fc) return do_get_request(fc); } -/* Must be called with fuse_lock held */ +/* Must be called with fc->lock held */ static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req) { if (req->preallocated) { @@ -143,9 +143,9 @@ static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req) void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { if (atomic_dec_and_test(&req->count)) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); fuse_putback_request(fc, req); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } } @@ -155,15 +155,15 @@ static void fuse_put_request_locked(struct fuse_conn *fc, struct fuse_req *req) fuse_putback_request(fc, req); } -void fuse_release_background(struct fuse_req *req) +void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req) { iput(req->inode); iput(req->inode2); if 
(req->file) fput(req->file); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); list_del(&req->bg_entry); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } /* @@ -182,7 +182,7 @@ void fuse_release_background(struct fuse_req *req) * interrupted and put in the background, it will return with an error * and hence never be reset and reused. * - * Called with fuse_lock, unlocks it + * Called with fc->lock, unlocks it */ static void request_end(struct fuse_conn *fc, struct fuse_req *req) { @@ -191,14 +191,14 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) if (!req->background) { wake_up(&req->waitq); fuse_put_request_locked(fc, req); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } else { void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; req->end = NULL; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); down_read(&fc->sbput_sem); if (fc->mounted) - fuse_release_background(req); + fuse_release_background(fc, req); up_read(&fc->sbput_sem); if (end) end(fc, req); @@ -248,16 +248,16 @@ static void background_request(struct fuse_conn *fc, struct fuse_req *req) get_file(req->file); } -/* Called with fuse_lock held. Releases, and then reacquires it. */ +/* Called with fc->lock held. Releases, and then reacquires it. */ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) { sigset_t oldset; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); block_sigs(&oldset); wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED); restore_sigs(&oldset); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (req->state == FUSE_REQ_FINISHED && !req->interrupted) return; @@ -271,9 +271,9 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) locked state, there mustn't be any filesystem operation (e.g. 
page fault), since that could lead to deadlock */ - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); wait_event(req->waitq, !req->locked); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); } if (req->state == FUSE_REQ_PENDING) { list_del(&req->list); @@ -324,7 +324,7 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req) void request_send(struct fuse_conn *fc, struct fuse_req *req) { req->isreply = 1; - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (!fc->connected) req->out.h.error = -ENOTCONN; else if (fc->conn_error) @@ -337,15 +337,15 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req) request_wait_answer(fc, req); } - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (fc->connected) { queue_request(fc, req); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } else { req->out.h.error = -ENOTCONN; request_end(fc, req); @@ -361,9 +361,9 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req) void request_send_background(struct fuse_conn *fc, struct fuse_req *req) { req->isreply = 1; - spin_lock(&fuse_lock); + spin_lock(&fc->lock); background_request(fc, req); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); request_send_nowait(fc, req); } @@ -372,16 +372,16 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req) * anything that could cause a page-fault. If the request was already * interrupted bail out. 
*/ -static int lock_request(struct fuse_req *req) +static int lock_request(struct fuse_conn *fc, struct fuse_req *req) { int err = 0; if (req) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (req->interrupted) err = -ENOENT; else req->locked = 1; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } return err; } @@ -391,18 +391,19 @@ static int lock_request(struct fuse_req *req) * requester thread is currently waiting for it to be unlocked, so * wake it up. */ -static void unlock_request(struct fuse_req *req) +static void unlock_request(struct fuse_conn *fc, struct fuse_req *req) { if (req) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); req->locked = 0; if (req->interrupted) wake_up(&req->waitq); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } } struct fuse_copy_state { + struct fuse_conn *fc; int write; struct fuse_req *req; const struct iovec *iov; @@ -415,11 +416,12 @@ struct fuse_copy_state { unsigned len; }; -static void fuse_copy_init(struct fuse_copy_state *cs, int write, - struct fuse_req *req, const struct iovec *iov, - unsigned long nr_segs) +static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc, + int write, struct fuse_req *req, + const struct iovec *iov, unsigned long nr_segs) { memset(cs, 0, sizeof(*cs)); + cs->fc = fc; cs->write = write; cs->req = req; cs->iov = iov; @@ -449,7 +451,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) unsigned long offset; int err; - unlock_request(cs->req); + unlock_request(cs->fc, cs->req); fuse_copy_finish(cs); if (!cs->seglen) { BUG_ON(!cs->nr_segs); @@ -472,7 +474,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) cs->seglen -= cs->len; cs->addr += cs->len; - return lock_request(cs->req); + return lock_request(cs->fc, cs->req); } /* Do as much copy to/from userspace buffer as we can */ @@ -584,9 +586,9 @@ static void request_wait(struct fuse_conn *fc) if (signal_pending(current)) break; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); schedule(); - 
spin_lock(&fuse_lock); + spin_lock(&fc->lock); } set_current_state(TASK_RUNNING); remove_wait_queue(&fc->waitq, &wait); @@ -614,7 +616,7 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, return -EPERM; restart: - spin_lock(&fuse_lock); + spin_lock(&fc->lock); err = -EAGAIN; if ((file->f_flags & O_NONBLOCK) && fc->connected && list_empty(&fc->pending)) @@ -643,14 +645,14 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, request_end(fc, req); goto restart; } - spin_unlock(&fuse_lock); - fuse_copy_init(&cs, 1, req, iov, nr_segs); + spin_unlock(&fc->lock); + fuse_copy_init(&cs, fc, 1, req, iov, nr_segs); err = fuse_copy_one(&cs, &in->h, sizeof(in->h)); if (!err) err = fuse_copy_args(&cs, in->numargs, in->argpages, (struct fuse_arg *) in->args, 0); fuse_copy_finish(&cs); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); req->locked = 0; if (!err && req->interrupted) err = -ENOENT; @@ -665,12 +667,12 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, else { req->state = FUSE_REQ_SENT; list_move_tail(&req->list, &fc->processing); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } return reqsize; err_unlock: - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); return err; } @@ -739,7 +741,7 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, if (!fc) return -ENODEV; - fuse_copy_init(&cs, 0, NULL, iov, nr_segs); + fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs); if (nbytes < sizeof(struct fuse_out_header)) return -EINVAL; @@ -751,7 +753,7 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, oh.len != nbytes) goto err_finish; - spin_lock(&fuse_lock); + spin_lock(&fc->lock); err = -ENOENT; if (!fc->connected) goto err_unlock; @@ -762,9 +764,9 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, goto err_unlock; if (req->interrupted) { - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); fuse_copy_finish(&cs); - 
spin_lock(&fuse_lock); + spin_lock(&fc->lock); request_end(fc, req); return -ENOENT; } @@ -772,12 +774,12 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, req->out.h = oh; req->locked = 1; cs.req = req; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); err = copy_out_args(&cs, &req->out, nbytes); fuse_copy_finish(&cs); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); req->locked = 0; if (!err) { if (req->interrupted) @@ -789,7 +791,7 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, return err ? err : nbytes; err_unlock: - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); err_finish: fuse_copy_finish(&cs); return err; @@ -813,12 +815,12 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait) poll_wait(file, &fc->waitq, wait); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (!fc->connected) mask = POLLERR; else if (!list_empty(&fc->pending)) mask |= POLLIN | POLLRDNORM; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); return mask; } @@ -826,7 +828,7 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait) /* * Abort all requests on the given list (pending or processing) * - * This function releases and reacquires fuse_lock + * This function releases and reacquires fc->lock */ static void end_requests(struct fuse_conn *fc, struct list_head *head) { @@ -835,7 +837,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head) req = list_entry(head->next, struct fuse_req, list); req->out.h.error = -ECONNABORTED; request_end(fc, req); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); } } @@ -866,10 +868,10 @@ static void end_io_requests(struct fuse_conn *fc) req->end = NULL; /* The end function will consume this reference */ __fuse_get_request(req); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); wait_event(req->waitq, !req->locked); end(fc, req); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); } } } @@ -896,7 +898,7 @@ static void end_io_requests(struct 
fuse_conn *fc) */ void fuse_abort_conn(struct fuse_conn *fc) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (fc->connected) { fc->connected = 0; end_io_requests(fc); @@ -905,18 +907,18 @@ void fuse_abort_conn(struct fuse_conn *fc) wake_up_all(&fc->waitq); kill_fasync(&fc->fasync, SIGIO, POLL_IN); } - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } static int fuse_dev_release(struct inode *inode, struct file *file) { struct fuse_conn *fc = fuse_get_conn(file); if (fc) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); fc->connected = 0; end_requests(fc, &fc->pending); end_requests(fc, &fc->processing); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); fasync_helper(-1, file, 0, &fc->fasync); kobject_put(&fc->kobj); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index e5cb46b78437..6ed812fd6200 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi + Copyright (C) 2001-2006 Miklos Szeredi This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -144,7 +144,7 @@ struct fuse_req { /* * The following bitfields are either set once before the * request is queued or setting/clearing them is protected by - * fuse_lock + * fuse_conn->lock */ /** True if the request has reply */ @@ -213,6 +213,9 @@ struct fuse_req { * unmounted. 
*/ struct fuse_conn { + /** Lock protecting accessess to members of this structure */ + spinlock_t lock; + /** The user id for this mount */ uid_t user_id; @@ -351,21 +354,6 @@ static inline u64 get_node_id(struct inode *inode) /** Device operations */ extern const struct file_operations fuse_dev_operations; -/** - * This is the single global spinlock which protects FUSE's structures - * - * The following data is protected by this lock: - * - * - the private_data field of the device file - * - the s_fs_info field of the super block - * - unused_list, pending, processing lists in fuse_conn - * - background list in fuse_conn - * - the unique request ID counter reqctr in fuse_conn - * - the sb (super_block) field in fuse_conn - * - the file (device file) field in fuse_conn - */ -extern spinlock_t fuse_lock; - /** * Get a filled in inode */ @@ -490,7 +478,7 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req); /** * Release inodes and file associated with background request */ -void fuse_release_background(struct fuse_req *req); +void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req); /* Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 620579a69107..cc58debeabd4 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi + Copyright (C) 2001-2006 Miklos Szeredi This program can be distributed under the terms of the GNU GPL. See the file COPYING. 
@@ -22,7 +22,6 @@ MODULE_AUTHOR("Miklos Szeredi "); MODULE_DESCRIPTION("Filesystem in Userspace"); MODULE_LICENSE("GPL"); -spinlock_t fuse_lock; static kmem_cache_t *fuse_inode_cachep; static struct subsystem connections_subsys; @@ -207,13 +206,14 @@ static void fuse_put_super(struct super_block *sb) down_write(&fc->sbput_sem); while (!list_empty(&fc->background)) - fuse_release_background(list_entry(fc->background.next, + fuse_release_background(fc, + list_entry(fc->background.next, struct fuse_req, bg_entry)); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); fc->mounted = 0; fc->connected = 0; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); up_write(&fc->sbput_sem); /* Flush all readers on this fs */ kill_fasync(&fc->fasync, SIGIO, POLL_IN); @@ -388,6 +388,7 @@ static struct fuse_conn *new_conn(void) fc = kzalloc(sizeof(*fc), GFP_KERNEL); if (fc) { int i; + spin_lock_init(&fc->lock); init_waitqueue_head(&fc->waitq); INIT_LIST_HEAD(&fc->pending); INIT_LIST_HEAD(&fc->processing); @@ -734,7 +735,6 @@ static int __init fuse_init(void) printk("fuse init (API version %i.%i)\n", FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); - spin_lock_init(&fuse_lock); res = fuse_fs_init(); if (res) goto err; -- cgit v1.2.3 From a87046d822f2d982d25b24c4a644d34f22d4888a Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 10 Apr 2006 22:54:56 -0700 Subject: [PATCH] fuse: consolidate device errors Return consistent error values for the case when the opened device file has no mount associated yet. 
Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 63d2cf43b5e3..6b8843d4ad8c 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -739,7 +739,7 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, struct fuse_copy_state cs; struct fuse_conn *fc = fuse_get_conn(file); if (!fc) - return -ENODEV; + return -EPERM; fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs); if (nbytes < sizeof(struct fuse_out_header)) @@ -930,7 +930,7 @@ static int fuse_dev_fasync(int fd, struct file *file, int on) { struct fuse_conn *fc = fuse_get_conn(file); if (!fc) - return -ENODEV; + return -EPERM; /* No locking - fasync_helper does its own locking */ return fasync_helper(fd, file, on, &fc->fasync); -- cgit v1.2.3 From ce1d5a491f0ee50560416a73faa5e4ddbab074bd Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 10 Apr 2006 22:54:58 -0700 Subject: [PATCH] fuse: clean up request accounting FUSE allocated most requests from a fixed size pool filled at mount time. However in some cases (release/forget) non-pool requests were used. File locking operations aren't well served by the request pool, since they may block indefinitely thus exhausting the pool. This patch removes the request pool and always allocates requests on demand.
Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 73 +++++----------------------------- fs/fuse/dir.c | 118 +++++++++++++++++++++++++++---------------------------- fs/fuse/file.c | 48 +++++++++++----------- fs/fuse/fuse_i.h | 26 +++--------- fs/fuse/inode.c | 54 +++++++------------------ 5 files changed, 111 insertions(+), 208 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 6b8843d4ad8c..4dc104c0e95d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -72,10 +72,8 @@ static void restore_sigs(sigset_t *oldset) */ void fuse_reset_request(struct fuse_req *req) { - int preallocated = req->preallocated; BUG_ON(atomic_read(&req->count) != 1); fuse_request_init(req); - req->preallocated = preallocated; } static void __fuse_get_request(struct fuse_req *req) @@ -90,71 +88,28 @@ static void __fuse_put_request(struct fuse_req *req) atomic_dec(&req->count); } -static struct fuse_req *do_get_request(struct fuse_conn *fc) +struct fuse_req *fuse_get_req(struct fuse_conn *fc) { - struct fuse_req *req; + struct fuse_req *req = fuse_request_alloc(); + if (!req) + return ERR_PTR(-ENOMEM); - spin_lock(&fc->lock); - BUG_ON(list_empty(&fc->unused_list)); - req = list_entry(fc->unused_list.next, struct fuse_req, list); - list_del_init(&req->list); - spin_unlock(&fc->lock); + atomic_inc(&fc->num_waiting); fuse_request_init(req); - req->preallocated = 1; req->in.h.uid = current->fsuid; req->in.h.gid = current->fsgid; req->in.h.pid = current->pid; return req; } -/* This can return NULL, but only in case it's interrupted by a SIGKILL */ -struct fuse_req *fuse_get_request(struct fuse_conn *fc) -{ - int intr; - sigset_t oldset; - - atomic_inc(&fc->num_waiting); - block_sigs(&oldset); - intr = down_interruptible(&fc->outstanding_sem); - restore_sigs(&oldset); - if (intr) { - atomic_dec(&fc->num_waiting); - return NULL; - } - return do_get_request(fc); -} - -/* Must be called with fc->lock held */ -static 
void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req) -{ - if (req->preallocated) { - atomic_dec(&fc->num_waiting); - list_add(&req->list, &fc->unused_list); - } else - fuse_request_free(req); - - /* If we are in debt decrease that first */ - if (fc->outstanding_debt) - fc->outstanding_debt--; - else - up(&fc->outstanding_sem); -} - void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { if (atomic_dec_and_test(&req->count)) { - spin_lock(&fc->lock); - fuse_putback_request(fc, req); - spin_unlock(&fc->lock); + atomic_dec(&fc->num_waiting); + fuse_request_free(req); } } -static void fuse_put_request_locked(struct fuse_conn *fc, struct fuse_req *req) -{ - if (atomic_dec_and_test(&req->count)) - fuse_putback_request(fc, req); -} - void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req) { iput(req->inode); @@ -189,9 +144,9 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) list_del(&req->list); req->state = FUSE_REQ_FINISHED; if (!req->background) { - wake_up(&req->waitq); - fuse_put_request_locked(fc, req); spin_unlock(&fc->lock); + wake_up(&req->waitq); + fuse_put_request(fc, req); } else { void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; req->end = NULL; @@ -302,16 +257,6 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req) req->in.h.unique = fc->reqctr; req->in.h.len = sizeof(struct fuse_in_header) + len_args(req->in.numargs, (struct fuse_arg *) req->in.args); - if (!req->preallocated) { - /* If request is not preallocated (either FORGET or - RELEASE), then still decrease outstanding_sem, so - user can't open infinite number of files while not - processing the RELEASE requests. 
However for - efficiency do it without blocking, so if down() - would block, just increase the debt instead */ - if (down_trylock(&fc->outstanding_sem)) - fc->outstanding_debt++; - } list_add_tail(&req->list, &fc->pending); req->state = FUSE_REQ_PENDING; wake_up(&fc->waitq); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 256355b80256..8d7546e832e8 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -117,8 +117,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) return 0; fc = get_fuse_conn(inode); - req = fuse_get_request(fc); - if (!req) + req = fuse_get_req(fc); + if (IS_ERR(req)) return 0; fuse_lookup_init(req, entry->d_parent->d_inode, entry, &outarg); @@ -188,9 +188,9 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, if (entry->d_name.len > FUSE_NAME_MAX) return ERR_PTR(-ENAMETOOLONG); - req = fuse_get_request(fc); - if (!req) - return ERR_PTR(-EINTR); + req = fuse_get_req(fc); + if (IS_ERR(req)) + return ERR_PTR(PTR_ERR(req)); fuse_lookup_init(req, dir, entry, &outarg); request_send(fc, req); @@ -244,15 +244,14 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, struct file *file; int flags = nd->intent.open.flags - 1; - err = -ENOSYS; if (fc->no_create) - goto out; + return -ENOSYS; - err = -EINTR; - req = fuse_get_request(fc); - if (!req) - goto out; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + err = -ENOMEM; ff = fuse_file_alloc(); if (!ff) goto out_put_request; @@ -314,7 +313,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, fuse_file_free(ff); out_put_request: fuse_put_request(fc, req); - out: return err; } @@ -375,9 +373,9 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode, { struct fuse_mknod_in inarg; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + 
return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.mode = mode; @@ -407,9 +405,9 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, int mode) { struct fuse_mkdir_in inarg; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.mode = mode; @@ -427,9 +425,9 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, { struct fuse_conn *fc = get_fuse_conn(dir); unsigned len = strlen(link) + 1; - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_SYMLINK; req->in.numargs = 2; @@ -444,9 +442,9 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) { int err; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_UNLINK; req->in.h.nodeid = get_node_id(dir); @@ -476,9 +474,9 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) { int err; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_RMDIR; req->in.h.nodeid = get_node_id(dir); @@ -504,9 +502,9 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, int err; struct fuse_rename_in inarg; struct fuse_conn *fc = get_fuse_conn(olddir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.newdir = get_node_id(newdir); @@ -553,9 +551,9 @@ static int 
fuse_link(struct dentry *entry, struct inode *newdir, struct fuse_link_in inarg; struct inode *inode = entry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.oldnodeid = get_node_id(inode); @@ -583,9 +581,9 @@ int fuse_do_getattr(struct inode *inode) int err; struct fuse_attr_out arg; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_GETATTR; req->in.h.nodeid = get_node_id(inode); @@ -673,9 +671,9 @@ static int fuse_access(struct inode *inode, int mask) if (fc->no_access) return 0; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.mask = mask; @@ -780,9 +778,9 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) if (is_bad_inode(inode)) return -EIO; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); page = alloc_page(GFP_KERNEL); if (!page) { @@ -809,11 +807,11 @@ static char *read_link(struct dentry *dentry) { struct inode *inode = dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_request(fc); + struct fuse_req *req = fuse_get_req(fc); char *link; - if (!req) - return ERR_PTR(-EINTR); + if (IS_ERR(req)) + return ERR_PTR(PTR_ERR(req)); link = (char *) __get_free_page(GFP_KERNEL); if (!link) { @@ -933,9 +931,9 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr) } } - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, 
sizeof(inarg)); iattr_to_fattr(attr, &inarg); @@ -995,9 +993,9 @@ static int fuse_setxattr(struct dentry *entry, const char *name, if (fc->no_setxattr) return -EOPNOTSUPP; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.size = size; @@ -1035,9 +1033,9 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name, if (fc->no_getxattr) return -EOPNOTSUPP; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.size = size; @@ -1085,9 +1083,9 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) if (fc->no_listxattr) return -EOPNOTSUPP; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.size = size; @@ -1131,9 +1129,9 @@ static int fuse_removexattr(struct dentry *entry, const char *name) if (fc->no_removexattr) return -EOPNOTSUPP; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_REMOVEXATTR; req->in.h.nodeid = get_node_id(inode); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 3ac39c0288de..e4f041a11bb5 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -22,9 +22,9 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir, struct fuse_req *req; int err; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); @@ -184,9 +184,9 @@ static int fuse_flush(struct file *file) if (fc->no_flush) return 0; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if 
(IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; @@ -223,9 +223,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) return 0; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; @@ -297,9 +297,9 @@ static int fuse_readpage(struct file *file, struct page *page) if (is_bad_inode(inode)) goto out; - err = -EINTR; - req = fuse_get_request(fc); - if (!req) + req = fuse_get_req(fc); + err = PTR_ERR(req); + if (IS_ERR(req)) goto out; req->out.page_zeroing = 1; @@ -368,10 +368,10 @@ static int fuse_readpages_fill(void *_data, struct page *page) (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read || req->pages[req->num_pages - 1]->index + 1 != page->index)) { fuse_send_readpages(req, data->file, inode); - data->req = req = fuse_get_request(fc); - if (!req) { + data->req = req = fuse_get_req(fc); + if (IS_ERR(req)) { unlock_page(page); - return -EINTR; + return PTR_ERR(req); } } req->pages[req->num_pages] = page; @@ -392,9 +392,9 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, data.file = file; data.inode = inode; - data.req = fuse_get_request(fc); - if (!data.req) - return -EINTR; + data.req = fuse_get_req(fc); + if (IS_ERR(data.req)) + return PTR_ERR(data.req); err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); if (!err) { @@ -455,9 +455,9 @@ static int fuse_commit_write(struct file *file, struct page *page, if (is_bad_inode(inode)) return -EIO; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->num_pages = 1; req->pages[0] = page; @@ -532,9 +532,9 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf, if (is_bad_inode(inode)) return -EIO; - req = fuse_get_request(fc); - 
if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); while (count) { size_t nres; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 6ed812fd6200..242e69cb1251 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -18,9 +18,6 @@ /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 -/** If more requests are outstanding, then the operation will block */ -#define FUSE_MAX_OUTSTANDING 10 - /** It could be as large as PATH_MAX, but would that have any uses? */ #define FUSE_NAME_MAX 1024 @@ -131,8 +128,8 @@ struct fuse_conn; * A request to the client */ struct fuse_req { - /** This can be on either unused_list, pending processing or - io lists in fuse_conn */ + /** This can be on either pending processing or io lists in + fuse_conn */ struct list_head list; /** Entry on the background list */ @@ -150,9 +147,6 @@ struct fuse_req { /** True if the request has reply */ unsigned isreply:1; - /** The request is preallocated */ - unsigned preallocated:1; - /** The request was interrupted */ unsigned interrupted:1; @@ -247,19 +241,9 @@ struct fuse_conn { interrupted request) */ struct list_head background; - /** Controls the maximum number of outstanding requests */ - struct semaphore outstanding_sem; - - /** This counts the number of outstanding requests if - outstanding_sem would go negative */ - unsigned outstanding_debt; - /** RW semaphore for exclusion with fuse_put_super() */ struct rw_semaphore sbput_sem; - /** The list of unused requests */ - struct list_head unused_list; - /** The next unique request id */ u64 reqctr; @@ -452,11 +436,11 @@ void fuse_reset_request(struct fuse_req *req); /** * Reserve a preallocated request */ -struct fuse_req *fuse_get_request(struct fuse_conn *fc); +struct fuse_req *fuse_get_req(struct fuse_conn *fc); /** - * Decrement reference count of a request. 
If count goes to zero put - * on unused list (preallocated) or free request (not preallocated). + * Decrement reference count of a request. If count goes to zero free + * the request. */ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index cc58debeabd4..824ebbc428ed 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -243,9 +243,9 @@ static int fuse_statfs(struct super_block *sb, struct kstatfs *buf) struct fuse_statfs_out outarg; int err; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&outarg, 0, sizeof(outarg)); req->in.numargs = 0; @@ -370,15 +370,7 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt) static void fuse_conn_release(struct kobject *kobj) { - struct fuse_conn *fc = get_fuse_conn_kobj(kobj); - - while (!list_empty(&fc->unused_list)) { - struct fuse_req *req; - req = list_entry(fc->unused_list.next, struct fuse_req, list); - list_del(&req->list); - fuse_request_free(req); - } - kfree(fc); + kfree(get_fuse_conn_kobj(kobj)); } static struct fuse_conn *new_conn(void) @@ -387,27 +379,16 @@ static struct fuse_conn *new_conn(void) fc = kzalloc(sizeof(*fc), GFP_KERNEL); if (fc) { - int i; spin_lock_init(&fc->lock); init_waitqueue_head(&fc->waitq); INIT_LIST_HEAD(&fc->pending); INIT_LIST_HEAD(&fc->processing); INIT_LIST_HEAD(&fc->io); - INIT_LIST_HEAD(&fc->unused_list); INIT_LIST_HEAD(&fc->background); - sema_init(&fc->outstanding_sem, 1); /* One for INIT */ init_rwsem(&fc->sbput_sem); kobj_set_kset_s(fc, connections_subsys); kobject_init(&fc->kobj); atomic_set(&fc->num_waiting, 0); - for (i = 0; i < FUSE_MAX_OUTSTANDING; i++) { - struct fuse_req *req = fuse_request_alloc(); - if (!req) { - kobject_put(&fc->kobj); - return NULL; - } - list_add(&req->list, &fc->unused_list); - } fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; fc->bdi.unplug_io_fn = default_unplug_io_fn; 
fc->reqctr = 0; @@ -438,7 +419,6 @@ static struct super_operations fuse_super_operations = { static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) { - int i; struct fuse_init_out *arg = &req->misc.init_out; if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION) @@ -457,22 +437,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->minor = arg->minor; fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; } - - /* After INIT reply is received other requests can go - out. So do (FUSE_MAX_OUTSTANDING - 1) number of - up()s on outstanding_sem. The last up() is done in - fuse_putback_request() */ - for (i = 1; i < FUSE_MAX_OUTSTANDING; i++) - up(&fc->outstanding_sem); - fuse_put_request(fc, req); } -static void fuse_send_init(struct fuse_conn *fc) +static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) { - /* This is called from fuse_read_super() so there's guaranteed - to be exactly one request available */ - struct fuse_req *req = fuse_get_request(fc); struct fuse_init_in *arg = &req->misc.init_in; arg->major = FUSE_KERNEL_VERSION; @@ -508,6 +477,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) struct fuse_mount_data d; struct file *file; struct dentry *root_dentry; + struct fuse_req *init_req; int err; if (!parse_fuse_opt((char *) data, &d)) @@ -554,13 +524,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) goto err; } + init_req = fuse_request_alloc(); + if (!init_req) + goto err_put_root; + err = kobject_set_name(&fc->kobj, "%llu", conn_id()); if (err) - goto err_put_root; + goto err_free_req; err = kobject_add(&fc->kobj); if (err) - goto err_put_root; + goto err_free_req; sb->s_root = root_dentry; fc->mounted = 1; @@ -574,10 +548,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) */ fput(file); - fuse_send_init(fc); + fuse_send_init(fc, init_req); return 0; + err_free_req: + fuse_request_free(init_req); 
err_put_root: dput(root_dentry); err: -- cgit v1.2.3 From 08a53cdce62d37d918530bbbf726cc01b21dc3d1 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 10 Apr 2006 22:54:59 -0700 Subject: [PATCH] fuse: account background requests The previous patch removed limiting the number of outstanding requests. This patch adds a much simpler limiting, that is also compatible with file locking operations. A task may have at most one synchronous request allocated. So these requests need not be otherwise limited. However the number of background requests (release, forget, asynchronous reads, interrupted requests) can grow indefinitely. This can be used by a malicious user to cause FUSE to allocate arbitrary amounts of unswappable kernel memory, denying service. For this reason add a limit for the number of background requests, and block allocations of new requests until the number goes below the limit. Also use this mechanism to block all requests until the INIT reply is received. Signed-off-by: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/dev.c | 24 ++++++++++++++++++++---- fs/fuse/fuse_i.h | 14 ++++++++++++++ fs/fuse/inode.c | 4 ++++ 3 files changed, 38 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 4dc104c0e95d..6c740f860665 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -90,7 +90,17 @@ static void __fuse_put_request(struct fuse_req *req) struct fuse_req *fuse_get_req(struct fuse_conn *fc) { - struct fuse_req *req = fuse_request_alloc(); + struct fuse_req *req; + sigset_t oldset; + int err; + + block_sigs(&oldset); + err = wait_event_interruptible(fc->blocked_waitq, !fc->blocked); + restore_sigs(&oldset); + if (err) + return ERR_PTR(-EINTR); + + req = fuse_request_alloc(); if (!req) return ERR_PTR(-ENOMEM); @@ -118,6 +128,11 @@ void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req) fput(req->file); spin_lock(&fc->lock); list_del(&req->bg_entry); + if
(fc->num_background == FUSE_MAX_BACKGROUND) { + fc->blocked = 0; + wake_up_all(&fc->blocked_waitq); + } + fc->num_background--; spin_unlock(&fc->lock); } @@ -195,6 +210,9 @@ static void background_request(struct fuse_conn *fc, struct fuse_req *req) { req->background = 1; list_add(&req->bg_entry, &fc->background); + fc->num_background++; + if (fc->num_background == FUSE_MAX_BACKGROUND) + fc->blocked = 1; if (req->inode) req->inode = igrab(req->inode); if (req->inode2) @@ -288,6 +306,7 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req) static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) { spin_lock(&fc->lock); + background_request(fc, req); if (fc->connected) { queue_request(fc, req); spin_unlock(&fc->lock); @@ -306,9 +325,6 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req) void request_send_background(struct fuse_conn *fc, struct fuse_req *req) { req->isreply = 1; - spin_lock(&fc->lock); - background_request(fc, req); - spin_unlock(&fc->lock); request_send_nowait(fc, req); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 242e69cb1251..19c7185a7546 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -18,6 +18,9 @@ /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 +/** Maximum number of outstanding background requests */ +#define FUSE_MAX_BACKGROUND 10 + /** It could be as large as PATH_MAX, but would that have any uses? */ #define FUSE_NAME_MAX 1024 @@ -241,6 +244,17 @@ struct fuse_conn { interrupted request) */ struct list_head background; + /** Number of requests currently in the background */ + unsigned num_background; + + /** Flag indicating if connection is blocked. 
This will be + the case before the INIT reply is received, and if there + are too many outstading backgrounds requests */ + int blocked; + + /** waitq for blocked connection */ + wait_queue_head_t blocked_waitq; + /** RW semaphore for exclusion with fuse_put_super() */ struct rw_semaphore sbput_sem; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 824ebbc428ed..fd34037b0588 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -381,6 +381,7 @@ static struct fuse_conn *new_conn(void) if (fc) { spin_lock_init(&fc->lock); init_waitqueue_head(&fc->waitq); + init_waitqueue_head(&fc->blocked_waitq); INIT_LIST_HEAD(&fc->pending); INIT_LIST_HEAD(&fc->processing); INIT_LIST_HEAD(&fc->io); @@ -392,6 +393,7 @@ static struct fuse_conn *new_conn(void) fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; fc->bdi.unplug_io_fn = default_unplug_io_fn; fc->reqctr = 0; + fc->blocked = 1; } return fc; } @@ -438,6 +440,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; } fuse_put_request(fc, req); + fc->blocked = 0; + wake_up_all(&fc->blocked_waitq); } static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) -- cgit v1.2.3 From 7775f4c85dcbd1175f21b2fbb7221c79ec70b722 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:20 -0700 Subject: [PATCH] knfsd: Correct reserved reply space for read requests. NFSd makes sure there is enough space to hold the maximum possible reply before accepting a request. The units for this maximum is (4byte) words. However in three places, particularly for read request, the number given is a number of bytes. This means too much space is reserved which is slightly wasteful. This is the sort of patch that could uncover a deeper bug, and it is not critical, so it would be best for it to spend a while in -mm before going in to mainline. 
(akpm: target 2.6.17-rc2, 2.6.16.3 (approx)) Discovered-by: "Eivind Sarto" Signed-off-by: Neil Brown Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs3proc.c | 2 +- fs/nfsd/nfs4proc.c | 2 +- fs/nfsd/nfsproc.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 6d2dfed1de08..f61142afea44 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -682,7 +682,7 @@ static struct svc_procedure nfsd_procedures3[22] = { PROC(lookup, dirop, dirop, fhandle2, RC_NOCACHE, ST+FH+pAT+pAT), PROC(access, access, access, fhandle, RC_NOCACHE, ST+pAT+1), PROC(readlink, readlink, readlink, fhandle, RC_NOCACHE, ST+pAT+1+NFS3_MAXPATHLEN/4), - PROC(read, read, read, fhandle, RC_NOCACHE, ST+pAT+4+NFSSVC_MAXBLKSIZE), + PROC(read, read, read, fhandle, RC_NOCACHE, ST+pAT+4+NFSSVC_MAXBLKSIZE/4), PROC(write, write, write, fhandle, RC_REPLBUFF, ST+WC+4), PROC(create, create, create, fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC), PROC(mkdir, mkdir, create, fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC), diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 6d63f1d9e5f5..ca8a4c410de3 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -975,7 +975,7 @@ struct nfsd4_voidargs { int dummy; }; */ static struct svc_procedure nfsd_procedures4[2] = { PROC(null, void, void, void, RC_NOCACHE, 1), - PROC(compound, compound, compound, compound, RC_NOCACHE, NFSD_BUFSIZE) + PROC(compound, compound, compound, compound, RC_NOCACHE, NFSD_BUFSIZE/4) }; struct svc_version nfsd_version4 = { diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 3e6b75cd90fd..06cd0db0f32b 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -553,7 +553,7 @@ static struct svc_procedure nfsd_procedures2[18] = { PROC(none, void, void, none, RC_NOCACHE, ST), PROC(lookup, diropargs, diropres, fhandle, RC_NOCACHE, ST+FH+AT), PROC(readlink, readlinkargs, readlinkres, none, RC_NOCACHE, ST+1+NFS_MAXPATHLEN/4), - PROC(read, 
readargs, readres, fhandle, RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE), + PROC(read, readargs, readres, fhandle, RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE/4), PROC(none, void, void, none, RC_NOCACHE, ST), PROC(write, writeargs, attrstat, fhandle, RC_REPLBUFF, ST+AT), PROC(create, createargs, diropres, fhandle, RC_REPLBUFF, ST+FH+AT), -- cgit v1.2.3 From d5b9026a670fdb404e6e2e2e0a1b447e9ea9c1f6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:22 -0700 Subject: [PATCH] knfsd: locks: flag NFSv4-owned locks Use the fl_lmops field to identify which locks are ours, instead of trying to look them up in our private hash. This is safer and more efficient. Earlier versions of this patch used a lock flag instead, but Trond pointed out that adding a new flag for each lock manager wasn't going to scale well, and suggested this approach instead; a separate patch converts lockd to using fl_lmops in the same way. In the NFSv4 case this looks like a bit of a hack, since the NFSv4 server isn't currently actually defining a lock_manager_operations struct, so we end up defining one *just* to serve as a cookie to identify our locks. But it works, and we actually do expect to start using the lock_manager_operations at some point anyway. Signed-off-by: Marc Eshel Signed-off-by: Andy Adamson Signed-off-by: J. 
Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4state.c | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 47ec112b266c..ffedce08b4cb 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2495,36 +2495,27 @@ nfs4_transform_lock_offset(struct file_lock *lock) lock->fl_end = OFFSET_MAX; } -static int -nfs4_verify_lock_stateowner(struct nfs4_stateowner *sop, unsigned int hashval) -{ - struct nfs4_stateowner *local = NULL; - int status = 0; - - if (hashval >= LOCK_HASH_SIZE) - goto out; - list_for_each_entry(local, &lock_ownerid_hashtbl[hashval], so_idhash) { - if (local == sop) { - status = 1; - goto out; - } - } -out: - return status; -} - +/* Hack!: For now, we're defining this just so we can use a pointer to it + * as a unique cookie to identify our (NFSv4's) posix locks. */ +struct lock_manager_operations nfsd_posix_mng_ops = { +}; static inline void nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) { - struct nfs4_stateowner *sop = (struct nfs4_stateowner *) fl->fl_owner; - unsigned int hval = lockownerid_hashval(sop->so_id); + struct nfs4_stateowner *sop; + unsigned int hval; - deny->ld_sop = NULL; - if (nfs4_verify_lock_stateowner(sop, hval)) { + if (fl->fl_lmops == &nfsd_posix_mng_ops) { + sop = (struct nfs4_stateowner *) fl->fl_owner; + hval = lockownerid_hashval(sop->so_id); kref_get(&sop->so_ref); deny->ld_sop = sop; deny->ld_clientid = sop->so_client->cl_clientid; + } else { + deny->ld_sop = NULL; + deny->ld_clientid.cl_boot = 0; + deny->ld_clientid.cl_id = 0; } deny->ld_start = fl->fl_start; deny->ld_length = ~(u64)0; @@ -2736,6 +2727,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock file_lock.fl_pid = current->tgid; file_lock.fl_file = filp; file_lock.fl_flags = FL_POSIX; + file_lock.fl_lmops = 
&nfsd_posix_mng_ops; file_lock.fl_start = lock->lk_offset; if ((lock->lk_length == ~(u64)0) || @@ -2841,6 +2833,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner; file_lock.fl_pid = current->tgid; file_lock.fl_flags = FL_POSIX; + file_lock.fl_lmops = &nfsd_posix_mng_ops; file_lock.fl_start = lockt->lt_offset; if ((lockt->lt_length == ~(u64)0) || LOFF_OVERFLOW(lockt->lt_offset, lockt->lt_length)) @@ -2900,6 +2893,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock file_lock.fl_pid = current->tgid; file_lock.fl_file = filp; file_lock.fl_flags = FL_POSIX; + file_lock.fl_lmops = &nfsd_posix_mng_ops; file_lock.fl_start = locku->lu_offset; if ((locku->lu_length == ~(u64)0) || LOFF_OVERFLOW(locku->lu_offset, locku->lu_length)) -- cgit v1.2.3 From e465a77f943f51df1a169426df879340bd0db3f3 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Mon, 10 Apr 2006 22:55:23 -0700 Subject: [PATCH] fs/nfsd/nfs4state.c: make a struct static Signed-off-by: Adrian Bunk Cc: Marc Eshel Cc: Andy Adamson Cc: J. Bruce Fields Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index ffedce08b4cb..a8c2122a481e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2497,7 +2497,7 @@ nfs4_transform_lock_offset(struct file_lock *lock) /* Hack!: For now, we're defining this just so we can use a pointer to it * as a unique cookie to identify our (NFSv4's) posix locks. 
*/ -struct lock_manager_operations nfsd_posix_mng_ops = { +static struct lock_manager_operations nfsd_posix_mng_ops = { }; static inline void -- cgit v1.2.3 From 249920527f9e6e5c305538bbf1ea882ee7dc1c06 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:24 -0700 Subject: [PATCH] knfsd: nfsd4: Wrong error handling in nfs4acl this fixes coverity id #3. Coverity detected dead code, since the == -1 comparison only returns 0 or 1 to error. Therefore the if ( error < 0 ) statement was always false. Seems that this was an if( error = nfs4... ) statement some time ago, which got broken during cleanup. Signed-off-by: Eric Sesterhenn Signed-off-by: J. Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4acl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 7391f4aabedb..63818a51c05c 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -790,7 +790,7 @@ nfs4_acl_split(struct nfs4_acl *acl, struct nfs4_acl *dacl) continue; error = nfs4_acl_add_ace(dacl, ace->type, ace->flag, - ace->access_mask, ace->whotype, ace->who) == -1; + ace->access_mask, ace->whotype, ace->who); if (error < 0) goto out; -- cgit v1.2.3 From b905b7b0a054d2ab3e0c9304def998546c93f6b5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:25 -0700 Subject: [PATCH] knfsd: nfsd4: better nfs4acl errors We're returning -1 in a few places in the NFSv4<->POSIX acl translation code where we could return a reasonable error. Also allows some minor simplification elsewhere. Signed-off-by: J. 
Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4acl.c | 6 +++--- fs/nfsd/nfs4xdr.c | 7 +++---- 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 63818a51c05c..edb107e61b91 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -710,9 +710,9 @@ calculate_posix_ace_count(struct nfs4_acl *n4acl) /* Also, the remaining entries are for named users and * groups, and come in threes (mask, allow, deny): */ if (n4acl->naces < 7) - return -1; + return -EINVAL; if ((n4acl->naces - 7) % 3) - return -1; + return -EINVAL; return 4 + (n4acl->naces - 7)/3; } } @@ -866,7 +866,7 @@ nfs4_acl_add_ace(struct nfs4_acl *acl, u32 type, u32 flag, u32 access_mask, struct nfs4_ace *ace; if ((ace = kmalloc(sizeof(*ace), GFP_KERNEL)) == NULL) - return -1; + return -ENOMEM; ace->type = type; ace->flag = flag; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 03857fd81126..845f25251d81 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -299,11 +299,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia buf, dummy32, &ace.who); if (status) goto out_nfserr; - if (nfs4_acl_add_ace(*acl, ace.type, ace.flag, - ace.access_mask, ace.whotype, ace.who) != 0) { - status = -ENOMEM; + status = nfs4_acl_add_ace(*acl, ace.type, ace.flag, + ace.access_mask, ace.whotype, ace.who); + if (status) goto out_nfserr; - } } } else *acl = NULL; -- cgit v1.2.3 From b5872b0dcc0501035d5ae53c60f8cbbb3798da8a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:26 -0700 Subject: [PATCH] knfsd: nfsd4: fix acl xattr length return We should be using the length from the second vfs_getxattr, in case it changed. (Note: there's still a small race here; we could end up returning -ENOMEM if the length increased between the first and second call. I don't know whether it's worth spending a lot of effort to fix that.) 
This makes XFS ACLs usable on NFS exports, which they currently aren't, since XFS appears to be returning a too-large value for vfs_getxattr() when it's passed a NULL buffer. So there's probably an XFS bug here too, though since getxattr with a NULL buffer is usually used to decide how much memory to allocate, it may be a fairly harmless bug in most cases. Signed-off-by: J. Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/vfs.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 31018333dc38..6aa92d0e6876 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -371,7 +371,6 @@ out_nfserr: static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf) { ssize_t buflen; - int error; buflen = vfs_getxattr(dentry, key, NULL, 0); if (buflen <= 0) @@ -381,10 +380,7 @@ static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf) if (!*buf) return -ENOMEM; - error = vfs_getxattr(dentry, key, *buf, buflen); - if (error < 0) - return error; - return buflen; + return vfs_getxattr(dentry, key, *buf, buflen); } #endif -- cgit v1.2.3 From cd15654963cf7e4dd938a403de3ec5bcd09f8350 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:27 -0700 Subject: [PATCH] knfsd: nfsd: oops exporting nonexistent directory Export a directory that does not exist: exportfs -orw,fsid=0,insecure,no_subtree_check client:/home/NFS4 Try to mount from client with nfs4. Mount hangs (I'm not sure why - that's another issue). While client is hung, back on server mkdir /home/NFS4 The server panics in dput. I traced the problem back to svc_export_parse() calling path_release() even though path_lookup() failed (it happens to fill in the nameidata structure with a negative dentry - so the test after out: succeeds). 
After patching, and recreating the problem, the client mount still takes some time before finally exiting with a message "couldn't read superblock". Here is a simple patch to resolve this issue: Signed-off-by: Frank Filz Signed-off-by: J. Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/export.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index c340be0a3f59..4e0578121d9a 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -422,7 +422,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) goto out; err = path_lookup(buf, 0, &nd); - if (err) goto out; + if (err) goto out_no_path; exp.h.flags = 0; exp.ex_client = dom; @@ -475,6 +475,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) out: if (nd.dentry) path_release(&nd); + out_no_path: if (dom) auth_domain_put(dom); kfree(buf); -- cgit v1.2.3 From 54cceebb679a8d10fa382422aa2035cdc65fe7ce Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:30 -0700 Subject: [PATCH] knfsd: nfsd: nfsd_setuser doesn't really need to modify rqstp->rq_cred. In addition to setting the process's filesystem id's, nfsd_setuser also modifies the value of the rq_cred which stores the id's that originally came from the rpc call, for example to reflect root squashing.
There's no real reason to do that--the only case where rqstp->rq_cred is actually used later on is in the NFSv4 SETCLIENTID/SETCLIENTID_CONFIRM operations, and there the results are the opposite of what we want--those two operations don't deal with the filesystem at all, they only record the credentials used with the rpc call for later reference (so that we may require the same credentials be used on later operations), and the credentials shouldn't vary just because there was or wasn't a previous operation in the compound that referred to some export This fixes a bug which caused mounts from Solaris clients to fail. Signed-off-by: Andy Adamson Signed-off-by: J. Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/auth.c | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index cfe9ce881613..6e92b0fe5323 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -14,46 +14,46 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) { - struct svc_cred *cred = &rqstp->rq_cred; + struct svc_cred cred = rqstp->rq_cred; int i; int ret; if (exp->ex_flags & NFSEXP_ALLSQUASH) { - cred->cr_uid = exp->ex_anon_uid; - cred->cr_gid = exp->ex_anon_gid; - put_group_info(cred->cr_group_info); - cred->cr_group_info = groups_alloc(0); + cred.cr_uid = exp->ex_anon_uid; + cred.cr_gid = exp->ex_anon_gid; + cred.cr_group_info = groups_alloc(0); } else if (exp->ex_flags & NFSEXP_ROOTSQUASH) { struct group_info *gi; - if (!cred->cr_uid) - cred->cr_uid = exp->ex_anon_uid; - if (!cred->cr_gid) - cred->cr_gid = exp->ex_anon_gid; - gi = groups_alloc(cred->cr_group_info->ngroups); + if (!cred.cr_uid) + cred.cr_uid = exp->ex_anon_uid; + if (!cred.cr_gid) + cred.cr_gid = exp->ex_anon_gid; + gi = groups_alloc(cred.cr_group_info->ngroups); if (gi) - for (i = 0; i < cred->cr_group_info->ngroups; i++) { - if 
(!GROUP_AT(cred->cr_group_info, i)) + for (i = 0; i < cred.cr_group_info->ngroups; i++) { + if (!GROUP_AT(cred.cr_group_info, i)) GROUP_AT(gi, i) = exp->ex_anon_gid; else - GROUP_AT(gi, i) = GROUP_AT(cred->cr_group_info, i); + GROUP_AT(gi, i) = GROUP_AT(cred.cr_group_info, i); } - put_group_info(cred->cr_group_info); - cred->cr_group_info = gi; - } + cred.cr_group_info = gi; + } else + get_group_info(cred.cr_group_info); - if (cred->cr_uid != (uid_t) -1) - current->fsuid = cred->cr_uid; + if (cred.cr_uid != (uid_t) -1) + current->fsuid = cred.cr_uid; else current->fsuid = exp->ex_anon_uid; - if (cred->cr_gid != (gid_t) -1) - current->fsgid = cred->cr_gid; + if (cred.cr_gid != (gid_t) -1) + current->fsgid = cred.cr_gid; else current->fsgid = exp->ex_anon_gid; - if (!cred->cr_group_info) + if (!cred.cr_group_info) return -ENOMEM; - ret = set_current_groups(cred->cr_group_info); - if ((cred->cr_uid)) { + ret = set_current_groups(cred.cr_group_info); + put_group_info(cred.cr_group_info); + if ((cred.cr_uid)) { cap_t(current->cap_effective) &= ~CAP_NFSD_MASK; } else { cap_t(current->cap_effective) |= (CAP_NFSD_MASK & -- cgit v1.2.3 From f0e2993e9e73e8f38b05a89c98b9db94fec2199d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:31 -0700 Subject: [PATCH] knfsd: nfsd4: remove nfsd_setuser from putrootfh Since nfsd_setuser() is already called from any operation that uses the current filehandle (because it's called from fh_verify), there's no reason to call it from putrootfh. Signed-off-by: Andy Adamson Signed-off-by: J. 
Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4proc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index ca8a4c410de3..b0e095ea0c03 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -288,8 +288,6 @@ nfsd4_putrootfh(struct svc_rqst *rqstp, struct svc_fh *current_fh) fh_put(current_fh); status = exp_pseudoroot(rqstp->rq_client, current_fh, &rqstp->rq_chandle); - if (!status) - status = nfserrno(nfsd_setuser(rqstp, current_fh->fh_export)); return status; } -- cgit v1.2.3 From 6ed6decccf544970664757464cfb67e081775e6a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:32 -0700 Subject: [PATCH] knfsd: nfsd4: fix corruption of returned data when using 64k pages In v4 we grab an extra page just for the padding of returned data. The formula that the rpc server uses to allocate pages for the response doesn't take into account this extra page. Instead of adjusting those formulae, we adopt the same solution as v2 and v3, and put the "tail" data in the same page as the "head" data. Signed-off-by: J. 
Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4xdr.c | 42 +++++++++++++++++------------------------- 1 file changed, 17 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 845f25251d81..f2710cfc0bc5 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2084,27 +2084,20 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_read WRITE32(eof); WRITE32(maxcount); ADJUST_ARGS(); - resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base; - + resp->xbuf->head[0].iov_len = (char*)p + - (char*)resp->xbuf->head[0].iov_base; resp->xbuf->page_len = maxcount; - /* read zero bytes -> don't set up tail */ - if(!maxcount) - return 0; - - /* set up page for remaining responses */ - svc_take_page(resp->rqstp); - resp->xbuf->tail[0].iov_base = - page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); - resp->rqstp->rq_restailpage = resp->rqstp->rq_resused-1; + /* Use rest of head for padding and remaining ops: */ + resp->rqstp->rq_restailpage = 0; + resp->xbuf->tail[0].iov_base = p; resp->xbuf->tail[0].iov_len = 0; - resp->p = resp->xbuf->tail[0].iov_base; - resp->end = resp->p + PAGE_SIZE/4; - if (maxcount&3) { - *(resp->p)++ = 0; + RESERVE_SPACE(4); + WRITE32(0); resp->xbuf->tail[0].iov_base += maxcount&3; resp->xbuf->tail[0].iov_len = 4 - (maxcount&3); + ADJUST_ARGS(); } return 0; } @@ -2141,21 +2134,20 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_r WRITE32(maxcount); ADJUST_ARGS(); - resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base; + resp->xbuf->head[0].iov_len = (char*)p + - (char*)resp->xbuf->head[0].iov_base; + resp->xbuf->page_len = maxcount; - svc_take_page(resp->rqstp); - resp->xbuf->tail[0].iov_base = - page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); - resp->rqstp->rq_restailpage = 
resp->rqstp->rq_resused-1; + /* Use rest of head for padding and remaining ops: */ + resp->rqstp->rq_restailpage = 0; + resp->xbuf->tail[0].iov_base = p; resp->xbuf->tail[0].iov_len = 0; - resp->p = resp->xbuf->tail[0].iov_base; - resp->end = resp->p + PAGE_SIZE/4; - - resp->xbuf->page_len = maxcount; if (maxcount&3) { - *(resp->p)++ = 0; + RESERVE_SPACE(4); + WRITE32(0); resp->xbuf->tail[0].iov_base += maxcount&3; resp->xbuf->tail[0].iov_len = 4 - (maxcount&3); + ADJUST_ARGS(); } return 0; } -- cgit v1.2.3 From bb6e8a9f4005237401a45f1ea43db060ea5f9725 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:33 -0700 Subject: [PATCH] knfsd: nfsd4: fix corruption on readdir encoding with 64k pages Fix corruption on readdir encoding with 64k pages. Signed-off-by: J. Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4xdr.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index f2710cfc0bc5..de3998f15f10 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2157,7 +2157,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re { int maxcount; loff_t offset; - u32 *page, *savep; + u32 *page, *savep, *tailbase; ENCODE_HEAD; if (nfserr) @@ -2173,6 +2173,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re WRITE32(0); ADJUST_ARGS(); resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base; + tailbase = p; maxcount = PAGE_SIZE; if (maxcount > readdir->rd_maxcount) @@ -2217,14 +2218,12 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re *p++ = htonl(readdir->common.err == nfserr_eof); resp->xbuf->page_len = ((char*)p) - (char*)page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); - /* allocate a page for the tail */ - svc_take_page(resp->rqstp); - resp->xbuf->tail[0].iov_base = 
- page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); - resp->rqstp->rq_restailpage = resp->rqstp->rq_resused-1; + /* Use rest of head for padding and remaining ops: */ + resp->rqstp->rq_restailpage = 0; + resp->xbuf->tail[0].iov_base = tailbase; resp->xbuf->tail[0].iov_len = 0; resp->p = resp->xbuf->tail[0].iov_base; - resp->end = resp->p + PAGE_SIZE/4; + resp->end = resp->p + (PAGE_SIZE - resp->xbuf->head[0].iov_len)/4; return 0; err_no_verf: -- cgit v1.2.3 From 5e8d5c29482dc56de5971ddc99c6e7f69e4d16f6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:37 -0700 Subject: [PATCH] knfsd: nfsd4: fix laundromat shutdown race We need to make sure the laundromat work doesn't reschedule itself just when we try to cancel it. Also, we shouldn't be waiting for it to finish running while holding the state lock, as that's a potential deadlock. Signed-off-by: J. Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index a8c2122a481e..01ff544dc1f5 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3238,8 +3238,6 @@ __nfs4_state_shutdown(void) } cancel_delayed_work(&laundromat_work); - flush_workqueue(laundry_wq); - destroy_workqueue(laundry_wq); nfsd4_shutdown_recdir(); nfs4_init = 0; } @@ -3247,6 +3245,8 @@ __nfs4_state_shutdown(void) void nfs4_state_shutdown(void) { + cancel_rearming_delayed_workqueue(laundry_wq, &laundromat_work); + destroy_workqueue(laundry_wq); nfs4_lock_state(); nfs4_release_reclaim(); __nfs4_state_shutdown(); -- cgit v1.2.3 From 541e0e09814594e907e18fb8d9fc9b746aa4b18a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:38 -0700 Subject: [PATCH] knfsd: nfsd4: nfsd4_probe_callback cleanup Some obvious cleanup. Signed-off-by: J. 
Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4callback.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index c872bd07fc10..dbaf3f93f328 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -441,8 +441,9 @@ nfsd4_probe_callback(struct nfs4_client *clp) goto out_clnt; } - /* the task holds a reference to the nfs4_client struct */ cb->cb_client = clnt; + + /* the task holds a reference to the nfs4_client struct */ atomic_inc(&clp->cl_count); msg.rpc_cred = nfsd4_lookupcred(clp,0); @@ -460,13 +461,12 @@ nfsd4_probe_callback(struct nfs4_client *clp) out_rpciod: atomic_dec(&clp->cl_count); rpciod_down(); + cb->cb_client = NULL; out_clnt: rpc_shutdown_client(clnt); - goto out_err; out_err: dprintk("NFSD: warning: no callback path to client %.*s\n", (int)clp->cl_name.len, clp->cl_name.data); - cb->cb_client = NULL; } static void -- cgit v1.2.3 From 4e2fd495b520b51e4ba83340f13520b7f07e3743 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:39 -0700 Subject: [PATCH] knfsd: nfsd4: add missing rpciod_down() We should be shutting down rpciod for the callback channel when we shut down the server. Also note that we do rpciod_up() and create the callback client *before* setting cb_set--the cb_set only determines whether the initial null was successful. So cb_set is not a reliable determiner of whether we need to clean up, only cb_client is. Signed-off-by: J.
Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4state.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 01ff544dc1f5..e97c58aafde5 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -329,23 +329,30 @@ put_nfs4_client(struct nfs4_client *clp) free_client(clp); } +static void +shutdown_callback_client(struct nfs4_client *clp) +{ + struct rpc_clnt *clnt = clp->cl_callback.cb_client; + + /* shutdown rpc client, ending any outstanding recall rpcs */ + if (clnt) { + clp->cl_callback.cb_client = NULL; + rpc_shutdown_client(clnt); + rpciod_down(); + } +} + static void expire_client(struct nfs4_client *clp) { struct nfs4_stateowner *sop; struct nfs4_delegation *dp; - struct nfs4_callback *cb = &clp->cl_callback; - struct rpc_clnt *clnt = clp->cl_callback.cb_client; struct list_head reaplist; dprintk("NFSD: expire_client cl_count %d\n", atomic_read(&clp->cl_count)); - /* shutdown rpc client, ending any outstanding recall rpcs */ - if (atomic_read(&cb->cb_set) == 1 && clnt) { - rpc_shutdown_client(clnt); - clnt = clp->cl_callback.cb_client = NULL; - } + shutdown_callback_client(clp); INIT_LIST_HEAD(&reaplist); spin_lock(&recall_lock); -- cgit v1.2.3 From ef0f3390ebedac78bff1936bbb26606bca83e891 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:41 -0700 Subject: [PATCH] knfsd: nfsd4: limit number of delegations handed out. It's very easy for the server to DOS itself by just giving out too many delegations. For now we just solve the problem with a dumb hard limit. Eventually we'll want a smarter policy. Signed-off-by: J. 
Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4state.c | 74 +++++++++++++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index e97c58aafde5..1e2a89aaf895 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -147,6 +147,42 @@ get_nfs4_file(struct nfs4_file *fi) kref_get(&fi->fi_ref); } +static int num_delegations; + +/* + * Open owner state (share locks) + */ + +/* hash tables for nfs4_stateowner */ +#define OWNER_HASH_BITS 8 +#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) +#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) + +#define ownerid_hashval(id) \ + ((id) & OWNER_HASH_MASK) +#define ownerstr_hashval(clientid, ownername) \ + (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK) + +static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE]; +static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; + +/* hash table for nfs4_file */ +#define FILE_HASH_BITS 8 +#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) +#define FILE_HASH_MASK (FILE_HASH_SIZE - 1) +/* hash table for (open)nfs4_stateid */ +#define STATEID_HASH_BITS 10 +#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) +#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) + +#define file_hashval(x) \ + hash_ptr(x, FILE_HASH_BITS) +#define stateid_hashval(owner_id, file_id) \ + (((owner_id) + (file_id)) & STATEID_HASH_MASK) + +static struct list_head file_hashtbl[FILE_HASH_SIZE]; +static struct list_head stateid_hashtbl[STATEID_HASH_SIZE]; + static struct nfs4_delegation * alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type) { @@ -155,9 +191,12 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f struct nfs4_callback *cb = &stp->st_stateowner->so_client->cl_callback; dprintk("NFSD alloc_init_deleg\n"); + if 
(num_delegations > STATEID_HASH_SIZE * 4) + return NULL; dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL); if (dp == NULL) return dp; + num_delegations++; INIT_LIST_HEAD(&dp->dl_perfile); INIT_LIST_HEAD(&dp->dl_perclnt); INIT_LIST_HEAD(&dp->dl_recall_lru); @@ -192,6 +231,7 @@ nfs4_put_delegation(struct nfs4_delegation *dp) dprintk("NFSD: freeing dp %p\n",dp); put_nfs4_file(dp->dl_file); kmem_cache_free(deleg_slab, dp); + num_delegations--; } } @@ -943,40 +983,6 @@ out: return status; } -/* - * Open owner state (share locks) - */ - -/* hash tables for nfs4_stateowner */ -#define OWNER_HASH_BITS 8 -#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) -#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) - -#define ownerid_hashval(id) \ - ((id) & OWNER_HASH_MASK) -#define ownerstr_hashval(clientid, ownername) \ - (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK) - -static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE]; -static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; - -/* hash table for nfs4_file */ -#define FILE_HASH_BITS 8 -#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) -#define FILE_HASH_MASK (FILE_HASH_SIZE - 1) -/* hash table for (open)nfs4_stateid */ -#define STATEID_HASH_BITS 10 -#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) -#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) - -#define file_hashval(x) \ - hash_ptr(x, FILE_HASH_BITS) -#define stateid_hashval(owner_id, file_id) \ - (((owner_id) + (file_id)) & STATEID_HASH_MASK) - -static struct list_head file_hashtbl[FILE_HASH_SIZE]; -static struct list_head stateid_hashtbl[STATEID_HASH_SIZE]; - /* OPEN Share state helper functions */ static inline struct nfs4_file * alloc_init_file(struct inode *ino) -- cgit v1.2.3 From 358dd55aa3a77fbbae482b83d96733d9ad441d05 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 10 Apr 2006 22:55:42 -0700 Subject: [PATCH] knfsd: nfsd4: grant delegations more frequently Keep unused openowners around for at least one lease period, to avoid the 
need for as many open confirmations and to allow handing out more delegations. Signed-off-by: J. Bruce Fields Signed-off-by: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nfsd/nfs4state.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1e2a89aaf895..96c7578cbe1e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1199,8 +1199,7 @@ move_to_close_lru(struct nfs4_stateowner *sop) { dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop); - unhash_stateowner(sop); - list_add_tail(&sop->so_close_lru, &close_lru); + list_move_tail(&sop->so_close_lru, &close_lru); sop->so_time = get_seconds(); } @@ -1929,8 +1928,7 @@ nfs4_laundromat(void) } dprintk("NFSD: purging unused open stateowner (so_id %d)\n", sop->so_id); - list_del(&sop->so_close_lru); - nfs4_put_stateowner(sop); + release_stateowner(sop); } if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT; @@ -3218,15 +3216,8 @@ __nfs4_state_shutdown(void) int i; struct nfs4_client *clp = NULL; struct nfs4_delegation *dp = NULL; - struct nfs4_stateowner *sop = NULL; struct list_head *pos, *next, reaplist; - list_for_each_safe(pos, next, &close_lru) { - sop = list_entry(pos, struct nfs4_stateowner, so_close_lru); - list_del(&sop->so_close_lru); - nfs4_put_stateowner(sop); - } - for (i = 0; i < CLIENT_HASH_SIZE; i++) { while (!list_empty(&conf_id_hashtbl[i])) { clp = list_entry(conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); -- cgit v1.2.3 From cbb7e577e732f576b9f399bc2600bdc0626c68dc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Apr 2006 14:57:50 +0200 Subject: [PATCH] splice: pass offset around for ->splice_read() and ->splice_write() We need not use ->f_pos as the offset for the file input/output. If the user passed an offset pointer in through sys_splice(), just use that and leave ->f_pos alone. 
Signed-off-by: Jens Axboe --- fs/splice.c | 86 ++++++++++++++++++++++---------------------- fs/xfs/linux-2.6/xfs_file.c | 12 ++++--- fs/xfs/linux-2.6/xfs_lrw.c | 14 ++++---- fs/xfs/linux-2.6/xfs_lrw.h | 4 +-- fs/xfs/linux-2.6/xfs_vnode.h | 12 +++---- 5 files changed, 68 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index e50a460239dd..5d3eda64703b 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -231,8 +231,9 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, } static int -__generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, - size_t len, unsigned int flags) +__generic_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { struct address_space *mapping = in->f_mapping; unsigned int offset, nr_pages; @@ -241,8 +242,8 @@ __generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, pgoff_t index; int i, error; - index = in->f_pos >> PAGE_CACHE_SHIFT; - offset = in->f_pos & ~PAGE_CACHE_MASK; + index = *ppos >> PAGE_CACHE_SHIFT; + offset = *ppos & ~PAGE_CACHE_MASK; nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (nr_pages > PIPE_BUFFERS) @@ -348,8 +349,9 @@ fill_it: * * Will read pages from given file and fill them into a pipe. 
*/ -ssize_t generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, - size_t len, unsigned int flags) +ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { ssize_t spliced; int ret; @@ -358,12 +360,12 @@ ssize_t generic_file_splice_read(struct file *in, struct pipe_inode_info *pipe, spliced = 0; while (len) { - ret = __generic_file_splice_read(in, pipe, len, flags); + ret = __generic_file_splice_read(in, ppos, pipe, len, flags); if (ret <= 0) break; - in->f_pos += ret; + *ppos += ret; len -= ret; spliced += ret; @@ -561,7 +563,7 @@ typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. */ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, - size_t len, unsigned int flags, + loff_t *ppos, size_t len, unsigned int flags, splice_actor *actor) { int ret, do_wakeup, err; @@ -573,7 +575,7 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, sd.total_len = len; sd.flags = flags; sd.file = out; - sd.pos = out->f_pos; + sd.pos = *ppos; if (pipe->inode) mutex_lock(&pipe->inode->i_mutex); @@ -656,9 +658,7 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - out->f_pos = sd.pos; return ret; - } /** @@ -674,12 +674,12 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, */ ssize_t generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, - size_t len, unsigned int flags) + loff_t *ppos, size_t len, unsigned int flags) { struct address_space *mapping = out->f_mapping; ssize_t ret; - ret = move_from_pipe(pipe, out, len, flags, pipe_to_file); + ret = move_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); /* * If file or inode is SYNC and we actually wrote some data, sync it. 
@@ -715,9 +715,9 @@ EXPORT_SYMBOL(generic_file_splice_write); * */ ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, - size_t len, unsigned int flags) + loff_t *ppos, size_t len, unsigned int flags) { - return move_from_pipe(pipe, out, len, flags, pipe_to_sendpage); + return move_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); } EXPORT_SYMBOL(generic_splice_sendpage); @@ -726,9 +726,8 @@ EXPORT_SYMBOL(generic_splice_sendpage); * Attempt to initiate a splice from pipe to file. */ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, - size_t len, unsigned int flags) + loff_t *ppos, size_t len, unsigned int flags) { - loff_t pos; int ret; if (unlikely(!out->f_op || !out->f_op->splice_write)) @@ -737,22 +736,21 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, if (unlikely(!(out->f_mode & FMODE_WRITE))) return -EBADF; - pos = out->f_pos; - - ret = rw_verify_area(WRITE, out, &pos, len); + ret = rw_verify_area(WRITE, out, ppos, len); if (unlikely(ret < 0)) return ret; - return out->f_op->splice_write(pipe, out, len, flags); + return out->f_op->splice_write(pipe, out, ppos, len, flags); } /* * Attempt to initiate a splice from a file to a pipe. 
*/ -static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, - size_t len, unsigned int flags) +static long do_splice_to(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { - loff_t pos, isize, left; + loff_t isize, left; int ret; if (unlikely(!in->f_op || !in->f_op->splice_read)) @@ -761,28 +759,27 @@ static long do_splice_to(struct file *in, struct pipe_inode_info *pipe, if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; - pos = in->f_pos; - - ret = rw_verify_area(READ, in, &pos, len); + ret = rw_verify_area(READ, in, ppos, len); if (unlikely(ret < 0)) return ret; isize = i_size_read(in->f_mapping->host); - if (unlikely(in->f_pos >= isize)) + if (unlikely(*ppos >= isize)) return 0; - left = isize - in->f_pos; + left = isize - *ppos; if (unlikely(left < len)) len = left; - return in->f_op->splice_read(in, pipe, len, flags); + return in->f_op->splice_read(in, ppos, pipe, len, flags); } -long do_splice_direct(struct file *in, struct file *out, size_t len, - unsigned int flags) +long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + size_t len, unsigned int flags) { struct pipe_inode_info *pipe; long ret, bytes; + loff_t out_off; umode_t i_mode; int i; @@ -820,6 +817,7 @@ long do_splice_direct(struct file *in, struct file *out, size_t len, */ ret = 0; bytes = 0; + out_off = 0; while (len) { size_t read_len, max_read_len; @@ -829,7 +827,7 @@ long do_splice_direct(struct file *in, struct file *out, size_t len, */ max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE)); - ret = do_splice_to(in, pipe, max_read_len, flags); + ret = do_splice_to(in, ppos, pipe, max_read_len, flags); if (unlikely(ret < 0)) goto out_release; @@ -840,7 +838,7 @@ long do_splice_direct(struct file *in, struct file *out, size_t len, * must not do the output in nonblocking mode as then we * could get stuck data in the internal pipe: */ - ret = do_splice_from(pipe, out, read_len, + ret = do_splice_from(pipe, 
out, &out_off, read_len, flags & ~SPLICE_F_NONBLOCK); if (unlikely(ret < 0)) goto out_release; @@ -898,6 +896,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, size_t len, unsigned int flags) { struct pipe_inode_info *pipe; + loff_t offset, *off; pipe = in->f_dentry->d_inode->i_pipe; if (pipe) { @@ -906,12 +905,13 @@ static long do_splice(struct file *in, loff_t __user *off_in, if (off_out) { if (out->f_op->llseek == no_llseek) return -EINVAL; - if (copy_from_user(&out->f_pos, off_out, - sizeof(loff_t))) + if (copy_from_user(&offset, off_out, sizeof(loff_t))) return -EFAULT; - } + off = &offset; + } else + off = &out->f_pos; - return do_splice_from(pipe, out, len, flags); + return do_splice_from(pipe, out, off, len, flags); } pipe = out->f_dentry->d_inode->i_pipe; @@ -921,11 +921,13 @@ static long do_splice(struct file *in, loff_t __user *off_in, if (off_in) { if (in->f_op->llseek == no_llseek) return -EINVAL; - if (copy_from_user(&in->f_pos, off_in, sizeof(loff_t))) + if (copy_from_user(&offset, off_in, sizeof(loff_t))) return -EFAULT; - } + off = &offset; + } else + off = &in->f_pos; - return do_splice_to(in, pipe, len, flags); + return do_splice_to(in, off, pipe, len, flags); } return -EINVAL; diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 269721af02f3..c847416f6d10 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -252,6 +252,7 @@ xfs_file_sendfile_invis( STATIC ssize_t xfs_file_splice_read( struct file *infilp, + loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) @@ -259,13 +260,14 @@ xfs_file_splice_read( vnode_t *vp = vn_from_inode(infilp->f_dentry->d_inode); ssize_t rval; - VOP_SPLICE_READ(vp, infilp, pipe, len, flags, 0, NULL, rval); + VOP_SPLICE_READ(vp, infilp, ppos, pipe, len, flags, 0, NULL, rval); return rval; } STATIC ssize_t xfs_file_splice_read_invis( struct file *infilp, + loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) 
@@ -273,7 +275,7 @@ xfs_file_splice_read_invis( vnode_t *vp = vn_from_inode(infilp->f_dentry->d_inode); ssize_t rval; - VOP_SPLICE_READ(vp, infilp, pipe, len, flags, IO_INVIS, NULL, rval); + VOP_SPLICE_READ(vp, infilp, ppos, pipe, len, flags, IO_INVIS, NULL, rval); return rval; } @@ -281,13 +283,14 @@ STATIC ssize_t xfs_file_splice_write( struct pipe_inode_info *pipe, struct file *outfilp, + loff_t *ppos, size_t len, unsigned int flags) { vnode_t *vp = vn_from_inode(outfilp->f_dentry->d_inode); ssize_t rval; - VOP_SPLICE_WRITE(vp, pipe, outfilp, len, flags, 0, NULL, rval); + VOP_SPLICE_WRITE(vp, pipe, outfilp, ppos, len, flags, 0, NULL, rval); return rval; } @@ -295,13 +298,14 @@ STATIC ssize_t xfs_file_splice_write_invis( struct pipe_inode_info *pipe, struct file *outfilp, + loff_t *ppos, size_t len, unsigned int flags) { vnode_t *vp = vn_from_inode(outfilp->f_dentry->d_inode); ssize_t rval; - VOP_SPLICE_WRITE(vp, pipe, outfilp, len, flags, IO_INVIS, NULL, rval); + VOP_SPLICE_WRITE(vp, pipe, outfilp, ppos, len, flags, IO_INVIS, NULL, rval); return rval; } diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 74a52937f208..67efe3308980 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -338,6 +338,7 @@ ssize_t xfs_splice_read( bhv_desc_t *bdp, struct file *infilp, + loff_t *ppos, struct pipe_inode_info *pipe, size_t count, int flags, @@ -360,7 +361,7 @@ xfs_splice_read( int error; error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), - infilp->f_pos, count, + *ppos, count, FILP_DELAY_FLAG(infilp), &locktype); if (error) { xfs_iunlock(ip, XFS_IOLOCK_SHARED); @@ -368,8 +369,8 @@ xfs_splice_read( } } xfs_rw_enter_trace(XFS_SPLICE_READ_ENTER, &ip->i_iocore, - pipe, count, infilp->f_pos, ioflags); - ret = generic_file_splice_read(infilp, pipe, count, flags); + pipe, count, *ppos, ioflags); + ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); @@ -382,6 
+383,7 @@ xfs_splice_write( bhv_desc_t *bdp, struct pipe_inode_info *pipe, struct file *outfilp, + loff_t *ppos, size_t count, int flags, int ioflags, @@ -403,7 +405,7 @@ xfs_splice_write( int error; error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, BHV_TO_VNODE(bdp), - outfilp->f_pos, count, + *ppos, count, FILP_DELAY_FLAG(outfilp), &locktype); if (error) { xfs_iunlock(ip, XFS_IOLOCK_EXCL); @@ -411,8 +413,8 @@ xfs_splice_write( } } xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, &ip->i_iocore, - pipe, count, outfilp->f_pos, ioflags); - ret = generic_file_splice_write(pipe, outfilp, count, flags); + pipe, count, *ppos, ioflags); + ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); if (ret > 0) XFS_STATS_ADD(xs_write_bytes, ret); diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h index 55c689a86ad2..8f4539952350 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.h +++ b/fs/xfs/linux-2.6/xfs_lrw.h @@ -93,11 +93,11 @@ extern ssize_t xfs_write(struct bhv_desc *, struct kiocb *, extern ssize_t xfs_sendfile(struct bhv_desc *, struct file *, loff_t *, int, size_t, read_actor_t, void *, struct cred *); -extern ssize_t xfs_splice_read(struct bhv_desc *, struct file *, +extern ssize_t xfs_splice_read(struct bhv_desc *, struct file *, loff_t *, struct pipe_inode_info *, size_t, int, int, struct cred *); extern ssize_t xfs_splice_write(struct bhv_desc *, struct pipe_inode_info *, - struct file *, size_t, int, int, + struct file *, loff_t *, size_t, int, int, struct cred *); #endif /* __XFS_LRW_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index 88b09f186289..2a8e16c22353 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -173,11 +173,11 @@ typedef ssize_t (*vop_write_t)(bhv_desc_t *, struct kiocb *, typedef ssize_t (*vop_sendfile_t)(bhv_desc_t *, struct file *, loff_t *, int, size_t, read_actor_t, void *, struct cred *); -typedef ssize_t (*vop_splice_read_t)(bhv_desc_t *, struct file *, +typedef ssize_t 
(*vop_splice_read_t)(bhv_desc_t *, struct file *, loff_t *, struct pipe_inode_info *, size_t, int, int, struct cred *); typedef ssize_t (*vop_splice_write_t)(bhv_desc_t *, struct pipe_inode_info *, - struct file *, size_t, int, int, + struct file *, loff_t *, size_t, int, int, struct cred *); typedef int (*vop_ioctl_t)(bhv_desc_t *, struct inode *, struct file *, int, unsigned int, void __user *); @@ -284,10 +284,10 @@ typedef struct vnodeops { rv = _VOP_(vop_write, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr) #define VOP_SENDFILE(vp,f,off,ioflags,cnt,act,targ,cr,rv) \ rv = _VOP_(vop_sendfile, vp)((vp)->v_fbhv,f,off,ioflags,cnt,act,targ,cr) -#define VOP_SPLICE_READ(vp,f,pipe,cnt,fl,iofl,cr,rv) \ - rv = _VOP_(vop_splice_read, vp)((vp)->v_fbhv,f,pipe,cnt,fl,iofl,cr) -#define VOP_SPLICE_WRITE(vp,f,pipe,cnt,fl,iofl,cr,rv) \ - rv = _VOP_(vop_splice_write, vp)((vp)->v_fbhv,f,pipe,cnt,fl,iofl,cr) +#define VOP_SPLICE_READ(vp,f,o,pipe,cnt,fl,iofl,cr,rv) \ + rv = _VOP_(vop_splice_read, vp)((vp)->v_fbhv,f,o,pipe,cnt,fl,iofl,cr) +#define VOP_SPLICE_WRITE(vp,f,o,pipe,cnt,fl,iofl,cr,rv) \ + rv = _VOP_(vop_splice_write, vp)((vp)->v_fbhv,f,o,pipe,cnt,fl,iofl,cr) #define VOP_BMAP(vp,of,sz,rw,b,n,rv) \ rv = _VOP_(vop_bmap, vp)((vp)->v_fbhv,of,sz,rw,b,n) #define VOP_OPEN(vp, cr, rv) \ -- cgit v1.2.3 From 70524490ee2ea1bbf6cee6c106597b3ac25a3fc2 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 11 Apr 2006 15:51:17 +0200 Subject: [PATCH] splice: add support for sys_tee() Basically an in-kernel implementation of tee, which uses splice and the pipe buffers as an intelligent way to pass data around by reference. Where the user space tee consumes the input and produces a stdout and file output, this syscall merely duplicates the data inside a pipe to another pipe. No data is copied, the output just grabs a reference to the input pipe data. 
Signed-off-by: Jens Axboe --- fs/pipe.c | 7 +++ fs/splice.c | 186 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 193 insertions(+) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index e984beb93a0e..7fefb10db8d9 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -131,12 +131,19 @@ static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, return 0; } +static void anon_pipe_buf_get(struct pipe_inode_info *info, + struct pipe_buffer *buf) +{ + page_cache_get(buf->page); +} + static struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, .map = anon_pipe_buf_map, .unmap = anon_pipe_buf_unmap, .release = anon_pipe_buf_release, .steal = anon_pipe_buf_steal, + .get = anon_pipe_buf_get, }; static ssize_t diff --git a/fs/splice.c b/fs/splice.c index 5d3eda64703b..8d57e89924a6 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -125,12 +125,19 @@ static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info, kunmap(buf->page); } +static void page_cache_pipe_buf_get(struct pipe_inode_info *info, + struct pipe_buffer *buf) +{ + page_cache_get(buf->page); +} + static struct pipe_buf_operations page_cache_pipe_buf_ops = { .can_merge = 0, .map = page_cache_pipe_buf_map, .unmap = page_cache_pipe_buf_unmap, .release = page_cache_pipe_buf_release, .steal = page_cache_pipe_buf_steal, + .get = page_cache_pipe_buf_get, }; /* @@ -963,3 +970,182 @@ asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, return error; } + +/* + * Link contents of ipipe to opipe. + */ +static int link_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags) +{ + struct pipe_buffer *ibuf, *obuf; + int ret = 0, do_wakeup = 0, i; + + /* + * Potential ABBA deadlock, work around it by ordering lock + * grabbing by inode address. Otherwise two different processes + * could deadlock (one doing tee from A -> B, the other from B -> A). 
+ */ + if (ipipe->inode < opipe->inode) { + mutex_lock(&ipipe->inode->i_mutex); + mutex_lock(&opipe->inode->i_mutex); + } else { + mutex_lock(&opipe->inode->i_mutex); + mutex_lock(&ipipe->inode->i_mutex); + } + + for (i = 0;; i++) { + if (!opipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + break; + } + if (ipipe->nrbufs - i) { + ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); + + /* + * If we have room, fill this buffer + */ + if (opipe->nrbufs < PIPE_BUFFERS) { + int nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); + + /* + * Get a reference to this pipe buffer, + * so we can copy the contents over. + */ + ibuf->ops->get(ipipe, ibuf); + + obuf = opipe->bufs + nbuf; + *obuf = *ibuf; + + if (obuf->len > len) + obuf->len = len; + + opipe->nrbufs++; + do_wakeup = 1; + ret += obuf->len; + len -= obuf->len; + + if (!len) + break; + if (opipe->nrbufs < PIPE_BUFFERS) + continue; + } + + /* + * We have input available, but no output room. + * If we already copied data, return that. 
+ */ + if (flags & SPLICE_F_NONBLOCK) { + if (!ret) + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + if (!ret) + ret = -ERESTARTSYS; + break; + } + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&opipe->wait)) + wake_up_interruptible(&opipe->wait); + kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); + do_wakeup = 0; + } + + opipe->waiting_writers++; + pipe_wait(opipe); + opipe->waiting_writers--; + continue; + } + + /* + * No input buffers, do the usual checks for available + * writers and blocking and wait if necessary + */ + if (!ipipe->writers) + break; + if (!ipipe->waiting_writers) { + if (ret) + break; + } + if (flags & SPLICE_F_NONBLOCK) { + if (!ret) + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + if (!ret) + ret = -ERESTARTSYS; + break; + } + + if (waitqueue_active(&ipipe->wait)) + wake_up_interruptible_sync(&ipipe->wait); + kill_fasync(&ipipe->fasync_writers, SIGIO, POLL_OUT); + + pipe_wait(ipipe); + } + + mutex_unlock(&ipipe->inode->i_mutex); + mutex_unlock(&opipe->inode->i_mutex); + + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&opipe->wait)) + wake_up_interruptible(&opipe->wait); + kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); + } + + return ret; +} + +/* + * This is a tee(1) implementation that works on pipes. It doesn't copy + * any data, it simply references the 'in' pages on the 'out' pipe. + * The 'flags' used are the SPLICE_F_* variants, currently the only + * applicable one is SPLICE_F_NONBLOCK. + */ +static long do_tee(struct file *in, struct file *out, size_t len, + unsigned int flags) +{ + struct pipe_inode_info *ipipe = in->f_dentry->d_inode->i_pipe; + struct pipe_inode_info *opipe = out->f_dentry->d_inode->i_pipe; + + /* + * Link ipipe to the two output pipes, consuming as we go along. 
+ */ + if (ipipe && opipe) + return link_pipe(ipipe, opipe, len, flags); + + return -EINVAL; +} + +asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags) +{ + struct file *in; + int error, fput_in; + + if (unlikely(!len)) + return 0; + + error = -EBADF; + in = fget_light(fdin, &fput_in); + if (in) { + if (in->f_mode & FMODE_READ) { + int fput_out; + struct file *out = fget_light(fdout, &fput_out); + + if (out) { + if (out->f_mode & FMODE_WRITE) + error = do_tee(in, out, len, flags); + fput_light(out, fput_out); + } + } + fput_light(in, fput_in); + } + + return error; +} -- cgit v1.2.3 From 73ce8355c243a434524a34c05cc417dd0467996e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 11 Apr 2006 21:14:26 +0200 Subject: [fuse] fix deadlock between fuse_put_super() and request_end() A deadlock was possible, when the last reference to the superblock was held due to a background request containing a file reference. Releasing the file would release the vfsmount which in turn would release the superblock. Since sbput_sem is held during the fput() and fuse_put_super() tries to acquire this same semaphore, a deadlock results. The chosen solution is to get rid of sbput_sem, and instead use the spinlock to ensure the referenced inodes/file are released only once. Since the actual release may sleep, defer these outside the locked region, but using local variables instead of the structure members. This is a much more robust solution.
Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 28 ++++++++++++++++------------ fs/fuse/fuse_i.h | 12 +++--------- fs/fuse/inode.c | 27 +++++++++++++++++---------- 3 files changed, 36 insertions(+), 31 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 6c740f860665..d4efb6223e2c 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -120,20 +120,14 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) } } -void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req) +void fuse_remove_background(struct fuse_conn *fc, struct fuse_req *req) { - iput(req->inode); - iput(req->inode2); - if (req->file) - fput(req->file); - spin_lock(&fc->lock); - list_del(&req->bg_entry); + list_del_init(&req->bg_entry); if (fc->num_background == FUSE_MAX_BACKGROUND) { fc->blocked = 0; wake_up_all(&fc->blocked_waitq); } fc->num_background--; - spin_unlock(&fc->lock); } /* @@ -163,17 +157,27 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) wake_up(&req->waitq); fuse_put_request(fc, req); } else { + struct inode *inode = req->inode; + struct inode *inode2 = req->inode2; + struct file *file = req->file; void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; req->end = NULL; + req->inode = NULL; + req->inode2 = NULL; + req->file = NULL; + if (!list_empty(&req->bg_entry)) + fuse_remove_background(fc, req); spin_unlock(&fc->lock); - down_read(&fc->sbput_sem); - if (fc->mounted) - fuse_release_background(fc, req); - up_read(&fc->sbput_sem); + if (end) end(fc, req); else fuse_put_request(fc, req); + + if (file) + fput(file); + iput(inode); + iput(inode2); } } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 19c7185a7546..ee9b83042510 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -255,15 +255,9 @@ struct fuse_conn { /** waitq for blocked connection */ wait_queue_head_t blocked_waitq; - /** RW semaphore for exclusion with fuse_put_super() */ - struct rw_semaphore sbput_sem; - /** The next unique 
request id */ u64 reqctr; - /** Mount is active */ - unsigned mounted; - /** Connection established, cleared on umount, connection abort and device release */ unsigned connected; @@ -474,11 +468,11 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req); void request_send_background(struct fuse_conn *fc, struct fuse_req *req); /** - * Release inodes and file associated with background request + * Remove request from the the background list */ -void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req); +void fuse_remove_background(struct fuse_conn *fc, struct fuse_req *req); -/* Abort all requests */ +/** Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc); /** diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index fd34037b0588..43a6fc0db8a7 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -204,17 +204,26 @@ static void fuse_put_super(struct super_block *sb) { struct fuse_conn *fc = get_fuse_conn_super(sb); - down_write(&fc->sbput_sem); - while (!list_empty(&fc->background)) - fuse_release_background(fc, - list_entry(fc->background.next, - struct fuse_req, bg_entry)); - spin_lock(&fc->lock); - fc->mounted = 0; fc->connected = 0; + while (!list_empty(&fc->background)) { + struct fuse_req *req = list_entry(fc->background.next, + struct fuse_req, bg_entry); + struct inode *inode = req->inode; + struct inode *inode2 = req->inode2; + + /* File would hold a reference to vfsmount */ + BUG_ON(req->file); + req->inode = NULL; + req->inode2 = NULL; + fuse_remove_background(fc, req); + + spin_unlock(&fc->lock); + iput(inode); + iput(inode2); + spin_lock(&fc->lock); + } spin_unlock(&fc->lock); - up_write(&fc->sbput_sem); /* Flush all readers on this fs */ kill_fasync(&fc->fasync, SIGIO, POLL_IN); wake_up_all(&fc->waitq); @@ -386,7 +395,6 @@ static struct fuse_conn *new_conn(void) INIT_LIST_HEAD(&fc->processing); INIT_LIST_HEAD(&fc->io); INIT_LIST_HEAD(&fc->background); - init_rwsem(&fc->sbput_sem); kobj_set_kset_s(fc, 
connections_subsys); kobject_init(&fc->kobj); atomic_set(&fc->num_waiting, 0); @@ -541,7 +549,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) goto err_free_req; sb->s_root = root_dentry; - fc->mounted = 1; fc->connected = 1; kobject_get(&fc->kobj); file->private_data = fc; -- cgit v1.2.3 From 9bc5dddad1294955e70eeb87325ba1505fb5fe2e Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 11 Apr 2006 21:16:09 +0200 Subject: [fuse] Fix accounting the number of waiting requests Properly accounting the number of waiting requests was forgotten in "clean up request accounting" patch. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 25 +++++++++++++++++++------ fs/fuse/fuse_i.h | 3 +++ 2 files changed, 22 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index d4efb6223e2c..8538b298a6b0 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -92,30 +92,39 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc) { struct fuse_req *req; sigset_t oldset; + int intr; int err; + atomic_inc(&fc->num_waiting); block_sigs(&oldset); - err = wait_event_interruptible(fc->blocked_waitq, !fc->blocked); + intr = wait_event_interruptible(fc->blocked_waitq, !fc->blocked); restore_sigs(&oldset); - if (err) - return ERR_PTR(-EINTR); + err = -EINTR; + if (intr) + goto out; req = fuse_request_alloc(); + err = -ENOMEM; if (!req) - return ERR_PTR(-ENOMEM); + goto out; - atomic_inc(&fc->num_waiting); fuse_request_init(req); req->in.h.uid = current->fsuid; req->in.h.gid = current->fsgid; req->in.h.pid = current->pid; + req->waiting = 1; return req; + + out: + atomic_dec(&fc->num_waiting); + return ERR_PTR(err); } void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { if (atomic_dec_and_test(&req->count)) { - atomic_dec(&fc->num_waiting); + if (req->waiting) + atomic_dec(&fc->num_waiting); fuse_request_free(req); } } @@ -281,6 +290,10 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req) 
len_args(req->in.numargs, (struct fuse_arg *) req->in.args); list_add_tail(&req->list, &fc->pending); req->state = FUSE_REQ_PENDING; + if (!req->waiting) { + req->waiting = 1; + atomic_inc(&fc->num_waiting); + } wake_up(&fc->waitq); kill_fasync(&fc->fasync, SIGIO, POLL_IN); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index ee9b83042510..59661c481d9d 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -159,6 +159,9 @@ struct fuse_req { /** Data is being copied to/from the request */ unsigned locked:1; + /** Request is counted as "waiting" */ + unsigned waiting:1; + /** State of the request */ enum fuse_req_state state; -- cgit v1.2.3 From 4858cae4f0904681eab58a16891c22397618a2a2 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 11 Apr 2006 21:16:38 +0200 Subject: [fuse] Don't init request twice Request is already initialized in fuse_request_alloc() so no need to do it again in fuse_get_req(). Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 8538b298a6b0..cc750c68fe70 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -108,7 +108,6 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc) if (!req) goto out; - fuse_request_init(req); req->in.h.uid = current->fsuid; req->in.h.gid = current->fsgid; req->in.h.pid = current->pid; -- cgit v1.2.3 From 56cf34ff0795692327234963dcdcc2cdeec2bb3d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 11 Apr 2006 21:16:51 +0200 Subject: [fuse] Direct I/O should not use fuse_reset_request It's cleaner to allocate a new request, otherwise the uid/gid/pid fields of the request won't be filled in. 
Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e4f041a11bb5..fc342cf7c2cc 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi + Copyright (C) 2001-2006 Miklos Szeredi This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -565,8 +565,12 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf, buf += nres; if (nres != nbytes) break; - if (count) - fuse_reset_request(req); + if (count) { + fuse_put_request(fc, req); + req = fuse_get_req(fc); + if (IS_ERR(req)) + break; + } } fuse_put_request(fc, req); if (res > 0) { -- cgit v1.2.3 From c06511d12d720b23c8dffff23004f0a888698f20 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 14 Apr 2006 04:05:55 -0600 Subject: [PATCH] de_thread: Don't change our parents and ptrace flags. This is two distinct changes. - Not changing our real parents. - Not changing our ptrace parents. Not changing our real parents is trivially correct because both tasks have the same real parents as they are part of a thread group. Now that we demote the leader to a thread there is no longer any reason to change its parentage. Not changing our ptrace parents is a user visible change if someone looks hard enough. I don't think user space applications will care or even notice. In the practical and I think common case a debugger will have attached to all of the threads using the same ptrace flags. From my quick skim of strace and gdb that appears to be the case. Which if true means debuggers will not notice a change. Before this point we have already generated a ptrace event in do_exit that reports the leader's pid has died so de_thread is visible to a debugger. Which means attempting to hide this case by copying flags around appears excessive.
By not doing anything it avoids all of the weird locking issues between de_thread and ptrace attach, and removes one case from consideration for fixing the ptrace locking. This only addresses Oleg's first concern with ptrace_attach, that of the problems caused by reparenting. Oleg's second concern is essentially a race between ptrace_attach and release_task that causes an oops when we get to force_sig_specific. There is nothing special about de_thread with respect to that race. Signed-off-by: Eric W. Biederman Signed-off-by: Linus Torvalds --- fs/exec.c | 27 --------------------------- 1 file changed, 27 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 3234a0c32d54..4121bb559739 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -665,9 +665,7 @@ static int de_thread(struct task_struct *tsk) * and to assume its PID: */ if (!thread_group_leader(current)) { - struct task_struct *parent; struct dentry *proc_dentry1, *proc_dentry2; - unsigned long ptrace; /* * Wait for the thread group leader to be a zombie. @@ -704,22 +702,6 @@ static int de_thread(struct task_struct *tsk) * two threads with a switched PID, and release * the former thread group leader: */ - ptrace = leader->ptrace; - parent = leader->parent; - if (unlikely(ptrace) && unlikely(parent == current)) { - /* - * Joker was ptracing his own group leader, - * and now he wants to be his own parent! - * We can't have that. - */ - ptrace = 0; - } - - ptrace_unlink(current); - ptrace_unlink(leader); - remove_parent(current); - remove_parent(leader); - /* Become a process group leader with the old leader's pid. 
* Note: The old leader also uses thispid until release_task @@ -732,8 +714,6 @@ static int de_thread(struct task_struct *tsk) attach_pid(current, PIDTYPE_SID, current->signal->session); list_add_tail(&current->tasks, &init_task.tasks); - current->parent = current->real_parent = leader->real_parent; - leader->parent = leader->real_parent = child_reaper; current->group_leader = current; leader->group_leader = current; @@ -742,13 +722,6 @@ static int de_thread(struct task_struct *tsk) detach_pid(leader, PIDTYPE_SID); list_del_init(&leader->tasks); - add_parent(current); - add_parent(leader); - if (ptrace) { - current->ptrace = ptrace; - __ptrace_link(current, parent); - } - current->exit_signal = SIGCHLD; BUG_ON(leader->exit_state != EXIT_ZOMBIE); -- cgit v1.2.3 From 4508a7a734b111b8b7e39986237d84acb1168dd0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 20 Mar 2006 17:53:53 +1100 Subject: [PATCH] sysfs: Allow sysfs attribute files to be pollable It works like this: Open the file Read all the contents. Call poll requesting POLLERR or POLLPRI (so select/exceptfds works) When poll returns, close the file and go to top of loop. or lseek to start of file and go back to the 'read'. Events are signaled by an object manager calling sysfs_notify(kobj, dir, attr); If the dir is non-NULL, it is used to find a subdirectory which contains the attribute (presumably created by sysfs_create_group). This has a cost of one int per attribute, one wait_queuehead per kobject, one int per open file. The name "sysfs_notify" may be confused with the inotify functionality. Maybe it would be nice to support inotify for sysfs attributes as well?
This patch also uses sysfs_notify to allow /sys/block/md*/md/sync_action to be pollable Signed-off-by: Neil Brown Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 1 + fs/sysfs/file.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ fs/sysfs/sysfs.h | 1 + 3 files changed, 78 insertions(+) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 6cfdc9a87772..610b5bdbe75b 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -43,6 +43,7 @@ static struct sysfs_dirent * sysfs_new_dirent(struct sysfs_dirent * parent_sd, memset(sd, 0, sizeof(*sd)); atomic_set(&sd->s_count, 1); + atomic_set(&sd->s_event, 0); INIT_LIST_HEAD(&sd->s_children); list_add(&sd->s_sibling, &parent_sd->s_children); sd->s_element = element; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index f1cb1ddde511..cf3786625bfa 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -57,6 +58,7 @@ struct sysfs_buffer { struct sysfs_ops * ops; struct semaphore sem; int needs_read_fill; + int event; }; @@ -72,6 +74,7 @@ struct sysfs_buffer { */ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer) { + struct sysfs_dirent * sd = dentry->d_fsdata; struct attribute * attr = to_attr(dentry); struct kobject * kobj = to_kobj(dentry->d_parent); struct sysfs_ops * ops = buffer->ops; @@ -83,6 +86,7 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer if (!buffer->page) return -ENOMEM; + buffer->event = atomic_read(&sd->s_event); count = ops->show(kobj,attr,buffer->page); buffer->needs_read_fill = 0; BUG_ON(count > (ssize_t)PAGE_SIZE); @@ -348,12 +352,84 @@ static int sysfs_release(struct inode * inode, struct file * filp) return 0; } +/* Sysfs attribute files are pollable. The idea is that you read + * the content and then you use 'poll' or 'select' to wait for + * the content to change. 
When the content changes (assuming the + * manager for the kobject supports notification), poll will + * return POLLERR|POLLPRI, and select will return the fd whether + * it is waiting for read, write, or exceptions. + * Once poll/select indicates that the value has changed, you + * need to close and re-open the file, as simply seeking and reading + * again will not get new data, or reset the state of 'poll'. + * Reminder: this only works for attributes which actively support + * it, and it is not possible to test an attribute from userspace + * to see if it supports poll (Neither 'poll' nor 'select' returns + * an appropriate error code). When in doubt, set a suitable timeout value. + */ +static unsigned int sysfs_poll(struct file *filp, poll_table *wait) +{ + struct sysfs_buffer * buffer = filp->private_data; + struct kobject * kobj = to_kobj(filp->f_dentry->d_parent); + struct sysfs_dirent * sd = filp->f_dentry->d_fsdata; + int res = 0; + + poll_wait(filp, &kobj->poll, wait); + + if (buffer->event != atomic_read(&sd->s_event)) { + res = POLLERR|POLLPRI; + buffer->needs_read_fill = 1; + } + + return res; +} + + +static struct dentry *step_down(struct dentry *dir, const char * name) +{ + struct dentry * de; + + if (dir == NULL || dir->d_inode == NULL) + return NULL; + + mutex_lock(&dir->d_inode->i_mutex); + de = lookup_one_len(name, dir, strlen(name)); + mutex_unlock(&dir->d_inode->i_mutex); + dput(dir); + if (IS_ERR(de)) + return NULL; + if (de->d_inode == NULL) { + dput(de); + return NULL; + } + return de; +} + +void sysfs_notify(struct kobject * k, char *dir, char *attr) +{ + struct dentry *de = k->dentry; + if (de) + dget(de); + if (de && dir) + de = step_down(de, dir); + if (de && attr) + de = step_down(de, attr); + if (de) { + struct sysfs_dirent * sd = de->d_fsdata; + if (sd) + atomic_inc(&sd->s_event); + wake_up_interruptible(&k->poll); + dput(de); + } +} +EXPORT_SYMBOL_GPL(sysfs_notify); + const struct file_operations sysfs_file_operations = { .read =
sysfs_read_file, .write = sysfs_write_file, .llseek = generic_file_llseek, .open = sysfs_open_file, .release = sysfs_release, + .poll = sysfs_poll, }; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 32958a7c50e9..3651ffb5ec09 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -11,6 +11,7 @@ extern int sysfs_make_dirent(struct sysfs_dirent *, struct dentry *, void *, extern int sysfs_add_file(struct dentry *, const struct attribute *, int); extern void sysfs_hash_and_remove(struct dentry * dir, const char * name); +extern struct sysfs_dirent *sysfs_find(struct sysfs_dirent *dir, const char * name); extern int sysfs_create_subdir(struct kobject *, const char *, struct dentry **); extern void sysfs_remove_subdir(struct dentry *); -- cgit v1.2.3 From d4d7e5dffc4844ef51fe11f497bd774c04413a00 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 24 Mar 2006 20:45:35 +0100 Subject: [PATCH] BLOCK: delay all uevents until partition table is scanned [BLOCK] delay all uevents until partition table is scanned Here we delay the announcement of all block device events until the disk's partition table is scanned and all partition devices are already created and sysfs is populated. We have a bunch of old bugs for removable storage handling where we probe successfully for a filesystem on the raw disk, but at the same time the kernel recognizes a partition table and creates partition devices. Currently there is no sane way to tell if partitions will show up or not at the time the disk device is announced to userspace. With the delayed events we can simply skip any probe for a filesystem on the raw disk when we find already present partitions.
Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- fs/partitions/check.c | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index af0cb4b9e784..f3b6af071722 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -331,7 +331,9 @@ void delete_partition(struct gendisk *disk, int part) devfs_remove("%s/part%d", disk->devfs_name, part); if (p->holder_dir) kobject_unregister(p->holder_dir); - kobject_unregister(&p->kobj); + kobject_uevent(&p->kobj, KOBJ_REMOVE); + kobject_del(&p->kobj); + kobject_put(&p->kobj); } void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len) @@ -357,7 +359,10 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len) snprintf(p->kobj.name,KOBJ_NAME_LEN,"%s%d",disk->kobj.name,part); p->kobj.parent = &disk->kobj; p->kobj.ktype = &ktype_part; - kobject_register(&p->kobj); + kobject_init(&p->kobj); + kobject_add(&p->kobj); + if (!disk->part_uevent_suppress) + kobject_uevent(&p->kobj, KOBJ_ADD); partition_sysfs_add_subdir(p); disk->part[part-1] = p; } @@ -395,6 +400,8 @@ void register_disk(struct gendisk *disk) { struct block_device *bdev; char *s; + int i; + struct hd_struct *p; int err; strlcpy(disk->kobj.name,disk->disk_name,KOBJ_NAME_LEN); @@ -406,13 +413,12 @@ void register_disk(struct gendisk *disk) return; disk_sysfs_symlinks(disk); disk_sysfs_add_subdirs(disk); - kobject_uevent(&disk->kobj, KOBJ_ADD); /* No minors to use for partitions */ if (disk->minors == 1) { if (disk->devfs_name[0] != '\0') devfs_add_disk(disk); - return; + goto exit; } /* always add handle for the whole disk */ @@ -420,16 +426,32 @@ void register_disk(struct gendisk *disk) /* No such device (e.g., media were just removed) */ if (!get_capacity(disk)) - return; + goto exit; bdev = bdget_disk(disk, 0); if (!bdev) - return; + goto exit; + /* scan partition table, but suppress 
uevents */ bdev->bd_invalidated = 1; - if (blkdev_get(bdev, FMODE_READ, 0) < 0) - return; + disk->part_uevent_suppress = 1; + err = blkdev_get(bdev, FMODE_READ, 0); + disk->part_uevent_suppress = 0; + if (err < 0) + goto exit; blkdev_put(bdev); + +exit: + /* announce disk after possible partitions are already created */ + kobject_uevent(&disk->kobj, KOBJ_ADD); + + /* announce possible partitions */ + for (i = 1; i < disk->minors; i++) { + p = disk->part[i-1]; + if (!p || !p->nr_sects) + continue; + kobject_uevent(&p->kobj, KOBJ_ADD); + } } int rescan_partitions(struct gendisk *disk, struct block_device *bdev) -- cgit v1.2.3 From 2436f039d26a91e5404974ee0cb789b17db46168 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Mon, 10 Apr 2006 00:17:20 -0700 Subject: [PATCH] Fix block device symlink name As noted further on in this file, some block devices have a / in their name, so fix the "block:..." symlink name the same way as the /sys/block name. Signed-off-by: Stephen Rothwell Cc: Al Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/partitions/check.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index f3b6af071722..45ae7dd3c650 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -372,6 +372,7 @@ static char *make_block_name(struct gendisk *disk) char *name; static char *block_str = "block:"; int size; + char *s; size = strlen(block_str) + strlen(disk->disk_name) + 1; name = kmalloc(size, GFP_KERNEL); @@ -379,6 +380,10 @@ static char *make_block_name(struct gendisk *disk) return NULL; strcpy(name, block_str); strcat(name, disk->disk_name); + /* ewww... some of these buggers have / in name...
*/ + s = strchr(name, '/'); + if (s) + *s = '!'; return name; } -- cgit v1.2.3 From 75616cf9854b83eb83a968b1338ae0ee11c9673c Mon Sep 17 00:00:00 2001 From: "Ananiev, Leonid I" Date: Mon, 10 Apr 2006 22:54:38 -0700 Subject: [PATCH] ext3: Fix missed mutex unlock A missing unlock_super() call is added in the error-condition code path. Signed-off-by: Leonid Ananiev Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/ext3/resize.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 14f5f6ea3e72..c5ffa8523968 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -767,6 +767,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) if (input->group != sbi->s_groups_count) { ext3_warning(sb, __FUNCTION__, "multiple resizers run on filesystem!"); + unlock_super(sb); err = -EBUSY; goto exit_journal; } -- cgit v1.2.3 From 0a489cb3b6a7b277030cdbc97c2c65905db94536 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 18 Apr 2006 13:02:48 -0700 Subject: x86: don't allow tail-calls in sys_ftruncate[64]() Gcc thinks it owns the incoming argument stack, but that's not true for "asmlinkage" functions, and it corrupts the caller-set-up argument stack when it pushes the third argument onto the stack. Which can result in %ebx getting corrupted in user space. Now, normally nobody sane would ever notice, since libc will save and restore %ebx anyway over the system call, but it's still wrong. I'd much rather have "asmlinkage" tell gcc directly that it doesn't own the stack, but no such attribute exists, so we're stuck with our hacky manual "prevent_tail_call()" macro once more (we've had the same issue before with sys_waitpid() and sys_wait4()). Thanks to Hans-Werner Hilse for reporting the issue and testing the fix.
Signed-off-by: Linus Torvalds --- fs/open.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index c32c89d6d8db..8279c65d3bef 100644 --- a/fs/open.c +++ b/fs/open.c @@ -331,7 +331,9 @@ out: asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length) { - return do_sys_ftruncate(fd, length, 1); + long ret = do_sys_ftruncate(fd, length, 1); + prevent_tail_call(ret); + return ret; } /* LFS versions of truncate are only needed on 32 bit machines */ @@ -343,7 +345,9 @@ asmlinkage long sys_truncate64(const char __user * path, loff_t length) asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length) { - return do_sys_ftruncate(fd, length, 0); + long ret = do_sys_ftruncate(fd, length, 0); + prevent_tail_call(ret); + return ret; } #endif -- cgit v1.2.3 From 385910f2b275a636238f70844f1b6da9fda6f2da Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 18 Apr 2006 13:22:59 -0700 Subject: x86: be careful about tailcall breakage for sys_open[at] too Came up through a quick grep for other cases similar to the ftruncate() one in commit 0a489cb3b6a7b277030cdbc97c2c65905db94536. Also, add a comment, so that people who read the code understand why we do what looks like a no-op. 
(Again, this won't actually matter to any sane user, since libc will save and restore the register gcc stomps on, but it's still wrong to stomp on it) Signed-off-by: Linus Torvalds --- fs/open.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/open.c b/fs/open.c index 8279c65d3bef..53ec28c36777 100644 --- a/fs/open.c +++ b/fs/open.c @@ -332,6 +332,7 @@ out: asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length) { long ret = do_sys_ftruncate(fd, length, 1); + /* avoid REGPARM breakage on x86: */ prevent_tail_call(ret); return ret; } @@ -346,6 +347,7 @@ asmlinkage long sys_truncate64(const char __user * path, loff_t length) asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length) { long ret = do_sys_ftruncate(fd, length, 0); + /* avoid REGPARM breakage on x86: */ prevent_tail_call(ret); return ret; } @@ -1097,20 +1099,30 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode) asmlinkage long sys_open(const char __user *filename, int flags, int mode) { + long ret; + if (force_o_largefile()) flags |= O_LARGEFILE; - return do_sys_open(AT_FDCWD, filename, flags, mode); + ret = do_sys_open(AT_FDCWD, filename, flags, mode); + /* avoid REGPARM breakage on x86: */ + prevent_tail_call(ret); + return ret; } EXPORT_SYMBOL_GPL(sys_open); asmlinkage long sys_openat(int dfd, const char __user *filename, int flags, int mode) { + long ret; + if (force_o_largefile()) flags |= O_LARGEFILE; - return do_sys_open(dfd, filename, flags, mode); + ret = do_sys_open(dfd, filename, flags, mode); + /* avoid REGPARM breakage on x86: */ + prevent_tail_call(ret); + return ret; } EXPORT_SYMBOL_GPL(sys_openat); -- cgit v1.2.3 From 91ad66ef4469cb631ec0ccd131b07f16770773f7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Apr 2006 15:55:10 +0200 Subject: [PATCH] splice: close i_size truncate races on read We need to check i_size after doing a blocking readpage. 
Signed-off-by: Jens Axboe --- fs/splice.c | 43 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 8d57e89924a6..7e8585574726 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -145,8 +145,8 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = { * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). */ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, - int nr_pages, unsigned long offset, - unsigned long len, unsigned int flags) + int nr_pages, unsigned long len, + unsigned int offset, unsigned int flags) { int ret, do_wakeup, i; @@ -243,14 +243,16 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, unsigned int flags) { struct address_space *mapping = in->f_mapping; - unsigned int offset, nr_pages; + unsigned int loff, offset, nr_pages; struct page *pages[PIPE_BUFFERS]; struct page *page; - pgoff_t index; + pgoff_t index, end_index; + loff_t isize; + size_t bytes; int i, error; index = *ppos >> PAGE_CACHE_SHIFT; - offset = *ppos & ~PAGE_CACHE_MASK; + loff = offset = *ppos & ~PAGE_CACHE_MASK; nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (nr_pages > PIPE_BUFFERS) @@ -268,6 +270,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, * Now fill in the holes: */ error = 0; + bytes = 0; for (i = 0; i < nr_pages; i++, index++) { find_page: /* @@ -336,13 +339,41 @@ readpage: goto find_page; break; } + + /* + * i_size must be checked after ->readpage(). 
+ */ + isize = i_size_read(mapping->host); + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!isize || index > end_index)) { + page_cache_release(page); + break; + } + + /* + * if this is the last page, see if we need to shrink + * the length and stop + */ + if (end_index == index) { + loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK); + if (bytes + loff > isize) { + page_cache_release(page); + break; + } + /* + * force quit after adding this page + */ + nr_pages = i; + } } fill_it: pages[i] = page; + bytes += PAGE_CACHE_SIZE - loff; + loff = 0; } if (i) - return move_to_pipe(pipe, pages, i, offset, len, flags); + return move_to_pipe(pipe, pages, i, bytes, offset, flags); return error; } -- cgit v1.2.3 From c4f895cbe1e95aab633207fb19c650b7c984c01a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Apr 2006 15:56:12 +0200 Subject: [PATCH] splice: cleanup the SPLICE_F_NONBLOCK handling - generic_file_splice_read() more readable and correct - Don't bail on page allocation with NONBLOCK set, just don't allow direct blocking on IO (eg lock_page). 
Signed-off-by: Jens Axboe --- fs/splice.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 7e8585574726..78cd264340f2 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -278,14 +278,6 @@ find_page: */ page = find_get_page(mapping, index); if (!page) { - /* - * If in nonblock mode then dont block on - * readpage (we've kicked readahead so there - * will be asynchronous progress): - */ - if (flags & SPLICE_F_NONBLOCK) - break; - /* * page didn't exist, allocate one */ @@ -307,6 +299,13 @@ find_page: * If the page isn't uptodate, we may need to start io on it */ if (!PageUptodate(page)) { + /* + * If in nonblock mode then dont block on waiting + * for an in-flight io page + */ + if (flags & SPLICE_F_NONBLOCK) + break; + lock_page(page); /* @@ -400,17 +399,20 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, while (len) { ret = __generic_file_splice_read(in, ppos, pipe, len, flags); - if (ret <= 0) + if (ret < 0) break; + else if (!ret) { + if (spliced) + break; + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + } *ppos += ret; len -= ret; spliced += ret; - - if (!(flags & SPLICE_F_NONBLOCK)) - continue; - ret = -EAGAIN; - break; } if (spliced) -- cgit v1.2.3 From 2a27250e6cf47ca1ea3bea0a55e4b7889c097627 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Apr 2006 15:56:40 +0200 Subject: [PATCH] tee: link_pipe() must be careful when dropping one of the pipe locks We need to ensure that we only drop a lock that is ordered last, to avoid ABBA deadlocks with competing processes. 
Signed-off-by: Jens Axboe --- fs/splice.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 78cd264340f2..4f5e6b09fb26 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1012,7 +1012,9 @@ static int link_pipe(struct pipe_inode_info *ipipe, size_t len, unsigned int flags) { struct pipe_buffer *ibuf, *obuf; - int ret = 0, do_wakeup = 0, i; + int ret, do_wakeup, i, ipipe_first; + + ret = do_wakeup = ipipe_first = 0; /* * Potential ABBA deadlock, work around it by ordering lock @@ -1020,6 +1022,7 @@ static int link_pipe(struct pipe_inode_info *ipipe, * could deadlock (one doing tee from A -> B, the other from B -> A). */ if (ipipe->inode < opipe->inode) { + ipipe_first = 1; mutex_lock(&ipipe->inode->i_mutex); mutex_lock(&opipe->inode->i_mutex); } else { @@ -1068,9 +1071,11 @@ static int link_pipe(struct pipe_inode_info *ipipe, /* * We have input available, but no output room. - * If we already copied data, return that. + * If we already copied data, return that. If we + * need to drop the opipe lock, it must be ordered + * last to avoid deadlocks. */ - if (flags & SPLICE_F_NONBLOCK) { + if ((flags & SPLICE_F_NONBLOCK) || !ipipe_first) { if (!ret) ret = -EAGAIN; break; @@ -1104,7 +1109,12 @@ static int link_pipe(struct pipe_inode_info *ipipe, if (ret) break; } - if (flags & SPLICE_F_NONBLOCK) { + /* + * pipe_wait() drops the ipipe mutex. To avoid deadlocks + * with another process, we can only safely do that if + * the ipipe lock is ordered last. + */ + if ((flags & SPLICE_F_NONBLOCK) || ipipe_first) { if (!ret) ret = -EAGAIN; break; -- cgit v1.2.3 From a4514ebd8e12c63c09ab02be518db545bd1d24af Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Apr 2006 15:57:05 +0200 Subject: [PATCH] splice: offset fixes - We need to adjust *ppos for writes as well. - Copy back modified offset value if one was passed in, similar to what sendfile does. 
Signed-off-by: Jens Axboe --- fs/splice.c | 45 ++++++++++++++++++++++++++++++--------------- 1 file changed, 30 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 4f5e6b09fb26..27d6408ff490 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -720,22 +720,26 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, ssize_t ret; ret = move_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); - - /* - * If file or inode is SYNC and we actually wrote some data, sync it. - */ - if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(mapping->host)) - && ret > 0) { + if (ret > 0) { struct inode *inode = mapping->host; - int err; - mutex_lock(&inode->i_mutex); - err = generic_osync_inode(mapping->host, mapping, - OSYNC_METADATA|OSYNC_DATA); - mutex_unlock(&inode->i_mutex); + *ppos += ret; + + /* + * If file or inode is SYNC and we actually wrote some data, + * sync it. + */ + if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { + int err; + + mutex_lock(&inode->i_mutex); + err = generic_osync_inode(inode, mapping, + OSYNC_METADATA|OSYNC_DATA); + mutex_unlock(&inode->i_mutex); - if (err) - ret = err; + if (err) + ret = err; + } } return ret; @@ -937,6 +941,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, { struct pipe_inode_info *pipe; loff_t offset, *off; + long ret; pipe = in->f_dentry->d_inode->i_pipe; if (pipe) { @@ -951,7 +956,12 @@ static long do_splice(struct file *in, loff_t __user *off_in, } else off = &out->f_pos; - return do_splice_from(pipe, out, off, len, flags); + ret = do_splice_from(pipe, out, off, len, flags); + + if (off_out && copy_to_user(off_out, off, sizeof(loff_t))) + ret = -EFAULT; + + return ret; } pipe = out->f_dentry->d_inode->i_pipe; @@ -967,7 +977,12 @@ static long do_splice(struct file *in, loff_t __user *off_in, } else off = &in->f_pos; - return do_splice_to(in, off, pipe, len, flags); + ret = do_splice_to(in, off, pipe, len, flags); + + if (off_in && 
copy_to_user(off_in, off, sizeof(loff_t))) + ret = -EFAULT; + + return ret; } return -EINVAL; -- cgit v1.2.3 From 9e0267c26e237f84f608a68e579bf4eb89dad819 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 19 Apr 2006 15:57:31 +0200 Subject: [PATCH] splice: fixup writeout path after ->map changes Since ->map() no longer locks the page, we need to adjust the handling of those pages (and stealing) a little. This now passes full regressions again. Signed-off-by: Jens Axboe --- fs/splice.c | 49 ++++++++++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 27d6408ff490..22fac87e90b3 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -50,7 +50,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, struct page *page = buf->page; struct address_space *mapping = page_mapping(page); - WARN_ON(!PageLocked(page)); + lock_page(page); + WARN_ON(!PageUptodate(page)); /* @@ -65,8 +66,10 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, if (PagePrivate(page)) try_to_release_page(page, mapping_gfp_mask(mapping)); - if (!remove_mapping(mapping, page)) + if (!remove_mapping(mapping, page)) { + unlock_page(page); return 1; + } buf->flags |= PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU; return 0; @@ -507,14 +510,12 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, if (sd->flags & SPLICE_F_MOVE) { /* * If steal succeeds, buf->page is now pruned from the vm - * side (LRU and page cache) and we can reuse it. + * side (LRU and page cache) and we can reuse it. The page + * will also be locked on successful return.
*/ if (buf->ops->steal(info, buf)) goto find_page; - /* - * this will also set the page locked - */ page = buf->page; if (add_to_page_cache(page, mapping, index, gfp_mask)) goto find_page; @@ -523,15 +524,27 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, lru_cache_add(page); } else { find_page: - ret = -ENOMEM; - page = find_or_create_page(mapping, index, gfp_mask); - if (!page) - goto out_nomem; + page = find_lock_page(mapping, index); + if (!page) { + ret = -ENOMEM; + page = page_cache_alloc_cold(mapping); + if (unlikely(!page)) + goto out_nomem; + + /* + * This will also lock the page + */ + ret = add_to_page_cache_lru(page, mapping, index, + gfp_mask); + if (unlikely(ret)) + goto out; + } /* - * If the page is uptodate, it is also locked. If it isn't - * uptodate, we can mark it uptodate if we are filling the - * full page. Otherwise we need to read it in first... + * We get here with the page locked. If the page is also + * uptodate, we don't need to do more. If it isn't, we + * may need to bring it in if we are not going to overwrite + * the full page. */ if (!PageUptodate(page)) { if (sd->len < PAGE_CACHE_SIZE) { @@ -553,10 +566,8 @@ find_page: ret = -EIO; goto out; } - } else { - WARN_ON(!PageLocked(page)); + } else SetPageUptodate(page); - } } } @@ -585,10 +596,10 @@ find_page: mark_page_accessed(page); balance_dirty_pages_ratelimited(mapping); out: - if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { + if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) page_cache_release(page); - unlock_page(page); - } + + unlock_page(page); out_nomem: buf->ops->unmap(info, buf); return ret; -- cgit v1.2.3 From 5e85d4abe3f43bb5362f384bab0e20ef082ce0b5 Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Tue, 18 Apr 2006 22:20:16 -0700 Subject: [PATCH] task: Make task list manipulations RCU safe While we can currently walk through thread groups, process groups, and sessions with just the rcu_read_lock, this opens the door to walking the entire task list. We already have all of the other RCU guarantees so there is no cost in doing this, this should be enough so that proc can stop taking the tasklist lock during readdir. prev_task was killed because it has no users, and using it will miss new tasks when doing an rcu traversal. Signed-off-by: Eric W. Biederman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 4121bb559739..3a79d97ac234 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -712,7 +712,7 @@ static int de_thread(struct task_struct *tsk) attach_pid(current, PIDTYPE_PID, current->pid); attach_pid(current, PIDTYPE_PGID, current->signal->pgrp); attach_pid(current, PIDTYPE_SID, current->signal->session); - list_add_tail(&current->tasks, &init_task.tasks); + list_add_tail_rcu(&current->tasks, &init_task.tasks); current->group_leader = current; leader->group_leader = current; -- cgit v1.2.3 From dda27d1a55e185b0c5fd184b86ac26c66846f095 Mon Sep 17 00:00:00 2001 From: Arthur Othieno Date: Tue, 18 Apr 2006 22:20:57 -0700 Subject: [PATCH] hugetlbfs: add Kconfig help text In kernel bugzilla #6248 (http://bugzilla.kernel.org/show_bug.cgi?id=6248), Adrian Bunk notes that CONFIG_HUGETLBFS is missing Kconfig help text.
Signed-off-by: Arthur Othieno Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 2524629dc835..f9b5842c8d2d 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -842,6 +842,12 @@ config TMPFS config HUGETLBFS bool "HugeTLB file system support" depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || BROKEN + help + hugetlbfs is a filesystem backing for HugeTLB pages, based on + ramfs. For architectures that support it, say Y here and read + <file:Documentation/vm/hugetlbpage.txt> for details. + + If unsure, say N. config HUGETLB_PAGE def_bool HUGETLBFS -- cgit v1.2.3 From ca99c1da080345e227cfb083c330a184d42e27f3 Mon Sep 17 00:00:00 2001 From: Dipankar Sarma Date: Tue, 18 Apr 2006 22:21:46 -0700 Subject: [PATCH] Fix file lookup without ref There are places in the kernel where we look up files in fd tables and access the file structure without holding references to the file. So, we need special care to avoid the race between looking up files in the fd table and tearing down of the file on another CPU. Otherwise, one might see a NULL f_dentry or such torn down version of the file. This patch fixes those special places where such a race may happen. Signed-off-by: Dipankar Sarma Acked-by: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/locks.c | 9 +++++++-- fs/proc/base.c | 21 +++++++++++++++------ 2 files changed, 22 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index dda83d6cd48b..efad798824dc 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2230,7 +2230,12 @@ void steal_locks(fl_owner_t from) lock_kernel(); j = 0; - rcu_read_lock(); + + /* + * We are not taking a ref to the file structures, so + * we need to acquire ->file_lock.
+ */ + spin_lock(&files->file_lock); fdt = files_fdtable(files); for (;;) { unsigned long set; @@ -2248,7 +2253,7 @@ void steal_locks(fl_owner_t from) set >>= 1; } } - rcu_read_unlock(); + spin_unlock(&files->file_lock); unlock_kernel(); } EXPORT_SYMBOL(steal_locks); diff --git a/fs/proc/base.c b/fs/proc/base.c index a3a3eecef689..6cc77dc3f3ff 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -297,16 +297,20 @@ static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsm files = get_files_struct(task); if (files) { - rcu_read_lock(); + /* + * We are not taking a ref to the file structure, so we must + * hold ->file_lock. + */ + spin_lock(&files->file_lock); file = fcheck_files(files, fd); if (file) { *mnt = mntget(file->f_vfsmnt); *dentry = dget(file->f_dentry); - rcu_read_unlock(); + spin_unlock(&files->file_lock); put_files_struct(files); return 0; } - rcu_read_unlock(); + spin_unlock(&files->file_lock); put_files_struct(files); } return -ENOENT; @@ -1523,7 +1527,12 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, if (!files) goto out_unlock; inode->i_mode = S_IFLNK; - rcu_read_lock(); + + /* + * We are not taking a ref to the file structure, so we must + * hold ->file_lock. 
+ */ + spin_lock(&files->file_lock); file = fcheck_files(files, fd); if (!file) goto out_unlock2; @@ -1531,7 +1540,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, inode->i_mode |= S_IRUSR | S_IXUSR; if (file->f_mode & 2) inode->i_mode |= S_IWUSR | S_IXUSR; - rcu_read_unlock(); + spin_unlock(&files->file_lock); put_files_struct(files); inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; @@ -1541,7 +1550,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, return NULL; out_unlock2: - rcu_read_unlock(); + spin_unlock(&files->file_lock); put_files_struct(files); out_unlock: iput(inode); -- cgit v1.2.3 From 95cf959b245832ad49bb333bf88f9805244b225d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Apr 2006 13:14:06 -0400 Subject: VFS: Fix another open intent Oops If the call to nfs_intent_set_file() fails to open a file in nfs4_proc_create(), we should return an error. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 47ece1dd3c67..d86c0db7b1e8 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1218,7 +1218,7 @@ out: return status; } -static void nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, struct nfs4_state *state) +static int nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, struct nfs4_state *state) { struct file *filp; @@ -1227,8 +1227,10 @@ static void nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, st struct nfs_open_context *ctx; ctx = (struct nfs_open_context *)filp->private_data; ctx->state = state; - } else - nfs4_close_state(state, nd->intent.open.flags); + return 0; + } + nfs4_close_state(state, nd->intent.open.flags); + return PTR_ERR(filp); } struct dentry * @@ -1835,7 +1837,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, 
nfs_setattr_update_inode(state->inode, sattr); } if (status == 0 && nd != NULL && (nd->flags & LOOKUP_OPEN)) - nfs4_intent_set_file(nd, dentry, state); + status = nfs4_intent_set_file(nd, dentry, state); else nfs4_close_state(state, flags); out: -- cgit v1.2.3 From e99170ff3b799a9fd43d538932a9231fac1de9d4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 18 Apr 2006 13:21:42 -0400 Subject: NFS,SUNRPC: Fix compiler warnings if CONFIG_PROC_FS & CONFIG_SYSCTL are unset Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 8 +++----- fs/nfs/file.c | 5 ++--- 2 files changed, 5 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 0f583cb16ddb..3c72b0c07283 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -112,10 +112,9 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode */ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) { - struct dentry *dentry = iocb->ki_filp->f_dentry; - dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n", - dentry->d_name.name, (long long) pos, nr_segs); + iocb->ki_filp->f_dentry->d_name.name, + (long long) pos, nr_segs); return -EINVAL; } @@ -468,7 +467,6 @@ static const struct rpc_call_ops nfs_commit_direct_ops = { static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) { struct nfs_write_data *data = dreq->commit_data; - struct rpc_task *task = &data->task; data->inode = dreq->inode; data->cred = dreq->ctx->cred; @@ -489,7 +487,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) /* Note: task.tk_ops->rpc_release will free dreq->commit_data */ dreq->commit_data = NULL; - dprintk("NFS: %5u initiated commit call\n", task->tk_pid); + dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); lock_kernel(); rpc_execute(&data->task); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index f1df2c8d9259..fade02c15e6e 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ 
-534,10 +534,9 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl) */ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) { - struct inode * inode = filp->f_mapping->host; - dprintk("NFS: nfs_flock(f=%s/%ld, t=%x, fl=%x)\n", - inode->i_sb->s_id, inode->i_ino, + filp->f_dentry->d_inode->i_sb->s_id, + filp->f_dentry->d_inode->i_ino, fl->fl_type, fl->fl_flags); /* -- cgit v1.2.3 From ec535ce154f2eaad3d97f2f20a76a6d8bdac33e5 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 18 Apr 2006 13:21:50 -0400 Subject: NFS: make 2 functions static Signed-off-by: Adrian Bunk Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Trond Myklebust --- fs/lockd/svclock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index d2b66bad7d50..3ef739120dff 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -650,7 +650,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data) svc_wake_up(block->b_daemon); } -void nlmsvc_grant_release(void *data) +static void nlmsvc_grant_release(void *data) { struct nlm_rqst *call = data; -- cgit v1.2.3 From b9d9506d944865876e67281a4e4269d823ce5381 Mon Sep 17 00:00:00 2001 From: John Hawkes Date: Wed, 19 Apr 2006 13:06:20 -0400 Subject: NFS: nfs_show_stats; for_each_possible_cpu(), not NR_CPUS Convert a for-loop that explicitly references "NR_CPUS" into the potentially more efficient for_each_possible_cpu() construct. 
Signed-off-by: John Hawkes Signed-off-by: Andrew Morton Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 2f7656b911b6..d0b991a92327 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -700,12 +700,9 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) /* * Display superblock I/O counters */ - for (cpu = 0; cpu < NR_CPUS; cpu++) { + for_each_possible_cpu(cpu) { struct nfs_iostats *stats; - if (!cpu_possible(cpu)) - continue; - preempt_disable(); stats = per_cpu_ptr(nfss->io_stats, cpu); -- cgit v1.2.3 From 7451c4f0ee53e36fd74168af8df75b28fd04a2aa Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Wed, 19 Apr 2006 13:06:37 -0400 Subject: NFS: remove needless check in nfs_opendir() Local variable res was initialized to 0 - no check needed here. Signed-off-by: Carsten Otte Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index a23f34894167..cae74dd4c7f5 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -128,15 +128,14 @@ struct inode_operations nfs4_dir_inode_operations = { static int nfs_opendir(struct inode *inode, struct file *filp) { - int res = 0; + int res; dfprintk(VFS, "NFS: opendir(%s/%ld)\n", inode->i_sb->s_id, inode->i_ino); lock_kernel(); /* Call generic open code in order to cache credentials */ - if (!res) - res = nfs_open(inode, filp); + res = nfs_open(inode, filp); unlock_kernel(); return res; } -- cgit v1.2.3 From 82aa5d6183667aa2a5f3c61e390934b0273d2ad7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 20 Apr 2006 13:05:48 +0200 Subject: [PATCH] splice: fix smaller sized splice reads Signed-off-by: Jens Axboe --- fs/splice.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c 
index 22fac87e90b3..0559e7577a04 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -275,6 +275,15 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, error = 0; bytes = 0; for (i = 0; i < nr_pages; i++, index++) { + unsigned int this_len; + + if (!len) + break; + + /* + * this_len is the max we'll use from this page + */ + this_len = min(len, PAGE_CACHE_SIZE - loff); find_page: /* * lookup the page for this index @@ -366,11 +375,13 @@ readpage: * force quit after adding this page */ nr_pages = i; + this_len = min(this_len, loff); } } fill_it: pages[i] = page; - bytes += PAGE_CACHE_SIZE - loff; + bytes += this_len; + len -= this_len; loff = 0; } -- cgit v1.2.3 From 0bd4fa977f81c914eb8bada00284d0933825900e Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 21 Apr 2006 18:17:42 +0000 Subject: [CIFS] [CIFS] Do not take rename sem on most path based calls (during building of full path) to avoid hang rename/readdir hang Reported by Alan Tyson Signed-off-by: Steve French --- fs/cifs/dir.c | 4 ---- fs/cifs/fcntl.c | 2 -- fs/cifs/file.c | 2 -- fs/cifs/inode.c | 6 ------ fs/cifs/link.c | 6 ------ fs/cifs/readdir.c | 2 -- fs/cifs/xattr.c | 8 -------- 7 files changed, 30 deletions(-) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 1d0ca3eaaca5..3830dfeb31cf 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -139,9 +139,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, cifs_sb = CIFS_SB(inode->i_sb); pTcon = cifs_sb->tcon; - mutex_lock(&direntry->d_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex); if(full_path == NULL) { FreeXid(xid); return -ENOMEM; @@ -316,9 +314,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, cifs_sb = CIFS_SB(inode->i_sb); pTcon = cifs_sb->tcon; - mutex_lock(&direntry->d_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex); if(full_path == 
NULL) rc = -ENOMEM; else if (pTcon->ses->capabilities & CAP_UNIX) { diff --git a/fs/cifs/fcntl.c b/fs/cifs/fcntl.c index ec4dfe9bf5ef..633a93811328 100644 --- a/fs/cifs/fcntl.c +++ b/fs/cifs/fcntl.c @@ -86,9 +86,7 @@ int cifs_dir_notify(struct file * file, unsigned long arg) cifs_sb = CIFS_SB(file->f_dentry->d_sb); pTcon = cifs_sb->tcon; - mutex_lock(&file->f_dentry->d_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(file->f_dentry); - mutex_unlock(&file->f_dentry->d_sb->s_vfs_rename_mutex); if(full_path == NULL) { rc = -ENOMEM; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 5c497c529772..1476725e6051 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -203,9 +203,7 @@ int cifs_open(struct inode *inode, struct file *file) } } - mutex_lock(&inode->i_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(file->f_dentry); - mutex_unlock(&inode->i_sb->s_vfs_rename_mutex); if (full_path == NULL) { FreeXid(xid); return -ENOMEM; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 957ddd1571c6..4093764ef461 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -722,9 +722,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) cifs_sb = CIFS_SB(inode->i_sb); pTcon = cifs_sb->tcon; - mutex_lock(&inode->i_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&inode->i_sb->s_vfs_rename_mutex); if (full_path == NULL) { FreeXid(xid); return -ENOMEM; @@ -807,9 +805,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) cifs_sb = CIFS_SB(inode->i_sb); pTcon = cifs_sb->tcon; - mutex_lock(&inode->i_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&inode->i_sb->s_vfs_rename_mutex); if (full_path == NULL) { FreeXid(xid); return -ENOMEM; @@ -1141,9 +1137,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) rc = 0; } - mutex_lock(&direntry->d_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - 
mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex); if (full_path == NULL) { FreeXid(xid); return -ENOMEM; diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 9562f5bba65c..2ec99f833142 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -48,10 +48,8 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, /* No need to check for cross device links since server will do that BB note DFS case in future though (when we may have to check) */ - mutex_lock(&inode->i_sb->s_vfs_rename_mutex); fromName = build_path_from_dentry(old_file); toName = build_path_from_dentry(direntry); - mutex_unlock(&inode->i_sb->s_vfs_rename_mutex); if((fromName == NULL) || (toName == NULL)) { rc = -ENOMEM; goto cifs_hl_exit; @@ -103,9 +101,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd) xid = GetXid(); - mutex_lock(&direntry->d_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex); if (!full_path) goto out_no_free; @@ -164,9 +160,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) cifs_sb = CIFS_SB(inode->i_sb); pTcon = cifs_sb->tcon; - mutex_lock(&inode->i_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&inode->i_sb->s_vfs_rename_mutex); if(full_path == NULL) { FreeXid(xid); diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 2f6e2825571e..7b8591acc5ad 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -404,9 +404,7 @@ static int initiate_cifs_search(const int xid, struct file *file) if(pTcon == NULL) return -EINVAL; - mutex_lock(&file->f_dentry->d_sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(file->f_dentry); - mutex_unlock(&file->f_dentry->d_sb->s_vfs_rename_mutex); if(full_path == NULL) { return -ENOMEM; diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 3938444d87b2..7754d641775e 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -62,9 +62,7 @@ int cifs_removexattr(struct dentry * direntry, const 
char * ea_name) cifs_sb = CIFS_SB(sb); pTcon = cifs_sb->tcon; - mutex_lock(&sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&sb->s_vfs_rename_mutex); if(full_path == NULL) { FreeXid(xid); return -ENOMEM; @@ -116,9 +114,7 @@ int cifs_setxattr(struct dentry * direntry, const char * ea_name, cifs_sb = CIFS_SB(sb); pTcon = cifs_sb->tcon; - mutex_lock(&sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&sb->s_vfs_rename_mutex); if(full_path == NULL) { FreeXid(xid); return -ENOMEM; @@ -223,9 +219,7 @@ ssize_t cifs_getxattr(struct dentry * direntry, const char * ea_name, cifs_sb = CIFS_SB(sb); pTcon = cifs_sb->tcon; - mutex_lock(&sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&sb->s_vfs_rename_mutex); if(full_path == NULL) { FreeXid(xid); return -ENOMEM; @@ -341,9 +335,7 @@ ssize_t cifs_listxattr(struct dentry * direntry, char * data, size_t buf_size) cifs_sb = CIFS_SB(sb); pTcon = cifs_sb->tcon; - mutex_lock(&sb->s_vfs_rename_mutex); full_path = build_path_from_dentry(direntry); - mutex_unlock(&sb->s_vfs_rename_mutex); if(full_path == NULL) { FreeXid(xid); return -ENOMEM; -- cgit v1.2.3 From 296034f7de8bdf111984ce1630ac598a9c94a253 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 21 Apr 2006 18:18:37 +0000 Subject: [CIFS] Don't allow a backslash in a path component Unless Posix paths have been negotiated, the backslash, "\", is not a valid character in a path component. 
Signed-off-by: Dave Kleikamp Signed-off-by: Steve French --- fs/cifs/dir.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 3830dfeb31cf..82315edc77d7 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -436,6 +436,20 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, struct name cifs_sb = CIFS_SB(parent_dir_inode->i_sb); pTcon = cifs_sb->tcon; + /* + * Don't allow the separator character in a path component. + * The VFS will not allow "/", but "\" is allowed by posix. + */ + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)) { + int i; + for (i = 0; i < direntry->d_name.len; i++) + if (direntry->d_name.name[i] == '\\') { + cFYI(1, ("Invalid file name")); + FreeXid(xid); + return ERR_PTR(-EINVAL); + } + } + /* can not grab the rename sem here since it would deadlock in the cases (beginning of sys_rename itself) in which we already have the sb rename sem */ -- cgit v1.2.3 From 45af7a0f2ebad1304cab956e15f0b37318226fcd Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 21 Apr 2006 22:52:25 +0000 Subject: [CIFS] Use the kthread_ API instead of opencoding lots of hairy code for kernel thread creation and teardown. It does not move the cifsd thread handling to kthread due to problems found in testing with wakeup of threads blocked in the socket peek api, but the other cifs kernel threads now use kthread. Also cleanup cifs_init to properly unwind when thread creation fails. 
Signed-off-by: Christoph Hellwig Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 99 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 50 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index d4b713e5affb..c262d8874ce9 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -33,6 +33,7 @@ #include #include #include +#include #include "cifsfs.h" #include "cifspdu.h" #define DECLARE_GLOBALS_HERE @@ -75,9 +76,6 @@ unsigned int cifs_max_pending = CIFS_MAX_REQ; module_param(cifs_max_pending, int, 0); MODULE_PARM_DESC(cifs_max_pending,"Simultaneous requests to server. Default: 50 Range: 2 to 256"); -static DECLARE_COMPLETION(cifs_oplock_exited); -static DECLARE_COMPLETION(cifs_dnotify_exited); - extern mempool_t *cifs_sm_req_poolp; extern mempool_t *cifs_req_poolp; extern mempool_t *cifs_mid_poolp; @@ -841,10 +839,6 @@ static int cifs_oplock_thread(void * dummyarg) __u16 netfid; int rc; - daemonize("cifsoplockd"); - allow_signal(SIGTERM); - - oplockThread = current; do { if (try_to_freeze()) continue; @@ -900,9 +894,9 @@ static int cifs_oplock_thread(void * dummyarg) set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(1); /* yield in case q were corrupt */ } - } while(!signal_pending(current)); - oplockThread = NULL; - complete_and_exit (&cifs_oplock_exited, 0); + } while (!kthread_should_stop()); + + return 0; } static int cifs_dnotify_thread(void * dummyarg) @@ -910,10 +904,6 @@ static int cifs_dnotify_thread(void * dummyarg) struct list_head *tmp; struct cifsSesInfo *ses; - daemonize("cifsdnotifyd"); - allow_signal(SIGTERM); - - dnotifyThread = current; do { if(try_to_freeze()) continue; @@ -931,8 +921,9 @@ static int cifs_dnotify_thread(void * dummyarg) wake_up_all(&ses->server->response_q); } read_unlock(&GlobalSMBSeslock); - } while(!signal_pending(current)); - complete_and_exit (&cifs_dnotify_exited, 0); + } while (!kthread_should_stop()); + + return 0; } static int __init @@ -982,32 
+973,48 @@ init_cifs(void) } rc = cifs_init_inodecache(); - if (!rc) { - rc = cifs_init_mids(); - if (!rc) { - rc = cifs_init_request_bufs(); - if (!rc) { - rc = register_filesystem(&cifs_fs_type); - if (!rc) { - rc = (int)kernel_thread(cifs_oplock_thread, NULL, - CLONE_FS | CLONE_FILES | CLONE_VM); - if(rc > 0) { - rc = (int)kernel_thread(cifs_dnotify_thread, NULL, - CLONE_FS | CLONE_FILES | CLONE_VM); - if(rc > 0) - return 0; - else - cERROR(1,("error %d create dnotify thread", rc)); - } else { - cERROR(1,("error %d create oplock thread",rc)); - } - } - cifs_destroy_request_bufs(); - } - cifs_destroy_mids(); - } - cifs_destroy_inodecache(); + if (rc) + goto out_clean_proc; + + rc = cifs_init_mids(); + if (rc) + goto out_destroy_inodecache; + + rc = cifs_init_request_bufs(); + if (rc) + goto out_destroy_mids; + + rc = register_filesystem(&cifs_fs_type); + if (rc) + goto out_destroy_request_bufs; + + oplockThread = kthread_run(cifs_oplock_thread, NULL, "cifsoplockd"); + if (IS_ERR(oplockThread)) { + rc = PTR_ERR(oplockThread); + cERROR(1,("error %d create oplock thread", rc)); + goto out_unregister_filesystem; } + + dnotifyThread = kthread_run(cifs_dnotify_thread, NULL, "cifsdnotifyd"); + if (IS_ERR(dnotifyThread)) { + rc = PTR_ERR(dnotifyThread); + cERROR(1,("error %d create dnotify thread", rc)); + goto out_stop_oplock_thread; + } + + return 0; + + out_stop_oplock_thread: + kthread_stop(oplockThread); + out_unregister_filesystem: + unregister_filesystem(&cifs_fs_type); + out_destroy_request_bufs: + cifs_destroy_request_bufs(); + out_destroy_mids: + cifs_destroy_mids(); + out_destroy_inodecache: + cifs_destroy_inodecache(); + out_clean_proc: #ifdef CONFIG_PROC_FS cifs_proc_clean(); #endif @@ -1025,14 +1032,8 @@ exit_cifs(void) cifs_destroy_inodecache(); cifs_destroy_mids(); cifs_destroy_request_bufs(); - if(oplockThread) { - send_sig(SIGTERM, oplockThread, 1); - wait_for_completion(&cifs_oplock_exited); - } - if(dnotifyThread) { - send_sig(SIGTERM, dnotifyThread, 
1); - wait_for_completion(&cifs_dnotify_exited); - } + kthread_stop(oplockThread); + kthread_stop(dnotifyThread); } MODULE_AUTHOR("Steve French "); -- cgit v1.2.3 From 60808233f374aebba26488d06a5f25443f6763c3 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 22 Apr 2006 15:53:05 +0000 Subject: [CIFS] Readdir fixes to allow search to start at arbitrary position in directory Also includes first part of fix to compensate for servers which forget to return . and .. as well as updates to changelog and cifs readme. Signed-off-by: Steve French --- fs/cifs/CHANGES | 6 +++++- fs/cifs/README | 8 ++++++++ fs/cifs/cifssmb.c | 2 +- fs/cifs/connect.c | 5 ++++- fs/cifs/file.c | 32 ++++++++++++++++++++------------ fs/cifs/ntlmssp.c | 14 ++++++++++++++ fs/cifs/readdir.c | 43 ++++++++++++++++++++++--------------------- 7 files changed, 74 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 8a2de038882e..1a27ecb46c9a 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -1,7 +1,11 @@ Version 1.42 ------------ Fix slow oplock break when mounted to different servers at the same time and -the tids match and we try to find matching fid on wrong server. +the tids match and we try to find matching fid on wrong server. Fix read +looping when signing required by server (2.6.16 kernel only). Fix readdir +vs. rename race which could cause each to hang. Return . and .. even +if server does not. Allow searches to skip first three entries and +begin at any location. Fix oops in find_writeable_file. Version 1.41 ------------ diff --git a/fs/cifs/README b/fs/cifs/README index b2b4d0803761..0355003f4f0a 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -511,6 +511,14 @@ LinuxExtensionsEnabled If set to one then the client will attempt to support and want to map the uid and gid fields to values supplied at mount (rather than the actual values, then set this to zero. 
(default 1) +Experimental When set to 1 used to enable certain experimental + features (currently enables multipage writes + when signing is enabled, the multipage write + performance enhancement was disabled when + signing turned on in case buffer was modified + just before it was sent, also this flag will + be used to use the new experimental sessionsetup + code). These experimental features and tracing can be enabled by changing flags in /proc/fs/cifs (after the cifs module has been installed or built into the diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index d705500aa283..fd36892eda55 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -3119,7 +3119,7 @@ findFirstRetry: psrch_inf->endOfSearch = FALSE; psrch_inf->entries_in_buffer = le16_to_cpu(parms->SearchCount); - psrch_inf->index_of_last_entry = + psrch_inf->index_of_last_entry = 2 /* skip . and .. */ + psrch_inf->entries_in_buffer; *pnetfid = parms->SearchHandle; } else { diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 0b86d5ca9014..aaf151cb5822 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3447,7 +3447,10 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, pSesInfo->server->secMode, pSesInfo->server->capabilities, pSesInfo->server->timeZone)); - if (extended_security + if(experimEnabled > 1) + rc = CIFS_SessSetup(xid, pSesInfo, CIFS_NTLM /* type */, + &ntlmv2_flag, nls_info); + else if (extended_security && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) && (pSesInfo->server->secType == NTLMSSP)) { cFYI(1, ("New style sesssetup")); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 1476725e6051..e152bf6afa60 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -904,8 +904,7 @@ static ssize_t cifs_write(struct file *file, const char *write_data, if (rc != 0) break; } - /* BB FIXME We can not sign across two buffers yet */ - if((pTcon->ses->server->secMode & + if(experimEnabled || (pTcon->ses->server->secMode & (SECMODE_SIGN_REQUIRED | 
SECMODE_SIGN_ENABLED)) == 0) { struct kvec iov[2]; unsigned int len; @@ -921,13 +920,13 @@ static ssize_t cifs_write(struct file *file, const char *write_data, *poffset, &bytes_written, iov, 1, long_op); } else - /* BB FIXME fixup indentation of line below */ - rc = CIFSSMBWrite(xid, pTcon, - open_file->netfid, - min_t(const int, cifs_sb->wsize, - write_size - total_written), - *poffset, &bytes_written, - write_data + total_written, NULL, long_op); + rc = CIFSSMBWrite(xid, pTcon, + open_file->netfid, + min_t(const int, cifs_sb->wsize, + write_size - total_written), + *poffset, &bytes_written, + write_data + total_written, + NULL, long_op); } if (rc || (bytes_written == 0)) { if (total_written) @@ -966,6 +965,16 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode) struct cifsFileInfo *open_file; int rc; + /* Having a null inode here (because mapping->host was set to zero by + the VFS or MM) should not happen but we had reports of on oops (due to + it being zero) during stress testcases so we need to check for it */ + + if(cifs_inode == NULL) { + cERROR(1,("Null inode passed to cifs_writeable_file")); + dump_stack(); + return NULL; + } + read_lock(&GlobalSMBSeslock); list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { if (open_file->closePend) @@ -1091,12 +1100,11 @@ static int cifs_writepages(struct address_space *mapping, if (cifs_sb->wsize < PAGE_CACHE_SIZE) return generic_writepages(mapping, wbc); - /* BB FIXME we do not have code to sign across multiple buffers yet, - so go to older writepage style write which we can sign if needed */ if((cifs_sb->tcon->ses) && (cifs_sb->tcon->ses->server)) if(cifs_sb->tcon->ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) - return generic_writepages(mapping, wbc); + if(!experimEnabled) + return generic_writepages(mapping, wbc); /* * BB: Is this meaningful for a non-block-device file system? 
diff --git a/fs/cifs/ntlmssp.c b/fs/cifs/ntlmssp.c index 78866f925747..115359cc7a32 100644 --- a/fs/cifs/ntlmssp.c +++ b/fs/cifs/ntlmssp.c @@ -121,6 +121,20 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, const int type, } + /* copy session key */ + + /* if Unicode, align strings to two byte boundary */ + + /* copy user name */ /* BB Do we need to special case null user name? */ + + /* copy domain name */ + + /* copy Linux version */ + + /* copy network operating system name */ + + /* update bcc and smb buffer length */ + /* rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buf_type, 0); */ /* SMB request buf freed in SendReceive2 */ diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 7b8591acc5ad..41c022e3c132 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -590,6 +590,13 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon, first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry - cifsFile->srch_inf.entries_in_buffer; + + /* if first entry in buf is zero then is first buffer + in search response data which means it is likely . and .. + will be in this buffer, although some servers do not return + . and .. 
for the root of a drive and for those we need + to start two entries earlier */ + /* dump_cifs_file_struct(file, "In fce ");*/ if(((index_to_find < cifsFile->srch_inf.index_of_last_entry) && is_dir_changed(file)) || @@ -632,23 +639,14 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon, char * end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + smbCalcSize((struct smb_hdr *) cifsFile->srch_inf.ntwrk_buf_start); + + current_entry = cifsFile->srch_inf.srch_entries_start; first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry - cifsFile->srch_inf.entries_in_buffer; pos_in_buf = index_to_find - first_entry_in_buffer; cFYI(1,("found entry - pos_in_buf %d",pos_in_buf)); - current_entry = cifsFile->srch_inf.srch_entries_start; for(i=0;(i<(pos_in_buf)) && (current_entry != NULL);i++) { /* go entry by entry figuring out which is first */ - /* if( . or ..) - skip */ - rc = cifs_entry_is_dot(current_entry,cifsFile); - if(rc == 1) /* is . or .. so skip */ { - cFYI(1,("Entry is .")); /* BB removeme BB */ - /* continue; */ - } else if (rc == 2 ) { - cFYI(1,("Entry is ..")); /* BB removeme BB */ - /* continue; */ - } current_entry = nxt_dir_entry(current_entry,end_of_smb); } if((current_entry == NULL) && (i < pos_in_buf)) { @@ -768,6 +766,11 @@ static int cifs_filldir(char *pfindEntry, struct file *file, if(file->f_dentry == NULL) return -ENOENT; + rc = cifs_entry_is_dot(pfindEntry,cifsF); + /* skip . and .. 
since we added them first */ + if(rc != 0) + return 0; + cifs_sb = CIFS_SB(file->f_dentry->d_sb); qstring.name = scratch_buf; @@ -896,22 +899,22 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) switch ((int) file->f_pos) { case 0: - /*if (filldir(direntry, ".", 1, file->f_pos, + if (filldir(direntry, ".", 1, file->f_pos, file->f_dentry->d_inode->i_ino, DT_DIR) < 0) { - cERROR(1, ("Filldir for current dir failed ")); + cERROR(1, ("Filldir for current dir failed")); rc = -ENOMEM; break; } - file->f_pos++; */ + file->f_pos++; case 1: - /* if (filldir(direntry, "..", 2, file->f_pos, + if (filldir(direntry, "..", 2, file->f_pos, file->f_dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) { cERROR(1, ("Filldir for parent dir failed ")); rc = -ENOMEM; break; } - file->f_pos++; */ - case 2: + file->f_pos++; + default: /* 1) If search is active, is in current search buffer? if it before then restart search @@ -925,7 +928,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) return rc; } } - default: if(file->private_data == NULL) { rc = -EINVAL; FreeXid(xid); @@ -945,8 +947,6 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) kfree(cifsFile->search_resume_name); cifsFile->search_resume_name = NULL; */ - /* BB account for . and .. in f_pos as special case */ - rc = find_cifs_entry(xid,pTcon, file, &current_entry,&num_to_fill); if(rc) { @@ -975,7 +975,8 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) num_to_fill, i)); break; } - + /* if buggy server returns . and .. late do + we want to check for that here? */ rc = cifs_filldir(current_entry, file, filldir, direntry,tmp_buf); file->f_pos++; -- cgit v1.2.3 From b9251b823b5e921c894eb135cb6c64abf483f50e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sat, 22 Apr 2006 02:36:24 -0700 Subject: [PATCH] Fix reiserfs deadlock reiserfs_cache_default_acl() should return whether we successfully found the acl or not.
We have to return the correct value even if reiserfs_get_acl() returns an error code and not just 0. Otherwise callers such as reiserfs_mkdir() can unnecessarily lock the xattrs and later functions such as reiserfs_new_inode() fail to notice that we have already taken the lock and try to take it again with obvious consequences. Signed-off-by: Jan Kara Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/xattr_acl.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 58c418fbca2c..97ae1b92bc47 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -408,8 +408,9 @@ int reiserfs_cache_default_acl(struct inode *inode) acl = reiserfs_get_acl(inode, ACL_TYPE_DEFAULT); reiserfs_read_unlock_xattrs(inode->i_sb); reiserfs_read_unlock_xattr_i(inode); - ret = acl ? 1 : 0; - posix_acl_release(acl); + ret = (acl && !IS_ERR(acl)); + if (ret) + posix_acl_release(acl); } return ret; -- cgit v1.2.3 From b66ac3ea21f81dea02cdb4e9de66ee6afdc540e4 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 23 Apr 2006 01:54:50 +0000 Subject: [CIFS] Fix typo in previous Signed-off-by: Steve French --- fs/cifs/readdir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 41c022e3c132..b689c5035124 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -766,7 +766,7 @@ static int cifs_filldir(char *pfindEntry, struct file *file, if(file->f_dentry == NULL) return -ENOENT; - rc = cifs_entry_is_dot(pfindEntry,cifsF); + rc = cifs_entry_is_dot(pfindEntry,pCifsF); /* skip . and ..
since we added them first */ if(rc != 0) return 0; -- cgit v1.2.3 From 301dc3e6f6ea83703fa52919c00e60661da5a8fe Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 24 Apr 2006 16:24:54 +0000 Subject: [CIFS] Fix compile error when CONFIG_CIFS_EXPERIMENTAL is undefined Signed-off-by: Dave Kleikamp Signed-off-by: Steve French --- fs/cifs/connect.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index aaf151cb5822..d2ec806a4f32 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3447,10 +3447,13 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, pSesInfo->server->secMode, pSesInfo->server->capabilities, pSesInfo->server->timeZone)); +#ifdef CONFIG_CIFS_EXPERIMENTAL if(experimEnabled > 1) rc = CIFS_SessSetup(xid, pSesInfo, CIFS_NTLM /* type */, &ntlmv2_flag, nls_info); - else if (extended_security + else +#endif + if (extended_security && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) && (pSesInfo->server->secType == NTLMSSP)) { cFYI(1, ("New style sesssetup")); -- cgit v1.2.3 From ba5f5d90c45a30e4e9a1bd136acf1b3973c905c8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 25 Apr 2006 15:33:34 +0200 Subject: [PATCH] splice: fix min() warning Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/splice.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 0559e7577a04..4aa67254740f 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -283,7 +283,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, /* * this_len is the max we'll use from this page */ - this_len = min(len, PAGE_CACHE_SIZE - loff); + this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); find_page: /* * lookup the page for this index -- cgit v1.2.3 From 016b661e2f717168e600f3c85f29e1a49f88e004 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 25 Apr 2006 15:42:00 +0200 Subject: [PATCH] splice: fix offset 
problems Make the move_from_pipe() actors return number of bytes processed, then move_from_pipe() can decide more cleverly when to move on to the next buffer. This fixes problems with pipe offset and differing file offset. Signed-off-by: Jens Axboe --- fs/splice.c | 46 +++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 4aa67254740f..8c6030c762e2 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -439,14 +439,13 @@ EXPORT_SYMBOL(generic_file_splice_read); /* * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' - * using sendpage(). + * using sendpage(). Return the number of bytes sent. */ static int pipe_to_sendpage(struct pipe_inode_info *info, struct pipe_buffer *buf, struct splice_desc *sd) { struct file *file = sd->file; loff_t pos = sd->pos; - unsigned int offset; ssize_t ret; void *ptr; int more; @@ -461,16 +460,13 @@ static int pipe_to_sendpage(struct pipe_inode_info *info, if (IS_ERR(ptr)) return PTR_ERR(ptr); - offset = pos & ~PAGE_CACHE_MASK; more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; - ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,more); + ret = file->f_op->sendpage(file, buf->page, buf->offset, sd->len, + &pos, more); buf->ops->unmap(info, buf); - if (ret == sd->len) - return 0; - - return -EIO; + return ret; } /* @@ -499,7 +495,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, struct file *file = sd->file; struct address_space *mapping = file->f_mapping; gfp_t gfp_mask = mapping_gfp_mask(mapping); - unsigned int offset; + unsigned int offset, this_len; struct page *page; pgoff_t index; char *src; @@ -515,6 +511,10 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, index = sd->pos >> PAGE_CACHE_SHIFT; offset = sd->pos & ~PAGE_CACHE_MASK; + this_len = sd->len; + if (this_len + offset > PAGE_CACHE_SIZE) + this_len = PAGE_CACHE_SIZE - 
offset; + /* * Reuse buf page, if SPLICE_F_MOVE is set. */ @@ -558,7 +558,7 @@ find_page: * the full page. */ if (!PageUptodate(page)) { - if (sd->len < PAGE_CACHE_SIZE) { + if (this_len < PAGE_CACHE_SIZE) { ret = mapping->a_ops->readpage(file, page); if (unlikely(ret)) goto out; @@ -582,7 +582,7 @@ find_page: } } - ret = mapping->a_ops->prepare_write(file, page, 0, sd->len); + ret = mapping->a_ops->prepare_write(file, page, offset, offset+this_len); if (ret == AOP_TRUNCATED_PAGE) { page_cache_release(page); goto find_page; @@ -592,18 +592,22 @@ find_page: if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { char *dst = kmap_atomic(page, KM_USER0); - memcpy(dst + offset, src + buf->offset, sd->len); + memcpy(dst + offset, src + buf->offset, this_len); flush_dcache_page(page); kunmap_atomic(dst, KM_USER0); } - ret = mapping->a_ops->commit_write(file, page, 0, sd->len); + ret = mapping->a_ops->commit_write(file, page, offset, offset+this_len); if (ret == AOP_TRUNCATED_PAGE) { page_cache_release(page); goto find_page; } else if (ret) goto out; + /* + * Return the number of bytes written. 
+ */ + ret = this_len; mark_page_accessed(page); balance_dirty_pages_ratelimited(mapping); out: @@ -652,16 +656,22 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, sd.len = sd.total_len; err = actor(pipe, buf, &sd); - if (err) { + if (err <= 0) { if (!ret && err != -ENODATA) ret = err; break; } - ret += sd.len; - buf->offset += sd.len; - buf->len -= sd.len; + ret += err; + buf->offset += err; + buf->len -= err; + + sd.len -= err; + sd.pos += err; + sd.total_len -= err; + if (sd.len) + continue; if (!buf->len) { buf->ops = NULL; @@ -672,8 +682,6 @@ static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, do_wakeup = 1; } - sd.pos += sd.len; - sd.total_len -= sd.len; if (!sd.total_len) break; } -- cgit v1.2.3 From 5a5fb1ea74d8b82ca1461b885a1334fb21e037be Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 26 Apr 2006 10:48:55 +0200 Subject: Revert "[fuse] fix deadlock between fuse_put_super() and request_end()" This reverts 73ce8355c243a434524a34c05cc417dd0467996e commit. It was wrong, because it didn't take into account the requirement, that iput() for background requests must be performed synchronously with ->put_super(), otherwise active inodes may remain after unmount. The right solution is to keep the sbput_sem and perform iput() within the locked region, but move fput() outside sbput_sem. 
Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 28 ++++++++++++---------------- fs/fuse/fuse_i.h | 12 +++++++++--- fs/fuse/inode.c | 27 ++++++++++----------------- 3 files changed, 31 insertions(+), 36 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index cc750c68fe70..4967bd40b953 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -128,14 +128,20 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) } } -void fuse_remove_background(struct fuse_conn *fc, struct fuse_req *req) +void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req) { - list_del_init(&req->bg_entry); + iput(req->inode); + iput(req->inode2); + if (req->file) + fput(req->file); + spin_lock(&fc->lock); + list_del(&req->bg_entry); if (fc->num_background == FUSE_MAX_BACKGROUND) { fc->blocked = 0; wake_up_all(&fc->blocked_waitq); } fc->num_background--; + spin_unlock(&fc->lock); } /* @@ -165,27 +171,17 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) wake_up(&req->waitq); fuse_put_request(fc, req); } else { - struct inode *inode = req->inode; - struct inode *inode2 = req->inode2; - struct file *file = req->file; void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; req->end = NULL; - req->inode = NULL; - req->inode2 = NULL; - req->file = NULL; - if (!list_empty(&req->bg_entry)) - fuse_remove_background(fc, req); spin_unlock(&fc->lock); - + down_read(&fc->sbput_sem); + if (fc->mounted) + fuse_release_background(fc, req); + up_read(&fc->sbput_sem); if (end) end(fc, req); else fuse_put_request(fc, req); - - if (file) - fput(file); - iput(inode); - iput(inode2); } } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 59661c481d9d..0474202cb5dc 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -258,9 +258,15 @@ struct fuse_conn { /** waitq for blocked connection */ wait_queue_head_t blocked_waitq; + /** RW semaphore for exclusion with fuse_put_super() */ + struct rw_semaphore sbput_sem; + /** The next unique 
request id */ u64 reqctr; + /** Mount is active */ + unsigned mounted; + /** Connection established, cleared on umount, connection abort and device release */ unsigned connected; @@ -471,11 +477,11 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req); void request_send_background(struct fuse_conn *fc, struct fuse_req *req); /** - * Remove request from the the background list + * Release inodes and file associated with background request */ -void fuse_remove_background(struct fuse_conn *fc, struct fuse_req *req); +void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req); -/** Abort all requests */ +/* Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc); /** diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 43a6fc0db8a7..fd34037b0588 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -204,26 +204,17 @@ static void fuse_put_super(struct super_block *sb) { struct fuse_conn *fc = get_fuse_conn_super(sb); + down_write(&fc->sbput_sem); + while (!list_empty(&fc->background)) + fuse_release_background(fc, + list_entry(fc->background.next, + struct fuse_req, bg_entry)); + spin_lock(&fc->lock); + fc->mounted = 0; fc->connected = 0; - while (!list_empty(&fc->background)) { - struct fuse_req *req = list_entry(fc->background.next, - struct fuse_req, bg_entry); - struct inode *inode = req->inode; - struct inode *inode2 = req->inode2; - - /* File would hold a reference to vfsmount */ - BUG_ON(req->file); - req->inode = NULL; - req->inode2 = NULL; - fuse_remove_background(fc, req); - - spin_unlock(&fc->lock); - iput(inode); - iput(inode2); - spin_lock(&fc->lock); - } spin_unlock(&fc->lock); + up_write(&fc->sbput_sem); /* Flush all readers on this fs */ kill_fasync(&fc->fasync, SIGIO, POLL_IN); wake_up_all(&fc->waitq); @@ -395,6 +386,7 @@ static struct fuse_conn *new_conn(void) INIT_LIST_HEAD(&fc->processing); INIT_LIST_HEAD(&fc->io); INIT_LIST_HEAD(&fc->background); + init_rwsem(&fc->sbput_sem); kobj_set_kset_s(fc, 
connections_subsys); kobject_init(&fc->kobj); atomic_set(&fc->num_waiting, 0); @@ -549,6 +541,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) goto err_free_req; sb->s_root = root_dentry; + fc->mounted = 1; fc->connected = 1; kobject_get(&fc->kobj); file->private_data = fc; -- cgit v1.2.3 From 6dbbcb120570d747b00783820ee02d1e1bcf63de Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 26 Apr 2006 10:49:06 +0200 Subject: [fuse] fix deadlock between fuse_put_super() and request_end(), try #2 A deadlock was possible, when the last reference to the superblock was held due to a background request containing a file reference. Releasing the file would release the vfsmount which in turn would release the superblock. Since sbput_sem is held during the fput() and fuse_put_super() tries to acquire this same semaphore, a deadlock results. The solution is to move the fput() outside the region protected by sbput_sem. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 4967bd40b953..104a62dadb94 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -128,12 +128,16 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) } } +/* + * Called with sbput_sem held for read (request_end) or write + * (fuse_put_super). By the time fuse_put_super() is finished, all + * inodes belonging to background requests must be released, so the + * iputs have to be done within the locked region. 
+ */ void fuse_release_background(struct fuse_conn *fc, struct fuse_req *req) { iput(req->inode); iput(req->inode2); - if (req->file) - fput(req->file); spin_lock(&fc->lock); list_del(&req->bg_entry); if (fc->num_background == FUSE_MAX_BACKGROUND) { @@ -178,6 +182,11 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) if (fc->mounted) fuse_release_background(fc, req); up_read(&fc->sbput_sem); + + /* fput must go outside sbput_sem, otherwise it can deadlock */ + if (req->file) + fput(req->file); + if (end) end(fc, req); else -- cgit v1.2.3 From 8aa09a50b5d9dbdf627f79e19d72d82994348089 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 26 Apr 2006 10:49:16 +0200 Subject: [fuse] fix race between checking and setting file->private_data BKL does not protect against races if the task may sleep between checking and setting a value. So move checking of file->private_data near to setting it in fuse_fill_super(). Found by Al Viro. Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index fd34037b0588..7627022446b2 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -500,11 +500,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (file->f_op != &fuse_dev_operations) return -EINVAL; - /* Setting file->private_data can't race with other mount() - instances, since BKL is held for ->get_sb() */ - if (file->private_data) - return -EINVAL; - fc = new_conn(); if (!fc) return -ENOMEM; @@ -540,6 +535,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (err) goto err_free_req; + /* Setting file->private_data can't race with other mount() + instances, since BKL is held for ->get_sb() */ + err = -EINVAL; + if (file->private_data) + goto err_kobject_del; + sb->s_root = root_dentry; fc->mounted = 1; fc->connected = 1; @@ -556,6 +557,8 @@ static int fuse_fill_super(struct 
super_block *sb, void *data, int silent) return 0; + err_kobject_del: + kobject_del(&fc->kobj); err_free_req: fuse_request_free(init_req); err_put_root: -- cgit v1.2.3 From 912d35f86781e64d73be1ef358f703c08905ac37 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 26 Apr 2006 10:59:21 +0200 Subject: [PATCH] Add support for the sys_vmsplice syscall sys_splice() moves data to/from pipes with a file input/output. sys_vmsplice() moves data to a pipe, with the input being a user address range instead. This uses an approach suggested by Linus, where we can hold partial ranges inside the pages[] map. Hopefully this will be useful for network receive support as well. Signed-off-by: Jens Axboe --- fs/splice.c | 292 ++++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 253 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 8c6030c762e2..0b2c1f060cae 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -27,6 +27,7 @@ #include #include #include +#include /* * Passed to the actors @@ -38,6 +39,22 @@ struct splice_desc { loff_t pos; /* file position */ }; +struct partial_page { + unsigned int offset; + unsigned int len; +}; + +/* + * Passed to move_to_pipe + */ +struct splice_pipe_desc { + struct page **pages; /* page map */ + struct partial_page *partial; /* pages[] may not be contig */ + int nr_pages; /* number of pages in map */ + unsigned int flags; /* splice flags */ + struct pipe_buf_operations *ops;/* ops associated with output pipe */ +}; + /* * Attempt to steal a page from a pipe buffer. 
This should perhaps go into * a vm helper function, it's already simplified quite a bit by the @@ -128,6 +145,19 @@ static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info, kunmap(buf->page); } +static void *user_page_pipe_buf_map(struct file *file, + struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return kmap(buf->page); +} + +static void user_page_pipe_buf_unmap(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + kunmap(buf->page); +} + static void page_cache_pipe_buf_get(struct pipe_inode_info *info, struct pipe_buffer *buf) { @@ -143,19 +173,33 @@ static struct pipe_buf_operations page_cache_pipe_buf_ops = { .get = page_cache_pipe_buf_get, }; +static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return 1; +} + +static struct pipe_buf_operations user_page_pipe_buf_ops = { + .can_merge = 0, + .map = user_page_pipe_buf_map, + .unmap = user_page_pipe_buf_unmap, + .release = page_cache_pipe_buf_release, + .steal = user_page_pipe_buf_steal, + .get = page_cache_pipe_buf_get, +}; + /* * Pipe output worker. This sets up our pipe format with the page cache * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). 
*/ -static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, - int nr_pages, unsigned long len, - unsigned int offset, unsigned int flags) +static ssize_t move_to_pipe(struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) { - int ret, do_wakeup, i; + int ret, do_wakeup, page_nr; ret = 0; do_wakeup = 0; - i = 0; + page_nr = 0; if (pipe->inode) mutex_lock(&pipe->inode->i_mutex); @@ -171,27 +215,19 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, if (pipe->nrbufs < PIPE_BUFFERS) { int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); struct pipe_buffer *buf = pipe->bufs + newbuf; - struct page *page = pages[i++]; - unsigned long this_len; - this_len = PAGE_CACHE_SIZE - offset; - if (this_len > len) - this_len = len; - - buf->page = page; - buf->offset = offset; - buf->len = this_len; - buf->ops = &page_cache_pipe_buf_ops; + buf->page = spd->pages[page_nr]; + buf->offset = spd->partial[page_nr].offset; + buf->len = spd->partial[page_nr].len; + buf->ops = spd->ops; pipe->nrbufs++; + page_nr++; + ret += buf->len; + if (pipe->inode) do_wakeup = 1; - ret += this_len; - len -= this_len; - offset = 0; - if (!--nr_pages) - break; - if (!len) + if (!--spd->nr_pages) break; if (pipe->nrbufs < PIPE_BUFFERS) continue; @@ -199,7 +235,7 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, break; } - if (flags & SPLICE_F_NONBLOCK) { + if (spd->flags & SPLICE_F_NONBLOCK) { if (!ret) ret = -EAGAIN; break; @@ -234,8 +270,8 @@ static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } - while (i < nr_pages) - page_cache_release(pages[i++]); + while (page_nr < spd->nr_pages) + page_cache_release(spd->pages[page_nr++]); return ret; } @@ -246,17 +282,24 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, unsigned int flags) { struct address_space *mapping = in->f_mapping; - unsigned int loff, offset, 
nr_pages; + unsigned int loff, nr_pages; struct page *pages[PIPE_BUFFERS]; + struct partial_page partial[PIPE_BUFFERS]; struct page *page; pgoff_t index, end_index; loff_t isize; - size_t bytes; - int i, error; + size_t total_len; + int error; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &page_cache_pipe_buf_ops, + }; index = *ppos >> PAGE_CACHE_SHIFT; - loff = offset = *ppos & ~PAGE_CACHE_MASK; - nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + loff = *ppos & ~PAGE_CACHE_MASK; + nr_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (nr_pages > PIPE_BUFFERS) nr_pages = PIPE_BUFFERS; @@ -266,15 +309,15 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, * read-ahead if this is a non-zero offset (we are likely doing small * chunk splice and the page is already there) for a single page. */ - if (!offset || nr_pages > 1) - do_page_cache_readahead(mapping, in, index, nr_pages); + if (!loff || spd.nr_pages > 1) + do_page_cache_readahead(mapping, in, index, spd.nr_pages); /* * Now fill in the holes: */ error = 0; - bytes = 0; - for (i = 0; i < nr_pages; i++, index++) { + total_len = 0; + for (spd.nr_pages = 0; spd.nr_pages < nr_pages; spd.nr_pages++, index++) { unsigned int this_len; if (!len) @@ -367,26 +410,29 @@ readpage: */ if (end_index == index) { loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK); - if (bytes + loff > isize) { + if (total_len + loff > isize) { page_cache_release(page); break; } /* * force quit after adding this page */ - nr_pages = i; + nr_pages = spd.nr_pages; this_len = min(this_len, loff); + loff = 0; } } fill_it: - pages[i] = page; - bytes += this_len; + pages[spd.nr_pages] = page; + partial[spd.nr_pages].offset = loff; + partial[spd.nr_pages].len = this_len; len -= this_len; + total_len += this_len; loff = 0; } - if (i) - return move_to_pipe(pipe, pages, i, bytes, offset, flags); + if (spd.nr_pages) + return move_to_pipe(pipe, &spd); return 
error; } @@ -1018,6 +1064,174 @@ static long do_splice(struct file *in, loff_t __user *off_in, return -EINVAL; } +/* + * Map an iov into an array of pages and offset/length tupples. With the + * partial_page structure, we can map several non-contiguous ranges into + * our ones pages[] map instead of splitting that operation into pieces. + * Could easily be exported as a generic helper for other users, in which + * case one would probably want to add a 'max_nr_pages' parameter as well. + */ +static int get_iovec_page_array(const struct iovec __user *iov, + unsigned int nr_vecs, struct page **pages, + struct partial_page *partial) +{ + int buffers = 0, error = 0; + + /* + * It's ok to take the mmap_sem for reading, even + * across a "get_user()". + */ + down_read(&current->mm->mmap_sem); + + while (nr_vecs) { + unsigned long off, npages; + void __user *base; + size_t len; + int i; + + /* + * Get user address base and length for this iovec. + */ + error = get_user(base, &iov->iov_base); + if (unlikely(error)) + break; + error = get_user(len, &iov->iov_len); + if (unlikely(error)) + break; + + /* + * Sanity check this iovec. 0 read succeeds. + */ + if (unlikely(!len)) + break; + error = -EFAULT; + if (unlikely(!base)) + break; + + /* + * Get this base offset and number of pages, then map + * in the user pages. + */ + off = (unsigned long) base & ~PAGE_MASK; + npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (npages > PIPE_BUFFERS - buffers) + npages = PIPE_BUFFERS - buffers; + + error = get_user_pages(current, current->mm, + (unsigned long) base, npages, 0, 0, + &pages[buffers], NULL); + + if (unlikely(error <= 0)) + break; + + /* + * Fill this contiguous range into the partial page map.
+ */ + for (i = 0; i < error; i++) { + const int plen = min_t(size_t, len, PAGE_SIZE) - off; + + partial[buffers].offset = off; + partial[buffers].len = plen; + + off = 0; + len -= plen; + buffers++; + } + + /* + * We didn't complete this iov, stop here since it probably + * means we have to move some of this into a pipe to + * be able to continue. + */ + if (len) + break; + + /* + * Don't continue if we mapped fewer pages than we asked for, + * or if we mapped the max number of pages that we have + * room for. + */ + if (error < npages || buffers == PIPE_BUFFERS) + break; + + nr_vecs--; + iov++; + } + + up_read(&current->mm->mmap_sem); + + if (buffers) + return buffers; + + return error; +} + +/* + * vmsplice splices a user address range into a pipe. It can be thought of + * as splice-from-memory, where the regular splice is splice-from-file (or + * to file). In both cases the output is a pipe, naturally. + * + * Note that vmsplice only supports splicing _from_ user memory to a pipe, + * not the other way around. Splicing from user memory is a simple operation + * that can be supported without any funky alignment restrictions or nasty + * vm tricks. We simply map in the user memory and fill them into a pipe. + * The reverse isn't quite as easy, though. There are two possible solutions + * for that: + * + * - memcpy() the data internally, at which point we might as well just + * do a regular read() on the buffer anyway. + * - Lots of nasty vm tricks, that are neither fast nor flexible (it + * has restriction limitations on both ends of the pipe). + * + * Alas, it isn't here.
+ * + */ +static long do_vmsplice(struct file *file, const struct iovec __user *iov, + unsigned long nr_segs, unsigned int flags) +{ + struct pipe_inode_info *pipe = file->f_dentry->d_inode->i_pipe; + struct page *pages[PIPE_BUFFERS]; + struct partial_page partial[PIPE_BUFFERS]; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &user_page_pipe_buf_ops, + }; + + if (unlikely(!pipe)) + return -EBADF; + if (unlikely(nr_segs > UIO_MAXIOV)) + return -EINVAL; + else if (unlikely(!nr_segs)) + return 0; + + spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial); + if (spd.nr_pages <= 0) + return spd.nr_pages; + + return move_to_pipe(pipe, &spd); +} + +asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, + unsigned long nr_segs, unsigned int flags) +{ + struct file *file; + long error; + int fput; + + error = -EBADF; + file = fget_light(fd, &fput); + if (file) { + if (file->f_mode & FMODE_WRITE) + error = do_vmsplice(file, iov, nr_segs, flags); + + fput_light(file, fput); + } + + return error; +} + asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, int fd_out, loff_t __user *off_out, size_t len, unsigned int flags) -- cgit v1.2.3 From 00522fb41a2a9bf0f98a007c0e2b516a3873148c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 26 Apr 2006 14:39:29 +0200 Subject: [PATCH] splice: rearrange moving to/from pipe helpers We need these for people writing their own ->splice_read/write hooks. 
Signed-off-by: Jens Axboe --- fs/splice.c | 35 +++++++++++------------------------ 1 file changed, 11 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 0b2c1f060cae..447ebc0a37f3 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -29,23 +29,13 @@ #include #include -/* - * Passed to the actors - */ -struct splice_desc { - unsigned int len, total_len; /* current and remaining length */ - unsigned int flags; /* splice flags */ - struct file *file; /* file to read/write */ - loff_t pos; /* file position */ -}; - struct partial_page { unsigned int offset; unsigned int len; }; /* - * Passed to move_to_pipe + * Passed to splice_to_pipe */ struct splice_pipe_desc { struct page **pages; /* page map */ @@ -192,8 +182,8 @@ static struct pipe_buf_operations user_page_pipe_buf_ops = { * Pipe output worker. This sets up our pipe format with the page cache * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). */ -static ssize_t move_to_pipe(struct pipe_inode_info *pipe, - struct splice_pipe_desc *spd) +static ssize_t splice_to_pipe(struct pipe_inode_info *pipe, + struct splice_pipe_desc *spd) { int ret, do_wakeup, page_nr; @@ -432,7 +422,7 @@ fill_it: } if (spd.nr_pages) - return move_to_pipe(pipe, &spd); + return splice_to_pipe(pipe, &spd); return error; } @@ -666,17 +656,14 @@ out_nomem: return ret; } -typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, - struct splice_desc *); - /* * Pipe input worker. Most of this logic works like a regular pipe, the * key here is the 'actor' worker passed in that actually moves the data * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. 
*/ -static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags, - splice_actor *actor) +ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags, + splice_actor *actor) { int ret, do_wakeup, err; struct splice_desc sd; @@ -795,7 +782,7 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, struct address_space *mapping = out->f_mapping; ssize_t ret; - ret = move_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); + ret = splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); if (ret > 0) { struct inode *inode = mapping->host; @@ -837,7 +824,7 @@ EXPORT_SYMBOL(generic_file_splice_write); ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { - return move_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); + return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); } EXPORT_SYMBOL(generic_splice_sendpage); @@ -924,7 +911,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, /* * We don't have an immediate reader, but we'll read the stuff - * out of the pipe right after the move_to_pipe(). So set + * out of the pipe right after the splice_to_pipe(). So set * PIPE_READERS appropriately. */ pipe->readers = 1; @@ -1210,7 +1197,7 @@ static long do_vmsplice(struct file *file, const struct iovec __user *iov, if (spd.nr_pages <= 0) return spd.nr_pages; - return move_to_pipe(pipe, &spd); + return splice_to_pipe(pipe, &spd); } asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov, -- cgit v1.2.3 From de0bb97aff6743f71abb8ec581238e2bdae9cdd1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 26 Apr 2006 07:26:09 +0100 Subject: [PATCH] forgotten ->b_data in memcpy() call in ext3/resize.c (oopsable) sbi->s_group_desc is an array of pointers to buffer_head. 
memcpy() of buffer size from address of buffer_head is a bad idea - it will generate junk in any case, may oops if buffer_head is close to the end of slab page and next page is not mapped and isn't what was intended there. IOW, ->b_data is missing in that call. Fortunately, result doesn't go into the primary on-disk data structures, so only backup ones get crap written to them; that had allowed this bug to remain unnoticed until now. Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- fs/ext3/resize.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index c5ffa8523968..8aac5334680d 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -213,7 +213,7 @@ static int setup_new_group_blocks(struct super_block *sb, goto exit_bh; } lock_buffer(bh); - memcpy(gdb->b_data, sbi->s_group_desc[i], bh->b_size); + memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, bh->b_size); set_buffer_uptodate(gdb); unlock_buffer(bh); ext3_journal_dirty_metadata(handle, gdb); -- cgit v1.2.3 From a090d9132c1e53e3517111123680c15afb25c0a4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 26 Apr 2006 07:32:40 +0100 Subject: [PATCH] protect ext3 ioctl modifying append_only, immutable, etc. with i_mutex All modifications of ->i_flags in inodes that might be visible to somebody else must be under ->i_mutex. That patch fixes ext3 ioctl() setting S_APPEND and friends. 
Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- fs/ext3/ioctl.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index aaf1da17b6d4..8c22aa9a7fbb 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -48,6 +48,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, if (!S_ISDIR(inode->i_mode)) flags &= ~EXT3_DIRSYNC_FL; + mutex_lock(&inode->i_mutex); oldflags = ei->i_flags; /* The JOURNAL_DATA flag is modifiable only by root */ @@ -60,8 +61,10 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, * This test looks nicer. Thanks to Pauline Middelink */ if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) + if (!capable(CAP_LINUX_IMMUTABLE)) { + mutex_unlock(&inode->i_mutex); return -EPERM; + } } /* @@ -69,14 +72,18 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, * the relevant capability. 
*/ if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { - if (!capable(CAP_SYS_RESOURCE)) + if (!capable(CAP_SYS_RESOURCE)) { + mutex_unlock(&inode->i_mutex); return -EPERM; + } } handle = ext3_journal_start(inode, 1); - if (IS_ERR(handle)) + if (IS_ERR(handle)) { + mutex_unlock(&inode->i_mutex); return PTR_ERR(handle); + } if (IS_SYNC(inode)) handle->h_sync = 1; err = ext3_reserve_inode_write(handle, inode, &iloc); @@ -93,11 +100,14 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, err = ext3_mark_iloc_dirty(handle, inode, &iloc); flags_err: ext3_journal_stop(handle); - if (err) + if (err) { + mutex_unlock(&inode->i_mutex); return err; + } if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) err = ext3_change_inode_journal_flag(inode, jflag); + mutex_unlock(&inode->i_mutex); return err; } case EXT3_IOC_GETVERSION: -- cgit v1.2.3 From e7edf9cdeddc0cff125e8e658216efb2ff2b2219 Mon Sep 17 00:00:00 2001 From: James Morris Date: Wed, 26 Apr 2006 02:45:03 -0400 Subject: [PATCH] LSM: add missing hook to do_compat_readv_writev() This patch addresses a flaw in LSM, where there is no mediation of readv() and writev() for 32-bit compatible apps using a 64-bit kernel. This bug was discovered and fixed initially in the native readv/writev code [1], but was not fixed in the compat code. Thanks to Al for spotting this one. [1] http://lwn.net/Articles/154282/ Signed-off-by: James Morris Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- fs/compat.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index 7f8e26ea427c..2e32bd340474 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1217,6 +1217,10 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, if (ret < 0) goto out; + ret = security_file_permission(file, type == READ ?
MAY_READ:MAY_WRITE); + if (ret) + goto out; + fnv = NULL; if (type == READ) { fn = file->f_op->read; -- cgit v1.2.3 From eb645a24de82496434cc81171d7f350edb327399 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 27 Apr 2006 08:44:27 +0200 Subject: [PATCH] splice: switch to using page_cache_readahead() Avoids doing useless work, when the file is fully cached. Signed-off-by: Jens Axboe --- fs/splice.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index 447ebc0a37f3..dc205f643090 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -299,8 +299,8 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, * read-ahead if this is a non-zero offset (we are likely doing small * chunk splice and the page is already there) for a single page. */ - if (!loff || spd.nr_pages > 1) - do_page_cache_readahead(mapping, in, index, spd.nr_pages); + if (!loff || nr_pages > 1) + page_cache_readahead(mapping, &in->f_ra, in, index, nr_pages); /* * Now fill in the holes: -- cgit v1.2.3 From eb20796bf6fdb95ccf51440ba2a827149bdc037f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 27 Apr 2006 11:05:22 +0200 Subject: [PATCH] splice: make the read-side do batched page lookups Use the new find_get_pages_contig() to potentially look up the entire splice range in one single call. This speeds up generic_file_splice_read() quite a bit. 
Signed-off-by: Jens Axboe --- fs/splice.c | 95 ++++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 65 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/splice.c b/fs/splice.c index dc205f643090..a46ddd28561e 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -279,7 +279,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, pgoff_t index, end_index; loff_t isize; size_t total_len; - int error; + int error, page_nr; struct splice_pipe_desc spd = { .pages = pages, .partial = partial, @@ -307,39 +307,67 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, */ error = 0; total_len = 0; - for (spd.nr_pages = 0; spd.nr_pages < nr_pages; spd.nr_pages++, index++) { - unsigned int this_len; - if (!len) - break; + /* + * Lookup the (hopefully) full range of pages we need. + */ + spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages); + /* + * If find_get_pages_contig() returned fewer pages than we needed, + * allocate the rest. + */ + index += spd.nr_pages; + while (spd.nr_pages < nr_pages) { /* - * this_len is the max we'll use from this page - */ - this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); -find_page: - /* - * lookup the page for this index + * Page could be there, find_get_pages_contig() breaks on + * the first hole. */ page = find_get_page(mapping, index); if (!page) { /* - * page didn't exist, allocate one + * page didn't exist, allocate one. */ page = page_cache_alloc_cold(mapping); if (!page) break; error = add_to_page_cache_lru(page, mapping, index, - mapping_gfp_mask(mapping)); + mapping_gfp_mask(mapping)); if (unlikely(error)) { page_cache_release(page); break; } - - goto readpage; + /* + * add_to_page_cache() locks the page, unlock it + * to avoid convoluting the logic below even more. + */ + unlock_page(page); } + pages[spd.nr_pages++] = page; + index++; + } + + /* + * Now loop over the map and see if we need to start IO on any + * pages, fill in the partial map, etc. 
+ */ + index = *ppos >> PAGE_CACHE_SHIFT; + nr_pages = spd.nr_pages; + spd.nr_pages = 0; + for (page_nr = 0; page_nr < nr_pages; page_nr++) { + unsigned int this_len; + + if (!len) + break; + + /* + * this_len is the max we'll use from this page + */ + this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); + page = pages[page_nr]; + /* * If the page isn't uptodate, we may need to start io on it */ @@ -360,7 +388,6 @@ find_page: */ if (!page->mapping) { unlock_page(page); - page_cache_release(page); break; } /* @@ -371,16 +398,20 @@ find_page: goto fill_it; } -readpage: /* * need to read in the page */ error = mapping->a_ops->readpage(in, page); - if (unlikely(error)) { - page_cache_release(page); + /* + * We really should re-lookup the page here, + * but it complicates things a lot. Instead + * lets just do what we already stored, and + * we'll get it the next time we are called. + */ if (error == AOP_TRUNCATED_PAGE) - goto find_page; + error = 0; + break; } @@ -389,10 +420,8 @@ readpage: */ isize = i_size_read(mapping->host); end_index = (isize - 1) >> PAGE_CACHE_SHIFT; - if (unlikely(!isize || index > end_index)) { - page_cache_release(page); + if (unlikely(!isize || index > end_index)) break; - } /* * if this is the last page, see if we need to shrink @@ -400,27 +429,33 @@ readpage: */ if (end_index == index) { loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK); - if (total_len + loff > isize) { - page_cache_release(page); + if (total_len + loff > isize) break; - } /* * force quit after adding this page */ - nr_pages = spd.nr_pages; + len = this_len; this_len = min(this_len, loff); loff = 0; } } fill_it: - pages[spd.nr_pages] = page; - partial[spd.nr_pages].offset = loff; - partial[spd.nr_pages].len = this_len; + partial[page_nr].offset = loff; + partial[page_nr].len = this_len; len -= this_len; total_len += this_len; loff = 0; + spd.nr_pages++; + index++; } + /* + * Release any pages at the end, if we quit early. 
'page_nr' is how far + * we got, 'nr_pages' is how many pages are in the map. + */ + while (page_nr < nr_pages) + page_cache_release(pages[page_nr++]); + if (spd.nr_pages) return splice_to_pipe(pipe, &spd); -- cgit v1.2.3