From 95989c46d2a156365867b1d795fdefce71bce378 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 8 Jun 2017 08:23:07 -0700 Subject: xfs: fix spurious spin_is_locked() assert failures on non-smp kernels The 0-day kernel test robot reports assertion failures on !CONFIG_SMP kernels due to failed spin_is_locked() checks. As it turns out, spin_is_locked() is hardcoded to return zero on !CONFIG_SMP kernels and so this function cannot be relied on to verify spinlock state in this configuration. To avoid this problem, replace the associated asserts with lockdep variants that do the right thing regardless of kernel configuration. Drop the one assert that checks for an unlocked lock as there is no suitable lockdep variant for that case. This moves the spinlock checks from XFS debug code to lockdep, but generally provides the same level of protection. Reported-by: kbuild test robot Signed-off-by: Brian Foster Reviewed-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_buf.c | 2 +- fs/xfs/xfs_icache.c | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 07b77b73b024..16d6a578fc16 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -117,7 +117,7 @@ static inline void __xfs_buf_ioacct_dec( struct xfs_buf *bp) { - ASSERT(spin_is_locked(&bp->b_lock)); + lockdep_assert_held(&bp->b_lock); if (bp->b_state & XFS_BSTATE_IN_FLIGHT) { bp->b_state &= ~XFS_BSTATE_IN_FLIGHT; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index f61c84f8e31a..990210fcb9c3 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -66,7 +66,6 @@ xfs_inode_alloc( XFS_STATS_INC(mp, vn_active); ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(!xfs_isiflocked(ip)); ASSERT(ip->i_ino == 0); @@ -190,7 +189,7 @@ xfs_perag_set_reclaim_tag( { struct xfs_mount *mp = pag->pag_mount; - ASSERT(spin_is_locked(&pag->pag_ici_lock)); + lockdep_assert_held(&pag->pag_ici_lock); if (pag->pag_ici_reclaimable++) return; @@ -212,7 +211,7 @@ xfs_perag_clear_reclaim_tag( { struct xfs_mount *mp = pag->pag_mount; - ASSERT(spin_is_locked(&pag->pag_ici_lock)); + lockdep_assert_held(&pag->pag_ici_lock); if (--pag->pag_ici_reclaimable) return; -- cgit v1.2.3 From ba80aa909c99802c428682c352b0ee0baac0acd3 Mon Sep 17 00:00:00 2001 From: Nicholas Bellinger Date: Thu, 8 Jun 2017 04:51:54 +0000 Subject: configfs: Fix race between create_link and configfs_rmdir This patch closes a long standing race in configfs between the creation of a new symlink in create_link(), while the symlink target's config_item is being concurrently removed via configfs_rmdir(). This can happen because the symlink target's reference is obtained by config_item_get() in create_link() before the CONFIGFS_USET_DROPPING bit set by configfs_detach_prep() during configfs_rmdir() shutdown is actually checked.. 
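Schematically (an editor's illustration, not part of the original changelog), the losing interleaving looks like this:

    /*
     *  create_link()                            configfs_rmdir()
     *  -------------                            ----------------
     *  sl->sl_target = config_item_get(item);
     *                                           configfs_detach_prep()
     *                                             -> sets CONFIGFS_USET_DROPPING
     *                                           proceeds to tear the item down
     *  spin_lock(&configfs_dirent_lock);
     *  check CONFIGFS_USET_DROPPING             <- too late: a reference was
     *                                              already taken on a dying item
     */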
This originally manifested itself on ppc64 on v4.8.y under heavy load using ibmvscsi target ports with Novalink API: [ 7877.289863] rpadlpar_io: slot U8247.22L.212A91A-V1-C8 added [ 7879.893760] ------------[ cut here ]------------ [ 7879.893768] WARNING: CPU: 15 PID: 17585 at ./include/linux/kref.h:46 config_item_get+0x7c/0x90 [configfs] [ 7879.893811] CPU: 15 PID: 17585 Comm: targetcli Tainted: G O 4.8.17-customv2.22 #12 [ 7879.893812] task: c00000018a0d3400 task.stack: c0000001f3b40000 [ 7879.893813] NIP: d000000002c664ec LR: d000000002c60980 CTR: c000000000b70870 [ 7879.893814] REGS: c0000001f3b43810 TRAP: 0700 Tainted: G O (4.8.17-customv2.22) [ 7879.893815] MSR: 8000000000029033 CR: 28222242 XER: 00000000 [ 7879.893820] CFAR: d000000002c664bc SOFTE: 1 GPR00: d000000002c60980 c0000001f3b43a90 d000000002c70908 c0000000fbc06820 GPR04: c0000001ef1bd900 0000000000000004 0000000000000001 0000000000000000 GPR08: 0000000000000000 0000000000000001 d000000002c69560 d000000002c66d80 GPR12: c000000000b70870 c00000000e798700 c0000001f3b43ca0 c0000001d4949d40 GPR16: c00000014637e1c0 0000000000000000 0000000000000000 c0000000f2392940 GPR20: c0000001f3b43b98 0000000000000041 0000000000600000 0000000000000000 GPR24: fffffffffffff000 0000000000000000 d000000002c60be0 c0000001f1dac490 GPR28: 0000000000000004 0000000000000000 c0000001ef1bd900 c0000000f2392940 [ 7879.893839] NIP [d000000002c664ec] config_item_get+0x7c/0x90 [configfs] [ 7879.893841] LR [d000000002c60980] check_perm+0x80/0x2e0 [configfs] [ 7879.893842] Call Trace: [ 7879.893844] [c0000001f3b43ac0] [d000000002c60980] check_perm+0x80/0x2e0 [configfs] [ 7879.893847] [c0000001f3b43b10] [c000000000329770] do_dentry_open+0x2c0/0x460 [ 7879.893849] [c0000001f3b43b70] [c000000000344480] path_openat+0x210/0x1490 [ 7879.893851] [c0000001f3b43c80] [c00000000034708c] do_filp_open+0xfc/0x170 [ 7879.893853] [c0000001f3b43db0] [c00000000032b5bc] do_sys_open+0x1cc/0x390 [ 7879.893856] [c0000001f3b43e30] [c000000000009584] system_call+0x38/0xec [ 7879.893856] Instruction dump: [ 7879.893858] 409d0014 38210030 e8010010 7c0803a6 4e800020 3d220000 e94981e0 892a0000 [ 7879.893861] 2f890000 409effe0 39200001 992a0000 <0fe00000> 4bffffd0 60000000 60000000 [ 7879.893866] ---[ end trace 14078f0b3b5ad0aa ]--- To close this race, go ahead and obtain the symlink's target config_item reference only after the existing CONFIGFS_USET_DROPPING check succeeds. This way, if configfs_rmdir() wins create_link() will return -ENONET, and if create_link() wins configfs_rmdir() will return -EBUSY. Reported-by: Bryant G. Ly Tested-by: Bryant G. 
Ly Signed-off-by: Nicholas Bellinger Signed-off-by: Christoph Hellwig Cc: stable@vger.kernel.org --- fs/configfs/symlink.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index a6ab012a2c6a..c8aabba502f6 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -83,14 +83,13 @@ static int create_link(struct config_item *parent_item, ret = -ENOMEM; sl = kmalloc(sizeof(struct configfs_symlink), GFP_KERNEL); if (sl) { - sl->sl_target = config_item_get(item); spin_lock(&configfs_dirent_lock); if (target_sd->s_type & CONFIGFS_USET_DROPPING) { spin_unlock(&configfs_dirent_lock); - config_item_put(item); kfree(sl); return -ENOENT; } + sl->sl_target = config_item_get(item); list_add(&sl->sl_list, &target_sd->s_links); spin_unlock(&configfs_dirent_lock); ret = configfs_create_link(sl, parent_item->ci_dentry, -- cgit v1.2.3 From 19e72d3abb63cb16d021a4066ce1a18880509e99 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Feb 2017 17:28:50 -0800 Subject: configfs: Introduce config_item_get_unless_zero() Signed-off-by: Bart Van Assche [hch: minor style tweak] Signed-off-by: Christoph Hellwig --- fs/configfs/item.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/configfs/item.c b/fs/configfs/item.c index 8b2a994042dd..a66f6624d899 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -138,6 +138,14 @@ struct config_item *config_item_get(struct config_item *item) } EXPORT_SYMBOL(config_item_get); +struct config_item *config_item_get_unless_zero(struct config_item *item) +{ + if (item && kref_get_unless_zero(&item->ci_kref)) + return item; + return NULL; +} +EXPORT_SYMBOL(config_item_get_unless_zero); + static void config_item_cleanup(struct config_item *item) { struct config_item_type *t = item->ci_type; -- cgit v1.2.3 From 03f219041fdbeb31cecff41bb1cb4e1018f9cf75 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Wed, 17 May 2017 12:21:07 +0100 Subject: ceph: check i_nlink while converting a file handle to dentry Converting a file handle to a dentry can be done call after the inode unlink. This means that __fh_to_dentry() requires an extra check to verify the number of links is not 0. The issue can be easily reproduced using xfstest generic/426, which does something like: name_to_handle_at(&fh) echo 3 > /proc/sys/vm/drop_caches unlink() open_by_handle_at(&fh) The call to open_by_handle_at() should fail, as the file doesn't exist anymore. Link: http://tracker.ceph.com/issues/19958 Signed-off-by: Luis Henriques Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/export.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ceph/export.c b/fs/ceph/export.c index e8f11fa565c5..7df550c13d7f 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -91,6 +91,10 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) ceph_mdsc_put_request(req); if (!inode) return ERR_PTR(-ESTALE); + if (inode->i_nlink == 0) { + iput(inode); + return ERR_PTR(-ESTALE); + } } return d_obtain_alias(inode); -- cgit v1.2.3 From 56199016e8672feb7b903eda003a863d5bf2b8c4 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 1 Jun 2017 16:44:53 +0800 Subject: ceph: use current_kernel_time() to get request time stamp ceph uses ktime_get_real_ts() to get request time stamp. In most other cases, current_kernel_time() is used to get time stamp for filesystem operations (called by current_time()). 
There is a granularity difference between ktime_get_real_ts() and current_kernel_time(). The latter can be up to one jiffy behind the former. This can cause an inode's ctime to go backwards. Signed-off-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index f38e56fa9712..0c05df44cc6c 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1687,7 +1687,6 @@ struct ceph_mds_request * ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) { struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS); - struct timespec ts; if (!req) return ERR_PTR(-ENOMEM); @@ -1706,8 +1705,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) init_completion(&req->r_safe_completion); INIT_LIST_HEAD(&req->r_unsafe_item); - ktime_get_real_ts(&ts); - req->r_stamp = timespec_trunc(ts, mdsc->fsc->sb->s_time_gran); + req->r_stamp = timespec_trunc(current_kernel_time(), mdsc->fsc->sb->s_time_gran); req->r_op = op; req->r_direct_mode = mode; -- cgit v1.2.3 From 4ca2fea6f8277ab381bd08b996d641255b6f7b00 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 1 Jun 2017 17:08:00 +0800 Subject: ceph: unify inode i_ctime update Current __ceph_setattr() can set inode's i_ctime to current_time(), req->r_stamp or attr->ia_ctime. These time stamps may have minor differences. It may cause potential problems. Signed-off-by: "Yan, Zheng" Acked-by: Arnd Bergmann Signed-off-by: Ilya Dryomov --- fs/ceph/acl.c | 1 + fs/ceph/inode.c | 5 ++--- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 987044bca1c2..59cb307b15fb 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -131,6 +131,7 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) } if (new_mode != old_mode) { + newattrs.ia_ctime = current_time(inode); newattrs.ia_mode = new_mode; newattrs.ia_valid = ATTR_MODE; ret = __ceph_setattr(inode, &newattrs); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index dcce79b84406..4de6cdddf059 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2022,7 +2022,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) attr->ia_size > inode->i_size) { i_size_write(inode, attr->ia_size); inode->i_blocks = calc_inode_blocks(attr->ia_size); - inode->i_ctime = attr->ia_ctime; ci->i_reported_size = attr->ia_size; dirtied |= CEPH_CAP_FILE_EXCL; } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || @@ -2044,7 +2043,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec, only ?
"ctime only" : "ignored"); - inode->i_ctime = attr->ia_ctime; if (only) { /* * if kernel wants to dirty ctime but nothing else, @@ -2067,7 +2065,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) if (dirtied) { inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied, &prealloc_cf); - inode->i_ctime = current_time(inode); + inode->i_ctime = attr->ia_ctime; } release &= issued; @@ -2085,6 +2083,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) req->r_inode_drop = release; req->r_args.setattr.mask = cpu_to_le32(mask); req->r_num_caps = 1; + req->r_stamp = attr->ia_ctime; err = ceph_mdsc_do_request(mdsc, NULL, req); } dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, -- cgit v1.2.3 From 96ecff14225ad40a29f4d5cfa6bd9266c8e1e89a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 14 Jun 2017 15:17:32 -0400 Subject: ufs: fix logics in "ufs: make fsck -f happy" Storing stats _only_ at new locations is wrong for UFS1; old locations should always be kept updated. The check for "has been converted to use of new locations" is also wrong - it should be "->fs_maxbsize is equal to ->fs_bsize". Signed-off-by: Al Viro --- fs/ufs/super.c | 41 ++++++++++++++++++++++++++++------------- 1 file changed, 28 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ufs/super.c b/fs/ufs/super.c index d9aa2627c9df..eca838a8b43e 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -480,7 +480,7 @@ static void ufs_setup_cstotal(struct super_block *sb) usb3 = ubh_get_usb_third(uspi); if ((mtype == UFS_MOUNT_UFSTYPE_44BSD && - (usb1->fs_flags & UFS_FLAGS_UPDATED)) || + (usb2->fs_un.fs_u2.fs_maxbsize == usb1->fs_bsize)) || mtype == UFS_MOUNT_UFSTYPE_UFS2) { /*we have statistic in different place, then usual*/ uspi->cs_total.cs_ndir = fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir); @@ -596,9 +596,7 @@ static void ufs_put_cstotal(struct super_block *sb) usb2 = ubh_get_usb_second(uspi); usb3 = ubh_get_usb_third(uspi); - if ((mtype == UFS_MOUNT_UFSTYPE_44BSD && - (usb1->fs_flags & UFS_FLAGS_UPDATED)) || - mtype == UFS_MOUNT_UFSTYPE_UFS2) { + if (mtype == UFS_MOUNT_UFSTYPE_UFS2) { /*we have statistic in different place, then usual*/ usb2->fs_un.fs_u2.cs_ndir = cpu_to_fs64(sb, uspi->cs_total.cs_ndir); @@ -608,16 +606,26 @@ static void ufs_put_cstotal(struct super_block *sb) cpu_to_fs64(sb, uspi->cs_total.cs_nifree); usb3->fs_un1.fs_u2.cs_nffree = cpu_to_fs64(sb, uspi->cs_total.cs_nffree); - } else { - usb1->fs_cstotal.cs_ndir = - cpu_to_fs32(sb, uspi->cs_total.cs_ndir); - usb1->fs_cstotal.cs_nbfree = - cpu_to_fs32(sb, uspi->cs_total.cs_nbfree); - usb1->fs_cstotal.cs_nifree = - cpu_to_fs32(sb, uspi->cs_total.cs_nifree); - usb1->fs_cstotal.cs_nffree = - cpu_to_fs32(sb, uspi->cs_total.cs_nffree); + goto out; } + + if (mtype == UFS_MOUNT_UFSTYPE_44BSD && + (usb2->fs_un.fs_u2.fs_maxbsize == usb1->fs_bsize)) { + /* store stats in both old and new places */ + usb2->fs_un.fs_u2.cs_ndir = + cpu_to_fs64(sb, uspi->cs_total.cs_ndir); + usb2->fs_un.fs_u2.cs_nbfree = + cpu_to_fs64(sb, uspi->cs_total.cs_nbfree); + usb3->fs_un1.fs_u2.cs_nifree = + cpu_to_fs64(sb, uspi->cs_total.cs_nifree); + usb3->fs_un1.fs_u2.cs_nffree = + cpu_to_fs64(sb, uspi->cs_total.cs_nffree); + } + usb1->fs_cstotal.cs_ndir = cpu_to_fs32(sb, uspi->cs_total.cs_ndir); + usb1->fs_cstotal.cs_nbfree = cpu_to_fs32(sb, uspi->cs_total.cs_nbfree); + usb1->fs_cstotal.cs_nifree = cpu_to_fs32(sb, uspi->cs_total.cs_nifree); + usb1->fs_cstotal.cs_nffree = cpu_to_fs32(sb, uspi->cs_total.cs_nffree); +out: 
ubh_mark_buffer_dirty(USPI_UBH(uspi)); ufs_print_super_stuff(sb, usb1, usb2, usb3); UFSD("EXIT\n"); @@ -997,6 +1005,13 @@ again: flags |= UFS_ST_SUN; } + if ((flags & UFS_ST_MASK) == UFS_ST_44BSD && + uspi->s_postblformat == UFS_42POSTBLFMT) { + if (!silent) + pr_err("this is not a 44bsd filesystem"); + goto failed; + } + /* * Check ufs magic number */ -- cgit v1.2.3 From fffd70f58864f5a48b2c17d02730a460f86d4254 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 14 Jun 2017 15:36:31 -0400 Subject: ufs: make ufs_freespace() return signed as it is, checking that its return value is <= 0 is useless and that's how it's being used. Signed-off-by: Al Viro --- fs/ufs/util.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ufs/util.h b/fs/ufs/util.h index 398019fb1448..1e1639f8a58b 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -354,12 +354,12 @@ static inline void *ubh_get_data_ptr(struct ufs_sb_private_info *uspi, * Determine the number of available frags given a * percentage to hold in reserve. */ -static inline u64 +static inline s64 ufs_freespace(struct ufs_sb_private_info *uspi, int percentreserved) { return ufs_blkstofrags(uspi->cs_total.cs_nbfree) + uspi->cs_total.cs_nffree - - (uspi->s_dsize * (percentreserved) / 100); + (uspi->s_dsize * percentreserved) / 100; } /* -- cgit v1.2.3 From b451cec4bbd913688f5381efad407762a64a92ce Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 14 Jun 2017 15:41:17 -0400 Subject: ufs: fix reserved blocks check a) honour ->s_minfree; don't just go with default (5) b) don't bother with capability checks until we know we'll need them Signed-off-by: Al Viro --- fs/ufs/balloc.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index d642cc0a8271..52d1ef415f6f 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -400,10 +400,12 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, /* * There is not enough space for user on the device */ - if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) { - mutex_unlock(&UFS_SB(sb)->s_lock); - UFSD("EXIT (FAILED)\n"); - return 0; + if (unlikely(ufs_freespace(uspi, uspi->s_minfree) <= 0)) { + if (!capable(CAP_SYS_RESOURCE)) { + mutex_unlock(&UFS_SB(sb)->s_lock); + UFSD("EXIT (FAILED)\n"); + return 0; + } } if (goal >= uspi->s_size) -- cgit v1.2.3 From c596961d1b4ccc6f15754fe5a49c37ac6da57145 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 14 Jun 2017 16:36:29 -0400 Subject: ufs: fix s_size/s_dsize users For UFS2 we need 64bit variants; we even store them in uspi, but use 32bit ones instead. One wrinkle is in handling of reserved space - recalculating it every time had been stupid all along, but now it would become really ugly. Just calculate it once... 
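Condensed from the diff below (a sketch, not an additional hunk), the "calculate it once" part boils down to precomputing the reserved-space threshold at mount time with 64-bit math and comparing against it directly at allocation time:

    /* mount time: s_dsize is now 64-bit, so use mul_u64_u32_div() */
    uspi->s_root_blocks = mul_u64_u32_div(uspi->s_dsize, uspi->s_minfree, 100);

    /* allocation time: no per-call percentage arithmetic (locking elided) */
    if (unlikely(ufs_freefrags(uspi) <= uspi->s_root_blocks) &&
        !capable(CAP_SYS_RESOURCE))
            return 0;        /* reserved space, unprivileged caller */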
Signed-off-by: Al Viro --- fs/ufs/balloc.c | 2 +- fs/ufs/super.c | 23 ++++++++++++----------- fs/ufs/ufs_fs.h | 7 +++---- fs/ufs/util.h | 11 +++-------- 4 files changed, 19 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 52d1ef415f6f..af0473a851af 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -400,7 +400,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, /* * There is not enough space for user on the device */ - if (unlikely(ufs_freespace(uspi, uspi->s_minfree) <= 0)) { + if (unlikely(ufs_freefrags(uspi) <= uspi->s_root_blocks)) { if (!capable(CAP_SYS_RESOURCE)) { mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT (FAILED)\n"); diff --git a/fs/ufs/super.c b/fs/ufs/super.c index eca838a8b43e..34656c7a8e22 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1159,8 +1159,8 @@ magic_found: uspi->s_cgmask = fs32_to_cpu(sb, usb1->fs_cgmask); if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { - uspi->s_u2_size = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size); - uspi->s_u2_dsize = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize); + uspi->s_size = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size); + uspi->s_dsize = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize); } else { uspi->s_size = fs32_to_cpu(sb, usb1->fs_size); uspi->s_dsize = fs32_to_cpu(sb, usb1->fs_dsize); @@ -1209,6 +1209,9 @@ magic_found: uspi->s_postbloff = fs32_to_cpu(sb, usb3->fs_postbloff); uspi->s_rotbloff = fs32_to_cpu(sb, usb3->fs_rotbloff); + uspi->s_root_blocks = mul_u64_u32_div(uspi->s_dsize, + uspi->s_minfree, 100); + /* * Compute another frequently used values */ @@ -1398,19 +1401,17 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) mutex_lock(&UFS_SB(sb)->s_lock); usb3 = ubh_get_usb_third(uspi); - if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { + if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) buf->f_type = UFS2_MAGIC; - buf->f_blocks = fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize); - } else { + else buf->f_type = UFS_MAGIC; - buf->f_blocks = uspi->s_dsize; - } - buf->f_bfree = ufs_blkstofrags(uspi->cs_total.cs_nbfree) + - uspi->cs_total.cs_nffree; + + buf->f_blocks = uspi->s_dsize; + buf->f_bfree = ufs_freefrags(uspi); buf->f_ffree = uspi->cs_total.cs_nifree; buf->f_bsize = sb->s_blocksize; - buf->f_bavail = (buf->f_bfree > (((long)buf->f_blocks / 100) * uspi->s_minfree)) - ? (buf->f_bfree - (((long)buf->f_blocks / 100) * uspi->s_minfree)) : 0; + buf->f_bavail = (buf->f_bfree > uspi->s_root_blocks) + ? 
(buf->f_bfree - uspi->s_root_blocks) : 0; buf->f_files = uspi->s_ncg * uspi->s_ipg; buf->f_namelen = UFS_MAXNAMLEN; buf->f_fsid.val[0] = (u32)id; diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h index 0cbd5d340b67..823d55a37586 100644 --- a/fs/ufs/ufs_fs.h +++ b/fs/ufs/ufs_fs.h @@ -733,10 +733,8 @@ struct ufs_sb_private_info { __u32 s_dblkno; /* offset of first data after cg */ __u32 s_cgoffset; /* cylinder group offset in cylinder */ __u32 s_cgmask; /* used to calc mod fs_ntrak */ - __u32 s_size; /* number of blocks (fragments) in fs */ - __u32 s_dsize; /* number of data blocks in fs */ - __u64 s_u2_size; /* ufs2: number of blocks (fragments) in fs */ - __u64 s_u2_dsize; /*ufs2: number of data blocks in fs */ + __u64 s_size; /* number of blocks (fragments) in fs */ + __u64 s_dsize; /* number of data blocks in fs */ __u32 s_ncg; /* number of cylinder groups */ __u32 s_bsize; /* size of basic blocks */ __u32 s_fsize; /* size of fragments */ @@ -793,6 +791,7 @@ struct ufs_sb_private_info { __u32 s_maxsymlinklen;/* upper limit on fast symlinks' size */ __s32 fs_magic; /* filesystem magic */ unsigned int s_dirblksize; + __u64 s_root_blocks; }; /* diff --git a/fs/ufs/util.h b/fs/ufs/util.h index 1e1639f8a58b..9fc7119a1551 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -350,16 +350,11 @@ static inline void *ubh_get_data_ptr(struct ufs_sb_private_info *uspi, #define ubh_blkmap(ubh,begin,bit) \ ((*ubh_get_addr(ubh, (begin) + ((bit) >> 3)) >> ((bit) & 7)) & (0xff >> (UFS_MAXFRAG - uspi->s_fpb))) -/* - * Determine the number of available frags given a - * percentage to hold in reserve. - */ -static inline s64 -ufs_freespace(struct ufs_sb_private_info *uspi, int percentreserved) +static inline u64 +ufs_freefrags(struct ufs_sb_private_info *uspi) { return ufs_blkstofrags(uspi->cs_total.cs_nbfree) + - uspi->cs_total.cs_nffree - - (uspi->s_dsize * percentreserved) / 100; + uspi->cs_total.cs_nffree; } /* -- cgit v1.2.3 From 267309f394bf3cd8db001992890b1fa52b97974e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 14 Jun 2017 23:32:19 -0400 Subject: ufs_get_locked_page(): make sure we have buffer_heads callers rely upon that, but find_lock_page() racing with attempt of page eviction by memory pressure might have left us with * try_to_free_buffers() successfully done * __remove_mapping() failed, leaving the page in our mapping * find_lock_page() returning an uptodate page with no buffer_heads attached. 
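In short, the function must re-establish the invariant its callers rely on before returning; condensed from the diff below, the added guard is simply:

    page = find_lock_page(mapping, index);
    ...
    /* memory pressure may have stripped the buffers while leaving the
     * uptodate page in the mapping; put them back before returning */
    if (!page_has_buffers(page))
            create_empty_buffers(page, 1 << inode->i_blkbits, 0);
    return page;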
Signed-off-by: Al Viro --- fs/ufs/util.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ufs/util.c b/fs/ufs/util.c index f41ad0a6106f..02497a492eb2 100644 --- a/fs/ufs/util.c +++ b/fs/ufs/util.c @@ -243,9 +243,8 @@ ufs_set_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi, dev_t dev struct page *ufs_get_locked_page(struct address_space *mapping, pgoff_t index) { - struct page *page; - - page = find_lock_page(mapping, index); + struct inode *inode = mapping->host; + struct page *page = find_lock_page(mapping, index); if (!page) { page = read_mapping_page(mapping, index, NULL); @@ -253,7 +252,7 @@ struct page *ufs_get_locked_page(struct address_space *mapping, printk(KERN_ERR "ufs_change_blocknr: " "read_mapping_page error: ino %lu, index: %lu\n", mapping->host->i_ino, index); - goto out; + return page; } lock_page(page); @@ -262,8 +261,7 @@ struct page *ufs_get_locked_page(struct address_space *mapping, /* Truncate got there first */ unlock_page(page); put_page(page); - page = NULL; - goto out; + return NULL; } if (!PageUptodate(page) || PageError(page)) { @@ -272,11 +270,12 @@ struct page *ufs_get_locked_page(struct address_space *mapping, printk(KERN_ERR "ufs_change_blocknr: " "can not read page: ino %lu, index: %lu\n", - mapping->host->i_ino, index); + inode->i_ino, index); - page = ERR_PTR(-EIO); + return ERR_PTR(-EIO); } } -out: + if (!page_has_buffers(page)) + create_empty_buffers(page, 1 << inode->i_blkbits, 0); return page; } -- cgit v1.2.3 From 09bf4f5b6e6013f0ad6b090d4a8deebd4e56d878 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 15 Jun 2017 00:17:30 -0400 Subject: ufs: avoid grabbing ->truncate_mutex if possible tail unpacking is done in a wrong place; the deadlocks galore is best dealt with by doing that in ->write_iter() (and switching to iomap, while we are at it), but that's rather painful to backport. The trouble comes from grabbing pages that cover the beginning of tail from inside of ufs_new_fragments(); ongoing pageout of any of those is going to deadlock on ->truncate_mutex with process that got around to extending the tail holding that and waiting for page to get unlocked, while ->writepage() on that page is waiting on ->truncate_mutex. The thing is, we don't need ->truncate_mutex when the fragment we are trying to map is within the tail - the damn thing is allocated (tail can't contain holes). Let's do a plain lookup and if the fragment is present, we can just pretend that we'd won the race in almost all cases. The only exception is a fragment between the end of tail and the end of block containing tail. Protect ->i_lastfrag with ->meta_lock - read_seqlock_excl() is sufficient. 
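Condensed from the diff below (the !create path and error handling elided), the new lock-free fast path in ufs_getfrag_block() reads:

    phys64 = ufs_frag_map(inode, offsets, depth);
    if (phys64) {
            if (fragment >= UFS_NDIR_FRAGMENT)
                    goto done;              /* mapped and not part of the tail */
            read_seqlock_excl(&UFS_I(inode)->meta_lock);
            if (fragment < UFS_I(inode)->i_lastfrag) {
                    read_sequnlock_excl(&UFS_I(inode)->meta_lock);
                    goto done;              /* inside the tail: already allocated */
            }
            read_sequnlock_excl(&UFS_I(inode)->meta_lock);
    }
    /* only the remaining cases need ->truncate_mutex */
    mutex_lock(&UFS_I(inode)->truncate_mutex);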
Signed-off-by: Al Viro --- fs/ufs/balloc.c | 10 ++++++---- fs/ufs/inode.c | 26 ++++++++++++++++++++------ 2 files changed, 26 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index af0473a851af..d56d9bc705fe 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -423,12 +423,12 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, if (result) { ufs_clear_frags(inode, result + oldcount, newcount - oldcount, locked_page != NULL); + *err = 0; write_seqlock(&UFS_I(inode)->meta_lock); ufs_cpu_to_data_ptr(sb, p, result); - write_sequnlock(&UFS_I(inode)->meta_lock); - *err = 0; UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, fragment + count); + write_sequnlock(&UFS_I(inode)->meta_lock); } mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT, result %llu\n", (unsigned long long)result); @@ -441,8 +441,10 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, result = ufs_add_fragments(inode, tmp, oldcount, newcount); if (result) { *err = 0; + read_seqlock_excl(&UFS_I(inode)->meta_lock); UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, fragment + count); + read_sequnlock_excl(&UFS_I(inode)->meta_lock); ufs_clear_frags(inode, result + oldcount, newcount - oldcount, locked_page != NULL); mutex_unlock(&UFS_SB(sb)->s_lock); @@ -479,12 +481,12 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, ufs_change_blocknr(inode, fragment - oldcount, oldcount, uspi->s_sbbase + tmp, uspi->s_sbbase + result, locked_page); + *err = 0; write_seqlock(&UFS_I(inode)->meta_lock); ufs_cpu_to_data_ptr(sb, p, result); - write_sequnlock(&UFS_I(inode)->meta_lock); - *err = 0; UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, fragment + count); + write_sequnlock(&UFS_I(inode)->meta_lock); mutex_unlock(&UFS_SB(sb)->s_lock); if (newcount < request) ufs_free_fragments (inode, result + newcount, request - newcount); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index da553ffec85b..1dda6c4875f9 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -401,13 +401,20 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff u64 phys64 = 0; unsigned frag = fragment & uspi->s_fpbmask; - if (!create) { - phys64 = ufs_frag_map(inode, offsets, depth); - if (phys64) - map_bh(bh_result, sb, phys64 + frag); - return 0; - } + phys64 = ufs_frag_map(inode, offsets, depth); + if (!create) + goto done; + if (phys64) { + if (fragment >= UFS_NDIR_FRAGMENT) + goto done; + read_seqlock_excl(&UFS_I(inode)->meta_lock); + if (fragment < UFS_I(inode)->i_lastfrag) { + read_sequnlock_excl(&UFS_I(inode)->meta_lock); + goto done; + } + read_sequnlock_excl(&UFS_I(inode)->meta_lock); + } /* This code entered only while writing ....? 
*/ mutex_lock(&UFS_I(inode)->truncate_mutex); @@ -451,6 +458,11 @@ out: } mutex_unlock(&UFS_I(inode)->truncate_mutex); return err; + +done: + if (phys64) + map_bh(bh_result, sb, phys64 + frag); + return 0; } static int ufs_writepage(struct page *page, struct writeback_control *wbc) @@ -1161,7 +1173,9 @@ static void ufs_truncate_blocks(struct inode *inode) free_full_branch(inode, block, i - UFS_IND_BLOCK + 1); } } + read_seqlock_excl(&ufsi->meta_lock); ufsi->i_lastfrag = DIRECT_FRAGMENT; + read_sequnlock_excl(&ufsi->meta_lock); mark_inode_dirty(inode); mutex_unlock(&ufsi->truncate_mutex); } -- cgit v1.2.3 From 289dec5b895a7ecefb2f49da109e6aed9b0f1754 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 15 Jun 2017 00:42:56 -0400 Subject: ufs: more deadlock prevention on tail unpacking ->s_lock is not needed for ufs_change_blocknr() Signed-off-by: Al Viro --- fs/ufs/balloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index d56d9bc705fe..0315fea1d589 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -478,6 +478,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, if (result) { ufs_clear_frags(inode, result + oldcount, newcount - oldcount, locked_page != NULL); + mutex_unlock(&UFS_SB(sb)->s_lock); ufs_change_blocknr(inode, fragment - oldcount, oldcount, uspi->s_sbbase + tmp, uspi->s_sbbase + result, locked_page); @@ -487,7 +488,6 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, fragment + count); write_sequnlock(&UFS_I(inode)->meta_lock); - mutex_unlock(&UFS_SB(sb)->s_lock); if (newcount < request) ufs_free_fragments (inode, result + newcount, request - newcount); ufs_free_fragments (inode, tmp, oldcount); -- cgit v1.2.3 From a8fad984833832d5ca11a9ed64ddc55646da30e3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 15 Jun 2017 03:57:46 -0400 Subject: ufs_truncate_blocks(): fix the case when size is in the last direct block The logics when deciding whether we need to do anything with direct blocks is broken when new size is within the last direct block. It's better to find the path to the last byte _not_ to be removed and use that instead of the path to the beginning of the first block to be freed... 
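Condensed from the diff below, the starting point becomes the path to the last retained byte rather than the old rounded-up block count (e.g. with hypothetical 4KiB blocks, truncating to 10000 bytes keeps bytes 0-9999, so the path points at direct block 2, while the old calculation produced 3):

    if (inode->i_size) {
            sector_t last = (inode->i_size - 1) >> uspi->s_bshift;
            depth = ufs_block_to_path(inode, last, offsets);
            if (!depth)
                    return;
    } else {
            depth = 1;
    }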
Signed-off-by: Al Viro --- fs/ufs/inode.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 1dda6c4875f9..9f4590261134 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -886,7 +886,6 @@ static inline void free_data(struct to_free *ctx, u64 from, unsigned count) ctx->to = from + count; } -#define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize - 1) >> uspi->s_bshift) #define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift) static void ufs_trunc_direct(struct inode *inode) @@ -1124,19 +1123,24 @@ static void ufs_truncate_blocks(struct inode *inode) struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; unsigned offsets[4]; - int depth = ufs_block_to_path(inode, DIRECT_BLOCK, offsets); + int depth; int depth2; unsigned i; struct ufs_buffer_head *ubh[3]; void *p; u64 block; - if (!depth) - return; + if (inode->i_size) { + sector_t last = (inode->i_size - 1) >> uspi->s_bshift; + depth = ufs_block_to_path(inode, last, offsets); + if (!depth) + return; + } else { + depth = 1; + } - /* find the last non-zero in offsets[] */ for (depth2 = depth - 1; depth2; depth2--) - if (offsets[depth2]) + if (offsets[depth2] != uspi->s_apb - 1) break; mutex_lock(&ufsi->truncate_mutex); @@ -1145,9 +1149,8 @@ static void ufs_truncate_blocks(struct inode *inode) offsets[0] = UFS_IND_BLOCK; } else { /* get the blocks that should be partially emptied */ - p = ufs_get_direct_data_ptr(uspi, ufsi, offsets[0]); + p = ufs_get_direct_data_ptr(uspi, ufsi, offsets[0]++); for (i = 0; i < depth2; i++) { - offsets[i]++; /* next branch is fully freed */ block = ufs_data_ptr_to_cpu(sb, p); if (!block) break; @@ -1158,7 +1161,7 @@ static void ufs_truncate_blocks(struct inode *inode) write_sequnlock(&ufsi->meta_lock); break; } - p = ubh_get_data_ptr(uspi, ubh[i], offsets[i + 1]); + p = ubh_get_data_ptr(uspi, ubh[i], offsets[i + 1]++); } while (i--) free_branch_tail(inode, offsets[i + 1], ubh[i], depth - i - 1); -- cgit v1.2.3 From 81be24d263dbeddaba35827036d6f6787a59c2c3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 3 Jun 2017 07:20:09 +0100 Subject: Hang/soft lockup in d_invalidate with simultaneous calls It's not hard to trigger a bunch of d_invalidate() on the same dentry in parallel. They end up fighting each other - any dentry picked for removal by one will be skipped by the rest and we'll go for the next iteration through the entire subtree, even if everything is being skipped. Moreover, we immediately go back to scanning the subtree. The only thing we really need is to dissolve all mounts in the subtree and as soon as we've nothing left to do, we can just unhash the dentry and bugger off.
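Condensed from the diff below, each pass of the loop now either disposes of what it collected, or, when there is nothing to shrink and no mountpoint left, returns immediately (check_and_drop() has already unhashed the dentry by that point):

    d_walk(dentry, &data, detach_and_collect, check_and_drop);

    if (!list_empty(&data.select.dispose))
            shrink_dentry_list(&data.select.dispose);
    else if (!data.mountpoint)
            return;

    if (data.mountpoint) {
            detach_mounts(data.mountpoint);
            dput(data.mountpoint);
    }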
Signed-off-by: Al Viro --- fs/dcache.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index cddf39777835..a9f995f6859e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1494,7 +1494,7 @@ static void check_and_drop(void *_data) { struct detach_data *data = _data; - if (!data->mountpoint && !data->select.found) + if (!data->mountpoint && list_empty(&data->select.dispose)) __d_drop(data->select.start); } @@ -1536,17 +1536,15 @@ void d_invalidate(struct dentry *dentry) d_walk(dentry, &data, detach_and_collect, check_and_drop); - if (data.select.found) + if (!list_empty(&data.select.dispose)) shrink_dentry_list(&data.select.dispose); + else if (!data.mountpoint) + return; if (data.mountpoint) { detach_mounts(data.mountpoint); dput(data.mountpoint); } - - if (!data.mountpoint && !data.select.found) - break; - cond_resched(); } } -- cgit v1.2.3 From 4068367c9ca7b515a209f9c0c8741309a1e90495 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Thu, 8 Jun 2017 17:32:29 -0700 Subject: fs: don't forget to put old mntns in mntns_install Fixes: 4f757f3cbf54 ("make sure that mntns_install() doesn't end up with referral for root") Cc: Al Viro Signed-off-by: Andrei Vagin Signed-off-by: Al Viro --- fs/namespace.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 8bd3e4d448b9..5a4438445bf7 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3488,6 +3488,8 @@ static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) return err; } + put_mnt_ns(old_mnt_ns); + /* Update the pwd and root */ set_fs_pwd(fs, &root); set_fs_root(fs, &root); -- cgit v1.2.3 From 20223f0f39ea9d31ece08f04ac79f8c4e8d98246 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 16 Jun 2017 11:08:24 +0200 Subject: fs: pass on flags in compat_writev Fixes: 793b80ef14af ("vfs: pass a flags argument to vfs_readv/vfs_writev") Signed-off-by: Christoph Hellwig Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- fs/read_write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/read_write.c b/fs/read_write.c index 47c1d4484df9..19d4d88fa285 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1285,7 +1285,7 @@ static size_t compat_writev(struct file *file, if (!(file->f_mode & FMODE_CAN_WRITE)) goto out; - ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos, 0); + ret = compat_do_readv_writev(WRITE, file, vec, vlen, pos, flags); out: if (ret > 0) -- cgit v1.2.3 From 64c2b20301f62c697352c8028c569b1b2bdd8e82 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 16 Jun 2017 14:02:37 -0700 Subject: userfaultfd: shmem: handle coredumping in handle_userfault() Anon and hugetlbfs handle FOLL_DUMP set by get_dump_page() internally to __get_user_pages(). shmem as opposed has no special FOLL_DUMP handling there so handle_mm_fault() is invoked without mmap_sem and ends up calling handle_userfault() that isn't expecting to be invoked without mmap_sem held. This makes handle_userfault() fail immediately if invoked through shmem_vm_ops->fault during coredumping and solves the problem. The side effect is a BUG_ON with no lock held triggered by the coredumping process which exits. Only 4.11 is affected, pre-4.11 anon memory holes are skipped in __get_user_pages by checking FOLL_DUMP explicitly against empty pagetables (mm/gup.c:no_page_table()). 
It's zero cost as we already had a check for current->flags to prevent futex to trigger userfaults during exit (PF_EXITING). Link: http://lkml.kernel.org/r/20170615214838.27429-1-aarcange@redhat.com Signed-off-by: Andrea Arcangeli Reported-by: "Dr. David Alan Gilbert" Cc: [4.11+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index f7555fc25877..1d622f276e3a 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -340,9 +340,28 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) bool must_wait, return_to_userland; long blocking_state; - BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); - ret = VM_FAULT_SIGBUS; + + /* + * We don't do userfault handling for the final child pid update. + * + * We also don't do userfault handling during + * coredumping. hugetlbfs has the special + * follow_hugetlb_page() to skip missing pages in the + * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with + * the no_page_table() helper in follow_page_mask(), but the + * shmem_vm_ops->fault method is invoked even during + * coredumping without mmap_sem and it ends up here. + */ + if (current->flags & (PF_EXITING|PF_DUMPCORE)) + goto out; + + /* + * Coredumping runs without mmap_sem so we can only check that + * the mmap_sem is held, if PF_DUMPCORE was not set. + */ + WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem)); + ctx = vmf->vma->vm_userfaultfd_ctx.ctx; if (!ctx) goto out; @@ -360,12 +379,6 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) if (unlikely(ACCESS_ONCE(ctx->released))) goto out; - /* - * We don't do userfault handling for the final child pid update. - */ - if (current->flags & PF_EXITING) - goto out; - /* * Check that we can return VM_FAULT_RETRY. 
* -- cgit v1.2.3 From 23ac7cba73bb2c6e80f9cdebeb39dc3dad34ebb3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 16 Jun 2017 23:49:17 -0400 Subject: fix signedness of timestamps on ufs1 Signed-off-by: Al Viro --- fs/ufs/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 9f4590261134..7b1b810a8ab1 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -578,9 +578,9 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) i_gid_write(inode, ufs_get_inode_gid(sb, ufs_inode)); inode->i_size = fs64_to_cpu(sb, ufs_inode->ui_size); - inode->i_atime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec); - inode->i_ctime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_ctime.tv_sec); - inode->i_mtime.tv_sec = fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec); + inode->i_atime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec); + inode->i_ctime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_ctime.tv_sec); + inode->i_mtime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec); inode->i_mtime.tv_nsec = 0; inode->i_atime.tv_nsec = 0; inode->i_ctime.tv_nsec = 0; -- cgit v1.2.3 From c0ef65d2928249e822b813beb41b6c1478c556ab Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 16 Jun 2017 23:54:47 -0400 Subject: ufs_iget(): fail with -ESTALE on deleted inode Signed-off-by: Al Viro --- fs/ufs/inode.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 7b1b810a8ab1..f36d6a53687d 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -566,10 +566,8 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) */ inode->i_mode = mode = fs16_to_cpu(sb, ufs_inode->ui_mode); set_nlink(inode, fs16_to_cpu(sb, ufs_inode->ui_nlink)); - if (inode->i_nlink == 0) { - ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); - return -1; - } + if (inode->i_nlink == 0) + return -ESTALE; /* * Linux now has 32-bit uid and gid, so we can support EFT. @@ -614,10 +612,8 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode) */ inode->i_mode = mode = fs16_to_cpu(sb, ufs2_inode->ui_mode); set_nlink(inode, fs16_to_cpu(sb, ufs2_inode->ui_nlink)); - if (inode->i_nlink == 0) { - ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); - return -1; - } + if (inode->i_nlink == 0) + return -ESTALE; /* * Linux now has 32-bit uid and gid, so we can support EFT. 
@@ -657,7 +653,7 @@ struct inode *ufs_iget(struct super_block *sb, unsigned long ino) struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; struct buffer_head * bh; struct inode *inode; - int err; + int err = -EIO; UFSD("ENTER, ino %lu\n", ino); @@ -692,9 +688,10 @@ struct inode *ufs_iget(struct super_block *sb, unsigned long ino) err = ufs1_read_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino)); } - + brelse(bh); if (err) goto bad_inode; + inode->i_version++; ufsi->i_lastfrag = (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift; @@ -703,15 +700,13 @@ struct inode *ufs_iget(struct super_block *sb, unsigned long ino) ufs_set_inode_ops(inode); - brelse(bh); - UFSD("EXIT\n"); unlock_new_inode(inode); return inode; bad_inode: iget_failed(inode); - return ERR_PTR(-EIO); + return ERR_PTR(err); } static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode) -- cgit v1.2.3 From 77e9ce327d9b607cd6e57c0f4524a654dc59c4b1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 17 Jun 2017 15:44:06 -0400 Subject: ufs: fix the logics for tail relocation * original hysteresis loop got broken by typo back in 2002; now it never switches out of OPTTIME state. Fixed. * critical levels for switching from OPTTIME to OPTSPACE and back ought to be calculated once, at mount time. * we should use mul_u64_u32_div() for those calculations, now that ->s_dsize is 64bit. * to quote Kirk McKusick (in 1995 FreeBSD commit message): The threshold for switching from time-space and space-time is too small when minfree is 5%...so make it stay at space in this case. Signed-off-by: Al Viro --- fs/ufs/balloc.c | 22 ++++++---------------- fs/ufs/super.c | 9 +++++++++ fs/ufs/ufs_fs.h | 2 ++ 3 files changed, 17 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 0315fea1d589..f80be4c5df9d 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -455,24 +455,14 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, /* * allocate new block and move data */ - switch (fs32_to_cpu(sb, usb1->fs_optim)) { - case UFS_OPTSPACE: + if (fs32_to_cpu(sb, usb1->fs_optim) == UFS_OPTSPACE) { request = newcount; - if (uspi->s_minfree < 5 || uspi->cs_total.cs_nffree - > uspi->s_dsize * uspi->s_minfree / (2 * 100)) - break; - usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME); - break; - default: - usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME); - - case UFS_OPTTIME: + if (uspi->cs_total.cs_nffree < uspi->s_space_to_time) + usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME); + } else { request = uspi->s_fpb; - if (uspi->cs_total.cs_nffree < uspi->s_dsize * - (uspi->s_minfree - 2) / 100) - break; - usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTTIME); - break; + if (uspi->cs_total.cs_nffree > uspi->s_time_to_space) + usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTSPACE); } result = ufs_alloc_fragments (inode, cgno, goal, request, err); if (result) { diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 34656c7a8e22..f211b662dd92 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1211,6 +1211,15 @@ magic_found: uspi->s_root_blocks = mul_u64_u32_div(uspi->s_dsize, uspi->s_minfree, 100); + if (uspi->s_minfree <= 5) { + uspi->s_time_to_space = ~0ULL; + uspi->s_space_to_time = 0; + usb1->fs_optim = cpu_to_fs32(sb, UFS_OPTSPACE); + } else { + uspi->s_time_to_space = (uspi->s_root_blocks / 2) + 1; + uspi->s_space_to_time = mul_u64_u32_div(uspi->s_dsize, + uspi->s_minfree - 2, 100) - 1; + } /* * Compute another frequently used values diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h index 
823d55a37586..150eef6f1233 100644 --- a/fs/ufs/ufs_fs.h +++ b/fs/ufs/ufs_fs.h @@ -792,6 +792,8 @@ struct ufs_sb_private_info { __s32 fs_magic; /* filesystem magic */ unsigned int s_dirblksize; __u64 s_root_blocks; + __u64 s_time_to_space; + __u64 s_space_to_time; }; /* -- cgit v1.2.3 From 1be7107fbe18eed3e319a6c3e83c78254b693acb Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 19 Jun 2017 04:03:24 -0700 Subject: mm: larger stack guard gap, between vmas Stack guard page is a useful feature to reduce a risk of stack smashing into a different mapping. We have been using a single page gap which is sufficient to prevent having stack adjacent to a different mapping. But this seems to be insufficient in the light of the stack usage in userspace. E.g. glibc uses as large as 64kB alloca() in many commonly used functions. Others use constructs like gid_t buffer[NGROUPS_MAX] which is 256kB or stack strings with MAX_ARG_STRLEN. This will become especially dangerous for suid binaries and the default no limit for the stack size limit because those applications can be tricked to consume a large portion of the stack and a single glibc call could jump over the guard page. These attacks are not theoretical, unfortunately. Make those attacks less probable by increasing the stack guard gap to 1MB (on systems with 4k pages; but make it depend on the page size because systems with larger base pages might cap stack allocations in the PAGE_SIZE units) which should cover larger alloca() and VLA stack allocations. It is obviously not a full fix because the problem is somehow inherent, but it should reduce attack space a lot. One could argue that the gap size should be configurable from userspace, but that can be done later when somebody finds that the new 1MB is wrong for some special case applications. For now, add a kernel command line option (stack_guard_gap) to specify the stack gap size (in page units). Implementation wise, first delete all the old code for stack guard page: because although we could get away with accounting one extra page in a stack vma, accounting a larger gap can break userspace - case in point, a program run with "ulimit -S -v 20000" failed when the 1MB gap was counted for RLIMIT_AS; similar problems could come with RLIMIT_MLOCK and strict non-overcommit mode. Instead of keeping gap inside the stack vma, maintain the stack guard gap as a gap between vmas: using vm_start_gap() in place of vm_start (or vm_end_gap() in place of vm_end if VM_GROWSUP) in just those few places which need to respect the gap - mainly arch_get_unmapped_area(), and the vma tree's subtree_gap support for that.
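The mm side of the change is outside this fs-limited listing; for context, the helper referred to above is roughly of the following shape (a reconstructed sketch, not one of the hunks below, assuming stack_guard_gap defaults to 256 pages, i.e. 1MB with 4kB pages, and is overridable via the stack_guard_gap= boot parameter):

    static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
    {
            unsigned long vm_start = vma->vm_start;

            if (vma->vm_flags & VM_GROWSDOWN) {
                    vm_start -= stack_guard_gap;
                    if (vm_start > vma->vm_start)   /* underflow: clamp to 0 */
                            vm_start = 0;
            }
            return vm_start;
    }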
Original-patch-by: Oleg Nesterov Original-patch-by: Michal Hocko Signed-off-by: Hugh Dickins Acked-by: Michal Hocko Tested-by: Helge Deller # parisc Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 2 +- fs/proc/task_mmu.c | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index dde861387a40..d44f5456eb9b 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -200,7 +200,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma))) return addr; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f0c8b33d99b1..520802da059c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -300,11 +300,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) /* We don't show the stack guard page in /proc/maps */ start = vma->vm_start; - if (stack_guard_page_start(vma, start)) - start += PAGE_SIZE; end = vma->vm_end; - if (stack_guard_page_end(vma, end)) - end -= PAGE_SIZE; seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", -- cgit v1.2.3