diff options
author | Takashi Iwai <tiwai@suse.de> | 2021-03-02 18:30:07 +0100 |
---|---|---|
committer | Takashi Iwai <tiwai@suse.de> | 2021-03-02 18:30:07 +0100 |
commit | 9b838a3c32d7a1edd7edeec1bc455eca76622218 (patch) | |
tree | b959299355265d21586c3782fa93f0a5a2ac068d /fs | |
parent | a864e8f159b13babf552aff14a5fbe11abc017e4 (diff) | |
parent | ffd7e705fad695fc0abd5809ef8dc72cda7e49a6 (diff) | |
download | linux-9b838a3c32d7a1edd7edeec1bc455eca76622218.tar.gz linux-9b838a3c32d7a1edd7edeec1bc455eca76622218.tar.bz2 linux-9b838a3c32d7a1edd7edeec1bc455eca76622218.zip |
Merge tag 'tags/sound-sdw-kconfig-fixes' into for-linus
ALSA/ASoC/SOF/SoundWire: fix Kconfig issues
In January, Intel kbuild bot and Arnd Bergmann reported multiple
issues with randconfig. This patchset builds on Arnd's suggestions to
a) expose ACPI and PCI devices in separate modules, while sof-acpi-dev
and sof-pci-dev become helpers. This will result in minor changes
required for developers/testers, i.e. modprobe snd-sof-pci will no
longer result in a probe. The SOF CI was already updated to deal with
this module dependency change and introduction of new modules.
b) Fix SOF/SoundWire/DSP_config dependencies by moving the code
required to detect SoundWire presence in ACPI tables to sound/hda.
Link: https://lore.kernel.org/r/20210302003125.1178419-1-pierre-louis.bossart@linux.intel.com
Diffstat (limited to 'fs')
389 files changed, 6856 insertions, 4821 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 6261719f6f2a..bb1b286c49ae 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -239,6 +239,7 @@ static int v9fs_xattr_get_acl(const struct xattr_handler *handler, } static int v9fs_xattr_set_acl(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -258,7 +259,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EPERM; if (value) { /* update the cached acl value */ @@ -279,7 +280,8 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, struct iattr iattr = { 0 }; struct posix_acl *old_acl = acl; - retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl); + retval = posix_acl_update_mode(&init_user_ns, inode, + &iattr.ia_mode, &acl); if (retval) goto err_out; if (!acl) { @@ -297,7 +299,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, * What is the following setxattr update the * mode ? */ - v9fs_vfs_setattr_dotl(dentry, &iattr); + v9fs_vfs_setattr_dotl(&init_user_ns, dentry, &iattr); } break; case ACL_TYPE_DEFAULT: diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 7b763776306e..4ca56c5dd637 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -135,7 +135,8 @@ extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d); extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d); -extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, +extern int v9fs_vfs_rename(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses, diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index fd2a2b040250..d44ade76966a 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -59,7 +59,8 @@ void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat); int v9fs_uflags2omode(int uflags, int extended); void v9fs_blank_wstat(struct p9_wstat *wstat); -int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *); +int v9fs_vfs_setattr_dotl(struct user_namespace *, struct dentry *, + struct iattr *); int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, int datasync); int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode); diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 4a937fac1acb..8d97f0b45e9c 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -251,7 +251,7 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, { int err = 0; - inode_init_owner(inode, NULL, mode); + inode_init_owner(&init_user_ns,inode, NULL, mode); inode->i_blocks = 0; inode->i_rdev = rdev; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); @@ -676,8 +676,8 @@ error: */ static int -v9fs_vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +v9fs_vfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); u32 perm = unixmode2p9mode(v9ses, mode); @@ -702,7 +702,8 @@ v9fs_vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, * */ -static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int v9fs_vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { int err; u32 perm; @@ -907,9 +908,9 @@ int v9fs_vfs_rmdir(struct inode *i, struct dentry *d) */ int -v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +v9fs_vfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { int retval; struct inode *old_inode; @@ -1016,8 +1017,8 @@ done: */ static int -v9fs_vfs_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +v9fs_vfs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; struct v9fs_session_info *v9ses; @@ -1027,7 +1028,7 @@ v9fs_vfs_getattr(const struct path *path, struct kstat *stat, p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { - generic_fillattr(d_inode(dentry), stat); + generic_fillattr(&init_user_ns, d_inode(dentry), stat); return 0; } fid = v9fs_fid_lookup(dentry); @@ -1040,7 +1041,7 @@ v9fs_vfs_getattr(const struct path *path, struct kstat *stat, return PTR_ERR(st); v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0); - generic_fillattr(d_inode(dentry), stat); + generic_fillattr(&init_user_ns, d_inode(dentry), stat); p9stat_free(st); kfree(st); @@ -1054,7 +1055,8 @@ v9fs_vfs_getattr(const struct path *path, struct kstat *stat, * */ -static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) +static int v9fs_vfs_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *iattr) { int retval, use_dentry = 0; struct v9fs_session_info *v9ses; @@ -1062,7 +1064,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) struct p9_wstat wstat; p9_debug(P9_DEBUG_VFS, "\n"); - retval = setattr_prepare(dentry, iattr); + retval = setattr_prepare(&init_user_ns, dentry, iattr); if (retval) return retval; @@ -1118,7 +1120,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) v9fs_invalidate_inode_attr(d_inode(dentry)); - setattr_copy(d_inode(dentry), iattr); + setattr_copy(&init_user_ns, d_inode(dentry), iattr); mark_inode_dirty(d_inode(dentry)); return 0; } @@ -1137,9 +1139,6 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, struct super_block *sb, unsigned int flags) { umode_t mode; - char ext[32]; - char tag_name[14]; - unsigned int i_nlink; struct v9fs_session_info *v9ses = sb->s_fs_info; struct v9fs_inode *v9inode = V9FS_I(inode); @@ -1157,18 +1156,18 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, inode->i_gid = stat->n_gid; } if ((S_ISREG(inode->i_mode)) || (S_ISDIR(inode->i_mode))) { - if (v9fs_proto_dotu(v9ses) && (stat->extension[0] != '\0')) { + if (v9fs_proto_dotu(v9ses)) { + unsigned int i_nlink; /* - * Hadlink support got added later to - * to the .u extension. So there can be - * server out there that doesn't support - * this even with .u extension. So check - * for non NULL stat->extension + * Hadlink support got added later to the .u extension. + * So there can be a server out there that doesn't + * support this even with .u extension. That would + * just leave us with stat->extension being an empty + * string, though. */ - strlcpy(ext, stat->extension, sizeof(ext)); /* HARDLINKCOUNT %u */ - sscanf(ext, "%13s %u", tag_name, &i_nlink); - if (!strncmp(tag_name, "HARDLINKCOUNT", 13)) + if (sscanf(stat->extension, + " HARDLINKCOUNT %u", &i_nlink) == 1) set_nlink(inode, i_nlink); } } @@ -1295,7 +1294,8 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, */ static int -v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +v9fs_vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { p9_debug(P9_DEBUG_VFS, " %lu,%pd,%s\n", dir->i_ino, dentry, symname); @@ -1348,7 +1348,8 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, */ static int -v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) +v9fs_vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); int retval; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 823c2eb5f1bf..1dc7af046615 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -33,8 +33,8 @@ #include "acl.h" static int -v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, - dev_t rdev); +v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t omode, dev_t rdev); /** * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a @@ -218,10 +218,10 @@ int v9fs_open_to_dotl_flags(int flags) */ static int -v9fs_vfs_create_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, - bool excl) +v9fs_vfs_create_dotl(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t omode, bool excl) { - return v9fs_vfs_mknod_dotl(dir, dentry, omode, 0); + return v9fs_vfs_mknod_dotl(mnt_userns, dir, dentry, omode, 0); } static int @@ -367,8 +367,9 @@ err_clunk_old_fid: * */ -static int v9fs_vfs_mkdir_dotl(struct inode *dir, - struct dentry *dentry, umode_t omode) +static int v9fs_vfs_mkdir_dotl(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, + umode_t omode) { int err; struct v9fs_session_info *v9ses; @@ -457,8 +458,9 @@ error: } static int -v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +v9fs_vfs_getattr_dotl(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; struct v9fs_session_info *v9ses; @@ -468,7 +470,7 @@ v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat, p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { - generic_fillattr(d_inode(dentry), stat); + generic_fillattr(&init_user_ns, d_inode(dentry), stat); return 0; } fid = v9fs_fid_lookup(dentry); @@ -485,7 +487,7 @@ v9fs_vfs_getattr_dotl(const struct path *path, struct kstat *stat, return PTR_ERR(st); v9fs_stat2inode_dotl(st, d_inode(dentry), 0); - generic_fillattr(d_inode(dentry), stat); + generic_fillattr(&init_user_ns, d_inode(dentry), stat); /* Change block size to what the server returned */ stat->blksize = st->st_blksize; @@ -540,7 +542,8 @@ static int v9fs_mapped_iattr_valid(int iattr_valid) * */ -int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) +int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *iattr) { int retval, use_dentry = 0; struct p9_fid *fid = NULL; @@ -549,7 +552,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) p9_debug(P9_DEBUG_VFS, "\n"); - retval = setattr_prepare(dentry, iattr); + retval = setattr_prepare(&init_user_ns, dentry, iattr); if (retval) return retval; @@ -590,7 +593,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) truncate_setsize(inode, iattr->ia_size); v9fs_invalidate_inode_attr(inode); - setattr_copy(inode, iattr); + setattr_copy(&init_user_ns, inode, iattr); mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) { /* We also want to update ACL when we update mode bits */ @@ -684,8 +687,8 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, } static int -v9fs_vfs_symlink_dotl(struct inode *dir, struct dentry *dentry, - const char *symname) +v9fs_vfs_symlink_dotl(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { int err; kgid_t gid; @@ -824,8 +827,8 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, * */ static int -v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode, - dev_t rdev) +v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t omode, dev_t rdev) { int err; kgid_t gid; diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index 87217dd0433e..ee331845e2c7 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -157,6 +157,7 @@ static int v9fs_xattr_handler_get(const struct xattr_handler *handler, } static int v9fs_xattr_handler_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index 699c4fa8b78b..06b7c92343ad 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -144,7 +144,8 @@ struct adfs_discmap { /* Inode stuff */ struct inode *adfs_iget(struct super_block *sb, struct object_info *obj); int adfs_write_inode(struct inode *inode, struct writeback_control *wbc); -int adfs_notify_change(struct dentry *dentry, struct iattr *attr); +int adfs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr); /* map.c */ int adfs_map_lookup(struct super_block *sb, u32 frag_id, unsigned int offset); diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 32620f4a7623..fb7ee026d101 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -292,14 +292,15 @@ out: * later. */ int -adfs_notify_change(struct dentry *dentry, struct iattr *attr) +adfs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; unsigned int ia_valid = attr->ia_valid; int error; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); /* * we can't change the UID or GID of any file - diff --git a/fs/affs/affs.h b/fs/affs/affs.h index a755bef7c4c7..bfa89e131ead 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -167,27 +167,33 @@ extern const struct export_operations affs_export_ops; extern int affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len); extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int); extern int affs_unlink(struct inode *dir, struct dentry *dentry); -extern int affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool); -extern int affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); +extern int affs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool); +extern int affs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode); extern int affs_rmdir(struct inode *dir, struct dentry *dentry); extern int affs_link(struct dentry *olddentry, struct inode *dir, struct dentry *dentry); -extern int affs_symlink(struct inode *dir, struct dentry *dentry, - const char *symname); -extern int affs_rename2(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags); +extern int affs_symlink(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, + const char *symname); +extern int affs_rename2(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags); /* inode.c */ extern struct inode *affs_new_inode(struct inode *dir); -extern int affs_notify_change(struct dentry *dentry, struct iattr *attr); +extern int affs_notify_change(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr); extern void affs_evict_inode(struct inode *inode); extern struct inode *affs_iget(struct super_block *sb, unsigned long ino); extern int affs_write_inode(struct inode *inode, struct writeback_control *wbc); -extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s32 type); +extern int affs_add_entry(struct inode *dir, struct inode *inode, + struct dentry *dentry, s32 type); /* file.c */ diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 044412110b52..2352a75bd9d6 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -216,14 +216,15 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc) } int -affs_notify_change(struct dentry *dentry, struct iattr *attr) +affs_notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; pr_debug("notify_change(%lu,0x%x)\n", inode->i_ino, attr->ia_valid); - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) goto out; @@ -249,7 +250,7 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr) affs_truncate(inode); } - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); if (attr->ia_valid & ATTR_MODE) diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 5400a876d73f..bcab18956b4f 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -242,7 +242,8 @@ affs_unlink(struct inode *dir, struct dentry *dentry) } int -affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) +affs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct super_block *sb = dir->i_sb; struct inode *inode; @@ -273,7 +274,8 @@ affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) } int -affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +affs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; int error; @@ -311,7 +313,8 @@ affs_rmdir(struct inode *dir, struct dentry *dentry) } int -affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +affs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { struct super_block *sb = dir->i_sb; struct buffer_head *bh; @@ -500,9 +503,9 @@ done: return retval; } -int affs_rename2(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +int affs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 7bd659ad959e..714fcca9af99 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -28,18 +28,19 @@ static int afs_lookup_one_filldir(struct dir_context *ctx, const char *name, int loff_t fpos, u64 ino, unsigned dtype); static int afs_lookup_filldir(struct dir_context *ctx, const char *name, int nlen, loff_t fpos, u64 ino, unsigned dtype); -static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl); -static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); +static int afs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl); +static int afs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode); static int afs_rmdir(struct inode *dir, struct dentry *dentry); static int afs_unlink(struct inode *dir, struct dentry *dentry); static int afs_link(struct dentry *from, struct inode *dir, struct dentry *dentry); -static int afs_symlink(struct inode *dir, struct dentry *dentry, - const char *content); -static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags); +static int afs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *content); +static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags); static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags); static void afs_dir_invalidatepage(struct page *page, unsigned int offset, unsigned int length); @@ -1325,7 +1326,8 @@ static const struct afs_operation_ops afs_mkdir_operation = { /* * create a directory on an AFS filesystem */ -static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int afs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct afs_operation *op; struct afs_vnode *dvnode = AFS_FS_I(dir); @@ -1619,8 +1621,8 @@ static const struct afs_operation_ops afs_create_operation = { /* * create a regular file on an AFS filesystem */ -static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int afs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct afs_operation *op; struct afs_vnode *dvnode = AFS_FS_I(dir); @@ -1741,8 +1743,8 @@ static const struct afs_operation_ops afs_symlink_operation = { /* * create a symlink in an AFS filesystem */ -static int afs_symlink(struct inode *dir, struct dentry *dentry, - const char *content) +static int afs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *content) { struct afs_operation *op; struct afs_vnode *dvnode = AFS_FS_I(dir); @@ -1876,9 +1878,9 @@ static const struct afs_operation_ops afs_rename_operation = { /* * rename a file in an AFS filesystem and/or move it between directories */ -static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { struct afs_operation *op; struct afs_vnode *orig_dvnode, *new_dvnode, *vnode; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index b0d7b892090d..1156b2df28d3 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -734,8 +734,8 @@ error_unlock: /* * read the attributes of an inode */ -int afs_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags) +int afs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct afs_vnode *vnode = AFS_FS_I(inode); @@ -745,7 +745,7 @@ int afs_getattr(const struct path *path, struct kstat *stat, do { read_seqbegin_or_lock(&vnode->cb_lock, &seq); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) && stat->nlink > 0) stat->nlink -= 1; @@ -857,7 +857,8 @@ static const struct afs_operation_ops afs_setattr_operation = { /* * set the attributes of an inode */ -int afs_setattr(struct dentry *dentry, struct iattr *attr) +int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct afs_operation *op; struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 0d150a29e39e..b626e38e9ab5 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1149,8 +1149,9 @@ extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *); extern struct inode *afs_root_iget(struct super_block *, struct key *); extern bool afs_check_validity(struct afs_vnode *); extern int afs_validate(struct afs_vnode *, struct key *); -extern int afs_getattr(const struct path *, struct kstat *, u32, unsigned int); -extern int afs_setattr(struct dentry *, struct iattr *); +extern int afs_getattr(struct user_namespace *mnt_userns, const struct path *, + struct kstat *, u32, unsigned int); +extern int afs_setattr(struct user_namespace *mnt_userns, struct dentry *, struct iattr *); extern void afs_evict_inode(struct inode *); extern int afs_drop_inode(struct inode *); @@ -1361,7 +1362,7 @@ extern void afs_zap_permits(struct rcu_head *); extern struct key *afs_request_key(struct afs_cell *); extern struct key *afs_request_key_rcu(struct afs_cell *); extern int afs_check_permit(struct afs_vnode *, struct key *, afs_access_t *); -extern int afs_permission(struct inode *, int); +extern int afs_permission(struct user_namespace *, struct inode *, int); extern void __exit afs_clean_up_permit_cache(void); /* diff --git a/fs/afs/security.c b/fs/afs/security.c index 9cf3102f370c..3c7a8fc4f93f 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -396,7 +396,8 @@ int afs_check_permit(struct afs_vnode *vnode, struct key *key, * - AFS ACLs are attached to directories only, and a file is controlled by its * parent directory's ACL */ -int afs_permission(struct inode *inode, int mask) +int afs_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask) { struct afs_vnode *vnode = AFS_FS_I(inode); afs_access_t access; diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c index 95c573dcda11..c629caae5002 100644 --- a/fs/afs/xattr.c +++ b/fs/afs/xattr.c @@ -120,6 +120,7 @@ static const struct afs_operation_ops afs_store_acl_operation = { * Set a file's AFS3 ACL. */ static int afs_xattr_set_acl(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) @@ -248,6 +249,7 @@ static const struct afs_operation_ops yfs_store_opaque_acl2_operation = { * Set a file's YFS ACL. */ static int afs_xattr_set_yfs(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 89714308c25b..a280156138ed 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -55,61 +55,79 @@ static struct file_system_type anon_inode_fs_type = { .kill_sb = kill_anon_super, }; -/** - * anon_inode_getfile - creates a new file instance by hooking it up to an - * anonymous inode, and a dentry that describe the "class" - * of the file - * - * @name: [in] name of the "class" of the new file - * @fops: [in] file operations for the new file - * @priv: [in] private data for the new file (will be file's private_data) - * @flags: [in] flags - * - * Creates a new file by hooking it on a single inode. This is useful for files - * that do not need to have a full-fledged inode in order to operate correctly. - * All the files created with anon_inode_getfile() will share a single inode, - * hence saving memory and avoiding code duplication for the file/inode/dentry - * setup. Returns the newly created file* or an error pointer. - */ -struct file *anon_inode_getfile(const char *name, - const struct file_operations *fops, - void *priv, int flags) +static struct inode *anon_inode_make_secure_inode( + const char *name, + const struct inode *context_inode) { - struct file *file; + struct inode *inode; + const struct qstr qname = QSTR_INIT(name, strlen(name)); + int error; + + inode = alloc_anon_inode(anon_inode_mnt->mnt_sb); + if (IS_ERR(inode)) + return inode; + inode->i_flags &= ~S_PRIVATE; + error = security_inode_init_security_anon(inode, &qname, context_inode); + if (error) { + iput(inode); + return ERR_PTR(error); + } + return inode; +} - if (IS_ERR(anon_inode_inode)) - return ERR_PTR(-ENODEV); +static struct file *__anon_inode_getfile(const char *name, + const struct file_operations *fops, + void *priv, int flags, + const struct inode *context_inode, + bool secure) +{ + struct inode *inode; + struct file *file; if (fops->owner && !try_module_get(fops->owner)) return ERR_PTR(-ENOENT); - /* - * We know the anon_inode inode count is always greater than zero, - * so ihold() is safe. - */ - ihold(anon_inode_inode); - file = alloc_file_pseudo(anon_inode_inode, anon_inode_mnt, name, + if (secure) { + inode = anon_inode_make_secure_inode(name, context_inode); + if (IS_ERR(inode)) { + file = ERR_CAST(inode); + goto err; + } + } else { + inode = anon_inode_inode; + if (IS_ERR(inode)) { + file = ERR_PTR(-ENODEV); + goto err; + } + /* + * We know the anon_inode inode count is always + * greater than zero, so ihold() is safe. + */ + ihold(inode); + } + + file = alloc_file_pseudo(inode, anon_inode_mnt, name, flags & (O_ACCMODE | O_NONBLOCK), fops); if (IS_ERR(file)) - goto err; + goto err_iput; - file->f_mapping = anon_inode_inode->i_mapping; + file->f_mapping = inode->i_mapping; file->private_data = priv; return file; +err_iput: + iput(inode); err: - iput(anon_inode_inode); module_put(fops->owner); return file; } -EXPORT_SYMBOL_GPL(anon_inode_getfile); /** - * anon_inode_getfd - creates a new file instance by hooking it up to an - * anonymous inode, and a dentry that describe the "class" - * of the file + * anon_inode_getfile - creates a new file instance by hooking it up to an + * anonymous inode, and a dentry that describe the "class" + * of the file * * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file @@ -118,12 +136,23 @@ EXPORT_SYMBOL_GPL(anon_inode_getfile); * * Creates a new file by hooking it on a single inode. This is useful for files * that do not need to have a full-fledged inode in order to operate correctly. - * All the files created with anon_inode_getfd() will share a single inode, + * All the files created with anon_inode_getfile() will share a single inode, * hence saving memory and avoiding code duplication for the file/inode/dentry - * setup. Returns new descriptor or an error code. + * setup. Returns the newly created file* or an error pointer. */ -int anon_inode_getfd(const char *name, const struct file_operations *fops, - void *priv, int flags) +struct file *anon_inode_getfile(const char *name, + const struct file_operations *fops, + void *priv, int flags) +{ + return __anon_inode_getfile(name, fops, priv, flags, NULL, false); +} +EXPORT_SYMBOL_GPL(anon_inode_getfile); + +static int __anon_inode_getfd(const char *name, + const struct file_operations *fops, + void *priv, int flags, + const struct inode *context_inode, + bool secure) { int error, fd; struct file *file; @@ -133,7 +162,8 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops, return error; fd = error; - file = anon_inode_getfile(name, fops, priv, flags); + file = __anon_inode_getfile(name, fops, priv, flags, context_inode, + secure); if (IS_ERR(file)) { error = PTR_ERR(file); goto err_put_unused_fd; @@ -146,8 +176,55 @@ err_put_unused_fd: put_unused_fd(fd); return error; } + +/** + * anon_inode_getfd - creates a new file instance by hooking it up to + * an anonymous inode and a dentry that describe + * the "class" of the file + * + * @name: [in] name of the "class" of the new file + * @fops: [in] file operations for the new file + * @priv: [in] private data for the new file (will be file's private_data) + * @flags: [in] flags + * + * Creates a new file by hooking it on a single inode. This is + * useful for files that do not need to have a full-fledged inode in + * order to operate correctly. All the files created with + * anon_inode_getfd() will use the same singleton inode, reducing + * memory use and avoiding code duplication for the file/inode/dentry + * setup. Returns a newly created file descriptor or an error code. + */ +int anon_inode_getfd(const char *name, const struct file_operations *fops, + void *priv, int flags) +{ + return __anon_inode_getfd(name, fops, priv, flags, NULL, false); +} EXPORT_SYMBOL_GPL(anon_inode_getfd); +/** + * anon_inode_getfd_secure - Like anon_inode_getfd(), but creates a new + * !S_PRIVATE anon inode rather than reuse the singleton anon inode, and calls + * the inode_init_security_anon() LSM hook. This allows the inode to have its + * own security context and for a LSM to reject creation of the inode. + * + * @name: [in] name of the "class" of the new file + * @fops: [in] file operations for the new file + * @priv: [in] private data for the new file (will be file's private_data) + * @flags: [in] flags + * @context_inode: + * [in] the logical relationship with the new inode (optional) + * + * The LSM may use @context_inode in inode_init_security_anon(), but a + * reference to it is not held. + */ +int anon_inode_getfd_secure(const char *name, const struct file_operations *fops, + void *priv, int flags, + const struct inode *context_inode) +{ + return __anon_inode_getfd(name, fops, priv, flags, context_inode, true); +} +EXPORT_SYMBOL_GPL(anon_inode_getfd_secure); + static int __init anon_inode_init(void) { anon_inode_mnt = kern_mount(&anon_inode_fs_type); diff --git a/fs/attr.c b/fs/attr.c index b4bbdbd4c8ca..87ef39db1c34 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -18,27 +18,55 @@ #include <linux/evm.h> #include <linux/ima.h> -static bool chown_ok(const struct inode *inode, kuid_t uid) +/** + * chown_ok - verify permissions to chown inode + * @mnt_userns: user namespace of the mount @inode was found from + * @inode: inode to check permissions on + * @uid: uid to chown @inode to + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then + * take care to map the inode according to @mnt_userns before checking + * permissions. On non-idmapped mounts or if permission checking is to be + * performed on the raw inode simply passs init_user_ns. + */ +static bool chown_ok(struct user_namespace *mnt_userns, + const struct inode *inode, + kuid_t uid) { - if (uid_eq(current_fsuid(), inode->i_uid) && - uid_eq(uid, inode->i_uid)) + kuid_t kuid = i_uid_into_mnt(mnt_userns, inode); + if (uid_eq(current_fsuid(), kuid) && uid_eq(uid, kuid)) return true; - if (capable_wrt_inode_uidgid(inode, CAP_CHOWN)) + if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN)) return true; - if (uid_eq(inode->i_uid, INVALID_UID) && + if (uid_eq(kuid, INVALID_UID) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) return true; return false; } -static bool chgrp_ok(const struct inode *inode, kgid_t gid) +/** + * chgrp_ok - verify permissions to chgrp inode + * @mnt_userns: user namespace of the mount @inode was found from + * @inode: inode to check permissions on + * @gid: gid to chown @inode to + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then + * take care to map the inode according to @mnt_userns before checking + * permissions. On non-idmapped mounts or if permission checking is to be + * performed on the raw inode simply passs init_user_ns. + */ +static bool chgrp_ok(struct user_namespace *mnt_userns, + const struct inode *inode, kgid_t gid) { - if (uid_eq(current_fsuid(), inode->i_uid) && - (in_group_p(gid) || gid_eq(gid, inode->i_gid))) + kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); + if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)) && + (in_group_p(gid) || gid_eq(gid, kgid))) return true; - if (capable_wrt_inode_uidgid(inode, CAP_CHOWN)) + if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN)) return true; - if (gid_eq(inode->i_gid, INVALID_GID) && + if (gid_eq(kgid, INVALID_GID) && ns_capable(inode->i_sb->s_user_ns, CAP_CHOWN)) return true; return false; @@ -46,6 +74,7 @@ static bool chgrp_ok(const struct inode *inode, kgid_t gid) /** * setattr_prepare - check if attribute changes to a dentry are allowed + * @mnt_userns: user namespace of the mount the inode was found from * @dentry: dentry to check * @attr: attributes to change * @@ -55,10 +84,17 @@ static bool chgrp_ok(const struct inode *inode, kgid_t gid) * SGID bit from mode if user is not allowed to set it. Also file capabilities * and IMA extended attributes are cleared if ATTR_KILL_PRIV is set. * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then + * take care to map the inode according to @mnt_userns before checking + * permissions. On non-idmapped mounts or if permission checking is to be + * performed on the raw inode simply passs init_user_ns. + * * Should be called as the first thing in ->setattr implementations, * possibly after taking additional locks. */ -int setattr_prepare(struct dentry *dentry, struct iattr *attr) +int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct inode *inode = d_inode(dentry); unsigned int ia_valid = attr->ia_valid; @@ -78,27 +114,27 @@ int setattr_prepare(struct dentry *dentry, struct iattr *attr) goto kill_priv; /* Make sure a caller can chown. */ - if ((ia_valid & ATTR_UID) && !chown_ok(inode, attr->ia_uid)) + if ((ia_valid & ATTR_UID) && !chown_ok(mnt_userns, inode, attr->ia_uid)) return -EPERM; /* Make sure caller can chgrp. */ - if ((ia_valid & ATTR_GID) && !chgrp_ok(inode, attr->ia_gid)) + if ((ia_valid & ATTR_GID) && !chgrp_ok(mnt_userns, inode, attr->ia_gid)) return -EPERM; /* Make sure a caller can chmod. */ if (ia_valid & ATTR_MODE) { - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EPERM; /* Also check the setgid bit! */ - if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : - inode->i_gid) && - !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : + i_gid_into_mnt(mnt_userns, inode)) && + !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) attr->ia_mode &= ~S_ISGID; } /* Check for setting the inode time. */ if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)) { - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EPERM; } @@ -107,7 +143,7 @@ kill_priv: if (ia_valid & ATTR_KILL_PRIV) { int error; - error = security_inode_killpriv(dentry); + error = security_inode_killpriv(mnt_userns, dentry); if (error) return error; } @@ -162,20 +198,33 @@ EXPORT_SYMBOL(inode_newsize_ok); /** * setattr_copy - copy simple metadata updates into the generic inode + * @mnt_userns: user namespace of the mount the inode was found from * @inode: the inode to be updated * @attr: the new attributes * * setattr_copy must be called with i_mutex held. * * setattr_copy updates the inode's metadata with that specified - * in attr. Noticeably missing is inode size update, which is more complex + * in attr on idmapped mounts. If file ownership is changed setattr_copy + * doesn't map ia_uid and ia_gid. It will asssume the caller has already + * provided the intended values. Necessary permission checks to determine + * whether or not the S_ISGID property needs to be removed are performed with + * the correct idmapped mount permission helpers. + * Noticeably missing is inode size update, which is more complex * as it requires pagecache updates. * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then + * take care to map the inode according to @mnt_userns before checking + * permissions. On non-idmapped mounts or if permission checking is to be + * performed on the raw inode simply passs init_user_ns. + * * The inode is not marked as dirty after this operation. The rationale is * that for "simple" filesystems, the struct inode is the inode storage. * The caller is free to mark the inode dirty afterwards if needed. */ -void setattr_copy(struct inode *inode, const struct iattr *attr) +void setattr_copy(struct user_namespace *mnt_userns, struct inode *inode, + const struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; @@ -191,9 +240,9 @@ void setattr_copy(struct inode *inode, const struct iattr *attr) inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; - - if (!in_group_p(inode->i_gid) && - !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); + if (!in_group_p(kgid) && + !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; inode->i_mode = mode; } @@ -202,6 +251,7 @@ EXPORT_SYMBOL(setattr_copy); /** * notify_change - modify attributes of a filesytem object + * @mnt_userns: user namespace of the mount the inode was found from * @dentry: object affected * @attr: new attributes * @delegated_inode: returns inode, if the inode is delegated @@ -214,13 +264,23 @@ EXPORT_SYMBOL(setattr_copy); * retry. Because breaking a delegation may take a long time, the * caller should drop the i_mutex before doing so. * + * If file ownership is changed notify_change() doesn't map ia_uid and + * ia_gid. It will asssume the caller has already provided the intended values. + * * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. Also, passing NULL is fine for callers holding * the file open for write, as there can be no conflicting delegation in * that case. + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then + * take care to map the inode according to @mnt_userns before checking + * permissions. On non-idmapped mounts or if permission checking is to be + * performed on the raw inode simply passs init_user_ns. */ -int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **delegated_inode) +int notify_change(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; umode_t mode = inode->i_mode; @@ -243,8 +303,8 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de if (IS_IMMUTABLE(inode)) return -EPERM; - if (!inode_owner_or_capable(inode)) { - error = inode_permission(inode, MAY_WRITE); + if (!inode_owner_or_capable(mnt_userns, inode)) { + error = inode_permission(mnt_userns, inode, MAY_WRITE); if (error) return error; } @@ -320,9 +380,11 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de /* Don't allow modifications of files with invalid uids or * gids unless those uids & gids are being made valid. */ - if (!(ia_valid & ATTR_UID) && !uid_valid(inode->i_uid)) + if (!(ia_valid & ATTR_UID) && + !uid_valid(i_uid_into_mnt(mnt_userns, inode))) return -EOVERFLOW; - if (!(ia_valid & ATTR_GID) && !gid_valid(inode->i_gid)) + if (!(ia_valid & ATTR_GID) && + !gid_valid(i_gid_into_mnt(mnt_userns, inode))) return -EOVERFLOW; error = security_inode_setattr(dentry, attr); @@ -333,13 +395,13 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de return error; if (inode->i_op->setattr) - error = inode->i_op->setattr(dentry, attr); + error = inode->i_op->setattr(mnt_userns, dentry, attr); else - error = simple_setattr(dentry, attr); + error = simple_setattr(mnt_userns, dentry, attr); if (!error) { fsnotify_change(dentry, ia_valid); - ima_inode_post_setattr(dentry); + ima_inode_post_setattr(mnt_userns, dentry); evm_inode_post_setattr(dentry, ia_valid); } diff --git a/fs/autofs/root.c b/fs/autofs/root.c index 5aaa1732bf1e..91fe4548c256 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -10,10 +10,12 @@ #include "autofs_i.h" -static int autofs_dir_symlink(struct inode *, struct dentry *, const char *); +static int autofs_dir_symlink(struct user_namespace *, struct inode *, + struct dentry *, const char *); static int autofs_dir_unlink(struct inode *, struct dentry *); static int autofs_dir_rmdir(struct inode *, struct dentry *); -static int autofs_dir_mkdir(struct inode *, struct dentry *, umode_t); +static int autofs_dir_mkdir(struct user_namespace *, struct inode *, + struct dentry *, umode_t); static long autofs_root_ioctl(struct file *, unsigned int, unsigned long); #ifdef CONFIG_COMPAT static long autofs_root_compat_ioctl(struct file *, @@ -524,9 +526,9 @@ static struct dentry *autofs_lookup(struct inode *dir, return NULL; } -static int autofs_dir_symlink(struct inode *dir, - struct dentry *dentry, - const char *symname) +static int autofs_dir_symlink(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, + const char *symname) { struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); struct autofs_info *ino = autofs_dentry_ino(dentry); @@ -715,8 +717,9 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry) return 0; } -static int autofs_dir_mkdir(struct inode *dir, - struct dentry *dentry, umode_t mode) +static int autofs_dir_mkdir(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, + umode_t mode) { struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); struct autofs_info *ino = autofs_dentry_ino(dentry); diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 54f0ce444272..48e16144c1f7 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -27,8 +27,9 @@ static const struct file_operations bad_file_ops = .open = bad_file_open, }; -static int bad_inode_create (struct inode *dir, struct dentry *dentry, - umode_t mode, bool excl) +static int bad_inode_create(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, + umode_t mode, bool excl) { return -EIO; } @@ -50,14 +51,15 @@ static int bad_inode_unlink(struct inode *dir, struct dentry *dentry) return -EIO; } -static int bad_inode_symlink (struct inode *dir, struct dentry *dentry, - const char *symname) +static int bad_inode_symlink(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, + const char *symname) { return -EIO; } -static int bad_inode_mkdir(struct inode *dir, struct dentry *dentry, - umode_t mode) +static int bad_inode_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { return -EIO; } @@ -67,13 +69,14 @@ static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry) return -EIO; } -static int bad_inode_mknod (struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t rdev) +static int bad_inode_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { return -EIO; } -static int bad_inode_rename2(struct inode *old_dir, struct dentry *old_dentry, +static int bad_inode_rename2(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -86,18 +89,21 @@ static int bad_inode_readlink(struct dentry *dentry, char __user *buffer, return -EIO; } -static int bad_inode_permission(struct inode *inode, int mask) +static int bad_inode_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { return -EIO; } -static int bad_inode_getattr(const struct path *path, struct kstat *stat, +static int bad_inode_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { return -EIO; } -static int bad_inode_setattr(struct dentry *direntry, struct iattr *attrs) +static int bad_inode_setattr(struct user_namespace *mnt_userns, + struct dentry *direntry, struct iattr *attrs) { return -EIO; } @@ -140,13 +146,15 @@ static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry, return -EIO; } -static int bad_inode_tmpfile(struct inode *inode, struct dentry *dentry, +static int bad_inode_tmpfile(struct user_namespace *mnt_userns, + struct inode *inode, struct dentry *dentry, umode_t mode) { return -EIO; } -static int bad_inode_set_acl(struct inode *inode, struct posix_acl *acl, +static int bad_inode_set_acl(struct user_namespace *mnt_userns, + struct inode *inode, struct posix_acl *acl, int type) { return -EIO; diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index d8dfe3a0cb39..34d4f68f786b 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -75,8 +75,8 @@ const struct file_operations bfs_dir_operations = { .llseek = generic_file_llseek, }; -static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int bfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { int err; struct inode *inode; @@ -96,7 +96,7 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, } set_bit(ino, info->si_imap); info->si_freei--; - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_blocks = 0; inode->i_op = &bfs_file_inops; @@ -199,9 +199,9 @@ out_brelse: return error; } -static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int bfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { struct inode *old_inode, *new_inode; struct buffer_head *old_bh, *new_bh; diff --git a/fs/block_dev.c b/fs/block_dev.c index ec26179c8062..4aa1f88d5bf8 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -221,7 +221,7 @@ static void blkdev_bio_end_io_simple(struct bio *bio) static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, - int nr_pages) + unsigned int nr_pages) { struct file *file = iocb->ki_filp; struct block_device *bdev = I_BDEV(bdev_file_inode(file)); @@ -355,8 +355,8 @@ static void blkdev_bio_end_io(struct bio *bio) } } -static ssize_t -__blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) +static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + unsigned int nr_pages) { struct file *file = iocb->ki_filp; struct inode *inode = bdev_file_inode(file); @@ -486,7 +486,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { - int nr_pages; + unsigned int nr_pages; if (!iov_iter_count(iter)) return 0; @@ -495,7 +495,7 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES) return __blkdev_direct_IO_simple(iocb, iter, nr_pages); - return __blkdev_direct_IO(iocb, iter, min(nr_pages, BIO_MAX_PAGES)); + return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); } static __init int blkdev_init(void) @@ -1270,7 +1270,7 @@ rescan: return ret; } /* - * Only exported for for loop and dasd for historic reasons. Don't use in new + * Only exported for loop and dasd for historic reasons. Don't use in new * code! */ EXPORT_SYMBOL_GPL(bdev_disk_changed); diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index a0af1b952c4d..d95eb5c8cb37 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -107,13 +107,15 @@ out: return ret; } -int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { int ret; umode_t old_mode = inode->i_mode; if (type == ACL_TYPE_ACCESS && acl) { - ret = posix_acl_update_mode(inode, &inode->i_mode, &acl); + ret = posix_acl_update_mode(&init_user_ns, inode, + &inode->i_mode, &acl); if (ret) return ret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3bc00aed13b2..bd659354d043 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3635,7 +3635,8 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type); -int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); int btrfs_init_acl(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir); #else diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index be9e3900cce8..bf2c51a9607a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3634,7 +3634,7 @@ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) return ret; } - return generic_file_buffered_read(iocb, to, ret); + return filemap_read(iocb, to, ret); } const struct file_operations btrfs_file_operations = { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 535abf898225..2e1c282c202d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5212,7 +5212,8 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) return ret; } -static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) +static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; @@ -5221,7 +5222,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) if (btrfs_root_readonly(root)) return -EROFS; - err = setattr_prepare(dentry, attr); + err = setattr_prepare(&init_user_ns, dentry, attr); if (err) return err; @@ -5232,12 +5233,13 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) } if (attr->ia_valid) { - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); inode_inc_iversion(inode); err = btrfs_dirty_inode(inode); if (!err && attr->ia_valid & ATTR_MODE) - err = posix_acl_chmod(inode, inode->i_mode); + err = posix_acl_chmod(&init_user_ns, inode, + inode->i_mode); } return err; @@ -6357,7 +6359,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, if (ret != 0) goto fail_unlock; - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); inode_set_bytes(inode, 0); inode->i_mtime = current_time(inode); @@ -6518,8 +6520,8 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans, return err; } -static int btrfs_mknod(struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t rdev) +static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_trans_handle *trans; @@ -6582,8 +6584,8 @@ out_unlock: return err; } -static int btrfs_create(struct inode *dir, struct dentry *dentry, - umode_t mode, bool excl) +static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_trans_handle *trans; @@ -6727,7 +6729,8 @@ fail: return err; } -static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct inode *inode = NULL; @@ -9017,7 +9020,8 @@ fail: return -ENOMEM; } -static int btrfs_getattr(const struct path *path, struct kstat *stat, +static int btrfs_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { u64 delalloc_bytes; @@ -9043,7 +9047,7 @@ static int btrfs_getattr(const struct path *path, struct kstat *stat, STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; spin_lock(&BTRFS_I(inode)->lock); @@ -9534,9 +9538,9 @@ out_notrans: return ret; } -static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; @@ -9744,8 +9748,8 @@ out: return ret; } -static int btrfs_symlink(struct inode *dir, struct dentry *dentry, - const char *symname) +static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_trans_handle *trans; @@ -10079,7 +10083,8 @@ static int btrfs_set_page_dirty(struct page *page) return __set_page_dirty_nobuffers(page); } -static int btrfs_permission(struct inode *inode, int mask) +static int btrfs_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { struct btrfs_root *root = BTRFS_I(inode)->root; umode_t mode = inode->i_mode; @@ -10091,10 +10096,11 @@ static int btrfs_permission(struct inode *inode, int mask) if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) return -EACCES; } - return generic_permission(inode, mask); + return generic_permission(&init_user_ns, inode, mask); } -static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_trans_handle *trans; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a8c60d46d19c..072e77726e94 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -213,7 +213,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) const char *comp = NULL; u32 binode_flags; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EPERM; if (btrfs_root_readonly(root)) @@ -429,7 +429,7 @@ static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg) unsigned old_i_flags; int ret = 0; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EPERM; if (btrfs_root_readonly(root)) @@ -925,13 +925,14 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) BUG_ON(d_inode(victim->d_parent) != dir); audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); - error = inode_permission(dir, MAY_WRITE | MAY_EXEC); + error = inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) return -EPERM; - if (check_sticky(dir, d_inode(victim)) || IS_APPEND(d_inode(victim)) || - IS_IMMUTABLE(d_inode(victim)) || IS_SWAPFILE(d_inode(victim))) + if (check_sticky(&init_user_ns, dir, d_inode(victim)) || + IS_APPEND(d_inode(victim)) || IS_IMMUTABLE(d_inode(victim)) || + IS_SWAPFILE(d_inode(victim))) return -EPERM; if (isdir) { if (!d_is_dir(victim)) @@ -954,7 +955,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; - return inode_permission(dir, MAY_WRITE | MAY_EXEC); + return inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC); } /* @@ -1871,7 +1872,7 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, btrfs_info(BTRFS_I(file_inode(file))->root->fs_info, "Snapshot src from another FS"); ret = -EXDEV; - } else if (!inode_owner_or_capable(src_inode)) { + } else if (!inode_owner_or_capable(&init_user_ns, src_inode)) { /* * Subvolume creation is not restricted, but snapshots * are limited to own subvolumes only @@ -1991,7 +1992,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, u64 flags; int ret = 0; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EPERM; ret = mnt_want_write_file(file); @@ -2547,7 +2548,8 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, ret = PTR_ERR(temp_inode); goto out_put; } - ret = inode_permission(temp_inode, MAY_READ | MAY_EXEC); + ret = inode_permission(&init_user_ns, temp_inode, + MAY_READ | MAY_EXEC); iput(temp_inode); if (ret) { ret = -EACCES; @@ -3077,7 +3079,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, if (root == dest) goto out_dput; - err = inode_permission(inode, MAY_WRITE | MAY_EXEC); + err = inode_permission(&init_user_ns, inode, + MAY_WRITE | MAY_EXEC); if (err) goto out_dput; } @@ -3148,7 +3151,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) * running and allows defrag on files open in read-only mode. */ if (!capable(CAP_SYS_ADMIN) && - inode_permission(inode, MAY_WRITE)) { + inode_permission(&init_user_ns, inode, MAY_WRITE)) { ret = -EPERM; goto out; } @@ -4460,7 +4463,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, int ret = 0; int received_uuid_changed; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EPERM; ret = mnt_want_write_file(file); diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 6bd97bd4cb37..3a4099a2bf05 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -62,7 +62,7 @@ struct inode *btrfs_new_test_inode(void) BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; - inode_init_owner(inode, NULL, S_IFREG); + inode_init_owner(&init_user_ns, inode, NULL, S_IFREG); return inode; } diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index af6246f36a9e..b025102e435f 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -362,6 +362,7 @@ static int btrfs_xattr_handler_get(const struct xattr_handler *handler, } static int btrfs_xattr_handler_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) @@ -371,6 +372,7 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler, } static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/buffer.c b/fs/buffer.c index 32647d2011df..0cb7ffd4977c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -847,7 +847,8 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, if (retry) gfp |= __GFP_NOFAIL; - memcg = get_mem_cgroup_from_page(page); + /* The page lock pins the memcg */ + memcg = page_memcg(page); old_memcg = set_active_memcg(memcg); head = NULL; @@ -868,7 +869,6 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, } out: set_active_memcg(old_memcg); - mem_cgroup_put(memcg); return head; /* * In case anything failed, we just free everything we got. @@ -2083,7 +2083,8 @@ static int __block_commit_write(struct inode *inode, struct page *page, set_buffer_uptodate(bh); mark_buffer_dirty(bh); } - clear_buffer_new(bh); + if (buffer_new(bh)) + clear_buffer_new(bh); block_start = block_end; bh = bh->b_this_page; diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 4cea5fbf695e..5efa6a3702c0 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -470,14 +470,14 @@ static int cachefiles_attr_changed(struct fscache_object *_object) _debug("discard tail %llx", oi_size); newattrs.ia_valid = ATTR_SIZE; newattrs.ia_size = oi_size & PAGE_MASK; - ret = notify_change(object->backer, &newattrs, NULL); + ret = notify_change(&init_user_ns, object->backer, &newattrs, NULL); if (ret < 0) goto truncate_failed; } newattrs.ia_valid = ATTR_SIZE; newattrs.ia_size = ni_size; - ret = notify_change(object->backer, &newattrs, NULL); + ret = notify_change(&init_user_ns, object->backer, &newattrs, NULL); truncate_failed: inode_unlock(d_inode(object->backer)); diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index ecc8ecbbfa5a..7bf0732ae25c 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -311,7 +311,8 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, cachefiles_io_error(cache, "Unlink security error"); } else { trace_cachefiles_unlink(object, rep, why); - ret = vfs_unlink(d_inode(dir), rep, NULL); + ret = vfs_unlink(&init_user_ns, d_inode(dir), rep, + NULL); if (preemptive) cachefiles_mark_object_buried(cache, rep, why); @@ -412,9 +413,16 @@ try_again: if (ret < 0) { cachefiles_io_error(cache, "Rename security error %d", ret); } else { + struct renamedata rd = { + .old_mnt_userns = &init_user_ns, + .old_dir = d_inode(dir), + .old_dentry = rep, + .new_mnt_userns = &init_user_ns, + .new_dir = d_inode(cache->graveyard), + .new_dentry = grave, + }; trace_cachefiles_rename(object, rep, grave, why); - ret = vfs_rename(d_inode(dir), rep, - d_inode(cache->graveyard), grave, NULL, 0); + ret = vfs_rename(&rd); if (ret != 0 && ret != -ENOMEM) cachefiles_io_error(cache, "Rename failed with error %d", ret); @@ -561,7 +569,7 @@ lookup_again: if (ret < 0) goto create_error; start = jiffies; - ret = vfs_mkdir(d_inode(dir), next, 0); + ret = vfs_mkdir(&init_user_ns, d_inode(dir), next, 0); cachefiles_hist(cachefiles_mkdir_histogram, start); if (!key) trace_cachefiles_mkdir(object, next, ret); @@ -597,7 +605,8 @@ lookup_again: if (ret < 0) goto create_error; start = jiffies; - ret = vfs_create(d_inode(dir), next, S_IFREG, true); + ret = vfs_create(&init_user_ns, d_inode(dir), next, + S_IFREG, true); cachefiles_hist(cachefiles_create_histogram, start); trace_cachefiles_create(object, next, ret); if (ret < 0) @@ -791,7 +800,7 @@ retry: ret = security_path_mkdir(&path, subdir, 0700); if (ret < 0) goto mkdir_error; - ret = vfs_mkdir(d_inode(dir), subdir, 0700); + ret = vfs_mkdir(&init_user_ns, d_inode(dir), subdir, 0700); if (ret < 0) goto mkdir_error; diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index 72e42438f3d7..a591b5e09637 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -39,8 +39,8 @@ int cachefiles_check_object_type(struct cachefiles_object *object) _enter("%p{%s}", object, type); /* attempt to install a type label directly */ - ret = vfs_setxattr(dentry, cachefiles_xattr_cache, type, 2, - XATTR_CREATE); + ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, type, + 2, XATTR_CREATE); if (ret == 0) { _debug("SET"); /* we succeeded */ goto error; @@ -54,7 +54,8 @@ int cachefiles_check_object_type(struct cachefiles_object *object) } /* read the current type label */ - ret = vfs_getxattr(dentry, cachefiles_xattr_cache, xtype, 3); + ret = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache, xtype, + 3); if (ret < 0) { if (ret == -ERANGE) goto bad_type_length; @@ -110,9 +111,8 @@ int cachefiles_set_object_xattr(struct cachefiles_object *object, _debug("SET #%u", auxdata->len); clear_bit(FSCACHE_COOKIE_AUX_UPDATED, &object->fscache.cookie->flags); - ret = vfs_setxattr(dentry, cachefiles_xattr_cache, - &auxdata->type, auxdata->len, - XATTR_CREATE); + ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, + &auxdata->type, auxdata->len, XATTR_CREATE); if (ret < 0 && ret != -ENOMEM) cachefiles_io_error_obj( object, @@ -140,9 +140,8 @@ int cachefiles_update_object_xattr(struct cachefiles_object *object, _debug("SET #%u", auxdata->len); clear_bit(FSCACHE_COOKIE_AUX_UPDATED, &object->fscache.cookie->flags); - ret = vfs_setxattr(dentry, cachefiles_xattr_cache, - &auxdata->type, auxdata->len, - XATTR_REPLACE); + ret = vfs_setxattr(&init_user_ns, dentry, cachefiles_xattr_cache, + &auxdata->type, auxdata->len, XATTR_REPLACE); if (ret < 0 && ret != -ENOMEM) cachefiles_io_error_obj( object, @@ -171,7 +170,7 @@ int cachefiles_check_auxdata(struct cachefiles_object *object) if (!auxbuf) return -ENOMEM; - xlen = vfs_getxattr(dentry, cachefiles_xattr_cache, + xlen = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache, &auxbuf->type, 512 + 1); ret = -ESTALE; if (xlen < 1 || @@ -213,7 +212,7 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object, } /* read the current type label */ - ret = vfs_getxattr(dentry, cachefiles_xattr_cache, + ret = vfs_getxattr(&init_user_ns, dentry, cachefiles_xattr_cache, &auxbuf->type, 512 + 1); if (ret < 0) { if (ret == -ENODATA) @@ -270,9 +269,9 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object, } /* update the current label */ - ret = vfs_setxattr(dentry, cachefiles_xattr_cache, - &auxdata->type, auxdata->len, - XATTR_REPLACE); + ret = vfs_setxattr(&init_user_ns, dentry, + cachefiles_xattr_cache, &auxdata->type, + auxdata->len, XATTR_REPLACE); if (ret < 0) { cachefiles_io_error_obj(object, "Can't update xattr on %lu" @@ -309,7 +308,7 @@ int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, { int ret; - ret = vfs_removexattr(dentry, cachefiles_xattr_cache); + ret = vfs_removexattr(&init_user_ns, dentry, cachefiles_xattr_cache); if (ret < 0) { if (ret == -ENOENT || ret == -ENODATA) ret = 0; diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index e0465741c591..529af59d9fd3 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -82,7 +82,8 @@ retry: return acl; } -int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) +int ceph_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { int ret = 0, size = 0; const char *name = NULL; @@ -100,7 +101,8 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { - ret = posix_acl_update_mode(inode, &new_mode, &acl); + ret = posix_acl_update_mode(&init_user_ns, inode, + &new_mode, &acl); if (ret) goto out; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 950552944436..26e66436f005 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1662,7 +1662,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %x\n", inode, off, len, ceph_cap_string(got), ret); - ceph_put_cap_refs(ci, got); + ceph_put_cap_refs_async(ci, got); out_free: ceph_restore_sigs(&oldset); sb_end_pagefault(inode->i_sb); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 255a512f1277..570731c4d019 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3027,6 +3027,12 @@ static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci, return 0; } +enum put_cap_refs_mode { + PUT_CAP_REFS_SYNC = 0, + PUT_CAP_REFS_NO_CHECK, + PUT_CAP_REFS_ASYNC, +}; + /* * Release cap refs. * @@ -3037,10 +3043,11 @@ static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci, * cap_snap, and wake up any waiters. */ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, - bool skip_checking_caps) + enum put_cap_refs_mode mode) { struct inode *inode = &ci->vfs_inode; int last = 0, put = 0, flushsnaps = 0, wake = 0; + bool check_flushsnaps = false; spin_lock(&ci->i_ceph_lock); if (had & CEPH_CAP_PIN) @@ -3057,26 +3064,17 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, if (had & CEPH_CAP_FILE_BUFFER) { if (--ci->i_wb_ref == 0) { last++; + /* put the ref held by ceph_take_cap_refs() */ put++; + check_flushsnaps = true; } dout("put_cap_refs %p wb %d -> %d (?)\n", inode, ci->i_wb_ref+1, ci->i_wb_ref); } - if (had & CEPH_CAP_FILE_WR) + if (had & CEPH_CAP_FILE_WR) { if (--ci->i_wr_ref == 0) { last++; - if (__ceph_have_pending_cap_snap(ci)) { - struct ceph_cap_snap *capsnap = - list_last_entry(&ci->i_cap_snaps, - struct ceph_cap_snap, - ci_item); - capsnap->writing = 0; - if (ceph_try_drop_cap_snap(ci, capsnap)) - put++; - else if (__ceph_finish_cap_snap(ci, capsnap)) - flushsnaps = 1; - wake = 1; - } + check_flushsnaps = true; if (ci->i_wrbuffer_ref_head == 0 && ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { @@ -3088,15 +3086,42 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm) drop_inode_snap_realm(ci); } + } + if (check_flushsnaps && __ceph_have_pending_cap_snap(ci)) { + struct ceph_cap_snap *capsnap = + list_last_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + + capsnap->writing = 0; + if (ceph_try_drop_cap_snap(ci, capsnap)) + /* put the ref held by ceph_queue_cap_snap() */ + put++; + else if (__ceph_finish_cap_snap(ci, capsnap)) + flushsnaps = 1; + wake = 1; + } spin_unlock(&ci->i_ceph_lock); dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), last ? " last" : "", put ? " put" : ""); - if (last && !skip_checking_caps) - ceph_check_caps(ci, 0, NULL); - else if (flushsnaps) - ceph_flush_snaps(ci, NULL); + switch (mode) { + case PUT_CAP_REFS_SYNC: + if (last) + ceph_check_caps(ci, 0, NULL); + else if (flushsnaps) + ceph_flush_snaps(ci, NULL); + break; + case PUT_CAP_REFS_ASYNC: + if (last) + ceph_queue_check_caps(inode); + else if (flushsnaps) + ceph_queue_flush_snaps(inode); + break; + default: + break; + } if (wake) wake_up_all(&ci->i_cap_wq); while (put-- > 0) @@ -3105,12 +3130,17 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) { - __ceph_put_cap_refs(ci, had, false); + __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_SYNC); +} + +void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had) +{ + __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_ASYNC); } void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had) { - __ceph_put_cap_refs(ci, had, true); + __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_NO_CHECK); } /* diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 858ee7362ff5..83d9358854fb 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -823,8 +823,8 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) return PTR_ERR(result); } -static int ceph_mknod(struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t rdev) +static int ceph_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_mds_request *req; @@ -878,14 +878,14 @@ out: return err; } -static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int ceph_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { - return ceph_mknod(dir, dentry, mode, 0); + return ceph_mknod(mnt_userns, dir, dentry, mode, 0); } -static int ceph_symlink(struct inode *dir, struct dentry *dentry, - const char *dest) +static int ceph_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *dest) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_mds_request *req; @@ -937,7 +937,8 @@ out: return err; } -static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int ceph_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_mds_request *req; @@ -1183,9 +1184,9 @@ out: return err; } -static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int ceph_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old_dir->i_sb); struct ceph_mds_request *req; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index adc8fc3c5d85..156f849f5385 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1816,60 +1816,17 @@ void ceph_async_iput(struct inode *inode) } } -/* - * Write back inode data in a worker thread. (This can't be done - * in the message handler context.) - */ -void ceph_queue_writeback(struct inode *inode) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - set_bit(CEPH_I_WORK_WRITEBACK, &ci->i_work_mask); - - ihold(inode); - if (queue_work(ceph_inode_to_client(inode)->inode_wq, - &ci->i_work)) { - dout("ceph_queue_writeback %p\n", inode); - } else { - dout("ceph_queue_writeback %p already queued, mask=%lx\n", - inode, ci->i_work_mask); - iput(inode); - } -} - -/* - * queue an async invalidation - */ -void ceph_queue_invalidate(struct inode *inode) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - set_bit(CEPH_I_WORK_INVALIDATE_PAGES, &ci->i_work_mask); - - ihold(inode); - if (queue_work(ceph_inode_to_client(inode)->inode_wq, - &ceph_inode(inode)->i_work)) { - dout("ceph_queue_invalidate %p\n", inode); - } else { - dout("ceph_queue_invalidate %p already queued, mask=%lx\n", - inode, ci->i_work_mask); - iput(inode); - } -} - -/* - * Queue an async vmtruncate. If we fail to queue work, we will handle - * the truncation the next time we call __ceph_do_pending_vmtruncate. - */ -void ceph_queue_vmtruncate(struct inode *inode) +void ceph_queue_inode_work(struct inode *inode, int work_bit) { + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); - set_bit(CEPH_I_WORK_VMTRUNCATE, &ci->i_work_mask); + set_bit(work_bit, &ci->i_work_mask); ihold(inode); - if (queue_work(ceph_inode_to_client(inode)->inode_wq, - &ci->i_work)) { - dout("ceph_queue_vmtruncate %p\n", inode); + if (queue_work(fsc->inode_wq, &ci->i_work)) { + dout("queue_inode_work %p, mask=%lx\n", inode, ci->i_work_mask); } else { - dout("ceph_queue_vmtruncate %p already queued, mask=%lx\n", + dout("queue_inode_work %p already queued, mask=%lx\n", inode, ci->i_work_mask); iput(inode); } @@ -2008,6 +1965,12 @@ static void ceph_inode_work(struct work_struct *work) if (test_and_clear_bit(CEPH_I_WORK_VMTRUNCATE, &ci->i_work_mask)) __ceph_do_pending_vmtruncate(inode); + if (test_and_clear_bit(CEPH_I_WORK_CHECK_CAPS, &ci->i_work_mask)) + ceph_check_caps(ci, 0, NULL); + + if (test_and_clear_bit(CEPH_I_WORK_FLUSH_SNAPS, &ci->i_work_mask)) + ceph_flush_snaps(ci, NULL); + iput(inode); } @@ -2238,7 +2201,8 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) /* * setattr */ -int ceph_setattr(struct dentry *dentry, struct iattr *attr) +int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct inode *inode = d_inode(dentry); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); @@ -2247,7 +2211,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; - err = setattr_prepare(dentry, attr); + err = setattr_prepare(&init_user_ns, dentry, attr); if (err != 0) return err; @@ -2262,7 +2226,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) err = __ceph_setattr(inode, attr); if (err >= 0 && (attr->ia_valid & ATTR_MODE)) - err = posix_acl_chmod(inode, attr->ia_mode); + err = posix_acl_chmod(&init_user_ns, inode, attr->ia_mode); return err; } @@ -2321,7 +2285,8 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page, * Check inode permissions. We verify we have a valid value for * the AUTH cap, then call the generic handler. */ -int ceph_permission(struct inode *inode, int mask) +int ceph_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask) { int err; @@ -2331,7 +2296,7 @@ int ceph_permission(struct inode *inode, int mask) err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false); if (!err) - err = generic_permission(inode, mask); + err = generic_permission(&init_user_ns, inode, mask); return err; } @@ -2368,8 +2333,8 @@ static int statx_to_caps(u32 want, umode_t mode) * Get all the attributes. If we have sufficient caps for the requested attrs, * then we can avoid talking to the MDS at all. */ -int ceph_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); struct ceph_inode_info *ci = ceph_inode(inode); @@ -2385,7 +2350,7 @@ int ceph_getattr(const struct path *path, struct kstat *stat, return err; } - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); stat->ino = ceph_present_inode(inode); /* diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index b611f829cb61..0728b01d4d43 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -623,6 +623,16 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, return 0; } + /* Fb cap still in use, delay it */ + if (ci->i_wb_ref) { + dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu " + "used WRBUFFER, delaying\n", inode, capsnap, + capsnap->context, capsnap->context->seq, + ceph_cap_string(capsnap->dirty), capsnap->size); + capsnap->writing = 1; + return 0; + } + ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n", inode, capsnap, capsnap->context, diff --git a/fs/ceph/super.h b/fs/ceph/super.h index b62d8fee3b86..c48bb30c8d70 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -562,9 +562,11 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, /* * Masks of ceph inode work. */ -#define CEPH_I_WORK_WRITEBACK 0 /* writeback */ -#define CEPH_I_WORK_INVALIDATE_PAGES 1 /* invalidate pages */ -#define CEPH_I_WORK_VMTRUNCATE 2 /* vmtruncate */ +#define CEPH_I_WORK_WRITEBACK 0 +#define CEPH_I_WORK_INVALIDATE_PAGES 1 +#define CEPH_I_WORK_VMTRUNCATE 2 +#define CEPH_I_WORK_CHECK_CAPS 3 +#define CEPH_I_WORK_FLUSH_SNAPS 4 /* * We set the ERROR_WRITE bit when we start seeing write errors on an inode @@ -962,21 +964,49 @@ extern int ceph_inode_holds_cap(struct inode *inode, int mask); extern bool ceph_inode_set_size(struct inode *inode, loff_t size); extern void __ceph_do_pending_vmtruncate(struct inode *inode); -extern void ceph_queue_vmtruncate(struct inode *inode); -extern void ceph_queue_invalidate(struct inode *inode); -extern void ceph_queue_writeback(struct inode *inode); + extern void ceph_async_iput(struct inode *inode); +void ceph_queue_inode_work(struct inode *inode, int work_bit); + +static inline void ceph_queue_vmtruncate(struct inode *inode) +{ + ceph_queue_inode_work(inode, CEPH_I_WORK_VMTRUNCATE); +} + +static inline void ceph_queue_invalidate(struct inode *inode) +{ + ceph_queue_inode_work(inode, CEPH_I_WORK_INVALIDATE_PAGES); +} + +static inline void ceph_queue_writeback(struct inode *inode) +{ + ceph_queue_inode_work(inode, CEPH_I_WORK_WRITEBACK); +} + +static inline void ceph_queue_check_caps(struct inode *inode) +{ + ceph_queue_inode_work(inode, CEPH_I_WORK_CHECK_CAPS); +} + +static inline void ceph_queue_flush_snaps(struct inode *inode) +{ + ceph_queue_inode_work(inode, CEPH_I_WORK_FLUSH_SNAPS); +} + extern int __ceph_do_getattr(struct inode *inode, struct page *locked_page, int mask, bool force); static inline int ceph_do_getattr(struct inode *inode, int mask, bool force) { return __ceph_do_getattr(inode, NULL, mask, force); } -extern int ceph_permission(struct inode *inode, int mask); +extern int ceph_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask); extern int __ceph_setattr(struct inode *inode, struct iattr *attr); -extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); -extern int ceph_getattr(const struct path *path, struct kstat *stat, +extern int ceph_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr); +extern int ceph_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); /* xattr.c */ @@ -1037,7 +1067,8 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx); #ifdef CONFIG_CEPH_FS_POSIX_ACL struct posix_acl *ceph_get_acl(struct inode *, int); -int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int ceph_set_acl(struct user_namespace *mnt_userns, + struct inode *inode, struct posix_acl *acl, int type); int ceph_pre_init_acls(struct inode *dir, umode_t *mode, struct ceph_acl_sec_ctx *as_ctx); void ceph_init_inode_acls(struct inode *inode, @@ -1105,6 +1136,7 @@ extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps, bool snap_rwsem_locked); extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps); extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had); +extern void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had); extern void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had); extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 24997982de01..02f59bcb4f27 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1238,6 +1238,7 @@ static int ceph_get_xattr_handler(const struct xattr_handler *handler, } static int ceph_set_xattr_handler(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index b231dcf1d1f9..3aedc484e440 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -133,11 +133,12 @@ cifs_dump_channel(struct seq_file *m, int i, struct cifs_chan *chan) { struct TCP_Server_Info *server = chan->server; - seq_printf(m, "\t\tChannel %d Number of credits: %d Dialect 0x%x " - "TCP status: %d Instance: %d Local Users To Server: %d " - "SecMode: 0x%x Req On Wire: %d In Send: %d " - "In MaxReq Wait: %d\n", - i+1, + seq_printf(m, "\n\n\t\tChannel: %d ConnectionId: 0x%llx" + "\n\t\tNumber of credits: %d Dialect 0x%x" + "\n\t\tTCP status: %d Instance: %d" + "\n\t\tLocal Users To Server: %d SecMode: 0x%x Req On Wire: %d" + "\n\t\tIn Send: %d In MaxReq Wait: %d", + i+1, server->conn_id, server->credits, server->dialect, server->tcpStatus, @@ -197,14 +198,14 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v) cfile = list_entry(tmp2, struct cifsFileInfo, tlist); seq_printf(m, - "0x%x 0x%llx 0x%x %d %d %d %s", + "0x%x 0x%llx 0x%x %d %d %d %pd", tcon->tid, cfile->fid.persistent_fid, cfile->f_flags, cfile->count, cfile->pid, from_kuid(&init_user_ns, cfile->uid), - cfile->dentry->d_name.name); + cfile->dentry); #ifdef CONFIG_CIFS_DEBUG2 seq_printf(m, " 0x%llx\n", cfile->fid.mid); #else @@ -227,7 +228,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) struct TCP_Server_Info *server; struct cifs_ses *ses; struct cifs_tcon *tcon; - int i, j; + int c, i, j; seq_puts(m, "Display Internal CIFS Data Structures for Debugging\n" @@ -275,14 +276,25 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) seq_putc(m, '\n'); seq_printf(m, "CIFSMaxBufSize: %d\n", CIFSMaxBufSize); seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid); - seq_printf(m, "Servers:"); - i = 0; + seq_printf(m, "\nServers: "); + + c = 0; spin_lock(&cifs_tcp_ses_lock); list_for_each(tmp1, &cifs_tcp_ses_list) { server = list_entry(tmp1, struct TCP_Server_Info, tcp_ses_list); + /* channel info will be printed as a part of sessions below */ + if (server->is_channel) + continue; + + c++; + seq_printf(m, "\n%d) ConnectionId: 0x%llx ", + c, server->conn_id); + + if (server->hostname) + seq_printf(m, "Hostname: %s ", server->hostname); #ifdef CONFIG_CIFS_SMB_DIRECT if (!server->rdma) goto skip_rdma; @@ -362,46 +374,48 @@ skip_rdma: if (server->posix_ext_supported) seq_printf(m, " posix"); - i++; + if (server->rdma) + seq_printf(m, "\nRDMA "); + seq_printf(m, "\nTCP status: %d Instance: %d" + "\nLocal Users To Server: %d SecMode: 0x%x Req On Wire: %d", + server->tcpStatus, + server->reconnect_instance, + server->srv_count, + server->sec_mode, in_flight(server)); + + seq_printf(m, "\nIn Send: %d In MaxReq Wait: %d", + atomic_read(&server->in_send), + atomic_read(&server->num_waiters)); + + seq_printf(m, "\n\n\tSessions: "); + i = 0; list_for_each(tmp2, &server->smb_ses_list) { ses = list_entry(tmp2, struct cifs_ses, smb_ses_list); + i++; if ((ses->serverDomain == NULL) || (ses->serverOS == NULL) || (ses->serverNOS == NULL)) { - seq_printf(m, "\n%d) Name: %s Uses: %d Capability: 0x%x\tSession Status: %d ", - i, ses->serverName, ses->ses_count, + seq_printf(m, "\n\t%d) Address: %s Uses: %d Capability: 0x%x\tSession Status: %d ", + i, ses->ip_addr, ses->ses_count, ses->capabilities, ses->status); if (ses->session_flags & SMB2_SESSION_FLAG_IS_GUEST) - seq_printf(m, "Guest\t"); + seq_printf(m, "Guest "); else if (ses->session_flags & SMB2_SESSION_FLAG_IS_NULL) - seq_printf(m, "Anonymous\t"); + seq_printf(m, "Anonymous "); } else { seq_printf(m, - "\n%d) Name: %s Domain: %s Uses: %d OS:" - " %s\n\tNOS: %s\tCapability: 0x%x\n\tSMB" - " session status: %d ", - i, ses->serverName, ses->serverDomain, + "\n\t%d) Name: %s Domain: %s Uses: %d OS: %s " + "\n\tNOS: %s\tCapability: 0x%x" + "\n\tSMB session status: %d ", + i, ses->ip_addr, ses->serverDomain, ses->ses_count, ses->serverOS, ses->serverNOS, ses->capabilities, ses->status); } - seq_printf(m,"Security type: %s\n", + seq_printf(m, "\n\tSecurity type: %s ", get_security_type_str(server->ops->select_sectype(server, ses->sectype))); - if (server->rdma) - seq_printf(m, "RDMA\n\t"); - seq_printf(m, "TCP status: %d Instance: %d\n\tLocal Users To " - "Server: %d SecMode: 0x%x Req On Wire: %d", - server->tcpStatus, - server->reconnect_instance, - server->srv_count, - server->sec_mode, in_flight(server)); - - seq_printf(m, " In Send: %d In MaxReq Wait: %d", - atomic_read(&server->in_send), - atomic_read(&server->num_waiters)); - /* dump session id helpful for use with network trace */ seq_printf(m, " SessionId: 0x%llx", ses->Suid); if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) @@ -414,13 +428,13 @@ skip_rdma: from_kuid(&init_user_ns, ses->cred_uid)); if (ses->chan_count > 1) { - seq_printf(m, "\n\n\tExtra Channels: %zu\n", + seq_printf(m, "\n\n\tExtra Channels: %zu ", ses->chan_count-1); for (j = 1; j < ses->chan_count; j++) cifs_dump_channel(m, j, &ses->chans[j]); } - seq_puts(m, "\n\n\tShares:"); + seq_puts(m, "\n\n\tShares: "); j = 0; seq_printf(m, "\n\t%d) IPC: ", j); @@ -437,38 +451,43 @@ skip_rdma: cifs_debug_tcon(m, tcon); } - seq_puts(m, "\n\tMIDs:\n"); - - spin_lock(&GlobalMid_Lock); - list_for_each(tmp3, &server->pending_mid_q) { - mid_entry = list_entry(tmp3, struct mid_q_entry, - qhead); - seq_printf(m, "\tState: %d com: %d pid:" - " %d cbdata: %p mid %llu\n", - mid_entry->mid_state, - le16_to_cpu(mid_entry->command), - mid_entry->pid, - mid_entry->callback_data, - mid_entry->mid); - } - spin_unlock(&GlobalMid_Lock); - spin_lock(&ses->iface_lock); if (ses->iface_count) - seq_printf(m, "\n\tServer interfaces: %zu\n", + seq_printf(m, "\n\n\tServer interfaces: %zu", ses->iface_count); for (j = 0; j < ses->iface_count; j++) { struct cifs_server_iface *iface; iface = &ses->iface_list[j]; - seq_printf(m, "\t%d)", j); + seq_printf(m, "\n\t%d)", j+1); cifs_dump_iface(m, iface); if (is_ses_using_iface(ses, iface)) seq_puts(m, "\t\t[CONNECTED]\n"); } spin_unlock(&ses->iface_lock); } + if (i == 0) + seq_printf(m, "\n\t\t[NONE]"); + + seq_puts(m, "\n\n\tMIDs: "); + spin_lock(&GlobalMid_Lock); + list_for_each(tmp3, &server->pending_mid_q) { + mid_entry = list_entry(tmp3, struct mid_q_entry, + qhead); + seq_printf(m, "\n\tState: %d com: %d pid:" + " %d cbdata: %p mid %llu\n", + mid_entry->mid_state, + le16_to_cpu(mid_entry->command), + mid_entry->pid, + mid_entry->callback_data, + mid_entry->mid); + } + spin_unlock(&GlobalMid_Lock); + seq_printf(m, "\n--\n"); } + if (c == 0) + seq_printf(m, "\n\t[NONE]"); + spin_unlock(&cifs_tcp_ses_lock); seq_putc(m, '\n'); diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c index d35f599aa00e..f2d730fffccb 100644 --- a/fs/cifs/cifs_swn.c +++ b/fs/cifs/cifs_swn.c @@ -272,7 +272,7 @@ static struct cifs_swn_reg *cifs_find_swn_reg(struct cifs_tcon *tcon) if (IS_ERR(share_name)) { int ret; - ret = PTR_ERR(net_name); + ret = PTR_ERR(share_name); cifs_dbg(VFS, "%s: failed to extract share name from target '%s': %d\n", __func__, tcon->treeName, ret); kfree(net_name); diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 562913e2b3f2..9d29eb9660c2 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -267,10 +267,11 @@ is_well_known_sid(const struct cifs_sid *psid, uint32_t *puid, bool is_group) return true; /* well known sid found, uid returned */ } -static void +static __u16 cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src) { int i; + __u16 size = 1 + 1 + 6; dst->revision = src->revision; dst->num_subauth = min_t(u8, src->num_subauth, SID_MAX_SUB_AUTHORITIES); @@ -278,6 +279,9 @@ cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src) dst->authority[i] = src->authority[i]; for (i = 0; i < dst->num_subauth; ++i) dst->sub_auth[i] = src->sub_auth[i]; + size += (dst->num_subauth * 4); + + return size; } static int @@ -521,8 +525,11 @@ exit_cifs_idmap(void) } /* copy ntsd, owner sid, and group sid from a security descriptor to another */ -static void copy_sec_desc(const struct cifs_ntsd *pntsd, - struct cifs_ntsd *pnntsd, __u32 sidsoffset) +static __u32 copy_sec_desc(const struct cifs_ntsd *pntsd, + struct cifs_ntsd *pnntsd, + __u32 sidsoffset, + struct cifs_sid *pownersid, + struct cifs_sid *pgrpsid) { struct cifs_sid *owner_sid_ptr, *group_sid_ptr; struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr; @@ -536,19 +543,25 @@ static void copy_sec_desc(const struct cifs_ntsd *pntsd, pnntsd->gsidoffset = cpu_to_le32(sidsoffset + sizeof(struct cifs_sid)); /* copy owner sid */ - owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + + if (pownersid) + owner_sid_ptr = pownersid; + else + owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->osidoffset)); nowner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset); cifs_copy_sid(nowner_sid_ptr, owner_sid_ptr); /* copy group sid */ - group_sid_ptr = (struct cifs_sid *)((char *)pntsd + + if (pgrpsid) + group_sid_ptr = pgrpsid; + else + group_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->gsidoffset)); ngroup_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset + sizeof(struct cifs_sid)); cifs_copy_sid(ngroup_sid_ptr, group_sid_ptr); - return; + return sidsoffset + (2 * sizeof(struct cifs_sid)); } @@ -663,6 +676,25 @@ static void mode_to_access_flags(umode_t mode, umode_t bits_to_use, return; } +static __u16 cifs_copy_ace(struct cifs_ace *dst, struct cifs_ace *src, struct cifs_sid *psid) +{ + __u16 size = 1 + 1 + 2 + 4; + + dst->type = src->type; + dst->flags = src->flags; + dst->access_req = src->access_req; + + /* Check if there's a replacement sid specified */ + if (psid) + size += cifs_copy_sid(&dst->sid, psid); + else + size += cifs_copy_sid(&dst->sid, &src->sid); + + dst->size = cpu_to_le16(size); + + return size; +} + static __u16 fill_ace_for_sid(struct cifs_ace *pntace, const struct cifs_sid *psid, __u64 nmode, umode_t bits, __u8 access_type, @@ -907,29 +939,30 @@ unsigned int setup_special_user_owner_ACE(struct cifs_ace *pntace) return ace_size; } -static int set_chmod_dacl(struct cifs_acl *pndacl, struct cifs_sid *pownersid, - struct cifs_sid *pgrpsid, __u64 *pnmode, bool modefromsid) +static void populate_new_aces(char *nacl_base, + struct cifs_sid *pownersid, + struct cifs_sid *pgrpsid, + __u64 *pnmode, u32 *pnum_aces, u16 *pnsize, + bool modefromsid) { - u16 size = 0; - u32 num_aces = 0; - struct cifs_acl *pnndacl; __u64 nmode; + u32 num_aces = 0; + u16 nsize = 0; __u64 user_mode; __u64 group_mode; __u64 other_mode; __u64 deny_user_mode = 0; __u64 deny_group_mode = 0; bool sticky_set = false; - - pnndacl = (struct cifs_acl *)((char *)pndacl + sizeof(struct cifs_acl)); + struct cifs_ace *pnntace = NULL; nmode = *pnmode; + num_aces = *pnum_aces; + nsize = *pnsize; if (modefromsid) { - struct cifs_ace *pntace = - (struct cifs_ace *)((char *)pnndacl + size); - - size += setup_special_mode_ACE(pntace, nmode); + pnntace = (struct cifs_ace *) (nacl_base + nsize); + nsize += setup_special_mode_ACE(pnntace, nmode); num_aces++; goto set_size; } @@ -966,40 +999,170 @@ static int set_chmod_dacl(struct cifs_acl *pndacl, struct cifs_sid *pownersid, sticky_set = true; if (deny_user_mode) { - size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size), - pownersid, deny_user_mode, 0700, ACCESS_DENIED, false); + pnntace = (struct cifs_ace *) (nacl_base + nsize); + nsize += fill_ace_for_sid(pnntace, pownersid, deny_user_mode, + 0700, ACCESS_DENIED, false); num_aces++; } + /* Group DENY ACE does not conflict with owner ALLOW ACE. Keep in preferred order*/ if (deny_group_mode && !(deny_group_mode & (user_mode >> 3))) { - size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size), - pgrpsid, deny_group_mode, 0070, ACCESS_DENIED, false); + pnntace = (struct cifs_ace *) (nacl_base + nsize); + nsize += fill_ace_for_sid(pnntace, pgrpsid, deny_group_mode, + 0070, ACCESS_DENIED, false); num_aces++; } - size += fill_ace_for_sid((struct cifs_ace *) ((char *)pnndacl + size), - pownersid, user_mode, 0700, ACCESS_ALLOWED, true); + + pnntace = (struct cifs_ace *) (nacl_base + nsize); + nsize += fill_ace_for_sid(pnntace, pownersid, user_mode, + 0700, ACCESS_ALLOWED, true); num_aces++; + /* Group DENY ACE conflicts with owner ALLOW ACE. So keep it after. */ if (deny_group_mode && (deny_group_mode & (user_mode >> 3))) { - size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size), - pgrpsid, deny_group_mode, 0070, ACCESS_DENIED, false); + pnntace = (struct cifs_ace *) (nacl_base + nsize); + nsize += fill_ace_for_sid(pnntace, pgrpsid, deny_group_mode, + 0070, ACCESS_DENIED, false); num_aces++; } - size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size), - pgrpsid, group_mode, 0070, ACCESS_ALLOWED, !sticky_set); + + pnntace = (struct cifs_ace *) (nacl_base + nsize); + nsize += fill_ace_for_sid(pnntace, pgrpsid, group_mode, + 0070, ACCESS_ALLOWED, !sticky_set); num_aces++; - size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size), - &sid_everyone, other_mode, 0007, ACCESS_ALLOWED, !sticky_set); + + pnntace = (struct cifs_ace *) (nacl_base + nsize); + nsize += fill_ace_for_sid(pnntace, &sid_everyone, other_mode, + 0007, ACCESS_ALLOWED, !sticky_set); num_aces++; set_size: + *pnum_aces = num_aces; + *pnsize = nsize; +} + +static __u16 replace_sids_and_copy_aces(struct cifs_acl *pdacl, struct cifs_acl *pndacl, + struct cifs_sid *pownersid, struct cifs_sid *pgrpsid, + struct cifs_sid *pnownersid, struct cifs_sid *pngrpsid) +{ + int i; + u16 size = 0; + struct cifs_ace *pntace = NULL; + char *acl_base = NULL; + u32 src_num_aces = 0; + u16 nsize = 0; + struct cifs_ace *pnntace = NULL; + char *nacl_base = NULL; + u16 ace_size = 0; + + acl_base = (char *)pdacl; + size = sizeof(struct cifs_acl); + src_num_aces = le32_to_cpu(pdacl->num_aces); + + nacl_base = (char *)pndacl; + nsize = sizeof(struct cifs_acl); + + /* Go through all the ACEs */ + for (i = 0; i < src_num_aces; ++i) { + pntace = (struct cifs_ace *) (acl_base + size); + pnntace = (struct cifs_ace *) (nacl_base + nsize); + + if (pnownersid && compare_sids(&pntace->sid, pownersid) == 0) + ace_size = cifs_copy_ace(pnntace, pntace, pnownersid); + else if (pngrpsid && compare_sids(&pntace->sid, pgrpsid) == 0) + ace_size = cifs_copy_ace(pnntace, pntace, pngrpsid); + else + ace_size = cifs_copy_ace(pnntace, pntace, NULL); + + size += le16_to_cpu(pntace->size); + nsize += ace_size; + } + + return nsize; +} + +static int set_chmod_dacl(struct cifs_acl *pdacl, struct cifs_acl *pndacl, + struct cifs_sid *pownersid, struct cifs_sid *pgrpsid, + __u64 *pnmode, bool mode_from_sid) +{ + int i; + u16 size = 0; + struct cifs_ace *pntace = NULL; + char *acl_base = NULL; + u32 src_num_aces = 0; + u16 nsize = 0; + struct cifs_ace *pnntace = NULL; + char *nacl_base = NULL; + u32 num_aces = 0; + __u64 nmode; + bool new_aces_set = false; + + /* Assuming that pndacl and pnmode are never NULL */ + nmode = *pnmode; + nacl_base = (char *)pndacl; + nsize = sizeof(struct cifs_acl); + + /* If pdacl is NULL, we don't have a src. Simply populate new ACL. */ + if (!pdacl) { + populate_new_aces(nacl_base, + pownersid, pgrpsid, + pnmode, &num_aces, &nsize, + mode_from_sid); + goto finalize_dacl; + } + + acl_base = (char *)pdacl; + size = sizeof(struct cifs_acl); + src_num_aces = le32_to_cpu(pdacl->num_aces); + + /* Retain old ACEs which we can retain */ + for (i = 0; i < src_num_aces; ++i) { + pntace = (struct cifs_ace *) (acl_base + size); + pnntace = (struct cifs_ace *) (nacl_base + nsize); + + if (!new_aces_set && (pntace->flags & INHERITED_ACE)) { + /* Place the new ACEs in between existing explicit and inherited */ + populate_new_aces(nacl_base, + pownersid, pgrpsid, + pnmode, &num_aces, &nsize, + mode_from_sid); + + new_aces_set = true; + } + + /* If it's any one of the ACE we're replacing, skip! */ + if ((compare_sids(&pntace->sid, &sid_unix_NFS_mode) == 0) || + (compare_sids(&pntace->sid, pownersid) == 0) || + (compare_sids(&pntace->sid, pgrpsid) == 0) || + (compare_sids(&pntace->sid, &sid_everyone) == 0) || + (compare_sids(&pntace->sid, &sid_authusers) == 0)) { + goto next_ace; + } + + nsize += cifs_copy_ace(pnntace, pntace, NULL); + num_aces++; + +next_ace: + size += le16_to_cpu(pntace->size); + } + + /* If inherited ACEs are not present, place the new ones at the tail */ + if (!new_aces_set) { + populate_new_aces(nacl_base, + pownersid, pgrpsid, + pnmode, &num_aces, &nsize, + mode_from_sid); + + new_aces_set = true; + } + +finalize_dacl: pndacl->num_aces = cpu_to_le32(num_aces); - pndacl->size = cpu_to_le16(size + sizeof(struct cifs_acl)); + pndacl->size = cpu_to_le16(nsize); return 0; } - static int parse_sid(struct cifs_sid *psid, char *end_of_acl) { /* BB need to add parm so we can store the SID BB */ @@ -1094,7 +1257,7 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb, /* Convert permission bits from mode to equivalent CIFS ACL */ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, - __u32 secdesclen, __u64 *pnmode, kuid_t uid, kgid_t gid, + __u32 secdesclen, __u32 *pnsecdesclen, __u64 *pnmode, kuid_t uid, kgid_t gid, bool mode_from_sid, bool id_from_sid, int *aclflag) { int rc = 0; @@ -1102,39 +1265,59 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, __u32 ndacloffset; __u32 sidsoffset; struct cifs_sid *owner_sid_ptr, *group_sid_ptr; - struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr; + struct cifs_sid *nowner_sid_ptr = NULL, *ngroup_sid_ptr = NULL; struct cifs_acl *dacl_ptr = NULL; /* no need for SACL ptr */ struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */ + char *end_of_acl = ((char *)pntsd) + secdesclen; + u16 size = 0; - if (pnmode && *pnmode != NO_CHANGE_64) { /* chmod */ - owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + - le32_to_cpu(pntsd->osidoffset)); - group_sid_ptr = (struct cifs_sid *)((char *)pntsd + - le32_to_cpu(pntsd->gsidoffset)); - dacloffset = le32_to_cpu(pntsd->dacloffset); + dacloffset = le32_to_cpu(pntsd->dacloffset); + if (dacloffset) { dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); + if (end_of_acl < (char *)dacl_ptr + le16_to_cpu(dacl_ptr->size)) { + cifs_dbg(VFS, "Server returned illegal ACL size\n"); + return -EINVAL; + } + } + + owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + + le32_to_cpu(pntsd->osidoffset)); + group_sid_ptr = (struct cifs_sid *)((char *)pntsd + + le32_to_cpu(pntsd->gsidoffset)); + + if (pnmode && *pnmode != NO_CHANGE_64) { /* chmod */ ndacloffset = sizeof(struct cifs_ntsd); ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset); - ndacl_ptr->revision = dacl_ptr->revision; - ndacl_ptr->size = 0; - ndacl_ptr->num_aces = 0; + ndacl_ptr->revision = + dacloffset ? dacl_ptr->revision : cpu_to_le16(ACL_REVISION); - rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr, + ndacl_ptr->size = cpu_to_le16(0); + ndacl_ptr->num_aces = cpu_to_le32(0); + + rc = set_chmod_dacl(dacl_ptr, ndacl_ptr, owner_sid_ptr, group_sid_ptr, pnmode, mode_from_sid); + sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size); - /* copy sec desc control portion & owner and group sids */ - copy_sec_desc(pntsd, pnntsd, sidsoffset); - *aclflag = CIFS_ACL_DACL; + /* copy the non-dacl portion of secdesc */ + *pnsecdesclen = copy_sec_desc(pntsd, pnntsd, sidsoffset, + NULL, NULL); + + *aclflag |= CIFS_ACL_DACL; } else { - memcpy(pnntsd, pntsd, secdesclen); + ndacloffset = sizeof(struct cifs_ntsd); + ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset); + ndacl_ptr->revision = + dacloffset ? dacl_ptr->revision : cpu_to_le16(ACL_REVISION); + ndacl_ptr->num_aces = dacl_ptr->num_aces; + if (uid_valid(uid)) { /* chown */ uid_t id; - owner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + - le32_to_cpu(pnntsd->osidoffset)); nowner_sid_ptr = kmalloc(sizeof(struct cifs_sid), GFP_KERNEL); - if (!nowner_sid_ptr) - return -ENOMEM; + if (!nowner_sid_ptr) { + rc = -ENOMEM; + goto chown_chgrp_exit; + } id = from_kuid(&init_user_ns, uid); if (id_from_sid) { struct owner_sid *osid = (struct owner_sid *)nowner_sid_ptr; @@ -1145,27 +1328,25 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, osid->SubAuthorities[0] = cpu_to_le32(88); osid->SubAuthorities[1] = cpu_to_le32(1); osid->SubAuthorities[2] = cpu_to_le32(id); + } else { /* lookup sid with upcall */ rc = id_to_sid(id, SIDOWNER, nowner_sid_ptr); if (rc) { cifs_dbg(FYI, "%s: Mapping error %d for owner id %d\n", __func__, rc, id); - kfree(nowner_sid_ptr); - return rc; + goto chown_chgrp_exit; } } - cifs_copy_sid(owner_sid_ptr, nowner_sid_ptr); - kfree(nowner_sid_ptr); - *aclflag = CIFS_ACL_OWNER; + *aclflag |= CIFS_ACL_OWNER; } if (gid_valid(gid)) { /* chgrp */ gid_t id; - group_sid_ptr = (struct cifs_sid *)((char *)pnntsd + - le32_to_cpu(pnntsd->gsidoffset)); ngroup_sid_ptr = kmalloc(sizeof(struct cifs_sid), GFP_KERNEL); - if (!ngroup_sid_ptr) - return -ENOMEM; + if (!ngroup_sid_ptr) { + rc = -ENOMEM; + goto chown_chgrp_exit; + } id = from_kgid(&init_user_ns, gid); if (id_from_sid) { struct owner_sid *gsid = (struct owner_sid *)ngroup_sid_ptr; @@ -1176,19 +1357,35 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, gsid->SubAuthorities[0] = cpu_to_le32(88); gsid->SubAuthorities[1] = cpu_to_le32(2); gsid->SubAuthorities[2] = cpu_to_le32(id); + } else { /* lookup sid with upcall */ rc = id_to_sid(id, SIDGROUP, ngroup_sid_ptr); if (rc) { cifs_dbg(FYI, "%s: Mapping error %d for group id %d\n", __func__, rc, id); - kfree(ngroup_sid_ptr); - return rc; + goto chown_chgrp_exit; } } - cifs_copy_sid(group_sid_ptr, ngroup_sid_ptr); - kfree(ngroup_sid_ptr); - *aclflag = CIFS_ACL_GROUP; + *aclflag |= CIFS_ACL_GROUP; + } + + if (dacloffset) { + /* Replace ACEs for old owner with new one */ + size = replace_sids_and_copy_aces(dacl_ptr, ndacl_ptr, + owner_sid_ptr, group_sid_ptr, + nowner_sid_ptr, ngroup_sid_ptr); + ndacl_ptr->size = cpu_to_le16(size); } + + sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size); + /* copy the non-dacl portion of secdesc */ + *pnsecdesclen = copy_sec_desc(pntsd, pnntsd, sidsoffset, + nowner_sid_ptr, ngroup_sid_ptr); + +chown_chgrp_exit: + /* errors could jump here. So make sure we return soon after this */ + kfree(nowner_sid_ptr); + kfree(ngroup_sid_ptr); } return rc; @@ -1384,6 +1581,9 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, int rc = 0; int aclflag = CIFS_ACL_DACL; /* default flag to set */ __u32 secdesclen = 0; + __u32 nsecdesclen = 0; + __u32 dacloffset = 0; + struct cifs_acl *dacl_ptr = NULL; struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */ struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); @@ -1414,31 +1614,52 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, return rc; } + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID) + mode_from_sid = true; + else + mode_from_sid = false; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL) + id_from_sid = true; + else + id_from_sid = false; + + /* Potentially, five new ACEs can be added to the ACL for U,G,O mapping */ + nsecdesclen = secdesclen; + if (pnmode && *pnmode != NO_CHANGE_64) { /* chmod */ + if (mode_from_sid) + nsecdesclen += sizeof(struct cifs_ace); + else /* cifsacl */ + nsecdesclen += 5 * sizeof(struct cifs_ace); + } else { /* chown */ + /* When ownership changes, changes new owner sid length could be different */ + nsecdesclen = sizeof(struct cifs_ntsd) + (sizeof(struct cifs_sid) * 2); + dacloffset = le32_to_cpu(pntsd->dacloffset); + if (dacloffset) { + dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); + if (mode_from_sid) + nsecdesclen += + le32_to_cpu(dacl_ptr->num_aces) * sizeof(struct cifs_ace); + else /* cifsacl */ + nsecdesclen += le16_to_cpu(dacl_ptr->size); + } + } + /* * Add three ACEs for owner, group, everyone getting rid of other ACEs * as chmod disables ACEs and set the security descriptor. Allocate * memory for the smb header, set security descriptor request security * descriptor parameters, and secuirty descriptor itself */ - secdesclen = max_t(u32, secdesclen, DEFAULT_SEC_DESC_LEN); - pnntsd = kmalloc(secdesclen, GFP_KERNEL); + nsecdesclen = max_t(u32, nsecdesclen, DEFAULT_SEC_DESC_LEN); + pnntsd = kmalloc(nsecdesclen, GFP_KERNEL); if (!pnntsd) { kfree(pntsd); cifs_put_tlink(tlink); return -ENOMEM; } - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID) - mode_from_sid = true; - else - mode_from_sid = false; - - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL) - id_from_sid = true; - else - id_from_sid = false; - - rc = build_sec_desc(pntsd, pnntsd, secdesclen, pnmode, uid, gid, + rc = build_sec_desc(pntsd, pnntsd, secdesclen, &nsecdesclen, pnmode, uid, gid, mode_from_sid, id_from_sid, &aclflag); cifs_dbg(NOISY, "build_sec_desc rc: %d\n", rc); @@ -1448,7 +1669,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, if (!rc) { /* Set the security descriptor */ - rc = ops->set_acl(pnntsd, secdesclen, inode, path, aclflag); + rc = ops->set_acl(pnntsd, nsecdesclen, inode, path, aclflag); cifs_dbg(NOISY, "set_cifs_acl rc: %d\n", rc); } cifs_put_tlink(tlink); diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h index ff7fd0862e28..d9e704979d99 100644 --- a/fs/cifs/cifsacl.h +++ b/fs/cifs/cifsacl.h @@ -31,8 +31,8 @@ #define EXEC_BIT 0x1 #define ACL_OWNER_MASK 0700 -#define ACL_GROUP_MASK 0770 -#define ACL_EVERYONE_MASK 0777 +#define ACL_GROUP_MASK 0070 +#define ACL_EVERYONE_MASK 0007 #define UBITSHIFT 6 #define GBITSHIFT 3 diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 51d53e4bdf6b..b8f1ff9a83f3 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -568,15 +568,15 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, return rc; } } else { - /* We use ses->serverName if no domain name available */ - len = strlen(ses->serverName); + /* We use ses->ip_addr if no domain name available */ + len = strlen(ses->ip_addr); server = kmalloc(2 + (len * 2), GFP_KERNEL); if (server == NULL) { rc = -ENOMEM; return rc; } - len = cifs_strtoUTF16((__le16 *)server, ses->serverName, len, + len = cifs_strtoUTF16((__le16 *)server, ses->ip_addr, len, nls_cp); rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index ab883e84e116..d43e935d2df4 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -305,7 +305,8 @@ static long cifs_fallocate(struct file *file, int mode, loff_t off, loff_t len) return -EOPNOTSUPP; } -static int cifs_permission(struct inode *inode, int mask) +static int cifs_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { struct cifs_sb_info *cifs_sb; @@ -320,7 +321,7 @@ static int cifs_permission(struct inode *inode, int mask) on the client (above and beyond ACL on servers) for servers which do not support setting and viewing mode bits, so allowing client to check permissions is useful */ - return generic_permission(inode, mask); + return generic_permission(&init_user_ns, inode, mask); } static struct kmem_cache *cifs_inode_cachep; @@ -637,8 +638,18 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",snapshot=%llu", tcon->snapshot_time); if (tcon->handle_timeout) seq_printf(s, ",handletimeout=%u", tcon->handle_timeout); - /* convert actimeo and display it in seconds */ - seq_printf(s, ",actimeo=%lu", cifs_sb->ctx->actimeo / HZ); + + /* + * Display file and directory attribute timeout in seconds. + * If file and directory attribute timeout the same then actimeo + * was likely specified on mount + */ + if (cifs_sb->ctx->acdirmax == cifs_sb->ctx->acregmax) + seq_printf(s, ",actimeo=%lu", cifs_sb->ctx->acregmax / HZ); + else { + seq_printf(s, ",acdirmax=%lu", cifs_sb->ctx->acdirmax / HZ); + seq_printf(s, ",acregmax=%lu", cifs_sb->ctx->acregmax / HZ); + } if (tcon->ses->chan_max > 1) seq_printf(s, ",multichannel,max_channels=%zu", @@ -1525,6 +1536,7 @@ init_cifs(void) */ atomic_set(&sesInfoAllocCount, 0); atomic_set(&tconInfoAllocCount, 0); + atomic_set(&tcpSesNextId, 0); atomic_set(&tcpSesAllocCount, 0); atomic_set(&tcpSesReconnectCount, 0); atomic_set(&tconInfoReconnectCount, 0); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 2307bb0f6147..0d7ef150dbb2 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -62,19 +62,22 @@ extern void cifs_sb_deactive(struct super_block *sb); /* Functions related to inodes */ extern const struct inode_operations cifs_dir_inode_ops; extern struct inode *cifs_root_iget(struct super_block *); -extern int cifs_create(struct inode *, struct dentry *, umode_t, - bool excl); +extern int cifs_create(struct user_namespace *, struct inode *, + struct dentry *, umode_t, bool excl); extern int cifs_atomic_open(struct inode *, struct dentry *, struct file *, unsigned, umode_t); extern struct dentry *cifs_lookup(struct inode *, struct dentry *, unsigned int); extern int cifs_unlink(struct inode *dir, struct dentry *dentry); extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *); -extern int cifs_mknod(struct inode *, struct dentry *, umode_t, dev_t); -extern int cifs_mkdir(struct inode *, struct dentry *, umode_t); +extern int cifs_mknod(struct user_namespace *, struct inode *, struct dentry *, + umode_t, dev_t); +extern int cifs_mkdir(struct user_namespace *, struct inode *, struct dentry *, + umode_t); extern int cifs_rmdir(struct inode *, struct dentry *); -extern int cifs_rename2(struct inode *, struct dentry *, struct inode *, - struct dentry *, unsigned int); +extern int cifs_rename2(struct user_namespace *, struct inode *, + struct dentry *, struct inode *, struct dentry *, + unsigned int); extern int cifs_revalidate_file_attr(struct file *filp); extern int cifs_revalidate_dentry_attr(struct dentry *); extern int cifs_revalidate_file(struct file *filp); @@ -82,8 +85,10 @@ extern int cifs_revalidate_dentry(struct dentry *); extern int cifs_invalidate_mapping(struct inode *inode); extern int cifs_revalidate_mapping(struct inode *inode); extern int cifs_zap_mapping(struct inode *inode); -extern int cifs_getattr(const struct path *, struct kstat *, u32, unsigned int); -extern int cifs_setattr(struct dentry *, struct iattr *); +extern int cifs_getattr(struct user_namespace *, const struct path *, + struct kstat *, u32, unsigned int); +extern int cifs_setattr(struct user_namespace *, struct dentry *, + struct iattr *); extern int cifs_fiemap(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); @@ -132,8 +137,8 @@ extern struct vfsmount *cifs_dfs_d_automount(struct path *path); /* Functions related to symlinks */ extern const char *cifs_get_link(struct dentry *, struct inode *, struct delayed_call *); -extern int cifs_symlink(struct inode *inode, struct dentry *direntry, - const char *symname); +extern int cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode, + struct dentry *direntry, const char *symname); #ifdef CONFIG_CIFS_XATTR extern const struct xattr_handler *cifs_xattr_handlers[]; @@ -160,5 +165,5 @@ extern struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type, extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.30" +#define CIFS_VERSION "2.31" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 50fcb65920e8..3de3c5908a72 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -21,6 +21,7 @@ #include <linux/in.h> #include <linux/in6.h> +#include <linux/inet.h> #include <linux/slab.h> #include <linux/mempool.h> #include <linux/workqueue.h> @@ -504,6 +505,8 @@ struct smb_version_operations { loff_t (*llseek)(struct file *, struct cifs_tcon *, loff_t, int); /* Check for STATUS_IO_TIMEOUT */ bool (*is_status_io_timeout)(char *buf); + /* Check for STATUS_NETWORK_NAME_DELETED */ + void (*is_network_name_deleted)(char *buf, struct TCP_Server_Info *srv); }; struct smb_version_values { @@ -577,6 +580,7 @@ inc_rfc1001_len(void *buf, int count) struct TCP_Server_Info { struct list_head tcp_ses_list; struct list_head smb_ses_list; + __u64 conn_id; /* connection identifier (useful for debugging) */ int srv_count; /* reference counter */ /* 15 character server name + 0x20 16th byte indicating type = srv */ char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; @@ -901,7 +905,7 @@ struct cifs_ses { kuid_t linux_uid; /* overriding owner of files on the mount */ kuid_t cred_uid; /* owner of credentials */ unsigned int capabilities; - char serverName[SERVER_NAME_LEN_WITH_NULL]; + char ip_addr[INET6_ADDRSTRLEN + 1]; /* Max ipv6 (or v4) addr string len */ char *user_name; /* must not be null except during init of sess and after mount option parsing we fill it */ char *domainName; @@ -1704,7 +1708,9 @@ static inline bool is_retryable_error(int error) #define CIFS_ECHO_OP 0x080 /* echo request */ #define CIFS_OBREAK_OP 0x0100 /* oplock break request */ #define CIFS_NEG_OP 0x0200 /* negotiate request */ -#define CIFS_OP_MASK 0x0380 /* mask request type */ +/* Lower bitmask values are reserved by others below. */ +#define CIFS_SESS_OP 0x2000 /* session setup request */ +#define CIFS_OP_MASK 0x2380 /* mask request type */ #define CIFS_HAS_CREDITS 0x0400 /* already has credits */ #define CIFS_TRANSFORM_REQ 0x0800 /* transform request before sending */ @@ -1844,6 +1850,7 @@ GLOBAL_EXTERN spinlock_t GlobalMid_Lock; /* protects above & list operations */ */ GLOBAL_EXTERN atomic_t sesInfoAllocCount; GLOBAL_EXTERN atomic_t tconInfoAllocCount; +GLOBAL_EXTERN atomic_t tcpSesNextId; GLOBAL_EXTERN atomic_t tcpSesAllocCount; GLOBAL_EXTERN atomic_t tcpSesReconnectCount; GLOBAL_EXTERN atomic_t tconInfoReconnectCount; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 32f7a013402e..75ce6f742b8d 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -232,6 +232,8 @@ extern unsigned int setup_special_user_owner_ACE(struct cifs_ace *pace); extern void dequeue_mid(struct mid_q_entry *mid, bool malformed); extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, unsigned int to_read); +extern ssize_t cifs_discard_from_socket(struct TCP_Server_Info *server, + size_t to_read); extern int cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, unsigned int page_offset, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 0496934feecb..c279527aae92 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1451,9 +1451,9 @@ cifs_discard_remaining_data(struct TCP_Server_Info *server) while (remaining > 0) { int length; - length = cifs_read_from_socket(server, server->bigbuf, - min_t(unsigned int, remaining, - CIFSMaxBufSize + MAX_HEADER_SIZE(server))); + length = cifs_discard_from_socket(server, + min_t(size_t, remaining, + CIFSMaxBufSize + MAX_HEADER_SIZE(server))); if (length < 0) return length; server->total_read += length; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 4bb9decbbf27..112692300fb6 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -242,7 +242,7 @@ cifs_reconnect(struct TCP_Server_Info *server) server->max_read = 0; cifs_dbg(FYI, "Mark tcp session as need reconnect\n"); - trace_smb3_reconnect(server->CurrentMid, server->hostname); + trace_smb3_reconnect(server->CurrentMid, server->conn_id, server->hostname); /* before reconnecting the tcp session, mark the smb session (uid) and the tid bad so they are not used until reconnected */ @@ -564,6 +564,23 @@ cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, return cifs_readv_from_socket(server, &smb_msg); } +ssize_t +cifs_discard_from_socket(struct TCP_Server_Info *server, size_t to_read) +{ + struct msghdr smb_msg; + + /* + * iov_iter_discard already sets smb_msg.type and count and iov_offset + * and cifs_readv_from_socket sets msg_control and msg_controllen + * so little to initialize in struct msghdr + */ + smb_msg.msg_name = NULL; + smb_msg.msg_namelen = 0; + iov_iter_discard(&smb_msg.msg_iter, READ, to_read); + + return cifs_readv_from_socket(server, &smb_msg); +} + int cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, unsigned int page_offset, unsigned int to_read) @@ -846,7 +863,7 @@ static void smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server) { struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer; - int scredits = server->credits; + int scredits, in_flight; /* * SMB1 does not use credits. @@ -857,12 +874,14 @@ smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server) if (shdr->CreditRequest) { spin_lock(&server->req_lock); server->credits += le16_to_cpu(shdr->CreditRequest); + scredits = server->credits; + in_flight = server->in_flight; spin_unlock(&server->req_lock); wake_up(&server->request_q); trace_smb3_add_credits(server->CurrentMid, - server->hostname, scredits, - le16_to_cpu(shdr->CreditRequest)); + server->conn_id, server->hostname, scredits, + le16_to_cpu(shdr->CreditRequest), in_flight); cifs_server_dbg(FYI, "%s: added %u credits total=%d\n", __func__, le16_to_cpu(shdr->CreditRequest), scredits); @@ -993,6 +1012,10 @@ next_pdu: if (mids[i] != NULL) { mids[i]->resp_buf_size = server->pdu_size; + if (bufs[i] && server->ops->is_network_name_deleted) + server->ops->is_network_name_deleted(bufs[i], + server); + if (!mids[i]->multiRsp || mids[i]->multiEnd) mids[i]->callback(mids[i]); @@ -1317,6 +1340,7 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx) goto out_err_crypto_release; } + tcp_ses->conn_id = atomic_inc_return(&tcpSesNextId); tcp_ses->noblockcnt = ctx->rootfs; tcp_ses->noblocksnd = ctx->noblocksnd || ctx->rootfs; tcp_ses->noautotune = ctx->noautotune; @@ -1838,9 +1862,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) /* new SMB session uses our server ref */ ses->server = server; if (server->dstaddr.ss_family == AF_INET6) - sprintf(ses->serverName, "%pI6", &addr6->sin6_addr); + sprintf(ses->ip_addr, "%pI6", &addr6->sin6_addr); else - sprintf(ses->serverName, "%pI4", &addr->sin_addr); + sprintf(ses->ip_addr, "%pI4", &addr->sin_addr); if (ctx->username) { ses->user_name = kstrdup(ctx->username, GFP_KERNEL); @@ -2269,7 +2293,9 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) if (strcmp(old->local_nls->charset, new->local_nls->charset)) return 0; - if (old->ctx->actimeo != new->ctx->actimeo) + if (old->ctx->acregmax != new->ctx->acregmax) + return 0; + if (old->ctx->acdirmax != new->ctx->acdirmax) return 0; return 1; @@ -2911,7 +2937,7 @@ static int mount_setup_tlink(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, #ifdef CONFIG_CIFS_DFS_UPCALL /* * cifs_build_path_to_root returns full path to root when we do not have an - * exiting connection (tcon) + * existing connection (tcon) */ static char * build_unc_path_to_root(const struct smb3_fs_context *ctx, @@ -3038,96 +3064,91 @@ static int update_vol_info(const struct dfs_cache_tgt_iterator *tgt_it, return 0; } -static int setup_dfs_tgt_conn(const char *path, const char *full_path, - const struct dfs_cache_tgt_iterator *tgt_it, - struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx, - unsigned int *xid, struct TCP_Server_Info **server, - struct cifs_ses **ses, struct cifs_tcon **tcon) -{ - int rc; - struct dfs_info3_param ref = {0}; - char *mdata = NULL; - struct smb3_fs_context fake_ctx = {NULL}; - char *fake_devname = NULL; - - cifs_dbg(FYI, "%s: dfs path: %s\n", __func__, path); - - rc = dfs_cache_get_tgt_referral(path, tgt_it, &ref); - if (rc) - return rc; - - mdata = cifs_compose_mount_options(cifs_sb->ctx->mount_options, - full_path + 1, &ref, - &fake_devname); - free_dfs_info_param(&ref); - - if (IS_ERR(mdata)) { - rc = PTR_ERR(mdata); - mdata = NULL; - } else - rc = cifs_setup_volume_info(&fake_ctx, mdata, fake_devname); - - kfree(mdata); - kfree(fake_devname); - - if (!rc) { - /* - * We use a 'fake_ctx' here because we need pass it down to the - * mount_{get,put} functions to test connection against new DFS - * targets. - */ - mount_put_conns(cifs_sb, *xid, *server, *ses, *tcon); - rc = mount_get_conns(&fake_ctx, cifs_sb, xid, server, ses, - tcon); - if (!rc || (*server && *ses)) { - /* - * We were able to connect to new target server. - * Update current context with new target server. - */ - rc = update_vol_info(tgt_it, &fake_ctx, ctx); - } - } - smb3_cleanup_fs_context_contents(&fake_ctx); - return rc; -} - static int do_dfs_failover(const char *path, const char *full_path, struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx, struct cifs_ses *root_ses, unsigned int *xid, struct TCP_Server_Info **server, struct cifs_ses **ses, struct cifs_tcon **tcon) { int rc; - struct dfs_cache_tgt_list tgt_list; + struct dfs_cache_tgt_list tgt_list = {0}; struct dfs_cache_tgt_iterator *tgt_it = NULL; + struct smb3_fs_context tmp_ctx = {NULL}; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) return -EOPNOTSUPP; + cifs_dbg(FYI, "%s: path=%s full_path=%s\n", __func__, path, full_path); + rc = dfs_cache_noreq_find(path, NULL, &tgt_list); if (rc) return rc; + /* + * We use a 'tmp_ctx' here because we need pass it down to the mount_{get,put} functions to + * test connection against new DFS targets. + */ + rc = smb3_fs_context_dup(&tmp_ctx, ctx); + if (rc) + goto out; for (;;) { + struct dfs_info3_param ref = {0}; + char *fake_devname = NULL, *mdata = NULL; + /* Get next DFS target server - if any */ rc = get_next_dfs_tgt(path, &tgt_list, &tgt_it); if (rc) break; - /* Connect to next DFS target */ - rc = setup_dfs_tgt_conn(path, full_path, tgt_it, cifs_sb, ctx, xid, server, ses, - tcon); - if (!rc || (*server && *ses)) + + rc = dfs_cache_get_tgt_referral(path, tgt_it, &ref); + if (rc) + break; + + cifs_dbg(FYI, "%s: old ctx: UNC=%s prepath=%s\n", __func__, tmp_ctx.UNC, + tmp_ctx.prepath); + + mdata = cifs_compose_mount_options(cifs_sb->ctx->mount_options, full_path + 1, &ref, + &fake_devname); + free_dfs_info_param(&ref); + + if (IS_ERR(mdata)) { + rc = PTR_ERR(mdata); + mdata = NULL; + } else + rc = cifs_setup_volume_info(&tmp_ctx, mdata, fake_devname); + + kfree(mdata); + kfree(fake_devname); + + if (rc) break; + + cifs_dbg(FYI, "%s: new ctx: UNC=%s prepath=%s\n", __func__, tmp_ctx.UNC, + tmp_ctx.prepath); + + mount_put_conns(cifs_sb, *xid, *server, *ses, *tcon); + rc = mount_get_conns(&tmp_ctx, cifs_sb, xid, server, ses, tcon); + if (!rc || (*server && *ses)) { + /* + * We were able to connect to new target server. Update current context with + * new target server. + */ + rc = update_vol_info(tgt_it, &tmp_ctx, ctx); + break; + } } if (!rc) { + cifs_dbg(FYI, "%s: final ctx: UNC=%s prepath=%s\n", __func__, tmp_ctx.UNC, + tmp_ctx.prepath); /* - * Update DFS target hint in DFS referral cache with the target - * server we successfully reconnected to. + * Update DFS target hint in DFS referral cache with the target server we + * successfully reconnected to. */ - rc = dfs_cache_update_tgthint(*xid, root_ses ? root_ses : *ses, - cifs_sb->local_nls, - cifs_remap(cifs_sb), path, - tgt_it); + rc = dfs_cache_update_tgthint(*xid, root_ses ? root_ses : *ses, cifs_sb->local_nls, + cifs_remap(cifs_sb), path, tgt_it); } + +out: + smb3_cleanup_fs_context_contents(&tmp_ctx); dfs_cache_free_tgts(&tgt_list); return rc; } @@ -3285,77 +3306,77 @@ static void put_root_ses(struct cifs_ses *ses) cifs_put_smb_ses(ses); } -/* Check if a path component is remote and then update @dfs_path accordingly */ -static int check_dfs_prepath(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx, - const unsigned int xid, struct TCP_Server_Info *server, - struct cifs_tcon *tcon, char **dfs_path) +/* Set up next dfs prefix path in @dfs_path */ +static int next_dfs_prepath(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx, + const unsigned int xid, struct TCP_Server_Info *server, + struct cifs_tcon *tcon, char **dfs_path) { - char *path, *s; - char sep = CIFS_DIR_SEP(cifs_sb), tmp; - char *npath; - int rc = 0; - int added_treename = tcon->Flags & SMB_SHARE_IS_IN_DFS; - int skip = added_treename; + char *path, *npath; + int added_treename = is_tcon_dfs(tcon); + int rc; path = cifs_build_path_to_root(ctx, cifs_sb, tcon, added_treename); if (!path) return -ENOMEM; - /* - * Walk through the path components in @path and check if they're accessible. In case any of - * the components is -EREMOTE, then update @dfs_path with the next DFS referral request path - * (NOT including the remaining components). - */ - s = path; - do { - /* skip separators */ - while (*s && *s == sep) - s++; - if (!*s) - break; - /* next separator */ - while (*s && *s != sep) - s++; - /* - * if the treename is added, we then have to skip the first - * part within the separators - */ - if (skip) { - skip = 0; - continue; + rc = is_path_remote(cifs_sb, ctx, xid, server, tcon); + if (rc == -EREMOTE) { + struct smb3_fs_context v = {NULL}; + /* if @path contains a tree name, skip it in the prefix path */ + if (added_treename) { + rc = smb3_parse_devname(path, &v); + if (rc) + goto out; + npath = build_unc_path_to_root(&v, cifs_sb, true); + smb3_cleanup_fs_context_contents(&v); + } else { + v.UNC = ctx->UNC; + v.prepath = path + 1; + npath = build_unc_path_to_root(&v, cifs_sb, true); } - tmp = *s; - *s = 0; - rc = server->ops->is_path_accessible(xid, tcon, cifs_sb, path); - if (rc && rc == -EREMOTE) { - struct smb3_fs_context v = {NULL}; - /* if @path contains a tree name, skip it in the prefix path */ - if (added_treename) { - rc = smb3_parse_devname(path, &v); - if (rc) - break; - rc = -EREMOTE; - npath = build_unc_path_to_root(&v, cifs_sb, true); - smb3_cleanup_fs_context_contents(&v); - } else { - v.UNC = ctx->UNC; - v.prepath = path + 1; - npath = build_unc_path_to_root(&v, cifs_sb, true); - } - if (IS_ERR(npath)) { - rc = PTR_ERR(npath); - break; - } - kfree(*dfs_path); - *dfs_path = npath; + + if (IS_ERR(npath)) { + rc = PTR_ERR(npath); + goto out; } - *s = tmp; - } while (rc == 0); + kfree(*dfs_path); + *dfs_path = npath; + rc = -EREMOTE; + } + +out: kfree(path); return rc; } +/* Check if resolved targets can handle any DFS referrals */ +static int is_referral_server(const char *ref_path, struct cifs_tcon *tcon, bool *ref_server) +{ + int rc; + struct dfs_info3_param ref = {0}; + + if (is_tcon_dfs(tcon)) { + *ref_server = true; + } else { + cifs_dbg(FYI, "%s: ref_path=%s\n", __func__, ref_path); + + rc = dfs_cache_noreq_find(ref_path, &ref, NULL); + if (rc) { + cifs_dbg(VFS, "%s: dfs_cache_noreq_find: failed (rc=%d)\n", __func__, rc); + return rc; + } + cifs_dbg(FYI, "%s: ref.flags=0x%x\n", __func__, ref.flags); + /* + * Check if all targets are capable of handling DFS referrals as per + * MS-DFSC 2.2.4 RESP_GET_DFS_REFERRAL. + */ + *ref_server = !!(ref.flags & DFSREF_REFERRAL_SERVER); + free_dfs_info_param(&ref); + } + return 0; +} + int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) { int rc = 0; @@ -3367,18 +3388,19 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) char *ref_path = NULL, *full_path = NULL; char *oldmnt = NULL; char *mntdata = NULL; + bool ref_server = false; rc = mount_get_conns(ctx, cifs_sb, &xid, &server, &ses, &tcon); /* - * Unconditionally try to get an DFS referral (even cached) to determine whether it is an - * DFS mount. + * If called with 'nodfs' mount option, then skip DFS resolving. Otherwise unconditionally + * try to get an DFS referral (even cached) to determine whether it is an DFS mount. * * Skip prefix path to provide support for DFS referrals from w2k8 servers which don't seem * to respond with PATH_NOT_COVERED to requests that include the prefix. */ - if (dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb), ctx->UNC + 1, NULL, + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) || + dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb), ctx->UNC + 1, NULL, NULL)) { - /* No DFS referral was returned. Looks like a regular share. */ if (rc) goto error; /* Check if it is fully accessible and then mount it */ @@ -3432,13 +3454,18 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) break; if (!tcon) continue; + /* Make sure that requests go through new root servers */ - if (is_tcon_dfs(tcon)) { + rc = is_referral_server(ref_path + 1, tcon, &ref_server); + if (rc) + break; + if (ref_server) { put_root_ses(root_ses); set_root_ses(cifs_sb, ses, &root_ses); } - /* Check for remaining path components and then continue chasing them (-EREMOTE) */ - rc = check_dfs_prepath(cifs_sb, ctx, xid, server, tcon, &ref_path); + + /* Get next dfs path and then continue chasing them if -EREMOTE */ + rc = next_dfs_prepath(cifs_sb, ctx, xid, server, tcon, &ref_path); /* Prevent recursion on broken link referrals */ if (rc == -EREMOTE && ++count > MAX_NESTED_LINKS) rc = -ELOOP; diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index 4950ab0486ae..098b4bc8da59 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -37,11 +37,12 @@ struct cache_dfs_tgt { struct cache_entry { struct hlist_node hlist; const char *path; - int ttl; - int srvtype; - int flags; + int hdr_flags; /* RESP_GET_DFS_REFERRAL.ReferralHeaderFlags */ + int ttl; /* DFS_REREFERRAL_V3.TimeToLive */ + int srvtype; /* DFS_REREFERRAL_V3.ServerType */ + int ref_flags; /* DFS_REREFERRAL_V3.ReferralEntryFlags */ struct timespec64 etime; - int path_consumed; + int path_consumed; /* RESP_GET_DFS_REFERRAL.PathConsumed */ int numtgts; struct list_head tlist; struct cache_dfs_tgt *tgthint; @@ -166,14 +167,11 @@ static int dfscache_proc_show(struct seq_file *m, void *v) continue; seq_printf(m, - "cache entry: path=%s,type=%s,ttl=%d,etime=%ld," - "interlink=%s,path_consumed=%d,expired=%s\n", - ce->path, - ce->srvtype == DFS_TYPE_ROOT ? "root" : "link", - ce->ttl, ce->etime.tv_nsec, - IS_INTERLINK_SET(ce->flags) ? "yes" : "no", - ce->path_consumed, - cache_entry_expired(ce) ? "yes" : "no"); + "cache entry: path=%s,type=%s,ttl=%d,etime=%ld,hdr_flags=0x%x,ref_flags=0x%x,interlink=%s,path_consumed=%d,expired=%s\n", + ce->path, ce->srvtype == DFS_TYPE_ROOT ? "root" : "link", + ce->ttl, ce->etime.tv_nsec, ce->ref_flags, ce->hdr_flags, + IS_INTERLINK_SET(ce->hdr_flags) ? "yes" : "no", + ce->path_consumed, cache_entry_expired(ce) ? "yes" : "no"); list_for_each_entry(t, &ce->tlist, list) { seq_printf(m, " %s%s\n", @@ -236,11 +234,12 @@ static inline void dump_tgts(const struct cache_entry *ce) static inline void dump_ce(const struct cache_entry *ce) { - cifs_dbg(FYI, "cache entry: path=%s,type=%s,ttl=%d,etime=%ld,interlink=%s,path_consumed=%d,expired=%s\n", + cifs_dbg(FYI, "cache entry: path=%s,type=%s,ttl=%d,etime=%ld,hdr_flags=0x%x,ref_flags=0x%x,interlink=%s,path_consumed=%d,expired=%s\n", ce->path, ce->srvtype == DFS_TYPE_ROOT ? "root" : "link", ce->ttl, ce->etime.tv_nsec, - IS_INTERLINK_SET(ce->flags) ? "yes" : "no", + ce->hdr_flags, ce->ref_flags, + IS_INTERLINK_SET(ce->hdr_flags) ? "yes" : "no", ce->path_consumed, cache_entry_expired(ce) ? "yes" : "no"); dump_tgts(ce); @@ -381,7 +380,8 @@ static int copy_ref_data(const struct dfs_info3_param *refs, int numrefs, ce->ttl = refs[0].ttl; ce->etime = get_expire_time(ce->ttl); ce->srvtype = refs[0].server_type; - ce->flags = refs[0].ref_flag; + ce->hdr_flags = refs[0].flags; + ce->ref_flags = refs[0].ref_flag; ce->path_consumed = refs[0].path_consumed; for (i = 0; i < numrefs; i++) { @@ -799,7 +799,8 @@ static int setup_referral(const char *path, struct cache_entry *ce, ref->path_consumed = ce->path_consumed; ref->ttl = ce->ttl; ref->server_type = ce->srvtype; - ref->ref_flag = ce->flags; + ref->ref_flag = ce->ref_flags; + ref->flags = ce->hdr_flags; return 0; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 97ac363b5df1..a3fb81e0ba17 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -567,8 +567,8 @@ out_free_xid: return rc; } -int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode, - bool excl) +int cifs_create(struct user_namespace *mnt_userns, struct inode *inode, + struct dentry *direntry, umode_t mode, bool excl) { int rc; unsigned int xid = get_xid(); @@ -611,8 +611,8 @@ out_free_xid: return rc; } -int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, - dev_t device_number) +int cifs_mknod(struct user_namespace *mnt_userns, struct inode *inode, + struct dentry *direntry, umode_t mode, dev_t device_number) { int rc = -EPERM; unsigned int xid; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 6d001905c8e5..26de4329d161 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -580,7 +580,7 @@ int cifs_open(struct inode *inode, struct file *file) } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { if (tcon->ses->serverNOS) cifs_dbg(VFS, "server %s of type %s returned unexpected error on SMB posix open, disabling posix open support. Check if server update available.\n", - tcon->ses->serverName, + tcon->ses->ip_addr, tcon->ses->serverNOS); tcon->broken_posix_open = true; } else if ((rc != -EIO) && (rc != -EREMOTE) && diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index 12a5da0230b5..892f51a21278 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -140,6 +140,8 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_u32("rsize", Opt_rsize), fsparam_u32("wsize", Opt_wsize), fsparam_u32("actimeo", Opt_actimeo), + fsparam_u32("acdirmax", Opt_acdirmax), + fsparam_u32("acregmax", Opt_acregmax), fsparam_u32("echo_interval", Opt_echo_interval), fsparam_u32("max_credits", Opt_max_credits), fsparam_u32("handletimeout", Opt_handletimeout), @@ -397,7 +399,7 @@ cifs_parse_smb_version(char *value, struct smb3_fs_context *ctx, bool is_smb3) ctx->vals = &smb3any_values; break; case Smb_default: - ctx->ops = &smb30_operations; /* currently identical with 3.0 */ + ctx->ops = &smb30_operations; ctx->vals = &smbdefault_values; break; default: @@ -542,20 +544,37 @@ static int smb3_fs_context_parse_monolithic(struct fs_context *fc, /* BB Need to add support for sep= here TBD */ while ((key = strsep(&options, ",")) != NULL) { - if (*key) { - size_t v_len = 0; - char *value = strchr(key, '='); - - if (value) { - if (value == key) - continue; - *value++ = 0; - v_len = strlen(value); - } - ret = vfs_parse_fs_string(fc, key, value, v_len); - if (ret < 0) - break; + size_t len; + char *value; + + if (*key == 0) + break; + + /* Check if following character is the deliminator If yes, + * we have encountered a double deliminator reset the NULL + * character to the deliminator + */ + while (options && options[0] == ',') { + len = strlen(key); + strcpy(key + len, options); + options = strchr(options, ','); + if (options) + *options++ = 0; + } + + + len = 0; + value = strchr(key, '='); + if (value) { + if (value == key) + continue; + *value++ = 0; + len = strlen(value); } + + ret = vfs_parse_fs_string(fc, key, value, len); + if (ret < 0) + break; } return ret; @@ -929,12 +948,31 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, ctx->wsize = result.uint_32; ctx->got_wsize = true; break; + case Opt_acregmax: + ctx->acregmax = HZ * result.uint_32; + if (ctx->acregmax > CIFS_MAX_ACTIMEO) { + cifs_dbg(VFS, "acregmax too large\n"); + goto cifs_parse_mount_err; + } + break; + case Opt_acdirmax: + ctx->acdirmax = HZ * result.uint_32; + if (ctx->acdirmax > CIFS_MAX_ACTIMEO) { + cifs_dbg(VFS, "acdirmax too large\n"); + goto cifs_parse_mount_err; + } + break; case Opt_actimeo: - ctx->actimeo = HZ * result.uint_32; - if (ctx->actimeo > CIFS_MAX_ACTIMEO) { - cifs_dbg(VFS, "attribute cache timeout too large\n"); + if (HZ * result.uint_32 > CIFS_MAX_ACTIMEO) { + cifs_dbg(VFS, "timeout too large\n"); goto cifs_parse_mount_err; } + if ((ctx->acdirmax != CIFS_DEF_ACTIMEO) || + (ctx->acregmax != CIFS_DEF_ACTIMEO)) { + cifs_dbg(VFS, "actimeo ignored since acregmax or acdirmax specified\n"); + break; + } + ctx->acdirmax = ctx->acregmax = HZ * result.uint_32; break; case Opt_echo_interval: ctx->echo_interval = result.uint_32; @@ -1361,7 +1399,8 @@ int smb3_init_fs_context(struct fs_context *fc) /* default is to use strict cifs caching semantics */ ctx->strict_io = true; - ctx->actimeo = CIFS_DEF_ACTIMEO; + ctx->acregmax = CIFS_DEF_ACTIMEO; + ctx->acdirmax = CIFS_DEF_ACTIMEO; /* Most clients set timeout to 0, allows server to use its default */ ctx->handle_timeout = 0; /* See MS-SMB2 spec section 2.2.14.2.12 */ diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h index 1c44a460e2c0..87dd1f7168f2 100644 --- a/fs/cifs/fs_context.h +++ b/fs/cifs/fs_context.h @@ -118,6 +118,8 @@ enum cifs_param { Opt_rsize, Opt_wsize, Opt_actimeo, + Opt_acdirmax, + Opt_acregmax, Opt_echo_interval, Opt_max_credits, Opt_snapshot, @@ -232,7 +234,9 @@ struct smb3_fs_context { unsigned int wsize; unsigned int min_offload; bool sockopt_tcp_nodelay:1; - unsigned long actimeo; /* attribute cache timeout (jiffies) */ + /* attribute cache timemout for files and directories in jiffies */ + unsigned long acregmax; + unsigned long acdirmax; struct smb_version_operations *ops; struct smb_version_values *vals; char *prepath; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a83b3a8ffaac..7c61bc9573c0 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1857,7 +1857,8 @@ posix_mkdir_get_info: goto posix_mkdir_out; } -int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode) +int cifs_mkdir(struct user_namespace *mnt_userns, struct inode *inode, + struct dentry *direntry, umode_t mode) { int rc = 0; unsigned int xid; @@ -2067,9 +2068,9 @@ do_rename_exit: } int -cifs_rename2(struct inode *source_dir, struct dentry *source_dentry, - struct inode *target_dir, struct dentry *target_dentry, - unsigned int flags) +cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir, + struct dentry *source_dentry, struct inode *target_dir, + struct dentry *target_dentry, unsigned int flags) { char *from_name = NULL; char *to_name = NULL; @@ -2198,12 +2199,23 @@ cifs_inode_needs_reval(struct inode *inode) if (!lookupCacheEnabled) return true; - if (!cifs_sb->ctx->actimeo) - return true; - - if (!time_in_range(jiffies, cifs_i->time, - cifs_i->time + cifs_sb->ctx->actimeo)) - return true; + /* + * depending on inode type, check if attribute caching disabled for + * files or directories + */ + if (S_ISDIR(inode->i_mode)) { + if (!cifs_sb->ctx->acdirmax) + return true; + if (!time_in_range(jiffies, cifs_i->time, + cifs_i->time + cifs_sb->ctx->acdirmax)) + return true; + } else { /* file */ + if (!cifs_sb->ctx->acregmax) + return true; + if (!time_in_range(jiffies, cifs_i->time, + cifs_i->time + cifs_sb->ctx->acregmax)) + return true; + } /* hardlinked files w/ noserverino get "special" treatment */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) && @@ -2370,8 +2382,8 @@ int cifs_revalidate_dentry(struct dentry *dentry) return cifs_revalidate_mapping(inode); } -int cifs_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +int cifs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb); @@ -2408,7 +2420,7 @@ int cifs_getattr(const struct path *path, struct kstat *stat, return rc; } - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); stat->blksize = cifs_sb->ctx->bsize; stat->ino = CIFS_I(inode)->uniqueid; @@ -2610,7 +2622,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) attrs->ia_valid |= ATTR_FORCE; - rc = setattr_prepare(direntry, attrs); + rc = setattr_prepare(&init_user_ns, direntry, attrs); if (rc < 0) goto out; @@ -2715,7 +2727,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) attrs->ia_size != i_size_read(inode)) truncate_setsize(inode, attrs->ia_size); - setattr_copy(inode, attrs); + setattr_copy(&init_user_ns, inode, attrs); mark_inode_dirty(inode); /* force revalidate when any of these times are set since some @@ -2757,7 +2769,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) attrs->ia_valid |= ATTR_FORCE; - rc = setattr_prepare(direntry, attrs); + rc = setattr_prepare(&init_user_ns, direntry, attrs); if (rc < 0) { free_xid(xid); return rc; @@ -2913,7 +2925,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) attrs->ia_size != i_size_read(inode)) truncate_setsize(inode, attrs->ia_size); - setattr_copy(inode, attrs); + setattr_copy(&init_user_ns, inode, attrs); mark_inode_dirty(inode); cifs_setattr_exit: @@ -2923,7 +2935,8 @@ cifs_setattr_exit: } int -cifs_setattr(struct dentry *direntry, struct iattr *attrs) +cifs_setattr(struct user_namespace *mnt_userns, struct dentry *direntry, + struct iattr *attrs) { struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb); diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 94dab4309fbb..7c5878a645d9 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -661,7 +661,8 @@ cifs_get_link(struct dentry *direntry, struct inode *inode, } int -cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) +cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode, + struct dentry *direntry, const char *symname) { int rc = -EOPNOTSUPP; unsigned int xid; diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 213465718fa8..183a3a868d7b 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -218,7 +218,7 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, /* UNC and paths */ /* XXX: Use ses->server->hostname? */ - sprintf(unc, unc_fmt, ses->serverName); + sprintf(unc, unc_fmt, ses->ip_addr); ctx.UNC = unc; ctx.prepath = ""; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index f19274857292..f5087295424c 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -63,17 +63,19 @@ smb2_add_credits(struct TCP_Server_Info *server, const struct cifs_credits *credits, const int optype) { int *val, rc = -1; + int scredits, in_flight; unsigned int add = credits->value; unsigned int instance = credits->instance; bool reconnect_detected = false; + bool reconnect_with_invalid_credits = false; spin_lock(&server->req_lock); val = server->ops->get_credits_field(server, optype); /* eg found case where write overlapping reconnect messed up credits */ if (((optype & CIFS_OP_MASK) == CIFS_NEG_OP) && (*val != 0)) - trace_smb3_reconnect_with_invalid_credits(server->CurrentMid, - server->hostname, *val, add); + reconnect_with_invalid_credits = true; + if ((instance == 0) || (instance == server->reconnect_instance)) *val += add; else @@ -84,7 +86,9 @@ smb2_add_credits(struct TCP_Server_Info *server, pr_warn_once("server overflowed SMB3 credits\n"); } server->in_flight--; - if (server->in_flight == 0 && (optype & CIFS_OP_MASK) != CIFS_NEG_OP) + if (server->in_flight == 0 && + ((optype & CIFS_OP_MASK) != CIFS_NEG_OP) && + ((optype & CIFS_OP_MASK) != CIFS_SESS_OP)) rc = change_conf(server); /* * Sometimes server returns 0 credits on oplock break ack - we need to @@ -97,14 +101,26 @@ smb2_add_credits(struct TCP_Server_Info *server, server->oplock_credits++; } } + scredits = *val; + in_flight = server->in_flight; spin_unlock(&server->req_lock); wake_up(&server->request_q); if (reconnect_detected) { + trace_smb3_reconnect_detected(server->CurrentMid, + server->conn_id, server->hostname, scredits, add, in_flight); + cifs_dbg(FYI, "trying to put %d credits from the old server instance %d\n", add, instance); } + if (reconnect_with_invalid_credits) { + trace_smb3_reconnect_with_invalid_credits(server->CurrentMid, + server->conn_id, server->hostname, scredits, add, in_flight); + cifs_dbg(FYI, "Negotiate operation when server credits is non-zero. Optype: %d, server credits: %d, credits added: %d\n", + optype, scredits, add); + } + if (server->tcpStatus == CifsNeedReconnect || server->tcpStatus == CifsExiting) return; @@ -123,23 +139,30 @@ smb2_add_credits(struct TCP_Server_Info *server, cifs_dbg(FYI, "disabling oplocks\n"); break; default: - trace_smb3_add_credits(server->CurrentMid, - server->hostname, rc, add); - cifs_dbg(FYI, "%s: added %u credits total=%d\n", __func__, add, rc); + /* change_conf rebalanced credits for different types */ + break; } + + trace_smb3_add_credits(server->CurrentMid, + server->conn_id, server->hostname, scredits, add, in_flight); + cifs_dbg(FYI, "%s: added %u credits total=%d\n", __func__, add, scredits); } static void smb2_set_credits(struct TCP_Server_Info *server, const int val) { + int scredits, in_flight; + spin_lock(&server->req_lock); server->credits = val; if (val == 1) server->reconnect_instance++; + scredits = server->credits; + in_flight = server->in_flight; spin_unlock(&server->req_lock); trace_smb3_set_credits(server->CurrentMid, - server->hostname, val, val); + server->conn_id, server->hostname, scredits, val, in_flight); cifs_dbg(FYI, "%s: set %u credits\n", __func__, val); /* don't log while holding the lock */ @@ -171,7 +194,7 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, unsigned int *num, struct cifs_credits *credits) { int rc = 0; - unsigned int scredits; + unsigned int scredits, in_flight; spin_lock(&server->req_lock); while (1) { @@ -208,17 +231,18 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE); credits->instance = server->reconnect_instance; server->credits -= credits->value; - scredits = server->credits; server->in_flight++; if (server->in_flight > server->max_in_flight) server->max_in_flight = server->in_flight; break; } } + scredits = server->credits; + in_flight = server->in_flight; spin_unlock(&server->req_lock); trace_smb3_add_credits(server->CurrentMid, - server->hostname, scredits, -(credits->value)); + server->conn_id, server->hostname, scredits, -(credits->value), in_flight); cifs_dbg(FYI, "%s: removed %u credits total=%d\n", __func__, credits->value, scredits); @@ -231,14 +255,14 @@ smb2_adjust_credits(struct TCP_Server_Info *server, const unsigned int payload_size) { int new_val = DIV_ROUND_UP(payload_size, SMB2_MAX_BUFFER_SIZE); - int scredits; + int scredits, in_flight; if (!credits->value || credits->value == new_val) return 0; if (credits->value < new_val) { trace_smb3_too_many_credits(server->CurrentMid, - server->hostname, 0, credits->value - new_val); + server->conn_id, server->hostname, 0, credits->value - new_val, 0); cifs_server_dbg(VFS, "request has less credits (%d) than required (%d)", credits->value, new_val); @@ -248,9 +272,13 @@ smb2_adjust_credits(struct TCP_Server_Info *server, spin_lock(&server->req_lock); if (server->reconnect_instance != credits->instance) { + scredits = server->credits; + in_flight = server->in_flight; spin_unlock(&server->req_lock); + trace_smb3_reconnect_detected(server->CurrentMid, - server->hostname, 0, 0); + server->conn_id, server->hostname, scredits, + credits->value - new_val, in_flight); cifs_server_dbg(VFS, "trying to return %d credits to old session\n", credits->value - new_val); return -EAGAIN; @@ -258,15 +286,18 @@ smb2_adjust_credits(struct TCP_Server_Info *server, server->credits += credits->value - new_val; scredits = server->credits; + in_flight = server->in_flight; spin_unlock(&server->req_lock); wake_up(&server->request_q); - credits->value = new_val; trace_smb3_add_credits(server->CurrentMid, - server->hostname, scredits, credits->value - new_val); + server->conn_id, server->hostname, scredits, + credits->value - new_val, in_flight); cifs_dbg(FYI, "%s: adjust added %u credits total=%d\n", __func__, credits->value - new_val, scredits); + credits->value = new_val; + return 0; } @@ -2369,7 +2400,7 @@ static bool smb2_is_status_pending(char *buf, struct TCP_Server_Info *server) { struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; - int scredits; + int scredits, in_flight; if (shdr->Status != STATUS_PENDING) return false; @@ -2378,11 +2409,13 @@ smb2_is_status_pending(char *buf, struct TCP_Server_Info *server) spin_lock(&server->req_lock); server->credits += le16_to_cpu(shdr->CreditRequest); scredits = server->credits; + in_flight = server->in_flight; spin_unlock(&server->req_lock); wake_up(&server->request_q); trace_smb3_add_credits(server->CurrentMid, - server->hostname, scredits, le16_to_cpu(shdr->CreditRequest)); + server->conn_id, server->hostname, scredits, + le16_to_cpu(shdr->CreditRequest), in_flight); cifs_dbg(FYI, "%s: status pending add %u credits total=%d\n", __func__, le16_to_cpu(shdr->CreditRequest), scredits); } @@ -2418,6 +2451,34 @@ smb2_is_status_io_timeout(char *buf) return false; } +static void +smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) +{ + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + struct list_head *tmp, *tmp1; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + + if (shdr->Status != STATUS_NETWORK_NAME_DELETED) + return; + + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp, &server->smb_ses_list) { + ses = list_entry(tmp, struct cifs_ses, smb_ses_list); + list_for_each(tmp1, &ses->tcon_list) { + tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); + if (tcon->tid == shdr->TreeId) { + tcon->need_reconnect = true; + spin_unlock(&cifs_tcp_ses_lock); + pr_warn_once("Server share %s deleted.\n", + tcon->treeName); + return; + } + } + } + spin_unlock(&cifs_tcp_ses_lock); +} + static int smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid, struct cifsInodeInfo *cinode) @@ -4605,6 +4666,10 @@ static void smb2_decrypt_offload(struct work_struct *work) #ifdef CONFIG_CIFS_STATS2 mid->when_received = jiffies; #endif + if (dw->server->ops->is_network_name_deleted) + dw->server->ops->is_network_name_deleted(dw->buf, + dw->server); + mid->callback(mid); } else { spin_lock(&GlobalMid_Lock); @@ -4723,6 +4788,12 @@ non_offloaded_decrypt: rc = handle_read_data(server, *mid, buf, server->vals->read_rsp_size, pages, npages, len, false); + if (rc >= 0) { + if (server->ops->is_network_name_deleted) { + server->ops->is_network_name_deleted(buf, + server); + } + } } free_pages: @@ -5072,6 +5143,7 @@ struct smb_version_operations smb20_operations = { .fiemap = smb3_fiemap, .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, + .is_network_name_deleted = smb2_is_network_name_deleted, }; struct smb_version_operations smb21_operations = { @@ -5173,6 +5245,7 @@ struct smb_version_operations smb21_operations = { .fiemap = smb3_fiemap, .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, + .is_network_name_deleted = smb2_is_network_name_deleted, }; struct smb_version_operations smb30_operations = { @@ -5286,6 +5359,7 @@ struct smb_version_operations smb30_operations = { .fiemap = smb3_fiemap, .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, + .is_network_name_deleted = smb2_is_network_name_deleted, }; struct smb_version_operations smb311_operations = { @@ -5399,6 +5473,7 @@ struct smb_version_operations smb311_operations = { .fiemap = smb3_fiemap, .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, + .is_network_name_deleted = smb2_is_network_name_deleted, }; struct smb_version_values smb20_values = { diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 794fc3b68b4f..4bbb6126b14d 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -814,8 +814,9 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) SMB3ANY_VERSION_STRING) == 0) { req->Dialects[0] = cpu_to_le16(SMB30_PROT_ID); req->Dialects[1] = cpu_to_le16(SMB302_PROT_ID); - req->DialectCount = cpu_to_le16(2); - total_len += 4; + req->Dialects[2] = cpu_to_le16(SMB311_PROT_ID); + req->DialectCount = cpu_to_le16(3); + total_len += 6; } else if (strcmp(server->vals->version_string, SMBDEFAULT_VERSION_STRING) == 0) { req->Dialects[0] = cpu_to_le16(SMB21_PROT_ID); @@ -849,6 +850,8 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) SMB2_CLIENT_GUID_SIZE); if ((server->vals->protocol_id == SMB311_PROT_ID) || (strcmp(server->vals->version_string, + SMB3ANY_VERSION_STRING) == 0) || + (strcmp(server->vals->version_string, SMBDEFAULT_VERSION_STRING) == 0)) assemble_neg_contexts(req, server, &total_len); } @@ -883,6 +886,10 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) cifs_server_dbg(VFS, "SMB2.1 dialect returned but not requested\n"); return -EIO; + } else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) { + /* ops set to 3.0 by default for default so update */ + server->ops = &smb311_operations; + server->vals = &smb311_values; } } else if (strcmp(server->vals->version_string, SMBDEFAULT_VERSION_STRING) == 0) { @@ -1042,10 +1049,11 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) SMB3ANY_VERSION_STRING) == 0) { pneg_inbuf->Dialects[0] = cpu_to_le16(SMB30_PROT_ID); pneg_inbuf->Dialects[1] = cpu_to_le16(SMB302_PROT_ID); - pneg_inbuf->DialectCount = cpu_to_le16(2); - /* structure is big enough for 3 dialects, sending only 2 */ + pneg_inbuf->Dialects[2] = cpu_to_le16(SMB311_PROT_ID); + pneg_inbuf->DialectCount = cpu_to_le16(3); + /* SMB 2.1 not included so subtract one dialect from len */ inbuflen = sizeof(*pneg_inbuf) - - (2 * sizeof(pneg_inbuf->Dialects[0])); + (sizeof(pneg_inbuf->Dialects[0])); } else if (strcmp(server->vals->version_string, SMBDEFAULT_VERSION_STRING) == 0) { pneg_inbuf->Dialects[0] = cpu_to_le16(SMB21_PROT_ID); @@ -1053,7 +1061,7 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) pneg_inbuf->Dialects[2] = cpu_to_le16(SMB302_PROT_ID); pneg_inbuf->Dialects[3] = cpu_to_le16(SMB311_PROT_ID); pneg_inbuf->DialectCount = cpu_to_le16(4); - /* structure is big enough for 3 dialects */ + /* structure is big enough for 4 dialects */ inbuflen = sizeof(*pneg_inbuf); } else { /* otherwise specific dialect was requested */ @@ -1253,7 +1261,7 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data) cifs_ses_server(sess_data->ses), &rqst, &sess_data->buf0_type, - CIFS_LOG_ERROR | CIFS_NEG_OP, &rsp_iov); + CIFS_LOG_ERROR | CIFS_SESS_OP, &rsp_iov); cifs_small_buf_release(sess_data->iov[0].iov_base); memcpy(&sess_data->iov[0], &rsp_iov, sizeof(struct kvec)); diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index c3d1a584f251..d6df908dccad 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -851,17 +851,21 @@ DEFINE_SMB3_LEASE_ERR_EVENT(lease_err); DECLARE_EVENT_CLASS(smb3_reconnect_class, TP_PROTO(__u64 currmid, + __u64 conn_id, char *hostname), - TP_ARGS(currmid, hostname), + TP_ARGS(currmid, conn_id, hostname), TP_STRUCT__entry( __field(__u64, currmid) + __field(__u64, conn_id) __field(char *, hostname) ), TP_fast_assign( __entry->currmid = currmid; + __entry->conn_id = conn_id; __entry->hostname = hostname; ), - TP_printk("server=%s current_mid=0x%llx", + TP_printk("conn_id=0x%llx server=%s current_mid=%llu", + __entry->conn_id, __entry->hostname, __entry->currmid) ) @@ -869,44 +873,56 @@ DECLARE_EVENT_CLASS(smb3_reconnect_class, #define DEFINE_SMB3_RECONNECT_EVENT(name) \ DEFINE_EVENT(smb3_reconnect_class, smb3_##name, \ TP_PROTO(__u64 currmid, \ - char *hostname), \ - TP_ARGS(currmid, hostname)) + __u64 conn_id, \ + char *hostname), \ + TP_ARGS(currmid, conn_id, hostname)) DEFINE_SMB3_RECONNECT_EVENT(reconnect); DEFINE_SMB3_RECONNECT_EVENT(partial_send_reconnect); DECLARE_EVENT_CLASS(smb3_credit_class, TP_PROTO(__u64 currmid, + __u64 conn_id, char *hostname, int credits, - int credits_to_add), - TP_ARGS(currmid, hostname, credits, credits_to_add), + int credits_to_add, + int in_flight), + TP_ARGS(currmid, conn_id, hostname, credits, credits_to_add, in_flight), TP_STRUCT__entry( __field(__u64, currmid) + __field(__u64, conn_id) __field(char *, hostname) __field(int, credits) __field(int, credits_to_add) + __field(int, in_flight) ), TP_fast_assign( __entry->currmid = currmid; + __entry->conn_id = conn_id; __entry->hostname = hostname; __entry->credits = credits; __entry->credits_to_add = credits_to_add; + __entry->in_flight = in_flight; ), - TP_printk("server=%s current_mid=0x%llx credits=%d credits_to_add=%d", + TP_printk("conn_id=0x%llx server=%s current_mid=%llu " + "credits=%d credit_change=%d in_flight=%d", + __entry->conn_id, __entry->hostname, __entry->currmid, __entry->credits, - __entry->credits_to_add) + __entry->credits_to_add, + __entry->in_flight) ) #define DEFINE_SMB3_CREDIT_EVENT(name) \ DEFINE_EVENT(smb3_credit_class, smb3_##name, \ TP_PROTO(__u64 currmid, \ + __u64 conn_id, \ char *hostname, \ int credits, \ - int credits_to_add), \ - TP_ARGS(currmid, hostname, credits, credits_to_add)) + int credits_to_add, \ + int in_flight), \ + TP_ARGS(currmid, conn_id, hostname, credits, credits_to_add, in_flight)) DEFINE_SMB3_CREDIT_EVENT(reconnect_with_invalid_credits); DEFINE_SMB3_CREDIT_EVENT(reconnect_detected); diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 4a2b836eb017..e90a1d1380b0 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -445,7 +445,7 @@ unmask: */ server->tcpStatus = CifsNeedReconnect; trace_smb3_partial_send_reconnect(server->CurrentMid, - server->hostname); + server->conn_id, server->hostname); } smbd_done: if (rc < 0 && rc != -EINTR) @@ -527,7 +527,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, int *credits; int optype; long int t; - int scredits = server->credits; + int scredits, in_flight; if (timeout < 0) t = MAX_JIFFY_OFFSET; @@ -551,23 +551,39 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, server->max_in_flight = server->in_flight; *credits -= 1; *instance = server->reconnect_instance; + scredits = *credits; + in_flight = server->in_flight; spin_unlock(&server->req_lock); + + trace_smb3_add_credits(server->CurrentMid, + server->conn_id, server->hostname, scredits, -1, in_flight); + cifs_dbg(FYI, "%s: remove %u credits total=%d\n", + __func__, 1, scredits); + return 0; } while (1) { if (*credits < num_credits) { + scredits = *credits; spin_unlock(&server->req_lock); + cifs_num_waiters_inc(server); rc = wait_event_killable_timeout(server->request_q, has_credits(server, credits, num_credits), t); cifs_num_waiters_dec(server); if (!rc) { + spin_lock(&server->req_lock); + scredits = *credits; + in_flight = server->in_flight; + spin_unlock(&server->req_lock); + trace_smb3_credit_timeout(server->CurrentMid, - server->hostname, num_credits, 0); + server->conn_id, server->hostname, scredits, + num_credits, in_flight); cifs_server_dbg(VFS, "wait timed out after %d ms\n", - timeout); - return -ENOTSUPP; + timeout); + return -EBUSY; } if (rc == -ERESTARTSYS) return -ERESTARTSYS; @@ -595,6 +611,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, server->in_flight > 2 * MAX_COMPOUND && *credits <= MAX_COMPOUND) { spin_unlock(&server->req_lock); + cifs_num_waiters_inc(server); rc = wait_event_killable_timeout( server->request_q, @@ -603,13 +620,18 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, t); cifs_num_waiters_dec(server); if (!rc) { + spin_lock(&server->req_lock); + scredits = *credits; + in_flight = server->in_flight; + spin_unlock(&server->req_lock); + trace_smb3_credit_timeout( - server->CurrentMid, - server->hostname, num_credits, - 0); + server->CurrentMid, + server->conn_id, server->hostname, + scredits, num_credits, in_flight); cifs_server_dbg(VFS, "wait timed out after %d ms\n", - timeout); - return -ENOTSUPP; + timeout); + return -EBUSY; } if (rc == -ERESTARTSYS) return -ERESTARTSYS; @@ -625,16 +647,18 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, /* update # of requests on the wire to server */ if ((flags & CIFS_TIMEOUT_MASK) != CIFS_BLOCKING_OP) { *credits -= num_credits; - scredits = *credits; server->in_flight += num_credits; if (server->in_flight > server->max_in_flight) server->max_in_flight = server->in_flight; *instance = server->reconnect_instance; } + scredits = *credits; + in_flight = server->in_flight; spin_unlock(&server->req_lock); trace_smb3_add_credits(server->CurrentMid, - server->hostname, scredits, -(num_credits)); + server->conn_id, server->hostname, scredits, + -(num_credits), in_flight); cifs_dbg(FYI, "%s: remove %u credits total=%d\n", __func__, num_credits, scredits); break; @@ -656,13 +680,13 @@ wait_for_compound_request(struct TCP_Server_Info *server, int num, const int flags, unsigned int *instance) { int *credits; - int scredits, sin_flight; + int scredits, in_flight; credits = server->ops->get_credits_field(server, flags & CIFS_OP_MASK); spin_lock(&server->req_lock); scredits = *credits; - sin_flight = server->in_flight; + in_flight = server->in_flight; if (*credits < num) { /* @@ -684,10 +708,11 @@ wait_for_compound_request(struct TCP_Server_Info *server, int num, if (server->in_flight == 0) { spin_unlock(&server->req_lock); trace_smb3_insufficient_credits(server->CurrentMid, - server->hostname, scredits, sin_flight); + server->conn_id, server->hostname, scredits, + num, in_flight); cifs_dbg(FYI, "%s: %d requests in flight, needed %d total=%d\n", - __func__, sin_flight, num, scredits); - return -ENOTSUPP; + __func__, in_flight, num, scredits); + return -EDEADLK; } } spin_unlock(&server->req_lock); @@ -1171,7 +1196,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, /* * Compounding is never used during session establish. */ - if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) + if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) smb311_update_preauth_hash(ses, rqst[0].rq_iov, rqst[0].rq_nvec); @@ -1236,7 +1261,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, /* * Compounding is never used during session establish. */ - if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) { + if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) { struct kvec iov = { .iov_base = resp_iov[0].iov_base, .iov_len = resp_iov[0].iov_len diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 6b658a1172ef..41a611e76bb7 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -101,6 +101,7 @@ static int cifs_creation_time_set(unsigned int xid, struct cifs_tcon *pTcon, } static int cifs_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h index d5ebd36fb2cc..e7b27754ce78 100644 --- a/fs/coda/coda_linux.h +++ b/fs/coda/coda_linux.h @@ -46,10 +46,12 @@ extern const struct file_operations coda_ioctl_operations; /* operations shared over more than one file */ int coda_open(struct inode *i, struct file *f); int coda_release(struct inode *i, struct file *f); -int coda_permission(struct inode *inode, int mask); +int coda_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask); int coda_revalidate_inode(struct inode *); -int coda_getattr(const struct path *, struct kstat *, u32, unsigned int); -int coda_setattr(struct dentry *, struct iattr *); +int coda_getattr(struct user_namespace *, const struct path *, struct kstat *, + u32, unsigned int); +int coda_setattr(struct user_namespace *, struct dentry *, struct iattr *); /* this file: heloers */ char *coda_f2s(struct CodaFid *f); diff --git a/fs/coda/dir.c b/fs/coda/dir.c index ca40c2556ba6..d69989c1bac3 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -73,7 +73,8 @@ static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, unsig } -int coda_permission(struct inode *inode, int mask) +int coda_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask) { int error; @@ -132,7 +133,8 @@ static inline void coda_dir_drop_nlink(struct inode *dir) } /* creation routines: create, mknod, mkdir, link, symlink */ -static int coda_create(struct inode *dir, struct dentry *de, umode_t mode, bool excl) +static int coda_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *de, umode_t mode, bool excl) { int error; const char *name=de->d_name.name; @@ -164,7 +166,8 @@ err_out: return error; } -static int coda_mkdir(struct inode *dir, struct dentry *de, umode_t mode) +static int coda_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *de, umode_t mode) { struct inode *inode; struct coda_vattr attrs; @@ -225,7 +228,8 @@ static int coda_link(struct dentry *source_de, struct inode *dir_inode, } -static int coda_symlink(struct inode *dir_inode, struct dentry *de, +static int coda_symlink(struct user_namespace *mnt_userns, + struct inode *dir_inode, struct dentry *de, const char *symname) { const char *name = de->d_name.name; @@ -291,9 +295,9 @@ static int coda_rmdir(struct inode *dir, struct dentry *de) } /* rename */ -static int coda_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int coda_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { const char *old_name = old_dentry->d_name.name; const char *new_name = new_dentry->d_name.name; diff --git a/fs/coda/inode.c b/fs/coda/inode.c index b1c70e2b9b1e..d9f1bd7153df 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -251,16 +251,17 @@ static void coda_evict_inode(struct inode *inode) coda_cache_clear_inode(inode); } -int coda_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +int coda_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) { int err = coda_revalidate_inode(d_inode(path->dentry)); if (!err) - generic_fillattr(d_inode(path->dentry), stat); + generic_fillattr(&init_user_ns, d_inode(path->dentry), stat); return err; } -int coda_setattr(struct dentry *de, struct iattr *iattr) +int coda_setattr(struct user_namespace *mnt_userns, struct dentry *de, + struct iattr *iattr) { struct inode *inode = d_inode(de); struct coda_vattr vattr; diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index 3aec27e5eb82..cb9fd59a688c 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -24,7 +24,8 @@ #include "coda_linux.h" /* pioctl ops */ -static int coda_ioctl_permission(struct inode *inode, int mask); +static int coda_ioctl_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask); static long coda_pioctl(struct file *filp, unsigned int cmd, unsigned long user_data); @@ -40,7 +41,8 @@ const struct file_operations coda_ioctl_operations = { }; /* the coda pioctl inode ops */ -static int coda_ioctl_permission(struct inode *inode, int mask) +static int coda_ioctl_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { return (mask & MAY_EXEC) ? -EACCES : 0; } diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index 22dce2d35a4b..9a3aed249692 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -79,7 +79,8 @@ extern void configfs_hash_and_remove(struct dentry * dir, const char * name); extern const unsigned char * configfs_get_name(struct configfs_dirent *sd); extern void configfs_drop_dentry(struct configfs_dirent *sd, struct dentry *parent); -extern int configfs_setattr(struct dentry *dentry, struct iattr *iattr); +extern int configfs_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *iattr); extern struct dentry *configfs_pin_fs(void); extern void configfs_release_fs(void); @@ -92,7 +93,8 @@ extern const struct inode_operations configfs_root_inode_operations; extern const struct inode_operations configfs_symlink_inode_operations; extern const struct dentry_operations configfs_dentry_ops; -extern int configfs_symlink(struct inode *dir, struct dentry *dentry, +extern int configfs_symlink(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, const char *symname); extern int configfs_unlink(struct inode *dir, struct dentry *dentry); diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index b839dd1b459f..b6098e02e20b 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1268,7 +1268,8 @@ out_root_unlock: } EXPORT_SYMBOL(configfs_depend_item_unlocked); -static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int configfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { int ret = 0; int module_got = 0; diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 8bd6a883c94c..42c348bb2903 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -40,7 +40,8 @@ static const struct inode_operations configfs_inode_operations ={ .setattr = configfs_setattr, }; -int configfs_setattr(struct dentry * dentry, struct iattr * iattr) +int configfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *iattr) { struct inode * inode = d_inode(dentry); struct configfs_dirent * sd = dentry->d_fsdata; @@ -67,7 +68,7 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr) } /* attributes were changed atleast once in past */ - error = simple_setattr(dentry, iattr); + error = simple_setattr(mnt_userns, dentry, iattr); if (error) return error; diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index cb61467478ca..77c854364e60 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -139,7 +139,8 @@ static int get_target(const char *symname, struct path *path, } -int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +int configfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { int ret; struct path path; @@ -197,7 +198,8 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna if (dentry->d_inode || d_unhashed(dentry)) ret = -EEXIST; else - ret = inode_permission(dir, MAY_WRITE | MAY_EXEC); + ret = inode_permission(&init_user_ns, dir, + MAY_WRITE | MAY_EXEC); if (!ret) ret = type->ct_item_ops->allow_link(parent_item, target_item); if (!ret) { diff --git a/fs/coredump.c b/fs/coredump.c index a2f6ecc8e345..1c0fdc1aa70b 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -703,6 +703,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) goto close_fail; } } else { + struct user_namespace *mnt_userns; struct inode *inode; int open_flags = O_CREAT | O_RDWR | O_NOFOLLOW | O_LARGEFILE | O_EXCL; @@ -780,13 +781,15 @@ void do_coredump(const kernel_siginfo_t *siginfo) * a process dumps core while its cwd is e.g. on a vfat * filesystem. */ - if (!uid_eq(inode->i_uid, current_fsuid())) + mnt_userns = file_mnt_user_ns(cprm.file); + if (!uid_eq(i_uid_into_mnt(mnt_userns, inode), current_fsuid())) goto close_fail; if ((inode->i_mode & 0677) != 0600) goto close_fail; if (!(cprm.file->f_mode & FMODE_CAN_WRITE)) goto close_fail; - if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file)) + if (do_truncate(mnt_userns, cprm.file->f_path.dentry, + 0, 0, cprm.file)) goto close_fail; } @@ -894,10 +897,10 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, */ page = get_dump_page(addr); if (page) { - void *kaddr = kmap(page); + void *kaddr = kmap_local_page(page); stop = !dump_emit(cprm, kaddr, PAGE_SIZE); - kunmap(page); + kunmap_local(kaddr); put_page(page); } else { stop = !dump_skip(cprm, PAGE_SIZE); @@ -931,7 +934,8 @@ void dump_truncate(struct coredump_params *cprm) if (file->f_op->llseek && file->f_op->llseek != no_llseek) { offset = file->f_op->llseek(file, 0, SEEK_CUR); if (i_size_read(file->f_mapping->host) < offset) - do_truncate(file->f_path.dentry, offset, 0, file); + do_truncate(file_mnt_user_ns(file), file->f_path.dentry, + offset, 0, file); } } EXPORT_SYMBOL(dump_truncate); diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 4b90cfd1ec36..2be65269a987 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -392,8 +392,7 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) /* Don't map the last page if it contains some other data */ if (pgoff + pages == max_pages && cramfs_last_page_is_shared(inode)) { - pr_debug("mmap: %s: last page is shared\n", - file_dentry(file)->d_name.name); + pr_debug("mmap: %pD: last page is shared\n", file); pages--; } @@ -430,16 +429,15 @@ static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) } if (!ret) - pr_debug("mapped %s[%lu] at 0x%08lx (%u/%lu pages) " - "to vma 0x%08lx, page_prot 0x%llx\n", - file_dentry(file)->d_name.name, pgoff, - address, pages, vma_pages(vma), vma->vm_start, + pr_debug("mapped %pD[%lu] at 0x%08lx (%u/%lu pages) " + "to vma 0x%08lx, page_prot 0x%llx\n", file, + pgoff, address, pages, vma_pages(vma), vma->vm_start, (unsigned long long)pgprot_val(vma->vm_page_prot)); return ret; bailout: - pr_debug("%s[%lu]: direct mmap impossible: %s\n", - file_dentry(file)->d_name.name, pgoff, bailout_reason); + pr_debug("%pD[%lu]: direct mmap impossible: %s\n", + file, pgoff, bailout_reason); /* Didn't manage any direct map, but normal paging is still possible */ return 0; } @@ -469,8 +467,8 @@ static unsigned long cramfs_physmem_get_unmapped_area(struct file *file, if (!offset || block_pages != pages) return -ENOSYS; addr = sbi->linear_phys_addr + offset; - pr_debug("get_unmapped for %s ofs %#lx siz %lu at 0x%08lx\n", - file_dentry(file)->d_name.name, pgoff*PAGE_SIZE, len, addr); + pr_debug("get_unmapped for %pD ofs %#lx siz %lu at 0x%08lx\n", + file, pgoff*PAGE_SIZE, len, addr); return addr; } diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index a51cef6bd27f..ed3d623724cd 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -465,7 +465,7 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) return -EFAULT; policy.version = version; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EACCES; ret = mnt_want_write_file(filp); diff --git a/fs/dcache.c b/fs/dcache.c index 97e81a844a96..7d24ff7eb206 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -456,23 +456,6 @@ static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry, list_lru_isolate_move(lru, &dentry->d_lru, list); } -/** - * d_drop - drop a dentry - * @dentry: dentry to drop - * - * d_drop() unhashes the entry from the parent dentry hashes, so that it won't - * be found through a VFS lookup any more. Note that this is different from - * deleting the dentry - d_delete will try to mark the dentry negative if - * possible, giving a successful _negative_ lookup, while d_drop will - * just make the cache lookup fail. - * - * d_drop() is used mainly for stuff that wants to invalidate a dentry for some - * reason (NFS timeouts or autofs deletes). - * - * __d_drop requires dentry->d_lock - * ___d_drop doesn't mark dentry as "unhashed" - * (dentry->d_hash.pprev will be LIST_POISON2, not NULL). - */ static void ___d_drop(struct dentry *dentry) { struct hlist_bl_head *b; @@ -501,6 +484,24 @@ void __d_drop(struct dentry *dentry) } EXPORT_SYMBOL(__d_drop); +/** + * d_drop - drop a dentry + * @dentry: dentry to drop + * + * d_drop() unhashes the entry from the parent dentry hashes, so that it won't + * be found through a VFS lookup any more. Note that this is different from + * deleting the dentry - d_delete will try to mark the dentry negative if + * possible, giving a successful _negative_ lookup, while d_drop will + * just make the cache lookup fail. + * + * d_drop() is used mainly for stuff that wants to invalidate a dentry for some + * reason (NFS timeouts or autofs deletes). + * + * __d_drop requires dentry->d_lock + * + * ___d_drop doesn't mark dentry as "unhashed" + * (dentry->d_hash.pprev will be LIST_POISON2, not NULL). + */ void d_drop(struct dentry *dentry) { spin_lock(&dentry->d_lock); @@ -996,20 +997,6 @@ struct dentry *d_find_any_alias(struct inode *inode) } EXPORT_SYMBOL(d_find_any_alias); -/** - * d_find_alias - grab a hashed alias of inode - * @inode: inode in question - * - * If inode has a hashed alias, or is a directory and has any alias, - * acquire the reference to alias and return it. Otherwise return NULL. - * Notice that if inode is a directory there can be only one alias and - * it can be unhashed only if it has no children, or if it is the root - * of a filesystem, or if the directory was renamed and d_revalidate - * was the first vfs operation to notice. - * - * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer - * any other hashed alias over that one. - */ static struct dentry *__d_find_alias(struct inode *inode) { struct dentry *alias; @@ -1029,6 +1016,20 @@ static struct dentry *__d_find_alias(struct inode *inode) return NULL; } +/** + * d_find_alias - grab a hashed alias of inode + * @inode: inode in question + * + * If inode has a hashed alias, or is a directory and has any alias, + * acquire the reference to alias and return it. Otherwise return NULL. + * Notice that if inode is a directory there can be only one alias and + * it can be unhashed only if it has no children, or if it is the root + * of a filesystem, or if the directory was renamed and d_revalidate + * was the first vfs operation to notice. + * + * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer + * any other hashed alias over that one. + */ struct dentry *d_find_alias(struct inode *inode) { struct dentry *de = NULL; @@ -1043,6 +1044,31 @@ struct dentry *d_find_alias(struct inode *inode) EXPORT_SYMBOL(d_find_alias); /* + * Caller MUST be holding rcu_read_lock() and be guaranteed + * that inode won't get freed until rcu_read_unlock(). + */ +struct dentry *d_find_alias_rcu(struct inode *inode) +{ + struct hlist_head *l = &inode->i_dentry; + struct dentry *de = NULL; + + spin_lock(&inode->i_lock); + // ->i_dentry and ->i_rcu are colocated, but the latter won't be + // used without having I_FREEING set, which means no aliases left + if (likely(!(inode->i_state & I_FREEING) && !hlist_empty(l))) { + if (S_ISDIR(inode->i_mode)) { + de = hlist_entry(l->first, struct dentry, d_u.d_alias); + } else { + hlist_for_each_entry(de, l, d_u.d_alias) + if (!d_unhashed(de)) + break; + } + } + spin_unlock(&inode->i_lock); + return de; +} + +/* * Try to kill dentries associated with this inode. * WARNING: you must own a reference to inode. */ @@ -2150,8 +2176,8 @@ EXPORT_SYMBOL(d_obtain_root); * same inode, only the actual correct case is stored in the dcache for * case-insensitive filesystems. * - * For a case-insensitive lookup match and if the the case-exact dentry - * already exists in in the dcache, use it and return it. + * For a case-insensitive lookup match and if the case-exact dentry + * already exists in the dcache, use it and return it. * * If no entry exists with the exact case name, allocate new dentry with * the exact case, and return the spliced entry. diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 2fcf66473436..22e86ae4dd5a 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -42,13 +42,14 @@ static unsigned int debugfs_allow = DEFAULT_DEBUGFS_ALLOW_BITS; * so that we can use the file mode as part of a heuristic to determine whether * to lock down individual files. */ -static int debugfs_setattr(struct dentry *dentry, struct iattr *ia) +static int debugfs_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *ia) { int ret = security_locked_down(LOCKDOWN_DEBUGFS); if (ret && (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))) return ret; - return simple_setattr(dentry, ia); + return simple_setattr(&init_user_ns, dentry, ia); } static const struct inode_operations debugfs_file_inode_operations = { @@ -297,7 +298,7 @@ struct dentry *debugfs_lookup(const char *name, struct dentry *parent) { struct dentry *dentry; - if (IS_ERR(parent)) + if (!debugfs_initialized() || IS_ERR_OR_NULL(name) || IS_ERR(parent)) return NULL; if (!parent) @@ -318,6 +319,9 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) if (!(debugfs_allow & DEBUGFS_ALLOW_API)) return ERR_PTR(-EPERM); + if (!debugfs_initialized()) + return ERR_PTR(-ENOENT); + pr_debug("creating file '%s'\n", name); if (IS_ERR(parent)) @@ -775,8 +779,8 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, take_dentry_name_snapshot(&old_name, old_dentry); - error = simple_rename(d_inode(old_dir), old_dentry, d_inode(new_dir), - dentry, 0); + error = simple_rename(&init_user_ns, d_inode(old_dir), old_dentry, + d_inode(new_dir), dentry, 0); if (error) { release_dentry_name_snapshot(&old_name); goto exit; diff --git a/fs/direct-io.c b/fs/direct-io.c index aa1083ecd623..b61491bf3166 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -462,7 +462,7 @@ static inline void dio_cleanup(struct dio *dio, struct dio_submit *sdio) * Wait for the next BIO to complete. Remove it and return it. NULL is * returned once all BIOs have been completed. This must only be called once * all bios have been issued so that dio->refcount can only decrease. This - * requires that that the caller hold a reference on the dio. + * requires that the caller hold a reference on the dio. */ static struct bio *dio_await_one(struct dio *dio) { @@ -695,7 +695,7 @@ static inline int dio_new_bio(struct dio *dio, struct dio_submit *sdio, if (ret) goto out; sector = start_sector << (sdio->blkbits - 9); - nr_pages = min(sdio->pages_in_io, BIO_MAX_PAGES); + nr_pages = bio_max_segs(sdio->pages_in_io); BUG_ON(nr_pages <= 0); dio_bio_alloc(dio, sdio, map_bh->b_bdev, sector, nr_pages); sdio->boundary = 0; @@ -1279,7 +1279,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, if (retval == -ENOTBLK) { /* * The remaining part of the request will be - * be handled by buffered I/O when we return + * handled by buffered I/O when we return */ retval = 0; } diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 0681540c48d9..943e523f4c9d 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1110,8 +1110,8 @@ ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry, } inode_lock(lower_inode); - rc = __vfs_setxattr(lower_dentry, lower_inode, ECRYPTFS_XATTR_NAME, - page_virt, size, 0); + rc = __vfs_setxattr(&init_user_ns, lower_dentry, lower_inode, + ECRYPTFS_XATTR_NAME, page_virt, size, 0); if (!rc && ecryptfs_inode) fsstack_copy_attr_all(ecryptfs_inode, lower_inode); inode_unlock(lower_inode); diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 58d0f7187997..18e9285fbb4c 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -141,7 +141,8 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry, else if (d_unhashed(lower_dentry)) rc = -EINVAL; else - rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL); + rc = vfs_unlink(&init_user_ns, lower_dir_inode, lower_dentry, + NULL); if (rc) { printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc); goto out_unlock; @@ -180,7 +181,8 @@ ecryptfs_do_create(struct inode *directory_inode, lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); lower_dir_dentry = lock_parent(lower_dentry); - rc = vfs_create(d_inode(lower_dir_dentry), lower_dentry, mode, true); + rc = vfs_create(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry, + mode, true); if (rc) { printk(KERN_ERR "%s: Failure to create dentry in lower fs; " "rc = [%d]\n", __func__, rc); @@ -190,7 +192,8 @@ ecryptfs_do_create(struct inode *directory_inode, inode = __ecryptfs_get_inode(d_inode(lower_dentry), directory_inode->i_sb); if (IS_ERR(inode)) { - vfs_unlink(d_inode(lower_dir_dentry), lower_dentry, NULL); + vfs_unlink(&init_user_ns, d_inode(lower_dir_dentry), + lower_dentry, NULL); goto out_lock; } fsstack_copy_attr_times(directory_inode, d_inode(lower_dir_dentry)); @@ -254,7 +257,8 @@ out: * Returns zero on success; non-zero on error condition */ static int -ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, +ecryptfs_create(struct user_namespace *mnt_userns, + struct inode *directory_inode, struct dentry *ecryptfs_dentry, umode_t mode, bool excl) { struct inode *ecryptfs_inode; @@ -436,8 +440,8 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, dget(lower_old_dentry); dget(lower_new_dentry); lower_dir_dentry = lock_parent(lower_new_dentry); - rc = vfs_link(lower_old_dentry, d_inode(lower_dir_dentry), - lower_new_dentry, NULL); + rc = vfs_link(lower_old_dentry, &init_user_ns, + d_inode(lower_dir_dentry), lower_new_dentry, NULL); if (rc || d_really_is_negative(lower_new_dentry)) goto out_lock; rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb); @@ -460,7 +464,8 @@ static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry) return ecryptfs_do_unlink(dir, dentry, d_inode(dentry)); } -static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, +static int ecryptfs_symlink(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, const char *symname) { int rc; @@ -481,7 +486,7 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, strlen(symname)); if (rc) goto out_lock; - rc = vfs_symlink(d_inode(lower_dir_dentry), lower_dentry, + rc = vfs_symlink(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry, encoded_symname); kfree(encoded_symname); if (rc || d_really_is_negative(lower_dentry)) @@ -499,7 +504,8 @@ out_lock: return rc; } -static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int ecryptfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { int rc; struct dentry *lower_dentry; @@ -507,7 +513,8 @@ static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode lower_dentry = ecryptfs_dentry_to_lower(dentry); lower_dir_dentry = lock_parent(lower_dentry); - rc = vfs_mkdir(d_inode(lower_dir_dentry), lower_dentry, mode); + rc = vfs_mkdir(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry, + mode); if (rc || d_really_is_negative(lower_dentry)) goto out; rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); @@ -541,7 +548,7 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) else if (d_unhashed(lower_dentry)) rc = -EINVAL; else - rc = vfs_rmdir(lower_dir_inode, lower_dentry); + rc = vfs_rmdir(&init_user_ns, lower_dir_inode, lower_dentry); if (!rc) { clear_nlink(d_inode(dentry)); fsstack_copy_attr_times(dir, lower_dir_inode); @@ -555,7 +562,8 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) } static int -ecryptfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) +ecryptfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t dev) { int rc; struct dentry *lower_dentry; @@ -563,7 +571,8 @@ ecryptfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev lower_dentry = ecryptfs_dentry_to_lower(dentry); lower_dir_dentry = lock_parent(lower_dentry); - rc = vfs_mknod(d_inode(lower_dir_dentry), lower_dentry, mode, dev); + rc = vfs_mknod(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry, + mode, dev); if (rc || d_really_is_negative(lower_dentry)) goto out; rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); @@ -579,9 +588,9 @@ out: } static int -ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +ecryptfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { int rc; struct dentry *lower_old_dentry; @@ -590,6 +599,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct dentry *lower_new_dir_dentry; struct dentry *trap; struct inode *target_inode; + struct renamedata rd = {}; if (flags) return -EINVAL; @@ -619,9 +629,14 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, rc = -ENOTEMPTY; goto out_lock; } - rc = vfs_rename(d_inode(lower_old_dir_dentry), lower_old_dentry, - d_inode(lower_new_dir_dentry), lower_new_dentry, - NULL, 0); + + rd.old_mnt_userns = &init_user_ns; + rd.old_dir = d_inode(lower_old_dir_dentry); + rd.old_dentry = lower_old_dentry; + rd.new_mnt_userns = &init_user_ns; + rd.new_dir = d_inode(lower_new_dir_dentry); + rd.new_dentry = lower_new_dentry; + rc = vfs_rename(&rd); if (rc) goto out_lock; if (target_inode) @@ -855,16 +870,19 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); inode_lock(d_inode(lower_dentry)); - rc = notify_change(lower_dentry, &lower_ia, NULL); + rc = notify_change(&init_user_ns, lower_dentry, + &lower_ia, NULL); inode_unlock(d_inode(lower_dentry)); } return rc; } static int -ecryptfs_permission(struct inode *inode, int mask) +ecryptfs_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask) { - return inode_permission(ecryptfs_inode_to_lower(inode), mask); + return inode_permission(&init_user_ns, + ecryptfs_inode_to_lower(inode), mask); } /** @@ -879,7 +897,8 @@ ecryptfs_permission(struct inode *inode, int mask) * All other metadata changes will be passed right to the lower filesystem, * and we will just update our inode to look like the lower. */ -static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) +static int ecryptfs_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *ia) { int rc = 0; struct dentry *lower_dentry; @@ -933,7 +952,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) } mutex_unlock(&crypt_stat->cs_mutex); - rc = setattr_prepare(dentry, ia); + rc = setattr_prepare(&init_user_ns, dentry, ia); if (rc) goto out; if (ia->ia_valid & ATTR_SIZE) { @@ -959,14 +978,15 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) lower_ia.ia_valid &= ~ATTR_MODE; inode_lock(d_inode(lower_dentry)); - rc = notify_change(lower_dentry, &lower_ia, NULL); + rc = notify_change(&init_user_ns, lower_dentry, &lower_ia, NULL); inode_unlock(d_inode(lower_dentry)); out: fsstack_copy_attr_all(inode, lower_inode); return rc; } -static int ecryptfs_getattr_link(const struct path *path, struct kstat *stat, +static int ecryptfs_getattr_link(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; @@ -975,7 +995,7 @@ static int ecryptfs_getattr_link(const struct path *path, struct kstat *stat, mount_crypt_stat = &ecryptfs_superblock_to_private( dentry->d_sb)->mount_crypt_stat; - generic_fillattr(d_inode(dentry), stat); + generic_fillattr(&init_user_ns, d_inode(dentry), stat); if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) { char *target; size_t targetsiz; @@ -991,7 +1011,8 @@ static int ecryptfs_getattr_link(const struct path *path, struct kstat *stat, return rc; } -static int ecryptfs_getattr(const struct path *path, struct kstat *stat, +static int ecryptfs_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; @@ -1003,7 +1024,7 @@ static int ecryptfs_getattr(const struct path *path, struct kstat *stat, if (!rc) { fsstack_copy_attr_all(d_inode(dentry), ecryptfs_inode_to_lower(d_inode(dentry))); - generic_fillattr(d_inode(dentry), stat); + generic_fillattr(&init_user_ns, d_inode(dentry), stat); stat->blocks = lower_stat.blocks; } return rc; @@ -1025,7 +1046,7 @@ ecryptfs_setxattr(struct dentry *dentry, struct inode *inode, goto out; } inode_lock(lower_inode); - rc = __vfs_setxattr_locked(lower_dentry, name, value, size, flags, NULL); + rc = __vfs_setxattr_locked(&init_user_ns, lower_dentry, name, value, size, flags, NULL); inode_unlock(lower_inode); if (!rc && inode) fsstack_copy_attr_all(inode, lower_inode); @@ -1091,7 +1112,7 @@ static int ecryptfs_removexattr(struct dentry *dentry, struct inode *inode, goto out; } inode_lock(lower_inode); - rc = __vfs_removexattr(lower_dentry, name); + rc = __vfs_removexattr(&init_user_ns, lower_dentry, name); inode_unlock(lower_inode); out: return rc; @@ -1135,6 +1156,7 @@ static int ecryptfs_xattr_get(const struct xattr_handler *handler, } static int ecryptfs_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index e63259fdef28..cdf40a54a35d 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -531,6 +531,12 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags goto out_free; } + if (mnt_user_ns(path.mnt) != &init_user_ns) { + rc = -EINVAL; + printk(KERN_ERR "Mounting on idmapped mounts currently disallowed\n"); + goto out_free; + } + if (check_ruid && !uid_eq(d_inode(path.dentry)->i_uid, current_uid())) { rc = -EPERM; printk(KERN_ERR "Mount of device (uid: %d) not owned by " diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 019572c6b39a..2f333a40ff4d 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -426,8 +426,8 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) if (size < 0) size = 8; put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt); - rc = __vfs_setxattr(lower_dentry, lower_inode, ECRYPTFS_XATTR_NAME, - xattr_virt, size, 0); + rc = __vfs_setxattr(&init_user_ns, lower_dentry, lower_inode, + ECRYPTFS_XATTR_NAME, xattr_virt, size, 0); inode_unlock(lower_inode); if (rc) printk(KERN_ERR "Error whilst attempting to write inode size " diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index feaa5e182b7b..e6bc0302643b 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -137,7 +137,7 @@ efivarfs_ioc_setxflags(struct file *file, void __user *arg) unsigned int oldflags = efivarfs_getflags(inode); int error; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EACCES; if (copy_from_user(&flags, arg, sizeof(flags))) diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 0297ad95eb5c..14e2947975fd 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -66,8 +66,8 @@ bool efivarfs_valid_name(const char *str, int len) return uuid_is_valid(s); } -static int efivarfs_create(struct inode *dir, struct dentry *dentry, - umode_t mode, bool excl) +static int efivarfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode = NULL; struct efivar_entry *var; diff --git a/fs/erofs/data.c b/fs/erofs/data.c index ea4f693bee22..f88851c5c250 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -215,10 +215,8 @@ submit_bio_retry: /* max # of continuous pages */ if (nblocks > DIV_ROUND_UP(map.m_plen, PAGE_SIZE)) nblocks = DIV_ROUND_UP(map.m_plen, PAGE_SIZE); - if (nblocks > BIO_MAX_PAGES) - nblocks = BIO_MAX_PAGES; - bio = bio_alloc(GFP_NOIO, nblocks); + bio = bio_alloc(GFP_NOIO, bio_max_segs(nblocks)); bio->bi_end_io = erofs_readendio; bio_set_dev(bio, sb->s_bdev); diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 3e21c0e8adae..119fdce1b520 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -331,8 +331,9 @@ struct inode *erofs_iget(struct super_block *sb, return inode; } -int erofs_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags) +int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) { struct inode *const inode = d_inode(path->dentry); @@ -343,7 +344,7 @@ int erofs_getattr(const struct path *path, struct kstat *stat, stat->attributes_mask |= (STATX_ATTR_COMPRESSED | STATX_ATTR_IMMUTABLE); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); return 0; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 67a7ec945686..351dae524a0c 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -373,8 +373,9 @@ extern const struct inode_operations erofs_symlink_iops; extern const struct inode_operations erofs_fast_symlink_iops; struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid, bool dir); -int erofs_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags); +int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags); /* namei.c */ extern const struct inode_operations erofs_dir_iops; diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index 5f8cc7346c69..3a81e1f7fc06 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -234,8 +234,8 @@ static struct dentry *erofs_lookup(struct inode *dir, } else if (err) { inode = ERR_PTR(err); } else { - erofs_dbg("%s, %s (nid %llu) found, d_type %u", __func__, - dentry->d_name.name, nid, d_type); + erofs_dbg("%s, %pd (nid %llu) found, d_type %u", __func__, + dentry, nid, d_type); inode = erofs_iget(dir->i_sb, nid, d_type == FT_DIR); } return d_splice_alias(inode, dentry); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index a829af074eb5..3196474cbe24 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -979,7 +979,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) return epir; } -#ifdef CONFIG_CHECKPOINT_RESTORE +#ifdef CONFIG_KCMP static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long toff) { struct rb_node *rbp; @@ -1021,7 +1021,7 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, return file_raw; } -#endif /* CONFIG_CHECKPOINT_RESTORE */ +#endif /* CONFIG_KCMP */ /** * Adds a new entry to the tail of the list in a lockless way, i.e. diff --git a/fs/exec.c b/fs/exec.c index 5a853f03c233..18594f11c31f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1404,14 +1404,15 @@ EXPORT_SYMBOL(begin_new_exec); void would_dump(struct linux_binprm *bprm, struct file *file) { struct inode *inode = file_inode(file); - if (inode_permission(inode, MAY_READ) < 0) { + struct user_namespace *mnt_userns = file_mnt_user_ns(file); + if (inode_permission(mnt_userns, inode, MAY_READ) < 0) { struct user_namespace *old, *user_ns; bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; /* Ensure mm->user_ns contains the executable */ user_ns = old = bprm->mm->user_ns; while ((user_ns != &init_user_ns) && - !privileged_wrt_inode_uidgid(user_ns, inode)) + !privileged_wrt_inode_uidgid(user_ns, mnt_userns, inode)) user_ns = user_ns->parent; if (old != user_ns) { @@ -1454,7 +1455,7 @@ EXPORT_SYMBOL(finalize_exec); /* * Prepare credentials and lock ->cred_guard_mutex. * setup_new_exec() commits the new creds and drops the lock. - * Or, if exec fails before, free_bprm() should release ->cred and + * Or, if exec fails before, free_bprm() should release ->cred * and unlock. */ static int prepare_bprm_creds(struct linux_binprm *bprm) @@ -1579,6 +1580,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm) static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) { /* Handle suid and sgid on files */ + struct user_namespace *mnt_userns; struct inode *inode; unsigned int mode; kuid_t uid; @@ -1595,13 +1597,15 @@ static void bprm_fill_uid(struct linux_binprm *bprm, struct file *file) if (!(mode & (S_ISUID|S_ISGID))) return; + mnt_userns = file_mnt_user_ns(file); + /* Be careful if suid/sgid is set */ inode_lock(inode); /* reload atomically mode/uid/gid now that lock held */ mode = inode->i_mode; - uid = inode->i_uid; - gid = inode->i_gid; + uid = i_uid_into_mnt(mnt_userns, inode); + gid = i_gid_into_mnt(mnt_userns, inode); inode_unlock(inode); /* We ignore suid/sgid if there are no mappings for them in the ns */ @@ -1837,7 +1841,7 @@ static int bprm_execve(struct linux_binprm *bprm, out: /* - * If past the point of no return ensure the the code never + * If past the point of no return ensure the code never * returns to the userspace process. Use an existing fatal * signal if present otherwise terminate the process with * SIGSEGV. diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c index a987919686c0..761c79c3a4ba 100644 --- a/fs/exfat/balloc.c +++ b/fs/exfat/balloc.c @@ -166,7 +166,7 @@ int exfat_set_bitmap(struct inode *inode, unsigned int clu) * If the value of "clu" is 0, it means cluster 2 which is the first cluster of * the cluster heap. */ -void exfat_clear_bitmap(struct inode *inode, unsigned int clu) +void exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync) { int i, b; unsigned int ent_idx; @@ -180,7 +180,7 @@ void exfat_clear_bitmap(struct inode *inode, unsigned int clu) b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx); clear_bit_le(b, sbi->vol_amap[i]->b_data); - exfat_update_bh(sbi->vol_amap[i], IS_DIRSYNC(inode)); + exfat_update_bh(sbi->vol_amap[i], sync); if (opts->discard) { int ret_discard; diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index b8f0e829ecbd..fa21421a14d9 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -408,7 +408,7 @@ int exfat_count_num_clusters(struct super_block *sb, int exfat_load_bitmap(struct super_block *sb); void exfat_free_bitmap(struct exfat_sb_info *sbi); int exfat_set_bitmap(struct inode *inode, unsigned int clu); -void exfat_clear_bitmap(struct inode *inode, unsigned int clu); +void exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync); unsigned int exfat_find_free_bitmap(struct super_block *sb, unsigned int clu); int exfat_count_used_clusters(struct super_block *sb, unsigned int *ret_count); @@ -416,9 +416,11 @@ int exfat_count_used_clusters(struct super_block *sb, unsigned int *ret_count); extern const struct file_operations exfat_file_operations; int __exfat_truncate(struct inode *inode, loff_t new_size); void exfat_truncate(struct inode *inode, loff_t size); -int exfat_setattr(struct dentry *dentry, struct iattr *attr); -int exfat_getattr(const struct path *path, struct kstat *stat, - unsigned int request_mask, unsigned int query_flags); +int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr); +int exfat_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, unsigned int request_mask, + unsigned int query_flags); int exfat_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); /* namei.c */ diff --git a/fs/exfat/exfat_raw.h b/fs/exfat/exfat_raw.h index 6aec6288e1f2..7f39b1c6469c 100644 --- a/fs/exfat/exfat_raw.h +++ b/fs/exfat/exfat_raw.h @@ -77,6 +77,10 @@ #define EXFAT_FILE_NAME_LEN 15 +#define EXFAT_MIN_SECT_SIZE_BITS 9 +#define EXFAT_MAX_SECT_SIZE_BITS 12 +#define EXFAT_MAX_SECT_PER_CLUS_BITS(x) (25 - (x)->sect_size_bits) + /* EXFAT: Main and Backup Boot Sector (512 bytes) */ struct boot_sector { __u8 jmp_boot[BOOTSEC_JUMP_BOOT_LEN]; diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c index c3c9afee7418..7b2e8af17193 100644 --- a/fs/exfat/fatent.c +++ b/fs/exfat/fatent.c @@ -157,6 +157,7 @@ int exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain) unsigned int clu; struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); + int cur_cmap_i, next_cmap_i; /* invalid cluster number */ if (p_chain->dir == EXFAT_FREE_CLUSTER || @@ -176,21 +177,51 @@ int exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain) clu = p_chain->dir; + cur_cmap_i = next_cmap_i = + BITMAP_OFFSET_SECTOR_INDEX(sb, CLUSTER_TO_BITMAP_ENT(clu)); + if (p_chain->flags == ALLOC_NO_FAT_CHAIN) { + unsigned int last_cluster = p_chain->dir + p_chain->size - 1; do { - exfat_clear_bitmap(inode, clu); - clu++; + bool sync = false; + + if (clu < last_cluster) + next_cmap_i = + BITMAP_OFFSET_SECTOR_INDEX(sb, CLUSTER_TO_BITMAP_ENT(clu+1)); + /* flush bitmap only if index would be changed or for last cluster */ + if (clu == last_cluster || cur_cmap_i != next_cmap_i) { + sync = true; + cur_cmap_i = next_cmap_i; + } + + exfat_clear_bitmap(inode, clu, (sync && IS_DIRSYNC(inode))); + clu++; num_clusters++; } while (num_clusters < p_chain->size); } else { do { - exfat_clear_bitmap(inode, clu); - - if (exfat_get_next_cluster(sb, &clu)) - goto dec_used_clus; + bool sync = false; + unsigned int n_clu = clu; + int err = exfat_get_next_cluster(sb, &n_clu); + + if (err || n_clu == EXFAT_EOF_CLUSTER) + sync = true; + else + next_cmap_i = + BITMAP_OFFSET_SECTOR_INDEX(sb, CLUSTER_TO_BITMAP_ENT(n_clu)); + + if (cur_cmap_i != next_cmap_i) { + sync = true; + cur_cmap_i = next_cmap_i; + } + exfat_clear_bitmap(inode, clu, (sync && IS_DIRSYNC(inode))); + clu = n_clu; num_clusters++; + + if (err) + goto dec_used_clus; } while (clu != EXFAT_EOF_CLUSTER); } diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 183ffdf4d43c..f783cf38dd8e 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -267,13 +267,14 @@ write_size: mutex_unlock(&sbi->s_lock); } -int exfat_getattr(const struct path *path, struct kstat *stat, - unsigned int request_mask, unsigned int query_flags) +int exfat_getattr(struct user_namespace *mnt_uerns, const struct path *path, + struct kstat *stat, unsigned int request_mask, + unsigned int query_flags) { struct inode *inode = d_backing_inode(path->dentry); struct exfat_inode_info *ei = EXFAT_I(inode); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); exfat_truncate_atime(&stat->atime); stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = ei->i_crtime.tv_sec; @@ -282,7 +283,8 @@ int exfat_getattr(const struct path *path, struct kstat *stat, return 0; } -int exfat_setattr(struct dentry *dentry, struct iattr *attr) +int exfat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct exfat_sb_info *sbi = EXFAT_SB(dentry->d_sb); struct inode *inode = dentry->d_inode; @@ -305,7 +307,7 @@ int exfat_setattr(struct dentry *dentry, struct iattr *attr) ATTR_TIMES_SET); } - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); attr->ia_valid = ia_valid; if (error) goto out; @@ -340,7 +342,7 @@ int exfat_setattr(struct dentry *dentry, struct iattr *attr) up_write(&EXFAT_I(inode)->truncate_lock); } - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); exfat_truncate_atime(&inode->i_atime); mark_inode_dirty(inode); diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 2932b23a3b6c..d9e8ec689c55 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -541,8 +541,8 @@ out: return ret; } -static int exfat_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int exfat_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct super_block *sb = dir->i_sb; struct inode *inode; @@ -827,7 +827,8 @@ unlock: return err; } -static int exfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int exfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct super_block *sb = dir->i_sb; struct inode *inode; @@ -1318,9 +1319,10 @@ out: return ret; } -static int exfat_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int exfat_rename(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct inode *old_inode, *new_inode; struct super_block *sb = old_dir->i_sb; diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 87be5bfc31eb..c6d8d2e53486 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -381,8 +381,7 @@ static int exfat_calibrate_blocksize(struct super_block *sb, int logical_sect) { struct exfat_sb_info *sbi = EXFAT_SB(sb); - if (!is_power_of_2(logical_sect) || - logical_sect < 512 || logical_sect > 4096) { + if (!is_power_of_2(logical_sect)) { exfat_err(sb, "bogus logical sector size %u", logical_sect); return -EIO; } @@ -451,6 +450,25 @@ static int exfat_read_boot_sector(struct super_block *sb) return -EINVAL; } + /* + * sect_size_bits could be at least 9 and at most 12. + */ + if (p_boot->sect_size_bits < EXFAT_MIN_SECT_SIZE_BITS || + p_boot->sect_size_bits > EXFAT_MAX_SECT_SIZE_BITS) { + exfat_err(sb, "bogus sector size bits : %u\n", + p_boot->sect_size_bits); + return -EINVAL; + } + + /* + * sect_per_clus_bits could be at least 0 and at most 25 - sect_size_bits. + */ + if (p_boot->sect_per_clus_bits > EXFAT_MAX_SECT_PER_CLUS_BITS(p_boot)) { + exfat_err(sb, "bogus sectors bits per cluster : %u\n", + p_boot->sect_per_clus_bits); + return -EINVAL; + } + sbi->sect_per_clus = 1 << p_boot->sect_per_clus_bits; sbi->sect_per_clus_bits = p_boot->sect_per_clus_bits; sbi->cluster_size_bits = p_boot->sect_per_clus_bits + @@ -477,16 +495,19 @@ static int exfat_read_boot_sector(struct super_block *sb) sbi->used_clusters = EXFAT_CLUSTERS_UNTRACKED; /* check consistencies */ - if (sbi->num_FAT_sectors << p_boot->sect_size_bits < - sbi->num_clusters * 4) { + if ((u64)sbi->num_FAT_sectors << p_boot->sect_size_bits < + (u64)sbi->num_clusters * 4) { exfat_err(sb, "bogus fat length"); return -EINVAL; } + if (sbi->data_start_sector < - sbi->FAT1_start_sector + sbi->num_FAT_sectors * p_boot->num_fats) { + (u64)sbi->FAT1_start_sector + + (u64)sbi->num_FAT_sectors * p_boot->num_fats) { exfat_err(sb, "bogus data start sector"); return -EINVAL; } + if (sbi->vol_flags & VOLUME_DIRTY) exfat_warn(sb, "Volume was not properly unmounted. Some data may be corrupt. Please run fsck."); if (sbi->vol_flags & MEDIA_FAILURE) diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index cf4c77f8dd08..b9a9db98e94b 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -216,14 +216,16 @@ __ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) * inode->i_mutex: down */ int -ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type) +ext2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { int error; int update_mode = 0; umode_t mode = inode->i_mode; if (type == ACL_TYPE_ACCESS && acl) { - error = posix_acl_update_mode(inode, &mode, &acl); + error = posix_acl_update_mode(&init_user_ns, inode, &mode, + &acl); if (error) return error; update_mode = 1; diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index 0f01c759daac..917db5f6630a 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -56,7 +56,8 @@ static inline int ext2_acl_count(size_t size) /* acl.c */ extern struct posix_acl *ext2_get_acl(struct inode *inode, int type); -extern int ext2_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int ext2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); extern int ext2_init_acl (struct inode *, struct inode *); #else diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 2a4175fbaf5e..3309fb2d327a 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -764,8 +764,9 @@ extern struct inode *ext2_iget (struct super_block *, unsigned long); extern int ext2_write_inode (struct inode *, struct writeback_control *); extern void ext2_evict_inode(struct inode *); extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); -extern int ext2_setattr (struct dentry *, struct iattr *); -extern int ext2_getattr (const struct path *, struct kstat *, u32, unsigned int); +extern int ext2_setattr (struct user_namespace *, struct dentry *, struct iattr *); +extern int ext2_getattr (struct user_namespace *, const struct path *, + struct kstat *, u32, unsigned int); extern void ext2_set_inode_flags(struct inode *inode); extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 432c3febea6d..df14e750e9fe 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -551,7 +551,7 @@ got: inode->i_uid = current_fsuid(); inode->i_gid = dir->i_gid; } else - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); inode->i_ino = ino; inode->i_blocks = 0; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 78c417d3c898..68178b2234bd 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1638,8 +1638,8 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc) return __ext2_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); } -int ext2_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags) +int ext2_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct ext2_inode_info *ei = EXT2_I(inode); @@ -1660,16 +1660,17 @@ int ext2_getattr(const struct path *path, struct kstat *stat, STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); return 0; } -int ext2_setattr(struct dentry *dentry, struct iattr *iattr) +int ext2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *iattr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(dentry, iattr); + error = setattr_prepare(&init_user_ns, dentry, iattr); if (error) return error; @@ -1689,9 +1690,9 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr) if (error) return error; } - setattr_copy(inode, iattr); + setattr_copy(&init_user_ns, inode, iattr); if (iattr->ia_valid & ATTR_MODE) - error = posix_acl_chmod(inode, inode->i_mode); + error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); mark_inode_dirty(inode); return error; diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 32a8d10b579d..b399cbb7022d 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -39,7 +39,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (ret) return ret; - if (!inode_owner_or_capable(inode)) { + if (!inode_owner_or_capable(&init_user_ns, inode)) { ret = -EACCES; goto setflags_out; } @@ -84,7 +84,7 @@ setflags_out: case EXT2_IOC_SETVERSION: { __u32 generation; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EPERM; ret = mnt_want_write_file(filp); if (ret) @@ -117,7 +117,7 @@ setversion_out: if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) return -ENOTTY; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EACCES; if (get_user(rsv_window_size, (int __user *)arg)) diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index ea980f1e2e99..3367384d344d 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -100,7 +100,9 @@ struct dentry *ext2_get_parent(struct dentry *child) * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode, bool excl) +static int ext2_create (struct user_namespace * mnt_userns, + struct inode * dir, struct dentry * dentry, + umode_t mode, bool excl) { struct inode *inode; int err; @@ -118,7 +120,8 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode return ext2_add_nondir(dentry, inode); } -static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +static int ext2_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode = ext2_new_inode(dir, mode, NULL); if (IS_ERR(inode)) @@ -131,7 +134,8 @@ static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) return 0; } -static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev) +static int ext2_mknod (struct user_namespace * mnt_userns, struct inode * dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode * inode; int err; @@ -151,8 +155,8 @@ static int ext2_mknod (struct inode * dir, struct dentry *dentry, umode_t mode, return err; } -static int ext2_symlink (struct inode * dir, struct dentry * dentry, - const char * symname) +static int ext2_symlink (struct user_namespace * mnt_userns, struct inode * dir, + struct dentry * dentry, const char * symname) { struct super_block * sb = dir->i_sb; int err = -ENAMETOOLONG; @@ -225,7 +229,8 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir, return err; } -static int ext2_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) +static int ext2_mkdir(struct user_namespace * mnt_userns, + struct inode * dir, struct dentry * dentry, umode_t mode) { struct inode * inode; int err; @@ -315,8 +320,9 @@ static int ext2_rmdir (struct inode * dir, struct dentry *dentry) return err; } -static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, - struct inode * new_dir, struct dentry * new_dentry, +static int ext2_rename (struct user_namespace * mnt_userns, + struct inode * old_dir, struct dentry * old_dentry, + struct inode * new_dir, struct dentry * new_dentry, unsigned int flags) { struct inode * old_inode = d_inode(old_dentry); diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index 9a682e440acb..ebade1f52451 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -19,6 +19,7 @@ ext2_xattr_security_get(const struct xattr_handler *handler, static int ext2_xattr_security_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 49add1107850..18a87d5dd1ab 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -26,6 +26,7 @@ ext2_xattr_trusted_get(const struct xattr_handler *handler, static int ext2_xattr_trusted_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index c243a3b4d69d..58092449f8ff 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -30,6 +30,7 @@ ext2_xattr_user_get(const struct xattr_handler *handler, static int ext2_xattr_user_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ext4/.kunitconfig b/fs/ext4/.kunitconfig new file mode 100644 index 000000000000..bf51da7cd9fc --- /dev/null +++ b/fs/ext4/.kunitconfig @@ -0,0 +1,3 @@ +CONFIG_KUNIT=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_KUNIT_TESTS=y diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 619dd35ddd48..86699c8cab28 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -103,8 +103,7 @@ config EXT4_DEBUG config EXT4_KUNIT_TESTS tristate "KUnit tests for ext4" if !KUNIT_ALL_TESTS - select EXT4_FS - depends on KUNIT + depends on EXT4_FS && KUNIT default KUNIT_ALL_TESTS help This builds the ext4 KUnit tests. diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 68aaed48315f..c5eaffccecc3 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -222,7 +222,8 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, } int -ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type) +ext4_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { handle_t *handle; int error, credits, retries = 0; @@ -245,7 +246,7 @@ retry: ext4_fc_start_update(inode); if ((type == ACL_TYPE_ACCESS) && acl) { - error = posix_acl_update_mode(inode, &mode, &acl); + error = posix_acl_update_mode(mnt_userns, inode, &mode, &acl); if (error) goto out_stop; if (mode != inode->i_mode) diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 9b63f5416a2f..84b8942a57f2 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -56,7 +56,8 @@ static inline int ext4_acl_count(size_t size) /* acl.c */ struct posix_acl *ext4_get_acl(struct inode *inode, int type); -int ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int ext4_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT4_FS_POSIX_ACL */ diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 2866d249f3d2..644fd69185d3 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2755,18 +2755,19 @@ extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len, /* ialloc.c */ extern int ext4_mark_inode_used(struct super_block *sb, int ino); -extern struct inode *__ext4_new_inode(handle_t *, struct inode *, umode_t, +extern struct inode *__ext4_new_inode(struct user_namespace *, handle_t *, + struct inode *, umode_t, const struct qstr *qstr, __u32 goal, uid_t *owner, __u32 i_flags, int handle_type, unsigned int line_no, int nblocks); -#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \ - __ext4_new_inode((handle), (dir), (mode), (qstr), (goal), (owner), \ - i_flags, 0, 0, 0) -#define ext4_new_inode_start_handle(dir, mode, qstr, goal, owner, \ +#define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \ + __ext4_new_inode(&init_user_ns, (handle), (dir), (mode), (qstr), \ + (goal), (owner), i_flags, 0, 0, 0) +#define ext4_new_inode_start_handle(mnt_userns, dir, mode, qstr, goal, owner, \ type, nblocks) \ - __ext4_new_inode(NULL, (dir), (mode), (qstr), (goal), (owner), \ + __ext4_new_inode((mnt_userns), NULL, (dir), (mode), (qstr), (goal), (owner), \ 0, (type), __LINE__, (nblocks)) @@ -2877,11 +2878,14 @@ extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, __ext4_iget((sb), (ino), (flags), __func__, __LINE__) extern int ext4_write_inode(struct inode *, struct writeback_control *); -extern int ext4_setattr(struct dentry *, struct iattr *); -extern int ext4_getattr(const struct path *, struct kstat *, u32, unsigned int); +extern int ext4_setattr(struct user_namespace *, struct dentry *, + struct iattr *); +extern int ext4_getattr(struct user_namespace *, const struct path *, + struct kstat *, u32, unsigned int); extern void ext4_evict_inode(struct inode *); extern void ext4_clear_inode(struct inode *); -extern int ext4_file_getattr(const struct path *, struct kstat *, u32, unsigned int); +extern int ext4_file_getattr(struct user_namespace *, const struct path *, + struct kstat *, u32, unsigned int); extern int ext4_sync_inode(handle_t *, struct inode *); extern void ext4_dirty_inode(struct inode *, int); extern int ext4_change_inode_journal_flag(struct inode *, int); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3960b7ec3ab7..77c7c8a54da7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4382,8 +4382,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, { struct inode *inode = file_inode(file); handle_t *handle; - int ret = 0; - int ret2 = 0, ret3 = 0; + int ret, ret2 = 0, ret3 = 0; int retries = 0; int depth = 0; struct ext4_map_blocks map; @@ -4408,7 +4407,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, depth = ext_depth(inode); retry: - while (ret >= 0 && len) { + while (len) { /* * Recalculate credits when extent tree depth changes. */ @@ -4430,9 +4429,13 @@ retry: inode->i_ino, map.m_lblk, map.m_len, ret); ext4_mark_inode_dirty(handle, inode); - ret2 = ext4_journal_stop(handle); + ext4_journal_stop(handle); break; } + /* + * allow a full retry cycle for any remaining allocations + */ + retries = 0; map.m_lblk += ret; map.m_len = len = len - ret; epos = (loff_t)map.m_lblk << inode->i_blkbits; @@ -4450,11 +4453,8 @@ retry: if (unlikely(ret2)) break; } - if (ret == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) { - ret = 0; + if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; - } return ret > 0 ? ret2 : ret; } diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 6e8208acfc62..6c4f19b0a556 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -915,13 +915,11 @@ static int ext4_fc_submit_inode_data_all(journal_t *journal) struct super_block *sb = (struct super_block *)(journal->j_private); struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *ei; - struct list_head *pos; int ret = 0; spin_lock(&sbi->s_fc_lock); ext4_set_mount_flag(sb, EXT4_MF_FC_COMMITTING); - list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) { - ei = list_entry(pos, struct ext4_inode_info, i_fc_list); + list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); while (atomic_read(&ei->i_fc_updates)) { DEFINE_WAIT(wait); @@ -978,17 +976,15 @@ __releases(&sbi->s_fc_lock) { struct super_block *sb = (struct super_block *)(journal->j_private); struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_fc_dentry_update *fc_dentry; + struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n; struct inode *inode; - struct list_head *pos, *n, *fcd_pos, *fcd_n; - struct ext4_inode_info *ei; + struct ext4_inode_info *ei, *ei_n; int ret; if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) return 0; - list_for_each_safe(fcd_pos, fcd_n, &sbi->s_fc_dentry_q[FC_Q_MAIN]) { - fc_dentry = list_entry(fcd_pos, struct ext4_fc_dentry_update, - fcd_list); + list_for_each_entry_safe(fc_dentry, fc_dentry_n, + &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { spin_unlock(&sbi->s_fc_lock); if (!ext4_fc_add_dentry_tlv( @@ -1004,8 +1000,8 @@ __releases(&sbi->s_fc_lock) } inode = NULL; - list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) { - ei = list_entry(pos, struct ext4_inode_info, i_fc_list); + list_for_each_entry_safe(ei, ei_n, &sbi->s_fc_q[FC_Q_MAIN], + i_fc_list) { if (ei->vfs_inode.i_ino == fc_dentry->fcd_ino) { inode = &ei->vfs_inode; break; @@ -1057,7 +1053,6 @@ static int ext4_fc_perform_commit(journal_t *journal) struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *iter; struct ext4_fc_head head; - struct list_head *pos; struct inode *inode; struct blk_plug plug; int ret = 0; @@ -1099,8 +1094,7 @@ static int ext4_fc_perform_commit(journal_t *journal) goto out; } - list_for_each(pos, &sbi->s_fc_q[FC_Q_MAIN]) { - iter = list_entry(pos, struct ext4_inode_info, i_fc_list); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { inode = &iter->vfs_inode; if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) continue; @@ -1226,9 +1220,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_inode_info *iter; + struct ext4_inode_info *iter, *iter_n; struct ext4_fc_dentry_update *fc_dentry; - struct list_head *pos, *n; if (full && sbi->s_fc_bh) sbi->s_fc_bh = NULL; @@ -1236,8 +1229,8 @@ static void ext4_fc_cleanup(journal_t *journal, int full) jbd2_fc_release_bufs(journal); spin_lock(&sbi->s_fc_lock); - list_for_each_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN]) { - iter = list_entry(pos, struct ext4_inode_info, i_fc_list); + list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN], + i_fc_list) { list_del_init(&iter->i_fc_list); ext4_clear_inode_state(&iter->vfs_inode, EXT4_STATE_FC_COMMITTING); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 20f2fcb799f5..633ae7becd61 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -919,7 +919,8 @@ static int ext4_xattr_credits_for_new_inode(struct inode *dir, mode_t mode, * For other inodes, search forward from the parent directory's block * group to find a free inode. */ -struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, +struct inode *__ext4_new_inode(struct user_namespace *mnt_userns, + handle_t *handle, struct inode *dir, umode_t mode, const struct qstr *qstr, __u32 goal, uid_t *owner, __u32 i_flags, int handle_type, unsigned int line_no, @@ -969,10 +970,10 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, i_gid_write(inode, owner[1]); } else if (test_opt(sb, GRPID)) { inode->i_mode = mode; - inode->i_uid = current_fsuid(); + inode->i_uid = fsuid_into_mnt(mnt_userns); inode->i_gid = dir->i_gid; } else - inode_init_owner(inode, dir, mode); + inode_init_owner(mnt_userns, inode, dir, mode); if (ext4_has_feature_project(sb) && ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c173c8405856..650c5acd2f2d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -20,6 +20,7 @@ */ #include <linux/fs.h> +#include <linux/mount.h> #include <linux/time.h> #include <linux/highuid.h> #include <linux/pagemap.h> @@ -4961,15 +4962,11 @@ static void __ext4_update_other_inode_time(struct super_block *sb, if (!inode) return; - if ((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | - I_DIRTY_INODE)) || - ((inode->i_state & I_DIRTY_TIME) == 0)) + if (!inode_is_dirtytime_only(inode)) return; spin_lock(&inode->i_lock); - if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | - I_DIRTY_INODE)) == 0) && - (inode->i_state & I_DIRTY_TIME)) { + if (inode_is_dirtytime_only(inode)) { struct ext4_inode_info *ei = EXT4_I(inode); inode->i_state &= ~I_DIRTY_TIME; @@ -5319,7 +5316,8 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) * * Called with inode->i_mutex down. */ -int ext4_setattr(struct dentry *dentry, struct iattr *attr) +int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct inode *inode = d_inode(dentry); int error, rc = 0; @@ -5337,7 +5335,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) ATTR_GID | ATTR_TIMES_SET)))) return -EPERM; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(mnt_userns, dentry, attr); if (error) return error; @@ -5512,7 +5510,7 @@ out_mmap_sem: } if (!error) { - setattr_copy(inode, attr); + setattr_copy(mnt_userns, inode, attr); mark_inode_dirty(inode); } @@ -5524,7 +5522,7 @@ out_mmap_sem: ext4_orphan_del(NULL, inode); if (!error && (ia_valid & ATTR_MODE)) - rc = posix_acl_chmod(inode, inode->i_mode); + rc = posix_acl_chmod(mnt_userns, inode, inode->i_mode); err_out: if (error) @@ -5535,8 +5533,8 @@ err_out: return error; } -int ext4_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags) +int ext4_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct ext4_inode *raw_inode; @@ -5571,17 +5569,18 @@ int ext4_getattr(const struct path *path, struct kstat *stat, STATX_ATTR_NODUMP | STATX_ATTR_VERITY); - generic_fillattr(inode, stat); + generic_fillattr(mnt_userns, inode, stat); return 0; } -int ext4_file_getattr(const struct path *path, struct kstat *stat, +int ext4_file_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); u64 delalloc_blocks; - ext4_getattr(path, stat, request_mask, query_flags); + ext4_getattr(mnt_userns, path, stat, request_mask, query_flags); /* * If there is inline data in the inode, the inode will normally not @@ -5937,26 +5936,16 @@ out: * If the inode is marked synchronous, we don't honour that here - doing * so would cause a commit on atime updates, which we don't bother doing. * We handle synchronous inodes at the highest possible level. - * - * If only the I_DIRTY_TIME flag is set, we can skip everything. If - * I_DIRTY_TIME and I_DIRTY_SYNC is set, the only inode fields we need - * to copy into the on-disk inode structure are the timestamp files. */ void ext4_dirty_inode(struct inode *inode, int flags) { handle_t *handle; - if (flags == I_DIRTY_TIME) - return; handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) - goto out; - + return; ext4_mark_inode_dirty(handle, inode); - ext4_journal_stop(handle); -out: - return; } int ext4_change_inode_journal_flag(struct inode *inode, int val) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 713b1ae44c1a..a2cf35066f46 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -107,10 +107,12 @@ void ext4_reset_inode_seed(struct inode *inode) * important fields of the inodes. * * @sb: the super block of the filesystem + * @mnt_userns: user namespace of the mount the inode was found from * @inode: the inode to swap with EXT4_BOOT_LOADER_INO * */ static long swap_inode_boot_loader(struct super_block *sb, + struct user_namespace *mnt_userns, struct inode *inode) { handle_t *handle; @@ -139,7 +141,8 @@ static long swap_inode_boot_loader(struct super_block *sb, } if (IS_RDONLY(inode) || IS_APPEND(inode) || IS_IMMUTABLE(inode) || - !inode_owner_or_capable(inode) || !capable(CAP_SYS_ADMIN)) { + !inode_owner_or_capable(mnt_userns, inode) || + !capable(CAP_SYS_ADMIN)) { err = -EPERM; goto journal_err_out; } @@ -814,6 +817,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; struct ext4_inode_info *ei = EXT4_I(inode); + struct user_namespace *mnt_userns = file_mnt_user_ns(filp); unsigned int flags; ext4_debug("cmd = %u, arg = %lu\n", cmd, arg); @@ -829,7 +833,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case FS_IOC_SETFLAGS: { int err; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EACCES; if (get_user(flags, (int __user *) arg)) @@ -871,7 +875,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) __u32 generation; int err; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EPERM; if (ext4_has_metadata_csum(inode->i_sb)) { @@ -1010,7 +1014,7 @@ mext_out: case EXT4_IOC_MIGRATE: { int err; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EACCES; err = mnt_want_write_file(filp); @@ -1032,7 +1036,7 @@ mext_out: case EXT4_IOC_ALLOC_DA_BLKS: { int err; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EACCES; err = mnt_want_write_file(filp); @@ -1051,7 +1055,7 @@ mext_out: err = mnt_want_write_file(filp); if (err) return err; - err = swap_inode_boot_loader(sb, inode); + err = swap_inode_boot_loader(sb, mnt_userns, inode); mnt_drop_write_file(filp); return err; } @@ -1217,7 +1221,7 @@ resizefs_out: case EXT4_IOC_CLEAR_ES_CACHE: { - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EACCES; ext4_clear_inode_es(inode); return 0; @@ -1263,7 +1267,7 @@ resizefs_out: return -EFAULT; /* Make sure caller has proper permission */ - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EACCES; if (fa.fsx_xflags & ~EXT4_SUPPORTED_FS_XFLAGS) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index cf652ba3e74d..686bf982c84e 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -731,6 +731,29 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, (space/bcount)*100/blocksize); return (struct stats) { names, space, bcount}; } + +/* + * Linear search cross check + */ +static inline void htree_rep_invariant_check(struct dx_entry *at, + struct dx_entry *target, + u32 hash, unsigned int n) +{ + while (n--) { + dxtrace(printk(KERN_CONT ",")); + if (dx_get_hash(++at) > hash) { + at--; + break; + } + } + ASSERT(at == target - 1); +} +#else /* DX_DEBUG */ +static inline void htree_rep_invariant_check(struct dx_entry *at, + struct dx_entry *target, + u32 hash, unsigned int n) +{ +} #endif /* DX_DEBUG */ /* @@ -827,20 +850,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, p = m + 1; } - if (0) { // linear search cross check - unsigned n = count - 1; - at = entries; - while (n--) - { - dxtrace(printk(KERN_CONT ",")); - if (dx_get_hash(++at) > hash) - { - at--; - break; - } - } - ASSERT(at == p - 1); - } + htree_rep_invariant_check(entries, p, hash, count - 1); at = p - 1; dxtrace(printk(KERN_CONT " %x->%u\n", @@ -2401,11 +2411,10 @@ again: (frame - 1)->bh); if (err) goto journal_error; - if (restart) { - err = ext4_handle_dirty_dx_node(handle, dir, - frame->bh); + err = ext4_handle_dirty_dx_node(handle, dir, + frame->bh); + if (err) goto journal_error; - } } else { struct dx_root *dxroot; memcpy((char *) entries2, (char *) entries, @@ -2596,8 +2605,8 @@ static int ext4_add_nondir(handle_t *handle, * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int ext4_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { handle_t *handle; struct inode *inode; @@ -2610,8 +2619,8 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, umode_t mode, credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: - inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, - NULL, EXT4_HT_DIR, credits); + inode = ext4_new_inode_start_handle(mnt_userns, dir, mode, &dentry->d_name, + 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (!IS_ERR(inode)) { @@ -2631,8 +2640,8 @@ retry: return err; } -static int ext4_mknod(struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t rdev) +static int ext4_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { handle_t *handle; struct inode *inode; @@ -2645,8 +2654,8 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry, credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: - inode = ext4_new_inode_start_handle(dir, mode, &dentry->d_name, 0, - NULL, EXT4_HT_DIR, credits); + inode = ext4_new_inode_start_handle(mnt_userns, dir, mode, &dentry->d_name, + 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); err = PTR_ERR(inode); if (!IS_ERR(inode)) { @@ -2665,7 +2674,8 @@ retry: return err; } -static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +static int ext4_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { handle_t *handle; struct inode *inode; @@ -2676,7 +2686,7 @@ static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) return err; retry: - inode = ext4_new_inode_start_handle(dir, mode, + inode = ext4_new_inode_start_handle(mnt_userns, dir, mode, NULL, 0, NULL, EXT4_HT_DIR, EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) + @@ -2774,7 +2784,8 @@ out: return err; } -static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int ext4_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { handle_t *handle; struct inode *inode; @@ -2790,7 +2801,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); retry: - inode = ext4_new_inode_start_handle(dir, S_IFDIR | mode, + inode = ext4_new_inode_start_handle(mnt_userns, dir, S_IFDIR | mode, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); @@ -3292,7 +3303,7 @@ out_trace: return retval; } -static int ext4_symlink(struct inode *dir, +static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, const char *symname) { handle_t *handle; @@ -3333,7 +3344,7 @@ static int ext4_symlink(struct inode *dir, EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3; } - inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO, + inode = ext4_new_inode_start_handle(mnt_userns, dir, S_IFLNK|S_IRWXUGO, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); @@ -3662,7 +3673,8 @@ static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent) } } -static struct inode *ext4_whiteout_for_rename(struct ext4_renament *ent, +static struct inode *ext4_whiteout_for_rename(struct user_namespace *mnt_userns, + struct ext4_renament *ent, int credits, handle_t **h) { struct inode *wh; @@ -3676,7 +3688,8 @@ static struct inode *ext4_whiteout_for_rename(struct ext4_renament *ent, credits += (EXT4_MAXQUOTAS_TRANS_BLOCKS(ent->dir->i_sb) + EXT4_XATTR_TRANS_BLOCKS + 4); retry: - wh = ext4_new_inode_start_handle(ent->dir, S_IFCHR | WHITEOUT_MODE, + wh = ext4_new_inode_start_handle(mnt_userns, ent->dir, + S_IFCHR | WHITEOUT_MODE, &ent->dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); @@ -3703,9 +3716,9 @@ retry: * while new_{dentry,inode) refers to the destination dentry/inode * This comes from rename(const char *oldpath, const char *newpath) */ -static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { handle_t *handle = NULL; struct ext4_renament old = { @@ -3789,7 +3802,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, goto end_rename; } } else { - whiteout = ext4_whiteout_for_rename(&old, credits, &handle); + whiteout = ext4_whiteout_for_rename(mnt_userns, &old, credits, &handle); if (IS_ERR(whiteout)) { retval = PTR_ERR(whiteout); whiteout = NULL; @@ -4085,7 +4098,8 @@ end_rename: return retval; } -static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry, +static int ext4_rename2(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -4107,7 +4121,7 @@ static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry, new_dir, new_dentry); } - return ext4_rename(old_dir, old_dentry, new_dir, new_dentry, flags); + return ext4_rename(mnt_userns, old_dir, old_dentry, new_dir, new_dentry, flags); } /* diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index f014c5e473a9..3db923403505 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -371,8 +371,7 @@ int ext4_mpage_readpages(struct inode *inode, * bio_alloc will _always_ be able to allocate a bio if * __GFP_DIRECT_RECLAIM is set, see bio_alloc_bioset(). */ - bio = bio_alloc(GFP_KERNEL, - min_t(int, nr_pages, BIO_MAX_PAGES)); + bio = bio_alloc(GFP_KERNEL, bio_max_segs(nr_pages)); fscrypt_set_bio_crypt_ctx(bio, inode, next_block, GFP_KERNEL); ext4_set_bio_post_read_ctx(bio, inode, page->index); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index fb5985102c1d..ad34a37278cd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -59,7 +59,7 @@ #include <trace/events/ext4.h> static struct ext4_lazy_init *ext4_li_info; -static struct mutex ext4_li_mtx; +static DEFINE_MUTEX(ext4_li_mtx); static struct ratelimit_state ext4_mount_msg_ratelimit; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, @@ -4875,7 +4875,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); - sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; sbi->s_journal->j_submit_inode_data_buffers = ext4_journal_submit_inode_data_buffers; sbi->s_journal->j_finish_inode_data_buffers = @@ -4987,6 +4986,14 @@ no_journal: goto failed_mount5; } + /* + * We can only set up the journal commit callback once + * mballoc is initialized + */ + if (sbi->s_journal) + sbi->s_journal->j_commit_callback = + ext4_journal_commit_callback; + block = ext4_count_free_clusters(sb); ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block)); @@ -6654,7 +6661,7 @@ static struct file_system_type ext4_fs_type = { .name = "ext4", .mount = ext4_mount, .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("ext4"); @@ -6667,7 +6674,6 @@ static int __init ext4_init_fs(void) ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64); ext4_li_info = NULL; - mutex_init(&ext4_li_mtx); /* Build-time check for flags consistency */ ext4_check_flag_values(); diff --git a/fs/ext4/xattr_hurd.c b/fs/ext4/xattr_hurd.c index 8cfa74a56361..c78df5790377 100644 --- a/fs/ext4/xattr_hurd.c +++ b/fs/ext4/xattr_hurd.c @@ -32,6 +32,7 @@ ext4_xattr_hurd_get(const struct xattr_handler *handler, static int ext4_xattr_hurd_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 197a9d8a15ef..8213f66f7b2d 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -23,6 +23,7 @@ ext4_xattr_security_get(const struct xattr_handler *handler, static int ext4_xattr_security_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index e9389e5d75c3..7c21ffb26d25 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -30,6 +30,7 @@ ext4_xattr_trusted_get(const struct xattr_handler *handler, static int ext4_xattr_trusted_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index d4546184b34b..2fe7ff0a479c 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -31,6 +31,7 @@ ext4_xattr_user_get(const struct xattr_handler *handler, static int ext4_xattr_user_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 732ec10e7890..965037a9c205 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -214,8 +214,8 @@ static int f2fs_acl_update_mode(struct inode *inode, umode_t *mode_p, return error; if (error == 0) *acl = NULL; - if (!in_group_p(inode->i_gid) && - !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) && + !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) mode &= ~S_ISGID; *mode_p = mode; return 0; @@ -269,7 +269,8 @@ static int __f2fs_set_acl(struct inode *inode, int type, return error; } -int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +int f2fs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) return -EIO; diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index 124868c13f80..986fd1bc780b 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -34,7 +34,8 @@ struct f2fs_acl_header { #ifdef CONFIG_F2FS_FS_POSIX_ACL extern struct posix_acl *f2fs_get_acl(struct inode *, int); -extern int f2fs_set_acl(struct inode *, struct posix_acl *, int); +extern int f2fs_set_acl(struct user_namespace *, struct inode *, + struct posix_acl *, int); extern int f2fs_init_acl(struct inode *, struct inode *, struct page *, struct page *); #else diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b9721c8f116c..7c95818639a6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -969,8 +969,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, unsigned int post_read_steps = 0; bio = bio_alloc_bioset(for_write ? GFP_NOIO : GFP_KERNEL, - min_t(int, nr_pages, BIO_MAX_PAGES), - &f2fs_bioset); + bio_max_segs(nr_pages), &f2fs_bioset); if (!bio) return ERR_PTR(-ENOMEM); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 506c801880f3..e2d302ae3a46 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3187,9 +3187,10 @@ void f2fs_truncate_data_blocks(struct dnode_of_data *dn); int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate_blocks(struct inode *inode, u64 from, bool lock); int f2fs_truncate(struct inode *inode); -int f2fs_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags); -int f2fs_setattr(struct dentry *dentry, struct iattr *attr); +int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags); +int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr); int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count); int f2fs_precache_extents(struct inode *inode); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 471a6ff0c937..d26ff2ae3f5e 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -789,8 +789,8 @@ int f2fs_truncate(struct inode *inode) return 0; } -int f2fs_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags) +int f2fs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct f2fs_inode_info *fi = F2FS_I(inode); @@ -826,7 +826,7 @@ int f2fs_getattr(const struct path *path, struct kstat *stat, STATX_ATTR_NODUMP | STATX_ATTR_VERITY); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); /* we need to show initial sectors used for inline_data/dentries */ if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) || @@ -837,7 +837,8 @@ int f2fs_getattr(const struct path *path, struct kstat *stat, } #ifdef CONFIG_F2FS_FS_POSIX_ACL -static void __setattr_copy(struct inode *inode, const struct iattr *attr) +static void __setattr_copy(struct user_namespace *mnt_userns, + struct inode *inode, const struct iattr *attr) { unsigned int ia_valid = attr->ia_valid; @@ -853,9 +854,9 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; + kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); - if (!in_group_p(inode->i_gid) && - !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + if (!in_group_p(kgid) && !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; set_acl_inode(inode, mode); } @@ -864,7 +865,8 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) #define __setattr_copy setattr_copy #endif -int f2fs_setattr(struct dentry *dentry, struct iattr *attr) +int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct inode *inode = d_inode(dentry); int err; @@ -884,7 +886,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) !f2fs_is_compress_backend_ready(inode)) return -EOPNOTSUPP; - err = setattr_prepare(dentry, attr); + err = setattr_prepare(&init_user_ns, dentry, attr); if (err) return err; @@ -960,10 +962,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) spin_unlock(&F2FS_I(inode)->i_size_lock); } - __setattr_copy(inode, attr); + __setattr_copy(&init_user_ns, inode, attr); if (attr->ia_valid & ATTR_MODE) { - err = posix_acl_chmod(inode, f2fs_get_inode_mode(inode)); + err = posix_acl_chmod(&init_user_ns, inode, f2fs_get_inode_mode(inode)); if (is_inode_flag_set(inode, FI_ACL_MODE)) { if (!err) @@ -1978,7 +1980,7 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) u32 iflags; int ret; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EACCES; if (get_user(fsflags, (int __user *)arg)) @@ -2025,7 +2027,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int ret; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EACCES; if (!S_ISREG(inode->i_mode)) @@ -2092,7 +2094,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) struct inode *inode = file_inode(filp); int ret; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EACCES; ret = mnt_want_write_file(filp); @@ -2134,7 +2136,7 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) struct inode *inode = file_inode(filp); int ret; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EACCES; if (!S_ISREG(inode->i_mode)) @@ -2169,7 +2171,7 @@ static int f2fs_ioc_release_volatile_write(struct file *filp) struct inode *inode = file_inode(filp); int ret; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EACCES; ret = mnt_want_write_file(filp); @@ -2198,7 +2200,7 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) struct inode *inode = file_inode(filp); int ret; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EACCES; ret = mnt_want_write_file(filp); @@ -3175,7 +3177,7 @@ static int f2fs_ioc_fssetxattr(struct file *filp, unsigned long arg) return -EFAULT; /* Make sure caller has proper permission */ - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EACCES; if (fa.fsx_xflags & ~F2FS_SUPPORTED_XFLAGS) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 887804968576..17bd072a5d39 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -46,7 +46,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) nid_free = true; - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); inode->i_ino = ino; inode->i_blocks = 0; @@ -314,8 +314,8 @@ static void set_compress_inode(struct f2fs_sb_info *sbi, struct inode *inode, } } -static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; @@ -637,8 +637,8 @@ static const char *f2fs_get_link(struct dentry *dentry, return link; } -static int f2fs_symlink(struct inode *dir, struct dentry *dentry, - const char *symname) +static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; @@ -717,7 +717,8 @@ out_free_encrypted_link: return err; } -static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; @@ -770,8 +771,8 @@ static int f2fs_rmdir(struct inode *dir, struct dentry *dentry) return -ENOTEMPTY; } -static int f2fs_mknod(struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t rdev) +static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; @@ -878,7 +879,8 @@ out: return err; } -static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +static int f2fs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); @@ -1255,7 +1257,8 @@ out: return err; } -static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry, +static int f2fs_rename2(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a8a0fb890e8d..4b0e2e3c2c88 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2747,7 +2747,7 @@ int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, sum_entry = &sum->entries[0]; for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { - nrpages = min(last_offset - i, BIO_MAX_PAGES); + nrpages = bio_max_segs(last_offset - i); /* readahead node pages */ f2fs_ra_meta_pages(sbi, addr, nrpages, META_POR, true); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4acfa7d36731..7069793752f1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1300,9 +1300,6 @@ static void f2fs_dirty_inode(struct inode *inode, int flags) inode->i_ino == F2FS_META_INO(sbi)) return; - if (flags == I_DIRTY_TIME) - return; - if (is_inode_flag_set(inode, FI_AUTO_RECOVER)) clear_inode_flag(inode, FI_AUTO_RECOVER); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 8159fae74b9a..490f843ec3bf 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -64,6 +64,7 @@ static int f2fs_xattr_generic_get(const struct xattr_handler *handler, } static int f2fs_xattr_generic_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -107,6 +108,7 @@ static int f2fs_xattr_advise_get(const struct xattr_handler *handler, } static int f2fs_xattr_advise_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -114,7 +116,7 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler, unsigned char old_advise = F2FS_I(inode)->i_advise; unsigned char new_advise; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EPERM; if (value == NULL) return -EINVAL; diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 922a0c6ba46c..02d4d4234956 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -397,9 +397,11 @@ extern long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); extern const struct file_operations fat_file_operations; extern const struct inode_operations fat_file_inode_operations; -extern int fat_setattr(struct dentry *dentry, struct iattr *attr); +extern int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr); extern void fat_truncate_blocks(struct inode *inode, loff_t offset); -extern int fat_getattr(const struct path *path, struct kstat *stat, +extern int fat_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); extern int fat_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); diff --git a/fs/fat/file.c b/fs/fat/file.c index 5fee74f1ad61..13855ba49cd9 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -95,7 +95,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) goto out_unlock_inode; /* This MUST be done before doing anything irreversible... */ - err = fat_setattr(file->f_path.dentry, &ia); + err = fat_setattr(file_mnt_user_ns(file), file->f_path.dentry, &ia); if (err) goto out_unlock_inode; @@ -394,11 +394,11 @@ void fat_truncate_blocks(struct inode *inode, loff_t offset) fat_flush_inodes(inode->i_sb, inode, NULL); } -int fat_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +int fat_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); - generic_fillattr(inode, stat); + generic_fillattr(mnt_userns, inode, stat); stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size; if (MSDOS_SB(inode->i_sb)->options.nfs == FAT_NFS_NOSTALE_RO) { @@ -447,12 +447,13 @@ static int fat_sanitize_mode(const struct msdos_sb_info *sbi, return 0; } -static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode) +static int fat_allow_set_time(struct user_namespace *mnt_userns, + struct msdos_sb_info *sbi, struct inode *inode) { umode_t allow_utime = sbi->options.allow_utime; - if (!uid_eq(current_fsuid(), inode->i_uid)) { - if (in_group_p(inode->i_gid)) + if (!uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) { + if (in_group_p(i_gid_into_mnt(mnt_userns, inode))) allow_utime >>= 3; if (allow_utime & MAY_WRITE) return 1; @@ -466,7 +467,8 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode) /* valid file mode bits */ #define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO) -int fat_setattr(struct dentry *dentry, struct iattr *attr) +int fat_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); struct inode *inode = d_inode(dentry); @@ -476,11 +478,11 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) /* Check for setting the inode time. */ ia_valid = attr->ia_valid; if (ia_valid & TIMES_SET_FLAGS) { - if (fat_allow_set_time(sbi, inode)) + if (fat_allow_set_time(mnt_userns, sbi, inode)) attr->ia_valid &= ~TIMES_SET_FLAGS; } - error = setattr_prepare(dentry, attr); + error = setattr_prepare(mnt_userns, dentry, attr); attr->ia_valid = ia_valid; if (error) { if (sbi->options.quiet) @@ -550,7 +552,7 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr) fat_truncate_time(inode, &attr->ia_mtime, S_MTIME); attr->ia_valid &= ~(ATTR_ATIME|ATTR_CTIME|ATTR_MTIME); - setattr_copy(inode, attr); + setattr_copy(mnt_userns, inode, attr); mark_inode_dirty(inode); out: return error; diff --git a/fs/fat/misc.c b/fs/fat/misc.c index f1b2a1fc2a6a..18a50a46b57f 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -329,22 +329,23 @@ EXPORT_SYMBOL_GPL(fat_truncate_time); int fat_update_time(struct inode *inode, struct timespec64 *now, int flags) { - int iflags = I_DIRTY_TIME; - bool dirty = false; + int dirty_flags = 0; if (inode->i_ino == MSDOS_ROOT_INO) return 0; - fat_truncate_time(inode, now, flags); - if (flags & S_VERSION) - dirty = inode_maybe_inc_iversion(inode, false); - if ((flags & (S_ATIME | S_CTIME | S_MTIME)) && - !(inode->i_sb->s_flags & SB_LAZYTIME)) - dirty = true; + if (flags & (S_ATIME | S_CTIME | S_MTIME)) { + fat_truncate_time(inode, now, flags); + if (inode->i_sb->s_flags & SB_LAZYTIME) + dirty_flags |= I_DIRTY_TIME; + else + dirty_flags |= I_DIRTY_SYNC; + } + + if ((flags & S_VERSION) && inode_maybe_inc_iversion(inode, false)) + dirty_flags |= I_DIRTY_SYNC; - if (dirty) - iflags |= I_DIRTY_SYNC; - __mark_inode_dirty(inode, iflags); + __mark_inode_dirty(inode, dirty_flags); return 0; } EXPORT_SYMBOL_GPL(fat_update_time); diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 9d062886fbc1..efba301d68ae 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -261,8 +261,8 @@ static int msdos_add_entry(struct inode *dir, const unsigned char *name, } /***** Create a file */ -static int msdos_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int msdos_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct super_block *sb = dir->i_sb; struct inode *inode = NULL; @@ -339,7 +339,8 @@ out: } /***** Make a directory */ -static int msdos_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int msdos_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct super_block *sb = dir->i_sb; struct fat_slot_info sinfo; @@ -593,7 +594,8 @@ error_inode: } /***** Rename, a wrapper for rename_same_dir & rename_diff_dir */ -static int msdos_rename(struct inode *old_dir, struct dentry *old_dentry, +static int msdos_rename(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -665,7 +667,7 @@ static struct file_system_type msdos_fs_type = { .name = "msdos", .mount = msdos_mount, .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("msdos"); diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 0cdd0fb9f742..5369d82e0bfb 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -756,8 +756,8 @@ error: return ERR_PTR(err); } -static int vfat_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int vfat_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct super_block *sb = dir->i_sb; struct inode *inode; @@ -846,7 +846,8 @@ out: return err; } -static int vfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int vfat_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct super_block *sb = dir->i_sb; struct inode *inode; @@ -892,9 +893,9 @@ out: return err; } -static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int vfat_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { struct buffer_head *dotdot_bh; struct msdos_dir_entry *dotdot_de; @@ -1062,7 +1063,7 @@ static struct file_system_type vfat_fs_type = { .name = "vfat", .mount = vfat_mount, .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("vfat"); diff --git a/fs/fcntl.c b/fs/fcntl.c index 483ef8861376..dfc72f15be7f 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -25,6 +25,7 @@ #include <linux/user_namespace.h> #include <linux/memfd.h> #include <linux/compat.h> +#include <linux/mount.h> #include <linux/poll.h> #include <asm/siginfo.h> @@ -46,7 +47,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg) /* O_NOATIME can only be set by the owner or superuser */ if ((arg & O_NOATIME) && !(filp->f_flags & O_NOATIME)) - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(file_mnt_user_ns(filp), inode)) return -EPERM; /* required for strict SunOS emulation */ diff --git a/fs/fhandle.c b/fs/fhandle.c index 01263ffbc4c0..ec6feeccc276 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -173,7 +173,7 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, /* * With handle we don't look at the execute bit on the - * the directory. Ideally we would like CAP_DAC_SEARCH. + * directory. Ideally we would like CAP_DAC_SEARCH. * But we don't have that */ if (!capable(CAP_DAC_READ_SEARCH)) { diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index c41cb887eb7d..e91980f49388 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1442,9 +1442,15 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, } /* - * Write out an inode and its dirty pages. Do not update the writeback list - * linkage. That is left to the caller. The caller is also responsible for - * setting I_SYNC flag and calling inode_sync_complete() to clear it. + * Write out an inode and its dirty pages (or some of its dirty pages, depending + * on @wbc->nr_to_write), and clear the relevant dirty flags from i_state. + * + * This doesn't remove the inode from the writeback list it is on, except + * potentially to move it from b_dirty_time to b_dirty due to timestamp + * expiration. The caller is otherwise responsible for writeback list handling. + * + * The caller is also responsible for setting the I_SYNC flag beforehand and + * calling inode_sync_complete() to clear it afterwards. */ static int __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) @@ -1479,7 +1485,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * change I_DIRTY_TIME into I_DIRTY_SYNC. */ if ((inode->i_state & I_DIRTY_TIME) && - (wbc->sync_mode == WB_SYNC_ALL || wbc->for_sync || + (wbc->sync_mode == WB_SYNC_ALL || time_after(jiffies, inode->dirtied_time_when + dirtytime_expire_interval * HZ))) { trace_writeback_lazytime(inode); @@ -1487,9 +1493,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) } /* - * Some filesystems may redirty the inode during the writeback - * due to delalloc, clear dirty metadata flags right before - * write_inode() + * Get and clear the dirty flags from i_state. This needs to be done + * after calling writepages because some filesystems may redirty the + * inode during writepages due to delalloc. It also needs to be done + * after handling timestamp expiration, as that may dirty the inode too. */ spin_lock(&inode->i_lock); dirty = inode->i_state & I_DIRTY; @@ -1524,12 +1531,13 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) } /* - * Write out an inode's dirty pages. Either the caller has an active reference - * on the inode or the inode has I_WILL_FREE set. + * Write out an inode's dirty data and metadata on-demand, i.e. separately from + * the regular batched writeback done by the flusher threads in + * writeback_sb_inodes(). @wbc controls various aspects of the write, such as + * whether it is a data-integrity sync (%WB_SYNC_ALL) or not (%WB_SYNC_NONE). * - * This function is designed to be called for writing back one inode which - * we go e.g. from filesystem. Flusher thread uses __writeback_single_inode() - * and does more profound writeback list handling in writeback_sb_inodes(). + * To prevent the inode from going away, either the caller must have a reference + * to the inode, or the inode must have I_WILL_FREE or I_FREEING set. */ static int writeback_single_inode(struct inode *inode, struct writeback_control *wbc) @@ -1544,23 +1552,23 @@ static int writeback_single_inode(struct inode *inode, WARN_ON(inode->i_state & I_WILL_FREE); if (inode->i_state & I_SYNC) { - if (wbc->sync_mode != WB_SYNC_ALL) - goto out; /* - * It's a data-integrity sync. We must wait. Since callers hold - * inode reference or inode has I_WILL_FREE set, it cannot go - * away under us. + * Writeback is already running on the inode. For WB_SYNC_NONE, + * that's enough and we can just return. For WB_SYNC_ALL, we + * must wait for the existing writeback to complete, then do + * writeback again if there's anything left. */ + if (wbc->sync_mode != WB_SYNC_ALL) + goto out; __inode_wait_for_writeback(inode); } WARN_ON(inode->i_state & I_SYNC); /* - * Skip inode if it is clean and we have no outstanding writeback in - * WB_SYNC_ALL mode. We don't want to mess with writeback lists in this - * function since flusher thread may be doing for example sync in - * parallel and if we move the inode, it could get skipped. So here we - * make sure inode is on some writeback list and leave it there unless - * we have completely cleaned the inode. + * If the inode is already fully clean, then there's nothing to do. + * + * For data-integrity syncs we also need to check whether any pages are + * still under writeback, e.g. due to prior WB_SYNC_NONE writeback. If + * there are any such pages, we'll need to wait for them. */ if (!(inode->i_state & I_DIRTY_ALL) && (wbc->sync_mode != WB_SYNC_ALL || @@ -1576,8 +1584,9 @@ static int writeback_single_inode(struct inode *inode, wb = inode_to_wb_and_lock_list(inode); spin_lock(&inode->i_lock); /* - * If inode is clean, remove it from writeback lists. Otherwise don't - * touch it. See comment above for explanation. + * If the inode is now fully clean, then it can be safely removed from + * its writeback list (if any). Otherwise the flusher threads are + * responsible for the writeback lists. */ if (!(inode->i_state & I_DIRTY_ALL)) inode_io_list_del_locked(inode, wb); @@ -2219,23 +2228,24 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode) } /** - * __mark_inode_dirty - internal function + * __mark_inode_dirty - internal function to mark an inode dirty * * @inode: inode to mark - * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) + * @flags: what kind of dirty, e.g. I_DIRTY_SYNC. This can be a combination of + * multiple I_DIRTY_* flags, except that I_DIRTY_TIME can't be combined + * with I_DIRTY_PAGES. * - * Mark an inode as dirty. Callers should use mark_inode_dirty or - * mark_inode_dirty_sync. + * Mark an inode as dirty. We notify the filesystem, then update the inode's + * dirty flags. Then, if needed we add the inode to the appropriate dirty list. * - * Put the inode on the super block's dirty list. + * Most callers should use mark_inode_dirty() or mark_inode_dirty_sync() + * instead of calling this directly. * - * CAREFUL! We mark it dirty unconditionally, but move it onto the - * dirty list only if it is hashed or if it refers to a blockdev. - * If it was not hashed, it will never be added to the dirty list - * even if it is later hashed, as it will have been marked dirty already. + * CAREFUL! We only add the inode to the dirty list if it is hashed or if it + * refers to a blockdev. Unhashed inodes will never be added to the dirty list + * even if they are later hashed, as they will have been marked dirty already. * - * In short, make sure you hash any inodes _before_ you start marking - * them dirty. + * In short, ensure you hash any inodes _before_ you start marking them dirty. * * Note that for blockdevs, inode->dirtied_when represents the dirtying time of * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of @@ -2247,25 +2257,34 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode) void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; - int dirtytime; + int dirtytime = 0; trace_writeback_mark_inode_dirty(inode, flags); - /* - * Don't do this for I_DIRTY_PAGES - that doesn't actually - * dirty the inode itself - */ - if (flags & (I_DIRTY_INODE | I_DIRTY_TIME)) { + if (flags & I_DIRTY_INODE) { + /* + * Notify the filesystem about the inode being dirtied, so that + * (if needed) it can update on-disk fields and journal the + * inode. This is only needed when the inode itself is being + * dirtied now. I.e. it's only needed for I_DIRTY_INODE, not + * for just I_DIRTY_PAGES or I_DIRTY_TIME. + */ trace_writeback_dirty_inode_start(inode, flags); - if (sb->s_op->dirty_inode) - sb->s_op->dirty_inode(inode, flags); - + sb->s_op->dirty_inode(inode, flags & I_DIRTY_INODE); trace_writeback_dirty_inode(inode, flags); - } - if (flags & I_DIRTY_INODE) + + /* I_DIRTY_INODE supersedes I_DIRTY_TIME. */ flags &= ~I_DIRTY_TIME; - dirtytime = flags & I_DIRTY_TIME; + } else { + /* + * Else it's either I_DIRTY_PAGES, I_DIRTY_TIME, or nothing. + * (We don't support setting both I_DIRTY_PAGES and I_DIRTY_TIME + * in one call to __mark_inode_dirty().) + */ + dirtytime = flags & I_DIRTY_TIME; + WARN_ON_ONCE(dirtytime && flags != I_DIRTY_TIME); + } /* * Paired with smp_mb() in __writeback_single_inode() for the @@ -2288,6 +2307,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) inode_attach_wb(inode, NULL); + /* I_DIRTY_INODE supersedes I_DIRTY_TIME. */ if (flags & I_DIRTY_INODE) inode->i_state &= ~I_DIRTY_TIME; inode->i_state |= flags; diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index f529075a2ce8..e9c0f916349d 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -50,7 +50,8 @@ struct posix_acl *fuse_get_acl(struct inode *inode, int type) return acl; } -int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type) +int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { struct fuse_conn *fc = get_fuse_conn(inode); const char *name; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 588f8d1240aa..c6636b4c4ccf 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -844,11 +844,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (WARN_ON(PageMlocked(oldpage))) goto out_fallback_unlock; - err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL); - if (err) { - unlock_page(newpage); - goto out_put_old; - } + replace_page_cache_page(oldpage, newpage); get_page(newpage); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 78f9f209078c..06a18700a845 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -605,7 +605,8 @@ out_err: return err; } -static int fuse_mknod(struct inode *, struct dentry *, umode_t, dev_t); +static int fuse_mknod(struct user_namespace *, struct inode *, struct dentry *, + umode_t, dev_t); static int fuse_atomic_open(struct inode *dir, struct dentry *entry, struct file *file, unsigned flags, umode_t mode) @@ -645,7 +646,7 @@ out_dput: return err; mknod: - err = fuse_mknod(dir, entry, mode, 0); + err = fuse_mknod(&init_user_ns, dir, entry, mode, 0); if (err) goto out_dput; no_open: @@ -715,8 +716,8 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, return err; } -static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, - dev_t rdev) +static int fuse_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *entry, umode_t mode, dev_t rdev) { struct fuse_mknod_in inarg; struct fuse_mount *fm = get_fuse_mount(dir); @@ -738,13 +739,14 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, return create_new_entry(fm, &args, dir, entry, mode); } -static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode, - bool excl) +static int fuse_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *entry, umode_t mode, bool excl) { - return fuse_mknod(dir, entry, mode, 0); + return fuse_mknod(&init_user_ns, dir, entry, mode, 0); } -static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) +static int fuse_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *entry, umode_t mode) { struct fuse_mkdir_in inarg; struct fuse_mount *fm = get_fuse_mount(dir); @@ -765,8 +767,8 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) return create_new_entry(fm, &args, dir, entry, S_IFDIR); } -static int fuse_symlink(struct inode *dir, struct dentry *entry, - const char *link) +static int fuse_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *entry, const char *link) { struct fuse_mount *fm = get_fuse_mount(dir); unsigned len = strlen(link) + 1; @@ -908,9 +910,9 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, return err; } -static int fuse_rename2(struct inode *olddir, struct dentry *oldent, - struct inode *newdir, struct dentry *newent, - unsigned int flags) +static int fuse_rename2(struct user_namespace *mnt_userns, struct inode *olddir, + struct dentry *oldent, struct inode *newdir, + struct dentry *newent, unsigned int flags) { struct fuse_conn *fc = get_fuse_conn(olddir); int err; @@ -1087,7 +1089,7 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, forget_all_cached_acls(inode); err = fuse_do_getattr(inode, stat, file); } else if (stat) { - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); stat->mode = fi->orig_i_mode; stat->ino = fi->orig_ino; } @@ -1249,7 +1251,8 @@ static int fuse_perm_getattr(struct inode *inode, int mask) * access request is sent. Execute permission is still checked * locally based on file mode. */ -static int fuse_permission(struct inode *inode, int mask) +static int fuse_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { struct fuse_conn *fc = get_fuse_conn(inode); bool refreshed = false; @@ -1280,7 +1283,7 @@ static int fuse_permission(struct inode *inode, int mask) } if (fc->default_permissions) { - err = generic_permission(inode, mask); + err = generic_permission(&init_user_ns, inode, mask); /* If permission is denied, try to refresh file attributes. This is also needed, because the root @@ -1288,7 +1291,8 @@ static int fuse_permission(struct inode *inode, int mask) if (err == -EACCES && !refreshed) { err = fuse_perm_getattr(inode, mask); if (!err) - err = generic_permission(inode, mask); + err = generic_permission(&init_user_ns, + inode, mask); } /* Note: the opposite of the above test does not @@ -1610,7 +1614,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, if (!fc->default_permissions) attr->ia_valid |= ATTR_FORCE; - err = setattr_prepare(dentry, attr); + err = setattr_prepare(&init_user_ns, dentry, attr); if (err) return err; @@ -1756,7 +1760,8 @@ error: return err; } -static int fuse_setattr(struct dentry *entry, struct iattr *attr) +static int fuse_setattr(struct user_namespace *mnt_userns, struct dentry *entry, + struct iattr *attr) { struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); @@ -1818,7 +1823,8 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr) return ret; } -static int fuse_getattr(const struct path *path, struct kstat *stat, +static int fuse_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 7c4b8cb93f9f..68cca8d4db6e 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1180,8 +1180,8 @@ extern const struct xattr_handler *fuse_no_acl_xattr_handlers[]; struct posix_acl; struct posix_acl *fuse_get_acl(struct inode *inode, int type); -int fuse_set_acl(struct inode *inode, struct posix_acl *acl, int type); - +int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); /* readdir.c */ int fuse_readdir(struct file *file, struct dir_context *ctx); diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index cdea18de94f7..1a7d7ace54e1 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -188,6 +188,7 @@ static int fuse_xattr_get(const struct xattr_handler *handler, } static int fuse_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -214,6 +215,7 @@ static int no_xattr_get(const struct xattr_handler *handler, } static int no_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *nodee, const char *name, const void *value, size_t size, int flags) diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 2e939f5fe751..9165d70ead07 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -106,7 +106,8 @@ out: return error; } -int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) +int gfs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; @@ -130,7 +131,7 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) mode = inode->i_mode; if (type == ACL_TYPE_ACCESS && acl) { - ret = posix_acl_update_mode(inode, &mode, &acl); + ret = posix_acl_update_mode(&init_user_ns, inode, &mode, &acl); if (ret) goto unlock; } diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index 61353a1501c5..eccc6a43326c 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -13,6 +13,7 @@ extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type); extern int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); -extern int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int gfs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); #endif /* __ACL_DOT_H__ */ diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 62d9081d1e26..7a358ae05185 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1230,6 +1230,9 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length, gfs2_inplace_release(ip); + if (ip->i_qadata && ip->i_qadata->qa_qd_num) + gfs2_quota_unlock(ip); + if (length != written && (iomap->flags & IOMAP_F_NEW)) { /* Deallocate blocks that were just allocated. */ loff_t blockmask = i_blocksize(inode) - 1; @@ -1242,9 +1245,6 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length, } } - if (ip->i_qadata && ip->i_qadata->qa_qd_num) - gfs2_quota_unlock(ip); - if (unlikely(!written)) goto out_unlock; @@ -1538,13 +1538,13 @@ more_rgrps: goto out; } ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, - 0, rd_gh); + LM_FLAG_NODE_SCOPE, rd_gh); if (ret) goto out; /* Must be done with the rgrp glock held: */ if (gfs2_rs_active(&ip->i_res) && - rgd == ip->i_res.rs_rbm.rgd) + rgd == ip->i_res.rs_rgd) gfs2_rs_deltree(&ip->i_res); } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 89609c299717..2d500f90cdac 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -238,7 +238,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask, goto out; error = -EACCES; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) goto out; error = 0; @@ -256,7 +256,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask, !capable(CAP_LINUX_IMMUTABLE)) goto out; if (!IS_IMMUTABLE(inode)) { - error = gfs2_permission(inode, MAY_WRITE); + error = gfs2_permission(&init_user_ns, inode, MAY_WRITE); if (error) goto out; } @@ -716,10 +716,10 @@ static int gfs2_release(struct inode *inode, struct file *file) kfree(file->private_data); file->private_data = NULL; - if (file->f_mode & FMODE_WRITE) { + if (gfs2_rs_active(&ip->i_res)) gfs2_rs_delete(ip, &inode->i_writecount); + if (file->f_mode & FMODE_WRITE) gfs2_qa_put(ip); - } return 0; } @@ -749,7 +749,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - int sync_state = inode->i_state & I_DIRTY_ALL; + int sync_state = inode->i_state & I_DIRTY; struct gfs2_inode *ip = GFS2_I(inode); int ret = 0, ret1 = 0; @@ -762,7 +762,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, if (!gfs2_is_jdata(ip)) sync_state &= ~I_DIRTY_PAGES; if (datasync) - sync_state &= ~(I_DIRTY_SYNC | I_DIRTY_TIME); + sync_state &= ~I_DIRTY_SYNC; if (sync_state) { ret = sync_inode_metadata(inode, 1); @@ -1112,8 +1112,8 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t goto out_qunlock; /* check if the selected rgrp limits our max_blks further */ - if (ap.allowed && ap.allowed < max_blks) - max_blks = ap.allowed; + if (ip->i_res.rs_reserved < max_blks) + max_blks = ip->i_res.rs_reserved; /* Almost done. Calculate bytes that can be written using * max_blks. We also recompute max_bytes, data_blocks and diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index d87a5bc3607b..9567520d79f7 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -313,9 +313,23 @@ void gfs2_glock_put(struct gfs2_glock *gl) static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh) { const struct gfs2_holder *gh_head = list_first_entry(&gl->gl_holders, const struct gfs2_holder, gh_list); - if ((gh->gh_state == LM_ST_EXCLUSIVE || - gh_head->gh_state == LM_ST_EXCLUSIVE) && gh != gh_head) - return 0; + + if (gh != gh_head) { + /** + * Here we make a special exception to grant holders who agree + * to share the EX lock with other holders who also have the + * bit set. If the original holder has the LM_FLAG_NODE_SCOPE bit + * is set, we grant more holders with the bit set. + */ + if (gh_head->gh_state == LM_ST_EXCLUSIVE && + (gh_head->gh_flags & LM_FLAG_NODE_SCOPE) && + gh->gh_state == LM_ST_EXCLUSIVE && + (gh->gh_flags & LM_FLAG_NODE_SCOPE)) + return 1; + if ((gh->gh_state == LM_ST_EXCLUSIVE || + gh_head->gh_state == LM_ST_EXCLUSIVE)) + return 0; + } if (gl->gl_state == gh->gh_state) return 1; if (gh->gh_flags & GL_EXACT) @@ -2030,6 +2044,8 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags) *p++ = 'A'; if (flags & LM_FLAG_PRIORITY) *p++ = 'p'; + if (flags & LM_FLAG_NODE_SCOPE) + *p++ = 'n'; if (flags & GL_ASYNC) *p++ = 'a'; if (flags & GL_EXACT) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 53813364517b..31a8f2f649b5 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -75,6 +75,11 @@ enum { * request and directly join the other shared lock. A shared lock request * without the priority flag might be forced to wait until the deferred * requested had acquired and released the lock. + * + * LM_FLAG_NODE_SCOPE + * This holder agrees to share the lock within this node. In other words, + * the glock is held in EX mode according to DLM, but local holders on the + * same node can share it. */ #define LM_FLAG_TRY 0x0001 @@ -82,6 +87,7 @@ enum { #define LM_FLAG_NOEXP 0x0004 #define LM_FLAG_ANY 0x0008 #define LM_FLAG_PRIORITY 0x0010 +#define LM_FLAG_NODE_SCOPE 0x0020 #define GL_ASYNC 0x0040 #define GL_EXACT 0x0080 #define GL_SKIP 0x0100 diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 3faa421568b0..8e32d569c8bf 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -86,16 +86,12 @@ static int gfs2_ail_empty_gl(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct gfs2_trans tr; + unsigned int revokes; int ret; - memset(&tr, 0, sizeof(tr)); - INIT_LIST_HEAD(&tr.tr_buf); - INIT_LIST_HEAD(&tr.tr_databuf); - INIT_LIST_HEAD(&tr.tr_ail1_list); - INIT_LIST_HEAD(&tr.tr_ail2_list); - tr.tr_revokes = atomic_read(&gl->gl_ail_count); + revokes = atomic_read(&gl->gl_ail_count); - if (!tr.tr_revokes) { + if (!revokes) { bool have_revokes; bool log_in_flight; @@ -122,20 +118,14 @@ static int gfs2_ail_empty_gl(struct gfs2_glock *gl) return 0; } - /* A shortened, inline version of gfs2_trans_begin() - * tr->alloced is not set since the transaction structure is - * on the stack */ - tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes); - tr.tr_ip = _RET_IP_; - ret = gfs2_log_reserve(sdp, tr.tr_reserved); - if (ret < 0) - return ret; - WARN_ON_ONCE(current->journal_info); - current->journal_info = &tr; - - __gfs2_ail_flush(gl, 0, tr.tr_revokes); - + memset(&tr, 0, sizeof(tr)); + set_bit(TR_ONSTACK, &tr.tr_flags); + ret = __gfs2_trans_begin(&tr, sdp, 0, revokes, _RET_IP_); + if (ret) + goto flush; + __gfs2_ail_flush(gl, 0, revokes); gfs2_trans_end(sdp); + flush: gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_AIL_EMPTY_GL); @@ -146,19 +136,15 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; unsigned int revokes = atomic_read(&gl->gl_ail_count); - unsigned int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64); int ret; if (!revokes) return; - while (revokes > max_revokes) - max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64); - - ret = gfs2_trans_begin(sdp, 0, max_revokes); + ret = gfs2_trans_begin(sdp, 0, revokes); if (ret) return; - __gfs2_ail_flush(gl, fsync, max_revokes); + __gfs2_ail_flush(gl, fsync, revokes); gfs2_trans_end(sdp); gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_AIL_FLUSH); diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 8e1ab8ed4abc..0957119f7744 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -20,6 +20,7 @@ #include <linux/percpu.h> #include <linux/lockref.h> #include <linux/rhashtable.h> +#include <linux/mutex.h> #define DIO_WAIT 0x00000010 #define DIO_METADATA 0x00000020 @@ -106,7 +107,8 @@ struct gfs2_rgrpd { u32 rd_data; /* num of data blocks in rgrp */ u32 rd_bitbytes; /* number of bytes in data bitmaps */ u32 rd_free; - u32 rd_reserved; /* number of blocks reserved */ + u32 rd_requested; /* number of blocks in rd_rstree */ + u32 rd_reserved; /* number of reserved blocks */ u32 rd_free_clone; u32 rd_dinodes; u64 rd_igeneration; @@ -122,34 +124,10 @@ struct gfs2_rgrpd { #define GFS2_RDF_PREFERRED 0x80000000 /* This rgrp is preferred */ #define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */ spinlock_t rd_rsspin; /* protects reservation related vars */ + struct mutex rd_mutex; struct rb_root rd_rstree; /* multi-block reservation tree */ }; -struct gfs2_rbm { - struct gfs2_rgrpd *rgd; - u32 offset; /* The offset is bitmap relative */ - int bii; /* Bitmap index */ -}; - -static inline struct gfs2_bitmap *rbm_bi(const struct gfs2_rbm *rbm) -{ - return rbm->rgd->rd_bits + rbm->bii; -} - -static inline u64 gfs2_rbm_to_block(const struct gfs2_rbm *rbm) -{ - BUG_ON(rbm->offset >= rbm->rgd->rd_data); - return rbm->rgd->rd_data0 + (rbm_bi(rbm)->bi_start * GFS2_NBBY) + - rbm->offset; -} - -static inline bool gfs2_rbm_eq(const struct gfs2_rbm *rbm1, - const struct gfs2_rbm *rbm2) -{ - return (rbm1->rgd == rbm2->rgd) && (rbm1->bii == rbm2->bii) && - (rbm1->offset == rbm2->offset); -} - enum gfs2_state_bits { BH_Pinned = BH_PrivateStart, BH_Escaped = BH_PrivateStart + 1, @@ -313,9 +291,11 @@ struct gfs2_qadata { /* quota allocation data */ */ struct gfs2_blkreserv { - struct rb_node rs_node; /* link to other block reservations */ - struct gfs2_rbm rs_rbm; /* Start of reservation */ - u32 rs_free; /* how many blocks are still free */ + struct rb_node rs_node; /* node within rd_rstree */ + struct gfs2_rgrpd *rs_rgd; + u64 rs_start; + u32 rs_requested; + u32 rs_reserved; /* number of reserved blocks */ }; /* @@ -490,7 +470,7 @@ struct gfs2_quota_data { enum { TR_TOUCHED = 1, TR_ATTACHED = 2, - TR_ALLOCED = 3, + TR_ONSTACK = 3, }; struct gfs2_trans { @@ -506,7 +486,6 @@ struct gfs2_trans { unsigned int tr_num_buf_rm; unsigned int tr_num_databuf_rm; unsigned int tr_num_revoke; - unsigned int tr_num_revoke_rm; struct list_head tr_list; struct list_head tr_databuf; @@ -531,6 +510,7 @@ struct gfs2_jdesc { unsigned int nr_extents; struct work_struct jd_work; struct inode *jd_inode; + struct bio *jd_log_bio; unsigned long jd_flags; #define JDF_RECOVERY 1 unsigned int jd_jid; @@ -585,6 +565,7 @@ struct gfs2_args { unsigned int ar_errors:2; /* errors=withdraw | panic */ unsigned int ar_nobarrier:1; /* do not send barriers */ unsigned int ar_rgrplvb:1; /* use lvbs for rgrp info */ + unsigned int ar_got_rgrplvb:1; /* Was the rgrplvb opt given? */ unsigned int ar_loccookie:1; /* use location based readdir cookies */ s32 ar_commit; /* Commit interval */ @@ -821,7 +802,6 @@ struct gfs2_sbd { struct gfs2_trans *sd_log_tr; unsigned int sd_log_blks_reserved; - int sd_log_committed_revoke; atomic_t sd_log_pinned; unsigned int sd_log_num_revoke; @@ -834,24 +814,22 @@ struct gfs2_sbd { atomic_t sd_log_thresh2; atomic_t sd_log_blks_free; atomic_t sd_log_blks_needed; + atomic_t sd_log_revokes_available; wait_queue_head_t sd_log_waitq; wait_queue_head_t sd_logd_waitq; u64 sd_log_sequence; - unsigned int sd_log_head; - unsigned int sd_log_tail; int sd_log_idle; struct rw_semaphore sd_log_flush_lock; atomic_t sd_log_in_flight; - struct bio *sd_log_bio; wait_queue_head_t sd_log_flush_wait; int sd_log_error; /* First log error */ wait_queue_head_t sd_withdraw_wait; - atomic_t sd_reserving_log; - wait_queue_head_t sd_reserving_log_wait; - + unsigned int sd_log_tail; + unsigned int sd_log_flush_tail; + unsigned int sd_log_head; unsigned int sd_log_flush_head; spinlock_t sd_ail_lock; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index c1b77e8d6b1c..c9775d5c6594 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -325,7 +325,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, } if (!is_root) { - error = gfs2_permission(dir, MAY_EXEC); + error = gfs2_permission(&init_user_ns, dir, MAY_EXEC); if (error) goto out; } @@ -355,7 +355,8 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name, { int error; - error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); + error = gfs2_permission(&init_user_ns, &dip->i_inode, + MAY_WRITE | MAY_EXEC); if (error) return error; @@ -490,8 +491,8 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, di = (struct gfs2_dinode *)dibh->b_data; gfs2_dinode_out(ip, di); - di->di_major = cpu_to_be32(MAJOR(ip->i_inode.i_rdev)); - di->di_minor = cpu_to_be32(MINOR(ip->i_inode.i_rdev)); + di->di_major = cpu_to_be32(imajor(&ip->i_inode)); + di->di_minor = cpu_to_be32(iminor(&ip->i_inode)); di->__pad1 = 0; di->__pad2 = 0; di->__pad3 = 0; @@ -843,8 +844,8 @@ fail: * Returns: errno */ -static int gfs2_create(struct inode *dir, struct dentry *dentry, - umode_t mode, bool excl) +static int gfs2_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { return gfs2_create_inode(dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, excl); } @@ -951,7 +952,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (inode->i_nlink == 0) goto out_gunlock; - error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC); + error = gfs2_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC); if (error) goto out_gunlock; @@ -1068,7 +1069,8 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, if (IS_APPEND(&dip->i_inode)) return -EPERM; - error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); + error = gfs2_permission(&init_user_ns, &dip->i_inode, + MAY_WRITE | MAY_EXEC); if (error) return error; @@ -1145,7 +1147,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) if (!rgd) goto out_inodes; - gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); + gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, LM_FLAG_NODE_SCOPE, ghs + 2); error = gfs2_glock_nq(ghs); /* parent */ @@ -1204,8 +1206,8 @@ out_inodes: * Returns: errno */ -static int gfs2_symlink(struct inode *dir, struct dentry *dentry, - const char *symname) +static int gfs2_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { unsigned int size; @@ -1225,7 +1227,8 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, * Returns: errno */ -static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int gfs2_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { unsigned dsize = gfs2_max_stuffed_size(GFS2_I(dir)); return gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0); @@ -1240,8 +1243,8 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) * */ -static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, - dev_t dev) +static int gfs2_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t dev) { return gfs2_create_inode(dir, dentry, NULL, mode, dev, NULL, 0, 0); } @@ -1450,8 +1453,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, error = -ENOENT; goto out_gunlock; } - error = gfs2_glock_nq_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, - &rd_gh); + error = gfs2_glock_nq_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, + LM_FLAG_NODE_SCOPE, &rd_gh); if (error) goto out_gunlock; } @@ -1490,7 +1493,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, } } } else { - error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC); + error = gfs2_permission(&init_user_ns, ndir, + MAY_WRITE | MAY_EXEC); if (error) goto out_gunlock; @@ -1525,7 +1529,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, /* Check out the dir to be renamed */ if (dir_rename) { - error = gfs2_permission(d_inode(odentry), MAY_WRITE); + error = gfs2_permission(&init_user_ns, d_inode(odentry), + MAY_WRITE); if (error) goto out_gunlock; } @@ -1688,12 +1693,14 @@ static int gfs2_exchange(struct inode *odir, struct dentry *odentry, goto out_gunlock; if (S_ISDIR(old_mode)) { - error = gfs2_permission(odentry->d_inode, MAY_WRITE); + error = gfs2_permission(&init_user_ns, odentry->d_inode, + MAY_WRITE); if (error) goto out_gunlock; } if (S_ISDIR(new_mode)) { - error = gfs2_permission(ndentry->d_inode, MAY_WRITE); + error = gfs2_permission(&init_user_ns, ndentry->d_inode, + MAY_WRITE); if (error) goto out_gunlock; } @@ -1747,9 +1754,9 @@ out: return error; } -static int gfs2_rename2(struct inode *odir, struct dentry *odentry, - struct inode *ndir, struct dentry *ndentry, - unsigned int flags) +static int gfs2_rename2(struct user_namespace *mnt_userns, struct inode *odir, + struct dentry *odentry, struct inode *ndir, + struct dentry *ndentry, unsigned int flags) { flags &= ~RENAME_NOREPLACE; @@ -1833,7 +1840,8 @@ out: * Returns: errno */ -int gfs2_permission(struct inode *inode, int mask) +int gfs2_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask) { struct gfs2_inode *ip; struct gfs2_holder i_gh; @@ -1852,7 +1860,7 @@ int gfs2_permission(struct inode *inode, int mask) if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode)) error = -EPERM; else - error = generic_permission(inode, mask); + error = generic_permission(&init_user_ns, inode, mask); if (gfs2_holder_initialized(&i_gh)) gfs2_glock_dq_uninit(&i_gh); @@ -1861,7 +1869,7 @@ int gfs2_permission(struct inode *inode, int mask) static int __gfs2_setattr_simple(struct inode *inode, struct iattr *attr) { - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); return 0; } @@ -1963,7 +1971,8 @@ out: * Returns: errno */ -static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) +static int gfs2_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); @@ -1982,7 +1991,7 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) goto error; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) goto error; @@ -1993,7 +2002,8 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) else { error = gfs2_setattr_simple(inode, attr); if (!error && attr->ia_valid & ATTR_MODE) - error = posix_acl_chmod(inode, inode->i_mode); + error = posix_acl_chmod(&init_user_ns, inode, + inode->i_mode); } error: @@ -2007,6 +2017,7 @@ out: /** * gfs2_getattr - Read out an inode's attributes + * @mnt_userns: user namespace of the mount the inode was found from * @path: Object to query * @stat: The inode's stats * @request_mask: Mask of STATX_xxx flags indicating the caller's interests @@ -2021,7 +2032,8 @@ out: * Returns: errno */ -static int gfs2_getattr(const struct path *path, struct kstat *stat, +static int gfs2_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); @@ -2049,7 +2061,7 @@ static int gfs2_getattr(const struct path *path, struct kstat *stat, STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); if (gfs2_holder_initialized(&gh)) gfs2_glock_dq_uninit(&gh); diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 8073b8d2c7fa..c447bd5b3017 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -99,7 +99,8 @@ extern int gfs2_inode_refresh(struct gfs2_inode *ip); extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, int is_root); -extern int gfs2_permission(struct inode *inode, int mask); +extern int gfs2_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask); extern int gfs2_setattr_simple(struct inode *inode, struct iattr *attr); extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 9f2b5609f225..153272f82984 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -284,7 +284,6 @@ static void gdlm_put_lock(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct lm_lockstruct *ls = &sdp->sd_lockstruct; - int lvb_needs_unlock = 0; int error; if (gl->gl_lksb.sb_lkid == 0) { @@ -297,13 +296,10 @@ static void gdlm_put_lock(struct gfs2_glock *gl) gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); gfs2_update_request_times(gl); - /* don't want to skip dlm_unlock writing the lvb when lock is ex */ - - if (gl->gl_lksb.sb_lvbptr && (gl->gl_state == LM_ST_EXCLUSIVE)) - lvb_needs_unlock = 1; + /* don't want to skip dlm_unlock writing the lvb when lock has one */ if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) && - !lvb_needs_unlock) { + !gl->gl_lksb.sb_lvbptr) { gfs2_glock_free(gl); return; } diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 2e9314091c81..16937ebb2a3e 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -50,10 +50,12 @@ unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct) unsigned int blks; unsigned int first, second; + /* The initial struct gfs2_log_descriptor block */ blks = 1; first = sdp->sd_ldptrs; if (nstruct > first) { + /* Subsequent struct gfs2_meta_header blocks */ second = sdp->sd_inptrs; blks += DIV_ROUND_UP(nstruct - first, second); } @@ -89,7 +91,7 @@ void gfs2_remove_from_ail(struct gfs2_bufdata *bd) static int gfs2_ail1_start_one(struct gfs2_sbd *sdp, struct writeback_control *wbc, - struct gfs2_trans *tr) + struct gfs2_trans *tr, struct blk_plug *plug) __releases(&sdp->sd_ail_lock) __acquires(&sdp->sd_ail_lock) { @@ -131,6 +133,11 @@ __acquires(&sdp->sd_ail_lock) continue; spin_unlock(&sdp->sd_ail_lock); ret = generic_writepages(mapping, wbc); + if (need_resched()) { + blk_finish_plug(plug); + cond_resched(); + blk_start_plug(plug); + } spin_lock(&sdp->sd_ail_lock); if (ret == -ENODATA) /* if a jdata write into a new hole */ ret = 0; /* ignore it */ @@ -205,7 +212,7 @@ restart: list_for_each_entry_reverse(tr, head, tr_list) { if (wbc->nr_to_write <= 0) break; - ret = gfs2_ail1_start_one(sdp, wbc, tr); + ret = gfs2_ail1_start_one(sdp, wbc, tr, &plug); if (ret) { if (ret == -EBUSY) goto restart; @@ -240,6 +247,45 @@ static void gfs2_ail1_start(struct gfs2_sbd *sdp) return gfs2_ail1_flush(sdp, &wbc); } +static void gfs2_log_update_flush_tail(struct gfs2_sbd *sdp) +{ + unsigned int new_flush_tail = sdp->sd_log_head; + struct gfs2_trans *tr; + + if (!list_empty(&sdp->sd_ail1_list)) { + tr = list_last_entry(&sdp->sd_ail1_list, + struct gfs2_trans, tr_list); + new_flush_tail = tr->tr_first; + } + sdp->sd_log_flush_tail = new_flush_tail; +} + +static void gfs2_log_update_head(struct gfs2_sbd *sdp) +{ + unsigned int new_head = sdp->sd_log_flush_head; + + if (sdp->sd_log_flush_tail == sdp->sd_log_head) + sdp->sd_log_flush_tail = new_head; + sdp->sd_log_head = new_head; +} + +/** + * gfs2_ail_empty_tr - empty one of the ail lists of a transaction + */ + +static void gfs2_ail_empty_tr(struct gfs2_sbd *sdp, struct gfs2_trans *tr, + struct list_head *head) +{ + struct gfs2_bufdata *bd; + + while (!list_empty(head)) { + bd = list_first_entry(head, struct gfs2_bufdata, + bd_ail_st_list); + gfs2_assert(sdp, bd->bd_tr == tr); + gfs2_remove_from_ail(bd); + } +} + /** * gfs2_ail1_empty_one - Check whether or not a trans in the AIL has been synced * @sdp: the filesystem @@ -315,6 +361,7 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int max_revokes) else oldest_tr = 0; } + gfs2_log_update_flush_tail(sdp); ret = list_empty(&sdp->sd_ail1_list); spin_unlock(&sdp->sd_ail_lock); @@ -348,47 +395,69 @@ static void gfs2_ail1_wait(struct gfs2_sbd *sdp) spin_unlock(&sdp->sd_ail_lock); } -/** - * gfs2_ail_empty_tr - empty one of the ail lists for a transaction - */ - -static void gfs2_ail_empty_tr(struct gfs2_sbd *sdp, struct gfs2_trans *tr, - struct list_head *head) +static void __ail2_empty(struct gfs2_sbd *sdp, struct gfs2_trans *tr) { - struct gfs2_bufdata *bd; - - while (!list_empty(head)) { - bd = list_first_entry(head, struct gfs2_bufdata, - bd_ail_st_list); - gfs2_assert(sdp, bd->bd_tr == tr); - gfs2_remove_from_ail(bd); - } + gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail2_list); + list_del(&tr->tr_list); + gfs2_assert_warn(sdp, list_empty(&tr->tr_ail1_list)); + gfs2_assert_warn(sdp, list_empty(&tr->tr_ail2_list)); + gfs2_trans_free(sdp, tr); } static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) { - struct gfs2_trans *tr, *safe; + struct list_head *ail2_list = &sdp->sd_ail2_list; unsigned int old_tail = sdp->sd_log_tail; - int wrap = (new_tail < old_tail); - int a, b, rm; + struct gfs2_trans *tr, *safe; spin_lock(&sdp->sd_ail_lock); + if (old_tail <= new_tail) { + list_for_each_entry_safe(tr, safe, ail2_list, tr_list) { + if (old_tail <= tr->tr_first && tr->tr_first < new_tail) + __ail2_empty(sdp, tr); + } + } else { + list_for_each_entry_safe(tr, safe, ail2_list, tr_list) { + if (old_tail <= tr->tr_first || tr->tr_first < new_tail) + __ail2_empty(sdp, tr); + } + } + spin_unlock(&sdp->sd_ail_lock); +} - list_for_each_entry_safe(tr, safe, &sdp->sd_ail2_list, tr_list) { - a = (old_tail <= tr->tr_first); - b = (tr->tr_first < new_tail); - rm = (wrap) ? (a || b) : (a && b); - if (!rm) - continue; +/** + * gfs2_log_is_empty - Check if the log is empty + * @sdp: The GFS2 superblock + */ - gfs2_ail_empty_tr(sdp, tr, &tr->tr_ail2_list); - list_del(&tr->tr_list); - gfs2_assert_warn(sdp, list_empty(&tr->tr_ail1_list)); - gfs2_assert_warn(sdp, list_empty(&tr->tr_ail2_list)); - gfs2_trans_free(sdp, tr); +bool gfs2_log_is_empty(struct gfs2_sbd *sdp) { + return atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks; +} + +static bool __gfs2_log_try_reserve_revokes(struct gfs2_sbd *sdp, unsigned int revokes) +{ + unsigned int available; + + available = atomic_read(&sdp->sd_log_revokes_available); + while (available >= revokes) { + if (atomic_try_cmpxchg(&sdp->sd_log_revokes_available, + &available, available - revokes)) + return true; } + return false; +} - spin_unlock(&sdp->sd_ail_lock); +/** + * gfs2_log_release_revokes - Release a given number of revokes + * @sdp: The GFS2 superblock + * @revokes: The number of revokes to release + * + * sdp->sd_log_flush_lock must be held. + */ +void gfs2_log_release_revokes(struct gfs2_sbd *sdp, unsigned int revokes) +{ + if (revokes) + atomic_add(revokes, &sdp->sd_log_revokes_available); } /** @@ -400,86 +469,141 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail) void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks) { - atomic_add(blks, &sdp->sd_log_blks_free); trace_gfs2_log_blocks(sdp, blks); gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= sdp->sd_jdesc->jd_blocks); - up_read(&sdp->sd_log_flush_lock); + if (atomic_read(&sdp->sd_log_blks_needed)) + wake_up(&sdp->sd_log_waitq); } /** - * gfs2_log_reserve - Make a log reservation + * __gfs2_log_try_reserve - Try to make a log reservation + * @sdp: The GFS2 superblock + * @blks: The number of blocks to reserve + * @taboo_blks: The number of blocks to leave free + * + * Try to do the same as __gfs2_log_reserve(), but fail if no more log + * space is immediately available. + */ +static bool __gfs2_log_try_reserve(struct gfs2_sbd *sdp, unsigned int blks, + unsigned int taboo_blks) +{ + unsigned wanted = blks + taboo_blks; + unsigned int free_blocks; + + free_blocks = atomic_read(&sdp->sd_log_blks_free); + while (free_blocks >= wanted) { + if (atomic_try_cmpxchg(&sdp->sd_log_blks_free, &free_blocks, + free_blocks - blks)) { + trace_gfs2_log_blocks(sdp, -blks); + return true; + } + } + return false; +} + +/** + * __gfs2_log_reserve - Make a log reservation * @sdp: The GFS2 superblock * @blks: The number of blocks to reserve + * @taboo_blks: The number of blocks to leave free * - * Note that we never give out the last few blocks of the journal. Thats - * due to the fact that there is a small number of header blocks - * associated with each log flush. The exact number can't be known until - * flush time, so we ensure that we have just enough free blocks at all - * times to avoid running out during a log flush. + * @taboo_blks is set to 0 for logd, and to GFS2_LOG_FLUSH_MIN_BLOCKS + * for all other processes. This ensures that when the log is almost full, + * logd will still be able to call gfs2_log_flush one more time without + * blocking, which will advance the tail and make some more log space + * available. * * We no longer flush the log here, instead we wake up logd to do that * for us. To avoid the thundering herd and to ensure that we deal fairly * with queued waiters, we use an exclusive wait. This means that when we * get woken with enough journal space to get our reservation, we need to * wake the next waiter on the list. - * - * Returns: errno */ -int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks) +static void __gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks, + unsigned int taboo_blks) { - int ret = 0; - unsigned reserved_blks = 7 * (4096 / sdp->sd_vfs->s_blocksize); - unsigned wanted = blks + reserved_blks; - DEFINE_WAIT(wait); - int did_wait = 0; + unsigned wanted = blks + taboo_blks; unsigned int free_blocks; - if (gfs2_assert_warn(sdp, blks) || - gfs2_assert_warn(sdp, blks <= sdp->sd_jdesc->jd_blocks)) - return -EINVAL; atomic_add(blks, &sdp->sd_log_blks_needed); -retry: - free_blocks = atomic_read(&sdp->sd_log_blks_free); - if (unlikely(free_blocks <= wanted)) { - do { - prepare_to_wait_exclusive(&sdp->sd_log_waitq, &wait, - TASK_UNINTERRUPTIBLE); + for (;;) { + if (current != sdp->sd_logd_process) wake_up(&sdp->sd_logd_waitq); - did_wait = 1; - if (atomic_read(&sdp->sd_log_blks_free) <= wanted) - io_schedule(); - free_blocks = atomic_read(&sdp->sd_log_blks_free); - } while(free_blocks <= wanted); - finish_wait(&sdp->sd_log_waitq, &wait); - } - atomic_inc(&sdp->sd_reserving_log); - if (atomic_cmpxchg(&sdp->sd_log_blks_free, free_blocks, - free_blocks - blks) != free_blocks) { - if (atomic_dec_and_test(&sdp->sd_reserving_log)) - wake_up(&sdp->sd_reserving_log_wait); - goto retry; + io_wait_event(sdp->sd_log_waitq, + (free_blocks = atomic_read(&sdp->sd_log_blks_free), + free_blocks >= wanted)); + do { + if (atomic_try_cmpxchg(&sdp->sd_log_blks_free, + &free_blocks, + free_blocks - blks)) + goto reserved; + } while (free_blocks >= wanted); } - atomic_sub(blks, &sdp->sd_log_blks_needed); - trace_gfs2_log_blocks(sdp, -blks); - /* - * If we waited, then so might others, wake them up _after_ we get - * our share of the log. - */ - if (unlikely(did_wait)) +reserved: + trace_gfs2_log_blocks(sdp, -blks); + if (atomic_sub_return(blks, &sdp->sd_log_blks_needed)) wake_up(&sdp->sd_log_waitq); +} + +/** + * gfs2_log_try_reserve - Try to make a log reservation + * @sdp: The GFS2 superblock + * @tr: The transaction + * @extra_revokes: The number of additional revokes reserved (output) + * + * This is similar to gfs2_log_reserve, but sdp->sd_log_flush_lock must be + * held for correct revoke accounting. + */ - down_read(&sdp->sd_log_flush_lock); - if (unlikely(!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))) { - gfs2_log_release(sdp, blks); - ret = -EROFS; +bool gfs2_log_try_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr, + unsigned int *extra_revokes) +{ + unsigned int blks = tr->tr_reserved; + unsigned int revokes = tr->tr_revokes; + unsigned int revoke_blks = 0; + + *extra_revokes = 0; + if (revokes && !__gfs2_log_try_reserve_revokes(sdp, revokes)) { + revoke_blks = DIV_ROUND_UP(revokes, sdp->sd_inptrs); + *extra_revokes = revoke_blks * sdp->sd_inptrs - revokes; + blks += revoke_blks; } - if (atomic_dec_and_test(&sdp->sd_reserving_log)) - wake_up(&sdp->sd_reserving_log_wait); - return ret; + if (!blks) + return true; + if (__gfs2_log_try_reserve(sdp, blks, GFS2_LOG_FLUSH_MIN_BLOCKS)) + return true; + if (!revoke_blks) + gfs2_log_release_revokes(sdp, revokes); + return false; +} + +/** + * gfs2_log_reserve - Make a log reservation + * @sdp: The GFS2 superblock + * @tr: The transaction + * @extra_revokes: The number of additional revokes reserved (output) + * + * sdp->sd_log_flush_lock must not be held. + */ + +void gfs2_log_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr, + unsigned int *extra_revokes) +{ + unsigned int blks = tr->tr_reserved; + unsigned int revokes = tr->tr_revokes; + unsigned int revoke_blks = 0; + + *extra_revokes = 0; + if (revokes) { + revoke_blks = DIV_ROUND_UP(revokes, sdp->sd_inptrs); + *extra_revokes = revoke_blks * sdp->sd_inptrs - revokes; + blks += revoke_blks; + } + __gfs2_log_reserve(sdp, blks, GFS2_LOG_FLUSH_MIN_BLOCKS); } /** @@ -507,24 +631,20 @@ static inline unsigned int log_distance(struct gfs2_sbd *sdp, unsigned int newer } /** - * calc_reserved - Calculate the number of blocks to reserve when - * refunding a transaction's unused buffers. + * calc_reserved - Calculate the number of blocks to keep reserved * @sdp: The GFS2 superblock * * This is complex. We need to reserve room for all our currently used - * metadata buffers (e.g. normal file I/O rewriting file time stamps) and - * all our journaled data buffers for journaled files (e.g. files in the + * metadata blocks (e.g. normal file I/O rewriting file time stamps) and + * all our journaled data blocks for journaled files (e.g. files in the * meta_fs like rindex, or files for which chattr +j was done.) - * If we don't reserve enough space, gfs2_log_refund and gfs2_log_flush - * will count it as free space (sd_log_blks_free) and corruption will follow. + * If we don't reserve enough space, corruption will follow. * - * We can have metadata bufs and jdata bufs in the same journal. So each - * type gets its own log header, for which we need to reserve a block. - * In fact, each type has the potential for needing more than one header - * in cases where we have more buffers than will fit on a journal page. + * We can have metadata blocks and jdata blocks in the same journal. Each + * type gets its own log descriptor, for which we need to reserve a block. + * In fact, each type has the potential for needing more than one log descriptor + * in cases where we have more blocks than will fit in a log descriptor. * Metadata journal entries take up half the space of journaled buffer entries. - * Thus, metadata entries have buf_limit (502) and journaled buffers have - * databuf_limit (251) before they cause a wrap around. * * Also, we need to reserve blocks for revoke journal entries and one for an * overall header for the lot. @@ -533,59 +653,29 @@ static inline unsigned int log_distance(struct gfs2_sbd *sdp, unsigned int newer */ static unsigned int calc_reserved(struct gfs2_sbd *sdp) { - unsigned int reserved = 0; - unsigned int mbuf; - unsigned int dbuf; + unsigned int reserved = GFS2_LOG_FLUSH_MIN_BLOCKS; + unsigned int blocks; struct gfs2_trans *tr = sdp->sd_log_tr; if (tr) { - mbuf = tr->tr_num_buf_new - tr->tr_num_buf_rm; - dbuf = tr->tr_num_databuf_new - tr->tr_num_databuf_rm; - reserved = mbuf + dbuf; - /* Account for header blocks */ - reserved += DIV_ROUND_UP(mbuf, buf_limit(sdp)); - reserved += DIV_ROUND_UP(dbuf, databuf_limit(sdp)); + blocks = tr->tr_num_buf_new - tr->tr_num_buf_rm; + reserved += blocks + DIV_ROUND_UP(blocks, buf_limit(sdp)); + blocks = tr->tr_num_databuf_new - tr->tr_num_databuf_rm; + reserved += blocks + DIV_ROUND_UP(blocks, databuf_limit(sdp)); } - - if (sdp->sd_log_committed_revoke > 0) - reserved += gfs2_struct2blk(sdp, sdp->sd_log_committed_revoke); - /* One for the overall header */ - if (reserved) - reserved++; return reserved; } -static unsigned int current_tail(struct gfs2_sbd *sdp) -{ - struct gfs2_trans *tr; - unsigned int tail; - - spin_lock(&sdp->sd_ail_lock); - - if (list_empty(&sdp->sd_ail1_list)) { - tail = sdp->sd_log_head; - } else { - tr = list_last_entry(&sdp->sd_ail1_list, struct gfs2_trans, - tr_list); - tail = tr->tr_first; - } - - spin_unlock(&sdp->sd_ail_lock); - - return tail; -} - -static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail) +static void log_pull_tail(struct gfs2_sbd *sdp) { - unsigned int dist = log_distance(sdp, new_tail, sdp->sd_log_tail); + unsigned int new_tail = sdp->sd_log_flush_tail; + unsigned int dist; + if (new_tail == sdp->sd_log_tail) + return; + dist = log_distance(sdp, new_tail, sdp->sd_log_tail); ail2_empty(sdp, new_tail); - - atomic_add(dist, &sdp->sd_log_blks_free); - trace_gfs2_log_blocks(sdp, dist); - gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= - sdp->sd_jdesc->jd_blocks); - + gfs2_log_release(sdp, dist); sdp->sd_log_tail = new_tail; } @@ -698,7 +788,7 @@ void gfs2_glock_remove_revoke(struct gfs2_glock *gl) } /** - * gfs2_write_revokes - Add as many revokes to the system transaction as we can + * gfs2_flush_revokes - Add as many revokes to the system transaction as we can * @sdp: The GFS2 superblock * * Our usual strategy is to defer writing revokes as much as we can in the hope @@ -709,38 +799,14 @@ void gfs2_glock_remove_revoke(struct gfs2_glock *gl) * been written back. This will basically come at no cost now, and will save * us from having to keep track of those blocks on the AIL2 list later. */ -void gfs2_write_revokes(struct gfs2_sbd *sdp) +void gfs2_flush_revokes(struct gfs2_sbd *sdp) { /* number of revokes we still have room for */ - int max_revokes = (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_log_descriptor)) / sizeof(u64); + unsigned int max_revokes = atomic_read(&sdp->sd_log_revokes_available); gfs2_log_lock(sdp); - while (sdp->sd_log_num_revoke > max_revokes) - max_revokes += (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header)) / sizeof(u64); - max_revokes -= sdp->sd_log_num_revoke; - if (!sdp->sd_log_num_revoke) { - atomic_dec(&sdp->sd_log_blks_free); - /* If no blocks have been reserved, we need to also - * reserve a block for the header */ - if (!sdp->sd_log_blks_reserved) { - atomic_dec(&sdp->sd_log_blks_free); - trace_gfs2_log_blocks(sdp, -2); - } else { - trace_gfs2_log_blocks(sdp, -1); - } - } gfs2_ail1_empty(sdp, max_revokes); gfs2_log_unlock(sdp); - - if (!sdp->sd_log_num_revoke) { - atomic_inc(&sdp->sd_log_blks_free); - if (!sdp->sd_log_blks_reserved) { - atomic_inc(&sdp->sd_log_blks_free); - trace_gfs2_log_blocks(sdp, 2); - } else { - trace_gfs2_log_blocks(sdp, 1); - } - } } /** @@ -769,7 +835,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, u64 dblock; if (gfs2_withdrawn(sdp)) - goto out; + return; page = mempool_alloc(gfs2_page_pool, GFP_NOIO); lh = page_address(page); @@ -822,10 +888,8 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, sb->s_blocksize - LH_V1_SIZE - 4); lh->lh_crc = cpu_to_be32(crc); - gfs2_log_write(sdp, page, sb->s_blocksize, 0, dblock); - gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE | op_flags); -out: - log_flush_wait(sdp); + gfs2_log_write(sdp, jd, page, sb->s_blocksize, 0, dblock); + gfs2_log_submit_bio(&jd->jd_log_bio, REQ_OP_WRITE | op_flags); } /** @@ -838,25 +902,24 @@ out: static void log_write_header(struct gfs2_sbd *sdp, u32 flags) { - unsigned int tail; int op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC; enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); gfs2_assert_withdraw(sdp, (state != SFS_FROZEN)); - tail = current_tail(sdp); if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) { gfs2_ordered_wait(sdp); log_flush_wait(sdp); op_flags = REQ_SYNC | REQ_META | REQ_PRIO; } - sdp->sd_log_idle = (tail == sdp->sd_log_flush_head); - gfs2_write_log_header(sdp, sdp->sd_jdesc, sdp->sd_log_sequence++, tail, - sdp->sd_log_flush_head, flags, op_flags); + sdp->sd_log_idle = (sdp->sd_log_flush_tail == sdp->sd_log_flush_head); + gfs2_write_log_header(sdp, sdp->sd_jdesc, sdp->sd_log_sequence++, + sdp->sd_log_flush_tail, sdp->sd_log_flush_head, + flags, op_flags); gfs2_log_incr_head(sdp); - - if (sdp->sd_log_tail != tail) - log_pull_tail(sdp, tail); + log_flush_wait(sdp); + log_pull_tail(sdp); + gfs2_log_update_head(sdp); } /** @@ -956,10 +1019,15 @@ static void trans_drain(struct gfs2_trans *tr) void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) { struct gfs2_trans *tr = NULL; + unsigned int reserved_blocks = 0, used_blocks = 0; enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); + unsigned int first_log_head; + unsigned int reserved_revokes = 0; down_write(&sdp->sd_log_flush_lock); + trace_gfs2_log_flush(sdp, 1, flags); +repeat: /* * Do this check while holding the log_flush_lock to prevent new * buffers from being added to the ail via gfs2_pin() @@ -970,28 +1038,47 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) /* Log might have been flushed while we waited for the flush lock */ if (gl && !test_bit(GLF_LFLUSH, &gl->gl_flags)) goto out; - trace_gfs2_log_flush(sdp, 1, flags); - if (flags & GFS2_LOG_HEAD_FLUSH_SHUTDOWN) - clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + first_log_head = sdp->sd_log_head; + sdp->sd_log_flush_head = first_log_head; - sdp->sd_log_flush_head = sdp->sd_log_head; tr = sdp->sd_log_tr; - if (tr) { - sdp->sd_log_tr = NULL; - tr->tr_first = sdp->sd_log_flush_head; - if (unlikely (state == SFS_FROZEN)) - if (gfs2_assert_withdraw_delayed(sdp, - !tr->tr_num_buf_new && !tr->tr_num_databuf_new)) - goto out_withdraw; + if (tr || sdp->sd_log_num_revoke) { + if (reserved_blocks) + gfs2_log_release(sdp, reserved_blocks); + reserved_blocks = sdp->sd_log_blks_reserved; + reserved_revokes = sdp->sd_log_num_revoke; + if (tr) { + sdp->sd_log_tr = NULL; + tr->tr_first = first_log_head; + if (unlikely (state == SFS_FROZEN)) { + if (gfs2_assert_withdraw_delayed(sdp, + !tr->tr_num_buf_new && !tr->tr_num_databuf_new)) + goto out_withdraw; + } + } + } else if (!reserved_blocks) { + unsigned int taboo_blocks = GFS2_LOG_FLUSH_MIN_BLOCKS; + + reserved_blocks = GFS2_LOG_FLUSH_MIN_BLOCKS; + if (current == sdp->sd_logd_process) + taboo_blocks = 0; + + if (!__gfs2_log_try_reserve(sdp, reserved_blocks, taboo_blocks)) { + up_write(&sdp->sd_log_flush_lock); + __gfs2_log_reserve(sdp, reserved_blocks, taboo_blocks); + down_write(&sdp->sd_log_flush_lock); + goto repeat; + } + BUG_ON(sdp->sd_log_num_revoke); } + if (flags & GFS2_LOG_HEAD_FLUSH_SHUTDOWN) + clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + if (unlikely(state == SFS_FROZEN)) - if (gfs2_assert_withdraw_delayed(sdp, !sdp->sd_log_num_revoke)) + if (gfs2_assert_withdraw_delayed(sdp, !reserved_revokes)) goto out_withdraw; - if (gfs2_assert_withdraw_delayed(sdp, - sdp->sd_log_num_revoke == sdp->sd_log_committed_revoke)) - goto out_withdraw; gfs2_ordered_write(sdp); if (gfs2_withdrawn(sdp)) @@ -999,16 +1086,13 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) lops_before_commit(sdp, tr); if (gfs2_withdrawn(sdp)) goto out_withdraw; - gfs2_log_submit_bio(&sdp->sd_log_bio, REQ_OP_WRITE); + gfs2_log_submit_bio(&sdp->sd_jdesc->jd_log_bio, REQ_OP_WRITE); if (gfs2_withdrawn(sdp)) goto out_withdraw; if (sdp->sd_log_head != sdp->sd_log_flush_head) { - log_flush_wait(sdp); log_write_header(sdp, flags); - } else if (sdp->sd_log_tail != current_tail(sdp) && !sdp->sd_log_idle){ - atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ - trace_gfs2_log_blocks(sdp, -1); + } else if (sdp->sd_log_tail != sdp->sd_log_flush_tail && !sdp->sd_log_idle) { log_write_header(sdp, flags); } if (gfs2_withdrawn(sdp)) @@ -1016,9 +1100,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) lops_after_commit(sdp, tr); gfs2_log_lock(sdp); - sdp->sd_log_head = sdp->sd_log_flush_head; sdp->sd_log_blks_reserved = 0; - sdp->sd_log_committed_revoke = 0; spin_lock(&sdp->sd_ail_lock); if (tr && !list_empty(&tr->tr_ail1_list)) { @@ -1033,10 +1115,7 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) empty_ail1_list(sdp); if (gfs2_withdrawn(sdp)) goto out_withdraw; - atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */ - trace_gfs2_log_blocks(sdp, -1); log_write_header(sdp, flags); - sdp->sd_log_head = sdp->sd_log_flush_head; } if (flags & (GFS2_LOG_HEAD_FLUSH_SHUTDOWN | GFS2_LOG_HEAD_FLUSH_FREEZE)) @@ -1046,12 +1125,22 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags) } out_end: - trace_gfs2_log_flush(sdp, 0, flags); + used_blocks = log_distance(sdp, sdp->sd_log_flush_head, first_log_head); + reserved_revokes += atomic_read(&sdp->sd_log_revokes_available); + atomic_set(&sdp->sd_log_revokes_available, sdp->sd_ldptrs); + gfs2_assert_withdraw(sdp, reserved_revokes % sdp->sd_inptrs == sdp->sd_ldptrs); + if (reserved_revokes > sdp->sd_ldptrs) + reserved_blocks += (reserved_revokes - sdp->sd_ldptrs) / sdp->sd_inptrs; out: + if (used_blocks != reserved_blocks) { + gfs2_assert_withdraw_delayed(sdp, used_blocks < reserved_blocks); + gfs2_log_release(sdp, reserved_blocks - used_blocks); + } up_write(&sdp->sd_log_flush_lock); gfs2_trans_free(sdp, tr); if (gfs2_withdrawing(sdp)) gfs2_withdraw(sdp); + trace_gfs2_log_flush(sdp, 0, flags); return; out_withdraw: @@ -1087,8 +1176,8 @@ static void gfs2_merge_trans(struct gfs2_sbd *sdp, struct gfs2_trans *new) old->tr_num_databuf_new += new->tr_num_databuf_new; old->tr_num_buf_rm += new->tr_num_buf_rm; old->tr_num_databuf_rm += new->tr_num_databuf_rm; + old->tr_revokes += new->tr_revokes; old->tr_num_revoke += new->tr_num_revoke; - old->tr_num_revoke_rm += new->tr_num_revoke_rm; list_splice_tail_init(&new->tr_databuf, &old->tr_databuf); list_splice_tail_init(&new->tr_buf, &old->tr_buf); @@ -1110,20 +1199,17 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr) if (sdp->sd_log_tr) { gfs2_merge_trans(sdp, tr); } else if (tr->tr_num_buf_new || tr->tr_num_databuf_new) { - gfs2_assert_withdraw(sdp, test_bit(TR_ALLOCED, &tr->tr_flags)); + gfs2_assert_withdraw(sdp, !test_bit(TR_ONSTACK, &tr->tr_flags)); sdp->sd_log_tr = tr; set_bit(TR_ATTACHED, &tr->tr_flags); } - sdp->sd_log_committed_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; reserved = calc_reserved(sdp); maxres = sdp->sd_log_blks_reserved + tr->tr_reserved; gfs2_assert_withdraw(sdp, maxres >= reserved); unused = maxres - reserved; - atomic_add(unused, &sdp->sd_log_blks_free); - trace_gfs2_log_blocks(sdp, unused); - gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= - sdp->sd_jdesc->jd_blocks); + if (unused) + gfs2_log_release(sdp, unused); sdp->sd_log_blks_reserved = reserved; gfs2_log_unlock(sdp); @@ -1166,15 +1252,11 @@ static void gfs2_log_shutdown(struct gfs2_sbd *sdp) gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list)); - sdp->sd_log_flush_head = sdp->sd_log_head; - log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT | GFS2_LFC_SHUTDOWN); + log_pull_tail(sdp); gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail); gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list)); - - sdp->sd_log_head = sdp->sd_log_flush_head; - sdp->sd_log_tail = sdp->sd_log_head; } static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp) @@ -1208,7 +1290,6 @@ int gfs2_logd(void *data) struct gfs2_sbd *sdp = data; unsigned long t = 1; DEFINE_WAIT(wait); - bool did_flush; while (!kthread_should_stop()) { @@ -1227,12 +1308,10 @@ int gfs2_logd(void *data) continue; } - did_flush = false; if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { gfs2_ail1_empty(sdp, 0); gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | - GFS2_LFC_LOGD_JFLUSH_REQD); - did_flush = true; + GFS2_LFC_LOGD_JFLUSH_REQD); } if (gfs2_ail_flush_reqd(sdp)) { @@ -1240,13 +1319,9 @@ int gfs2_logd(void *data) gfs2_ail1_wait(sdp); gfs2_ail1_empty(sdp, 0); gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | - GFS2_LFC_LOGD_AIL_FLUSH_REQD); - did_flush = true; + GFS2_LFC_LOGD_AIL_FLUSH_REQD); } - if (!gfs2_ail_flush_reqd(sdp) || did_flush) - wake_up(&sdp->sd_log_waitq); - t = gfs2_tune_get(sdp, gt_logd_secs) * HZ; try_to_freeze(); diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 79f97290146e..eea58015710e 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -13,6 +13,13 @@ #include "incore.h" #include "inode.h" +/* + * The minimum amount of log space required for a log flush is one block for + * revokes and one block for the log header. Log flushes other than + * GFS2_LOG_HEAD_FLUSH_NORMAL may write one or two more log headers. + */ +#define GFS2_LOG_FLUSH_MIN_BLOCKS 4 + /** * gfs2_log_lock - acquire the right to mess with the log manager * @sdp: the filesystem @@ -43,7 +50,9 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp, if (++value == sdp->sd_jdesc->jd_blocks) { value = 0; } - sdp->sd_log_head = sdp->sd_log_tail = value; + sdp->sd_log_tail = value; + sdp->sd_log_flush_tail = value; + sdp->sd_log_head = value; } static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) @@ -64,8 +73,13 @@ static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) extern void gfs2_ordered_del_inode(struct gfs2_inode *ip); extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct); extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd); +extern bool gfs2_log_is_empty(struct gfs2_sbd *sdp); +extern void gfs2_log_release_revokes(struct gfs2_sbd *sdp, unsigned int revokes); extern void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks); -extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks); +extern bool gfs2_log_try_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr, + unsigned int *extra_revokes); +extern void gfs2_log_reserve(struct gfs2_sbd *sdp, struct gfs2_trans *tr, + unsigned int *extra_revokes); extern void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, u64 seq, u32 tail, u32 lblock, u32 flags, int op_flags); @@ -78,6 +92,6 @@ extern void log_flush_wait(struct gfs2_sbd *sdp); extern int gfs2_logd(void *data); extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); extern void gfs2_glock_remove_revoke(struct gfs2_glock *gl); -extern void gfs2_write_revokes(struct gfs2_sbd *sdp); +extern void gfs2_flush_revokes(struct gfs2_sbd *sdp); #endif /* __LOG_DOT_H__ */ diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 3922b26264f5..dc1b93a877c6 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -76,15 +76,20 @@ static void maybe_release_space(struct gfs2_bufdata *bd) unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number; struct gfs2_bitmap *bi = rgd->rd_bits + index; + rgrp_lock_local(rgd); if (bi->bi_clone == NULL) - return; + goto out; if (sdp->sd_args.ar_discard) gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi, 1, NULL); memcpy(bi->bi_clone + bi->bi_offset, bd->bd_bh->b_data + bi->bi_offset, bi->bi_bytes); clear_bit(GBF_FULL, &bi->bi_flags); rgd->rd_free_clone = rgd->rd_free; + BUG_ON(rgd->rd_free_clone < rgd->rd_reserved); rgd->rd_extfail_pt = rgd->rd_free; + +out: + rgrp_unlock_local(rgd); } /** @@ -322,17 +327,18 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno, * then add the page segment to that. */ -void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page, - unsigned size, unsigned offset, u64 blkno) +void gfs2_log_write(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, + struct page *page, unsigned size, unsigned offset, + u64 blkno) { struct bio *bio; int ret; - bio = gfs2_log_get_bio(sdp, blkno, &sdp->sd_log_bio, REQ_OP_WRITE, + bio = gfs2_log_get_bio(sdp, blkno, &jd->jd_log_bio, REQ_OP_WRITE, gfs2_end_log_write, false); ret = bio_add_page(bio, page, size, offset); if (ret == 0) { - bio = gfs2_log_get_bio(sdp, blkno, &sdp->sd_log_bio, + bio = gfs2_log_get_bio(sdp, blkno, &jd->jd_log_bio, REQ_OP_WRITE, gfs2_end_log_write, true); ret = bio_add_page(bio, page, size, offset); WARN_ON(ret == 0); @@ -355,7 +361,8 @@ static void gfs2_log_write_bh(struct gfs2_sbd *sdp, struct buffer_head *bh) dblock = gfs2_log_bmap(sdp->sd_jdesc, sdp->sd_log_flush_head); gfs2_log_incr_head(sdp); - gfs2_log_write(sdp, bh->b_page, bh->b_size, bh_offset(bh), dblock); + gfs2_log_write(sdp, sdp->sd_jdesc, bh->b_page, bh->b_size, + bh_offset(bh), dblock); } /** @@ -369,14 +376,14 @@ static void gfs2_log_write_bh(struct gfs2_sbd *sdp, struct buffer_head *bh) * the page may be freed at any time. */ -void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page) +static void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page) { struct super_block *sb = sdp->sd_vfs; u64 dblock; dblock = gfs2_log_bmap(sdp->sd_jdesc, sdp->sd_log_flush_head); gfs2_log_incr_head(sdp); - gfs2_log_write(sdp, page, sb->s_blocksize, 0, dblock); + gfs2_log_write(sdp, sdp->sd_jdesc, page, sb->s_blocksize, 0, dblock); } /** @@ -845,7 +852,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) struct page *page; unsigned int length; - gfs2_write_revokes(sdp); + gfs2_flush_revokes(sdp); if (!sdp->sd_log_num_revoke) return; @@ -857,7 +864,6 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) sdp->sd_log_num_revoke--; if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) { - gfs2_log_write_page(sdp, page); page = mempool_alloc(gfs2_page_pool, GFP_NOIO); mh = page_address(page); diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index fbdbb08dcec6..31b6dd0d2e5d 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -10,37 +10,24 @@ #include <linux/list.h> #include "incore.h" -#define BUF_OFFSET \ - ((sizeof(struct gfs2_log_descriptor) + sizeof(__be64) - 1) & \ - ~(sizeof(__be64) - 1)) -#define DATABUF_OFFSET \ - ((sizeof(struct gfs2_log_descriptor) + (2 * sizeof(__be64) - 1)) & \ - ~(2 * sizeof(__be64) - 1)) - extern const struct gfs2_log_operations *gfs2_log_ops[]; extern void gfs2_log_incr_head(struct gfs2_sbd *sdp); extern u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lbn); -extern void gfs2_log_write(struct gfs2_sbd *sdp, struct page *page, - unsigned size, unsigned offset, u64 blkno); -extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page); +extern void gfs2_log_write(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, + struct page *page, unsigned size, unsigned offset, + u64 blkno); extern void gfs2_log_submit_bio(struct bio **biop, int opf); extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); extern int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, bool keep_cache); static inline unsigned int buf_limit(struct gfs2_sbd *sdp) { - unsigned int limit; - - limit = (sdp->sd_sb.sb_bsize - BUF_OFFSET) / sizeof(__be64); - return limit; + return sdp->sd_ldptrs; } static inline unsigned int databuf_limit(struct gfs2_sbd *sdp) { - unsigned int limit; - - limit = (sdp->sd_sb.sb_bsize - DATABUF_OFFSET) / (2 * sizeof(__be64)); - return limit; + return sdp->sd_ldptrs / 2; } static inline void lops_before_commit(struct gfs2_sbd *sdp, diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index c7393ee9cf68..28d0eb23e18e 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -98,7 +98,7 @@ static int __init init_gfs2_fs(void) error = -ENOMEM; gfs2_glock_cachep = kmem_cache_create("gfs2_glock", sizeof(struct gfs2_glock), - 0, 0, + 0, SLAB_RECLAIM_ACCOUNT, gfs2_init_glock_once); if (!gfs2_glock_cachep) goto fail_cachep1; @@ -134,7 +134,7 @@ static int __init init_gfs2_fs(void) gfs2_quotad_cachep = kmem_cache_create("gfs2_quotad", sizeof(struct gfs2_quota_data), - 0, 0, NULL); + 0, SLAB_RECLAIM_ACCOUNT, NULL); if (!gfs2_quotad_cachep) goto fail_cachep6; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 61fce59cb4d3..74c7d01723b9 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -136,8 +136,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) init_rwsem(&sdp->sd_log_flush_lock); atomic_set(&sdp->sd_log_in_flight, 0); - atomic_set(&sdp->sd_reserving_log, 0); - init_waitqueue_head(&sdp->sd_reserving_log_wait); init_waitqueue_head(&sdp->sd_log_flush_wait); atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN); mutex_init(&sdp->sd_freeze_mutex); @@ -171,7 +169,8 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent) return -EINVAL; } - if (sb->sb_fs_format != GFS2_FORMAT_FS || + if (sb->sb_fs_format < GFS2_FS_FORMAT_MIN || + sb->sb_fs_format > GFS2_FS_FORMAT_MAX || sb->sb_multihost_format != GFS2_FORMAT_MULTI) { fs_warn(sdp, "Unknown on-disk format, unable to mount\n"); return -EINVAL; @@ -179,7 +178,7 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent) if (sb->sb_bsize < 512 || sb->sb_bsize > PAGE_SIZE || (sb->sb_bsize & (sb->sb_bsize - 1))) { - pr_warn("Invalid superblock size\n"); + pr_warn("Invalid block size\n"); return -EINVAL; } @@ -317,6 +316,13 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent) sizeof(struct gfs2_meta_header)) * GFS2_NBBY; /* not the rgrp bitmap, subsequent bitmaps only */ + /* + * We always keep at least one block reserved for revokes in + * transactions. This greatly simplifies allocating additional + * revoke blocks. + */ + atomic_set(&sdp->sd_log_revokes_available, sdp->sd_ldptrs); + /* Compute maximum reservation required to add a entry to a directory */ hash_blocks = DIV_ROUND_UP(sizeof(u64) * BIT(GFS2_DIR_MAX_DEPTH), @@ -488,6 +494,19 @@ static int init_sb(struct gfs2_sbd *sdp, int silent) goto out; } + switch(sdp->sd_sb.sb_fs_format) { + case GFS2_FS_FORMAT_MAX: + sb->s_xattr = gfs2_xattr_handlers_max; + break; + + case GFS2_FS_FORMAT_MIN: + sb->s_xattr = gfs2_xattr_handlers_min; + break; + + default: + BUG(); + } + /* Set up the buffer cache and SB for real */ if (sdp->sd_sb.sb_bsize < bdev_logical_block_size(sb->s_bdev)) { ret = -EINVAL; @@ -1032,13 +1051,14 @@ hostdata_error: } if (lm->lm_mount == NULL) { - fs_info(sdp, "Now mounting FS...\n"); + fs_info(sdp, "Now mounting FS (format %u)...\n", sdp->sd_sb.sb_fs_format); complete_all(&sdp->sd_locking_init); return 0; } ret = lm->lm_mount(sdp, table); if (ret == 0) - fs_info(sdp, "Joined cluster. Now mounting FS...\n"); + fs_info(sdp, "Joined cluster. Now mounting FS (format %u)...\n", + sdp->sd_sb.sb_fs_format); complete_all(&sdp->sd_locking_init); return ret; } @@ -1084,6 +1104,7 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) int silent = fc->sb_flags & SB_SILENT; struct gfs2_sbd *sdp; struct gfs2_holder mount_gh; + struct gfs2_holder freeze_gh; int error; sdp = init_sbd(sb); @@ -1107,7 +1128,6 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_op = &gfs2_super_ops; sb->s_d_op = &gfs2_dops; sb->s_export_op = &gfs2_export_ops; - sb->s_xattr = gfs2_xattr_handlers; sb->s_qcop = &gfs2_quotactl_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; @@ -1156,6 +1176,10 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) if (error) goto fail_locking; + /* Turn rgrplvb on by default if fs format is recent enough */ + if (!sdp->sd_args.ar_got_rgrplvb && sdp->sd_sb.sb_fs_format > 1801) + sdp->sd_args.ar_rgrplvb = 1; + error = wait_on_journal(sdp); if (error) goto fail_sb; @@ -1195,25 +1219,18 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) goto fail_per_node; } - if (sb_rdonly(sb)) { - struct gfs2_holder freeze_gh; + error = gfs2_freeze_lock(sdp, &freeze_gh, 0); + if (error) + goto fail_per_node; - error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, - LM_FLAG_NOEXP | GL_EXACT, - &freeze_gh); - if (error) { - fs_err(sdp, "can't make FS RO: %d\n", error); - goto fail_per_node; - } - gfs2_glock_dq_uninit(&freeze_gh); - } else { + if (!sb_rdonly(sb)) error = gfs2_make_fs_rw(sdp); - if (error) { - fs_err(sdp, "can't make FS RW: %d\n", error); - goto fail_per_node; - } - } + gfs2_freeze_unlock(&freeze_gh); + if (error) { + fs_err(sdp, "can't make FS RW: %d\n", error); + goto fail_per_node; + } gfs2_glock_dq_uninit(&mount_gh); gfs2_online_uevent(sdp); return 0; @@ -1456,6 +1473,7 @@ static int gfs2_parse_param(struct fs_context *fc, struct fs_parameter *param) break; case Opt_rgrplvb: args->ar_rgrplvb = result.boolean; + args->ar_got_rgrplvb = 1; break; case Opt_loccookie: args->ar_loccookie = result.boolean; @@ -1514,6 +1532,12 @@ static int gfs2_reconfigure(struct fs_context *fc) fc->sb_flags |= SB_RDONLY; if ((sb->s_flags ^ fc->sb_flags) & SB_RDONLY) { + struct gfs2_holder freeze_gh; + + error = gfs2_freeze_lock(sdp, &freeze_gh, 0); + if (error) + return -EINVAL; + if (fc->sb_flags & SB_RDONLY) { error = gfs2_make_fs_ro(sdp); if (error) @@ -1523,6 +1547,7 @@ static int gfs2_reconfigure(struct fs_context *fc) if (error) errorfc(fc, "unable to remount read-write"); } + gfs2_freeze_unlock(&freeze_gh); } sdp->sd_args = *newargs; diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index c26c68ebd29d..282173774005 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -470,9 +470,7 @@ void gfs2_recover_func(struct work_struct *work) /* Acquire a shared hold on the freeze lock */ - error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, - LM_FLAG_NOEXP | LM_FLAG_PRIORITY | - GL_EXACT, &thaw_gh); + error = gfs2_freeze_lock(sdp, &thaw_gh, LM_FLAG_PRIORITY); if (error) goto fail_gunlock_ji; @@ -507,22 +505,24 @@ void gfs2_recover_func(struct work_struct *work) /* We take the sd_log_flush_lock here primarily to prevent log * flushes and simultaneous journal replays from stomping on - * each other wrt sd_log_bio. */ + * each other wrt jd_log_bio. */ down_read(&sdp->sd_log_flush_lock); for (pass = 0; pass < 2; pass++) { lops_before_scan(jd, &head, pass); error = foreach_descriptor(jd, head.lh_tail, head.lh_blkno, pass); lops_after_scan(jd, error, pass); - if (error) + if (error) { + up_read(&sdp->sd_log_flush_lock); goto fail_gunlock_thaw; + } } recover_local_statfs(jd, &head); clean_journal(jd, &head); up_read(&sdp->sd_log_flush_lock); - gfs2_glock_dq_uninit(&thaw_gh); + gfs2_freeze_unlock(&thaw_gh); t_rep = ktime_get(); fs_info(sdp, "jid=%u: Journal replayed in %lldms [jlck:%lldms, " "jhead:%lldms, tlck:%lldms, replay:%lldms]\n", @@ -544,7 +544,7 @@ void gfs2_recover_func(struct work_struct *work) goto done; fail_gunlock_thaw: - gfs2_glock_dq_uninit(&thaw_gh); + gfs2_freeze_unlock(&thaw_gh); fail_gunlock_ji: if (jlocked) { gfs2_glock_dq_uninit(&ji_gh); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 5e8eef9990e3..89c37a845e64 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -36,6 +36,24 @@ #define BFITNOENT ((u32)~0) #define NO_BLOCK ((u64)~0) +struct gfs2_rbm { + struct gfs2_rgrpd *rgd; + u32 offset; /* The offset is bitmap relative */ + int bii; /* Bitmap index */ +}; + +static inline struct gfs2_bitmap *rbm_bi(const struct gfs2_rbm *rbm) +{ + return rbm->rgd->rd_bits + rbm->bii; +} + +static inline u64 gfs2_rbm_to_block(const struct gfs2_rbm *rbm) +{ + BUG_ON(rbm->offset >= rbm->rgd->rd_data); + return rbm->rgd->rd_data0 + (rbm_bi(rbm)->bi_start * GFS2_NBBY) + + rbm->offset; +} + /* * These routines are used by the resource group routines (rgrp.c) * to keep track of block allocation. Each block is represented by two @@ -61,7 +79,7 @@ static const char valid_change[16] = { }; static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, - const struct gfs2_inode *ip, bool nowrap); + struct gfs2_blkreserv *rs, bool nowrap); /** @@ -175,7 +193,7 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state) /** * rs_cmp - multi-block reservation range compare - * @blk: absolute file system block number of the new reservation + * @start: start of the new reservation * @len: number of blocks in the new reservation * @rs: existing reservation to compare against * @@ -183,13 +201,11 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state) * -1 if the block range is before the start of the reservation * 0 if the block range overlaps with the reservation */ -static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs) +static inline int rs_cmp(u64 start, u32 len, struct gfs2_blkreserv *rs) { - u64 startblk = gfs2_rbm_to_block(&rs->rs_rbm); - - if (blk >= startblk + rs->rs_free) + if (start >= rs->rs_start + rs->rs_requested) return 1; - if (blk + len - 1 < startblk) + if (rs->rs_start >= start + len) return -1; return 0; } @@ -277,29 +293,38 @@ static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block) } /** - * gfs2_rbm_incr - increment an rbm structure + * gfs2_rbm_add - add a number of blocks to an rbm * @rbm: The rbm with rgd already set correctly + * @blocks: The number of blocks to add to rpm * - * This function takes an existing rbm structure and increments it to the next - * viable block offset. - * - * Returns: If incrementing the offset would cause the rbm to go past the - * end of the rgrp, true is returned, otherwise false. + * This function takes an existing rbm structure and adds a number of blocks to + * it. * + * Returns: True if the new rbm would point past the end of the rgrp. */ -static bool gfs2_rbm_incr(struct gfs2_rbm *rbm) +static bool gfs2_rbm_add(struct gfs2_rbm *rbm, u32 blocks) { - if (rbm->offset + 1 < rbm_bi(rbm)->bi_blocks) { /* in the same bitmap */ - rbm->offset++; + struct gfs2_rgrpd *rgd = rbm->rgd; + struct gfs2_bitmap *bi = rgd->rd_bits + rbm->bii; + + if (rbm->offset + blocks < bi->bi_blocks) { + rbm->offset += blocks; return false; } - if (rbm->bii == rbm->rgd->rd_length - 1) /* at the last bitmap */ - return true; + blocks -= bi->bi_blocks - rbm->offset; - rbm->offset = 0; - rbm->bii++; - return false; + for(;;) { + bi++; + if (bi == rgd->rd_bits + rgd->rd_length) + return true; + if (blocks < bi->bi_blocks) { + rbm->offset = blocks; + rbm->bii = bi - rgd->rd_bits; + return false; + } + blocks -= bi->bi_blocks; + } } /** @@ -308,7 +333,8 @@ static bool gfs2_rbm_incr(struct gfs2_rbm *rbm) * @n_unaligned: Number of unaligned blocks to check * @len: Decremented for each block found (terminate on zero) * - * Returns: true if a non-free block is encountered + * Returns: true if a non-free block is encountered or the end of the resource + * group is reached. */ static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *len) @@ -323,7 +349,7 @@ static bool gfs2_unaligned_extlen(struct gfs2_rbm *rbm, u32 n_unaligned, u32 *le (*len)--; if (*len == 0) return true; - if (gfs2_rbm_incr(rbm)) + if (gfs2_rbm_add(rbm, 1)) return true; } @@ -595,10 +621,11 @@ static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs, { struct gfs2_inode *ip = container_of(rs, struct gfs2_inode, i_res); - gfs2_print_dbg(seq, "%s B: n:%llu s:%llu b:%u f:%u\n", fs_id_buf, + gfs2_print_dbg(seq, "%s B: n:%llu s:%llu f:%u\n", + fs_id_buf, (unsigned long long)ip->i_no_addr, - (unsigned long long)gfs2_rbm_to_block(&rs->rs_rbm), - rs->rs_rbm.offset, rs->rs_free); + (unsigned long long)rs->rs_start, + rs->rs_requested); } /** @@ -613,33 +640,22 @@ static void __rs_deltree(struct gfs2_blkreserv *rs) if (!gfs2_rs_active(rs)) return; - rgd = rs->rs_rbm.rgd; + rgd = rs->rs_rgd; trace_gfs2_rs(rs, TRACE_RS_TREEDEL); rb_erase(&rs->rs_node, &rgd->rd_rstree); RB_CLEAR_NODE(&rs->rs_node); - if (rs->rs_free) { - u64 last_block = gfs2_rbm_to_block(&rs->rs_rbm) + - rs->rs_free - 1; - struct gfs2_rbm last_rbm = { .rgd = rs->rs_rbm.rgd, }; - struct gfs2_bitmap *start, *last; + if (rs->rs_requested) { + /* return requested blocks to the rgrp */ + BUG_ON(rs->rs_rgd->rd_requested < rs->rs_requested); + rs->rs_rgd->rd_requested -= rs->rs_requested; - /* return reserved blocks to the rgrp */ - BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free); - rs->rs_rbm.rgd->rd_reserved -= rs->rs_free; /* The rgrp extent failure point is likely not to increase; it will only do so if the freed blocks are somehow contiguous with a span of free blocks that follows. Still, it will force the number to be recalculated later. */ - rgd->rd_extfail_pt += rs->rs_free; - rs->rs_free = 0; - if (gfs2_rbm_from_block(&last_rbm, last_block)) - return; - start = rbm_bi(&rs->rs_rbm); - last = rbm_bi(&last_rbm); - do - clear_bit(GBF_FULL, &start->bi_flags); - while (start++ != last); + rgd->rd_extfail_pt += rs->rs_requested; + rs->rs_requested = 0; } } @@ -652,11 +668,11 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs) { struct gfs2_rgrpd *rgd; - rgd = rs->rs_rbm.rgd; + rgd = rs->rs_rgd; if (rgd) { spin_lock(&rgd->rd_rsspin); __rs_deltree(rs); - BUG_ON(rs->rs_free); + BUG_ON(rs->rs_requested); spin_unlock(&rgd->rd_rsspin); } } @@ -904,6 +920,7 @@ static int read_rindex_entry(struct gfs2_inode *ip) rgd->rd_data = be32_to_cpu(buf.ri_data); rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes); spin_lock_init(&rgd->rd_rsspin); + mutex_init(&rgd->rd_mutex); error = compute_bitstructs(rgd); if (error) @@ -1149,6 +1166,23 @@ static u32 count_unlinked(struct gfs2_rgrpd *rgd) return count; } +static void rgrp_set_bitmap_flags(struct gfs2_rgrpd *rgd) +{ + struct gfs2_bitmap *bi; + int x; + + if (rgd->rd_free) { + for (x = 0; x < rgd->rd_length; x++) { + bi = rgd->rd_bits + x; + clear_bit(GBF_FULL, &bi->bi_flags); + } + } else { + for (x = 0; x < rgd->rd_length; x++) { + bi = rgd->rd_bits + x; + set_bit(GBF_FULL, &bi->bi_flags); + } + } +} /** * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps @@ -1192,11 +1226,11 @@ static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) } if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) { - for (x = 0; x < length; x++) - clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags); gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); + rgrp_set_bitmap_flags(rgd); rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); rgd->rd_free_clone = rgd->rd_free; + BUG_ON(rgd->rd_reserved); /* max out the rgrp allocation failure point */ rgd->rd_extfail_pt = rgd->rd_free; } @@ -1244,7 +1278,11 @@ static int update_rgrp_lvb(struct gfs2_rgrpd *rgd) if (rgd->rd_rgl->rl_unlinked == 0) rgd->rd_flags &= ~GFS2_RDF_CHECK; rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free); + rgrp_set_bitmap_flags(rgd); rgd->rd_free_clone = rgd->rd_free; + BUG_ON(rgd->rd_reserved); + /* max out the rgrp allocation failure point */ + rgd->rd_extfail_pt = rgd->rd_free; rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes); rgd->rd_igeneration = be64_to_cpu(rgd->rd_rgl->rl_igeneration); return 0; @@ -1404,7 +1442,8 @@ int gfs2_fitrim(struct file *filp, void __user *argp) while (1) { - ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, + LM_FLAG_NODE_SCOPE, &gh); if (ret) goto out; @@ -1412,9 +1451,11 @@ int gfs2_fitrim(struct file *filp, void __user *argp) /* Trim each bitmap in the rgrp */ for (x = 0; x < rgd->rd_length; x++) { struct gfs2_bitmap *bi = rgd->rd_bits + x; + rgrp_lock_local(rgd); ret = gfs2_rgrp_send_discards(sdp, rgd->rd_data0, NULL, bi, minlen, &amt); + rgrp_unlock_local(rgd); if (ret) { gfs2_glock_dq_uninit(&gh); goto out; @@ -1426,9 +1467,11 @@ int gfs2_fitrim(struct file *filp, void __user *argp) ret = gfs2_trans_begin(sdp, RES_RG_HDR, 0); if (ret == 0) { bh = rgd->rd_bits[0].bi_bh; + rgrp_lock_local(rgd); rgd->rd_flags |= GFS2_RGF_TRIMMED; gfs2_trans_add_meta(rgd->rd_gl, bh); gfs2_rgrp_out(rgd, bh->b_data); + rgrp_unlock_local(rgd); gfs2_trans_end(sdp); } } @@ -1458,8 +1501,7 @@ static void rs_insert(struct gfs2_inode *ip) struct rb_node **newn, *parent = NULL; int rc; struct gfs2_blkreserv *rs = &ip->i_res; - struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd; - u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm); + struct gfs2_rgrpd *rgd = rs->rs_rgd; BUG_ON(gfs2_rs_active(rs)); @@ -1470,7 +1512,7 @@ static void rs_insert(struct gfs2_inode *ip) rb_entry(*newn, struct gfs2_blkreserv, rs_node); parent = *newn; - rc = rs_cmp(fsblock, rs->rs_free, cur); + rc = rs_cmp(rs->rs_start, rs->rs_requested, cur); if (rc > 0) newn = &((*newn)->rb_right); else if (rc < 0) @@ -1486,7 +1528,7 @@ static void rs_insert(struct gfs2_inode *ip) rb_insert_color(&rs->rs_node, &rgd->rd_rstree); /* Do our rgrp accounting for the reservation */ - rgd->rd_reserved += rs->rs_free; /* blocks reserved */ + rgd->rd_requested += rs->rs_requested; /* blocks requested */ spin_unlock(&rgd->rd_rsspin); trace_gfs2_rs(rs, TRACE_RS_INSERT); } @@ -1507,9 +1549,9 @@ static inline u32 rgd_free(struct gfs2_rgrpd *rgd, struct gfs2_blkreserv *rs) { u32 tot_reserved, tot_free; - if (WARN_ON_ONCE(rgd->rd_reserved < rs->rs_free)) + if (WARN_ON_ONCE(rgd->rd_requested < rs->rs_requested)) return 0; - tot_reserved = rgd->rd_reserved - rs->rs_free; + tot_reserved = rgd->rd_requested - rs->rs_requested; if (rgd->rd_free_clone < tot_reserved) tot_reserved = 0; @@ -1534,17 +1576,26 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, u64 goal; struct gfs2_blkreserv *rs = &ip->i_res; u32 extlen; - u32 free_blocks = rgd_free(rgd, rs); + u32 free_blocks, blocks_available; int ret; struct inode *inode = &ip->i_inode; + spin_lock(&rgd->rd_rsspin); + free_blocks = rgd_free(rgd, rs); + if (rgd->rd_free_clone < rgd->rd_requested) + free_blocks = 0; + blocks_available = rgd->rd_free_clone - rgd->rd_reserved; + if (rgd == rs->rs_rgd) + blocks_available += rs->rs_reserved; + spin_unlock(&rgd->rd_rsspin); + if (S_ISDIR(inode->i_mode)) extlen = 1; else { extlen = max_t(u32, atomic_read(&ip->i_sizehint), ap->target); extlen = clamp(extlen, (u32)RGRP_RSRV_MINBLKS, free_blocks); } - if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen)) + if (free_blocks < extlen || blocks_available < extlen) return; /* Find bitmap block that contains bits for goal block */ @@ -1556,10 +1607,10 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, if (WARN_ON(gfs2_rbm_from_block(&rbm, goal))) return; - ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, ip, true); + ret = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &extlen, &ip->i_res, true); if (ret == 0) { - rs->rs_rbm = rbm; - rs->rs_free = extlen; + rs->rs_start = gfs2_rbm_to_block(&rbm); + rs->rs_requested = extlen; rs_insert(ip); } else { if (goal == rgd->rd_last_alloc + rgd->rd_data0) @@ -1572,7 +1623,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, * @rgd: The resource group * @block: The starting block * @length: The required length - * @ip: Ignore any reservations for this inode + * @ignore_rs: Reservation to ignore * * If the block does not appear in any reservation, then return the * block number unchanged. If it does appear in the reservation, then @@ -1582,7 +1633,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block, u32 length, - const struct gfs2_inode *ip) + struct gfs2_blkreserv *ignore_rs) { struct gfs2_blkreserv *rs; struct rb_node *n; @@ -1602,8 +1653,8 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block, } if (n) { - while ((rs_cmp(block, length, rs) == 0) && (&ip->i_res != rs)) { - block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free; + while (rs_cmp(block, length, rs) == 0 && rs != ignore_rs) { + block = rs->rs_start + rs->rs_requested; n = n->rb_right; if (n == NULL) break; @@ -1618,7 +1669,7 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block, /** * gfs2_reservation_check_and_update - Check for reservations during block alloc * @rbm: The current position in the resource group - * @ip: The inode for which we are searching for blocks + * @rs: Our own reservation * @minext: The minimum extent length * @maxext: A pointer to the maximum extent structure * @@ -1632,20 +1683,19 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block, */ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, - const struct gfs2_inode *ip, + struct gfs2_blkreserv *rs, u32 minext, struct gfs2_extent *maxext) { u64 block = gfs2_rbm_to_block(rbm); u32 extlen = 1; u64 nblock; - int ret; /* * If we have a minimum extent length, then skip over any extent * which is less than the min extent length in size. */ - if (minext) { + if (minext > 1) { extlen = gfs2_free_extlen(rbm, minext); if (extlen <= maxext->len) goto fail; @@ -1655,7 +1705,7 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, * Check the extent which has been found against the reservations * and skip if parts of it are already reserved */ - nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, ip); + nblock = gfs2_next_unreserved_block(rbm->rgd, block, extlen, rs); if (nblock == block) { if (!minext || extlen >= minext) return 0; @@ -1664,12 +1714,15 @@ static int gfs2_reservation_check_and_update(struct gfs2_rbm *rbm, maxext->len = extlen; maxext->rbm = *rbm; } -fail: - nblock = block + extlen; + } else { + u64 len = nblock - block; + if (len >= (u64)1 << 32) + return -E2BIG; + extlen = len; } - ret = gfs2_rbm_from_block(rbm, nblock); - if (ret < 0) - return ret; +fail: + if (gfs2_rbm_add(rbm, extlen)) + return -E2BIG; return 1; } @@ -1677,9 +1730,9 @@ fail: * gfs2_rbm_find - Look for blocks of a particular state * @rbm: Value/result starting position and final position * @state: The state which we want to find - * @minext: Pointer to the requested extent length (NULL for a single block) + * @minext: Pointer to the requested extent length * This is updated to be the actual reservation size. - * @ip: If set, check for reservations + * @rs: Our own reservation (NULL to skip checking for reservations) * @nowrap: Stop looking at the end of the rgrp, rather than wrapping * around until we've reached the starting point. * @@ -1693,7 +1746,7 @@ fail: */ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, - const struct gfs2_inode *ip, bool nowrap) + struct gfs2_blkreserv *rs, bool nowrap) { bool scan_from_start = rbm->bii == 0 && rbm->offset == 0; struct buffer_head *bh; @@ -1714,8 +1767,7 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, while(1) { bi = rbm_bi(rbm); - if ((ip == NULL || !gfs2_rs_active(&ip->i_res)) && - test_bit(GBF_FULL, &bi->bi_flags) && + if (test_bit(GBF_FULL, &bi->bi_flags) && (state == GFS2_BLKST_FREE)) goto next_bitmap; @@ -1731,11 +1783,10 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, goto next_bitmap; } rbm->offset = offset; - if (ip == NULL) + if (!rs) return 0; - ret = gfs2_reservation_check_and_update(rbm, ip, - minext ? *minext : 0, + ret = gfs2_reservation_check_and_update(rbm, rs, *minext, &maxext); if (ret == 0) return 0; @@ -1767,7 +1818,7 @@ next_iter: break; } - if (minext == NULL || state != GFS2_BLKST_FREE) + if (state != GFS2_BLKST_FREE) return -ENOSPC; /* If the extent was too small, and it's smaller than the smallest @@ -1775,7 +1826,7 @@ next_iter: useless to search this rgrp again for this amount or more. */ if (wrapped && (scan_from_start || rbm->bii > last_bii) && *minext < rbm->rgd->rd_extfail_pt) - rbm->rgd->rd_extfail_pt = *minext; + rbm->rgd->rd_extfail_pt = *minext - 1; /* If the maximum extent we found is big enough to fulfill the minimum requirements, use it anyway. */ @@ -1938,7 +1989,7 @@ static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs, u64 tdiff; tdiff = ktime_to_ns(ktime_sub(ktime_get_real(), - rs->rs_rbm.rgd->rd_gl->gl_dstamp)); + rs->rs_rgd->rd_gl->gl_dstamp)); return tdiff > (msecs * 1000 * 1000); } @@ -1993,8 +2044,7 @@ static inline int fast_to_acquire(struct gfs2_rgrpd *rgd) * We try our best to find an rgrp that has at least ap->target blocks * available. After a couple of passes (loops == 2), the prospects of finding * such an rgrp diminish. At this stage, we return the first rgrp that has - * at least ap->min_target blocks available. Either way, we set ap->allowed to - * the number of blocks available in the chosen rgrp. + * at least ap->min_target blocks available. * * Returns: 0 on success, * -ENOMEM if a suitable rgrp can't be found @@ -2006,56 +2056,64 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *begin = NULL; struct gfs2_blkreserv *rs = &ip->i_res; - int error = 0, rg_locked, flags = 0; + int error = 0, flags = LM_FLAG_NODE_SCOPE; + bool rg_locked; u64 last_unlinked = NO_BLOCK; + u32 target = ap->target; int loops = 0; - u32 free_blocks, skip = 0; + u32 free_blocks, blocks_available, skip = 0; + + BUG_ON(rs->rs_reserved); if (sdp->sd_args.ar_rgrplvb) flags |= GL_SKIP; - if (gfs2_assert_warn(sdp, ap->target)) + if (gfs2_assert_warn(sdp, target)) return -EINVAL; if (gfs2_rs_active(rs)) { - begin = rs->rs_rbm.rgd; - } else if (rs->rs_rbm.rgd && - rgrp_contains_block(rs->rs_rbm.rgd, ip->i_goal)) { - begin = rs->rs_rbm.rgd; + begin = rs->rs_rgd; + } else if (rs->rs_rgd && + rgrp_contains_block(rs->rs_rgd, ip->i_goal)) { + begin = rs->rs_rgd; } else { check_and_update_goal(ip); - rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); + rs->rs_rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); } if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV)) skip = gfs2_orlov_skip(ip); - if (rs->rs_rbm.rgd == NULL) + if (rs->rs_rgd == NULL) return -EBADSLT; while (loops < 3) { - rg_locked = 1; + struct gfs2_rgrpd *rgd; - if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) { - rg_locked = 0; + rg_locked = gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl); + if (rg_locked) { + rgrp_lock_local(rs->rs_rgd); + } else { if (skip && skip--) goto next_rgrp; if (!gfs2_rs_active(rs)) { if (loops == 0 && - !fast_to_acquire(rs->rs_rbm.rgd)) + !fast_to_acquire(rs->rs_rgd)) goto next_rgrp; if ((loops < 2) && gfs2_rgrp_used_recently(rs, 1000) && - gfs2_rgrp_congested(rs->rs_rbm.rgd, loops)) + gfs2_rgrp_congested(rs->rs_rgd, loops)) goto next_rgrp; } - error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl, + error = gfs2_glock_nq_init(rs->rs_rgd->rd_gl, LM_ST_EXCLUSIVE, flags, &ip->i_rgd_gh); if (unlikely(error)) return error; + rgrp_lock_local(rs->rs_rgd); if (!gfs2_rs_active(rs) && (loops < 2) && - gfs2_rgrp_congested(rs->rs_rbm.rgd, loops)) + gfs2_rgrp_congested(rs->rs_rgd, loops)) goto skip_rgrp; if (sdp->sd_args.ar_rgrplvb) { - error = update_rgrp_lvb(rs->rs_rbm.rgd); + error = update_rgrp_lvb(rs->rs_rgd); if (unlikely(error)) { + rgrp_unlock_local(rs->rs_rgd); gfs2_glock_dq_uninit(&ip->i_rgd_gh); return error; } @@ -2063,36 +2121,46 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) } /* Skip unusable resource groups */ - if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC | + if ((rs->rs_rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) || - (loops == 0 && ap->target > rs->rs_rbm.rgd->rd_extfail_pt)) + (loops == 0 && target > rs->rs_rgd->rd_extfail_pt)) goto skip_rgrp; if (sdp->sd_args.ar_rgrplvb) - gfs2_rgrp_bh_get(rs->rs_rbm.rgd); + gfs2_rgrp_bh_get(rs->rs_rgd); /* Get a reservation if we don't already have one */ if (!gfs2_rs_active(rs)) - rg_mblk_search(rs->rs_rbm.rgd, ip, ap); + rg_mblk_search(rs->rs_rgd, ip, ap); /* Skip rgrps when we can't get a reservation on first pass */ if (!gfs2_rs_active(rs) && (loops < 1)) goto check_rgrp; /* If rgrp has enough free space, use it */ - free_blocks = rgd_free(rs->rs_rbm.rgd, rs); - if (free_blocks >= ap->target || - (loops == 2 && ap->min_target && - free_blocks >= ap->min_target)) { - ap->allowed = free_blocks; - return 0; + rgd = rs->rs_rgd; + spin_lock(&rgd->rd_rsspin); + free_blocks = rgd_free(rgd, rs); + blocks_available = rgd->rd_free_clone - rgd->rd_reserved; + if (free_blocks < target || blocks_available < target) { + spin_unlock(&rgd->rd_rsspin); + goto check_rgrp; } + rs->rs_reserved = ap->target; + if (rs->rs_reserved > blocks_available) + rs->rs_reserved = blocks_available; + rgd->rd_reserved += rs->rs_reserved; + spin_unlock(&rgd->rd_rsspin); + rgrp_unlock_local(rs->rs_rgd); + return 0; check_rgrp: /* Check for unlinked inodes which can be reclaimed */ - if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK) - try_rgrp_unlink(rs->rs_rbm.rgd, &last_unlinked, + if (rs->rs_rgd->rd_flags & GFS2_RDF_CHECK) + try_rgrp_unlink(rs->rs_rgd, &last_unlinked, ip->i_no_addr); skip_rgrp: + rgrp_unlock_local(rs->rs_rgd); + /* Drop reservation, if we couldn't use reserved rgrp */ if (gfs2_rs_active(rs)) gfs2_rs_deltree(rs); @@ -2102,7 +2170,7 @@ skip_rgrp: gfs2_glock_dq_uninit(&ip->i_rgd_gh); next_rgrp: /* Find the next rgrp, and continue looking */ - if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin)) + if (gfs2_select_rgrp(&rs->rs_rgd, begin)) continue; if (skip) continue; @@ -2119,9 +2187,12 @@ next_rgrp: return error; } /* Flushing the log may release space */ - if (loops == 2) + if (loops == 2) { + if (ap->min_target) + target = ap->min_target; gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_INPLACE_RESERVE); + } } return -ENOSPC; @@ -2136,6 +2207,17 @@ next_rgrp: void gfs2_inplace_release(struct gfs2_inode *ip) { + struct gfs2_blkreserv *rs = &ip->i_res; + + if (rs->rs_reserved) { + struct gfs2_rgrpd *rgd = rs->rs_rgd; + + spin_lock(&rgd->rd_rsspin); + BUG_ON(rgd->rd_reserved < rs->rs_reserved); + rgd->rd_reserved -= rs->rs_reserved; + spin_unlock(&rgd->rd_rsspin); + rs->rs_reserved = 0; + } if (gfs2_holder_initialized(&ip->i_rgd_gh)) gfs2_glock_dq_uninit(&ip->i_rgd_gh); } @@ -2205,7 +2287,7 @@ static void rgblk_free(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd, bi_prev = bi; } gfs2_setbit(&rbm, false, new_state); - gfs2_rbm_incr(&rbm); + gfs2_rbm_add(&rbm, 1); } } @@ -2223,11 +2305,12 @@ void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd, struct gfs2_blkreserv *trs; const struct rb_node *n; - gfs2_print_dbg(seq, "%s R: n:%llu f:%02x b:%u/%u i:%u r:%u e:%u\n", + spin_lock(&rgd->rd_rsspin); + gfs2_print_dbg(seq, "%s R: n:%llu f:%02x b:%u/%u i:%u q:%u r:%u e:%u\n", fs_id_buf, (unsigned long long)rgd->rd_addr, rgd->rd_flags, rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes, - rgd->rd_reserved, rgd->rd_extfail_pt); + rgd->rd_requested, rgd->rd_reserved, rgd->rd_extfail_pt); if (rgd->rd_sbd->sd_args.ar_rgrplvb) { struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl; @@ -2236,7 +2319,6 @@ void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd, be32_to_cpu(rgl->rl_free), be32_to_cpu(rgl->rl_dinodes)); } - spin_lock(&rgd->rd_rsspin); for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) { trs = rb_entry(n, struct gfs2_blkreserv, rs_node); dump_rs(seq, trs, fs_id_buf); @@ -2273,29 +2355,29 @@ static void gfs2_adjust_reservation(struct gfs2_inode *ip, { struct gfs2_blkreserv *rs = &ip->i_res; struct gfs2_rgrpd *rgd = rbm->rgd; - unsigned rlen; - u64 block; - int ret; - spin_lock(&rgd->rd_rsspin); + BUG_ON(rs->rs_reserved < len); + rs->rs_reserved -= len; if (gfs2_rs_active(rs)) { - if (gfs2_rbm_eq(&rs->rs_rbm, rbm)) { - block = gfs2_rbm_to_block(rbm); - ret = gfs2_rbm_from_block(&rs->rs_rbm, block + len); - rlen = min(rs->rs_free, len); - rs->rs_free -= rlen; - rgd->rd_reserved -= rlen; + u64 start = gfs2_rbm_to_block(rbm); + + if (rs->rs_start == start) { + unsigned int rlen; + + rs->rs_start += len; + rlen = min(rs->rs_requested, len); + rs->rs_requested -= rlen; + rgd->rd_requested -= rlen; trace_gfs2_rs(rs, TRACE_RS_CLAIM); - if (rs->rs_free && !ret) - goto out; + if (rs->rs_start < rgd->rd_data0 + rgd->rd_data && + rs->rs_requested) + return; /* We used up our block reservation, so we should reserve more blocks next time. */ atomic_add(RGRP_RSRV_ADDBLKS, &ip->i_sizehint); } __rs_deltree(rs); } -out: - spin_unlock(&rgd->rd_rsspin); } /** @@ -2315,15 +2397,13 @@ static void gfs2_set_alloc_start(struct gfs2_rbm *rbm, u64 goal; if (gfs2_rs_active(&ip->i_res)) { - *rbm = ip->i_res.rs_rbm; - return; + goal = ip->i_res.rs_start; + } else { + if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal)) + goal = ip->i_goal; + else + goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0; } - - if (!dinode && rgrp_contains_block(rbm->rgd, ip->i_goal)) - goal = ip->i_goal; - else - goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0; - if (WARN_ON_ONCE(gfs2_rbm_from_block(rbm, goal))) { rbm->bii = 0; rbm->offset = 0; @@ -2346,17 +2426,21 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head *dibh; - struct gfs2_rbm rbm = { .rgd = ip->i_res.rs_rbm.rgd, }; - unsigned int ndata; + struct gfs2_rbm rbm = { .rgd = ip->i_res.rs_rgd, }; u64 block; /* block, within the file system scope */ - int error; + u32 minext = 1; + int error = -ENOSPC; - gfs2_set_alloc_start(&rbm, ip, dinode); - error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, ip, false); + BUG_ON(ip->i_res.rs_reserved < *nblocks); + rgrp_lock_local(rbm.rgd); + if (gfs2_rs_active(&ip->i_res)) { + gfs2_set_alloc_start(&rbm, ip, dinode); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &minext, &ip->i_res, false); + } if (error == -ENOSPC) { gfs2_set_alloc_start(&rbm, ip, dinode); - error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, NULL, NULL, false); + error = gfs2_rbm_find(&rbm, GFS2_BLKST_FREE, &minext, NULL, false); } /* Since all blocks are reserved in advance, this shouldn't happen */ @@ -2371,14 +2455,8 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, gfs2_alloc_extent(&rbm, dinode, nblocks); block = gfs2_rbm_to_block(&rbm); rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0; - if (gfs2_rs_active(&ip->i_res)) - gfs2_adjust_reservation(ip, &rbm, *nblocks); - ndata = *nblocks; - if (dinode) - ndata--; - if (!dinode) { - ip->i_goal = block + ndata - 1; + ip->i_goal = block + *nblocks - 1; error = gfs2_meta_inode_buffer(ip, &dibh); if (error == 0) { struct gfs2_dinode *di = @@ -2389,12 +2467,20 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, brelse(dibh); } } - if (rbm.rgd->rd_free < *nblocks) { + spin_lock(&rbm.rgd->rd_rsspin); + gfs2_adjust_reservation(ip, &rbm, *nblocks); + if (rbm.rgd->rd_free < *nblocks || rbm.rgd->rd_reserved < *nblocks) { fs_warn(sdp, "nblocks=%u\n", *nblocks); + spin_unlock(&rbm.rgd->rd_rsspin); goto rgrp_error; } - + BUG_ON(rbm.rgd->rd_reserved < *nblocks); + BUG_ON(rbm.rgd->rd_free_clone < *nblocks); + BUG_ON(rbm.rgd->rd_free < *nblocks); + rbm.rgd->rd_reserved -= *nblocks; + rbm.rgd->rd_free_clone -= *nblocks; rbm.rgd->rd_free -= *nblocks; + spin_unlock(&rbm.rgd->rd_rsspin); if (dinode) { rbm.rgd->rd_dinodes++; *generation = rbm.rgd->rd_igeneration++; @@ -2404,6 +2490,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh); gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data); + rgrp_unlock_local(rbm.rgd); gfs2_statfs_change(sdp, 0, -(s64)*nblocks, dinode ? 1 : 0); if (dinode) @@ -2411,13 +2498,13 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, gfs2_quota_change(ip, *nblocks, ip->i_inode.i_uid, ip->i_inode.i_gid); - rbm.rgd->rd_free_clone -= *nblocks; trace_gfs2_block_alloc(ip, rbm.rgd, block, *nblocks, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED); *bn = block; return 0; rgrp_error: + rgrp_unlock_local(rbm.rgd); gfs2_rgrp_error(rbm.rgd); return -EIO; } @@ -2437,12 +2524,14 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd, { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + rgrp_lock_local(rgd); rgblk_free(sdp, rgd, bstart, blen, GFS2_BLKST_FREE); trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE); rgd->rd_free += blen; rgd->rd_flags &= ~GFS2_RGF_TRIMMED; gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); + rgrp_unlock_local(rgd); /* Directories keep their data in the metadata address space */ if (meta || ip->i_depth || gfs2_is_jdata(ip)) @@ -2478,17 +2567,20 @@ void gfs2_unlink_di(struct inode *inode) rgd = gfs2_blk2rgrpd(sdp, blkno, true); if (!rgd) return; + rgrp_lock_local(rgd); rgblk_free(sdp, rgd, blkno, 1, GFS2_BLKST_UNLINKED); trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED); gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); be32_add_cpu(&rgd->rd_rgl->rl_unlinked, 1); + rgrp_unlock_local(rgd); } void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) { struct gfs2_sbd *sdp = rgd->rd_sbd; + rgrp_lock_local(rgd); rgblk_free(sdp, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE); if (!rgd->rd_dinodes) gfs2_consist_rgrpd(rgd); @@ -2497,6 +2589,7 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); + rgrp_unlock_local(rgd); be32_add_cpu(&rgd->rd_rgl->rl_unlinked, -1); gfs2_statfs_change(sdp, 0, +1, -1); @@ -2511,6 +2604,10 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) * @no_addr: The block number to check * @type: The block type we are looking for * + * The inode glock of @no_addr must be held. The @type to check for is either + * GFS2_BLKST_DINODE or GFS2_BLKST_UNLINKED; checking for type GFS2_BLKST_FREE + * or GFS2_BLKST_USED would make no sense. + * * Returns: 0 if the block type matches the expected type * -ESTALE if it doesn't match * or -ve errno if something went wrong while checking @@ -2534,6 +2631,13 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type) rbm.rgd = rgd; error = gfs2_rbm_from_block(&rbm, no_addr); if (!WARN_ON_ONCE(error)) { + /* + * No need to take the local resource group lock here; the + * inode glock of @no_addr provides the necessary + * synchronization in case the block is an inode. (In case + * the block is not an inode, the block type will not match + * the @type we are looking for.) + */ if (gfs2_testbit(&rbm, false) != type) error = -ESTALE; } @@ -2578,7 +2682,7 @@ void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist, return; rgd = gfs2_blk2rgrpd(sdp, block, 1); } else { - rgd = ip->i_res.rs_rbm.rgd; + rgd = ip->i_res.rs_rgd; if (!rgd || !rgrp_contains_block(rgd, block)) rgd = gfs2_blk2rgrpd(sdp, block, 1); } @@ -2633,9 +2737,8 @@ void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist) sizeof(struct gfs2_holder), GFP_NOFS | __GFP_NOFAIL); for (x = 0; x < rlist->rl_rgrps; x++) - gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, - LM_ST_EXCLUSIVE, 0, - &rlist->rl_ghs[x]); + gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, LM_ST_EXCLUSIVE, + LM_FLAG_NODE_SCOPE, &rlist->rl_ghs[x]); } /** @@ -2658,3 +2761,14 @@ void gfs2_rlist_free(struct gfs2_rgrp_list *rlist) } } +void rgrp_lock_local(struct gfs2_rgrpd *rgd) +{ + BUG_ON(!gfs2_glock_is_held_excl(rgd->rd_gl) && + !test_bit(SDF_NORECOVERY, &rgd->rd_sbd->sd_flags)); + mutex_lock(&rgd->rd_mutex); +} + +void rgrp_unlock_local(struct gfs2_rgrpd *rgd) +{ + mutex_unlock(&rgd->rd_mutex); +} diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 9a587ada51ed..a6855fd796e0 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -77,7 +77,7 @@ extern int gfs2_fitrim(struct file *filp, void __user *argp); /* This is how to tell if a reservation is in the rgrp tree: */ static inline bool gfs2_rs_active(const struct gfs2_blkreserv *rs) { - return rs && !RB_EMPTY_NODE(&rs->rs_node); + return !RB_EMPTY_NODE(&rs->rs_node); } static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) @@ -88,4 +88,8 @@ static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) } extern void check_and_update_goal(struct gfs2_inode *ip); + +extern void rgrp_lock_local(struct gfs2_rgrpd *rgd); +extern void rgrp_unlock_local(struct gfs2_rgrpd *rgd); + #endif /* __RGRP_DOT_H__ */ diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 2f56acc41c04..861ed5fe02a5 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -81,19 +81,12 @@ void gfs2_jindex_free(struct gfs2_sbd *sdp) static struct gfs2_jdesc *jdesc_find_i(struct list_head *head, unsigned int jid) { struct gfs2_jdesc *jd; - int found = 0; list_for_each_entry(jd, head, jd_list) { - if (jd->jd_jid == jid) { - found = 1; - break; - } + if (jd->jd_jid == jid) + return jd; } - - if (!found) - jd = NULL; - - return jd; + return NULL; } struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid) @@ -165,7 +158,6 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) { struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); struct gfs2_glock *j_gl = ip->i_gl; - struct gfs2_holder freeze_gh; struct gfs2_log_header_host head; int error; @@ -173,12 +165,6 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) if (error) return error; - error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, - LM_FLAG_NOEXP | GL_EXACT, - &freeze_gh); - if (error) - goto fail_threads; - j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); if (gfs2_withdrawn(sdp)) { error = -EIO; @@ -205,13 +191,9 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); - gfs2_glock_dq_uninit(&freeze_gh); - return 0; fail: - gfs2_glock_dq_uninit(&freeze_gh); -fail_threads: if (sdp->sd_quotad_process) kthread_stop(sdp->sd_quotad_process); sdp->sd_quotad_process = NULL; @@ -452,7 +434,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp) } if (error) - gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); + gfs2_freeze_unlock(&sdp->sd_freeze_gh); out: while (!list_empty(&list)) { @@ -562,8 +544,6 @@ static void gfs2_dirty_inode(struct inode *inode, int flags) int need_endtrans = 0; int ret; - if (!(flags & I_DIRTY_INODE)) - return; if (unlikely(gfs2_withdrawn(sdp))) return; if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { @@ -609,30 +589,9 @@ out: int gfs2_make_fs_ro(struct gfs2_sbd *sdp) { - struct gfs2_holder freeze_gh; int error = 0; int log_write_allowed = test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); - gfs2_holder_mark_uninitialized(&freeze_gh); - if (sdp->sd_freeze_gl && - !gfs2_glock_is_locked_by_me(sdp->sd_freeze_gl)) { - if (!log_write_allowed) { - error = gfs2_glock_nq_init(sdp->sd_freeze_gl, - LM_ST_SHARED, LM_FLAG_TRY | - LM_FLAG_NOEXP | GL_EXACT, - &freeze_gh); - if (error == GLR_TRYFAILED) - error = 0; - } else { - error = gfs2_glock_nq_init(sdp->sd_freeze_gl, - LM_ST_SHARED, - LM_FLAG_NOEXP | GL_EXACT, - &freeze_gh); - if (error && !gfs2_withdrawn(sdp)) - return error; - } - } - gfs2_flush_delete_work(sdp); if (!log_write_allowed && current == sdp->sd_quotad_process) fs_warn(sdp, "The quotad daemon is withdrawing.\n"); @@ -652,18 +611,15 @@ int gfs2_make_fs_ro(struct gfs2_sbd *sdp) gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_SHUTDOWN | GFS2_LFC_MAKE_FS_RO); - wait_event(sdp->sd_reserving_log_wait, - atomic_read(&sdp->sd_reserving_log) == 0); - gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == - sdp->sd_jdesc->jd_blocks); + wait_event_timeout(sdp->sd_log_waitq, + gfs2_log_is_empty(sdp), + HZ * 5); + gfs2_assert_warn(sdp, gfs2_log_is_empty(sdp)); } else { - wait_event_timeout(sdp->sd_reserving_log_wait, - atomic_read(&sdp->sd_reserving_log) == 0, + wait_event_timeout(sdp->sd_log_waitq, + gfs2_log_is_empty(sdp), HZ * 5); } - if (gfs2_holder_initialized(&freeze_gh)) - gfs2_glock_dq_uninit(&freeze_gh); - gfs2_quota_cleanup(sdp); if (!log_write_allowed) @@ -772,10 +728,8 @@ void gfs2_freeze_func(struct work_struct *work) struct super_block *sb = sdp->sd_vfs; atomic_inc(&sb->s_active); - error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, - LM_FLAG_NOEXP | GL_EXACT, &freeze_gh); + error = gfs2_freeze_lock(sdp, &freeze_gh, 0); if (error) { - fs_info(sdp, "GFS2: couldn't get freeze lock : %d\n", error); gfs2_assert_withdraw(sdp, 0); } else { atomic_set(&sdp->sd_freeze_state, SFS_UNFROZEN); @@ -785,7 +739,7 @@ void gfs2_freeze_func(struct work_struct *work) error); gfs2_assert_withdraw(sdp, 0); } - gfs2_glock_dq_uninit(&freeze_gh); + gfs2_freeze_unlock(&freeze_gh); } deactivate_super(sb); clear_bit_unlock(SDF_FS_FROZEN, &sdp->sd_flags); @@ -853,7 +807,7 @@ static int gfs2_unfreeze(struct super_block *sb) return 0; } - gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); + gfs2_freeze_unlock(&sdp->sd_freeze_gh); mutex_unlock(&sdp->sd_freeze_mutex); return wait_on_bit(&sdp->sd_flags, SDF_FS_FROZEN, TASK_INTERRUPTIBLE); } @@ -1229,7 +1183,8 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip) goto out_qs; } - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh); + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, + LM_FLAG_NODE_SCOPE, &gh); if (error) goto out_qs; diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index c9fb2a654181..08e502dec7ec 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -11,6 +11,10 @@ #include <linux/dcache.h> #include "incore.h" +/* Supported fs format version range */ +#define GFS2_FS_FORMAT_MIN (1801) +#define GFS2_FS_FORMAT_MAX (1802) + extern void gfs2_lm_unmount(struct gfs2_sbd *sdp); static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) @@ -54,7 +58,9 @@ extern struct file_system_type gfs2meta_fs_type; extern const struct export_operations gfs2_export_ops; extern const struct super_operations gfs2_super_ops; extern const struct dentry_operations gfs2_dops; -extern const struct xattr_handler *gfs2_xattr_handlers[]; + +extern const struct xattr_handler *gfs2_xattr_handlers_max[]; +extern const struct xattr_handler **gfs2_xattr_handlers_min; #endif /* __SUPER_DOT_H__ */ diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index 0b2f858d9a8c..bd6c8e9e49db 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -560,6 +560,7 @@ TRACE_EVENT(gfs2_block_alloc, __field( u8, block_state ) __field( u64, rd_addr ) __field( u32, rd_free_clone ) + __field( u32, rd_requested ) __field( u32, rd_reserved ) ), @@ -571,17 +572,20 @@ TRACE_EVENT(gfs2_block_alloc, __entry->block_state = block_state; __entry->rd_addr = rgd->rd_addr; __entry->rd_free_clone = rgd->rd_free_clone; + __entry->rd_requested = rgd->rd_requested; __entry->rd_reserved = rgd->rd_reserved; ), - TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u rr:%lu", + TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u rq:%u rr:%u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->inum, (unsigned long long)__entry->start, (unsigned long)__entry->len, block_state_name(__entry->block_state), (unsigned long long)__entry->rd_addr, - __entry->rd_free_clone, (unsigned long)__entry->rd_reserved) + __entry->rd_free_clone, + __entry->rd_requested, + __entry->rd_reserved) ); /* Keep track of multi-block reservations as they are allocated/freed */ @@ -595,33 +599,40 @@ TRACE_EVENT(gfs2_rs, __field( dev_t, dev ) __field( u64, rd_addr ) __field( u32, rd_free_clone ) + __field( u32, rd_requested ) __field( u32, rd_reserved ) __field( u64, inum ) __field( u64, start ) - __field( u32, free ) + __field( u32, requested ) + __field( u32, reserved ) __field( u8, func ) ), TP_fast_assign( - __entry->dev = rs->rs_rbm.rgd->rd_sbd->sd_vfs->s_dev; - __entry->rd_addr = rs->rs_rbm.rgd->rd_addr; - __entry->rd_free_clone = rs->rs_rbm.rgd->rd_free_clone; - __entry->rd_reserved = rs->rs_rbm.rgd->rd_reserved; + __entry->dev = rs->rs_rgd->rd_sbd->sd_vfs->s_dev; + __entry->rd_addr = rs->rs_rgd->rd_addr; + __entry->rd_free_clone = rs->rs_rgd->rd_free_clone; + __entry->rd_requested = rs->rs_rgd->rd_requested; + __entry->rd_reserved = rs->rs_rgd->rd_reserved; __entry->inum = container_of(rs, struct gfs2_inode, i_res)->i_no_addr; - __entry->start = gfs2_rbm_to_block(&rs->rs_rbm); - __entry->free = rs->rs_free; + __entry->start = rs->rs_start; + __entry->requested = rs->rs_requested; + __entry->reserved = rs->rs_reserved; __entry->func = func; ), - TP_printk("%u,%u bmap %llu resrv %llu rg:%llu rf:%lu rr:%lu %s f:%lu", + TP_printk("%u,%u bmap %llu resrv %llu rg:%llu rf:%u rq:%u rr:%u %s q:%u r:%u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->inum, (unsigned long long)__entry->start, (unsigned long long)__entry->rd_addr, - (unsigned long)__entry->rd_free_clone, - (unsigned long)__entry->rd_reserved, - rs_func_name(__entry->func), (unsigned long)__entry->free) + __entry->rd_free_clone, + __entry->rd_requested, + __entry->rd_reserved, + rs_func_name(__entry->func), + __entry->requested, + __entry->reserved) ); #endif /* _TRACE_GFS2_H */ diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 6d4bf7ea7b3b..ab96cf0bf26b 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -31,17 +31,17 @@ static void gfs2_print_trans(struct gfs2_sbd *sdp, const struct gfs2_trans *tr) fs_warn(sdp, "blocks=%u revokes=%u reserved=%u touched=%u\n", tr->tr_blocks, tr->tr_revokes, tr->tr_reserved, test_bit(TR_TOUCHED, &tr->tr_flags)); - fs_warn(sdp, "Buf %u/%u Databuf %u/%u Revoke %u/%u\n", + fs_warn(sdp, "Buf %u/%u Databuf %u/%u Revoke %u\n", tr->tr_num_buf_new, tr->tr_num_buf_rm, tr->tr_num_databuf_new, tr->tr_num_databuf_rm, - tr->tr_num_revoke, tr->tr_num_revoke_rm); + tr->tr_num_revoke); } -int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, - unsigned int revokes) +int __gfs2_trans_begin(struct gfs2_trans *tr, struct gfs2_sbd *sdp, + unsigned int blocks, unsigned int revokes, + unsigned long ip) { - struct gfs2_trans *tr; - int error; + unsigned int extra_revokes; if (current->journal_info) { gfs2_print_trans(sdp, current->journal_info); @@ -52,39 +52,72 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) return -EROFS; - tr = kmem_cache_zalloc(gfs2_trans_cachep, GFP_NOFS); - if (!tr) - return -ENOMEM; - - tr->tr_ip = _RET_IP_; + tr->tr_ip = ip; tr->tr_blocks = blocks; tr->tr_revokes = revokes; - tr->tr_reserved = 1; - set_bit(TR_ALLOCED, &tr->tr_flags); - if (blocks) - tr->tr_reserved += 6 + blocks; - if (revokes) - tr->tr_reserved += gfs2_struct2blk(sdp, revokes); + tr->tr_reserved = GFS2_LOG_FLUSH_MIN_BLOCKS; + if (blocks) { + /* + * The reserved blocks are either used for data or metadata. + * We can have mixed data and metadata, each with its own log + * descriptor block; see calc_reserved(). + */ + tr->tr_reserved += blocks + 1 + DIV_ROUND_UP(blocks - 1, databuf_limit(sdp)); + } INIT_LIST_HEAD(&tr->tr_databuf); INIT_LIST_HEAD(&tr->tr_buf); INIT_LIST_HEAD(&tr->tr_list); INIT_LIST_HEAD(&tr->tr_ail1_list); INIT_LIST_HEAD(&tr->tr_ail2_list); + if (gfs2_assert_warn(sdp, tr->tr_reserved <= sdp->sd_jdesc->jd_blocks)) + return -EINVAL; + sb_start_intwrite(sdp->sd_vfs); - error = gfs2_log_reserve(sdp, tr->tr_reserved); - if (error) - goto fail; + /* + * Try the reservations under sd_log_flush_lock to prevent log flushes + * from creating inconsistencies between the number of allocated and + * reserved revokes. If that fails, do a full-block allocation outside + * of the lock to avoid stalling log flushes. Then, allot the + * appropriate number of blocks to revokes, use as many revokes locally + * as needed, and "release" the surplus into the revokes pool. + */ + + down_read(&sdp->sd_log_flush_lock); + if (gfs2_log_try_reserve(sdp, tr, &extra_revokes)) + goto reserved; + up_read(&sdp->sd_log_flush_lock); + gfs2_log_reserve(sdp, tr, &extra_revokes); + down_read(&sdp->sd_log_flush_lock); + +reserved: + gfs2_log_release_revokes(sdp, extra_revokes); + if (unlikely(!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))) { + gfs2_log_release_revokes(sdp, tr->tr_revokes); + up_read(&sdp->sd_log_flush_lock); + gfs2_log_release(sdp, tr->tr_reserved); + sb_end_intwrite(sdp->sd_vfs); + return -EROFS; + } current->journal_info = tr; return 0; +} -fail: - sb_end_intwrite(sdp->sd_vfs); - kmem_cache_free(gfs2_trans_cachep, tr); +int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, + unsigned int revokes) +{ + struct gfs2_trans *tr; + int error; + tr = kmem_cache_zalloc(gfs2_trans_cachep, GFP_NOFS); + if (!tr) + return -ENOMEM; + error = __gfs2_trans_begin(tr, sdp, blocks, revokes, _RET_IP_); + if (error) + kmem_cache_free(gfs2_trans_cachep, tr); return error; } @@ -92,37 +125,39 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) { struct gfs2_trans *tr = current->journal_info; s64 nbuf; - int alloced = test_bit(TR_ALLOCED, &tr->tr_flags); current->journal_info = NULL; if (!test_bit(TR_TOUCHED, &tr->tr_flags)) { + gfs2_log_release_revokes(sdp, tr->tr_revokes); + up_read(&sdp->sd_log_flush_lock); gfs2_log_release(sdp, tr->tr_reserved); - if (alloced) { + if (!test_bit(TR_ONSTACK, &tr->tr_flags)) gfs2_trans_free(sdp, tr); - sb_end_intwrite(sdp->sd_vfs); - } + sb_end_intwrite(sdp->sd_vfs); return; } + gfs2_log_release_revokes(sdp, tr->tr_revokes - tr->tr_num_revoke); + nbuf = tr->tr_num_buf_new + tr->tr_num_databuf_new; nbuf -= tr->tr_num_buf_rm; nbuf -= tr->tr_num_databuf_rm; - if (gfs2_assert_withdraw(sdp, (nbuf <= tr->tr_blocks) && - (tr->tr_num_revoke <= tr->tr_revokes))) + if (gfs2_assert_withdraw(sdp, nbuf <= tr->tr_blocks) || + gfs2_assert_withdraw(sdp, tr->tr_num_revoke <= tr->tr_revokes)) gfs2_print_trans(sdp, tr); gfs2_log_commit(sdp, tr); - if (alloced && !test_bit(TR_ATTACHED, &tr->tr_flags)) + if (!test_bit(TR_ONSTACK, &tr->tr_flags) && + !test_bit(TR_ATTACHED, &tr->tr_flags)) gfs2_trans_free(sdp, tr); up_read(&sdp->sd_log_flush_lock); if (sdp->sd_vfs->s_flags & SB_SYNCHRONOUS) gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_TRANS_END); - if (alloced) - sb_end_intwrite(sdp->sd_vfs); + sb_end_intwrite(sdp->sd_vfs); } static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl, @@ -262,7 +297,6 @@ void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) { struct gfs2_bufdata *bd, *tmp; - struct gfs2_trans *tr = current->journal_info; unsigned int n = len; gfs2_log_lock(sdp); @@ -274,7 +308,7 @@ void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) if (bd->bd_gl) gfs2_glock_remove_revoke(bd->bd_gl); kmem_cache_free(gfs2_bufdata_cachep, bd); - tr->tr_num_revoke_rm++; + gfs2_log_release_revokes(sdp, 1); if (--n == 0) break; } diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index 83199ce5a5c5..c76ad9a4c75a 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h @@ -27,13 +27,16 @@ struct gfs2_glock; * block, or all of the blocks in the rg, whichever is smaller */ static inline unsigned int gfs2_rg_blocks(const struct gfs2_inode *ip, unsigned requested) { - struct gfs2_rgrpd *rgd = ip->i_res.rs_rbm.rgd; + struct gfs2_rgrpd *rgd = ip->i_res.rs_rgd; if (requested < rgd->rd_length) return requested + 1; return rgd->rd_length; } +extern int __gfs2_trans_begin(struct gfs2_trans *tr, struct gfs2_sbd *sdp, + unsigned int blocks, unsigned int revokes, + unsigned long ip); extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, unsigned int revokes); diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index a374397f4273..8d3c670c990f 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -91,12 +91,39 @@ out_unlock: return error; } +/** + * gfs2_freeze_lock - hold the freeze glock + * @sdp: the superblock + * @freeze_gh: pointer to the requested holder + * @caller_flags: any additional flags needed by the caller + */ +int gfs2_freeze_lock(struct gfs2_sbd *sdp, struct gfs2_holder *freeze_gh, + int caller_flags) +{ + int flags = LM_FLAG_NOEXP | GL_EXACT | caller_flags; + int error; + + error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, flags, + freeze_gh); + if (error && error != GLR_TRYFAILED) + fs_err(sdp, "can't lock the freeze lock: %d\n", error); + return error; +} + +void gfs2_freeze_unlock(struct gfs2_holder *freeze_gh) +{ + if (gfs2_holder_initialized(freeze_gh)) + gfs2_glock_dq_uninit(freeze_gh); +} + static void signal_our_withdraw(struct gfs2_sbd *sdp) { - struct gfs2_glock *gl = sdp->sd_live_gh.gh_gl; + struct gfs2_glock *live_gl = sdp->sd_live_gh.gh_gl; struct inode *inode = sdp->sd_jdesc->jd_inode; struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_glock *i_gl = ip->i_gl; u64 no_formal_ino = ip->i_no_formal_ino; + int log_write_allowed = test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); int ret = 0; int tries; @@ -117,8 +144,21 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) * therefore we need to clear SDF_JOURNAL_LIVE manually. */ clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); - if (!sb_rdonly(sdp->sd_vfs)) - ret = gfs2_make_fs_ro(sdp); + if (!sb_rdonly(sdp->sd_vfs)) { + struct gfs2_holder freeze_gh; + + gfs2_holder_mark_uninitialized(&freeze_gh); + if (sdp->sd_freeze_gl && + !gfs2_glock_is_locked_by_me(sdp->sd_freeze_gl)) { + ret = gfs2_freeze_lock(sdp, &freeze_gh, + log_write_allowed ? 0 : LM_FLAG_TRY); + if (ret == GLR_TRYFAILED) + ret = 0; + } + if (!ret) + ret = gfs2_make_fs_ro(sdp); + gfs2_freeze_unlock(&freeze_gh); + } if (sdp->sd_lockstruct.ls_ops->lm_lock == NULL) { /* lock_nolock */ if (!ret) @@ -141,7 +181,8 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) atomic_set(&sdp->sd_freeze_state, SFS_FROZEN); thaw_super(sdp->sd_vfs); } else { - wait_on_bit(&gl->gl_flags, GLF_DEMOTE, TASK_UNINTERRUPTIBLE); + wait_on_bit(&i_gl->gl_flags, GLF_DEMOTE, + TASK_UNINTERRUPTIBLE); } /* @@ -161,15 +202,15 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) * on other nodes to be successful, otherwise we remain the owner of * the glock as far as dlm is concerned. */ - if (gl->gl_ops->go_free) { - set_bit(GLF_FREEING, &gl->gl_flags); - wait_on_bit(&gl->gl_flags, GLF_FREEING, TASK_UNINTERRUPTIBLE); + if (i_gl->gl_ops->go_free) { + set_bit(GLF_FREEING, &i_gl->gl_flags); + wait_on_bit(&i_gl->gl_flags, GLF_FREEING, TASK_UNINTERRUPTIBLE); } /* * Dequeue the "live" glock, but keep a reference so it's never freed. */ - gfs2_glock_hold(gl); + gfs2_glock_hold(live_gl); gfs2_glock_dq_wait(&sdp->sd_live_gh); /* * We enqueue the "live" glock in EX so that all other nodes @@ -208,7 +249,7 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) gfs2_glock_nq(&sdp->sd_live_gh); } - gfs2_glock_queue_put(gl); /* drop the extra reference we acquired */ + gfs2_glock_queue_put(live_gl); /* drop extra reference we acquired */ clear_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags); /* diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index a4443dd8a94b..69e1a0ae5a4d 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -149,6 +149,9 @@ int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, extern int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, bool verbose); +extern int gfs2_freeze_lock(struct gfs2_sbd *sdp, + struct gfs2_holder *freeze_gh, int caller_flags); +extern void gfs2_freeze_unlock(struct gfs2_holder *freeze_gh); #define gfs2_io_error(sdp) \ gfs2_io_error_i((sdp), __func__, __FILE__, __LINE__) diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 9d7667bc4292..124b3d5a7266 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -70,6 +70,20 @@ static int ea_check_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize) return 0; } +static bool gfs2_eatype_valid(struct gfs2_sbd *sdp, u8 type) +{ + switch(sdp->sd_sb.sb_fs_format) { + case GFS2_FS_FORMAT_MAX: + return true; + + case GFS2_FS_FORMAT_MIN: + return type <= GFS2_EATYPE_SECURITY; + + default: + return false; + } +} + typedef int (*ea_call_t) (struct gfs2_inode *ip, struct buffer_head *bh, struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, void *private); @@ -77,6 +91,7 @@ typedef int (*ea_call_t) (struct gfs2_inode *ip, struct buffer_head *bh, static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh, ea_call_t ea_call, void *data) { + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_ea_header *ea, *prev = NULL; int error = 0; @@ -89,9 +104,8 @@ static int ea_foreach_i(struct gfs2_inode *ip, struct buffer_head *bh, if (!(bh->b_data <= (char *)ea && (char *)GFS2_EA2NEXT(ea) <= bh->b_data + bh->b_size)) goto fail; - if (!GFS2_EATYPE_VALID(ea->ea_type)) + if (!gfs2_eatype_valid(sdp, ea->ea_type)) goto fail; - error = ea_call(ip, bh, ea, prev, data); if (error) return error; @@ -259,7 +273,8 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, return -EIO; } - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &rg_gh); + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, + LM_FLAG_NODE_SCOPE, &rg_gh); if (error) return error; @@ -344,6 +359,7 @@ static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, void *private) { + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct ea_list *ei = private; struct gfs2_ea_request *er = ei->ei_er; unsigned int ea_size; @@ -353,6 +369,8 @@ static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, if (ea->ea_type == GFS2_EATYPE_UNUSED) return 0; + BUG_ON(ea->ea_type > GFS2_EATYPE_SECURITY && + sdp->sd_sb.sb_fs_format == GFS2_FS_FORMAT_MIN); switch (ea->ea_type) { case GFS2_EATYPE_USR: prefix = "user."; @@ -366,8 +384,12 @@ static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, prefix = "security."; l = 9; break; + case GFS2_EATYPE_TRUSTED: + prefix = "trusted."; + l = 8; + break; default: - BUG(); + return 0; } ea_size = l + ea->ea_name_len + 1; @@ -1214,6 +1236,7 @@ int __gfs2_xattr_set(struct inode *inode, const char *name, } static int gfs2_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -1385,7 +1408,8 @@ static int ea_dealloc_block(struct gfs2_inode *ip) return -EIO; } - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, &gh); + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, + LM_FLAG_NODE_SCOPE, &gh); if (error) return error; @@ -1463,7 +1487,25 @@ static const struct xattr_handler gfs2_xattr_security_handler = { .set = gfs2_xattr_set, }; -const struct xattr_handler *gfs2_xattr_handlers[] = { +static bool +gfs2_xattr_trusted_list(struct dentry *dentry) +{ + return capable(CAP_SYS_ADMIN); +} + +static const struct xattr_handler gfs2_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .flags = GFS2_EATYPE_TRUSTED, + .list = gfs2_xattr_trusted_list, + .get = gfs2_xattr_get, + .set = gfs2_xattr_set, +}; + +const struct xattr_handler *gfs2_xattr_handlers_max[] = { + /* GFS2_FS_FORMAT_MAX */ + &gfs2_xattr_trusted_handler, + + /* GFS2_FS_FORMAT_MIN */ &gfs2_xattr_user_handler, &gfs2_xattr_security_handler, &posix_acl_access_xattr_handler, @@ -1471,3 +1513,4 @@ const struct xattr_handler *gfs2_xattr_handlers[] = { NULL, }; +const struct xattr_handler **gfs2_xattr_handlers_min = gfs2_xattr_handlers_max + 1; diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c index 74fa62643136..2bd54efaf416 100644 --- a/fs/hfs/attr.c +++ b/fs/hfs/attr.c @@ -121,6 +121,7 @@ static int hfs_xattr_get(const struct xattr_handler *handler, } static int hfs_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 3bf2ae0e467c..527f6e46cbe8 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -189,8 +189,8 @@ static int hfs_dir_release(struct inode *inode, struct file *file) * a directory and return a corresponding inode, given the inode for * the directory and the name (and its length) of the new file. */ -static int hfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int hfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; int res; @@ -219,7 +219,8 @@ static int hfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, * in a directory, given the inode for the parent directory and the * name (and its length) of the new directory. */ -static int hfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int hfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; int res; @@ -279,9 +280,9 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry) * new file/directory. * XXX: how do you handle must_be dir? */ -static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int hfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { int res; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index f71c384064c8..b8eb0322a3e5 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -204,7 +204,8 @@ extern const struct address_space_operations hfs_btree_aops; extern struct inode *hfs_new_inode(struct inode *, const struct qstr *, umode_t); extern void hfs_inode_write_fork(struct inode *, struct hfs_extent *, __be32 *, __be32 *); extern int hfs_write_inode(struct inode *, struct writeback_control *); -extern int hfs_inode_setattr(struct dentry *, struct iattr *); +extern int hfs_inode_setattr(struct user_namespace *, struct dentry *, + struct iattr *); extern void hfs_inode_read_fork(struct inode *inode, struct hfs_extent *ext, __be32 log_size, __be32 phys_size, u32 clump_size); extern struct inode *hfs_iget(struct super_block *, struct hfs_cat_key *, hfs_cat_rec *); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index f35a37c65e5f..3fc5cb346586 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -602,13 +602,15 @@ static int hfs_file_release(struct inode *inode, struct file *file) * correspond to the same HFS file. */ -int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr) +int hfs_inode_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct inode *inode = d_inode(dentry); struct hfs_sb_info *hsb = HFS_SB(inode->i_sb); int error; - error = setattr_prepare(dentry, attr); /* basic permission checks */ + error = setattr_prepare(&init_user_ns, dentry, + attr); /* basic permission checks */ if (error) return error; @@ -647,7 +649,7 @@ int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr) current_time(inode); } - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 29a9dcfbe81f..03e6c046faf4 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -434,8 +434,8 @@ out: return res; } -static int hfsplus_symlink(struct inode *dir, struct dentry *dentry, - const char *symname) +static int hfsplus_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); struct inode *inode; @@ -476,8 +476,8 @@ out: return res; } -static int hfsplus_mknod(struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t rdev) +static int hfsplus_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); struct inode *inode; @@ -517,18 +517,20 @@ out: return res; } -static int hfsplus_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int hfsplus_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { - return hfsplus_mknod(dir, dentry, mode, 0); + return hfsplus_mknod(&init_user_ns, dir, dentry, mode, 0); } -static int hfsplus_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int hfsplus_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { - return hfsplus_mknod(dir, dentry, mode | S_IFDIR, 0); + return hfsplus_mknod(&init_user_ns, dir, dentry, mode | S_IFDIR, 0); } -static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, +static int hfsplus_rename(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index a92de5199ec3..12b20479ed2b 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -488,8 +488,9 @@ void hfsplus_inode_write_fork(struct inode *inode, struct hfsplus_fork_raw *fork); int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd); int hfsplus_cat_write_inode(struct inode *inode); -int hfsplus_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags); +int hfsplus_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags); int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index ca464328b79c..078c5c8a5156 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -241,12 +241,13 @@ static int hfsplus_file_release(struct inode *inode, struct file *file) return 0; } -static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) +static int hfsplus_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) return error; @@ -264,14 +265,15 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) inode->i_mtime = inode->i_ctime = current_time(inode); } - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); return 0; } -int hfsplus_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags) +int hfsplus_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct hfsplus_inode_info *hip = HFSPLUS_I(inode); @@ -286,7 +288,7 @@ int hfsplus_getattr(const struct path *path, struct kstat *stat, stat->attributes_mask |= STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP; - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); return 0; } @@ -376,7 +378,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir, return NULL; inode->i_ino = sbi->next_cnid++; - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); set_nlink(inode, 1); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index ce15b9496b77..3edb1926d127 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -91,7 +91,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) if (err) goto out; - if (!inode_owner_or_capable(inode)) { + if (!inode_owner_or_capable(&init_user_ns, inode)) { err = -EACCES; goto out_drop_write; } diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index bb0b27d88e50..4d169c5a2673 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -858,6 +858,7 @@ static int hfsplus_osx_getxattr(const struct xattr_handler *handler, } static int hfsplus_osx_setxattr(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c index cfbe6a3bfb1e..c1c7a16cbf21 100644 --- a/fs/hfsplus/xattr_security.c +++ b/fs/hfsplus/xattr_security.c @@ -23,6 +23,7 @@ static int hfsplus_security_getxattr(const struct xattr_handler *handler, } static int hfsplus_security_setxattr(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c index fbad91e1dada..e150372ec564 100644 --- a/fs/hfsplus/xattr_trusted.c +++ b/fs/hfsplus/xattr_trusted.c @@ -22,6 +22,7 @@ static int hfsplus_trusted_getxattr(const struct xattr_handler *handler, } static int hfsplus_trusted_setxattr(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c index 74d19faf255e..a6b60b153916 100644 --- a/fs/hfsplus/xattr_user.c +++ b/fs/hfsplus/xattr_user.c @@ -22,6 +22,7 @@ static int hfsplus_user_getxattr(const struct xattr_handler *handler, } static int hfsplus_user_setxattr(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 4a5beca6eaa8..29e407762626 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -557,8 +557,8 @@ static int read_name(struct inode *ino, char *name) return 0; } -static int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int hostfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; char *name; @@ -656,8 +656,8 @@ static int hostfs_unlink(struct inode *ino, struct dentry *dentry) return err; } -static int hostfs_symlink(struct inode *ino, struct dentry *dentry, - const char *to) +static int hostfs_symlink(struct user_namespace *mnt_userns, struct inode *ino, + struct dentry *dentry, const char *to) { char *file; int err; @@ -669,7 +669,8 @@ static int hostfs_symlink(struct inode *ino, struct dentry *dentry, return err; } -static int hostfs_mkdir(struct inode *ino, struct dentry *dentry, umode_t mode) +static int hostfs_mkdir(struct user_namespace *mnt_userns, struct inode *ino, + struct dentry *dentry, umode_t mode) { char *file; int err; @@ -693,7 +694,8 @@ static int hostfs_rmdir(struct inode *ino, struct dentry *dentry) return err; } -static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) +static int hostfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; char *name; @@ -731,7 +733,8 @@ static int hostfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, return err; } -static int hostfs_rename2(struct inode *old_dir, struct dentry *old_dentry, +static int hostfs_rename2(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -759,7 +762,8 @@ static int hostfs_rename2(struct inode *old_dir, struct dentry *old_dentry, return err; } -static int hostfs_permission(struct inode *ino, int desired) +static int hostfs_permission(struct user_namespace *mnt_userns, + struct inode *ino, int desired) { char *name; int r = 0, w = 0, x = 0, err; @@ -781,11 +785,12 @@ static int hostfs_permission(struct inode *ino, int desired) err = access_file(name, r, w, x); __putname(name); if (!err) - err = generic_permission(ino, desired); + err = generic_permission(&init_user_ns, ino, desired); return err; } -static int hostfs_setattr(struct dentry *dentry, struct iattr *attr) +static int hostfs_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct hostfs_iattr attrs; @@ -794,7 +799,7 @@ static int hostfs_setattr(struct dentry *dentry, struct iattr *attr) int fd = HOSTFS_I(inode)->fd; - err = setattr_prepare(dentry, attr); + err = setattr_prepare(&init_user_ns, dentry, attr); if (err) return err; @@ -851,7 +856,7 @@ static int hostfs_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_size != i_size_read(inode)) truncate_setsize(inode, attr->ia_size); - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index 1cca83218fb5..167ec6884642 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -280,7 +280,7 @@ void hpfs_init_inode(struct inode *); void hpfs_read_inode(struct inode *); void hpfs_write_inode(struct inode *); void hpfs_write_inode_nolock(struct inode *); -int hpfs_setattr(struct dentry *, struct iattr *); +int hpfs_setattr(struct user_namespace *, struct dentry *, struct iattr *); void hpfs_write_if_changed(struct inode *); void hpfs_evict_inode(struct inode *); diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index eb8b4baf0f2e..82208cc28ebd 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -257,7 +257,8 @@ void hpfs_write_inode_nolock(struct inode *i) brelse(bh); } -int hpfs_setattr(struct dentry *dentry, struct iattr *attr) +int hpfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct inode *inode = d_inode(dentry); int error = -EINVAL; @@ -274,7 +275,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size > inode->i_size) goto out_unlock; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) goto out_unlock; @@ -288,7 +289,7 @@ int hpfs_setattr(struct dentry *dentry, struct iattr *attr) hpfs_truncate(inode); } - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); hpfs_write_inode(inode); diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 1aee39160ac5..d73f8a67168e 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -20,7 +20,8 @@ static void hpfs_update_directory_times(struct inode *dir) hpfs_write_inode_nolock(dir); } -static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int hpfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { const unsigned char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; @@ -128,7 +129,8 @@ bail: return err; } -static int hpfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) +static int hpfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { const unsigned char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; @@ -215,7 +217,8 @@ bail: return err; } -static int hpfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) +static int hpfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { const unsigned char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; @@ -289,7 +292,8 @@ bail: return err; } -static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *symlink) +static int hpfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symlink) { const unsigned char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; @@ -506,10 +510,10 @@ fail: const struct address_space_operations hpfs_symlink_aops = { .readpage = hpfs_symlink_readpage }; - -static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) + +static int hpfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { const unsigned char *old_name = old_dentry->d_name.name; unsigned old_len = old_dentry->d_name.len; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 21c20fd5f9ee..701c82c36138 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -171,7 +171,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); ret = -ENOMEM; - if (hugetlb_reserve_pages(inode, + if (!hugetlb_reserve_pages(inode, vma->vm_pgoff >> huge_page_order(h), len >> huge_page_shift(h), vma, vma->vm_flags)) @@ -310,7 +310,7 @@ hugetlbfs_read_actor(struct page *page, unsigned long offset, /* * Support for read() - Find the page attached to f_mapping and copy out the - * data. Its *very* similar to do_generic_mapping_read(), we can't use that + * data. Its *very* similar to generic_file_buffered_read(), we can't use that * since it has PAGE_SIZE assumptions. */ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) @@ -442,15 +442,15 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end) * * truncation is indicated by end of range being LLONG_MAX * In this case, we first scan the range and release found pages. - * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv + * After releasing pages, hugetlb_unreserve_pages cleans up region/reserve * maps and global counts. Page faults can not race with truncation * in this routine. hugetlb_no_page() holds i_mmap_rwsem and prevents * page faults in the truncated range by checking i_size. i_size is * modified while holding i_mmap_rwsem. * hole punch is indicated if end is not LLONG_MAX * In the hole punch case we scan the range and release found pages. - * Only when releasing a page is the associated region/reserv map - * deleted. The region/reserv map for ranges without associated + * Only when releasing a page is the associated region/reserve map + * deleted. The region/reserve map for ranges without associated * pages are not modified. Page faults can race with hole punch. * This is indicated if we find a mapped page. * Note: If the passed end of range value is beyond the end of file, but @@ -567,7 +567,7 @@ static void hugetlbfs_evict_inode(struct inode *inode) clear_inode(inode); } -static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) +static void hugetlb_vmtruncate(struct inode *inode, loff_t offset) { pgoff_t pgoff; struct address_space *mapping = inode->i_mapping; @@ -582,7 +582,6 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0); i_mmap_unlock_write(mapping); remove_inode_hugepages(inode, offset, LLONG_MAX); - return 0; } static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) @@ -604,7 +603,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) inode_lock(inode); - /* protected by i_mutex */ + /* protected by i_rwsem */ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { inode_unlock(inode); return -EPERM; @@ -680,7 +679,6 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, */ struct page *page; unsigned long addr; - int avoid_reserve = 0; cond_resched(); @@ -716,8 +714,15 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, continue; } - /* Allocate page and add to page cache */ - page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve); + /* + * Allocate page without setting the avoid_reserve argument. + * There certainly are no reserves associated with the + * pseudo_vma. However, there could be shared mappings with + * reserves for the file at the inode level. If we fallocate + * pages in these areas, we need to consume the reserves + * to keep reservation accounting consistent. + */ + page = alloc_huge_page(&pseudo_vma, addr, 0); hugetlb_drop_vma_policy(&pseudo_vma); if (IS_ERR(page)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); @@ -735,7 +740,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, mutex_unlock(&hugetlb_fault_mutex_table[hash]); - set_page_huge_active(page); + SetHPageMigratable(page); /* * unlock_page because locked by add_to_page_cache() * put_page() due to reference from alloc_huge_page() @@ -752,7 +757,8 @@ out: return error; } -static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) +static int hugetlbfs_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct hstate *h = hstate_inode(inode); @@ -760,9 +766,7 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) unsigned int ia_valid = attr->ia_valid; struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); - BUG_ON(!inode); - - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) return error; @@ -772,16 +776,14 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) if (newsize & ~huge_page_mask(h)) return -EINVAL; - /* protected by i_mutex */ + /* protected by i_rwsem */ if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || (newsize > oldsize && (info->seals & F_SEAL_GROW))) return -EPERM; - error = hugetlb_vmtruncate(inode, newsize); - if (error) - return error; + hugetlb_vmtruncate(inode, newsize); } - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); return 0; } @@ -837,7 +839,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); inode->i_ino = get_next_ino(); - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, &hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; @@ -899,33 +901,39 @@ static int do_hugetlbfs_mknod(struct inode *dir, return error; } -static int hugetlbfs_mknod(struct inode *dir, - struct dentry *dentry, umode_t mode, dev_t dev) +static int hugetlbfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t dev) { return do_hugetlbfs_mknod(dir, dentry, mode, dev, false); } -static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int hugetlbfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { - int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0); + int retval = hugetlbfs_mknod(&init_user_ns, dir, dentry, + mode | S_IFDIR, 0); if (!retval) inc_nlink(dir); return retval; } -static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) +static int hugetlbfs_create(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, + umode_t mode, bool excl) { - return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0); + return hugetlbfs_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0); } -static int hugetlbfs_tmpfile(struct inode *dir, - struct dentry *dentry, umode_t mode) +static int hugetlbfs_tmpfile(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, + umode_t mode) { return do_hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0, true); } -static int hugetlbfs_symlink(struct inode *dir, - struct dentry *dentry, const char *symname) +static int hugetlbfs_symlink(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, + const char *symname) { struct inode *inode; int error = -ENOSPC; @@ -945,17 +953,6 @@ static int hugetlbfs_symlink(struct inode *dir, return error; } -/* - * mark the head page dirty - */ -static int hugetlbfs_set_page_dirty(struct page *page) -{ - struct page *head = compound_head(page); - - SetPageDirty(head); - return 0; -} - static int hugetlbfs_migrate_page(struct address_space *mapping, struct page *newpage, struct page *page, enum migrate_mode mode) @@ -966,15 +963,9 @@ static int hugetlbfs_migrate_page(struct address_space *mapping, if (rc != MIGRATEPAGE_SUCCESS) return rc; - /* - * page_private is subpool pointer in hugetlb pages. Transfer to - * new page. PagePrivate is not associated with page_private for - * hugetlb pages and can not be set here as only page_huge_active - * pages can be migrated. - */ - if (page_private(page)) { - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); + if (hugetlb_page_subpool(page)) { + hugetlb_set_page_subpool(newpage, hugetlb_page_subpool(page)); + hugetlb_set_page_subpool(page, NULL); } if (mode != MIGRATE_SYNC_NO_COPY) @@ -1149,7 +1140,7 @@ static void hugetlbfs_destroy_inode(struct inode *inode) static const struct address_space_operations hugetlbfs_aops = { .write_begin = hugetlbfs_write_begin, .write_end = hugetlbfs_write_end, - .set_page_dirty = hugetlbfs_set_page_dirty, + .set_page_dirty = __set_page_dirty_no_writeback, .migratepage = hugetlbfs_migrate_page, .error_remove_page = hugetlbfs_error_remove_page, }; @@ -1349,7 +1340,7 @@ hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc) /* * Allocate and initialize subpool if maximum or minimum size is - * specified. Any needed reservations (for minimim size) are taken + * specified. Any needed reservations (for minimum size) are taken * taken when the subpool is created. */ if (ctx->max_hpages != -1 || ctx->min_hpages != -1) { @@ -1492,7 +1483,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, inode->i_size = size; clear_nlink(inode); - if (hugetlb_reserve_pages(inode, 0, + if (!hugetlb_reserve_pages(inode, 0, size >> huge_page_shift(hstate_inode(inode)), NULL, acctflag)) file = ERR_PTR(-ENOMEM); @@ -1526,8 +1517,8 @@ static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h) put_fs_context(fc); } if (IS_ERR(mnt)) - pr_err("Cannot mount internal hugetlbfs for page size %uK", - 1U << (h->order + PAGE_SHIFT - 10)); + pr_err("Cannot mount internal hugetlbfs for page size %luK", + huge_page_size(h) >> 10); return mnt; } @@ -1555,7 +1546,7 @@ static int __init init_hugetlbfs_fs(void) goto out_free; /* default hstate mount is required */ - mnt = mount_one_hugetlbfs(&hstates[default_hstate_idx]); + mnt = mount_one_hugetlbfs(&default_hstate); if (IS_ERR(mnt)) { error = PTR_ERR(mnt); goto out_unreg; diff --git a/fs/init.c b/fs/init.c index e9c320a48cf1..5c36adaa9b44 100644 --- a/fs/init.c +++ b/fs/init.c @@ -49,7 +49,7 @@ int __init init_chdir(const char *filename) error = kern_path(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path); if (error) return error; - error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); + error = path_permission(&path, MAY_EXEC | MAY_CHDIR); if (!error) set_fs_pwd(current->fs, &path); path_put(&path); @@ -64,7 +64,7 @@ int __init init_chroot(const char *filename) error = kern_path(filename, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path); if (error) return error; - error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); + error = path_permission(&path, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; error = -EPERM; @@ -118,7 +118,7 @@ int __init init_eaccess(const char *filename) error = kern_path(filename, LOOKUP_FOLLOW, &path); if (error) return error; - error = inode_permission(d_inode(path.dentry), MAY_ACCESS); + error = path_permission(&path, MAY_ACCESS); path_put(&path); return error; } @@ -157,8 +157,8 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev) mode &= ~current_umask(); error = security_path_mknod(&path, dentry, mode, dev); if (!error) - error = vfs_mknod(path.dentry->d_inode, dentry, mode, - new_decode_dev(dev)); + error = vfs_mknod(mnt_user_ns(path.mnt), path.dentry->d_inode, + dentry, mode, new_decode_dev(dev)); done_path_create(&path, dentry); return error; } @@ -167,6 +167,7 @@ int __init init_link(const char *oldname, const char *newname) { struct dentry *new_dentry; struct path old_path, new_path; + struct user_namespace *mnt_userns; int error; error = kern_path(oldname, 0, &old_path); @@ -181,14 +182,15 @@ int __init init_link(const char *oldname, const char *newname) error = -EXDEV; if (old_path.mnt != new_path.mnt) goto out_dput; - error = may_linkat(&old_path); + mnt_userns = mnt_user_ns(new_path.mnt); + error = may_linkat(mnt_userns, &old_path); if (unlikely(error)) goto out_dput; error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) goto out_dput; - error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, - NULL); + error = vfs_link(old_path.dentry, mnt_userns, new_path.dentry->d_inode, + new_dentry, NULL); out_dput: done_path_create(&new_path, new_dentry); out: @@ -207,7 +209,8 @@ int __init init_symlink(const char *oldname, const char *newname) return PTR_ERR(dentry); error = security_path_symlink(&path, dentry, oldname); if (!error) - error = vfs_symlink(path.dentry->d_inode, dentry, oldname); + error = vfs_symlink(mnt_user_ns(path.mnt), path.dentry->d_inode, + dentry, oldname); done_path_create(&path, dentry); return error; } @@ -230,7 +233,8 @@ int __init init_mkdir(const char *pathname, umode_t mode) mode &= ~current_umask(); error = security_path_mkdir(&path, dentry, mode); if (!error) - error = vfs_mkdir(path.dentry->d_inode, dentry, mode); + error = vfs_mkdir(mnt_user_ns(path.mnt), path.dentry->d_inode, + dentry, mode); done_path_create(&path, dentry); return error; } diff --git a/fs/inode.c b/fs/inode.c index 6442d97d9a4a..a047ab306f9a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -142,6 +142,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) atomic_set(&inode->i_count, 1); inode->i_op = &empty_iops; inode->i_fop = &no_open_fops; + inode->i_ino = 0; inode->__i_nlink = 1; inode->i_opflags = 0; if (sb->s_xattr) @@ -1493,7 +1494,7 @@ struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval, EXPORT_SYMBOL(find_inode_rcu); /** - * find_inode_by_rcu - Find an inode in the inode cache + * find_inode_by_ino_rcu - Find an inode in the inode cache * @sb: Super block of file system to search * @ino: The inode number to match * @@ -1743,24 +1744,26 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, int generic_update_time(struct inode *inode, struct timespec64 *time, int flags) { - int iflags = I_DIRTY_TIME; - bool dirty = false; - - if (flags & S_ATIME) - inode->i_atime = *time; - if (flags & S_VERSION) - dirty = inode_maybe_inc_iversion(inode, false); - if (flags & S_CTIME) - inode->i_ctime = *time; - if (flags & S_MTIME) - inode->i_mtime = *time; - if ((flags & (S_ATIME | S_CTIME | S_MTIME)) && - !(inode->i_sb->s_flags & SB_LAZYTIME)) - dirty = true; - - if (dirty) - iflags |= I_DIRTY_SYNC; - __mark_inode_dirty(inode, iflags); + int dirty_flags = 0; + + if (flags & (S_ATIME | S_CTIME | S_MTIME)) { + if (flags & S_ATIME) + inode->i_atime = *time; + if (flags & S_CTIME) + inode->i_ctime = *time; + if (flags & S_MTIME) + inode->i_mtime = *time; + + if (inode->i_sb->s_flags & SB_LAZYTIME) + dirty_flags |= I_DIRTY_TIME; + else + dirty_flags |= I_DIRTY_SYNC; + } + + if ((flags & S_VERSION) && inode_maybe_inc_iversion(inode, false)) + dirty_flags |= I_DIRTY_SYNC; + + __mark_inode_dirty(inode, dirty_flags); return 0; } EXPORT_SYMBOL(generic_update_time); @@ -1777,7 +1780,7 @@ static int update_time(struct inode *inode, struct timespec64 *time, int flags) } /** - * touch_atime - update the access time + * atime_needs_update - update the access time * @path: the &struct path to update * @inode: inode to update * @@ -1796,7 +1799,7 @@ bool atime_needs_update(const struct path *path, struct inode *inode) /* Atime updates will likely cause i_uid and i_gid to be written * back improprely if their true value is unknown to the vfs. */ - if (HAS_UNMAPPED_ID(inode)) + if (HAS_UNMAPPED_ID(mnt_user_ns(mnt), inode)) return false; if (IS_NOATIME(inode)) @@ -1903,7 +1906,8 @@ int dentry_needs_remove_privs(struct dentry *dentry) return mask; } -static int __remove_privs(struct dentry *dentry, int kill) +static int __remove_privs(struct user_namespace *mnt_userns, + struct dentry *dentry, int kill) { struct iattr newattrs; @@ -1912,7 +1916,7 @@ static int __remove_privs(struct dentry *dentry, int kill) * Note we call this on write, so notify_change will not * encounter any conflicting delegations: */ - return notify_change(dentry, &newattrs, NULL); + return notify_change(mnt_userns, dentry, &newattrs, NULL); } /* @@ -1939,7 +1943,7 @@ int file_remove_privs(struct file *file) if (kill < 0) return kill; if (kill) - error = __remove_privs(dentry, kill); + error = __remove_privs(file_mnt_user_ns(file), dentry, kill); if (!error) inode_has_no_xattr(inode); @@ -2130,14 +2134,21 @@ EXPORT_SYMBOL(init_special_inode); /** * inode_init_owner - Init uid,gid,mode for new inode according to posix standards + * @mnt_userns: User namespace of the mount the inode was created from * @inode: New inode * @dir: Directory inode * @mode: mode of the new inode + * + * If the inode has been created through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions + * and initializing i_uid and i_gid. On non-idmapped mounts or if permission + * checking is to be performed on the raw inode simply passs init_user_ns. */ -void inode_init_owner(struct inode *inode, const struct inode *dir, - umode_t mode) +void inode_init_owner(struct user_namespace *mnt_userns, struct inode *inode, + const struct inode *dir, umode_t mode) { - inode->i_uid = current_fsuid(); + inode->i_uid = fsuid_into_mnt(mnt_userns); if (dir && dir->i_mode & S_ISGID) { inode->i_gid = dir->i_gid; @@ -2145,31 +2156,41 @@ void inode_init_owner(struct inode *inode, const struct inode *dir, if (S_ISDIR(mode)) mode |= S_ISGID; else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) && - !in_group_p(inode->i_gid) && - !capable_wrt_inode_uidgid(dir, CAP_FSETID)) + !in_group_p(i_gid_into_mnt(mnt_userns, dir)) && + !capable_wrt_inode_uidgid(mnt_userns, dir, CAP_FSETID)) mode &= ~S_ISGID; } else - inode->i_gid = current_fsgid(); + inode->i_gid = fsgid_into_mnt(mnt_userns); inode->i_mode = mode; } EXPORT_SYMBOL(inode_init_owner); /** * inode_owner_or_capable - check current task permissions to inode + * @mnt_userns: user namespace of the mount the inode was found from * @inode: inode being checked * * Return true if current either has CAP_FOWNER in a namespace with the * inode owner uid mapped, or owns the file. + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply passs init_user_ns. */ -bool inode_owner_or_capable(const struct inode *inode) +bool inode_owner_or_capable(struct user_namespace *mnt_userns, + const struct inode *inode) { + kuid_t i_uid; struct user_namespace *ns; - if (uid_eq(current_fsuid(), inode->i_uid)) + i_uid = i_uid_into_mnt(mnt_userns, inode); + if (uid_eq(current_fsuid(), i_uid)) return true; ns = current_user_ns(); - if (kuid_has_mapping(ns, inode->i_uid) && ns_capable(ns, CAP_FOWNER)) + if (kuid_has_mapping(ns, i_uid) && ns_capable(ns, CAP_FOWNER)) return true; return false; } diff --git a/fs/internal.h b/fs/internal.h index 49bfb3750b22..6aeae7ef3380 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -74,7 +74,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); long do_rmdir(int dfd, struct filename *name); long do_unlinkat(int dfd, struct filename *name); -int may_linkat(struct path *link); +int may_linkat(struct user_namespace *mnt_userns, struct path *link); int do_renameat2(int olddfd, struct filename *oldname, int newdfd, struct filename *newname, unsigned int flags); diff --git a/fs/io-wq.c b/fs/io-wq.c index c36bbcd823ce..44e20248805a 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -13,13 +13,9 @@ #include <linux/sched/mm.h> #include <linux/percpu.h> #include <linux/slab.h> -#include <linux/kthread.h> #include <linux/rculist_nulls.h> -#include <linux/fs_struct.h> -#include <linux/task_work.h> -#include <linux/blk-cgroup.h> -#include <linux/audit.h> #include <linux/cpu.h> +#include <linux/tracehook.h> #include "../kernel/sched/sched.h" #include "io-wq.h" @@ -36,7 +32,6 @@ enum { enum { IO_WQ_BIT_EXIT = 0, /* wq exiting */ - IO_WQ_BIT_ERROR = 1, /* error on setup */ }; enum { @@ -57,14 +52,12 @@ struct io_worker { struct io_wq_work *cur_work; spinlock_t lock; - struct rcu_head rcu; - struct mm_struct *mm; -#ifdef CONFIG_BLK_CGROUP - struct cgroup_subsys_state *blkcg_css; -#endif const struct cred *cur_creds; const struct cred *saved_creds; - struct nsproxy *restore_nsproxy; + + struct completion ref_done; + + struct rcu_head rcu; }; #if BITS_PER_LONG == 64 @@ -93,7 +86,6 @@ struct io_wqe { struct { raw_spinlock_t lock; struct io_wq_work_list work_list; - unsigned long hash_map; unsigned flags; } ____cacheline_aligned_in_smp; @@ -103,6 +95,8 @@ struct io_wqe { struct hlist_nulls_head free_list; struct list_head all_list; + struct wait_queue_entry wait; + struct io_wq *wq; struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS]; }; @@ -119,12 +113,15 @@ struct io_wq { struct task_struct *manager; struct user_struct *user; + + struct io_wq_hash *hash; + refcount_t refs; struct completion done; struct hlist_node cpuhp_node; - refcount_t use_refs; + pid_t task_pid; }; static enum cpuhp_state io_wq_online; @@ -137,62 +134,7 @@ static bool io_worker_get(struct io_worker *worker) static void io_worker_release(struct io_worker *worker) { if (refcount_dec_and_test(&worker->ref)) - wake_up_process(worker->task); -} - -/* - * Note: drops the wqe->lock if returning true! The caller must re-acquire - * the lock in that case. Some callers need to restart handling if this - * happens, so we can't just re-acquire the lock on behalf of the caller. - */ -static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) -{ - bool dropped_lock = false; - - if (worker->saved_creds) { - revert_creds(worker->saved_creds); - worker->cur_creds = worker->saved_creds = NULL; - } - - if (current->files) { - __acquire(&wqe->lock); - raw_spin_unlock_irq(&wqe->lock); - dropped_lock = true; - - task_lock(current); - current->files = NULL; - current->nsproxy = worker->restore_nsproxy; - task_unlock(current); - } - - if (current->fs) - current->fs = NULL; - - /* - * If we have an active mm, we need to drop the wq lock before unusing - * it. If we do, return true and let the caller retry the idle loop. - */ - if (worker->mm) { - if (!dropped_lock) { - __acquire(&wqe->lock); - raw_spin_unlock_irq(&wqe->lock); - dropped_lock = true; - } - __set_current_state(TASK_RUNNING); - kthread_unuse_mm(worker->mm); - mmput(worker->mm); - worker->mm = NULL; - } - -#ifdef CONFIG_BLK_CGROUP - if (worker->blkcg_css) { - kthread_associate_blkcg(NULL); - worker->blkcg_css = NULL; - } -#endif - if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY) - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - return dropped_lock; + complete(&worker->ref_done); } static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe, @@ -204,9 +146,10 @@ static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe, return &wqe->acct[IO_WQ_ACCT_BOUND]; } -static inline struct io_wqe_acct *io_wqe_get_acct(struct io_wqe *wqe, - struct io_worker *worker) +static inline struct io_wqe_acct *io_wqe_get_acct(struct io_worker *worker) { + struct io_wqe *wqe = worker->wqe; + if (worker->flags & IO_WORKER_F_BOUND) return &wqe->acct[IO_WQ_ACCT_BOUND]; @@ -216,39 +159,36 @@ static inline struct io_wqe_acct *io_wqe_get_acct(struct io_wqe *wqe, static void io_worker_exit(struct io_worker *worker) { struct io_wqe *wqe = worker->wqe; - struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker); + struct io_wqe_acct *acct = io_wqe_get_acct(worker); + unsigned flags; - /* - * If we're not at zero, someone else is holding a brief reference - * to the worker. Wait for that to go away. - */ - set_current_state(TASK_INTERRUPTIBLE); - if (!refcount_dec_and_test(&worker->ref)) - schedule(); - __set_current_state(TASK_RUNNING); + if (refcount_dec_and_test(&worker->ref)) + complete(&worker->ref_done); + wait_for_completion(&worker->ref_done); preempt_disable(); current->flags &= ~PF_IO_WORKER; - if (worker->flags & IO_WORKER_F_RUNNING) + flags = worker->flags; + worker->flags = 0; + if (flags & IO_WORKER_F_RUNNING) atomic_dec(&acct->nr_running); - if (!(worker->flags & IO_WORKER_F_BOUND)) - atomic_dec(&wqe->wq->user->processes); worker->flags = 0; preempt_enable(); + if (worker->saved_creds) { + revert_creds(worker->saved_creds); + worker->cur_creds = worker->saved_creds = NULL; + } + raw_spin_lock_irq(&wqe->lock); - hlist_nulls_del_rcu(&worker->nulls_node); + if (flags & IO_WORKER_F_FREE) + hlist_nulls_del_rcu(&worker->nulls_node); list_del_rcu(&worker->all_list); - if (__io_worker_unuse(wqe, worker)) { - __release(&wqe->lock); - raw_spin_lock_irq(&wqe->lock); - } acct->nr_workers--; raw_spin_unlock_irq(&wqe->lock); kfree_rcu(worker, rcu); - if (refcount_dec_and_test(&wqe->wq->refs)) - complete(&wqe->wq->done); + io_wq_put(wqe->wq); } static inline bool io_wqe_run_queue(struct io_wqe *wqe) @@ -306,33 +246,27 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) wake_up_process(wqe->wq->manager); } -static void io_wqe_inc_running(struct io_wqe *wqe, struct io_worker *worker) +static void io_wqe_inc_running(struct io_worker *worker) { - struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker); + struct io_wqe_acct *acct = io_wqe_get_acct(worker); atomic_inc(&acct->nr_running); } -static void io_wqe_dec_running(struct io_wqe *wqe, struct io_worker *worker) +static void io_wqe_dec_running(struct io_worker *worker) __must_hold(wqe->lock) { - struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker); + struct io_wqe_acct *acct = io_wqe_get_acct(worker); + struct io_wqe *wqe = worker->wqe; if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) io_wqe_wake_worker(wqe, acct); } -static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker) +static void io_worker_start(struct io_worker *worker) { - allow_kernel_signal(SIGINT); - - current->flags |= PF_IO_WORKER; - current->fs = NULL; - current->files = NULL; - worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); - worker->restore_nsproxy = current->nsproxy; - io_wqe_inc_running(wqe, worker); + io_wqe_inc_running(worker); } /* @@ -357,19 +291,17 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker, worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0; work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0; if (worker_bound != work_bound) { - io_wqe_dec_running(wqe, worker); + io_wqe_dec_running(worker); if (work_bound) { worker->flags |= IO_WORKER_F_BOUND; wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--; wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++; - atomic_dec(&wqe->wq->user->processes); } else { worker->flags &= ~IO_WORKER_F_BOUND; wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++; wqe->acct[IO_WQ_ACCT_BOUND].nr_workers--; - atomic_inc(&wqe->wq->user->processes); } - io_wqe_inc_running(wqe, worker); + io_wqe_inc_running(worker); } } @@ -380,15 +312,17 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker, * retry the loop in that case (we changed task state), we don't regrab * the lock if we return success. */ -static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker) +static void __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker) __must_hold(wqe->lock) { if (!(worker->flags & IO_WORKER_F_FREE)) { worker->flags |= IO_WORKER_F_FREE; hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); } - - return __io_worker_unuse(wqe, worker); + if (worker->saved_creds) { + revert_creds(worker->saved_creds); + worker->cur_creds = worker->saved_creds = NULL; + } } static inline unsigned int io_get_work_hash(struct io_wq_work *work) @@ -396,14 +330,31 @@ static inline unsigned int io_get_work_hash(struct io_wq_work *work) return work->flags >> IO_WQ_HASH_SHIFT; } +static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash) +{ + struct io_wq *wq = wqe->wq; + + spin_lock(&wq->hash->wait.lock); + if (list_empty(&wqe->wait.entry)) { + __add_wait_queue(&wq->hash->wait, &wqe->wait); + if (!test_bit(hash, &wq->hash->map)) { + __set_current_state(TASK_RUNNING); + list_del_init(&wqe->wait.entry); + } + } + spin_unlock(&wq->hash->wait.lock); +} + static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) __must_hold(wqe->lock) { struct io_wq_work_node *node, *prev; struct io_wq_work *work, *tail; - unsigned int hash; + unsigned int stall_hash = -1U; wq_list_for_each(node, prev, &wqe->work_list) { + unsigned int hash; + work = container_of(node, struct io_wq_work, list); /* not hashed, can run anytime */ @@ -412,111 +363,60 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) return work; } - /* hashed, can run if not already running */ hash = io_get_work_hash(work); - if (!(wqe->hash_map & BIT(hash))) { - wqe->hash_map |= BIT(hash); - /* all items with this hash lie in [work, tail] */ - tail = wqe->hash_tail[hash]; + /* all items with this hash lie in [work, tail] */ + tail = wqe->hash_tail[hash]; + + /* hashed, can run if not already running */ + if (!test_and_set_bit(hash, &wqe->wq->hash->map)) { wqe->hash_tail[hash] = NULL; wq_list_cut(&wqe->work_list, &tail->list, prev); return work; } + if (stall_hash == -1U) + stall_hash = hash; + /* fast forward to a next hash, for-each will fix up @prev */ + node = &tail->list; } - return NULL; -} - -static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work) -{ - if (worker->mm) { - kthread_unuse_mm(worker->mm); - mmput(worker->mm); - worker->mm = NULL; + if (stall_hash != -1U) { + raw_spin_unlock(&wqe->lock); + io_wait_on_hash(wqe, stall_hash); + raw_spin_lock(&wqe->lock); } - if (mmget_not_zero(work->identity->mm)) { - kthread_use_mm(work->identity->mm); - worker->mm = work->identity->mm; - return; - } - - /* failed grabbing mm, ensure work gets cancelled */ - work->flags |= IO_WQ_WORK_CANCEL; + return NULL; } -static inline void io_wq_switch_blkcg(struct io_worker *worker, - struct io_wq_work *work) +static void io_flush_signals(void) { -#ifdef CONFIG_BLK_CGROUP - if (!(work->flags & IO_WQ_WORK_BLKCG)) - return; - if (work->identity->blkcg_css != worker->blkcg_css) { - kthread_associate_blkcg(work->identity->blkcg_css); - worker->blkcg_css = work->identity->blkcg_css; + if (unlikely(test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))) { + if (current->task_works) + task_work_run(); + clear_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL); } -#endif } static void io_wq_switch_creds(struct io_worker *worker, struct io_wq_work *work) { - const struct cred *old_creds = override_creds(work->identity->creds); + const struct cred *old_creds = override_creds(work->creds); - worker->cur_creds = work->identity->creds; + worker->cur_creds = work->creds; if (worker->saved_creds) put_cred(old_creds); /* creds set by previous switch */ else worker->saved_creds = old_creds; } -static void io_impersonate_work(struct io_worker *worker, - struct io_wq_work *work) -{ - if ((work->flags & IO_WQ_WORK_FILES) && - current->files != work->identity->files) { - task_lock(current); - current->files = work->identity->files; - current->nsproxy = work->identity->nsproxy; - task_unlock(current); - if (!work->identity->files) { - /* failed grabbing files, ensure work gets cancelled */ - work->flags |= IO_WQ_WORK_CANCEL; - } - } - if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->identity->fs) - current->fs = work->identity->fs; - if ((work->flags & IO_WQ_WORK_MM) && work->identity->mm != worker->mm) - io_wq_switch_mm(worker, work); - if ((work->flags & IO_WQ_WORK_CREDS) && - worker->cur_creds != work->identity->creds) - io_wq_switch_creds(worker, work); - if (work->flags & IO_WQ_WORK_FSIZE) - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->identity->fsize; - else if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY) - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - io_wq_switch_blkcg(worker, work); -#ifdef CONFIG_AUDIT - current->loginuid = work->identity->loginuid; - current->sessionid = work->identity->sessionid; -#endif -} - static void io_assign_current_work(struct io_worker *worker, struct io_wq_work *work) { if (work) { - /* flush pending signals before assigning new work */ - if (signal_pending(current)) - flush_signals(current); + io_flush_signals(); cond_resched(); } -#ifdef CONFIG_AUDIT - current->loginuid = KUIDT_INIT(AUDIT_UID_UNSET); - current->sessionid = AUDIT_SID_UNSET; -#endif - spin_lock_irq(&worker->lock); worker->cur_work = work; spin_unlock_irq(&worker->lock); @@ -550,6 +450,7 @@ get_next: if (!work) break; io_assign_current_work(worker, work); + __set_current_state(TASK_RUNNING); /* handle a whole dependent link */ do { @@ -557,7 +458,8 @@ get_next: unsigned int hash = io_get_work_hash(work); next_hashed = wq_next_work(work); - io_impersonate_work(worker, work); + if (work->creds && worker->cur_creds != work->creds) + io_wq_switch_creds(worker, work); wq->do_work(work); io_assign_current_work(worker, NULL); @@ -572,8 +474,10 @@ get_next: io_wqe_enqueue(wqe, linked); if (hash != -1U && !next_hashed) { + clear_bit(hash, &wq->hash->map); + if (wq_has_sleeper(&wq->hash->wait)) + wake_up(&wq->hash->wait); raw_spin_lock_irq(&wqe->lock); - wqe->hash_map &= ~BIT_ULL(hash); wqe->flags &= ~IO_WQE_FLAG_STALLED; /* skip unnecessary unlock-lock wqe->lock */ if (!work) @@ -592,27 +496,23 @@ static int io_wqe_worker(void *data) struct io_wqe *wqe = worker->wqe; struct io_wq *wq = wqe->wq; - io_worker_start(wqe, worker); + io_worker_start(worker); while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { set_current_state(TASK_INTERRUPTIBLE); loop: raw_spin_lock_irq(&wqe->lock); if (io_wqe_run_queue(wqe)) { - __set_current_state(TASK_RUNNING); io_worker_handle_work(worker); goto loop; } - /* drops the lock on success, retry */ - if (__io_worker_idle(wqe, worker)) { - __release(&wqe->lock); - goto loop; - } + __io_worker_idle(wqe, worker); raw_spin_unlock_irq(&wqe->lock); - if (signal_pending(current)) - flush_signals(current); + io_flush_signals(); if (schedule_timeout(WORKER_IDLE_TIMEOUT)) continue; + if (fatal_signal_pending(current)) + break; /* timed out, exit unless we're the fixed worker */ if (test_bit(IO_WQ_BIT_EXIT, &wq->state) || !(worker->flags & IO_WORKER_F_FIXED)) @@ -636,15 +536,16 @@ loop: */ void io_wq_worker_running(struct task_struct *tsk) { - struct io_worker *worker = kthread_data(tsk); - struct io_wqe *wqe = worker->wqe; + struct io_worker *worker = tsk->pf_io_worker; + if (!worker) + return; if (!(worker->flags & IO_WORKER_F_UP)) return; if (worker->flags & IO_WORKER_F_RUNNING) return; worker->flags |= IO_WORKER_F_RUNNING; - io_wqe_inc_running(wqe, worker); + io_wqe_inc_running(worker); } /* @@ -654,9 +555,10 @@ void io_wq_worker_running(struct task_struct *tsk) */ void io_wq_worker_sleeping(struct task_struct *tsk) { - struct io_worker *worker = kthread_data(tsk); - struct io_wqe *wqe = worker->wqe; + struct io_worker *worker = tsk->pf_io_worker; + if (!worker) + return; if (!(worker->flags & IO_WORKER_F_UP)) return; if (!(worker->flags & IO_WORKER_F_RUNNING)) @@ -664,32 +566,27 @@ void io_wq_worker_sleeping(struct task_struct *tsk) worker->flags &= ~IO_WORKER_F_RUNNING; - raw_spin_lock_irq(&wqe->lock); - io_wqe_dec_running(wqe, worker); - raw_spin_unlock_irq(&wqe->lock); + raw_spin_lock_irq(&worker->wqe->lock); + io_wqe_dec_running(worker); + raw_spin_unlock_irq(&worker->wqe->lock); } -static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) +static int task_thread(void *data, int index) { + struct io_worker *worker = data; + struct io_wqe *wqe = worker->wqe; struct io_wqe_acct *acct = &wqe->acct[index]; - struct io_worker *worker; + struct io_wq *wq = wqe->wq; + char buf[TASK_COMM_LEN]; - worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node); - if (!worker) - return false; + sprintf(buf, "iou-wrk-%d", wq->task_pid); + set_task_comm(current, buf); - refcount_set(&worker->ref, 1); - worker->nulls_node.pprev = NULL; - worker->wqe = wqe; - spin_lock_init(&worker->lock); + current->pf_io_worker = worker; + worker->task = current; - worker->task = kthread_create_on_node(io_wqe_worker, worker, wqe->node, - "io_wqe_worker-%d/%d", index, wqe->node); - if (IS_ERR(worker->task)) { - kfree(worker); - return false; - } - kthread_bind_mask(worker->task, cpumask_of_node(wqe->node)); + set_cpus_allowed_ptr(current, cpumask_of_node(wqe->node)); + current->flags |= PF_NO_SETAFFINITY; raw_spin_lock_irq(&wqe->lock); hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); @@ -702,11 +599,63 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) acct->nr_workers++; raw_spin_unlock_irq(&wqe->lock); - if (index == IO_WQ_ACCT_UNBOUND) - atomic_inc(&wq->user->processes); + io_wqe_worker(data); + do_exit(0); +} + +static int task_thread_bound(void *data) +{ + return task_thread(data, IO_WQ_ACCT_BOUND); +} + +static int task_thread_unbound(void *data) +{ + return task_thread(data, IO_WQ_ACCT_UNBOUND); +} + +pid_t io_wq_fork_thread(int (*fn)(void *), void *arg) +{ + unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| + CLONE_IO|SIGCHLD; + struct kernel_clone_args args = { + .flags = ((lower_32_bits(flags) | CLONE_VM | + CLONE_UNTRACED) & ~CSIGNAL), + .exit_signal = (lower_32_bits(flags) & CSIGNAL), + .stack = (unsigned long)fn, + .stack_size = (unsigned long)arg, + }; + + return kernel_clone(&args); +} + +static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) +{ + struct io_worker *worker; + pid_t pid; + + __set_current_state(TASK_RUNNING); + + worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node); + if (!worker) + return false; + + refcount_set(&worker->ref, 1); + worker->nulls_node.pprev = NULL; + worker->wqe = wqe; + spin_lock_init(&worker->lock); + init_completion(&worker->ref_done); refcount_inc(&wq->refs); - wake_up_process(worker->task); + + if (index == IO_WQ_ACCT_BOUND) + pid = io_wq_fork_thread(task_thread_bound, worker); + else + pid = io_wq_fork_thread(task_thread_unbound, worker); + if (pid < 0) { + io_wq_put(wq); + kfree(worker); + return false; + } return true; } @@ -752,93 +701,57 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data) return false; } -/* - * Manager thread. Tasked with creating new workers, if we need them. - */ -static int io_wq_manager(void *data) +static void io_wq_check_workers(struct io_wq *wq) { - struct io_wq *wq = data; int node; - /* create fixed workers */ - refcount_set(&wq->refs, 1); for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; + bool fork_worker[2] = { false, false }; + if (!node_online(node)) continue; - if (create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND)) - continue; - set_bit(IO_WQ_BIT_ERROR, &wq->state); - set_bit(IO_WQ_BIT_EXIT, &wq->state); - goto out; - } - - complete(&wq->done); - - while (!kthread_should_stop()) { - if (current->task_works) - task_work_run(); - - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - bool fork_worker[2] = { false, false }; - - if (!node_online(node)) - continue; - - raw_spin_lock_irq(&wqe->lock); - if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND)) - fork_worker[IO_WQ_ACCT_BOUND] = true; - if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND)) - fork_worker[IO_WQ_ACCT_UNBOUND] = true; - raw_spin_unlock_irq(&wqe->lock); - if (fork_worker[IO_WQ_ACCT_BOUND]) - create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND); - if (fork_worker[IO_WQ_ACCT_UNBOUND]) - create_io_worker(wq, wqe, IO_WQ_ACCT_UNBOUND); - } - set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(HZ); - } - - if (current->task_works) - task_work_run(); -out: - if (refcount_dec_and_test(&wq->refs)) { - complete(&wq->done); - return 0; - } - /* if ERROR is set and we get here, we have workers to wake */ - if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) { - rcu_read_lock(); - for_each_node(node) - io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL); - rcu_read_unlock(); + raw_spin_lock_irq(&wqe->lock); + if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND)) + fork_worker[IO_WQ_ACCT_BOUND] = true; + if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND)) + fork_worker[IO_WQ_ACCT_UNBOUND] = true; + raw_spin_unlock_irq(&wqe->lock); + if (fork_worker[IO_WQ_ACCT_BOUND]) + create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND); + if (fork_worker[IO_WQ_ACCT_UNBOUND]) + create_io_worker(wq, wqe, IO_WQ_ACCT_UNBOUND); } - return 0; } -static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct, - struct io_wq_work *work) +/* + * Manager thread. Tasked with creating new workers, if we need them. + */ +static int io_wq_manager(void *data) { - bool free_worker; - - if (!(work->flags & IO_WQ_WORK_UNBOUND)) - return true; - if (atomic_read(&acct->nr_running)) - return true; + struct io_wq *wq = data; + char buf[TASK_COMM_LEN]; - rcu_read_lock(); - free_worker = !hlist_nulls_empty(&wqe->free_list); - rcu_read_unlock(); - if (free_worker) - return true; + sprintf(buf, "iou-mgr-%d", wq->task_pid); + set_task_comm(current, buf); + current->flags |= PF_IO_WORKER; + wq->manager = current; - if (atomic_read(&wqe->wq->user->processes) >= acct->max_workers && - !(capable(CAP_SYS_RESOURCE) || capable(CAP_SYS_ADMIN))) - return false; + complete(&wq->done); - return true; + do { + set_current_state(TASK_INTERRUPTIBLE); + io_wq_check_workers(wq); + schedule_timeout(HZ); + if (fatal_signal_pending(current)) + set_bit(IO_WQ_BIT_EXIT, &wq->state); + } while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)); + + io_wq_check_workers(wq); + wq->manager = NULL; + io_wq_put(wq); + do_exit(0); } static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe) @@ -872,20 +785,37 @@ append: wq_list_add_after(&work->list, &tail->list, &wqe->work_list); } +static int io_wq_fork_manager(struct io_wq *wq) +{ + int ret; + + if (wq->manager) + return 0; + + clear_bit(IO_WQ_BIT_EXIT, &wq->state); + refcount_inc(&wq->refs); + current->flags |= PF_IO_WORKER; + ret = io_wq_fork_thread(io_wq_manager, wq); + current->flags &= ~PF_IO_WORKER; + if (ret >= 0) { + wait_for_completion(&wq->done); + return 0; + } + + io_wq_put(wq); + return ret; +} + static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { struct io_wqe_acct *acct = io_work_get_acct(wqe, work); int work_flags; unsigned long flags; - /* - * Do early check to see if we need a new unbound worker, and if we do, - * if we're allowed to do so. This isn't 100% accurate as there's a - * gap between this check and incrementing the value, but that's OK. - * It's close enough to not be an issue, fork() has the same delay. - */ - if (unlikely(!io_wq_can_queue(wqe, acct, work))) { - io_run_cancel(work, wqe); + /* Can only happen if manager creation fails after exec */ + if (unlikely(io_wq_fork_manager(wqe->wq))) { + work->flags |= IO_WQ_WORK_CANCEL; + wqe->wq->do_work(work); return; } @@ -939,7 +869,7 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) spin_lock_irqsave(&worker->lock, flags); if (worker->cur_work && match->fn(worker->cur_work, match->data)) { - send_sig(SIGINT, worker->task, 1); + set_notify_signal(worker->task); match->nr_running++; } spin_unlock_irqrestore(&worker->lock, flags); @@ -1043,6 +973,24 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, return IO_WQ_CANCEL_NOTFOUND; } +static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode, + int sync, void *key) +{ + struct io_wqe *wqe = container_of(wait, struct io_wqe, wait); + int ret; + + list_del_init(&wait->entry); + + rcu_read_lock(); + ret = io_wqe_activate_free_worker(wqe); + rcu_read_unlock(); + + if (!ret) + wake_up_process(wqe->wq->manager); + + return 1; +} + struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) { int ret = -ENOMEM, node; @@ -1063,12 +1011,11 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) if (ret) goto err_wqes; + refcount_inc(&data->hash->refs); + wq->hash = data->hash; wq->free_work = data->free_work; wq->do_work = data->do_work; - /* caller must already hold a reference to this */ - wq->user = data->user; - ret = -ENOMEM; for_each_node(node) { struct io_wqe *wqe; @@ -1083,11 +1030,11 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) wqe->node = alloc_node; wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0); - if (wq->user) { - wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers = + wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers = task_rlimit(current, RLIMIT_NPROC); - } atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0); + wqe->wait.func = io_wqe_hash_wake; + INIT_LIST_HEAD(&wqe->wait.entry); wqe->wq = wq; raw_spin_lock_init(&wqe->lock); INIT_WQ_LIST(&wqe->work_list); @@ -1095,23 +1042,16 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) INIT_LIST_HEAD(&wqe->all_list); } + wq->task_pid = current->pid; init_completion(&wq->done); + refcount_set(&wq->refs, 1); - wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager"); - if (!IS_ERR(wq->manager)) { - wake_up_process(wq->manager); - wait_for_completion(&wq->done); - if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) { - ret = -ENOMEM; - goto err; - } - refcount_set(&wq->use_refs, 1); - reinit_completion(&wq->done); + ret = io_wq_fork_manager(wq); + if (!ret) return wq; - } - ret = PTR_ERR(wq->manager); - complete(&wq->done); + io_wq_put(wq); + io_wq_put_hash(data->hash); err: cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); for_each_node(node) @@ -1123,15 +1063,7 @@ err_wq: return ERR_PTR(ret); } -bool io_wq_get(struct io_wq *wq, struct io_wq_data *data) -{ - if (data->free_work != wq->free_work || data->do_work != wq->do_work) - return false; - - return refcount_inc_not_zero(&wq->use_refs); -} - -static void __io_wq_destroy(struct io_wq *wq) +static void io_wq_destroy(struct io_wq *wq) { int node; @@ -1139,30 +1071,31 @@ static void __io_wq_destroy(struct io_wq *wq) set_bit(IO_WQ_BIT_EXIT, &wq->state); if (wq->manager) - kthread_stop(wq->manager); + wake_up_process(wq->manager); rcu_read_lock(); for_each_node(node) io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL); rcu_read_unlock(); - wait_for_completion(&wq->done); + spin_lock_irq(&wq->hash->wait.lock); + for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; - for_each_node(node) - kfree(wq->wqes[node]); + list_del_init(&wqe->wait.entry); + kfree(wqe); + } + spin_unlock_irq(&wq->hash->wait.lock); + io_wq_put_hash(wq->hash); kfree(wq->wqes); kfree(wq); -} -void io_wq_destroy(struct io_wq *wq) -{ - if (refcount_dec_and_test(&wq->use_refs)) - __io_wq_destroy(wq); } -struct task_struct *io_wq_get_task(struct io_wq *wq) +void io_wq_put(struct io_wq *wq) { - return wq->manager; + if (refcount_dec_and_test(&wq->refs)) + io_wq_destroy(wq); } static bool io_wq_worker_affinity(struct io_worker *worker, void *data) diff --git a/fs/io-wq.h b/fs/io-wq.h index 096f1021018e..b6ca12b60c35 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -1,6 +1,7 @@ #ifndef INTERNAL_IO_WQ_H #define INTERNAL_IO_WQ_H +#include <linux/refcount.h> #include <linux/io_uring.h> struct io_wq; @@ -11,13 +12,6 @@ enum { IO_WQ_WORK_UNBOUND = 4, IO_WQ_WORK_CONCURRENT = 16, - IO_WQ_WORK_FILES = 32, - IO_WQ_WORK_FS = 64, - IO_WQ_WORK_MM = 128, - IO_WQ_WORK_CREDS = 256, - IO_WQ_WORK_BLKCG = 512, - IO_WQ_WORK_FSIZE = 1024, - IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */ }; @@ -85,7 +79,7 @@ static inline void wq_list_del(struct io_wq_work_list *list, struct io_wq_work { struct io_wq_work_node list; - struct io_identity *identity; + const struct cred *creds; unsigned flags; }; @@ -100,20 +94,32 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *); typedef void (io_wq_work_fn)(struct io_wq_work *); -struct io_wq_data { - struct user_struct *user; +struct io_wq_hash { + refcount_t refs; + unsigned long map; + struct wait_queue_head wait; +}; +static inline void io_wq_put_hash(struct io_wq_hash *hash) +{ + if (refcount_dec_and_test(&hash->refs)) + kfree(hash); +} + +struct io_wq_data { + struct io_wq_hash *hash; io_wq_work_fn *do_work; free_work_fn *free_work; }; struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data); -bool io_wq_get(struct io_wq *wq, struct io_wq_data *data); -void io_wq_destroy(struct io_wq *wq); +void io_wq_put(struct io_wq *wq); void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); void io_wq_hash_work(struct io_wq_work *work, void *val); +pid_t io_wq_fork_thread(int (*fn)(void *), void *arg); + static inline bool io_wq_is_hashed(struct io_wq_work *work) { return work->flags & IO_WQ_WORK_HASHED; @@ -124,8 +130,6 @@ typedef bool (work_cancel_fn)(struct io_wq_work *, void *); enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, void *data, bool cancel_all); -struct task_struct *io_wq_get_task(struct io_wq *wq); - #if defined(CONFIG_IO_WQ) extern void io_wq_worker_sleeping(struct task_struct *); extern void io_wq_worker_running(struct task_struct *); @@ -140,6 +144,7 @@ static inline void io_wq_worker_running(struct task_struct *tsk) static inline bool io_wq_current_is_worker(void) { - return in_task() && (current->flags & PF_IO_WORKER); + return in_task() && (current->flags & PF_IO_WORKER) && + current->pf_io_worker; } #endif diff --git a/fs/io_uring.c b/fs/io_uring.c index 14ce789927e4..4a088581b0f2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -57,7 +57,6 @@ #include <linux/mman.h> #include <linux/percpu.h> #include <linux/slab.h> -#include <linux/kthread.h> #include <linux/blkdev.h> #include <linux/bvec.h> #include <linux/net.h> @@ -104,6 +103,10 @@ #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) +#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ + IOSQE_IO_HARDLINK | IOSQE_ASYNC | \ + IOSQE_BUFFER_SELECT) + struct io_uring { u32 head ____cacheline_aligned_in_smp; u32 tail ____cacheline_aligned_in_smp; @@ -232,6 +235,7 @@ struct fixed_rsrc_data { struct fixed_rsrc_ref_node *node; struct percpu_ref refs; struct completion done; + bool quiesce; }; struct io_buffer { @@ -249,6 +253,11 @@ struct io_restriction { bool registered; }; +enum { + IO_SQ_THREAD_SHOULD_STOP = 0, + IO_SQ_THREAD_SHOULD_PARK, +}; + struct io_sq_data { refcount_t refs; struct mutex lock; @@ -262,6 +271,13 @@ struct io_sq_data { struct wait_queue_head wait; unsigned sq_thread_idle; + int sq_cpu; + pid_t task_pid; + + unsigned long state; + struct completion startup; + struct completion completion; + struct completion exited; }; #define IO_IOPOLL_BATCH 8 @@ -279,8 +295,14 @@ struct io_comp_state { struct list_head locked_free_list; }; +struct io_submit_link { + struct io_kiocb *head; + struct io_kiocb *last; +}; + struct io_submit_state { struct blk_plug plug; + struct io_submit_link link; /* * io_kiocb alloc cache @@ -312,12 +334,12 @@ struct io_ring_ctx { struct { unsigned int flags; unsigned int compat: 1; - unsigned int limit_mem: 1; unsigned int cq_overflow_flushed: 1; unsigned int drain_next: 1; unsigned int eventfd_async: 1; unsigned int restricted: 1; unsigned int sqo_dead: 1; + unsigned int sqo_exec: 1; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -339,6 +361,9 @@ struct io_ring_ctx { unsigned cached_cq_overflow; unsigned long sq_check_overflow; + /* hashed buffered write serialization */ + struct io_wq_hash *hash_map; + struct list_head defer_list; struct list_head timeout_list; struct list_head cq_overflow_list; @@ -355,22 +380,14 @@ struct io_ring_ctx { struct io_rings *rings; - /* IO offload */ - struct io_wq *io_wq; - /* - * For SQPOLL usage - we hold a reference to the parent task, so we - * have access to the ->files + * For SQPOLL usage */ struct task_struct *sqo_task; /* Only used for accounting purposes */ struct mm_struct *mm_account; -#ifdef CONFIG_BLK_CGROUP - struct cgroup_subsys_state *sqo_blkcg_css; -#endif - struct io_sq_data *sq_data; /* if using sq thread polling */ struct wait_queue_head sqo_sq_wait; @@ -390,13 +407,6 @@ struct io_ring_ctx { struct user_struct *user; - const struct cred *creds; - -#ifdef CONFIG_AUDIT - kuid_t loginuid; - unsigned int sessionid; -#endif - struct completion ref_comp; struct completion sq_thread_comp; @@ -445,6 +455,11 @@ struct io_ring_ctx { struct io_restriction restrictions; + /* exit task_work */ + struct callback_head *exit_task_work; + + struct wait_queue_head hash_wait; + /* Keep this last, we don't need it for the fast path */ struct work_struct exit_work; }; @@ -827,7 +842,6 @@ struct io_op_def { unsigned plug : 1; /* size of async data needed, if any */ unsigned short async_size; - unsigned work_flags; }; static const struct io_op_def io_op_defs[] = { @@ -840,7 +854,6 @@ static const struct io_op_def io_op_defs[] = { .needs_async_data = 1, .plug = 1, .async_size = sizeof(struct io_async_rw), - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_WRITEV] = { .needs_file = 1, @@ -850,12 +863,9 @@ static const struct io_op_def io_op_defs[] = { .needs_async_data = 1, .plug = 1, .async_size = sizeof(struct io_async_rw), - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | - IO_WQ_WORK_FSIZE, }, [IORING_OP_FSYNC] = { .needs_file = 1, - .work_flags = IO_WQ_WORK_BLKCG, }, [IORING_OP_READ_FIXED] = { .needs_file = 1, @@ -863,7 +873,6 @@ static const struct io_op_def io_op_defs[] = { .pollin = 1, .plug = 1, .async_size = sizeof(struct io_async_rw), - .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM, }, [IORING_OP_WRITE_FIXED] = { .needs_file = 1, @@ -872,8 +881,6 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .plug = 1, .async_size = sizeof(struct io_async_rw), - .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE | - IO_WQ_WORK_MM, }, [IORING_OP_POLL_ADD] = { .needs_file = 1, @@ -882,7 +889,6 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_POLL_REMOVE] = {}, [IORING_OP_SYNC_FILE_RANGE] = { .needs_file = 1, - .work_flags = IO_WQ_WORK_BLKCG, }, [IORING_OP_SENDMSG] = { .needs_file = 1, @@ -890,8 +896,6 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .needs_async_data = 1, .async_size = sizeof(struct io_async_msghdr), - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | - IO_WQ_WORK_FS, }, [IORING_OP_RECVMSG] = { .needs_file = 1, @@ -900,29 +904,23 @@ static const struct io_op_def io_op_defs[] = { .buffer_select = 1, .needs_async_data = 1, .async_size = sizeof(struct io_async_msghdr), - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | - IO_WQ_WORK_FS, }, [IORING_OP_TIMEOUT] = { .needs_async_data = 1, .async_size = sizeof(struct io_timeout_data), - .work_flags = IO_WQ_WORK_MM, }, [IORING_OP_TIMEOUT_REMOVE] = { /* used by timeout updates' prep() */ - .work_flags = IO_WQ_WORK_MM, }, [IORING_OP_ACCEPT] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES, }, [IORING_OP_ASYNC_CANCEL] = {}, [IORING_OP_LINK_TIMEOUT] = { .needs_async_data = 1, .async_size = sizeof(struct io_timeout_data), - .work_flags = IO_WQ_WORK_MM, }, [IORING_OP_CONNECT] = { .needs_file = 1, @@ -930,26 +928,14 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .needs_async_data = 1, .async_size = sizeof(struct io_async_connect), - .work_flags = IO_WQ_WORK_MM, }, [IORING_OP_FALLOCATE] = { .needs_file = 1, - .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE, - }, - [IORING_OP_OPENAT] = { - .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG | - IO_WQ_WORK_FS | IO_WQ_WORK_MM, - }, - [IORING_OP_CLOSE] = { - .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG, - }, - [IORING_OP_FILES_UPDATE] = { - .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_MM, - }, - [IORING_OP_STATX] = { - .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_MM | - IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG, }, + [IORING_OP_OPENAT] = {}, + [IORING_OP_CLOSE] = {}, + [IORING_OP_FILES_UPDATE] = {}, + [IORING_OP_STATX] = {}, [IORING_OP_READ] = { .needs_file = 1, .unbound_nonreg_file = 1, @@ -957,7 +943,6 @@ static const struct io_op_def io_op_defs[] = { .buffer_select = 1, .plug = 1, .async_size = sizeof(struct io_async_rw), - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_WRITE] = { .needs_file = 1, @@ -965,42 +950,31 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .plug = 1, .async_size = sizeof(struct io_async_rw), - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | - IO_WQ_WORK_FSIZE, }, [IORING_OP_FADVISE] = { .needs_file = 1, - .work_flags = IO_WQ_WORK_BLKCG, - }, - [IORING_OP_MADVISE] = { - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, + [IORING_OP_MADVISE] = {}, [IORING_OP_SEND] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_RECV] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_OPENAT2] = { - .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_FS | - IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM, }, [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, - .work_flags = IO_WQ_WORK_FILES, }, [IORING_OP_SPLICE] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, - .work_flags = IO_WQ_WORK_BLKCG, }, [IORING_OP_PROVIDE_BUFFERS] = {}, [IORING_OP_REMOVE_BUFFERS] = {}, @@ -1012,24 +986,18 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_SHUTDOWN] = { .needs_file = 1, }, - [IORING_OP_RENAMEAT] = { - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES | - IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG, - }, - [IORING_OP_UNLINKAT] = { - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES | - IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG, - }, + [IORING_OP_RENAMEAT] = {}, + [IORING_OP_UNLINKAT] = {}, }; static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, struct files_struct *files); +static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx); static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node); static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node( struct io_ring_ctx *ctx); -static void init_fixed_file_ref_node(struct io_ring_ctx *ctx, - struct fixed_rsrc_ref_node *ref_node); +static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); static bool io_rw_reissue(struct io_kiocb *req); static void io_cqring_fill_event(struct io_kiocb *req, long res); @@ -1116,161 +1084,18 @@ static bool io_match_task(struct io_kiocb *head, continue; if (req->file && req->file->f_op == &io_uring_fops) return true; - if ((req->work.flags & IO_WQ_WORK_FILES) && - req->work.identity->files == files) + if (req->task->files == files) return true; } return false; } -static void io_sq_thread_drop_mm_files(void) -{ - struct files_struct *files = current->files; - struct mm_struct *mm = current->mm; - - if (mm) { - kthread_unuse_mm(mm); - mmput(mm); - current->mm = NULL; - } - if (files) { - struct nsproxy *nsproxy = current->nsproxy; - - task_lock(current); - current->files = NULL; - current->nsproxy = NULL; - task_unlock(current); - put_files_struct(files); - put_nsproxy(nsproxy); - } -} - -static int __io_sq_thread_acquire_files(struct io_ring_ctx *ctx) -{ - if (!current->files) { - struct files_struct *files; - struct nsproxy *nsproxy; - - task_lock(ctx->sqo_task); - files = ctx->sqo_task->files; - if (!files) { - task_unlock(ctx->sqo_task); - return -EOWNERDEAD; - } - atomic_inc(&files->count); - get_nsproxy(ctx->sqo_task->nsproxy); - nsproxy = ctx->sqo_task->nsproxy; - task_unlock(ctx->sqo_task); - - task_lock(current); - current->files = files; - current->nsproxy = nsproxy; - task_unlock(current); - } - return 0; -} - -static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) -{ - struct mm_struct *mm; - - if (current->mm) - return 0; - - task_lock(ctx->sqo_task); - mm = ctx->sqo_task->mm; - if (unlikely(!mm || !mmget_not_zero(mm))) - mm = NULL; - task_unlock(ctx->sqo_task); - - if (mm) { - kthread_use_mm(mm); - return 0; - } - - return -EFAULT; -} - -static int __io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx, - struct io_kiocb *req) -{ - const struct io_op_def *def = &io_op_defs[req->opcode]; - int ret; - - if (def->work_flags & IO_WQ_WORK_MM) { - ret = __io_sq_thread_acquire_mm(ctx); - if (unlikely(ret)) - return ret; - } - - if (def->needs_file || (def->work_flags & IO_WQ_WORK_FILES)) { - ret = __io_sq_thread_acquire_files(ctx); - if (unlikely(ret)) - return ret; - } - - return 0; -} - -static inline int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx, - struct io_kiocb *req) -{ - if (!(ctx->flags & IORING_SETUP_SQPOLL)) - return 0; - return __io_sq_thread_acquire_mm_files(ctx, req); -} - -static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx, - struct cgroup_subsys_state **cur_css) - -{ -#ifdef CONFIG_BLK_CGROUP - /* puts the old one when swapping */ - if (*cur_css != ctx->sqo_blkcg_css) { - kthread_associate_blkcg(ctx->sqo_blkcg_css); - *cur_css = ctx->sqo_blkcg_css; - } -#endif -} - -static void io_sq_thread_unassociate_blkcg(void) -{ -#ifdef CONFIG_BLK_CGROUP - kthread_associate_blkcg(NULL); -#endif -} - static inline void req_set_fail_links(struct io_kiocb *req) { if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK) req->flags |= REQ_F_FAIL_LINK; } -/* - * None of these are dereferenced, they are simply used to check if any of - * them have changed. If we're under current and check they are still the - * same, we're fine to grab references to them for actual out-of-line use. - */ -static void io_init_identity(struct io_identity *id) -{ - id->files = current->files; - id->mm = current->mm; -#ifdef CONFIG_BLK_CGROUP - rcu_read_lock(); - id->blkcg_css = blkcg_css(); - rcu_read_unlock(); -#endif - id->creds = current_cred(); - id->nsproxy = current->nsproxy; - id->fs = current->fs; - id->fsize = rlimit(RLIMIT_FSIZE); -#ifdef CONFIG_AUDIT - id->loginuid = current->loginuid; - id->sessionid = current->sessionid; -#endif - refcount_set(&id->count, 1); -} - static inline void __io_req_init_async(struct io_kiocb *req) { memset(&req->work, 0, sizeof(req->work)); @@ -1283,17 +1108,10 @@ static inline void __io_req_init_async(struct io_kiocb *req) */ static inline void io_req_init_async(struct io_kiocb *req) { - struct io_uring_task *tctx = current->io_uring; - if (req->flags & REQ_F_WORK_INITIALIZED) return; __io_req_init_async(req); - - /* Grab a ref if this isn't our static identity */ - req->work.identity = tctx->identity; - if (tctx->identity != &tctx->__identity) - refcount_inc(&req->work.identity->count); } static void io_ring_ctx_ref_free(struct percpu_ref *ref) @@ -1378,40 +1196,14 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) return false; } -static void io_put_identity(struct io_uring_task *tctx, struct io_kiocb *req) -{ - if (req->work.identity == &tctx->__identity) - return; - if (refcount_dec_and_test(&req->work.identity->count)) - kfree(req->work.identity); -} - static void io_req_clean_work(struct io_kiocb *req) { if (!(req->flags & REQ_F_WORK_INITIALIZED)) return; - if (req->work.flags & IO_WQ_WORK_MM) - mmdrop(req->work.identity->mm); -#ifdef CONFIG_BLK_CGROUP - if (req->work.flags & IO_WQ_WORK_BLKCG) - css_put(req->work.identity->blkcg_css); -#endif - if (req->work.flags & IO_WQ_WORK_CREDS) - put_cred(req->work.identity->creds); - if (req->work.flags & IO_WQ_WORK_FS) { - struct fs_struct *fs = req->work.identity->fs; - - spin_lock(&req->work.identity->fs->lock); - if (--fs->users) - fs = NULL; - spin_unlock(&req->work.identity->fs->lock); - if (fs) - free_fs_struct(fs); - } - if (req->work.flags & IO_WQ_WORK_FILES) { - put_files_struct(req->work.identity->files); - put_nsproxy(req->work.identity->nsproxy); + if (req->work.creds) { + put_cred(req->work.creds); + req->work.creds = NULL; } if (req->flags & REQ_F_INFLIGHT) { struct io_ring_ctx *ctx = req->ctx; @@ -1427,54 +1219,6 @@ static void io_req_clean_work(struct io_kiocb *req) } req->flags &= ~REQ_F_WORK_INITIALIZED; - req->work.flags &= ~(IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | IO_WQ_WORK_FS | - IO_WQ_WORK_CREDS | IO_WQ_WORK_FILES); - io_put_identity(req->task->io_uring, req); -} - -/* - * Create a private copy of io_identity, since some fields don't match - * the current context. - */ -static bool io_identity_cow(struct io_kiocb *req) -{ - struct io_uring_task *tctx = current->io_uring; - const struct cred *creds = NULL; - struct io_identity *id; - - if (req->work.flags & IO_WQ_WORK_CREDS) - creds = req->work.identity->creds; - - id = kmemdup(req->work.identity, sizeof(*id), GFP_KERNEL); - if (unlikely(!id)) { - req->work.flags |= IO_WQ_WORK_CANCEL; - return false; - } - - /* - * We can safely just re-init the creds we copied Either the field - * matches the current one, or we haven't grabbed it yet. The only - * exception is ->creds, through registered personalities, so handle - * that one separately. - */ - io_init_identity(id); - if (creds) - id->creds = creds; - - /* add one for this request */ - refcount_inc(&id->count); - - /* drop tctx and req identity references, if needed */ - if (tctx->identity != &tctx->__identity && - refcount_dec_and_test(&tctx->identity->count)) - kfree(tctx->identity); - if (req->work.identity != &tctx->__identity && - refcount_dec_and_test(&req->work.identity->count)) - kfree(req->work.identity); - - req->work.identity = id; - tctx->identity = id; - return true; } static void io_req_track_inflight(struct io_kiocb *req) @@ -1491,79 +1235,6 @@ static void io_req_track_inflight(struct io_kiocb *req) } } -static bool io_grab_identity(struct io_kiocb *req) -{ - const struct io_op_def *def = &io_op_defs[req->opcode]; - struct io_identity *id = req->work.identity; - - if (def->work_flags & IO_WQ_WORK_FSIZE) { - if (id->fsize != rlimit(RLIMIT_FSIZE)) - return false; - req->work.flags |= IO_WQ_WORK_FSIZE; - } -#ifdef CONFIG_BLK_CGROUP - if (!(req->work.flags & IO_WQ_WORK_BLKCG) && - (def->work_flags & IO_WQ_WORK_BLKCG)) { - rcu_read_lock(); - if (id->blkcg_css != blkcg_css()) { - rcu_read_unlock(); - return false; - } - /* - * This should be rare, either the cgroup is dying or the task - * is moving cgroups. Just punt to root for the handful of ios. - */ - if (css_tryget_online(id->blkcg_css)) - req->work.flags |= IO_WQ_WORK_BLKCG; - rcu_read_unlock(); - } -#endif - if (!(req->work.flags & IO_WQ_WORK_CREDS)) { - if (id->creds != current_cred()) - return false; - get_cred(id->creds); - req->work.flags |= IO_WQ_WORK_CREDS; - } -#ifdef CONFIG_AUDIT - if (!uid_eq(current->loginuid, id->loginuid) || - current->sessionid != id->sessionid) - return false; -#endif - if (!(req->work.flags & IO_WQ_WORK_FS) && - (def->work_flags & IO_WQ_WORK_FS)) { - if (current->fs != id->fs) - return false; - spin_lock(&id->fs->lock); - if (!id->fs->in_exec) { - id->fs->users++; - req->work.flags |= IO_WQ_WORK_FS; - } else { - req->work.flags |= IO_WQ_WORK_CANCEL; - } - spin_unlock(¤t->fs->lock); - } - if (!(req->work.flags & IO_WQ_WORK_FILES) && - (def->work_flags & IO_WQ_WORK_FILES) && - !(req->flags & REQ_F_NO_FILE_TABLE)) { - if (id->files != current->files || - id->nsproxy != current->nsproxy) - return false; - atomic_inc(&id->files->count); - get_nsproxy(id->nsproxy); - req->work.flags |= IO_WQ_WORK_FILES; - io_req_track_inflight(req); - } - if (!(req->work.flags & IO_WQ_WORK_MM) && - (def->work_flags & IO_WQ_WORK_MM)) { - if (id->mm != current->mm) - return false; - mmgrab(id->mm); - req->work.flags |= IO_WQ_WORK_MM; - } - - return true; -} - static void io_prep_async_work(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -1581,17 +1252,8 @@ static void io_prep_async_work(struct io_kiocb *req) if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; } - - /* if we fail grabbing identity, we must COW, regrab, and retry */ - if (io_grab_identity(req)) - return; - - if (!io_identity_cow(req)) - return; - - /* can't fail at this point */ - if (!io_grab_identity(req)) - WARN_ON(1); + if (!req->work.creds) + req->work.creds = get_current_cred(); } static void io_prep_async_link(struct io_kiocb *req) @@ -1606,10 +1268,14 @@ static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *link = io_prep_linked_timeout(req); + struct io_uring_task *tctx = req->task->io_uring; + + BUG_ON(!tctx); + BUG_ON(!tctx->io_wq); trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, &req->work, req->flags); - io_wq_enqueue(ctx->io_wq, &req->work); + io_wq_enqueue(tctx->io_wq, &req->work); return link; } @@ -2303,11 +1969,14 @@ static int io_req_task_work_add(struct io_kiocb *req) static void io_req_task_work_add_fallback(struct io_kiocb *req, task_work_func_t cb) { - struct task_struct *tsk = io_wq_get_task(req->ctx->io_wq); + struct io_ring_ctx *ctx = req->ctx; + struct callback_head *head; init_task_work(&req->task_work, cb); - task_work_add(tsk, &req->task_work, TWA_NONE); - wake_up_process(tsk); + do { + head = READ_ONCE(ctx->exit_task_work); + req->task_work.next = head; + } while (cmpxchg(&ctx->exit_task_work, head, &req->task_work) != head); } static void __io_req_task_cancel(struct io_kiocb *req, int error) @@ -2329,7 +1998,9 @@ static void io_req_task_cancel(struct callback_head *cb) struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); struct io_ring_ctx *ctx = req->ctx; - __io_req_task_cancel(req, -ECANCELED); + mutex_lock(&ctx->uring_lock); + __io_req_task_cancel(req, req->result); + mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); } @@ -2339,15 +2010,11 @@ static void __io_req_task_submit(struct io_kiocb *req) /* ctx stays valid until unlock, even if we drop all ours ctx->refs */ mutex_lock(&ctx->uring_lock); - if (!ctx->sqo_dead && !(current->flags & PF_EXITING) && - !io_sq_thread_acquire_mm_files(ctx, req)) + if (!ctx->sqo_dead && !(current->flags & PF_EXITING) && !current->in_execve) __io_queue_sqe(req); else __io_req_task_cancel(req, -EFAULT); mutex_unlock(&ctx->uring_lock); - - if (ctx->flags & IORING_SETUP_SQPOLL) - io_sq_thread_drop_mm_files(); } static void io_req_task_submit(struct callback_head *cb) @@ -2364,11 +2031,22 @@ static void io_req_task_queue(struct io_kiocb *req) req->task_work.func = io_req_task_submit; ret = io_req_task_work_add(req); if (unlikely(ret)) { + req->result = -ECANCELED; percpu_ref_get(&req->ctx->refs); io_req_task_work_add_fallback(req, io_req_task_cancel); } } +static void io_req_task_queue_fail(struct io_kiocb *req, int ret) +{ + percpu_ref_get(&req->ctx->refs); + req->result = ret; + req->task_work.func = io_req_task_cancel; + + if (unlikely(io_req_task_work_add(req))) + io_req_task_work_add_fallback(req, io_req_task_cancel); +} + static inline void io_queue_next(struct io_kiocb *req) { struct io_kiocb *nxt = io_req_find_next(req); @@ -2800,18 +2478,22 @@ static bool io_rw_reissue(struct io_kiocb *req) { #ifdef CONFIG_BLOCK umode_t mode = file_inode(req->file)->i_mode; - int ret; if (!S_ISBLK(mode) && !S_ISREG(mode)) return false; if ((req->flags & REQ_F_NOWAIT) || io_wq_current_is_worker()) return false; + /* + * If ref is dying, we might be running poll reap from the exit work. + * Don't attempt to reissue from that path, just let it fail with + * -EAGAIN. + */ + if (percpu_ref_is_dying(&req->ctx->refs)) + return false; lockdep_assert_held(&req->ctx->uring_lock); - ret = io_sq_thread_acquire_mm_files(req->ctx, req); - - if (!ret && io_resubmit_prep(req)) { + if (io_resubmit_prep(req)) { refcount_inc(&req->refs); io_queue_async_work(req); return true; @@ -3467,19 +3149,9 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw) static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - ssize_t ret; - - ret = io_prep_rw(req, sqe); - if (ret) - return ret; - if (unlikely(!(req->file->f_mode & FMODE_READ))) return -EBADF; - - /* either don't need iovec imported or already have it */ - if (!req->async_data) - return 0; - return io_rw_prep_async(req, READ); + return io_prep_rw(req, sqe); } /* @@ -3607,10 +3279,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) ret = io_iter_do_read(req, iter); if (ret == -EIOCBQUEUED) { - /* it's faster to check here then delegate to kfree */ - if (iovec) - kfree(iovec); - return 0; + goto out_free; } else if (ret == -EAGAIN) { /* IOPOLL retry should happen for io-wq threads */ if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) @@ -3631,6 +3300,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) if (ret2) return ret2; + iovec = NULL; rw = req->async_data; /* now use our persistent iterator, if we aren't already */ iter = &rw->iter; @@ -3657,24 +3327,18 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) } while (ret > 0 && ret < io_size); done: kiocb_done(kiocb, ret, issue_flags); +out_free: + /* it's faster to check here then delegate to kfree */ + if (iovec) + kfree(iovec); return 0; } static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - ssize_t ret; - - ret = io_prep_rw(req, sqe); - if (ret) - return ret; - if (unlikely(!(req->file->f_mode & FMODE_WRITE))) return -EBADF; - - /* either don't need iovec imported or already have it */ - if (!req->async_data) - return 0; - return io_rw_prep_async(req, WRITE); + return io_prep_rw(req, sqe); } static int io_write(struct io_kiocb *req, unsigned int issue_flags) @@ -4011,7 +3675,7 @@ static int io_nop(struct io_kiocb *req, unsigned int issue_flags) return 0; } -static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; @@ -4598,13 +4262,10 @@ err: return 0; } -static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; - if (!req->file) - return -EBADF; - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) @@ -4664,11 +4325,21 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req, req->sr_msg.msg_flags, &iomsg->free_iov); } +static int io_sendmsg_prep_async(struct io_kiocb *req) +{ + int ret; + + if (!io_op_defs[req->opcode].needs_async_data) + return 0; + ret = io_sendmsg_copy_hdr(req, req->async_data); + if (!ret) + req->flags |= REQ_F_NEED_CLEANUP; + return ret; +} + static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_async_msghdr *async_msg = req->async_data; struct io_sr_msg *sr = &req->sr_msg; - int ret; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4681,13 +4352,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->ctx->compat) sr->msg_flags |= MSG_CMSG_COMPAT; #endif - - if (!async_msg || !io_op_defs[req->opcode].needs_async_data) - return 0; - ret = io_sendmsg_copy_hdr(req, async_msg); - if (!ret) - req->flags |= REQ_F_NEED_CLEANUP; - return ret; + return 0; } static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) @@ -4881,13 +4546,22 @@ static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req) return io_put_kbuf(req, req->sr_msg.kbuf); } -static int io_recvmsg_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) +static int io_recvmsg_prep_async(struct io_kiocb *req) { - struct io_async_msghdr *async_msg = req->async_data; - struct io_sr_msg *sr = &req->sr_msg; int ret; + if (!io_op_defs[req->opcode].needs_async_data) + return 0; + ret = io_recvmsg_copy_hdr(req, req->async_data); + if (!ret) + req->flags |= REQ_F_NEED_CLEANUP; + return ret; +} + +static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_sr_msg *sr = &req->sr_msg; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4900,13 +4574,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, if (req->ctx->compat) sr->msg_flags |= MSG_CMSG_COMPAT; #endif - - if (!async_msg || !io_op_defs[req->opcode].needs_async_data) - return 0; - ret = io_recvmsg_copy_hdr(req, async_msg); - if (!ret) - req->flags |= REQ_F_NEED_CLEANUP; - return ret; + return 0; } static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) @@ -5059,10 +4727,17 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static int io_connect_prep_async(struct io_kiocb *req) +{ + struct io_async_connect *io = req->async_data; + struct io_connect *conn = &req->connect; + + return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address); +} + static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_connect *conn = &req->connect; - struct io_async_connect *io = req->async_data; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -5071,12 +4746,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); conn->addr_len = READ_ONCE(sqe->addr2); - - if (!io) - return 0; - - return move_addr_to_kernel(conn->addr, conn->addr_len, - &io->address); + return 0; } static int io_connect(struct io_kiocb *req, unsigned int issue_flags) @@ -5121,56 +4791,32 @@ out: return 0; } #else /* !CONFIG_NET */ -static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - return -EOPNOTSUPP; -} - -static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) -{ - return -EOPNOTSUPP; -} - -static int io_send(struct io_kiocb *req, unsigned int issue_flags) -{ - return -EOPNOTSUPP; -} - -static int io_recvmsg_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - return -EOPNOTSUPP; -} - -static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) -{ - return -EOPNOTSUPP; -} - -static int io_recv(struct io_kiocb *req, unsigned int issue_flags) -{ - return -EOPNOTSUPP; -} - -static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - return -EOPNOTSUPP; -} - -static int io_accept(struct io_kiocb *req, unsigned int issue_flags) -{ - return -EOPNOTSUPP; -} - -static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - return -EOPNOTSUPP; -} - -static int io_connect(struct io_kiocb *req, unsigned int issue_flags) -{ - return -EOPNOTSUPP; -} +#define IO_NETOP_FN(op) \ +static int io_##op(struct io_kiocb *req, unsigned int issue_flags) \ +{ \ + return -EOPNOTSUPP; \ +} + +#define IO_NETOP_PREP(op) \ +IO_NETOP_FN(op) \ +static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \ +{ \ + return -EOPNOTSUPP; \ +} \ + +#define IO_NETOP_PREP_ASYNC(op) \ +IO_NETOP_PREP(op) \ +static int io_##op##_prep_async(struct io_kiocb *req) \ +{ \ + return -EOPNOTSUPP; \ +} + +IO_NETOP_PREP_ASYNC(sendmsg); +IO_NETOP_PREP_ASYNC(recvmsg); +IO_NETOP_PREP_ASYNC(connect); +IO_NETOP_PREP(accept); +IO_NETOP_FN(send); +IO_NETOP_FN(recv); #endif /* CONFIG_NET */ struct io_poll_table { @@ -5952,12 +5598,15 @@ static bool io_cancel_cb(struct io_wq_work *work, void *data) return req->user_data == (unsigned long) data; } -static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr) +static int io_async_cancel_one(struct io_uring_task *tctx, void *sqe_addr) { enum io_wq_cancel cancel_ret; int ret = 0; - cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false); + if (!tctx->io_wq) + return -ENOENT; + + cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, sqe_addr, false); switch (cancel_ret) { case IO_WQ_CANCEL_OK: ret = 0; @@ -5980,7 +5629,8 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx, unsigned long flags; int ret; - ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr); + ret = io_async_cancel_one(req->task->io_uring, + (void *) (unsigned long) sqe_addr); if (ret != -ENOENT) { spin_lock_irqsave(&ctx->completion_lock, flags); goto done; @@ -6084,9 +5734,9 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) case IORING_OP_POLL_REMOVE: return io_poll_remove_prep(req, sqe); case IORING_OP_FSYNC: - return io_prep_fsync(req, sqe); + return io_fsync_prep(req, sqe); case IORING_OP_SYNC_FILE_RANGE: - return io_prep_sfr(req, sqe); + return io_sfr_prep(req, sqe); case IORING_OP_SENDMSG: case IORING_OP_SEND: return io_sendmsg_prep(req, sqe); @@ -6144,14 +5794,39 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return-EINVAL; } -static int io_req_defer_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) +static int io_req_prep_async(struct io_kiocb *req) +{ + switch (req->opcode) { + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + case IORING_OP_READ: + return io_rw_prep_async(req, READ); + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: + return io_rw_prep_async(req, WRITE); + case IORING_OP_SENDMSG: + case IORING_OP_SEND: + return io_sendmsg_prep_async(req); + case IORING_OP_RECVMSG: + case IORING_OP_RECV: + return io_recvmsg_prep_async(req); + case IORING_OP_CONNECT: + return io_connect_prep_async(req); + } + return 0; +} + +static int io_req_defer_prep(struct io_kiocb *req) { - if (!sqe) + if (!io_op_defs[req->opcode].needs_async_data) + return 0; + /* some opcodes init it during the inital prep */ + if (req->async_data) return 0; - if (io_alloc_async_data(req)) + if (__io_alloc_async_data(req)) return -EAGAIN; - return io_req_prep(req, sqe); + return io_req_prep_async(req); } static u32 io_get_sequence(struct io_kiocb *req) @@ -6167,7 +5842,7 @@ static u32 io_get_sequence(struct io_kiocb *req) return total_submitted - nr_reqs; } -static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_req_defer(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct io_defer_entry *de; @@ -6184,11 +5859,9 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) return 0; - if (!req->async_data) { - ret = io_req_defer_prep(req, sqe); - if (ret) - return ret; - } + ret = io_req_defer_prep(req); + if (ret) + return ret; io_prep_async_link(req); de = kmalloc(sizeof(*de), GFP_KERNEL); if (!de) @@ -6427,29 +6100,11 @@ static void io_wq_submit_work(struct io_wq_work *work) } while (1); } + /* avoid locking problems by failing it from a clean context */ if (ret) { - struct io_ring_ctx *lock_ctx = NULL; - - if (req->ctx->flags & IORING_SETUP_IOPOLL) - lock_ctx = req->ctx; - - /* - * io_iopoll_complete() does not hold completion_lock to - * complete polled io, so here for polled io, we can not call - * io_req_complete() directly, otherwise there maybe concurrent - * access to cqring, defer_list, etc, which is not safe. Given - * that io_iopoll_complete() is always called under uring_lock, - * so here for polled io, we also get uring_lock to complete - * it. - */ - if (lock_ctx) - mutex_lock(&lock_ctx->uring_lock); - - req_set_fail_links(req); - io_req_complete(req, ret); - - if (lock_ctx) - mutex_unlock(&lock_ctx->uring_lock); + /* io-wq is going to take one down */ + refcount_inc(&req->refs); + io_req_task_queue_fail(req, ret); } } @@ -6564,10 +6219,9 @@ static void __io_queue_sqe(struct io_kiocb *req) const struct cred *old_creds = NULL; int ret; - if ((req->flags & REQ_F_WORK_INITIALIZED) && - (req->work.flags & IO_WQ_WORK_CREDS) && - req->work.identity->creds != current_cred()) - old_creds = override_creds(req->work.identity->creds); + if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds && + req->work.creds != current_cred()) + old_creds = override_creds(req->work.creds); ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); @@ -6607,11 +6261,11 @@ static void __io_queue_sqe(struct io_kiocb *req) io_queue_linked_timeout(linked_timeout); } -static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static void io_queue_sqe(struct io_kiocb *req) { int ret; - ret = io_req_defer(req, sqe); + ret = io_req_defer(req); if (ret) { if (ret != -EIOCBQUEUED) { fail_req: @@ -6620,42 +6274,139 @@ fail_req: io_req_complete(req, ret); } } else if (req->flags & REQ_F_FORCE_ASYNC) { - if (!req->async_data) { - ret = io_req_defer_prep(req, sqe); - if (unlikely(ret)) - goto fail_req; - } + ret = io_req_defer_prep(req); + if (unlikely(ret)) + goto fail_req; io_queue_async_work(req); } else { - if (sqe) { - ret = io_req_prep(req, sqe); - if (unlikely(ret)) - goto fail_req; - } __io_queue_sqe(req); } } -static inline void io_queue_link_head(struct io_kiocb *req) +/* + * Check SQE restrictions (opcode and flags). + * + * Returns 'true' if SQE is allowed, 'false' otherwise. + */ +static inline bool io_check_restriction(struct io_ring_ctx *ctx, + struct io_kiocb *req, + unsigned int sqe_flags) { - if (unlikely(req->flags & REQ_F_FAIL_LINK)) { - io_put_req(req); - io_req_complete(req, -ECANCELED); - } else - io_queue_sqe(req, NULL); + if (!ctx->restricted) + return true; + + if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) + return false; + + if ((sqe_flags & ctx->restrictions.sqe_flags_required) != + ctx->restrictions.sqe_flags_required) + return false; + + if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed | + ctx->restrictions.sqe_flags_required)) + return false; + + return true; } -struct io_submit_link { - struct io_kiocb *head; - struct io_kiocb *last; -}; +static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_submit_state *state; + unsigned int sqe_flags; + int id, ret = 0; + + req->opcode = READ_ONCE(sqe->opcode); + /* same numerical values with corresponding REQ_F_*, safe to copy */ + req->flags = sqe_flags = READ_ONCE(sqe->flags); + req->user_data = READ_ONCE(sqe->user_data); + req->async_data = NULL; + req->file = NULL; + req->ctx = ctx; + req->link = NULL; + req->fixed_rsrc_refs = NULL; + /* one is dropped after submission, the other at completion */ + refcount_set(&req->refs, 2); + req->task = current; + req->result = 0; + + /* enforce forwards compatibility on users */ + if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) { + req->flags = 0; + return -EINVAL; + } + + if (unlikely(req->opcode >= IORING_OP_LAST)) + return -EINVAL; + + if (unlikely(!io_check_restriction(ctx, req, sqe_flags))) + return -EACCES; + + if ((sqe_flags & IOSQE_BUFFER_SELECT) && + !io_op_defs[req->opcode].buffer_select) + return -EOPNOTSUPP; + + id = READ_ONCE(sqe->personality); + if (id) { + __io_req_init_async(req); + req->work.creds = idr_find(&ctx->personality_idr, id); + if (unlikely(!req->work.creds)) + return -EINVAL; + get_cred(req->work.creds); + } + + state = &ctx->submit_state; + + /* + * Plug now if we have more than 1 IO left after this, and the target + * is potentially a read/write to block based storage. + */ + if (!state->plug_started && state->ios_left > 1 && + io_op_defs[req->opcode].plug) { + blk_start_plug(&state->plug); + state->plug_started = true; + } + + if (io_op_defs[req->opcode].needs_file) { + bool fixed = req->flags & REQ_F_FIXED_FILE; + + req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed); + if (unlikely(!req->file)) + ret = -EBADF; + } + + state->ios_left--; + return ret; +} -static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_submit_link *link) +static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, + const struct io_uring_sqe *sqe) { - struct io_ring_ctx *ctx = req->ctx; + struct io_submit_link *link = &ctx->submit_state.link; int ret; + ret = io_init_req(ctx, req, sqe); + if (unlikely(ret)) { +fail_req: + io_put_req(req); + io_req_complete(req, ret); + if (link->head) { + /* fail even hard links since we don't submit */ + link->head->flags |= REQ_F_FAIL_LINK; + io_put_req(link->head); + io_req_complete(link->head, -ECANCELED); + link->head = NULL; + } + return ret; + } + ret = io_req_prep(req, sqe); + if (unlikely(ret)) + goto fail_req; + + /* don't need @sqe from now on */ + trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, + true, ctx->flags & IORING_SETUP_SQPOLL); + /* * If we already have a head request, queue this one for async * submittal once the head completes. If we don't have a head but @@ -6677,19 +6428,16 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, head->flags |= REQ_F_IO_DRAIN; ctx->drain_next = 1; } - ret = io_req_defer_prep(req, sqe); - if (unlikely(ret)) { - /* fail even hard links since we don't submit */ - head->flags |= REQ_F_FAIL_LINK; - return ret; - } + ret = io_req_defer_prep(req); + if (unlikely(ret)) + goto fail_req; trace_io_uring_link(ctx, req, head); link->last->link = req; link->last = req; /* last request of a link, enqueue the link */ if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { - io_queue_link_head(head); + io_queue_sqe(head); link->head = NULL; } } else { @@ -6698,13 +6446,10 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ctx->drain_next = 0; } if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { - ret = io_req_defer_prep(req, sqe); - if (unlikely(ret)) - req->flags |= REQ_F_FAIL_LINK; link->head = req; link->last = req; } else { - io_queue_sqe(req, sqe); + io_queue_sqe(req); } } @@ -6717,6 +6462,8 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, static void io_submit_state_end(struct io_submit_state *state, struct io_ring_ctx *ctx) { + if (state->link.head) + io_queue_sqe(state->link.head); if (state->comp.nr) io_submit_flush_completions(&state->comp, ctx); if (state->plug_started) @@ -6732,6 +6479,8 @@ static void io_submit_state_start(struct io_submit_state *state, { state->plug_started = false; state->ios_left = max_ios; + /* set only head, no need to init link_last in advance */ + state->link.head = NULL; } static void io_commit_sqring(struct io_ring_ctx *ctx) @@ -6777,117 +6526,9 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) return NULL; } -/* - * Check SQE restrictions (opcode and flags). - * - * Returns 'true' if SQE is allowed, 'false' otherwise. - */ -static inline bool io_check_restriction(struct io_ring_ctx *ctx, - struct io_kiocb *req, - unsigned int sqe_flags) -{ - if (!ctx->restricted) - return true; - - if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) - return false; - - if ((sqe_flags & ctx->restrictions.sqe_flags_required) != - ctx->restrictions.sqe_flags_required) - return false; - - if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed | - ctx->restrictions.sqe_flags_required)) - return false; - - return true; -} - -#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ - IOSQE_IO_HARDLINK | IOSQE_ASYNC | \ - IOSQE_BUFFER_SELECT) - -static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_submit_state *state; - unsigned int sqe_flags; - int id, ret = 0; - - req->opcode = READ_ONCE(sqe->opcode); - /* same numerical values with corresponding REQ_F_*, safe to copy */ - req->flags = sqe_flags = READ_ONCE(sqe->flags); - req->user_data = READ_ONCE(sqe->user_data); - req->async_data = NULL; - req->file = NULL; - req->ctx = ctx; - req->link = NULL; - req->fixed_rsrc_refs = NULL; - /* one is dropped after submission, the other at completion */ - refcount_set(&req->refs, 2); - req->task = current; - req->result = 0; - - /* enforce forwards compatibility on users */ - if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) - return -EINVAL; - - if (unlikely(req->opcode >= IORING_OP_LAST)) - return -EINVAL; - - if (unlikely(io_sq_thread_acquire_mm_files(ctx, req))) - return -EFAULT; - - if (unlikely(!io_check_restriction(ctx, req, sqe_flags))) - return -EACCES; - - if ((sqe_flags & IOSQE_BUFFER_SELECT) && - !io_op_defs[req->opcode].buffer_select) - return -EOPNOTSUPP; - - id = READ_ONCE(sqe->personality); - if (id) { - struct io_identity *iod; - - iod = idr_find(&ctx->personality_idr, id); - if (unlikely(!iod)) - return -EINVAL; - refcount_inc(&iod->count); - - __io_req_init_async(req); - get_cred(iod->creds); - req->work.identity = iod; - req->work.flags |= IO_WQ_WORK_CREDS; - } - - state = &ctx->submit_state; - - /* - * Plug now if we have more than 1 IO left after this, and the target - * is potentially a read/write to block based storage. - */ - if (!state->plug_started && state->ios_left > 1 && - io_op_defs[req->opcode].plug) { - blk_start_plug(&state->plug); - state->plug_started = true; - } - - if (io_op_defs[req->opcode].needs_file) { - bool fixed = req->flags & REQ_F_FIXED_FILE; - - req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed); - if (unlikely(!req->file)) - ret = -EBADF; - } - - state->ios_left--; - return ret; -} - static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) { - struct io_submit_link link; - int i, submitted = 0; + int submitted = 0; /* if we have a backlog and couldn't flush it all, return BUSY */ if (test_bit(0, &ctx->sq_check_overflow)) { @@ -6903,14 +6544,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) percpu_counter_add(¤t->io_uring->inflight, nr); refcount_add(nr, ¤t->usage); - io_submit_state_start(&ctx->submit_state, nr); - link.head = NULL; - for (i = 0; i < nr; i++) { + while (submitted < nr) { const struct io_uring_sqe *sqe; struct io_kiocb *req; - int err; req = io_alloc_req(ctx); if (unlikely(!req)) { @@ -6925,20 +6563,8 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) } /* will complete beyond this point, count as submitted */ submitted++; - - err = io_init_req(ctx, req, sqe); - if (unlikely(err)) { -fail_req: - io_put_req(req); - io_req_complete(req, err); + if (io_submit_sqe(ctx, req, sqe)) break; - } - - trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, - true, ctx->flags & IORING_SETUP_SQPOLL); - err = io_submit_sqe(req, sqe, &link); - if (err) - goto fail_req; } if (unlikely(submitted != nr)) { @@ -6950,10 +6576,8 @@ fail_req: percpu_counter_sub(&tctx->inflight, unused); put_task_struct_many(current, unused); } - if (link.head) - io_queue_link_head(link.head); - io_submit_state_end(&ctx->submit_state, ctx); + io_submit_state_end(&ctx->submit_state, ctx); /* Commit SQ ring head once we've consumed and submitted all SQEs */ io_commit_sqring(ctx); @@ -7030,71 +6654,97 @@ static void io_sqd_init_new(struct io_sq_data *sqd) io_sqd_update_thread_idle(sqd); } +static bool io_sq_thread_should_stop(struct io_sq_data *sqd) +{ + return test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); +} + +static bool io_sq_thread_should_park(struct io_sq_data *sqd) +{ + return test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); +} + +static void io_sq_thread_parkme(struct io_sq_data *sqd) +{ + for (;;) { + /* + * TASK_PARKED is a special state; we must serialize against + * possible pending wakeups to avoid store-store collisions on + * task->state. + * + * Such a collision might possibly result in the task state + * changin from TASK_PARKED and us failing the + * wait_task_inactive() in kthread_park(). + */ + set_special_state(TASK_PARKED); + if (!test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) + break; + + /* + * Thread is going to call schedule(), do not preempt it, + * or the caller of kthread_park() may spend more time in + * wait_task_inactive(). + */ + preempt_disable(); + complete(&sqd->completion); + schedule_preempt_disabled(); + preempt_enable(); + } + __set_current_state(TASK_RUNNING); +} + static int io_sq_thread(void *data) { - struct cgroup_subsys_state *cur_css = NULL; - struct files_struct *old_files = current->files; - struct nsproxy *old_nsproxy = current->nsproxy; - const struct cred *old_cred = NULL; struct io_sq_data *sqd = data; struct io_ring_ctx *ctx; unsigned long timeout = 0; + char buf[TASK_COMM_LEN]; DEFINE_WAIT(wait); - task_lock(current); - current->files = NULL; - current->nsproxy = NULL; - task_unlock(current); + sprintf(buf, "iou-sqp-%d", sqd->task_pid); + set_task_comm(current, buf); + sqd->thread = current; + current->pf_io_worker = NULL; + + if (sqd->sq_cpu != -1) + set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu)); + else + set_cpus_allowed_ptr(current, cpu_online_mask); + current->flags |= PF_NO_SETAFFINITY; + + complete(&sqd->completion); + + wait_for_completion(&sqd->startup); - while (!kthread_should_stop()) { + while (!io_sq_thread_should_stop(sqd)) { int ret; bool cap_entries, sqt_spin, needs_sched; /* * Any changes to the sqd lists are synchronized through the - * kthread parking. This synchronizes the thread vs users, + * thread parking. This synchronizes the thread vs users, * the users are synchronized on the sqd->ctx_lock. */ - if (kthread_should_park()) { - kthread_parkme(); - /* - * When sq thread is unparked, in case the previous park operation - * comes from io_put_sq_data(), which means that sq thread is going - * to be stopped, so here needs to have a check. - */ - if (kthread_should_stop()) - break; + if (io_sq_thread_should_park(sqd)) { + io_sq_thread_parkme(sqd); + continue; } - if (unlikely(!list_empty(&sqd->ctx_new_list))) { io_sqd_init_new(sqd); timeout = jiffies + sqd->sq_thread_idle; } - + if (fatal_signal_pending(current)) + break; sqt_spin = false; cap_entries = !list_is_singular(&sqd->ctx_list); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { - if (current->cred != ctx->creds) { - if (old_cred) - revert_creds(old_cred); - old_cred = override_creds(ctx->creds); - } - io_sq_thread_associate_blkcg(ctx, &cur_css); -#ifdef CONFIG_AUDIT - current->loginuid = ctx->loginuid; - current->sessionid = ctx->sessionid; -#endif - ret = __io_sq_thread(ctx, cap_entries); if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list))) sqt_spin = true; - - io_sq_thread_drop_mm_files(); } if (sqt_spin || !time_after(jiffies, timeout)) { io_run_task_work(); - io_sq_thread_drop_mm_files(); cond_resched(); if (sqt_spin) timeout = jiffies + sqd->sq_thread_idle; @@ -7115,7 +6765,7 @@ static int io_sq_thread(void *data) } } - if (needs_sched && !kthread_should_park()) { + if (needs_sched && !io_sq_thread_should_park(sqd)) { list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_set_wakeup_flag(ctx); @@ -7128,22 +6778,25 @@ static int io_sq_thread(void *data) timeout = jiffies + sqd->sq_thread_idle; } - io_run_task_work(); - io_sq_thread_drop_mm_files(); - - if (cur_css) - io_sq_thread_unassociate_blkcg(); - if (old_cred) - revert_creds(old_cred); + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + io_uring_cancel_sqpoll(ctx); - task_lock(current); - current->files = old_files; - current->nsproxy = old_nsproxy; - task_unlock(current); + io_run_task_work(); - kthread_parkme(); + /* + * Clear thread under lock so that concurrent parks work correctly + */ + complete_all(&sqd->completion); + mutex_lock(&sqd->lock); + sqd->thread = NULL; + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + ctx->sqo_exec = 1; + io_ring_set_wakeup_flag(ctx); + } + mutex_unlock(&sqd->lock); - return 0; + complete(&sqd->exited); + do_exit(0); } struct io_wait_queue { @@ -7328,38 +6981,59 @@ static void io_sqe_rsrc_set_node(struct io_ring_ctx *ctx, percpu_ref_get(&rsrc_data->refs); } -static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data, - struct io_ring_ctx *ctx, - struct fixed_rsrc_ref_node *backup_node) +static void io_sqe_rsrc_kill_node(struct io_ring_ctx *ctx, struct fixed_rsrc_data *data) { - struct fixed_rsrc_ref_node *ref_node; - int ret; + struct fixed_rsrc_ref_node *ref_node = NULL; io_rsrc_ref_lock(ctx); ref_node = data->node; + data->node = NULL; io_rsrc_ref_unlock(ctx); if (ref_node) percpu_ref_kill(&ref_node->refs); +} + +static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data, + struct io_ring_ctx *ctx, + void (*rsrc_put)(struct io_ring_ctx *ctx, + struct io_rsrc_put *prsrc)) +{ + struct fixed_rsrc_ref_node *backup_node; + int ret; - percpu_ref_kill(&data->refs); + if (data->quiesce) + return -ENXIO; - /* wait for all refs nodes to complete */ - flush_delayed_work(&ctx->rsrc_put_work); + data->quiesce = true; do { + ret = -ENOMEM; + backup_node = alloc_fixed_rsrc_ref_node(ctx); + if (!backup_node) + break; + backup_node->rsrc_data = data; + backup_node->rsrc_put = rsrc_put; + + io_sqe_rsrc_kill_node(ctx, data); + percpu_ref_kill(&data->refs); + flush_delayed_work(&ctx->rsrc_put_work); + ret = wait_for_completion_interruptible(&data->done); if (!ret) break; + + percpu_ref_resurrect(&data->refs); + io_sqe_rsrc_set_node(ctx, data, backup_node); + backup_node = NULL; + reinit_completion(&data->done); + mutex_unlock(&ctx->uring_lock); ret = io_run_task_work_sig(); - if (ret < 0) { - percpu_ref_resurrect(&data->refs); - reinit_completion(&data->done); - io_sqe_rsrc_set_node(ctx, data, backup_node); - return ret; - } - } while (1); + mutex_lock(&ctx->uring_lock); + } while (ret >= 0); + data->quiesce = false; - destroy_fixed_rsrc_ref_node(backup_node); - return 0; + if (backup_node) + destroy_fixed_rsrc_ref_node(backup_node); + return ret; } static struct fixed_rsrc_data *alloc_fixed_rsrc_data(struct io_ring_ctx *ctx) @@ -7390,18 +7064,17 @@ static void free_fixed_rsrc_data(struct fixed_rsrc_data *data) static int io_sqe_files_unregister(struct io_ring_ctx *ctx) { struct fixed_rsrc_data *data = ctx->file_data; - struct fixed_rsrc_ref_node *backup_node; unsigned nr_tables, i; int ret; - if (!data) + /* + * percpu_ref_is_dying() is to stop parallel files unregister + * Since we possibly drop uring lock later in this function to + * run task work. + */ + if (!data || percpu_ref_is_dying(&data->refs)) return -ENXIO; - backup_node = alloc_fixed_rsrc_ref_node(ctx); - if (!backup_node) - return -ENOMEM; - init_fixed_file_ref_node(ctx, backup_node); - - ret = io_rsrc_ref_quiesce(data, ctx, backup_node); + ret = io_rsrc_ref_quiesce(data, ctx, io_ring_file_put); if (ret) return ret; @@ -7415,20 +7088,74 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) return 0; } +static void io_sq_thread_unpark(struct io_sq_data *sqd) + __releases(&sqd->lock) +{ + if (!sqd->thread) + return; + if (sqd->thread == current) + return; + clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); + wake_up_state(sqd->thread, TASK_PARKED); + mutex_unlock(&sqd->lock); +} + +static bool io_sq_thread_park(struct io_sq_data *sqd) + __acquires(&sqd->lock) +{ + if (sqd->thread == current) + return true; + mutex_lock(&sqd->lock); + if (!sqd->thread) { + mutex_unlock(&sqd->lock); + return false; + } + set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); + wake_up_process(sqd->thread); + wait_for_completion(&sqd->completion); + return true; +} + +static void io_sq_thread_stop(struct io_sq_data *sqd) +{ + if (!sqd->thread) + return; + + set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); + WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)); + wake_up_process(sqd->thread); + wait_for_completion(&sqd->exited); +} + static void io_put_sq_data(struct io_sq_data *sqd) { if (refcount_dec_and_test(&sqd->refs)) { - /* - * The park is a bit of a work-around, without it we get - * warning spews on shutdown with SQPOLL set and affinity - * set to a single CPU. - */ + io_sq_thread_stop(sqd); + kfree(sqd); + } +} + +static void io_sq_thread_finish(struct io_ring_ctx *ctx) +{ + struct io_sq_data *sqd = ctx->sq_data; + + if (sqd) { + complete(&sqd->startup); if (sqd->thread) { - kthread_park(sqd->thread); - kthread_stop(sqd->thread); + wait_for_completion(&ctx->sq_thread_comp); + io_sq_thread_park(sqd); } - kfree(sqd); + mutex_lock(&sqd->ctx_lock); + list_del(&ctx->sqd_list); + io_sqd_update_thread_idle(sqd); + mutex_unlock(&sqd->ctx_lock); + + if (sqd->thread) + io_sq_thread_unpark(sqd); + + io_put_sq_data(sqd); + ctx->sq_data = NULL; } } @@ -7475,68 +7202,12 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p) mutex_init(&sqd->ctx_lock); mutex_init(&sqd->lock); init_waitqueue_head(&sqd->wait); + init_completion(&sqd->startup); + init_completion(&sqd->completion); + init_completion(&sqd->exited); return sqd; } -static void io_sq_thread_unpark(struct io_sq_data *sqd) - __releases(&sqd->lock) -{ - if (!sqd->thread) - return; - kthread_unpark(sqd->thread); - mutex_unlock(&sqd->lock); -} - -static void io_sq_thread_park(struct io_sq_data *sqd) - __acquires(&sqd->lock) -{ - if (!sqd->thread) - return; - mutex_lock(&sqd->lock); - kthread_park(sqd->thread); -} - -static void io_sq_thread_stop(struct io_ring_ctx *ctx) -{ - struct io_sq_data *sqd = ctx->sq_data; - - if (sqd) { - if (sqd->thread) { - /* - * We may arrive here from the error branch in - * io_sq_offload_create() where the kthread is created - * without being waked up, thus wake it up now to make - * sure the wait will complete. - */ - wake_up_process(sqd->thread); - wait_for_completion(&ctx->sq_thread_comp); - - io_sq_thread_park(sqd); - } - - mutex_lock(&sqd->ctx_lock); - list_del(&ctx->sqd_list); - io_sqd_update_thread_idle(sqd); - mutex_unlock(&sqd->ctx_lock); - - if (sqd->thread) - io_sq_thread_unpark(sqd); - - io_put_sq_data(sqd); - ctx->sq_data = NULL; - } -} - -static void io_finish_async(struct io_ring_ctx *ctx) -{ - io_sq_thread_stop(ctx); - - if (ctx->io_wq) { - io_wq_destroy(ctx->io_wq); - ctx->io_wq = NULL; - } -} - #if defined(CONFIG_UNIX) /* * Ensure the UNIX gc is aware of our file set, so we are certain that @@ -7563,7 +7234,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) skb->sk = sk; nr_files = 0; - fpl->user = get_uid(ctx->user); + fpl->user = get_uid(current_user()); for (i = 0; i < nr; i++) { struct file *file = io_file_from_index(ctx, i + offset); @@ -8095,54 +7766,34 @@ static struct io_wq_work *io_free_work(struct io_wq_work *work) return req ? &req->work : NULL; } -static int io_init_wq_offload(struct io_ring_ctx *ctx, - struct io_uring_params *p) +static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx) { + struct io_wq_hash *hash; struct io_wq_data data; - struct fd f; - struct io_ring_ctx *ctx_attach; unsigned int concurrency; - int ret = 0; - - data.user = ctx->user; - data.free_work = io_free_work; - data.do_work = io_wq_submit_work; - - if (!(p->flags & IORING_SETUP_ATTACH_WQ)) { - /* Do QD, or 4 * CPUS, whatever is smallest */ - concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); - ctx->io_wq = io_wq_create(concurrency, &data); - if (IS_ERR(ctx->io_wq)) { - ret = PTR_ERR(ctx->io_wq); - ctx->io_wq = NULL; - } - return ret; + hash = ctx->hash_map; + if (!hash) { + hash = kzalloc(sizeof(*hash), GFP_KERNEL); + if (!hash) + return ERR_PTR(-ENOMEM); + refcount_set(&hash->refs, 1); + init_waitqueue_head(&hash->wait); + ctx->hash_map = hash; } - f = fdget(p->wq_fd); - if (!f.file) - return -EBADF; - - if (f.file->f_op != &io_uring_fops) { - ret = -EINVAL; - goto out_fput; - } + data.hash = hash; + data.free_work = io_free_work; + data.do_work = io_wq_submit_work; - ctx_attach = f.file->private_data; - /* @io_wq is protected by holding the fd */ - if (!io_wq_get(ctx_attach->io_wq, &data)) { - ret = -EINVAL; - goto out_fput; - } + /* Do QD, or 4 * CPUS, whatever is smallest */ + concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); - ctx->io_wq = ctx_attach->io_wq; -out_fput: - fdput(f); - return ret; + return io_wq_create(concurrency, &data); } -static int io_uring_alloc_task_context(struct task_struct *task) +static int io_uring_alloc_task_context(struct task_struct *task, + struct io_ring_ctx *ctx) { struct io_uring_task *tctx; int ret; @@ -8157,13 +7808,19 @@ static int io_uring_alloc_task_context(struct task_struct *task) return ret; } + tctx->io_wq = io_init_wq_offload(ctx); + if (IS_ERR(tctx->io_wq)) { + ret = PTR_ERR(tctx->io_wq); + percpu_counter_destroy(&tctx->inflight); + kfree(tctx); + return ret; + } + xa_init(&tctx->xa); init_waitqueue_head(&tctx->wait); tctx->last = NULL; atomic_set(&tctx->in_idle, 0); tctx->sqpoll = false; - io_init_identity(&tctx->__identity); - tctx->identity = &tctx->__identity; task->io_uring = tctx; spin_lock_init(&tctx->task_lock); INIT_WQ_LIST(&tctx->task_list); @@ -8177,19 +7834,49 @@ void __io_uring_free(struct task_struct *tsk) struct io_uring_task *tctx = tsk->io_uring; WARN_ON_ONCE(!xa_empty(&tctx->xa)); - WARN_ON_ONCE(refcount_read(&tctx->identity->count) != 1); - if (tctx->identity != &tctx->__identity) - kfree(tctx->identity); percpu_counter_destroy(&tctx->inflight); kfree(tctx); tsk->io_uring = NULL; } +static int io_sq_thread_fork(struct io_sq_data *sqd, struct io_ring_ctx *ctx) +{ + int ret; + + clear_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); + reinit_completion(&sqd->completion); + ctx->sqo_dead = ctx->sqo_exec = 0; + sqd->task_pid = current->pid; + current->flags |= PF_IO_WORKER; + ret = io_wq_fork_thread(io_sq_thread, sqd); + current->flags &= ~PF_IO_WORKER; + if (ret < 0) { + sqd->thread = NULL; + return ret; + } + wait_for_completion(&sqd->completion); + return io_uring_alloc_task_context(sqd->thread, ctx); +} + static int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_uring_params *p) { int ret; + /* Retain compatibility with failing for an invalid attach attempt */ + if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) == + IORING_SETUP_ATTACH_WQ) { + struct fd f; + + f = fdget(p->wq_fd); + if (!f.file) + return -ENXIO; + if (f.file->f_op != &io_uring_fops) { + fdput(f); + return -EINVAL; + } + fdput(f); + } if (ctx->flags & IORING_SETUP_SQPOLL) { struct io_sq_data *sqd; @@ -8215,7 +7902,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, ctx->sq_thread_idle = HZ; if (sqd->thread) - goto done; + return 0; if (p->flags & IORING_SETUP_SQ_AFF) { int cpu = p->sq_thread_cpu; @@ -8226,18 +7913,21 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, if (!cpu_online(cpu)) goto err; - sqd->thread = kthread_create_on_cpu(io_sq_thread, sqd, - cpu, "io_uring-sq"); + sqd->sq_cpu = cpu; } else { - sqd->thread = kthread_create(io_sq_thread, sqd, - "io_uring-sq"); + sqd->sq_cpu = -1; } - if (IS_ERR(sqd->thread)) { - ret = PTR_ERR(sqd->thread); + + sqd->task_pid = current->pid; + current->flags |= PF_IO_WORKER; + ret = io_wq_fork_thread(io_sq_thread, sqd); + current->flags &= ~PF_IO_WORKER; + if (ret < 0) { sqd->thread = NULL; goto err; } - ret = io_uring_alloc_task_context(sqd->thread); + wait_for_completion(&sqd->completion); + ret = io_uring_alloc_task_context(sqd->thread, ctx); if (ret) goto err; } else if (p->flags & IORING_SETUP_SQ_AFF) { @@ -8246,14 +7936,9 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, goto err; } -done: - ret = io_init_wq_offload(ctx, p); - if (ret) - goto err; - return 0; err: - io_finish_async(ctx); + io_sq_thread_finish(ctx); return ret; } @@ -8261,8 +7946,8 @@ static void io_sq_offload_start(struct io_ring_ctx *ctx) { struct io_sq_data *sqd = ctx->sq_data; - if ((ctx->flags & IORING_SETUP_SQPOLL) && sqd->thread) - wake_up_process(sqd->thread); + if (ctx->flags & IORING_SETUP_SQPOLL) + complete(&sqd->startup); } static inline void __io_unaccount_mem(struct user_struct *user, @@ -8292,7 +7977,7 @@ static inline int __io_account_mem(struct user_struct *user, static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { - if (ctx->limit_mem) + if (ctx->user) __io_unaccount_mem(ctx->user, nr_pages); if (ctx->mm_account) @@ -8303,7 +7988,7 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { int ret; - if (ctx->limit_mem) { + if (ctx->user) { ret = __io_account_mem(ctx->user, nr_pages); if (ret) return ret; @@ -8702,19 +8387,23 @@ static void io_req_cache_free(struct list_head *list, struct task_struct *tsk) static void io_req_caches_free(struct io_ring_ctx *ctx, struct task_struct *tsk) { struct io_submit_state *submit_state = &ctx->submit_state; + struct io_comp_state *cs = &ctx->submit_state.comp; mutex_lock(&ctx->uring_lock); - if (submit_state->free_reqs) + if (submit_state->free_reqs) { kmem_cache_free_bulk(req_cachep, submit_state->free_reqs, submit_state->reqs); - - io_req_cache_free(&submit_state->comp.free_list, NULL); + submit_state->free_reqs = 0; + } spin_lock_irq(&ctx->completion_lock); - io_req_cache_free(&submit_state->comp.locked_free_list, NULL); + list_splice_init(&cs->locked_free_list, &cs->free_list); + cs->locked_free_nr = 0; spin_unlock_irq(&ctx->completion_lock); + io_req_cache_free(&cs->free_list, NULL); + mutex_unlock(&ctx->uring_lock); } @@ -8728,22 +8417,17 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) mutex_lock(&ctx->uring_lock); mutex_unlock(&ctx->uring_lock); - io_finish_async(ctx); + io_sq_thread_finish(ctx); io_sqe_buffers_unregister(ctx); - if (ctx->sqo_task) { - put_task_struct(ctx->sqo_task); - ctx->sqo_task = NULL; + if (ctx->mm_account) { mmdrop(ctx->mm_account); ctx->mm_account = NULL; } -#ifdef CONFIG_BLK_CGROUP - if (ctx->sqo_blkcg_css) - css_put(ctx->sqo_blkcg_css); -#endif - + mutex_lock(&ctx->uring_lock); io_sqe_files_unregister(ctx); + mutex_unlock(&ctx->uring_lock); io_eventfd_unregister(ctx); io_destroy_buffers(ctx); idr_destroy(&ctx->personality_idr); @@ -8760,8 +8444,9 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) percpu_ref_exit(&ctx->refs); free_uid(ctx->user); - put_cred(ctx->creds); io_req_caches_free(ctx, NULL); + if (ctx->hash_map) + io_wq_put_hash(ctx->hash_map); kfree(ctx->cancel_hash); kfree(ctx); } @@ -8808,13 +8493,11 @@ static int io_uring_fasync(int fd, struct file *file, int on) static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) { - struct io_identity *iod; + const struct cred *creds; - iod = idr_remove(&ctx->personality_idr, id); - if (iod) { - put_cred(iod->creds); - if (refcount_dec_and_test(&iod->count)) - kfree(iod); + creds = idr_remove(&ctx->personality_idr, id); + if (creds) { + put_cred(creds); return 0; } @@ -8829,6 +8512,28 @@ static int io_remove_personalities(int id, void *p, void *data) return 0; } +static void io_run_ctx_fallback(struct io_ring_ctx *ctx) +{ + struct callback_head *work, *head, *next; + + do { + do { + head = NULL; + work = READ_ONCE(ctx->exit_task_work); + } while (cmpxchg(&ctx->exit_task_work, work, head) != work); + + if (!work) + break; + + do { + next = work->next; + work->func(work); + work = next; + cond_resched(); + } while (work); + } while (1); +} + static void io_ring_exit_work(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, @@ -8842,17 +8547,11 @@ static void io_ring_exit_work(struct work_struct *work) */ do { io_uring_try_cancel_requests(ctx, NULL, NULL); + io_run_ctx_fallback(ctx); } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); io_ring_ctx_free(ctx); } -static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - - return req->ctx == data; -} - static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { mutex_lock(&ctx->uring_lock); @@ -8871,9 +8570,6 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) io_kill_timeouts(ctx, NULL, NULL); io_poll_remove_all(ctx, NULL, NULL); - if (ctx->io_wq) - io_wq_cancel_cb(ctx->io_wq, io_cancel_ctx_cb, ctx, true); - /* if we failed setting up the ctx, we might not have any rings */ io_iopoll_try_reap_events(ctx); @@ -8952,13 +8648,14 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct files_struct *files) { struct io_task_cancel cancel = { .task = task, .files = files, }; + struct io_uring_task *tctx = current->io_uring; while (1) { enum io_wq_cancel cret; bool ret = false; - if (ctx->io_wq) { - cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, + if (tctx && tctx->io_wq) { + cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, &cancel, true); ret |= (cret != IO_WQ_CANCEL_NOTFOUND); } @@ -9041,12 +8738,15 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, struct files_struct *files) { struct task_struct *task = current; + bool did_park = false; if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { io_disable_sqo_submit(ctx); - task = ctx->sq_data->thread; - atomic_inc(&task->io_uring->in_idle); - io_sq_thread_park(ctx->sq_data); + did_park = io_sq_thread_park(ctx->sq_data); + if (did_park) { + task = ctx->sq_data->thread; + atomic_inc(&task->io_uring->in_idle); + } } io_cancel_defer_files(ctx, task, files); @@ -9055,7 +8755,7 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, if (!files) io_uring_try_cancel_requests(ctx, task, NULL); - if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { + if (did_park) { atomic_dec(&task->io_uring->in_idle); io_sq_thread_unpark(ctx->sq_data); } @@ -9070,7 +8770,7 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) int ret; if (unlikely(!tctx)) { - ret = io_uring_alloc_task_context(current); + ret = io_uring_alloc_task_context(current, ctx); if (unlikely(ret)) return ret; tctx = current->io_uring; @@ -9140,8 +8840,13 @@ void __io_uring_files_cancel(struct files_struct *files) io_uring_cancel_task_requests(file->private_data, files); atomic_dec(&tctx->in_idle); - if (files) + if (files) { io_uring_remove_task_files(tctx); + if (tctx->io_wq) { + io_wq_put(tctx->io_wq); + tctx->io_wq = NULL; + } + } } static s64 tctx_inflight(struct io_uring_task *tctx) @@ -9151,14 +8856,17 @@ static s64 tctx_inflight(struct io_uring_task *tctx) static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx) { + struct io_sq_data *sqd = ctx->sq_data; struct io_uring_task *tctx; s64 inflight; DEFINE_WAIT(wait); - if (!ctx->sq_data) + if (!sqd) return; - tctx = ctx->sq_data->thread->io_uring; io_disable_sqo_submit(ctx); + if (!io_sq_thread_park(sqd)) + return; + tctx = ctx->sq_data->thread->io_uring; atomic_inc(&tctx->in_idle); do { @@ -9179,6 +8887,7 @@ static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx) finish_wait(&tctx->wait, &wait); } while (1); atomic_dec(&tctx->in_idle); + io_sq_thread_unpark(sqd); } /* @@ -9232,11 +8941,17 @@ static int io_uring_flush(struct file *file, void *data) struct io_uring_task *tctx = current->io_uring; struct io_ring_ctx *ctx = file->private_data; + /* Ignore helper thread files exit */ + if (current->flags & PF_IO_WORKER) + return 0; + if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) { io_uring_cancel_task_requests(ctx, NULL); io_req_caches_free(ctx, current); } + io_run_ctx_fallback(ctx); + if (!tctx) return 0; @@ -9435,6 +9150,12 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (ctx->flags & IORING_SETUP_SQPOLL) { io_cqring_overflow_flush(ctx, false, NULL, NULL); + if (unlikely(ctx->sqo_exec)) { + ret = io_sq_thread_fork(ctx->sq_data, ctx); + if (ret) + goto out; + ctx->sqo_exec = 0; + } ret = -EOWNERDEAD; if (unlikely(ctx->sqo_dead)) goto out; @@ -9491,8 +9212,7 @@ out_fput: #ifdef CONFIG_PROC_FS static int io_uring_show_cred(int id, void *p, void *data) { - struct io_identity *iod = p; - const struct cred *cred = iod->creds; + const struct cred *cred = p; struct seq_file *m = data; struct user_namespace *uns = seq_user_ns(m); struct group_info *gi; @@ -9537,8 +9257,11 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) */ has_lock = mutex_trylock(&ctx->uring_lock); - if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) + if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { sq = ctx->sq_data; + if (!sq->thread) + sq = NULL; + } seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1); seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1); @@ -9698,7 +9421,6 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx) static int io_uring_create(unsigned entries, struct io_uring_params *p, struct io_uring_params __user *params) { - struct user_struct *user = NULL; struct io_ring_ctx *ctx; struct file *file; int ret; @@ -9740,22 +9462,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, p->cq_entries = 2 * p->sq_entries; } - user = get_uid(current_user()); - ctx = io_ring_ctx_alloc(p); - if (!ctx) { - free_uid(user); + if (!ctx) return -ENOMEM; - } ctx->compat = in_compat_syscall(); - ctx->limit_mem = !capable(CAP_IPC_LOCK); - ctx->user = user; - ctx->creds = get_current_cred(); -#ifdef CONFIG_AUDIT - ctx->loginuid = current->loginuid; - ctx->sessionid = current->sessionid; -#endif - ctx->sqo_task = get_task_struct(current); + if (!capable(CAP_IPC_LOCK)) + ctx->user = get_uid(current_user()); + ctx->sqo_task = current; /* * This is just grabbed for accounting purposes. When a process exits, @@ -9766,24 +9479,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, mmgrab(current->mm); ctx->mm_account = current->mm; -#ifdef CONFIG_BLK_CGROUP - /* - * The sq thread will belong to the original cgroup it was inited in. - * If the cgroup goes offline (e.g. disabling the io controller), then - * issued bios will be associated with the closest cgroup later in the - * block layer. - */ - rcu_read_lock(); - ctx->sqo_blkcg_css = blkcg_css(); - ret = css_tryget_online(ctx->sqo_blkcg_css); - rcu_read_unlock(); - if (!ret) { - /* don't init against a dying cgroup, have the user try again */ - ctx->sqo_blkcg_css = NULL; - ret = -ENODEV; - goto err; - } -#endif ret = io_allocate_scq_urings(ctx, p); if (ret) goto err; @@ -9817,7 +9512,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | - IORING_FEAT_EXT_ARG; + IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; @@ -9923,21 +9618,15 @@ out: static int io_register_personality(struct io_ring_ctx *ctx) { - struct io_identity *id; + const struct cred *creds; int ret; - id = kmalloc(sizeof(*id), GFP_KERNEL); - if (unlikely(!id)) - return -ENOMEM; - - io_init_identity(id); - id->creds = get_current_cred(); + creds = get_current_cred(); - ret = idr_alloc_cyclic(&ctx->personality_idr, id, 1, USHRT_MAX, GFP_KERNEL); - if (ret < 0) { - put_cred(id->creds); - kfree(id); - } + ret = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1, + USHRT_MAX, GFP_KERNEL); + if (ret < 0) + put_cred(creds); return ret; } @@ -10196,6 +9885,8 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, ctx = f.file->private_data; + io_run_task_work(); + mutex_lock(&ctx->uring_lock); ret = __io_uring_register(ctx, opcode, arg, nr_args); mutex_unlock(&ctx->uring_lock); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 16a1e82e3aeb..7ffcd7ef33d4 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -278,14 +278,14 @@ iomap_readpage_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (!is_contig || bio_full(ctx->bio, plen)) { gfp_t gfp = mapping_gfp_constraint(page->mapping, GFP_KERNEL); gfp_t orig_gfp = gfp; - int nr_vecs = (length + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE); if (ctx->bio) submit_bio(ctx->bio); if (ctx->rac) /* same as readahead_gfp_mask */ gfp |= __GFP_NORETRY | __GFP_NOWARN; - ctx->bio = bio_alloc(gfp, min(BIO_MAX_PAGES, nr_vecs)); + ctx->bio = bio_alloc(gfp, bio_max_segs(nr_vecs)); /* * If the bio_alloc fails, try it again for a single page to * avoid having to deal with partial page reads. This emulates @@ -1459,13 +1459,6 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) goto redirty; /* - * Given that we do not allow direct reclaim to call us, we should - * never be called in a recursive filesystem reclaim context. - */ - if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS)) - goto redirty; - - /* * Is this page beyond the end of the file? * * The page index is less than the end_index, adjust the end_offset diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c index 107ee80c3568..dab1b02eba5b 100644 --- a/fs/iomap/seek.c +++ b/fs/iomap/seek.c @@ -10,122 +10,17 @@ #include <linux/pagemap.h> #include <linux/pagevec.h> -/* - * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff. - * Returns true if found and updates @lastoff to the offset in file. - */ -static bool -page_seek_hole_data(struct inode *inode, struct page *page, loff_t *lastoff, - int whence) -{ - const struct address_space_operations *ops = inode->i_mapping->a_ops; - unsigned int bsize = i_blocksize(inode), off; - bool seek_data = whence == SEEK_DATA; - loff_t poff = page_offset(page); - - if (WARN_ON_ONCE(*lastoff >= poff + PAGE_SIZE)) - return false; - - if (*lastoff < poff) { - /* - * Last offset smaller than the start of the page means we found - * a hole: - */ - if (whence == SEEK_HOLE) - return true; - *lastoff = poff; - } - - /* - * Just check the page unless we can and should check block ranges: - */ - if (bsize == PAGE_SIZE || !ops->is_partially_uptodate) - return PageUptodate(page) == seek_data; - - lock_page(page); - if (unlikely(page->mapping != inode->i_mapping)) - goto out_unlock_not_found; - - for (off = 0; off < PAGE_SIZE; off += bsize) { - if (offset_in_page(*lastoff) >= off + bsize) - continue; - if (ops->is_partially_uptodate(page, off, bsize) == seek_data) { - unlock_page(page); - return true; - } - *lastoff = poff + off + bsize; - } - -out_unlock_not_found: - unlock_page(page); - return false; -} - -/* - * Seek for SEEK_DATA / SEEK_HOLE in the page cache. - * - * Within unwritten extents, the page cache determines which parts are holes - * and which are data: uptodate buffer heads count as data; everything else - * counts as a hole. - * - * Returns the resulting offset on successs, and -ENOENT otherwise. - */ static loff_t -page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, - int whence) -{ - pgoff_t index = offset >> PAGE_SHIFT; - pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE); - loff_t lastoff = offset; - struct pagevec pvec; - - if (length <= 0) - return -ENOENT; - - pagevec_init(&pvec); - - do { - unsigned nr_pages, i; - - nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index, - end - 1); - if (nr_pages == 0) - break; - - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - - if (page_seek_hole_data(inode, page, &lastoff, whence)) - goto check_range; - lastoff = page_offset(page) + PAGE_SIZE; - } - pagevec_release(&pvec); - } while (index < end); - - /* When no page at lastoff and we are not done, we found a hole. */ - if (whence != SEEK_HOLE) - goto not_found; - -check_range: - if (lastoff < offset + length) - goto out; -not_found: - lastoff = -ENOENT; -out: - pagevec_release(&pvec); - return lastoff; -} - - -static loff_t -iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length, +iomap_seek_hole_actor(struct inode *inode, loff_t start, loff_t length, void *data, struct iomap *iomap, struct iomap *srcmap) { + loff_t offset = start; + switch (iomap->type) { case IOMAP_UNWRITTEN: - offset = page_cache_seek_hole_data(inode, offset, length, - SEEK_HOLE); - if (offset < 0) + offset = mapping_seek_hole_data(inode->i_mapping, start, + start + length, SEEK_HOLE); + if (offset == start + length) return length; fallthrough; case IOMAP_HOLE: @@ -164,15 +59,17 @@ iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops) EXPORT_SYMBOL_GPL(iomap_seek_hole); static loff_t -iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length, +iomap_seek_data_actor(struct inode *inode, loff_t start, loff_t length, void *data, struct iomap *iomap, struct iomap *srcmap) { + loff_t offset = start; + switch (iomap->type) { case IOMAP_HOLE: return length; case IOMAP_UNWRITTEN: - offset = page_cache_seek_hole_data(inode, offset, length, - SEEK_DATA); + offset = mapping_seek_hole_data(inode->i_mapping, start, + start + length, SEEK_DATA); if (offset < 0) return length; fallthrough; diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index f0fe641893a5..b9e6a7ec78be 100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c @@ -152,6 +152,7 @@ static int do_isofs_readdir(struct inode *inode, struct file *file, printk(KERN_NOTICE "iso9660: Corrupted directory entry" " in block %lu of inode %lu\n", block, inode->i_ino); + brelse(bh); return -EIO; } diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index ec90773527ee..21edc423b79f 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -339,6 +339,7 @@ static int parse_options(char *options, struct iso9660_options *popt) { char *p; int option; + unsigned int uv; popt->map = 'n'; popt->rock = 1; @@ -434,17 +435,17 @@ static int parse_options(char *options, struct iso9660_options *popt) case Opt_ignore: break; case Opt_uid: - if (match_int(&args[0], &option)) + if (match_uint(&args[0], &uv)) return 0; - popt->uid = make_kuid(current_user_ns(), option); + popt->uid = make_kuid(current_user_ns(), uv); if (!uid_valid(popt->uid)) return 0; popt->uid_set = 1; break; case Opt_gid: - if (match_int(&args[0], &option)) + if (match_uint(&args[0], &uv)) return 0; - popt->gid = make_kgid(current_user_ns(), option); + popt->gid = make_kgid(current_user_ns(), uv); if (!gid_valid(popt->gid)) return 0; popt->gid_set = 1; diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c index 402769881c32..58f80e1b3ac0 100644 --- a/fs/isofs/namei.c +++ b/fs/isofs/namei.c @@ -102,6 +102,7 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry, printk(KERN_NOTICE "iso9660: Corrupted directory entry" " in block %lu of inode %lu\n", block, dir->i_ino); + brelse(bh); return 0; } diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 093ffbd82395..55a79df70d24 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -226,7 +226,8 @@ static int __jffs2_set_acl(struct inode *inode, int xprefix, struct posix_acl *a return rc; } -int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) +int jffs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { int rc, xprefix; @@ -236,7 +237,8 @@ int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (acl) { umode_t mode; - rc = posix_acl_update_mode(inode, &mode, &acl); + rc = posix_acl_update_mode(&init_user_ns, inode, &mode, + &acl); if (rc) return rc; if (inode->i_mode != mode) { diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index 12d0271bdde3..62c50da9d493 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -28,7 +28,8 @@ struct jffs2_acl_header { #ifdef CONFIG_JFFS2_FS_POSIX_ACL struct posix_acl *jffs2_get_acl(struct inode *inode, int type); -int jffs2_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int jffs2_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); extern int jffs2_init_acl_pre(struct inode *, struct inode *, umode_t *); extern int jffs2_init_acl_post(struct inode *); diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 776493713153..c0aabbcbfd58 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -24,18 +24,21 @@ static int jffs2_readdir (struct file *, struct dir_context *); -static int jffs2_create (struct inode *,struct dentry *,umode_t, - bool); +static int jffs2_create (struct user_namespace *, struct inode *, + struct dentry *, umode_t, bool); static struct dentry *jffs2_lookup (struct inode *,struct dentry *, unsigned int); static int jffs2_link (struct dentry *,struct inode *,struct dentry *); static int jffs2_unlink (struct inode *,struct dentry *); -static int jffs2_symlink (struct inode *,struct dentry *,const char *); -static int jffs2_mkdir (struct inode *,struct dentry *,umode_t); +static int jffs2_symlink (struct user_namespace *, struct inode *, + struct dentry *, const char *); +static int jffs2_mkdir (struct user_namespace *, struct inode *,struct dentry *, + umode_t); static int jffs2_rmdir (struct inode *,struct dentry *); -static int jffs2_mknod (struct inode *,struct dentry *,umode_t,dev_t); -static int jffs2_rename (struct inode *, struct dentry *, - struct inode *, struct dentry *, +static int jffs2_mknod (struct user_namespace *, struct inode *,struct dentry *, + umode_t,dev_t); +static int jffs2_rename (struct user_namespace *, struct inode *, + struct dentry *, struct inode *, struct dentry *, unsigned int); const struct file_operations jffs2_dir_operations = @@ -157,8 +160,8 @@ static int jffs2_readdir(struct file *file, struct dir_context *ctx) /***********************************************************************/ -static int jffs2_create(struct inode *dir_i, struct dentry *dentry, - umode_t mode, bool excl) +static int jffs2_create(struct user_namespace *mnt_userns, struct inode *dir_i, + struct dentry *dentry, umode_t mode, bool excl) { struct jffs2_raw_inode *ri; struct jffs2_inode_info *f, *dir_f; @@ -276,7 +279,8 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de /***********************************************************************/ -static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char *target) +static int jffs2_symlink (struct user_namespace *mnt_userns, struct inode *dir_i, + struct dentry *dentry, const char *target) { struct jffs2_inode_info *f, *dir_f; struct jffs2_sb_info *c; @@ -438,7 +442,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char } -static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, umode_t mode) +static int jffs2_mkdir (struct user_namespace *mnt_userns, struct inode *dir_i, + struct dentry *dentry, umode_t mode) { struct jffs2_inode_info *f, *dir_f; struct jffs2_sb_info *c; @@ -609,7 +614,8 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) return ret; } -static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode, dev_t rdev) +static int jffs2_mknod (struct user_namespace *mnt_userns, struct inode *dir_i, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct jffs2_inode_info *f, *dir_f; struct jffs2_sb_info *c; @@ -756,7 +762,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, umode_t mode return ret; } -static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, +static int jffs2_rename (struct user_namespace *mnt_userns, + struct inode *old_dir_i, struct dentry *old_dentry, struct inode *new_dir_i, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 78858f6e9583..2ac410477c4f 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -190,18 +190,19 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) return 0; } -int jffs2_setattr(struct dentry *dentry, struct iattr *iattr) +int jffs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *iattr) { struct inode *inode = d_inode(dentry); int rc; - rc = setattr_prepare(dentry, iattr); + rc = setattr_prepare(&init_user_ns, dentry, iattr); if (rc) return rc; rc = jffs2_do_setattr(inode, iattr); if (!rc && (iattr->ia_valid & ATTR_MODE)) - rc = posix_acl_chmod(inode, inode->i_mode); + rc = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); return rc; } diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index ef1cfa61549e..173eccac691d 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -164,7 +164,7 @@ long jffs2_ioctl(struct file *, unsigned int, unsigned long); extern const struct inode_operations jffs2_symlink_inode_operations; /* fs.c */ -int jffs2_setattr (struct dentry *, struct iattr *); +int jffs2_setattr (struct user_namespace *, struct dentry *, struct iattr *); int jffs2_do_setattr (struct inode *, struct iattr *); struct inode *jffs2_iget(struct super_block *, unsigned long); void jffs2_evict_inode (struct inode *); diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index c2332e30f218..aef5522551db 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -57,6 +57,7 @@ static int jffs2_security_getxattr(const struct xattr_handler *handler, } static int jffs2_security_setxattr(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c index 5d6030826c52..cc3f24883e7d 100644 --- a/fs/jffs2/xattr_trusted.c +++ b/fs/jffs2/xattr_trusted.c @@ -25,6 +25,7 @@ static int jffs2_trusted_getxattr(const struct xattr_handler *handler, } static int jffs2_trusted_setxattr(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c index 9d027b4abcf9..fb945977c013 100644 --- a/fs/jffs2/xattr_user.c +++ b/fs/jffs2/xattr_user.c @@ -25,6 +25,7 @@ static int jffs2_user_getxattr(const struct xattr_handler *handler, } static int jffs2_user_setxattr(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 92cc0ac2d1fc..43c285c3d2a7 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -91,7 +91,8 @@ out: return rc; } -int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +int jfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { int rc; tid_t tid; @@ -101,7 +102,7 @@ int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) tid = txBegin(inode->i_sb, 0); mutex_lock(&JFS_IP(inode)->commit_mutex); if (type == ACL_TYPE_ACCESS && acl) { - rc = posix_acl_update_mode(inode, &mode, &acl); + rc = posix_acl_update_mode(&init_user_ns, inode, &mode, &acl); if (rc) goto end_tx; if (mode != inode->i_mode) diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 930d2701f206..28b70e7c7dd4 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -85,12 +85,13 @@ static int jfs_release(struct inode *inode, struct file *file) return 0; } -int jfs_setattr(struct dentry *dentry, struct iattr *iattr) +int jfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *iattr) { struct inode *inode = d_inode(dentry); int rc; - rc = setattr_prepare(dentry, iattr); + rc = setattr_prepare(&init_user_ns, dentry, iattr); if (rc) return rc; @@ -118,11 +119,11 @@ int jfs_setattr(struct dentry *dentry, struct iattr *iattr) jfs_truncate(inode); } - setattr_copy(inode, iattr); + setattr_copy(&init_user_ns, inode, iattr); mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) - rc = posix_acl_chmod(inode, inode->i_mode); + rc = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); return rc; } diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index 10ee0ecca1a8..2581d4db58ff 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -76,7 +76,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (err) return err; - if (!inode_owner_or_capable(inode)) { + if (!inode_owner_or_capable(&init_user_ns, inode)) { err = -EACCES; goto setflags_out; } diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index 9f8f92dd6f84..7ae389a7a366 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -8,7 +8,8 @@ #ifdef CONFIG_JFS_POSIX_ACL struct posix_acl *jfs_get_acl(struct inode *inode, int type); -int jfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int jfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); int jfs_init_acl(tid_t, struct inode *, struct inode *); #else diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 4cef170630db..59379089e939 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -64,7 +64,7 @@ struct inode *ialloc(struct inode *parent, umode_t mode) goto fail_put; } - inode_init_owner(inode, parent, mode); + inode_init_owner(&init_user_ns, inode, parent, mode); /* * New inodes need to save sane values on disk when * uid & gid mount options are used diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index 70a0d12e427e..01daa0cb0ae5 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h @@ -26,7 +26,7 @@ extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); extern void jfs_set_inode_flags(struct inode *); extern int jfs_get_block(struct inode *, sector_t, struct buffer_head *, int); -extern int jfs_setattr(struct dentry *, struct iattr *); +extern int jfs_setattr(struct user_namespace *, struct dentry *, struct iattr *); extern const struct address_space_operations jfs_aops; extern const struct inode_operations jfs_dir_inode_operations; diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 7a55d14cc1af..9abed0d750e5 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -59,8 +59,8 @@ static inline void free_ea_wmap(struct inode *inode) * RETURN: Errors from subroutines * */ -static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode, - bool excl) +static int jfs_create(struct user_namespace *mnt_userns, struct inode *dip, + struct dentry *dentry, umode_t mode, bool excl) { int rc = 0; tid_t tid; /* transaction id */ @@ -192,7 +192,8 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, umode_t mode, * note: * EACCES: user needs search+write permission on the parent directory */ -static int jfs_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) +static int jfs_mkdir(struct user_namespace *mnt_userns, struct inode *dip, + struct dentry *dentry, umode_t mode) { int rc = 0; tid_t tid; /* transaction id */ @@ -868,8 +869,8 @@ static int jfs_link(struct dentry *old_dentry, * an intermediate result whose length exceeds PATH_MAX [XPG4.2] */ -static int jfs_symlink(struct inode *dip, struct dentry *dentry, - const char *name) +static int jfs_symlink(struct user_namespace *mnt_userns, struct inode *dip, + struct dentry *dentry, const char *name) { int rc; tid_t tid; @@ -1058,9 +1059,9 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, * * FUNCTION: rename a file or directory */ -static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int jfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { struct btstack btstack; ino_t ino; @@ -1344,8 +1345,8 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, * * FUNCTION: Create a special file (device) */ -static int jfs_mknod(struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t rdev) +static int jfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct jfs_inode_info *jfs_ip; struct btstack btstack; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index b2dc4d1f9dcc..1f0ffabbde56 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -551,7 +551,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) ret = -ENOMEM; goto out_unload; } - inode->i_ino = 0; inode->i_size = i_size_read(sb->s_bdev->bd_inode); inode->i_mapping->a_ops = &jfs_metapage_aops; inode_fake_hash(inode); diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index db41e7803163..f9273f6901c8 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -932,6 +932,7 @@ static int jfs_xattr_get(const struct xattr_handler *handler, } static int jfs_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -950,6 +951,7 @@ static int jfs_xattr_get_os2(const struct xattr_handler *handler, } static int jfs_xattr_set_os2(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 7a53eed69fef..7e0e62deab53 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -1110,7 +1110,8 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, return ret; } -static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry, +static int kernfs_iop_mkdir(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, umode_t mode) { struct kernfs_node *parent = dir->i_private; @@ -1147,7 +1148,8 @@ static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) return ret; } -static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry, +static int kernfs_iop_rename(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index fc2469a20fed..d73950fc3d57 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -112,7 +112,8 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) return ret; } -int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr) +int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *iattr) { struct inode *inode = d_inode(dentry); struct kernfs_node *kn = inode->i_private; @@ -122,7 +123,7 @@ int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr) return -EINVAL; mutex_lock(&kernfs_mutex); - error = setattr_prepare(dentry, iattr); + error = setattr_prepare(&init_user_ns, dentry, iattr); if (error) goto out; @@ -131,7 +132,7 @@ int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr) goto out; /* this ignores size changes */ - setattr_copy(inode, iattr); + setattr_copy(&init_user_ns, inode, iattr); out: mutex_unlock(&kernfs_mutex); @@ -183,7 +184,8 @@ static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) set_nlink(inode, kn->dir.subdirs + 2); } -int kernfs_iop_getattr(const struct path *path, struct kstat *stat, +int kernfs_iop_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); @@ -193,7 +195,7 @@ int kernfs_iop_getattr(const struct path *path, struct kstat *stat, kernfs_refresh_inode(kn, inode); mutex_unlock(&kernfs_mutex); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); return 0; } @@ -272,7 +274,8 @@ void kernfs_evict_inode(struct inode *inode) kernfs_put(kn); } -int kernfs_iop_permission(struct inode *inode, int mask) +int kernfs_iop_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { struct kernfs_node *kn; @@ -285,7 +288,7 @@ int kernfs_iop_permission(struct inode *inode, int mask) kernfs_refresh_inode(kn, inode); mutex_unlock(&kernfs_mutex); - return generic_permission(inode, mask); + return generic_permission(&init_user_ns, inode, mask); } int kernfs_xattr_get(struct kernfs_node *kn, const char *name, @@ -319,6 +322,7 @@ static int kernfs_vfs_xattr_get(const struct xattr_handler *handler, } static int kernfs_vfs_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *suffix, const void *value, size_t size, int flags) @@ -385,6 +389,7 @@ static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn, } static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *suffix, const void *value, size_t size, int flags) diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 7ee97ef59184..ccc3b44f6306 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -89,9 +89,12 @@ extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache; */ extern const struct xattr_handler *kernfs_xattr_handlers[]; void kernfs_evict_inode(struct inode *inode); -int kernfs_iop_permission(struct inode *inode, int mask); -int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr); -int kernfs_iop_getattr(const struct path *path, struct kstat *stat, +int kernfs_iop_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask); +int kernfs_iop_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *iattr); +int kernfs_iop_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); diff --git a/fs/libfs.c b/fs/libfs.c index abf7674fb437..e2de5401abca 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -27,11 +27,12 @@ #include "internal.h" -int simple_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags) +int simple_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9); return 0; } @@ -447,9 +448,9 @@ int simple_rmdir(struct inode *dir, struct dentry *dentry) } EXPORT_SYMBOL(simple_rmdir); -int simple_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +int simple_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { struct inode *inode = d_inode(old_dentry); int they_are_dirs = d_is_dir(old_dentry); @@ -492,18 +493,19 @@ EXPORT_SYMBOL(simple_rename); * on simple regular filesystems. Anything that needs to change on-disk * or wire state on size changes needs its own setattr method. */ -int simple_setattr(struct dentry *dentry, struct iattr *iattr) +int simple_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *iattr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(dentry, iattr); + error = setattr_prepare(mnt_userns, dentry, iattr); if (error) return error; if (iattr->ia_valid & ATTR_SIZE) truncate_setsize(inode, iattr->ia_size); - setattr_copy(inode, iattr); + setattr_copy(mnt_userns, inode, iattr); mark_inode_dirty(inode); return 0; } @@ -1214,11 +1216,6 @@ static int anon_set_page_dirty(struct page *page) return 0; }; -/* - * A single inode exists for all anon_inode files. Contrary to pipes, - * anon_inode inodes have no associated per-instance data, so we need - * only allocate one of them. - */ struct inode *alloc_anon_inode(struct super_block *s) { static const struct address_space_operations anon_aops = { @@ -1300,15 +1297,17 @@ static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(-ENOENT); } -static int empty_dir_getattr(const struct path *path, struct kstat *stat, +static int empty_dir_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); return 0; } -static int empty_dir_setattr(struct dentry *dentry, struct iattr *attr) +static int empty_dir_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr) { return -EPERM; } diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index f4e5e5181a14..9115948c624e 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c @@ -252,7 +252,7 @@ struct inode *minix_new_inode(const struct inode *dir, umode_t mode, int *error) iput(inode); return NULL; } - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); inode->i_ino = j; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_blocks = 0; diff --git a/fs/minix/file.c b/fs/minix/file.c index c50b0a20fcd9..6a7bd2d9eec0 100644 --- a/fs/minix/file.c +++ b/fs/minix/file.c @@ -22,12 +22,13 @@ const struct file_operations minix_file_operations = { .splice_read = generic_file_splice_read, }; -static int minix_setattr(struct dentry *dentry, struct iattr *attr) +static int minix_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) return error; @@ -41,7 +42,7 @@ static int minix_setattr(struct dentry *dentry, struct iattr *attr) minix_truncate(inode); } - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 34f546404aa1..a532a99bbe81 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -652,13 +652,13 @@ static int minix_write_inode(struct inode *inode, struct writeback_control *wbc) return err; } -int minix_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +int minix_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) { struct super_block *sb = path->dentry->d_sb; struct inode *inode = d_inode(path->dentry); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); if (INODE_VERSION(inode) == MINIX_V1) stat->blocks = (BLOCK_SIZE / 512) * V1_minix_blocks(stat->size, sb); else diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 168d45d3de73..202173368025 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -51,7 +51,8 @@ extern unsigned long minix_count_free_inodes(struct super_block *sb); extern int minix_new_block(struct inode * inode); extern void minix_free_block(struct inode *inode, unsigned long block); extern unsigned long minix_count_free_blocks(struct super_block *sb); -extern int minix_getattr(const struct path *, struct kstat *, u32, unsigned int); +extern int minix_getattr(struct user_namespace *, const struct path *, + struct kstat *, u32, unsigned int); extern int minix_prepare_chunk(struct page *page, loff_t pos, unsigned len); extern void V1_minix_truncate(struct inode *); diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 1a6084d2b02e..937fa5fae2b8 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -33,7 +33,8 @@ static struct dentry *minix_lookup(struct inode * dir, struct dentry *dentry, un return d_splice_alias(inode, dentry); } -static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode, dev_t rdev) +static int minix_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { int error; struct inode *inode; @@ -51,7 +52,8 @@ static int minix_mknod(struct inode * dir, struct dentry *dentry, umode_t mode, return error; } -static int minix_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +static int minix_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { int error; struct inode *inode = minix_new_inode(dir, mode, &error); @@ -63,14 +65,14 @@ static int minix_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) return error; } -static int minix_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int minix_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { - return minix_mknod(dir, dentry, mode, 0); + return minix_mknod(mnt_userns, dir, dentry, mode, 0); } -static int minix_symlink(struct inode * dir, struct dentry *dentry, - const char * symname) +static int minix_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { int err = -ENAMETOOLONG; int i = strlen(symname)+1; @@ -109,7 +111,8 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir, return add_nondir(dentry, inode); } -static int minix_mkdir(struct inode * dir, struct dentry *dentry, umode_t mode) +static int minix_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode * inode; int err; @@ -181,8 +184,9 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry) return err; } -static int minix_rename(struct inode * old_dir, struct dentry *old_dentry, - struct inode * new_dir, struct dentry *new_dentry, +static int minix_rename(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct inode * old_inode = d_inode(old_dentry); diff --git a/fs/mount.h b/fs/mount.h index ce6c376e0bc2..0b6e08cf8afb 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -124,16 +124,6 @@ static inline void get_mnt_ns(struct mnt_namespace *ns) extern seqlock_t mount_lock; -static inline void lock_mount_hash(void) -{ - write_seqlock(&mount_lock); -} - -static inline void unlock_mount_hash(void) -{ - write_sequnlock(&mount_lock); -} - struct proc_mounts { struct mnt_namespace *ns; struct path root; diff --git a/fs/mpage.c b/fs/mpage.c index 830e6cc2a9e7..961234d68779 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -304,9 +304,7 @@ alloc_new: goto out; } args->bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), - min_t(int, args->nr_pages, - BIO_MAX_PAGES), - gfp); + bio_max_segs(args->nr_pages), gfp); if (args->bio == NULL) goto confused; } diff --git a/fs/namei.c b/fs/namei.c index de74ad2bc6e2..216f16e74351 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -259,7 +259,24 @@ void putname(struct filename *name) __putname(name); } -static int check_acl(struct inode *inode, int mask) +/** + * check_acl - perform ACL permission checking + * @mnt_userns: user namespace of the mount the inode was found from + * @inode: inode to check permissions on + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) + * + * This function performs the ACL permission checking. Since this function + * retrieve POSIX acls it needs to know whether it is called from a blocking or + * non-blocking context and thus cares about the MAY_NOT_BLOCK bit. + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply passs init_user_ns. + */ +static int check_acl(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *acl; @@ -271,14 +288,14 @@ static int check_acl(struct inode *inode, int mask) /* no ->get_acl() calls in RCU mode... */ if (is_uncached_acl(acl)) return -ECHILD; - return posix_acl_permission(inode, acl, mask); + return posix_acl_permission(mnt_userns, inode, acl, mask); } acl = get_acl(inode, ACL_TYPE_ACCESS); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { - int error = posix_acl_permission(inode, acl, mask); + int error = posix_acl_permission(mnt_userns, inode, acl, mask); posix_acl_release(acl); return error; } @@ -287,18 +304,31 @@ static int check_acl(struct inode *inode, int mask) return -EAGAIN; } -/* - * This does the basic UNIX permission checking. +/** + * acl_permission_check - perform basic UNIX permission checking + * @mnt_userns: user namespace of the mount the inode was found from + * @inode: inode to check permissions on + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) + * + * This function performs the basic UNIX permission checking. Since this + * function may retrieve POSIX acls it needs to know whether it is called from a + * blocking or non-blocking context and thus cares about the MAY_NOT_BLOCK bit. * - * Note that the POSIX ACL check cares about the MAY_NOT_BLOCK bit, - * for RCU walking. + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply passs init_user_ns. */ -static int acl_permission_check(struct inode *inode, int mask) +static int acl_permission_check(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { unsigned int mode = inode->i_mode; + kuid_t i_uid; /* Are we the owner? If so, ACL's don't matter */ - if (likely(uid_eq(current_fsuid(), inode->i_uid))) { + i_uid = i_uid_into_mnt(mnt_userns, inode); + if (likely(uid_eq(current_fsuid(), i_uid))) { mask &= 7; mode >>= 6; return (mask & ~mode) ? -EACCES : 0; @@ -306,7 +336,7 @@ static int acl_permission_check(struct inode *inode, int mask) /* Do we have ACL's? */ if (IS_POSIXACL(inode) && (mode & S_IRWXG)) { - int error = check_acl(inode, mask); + int error = check_acl(mnt_userns, inode, mask); if (error != -EAGAIN) return error; } @@ -320,7 +350,8 @@ static int acl_permission_check(struct inode *inode, int mask) * about? Need to check group ownership if so. */ if (mask & (mode ^ (mode >> 3))) { - if (in_group_p(inode->i_gid)) + kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); + if (in_group_p(kgid)) mode >>= 3; } @@ -330,6 +361,7 @@ static int acl_permission_check(struct inode *inode, int mask) /** * generic_permission - check for access rights on a Posix-like filesystem + * @mnt_userns: user namespace of the mount the inode was found from * @inode: inode to check access rights for * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC, * %MAY_NOT_BLOCK ...) @@ -342,25 +374,33 @@ static int acl_permission_check(struct inode *inode, int mask) * generic_permission is rcu-walk aware. It returns -ECHILD in case an rcu-walk * request cannot be satisfied (eg. requires blocking or too much complexity). * It would then be called again in ref-walk mode. + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply passs init_user_ns. */ -int generic_permission(struct inode *inode, int mask) +int generic_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask) { int ret; /* * Do the basic permission checks. */ - ret = acl_permission_check(inode, mask); + ret = acl_permission_check(mnt_userns, inode, mask); if (ret != -EACCES) return ret; if (S_ISDIR(inode->i_mode)) { /* DACs are overridable for directories */ if (!(mask & MAY_WRITE)) - if (capable_wrt_inode_uidgid(inode, + if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_DAC_READ_SEARCH)) return 0; - if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE)) + if (capable_wrt_inode_uidgid(mnt_userns, inode, + CAP_DAC_OVERRIDE)) return 0; return -EACCES; } @@ -370,7 +410,8 @@ int generic_permission(struct inode *inode, int mask) */ mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ) - if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH)) + if (capable_wrt_inode_uidgid(mnt_userns, inode, + CAP_DAC_READ_SEARCH)) return 0; /* * Read/write DACs are always overridable. @@ -378,31 +419,38 @@ int generic_permission(struct inode *inode, int mask) * at least one exec bit set. */ if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) - if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE)) + if (capable_wrt_inode_uidgid(mnt_userns, inode, + CAP_DAC_OVERRIDE)) return 0; return -EACCES; } EXPORT_SYMBOL(generic_permission); -/* +/** + * do_inode_permission - UNIX permission checking + * @mnt_userns: user namespace of the mount the inode was found from + * @inode: inode to check permissions on + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC ...) + * * We _really_ want to just do "generic_permission()" without * even looking at the inode->i_op values. So we keep a cache * flag in inode->i_opflags, that says "this has not special * permission function, use the fast case". */ -static inline int do_inode_permission(struct inode *inode, int mask) +static inline int do_inode_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) { if (likely(inode->i_op->permission)) - return inode->i_op->permission(inode, mask); + return inode->i_op->permission(mnt_userns, inode, mask); /* This gets set once for the inode lifetime */ spin_lock(&inode->i_lock); inode->i_opflags |= IOP_FASTPERM; spin_unlock(&inode->i_lock); } - return generic_permission(inode, mask); + return generic_permission(mnt_userns, inode, mask); } /** @@ -427,8 +475,9 @@ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) /** * inode_permission - Check for access rights to a given inode - * @inode: Inode to check permission on - * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * @mnt_userns: User namespace of the mount the inode was found from + * @inode: Inode to check permission on + * @mask: Right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) * * Check for read/write/execute permissions on an inode. We use fs[ug]id for * this, letting us set arbitrary permissions for filesystem access without @@ -436,7 +485,8 @@ static int sb_permission(struct super_block *sb, struct inode *inode, int mask) * * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask. */ -int inode_permission(struct inode *inode, int mask) +int inode_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { int retval; @@ -456,11 +506,11 @@ int inode_permission(struct inode *inode, int mask) * written back improperly if their true value is unknown * to the vfs. */ - if (HAS_UNMAPPED_ID(inode)) + if (HAS_UNMAPPED_ID(mnt_userns, inode)) return -EACCES; } - retval = do_inode_permission(inode, mask); + retval = do_inode_permission(mnt_userns, inode, mask); if (retval) return retval; @@ -960,11 +1010,16 @@ int sysctl_protected_regular __read_mostly; */ static inline int may_follow_link(struct nameidata *nd, const struct inode *inode) { + struct user_namespace *mnt_userns; + kuid_t i_uid; + if (!sysctl_protected_symlinks) return 0; + mnt_userns = mnt_user_ns(nd->path.mnt); + i_uid = i_uid_into_mnt(mnt_userns, inode); /* Allowed if owner and follower match. */ - if (uid_eq(current_cred()->fsuid, inode->i_uid)) + if (uid_eq(current_cred()->fsuid, i_uid)) return 0; /* Allowed if parent directory not sticky and world-writable. */ @@ -972,7 +1027,7 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod return 0; /* Allowed if parent directory and link owner match. */ - if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, inode->i_uid)) + if (uid_valid(nd->dir_uid) && uid_eq(nd->dir_uid, i_uid)) return 0; if (nd->flags & LOOKUP_RCU) @@ -985,6 +1040,7 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod /** * safe_hardlink_source - Check for safe hardlink conditions + * @mnt_userns: user namespace of the mount the inode was found from * @inode: the source inode to hardlink from * * Return false if at least one of the following conditions: @@ -995,7 +1051,8 @@ static inline int may_follow_link(struct nameidata *nd, const struct inode *inod * * Otherwise returns true. */ -static bool safe_hardlink_source(struct inode *inode) +static bool safe_hardlink_source(struct user_namespace *mnt_userns, + struct inode *inode) { umode_t mode = inode->i_mode; @@ -1012,7 +1069,7 @@ static bool safe_hardlink_source(struct inode *inode) return false; /* Hardlinking to unreadable or unwritable sources is dangerous. */ - if (inode_permission(inode, MAY_READ | MAY_WRITE)) + if (inode_permission(mnt_userns, inode, MAY_READ | MAY_WRITE)) return false; return true; @@ -1020,6 +1077,7 @@ static bool safe_hardlink_source(struct inode *inode) /** * may_linkat - Check permissions for creating a hardlink + * @mnt_userns: user namespace of the mount the inode was found from * @link: the source to hardlink from * * Block hardlink when all of: @@ -1028,14 +1086,21 @@ static bool safe_hardlink_source(struct inode *inode) * - hardlink source is unsafe (see safe_hardlink_source() above) * - not CAP_FOWNER in a namespace with the inode owner uid mapped * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply passs init_user_ns. + * * Returns 0 if successful, -ve on error. */ -int may_linkat(struct path *link) +int may_linkat(struct user_namespace *mnt_userns, struct path *link) { struct inode *inode = link->dentry->d_inode; /* Inode writeback is not safe when the uid or gid are invalid. */ - if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid)) + if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) || + !gid_valid(i_gid_into_mnt(mnt_userns, inode))) return -EOVERFLOW; if (!sysctl_protected_hardlinks) @@ -1044,7 +1109,8 @@ int may_linkat(struct path *link) /* Source inode owner (or CAP_FOWNER) can hardlink all they like, * otherwise, it must be a safe source. */ - if (safe_hardlink_source(inode) || inode_owner_or_capable(inode)) + if (safe_hardlink_source(mnt_userns, inode) || + inode_owner_or_capable(mnt_userns, inode)) return 0; audit_log_path_denied(AUDIT_ANOM_LINK, "linkat"); @@ -1055,6 +1121,7 @@ int may_linkat(struct path *link) * may_create_in_sticky - Check whether an O_CREAT open in a sticky directory * should be allowed, or not, on files that already * exist. + * @mnt_userns: user namespace of the mount the inode was found from * @dir_mode: mode bits of directory * @dir_uid: owner of directory * @inode: the inode of the file to open @@ -1070,16 +1137,25 @@ int may_linkat(struct path *link) * the directory doesn't have to be world writable: being group writable will * be enough. * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply passs init_user_ns. + * * Returns 0 if the open is allowed, -ve on error. */ -static int may_create_in_sticky(umode_t dir_mode, kuid_t dir_uid, - struct inode * const inode) +static int may_create_in_sticky(struct user_namespace *mnt_userns, + struct nameidata *nd, struct inode *const inode) { + umode_t dir_mode = nd->dir_mode; + kuid_t dir_uid = nd->dir_uid; + if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) || (!sysctl_protected_regular && S_ISREG(inode->i_mode)) || likely(!(dir_mode & S_ISVTX)) || - uid_eq(inode->i_uid, dir_uid) || - uid_eq(current_fsuid(), inode->i_uid)) + uid_eq(i_uid_into_mnt(mnt_userns, inode), dir_uid) || + uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) return 0; if (likely(dir_mode & 0002) || @@ -1569,14 +1645,15 @@ static struct dentry *lookup_slow(const struct qstr *name, return res; } -static inline int may_lookup(struct nameidata *nd) +static inline int may_lookup(struct user_namespace *mnt_userns, + struct nameidata *nd) { if (nd->flags & LOOKUP_RCU) { - int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK); + int err = inode_permission(mnt_userns, nd->inode, MAY_EXEC|MAY_NOT_BLOCK); if (err != -ECHILD || !try_to_unlazy(nd)) return err; } - return inode_permission(nd->inode, MAY_EXEC); + return inode_permission(mnt_userns, nd->inode, MAY_EXEC); } static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq) @@ -2122,11 +2199,13 @@ static int link_path_walk(const char *name, struct nameidata *nd) /* At this point we know we have a real path component. */ for(;;) { + struct user_namespace *mnt_userns; const char *link; u64 hash_len; int type; - err = may_lookup(nd); + mnt_userns = mnt_user_ns(nd->path.mnt); + err = may_lookup(mnt_userns, nd); if (err) return err; @@ -2174,7 +2253,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) OK: /* pathname or trailing symlink, done */ if (!depth) { - nd->dir_uid = nd->inode->i_uid; + nd->dir_uid = i_uid_into_mnt(mnt_userns, nd->inode); nd->dir_mode = nd->inode->i_mode; nd->flags &= ~LOOKUP_PARENT; return 0; @@ -2511,7 +2590,7 @@ static int lookup_one_len_common(const char *name, struct dentry *base, return err; } - return inode_permission(base->d_inode, MAY_EXEC); + return inode_permission(&init_user_ns, base->d_inode, MAY_EXEC); } /** @@ -2656,15 +2735,16 @@ int user_path_at_empty(int dfd, const char __user *name, unsigned flags, } EXPORT_SYMBOL(user_path_at_empty); -int __check_sticky(struct inode *dir, struct inode *inode) +int __check_sticky(struct user_namespace *mnt_userns, struct inode *dir, + struct inode *inode) { kuid_t fsuid = current_fsuid(); - if (uid_eq(inode->i_uid, fsuid)) + if (uid_eq(i_uid_into_mnt(mnt_userns, inode), fsuid)) return 0; - if (uid_eq(dir->i_uid, fsuid)) + if (uid_eq(i_uid_into_mnt(mnt_userns, dir), fsuid)) return 0; - return !capable_wrt_inode_uidgid(inode, CAP_FOWNER); + return !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FOWNER); } EXPORT_SYMBOL(__check_sticky); @@ -2688,7 +2768,8 @@ EXPORT_SYMBOL(__check_sticky); * 11. We don't allow removal of NFS sillyrenamed files; it's handled by * nfs_async_unlink(). */ -static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) +static int may_delete(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *victim, bool isdir) { struct inode *inode = d_backing_inode(victim); int error; @@ -2700,19 +2781,21 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) BUG_ON(victim->d_parent->d_inode != dir); /* Inode writeback is not safe when the uid or gid are invalid. */ - if (!uid_valid(inode->i_uid) || !gid_valid(inode->i_gid)) + if (!uid_valid(i_uid_into_mnt(mnt_userns, inode)) || + !gid_valid(i_gid_into_mnt(mnt_userns, inode))) return -EOVERFLOW; audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); - error = inode_permission(dir, MAY_WRITE | MAY_EXEC); + error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); if (error) return error; if (IS_APPEND(dir)) return -EPERM; - if (check_sticky(dir, inode) || IS_APPEND(inode) || - IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || HAS_UNMAPPED_ID(inode)) + if (check_sticky(mnt_userns, dir, inode) || IS_APPEND(inode) || + IS_IMMUTABLE(inode) || IS_SWAPFILE(inode) || + HAS_UNMAPPED_ID(mnt_userns, inode)) return -EPERM; if (isdir) { if (!d_is_dir(victim)) @@ -2737,7 +2820,8 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) * 4. We should have write and exec permissions on dir * 5. We can't do it if dir is immutable (done in permission()) */ -static inline int may_create(struct inode *dir, struct dentry *child) +static inline int may_create(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *child) { struct user_namespace *s_user_ns; audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE); @@ -2746,10 +2830,10 @@ static inline int may_create(struct inode *dir, struct dentry *child) if (IS_DEADDIR(dir)) return -ENOENT; s_user_ns = dir->i_sb->s_user_ns; - if (!kuid_has_mapping(s_user_ns, current_fsuid()) || - !kgid_has_mapping(s_user_ns, current_fsgid())) + if (!kuid_has_mapping(s_user_ns, fsuid_into_mnt(mnt_userns)) || + !kgid_has_mapping(s_user_ns, fsgid_into_mnt(mnt_userns))) return -EOVERFLOW; - return inode_permission(dir, MAY_WRITE | MAY_EXEC); + return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); } /* @@ -2796,10 +2880,26 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) } EXPORT_SYMBOL(unlock_rename); -int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool want_excl) +/** + * vfs_create - create new file + * @mnt_userns: user namespace of the mount the inode was found from + * @dir: inode of @dentry + * @dentry: pointer to dentry of the base directory + * @mode: mode of the new file + * @want_excl: whether the file must not yet exist + * + * Create a new file. + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply passs init_user_ns. + */ +int vfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool want_excl) { - int error = may_create(dir, dentry); + int error = may_create(mnt_userns, dir, dentry); if (error) return error; @@ -2810,7 +2910,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, error = security_inode_create(dir, dentry, mode); if (error) return error; - error = dir->i_op->create(dir, dentry, mode, want_excl); + error = dir->i_op->create(mnt_userns, dir, dentry, mode, want_excl); if (!error) fsnotify_create(dir, dentry); return error; @@ -2822,7 +2922,7 @@ int vfs_mkobj(struct dentry *dentry, umode_t mode, void *arg) { struct inode *dir = dentry->d_parent->d_inode; - int error = may_create(dir, dentry); + int error = may_create(&init_user_ns, dir, dentry); if (error) return error; @@ -2844,7 +2944,8 @@ bool may_open_dev(const struct path *path) !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV); } -static int may_open(const struct path *path, int acc_mode, int flag) +static int may_open(struct user_namespace *mnt_userns, const struct path *path, + int acc_mode, int flag) { struct dentry *dentry = path->dentry; struct inode *inode = dentry->d_inode; @@ -2879,7 +2980,7 @@ static int may_open(const struct path *path, int acc_mode, int flag) break; } - error = inode_permission(inode, MAY_OPEN | acc_mode); + error = inode_permission(mnt_userns, inode, MAY_OPEN | acc_mode); if (error) return error; @@ -2894,13 +2995,13 @@ static int may_open(const struct path *path, int acc_mode, int flag) } /* O_NOATIME can only be set by the owner or superuser */ - if (flag & O_NOATIME && !inode_owner_or_capable(inode)) + if (flag & O_NOATIME && !inode_owner_or_capable(mnt_userns, inode)) return -EPERM; return 0; } -static int handle_truncate(struct file *filp) +static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp) { const struct path *path = &filp->f_path; struct inode *inode = path->dentry->d_inode; @@ -2914,7 +3015,7 @@ static int handle_truncate(struct file *filp) if (!error) error = security_path_truncate(path); if (!error) { - error = do_truncate(path->dentry, 0, + error = do_truncate(mnt_userns, path->dentry, 0, ATTR_MTIME|ATTR_CTIME|ATTR_OPEN, filp); } @@ -2929,7 +3030,9 @@ static inline int open_to_namei_flags(int flag) return flag; } -static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t mode) +static int may_o_create(struct user_namespace *mnt_userns, + const struct path *dir, struct dentry *dentry, + umode_t mode) { struct user_namespace *s_user_ns; int error = security_path_mknod(dir, dentry, mode, 0); @@ -2937,11 +3040,12 @@ static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t m return error; s_user_ns = dir->dentry->d_sb->s_user_ns; - if (!kuid_has_mapping(s_user_ns, current_fsuid()) || - !kgid_has_mapping(s_user_ns, current_fsgid())) + if (!kuid_has_mapping(s_user_ns, fsuid_into_mnt(mnt_userns)) || + !kgid_has_mapping(s_user_ns, fsgid_into_mnt(mnt_userns))) return -EOVERFLOW; - error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC); + error = inode_permission(mnt_userns, dir->dentry->d_inode, + MAY_WRITE | MAY_EXEC); if (error) return error; @@ -3020,6 +3124,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, const struct open_flags *op, bool got_write) { + struct user_namespace *mnt_userns; struct dentry *dir = nd->path.dentry; struct inode *dir_inode = dir->d_inode; int open_flag = op->open_flag; @@ -3067,13 +3172,15 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, */ if (unlikely(!got_write)) open_flag &= ~O_TRUNC; + mnt_userns = mnt_user_ns(nd->path.mnt); if (open_flag & O_CREAT) { if (open_flag & O_EXCL) open_flag &= ~O_TRUNC; if (!IS_POSIXACL(dir->d_inode)) mode &= ~current_umask(); if (likely(got_write)) - create_error = may_o_create(&nd->path, dentry, mode); + create_error = may_o_create(mnt_userns, &nd->path, + dentry, mode); else create_error = -EROFS; } @@ -3108,8 +3215,9 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, error = -EACCES; goto out_dput; } - error = dir_inode->i_op->create(dir_inode, dentry, mode, - open_flag & O_EXCL); + + error = dir_inode->i_op->create(mnt_userns, dir_inode, dentry, + mode, open_flag & O_EXCL); if (error) goto out_dput; } @@ -3213,6 +3321,7 @@ finish_lookup: static int do_open(struct nameidata *nd, struct file *file, const struct open_flags *op) { + struct user_namespace *mnt_userns; int open_flag = op->open_flag; bool do_truncate; int acc_mode; @@ -3225,12 +3334,13 @@ static int do_open(struct nameidata *nd, } if (!(file->f_mode & FMODE_CREATED)) audit_inode(nd->name, nd->path.dentry, 0); + mnt_userns = mnt_user_ns(nd->path.mnt); if (open_flag & O_CREAT) { if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED)) return -EEXIST; if (d_is_dir(nd->path.dentry)) return -EISDIR; - error = may_create_in_sticky(nd->dir_mode, nd->dir_uid, + error = may_create_in_sticky(mnt_userns, nd, d_backing_inode(nd->path.dentry)); if (unlikely(error)) return error; @@ -3250,13 +3360,13 @@ static int do_open(struct nameidata *nd, return error; do_truncate = true; } - error = may_open(&nd->path, acc_mode, open_flag); + error = may_open(mnt_userns, &nd->path, acc_mode, open_flag); if (!error && !(file->f_mode & FMODE_OPENED)) error = vfs_open(&nd->path, file); if (!error) error = ima_file_check(file, op->acc_mode); if (!error && do_truncate) - error = handle_truncate(file); + error = handle_truncate(mnt_userns, file); if (unlikely(error > 0)) { WARN_ON(1); error = -EINVAL; @@ -3266,7 +3376,23 @@ static int do_open(struct nameidata *nd, return error; } -struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag) +/** + * vfs_tmpfile - create tmpfile + * @mnt_userns: user namespace of the mount the inode was found from + * @dentry: pointer to dentry of the base directory + * @mode: mode of the new tmpfile + * @open_flags: flags + * + * Create a temporary file. + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply passs init_user_ns. + */ +struct dentry *vfs_tmpfile(struct user_namespace *mnt_userns, + struct dentry *dentry, umode_t mode, int open_flag) { struct dentry *child = NULL; struct inode *dir = dentry->d_inode; @@ -3274,7 +3400,7 @@ struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag) int error; /* we want directory to be writable */ - error = inode_permission(dir, MAY_WRITE | MAY_EXEC); + error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC); if (error) goto out_err; error = -EOPNOTSUPP; @@ -3284,7 +3410,7 @@ struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag) child = d_alloc(dentry, &slash_name); if (unlikely(!child)) goto out_err; - error = dir->i_op->tmpfile(dir, child, mode); + error = dir->i_op->tmpfile(mnt_userns, dir, child, mode); if (error) goto out_err; error = -ENOENT; @@ -3296,7 +3422,7 @@ struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag) inode->i_state |= I_LINKABLE; spin_unlock(&inode->i_lock); } - ima_post_create_tmpfile(inode); + ima_post_create_tmpfile(mnt_userns, inode); return child; out_err: @@ -3309,6 +3435,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, const struct open_flags *op, struct file *file) { + struct user_namespace *mnt_userns; struct dentry *child; struct path path; int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path); @@ -3317,7 +3444,8 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, error = mnt_want_write(path.mnt); if (unlikely(error)) goto out; - child = vfs_tmpfile(path.dentry, op->mode, op->open_flag); + mnt_userns = mnt_user_ns(path.mnt); + child = vfs_tmpfile(mnt_userns, path.dentry, op->mode, op->open_flag); error = PTR_ERR(child); if (IS_ERR(child)) goto out2; @@ -3325,7 +3453,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, path.dentry = child; audit_inode(nd->name, child, 0); /* Don't check for other permissions, the inode was just created */ - error = may_open(&path, 0, op->open_flag); + error = may_open(mnt_userns, &path, 0, op->open_flag); if (!error) error = vfs_open(&path, file); out2: @@ -3527,10 +3655,27 @@ inline struct dentry *user_path_create(int dfd, const char __user *pathname, } EXPORT_SYMBOL(user_path_create); -int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) +/** + * vfs_mknod - create device node or file + * @mnt_userns: user namespace of the mount the inode was found from + * @dir: inode of @dentry + * @dentry: pointer to dentry of the base directory + * @mode: mode of the new device node or file + * @dev: device number of device to create + * + * Create a device node or file. + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply passs init_user_ns. + */ +int vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t dev) { bool is_whiteout = S_ISCHR(mode) && dev == WHITEOUT_DEV; - int error = may_create(dir, dentry); + int error = may_create(mnt_userns, dir, dentry); if (error) return error; @@ -3550,7 +3695,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) if (error) return error; - error = dir->i_op->mknod(dir, dentry, mode, dev); + error = dir->i_op->mknod(mnt_userns, dir, dentry, mode, dev); if (!error) fsnotify_create(dir, dentry); return error; @@ -3577,6 +3722,7 @@ static int may_mknod(umode_t mode) static long do_mknodat(int dfd, const char __user *filename, umode_t mode, unsigned int dev) { + struct user_namespace *mnt_userns; struct dentry *dentry; struct path path; int error; @@ -3595,18 +3741,22 @@ retry: error = security_path_mknod(&path, dentry, mode, dev); if (error) goto out; + + mnt_userns = mnt_user_ns(path.mnt); switch (mode & S_IFMT) { case 0: case S_IFREG: - error = vfs_create(path.dentry->d_inode,dentry,mode,true); + error = vfs_create(mnt_userns, path.dentry->d_inode, + dentry, mode, true); if (!error) - ima_post_path_mknod(dentry); + ima_post_path_mknod(mnt_userns, dentry); break; case S_IFCHR: case S_IFBLK: - error = vfs_mknod(path.dentry->d_inode,dentry,mode, - new_decode_dev(dev)); + error = vfs_mknod(mnt_userns, path.dentry->d_inode, + dentry, mode, new_decode_dev(dev)); break; case S_IFIFO: case S_IFSOCK: - error = vfs_mknod(path.dentry->d_inode,dentry,mode,0); + error = vfs_mknod(mnt_userns, path.dentry->d_inode, + dentry, mode, 0); break; } out: @@ -3629,9 +3779,25 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d return do_mknodat(AT_FDCWD, filename, mode, dev); } -int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +/** + * vfs_mkdir - create directory + * @mnt_userns: user namespace of the mount the inode was found from + * @dir: inode of @dentry + * @dentry: pointer to dentry of the base directory + * @mode: mode of the new directory + * + * Create a directory. + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply passs init_user_ns. + */ +int vfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { - int error = may_create(dir, dentry); + int error = may_create(mnt_userns, dir, dentry); unsigned max_links = dir->i_sb->s_max_links; if (error) @@ -3648,7 +3814,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (max_links && dir->i_nlink >= max_links) return -EMLINK; - error = dir->i_op->mkdir(dir, dentry, mode); + error = dir->i_op->mkdir(mnt_userns, dir, dentry, mode); if (!error) fsnotify_mkdir(dir, dentry); return error; @@ -3670,8 +3836,12 @@ retry: if (!IS_POSIXACL(path.dentry->d_inode)) mode &= ~current_umask(); error = security_path_mkdir(&path, dentry, mode); - if (!error) - error = vfs_mkdir(path.dentry->d_inode, dentry, mode); + if (!error) { + struct user_namespace *mnt_userns; + mnt_userns = mnt_user_ns(path.mnt); + error = vfs_mkdir(mnt_userns, path.dentry->d_inode, dentry, + mode); + } done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -3690,9 +3860,24 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode) return do_mkdirat(AT_FDCWD, pathname, mode); } -int vfs_rmdir(struct inode *dir, struct dentry *dentry) +/** + * vfs_rmdir - remove directory + * @mnt_userns: user namespace of the mount the inode was found from + * @dir: inode of @dentry + * @dentry: pointer to dentry of the base directory + * + * Remove a directory. + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply passs init_user_ns. + */ +int vfs_rmdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry) { - int error = may_delete(dir, dentry, 1); + int error = may_delete(mnt_userns, dir, dentry, 1); if (error) return error; @@ -3732,6 +3917,7 @@ EXPORT_SYMBOL(vfs_rmdir); long do_rmdir(int dfd, struct filename *name) { + struct user_namespace *mnt_userns; int error = 0; struct dentry *dentry; struct path path; @@ -3772,7 +3958,8 @@ retry: error = security_path_rmdir(&path, dentry); if (error) goto exit3; - error = vfs_rmdir(path.dentry->d_inode, dentry); + mnt_userns = mnt_user_ns(path.mnt); + error = vfs_rmdir(mnt_userns, path.dentry->d_inode, dentry); exit3: dput(dentry); exit2: @@ -3795,6 +3982,7 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname) /** * vfs_unlink - unlink a filesystem object + * @mnt_userns: user namespace of the mount the inode was found from * @dir: parent directory * @dentry: victim * @delegated_inode: returns victim inode, if the inode is delegated. @@ -3810,11 +3998,18 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname) * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply passs init_user_ns. */ -int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode) +int vfs_unlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, struct inode **delegated_inode) { struct inode *target = dentry->d_inode; - int error = may_delete(dir, dentry, 0); + int error = may_delete(mnt_userns, dir, dentry, 0); if (error) return error; @@ -3885,6 +4080,8 @@ retry_deleg: dentry = __lookup_hash(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { + struct user_namespace *mnt_userns; + /* Why not before? Because we want correct error value */ if (last.name[last.len]) goto slashes; @@ -3895,7 +4092,9 @@ retry_deleg: error = security_path_unlink(&path, dentry); if (error) goto exit2; - error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode); + mnt_userns = mnt_user_ns(path.mnt); + error = vfs_unlink(mnt_userns, path.dentry->d_inode, dentry, + &delegated_inode); exit2: dput(dentry); } @@ -3944,9 +4143,25 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname) return do_unlinkat(AT_FDCWD, getname(pathname)); } -int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) +/** + * vfs_symlink - create symlink + * @mnt_userns: user namespace of the mount the inode was found from + * @dir: inode of @dentry + * @dentry: pointer to dentry of the base directory + * @oldname: name of the file to link to + * + * Create a symlink. + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply passs init_user_ns. + */ +int vfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *oldname) { - int error = may_create(dir, dentry); + int error = may_create(mnt_userns, dir, dentry); if (error) return error; @@ -3958,7 +4173,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) if (error) return error; - error = dir->i_op->symlink(dir, dentry, oldname); + error = dir->i_op->symlink(mnt_userns, dir, dentry, oldname); if (!error) fsnotify_create(dir, dentry); return error; @@ -3984,8 +4199,13 @@ retry: goto out_putname; error = security_path_symlink(&path, dentry, from->name); - if (!error) - error = vfs_symlink(path.dentry->d_inode, dentry, from->name); + if (!error) { + struct user_namespace *mnt_userns; + + mnt_userns = mnt_user_ns(path.mnt); + error = vfs_symlink(mnt_userns, path.dentry->d_inode, dentry, + from->name); + } done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -4010,6 +4230,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn /** * vfs_link - create a new link * @old_dentry: object to be linked + * @mnt_userns: the user namespace of the mount * @dir: new parent * @new_dentry: where to create the new link * @delegated_inode: returns inode needing a delegation break @@ -4025,8 +4246,16 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn * Alternatively, a caller may pass NULL for delegated_inode. This may * be appropriate for callers that expect the underlying filesystem not * to be NFS exported. + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then take + * care to map the inode according to @mnt_userns before checking permissions. + * On non-idmapped mounts or if permission checking is to be performed on the + * raw inode simply passs init_user_ns. */ -int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode) +int vfs_link(struct dentry *old_dentry, struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *new_dentry, + struct inode **delegated_inode) { struct inode *inode = old_dentry->d_inode; unsigned max_links = dir->i_sb->s_max_links; @@ -4035,7 +4264,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de if (!inode) return -ENOENT; - error = may_create(dir, new_dentry); + error = may_create(mnt_userns, dir, new_dentry); if (error) return error; @@ -4052,7 +4281,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de * be writen back improperly if their true value is unknown to * the vfs. */ - if (HAS_UNMAPPED_ID(inode)) + if (HAS_UNMAPPED_ID(mnt_userns, inode)) return -EPERM; if (!dir->i_op->link) return -EPERM; @@ -4099,6 +4328,7 @@ EXPORT_SYMBOL(vfs_link); static int do_linkat(int olddfd, const char __user *oldname, int newdfd, const char __user *newname, int flags) { + struct user_namespace *mnt_userns; struct dentry *new_dentry; struct path old_path, new_path; struct inode *delegated_inode = NULL; @@ -4134,13 +4364,15 @@ retry: error = -EXDEV; if (old_path.mnt != new_path.mnt) goto out_dput; - error = may_linkat(&old_path); + mnt_userns = mnt_user_ns(new_path.mnt); + error = may_linkat(mnt_userns, &old_path); if (unlikely(error)) goto out_dput; error = security_path_link(old_path.dentry, &new_path, new_dentry); if (error) goto out_dput; - error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode); + error = vfs_link(old_path.dentry, mnt_userns, new_path.dentry->d_inode, + new_dentry, &delegated_inode); out_dput: done_path_create(&new_path, new_dentry); if (delegated_inode) { @@ -4174,12 +4406,14 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname /** * vfs_rename - rename a filesystem object - * @old_dir: parent of source - * @old_dentry: source - * @new_dir: parent of destination - * @new_dentry: destination - * @delegated_inode: returns an inode needing a delegation break - * @flags: rename flags + * @old_mnt_userns: old user namespace of the mount the inode was found from + * @old_dir: parent of source + * @old_dentry: source + * @new_mnt_userns: new user namespace of the mount the inode was found from + * @new_dir: parent of destination + * @new_dentry: destination + * @delegated_inode: returns an inode needing a delegation break + * @flags: rename flags * * The caller must hold multiple mutexes--see lock_rename()). * @@ -4222,11 +4456,14 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname * ->i_mutex on parents, which works but leads to some truly excessive * locking]. */ -int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - struct inode **delegated_inode, unsigned int flags) +int vfs_rename(struct renamedata *rd) { int error; + struct inode *old_dir = rd->old_dir, *new_dir = rd->new_dir; + struct dentry *old_dentry = rd->old_dentry; + struct dentry *new_dentry = rd->new_dentry; + struct inode **delegated_inode = rd->delegated_inode; + unsigned int flags = rd->flags; bool is_dir = d_is_dir(old_dentry); struct inode *source = old_dentry->d_inode; struct inode *target = new_dentry->d_inode; @@ -4237,19 +4474,21 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (source == target) return 0; - error = may_delete(old_dir, old_dentry, is_dir); + error = may_delete(rd->old_mnt_userns, old_dir, old_dentry, is_dir); if (error) return error; if (!target) { - error = may_create(new_dir, new_dentry); + error = may_create(rd->new_mnt_userns, new_dir, new_dentry); } else { new_is_dir = d_is_dir(new_dentry); if (!(flags & RENAME_EXCHANGE)) - error = may_delete(new_dir, new_dentry, is_dir); + error = may_delete(rd->new_mnt_userns, new_dir, + new_dentry, is_dir); else - error = may_delete(new_dir, new_dentry, new_is_dir); + error = may_delete(rd->new_mnt_userns, new_dir, + new_dentry, new_is_dir); } if (error) return error; @@ -4263,12 +4502,14 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, */ if (new_dir != old_dir) { if (is_dir) { - error = inode_permission(source, MAY_WRITE); + error = inode_permission(rd->old_mnt_userns, source, + MAY_WRITE); if (error) return error; } if ((flags & RENAME_EXCHANGE) && new_is_dir) { - error = inode_permission(target, MAY_WRITE); + error = inode_permission(rd->new_mnt_userns, target, + MAY_WRITE); if (error) return error; } @@ -4308,8 +4549,8 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (error) goto out; } - error = old_dir->i_op->rename(old_dir, old_dentry, - new_dir, new_dentry, flags); + error = old_dir->i_op->rename(rd->new_mnt_userns, old_dir, old_dentry, + new_dir, new_dentry, flags); if (error) goto out; @@ -4350,6 +4591,7 @@ EXPORT_SYMBOL(vfs_rename); int do_renameat2(int olddfd, struct filename *from, int newdfd, struct filename *to, unsigned int flags) { + struct renamedata rd; struct dentry *old_dentry, *new_dentry; struct dentry *trap; struct path old_path, new_path; @@ -4453,9 +4695,16 @@ retry_deleg: &new_path, new_dentry, flags); if (error) goto exit5; - error = vfs_rename(old_path.dentry->d_inode, old_dentry, - new_path.dentry->d_inode, new_dentry, - &delegated_inode, flags); + + rd.old_dir = old_path.dentry->d_inode; + rd.old_dentry = old_dentry; + rd.old_mnt_userns = mnt_user_ns(old_path.mnt); + rd.new_dir = new_path.dentry->d_inode; + rd.new_dentry = new_dentry; + rd.new_mnt_userns = mnt_user_ns(new_path.mnt); + rd.delegated_inode = &delegated_inode; + rd.flags = flags; + error = vfs_rename(&rd); exit5: dput(new_dentry); exit4: diff --git a/fs/namespace.c b/fs/namespace.c index 9d33909d0f9e..56bb5a5fdc0d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -25,6 +25,7 @@ #include <linux/proc_ns.h> #include <linux/magic.h> #include <linux/memblock.h> +#include <linux/proc_fs.h> #include <linux/task_work.h> #include <linux/sched/task.h> #include <uapi/linux/mount.h> @@ -73,6 +74,15 @@ static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ +struct mount_kattr { + unsigned int attr_set; + unsigned int attr_clr; + unsigned int propagation; + unsigned int lookup_flags; + bool recurse; + struct user_namespace *mnt_userns; +}; + /* /sys/fs */ struct kobject *fs_kobj; EXPORT_SYMBOL_GPL(fs_kobj); @@ -87,6 +97,16 @@ EXPORT_SYMBOL_GPL(fs_kobj); */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); +static inline void lock_mount_hash(void) +{ + write_seqlock(&mount_lock); +} + +static inline void unlock_mount_hash(void) +{ + write_sequnlock(&mount_lock); +} + static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry) { unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); @@ -210,6 +230,7 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_HLIST_NODE(&mnt->mnt_mp_list); INIT_LIST_HEAD(&mnt->mnt_umounting); INIT_HLIST_HEAD(&mnt->mnt_stuck_children); + mnt->mnt.mnt_userns = &init_user_ns; } return mnt; @@ -360,50 +381,36 @@ int mnt_want_write(struct vfsmount *m) EXPORT_SYMBOL_GPL(mnt_want_write); /** - * mnt_clone_write - get write access to a mount - * @mnt: the mount on which to take a write - * - * This is effectively like mnt_want_write, except - * it must only be used to take an extra write reference - * on a mountpoint that we already know has a write reference - * on it. This allows some optimisation. - * - * After finished, mnt_drop_write must be called as usual to - * drop the reference. - */ -int mnt_clone_write(struct vfsmount *mnt) -{ - /* superblock may be r/o */ - if (__mnt_is_readonly(mnt)) - return -EROFS; - preempt_disable(); - mnt_inc_writers(real_mount(mnt)); - preempt_enable(); - return 0; -} -EXPORT_SYMBOL_GPL(mnt_clone_write); - -/** * __mnt_want_write_file - get write access to a file's mount * @file: the file who's mount on which to take a write * - * This is like __mnt_want_write, but it takes a file and can - * do some optimisations if the file is open for write already + * This is like __mnt_want_write, but if the file is already open for writing it + * skips incrementing mnt_writers (since the open file already has a reference) + * and instead only does the check for emergency r/o remounts. This must be + * paired with __mnt_drop_write_file. */ int __mnt_want_write_file(struct file *file) { - if (!(file->f_mode & FMODE_WRITER)) - return __mnt_want_write(file->f_path.mnt); - else - return mnt_clone_write(file->f_path.mnt); + if (file->f_mode & FMODE_WRITER) { + /* + * Superblock may have become readonly while there are still + * writable fd's, e.g. due to a fs error with errors=remount-ro + */ + if (__mnt_is_readonly(file->f_path.mnt)) + return -EROFS; + return 0; + } + return __mnt_want_write(file->f_path.mnt); } /** * mnt_want_write_file - get write access to a file's mount * @file: the file who's mount on which to take a write * - * This is like mnt_want_write, but it takes a file and can - * do some optimisations if the file is open for write already + * This is like mnt_want_write, but if the file is already open for writing it + * skips incrementing mnt_writers (since the open file already has a reference) + * and instead only does the freeze protection and the check for emergency r/o + * remounts. This must be paired with mnt_drop_write_file. */ int mnt_want_write_file(struct file *file) { @@ -449,7 +456,8 @@ EXPORT_SYMBOL_GPL(mnt_drop_write); void __mnt_drop_write_file(struct file *file) { - __mnt_drop_write(file->f_path.mnt); + if (!(file->f_mode & FMODE_WRITER)) + __mnt_drop_write(file->f_path.mnt); } void mnt_drop_write_file(struct file *file) @@ -459,11 +467,8 @@ void mnt_drop_write_file(struct file *file) } EXPORT_SYMBOL(mnt_drop_write_file); -static int mnt_make_readonly(struct mount *mnt) +static inline int mnt_hold_writers(struct mount *mnt) { - int ret = 0; - - lock_mount_hash(); mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; /* * After storing MNT_WRITE_HOLD, we'll read the counters. This store @@ -488,25 +493,30 @@ static int mnt_make_readonly(struct mount *mnt) * we're counting up here. */ if (mnt_get_writers(mnt) > 0) - ret = -EBUSY; - else - mnt->mnt.mnt_flags |= MNT_READONLY; + return -EBUSY; + + return 0; +} + +static inline void mnt_unhold_writers(struct mount *mnt) +{ /* * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers * that become unheld will see MNT_READONLY. */ smp_wmb(); mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; - unlock_mount_hash(); - return ret; } -static int __mnt_unmake_readonly(struct mount *mnt) +static int mnt_make_readonly(struct mount *mnt) { - lock_mount_hash(); - mnt->mnt.mnt_flags &= ~MNT_READONLY; - unlock_mount_hash(); - return 0; + int ret; + + ret = mnt_hold_writers(mnt); + if (!ret) + mnt->mnt.mnt_flags |= MNT_READONLY; + mnt_unhold_writers(mnt); + return ret; } int sb_prepare_remount_readonly(struct super_block *sb) @@ -547,6 +557,11 @@ int sb_prepare_remount_readonly(struct super_block *sb) static void free_vfsmnt(struct mount *mnt) { + struct user_namespace *mnt_userns; + + mnt_userns = mnt_user_ns(&mnt->mnt); + if (mnt_userns != &init_user_ns) + put_user_ns(mnt_userns); kfree_const(mnt->mnt_devname); #ifdef CONFIG_SMP free_percpu(mnt->mnt_pcp); @@ -1055,6 +1070,9 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); atomic_inc(&sb->s_active); + mnt->mnt.mnt_userns = mnt_user_ns(&old->mnt); + if (mnt->mnt.mnt_userns != &init_user_ns) + mnt->mnt.mnt_userns = get_user_ns(mnt->mnt.mnt_userns); mnt->mnt.mnt_sb = sb; mnt->mnt.mnt_root = dget(root); mnt->mnt_mountpoint = mnt->mnt.mnt_root; @@ -2514,20 +2532,15 @@ static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags) if (readonly_request) return mnt_make_readonly(mnt); - return __mnt_unmake_readonly(mnt); + mnt->mnt.mnt_flags &= ~MNT_READONLY; + return 0; } -/* - * Update the user-settable attributes on a mount. The caller must hold - * sb->s_umount for writing. - */ static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags) { - lock_mount_hash(); mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK; mnt->mnt.mnt_flags = mnt_flags; touch_mnt_namespace(mnt->mnt_ns); - unlock_mount_hash(); } static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt) @@ -2572,11 +2585,17 @@ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags) if (!can_change_locked_flags(mnt, mnt_flags)) return -EPERM; - down_write(&sb->s_umount); + /* + * We're only checking whether the superblock is read-only not + * changing it, so only take down_read(&sb->s_umount). + */ + down_read(&sb->s_umount); + lock_mount_hash(); ret = change_mount_ro_state(mnt, mnt_flags); if (ret == 0) set_mount_attributes(mnt, mnt_flags); - up_write(&sb->s_umount); + unlock_mount_hash(); + up_read(&sb->s_umount); mnt_warn_timestamp_expiry(path, &mnt->mnt); @@ -2616,8 +2635,11 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags, err = -EPERM; if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { err = reconfigure_super(fc); - if (!err) + if (!err) { + lock_mount_hash(); set_mount_attributes(mnt, mnt_flags); + unlock_mount_hash(); + } } up_write(&sb->s_umount); } @@ -3440,6 +3462,33 @@ out_type: return ret; } +#define FSMOUNT_VALID_FLAGS \ + (MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | \ + MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME) + +#define MOUNT_SETATTR_VALID_FLAGS (FSMOUNT_VALID_FLAGS | MOUNT_ATTR_IDMAP) + +#define MOUNT_SETATTR_PROPAGATION_FLAGS \ + (MS_UNBINDABLE | MS_PRIVATE | MS_SLAVE | MS_SHARED) + +static unsigned int attr_flags_to_mnt_flags(u64 attr_flags) +{ + unsigned int mnt_flags = 0; + + if (attr_flags & MOUNT_ATTR_RDONLY) + mnt_flags |= MNT_READONLY; + if (attr_flags & MOUNT_ATTR_NOSUID) + mnt_flags |= MNT_NOSUID; + if (attr_flags & MOUNT_ATTR_NODEV) + mnt_flags |= MNT_NODEV; + if (attr_flags & MOUNT_ATTR_NOEXEC) + mnt_flags |= MNT_NOEXEC; + if (attr_flags & MOUNT_ATTR_NODIRATIME) + mnt_flags |= MNT_NODIRATIME; + + return mnt_flags; +} + /* * Create a kernel mount representation for a new, prepared superblock * (specified by fs_fd) and attach to an open_tree-like file descriptor. @@ -3462,24 +3511,10 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, if ((flags & ~(FSMOUNT_CLOEXEC)) != 0) return -EINVAL; - if (attr_flags & ~(MOUNT_ATTR_RDONLY | - MOUNT_ATTR_NOSUID | - MOUNT_ATTR_NODEV | - MOUNT_ATTR_NOEXEC | - MOUNT_ATTR__ATIME | - MOUNT_ATTR_NODIRATIME)) + if (attr_flags & ~FSMOUNT_VALID_FLAGS) return -EINVAL; - if (attr_flags & MOUNT_ATTR_RDONLY) - mnt_flags |= MNT_READONLY; - if (attr_flags & MOUNT_ATTR_NOSUID) - mnt_flags |= MNT_NOSUID; - if (attr_flags & MOUNT_ATTR_NODEV) - mnt_flags |= MNT_NODEV; - if (attr_flags & MOUNT_ATTR_NOEXEC) - mnt_flags |= MNT_NOEXEC; - if (attr_flags & MOUNT_ATTR_NODIRATIME) - mnt_flags |= MNT_NODIRATIME; + mnt_flags = attr_flags_to_mnt_flags(attr_flags); switch (attr_flags & MOUNT_ATTR__ATIME) { case MOUNT_ATTR_STRICTATIME: @@ -3787,6 +3822,362 @@ out0: return error; } +static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt) +{ + unsigned int flags = mnt->mnt.mnt_flags; + + /* flags to clear */ + flags &= ~kattr->attr_clr; + /* flags to raise */ + flags |= kattr->attr_set; + + return flags; +} + +static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) +{ + struct vfsmount *m = &mnt->mnt; + + if (!kattr->mnt_userns) + return 0; + + /* + * Once a mount has been idmapped we don't allow it to change its + * mapping. It makes things simpler and callers can just create + * another bind-mount they can idmap if they want to. + */ + if (mnt_user_ns(m) != &init_user_ns) + return -EPERM; + + /* The underlying filesystem doesn't support idmapped mounts yet. */ + if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP)) + return -EINVAL; + + /* We're not controlling the superblock. */ + if (!ns_capable(m->mnt_sb->s_user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + /* Mount has already been visible in the filesystem hierarchy. */ + if (!is_anon_ns(mnt->mnt_ns)) + return -EINVAL; + + return 0; +} + +static struct mount *mount_setattr_prepare(struct mount_kattr *kattr, + struct mount *mnt, int *err) +{ + struct mount *m = mnt, *last = NULL; + + if (!is_mounted(&m->mnt)) { + *err = -EINVAL; + goto out; + } + + if (!(mnt_has_parent(m) ? check_mnt(m) : is_anon_ns(m->mnt_ns))) { + *err = -EINVAL; + goto out; + } + + do { + unsigned int flags; + + flags = recalc_flags(kattr, m); + if (!can_change_locked_flags(m, flags)) { + *err = -EPERM; + goto out; + } + + *err = can_idmap_mount(kattr, m); + if (*err) + goto out; + + last = m; + + if ((kattr->attr_set & MNT_READONLY) && + !(m->mnt.mnt_flags & MNT_READONLY)) { + *err = mnt_hold_writers(m); + if (*err) + goto out; + } + } while (kattr->recurse && (m = next_mnt(m, mnt))); + +out: + return last; +} + +static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) +{ + struct user_namespace *mnt_userns; + + if (!kattr->mnt_userns) + return; + + mnt_userns = get_user_ns(kattr->mnt_userns); + /* Pairs with smp_load_acquire() in mnt_user_ns(). */ + smp_store_release(&mnt->mnt.mnt_userns, mnt_userns); +} + +static void mount_setattr_commit(struct mount_kattr *kattr, + struct mount *mnt, struct mount *last, + int err) +{ + struct mount *m = mnt; + + do { + if (!err) { + unsigned int flags; + + do_idmap_mount(kattr, m); + flags = recalc_flags(kattr, m); + WRITE_ONCE(m->mnt.mnt_flags, flags); + } + + /* + * We either set MNT_READONLY above so make it visible + * before ~MNT_WRITE_HOLD or we failed to recursively + * apply mount options. + */ + if ((kattr->attr_set & MNT_READONLY) && + (m->mnt.mnt_flags & MNT_WRITE_HOLD)) + mnt_unhold_writers(m); + + if (!err && kattr->propagation) + change_mnt_propagation(m, kattr->propagation); + + /* + * On failure, only cleanup until we found the first mount + * we failed to handle. + */ + if (err && m == last) + break; + } while (kattr->recurse && (m = next_mnt(m, mnt))); + + if (!err) + touch_mnt_namespace(mnt->mnt_ns); +} + +static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) +{ + struct mount *mnt = real_mount(path->mnt), *last = NULL; + int err = 0; + + if (path->dentry != mnt->mnt.mnt_root) + return -EINVAL; + + if (kattr->propagation) { + /* + * Only take namespace_lock() if we're actually changing + * propagation. + */ + namespace_lock(); + if (kattr->propagation == MS_SHARED) { + err = invent_group_ids(mnt, kattr->recurse); + if (err) { + namespace_unlock(); + return err; + } + } + } + + lock_mount_hash(); + + /* + * Get the mount tree in a shape where we can change mount + * properties without failure. + */ + last = mount_setattr_prepare(kattr, mnt, &err); + if (last) /* Commit all changes or revert to the old state. */ + mount_setattr_commit(kattr, mnt, last, err); + + unlock_mount_hash(); + + if (kattr->propagation) { + namespace_unlock(); + if (err) + cleanup_group_ids(mnt, NULL); + } + + return err; +} + +static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, + struct mount_kattr *kattr, unsigned int flags) +{ + int err = 0; + struct ns_common *ns; + struct user_namespace *mnt_userns; + struct file *file; + + if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP)) + return 0; + + /* + * We currently do not support clearing an idmapped mount. If this ever + * is a use-case we can revisit this but for now let's keep it simple + * and not allow it. + */ + if (attr->attr_clr & MOUNT_ATTR_IDMAP) + return -EINVAL; + + if (attr->userns_fd > INT_MAX) + return -EINVAL; + + file = fget(attr->userns_fd); + if (!file) + return -EBADF; + + if (!proc_ns_file(file)) { + err = -EINVAL; + goto out_fput; + } + + ns = get_proc_ns(file_inode(file)); + if (ns->ops->type != CLONE_NEWUSER) { + err = -EINVAL; + goto out_fput; + } + + /* + * The init_user_ns is used to indicate that a vfsmount is not idmapped. + * This is simpler than just having to treat NULL as unmapped. Users + * wanting to idmap a mount to init_user_ns can just use a namespace + * with an identity mapping. + */ + mnt_userns = container_of(ns, struct user_namespace, ns); + if (mnt_userns == &init_user_ns) { + err = -EPERM; + goto out_fput; + } + kattr->mnt_userns = get_user_ns(mnt_userns); + +out_fput: + fput(file); + return err; +} + +static int build_mount_kattr(const struct mount_attr *attr, size_t usize, + struct mount_kattr *kattr, unsigned int flags) +{ + unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; + + if (flags & AT_NO_AUTOMOUNT) + lookup_flags &= ~LOOKUP_AUTOMOUNT; + if (flags & AT_SYMLINK_NOFOLLOW) + lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + + *kattr = (struct mount_kattr) { + .lookup_flags = lookup_flags, + .recurse = !!(flags & AT_RECURSIVE), + }; + + if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS) + return -EINVAL; + if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1) + return -EINVAL; + kattr->propagation = attr->propagation; + + if ((attr->attr_set | attr->attr_clr) & ~MOUNT_SETATTR_VALID_FLAGS) + return -EINVAL; + + kattr->attr_set = attr_flags_to_mnt_flags(attr->attr_set); + kattr->attr_clr = attr_flags_to_mnt_flags(attr->attr_clr); + + /* + * Since the MOUNT_ATTR_<atime> values are an enum, not a bitmap, + * users wanting to transition to a different atime setting cannot + * simply specify the atime setting in @attr_set, but must also + * specify MOUNT_ATTR__ATIME in the @attr_clr field. + * So ensure that MOUNT_ATTR__ATIME can't be partially set in + * @attr_clr and that @attr_set can't have any atime bits set if + * MOUNT_ATTR__ATIME isn't set in @attr_clr. + */ + if (attr->attr_clr & MOUNT_ATTR__ATIME) { + if ((attr->attr_clr & MOUNT_ATTR__ATIME) != MOUNT_ATTR__ATIME) + return -EINVAL; + + /* + * Clear all previous time settings as they are mutually + * exclusive. + */ + kattr->attr_clr |= MNT_RELATIME | MNT_NOATIME; + switch (attr->attr_set & MOUNT_ATTR__ATIME) { + case MOUNT_ATTR_RELATIME: + kattr->attr_set |= MNT_RELATIME; + break; + case MOUNT_ATTR_NOATIME: + kattr->attr_set |= MNT_NOATIME; + break; + case MOUNT_ATTR_STRICTATIME: + break; + default: + return -EINVAL; + } + } else { + if (attr->attr_set & MOUNT_ATTR__ATIME) + return -EINVAL; + } + + return build_mount_idmapped(attr, usize, kattr, flags); +} + +static void finish_mount_kattr(struct mount_kattr *kattr) +{ + put_user_ns(kattr->mnt_userns); + kattr->mnt_userns = NULL; +} + +SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, + unsigned int, flags, struct mount_attr __user *, uattr, + size_t, usize) +{ + int err; + struct path target; + struct mount_attr attr; + struct mount_kattr kattr; + + BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0); + + if (flags & ~(AT_EMPTY_PATH | + AT_RECURSIVE | + AT_SYMLINK_NOFOLLOW | + AT_NO_AUTOMOUNT)) + return -EINVAL; + + if (unlikely(usize > PAGE_SIZE)) + return -E2BIG; + if (unlikely(usize < MOUNT_ATTR_SIZE_VER0)) + return -EINVAL; + + if (!may_mount()) + return -EPERM; + + err = copy_struct_from_user(&attr, sizeof(attr), uattr, usize); + if (err) + return err; + + /* Don't bother walking through the mounts if this is a nop. */ + if (attr.attr_set == 0 && + attr.attr_clr == 0 && + attr.propagation == 0) + return 0; + + err = build_mount_kattr(&attr, usize, &kattr, flags); + if (err) + return err; + + err = user_path_at(dfd, path, kattr.lookup_flags, &target); + if (err) + return err; + + err = do_mount_setattr(&target, &kattr); + finish_mount_kattr(&kattr); + path_put(&target); + return err; +} + static void __init init_mount_tree(void) { struct vfsmount *mnt; diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 1a96ce28efb0..fe860c538747 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -115,13 +115,13 @@ bl_submit_bio(struct bio *bio) return NULL; } -static struct bio * -bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector, +static struct bio *bl_alloc_init_bio(unsigned int npg, + struct block_device *bdev, sector_t disk_sector, bio_end_io_t end_io, struct parallel_io *par) { struct bio *bio; - npg = min(npg, BIO_MAX_PAGES); + npg = bio_max_segs(npg); bio = bio_alloc(GFP_NOIO, npg); if (bio) { bio->bi_iter.bi_sector = disk_sector; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index ef827ae193d2..19a9f434442f 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2095,8 +2095,8 @@ EXPORT_SYMBOL_GPL(nfs_instantiate); * that the operation succeeded on the server, but an error in the * reply path made it appear to have failed. */ -int nfs_create(struct inode *dir, struct dentry *dentry, - umode_t mode, bool excl) +int nfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct iattr attr; int open_flags = excl ? O_CREAT | O_EXCL : O_CREAT; @@ -2124,7 +2124,8 @@ EXPORT_SYMBOL_GPL(nfs_create); * See comments for nfs_proc_create regarding failed operations. */ int -nfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) +nfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct iattr attr; int status; @@ -2150,7 +2151,8 @@ EXPORT_SYMBOL_GPL(nfs_mknod); /* * See comments for nfs_proc_create regarding failed operations. */ -int nfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +int nfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct iattr attr; int error; @@ -2295,7 +2297,8 @@ EXPORT_SYMBOL_GPL(nfs_unlink); * now have a new file handle and can instantiate an in-core NFS inode * and move the raw page into its mapping. */ -int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +int nfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { struct page *page; char *kaddr; @@ -2398,9 +2401,9 @@ EXPORT_SYMBOL_GPL(nfs_link); * If these conditions are met, we can drop the dentries before doing * the rename. */ -int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); @@ -2939,7 +2942,9 @@ static int nfs_execute_ok(struct inode *inode, int mask) return ret; } -int nfs_permission(struct inode *inode, int mask) +int nfs_permission(struct user_namespace *mnt_userns, + struct inode *inode, + int mask) { const struct cred *cred = current_cred(); int res = 0; @@ -2987,7 +2992,7 @@ out_notsup: res = nfs_revalidate_inode(NFS_SERVER(inode), inode); if (res == 0) - res = generic_permission(inode, mask); + res = generic_permission(&init_user_ns, inode, mask); goto out; } EXPORT_SYMBOL_GPL(nfs_permission); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 63940a7a70be..16ad5050e046 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -89,7 +89,7 @@ nfs_file_release(struct inode *inode, struct file *filp) EXPORT_SYMBOL_GPL(nfs_file_release); /** - * nfs_revalidate_size - Revalidate the file size + * nfs_revalidate_file_size - Revalidate the file size * @inode: pointer to inode struct * @filp: pointer to struct file * @@ -606,8 +606,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - unsigned long written = 0; - ssize_t result; + unsigned int mntflags = NFS_SERVER(inode)->flags; + ssize_t result, written; errseq_t since; int error; @@ -626,13 +626,13 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) /* * O_APPEND implies that we must revalidate the file length. */ - if (iocb->ki_flags & IOCB_APPEND) { + if (iocb->ki_flags & IOCB_APPEND || iocb->ki_pos > i_size_read(inode)) { result = nfs_revalidate_file_size(inode, file); if (result) goto out; } - if (iocb->ki_pos > i_size_read(inode)) - nfs_revalidate_mapping(inode, file->f_mapping); + + nfs_clear_invalid_mapping(file->f_mapping); since = filemap_sample_wb_err(file->f_mapping); nfs_start_io_write(inode); @@ -648,6 +648,21 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) written = result; iocb->ki_pos += written; + + if (mntflags & NFS_MOUNT_WRITE_EAGER) { + result = filemap_fdatawrite_range(file->f_mapping, + iocb->ki_pos - written, + iocb->ki_pos - 1); + if (result < 0) + goto out; + } + if (mntflags & NFS_MOUNT_WRITE_WAIT) { + result = filemap_fdatawait_range(file->f_mapping, + iocb->ki_pos - written, + iocb->ki_pos - 1); + if (result < 0) + goto out; + } result = generic_write_sync(iocb, written); if (result < 0) goto out; diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 06894bcdea2d..971a9251c1d9 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -82,6 +82,7 @@ enum nfs_param { Opt_v, Opt_vers, Opt_wsize, + Opt_write, }; enum { @@ -113,6 +114,19 @@ static const struct constant_table nfs_param_enums_lookupcache[] = { {} }; +enum { + Opt_write_lazy, + Opt_write_eager, + Opt_write_wait, +}; + +static const struct constant_table nfs_param_enums_write[] = { + { "lazy", Opt_write_lazy }, + { "eager", Opt_write_eager }, + { "wait", Opt_write_wait }, + {} +}; + static const struct fs_parameter_spec nfs_fs_parameters[] = { fsparam_flag_no("ac", Opt_ac), fsparam_u32 ("acdirmax", Opt_acdirmax), @@ -171,6 +185,7 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = { fsparam_flag ("v4.1", Opt_v), fsparam_flag ("v4.2", Opt_v), fsparam_string("vers", Opt_vers), + fsparam_enum ("write", Opt_write, nfs_param_enums_write), fsparam_u32 ("wsize", Opt_wsize), {} }; @@ -770,6 +785,24 @@ static int nfs_fs_context_parse_param(struct fs_context *fc, goto out_invalid_value; } break; + case Opt_write: + switch (result.uint_32) { + case Opt_write_lazy: + ctx->flags &= + ~(NFS_MOUNT_WRITE_EAGER | NFS_MOUNT_WRITE_WAIT); + break; + case Opt_write_eager: + ctx->flags |= NFS_MOUNT_WRITE_EAGER; + ctx->flags &= ~NFS_MOUNT_WRITE_WAIT; + break; + case Opt_write_wait: + ctx->flags |= + NFS_MOUNT_WRITE_EAGER | NFS_MOUNT_WRITE_WAIT; + break; + default: + goto out_invalid_value; + } + break; /* * Special options @@ -1479,6 +1512,8 @@ static int nfs_init_fs_context(struct fs_context *fc) ctx->selected_flavor = RPC_AUTH_MAXFLAVOR; ctx->minorversion = 0; ctx->need_mount = true; + + fc->s_iflags |= SB_I_STABLE_WRITES; } fc->fs_private = ctx; fc->ops = &nfs_fs_context_ops; diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index a60df88efc40..c4c021c6ebbd 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -390,10 +390,6 @@ static void nfs_readpage_from_fscache_complete(struct page *page, if (!error) { SetPageUptodate(page); unlock_page(page); - } else { - error = nfs_readpage_async(context, page->mapping->host, page); - if (error) - unlock_page(page); } } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 522aa10a1a3e..749bbea14d99 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -195,6 +195,18 @@ bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags) } EXPORT_SYMBOL_GPL(nfs_check_cache_invalid); +#ifdef CONFIG_NFS_V4_2 +static bool nfs_has_xattr_cache(const struct nfs_inode *nfsi) +{ + return nfsi->xattr_cache != NULL; +} +#else +static bool nfs_has_xattr_cache(const struct nfs_inode *nfsi) +{ + return false; +} +#endif + static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) { struct nfs_inode *nfsi = NFS_I(inode); @@ -209,6 +221,8 @@ static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) | NFS_INO_INVALID_XATTR); } + if (!nfs_has_xattr_cache(nfsi)) + flags &= ~NFS_INO_INVALID_XATTR; if (inode->i_mapping->nrpages == 0) flags &= ~(NFS_INO_INVALID_DATA|NFS_INO_DATA_INVAL_DEFER); nfsi->cache_validity |= flags; @@ -594,7 +608,8 @@ EXPORT_SYMBOL_GPL(nfs_fhget); #define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE|ATTR_OPEN) int -nfs_setattr(struct dentry *dentry, struct iattr *attr) +nfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct inode *inode = d_inode(dentry); struct nfs_fattr *fattr; @@ -787,8 +802,8 @@ static bool nfs_need_revalidate_inode(struct inode *inode) return false; } -int nfs_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags) +int nfs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct nfs_server *server = NFS_SERVER(inode); @@ -857,7 +872,7 @@ out_no_revalidate: /* Only return attributes that were revalidated. */ stat->result_mask &= request_mask; out_no_update: - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); if (S_ISDIR(inode->i_mode)) stat->blksize = NFS_SERVER(inode)->dtsize; @@ -1257,55 +1272,19 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map return 0; } -bool nfs_mapping_need_revalidate_inode(struct inode *inode) -{ - return nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE) || - NFS_STALE(inode); -} - -int nfs_revalidate_mapping_rcu(struct inode *inode) -{ - struct nfs_inode *nfsi = NFS_I(inode); - unsigned long *bitlock = &nfsi->flags; - int ret = 0; - - if (IS_SWAPFILE(inode)) - goto out; - if (nfs_mapping_need_revalidate_inode(inode)) { - ret = -ECHILD; - goto out; - } - spin_lock(&inode->i_lock); - if (test_bit(NFS_INO_INVALIDATING, bitlock) || - (nfsi->cache_validity & NFS_INO_INVALID_DATA)) - ret = -ECHILD; - spin_unlock(&inode->i_lock); -out: - return ret; -} - /** - * nfs_revalidate_mapping - Revalidate the pagecache - * @inode: pointer to host inode + * nfs_clear_invalid_mapping - Conditionally clear a mapping * @mapping: pointer to mapping + * + * If the NFS_INO_INVALID_DATA inode flag is set, clear the mapping. */ -int nfs_revalidate_mapping(struct inode *inode, - struct address_space *mapping) +int nfs_clear_invalid_mapping(struct address_space *mapping) { + struct inode *inode = mapping->host; struct nfs_inode *nfsi = NFS_I(inode); unsigned long *bitlock = &nfsi->flags; int ret = 0; - /* swapfiles are not supposed to be shared. */ - if (IS_SWAPFILE(inode)) - goto out; - - if (nfs_mapping_need_revalidate_inode(inode)) { - ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); - if (ret < 0) - goto out; - } - /* * We must clear NFS_INO_INVALID_DATA first to ensure that * invalidations that come in while we're shooting down the mappings @@ -1336,8 +1315,8 @@ int nfs_revalidate_mapping(struct inode *inode, set_bit(NFS_INO_INVALIDATING, bitlock); smp_wmb(); - nfsi->cache_validity &= ~(NFS_INO_INVALID_DATA| - NFS_INO_DATA_INVAL_DEFER); + nfsi->cache_validity &= + ~(NFS_INO_INVALID_DATA | NFS_INO_DATA_INVAL_DEFER); spin_unlock(&inode->i_lock); trace_nfs_invalidate_mapping_enter(inode); ret = nfs_invalidate_mapping(inode, mapping); @@ -1350,6 +1329,53 @@ out: return ret; } +bool nfs_mapping_need_revalidate_inode(struct inode *inode) +{ + return nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE) || + NFS_STALE(inode); +} + +int nfs_revalidate_mapping_rcu(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + unsigned long *bitlock = &nfsi->flags; + int ret = 0; + + if (IS_SWAPFILE(inode)) + goto out; + if (nfs_mapping_need_revalidate_inode(inode)) { + ret = -ECHILD; + goto out; + } + spin_lock(&inode->i_lock); + if (test_bit(NFS_INO_INVALIDATING, bitlock) || + (nfsi->cache_validity & NFS_INO_INVALID_DATA)) + ret = -ECHILD; + spin_unlock(&inode->i_lock); +out: + return ret; +} + +/** + * nfs_revalidate_mapping - Revalidate the pagecache + * @inode: pointer to host inode + * @mapping: pointer to mapping + */ +int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) +{ + /* swapfiles are not supposed to be shared. */ + if (IS_SWAPFILE(inode)) + return 0; + + if (nfs_mapping_need_revalidate_inode(inode)) { + int ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (ret < 0) + return ret; + } + + return nfs_clear_invalid_mapping(mapping); +} + static bool nfs_file_has_writers(struct nfs_inode *nfsi) { struct inode *inode = &nfsi->vfs_inode; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 62d3189745cd..25fb43b69e5a 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -378,14 +378,18 @@ extern unsigned long nfs_access_cache_count(struct shrinker *shrink, extern unsigned long nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc); struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int); -int nfs_create(struct inode *, struct dentry *, umode_t, bool); -int nfs_mkdir(struct inode *, struct dentry *, umode_t); +int nfs_create(struct user_namespace *, struct inode *, struct dentry *, + umode_t, bool); +int nfs_mkdir(struct user_namespace *, struct inode *, struct dentry *, + umode_t); int nfs_rmdir(struct inode *, struct dentry *); int nfs_unlink(struct inode *, struct dentry *); -int nfs_symlink(struct inode *, struct dentry *, const char *); +int nfs_symlink(struct user_namespace *, struct inode *, struct dentry *, + const char *); int nfs_link(struct dentry *, struct inode *, struct dentry *); -int nfs_mknod(struct inode *, struct dentry *, umode_t, dev_t); -int nfs_rename(struct inode *, struct dentry *, +int nfs_mknod(struct user_namespace *, struct inode *, struct dentry *, umode_t, + dev_t); +int nfs_rename(struct user_namespace *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); /* file.c */ diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 2bcbe38afe2e..93e60e921f92 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -208,20 +208,23 @@ out_fc: } static int -nfs_namespace_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags) +nfs_namespace_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) { if (NFS_FH(d_inode(path->dentry))->size != 0) - return nfs_getattr(path, stat, request_mask, query_flags); - generic_fillattr(d_inode(path->dentry), stat); + return nfs_getattr(mnt_userns, path, stat, request_mask, + query_flags); + generic_fillattr(&init_user_ns, d_inode(path->dentry), stat); return 0; } static int -nfs_namespace_setattr(struct dentry *dentry, struct iattr *attr) +nfs_namespace_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { if (NFS_FH(d_inode(dentry))->size != 0) - return nfs_setattr(dentry, attr); + return nfs_setattr(mnt_userns, dentry, attr); return -EACCES; } diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h index 1b950b66b3bb..c8a192802dda 100644 --- a/fs/nfs/nfs3_fs.h +++ b/fs/nfs/nfs3_fs.h @@ -12,7 +12,8 @@ */ #ifdef CONFIG_NFS_V3_ACL extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type); -extern int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int nfs3_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, struct posix_acl *dfacl); extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t); diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index c6c863382f37..bb386a691e69 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -111,6 +111,7 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type) fallthrough; case -ENOTSUPP: status = -EOPNOTSUPP; + goto getout; default: goto getout; } @@ -251,7 +252,8 @@ int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, } -int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type) +int nfs3_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { struct posix_acl *orig = acl, *dfacl = NULL, *alloc; int status; diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 86acffe7335c..889a9f4c0310 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -609,6 +609,7 @@ found: * changed. Schedule recovery! */ nfs4_schedule_path_down_recovery(pos); + goto out; default: goto out; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 2f4679a62712..74bc5120013d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -71,10 +71,6 @@ #include "nfs4trace.h" -#ifdef CONFIG_NFS_V4_2 -#include "nfs42.h" -#endif /* CONFIG_NFS_V4_2 */ - #define NFSDBG_FACILITY NFSDBG_PROC #define NFS4_BITMASK_SZ 3 @@ -2231,6 +2227,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct default: printk(KERN_ERR "NFS: %s: unhandled error " "%d.\n", __func__, err); + fallthrough; case 0: case -ENOENT: case -EAGAIN: @@ -5438,15 +5435,16 @@ static void nfs4_bitmask_adjust(__u32 *bitmask, struct inode *inode, if (cache_validity & NFS_INO_INVALID_ATIME) bitmask[1] |= FATTR4_WORD1_TIME_ACCESS; - if (cache_validity & NFS_INO_INVALID_ACCESS) - bitmask[0] |= FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | - FATTR4_WORD1_OWNER_GROUP; - if (cache_validity & NFS_INO_INVALID_ACL) - bitmask[0] |= FATTR4_WORD0_ACL; - if (cache_validity & NFS_INO_INVALID_LABEL) + if (cache_validity & NFS_INO_INVALID_OTHER) + bitmask[1] |= FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | + FATTR4_WORD1_OWNER_GROUP | + FATTR4_WORD1_NUMLINKS; + if (label && label->len && cache_validity & NFS_INO_INVALID_LABEL) bitmask[2] |= FATTR4_WORD2_SECURITY_LABEL; - if (cache_validity & NFS_INO_INVALID_CTIME) + if (cache_validity & NFS_INO_INVALID_CHANGE) bitmask[0] |= FATTR4_WORD0_CHANGE; + if (cache_validity & NFS_INO_INVALID_CTIME) + bitmask[1] |= FATTR4_WORD1_TIME_METADATA; if (cache_validity & NFS_INO_INVALID_MTIME) bitmask[1] |= FATTR4_WORD1_TIME_MODIFY; if (cache_validity & NFS_INO_INVALID_SIZE) @@ -7491,6 +7489,7 @@ nfs4_release_lockowner(struct nfs_server *server, struct nfs4_lock_state *lsp) #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *key, const void *buf, size_t buflen, int flags) @@ -7513,6 +7512,7 @@ static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry) #ifdef CONFIG_NFS_V4_SECURITY_LABEL static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *key, const void *buf, size_t buflen, int flags) @@ -7563,6 +7563,7 @@ nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len) #ifdef CONFIG_NFS_V4_2 static int nfs4_xattr_set_nfs4_user(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *key, const void *buf, size_t buflen, int flags) @@ -9705,6 +9706,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) case -NFS4ERR_BADLAYOUT: /* no layout */ case -NFS4ERR_GRACE: /* loca_recalim always false */ task->tk_status = 0; + break; case 0: break; default: diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 4bf10792cb5b..3a51351bdc6a 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1125,6 +1125,7 @@ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid) " sequence-id error on an" " unconfirmed sequence %p!\n", seqid->sequence); + return; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_BAD_STATEID: diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index af64b4e6fd1f..102b66e0bdef 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2875,6 +2875,7 @@ pnfs_do_write(struct nfs_pageio_descriptor *desc, switch (trypnfs) { case PNFS_NOT_ATTEMPTED: pnfs_write_through_mds(desc, hdr); + break; case PNFS_ATTEMPTED: break; case PNFS_TRY_AGAIN: @@ -3019,6 +3020,7 @@ pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) switch (trypnfs) { case PNFS_NOT_ATTEMPTED: pnfs_read_through_mds(desc, hdr); + break; case PNFS_ATTEMPTED: break; case PNFS_TRY_AGAIN: diff --git a/fs/nfs/read.c b/fs/nfs/read.c index eb854f1f86e2..d2b6dce1f99f 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -74,6 +74,24 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, } EXPORT_SYMBOL_GPL(nfs_pageio_init_read); +static void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio, + struct inode *inode) +{ + struct nfs_pgio_mirror *pgm; + unsigned long npages; + + nfs_pageio_complete(pgio); + + /* It doesn't make sense to do mirrored reads! */ + WARN_ON_ONCE(pgio->pg_mirror_count != 1); + + pgm = &pgio->pg_mirrors[0]; + NFS_I(inode)->read_io += pgm->pg_bytes_written; + npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >> PAGE_SHIFT; + nfs_add_stats(inode, NFSIOS_READPAGES, npages); +} + + void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) { struct nfs_pgio_mirror *mirror; @@ -114,41 +132,10 @@ static void nfs_readpage_release(struct nfs_page *req, int error) nfs_release_request(req); } -int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, - struct page *page) -{ - struct nfs_page *new; - unsigned int len; +struct nfs_readdesc { struct nfs_pageio_descriptor pgio; - struct nfs_pgio_mirror *pgm; - - len = nfs_page_length(page); - if (len == 0) - return nfs_return_empty_page(page); - new = nfs_create_request(ctx, page, 0, len); - if (IS_ERR(new)) { - unlock_page(page); - return PTR_ERR(new); - } - if (len < PAGE_SIZE) - zero_user_segment(page, len, PAGE_SIZE); - - nfs_pageio_init_read(&pgio, inode, false, - &nfs_async_read_completion_ops); - if (!nfs_pageio_add_request(&pgio, new)) { - nfs_list_remove_request(new); - nfs_readpage_release(new, pgio.pg_error); - } - nfs_pageio_complete(&pgio); - - /* It doesn't make sense to do mirrored reads! */ - WARN_ON_ONCE(pgio.pg_mirror_count != 1); - - pgm = &pgio.pg_mirrors[0]; - NFS_I(inode)->read_io += pgm->pg_bytes_written; - - return pgio.pg_error < 0 ? pgio.pg_error : 0; -} + struct nfs_open_context *ctx; +}; static void nfs_page_group_set_uptodate(struct nfs_page *req) { @@ -171,8 +158,7 @@ static void nfs_read_completion(struct nfs_pgio_header *hdr) if (test_bit(NFS_IOHDR_EOF, &hdr->flags)) { /* note: regions of the page not covered by a - * request are zeroed in nfs_readpage_async / - * readpage_async_filler */ + * request are zeroed in readpage_async_filler */ if (bytes > hdr->good_bytes) { /* nothing in this request was good, so zero * the full extent of the request */ @@ -304,6 +290,38 @@ static void nfs_readpage_result(struct rpc_task *task, nfs_readpage_retry(task, hdr); } +static int +readpage_async_filler(void *data, struct page *page) +{ + struct nfs_readdesc *desc = data; + struct nfs_page *new; + unsigned int len; + int error; + + len = nfs_page_length(page); + if (len == 0) + return nfs_return_empty_page(page); + + new = nfs_create_request(desc->ctx, page, 0, len); + if (IS_ERR(new)) + goto out_error; + + if (len < PAGE_SIZE) + zero_user_segment(page, len, PAGE_SIZE); + if (!nfs_pageio_add_request(&desc->pgio, new)) { + nfs_list_remove_request(new); + error = desc->pgio.pg_error; + nfs_readpage_release(new, error); + goto out; + } + return 0; +out_error: + error = PTR_ERR(new); + unlock_page(page); +out: + return error; +} + /* * Read a page over NFS. * We read the page synchronously in the following case: @@ -312,14 +330,13 @@ static void nfs_readpage_result(struct rpc_task *task, */ int nfs_readpage(struct file *file, struct page *page) { - struct nfs_open_context *ctx; + struct nfs_readdesc desc; struct inode *inode = page_file_mapping(page)->host; - int error; + int ret; dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", page, PAGE_SIZE, page_index(page)); nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); - nfs_add_stats(inode, NFSIOS_READPAGES, 1); /* * Try to flush any pending writes to the file.. @@ -328,93 +345,59 @@ int nfs_readpage(struct file *file, struct page *page) * be any new pending writes generated at this point * for this page (other pages can be written to). */ - error = nfs_wb_page(inode, page); - if (error) + ret = nfs_wb_page(inode, page); + if (ret) goto out_unlock; if (PageUptodate(page)) goto out_unlock; - error = -ESTALE; + ret = -ESTALE; if (NFS_STALE(inode)) goto out_unlock; if (file == NULL) { - error = -EBADF; - ctx = nfs_find_open_context(inode, NULL, FMODE_READ); - if (ctx == NULL) + ret = -EBADF; + desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ); + if (desc.ctx == NULL) goto out_unlock; } else - ctx = get_nfs_open_context(nfs_file_open_context(file)); + desc.ctx = get_nfs_open_context(nfs_file_open_context(file)); if (!IS_SYNC(inode)) { - error = nfs_readpage_from_fscache(ctx, inode, page); - if (error == 0) + ret = nfs_readpage_from_fscache(desc.ctx, inode, page); + if (ret == 0) goto out; } - xchg(&ctx->error, 0); - error = nfs_readpage_async(ctx, inode, page); - if (!error) { - error = wait_on_page_locked_killable(page); - if (!PageUptodate(page) && !error) - error = xchg(&ctx->error, 0); - } -out: - put_nfs_open_context(ctx); - return error; -out_unlock: - unlock_page(page); - return error; -} - -struct nfs_readdesc { - struct nfs_pageio_descriptor *pgio; - struct nfs_open_context *ctx; -}; - -static int -readpage_async_filler(void *data, struct page *page) -{ - struct nfs_readdesc *desc = (struct nfs_readdesc *)data; - struct nfs_page *new; - unsigned int len; - int error; + xchg(&desc.ctx->error, 0); + nfs_pageio_init_read(&desc.pgio, inode, false, + &nfs_async_read_completion_ops); - len = nfs_page_length(page); - if (len == 0) - return nfs_return_empty_page(page); + ret = readpage_async_filler(&desc, page); - new = nfs_create_request(desc->ctx, page, 0, len); - if (IS_ERR(new)) - goto out_error; + if (!ret) + nfs_pageio_complete_read(&desc.pgio, inode); - if (len < PAGE_SIZE) - zero_user_segment(page, len, PAGE_SIZE); - if (!nfs_pageio_add_request(desc->pgio, new)) { - nfs_list_remove_request(new); - error = desc->pgio->pg_error; - nfs_readpage_release(new, error); - goto out; + ret = desc.pgio.pg_error < 0 ? desc.pgio.pg_error : 0; + if (!ret) { + ret = wait_on_page_locked_killable(page); + if (!PageUptodate(page) && !ret) + ret = xchg(&desc.ctx->error, 0); } - return 0; -out_error: - error = PTR_ERR(new); - unlock_page(page); out: - return error; + put_nfs_open_context(desc.ctx); + return ret; +out_unlock: + unlock_page(page); + return ret; } -int nfs_readpages(struct file *filp, struct address_space *mapping, +int nfs_readpages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { - struct nfs_pageio_descriptor pgio; - struct nfs_pgio_mirror *pgm; - struct nfs_readdesc desc = { - .pgio = &pgio, - }; + struct nfs_readdesc desc; struct inode *inode = mapping->host; - unsigned long npages; - int ret = -ESTALE; + int ret; dprintk("NFS: nfs_readpages (%s/%Lu %d)\n", inode->i_sb->s_id, @@ -422,15 +405,17 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, nr_pages); nfs_inc_stats(inode, NFSIOS_VFSREADPAGES); + ret = -ESTALE; if (NFS_STALE(inode)) goto out; - if (filp == NULL) { + if (file == NULL) { + ret = -EBADF; desc.ctx = nfs_find_open_context(inode, NULL, FMODE_READ); if (desc.ctx == NULL) - return -EBADF; + goto out; } else - desc.ctx = get_nfs_open_context(nfs_file_open_context(filp)); + desc.ctx = get_nfs_open_context(nfs_file_open_context(file)); /* attempt to read as many of the pages as possible from the cache * - this returns -ENOBUFS immediately if the cookie is negative @@ -440,20 +425,13 @@ int nfs_readpages(struct file *filp, struct address_space *mapping, if (ret == 0) goto read_complete; /* all pages were read */ - nfs_pageio_init_read(&pgio, inode, false, + nfs_pageio_init_read(&desc.pgio, inode, false, &nfs_async_read_completion_ops); ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); - nfs_pageio_complete(&pgio); - /* It doesn't make sense to do mirrored reads! */ - WARN_ON_ONCE(pgio.pg_mirror_count != 1); + nfs_pageio_complete_read(&desc.pgio, inode); - pgm = &pgio.pg_mirrors[0]; - NFS_I(inode)->read_io += pgm->pg_bytes_written; - npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >> - PAGE_SHIFT; - nfs_add_stats(inode, NFSIOS_READPAGES, npages); read_complete: put_nfs_open_context(desc.ctx); out: diff --git a/fs/nfs/super.c b/fs/nfs/super.c index c7a924580eec..94885c6f8f54 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -523,6 +523,13 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, seq_puts(m, ",local_lock=flock"); else seq_puts(m, ",local_lock=posix"); + + if (nfss->flags & NFS_MOUNT_WRITE_EAGER) { + if (nfss->flags & NFS_MOUNT_WRITE_WAIT) + seq_puts(m, ",write=wait"); + else + seq_puts(m, ",write=eager"); + } } /* diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 639c34fec04a..82bdcb982186 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -712,16 +712,23 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; struct nfs_pageio_descriptor pgio; - struct nfs_io_completion *ioc; + struct nfs_io_completion *ioc = NULL; + unsigned int mntflags = NFS_SERVER(inode)->flags; + int priority = 0; int err; nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); - ioc = nfs_io_completion_alloc(GFP_KERNEL); - if (ioc) - nfs_io_completion_init(ioc, nfs_io_completion_commit, inode); + if (!(mntflags & NFS_MOUNT_WRITE_EAGER) || wbc->for_kupdate || + wbc->for_background || wbc->for_sync || wbc->for_reclaim) { + ioc = nfs_io_completion_alloc(GFP_KERNEL); + if (ioc) + nfs_io_completion_init(ioc, nfs_io_completion_commit, + inode); + priority = wb_priority(wbc); + } - nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false, + nfs_pageio_init_write(&pgio, inode, priority, false, &nfs_async_write_completion_ops); pgio.pg_io_completion = ioc; err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio); @@ -1278,19 +1285,21 @@ bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode) * the PageUptodate() flag. In this case, we will need to turn off * write optimisations that depend on the page contents being correct. */ -static bool nfs_write_pageuptodate(struct page *page, struct inode *inode) +static bool nfs_write_pageuptodate(struct page *page, struct inode *inode, + unsigned int pagelen) { struct nfs_inode *nfsi = NFS_I(inode); if (nfs_have_delegated_attributes(inode)) goto out; - if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) + if (nfsi->cache_validity & + (NFS_INO_REVAL_PAGECACHE | NFS_INO_INVALID_SIZE)) return false; smp_rmb(); - if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags)) + if (test_bit(NFS_INO_INVALIDATING, &nfsi->flags) && pagelen != 0) return false; out: - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) + if (nfsi->cache_validity & NFS_INO_INVALID_DATA && pagelen != 0) return false; return PageUptodate(page) != 0; } @@ -1310,7 +1319,8 @@ is_whole_file_wrlock(struct file_lock *fl) * If the file is opened for synchronous writes then we can just skip the rest * of the checks. */ -static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode) +static int nfs_can_extend_write(struct file *file, struct page *page, + struct inode *inode, unsigned int pagelen) { int ret; struct file_lock_context *flctx = inode->i_flctx; @@ -1318,7 +1328,7 @@ static int nfs_can_extend_write(struct file *file, struct page *page, struct ino if (file->f_flags & O_DSYNC) return 0; - if (!nfs_write_pageuptodate(page, inode)) + if (!nfs_write_pageuptodate(page, inode, pagelen)) return 0; if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) return 1; @@ -1356,6 +1366,7 @@ int nfs_updatepage(struct file *file, struct page *page, struct nfs_open_context *ctx = nfs_file_open_context(file); struct address_space *mapping = page_file_mapping(page); struct inode *inode = mapping->host; + unsigned int pagelen = nfs_page_length(page); int status = 0; nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); @@ -1366,8 +1377,8 @@ int nfs_updatepage(struct file *file, struct page *page, if (!count) goto out; - if (nfs_can_extend_write(file, page, inode)) { - count = max(count + offset, nfs_page_length(page)); + if (nfs_can_extend_write(file, page, inode, pagelen)) { + count = max(count + offset, pagelen); offset = 0; } diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 7c863f2c21e0..9421dae22737 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -386,8 +386,9 @@ static struct svc_export *svc_export_update(struct svc_export *new, struct svc_export *old); static struct svc_export *svc_export_lookup(struct svc_export *); -static int check_export(struct inode *inode, int *flags, unsigned char *uuid) +static int check_export(struct path *path, int *flags, unsigned char *uuid) { + struct inode *inode = d_inode(path->dentry); /* * We currently export only dirs, regular files, and (for v4 @@ -411,6 +412,7 @@ static int check_export(struct inode *inode, int *flags, unsigned char *uuid) * or an FSID number (so NFSEXP_FSID or ->uuid is needed). * 2: We must be able to find an inode from a filehandle. * This means that s_export_op must be set. + * 3: We must not currently be on an idmapped mount. */ if (!(inode->i_sb->s_type->fs_flags & FS_REQUIRES_DEV) && !(*flags & NFSEXP_FSID) && @@ -425,6 +427,11 @@ static int check_export(struct inode *inode, int *flags, unsigned char *uuid) return -EINVAL; } + if (mnt_user_ns(path->mnt) != &init_user_ns) { + dprintk("exp_export: export of idmapped mounts not yet supported.\n"); + return -EINVAL; + } + if (inode->i_sb->s_export_op->flags & EXPORT_OP_NOSUBTREECHK && !(*flags & NFSEXP_NOSUBTREECHECK)) { dprintk("%s: %s does not support subtree checking!\n", @@ -653,8 +660,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) goto out4; } - err = check_export(d_inode(exp.ex_path.dentry), &exp.ex_flags, - exp.ex_uuid); + err = check_export(&exp.ex_path, &exp.ex_flags, exp.ex_uuid); if (err) goto out4; /* diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index 7eeac5b81c20..855e17772eba 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -113,10 +113,12 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst *rqstp) fh_lock(fh); - error = set_posix_acl(inode, ACL_TYPE_ACCESS, argp->acl_access); + error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS, + argp->acl_access); if (error) goto out_drop_lock; - error = set_posix_acl(inode, ACL_TYPE_DEFAULT, argp->acl_default); + error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_DEFAULT, + argp->acl_default); if (error) goto out_drop_lock; diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index a568b842e9eb..9a6f18d74d14 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -103,10 +103,12 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst *rqstp) fh_lock(fh); - error = set_posix_acl(inode, ACL_TYPE_ACCESS, argp->acl_access); + error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS, + argp->acl_access); if (error) goto out_drop_lock; - error = set_posix_acl(inode, ACL_TYPE_DEFAULT, argp->acl_default); + error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_DEFAULT, + argp->acl_default); out_drop_lock: fh_unlock(fh); diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 71292a0d6f09..eaa3a0cf38f1 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -781,12 +781,13 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, fh_lock(fhp); - host_error = set_posix_acl(inode, ACL_TYPE_ACCESS, pacl); + host_error = set_posix_acl(&init_user_ns, inode, ACL_TYPE_ACCESS, pacl); if (host_error < 0) goto out_drop_lock; if (S_ISDIR(inode->i_mode)) { - host_error = set_posix_acl(inode, ACL_TYPE_DEFAULT, dpacl); + host_error = set_posix_acl(&init_user_ns, inode, + ACL_TYPE_DEFAULT, dpacl); } out_drop_lock: diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 186fa2c2c6ba..891395c6c7d3 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -233,7 +233,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) * as well be forgiving and just succeed silently. */ goto out_put; - status = vfs_mkdir(d_inode(dir), dentry, S_IRWXU); + status = vfs_mkdir(&init_user_ns, d_inode(dir), dentry, S_IRWXU); out_put: dput(dentry); out_unlock: @@ -353,7 +353,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn) status = -ENOENT; if (d_really_is_negative(dentry)) goto out; - status = vfs_rmdir(d_inode(dir), dentry); + status = vfs_rmdir(&init_user_ns, d_inode(dir), dentry); out: dput(dentry); out_unlock: @@ -443,7 +443,7 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) if (nfs4_has_reclaimed_state(name, nn)) goto out_free; - status = vfs_rmdir(d_inode(parent), child); + status = vfs_rmdir(&init_user_ns, d_inode(parent), child); if (status) printk("failed to remove client recovery directory %pd\n", child); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 4f6e514192bd..ef86ed23af82 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1525,12 +1525,9 @@ static int __init init_nfsd(void) int retval; printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n"); - retval = register_pernet_subsys(&nfsd_net_ops); - if (retval < 0) - return retval; retval = register_cld_notifier(); if (retval) - goto out_unregister_pernet; + return retval; retval = nfsd4_init_slabs(); if (retval) goto out_unregister_notifier; @@ -1549,9 +1546,14 @@ static int __init init_nfsd(void) goto out_free_lockd; retval = register_filesystem(&nfsd_fs_type); if (retval) + goto out_free_exports; + retval = register_pernet_subsys(&nfsd_net_ops); + if (retval < 0) goto out_free_all; return 0; out_free_all: + unregister_pernet_subsys(&nfsd_net_ops); +out_free_exports: remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); out_free_lockd: @@ -1565,13 +1567,12 @@ out_free_slabs: nfsd4_free_slabs(); out_unregister_notifier: unregister_cld_notifier(); -out_unregister_pernet: - unregister_pernet_subsys(&nfsd_net_ops); return retval; } static void __exit exit_nfsd(void) { + unregister_pernet_subsys(&nfsd_net_ops); nfsd_drc_slab_free(); remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); @@ -1581,7 +1582,6 @@ static void __exit exit_nfsd(void) nfsd4_exit_pnfs(); unregister_filesystem(&nfsd_fs_type); unregister_cld_notifier(); - unregister_pernet_subsys(&nfsd_net_ops); } MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 4744a276058d..10b44421eace 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -40,7 +40,8 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry) /* make sure parents give x permission to user */ int err; parent = dget_parent(tdentry); - err = inode_permission(d_inode(parent), MAY_EXEC); + err = inode_permission(&init_user_ns, + d_inode(parent), MAY_EXEC); if (err < 0) { dput(parent); break; diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index b2f8035f166b..a8d5449dd0e9 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -90,7 +90,7 @@ nfsd_proc_setattr(struct svc_rqst *rqstp) if (delta < 0) delta = -delta; if (delta < MAX_TOUCH_TIME_ERROR && - setattr_prepare(fhp->fh_dentry, iap) != 0) { + setattr_prepare(&init_user_ns, fhp->fh_dentry, iap) != 0) { /* * Turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME. * This will cause notify_change to set these times diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index d316e11923c5..fd6be35a1642 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -448,7 +448,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, .ia_size = iap->ia_size, }; - host_err = notify_change(dentry, &size_attr, NULL); + host_err = notify_change(&init_user_ns, dentry, &size_attr, NULL); if (host_err) goto out_unlock; iap->ia_valid &= ~ATTR_SIZE; @@ -463,7 +463,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, } iap->ia_valid |= ATTR_CTIME; - host_err = notify_change(dentry, iap, NULL); + host_err = notify_change(&init_user_ns, dentry, iap, NULL); out_unlock: fh_unlock(fhp); @@ -499,7 +499,8 @@ int nfsd4_is_junction(struct dentry *dentry) return 0; if (!(inode->i_mode & S_ISVTX)) return 0; - if (vfs_getxattr(dentry, NFSD_JUNCTION_XATTR_NAME, NULL, 0) <= 0) + if (vfs_getxattr(&init_user_ns, dentry, NFSD_JUNCTION_XATTR_NAME, + NULL, 0) <= 0) return 0; return 1; } @@ -1254,12 +1255,12 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, host_err = 0; switch (type) { case S_IFREG: - host_err = vfs_create(dirp, dchild, iap->ia_mode, true); + host_err = vfs_create(&init_user_ns, dirp, dchild, iap->ia_mode, true); if (!host_err) nfsd_check_ignore_resizing(iap); break; case S_IFDIR: - host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); + host_err = vfs_mkdir(&init_user_ns, dirp, dchild, iap->ia_mode); if (!host_err && unlikely(d_unhashed(dchild))) { struct dentry *d; d = lookup_one_len(dchild->d_name.name, @@ -1287,7 +1288,8 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, case S_IFBLK: case S_IFIFO: case S_IFSOCK: - host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); + host_err = vfs_mknod(&init_user_ns, dirp, dchild, + iap->ia_mode, rdev); break; default: printk(KERN_WARNING "nfsd: bad file type %o in nfsd_create\n", @@ -1485,7 +1487,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, if (!IS_POSIXACL(dirp)) iap->ia_mode &= ~current_umask(); - host_err = vfs_create(dirp, dchild, iap->ia_mode, true); + host_err = vfs_create(&init_user_ns, dirp, dchild, iap->ia_mode, true); if (host_err < 0) { fh_drop_write(fhp); goto out_nfserr; @@ -1609,7 +1611,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, if (IS_ERR(dnew)) goto out_nfserr; - host_err = vfs_symlink(d_inode(dentry), dnew, path); + host_err = vfs_symlink(&init_user_ns, d_inode(dentry), dnew, path); err = nfserrno(host_err); if (!err) err = nfserrno(commit_metadata(fhp)); @@ -1677,7 +1679,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, err = nfserr_noent; if (d_really_is_negative(dold)) goto out_dput; - host_err = vfs_link(dold, dirp, dnew, NULL); + host_err = vfs_link(dold, &init_user_ns, dirp, dnew, NULL); if (!host_err) { err = nfserrno(commit_metadata(ffhp)); if (!err) @@ -1797,7 +1799,15 @@ retry: close_cached = true; goto out_dput_old; } else { - host_err = vfs_rename(fdir, odentry, tdir, ndentry, NULL, 0); + struct renamedata rd = { + .old_mnt_userns = &init_user_ns, + .old_dir = fdir, + .old_dentry = odentry, + .new_mnt_userns = &init_user_ns, + .new_dir = tdir, + .new_dentry = ndentry, + }; + host_err = vfs_rename(&rd); if (!host_err) { host_err = commit_metadata(tfhp); if (!host_err) @@ -1884,9 +1894,9 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (type != S_IFDIR) { if (rdentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK) nfsd_close_cached_files(rdentry); - host_err = vfs_unlink(dirp, rdentry, NULL); + host_err = vfs_unlink(&init_user_ns, dirp, rdentry, NULL); } else { - host_err = vfs_rmdir(dirp, rdentry); + host_err = vfs_rmdir(&init_user_ns, dirp, rdentry); } if (!host_err) @@ -2149,7 +2159,7 @@ nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, inode_lock_shared(inode); - len = vfs_getxattr(dentry, name, NULL, 0); + len = vfs_getxattr(&init_user_ns, dentry, name, NULL, 0); /* * Zero-length attribute, just return. @@ -2176,7 +2186,7 @@ nfsd_getxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, goto out; } - len = vfs_getxattr(dentry, name, buf, len); + len = vfs_getxattr(&init_user_ns, dentry, name, buf, len); if (len <= 0) { kvfree(buf); buf = NULL; @@ -2283,7 +2293,8 @@ nfsd_removexattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name) fh_lock(fhp); - ret = __vfs_removexattr_locked(fhp->fh_dentry, name, NULL); + ret = __vfs_removexattr_locked(&init_user_ns, fhp->fh_dentry, + name, NULL); fh_unlock(fhp); fh_drop_write(fhp); @@ -2307,8 +2318,8 @@ nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, char *name, return nfserrno(ret); fh_lock(fhp); - ret = __vfs_setxattr_locked(fhp->fh_dentry, name, buf, len, flags, - NULL); + ret = __vfs_setxattr_locked(&init_user_ns, fhp->fh_dentry, name, buf, + len, flags, NULL); fh_unlock(fhp); fh_drop_write(fhp); @@ -2391,13 +2402,14 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, return 0; /* This assumes NFSD_MAY_{READ,WRITE,EXEC} == MAY_{READ,WRITE,EXEC} */ - err = inode_permission(inode, acc & (MAY_READ|MAY_WRITE|MAY_EXEC)); + err = inode_permission(&init_user_ns, inode, + acc & (MAY_READ | MAY_WRITE | MAY_EXEC)); /* Allow read access to binaries even when mode 111 */ if (err == -EACCES && S_ISREG(inode->i_mode) && (acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE) || acc == (NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC))) - err = inode_permission(inode, MAY_EXEC); + err = inode_permission(&init_user_ns, inode, MAY_EXEC); return err? nfserrno(err) : 0; } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 745d371d6fea..2e8eb263cf0f 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -348,7 +348,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) /* reference count of i_bh inherits from nilfs_mdt_read_block() */ atomic64_inc(&root->inodes_count); - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); inode->i_ino = ino; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); @@ -805,14 +805,15 @@ void nilfs_evict_inode(struct inode *inode) */ } -int nilfs_setattr(struct dentry *dentry, struct iattr *iattr) +int nilfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *iattr) { struct nilfs_transaction_info ti; struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; int err; - err = setattr_prepare(dentry, iattr); + err = setattr_prepare(&init_user_ns, dentry, iattr); if (err) return err; @@ -827,7 +828,7 @@ int nilfs_setattr(struct dentry *dentry, struct iattr *iattr) nilfs_truncate(inode); } - setattr_copy(inode, iattr); + setattr_copy(&init_user_ns, inode, iattr); mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) { @@ -843,7 +844,8 @@ out_err: return err; } -int nilfs_permission(struct inode *inode, int mask) +int nilfs_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask) { struct nilfs_root *root = NILFS_I(inode)->i_root; @@ -851,7 +853,7 @@ int nilfs_permission(struct inode *inode, int mask) root->cno != NILFS_CPTREE_CURRENT_CNO) return -EROFS; /* snapshot is not writable */ - return generic_permission(inode, mask); + return generic_permission(&init_user_ns, inode, mask); } int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh) diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 07d26f61f22a..b053b40315bf 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -132,7 +132,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, unsigned int flags, oldflags; int ret; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EACCES; if (get_user(flags, (int __user *)argp)) diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index a6ec7961d4f5..ecace5f96a95 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -72,8 +72,8 @@ nilfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int nilfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int nilfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; struct nilfs_transaction_info ti; @@ -100,7 +100,8 @@ static int nilfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, } static int -nilfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) +nilfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; struct nilfs_transaction_info ti; @@ -124,8 +125,8 @@ nilfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) return err; } -static int nilfs_symlink(struct inode *dir, struct dentry *dentry, - const char *symname) +static int nilfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { struct nilfs_transaction_info ti; struct super_block *sb = dir->i_sb; @@ -201,7 +202,8 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir, return err; } -static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int nilfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; struct nilfs_transaction_info ti; @@ -338,8 +340,9 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry) return err; } -static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, +static int nilfs_rename(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct inode *old_inode = d_inode(old_dentry); diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index f8450ee3fd06..c4a45a081ade 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -267,9 +267,11 @@ extern struct inode *nilfs_iget_for_gc(struct super_block *sb, extern void nilfs_update_inode(struct inode *, struct buffer_head *, int); extern void nilfs_truncate(struct inode *); extern void nilfs_evict_inode(struct inode *); -extern int nilfs_setattr(struct dentry *, struct iattr *); +extern int nilfs_setattr(struct user_namespace *, struct dentry *, + struct iattr *); extern void nilfs_write_failed(struct address_space *mapping, loff_t to); -int nilfs_permission(struct inode *inode, int mask); +int nilfs_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask); int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh); extern int nilfs_inode_dirty(struct inode *); int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index dcab112e1f00..9e0c1afac8bd 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -702,7 +702,7 @@ static int fanotify_find_path(int dfd, const char __user *filename, } /* you can only watch an inode if you have read permissions on it */ - ret = inode_permission(path->dentry->d_inode, MAY_READ); + ret = path_permission(path, MAY_READ); if (ret) { path_put(path); goto out; @@ -976,7 +976,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) f_flags |= O_NONBLOCK; /* fsnotify_alloc_group takes a ref. Dropped in fanotify_release */ - group = fsnotify_alloc_group(&fanotify_fsnotify_ops); + group = fsnotify_alloc_user_group(&fanotify_fsnotify_ops); if (IS_ERR(group)) { free_uid(user); return PTR_ERR(group); diff --git a/fs/notify/group.c b/fs/notify/group.c index a4a4b1c64d32..ffd723ffe46d 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -111,14 +111,12 @@ void fsnotify_put_group(struct fsnotify_group *group) } EXPORT_SYMBOL_GPL(fsnotify_put_group); -/* - * Create a new fsnotify_group and hold a reference for the group returned. - */ -struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) +static struct fsnotify_group *__fsnotify_alloc_group( + const struct fsnotify_ops *ops, gfp_t gfp) { struct fsnotify_group *group; - group = kzalloc(sizeof(struct fsnotify_group), GFP_KERNEL); + group = kzalloc(sizeof(struct fsnotify_group), gfp); if (!group) return ERR_PTR(-ENOMEM); @@ -139,8 +137,25 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) return group; } + +/* + * Create a new fsnotify_group and hold a reference for the group returned. + */ +struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) +{ + return __fsnotify_alloc_group(ops, GFP_KERNEL); +} EXPORT_SYMBOL_GPL(fsnotify_alloc_group); +/* + * Create a new fsnotify_group and hold a reference for the group returned. + */ +struct fsnotify_group *fsnotify_alloc_user_group(const struct fsnotify_ops *ops) +{ + return __fsnotify_alloc_group(ops, GFP_KERNEL_ACCOUNT); +} +EXPORT_SYMBOL_GPL(fsnotify_alloc_user_group); + int fsnotify_fasync(int fd, struct file *file, int on) { struct fsnotify_group *group = file->private_data; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 59c177011a0f..c71be4fb7dc5 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -352,7 +352,7 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, if (error) return error; /* you can only watch an inode if you have read permissions on it */ - error = inode_permission(path->dentry->d_inode, MAY_READ); + error = path_permission(path, MAY_READ); if (error) { path_put(path); return error; @@ -632,11 +632,11 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events) struct fsnotify_group *group; struct inotify_event_info *oevent; - group = fsnotify_alloc_group(&inotify_fsnotify_ops); + group = fsnotify_alloc_user_group(&inotify_fsnotify_ops); if (IS_ERR(group)) return group; - oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL); + oevent = kmalloc(sizeof(struct inotify_event_info), GFP_KERNEL_ACCOUNT); if (unlikely(!oevent)) { fsnotify_destroy_group(group); return ERR_PTR(-ENOMEM); diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index f7e4cbc26eaf..f5c058b3192c 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -629,6 +629,12 @@ static int ntfs_read_locked_inode(struct inode *vi) } a = ctx->attr; /* Get the standard information attribute value. */ + if ((u8 *)a + le16_to_cpu(a->data.resident.value_offset) + + le32_to_cpu(a->data.resident.value_length) > + (u8 *)ctx->mrec + vol->mft_record_size) { + ntfs_error(vi->i_sb, "Corrupt standard information attribute in inode."); + goto unm_err_out; + } si = (STANDARD_INFORMATION*)((u8*)a + le16_to_cpu(a->data.resident.value_offset)); @@ -2848,6 +2854,7 @@ void ntfs_truncate_vfs(struct inode *vi) { /** * ntfs_setattr - called from notify_change() when an attribute is being changed + * @mnt_userns: user namespace of the mount the inode was found from * @dentry: dentry whose attributes to change * @attr: structure describing the attributes and the changes * @@ -2860,13 +2867,14 @@ void ntfs_truncate_vfs(struct inode *vi) { * * Called with ->i_mutex held. */ -int ntfs_setattr(struct dentry *dentry, struct iattr *attr) +int ntfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct inode *vi = d_inode(dentry); int err; unsigned int ia_valid = attr->ia_valid; - err = setattr_prepare(dentry, attr); + err = setattr_prepare(&init_user_ns, dentry, attr); if (err) goto out; /* We do not support NTFS ACLs yet. */ diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h index 363e4e820673..6f78ee00f57f 100644 --- a/fs/ntfs/inode.h +++ b/fs/ntfs/inode.h @@ -289,7 +289,8 @@ extern int ntfs_show_options(struct seq_file *sf, struct dentry *root); extern int ntfs_truncate(struct inode *vi); extern void ntfs_truncate_vfs(struct inode *vi); -extern int ntfs_setattr(struct dentry *dentry, struct iattr *attr); +extern int ntfs_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr); extern int __ntfs_write_inode(struct inode *vi, int sync); diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h index 85422761ff43..5d4bf7a3259f 100644 --- a/fs/ntfs/layout.h +++ b/fs/ntfs/layout.h @@ -703,7 +703,7 @@ typedef struct { /* 14*/ le16 instance; /* The instance of this attribute record. This number is unique within this mft record (see MFT_RECORD/next_attribute_instance notes in - in mft.h for more details). */ + mft.h for more details). */ /* 16*/ union { /* Resident attributes. */ struct { @@ -1838,7 +1838,7 @@ typedef struct { * Also, each security descriptor is stored twice in the $SDS stream with a * fixed offset of 0x40000 bytes (256kib, the Windows cache manager's max size) * between them; i.e. if a SDS_ENTRY specifies an offset of 0x51d0, then the - * the first copy of the security descriptor will be at offset 0x51d0 in the + * first copy of the security descriptor will be at offset 0x51d0 in the * $SDS data stream and the second copy will be at offset 0x451d0. */ typedef struct { diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 7b07f5df3a29..5259badabb56 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -262,7 +262,8 @@ static int ocfs2_set_acl(handle_t *handle, return ret; } -int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type) +int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { struct buffer_head *bh = NULL; int status, had_lock; @@ -274,7 +275,8 @@ int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (type == ACL_TYPE_ACCESS && acl) { umode_t mode; - status = posix_acl_update_mode(inode, &mode, &acl); + status = posix_acl_update_mode(&init_user_ns, inode, &mode, + &acl); if (status) goto unlock; diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 127b13432146..4e86450917b2 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -19,7 +19,8 @@ struct ocfs2_acl_entry { }; struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type); -int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int ocfs2_iop_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *); extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, struct buffer_head *, struct buffer_head *, diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 0179a73a3fa2..12a7590601dd 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -2042,7 +2042,7 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g o2hb_nego_timeout_handler, reg, NULL, ®->hr_handler_list); if (ret) - goto free; + goto remove_item; ret = o2net_register_handler(O2HB_NEGO_APPROVE_MSG, reg->hr_key, sizeof(struct o2hb_nego_msg), @@ -2057,6 +2057,12 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g unregister_handler: o2net_unregister_handler_list(®->hr_handler_list); +remove_item: + spin_lock(&o2hb_live_lock); + list_del(®->hr_all_item); + if (o2hb_global_heartbeat_active()) + clear_bit(reg->hr_region_num, o2hb_region_bitmap); + spin_unlock(&o2hb_live_lock); free: kfree(reg); return ERR_PTR(ret); diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index 6abaded3ff6b..70a10764f249 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -165,16 +165,6 @@ void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) spin_unlock(&lock->spinlock); } -void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock) -{ - BUG_ON(!dlm); - BUG_ON(!lock); - - spin_lock(&dlm->ast_lock); - __dlm_queue_bast(dlm, lock); - spin_unlock(&dlm->ast_lock); -} - static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, struct dlm_lock *lock) { diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index c8a444622faa..58d57e25d384 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -17,10 +17,7 @@ #define DLM_LOCKID_NAME_MAX 32 -#define DLM_DOMAIN_NAME_MAX_LEN 255 #define DLM_LOCK_RES_OWNER_UNKNOWN O2NM_MAX_NODES -#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes -#define DLM_THREAD_MS 200 // flush at least every 200 ms #define DLM_HASH_SIZE_DEFAULT (1 << 17) #if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE @@ -902,7 +899,6 @@ void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); -void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void __dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); void dlm_do_local_ast(struct dlm_ctxt *dlm, diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 583820ec63e2..b2870f1a31df 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -190,17 +190,18 @@ static int dlmfs_file_release(struct inode *inode, * We do ->setattr() just to override size changes. Our size is the size * of the LVB and nothing else. */ -static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr) +static int dlmfs_file_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr) { int error; struct inode *inode = d_inode(dentry); attr->ia_valid &= ~ATTR_SIZE; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) return error; - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); return 0; } @@ -329,7 +330,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb) if (inode) { inode->i_ino = get_next_ino(); - inode_init_owner(inode, NULL, mode); + inode_init_owner(&init_user_ns, inode, NULL, mode); inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inc_nlink(inode); @@ -352,7 +353,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent, return NULL; inode->i_ino = get_next_ino(); - inode_init_owner(inode, parent, mode); + inode_init_owner(&init_user_ns, inode, parent, mode); inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); ip = DLMFS_I(inode); @@ -395,7 +396,8 @@ static struct inode *dlmfs_get_inode(struct inode *parent, * File creation. Allocate an inode, and we're done.. */ /* SMP-safe */ -static int dlmfs_mkdir(struct inode * dir, +static int dlmfs_mkdir(struct user_namespace * mnt_userns, + struct inode * dir, struct dentry * dentry, umode_t mode) { @@ -443,7 +445,8 @@ bail: return status; } -static int dlmfs_create(struct inode *dir, +static int dlmfs_create(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index df6d709d2ae3..6611c64ca0be 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1112,7 +1112,8 @@ out: return ret; } -int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) +int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { int status = 0, size_change; int inode_locked = 0; @@ -1142,7 +1143,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) return 0; - status = setattr_prepare(dentry, attr); + status = setattr_prepare(&init_user_ns, dentry, attr); if (status) return status; @@ -1263,7 +1264,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) } } - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); status = ocfs2_mark_inode_dirty(handle, inode, bh); @@ -1298,8 +1299,8 @@ bail: return status; } -int ocfs2_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +int ocfs2_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); struct super_block *sb = path->dentry->d_sb; @@ -1313,7 +1314,7 @@ int ocfs2_getattr(const struct path *path, struct kstat *stat, goto bail; } - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); /* * If there is inline data in the inode, the inode will normally not * have data blocks allocated (it may have an external xattr block). @@ -1330,7 +1331,8 @@ bail: return err; } -int ocfs2_permission(struct inode *inode, int mask) +int ocfs2_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask) { int ret, had_lock; struct ocfs2_lock_holder oh; @@ -1355,7 +1357,7 @@ int ocfs2_permission(struct inode *inode, int mask) dump_stack(); } - ret = generic_permission(inode, mask); + ret = generic_permission(&init_user_ns, inode, mask); ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock); out: diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 4832cbceba5b..8536cec5f122 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -51,10 +51,13 @@ int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size, u64 zero_to); int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, loff_t zero_to); -int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); -int ocfs2_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags); -int ocfs2_permission(struct inode *inode, int mask); +int ocfs2_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr); +int ocfs2_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags); +int ocfs2_permission(struct user_namespace *mnt_userns, + struct inode *inode, + int mask); int ocfs2_should_update_atime(struct inode *inode, struct vfsmount *vfsmnt); diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 89984172fc4a..50c9b30ee9f6 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -96,7 +96,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, } status = -EACCES; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) goto bail_unlock; if (!S_ISDIR(inode->i_mode)) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 2a237ab00453..3abdd36da2e2 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -198,7 +198,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode) * callers. */ if (S_ISDIR(mode)) set_nlink(inode, 2); - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); status = dquot_initialize(inode); if (status) return ERR_PTR(status); @@ -221,7 +221,8 @@ static void ocfs2_cleanup_add_entry_failure(struct ocfs2_super *osb, iput(inode); } -static int ocfs2_mknod(struct inode *dir, +static int ocfs2_mknod(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) @@ -645,7 +646,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, return status; } -static int ocfs2_mkdir(struct inode *dir, +static int ocfs2_mkdir(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, umode_t mode) { @@ -653,14 +655,15 @@ static int ocfs2_mkdir(struct inode *dir, trace_ocfs2_mkdir(dir, dentry, dentry->d_name.len, dentry->d_name.name, OCFS2_I(dir)->ip_blkno, mode); - ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0); + ret = ocfs2_mknod(&init_user_ns, dir, dentry, mode | S_IFDIR, 0); if (ret) mlog_errno(ret); return ret; } -static int ocfs2_create(struct inode *dir, +static int ocfs2_create(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) @@ -669,7 +672,7 @@ static int ocfs2_create(struct inode *dir, trace_ocfs2_create(dir, dentry, dentry->d_name.len, dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno, mode); - ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0); + ret = ocfs2_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0); if (ret) mlog_errno(ret); @@ -1195,7 +1198,8 @@ static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2) ocfs2_inode_unlock(inode2, 1); } -static int ocfs2_rename(struct inode *old_dir, +static int ocfs2_rename(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, @@ -1784,7 +1788,8 @@ bail: return status; } -static int ocfs2_symlink(struct inode *dir, +static int ocfs2_symlink(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, const char *symname) { diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 3b397fa9c9e8..c19a463fac55 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -978,7 +978,7 @@ static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci, return 0; } - if (!eb || (eb && !eb->h_next_leaf_blk)) { + if (!eb || !eb->h_next_leaf_blk) { /* * We are the last extent rec, so any high cpos should * be stored in this leaf refcount block. @@ -4346,7 +4346,7 @@ static inline int ocfs2_may_create(struct inode *dir, struct dentry *child) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; - return inode_permission(dir, MAY_WRITE | MAY_EXEC); + return inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC); } /** @@ -4400,7 +4400,7 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, * file. */ if (!preserve) { - error = inode_permission(inode, MAY_READ); + error = inode_permission(&init_user_ns, inode, MAY_READ); if (error) return error; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 2febc76e9de7..079f8826993e 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -973,8 +973,6 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb) * quota files */ dquot_disable(sb, type, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); - if (!inode) - continue; iput(inode); } } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 9ccd19d8f7b1..36ae47a4aef6 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7249,6 +7249,7 @@ static int ocfs2_xattr_security_get(const struct xattr_handler *handler, } static int ocfs2_xattr_security_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -7321,6 +7322,7 @@ static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler, } static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -7351,6 +7353,7 @@ static int ocfs2_xattr_user_get(const struct xattr_handler *handler, } static int ocfs2_xattr_user_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index a0f45651f3b7..c219f91f44e9 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -279,13 +279,14 @@ out_free_inode: return err; } -static int omfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int omfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { return omfs_add_node(dir, dentry, mode | S_IFDIR); } -static int omfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int omfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { return omfs_add_node(dir, dentry, mode | S_IFREG); } @@ -369,9 +370,9 @@ static bool omfs_fill_chain(struct inode *dir, struct dir_context *ctx, return true; } -static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int omfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { struct inode *new_inode = d_inode(new_dentry); struct inode *old_inode = d_inode(old_dentry); diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 2c7b70ee1388..11e733aab25d 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -343,12 +343,13 @@ const struct file_operations omfs_file_operations = { .splice_read = generic_file_splice_read, }; -static int omfs_setattr(struct dentry *dentry, struct iattr *attr) +static int omfs_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) return error; @@ -361,7 +362,7 @@ static int omfs_setattr(struct dentry *dentry, struct iattr *attr) omfs_truncate(inode); } - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index ce93ccca8639..2a0e83236c01 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -48,7 +48,7 @@ struct inode *omfs_new_inode(struct inode *dir, umode_t mode) goto fail; inode->i_ino = new_block; - inode_init_owner(inode, NULL, mode); + inode_init_owner(&init_user_ns, inode, NULL, mode); inode->i_mapping->a_ops = &omfs_aops; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); diff --git a/fs/open.c b/fs/open.c index ca5444733acd..e53af13b5835 100644 --- a/fs/open.c +++ b/fs/open.c @@ -35,8 +35,8 @@ #include "internal.h" -int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, - struct file *filp) +int do_truncate(struct user_namespace *mnt_userns, struct dentry *dentry, + loff_t length, unsigned int time_attrs, struct file *filp) { int ret; struct iattr newattrs; @@ -61,13 +61,14 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, inode_lock(dentry->d_inode); /* Note any delegations or leases have already been broken: */ - ret = notify_change(dentry, &newattrs, NULL); + ret = notify_change(mnt_userns, dentry, &newattrs, NULL); inode_unlock(dentry->d_inode); return ret; } long vfs_truncate(const struct path *path, loff_t length) { + struct user_namespace *mnt_userns; struct inode *inode; long error; @@ -83,7 +84,8 @@ long vfs_truncate(const struct path *path, loff_t length) if (error) goto out; - error = inode_permission(inode, MAY_WRITE); + mnt_userns = mnt_user_ns(path->mnt); + error = inode_permission(mnt_userns, inode, MAY_WRITE); if (error) goto mnt_drop_write_and_out; @@ -107,7 +109,7 @@ long vfs_truncate(const struct path *path, loff_t length) if (!error) error = security_path_truncate(path); if (!error) - error = do_truncate(path->dentry, length, 0, NULL); + error = do_truncate(mnt_userns, path->dentry, length, 0, NULL); put_write_and_out: put_write_access(inode); @@ -186,13 +188,13 @@ long do_sys_ftruncate(unsigned int fd, loff_t length, int small) /* Check IS_APPEND on real upper inode */ if (IS_APPEND(file_inode(f.file))) goto out_putf; - sb_start_write(inode->i_sb); error = locks_verify_truncate(inode, f.file, length); if (!error) error = security_path_truncate(&f.file->f_path); if (!error) - error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, f.file); + error = do_truncate(file_mnt_user_ns(f.file), dentry, length, + ATTR_MTIME | ATTR_CTIME, f.file); sb_end_write(inode->i_sb); out_putf: fdput(f); @@ -436,7 +438,7 @@ retry: goto out_path_release; } - res = inode_permission(inode, mode | MAY_ACCESS); + res = inode_permission(mnt_user_ns(path.mnt), inode, mode | MAY_ACCESS); /* SuS v2 requires we report a read only fs too */ if (res || !(mode & S_IWOTH) || special_file(inode->i_mode)) goto out_path_release; @@ -492,7 +494,7 @@ retry: if (error) goto out; - error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); + error = path_permission(&path, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; @@ -521,7 +523,7 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd) if (!d_can_lookup(f.file->f_path.dentry)) goto out_putf; - error = inode_permission(file_inode(f.file), MAY_EXEC | MAY_CHDIR); + error = file_permission(f.file, MAY_EXEC | MAY_CHDIR); if (!error) set_fs_pwd(current->fs, &f.file->f_path); out_putf: @@ -540,7 +542,7 @@ retry: if (error) goto out; - error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR); + error = path_permission(&path, MAY_EXEC | MAY_CHDIR); if (error) goto dput_and_out; @@ -580,7 +582,8 @@ retry_deleg: goto out_unlock; newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO); newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; - error = notify_change(path->dentry, &newattrs, &delegated_inode); + error = notify_change(mnt_user_ns(path->mnt), path->dentry, + &newattrs, &delegated_inode); out_unlock: inode_unlock(inode); if (delegated_inode) { @@ -641,6 +644,7 @@ SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) int chown_common(const struct path *path, uid_t user, gid_t group) { + struct user_namespace *mnt_userns; struct inode *inode = path->dentry->d_inode; struct inode *delegated_inode = NULL; int error; @@ -651,6 +655,10 @@ int chown_common(const struct path *path, uid_t user, gid_t group) uid = make_kuid(current_user_ns(), user); gid = make_kgid(current_user_ns(), group); + mnt_userns = mnt_user_ns(path->mnt); + uid = kuid_from_mnt(mnt_userns, uid); + gid = kgid_from_mnt(mnt_userns, gid); + retry_deleg: newattrs.ia_valid = ATTR_CTIME; if (user != (uid_t) -1) { @@ -671,7 +679,8 @@ retry_deleg: inode_lock(inode); error = security_path_chown(path, uid, gid); if (!error) - error = notify_change(path->dentry, &newattrs, &delegated_inode); + error = notify_change(mnt_userns, path->dentry, &newattrs, + &delegated_inode); inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index a25e6c890975..18852b9ed82b 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -116,7 +116,8 @@ out: return error; } -int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +int orangefs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { int error; struct iattr iattr; @@ -132,7 +133,8 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) * and "mode" to the new desired value. It is up to * us to propagate the new mode back to the server... */ - error = posix_acl_update_mode(inode, &iattr.ia_mode, &acl); + error = posix_acl_update_mode(&init_user_ns, inode, + &iattr.ia_mode, &acl); if (error) { gossip_err("%s: posix_acl_update_mode err: %d\n", __func__, diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index ec8ae4257975..9b28a7132466 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -487,10 +487,7 @@ static int orangefs_file_mmap(struct file *file, struct vm_area_struct *vma) return ret; gossip_debug(GOSSIP_FILE_DEBUG, - "orangefs_file_mmap: called on %s\n", - (file ? - (char *)file->f_path.dentry->d_name.name : - (char *)"Unknown")); + "orangefs_file_mmap: called on %pD\n", file); /* set the sequential readahead hint */ vma->vm_flags |= VM_SEQ_READ; diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 48f0547d4850..5079cfafa8d7 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -855,13 +855,13 @@ again: ORANGEFS_I(inode)->attr_uid = current_fsuid(); ORANGEFS_I(inode)->attr_gid = current_fsgid(); } - setattr_copy(inode, iattr); + setattr_copy(&init_user_ns, inode, iattr); spin_unlock(&inode->i_lock); mark_inode_dirty(inode); if (iattr->ia_valid & ATTR_MODE) /* change mod on a file that has ACLs */ - ret = posix_acl_chmod(inode, inode->i_mode); + ret = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); ret = 0; out: @@ -871,12 +871,13 @@ out: /* * Change attributes of an object referenced by dentry. */ -int orangefs_setattr(struct dentry *dentry, struct iattr *iattr) +int orangefs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *iattr) { int ret; gossip_debug(GOSSIP_INODE_DEBUG, "__orangefs_setattr: called on %pd\n", dentry); - ret = setattr_prepare(dentry, iattr); + ret = setattr_prepare(&init_user_ns, dentry, iattr); if (ret) goto out; ret = __orangefs_setattr(d_inode(dentry), iattr); @@ -890,8 +891,8 @@ out: /* * Obtain attributes of an object given a dentry */ -int orangefs_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) { int ret; struct inode *inode = path->dentry->d_inode; @@ -903,7 +904,7 @@ int orangefs_getattr(const struct path *path, struct kstat *stat, ret = orangefs_inode_getattr(inode, request_mask & STATX_SIZE ? ORANGEFS_GETATTR_SIZE : 0); if (ret == 0) { - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); /* override block size reported to stat */ if (!(request_mask & STATX_SIZE)) @@ -919,7 +920,8 @@ int orangefs_getattr(const struct path *path, struct kstat *stat, return ret; } -int orangefs_permission(struct inode *inode, int mask) +int orangefs_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { int ret; @@ -933,7 +935,7 @@ int orangefs_permission(struct inode *inode, int mask) if (ret < 0) return ret; - return generic_permission(inode, mask); + return generic_permission(&init_user_ns, inode, mask); } int orangefs_update_time(struct inode *inode, struct timespec64 *time, int flags) diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 3e7cf3d0a494..600e8eee541f 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -15,7 +15,8 @@ /* * Get a newly allocated inode to go with a negative dentry. */ -static int orangefs_create(struct inode *dir, +static int orangefs_create(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, umode_t mode, bool exclusive) @@ -215,7 +216,8 @@ static int orangefs_unlink(struct inode *dir, struct dentry *dentry) return ret; } -static int orangefs_symlink(struct inode *dir, +static int orangefs_symlink(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, const char *symname) { @@ -303,7 +305,8 @@ out: return ret; } -static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int orangefs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct orangefs_inode_s *parent = ORANGEFS_I(dir); struct orangefs_kernel_op_s *new_op; @@ -372,7 +375,8 @@ out: return ret; } -static int orangefs_rename(struct inode *old_dir, +static int orangefs_rename(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index e12aeb9623d6..0e6b97682e41 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -107,7 +107,9 @@ extern int orangefs_init_acl(struct inode *inode, struct inode *dir); extern const struct xattr_handler *orangefs_xattr_handlers[]; extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type); -extern int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int orangefs_set_acl(struct user_namespace *mnt_userns, + struct inode *inode, struct posix_acl *acl, + int type); /* * orangefs data structures @@ -359,12 +361,13 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct orangefs_object_kref *ref); int __orangefs_setattr(struct inode *, struct iattr *); -int orangefs_setattr(struct dentry *, struct iattr *); +int orangefs_setattr(struct user_namespace *, struct dentry *, struct iattr *); -int orangefs_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags); +int orangefs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags); -int orangefs_permission(struct inode *inode, int mask); +int orangefs_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask); int orangefs_update_time(struct inode *, struct timespec64 *, int); diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c index bdc285aea360..9a5b757fbd2f 100644 --- a/fs/orangefs/xattr.c +++ b/fs/orangefs/xattr.c @@ -526,6 +526,7 @@ out_unlock: } static int orangefs_xattr_set_default(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 0fed532efa68..0b2891c6c71e 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -93,9 +93,9 @@ int ovl_copy_xattr(struct super_block *sb, struct dentry *old, continue; /* Discard */ } retry: - size = vfs_getxattr(old, name, value, value_size); + size = vfs_getxattr(&init_user_ns, old, name, value, value_size); if (size == -ERANGE) - size = vfs_getxattr(old, name, NULL, 0); + size = vfs_getxattr(&init_user_ns, old, name, NULL, 0); if (size < 0) { error = size; @@ -115,7 +115,7 @@ retry: goto retry; } - error = vfs_setxattr(new, name, value, size, 0); + error = vfs_setxattr(&init_user_ns, new, name, value, size, 0); if (error) { if (error != -EOPNOTSUPP || ovl_must_copy_xattr(name)) break; @@ -236,7 +236,7 @@ static int ovl_set_size(struct dentry *upperdentry, struct kstat *stat) .ia_size = stat->size, }; - return notify_change(upperdentry, &attr, NULL); + return notify_change(&init_user_ns, upperdentry, &attr, NULL); } static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat) @@ -248,7 +248,7 @@ static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat) .ia_mtime = stat->mtime, }; - return notify_change(upperdentry, &attr, NULL); + return notify_change(&init_user_ns, upperdentry, &attr, NULL); } int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) @@ -260,7 +260,7 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) .ia_valid = ATTR_MODE, .ia_mode = stat->mode, }; - err = notify_change(upperdentry, &attr, NULL); + err = notify_change(&init_user_ns, upperdentry, &attr, NULL); } if (!err) { struct iattr attr = { @@ -268,7 +268,7 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) .ia_uid = stat->uid, .ia_gid = stat->gid, }; - err = notify_change(upperdentry, &attr, NULL); + err = notify_change(&init_user_ns, upperdentry, &attr, NULL); } if (!err) ovl_set_timestamps(upperdentry, stat); @@ -796,7 +796,7 @@ static ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value) ssize_t res; char *buf; - res = vfs_getxattr(dentry, name, NULL, 0); + res = vfs_getxattr(&init_user_ns, dentry, name, NULL, 0); if (res == -ENODATA || res == -EOPNOTSUPP) res = 0; @@ -805,7 +805,7 @@ static ssize_t ovl_getxattr(struct dentry *dentry, char *name, char **value) if (!buf) return -ENOMEM; - res = vfs_getxattr(dentry, name, buf, res); + res = vfs_getxattr(&init_user_ns, dentry, name, buf, res); if (res < 0) kfree(buf); else @@ -847,8 +847,8 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c) * don't want that to happen for normal copy-up operation. */ if (capability) { - err = vfs_setxattr(upperpath.dentry, XATTR_NAME_CAPS, - capability, cap_size, 0); + err = vfs_setxattr(&init_user_ns, upperpath.dentry, + XATTR_NAME_CAPS, capability, cap_size, 0); if (err) goto out_free; } diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index d1efa3a5a503..836f14b9d3a6 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -449,7 +449,7 @@ static int ovl_set_upper_acl(struct dentry *upperdentry, const char *name, if (err < 0) goto out_free; - err = vfs_setxattr(upperdentry, name, buffer, size, XATTR_CREATE); + err = vfs_setxattr(&init_user_ns, upperdentry, name, buffer, size, XATTR_CREATE); out_free: kfree(buffer); return err; @@ -508,7 +508,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, .ia_mode = cattr->mode, }; inode_lock(newdentry->d_inode); - err = notify_change(newdentry, &attr, NULL); + err = notify_change(&init_user_ns, newdentry, &attr, NULL); inode_unlock(newdentry->d_inode); if (err) goto out_cleanup; @@ -636,7 +636,7 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, inode->i_state |= I_CREATING; spin_unlock(&inode->i_lock); - inode_init_owner(inode, dentry->d_parent->d_inode, mode); + inode_init_owner(&init_user_ns, inode, dentry->d_parent->d_inode, mode); attr.mode = inode->i_mode; err = ovl_create_or_link(dentry, inode, &attr, false); @@ -650,19 +650,20 @@ out: return err; } -static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int ovl_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL); } -static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int ovl_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL); } -static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, - dev_t rdev) +static int ovl_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { /* Don't allow creation of "whiteout" on overlay */ if (S_ISCHR(mode) && rdev == WHITEOUT_DEV) @@ -671,8 +672,8 @@ static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, return ovl_create_object(dentry, mode, rdev, NULL); } -static int ovl_symlink(struct inode *dir, struct dentry *dentry, - const char *link) +static int ovl_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *link) { return ovl_create_object(dentry, S_IFLNK, 0, link); } @@ -821,9 +822,9 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir, goto out_dput_upper; if (is_dir) - err = vfs_rmdir(dir, upper); + err = vfs_rmdir(&init_user_ns, dir, upper); else - err = vfs_unlink(dir, upper, NULL); + err = vfs_unlink(&init_user_ns, dir, upper, NULL); ovl_dir_modified(dentry->d_parent, ovl_type_origin(dentry)); /* @@ -1069,9 +1070,9 @@ static int ovl_set_redirect(struct dentry *dentry, bool samedir) return err; } -static int ovl_rename(struct inode *olddir, struct dentry *old, - struct inode *newdir, struct dentry *new, - unsigned int flags) +static int ovl_rename(struct user_namespace *mnt_userns, struct inode *olddir, + struct dentry *old, struct inode *newdir, + struct dentry *new, unsigned int flags) { int err; struct dentry *old_upperdir; diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 077d3ad343f6..dbfb35fb0ff7 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -50,11 +50,11 @@ static struct file *ovl_open_realfile(const struct file *file, acc_mode |= MAY_APPEND; old_cred = ovl_override_creds(inode->i_sb); - err = inode_permission(realinode, MAY_OPEN | acc_mode); + err = inode_permission(&init_user_ns, realinode, MAY_OPEN | acc_mode); if (err) { realfile = ERR_PTR(err); } else { - if (!inode_owner_or_capable(realinode)) + if (!inode_owner_or_capable(&init_user_ns, realinode)) flags &= ~O_NOATIME; realfile = open_with_fake_path(&file->f_path, flags, realinode, @@ -521,7 +521,7 @@ static long ovl_ioctl_set_flags(struct file *file, unsigned int cmd, long ret; struct inode *inode = file_inode(file); - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EACCES; ret = mnt_want_write_file(file); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index cf41bcb664bc..003cf83bf78a 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -14,14 +14,15 @@ #include "overlayfs.h" -int ovl_setattr(struct dentry *dentry, struct iattr *attr) +int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { int err; bool full_copy_up = false; struct dentry *upperdentry; const struct cred *old_cred; - err = setattr_prepare(dentry, attr); + err = setattr_prepare(&init_user_ns, dentry, attr); if (err) return err; @@ -79,7 +80,7 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) inode_lock(upperdentry->d_inode); old_cred = ovl_override_creds(dentry->d_sb); - err = notify_change(upperdentry, attr, NULL); + err = notify_change(&init_user_ns, upperdentry, attr, NULL); revert_creds(old_cred); if (!err) ovl_copyattr(upperdentry->d_inode, dentry->d_inode); @@ -154,8 +155,8 @@ static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid) return 0; } -int ovl_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; enum ovl_path_type type; @@ -277,7 +278,8 @@ out: return err; } -int ovl_permission(struct inode *inode, int mask) +int ovl_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { struct inode *upperinode = ovl_inode_upper(inode); struct inode *realinode = upperinode ?: ovl_inode_lower(inode); @@ -294,7 +296,7 @@ int ovl_permission(struct inode *inode, int mask) * Check overlay inode with the creds of task and underlying inode * with creds of mounter */ - err = generic_permission(inode, mask); + err = generic_permission(&init_user_ns, inode, mask); if (err) return err; @@ -305,7 +307,7 @@ int ovl_permission(struct inode *inode, int mask) /* Make sure mounter can read file for copy up later */ mask |= MAY_READ; } - err = inode_permission(realinode, mask); + err = inode_permission(&init_user_ns, realinode, mask); revert_creds(old_cred); return err; @@ -353,7 +355,7 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, if (!value && !upperdentry) { old_cred = ovl_override_creds(dentry->d_sb); - err = vfs_getxattr(realdentry, name, NULL, 0); + err = vfs_getxattr(&init_user_ns, realdentry, name, NULL, 0); revert_creds(old_cred); if (err < 0) goto out_drop_write; @@ -369,10 +371,11 @@ int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, old_cred = ovl_override_creds(dentry->d_sb); if (value) - err = vfs_setxattr(realdentry, name, value, size, flags); + err = vfs_setxattr(&init_user_ns, realdentry, name, value, size, + flags); else { WARN_ON(flags != XATTR_REPLACE); - err = vfs_removexattr(realdentry, name); + err = vfs_removexattr(&init_user_ns, realdentry, name); } revert_creds(old_cred); @@ -394,7 +397,7 @@ int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, ovl_i_dentry_upper(inode) ?: ovl_dentry_lower(dentry); old_cred = ovl_override_creds(dentry->d_sb); - res = vfs_getxattr(realdentry, name, value, size); + res = vfs_getxattr(&init_user_ns, realdentry, name, value, size); revert_creds(old_cred); return res; } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index cb4e2d60ecf9..95cff83786a5 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -123,7 +123,7 @@ static inline const char *ovl_xattr(struct ovl_fs *ofs, enum ovl_xattr ox) static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry) { - int err = vfs_rmdir(dir, dentry); + int err = vfs_rmdir(&init_user_ns, dir, dentry); pr_debug("rmdir(%pd2) = %i\n", dentry, err); return err; @@ -131,7 +131,7 @@ static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry) static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry) { - int err = vfs_unlink(dir, dentry, NULL); + int err = vfs_unlink(&init_user_ns, dir, dentry, NULL); pr_debug("unlink(%pd2) = %i\n", dentry, err); return err; @@ -140,7 +140,7 @@ static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry) static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) { - int err = vfs_link(old_dentry, dir, new_dentry, NULL); + int err = vfs_link(old_dentry, &init_user_ns, dir, new_dentry, NULL); pr_debug("link(%pd2, %pd2) = %i\n", old_dentry, new_dentry, err); return err; @@ -149,7 +149,7 @@ static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir, static inline int ovl_do_create(struct inode *dir, struct dentry *dentry, umode_t mode) { - int err = vfs_create(dir, dentry, mode, true); + int err = vfs_create(&init_user_ns, dir, dentry, mode, true); pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err); return err; @@ -158,7 +158,7 @@ static inline int ovl_do_create(struct inode *dir, struct dentry *dentry, static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { - int err = vfs_mkdir(dir, dentry, mode); + int err = vfs_mkdir(&init_user_ns, dir, dentry, mode); pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err); return err; } @@ -166,7 +166,7 @@ static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry, static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) { - int err = vfs_mknod(dir, dentry, mode, dev); + int err = vfs_mknod(&init_user_ns, dir, dentry, mode, dev); pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", dentry, mode, dev, err); return err; @@ -175,7 +175,7 @@ static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry, static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) { - int err = vfs_symlink(dir, dentry, oldname); + int err = vfs_symlink(&init_user_ns, dir, dentry, oldname); pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err); return err; @@ -186,7 +186,7 @@ static inline ssize_t ovl_do_getxattr(struct ovl_fs *ofs, struct dentry *dentry, size_t size) { const char *name = ovl_xattr(ofs, ox); - return vfs_getxattr(dentry, name, value, size); + return vfs_getxattr(&init_user_ns, dentry, name, value, size); } static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry, @@ -194,7 +194,7 @@ static inline int ovl_do_setxattr(struct ovl_fs *ofs, struct dentry *dentry, size_t size) { const char *name = ovl_xattr(ofs, ox); - int err = vfs_setxattr(dentry, name, value, size, 0); + int err = vfs_setxattr(&init_user_ns, dentry, name, value, size, 0); pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, 0) = %i\n", dentry, name, min((int)size, 48), value, size, err); return err; @@ -204,7 +204,7 @@ static inline int ovl_do_removexattr(struct ovl_fs *ofs, struct dentry *dentry, enum ovl_xattr ox) { const char *name = ovl_xattr(ofs, ox); - int err = vfs_removexattr(dentry, name); + int err = vfs_removexattr(&init_user_ns, dentry, name); pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err); return err; } @@ -214,9 +214,18 @@ static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry, unsigned int flags) { int err; + struct renamedata rd = { + .old_mnt_userns = &init_user_ns, + .old_dir = olddir, + .old_dentry = olddentry, + .new_mnt_userns = &init_user_ns, + .new_dir = newdir, + .new_dentry = newdentry, + .flags = flags, + }; pr_debug("rename(%pd2, %pd2, 0x%x)\n", olddentry, newdentry, flags); - err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags); + err = vfs_rename(&rd); if (err) { pr_debug("...rename(%pd2, %pd2, ...) = %i\n", olddentry, newdentry, err); @@ -226,14 +235,14 @@ static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry, static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry) { - int err = vfs_whiteout(dir, dentry); + int err = vfs_whiteout(&init_user_ns, dir, dentry); pr_debug("whiteout(%pd2) = %i\n", dentry, err); return err; } static inline struct dentry *ovl_do_tmpfile(struct dentry *dentry, umode_t mode) { - struct dentry *ret = vfs_tmpfile(dentry, mode, 0); + struct dentry *ret = vfs_tmpfile(&init_user_ns, dentry, mode, 0); int err = PTR_ERR_OR_ZERO(ret); pr_debug("tmpfile(%pd2, 0%o) = %i\n", dentry, mode, err); @@ -436,10 +445,12 @@ int ovl_set_nlink_lower(struct dentry *dentry); unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry, struct dentry *upperdentry, unsigned int fallback); -int ovl_setattr(struct dentry *dentry, struct iattr *attr); -int ovl_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags); -int ovl_permission(struct inode *inode, int mask); +int ovl_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr); +int ovl_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags); +int ovl_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask); int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags); int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name, diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index d58b8f2bf9d0..fdd72f1a9c5e 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -803,17 +803,19 @@ retry: * allowed as upper are limited to "normal" ones, where checking * for the above two errors is sufficient. */ - err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_DEFAULT); + err = vfs_removexattr(&init_user_ns, work, + XATTR_NAME_POSIX_ACL_DEFAULT); if (err && err != -ENODATA && err != -EOPNOTSUPP) goto out_dput; - err = vfs_removexattr(work, XATTR_NAME_POSIX_ACL_ACCESS); + err = vfs_removexattr(&init_user_ns, work, + XATTR_NAME_POSIX_ACL_ACCESS); if (err && err != -ENODATA && err != -EOPNOTSUPP) goto out_dput; /* Clear any inherited mode bits */ inode_lock(work->d_inode); - err = notify_change(work, &attr, NULL); + err = notify_change(&init_user_ns, work, &attr, NULL); inode_unlock(work->d_inode); if (err) goto out_dput; @@ -865,6 +867,10 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path) pr_err("filesystem on '%s' not supported\n", name); goto out_put; } + if (mnt_user_ns(path->mnt) != &init_user_ns) { + pr_err("idmapped layers are currently not supported\n"); + goto out_put; + } if (!d_is_dir(path->dentry)) { pr_err("'%s' not a directory\n", name); goto out_put; @@ -989,6 +995,7 @@ ovl_posix_acl_xattr_get(const struct xattr_handler *handler, static int __maybe_unused ovl_posix_acl_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -1014,7 +1021,7 @@ ovl_posix_acl_xattr_set(const struct xattr_handler *handler, goto out_acl_release; } err = -EPERM; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) goto out_acl_release; posix_acl_release(acl); @@ -1026,10 +1033,10 @@ ovl_posix_acl_xattr_set(const struct xattr_handler *handler, if (unlikely(inode->i_mode & S_ISGID) && handler->flags == ACL_TYPE_ACCESS && !in_group_p(inode->i_gid) && - !capable_wrt_inode_uidgid(inode, CAP_FSETID)) { + !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) { struct iattr iattr = { .ia_valid = ATTR_KILL_SGID }; - err = ovl_setattr(dentry, &iattr); + err = ovl_setattr(&init_user_ns, dentry, &iattr); if (err) return err; } @@ -1053,6 +1060,7 @@ static int ovl_own_xattr_get(const struct xattr_handler *handler, } static int ovl_own_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -1068,6 +1076,7 @@ static int ovl_other_xattr_get(const struct xattr_handler *handler, } static int ovl_other_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 9826b003f1d2..7f5a01a11f97 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -479,12 +479,12 @@ struct file *ovl_path_open(struct path *path, int flags) BUG(); } - err = inode_permission(inode, acc_mode | MAY_OPEN); + err = inode_permission(&init_user_ns, inode, acc_mode | MAY_OPEN); if (err) return ERR_PTR(err); /* O_NOATIME is an optimization, don't fail if not permitted */ - if (inode_owner_or_capable(inode)) + if (inode_owner_or_capable(&init_user_ns, inode)) flags |= O_NOATIME; return dentry_open(path, flags, current_cred()); diff --git a/fs/pipe.c b/fs/pipe.c index 39c96845a72f..bfd946a9ad01 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -171,7 +171,7 @@ EXPORT_SYMBOL(generic_pipe_buf_try_steal); * * Description: * This function grabs an extra reference to @buf. It's used in - * in the tee() system call, when we duplicate the buffers in one + * the tee() system call, when we duplicate the buffers in one * pipe into another. */ bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 95882b3f5f62..f3309a7edb49 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -345,10 +345,13 @@ EXPORT_SYMBOL(posix_acl_from_mode); * by the acl. Returns -E... otherwise. */ int -posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want) +posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, + const struct posix_acl *acl, int want) { const struct posix_acl_entry *pa, *pe, *mask_obj; int found = 0; + kuid_t uid; + kgid_t gid; want &= MAY_READ | MAY_WRITE | MAY_EXEC; @@ -356,22 +359,26 @@ posix_acl_permission(struct inode *inode, const struct posix_acl *acl, int want) switch(pa->e_tag) { case ACL_USER_OBJ: /* (May have been checked already) */ - if (uid_eq(inode->i_uid, current_fsuid())) + uid = i_uid_into_mnt(mnt_userns, inode); + if (uid_eq(uid, current_fsuid())) goto check_perm; break; case ACL_USER: - if (uid_eq(pa->e_uid, current_fsuid())) + uid = kuid_into_mnt(mnt_userns, pa->e_uid); + if (uid_eq(uid, current_fsuid())) goto mask; break; case ACL_GROUP_OBJ: - if (in_group_p(inode->i_gid)) { + gid = i_gid_into_mnt(mnt_userns, inode); + if (in_group_p(gid)) { found = 1; if ((pa->e_perm & want) == want) goto mask; } break; case ACL_GROUP: - if (in_group_p(pa->e_gid)) { + gid = kgid_into_mnt(mnt_userns, pa->e_gid); + if (in_group_p(gid)) { found = 1; if ((pa->e_perm & want) == want) goto mask; @@ -551,8 +558,22 @@ __posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode) } EXPORT_SYMBOL(__posix_acl_chmod); +/** + * posix_acl_chmod - chmod a posix acl + * + * @mnt_userns: user namespace of the mount @inode was found from + * @inode: inode to check permissions on + * @mode: the new mode of @inode + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then + * take care to map the inode according to @mnt_userns before checking + * permissions. On non-idmapped mounts or if permission checking is to be + * performed on the raw inode simply passs init_user_ns. + */ int -posix_acl_chmod(struct inode *inode, umode_t mode) + posix_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode, + umode_t mode) { struct posix_acl *acl; int ret = 0; @@ -572,7 +593,7 @@ posix_acl_chmod(struct inode *inode, umode_t mode) ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); if (ret) return ret; - ret = inode->i_op->set_acl(inode, acl, ACL_TYPE_ACCESS); + ret = inode->i_op->set_acl(mnt_userns, inode, acl, ACL_TYPE_ACCESS); posix_acl_release(acl); return ret; } @@ -631,9 +652,10 @@ EXPORT_SYMBOL_GPL(posix_acl_create); /** * posix_acl_update_mode - update mode in set_acl - * @inode: target inode - * @mode_p: mode (pointer) for update - * @acl: acl pointer + * @mnt_userns: user namespace of the mount @inode was found from + * @inode: target inode + * @mode_p: mode (pointer) for update + * @acl: acl pointer * * Update the file mode when setting an ACL: compute the new file permission * bits based on the ACL. In addition, if the ACL is equivalent to the new @@ -642,9 +664,16 @@ EXPORT_SYMBOL_GPL(posix_acl_create); * As with chmod, clear the setgid bit if the caller is not in the owning group * or capable of CAP_FSETID (see inode_change_ok). * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then + * take care to map the inode according to @mnt_userns before checking + * permissions. On non-idmapped mounts or if permission checking is to be + * performed on the raw inode simply passs init_user_ns. + * * Called from set_acl inode operations. */ -int posix_acl_update_mode(struct inode *inode, umode_t *mode_p, +int posix_acl_update_mode(struct user_namespace *mnt_userns, + struct inode *inode, umode_t *mode_p, struct posix_acl **acl) { umode_t mode = inode->i_mode; @@ -655,8 +684,8 @@ int posix_acl_update_mode(struct inode *inode, umode_t *mode_p, return error; if (error == 0) *acl = NULL; - if (!in_group_p(inode->i_gid) && - !capable_wrt_inode_uidgid(inode, CAP_FSETID)) + if (!in_group_p(i_gid_into_mnt(mnt_userns, inode)) && + !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) mode &= ~S_ISGID; *mode_p = mode; return 0; @@ -668,7 +697,8 @@ EXPORT_SYMBOL(posix_acl_update_mode); */ static void posix_acl_fix_xattr_userns( struct user_namespace *to, struct user_namespace *from, - void *value, size_t size) + struct user_namespace *mnt_userns, + void *value, size_t size, bool from_user) { struct posix_acl_xattr_header *header = value; struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; @@ -693,10 +723,18 @@ static void posix_acl_fix_xattr_userns( switch(le16_to_cpu(entry->e_tag)) { case ACL_USER: uid = make_kuid(from, le32_to_cpu(entry->e_id)); + if (from_user) + uid = kuid_from_mnt(mnt_userns, uid); + else + uid = kuid_into_mnt(mnt_userns, uid); entry->e_id = cpu_to_le32(from_kuid(to, uid)); break; case ACL_GROUP: gid = make_kgid(from, le32_to_cpu(entry->e_id)); + if (from_user) + gid = kgid_from_mnt(mnt_userns, gid); + else + gid = kgid_into_mnt(mnt_userns, gid); entry->e_id = cpu_to_le32(from_kgid(to, gid)); break; default: @@ -705,20 +743,24 @@ static void posix_acl_fix_xattr_userns( } } -void posix_acl_fix_xattr_from_user(void *value, size_t size) +void posix_acl_fix_xattr_from_user(struct user_namespace *mnt_userns, + void *value, size_t size) { struct user_namespace *user_ns = current_user_ns(); - if (user_ns == &init_user_ns) + if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns)) return; - posix_acl_fix_xattr_userns(&init_user_ns, user_ns, value, size); + posix_acl_fix_xattr_userns(&init_user_ns, user_ns, mnt_userns, value, + size, true); } -void posix_acl_fix_xattr_to_user(void *value, size_t size) +void posix_acl_fix_xattr_to_user(struct user_namespace *mnt_userns, + void *value, size_t size) { struct user_namespace *user_ns = current_user_ns(); - if (user_ns == &init_user_ns) + if ((user_ns == &init_user_ns) && (mnt_userns == &init_user_ns)) return; - posix_acl_fix_xattr_userns(user_ns, &init_user_ns, value, size); + posix_acl_fix_xattr_userns(user_ns, &init_user_ns, mnt_userns, value, + size, false); } /* @@ -858,7 +900,8 @@ posix_acl_xattr_get(const struct xattr_handler *handler, } int -set_posix_acl(struct inode *inode, int type, struct posix_acl *acl) +set_posix_acl(struct user_namespace *mnt_userns, struct inode *inode, + int type, struct posix_acl *acl) { if (!IS_POSIXACL(inode)) return -EOPNOTSUPP; @@ -867,7 +910,7 @@ set_posix_acl(struct inode *inode, int type, struct posix_acl *acl) if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(mnt_userns, inode)) return -EPERM; if (acl) { @@ -875,15 +918,16 @@ set_posix_acl(struct inode *inode, int type, struct posix_acl *acl) if (ret) return ret; } - return inode->i_op->set_acl(inode, acl, type); + return inode->i_op->set_acl(mnt_userns, inode, acl, type); } EXPORT_SYMBOL(set_posix_acl); static int posix_acl_xattr_set(const struct xattr_handler *handler, - struct dentry *unused, struct inode *inode, - const char *name, const void *value, - size_t size, int flags) + struct user_namespace *mnt_userns, + struct dentry *unused, struct inode *inode, + const char *name, const void *value, size_t size, + int flags) { struct posix_acl *acl = NULL; int ret; @@ -893,7 +937,7 @@ posix_acl_xattr_set(const struct xattr_handler *handler, if (IS_ERR(acl)) return PTR_ERR(acl); } - ret = set_posix_acl(inode, handler->flags, acl); + ret = set_posix_acl(mnt_userns, inode, handler->flags, acl); posix_acl_release(acl); return ret; } @@ -922,12 +966,13 @@ const struct xattr_handler posix_acl_default_xattr_handler = { }; EXPORT_SYMBOL_GPL(posix_acl_default_xattr_handler); -int simple_set_acl(struct inode *inode, struct posix_acl *acl, int type) +int simple_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { int error; if (type == ACL_TYPE_ACCESS) { - error = posix_acl_update_mode(inode, + error = posix_acl_update_mode(mnt_userns, inode, &inode->i_mode, &acl); if (error) return error; diff --git a/fs/proc/base.c b/fs/proc/base.c index b3422cda2a91..3851bfcdba56 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -67,7 +67,6 @@ #include <linux/mm.h> #include <linux/swap.h> #include <linux/rcupdate.h> -#include <linux/kallsyms.h> #include <linux/stacktrace.h> #include <linux/resource.h> #include <linux/module.h> @@ -386,19 +385,17 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long wchan; - char symname[KSYM_NAME_LEN]; - if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) - goto print0; + if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) + wchan = get_wchan(task); + else + wchan = 0; - wchan = get_wchan(task); - if (wchan && !lookup_symbol_name(wchan, symname)) { - seq_puts(m, symname); - return 0; - } + if (wchan) + seq_printf(m, "%ps", (void *) wchan); + else + seq_putc(m, '0'); -print0: - seq_putc(m, '0'); return 0; } #endif /* CONFIG_KALLSYMS */ @@ -685,7 +682,8 @@ static int proc_fd_access_allowed(struct inode *inode) return allowed; } -int proc_setattr(struct dentry *dentry, struct iattr *attr) +int proc_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { int error; struct inode *inode = d_inode(dentry); @@ -693,11 +691,11 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_MODE) return -EPERM; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) return error; - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); return 0; } @@ -726,7 +724,8 @@ static bool has_pid_permissions(struct proc_fs_info *fs_info, } -static int proc_pid_permission(struct inode *inode, int mask) +static int proc_pid_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); struct task_struct *task; @@ -751,7 +750,7 @@ static int proc_pid_permission(struct inode *inode, int mask) return -EPERM; } - return generic_permission(inode, mask); + return generic_permission(&init_user_ns, inode, mask); } @@ -1927,14 +1926,14 @@ out_unlock: return NULL; } -int pid_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int query_flags) +int pid_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); struct task_struct *task; - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); stat->uid = GLOBAL_ROOT_UID; stat->gid = GLOBAL_ROOT_GID; @@ -3473,7 +3472,8 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) * This function makes sure that the node is always accessible for members of * same thread group. */ -static int proc_tid_comm_permission(struct inode *inode, int mask) +static int proc_tid_comm_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { bool is_same_tgroup; struct task_struct *task; @@ -3492,7 +3492,7 @@ static int proc_tid_comm_permission(struct inode *inode, int mask) return 0; } - return generic_permission(inode, mask); + return generic_permission(&init_user_ns, inode, mask); } static const struct inode_operations proc_tid_comm_inode_operations = { @@ -3798,12 +3798,13 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) return 0; } -static int proc_task_getattr(const struct path *path, struct kstat *stat, +static int proc_task_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct task_struct *p = get_proc_task(inode); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); if (p) { stat->nlink += get_nr_threads(p); diff --git a/fs/proc/fd.c b/fs/proc/fd.c index cb51763ed554..07fc4fad2602 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -276,12 +276,13 @@ static struct dentry *proc_lookupfd(struct inode *dir, struct dentry *dentry, * /proc/pid/fd needs a special permission handler so that a process can still * access /proc/self/fd after it has executed a setuid(). */ -int proc_fd_permission(struct inode *inode, int mask) +int proc_fd_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { struct task_struct *p; int rv; - rv = generic_permission(inode, mask); + rv = generic_permission(&init_user_ns, inode, mask); if (rv == 0) return rv; diff --git a/fs/proc/fd.h b/fs/proc/fd.h index f371a602bf58..c5a921a06a0b 100644 --- a/fs/proc/fd.h +++ b/fs/proc/fd.h @@ -10,7 +10,8 @@ extern const struct inode_operations proc_fd_inode_operations; extern const struct file_operations proc_fdinfo_operations; extern const struct inode_operations proc_fdinfo_inode_operations; -extern int proc_fd_permission(struct inode *inode, int mask); +extern int proc_fd_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask); static inline unsigned int proc_fd(struct inode *inode) { diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 6c0a05f55d6b..bc86aa87cc41 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -115,17 +115,18 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir, return true; } -static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) +static int proc_notify_change(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); struct proc_dir_entry *de = PDE(inode); int error; - error = setattr_prepare(dentry, iattr); + error = setattr_prepare(&init_user_ns, dentry, iattr); if (error) return error; - setattr_copy(inode, iattr); + setattr_copy(&init_user_ns, inode, iattr); mark_inode_dirty(inode); proc_set_user(de, inode->i_uid, inode->i_gid); @@ -133,7 +134,8 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) return 0; } -static int proc_getattr(const struct path *path, struct kstat *stat, +static int proc_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); @@ -145,7 +147,7 @@ static int proc_getattr(const struct path *path, struct kstat *stat, } } - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); return 0; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index f60b379dcdc7..03415f3fb3a8 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -162,8 +162,10 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, * base.c */ extern const struct dentry_operations pid_dentry_operations; -extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int); -extern int proc_setattr(struct dentry *, struct iattr *); +extern int pid_getattr(struct user_namespace *, const struct path *, + struct kstat *, u32, unsigned int); +extern int proc_setattr(struct user_namespace *, struct dentry *, + struct iattr *); extern void proc_pid_evict_inode(struct proc_inode *); extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); extern void pid_update_inode(struct task_struct *, struct inode *); diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index d6fc74619625..6fa761c9cc78 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -129,15 +129,15 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_TRANSPARENT_HUGEPAGE show_val_kb(m, "AnonHugePages: ", - global_node_page_state(NR_ANON_THPS) * HPAGE_PMD_NR); + global_node_page_state(NR_ANON_THPS)); show_val_kb(m, "ShmemHugePages: ", - global_node_page_state(NR_SHMEM_THPS) * HPAGE_PMD_NR); + global_node_page_state(NR_SHMEM_THPS)); show_val_kb(m, "ShmemPmdMapped: ", - global_node_page_state(NR_SHMEM_PMDMAPPED) * HPAGE_PMD_NR); + global_node_page_state(NR_SHMEM_PMDMAPPED)); show_val_kb(m, "FileHugePages: ", - global_node_page_state(NR_FILE_THPS) * HPAGE_PMD_NR); + global_node_page_state(NR_FILE_THPS)); show_val_kb(m, "FilePmdMapped: ", - global_node_page_state(NR_FILE_PMDMAPPED) * HPAGE_PMD_NR); + global_node_page_state(NR_FILE_PMDMAPPED)); #endif #ifdef CONFIG_CMA diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 18601042af99..15c2e55d2ed2 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -289,7 +289,8 @@ static struct dentry *proc_tgid_net_lookup(struct inode *dir, return de; } -static int proc_tgid_net_getattr(const struct path *path, struct kstat *stat, +static int proc_tgid_net_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); @@ -297,7 +298,7 @@ static int proc_tgid_net_getattr(const struct path *path, struct kstat *stat, net = get_proc_task_net(inode); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); if (net != NULL) { stat->nlink = net->proc_net->nlink; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index d2018f70d1fa..984e42f8cb11 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -571,7 +571,7 @@ static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter, error = -ENOMEM; if (count >= KMALLOC_MAX_SIZE) goto out; - kbuf = kzalloc(count + 1, GFP_KERNEL); + kbuf = kvzalloc(count + 1, GFP_KERNEL); if (!kbuf) goto out; @@ -600,7 +600,7 @@ static ssize_t proc_sys_call_handler(struct kiocb *iocb, struct iov_iter *iter, error = count; out_free_buf: - kfree(kbuf); + kvfree(kbuf); out: sysctl_head_finish(head); @@ -785,7 +785,8 @@ out: return 0; } -static int proc_sys_permission(struct inode *inode, int mask) +static int proc_sys_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask) { /* * sysctl entries that are not writeable, @@ -813,7 +814,8 @@ static int proc_sys_permission(struct inode *inode, int mask) return error; } -static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr) +static int proc_sys_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; @@ -821,16 +823,17 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) return -EPERM; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) return error; - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); return 0; } -static int proc_sys_getattr(const struct path *path, struct kstat *stat, +static int proc_sys_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); @@ -840,7 +843,7 @@ static int proc_sys_getattr(const struct path *path, struct kstat *stat, if (IS_ERR(head)) return PTR_ERR(head); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); if (table) stat->mode = (stat->mode & S_IFMT) | table->mode; diff --git a/fs/proc/root.c b/fs/proc/root.c index 5e444d4f9717..c7e3b1350ef8 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -308,10 +308,11 @@ void __init proc_root_init(void) register_filesystem(&proc_fs_type); } -static int proc_root_getattr(const struct path *path, struct kstat *stat, +static int proc_root_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { - generic_fillattr(d_inode(path->dentry), stat); + generic_fillattr(&init_user_ns, d_inode(path->dentry), stat); stat->nlink = proc_root.nlink + nr_processes(); return 0; } diff --git a/fs/proc/self.c b/fs/proc/self.c index a4012154e109..72cd69bcaf4a 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -16,13 +16,6 @@ static const char *proc_self_get_link(struct dentry *dentry, pid_t tgid = task_tgid_nr_ns(current, ns); char *name; - /* - * Not currently supported. Once we can inherit all of struct pid, - * we can allow this. - */ - if (current->flags & PF_IO_WORKER) - return ERR_PTR(-EOPNOTSUPP); - if (!tgid) return ERR_PTR(-ENOENT); /* max length of unsigned int in decimal + NULL term */ diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index d56681d86d28..a553273fbd41 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -17,13 +17,6 @@ static const char *proc_thread_self_get_link(struct dentry *dentry, pid_t pid = task_pid_nr_ns(current, ns); char *name; - /* - * Not currently supported. Once we can inherit all of struct pid, - * we can allow this. - */ - if (current->flags & PF_IO_WORKER) - return ERR_PTR(-EOPNOTSUPP); - if (!pid) return ERR_PTR(-ENOENT); name = kmalloc(10 + 6 + 10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index c3a345c28a93..9a15334da208 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -1503,11 +1503,8 @@ int vmcore_add_device_dump(struct vmcoredd_data *data) return 0; out_err: - if (buf) - vfree(buf); - - if (dump) - vfree(dump); + vfree(buf); + vfree(dump); return ret; } diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index eafb75755fa3..392ef5162655 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -79,6 +79,9 @@ static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) if (mnt->mnt_flags & fs_infop->flag) seq_puts(m, fs_infop->str); } + + if (mnt_user_ns(mnt) != &init_user_ns) + seq_puts(m, ",idmapped"); } static inline void mangle(struct seq_file *m, const char *s) diff --git a/fs/pstore/zone.c b/fs/pstore/zone.c index 5266ccbec007..7c8f8feac6c3 100644 --- a/fs/pstore/zone.c +++ b/fs/pstore/zone.c @@ -23,7 +23,7 @@ #include "internal.h" /** - * struct psz_head - header of zone to flush to storage + * struct psz_buffer - header of zone to flush to storage * * @sig: signature to indicate header (PSZ_SIG xor PSZONE-type value) * @datalen: length of data in @data diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index c21106557a37..b1467f3921c2 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -164,19 +164,24 @@ static int v2_read_file_info(struct super_block *sb, int type) quota_error(sb, "Number of blocks too big for quota file size (%llu > %llu).", (loff_t)qinfo->dqi_blocks << qinfo->dqi_blocksize_bits, i_size_read(sb_dqopt(sb)->files[type])); - goto out; + goto out_free; } if (qinfo->dqi_free_blk >= qinfo->dqi_blocks) { quota_error(sb, "Free block number too big (%u >= %u).", qinfo->dqi_free_blk, qinfo->dqi_blocks); - goto out; + goto out_free; } if (qinfo->dqi_free_entry >= qinfo->dqi_blocks) { quota_error(sb, "Block with free entry too big (%u >= %u).", qinfo->dqi_free_entry, qinfo->dqi_blocks); - goto out; + goto out_free; } ret = 0; +out_free: + if (ret) { + kfree(info->dqi_priv); + info->dqi_priv = NULL; + } out: up_read(&dqopt->dqio_sem); return ret; diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 355523f4a4bf..ba3525ccc27e 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -22,7 +22,7 @@ #include <linux/uaccess.h> #include "internal.h" -static int ramfs_nommu_setattr(struct dentry *, struct iattr *); +static int ramfs_nommu_setattr(struct user_namespace *, struct dentry *, struct iattr *); static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, @@ -158,14 +158,15 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size) * handle a change of attributes * - we're specifically interested in a change of size */ -static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia) +static int ramfs_nommu_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *ia) { struct inode *inode = d_inode(dentry); unsigned int old_ia_valid = ia->ia_valid; int ret = 0; /* POSIX UID/GID verification for setting inode attributes */ - ret = setattr_prepare(dentry, ia); + ret = setattr_prepare(&init_user_ns, dentry, ia); if (ret) return ret; @@ -185,7 +186,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia) } } - setattr_copy(inode, ia); + setattr_copy(&init_user_ns, inode, ia); out: ia->ia_valid = old_ia_valid; return ret; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index ee179a81b3da..9ebd17d7befb 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -67,7 +67,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, if (inode) { inode->i_ino = get_next_ino(); - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); inode->i_mapping->a_ops = &ramfs_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); @@ -101,7 +101,8 @@ struct inode *ramfs_get_inode(struct super_block *sb, */ /* SMP-safe */ static int -ramfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) +ramfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t dev) { struct inode * inode = ramfs_get_inode(dir->i_sb, dir, mode, dev); int error = -ENOSPC; @@ -115,20 +116,23 @@ ramfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) return error; } -static int ramfs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) +static int ramfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { - int retval = ramfs_mknod(dir, dentry, mode | S_IFDIR, 0); + int retval = ramfs_mknod(&init_user_ns, dir, dentry, mode | S_IFDIR, 0); if (!retval) inc_nlink(dir); return retval; } -static int ramfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) +static int ramfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { - return ramfs_mknod(dir, dentry, mode | S_IFREG, 0); + return ramfs_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0); } -static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char * symname) +static int ramfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { struct inode *inode; int error = -ENOSPC; @@ -147,6 +151,18 @@ static int ramfs_symlink(struct inode * dir, struct dentry *dentry, const char * return error; } +static int ramfs_tmpfile(struct user_namespace *mnt_userns, + struct inode *dir, struct dentry *dentry, umode_t mode) +{ + struct inode *inode; + + inode = ramfs_get_inode(dir->i_sb, dir, mode, 0); + if (!inode) + return -ENOSPC; + d_tmpfile(dentry, inode); + return 0; +} + static const struct inode_operations ramfs_dir_inode_operations = { .create = ramfs_create, .lookup = simple_lookup, @@ -157,6 +173,7 @@ static const struct inode_operations ramfs_dir_inode_operations = { .rmdir = simple_rmdir, .mknod = ramfs_mknod, .rename = simple_rename, + .tmpfile = ramfs_tmpfile, }; /* diff --git a/fs/reiserfs/acl.h b/fs/reiserfs/acl.h index 0c1c847f992f..fd58618da360 100644 --- a/fs/reiserfs/acl.h +++ b/fs/reiserfs/acl.h @@ -49,7 +49,8 @@ static inline int reiserfs_acl_count(size_t size) #ifdef CONFIG_REISERFS_FS_POSIX_ACL struct posix_acl *reiserfs_get_acl(struct inode *inode, int type); -int reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +int reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); int reiserfs_acl_chmod(struct inode *inode); int reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th, struct inode *dir, struct dentry *dentry, diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index c76d563dec0e..780bb90c1804 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -3282,13 +3282,14 @@ static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return ret; } -int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) +int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct inode *inode = d_inode(dentry); unsigned int ia_valid; int error; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) return error; @@ -3413,7 +3414,7 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) } if (!error) { - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); } diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index adb21bea3d60..4f1cbd930179 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -59,7 +59,7 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (err) break; - if (!inode_owner_or_capable(inode)) { + if (!inode_owner_or_capable(&init_user_ns, inode)) { err = -EPERM; goto setflags_out; } @@ -101,7 +101,7 @@ setflags_out: err = put_user(inode->i_generation, (int __user *)arg); break; case REISERFS_IOC_SETVERSION: - if (!inode_owner_or_capable(inode)) { + if (!inode_owner_or_capable(&init_user_ns, inode)) { err = -EPERM; break; } diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 1594687582f0..e6eb05e2b2f1 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -615,12 +615,12 @@ static int new_inode_init(struct inode *inode, struct inode *dir, umode_t mode) * the quota init calls have to know who to charge the quota to, so * we have to set uid and gid here */ - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); return dquot_initialize(inode); } -static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int reiserfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { int retval; struct inode *inode; @@ -698,8 +698,8 @@ out_failed: return retval; } -static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, - dev_t rdev) +static int reiserfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { int retval; struct inode *inode; @@ -781,7 +781,8 @@ out_failed: return retval; } -static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int reiserfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { int retval; struct inode *inode; @@ -1094,8 +1095,9 @@ out_unlink: return retval; } -static int reiserfs_symlink(struct inode *parent_dir, - struct dentry *dentry, const char *symname) +static int reiserfs_symlink(struct user_namespace *mnt_userns, + struct inode *parent_dir, struct dentry *dentry, + const char *symname) { int retval; struct inode *inode; @@ -1304,7 +1306,8 @@ static void set_ino_in_dir_entry(struct reiserfs_dir_entry *de, * one path. If it holds 2 or more, it can get into endless waiting in * get_empty_nodes or its clones */ -static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, +static int reiserfs_rename(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index f69871516167..0ca2ac62e534 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -3102,7 +3102,8 @@ static inline void reiserfs_update_sd(struct reiserfs_transaction_handle *th, } void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode); -int reiserfs_setattr(struct dentry *dentry, struct iattr *attr); +int reiserfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr); int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len); diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index fe63a7c3e0da..bd073836e141 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -66,14 +66,14 @@ static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) { BUG_ON(!inode_is_locked(dir)); - return dir->i_op->create(dir, dentry, mode, true); + return dir->i_op->create(&init_user_ns, dir, dentry, mode, true); } #endif static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { BUG_ON(!inode_is_locked(dir)); - return dir->i_op->mkdir(dir, dentry, mode); + return dir->i_op->mkdir(&init_user_ns, dir, dentry, mode); } /* @@ -352,7 +352,7 @@ static int chown_one_xattr(struct dentry *dentry, void *data) * ATTR_MODE is set. */ attrs->ia_valid &= (ATTR_UID|ATTR_GID); - err = reiserfs_setattr(dentry, attrs); + err = reiserfs_setattr(&init_user_ns, dentry, attrs); attrs->ia_valid = ia_valid; return err; @@ -604,7 +604,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, inode_lock_nested(d_inode(dentry), I_MUTEX_XATTR); inode_dio_wait(d_inode(dentry)); - err = reiserfs_setattr(dentry, &newattrs); + err = reiserfs_setattr(&init_user_ns, dentry, &newattrs); inode_unlock(d_inode(dentry)); } else update_ctime(inode); @@ -948,7 +948,8 @@ static int xattr_mount_check(struct super_block *s) return 0; } -int reiserfs_permission(struct inode *inode, int mask) +int reiserfs_permission(struct user_namespace *mnt_userns, struct inode *inode, + int mask) { /* * We don't do permission checks on the internal objects. @@ -957,7 +958,7 @@ int reiserfs_permission(struct inode *inode, int mask) if (IS_PRIVATE(inode)) return 0; - return generic_permission(inode, mask); + return generic_permission(&init_user_ns, inode, mask); } static int xattr_hide_revalidate(struct dentry *dentry, unsigned int flags) diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h index c764352447ba..9b3b06da568c 100644 --- a/fs/reiserfs/xattr.h +++ b/fs/reiserfs/xattr.h @@ -16,7 +16,8 @@ int reiserfs_xattr_init(struct super_block *sb, int mount_flags); int reiserfs_lookup_privroot(struct super_block *sb); int reiserfs_delete_xattrs(struct inode *inode); int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs); -int reiserfs_permission(struct inode *inode, int mask); +int reiserfs_permission(struct user_namespace *mnt_userns, + struct inode *inode, int mask); #ifdef CONFIG_REISERFS_FS_XATTR #define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir) diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index ccd40df6eb45..a9547144a099 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -18,7 +18,8 @@ static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th, int -reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +reiserfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { int error, error2; struct reiserfs_transaction_handle th; @@ -40,7 +41,8 @@ reiserfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) reiserfs_write_unlock(inode->i_sb); if (error == 0) { if (type == ACL_TYPE_ACCESS && acl) { - error = posix_acl_update_mode(inode, &mode, &acl); + error = posix_acl_update_mode(&init_user_ns, inode, + &mode, &acl); if (error) goto unlock; update_mode = 1; @@ -399,5 +401,5 @@ int reiserfs_acl_chmod(struct inode *inode) !reiserfs_posixacl(inode->i_sb)) return 0; - return posix_acl_chmod(inode, inode->i_mode); + return posix_acl_chmod(&init_user_ns, inode, inode->i_mode); } diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 20be9a0e5870..8965c8e5e172 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -21,7 +21,8 @@ security_get(const struct xattr_handler *handler, struct dentry *unused, } static int -security_set(const struct xattr_handler *handler, struct dentry *unused, +security_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index 5ed48da3d02b..d853cea2afcd 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -20,7 +20,8 @@ trusted_get(const struct xattr_handler *handler, struct dentry *unused, } static int -trusted_set(const struct xattr_handler *handler, struct dentry *unused, +trusted_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index a573ca45bacc..65d9cd10a5ea 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -18,7 +18,8 @@ user_get(const struct xattr_handler *handler, struct dentry *unused, } static int -user_set(const struct xattr_handler *handler, struct dentry *unused, +user_set(const struct xattr_handler *handler, struct user_namespace *mnt_userns, + struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { diff --git a/fs/remap_range.c b/fs/remap_range.c index 77dba3a49e65..e4a5fdd7ad7b 100644 --- a/fs/remap_range.c +++ b/fs/remap_range.c @@ -432,13 +432,16 @@ EXPORT_SYMBOL(vfs_clone_file_range); /* Check whether we are allowed to dedupe the destination file */ static bool allow_file_dedupe(struct file *file) { + struct user_namespace *mnt_userns = file_mnt_user_ns(file); + struct inode *inode = file_inode(file); + if (capable(CAP_SYS_ADMIN)) return true; if (file->f_mode & FMODE_WRITE) return true; - if (uid_eq(current_fsuid(), file_inode(file)->i_uid)) + if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) return true; - if (!inode_permission(file_inode(file), MAY_WRITE)) + if (!inode_permission(mnt_userns, inode, MAY_WRITE)) return true; return false; } diff --git a/fs/seq_file.c b/fs/seq_file.c index 03a369ccd28c..cb11a34fb871 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -669,7 +669,8 @@ void seq_puts(struct seq_file *m, const char *s) EXPORT_SYMBOL(seq_puts); /** - * A helper routine for putting decimal numbers without rich format of printf(). + * seq_put_decimal_ull_width - A helper routine for putting decimal numbers + * without rich format of printf(). * only 'unsigned long long' is supported. * @m: seq_file identifying the buffer to which data should be written * @delimiter: a string which is printed before the number @@ -1044,7 +1045,7 @@ struct hlist_node *seq_hlist_next_rcu(void *v, EXPORT_SYMBOL(seq_hlist_next_rcu); /** - * seq_hlist_start_precpu - start an iteration of a percpu hlist array + * seq_hlist_start_percpu - start an iteration of a percpu hlist array * @head: pointer to percpu array of struct hlist_heads * @cpu: pointer to cpu "cursor" * @pos: start position of sequence diff --git a/fs/stat.c b/fs/stat.c index dacecdda2e79..fbc171d038aa 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -26,21 +26,29 @@ /** * generic_fillattr - Fill in the basic attributes from the inode struct - * @inode: Inode to use as the source - * @stat: Where to fill in the attributes + * @mnt_userns: user namespace of the mount the inode was found from + * @inode: Inode to use as the source + * @stat: Where to fill in the attributes * * Fill in the basic attributes in the kstat structure from data that's to be * found on the VFS inode structure. This is the default if no getattr inode * operation is supplied. + * + * If the inode has been found through an idmapped mount the user namespace of + * the vfsmount must be passed through @mnt_userns. This function will then + * take care to map the inode according to @mnt_userns before filling in the + * uid and gid filds. On non-idmapped mounts or if permission checking is to be + * performed on the raw inode simply passs init_user_ns. */ -void generic_fillattr(struct inode *inode, struct kstat *stat) +void generic_fillattr(struct user_namespace *mnt_userns, struct inode *inode, + struct kstat *stat) { stat->dev = inode->i_sb->s_dev; stat->ino = inode->i_ino; stat->mode = inode->i_mode; stat->nlink = inode->i_nlink; - stat->uid = inode->i_uid; - stat->gid = inode->i_gid; + stat->uid = i_uid_into_mnt(mnt_userns, inode); + stat->gid = i_gid_into_mnt(mnt_userns, inode); stat->rdev = inode->i_rdev; stat->size = i_size_read(inode); stat->atime = inode->i_atime; @@ -67,6 +75,7 @@ EXPORT_SYMBOL(generic_fillattr); int vfs_getattr_nosec(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { + struct user_namespace *mnt_userns; struct inode *inode = d_backing_inode(path->dentry); memset(stat, 0, sizeof(*stat)); @@ -83,11 +92,12 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, if (IS_DAX(inode)) stat->attributes |= STATX_ATTR_DAX; + mnt_userns = mnt_user_ns(path->mnt); if (inode->i_op->getattr) - return inode->i_op->getattr(path, stat, request_mask, - query_flags); + return inode->i_op->getattr(mnt_userns, path, stat, + request_mask, query_flags); - generic_fillattr(inode, stat); + generic_fillattr(mnt_userns, inode, stat); return 0; } EXPORT_SYMBOL(vfs_getattr_nosec); diff --git a/fs/super.c b/fs/super.c index 5a1f384ffc74..8c1baca35c16 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1719,12 +1719,6 @@ int freeze_super(struct super_block *sb) } EXPORT_SYMBOL(freeze_super); -/** - * thaw_super -- unlock filesystem - * @sb: the super to thaw - * - * Unlocks the filesystem and marks it writeable again after freeze_super(). - */ static int thaw_super_locked(struct super_block *sb) { int error; @@ -1760,6 +1754,12 @@ out: return 0; } +/** + * thaw_super -- unlock filesystem + * @sb: the super to thaw + * + * Unlocks the filesystem and marks it writeable again after freeze_super(). + */ int thaw_super(struct super_block *sb) { down_write(&sb->s_umount); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 96d0da65e088..9aefa7779b29 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -170,6 +170,16 @@ static int sysfs_kf_bin_mmap(struct kernfs_open_file *of, return battr->mmap(of->file, kobj, battr, vma); } +static int sysfs_kf_bin_open(struct kernfs_open_file *of) +{ + struct bin_attribute *battr = of->kn->priv; + + if (battr->mapping) + of->file->f_mapping = battr->mapping; + + return 0; +} + void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr) { struct kernfs_node *kn = kobj->sd, *tmp; @@ -241,6 +251,7 @@ static const struct kernfs_ops sysfs_bin_kfops_mmap = { .read = sysfs_kf_bin_read, .write = sysfs_kf_bin_write, .mmap = sysfs_kf_bin_mmap, + .open = sysfs_kf_bin_open, }; int sysfs_add_file_mode_ns(struct kernfs_node *parent, diff --git a/fs/sysv/file.c b/fs/sysv/file.c index 45fc79a18594..90e00124ea07 100644 --- a/fs/sysv/file.c +++ b/fs/sysv/file.c @@ -29,12 +29,13 @@ const struct file_operations sysv_file_operations = { .splice_read = generic_file_splice_read, }; -static int sysv_setattr(struct dentry *dentry, struct iattr *attr) +static int sysv_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) return error; @@ -47,7 +48,7 @@ static int sysv_setattr(struct dentry *dentry, struct iattr *attr) sysv_truncate(inode); } - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c index 6c9801986af6..50df794a3c1f 100644 --- a/fs/sysv/ialloc.c +++ b/fs/sysv/ialloc.c @@ -163,7 +163,7 @@ struct inode * sysv_new_inode(const struct inode * dir, umode_t mode) *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); fs16_add(sbi, sbi->s_sb_total_free_inodes, -1); dirty_sb(sb); - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); inode->i_ino = fs16_to_cpu(sbi, ino); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_blocks = 0; diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index bcb67b0cabe7..8b2e99b7bc9f 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -441,11 +441,11 @@ static unsigned sysv_nblocks(struct super_block *s, loff_t size) return blocks; } -int sysv_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +int sysv_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) { struct super_block *s = path->dentry->d_sb; - generic_fillattr(d_inode(path->dentry), stat); + generic_fillattr(&init_user_ns, d_inode(path->dentry), stat); stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size); stat->blksize = s->s_blocksize; return 0; diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index ea2414b385ec..b2e6abc06a2d 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -41,7 +41,8 @@ static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, un return d_splice_alias(inode, dentry); } -static int sysv_mknod(struct inode * dir, struct dentry * dentry, umode_t mode, dev_t rdev) +static int sysv_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode * inode; int err; @@ -60,13 +61,14 @@ static int sysv_mknod(struct inode * dir, struct dentry * dentry, umode_t mode, return err; } -static int sysv_create(struct inode * dir, struct dentry * dentry, umode_t mode, bool excl) +static int sysv_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { - return sysv_mknod(dir, dentry, mode, 0); + return sysv_mknod(&init_user_ns, dir, dentry, mode, 0); } -static int sysv_symlink(struct inode * dir, struct dentry * dentry, - const char * symname) +static int sysv_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { int err = -ENAMETOOLONG; int l = strlen(symname)+1; @@ -108,7 +110,8 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir, return add_nondir(dentry, inode); } -static int sysv_mkdir(struct inode * dir, struct dentry *dentry, umode_t mode) +static int sysv_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode * inode; int err; @@ -186,9 +189,9 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry) * Anybody can rename anything with this: the permission checks are left to the * higher-level routines. */ -static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry, - struct inode * new_dir, struct dentry * new_dentry, - unsigned int flags) +static int sysv_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { struct inode * old_inode = d_inode(old_dentry); struct inode * new_inode = d_inode(new_dentry); diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index 1cff585526b1..99ddf033da4f 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -141,7 +141,8 @@ extern struct inode *sysv_iget(struct super_block *, unsigned int); extern int sysv_write_inode(struct inode *, struct writeback_control *wbc); extern int sysv_sync_inode(struct inode *); extern void sysv_set_inode(struct inode *, dev_t); -extern int sysv_getattr(const struct path *, struct kstat *, u32, unsigned int); +extern int sysv_getattr(struct user_namespace *, const struct path *, + struct kstat *, u32, unsigned int); extern int sysv_init_icache(void); extern void sysv_destroy_icache(void); diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 0ee8c6dfb036..4b83cbded559 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -67,7 +67,9 @@ static char *get_dname(struct dentry *dentry) return name; } -static int tracefs_syscall_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode) +static int tracefs_syscall_mkdir(struct user_namespace *mnt_userns, + struct inode *inode, struct dentry *dentry, + umode_t mode) { char *name; int ret; diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 9a6b8660425a..d9d8d7794eff 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -94,7 +94,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, */ inode->i_flags |= S_NOCMTIME; - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_mapping->nrpages = 0; @@ -280,8 +280,8 @@ static int ubifs_prepare_create(struct inode *dir, struct dentry *dentry, return fscrypt_setup_filename(dir, &dentry->d_name, 0, nm); } -static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int ubifs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; struct ubifs_info *c = dir->i_sb->s_fs_info; @@ -441,8 +441,8 @@ out_budg: return err; } -static int ubifs_tmpfile(struct inode *dir, struct dentry *dentry, - umode_t mode) +static int ubifs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { return do_tmpfile(dir, dentry, mode, NULL); } @@ -942,7 +942,8 @@ out_fname: return err; } -static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int ubifs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; struct ubifs_inode *dir_ui = ubifs_inode(dir); @@ -1013,8 +1014,8 @@ out_budg: return err; } -static int ubifs_mknod(struct inode *dir, struct dentry *dentry, - umode_t mode, dev_t rdev) +static int ubifs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; struct ubifs_inode *ui; @@ -1102,8 +1103,8 @@ out_budg: return err; } -static int ubifs_symlink(struct inode *dir, struct dentry *dentry, - const char *symname) +static int ubifs_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { struct inode *inode; struct ubifs_inode *ui; @@ -1542,7 +1543,8 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, return err; } -static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, +static int ubifs_rename(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { @@ -1566,8 +1568,8 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, return do_rename(old_dir, old_dentry, new_dir, new_dentry, flags); } -int ubifs_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +int ubifs_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *stat, u32 request_mask, unsigned int flags) { loff_t size; struct inode *inode = d_inode(path->dentry); @@ -1589,7 +1591,7 @@ int ubifs_getattr(const struct path *path, struct kstat *stat, STATX_ATTR_ENCRYPTED | STATX_ATTR_IMMUTABLE); - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); stat->blksize = UBIFS_BLOCK_SIZE; stat->size = ui->ui_size; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 2bc7780d2963..0e4b4be3aa26 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1257,7 +1257,8 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, return err; } -int ubifs_setattr(struct dentry *dentry, struct iattr *attr) +int ubifs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { int err; struct inode *inode = d_inode(dentry); @@ -1265,7 +1266,7 @@ int ubifs_setattr(struct dentry *dentry, struct iattr *attr) dbg_gen("ino %lu, mode %#x, ia_valid %#x", inode->i_ino, inode->i_mode, attr->ia_valid); - err = setattr_prepare(dentry, attr); + err = setattr_prepare(&init_user_ns, dentry, attr); if (err) return err; diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 4363d85a3fd4..2326d5122beb 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -155,7 +155,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (IS_RDONLY(inode)) return -EROFS; - if (!inode_owner_or_capable(inode)) + if (!inode_owner_or_capable(&init_user_ns, inode)) return -EACCES; if (get_user(flags, (int __user *) arg)) diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index fc2cdde3b549..7fdfdbda4b8a 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1989,13 +1989,14 @@ int ubifs_calc_dark(const struct ubifs_info *c, int spc); /* file.c */ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync); -int ubifs_setattr(struct dentry *dentry, struct iattr *attr); +int ubifs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr); int ubifs_update_time(struct inode *inode, struct timespec64 *time, int flags); /* dir.c */ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, umode_t mode); -int ubifs_getattr(const struct path *path, struct kstat *stat, +int ubifs_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); int ubifs_check_dir_empty(struct inode *dir); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 842d5f14545d..6b1e9830b274 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -681,6 +681,7 @@ static int xattr_get(const struct xattr_handler *handler, } static int xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) diff --git a/fs/udf/file.c b/fs/udf/file.c index ad8eefad27d7..2846dcd92197 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -183,7 +183,7 @@ long udf_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) long old_block, new_block; int result; - if (inode_permission(inode, MAY_READ) != 0) { + if (file_permission(filp, MAY_READ) != 0) { udf_debug("no permission to access inode %lu\n", inode->i_ino); return -EPERM; } @@ -253,13 +253,14 @@ const struct file_operations udf_file_operations = { .llseek = generic_file_llseek, }; -static int udf_setattr(struct dentry *dentry, struct iattr *attr) +static int udf_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; int error; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) return error; @@ -282,7 +283,7 @@ static int udf_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_MODE) udf_update_extra_perms(inode, attr->ia_mode); - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 84ed23edebfd..2ecf0e87660e 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -103,7 +103,7 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode) mutex_unlock(&sbi->s_alloc_mutex); } - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); if (UDF_QUERY_FLAG(sb, UDF_FLAG_UID_SET)) inode->i_uid = sbi->s_uid; if (UDF_QUERY_FLAG(sb, UDF_FLAG_GID_SET)) diff --git a/fs/udf/inode.c b/fs/udf/inode.c index bb89c3e43212..0dd2f93ac048 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -544,11 +544,14 @@ static int udf_do_extend_file(struct inode *inode, udf_write_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); + /* - * We've rewritten the last extent but there may be empty - * indirect extent after it - enter it. + * We've rewritten the last extent. If we are going to add + * more extents, we may need to enter possible following + * empty indirect extent. */ - udf_next_aext(inode, last_pos, &tmploc, &tmplen, 0); + if (new_block_bytes || prealloc_len) + udf_next_aext(inode, last_pos, &tmploc, &tmplen, 0); } /* Managed to do everything necessary? */ diff --git a/fs/udf/namei.c b/fs/udf/namei.c index e169d8fe35b5..f146b3089f3d 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -604,8 +604,8 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode) return 0; } -static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int udf_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode = udf_new_inode(dir, mode); @@ -623,7 +623,8 @@ static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode, return udf_add_nondir(dentry, inode); } -static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +static int udf_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode = udf_new_inode(dir, mode); @@ -642,8 +643,8 @@ static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) return 0; } -static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, - dev_t rdev) +static int udf_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; @@ -658,7 +659,8 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, return udf_add_nondir(dentry, inode); } -static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int udf_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; struct udf_fileident_bh fibh; @@ -877,8 +879,8 @@ out: return retval; } -static int udf_symlink(struct inode *dir, struct dentry *dentry, - const char *symname) +static int udf_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { struct inode *inode = udf_new_inode(dir, S_IFLNK | 0777); struct pathComponent *pc; @@ -1065,9 +1067,9 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, /* Anybody can rename anything with this: the permission checks are left to the * higher-level routines. */ -static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int udf_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); diff --git a/fs/udf/super.c b/fs/udf/super.c index d0df217f4712..2f83c1204e20 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -459,6 +459,7 @@ static int udf_parse_options(char *options, struct udf_options *uopt, { char *p; int option; + unsigned int uv; uopt->novrs = 0; uopt->session = 0xFFFFFFFF; @@ -508,17 +509,17 @@ static int udf_parse_options(char *options, struct udf_options *uopt, uopt->flags &= ~(1 << UDF_FLAG_USE_SHORT_AD); break; case Opt_gid: - if (match_int(args, &option)) + if (match_uint(args, &uv)) return 0; - uopt->gid = make_kgid(current_user_ns(), option); + uopt->gid = make_kgid(current_user_ns(), uv); if (!gid_valid(uopt->gid)) return 0; uopt->flags |= (1 << UDF_FLAG_GID_SET); break; case Opt_uid: - if (match_int(args, &option)) + if (match_uint(args, &uv)) return 0; - uopt->uid = make_kuid(current_user_ns(), option); + uopt->uid = make_kuid(current_user_ns(), uv); if (!uid_valid(uopt->uid)) return 0; uopt->flags |= (1 << UDF_FLAG_UID_SET); diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index c973db239604..9b223421a3c5 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -152,14 +152,15 @@ out_unmap: return err; } -static int udf_symlink_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +static int udf_symlink_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; struct inode *inode = d_backing_inode(dentry); struct page *page; - generic_fillattr(inode, stat); + generic_fillattr(&init_user_ns, inode, stat); page = read_mapping_page(inode->i_mapping, 0, NULL); if (IS_ERR(page)) return PTR_ERR(page); diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 969fd60436d3..7e3e08c0166f 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -289,7 +289,7 @@ cg_found: ufs_mark_sb_dirty(sb); inode->i_ino = cg * uspi->s_ipg + bit; - inode_init_owner(inode, dir, mode); + inode_init_owner(&init_user_ns, inode, dir, mode); inode->i_blocks = 0; inode->i_generation = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index c843ec858cf7..debc282c1bb4 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1211,13 +1211,14 @@ out: return err; } -int ufs_setattr(struct dentry *dentry, struct iattr *attr) +int ufs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr) { struct inode *inode = d_inode(dentry); unsigned int ia_valid = attr->ia_valid; int error; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) return error; @@ -1227,7 +1228,7 @@ int ufs_setattr(struct dentry *dentry, struct iattr *attr) return error; } - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); mark_inode_dirty(inode); return 0; } diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 9ef40f100415..29d5a0e0c8f0 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -69,7 +69,8 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, unsi * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int ufs_create (struct inode * dir, struct dentry * dentry, umode_t mode, +static int ufs_create (struct user_namespace * mnt_userns, + struct inode * dir, struct dentry * dentry, umode_t mode, bool excl) { struct inode *inode; @@ -85,7 +86,8 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, umode_t mode, return ufs_add_nondir(dentry, inode); } -static int ufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) +static int ufs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; int err; @@ -104,8 +106,8 @@ static int ufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev return err; } -static int ufs_symlink (struct inode * dir, struct dentry * dentry, - const char * symname) +static int ufs_symlink (struct user_namespace * mnt_userns, struct inode * dir, + struct dentry * dentry, const char * symname) { struct super_block * sb = dir->i_sb; int err; @@ -164,7 +166,8 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, return error; } -static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) +static int ufs_mkdir(struct user_namespace * mnt_userns, struct inode * dir, + struct dentry * dentry, umode_t mode) { struct inode * inode; int err; @@ -240,9 +243,9 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry) return err; } -static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) +static int ufs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + struct dentry *old_dentry, struct inode *new_dir, + struct dentry *new_dentry, unsigned int flags) { struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index b49e0efdf3d7..550f7c5a3636 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -123,7 +123,8 @@ extern struct inode *ufs_iget(struct super_block *, unsigned long); extern int ufs_write_inode (struct inode *, struct writeback_control *); extern int ufs_sync_inode (struct inode *); extern void ufs_evict_inode (struct inode *); -extern int ufs_setattr(struct dentry *dentry, struct iattr *attr); +extern int ufs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *attr); /* namei.c */ extern const struct file_operations ufs_dir_operations; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 894cc28142e7..0be8cdd4425a 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -979,14 +979,14 @@ static __poll_t userfaultfd_poll(struct file *file, poll_table *wait) static const struct file_operations userfaultfd_fops; -static int resolve_userfault_fork(struct userfaultfd_ctx *ctx, - struct userfaultfd_ctx *new, +static int resolve_userfault_fork(struct userfaultfd_ctx *new, + struct inode *inode, struct uffd_msg *msg) { int fd; - fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, new, - O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS)); + fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new, + O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode); if (fd < 0) return fd; @@ -996,7 +996,7 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *ctx, } static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, - struct uffd_msg *msg) + struct uffd_msg *msg, struct inode *inode) { ssize_t ret; DECLARE_WAITQUEUE(wait, current); @@ -1107,7 +1107,7 @@ static ssize_t userfaultfd_ctx_read(struct userfaultfd_ctx *ctx, int no_wait, spin_unlock_irq(&ctx->fd_wqh.lock); if (!ret && msg->event == UFFD_EVENT_FORK) { - ret = resolve_userfault_fork(ctx, fork_nctx, msg); + ret = resolve_userfault_fork(fork_nctx, inode, msg); spin_lock_irq(&ctx->event_wqh.lock); if (!list_empty(&fork_event)) { /* @@ -1167,6 +1167,7 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf, ssize_t _ret, ret = 0; struct uffd_msg msg; int no_wait = file->f_flags & O_NONBLOCK; + struct inode *inode = file_inode(file); if (ctx->state == UFFD_STATE_WAIT_API) return -EINVAL; @@ -1174,7 +1175,7 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf, for (;;) { if (count < sizeof(msg)) return ret ? ret : -EINVAL; - _ret = userfaultfd_ctx_read(ctx, no_wait, &msg); + _ret = userfaultfd_ctx_read(ctx, no_wait, &msg, inode); if (_ret < 0) return ret ? ret : _ret; if (copy_to_user((__u64 __user *) buf, &msg, sizeof(msg))) @@ -1999,8 +2000,8 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) /* prevent the mm struct to be freed */ mmgrab(ctx->mm); - fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, ctx, - O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS)); + fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx, + O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL); if (fd < 0) { mmdrop(ctx->mm); kmem_cache_free(userfaultfd_ctx_cachep, ctx); diff --git a/fs/utimes.c b/fs/utimes.c index fd3cc4226224..39f356017635 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -62,7 +62,8 @@ int vfs_utimes(const struct path *path, struct timespec64 *times) } retry_deleg: inode_lock(inode); - error = notify_change(path->dentry, &newattrs, &delegated_inode); + error = notify_change(mnt_user_ns(path->mnt), path->dentry, &newattrs, + &delegated_inode); inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c index 4d569f14a8d8..7aee0ec63ade 100644 --- a/fs/vboxsf/dir.c +++ b/fs/vboxsf/dir.c @@ -288,13 +288,15 @@ static int vboxsf_dir_create(struct inode *parent, struct dentry *dentry, return 0; } -static int vboxsf_dir_mkfile(struct inode *parent, struct dentry *dentry, +static int vboxsf_dir_mkfile(struct user_namespace *mnt_userns, + struct inode *parent, struct dentry *dentry, umode_t mode, bool excl) { return vboxsf_dir_create(parent, dentry, mode, 0); } -static int vboxsf_dir_mkdir(struct inode *parent, struct dentry *dentry, +static int vboxsf_dir_mkdir(struct user_namespace *mnt_userns, + struct inode *parent, struct dentry *dentry, umode_t mode) { return vboxsf_dir_create(parent, dentry, mode, 1); @@ -332,7 +334,8 @@ static int vboxsf_dir_unlink(struct inode *parent, struct dentry *dentry) return 0; } -static int vboxsf_dir_rename(struct inode *old_parent, +static int vboxsf_dir_rename(struct user_namespace *mnt_userns, + struct inode *old_parent, struct dentry *old_dentry, struct inode *new_parent, struct dentry *new_dentry, @@ -374,7 +377,8 @@ err_put_old_path: return err; } -static int vboxsf_dir_symlink(struct inode *parent, struct dentry *dentry, +static int vboxsf_dir_symlink(struct user_namespace *mnt_userns, + struct inode *parent, struct dentry *dentry, const char *symname) { struct vboxsf_inode *sf_parent_i = VBOXSF_I(parent); diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c index 018057546067..3b847e3fba24 100644 --- a/fs/vboxsf/utils.c +++ b/fs/vboxsf/utils.c @@ -212,8 +212,8 @@ int vboxsf_inode_revalidate(struct dentry *dentry) return 0; } -int vboxsf_getattr(const struct path *path, struct kstat *kstat, - u32 request_mask, unsigned int flags) +int vboxsf_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *kstat, u32 request_mask, unsigned int flags) { int err; struct dentry *dentry = path->dentry; @@ -233,11 +233,12 @@ int vboxsf_getattr(const struct path *path, struct kstat *kstat, if (err) return err; - generic_fillattr(d_inode(dentry), kstat); + generic_fillattr(&init_user_ns, d_inode(dentry), kstat); return 0; } -int vboxsf_setattr(struct dentry *dentry, struct iattr *iattr) +int vboxsf_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *iattr) { struct vboxsf_inode *sf_i = VBOXSF_I(d_inode(dentry)); struct vboxsf_sbi *sbi = VBOXSF_SBI(dentry->d_sb); diff --git a/fs/vboxsf/vfsmod.h b/fs/vboxsf/vfsmod.h index 18f95b00fc33..760524e78c88 100644 --- a/fs/vboxsf/vfsmod.h +++ b/fs/vboxsf/vfsmod.h @@ -90,9 +90,11 @@ int vboxsf_stat(struct vboxsf_sbi *sbi, struct shfl_string *path, struct shfl_fsobjinfo *info); int vboxsf_stat_dentry(struct dentry *dentry, struct shfl_fsobjinfo *info); int vboxsf_inode_revalidate(struct dentry *dentry); -int vboxsf_getattr(const struct path *path, struct kstat *kstat, - u32 request_mask, unsigned int query_flags); -int vboxsf_setattr(struct dentry *dentry, struct iattr *iattr); +int vboxsf_getattr(struct user_namespace *mnt_userns, const struct path *path, + struct kstat *kstat, u32 request_mask, + unsigned int query_flags); +int vboxsf_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct iattr *iattr); struct shfl_string *vboxsf_path_from_dentry(struct vboxsf_sbi *sbi, struct dentry *dentry); int vboxsf_nlscpy(struct vboxsf_sbi *sbi, char *name, size_t name_bound_len, diff --git a/fs/verity/enable.c b/fs/verity/enable.c index f7e997a01ad0..77e159a0346b 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -369,7 +369,7 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *uarg) * has verity enabled, and to stabilize the data being hashed. */ - err = inode_permission(inode, MAY_WRITE); + err = file_permission(filp, MAY_WRITE); if (err) return err; diff --git a/fs/xattr.c b/fs/xattr.c index fd57153b1f61..b3444e06cded 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -83,7 +83,8 @@ xattr_resolve_name(struct inode *inode, const char **name) * because different namespaces have very different rules. */ static int -xattr_permission(struct inode *inode, const char *name, int mask) +xattr_permission(struct user_namespace *mnt_userns, struct inode *inode, + const char *name, int mask) { /* * We can never set or remove an extended attribute on a read-only @@ -97,7 +98,7 @@ xattr_permission(struct inode *inode, const char *name, int mask) * to be writen back improperly if their true value is * unknown to the vfs. */ - if (HAS_UNMAPPED_ID(inode)) + if (HAS_UNMAPPED_ID(mnt_userns, inode)) return -EPERM; } @@ -127,11 +128,12 @@ xattr_permission(struct inode *inode, const char *name, int mask) if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) return (mask & MAY_WRITE) ? -EPERM : -ENODATA; if (S_ISDIR(inode->i_mode) && (inode->i_mode & S_ISVTX) && - (mask & MAY_WRITE) && !inode_owner_or_capable(inode)) + (mask & MAY_WRITE) && + !inode_owner_or_capable(mnt_userns, inode)) return -EPERM; } - return inode_permission(inode, mask); + return inode_permission(mnt_userns, inode, mask); } /* @@ -162,8 +164,9 @@ xattr_supported_namespace(struct inode *inode, const char *prefix) EXPORT_SYMBOL(xattr_supported_namespace); int -__vfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name, - const void *value, size_t size, int flags) +__vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, + struct inode *inode, const char *name, const void *value, + size_t size, int flags) { const struct xattr_handler *handler; @@ -174,7 +177,8 @@ __vfs_setxattr(struct dentry *dentry, struct inode *inode, const char *name, return -EOPNOTSUPP; if (size == 0) value = ""; /* empty EA, do not remove */ - return handler->set(handler, dentry, inode, name, value, size, flags); + return handler->set(handler, mnt_userns, dentry, inode, name, value, + size, flags); } EXPORT_SYMBOL(__vfs_setxattr); @@ -182,6 +186,7 @@ EXPORT_SYMBOL(__vfs_setxattr); * __vfs_setxattr_noperm - perform setxattr operation without performing * permission checks. * + * @mnt_userns - user namespace of the mount the inode was found from * @dentry - object to perform setxattr on * @name - xattr name to set * @value - value to set @name to @@ -194,8 +199,9 @@ EXPORT_SYMBOL(__vfs_setxattr); * is executed. It also assumes that the caller will make the appropriate * permission checks. */ -int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) +int __vfs_setxattr_noperm(struct user_namespace *mnt_userns, + struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; int error = -EAGAIN; @@ -205,7 +211,8 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, if (issec) inode->i_flags &= ~S_NOSEC; if (inode->i_opflags & IOP_XATTR) { - error = __vfs_setxattr(dentry, inode, name, value, size, flags); + error = __vfs_setxattr(mnt_userns, dentry, inode, name, value, + size, flags); if (!error) { fsnotify_xattr(dentry); security_inode_post_setxattr(dentry, name, value, @@ -244,18 +251,19 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, * a delegation was broken on, NULL if none. */ int -__vfs_setxattr_locked(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, - struct inode **delegated_inode) +__vfs_setxattr_locked(struct user_namespace *mnt_userns, struct dentry *dentry, + const char *name, const void *value, size_t size, + int flags, struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; int error; - error = xattr_permission(inode, name, MAY_WRITE); + error = xattr_permission(mnt_userns, inode, name, MAY_WRITE); if (error) return error; - error = security_inode_setxattr(dentry, name, value, size, flags); + error = security_inode_setxattr(mnt_userns, dentry, name, value, size, + flags); if (error) goto out; @@ -263,7 +271,8 @@ __vfs_setxattr_locked(struct dentry *dentry, const char *name, if (error) goto out; - error = __vfs_setxattr_noperm(dentry, name, value, size, flags); + error = __vfs_setxattr_noperm(mnt_userns, dentry, name, value, + size, flags); out: return error; @@ -271,8 +280,8 @@ out: EXPORT_SYMBOL_GPL(__vfs_setxattr_locked); int -vfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags) +vfs_setxattr(struct user_namespace *mnt_userns, struct dentry *dentry, + const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; struct inode *delegated_inode = NULL; @@ -280,7 +289,7 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value, int error; if (size && strcmp(name, XATTR_NAME_CAPS) == 0) { - error = cap_convert_nscap(dentry, &value, size); + error = cap_convert_nscap(mnt_userns, dentry, &value, size); if (error < 0) return error; size = error; @@ -288,8 +297,8 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value, retry_deleg: inode_lock(inode); - error = __vfs_setxattr_locked(dentry, name, value, size, flags, - &delegated_inode); + error = __vfs_setxattr_locked(mnt_userns, dentry, name, value, size, + flags, &delegated_inode); inode_unlock(inode); if (delegated_inode) { @@ -305,18 +314,20 @@ retry_deleg: EXPORT_SYMBOL_GPL(vfs_setxattr); static ssize_t -xattr_getsecurity(struct inode *inode, const char *name, void *value, - size_t size) +xattr_getsecurity(struct user_namespace *mnt_userns, struct inode *inode, + const char *name, void *value, size_t size) { void *buffer = NULL; ssize_t len; if (!value || !size) { - len = security_inode_getsecurity(inode, name, &buffer, false); + len = security_inode_getsecurity(mnt_userns, inode, name, + &buffer, false); goto out_noalloc; } - len = security_inode_getsecurity(inode, name, &buffer, true); + len = security_inode_getsecurity(mnt_userns, inode, name, &buffer, + true); if (len < 0) return len; if (size < len) { @@ -339,15 +350,16 @@ out_noalloc: * Returns the result of alloc, if failed, or the getxattr operation. */ ssize_t -vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value, - size_t xattr_size, gfp_t flags) +vfs_getxattr_alloc(struct user_namespace *mnt_userns, struct dentry *dentry, + const char *name, char **xattr_value, size_t xattr_size, + gfp_t flags) { const struct xattr_handler *handler; struct inode *inode = dentry->d_inode; char *value = *xattr_value; int error; - error = xattr_permission(inode, name, MAY_READ); + error = xattr_permission(mnt_userns, inode, name, MAY_READ); if (error) return error; @@ -388,12 +400,13 @@ __vfs_getxattr(struct dentry *dentry, struct inode *inode, const char *name, EXPORT_SYMBOL(__vfs_getxattr); ssize_t -vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) +vfs_getxattr(struct user_namespace *mnt_userns, struct dentry *dentry, + const char *name, void *value, size_t size) { struct inode *inode = dentry->d_inode; int error; - error = xattr_permission(inode, name, MAY_READ); + error = xattr_permission(mnt_userns, inode, name, MAY_READ); if (error) return error; @@ -404,7 +417,8 @@ vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; - int ret = xattr_getsecurity(inode, suffix, value, size); + int ret = xattr_getsecurity(mnt_userns, inode, suffix, value, + size); /* * Only overwrite the return value if a security module * is actually active. @@ -439,7 +453,8 @@ vfs_listxattr(struct dentry *dentry, char *list, size_t size) EXPORT_SYMBOL_GPL(vfs_listxattr); int -__vfs_removexattr(struct dentry *dentry, const char *name) +__vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry, + const char *name) { struct inode *inode = d_inode(dentry); const struct xattr_handler *handler; @@ -449,7 +464,8 @@ __vfs_removexattr(struct dentry *dentry, const char *name) return PTR_ERR(handler); if (!handler->set) return -EOPNOTSUPP; - return handler->set(handler, dentry, inode, name, NULL, 0, XATTR_REPLACE); + return handler->set(handler, mnt_userns, dentry, inode, name, NULL, 0, + XATTR_REPLACE); } EXPORT_SYMBOL(__vfs_removexattr); @@ -463,17 +479,18 @@ EXPORT_SYMBOL(__vfs_removexattr); * a delegation was broken on, NULL if none. */ int -__vfs_removexattr_locked(struct dentry *dentry, const char *name, - struct inode **delegated_inode) +__vfs_removexattr_locked(struct user_namespace *mnt_userns, + struct dentry *dentry, const char *name, + struct inode **delegated_inode) { struct inode *inode = dentry->d_inode; int error; - error = xattr_permission(inode, name, MAY_WRITE); + error = xattr_permission(mnt_userns, inode, name, MAY_WRITE); if (error) return error; - error = security_inode_removexattr(dentry, name); + error = security_inode_removexattr(mnt_userns, dentry, name); if (error) goto out; @@ -481,7 +498,7 @@ __vfs_removexattr_locked(struct dentry *dentry, const char *name, if (error) goto out; - error = __vfs_removexattr(dentry, name); + error = __vfs_removexattr(mnt_userns, dentry, name); if (!error) { fsnotify_xattr(dentry); @@ -494,7 +511,8 @@ out: EXPORT_SYMBOL_GPL(__vfs_removexattr_locked); int -vfs_removexattr(struct dentry *dentry, const char *name) +vfs_removexattr(struct user_namespace *mnt_userns, struct dentry *dentry, + const char *name) { struct inode *inode = dentry->d_inode; struct inode *delegated_inode = NULL; @@ -502,7 +520,8 @@ vfs_removexattr(struct dentry *dentry, const char *name) retry_deleg: inode_lock(inode); - error = __vfs_removexattr_locked(dentry, name, &delegated_inode); + error = __vfs_removexattr_locked(mnt_userns, dentry, + name, &delegated_inode); inode_unlock(inode); if (delegated_inode) { @@ -519,8 +538,9 @@ EXPORT_SYMBOL_GPL(vfs_removexattr); * Extended attribute SET operations */ static long -setxattr(struct dentry *d, const char __user *name, const void __user *value, - size_t size, int flags) +setxattr(struct user_namespace *mnt_userns, struct dentry *d, + const char __user *name, const void __user *value, size_t size, + int flags) { int error; void *kvalue = NULL; @@ -547,10 +567,10 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value, } if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) - posix_acl_fix_xattr_from_user(kvalue, size); + posix_acl_fix_xattr_from_user(mnt_userns, kvalue, size); } - error = vfs_setxattr(d, kname, kvalue, size, flags); + error = vfs_setxattr(mnt_userns, d, kname, kvalue, size, flags); out: kvfree(kvalue); @@ -563,13 +583,15 @@ static int path_setxattr(const char __user *pathname, { struct path path; int error; + retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) return error; error = mnt_want_write(path.mnt); if (!error) { - error = setxattr(path.dentry, name, value, size, flags); + error = setxattr(mnt_user_ns(path.mnt), path.dentry, name, + value, size, flags); mnt_drop_write(path.mnt); } path_put(&path); @@ -605,7 +627,9 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, audit_file(f.file); error = mnt_want_write_file(f.file); if (!error) { - error = setxattr(f.file->f_path.dentry, name, value, size, flags); + error = setxattr(file_mnt_user_ns(f.file), + f.file->f_path.dentry, name, + value, size, flags); mnt_drop_write_file(f.file); } fdput(f); @@ -616,8 +640,8 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, * Extended attribute GET operations */ static ssize_t -getxattr(struct dentry *d, const char __user *name, void __user *value, - size_t size) +getxattr(struct user_namespace *mnt_userns, struct dentry *d, + const char __user *name, void __user *value, size_t size) { ssize_t error; void *kvalue = NULL; @@ -637,11 +661,11 @@ getxattr(struct dentry *d, const char __user *name, void __user *value, return -ENOMEM; } - error = vfs_getxattr(d, kname, kvalue, size); + error = vfs_getxattr(mnt_userns, d, kname, kvalue, size); if (error > 0) { if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) - posix_acl_fix_xattr_to_user(kvalue, error); + posix_acl_fix_xattr_to_user(mnt_userns, kvalue, error); if (size && copy_to_user(value, kvalue, error)) error = -EFAULT; } else if (error == -ERANGE && size >= XATTR_SIZE_MAX) { @@ -665,7 +689,7 @@ retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) return error; - error = getxattr(path.dentry, name, value, size); + error = getxattr(mnt_user_ns(path.mnt), path.dentry, name, value, size); path_put(&path); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -695,7 +719,8 @@ SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name, if (!f.file) return error; audit_file(f.file); - error = getxattr(f.file->f_path.dentry, name, value, size); + error = getxattr(file_mnt_user_ns(f.file), f.file->f_path.dentry, + name, value, size); fdput(f); return error; } @@ -779,7 +804,8 @@ SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size) * Extended attribute REMOVE operations */ static long -removexattr(struct dentry *d, const char __user *name) +removexattr(struct user_namespace *mnt_userns, struct dentry *d, + const char __user *name) { int error; char kname[XATTR_NAME_MAX + 1]; @@ -790,7 +816,7 @@ removexattr(struct dentry *d, const char __user *name) if (error < 0) return error; - return vfs_removexattr(d, kname); + return vfs_removexattr(mnt_userns, d, kname); } static int path_removexattr(const char __user *pathname, @@ -804,7 +830,7 @@ retry: return error; error = mnt_want_write(path.mnt); if (!error) { - error = removexattr(path.dentry, name); + error = removexattr(mnt_user_ns(path.mnt), path.dentry, name); mnt_drop_write(path.mnt); } path_put(&path); @@ -837,7 +863,8 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) audit_file(f.file); error = mnt_want_write_file(f.file); if (!error) { - error = removexattr(f.file->f_path.dentry, name); + error = removexattr(file_mnt_user_ns(f.file), + f.file->f_path.dentry, name); mnt_drop_write_file(f.file); } fdput(f); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index b56ff451adce..5b6fcb9b44e2 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -2805,7 +2805,7 @@ xfs_btree_split_worker( struct xfs_btree_split_args *args = container_of(work, struct xfs_btree_split_args, work); unsigned long pflags; - unsigned long new_pflags = PF_MEMALLOC_NOFS; + unsigned long new_pflags = 0; /* * we are in a transaction context here, but may also be doing work @@ -2817,12 +2817,20 @@ xfs_btree_split_worker( new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; current_set_flags_nested(&pflags, new_pflags); + xfs_trans_set_context(args->cur->bc_tp); args->result = __xfs_btree_split(args->cur, args->level, args->ptrp, args->key, args->curp, args->stat); - complete(args->done); + xfs_trans_clear_context(args->cur->bc_tp); current_restore_flags_nested(&pflags, new_pflags); + + /* + * Do not access args after complete() has run here. We don't own args + * and the owner may run and free args before we return here. + */ + complete(args->done); + } /* diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 779cb73b3d00..d02bef24b32b 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -238,7 +238,8 @@ xfs_acl_set_mode( } int -xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type) { umode_t mode; bool set_mode = false; @@ -252,7 +253,7 @@ xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) return error; if (type == ACL_TYPE_ACCESS) { - error = posix_acl_update_mode(inode, &mode, &acl); + error = posix_acl_update_mode(mnt_userns, inode, &mode, &acl); if (error) return error; set_mode = true; diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index c042c0868016..7bdb3a4ed798 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -11,7 +11,8 @@ struct posix_acl; #ifdef CONFIG_XFS_POSIX_ACL extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); -extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int xfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); extern int __xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); void xfs_forget_acl(struct inode *inode, const char *name); #else diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 4304c6416fbb..b4186d666157 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -62,7 +62,7 @@ xfs_setfilesize_trans_alloc( * We hand off the transaction to the completion thread now, so * clear the flag here. */ - current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); + xfs_trans_clear_context(tp); return 0; } @@ -125,7 +125,7 @@ xfs_setfilesize_ioend( * thus we need to mark ourselves as being in a transaction manually. * Similarly for freeze protection. */ - current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); + xfs_trans_set_context(tp); __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS); /* we abort the update if there was an IO error */ @@ -568,6 +568,12 @@ xfs_vm_writepage( { struct xfs_writepage_ctx wpc = { }; + if (WARN_ON_ONCE(current->journal_info)) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + return iomap_writepage(page, wbc, &wpc.ctx, &xfs_writeback_ops); } @@ -578,6 +584,13 @@ xfs_vm_writepages( { struct xfs_writepage_ctx wpc = { }; + /* + * Writing back data in a transaction context can result in recursive + * transactions. This is bad, so issue a warning and get out of here. + */ + if (WARN_ON_ONCE(current->journal_info)) + return 0; + xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops); } diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c index e2148f2d5d6b..17f36db2f792 100644 --- a/fs/xfs/xfs_bio_io.c +++ b/fs/xfs/xfs_bio_io.c @@ -6,7 +6,7 @@ static inline unsigned int bio_max_vecs(unsigned int count) { - return min_t(unsigned, howmany(count, PAGE_SIZE), BIO_MAX_PAGES); + return bio_max_segs(howmany(count, PAGE_SIZE)); } int diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index f6e5235df7c9..37a1d12762d8 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1480,7 +1480,7 @@ xfs_buf_ioapply_map( int op) { int page_index; - int total_nr_pages = bp->b_page_count; + unsigned int total_nr_pages = bp->b_page_count; int nr_pages; struct bio *bio; sector_t sector = bp->b_maps[map].bm_bn; @@ -1505,7 +1505,7 @@ xfs_buf_ioapply_map( next_chunk: atomic_inc(&bp->b_io_remaining); - nr_pages = min(total_nr_pages, BIO_MAX_PAGES); + nr_pages = bio_max_segs(total_nr_pages); bio = bio_alloc(GFP_NOIO, nr_pages); bio_set_dev(bio, bp->b_target->bt_bdev); diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 3991e59cfd18..ef17c1f6db32 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -344,7 +344,6 @@ xfs_extent_busy_trim( ASSERT(*len > 0); spin_lock(&args->pag->pagb_lock); -restart: fbno = *bno; flen = *len; rbp = args->pag->pagb_tree.rb_node; @@ -363,19 +362,6 @@ restart: continue; } - /* - * If this is a metadata allocation, try to reuse the busy - * extent instead of trimming the allocation. - */ - if (!(args->datatype & XFS_ALLOC_USERDATA) && - !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) { - if (!xfs_extent_busy_update_extent(args->mp, args->pag, - busyp, fbno, flen, - false)) - goto restart; - continue; - } - if (bbno <= fbno) { /* start overlap */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 68ca1b40d8c7..a007ca0711d9 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -29,6 +29,7 @@ #include <linux/backing-dev.h> #include <linux/mman.h> #include <linux/fadvise.h> +#include <linux/mount.h> static const struct vm_operations_struct xfs_file_vm_ops; @@ -1051,7 +1052,8 @@ xfs_file_fallocate( iattr.ia_valid = ATTR_SIZE; iattr.ia_size = new_size; - error = xfs_vn_setattr_size(file_dentry(file), &iattr); + error = xfs_vn_setattr_size(file_mnt_user_ns(file), + file_dentry(file), &iattr); if (error) goto out_unlock; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 636ac13b1df2..46a861d55e48 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -766,6 +766,7 @@ xfs_inode_inherit_flags2( */ static int xfs_init_new_inode( + struct user_namespace *mnt_userns, struct xfs_trans *tp, struct xfs_inode *pip, xfs_ino_t ino, @@ -811,11 +812,11 @@ xfs_init_new_inode( if (dir && !(dir->i_mode & S_ISGID) && (mp->m_flags & XFS_MOUNT_GRPID)) { - inode->i_uid = current_fsuid(); + inode->i_uid = fsuid_into_mnt(mnt_userns); inode->i_gid = dir->i_gid; inode->i_mode = mode; } else { - inode_init_owner(inode, dir, mode); + inode_init_owner(mnt_userns, inode, dir, mode); } /* @@ -824,7 +825,8 @@ xfs_init_new_inode( * (and only if the irix_sgid_inherit compatibility variable is set). */ if (irix_sgid_inherit && - (inode->i_mode & S_ISGID) && !in_group_p(inode->i_gid)) + (inode->i_mode & S_ISGID) && + !in_group_p(i_gid_into_mnt(mnt_userns, inode))) inode->i_mode &= ~S_ISGID; ip->i_d.di_size = 0; @@ -901,6 +903,7 @@ xfs_init_new_inode( */ int xfs_dir_ialloc( + struct user_namespace *mnt_userns, struct xfs_trans **tpp, struct xfs_inode *dp, umode_t mode, @@ -933,7 +936,8 @@ xfs_dir_ialloc( return error; ASSERT(ino != NULLFSINO); - return xfs_init_new_inode(*tpp, dp, ino, mode, nlink, rdev, prid, ipp); + return xfs_init_new_inode(mnt_userns, *tpp, dp, ino, mode, nlink, rdev, + prid, ipp); } /* @@ -973,6 +977,7 @@ xfs_bumplink( int xfs_create( + struct user_namespace *mnt_userns, xfs_inode_t *dp, struct xfs_name *name, umode_t mode, @@ -1046,7 +1051,8 @@ xfs_create( * entry pointing to them, but a directory also the "." entry * pointing to itself. */ - error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, prid, &ip); + error = xfs_dir_ialloc(mnt_userns, &tp, dp, mode, is_dir ? 2 : 1, rdev, + prid, &ip); if (error) goto out_trans_cancel; @@ -1127,6 +1133,7 @@ xfs_create( int xfs_create_tmpfile( + struct user_namespace *mnt_userns, struct xfs_inode *dp, umode_t mode, struct xfs_inode **ipp) @@ -1164,7 +1171,7 @@ xfs_create_tmpfile( if (error) goto out_release_dquots; - error = xfs_dir_ialloc(&tp, dp, mode, 0, 0, prid, &ip); + error = xfs_dir_ialloc(mnt_userns, &tp, dp, mode, 0, 0, prid, &ip); if (error) goto out_trans_cancel; @@ -2977,13 +2984,15 @@ out_trans_abort: */ static int xfs_rename_alloc_whiteout( + struct user_namespace *mnt_userns, struct xfs_inode *dp, struct xfs_inode **wip) { struct xfs_inode *tmpfile; int error; - error = xfs_create_tmpfile(dp, S_IFCHR | WHITEOUT_MODE, &tmpfile); + error = xfs_create_tmpfile(mnt_userns, dp, S_IFCHR | WHITEOUT_MODE, + &tmpfile); if (error) return error; @@ -3005,6 +3014,7 @@ xfs_rename_alloc_whiteout( */ int xfs_rename( + struct user_namespace *mnt_userns, struct xfs_inode *src_dp, struct xfs_name *src_name, struct xfs_inode *src_ip, @@ -3036,7 +3046,7 @@ xfs_rename( */ if (flags & RENAME_WHITEOUT) { ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE))); - error = xfs_rename_alloc_whiteout(target_dp, &wip); + error = xfs_rename_alloc_whiteout(mnt_userns, target_dp, &wip); if (error) return error; diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index eca333f5f715..88ee4c3930ae 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -369,15 +369,18 @@ int xfs_release(struct xfs_inode *ip); void xfs_inactive(struct xfs_inode *ip); int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode **ipp, struct xfs_name *ci_name); -int xfs_create(struct xfs_inode *dp, struct xfs_name *name, +int xfs_create(struct user_namespace *mnt_userns, + struct xfs_inode *dp, struct xfs_name *name, umode_t mode, dev_t rdev, struct xfs_inode **ipp); -int xfs_create_tmpfile(struct xfs_inode *dp, umode_t mode, +int xfs_create_tmpfile(struct user_namespace *mnt_userns, + struct xfs_inode *dp, umode_t mode, struct xfs_inode **ipp); int xfs_remove(struct xfs_inode *dp, struct xfs_name *name, struct xfs_inode *ip); int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip, struct xfs_name *target_name); -int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, +int xfs_rename(struct user_namespace *mnt_userns, + struct xfs_inode *src_dp, struct xfs_name *src_name, struct xfs_inode *src_ip, struct xfs_inode *target_dp, struct xfs_name *target_name, struct xfs_inode *target_ip, unsigned int flags); @@ -407,9 +410,10 @@ void xfs_lock_two_inodes(struct xfs_inode *ip0, uint ip0_mode, xfs_extlen_t xfs_get_extsz_hint(struct xfs_inode *ip); xfs_extlen_t xfs_get_cowextsz_hint(struct xfs_inode *ip); -int xfs_dir_ialloc(struct xfs_trans **tpp, struct xfs_inode *dp, umode_t mode, - xfs_nlink_t nlink, dev_t dev, prid_t prid, - struct xfs_inode **ipp); +int xfs_dir_ialloc(struct user_namespace *mnt_userns, + struct xfs_trans **tpp, struct xfs_inode *dp, + umode_t mode, xfs_nlink_t nlink, dev_t dev, + prid_t prid, struct xfs_inode **ipp); static inline int xfs_itruncate_extents( diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 248083ea0276..99dfe89a8d08 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -693,7 +693,8 @@ xfs_ioc_space( iattr.ia_valid = ATTR_SIZE; iattr.ia_size = bf->l_start; - error = xfs_vn_setattr_size(file_dentry(filp), &iattr); + error = xfs_vn_setattr_size(file_mnt_user_ns(filp), file_dentry(filp), + &iattr); if (error) goto out_unlock; @@ -734,13 +735,15 @@ xfs_fsinumbers_fmt( STATIC int xfs_ioc_fsbulkstat( - xfs_mount_t *mp, + struct file *file, unsigned int cmd, void __user *arg) { + struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount; struct xfs_fsop_bulkreq bulkreq; struct xfs_ibulk breq = { .mp = mp, + .mnt_userns = file_mnt_user_ns(file), .ocount = 0, }; xfs_ino_t lastino; @@ -908,13 +911,15 @@ xfs_bulk_ireq_teardown( /* Handle the v5 bulkstat ioctl. */ STATIC int xfs_ioc_bulkstat( - struct xfs_mount *mp, + struct file *file, unsigned int cmd, struct xfs_bulkstat_req __user *arg) { + struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount; struct xfs_bulk_ireq hdr; struct xfs_ibulk breq = { .mp = mp, + .mnt_userns = file_mnt_user_ns(file), }; int error; @@ -1275,9 +1280,10 @@ xfs_ioctl_setattr_prepare_dax( */ static struct xfs_trans * xfs_ioctl_setattr_get_trans( - struct xfs_inode *ip, + struct file *file, struct xfs_dquot *pdqp) { + struct xfs_inode *ip = XFS_I(file_inode(file)); struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; int error = -EROFS; @@ -1299,7 +1305,7 @@ xfs_ioctl_setattr_get_trans( * The user ID of the calling process must be equal to the file owner * ID, except in cases where the CAP_FSETID capability is applicable. */ - if (!inode_owner_or_capable(VFS_I(ip))) { + if (!inode_owner_or_capable(file_mnt_user_ns(file), VFS_I(ip))) { error = -EPERM; goto out_cancel; } @@ -1427,9 +1433,11 @@ xfs_ioctl_setattr_check_projid( STATIC int xfs_ioctl_setattr( - xfs_inode_t *ip, + struct file *file, struct fsxattr *fa) { + struct user_namespace *mnt_userns = file_mnt_user_ns(file); + struct xfs_inode *ip = XFS_I(file_inode(file)); struct fsxattr old_fa; struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; @@ -1461,7 +1469,7 @@ xfs_ioctl_setattr( xfs_ioctl_setattr_prepare_dax(ip, fa); - tp = xfs_ioctl_setattr_get_trans(ip, pdqp); + tp = xfs_ioctl_setattr_get_trans(file, pdqp); if (IS_ERR(tp)) { error = PTR_ERR(tp); goto error_free_dquots; @@ -1493,7 +1501,7 @@ xfs_ioctl_setattr( */ if ((VFS_I(ip)->i_mode & (S_ISUID|S_ISGID)) && - !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID)) + !capable_wrt_inode_uidgid(mnt_userns, VFS_I(ip), CAP_FSETID)) VFS_I(ip)->i_mode &= ~(S_ISUID|S_ISGID); /* Change the ownerships and register project quota modifications */ @@ -1540,7 +1548,6 @@ error_free_dquots: STATIC int xfs_ioc_fssetxattr( - xfs_inode_t *ip, struct file *filp, void __user *arg) { @@ -1553,7 +1560,7 @@ xfs_ioc_fssetxattr( error = mnt_want_write_file(filp); if (error) return error; - error = xfs_ioctl_setattr(ip, &fa); + error = xfs_ioctl_setattr(filp, &fa); mnt_drop_write_file(filp); return error; } @@ -1599,7 +1606,7 @@ xfs_ioc_setxflags( xfs_ioctl_setattr_prepare_dax(ip, &fa); - tp = xfs_ioctl_setattr_get_trans(ip, NULL); + tp = xfs_ioctl_setattr_get_trans(filp, NULL); if (IS_ERR(tp)) { error = PTR_ERR(tp); goto out_drop_write; @@ -2110,10 +2117,10 @@ xfs_file_ioctl( case XFS_IOC_FSBULKSTAT_SINGLE: case XFS_IOC_FSBULKSTAT: case XFS_IOC_FSINUMBERS: - return xfs_ioc_fsbulkstat(mp, cmd, arg); + return xfs_ioc_fsbulkstat(filp, cmd, arg); case XFS_IOC_BULKSTAT: - return xfs_ioc_bulkstat(mp, cmd, arg); + return xfs_ioc_bulkstat(filp, cmd, arg); case XFS_IOC_INUMBERS: return xfs_ioc_inumbers(mp, cmd, arg); @@ -2135,7 +2142,7 @@ xfs_file_ioctl( case XFS_IOC_FSGETXATTRA: return xfs_ioc_fsgetxattr(ip, 1, arg); case XFS_IOC_FSSETXATTR: - return xfs_ioc_fssetxattr(ip, filp, arg); + return xfs_ioc_fssetxattr(filp, arg); case XFS_IOC_GETXFLAGS: return xfs_ioc_getxflags(ip, arg); case XFS_IOC_SETXFLAGS: diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index c1771e728117..33c09ec8e6c0 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -209,14 +209,16 @@ xfs_fsbulkstat_one_fmt_compat( /* copied from xfs_ioctl.c */ STATIC int xfs_compat_ioc_fsbulkstat( - xfs_mount_t *mp, + struct file *file, unsigned int cmd, struct compat_xfs_fsop_bulkreq __user *p32) { + struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount; u32 addr; struct xfs_fsop_bulkreq bulkreq; struct xfs_ibulk breq = { .mp = mp, + .mnt_userns = file_mnt_user_ns(file), .ocount = 0, }; xfs_ino_t lastino; @@ -436,7 +438,6 @@ xfs_file_compat_ioctl( { struct inode *inode = file_inode(filp); struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; void __user *arg = compat_ptr(p); int error; @@ -456,7 +457,7 @@ xfs_file_compat_ioctl( return xfs_ioc_space(filp, &bf); } case XFS_IOC_FSGEOMETRY_V1_32: - return xfs_compat_ioc_fsgeometry_v1(mp, arg); + return xfs_compat_ioc_fsgeometry_v1(ip->i_mount, arg); case XFS_IOC_FSGROWFSDATA_32: { struct xfs_growfs_data in; @@ -465,7 +466,7 @@ xfs_file_compat_ioctl( error = mnt_want_write_file(filp); if (error) return error; - error = xfs_growfs_data(mp, &in); + error = xfs_growfs_data(ip->i_mount, &in); mnt_drop_write_file(filp); return error; } @@ -477,7 +478,7 @@ xfs_file_compat_ioctl( error = mnt_want_write_file(filp); if (error) return error; - error = xfs_growfs_rt(mp, &in); + error = xfs_growfs_rt(ip->i_mount, &in); mnt_drop_write_file(filp); return error; } @@ -507,7 +508,7 @@ xfs_file_compat_ioctl( case XFS_IOC_FSBULKSTAT_32: case XFS_IOC_FSBULKSTAT_SINGLE_32: case XFS_IOC_FSINUMBERS_32: - return xfs_compat_ioc_fsbulkstat(mp, cmd, arg); + return xfs_compat_ioc_fsbulkstat(filp, cmd, arg); case XFS_IOC_FD_TO_HANDLE_32: case XFS_IOC_PATH_TO_HANDLE_32: case XFS_IOC_PATH_TO_FSHANDLE_32: { diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 00369502fe25..66ebccb5a6ff 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -128,6 +128,7 @@ xfs_cleanup_inode( STATIC int xfs_generic_create( + struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode, @@ -161,9 +162,10 @@ xfs_generic_create( goto out_free_acl; if (!tmpfile) { - error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); + error = xfs_create(mnt_userns, XFS_I(dir), &name, mode, rdev, + &ip); } else { - error = xfs_create_tmpfile(XFS_I(dir), mode, &ip); + error = xfs_create_tmpfile(mnt_userns, XFS_I(dir), mode, &ip); } if (unlikely(error)) goto out_free_acl; @@ -220,31 +222,35 @@ xfs_generic_create( STATIC int xfs_vn_mknod( - struct inode *dir, - struct dentry *dentry, - umode_t mode, - dev_t rdev) + struct user_namespace *mnt_userns, + struct inode *dir, + struct dentry *dentry, + umode_t mode, + dev_t rdev) { - return xfs_generic_create(dir, dentry, mode, rdev, false); + return xfs_generic_create(mnt_userns, dir, dentry, mode, rdev, false); } STATIC int xfs_vn_create( - struct inode *dir, - struct dentry *dentry, - umode_t mode, - bool flags) + struct user_namespace *mnt_userns, + struct inode *dir, + struct dentry *dentry, + umode_t mode, + bool flags) { - return xfs_generic_create(dir, dentry, mode, 0, false); + return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, false); } STATIC int xfs_vn_mkdir( - struct inode *dir, - struct dentry *dentry, - umode_t mode) + struct user_namespace *mnt_userns, + struct inode *dir, + struct dentry *dentry, + umode_t mode) { - return xfs_generic_create(dir, dentry, mode | S_IFDIR, 0, false); + return xfs_generic_create(mnt_userns, dir, dentry, mode | S_IFDIR, 0, + false); } STATIC struct dentry * @@ -361,9 +367,10 @@ xfs_vn_unlink( STATIC int xfs_vn_symlink( - struct inode *dir, - struct dentry *dentry, - const char *symname) + struct user_namespace *mnt_userns, + struct inode *dir, + struct dentry *dentry, + const char *symname) { struct inode *inode; struct xfs_inode *cip = NULL; @@ -377,7 +384,7 @@ xfs_vn_symlink( if (unlikely(error)) goto out; - error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip); + error = xfs_symlink(mnt_userns, XFS_I(dir), &name, symname, mode, &cip); if (unlikely(error)) goto out; @@ -403,11 +410,12 @@ xfs_vn_symlink( STATIC int xfs_vn_rename( - struct inode *odir, - struct dentry *odentry, - struct inode *ndir, - struct dentry *ndentry, - unsigned int flags) + struct user_namespace *mnt_userns, + struct inode *odir, + struct dentry *odentry, + struct inode *ndir, + struct dentry *ndentry, + unsigned int flags) { struct inode *new_inode = d_inode(ndentry); int omode = 0; @@ -431,8 +439,8 @@ xfs_vn_rename( if (unlikely(error)) return error; - return xfs_rename(XFS_I(odir), &oname, XFS_I(d_inode(odentry)), - XFS_I(ndir), &nname, + return xfs_rename(mnt_userns, XFS_I(odir), &oname, + XFS_I(d_inode(odentry)), XFS_I(ndir), &nname, new_inode ? XFS_I(new_inode) : NULL, flags); } @@ -529,6 +537,7 @@ xfs_stat_blksize( STATIC int xfs_vn_getattr( + struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, @@ -547,8 +556,8 @@ xfs_vn_getattr( stat->dev = inode->i_sb->s_dev; stat->mode = inode->i_mode; stat->nlink = inode->i_nlink; - stat->uid = inode->i_uid; - stat->gid = inode->i_gid; + stat->uid = i_uid_into_mnt(mnt_userns, inode); + stat->gid = i_gid_into_mnt(mnt_userns, inode); stat->ino = ip->i_ino; stat->atime = inode->i_atime; stat->mtime = inode->i_mtime; @@ -626,8 +635,9 @@ xfs_setattr_time( static int xfs_vn_change_ok( - struct dentry *dentry, - struct iattr *iattr) + struct user_namespace *mnt_userns, + struct dentry *dentry, + struct iattr *iattr) { struct xfs_mount *mp = XFS_I(d_inode(dentry))->i_mount; @@ -637,7 +647,7 @@ xfs_vn_change_ok( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - return setattr_prepare(dentry, iattr); + return setattr_prepare(mnt_userns, dentry, iattr); } /* @@ -648,6 +658,7 @@ xfs_vn_change_ok( */ static int xfs_setattr_nonsize( + struct user_namespace *mnt_userns, struct xfs_inode *ip, struct iattr *iattr) { @@ -788,7 +799,7 @@ xfs_setattr_nonsize( * Posix ACL code seems to care about this issue either. */ if (mask & ATTR_MODE) { - error = posix_acl_chmod(inode, inode->i_mode); + error = posix_acl_chmod(mnt_userns, inode, inode->i_mode); if (error) return error; } @@ -809,6 +820,7 @@ out_dqrele: */ STATIC int xfs_setattr_size( + struct user_namespace *mnt_userns, struct xfs_inode *ip, struct iattr *iattr) { @@ -840,7 +852,7 @@ xfs_setattr_size( * Use the regular setattr path to update the timestamps. */ iattr->ia_valid &= ~ATTR_SIZE; - return xfs_setattr_nonsize(ip, iattr); + return xfs_setattr_nonsize(mnt_userns, ip, iattr); } /* @@ -1009,6 +1021,7 @@ out_trans_cancel: int xfs_vn_setattr_size( + struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *iattr) { @@ -1017,14 +1030,15 @@ xfs_vn_setattr_size( trace_xfs_setattr(ip); - error = xfs_vn_change_ok(dentry, iattr); + error = xfs_vn_change_ok(mnt_userns, dentry, iattr); if (error) return error; - return xfs_setattr_size(ip, iattr); + return xfs_setattr_size(mnt_userns, ip, iattr); } STATIC int xfs_vn_setattr( + struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *iattr) { @@ -1044,14 +1058,14 @@ xfs_vn_setattr( return error; } - error = xfs_vn_setattr_size(dentry, iattr); + error = xfs_vn_setattr_size(mnt_userns, dentry, iattr); xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); } else { trace_xfs_setattr(ip); - error = xfs_vn_change_ok(dentry, iattr); + error = xfs_vn_change_ok(mnt_userns, dentry, iattr); if (!error) - error = xfs_setattr_nonsize(ip, iattr); + error = xfs_setattr_nonsize(mnt_userns, ip, iattr); } return error; @@ -1122,11 +1136,12 @@ xfs_vn_fiemap( STATIC int xfs_vn_tmpfile( - struct inode *dir, - struct dentry *dentry, - umode_t mode) + struct user_namespace *mnt_userns, + struct inode *dir, + struct dentry *dentry, + umode_t mode) { - return xfs_generic_create(dir, dentry, mode, 0, true); + return xfs_generic_create(mnt_userns, dir, dentry, mode, 0, true); } static const struct inode_operations xfs_inode_operations = { diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index 99ca745c1071..278949056048 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -14,6 +14,7 @@ extern const struct file_operations xfs_dir_file_operations; extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr); -extern int xfs_vn_setattr_size(struct dentry *dentry, struct iattr *vap); +int xfs_vn_setattr_size(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *vap); #endif /* __XFS_IOPS_H__ */ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 16ca97a7ff00..ca310a125d1e 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -54,10 +54,12 @@ struct xfs_bstat_chunk { STATIC int xfs_bulkstat_one_int( struct xfs_mount *mp, + struct user_namespace *mnt_userns, struct xfs_trans *tp, xfs_ino_t ino, struct xfs_bstat_chunk *bc) { + struct user_namespace *sb_userns = mp->m_super->s_user_ns; struct xfs_icdinode *dic; /* dinode core info pointer */ struct xfs_inode *ip; /* incore inode pointer */ struct inode *inode; @@ -86,8 +88,8 @@ xfs_bulkstat_one_int( */ buf->bs_projectid = ip->i_d.di_projid; buf->bs_ino = ino; - buf->bs_uid = i_uid_read(inode); - buf->bs_gid = i_gid_read(inode); + buf->bs_uid = from_kuid(sb_userns, i_uid_into_mnt(mnt_userns, inode)); + buf->bs_gid = from_kgid(sb_userns, i_gid_into_mnt(mnt_userns, inode)); buf->bs_size = dic->di_size; buf->bs_nlink = inode->i_nlink; @@ -173,7 +175,8 @@ xfs_bulkstat_one( if (!bc.buf) return -ENOMEM; - error = xfs_bulkstat_one_int(breq->mp, NULL, breq->startino, &bc); + error = xfs_bulkstat_one_int(breq->mp, breq->mnt_userns, NULL, + breq->startino, &bc); kmem_free(bc.buf); @@ -194,9 +197,10 @@ xfs_bulkstat_iwalk( xfs_ino_t ino, void *data) { + struct xfs_bstat_chunk *bc = data; int error; - error = xfs_bulkstat_one_int(mp, tp, ino, data); + error = xfs_bulkstat_one_int(mp, bc->breq->mnt_userns, tp, ino, data); /* bulkstat just skips over missing inodes */ if (error == -ENOENT || error == -EINVAL) return 0; @@ -239,6 +243,11 @@ xfs_bulkstat( }; int error; + if (breq->mnt_userns != &init_user_ns) { + xfs_warn_ratelimited(breq->mp, + "bulkstat not supported inside of idmapped mounts."); + return -EINVAL; + } if (xfs_bulkstat_already_done(breq->mp, breq->startino)) return 0; diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index 96a1e2a9be3f..7078d10c9b12 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -8,6 +8,7 @@ /* In-memory representation of a userspace request for batch inode data. */ struct xfs_ibulk { struct xfs_mount *mp; + struct user_namespace *mnt_userns; void __user *ubuffer; /* user output buffer */ xfs_ino_t startino; /* start with this inode */ unsigned int icount; /* number of elements in ubuffer */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 742d1413e2d0..bfa4164990b1 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -787,7 +787,8 @@ xfs_qm_qino_alloc( return error; if (need_alloc) { - error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, ipp); + error = xfs_dir_ialloc(&init_user_ns, &tp, NULL, S_IFREG, 1, 0, + 0, ipp); if (error) { xfs_trans_cancel(tp); return error; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 586d42342a79..e5e0713bebcd 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1881,7 +1881,7 @@ static struct file_system_type xfs_fs_type = { .init_fs_context = xfs_init_fs_context, .parameters = xfs_fs_parameters, .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("xfs"); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 8565663b16cd..1379013d74b8 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -134,6 +134,7 @@ xfs_readlink( int xfs_symlink( + struct user_namespace *mnt_userns, struct xfs_inode *dp, struct xfs_name *link_name, const char *target_path, @@ -221,8 +222,8 @@ xfs_symlink( /* * Allocate an inode for the symlink. */ - error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, - prid, &ip); + error = xfs_dir_ialloc(mnt_userns, &tp, dp, S_IFLNK | (mode & ~S_IFMT), + 1, 0, prid, &ip); if (error) goto out_trans_cancel; diff --git a/fs/xfs/xfs_symlink.h b/fs/xfs/xfs_symlink.h index b1fa091427e6..2586b7e393f3 100644 --- a/fs/xfs/xfs_symlink.h +++ b/fs/xfs/xfs_symlink.h @@ -7,8 +7,9 @@ /* Kernel only symlink definitions */ -int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name, - const char *target_path, umode_t mode, struct xfs_inode **ipp); +int xfs_symlink(struct user_namespace *mnt_userns, struct xfs_inode *dp, + struct xfs_name *link_name, const char *target_path, + umode_t mode, struct xfs_inode **ipp); int xfs_readlink_bmap_ilocked(struct xfs_inode *ip, char *link); int xfs_readlink(struct xfs_inode *ip, char *link); int xfs_inactive_symlink(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index 145e06c47744..546a6cd96729 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -51,7 +51,7 @@ xfs_panic_mask_proc_handler( #endif /* CONFIG_PROC_FS */ STATIC int -xfs_deprecate_irix_sgid_inherit_proc_handler( +xfs_deprecated_dointvec_minmax( struct ctl_table *ctl, int write, void *buffer, @@ -59,24 +59,8 @@ xfs_deprecate_irix_sgid_inherit_proc_handler( loff_t *ppos) { if (write) { - printk_once(KERN_WARNING - "XFS: " "%s sysctl option is deprecated.\n", - ctl->procname); - } - return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); -} - -STATIC int -xfs_deprecate_irix_symlink_mode_proc_handler( - struct ctl_table *ctl, - int write, - void *buffer, - size_t *lenp, - loff_t *ppos) -{ - if (write) { - printk_once(KERN_WARNING - "XFS: " "%s sysctl option is deprecated.\n", + printk_ratelimited(KERN_WARNING + "XFS: %s sysctl option is deprecated.\n", ctl->procname); } return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); @@ -88,7 +72,7 @@ static struct ctl_table xfs_table[] = { .data = &xfs_params.sgid_inherit.val, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = xfs_deprecate_irix_sgid_inherit_proc_handler, + .proc_handler = xfs_deprecated_dointvec_minmax, .extra1 = &xfs_params.sgid_inherit.min, .extra2 = &xfs_params.sgid_inherit.max }, @@ -97,7 +81,7 @@ static struct ctl_table xfs_table[] = { .data = &xfs_params.symlink_mode.val, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = xfs_deprecate_irix_symlink_mode_proc_handler, + .proc_handler = xfs_deprecated_dointvec_minmax, .extra1 = &xfs_params.symlink_mode.min, .extra2 = &xfs_params.symlink_mode.max }, @@ -201,6 +185,15 @@ static struct ctl_table xfs_table[] = { .extra1 = &xfs_params.blockgc_timer.min, .extra2 = &xfs_params.blockgc_timer.max, }, + { + .procname = "speculative_cow_prealloc_lifetime", + .data = &xfs_params.blockgc_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = xfs_deprecated_dointvec_minmax, + .extra1 = &xfs_params.blockgc_timer.min, + .extra2 = &xfs_params.blockgc_timer.max, + }, /* please keep this the last entry */ #ifdef CONFIG_PROC_FS { diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 44f72c09c203..b22a09e9daee 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -72,6 +72,7 @@ xfs_trans_free( xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false); trace_xfs_trans_free(tp, _RET_IP_); + xfs_trans_clear_context(tp); if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT)) sb_end_intwrite(tp->t_mountp->m_super); xfs_trans_free_dqinfo(tp); @@ -123,7 +124,8 @@ xfs_trans_dup( ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used; tp->t_rtx_res = tp->t_rtx_res_used; - ntp->t_pflags = tp->t_pflags; + + xfs_trans_switch_context(tp, ntp); /* move deferred ops over to the new tp */ xfs_defer_move(ntp, tp); @@ -157,9 +159,6 @@ xfs_trans_reserve( int error = 0; bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; - /* Mark this thread as being in a transaction */ - current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); - /* * Attempt to reserve the needed disk blocks by decrementing * the number needed from the number available. This will @@ -167,10 +166,8 @@ xfs_trans_reserve( */ if (blocks > 0) { error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd); - if (error != 0) { - current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); + if (error != 0) return -ENOSPC; - } tp->t_blk_res += blocks; } @@ -244,9 +241,6 @@ undo_blocks: xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd); tp->t_blk_res = 0; } - - current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); - return error; } @@ -260,6 +254,7 @@ xfs_trans_alloc( struct xfs_trans **tpp) { struct xfs_trans *tp; + bool want_retry = true; int error; /* @@ -267,9 +262,11 @@ xfs_trans_alloc( * GFP_NOFS allocation context so that we avoid lockdep false positives * by doing GFP_KERNEL allocations inside sb_start_intwrite(). */ +retry: tp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL); if (!(flags & XFS_TRANS_NO_WRITECOUNT)) sb_start_intwrite(mp->m_super); + xfs_trans_set_context(tp); /* * Zero-reservation ("empty") transactions can't modify anything, so @@ -289,7 +286,9 @@ xfs_trans_alloc( tp->t_firstblock = NULLFSBLOCK; error = xfs_trans_reserve(tp, resp, blocks, rtextents); - if (error == -ENOSPC) { + if (error == -ENOSPC && want_retry) { + xfs_trans_cancel(tp); + /* * We weren't able to reserve enough space for the transaction. * Flush the other speculative space allocations to free space. @@ -297,8 +296,11 @@ xfs_trans_alloc( * other locks. */ error = xfs_blockgc_free_space(mp, NULL); - if (!error) - error = xfs_trans_reserve(tp, resp, blocks, rtextents); + if (error) + return error; + + want_retry = false; + goto retry; } if (error) { xfs_trans_cancel(tp); @@ -893,7 +895,6 @@ __xfs_trans_commit( xfs_log_commit_cil(mp, tp, &commit_lsn, regrant); - current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free(tp); /* @@ -925,7 +926,6 @@ out_unreserve: xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); tp->t_ticket = NULL; } - current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free_items(tp, !!error); xfs_trans_free(tp); @@ -985,9 +985,6 @@ xfs_trans_cancel( tp->t_ticket = NULL; } - /* mark this thread as no longer being in a transaction */ - current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); - xfs_trans_free_items(tp, dirty); xfs_trans_free(tp); } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 8b03fbfe9a1b..9dd745cf77c9 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -281,4 +281,34 @@ int xfs_trans_alloc_ichange(struct xfs_inode *ip, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, bool force, struct xfs_trans **tpp); +static inline void +xfs_trans_set_context( + struct xfs_trans *tp) +{ + ASSERT(current->journal_info == NULL); + tp->t_pflags = memalloc_nofs_save(); + current->journal_info = tp; +} + +static inline void +xfs_trans_clear_context( + struct xfs_trans *tp) +{ + if (current->journal_info == tp) { + memalloc_nofs_restore(tp->t_pflags); + current->journal_info = NULL; + } +} + +static inline void +xfs_trans_switch_context( + struct xfs_trans *old_tp, + struct xfs_trans *new_tp) +{ + ASSERT(current->journal_info == old_tp); + new_tp->t_pflags = old_tp->t_pflags; + old_tp->t_pflags = 0; + current->journal_info = new_tp; +} + #endif /* __XFS_TRANS_H__ */ diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index bca48b308c02..12be32f66dc1 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -38,9 +38,10 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused, } static int -xfs_xattr_set(const struct xattr_handler *handler, struct dentry *unused, - struct inode *inode, const char *name, const void *value, - size_t size, int flags) +xfs_xattr_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, + struct inode *inode, const char *name, const void *value, + size_t size, int flags) { struct xfs_da_args args = { .dp = XFS_I(inode), diff --git a/fs/zonefs/Makefile b/fs/zonefs/Makefile index 75a380aa1ae1..33c1a4f1132e 100644 --- a/fs/zonefs/Makefile +++ b/fs/zonefs/Makefile @@ -1,4 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 +ccflags-y += -I$(src) + obj-$(CONFIG_ZONEFS_FS) += zonefs.o zonefs-y := super.o diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index a29653c0196b..b6ff4a21abac 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -24,6 +24,9 @@ #include "zonefs.h" +#define CREATE_TRACE_POINTS +#include "trace.h" + static inline int zonefs_zone_mgmt(struct inode *inode, enum req_opf op) { @@ -32,6 +35,7 @@ static inline int zonefs_zone_mgmt(struct inode *inode, lockdep_assert_held(&zi->i_truncate_mutex); + trace_zonefs_zone_mgmt(inode, op); ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector, zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS); if (ret) { @@ -100,6 +104,8 @@ static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->bdev = inode->i_sb->s_bdev; iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset; + trace_zonefs_iomap_begin(inode, iomap); + return 0; } @@ -250,6 +256,9 @@ static loff_t zonefs_check_zone_condition(struct inode *inode, } inode->i_mode &= ~0222; return i_size_read(inode); + case BLK_ZONE_COND_FULL: + /* The write pointer of full zones is invalid. */ + return zi->i_max_size; default: if (zi->i_ztype == ZONEFS_ZTYPE_CNV) return zi->i_max_size; @@ -480,7 +489,8 @@ unlock: return ret; } -static int zonefs_inode_setattr(struct dentry *dentry, struct iattr *iattr) +static int zonefs_inode_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); int ret; @@ -488,7 +498,7 @@ static int zonefs_inode_setattr(struct dentry *dentry, struct iattr *iattr) if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; - ret = setattr_prepare(dentry, iattr); + ret = setattr_prepare(&init_user_ns, dentry, iattr); if (ret) return ret; @@ -516,7 +526,7 @@ static int zonefs_inode_setattr(struct dentry *dentry, struct iattr *iattr) return ret; } - setattr_copy(inode, iattr); + setattr_copy(&init_user_ns, inode, iattr); return 0; } @@ -703,6 +713,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from) ret = submit_bio_wait(bio); zonefs_file_write_dio_end_io(iocb, size, ret, 0); + trace_zonefs_file_dio_append(inode, size, ret); out_release: bio_release_pages(bio, false); @@ -1223,7 +1234,7 @@ static void zonefs_init_dir_inode(struct inode *parent, struct inode *inode, struct super_block *sb = parent->i_sb; inode->i_ino = blkdev_nr_zones(sb->s_bdev->bd_disk) + type + 1; - inode_init_owner(inode, parent, S_IFDIR | 0555); + inode_init_owner(&init_user_ns, inode, parent, S_IFDIR | 0555); inode->i_op = &zonefs_dir_inode_operations; inode->i_fop = &simple_dir_operations; set_nlink(inode, 2); diff --git a/fs/zonefs/trace.h b/fs/zonefs/trace.h new file mode 100644 index 000000000000..f369d7d50303 --- /dev/null +++ b/fs/zonefs/trace.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * zonefs filesystem driver tracepoints. + * + * Copyright (C) 2021 Western Digital Corporation or its affiliates. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM zonefs + +#if !defined(_TRACE_ZONEFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_ZONEFS_H + +#include <linux/tracepoint.h> +#include <linux/trace_seq.h> +#include <linux/blkdev.h> + +#include "zonefs.h" + +#define show_dev(dev) MAJOR(dev), MINOR(dev) + +TRACE_EVENT(zonefs_zone_mgmt, + TP_PROTO(struct inode *inode, enum req_opf op), + TP_ARGS(inode, op), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(int, op) + __field(sector_t, sector) + __field(sector_t, nr_sectors) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->op = op; + __entry->sector = ZONEFS_I(inode)->i_zsector; + __entry->nr_sectors = + ZONEFS_I(inode)->i_zone_size >> SECTOR_SHIFT; + ), + TP_printk("bdev=(%d,%d), ino=%lu op=%s, sector=%llu, nr_sectors=%llu", + show_dev(__entry->dev), (unsigned long)__entry->ino, + blk_op_str(__entry->op), __entry->sector, + __entry->nr_sectors + ) +); + +TRACE_EVENT(zonefs_file_dio_append, + TP_PROTO(struct inode *inode, ssize_t size, ssize_t ret), + TP_ARGS(inode, size, ret), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(sector_t, sector) + __field(ssize_t, size) + __field(loff_t, wpoffset) + __field(ssize_t, ret) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->sector = ZONEFS_I(inode)->i_zsector; + __entry->size = size; + __entry->wpoffset = ZONEFS_I(inode)->i_wpoffset; + __entry->ret = ret; + ), + TP_printk("bdev=(%d, %d), ino=%lu, sector=%llu, size=%zu, wpoffset=%llu, ret=%zu", + show_dev(__entry->dev), (unsigned long)__entry->ino, + __entry->sector, __entry->size, __entry->wpoffset, + __entry->ret + ) +); + +TRACE_EVENT(zonefs_iomap_begin, + TP_PROTO(struct inode *inode, struct iomap *iomap), + TP_ARGS(inode, iomap), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(u64, addr) + __field(loff_t, offset) + __field(u64, length) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->addr = iomap->addr; + __entry->offset = iomap->offset; + __entry->length = iomap->length; + ), + TP_printk("bdev=(%d,%d), ino=%lu, addr=%llu, offset=%llu, length=%llu", + show_dev(__entry->dev), (unsigned long)__entry->ino, + __entry->addr, __entry->offset, __entry->length + ) +); + +#endif /* _TRACE_ZONEFS_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +/* This part must be outside protection */ +#include <trace/define_trace.h> |