diff options
Diffstat (limited to 'fs')
490 files changed, 11219 insertions, 9931 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 0d28ecf668d0..b845ee18a80b 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -260,7 +260,7 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, inode_init_owner(&nop_mnt_idmap, inode, NULL, mode); inode->i_blocks = 0; inode->i_rdev = rdev; - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); inode->i_mapping->a_ops = &v9fs_addr_operations; inode->i_private = NULL; @@ -1150,8 +1150,8 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, set_nlink(inode, 1); - inode->i_atime.tv_sec = stat->atime; - inode->i_mtime.tv_sec = stat->mtime; + inode_set_atime(inode, stat->atime, 0); + inode_set_mtime(inode, stat->mtime, 0); inode_set_ctime(inode, stat->mtime, 0); inode->i_uid = v9ses->dfltuid; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 1312f68965ac..c7319af2f471 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -641,10 +641,10 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, struct v9fs_inode *v9inode = V9FS_I(inode); if ((stat->st_result_mask & P9_STATS_BASIC) == P9_STATS_BASIC) { - inode->i_atime.tv_sec = stat->st_atime_sec; - inode->i_atime.tv_nsec = stat->st_atime_nsec; - inode->i_mtime.tv_sec = stat->st_mtime_sec; - inode->i_mtime.tv_nsec = stat->st_mtime_nsec; + inode_set_atime(inode, stat->st_atime_sec, + stat->st_atime_nsec); + inode_set_mtime(inode, stat->st_mtime_sec, + stat->st_mtime_nsec); inode_set_ctime(inode, stat->st_ctime_sec, stat->st_ctime_nsec); inode->i_uid = stat->st_uid; @@ -660,12 +660,12 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, inode->i_blocks = stat->st_blocks; } else { if (stat->st_result_mask & P9_STATS_ATIME) { - inode->i_atime.tv_sec = stat->st_atime_sec; - inode->i_atime.tv_nsec = stat->st_atime_nsec; + inode_set_atime(inode, stat->st_atime_sec, + stat->st_atime_nsec); } if (stat->st_result_mask & P9_STATS_MTIME) { - inode->i_mtime.tv_sec = stat->st_mtime_sec; - inode->i_mtime.tv_nsec = stat->st_mtime_nsec; + inode_set_mtime(inode, stat->st_mtime_sec, + stat->st_mtime_nsec); } if (stat->st_result_mask & P9_STATS_CTIME) { inode_set_ctime(inode, stat->st_ctime_sec, diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index e00cf8109b3f..053d1cef6e13 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -162,27 +162,27 @@ static int v9fs_xattr_handler_set(const struct xattr_handler *handler, return v9fs_xattr_set(dentry, full_name, value, size, flags); } -static struct xattr_handler v9fs_xattr_user_handler = { +static const struct xattr_handler v9fs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .get = v9fs_xattr_handler_get, .set = v9fs_xattr_handler_set, }; -static struct xattr_handler v9fs_xattr_trusted_handler = { +static const struct xattr_handler v9fs_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = v9fs_xattr_handler_get, .set = v9fs_xattr_handler_set, }; #ifdef CONFIG_9P_FS_SECURITY -static struct xattr_handler v9fs_xattr_security_handler = { +static const struct xattr_handler v9fs_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = v9fs_xattr_handler_get, .set = v9fs_xattr_handler_set, }; #endif -const struct xattr_handler *v9fs_xattr_handlers[] = { +const struct xattr_handler * const v9fs_xattr_handlers[] = { &v9fs_xattr_user_handler, &v9fs_xattr_trusted_handler, #ifdef CONFIG_9P_FS_SECURITY diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h index b5636e544c8a..3ad5a802352a 100644 --- a/fs/9p/xattr.h +++ b/fs/9p/xattr.h @@ -10,7 +10,7 @@ #include <net/9p/9p.h> #include <net/9p/client.h> -extern const struct xattr_handler *v9fs_xattr_handlers[]; +extern const struct xattr_handler * const v9fs_xattr_handlers[]; ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name, void *buffer, size_t buffer_size); diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 20963002578a..3081edb09e46 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -242,6 +242,7 @@ struct inode * adfs_iget(struct super_block *sb, struct object_info *obj) { struct inode *inode; + struct timespec64 ts; inode = new_inode(sb); if (!inode) @@ -268,9 +269,10 @@ adfs_iget(struct super_block *sb, struct object_info *obj) ADFS_I(inode)->attr = obj->attr; inode->i_mode = adfs_atts2mode(sb, inode); - adfs_adfs2unix_time(&inode->i_mtime, inode); - inode->i_atime = inode->i_mtime; - inode_set_ctime_to_ts(inode, inode->i_mtime); + adfs_adfs2unix_time(&ts, inode); + inode_set_atime_to_ts(inode, ts); + inode_set_mtime_to_ts(inode, ts); + inode_set_ctime_to_ts(inode, ts); if (S_ISDIR(inode->i_mode)) { inode->i_op = &adfs_dir_inode_operations; @@ -321,7 +323,8 @@ adfs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry, if (ia_valid & ATTR_MTIME && adfs_inode_is_stamped(inode)) { adfs_unix2adfs_time(inode, &attr->ia_mtime); - adfs_adfs2unix_time(&inode->i_mtime, inode); + adfs_adfs2unix_time(&attr->ia_mtime, inode); + inode_set_mtime_to_ts(inode, attr->ia_mtime); } /* @@ -329,7 +332,7 @@ adfs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry, * have the ability to represent them in our filesystem? */ if (ia_valid & ATTR_ATIME) - inode->i_atime = attr->ia_atime; + inode_set_atime_to_ts(inode, attr->ia_atime); if (ia_valid & ATTR_CTIME) inode_set_ctime_to_ts(inode, attr->ia_ctime); if (ia_valid & ATTR_MODE) { diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 7ba93efc1143..fd669daa4e7b 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -60,7 +60,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh) mark_buffer_dirty_inode(dir_bh, dir); affs_brelse(dir_bh); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_inc_iversion(dir); mark_inode_dirty(dir); @@ -114,7 +114,7 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh) affs_brelse(bh); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_inc_iversion(dir); mark_inode_dirty(dir); diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 060746c63151..0210df8d3500 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -149,13 +149,9 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) break; } - inode->i_mtime.tv_sec = inode->i_atime.tv_sec = - inode_set_ctime(inode, - (be32_to_cpu(tail->change.days) * 86400LL + - be32_to_cpu(tail->change.mins) * 60 + - be32_to_cpu(tail->change.ticks) / 50 + AFFS_EPOCH_DELTA) - + sys_tz.tz_minuteswest * 60, 0).tv_sec; - inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = 0; + inode_set_mtime(inode, + inode_set_atime(inode, inode_set_ctime(inode, (be32_to_cpu(tail->change.days) * 86400LL + be32_to_cpu(tail->change.mins) * 60 + be32_to_cpu(tail->change.ticks) / 50 + AFFS_EPOCH_DELTA) + sys_tz.tz_minuteswest * 60, 0).tv_sec, 0).tv_sec, + 0); affs_brelse(bh); unlock_new_inode(inode); return inode; @@ -187,12 +183,13 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc) } tail = AFFS_TAIL(sb, bh); if (tail->stype == cpu_to_be32(ST_ROOT)) { - affs_secs_to_datestamp(inode->i_mtime.tv_sec, + affs_secs_to_datestamp(inode_get_mtime_sec(inode), &AFFS_ROOT_TAIL(sb, bh)->root_change); } else { tail->protect = cpu_to_be32(AFFS_I(inode)->i_protect); tail->size = cpu_to_be32(inode->i_size); - affs_secs_to_datestamp(inode->i_mtime.tv_sec, &tail->change); + affs_secs_to_datestamp(inode_get_mtime_sec(inode), + &tail->change); if (!(inode->i_ino == AFFS_SB(sb)->s_root_block)) { uid = i_uid_read(inode); gid = i_gid_read(inode); @@ -314,7 +311,7 @@ affs_new_inode(struct inode *dir) inode->i_gid = current_fsgid(); inode->i_ino = block; set_nlink(inode, 1); - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); atomic_set(&AFFS_I(inode)->i_opencnt, 0); AFFS_I(inode)->i_blkcnt = 0; AFFS_I(inode)->i_lc = NULL; diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index 95bcbd7654d1..4d04ef2d3ae7 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -88,7 +88,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) set_nlink(inode, 2); inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); inode->i_blocks = 0; inode->i_generation = 0; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 1c794a1896aa..78efc9719349 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -91,8 +91,8 @@ static int afs_inode_init_from_status(struct afs_operation *op, t = status->mtime_client; inode_set_ctime_to_ts(inode, t); - inode->i_mtime = t; - inode->i_atime = t; + inode_set_mtime_to_ts(inode, t); + inode_set_atime_to_ts(inode, t); inode->i_flags |= S_NOATIME; inode->i_uid = make_kuid(&init_user_ns, status->owner); inode->i_gid = make_kgid(&init_user_ns, status->group); @@ -204,7 +204,7 @@ static void afs_apply_status(struct afs_operation *op, } t = status->mtime_client; - inode->i_mtime = t; + inode_set_mtime_to_ts(inode, t); if (vp->update_ctime) inode_set_ctime_to_ts(inode, op->ctime); @@ -253,7 +253,7 @@ static void afs_apply_status(struct afs_operation *op, if (change_size) { afs_set_i_size(vnode, status->size); inode_set_ctime_to_ts(inode, t); - inode->i_atime = t; + inode_set_atime_to_ts(inode, t); } } } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index da73b97e19a9..23e2cc4efe41 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1541,7 +1541,7 @@ int afs_launder_folio(struct folio *); /* * xattr.c */ -extern const struct xattr_handler *afs_xattr_handlers[]; +extern const struct xattr_handler * const afs_xattr_handlers[]; /* * yfsclient.c diff --git a/fs/afs/write.c b/fs/afs/write.c index e1c45341719b..4a168781936b 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -424,7 +424,7 @@ try_next_key: op->store.write_iter = iter; op->store.i_size = max(pos + size, vnode->netfs.remote_i_size); - op->mtime = vnode->netfs.inode.i_mtime; + op->mtime = inode_get_mtime(&vnode->netfs.inode); afs_wait_for_operation(op); diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c index 9048d8ccc715..64b2c0224f62 100644 --- a/fs/afs/xattr.c +++ b/fs/afs/xattr.c @@ -353,7 +353,7 @@ static const struct xattr_handler afs_xattr_afs_volume_handler = { .get = afs_xattr_get_volume, }; -const struct xattr_handler *afs_xattr_handlers[] = { +const struct xattr_handler * const afs_xattr_handlers[] = { &afs_xattr_afs_acl_handler, &afs_xattr_afs_cell_handler, &afs_xattr_afs_fid_handler, @@ -80,7 +80,7 @@ struct aio_ring { struct kioctx_table { struct rcu_head rcu; unsigned nr; - struct kioctx __rcu *table[]; + struct kioctx __rcu *table[] __counted_by(nr); }; struct kioctx_cpu { diff --git a/fs/attr.c b/fs/attr.c index a8ae5f6d9b16..bdf5deb06ea9 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -308,9 +308,9 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode, i_uid_update(idmap, attr, inode); i_gid_update(idmap, attr, inode); if (ia_valid & ATTR_ATIME) - inode->i_atime = attr->ia_atime; + inode_set_atime_to_ts(inode, attr->ia_atime); if (ia_valid & ATTR_MTIME) - inode->i_mtime = attr->ia_mtime; + inode_set_mtime_to_ts(inode, attr->ia_mtime); if (ia_valid & ATTR_CTIME) inode_set_ctime_to_ts(inode, attr->ia_ctime); if (ia_valid & ATTR_MODE) { diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index d5a44fa88acf..8c1d587b3eef 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -25,6 +25,8 @@ #include <linux/completion.h> #include <linux/file.h> #include <linux/magic.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> /* This is the range of ioctl() numbers we claim as ours */ #define AUTOFS_IOC_FIRST AUTOFS_IOC_READY @@ -205,20 +207,34 @@ static inline void managed_dentry_clear_managed(struct dentry *dentry) /* Initializing function */ -int autofs_fill_super(struct super_block *, void *, int); +extern const struct fs_parameter_spec autofs_param_specs[]; +int autofs_init_fs_context(struct fs_context *fc); struct autofs_info *autofs_new_ino(struct autofs_sb_info *); void autofs_clean_ino(struct autofs_info *); -static inline int autofs_prepare_pipe(struct file *pipe) +static inline int autofs_check_pipe(struct file *pipe) { if (!(pipe->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (!S_ISFIFO(file_inode(pipe)->i_mode)) return -EINVAL; + return 0; +} + +static inline void autofs_set_packet_pipe_flags(struct file *pipe) +{ /* We want a packet pipe */ pipe->f_flags |= O_DIRECT; /* We don't expect -EAGAIN */ pipe->f_flags &= ~O_NONBLOCK; +} + +static inline int autofs_prepare_pipe(struct file *pipe) +{ + int ret = autofs_check_pipe(pipe); + if (ret < 0) + return ret; + autofs_set_packet_pipe_flags(pipe); return 0; } diff --git a/fs/autofs/init.c b/fs/autofs/init.c index d3f55e874338..b5e4dfa04ed0 100644 --- a/fs/autofs/init.c +++ b/fs/autofs/init.c @@ -7,16 +7,11 @@ #include <linux/init.h> #include "autofs_i.h" -static struct dentry *autofs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_nodev(fs_type, flags, data, autofs_fill_super); -} - struct file_system_type autofs_fs_type = { .owner = THIS_MODULE, .name = "autofs", - .mount = autofs_mount, + .init_fs_context = autofs_init_fs_context, + .parameters = autofs_param_specs, .kill_sb = autofs_kill_sb, }; MODULE_ALIAS_FS("autofs"); diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index 2b49662ed237..a5083d447a62 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -6,7 +6,6 @@ #include <linux/seq_file.h> #include <linux/pagemap.h> -#include <linux/parser.h> #include "autofs_i.h" @@ -110,189 +109,179 @@ static const struct super_operations autofs_sops = { .evict_inode = autofs_evict_inode, }; -enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto, - Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire, - Opt_ignore}; - -static const match_table_t tokens = { - {Opt_fd, "fd=%u"}, - {Opt_uid, "uid=%u"}, - {Opt_gid, "gid=%u"}, - {Opt_pgrp, "pgrp=%u"}, - {Opt_minproto, "minproto=%u"}, - {Opt_maxproto, "maxproto=%u"}, - {Opt_indirect, "indirect"}, - {Opt_direct, "direct"}, - {Opt_offset, "offset"}, - {Opt_strictexpire, "strictexpire"}, - {Opt_ignore, "ignore"}, - {Opt_err, NULL} +enum { + Opt_direct, + Opt_fd, + Opt_gid, + Opt_ignore, + Opt_indirect, + Opt_maxproto, + Opt_minproto, + Opt_offset, + Opt_pgrp, + Opt_strictexpire, + Opt_uid, }; -static int parse_options(char *options, - struct inode *root, int *pgrp, bool *pgrp_set, - struct autofs_sb_info *sbi) +const struct fs_parameter_spec autofs_param_specs[] = { + fsparam_flag ("direct", Opt_direct), + fsparam_fd ("fd", Opt_fd), + fsparam_u32 ("gid", Opt_gid), + fsparam_flag ("ignore", Opt_ignore), + fsparam_flag ("indirect", Opt_indirect), + fsparam_u32 ("maxproto", Opt_maxproto), + fsparam_u32 ("minproto", Opt_minproto), + fsparam_flag ("offset", Opt_offset), + fsparam_u32 ("pgrp", Opt_pgrp), + fsparam_flag ("strictexpire", Opt_strictexpire), + fsparam_u32 ("uid", Opt_uid), + {} +}; + +struct autofs_fs_context { + kuid_t uid; + kgid_t gid; + int pgrp; + bool pgrp_set; +}; + +/* + * Open the fd. We do it here rather than in get_tree so that it's done in the + * context of the system call that passed the data and not the one that + * triggered the superblock creation, lest the fd gets reassigned. + */ +static int autofs_parse_fd(struct fs_context *fc, struct autofs_sb_info *sbi, + struct fs_parameter *param, + struct fs_parse_result *result) { - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - int pipefd = -1; - kuid_t uid; - kgid_t gid; + struct file *pipe; + int ret; - root->i_uid = current_uid(); - root->i_gid = current_gid(); + if (param->type == fs_value_is_file) { + /* came through the new api */ + pipe = param->file; + param->file = NULL; + } else { + pipe = fget(result->uint_32); + } + if (!pipe) { + errorf(fc, "could not open pipe file descriptor"); + return -EBADF; + } - sbi->min_proto = AUTOFS_MIN_PROTO_VERSION; - sbi->max_proto = AUTOFS_MAX_PROTO_VERSION; + ret = autofs_check_pipe(pipe); + if (ret < 0) { + errorf(fc, "Invalid/unusable pipe"); + if (param->type != fs_value_is_file) + fput(pipe); + return -EBADF; + } - sbi->pipefd = -1; + autofs_set_packet_pipe_flags(pipe); - if (!options) - return 1; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_fd: - if (match_int(args, &pipefd)) - return 1; - sbi->pipefd = pipefd; - break; - case Opt_uid: - if (match_int(args, &option)) - return 1; - uid = make_kuid(current_user_ns(), option); - if (!uid_valid(uid)) - return 1; - root->i_uid = uid; - break; - case Opt_gid: - if (match_int(args, &option)) - return 1; - gid = make_kgid(current_user_ns(), option); - if (!gid_valid(gid)) - return 1; - root->i_gid = gid; - break; - case Opt_pgrp: - if (match_int(args, &option)) - return 1; - *pgrp = option; - *pgrp_set = true; - break; - case Opt_minproto: - if (match_int(args, &option)) - return 1; - sbi->min_proto = option; - break; - case Opt_maxproto: - if (match_int(args, &option)) - return 1; - sbi->max_proto = option; - break; - case Opt_indirect: - set_autofs_type_indirect(&sbi->type); - break; - case Opt_direct: - set_autofs_type_direct(&sbi->type); - break; - case Opt_offset: - set_autofs_type_offset(&sbi->type); - break; - case Opt_strictexpire: - sbi->flags |= AUTOFS_SBI_STRICTEXPIRE; - break; - case Opt_ignore: - sbi->flags |= AUTOFS_SBI_IGNORE; - break; - default: - return 1; - } + if (sbi->pipe) + fput(sbi->pipe); + + sbi->pipefd = result->uint_32; + sbi->pipe = pipe; + + return 0; +} + +static int autofs_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct autofs_fs_context *ctx = fc->fs_private; + struct autofs_sb_info *sbi = fc->s_fs_info; + struct fs_parse_result result; + kuid_t uid; + kgid_t gid; + int opt; + + opt = fs_parse(fc, autofs_param_specs, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_fd: + return autofs_parse_fd(fc, sbi, param, &result); + case Opt_uid: + uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(uid)) + return invalfc(fc, "Invalid uid"); + ctx->uid = uid; + break; + case Opt_gid: + gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(gid)) + return invalfc(fc, "Invalid gid"); + ctx->gid = gid; + break; + case Opt_pgrp: + ctx->pgrp = result.uint_32; + ctx->pgrp_set = true; + break; + case Opt_minproto: + sbi->min_proto = result.uint_32; + break; + case Opt_maxproto: + sbi->max_proto = result.uint_32; + break; + case Opt_indirect: + set_autofs_type_indirect(&sbi->type); + break; + case Opt_direct: + set_autofs_type_direct(&sbi->type); + break; + case Opt_offset: + set_autofs_type_offset(&sbi->type); + break; + case Opt_strictexpire: + sbi->flags |= AUTOFS_SBI_STRICTEXPIRE; + break; + case Opt_ignore: + sbi->flags |= AUTOFS_SBI_IGNORE; } - return (sbi->pipefd < 0); + + return 0; } -int autofs_fill_super(struct super_block *s, void *data, int silent) +static struct autofs_sb_info *autofs_alloc_sbi(void) { - struct inode *root_inode; - struct dentry *root; - struct file *pipe; struct autofs_sb_info *sbi; - struct autofs_info *ino; - int pgrp = 0; - bool pgrp_set = false; - int ret = -EINVAL; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) - return -ENOMEM; - pr_debug("starting up, sbi = %p\n", sbi); + return NULL; - s->s_fs_info = sbi; sbi->magic = AUTOFS_SBI_MAGIC; - sbi->pipefd = -1; - sbi->pipe = NULL; - sbi->exp_timeout = 0; - sbi->oz_pgrp = NULL; - sbi->sb = s; - sbi->version = 0; - sbi->sub_version = 0; sbi->flags = AUTOFS_SBI_CATATONIC; + sbi->min_proto = AUTOFS_MIN_PROTO_VERSION; + sbi->max_proto = AUTOFS_MAX_PROTO_VERSION; + sbi->pipefd = -1; + set_autofs_type_indirect(&sbi->type); - sbi->min_proto = 0; - sbi->max_proto = 0; mutex_init(&sbi->wq_mutex); mutex_init(&sbi->pipe_mutex); spin_lock_init(&sbi->fs_lock); - sbi->queues = NULL; spin_lock_init(&sbi->lookup_lock); INIT_LIST_HEAD(&sbi->active_list); INIT_LIST_HEAD(&sbi->expiring_list); - s->s_blocksize = 1024; - s->s_blocksize_bits = 10; - s->s_magic = AUTOFS_SUPER_MAGIC; - s->s_op = &autofs_sops; - s->s_d_op = &autofs_dentry_operations; - s->s_time_gran = 1; - /* - * Get the root inode and dentry, but defer checking for errors. - */ - ino = autofs_new_ino(sbi); - if (!ino) { - ret = -ENOMEM; - goto fail_free; - } - root_inode = autofs_get_inode(s, S_IFDIR | 0755); - root = d_make_root(root_inode); - if (!root) { - ret = -ENOMEM; - goto fail_ino; - } - pipe = NULL; - - root->d_fsdata = ino; + return sbi; +} - /* Can this call block? */ - if (parse_options(data, root_inode, &pgrp, &pgrp_set, sbi)) { - pr_err("called with bogus options\n"); - goto fail_dput; - } +static int autofs_validate_protocol(struct fs_context *fc) +{ + struct autofs_sb_info *sbi = fc->s_fs_info; /* Test versions first */ if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION || sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) { - pr_err("kernel does not match daemon version " + errorf(fc, "kernel does not match daemon version " "daemon (%d, %d) kernel (%d, %d)\n", sbi->min_proto, sbi->max_proto, AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION); - goto fail_dput; + return -EINVAL; } /* Establish highest kernel protocol version */ @@ -300,13 +289,62 @@ int autofs_fill_super(struct super_block *s, void *data, int silent) sbi->version = AUTOFS_MAX_PROTO_VERSION; else sbi->version = sbi->max_proto; - sbi->sub_version = AUTOFS_PROTO_SUBVERSION; - if (pgrp_set) { - sbi->oz_pgrp = find_get_pid(pgrp); + switch (sbi->version) { + case 4: + sbi->sub_version = 7; + break; + case 5: + sbi->sub_version = AUTOFS_PROTO_SUBVERSION; + break; + default: + sbi->sub_version = 0; + } + + return 0; +} + +static int autofs_fill_super(struct super_block *s, struct fs_context *fc) +{ + struct autofs_fs_context *ctx = fc->fs_private; + struct autofs_sb_info *sbi = s->s_fs_info; + struct inode *root_inode; + struct dentry *root; + struct autofs_info *ino; + int ret = -ENOMEM; + + pr_debug("starting up, sbi = %p\n", sbi); + + sbi->sb = s; + s->s_blocksize = 1024; + s->s_blocksize_bits = 10; + s->s_magic = AUTOFS_SUPER_MAGIC; + s->s_op = &autofs_sops; + s->s_d_op = &autofs_dentry_operations; + s->s_time_gran = 1; + + /* + * Get the root inode and dentry, but defer checking for errors. + */ + ino = autofs_new_ino(sbi); + if (!ino) + goto fail; + + root_inode = autofs_get_inode(s, S_IFDIR | 0755); + root_inode->i_uid = ctx->uid; + root_inode->i_gid = ctx->gid; + + root = d_make_root(root_inode); + if (!root) + goto fail_ino; + + root->d_fsdata = ino; + + if (ctx->pgrp_set) { + sbi->oz_pgrp = find_get_pid(ctx->pgrp); if (!sbi->oz_pgrp) { - pr_err("could not find process group %d\n", - pgrp); + ret = invalf(fc, "Could not find process group %d", + ctx->pgrp); goto fail_dput; } } else { @@ -321,16 +359,7 @@ int autofs_fill_super(struct super_block *s, void *data, int silent) pr_debug("pipe fd = %d, pgrp = %u\n", sbi->pipefd, pid_nr(sbi->oz_pgrp)); - pipe = fget(sbi->pipefd); - if (!pipe) { - pr_err("could not open pipe file descriptor\n"); - goto fail_put_pid; - } - ret = autofs_prepare_pipe(pipe); - if (ret < 0) - goto fail_fput; - sbi->pipe = pipe; sbi->flags &= ~AUTOFS_SBI_CATATONIC; /* @@ -342,22 +371,82 @@ int autofs_fill_super(struct super_block *s, void *data, int silent) /* * Failure ... clean up. */ -fail_fput: - pr_err("pipe file descriptor does not contain proper ops\n"); - fput(pipe); -fail_put_pid: - put_pid(sbi->oz_pgrp); fail_dput: dput(root); - goto fail_free; + goto fail; fail_ino: autofs_free_ino(ino); -fail_free: - kfree(sbi); - s->s_fs_info = NULL; +fail: return ret; } +/* + * Validate the parameters and then request a superblock. + */ +static int autofs_get_tree(struct fs_context *fc) +{ + struct autofs_sb_info *sbi = fc->s_fs_info; + int ret; + + ret = autofs_validate_protocol(fc); + if (ret) + return ret; + + if (sbi->pipefd < 0) + return invalf(fc, "No control pipe specified"); + + return get_tree_nodev(fc, autofs_fill_super); +} + +static void autofs_free_fc(struct fs_context *fc) +{ + struct autofs_fs_context *ctx = fc->fs_private; + struct autofs_sb_info *sbi = fc->s_fs_info; + + if (sbi) { + if (sbi->pipe) + fput(sbi->pipe); + kfree(sbi); + } + kfree(ctx); +} + +static const struct fs_context_operations autofs_context_ops = { + .free = autofs_free_fc, + .parse_param = autofs_parse_param, + .get_tree = autofs_get_tree, +}; + +/* + * Set up the filesystem mount context. + */ +int autofs_init_fs_context(struct fs_context *fc) +{ + struct autofs_fs_context *ctx; + struct autofs_sb_info *sbi; + + ctx = kzalloc(sizeof(struct autofs_fs_context), GFP_KERNEL); + if (!ctx) + goto nomem; + + ctx->uid = current_uid(); + ctx->gid = current_gid(); + + sbi = autofs_alloc_sbi(); + if (!sbi) + goto nomem_ctx; + + fc->fs_private = ctx; + fc->s_fs_info = sbi; + fc->ops = &autofs_context_ops; + return 0; + +nomem_ctx: + kfree(ctx); +nomem: + return -ENOMEM; +} + struct inode *autofs_get_inode(struct super_block *sb, umode_t mode) { struct inode *inode = new_inode(sb); @@ -370,7 +459,7 @@ struct inode *autofs_get_inode(struct super_block *sb, umode_t mode) inode->i_uid = d_inode(sb->s_root)->i_uid; inode->i_gid = d_inode(sb->s_root)->i_gid; } - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); inode->i_ino = get_next_ino(); if (S_ISDIR(mode)) { diff --git a/fs/autofs/root.c b/fs/autofs/root.c index 512b9a26c63d..530d18827e35 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -600,7 +600,7 @@ static int autofs_dir_symlink(struct mnt_idmap *idmap, p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count++; - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); return 0; } @@ -633,7 +633,7 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry) d_inode(dentry)->i_size = 0; clear_nlink(d_inode(dentry)); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); spin_lock(&sbi->lookup_lock); __autofs_add_expiring(dentry); @@ -749,7 +749,7 @@ static int autofs_dir_mkdir(struct mnt_idmap *idmap, p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count++; inc_nlink(dir); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); return 0; } diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 83f9566c973b..316d88da2ce1 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -208,7 +208,7 @@ void make_bad_inode(struct inode *inode) remove_inode_hash(inode); inode->i_mode = S_IFREG; - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); inode->i_op = &bad_inode_ops; inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &bad_file_ops; diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index bc009ef497d0..6642b88c41a0 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -66,9 +66,9 @@ void bch2_inode_update_after_write(struct btree_trans *trans, inode->v.i_mode = bi->bi_mode; if (fields & ATTR_ATIME) - inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime); + inode_set_atime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_atime)); if (fields & ATTR_MTIME) - inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime); + inode_set_mtime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_mtime)); if (fields & ATTR_CTIME) inode_set_ctime_to_ts(&inode->v, bch2_time_to_timespec(c, bi->bi_ctime)); @@ -753,8 +753,8 @@ static int bch2_getattr(struct mnt_idmap *idmap, stat->gid = inode->v.i_gid; stat->rdev = inode->v.i_rdev; stat->size = i_size_read(&inode->v); - stat->atime = inode->v.i_atime; - stat->mtime = inode->v.i_mtime; + stat->atime = inode_get_atime(&inode->v); + stat->mtime = inode_get_mtime(&inode->v); stat->ctime = inode_get_ctime(&inode->v); stat->blksize = block_bytes(c); stat->blocks = inode->v.i_blocks; @@ -1418,8 +1418,8 @@ static int inode_update_times_fn(struct btree_trans *trans, { struct bch_fs *c = inode->v.i_sb->s_fs_info; - bi->bi_atime = timespec_to_bch2_time(c, inode->v.i_atime); - bi->bi_mtime = timespec_to_bch2_time(c, inode->v.i_mtime); + bi->bi_atime = timespec_to_bch2_time(c, inode_get_atime(&inode->v)); + bi->bi_mtime = timespec_to_bch2_time(c, inode_get_mtime(&inode->v)); bi->bi_ctime = timespec_to_bch2_time(c, inode_get_ctime(&inode->v)); return 0; diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 9a16a51fbb88..9acdec56f626 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -360,11 +360,11 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) * for indexing purposes. (PFD, page 54) */ - inode->i_mtime.tv_sec = - fs64_to_cpu(sb, raw_inode->last_modified_time) >> 16; - inode->i_mtime.tv_nsec = 0; /* lower 16 bits are not a time */ - inode_set_ctime_to_ts(inode, inode->i_mtime); - inode->i_atime = inode->i_mtime; + inode_set_mtime(inode, + fs64_to_cpu(sb, raw_inode->last_modified_time) >> 16, + 0);/* lower 16 bits are not a time */ + inode_set_ctime_to_ts(inode, inode_get_mtime(inode)); + inode_set_atime_to_ts(inode, inode_get_mtime(inode)); befs_ino->i_inode_num = fsrun_to_cpu(sb, raw_inode->inode_num); befs_ino->i_parent = fsrun_to_cpu(sb, raw_inode->parent); diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 12b8af04dcb3..fbc4ae80a4b2 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -97,7 +97,7 @@ static int bfs_create(struct mnt_idmap *idmap, struct inode *dir, set_bit(ino, info->si_imap); info->si_freei--; inode_init_owner(&nop_mnt_idmap, inode, dir, mode); - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); inode->i_blocks = 0; inode->i_op = &bfs_file_inops; inode->i_fop = &bfs_file_operations; @@ -187,7 +187,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry) } de->ino = 0; mark_buffer_dirty_inode(bh, dir); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); inode_dec_link_count(inode); @@ -240,7 +240,7 @@ static int bfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, goto end_rename; } old_de->ino = 0; - old_dir->i_mtime = inode_set_ctime_current(old_dir); + inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir)); mark_inode_dirty(old_dir); if (new_inode) { inode_set_ctime_current(new_inode); @@ -294,7 +294,8 @@ static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino) dir->i_size += BFS_DIRENT_SIZE; inode_set_ctime_current(dir); } - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, + inode_set_ctime_current(dir)); mark_inode_dirty(dir); de->ino = cpu_to_le16((u16)ino); for (i = 0; i < BFS_NAMELEN; i++) diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index e6a76ae9eb44..355957dbce39 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -80,11 +80,9 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino) set_nlink(inode, le32_to_cpu(di->i_nlink)); inode->i_size = BFS_FILESIZE(di); inode->i_blocks = BFS_FILEBLOCKS(di); - inode->i_atime.tv_sec = le32_to_cpu(di->i_atime); - inode->i_mtime.tv_sec = le32_to_cpu(di->i_mtime); + inode_set_atime(inode, le32_to_cpu(di->i_atime), 0); + inode_set_mtime(inode, le32_to_cpu(di->i_mtime), 0); inode_set_ctime(inode, le32_to_cpu(di->i_ctime), 0); - inode->i_atime.tv_nsec = 0; - inode->i_mtime.tv_nsec = 0; brelse(bh); unlock_new_inode(inode); @@ -140,9 +138,9 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc) di->i_uid = cpu_to_le32(i_uid_read(inode)); di->i_gid = cpu_to_le32(i_gid_read(inode)); di->i_nlink = cpu_to_le32(inode->i_nlink); - di->i_atime = cpu_to_le32(inode->i_atime.tv_sec); - di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); - di->i_ctime = cpu_to_le32(inode_get_ctime(inode).tv_sec); + di->i_atime = cpu_to_le32(inode_get_atime_sec(inode)); + di->i_mtime = cpu_to_le32(inode_get_mtime_sec(inode)); + di->i_ctime = cpu_to_le32(inode_get_ctime_sec(inode)); i_sblock = BFS_I(inode)->i_sblock; di->i_sblock = cpu_to_le32(i_sblock); di->i_eblock = cpu_to_le32(BFS_I(inode)->i_eblock); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 43b2a2851ba3..206812ce544a 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -345,10 +345,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) /* there's now no turning back... the old userspace image is dead, * defunct, deceased, etc. */ + SET_PERSONALITY(exec_params.hdr); if (elf_check_fdpic(&exec_params.hdr)) - set_personality(PER_LINUX_FDPIC); - else - set_personality(PER_LINUX); + current->personality |= PER_LINUX_FDPIC; if (elf_read_implies_exec(&exec_params.hdr, executable_stack)) current->personality |= READ_IMPLIES_EXEC; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index e0108d17b085..5d2be9b0a0a5 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -547,7 +547,7 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode) if (inode) { inode->i_ino = get_next_ino(); inode->i_mode = mode; - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); } return inode; } diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 3282adc84d52..4fb925e8c981 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -31,7 +31,7 @@ config BTRFS_FS continue to be mountable and usable by newer kernels. For more information, please see the web pages at - http://btrfs.wiki.kernel.org. + https://btrfs.readthedocs.io To compile this file system support as a module, choose M here. The module will be called btrfs. @@ -48,27 +48,6 @@ config BTRFS_FS_POSIX_ACL If you don't know what Access Control Lists are, say N -config BTRFS_FS_CHECK_INTEGRITY - bool "Btrfs with integrity check tool compiled in (DEPRECATED)" - depends on BTRFS_FS - help - This feature has been deprecated and will be removed in 6.7. - - Adds code that examines all block write requests (including - writes of the super block). The goal is to verify that the - state of the filesystem on disk is always consistent, i.e., - after a power-loss or kernel panic event the filesystem is - in a consistent state. - - If the integrity check tool is included and activated in - the mount options, plenty of kernel memory is used, and - plenty of additional CPU cycles are spent. Enabling this - functionality is not intended for normal use. - - In most cases, unless you are a btrfs developer who needs - to verify the integrity of (super)-block write requests - during the run of a regression test, say N - config BTRFS_FS_RUN_SANITY_TESTS bool "Btrfs will run sanity tests upon loading" depends on BTRFS_FS diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 90d53209755b..525af975f61c 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -33,10 +33,9 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o \ - lru_cache.o + lru_cache.o raid-stripe-tree.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o -btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o btrfs-$(CONFIG_BTRFS_FS_REF_VERIFY) += ref-verify.o btrfs-$(CONFIG_BLK_DEV_ZONED) += zoned.o btrfs-$(CONFIG_FS_VERITY) += verity.o diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 8cfc8214109c..aa0844535644 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -4,6 +4,7 @@ #define BTRFS_ACCESSORS_H #include <linux/stddef.h> +#include <asm/unaligned.h> struct btrfs_map_token { struct extent_buffer *eb; @@ -305,6 +306,14 @@ BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32); +BTRFS_SETGET_FUNCS(stripe_extent_encoding, struct btrfs_stripe_extent, encoding, 8); +BTRFS_SETGET_FUNCS(raid_stride_devid, struct btrfs_raid_stride, devid, 64); +BTRFS_SETGET_FUNCS(raid_stride_physical, struct btrfs_raid_stride, physical, 64); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_extent_encoding, + struct btrfs_stripe_extent, encoding, 8); +BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_devid, struct btrfs_raid_stride, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_raid_stride_physical, struct btrfs_raid_stride, physical, 64); + /* struct btrfs_dev_extent */ BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, chunk_tree, 64); BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, @@ -349,6 +358,9 @@ BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, count, 3 BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, count, 32); +BTRFS_SETGET_FUNCS(extent_owner_ref_root_id, struct btrfs_extent_owner_ref, + root_id, 64); + BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref, type, 8); BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref, @@ -365,6 +377,8 @@ static inline u32 btrfs_extent_inline_ref_size(int type) if (type == BTRFS_EXTENT_DATA_REF_KEY) return sizeof(struct btrfs_extent_data_ref) + offsetof(struct btrfs_extent_inline_ref, offset); + if (type == BTRFS_EXTENT_OWNER_REF_KEY) + return sizeof(struct btrfs_extent_inline_ref); return 0; } @@ -966,6 +980,8 @@ BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item, flags, 64); BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item, rescan, 64); +BTRFS_SETGET_FUNCS(qgroup_status_enable_gen, struct btrfs_qgroup_status_item, + enable_gen, 64); /* btrfs_qgroup_info_item */ BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item, diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index ce083e99ef68..9e261aac671e 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -9,6 +9,7 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/freezer.h> +#include <trace/events/btrfs.h> #include "async-thread.h" #include "ctree.h" @@ -242,7 +243,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq, break; trace_btrfs_ordered_sched(work); spin_unlock_irqrestore(lock, flags); - work->ordered_func(work); + work->ordered_func(work, false); /* now take the lock again and drop our item from the list */ spin_lock_irqsave(lock, flags); @@ -277,7 +278,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq, * We don't want to call the ordered free functions with * the lock held. */ - work->ordered_free(work); + work->ordered_func(work, true); /* NB: work must not be dereferenced past this point. */ trace_btrfs_all_work_done(wq->fs_info, work); } @@ -285,7 +286,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq, spin_unlock_irqrestore(lock, flags); if (free_self) { - self->ordered_free(self); + self->ordered_func(self, true); /* NB: self must not be dereferenced past this point. */ trace_btrfs_all_work_done(wq->fs_info, self); } @@ -300,7 +301,7 @@ static void btrfs_work_helper(struct work_struct *normal_work) /* * We should not touch things inside work in the following cases: - * 1) after work->func() if it has no ordered_free + * 1) after work->func() if it has no ordered_func(..., true) to free * Since the struct is freed in work->func(). * 2) after setting WORK_DONE_BIT * The work may be freed in other threads almost instantly. @@ -329,11 +330,10 @@ static void btrfs_work_helper(struct work_struct *normal_work) } void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func, - btrfs_func_t ordered_func, btrfs_func_t ordered_free) + btrfs_ordered_func_t ordered_func) { work->func = func; work->ordered_func = ordered_func; - work->ordered_free = ordered_free; INIT_WORK(&work->normal_work, btrfs_work_helper); INIT_LIST_HEAD(&work->ordered_list); work->flags = 0; diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 30f66c5e2e6e..62b8a0d57898 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -13,11 +13,11 @@ struct btrfs_fs_info; struct btrfs_workqueue; struct btrfs_work; typedef void (*btrfs_func_t)(struct btrfs_work *arg); +typedef void (*btrfs_ordered_func_t)(struct btrfs_work *arg, bool); struct btrfs_work { btrfs_func_t func; - btrfs_func_t ordered_func; - btrfs_func_t ordered_free; + btrfs_ordered_func_t ordered_func; /* Don't touch things below */ struct work_struct normal_work; @@ -35,7 +35,7 @@ struct btrfs_workqueue *btrfs_alloc_ordered_workqueue( struct btrfs_fs_info *fs_info, const char *name, unsigned int flags); void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func, - btrfs_func_t ordered_func, btrfs_func_t ordered_free); + btrfs_ordered_func_t ordered_func); void btrfs_queue_work(struct btrfs_workqueue *wq, struct btrfs_work *work); void btrfs_destroy_workqueue(struct btrfs_workqueue *wq); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index b7d54efb4728..beed7e459dab 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1129,6 +1129,9 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx, count, sc, GFP_NOFS); break; } + case BTRFS_EXTENT_OWNER_REF_KEY: + ASSERT(btrfs_fs_incompat(ctx->fs_info, SIMPLE_QUOTA)); + break; default: WARN_ON(1); } @@ -2998,7 +3001,7 @@ int btrfs_backref_iter_next(struct btrfs_backref_iter *iter) } void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info, - struct btrfs_backref_cache *cache, int is_reloc) + struct btrfs_backref_cache *cache, bool is_reloc) { int i; @@ -3196,12 +3199,14 @@ static int handle_direct_tree_backref(struct btrfs_backref_cache *cache, * We still need to do a tree search to find out the parents. This is for * TREE_BLOCK_REF backref (keyed or inlined). * + * @trans: Transaction handle. * @ref_key: The same as @ref_key in handle_direct_tree_backref() * @tree_key: The first key of this tree block. * @path: A clean (released) path, to avoid allocating path every time * the function get called. */ -static int handle_indirect_tree_backref(struct btrfs_backref_cache *cache, +static int handle_indirect_tree_backref(struct btrfs_trans_handle *trans, + struct btrfs_backref_cache *cache, struct btrfs_path *path, struct btrfs_key *ref_key, struct btrfs_key *tree_key, @@ -3315,7 +3320,7 @@ static int handle_indirect_tree_backref(struct btrfs_backref_cache *cache, * If we know the block isn't shared we can avoid * checking its backrefs. */ - if (btrfs_block_can_be_shared(root, eb)) + if (btrfs_block_can_be_shared(trans, root, eb)) upper->checked = 0; else upper->checked = 1; @@ -3363,11 +3368,13 @@ out: * links aren't yet bi-directional. Needs to finish such links. * Use btrfs_backref_finish_upper_links() to finish such linkage. * + * @trans: Transaction handle. * @path: Released path for indirect tree backref lookup * @iter: Released backref iter for extent tree search * @node_key: The first key of the tree block */ -int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache, +int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans, + struct btrfs_backref_cache *cache, struct btrfs_path *path, struct btrfs_backref_iter *iter, struct btrfs_key *node_key, @@ -3467,8 +3474,8 @@ int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache, * offset means the root objectid. We need to search * the tree to get its parent bytenr. */ - ret = handle_indirect_tree_backref(cache, path, &key, node_key, - cur); + ret = handle_indirect_tree_backref(trans, cache, path, + &key, node_key, cur); if (ret < 0) goto out; } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 1616e3e3f1e4..ab4ca0eda605 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -247,7 +247,7 @@ struct prelim_ref { struct rb_node rbnode; u64 root_id; struct btrfs_key key_for_search; - int level; + u8 level; int count; struct extent_inode_elem *inode_list; u64 parent; @@ -440,11 +440,11 @@ struct btrfs_backref_cache { * Reloction backref cache require more info for reloc root compared * to generic backref cache. */ - unsigned int is_reloc; + bool is_reloc; }; void btrfs_backref_init_cache(struct btrfs_fs_info *fs_info, - struct btrfs_backref_cache *cache, int is_reloc); + struct btrfs_backref_cache *cache, bool is_reloc); struct btrfs_backref_node *btrfs_backref_alloc_node( struct btrfs_backref_cache *cache, u64 bytenr, int level); struct btrfs_backref_edge *btrfs_backref_alloc_edge( @@ -533,14 +533,15 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, void btrfs_backref_release_cache(struct btrfs_backref_cache *cache); static inline void btrfs_backref_panic(struct btrfs_fs_info *fs_info, - u64 bytenr, int errno) + u64 bytenr, int error) { - btrfs_panic(fs_info, errno, + btrfs_panic(fs_info, error, "Inconsistency in backref cache found at offset %llu", bytenr); } -int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache, +int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans, + struct btrfs_backref_cache *cache, struct btrfs_path *path, struct btrfs_backref_iter *iter, struct btrfs_key *node_key, diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 12b12443efaa..4f3b693a16b1 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -10,11 +10,11 @@ #include "volumes.h" #include "raid56.h" #include "async-thread.h" -#include "check-integrity.h" #include "dev-replace.h" #include "rcu-string.h" #include "zoned.h" #include "file-item.h" +#include "raid-stripe-tree.h" static struct bio_set btrfs_bioset; static struct bio_set btrfs_clone_bioset; @@ -416,6 +416,9 @@ static void btrfs_orig_write_end_io(struct bio *bio) else bio->bi_status = BLK_STS_OK; + if (bio_op(bio) == REQ_OP_ZONE_APPEND && !bio->bi_status) + stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + btrfs_orig_bbio_end_io(bbio); btrfs_put_bioc(bioc); } @@ -427,6 +430,8 @@ static void btrfs_clone_write_end_io(struct bio *bio) if (bio->bi_status) { atomic_inc(&stripe->bioc->error); btrfs_log_dev_io_error(bio, stripe->dev); + } else if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + stripe->physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; } /* Pass on control to the original bio this one was cloned from */ @@ -463,8 +468,6 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), dev->devid, bio->bi_iter.bi_size); - btrfsic_check_bio(bio); - if (bio->bi_opf & REQ_BTRFS_CGROUP_PUNT) blkcg_punt_bio_submit(bio); else @@ -490,6 +493,7 @@ static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) bio->bi_private = &bioc->stripes[dev_nr]; bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT; bioc->stripes[dev_nr].bioc = bioc; + bioc->size = bio->bi_iter.bi_size; btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); } @@ -499,6 +503,8 @@ static void __btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, if (!bioc) { /* Single mirror read/write fast path. */ btrfs_bio(bio)->mirror_num = mirror_num; + if (bio_op(bio) != REQ_OP_READ) + btrfs_bio(bio)->orig_physical = smap->physical; bio->bi_iter.bi_sector = smap->physical >> SECTOR_SHIFT; if (bio_op(bio) != REQ_OP_READ) btrfs_bio(bio)->orig_physical = smap->physical; @@ -568,13 +574,20 @@ static void run_one_async_start(struct btrfs_work *work) * * At IO completion time the csums attached on the ordered extent record are * inserted into the tree. + * + * If called with @do_free == true, then it will free the work struct. */ -static void run_one_async_done(struct btrfs_work *work) +static void run_one_async_done(struct btrfs_work *work, bool do_free) { struct async_submit_bio *async = container_of(work, struct async_submit_bio, work); struct bio *bio = &async->bbio->bio; + if (do_free) { + kfree(container_of(work, struct async_submit_bio, work)); + return; + } + /* If an error occurred we just want to clean up the bio and move on. */ if (bio->bi_status) { btrfs_orig_bbio_end_io(async->bbio); @@ -590,11 +603,6 @@ static void run_one_async_done(struct btrfs_work *work) __btrfs_submit_bio(bio, async->bioc, &async->smap, async->mirror_num); } -static void run_one_async_free(struct btrfs_work *work) -{ - kfree(container_of(work, struct async_submit_bio, work)); -} - static bool should_async_write(struct btrfs_bio *bbio) { /* Submit synchronously if the checksum implementation is fast. */ @@ -636,8 +644,7 @@ static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio, async->smap = *smap; async->mirror_num = mirror_num; - btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, - run_one_async_free); + btrfs_init_work(&async->work, run_one_async_start, run_one_async_done); btrfs_queue_work(fs_info->workers, &async->work); return true; } @@ -657,9 +664,11 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) blk_status_t ret; int error; + smap.is_scrub = !bbio->inode; + btrfs_bio_counter_inc_blocked(fs_info); error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, - &bioc, &smap, &mirror_num, 1); + &bioc, &smap, &mirror_num); if (error) { ret = errno_to_blk_status(error); goto fail; @@ -691,6 +700,18 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) bio->bi_opf |= REQ_OP_ZONE_APPEND; } + if (is_data_bbio(bbio) && bioc && + btrfs_need_stripe_tree_update(bioc->fs_info, bioc->map_type)) { + /* + * No locking for the list update, as we only add to + * the list in the I/O submission path, and list + * iteration only happens in the completion path, which + * can't happen until after the last submission. + */ + btrfs_get_bioc(bioc); + list_add_tail(&bioc->rst_ordered_entry, &bbio->ordered->bioc_list); + } + /* * Csum items for reloc roots have already been cloned at this * point, so they are handled as part of the no-checksum case. @@ -779,8 +800,6 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; __bio_add_page(&bio, page, length, pg_offset); - - btrfsic_check_bio(&bio); ret = submit_bio_wait(&bio); if (ret) { /* try to remap that extent elsewhere? */ diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 0cb1dee965a0..6e5dc68ff661 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -935,7 +935,7 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait) caching_ctl->block_group = cache; refcount_set(&caching_ctl->count, 2); atomic_set(&caching_ctl->progress, 0); - btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL); + btrfs_init_work(&caching_ctl->work, caching_thread, NULL); spin_lock(&cache->lock); if (cache->cached != BTRFS_CACHE_NO) { @@ -1286,7 +1286,7 @@ out: /* Once for the lookup reference */ btrfs_put_block_group(block_group); if (remove_rsv) - btrfs_delayed_refs_rsv_release(fs_info, 1); + btrfs_dec_delayed_refs_rsv_bg_updates(fs_info); btrfs_free_path(path); return ret; } @@ -2601,7 +2601,7 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans, btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); btrfs_set_dev_extent_length(leaf, extent, num_bytes); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); return ret; @@ -2709,7 +2709,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) /* Already aborted the transaction if it failed. */ next: - btrfs_delayed_refs_rsv_release(fs_info, 1); + btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info); list_del_init(&block_group->bg_list); clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags); } @@ -2819,8 +2819,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran #endif list_add_tail(&cache->bg_list, &trans->new_bgs); - trans->delayed_ref_updates++; - btrfs_update_delayed_refs_rsv(trans); + btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info); set_avail_alloc_bits(fs_info, type); return cache; @@ -3025,11 +3024,19 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, cache->global_root_id); btrfs_set_stack_block_group_flags(&bgi, cache->flags); write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); fail: btrfs_release_path(path); - /* We didn't update the block group item, need to revert @commit_used. */ - if (ret < 0) { + /* + * We didn't update the block group item, need to revert commit_used + * unless the block group item didn't exist yet - this is to prevent a + * race with a concurrent insertion of the block group item, with + * insert_block_group_item(), that happened just after we attempted to + * update. In that case we would reset commit_used to 0 just after the + * insertion set it to a value greater than 0 - if the block group later + * becomes with 0 used bytes, we would incorrectly skip its update. + */ + if (ret < 0 && ret != -ENOENT) { spin_lock(&cache->lock); cache->commit_used = old_commit_used; spin_unlock(&cache->lock); @@ -3043,7 +3050,6 @@ static int cache_save_setup(struct btrfs_block_group *block_group, struct btrfs_path *path) { struct btrfs_fs_info *fs_info = block_group->fs_info; - struct btrfs_root *root = fs_info->tree_root; struct inode *inode = NULL; struct extent_changeset *data_reserved = NULL; u64 alloc_hint = 0; @@ -3095,7 +3101,7 @@ again: * time. */ BTRFS_I(inode)->generation = 0; - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) { /* * So theoretically we could recover from this, simply set the @@ -3362,7 +3368,7 @@ again: if (should_put) btrfs_put_block_group(cache); if (drop_reserve) - btrfs_delayed_refs_rsv_release(fs_info, 1); + btrfs_dec_delayed_refs_rsv_bg_updates(fs_info); /* * Avoid blocking other tasks for too long. It might even save * us from writing caches for block groups that are going to be @@ -3466,8 +3472,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) cache_save_setup(cache, trans, path); if (!ret) - ret = btrfs_run_delayed_refs(trans, - (unsigned long) -1); + ret = btrfs_run_delayed_refs(trans, U64_MAX); if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { cache->io_ctl.inode = NULL; @@ -3510,7 +3515,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) /* If its not on the io list, we need to put the block group */ if (should_put) btrfs_put_block_group(cache); - btrfs_delayed_refs_rsv_release(fs_info, 1); + btrfs_dec_delayed_refs_rsv_bg_updates(fs_info); spin_lock(&cur_trans->dirty_bgs_lock); } spin_unlock(&cur_trans->dirty_bgs_lock); @@ -3535,12 +3540,12 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, bool alloc) { struct btrfs_fs_info *info = trans->fs_info; - struct btrfs_block_group *cache = NULL; - u64 total = num_bytes; + struct btrfs_space_info *space_info; + struct btrfs_block_group *cache; u64 old_val; - u64 byte_in_group; + bool reclaim = false; + bool bg_already_dirty = true; int factor; - int ret = 0; /* Block accounting for super block */ spin_lock(&info->delalloc_root_lock); @@ -3552,97 +3557,86 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, btrfs_set_super_bytes_used(info->super_copy, old_val); spin_unlock(&info->delalloc_root_lock); - while (total) { - struct btrfs_space_info *space_info; - bool reclaim = false; - - cache = btrfs_lookup_block_group(info, bytenr); - if (!cache) { - ret = -ENOENT; - break; - } - space_info = cache->space_info; - factor = btrfs_bg_type_to_factor(cache->flags); + cache = btrfs_lookup_block_group(info, bytenr); + if (!cache) + return -ENOENT; - /* - * If this block group has free space cache written out, we - * need to make sure to load it if we are removing space. This - * is because we need the unpinning stage to actually add the - * space back to the block group, otherwise we will leak space. - */ - if (!alloc && !btrfs_block_group_done(cache)) - btrfs_cache_block_group(cache, true); + /* An extent can not span multiple block groups. */ + ASSERT(bytenr + num_bytes <= cache->start + cache->length); - byte_in_group = bytenr - cache->start; - WARN_ON(byte_in_group > cache->length); + space_info = cache->space_info; + factor = btrfs_bg_type_to_factor(cache->flags); - spin_lock(&space_info->lock); - spin_lock(&cache->lock); + /* + * If this block group has free space cache written out, we need to make + * sure to load it if we are removing space. This is because we need + * the unpinning stage to actually add the space back to the block group, + * otherwise we will leak space. + */ + if (!alloc && !btrfs_block_group_done(cache)) + btrfs_cache_block_group(cache, true); - if (btrfs_test_opt(info, SPACE_CACHE) && - cache->disk_cache_state < BTRFS_DC_CLEAR) - cache->disk_cache_state = BTRFS_DC_CLEAR; + spin_lock(&space_info->lock); + spin_lock(&cache->lock); - old_val = cache->used; - num_bytes = min(total, cache->length - byte_in_group); - if (alloc) { - old_val += num_bytes; - cache->used = old_val; - cache->reserved -= num_bytes; - space_info->bytes_reserved -= num_bytes; - space_info->bytes_used += num_bytes; - space_info->disk_used += num_bytes * factor; - spin_unlock(&cache->lock); - spin_unlock(&space_info->lock); - } else { - old_val -= num_bytes; - cache->used = old_val; - cache->pinned += num_bytes; - btrfs_space_info_update_bytes_pinned(info, space_info, - num_bytes); - space_info->bytes_used -= num_bytes; - space_info->disk_used -= num_bytes * factor; + if (btrfs_test_opt(info, SPACE_CACHE) && + cache->disk_cache_state < BTRFS_DC_CLEAR) + cache->disk_cache_state = BTRFS_DC_CLEAR; - reclaim = should_reclaim_block_group(cache, num_bytes); + old_val = cache->used; + if (alloc) { + old_val += num_bytes; + cache->used = old_val; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; + space_info->bytes_used += num_bytes; + space_info->disk_used += num_bytes * factor; + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); + } else { + old_val -= num_bytes; + cache->used = old_val; + cache->pinned += num_bytes; + btrfs_space_info_update_bytes_pinned(info, space_info, num_bytes); + space_info->bytes_used -= num_bytes; + space_info->disk_used -= num_bytes * factor; - spin_unlock(&cache->lock); - spin_unlock(&space_info->lock); + reclaim = should_reclaim_block_group(cache, num_bytes); - set_extent_bit(&trans->transaction->pinned_extents, - bytenr, bytenr + num_bytes - 1, - EXTENT_DIRTY, NULL); - } + spin_unlock(&cache->lock); + spin_unlock(&space_info->lock); - spin_lock(&trans->transaction->dirty_bgs_lock); - if (list_empty(&cache->dirty_list)) { - list_add_tail(&cache->dirty_list, - &trans->transaction->dirty_bgs); - trans->delayed_ref_updates++; - btrfs_get_block_group(cache); - } - spin_unlock(&trans->transaction->dirty_bgs_lock); + set_extent_bit(&trans->transaction->pinned_extents, bytenr, + bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); + } - /* - * No longer have used bytes in this block group, queue it for - * deletion. We do this after adding the block group to the - * dirty list to avoid races between cleaner kthread and space - * cache writeout. - */ - if (!alloc && old_val == 0) { - if (!btrfs_test_opt(info, DISCARD_ASYNC)) - btrfs_mark_bg_unused(cache); - } else if (!alloc && reclaim) { - btrfs_mark_bg_to_reclaim(cache); - } + spin_lock(&trans->transaction->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, &trans->transaction->dirty_bgs); + bg_already_dirty = false; + btrfs_get_block_group(cache); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); - btrfs_put_block_group(cache); - total -= num_bytes; - bytenr += num_bytes; + /* + * No longer have used bytes in this block group, queue it for deletion. + * We do this after adding the block group to the dirty list to avoid + * races between cleaner kthread and space cache writeout. + */ + if (!alloc && old_val == 0) { + if (!btrfs_test_opt(info, DISCARD_ASYNC)) + btrfs_mark_bg_unused(cache); + } else if (!alloc && reclaim) { + btrfs_mark_bg_to_reclaim(cache); } + btrfs_put_block_group(cache); + /* Modified block groups are accounted for in the delayed_refs_rsv. */ - btrfs_update_delayed_refs_rsv(trans); - return ret; + if (!bg_already_dirty) + btrfs_inc_delayed_refs_rsv_bg_updates(info); + + return 0; } /* diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 77684c5e0c8b..ceb5f586a2d5 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -221,7 +221,8 @@ int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info, if (num_bytes == 0) return 0; - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, + num_bytes, flush); if (!ret) btrfs_block_rsv_add_bytes(block_rsv, num_bytes, true); @@ -261,7 +262,8 @@ int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info, if (!ret) return 0; - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, + num_bytes, flush); if (!ret) { btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false); return 0; @@ -279,10 +281,10 @@ u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *target = NULL; /* - * If we are the delayed_rsv then push to the global rsv, otherwise dump - * into the delayed rsv if it is not full. + * If we are a delayed block reserve then push to the global rsv, + * otherwise dump into the global delayed reserve if it is not full. */ - if (block_rsv == delayed_rsv) + if (block_rsv->type == BTRFS_BLOCK_RSV_DELOPS) target = global_rsv; else if (block_rsv != global_rsv && !btrfs_block_rsv_full(delayed_rsv)) target = delayed_rsv; @@ -354,6 +356,11 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) min_items++; } + if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) { + num_bytes += btrfs_root_used(&fs_info->stripe_root->root_item); + min_items++; + } + /* * But we also want to reserve enough space so we can do the fallback * global reserve for an unlink, which is an additional @@ -405,6 +412,7 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root) case BTRFS_EXTENT_TREE_OBJECTID: case BTRFS_FREE_SPACE_TREE_OBJECTID: case BTRFS_BLOCK_GROUP_TREE_OBJECTID: + case BTRFS_RAID_STRIPE_TREE_OBJECTID: root->block_rsv = &fs_info->delayed_refs_rsv; break; case BTRFS_ROOT_TREE_OBJECTID: @@ -517,8 +525,8 @@ again: block_rsv->type, ret); } try_reserve: - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize, - BTRFS_RESERVE_NO_FLUSH); + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, + blocksize, BTRFS_RESERVE_NO_FLUSH); if (!ret) return block_rsv; /* @@ -539,7 +547,7 @@ try_reserve: * one last time to force a reservation if there's enough actual space * on disk to make the reservation. */ - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize, + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, blocksize, BTRFS_RESERVE_FLUSH_EMERGENCY); if (!ret) return block_rsv; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index bda1fdbba666..5572ae52444e 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -8,6 +8,8 @@ #include <linux/hash.h> #include <linux/refcount.h> +#include <linux/fscrypt.h> +#include <trace/events/btrfs.h> #include "extent_map.h" #include "extent_io.h" #include "ordered-data.h" @@ -79,11 +81,21 @@ struct btrfs_inode { */ struct btrfs_key location; + /* Cached value of inode property 'compression'. */ + u8 prop_compress; + + /* + * Force compression on the file using the defrag ioctl, could be + * different from prop_compress and takes precedence if set. + */ + u8 defrag_compress; + /* * Lock for counters and all fields used to determine if the inode is in * the log or not (last_trans, last_sub_trans, last_log_commit, - * logged_trans), to access/update new_delalloc_bytes and to update the - * VFS' inode number of bytes used. + * logged_trans), to access/update delalloc_bytes, new_delalloc_bytes, + * defrag_bytes, disk_i_size, outstanding_extents, csum_bytes and to + * update the VFS' inode number of bytes used. */ spinlock_t lock; @@ -102,8 +114,18 @@ struct btrfs_inode { /* held while logging the inode in tree-log.c */ struct mutex log_mutex; + /* + * Counters to keep track of the number of extent item's we may use due + * to delalloc and such. outstanding_extents is the number of extent + * items we think we'll end up using, and reserved_extents is the number + * of extent items we've reserved metadata for. Protected by 'lock'. + */ + unsigned outstanding_extents; + /* used to order data wrt metadata */ - struct btrfs_ordered_inode_tree ordered_tree; + spinlock_t ordered_tree_lock; + struct rb_root ordered_tree; + struct rb_node *ordered_tree_last; /* list of all the delalloc inodes in the FS. There are times we need * to write all the delalloc pages to disk, and this list is used @@ -122,28 +144,31 @@ struct btrfs_inode { u64 generation; /* - * transid of the trans_handle that last modified this inode + * ID of the transaction handle that last modified this inode. + * Protected by 'lock'. */ u64 last_trans; /* - * transid that last logged this inode + * ID of the transaction that last logged this inode. + * Protected by 'lock'. */ u64 logged_trans; /* - * log transid when this inode was last modified + * Log transaction ID when this inode was last modified. + * Protected by 'lock'. */ int last_sub_trans; - /* a local copy of root's last_log_commit */ + /* A local copy of root's last_log_commit. Protected by 'lock'. */ int last_log_commit; union { /* * Total number of bytes pending delalloc, used by stat to * calculate the real block usage of the file. This is used - * only for files. + * only for files. Protected by 'lock'. */ u64 delalloc_bytes; /* @@ -161,7 +186,7 @@ struct btrfs_inode { * Total number of bytes pending delalloc that fall within a file * range that is either a hole or beyond EOF (and no prealloc extent * exists in the range). This is always <= delalloc_bytes and this - * is used only for files. + * is used only for files. Protected by 'lock'. */ u64 new_delalloc_bytes; /* @@ -172,15 +197,15 @@ struct btrfs_inode { }; /* - * total number of bytes pending defrag, used by stat to check whether - * it needs COW. + * Total number of bytes pending defrag, used by stat to check whether + * it needs COW. Protected by 'lock'. */ u64 defrag_bytes; /* - * the size of the file stored in the metadata on disk. data=ordered + * The size of the file stored in the metadata on disk. data=ordered * means the in-memory i_size might be larger than the size on disk - * because not all the blocks are written yet. + * because not all the blocks are written yet. Protected by 'lock'. */ u64 disk_i_size; @@ -214,7 +239,7 @@ struct btrfs_inode { /* * Number of bytes outstanding that are going to need csums. This is - * used in ENOSPC accounting. + * used in ENOSPC accounting. Protected by 'lock'. */ u64 csum_bytes; @@ -223,30 +248,13 @@ struct btrfs_inode { /* Read-only compatibility flags, upper half of inode_item::flags */ u32 ro_flags; - /* - * Counters to keep track of the number of extent item's we may use due - * to delalloc and such. outstanding_extents is the number of extent - * items we think we'll end up using, and reserved_extents is the number - * of extent items we've reserved metadata for. - */ - unsigned outstanding_extents; - struct btrfs_block_rsv block_rsv; - /* - * Cached values of inode properties - */ - unsigned prop_compress; /* per-file compression algorithm */ - /* - * Force compression on the file using the defrag ioctl, could be - * different from prop_compress and takes precedence if set - */ - unsigned defrag_compress; - struct btrfs_delayed_node *delayed_node; /* File creation time. */ - struct timespec64 i_otime; + u64 i_otime_sec; + u32 i_otime_nsec; /* Hook into fs_info->delayed_iputs */ struct list_head delayed_iput; @@ -387,7 +395,7 @@ static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) spin_lock(&inode->lock); if (inode->logged_trans == generation && inode->last_sub_trans <= inode->last_log_commit && - inode->last_sub_trans <= inode->root->last_log_commit) + inode->last_sub_trans <= btrfs_get_root_last_log_commit(inode->root)) ret = true; spin_unlock(&inode->lock); return ret; @@ -481,9 +489,9 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 end); int btrfs_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode); + struct btrfs_inode *inode); int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode); + struct btrfs_inode *inode); int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_orphan_cleanup(struct btrfs_root *root); int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size); diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c deleted file mode 100644 index 3caf339c4bb3..000000000000 --- a/fs/btrfs/check-integrity.c +++ /dev/null @@ -1,2871 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) STRATO AG 2011. All rights reserved. - */ - -/* - * This module can be used to catch cases when the btrfs kernel - * code executes write requests to the disk that bring the file - * system in an inconsistent state. In such a state, a power-loss - * or kernel panic event would cause that the data on disk is - * lost or at least damaged. - * - * Code is added that examines all block write requests during - * runtime (including writes of the super block). Three rules - * are verified and an error is printed on violation of the - * rules: - * 1. It is not allowed to write a disk block which is - * currently referenced by the super block (either directly - * or indirectly). - * 2. When a super block is written, it is verified that all - * referenced (directly or indirectly) blocks fulfill the - * following requirements: - * 2a. All referenced blocks have either been present when - * the file system was mounted, (i.e., they have been - * referenced by the super block) or they have been - * written since then and the write completion callback - * was called and no write error was indicated and a - * FLUSH request to the device where these blocks are - * located was received and completed. - * 2b. All referenced blocks need to have a generation - * number which is equal to the parent's number. - * - * One issue that was found using this module was that the log - * tree on disk became temporarily corrupted because disk blocks - * that had been in use for the log tree had been freed and - * reused too early, while being referenced by the written super - * block. - * - * The search term in the kernel log that can be used to filter - * on the existence of detected integrity issues is - * "btrfs: attempt". - * - * The integrity check is enabled via mount options. These - * mount options are only supported if the integrity check - * tool is compiled by defining BTRFS_FS_CHECK_INTEGRITY. - * - * Example #1, apply integrity checks to all metadata: - * mount /dev/sdb1 /mnt -o check_int - * - * Example #2, apply integrity checks to all metadata and - * to data extents: - * mount /dev/sdb1 /mnt -o check_int_data - * - * Example #3, apply integrity checks to all metadata and dump - * the tree that the super block references to kernel messages - * each time after a super block was written: - * mount /dev/sdb1 /mnt -o check_int,check_int_print_mask=263 - * - * If the integrity check tool is included and activated in - * the mount options, plenty of kernel memory is used, and - * plenty of additional CPU cycles are spent. Enabling this - * functionality is not intended for normal use. In most - * cases, unless you are a btrfs developer who needs to verify - * the integrity of (super)-block write requests, do not - * enable the config option BTRFS_FS_CHECK_INTEGRITY to - * include and compile the integrity check tool. - * - * Expect millions of lines of information in the kernel log with an - * enabled check_int_print_mask. Therefore set LOG_BUF_SHIFT in the - * kernel config to at least 26 (which is 64MB). Usually the value is - * limited to 21 (which is 2MB) in init/Kconfig. The file needs to be - * changed like this before LOG_BUF_SHIFT can be set to a high value: - * config LOG_BUF_SHIFT - * int "Kernel log buffer size (16 => 64KB, 17 => 128KB)" - * range 12 30 - */ - -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/mutex.h> -#include <linux/blkdev.h> -#include <linux/mm.h> -#include <linux/string.h> -#include <crypto/hash.h> -#include "messages.h" -#include "ctree.h" -#include "disk-io.h" -#include "transaction.h" -#include "extent_io.h" -#include "volumes.h" -#include "print-tree.h" -#include "locking.h" -#include "check-integrity.h" -#include "rcu-string.h" -#include "compression.h" -#include "accessors.h" - -#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000 -#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000 -#define BTRFSIC_DEV2STATE_HASHTABLE_SIZE 0x100 -#define BTRFSIC_BLOCK_MAGIC_NUMBER 0x14491051 -#define BTRFSIC_BLOCK_LINK_MAGIC_NUMBER 0x11070807 -#define BTRFSIC_DEV2STATE_MAGIC_NUMBER 0x20111530 -#define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300 -#define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6) /* in characters, - * excluding " [...]" */ -#define BTRFSIC_GENERATION_UNKNOWN ((u64)-1) - -/* - * The definition of the bitmask fields for the print_mask. - * They are specified with the mount option check_integrity_print_mask. - */ -#define BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE 0x00000001 -#define BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION 0x00000002 -#define BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE 0x00000004 -#define BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE 0x00000008 -#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH 0x00000010 -#define BTRFSIC_PRINT_MASK_END_IO_BIO_BH 0x00000020 -#define BTRFSIC_PRINT_MASK_VERBOSE 0x00000040 -#define BTRFSIC_PRINT_MASK_VERY_VERBOSE 0x00000080 -#define BTRFSIC_PRINT_MASK_INITIAL_TREE 0x00000100 -#define BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES 0x00000200 -#define BTRFSIC_PRINT_MASK_INITIAL_DATABASE 0x00000400 -#define BTRFSIC_PRINT_MASK_NUM_COPIES 0x00000800 -#define BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS 0x00001000 -#define BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE 0x00002000 - -struct btrfsic_dev_state; -struct btrfsic_state; - -struct btrfsic_block { - u32 magic_num; /* only used for debug purposes */ - unsigned int is_metadata:1; /* if it is meta-data, not data-data */ - unsigned int is_superblock:1; /* if it is one of the superblocks */ - unsigned int is_iodone:1; /* if is done by lower subsystem */ - unsigned int iodone_w_error:1; /* error was indicated to endio */ - unsigned int never_written:1; /* block was added because it was - * referenced, not because it was - * written */ - unsigned int mirror_num; /* large enough to hold - * BTRFS_SUPER_MIRROR_MAX */ - struct btrfsic_dev_state *dev_state; - u64 dev_bytenr; /* key, physical byte num on disk */ - u64 logical_bytenr; /* logical byte num on disk */ - u64 generation; - struct btrfs_disk_key disk_key; /* extra info to print in case of - * issues, will not always be correct */ - struct list_head collision_resolving_node; /* list node */ - struct list_head all_blocks_node; /* list node */ - - /* the following two lists contain block_link items */ - struct list_head ref_to_list; /* list */ - struct list_head ref_from_list; /* list */ - struct btrfsic_block *next_in_same_bio; - void *orig_bio_private; - bio_end_io_t *orig_bio_end_io; - blk_opf_t submit_bio_bh_rw; - u64 flush_gen; /* only valid if !never_written */ -}; - -/* - * Elements of this type are allocated dynamically and required because - * each block object can refer to and can be ref from multiple blocks. - * The key to lookup them in the hashtable is the dev_bytenr of - * the block ref to plus the one from the block referred from. - * The fact that they are searchable via a hashtable and that a - * ref_cnt is maintained is not required for the btrfs integrity - * check algorithm itself, it is only used to make the output more - * beautiful in case that an error is detected (an error is defined - * as a write operation to a block while that block is still referenced). - */ -struct btrfsic_block_link { - u32 magic_num; /* only used for debug purposes */ - u32 ref_cnt; - struct list_head node_ref_to; /* list node */ - struct list_head node_ref_from; /* list node */ - struct list_head collision_resolving_node; /* list node */ - struct btrfsic_block *block_ref_to; - struct btrfsic_block *block_ref_from; - u64 parent_generation; -}; - -struct btrfsic_dev_state { - u32 magic_num; /* only used for debug purposes */ - struct block_device *bdev; - struct btrfsic_state *state; - struct list_head collision_resolving_node; /* list node */ - struct btrfsic_block dummy_block_for_bio_bh_flush; - u64 last_flush_gen; -}; - -struct btrfsic_block_hashtable { - struct list_head table[BTRFSIC_BLOCK_HASHTABLE_SIZE]; -}; - -struct btrfsic_block_link_hashtable { - struct list_head table[BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE]; -}; - -struct btrfsic_dev_state_hashtable { - struct list_head table[BTRFSIC_DEV2STATE_HASHTABLE_SIZE]; -}; - -struct btrfsic_block_data_ctx { - u64 start; /* virtual bytenr */ - u64 dev_bytenr; /* physical bytenr on device */ - u32 len; - struct btrfsic_dev_state *dev; - char **datav; - struct page **pagev; - void *mem_to_free; -}; - -/* This structure is used to implement recursion without occupying - * any stack space, refer to btrfsic_process_metablock() */ -struct btrfsic_stack_frame { - u32 magic; - u32 nr; - int error; - int i; - int limit_nesting; - int num_copies; - int mirror_num; - struct btrfsic_block *block; - struct btrfsic_block_data_ctx *block_ctx; - struct btrfsic_block *next_block; - struct btrfsic_block_data_ctx next_block_ctx; - struct btrfs_header *hdr; - struct btrfsic_stack_frame *prev; -}; - -/* Some state per mounted filesystem */ -struct btrfsic_state { - u32 print_mask; - int include_extent_data; - struct list_head all_blocks_list; - struct btrfsic_block_hashtable block_hashtable; - struct btrfsic_block_link_hashtable block_link_hashtable; - struct btrfs_fs_info *fs_info; - u64 max_superblock_generation; - struct btrfsic_block *latest_superblock; - u32 metablock_size; - u32 datablock_size; -}; - -static int btrfsic_process_metablock(struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx *block_ctx, - int limit_nesting, int force_iodone_flag); -static void btrfsic_read_from_block_data( - struct btrfsic_block_data_ctx *block_ctx, - void *dst, u32 offset, size_t len); -static int btrfsic_create_link_to_next_block( - struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx - *block_ctx, u64 next_bytenr, - int limit_nesting, - struct btrfsic_block_data_ctx *next_block_ctx, - struct btrfsic_block **next_blockp, - int force_iodone_flag, - int *num_copiesp, int *mirror_nump, - struct btrfs_disk_key *disk_key, - u64 parent_generation); -static int btrfsic_handle_extent_data(struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx *block_ctx, - u32 item_offset, int force_iodone_flag); -static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, - struct btrfsic_block_data_ctx *block_ctx_out, - int mirror_num); -static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx); -static int btrfsic_read_block(struct btrfsic_state *state, - struct btrfsic_block_data_ctx *block_ctx); -static int btrfsic_process_written_superblock( - struct btrfsic_state *state, - struct btrfsic_block *const block, - struct btrfs_super_block *const super_hdr); -static void btrfsic_bio_end_io(struct bio *bp); -static int btrfsic_is_block_ref_by_superblock(const struct btrfsic_state *state, - const struct btrfsic_block *block, - int recursion_level); -static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, - struct btrfsic_block *const block, - int recursion_level); -static void btrfsic_print_add_link(const struct btrfsic_state *state, - const struct btrfsic_block_link *l); -static void btrfsic_print_rem_link(const struct btrfsic_state *state, - const struct btrfsic_block_link *l); -static char btrfsic_get_block_type(const struct btrfsic_state *state, - const struct btrfsic_block *block); -static void btrfsic_dump_tree(const struct btrfsic_state *state); -static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, - const struct btrfsic_block *block, - int indent_level); -static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add( - struct btrfsic_state *state, - struct btrfsic_block_data_ctx *next_block_ctx, - struct btrfsic_block *next_block, - struct btrfsic_block *from_block, - u64 parent_generation); -static struct btrfsic_block *btrfsic_block_lookup_or_add( - struct btrfsic_state *state, - struct btrfsic_block_data_ctx *block_ctx, - const char *additional_string, - int is_metadata, - int is_iodone, - int never_written, - int mirror_num, - int *was_created); -static int btrfsic_process_superblock_dev_mirror( - struct btrfsic_state *state, - struct btrfsic_dev_state *dev_state, - struct btrfs_device *device, - int superblock_mirror_num, - struct btrfsic_dev_state **selected_dev_state, - struct btrfs_super_block *selected_super); -static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev); -static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, - u64 bytenr, - struct btrfsic_dev_state *dev_state, - u64 dev_bytenr); - -static struct mutex btrfsic_mutex; -static int btrfsic_is_initialized; -static struct btrfsic_dev_state_hashtable btrfsic_dev_state_hashtable; - - -static void btrfsic_block_init(struct btrfsic_block *b) -{ - b->magic_num = BTRFSIC_BLOCK_MAGIC_NUMBER; - b->dev_state = NULL; - b->dev_bytenr = 0; - b->logical_bytenr = 0; - b->generation = BTRFSIC_GENERATION_UNKNOWN; - b->disk_key.objectid = 0; - b->disk_key.type = 0; - b->disk_key.offset = 0; - b->is_metadata = 0; - b->is_superblock = 0; - b->is_iodone = 0; - b->iodone_w_error = 0; - b->never_written = 0; - b->mirror_num = 0; - b->next_in_same_bio = NULL; - b->orig_bio_private = NULL; - b->orig_bio_end_io = NULL; - INIT_LIST_HEAD(&b->collision_resolving_node); - INIT_LIST_HEAD(&b->all_blocks_node); - INIT_LIST_HEAD(&b->ref_to_list); - INIT_LIST_HEAD(&b->ref_from_list); - b->submit_bio_bh_rw = 0; - b->flush_gen = 0; -} - -static struct btrfsic_block *btrfsic_block_alloc(void) -{ - struct btrfsic_block *b; - - b = kzalloc(sizeof(*b), GFP_NOFS); - if (NULL != b) - btrfsic_block_init(b); - - return b; -} - -static void btrfsic_block_free(struct btrfsic_block *b) -{ - BUG_ON(!(NULL == b || BTRFSIC_BLOCK_MAGIC_NUMBER == b->magic_num)); - kfree(b); -} - -static void btrfsic_block_link_init(struct btrfsic_block_link *l) -{ - l->magic_num = BTRFSIC_BLOCK_LINK_MAGIC_NUMBER; - l->ref_cnt = 1; - INIT_LIST_HEAD(&l->node_ref_to); - INIT_LIST_HEAD(&l->node_ref_from); - INIT_LIST_HEAD(&l->collision_resolving_node); - l->block_ref_to = NULL; - l->block_ref_from = NULL; -} - -static struct btrfsic_block_link *btrfsic_block_link_alloc(void) -{ - struct btrfsic_block_link *l; - - l = kzalloc(sizeof(*l), GFP_NOFS); - if (NULL != l) - btrfsic_block_link_init(l); - - return l; -} - -static void btrfsic_block_link_free(struct btrfsic_block_link *l) -{ - BUG_ON(!(NULL == l || BTRFSIC_BLOCK_LINK_MAGIC_NUMBER == l->magic_num)); - kfree(l); -} - -static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds) -{ - ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER; - ds->bdev = NULL; - ds->state = NULL; - INIT_LIST_HEAD(&ds->collision_resolving_node); - ds->last_flush_gen = 0; - btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush); - ds->dummy_block_for_bio_bh_flush.is_iodone = 1; - ds->dummy_block_for_bio_bh_flush.dev_state = ds; -} - -static struct btrfsic_dev_state *btrfsic_dev_state_alloc(void) -{ - struct btrfsic_dev_state *ds; - - ds = kzalloc(sizeof(*ds), GFP_NOFS); - if (NULL != ds) - btrfsic_dev_state_init(ds); - - return ds; -} - -static void btrfsic_dev_state_free(struct btrfsic_dev_state *ds) -{ - BUG_ON(!(NULL == ds || - BTRFSIC_DEV2STATE_MAGIC_NUMBER == ds->magic_num)); - kfree(ds); -} - -static void btrfsic_block_hashtable_init(struct btrfsic_block_hashtable *h) -{ - int i; - - for (i = 0; i < BTRFSIC_BLOCK_HASHTABLE_SIZE; i++) - INIT_LIST_HEAD(h->table + i); -} - -static void btrfsic_block_hashtable_add(struct btrfsic_block *b, - struct btrfsic_block_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)(b->dev_bytenr >> 16)) ^ - ((unsigned int)((uintptr_t)b->dev_state->bdev))) & - (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); - - list_add(&b->collision_resolving_node, h->table + hashval); -} - -static void btrfsic_block_hashtable_remove(struct btrfsic_block *b) -{ - list_del(&b->collision_resolving_node); -} - -static struct btrfsic_block *btrfsic_block_hashtable_lookup( - struct block_device *bdev, - u64 dev_bytenr, - struct btrfsic_block_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)(dev_bytenr >> 16)) ^ - ((unsigned int)((uintptr_t)bdev))) & - (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); - struct btrfsic_block *b; - - list_for_each_entry(b, h->table + hashval, collision_resolving_node) { - if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr) - return b; - } - - return NULL; -} - -static void btrfsic_block_link_hashtable_init( - struct btrfsic_block_link_hashtable *h) -{ - int i; - - for (i = 0; i < BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE; i++) - INIT_LIST_HEAD(h->table + i); -} - -static void btrfsic_block_link_hashtable_add( - struct btrfsic_block_link *l, - struct btrfsic_block_link_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)(l->block_ref_to->dev_bytenr >> 16)) ^ - ((unsigned int)(l->block_ref_from->dev_bytenr >> 16)) ^ - ((unsigned int)((uintptr_t)l->block_ref_to->dev_state->bdev)) ^ - ((unsigned int)((uintptr_t)l->block_ref_from->dev_state->bdev))) - & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); - - BUG_ON(NULL == l->block_ref_to); - BUG_ON(NULL == l->block_ref_from); - list_add(&l->collision_resolving_node, h->table + hashval); -} - -static void btrfsic_block_link_hashtable_remove(struct btrfsic_block_link *l) -{ - list_del(&l->collision_resolving_node); -} - -static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( - struct block_device *bdev_ref_to, - u64 dev_bytenr_ref_to, - struct block_device *bdev_ref_from, - u64 dev_bytenr_ref_from, - struct btrfsic_block_link_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)(dev_bytenr_ref_to >> 16)) ^ - ((unsigned int)(dev_bytenr_ref_from >> 16)) ^ - ((unsigned int)((uintptr_t)bdev_ref_to)) ^ - ((unsigned int)((uintptr_t)bdev_ref_from))) & - (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); - struct btrfsic_block_link *l; - - list_for_each_entry(l, h->table + hashval, collision_resolving_node) { - BUG_ON(NULL == l->block_ref_to); - BUG_ON(NULL == l->block_ref_from); - if (l->block_ref_to->dev_state->bdev == bdev_ref_to && - l->block_ref_to->dev_bytenr == dev_bytenr_ref_to && - l->block_ref_from->dev_state->bdev == bdev_ref_from && - l->block_ref_from->dev_bytenr == dev_bytenr_ref_from) - return l; - } - - return NULL; -} - -static void btrfsic_dev_state_hashtable_init( - struct btrfsic_dev_state_hashtable *h) -{ - int i; - - for (i = 0; i < BTRFSIC_DEV2STATE_HASHTABLE_SIZE; i++) - INIT_LIST_HEAD(h->table + i); -} - -static void btrfsic_dev_state_hashtable_add( - struct btrfsic_dev_state *ds, - struct btrfsic_dev_state_hashtable *h) -{ - const unsigned int hashval = - (((unsigned int)((uintptr_t)ds->bdev->bd_dev)) & - (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); - - list_add(&ds->collision_resolving_node, h->table + hashval); -} - -static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds) -{ - list_del(&ds->collision_resolving_node); -} - -static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev, - struct btrfsic_dev_state_hashtable *h) -{ - const unsigned int hashval = - dev & (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1); - struct btrfsic_dev_state *ds; - - list_for_each_entry(ds, h->table + hashval, collision_resolving_node) { - if (ds->bdev->bd_dev == dev) - return ds; - } - - return NULL; -} - -static int btrfsic_process_superblock(struct btrfsic_state *state, - struct btrfs_fs_devices *fs_devices) -{ - struct btrfs_super_block *selected_super; - struct list_head *dev_head = &fs_devices->devices; - struct btrfs_device *device; - struct btrfsic_dev_state *selected_dev_state = NULL; - int ret = 0; - int pass; - - selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS); - if (!selected_super) - return -ENOMEM; - - list_for_each_entry(device, dev_head, dev_list) { - int i; - struct btrfsic_dev_state *dev_state; - - if (!device->bdev || !device->name) - continue; - - dev_state = btrfsic_dev_state_lookup(device->bdev->bd_dev); - BUG_ON(NULL == dev_state); - for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { - ret = btrfsic_process_superblock_dev_mirror( - state, dev_state, device, i, - &selected_dev_state, selected_super); - if (0 != ret && 0 == i) { - kfree(selected_super); - return ret; - } - } - } - - if (NULL == state->latest_superblock) { - pr_info("btrfsic: no superblock found!\n"); - kfree(selected_super); - return -1; - } - - for (pass = 0; pass < 3; pass++) { - int num_copies; - int mirror_num; - u64 next_bytenr; - - switch (pass) { - case 0: - next_bytenr = btrfs_super_root(selected_super); - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - pr_info("root@%llu\n", next_bytenr); - break; - case 1: - next_bytenr = btrfs_super_chunk_root(selected_super); - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - pr_info("chunk@%llu\n", next_bytenr); - break; - case 2: - next_bytenr = btrfs_super_log_root(selected_super); - if (0 == next_bytenr) - continue; - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - pr_info("log@%llu\n", next_bytenr); - break; - } - - num_copies = btrfs_num_copies(state->fs_info, next_bytenr, - state->metablock_size); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - pr_info("num_copies(log_bytenr=%llu) = %d\n", - next_bytenr, num_copies); - - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - struct btrfsic_block *next_block; - struct btrfsic_block_data_ctx tmp_next_block_ctx; - struct btrfsic_block_link *l; - - ret = btrfsic_map_block(state, next_bytenr, - state->metablock_size, - &tmp_next_block_ctx, - mirror_num); - if (ret) { - pr_info("btrfsic: btrfsic_map_block(root @%llu, mirror %d) failed!\n", - next_bytenr, mirror_num); - kfree(selected_super); - return -1; - } - - next_block = btrfsic_block_hashtable_lookup( - tmp_next_block_ctx.dev->bdev, - tmp_next_block_ctx.dev_bytenr, - &state->block_hashtable); - BUG_ON(NULL == next_block); - - l = btrfsic_block_link_hashtable_lookup( - tmp_next_block_ctx.dev->bdev, - tmp_next_block_ctx.dev_bytenr, - state->latest_superblock->dev_state-> - bdev, - state->latest_superblock->dev_bytenr, - &state->block_link_hashtable); - BUG_ON(NULL == l); - - ret = btrfsic_read_block(state, &tmp_next_block_ctx); - if (ret < (int)PAGE_SIZE) { - pr_info("btrfsic: read @logical %llu failed!\n", - tmp_next_block_ctx.start); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - kfree(selected_super); - return -1; - } - - ret = btrfsic_process_metablock(state, - next_block, - &tmp_next_block_ctx, - BTRFS_MAX_LEVEL + 3, 1); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - } - } - - kfree(selected_super); - return ret; -} - -static int btrfsic_process_superblock_dev_mirror( - struct btrfsic_state *state, - struct btrfsic_dev_state *dev_state, - struct btrfs_device *device, - int superblock_mirror_num, - struct btrfsic_dev_state **selected_dev_state, - struct btrfs_super_block *selected_super) -{ - struct btrfs_fs_info *fs_info = state->fs_info; - struct btrfs_super_block *super_tmp; - u64 dev_bytenr; - struct btrfsic_block *superblock_tmp; - int pass; - struct block_device *const superblock_bdev = device->bdev; - struct page *page; - struct address_space *mapping = superblock_bdev->bd_inode->i_mapping; - int ret = 0; - - /* super block bytenr is always the unmapped device bytenr */ - dev_bytenr = btrfs_sb_offset(superblock_mirror_num); - if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes) - return -1; - - page = read_cache_page_gfp(mapping, dev_bytenr >> PAGE_SHIFT, GFP_NOFS); - if (IS_ERR(page)) - return -1; - - super_tmp = page_address(page); - - if (btrfs_super_bytenr(super_tmp) != dev_bytenr || - btrfs_super_magic(super_tmp) != BTRFS_MAGIC || - memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) || - btrfs_super_nodesize(super_tmp) != state->metablock_size || - btrfs_super_sectorsize(super_tmp) != state->datablock_size) { - ret = 0; - goto out; - } - - superblock_tmp = - btrfsic_block_hashtable_lookup(superblock_bdev, - dev_bytenr, - &state->block_hashtable); - if (NULL == superblock_tmp) { - superblock_tmp = btrfsic_block_alloc(); - if (NULL == superblock_tmp) { - ret = -1; - goto out; - } - /* for superblock, only the dev_bytenr makes sense */ - superblock_tmp->dev_bytenr = dev_bytenr; - superblock_tmp->dev_state = dev_state; - superblock_tmp->logical_bytenr = dev_bytenr; - superblock_tmp->generation = btrfs_super_generation(super_tmp); - superblock_tmp->is_metadata = 1; - superblock_tmp->is_superblock = 1; - superblock_tmp->is_iodone = 1; - superblock_tmp->never_written = 0; - superblock_tmp->mirror_num = 1 + superblock_mirror_num; - if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - btrfs_info_in_rcu(fs_info, - "new initial S-block (bdev %p, %s) @%llu (%pg/%llu/%d)", - superblock_bdev, - btrfs_dev_name(device), dev_bytenr, - dev_state->bdev, dev_bytenr, - superblock_mirror_num); - list_add(&superblock_tmp->all_blocks_node, - &state->all_blocks_list); - btrfsic_block_hashtable_add(superblock_tmp, - &state->block_hashtable); - } - - /* select the one with the highest generation field */ - if (btrfs_super_generation(super_tmp) > - state->max_superblock_generation || - 0 == state->max_superblock_generation) { - memcpy(selected_super, super_tmp, sizeof(*selected_super)); - *selected_dev_state = dev_state; - state->max_superblock_generation = - btrfs_super_generation(super_tmp); - state->latest_superblock = superblock_tmp; - } - - for (pass = 0; pass < 3; pass++) { - u64 next_bytenr; - int num_copies; - int mirror_num; - const char *additional_string = NULL; - struct btrfs_disk_key tmp_disk_key; - - tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY; - tmp_disk_key.offset = 0; - switch (pass) { - case 0: - btrfs_set_disk_key_objectid(&tmp_disk_key, - BTRFS_ROOT_TREE_OBJECTID); - additional_string = "initial root "; - next_bytenr = btrfs_super_root(super_tmp); - break; - case 1: - btrfs_set_disk_key_objectid(&tmp_disk_key, - BTRFS_CHUNK_TREE_OBJECTID); - additional_string = "initial chunk "; - next_bytenr = btrfs_super_chunk_root(super_tmp); - break; - case 2: - btrfs_set_disk_key_objectid(&tmp_disk_key, - BTRFS_TREE_LOG_OBJECTID); - additional_string = "initial log "; - next_bytenr = btrfs_super_log_root(super_tmp); - if (0 == next_bytenr) - continue; - break; - } - - num_copies = btrfs_num_copies(fs_info, next_bytenr, - state->metablock_size); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - pr_info("num_copies(log_bytenr=%llu) = %d\n", - next_bytenr, num_copies); - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - struct btrfsic_block *next_block; - struct btrfsic_block_data_ctx tmp_next_block_ctx; - struct btrfsic_block_link *l; - - if (btrfsic_map_block(state, next_bytenr, - state->metablock_size, - &tmp_next_block_ctx, - mirror_num)) { - pr_info("btrfsic: btrfsic_map_block(bytenr @%llu, mirror %d) failed!\n", - next_bytenr, mirror_num); - ret = -1; - goto out; - } - - next_block = btrfsic_block_lookup_or_add( - state, &tmp_next_block_ctx, - additional_string, 1, 1, 0, - mirror_num, NULL); - if (NULL == next_block) { - btrfsic_release_block_ctx(&tmp_next_block_ctx); - ret = -1; - goto out; - } - - next_block->disk_key = tmp_disk_key; - next_block->generation = BTRFSIC_GENERATION_UNKNOWN; - l = btrfsic_block_link_lookup_or_add( - state, &tmp_next_block_ctx, - next_block, superblock_tmp, - BTRFSIC_GENERATION_UNKNOWN); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - if (NULL == l) { - ret = -1; - goto out; - } - } - } - if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_ALL_TREES) - btrfsic_dump_tree_sub(state, superblock_tmp, 0); - -out: - put_page(page); - return ret; -} - -static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void) -{ - struct btrfsic_stack_frame *sf; - - sf = kzalloc(sizeof(*sf), GFP_NOFS); - if (sf) - sf->magic = BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER; - return sf; -} - -static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf) -{ - BUG_ON(!(NULL == sf || - BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER == sf->magic)); - kfree(sf); -} - -static noinline_for_stack int btrfsic_process_metablock( - struct btrfsic_state *state, - struct btrfsic_block *const first_block, - struct btrfsic_block_data_ctx *const first_block_ctx, - int first_limit_nesting, int force_iodone_flag) -{ - struct btrfsic_stack_frame initial_stack_frame = { 0 }; - struct btrfsic_stack_frame *sf; - struct btrfsic_stack_frame *next_stack; - struct btrfs_header *const first_hdr = - (struct btrfs_header *)first_block_ctx->datav[0]; - - BUG_ON(!first_hdr); - sf = &initial_stack_frame; - sf->error = 0; - sf->i = -1; - sf->limit_nesting = first_limit_nesting; - sf->block = first_block; - sf->block_ctx = first_block_ctx; - sf->next_block = NULL; - sf->hdr = first_hdr; - sf->prev = NULL; - -continue_with_new_stack_frame: - sf->block->generation = btrfs_stack_header_generation(sf->hdr); - if (0 == sf->hdr->level) { - struct btrfs_leaf *const leafhdr = - (struct btrfs_leaf *)sf->hdr; - - if (-1 == sf->i) { - sf->nr = btrfs_stack_header_nritems(&leafhdr->header); - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("leaf %llu items %d generation %llu owner %llu\n", - sf->block_ctx->start, sf->nr, - btrfs_stack_header_generation( - &leafhdr->header), - btrfs_stack_header_owner( - &leafhdr->header)); - } - -continue_with_current_leaf_stack_frame: - if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { - sf->i++; - sf->num_copies = 0; - } - - if (sf->i < sf->nr) { - struct btrfs_item disk_item; - u32 disk_item_offset = - (uintptr_t)(leafhdr->items + sf->i) - - (uintptr_t)leafhdr; - struct btrfs_disk_key *disk_key; - u8 type; - u32 item_offset; - u32 item_size; - - if (disk_item_offset + sizeof(struct btrfs_item) > - sf->block_ctx->len) { -leaf_item_out_of_bounce_error: - pr_info( - "btrfsic: leaf item out of bounce at logical %llu, dev %pg\n", - sf->block_ctx->start, - sf->block_ctx->dev->bdev); - goto one_stack_frame_backwards; - } - btrfsic_read_from_block_data(sf->block_ctx, - &disk_item, - disk_item_offset, - sizeof(struct btrfs_item)); - item_offset = btrfs_stack_item_offset(&disk_item); - item_size = btrfs_stack_item_size(&disk_item); - disk_key = &disk_item.key; - type = btrfs_disk_key_type(disk_key); - - if (BTRFS_ROOT_ITEM_KEY == type) { - struct btrfs_root_item root_item; - u32 root_item_offset; - u64 next_bytenr; - - root_item_offset = item_offset + - offsetof(struct btrfs_leaf, items); - if (root_item_offset + item_size > - sf->block_ctx->len) - goto leaf_item_out_of_bounce_error; - btrfsic_read_from_block_data( - sf->block_ctx, &root_item, - root_item_offset, - item_size); - next_bytenr = btrfs_root_bytenr(&root_item); - - sf->error = - btrfsic_create_link_to_next_block( - state, - sf->block, - sf->block_ctx, - next_bytenr, - sf->limit_nesting, - &sf->next_block_ctx, - &sf->next_block, - force_iodone_flag, - &sf->num_copies, - &sf->mirror_num, - disk_key, - btrfs_root_generation( - &root_item)); - if (sf->error) - goto one_stack_frame_backwards; - - if (NULL != sf->next_block) { - struct btrfs_header *const next_hdr = - (struct btrfs_header *) - sf->next_block_ctx.datav[0]; - - next_stack = - btrfsic_stack_frame_alloc(); - if (NULL == next_stack) { - sf->error = -1; - btrfsic_release_block_ctx( - &sf-> - next_block_ctx); - goto one_stack_frame_backwards; - } - - next_stack->i = -1; - next_stack->block = sf->next_block; - next_stack->block_ctx = - &sf->next_block_ctx; - next_stack->next_block = NULL; - next_stack->hdr = next_hdr; - next_stack->limit_nesting = - sf->limit_nesting - 1; - next_stack->prev = sf; - sf = next_stack; - goto continue_with_new_stack_frame; - } - } else if (BTRFS_EXTENT_DATA_KEY == type && - state->include_extent_data) { - sf->error = btrfsic_handle_extent_data( - state, - sf->block, - sf->block_ctx, - item_offset, - force_iodone_flag); - if (sf->error) - goto one_stack_frame_backwards; - } - - goto continue_with_current_leaf_stack_frame; - } - } else { - struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr; - - if (-1 == sf->i) { - sf->nr = btrfs_stack_header_nritems(&nodehdr->header); - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("node %llu level %d items %d generation %llu owner %llu\n", - sf->block_ctx->start, - nodehdr->header.level, sf->nr, - btrfs_stack_header_generation( - &nodehdr->header), - btrfs_stack_header_owner( - &nodehdr->header)); - } - -continue_with_current_node_stack_frame: - if (0 == sf->num_copies || sf->mirror_num > sf->num_copies) { - sf->i++; - sf->num_copies = 0; - } - - if (sf->i < sf->nr) { - struct btrfs_key_ptr key_ptr; - u32 key_ptr_offset; - u64 next_bytenr; - - key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) - - (uintptr_t)nodehdr; - if (key_ptr_offset + sizeof(struct btrfs_key_ptr) > - sf->block_ctx->len) { - pr_info( - "btrfsic: node item out of bounce at logical %llu, dev %pg\n", - sf->block_ctx->start, - sf->block_ctx->dev->bdev); - goto one_stack_frame_backwards; - } - btrfsic_read_from_block_data( - sf->block_ctx, &key_ptr, key_ptr_offset, - sizeof(struct btrfs_key_ptr)); - next_bytenr = btrfs_stack_key_blockptr(&key_ptr); - - sf->error = btrfsic_create_link_to_next_block( - state, - sf->block, - sf->block_ctx, - next_bytenr, - sf->limit_nesting, - &sf->next_block_ctx, - &sf->next_block, - force_iodone_flag, - &sf->num_copies, - &sf->mirror_num, - &key_ptr.key, - btrfs_stack_key_generation(&key_ptr)); - if (sf->error) - goto one_stack_frame_backwards; - - if (NULL != sf->next_block) { - struct btrfs_header *const next_hdr = - (struct btrfs_header *) - sf->next_block_ctx.datav[0]; - - next_stack = btrfsic_stack_frame_alloc(); - if (NULL == next_stack) { - sf->error = -1; - goto one_stack_frame_backwards; - } - - next_stack->i = -1; - next_stack->block = sf->next_block; - next_stack->block_ctx = &sf->next_block_ctx; - next_stack->next_block = NULL; - next_stack->hdr = next_hdr; - next_stack->limit_nesting = - sf->limit_nesting - 1; - next_stack->prev = sf; - sf = next_stack; - goto continue_with_new_stack_frame; - } - - goto continue_with_current_node_stack_frame; - } - } - -one_stack_frame_backwards: - if (NULL != sf->prev) { - struct btrfsic_stack_frame *const prev = sf->prev; - - /* the one for the initial block is freed in the caller */ - btrfsic_release_block_ctx(sf->block_ctx); - - if (sf->error) { - prev->error = sf->error; - btrfsic_stack_frame_free(sf); - sf = prev; - goto one_stack_frame_backwards; - } - - btrfsic_stack_frame_free(sf); - sf = prev; - goto continue_with_new_stack_frame; - } else { - BUG_ON(&initial_stack_frame != sf); - } - - return sf->error; -} - -static void btrfsic_read_from_block_data( - struct btrfsic_block_data_ctx *block_ctx, - void *dstv, u32 offset, size_t len) -{ - size_t cur; - size_t pgoff; - char *kaddr; - char *dst = (char *)dstv; - size_t start_offset = offset_in_page(block_ctx->start); - unsigned long i = (start_offset + offset) >> PAGE_SHIFT; - - WARN_ON(offset + len > block_ctx->len); - pgoff = offset_in_page(start_offset + offset); - - while (len > 0) { - cur = min(len, ((size_t)PAGE_SIZE - pgoff)); - BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_SIZE)); - kaddr = block_ctx->datav[i]; - memcpy(dst, kaddr + pgoff, cur); - - dst += cur; - len -= cur; - pgoff = 0; - i++; - } -} - -static int btrfsic_create_link_to_next_block( - struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx *block_ctx, - u64 next_bytenr, - int limit_nesting, - struct btrfsic_block_data_ctx *next_block_ctx, - struct btrfsic_block **next_blockp, - int force_iodone_flag, - int *num_copiesp, int *mirror_nump, - struct btrfs_disk_key *disk_key, - u64 parent_generation) -{ - struct btrfs_fs_info *fs_info = state->fs_info; - struct btrfsic_block *next_block = NULL; - int ret; - struct btrfsic_block_link *l; - int did_alloc_block_link; - int block_was_created; - - *next_blockp = NULL; - if (0 == *num_copiesp) { - *num_copiesp = btrfs_num_copies(fs_info, next_bytenr, - state->metablock_size); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - pr_info("num_copies(log_bytenr=%llu) = %d\n", - next_bytenr, *num_copiesp); - *mirror_nump = 1; - } - - if (*mirror_nump > *num_copiesp) - return 0; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("btrfsic_create_link_to_next_block(mirror_num=%d)\n", - *mirror_nump); - ret = btrfsic_map_block(state, next_bytenr, - state->metablock_size, - next_block_ctx, *mirror_nump); - if (ret) { - pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n", - next_bytenr, *mirror_nump); - btrfsic_release_block_ctx(next_block_ctx); - *next_blockp = NULL; - return -1; - } - - next_block = btrfsic_block_lookup_or_add(state, - next_block_ctx, "referenced ", - 1, force_iodone_flag, - !force_iodone_flag, - *mirror_nump, - &block_was_created); - if (NULL == next_block) { - btrfsic_release_block_ctx(next_block_ctx); - *next_blockp = NULL; - return -1; - } - if (block_was_created) { - l = NULL; - next_block->generation = BTRFSIC_GENERATION_UNKNOWN; - } else { - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) { - if (next_block->logical_bytenr != next_bytenr && - !(!next_block->is_metadata && - 0 == next_block->logical_bytenr)) - pr_info( -"referenced block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n", - next_bytenr, next_block_ctx->dev->bdev, - next_block_ctx->dev_bytenr, *mirror_nump, - btrfsic_get_block_type(state, - next_block), - next_block->logical_bytenr); - else - pr_info( - "referenced block @%llu (%pg/%llu/%d) found in hash table, %c\n", - next_bytenr, next_block_ctx->dev->bdev, - next_block_ctx->dev_bytenr, *mirror_nump, - btrfsic_get_block_type(state, - next_block)); - } - next_block->logical_bytenr = next_bytenr; - - next_block->mirror_num = *mirror_nump; - l = btrfsic_block_link_hashtable_lookup( - next_block_ctx->dev->bdev, - next_block_ctx->dev_bytenr, - block_ctx->dev->bdev, - block_ctx->dev_bytenr, - &state->block_link_hashtable); - } - - next_block->disk_key = *disk_key; - if (NULL == l) { - l = btrfsic_block_link_alloc(); - if (NULL == l) { - btrfsic_release_block_ctx(next_block_ctx); - *next_blockp = NULL; - return -1; - } - - did_alloc_block_link = 1; - l->block_ref_to = next_block; - l->block_ref_from = block; - l->ref_cnt = 1; - l->parent_generation = parent_generation; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_add_link(state, l); - - list_add(&l->node_ref_to, &block->ref_to_list); - list_add(&l->node_ref_from, &next_block->ref_from_list); - - btrfsic_block_link_hashtable_add(l, - &state->block_link_hashtable); - } else { - did_alloc_block_link = 0; - if (0 == limit_nesting) { - l->ref_cnt++; - l->parent_generation = parent_generation; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_add_link(state, l); - } - } - - if (limit_nesting > 0 && did_alloc_block_link) { - ret = btrfsic_read_block(state, next_block_ctx); - if (ret < (int)next_block_ctx->len) { - pr_info("btrfsic: read block @logical %llu failed!\n", - next_bytenr); - btrfsic_release_block_ctx(next_block_ctx); - *next_blockp = NULL; - return -1; - } - - *next_blockp = next_block; - } else { - *next_blockp = NULL; - } - (*mirror_nump)++; - - return 0; -} - -static int btrfsic_handle_extent_data( - struct btrfsic_state *state, - struct btrfsic_block *block, - struct btrfsic_block_data_ctx *block_ctx, - u32 item_offset, int force_iodone_flag) -{ - struct btrfs_fs_info *fs_info = state->fs_info; - struct btrfs_file_extent_item file_extent_item; - u64 file_extent_item_offset; - u64 next_bytenr; - u64 num_bytes; - u64 generation; - struct btrfsic_block_link *l; - int ret; - - file_extent_item_offset = offsetof(struct btrfs_leaf, items) + - item_offset; - if (file_extent_item_offset + - offsetof(struct btrfs_file_extent_item, disk_num_bytes) > - block_ctx->len) { - pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n", - block_ctx->start, block_ctx->dev->bdev); - return -1; - } - - btrfsic_read_from_block_data(block_ctx, &file_extent_item, - file_extent_item_offset, - offsetof(struct btrfs_file_extent_item, disk_num_bytes)); - if (BTRFS_FILE_EXTENT_REG != file_extent_item.type || - btrfs_stack_file_extent_disk_bytenr(&file_extent_item) == 0) { - if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) - pr_info("extent_data: type %u, disk_bytenr = %llu\n", - file_extent_item.type, - btrfs_stack_file_extent_disk_bytenr( - &file_extent_item)); - return 0; - } - - if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) > - block_ctx->len) { - pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n", - block_ctx->start, block_ctx->dev->bdev); - return -1; - } - btrfsic_read_from_block_data(block_ctx, &file_extent_item, - file_extent_item_offset, - sizeof(struct btrfs_file_extent_item)); - next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item); - if (btrfs_stack_file_extent_compression(&file_extent_item) == - BTRFS_COMPRESS_NONE) { - next_bytenr += btrfs_stack_file_extent_offset(&file_extent_item); - num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item); - } else { - num_bytes = btrfs_stack_file_extent_disk_num_bytes(&file_extent_item); - } - generation = btrfs_stack_file_extent_generation(&file_extent_item); - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) - pr_info("extent_data: type %u, disk_bytenr = %llu, offset = %llu, num_bytes = %llu\n", - file_extent_item.type, - btrfs_stack_file_extent_disk_bytenr(&file_extent_item), - btrfs_stack_file_extent_offset(&file_extent_item), - num_bytes); - while (num_bytes > 0) { - u32 chunk_len; - int num_copies; - int mirror_num; - - if (num_bytes > state->datablock_size) - chunk_len = state->datablock_size; - else - chunk_len = num_bytes; - - num_copies = btrfs_num_copies(fs_info, next_bytenr, - state->datablock_size); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - pr_info("num_copies(log_bytenr=%llu) = %d\n", - next_bytenr, num_copies); - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - struct btrfsic_block_data_ctx next_block_ctx; - struct btrfsic_block *next_block; - int block_was_created; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("btrfsic_handle_extent_data(mirror_num=%d)\n", - mirror_num); - if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE) - pr_info("\tdisk_bytenr = %llu, num_bytes %u\n", - next_bytenr, chunk_len); - ret = btrfsic_map_block(state, next_bytenr, - chunk_len, &next_block_ctx, - mirror_num); - if (ret) { - pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n", - next_bytenr, mirror_num); - return -1; - } - - next_block = btrfsic_block_lookup_or_add( - state, - &next_block_ctx, - "referenced ", - 0, - force_iodone_flag, - !force_iodone_flag, - mirror_num, - &block_was_created); - if (NULL == next_block) { - btrfsic_release_block_ctx(&next_block_ctx); - return -1; - } - if (!block_was_created) { - if ((state->print_mask & - BTRFSIC_PRINT_MASK_VERBOSE) && - next_block->logical_bytenr != next_bytenr && - !(!next_block->is_metadata && - 0 == next_block->logical_bytenr)) { - pr_info( -"referenced block @%llu (%pg/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu)\n", - next_bytenr, - next_block_ctx.dev->bdev, - next_block_ctx.dev_bytenr, - mirror_num, - next_block->logical_bytenr); - } - next_block->logical_bytenr = next_bytenr; - next_block->mirror_num = mirror_num; - } - - l = btrfsic_block_link_lookup_or_add(state, - &next_block_ctx, - next_block, block, - generation); - btrfsic_release_block_ctx(&next_block_ctx); - if (NULL == l) - return -1; - } - - next_bytenr += chunk_len; - num_bytes -= chunk_len; - } - - return 0; -} - -static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, - struct btrfsic_block_data_ctx *block_ctx_out, - int mirror_num) -{ - struct btrfs_fs_info *fs_info = state->fs_info; - int ret; - u64 length; - struct btrfs_io_context *bioc = NULL; - struct btrfs_io_stripe smap, *map; - struct btrfs_device *device; - - length = len; - ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, bytenr, &length, &bioc, - NULL, &mirror_num, 0); - if (ret) { - block_ctx_out->start = 0; - block_ctx_out->dev_bytenr = 0; - block_ctx_out->len = 0; - block_ctx_out->dev = NULL; - block_ctx_out->datav = NULL; - block_ctx_out->pagev = NULL; - block_ctx_out->mem_to_free = NULL; - - return ret; - } - - if (bioc) - map = &bioc->stripes[0]; - else - map = &smap; - - device = map->dev; - if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) || - !device->bdev || !device->name) - block_ctx_out->dev = NULL; - else - block_ctx_out->dev = btrfsic_dev_state_lookup( - device->bdev->bd_dev); - block_ctx_out->dev_bytenr = map->physical; - block_ctx_out->start = bytenr; - block_ctx_out->len = len; - block_ctx_out->datav = NULL; - block_ctx_out->pagev = NULL; - block_ctx_out->mem_to_free = NULL; - - kfree(bioc); - if (NULL == block_ctx_out->dev) { - ret = -ENXIO; - pr_info("btrfsic: error, cannot lookup dev (#1)!\n"); - } - - return ret; -} - -static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) -{ - if (block_ctx->mem_to_free) { - unsigned int num_pages; - - BUG_ON(!block_ctx->datav); - BUG_ON(!block_ctx->pagev); - num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >> - PAGE_SHIFT; - /* Pages must be unmapped in reverse order */ - while (num_pages > 0) { - num_pages--; - if (block_ctx->datav[num_pages]) - block_ctx->datav[num_pages] = NULL; - if (block_ctx->pagev[num_pages]) { - __free_page(block_ctx->pagev[num_pages]); - block_ctx->pagev[num_pages] = NULL; - } - } - - kfree(block_ctx->mem_to_free); - block_ctx->mem_to_free = NULL; - block_ctx->pagev = NULL; - block_ctx->datav = NULL; - } -} - -static int btrfsic_read_block(struct btrfsic_state *state, - struct btrfsic_block_data_ctx *block_ctx) -{ - unsigned int num_pages; - unsigned int i; - size_t size; - u64 dev_bytenr; - int ret; - - BUG_ON(block_ctx->datav); - BUG_ON(block_ctx->pagev); - BUG_ON(block_ctx->mem_to_free); - if (!PAGE_ALIGNED(block_ctx->dev_bytenr)) { - pr_info("btrfsic: read_block() with unaligned bytenr %llu\n", - block_ctx->dev_bytenr); - return -1; - } - - num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >> - PAGE_SHIFT; - size = sizeof(*block_ctx->datav) + sizeof(*block_ctx->pagev); - block_ctx->mem_to_free = kcalloc(num_pages, size, GFP_NOFS); - if (!block_ctx->mem_to_free) - return -ENOMEM; - block_ctx->datav = block_ctx->mem_to_free; - block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages); - ret = btrfs_alloc_page_array(num_pages, block_ctx->pagev); - if (ret) - return ret; - - dev_bytenr = block_ctx->dev_bytenr; - for (i = 0; i < num_pages;) { - struct bio *bio; - unsigned int j; - - bio = bio_alloc(block_ctx->dev->bdev, num_pages - i, - REQ_OP_READ, GFP_NOFS); - bio->bi_iter.bi_sector = dev_bytenr >> SECTOR_SHIFT; - - for (j = i; j < num_pages; j++) { - ret = bio_add_page(bio, block_ctx->pagev[j], - PAGE_SIZE, 0); - if (PAGE_SIZE != ret) - break; - } - if (j == i) { - pr_info("btrfsic: error, failed to add a single page!\n"); - return -1; - } - if (submit_bio_wait(bio)) { - pr_info("btrfsic: read error at logical %llu dev %pg!\n", - block_ctx->start, block_ctx->dev->bdev); - bio_put(bio); - return -1; - } - bio_put(bio); - dev_bytenr += (j - i) * PAGE_SIZE; - i = j; - } - for (i = 0; i < num_pages; i++) - block_ctx->datav[i] = page_address(block_ctx->pagev[i]); - - return block_ctx->len; -} - -static void btrfsic_dump_database(struct btrfsic_state *state) -{ - const struct btrfsic_block *b_all; - - BUG_ON(NULL == state); - - pr_info("all_blocks_list:\n"); - list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) { - const struct btrfsic_block_link *l; - - pr_info("%c-block @%llu (%pg/%llu/%d)\n", - btrfsic_get_block_type(state, b_all), - b_all->logical_bytenr, b_all->dev_state->bdev, - b_all->dev_bytenr, b_all->mirror_num); - - list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) { - pr_info( - " %c @%llu (%pg/%llu/%d) refers %u* to %c @%llu (%pg/%llu/%d)\n", - btrfsic_get_block_type(state, b_all), - b_all->logical_bytenr, b_all->dev_state->bdev, - b_all->dev_bytenr, b_all->mirror_num, - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, - l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - } - - list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) { - pr_info( - " %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n", - btrfsic_get_block_type(state, b_all), - b_all->logical_bytenr, b_all->dev_state->bdev, - b_all->dev_bytenr, b_all->mirror_num, - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_from), - l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->bdev, - l->block_ref_from->dev_bytenr, - l->block_ref_from->mirror_num); - } - - pr_info("\n"); - } -} - -/* - * Test whether the disk block contains a tree block (leaf or node) - * (note that this test fails for the super block) - */ -static noinline_for_stack int btrfsic_test_for_metadata( - struct btrfsic_state *state, - char **datav, unsigned int num_pages) -{ - struct btrfs_fs_info *fs_info = state->fs_info; - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - struct btrfs_header *h; - u8 csum[BTRFS_CSUM_SIZE]; - unsigned int i; - - if (num_pages * PAGE_SIZE < state->metablock_size) - return 1; /* not metadata */ - num_pages = state->metablock_size >> PAGE_SHIFT; - h = (struct btrfs_header *)datav[0]; - - if (memcmp(h->fsid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE)) - return 1; - - shash->tfm = fs_info->csum_shash; - crypto_shash_init(shash); - - for (i = 0; i < num_pages; i++) { - u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE); - size_t sublen = i ? PAGE_SIZE : - (PAGE_SIZE - BTRFS_CSUM_SIZE); - - crypto_shash_update(shash, data, sublen); - } - crypto_shash_final(shash, csum); - if (memcmp(csum, h->csum, fs_info->csum_size)) - return 1; - - return 0; /* is metadata */ -} - -static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state, - u64 dev_bytenr, char **mapped_datav, - unsigned int num_pages, - struct bio *bio, int *bio_is_patched, - blk_opf_t submit_bio_bh_rw) -{ - int is_metadata; - struct btrfsic_block *block; - struct btrfsic_block_data_ctx block_ctx; - int ret; - struct btrfsic_state *state = dev_state->state; - struct block_device *bdev = dev_state->bdev; - unsigned int processed_len; - - if (NULL != bio_is_patched) - *bio_is_patched = 0; - -again: - if (num_pages == 0) - return; - - processed_len = 0; - is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav, - num_pages)); - - block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr, - &state->block_hashtable); - if (NULL != block) { - u64 bytenr = 0; - struct btrfsic_block_link *l, *tmp; - - if (block->is_superblock) { - bytenr = btrfs_super_bytenr((struct btrfs_super_block *) - mapped_datav[0]); - if (num_pages * PAGE_SIZE < - BTRFS_SUPER_INFO_SIZE) { - pr_info("btrfsic: cannot work with too short bios!\n"); - return; - } - is_metadata = 1; - BUG_ON(!PAGE_ALIGNED(BTRFS_SUPER_INFO_SIZE)); - processed_len = BTRFS_SUPER_INFO_SIZE; - if (state->print_mask & - BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) { - pr_info("[before new superblock is written]:\n"); - btrfsic_dump_tree_sub(state, block, 0); - } - } - if (is_metadata) { - if (!block->is_superblock) { - if (num_pages * PAGE_SIZE < - state->metablock_size) { - pr_info("btrfsic: cannot work with too short bios!\n"); - return; - } - processed_len = state->metablock_size; - bytenr = btrfs_stack_header_bytenr( - (struct btrfs_header *) - mapped_datav[0]); - btrfsic_cmp_log_and_dev_bytenr(state, bytenr, - dev_state, - dev_bytenr); - } - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) { - if (block->logical_bytenr != bytenr && - !(!block->is_metadata && - block->logical_bytenr == 0)) - pr_info( -"written block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n", - bytenr, dev_state->bdev, - dev_bytenr, - block->mirror_num, - btrfsic_get_block_type(state, - block), - block->logical_bytenr); - else - pr_info( - "written block @%llu (%pg/%llu/%d) found in hash table, %c\n", - bytenr, dev_state->bdev, - dev_bytenr, block->mirror_num, - btrfsic_get_block_type(state, - block)); - } - block->logical_bytenr = bytenr; - } else { - if (num_pages * PAGE_SIZE < - state->datablock_size) { - pr_info("btrfsic: cannot work with too short bios!\n"); - return; - } - processed_len = state->datablock_size; - bytenr = block->logical_bytenr; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info( - "written block @%llu (%pg/%llu/%d) found in hash table, %c\n", - bytenr, dev_state->bdev, dev_bytenr, - block->mirror_num, - btrfsic_get_block_type(state, block)); - } - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("ref_to_list: %cE, ref_from_list: %cE\n", - list_empty(&block->ref_to_list) ? ' ' : '!', - list_empty(&block->ref_from_list) ? ' ' : '!'); - if (btrfsic_is_block_ref_by_superblock(state, block, 0)) { - pr_info( -"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n", - btrfsic_get_block_type(state, block), bytenr, - dev_state->bdev, dev_bytenr, block->mirror_num, - block->generation, - btrfs_disk_key_objectid(&block->disk_key), - block->disk_key.type, - btrfs_disk_key_offset(&block->disk_key), - btrfs_stack_header_generation( - (struct btrfs_header *) mapped_datav[0]), - state->max_superblock_generation); - btrfsic_dump_tree(state); - } - - if (!block->is_iodone && !block->never_written) { - pr_info( -"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n", - btrfsic_get_block_type(state, block), bytenr, - dev_state->bdev, dev_bytenr, block->mirror_num, - block->generation, - btrfs_stack_header_generation( - (struct btrfs_header *) - mapped_datav[0])); - /* it would not be safe to go on */ - btrfsic_dump_tree(state); - goto continue_loop; - } - - /* - * Clear all references of this block. Do not free - * the block itself even if is not referenced anymore - * because it still carries valuable information - * like whether it was ever written and IO completed. - */ - list_for_each_entry_safe(l, tmp, &block->ref_to_list, - node_ref_to) { - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_rem_link(state, l); - l->ref_cnt--; - if (0 == l->ref_cnt) { - list_del(&l->node_ref_to); - list_del(&l->node_ref_from); - btrfsic_block_link_hashtable_remove(l); - btrfsic_block_link_free(l); - } - } - - block_ctx.dev = dev_state; - block_ctx.dev_bytenr = dev_bytenr; - block_ctx.start = bytenr; - block_ctx.len = processed_len; - block_ctx.pagev = NULL; - block_ctx.mem_to_free = NULL; - block_ctx.datav = mapped_datav; - - if (is_metadata || state->include_extent_data) { - block->never_written = 0; - block->iodone_w_error = 0; - if (NULL != bio) { - block->is_iodone = 0; - BUG_ON(NULL == bio_is_patched); - if (!*bio_is_patched) { - block->orig_bio_private = - bio->bi_private; - block->orig_bio_end_io = - bio->bi_end_io; - block->next_in_same_bio = NULL; - bio->bi_private = block; - bio->bi_end_io = btrfsic_bio_end_io; - *bio_is_patched = 1; - } else { - struct btrfsic_block *chained_block = - (struct btrfsic_block *) - bio->bi_private; - - BUG_ON(NULL == chained_block); - block->orig_bio_private = - chained_block->orig_bio_private; - block->orig_bio_end_io = - chained_block->orig_bio_end_io; - block->next_in_same_bio = chained_block; - bio->bi_private = block; - } - } else { - block->is_iodone = 1; - block->orig_bio_private = NULL; - block->orig_bio_end_io = NULL; - block->next_in_same_bio = NULL; - } - } - - block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = submit_bio_bh_rw; - if (is_metadata) { - block->logical_bytenr = bytenr; - block->is_metadata = 1; - if (block->is_superblock) { - BUG_ON(PAGE_SIZE != - BTRFS_SUPER_INFO_SIZE); - ret = btrfsic_process_written_superblock( - state, - block, - (struct btrfs_super_block *) - mapped_datav[0]); - if (state->print_mask & - BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) { - pr_info("[after new superblock is written]:\n"); - btrfsic_dump_tree_sub(state, block, 0); - } - } else { - block->mirror_num = 0; /* unknown */ - ret = btrfsic_process_metablock( - state, - block, - &block_ctx, - 0, 0); - } - if (ret) - pr_info("btrfsic: btrfsic_process_metablock(root @%llu) failed!\n", - dev_bytenr); - } else { - block->is_metadata = 0; - block->mirror_num = 0; /* unknown */ - block->generation = BTRFSIC_GENERATION_UNKNOWN; - if (!state->include_extent_data - && list_empty(&block->ref_from_list)) { - /* - * disk block is overwritten with extent - * data (not meta data) and we are configured - * to not include extent data: take the - * chance and free the block's memory - */ - btrfsic_block_hashtable_remove(block); - list_del(&block->all_blocks_node); - btrfsic_block_free(block); - } - } - btrfsic_release_block_ctx(&block_ctx); - } else { - /* block has not been found in hash table */ - u64 bytenr; - - if (!is_metadata) { - processed_len = state->datablock_size; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info( - "written block (%pg/%llu/?) !found in hash table, D\n", - dev_state->bdev, dev_bytenr); - if (!state->include_extent_data) { - /* ignore that written D block */ - goto continue_loop; - } - - /* this is getting ugly for the - * include_extent_data case... */ - bytenr = 0; /* unknown */ - } else { - processed_len = state->metablock_size; - bytenr = btrfs_stack_header_bytenr( - (struct btrfs_header *) - mapped_datav[0]); - btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, - dev_bytenr); - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info( - "written block @%llu (%pg/%llu/?) !found in hash table, M\n", - bytenr, dev_state->bdev, dev_bytenr); - } - - block_ctx.dev = dev_state; - block_ctx.dev_bytenr = dev_bytenr; - block_ctx.start = bytenr; - block_ctx.len = processed_len; - block_ctx.pagev = NULL; - block_ctx.mem_to_free = NULL; - block_ctx.datav = mapped_datav; - - block = btrfsic_block_alloc(); - if (NULL == block) { - btrfsic_release_block_ctx(&block_ctx); - goto continue_loop; - } - block->dev_state = dev_state; - block->dev_bytenr = dev_bytenr; - block->logical_bytenr = bytenr; - block->is_metadata = is_metadata; - block->never_written = 0; - block->iodone_w_error = 0; - block->mirror_num = 0; /* unknown */ - block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = submit_bio_bh_rw; - if (NULL != bio) { - block->is_iodone = 0; - BUG_ON(NULL == bio_is_patched); - if (!*bio_is_patched) { - block->orig_bio_private = bio->bi_private; - block->orig_bio_end_io = bio->bi_end_io; - block->next_in_same_bio = NULL; - bio->bi_private = block; - bio->bi_end_io = btrfsic_bio_end_io; - *bio_is_patched = 1; - } else { - struct btrfsic_block *chained_block = - (struct btrfsic_block *) - bio->bi_private; - - BUG_ON(NULL == chained_block); - block->orig_bio_private = - chained_block->orig_bio_private; - block->orig_bio_end_io = - chained_block->orig_bio_end_io; - block->next_in_same_bio = chained_block; - bio->bi_private = block; - } - } else { - block->is_iodone = 1; - block->orig_bio_private = NULL; - block->orig_bio_end_io = NULL; - block->next_in_same_bio = NULL; - } - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("new written %c-block @%llu (%pg/%llu/%d)\n", - is_metadata ? 'M' : 'D', - block->logical_bytenr, block->dev_state->bdev, - block->dev_bytenr, block->mirror_num); - list_add(&block->all_blocks_node, &state->all_blocks_list); - btrfsic_block_hashtable_add(block, &state->block_hashtable); - - if (is_metadata) { - ret = btrfsic_process_metablock(state, block, - &block_ctx, 0, 0); - if (ret) - pr_info("btrfsic: process_metablock(root @%llu) failed!\n", - dev_bytenr); - } - btrfsic_release_block_ctx(&block_ctx); - } - -continue_loop: - BUG_ON(!processed_len); - dev_bytenr += processed_len; - mapped_datav += processed_len >> PAGE_SHIFT; - num_pages -= processed_len >> PAGE_SHIFT; - goto again; -} - -static void btrfsic_bio_end_io(struct bio *bp) -{ - struct btrfsic_block *block = bp->bi_private; - int iodone_w_error; - - /* mutex is not held! This is not save if IO is not yet completed - * on umount */ - iodone_w_error = 0; - if (bp->bi_status) - iodone_w_error = 1; - - BUG_ON(NULL == block); - bp->bi_private = block->orig_bio_private; - bp->bi_end_io = block->orig_bio_end_io; - - do { - struct btrfsic_block *next_block; - struct btrfsic_dev_state *const dev_state = block->dev_state; - - if ((dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - pr_info("bio_end_io(err=%d) for %c @%llu (%pg/%llu/%d)\n", - bp->bi_status, - btrfsic_get_block_type(dev_state->state, block), - block->logical_bytenr, dev_state->bdev, - block->dev_bytenr, block->mirror_num); - next_block = block->next_in_same_bio; - block->iodone_w_error = iodone_w_error; - if (block->submit_bio_bh_rw & REQ_PREFLUSH) { - dev_state->last_flush_gen++; - if ((dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - pr_info("bio_end_io() new %pg flush_gen=%llu\n", - dev_state->bdev, - dev_state->last_flush_gen); - } - if (block->submit_bio_bh_rw & REQ_FUA) - block->flush_gen = 0; /* FUA completed means block is - * on disk */ - block->is_iodone = 1; /* for FLUSH, this releases the block */ - block = next_block; - } while (NULL != block); - - bp->bi_end_io(bp); -} - -static int btrfsic_process_written_superblock( - struct btrfsic_state *state, - struct btrfsic_block *const superblock, - struct btrfs_super_block *const super_hdr) -{ - struct btrfs_fs_info *fs_info = state->fs_info; - int pass; - - superblock->generation = btrfs_super_generation(super_hdr); - if (!(superblock->generation > state->max_superblock_generation || - 0 == state->max_superblock_generation)) { - if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - pr_info( - "btrfsic: superblock @%llu (%pg/%llu/%d) with old gen %llu <= %llu\n", - superblock->logical_bytenr, - superblock->dev_state->bdev, - superblock->dev_bytenr, superblock->mirror_num, - btrfs_super_generation(super_hdr), - state->max_superblock_generation); - } else { - if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - pr_info( - "btrfsic: got new superblock @%llu (%pg/%llu/%d) with new gen %llu > %llu\n", - superblock->logical_bytenr, - superblock->dev_state->bdev, - superblock->dev_bytenr, superblock->mirror_num, - btrfs_super_generation(super_hdr), - state->max_superblock_generation); - - state->max_superblock_generation = - btrfs_super_generation(super_hdr); - state->latest_superblock = superblock; - } - - for (pass = 0; pass < 3; pass++) { - int ret; - u64 next_bytenr; - struct btrfsic_block *next_block; - struct btrfsic_block_data_ctx tmp_next_block_ctx; - struct btrfsic_block_link *l; - int num_copies; - int mirror_num; - const char *additional_string = NULL; - struct btrfs_disk_key tmp_disk_key = {0}; - - btrfs_set_disk_key_objectid(&tmp_disk_key, - BTRFS_ROOT_ITEM_KEY); - btrfs_set_disk_key_objectid(&tmp_disk_key, 0); - - switch (pass) { - case 0: - btrfs_set_disk_key_objectid(&tmp_disk_key, - BTRFS_ROOT_TREE_OBJECTID); - additional_string = "root "; - next_bytenr = btrfs_super_root(super_hdr); - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - pr_info("root@%llu\n", next_bytenr); - break; - case 1: - btrfs_set_disk_key_objectid(&tmp_disk_key, - BTRFS_CHUNK_TREE_OBJECTID); - additional_string = "chunk "; - next_bytenr = btrfs_super_chunk_root(super_hdr); - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - pr_info("chunk@%llu\n", next_bytenr); - break; - case 2: - btrfs_set_disk_key_objectid(&tmp_disk_key, - BTRFS_TREE_LOG_OBJECTID); - additional_string = "log "; - next_bytenr = btrfs_super_log_root(super_hdr); - if (0 == next_bytenr) - continue; - if (state->print_mask & - BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION) - pr_info("log@%llu\n", next_bytenr); - break; - } - - num_copies = btrfs_num_copies(fs_info, next_bytenr, - BTRFS_SUPER_INFO_SIZE); - if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) - pr_info("num_copies(log_bytenr=%llu) = %d\n", - next_bytenr, num_copies); - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - int was_created; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("btrfsic_process_written_superblock(mirror_num=%d)\n", mirror_num); - ret = btrfsic_map_block(state, next_bytenr, - BTRFS_SUPER_INFO_SIZE, - &tmp_next_block_ctx, - mirror_num); - if (ret) { - pr_info("btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n", - next_bytenr, mirror_num); - return -1; - } - - next_block = btrfsic_block_lookup_or_add( - state, - &tmp_next_block_ctx, - additional_string, - 1, 0, 1, - mirror_num, - &was_created); - if (NULL == next_block) { - btrfsic_release_block_ctx(&tmp_next_block_ctx); - return -1; - } - - next_block->disk_key = tmp_disk_key; - if (was_created) - next_block->generation = - BTRFSIC_GENERATION_UNKNOWN; - l = btrfsic_block_link_lookup_or_add( - state, - &tmp_next_block_ctx, - next_block, - superblock, - BTRFSIC_GENERATION_UNKNOWN); - btrfsic_release_block_ctx(&tmp_next_block_ctx); - if (NULL == l) - return -1; - } - } - - if (WARN_ON(-1 == btrfsic_check_all_ref_blocks(state, superblock, 0))) - btrfsic_dump_tree(state); - - return 0; -} - -static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, - struct btrfsic_block *const block, - int recursion_level) -{ - const struct btrfsic_block_link *l; - int ret = 0; - - if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { - /* - * Note that this situation can happen and does not - * indicate an error in regular cases. It happens - * when disk blocks are freed and later reused. - * The check-integrity module is not aware of any - * block free operations, it just recognizes block - * write operations. Therefore it keeps the linkage - * information for a block until a block is - * rewritten. This can temporarily cause incorrect - * and even circular linkage information. This - * causes no harm unless such blocks are referenced - * by the most recent super block. - */ - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("btrfsic: abort cyclic linkage (case 1).\n"); - - return ret; - } - - /* - * This algorithm is recursive because the amount of used stack - * space is very small and the max recursion depth is limited. - */ - list_for_each_entry(l, &block->ref_to_list, node_ref_to) { - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info( - "rl=%d, %c @%llu (%pg/%llu/%d) %u* refers to %c @%llu (%pg/%llu/%d)\n", - recursion_level, - btrfsic_get_block_type(state, block), - block->logical_bytenr, block->dev_state->bdev, - block->dev_bytenr, block->mirror_num, - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, - l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - if (l->block_ref_to->never_written) { - pr_info( -"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is never written!\n", - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, - l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - ret = -1; - } else if (!l->block_ref_to->is_iodone) { - pr_info( -"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not yet iodone!\n", - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, - l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - ret = -1; - } else if (l->block_ref_to->iodone_w_error) { - pr_info( -"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which has write error!\n", - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, - l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); - ret = -1; - } else if (l->parent_generation != - l->block_ref_to->generation && - BTRFSIC_GENERATION_UNKNOWN != - l->parent_generation && - BTRFSIC_GENERATION_UNKNOWN != - l->block_ref_to->generation) { - pr_info( -"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) with generation %llu != parent generation %llu!\n", - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, - l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num, - l->block_ref_to->generation, - l->parent_generation); - ret = -1; - } else if (l->block_ref_to->flush_gen > - l->block_ref_to->dev_state->last_flush_gen) { - pr_info( -"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n", - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, - l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num, block->flush_gen, - l->block_ref_to->dev_state->last_flush_gen); - ret = -1; - } else if (-1 == btrfsic_check_all_ref_blocks(state, - l->block_ref_to, - recursion_level + - 1)) { - ret = -1; - } - } - - return ret; -} - -static int btrfsic_is_block_ref_by_superblock( - const struct btrfsic_state *state, - const struct btrfsic_block *block, - int recursion_level) -{ - const struct btrfsic_block_link *l; - - if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { - /* refer to comment at "abort cyclic linkage (case 1)" */ - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("btrfsic: abort cyclic linkage (case 2).\n"); - - return 0; - } - - /* - * This algorithm is recursive because the amount of used stack space - * is very small and the max recursion depth is limited. - */ - list_for_each_entry(l, &block->ref_from_list, node_ref_from) { - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info( - "rl=%d, %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n", - recursion_level, - btrfsic_get_block_type(state, block), - block->logical_bytenr, block->dev_state->bdev, - block->dev_bytenr, block->mirror_num, - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_from), - l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->bdev, - l->block_ref_from->dev_bytenr, - l->block_ref_from->mirror_num); - if (l->block_ref_from->is_superblock && - state->latest_superblock->dev_bytenr == - l->block_ref_from->dev_bytenr && - state->latest_superblock->dev_state->bdev == - l->block_ref_from->dev_state->bdev) - return 1; - else if (btrfsic_is_block_ref_by_superblock(state, - l->block_ref_from, - recursion_level + - 1)) - return 1; - } - - return 0; -} - -static void btrfsic_print_add_link(const struct btrfsic_state *state, - const struct btrfsic_block_link *l) -{ - pr_info("add %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n", - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_from), - l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->bdev, - l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num, - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); -} - -static void btrfsic_print_rem_link(const struct btrfsic_state *state, - const struct btrfsic_block_link *l) -{ - pr_info("rem %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n", - l->ref_cnt, - btrfsic_get_block_type(state, l->block_ref_from), - l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->bdev, - l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num, - btrfsic_get_block_type(state, l->block_ref_to), - l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, - l->block_ref_to->mirror_num); -} - -static char btrfsic_get_block_type(const struct btrfsic_state *state, - const struct btrfsic_block *block) -{ - if (block->is_superblock && - state->latest_superblock->dev_bytenr == block->dev_bytenr && - state->latest_superblock->dev_state->bdev == block->dev_state->bdev) - return 'S'; - else if (block->is_superblock) - return 's'; - else if (block->is_metadata) - return 'M'; - else - return 'D'; -} - -static void btrfsic_dump_tree(const struct btrfsic_state *state) -{ - btrfsic_dump_tree_sub(state, state->latest_superblock, 0); -} - -static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, - const struct btrfsic_block *block, - int indent_level) -{ - const struct btrfsic_block_link *l; - int indent_add; - static char buf[80]; - int cursor_position; - - /* - * Should better fill an on-stack buffer with a complete line and - * dump it at once when it is time to print a newline character. - */ - - /* - * This algorithm is recursive because the amount of used stack space - * is very small and the max recursion depth is limited. - */ - indent_add = sprintf(buf, "%c-%llu(%pg/%llu/%u)", - btrfsic_get_block_type(state, block), - block->logical_bytenr, block->dev_state->bdev, - block->dev_bytenr, block->mirror_num); - if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { - printk("[...]\n"); - return; - } - printk(buf); - indent_level += indent_add; - if (list_empty(&block->ref_to_list)) { - printk("\n"); - return; - } - if (block->mirror_num > 1 && - !(state->print_mask & BTRFSIC_PRINT_MASK_TREE_WITH_ALL_MIRRORS)) { - printk(" [...]\n"); - return; - } - - cursor_position = indent_level; - list_for_each_entry(l, &block->ref_to_list, node_ref_to) { - while (cursor_position < indent_level) { - printk(" "); - cursor_position++; - } - if (l->ref_cnt > 1) - indent_add = sprintf(buf, " %d*--> ", l->ref_cnt); - else - indent_add = sprintf(buf, " --> "); - if (indent_level + indent_add > - BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { - printk("[...]\n"); - cursor_position = 0; - continue; - } - - printk(buf); - - btrfsic_dump_tree_sub(state, l->block_ref_to, - indent_level + indent_add); - cursor_position = 0; - } -} - -static struct btrfsic_block_link *btrfsic_block_link_lookup_or_add( - struct btrfsic_state *state, - struct btrfsic_block_data_ctx *next_block_ctx, - struct btrfsic_block *next_block, - struct btrfsic_block *from_block, - u64 parent_generation) -{ - struct btrfsic_block_link *l; - - l = btrfsic_block_link_hashtable_lookup(next_block_ctx->dev->bdev, - next_block_ctx->dev_bytenr, - from_block->dev_state->bdev, - from_block->dev_bytenr, - &state->block_link_hashtable); - if (NULL == l) { - l = btrfsic_block_link_alloc(); - if (!l) - return NULL; - - l->block_ref_to = next_block; - l->block_ref_from = from_block; - l->ref_cnt = 1; - l->parent_generation = parent_generation; - - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_add_link(state, l); - - list_add(&l->node_ref_to, &from_block->ref_to_list); - list_add(&l->node_ref_from, &next_block->ref_from_list); - - btrfsic_block_link_hashtable_add(l, - &state->block_link_hashtable); - } else { - l->ref_cnt++; - l->parent_generation = parent_generation; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_add_link(state, l); - } - - return l; -} - -static struct btrfsic_block *btrfsic_block_lookup_or_add( - struct btrfsic_state *state, - struct btrfsic_block_data_ctx *block_ctx, - const char *additional_string, - int is_metadata, - int is_iodone, - int never_written, - int mirror_num, - int *was_created) -{ - struct btrfsic_block *block; - - block = btrfsic_block_hashtable_lookup(block_ctx->dev->bdev, - block_ctx->dev_bytenr, - &state->block_hashtable); - if (NULL == block) { - struct btrfsic_dev_state *dev_state; - - block = btrfsic_block_alloc(); - if (!block) - return NULL; - - dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev->bd_dev); - if (NULL == dev_state) { - pr_info("btrfsic: error, lookup dev_state failed!\n"); - btrfsic_block_free(block); - return NULL; - } - block->dev_state = dev_state; - block->dev_bytenr = block_ctx->dev_bytenr; - block->logical_bytenr = block_ctx->start; - block->is_metadata = is_metadata; - block->is_iodone = is_iodone; - block->never_written = never_written; - block->mirror_num = mirror_num; - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("New %s%c-block @%llu (%pg/%llu/%d)\n", - additional_string, - btrfsic_get_block_type(state, block), - block->logical_bytenr, dev_state->bdev, - block->dev_bytenr, mirror_num); - list_add(&block->all_blocks_node, &state->all_blocks_list); - btrfsic_block_hashtable_add(block, &state->block_hashtable); - if (NULL != was_created) - *was_created = 1; - } else { - if (NULL != was_created) - *was_created = 0; - } - - return block; -} - -static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, - u64 bytenr, - struct btrfsic_dev_state *dev_state, - u64 dev_bytenr) -{ - struct btrfs_fs_info *fs_info = state->fs_info; - struct btrfsic_block_data_ctx block_ctx; - int num_copies; - int mirror_num; - int match = 0; - int ret; - - num_copies = btrfs_num_copies(fs_info, bytenr, state->metablock_size); - - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - ret = btrfsic_map_block(state, bytenr, state->metablock_size, - &block_ctx, mirror_num); - if (ret) { - pr_info("btrfsic: btrfsic_map_block(logical @%llu, mirror %d) failed!\n", - bytenr, mirror_num); - continue; - } - - if (dev_state->bdev == block_ctx.dev->bdev && - dev_bytenr == block_ctx.dev_bytenr) { - match++; - btrfsic_release_block_ctx(&block_ctx); - break; - } - btrfsic_release_block_ctx(&block_ctx); - } - - if (WARN_ON(!match)) { - pr_info( -"btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%pg, phys_bytenr=%llu)!\n", - bytenr, dev_state->bdev, dev_bytenr); - for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { - ret = btrfsic_map_block(state, bytenr, - state->metablock_size, - &block_ctx, mirror_num); - if (ret) - continue; - - pr_info("read logical bytenr @%llu maps to (%pg/%llu/%d)\n", - bytenr, block_ctx.dev->bdev, - block_ctx.dev_bytenr, mirror_num); - } - } -} - -static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev) -{ - return btrfsic_dev_state_hashtable_lookup(dev, - &btrfsic_dev_state_hashtable); -} - -static void btrfsic_check_write_bio(struct bio *bio, struct btrfsic_dev_state *dev_state) -{ - unsigned int segs = bio_segments(bio); - u64 dev_bytenr = 512 * bio->bi_iter.bi_sector; - u64 cur_bytenr = dev_bytenr; - struct bvec_iter iter; - struct bio_vec bvec; - char **mapped_datav; - int bio_is_patched = 0; - int i = 0; - - if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info( -"submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", - bio_op(bio), bio->bi_opf, segs, - bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); - - mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS); - if (!mapped_datav) - return; - - bio_for_each_segment(bvec, bio, iter) { - BUG_ON(bvec.bv_len != PAGE_SIZE); - mapped_datav[i] = page_address(bvec.bv_page); - i++; - - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE) - pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n", - i, cur_bytenr, bvec.bv_len, bvec.bv_offset); - cur_bytenr += bvec.bv_len; - } - - btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, segs, - bio, &bio_is_patched, bio->bi_opf); - kfree(mapped_datav); -} - -static void btrfsic_check_flush_bio(struct bio *bio, struct btrfsic_dev_state *dev_state) -{ - if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n", - bio_op(bio), bio->bi_opf, bio->bi_bdev); - - if (dev_state->dummy_block_for_bio_bh_flush.is_iodone) { - struct btrfsic_block *const block = - &dev_state->dummy_block_for_bio_bh_flush; - - block->is_iodone = 0; - block->never_written = 0; - block->iodone_w_error = 0; - block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = bio->bi_opf; - block->orig_bio_private = bio->bi_private; - block->orig_bio_end_io = bio->bi_end_io; - block->next_in_same_bio = NULL; - bio->bi_private = block; - bio->bi_end_io = btrfsic_bio_end_io; - } else if ((dev_state->state->print_mask & - (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | - BTRFSIC_PRINT_MASK_VERBOSE))) { - pr_info( -"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n", - dev_state->bdev); - } -} - -void btrfsic_check_bio(struct bio *bio) -{ - struct btrfsic_dev_state *dev_state; - - if (!btrfsic_is_initialized) - return; - - /* - * We can be called before btrfsic_mount, so there might not be a - * dev_state. - */ - dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev); - mutex_lock(&btrfsic_mutex); - if (dev_state) { - if (bio_op(bio) == REQ_OP_WRITE && bio_has_data(bio)) - btrfsic_check_write_bio(bio, dev_state); - else if (bio->bi_opf & REQ_PREFLUSH) - btrfsic_check_flush_bio(bio, dev_state); - } - mutex_unlock(&btrfsic_mutex); -} - -int btrfsic_mount(struct btrfs_fs_info *fs_info, - struct btrfs_fs_devices *fs_devices, - int including_extent_data, u32 print_mask) -{ - int ret; - struct btrfsic_state *state; - struct list_head *dev_head = &fs_devices->devices; - struct btrfs_device *device; - - if (!PAGE_ALIGNED(fs_info->nodesize)) { - pr_info("btrfsic: cannot handle nodesize %d not being a multiple of PAGE_SIZE %ld!\n", - fs_info->nodesize, PAGE_SIZE); - return -1; - } - if (!PAGE_ALIGNED(fs_info->sectorsize)) { - pr_info("btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_SIZE %ld!\n", - fs_info->sectorsize, PAGE_SIZE); - return -1; - } - state = kvzalloc(sizeof(*state), GFP_KERNEL); - if (!state) - return -ENOMEM; - - if (!btrfsic_is_initialized) { - mutex_init(&btrfsic_mutex); - btrfsic_dev_state_hashtable_init(&btrfsic_dev_state_hashtable); - btrfsic_is_initialized = 1; - } - mutex_lock(&btrfsic_mutex); - state->fs_info = fs_info; - state->print_mask = print_mask; - state->include_extent_data = including_extent_data; - state->metablock_size = fs_info->nodesize; - state->datablock_size = fs_info->sectorsize; - INIT_LIST_HEAD(&state->all_blocks_list); - btrfsic_block_hashtable_init(&state->block_hashtable); - btrfsic_block_link_hashtable_init(&state->block_link_hashtable); - state->max_superblock_generation = 0; - state->latest_superblock = NULL; - - list_for_each_entry(device, dev_head, dev_list) { - struct btrfsic_dev_state *ds; - - if (!device->bdev || !device->name) - continue; - - ds = btrfsic_dev_state_alloc(); - if (NULL == ds) { - mutex_unlock(&btrfsic_mutex); - return -ENOMEM; - } - ds->bdev = device->bdev; - ds->state = state; - btrfsic_dev_state_hashtable_add(ds, - &btrfsic_dev_state_hashtable); - } - - ret = btrfsic_process_superblock(state, fs_devices); - if (0 != ret) { - mutex_unlock(&btrfsic_mutex); - btrfsic_unmount(fs_devices); - return ret; - } - - if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_DATABASE) - btrfsic_dump_database(state); - if (state->print_mask & BTRFSIC_PRINT_MASK_INITIAL_TREE) - btrfsic_dump_tree(state); - - mutex_unlock(&btrfsic_mutex); - return 0; -} - -void btrfsic_unmount(struct btrfs_fs_devices *fs_devices) -{ - struct btrfsic_block *b_all, *tmp_all; - struct btrfsic_state *state; - struct list_head *dev_head = &fs_devices->devices; - struct btrfs_device *device; - - if (!btrfsic_is_initialized) - return; - - mutex_lock(&btrfsic_mutex); - - state = NULL; - list_for_each_entry(device, dev_head, dev_list) { - struct btrfsic_dev_state *ds; - - if (!device->bdev || !device->name) - continue; - - ds = btrfsic_dev_state_hashtable_lookup( - device->bdev->bd_dev, - &btrfsic_dev_state_hashtable); - if (NULL != ds) { - state = ds->state; - btrfsic_dev_state_hashtable_remove(ds); - btrfsic_dev_state_free(ds); - } - } - - if (NULL == state) { - pr_info("btrfsic: error, cannot find state information on umount!\n"); - mutex_unlock(&btrfsic_mutex); - return; - } - - /* - * Don't care about keeping the lists' state up to date, - * just free all memory that was allocated dynamically. - * Free the blocks and the block_links. - */ - list_for_each_entry_safe(b_all, tmp_all, &state->all_blocks_list, - all_blocks_node) { - struct btrfsic_block_link *l, *tmp; - - list_for_each_entry_safe(l, tmp, &b_all->ref_to_list, - node_ref_to) { - if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - btrfsic_print_rem_link(state, l); - - l->ref_cnt--; - if (0 == l->ref_cnt) - btrfsic_block_link_free(l); - } - - if (b_all->is_iodone || b_all->never_written) - btrfsic_block_free(b_all); - else - pr_info( -"btrfs: attempt to free %c-block @%llu (%pg/%llu/%d) on umount which is not yet iodone!\n", - btrfsic_get_block_type(state, b_all), - b_all->logical_bytenr, b_all->dev_state->bdev, - b_all->dev_bytenr, b_all->mirror_num); - } - - mutex_unlock(&btrfsic_mutex); - - kvfree(state); -} diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h deleted file mode 100644 index e4c8aed7996f..000000000000 --- a/fs/btrfs/check-integrity.h +++ /dev/null @@ -1,20 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) STRATO AG 2011. All rights reserved. - */ - -#ifndef BTRFS_CHECK_INTEGRITY_H -#define BTRFS_CHECK_INTEGRITY_H - -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY -void btrfsic_check_bio(struct bio *bio); -#else -static inline void btrfsic_check_bio(struct bio *bio) { } -#endif - -int btrfsic_mount(struct btrfs_fs_info *fs_info, - struct btrfs_fs_devices *fs_devices, - int including_extent_data, u32 print_mask); -void btrfsic_unmount(struct btrfs_fs_devices *fs_devices); - -#endif diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 8818ed5c390f..19b22b4653c8 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -193,12 +193,12 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb) unsigned long index = cb->start >> PAGE_SHIFT; unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; struct folio_batch fbatch; - const int errno = blk_status_to_errno(cb->bbio.bio.bi_status); + const int error = blk_status_to_errno(cb->bbio.bio.bi_status); int i; int ret; - if (errno) - mapping_set_error(inode->i_mapping, errno); + if (error) + mapping_set_error(inode->i_mapping, error); folio_batch_init(&fbatch); while (index <= end_index) { diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a4cb4b642987..2a9344a3fcee 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -230,9 +230,9 @@ noinline void btrfs_release_path(struct btrfs_path *p) * cause could be a bug, eg. due to ENOSPC, and not for common errors that are * caused by external factors. */ -bool __cold abort_should_print_stack(int errno) +bool __cold abort_should_print_stack(int error) { - switch (errno) { + switch (error) { case -EIO: case -EROFS: case -ENOMEM: @@ -316,6 +316,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, int ret = 0; int level; struct btrfs_disk_key disk_key; + u64 reloc_src_root = 0; WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && trans->transid != fs_info->running_transaction->transid); @@ -328,9 +329,11 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, else btrfs_node_key(buf, &disk_key, 0); + if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID) + reloc_src_root = btrfs_header_owner(buf); cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid, &disk_key, level, buf->start, 0, - BTRFS_NESTING_NEW_ROOT); + reloc_src_root, BTRFS_NESTING_NEW_ROOT); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -359,7 +362,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, return ret; } - btrfs_mark_buffer_dirty(cow); + btrfs_mark_buffer_dirty(trans, cow); *cow_ret = cow; return 0; } @@ -367,7 +370,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, /* * check if the tree block can be shared by multiple trees */ -int btrfs_block_can_be_shared(struct btrfs_root *root, +int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf) { /* @@ -376,11 +380,21 @@ int btrfs_block_can_be_shared(struct btrfs_root *root, * not allocated by tree relocation, we know the block is not shared. */ if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && - buf != root->node && buf != root->commit_root && + buf != root->node && (btrfs_header_generation(buf) <= btrfs_root_last_snapshot(&root->root_item) || - btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) - return 1; + btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) { + if (buf != root->commit_root) + return 1; + /* + * An extent buffer that used to be the commit root may still be + * shared because the tree height may have increased and it + * became a child of a higher level root. This can happen when + * snapshotting a subvolume created in the current transaction. + */ + if (btrfs_header_generation(buf) == trans->transid) + return 1; + } return 0; } @@ -415,7 +429,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, * are only allowed for blocks use full backrefs. */ - if (btrfs_block_can_be_shared(root, buf)) { + if (btrfs_block_can_be_shared(trans, root, buf)) { ret = btrfs_lookup_extent_info(trans, fs_info, buf->start, btrfs_header_level(buf), 1, &refs, &flags); @@ -507,13 +521,13 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, * bytes the allocator should try to find free next to the block it returns. * This is just a hint and may be ignored by the allocator. */ -static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *buf, - struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret, - u64 search_start, u64 empty_size, - enum btrfs_lock_nesting nest) +int btrfs_force_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, + u64 search_start, u64 empty_size, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_disk_key disk_key; @@ -522,6 +536,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, int last_ref = 0; int unlock_orig = 0; u64 parent_start = 0; + u64 reloc_src_root = 0; if (*cow_ret == buf) unlock_orig = 1; @@ -540,12 +555,14 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, else btrfs_node_key(buf, &disk_key, 0); - if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent) - parent_start = parent->start; - + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + if (parent) + parent_start = parent->start; + reloc_src_root = btrfs_header_owner(buf); + } cow = btrfs_alloc_tree_block(trans, root, parent_start, root->root_key.objectid, &disk_key, level, - search_start, empty_size, nest); + search_start, empty_size, reloc_src_root, nest); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -616,7 +633,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, cow->start); btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); - btrfs_mark_buffer_dirty(parent); + btrfs_mark_buffer_dirty(trans, parent); if (last_ref) { ret = btrfs_tree_mod_log_free_eb(buf); if (ret) { @@ -632,7 +649,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (unlock_orig) btrfs_tree_unlock(buf); free_extent_buffer_stale(buf); - btrfs_mark_buffer_dirty(cow); + btrfs_mark_buffer_dirty(trans, cow); *cow_ret = cow; return 0; } @@ -668,11 +685,11 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, } /* - * cows a single block, see __btrfs_cow_block for the real work. + * COWs a single block, see btrfs_force_cow_block() for the real work. * This version of it has extra checks so that a block isn't COWed more than * once per transaction, as long as it hasn't been written yet */ -noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, +int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, @@ -682,25 +699,37 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, u64 search_start; int ret; - if (test_bit(BTRFS_ROOT_DELETING, &root->state)) - btrfs_err(fs_info, - "COW'ing blocks on a fs root that's being dropped"); - - if (trans->transaction != fs_info->running_transaction) - WARN(1, KERN_CRIT "trans %llu running %llu\n", - trans->transid, - fs_info->running_transaction->transid); + if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) { + btrfs_abort_transaction(trans, -EUCLEAN); + btrfs_crit(fs_info, + "attempt to COW block %llu on root %llu that is being deleted", + buf->start, btrfs_root_id(root)); + return -EUCLEAN; + } - if (trans->transid != fs_info->generation) - WARN(1, KERN_CRIT "trans %llu running %llu\n", - trans->transid, fs_info->generation); + /* + * COWing must happen through a running transaction, which always + * matches the current fs generation (it's a transaction with a state + * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs + * into error state to prevent the commit of any transaction. + */ + if (unlikely(trans->transaction != fs_info->running_transaction || + trans->transid != fs_info->generation)) { + btrfs_abort_transaction(trans, -EUCLEAN); + btrfs_crit(fs_info, +"unexpected transaction when attempting to COW block %llu on root %llu, transaction %llu running transaction %llu fs generation %llu", + buf->start, btrfs_root_id(root), trans->transid, + fs_info->running_transaction->transid, + fs_info->generation); + return -EUCLEAN; + } if (!should_cow_block(trans, root, buf)) { *cow_ret = buf; return 0; } - search_start = buf->start & ~((u64)SZ_1G - 1); + search_start = round_down(buf->start, SZ_1G); /* * Before CoWing this block for later modification, check if it's @@ -709,8 +738,8 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, * Also We don't care about the error, as it's handled internally. */ btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); - ret = __btrfs_cow_block(trans, root, buf, parent, - parent_slot, cow_ret, search_start, 0, nest); + ret = btrfs_force_cow_block(trans, root, buf, parent, parent_slot, + cow_ret, search_start, 0, nest); trace_btrfs_cow_block(root, buf, *cow_ret); @@ -719,49 +748,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO); /* - * helper function for defrag to decide if two blocks pointed to by a - * node are actually close by - */ -static int close_blocks(u64 blocknr, u64 other, u32 blocksize) -{ - if (blocknr < other && other - (blocknr + blocksize) < 32768) - return 1; - if (blocknr > other && blocknr - (other + blocksize) < 32768) - return 1; - return 0; -} - -#ifdef __LITTLE_ENDIAN - -/* - * Compare two keys, on little-endian the disk order is same as CPU order and - * we can avoid the conversion. - */ -static int comp_keys(const struct btrfs_disk_key *disk_key, - const struct btrfs_key *k2) -{ - const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key; - - return btrfs_comp_cpu_keys(k1, k2); -} - -#else - -/* - * compare two keys in a memcmp fashion - */ -static int comp_keys(const struct btrfs_disk_key *disk, - const struct btrfs_key *k2) -{ - struct btrfs_key k1; - - btrfs_disk_key_to_cpu(&k1, disk); - - return btrfs_comp_cpu_keys(&k1, k2); -} -#endif - -/* * same as comp_keys only with two btrfs_key's */ int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2) @@ -782,91 +768,6 @@ int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_ke } /* - * this is used by the defrag code to go through all the - * leaves pointed to by a node and reallocate them so that - * disk order is close to key order - */ -int btrfs_realloc_node(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *parent, - int start_slot, u64 *last_ret, - struct btrfs_key *progress) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct extent_buffer *cur; - u64 blocknr; - u64 search_start = *last_ret; - u64 last_block = 0; - u64 other; - u32 parent_nritems; - int end_slot; - int i; - int err = 0; - u32 blocksize; - int progress_passed = 0; - struct btrfs_disk_key disk_key; - - WARN_ON(trans->transaction != fs_info->running_transaction); - WARN_ON(trans->transid != fs_info->generation); - - parent_nritems = btrfs_header_nritems(parent); - blocksize = fs_info->nodesize; - end_slot = parent_nritems - 1; - - if (parent_nritems <= 1) - return 0; - - for (i = start_slot; i <= end_slot; i++) { - int close = 1; - - btrfs_node_key(parent, &disk_key, i); - if (!progress_passed && comp_keys(&disk_key, progress) < 0) - continue; - - progress_passed = 1; - blocknr = btrfs_node_blockptr(parent, i); - if (last_block == 0) - last_block = blocknr; - - if (i > 0) { - other = btrfs_node_blockptr(parent, i - 1); - close = close_blocks(blocknr, other, blocksize); - } - if (!close && i < end_slot) { - other = btrfs_node_blockptr(parent, i + 1); - close = close_blocks(blocknr, other, blocksize); - } - if (close) { - last_block = blocknr; - continue; - } - - cur = btrfs_read_node_slot(parent, i); - if (IS_ERR(cur)) - return PTR_ERR(cur); - if (search_start == 0) - search_start = last_block; - - btrfs_tree_lock(cur); - err = __btrfs_cow_block(trans, root, cur, parent, i, - &cur, search_start, - min(16 * blocksize, - (end_slot - i) * blocksize), - BTRFS_NESTING_COW); - if (err) { - btrfs_tree_unlock(cur); - free_extent_buffer(cur); - break; - } - search_start = cur->start; - last_block = cur->start; - *last_ret = search_start; - btrfs_tree_unlock(cur); - free_extent_buffer(cur); - } - return err; -} - -/* * Search for a key in the given extent_buffer. * * The lower boundary for the search is specified by the slot number @first_slot. @@ -932,7 +833,7 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot, tmp = &unaligned; } - ret = comp_keys(tmp, key); + ret = btrfs_comp_keys(tmp, key); if (ret < 0) low = mid + 1; @@ -947,19 +848,19 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot, return 1; } -static void root_add_used(struct btrfs_root *root, u32 size) +static void root_add_used_bytes(struct btrfs_root *root) { spin_lock(&root->accounting_lock); btrfs_set_root_used(&root->root_item, - btrfs_root_used(&root->root_item) + size); + btrfs_root_used(&root->root_item) + root->fs_info->nodesize); spin_unlock(&root->accounting_lock); } -static void root_sub_used(struct btrfs_root *root, u32 size) +static void root_sub_used_bytes(struct btrfs_root *root) { spin_lock(&root->accounting_lock); btrfs_set_root_used(&root->root_item, - btrfs_root_used(&root->root_item) - size); + btrfs_root_used(&root->root_item) - root->fs_info->nodesize); spin_unlock(&root->accounting_lock); } @@ -1075,7 +976,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* once for the path */ free_extent_buffer(mid); - root_sub_used(root, mid->len); + root_sub_used_bytes(root); btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); /* once for the root ptr */ free_extent_buffer_stale(mid); @@ -1145,7 +1046,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, right = NULL; goto out; } - root_sub_used(root, right->len); + root_sub_used_bytes(root); btrfs_free_tree_block(trans, btrfs_root_id(root), right, 0, 1); free_extent_buffer_stale(right); @@ -1160,7 +1061,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto out; } btrfs_set_node_key(parent, &right_key, pslot + 1); - btrfs_mark_buffer_dirty(parent); + btrfs_mark_buffer_dirty(trans, parent); } } if (btrfs_header_nritems(mid) == 1) { @@ -1203,7 +1104,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, mid = NULL; goto out; } - root_sub_used(root, mid->len); + root_sub_used_bytes(root); btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1); free_extent_buffer_stale(mid); mid = NULL; @@ -1218,7 +1119,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto out; } btrfs_set_node_key(parent, &mid_key, pslot); - btrfs_mark_buffer_dirty(parent); + btrfs_mark_buffer_dirty(trans, parent); } /* update the path */ @@ -1325,7 +1226,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, return ret; } btrfs_set_node_key(parent, &disk_key, pslot); - btrfs_mark_buffer_dirty(parent); + btrfs_mark_buffer_dirty(trans, parent); if (btrfs_header_nritems(left) > orig_slot) { path->nodes[level] = left; path->slots[level + 1] -= 1; @@ -1385,7 +1286,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, return ret; } btrfs_set_node_key(parent, &disk_key, pslot + 1); - btrfs_mark_buffer_dirty(parent); + btrfs_mark_buffer_dirty(trans, parent); if (btrfs_header_nritems(mid) <= orig_slot) { path->nodes[level] = right; @@ -1969,7 +1870,7 @@ static int search_leaf(struct btrfs_trans_handle *trans, * the extent buffer's header and we have recently accessed * the header's level field. */ - ret = comp_keys(&first_key, key); + ret = btrfs_comp_keys(&first_key, key); if (ret < 0) { /* * The first key is smaller than the key we want @@ -2054,8 +1955,8 @@ static int search_leaf(struct btrfs_trans_handle *trans, } /* - * btrfs_search_slot - look for a key in a tree and perform necessary - * modifications to preserve tree invariants. + * Look for a key in a tree and perform necessary modifications to preserve + * tree invariants. * * @trans: Handle of transaction, used when modifying the tree * @p: Holds all btree nodes along the search path @@ -2478,7 +2379,7 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) */ if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { btrfs_item_key(path->nodes[0], &found_key, path->slots[0]); - ret = comp_keys(&found_key, &orig_key); + ret = btrfs_comp_keys(&found_key, &orig_key); if (ret == 0) { if (path->slots[0] > 0) { path->slots[0]--; @@ -2493,7 +2394,7 @@ static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) } btrfs_item_key(path->nodes[0], &found_key, 0); - ret = comp_keys(&found_key, &key); + ret = btrfs_comp_keys(&found_key, &key); /* * We might have had an item with the previous key in the tree right * before we released our path. And after we released our path, that @@ -2641,7 +2542,8 @@ int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, * higher levels * */ -static void fixup_low_keys(struct btrfs_path *path, +static void fixup_low_keys(struct btrfs_trans_handle *trans, + struct btrfs_path *path, struct btrfs_disk_key *key, int level) { int i; @@ -2658,7 +2560,7 @@ static void fixup_low_keys(struct btrfs_path *path, BTRFS_MOD_LOG_KEY_REPLACE); BUG_ON(ret < 0); btrfs_set_node_key(t, key, tslot); - btrfs_mark_buffer_dirty(path->nodes[i]); + btrfs_mark_buffer_dirty(trans, path->nodes[i]); if (tslot != 0) break; } @@ -2670,10 +2572,11 @@ static void fixup_low_keys(struct btrfs_path *path, * This function isn't completely safe. It's the caller's responsibility * that the new key won't break the order */ -void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, +void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, struct btrfs_path *path, const struct btrfs_key *new_key) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_disk_key disk_key; struct extent_buffer *eb; int slot; @@ -2682,7 +2585,7 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, slot = path->slots[0]; if (slot > 0) { btrfs_item_key(eb, &disk_key, slot - 1); - if (unlikely(comp_keys(&disk_key, new_key) >= 0)) { + if (unlikely(btrfs_comp_keys(&disk_key, new_key) >= 0)) { btrfs_print_leaf(eb); btrfs_crit(fs_info, "slot %u key (%llu %u %llu) new key (%llu %u %llu)", @@ -2696,7 +2599,7 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, } if (slot < btrfs_header_nritems(eb) - 1) { btrfs_item_key(eb, &disk_key, slot + 1); - if (unlikely(comp_keys(&disk_key, new_key) <= 0)) { + if (unlikely(btrfs_comp_keys(&disk_key, new_key) <= 0)) { btrfs_print_leaf(eb); btrfs_crit(fs_info, "slot %u key (%llu %u %llu) new key (%llu %u %llu)", @@ -2711,9 +2614,9 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, btrfs_cpu_key_to_disk(&disk_key, new_key); btrfs_set_item_key(eb, &disk_key, slot); - btrfs_mark_buffer_dirty(eb); + btrfs_mark_buffer_dirty(trans, eb); if (slot == 0) - fixup_low_keys(path, &disk_key, 1); + fixup_low_keys(trans, path, &disk_key, 1); } /* @@ -2844,8 +2747,8 @@ static int push_node_left(struct btrfs_trans_handle *trans, } btrfs_set_header_nritems(src, src_nritems - push_items); btrfs_set_header_nritems(dst, dst_nritems + push_items); - btrfs_mark_buffer_dirty(src); - btrfs_mark_buffer_dirty(dst); + btrfs_mark_buffer_dirty(trans, src); + btrfs_mark_buffer_dirty(trans, dst); return ret; } @@ -2920,8 +2823,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(src, src_nritems - push_items); btrfs_set_header_nritems(dst, dst_nritems + push_items); - btrfs_mark_buffer_dirty(src); - btrfs_mark_buffer_dirty(dst); + btrfs_mark_buffer_dirty(trans, src); + btrfs_mark_buffer_dirty(trans, dst); return ret; } @@ -2937,7 +2840,6 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level) { - struct btrfs_fs_info *fs_info = root->fs_info; u64 lower_gen; struct extent_buffer *lower; struct extent_buffer *c; @@ -2956,11 +2858,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, &lower_key, level, root->node->start, 0, - BTRFS_NESTING_NEW_ROOT); + 0, BTRFS_NESTING_NEW_ROOT); if (IS_ERR(c)) return PTR_ERR(c); - root_add_used(root, fs_info->nodesize); + root_add_used_bytes(root); btrfs_set_header_nritems(c, 1); btrfs_set_node_key(c, &lower_key, 0); @@ -2970,7 +2872,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_set_node_ptr_generation(c, 0, lower_gen); - btrfs_mark_buffer_dirty(c); + btrfs_mark_buffer_dirty(trans, c); old = root->node; ret = btrfs_tree_mod_log_insert_root(root->node, c, false); @@ -3042,7 +2944,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, WARN_ON(trans->transid == 0); btrfs_set_node_ptr_generation(lower, slot, trans->transid); btrfs_set_header_nritems(lower, nritems + 1); - btrfs_mark_buffer_dirty(lower); + btrfs_mark_buffer_dirty(trans, lower); return 0; } @@ -3100,11 +3002,11 @@ static noinline int split_node(struct btrfs_trans_handle *trans, split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, &disk_key, level, c->start, 0, - BTRFS_NESTING_SPLIT); + 0, BTRFS_NESTING_SPLIT); if (IS_ERR(split)) return PTR_ERR(split); - root_add_used(root, fs_info->nodesize); + root_add_used_bytes(root); ASSERT(btrfs_header_level(c) == level); ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); @@ -3121,8 +3023,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(split, c_nritems - mid); btrfs_set_header_nritems(c, mid); - btrfs_mark_buffer_dirty(c); - btrfs_mark_buffer_dirty(split); + btrfs_mark_buffer_dirty(trans, c); + btrfs_mark_buffer_dirty(trans, split); ret = insert_ptr(trans, path, &disk_key, split->start, path->slots[level + 1] + 1, level + 1); @@ -3288,15 +3190,15 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(left, left_nritems); if (left_nritems) - btrfs_mark_buffer_dirty(left); + btrfs_mark_buffer_dirty(trans, left); else btrfs_clear_buffer_dirty(trans, left); - btrfs_mark_buffer_dirty(right); + btrfs_mark_buffer_dirty(trans, right); btrfs_item_key(right, &disk_key, 0); btrfs_set_node_key(upper, &disk_key, slot + 1); - btrfs_mark_buffer_dirty(upper); + btrfs_mark_buffer_dirty(trans, upper); /* then fixup the leaf pointer in the path */ if (path->slots[0] >= left_nritems) { @@ -3508,14 +3410,14 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, btrfs_set_token_item_offset(&token, i, push_space); } - btrfs_mark_buffer_dirty(left); + btrfs_mark_buffer_dirty(trans, left); if (right_nritems) - btrfs_mark_buffer_dirty(right); + btrfs_mark_buffer_dirty(trans, right); else btrfs_clear_buffer_dirty(trans, right); btrfs_item_key(right, &disk_key, 0); - fixup_low_keys(path, &disk_key, 1); + fixup_low_keys(trans, path, &disk_key, 1); /* then fixup the leaf pointer in the path */ if (path->slots[0] < push_items) { @@ -3646,8 +3548,8 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans, if (ret < 0) return ret; - btrfs_mark_buffer_dirty(right); - btrfs_mark_buffer_dirty(l); + btrfs_mark_buffer_dirty(trans, right); + btrfs_mark_buffer_dirty(trans, l); BUG_ON(path->slots[0] != slot); if (mid <= slot) { @@ -3851,13 +3753,13 @@ again: * use BTRFS_NESTING_NEW_ROOT. */ right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, - &disk_key, 0, l->start, 0, + &disk_key, 0, l->start, 0, 0, num_doubles ? BTRFS_NESTING_NEW_ROOT : BTRFS_NESTING_SPLIT); if (IS_ERR(right)) return PTR_ERR(right); - root_add_used(root, fs_info->nodesize); + root_add_used_bytes(root); if (split == 0) { if (mid <= slot) { @@ -3888,7 +3790,7 @@ again: path->nodes[0] = right; path->slots[0] = 0; if (path->slots[1] == 0) - fixup_low_keys(path, &disk_key, 1); + fixup_low_keys(trans, path, &disk_key, 1); } /* * We create a new leaf 'right' for the required ins_len and @@ -3987,7 +3889,8 @@ err: return ret; } -static noinline int split_item(struct btrfs_path *path, +static noinline int split_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, const struct btrfs_key *new_key, unsigned long split_offset) { @@ -4046,7 +3949,7 @@ static noinline int split_item(struct btrfs_path *path, write_extent_buffer(leaf, buf + split_offset, btrfs_item_ptr_offset(leaf, slot), item_size - split_offset); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); BUG_ON(btrfs_leaf_free_space(leaf) < 0); kfree(buf); @@ -4080,7 +3983,7 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = split_item(path, new_key, split_offset); + ret = split_item(trans, path, new_key, split_offset); return ret; } @@ -4090,7 +3993,8 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, * off the end of the item or if we shift the item to chop bytes off * the front. */ -void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) +void btrfs_truncate_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, u32 new_size, int from_end) { int slot; struct extent_buffer *leaf; @@ -4166,11 +4070,11 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) btrfs_set_disk_key_offset(&disk_key, offset + size_diff); btrfs_set_item_key(leaf, &disk_key, slot); if (slot == 0) - fixup_low_keys(path, &disk_key, 1); + fixup_low_keys(trans, path, &disk_key, 1); } btrfs_set_item_size(leaf, slot, new_size); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); if (btrfs_leaf_free_space(leaf) < 0) { btrfs_print_leaf(leaf); @@ -4181,7 +4085,8 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) /* * make the item pointed to by the path bigger, data_size is the added size. */ -void btrfs_extend_item(struct btrfs_path *path, u32 data_size) +void btrfs_extend_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, u32 data_size) { int slot; struct extent_buffer *leaf; @@ -4231,7 +4136,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) data_end = old_data; old_size = btrfs_item_size(leaf, slot); btrfs_set_item_size(leaf, slot, old_size + data_size); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); if (btrfs_leaf_free_space(leaf) < 0) { btrfs_print_leaf(leaf); @@ -4242,6 +4147,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) /* * Make space in the node before inserting one or more items. * + * @trans: transaction handle * @root: root we are inserting items to * @path: points to the leaf/slot where we are going to insert new items * @batch: information about the batch of items to insert @@ -4249,7 +4155,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) * Main purpose is to save stack depth by doing the bulk of the work in a * function that doesn't call btrfs_search_slot */ -static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, +static void setup_items_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_item_batch *batch) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -4269,7 +4176,7 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p */ if (path->slots[0] == 0) { btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]); - fixup_low_keys(path, &disk_key, 1); + fixup_low_keys(trans, path, &disk_key, 1); } btrfs_unlock_up_safe(path, 1); @@ -4328,7 +4235,7 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p } btrfs_set_header_nritems(leaf, nritems + batch->nr); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); if (btrfs_leaf_free_space(leaf) < 0) { btrfs_print_leaf(leaf); @@ -4339,12 +4246,14 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p /* * Insert a new item into a leaf. * + * @trans: Transaction handle. * @root: The root of the btree. * @path: A path pointing to the target leaf and slot. * @key: The key of the new item. * @data_size: The size of the data associated with the new key. */ -void btrfs_setup_item_for_insert(struct btrfs_root *root, +void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *key, u32 data_size) @@ -4356,7 +4265,7 @@ void btrfs_setup_item_for_insert(struct btrfs_root *root, batch.total_data_size = data_size; batch.nr = 1; - setup_items_for_insert(root, path, &batch); + setup_items_for_insert(trans, root, path, &batch); } /* @@ -4382,7 +4291,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, slot = path->slots[0]; BUG_ON(slot < 0); - setup_items_for_insert(root, path, batch); + setup_items_for_insert(trans, root, path, batch); return 0; } @@ -4407,7 +4316,7 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, leaf = path->nodes[0]; ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); write_extent_buffer(leaf, data, ptr, data_size); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); } btrfs_free_path(path); return ret; @@ -4438,7 +4347,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, return ret; path->slots[0]++; - btrfs_setup_item_for_insert(root, path, new_key, item_size); + btrfs_setup_item_for_insert(trans, root, path, new_key, item_size); leaf = path->nodes[0]; memcpy_extent_buffer(leaf, btrfs_item_ptr_offset(leaf, path->slots[0]), @@ -4496,9 +4405,9 @@ int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_disk_key disk_key; btrfs_node_key(parent, &disk_key, 0); - fixup_low_keys(path, &disk_key, level + 1); + fixup_low_keys(trans, path, &disk_key, level + 1); } - btrfs_mark_buffer_dirty(parent); + btrfs_mark_buffer_dirty(trans, parent); return 0; } @@ -4530,7 +4439,7 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, */ btrfs_unlock_up_safe(path, 0); - root_sub_used(root, leaf->len); + root_sub_used_bytes(root); atomic_inc(&leaf->refs); btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1); @@ -4595,7 +4504,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_disk_key disk_key; btrfs_item_key(leaf, &disk_key, 0); - fixup_low_keys(path, &disk_key, 1); + fixup_low_keys(trans, path, &disk_key, 1); } /* @@ -4660,11 +4569,11 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, * dirtied this buffer */ if (path->nodes[0] == leaf) - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); free_extent_buffer(leaf); } } else { - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); } } return ret; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9419f4e37a58..196c005c31f6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -6,37 +6,10 @@ #ifndef BTRFS_CTREE_H #define BTRFS_CTREE_H -#include <linux/mm.h> -#include <linux/sched/signal.h> -#include <linux/highmem.h> -#include <linux/fs.h> -#include <linux/rwsem.h> -#include <linux/semaphore.h> -#include <linux/completion.h> -#include <linux/backing-dev.h> -#include <linux/wait.h> -#include <linux/slab.h> -#include <trace/events/btrfs.h> -#include <asm/unaligned.h> #include <linux/pagemap.h> -#include <linux/btrfs.h> -#include <linux/btrfs_tree.h> -#include <linux/workqueue.h> -#include <linux/security.h> -#include <linux/sizes.h> -#include <linux/dynamic_debug.h> -#include <linux/refcount.h> -#include <linux/crc32c.h> -#include <linux/iomap.h> -#include <linux/fscrypt.h> -#include "extent-io-tree.h" -#include "extent_io.h" -#include "extent_map.h" -#include "async-thread.h" -#include "block-rsv.h" #include "locking.h" -#include "misc.h" #include "fs.h" +#include "accessors.h" struct btrfs_trans_handle; struct btrfs_transaction; @@ -218,10 +191,22 @@ struct btrfs_root { atomic_t log_commit[2]; /* Used only for log trees of subvolumes, not for the log root tree */ atomic_t log_batch; + /* + * Protected by the 'log_mutex' lock but can be read without holding + * that lock to avoid unnecessary lock contention, in which case it + * should be read using btrfs_get_root_log_transid() except if it's a + * log tree in which case it can be directly accessed. Updates to this + * field should always use btrfs_set_root_log_transid(), except for log + * trees where the field can be updated directly. + */ int log_transid; /* No matter the commit succeeds or not*/ int log_transid_committed; - /* Just be updated when the commit succeeds. */ + /* + * Just be updated when the commit succeeds. Use + * btrfs_get_root_last_log_commit() and btrfs_set_root_last_log_commit() + * to access this field. + */ int last_log_commit; pid_t log_start_pid; @@ -326,6 +311,9 @@ struct btrfs_root { /* Used only by log trees, when logging csum items */ struct extent_io_tree log_csum_range; + /* Used in simple quotas, track root during relocation. */ + u64 relocation_src_root; + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS u64 alloc_bytenr; #endif @@ -352,6 +340,26 @@ static inline u64 btrfs_root_id(const struct btrfs_root *root) return root->root_key.objectid; } +static inline int btrfs_get_root_log_transid(const struct btrfs_root *root) +{ + return READ_ONCE(root->log_transid); +} + +static inline void btrfs_set_root_log_transid(struct btrfs_root *root, int log_transid) +{ + WRITE_ONCE(root->log_transid, log_transid); +} + +static inline int btrfs_get_root_last_log_commit(const struct btrfs_root *root) +{ + return READ_ONCE(root->last_log_commit); +} + +static inline void btrfs_set_root_last_log_commit(struct btrfs_root *root, int commit_id) +{ + WRITE_ONCE(root->last_log_commit, commit_id); +} + /* * Structure that conveys information about an extent that is going to replace * all the extents in a file range. @@ -470,30 +478,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) #define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \ ((bytes) >> (fs_info)->sectorsize_bits) -static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length) -{ - return crc32c(crc, address, length); -} - -static inline void btrfs_crc32c_final(u32 crc, u8 *result) -{ - put_unaligned_le32(~crc, result); -} - -static inline u64 btrfs_name_hash(const char *name, int len) -{ - return crc32c((u32)~1, name, len); -} - -/* - * Figure the key offset of an extended inode ref - */ -static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, - int len) -{ - return (u64) crc32c(parent_objectid, name, len); -} - static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) { return mapping_gfp_constraint(mapping, ~__GFP_FS); @@ -513,12 +497,42 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot, const struct btrfs_key *key, int *slot); int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); + +#ifdef __LITTLE_ENDIAN + +/* + * Compare two keys, on little-endian the disk order is same as CPU order and + * we can avoid the conversion. + */ +static inline int btrfs_comp_keys(const struct btrfs_disk_key *disk_key, + const struct btrfs_key *k2) +{ + const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key; + + return btrfs_comp_cpu_keys(k1, k2); +} + +#else + +/* Compare two keys in a memcmp fashion. */ +static inline int btrfs_comp_keys(const struct btrfs_disk_key *disk, + const struct btrfs_key *k2) +{ + struct btrfs_key k1; + + btrfs_disk_key_to_cpu(&k1, disk); + + return btrfs_comp_cpu_keys(&k1, k2); +} + +#endif + int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, int type); int btrfs_previous_extent_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid); -void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, +void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, struct btrfs_path *path, const struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); @@ -536,16 +550,26 @@ int btrfs_cow_block(struct btrfs_trans_handle *trans, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, enum btrfs_lock_nesting nest); +int btrfs_force_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, + u64 search_start, u64 empty_size, + enum btrfs_lock_nesting nest); int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer **cow_ret, u64 new_root_objectid); -int btrfs_block_can_be_shared(struct btrfs_root *root, +int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf); int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); -void btrfs_extend_item(struct btrfs_path *path, u32 data_size); -void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end); +void btrfs_extend_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, u32 data_size); +void btrfs_truncate_item(struct btrfs_trans_handle *trans, + struct btrfs_path *path, u32 new_size, int from_end); int btrfs_split_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -566,10 +590,6 @@ int btrfs_search_slot_for_read(struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, int find_higher, int return_any); -int btrfs_realloc_node(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *parent, - int start_slot, u64 *last_ret, - struct btrfs_key *progress); void btrfs_release_path(struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); @@ -609,7 +629,8 @@ struct btrfs_item_batch { int nr; }; -void btrfs_setup_item_for_insert(struct btrfs_root *root, +void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *key, u32 data_size); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index f2ff4cbe8656..5244561e2016 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -338,13 +338,118 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) } /* + * Check if two blocks addresses are close, used by defrag. + */ +static bool close_blocks(u64 blocknr, u64 other, u32 blocksize) +{ + if (blocknr < other && other - (blocknr + blocksize) < SZ_32K) + return true; + if (blocknr > other && blocknr - (other + blocksize) < SZ_32K) + return true; + return false; +} + +/* + * Go through all the leaves pointed to by a node and reallocate them so that + * disk order is close to key order. + */ +static int btrfs_realloc_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *parent, + int start_slot, u64 *last_ret, + struct btrfs_key *progress) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + const u32 blocksize = fs_info->nodesize; + const int end_slot = btrfs_header_nritems(parent) - 1; + u64 search_start = *last_ret; + u64 last_block = 0; + int ret = 0; + bool progress_passed = false; + + /* + * COWing must happen through a running transaction, which always + * matches the current fs generation (it's a transaction with a state + * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs + * into error state to prevent the commit of any transaction. + */ + if (unlikely(trans->transaction != fs_info->running_transaction || + trans->transid != fs_info->generation)) { + btrfs_abort_transaction(trans, -EUCLEAN); + btrfs_crit(fs_info, +"unexpected transaction when attempting to reallocate parent %llu for root %llu, transaction %llu running transaction %llu fs generation %llu", + parent->start, btrfs_root_id(root), trans->transid, + fs_info->running_transaction->transid, + fs_info->generation); + return -EUCLEAN; + } + + if (btrfs_header_nritems(parent) <= 1) + return 0; + + for (int i = start_slot; i <= end_slot; i++) { + struct extent_buffer *cur; + struct btrfs_disk_key disk_key; + u64 blocknr; + u64 other; + bool close = true; + + btrfs_node_key(parent, &disk_key, i); + if (!progress_passed && btrfs_comp_keys(&disk_key, progress) < 0) + continue; + + progress_passed = true; + blocknr = btrfs_node_blockptr(parent, i); + if (last_block == 0) + last_block = blocknr; + + if (i > 0) { + other = btrfs_node_blockptr(parent, i - 1); + close = close_blocks(blocknr, other, blocksize); + } + if (!close && i < end_slot) { + other = btrfs_node_blockptr(parent, i + 1); + close = close_blocks(blocknr, other, blocksize); + } + if (close) { + last_block = blocknr; + continue; + } + + cur = btrfs_read_node_slot(parent, i); + if (IS_ERR(cur)) + return PTR_ERR(cur); + if (search_start == 0) + search_start = last_block; + + btrfs_tree_lock(cur); + ret = btrfs_force_cow_block(trans, root, cur, parent, i, + &cur, search_start, + min(16 * blocksize, + (end_slot - i) * blocksize), + BTRFS_NESTING_COW); + if (ret) { + btrfs_tree_unlock(cur); + free_extent_buffer(cur); + break; + } + search_start = cur->start; + last_block = cur->start; + *last_ret = search_start; + btrfs_tree_unlock(cur); + free_extent_buffer(cur); + } + return ret; +} + +/* * Defrag all the leaves in a given btree. * Read all the leaves and try to get key order to * better reflect disk order */ -int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, - struct btrfs_root *root) +static int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root) { struct btrfs_path *path = NULL; struct btrfs_key key; @@ -461,6 +566,45 @@ done: } /* + * Defrag a given btree. Every leaf in the btree is read and defragmented. + */ +int btrfs_defrag_root(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + + if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state)) + return 0; + + while (1) { + struct btrfs_trans_handle *trans; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + break; + } + + ret = btrfs_defrag_leaves(trans, root); + + btrfs_end_transaction(trans); + btrfs_btree_balance_dirty(fs_info); + cond_resched(); + + if (btrfs_fs_closing(fs_info) || ret != -EAGAIN) + break; + + if (btrfs_defrag_cancelled(fs_info)) { + btrfs_debug(fs_info, "defrag_root cancelled"); + ret = -EAGAIN; + break; + } + } + clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state); + return ret; +} + +/* * Defrag specific helper to get an extent map. * * Differences between this and btrfs_get_extent() are: @@ -891,8 +1035,8 @@ static int defrag_collect_targets(struct btrfs_inode *inode, * very likely resulting in a larger extent after writeback is * triggered (except in a case of free space fragmentation). */ - if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1, - EXTENT_DELALLOC, 0, NULL)) + if (test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1, + EXTENT_DELALLOC)) goto next; /* diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h index 5305f2283b5e..5a62763528d1 100644 --- a/fs/btrfs/defrag.h +++ b/fs/btrfs/defrag.h @@ -12,7 +12,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u32 extent_thresh); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); -int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_defrag_root(struct btrfs_root *root); static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) { diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 427abaf608b8..51453d4928fa 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -322,9 +322,6 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, } else { if (current->journal_info) flush = BTRFS_RESERVE_FLUSH_LIMIT; - - if (btrfs_transaction_in_commit(fs_info)) - schedule_timeout(1); } num_bytes = ALIGN(num_bytes, fs_info->sectorsize); @@ -346,7 +343,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, noflush); if (ret) return ret; - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv->space_info, + meta_reserve, flush); if (ret) { btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve); return ret; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 53c1211dd60b..7381241334e8 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -313,7 +313,7 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len, { struct btrfs_delayed_item *item; - item = kmalloc(sizeof(*item) + data_len, GFP_NOFS); + item = kmalloc(struct_size(item, data, data_len), GFP_NOFS); if (item) { item->data_len = data_len; item->type = type; @@ -328,7 +328,8 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u16 data_len, } /* - * __btrfs_lookup_delayed_item - look up the delayed item by key + * Look up the delayed item by key. + * * @delayed_node: pointer to the delayed node * @index: the dir index value to lookup (offset of a dir index key) * @@ -412,6 +413,7 @@ static void finish_one_item(struct btrfs_delayed_root *delayed_root) static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) { + struct btrfs_delayed_node *delayed_node = delayed_item->delayed_node; struct rb_root_cached *root; struct btrfs_delayed_root *delayed_root; @@ -419,18 +421,21 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item) if (RB_EMPTY_NODE(&delayed_item->rb_node)) return; - delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root; + /* If it's in a rbtree, then we need to have delayed node locked. */ + lockdep_assert_held(&delayed_node->mutex); + + delayed_root = delayed_node->root->fs_info->delayed_root; BUG_ON(!delayed_root); if (delayed_item->type == BTRFS_DELAYED_INSERTION_ITEM) - root = &delayed_item->delayed_node->ins_root; + root = &delayed_node->ins_root; else - root = &delayed_item->delayed_node->del_root; + root = &delayed_node->del_root; rb_erase_cached(&delayed_item->rb_node, root); RB_CLEAR_NODE(&delayed_item->rb_node); - delayed_item->delayed_node->count--; + delayed_node->count--; finish_one_item(delayed_root); } @@ -513,7 +518,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, /* * For insertions we track reserved metadata space by accounting * for the number of leaves that will be used, based on the delayed - * node's index_items_size field. + * node's curr_index_batch_size and index_item_leaves fields. */ if (item->type == BTRFS_DELAYED_DELETION_ITEM) item->bytes_reserved = num_bytes; @@ -1026,7 +1031,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, struct btrfs_inode_item); write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item, sizeof(struct btrfs_inode_item)); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) goto out; @@ -1153,20 +1158,33 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr) ret = __btrfs_commit_inode_delayed_items(trans, path, curr_node); if (ret) { - btrfs_release_delayed_node(curr_node); - curr_node = NULL; btrfs_abort_transaction(trans, ret); break; } prev_node = curr_node; curr_node = btrfs_next_delayed_node(curr_node); + /* + * See the comment below about releasing path before releasing + * node. If the commit of delayed items was successful the path + * should always be released, but in case of an error, it may + * point to locked extent buffers (a leaf at the very least). + */ + ASSERT(path->nodes[0] == NULL); btrfs_release_delayed_node(prev_node); } + /* + * Release the path to avoid a potential deadlock and lockdep splat when + * releasing the delayed node, as that requires taking the delayed node's + * mutex. If another task starts running delayed items before we take + * the mutex, it will first lock the mutex and then it may try to lock + * the same btree path (leaf). + */ + btrfs_free_path(path); + if (curr_node) btrfs_release_delayed_node(curr_node); - btrfs_free_path(path); trans->block_rsv = block_rsv; return ret; @@ -1361,8 +1379,7 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, return -ENOMEM; async_work->delayed_root = delayed_root; - btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL, - NULL); + btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL); async_work->nr = nr; btrfs_queue_work(fs_info->delayed_workers, &async_work->work); @@ -1413,7 +1430,29 @@ void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info) btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH); } -/* Will return 0 or -ENOMEM */ +static void btrfs_release_dir_index_item_space(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1); + + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) + return; + + /* + * Adding the new dir index item does not require touching another + * leaf, so we can release 1 unit of metadata that was previously + * reserved when starting the transaction. This applies only to + * the case where we had a transaction start and excludes the + * transaction join case (when replaying log trees). + */ + trace_btrfs_space_reservation(fs_info, "transaction", + trans->transid, bytes, 0); + btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL); + ASSERT(trans->bytes_reserved >= bytes); + trans->bytes_reserved -= bytes; +} + +/* Will return 0, -ENOMEM or -EEXIST (index number collision, unexpected). */ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *dir, @@ -1455,6 +1494,27 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, mutex_lock(&delayed_node->mutex); + /* + * First attempt to insert the delayed item. This is to make the error + * handling path simpler in case we fail (-EEXIST). There's no risk of + * any other task coming in and running the delayed item before we do + * the metadata space reservation below, because we are holding the + * delayed node's mutex and that mutex must also be locked before the + * node's delayed items can be run. + */ + ret = __btrfs_add_delayed_item(delayed_node, delayed_item); + if (unlikely(ret)) { + btrfs_err(trans->fs_info, +"error adding delayed dir index item, name: %.*s, index: %llu, root: %llu, dir: %llu, dir->index_cnt: %llu, delayed_node->index_cnt: %llu, error: %d", + name_len, name, index, btrfs_root_id(delayed_node->root), + delayed_node->inode_id, dir->index_cnt, + delayed_node->index_cnt, ret); + btrfs_release_delayed_item(delayed_item); + btrfs_release_dir_index_item_space(trans); + mutex_unlock(&delayed_node->mutex); + goto release_node; + } + if (delayed_node->index_item_leaves == 0 || delayed_node->curr_index_batch_size + data_len > leaf_data_size) { delayed_node->curr_index_batch_size = data_len; @@ -1472,36 +1532,14 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, * impossible. */ if (WARN_ON(ret)) { - mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_item(delayed_item); + mutex_unlock(&delayed_node->mutex); goto release_node; } delayed_node->index_item_leaves++; - } else if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { - const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1); - - /* - * Adding the new dir index item does not require touching another - * leaf, so we can release 1 unit of metadata that was previously - * reserved when starting the transaction. This applies only to - * the case where we had a transaction start and excludes the - * transaction join case (when replaying log trees). - */ - trace_btrfs_space_reservation(fs_info, "transaction", - trans->transid, bytes, 0); - btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL); - ASSERT(trans->bytes_reserved >= bytes); - trans->bytes_reserved -= bytes; - } - - ret = __btrfs_add_delayed_item(delayed_node, delayed_item); - if (unlikely(ret)) { - btrfs_err(trans->fs_info, - "err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", - name_len, name, delayed_node->root->root_key.objectid, - delayed_node->inode_id, ret); - BUG(); + } else { + btrfs_release_dir_index_item_space(trans); } mutex_unlock(&delayed_node->mutex); @@ -1722,8 +1760,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list, } /* - * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree - * + * Read dir info stored in the delayed tree. */ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, struct list_head *ins_list) @@ -1796,24 +1833,22 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, btrfs_set_stack_inode_block_group(inode_item, 0); btrfs_set_stack_timespec_sec(&inode_item->atime, - inode->i_atime.tv_sec); + inode_get_atime_sec(inode)); btrfs_set_stack_timespec_nsec(&inode_item->atime, - inode->i_atime.tv_nsec); + inode_get_atime_nsec(inode)); btrfs_set_stack_timespec_sec(&inode_item->mtime, - inode->i_mtime.tv_sec); + inode_get_mtime_sec(inode)); btrfs_set_stack_timespec_nsec(&inode_item->mtime, - inode->i_mtime.tv_nsec); + inode_get_mtime_nsec(inode)); btrfs_set_stack_timespec_sec(&inode_item->ctime, - inode_get_ctime(inode).tv_sec); + inode_get_ctime_sec(inode)); btrfs_set_stack_timespec_nsec(&inode_item->ctime, - inode_get_ctime(inode).tv_nsec); + inode_get_ctime_nsec(inode)); - btrfs_set_stack_timespec_sec(&inode_item->otime, - BTRFS_I(inode)->i_otime.tv_sec); - btrfs_set_stack_timespec_nsec(&inode_item->otime, - BTRFS_I(inode)->i_otime.tv_nsec); + btrfs_set_stack_timespec_sec(&inode_item->otime, BTRFS_I(inode)->i_otime_sec); + btrfs_set_stack_timespec_nsec(&inode_item->otime, BTRFS_I(inode)->i_otime_nsec); } int btrfs_fill_inode(struct inode *inode, u32 *rdev) @@ -1853,19 +1888,17 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item), &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); - inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime); - inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime); + inode_set_atime(inode, btrfs_stack_timespec_sec(&inode_item->atime), + btrfs_stack_timespec_nsec(&inode_item->atime)); - inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime); - inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime); + inode_set_mtime(inode, btrfs_stack_timespec_sec(&inode_item->mtime), + btrfs_stack_timespec_nsec(&inode_item->mtime)); inode_set_ctime(inode, btrfs_stack_timespec_sec(&inode_item->ctime), btrfs_stack_timespec_nsec(&inode_item->ctime)); - BTRFS_I(inode)->i_otime.tv_sec = - btrfs_stack_timespec_sec(&inode_item->otime); - BTRFS_I(inode)->i_otime.tv_nsec = - btrfs_stack_timespec_nsec(&inode_item->otime); + BTRFS_I(inode)->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime); + BTRFS_I(inode)->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime); inode->i_generation = BTRFS_I(inode)->generation; BTRFS_I(inode)->index_cnt = (u64)-1; @@ -1876,9 +1909,9 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) } int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode) { + struct btrfs_root *root = inode->root; struct btrfs_delayed_node *delayed_node; int ret = 0; diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index dc1085b2a397..5cceb31bbd16 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -95,7 +95,7 @@ struct btrfs_delayed_item { bool logged; /* The maximum leaf size is 64K, so u16 is more than enough. */ u16 data_len; - char data[]; + char data[] __counted_by(data_len); }; static inline void btrfs_init_delayed_root( @@ -135,7 +135,6 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode); int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode); int btrfs_fill_inode(struct inode *inode, u32 *rdev); int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 6a13cf00218b..9223934d95f4 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -57,16 +57,20 @@ bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info) * Release a ref head's reservation. * * @fs_info: the filesystem - * @nr: number of items to drop + * @nr_refs: number of delayed refs to drop + * @nr_csums: number of csum items to drop * * Drops the delayed ref head's count from the delayed refs rsv and free any * excess reservation we had. */ -void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums) { struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; - const u64 num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr); - u64 released = 0; + u64 num_bytes; + u64 released; + + num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, nr_refs); + num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums); released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL); if (released) @@ -77,50 +81,135 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) /* * Adjust the size of the delayed refs rsv. * - * This is to be called anytime we may have adjusted trans->delayed_ref_updates, - * it'll calculate the additional size and add it to the delayed_refs_rsv. + * This is to be called anytime we may have adjusted trans->delayed_ref_updates + * or trans->delayed_ref_csum_deletions, it'll calculate the additional size and + * add it to the delayed_refs_rsv. */ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_block_rsv *local_rsv = &trans->delayed_rsv; u64 num_bytes; + u64 reserved_bytes; + + num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, trans->delayed_ref_updates); + num_bytes += btrfs_calc_delayed_ref_csum_bytes(fs_info, + trans->delayed_ref_csum_deletions); - if (!trans->delayed_ref_updates) + if (num_bytes == 0) return; - num_bytes = btrfs_calc_delayed_ref_bytes(fs_info, - trans->delayed_ref_updates); + /* + * Try to take num_bytes from the transaction's local delayed reserve. + * If not possible, try to take as much as it's available. If the local + * reserve doesn't have enough reserved space, the delayed refs reserve + * will be refilled next time btrfs_delayed_refs_rsv_refill() is called + * by someone or if a transaction commit is triggered before that, the + * global block reserve will be used. We want to minimize using the + * global block reserve for cases we can account for in advance, to + * avoid exhausting it and reach -ENOSPC during a transaction commit. + */ + spin_lock(&local_rsv->lock); + reserved_bytes = min(num_bytes, local_rsv->reserved); + local_rsv->reserved -= reserved_bytes; + local_rsv->full = (local_rsv->reserved >= local_rsv->size); + spin_unlock(&local_rsv->lock); spin_lock(&delayed_rsv->lock); delayed_rsv->size += num_bytes; - delayed_rsv->full = false; + delayed_rsv->reserved += reserved_bytes; + delayed_rsv->full = (delayed_rsv->reserved >= delayed_rsv->size); spin_unlock(&delayed_rsv->lock); trans->delayed_ref_updates = 0; + trans->delayed_ref_csum_deletions = 0; +} + +/* + * Adjust the size of the delayed refs block reserve for 1 block group item + * insertion, used after allocating a block group. + */ +void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + + spin_lock(&delayed_rsv->lock); + /* + * Inserting a block group item does not require changing the free space + * tree, only the extent tree or the block group tree, so this is all we + * need. + */ + delayed_rsv->size += btrfs_calc_insert_metadata_size(fs_info, 1); + delayed_rsv->full = false; + spin_unlock(&delayed_rsv->lock); +} + +/* + * Adjust the size of the delayed refs block reserve to release space for 1 + * block group item insertion. + */ +void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + const u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, 1); + u64 released; + + released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL); + if (released > 0) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, released, 0); +} + +/* + * Adjust the size of the delayed refs block reserve for 1 block group item + * update. + */ +void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + + spin_lock(&delayed_rsv->lock); + /* + * Updating a block group item does not result in new nodes/leaves and + * does not require changing the free space tree, only the extent tree + * or the block group tree, so this is all we need. + */ + delayed_rsv->size += btrfs_calc_metadata_size(fs_info, 1); + delayed_rsv->full = false; + spin_unlock(&delayed_rsv->lock); +} + +/* + * Adjust the size of the delayed refs block reserve to release space for 1 + * block group item update. + */ +void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv; + const u64 num_bytes = btrfs_calc_metadata_size(fs_info, 1); + u64 released; + + released = btrfs_block_rsv_release(fs_info, delayed_rsv, num_bytes, NULL); + if (released > 0) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", + 0, released, 0); } /* * Transfer bytes to our delayed refs rsv. * * @fs_info: the filesystem - * @src: source block rsv to transfer from * @num_bytes: number of bytes to transfer * - * This transfers up to the num_bytes amount from the src rsv to the + * This transfers up to the num_bytes amount, previously reserved, to the * delayed_refs_rsv. Any extra bytes are returned to the space info. */ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *src, u64 num_bytes) { struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; u64 to_free = 0; - spin_lock(&src->lock); - src->reserved -= num_bytes; - src->size -= num_bytes; - spin_unlock(&src->lock); - spin_lock(&delayed_refs_rsv->lock); if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) { u64 delta = delayed_refs_rsv->size - @@ -161,8 +250,11 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush) { struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_space_info *space_info = block_rsv->space_info; u64 limit = btrfs_calc_delayed_ref_bytes(fs_info, 1); u64 num_bytes = 0; + u64 refilled_bytes; + u64 to_free; int ret = -ENOSPC; spin_lock(&block_rsv->lock); @@ -175,12 +267,40 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, if (!num_bytes) return 0; - ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush); + ret = btrfs_reserve_metadata_bytes(fs_info, space_info, num_bytes, flush); if (ret) return ret; - btrfs_block_rsv_add_bytes(block_rsv, num_bytes, false); - trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", - 0, num_bytes, 1); + + /* + * We may have raced with someone else, so check again if we the block + * reserve is still not full and release any excess space. + */ + spin_lock(&block_rsv->lock); + if (block_rsv->reserved < block_rsv->size) { + u64 needed = block_rsv->size - block_rsv->reserved; + + if (num_bytes >= needed) { + block_rsv->reserved += needed; + block_rsv->full = true; + to_free = num_bytes - needed; + refilled_bytes = needed; + } else { + block_rsv->reserved += num_bytes; + to_free = 0; + refilled_bytes = num_bytes; + } + } else { + to_free = num_bytes; + refilled_bytes = 0; + } + spin_unlock(&block_rsv->lock); + + if (to_free > 0) + btrfs_space_info_free_bytes_may_use(fs_info, space_info, to_free); + + if (refilled_bytes > 0) + trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv", 0, + refilled_bytes, 1); return 0; } @@ -398,7 +518,8 @@ int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs, return 0; } -static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs, +static inline void drop_delayed_ref(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref) { @@ -409,9 +530,11 @@ static inline void drop_delayed_ref(struct btrfs_delayed_ref_root *delayed_refs, list_del(&ref->add_list); btrfs_put_delayed_ref(ref); atomic_dec(&delayed_refs->num_entries); + btrfs_delayed_refs_rsv_release(fs_info, 1, 0); } -static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs, +static bool merge_ref(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head, struct btrfs_delayed_ref_node *ref, u64 seq) @@ -440,10 +563,10 @@ static bool merge_ref(struct btrfs_delayed_ref_root *delayed_refs, mod = -next->ref_mod; } - drop_delayed_ref(delayed_refs, head, next); + drop_delayed_ref(fs_info, delayed_refs, head, next); ref->ref_mod += mod; if (ref->ref_mod == 0) { - drop_delayed_ref(delayed_refs, head, ref); + drop_delayed_ref(fs_info, delayed_refs, head, ref); done = true; } else { /* @@ -481,7 +604,7 @@ again: ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node); if (seq && ref->seq >= seq) continue; - if (merge_ref(delayed_refs, head, ref, seq)) + if (merge_ref(fs_info, delayed_refs, head, ref, seq)) goto again; } } @@ -560,10 +683,11 @@ void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs, * Return true if the ref was merged into an existing one (and therefore can be * freed by the caller). */ -static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root, +static bool insert_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *ref) { + struct btrfs_delayed_ref_root *root = &trans->transaction->delayed_refs; struct btrfs_delayed_ref_node *exist; int mod; @@ -574,6 +698,7 @@ static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root, list_add_tail(&ref->add_list, &href->ref_add_list); atomic_inc(&root->num_entries); spin_unlock(&href->lock); + trans->delayed_ref_updates++; return false; } @@ -602,7 +727,7 @@ static bool insert_delayed_ref(struct btrfs_delayed_ref_root *root, /* remove existing tail if its ref_mod is zero */ if (exist->ref_mod == 0) - drop_delayed_ref(root, href, exist); + drop_delayed_ref(trans->fs_info, root, href, exist); spin_unlock(&href->lock); return true; } @@ -623,6 +748,15 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, BUG_ON(existing->is_data != update->is_data); spin_lock(&existing->lock); + + /* + * When freeing an extent, we may not know the owning root when we + * first create the head_ref. However, some deref before the last deref + * will know it, so we just need to update the head_ref accordingly. + */ + if (!existing->owning_root) + existing->owning_root = update->owning_root; + if (update->must_insert_reserved) { /* if the extent was freed and then * reallocated before the delayed ref @@ -632,6 +766,7 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, * Set it again here */ existing->must_insert_reserved = update->must_insert_reserved; + existing->owning_root = update->owning_root; /* * update the num_bytes so we make sure the accounting @@ -671,6 +806,8 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, /* * If we are going to from a positive ref mod to a negative or vice * versa we need to make sure to adjust pending_csums accordingly. + * We reserve bytes for csum deletion when adding or updating a ref head + * see add_delayed_ref_head() for more details. */ if (existing->is_data) { u64 csum_leaves = @@ -679,11 +816,11 @@ static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans, if (existing->total_ref_mod >= 0 && old_ref_mod < 0) { delayed_refs->pending_csums -= existing->num_bytes; - btrfs_delayed_refs_rsv_release(fs_info, csum_leaves); + btrfs_delayed_refs_rsv_release(fs_info, 0, csum_leaves); } if (existing->total_ref_mod < 0 && old_ref_mod >= 0) { delayed_refs->pending_csums += existing->num_bytes; - trans->delayed_ref_updates += csum_leaves; + trans->delayed_ref_csum_deletions += csum_leaves; } } @@ -694,7 +831,7 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, struct btrfs_qgroup_extent_record *qrecord, u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved, int action, bool is_data, - bool is_system) + bool is_system, u64 owning_root) { int count_mod = 1; bool must_insert_reserved = false; @@ -734,7 +871,9 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref, head_ref->bytenr = bytenr; head_ref->num_bytes = num_bytes; head_ref->ref_mod = count_mod; + head_ref->reserved_bytes = reserved; head_ref->must_insert_reserved = must_insert_reserved; + head_ref->owning_root = owning_root; head_ref->is_data = is_data; head_ref->is_system = is_system; head_ref->ref_tree = RB_ROOT_CACHED; @@ -795,16 +934,21 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); head_ref = existing; } else { + /* + * We reserve the amount of bytes needed to delete csums when + * adding the ref head and not when adding individual drop refs + * since the csum items are deleted only after running the last + * delayed drop ref (the data extent's ref count drops to 0). + */ if (head_ref->is_data && head_ref->ref_mod < 0) { delayed_refs->pending_csums += head_ref->num_bytes; - trans->delayed_ref_updates += + trans->delayed_ref_csum_deletions += btrfs_csum_bytes_to_leaves(trans->fs_info, head_ref->num_bytes); } delayed_refs->num_heads++; delayed_refs->num_heads_ready++; atomic_inc(&delayed_refs->num_entries); - trans->delayed_ref_updates++; } if (qrecord_inserted_ret) *qrecord_inserted_ret = qrecord_inserted; @@ -813,8 +957,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans, } /* - * init_delayed_ref_common - Initialize the structure which represents a - * modification to a an extent. + * Initialize the structure which represents a modification to a an extent. * * @fs_info: Internal to the mounted filesystem mount structure. * @@ -885,7 +1028,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, u64 parent = generic_ref->parent; u8 ref_type; - is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID); + is_system = (generic_ref->tree_ref.ref_root == BTRFS_CHUNK_TREE_OBJECTID); ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action); ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); @@ -898,8 +1041,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, return -ENOMEM; } - if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && - !generic_ref->skip_qgroup) { + if (btrfs_qgroup_enabled(fs_info) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); @@ -914,15 +1056,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, ref_type = BTRFS_TREE_BLOCK_REF_KEY; init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, - generic_ref->tree_ref.owning_root, action, + generic_ref->tree_ref.ref_root, action, ref_type); - ref->root = generic_ref->tree_ref.owning_root; + ref->root = generic_ref->tree_ref.ref_root; ref->parent = parent; ref->level = level; init_delayed_ref_head(head_ref, record, bytenr, num_bytes, - generic_ref->tree_ref.owning_root, 0, action, - false, is_system); + generic_ref->tree_ref.ref_root, 0, action, + false, is_system, generic_ref->owning_root); head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; @@ -935,7 +1077,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, head_ref = add_delayed_ref_head(trans, head_ref, record, action, &qrecord_inserted); - merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node); + merged = insert_delayed_ref(trans, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); /* @@ -974,7 +1116,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, u64 bytenr = generic_ref->bytenr; u64 num_bytes = generic_ref->len; u64 parent = generic_ref->parent; - u64 ref_root = generic_ref->data_ref.owning_root; + u64 ref_root = generic_ref->data_ref.ref_root; u64 owner = generic_ref->data_ref.ino; u64 offset = generic_ref->data_ref.offset; u8 ref_type; @@ -1002,8 +1144,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, return -ENOMEM; } - if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && - !generic_ref->skip_qgroup) { + if (btrfs_qgroup_enabled(fs_info) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); @@ -1014,7 +1155,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, } init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root, - reserved, action, true, false); + reserved, action, true, false, generic_ref->owning_root); head_ref->extent_op = NULL; delayed_refs = &trans->transaction->delayed_refs; @@ -1027,7 +1168,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, head_ref = add_delayed_ref_head(trans, head_ref, record, action, &qrecord_inserted); - merged = insert_delayed_ref(delayed_refs, head_ref, &ref->node); + merged = insert_delayed_ref(trans, head_ref, &ref->node); spin_unlock(&delayed_refs->lock); /* @@ -1060,7 +1201,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, return -ENOMEM; init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0, - BTRFS_UPDATE_DELAYED_HEAD, false, false); + BTRFS_UPDATE_DELAYED_HEAD, false, false, 0); head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index b8e14b0ba5f1..62d679d40f4f 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -9,10 +9,16 @@ #include <linux/refcount.h> /* these are the possible values of struct btrfs_delayed_ref_node->action */ -#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */ -#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */ -#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ -#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ +enum btrfs_delayed_ref_action { + /* Add one backref to the tree */ + BTRFS_ADD_DELAYED_REF = 1, + /* Delete one backref from the tree */ + BTRFS_DROP_DELAYED_REF, + /* Record a full extent allocation */ + BTRFS_ADD_DELAYED_EXTENT, + /* Not changing ref count on head ref */ + BTRFS_UPDATE_DELAYED_HEAD, +} __packed; struct btrfs_delayed_ref_node { struct rb_node ref_node; @@ -105,6 +111,18 @@ struct btrfs_delayed_ref_head { int ref_mod; /* + * The root that triggered the allocation when must_insert_reserved is + * set to true. + */ + u64 owning_root; + + /* + * Track reserved bytes when setting must_insert_reserved. On success + * or cleanup, we will need to free the reservation. + */ + u64 reserved_bytes; + + /* * when a new extent is allocated, it is just reserved in memory * The actual extent isn't inserted into the extent allocation tree * until the delayed ref is processed. must_insert_reserved is @@ -117,6 +135,7 @@ struct btrfs_delayed_ref_head { * the free has happened. */ bool must_insert_reserved; + bool is_data; bool is_system; bool processing; @@ -183,13 +202,13 @@ enum btrfs_ref_type { BTRFS_REF_DATA, BTRFS_REF_METADATA, BTRFS_REF_LAST, -}; +} __packed; struct btrfs_data_ref { /* For EXTENT_DATA_REF */ - /* Original root this data extent belongs to */ - u64 owning_root; + /* Root which owns this data reference. */ + u64 ref_root; /* Inode which refers to this data extent */ u64 ino; @@ -212,18 +231,18 @@ struct btrfs_tree_ref { int level; /* - * Root which owns this tree block. + * Root which owns this tree block reference. * * For TREE_BLOCK_REF (skinny metadata, either inline or keyed) */ - u64 owning_root; + u64 ref_root; /* For non-skinny metadata, no special member needed */ }; struct btrfs_ref { enum btrfs_ref_type type; - int action; + enum btrfs_delayed_ref_action action; /* * Whether this extent should go through qgroup record. @@ -239,6 +258,7 @@ struct btrfs_ref { #endif u64 bytenr; u64 len; + u64 owning_root; /* Bytenr of the parent tree block */ u64 parent; @@ -277,24 +297,37 @@ static inline u64 btrfs_calc_delayed_ref_bytes(const struct btrfs_fs_info *fs_in return num_bytes; } +static inline u64 btrfs_calc_delayed_ref_csum_bytes(const struct btrfs_fs_info *fs_info, + int num_csum_items) +{ + /* + * Deleting csum items does not result in new nodes/leaves and does not + * require changing the free space tree, only the csum tree, so this is + * all we need. + */ + return btrfs_calc_metadata_size(fs_info, num_csum_items); +} + static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref, - int action, u64 bytenr, u64 len, u64 parent) + int action, u64 bytenr, u64 len, + u64 parent, u64 owning_root) { generic_ref->action = action; generic_ref->bytenr = bytenr; generic_ref->len = len; generic_ref->parent = parent; + generic_ref->owning_root = owning_root; } -static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, - int level, u64 root, u64 mod_root, bool skip_qgroup) +static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, int level, + u64 root, u64 mod_root, bool skip_qgroup) { #ifdef CONFIG_BTRFS_FS_REF_VERIFY /* If @real_root not set, use @root as fallback */ generic_ref->real_root = mod_root ?: root; #endif generic_ref->tree_ref.level = level; - generic_ref->tree_ref.owning_root = root; + generic_ref->tree_ref.ref_root = root; generic_ref->type = BTRFS_REF_METADATA; if (skip_qgroup || !(is_fstree(root) && (!mod_root || is_fstree(mod_root)))) @@ -312,7 +345,7 @@ static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref, /* If @real_root not set, use @root as fallback */ generic_ref->real_root = mod_root ?: ref_root; #endif - generic_ref->data_ref.owning_root = ref_root; + generic_ref->data_ref.ref_root = ref_root; generic_ref->data_ref.ino = ino; generic_ref->data_ref.offset = offset; generic_ref->type = BTRFS_REF_DATA; @@ -338,7 +371,6 @@ btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op) static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) { - WARN_ON(refcount_read(&ref->refs) == 0); if (refcount_dec_and_test(&ref->refs)) { WARN_ON(!RB_EMPTY_NODE(&ref->ref_node)); switch (ref->type) { @@ -402,12 +434,15 @@ struct btrfs_delayed_ref_head *btrfs_select_ref_head( int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq); -void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr); +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr_refs, int nr_csums); void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans); +void btrfs_inc_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info); +void btrfs_dec_delayed_refs_rsv_bg_inserts(struct btrfs_fs_info *fs_info); +void btrfs_inc_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info); +void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush); void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *src, u64 num_bytes); bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index fff22ed55c42..f9544fda38e9 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -17,7 +17,6 @@ #include "print-tree.h" #include "volumes.h" #include "async-thread.h" -#include "check-integrity.h" #include "dev-replace.h" #include "sysfs.h" #include "zoned.h" @@ -247,6 +246,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; + struct bdev_handle *bdev_handle; struct block_device *bdev; u64 devid = BTRFS_DEV_REPLACE_DEVID; int ret = 0; @@ -257,12 +257,13 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return -EINVAL; } - bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE, - fs_info->bdev_holder, NULL); - if (IS_ERR(bdev)) { + bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE, + fs_info->bdev_holder, NULL); + if (IS_ERR(bdev_handle)) { btrfs_err(fs_info, "target device %s is invalid!", device_path); - return PTR_ERR(bdev); + return PTR_ERR(bdev_handle); } + bdev = bdev_handle->bdev; if (!btrfs_check_device_zone_type(fs_info, bdev)) { btrfs_err(fs_info, @@ -313,9 +314,9 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, device->commit_bytes_used = device->bytes_used; device->fs_info = fs_info; device->bdev = bdev; + device->bdev_handle = bdev_handle; set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); - device->holder = fs_info->bdev_holder; device->dev_stats_valid = 1; set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); device->fs_devices = fs_devices; @@ -334,7 +335,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, return 0; error: - blkdev_put(bdev, fs_info->bdev_holder); + bdev_release(bdev_handle); return ret; } @@ -442,7 +443,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) dev_replace->item_needs_writeback = 0; up_write(&dev_replace->rwsem); - btrfs_mark_buffer_dirty(eb); + btrfs_mark_buffer_dirty(trans, eb); out: btrfs_free_path(path); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 082eb0e19598..9c07d5c3e5ad 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -38,7 +38,7 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle di = btrfs_match_dir_item_name(fs_info, path, name, name_len); if (di) return ERR_PTR(-EEXIST); - btrfs_extend_item(path, data_size); + btrfs_extend_item(trans, path, data_size); } else if (ret < 0) return ERR_PTR(ret); WARN_ON(ret > 0); @@ -93,7 +93,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, name, name_ptr, name_len); write_extent_buffer(leaf, data, data_ptr, data_len); - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); return ret; } @@ -153,7 +153,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, name_ptr = (unsigned long)(dir_item + 1); write_extent_buffer(leaf, name->name, name_ptr, name->len); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); second_insert: /* FIXME, use some real flag for selecting the extra index */ @@ -439,7 +439,7 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_len - (ptr + sub_item_len - start)); - btrfs_truncate_item(path, item_len - sub_item_len, 1); + btrfs_truncate_item(trans, path, item_len - sub_item_len, 1); } return ret; } diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h index aab4b7cc7fa0..e40a226373d7 100644 --- a/fs/btrfs/dir-item.h +++ b/fs/btrfs/dir-item.h @@ -3,6 +3,10 @@ #ifndef BTRFS_DIR_ITEM_H #define BTRFS_DIR_ITEM_H +#include <linux/crc32c.h> + +struct fscrypt_str; + int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, const struct fscrypt_str *name); int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, @@ -39,4 +43,9 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, const char *name, int name_len); +static inline u64 btrfs_name_hash(const char *name, int len) +{ + return crc32c((u32)~1, name, len); +} + #endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0a96ea8c1d3a..401ea09ae4b8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -29,7 +29,6 @@ #include "tree-log.h" #include "free-space-cache.h" #include "free-space-tree.h" -#include "check-integrity.h" #include "rcu-string.h" #include "dev-replace.h" #include "raid56.h" @@ -245,6 +244,7 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) struct extent_buffer *eb = bbio->private; struct btrfs_fs_info *fs_info = eb->fs_info; u64 found_start = btrfs_header_bytenr(eb); + u64 last_trans; u8 result[BTRFS_CSUM_SIZE]; int ret; @@ -282,12 +282,12 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) * Also check the generation, the eb reached here must be newer than * last committed. Or something seriously wrong happened. */ - if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) { + last_trans = btrfs_get_last_trans_committed(fs_info); + if (unlikely(btrfs_header_generation(eb) <= last_trans)) { ret = -EUCLEAN; btrfs_err(fs_info, "block=%llu bad generation, have %llu expect > %llu", - eb->start, btrfs_header_generation(eb), - fs_info->last_trans_committed); + eb->start, btrfs_header_generation(eb), last_trans); goto error; } write_extent_buffer(eb, result, 0, fs_info->csum_size); @@ -318,9 +318,10 @@ static bool check_tree_block_fsid(struct extent_buffer *eb) BTRFS_FSID_SIZE); /* - * alloc_fs_devices() copies the fsid into metadata_uuid if the - * metadata_uuid is unset in the superblock, including for a seed device. - * So, we can use fs_devices->metadata_uuid. + * alloc_fsid_devices() copies the fsid into fs_devices::metadata_uuid. + * This is then overwritten by metadata_uuid if it is present in the + * device_list_add(). The same true for a seed device as well. So use of + * fs_devices::metadata_uuid is appropriate here. */ if (memcmp(fsid, fs_info->fs_devices->metadata_uuid, BTRFS_FSID_SIZE) == 0) return false; @@ -520,6 +521,7 @@ static bool btree_dirty_folio(struct address_space *mapping, struct folio *folio) { struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); + struct btrfs_subpage_info *spi = fs_info->subpage_info; struct btrfs_subpage *subpage; struct extent_buffer *eb; int cur_bit = 0; @@ -533,18 +535,19 @@ static bool btree_dirty_folio(struct address_space *mapping, btrfs_assert_tree_write_locked(eb); return filemap_dirty_folio(mapping, folio); } + + ASSERT(spi); subpage = folio_get_private(folio); - ASSERT(subpage->dirty_bitmap); - while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) { + for (cur_bit = spi->dirty_offset; + cur_bit < spi->dirty_offset + spi->bitmap_nr_bits; + cur_bit++) { unsigned long flags; u64 cur; - u16 tmp = (1 << cur_bit); spin_lock_irqsave(&subpage->lock, flags); - if (!(tmp & subpage->dirty_bitmap)) { + if (!test_bit(cur_bit, subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); - cur_bit++; continue; } spin_unlock_irqrestore(&subpage->lock, flags); @@ -557,7 +560,7 @@ static bool btree_dirty_folio(struct address_space *mapping, btrfs_assert_tree_write_locked(eb); free_extent_buffer(eb); - cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits); + cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits) - 1; } return filemap_dirty_folio(mapping, folio); } @@ -673,9 +676,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, refcount_set(&root->refs, 1); atomic_set(&root->snapshot_force_cow, 0); atomic_set(&root->nr_swapfiles, 0); - root->log_transid = 0; + btrfs_set_root_log_transid(root, 0); root->log_transid_committed = -1; - root->last_log_commit = 0; + btrfs_set_root_last_log_commit(root, 0); root->anon_dev = 0; if (!dummy) { extent_io_tree_init(fs_info, &root->dirty_log_pages, @@ -857,7 +860,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, root->root_key.offset = 0; leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, - BTRFS_NESTING_NORMAL); + 0, BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); leaf = NULL; @@ -865,7 +868,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, } root->node = leaf; - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); root->commit_root = btrfs_root_node(root); set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); @@ -934,13 +937,13 @@ int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, */ leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, - NULL, 0, 0, 0, BTRFS_NESTING_NORMAL); + NULL, 0, 0, 0, 0, BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) return PTR_ERR(leaf); root->node = leaf; - btrfs_mark_buffer_dirty(root->node); + btrfs_mark_buffer_dirty(trans, root->node); btrfs_tree_unlock(root->node); return 0; @@ -1002,9 +1005,9 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, WARN_ON(root->log_root); root->log_root = log_root; - root->log_transid = 0; + btrfs_set_root_log_transid(root, 0); root->log_transid_committed = -1; - root->last_log_commit = 0; + btrfs_set_root_last_log_commit(root, 0); return 0; } @@ -1177,6 +1180,8 @@ static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info, return btrfs_grab_root(fs_info->block_group_root); case BTRFS_FREE_SPACE_TREE_OBJECTID: return btrfs_grab_root(btrfs_global_root(fs_info, &key)); + case BTRFS_RAID_STRIPE_TREE_OBJECTID: + return btrfs_grab_root(fs_info->stripe_root); default: return NULL; } @@ -1257,6 +1262,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_put_root(fs_info->fs_root); btrfs_put_root(fs_info->data_reloc_root); btrfs_put_root(fs_info->block_group_root); + btrfs_put_root(fs_info->stripe_root); btrfs_check_leaked_roots(fs_info); btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); @@ -1400,7 +1406,8 @@ struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info, } /* - * btrfs_get_fs_root_commit_root - return a root for the given objectid + * Return a root for the given objectid. + * * @fs_info: the fs_info * @objectid: the objectid we need to lookup * @@ -1547,7 +1554,7 @@ static int transaction_kthread(void *arg) delta = ktime_get_seconds() - cur->start_time; if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) && - cur->state < TRANS_STATE_COMMIT_START && + cur->state < TRANS_STATE_COMMIT_PREP && delta < fs_info->commit_interval) { spin_unlock(&fs_info->trans_lock); delay -= msecs_to_jiffies((delta - 1) * 1000); @@ -1697,11 +1704,11 @@ static void backup_super_roots(struct btrfs_fs_info *info) } /* - * read_backup_root - Reads a backup root based on the passed priority. Prio 0 - * is the newest, prio 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots + * Reads a backup root based on the passed priority. Prio 0 is the newest, prio + * 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots * - * fs_info - filesystem whose backup roots need to be read - * priority - priority of backup root required + * @fs_info: filesystem whose backup roots need to be read + * @priority: priority of backup root required * * Returns backup root index on success and -EINVAL otherwise. */ @@ -1801,6 +1808,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) free_root_extent_buffers(info->fs_root); free_root_extent_buffers(info->data_reloc_root); free_root_extent_buffers(info->block_group_root); + free_root_extent_buffers(info->stripe_root); if (free_chunk_root) free_root_extent_buffers(info->chunk_root); } @@ -2260,7 +2268,6 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) root = btrfs_read_tree_root(tree_root, &location); if (!IS_ERR(root)) { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); - set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); fs_info->quota_root = root; } @@ -2277,6 +2284,20 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) fs_info->uuid_root = root; } + if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) { + location.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) { + if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) { + ret = PTR_ERR(root); + goto out; + } + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->stripe_root = root; + } + } + return 0; out: btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d", @@ -2379,7 +2400,8 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info, ret = -EINVAL; } - if (memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) { + if (!fs_info->fs_devices->temp_fsid && + memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, "superblock fsid doesn't match fsid of fs_devices: %pU != %pU", sb->fsid, fs_info->fs_devices->fsid); @@ -2632,7 +2654,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) /* All successful */ fs_info->generation = btrfs_header_generation(tree_root->node); - fs_info->last_trans_committed = fs_info->generation; + btrfs_set_last_trans_committed(fs_info, fs_info->generation); fs_info->last_reloc_trans = 0; /* Always begin writing backup roots after the one being used */ @@ -2682,8 +2704,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) btrfs_lockdep_init_map(fs_info, btrfs_trans_num_extwriters); btrfs_lockdep_init_map(fs_info, btrfs_trans_pending_ordered); btrfs_lockdep_init_map(fs_info, btrfs_ordered_extent); - btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_start, - BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_prep, + BTRFS_LOCKDEP_TRANS_COMMIT_PREP); btrfs_state_lockdep_init_map(fs_info, btrfs_trans_unblocked, BTRFS_LOCKDEP_TRANS_UNBLOCKED); btrfs_state_lockdep_init_map(fs_info, btrfs_trans_super_committed, @@ -2733,9 +2755,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->ordered_root_lock); btrfs_init_scrub(fs_info); -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - fs_info->check_integrity_print_mask = 0; -#endif btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(fs_info); @@ -3155,7 +3174,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device u32 nodesize; u32 stripesize; u64 generation; - u64 features; u16 csum_type; struct btrfs_super_block *disk_super; struct btrfs_fs_info *fs_info = btrfs_sb(sb); @@ -3237,15 +3255,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device disk_super = fs_info->super_copy; - - features = btrfs_super_flags(disk_super); - if (features & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) { - features &= ~BTRFS_SUPER_FLAG_CHANGING_FSID_V2; - btrfs_set_super_flags(disk_super, features); - btrfs_info(fs_info, - "found metadata UUID change in progress flag, clearing"); - } - memcpy(fs_info->super_for_commit, fs_info->super_copy, sizeof(*fs_info->super_for_commit)); @@ -3507,18 +3516,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device "auto enabling async discard"); } -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) { - ret = btrfsic_mount(fs_info, fs_devices, - btrfs_test_opt(fs_info, - CHECK_INTEGRITY_DATA) ? 1 : 0, - fs_info->check_integrity_print_mask); - if (ret) - btrfs_warn(fs_info, - "failed to initialize integrity check module: %d", - ret); - } -#endif ret = btrfs_read_qgroup_config(fs_info); if (ret) goto fail_trans_kthread; @@ -3818,8 +3815,6 @@ static int write_dev_supers(struct btrfs_device *device, */ if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER)) bio->bi_opf |= REQ_FUA; - - btrfsic_check_bio(bio); submit_bio(bio); if (btrfs_advance_sb_log(device, i)) @@ -3915,28 +3910,11 @@ static void write_dev_flush(struct btrfs_device *device) device->last_flush_error = BLK_STS_OK; -#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY - /* - * When a disk has write caching disabled, we skip submission of a bio - * with flush and sync requests before writing the superblock, since - * it's not needed. However when the integrity checker is enabled, this - * results in reports that there are metadata blocks referred by a - * superblock that were not properly flushed. So don't skip the bio - * submission only when the integrity checker is enabled for the sake - * of simplicity, since this is a debug tool and not meant for use in - * non-debug builds. - */ - if (!bdev_write_cache(device->bdev)) - return; -#endif - bio_init(bio, device->bdev, NULL, 0, REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH); bio->bi_end_io = btrfs_end_empty_barrier; init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; - - btrfsic_check_bio(bio); submit_bio(bio); set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); } @@ -4412,16 +4390,12 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) iput(fs_info->btree_inode); -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) - btrfsic_unmount(fs_info->fs_devices); -#endif - btrfs_mapping_tree_free(&fs_info->mapping_tree); btrfs_close_devices(fs_info->fs_devices); } -void btrfs_mark_buffer_dirty(struct extent_buffer *buf) +void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans, + struct extent_buffer *buf) { struct btrfs_fs_info *fs_info = buf->fs_info; u64 transid = btrfs_header_generation(buf); @@ -4435,21 +4409,16 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags))) return; #endif + /* This is an active transaction (its state < TRANS_STATE_UNBLOCKED). */ + ASSERT(trans->transid == fs_info->generation); btrfs_assert_tree_write_locked(buf); - if (transid != fs_info->generation) - WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n", - buf->start, transid, fs_info->generation); - set_extent_buffer_dirty(buf); -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - /* - * btrfs_check_leaf() won't check item data if we don't have WRITTEN - * set, so this will only validate the basic structure of the items. - */ - if (btrfs_header_level(buf) == 0 && btrfs_check_leaf(buf)) { - btrfs_print_leaf(buf); - ASSERT(0); + if (unlikely(transid != fs_info->generation)) { + btrfs_abort_transaction(trans, -EUCLEAN); + btrfs_crit(fs_info, +"dirty buffer transid mismatch, logical %llu found transid %llu running transid %llu", + buf->start, transid, fs_info->generation); } -#endif + set_extent_buffer_dirty(buf); } static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info, @@ -4609,6 +4578,7 @@ static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, list_del(&ref->add_list); atomic_dec(&delayed_refs->num_entries); btrfs_put_delayed_ref(ref); + btrfs_delayed_refs_rsv_release(fs_info, 1, 0); } if (head->must_insert_reserved) pin_bytes = true; @@ -4806,7 +4776,7 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, spin_unlock(&cur_trans->dirty_bgs_lock); btrfs_put_block_group(cache); - btrfs_delayed_refs_rsv_release(fs_info, 1); + btrfs_dec_delayed_refs_rsv_bg_updates(fs_info); spin_lock(&cur_trans->dirty_bgs_lock); } spin_unlock(&cur_trans->dirty_bgs_lock); @@ -4870,7 +4840,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) while (!list_empty(&fs_info->trans_list)) { t = list_first_entry(&fs_info->trans_list, struct btrfs_transaction, list); - if (t->state >= TRANS_STATE_COMMIT_START) { + if (t->state >= TRANS_STATE_COMMIT_PREP) { refcount_inc(&t->use_count); spin_unlock(&fs_info->trans_lock); btrfs_wait_for_commit(fs_info, t->transid); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 02b645744a82..50dab8f639dc 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -104,7 +104,8 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) } void btrfs_put_root(struct btrfs_root *root); -void btrfs_mark_buffer_dirty(struct extent_buffer *buf); +void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans, + struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); int btrfs_read_extent_buffer(struct extent_buffer *buf, diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index ff8e117a1ace..ea149be28dff 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -105,32 +105,40 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info, lockdep_set_class(&tree->lock, &file_extent_tree_class); } +/* + * Empty an io tree, removing and freeing every extent state record from the + * tree. This should be called once we are sure no other task can access the + * tree anymore, so no tree updates happen after we empty the tree and there + * aren't any waiters on any extent state record (EXTENT_LOCKED bit is never + * set on any extent state when calling this function). + */ void extent_io_tree_release(struct extent_io_tree *tree) { + struct rb_root root; + struct extent_state *state; + struct extent_state *tmp; + spin_lock(&tree->lock); - /* - * Do a single barrier for the waitqueue_active check here, the state - * of the waitqueue should not change once extent_io_tree_release is - * called. - */ - smp_mb(); - while (!RB_EMPTY_ROOT(&tree->state)) { - struct rb_node *node; - struct extent_state *state; - - node = rb_first(&tree->state); - state = rb_entry(node, struct extent_state, rb_node); - rb_erase(&state->rb_node, &tree->state); + root = tree->state; + tree->state = RB_ROOT; + rbtree_postorder_for_each_entry_safe(state, tmp, &root, rb_node) { + /* Clear node to keep free_extent_state() happy. */ RB_CLEAR_NODE(&state->rb_node); + ASSERT(!(state->state & EXTENT_LOCKED)); /* - * btree io trees aren't supposed to have tasks waiting for - * changes in the flags of extent states ever. + * No need for a memory barrier here, as we are holding the tree + * lock and we only change the waitqueue while holding that lock + * (see wait_extent_bit()). */ ASSERT(!waitqueue_active(&state->wq)); free_extent_state(state); - cond_resched_lock(&tree->lock); } + /* + * Should still be empty even after a reschedule, no other task should + * be accessing the tree anymore. + */ + ASSERT(RB_EMPTY_ROOT(&tree->state)); spin_unlock(&tree->lock); } @@ -327,6 +335,36 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err) "locking error: extent tree was modified by another thread while locked"); } +static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *state) +{ + struct extent_state *prev; + + prev = prev_state(state); + if (prev && prev->end == state->start - 1 && prev->state == state->state) { + if (tree->inode) + btrfs_merge_delalloc_extent(tree->inode, state, prev); + state->start = prev->start; + rb_erase(&prev->rb_node, &tree->state); + RB_CLEAR_NODE(&prev->rb_node); + free_extent_state(prev); + } +} + +static void merge_next_state(struct extent_io_tree *tree, struct extent_state *state) +{ + struct extent_state *next; + + next = next_state(state); + if (next && next->start == state->end + 1 && next->state == state->state) { + if (tree->inode) + btrfs_merge_delalloc_extent(tree->inode, state, next); + state->end = next->end; + rb_erase(&next->rb_node, &tree->state); + RB_CLEAR_NODE(&next->rb_node); + free_extent_state(next); + } +} + /* * Utility function to look for merge candidates inside a given range. Any * extents with matching state are merged together into a single extent in the @@ -338,31 +376,11 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err) */ static void merge_state(struct extent_io_tree *tree, struct extent_state *state) { - struct extent_state *other; - if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY)) return; - other = prev_state(state); - if (other && other->end == state->start - 1 && - other->state == state->state) { - if (tree->inode) - btrfs_merge_delalloc_extent(tree->inode, state, other); - state->start = other->start; - rb_erase(&other->rb_node, &tree->state); - RB_CLEAR_NODE(&other->rb_node); - free_extent_state(other); - } - other = next_state(state); - if (other && other->start == state->end + 1 && - other->state == state->state) { - if (tree->inode) - btrfs_merge_delalloc_extent(tree->inode, state, other); - state->end = other->end; - rb_erase(&other->rb_node, &tree->state); - RB_CLEAR_NODE(&other->rb_node); - free_extent_state(other); - } + merge_prev_state(tree, state); + merge_next_state(tree, state); } static void set_state_bits(struct extent_io_tree *tree, @@ -384,19 +402,27 @@ static void set_state_bits(struct extent_io_tree *tree, * Insert an extent_state struct into the tree. 'bits' are set on the * struct before it is inserted. * - * This may return -EEXIST if the extent is already there, in which case the - * state struct is freed. + * Returns a pointer to the struct extent_state record containing the range + * requested for insertion, which may be the same as the given struct or it + * may be an existing record in the tree that was expanded to accommodate the + * requested range. In case of an extent_state different from the one that was + * given, the later can be freed or reused by the caller. + * + * On error it returns an error pointer. * * The tree lock is not taken internally. This is a utility function and * probably isn't what you want to call (see set/clear_extent_bit). */ -static int insert_state(struct extent_io_tree *tree, - struct extent_state *state, - u32 bits, struct extent_changeset *changeset) +static struct extent_state *insert_state(struct extent_io_tree *tree, + struct extent_state *state, + u32 bits, + struct extent_changeset *changeset) { struct rb_node **node; struct rb_node *parent = NULL; - const u64 end = state->end; + const u64 start = state->start - 1; + const u64 end = state->end + 1; + const bool try_merge = !(bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)); set_state_bits(tree, state, bits, changeset); @@ -407,23 +433,42 @@ static int insert_state(struct extent_io_tree *tree, parent = *node; entry = rb_entry(parent, struct extent_state, rb_node); - if (end < entry->start) { + if (state->end < entry->start) { + if (try_merge && end == entry->start && + state->state == entry->state) { + if (tree->inode) + btrfs_merge_delalloc_extent(tree->inode, + state, entry); + entry->start = state->start; + merge_prev_state(tree, entry); + state->state = 0; + return entry; + } node = &(*node)->rb_left; - } else if (end > entry->end) { + } else if (state->end > entry->end) { + if (try_merge && entry->end == start && + state->state == entry->state) { + if (tree->inode) + btrfs_merge_delalloc_extent(tree->inode, + state, entry); + entry->end = state->end; + merge_next_state(tree, entry); + state->state = 0; + return entry; + } node = &(*node)->rb_right; } else { btrfs_err(tree->fs_info, "found node %llu %llu on insert of %llu %llu", - entry->start, entry->end, state->start, end); - return -EEXIST; + entry->start, entry->end, state->start, state->end); + return ERR_PTR(-EEXIST); } } rb_link_node(&state->rb_node, parent, node); rb_insert_color(&state->rb_node, &tree->state); - merge_state(tree, state); - return 0; + return state; } /* @@ -708,26 +753,13 @@ out: } -static void wait_on_state(struct extent_io_tree *tree, - struct extent_state *state) - __releases(tree->lock) - __acquires(tree->lock) -{ - DEFINE_WAIT(wait); - prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&tree->lock); - schedule(); - spin_lock(&tree->lock); - finish_wait(&state->wq, &wait); -} - /* * Wait for one or more bits to clear on a range in the state tree. * The range [start, end] is inclusive. * The tree lock is taken by this function */ -void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, - struct extent_state **cached_state) +static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached_state) { struct extent_state *state; @@ -758,9 +790,15 @@ process_node: goto out; if (state->state & bits) { + DEFINE_WAIT(wait); + start = state->start; refcount_inc(&state->refs); - wait_on_state(tree, state); + prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&tree->lock); + schedule(); + spin_lock(&tree->lock); + finish_wait(&state->wq, &wait); free_extent_state(state); goto again; } @@ -847,10 +885,19 @@ bool find_first_extent_bit(struct extent_io_tree *tree, u64 start, if (state->end == start - 1 && extent_state_in_tree(state)) { while ((state = next_state(state)) != NULL) { if (state->state & bits) - goto got_it; + break; } + /* + * If we found the next extent state, clear cached_state + * so that we can cache the next extent state below and + * avoid future calls going over the same extent state + * again. If we haven't found any, clear as well since + * it's now useless. + */ free_extent_state(*cached_state); *cached_state = NULL; + if (state) + goto got_it; goto out; } free_extent_state(*cached_state); @@ -1133,6 +1180,8 @@ hit_next: */ if (state->start > start) { u64 this_end; + struct extent_state *inserted_state; + if (end < last_start) this_end = end; else @@ -1148,12 +1197,15 @@ hit_next: */ prealloc->start = start; prealloc->end = this_end; - err = insert_state(tree, prealloc, bits, changeset); - if (err) + inserted_state = insert_state(tree, prealloc, bits, changeset); + if (IS_ERR(inserted_state)) { + err = PTR_ERR(inserted_state); extent_io_tree_panic(tree, err); + } - cache_state(prealloc, cached_state); - prealloc = NULL; + cache_state(inserted_state, cached_state); + if (inserted_state == prealloc) + prealloc = NULL; start = this_end + 1; goto search_again; } @@ -1356,6 +1408,8 @@ hit_next: */ if (state->start > start) { u64 this_end; + struct extent_state *inserted_state; + if (end < last_start) this_end = end; else @@ -1373,11 +1427,14 @@ hit_next: */ prealloc->start = start; prealloc->end = this_end; - err = insert_state(tree, prealloc, bits, NULL); - if (err) + inserted_state = insert_state(tree, prealloc, bits, NULL); + if (IS_ERR(inserted_state)) { + err = PTR_ERR(inserted_state); extent_io_tree_panic(tree, err); - cache_state(prealloc, cached_state); - prealloc = NULL; + } + cache_state(inserted_state, cached_state); + if (inserted_state == prealloc) + prealloc = NULL; start = this_end + 1; goto search_again; } @@ -1640,15 +1697,46 @@ search: } /* - * Search a range in the state tree for a given mask. If 'filled' == 1, this - * returns 1 only if every extent in the tree has the bits set. Otherwise, 1 - * is returned if any bit in the range is found set. + * Check if the single @bit exists in the given range. + */ +bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit) +{ + struct extent_state *state = NULL; + bool bitset = false; + + ASSERT(is_power_of_2(bit)); + + spin_lock(&tree->lock); + state = tree_search(tree, start); + while (state && start <= end) { + if (state->start > end) + break; + + if (state->state & bit) { + bitset = true; + break; + } + + /* If state->end is (u64)-1, start will overflow to 0 */ + start = state->end + 1; + if (start > end || start == 0) + break; + state = next_state(state); + } + spin_unlock(&tree->lock); + return bitset; +} + +/* + * Check if the whole range [@start,@end) contains the single @bit set. */ -int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, int filled, struct extent_state *cached) +bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, + struct extent_state *cached) { struct extent_state *state = NULL; - int bitset = 0; + bool bitset = true; + + ASSERT(is_power_of_2(bit)); spin_lock(&tree->lock); if (cached && extent_state_in_tree(cached) && cached->start <= start && @@ -1657,35 +1745,35 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, else state = tree_search(tree, start); while (state && start <= end) { - if (filled && state->start > start) { - bitset = 0; + if (state->start > start) { + bitset = false; break; } if (state->start > end) break; - if (state->state & bits) { - bitset = 1; - if (!filled) - break; - } else if (filled) { - bitset = 0; + if ((state->state & bit) == 0) { + bitset = false; break; } if (state->end == (u64)-1) break; + /* + * Last entry (if state->end is (u64)-1 and overflow happens), + * or next entry starts after the range. + */ start = state->end + 1; - if (start > end) + if (start > end || start == 0) break; state = next_state(state); } /* We ran out of states and were still inside of our range. */ - if (filled && !state) - bitset = 0; + if (!state) + bitset = false; spin_unlock(&tree->lock); return bitset; } diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 28c23a23d121..5602b0137fcd 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -131,8 +131,9 @@ u64 count_range_bits(struct extent_io_tree *tree, struct extent_state **cached_state); void free_extent_state(struct extent_state *state); -int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, int filled, struct extent_state *cached_state); +bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, + struct extent_state *cached_state); +bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit); int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_changeset *changeset); int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, @@ -192,7 +193,5 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, u64 *end, u64 max_bytes, struct extent_state **cached_state); -void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, - struct extent_state **cached_state); #endif /* BTRFS_EXTENT_IO_TREE_H */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f356f08b55cb..c8e5b4715b49 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -42,14 +42,16 @@ #include "file-item.h" #include "orphan.h" #include "tree-checker.h" +#include "raid-stripe-tree.h" #undef SCRAMBLE_DELAYED_REFS static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, - u64 owner_offset, int refs_to_drop, + u64 owner_offset, struct btrfs_delayed_extent_op *extra_op); static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, @@ -57,7 +59,7 @@ static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, u64 flags, u64 owner, u64 offset, - struct btrfs_key *ins, int ref_mod); + struct btrfs_key *ins, int ref_mod, u64 oref_root); static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op); @@ -344,9 +346,15 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, struct btrfs_extent_inline_ref *iref, enum btrfs_inline_ref_type is_data) { + struct btrfs_fs_info *fs_info = eb->fs_info; int type = btrfs_extent_inline_ref_type(eb, iref); u64 offset = btrfs_extent_inline_ref_offset(eb, iref); + if (type == BTRFS_EXTENT_OWNER_REF_KEY) { + ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + return type; + } + if (type == BTRFS_TREE_BLOCK_REF_KEY || type == BTRFS_SHARED_BLOCK_REF_KEY || type == BTRFS_SHARED_DATA_REF_KEY || @@ -355,26 +363,25 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, if (type == BTRFS_TREE_BLOCK_REF_KEY) return type; if (type == BTRFS_SHARED_BLOCK_REF_KEY) { - ASSERT(eb->fs_info); + ASSERT(fs_info); /* * Every shared one has parent tree block, * which must be aligned to sector size. */ - if (offset && - IS_ALIGNED(offset, eb->fs_info->sectorsize)) + if (offset && IS_ALIGNED(offset, fs_info->sectorsize)) return type; } } else if (is_data == BTRFS_REF_TYPE_DATA) { if (type == BTRFS_EXTENT_DATA_REF_KEY) return type; if (type == BTRFS_SHARED_DATA_REF_KEY) { - ASSERT(eb->fs_info); + ASSERT(fs_info); /* * Every shared one has parent tree block, * which must be aligned to sector size. */ if (offset && - IS_ALIGNED(offset, eb->fs_info->sectorsize)) + IS_ALIGNED(offset, fs_info->sectorsize)) return type; } } else { @@ -385,7 +392,7 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, WARN_ON(1); btrfs_print_leaf(eb); - btrfs_err(eb->fs_info, + btrfs_err(fs_info, "eb %llu iref 0x%lx invalid extent inline ref type %d", eb->start, (unsigned long)iref, type); @@ -399,11 +406,11 @@ u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) __le64 lenum; lenum = cpu_to_le64(root_objectid); - high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); + high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(owner); - low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(offset); - low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); return ((u64)high_crc << 31) ^ (u64)low_crc; } @@ -575,7 +582,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans, btrfs_set_extent_data_ref_count(leaf, ref, num_refs); } } - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); ret = 0; fail: btrfs_release_path(path); @@ -623,7 +630,7 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); else if (key.type == BTRFS_SHARED_DATA_REF_KEY) btrfs_set_shared_data_ref_count(leaf, ref2, num_refs); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); } return ret; } @@ -789,7 +796,6 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, int type; int want; int ret; - int err = 0; bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); int needed; @@ -816,10 +822,8 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, again: ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1); - if (ret < 0) { - err = ret; + if (ret < 0) goto out; - } /* * We may be a newly converted file system which still has the old fat @@ -846,7 +850,7 @@ again: } if (ret && !insert) { - err = -ENOENT; + ret = -ENOENT; goto out; } else if (WARN_ON(ret)) { btrfs_print_leaf(path->nodes[0]); @@ -854,18 +858,18 @@ again: "extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu", bytenr, num_bytes, parent, root_objectid, owner, offset); - err = -EIO; + ret = -EUCLEAN; goto out; } leaf = path->nodes[0]; item_size = btrfs_item_size(leaf, path->slots[0]); if (unlikely(item_size < sizeof(*ei))) { - err = -EUCLEAN; + ret = -EUCLEAN; btrfs_err(fs_info, "unexpected extent item size, has %llu expect >= %zu", item_size, sizeof(*ei)); - btrfs_abort_transaction(trans, err); + btrfs_abort_transaction(trans, ret); goto out; } @@ -885,22 +889,17 @@ again: else needed = BTRFS_REF_TYPE_BLOCK; - err = -ENOENT; - while (1) { - if (ptr >= end) { - if (ptr > end) { - err = -EUCLEAN; - btrfs_print_leaf(path->nodes[0]); - btrfs_crit(fs_info, -"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu", - path->slots[0], root_objectid, owner, offset, parent); - } - break; - } + ret = -ENOENT; + while (ptr < end) { iref = (struct btrfs_extent_inline_ref *)ptr; type = btrfs_get_extent_inline_ref_type(leaf, iref, needed); + if (type == BTRFS_EXTENT_OWNER_REF_KEY) { + ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + ptr += btrfs_extent_inline_ref_size(type); + continue; + } if (type == BTRFS_REF_TYPE_INVALID) { - err = -EUCLEAN; + ret = -EUCLEAN; goto out; } @@ -916,7 +915,7 @@ again: dref = (struct btrfs_extent_data_ref *)(&iref->offset); if (match_extent_data_ref(leaf, dref, root_objectid, owner, offset)) { - err = 0; + ret = 0; break; } if (hash_extent_data_ref_item(leaf, dref) < @@ -927,14 +926,14 @@ again: ref_offset = btrfs_extent_inline_ref_offset(leaf, iref); if (parent > 0) { if (parent == ref_offset) { - err = 0; + ret = 0; break; } if (ref_offset < parent) break; } else { if (root_objectid == ref_offset) { - err = 0; + ret = 0; break; } if (ref_offset < root_objectid) @@ -943,10 +942,20 @@ again: } ptr += btrfs_extent_inline_ref_size(type); } - if (err == -ENOENT && insert) { + + if (unlikely(ptr > end)) { + ret = -EUCLEAN; + btrfs_print_leaf(path->nodes[0]); + btrfs_crit(fs_info, +"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu", + path->slots[0], root_objectid, owner, offset, parent); + goto out; + } + + if (ret == -ENOENT && insert) { if (item_size + extra_size >= BTRFS_MAX_EXTENT_ITEM_SIZE(root)) { - err = -EAGAIN; + ret = -EAGAIN; goto out; } /* @@ -958,7 +967,7 @@ again: if (find_next_key(path, 0, &key) == 0 && key.objectid == bytenr && key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) { - err = -EAGAIN; + ret = -EAGAIN; goto out; } } @@ -969,14 +978,14 @@ out: path->search_for_extension = 0; btrfs_unlock_up_safe(path, 1); } - return err; + return ret; } /* * helper to add new inline back ref */ static noinline_for_stack -void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, +void setup_inline_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, u64 parent, u64 root_objectid, @@ -999,7 +1008,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, type = extent_ref_type(parent, owner); size = btrfs_extent_inline_ref_size(type); - btrfs_extend_item(path, size); + btrfs_extend_item(trans, path, size); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, ei); @@ -1033,7 +1042,7 @@ void setup_inline_extent_backref(struct btrfs_fs_info *fs_info, } else { btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid); } - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); } static int lookup_extent_backref(struct btrfs_trans_handle *trans, @@ -1066,7 +1075,9 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans, /* * helper to update/remove inline back ref */ -static noinline_for_stack int update_inline_extent_backref(struct btrfs_path *path, +static noinline_for_stack int update_inline_extent_backref( + struct btrfs_trans_handle *trans, + struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_mod, struct btrfs_delayed_extent_op *extent_op) @@ -1174,9 +1185,9 @@ static noinline_for_stack int update_inline_extent_backref(struct btrfs_path *pa memmove_extent_buffer(leaf, ptr, ptr + size, end - ptr - size); item_size -= size; - btrfs_truncate_item(path, item_size, 1); + btrfs_truncate_item(trans, path, item_size, 1); } - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); return 0; } @@ -1206,9 +1217,10 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, bytenr, num_bytes, root_objectid, path->slots[0]); return -EUCLEAN; } - ret = update_inline_extent_backref(path, iref, refs_to_add, extent_op); + ret = update_inline_extent_backref(trans, path, iref, + refs_to_add, extent_op); } else if (ret == -ENOENT) { - setup_inline_extent_backref(trans->fs_info, path, iref, parent, + setup_inline_extent_backref(trans, path, iref, parent, root_objectid, owner, offset, refs_to_add, extent_op); ret = 0; @@ -1226,7 +1238,8 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, BUG_ON(!is_data && refs_to_drop != 1); if (iref) - ret = update_inline_extent_backref(path, iref, -refs_to_drop, NULL); + ret = update_inline_extent_backref(trans, path, iref, + -refs_to_drop, NULL); else if (is_data) ret = remove_extent_data_ref(trans, root, path, refs_to_drop); else @@ -1422,7 +1435,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, ASSERT(generic_ref->type != BTRFS_REF_NOT_SET && generic_ref->action); BUG_ON(generic_ref->type == BTRFS_REF_METADATA && - generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID); + generic_ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID); if (generic_ref->type == BTRFS_REF_METADATA) ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL); @@ -1435,7 +1448,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, } /* - * __btrfs_inc_extent_ref - insert backreference for a given extent + * Insert backreference for a given extent. * * The counterpart is in __btrfs_free_extent(), with examples and more details * how it works. @@ -1465,8 +1478,6 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, * always passed as 0. For data extents it is the fileoffset * this extent belongs to. * - * @refs_to_add Number of references to add - * * @extent_op Pointer to a structure, holding information necessary when * updating a tree block's flags * @@ -1474,7 +1485,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, - u64 owner, u64 offset, int refs_to_add, + u64 owner, u64 offset, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_path *path; @@ -1484,6 +1495,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, u64 bytenr = node->bytenr; u64 num_bytes = node->num_bytes; u64 refs; + int refs_to_add = node->ref_mod; int ret; path = btrfs_alloc_path(); @@ -1510,19 +1522,18 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (extent_op) __run_delayed_extent_op(extent_op, leaf, item); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* now insert the actual backref */ - if (owner < BTRFS_FIRST_FREE_OBJECTID) { - BUG_ON(refs_to_add != 1); + if (owner < BTRFS_FIRST_FREE_OBJECTID) ret = insert_tree_block_ref(trans, path, bytenr, parent, root_objectid); - } else { + else ret = insert_extent_data_ref(trans, path, bytenr, parent, root_objectid, owner, offset, refs_to_add); - } + if (ret) btrfs_abort_transaction(trans, ret); out: @@ -1531,44 +1542,57 @@ out: } static int run_delayed_data_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) { int ret = 0; struct btrfs_delayed_data_ref *ref; - struct btrfs_key ins; u64 parent = 0; - u64 ref_root = 0; u64 flags = 0; - ins.objectid = node->bytenr; - ins.offset = node->num_bytes; - ins.type = BTRFS_EXTENT_ITEM_KEY; - ref = btrfs_delayed_node_to_data_ref(node); trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action); if (node->type == BTRFS_SHARED_DATA_REF_KEY) parent = ref->parent; - ref_root = ref->root; if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { + struct btrfs_key key; + struct btrfs_squota_delta delta = { + .root = href->owning_root, + .num_bytes = node->num_bytes, + .rsv_bytes = href->reserved_bytes, + .is_data = true, + .is_inc = true, + .generation = trans->transid, + }; + if (extent_op) flags |= extent_op->flags_to_set; - ret = alloc_reserved_file_extent(trans, parent, ref_root, + + key.objectid = node->bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = node->num_bytes; + + ret = alloc_reserved_file_extent(trans, parent, ref->root, flags, ref->objectid, - ref->offset, &ins, - node->ref_mod); + ref->offset, &key, + node->ref_mod, href->owning_root); + if (!ret) + ret = btrfs_record_squota_delta(trans->fs_info, &delta); + else + btrfs_qgroup_free_refroot(trans->fs_info, delta.root, + delta.rsv_bytes, BTRFS_QGROUP_RSV_DATA); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, + ret = __btrfs_inc_extent_ref(trans, node, parent, ref->root, ref->objectid, ref->offset, - node->ref_mod, extent_op); + extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, node, parent, - ref_root, ref->objectid, - ref->offset, node->ref_mod, - extent_op); + ret = __btrfs_free_extent(trans, href, node, parent, + ref->root, ref->objectid, + ref->offset, extent_op); } else { BUG(); } @@ -1605,7 +1629,6 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; u32 item_size; int ret; - int err = 0; int metadata = 1; if (TRANS_ABORTED(trans)) @@ -1632,10 +1655,8 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, again: ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { - err = ret; goto out; - } - if (ret > 0) { + } else if (ret > 0) { if (metadata) { if (path->slots[0] > 0) { path->slots[0]--; @@ -1656,7 +1677,10 @@ again: goto again; } } else { - err = -EIO; + ret = -EUCLEAN; + btrfs_err(fs_info, + "missing extent item for extent %llu num_bytes %llu level %d", + head->bytenr, head->num_bytes, extent_op->level); goto out; } } @@ -1665,29 +1689,31 @@ again: item_size = btrfs_item_size(leaf, path->slots[0]); if (unlikely(item_size < sizeof(*ei))) { - err = -EUCLEAN; + ret = -EUCLEAN; btrfs_err(fs_info, "unexpected extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); - btrfs_abort_transaction(trans, err); + btrfs_abort_transaction(trans, ret); goto out; } ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); __run_delayed_extent_op(extent_op, leaf, ei); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); - return err; + return ret; } static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) { int ret = 0; + struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_tree_ref *ref; u64 parent = 0; u64 ref_root = 0; @@ -1699,22 +1725,33 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, parent = ref->parent; ref_root = ref->root; - if (node->ref_mod != 1) { + if (unlikely(node->ref_mod != 1)) { btrfs_err(trans->fs_info, - "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu", + "btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu", node->bytenr, node->ref_mod, node->action, ref_root, parent); - return -EIO; + return -EUCLEAN; } if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) { + struct btrfs_squota_delta delta = { + .root = href->owning_root, + .num_bytes = fs_info->nodesize, + .rsv_bytes = 0, + .is_data = false, + .is_inc = true, + .generation = trans->transid, + }; + BUG_ON(!extent_op || !extent_op->update_flags); ret = alloc_reserved_tree_block(trans, node, extent_op); + if (!ret) + btrfs_record_squota_delta(fs_info, &delta); } else if (node->action == BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root, - ref->level, 0, 1, extent_op); + ref->level, 0, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, node, parent, ref_root, - ref->level, 0, 1, extent_op); + ret = __btrfs_free_extent(trans, href, node, parent, ref_root, + ref->level, 0, extent_op); } else { BUG(); } @@ -1723,6 +1760,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, /* helper function to actually process a single delayed ref entry */ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op, bool insert_reserved) @@ -1737,12 +1775,14 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, if (node->type == BTRFS_TREE_BLOCK_REF_KEY || node->type == BTRFS_SHARED_BLOCK_REF_KEY) - ret = run_delayed_tree_ref(trans, node, extent_op, + ret = run_delayed_tree_ref(trans, href, node, extent_op, insert_reserved); else if (node->type == BTRFS_EXTENT_DATA_REF_KEY || node->type == BTRFS_SHARED_DATA_REF_KEY) - ret = run_delayed_data_ref(trans, node, extent_op, + ret = run_delayed_data_ref(trans, href, node, extent_op, insert_reserved); + else if (node->type == BTRFS_EXTENT_OWNER_REF_KEY) + ret = 0; else BUG(); if (ret && insert_reserved) @@ -1821,28 +1861,37 @@ static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans, return ret ? ret : 1; } -void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, +u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { - int nr_items = 1; /* Dropping this ref head update. */ - /* * We had csum deletions accounted for in our delayed refs rsv, we need * to drop the csum leaves for this update from our delayed_refs_rsv. */ if (head->total_ref_mod < 0 && head->is_data) { + int nr_csums; + spin_lock(&delayed_refs->lock); delayed_refs->pending_csums -= head->num_bytes; spin_unlock(&delayed_refs->lock); - nr_items += btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes); + nr_csums = btrfs_csum_bytes_to_leaves(fs_info, head->num_bytes); + + btrfs_delayed_refs_rsv_release(fs_info, 0, nr_csums); + + return btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums); } + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE && + head->must_insert_reserved && head->is_data) + btrfs_qgroup_free_refroot(fs_info, head->owning_root, + head->reserved_bytes, BTRFS_QGROUP_RSV_DATA); - btrfs_delayed_refs_rsv_release(fs_info, nr_items); + return 0; } static int cleanup_ref_head(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *head) + struct btrfs_delayed_ref_head *head, + u64 *bytes_released) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -1887,7 +1936,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, } } - btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); + *bytes_released += btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head); trace_run_delayed_ref_head(fs_info, head, 0); btrfs_delayed_ref_unlock(head); @@ -1929,7 +1978,8 @@ static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( } static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_head *locked_ref) + struct btrfs_delayed_ref_head *locked_ref, + u64 *bytes_released) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; @@ -1983,8 +2033,10 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, locked_ref->extent_op = NULL; spin_unlock(&locked_ref->lock); - ret = run_one_delayed_ref(trans, ref, extent_op, + ret = run_one_delayed_ref(trans, locked_ref, ref, extent_op, must_insert_reserved); + btrfs_delayed_refs_rsv_release(fs_info, 1, 0); + *bytes_released += btrfs_calc_delayed_ref_bytes(fs_info, 1); btrfs_free_delayed_extent_op(extent_op); if (ret) { @@ -2008,15 +2060,22 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, * Returns -ENOMEM or -EIO on failure and will abort the transaction. */ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - unsigned long nr) + u64 min_bytes) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_head *locked_ref = NULL; int ret; unsigned long count = 0; + unsigned long max_count = 0; + u64 bytes_processed = 0; delayed_refs = &trans->transaction->delayed_refs; + if (min_bytes == 0) { + max_count = delayed_refs->num_heads_ready; + min_bytes = U64_MAX; + } + do { if (!locked_ref) { locked_ref = btrfs_obtain_ref_head(trans); @@ -2044,7 +2103,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, spin_lock(&locked_ref->lock); btrfs_merge_delayed_refs(fs_info, delayed_refs, locked_ref); - ret = btrfs_run_delayed_refs_for_head(trans, locked_ref); + ret = btrfs_run_delayed_refs_for_head(trans, locked_ref, &bytes_processed); if (ret < 0 && ret != -EAGAIN) { /* * Error, btrfs_run_delayed_refs_for_head already @@ -2056,7 +2115,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, * Success, perform the usual cleanup of a processed * head */ - ret = cleanup_ref_head(trans, locked_ref); + ret = cleanup_ref_head(trans, locked_ref, &bytes_processed); if (ret > 0 ) { /* We dropped our lock, we need to loop. */ ret = 0; @@ -2073,7 +2132,9 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, locked_ref = NULL; cond_resched(); - } while ((nr != -1 && count < nr) || locked_ref); + } while ((min_bytes != U64_MAX && bytes_processed < min_bytes) || + (max_count > 0 && count < max_count) || + locked_ref); return 0; } @@ -2122,24 +2183,25 @@ static u64 find_middle(struct rb_root *root) #endif /* - * this starts processing the delayed reference count updates and - * extent insertions we have queued up so far. count can be - * 0, which means to process everything in the tree at the start - * of the run (but not newly added entries), or it can be some target - * number you'd like to process. + * Start processing the delayed reference count updates and extent insertions + * we have queued up so far. + * + * @trans: Transaction handle. + * @min_bytes: How many bytes of delayed references to process. After this + * many bytes we stop processing delayed references if there are + * any more. If 0 it means to run all existing delayed references, + * but not new ones added after running all existing ones. + * Use (u64)-1 (U64_MAX) to run all existing delayed references + * plus any new ones that are added. * * Returns 0 on success or if called with an aborted transaction * Returns <0 on error and aborts the transaction */ -int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - unsigned long count) +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_head *head; int ret; - int run_all = count == (unsigned long)-1; /* We'll clean this up in btrfs_cleanup_transaction */ if (TRANS_ABORTED(trans)) @@ -2149,42 +2211,30 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, return 0; delayed_refs = &trans->transaction->delayed_refs; - if (count == 0) - count = delayed_refs->num_heads_ready; - again: #ifdef SCRAMBLE_DELAYED_REFS delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif - ret = __btrfs_run_delayed_refs(trans, count); + ret = __btrfs_run_delayed_refs(trans, min_bytes); if (ret < 0) { btrfs_abort_transaction(trans, ret); return ret; } - if (run_all) { + if (min_bytes == U64_MAX) { btrfs_create_pending_block_groups(trans); spin_lock(&delayed_refs->lock); - node = rb_first_cached(&delayed_refs->href_root); - if (!node) { + if (RB_EMPTY_ROOT(&delayed_refs->href_root.rb_root)) { spin_unlock(&delayed_refs->lock); - goto out; + return 0; } - head = rb_entry(node, struct btrfs_delayed_ref_head, - href_node); - refcount_inc(&head->refs); spin_unlock(&delayed_refs->lock); - /* Mutex was contended, block until it's released and retry. */ - mutex_lock(&head->mutex); - mutex_unlock(&head->mutex); - - btrfs_put_delayed_ref_head(head); cond_resched(); goto again; } -out: + return 0; } @@ -2309,6 +2359,7 @@ static noinline int check_committed_ref(struct btrfs_root *root, struct btrfs_extent_item *ei; struct btrfs_key key; u32 item_size; + u32 expected_size; int type; int ret; @@ -2335,10 +2386,22 @@ static noinline int check_committed_ref(struct btrfs_root *root, ret = 1; item_size = btrfs_item_size(leaf, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); + expected_size = sizeof(*ei) + btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY); + + /* No inline refs; we need to bail before checking for owner ref. */ + if (item_size == sizeof(*ei)) + goto out; + + /* Check for an owner ref; skip over it to the real inline refs. */ + iref = (struct btrfs_extent_inline_ref *)(ei + 1); + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); + if (btrfs_fs_incompat(fs_info, SIMPLE_QUOTA) && type == BTRFS_EXTENT_OWNER_REF_KEY) { + expected_size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY); + iref = (struct btrfs_extent_inline_ref *)(iref + 1); + } /* If extent item has more than 1 inline ref then it's shared */ - if (item_size != sizeof(*ei) + - btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY)) + if (item_size != expected_size) goto out; /* @@ -2350,8 +2413,6 @@ static noinline int check_committed_ref(struct btrfs_root *root, btrfs_root_last_snapshot(&root->root_item))) goto out; - iref = (struct btrfs_extent_inline_ref *)(ei + 1); - /* If this extent has SHARED_DATA_REF then it's shared */ type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); if (type != BTRFS_EXTENT_DATA_REF_KEY) @@ -2448,7 +2509,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi); key.offset -= btrfs_file_extent_offset(buf, fi); btrfs_init_generic_ref(&generic_ref, action, bytenr, - num_bytes, parent); + num_bytes, parent, ref_root); btrfs_init_data_ref(&generic_ref, ref_root, key.objectid, key.offset, root->root_key.objectid, for_reloc); @@ -2461,8 +2522,9 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, } else { bytenr = btrfs_node_blockptr(buf, i); num_bytes = fs_info->nodesize; + /* We don't know the owning_root, use 0. */ btrfs_init_generic_ref(&generic_ref, action, bytenr, - num_bytes, parent); + num_bytes, parent, 0); btrfs_init_tree_ref(&generic_ref, level - 1, ref_root, root->root_key.objectid, for_reloc); if (inc) @@ -2563,16 +2625,13 @@ int btrfs_pin_extent(struct btrfs_trans_handle *trans, return 0; } -/* - * this function must be called within transaction - */ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes) + const struct extent_buffer *eb) { struct btrfs_block_group *cache; int ret; - cache = btrfs_lookup_block_group(trans->fs_info, bytenr); + cache = btrfs_lookup_block_group(trans->fs_info, eb->start); if (!cache) return -EINVAL; @@ -2584,10 +2643,10 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, if (ret) goto out; - pin_down_extent(trans, cache, bytenr, num_bytes, 0); + pin_down_extent(trans, cache, eb->start, eb->len, 0); /* remove us from the free space cache (if we're there at all) */ - ret = btrfs_remove_free_space(cache, bytenr, num_bytes); + ret = btrfs_remove_free_space(cache, eb->start, eb->len); out: btrfs_put_block_group(cache); return ret; @@ -2842,12 +2901,61 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) return 0; } +/* + * Parse an extent item's inline extents looking for a simple quotas owner ref. + * + * @fs_info: the btrfs_fs_info for this mount + * @leaf: a leaf in the extent tree containing the extent item + * @slot: the slot in the leaf where the extent item is found + * + * Returns the objectid of the root that originally allocated the extent item + * if the inline owner ref is expected and present, otherwise 0. + * + * If an extent item has an owner ref item, it will be the first inline ref + * item. Therefore the logic is to check whether there are any inline ref + * items, then check the type of the first one. + */ +u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf, int slot) +{ + struct btrfs_extent_item *ei; + struct btrfs_extent_inline_ref *iref; + struct btrfs_extent_owner_ref *oref; + unsigned long ptr; + unsigned long end; + int type; + + if (!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)) + return 0; + + ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + ptr = (unsigned long)(ei + 1); + end = (unsigned long)ei + btrfs_item_size(leaf, slot); + + /* No inline ref items of any kind, can't check type. */ + if (ptr == end) + return 0; + + iref = (struct btrfs_extent_inline_ref *)ptr; + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); + + /* We found an owner ref, get the root out of it. */ + if (type == BTRFS_EXTENT_OWNER_REF_KEY) { + oref = (struct btrfs_extent_owner_ref *)(&iref->offset); + return btrfs_extent_owner_ref_root_id(leaf, oref); + } + + /* We have inline refs, but not an owner ref. */ + return 0; +} + static int do_free_extent_accounting(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, bool is_data) + u64 bytenr, struct btrfs_squota_delta *delta) { int ret; + u64 num_bytes = delta->num_bytes; - if (is_data) { + if (delta->is_data) { struct btrfs_root *csum_root; csum_root = btrfs_csum_root(trans->fs_info, bytenr); @@ -2856,6 +2964,18 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); return ret; } + + ret = btrfs_delete_raid_extent(trans, bytenr, num_bytes); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + } + + ret = btrfs_record_squota_delta(trans->fs_info, delta); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; } ret = add_to_free_space_tree(trans, bytenr, num_bytes); @@ -2938,9 +3058,10 @@ static int do_free_extent_accounting(struct btrfs_trans_handle *trans, * And that (13631488 EXTENT_DATA_REF <HASH>) gets removed. */ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, - u64 owner_offset, int refs_to_drop, + u64 owner_offset, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *info = trans->fs_info; @@ -2955,11 +3076,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, int extent_slot = 0; int found_extent = 0; int num_to_del = 1; + int refs_to_drop = node->ref_mod; u32 item_size; u64 refs; u64 bytenr = node->bytenr; u64 num_bytes = node->num_bytes; bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); + u64 delayed_ref_root = href->owning_root; extent_root = btrfs_extent_root(info, bytenr); ASSERT(extent_root); @@ -3149,7 +3272,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } else { btrfs_set_extent_refs(leaf, ei, refs); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); } if (found_extent) { ret = remove_extent_backref(trans, extent_root, path, @@ -3160,6 +3283,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } } else { + struct btrfs_squota_delta delta = { + .root = delayed_ref_root, + .num_bytes = num_bytes, + .rsv_bytes = 0, + .is_data = is_data, + .is_inc = false, + .generation = btrfs_extent_generation(leaf, ei), + }; + /* In this branch refs == 1 */ if (found_extent) { if (is_data && refs_to_drop != @@ -3198,6 +3330,16 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, num_to_del = 2; } } + /* + * We can't infer the data owner from the delayed ref, so we need + * to try to get it from the owning ref item. + * + * If it is not present, then that extent was not written under + * simple quotas mode, so we don't need to account for its deletion. + */ + if (is_data) + delta.root = btrfs_get_extent_owner_root(trans->fs_info, + leaf, extent_slot); ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); @@ -3207,7 +3349,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data); + ret = do_free_extent_accounting(trans, bytenr, &delta); } btrfs_release_path(path); @@ -3281,7 +3423,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, int ret; btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, - buf->start, buf->len, parent); + buf->start, buf->len, parent, btrfs_header_owner(buf)); btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), root_id, 0, false); @@ -3368,10 +3510,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) * tree, just update pinning info and exit early. */ if ((ref->type == BTRFS_REF_METADATA && - ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) || + ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID) || (ref->type == BTRFS_REF_DATA && - ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) { - /* unlocks the pinned mutex */ + ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) { btrfs_pin_extent(trans, ref->bytenr, ref->len, 1); ret = 0; } else if (ref->type == BTRFS_REF_METADATA) { @@ -3381,9 +3522,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) } if (!((ref->type == BTRFS_REF_METADATA && - ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) || + ref->tree_ref.ref_root == BTRFS_TREE_LOG_OBJECTID) || (ref->type == BTRFS_REF_DATA && - ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID))) + ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID))) btrfs_ref_tree_mod(fs_info, ref); return ret; @@ -4440,8 +4581,8 @@ loop: } /* - * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a - * hole that is at least as big as @num_bytes. + * Entry point to the extent allocator. Tries to find a hole that is at least + * as big as @num_bytes. * * @root - The root that will contain this extent * @@ -4560,20 +4701,20 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, return 0; } -int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, - u64 len) +int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, + const struct extent_buffer *eb) { struct btrfs_block_group *cache; int ret = 0; - cache = btrfs_lookup_block_group(trans->fs_info, start); + cache = btrfs_lookup_block_group(trans->fs_info, eb->start); if (!cache) { btrfs_err(trans->fs_info, "unable to find block group for %llu", - start); + eb->start); return -ENOSPC; } - ret = pin_down_extent(trans, cache, start, len, 1); + ret = pin_down_extent(trans, cache, eb->start, eb->len, 1); btrfs_put_block_group(cache); return ret; } @@ -4603,24 +4744,29 @@ static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr, static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, u64 flags, u64 owner, u64 offset, - struct btrfs_key *ins, int ref_mod) + struct btrfs_key *ins, int ref_mod, u64 oref_root) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *extent_root; int ret; struct btrfs_extent_item *extent_item; + struct btrfs_extent_owner_ref *oref; struct btrfs_extent_inline_ref *iref; struct btrfs_path *path; struct extent_buffer *leaf; int type; u32 size; + const bool simple_quota = (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE); if (parent > 0) type = BTRFS_SHARED_DATA_REF_KEY; else type = BTRFS_EXTENT_DATA_REF_KEY; - size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); + size = sizeof(*extent_item); + if (simple_quota) + size += btrfs_extent_inline_ref_size(BTRFS_EXTENT_OWNER_REF_KEY); + size += btrfs_extent_inline_ref_size(type); path = btrfs_alloc_path(); if (!path) @@ -4642,7 +4788,14 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, flags | BTRFS_EXTENT_FLAG_DATA); iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); + if (simple_quota) { + btrfs_set_extent_inline_ref_type(leaf, iref, BTRFS_EXTENT_OWNER_REF_KEY); + oref = (struct btrfs_extent_owner_ref *)(&iref->offset); + btrfs_set_extent_owner_ref_root_id(leaf, oref, oref_root); + iref = (struct btrfs_extent_inline_ref *)(oref + 1); + } btrfs_set_extent_inline_ref_type(leaf, iref, type); + if (parent > 0) { struct btrfs_shared_data_ref *ref; ref = (struct btrfs_shared_data_ref *)(iref + 1); @@ -4657,7 +4810,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_set_extent_data_ref_count(leaf, ref, ref_mod); } - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_free_path(path); return alloc_reserved_extent(trans, ins->objectid, ins->offset); @@ -4732,7 +4885,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root); } - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_free_path(path); return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize); @@ -4744,12 +4897,17 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_key *ins) { struct btrfs_ref generic_ref = { 0 }; + u64 root_objectid = root->root_key.objectid; + u64 owning_root = root_objectid; + + BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID); - BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + if (btrfs_is_data_reloc_root(root) && is_fstree(root->relocation_src_root)) + owning_root = root->relocation_src_root; btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, - ins->objectid, ins->offset, 0); - btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, + ins->objectid, ins->offset, 0, owning_root); + btrfs_init_data_ref(&generic_ref, root_objectid, owner, offset, 0, false); btrfs_ref_tree_mod(root->fs_info, &generic_ref); @@ -4769,6 +4927,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, int ret; struct btrfs_block_group *block_group; struct btrfs_space_info *space_info; + struct btrfs_squota_delta delta = { + .root = root_objectid, + .num_bytes = ins->offset, + .generation = trans->transid, + .rsv_bytes = 0, + .is_data = true, + .is_inc = true, + }; /* * Mixed block groups will exclude before processing the log so we only @@ -4794,13 +4960,36 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, spin_unlock(&space_info->lock); ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner, - offset, ins, 1); + offset, ins, 1, root_objectid); if (ret) btrfs_pin_extent(trans, ins->objectid, ins->offset, 1); + ret = btrfs_record_squota_delta(fs_info, &delta); btrfs_put_block_group(block_group); return ret; } +#ifdef CONFIG_BTRFS_DEBUG +/* + * Extra safety check in case the extent tree is corrupted and extent allocator + * chooses to use a tree block which is already used and locked. + */ +static bool check_eb_lock_owner(const struct extent_buffer *eb) +{ + if (eb->lock_owner == current->pid) { + btrfs_err_rl(eb->fs_info, +"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected", + eb->start, btrfs_header_owner(eb), current->pid); + return true; + } + return false; +} +#else +static bool check_eb_lock_owner(struct extent_buffer *eb) +{ + return false; +} +#endif + static struct extent_buffer * btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, int level, u64 owner, @@ -4814,15 +5003,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, if (IS_ERR(buf)) return buf; - /* - * Extra safety check in case the extent tree is corrupted and extent - * allocator chooses to use a tree block which is already used and - * locked. - */ - if (buf->lock_owner == current->pid) { - btrfs_err_rl(fs_info, -"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected", - buf->start, btrfs_header_owner(buf), current->pid); + if (check_eb_lock_owner(buf)) { free_extent_buffer(buf); return ERR_PTR(-EUCLEAN); } @@ -4899,6 +5080,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, const struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size, + u64 reloc_src_root, enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -4911,6 +5093,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, int ret; u32 blocksize = fs_info->nodesize; bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); + u64 owning_root; #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (btrfs_is_testing(fs_info)) { @@ -4937,11 +5120,13 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, ret = PTR_ERR(buf); goto out_free_reserved; } + owning_root = btrfs_header_owner(buf); if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { if (parent == 0) parent = ins.objectid; flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF; + owning_root = reloc_src_root; } else BUG_ON(parent > 0); @@ -4961,7 +5146,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, extent_op->level = level; btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, - ins.objectid, ins.offset, parent); + ins.objectid, ins.offset, parent, owning_root); btrfs_init_tree_ref(&generic_ref, level, root_objectid, root->root_key.objectid, false); btrfs_ref_tree_mod(fs_info, &generic_ref); @@ -5382,7 +5567,8 @@ skip: find_next_key(path, level, &wc->drop_progress); btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, - fs_info->nodesize, parent); + fs_info->nodesize, parent, + btrfs_header_owner(next)); btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid, 0, false); ret = btrfs_free_extent(trans, &ref); diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 88c249c37516..0716f65d9753 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -7,6 +7,7 @@ #include "block-group.h" struct btrfs_free_cluster; +struct btrfs_delayed_ref_head; enum btrfs_extent_allocation_policy { BTRFS_EXTENT_ALLOC_CLUSTERED, @@ -91,8 +92,8 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, enum btrfs_inline_ref_type is_data); u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); -int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count); -void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, u64 min_bytes); +u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); @@ -102,7 +103,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, int reserved); int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes); + const struct extent_buffer *eb); int btrfs_exclude_logged_extents(struct extent_buffer *eb); int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, u64 bytenr, bool strict, @@ -113,6 +114,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, const struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size, + u64 reloc_src_root, enum btrfs_lock_nesting nest); void btrfs_free_tree_block(struct btrfs_trans_handle *trans, u64 root_id, @@ -136,12 +138,15 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct extent_buffer *eb, u64 flags); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); +u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf, int slot); int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, int delalloc); -int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, u64 len); +int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, + const struct extent_buffer *eb); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); -int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, +int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc); int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ac3fca5a5e41..03cef28d9e37 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -21,7 +21,6 @@ #include "ctree.h" #include "btrfs_inode.h" #include "bio.h" -#include "check-integrity.h" #include "locking.h" #include "rcu-string.h" #include "backref.h" @@ -395,7 +394,7 @@ again: /* then test to make sure it is all still delalloc */ ret = test_range_bit(tree, delalloc_start, delalloc_end, - EXTENT_DELALLOC, 1, cached_state); + EXTENT_DELALLOC, cached_state); if (!ret) { unlock_extent(tree, delalloc_start, delalloc_end, &cached_state); @@ -484,10 +483,8 @@ static void end_bio_extent_writepage(struct btrfs_bio *bbio) bvec->bv_offset, bvec->bv_len); btrfs_finish_ordered_extent(bbio->ordered, page, start, len, !error); - if (error) { - btrfs_page_clear_uptodate(fs_info, page, start, len); + if (error) mapping_set_error(page->mapping, error); - } btrfs_page_clear_writeback(fs_info, page, start, len); } @@ -1456,8 +1453,6 @@ done: if (ret) { btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, page_start, PAGE_SIZE, !ret); - btrfs_page_clear_uptodate(btrfs_sb(inode->i_sb), page, - page_start, PAGE_SIZE); mapping_set_error(page->mapping, ret); } unlock_page(page); @@ -1624,8 +1619,6 @@ static void extent_buffer_write_end_io(struct btrfs_bio *bbio) struct page *page = bvec->bv_page; u32 len = bvec->bv_len; - if (!uptodate) - btrfs_page_clear_uptodate(fs_info, page, start, len); btrfs_page_clear_writeback(fs_info, page, start, len); bio_offset += len; } @@ -2201,7 +2194,6 @@ void extent_write_locked_range(struct inode *inode, struct page *locked_page, if (ret) { btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, cur, cur_len, !ret); - btrfs_page_clear_uptodate(fs_info, page, cur, cur_len); mapping_set_error(page->mapping, ret); } btrfs_page_unlock_writer(fs_info, page, cur, cur_len); @@ -2301,7 +2293,7 @@ static int try_release_extent_state(struct extent_io_tree *tree, u64 end = start + PAGE_SIZE - 1; int ret = 1; - if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) { + if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) { ret = 0; } else { u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | @@ -2360,9 +2352,9 @@ int try_release_extent_mapping(struct page *page, gfp_t mask) free_extent_map(em); break; } - if (test_range_bit(tree, em->start, - extent_map_end(em) - 1, - EXTENT_LOCKED, 0, NULL)) + if (test_range_bit_exists(tree, em->start, + extent_map_end(em) - 1, + EXTENT_LOCKED)) goto next; /* * If it's not in the list of modified extents, used @@ -3462,6 +3454,12 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) start, fs_info->nodesize); return -EINVAL; } + if (!IS_ALIGNED(start, fs_info->nodesize) && + !test_and_set_bit(BTRFS_FS_UNALIGNED_TREE_BLOCK, &fs_info->flags)) { + btrfs_warn(fs_info, +"tree block not nodesize aligned, start %llu nodesize %u, can be resolved by a full metadata balance", + start, fs_info->nodesize); + } return 0; } @@ -4002,8 +4000,14 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, char *dst = (char *)dstv; unsigned long i = get_eb_page_index(start); - if (check_eb_range(eb, start, len)) + if (check_eb_range(eb, start, len)) { + /* + * Invalid range hit, reset the memory, so callers won't get + * some random garbage for their uninitialzed memory. + */ + memset(dstv, 0, len); return; + } offset = get_eb_offset_in_page(eb, start); @@ -4249,14 +4253,14 @@ void copy_extent_buffer(const struct extent_buffer *dst, } /* - * eb_bitmap_offset() - calculate the page and offset of the byte containing the - * given bit number - * @eb: the extent buffer - * @start: offset of the bitmap item in the extent buffer - * @nr: bit number - * @page_index: return index of the page in the extent buffer that contains the - * given bit number - * @page_offset: return offset into the page given by page_index + * Calculate the page and offset of the byte containing the given bit number. + * + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @nr: bit number + * @page_index: return index of the page in the extent buffer that contains + * the given bit number + * @page_offset: return offset into the page given by page_index * * This helper hides the ugliness of finding the byte in an extent buffer which * contains a given bit. @@ -4615,7 +4619,8 @@ int try_release_extent_buffer(struct page *page) } /* - * btrfs_readahead_tree_block - attempt to readahead a child block + * Attempt to readahead a child block. + * * @fs_info: the fs_info * @bytenr: bytenr to read * @owner_root: objectid of the root that owns this eb @@ -4654,7 +4659,8 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, } /* - * btrfs_readahead_node_child - readahead a node's child block + * Readahead a node's child block. + * * @node: parent node we're reading from * @slot: slot in the parent node for the child we want to read * diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 68368ba99321..2171057a4477 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -80,16 +80,16 @@ struct extent_buffer { spinlock_t refs_lock; atomic_t refs; int read_mirror; - struct rcu_head rcu_head; - pid_t lock_owner; /* >= 0 if eb belongs to a log tree, -1 otherwise */ s8 log_index; + struct rcu_head rcu_head; struct rw_semaphore lock; struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; #ifdef CONFIG_BTRFS_DEBUG struct list_head leak_list; + pid_t lock_owner; #endif }; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 1ce5dd154499..45cae356e89b 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -194,7 +194,7 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_encryption(leaf, item, 0); btrfs_set_file_extent_other_encoding(leaf, item, 0); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); return ret; @@ -811,11 +811,12 @@ blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio) * This calls btrfs_truncate_item with the correct args based on the overlap, * and fixes up the key as required. */ -static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, +static noinline void truncate_one_csum(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_key *key, u64 bytenr, u64 len) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct extent_buffer *leaf; const u32 csum_size = fs_info->csum_size; u64 csum_end; @@ -836,7 +837,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, */ u32 new_size = (bytenr - key->offset) >> blocksize_bits; new_size *= csum_size; - btrfs_truncate_item(path, new_size, 1); + btrfs_truncate_item(trans, path, new_size, 1); } else if (key->offset >= bytenr && csum_end > end_byte && end_byte > key->offset) { /* @@ -848,10 +849,10 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, u32 new_size = (csum_end - end_byte) >> blocksize_bits; new_size *= csum_size; - btrfs_truncate_item(path, new_size, 0); + btrfs_truncate_item(trans, path, new_size, 0); key->offset = end_byte; - btrfs_set_item_key_safe(fs_info, path, key); + btrfs_set_item_key_safe(trans, path, key); } else { BUG(); } @@ -994,7 +995,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, key.offset = end_byte - 1; } else { - truncate_one_csum(fs_info, path, &key, bytenr, len); + truncate_one_csum(trans, path, &key, bytenr, len); if (key.offset < bytenr) break; } @@ -1202,7 +1203,7 @@ extend_csum: diff /= csum_size; diff *= csum_size; - btrfs_extend_item(path, diff); + btrfs_extend_item(trans, path, diff); ret = 0; goto csum; } @@ -1249,7 +1250,7 @@ found: ins_size /= csum_size; total_bytes += ins_size * fs_info->sectorsize; - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); if (total_bytes < sums->len) { btrfs_release_path(path); cond_resched(); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index ca46a529d56b..f47731c45bb5 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -17,6 +17,7 @@ #include <linux/uio.h> #include <linux/iversion.h> #include <linux/fsverity.h> +#include <linux/iomap.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -368,12 +369,13 @@ next_slot: btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - args->start); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); if (update_refs && disk_bytenr > 0) { btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, - disk_bytenr, num_bytes, 0); + disk_bytenr, num_bytes, 0, + root->root_key.objectid); btrfs_init_data_ref(&ref, root->root_key.objectid, new_key.objectid, @@ -405,13 +407,13 @@ next_slot: memcpy(&new_key, &key, sizeof(new_key)); new_key.offset = args->end; - btrfs_set_item_key_safe(fs_info, path, &new_key); + btrfs_set_item_key_safe(trans, path, &new_key); extent_offset += args->end - key.offset; btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - args->end); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); if (update_refs && disk_bytenr > 0) args->bytes_found += args->end - key.offset; break; @@ -431,7 +433,7 @@ next_slot: btrfs_set_file_extent_num_bytes(leaf, fi, args->start - key.offset); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); if (update_refs && disk_bytenr > 0) args->bytes_found += extent_end - args->start; if (args->end == extent_end) @@ -463,7 +465,8 @@ delete_extent_item: } else if (update_refs && disk_bytenr > 0) { btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, - disk_bytenr, num_bytes, 0); + disk_bytenr, num_bytes, 0, + root->root_key.objectid); btrfs_init_data_ref(&ref, root->root_key.objectid, key.objectid, @@ -536,7 +539,8 @@ delete_extent_item: if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) path->slots[0]++; } - btrfs_setup_item_for_insert(root, path, &key, args->extent_item_size); + btrfs_setup_item_for_insert(trans, root, path, &key, + args->extent_item_size); args->extent_inserted = true; } @@ -593,7 +597,6 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot, int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 start, u64 end) { - struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = inode->root; struct extent_buffer *leaf; struct btrfs_path *path; @@ -664,7 +667,7 @@ again: ino, bytenr, orig_offset, &other_start, &other_end)) { new_key.offset = end; - btrfs_set_item_key_safe(fs_info, path, &new_key); + btrfs_set_item_key_safe(trans, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, @@ -679,7 +682,7 @@ again: trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, end - other_start); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); goto out; } } @@ -698,7 +701,7 @@ again: trans->transid); path->slots[0]++; new_key.offset = start; - btrfs_set_item_key_safe(fs_info, path, &new_key); + btrfs_set_item_key_safe(trans, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -708,7 +711,7 @@ again: other_end - start); btrfs_set_file_extent_offset(leaf, fi, start - orig_offset); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); goto out; } } @@ -742,10 +745,10 @@ again: btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr, - num_bytes, 0); + num_bytes, 0, root->root_key.objectid); btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); @@ -771,7 +774,7 @@ again: other_start = end; other_end = 0; btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, - num_bytes, 0); + num_bytes, 0, root->root_key.objectid); btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset, 0, false); if (extent_mergeable(leaf, path->slots[0] + 1, @@ -814,7 +817,7 @@ again: btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); } else { fi = btrfs_item_ptr(leaf, del_slot - 1, struct btrfs_file_extent_item); @@ -823,7 +826,7 @@ again: btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret < 0) { @@ -1106,6 +1109,26 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode) btrfs_drew_write_unlock(&inode->root->snapshot_lock); } +static void update_time_for_write(struct inode *inode) +{ + struct timespec64 now, ts; + + if (IS_NOCMTIME(inode)) + return; + + now = current_time(inode); + ts = inode_get_mtime(inode); + if (!timespec64_equal(&ts, &now)) + inode_set_mtime_to_ts(inode, now); + + ts = inode_get_ctime(inode); + if (!timespec64_equal(&ts, &now)) + inode_set_ctime_to_ts(inode, now); + + if (IS_I_VERSION(inode)) + inode_inc_iversion(inode); +} + static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count) { @@ -1137,10 +1160,7 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, * need to start yet another transaction to update the inode as we will * update the inode when we finish writing whatever data we write. */ - if (!IS_NOCMTIME(inode)) { - inode->i_mtime = inode_set_ctime_current(inode); - inode_inc_iversion(inode); - } + update_time_for_write(inode); start_pos = round_down(pos, fs_info->sectorsize); oldsize = i_size_read(inode); @@ -1451,8 +1471,13 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_flags & IOCB_NOWAIT) ilock_flags |= BTRFS_ILOCK_TRY; - /* If the write DIO is within EOF, use a shared lock */ - if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode)) + /* + * If the write DIO is within EOF, use a shared lock and also only if + * security bits will likely not be dropped by file_remove_privs() called + * from btrfs_write_check(). Either will need to be rechecked after the + * lock was acquired. + */ + if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode)) ilock_flags |= BTRFS_ILOCK_SHARED; relock: @@ -1460,6 +1485,13 @@ relock: if (err < 0) return err; + /* Shared lock cannot be used with security bits set. */ + if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) { + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + ilock_flags &= ~BTRFS_ILOCK_SHARED; + goto relock; + } + err = generic_write_checks(iocb, from); if (err <= 0) { btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); @@ -1718,7 +1750,7 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx) struct btrfs_inode *inode = BTRFS_I(ctx->inode); struct btrfs_fs_info *fs_info = inode->root->fs_info; - if (btrfs_inode_in_log(inode, fs_info->generation) && + if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) && list_empty(&ctx->ordered_extents)) return true; @@ -1729,7 +1761,7 @@ static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx) * and for a fast fsync we don't wait for that, we only wait for the * writeback to complete. */ - if (inode->last_trans <= fs_info->last_trans_committed && + if (inode->last_trans <= btrfs_get_last_trans_committed(fs_info) && (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) || list_empty(&ctx->ordered_extents))) return true; @@ -1858,7 +1890,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); - smp_mb(); if (skip_inode_logging(&ctx)) { /* * We've had everything committed since the last time we were @@ -2076,7 +2107,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_offset(leaf, fi, 0); btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); goto out; } @@ -2084,7 +2115,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, u64 num_bytes; key.offset = offset; - btrfs_set_item_key_safe(fs_info, path, &key); + btrfs_set_item_key_safe(trans, path, &key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - @@ -2093,7 +2124,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_offset(leaf, fi, 0); btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); goto out; } btrfs_release_path(path); @@ -2245,7 +2276,7 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_num_bytes(leaf, extent, replace_len); if (extent_info->is_new_extent) btrfs_set_file_extent_generation(leaf, extent, trans->transid); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset, @@ -2275,7 +2306,8 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, extent_info->disk_offset, - extent_info->disk_len, 0); + extent_info->disk_len, 0, + root->root_key.objectid); ref_offset = extent_info->file_offset - extent_info->data_offset; btrfs_init_data_ref(&ref, root->root_key.objectid, btrfs_ino(inode), ref_offset, 0, false); @@ -2445,9 +2477,10 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, inode_inc_iversion(&inode->vfs_inode); if (!extent_info || extent_info->update_times) - inode->vfs_inode.i_mtime = inode_set_ctime_current(&inode->vfs_inode); + inode_set_mtime_to_ts(&inode->vfs_inode, + inode_set_ctime_current(&inode->vfs_inode)); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); if (ret) break; @@ -2686,8 +2719,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) ASSERT(trans != NULL); inode_inc_iversion(inode); - inode->i_mtime = inode_set_ctime_current(inode); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); updated_inode = true; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); @@ -2706,14 +2739,14 @@ out_only_mutex: struct timespec64 now = inode_set_ctime_current(inode); inode_inc_iversion(inode); - inode->i_mtime = now; + inode_set_mtime_to_ts(inode, now); trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); } else { int ret2; - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); ret2 = btrfs_end_transaction(trans); if (!ret) ret = ret2; @@ -2780,7 +2813,7 @@ static int btrfs_fallocate_update_isize(struct inode *inode, inode_set_ctime_current(inode); i_size_write(inode, end); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); ret2 = btrfs_end_transaction(trans); return ret ? ret : ret2; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 27fad70451aa..6f93c9a2c3e3 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -57,6 +57,11 @@ static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, u64 bytes, bool update_stats); +static void btrfs_crc32c_final(u32 crc, u8 *result) +{ + put_unaligned_le32(~crc, result); +} + static void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) { struct btrfs_free_space *info; @@ -195,7 +200,7 @@ static int __create_free_space_inode(struct btrfs_root *root, btrfs_set_inode_nlink(leaf, inode_item, 1); btrfs_set_inode_transid(leaf, inode_item, trans->transid); btrfs_set_inode_block_group(leaf, inode_item, offset); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); key.objectid = BTRFS_FREE_SPACE_OBJECTID; @@ -213,7 +218,7 @@ static int __create_free_space_inode(struct btrfs_root *root, struct btrfs_free_space_header); memzero_extent_buffer(leaf, (unsigned long)header, sizeof(*header)); btrfs_set_free_space_key(leaf, header, &disk_key); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); return 0; @@ -354,7 +359,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, if (ret) goto fail; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); fail: if (locked) @@ -540,7 +545,7 @@ static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) if (index == 0) offset = sizeof(u32) * io_ctl->num_pages; - crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); + crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); btrfs_crc32c_final(crc, (u8 *)&crc); io_ctl_unmap_page(io_ctl); tmp = page_address(io_ctl->pages[0]); @@ -562,7 +567,7 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) val = *tmp; io_ctl_map_page(io_ctl, 0); - crc = btrfs_crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); + crc = crc32c(crc, io_ctl->orig + offset, PAGE_SIZE - offset); btrfs_crc32c_final(crc, (u8 *)&crc); if (val != crc) { btrfs_err_rl(io_ctl->fs_info, @@ -1185,7 +1190,7 @@ update_cache_item(struct btrfs_trans_handle *trans, btrfs_set_free_space_entries(leaf, header, entries); btrfs_set_free_space_bitmaps(leaf, header, bitmaps); btrfs_set_free_space_generation(leaf, header, trans->transid); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); return 0; @@ -1321,7 +1326,7 @@ out: "failed to write free space cache for block group %llu error %d", block_group->start, ret); } - btrfs_update_inode(trans, root, BTRFS_I(inode)); + btrfs_update_inode(trans, BTRFS_I(inode)); if (block_group) { /* the dirty list is protected by the dirty_bgs_lock */ @@ -1362,7 +1367,6 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, /* * Write out cached info to an inode. * - * @root: root the inode belongs to * @inode: freespace inode we are writing out * @ctl: free space cache we are going to write out * @block_group: block_group for this cache if it belongs to a block_group @@ -1373,7 +1377,7 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, * on mount. This will return 0 if it was successful in writing the cache out, * or an errno if it was not. */ -static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, +static int __btrfs_write_out_cache(struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_block_group *block_group, struct btrfs_io_ctl *io_ctl, @@ -1506,7 +1510,7 @@ out: invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; } - btrfs_update_inode(trans, root, BTRFS_I(inode)); + btrfs_update_inode(trans, BTRFS_I(inode)); if (must_iput) iput(inode); return ret; @@ -1532,8 +1536,8 @@ int btrfs_write_out_cache(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) return 0; - ret = __btrfs_write_out_cache(fs_info->tree_root, inode, ctl, - block_group, &block_group->io_ctl, trans); + ret = __btrfs_write_out_cache(inode, ctl, block_group, + &block_group->io_ctl, trans); if (ret) { btrfs_debug(fs_info, "failed to write free space cache for block group %llu error %d", diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index c0e734082dcc..7b598b070700 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -89,7 +89,7 @@ static int add_new_free_space_info(struct btrfs_trans_handle *trans, struct btrfs_free_space_info); btrfs_set_free_space_extent_count(leaf, info, 0); btrfs_set_free_space_flags(leaf, info, 0); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); ret = 0; out: @@ -287,7 +287,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, flags |= BTRFS_FREE_SPACE_USING_BITMAPS; btrfs_set_free_space_flags(leaf, info, flags); expected_extent_count = btrfs_free_space_extent_count(leaf, info); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); if (extent_count != expected_extent_count) { @@ -324,7 +324,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); write_extent_buffer(leaf, bitmap_cursor, ptr, data_size); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); i += extent_size; @@ -430,7 +430,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS; btrfs_set_free_space_flags(leaf, info, flags); expected_extent_count = btrfs_free_space_extent_count(leaf, info); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); nrbits = block_group->length >> block_group->fs_info->sectorsize_bits; @@ -495,7 +495,7 @@ static int update_free_space_extent_count(struct btrfs_trans_handle *trans, extent_count += new_extents; btrfs_set_free_space_extent_count(path->nodes[0], info, extent_count); - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) && @@ -533,7 +533,8 @@ int free_space_test_bit(struct btrfs_block_group *block_group, return !!extent_buffer_test_bit(leaf, ptr, i); } -static void free_space_set_bits(struct btrfs_block_group *block_group, +static void free_space_set_bits(struct btrfs_trans_handle *trans, + struct btrfs_block_group *block_group, struct btrfs_path *path, u64 *start, u64 *size, int bit) { @@ -563,7 +564,7 @@ static void free_space_set_bits(struct btrfs_block_group *block_group, extent_buffer_bitmap_set(leaf, ptr, first, last - first); else extent_buffer_bitmap_clear(leaf, ptr, first, last - first); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); *size -= end - *start; *start = end; @@ -656,7 +657,7 @@ static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, cur_start = start; cur_size = size; while (1) { - free_space_set_bits(block_group, path, &cur_start, &cur_size, + free_space_set_bits(trans, block_group, path, &cur_start, &cur_size, !remove); if (cur_size == 0) break; diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index a523d64d5491..318df6f9d9cb 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -139,6 +139,12 @@ enum { */ BTRFS_FS_FEATURE_CHANGED, + /* + * Indicate that we have found a tree block which is only aligned to + * sectorsize, but not to nodesize. This should be rare nowadays. + */ + BTRFS_FS_UNALIGNED_TREE_BLOCK, + #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, @@ -171,19 +177,17 @@ enum { BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16), BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17), BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18), - BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19), - BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20), - BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21), - BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22), - BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23), - BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24), - BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25), - BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26), - BTRFS_MOUNT_REF_VERIFY = (1UL << 27), - BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28), - BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29), - BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30), - BTRFS_MOUNT_NODISCARD = (1UL << 31), + BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 19), + BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 20), + BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 21), + BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 22), + BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 23), + BTRFS_MOUNT_NOLOGREPLAY = (1UL << 24), + BTRFS_MOUNT_REF_VERIFY = (1UL << 25), + BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 26), + BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 27), + BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 28), + BTRFS_MOUNT_NODISCARD = (1UL << 29), }; /* @@ -216,7 +220,8 @@ enum { BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ - BTRFS_FEATURE_INCOMPAT_ZONED) + BTRFS_FEATURE_INCOMPAT_ZONED | \ + BTRFS_FEATURE_INCOMPAT_SIMPLE_QUOTA) #ifdef CONFIG_BTRFS_DEBUG /* @@ -225,6 +230,7 @@ enum { */ #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_SUPP_STABLE | \ + BTRFS_FEATURE_INCOMPAT_RAID_STRIPE_TREE | \ BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) #else @@ -369,6 +375,7 @@ struct btrfs_fs_info { struct btrfs_root *uuid_root; struct btrfs_root *data_reloc_root; struct btrfs_root *block_group_root; + struct btrfs_root *stripe_root; /* The log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; @@ -409,7 +416,17 @@ struct btrfs_fs_info { struct btrfs_block_rsv empty_block_rsv; + /* + * Updated while holding the lock 'trans_lock'. Due to the life cycle of + * a transaction, it can be directly read while holding a transaction + * handle, everywhere else must be read with btrfs_get_fs_generation(). + * Should always be updated using btrfs_set_fs_generation(). + */ u64 generation; + /* + * Always use btrfs_get_last_trans_committed() and + * btrfs_set_last_trans_committed() to read and update this field. + */ u64 last_trans_committed; /* * Generation of the last transaction used for block group relocation @@ -645,9 +662,6 @@ struct btrfs_fs_info { struct btrfs_discard_ctl discard_ctl; -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - u32 check_integrity_print_mask; -#endif /* Is qgroup tracking in a consistent state? */ u64 qgroup_flags; @@ -683,6 +697,7 @@ struct btrfs_fs_info { /* Protected by qgroup_rescan_lock */ bool qgroup_rescan_running; u8 qgroup_drop_subtree_thres; + u64 qgroup_enable_gen; /* * If this is not 0, then it indicates a serious filesystem error has @@ -812,6 +827,26 @@ struct btrfs_fs_info { #endif }; +static inline u64 btrfs_get_fs_generation(const struct btrfs_fs_info *fs_info) +{ + return READ_ONCE(fs_info->generation); +} + +static inline void btrfs_set_fs_generation(struct btrfs_fs_info *fs_info, u64 gen) +{ + WRITE_ONCE(fs_info->generation, gen); +} + +static inline u64 btrfs_get_last_trans_committed(const struct btrfs_fs_info *fs_info) +{ + return READ_ONCE(fs_info->last_trans_committed); +} + +static inline void btrfs_set_last_trans_committed(struct btrfs_fs_info *fs_info, u64 gen) +{ + WRITE_ONCE(fs_info->last_trans_committed, gen); +} + static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info, u64 gen) { diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 4c322b720a80..7d734830e514 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -167,7 +167,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, memmove_extent_buffer(leaf, ptr, ptr + del_len, item_size - (ptr + del_len - item_start)); - btrfs_truncate_item(path, item_size - del_len, 1); + btrfs_truncate_item(trans, path, item_size - del_len, 1); out: btrfs_free_path(path); @@ -229,7 +229,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_size - (ptr + sub_item_len - item_start)); - btrfs_truncate_item(path, item_size - sub_item_len, 1); + btrfs_truncate_item(trans, path, item_size - sub_item_len, 1); out: btrfs_free_path(path); @@ -247,7 +247,7 @@ out: } /* - * btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree. + * Insert an extended inode ref into a tree. * * The caller must have checked against BTRFS_LINK_MAX already. */ @@ -282,7 +282,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, name)) goto out; - btrfs_extend_item(path, ins_len); + btrfs_extend_item(trans, path, ins_len); ret = 0; } if (ret < 0) @@ -299,7 +299,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, ptr = (unsigned long)&extref->name; write_extent_buffer(path->nodes[0], name->name, ptr, name->len); - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); out: btrfs_free_path(path); @@ -338,7 +338,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, goto out; old_size = btrfs_item_size(path->nodes[0], path->slots[0]); - btrfs_extend_item(path, ins_len); + btrfs_extend_item(trans, path, ins_len); ref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); @@ -364,7 +364,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, ptr = (unsigned long)(ref + 1); } write_extent_buffer(path->nodes[0], name->name, ptr, name->len); - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); out: btrfs_free_path(path); @@ -591,7 +591,7 @@ search_again: num_dec = (orig_num_bytes - extent_num_bytes); if (extent_start != 0) control->sub_bytes += num_dec; - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); } else { extent_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); @@ -617,7 +617,7 @@ search_again: btrfs_set_file_extent_ram_bytes(leaf, fi, size); size = btrfs_file_extent_calc_inline_size(size); - btrfs_truncate_item(path, size, 1); + btrfs_truncate_item(trans, path, size, 1); } else if (!del_item) { /* * We have to bail so the last_size is set to @@ -676,7 +676,8 @@ delete: bytes_deleted += extent_num_bytes; btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, - extent_start, extent_num_bytes, 0); + extent_start, extent_num_bytes, 0, + root->root_key.objectid); btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), control->ino, extent_offset, root->root_key.objectid, false); diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h index ede43b6c6559..4337bb26f419 100644 --- a/fs/btrfs/inode-item.h +++ b/fs/btrfs/inode-item.h @@ -4,6 +4,7 @@ #define BTRFS_INODE_ITEM_H #include <linux/types.h> +#include <linux/crc32c.h> struct btrfs_trans_handle; struct btrfs_root; @@ -12,6 +13,7 @@ struct btrfs_key; struct btrfs_inode_extref; struct btrfs_inode; struct extent_buffer; +struct fscrypt_str; /* * Return this if we need to call truncate_block for the last bit of the @@ -76,6 +78,12 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, *ro_flags = (u32)(inode_item_flags >> 32); } +/* Figure the key offset of an extended inode ref. */ +static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name, int len) +{ + return (u64)crc32c(parent_objectid, name, len); +} + int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_truncate_control *control); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f09fbdc43f0f..5e3fccddde0c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -71,6 +71,7 @@ #include "super.h" #include "orphan.h" #include "backref.h" +#include "raid-stripe-tree.h" struct btrfs_iget_args { u64 ino; @@ -348,7 +349,7 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, } /* - * btrfs_inode_lock - lock inode i_rwsem based on arguments passed + * Lock inode i_rwsem based on arguments passed. * * ilock_flags can have the following bit set: * @@ -382,7 +383,7 @@ int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags) } /* - * btrfs_inode_unlock - unock inode i_rwsem + * Unock inode i_rwsem. * * ilock_flags should contain the same bits set as passed to btrfs_inode_lock() * to decide whether the lock acquired is shared or exclusive. @@ -573,7 +574,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, kunmap_local(kaddr); put_page(page); } - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* @@ -670,7 +671,7 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, } btrfs_update_inode_bytes(inode, size, drop_args.bytes_found); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); goto out; @@ -1085,9 +1086,6 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, btrfs_mark_ordered_io_finished(inode, locked_page, page_start, PAGE_SIZE, !ret); - btrfs_page_clear_uptodate(inode->root->fs_info, - locked_page, page_start, - PAGE_SIZE); mapping_set_error(locked_page->mapping, ret); unlock_page(locked_page); } @@ -1568,8 +1566,11 @@ out_unlock: * Phase two of compressed writeback. This is the ordered portion of the code, * which only gets called in the order the work was queued. We walk all the * async extents created by compress_file_range and send them down to the disk. + * + * If called with @do_free == true then it'll try to finish the work and free + * the work struct eventually. */ -static noinline void submit_compressed_extents(struct btrfs_work *work) +static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_free) { struct async_chunk *async_chunk = container_of(work, struct async_chunk, work); @@ -1578,6 +1579,21 @@ static noinline void submit_compressed_extents(struct btrfs_work *work) unsigned long nr_pages; u64 alloc_hint = 0; + if (do_free) { + struct async_chunk *async_chunk; + struct async_cow *async_cow; + + async_chunk = container_of(work, struct async_chunk, work); + btrfs_add_delayed_iput(async_chunk->inode); + if (async_chunk->blkcg_css) + css_put(async_chunk->blkcg_css); + + async_cow = async_chunk->async_cow; + if (atomic_dec_and_test(&async_cow->num_chunks)) + kvfree(async_cow); + return; + } + nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >> PAGE_SHIFT; @@ -1594,21 +1610,6 @@ static noinline void submit_compressed_extents(struct btrfs_work *work) cond_wake_up_nomb(&fs_info->async_submit_wait); } -static noinline void async_cow_free(struct btrfs_work *work) -{ - struct async_chunk *async_chunk; - struct async_cow *async_cow; - - async_chunk = container_of(work, struct async_chunk, work); - btrfs_add_delayed_iput(async_chunk->inode); - if (async_chunk->blkcg_css) - css_put(async_chunk->blkcg_css); - - async_cow = async_chunk->async_cow; - if (atomic_dec_and_test(&async_cow->num_chunks)) - kvfree(async_cow); -} - static bool run_delalloc_compressed(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, struct writeback_control *wbc) @@ -1686,7 +1687,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, } btrfs_init_work(&async_chunk[i].work, compress_file_range, - submit_compressed_extents, async_cow_free); + submit_compressed_extents); nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE); atomic_add(nr_pages, &fs_info->async_delalloc_pages); @@ -2238,8 +2239,7 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) { if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) { if (inode->defrag_bytes && - test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG, - 0, NULL)) + test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG)) return false; return true; } @@ -2791,7 +2791,6 @@ out_page: mapping_set_error(page->mapping, ret); btrfs_mark_ordered_io_finished(inode, page, page_start, PAGE_SIZE, !ret); - btrfs_page_clear_uptodate(fs_info, page, page_start, PAGE_SIZE); clear_page_dirty_for_io(page); } btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); @@ -2851,7 +2850,7 @@ int btrfs_writepage_cow_fixup(struct page *page) ihold(inode); btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE); get_page(page); - btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); + btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL); fixup->page = page; fixup->inode = BTRFS_I(inode); btrfs_queue_work(fs_info->fixup_workers, &fixup->work); @@ -2916,7 +2915,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(struct btrfs_file_extent_item)); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* @@ -3074,7 +3073,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } trans->block_rsv = &inode->block_rsv; - ret = btrfs_update_inode_fallback(trans, root, inode); + ret = btrfs_update_inode_fallback(trans, inode); if (ret) /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); goto out; @@ -3095,6 +3094,10 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) trans->block_rsv = &inode->block_rsv; + ret = btrfs_insert_raid_extent(trans, ordered_extent); + if (ret) + goto out; + if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { @@ -3140,7 +3143,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) &cached_state); btrfs_inode_safe_disk_i_size_write(inode, 0); - ret = btrfs_update_inode_fallback(trans, root, inode); + ret = btrfs_update_inode_fallback(trans, inode); if (ret) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); goto out; @@ -3228,7 +3231,8 @@ out: int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) { if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) && - !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) + !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && + list_empty(&ordered->bioc_list)) btrfs_finish_ordered_zoned(ordered); return btrfs_finish_one_ordered(ordered); } @@ -3286,7 +3290,7 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, if (btrfs_is_data_reloc_root(inode->root) && test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, - 1, NULL)) { + NULL)) { /* Skip the range without csum for data reloc inode */ clear_extent_bits(&inode->io_tree, file_offset, end, EXTENT_NODATASUM); @@ -3310,7 +3314,7 @@ zeroit: } /* - * btrfs_add_delayed_iput - perform a delayed iput on @inode + * Perform a delayed iput on @inode. * * @inode: The inode we want to perform iput on * @@ -3758,19 +3762,17 @@ static int btrfs_read_locked_inode(struct inode *inode, btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, round_up(i_size_read(inode), fs_info->sectorsize)); - inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime); - inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime); + inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime), + btrfs_timespec_nsec(leaf, &inode_item->atime)); - inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime); - inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime); + inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime), + btrfs_timespec_nsec(leaf, &inode_item->mtime)); inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime), btrfs_timespec_nsec(leaf, &inode_item->ctime)); - BTRFS_I(inode)->i_otime.tv_sec = - btrfs_timespec_sec(leaf, &inode_item->otime); - BTRFS_I(inode)->i_otime.tv_nsec = - btrfs_timespec_nsec(leaf, &inode_item->otime); + BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime); + BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime); inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); @@ -3796,7 +3798,7 @@ cache_index: * This is required for both inode re-read from disk and delayed inode * in delayed_nodes_tree. */ - if (BTRFS_I(inode)->last_trans == fs_info->generation) + if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info)) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); @@ -3926,24 +3928,22 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); btrfs_set_token_timespec_sec(&token, &item->atime, - inode->i_atime.tv_sec); + inode_get_atime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->atime, - inode->i_atime.tv_nsec); + inode_get_atime_nsec(inode)); btrfs_set_token_timespec_sec(&token, &item->mtime, - inode->i_mtime.tv_sec); + inode_get_mtime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->mtime, - inode->i_mtime.tv_nsec); + inode_get_mtime_nsec(inode)); btrfs_set_token_timespec_sec(&token, &item->ctime, - inode_get_ctime(inode).tv_sec); + inode_get_ctime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->ctime, - inode_get_ctime(inode).tv_nsec); + inode_get_ctime_nsec(inode)); - btrfs_set_token_timespec_sec(&token, &item->otime, - BTRFS_I(inode)->i_otime.tv_sec); - btrfs_set_token_timespec_nsec(&token, &item->otime, - BTRFS_I(inode)->i_otime.tv_nsec); + btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime_sec); + btrfs_set_token_timespec_nsec(&token, &item->otime, BTRFS_I(inode)->i_otime_nsec); btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode)); btrfs_set_token_inode_generation(&token, item, @@ -3961,8 +3961,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, * copy everything in the in-memory inode into the btree. */ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_inode *inode) + struct btrfs_inode *inode) { struct btrfs_inode_item *inode_item; struct btrfs_path *path; @@ -3973,7 +3972,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1); + ret = btrfs_lookup_inode(trans, inode->root, path, &inode->location, 1); if (ret) { if (ret > 0) ret = -ENOENT; @@ -3985,7 +3984,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item); fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_set_inode_last_trans(trans, inode); ret = 0; failed: @@ -3996,10 +3995,10 @@ failed: /* * copy everything in the in-memory inode into the btree. */ -noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_inode *inode) +int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; int ret; @@ -4015,23 +4014,23 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { btrfs_update_root_times(trans, root); - ret = btrfs_delayed_update_inode(trans, root, inode); + ret = btrfs_delayed_update_inode(trans, inode); if (!ret) btrfs_set_inode_last_trans(trans, inode); return ret; } - return btrfs_update_inode_item(trans, root, inode); + return btrfs_update_inode_item(trans, inode); } int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode) + struct btrfs_inode *inode) { int ret; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); if (ret == -ENOSPC) - return btrfs_update_inode_item(trans, root, inode); + return btrfs_update_inode_item(trans, inode); return ret; } @@ -4136,9 +4135,8 @@ err: btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2); inode_inc_iversion(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); - inode_set_ctime_current(&inode->vfs_inode); - dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode); - ret = btrfs_update_inode(trans, root, dir); + inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode)); + ret = btrfs_update_inode(trans, dir); out: return ret; } @@ -4152,7 +4150,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL); if (!ret) { drop_nlink(&inode->vfs_inode); - ret = btrfs_update_inode(trans, inode->root, inode); + ret = btrfs_update_inode(trans, inode); } return ret; } @@ -4310,8 +4308,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2); inode_inc_iversion(&dir->vfs_inode); - dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode); - ret = btrfs_update_inode_fallback(trans, root, dir); + inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode)); + ret = btrfs_update_inode_fallback(trans, dir); if (ret) btrfs_abort_transaction(trans, ret); out: @@ -4645,7 +4643,8 @@ out_notrans: } /* - * btrfs_truncate_block - read, zero a chunk and write a block + * Read, zero a chunk and write a block. + * * @inode - inode that we're zeroing * @from - the offset to start zeroing * @len - the length to zero, 0 to zero the entire range respective to the @@ -4795,9 +4794,9 @@ out: return ret; } -static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode, - u64 offset, u64 len) +static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; struct btrfs_drop_extents_args drop_args = { 0 }; @@ -4837,7 +4836,7 @@ static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode, btrfs_abort_transaction(trans, ret); } else { btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found); - btrfs_update_inode(trans, root, inode); + btrfs_update_inode(trans, inode); } btrfs_end_transaction(trans); return ret; @@ -4893,8 +4892,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { struct extent_map *hole_em; - err = maybe_insert_hole(root, inode, cur_offset, - hole_size); + err = maybe_insert_hole(inode, cur_offset, hole_size); if (err) break; @@ -4920,7 +4918,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) hole_em->orig_block_len = 0; hole_em->ram_bytes = hole_size; hole_em->compress_type = BTRFS_COMPRESS_NONE; - hole_em->generation = fs_info->generation; + hole_em->generation = btrfs_get_fs_generation(fs_info); err = btrfs_replace_extent_map_range(inode, hole_em, true); free_extent_map(hole_em); @@ -4960,7 +4958,8 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) if (newsize != oldsize) { inode_inc_iversion(inode); if (!(mask & (ATTR_CTIME | ATTR_MTIME))) { - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, + inode_set_ctime_current(inode)); } } @@ -4988,7 +4987,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) i_size_write(inode, newsize); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); pagecache_isize_extended(inode, oldsize, newsize); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_end_transaction(trans); } else { @@ -5586,6 +5585,7 @@ static struct inode *new_simple_dir(struct inode *dir, struct btrfs_key *key, struct btrfs_root *root) { + struct timespec64 ts; struct inode *inode = new_inode(dir->i_sb); if (!inode) @@ -5604,9 +5604,13 @@ static struct inode *new_simple_dir(struct inode *dir, inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &simple_dir_operations; inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; - inode->i_mtime = inode_set_ctime_current(inode); - inode->i_atime = dir->i_atime; - BTRFS_I(inode)->i_otime = inode->i_mtime; + + ts = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, ts); + inode_set_atime_to_ts(inode, inode_get_atime(dir)); + BTRFS_I(inode)->i_otime_sec = ts.tv_sec; + BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec; + inode->i_uid = dir->i_uid; inode->i_gid = dir->i_gid; @@ -5769,20 +5773,24 @@ out: static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index) { - if (dir->index_cnt == (u64)-1) { - int ret; + int ret = 0; + btrfs_inode_lock(dir, 0); + if (dir->index_cnt == (u64)-1) { ret = btrfs_inode_delayed_dir_index_count(dir); if (ret) { ret = btrfs_set_inode_index_count(dir); if (ret) - return ret; + goto out; } } - *index = dir->index_cnt; + /* index_cnt is the index number of next new entry, so decrement it. */ + *index = dir->index_cnt - 1; +out: + btrfs_inode_unlock(dir, 0); - return 0; + return ret; } /* @@ -5817,6 +5825,19 @@ static int btrfs_opendir(struct inode *inode, struct file *file) return 0; } +static loff_t btrfs_dir_llseek(struct file *file, loff_t offset, int whence) +{ + struct btrfs_file_private *private = file->private_data; + int ret; + + ret = btrfs_get_dir_last_index(BTRFS_I(file_inode(file)), + &private->last_index); + if (ret) + return ret; + + return generic_file_llseek(file, offset, whence); +} + struct dir_entry { u64 ino; u64 offset; @@ -5987,15 +6008,15 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode) if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_update_inode(trans, root, inode); - if (ret && (ret == -ENOSPC || ret == -EDQUOT)) { + ret = btrfs_update_inode(trans, inode); + if (ret == -ENOSPC || ret == -EDQUOT) { /* whoops, lets try again with the full transaction */ btrfs_end_transaction(trans); trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); } btrfs_end_transaction(trans); if (inode->delayed_node) @@ -6011,7 +6032,7 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode) static int btrfs_update_time(struct inode *inode, int flags) { struct btrfs_root *root = BTRFS_I(inode)->root; - bool dirty = flags & ~S_VERSION; + bool dirty; if (btrfs_root_readonly(root)) return -EROFS; @@ -6147,6 +6168,7 @@ static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode * int btrfs_create_new_inode(struct btrfs_trans_handle *trans, struct btrfs_new_inode_args *args) { + struct timespec64 ts; struct inode *dir = args->dir; struct inode *inode = args->inode; const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name; @@ -6264,9 +6286,9 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, goto discard; } - inode->i_mtime = inode_set_ctime_current(inode); - inode->i_atime = inode->i_mtime; - BTRFS_I(inode)->i_otime = inode->i_mtime; + ts = simple_inode_init_ts(inode); + BTRFS_I(inode)->i_otime_sec = ts.tv_sec; + BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec; /* * We're going to fill the inode item now, so at this point the inode @@ -6297,7 +6319,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, } } - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); /* * We don't need the path anymore, plus inheriting properties, adding * ACLs, security xattrs, orphan item or adding the link, will result in @@ -6431,10 +6453,10 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, * values (the ones it had when the fsync was done). */ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) - parent_inode->vfs_inode.i_mtime = - inode_set_ctime_current(&parent_inode->vfs_inode); + inode_set_mtime_to_ts(&parent_inode->vfs_inode, + inode_set_ctime_current(&parent_inode->vfs_inode)); - ret = btrfs_update_inode(trans, root, parent_inode); + ret = btrfs_update_inode(trans, parent_inode); if (ret) btrfs_abort_transaction(trans, ret); return ret; @@ -6585,7 +6607,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, } else { struct dentry *parent = dentry->d_parent; - err = btrfs_update_inode(trans, root, BTRFS_I(inode)); + err = btrfs_update_inode(trans, BTRFS_I(inode)); if (err) goto fail; if (inode->i_nlink == 1) { @@ -7090,8 +7112,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, range_end = round_up(offset + nocow_args.num_bytes, root->fs_info->sectorsize) - 1; - ret = test_range_bit(io_tree, offset, range_end, - EXTENT_DELALLOC, 0, NULL); + ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC); if (ret) { ret = -EAGAIN; goto out; @@ -7992,11 +8013,11 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, &cached_state); - spin_lock_irq(&inode->ordered_tree.lock); + spin_lock_irq(&inode->ordered_tree_lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); ordered->truncated_len = min(ordered->truncated_len, cur - ordered->file_offset); - spin_unlock_irq(&inode->ordered_tree.lock); + spin_unlock_irq(&inode->ordered_tree_lock); /* * If the ordered extent has finished, we're safe to delete all @@ -8326,7 +8347,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) if (ret != -ENOSPC && ret != -EAGAIN) break; - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); if (ret) break; @@ -8379,7 +8400,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) int ret2; trans->block_rsv = &fs_info->trans_block_rsv; - ret2 = btrfs_update_inode(trans, root, inode); + ret2 = btrfs_update_inode(trans, inode); if (ret2 && !ret) ret = ret2; @@ -8468,8 +8489,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->delayed_node = NULL; - ei->i_otime.tv_sec = 0; - ei->i_otime.tv_nsec = 0; + ei->i_otime_sec = 0; + ei->i_otime_nsec = 0; inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); @@ -8478,7 +8499,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) extent_io_tree_init(fs_info, &ei->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); mutex_init(&ei->log_mutex); - btrfs_ordered_inode_tree_init(&ei->ordered_tree); + spin_lock_init(&ei->ordered_tree_lock); + ei->ordered_tree = RB_ROOT; + ei->ordered_tree_last = NULL; INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->delayed_iput); RB_CLEAR_NODE(&ei->rb_node); @@ -8621,8 +8644,8 @@ static int btrfs_getattr(struct mnt_idmap *idmap, u32 bi_ro_flags = BTRFS_I(inode)->ro_flags; stat->result_mask |= STATX_BTIME; - stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec; - stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec; + stat->btime.tv_sec = BTRFS_I(inode)->i_otime_sec; + stat->btime.tv_nsec = BTRFS_I(inode)->i_otime_nsec; if (bi_flags & BTRFS_INODE_APPEND) stat->attributes |= STATX_ATTR_APPEND; if (bi_flags & BTRFS_INODE_COMPRESS) @@ -8810,7 +8833,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(old_dentry->d_inode), old_name, &old_rename_ctx); if (!ret) - ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); + ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); @@ -8825,7 +8848,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(new_dentry->d_inode), new_name, &new_rename_ctx); if (!ret) - ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode)); + ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); @@ -9070,7 +9093,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, BTRFS_I(d_inode(old_dentry)), &old_fname.disk_name, &rename_ctx); if (!ret) - ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); + ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); @@ -9195,7 +9218,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode init_completion(&work->completion); INIT_LIST_HEAD(&work->list); work->inode = inode; - btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL); + btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL); return work; } @@ -9433,7 +9456,7 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, ptr = btrfs_file_extent_inline_start(ei); write_extent_buffer(leaf, symname, ptr, name_len); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_free_path(path); d_instantiate_new(dentry, inode); @@ -9626,7 +9649,7 @@ next: btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); } - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) { btrfs_abort_transaction(trans, ret); @@ -10868,7 +10891,7 @@ static const struct inode_operations btrfs_dir_inode_operations = { }; static const struct file_operations btrfs_dir_file_operations = { - .llseek = generic_file_llseek, + .llseek = btrfs_dir_llseek, .read = generic_read_dir, .iterate_shared = btrfs_real_readdir, .open = btrfs_opendir, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a18ee7b5a166..752acff2c734 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -385,7 +385,7 @@ update_flags: btrfs_sync_inode_flags_to_i_flags(inode); inode_inc_iversion(inode); inode_set_ctime_current(inode); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); out_end_trans: btrfs_end_transaction(trans); @@ -652,18 +652,18 @@ static noinline int create_subvol(struct mnt_idmap *idmap, /* Tree log can't currently deal with an inode which is a new root. */ btrfs_set_log_full_commit(trans); - ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit); + ret = btrfs_qgroup_inherit(trans, 0, objectid, root->root_key.objectid, inherit); if (ret) goto out; leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, - BTRFS_NESTING_NORMAL); + 0, BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto out; } - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); inode_item = &root_item->inode; btrfs_set_stack_inode_generation(inode_item, 1); @@ -1958,6 +1958,13 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, goto out_put; } + /* + * We don't need the path anymore, so release it and + * avoid deadlocks and lockdep warnings in case + * btrfs_iget() needs to lookup the inode from its root + * btree and lock the same leaf. + */ + btrfs_release_path(path); temp_inode = btrfs_iget(sb, key2.objectid, root); if (IS_ERR(temp_inode)) { ret = PTR_ERR(temp_inode); @@ -1978,7 +1985,6 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, goto out_put; } - btrfs_release_path(path); key.objectid = key.offset; key.offset = (u64)-1; dirid = key.objectid; @@ -2629,6 +2635,12 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) return -EINVAL; } + if (fs_info->fs_devices->temp_fsid) { + btrfs_err(fs_info, + "device add not supported on cloned temp-fsid mount"); + return -EINVAL; + } + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) { if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD)) return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; @@ -2670,8 +2682,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args_v2 *vol_args; - struct block_device *bdev = NULL; - void *holder; + struct bdev_handle *bdev_handle = NULL; int ret; bool cancel = false; @@ -2708,7 +2719,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto err_drop; /* Exclusive operation is now claimed */ - ret = btrfs_rm_device(fs_info, &args, &bdev, &holder); + ret = btrfs_rm_device(fs_info, &args, &bdev_handle); btrfs_exclop_finish(fs_info); @@ -2722,8 +2733,8 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) } err_drop: mnt_drop_write_file(file); - if (bdev) - blkdev_put(bdev, holder); + if (bdev_handle) + bdev_release(bdev_handle); out: btrfs_put_dev_args_from_path(&args); kfree(vol_args); @@ -2736,8 +2747,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args *vol_args; - struct block_device *bdev = NULL; - void *holder; + struct bdev_handle *bdev_handle = NULL; int ret; bool cancel = false; @@ -2764,15 +2774,15 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, cancel); if (ret == 0) { - ret = btrfs_rm_device(fs_info, &args, &bdev, &holder); + ret = btrfs_rm_device(fs_info, &args, &bdev_handle); if (!ret) btrfs_info(fs_info, "disk deleted %s", vol_args->name); btrfs_exclop_finish(fs_info); } mnt_drop_write_file(file); - if (bdev) - blkdev_put(bdev, holder); + if (bdev_handle) + bdev_release(bdev_handle); out: btrfs_put_dev_args_from_path(&args); kfree(vol_args); @@ -2816,7 +2826,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, } if (flags_in & BTRFS_FS_INFO_FLAG_GENERATION) { - fi_args->generation = fs_info->generation; + fi_args->generation = btrfs_get_fs_generation(fs_info); fi_args->flags |= BTRFS_FS_INFO_FLAG_GENERATION; } @@ -2941,7 +2951,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key); btrfs_set_dir_item_key(path->nodes[0], di, &disk_key); - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL); @@ -2972,7 +2982,7 @@ static void get_block_group_info(struct list_head *groups_list, static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, void __user *arg) { - struct btrfs_ioctl_space_args space_args; + struct btrfs_ioctl_space_args space_args = { 0 }; struct btrfs_ioctl_space_info space; struct btrfs_ioctl_space_info *dest; struct btrfs_ioctl_space_info *dest_orig; @@ -3125,7 +3135,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, return PTR_ERR(trans); /* No running transaction, don't bother */ - transid = root->fs_info->last_trans_committed; + transid = btrfs_get_last_trans_committed(root->fs_info); goto out; } transid = trans->transid; @@ -3691,7 +3701,8 @@ static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg) switch (sa->cmd) { case BTRFS_QUOTA_CTL_ENABLE: - ret = btrfs_quota_enable(fs_info); + case BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA: + ret = btrfs_quota_enable(fs_info, sa); break; case BTRFS_QUOTA_CTL_DISABLE: ret = btrfs_quota_disable(fs_info); @@ -4332,7 +4343,7 @@ static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat if (compat) { #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) - struct btrfs_ioctl_send_args_32 args32; + struct btrfs_ioctl_send_args_32 args32 = { 0 }; ret = copy_from_user(&args32, argp, sizeof(args32)); if (ret) diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 7979449a58d6..74d8e2003f58 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -8,6 +8,7 @@ #include <linux/spinlock.h> #include <linux/page-flags.h> #include <asm/bug.h> +#include <trace/events/btrfs.h> #include "misc.h" #include "ctree.h" #include "extent_io.h" @@ -73,6 +74,7 @@ static struct btrfs_lockdep_keyset { { .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") }, { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") }, { .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") }, + { .id = BTRFS_RAID_STRIPE_TREE_OBJECTID, DEFINE_NAME("raid-stripe") }, { .id = 0, DEFINE_NAME("tree") }, }; @@ -102,6 +104,15 @@ void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buff #endif +#ifdef CONFIG_BTRFS_DEBUG +static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner) +{ + eb->lock_owner = owner; +} +#else +static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner) { } +#endif + /* * Extent buffer locking * ===================== @@ -164,7 +175,7 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb) int btrfs_try_tree_write_lock(struct extent_buffer *eb) { if (down_write_trylock(&eb->lock)) { - eb->lock_owner = current->pid; + btrfs_set_eb_lock_owner(eb, current->pid); trace_btrfs_try_tree_write_lock(eb); return 1; } @@ -181,7 +192,8 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) } /* - * __btrfs_tree_lock - lock eb for write + * Lock eb for write. + * * @eb: the eb to lock * @nest: the nesting to use for the lock * @@ -196,7 +208,7 @@ void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest) start_ns = ktime_get_ns(); down_write_nested(&eb->lock, nest); - eb->lock_owner = current->pid; + btrfs_set_eb_lock_owner(eb, current->pid); trace_btrfs_tree_lock(eb, start_ns); } @@ -211,7 +223,7 @@ void btrfs_tree_lock(struct extent_buffer *eb) void btrfs_tree_unlock(struct extent_buffer *eb) { trace_btrfs_tree_unlock(eb); - eb->lock_owner = 0; + btrfs_set_eb_lock_owner(eb, 0); up_write(&eb->lock); } diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index edb9b4a0dba1..7d6ee1e609bf 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -79,7 +79,7 @@ enum btrfs_lock_nesting { }; enum btrfs_lockdep_trans_states { - BTRFS_LOCKDEP_TRANS_COMMIT_START, + BTRFS_LOCKDEP_TRANS_COMMIT_PREP, BTRFS_LOCKDEP_TRANS_UNBLOCKED, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED, BTRFS_LOCKDEP_TRANS_COMPLETED, diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index 7695decc7243..b8f9c9e56c8c 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -72,11 +72,11 @@ static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf) * over the error. Each subsequent error that doesn't have any context * of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR. */ -const char * __attribute_const__ btrfs_decode_error(int errno) +const char * __attribute_const__ btrfs_decode_error(int error) { char *errstr = "unknown"; - switch (errno) { + switch (error) { case -ENOENT: /* -2 */ errstr = "No such entry"; break; @@ -110,12 +110,12 @@ const char * __attribute_const__ btrfs_decode_error(int errno) } /* - * __btrfs_handle_fs_error decodes expected errors from the caller and - * invokes the appropriate error response. + * Decodes expected errors from the caller and invokes the appropriate error + * response. */ __cold void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...) + unsigned int line, int error, const char *fmt, ...) { struct super_block *sb = fs_info->sb; #ifdef CONFIG_PRINTK @@ -132,11 +132,11 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function * Special case: if the error is EROFS, and we're already under * SB_RDONLY, then it is safe here. */ - if (errno == -EROFS && sb_rdonly(sb)) + if (error == -EROFS && sb_rdonly(sb)) return; #ifdef CONFIG_PRINTK - errstr = btrfs_decode_error(errno); + errstr = btrfs_decode_error(error); btrfs_state_to_string(fs_info, statestr); if (fmt) { struct va_format vaf; @@ -147,11 +147,11 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function vaf.va = &args; pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n", - sb->s_id, statestr, function, line, errno, errstr, &vaf); + sb->s_id, statestr, function, line, error, errstr, &vaf); va_end(args); } else { pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n", - sb->s_id, statestr, function, line, errno, errstr); + sb->s_id, statestr, function, line, error, errstr); } #endif @@ -159,7 +159,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function * Today we only save the error info to memory. Long term we'll also * send it down to the disk. */ - WRITE_ONCE(fs_info->fs_error, errno); + WRITE_ONCE(fs_info->fs_error, error); /* Don't go through full error handling during mount. */ if (!(sb->s_flags & SB_BORN)) @@ -283,12 +283,12 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) #endif /* - * __btrfs_panic decodes unexpected, fatal errors from the caller, issues an - * alert, and either panics or BUGs, depending on mount options. + * Decode unexpected, fatal errors from the caller, issue an alert, and either + * panic or BUGs, depending on mount options. */ __cold void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...) + unsigned int line, int error, const char *fmt, ...) { char *s_id = "<unknown>"; const char *errstr; @@ -301,13 +301,13 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, va_start(args, fmt); vaf.va = &args; - errstr = btrfs_decode_error(errno); + errstr = btrfs_decode_error(error); if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR))) panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", - s_id, function, line, &vaf, errno, errstr); + s_id, function, line, &vaf, error, errstr); btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)", - function, line, &vaf, errno, errstr); + function, line, &vaf, error, errstr); va_end(args); /* Caller calls BUG() */ } diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 1ae6f8e23e07..4d04c1fa5899 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -184,25 +184,25 @@ do { \ __printf(5, 6) __cold void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...); + unsigned int line, int error, const char *fmt, ...); -const char * __attribute_const__ btrfs_decode_error(int errno); +const char * __attribute_const__ btrfs_decode_error(int error); -#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ +#define btrfs_handle_fs_error(fs_info, error, fmt, args...) \ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args) + (error), fmt, ##args) __printf(5, 6) __cold void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...); + unsigned int line, int error, const char *fmt, ...); /* * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic * will panic(). Otherwise we BUG() here. */ -#define btrfs_panic(fs_info, errno, fmt, args...) \ +#define btrfs_panic(fs_info, error, fmt, args...) \ do { \ - __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ + __btrfs_panic(fs_info, __func__, __LINE__, error, fmt, ##args); \ BUG(); \ } while (0) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index b46ab348e8e5..574e8a55e24a 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -124,25 +124,24 @@ static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset, * look find the first ordered struct that has this offset, otherwise * the first one less than this offset */ -static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, - u64 file_offset) +static inline struct rb_node *ordered_tree_search(struct btrfs_inode *inode, + u64 file_offset) { - struct rb_root *root = &tree->tree; struct rb_node *prev = NULL; struct rb_node *ret; struct btrfs_ordered_extent *entry; - if (tree->last) { - entry = rb_entry(tree->last, struct btrfs_ordered_extent, + if (inode->ordered_tree_last) { + entry = rb_entry(inode->ordered_tree_last, struct btrfs_ordered_extent, rb_node); if (in_range(file_offset, entry->file_offset, entry->num_bytes)) - return tree->last; + return inode->ordered_tree_last; } - ret = __tree_search(root, file_offset, &prev); + ret = __tree_search(&inode->ordered_tree, file_offset, &prev); if (!ret) ret = prev; if (ret) - tree->last = ret; + inode->ordered_tree_last = ret; return ret; } @@ -191,6 +190,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( INIT_LIST_HEAD(&entry->log_list); INIT_LIST_HEAD(&entry->root_extent_list); INIT_LIST_HEAD(&entry->work_list); + INIT_LIST_HEAD(&entry->bioc_list); init_completion(&entry->completion); /* @@ -208,7 +208,6 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( static void insert_ordered_extent(struct btrfs_ordered_extent *entry) { struct btrfs_inode *inode = BTRFS_I(entry->inode); - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *node; @@ -221,13 +220,14 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry) /* One ref for the tree. */ refcount_inc(&entry->refs); - spin_lock_irq(&tree->lock); - node = tree_insert(&tree->tree, entry->file_offset, &entry->rb_node); + spin_lock_irq(&inode->ordered_tree_lock); + node = tree_insert(&inode->ordered_tree, entry->file_offset, + &entry->rb_node); if (node) btrfs_panic(fs_info, -EEXIST, "inconsistency in ordered tree at offset %llu", entry->file_offset); - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); spin_lock(&root->ordered_extent_lock); list_add_tail(&entry->root_extent_list, @@ -287,12 +287,11 @@ struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum) { - struct btrfs_ordered_inode_tree *tree; + struct btrfs_inode *inode = BTRFS_I(entry->inode); - tree = &BTRFS_I(entry->inode)->ordered_tree; - spin_lock_irq(&tree->lock); + spin_lock_irq(&inode->ordered_tree_lock); list_add_tail(&sum->list, &entry->list); - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); } static void finish_ordered_fn(struct btrfs_work *work) @@ -310,7 +309,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, struct btrfs_inode *inode = BTRFS_I(ordered->inode); struct btrfs_fs_info *fs_info = inode->root->fs_info; - lockdep_assert_held(&inode->ordered_tree.lock); + lockdep_assert_held(&inode->ordered_tree_lock); if (page) { ASSERT(page->mapping); @@ -364,7 +363,7 @@ static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered) struct btrfs_workqueue *wq = btrfs_is_free_space_inode(inode) ? fs_info->endio_freespace_worker : fs_info->endio_write_workers; - btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL); + btrfs_init_work(&ordered->work, finish_ordered_fn, NULL); btrfs_queue_work(wq, &ordered->work); } @@ -378,9 +377,9 @@ bool btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate); - spin_lock_irqsave(&inode->ordered_tree.lock, flags); + spin_lock_irqsave(&inode->ordered_tree_lock, flags); ret = can_finish_ordered_extent(ordered, page, file_offset, len, uptodate); - spin_unlock_irqrestore(&inode->ordered_tree.lock, flags); + spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); if (ret) btrfs_queue_ordered_fn(ordered); @@ -404,7 +403,6 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, struct page *page, u64 file_offset, u64 num_bytes, bool uptodate) { - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; unsigned long flags; @@ -414,13 +412,13 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, file_offset + num_bytes - 1, uptodate); - spin_lock_irqsave(&tree->lock, flags); + spin_lock_irqsave(&inode->ordered_tree_lock, flags); while (cur < file_offset + num_bytes) { u64 entry_end; u64 end; u32 len; - node = tree_search(tree, cur); + node = ordered_tree_search(inode, cur); /* No ordered extents at all */ if (!node) break; @@ -467,13 +465,13 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, len = end + 1 - cur; if (can_finish_ordered_extent(entry, page, cur, len, uptodate)) { - spin_unlock_irqrestore(&tree->lock, flags); + spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); btrfs_queue_ordered_fn(entry); - spin_lock_irqsave(&tree->lock, flags); + spin_lock_irqsave(&inode->ordered_tree_lock, flags); } cur += len; } - spin_unlock_irqrestore(&tree->lock, flags); + spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); } /* @@ -497,19 +495,18 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size) { - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; unsigned long flags; bool finished = false; - spin_lock_irqsave(&tree->lock, flags); + spin_lock_irqsave(&inode->ordered_tree_lock, flags); if (cached && *cached) { entry = *cached; goto have_entry; } - node = tree_search(tree, file_offset); + node = ordered_tree_search(inode, file_offset); if (!node) goto out; @@ -540,7 +537,7 @@ out: refcount_inc(&entry->refs); trace_btrfs_ordered_extent_dec_test_pending(inode, entry); } - spin_unlock_irqrestore(&tree->lock, flags); + spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); return finished; } @@ -578,7 +575,6 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry) { - struct btrfs_ordered_inode_tree *tree; struct btrfs_root *root = btrfs_inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *node; @@ -609,16 +605,15 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes, fs_info->delalloc_batch); - tree = &btrfs_inode->ordered_tree; - spin_lock_irq(&tree->lock); + spin_lock_irq(&btrfs_inode->ordered_tree_lock); node = &entry->rb_node; - rb_erase(node, &tree->tree); + rb_erase(node, &btrfs_inode->ordered_tree); RB_CLEAR_NODE(node); - if (tree->last == node) - tree->last = NULL; + if (btrfs_inode->ordered_tree_last == node) + btrfs_inode->ordered_tree_last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags); - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&btrfs_inode->ordered_tree_lock); /* * The current running transaction is waiting on us, we need to let it @@ -639,7 +634,7 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, refcount_inc(&trans->use_count); spin_unlock(&fs_info->trans_lock); - ASSERT(trans); + ASSERT(trans || BTRFS_FS_ERROR(fs_info)); if (trans) { if (atomic_dec_and_test(&trans->pending_ordered)) wake_up(&trans->pending_wait); @@ -711,7 +706,7 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, spin_unlock(&root->ordered_extent_lock); btrfs_init_work(&ordered->flush_work, - btrfs_run_ordered_extent_work, NULL, NULL); + btrfs_run_ordered_extent_work, NULL); list_add_tail(&ordered->work_list, &works); btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work); @@ -875,14 +870,12 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 file_offset) { - struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; unsigned long flags; - tree = &inode->ordered_tree; - spin_lock_irqsave(&tree->lock, flags); - node = tree_search(tree, file_offset); + spin_lock_irqsave(&inode->ordered_tree_lock, flags); + node = ordered_tree_search(inode, file_offset); if (!node) goto out; @@ -894,7 +887,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino trace_btrfs_ordered_extent_lookup(inode, entry); } out: - spin_unlock_irqrestore(&tree->lock, flags); + spin_unlock_irqrestore(&inode->ordered_tree_lock, flags); return entry; } @@ -904,15 +897,13 @@ out: struct btrfs_ordered_extent *btrfs_lookup_ordered_range( struct btrfs_inode *inode, u64 file_offset, u64 len) { - struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - tree = &inode->ordered_tree; - spin_lock_irq(&tree->lock); - node = tree_search(tree, file_offset); + spin_lock_irq(&inode->ordered_tree_lock); + node = ordered_tree_search(inode, file_offset); if (!node) { - node = tree_search(tree, file_offset + len); + node = ordered_tree_search(inode, file_offset + len); if (!node) goto out; } @@ -936,7 +927,7 @@ out: refcount_inc(&entry->refs); trace_btrfs_ordered_extent_lookup_range(inode, entry); } - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); return entry; } @@ -947,13 +938,12 @@ out: void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, struct list_head *list) { - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *n; ASSERT(inode_is_locked(&inode->vfs_inode)); - spin_lock_irq(&tree->lock); - for (n = rb_first(&tree->tree); n; n = rb_next(n)) { + spin_lock_irq(&inode->ordered_tree_lock); + for (n = rb_first(&inode->ordered_tree); n; n = rb_next(n)) { struct btrfs_ordered_extent *ordered; ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); @@ -966,7 +956,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, refcount_inc(&ordered->refs); trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered); } - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); } /* @@ -976,13 +966,11 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) { - struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - tree = &inode->ordered_tree; - spin_lock_irq(&tree->lock); - node = tree_search(tree, file_offset); + spin_lock_irq(&inode->ordered_tree_lock); + node = ordered_tree_search(inode, file_offset); if (!node) goto out; @@ -990,7 +978,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) refcount_inc(&entry->refs); trace_btrfs_ordered_extent_lookup_first(inode, entry); out: - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); return entry; } @@ -1006,15 +994,14 @@ out: struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range( struct btrfs_inode *inode, u64 file_offset, u64 len) { - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct rb_node *cur; struct rb_node *prev; struct rb_node *next; struct btrfs_ordered_extent *entry = NULL; - spin_lock_irq(&tree->lock); - node = tree->tree.rb_node; + spin_lock_irq(&inode->ordered_tree_lock); + node = inode->ordered_tree.rb_node; /* * Here we don't want to use tree_search() which will use tree->last * and screw up the search order. @@ -1068,7 +1055,7 @@ out: trace_btrfs_ordered_extent_lookup_first_range(inode, entry); } - spin_unlock_irq(&tree->lock); + spin_unlock_irq(&inode->ordered_tree_lock); return entry; } @@ -1147,7 +1134,6 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( struct btrfs_ordered_extent *ordered, u64 len) { struct btrfs_inode *inode = BTRFS_I(ordered->inode); - struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; u64 file_offset = ordered->file_offset; @@ -1187,13 +1173,13 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( refcount_inc(&new->refs); spin_lock_irq(&root->ordered_extent_lock); - spin_lock(&tree->lock); + spin_lock(&inode->ordered_tree_lock); /* Remove from tree once */ node = &ordered->rb_node; - rb_erase(node, &tree->tree); + rb_erase(node, &inode->ordered_tree); RB_CLEAR_NODE(node); - if (tree->last == node) - tree->last = NULL; + if (inode->ordered_tree_last == node) + inode->ordered_tree_last = NULL; ordered->file_offset += len; ordered->disk_bytenr += len; @@ -1224,18 +1210,19 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( } /* Re-insert the node */ - node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node); + node = tree_insert(&inode->ordered_tree, ordered->file_offset, + &ordered->rb_node); if (node) btrfs_panic(fs_info, -EEXIST, "zoned: inconsistency in ordered tree at offset %llu", ordered->file_offset); - node = tree_insert(&tree->tree, new->file_offset, &new->rb_node); + node = tree_insert(&inode->ordered_tree, new->file_offset, &new->rb_node); if (node) btrfs_panic(fs_info, -EEXIST, "zoned: inconsistency in ordered tree at offset %llu", new->file_offset); - spin_unlock(&tree->lock); + spin_unlock(&inode->ordered_tree_lock); list_add_tail(&new->root_extent_list, &root->ordered_extents); root->nr_ordered_extents++; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 173bd5c5df26..567a6d3d4712 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -6,13 +6,6 @@ #ifndef BTRFS_ORDERED_DATA_H #define BTRFS_ORDERED_DATA_H -/* one of these per inode */ -struct btrfs_ordered_inode_tree { - spinlock_t lock; - struct rb_root tree; - struct rb_node *last; -}; - struct btrfs_ordered_sum { /* * Logical start address and length for of the blocks covered by @@ -151,15 +144,9 @@ struct btrfs_ordered_extent { struct completion completion; struct btrfs_work flush_work; struct list_head work_list; -}; -static inline void -btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) -{ - spin_lock_init(&t->lock); - t->tree = RB_ROOT; - t->last = NULL; -} + struct list_head bioc_list; +}; int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent); int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 0c93439e929f..7e46aa8a0444 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -9,6 +9,8 @@ #include "print-tree.h" #include "accessors.h" #include "tree-checker.h" +#include "volumes.h" +#include "raid-stripe-tree.h" struct root_name_map { u64 id; @@ -28,6 +30,7 @@ static const struct root_name_map root_map[] = { { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" }, { BTRFS_BLOCK_GROUP_TREE_OBJECTID, "BLOCK_GROUP_TREE" }, { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" }, + { BTRFS_RAID_STRIPE_TREE_OBJECTID, "RAID_STRIPE_TREE" }, }; const char *btrfs_root_name(const struct btrfs_key *key, char *buf) @@ -80,12 +83,20 @@ static void print_extent_data_ref(const struct extent_buffer *eb, btrfs_extent_data_ref_count(eb, ref)); } +static void print_extent_owner_ref(const struct extent_buffer *eb, + const struct btrfs_extent_owner_ref *ref) +{ + ASSERT(btrfs_fs_incompat(eb->fs_info, SIMPLE_QUOTA)); + pr_cont("extent data owner root %llu\n", btrfs_extent_owner_ref_root_id(eb, ref)); +} + static void print_extent_item(const struct extent_buffer *eb, int slot, int type) { struct btrfs_extent_item *ei; struct btrfs_extent_inline_ref *iref; struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; + struct btrfs_extent_owner_ref *oref; struct btrfs_disk_key key; unsigned long end; unsigned long ptr; @@ -161,6 +172,10 @@ static void print_extent_item(const struct extent_buffer *eb, int slot, int type "\t\t\t(parent %llu not aligned to sectorsize %u)\n", offset, eb->fs_info->sectorsize); break; + case BTRFS_EXTENT_OWNER_REF_KEY: + oref = (struct btrfs_extent_owner_ref *)(&iref->offset); + print_extent_owner_ref(eb, oref); + break; default: pr_cont("(extent %llu has INVALID ref type %d)\n", eb->start, type); @@ -189,6 +204,22 @@ static void print_uuid_item(const struct extent_buffer *l, unsigned long offset, } } +static void print_raid_stripe_key(const struct extent_buffer *eb, u32 item_size, + struct btrfs_stripe_extent *stripe) +{ + const int num_stripes = btrfs_num_raid_stripes(item_size); + const u8 encoding = btrfs_stripe_extent_encoding(eb, stripe); + + pr_info("\t\t\tencoding: %s\n", + (encoding && encoding < BTRFS_NR_RAID_TYPES) ? + btrfs_raid_array[encoding].raid_name : "unknown"); + + for (int i = 0; i < num_stripes; i++) + pr_info("\t\t\tstride %d devid %llu physical %llu\n", + i, btrfs_raid_stride_devid(eb, &stripe->strides[i]), + btrfs_raid_stride_physical(eb, &stripe->strides[i])); +} + /* * Helper to output refs and locking status of extent buffer. Useful to debug * race condition related problems. @@ -349,6 +380,10 @@ void btrfs_print_leaf(const struct extent_buffer *l) print_uuid_item(l, btrfs_item_ptr_offset(l, i), btrfs_item_size(l, i)); break; + case BTRFS_RAID_STRIPE_KEY: + print_raid_stripe_key(l, btrfs_item_size(l, i), + btrfs_item_ptr(l, i, struct btrfs_stripe_extent)); + break; } } } diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 0755af0e53e3..f9bf591a0718 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -15,6 +15,7 @@ #include "fs.h" #include "accessors.h" #include "super.h" +#include "dir-item.h" #define BTRFS_PROP_HANDLERS_HT_BITS 8 static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index b99230db3c82..edb84cc03237 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -30,6 +30,25 @@ #include "root-tree.h" #include "tree-checker.h" +enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info) +{ + if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + return BTRFS_QGROUP_MODE_DISABLED; + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) + return BTRFS_QGROUP_MODE_SIMPLE; + return BTRFS_QGROUP_MODE_FULL; +} + +bool btrfs_qgroup_enabled(struct btrfs_fs_info *fs_info) +{ + return btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_DISABLED; +} + +bool btrfs_qgroup_full_accounting(struct btrfs_fs_info *fs_info) +{ + return btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL; +} + /* * Helpers to access qgroup reservation * @@ -146,16 +165,6 @@ struct btrfs_qgroup_list { struct btrfs_qgroup *member; }; -static inline u64 qgroup_to_aux(struct btrfs_qgroup *qg) -{ - return (u64)(uintptr_t)qg; -} - -static inline struct btrfs_qgroup* unode_aux_to_qgroup(struct ulist_node *n) -{ - return (struct btrfs_qgroup *)(uintptr_t)n->aux; -} - static int qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, int init_flags); @@ -180,34 +189,46 @@ static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, return NULL; } -/* must be called with qgroup_lock held */ +/* + * Add qgroup to the filesystem's qgroup tree. + * + * Must be called with qgroup_lock held and @prealloc preallocated. + * + * The control on the lifespan of @prealloc would be transfered to this + * function, thus caller should no longer touch @prealloc. + */ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup *prealloc, u64 qgroupid) { struct rb_node **p = &fs_info->qgroup_tree.rb_node; struct rb_node *parent = NULL; struct btrfs_qgroup *qgroup; + /* Caller must have pre-allocated @prealloc. */ + ASSERT(prealloc); + while (*p) { parent = *p; qgroup = rb_entry(parent, struct btrfs_qgroup, node); - if (qgroup->qgroupid < qgroupid) + if (qgroup->qgroupid < qgroupid) { p = &(*p)->rb_left; - else if (qgroup->qgroupid > qgroupid) + } else if (qgroup->qgroupid > qgroupid) { p = &(*p)->rb_right; - else + } else { + kfree(prealloc); return qgroup; + } } - qgroup = kzalloc(sizeof(*qgroup), GFP_ATOMIC); - if (!qgroup) - return ERR_PTR(-ENOMEM); - + qgroup = prealloc; qgroup->qgroupid = qgroupid; INIT_LIST_HEAD(&qgroup->groups); INIT_LIST_HEAD(&qgroup->members); INIT_LIST_HEAD(&qgroup->dirty); + INIT_LIST_HEAD(&qgroup->iterator); + INIT_LIST_HEAD(&qgroup->nested_iterator); rb_link_node(&qgroup->node, parent, p); rb_insert_color(&qgroup->node, &fs_info->qgroup_tree); @@ -254,27 +275,26 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid) /* * Add relation specified by two qgroups. * - * Must be called with qgroup_lock held. + * Must be called with qgroup_lock held, the ownership of @prealloc is + * transferred to this function and caller should not touch it anymore. * * Return: 0 on success * -ENOENT if one of the qgroups is NULL * <0 other errors */ -static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *parent) +static int __add_relation_rb(struct btrfs_qgroup_list *prealloc, + struct btrfs_qgroup *member, + struct btrfs_qgroup *parent) { - struct btrfs_qgroup_list *list; - - if (!member || !parent) + if (!member || !parent) { + kfree(prealloc); return -ENOENT; + } - list = kzalloc(sizeof(*list), GFP_ATOMIC); - if (!list) - return -ENOMEM; - - list->group = parent; - list->member = member; - list_add_tail(&list->next_group, &member->groups); - list_add_tail(&list->next_member, &parent->members); + prealloc->group = parent; + prealloc->member = member; + list_add_tail(&prealloc->next_group, &member->groups); + list_add_tail(&prealloc->next_member, &parent->members); return 0; } @@ -288,7 +308,9 @@ static int __add_relation_rb(struct btrfs_qgroup *member, struct btrfs_qgroup *p * -ENOENT if one of the ids does not exist * <0 other errors */ -static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 parentid) +static int add_relation_rb(struct btrfs_fs_info *fs_info, + struct btrfs_qgroup_list *prealloc, + u64 memberid, u64 parentid) { struct btrfs_qgroup *member; struct btrfs_qgroup *parent; @@ -296,7 +318,7 @@ static int add_relation_rb(struct btrfs_fs_info *fs_info, u64 memberid, u64 pare member = find_qgroup_rb(fs_info, memberid); parent = find_qgroup_rb(fs_info, parentid); - return __add_relation_rb(member, parent); + return __add_relation_rb(prealloc, member, parent); } /* Must be called with qgroup_lock held */ @@ -340,11 +362,22 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, static void qgroup_mark_inconsistent(struct btrfs_fs_info *fs_info) { + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) + return; fs_info->qgroup_flags |= (BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT | BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN | BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING); } +static void qgroup_read_enable_gen(struct btrfs_fs_info *fs_info, + struct extent_buffer *leaf, int slot, + struct btrfs_qgroup_status_item *ptr) +{ + ASSERT(btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + ASSERT(btrfs_item_size(leaf, slot) >= sizeof(*ptr)); + fs_info->qgroup_enable_gen = btrfs_qgroup_status_enable_gen(leaf, ptr); +} + /* * The full config is read in one go, only called from open_ctree() * It doesn't use any locking, as at this point we're still single-threaded @@ -361,7 +394,7 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) u64 flags = 0; u64 rescan_progress = 0; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!fs_info->quota_root) return 0; fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL); @@ -411,14 +444,14 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) "old qgroup version, quota disabled"); goto out; } - if (btrfs_qgroup_status_generation(l, ptr) != - fs_info->generation) { + fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, ptr); + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE) { + qgroup_read_enable_gen(fs_info, l, slot, ptr); + } else if (btrfs_qgroup_status_generation(l, ptr) != fs_info->generation) { qgroup_mark_inconsistent(fs_info); btrfs_err(fs_info, "qgroup generation mismatch, marked as inconsistent"); } - fs_info->qgroup_flags = btrfs_qgroup_status_flags(l, - ptr); rescan_progress = btrfs_qgroup_status_rescan(l, ptr); goto next1; } @@ -434,11 +467,14 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info) qgroup_mark_inconsistent(fs_info); } if (!qgroup) { - qgroup = add_qgroup_rb(fs_info, found_key.offset); - if (IS_ERR(qgroup)) { - ret = PTR_ERR(qgroup); + struct btrfs_qgroup *prealloc; + + prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL); + if (!prealloc) { + ret = -ENOMEM; goto out; } + qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset); } ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); if (ret < 0) @@ -489,6 +525,8 @@ next1: if (ret) goto out; while (1) { + struct btrfs_qgroup_list *list = NULL; + slot = path->slots[0]; l = path->nodes[0]; btrfs_item_key_to_cpu(l, &found_key, slot); @@ -502,8 +540,14 @@ next1: goto next2; } - ret = add_relation_rb(fs_info, found_key.objectid, + list = kzalloc(sizeof(*list), GFP_KERNEL); + if (!list) { + ret = -ENOMEM; + goto out; + } + ret = add_relation_rb(fs_info, list, found_key.objectid, found_key.offset); + list = NULL; if (ret == -ENOENT) { btrfs_warn(fs_info, "orphan qgroup relation 0x%llx->0x%llx", @@ -522,13 +566,12 @@ next2: out: btrfs_free_path(path); fs_info->qgroup_flags |= flags; - if (!(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON)) - clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); - else if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN && - ret >= 0) - ret = qgroup_rescan_init(fs_info, rescan_progress, 0); - - if (ret < 0) { + if (ret >= 0) { + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_ON) + set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) + ret = qgroup_rescan_init(fs_info, rescan_progress, 0); + } else { ulist_free(fs_info->qgroup_ulist); fs_info->qgroup_ulist = NULL; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; @@ -550,7 +593,7 @@ bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info) struct rb_node *node; bool ret = false; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) return ret; /* * Since we're unmounting, there is no race and no need to grab qgroup @@ -622,7 +665,7 @@ static int add_qgroup_relation_item(struct btrfs_trans_handle *trans, u64 src, ret = btrfs_insert_empty_item(trans, quota_root, path, &key, 0); - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_free_path(path); return ret; @@ -700,7 +743,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_info_excl(leaf, qgroup_info, 0); btrfs_set_qgroup_info_excl_cmpr(leaf, qgroup_info, 0); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); @@ -719,7 +762,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_limit_rsv_rfer(leaf, qgroup_limit, 0); btrfs_set_qgroup_limit_rsv_excl(leaf, qgroup_limit, 0); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); ret = 0; out: @@ -808,7 +851,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer); btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl); - btrfs_mark_buffer_dirty(l); + btrfs_mark_buffer_dirty(trans, l); out: btrfs_free_path(path); @@ -854,7 +897,7 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, btrfs_set_qgroup_info_excl(l, qgroup_info, qgroup->excl); btrfs_set_qgroup_info_excl_cmpr(l, qgroup_info, qgroup->excl_cmpr); - btrfs_mark_buffer_dirty(l); + btrfs_mark_buffer_dirty(trans, l); out: btrfs_free_path(path); @@ -896,7 +939,7 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans) btrfs_set_qgroup_status_rescan(l, ptr, fs_info->qgroup_rescan_progress.objectid); - btrfs_mark_buffer_dirty(l); + btrfs_mark_buffer_dirty(trans, l); out: btrfs_free_path(path); @@ -949,7 +992,8 @@ out: return ret; } -int btrfs_quota_enable(struct btrfs_fs_info *fs_info) +int btrfs_quota_enable(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_quota_ctl_args *quota_ctl_args) { struct btrfs_root *quota_root; struct btrfs_root *tree_root = fs_info->tree_root; @@ -959,8 +1003,10 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) struct btrfs_key key; struct btrfs_key found_key; struct btrfs_qgroup *qgroup = NULL; + struct btrfs_qgroup *prealloc = NULL; struct btrfs_trans_handle *trans = NULL; struct ulist *ulist = NULL; + const bool simple = (quota_ctl_args->cmd == BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA); int ret = 0; int slot; @@ -1063,13 +1109,18 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) struct btrfs_qgroup_status_item); btrfs_set_qgroup_status_generation(leaf, ptr, trans->transid); btrfs_set_qgroup_status_version(leaf, ptr, BTRFS_QGROUP_STATUS_VERSION); - fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON | - BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + fs_info->qgroup_flags = BTRFS_QGROUP_STATUS_FLAG_ON; + if (simple) { + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; + btrfs_set_qgroup_status_enable_gen(leaf, ptr, trans->transid); + } else { + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + } btrfs_set_qgroup_status_flags(leaf, ptr, fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAGS_MASK); btrfs_set_qgroup_status_rescan(leaf, ptr, 0); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); key.objectid = 0; key.type = BTRFS_ROOT_REF_KEY; @@ -1094,6 +1145,15 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) /* Release locks on tree_root before we access quota_root */ btrfs_release_path(path); + /* We should not have a stray @prealloc pointer. */ + ASSERT(prealloc == NULL); + prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); + if (!prealloc) { + ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); + goto out_free_path; + } + ret = add_qgroup_item(trans, quota_root, found_key.offset); if (ret) { @@ -1101,7 +1161,8 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info) goto out_free_path; } - qgroup = add_qgroup_rb(fs_info, found_key.offset); + qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset); + prealloc = NULL; if (IS_ERR(qgroup)) { ret = PTR_ERR(qgroup); btrfs_abort_transaction(trans, ret); @@ -1144,18 +1205,22 @@ out_add_root: goto out_free_path; } - qgroup = add_qgroup_rb(fs_info, BTRFS_FS_TREE_OBJECTID); - if (IS_ERR(qgroup)) { - ret = PTR_ERR(qgroup); - btrfs_abort_transaction(trans, ret); + ASSERT(prealloc == NULL); + prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); + if (!prealloc) { + ret = -ENOMEM; goto out_free_path; } + qgroup = add_qgroup_rb(fs_info, prealloc, BTRFS_FS_TREE_OBJECTID); + prealloc = NULL; ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out_free_path; } + fs_info->qgroup_enable_gen = trans->transid; + mutex_unlock(&fs_info->qgroup_ioctl_lock); /* * Commit the transaction while not holding qgroup_ioctl_lock, to avoid @@ -1180,8 +1245,14 @@ out_add_root: spin_lock(&fs_info->qgroup_lock); fs_info->quota_root = quota_root; set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); + if (simple) + btrfs_set_fs_incompat(fs_info, SIMPLE_QUOTA); spin_unlock(&fs_info->qgroup_lock); + /* Skip rescan for simple qgroups. */ + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) + goto out_free_path; + ret = qgroup_rescan_init(fs_info, 0, 1); if (!ret) { qgroup_rescan_zero_tracking(fs_info); @@ -1222,6 +1293,39 @@ out: else if (trans) ret = btrfs_end_transaction(trans); ulist_free(ulist); + kfree(prealloc); + return ret; +} + +/* + * It is possible to have outstanding ordered extents which reserved bytes + * before we disabled. We need to fully flush delalloc, ordered extents, and a + * commit to ensure that we don't leak such reservations, only to have them + * come back if we re-enable. + * + * - enable simple quotas + * - reserve space + * - release it, store rsv_bytes in OE + * - disable quotas + * - enable simple quotas (qgroup rsv are all 0) + * - OE finishes + * - run delayed refs + * - free rsv_bytes, resulting in miscounting or even underflow + */ +static int flush_reservations(struct btrfs_fs_info *fs_info) +{ + struct btrfs_trans_handle *trans; + int ret; + + ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false); + if (ret) + return ret; + btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); + trans = btrfs_join_transaction(fs_info->tree_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + btrfs_commit_transaction(trans); + return ret; } @@ -1269,6 +1373,10 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags); btrfs_qgroup_wait_for_completion(fs_info, false); + ret = flush_reservations(fs_info); + if (ret) + goto out_unlock_cleaner; + /* * 1 For the root item * @@ -1295,6 +1403,7 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info) quota_root = fs_info->quota_root; fs_info->quota_root = NULL; fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE; fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL; spin_unlock(&fs_info->qgroup_lock); @@ -1329,7 +1438,8 @@ out: if (ret && trans) btrfs_end_transaction(trans); else if (trans) - ret = btrfs_end_transaction(trans); + ret = btrfs_commit_transaction(trans); +out_unlock_cleaner: mutex_unlock(&fs_info->cleaner_mutex); return ret; @@ -1342,6 +1452,24 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info, list_add(&qgroup->dirty, &fs_info->dirty_qgroups); } +static void qgroup_iterator_add(struct list_head *head, struct btrfs_qgroup *qgroup) +{ + if (!list_empty(&qgroup->iterator)) + return; + + list_add_tail(&qgroup->iterator, head); +} + +static void qgroup_iterator_clean(struct list_head *head) +{ + while (!list_empty(head)) { + struct btrfs_qgroup *qgroup; + + qgroup = list_first_entry(head, struct btrfs_qgroup, iterator); + list_del_init(&qgroup->iterator); + } +} + /* * The easy accounting, we're updating qgroup relationship whose child qgroup * only has exclusive extents. @@ -1356,14 +1484,12 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info, * * Caller should hold fs_info->qgroup_lock. */ -static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, - struct ulist *tmp, u64 ref_root, +static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, u64 ref_root, struct btrfs_qgroup *src, int sign) { struct btrfs_qgroup *qgroup; - struct btrfs_qgroup_list *glist; - struct ulist_node *unode; - struct ulist_iterator uiter; + struct btrfs_qgroup *cur; + LIST_HEAD(qgroup_list); u64 num_bytes = src->excl; int ret = 0; @@ -1371,53 +1497,30 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, if (!qgroup) goto out; - qgroup->rfer += sign * num_bytes; - qgroup->rfer_cmpr += sign * num_bytes; - - WARN_ON(sign < 0 && qgroup->excl < num_bytes); - qgroup->excl += sign * num_bytes; - qgroup->excl_cmpr += sign * num_bytes; - - if (sign > 0) - qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); - else - qgroup_rsv_release_by_qgroup(fs_info, qgroup, src); - - qgroup_dirty(fs_info, qgroup); - - /* Get all of the parent groups that contain this qgroup */ - list_for_each_entry(glist, &qgroup->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - qgroup_to_aux(glist->group), GFP_ATOMIC); - if (ret < 0) - goto out; - } + qgroup_iterator_add(&qgroup_list, qgroup); + list_for_each_entry(cur, &qgroup_list, iterator) { + struct btrfs_qgroup_list *glist; - /* Iterate all of the parents and adjust their reference counts */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(tmp, &uiter))) { - qgroup = unode_aux_to_qgroup(unode); qgroup->rfer += sign * num_bytes; qgroup->rfer_cmpr += sign * num_bytes; + WARN_ON(sign < 0 && qgroup->excl < num_bytes); qgroup->excl += sign * num_bytes; + qgroup->excl_cmpr += sign * num_bytes; + if (sign > 0) qgroup_rsv_add_by_qgroup(fs_info, qgroup, src); else qgroup_rsv_release_by_qgroup(fs_info, qgroup, src); - qgroup->excl_cmpr += sign * num_bytes; qgroup_dirty(fs_info, qgroup); - /* Add any parents of the parents */ - list_for_each_entry(glist, &qgroup->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - qgroup_to_aux(glist->group), GFP_ATOMIC); - if (ret < 0) - goto out; - } + /* Append parent qgroups to @qgroup_list. */ + list_for_each_entry(glist, &qgroup->groups, next_group) + qgroup_iterator_add(&qgroup_list, glist->group); } ret = 0; out: + qgroup_iterator_clean(&qgroup_list); return ret; } @@ -1434,8 +1537,7 @@ out: * Return < 0 for other error. */ static int quick_update_accounting(struct btrfs_fs_info *fs_info, - struct ulist *tmp, u64 src, u64 dst, - int sign) + u64 src, u64 dst, int sign) { struct btrfs_qgroup *qgroup; int ret = 1; @@ -1446,8 +1548,7 @@ static int quick_update_accounting(struct btrfs_fs_info *fs_info, goto out; if (qgroup->excl == qgroup->rfer) { ret = 0; - err = __qgroup_excl_accounting(fs_info, tmp, dst, - qgroup, sign); + err = __qgroup_excl_accounting(fs_info, dst, qgroup, sign); if (err < 0) { ret = err; goto out; @@ -1459,28 +1560,19 @@ out: return ret; } -int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, - u64 dst) +int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; - struct ulist *tmp; - unsigned int nofs_flag; + struct btrfs_qgroup_list *prealloc = NULL; int ret = 0; /* Check the level of src and dst first */ if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) return -EINVAL; - /* We hold a transaction handle open, must do a NOFS allocation. */ - nofs_flag = memalloc_nofs_save(); - tmp = ulist_alloc(GFP_KERNEL); - memalloc_nofs_restore(nofs_flag); - if (!tmp) - return -ENOMEM; - mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) { ret = -ENOTCONN; @@ -1501,6 +1593,11 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, } } + prealloc = kzalloc(sizeof(*list), GFP_NOFS); + if (!prealloc) { + ret = -ENOMEM; + goto out; + } ret = add_qgroup_relation_item(trans, src, dst); if (ret) goto out; @@ -1512,16 +1609,17 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, } spin_lock(&fs_info->qgroup_lock); - ret = __add_relation_rb(member, parent); + ret = __add_relation_rb(prealloc, member, parent); + prealloc = NULL; if (ret < 0) { spin_unlock(&fs_info->qgroup_lock); goto out; } - ret = quick_update_accounting(fs_info, tmp, src, dst, 1); + ret = quick_update_accounting(fs_info, src, dst, 1); spin_unlock(&fs_info->qgroup_lock); out: + kfree(prealloc); mutex_unlock(&fs_info->qgroup_ioctl_lock); - ulist_free(tmp); return ret; } @@ -1532,19 +1630,10 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; - struct ulist *tmp; bool found = false; - unsigned int nofs_flag; int ret = 0; int ret2; - /* We hold a transaction handle open, must do a NOFS allocation. */ - nofs_flag = memalloc_nofs_save(); - tmp = ulist_alloc(GFP_KERNEL); - memalloc_nofs_restore(nofs_flag); - if (!tmp) - return -ENOMEM; - if (!fs_info->quota_root) { ret = -ENOTCONN; goto out; @@ -1582,11 +1671,10 @@ delete_item: if (found) { spin_lock(&fs_info->qgroup_lock); del_relation_rb(fs_info, src, dst); - ret = quick_update_accounting(fs_info, tmp, src, dst, -1); + ret = quick_update_accounting(fs_info, src, dst, -1); spin_unlock(&fs_info->qgroup_lock); } out: - ulist_free(tmp); return ret; } @@ -1608,8 +1696,12 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; + struct btrfs_qgroup *prealloc = NULL; int ret = 0; + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) + return 0; + mutex_lock(&fs_info->qgroup_ioctl_lock); if (!fs_info->quota_root) { ret = -ENOTCONN; @@ -1622,21 +1714,25 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) goto out; } + prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); + if (!prealloc) { + ret = -ENOMEM; + goto out; + } + ret = add_qgroup_item(trans, quota_root, qgroupid); if (ret) goto out; spin_lock(&fs_info->qgroup_lock); - qgroup = add_qgroup_rb(fs_info, qgroupid); + qgroup = add_qgroup_rb(fs_info, prealloc, qgroupid); spin_unlock(&fs_info->qgroup_lock); + prealloc = NULL; - if (IS_ERR(qgroup)) { - ret = PTR_ERR(qgroup); - goto out; - } ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); out: mutex_unlock(&fs_info->qgroup_ioctl_lock); + kfree(prealloc); return ret; } @@ -1771,6 +1867,17 @@ out: return ret; } +/* + * Inform qgroup to trace one dirty extent, its info is recorded in @record. + * So qgroup can account it at transaction committing time. + * + * No lock version, caller must acquire delayed ref lock and allocated memory, + * then call btrfs_qgroup_trace_extent_post() after exiting lock context. + * + * Return 0 for success insert + * Return >0 for existing record, caller can free @record safely. + * Error is not possible + */ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_qgroup_extent_record *record) @@ -1780,6 +1887,9 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, struct btrfs_qgroup_extent_record *entry; u64 bytenr = record->bytenr; + if (!btrfs_qgroup_full_accounting(fs_info)) + return 0; + lockdep_assert_held(&delayed_refs->lock); trace_btrfs_qgroup_trace_extent(fs_info, record); @@ -1806,12 +1916,35 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, return 0; } +/* + * Post handler after qgroup_trace_extent_nolock(). + * + * NOTE: Current qgroup does the expensive backref walk at transaction + * committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming + * new transaction. + * This is designed to allow btrfs_find_all_roots() to get correct new_roots + * result. + * + * However for old_roots there is no need to do backref walk at that time, + * since we search commit roots to walk backref and result will always be + * correct. + * + * Due to the nature of no lock version, we can't do backref there. + * So we must call btrfs_qgroup_trace_extent_post() after exiting + * spinlock context. + * + * TODO: If we can fix and prove btrfs_find_all_roots() can get correct result + * using current root, then we can move all expensive backref walk out of + * transaction committing, but not now as qgroup accounting will be wrong again. + */ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, struct btrfs_qgroup_extent_record *qrecord) { struct btrfs_backref_walk_ctx ctx = { 0 }; int ret; + if (!btrfs_qgroup_full_accounting(trans->fs_info)) + return 0; /* * We are always called in a context where we are already holding a * transaction handle. Often we are called when adding a data delayed @@ -1859,6 +1992,19 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, return 0; } +/* + * Inform qgroup to trace one dirty extent, specified by @bytenr and + * @num_bytes. + * So qgroup can account it at commit trans time. + * + * Better encapsulated version, with memory allocation and backref walk for + * commit roots. + * So this can sleep. + * + * Return 0 if the operation is done. + * Return <0 for error, like memory allocation failure or invalid parameter + * (NULL trans) + */ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes) { @@ -1867,8 +2013,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, struct btrfs_delayed_ref_root *delayed_refs; int ret; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) - || bytenr == 0 || num_bytes == 0) + if (!btrfs_qgroup_full_accounting(fs_info) || bytenr == 0 || num_bytes == 0) return 0; record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) @@ -1889,6 +2034,12 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, return btrfs_qgroup_trace_extent_post(trans, record); } +/* + * Inform qgroup to trace all leaf items of data + * + * Return 0 for success + * Return <0 for error(ENOMEM) + */ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, struct extent_buffer *eb) { @@ -1900,7 +2051,7 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, u64 bytenr, num_bytes; /* We can be called directly from walk_up_proc() */ - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; for (i = 0; i < nr; i++) { @@ -2276,7 +2427,7 @@ static int qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans, int level; int ret; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; /* Wrong parameter order */ @@ -2319,6 +2470,16 @@ out: return ret; } +/* + * Inform qgroup to trace a whole subtree, including all its child tree + * blocks and data. + * The root tree block is specified by @root_eb. + * + * Normally used by relocation(tree block swap) and subvolume deletion. + * + * Return 0 for success + * Return <0 for error(ENOMEM or tree search error) + */ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *root_eb, u64 root_gen, int root_level) @@ -2333,7 +2494,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL); BUG_ON(root_eb == NULL); - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; spin_lock(&fs_info->qgroup_lock); @@ -2445,62 +2606,64 @@ out: return ret; } +static void qgroup_iterator_nested_add(struct list_head *head, struct btrfs_qgroup *qgroup) +{ + if (!list_empty(&qgroup->nested_iterator)) + return; + + list_add_tail(&qgroup->nested_iterator, head); +} + +static void qgroup_iterator_nested_clean(struct list_head *head) +{ + while (!list_empty(head)) { + struct btrfs_qgroup *qgroup; + + qgroup = list_first_entry(head, struct btrfs_qgroup, nested_iterator); + list_del_init(&qgroup->nested_iterator); + } +} + #define UPDATE_NEW 0 #define UPDATE_OLD 1 /* * Walk all of the roots that points to the bytenr and adjust their refcnts. */ -static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, - struct ulist *roots, struct ulist *tmp, - struct ulist *qgroups, u64 seq, int update_old) +static void qgroup_update_refcnt(struct btrfs_fs_info *fs_info, + struct ulist *roots, struct list_head *qgroups, + u64 seq, int update_old) { struct ulist_node *unode; struct ulist_iterator uiter; - struct ulist_node *tmp_unode; - struct ulist_iterator tmp_uiter; struct btrfs_qgroup *qg; - int ret = 0; if (!roots) - return 0; + return; ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(roots, &uiter))) { + LIST_HEAD(tmp); + qg = find_qgroup_rb(fs_info, unode->val); if (!qg) continue; - ulist_reinit(tmp); - ret = ulist_add(qgroups, qg->qgroupid, qgroup_to_aux(qg), - GFP_ATOMIC); - if (ret < 0) - return ret; - ret = ulist_add(tmp, qg->qgroupid, qgroup_to_aux(qg), GFP_ATOMIC); - if (ret < 0) - return ret; - ULIST_ITER_INIT(&tmp_uiter); - while ((tmp_unode = ulist_next(tmp, &tmp_uiter))) { + qgroup_iterator_nested_add(qgroups, qg); + qgroup_iterator_add(&tmp, qg); + list_for_each_entry(qg, &tmp, iterator) { struct btrfs_qgroup_list *glist; - qg = unode_aux_to_qgroup(tmp_unode); if (update_old) btrfs_qgroup_update_old_refcnt(qg, seq, 1); else btrfs_qgroup_update_new_refcnt(qg, seq, 1); + list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(qgroups, glist->group->qgroupid, - qgroup_to_aux(glist->group), - GFP_ATOMIC); - if (ret < 0) - return ret; - ret = ulist_add(tmp, glist->group->qgroupid, - qgroup_to_aux(glist->group), - GFP_ATOMIC); - if (ret < 0) - return ret; + qgroup_iterator_nested_add(qgroups, glist->group); + qgroup_iterator_add(&tmp, glist->group); } } + qgroup_iterator_clean(&tmp); } - return 0; } /* @@ -2539,22 +2702,16 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, * But this time we don't need to consider other things, the codes and logic * is easy to understand now. */ -static int qgroup_update_counters(struct btrfs_fs_info *fs_info, - struct ulist *qgroups, - u64 nr_old_roots, - u64 nr_new_roots, - u64 num_bytes, u64 seq) +static void qgroup_update_counters(struct btrfs_fs_info *fs_info, + struct list_head *qgroups, u64 nr_old_roots, + u64 nr_new_roots, u64 num_bytes, u64 seq) { - struct ulist_node *unode; - struct ulist_iterator uiter; struct btrfs_qgroup *qg; - u64 cur_new_count, cur_old_count; - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(qgroups, &uiter))) { + list_for_each_entry(qg, qgroups, nested_iterator) { + u64 cur_new_count, cur_old_count; bool dirty = false; - qg = unode_aux_to_qgroup(unode); cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); @@ -2625,7 +2782,6 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info, if (dirty) qgroup_dirty(fs_info, qg); } - return 0; } /* @@ -2662,8 +2818,7 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, struct ulist *new_roots) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct ulist *qgroups = NULL; - struct ulist *tmp = NULL; + LIST_HEAD(qgroups); u64 seq; u64 nr_new_roots = 0; u64 nr_old_roots = 0; @@ -2673,7 +2828,7 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, * If quotas get disabled meanwhile, the resources need to be freed and * we can't just exit here. */ - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + if (!btrfs_qgroup_full_accounting(fs_info) || fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) goto out_free; @@ -2697,17 +2852,6 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, trace_btrfs_qgroup_account_extent(fs_info, trans->transid, bytenr, num_bytes, nr_old_roots, nr_new_roots); - qgroups = ulist_alloc(GFP_NOFS); - if (!qgroups) { - ret = -ENOMEM; - goto out_free; - } - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) { - ret = -ENOMEM; - goto out_free; - } - mutex_lock(&fs_info->qgroup_rescan_lock); if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { if (fs_info->qgroup_rescan_progress.objectid <= bytenr) { @@ -2722,29 +2866,21 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, seq = fs_info->qgroup_seq; /* Update old refcnts using old_roots */ - ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq, - UPDATE_OLD); - if (ret < 0) - goto out; + qgroup_update_refcnt(fs_info, old_roots, &qgroups, seq, UPDATE_OLD); /* Update new refcnts using new_roots */ - ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq, - UPDATE_NEW); - if (ret < 0) - goto out; + qgroup_update_refcnt(fs_info, new_roots, &qgroups, seq, UPDATE_NEW); - qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots, + qgroup_update_counters(fs_info, &qgroups, nr_old_roots, nr_new_roots, num_bytes, seq); /* * Bump qgroup_seq to avoid seq overlap */ fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1; -out: spin_unlock(&fs_info->qgroup_lock); out_free: - ulist_free(tmp); - ulist_free(qgroups); + qgroup_iterator_nested_clean(&qgroups); ulist_free(old_roots); ulist_free(new_roots); return ret; @@ -2761,6 +2897,9 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) u64 qgroup_to_skip; int ret = 0; + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) + return 0; + delayed_refs = &trans->transaction->delayed_refs; qgroup_to_skip = delayed_refs->qgroup_to_skip; while ((node = rb_first(&delayed_refs->dirty_extent_root))) { @@ -2876,7 +3015,7 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) qgroup_mark_inconsistent(fs_info); spin_lock(&fs_info->qgroup_lock); } - if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (btrfs_qgroup_enabled(fs_info)) fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_ON; else fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; @@ -2889,6 +3028,47 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) return ret; } +static int qgroup_auto_inherit(struct btrfs_fs_info *fs_info, + u64 inode_rootid, + struct btrfs_qgroup_inherit **inherit) +{ + int i = 0; + u64 num_qgroups = 0; + struct btrfs_qgroup *inode_qg; + struct btrfs_qgroup_list *qg_list; + struct btrfs_qgroup_inherit *res; + size_t struct_sz; + u64 *qgids; + + if (*inherit) + return -EEXIST; + + inode_qg = find_qgroup_rb(fs_info, inode_rootid); + if (!inode_qg) + return -ENOENT; + + num_qgroups = list_count_nodes(&inode_qg->groups); + + if (!num_qgroups) + return 0; + + struct_sz = struct_size(res, qgroups, num_qgroups); + if (struct_sz == SIZE_MAX) + return -ERANGE; + + res = kzalloc(struct_sz, GFP_NOFS); + if (!res) + return -ENOMEM; + res->num_qgroups = num_qgroups; + qgids = res->qgroups; + + list_for_each_entry(qg_list, &inode_qg->groups, next_group) + qgids[i] = qg_list->group->qgroupid; + + *inherit = res; + return 0; +} + /* * Copy the accounting information between qgroups. This is necessary * when a snapshot or a subvolume is created. Throwing an error will @@ -2896,7 +3076,8 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) * when a readonly fs is a reasonable outcome. */ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, - u64 objectid, struct btrfs_qgroup_inherit *inherit) + u64 objectid, u64 inode_rootid, + struct btrfs_qgroup_inherit *inherit) { int ret = 0; int i; @@ -2906,10 +3087,17 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, struct btrfs_root *quota_root; struct btrfs_qgroup *srcgroup; struct btrfs_qgroup *dstgroup; + struct btrfs_qgroup *prealloc; + struct btrfs_qgroup_list **qlist_prealloc = NULL; + bool free_inherit = false; bool need_rescan = false; u32 level_size = 0; u64 nums; + prealloc = kzalloc(sizeof(*prealloc), GFP_NOFS); + if (!prealloc) + return -ENOMEM; + /* * There are only two callers of this function. * @@ -2929,7 +3117,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, if (!committing) mutex_lock(&fs_info->qgroup_ioctl_lock); - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_enabled(fs_info)) goto out; quota_root = fs_info->quota_root; @@ -2938,6 +3126,13 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, goto out; } + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE && !inherit) { + ret = qgroup_auto_inherit(fs_info, inode_rootid, &inherit); + if (ret) + goto out; + free_inherit = true; + } + if (inherit) { i_qgroups = (u64 *)(inherit + 1); nums = inherit->num_qgroups + 2 * inherit->num_ref_copies + @@ -2982,16 +3177,28 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, goto out; } ret = 0; - } + qlist_prealloc = kcalloc(inherit->num_qgroups, + sizeof(struct btrfs_qgroup_list *), + GFP_NOFS); + if (!qlist_prealloc) { + ret = -ENOMEM; + goto out; + } + for (int i = 0; i < inherit->num_qgroups; i++) { + qlist_prealloc[i] = kzalloc(sizeof(struct btrfs_qgroup_list), + GFP_NOFS); + if (!qlist_prealloc[i]) { + ret = -ENOMEM; + goto out; + } + } + } spin_lock(&fs_info->qgroup_lock); - dstgroup = add_qgroup_rb(fs_info, objectid); - if (IS_ERR(dstgroup)) { - ret = PTR_ERR(dstgroup); - goto unlock; - } + dstgroup = add_qgroup_rb(fs_info, prealloc, objectid); + prealloc = NULL; if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) { dstgroup->lim_flags = inherit->lim.flags; @@ -3003,7 +3210,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, qgroup_dirty(fs_info, dstgroup); } - if (srcid) { + if (srcid && btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL) { srcgroup = find_qgroup_rb(fs_info, srcid); if (!srcgroup) goto unlock; @@ -3038,7 +3245,9 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, i_qgroups = (u64 *)(inherit + 1); for (i = 0; i < inherit->num_qgroups; ++i) { if (*i_qgroups) { - ret = add_relation_rb(fs_info, objectid, *i_qgroups); + ret = add_relation_rb(fs_info, qlist_prealloc[i], objectid, + *i_qgroups); + qlist_prealloc[i] = NULL; if (ret) goto unlock; } @@ -3102,6 +3311,14 @@ out: mutex_unlock(&fs_info->qgroup_ioctl_lock); if (need_rescan) qgroup_mark_inconsistent(fs_info); + if (qlist_prealloc) { + for (int i = 0; i < inherit->num_qgroups; i++) + kfree(qlist_prealloc[i]); + kfree(qlist_prealloc); + } + if (free_inherit) + kfree(inherit); + kfree(prealloc); return ret; } @@ -3125,8 +3342,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, struct btrfs_fs_info *fs_info = root->fs_info; u64 ref_root = root->root_key.objectid; int ret = 0; - struct ulist_node *unode; - struct ulist_iterator uiter; + LIST_HEAD(qgroup_list); if (!is_fstree(ref_root)) return 0; @@ -3146,49 +3362,28 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, if (!qgroup) goto out; - /* - * in a first step, we check all affected qgroups if any limits would - * be exceeded - */ - ulist_reinit(fs_info->qgroup_ulist); - ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, - qgroup_to_aux(qgroup), GFP_ATOMIC); - if (ret < 0) - goto out; - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { - struct btrfs_qgroup *qg; + qgroup_iterator_add(&qgroup_list, qgroup); + list_for_each_entry(qgroup, &qgroup_list, iterator) { struct btrfs_qgroup_list *glist; - qg = unode_aux_to_qgroup(unode); - - if (enforce && !qgroup_check_limits(qg, num_bytes)) { + if (enforce && !qgroup_check_limits(qgroup, num_bytes)) { ret = -EDQUOT; goto out; } - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(fs_info->qgroup_ulist, - glist->group->qgroupid, - qgroup_to_aux(glist->group), GFP_ATOMIC); - if (ret < 0) - goto out; - } + list_for_each_entry(glist, &qgroup->groups, next_group) + qgroup_iterator_add(&qgroup_list, glist->group); } + ret = 0; /* * no limits exceeded, now record the reservation into all qgroups */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { - struct btrfs_qgroup *qg; - - qg = unode_aux_to_qgroup(unode); - - qgroup_rsv_add(fs_info, qg, num_bytes, type); - } + list_for_each_entry(qgroup, &qgroup_list, iterator) + qgroup_rsv_add(fs_info, qgroup, num_bytes, type); out: + qgroup_iterator_clean(&qgroup_list); spin_unlock(&fs_info->qgroup_lock); return ret; } @@ -3207,9 +3402,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, enum btrfs_qgroup_rsv_type type) { struct btrfs_qgroup *qgroup; - struct ulist_node *unode; - struct ulist_iterator uiter; - int ret = 0; + LIST_HEAD(qgroup_list); if (!is_fstree(ref_root)) return; @@ -3237,30 +3430,17 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, */ num_bytes = qgroup->rsv.values[type]; - ulist_reinit(fs_info->qgroup_ulist); - ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, - qgroup_to_aux(qgroup), GFP_ATOMIC); - if (ret < 0) - goto out; - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { - struct btrfs_qgroup *qg; + qgroup_iterator_add(&qgroup_list, qgroup); + list_for_each_entry(qgroup, &qgroup_list, iterator) { struct btrfs_qgroup_list *glist; - qg = unode_aux_to_qgroup(unode); - - qgroup_rsv_release(fs_info, qg, num_bytes, type); - - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(fs_info->qgroup_ulist, - glist->group->qgroupid, - qgroup_to_aux(glist->group), GFP_ATOMIC); - if (ret < 0) - goto out; + qgroup_rsv_release(fs_info, qgroup, num_bytes, type); + list_for_each_entry(glist, &qgroup->groups, next_group) { + qgroup_iterator_add(&qgroup_list, glist->group); } } - out: + qgroup_iterator_clean(&qgroup_list); spin_unlock(&fs_info->qgroup_lock); } @@ -3295,6 +3475,9 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, int slot; int ret; + if (!btrfs_qgroup_full_accounting(fs_info)) + return 1; + mutex_lock(&fs_info->qgroup_rescan_lock); extent_root = btrfs_extent_root(fs_info, fs_info->qgroup_rescan_progress.objectid); @@ -3375,10 +3558,15 @@ out: static bool rescan_should_stop(struct btrfs_fs_info *fs_info) { - return btrfs_fs_closing(fs_info) || - test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) || - !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || - fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN; + if (btrfs_fs_closing(fs_info)) + return true; + if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) + return true; + if (!btrfs_qgroup_enabled(fs_info)) + return true; + if (fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN) + return true; + return false; } static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) @@ -3392,6 +3580,9 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) bool stopped = false; bool did_leaf_rescans = false; + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) + return; + path = btrfs_alloc_path(); if (!path) goto out; @@ -3495,6 +3686,11 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, { int ret = 0; + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) { + btrfs_warn(fs_info, "qgroup rescan init failed, running in simple mode"); + return -EINVAL; + } + if (!init_flags) { /* we're resuming qgroup rescan at mount time */ if (!(fs_info->qgroup_flags & @@ -3525,7 +3721,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, btrfs_warn(fs_info, "qgroup rescan init failed, qgroup is not enabled"); ret = -EINVAL; - } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { + } else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) { /* Quota disable is in progress */ ret = -EBUSY; } @@ -3546,7 +3742,7 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, mutex_unlock(&fs_info->qgroup_rescan_lock); btrfs_init_work(&fs_info->qgroup_rescan_work, - btrfs_qgroup_rescan_worker, NULL, NULL); + btrfs_qgroup_rescan_worker, NULL); return 0; } @@ -3784,7 +3980,7 @@ static int qgroup_reserve_data(struct btrfs_inode *inode, u64 to_reserve; int ret; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) || + if (btrfs_qgroup_mode(root->fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(root->root_key.objectid) || len == 0) return 0; @@ -3916,8 +4112,12 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, int trace_op = QGROUP_RELEASE; int ret; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &inode->root->fs_info->flags)) - return 0; + if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) { + extent_changeset_init(&changeset); + return clear_record_extent_bits(&inode->io_tree, start, + start + len - 1, + EXTENT_QGROUP_RESERVED, &changeset); + } /* In release case, we shouldn't have @reserved */ WARN_ON(!free && reserved); @@ -4027,7 +4227,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, struct btrfs_fs_info *fs_info = root->fs_info; int ret; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(root->root_key.objectid) || num_bytes == 0) return 0; @@ -4064,11 +4264,15 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); } +/* + * Per-transaction meta reservation should be all freed at transaction commit + * time + */ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(root->root_key.objectid)) return; @@ -4084,7 +4288,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, { struct btrfs_fs_info *fs_info = root->fs_info; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(root->root_key.objectid)) return; @@ -4104,9 +4308,7 @@ static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, int num_bytes) { struct btrfs_qgroup *qgroup; - struct ulist_node *unode; - struct ulist_iterator uiter; - int ret = 0; + LIST_HEAD(qgroup_list); if (num_bytes == 0) return; @@ -4117,39 +4319,35 @@ static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, qgroup = find_qgroup_rb(fs_info, ref_root); if (!qgroup) goto out; - ulist_reinit(fs_info->qgroup_ulist); - ret = ulist_add(fs_info->qgroup_ulist, qgroup->qgroupid, - qgroup_to_aux(qgroup), GFP_ATOMIC); - if (ret < 0) - goto out; - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(fs_info->qgroup_ulist, &uiter))) { - struct btrfs_qgroup *qg; - struct btrfs_qgroup_list *glist; - qg = unode_aux_to_qgroup(unode); + qgroup_iterator_add(&qgroup_list, qgroup); + list_for_each_entry(qgroup, &qgroup_list, iterator) { + struct btrfs_qgroup_list *glist; - qgroup_rsv_release(fs_info, qg, num_bytes, + qgroup_rsv_release(fs_info, qgroup, num_bytes, BTRFS_QGROUP_RSV_META_PREALLOC); - qgroup_rsv_add(fs_info, qg, num_bytes, + qgroup_rsv_add(fs_info, qgroup, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS); - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(fs_info->qgroup_ulist, - glist->group->qgroupid, - qgroup_to_aux(glist->group), GFP_ATOMIC); - if (ret < 0) - goto out; - } + + list_for_each_entry(glist, &qgroup->groups, next_group) + qgroup_iterator_add(&qgroup_list, glist->group); } out: + qgroup_iterator_clean(&qgroup_list); spin_unlock(&fs_info->qgroup_lock); } +/* + * Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS. + * + * This is called when preallocated meta reservation needs to be used. + * Normally after btrfs_join_transaction() call. + */ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) { struct btrfs_fs_info *fs_info = root->fs_info; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED || !is_fstree(root->root_key.objectid)) return; /* Same as btrfs_qgroup_free_meta_prealloc() */ @@ -4257,7 +4455,7 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans, int level = btrfs_header_level(subvol_parent) - 1; int ret = 0; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; if (btrfs_node_ptr_generation(subvol_parent, subvol_slot) > @@ -4367,7 +4565,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, int ret = 0; int i; - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; if (!is_fstree(root->root_key.objectid) || !root->reloc_root) return 0; @@ -4450,3 +4648,53 @@ void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) } *root = RB_ROOT; } + +int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, + struct btrfs_squota_delta *delta) +{ + int ret; + struct btrfs_qgroup *qgroup; + struct btrfs_qgroup *qg; + LIST_HEAD(qgroup_list); + u64 root = delta->root; + u64 num_bytes = delta->num_bytes; + const int sign = (delta->is_inc ? 1 : -1); + + if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE) + return 0; + + if (!is_fstree(root)) + return 0; + + /* If the extent predates enabling quotas, don't count it. */ + if (delta->generation < fs_info->qgroup_enable_gen) + return 0; + + spin_lock(&fs_info->qgroup_lock); + qgroup = find_qgroup_rb(fs_info, root); + if (!qgroup) { + ret = -ENOENT; + goto out; + } + + ret = 0; + qgroup_iterator_add(&qgroup_list, qgroup); + list_for_each_entry(qg, &qgroup_list, iterator) { + struct btrfs_qgroup_list *glist; + + qg->excl += num_bytes * sign; + qg->rfer += num_bytes * sign; + qgroup_dirty(fs_info, qg); + + list_for_each_entry(glist, &qg->groups, next_group) + qgroup_iterator_add(&qgroup_list, glist->group); + } + qgroup_iterator_clean(&qgroup_list); + +out: + spin_unlock(&fs_info->qgroup_lock); + if (!ret && delta->rsv_bytes) + btrfs_qgroup_free_refroot(fs_info, root, delta->rsv_bytes, + BTRFS_QGROUP_RSV_DATA); + return ret; +} diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 7bffa10589d6..855a4f978761 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -101,8 +101,15 @@ * subtree rescan for them. */ -#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1UL << 3) -#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1UL << 4) +/* + * These flags share the flags field of the btrfs_qgroup_status_item with the + * persisted flags defined in btrfs_tree.h. + * + * To minimize the chance of collision with new persisted status flags, these + * count backwards from the MSB. + */ +#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1ULL << 63) +#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1ULL << 62) /* * Record a dirty extent, and info qgroup to update quota on it @@ -220,6 +227,33 @@ struct btrfs_qgroup { struct list_head groups; /* groups this group is member of */ struct list_head members; /* groups that are members of this group */ struct list_head dirty; /* dirty groups */ + + /* + * For qgroup iteration usage. + * + * The iteration list should always be empty until qgroup_iterator_add() + * is called. And should be reset to empty after the iteration is + * finished. + */ + struct list_head iterator; + + /* + * For nested iterator usage. + * + * Here we support at most one level of nested iterator calls like: + * + * LIST_HEAD(all_qgroups); + * { + * LIST_HEAD(local_qgroups); + * qgroup_iterator_add(local_qgroups, qg); + * qgroup_iterator_nested_add(all_qgroups, qg); + * do_some_work(local_qgroups); + * qgroup_iterator_clean(local_qgroups); + * } + * do_some_work(all_qgroups); + * qgroup_iterator_nested_clean(all_qgroups); + */ + struct list_head nested_iterator; struct rb_node node; /* tree of qgroups */ /* @@ -235,6 +269,21 @@ struct btrfs_qgroup { struct kobject kobj; }; +struct btrfs_squota_delta { + /* The fstree root this delta counts against. */ + u64 root; + /* The number of bytes in the extent being counted. */ + u64 num_bytes; + /* The number of bytes reserved for this extent. */ + u64 rsv_bytes; + /* The generation the extent was created in. */ + u64 generation; + /* Whether we are using or freeing the extent. */ + bool is_inc; + /* Whether the extent is data or metadata. */ + bool is_data; +}; + static inline u64 btrfs_qgroup_subvolid(u64 qgroupid) { return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1)); @@ -249,14 +298,23 @@ enum { ENUM_BIT(QGROUP_FREE), }; -int btrfs_quota_enable(struct btrfs_fs_info *fs_info); +enum btrfs_qgroup_mode { + BTRFS_QGROUP_MODE_DISABLED, + BTRFS_QGROUP_MODE_FULL, + BTRFS_QGROUP_MODE_SIMPLE +}; + +enum btrfs_qgroup_mode btrfs_qgroup_mode(struct btrfs_fs_info *fs_info); +bool btrfs_qgroup_enabled(struct btrfs_fs_info *fs_info); +bool btrfs_qgroup_full_accounting(struct btrfs_fs_info *fs_info); +int btrfs_quota_enable(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_quota_ctl_args *quota_ctl_args); int btrfs_quota_disable(struct btrfs_fs_info *fs_info); int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info); void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info); int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info, bool interruptible); -int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, - u64 dst); +int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst); int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst); int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid); @@ -267,80 +325,16 @@ int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); struct btrfs_delayed_extent_op; -/* - * Inform qgroup to trace one dirty extent, its info is recorded in @record. - * So qgroup can account it at transaction committing time. - * - * No lock version, caller must acquire delayed ref lock and allocated memory, - * then call btrfs_qgroup_trace_extent_post() after exiting lock context. - * - * Return 0 for success insert - * Return >0 for existing record, caller can free @record safely. - * Error is not possible - */ int btrfs_qgroup_trace_extent_nolock( struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_qgroup_extent_record *record); - -/* - * Post handler after qgroup_trace_extent_nolock(). - * - * NOTE: Current qgroup does the expensive backref walk at transaction - * committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming - * new transaction. - * This is designed to allow btrfs_find_all_roots() to get correct new_roots - * result. - * - * However for old_roots there is no need to do backref walk at that time, - * since we search commit roots to walk backref and result will always be - * correct. - * - * Due to the nature of no lock version, we can't do backref there. - * So we must call btrfs_qgroup_trace_extent_post() after exiting - * spinlock context. - * - * TODO: If we can fix and prove btrfs_find_all_roots() can get correct result - * using current root, then we can move all expensive backref walk out of - * transaction committing, but not now as qgroup accounting will be wrong again. - */ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, struct btrfs_qgroup_extent_record *qrecord); - -/* - * Inform qgroup to trace one dirty extent, specified by @bytenr and - * @num_bytes. - * So qgroup can account it at commit trans time. - * - * Better encapsulated version, with memory allocation and backref walk for - * commit roots. - * So this can sleep. - * - * Return 0 if the operation is done. - * Return <0 for error, like memory allocation failure or invalid parameter - * (NULL trans) - */ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes); - -/* - * Inform qgroup to trace all leaf items of data - * - * Return 0 for success - * Return <0 for error(ENOMEM) - */ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, struct extent_buffer *eb); -/* - * Inform qgroup to trace a whole subtree, including all its child tree - * blocks and data. - * The root tree block is specified by @root_eb. - * - * Normally used by relocation(tree block swap) and subvolume deletion. - * - * Return 0 for success - * Return <0 for error(ENOMEM or tree search error) - */ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *root_eb, u64 root_gen, int root_level); @@ -350,7 +344,8 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans); int btrfs_run_qgroups(struct btrfs_trans_handle *trans); int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid, - u64 objectid, struct btrfs_qgroup_inherit *inherit); + u64 objectid, u64 inode_rootid, + struct btrfs_qgroup_inherit *inherit); void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes, enum btrfs_qgroup_rsv_type type); @@ -408,20 +403,8 @@ static inline void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root, BTRFS_QGROUP_RSV_META_PREALLOC); } -/* - * Per-transaction meta reservation should be all freed at transaction commit - * time - */ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root); - -/* - * Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS. - * - * This is called when preallocated meta reservation needs to be used. - * Normally after btrfs_join_transaction() call. - */ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes); - void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode); /* btrfs_qgroup_swapped_blocks related functions */ @@ -439,5 +422,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb); void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans); bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info); +int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, + struct btrfs_squota_delta *delta); #endif diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c new file mode 100644 index 000000000000..944e8f1862aa --- /dev/null +++ b/fs/btrfs/raid-stripe-tree.c @@ -0,0 +1,274 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2023 Western Digital Corporation or its affiliates. + */ + +#include <linux/btrfs_tree.h> +#include "ctree.h" +#include "fs.h" +#include "accessors.h" +#include "transaction.h" +#include "disk-io.h" +#include "raid-stripe-tree.h" +#include "volumes.h" +#include "misc.h" +#include "print-tree.h" + +int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_root *stripe_root = fs_info->stripe_root; + struct btrfs_path *path; + struct btrfs_key key; + struct extent_buffer *leaf; + u64 found_start; + u64 found_end; + u64 end = start + length; + int slot; + int ret; + + if (!stripe_root) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (1) { + key.objectid = start; + key.type = BTRFS_RAID_STRIPE_KEY; + key.offset = length; + + ret = btrfs_search_slot(trans, stripe_root, &key, path, -1, 1); + if (ret < 0) + break; + if (ret > 0) { + ret = 0; + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(leaf, &key, slot); + found_start = key.objectid; + found_end = found_start + key.offset; + + /* That stripe ends before we start, we're done. */ + if (found_end <= start) + break; + + trace_btrfs_raid_extent_delete(fs_info, start, end, + found_start, found_end); + + ASSERT(found_start >= start && found_end <= end); + ret = btrfs_del_item(trans, stripe_root, path); + if (ret) + break; + + btrfs_release_path(path); + } + + btrfs_free_path(path); + return ret; +} + +static int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_io_context *bioc) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_key stripe_key; + struct btrfs_root *stripe_root = fs_info->stripe_root; + const int num_stripes = btrfs_bg_type_to_factor(bioc->map_type); + u8 encoding = btrfs_bg_flags_to_raid_index(bioc->map_type); + struct btrfs_stripe_extent *stripe_extent; + const size_t item_size = struct_size(stripe_extent, strides, num_stripes); + int ret; + + stripe_extent = kzalloc(item_size, GFP_NOFS); + if (!stripe_extent) { + btrfs_abort_transaction(trans, -ENOMEM); + btrfs_end_transaction(trans); + return -ENOMEM; + } + + trace_btrfs_insert_one_raid_extent(fs_info, bioc->logical, bioc->size, + num_stripes); + btrfs_set_stack_stripe_extent_encoding(stripe_extent, encoding); + for (int i = 0; i < num_stripes; i++) { + u64 devid = bioc->stripes[i].dev->devid; + u64 physical = bioc->stripes[i].physical; + u64 length = bioc->stripes[i].length; + struct btrfs_raid_stride *raid_stride = &stripe_extent->strides[i]; + + if (length == 0) + length = bioc->size; + + btrfs_set_stack_raid_stride_devid(raid_stride, devid); + btrfs_set_stack_raid_stride_physical(raid_stride, physical); + } + + stripe_key.objectid = bioc->logical; + stripe_key.type = BTRFS_RAID_STRIPE_KEY; + stripe_key.offset = bioc->size; + + ret = btrfs_insert_item(trans, stripe_root, &stripe_key, stripe_extent, + item_size); + if (ret) + btrfs_abort_transaction(trans, ret); + + kfree(stripe_extent); + + return ret; +} + +int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_ordered_extent *ordered_extent) +{ + struct btrfs_io_context *bioc; + int ret; + + if (!btrfs_fs_incompat(trans->fs_info, RAID_STRIPE_TREE)) + return 0; + + list_for_each_entry(bioc, &ordered_extent->bioc_list, rst_ordered_entry) { + ret = btrfs_insert_one_raid_extent(trans, bioc); + if (ret) + return ret; + } + + while (!list_empty(&ordered_extent->bioc_list)) { + bioc = list_first_entry(&ordered_extent->bioc_list, + typeof(*bioc), rst_ordered_entry); + list_del(&bioc->rst_ordered_entry); + btrfs_put_bioc(bioc); + } + + return ret; +} + +int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, + u64 logical, u64 *length, u64 map_type, + u32 stripe_index, struct btrfs_io_stripe *stripe) +{ + struct btrfs_root *stripe_root = fs_info->stripe_root; + struct btrfs_stripe_extent *stripe_extent; + struct btrfs_key stripe_key; + struct btrfs_key found_key; + struct btrfs_path *path; + struct extent_buffer *leaf; + const u64 end = logical + *length; + int num_stripes; + u8 encoding; + u64 offset; + u64 found_logical; + u64 found_length; + u64 found_end; + int slot; + int ret; + + stripe_key.objectid = logical; + stripe_key.type = BTRFS_RAID_STRIPE_KEY; + stripe_key.offset = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + if (stripe->is_scrub) { + path->skip_locking = 1; + path->search_commit_root = 1; + } + + ret = btrfs_search_slot(NULL, stripe_root, &stripe_key, path, 0, 0); + if (ret < 0) + goto free_path; + if (ret) { + if (path->slots[0] != 0) + path->slots[0]--; + } + + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(leaf, &found_key, slot); + found_logical = found_key.objectid; + found_length = found_key.offset; + found_end = found_logical + found_length; + + if (found_logical > end) { + ret = -ENOENT; + goto out; + } + + if (in_range(logical, found_logical, found_length)) + break; + + ret = btrfs_next_item(stripe_root, path); + if (ret) + goto out; + } + + offset = logical - found_logical; + + /* + * If we have a logically contiguous, but physically non-continuous + * range, we need to split the bio. Record the length after which we + * must split the bio. + */ + if (end > found_end) + *length -= end - found_end; + + num_stripes = btrfs_num_raid_stripes(btrfs_item_size(leaf, slot)); + stripe_extent = btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent); + encoding = btrfs_stripe_extent_encoding(leaf, stripe_extent); + + if (encoding != btrfs_bg_flags_to_raid_index(map_type)) { + ret = -EUCLEAN; + btrfs_handle_fs_error(fs_info, ret, + "on-disk stripe encoding %d doesn't match RAID index %d", + encoding, + btrfs_bg_flags_to_raid_index(map_type)); + goto out; + } + + for (int i = 0; i < num_stripes; i++) { + struct btrfs_raid_stride *stride = &stripe_extent->strides[i]; + u64 devid = btrfs_raid_stride_devid(leaf, stride); + u64 physical = btrfs_raid_stride_physical(leaf, stride); + + if (devid != stripe->dev->devid) + continue; + + if ((map_type & BTRFS_BLOCK_GROUP_DUP) && stripe_index != i) + continue; + + stripe->physical = physical + offset; + + trace_btrfs_get_raid_extent_offset(fs_info, logical, *length, + stripe->physical, devid); + + ret = 0; + goto free_path; + } + + /* If we're here, we haven't found the requested devid in the stripe. */ + ret = -ENOENT; +out: + if (ret > 0) + ret = -ENOENT; + if (ret && ret != -EIO && !stripe->is_scrub) { + if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) + btrfs_print_tree(leaf, 1); + btrfs_err(fs_info, + "cannot find raid-stripe for logical [%llu, %llu] devid %llu, profile %s", + logical, logical + *length, stripe->dev->devid, + btrfs_bg_type_to_raid_name(map_type)); + } +free_path: + btrfs_free_path(path); + + return ret; +} diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h new file mode 100644 index 000000000000..cdb58b38fcb5 --- /dev/null +++ b/fs/btrfs/raid-stripe-tree.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2023 Western Digital Corporation or its affiliates. + */ + +#ifndef BTRFS_RAID_STRIPE_TREE_H +#define BTRFS_RAID_STRIPE_TREE_H + +#define BTRFS_RST_SUPP_BLOCK_GROUP_MASK (BTRFS_BLOCK_GROUP_DUP | \ + BTRFS_BLOCK_GROUP_RAID1_MASK | \ + BTRFS_BLOCK_GROUP_RAID0 | \ + BTRFS_BLOCK_GROUP_RAID10) + +struct btrfs_io_context; +struct btrfs_io_stripe; +struct btrfs_ordered_extent; +struct btrfs_trans_handle; + +int btrfs_delete_raid_extent(struct btrfs_trans_handle *trans, u64 start, u64 length); +int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, + u64 logical, u64 *length, u64 map_type, + u32 stripe_index, struct btrfs_io_stripe *stripe); +int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans, + struct btrfs_ordered_extent *ordered_extent); + +static inline bool btrfs_need_stripe_tree_update(struct btrfs_fs_info *fs_info, + u64 map_type) +{ + u64 type = map_type & BTRFS_BLOCK_GROUP_TYPE_MASK; + u64 profile = map_type & BTRFS_BLOCK_GROUP_PROFILE_MASK; + + if (!btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) + return false; + + if (type != BTRFS_BLOCK_GROUP_DATA) + return false; + + if (profile & BTRFS_RST_SUPP_BLOCK_GROUP_MASK) + return true; + + return false; +} + +static inline int btrfs_num_raid_stripes(u32 item_size) +{ + return (item_size - offsetof(struct btrfs_stripe_extent, strides)) / + sizeof(struct btrfs_raid_stride); +} + +#endif diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 95d28497de7c..1f62976bee82 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -485,6 +485,9 @@ static int process_extent_item(struct btrfs_fs_info *fs_info, ret = add_shared_data_ref(fs_info, offset, count, key->objectid, key->offset); break; + case BTRFS_EXTENT_OWNER_REF_KEY: + WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + break; default: btrfs_err(fs_info, "invalid key type in iref"); ret = -EINVAL; @@ -652,7 +655,7 @@ static void dump_block_entry(struct btrfs_fs_info *fs_info, } /* - * btrfs_ref_tree_mod: called when we modify a ref for a bytenr + * Called when we modify a ref for a bytenr. * * This will add an action item to the given bytenr and do sanity checks to make * sure we haven't messed something up. If we are making a new allocation and @@ -681,10 +684,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, if (generic_ref->type == BTRFS_REF_METADATA) { if (!parent) - ref_root = generic_ref->tree_ref.owning_root; + ref_root = generic_ref->tree_ref.ref_root; owner = generic_ref->tree_ref.level; } else if (!parent) { - ref_root = generic_ref->data_ref.owning_root; + ref_root = generic_ref->data_ref.ref_root; owner = generic_ref->data_ref.ino; offset = generic_ref->data_ref.offset; } diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 65d2bd6910f2..f88b0c2ac3fe 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -25,12 +25,11 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, const u64 olen, int no_time_update) { - struct btrfs_root *root = BTRFS_I(inode)->root; int ret; inode_inc_iversion(inode); if (!no_time_update) { - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); } /* * We round up to the block size at eof when determining which @@ -43,7 +42,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); } - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 9951a0caf5bb..f5d9e5f74a52 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -111,8 +111,8 @@ struct tree_block { }; /* Use rb_simple_node for search/insert */ u64 owner; struct btrfs_key key; - unsigned int level:8; - unsigned int key_ready:1; + u8 level; + bool key_ready; }; #define MAX_EXTENTS 128 @@ -122,6 +122,13 @@ struct file_extent_cluster { u64 end; u64 boundary[MAX_EXTENTS]; unsigned int nr; + u64 owning_root; +}; + +/* Stages of data relocation. */ +enum reloc_stage { + MOVE_DATA_EXTENTS, + UPDATE_DATA_PTRS }; struct reloc_control { @@ -155,16 +162,12 @@ struct reloc_control { u64 search_start; u64 extents_found; - unsigned int stage:8; - unsigned int create_reloc_tree:1; - unsigned int merge_reloc_tree:1; - unsigned int found_file_extent:1; + enum reloc_stage stage; + bool create_reloc_tree; + bool merge_reloc_tree; + bool found_file_extent; }; -/* stages of data relocation */ -#define MOVE_DATA_EXTENTS 0 -#define UPDATE_DATA_PTRS 1 - static void mark_block_processed(struct reloc_control *rc, struct btrfs_backref_node *node) { @@ -180,13 +183,6 @@ static void mark_block_processed(struct reloc_control *rc, node->processed = 1; } - -static void mapping_tree_init(struct mapping_tree *tree) -{ - tree->rb_root = RB_ROOT; - spin_lock_init(&tree->lock); -} - /* * walk up backref nodes until reach node presents tree root */ @@ -299,7 +295,7 @@ static int update_backref_cache(struct btrfs_trans_handle *trans, return 1; } -static bool reloc_root_is_dead(struct btrfs_root *root) +static bool reloc_root_is_dead(const struct btrfs_root *root) { /* * Pair with set_bit/clear_bit in clean_dirty_subvols and @@ -320,7 +316,7 @@ static bool reloc_root_is_dead(struct btrfs_root *root) * from no reloc root. But btrfs_should_ignore_reloc_root() below is a * special case. */ -static bool have_reloc_root(struct btrfs_root *root) +static bool have_reloc_root(const struct btrfs_root *root) { if (reloc_root_is_dead(root)) return false; @@ -329,31 +325,30 @@ static bool have_reloc_root(struct btrfs_root *root) return true; } -int btrfs_should_ignore_reloc_root(struct btrfs_root *root) +bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root) { struct btrfs_root *reloc_root; if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) - return 0; + return false; /* This root has been merged with its reloc tree, we can ignore it */ if (reloc_root_is_dead(root)) - return 1; + return true; reloc_root = root->reloc_root; if (!reloc_root) - return 0; + return false; if (btrfs_header_generation(reloc_root->commit_root) == root->fs_info->running_transaction->transid) - return 0; + return false; /* - * if there is reloc tree and it was created in previous - * transaction backref lookup can find the reloc tree, - * so backref node for the fs tree root is useless for - * relocation. + * If there is reloc tree and it was created in previous transaction + * backref lookup can find the reloc tree, so backref node for the fs + * tree root is useless for relocation. */ - return 1; + return true; } /* @@ -466,6 +461,7 @@ static bool handle_useless_nodes(struct reloc_control *rc, * cached. */ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( + struct btrfs_trans_handle *trans, struct reloc_control *rc, struct btrfs_key *node_key, int level, u64 bytenr) { @@ -499,8 +495,8 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( /* Breadth-first search to build backref cache */ do { - ret = btrfs_backref_add_tree_node(cache, path, iter, node_key, - cur); + ret = btrfs_backref_add_tree_node(trans, cache, path, iter, + node_key, cur); if (ret < 0) { err = ret; goto out; @@ -546,7 +542,7 @@ out: */ static int clone_backref_node(struct btrfs_trans_handle *trans, struct reloc_control *rc, - struct btrfs_root *src, + const struct btrfs_root *src, struct btrfs_root *dest) { struct btrfs_root *reloc_root = src->reloc_root; @@ -631,7 +627,7 @@ fail: /* * helper to add 'address of tree root -> reloc tree' mapping */ -static int __must_check __add_reloc_root(struct btrfs_root *root) +static int __add_reloc_root(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *rb_node; @@ -1158,7 +1154,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(leaf, fi); btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, - num_bytes, parent); + num_bytes, parent, root->root_key.objectid); btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), key.objectid, key.offset, root->root_key.objectid, false); @@ -1169,7 +1165,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, } btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, - num_bytes, parent); + num_bytes, parent, root->root_key.objectid); btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), key.objectid, key.offset, root->root_key.objectid, false); @@ -1180,15 +1176,15 @@ int replace_file_extents(struct btrfs_trans_handle *trans, } } if (dirty) - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); if (inode) btrfs_add_delayed_iput(BTRFS_I(inode)); return ret; } -static noinline_for_stack -int memcmp_node_keys(struct extent_buffer *eb, int slot, - struct btrfs_path *path, int level) +static noinline_for_stack int memcmp_node_keys(const struct extent_buffer *eb, + int slot, const struct btrfs_path *path, + int level) { struct btrfs_disk_key key1; struct btrfs_disk_key key2; @@ -1373,16 +1369,17 @@ again: */ btrfs_set_node_blockptr(parent, slot, new_bytenr); btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen); - btrfs_mark_buffer_dirty(parent); + btrfs_mark_buffer_dirty(trans, parent); btrfs_set_node_blockptr(path->nodes[level], path->slots[level], old_bytenr); btrfs_set_node_ptr_generation(path->nodes[level], path->slots[level], old_ptr_gen); - btrfs_mark_buffer_dirty(path->nodes[level]); + btrfs_mark_buffer_dirty(trans, path->nodes[level]); btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr, - blocksize, path->nodes[level]->start); + blocksize, path->nodes[level]->start, + src->root_key.objectid); btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid, 0, true); ret = btrfs_inc_extent_ref(trans, &ref); @@ -1391,7 +1388,7 @@ again: break; } btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, - blocksize, 0); + blocksize, 0, dest->root_key.objectid); btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0, true); ret = btrfs_inc_extent_ref(trans, &ref); @@ -1400,8 +1397,9 @@ again: break; } + /* We don't know the real owning_root, use 0. */ btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr, - blocksize, path->nodes[level]->start); + blocksize, path->nodes[level]->start, 0); btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid, 0, true); ret = btrfs_free_extent(trans, &ref); @@ -1410,8 +1408,9 @@ again: break; } + /* We don't know the real owning_root, use 0. */ btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr, - blocksize, 0); + blocksize, 0, 0); btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0, true); ret = btrfs_free_extent(trans, &ref); @@ -1517,8 +1516,8 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, * [min_key, max_key) */ static int invalidate_extent_cache(struct btrfs_root *root, - struct btrfs_key *min_key, - struct btrfs_key *max_key) + const struct btrfs_key *min_key, + const struct btrfs_key *max_key) { struct btrfs_fs_info *fs_info = root->fs_info; struct inode *inode = NULL; @@ -1896,7 +1895,7 @@ again: } } - rc->merge_reloc_tree = 1; + rc->merge_reloc_tree = true; while (!list_empty(&rc->reloc_roots)) { reloc_root = list_entry(rc->reloc_roots.next, @@ -2516,11 +2515,12 @@ static int do_relocation(struct btrfs_trans_handle *trans, node->eb->start); btrfs_set_node_ptr_generation(upper->eb, slot, trans->transid); - btrfs_mark_buffer_dirty(upper->eb); + btrfs_mark_buffer_dirty(trans, upper->eb); btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, node->eb->start, blocksize, - upper->eb->start); + upper->eb->start, + btrfs_header_owner(upper->eb)); btrfs_init_tree_ref(&ref, node->level, btrfs_header_owner(upper->eb), root->root_key.objectid, false); @@ -2632,7 +2632,7 @@ static int tree_block_processed(u64 bytenr, struct reloc_control *rc) u32 blocksize = rc->extent_root->fs_info->nodesize; if (test_range_bit(&rc->processed_blocks, bytenr, - bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) + bytenr + blocksize - 1, EXTENT_DIRTY, NULL)) return 1; return 0; } @@ -2659,7 +2659,7 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info, else btrfs_node_key_to_cpu(eb, &block->key, 0); free_extent_buffer(eb); - block->key_ready = 1; + block->key_ready = true; return 0; } @@ -2803,7 +2803,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, /* Do tree relocation */ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) { - node = build_backref_tree(rc, &block->key, + node = build_backref_tree(trans, rc, &block->key, block->level, block->bytenr); if (IS_ERR(node)) { err = PTR_ERR(node); @@ -2829,7 +2829,7 @@ out_free_blocks: static noinline_for_stack int prealloc_file_extent_cluster( struct btrfs_inode *inode, - struct file_extent_cluster *cluster) + const struct file_extent_cluster *cluster) { u64 alloc_hint = 0; u64 start; @@ -2964,7 +2964,7 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod /* * Allow error injection to test balance/relocation cancellation */ -noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) +noinline int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info) { return atomic_read(&fs_info->balance_cancel_req) || atomic_read(&fs_info->reloc_cancel_req) || @@ -2972,7 +2972,7 @@ noinline int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info) } ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE); -static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster, +static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster, int cluster_nr) { /* Last extent, use cluster end directly */ @@ -2984,7 +2984,7 @@ static u64 get_cluster_boundary_end(struct file_extent_cluster *cluster, } static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, - struct file_extent_cluster *cluster, + const struct file_extent_cluster *cluster, int *cluster_nr, unsigned long page_index) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -3119,7 +3119,7 @@ release_page: } static int relocate_file_extent_cluster(struct inode *inode, - struct file_extent_cluster *cluster) + const struct file_extent_cluster *cluster) { u64 offset = BTRFS_I(inode)->index_cnt; unsigned long index; @@ -3157,11 +3157,12 @@ out: return ret; } -static noinline_for_stack -int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, - struct file_extent_cluster *cluster) +static noinline_for_stack int relocate_data_extent(struct inode *inode, + const struct btrfs_key *extent_key, + struct file_extent_cluster *cluster) { int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) { ret = relocate_file_extent_cluster(inode, cluster); @@ -3170,8 +3171,38 @@ int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, cluster->nr = 0; } - if (!cluster->nr) + /* + * Under simple quotas, we set root->relocation_src_root when we find + * the extent. If adjacent extents have different owners, we can't merge + * them while relocating. Handle this by storing the owning root that + * started a cluster and if we see an extent from a different root break + * cluster formation (just like the above case of non-adjacent extents). + * + * Without simple quotas, relocation_src_root is always 0, so we should + * never see a mismatch, and it should have no effect on relocation + * clusters. + */ + if (cluster->nr > 0 && cluster->owning_root != root->relocation_src_root) { + u64 tmp = root->relocation_src_root; + + /* + * root->relocation_src_root is the state that actually affects + * the preallocation we do here, so set it to the root owning + * the cluster we need to relocate. + */ + root->relocation_src_root = cluster->owning_root; + ret = relocate_file_extent_cluster(inode, cluster); + if (ret) + return ret; + cluster->nr = 0; + /* And reset it back for the current extent's owning root. */ + root->relocation_src_root = tmp; + } + + if (!cluster->nr) { cluster->start = extent_key->objectid; + cluster->owning_root = root->relocation_src_root; + } else BUG_ON(cluster->nr >= MAX_EXTENTS); cluster->end = extent_key->objectid + extent_key->offset - 1; @@ -3192,7 +3223,7 @@ int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key, * the major work is getting the generation and level of the block */ static int add_tree_block(struct reloc_control *rc, - struct btrfs_key *extent_key, + const struct btrfs_key *extent_key, struct btrfs_path *path, struct rb_root *blocks) { @@ -3277,7 +3308,7 @@ static int add_tree_block(struct reloc_control *rc, block->key.objectid = rc->extent_root->fs_info->nodesize; block->key.offset = generation; block->level = level; - block->key_ready = 0; + block->key_ready = false; block->owner = owner; rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node); @@ -3443,11 +3474,10 @@ static int delete_v1_space_cache(struct extent_buffer *leaf, /* * helper to find all tree blocks that reference a given data extent */ -static noinline_for_stack -int add_data_references(struct reloc_control *rc, - struct btrfs_key *extent_key, - struct btrfs_path *path, - struct rb_root *blocks) +static noinline_for_stack int add_data_references(struct reloc_control *rc, + const struct btrfs_key *extent_key, + struct btrfs_path *path, + struct rb_root *blocks) { struct btrfs_backref_walk_ctx ctx = { 0 }; struct ulist_iterator leaf_uiter; @@ -3621,7 +3651,7 @@ int prepare_to_relocate(struct reloc_control *rc) if (ret) return ret; - rc->create_reloc_tree = 1; + rc->create_reloc_tree = true; set_reloc_control(rc); trans = btrfs_join_transaction(rc->extent_root); @@ -3701,6 +3731,21 @@ restart: struct btrfs_extent_item); flags = btrfs_extent_flags(path->nodes[0], ei); + /* + * If we are relocating a simple quota owned extent item, we + * need to note the owner on the reloc data root so that when + * we allocate the replacement item, we can attribute it to the + * correct eventual owner (rather than the reloc data root). + */ + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) { + struct btrfs_root *root = BTRFS_I(rc->data_inode)->root; + u64 owning_root_id = btrfs_get_extent_owner_root(fs_info, + path->nodes[0], + path->slots[0]); + + root->relocation_src_root = owning_root_id; + } + if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { ret = add_tree_block(rc, &key, path, &blocks); } else if (rc->stage == UPDATE_DATA_PTRS && @@ -3733,7 +3778,7 @@ restart: if (rc->stage == MOVE_DATA_EXTENTS && (flags & BTRFS_EXTENT_FLAG_DATA)) { - rc->found_file_extent = 1; + rc->found_file_extent = true; ret = relocate_data_extent(rc->data_inode, &key, &rc->cluster); if (ret < 0) { @@ -3770,7 +3815,7 @@ restart: err = ret; } - rc->create_reloc_tree = 0; + rc->create_reloc_tree = false; set_reloc_control(rc); btrfs_backref_release_cache(&rc->backref_cache); @@ -3788,7 +3833,7 @@ restart: merge_reloc_roots(rc); - rc->merge_reloc_tree = 0; + rc->merge_reloc_tree = false; unset_reloc_control(rc); btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); @@ -3834,7 +3879,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); return ret; @@ -3873,9 +3918,9 @@ out: * helper to create inode for data relocation. * the inode is in data relocation tree and its link count is 0 */ -static noinline_for_stack -struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, - struct btrfs_block_group *group) +static noinline_for_stack struct inode *create_reloc_inode( + struct btrfs_fs_info *fs_info, + const struct btrfs_block_group *group) { struct inode *inode = NULL; struct btrfs_trans_handle *trans; @@ -3970,8 +4015,9 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&rc->reloc_roots); INIT_LIST_HEAD(&rc->dirty_subvol_roots); - btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1); - mapping_tree_init(&rc->reloc_root_tree); + btrfs_backref_init_cache(fs_info, &rc->backref_cache, true); + rc->reloc_root_tree.rb_root = RB_ROOT; + spin_lock_init(&rc->reloc_root_tree.lock); extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS); return rc; } @@ -4003,7 +4049,7 @@ static void describe_relocation(struct btrfs_fs_info *fs_info, block_group->start, buf); } -static const char *stage_to_string(int stage) +static const char *stage_to_string(enum reloc_stage stage) { if (stage == MOVE_DATA_EXTENTS) return "move data extents"; @@ -4119,7 +4165,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) WARN_ON(ret && ret != -EAGAIN); while (1) { - int finishes_stage; + enum reloc_stage finishes_stage; mutex_lock(&fs_info->cleaner_mutex); ret = relocate_block_group(rc); @@ -4302,7 +4348,7 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) goto out_unset; } - rc->merge_reloc_tree = 1; + rc->merge_reloc_tree = true; while (!list_empty(&reloc_roots)) { reloc_root = list_entry(reloc_roots.next, @@ -4421,7 +4467,8 @@ int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered) } int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *buf, + struct btrfs_root *root, + const struct extent_buffer *buf, struct extent_buffer *cow) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -4560,7 +4607,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, * * Return U64_MAX if no running relocation. */ -u64 btrfs_get_reloc_bg_bytenr(struct btrfs_fs_info *fs_info) +u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info) { u64 logical = U64_MAX; diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h index 77d69f6ae967..5fb60f2deb53 100644 --- a/fs/btrfs/relocation.h +++ b/fs/btrfs/relocation.h @@ -10,15 +10,16 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, int btrfs_recover_relocation(struct btrfs_fs_info *fs_info); int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered); int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *buf, + struct btrfs_root *root, + const struct extent_buffer *buf, struct extent_buffer *cow); void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, u64 *bytes_to_reserve); int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending); -int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info); +int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info); struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr); -int btrfs_should_ignore_reloc_root(struct btrfs_root *root); -u64 btrfs_get_reloc_bg_bytenr(struct btrfs_fs_info *fs_info); +bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root); +u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 859874579456..603ad1459368 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -51,7 +51,8 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, } /* - * btrfs_find_root - lookup the root by the key. + * Lookup the root by the key. + * * root: the root of the root tree * search_key: the key to search * path: the path we search @@ -191,7 +192,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root btrfs_set_root_generation_v2(item, btrfs_root_generation(item)); write_extent_buffer(l, item, ptr, sizeof(*item)); - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); out: btrfs_free_path(path); return ret; @@ -438,7 +439,7 @@ again: btrfs_set_root_ref_name_len(leaf, ref, name->len); ptr = (unsigned long)(ref + 1); write_extent_buffer(leaf, name->name, ptr, name->len); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); if (key.type == BTRFS_ROOT_BACKREF_KEY) { btrfs_release_path(path); @@ -485,7 +486,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, } /* - * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation + * Reserve space for subvolume operation. + * * root: the root of the parent directory * rsv: block reservation * items: the number of items that we need do reservation @@ -508,7 +510,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) { + if (btrfs_qgroup_enabled(fs_info)) { /* One for parent inode, two for dir entries */ qgroup_num_bytes = 3 * fs_info->nodesize; ret = btrfs_qgroup_reserve_meta_prealloc(root, diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h index cbbaca32126e..8b2c3859e464 100644 --- a/fs/btrfs/root-tree.h +++ b/fs/btrfs/root-tree.h @@ -3,6 +3,8 @@ #ifndef BTRFS_ROOT_TREE_H #define BTRFS_ROOT_TREE_H +struct fscrypt_str; + int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); @@ -18,10 +20,8 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_root_item *item); -int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_key *key, - struct btrfs_root_item *item); +int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key, struct btrfs_root_item *item); int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, struct btrfs_path *path, struct btrfs_root_item *root_item, struct btrfs_key *root_key); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index b877203f1dc5..9ce5be21b036 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -16,7 +16,6 @@ #include "backref.h" #include "extent_io.h" #include "dev-replace.h" -#include "check-integrity.h" #include "raid56.h" #include "block-group.h" #include "zoned.h" @@ -24,6 +23,7 @@ #include "accessors.h" #include "file-item.h" #include "scrub.h" +#include "raid-stripe-tree.h" /* * This is only the first step towards a full-features scrub. It reads all @@ -897,7 +897,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, ASSERT(stripe->mirror_num >= 1); ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, stripe->logical, &mapped_len, &bioc, - NULL, NULL, 1); + NULL, NULL); /* * If we failed, dev will be NULL, and later detailed reports * will just be skipped. @@ -1635,6 +1635,71 @@ static void scrub_reset_stripe(struct scrub_stripe *stripe) } } +static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx, + struct scrub_stripe *stripe) +{ + struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + struct btrfs_bio *bbio = NULL; + u64 stripe_len = BTRFS_STRIPE_LEN; + int mirror = stripe->mirror_num; + int i; + + atomic_inc(&stripe->pending_io); + + for_each_set_bit(i, &stripe->extent_sector_bitmap, stripe->nr_sectors) { + struct page *page = scrub_stripe_get_page(stripe, i); + unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i); + + /* The current sector cannot be merged, submit the bio. */ + if (bbio && + ((i > 0 && + !test_bit(i - 1, &stripe->extent_sector_bitmap)) || + bbio->bio.bi_iter.bi_size >= stripe_len)) { + ASSERT(bbio->bio.bi_iter.bi_size); + atomic_inc(&stripe->pending_io); + btrfs_submit_bio(bbio, mirror); + bbio = NULL; + } + + if (!bbio) { + struct btrfs_io_stripe io_stripe = {}; + struct btrfs_io_context *bioc = NULL; + const u64 logical = stripe->logical + + (i << fs_info->sectorsize_bits); + int err; + + bbio = btrfs_bio_alloc(stripe->nr_sectors, REQ_OP_READ, + fs_info, scrub_read_endio, stripe); + bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT; + + io_stripe.is_scrub = true; + err = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, + &stripe_len, &bioc, &io_stripe, + &mirror); + btrfs_put_bioc(bioc); + if (err) { + btrfs_bio_end_io(bbio, + errno_to_blk_status(err)); + return; + } + } + + __bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); + } + + if (bbio) { + ASSERT(bbio->bio.bi_iter.bi_size); + atomic_inc(&stripe->pending_io); + btrfs_submit_bio(bbio, mirror); + } + + if (atomic_dec_and_test(&stripe->pending_io)) { + wake_up(&stripe->io_wait); + INIT_WORK(&stripe->work, scrub_stripe_read_repair_worker); + queue_work(stripe->bg->fs_info->scrub_workers, &stripe->work); + } +} + static void scrub_submit_initial_read(struct scrub_ctx *sctx, struct scrub_stripe *stripe) { @@ -1646,6 +1711,11 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, ASSERT(stripe->mirror_num > 0); ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); + if (btrfs_need_stripe_tree_update(fs_info, stripe->bg->flags)) { + scrub_submit_extent_sector_read(sctx, stripe); + return; + } + bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info, scrub_read_endio, stripe); @@ -1952,7 +2022,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, full_stripe_start, - &length, &bioc, NULL, NULL, 1); + &length, &bioc, NULL, NULL); if (ret < 0) { btrfs_put_bioc(bioc); btrfs_bio_counter_dec(fs_info); @@ -2717,7 +2787,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, if (scrub_dev->fs_devices != fs_info->fs_devices) gen = scrub_dev->generation; else - gen = fs_info->last_trans_committed; + gen = btrfs_get_last_trans_committed(fs_info); for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 3a566150c531..3b929f0e8f04 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -796,7 +796,7 @@ static int send_cmd(struct send_ctx *sctx) put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len); put_unaligned_le32(0, &hdr->crc); - crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); + crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); put_unaligned_le32(crc, &hdr->crc); ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, @@ -5669,8 +5669,8 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, hdr = (struct btrfs_cmd_header *)sctx->send_buf; hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr)); hdr->crc = 0; - crc = btrfs_crc32c(0, sctx->send_buf, sctx->send_size); - crc = btrfs_crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes); + crc = crc32c(0, sctx->send_buf, sctx->send_size); + crc = crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes); hdr->crc = cpu_to_le32(crc); ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index d7e8cd4f140c..571bb13587d5 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -345,8 +345,10 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, enum btrfs_reserve_flush_enum flush) { + struct btrfs_space_info *data_sinfo; u64 profile; u64 avail; + u64 data_chunk_size; int factor; if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) @@ -364,6 +366,36 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, */ factor = btrfs_bg_type_to_factor(profile); avail = div_u64(avail, factor); + if (avail == 0) + return 0; + + /* + * Calculate the data_chunk_size, space_info->chunk_size is the + * "optimal" chunk size based on the fs size. However when we actually + * allocate the chunk we will strip this down further, making it no more + * than 10% of the disk or 1G, whichever is smaller. + */ + data_sinfo = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); + data_chunk_size = min(data_sinfo->chunk_size, + mult_perc(fs_info->fs_devices->total_rw_bytes, 10)); + data_chunk_size = min_t(u64, data_chunk_size, SZ_1G); + + /* + * Since data allocations immediately use block groups as part of the + * reservation, because we assume that data reservations will == actual + * usage, we could potentially overcommit and then immediately have that + * available space used by a data allocation, which could put us in a + * bind when we get close to filling the file system. + * + * To handle this simply remove the data_chunk_size from the available + * space. If we are relatively empty this won't affect our ability to + * overcommit much, and if we're very close to full it'll keep us from + * getting into a position where we've given ourselves very little + * metadata wiggle room. + */ + if (avail <= data_chunk_size) + return 0; + avail -= data_chunk_size; /* * If we aren't flushing all things, let us overcommit up to @@ -556,18 +588,6 @@ static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info, return nr; } -static inline u64 calc_delayed_refs_nr(const struct btrfs_fs_info *fs_info, - u64 to_reclaim) -{ - const u64 bytes = btrfs_calc_delayed_ref_bytes(fs_info, 1); - u64 nr; - - nr = div64_u64(to_reclaim, bytes); - if (!nr) - nr = 1; - return nr; -} - #define EXTENT_SIZE_PER_ITEM SZ_256K /* @@ -749,10 +769,9 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; } if (state == FLUSH_DELAYED_REFS_NR) - nr = calc_delayed_refs_nr(fs_info, num_bytes); + btrfs_run_delayed_refs(trans, num_bytes); else - nr = 0; - btrfs_run_delayed_refs(trans, nr); + btrfs_run_delayed_refs(trans, 0); btrfs_end_transaction(trans); break; case ALLOC_CHUNK: @@ -978,7 +997,8 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info, } /* - * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets + * We've exhausted our flushing, start failing tickets. + * * @fs_info - fs_info for this fs * @space_info - the space info we were flushing * @@ -1742,7 +1762,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * Try to reserve metadata bytes from the block_rsv's space. * * @fs_info: the filesystem - * @block_rsv: block_rsv we're allocating for + * @space_info: the space_info we're allocating for * @orig_bytes: number of bytes we want * @flush: whether or not we can flush to make our reservation * @@ -1754,21 +1774,19 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * space already. */ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, + struct btrfs_space_info *space_info, u64 orig_bytes, enum btrfs_reserve_flush_enum flush) { int ret; - ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush); + ret = __reserve_bytes(fs_info, space_info, orig_bytes, flush); if (ret == -ENOSPC) { trace_btrfs_space_reservation(fs_info, "space_info:enospc", - block_rsv->space_info->flags, - orig_bytes, 1); + space_info->flags, orig_bytes, 1); if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - btrfs_dump_space_info(fs_info, block_rsv->space_info, - orig_bytes, 0); + btrfs_dump_space_info(fs_info, space_info, orig_bytes, 0); } return ret; } diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 0bb9d14e60a8..92c595fed1b0 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -3,6 +3,7 @@ #ifndef BTRFS_SPACE_INFO_H #define BTRFS_SPACE_INFO_H +#include <trace/events/btrfs.h> #include "volumes.h" /* @@ -212,7 +213,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *info, u64 bytes, int dump_block_groups); int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *block_rsv, + struct btrfs_space_info *space_info, u64 orig_bytes, enum btrfs_reserve_flush_enum flush); void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 09bfe68d2ea3..6ecf78d09694 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -26,6 +26,7 @@ #include <linux/ratelimit.h> #include <linux/crc32c.h> #include <linux/btrfs.h> +#include <linux/security.h> #include "messages.h" #include "delayed-inode.h" #include "ctree.h" @@ -129,9 +130,6 @@ enum { Opt_inode_cache, Opt_noinode_cache, /* Debugging options */ - Opt_check_integrity, - Opt_check_integrity_including_extent_data, - Opt_check_integrity_print_mask, Opt_enospc_debug, Opt_noenospc_debug, #ifdef CONFIG_BTRFS_DEBUG Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, @@ -200,9 +198,6 @@ static const match_table_t tokens = { {Opt_recovery, "recovery"}, /* Debugging options */ - {Opt_check_integrity, "check_int"}, - {Opt_check_integrity_including_extent_data, "check_int_data"}, - {Opt_check_integrity_print_mask, "check_int_print_mask=%u"}, {Opt_enospc_debug, "enospc_debug"}, {Opt_noenospc_debug, "noenospc_debug"}, #ifdef CONFIG_BTRFS_DEBUG @@ -707,44 +702,6 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_skip_balance: btrfs_set_opt(info->mount_opt, SKIP_BALANCE); break; -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - case Opt_check_integrity_including_extent_data: - btrfs_warn(info, - "integrity checker is deprecated and will be removed in 6.7"); - btrfs_info(info, - "enabling check integrity including extent data"); - btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY_DATA); - btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); - break; - case Opt_check_integrity: - btrfs_warn(info, - "integrity checker is deprecated and will be removed in 6.7"); - btrfs_info(info, "enabling check integrity"); - btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); - break; - case Opt_check_integrity_print_mask: - ret = match_int(&args[0], &intarg); - if (ret) { - btrfs_err(info, - "unrecognized check_integrity_print_mask value %s", - args[0].from); - goto out; - } - info->check_integrity_print_mask = intarg; - btrfs_warn(info, - "integrity checker is deprecated and will be removed in 6.7"); - btrfs_info(info, "check_integrity_print_mask 0x%x", - info->check_integrity_print_mask); - break; -#else - case Opt_check_integrity_including_extent_data: - case Opt_check_integrity: - case Opt_check_integrity_print_mask: - btrfs_err(info, - "support for check_integrity* not compiled in!"); - ret = -EINVAL; - goto out; -#endif case Opt_fatal_errors: if (strcmp(args[0].from, "panic") == 0) { btrfs_set_opt(info->mount_opt, @@ -889,7 +846,7 @@ static int btrfs_parse_device_options(const char *options, blk_mode_t flags) error = -ENOMEM; goto out; } - device = btrfs_scan_one_device(device_name, flags); + device = btrfs_scan_one_device(device_name, flags, false); kfree(device_name); if (IS_ERR(device)) { error = PTR_ERR(device); @@ -1305,15 +1262,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",autodefrag"); if (btrfs_test_opt(info, SKIP_BALANCE)) seq_puts(seq, ",skip_balance"); -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_test_opt(info, CHECK_INTEGRITY_DATA)) - seq_puts(seq, ",check_int_data"); - else if (btrfs_test_opt(info, CHECK_INTEGRITY)) - seq_puts(seq, ",check_int"); - if (info->check_integrity_print_mask) - seq_printf(seq, ",check_int_print_mask=%d", - info->check_integrity_print_mask); -#endif if (info->metadata_ratio) seq_printf(seq, ",metadata_ratio=%u", info->metadata_ratio); if (btrfs_test_opt(info, PANIC_ON_FATAL_ERROR)) @@ -1484,7 +1432,12 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, goto error_fs_info; } - device = btrfs_scan_one_device(device_name, mode); + /* + * With 'true' passed to btrfs_scan_one_device() (mount time) we expect + * either a valid device or an error. + */ + device = btrfs_scan_one_device(device_name, mode, true); + ASSERT(device != NULL); if (IS_ERR(device)) { mutex_unlock(&uuid_mutex); error = PTR_ERR(device); @@ -2117,7 +2070,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) * calculated f_bavail. */ if (!mixed && block_rsv->space_info->full && - total_free_meta - thresh < block_rsv->size) + (total_free_meta < thresh || total_free_meta - thresh < block_rsv->size)) buf->f_bavail = 0; buf->f_type = BTRFS_SUPER_MAGIC; @@ -2150,7 +2103,7 @@ static struct file_system_type btrfs_fs_type = { .name = "btrfs", .mount = btrfs_mount, .kill_sb = btrfs_kill_super, - .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_MGTIME, + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA, }; static struct file_system_type btrfs_root_fs_type = { @@ -2158,8 +2111,7 @@ static struct file_system_type btrfs_root_fs_type = { .name = "btrfs", .mount = btrfs_mount_root, .kill_sb = btrfs_kill_super, - .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | - FS_ALLOW_IDMAP | FS_MGTIME, + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("btrfs"); @@ -2197,7 +2149,11 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, switch (cmd) { case BTRFS_IOC_SCAN_DEV: mutex_lock(&uuid_mutex); - device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ); + /* + * Scanning outside of mount can return NULL which would turn + * into 0 error code. + */ + device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false); ret = PTR_ERR_OR_ZERO(device); mutex_unlock(&uuid_mutex); break; @@ -2211,8 +2167,12 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, break; case BTRFS_IOC_DEVICES_READY: mutex_lock(&uuid_mutex); - device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ); - if (IS_ERR(device)) { + /* + * Scanning outside of mount can return NULL which would turn + * into 0 error code. + */ + device = btrfs_scan_one_device(vol->name, BLK_OPEN_READ, false); + if (IS_ERR_OR_NULL(device)) { mutex_unlock(&uuid_mutex); ret = PTR_ERR(device); break; @@ -2257,6 +2217,7 @@ static int check_dev_super(struct btrfs_device *dev) { struct btrfs_fs_info *fs_info = dev->fs_info; struct btrfs_super_block *sb; + u64 last_trans; u16 csum_type; int ret = 0; @@ -2292,10 +2253,10 @@ static int check_dev_super(struct btrfs_device *dev) if (ret < 0) goto out; - if (btrfs_super_generation(sb) != fs_info->last_trans_committed) { + last_trans = btrfs_get_last_trans_committed(fs_info); + if (btrfs_super_generation(sb) != last_trans) { btrfs_err(fs_info, "transid mismatch, has %llu expect %llu", - btrfs_super_generation(sb), - fs_info->last_trans_committed); + btrfs_super_generation(sb), last_trans); ret = -EUCLEAN; goto out; } @@ -2405,9 +2366,6 @@ static int __init btrfs_print_mod_info(void) #ifdef CONFIG_BTRFS_ASSERT ", assert=on" #endif -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - ", integrity-checker=on" -#endif #ifdef CONFIG_BTRFS_FS_REF_VERIFY ", ref-verify=on" #endif diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index b1d1ac25237b..e6b51fb3ddc1 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -291,12 +291,15 @@ BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID); BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); BTRFS_FEAT_ATTR_COMPAT_RO(block_group_tree, BLOCK_GROUP_TREE); BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34); +BTRFS_FEAT_ATTR_INCOMPAT(simple_quota, SIMPLE_QUOTA); #ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); #endif #ifdef CONFIG_BTRFS_DEBUG /* Remove once support for extent tree v2 is feature complete */ BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2); +/* Remove once support for raid stripe tree is feature complete. */ +BTRFS_FEAT_ATTR_INCOMPAT(raid_stripe_tree, RAID_STRIPE_TREE); #endif #ifdef CONFIG_FS_VERITY BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY); @@ -322,11 +325,13 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(free_space_tree), BTRFS_FEAT_ATTR_PTR(raid1c34), BTRFS_FEAT_ATTR_PTR(block_group_tree), + BTRFS_FEAT_ATTR_PTR(simple_quota), #ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_PTR(zoned), #endif #ifdef CONFIG_BTRFS_DEBUG BTRFS_FEAT_ATTR_PTR(extent_tree_v2), + BTRFS_FEAT_ATTR_PTR(raid_stripe_tree), #endif #ifdef CONFIG_FS_VERITY BTRFS_FEAT_ATTR_PTR(verity), @@ -420,6 +425,13 @@ static ssize_t acl_show(struct kobject *kobj, struct kobj_attribute *a, char *bu } BTRFS_ATTR(static_feature, acl, acl_show); +static ssize_t temp_fsid_supported_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + return sysfs_emit(buf, "0\n"); +} +BTRFS_ATTR(static_feature, temp_fsid, temp_fsid_supported_show); + /* * Features which only depend on kernel version. * @@ -433,6 +445,7 @@ static struct attribute *btrfs_supported_static_feature_attrs[] = { BTRFS_ATTR_PTR(static_feature, send_stream_version), BTRFS_ATTR_PTR(static_feature, supported_rescue_options), BTRFS_ATTR_PTR(static_feature, supported_sectorsizes), + BTRFS_ATTR_PTR(static_feature, temp_fsid), NULL }; @@ -1196,10 +1209,19 @@ static ssize_t btrfs_generation_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return sysfs_emit(buf, "%llu\n", fs_info->generation); + return sysfs_emit(buf, "%llu\n", btrfs_get_fs_generation(fs_info)); } BTRFS_ATTR(, generation, btrfs_generation_show); +static ssize_t btrfs_temp_fsid_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return sysfs_emit(buf, "%d\n", fs_info->fs_devices->temp_fsid); +} +BTRFS_ATTR(, temp_fsid, btrfs_temp_fsid_show); + static const char * const btrfs_read_policy_name[] = { "pid" }; static ssize_t btrfs_read_policy_show(struct kobject *kobj, @@ -1302,6 +1324,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, read_policy), BTRFS_ATTR_PTR(, bg_reclaim_threshold), BTRFS_ATTR_PTR(, commit_stats), + BTRFS_ATTR_PTR(, temp_fsid), NULL, }; @@ -2086,6 +2109,33 @@ static ssize_t qgroup_enabled_show(struct kobject *qgroups_kobj, } BTRFS_ATTR(qgroups, enabled, qgroup_enabled_show); +static ssize_t qgroup_mode_show(struct kobject *qgroups_kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(qgroups_kobj->parent); + ssize_t ret = 0; + + spin_lock(&fs_info->qgroup_lock); + ASSERT(btrfs_qgroup_enabled(fs_info)); + switch (btrfs_qgroup_mode(fs_info)) { + case BTRFS_QGROUP_MODE_FULL: + ret = sysfs_emit(buf, "qgroup\n"); + break; + case BTRFS_QGROUP_MODE_SIMPLE: + ret = sysfs_emit(buf, "squota\n"); + break; + default: + btrfs_warn(fs_info, "unexpected qgroup mode %d\n", + btrfs_qgroup_mode(fs_info)); + break; + } + spin_unlock(&fs_info->qgroup_lock); + + return ret; +} +BTRFS_ATTR(qgroups, mode, qgroup_mode_show); + static ssize_t qgroup_inconsistent_show(struct kobject *qgroups_kobj, struct kobj_attribute *a, char *buf) @@ -2148,6 +2198,7 @@ static struct attribute *qgroups_attrs[] = { BTRFS_ATTR_PTR(qgroups, enabled), BTRFS_ATTR_PTR(qgroups, inconsistent), BTRFS_ATTR_PTR(qgroups, drop_subtree_threshold), + BTRFS_ATTR_PTR(qgroups, mode), NULL }; ATTRIBUTE_GROUPS(qgroups); diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index 5ef0b90e25c3..6a43a64ba55a 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -61,7 +61,11 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) key.type = BTRFS_EXTENT_CSUM_KEY; key.offset = 0; - btrfs_setup_item_for_insert(root, path, &key, value_len); + /* + * Passing a NULL trans handle is fine here, we have a dummy root eb + * and the tree is a single node (level 0). + */ + btrfs_setup_item_for_insert(NULL, root, path, &key, value_len); write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0), value_len); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 05b03f5eab83..492d69d2fa73 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -34,7 +34,11 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = start; - btrfs_setup_item_for_insert(root, &path, &key, value_len); + /* + * Passing a NULL trans handle is fine here, we have a dummy root eb + * and the tree is a single node (level 0). + */ + btrfs_setup_item_for_insert(NULL, root, &path, &key, value_len); fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, 1); btrfs_set_file_extent_type(leaf, fi, type); @@ -64,7 +68,11 @@ static void insert_inode_item_key(struct btrfs_root *root) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - btrfs_setup_item_for_insert(root, &path, &key, value_len); + /* + * Passing a NULL trans handle is fine here, we have a dummy root eb + * and the tree is a single node (level 0). + */ + btrfs_setup_item_for_insert(NULL, root, &path, &key, value_len); } /* diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 874e4394df86..6e63816dddcb 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -56,12 +56,17 @@ static struct kmem_cache *btrfs_trans_handle_cachep; * | Call btrfs_commit_transaction() on any trans handle attached to * | transaction N * V - * Transaction N [[TRANS_STATE_COMMIT_START]] + * Transaction N [[TRANS_STATE_COMMIT_PREP]] + * | + * | If there are simultaneous calls to btrfs_commit_transaction() one will win + * | the race and the rest will wait for the winner to commit the transaction. * | - * | Will wait for previous running transaction to completely finish if there - * | is one + * | The winner will wait for previous running transaction to completely finish + * | if there is one. * | - * | Then one of the following happes: + * Transaction N [[TRANS_STATE_COMMIT_START]] + * | + * | Then one of the following happens: * | - Wait for all other trans handle holders to release. * | The btrfs_commit_transaction() caller will do the commit work. * | - Wait for current transaction to be committed by others. @@ -112,6 +117,7 @@ static struct kmem_cache *btrfs_trans_handle_cachep; */ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { [TRANS_STATE_RUNNING] = 0U, + [TRANS_STATE_COMMIT_PREP] = 0U, [TRANS_STATE_COMMIT_START] = (__TRANS_START | __TRANS_ATTACH), [TRANS_STATE_COMMIT_DOING] = (__TRANS_START | __TRANS_ATTACH | @@ -380,7 +386,7 @@ loop: IO_TREE_TRANS_DIRTY_PAGES); extent_io_tree_init(fs_info, &cur_trans->pinned_extents, IO_TREE_FS_PINNED_EXTENTS); - fs_info->generation++; + btrfs_set_fs_generation(fs_info, fs_info->generation + 1); cur_trans->transid = fs_info->generation; fs_info->running_transaction = cur_trans; cur_trans->aborted = 0; @@ -555,6 +561,69 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root) return true; } +static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info, + enum btrfs_reserve_flush_enum flush, + u64 num_bytes, + u64 *delayed_refs_bytes) +{ + struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info; + u64 extra_delayed_refs_bytes = 0; + u64 bytes; + int ret; + + /* + * If there's a gap between the size of the delayed refs reserve and + * its reserved space, than some tasks have added delayed refs or bumped + * its size otherwise (due to block group creation or removal, or block + * group item update). Also try to allocate that gap in order to prevent + * using (and possibly abusing) the global reserve when committing the + * transaction. + */ + if (flush == BTRFS_RESERVE_FLUSH_ALL && + !btrfs_block_rsv_full(delayed_refs_rsv)) { + spin_lock(&delayed_refs_rsv->lock); + if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) + extra_delayed_refs_bytes = delayed_refs_rsv->size - + delayed_refs_rsv->reserved; + spin_unlock(&delayed_refs_rsv->lock); + } + + bytes = num_bytes + *delayed_refs_bytes + extra_delayed_refs_bytes; + + /* + * We want to reserve all the bytes we may need all at once, so we only + * do 1 enospc flushing cycle per transaction start. + */ + ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); + if (ret == 0) { + if (extra_delayed_refs_bytes > 0) + btrfs_migrate_to_delayed_refs_rsv(fs_info, + extra_delayed_refs_bytes); + return 0; + } + + if (extra_delayed_refs_bytes > 0) { + bytes -= extra_delayed_refs_bytes; + ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); + if (ret == 0) + return 0; + } + + /* + * If we are an emergency flush, which can steal from the global block + * reserve, then attempt to not reserve space for the delayed refs, as + * we will consume space for them from the global block reserve. + */ + if (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) { + bytes -= *delayed_refs_bytes; + *delayed_refs_bytes = 0; + ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); + } + + return ret; +} + static struct btrfs_trans_handle * start_transaction(struct btrfs_root *root, unsigned int num_items, unsigned int type, enum btrfs_reserve_flush_enum flush, @@ -562,10 +631,12 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; + struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv; struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; u64 num_bytes = 0; u64 qgroup_reserved = 0; + u64 delayed_refs_bytes = 0; bool reloc_reserved = false; bool do_chunk_alloc = false; int ret; @@ -588,9 +659,6 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, * the appropriate flushing if need be. */ if (num_items && root != fs_info->chunk_root) { - struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv; - u64 delayed_refs_bytes = 0; - qgroup_reserved = num_items * fs_info->nodesize; /* * Use prealloc for now, as there might be a currently running @@ -602,20 +670,16 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, if (ret) return ERR_PTR(ret); + num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items); /* - * We want to reserve all the bytes we may need all at once, so - * we only do 1 enospc flushing cycle per transaction start. We - * accomplish this by simply assuming we'll do num_items worth - * of delayed refs updates in this trans handle, and refill that - * amount for whatever is missing in the reserve. + * If we plan to insert/update/delete "num_items" from a btree, + * we will also generate delayed refs for extent buffers in the + * respective btree paths, so reserve space for the delayed refs + * that will be generated by the caller as it modifies btrees. + * Try to reserve them to avoid excessive use of the global + * block reserve. */ - num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items); - if (flush == BTRFS_RESERVE_FLUSH_ALL && - !btrfs_block_rsv_full(delayed_refs_rsv)) { - delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info, - num_items); - num_bytes += delayed_refs_bytes; - } + delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info, num_items); /* * Do the reservation for the relocation root creation @@ -625,16 +689,14 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, reloc_reserved = true; } - ret = btrfs_block_rsv_add(fs_info, rsv, num_bytes, flush); + ret = btrfs_reserve_trans_metadata(fs_info, flush, num_bytes, + &delayed_refs_bytes); if (ret) goto reserve_fail; - if (delayed_refs_bytes) { - btrfs_migrate_to_delayed_refs_rsv(fs_info, rsv, - delayed_refs_bytes); - num_bytes -= delayed_refs_bytes; - } - if (rsv->space_info->force_alloc) + btrfs_block_rsv_add_bytes(trans_rsv, num_bytes, true); + + if (trans_rsv->space_info->force_alloc) do_chunk_alloc = true; } else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL && !btrfs_block_rsv_full(delayed_refs_rsv)) { @@ -694,6 +756,7 @@ again: h->type = type; INIT_LIST_HEAD(&h->new_bgs); + btrfs_init_metadata_block_rsv(fs_info, &h->delayed_rsv, BTRFS_BLOCK_RSV_DELOPS); smp_mb(); if (cur_trans->state >= TRANS_STATE_COMMIT_START && @@ -706,8 +769,17 @@ again: if (num_bytes) { trace_btrfs_space_reservation(fs_info, "transaction", h->transid, num_bytes, 1); - h->block_rsv = &fs_info->trans_block_rsv; + h->block_rsv = trans_rsv; h->bytes_reserved = num_bytes; + if (delayed_refs_bytes > 0) { + trace_btrfs_space_reservation(fs_info, + "local_delayed_refs_rsv", + h->transid, + delayed_refs_bytes, 1); + h->delayed_refs_bytes_reserved = delayed_refs_bytes; + btrfs_block_rsv_add_bytes(&h->delayed_rsv, delayed_refs_bytes, true); + delayed_refs_bytes = 0; + } h->reloc_reserved = reloc_reserved; } @@ -763,8 +835,10 @@ join_fail: kmem_cache_free(btrfs_trans_handle_cachep, h); alloc_fail: if (num_bytes) - btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv, - num_bytes, NULL); + btrfs_block_rsv_release(fs_info, trans_rsv, num_bytes, NULL); + if (delayed_refs_bytes) + btrfs_space_info_free_bytes_may_use(fs_info, trans_rsv->space_info, + delayed_refs_bytes); reserve_fail: btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); return ERR_PTR(ret); @@ -811,7 +885,7 @@ struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *roo } /* - * btrfs_attach_transaction() - catch the running transaction + * Catch the running transaction. * * It is used when we want to commit the current the transaction, but * don't want to start a new one. @@ -830,7 +904,7 @@ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) } /* - * btrfs_attach_transaction_barrier() - catch the running transaction + * Catch the running transaction. * * It is similar to the above function, the difference is this one * will wait for all the inactive transactions until they fully @@ -906,7 +980,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) int ret = 0; if (transid) { - if (transid <= fs_info->last_trans_committed) + if (transid <= btrfs_get_last_trans_committed(fs_info)) goto out; /* find specified transaction */ @@ -930,7 +1004,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) * raced with btrfs_commit_transaction */ if (!cur_trans) { - if (transid > fs_info->last_trans_committed) + if (transid > btrfs_get_last_trans_committed(fs_info)) ret = -EINVAL; goto out; } @@ -985,11 +1059,14 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans) if (!trans->block_rsv) { ASSERT(!trans->bytes_reserved); + ASSERT(!trans->delayed_refs_bytes_reserved); return; } - if (!trans->bytes_reserved) + if (!trans->bytes_reserved) { + ASSERT(!trans->delayed_refs_bytes_reserved); return; + } ASSERT(trans->block_rsv == &fs_info->trans_block_rsv); trace_btrfs_space_reservation(fs_info, "transaction", @@ -997,6 +1074,16 @@ static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans) btrfs_block_rsv_release(fs_info, trans->block_rsv, trans->bytes_reserved, NULL); trans->bytes_reserved = 0; + + if (!trans->delayed_refs_bytes_reserved) + return; + + trace_btrfs_space_reservation(fs_info, "local_delayed_refs_rsv", + trans->transid, + trans->delayed_refs_bytes_reserved, 0); + btrfs_block_rsv_release(fs_info, &trans->delayed_rsv, + trans->delayed_refs_bytes_reserved, NULL); + trans->delayed_refs_bytes_reserved = 0; } static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, @@ -1328,7 +1415,7 @@ again: } /* Now flush any delayed refs generated by updating all of the roots */ - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, U64_MAX); if (ret) return ret; @@ -1343,7 +1430,7 @@ again: * so we want to keep this flushing in this loop to make sure * everything gets run. */ - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, U64_MAX); if (ret) return ret; } @@ -1478,45 +1565,6 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) } /* - * defrag a given btree. - * Every leaf in the btree is read and defragged. - */ -int btrfs_defrag_root(struct btrfs_root *root) -{ - struct btrfs_fs_info *info = root->fs_info; - struct btrfs_trans_handle *trans; - int ret; - - if (test_and_set_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state)) - return 0; - - while (1) { - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - - ret = btrfs_defrag_leaves(trans, root); - - btrfs_end_transaction(trans); - btrfs_btree_balance_dirty(info); - cond_resched(); - - if (btrfs_fs_closing(info) || ret != -EAGAIN) - break; - - if (btrfs_defrag_cancelled(info)) { - btrfs_debug(info, "defrag_root cancelled"); - ret = -EAGAIN; - break; - } - } - clear_bit(BTRFS_ROOT_DEFRAG_RUNNING, &root->state); - return ret; -} - -/* * Do all special snapshot related qgroup dirty hack. * * Will do all needed qgroup inherit and dirty hack like switch commit @@ -1533,11 +1581,10 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, int ret; /* - * Save some performance in the case that qgroups are not - * enabled. If this check races with the ioctl, rescan will - * kick in anyway. + * Save some performance in the case that qgroups are not enabled. If + * this check races with the ioctl, rescan will kick in anyway. */ - if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) + if (!btrfs_qgroup_full_accounting(fs_info)) return 0; /* @@ -1561,7 +1608,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, * for now flush the delayed refs to narrow the race window where the * qgroup counters could end up wrong. */ - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, U64_MAX); if (ret) { btrfs_abort_transaction(trans, ret); return ret; @@ -1576,7 +1623,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, /* Now qgroup are all updated, we can inherit it to new qgroups */ ret = btrfs_qgroup_inherit(trans, src->root_key.objectid, dst_objectid, - inherit); + parent->root_key.objectid, inherit); if (ret < 0) goto out; @@ -1726,6 +1773,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } btrfs_release_path(path); + ret = btrfs_create_qgroup(trans, objectid); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } + /* * pull in the delayed directory update * and the delayed inode item @@ -1837,8 +1890,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, * To co-operate with that hack, we do hack again. * Or snapshot will be greatly slowed down by a subtree qgroup rescan */ - ret = qgroup_account_snapshot(trans, root, parent_root, - pending->inherit, objectid); + if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL) + ret = qgroup_account_snapshot(trans, root, parent_root, + pending->inherit, objectid); + else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) + ret = btrfs_qgroup_inherit(trans, root->root_key.objectid, objectid, + parent_root->root_key.objectid, pending->inherit); if (ret < 0) goto fail; @@ -1854,8 +1911,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + fname.disk_name.len * 2); - parent_inode->i_mtime = inode_set_ctime_current(parent_inode); - ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode)); + inode_set_mtime_to_ts(parent_inode, + inode_set_ctime_current(parent_inode)); + ret = btrfs_update_inode_fallback(trans, BTRFS_I(parent_inode)); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1982,7 +2040,7 @@ void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) * Wait for the current transaction commit to start and block * subsequent transaction joins */ - btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); wait_event(fs_info->transaction_blocked_wait, cur_trans->state >= TRANS_STATE_COMMIT_START || TRANS_ABORTED(cur_trans)); @@ -2078,7 +2136,7 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans) struct btrfs_block_group *block_group, *tmp; list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { - btrfs_delayed_refs_rsv_release(fs_info, 1); + btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info); list_del_init(&block_group->bg_list); } } @@ -2129,7 +2187,7 @@ static void add_pending_snapshot(struct btrfs_trans_handle *trans) return; lockdep_assert_held(&trans->fs_info->trans_lock); - ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_START); + ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_PREP); list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots); } @@ -2153,7 +2211,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) ktime_t interval; ASSERT(refcount_read(&trans->use_count) == 1); - btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); clear_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); @@ -2213,7 +2271,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) } spin_lock(&fs_info->trans_lock); - if (cur_trans->state >= TRANS_STATE_COMMIT_START) { + if (cur_trans->state >= TRANS_STATE_COMMIT_PREP) { enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; add_pending_snapshot(trans); @@ -2225,7 +2283,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) want_state = TRANS_STATE_SUPER_COMMITTED; btrfs_trans_state_lockdep_release(fs_info, - BTRFS_LOCKDEP_TRANS_COMMIT_START); + BTRFS_LOCKDEP_TRANS_COMMIT_PREP); ret = btrfs_end_transaction(trans); wait_for_commit(cur_trans, want_state); @@ -2237,9 +2295,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) return ret; } - cur_trans->state = TRANS_STATE_COMMIT_START; + cur_trans->state = TRANS_STATE_COMMIT_PREP; wake_up(&fs_info->transaction_blocked_wait); - btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); if (cur_trans->list.prev != &fs_info->trans_list) { enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; @@ -2260,11 +2318,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_put_transaction(prev_trans); if (ret) goto lockdep_release; - } else { - spin_unlock(&fs_info->trans_lock); + spin_lock(&fs_info->trans_lock); } } else { - spin_unlock(&fs_info->trans_lock); /* * The previous transaction was aborted and was already removed * from the list of transactions at fs_info->trans_list. So we @@ -2272,11 +2328,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * corrupt state (pointing to trees with unwritten nodes/leafs). */ if (BTRFS_FS_ERROR(fs_info)) { + spin_unlock(&fs_info->trans_lock); ret = -EROFS; goto lockdep_release; } } + cur_trans->state = TRANS_STATE_COMMIT_START; + wake_up(&fs_info->transaction_blocked_wait); + spin_unlock(&fs_info->trans_lock); + /* * Get the time spent on the work done by the commit thread and not * the time spent waiting on a previous commit @@ -2394,7 +2455,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) goto unlock_reloc; - ret = btrfs_run_delayed_refs(trans, (unsigned long)-1); + ret = btrfs_run_delayed_refs(trans, U64_MAX); if (ret) goto unlock_reloc; @@ -2527,7 +2588,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags)) btrfs_clear_space_info_full(fs_info); - fs_info->last_trans_committed = cur_trans->transid; + btrfs_set_last_trans_committed(fs_info, cur_trans->transid); /* * We needn't acquire the lock here because there is no other task * which can change it. @@ -2586,7 +2647,7 @@ lockdep_release: goto cleanup_transaction; lockdep_trans_commit_start_release: - btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); + btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); btrfs_end_transaction(trans); return ret; } @@ -2645,18 +2706,18 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) */ void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, const char *function, - unsigned int line, int errno, bool first_hit) + unsigned int line, int error, bool first_hit) { struct btrfs_fs_info *fs_info = trans->fs_info; - WRITE_ONCE(trans->aborted, errno); - WRITE_ONCE(trans->transaction->aborted, errno); - if (first_hit && errno == -ENOSPC) + WRITE_ONCE(trans->aborted, error); + WRITE_ONCE(trans->transaction->aborted, error); + if (first_hit && error == -ENOSPC) btrfs_dump_space_info_for_trans_abort(fs_info); /* Wake up anybody who may be waiting on this transaction */ wake_up(&fs_info->transaction_wait); wake_up(&fs_info->transaction_blocked_wait); - __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); + __btrfs_handle_fs_error(fs_info, function, line, error, NULL); } int __init btrfs_transaction_init(void) diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 8e9fa23bd7fe..18c4f6e83b78 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -14,6 +14,7 @@ enum btrfs_trans_state { TRANS_STATE_RUNNING, + TRANS_STATE_COMMIT_PREP, TRANS_STATE_COMMIT_START, TRANS_STATE_COMMIT_DOING, TRANS_STATE_UNBLOCKED, @@ -117,8 +118,10 @@ enum { struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; + u64 delayed_refs_bytes_reserved; u64 chunk_bytes_reserved; unsigned long delayed_ref_updates; + unsigned long delayed_ref_csum_deletions; struct btrfs_transaction *transaction; struct btrfs_block_rsv *block_rsv; struct btrfs_block_rsv *orig_rsv; @@ -138,6 +141,7 @@ struct btrfs_trans_handle { bool in_fsync; struct btrfs_fs_info *fs_info; struct list_head new_bgs; + struct btrfs_block_rsv delayed_rsv; }; /* @@ -171,7 +175,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, { spin_lock(&inode->lock); inode->last_trans = trans->transaction->transid; - inode->last_sub_trans = inode->root->log_transid; + inode->last_sub_trans = btrfs_get_root_log_transid(inode->root); inode->last_log_commit = inode->last_sub_trans - 1; spin_unlock(&inode->lock); } @@ -199,32 +203,32 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) delayed_refs->qgroup_to_skip = 0; } -bool __cold abort_should_print_stack(int errno); +bool __cold abort_should_print_stack(int error); /* * Call btrfs_abort_transaction as early as possible when an error condition is * detected, that way the exact stack trace is reported for some errors. */ -#define btrfs_abort_transaction(trans, errno) \ +#define btrfs_abort_transaction(trans, error) \ do { \ bool first = false; \ /* Report first abort since mount */ \ if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ &((trans)->fs_info->fs_state))) { \ first = true; \ - if (WARN(abort_should_print_stack(errno), \ + if (WARN(abort_should_print_stack(error), \ KERN_ERR \ "BTRFS: Transaction aborted (error %d)\n", \ - (errno))) { \ + (error))) { \ /* Stack trace printed. */ \ } else { \ - btrfs_debug((trans)->fs_info, \ - "Transaction aborted (error %d)", \ - (errno)); \ + btrfs_err((trans)->fs_info, \ + "Transaction aborted (error %d)", \ + (error)); \ } \ } \ __btrfs_abort_transaction((trans), __func__, \ - __LINE__, (errno), first); \ + __LINE__, (error), first); \ } while (0) int btrfs_end_transaction(struct btrfs_trans_handle *trans); @@ -242,7 +246,6 @@ struct btrfs_trans_handle *btrfs_attach_transaction_barrier( int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid); void btrfs_add_dead_root(struct btrfs_root *root); -int btrfs_defrag_root(struct btrfs_root *root); void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info); int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info); int btrfs_commit_transaction(struct btrfs_trans_handle *trans); @@ -263,7 +266,7 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, const char *function, - unsigned int line, int errno, bool first_hit); + unsigned int line, int error, bool first_hit); int __init btrfs_transaction_init(void); void __cold btrfs_transaction_exit(void); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index ab08a0b01311..a416cbea75d1 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -29,6 +29,8 @@ #include "accessors.h" #include "file-item.h" #include "inode-item.h" +#include "dir-item.h" +#include "raid-stripe-tree.h" /* * Error message should follow the following format: @@ -1465,6 +1467,9 @@ static int check_extent_item(struct extent_buffer *leaf, } inline_refs += btrfs_shared_data_ref_count(leaf, sref); break; + case BTRFS_EXTENT_OWNER_REF_KEY: + WARN_ON(!btrfs_fs_incompat(fs_info, SIMPLE_QUOTA)); + break; default: extent_err(leaf, slot, "unknown inline ref type: %u", inline_type); @@ -1631,6 +1636,44 @@ static int check_inode_ref(struct extent_buffer *leaf, return 0; } +static int check_raid_stripe_extent(const struct extent_buffer *leaf, + const struct btrfs_key *key, int slot) +{ + struct btrfs_stripe_extent *stripe_extent = + btrfs_item_ptr(leaf, slot, struct btrfs_stripe_extent); + + if (unlikely(!IS_ALIGNED(key->objectid, leaf->fs_info->sectorsize))) { + generic_err(leaf, slot, +"invalid key objectid for raid stripe extent, have %llu expect aligned to %u", + key->objectid, leaf->fs_info->sectorsize); + return -EUCLEAN; + } + + if (unlikely(!btrfs_fs_incompat(leaf->fs_info, RAID_STRIPE_TREE))) { + generic_err(leaf, slot, + "RAID_STRIPE_EXTENT present but RAID_STRIPE_TREE incompat bit unset"); + return -EUCLEAN; + } + + switch (btrfs_stripe_extent_encoding(leaf, stripe_extent)) { + case BTRFS_STRIPE_RAID0: + case BTRFS_STRIPE_RAID1: + case BTRFS_STRIPE_DUP: + case BTRFS_STRIPE_RAID10: + case BTRFS_STRIPE_RAID5: + case BTRFS_STRIPE_RAID6: + case BTRFS_STRIPE_RAID1C3: + case BTRFS_STRIPE_RAID1C4: + break; + default: + generic_err(leaf, slot, "invalid raid stripe encoding %u", + btrfs_stripe_extent_encoding(leaf, stripe_extent)); + return -EUCLEAN; + } + + return 0; +} + /* * Common point to switch the item-specific validation. */ @@ -1685,6 +1728,9 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf, case BTRFS_EXTENT_DATA_REF_KEY: ret = check_extent_data_ref(leaf, key, slot); break; + case BTRFS_RAID_STRIPE_KEY: + ret = check_raid_stripe_extent(leaf, key, slot); + break; } if (ret) @@ -2005,7 +2051,7 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, * So we only checks tree blocks which is read from disk, whose * generation <= fs_info->last_trans_committed. */ - if (btrfs_header_generation(eb) > fs_info->last_trans_committed) + if (btrfs_header_generation(eb) > btrfs_get_last_trans_committed(fs_info)) return 0; /* We have @first_key, so this @eb must have at least one item */ diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d1e46b839519..7d6729d9fd2f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -347,8 +347,7 @@ static int process_one_buffer(struct btrfs_root *log, } if (wc->pin) { - ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start, - eb->len); + ret = btrfs_pin_extent_for_log_replay(wc->trans, eb); if (ret) return ret; @@ -504,9 +503,9 @@ insert: found_size = btrfs_item_size(path->nodes[0], path->slots[0]); if (found_size > item_size) - btrfs_truncate_item(path, item_size, 1); + btrfs_truncate_item(trans, path, item_size, 1); else if (found_size < item_size) - btrfs_extend_item(path, item_size - found_size); + btrfs_extend_item(trans, path, item_size - found_size); } else if (ret) { return ret; } @@ -574,7 +573,7 @@ insert: } } no_copy: - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); return 0; } @@ -767,7 +766,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, } else if (ret == 0) { btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, - ins.objectid, ins.offset, 0); + ins.objectid, ins.offset, 0, + root->root_key.objectid); btrfs_init_data_ref(&ref, root->root_key.objectid, key->objectid, offset, 0, false); @@ -890,7 +890,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, update_inode: btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); out: iput(inode); return ret; @@ -1445,7 +1445,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) goto out; } @@ -1483,8 +1483,7 @@ out: return ret; } -static int count_inode_extrefs(struct btrfs_root *root, - struct btrfs_inode *inode, struct btrfs_path *path) +static int count_inode_extrefs(struct btrfs_inode *inode, struct btrfs_path *path) { int ret = 0; int name_len; @@ -1498,8 +1497,8 @@ static int count_inode_extrefs(struct btrfs_root *root, struct extent_buffer *leaf; while (1) { - ret = btrfs_find_one_extref(root, inode_objectid, offset, path, - &extref, &offset); + ret = btrfs_find_one_extref(inode->root, inode_objectid, offset, + path, &extref, &offset); if (ret) break; @@ -1527,8 +1526,7 @@ static int count_inode_extrefs(struct btrfs_root *root, return nlink; } -static int count_inode_refs(struct btrfs_root *root, - struct btrfs_inode *inode, struct btrfs_path *path) +static int count_inode_refs(struct btrfs_inode *inode, struct btrfs_path *path) { int ret; struct btrfs_key key; @@ -1543,7 +1541,7 @@ static int count_inode_refs(struct btrfs_root *root, key.offset = (u64)-1; while (1) { - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ret = btrfs_search_slot(NULL, inode->root, &key, path, 0, 0); if (ret < 0) break; if (ret > 0) { @@ -1595,9 +1593,9 @@ process_slot: * will free the inode. */ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode) { + struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; int ret; u64 nlink = 0; @@ -1607,13 +1605,13 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - ret = count_inode_refs(root, BTRFS_I(inode), path); + ret = count_inode_refs(BTRFS_I(inode), path); if (ret < 0) goto out; nlink = ret; - ret = count_inode_extrefs(root, BTRFS_I(inode), path); + ret = count_inode_extrefs(BTRFS_I(inode), path); if (ret < 0) goto out; @@ -1623,7 +1621,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (nlink != inode->i_nlink) { set_nlink(inode, nlink); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) goto out; } @@ -1685,7 +1683,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, break; } - ret = fixup_inode_link_count(trans, root, inode); + ret = fixup_inode_link_count(trans, inode); iput(inode); if (ret) break; @@ -1732,7 +1730,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, set_nlink(inode, 1); else inc_nlink(inode); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); } else if (ret == -EEXIST) { ret = 0; } @@ -1939,7 +1937,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, out: if (!ret && update_size) { btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2); - ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); + ret = btrfs_update_inode(trans, BTRFS_I(dir)); } kfree(name.name); iput(dir); @@ -2483,7 +2481,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, drop_args.bytes_found); /* Update the inode's nbytes. */ ret = btrfs_update_inode(wc->trans, - root, BTRFS_I(inode)); + BTRFS_I(inode)); } iput(inode); if (ret) @@ -2574,7 +2572,7 @@ static int clean_log_buffer(struct btrfs_trans_handle *trans, btrfs_tree_unlock(eb); if (trans) { - ret = btrfs_pin_reserved_extent(trans, eb->start, eb->len); + ret = btrfs_pin_reserved_extent(trans, eb); if (ret) return ret; btrfs_redirty_list_add(trans->transaction, eb); @@ -2848,10 +2846,9 @@ static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root, } /* - * btrfs_sync_log does sends a given tree log down to the disk and - * updates the super blocks to record it. When this call is done, - * you know that any inodes previously logged are safely on disk only - * if it returns 0. + * Sends a given tree log down to the disk and updates the super blocks to + * record it. When this call is done, you know that any inodes previously + * logged are safely on disk only if it returns 0. * * Any other return value means you need to call btrfs_commit_transaction. * Some of the edge cases for fsyncing directories that have had unlinks @@ -2961,7 +2958,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_set_root_node(&log->root_item, log->node); memcpy(&new_root_item, &log->root_item, sizeof(new_root_item)); - root->log_transid++; + btrfs_set_root_log_transid(root, root->log_transid + 1); log->log_transid = root->log_transid; root->log_start_pid = 0; /* @@ -2999,9 +2996,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ ret = update_log_root(trans, log, &new_root_item); if (ret) { - if (!list_empty(&root_log_ctx.list)) - list_del_init(&root_log_ctx.list); - + list_del_init(&root_log_ctx.list); blk_finish_plug(&plug); btrfs_set_log_full_commit(trans); if (ret != -ENOSPC) @@ -3021,7 +3016,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out; } - index2 = root_log_ctx.log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { blk_finish_plug(&plug); ret = btrfs_wait_tree_log_extents(log, mark); @@ -3136,8 +3130,8 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * someone else already started it. We use <= and not < because the * first log transaction has an ID of 0. */ - ASSERT(root->last_log_commit <= log_transid); - root->last_log_commit = log_transid; + ASSERT(btrfs_get_root_last_log_commit(root) <= log_transid); + btrfs_set_root_last_log_commit(root, log_transid); out_wake_log_root: mutex_lock(&log_root_tree->log_mutex); @@ -3211,8 +3205,7 @@ static void free_log_tree(struct btrfs_trans_handle *trans, } } - clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1, - EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); + extent_io_tree_release(&log->dirty_log_pages); extent_io_tree_release(&log->log_csum_range); btrfs_put_root(log); @@ -3530,7 +3523,7 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, last_offset = max(last_offset, curr_end); } btrfs_set_dir_log_end(path->nodes[0], item, last_offset); - btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_mark_buffer_dirty(trans, path->nodes[0]); btrfs_release_path(path); return 0; } @@ -4138,19 +4131,19 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); btrfs_set_token_timespec_sec(&token, &item->atime, - inode->i_atime.tv_sec); + inode_get_atime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->atime, - inode->i_atime.tv_nsec); + inode_get_atime_nsec(inode)); btrfs_set_token_timespec_sec(&token, &item->mtime, - inode->i_mtime.tv_sec); + inode_get_mtime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->mtime, - inode->i_mtime.tv_nsec); + inode_get_mtime_nsec(inode)); btrfs_set_token_timespec_sec(&token, &item->ctime, - inode_get_ctime(inode).tv_sec); + inode_get_ctime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->ctime, - inode_get_ctime(inode).tv_nsec); + inode_get_ctime_nsec(inode)); /* * We do not need to set the nbytes field, in fact during a fast fsync @@ -4488,7 +4481,7 @@ copy_item: dst_index++; } - btrfs_mark_buffer_dirty(dst_path->nodes[0]); + btrfs_mark_buffer_dirty(trans, dst_path->nodes[0]); btrfs_release_path(dst_path); out: kfree(ins_data); @@ -4693,7 +4686,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, &fi, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(fi)); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); @@ -4722,7 +4715,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; int slot; int ins_nr = 0; - int start_slot; + int start_slot = 0; int ret; if (!(inode->flags & BTRFS_INODE_PREALLOC)) @@ -4921,12 +4914,12 @@ process: set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags); if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { - spin_lock_irq(&inode->ordered_tree.lock); + spin_lock_irq(&inode->ordered_tree_lock); if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); atomic_inc(&trans->transaction->pending_ordered); } - spin_unlock_irq(&inode->ordered_tree.lock); + spin_unlock_irq(&inode->ordered_tree_lock); } btrfs_put_ordered_extent(ordered); } @@ -7204,9 +7197,7 @@ again: * each subsequent pass. */ if (ret == -ENOENT) - ret = btrfs_pin_extent_for_log_replay(trans, - log->node->start, - log->node->len); + ret = btrfs_pin_extent_for_log_replay(trans, log->node); btrfs_put_root(log); if (!ret) diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 33606025513d..b4ac2b0cd235 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -223,7 +223,8 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, } /* - * ulist_del - delete one node from ulist + * Delete one node from ulist. + * * @ulist: ulist to remove node from * @val: value to delete * @aux: aux to delete diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 7c7001f42b14..5be74f9e47eb 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -124,7 +124,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, * An item with that type already exists. * Extend the item and store the new subid at the end. */ - btrfs_extend_item(path, sizeof(subid_le)); + btrfs_extend_item(trans, path, sizeof(subid_le)); eb = path->nodes[0]; slot = path->slots[0]; offset = btrfs_item_ptr_offset(eb, slot); @@ -139,7 +139,7 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, ret = 0; subid_le = cpu_to_le64(subid_cpu); write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le)); - btrfs_mark_buffer_dirty(eb); + btrfs_mark_buffer_dirty(trans, eb); out: btrfs_free_path(path); @@ -221,7 +221,7 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, move_src = offset + sizeof(subid); move_len = item_size - (move_src - btrfs_item_ptr_offset(eb, slot)); memmove_extent_buffer(eb, move_dst, move_src, move_len); - btrfs_truncate_item(path, item_size - sizeof(subid), 1); + btrfs_truncate_item(trans, path, item_size - sizeof(subid), 1); out: btrfs_free_path(path); diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index c5ff16f9e9fa..66e2270b0dae 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -487,7 +487,7 @@ static int rollback_verity(struct btrfs_inode *inode) } inode->ro_flags &= ~BTRFS_INODE_RO_VERITY; btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -554,7 +554,7 @@ static int finish_verity(struct btrfs_inode *inode, const void *desc, } inode->ro_flags |= BTRFS_INODE_RO_VERITY; btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode); if (ret) goto end_trans; ret = del_orphan(trans, inode); @@ -715,7 +715,7 @@ static struct page *btrfs_read_merkle_tree_page(struct inode *inode, pgoff_t index, unsigned long num_ra_pages) { - struct page *page; + struct folio *folio; u64 off = (u64)index << PAGE_SHIFT; loff_t merkle_pos = merkle_file_pos(inode); int ret; @@ -726,29 +726,36 @@ static struct page *btrfs_read_merkle_tree_page(struct inode *inode, return ERR_PTR(-EFBIG); index += merkle_pos >> PAGE_SHIFT; again: - page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED); - if (page) { - if (PageUptodate(page)) - return page; + folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0); + if (!IS_ERR(folio)) { + if (folio_test_uptodate(folio)) + goto out; - lock_page(page); - /* - * We only insert uptodate pages, so !Uptodate has to be - * an error - */ - if (!PageUptodate(page)) { - unlock_page(page); - put_page(page); + folio_lock(folio); + /* If it's not uptodate after we have the lock, we got a read error. */ + if (!folio_test_uptodate(folio)) { + folio_unlock(folio); + folio_put(folio); return ERR_PTR(-EIO); } - unlock_page(page); - return page; + folio_unlock(folio); + goto out; } - page = __page_cache_alloc(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS)); - if (!page) + folio = filemap_alloc_folio(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS), + 0); + if (!folio) return ERR_PTR(-ENOMEM); + ret = filemap_add_folio(inode->i_mapping, folio, index, GFP_NOFS); + if (ret) { + folio_put(folio); + /* Did someone else insert a folio here? */ + if (ret == -EEXIST) + goto again; + return ERR_PTR(ret); + } + /* * Merkle item keys are indexed from byte 0 in the merkle tree. * They have the form: @@ -756,28 +763,19 @@ again: * [ inode objectid, BTRFS_MERKLE_ITEM_KEY, offset in bytes ] */ ret = read_key_bytes(BTRFS_I(inode), BTRFS_VERITY_MERKLE_ITEM_KEY, off, - page_address(page), PAGE_SIZE, page); + folio_address(folio), PAGE_SIZE, &folio->page); if (ret < 0) { - put_page(page); + folio_put(folio); return ERR_PTR(ret); } if (ret < PAGE_SIZE) - memzero_page(page, ret, PAGE_SIZE - ret); + folio_zero_segment(folio, ret, PAGE_SIZE); - SetPageUptodate(page); - ret = add_to_page_cache_lru(page, inode->i_mapping, index, GFP_NOFS); + folio_mark_uptodate(folio); + folio_unlock(folio); - if (!ret) { - /* Inserted and ready for fsverity */ - unlock_page(page); - } else { - put_page(page); - /* Did someone race us into inserting this page? */ - if (ret == -EEXIST) - goto again; - page = ERR_PTR(ret); - } - return page; +out: + return folio_file_page(folio, index); } /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9621455edebc..c87e18827a0a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -35,6 +35,7 @@ #include "relocation.h" #include "scrub.h" #include "super.h" +#include "raid-stripe-tree.h" #define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ BTRFS_BLOCK_GROUP_RAID10 | \ @@ -357,21 +358,19 @@ struct list_head * __attribute_const__ btrfs_get_fs_uuids(void) } /* - * alloc_fs_devices - allocate struct btrfs_fs_devices - * @fsid: if not NULL, copy the UUID to fs_devices::fsid - * @metadata_fsid: if not NULL, copy the UUID to fs_devices::metadata_fsid + * Allocate new btrfs_fs_devices structure identified by a fsid. + * + * @fsid: if not NULL, copy the UUID to fs_devices::fsid and to + * fs_devices::metadata_fsid * * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR(). * The returned struct is not linked onto any lists and can be destroyed with * kfree() right away. */ -static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, - const u8 *metadata_fsid) +static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) { struct btrfs_fs_devices *fs_devs; - ASSERT(fsid || !metadata_fsid); - fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL); if (!fs_devs) return ERR_PTR(-ENOMEM); @@ -385,8 +384,7 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, if (fsid) { memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); - memcpy(fs_devs->metadata_uuid, - metadata_fsid ?: fsid, BTRFS_FSID_SIZE); + memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE); } return fs_devs; @@ -457,91 +455,41 @@ static noinline struct btrfs_fs_devices *find_fsid( return NULL; } -/* - * First check if the metadata_uuid is different from the fsid in the given - * fs_devices. Then check if the given fsid is the same as the metadata_uuid - * in the fs_devices. If it is, return true; otherwise, return false. - */ -static inline bool check_fsid_changed(const struct btrfs_fs_devices *fs_devices, - const u8 *fsid) -{ - return memcmp(fs_devices->fsid, fs_devices->metadata_uuid, - BTRFS_FSID_SIZE) != 0 && - memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE) == 0; -} - -static struct btrfs_fs_devices *find_fsid_with_metadata_uuid( - struct btrfs_super_block *disk_super) -{ - - struct btrfs_fs_devices *fs_devices; - - /* - * Handle scanned device having completed its fsid change but - * belonging to a fs_devices that was created by first scanning - * a device which didn't have its fsid/metadata_uuid changed - * at all and the CHANGING_FSID_V2 flag set. - */ - list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (!fs_devices->fsid_change) - continue; - - if (match_fsid_fs_devices(fs_devices, disk_super->metadata_uuid, - fs_devices->fsid)) - return fs_devices; - } - - /* - * Handle scanned device having completed its fsid change but - * belonging to a fs_devices that was created by a device that - * has an outdated pair of fsid/metadata_uuid and - * CHANGING_FSID_V2 flag set. - */ - list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (!fs_devices->fsid_change) - continue; - - if (check_fsid_changed(fs_devices, disk_super->metadata_uuid)) - return fs_devices; - } - - return find_fsid(disk_super->fsid, disk_super->metadata_uuid); -} - - static int btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, - int flush, struct block_device **bdev, + int flush, struct bdev_handle **bdev_handle, struct btrfs_super_block **disk_super) { + struct block_device *bdev; int ret; - *bdev = blkdev_get_by_path(device_path, flags, holder, NULL); + *bdev_handle = bdev_open_by_path(device_path, flags, holder, NULL); - if (IS_ERR(*bdev)) { - ret = PTR_ERR(*bdev); + if (IS_ERR(*bdev_handle)) { + ret = PTR_ERR(*bdev_handle); goto error; } + bdev = (*bdev_handle)->bdev; if (flush) - sync_blockdev(*bdev); - ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE); + sync_blockdev(bdev); + ret = set_blocksize(bdev, BTRFS_BDEV_BLOCKSIZE); if (ret) { - blkdev_put(*bdev, holder); + bdev_release(*bdev_handle); goto error; } - invalidate_bdev(*bdev); - *disk_super = btrfs_read_dev_super(*bdev); + invalidate_bdev(bdev); + *disk_super = btrfs_read_dev_super(bdev); if (IS_ERR(*disk_super)) { ret = PTR_ERR(*disk_super); - blkdev_put(*bdev, holder); + bdev_release(*bdev_handle); goto error; } return 0; error: - *bdev = NULL; + *bdev_handle = NULL; return ret; } @@ -562,13 +510,13 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device { struct btrfs_fs_devices *fs_devices, *tmp_fs_devices; struct btrfs_device *device, *tmp_device; - int ret = 0; + int ret; + bool freed = false; lockdep_assert_held(&uuid_mutex); - if (devt) - ret = -ENOENT; - + /* Return good status if there is no instance of devt. */ + ret = 0; list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) { mutex_lock(&fs_devices->device_list_mutex); @@ -579,8 +527,7 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device if (devt && devt != device->devt) continue; if (fs_devices->opened) { - /* for an already deleted device return 0 */ - if (devt && ret != 0) + if (devt) ret = -EBUSY; break; } @@ -590,7 +537,7 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device list_del(&device->dev_list); btrfs_free_device(device); - ret = 0; + freed = true; } mutex_unlock(&fs_devices->device_list_mutex); @@ -601,9 +548,81 @@ static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device } } + /* If there is at least one freed device return 0. */ + if (freed) + return 0; + return ret; } +static struct btrfs_fs_devices *find_fsid_by_device( + struct btrfs_super_block *disk_super, + dev_t devt, bool *same_fsid_diff_dev) +{ + struct btrfs_fs_devices *fsid_fs_devices; + struct btrfs_fs_devices *devt_fs_devices; + const bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & + BTRFS_FEATURE_INCOMPAT_METADATA_UUID); + bool found_by_devt = false; + + /* Find the fs_device by the usual method, if found use it. */ + fsid_fs_devices = find_fsid(disk_super->fsid, + has_metadata_uuid ? disk_super->metadata_uuid : NULL); + + /* The temp_fsid feature is supported only with single device filesystem. */ + if (btrfs_super_num_devices(disk_super) != 1) + return fsid_fs_devices; + + /* + * A seed device is an integral component of the sprout device, which + * functions as a multi-device filesystem. So, temp-fsid feature is + * not supported. + */ + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) + return fsid_fs_devices; + + /* Try to find a fs_devices by matching devt. */ + list_for_each_entry(devt_fs_devices, &fs_uuids, fs_list) { + struct btrfs_device *device; + + list_for_each_entry(device, &devt_fs_devices->devices, dev_list) { + if (device->devt == devt) { + found_by_devt = true; + break; + } + } + if (found_by_devt) + break; + } + + if (found_by_devt) { + /* Existing device. */ + if (fsid_fs_devices == NULL) { + if (devt_fs_devices->opened == 0) { + /* Stale device. */ + return NULL; + } else { + /* temp_fsid is mounting a subvol. */ + return devt_fs_devices; + } + } else { + /* Regular or temp_fsid device mounting a subvol. */ + return devt_fs_devices; + } + } else { + /* New device. */ + if (fsid_fs_devices == NULL) { + return NULL; + } else { + /* sb::fsid is already used create a new temp_fsid. */ + *same_fsid_diff_dev = true; + return NULL; + } + } + + /* Not reached. */ +} + /* * This is only used on mount, and we are protected from competing things * messing with our fs_devices by the uuid_mutex, thus we do not need the @@ -613,7 +632,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device, blk_mode_t flags, void *holder) { - struct block_device *bdev; + struct bdev_handle *bdev_handle; struct btrfs_super_block *disk_super; u64 devid; int ret; @@ -624,7 +643,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, return -EINVAL; ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1, - &bdev, &disk_super); + &bdev_handle, &disk_super); if (ret) return ret; @@ -648,21 +667,21 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); fs_devices->seeding = true; } else { - if (bdev_read_only(bdev)) + if (bdev_read_only(bdev_handle->bdev)) clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); else set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); } - if (!bdev_nonrot(bdev)) + if (!bdev_nonrot(bdev_handle->bdev)) fs_devices->rotating = true; - if (bdev_max_discard_sectors(bdev)) + if (bdev_max_discard_sectors(bdev_handle->bdev)) fs_devices->discardable = true; - device->bdev = bdev; + device->bdev_handle = bdev_handle; + device->bdev = bdev_handle->bdev; clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); - device->holder = holder; fs_devices->open_devices++; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && @@ -676,7 +695,7 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, error_free_page: btrfs_release_disk_super(disk_super); - blkdev_put(bdev, holder); + bdev_release(bdev_handle); return -EINVAL; } @@ -690,84 +709,6 @@ u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb) } /* - * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices - * being created with a disk that has already completed its fsid change. Such - * disk can belong to an fs which has its FSID changed or to one which doesn't. - * Handle both cases here. - */ -static struct btrfs_fs_devices *find_fsid_inprogress( - struct btrfs_super_block *disk_super) -{ - struct btrfs_fs_devices *fs_devices; - - list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (fs_devices->fsid_change) - continue; - - if (check_fsid_changed(fs_devices, disk_super->fsid)) - return fs_devices; - } - - return find_fsid(disk_super->fsid, NULL); -} - -static struct btrfs_fs_devices *find_fsid_changed( - struct btrfs_super_block *disk_super) -{ - struct btrfs_fs_devices *fs_devices; - - /* - * Handles the case where scanned device is part of an fs that had - * multiple successful changes of FSID but currently device didn't - * observe it. Meaning our fsid will be different than theirs. We need - * to handle two subcases : - * 1 - The fs still continues to have different METADATA/FSID uuids. - * 2 - The fs is switched back to its original FSID (METADATA/FSID - * are equal). - */ - list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - /* Changed UUIDs */ - if (check_fsid_changed(fs_devices, disk_super->metadata_uuid) && - memcmp(fs_devices->fsid, disk_super->fsid, - BTRFS_FSID_SIZE) != 0) - return fs_devices; - - /* Unchanged UUIDs */ - if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, - BTRFS_FSID_SIZE) == 0 && - memcmp(fs_devices->fsid, disk_super->metadata_uuid, - BTRFS_FSID_SIZE) == 0) - return fs_devices; - } - - return NULL; -} - -static struct btrfs_fs_devices *find_fsid_reverted_metadata( - struct btrfs_super_block *disk_super) -{ - struct btrfs_fs_devices *fs_devices; - - /* - * Handle the case where the scanned device is part of an fs whose last - * metadata UUID change reverted it to the original FSID. At the same - * time fs_devices was first created by another constituent device - * which didn't fully observe the operation. This results in an - * btrfs_fs_devices created with metadata/fsid different AND - * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the - * fs_devices equal to the FSID of the disk. - */ - list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (!fs_devices->fsid_change) - continue; - - if (check_fsid_changed(fs_devices, disk_super->fsid)) - return fs_devices; - } - - return NULL; -} -/* * Add new device to list of registered devices * * Returns: @@ -785,10 +726,16 @@ static noinline struct btrfs_device *device_list_add(const char *path, u64 devid = btrfs_stack_device_id(&disk_super->dev_item); dev_t path_devt; int error; + bool same_fsid_diff_dev = false; bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_METADATA_UUID); - bool fsid_change_in_progress = (btrfs_super_flags(disk_super) & - BTRFS_SUPER_FLAG_CHANGING_FSID_V2); + + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) { + btrfs_err(NULL, +"device %s has incomplete metadata_uuid change, please use btrfstune to complete", + path); + return ERR_PTR(-EAGAIN); + } error = lookup_bdev(path, &path_devt); if (error) { @@ -797,27 +744,23 @@ static noinline struct btrfs_device *device_list_add(const char *path, return ERR_PTR(error); } - if (fsid_change_in_progress) { - if (!has_metadata_uuid) - fs_devices = find_fsid_inprogress(disk_super); - else - fs_devices = find_fsid_changed(disk_super); - } else if (has_metadata_uuid) { - fs_devices = find_fsid_with_metadata_uuid(disk_super); - } else { - fs_devices = find_fsid_reverted_metadata(disk_super); - if (!fs_devices) - fs_devices = find_fsid(disk_super->fsid, NULL); - } - + fs_devices = find_fsid_by_device(disk_super, path_devt, &same_fsid_diff_dev); if (!fs_devices) { - fs_devices = alloc_fs_devices(disk_super->fsid, - has_metadata_uuid ? disk_super->metadata_uuid : NULL); + fs_devices = alloc_fs_devices(disk_super->fsid); + if (has_metadata_uuid) + memcpy(fs_devices->metadata_uuid, + disk_super->metadata_uuid, BTRFS_FSID_SIZE); + if (IS_ERR(fs_devices)) return ERR_CAST(fs_devices); - fs_devices->fsid_change = fsid_change_in_progress; + if (same_fsid_diff_dev) { + generate_random_uuid(fs_devices->fsid); + fs_devices->temp_fsid = true; + pr_info("BTRFS: device %s using temp-fsid %pU\n", + path, fs_devices->fsid); + } mutex_lock(&fs_devices->device_list_mutex); list_add(&fs_devices->fs_list, &fs_uuids); @@ -832,18 +775,11 @@ static noinline struct btrfs_device *device_list_add(const char *path, mutex_lock(&fs_devices->device_list_mutex); device = btrfs_find_device(fs_devices, &args); - /* - * If this disk has been pulled into an fs devices created by - * a device which had the CHANGING_FSID_V2 flag then replace the - * metadata_uuid/fsid values of the fs_devices. - */ - if (fs_devices->fsid_change && - found_transid > fs_devices->latest_generation) { + if (found_transid > fs_devices->latest_generation) { memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); memcpy(fs_devices->metadata_uuid, btrfs_sb_fsid_ptr(disk_super), BTRFS_FSID_SIZE); - fs_devices->fsid_change = false; } } @@ -997,7 +933,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) lockdep_assert_held(&uuid_mutex); - fs_devices = alloc_fs_devices(orig->fsid, NULL); + fs_devices = alloc_fs_devices(orig->fsid); if (IS_ERR(fs_devices)) return fs_devices; @@ -1068,9 +1004,10 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, if (device->devid == BTRFS_DEV_REPLACE_DEVID) continue; - if (device->bdev) { - blkdev_put(device->bdev, device->holder); + if (device->bdev_handle) { + bdev_release(device->bdev_handle); device->bdev = NULL; + device->bdev_handle = NULL; fs_devices->open_devices--; } if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { @@ -1115,7 +1052,7 @@ static void btrfs_close_bdev(struct btrfs_device *device) invalidate_bdev(device->bdev); } - blkdev_put(device->bdev, device->holder); + bdev_release(device->bdev_handle); } static void btrfs_close_one_device(struct btrfs_device *device) @@ -1356,14 +1293,19 @@ int btrfs_forget_devices(dev_t devt) /* * Look for a btrfs signature on a device. This may be called out of the mount path * and we are not allowed to call set_blocksize during the scan. The superblock - * is read via pagecache + * is read via pagecache. + * + * With @mount_arg_dev it's a scan during mount time that will always register + * the device or return an error. Multi-device and seeding devices are registered + * in both cases. */ -struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags) +struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, + bool mount_arg_dev) { struct btrfs_super_block *disk_super; bool new_device_added = false; struct btrfs_device *device = NULL; - struct block_device *bdev; + struct bdev_handle *bdev_handle; u64 bytenr, bytenr_orig; int ret; @@ -1386,31 +1328,49 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags) * values temporarily, as the device paths of the fsid are the only * required information for assembling the volume. */ - bdev = blkdev_get_by_path(path, flags, NULL, NULL); - if (IS_ERR(bdev)) - return ERR_CAST(bdev); + bdev_handle = bdev_open_by_path(path, flags, NULL, NULL); + if (IS_ERR(bdev_handle)) + return ERR_CAST(bdev_handle); bytenr_orig = btrfs_sb_offset(0); - ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr); + ret = btrfs_sb_log_location_bdev(bdev_handle->bdev, 0, READ, &bytenr); if (ret) { device = ERR_PTR(ret); goto error_bdev_put; } - disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig); + disk_super = btrfs_read_disk_super(bdev_handle->bdev, bytenr, + bytenr_orig); if (IS_ERR(disk_super)) { device = ERR_CAST(disk_super); goto error_bdev_put; } + if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 && + !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)) { + dev_t devt; + + ret = lookup_bdev(path, &devt); + if (ret) + btrfs_warn(NULL, "lookup bdev failed for path %s: %d", + path, ret); + else + btrfs_free_stale_devices(devt, NULL); + + pr_debug("BTRFS: skip registering single non-seed device %s\n", path); + device = NULL; + goto free_disk_super; + } + device = device_list_add(path, disk_super, &new_device_added); if (!IS_ERR(device) && new_device_added) btrfs_free_stale_devices(device->devt, device); +free_disk_super: btrfs_release_disk_super(disk_super); error_bdev_put: - blkdev_put(bdev, NULL); + bdev_release(bdev_handle); return device; } @@ -1594,7 +1554,7 @@ static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 search_start; u64 hole_size; u64 max_hole_start; - u64 max_hole_size; + u64 max_hole_size = 0; u64 extent_end; u64 search_end = device->total_bytes; int ret; @@ -1602,17 +1562,16 @@ static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, struct extent_buffer *l; search_start = dev_extent_search_start(device); + max_hole_start = search_start; WARN_ON(device->zone_info && !IS_ALIGNED(num_bytes, device->zone_info->zone_size)); path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - max_hole_start = search_start; - max_hole_size = 0; - + if (!path) { + ret = -ENOMEM; + goto out; + } again: if (search_start >= search_end || test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) { @@ -1895,7 +1854,7 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, ptr = btrfs_device_fsid(dev_item); write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid, ptr, BTRFS_FSID_SIZE); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); ret = 0; out: @@ -2088,7 +2047,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, - struct block_device **bdev, void **holder) + struct bdev_handle **bdev_handle) { struct btrfs_trans_handle *trans; struct btrfs_device *device; @@ -2197,7 +2156,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, btrfs_assign_next_active_device(device, NULL); - if (device->bdev) { + if (device->bdev_handle) { cur_devices->open_devices--; /* remove sysfs entry */ btrfs_sysfs_remove_device(device); @@ -2213,9 +2172,9 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, * free the device. * * We cannot call btrfs_close_bdev() here because we're holding the sb - * write lock, and blkdev_put() will pull in the ->open_mutex on the - * block device and it's dependencies. Instead just flush the device - * and let the caller do the final blkdev_put. + * write lock, and bdev_release() will pull in the ->open_mutex on + * the block device and it's dependencies. Instead just flush the + * device and let the caller do the final bdev_release. */ if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { btrfs_scratch_superblocks(fs_info, device->bdev, @@ -2226,8 +2185,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, } } - *bdev = device->bdev; - *holder = device->holder; + *bdev_handle = device->bdev_handle; synchronize_rcu(); btrfs_free_device(device); @@ -2364,7 +2322,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, const char *path) { struct btrfs_super_block *disk_super; - struct block_device *bdev; + struct bdev_handle *bdev_handle; int ret; if (!path || !path[0]) @@ -2382,7 +2340,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, } ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0, - &bdev, &disk_super); + &bdev_handle, &disk_super); if (ret) { btrfs_put_dev_args_from_path(args); return ret; @@ -2395,7 +2353,7 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, else memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE); btrfs_release_disk_super(disk_super); - blkdev_put(bdev, NULL); + bdev_release(bdev_handle); return 0; } @@ -2452,7 +2410,7 @@ static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info) * Private copy of the seed devices, anchored at * fs_info->fs_devices->seed_list */ - seed_devices = alloc_fs_devices(NULL, NULL); + seed_devices = alloc_fs_devices(NULL); if (IS_ERR(seed_devices)) return seed_devices; @@ -2598,7 +2556,7 @@ next_slot: if (device->fs_devices->seeding) { btrfs_set_device_generation(leaf, dev_item, device->generation); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); } path->slots[0]++; @@ -2615,7 +2573,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path struct btrfs_root *root = fs_info->dev_root; struct btrfs_trans_handle *trans; struct btrfs_device *device; - struct block_device *bdev; + struct bdev_handle *bdev_handle; struct super_block *sb = fs_info->sb; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_fs_devices *seed_devices = NULL; @@ -2628,12 +2586,12 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (sb_rdonly(sb) && !fs_devices->seeding) return -EROFS; - bdev = blkdev_get_by_path(device_path, BLK_OPEN_WRITE, - fs_info->bdev_holder, NULL); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); + bdev_handle = bdev_open_by_path(device_path, BLK_OPEN_WRITE, + fs_info->bdev_holder, NULL); + if (IS_ERR(bdev_handle)) + return PTR_ERR(bdev_handle); - if (!btrfs_check_device_zone_type(fs_info, bdev)) { + if (!btrfs_check_device_zone_type(fs_info, bdev_handle->bdev)) { ret = -EINVAL; goto error; } @@ -2645,11 +2603,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path locked = true; } - sync_blockdev(bdev); + sync_blockdev(bdev_handle->bdev); rcu_read_lock(); list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { - if (device->bdev == bdev) { + if (device->bdev == bdev_handle->bdev) { ret = -EEXIST; rcu_read_unlock(); goto error; @@ -2665,7 +2623,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path } device->fs_info = fs_info; - device->bdev = bdev; + device->bdev_handle = bdev_handle; + device->bdev = bdev_handle->bdev; ret = lookup_bdev(device_path, &device->devt); if (ret) goto error_free_device; @@ -2686,12 +2645,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path device->io_align = fs_info->sectorsize; device->sector_size = fs_info->sectorsize; device->total_bytes = - round_down(bdev_nr_bytes(bdev), fs_info->sectorsize); + round_down(bdev_nr_bytes(device->bdev), fs_info->sectorsize); device->disk_total_bytes = device->total_bytes; device->commit_total_bytes = device->total_bytes; set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); - device->holder = fs_info->bdev_holder; device->dev_stats_valid = 1; set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); @@ -2727,7 +2685,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path atomic64_add(device->total_bytes, &fs_info->free_chunk_space); - if (!bdev_nonrot(bdev)) + if (!bdev_nonrot(device->bdev)) fs_devices->rotating = true; orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy); @@ -2849,7 +2807,7 @@ error_free_zone: error_free_device: btrfs_free_device(device); error: - blkdev_put(bdev, fs_info->bdev_holder); + bdev_release(bdev_handle); if (locked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); @@ -2896,7 +2854,7 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, btrfs_device_get_disk_total_bytes(device)); btrfs_set_device_bytes_used(leaf, dev_item, btrfs_device_get_bytes_used(device)); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); @@ -2930,6 +2888,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, btrfs_set_super_total_bytes(super_copy, round_down(old_total + diff, fs_info->sectorsize)); device->fs_devices->total_rw_bytes += diff; + atomic64_add(diff, &fs_info->free_chunk_space); btrfs_device_set_total_bytes(device, new_size); btrfs_device_set_disk_total_bytes(device, new_size); @@ -3028,7 +2987,8 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) } /* - * btrfs_get_chunk_map() - Find the mapping containing the given logical extent. + * Find the mapping containing the given logical extent. + * * @logical: Logical block offset in bytes. * @length: Length of extent in bytes. * @@ -3484,7 +3444,7 @@ static int insert_balance_item(struct btrfs_fs_info *fs_info, btrfs_set_balance_flags(leaf, item, bctl->flags); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); out: btrfs_free_path(path); err = btrfs_commit_transaction(trans); @@ -4839,6 +4799,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) u64 old_size = btrfs_device_get_total_bytes(device); u64 diff; u64 start; + u64 free_diff = 0; new_size = round_down(new_size, fs_info->sectorsize); start = new_size; @@ -4864,7 +4825,19 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) btrfs_device_set_total_bytes(device, new_size); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { device->fs_devices->total_rw_bytes -= diff; - atomic64_sub(diff, &fs_info->free_chunk_space); + + /* + * The new free_chunk_space is new_size - used, so we have to + * subtract the delta of the old free_chunk_space which included + * old_size - used. If used > new_size then just subtract this + * entire device's free space. + */ + if (device->bytes_used < new_size) + free_diff = (old_size - device->bytes_used) - + (new_size - device->bytes_used); + else + free_diff = old_size - device->bytes_used; + atomic64_sub(free_diff, &fs_info->free_chunk_space); } /* @@ -4999,9 +4972,10 @@ done: if (ret) { mutex_lock(&fs_info->chunk_mutex); btrfs_device_set_total_bytes(device, old_size); - if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) + if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) { device->fs_devices->total_rw_bytes += diff; - atomic64_add(diff, &fs_info->free_chunk_space); + atomic64_add(free_diff, &fs_info->free_chunk_space); + } mutex_unlock(&fs_info->chunk_mutex); } return ret; @@ -5110,7 +5084,7 @@ static void init_alloc_chunk_ctl_policy_regular( ASSERT(space_info); ctl->max_chunk_size = READ_ONCE(space_info->chunk_size); - ctl->max_stripe_size = ctl->max_chunk_size; + ctl->max_stripe_size = min_t(u64, ctl->max_chunk_size, SZ_1G); if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM) ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); @@ -5881,6 +5855,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, } static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, + u64 logical, u16 total_stripes) { struct btrfs_io_context *bioc; @@ -5900,6 +5875,7 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_ bioc->fs_info = fs_info; bioc->replace_stripe_src = -1; bioc->full_stripe_logical = (u64)-1; + bioc->logical = logical; return bioc; } @@ -6204,12 +6180,20 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, return U64_MAX; } -static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup *map, - u32 stripe_index, u64 stripe_offset, u32 stripe_nr) +static int set_io_stripe(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + u64 logical, u64 *length, struct btrfs_io_stripe *dst, + struct map_lookup *map, u32 stripe_index, + u64 stripe_offset, u64 stripe_nr) { dst->dev = map->stripes[stripe_index].dev; + + if (op == BTRFS_MAP_READ && btrfs_need_stripe_tree_update(fs_info, map->type)) + return btrfs_get_raid_extent_offset(fs_info, logical, length, + map->type, stripe_index, dst); + dst->physical = map->stripes[stripe_index].physical + stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr); + return 0; } /* @@ -6246,16 +6230,11 @@ static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup * * For RAID6 profile, mirror > 2 means mark another * data/P stripe error and rebuild from the remaining * stripes.. - * - * @need_raid_map: (Used only for integrity checker) whether the map wants - * a full stripe map (including all data and P/Q stripes) - * for RAID56. Should always be 1 except integrity checker. */ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret, - struct btrfs_io_stripe *smap, int *mirror_num_ret, - int need_raid_map) + struct btrfs_io_stripe *smap, int *mirror_num_ret) { struct extent_map *em; struct map_lookup *map; @@ -6350,8 +6329,10 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, } } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - if (need_raid_map && (op != BTRFS_MAP_READ || mirror_num > 1)) { + if (op != BTRFS_MAP_READ || mirror_num > 1) { /* + * Needs full stripe mapping. + * * Push stripe_nr back to the start of the full stripe * For those cases needing a full stripe, @stripe_nr * is the full stripe number. @@ -6374,19 +6355,14 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, stripe_index = 0; stripe_offset = 0; } else { - /* - * Mirror #0 or #1 means the original data block. - * Mirror #2 is RAID5 parity block. - * Mirror #3 is RAID6 Q block. - */ + ASSERT(mirror_num <= 1); + /* Just grab the data stripe directly. */ stripe_index = stripe_nr % data_stripes; stripe_nr /= data_stripes; - if (mirror_num > 1) - stripe_index = data_stripes + mirror_num - 2; /* We distribute the parity blocks across stripes */ stripe_index = (stripe_nr + stripe_index) % map->num_stripes; - if (op == BTRFS_MAP_READ && mirror_num <= 1) + if (op == BTRFS_MAP_READ && mirror_num < 1) mirror_num = 1; } } else { @@ -6425,16 +6401,18 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * I/O context structure. */ if (smap && num_alloc_stripes == 1 && + !(btrfs_need_stripe_tree_update(fs_info, map->type) && + op != BTRFS_MAP_READ) && !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)) { - set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr); + ret = set_io_stripe(fs_info, op, logical, length, smap, map, + stripe_index, stripe_offset, stripe_nr); if (mirror_num_ret) *mirror_num_ret = mirror_num; *bioc_ret = NULL; - ret = 0; goto out; } - bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes); + bioc = alloc_btrfs_io_context(fs_info, logical, num_alloc_stripes); if (!bioc) { ret = -ENOMEM; goto out; @@ -6448,7 +6426,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * * It's still mostly the same as other profiles, just with extra rotation. */ - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && (op != BTRFS_MAP_READ || mirror_num > 1)) { /* * For RAID56 @stripe_nr is already the number of full stripes @@ -6460,22 +6438,35 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, */ bioc->full_stripe_logical = em->start + btrfs_stripe_nr_to_offset(stripe_nr * data_stripes); - for (i = 0; i < num_stripes; i++) - set_io_stripe(&bioc->stripes[i], map, - (i + stripe_nr) % num_stripes, - stripe_offset, stripe_nr); + for (int i = 0; i < num_stripes; i++) { + ret = set_io_stripe(fs_info, op, logical, length, + &bioc->stripes[i], map, + (i + stripe_nr) % num_stripes, + stripe_offset, stripe_nr); + if (ret < 0) + break; + } } else { /* * For all other non-RAID56 profiles, just copy the target * stripe into the bioc. */ for (i = 0; i < num_stripes; i++) { - set_io_stripe(&bioc->stripes[i], map, stripe_index, - stripe_offset, stripe_nr); + ret = set_io_stripe(fs_info, op, logical, length, + &bioc->stripes[i], map, stripe_index, + stripe_offset, stripe_nr); + if (ret < 0) + break; stripe_index++; } } + if (ret) { + *bioc_ret = NULL; + btrfs_put_bioc(bioc); + goto out; + } + if (op != BTRFS_MAP_READ) max_errors = btrfs_chunk_max_errors(map); @@ -6902,7 +6893,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, if (!btrfs_test_opt(fs_info, DEGRADED)) return ERR_PTR(-ENOENT); - fs_devices = alloc_fs_devices(fsid, NULL); + fs_devices = alloc_fs_devices(fsid); if (IS_ERR(fs_devices)) return fs_devices; @@ -7535,7 +7526,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) btrfs_set_dev_stats_value(eb, ptr, i, btrfs_dev_stat_read(device, i)); - btrfs_mark_buffer_dirty(eb); + btrfs_mark_buffer_dirty(trans, eb); out: btrfs_free_path(path); @@ -8077,7 +8068,7 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, ASSERT(mirror_num > 0); ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, - &bioc, smap, &mirror_ret, true); + &bioc, smap, &mirror_ret); if (ret < 0) return ret; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2128a032c3b7..9cc374864a79 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -90,13 +90,11 @@ struct btrfs_device { u64 generation; + struct bdev_handle *bdev_handle; struct block_device *bdev; struct btrfs_zoned_device_info *zone_info; - /* block device holder for blkdev_get/put */ - void *holder; - /* * Device's major-minor number. Must be set even if the device is not * opened (bdev == NULL), unless the device is missing. @@ -290,6 +288,19 @@ struct btrfs_fs_devices { * - Following shall be true at all times: * - metadata_uuid == btrfs_header::fsid * - metadata_uuid == btrfs_dev_item::fsid + * + * - Relations between fsid and metadata_uuid in sb and fs_devices: + * - Normal: + * fs_devices->fsid == fs_devices->metadata_uuid == sb->fsid + * sb->metadata_uuid == 0 + * + * - When the BTRFS_FEATURE_INCOMPAT_METADATA_UUID flag is set: + * fs_devices->fsid == sb->fsid + * fs_devices->metadata_uuid == sb->metadata_uuid + * + * - When in-memory fs_devices->temp_fsid is true + * fs_devices->fsid = random + * fs_devices->metadata_uuid == sb->fsid */ u8 metadata_uuid[BTRFS_FSID_SIZE]; @@ -353,9 +364,10 @@ struct btrfs_fs_devices { bool rotating; /* Devices support TRIM/discard commands. */ bool discardable; - bool fsid_change; /* The filesystem is a seed filesystem. */ bool seeding; + /* The mount needs to use a randomly generated fsid. */ + bool temp_fsid; struct btrfs_fs_info *fs_info; /* sysfs kobjects */ @@ -381,12 +393,12 @@ struct btrfs_fs_devices { struct btrfs_io_stripe { struct btrfs_device *dev; - union { - /* Block mapping */ - u64 physical; - /* For the endio handler */ - struct btrfs_io_context *bioc; - }; + /* Block mapping. */ + u64 physical; + u64 length; + bool is_scrub; + /* For the endio handler. */ + struct btrfs_io_context *bioc; }; struct btrfs_discard_stripe { @@ -419,6 +431,11 @@ struct btrfs_io_context { atomic_t error; u16 max_errors; + u64 logical; + u64 size; + /* Raid stripe tree ordered entry. */ + struct list_head rst_ordered_entry; + /* * The total number of stripes, including the extra duplicated * stripe for replace. @@ -596,8 +613,7 @@ void btrfs_put_bioc(struct btrfs_io_context *bioc); int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret, - struct btrfs_io_stripe *smap, int *mirror_num_ret, - int need_raid_map); + struct btrfs_io_stripe *smap, int *mirror_num_ret); int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, struct btrfs_io_stripe *smap, u64 logical, u32 length, int mirror_num); @@ -611,7 +627,8 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, void btrfs_mapping_tree_free(struct extent_map_tree *tree); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, blk_mode_t flags, void *holder); -struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags); +struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, + bool mount_arg_dev); int btrfs_forget_devices(dev_t devt); void btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices); @@ -629,7 +646,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args); int btrfs_rm_device(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, - struct block_device **bdev, void **holder); + struct bdev_handle **bdev_handle); void __exit btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 96828a13dd43..3cf236fb40a4 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -188,15 +188,15 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, if (old_data_len + name_len + sizeof(*di) == item_size) { /* No other xattrs packed in the same leaf item. */ if (size > old_data_len) - btrfs_extend_item(path, size - old_data_len); + btrfs_extend_item(trans, path, size - old_data_len); else if (size < old_data_len) - btrfs_truncate_item(path, data_size, 1); + btrfs_truncate_item(trans, path, data_size, 1); } else { /* There are other xattrs packed in the same item. */ ret = btrfs_delete_one_dir_name(trans, root, path, di); if (ret) goto out; - btrfs_extend_item(path, data_size); + btrfs_extend_item(trans, path, data_size); } ptr = btrfs_item_ptr(leaf, slot, char); @@ -205,7 +205,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, btrfs_set_dir_data_len(leaf, di, size); data_ptr = ((unsigned long)(di + 1)) + name_len; write_extent_buffer(leaf, value, data_ptr, size); - btrfs_mark_buffer_dirty(leaf); + btrfs_mark_buffer_dirty(trans, leaf); } else { /* * Insert, and we had space for the xattr, so path->slots[0] is @@ -265,7 +265,7 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name, inode_inc_iversion(inode); inode_set_ctime_current(inode); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) btrfs_abort_transaction(trans, ret); out: @@ -408,7 +408,7 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, if (!ret) { inode_inc_iversion(inode); inode_set_ctime_current(inode); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) btrfs_abort_transaction(trans, ret); } @@ -442,7 +442,7 @@ static const struct xattr_handler btrfs_btrfs_xattr_handler = { .set = btrfs_xattr_handler_set_prop, }; -const struct xattr_handler *btrfs_xattr_handlers[] = { +const struct xattr_handler * const btrfs_xattr_handlers[] = { &btrfs_security_xattr_handler, &btrfs_trusted_xattr_handler, &btrfs_user_xattr_handler, diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 1cd3fc0a8f17..118118ca3e1d 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -8,7 +8,7 @@ #include <linux/xattr.h> -extern const struct xattr_handler *btrfs_xattr_handlers[]; +extern const struct xattr_handler * const btrfs_xattr_handlers[]; int btrfs_getxattr(struct inode *inode, const char *name, void *buffer, size_t size); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 09bc325d075d..3504ade30cb0 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1282,21 +1282,284 @@ out: return ret; } +struct zone_info { + u64 physical; + u64 capacity; + u64 alloc_offset; +}; + +static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, + struct zone_info *info, unsigned long *active, + struct map_lookup *map) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_device *device = map->stripes[zone_idx].dev; + int dev_replace_is_ongoing = 0; + unsigned int nofs_flag; + struct blk_zone zone; + int ret; + + info->physical = map->stripes[zone_idx].physical; + + if (!device->bdev) { + info->alloc_offset = WP_MISSING_DEV; + return 0; + } + + /* Consider a zone as active if we can allow any number of active zones. */ + if (!device->zone_info->max_active_zones) + __set_bit(zone_idx, active); + + if (!btrfs_dev_is_sequential(device, info->physical)) { + info->alloc_offset = WP_CONVENTIONAL; + return 0; + } + + /* This zone will be used for allocation, so mark this zone non-empty. */ + btrfs_dev_clear_zone_empty(device, info->physical); + + down_read(&dev_replace->rwsem); + dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) + btrfs_dev_clear_zone_empty(dev_replace->tgtdev, info->physical); + up_read(&dev_replace->rwsem); + + /* + * The group is mapped to a sequential zone. Get the zone write pointer + * to determine the allocation offset within the zone. + */ + WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size)); + nofs_flag = memalloc_nofs_save(); + ret = btrfs_get_dev_zone(device, info->physical, &zone); + memalloc_nofs_restore(nofs_flag); + if (ret) { + if (ret != -EIO && ret != -EOPNOTSUPP) + return ret; + info->alloc_offset = WP_MISSING_DEV; + return 0; + } + + if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) { + btrfs_err_in_rcu(fs_info, + "zoned: unexpected conventional zone %llu on device %s (devid %llu)", + zone.start << SECTOR_SHIFT, rcu_str_deref(device->name), + device->devid); + return -EIO; + } + + info->capacity = (zone.capacity << SECTOR_SHIFT); + + switch (zone.cond) { + case BLK_ZONE_COND_OFFLINE: + case BLK_ZONE_COND_READONLY: + btrfs_err(fs_info, + "zoned: offline/readonly zone %llu on device %s (devid %llu)", + (info->physical >> device->zone_info->zone_size_shift), + rcu_str_deref(device->name), device->devid); + info->alloc_offset = WP_MISSING_DEV; + break; + case BLK_ZONE_COND_EMPTY: + info->alloc_offset = 0; + break; + case BLK_ZONE_COND_FULL: + info->alloc_offset = info->capacity; + break; + default: + /* Partially used zone. */ + info->alloc_offset = ((zone.wp - zone.start) << SECTOR_SHIFT); + __set_bit(zone_idx, active); + break; + } + + return 0; +} + +static int btrfs_load_block_group_single(struct btrfs_block_group *bg, + struct zone_info *info, + unsigned long *active) +{ + if (info->alloc_offset == WP_MISSING_DEV) { + btrfs_err(bg->fs_info, + "zoned: cannot recover write pointer for zone %llu", + info->physical); + return -EIO; + } + + bg->alloc_offset = info->alloc_offset; + bg->zone_capacity = info->capacity; + if (test_bit(0, active)) + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); + return 0; +} + +static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, + struct map_lookup *map, + struct zone_info *zone_info, + unsigned long *active) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { + btrfs_err(fs_info, "zoned: data DUP profile needs raid-stripe-tree"); + return -EINVAL; + } + + if (zone_info[0].alloc_offset == WP_MISSING_DEV) { + btrfs_err(bg->fs_info, + "zoned: cannot recover write pointer for zone %llu", + zone_info[0].physical); + return -EIO; + } + if (zone_info[1].alloc_offset == WP_MISSING_DEV) { + btrfs_err(bg->fs_info, + "zoned: cannot recover write pointer for zone %llu", + zone_info[1].physical); + return -EIO; + } + if (zone_info[0].alloc_offset != zone_info[1].alloc_offset) { + btrfs_err(bg->fs_info, + "zoned: write pointer offset mismatch of zones in DUP profile"); + return -EIO; + } + + if (test_bit(0, active) != test_bit(1, active)) { + if (!btrfs_zone_activate(bg)) + return -EIO; + } else if (test_bit(0, active)) { + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); + } + + bg->alloc_offset = zone_info[0].alloc_offset; + bg->zone_capacity = min(zone_info[0].capacity, zone_info[1].capacity); + return 0; +} + +static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, + struct map_lookup *map, + struct zone_info *zone_info, + unsigned long *active) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + int i; + + if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { + btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", + btrfs_bg_type_to_raid_name(map->type)); + return -EINVAL; + } + + for (i = 0; i < map->num_stripes; i++) { + if (zone_info[i].alloc_offset == WP_MISSING_DEV || + zone_info[i].alloc_offset == WP_CONVENTIONAL) + continue; + + if ((zone_info[0].alloc_offset != zone_info[i].alloc_offset) && + !btrfs_test_opt(fs_info, DEGRADED)) { + btrfs_err(fs_info, + "zoned: write pointer offset mismatch of zones in %s profile", + btrfs_bg_type_to_raid_name(map->type)); + return -EIO; + } + if (test_bit(0, active) != test_bit(i, active)) { + if (!btrfs_test_opt(fs_info, DEGRADED) && + !btrfs_zone_activate(bg)) { + return -EIO; + } + } else { + if (test_bit(0, active)) + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); + } + /* In case a device is missing we have a cap of 0, so don't use it. */ + bg->zone_capacity = min_not_zero(zone_info[0].capacity, + zone_info[1].capacity); + } + + if (zone_info[0].alloc_offset != WP_MISSING_DEV) + bg->alloc_offset = zone_info[0].alloc_offset; + else + bg->alloc_offset = zone_info[i - 1].alloc_offset; + + return 0; +} + +static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, + struct map_lookup *map, + struct zone_info *zone_info, + unsigned long *active) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { + btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", + btrfs_bg_type_to_raid_name(map->type)); + return -EINVAL; + } + + for (int i = 0; i < map->num_stripes; i++) { + if (zone_info[i].alloc_offset == WP_MISSING_DEV || + zone_info[i].alloc_offset == WP_CONVENTIONAL) + continue; + + if (test_bit(0, active) != test_bit(i, active)) { + if (!btrfs_zone_activate(bg)) + return -EIO; + } else { + if (test_bit(0, active)) + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); + } + bg->zone_capacity += zone_info[i].capacity; + bg->alloc_offset += zone_info[i].alloc_offset; + } + + return 0; +} + +static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, + struct map_lookup *map, + struct zone_info *zone_info, + unsigned long *active) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + if ((map->type & BTRFS_BLOCK_GROUP_DATA) && !fs_info->stripe_root) { + btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", + btrfs_bg_type_to_raid_name(map->type)); + return -EINVAL; + } + + for (int i = 0; i < map->num_stripes; i++) { + if (zone_info[i].alloc_offset == WP_MISSING_DEV || + zone_info[i].alloc_offset == WP_CONVENTIONAL) + continue; + + if (test_bit(0, active) != test_bit(i, active)) { + if (!btrfs_zone_activate(bg)) + return -EIO; + } else { + if (test_bit(0, active)) + set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &bg->runtime_flags); + } + + if ((i % map->sub_stripes) == 0) { + bg->zone_capacity += zone_info[i].capacity; + bg->alloc_offset += zone_info[i].alloc_offset; + } + } + + return 0; +} + int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) { struct btrfs_fs_info *fs_info = cache->fs_info; struct extent_map_tree *em_tree = &fs_info->mapping_tree; struct extent_map *em; struct map_lookup *map; - struct btrfs_device *device; u64 logical = cache->start; u64 length = cache->length; + struct zone_info *zone_info = NULL; int ret; int i; - unsigned int nofs_flag; - u64 *alloc_offsets = NULL; - u64 *caps = NULL; - u64 *physical = NULL; unsigned long *active = NULL; u64 last_alloc = 0; u32 num_sequential = 0, num_conventional = 0; @@ -1328,20 +1591,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } - alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS); - if (!alloc_offsets) { - ret = -ENOMEM; - goto out; - } - - caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS); - if (!caps) { - ret = -ENOMEM; - goto out; - } - - physical = kcalloc(map->num_stripes, sizeof(*physical), GFP_NOFS); - if (!physical) { + zone_info = kcalloc(map->num_stripes, sizeof(*zone_info), GFP_NOFS); + if (!zone_info) { ret = -ENOMEM; goto out; } @@ -1353,98 +1604,14 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } for (i = 0; i < map->num_stripes; i++) { - bool is_sequential; - struct blk_zone zone; - struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - int dev_replace_is_ongoing = 0; - - device = map->stripes[i].dev; - physical[i] = map->stripes[i].physical; - - if (device->bdev == NULL) { - alloc_offsets[i] = WP_MISSING_DEV; - continue; - } - - is_sequential = btrfs_dev_is_sequential(device, physical[i]); - if (is_sequential) - num_sequential++; - else - num_conventional++; - - /* - * Consider a zone as active if we can allow any number of - * active zones. - */ - if (!device->zone_info->max_active_zones) - __set_bit(i, active); - - if (!is_sequential) { - alloc_offsets[i] = WP_CONVENTIONAL; - continue; - } - - /* - * This zone will be used for allocation, so mark this zone - * non-empty. - */ - btrfs_dev_clear_zone_empty(device, physical[i]); - - down_read(&dev_replace->rwsem); - dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); - if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) - btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical[i]); - up_read(&dev_replace->rwsem); - - /* - * The group is mapped to a sequential zone. Get the zone write - * pointer to determine the allocation offset within the zone. - */ - WARN_ON(!IS_ALIGNED(physical[i], fs_info->zone_size)); - nofs_flag = memalloc_nofs_save(); - ret = btrfs_get_dev_zone(device, physical[i], &zone); - memalloc_nofs_restore(nofs_flag); - if (ret == -EIO || ret == -EOPNOTSUPP) { - ret = 0; - alloc_offsets[i] = WP_MISSING_DEV; - continue; - } else if (ret) { - goto out; - } - - if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) { - btrfs_err_in_rcu(fs_info, - "zoned: unexpected conventional zone %llu on device %s (devid %llu)", - zone.start << SECTOR_SHIFT, - rcu_str_deref(device->name), device->devid); - ret = -EIO; + ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map); + if (ret) goto out; - } - caps[i] = (zone.capacity << SECTOR_SHIFT); - - switch (zone.cond) { - case BLK_ZONE_COND_OFFLINE: - case BLK_ZONE_COND_READONLY: - btrfs_err(fs_info, - "zoned: offline/readonly zone %llu on device %s (devid %llu)", - physical[i] >> device->zone_info->zone_size_shift, - rcu_str_deref(device->name), device->devid); - alloc_offsets[i] = WP_MISSING_DEV; - break; - case BLK_ZONE_COND_EMPTY: - alloc_offsets[i] = 0; - break; - case BLK_ZONE_COND_FULL: - alloc_offsets[i] = caps[i]; - break; - default: - /* Partially used zone */ - alloc_offsets[i] = - ((zone.wp - zone.start) << SECTOR_SHIFT); - __set_bit(i, active); - break; - } + if (zone_info[i].alloc_offset == WP_CONVENTIONAL) + num_conventional++; + else + num_sequential++; } if (num_sequential > 0) @@ -1468,63 +1635,24 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { case 0: /* single */ - if (alloc_offsets[0] == WP_MISSING_DEV) { - btrfs_err(fs_info, - "zoned: cannot recover write pointer for zone %llu", - physical[0]); - ret = -EIO; - goto out; - } - cache->alloc_offset = alloc_offsets[0]; - cache->zone_capacity = caps[0]; - if (test_bit(0, active)) - set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags); + ret = btrfs_load_block_group_single(cache, &zone_info[0], active); break; case BTRFS_BLOCK_GROUP_DUP: - if (map->type & BTRFS_BLOCK_GROUP_DATA) { - btrfs_err(fs_info, "zoned: profile DUP not yet supported on data bg"); - ret = -EINVAL; - goto out; - } - if (alloc_offsets[0] == WP_MISSING_DEV) { - btrfs_err(fs_info, - "zoned: cannot recover write pointer for zone %llu", - physical[0]); - ret = -EIO; - goto out; - } - if (alloc_offsets[1] == WP_MISSING_DEV) { - btrfs_err(fs_info, - "zoned: cannot recover write pointer for zone %llu", - physical[1]); - ret = -EIO; - goto out; - } - if (alloc_offsets[0] != alloc_offsets[1]) { - btrfs_err(fs_info, - "zoned: write pointer offset mismatch of zones in DUP profile"); - ret = -EIO; - goto out; - } - if (test_bit(0, active) != test_bit(1, active)) { - if (!btrfs_zone_activate(cache)) { - ret = -EIO; - goto out; - } - } else { - if (test_bit(0, active)) - set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, - &cache->runtime_flags); - } - cache->alloc_offset = alloc_offsets[0]; - cache->zone_capacity = min(caps[0], caps[1]); + ret = btrfs_load_block_group_dup(cache, map, zone_info, active); break; case BTRFS_BLOCK_GROUP_RAID1: + case BTRFS_BLOCK_GROUP_RAID1C3: + case BTRFS_BLOCK_GROUP_RAID1C4: + ret = btrfs_load_block_group_raid1(cache, map, zone_info, active); + break; case BTRFS_BLOCK_GROUP_RAID0: + ret = btrfs_load_block_group_raid0(cache, map, zone_info, active); + break; case BTRFS_BLOCK_GROUP_RAID10: + ret = btrfs_load_block_group_raid10(cache, map, zone_info, active); + break; case BTRFS_BLOCK_GROUP_RAID5: case BTRFS_BLOCK_GROUP_RAID6: - /* non-single profiles are not supported yet */ default: btrfs_err(fs_info, "zoned: profile %s not yet supported", btrfs_bg_type_to_raid_name(map->type)); @@ -1570,9 +1698,7 @@ out: cache->physical_map = NULL; } bitmap_free(active); - kfree(physical); - kfree(caps); - kfree(alloc_offsets); + kfree(zone_info); free_extent_map(em); return ret; @@ -1609,7 +1735,7 @@ void btrfs_redirty_list_add(struct btrfs_transaction *trans, set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags); set_extent_buffer_dirty(eb); set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1, - EXTENT_DIRTY | EXTENT_NOWAIT, NULL); + EXTENT_DIRTY, NULL); } bool btrfs_use_zone_append(struct btrfs_bio *bbio) @@ -1887,7 +2013,7 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, int i, ret; ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, - &mapped_length, &bioc, NULL, NULL, 1); + &mapped_length, &bioc, NULL, NULL); if (ret || !bioc || mapped_length < PAGE_SIZE) { ret = -EIO; goto out_put_bioc; diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index e7ac4ec809a4..5511766485cd 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -145,7 +145,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) } /* - * zstd_calc_ws_mem_sizes - calculate monotonic memory bounds + * Calculate monotonic memory bounds. * * It is possible based on the level configurations that a higher level * workspace uses less memory than a lower level workspace. In order to reuse @@ -218,7 +218,8 @@ void zstd_cleanup_workspace_manager(void) } /* - * zstd_find_workspace - find workspace + * Find workspace for given level. + * * @level: compression level * * This iterates over the set bits in the active_map beginning at the requested @@ -256,7 +257,8 @@ static struct list_head *zstd_find_workspace(unsigned int level) } /* - * zstd_get_workspace - zstd's get_workspace + * Zstd get_workspace for level. + * * @level: compression level * * If @level is 0, then any compression level can be used. Therefore, we begin @@ -296,7 +298,8 @@ again: } /* - * zstd_put_workspace - zstd put_workspace + * Zstd put_workspace. + * * @ws: list_head for the workspace * * When putting back a workspace, we only need to update the LRU if we are of diff --git a/fs/buffer.c b/fs/buffer.c index 2379564e5aea..12e9a71c693d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2011,7 +2011,7 @@ void folio_zero_new_buffers(struct folio *folio, size_t from, size_t to) } EXPORT_SYMBOL(folio_zero_new_buffers); -static void +static int iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, const struct iomap *iomap) { @@ -2025,7 +2025,8 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, * current block, then do not map the buffer and let the caller * handle it. */ - BUG_ON(offset >= iomap->offset + iomap->length); + if (offset >= iomap->offset + iomap->length) + return -EIO; switch (iomap->type) { case IOMAP_HOLE: @@ -2037,7 +2038,7 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, if (!buffer_uptodate(bh) || (offset >= i_size_read(inode))) set_buffer_new(bh); - break; + return 0; case IOMAP_DELALLOC: if (!buffer_uptodate(bh) || (offset >= i_size_read(inode))) @@ -2045,7 +2046,7 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, set_buffer_uptodate(bh); set_buffer_mapped(bh); set_buffer_delay(bh); - break; + return 0; case IOMAP_UNWRITTEN: /* * For unwritten regions, we always need to ensure that regions @@ -2057,12 +2058,24 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh, fallthrough; case IOMAP_MAPPED: if ((iomap->flags & IOMAP_F_NEW) || - offset >= i_size_read(inode)) + offset >= i_size_read(inode)) { + /* + * This can happen if truncating the block device races + * with the check in the caller as i_size updates on + * block devices aren't synchronized by i_rwsem for + * block devices. + */ + if (S_ISBLK(inode->i_mode)) + return -EIO; set_buffer_new(bh); + } bh->b_blocknr = (iomap->addr + offset - iomap->offset) >> inode->i_blkbits; set_buffer_mapped(bh); - break; + return 0; + default: + WARN_ON_ONCE(1); + return -EIO; } } @@ -2103,13 +2116,12 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len, clear_buffer_new(bh); if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); - if (get_block) { + if (get_block) err = get_block(inode, block, bh, 1); - if (err) - break; - } else { - iomap_to_bh(inode, block, bh, iomap); - } + else + err = iomap_to_bh(inode, block, bh, iomap); + if (err) + break; if (buffer_new(bh)) { clean_bdev_bh_alias(bh); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index f4863078f7fe..936b9e0b351d 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -750,7 +750,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) dout("writepage %llu~%llu (%llu bytes, %sencrypted)\n", page_off, len, wlen, IS_ENCRYPTED(inode) ? "" : "not "); - req->r_mtime = inode->i_mtime; + req->r_mtime = inode_get_mtime(inode); ceph_osdc_start_request(osdc, req); err = ceph_osdc_wait_request(osdc, req); @@ -1327,7 +1327,7 @@ new_request: pages = NULL; } - req->r_mtime = inode->i_mtime; + req->r_mtime = inode_get_mtime(inode); ceph_osdc_start_request(&fsc->client->osdc, req); req = NULL; @@ -1875,7 +1875,7 @@ int ceph_uninline_data(struct file *file) goto out_unlock; } - req->r_mtime = inode->i_mtime; + req->r_mtime = inode_get_mtime(inode); ceph_osdc_start_request(&fsc->client->osdc, req); err = ceph_osdc_wait_request(&fsc->client->osdc, req); ceph_osdc_put_request(req); @@ -1917,7 +1917,7 @@ int ceph_uninline_data(struct file *file) goto out_put_req; } - req->r_mtime = inode->i_mtime; + req->r_mtime = inode_get_mtime(inode); ceph_osdc_start_request(&fsc->client->osdc, req); err = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -2092,7 +2092,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, 0, false, true); ceph_osdc_start_request(&fsc->client->osdc, rd_req); - wr_req->r_mtime = ci->netfs.inode.i_mtime; + wr_req->r_mtime = inode_get_mtime(&ci->netfs.inode); ceph_osdc_start_request(&fsc->client->osdc, wr_req); err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 14215ec646f7..a104669fcf4c 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1421,8 +1421,8 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, arg->old_xattr_buf = NULL; } - arg->mtime = inode->i_mtime; - arg->atime = inode->i_atime; + arg->mtime = inode_get_mtime(inode); + arg->atime = inode_get_atime(inode); arg->ctime = inode_get_ctime(inode); arg->btime = ci->i_btime; arg->change_attr = inode_peek_iversion_raw(inode); diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c index e4d5cd56a80b..e3b1c3fab412 100644 --- a/fs/ceph/crypto.c +++ b/fs/ceph/crypto.c @@ -133,6 +133,7 @@ static const union fscrypt_policy *ceph_get_dummy_policy(struct super_block *sb) } static struct fscrypt_operations ceph_fscrypt_ops = { + .needs_bounce_pages = 1, .get_context = ceph_crypt_get_context, .set_context = ceph_crypt_set_context, .get_dummy_policy = ceph_get_dummy_policy, @@ -249,11 +250,9 @@ static struct inode *parse_longname(const struct inode *parent, if (!dir) { /* This can happen if we're not mounting cephfs on the root */ dir = ceph_get_inode(parent->i_sb, vino, NULL); - if (!dir) - dir = ERR_PTR(-ENOENT); + if (IS_ERR(dir)) + dout("Can't find inode %s (%s)\n", inode_number, name); } - if (IS_ERR(dir)) - dout("Can't find inode %s (%s)\n", inode_number, name); out: kfree(inode_number); @@ -462,7 +461,7 @@ int ceph_fname_to_usr(const struct ceph_fname *fname, struct fscrypt_str *tname, out: fscrypt_fname_free_buffer(&_tname); out_inode: - if ((dir != fname->dir) && !IS_ERR(dir)) { + if (dir != fname->dir) { if ((dir->i_state & I_NEW)) discard_new_inode(dir); else diff --git a/fs/ceph/file.c b/fs/ceph/file.c index b1da02f5dbe3..649600d0a7b6 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -2489,7 +2489,7 @@ static int ceph_zero_partial_object(struct inode *inode, goto out; } - req->r_mtime = inode->i_mtime; + req->r_mtime = inode_get_mtime(inode); ceph_osdc_start_request(&fsc->client->osdc, req); ret = ceph_osdc_wait_request(&fsc->client->osdc, req); if (ret == -ENOENT) @@ -2969,7 +2969,7 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, ret = do_splice_direct(src_file, &src_off, dst_file, &dst_off, src_objlen, flags); /* Abort on short copies or on error */ - if (ret < src_objlen) { + if (ret < (long)src_objlen) { dout("Failed partial copy (%zd)\n", ret); goto out; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 800ab7920513..2e2a303b9e64 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -185,9 +185,9 @@ struct inode *ceph_get_snapdir(struct inode *parent) inode->i_mode = parent->i_mode; inode->i_uid = parent->i_uid; inode->i_gid = parent->i_gid; - inode->i_mtime = parent->i_mtime; + inode_set_mtime_to_ts(inode, inode_get_mtime(parent)); inode_set_ctime_to_ts(inode, inode_get_ctime(parent)); - inode->i_atime = parent->i_atime; + inode_set_atime_to_ts(inode, inode_get_atime(parent)); ci->i_rbytes = 0; ci->i_btime = ceph_inode(parent)->i_btime; @@ -769,9 +769,7 @@ int ceph_fill_file_size(struct inode *inode, int issued, ci->i_truncate_seq = truncate_seq; /* the MDS should have revoked these caps */ - WARN_ON_ONCE(issued & (CEPH_CAP_FILE_EXCL | - CEPH_CAP_FILE_RD | - CEPH_CAP_FILE_WR | + WARN_ON_ONCE(issued & (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_LAZYIO)); /* * If we hold relevant caps, or in the case where we're @@ -837,28 +835,31 @@ void ceph_fill_file_time(struct inode *inode, int issued, /* the MDS did a utimes() */ dout("mtime %lld.%09ld -> %lld.%09ld " "tw %d -> %d\n", - inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, + inode_get_mtime_sec(inode), + inode_get_mtime_nsec(inode), mtime->tv_sec, mtime->tv_nsec, ci->i_time_warp_seq, (int)time_warp_seq); - inode->i_mtime = *mtime; - inode->i_atime = *atime; + inode_set_mtime_to_ts(inode, *mtime); + inode_set_atime_to_ts(inode, *atime); ci->i_time_warp_seq = time_warp_seq; } else if (time_warp_seq == ci->i_time_warp_seq) { + struct timespec64 ts; + /* nobody did utimes(); take the max */ - if (timespec64_compare(mtime, &inode->i_mtime) > 0) { + ts = inode_get_mtime(inode); + if (timespec64_compare(mtime, &ts) > 0) { dout("mtime %lld.%09ld -> %lld.%09ld inc\n", - inode->i_mtime.tv_sec, - inode->i_mtime.tv_nsec, + ts.tv_sec, ts.tv_nsec, mtime->tv_sec, mtime->tv_nsec); - inode->i_mtime = *mtime; + inode_set_mtime_to_ts(inode, *mtime); } - if (timespec64_compare(atime, &inode->i_atime) > 0) { + ts = inode_get_atime(inode); + if (timespec64_compare(atime, &ts) > 0) { dout("atime %lld.%09ld -> %lld.%09ld inc\n", - inode->i_atime.tv_sec, - inode->i_atime.tv_nsec, + ts.tv_sec, ts.tv_nsec, atime->tv_sec, atime->tv_nsec); - inode->i_atime = *atime; + inode_set_atime_to_ts(inode, *atime); } } else if (issued & CEPH_CAP_FILE_EXCL) { /* we did a utimes(); ignore mds values */ @@ -869,8 +870,8 @@ void ceph_fill_file_time(struct inode *inode, int issued, /* we have no write|excl caps; whatever the MDS says is true */ if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { inode_set_ctime_to_ts(inode, *ctime); - inode->i_mtime = *mtime; - inode->i_atime = *atime; + inode_set_mtime_to_ts(inode, *mtime); + inode_set_atime_to_ts(inode, *atime); ci->i_time_warp_seq = time_warp_seq; } else { warn = 1; @@ -2553,20 +2554,22 @@ retry: } if (ia_valid & ATTR_ATIME) { + struct timespec64 atime = inode_get_atime(inode); + dout("setattr %p atime %lld.%ld -> %lld.%ld\n", inode, - inode->i_atime.tv_sec, inode->i_atime.tv_nsec, + atime.tv_sec, atime.tv_nsec, attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec); if (issued & CEPH_CAP_FILE_EXCL) { ci->i_time_warp_seq++; - inode->i_atime = attr->ia_atime; + inode_set_atime_to_ts(inode, attr->ia_atime); dirtied |= CEPH_CAP_FILE_EXCL; } else if ((issued & CEPH_CAP_FILE_WR) && - timespec64_compare(&inode->i_atime, - &attr->ia_atime) < 0) { - inode->i_atime = attr->ia_atime; + timespec64_compare(&atime, + &attr->ia_atime) < 0) { + inode_set_atime_to_ts(inode, attr->ia_atime); dirtied |= CEPH_CAP_FILE_WR; } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || - !timespec64_equal(&inode->i_atime, &attr->ia_atime)) { + !timespec64_equal(&atime, &attr->ia_atime)) { ceph_encode_timespec64(&req->r_args.setattr.atime, &attr->ia_atime); mask |= CEPH_SETATTR_ATIME; @@ -2626,20 +2629,21 @@ retry: } } if (ia_valid & ATTR_MTIME) { + struct timespec64 mtime = inode_get_mtime(inode); + dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode, - inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, + mtime.tv_sec, mtime.tv_nsec, attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec); if (issued & CEPH_CAP_FILE_EXCL) { ci->i_time_warp_seq++; - inode->i_mtime = attr->ia_mtime; + inode_set_mtime_to_ts(inode, attr->ia_mtime); dirtied |= CEPH_CAP_FILE_EXCL; } else if ((issued & CEPH_CAP_FILE_WR) && - timespec64_compare(&inode->i_mtime, - &attr->ia_mtime) < 0) { - inode->i_mtime = attr->ia_mtime; + timespec64_compare(&mtime, &attr->ia_mtime) < 0) { + inode_set_mtime_to_ts(inode, attr->ia_mtime); dirtied |= CEPH_CAP_FILE_WR; } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || - !timespec64_equal(&inode->i_mtime, &attr->ia_mtime)) { + !timespec64_equal(&mtime, &attr->ia_mtime)) { ceph_encode_timespec64(&req->r_args.setattr.mtime, &attr->ia_mtime); mask |= CEPH_SETATTR_MTIME; @@ -2653,8 +2657,8 @@ retry: bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME| ATTR_MODE|ATTR_UID|ATTR_GID)) == 0; dout("setattr %p ctime %lld.%ld -> %lld.%ld (%s)\n", inode, - inode_get_ctime(inode).tv_sec, - inode_get_ctime(inode).tv_nsec, + inode_get_ctime_sec(inode), + inode_get_ctime_nsec(inode), attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec, only ? "ctime only" : "ignored"); if (only) { diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 615db141b6c4..de798444bb97 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -861,8 +861,8 @@ int ceph_wait_on_conflict_unlink(struct dentry *dentry) if (!d_same_name(udentry, pdentry, &dname)) goto next; + found = dget_dlock(udentry); spin_unlock(&udentry->d_lock); - found = dget(udentry); break; next: spin_unlock(&udentry->d_lock); @@ -4353,12 +4353,16 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg) rec.v2.flock_len = (__force __le32) ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1); } else { + struct timespec64 ts; + rec.v1.cap_id = cpu_to_le64(cap->cap_id); rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); rec.v1.issued = cpu_to_le32(cap->issued); rec.v1.size = cpu_to_le64(i_size_read(inode)); - ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime); - ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime); + ts = inode_get_mtime(inode); + ceph_encode_timespec64(&rec.v1.mtime, &ts); + ts = inode_get_atime(inode); + ceph_encode_timespec64(&rec.v1.atime, &ts); rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); rec.v1.pathbase = cpu_to_le64(pathbase); } diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 813f21add992..6732e1ea97d9 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -658,8 +658,8 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, BUG_ON(capsnap->writing); capsnap->size = i_size_read(inode); - capsnap->mtime = inode->i_mtime; - capsnap->atime = inode->i_atime; + capsnap->mtime = inode_get_mtime(inode); + capsnap->atime = inode_get_atime(inode); capsnap->ctime = inode_get_ctime(inode); capsnap->btime = ci->i_btime; capsnap->change_attr = inode_peek_iversion_raw(inode); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 51c7f2b14f6f..98844fc8a2f7 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1119,7 +1119,7 @@ ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t); extern ssize_t ceph_listxattr(struct dentry *, char *, size_t); extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci); extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); -extern const struct xattr_handler *ceph_xattr_handlers[]; +extern const struct xattr_handler * const ceph_xattr_handlers[]; struct ceph_acl_sec_ctx { #ifdef CONFIG_CEPH_FS_POSIX_ACL diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 0deae4a0f5f1..097ce7f74073 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1446,7 +1446,7 @@ void ceph_release_acl_sec_ctx(struct ceph_acl_sec_ctx *as_ctx) * List of handlers for synthetic system.* attributes. Other * attributes are handled directly. */ -const struct xattr_handler *ceph_xattr_handlers[] = { +const struct xattr_handler * const ceph_xattr_handlers[] = { &ceph_other_xattr_handler, NULL, }; diff --git a/fs/char_dev.c b/fs/char_dev.c index 950b6919fb87..6ba032442b39 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -350,7 +350,7 @@ static struct kobject *cdev_get(struct cdev *p) struct module *owner = p->owner; struct kobject *kobj; - if (owner && !try_module_get(owner)) + if (!try_module_get(owner)) return NULL; kobj = kobject_get_unless_zero(&p->kobj); if (!kobj) diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index ae023853a98f..1d2dac95f86a 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -123,9 +123,11 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr) if (attr->va_size != -1) inode->i_blocks = (attr->va_size + 511) >> 9; if (attr->va_atime.tv_sec != -1) - inode->i_atime = coda_to_timespec64(attr->va_atime); + inode_set_atime_to_ts(inode, + coda_to_timespec64(attr->va_atime)); if (attr->va_mtime.tv_sec != -1) - inode->i_mtime = coda_to_timespec64(attr->va_mtime); + inode_set_mtime_to_ts(inode, + coda_to_timespec64(attr->va_mtime)); if (attr->va_ctime.tv_sec != -1) inode_set_ctime_to_ts(inode, coda_to_timespec64(attr->va_ctime)); diff --git a/fs/coda/dir.c b/fs/coda/dir.c index cb512b10473b..4e552ba7bd43 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -111,7 +111,7 @@ static inline void coda_dir_update_mtime(struct inode *dir) /* optimistically we can also act as if our nose bleeds. The * granularity of the mtime is coarse anyways so we might actually be * right most of the time. Note: we only do this for directories. */ - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); #endif } diff --git a/fs/coda/file.c b/fs/coda/file.c index 42346618b4ed..16acc58311ea 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -84,7 +84,7 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos, 0); coda_inode->i_size = file_inode(host_file)->i_size; coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; - coda_inode->i_mtime = inode_set_ctime_current(coda_inode); + inode_set_mtime_to_ts(coda_inode, inode_set_ctime_current(coda_inode)); inode_unlock(coda_inode); file_end_write(host_file); diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index fbdcb3582926..dcc22f593e43 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -88,7 +88,7 @@ int configfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, static inline void set_default_inode_attr(struct inode * inode, umode_t mode) { inode->i_mode = mode; - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); } static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) @@ -96,8 +96,8 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) inode->i_mode = iattr->ia_mode; inode->i_uid = iattr->ia_uid; inode->i_gid = iattr->ia_gid; - inode->i_atime = iattr->ia_atime; - inode->i_mtime = iattr->ia_mtime; + inode_set_atime_to_ts(inode, iattr->ia_atime); + inode_set_mtime_to_ts(inode, iattr->ia_mtime); inode_set_ctime_to_ts(inode, iattr->ia_ctime); } @@ -171,7 +171,7 @@ struct inode *configfs_create(struct dentry *dentry, umode_t mode) return ERR_PTR(-ENOMEM); p_inode = d_inode(dentry->d_parent); - p_inode->i_mtime = inode_set_ctime_current(p_inode); + inode_set_mtime_to_ts(p_inode, inode_set_ctime_current(p_inode)); configfs_set_inode_lock_class(sd, inode); return inode; } diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 5ee7d7bbb361..60dbfa0f8805 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -133,8 +133,8 @@ static struct inode *get_cramfs_inode(struct super_block *sb, } /* Struct copy intentional */ - inode->i_mtime = inode->i_atime = inode_set_ctime_to_ts(inode, - zerotime); + inode_set_mtime_to_ts(inode, + inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, zerotime))); /* inode->i_nlink is left 1 - arguably wrong for directories, but it's the best we can do without reading the directory contents. 1 yields the right result in GNU find, even @@ -495,7 +495,7 @@ static void cramfs_kill_sb(struct super_block *sb) sb->s_mtd = NULL; } else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) { sync_blockdev(sb->s_bdev); - blkdev_put(sb->s_bdev, sb); + bdev_release(sb->s_bdev_handle); } kfree(sbi); } diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 62e1a3dd8357..0ad8c30b8fa5 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -111,10 +111,14 @@ out: int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len) { - const unsigned int blockbits = inode->i_blkbits; - const unsigned int blocksize = 1 << blockbits; - const unsigned int blocks_per_page_bits = PAGE_SHIFT - blockbits; - const unsigned int blocks_per_page = 1 << blocks_per_page_bits; + const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const unsigned int du_bits = ci->ci_data_unit_bits; + const unsigned int du_size = 1U << du_bits; + const unsigned int du_per_page_bits = PAGE_SHIFT - du_bits; + const unsigned int du_per_page = 1U << du_per_page_bits; + u64 du_index = (u64)lblk << (inode->i_blkbits - du_bits); + u64 du_remaining = (u64)len << (inode->i_blkbits - du_bits); + sector_t sector = pblk << (inode->i_blkbits - SECTOR_SHIFT); struct page *pages[16]; /* write up to 16 pages at a time */ unsigned int nr_pages; unsigned int i; @@ -130,8 +134,8 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, len); BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_VECS); - nr_pages = min_t(unsigned int, ARRAY_SIZE(pages), - (len + blocks_per_page - 1) >> blocks_per_page_bits); + nr_pages = min_t(u64, ARRAY_SIZE(pages), + (du_remaining + du_per_page - 1) >> du_per_page_bits); /* * We need at least one page for ciphertext. Allocate the first one @@ -154,21 +158,22 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, bio = bio_alloc(inode->i_sb->s_bdev, nr_pages, REQ_OP_WRITE, GFP_NOFS); do { - bio->bi_iter.bi_sector = pblk << (blockbits - 9); + bio->bi_iter.bi_sector = sector; i = 0; offset = 0; do { - err = fscrypt_crypt_block(inode, FS_ENCRYPT, lblk, - ZERO_PAGE(0), pages[i], - blocksize, offset, GFP_NOFS); + err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, du_index, + ZERO_PAGE(0), pages[i], + du_size, offset, + GFP_NOFS); if (err) goto out; - lblk++; - pblk++; - len--; - offset += blocksize; - if (offset == PAGE_SIZE || len == 0) { + du_index++; + sector += 1U << (du_bits - SECTOR_SHIFT); + du_remaining--; + offset += du_size; + if (offset == PAGE_SIZE || du_remaining == 0) { ret = bio_add_page(bio, pages[i++], offset, 0); if (WARN_ON_ONCE(ret != offset)) { err = -EIO; @@ -176,13 +181,13 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, } offset = 0; } - } while (i != nr_pages && len != 0); + } while (i != nr_pages && du_remaining != 0); err = submit_bio_wait(bio); if (err) goto out; bio_reset(bio, inode->i_sb->s_bdev, REQ_OP_WRITE); - } while (len != 0); + } while (du_remaining != 0); err = 0; out: bio_put(bio); diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 6a837e4b80dc..328470d40dec 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -39,7 +39,7 @@ static mempool_t *fscrypt_bounce_page_pool = NULL; static struct workqueue_struct *fscrypt_read_workqueue; static DEFINE_MUTEX(fscrypt_init_mutex); -struct kmem_cache *fscrypt_info_cachep; +struct kmem_cache *fscrypt_inode_info_cachep; void fscrypt_enqueue_decrypt_work(struct work_struct *work) { @@ -49,6 +49,13 @@ EXPORT_SYMBOL(fscrypt_enqueue_decrypt_work); struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags) { + if (WARN_ON_ONCE(!fscrypt_bounce_page_pool)) { + /* + * Oops, the filesystem called a function that uses the bounce + * page pool, but it didn't set needs_bounce_pages. + */ + return NULL; + } return mempool_alloc(fscrypt_bounce_page_pool, gfp_flags); } @@ -70,44 +77,44 @@ void fscrypt_free_bounce_page(struct page *bounce_page) EXPORT_SYMBOL(fscrypt_free_bounce_page); /* - * Generate the IV for the given logical block number within the given file. - * For filenames encryption, lblk_num == 0. + * Generate the IV for the given data unit index within the given file. + * For filenames encryption, index == 0. * * Keep this in sync with fscrypt_limit_io_blocks(). fscrypt_limit_io_blocks() * needs to know about any IV generation methods where the low bits of IV don't - * simply contain the lblk_num (e.g., IV_INO_LBLK_32). + * simply contain the data unit index (e.g., IV_INO_LBLK_32). */ -void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, - const struct fscrypt_info *ci) +void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index, + const struct fscrypt_inode_info *ci) { u8 flags = fscrypt_policy_flags(&ci->ci_policy); memset(iv, 0, ci->ci_mode->ivsize); if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) { - WARN_ON_ONCE(lblk_num > U32_MAX); + WARN_ON_ONCE(index > U32_MAX); WARN_ON_ONCE(ci->ci_inode->i_ino > U32_MAX); - lblk_num |= (u64)ci->ci_inode->i_ino << 32; + index |= (u64)ci->ci_inode->i_ino << 32; } else if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) { - WARN_ON_ONCE(lblk_num > U32_MAX); - lblk_num = (u32)(ci->ci_hashed_ino + lblk_num); + WARN_ON_ONCE(index > U32_MAX); + index = (u32)(ci->ci_hashed_ino + index); } else if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) { memcpy(iv->nonce, ci->ci_nonce, FSCRYPT_FILE_NONCE_SIZE); } - iv->lblk_num = cpu_to_le64(lblk_num); + iv->index = cpu_to_le64(index); } -/* Encrypt or decrypt a single filesystem block of file contents */ -int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, - u64 lblk_num, struct page *src_page, - struct page *dest_page, unsigned int len, - unsigned int offs, gfp_t gfp_flags) +/* Encrypt or decrypt a single "data unit" of file contents. */ +int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci, + fscrypt_direction_t rw, u64 index, + struct page *src_page, struct page *dest_page, + unsigned int len, unsigned int offs, + gfp_t gfp_flags) { union fscrypt_iv iv; struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); struct scatterlist dst, src; - struct fscrypt_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_enc_key.tfm; int res = 0; @@ -116,7 +123,7 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, if (WARN_ON_ONCE(len % FSCRYPT_CONTENTS_ALIGNMENT != 0)) return -EINVAL; - fscrypt_generate_iv(&iv, lblk_num, ci); + fscrypt_generate_iv(&iv, index, ci); req = skcipher_request_alloc(tfm, gfp_flags); if (!req) @@ -137,28 +144,29 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait); skcipher_request_free(req); if (res) { - fscrypt_err(inode, "%scryption failed for block %llu: %d", - (rw == FS_DECRYPT ? "De" : "En"), lblk_num, res); + fscrypt_err(ci->ci_inode, + "%scryption failed for data unit %llu: %d", + (rw == FS_DECRYPT ? "De" : "En"), index, res); return res; } return 0; } /** - * fscrypt_encrypt_pagecache_blocks() - Encrypt filesystem blocks from a - * pagecache page - * @page: The locked pagecache page containing the block(s) to encrypt - * @len: Total size of the block(s) to encrypt. Must be a nonzero - * multiple of the filesystem's block size. - * @offs: Byte offset within @page of the first block to encrypt. Must be - * a multiple of the filesystem's block size. - * @gfp_flags: Memory allocation flags. See details below. + * fscrypt_encrypt_pagecache_blocks() - Encrypt data from a pagecache page + * @page: the locked pagecache page containing the data to encrypt + * @len: size of the data to encrypt, in bytes + * @offs: offset within @page of the data to encrypt, in bytes + * @gfp_flags: memory allocation flags; see details below + * + * This allocates a new bounce page and encrypts the given data into it. The + * length and offset of the data must be aligned to the file's crypto data unit + * size. Alignment to the filesystem block size fulfills this requirement, as + * the filesystem block size is always a multiple of the data unit size. * - * A new bounce page is allocated, and the specified block(s) are encrypted into - * it. In the bounce page, the ciphertext block(s) will be located at the same - * offsets at which the plaintext block(s) were located in the source page; any - * other parts of the bounce page will be left uninitialized. However, normally - * blocksize == PAGE_SIZE and the whole page is encrypted at once. + * In the bounce page, the ciphertext data will be located at the same offset at + * which the plaintext data was located in the source page. Any other parts of + * the bounce page will be left uninitialized. * * This is for use by the filesystem's ->writepages() method. * @@ -176,28 +184,29 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, { const struct inode *inode = page->mapping->host; - const unsigned int blockbits = inode->i_blkbits; - const unsigned int blocksize = 1 << blockbits; + const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const unsigned int du_bits = ci->ci_data_unit_bits; + const unsigned int du_size = 1U << du_bits; struct page *ciphertext_page; - u64 lblk_num = ((u64)page->index << (PAGE_SHIFT - blockbits)) + - (offs >> blockbits); + u64 index = ((u64)page->index << (PAGE_SHIFT - du_bits)) + + (offs >> du_bits); unsigned int i; int err; if (WARN_ON_ONCE(!PageLocked(page))) return ERR_PTR(-EINVAL); - if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize))) + if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, du_size))) return ERR_PTR(-EINVAL); ciphertext_page = fscrypt_alloc_bounce_page(gfp_flags); if (!ciphertext_page) return ERR_PTR(-ENOMEM); - for (i = offs; i < offs + len; i += blocksize, lblk_num++) { - err = fscrypt_crypt_block(inode, FS_ENCRYPT, lblk_num, - page, ciphertext_page, - blocksize, i, gfp_flags); + for (i = offs; i < offs + len; i += du_size, index++) { + err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, index, + page, ciphertext_page, + du_size, i, gfp_flags); if (err) { fscrypt_free_bounce_page(ciphertext_page); return ERR_PTR(err); @@ -224,30 +233,33 @@ EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks); * arbitrary page, not necessarily in the original pagecache page. The @inode * and @lblk_num must be specified, as they can't be determined from @page. * + * This is not compatible with fscrypt_operations::supports_subblock_data_units. + * * Return: 0 on success; -errno on failure */ int fscrypt_encrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num, gfp_t gfp_flags) { - return fscrypt_crypt_block(inode, FS_ENCRYPT, lblk_num, page, page, - len, offs, gfp_flags); + if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units)) + return -EOPNOTSUPP; + return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_ENCRYPT, + lblk_num, page, page, len, offs, + gfp_flags); } EXPORT_SYMBOL(fscrypt_encrypt_block_inplace); /** - * fscrypt_decrypt_pagecache_blocks() - Decrypt filesystem blocks in a - * pagecache folio - * @folio: The locked pagecache folio containing the block(s) to decrypt - * @len: Total size of the block(s) to decrypt. Must be a nonzero - * multiple of the filesystem's block size. - * @offs: Byte offset within @folio of the first block to decrypt. Must be - * a multiple of the filesystem's block size. + * fscrypt_decrypt_pagecache_blocks() - Decrypt data from a pagecache folio + * @folio: the pagecache folio containing the data to decrypt + * @len: size of the data to decrypt, in bytes + * @offs: offset within @folio of the data to decrypt, in bytes * - * The specified block(s) are decrypted in-place within the pagecache folio, - * which must still be locked and not uptodate. - * - * This is for use by the filesystem's ->readahead() method. + * Decrypt data that has just been read from an encrypted file. The data must + * be located in a pagecache folio that is still locked and not yet uptodate. + * The length and offset of the data must be aligned to the file's crypto data + * unit size. Alignment to the filesystem block size fulfills this requirement, + * as the filesystem block size is always a multiple of the data unit size. * * Return: 0 on success; -errno on failure */ @@ -255,25 +267,26 @@ int fscrypt_decrypt_pagecache_blocks(struct folio *folio, size_t len, size_t offs) { const struct inode *inode = folio->mapping->host; - const unsigned int blockbits = inode->i_blkbits; - const unsigned int blocksize = 1 << blockbits; - u64 lblk_num = ((u64)folio->index << (PAGE_SHIFT - blockbits)) + - (offs >> blockbits); + const struct fscrypt_inode_info *ci = inode->i_crypt_info; + const unsigned int du_bits = ci->ci_data_unit_bits; + const unsigned int du_size = 1U << du_bits; + u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) + + (offs >> du_bits); size_t i; int err; if (WARN_ON_ONCE(!folio_test_locked(folio))) return -EINVAL; - if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, blocksize))) + if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, du_size))) return -EINVAL; - for (i = offs; i < offs + len; i += blocksize, lblk_num++) { + for (i = offs; i < offs + len; i += du_size, index++) { struct page *page = folio_page(folio, i >> PAGE_SHIFT); - err = fscrypt_crypt_block(inode, FS_DECRYPT, lblk_num, page, - page, blocksize, i & ~PAGE_MASK, - GFP_NOFS); + err = fscrypt_crypt_data_unit(ci, FS_DECRYPT, index, page, + page, du_size, i & ~PAGE_MASK, + GFP_NOFS); if (err) return err; } @@ -295,14 +308,19 @@ EXPORT_SYMBOL(fscrypt_decrypt_pagecache_blocks); * arbitrary page, not necessarily in the original pagecache page. The @inode * and @lblk_num must be specified, as they can't be determined from @page. * + * This is not compatible with fscrypt_operations::supports_subblock_data_units. + * * Return: 0 on success; -errno on failure */ int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, unsigned int len, unsigned int offs, u64 lblk_num) { - return fscrypt_crypt_block(inode, FS_DECRYPT, lblk_num, page, page, - len, offs, GFP_NOFS); + if (WARN_ON_ONCE(inode->i_sb->s_cop->supports_subblock_data_units)) + return -EOPNOTSUPP; + return fscrypt_crypt_data_unit(inode->i_crypt_info, FS_DECRYPT, + lblk_num, page, page, len, offs, + GFP_NOFS); } EXPORT_SYMBOL(fscrypt_decrypt_block_inplace); @@ -325,7 +343,7 @@ int fscrypt_initialize(struct super_block *sb) return 0; /* No need to allocate a bounce page pool if this FS won't use it. */ - if (sb->s_cop->flags & FS_CFLG_OWN_PAGES) + if (!sb->s_cop->needs_bounce_pages) return 0; mutex_lock(&fscrypt_init_mutex); @@ -391,18 +409,19 @@ static int __init fscrypt_init(void) if (!fscrypt_read_workqueue) goto fail; - fscrypt_info_cachep = KMEM_CACHE(fscrypt_info, SLAB_RECLAIM_ACCOUNT); - if (!fscrypt_info_cachep) + fscrypt_inode_info_cachep = KMEM_CACHE(fscrypt_inode_info, + SLAB_RECLAIM_ACCOUNT); + if (!fscrypt_inode_info_cachep) goto fail_free_queue; err = fscrypt_init_keyring(); if (err) - goto fail_free_info; + goto fail_free_inode_info; return 0; -fail_free_info: - kmem_cache_destroy(fscrypt_info_cachep); +fail_free_inode_info: + kmem_cache_destroy(fscrypt_inode_info_cachep); fail_free_queue: destroy_workqueue(fscrypt_read_workqueue); fail: diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 6eae3f12ad50..7b3fc189593a 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -100,7 +100,7 @@ int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, { struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); - const struct fscrypt_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_enc_key.tfm; union fscrypt_iv iv; struct scatterlist sg; @@ -157,7 +157,7 @@ static int fname_decrypt(const struct inode *inode, struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); struct scatterlist src_sg, dst_sg; - const struct fscrypt_info *ci = inode->i_crypt_info; + const struct fscrypt_inode_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_enc_key.tfm; union fscrypt_iv iv; int res; @@ -568,7 +568,7 @@ EXPORT_SYMBOL_GPL(fscrypt_match_name); */ u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name) { - const struct fscrypt_info *ci = dir->i_crypt_info; + const struct fscrypt_inode_info *ci = dir->i_crypt_info; WARN_ON_ONCE(!ci->ci_dirhash_key_initialized); diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 2d63da48635a..1892356cf924 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -47,7 +47,8 @@ struct fscrypt_context_v2 { u8 contents_encryption_mode; u8 filenames_encryption_mode; u8 flags; - u8 __reserved[4]; + u8 log2_data_unit_size; + u8 __reserved[3]; u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]; u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; }; @@ -165,6 +166,26 @@ fscrypt_policy_flags(const union fscrypt_policy *policy) BUG(); } +static inline int +fscrypt_policy_v2_du_bits(const struct fscrypt_policy_v2 *policy, + const struct inode *inode) +{ + return policy->log2_data_unit_size ?: inode->i_blkbits; +} + +static inline int +fscrypt_policy_du_bits(const union fscrypt_policy *policy, + const struct inode *inode) +{ + switch (policy->version) { + case FSCRYPT_POLICY_V1: + return inode->i_blkbits; + case FSCRYPT_POLICY_V2: + return fscrypt_policy_v2_du_bits(&policy->v2, inode); + } + BUG(); +} + /* * For encrypted symlinks, the ciphertext length is stored at the beginning * of the string in little-endian format. @@ -189,18 +210,18 @@ struct fscrypt_prepared_key { }; /* - * fscrypt_info - the "encryption key" for an inode + * fscrypt_inode_info - the "encryption key" for an inode * * When an encrypted file's key is made available, an instance of this struct is * allocated and stored in ->i_crypt_info. Once created, it remains until the * inode is evicted. */ -struct fscrypt_info { +struct fscrypt_inode_info { /* The key in a form prepared for actual encryption/decryption */ struct fscrypt_prepared_key ci_enc_key; - /* True if ci_enc_key should be freed when this fscrypt_info is freed */ + /* True if ci_enc_key should be freed when this struct is freed */ bool ci_owns_key; #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT @@ -212,6 +233,16 @@ struct fscrypt_info { #endif /* + * log2 of the data unit size (granularity of contents encryption) of + * this file. This is computable from ci_policy and ci_inode but is + * cached here for efficiency. Only used for regular files. + */ + u8 ci_data_unit_bits; + + /* Cached value: log2 of number of data units per FS block */ + u8 ci_data_units_per_block_bits; + + /* * Encryption mode used for this inode. It corresponds to either the * contents or filenames encryption mode, depending on the inode type. */ @@ -263,12 +294,13 @@ typedef enum { } fscrypt_direction_t; /* crypto.c */ -extern struct kmem_cache *fscrypt_info_cachep; +extern struct kmem_cache *fscrypt_inode_info_cachep; int fscrypt_initialize(struct super_block *sb); -int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, - u64 lblk_num, struct page *src_page, - struct page *dest_page, unsigned int len, - unsigned int offs, gfp_t gfp_flags); +int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci, + fscrypt_direction_t rw, u64 index, + struct page *src_page, struct page *dest_page, + unsigned int len, unsigned int offs, + gfp_t gfp_flags); struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags); void __printf(3, 4) __cold @@ -283,8 +315,8 @@ fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...); union fscrypt_iv { struct { - /* logical block number within the file */ - __le64 lblk_num; + /* zero-based index of data unit within the file */ + __le64 index; /* per-file nonce; only set in DIRECT_KEY mode */ u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; @@ -293,8 +325,18 @@ union fscrypt_iv { __le64 dun[FSCRYPT_MAX_IV_SIZE / sizeof(__le64)]; }; -void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, - const struct fscrypt_info *ci); +void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index, + const struct fscrypt_inode_info *ci); + +/* + * Return the number of bits used by the maximum file data unit index that is + * possible on the given filesystem, using the given log2 data unit size. + */ +static inline int +fscrypt_max_file_dun_bits(const struct super_block *sb, int du_bits) +{ + return fls64(sb->s_maxbytes - 1) - du_bits; +} /* fname.c */ bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, @@ -332,17 +374,17 @@ void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf); /* inline_crypt.c */ #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT -int fscrypt_select_encryption_impl(struct fscrypt_info *ci); +int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci); static inline bool -fscrypt_using_inline_encryption(const struct fscrypt_info *ci) +fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci) { return ci->ci_inlinecrypt; } int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, - const struct fscrypt_info *ci); + const struct fscrypt_inode_info *ci); void fscrypt_destroy_inline_crypt_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key); @@ -353,7 +395,7 @@ void fscrypt_destroy_inline_crypt_key(struct super_block *sb, */ static inline bool fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key, - const struct fscrypt_info *ci) + const struct fscrypt_inode_info *ci) { /* * The two smp_load_acquire()'s here pair with the smp_store_release()'s @@ -370,13 +412,13 @@ fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key, #else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ -static inline int fscrypt_select_encryption_impl(struct fscrypt_info *ci) +static inline int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci) { return 0; } static inline bool -fscrypt_using_inline_encryption(const struct fscrypt_info *ci) +fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci) { return false; } @@ -384,7 +426,7 @@ fscrypt_using_inline_encryption(const struct fscrypt_info *ci) static inline int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, - const struct fscrypt_info *ci) + const struct fscrypt_inode_info *ci) { WARN_ON_ONCE(1); return -EOPNOTSUPP; @@ -398,7 +440,7 @@ fscrypt_destroy_inline_crypt_key(struct super_block *sb, static inline bool fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key, - const struct fscrypt_info *ci) + const struct fscrypt_inode_info *ci) { return smp_load_acquire(&prep_key->tfm) != NULL; } @@ -433,8 +475,28 @@ struct fscrypt_master_key_secret { * fscrypt_master_key - an in-use master key * * This represents a master encryption key which has been added to the - * filesystem and can be used to "unlock" the encrypted files which were - * encrypted with it. + * filesystem. There are three high-level states that a key can be in: + * + * FSCRYPT_KEY_STATUS_PRESENT + * Key is fully usable; it can be used to unlock inodes that are encrypted + * with it (this includes being able to create new inodes). ->mk_present + * indicates whether the key is in this state. ->mk_secret exists, the key + * is in the keyring, and ->mk_active_refs > 0 due to ->mk_present. + * + * FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED + * Removal of this key has been initiated, but some inodes that were + * unlocked with it are still in-use. Like ABSENT, ->mk_secret is wiped, + * and the key can no longer be used to unlock inodes. Unlike ABSENT, the + * key is still in the keyring; ->mk_decrypted_inodes is nonempty; and + * ->mk_active_refs > 0, being equal to the size of ->mk_decrypted_inodes. + * + * This state transitions to ABSENT if ->mk_decrypted_inodes becomes empty, + * or to PRESENT if FS_IOC_ADD_ENCRYPTION_KEY is called again for this key. + * + * FSCRYPT_KEY_STATUS_ABSENT + * Key is fully removed. The key is no longer in the keyring, + * ->mk_decrypted_inodes is empty, ->mk_active_refs == 0, ->mk_secret is + * wiped, and the key can no longer be used to unlock inodes. */ struct fscrypt_master_key { @@ -444,7 +506,7 @@ struct fscrypt_master_key { */ struct hlist_node mk_node; - /* Semaphore that protects ->mk_secret and ->mk_users */ + /* Semaphore that protects ->mk_secret, ->mk_users, and ->mk_present */ struct rw_semaphore mk_sem; /* @@ -454,8 +516,8 @@ struct fscrypt_master_key { * ->mk_direct_keys) that have been prepared continue to exist. * A structural ref only guarantees that the struct continues to exist. * - * There is one active ref associated with ->mk_secret being present, - * and one active ref for each inode in ->mk_decrypted_inodes. + * There is one active ref associated with ->mk_present being true, and + * one active ref for each inode in ->mk_decrypted_inodes. * * There is one structural ref associated with the active refcount being * nonzero. Finding a key in the keyring also takes a structural ref, @@ -467,17 +529,10 @@ struct fscrypt_master_key { struct rcu_head mk_rcu_head; /* - * The secret key material. After FS_IOC_REMOVE_ENCRYPTION_KEY is - * executed, this is wiped and no new inodes can be unlocked with this - * key; however, there may still be inodes in ->mk_decrypted_inodes - * which could not be evicted. As long as some inodes still remain, - * FS_IOC_REMOVE_ENCRYPTION_KEY can be retried, or - * FS_IOC_ADD_ENCRYPTION_KEY can add the secret again. + * The secret key material. Wiped as soon as it is no longer needed; + * for details, see the fscrypt_master_key struct comment. * - * While ->mk_secret is present, one ref in ->mk_active_refs is held. - * - * Locking: protected by ->mk_sem. The manipulation of ->mk_active_refs - * associated with this field is protected by ->mk_sem as well. + * Locking: protected by ->mk_sem. */ struct fscrypt_master_key_secret mk_secret; @@ -500,7 +555,7 @@ struct fscrypt_master_key { * * Locking: protected by ->mk_sem. (We don't just rely on the keyrings * subsystem semaphore ->mk_users->sem, as we need support for atomic - * search+insert along with proper synchronization with ->mk_secret.) + * search+insert along with proper synchronization with other fields.) */ struct key *mk_users; @@ -523,20 +578,17 @@ struct fscrypt_master_key { siphash_key_t mk_ino_hash_key; bool mk_ino_hash_key_initialized; -} __randomize_layout; - -static inline bool -is_master_key_secret_present(const struct fscrypt_master_key_secret *secret) -{ /* - * The READ_ONCE() is only necessary for fscrypt_drop_inode(). - * fscrypt_drop_inode() runs in atomic context, so it can't take the key - * semaphore and thus 'secret' can change concurrently which would be a - * data race. But fscrypt_drop_inode() only need to know whether the - * secret *was* present at the time of check, so READ_ONCE() suffices. + * Whether this key is in the "present" state, i.e. fully usable. For + * details, see the fscrypt_master_key struct comment. + * + * Locking: protected by ->mk_sem, but can be read locklessly using + * READ_ONCE(). Writers must use WRITE_ONCE() when concurrent readers + * are possible. */ - return READ_ONCE(secret->size) != 0; -} + bool mk_present; + +} __randomize_layout; static inline const char *master_key_spec_type( const struct fscrypt_key_specifier *spec) @@ -598,17 +650,18 @@ struct fscrypt_mode { extern struct fscrypt_mode fscrypt_modes[]; int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, - const u8 *raw_key, const struct fscrypt_info *ci); + const u8 *raw_key, const struct fscrypt_inode_info *ci); void fscrypt_destroy_prepared_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key); -int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key); +int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci, + const u8 *raw_key); -int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, +int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk); -void fscrypt_hash_inode_number(struct fscrypt_info *ci, +void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk); int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported); @@ -643,10 +696,11 @@ static inline int fscrypt_require_key(struct inode *inode) void fscrypt_put_direct_key(struct fscrypt_direct_key *dk); -int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, +int fscrypt_setup_v1_file_key(struct fscrypt_inode_info *ci, const u8 *raw_master_key); -int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci); +int fscrypt_setup_v1_file_key_via_subscribed_keyrings( + struct fscrypt_inode_info *ci); /* policy.c */ diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index 6238dbcadcad..52504dd478d3 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -169,7 +169,7 @@ EXPORT_SYMBOL_GPL(__fscrypt_prepare_setattr); int fscrypt_prepare_setflags(struct inode *inode, unsigned int oldflags, unsigned int flags) { - struct fscrypt_info *ci; + struct fscrypt_inode_info *ci; struct fscrypt_master_key *mk; int err; @@ -187,7 +187,7 @@ int fscrypt_prepare_setflags(struct inode *inode, return -EINVAL; mk = ci->ci_master_key; down_read(&mk->mk_sem); - if (is_master_key_secret_present(&mk->mk_secret)) + if (mk->mk_present) err = fscrypt_derive_dirhash_key(ci, mk); else err = -ENOKEY; diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 8bfb3ce86476..b4002aea7cdb 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -39,11 +39,11 @@ static struct block_device **fscrypt_get_devices(struct super_block *sb, return devs; } -static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci) +static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_inode_info *ci) { - struct super_block *sb = ci->ci_inode->i_sb; + const struct super_block *sb = ci->ci_inode->i_sb; unsigned int flags = fscrypt_policy_flags(&ci->ci_policy); - int ino_bits = 64, lblk_bits = 64; + int dun_bits; if (flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) return offsetofend(union fscrypt_iv, nonce); @@ -54,10 +54,9 @@ static unsigned int fscrypt_get_dun_bytes(const struct fscrypt_info *ci) if (flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) return sizeof(__le32); - /* Default case: IVs are just the file logical block number */ - if (sb->s_cop->get_ino_and_lblk_bits) - sb->s_cop->get_ino_and_lblk_bits(sb, &ino_bits, &lblk_bits); - return DIV_ROUND_UP(lblk_bits, 8); + /* Default case: IVs are just the file data unit index */ + dun_bits = fscrypt_max_file_dun_bits(sb, ci->ci_data_unit_bits); + return DIV_ROUND_UP(dun_bits, 8); } /* @@ -90,7 +89,7 @@ static void fscrypt_log_blk_crypto_impl(struct fscrypt_mode *mode, } /* Enable inline encryption for this file if supported. */ -int fscrypt_select_encryption_impl(struct fscrypt_info *ci) +int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci) { const struct inode *inode = ci->ci_inode; struct super_block *sb = inode->i_sb; @@ -129,7 +128,7 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci) * crypto configuration that the file would use. */ crypto_cfg.crypto_mode = ci->ci_mode->blk_crypto_mode; - crypto_cfg.data_unit_size = sb->s_blocksize; + crypto_cfg.data_unit_size = 1U << ci->ci_data_unit_bits; crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci); devs = fscrypt_get_devices(sb, &num_devs); @@ -152,7 +151,7 @@ out_free_devs: int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, - const struct fscrypt_info *ci) + const struct fscrypt_inode_info *ci) { const struct inode *inode = ci->ci_inode; struct super_block *sb = inode->i_sb; @@ -168,7 +167,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, return -ENOMEM; err = blk_crypto_init_key(blk_key, raw_key, crypto_mode, - fscrypt_get_dun_bytes(ci), sb->s_blocksize); + fscrypt_get_dun_bytes(ci), + 1U << ci->ci_data_unit_bits); if (err) { fscrypt_err(inode, "error %d initializing blk-crypto key", err); goto fail; @@ -232,13 +232,15 @@ bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode) } EXPORT_SYMBOL_GPL(__fscrypt_inode_uses_inline_crypto); -static void fscrypt_generate_dun(const struct fscrypt_info *ci, u64 lblk_num, +static void fscrypt_generate_dun(const struct fscrypt_inode_info *ci, + u64 lblk_num, u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE]) { + u64 index = lblk_num << ci->ci_data_units_per_block_bits; union fscrypt_iv iv; int i; - fscrypt_generate_iv(&iv, lblk_num, ci); + fscrypt_generate_iv(&iv, index, ci); BUILD_BUG_ON(FSCRYPT_MAX_IV_SIZE > BLK_CRYPTO_MAX_IV_SIZE); memset(dun, 0, BLK_CRYPTO_MAX_IV_SIZE); @@ -265,7 +267,7 @@ static void fscrypt_generate_dun(const struct fscrypt_info *ci, u64 lblk_num, void fscrypt_set_bio_crypt_ctx(struct bio *bio, const struct inode *inode, u64 first_lblk, gfp_t gfp_mask) { - const struct fscrypt_info *ci; + const struct fscrypt_inode_info *ci; u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; if (!fscrypt_inode_uses_inline_crypto(inode)) @@ -456,7 +458,7 @@ EXPORT_SYMBOL_GPL(fscrypt_dio_supported); */ u64 fscrypt_limit_io_blocks(const struct inode *inode, u64 lblk, u64 nr_blocks) { - const struct fscrypt_info *ci; + const struct fscrypt_inode_info *ci; u32 dun; if (!fscrypt_inode_uses_inline_crypto(inode)) diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 7cbb1fd872ac..f34a9b0b9e92 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -99,10 +99,10 @@ void fscrypt_put_master_key_activeref(struct super_block *sb, spin_unlock(&sb->s_master_keys->lock); /* - * ->mk_active_refs == 0 implies that ->mk_secret is not present and - * that ->mk_decrypted_inodes is empty. + * ->mk_active_refs == 0 implies that ->mk_present is false and + * ->mk_decrypted_inodes is empty. */ - WARN_ON_ONCE(is_master_key_secret_present(&mk->mk_secret)); + WARN_ON_ONCE(mk->mk_present); WARN_ON_ONCE(!list_empty(&mk->mk_decrypted_inodes)); for (i = 0; i <= FSCRYPT_MODE_MAX; i++) { @@ -121,6 +121,18 @@ void fscrypt_put_master_key_activeref(struct super_block *sb, fscrypt_put_master_key(mk); } +/* + * This transitions the key state from present to incompletely removed, and then + * potentially to absent (depending on whether inodes remain). + */ +static void fscrypt_initiate_key_removal(struct super_block *sb, + struct fscrypt_master_key *mk) +{ + WRITE_ONCE(mk->mk_present, false); + wipe_master_key_secret(&mk->mk_secret); + fscrypt_put_master_key_activeref(sb, mk); +} + static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec) { if (spec->__reserved) @@ -234,14 +246,13 @@ void fscrypt_destroy_keyring(struct super_block *sb) * evicted, every key remaining in the keyring should * have an empty inode list, and should only still be in * the keyring due to the single active ref associated - * with ->mk_secret. There should be no structural refs - * beyond the one associated with the active ref. + * with ->mk_present. There should be no structural + * refs beyond the one associated with the active ref. */ WARN_ON_ONCE(refcount_read(&mk->mk_active_refs) != 1); WARN_ON_ONCE(refcount_read(&mk->mk_struct_refs) != 1); - WARN_ON_ONCE(!is_master_key_secret_present(&mk->mk_secret)); - wipe_master_key_secret(&mk->mk_secret); - fscrypt_put_master_key_activeref(sb, mk); + WARN_ON_ONCE(!mk->mk_present); + fscrypt_initiate_key_removal(sb, mk); } } kfree_sensitive(keyring); @@ -439,7 +450,8 @@ static int add_new_master_key(struct super_block *sb, } move_master_key_secret(&mk->mk_secret, secret); - refcount_set(&mk->mk_active_refs, 1); /* ->mk_secret is present */ + mk->mk_present = true; + refcount_set(&mk->mk_active_refs, 1); /* ->mk_present is true */ spin_lock(&keyring->lock); hlist_add_head_rcu(&mk->mk_node, @@ -478,11 +490,18 @@ static int add_existing_master_key(struct fscrypt_master_key *mk, return err; } - /* Re-add the secret if needed. */ - if (!is_master_key_secret_present(&mk->mk_secret)) { - if (!refcount_inc_not_zero(&mk->mk_active_refs)) + /* If the key is incompletely removed, make it present again. */ + if (!mk->mk_present) { + if (!refcount_inc_not_zero(&mk->mk_active_refs)) { + /* + * Raced with the last active ref being dropped, so the + * key has become, or is about to become, "absent". + * Therefore, we need to allocate a new key struct. + */ return KEY_DEAD; + } move_master_key_secret(&mk->mk_secret, secret); + WRITE_ONCE(mk->mk_present, true); } return 0; @@ -506,8 +525,8 @@ static int do_add_master_key(struct super_block *sb, err = add_new_master_key(sb, secret, mk_spec); } else { /* - * Found the key in ->s_master_keys. Re-add the secret if - * needed, and add the user to ->mk_users if needed. + * Found the key in ->s_master_keys. Add the user to ->mk_users + * if needed, and make the key "present" again if possible. */ down_write(&mk->mk_sem); err = add_existing_master_key(mk, secret); @@ -867,7 +886,7 @@ static void shrink_dcache_inode(struct inode *inode) static void evict_dentries_for_decrypted_inodes(struct fscrypt_master_key *mk) { - struct fscrypt_info *ci; + struct fscrypt_inode_info *ci; struct inode *inode; struct inode *toput_inode = NULL; @@ -917,7 +936,7 @@ static int check_for_busy_inodes(struct super_block *sb, /* select an example file to show for debugging purposes */ struct inode *inode = list_first_entry(&mk->mk_decrypted_inodes, - struct fscrypt_info, + struct fscrypt_inode_info, ci_master_key_link)->ci_inode; ino = inode->i_ino; } @@ -989,9 +1008,8 @@ static int try_to_lock_encrypted_files(struct super_block *sb, * * If all inodes were evicted, then we unlink the fscrypt_master_key from the * keyring. Otherwise it remains in the keyring in the "incompletely removed" - * state (without the actual secret key) where it tracks the list of remaining - * inodes. Userspace can execute the ioctl again later to retry eviction, or - * alternatively can re-add the secret key again. + * state where it tracks the list of remaining inodes. Userspace can execute + * the ioctl again later to retry eviction, or alternatively can re-add the key. * * For more details, see the "Removing keys" section of * Documentation/filesystems/fscrypt.rst. @@ -1053,11 +1071,10 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) } } - /* No user claims remaining. Go ahead and wipe the secret. */ + /* No user claims remaining. Initiate removal of the key. */ err = -ENOKEY; - if (is_master_key_secret_present(&mk->mk_secret)) { - wipe_master_key_secret(&mk->mk_secret); - fscrypt_put_master_key_activeref(sb, mk); + if (mk->mk_present) { + fscrypt_initiate_key_removal(sb, mk); err = 0; } inodes_remain = refcount_read(&mk->mk_active_refs) > 0; @@ -1074,9 +1091,9 @@ static int do_remove_key(struct file *filp, void __user *_uarg, bool all_users) } /* * We return 0 if we successfully did something: removed a claim to the - * key, wiped the secret, or tried locking the files again. Users need - * to check the informational status flags if they care whether the key - * has been fully removed including all files locked. + * key, initiated removal of the key, or tried locking the files again. + * Users need to check the informational status flags if they care + * whether the key has been fully removed including all files locked. */ out_put_key: fscrypt_put_master_key(mk); @@ -1103,12 +1120,11 @@ EXPORT_SYMBOL_GPL(fscrypt_ioctl_remove_key_all_users); * Retrieve the status of an fscrypt master encryption key. * * We set ->status to indicate whether the key is absent, present, or - * incompletely removed. "Incompletely removed" means that the master key - * secret has been removed, but some files which had been unlocked with it are - * still in use. This field allows applications to easily determine the state - * of an encrypted directory without using a hack such as trying to open a - * regular file in it (which can confuse the "incompletely removed" state with - * absent or present). + * incompletely removed. (For an explanation of what these statuses mean and + * how they are represented internally, see struct fscrypt_master_key.) This + * field allows applications to easily determine the status of an encrypted + * directory without using a hack such as trying to open a regular file in it + * (which can confuse the "incompletely removed" status with absent or present). * * In addition, for v2 policy keys we allow applications to determine, via * ->status_flags and ->user_count, whether the key has been added by the @@ -1150,7 +1166,7 @@ int fscrypt_ioctl_get_key_status(struct file *filp, void __user *uarg) } down_read(&mk->mk_sem); - if (!is_master_key_secret_present(&mk->mk_secret)) { + if (!mk->mk_present) { arg.status = refcount_read(&mk->mk_active_refs) > 0 ? FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED : FSCRYPT_KEY_STATUS_ABSENT /* raced with full removal */; diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index 361f41ef46c7..d71f7c799e79 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -148,7 +148,7 @@ err_free_tfm: * and IV generation method (@ci->ci_policy.flags). */ int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, - const u8 *raw_key, const struct fscrypt_info *ci) + const u8 *raw_key, const struct fscrypt_inode_info *ci) { struct crypto_skcipher *tfm; @@ -178,13 +178,14 @@ void fscrypt_destroy_prepared_key(struct super_block *sb, } /* Given a per-file encryption key, set up the file's crypto transform object */ -int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key) +int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci, + const u8 *raw_key) { ci->ci_owns_key = true; return fscrypt_prepare_key(&ci->ci_enc_key, raw_key, ci); } -static int setup_per_mode_enc_key(struct fscrypt_info *ci, +static int setup_per_mode_enc_key(struct fscrypt_inode_info *ci, struct fscrypt_master_key *mk, struct fscrypt_prepared_key *keys, u8 hkdf_context, bool include_fs_uuid) @@ -265,7 +266,7 @@ static int fscrypt_derive_siphash_key(const struct fscrypt_master_key *mk, return 0; } -int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, +int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk) { int err; @@ -279,7 +280,7 @@ int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, return 0; } -void fscrypt_hash_inode_number(struct fscrypt_info *ci, +void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk) { WARN_ON_ONCE(ci->ci_inode->i_ino == 0); @@ -289,7 +290,7 @@ void fscrypt_hash_inode_number(struct fscrypt_info *ci, &mk->mk_ino_hash_key); } -static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_info *ci, +static int fscrypt_setup_iv_ino_lblk_32_key(struct fscrypt_inode_info *ci, struct fscrypt_master_key *mk) { int err; @@ -329,7 +330,7 @@ unlock: return 0; } -static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, +static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci, struct fscrypt_master_key *mk, bool need_dirhash_key) { @@ -404,7 +405,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, * still allow 512-bit master keys if the user chooses to use them, though.) */ static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk, - const struct fscrypt_info *ci) + const struct fscrypt_inode_info *ci) { unsigned int min_keysize; @@ -430,11 +431,12 @@ static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk, * * If the master key is found in the filesystem-level keyring, then it is * returned in *mk_ret with its semaphore read-locked. This is needed to ensure - * that only one task links the fscrypt_info into ->mk_decrypted_inodes (as - * multiple tasks may race to create an fscrypt_info for the same inode), and to - * synchronize the master key being removed with a new inode starting to use it. + * that only one task links the fscrypt_inode_info into ->mk_decrypted_inodes + * (as multiple tasks may race to create an fscrypt_inode_info for the same + * inode), and to synchronize the master key being removed with a new inode + * starting to use it. */ -static int setup_file_encryption_key(struct fscrypt_info *ci, +static int setup_file_encryption_key(struct fscrypt_inode_info *ci, bool need_dirhash_key, struct fscrypt_master_key **mk_ret) { @@ -484,8 +486,8 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, } down_read(&mk->mk_sem); - /* Has the secret been removed (via FS_IOC_REMOVE_ENCRYPTION_KEY)? */ - if (!is_master_key_secret_present(&mk->mk_secret)) { + if (!mk->mk_present) { + /* FS_IOC_REMOVE_ENCRYPTION_KEY has been executed on this key */ err = -ENOKEY; goto out_release_key; } @@ -519,7 +521,7 @@ out_release_key: return err; } -static void put_crypt_info(struct fscrypt_info *ci) +static void put_crypt_info(struct fscrypt_inode_info *ci) { struct fscrypt_master_key *mk; @@ -537,8 +539,8 @@ static void put_crypt_info(struct fscrypt_info *ci) /* * Remove this inode from the list of inodes that were unlocked * with the master key. In addition, if we're removing the last - * inode from a master key struct that already had its secret - * removed, then complete the full removal of the struct. + * inode from an incompletely removed key, then complete the + * full removal of the key. */ spin_lock(&mk->mk_decrypted_inodes_lock); list_del(&ci->ci_master_key_link); @@ -546,7 +548,7 @@ static void put_crypt_info(struct fscrypt_info *ci) fscrypt_put_master_key_activeref(ci->ci_inode->i_sb, mk); } memzero_explicit(ci, sizeof(*ci)); - kmem_cache_free(fscrypt_info_cachep, ci); + kmem_cache_free(fscrypt_inode_info_cachep, ci); } static int @@ -555,7 +557,7 @@ fscrypt_setup_encryption_info(struct inode *inode, const u8 nonce[FSCRYPT_FILE_NONCE_SIZE], bool need_dirhash_key) { - struct fscrypt_info *crypt_info; + struct fscrypt_inode_info *crypt_info; struct fscrypt_mode *mode; struct fscrypt_master_key *mk = NULL; int res; @@ -564,7 +566,7 @@ fscrypt_setup_encryption_info(struct inode *inode, if (res) return res; - crypt_info = kmem_cache_zalloc(fscrypt_info_cachep, GFP_KERNEL); + crypt_info = kmem_cache_zalloc(fscrypt_inode_info_cachep, GFP_KERNEL); if (!crypt_info) return -ENOMEM; @@ -580,6 +582,11 @@ fscrypt_setup_encryption_info(struct inode *inode, WARN_ON_ONCE(mode->ivsize > FSCRYPT_MAX_IV_SIZE); crypt_info->ci_mode = mode; + crypt_info->ci_data_unit_bits = + fscrypt_policy_du_bits(&crypt_info->ci_policy, inode); + crypt_info->ci_data_units_per_block_bits = + inode->i_blkbits - crypt_info->ci_data_unit_bits; + res = setup_file_encryption_key(crypt_info, need_dirhash_key, &mk); if (res) goto out; @@ -587,8 +594,8 @@ fscrypt_setup_encryption_info(struct inode *inode, /* * For existing inodes, multiple tasks may race to set ->i_crypt_info. * So use cmpxchg_release(). This pairs with the smp_load_acquire() in - * fscrypt_get_info(). I.e., here we publish ->i_crypt_info with a - * RELEASE barrier so that other tasks can ACQUIRE it. + * fscrypt_get_inode_info(). I.e., here we publish ->i_crypt_info with + * a RELEASE barrier so that other tasks can ACQUIRE it. */ if (cmpxchg_release(&inode->i_crypt_info, NULL, crypt_info) == NULL) { /* @@ -735,8 +742,8 @@ EXPORT_SYMBOL_GPL(fscrypt_prepare_new_inode); * fscrypt_put_encryption_info() - free most of an inode's fscrypt data * @inode: an inode being evicted * - * Free the inode's fscrypt_info. Filesystems must call this when the inode is - * being evicted. An RCU grace period need not have elapsed yet. + * Free the inode's fscrypt_inode_info. Filesystems must call this when the + * inode is being evicted. An RCU grace period need not have elapsed yet. */ void fscrypt_put_encryption_info(struct inode *inode) { @@ -773,7 +780,7 @@ EXPORT_SYMBOL(fscrypt_free_inode); */ int fscrypt_drop_inode(struct inode *inode) { - const struct fscrypt_info *ci = fscrypt_get_info(inode); + const struct fscrypt_inode_info *ci = fscrypt_get_inode_info(inode); /* * If ci is NULL, then the inode doesn't have an encryption key set up @@ -794,13 +801,14 @@ int fscrypt_drop_inode(struct inode *inode) return 0; /* - * Note: since we aren't holding the key semaphore, the result here can + * We can't take ->mk_sem here, since this runs in atomic context. + * Therefore, ->mk_present can change concurrently, and our result may * immediately become outdated. But there's no correctness problem with * unnecessarily evicting. Nor is there a correctness problem with not * evicting while iput() is racing with the key being removed, since * then the thread removing the key will either evict the inode itself * or will correctly detect that it wasn't evicted due to the race. */ - return !is_master_key_secret_present(&ci->ci_master_key->mk_secret); + return !READ_ONCE(ci->ci_master_key->mk_present); } EXPORT_SYMBOL_GPL(fscrypt_drop_inode); diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c index 75dabd9b27f9..a10710bc8123 100644 --- a/fs/crypto/keysetup_v1.c +++ b/fs/crypto/keysetup_v1.c @@ -178,7 +178,8 @@ void fscrypt_put_direct_key(struct fscrypt_direct_key *dk) */ static struct fscrypt_direct_key * find_or_insert_direct_key(struct fscrypt_direct_key *to_insert, - const u8 *raw_key, const struct fscrypt_info *ci) + const u8 *raw_key, + const struct fscrypt_inode_info *ci) { unsigned long hash_key; struct fscrypt_direct_key *dk; @@ -218,7 +219,7 @@ find_or_insert_direct_key(struct fscrypt_direct_key *to_insert, /* Prepare to encrypt directly using the master key in the given mode */ static struct fscrypt_direct_key * -fscrypt_get_direct_key(const struct fscrypt_info *ci, const u8 *raw_key) +fscrypt_get_direct_key(const struct fscrypt_inode_info *ci, const u8 *raw_key) { struct fscrypt_direct_key *dk; int err; @@ -250,7 +251,7 @@ err_free_dk: } /* v1 policy, DIRECT_KEY: use the master key directly */ -static int setup_v1_file_key_direct(struct fscrypt_info *ci, +static int setup_v1_file_key_direct(struct fscrypt_inode_info *ci, const u8 *raw_master_key) { struct fscrypt_direct_key *dk; @@ -264,7 +265,7 @@ static int setup_v1_file_key_direct(struct fscrypt_info *ci, } /* v1 policy, !DIRECT_KEY: derive the file's encryption key */ -static int setup_v1_file_key_derived(struct fscrypt_info *ci, +static int setup_v1_file_key_derived(struct fscrypt_inode_info *ci, const u8 *raw_master_key) { u8 *derived_key; @@ -289,7 +290,8 @@ out: return err; } -int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, const u8 *raw_master_key) +int fscrypt_setup_v1_file_key(struct fscrypt_inode_info *ci, + const u8 *raw_master_key) { if (ci->ci_policy.v1.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) return setup_v1_file_key_direct(ci, raw_master_key); @@ -297,8 +299,10 @@ int fscrypt_setup_v1_file_key(struct fscrypt_info *ci, const u8 *raw_master_key) return setup_v1_file_key_derived(ci, raw_master_key); } -int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci) +int +fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_inode_info *ci) { + const struct super_block *sb = ci->ci_inode->i_sb; struct key *key; const struct fscrypt_key *payload; int err; @@ -306,8 +310,8 @@ int fscrypt_setup_v1_file_key_via_subscribed_keyrings(struct fscrypt_info *ci) key = find_and_lock_process_key(FSCRYPT_KEY_DESC_PREFIX, ci->ci_policy.v1.master_key_descriptor, ci->ci_mode->keysize, &payload); - if (key == ERR_PTR(-ENOKEY) && ci->ci_inode->i_sb->s_cop->key_prefix) { - key = find_and_lock_process_key(ci->ci_inode->i_sb->s_cop->key_prefix, + if (key == ERR_PTR(-ENOKEY) && sb->s_cop->legacy_key_prefix) { + key = find_and_lock_process_key(sb->s_cop->legacy_key_prefix, ci->ci_policy.v1.master_key_descriptor, ci->ci_mode->keysize, &payload); } diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index f4456ecb3f87..701259991277 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -118,12 +118,11 @@ static bool supported_direct_key_modes(const struct inode *inode, } static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy, - const struct inode *inode, - const char *type, - int max_ino_bits, int max_lblk_bits) + const struct inode *inode) { + const char *type = (policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) + ? "IV_INO_LBLK_64" : "IV_INO_LBLK_32"; struct super_block *sb = inode->i_sb; - int ino_bits = 64, lblk_bits = 64; /* * IV_INO_LBLK_* exist only because of hardware limitations, and @@ -150,17 +149,29 @@ static bool supported_iv_ino_lblk_policy(const struct fscrypt_policy_v2 *policy, type, sb->s_id); return false; } - if (sb->s_cop->get_ino_and_lblk_bits) - sb->s_cop->get_ino_and_lblk_bits(sb, &ino_bits, &lblk_bits); - if (ino_bits > max_ino_bits) { + + /* + * IV_INO_LBLK_64 and IV_INO_LBLK_32 both require that inode numbers fit + * in 32 bits. In principle, IV_INO_LBLK_32 could support longer inode + * numbers because it hashes the inode number; however, currently the + * inode number is gotten from inode::i_ino which is 'unsigned long'. + * So for now the implementation limit is 32 bits. + */ + if (!sb->s_cop->has_32bit_inodes) { fscrypt_warn(inode, "Can't use %s policy on filesystem '%s' because its inode numbers are too long", type, sb->s_id); return false; } - if (lblk_bits > max_lblk_bits) { + + /* + * IV_INO_LBLK_64 and IV_INO_LBLK_32 both require that file data unit + * indices fit in 32 bits. + */ + if (fscrypt_max_file_dun_bits(sb, + fscrypt_policy_v2_du_bits(policy, inode)) > 32) { fscrypt_warn(inode, - "Can't use %s policy on filesystem '%s' because its block numbers are too long", + "Can't use %s policy on filesystem '%s' because its maximum file size is too large", type, sb->s_id); return false; } @@ -233,25 +244,39 @@ static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, return false; } + if (policy->log2_data_unit_size) { + if (!inode->i_sb->s_cop->supports_subblock_data_units) { + fscrypt_warn(inode, + "Filesystem does not support configuring crypto data unit size"); + return false; + } + if (policy->log2_data_unit_size > inode->i_blkbits || + policy->log2_data_unit_size < SECTOR_SHIFT /* 9 */) { + fscrypt_warn(inode, + "Unsupported log2_data_unit_size in encryption policy: %d", + policy->log2_data_unit_size); + return false; + } + if (policy->log2_data_unit_size != inode->i_blkbits && + (policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) { + /* + * Not safe to enable yet, as we need to ensure that DUN + * wraparound can only occur on a FS block boundary. + */ + fscrypt_warn(inode, + "Sub-block data units not yet supported with IV_INO_LBLK_32"); + return false; + } + } + if ((policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) && !supported_direct_key_modes(inode, policy->contents_encryption_mode, policy->filenames_encryption_mode)) return false; - if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) && - !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_64", - 32, 32)) - return false; - - /* - * IV_INO_LBLK_32 hashes the inode number, so in principle it can - * support any ino_bits. However, currently the inode number is gotten - * from inode::i_ino which is 'unsigned long'. So for now the - * implementation limit is 32 bits. - */ - if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) && - !supported_iv_ino_lblk_policy(policy, inode, "IV_INO_LBLK_32", - 32, 32)) + if ((policy->flags & (FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64 | + FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) && + !supported_iv_ino_lblk_policy(policy, inode)) return false; if (memchr_inv(policy->__reserved, 0, sizeof(policy->__reserved))) { @@ -330,6 +355,7 @@ static int fscrypt_new_context(union fscrypt_context *ctx_u, ctx->filenames_encryption_mode = policy->filenames_encryption_mode; ctx->flags = policy->flags; + ctx->log2_data_unit_size = policy->log2_data_unit_size; memcpy(ctx->master_key_identifier, policy->master_key_identifier, sizeof(ctx->master_key_identifier)); @@ -390,6 +416,7 @@ int fscrypt_policy_from_context(union fscrypt_policy *policy_u, policy->filenames_encryption_mode = ctx->filenames_encryption_mode; policy->flags = ctx->flags; + policy->log2_data_unit_size = ctx->log2_data_unit_size; memcpy(policy->__reserved, ctx->__reserved, sizeof(policy->__reserved)); memcpy(policy->master_key_identifier, @@ -405,11 +432,11 @@ int fscrypt_policy_from_context(union fscrypt_policy *policy_u, /* Retrieve an inode's encryption policy */ static int fscrypt_get_policy(struct inode *inode, union fscrypt_policy *policy) { - const struct fscrypt_info *ci; + const struct fscrypt_inode_info *ci; union fscrypt_context ctx; int ret; - ci = fscrypt_get_info(inode); + ci = fscrypt_get_inode_info(inode); if (ci) { /* key available, use the cached policy */ *policy = ci->ci_policy; @@ -647,7 +674,7 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) /* * Both parent and child are encrypted, so verify they use the same - * encryption policy. Compare the fscrypt_info structs if the keys are + * encryption policy. Compare the cached policies if the keys are * available, otherwise retrieve and compare the fscrypt_contexts. * * Note that the fscrypt_context retrieval will be required frequently @@ -717,7 +744,7 @@ const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir) */ int fscrypt_context_for_new_inode(void *ctx, struct inode *inode) { - struct fscrypt_info *ci = inode->i_crypt_info; + struct fscrypt_inode_info *ci = inode->i_crypt_info; BUILD_BUG_ON(sizeof(union fscrypt_context) != FSCRYPT_SET_CONTEXT_MAX_SIZE); @@ -742,7 +769,7 @@ EXPORT_SYMBOL_GPL(fscrypt_context_for_new_inode); */ int fscrypt_set_context(struct inode *inode, void *fs_data) { - struct fscrypt_info *ci = inode->i_crypt_info; + struct fscrypt_inode_info *ci = inode->i_crypt_info; union fscrypt_context ctx; int ctxsize; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 83e57e9f9fa0..5d41765e0c77 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -72,7 +72,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb) struct inode *inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); } return inode; } diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 299c295a27a0..c830261aa883 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -338,7 +338,7 @@ static int mknod_ptmx(struct super_block *sb) } inode->i_ino = 2; - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); mode = S_IFCHR|opts->ptmxmode; init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2)); @@ -451,7 +451,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent) if (!inode) goto fail; inode->i_ino = 1; - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; @@ -560,7 +560,7 @@ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv) inode->i_ino = index + 3; inode->i_uid = opts->setuid ? opts->uid : current_fsuid(); inode->i_gid = opts->setgid ? opts->gid : current_fsgid(); - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); init_special_inode(inode, S_IFCHR|opts->mode, MKDEV(UNIX98_PTY_SLAVE_MAJOR, index)); sprintf(s, "%d", index); diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index f2ed0c0266cb..c586c5db18b5 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -702,6 +702,6 @@ int ecryptfs_set_f_namelen(long *namelen, long lower_namelen, int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, loff_t offset); -extern const struct xattr_handler *ecryptfs_xattr_handlers[]; +extern const struct xattr_handler * const ecryptfs_xattr_handlers[]; #endif /* #ifndef ECRYPTFS_KERNEL_H */ diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 992d9c7e64ae..a25dd3d20008 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -1210,7 +1210,7 @@ static const struct xattr_handler ecryptfs_xattr_handler = { .set = ecryptfs_xattr_set, }; -const struct xattr_handler *ecryptfs_xattr_handlers[] = { +const struct xattr_handler * const ecryptfs_xattr_handlers[] = { &ecryptfs_xattr_handler, NULL }; diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index 59b52718a3a2..7e9961639802 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -51,7 +51,7 @@ static ssize_t efivarfs_file_write(struct file *file, } else { inode_lock(inode); i_size_write(inode, datasize + sizeof(attributes)); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); inode_unlock(inode); } diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index db9231f0e77b..76dd3c7295d9 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -25,7 +25,7 @@ struct inode *efivarfs_get_inode(struct super_block *sb, if (inode) { inode->i_ino = get_next_ino(); inode->i_mode = mode; - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); inode->i_flags = is_removable ? 0 : S_IMMUTABLE; switch (mode & S_IFMT) { case S_IFREG: diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index e028fafa04f3..996271473609 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -32,10 +32,16 @@ static int efivarfs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 storage_space, remaining_space, max_variable_size; efi_status_t status; - status = efivar_query_variable_info(attr, &storage_space, &remaining_space, - &max_variable_size); - if (status != EFI_SUCCESS) - return efi_status_to_err(status); + /* Some UEFI firmware does not implement QueryVariableInfo() */ + storage_space = remaining_space = 0; + if (efi_rt_services_supported(EFI_RT_SUPPORTED_QUERY_VARIABLE_INFO)) { + status = efivar_query_variable_info(attr, &storage_space, + &remaining_space, + &max_variable_size); + if (status != EFI_SUCCESS && status != EFI_UNSUPPORTED) + pr_warn_ratelimited("query_variable_info() failed: 0x%lx\n", + status); + } /* * This is not a normal filesystem, so no point in pretending it has a block diff --git a/fs/efs/inode.c b/fs/efs/inode.c index 3789d22ba501..7844ab24b813 100644 --- a/fs/efs/inode.c +++ b/fs/efs/inode.c @@ -103,10 +103,9 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) i_uid_write(inode, (uid_t)be16_to_cpu(efs_inode->di_uid)); i_gid_write(inode, (gid_t)be16_to_cpu(efs_inode->di_gid)); inode->i_size = be32_to_cpu(efs_inode->di_size); - inode->i_atime.tv_sec = be32_to_cpu(efs_inode->di_atime); - inode->i_mtime.tv_sec = be32_to_cpu(efs_inode->di_mtime); + inode_set_atime(inode, be32_to_cpu(efs_inode->di_atime), 0); + inode_set_mtime(inode, be32_to_cpu(efs_inode->di_mtime), 0); inode_set_ctime(inode, be32_to_cpu(efs_inode->di_ctime), 0); - inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0; /* this is the number of blocks in the file */ if (inode->i_size == 0) { diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 0c2c99c58b5e..f6a0a1748521 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -222,7 +222,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) up_read(&devs->rwsem); return 0; } - map->m_bdev = dif->bdev; + map->m_bdev = dif->bdev_handle->bdev; map->m_daxdev = dif->dax_dev; map->m_dax_part_off = dif->dax_part_off; map->m_fscache = dif->fscache; @@ -240,7 +240,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) if (map->m_pa >= startoff && map->m_pa < startoff + length) { map->m_pa -= startoff; - map->m_bdev = dif->bdev; + map->m_bdev = dif->bdev_handle->bdev; map->m_daxdev = dif->dax_dev; map->m_dax_part_off = dif->dax_part_off; map->m_fscache = dif->fscache; diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c index 73091fbe3ea4..dee10d22ada9 100644 --- a/fs/erofs/decompressor_lzma.c +++ b/fs/erofs/decompressor_lzma.c @@ -217,9 +217,12 @@ again: strm->buf.out_size = min_t(u32, outlen, PAGE_SIZE - pageofs); outlen -= strm->buf.out_size; - if (!rq->out[no] && rq->fillgaps) /* deduped */ + if (!rq->out[no] && rq->fillgaps) { /* deduped */ rq->out[no] = erofs_allocpage(pagepool, GFP_KERNEL | __GFP_NOFAIL); + set_page_private(rq->out[no], + Z_EROFS_SHORTLIVED_PAGE); + } if (rq->out[no]) strm->buf.out = kmap(rq->out[no]) + pageofs; pageofs = 0; diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index edc8ec7581b8..b8ad05b4509d 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -175,7 +175,8 @@ static void *erofs_read_inode(struct erofs_buf *buf, vi->chunkbits = sb->s_blocksize_bits + (vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK); } - inode->i_mtime = inode->i_atime = inode_get_ctime(inode); + inode_set_mtime_to_ts(inode, + inode_set_atime_to_ts(inode, inode_get_ctime(inode))); inode->i_flags &= ~S_DAX; if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) && diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 4ff88d0dd980..cf04e21bfda2 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -47,7 +47,7 @@ typedef u32 erofs_blk_t; struct erofs_device_info { char *path; struct erofs_fscache *fscache; - struct block_device *bdev; + struct bdev_handle *bdev_handle; struct dax_device *dax_dev; u64 dax_part_off; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 44a24d573f1f..6fd04781fec5 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -227,7 +227,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_fscache *fscache; struct erofs_deviceslot *dis; - struct block_device *bdev; + struct bdev_handle *bdev_handle; void *ptr; ptr = erofs_read_metabuf(buf, sb, erofs_blknr(sb, *pos), EROFS_KMAP); @@ -235,7 +235,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, return PTR_ERR(ptr); dis = ptr + erofs_blkoff(sb, *pos); - if (!dif->path) { + if (!sbi->devs->flatdev && !dif->path) { if (!dis->tag[0]) { erofs_err(sb, "empty device tag @ pos %llu", *pos); return -EINVAL; @@ -251,13 +251,13 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, return PTR_ERR(fscache); dif->fscache = fscache; } else if (!sbi->devs->flatdev) { - bdev = blkdev_get_by_path(dif->path, BLK_OPEN_READ, sb->s_type, - NULL); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); - dif->bdev = bdev; - dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off, - NULL, NULL); + bdev_handle = bdev_open_by_path(dif->path, BLK_OPEN_READ, + sb->s_type, NULL); + if (IS_ERR(bdev_handle)) + return PTR_ERR(bdev_handle); + dif->bdev_handle = bdev_handle; + dif->dax_dev = fs_dax_get_by_bdev(bdev_handle->bdev, + &dif->dax_part_off, NULL, NULL); } dif->blocks = le32_to_cpu(dis->blocks); @@ -806,8 +806,8 @@ static int erofs_release_device_info(int id, void *ptr, void *data) struct erofs_device_info *dif = ptr; fs_put_dax(dif->dax_dev, NULL); - if (dif->bdev) - blkdev_put(dif->bdev, &erofs_fs_type); + if (dif->bdev_handle) + bdev_release(dif->bdev_handle); erofs_fscache_unregister_cookie(dif->fscache); dif->fscache = NULL; kfree(dif->path); diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 09d341675e89..b58316b49a43 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -168,7 +168,7 @@ const struct xattr_handler __maybe_unused erofs_xattr_security_handler = { }; #endif -const struct xattr_handler *erofs_xattr_handlers[] = { +const struct xattr_handler * const erofs_xattr_handlers[] = { &erofs_xattr_user_handler, &erofs_xattr_trusted_handler, #ifdef CONFIG_EROFS_FS_SECURITY diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h index f16283cb8c93..b246cd0e135e 100644 --- a/fs/erofs/xattr.h +++ b/fs/erofs/xattr.h @@ -23,7 +23,7 @@ static inline const char *erofs_xattr_prefix(unsigned int idx, { const struct xattr_handler *handler = NULL; - static const struct xattr_handler *xattr_handler_map[] = { + static const struct xattr_handler * const xattr_handler_map[] = { [EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler, #ifdef CONFIG_EROFS_FS_POSIX_ACL [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, @@ -44,7 +44,7 @@ static inline const char *erofs_xattr_prefix(unsigned int idx, return xattr_prefix(handler); } -extern const struct xattr_handler *erofs_xattr_handlers[]; +extern const struct xattr_handler * const erofs_xattr_handlers[]; int erofs_xattr_prefixes_init(struct super_block *sb); void erofs_xattr_prefixes_cleanup(struct super_block *sb); diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index f55498e5c23d..f78b614f44dc 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -549,6 +549,7 @@ void __exfat_fs_error(struct super_block *sb, int report, const char *fmt, ...) void exfat_get_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, u8 tz, __le16 time, __le16 date, u8 time_cs); void exfat_truncate_atime(struct timespec64 *ts); +void exfat_truncate_inode_atime(struct inode *inode); void exfat_set_entry_time(struct exfat_sb_info *sbi, struct timespec64 *ts, u8 *tz, __le16 *time, __le16 *date, u8 *time_cs); u16 exfat_calc_chksum16(void *data, int len, u16 chksum, int type); diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 32395ef686a2..30ee2c8d36a5 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -22,7 +22,7 @@ static int exfat_cont_expand(struct inode *inode, loff_t size) if (err) return err; - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); if (!IS_SYNC(inode)) @@ -290,10 +290,9 @@ int exfat_setattr(struct mnt_idmap *idmap, struct dentry *dentry, } if (attr->ia_valid & ATTR_SIZE) - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); - setattr_copy(&nop_mnt_idmap, inode, attr); - exfat_truncate_atime(&inode->i_atime); + exfat_truncate_inode_atime(inode); if (attr->ia_valid & ATTR_SIZE) { error = exfat_block_truncate_page(inode, attr->ia_size); diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 13329baeafbc..a2185e6f0548 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -26,6 +26,7 @@ int __exfat_write_inode(struct inode *inode, int sync) struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(inode); bool is_dir = (ei->type == TYPE_DIR) ? true : false; + struct timespec64 ts; if (inode->i_ino == EXFAT_ROOT_INO) return 0; @@ -55,16 +56,18 @@ int __exfat_write_inode(struct inode *inode, int sync) &ep->dentry.file.create_time, &ep->dentry.file.create_date, &ep->dentry.file.create_time_cs); - exfat_set_entry_time(sbi, &inode->i_mtime, - &ep->dentry.file.modify_tz, - &ep->dentry.file.modify_time, - &ep->dentry.file.modify_date, - &ep->dentry.file.modify_time_cs); - exfat_set_entry_time(sbi, &inode->i_atime, - &ep->dentry.file.access_tz, - &ep->dentry.file.access_time, - &ep->dentry.file.access_date, - NULL); + exfat_set_entry_time(sbi, &ts, + &ep->dentry.file.modify_tz, + &ep->dentry.file.modify_time, + &ep->dentry.file.modify_date, + &ep->dentry.file.modify_time_cs); + inode_set_mtime_to_ts(inode, ts); + exfat_set_entry_time(sbi, &ts, + &ep->dentry.file.access_tz, + &ep->dentry.file.access_time, + &ep->dentry.file.access_date, + NULL); + inode_set_atime_to_ts(inode, ts); /* File size should be zero if there is no cluster allocated */ on_disk_size = i_size_read(inode); @@ -355,7 +358,7 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to) if (to > i_size_read(inode)) { truncate_pagecache(inode, i_size_read(inode)); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); exfat_truncate(inode); } } @@ -398,7 +401,7 @@ static int exfat_write_end(struct file *file, struct address_space *mapping, exfat_write_failed(mapping, pos+len); if (!(err < 0) && !(ei->attr & ATTR_ARCHIVE)) { - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ei->attr |= ATTR_ARCHIVE; mark_inode_dirty(inode); } @@ -576,10 +579,10 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info) exfat_save_attr(inode, info->attr); inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> 9; - inode->i_mtime = info->mtime; + inode_set_mtime_to_ts(inode, info->mtime); inode_set_ctime_to_ts(inode, info->mtime); ei->i_crtime = info->crtime; - inode->i_atime = info->atime; + inode_set_atime_to_ts(inode, info->atime); return 0; } diff --git a/fs/exfat/misc.c b/fs/exfat/misc.c index 2e1a1a6b1021..fa8459828046 100644 --- a/fs/exfat/misc.c +++ b/fs/exfat/misc.c @@ -126,6 +126,14 @@ void exfat_truncate_atime(struct timespec64 *ts) ts->tv_nsec = 0; } +void exfat_truncate_inode_atime(struct inode *inode) +{ + struct timespec64 atime = inode_get_atime(inode); + + exfat_truncate_atime(&atime); + inode_set_atime_to_ts(inode, atime); +} + u16 exfat_calc_chksum16(void *data, int len, u16 chksum, int type) { int i; diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 1b9f587f6cca..b92e46916dea 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -569,7 +569,7 @@ static int exfat_create(struct mnt_idmap *idmap, struct inode *dir, goto unlock; inode_inc_iversion(dir); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); if (IS_DIRSYNC(dir)) exfat_sync_inode(dir); else @@ -582,8 +582,9 @@ static int exfat_create(struct mnt_idmap *idmap, struct inode *dir, goto unlock; inode_inc_iversion(inode); - inode->i_mtime = inode->i_atime = EXFAT_I(inode)->i_crtime = inode_set_ctime_current(inode); - exfat_truncate_atime(&inode->i_atime); + EXFAT_I(inode)->i_crtime = simple_inode_init_ts(inode); + exfat_truncate_inode_atime(inode); + /* timestamp is already written, so mark_inode_dirty() is unneeded. */ d_instantiate(dentry, inode); @@ -816,16 +817,16 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry) ei->dir.dir = DIR_DELETED; inode_inc_iversion(dir); - dir->i_mtime = dir->i_atime = inode_set_ctime_current(dir); - exfat_truncate_atime(&dir->i_atime); + simple_inode_init_ts(dir); + exfat_truncate_inode_atime(dir); if (IS_DIRSYNC(dir)) exfat_sync_inode(dir); else mark_inode_dirty(dir); clear_nlink(inode); - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); - exfat_truncate_atime(&inode->i_atime); + simple_inode_init_ts(inode); + exfat_truncate_inode_atime(inode); exfat_unhash_inode(inode); exfat_d_version_set(dentry, inode_query_iversion(dir)); unlock: @@ -851,7 +852,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, goto unlock; inode_inc_iversion(dir); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); if (IS_DIRSYNC(dir)) exfat_sync_inode(dir); else @@ -865,8 +866,8 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, goto unlock; inode_inc_iversion(inode); - inode->i_mtime = inode->i_atime = EXFAT_I(inode)->i_crtime = inode_set_ctime_current(inode); - exfat_truncate_atime(&inode->i_atime); + EXFAT_I(inode)->i_crtime = simple_inode_init_ts(inode); + exfat_truncate_inode_atime(inode); /* timestamp is already written, so mark_inode_dirty() is unneeded. */ d_instantiate(dentry, inode); @@ -977,8 +978,8 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry) ei->dir.dir = DIR_DELETED; inode_inc_iversion(dir); - dir->i_mtime = dir->i_atime = inode_set_ctime_current(dir); - exfat_truncate_atime(&dir->i_atime); + simple_inode_init_ts(dir); + exfat_truncate_inode_atime(dir); if (IS_DIRSYNC(dir)) exfat_sync_inode(dir); else @@ -986,8 +987,8 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry) drop_nlink(dir); clear_nlink(inode); - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); - exfat_truncate_atime(&inode->i_atime); + simple_inode_init_ts(inode); + exfat_truncate_inode_atime(inode); exfat_unhash_inode(inode); exfat_d_version_set(dentry, inode_query_iversion(dir)); unlock: @@ -1312,7 +1313,7 @@ static int exfat_rename(struct mnt_idmap *idmap, inode_inc_iversion(new_dir); simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); EXFAT_I(new_dir)->i_crtime = current_time(new_dir); - exfat_truncate_atime(&new_dir->i_atime); + exfat_truncate_inode_atime(new_dir); if (IS_DIRSYNC(new_dir)) exfat_sync_inode(new_dir); else diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 2778bd9b631e..e919a68bf4a1 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -370,8 +370,8 @@ static int exfat_read_root(struct inode *inode) ei->i_size_ondisk = i_size_read(inode); exfat_save_attr(inode, ATTR_SUBDIR); - inode->i_mtime = inode->i_atime = ei->i_crtime = inode_set_ctime_current(inode); - exfat_truncate_atime(&inode->i_atime); + ei->i_crtime = simple_inode_init_ts(inode); + exfat_truncate_inode_atime(inode); return 0; } diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index b335f17f682f..c7900868171b 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -468,7 +468,7 @@ int ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, ext2_set_de_type(de, inode); ext2_commit_chunk(page, pos, len); if (update_times) - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(dir); return ext2_handle_dirsync(dir); @@ -555,7 +555,7 @@ got_it: de->inode = cpu_to_le32(inode->i_ino); ext2_set_de_type (de, inode); ext2_commit_chunk(page, pos, rec_len); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(dir); err = ext2_handle_dirsync(dir); @@ -606,7 +606,7 @@ int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct page *page) pde->rec_len = ext2_rec_len_to_disk(to - from); dir->inode = 0; ext2_commit_chunk(page, pos, to - from); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); EXT2_I(inode)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(inode); return ext2_handle_dirsync(inode); diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index c24d0de95a83..fdf63e9c6e7c 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -546,7 +546,7 @@ got: inode->i_ino = ino; inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); memset(ei->i_data, 0, sizeof(ei->i_data)); ei->i_flags = ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 314b415ee518..464faf6c217e 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1291,7 +1291,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) __ext2_truncate_blocks(inode, newsize); filemap_invalidate_unlock(inode->i_mapping); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (inode_needs_sync(inode)) { sync_mapping_buffers(inode->i_mapping); sync_inode_metadata(inode, 1); @@ -1412,10 +1412,9 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) i_gid_write(inode, i_gid); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le32_to_cpu(raw_inode->i_size); - inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); + inode_set_atime(inode, (signed)le32_to_cpu(raw_inode->i_atime), 0); inode_set_ctime(inode, (signed)le32_to_cpu(raw_inode->i_ctime), 0); - inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime); - inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0; + inode_set_mtime(inode, (signed)le32_to_cpu(raw_inode->i_mtime), 0); ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. * This is needed because nfsd might try to access dead inodes @@ -1544,9 +1543,9 @@ static int __ext2_write_inode(struct inode *inode, int do_sync) } raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); raw_inode->i_size = cpu_to_le32(inode->i_size); - raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); - raw_inode->i_ctime = cpu_to_le32(inode_get_ctime(inode).tv_sec); - raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); + raw_inode->i_atime = cpu_to_le32(inode_get_atime_sec(inode)); + raw_inode->i_ctime = cpu_to_le32(inode_get_ctime_sec(inode)); + raw_inode->i_mtime = cpu_to_le32(inode_get_mtime_sec(inode)); raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index aaf3e3e88cb2..645ee6142f69 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1572,7 +1572,7 @@ out: if (inode->i_size < off+len-towrite) i_size_write(inode, off+len-towrite); inode_inc_iversion(inode); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); return len - towrite; } diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 20f741184673..e849241ebb8f 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -98,7 +98,7 @@ static struct buffer_head *ext2_xattr_cache_find(struct inode *, static void ext2_xattr_rehash(struct ext2_xattr_header *, struct ext2_xattr_entry *); -static const struct xattr_handler *ext2_xattr_handler_map[] = { +static const struct xattr_handler * const ext2_xattr_handler_map[] = { [EXT2_XATTR_INDEX_USER] = &ext2_xattr_user_handler, #ifdef CONFIG_EXT2_FS_POSIX_ACL [EXT2_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, @@ -110,7 +110,7 @@ static const struct xattr_handler *ext2_xattr_handler_map[] = { #endif }; -const struct xattr_handler *ext2_xattr_handlers[] = { +const struct xattr_handler * const ext2_xattr_handlers[] = { &ext2_xattr_user_handler, &ext2_xattr_trusted_handler, #ifdef CONFIG_EXT2_FS_SECURITY diff --git a/fs/ext2/xattr.h b/fs/ext2/xattr.h index 7925f596e8e2..6a4966949047 100644 --- a/fs/ext2/xattr.h +++ b/fs/ext2/xattr.h @@ -72,7 +72,7 @@ extern void ext2_xattr_delete_inode(struct inode *); extern struct mb_cache *ext2_xattr_create_cache(void); extern void ext2_xattr_destroy_cache(struct mb_cache *cache); -extern const struct xattr_handler *ext2_xattr_handlers[]; +extern const struct xattr_handler * const ext2_xattr_handlers[]; # else /* CONFIG_EXT2_FS_XATTR */ diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 453d4da5de52..7ae0b61258a7 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -232,19 +232,14 @@ static bool ext4_has_stable_inodes(struct super_block *sb) return ext4_has_feature_stable_inodes(sb); } -static void ext4_get_ino_and_lblk_bits(struct super_block *sb, - int *ino_bits_ret, int *lblk_bits_ret) -{ - *ino_bits_ret = 8 * sizeof(EXT4_SB(sb)->s_es->s_inodes_count); - *lblk_bits_ret = 8 * sizeof(ext4_lblk_t); -} - const struct fscrypt_operations ext4_cryptops = { - .key_prefix = "ext4:", + .needs_bounce_pages = 1, + .has_32bit_inodes = 1, + .supports_subblock_data_units = 1, + .legacy_key_prefix = "ext4:", .get_context = ext4_get_context, .set_context = ext4_set_context, .get_dummy_policy = ext4_get_dummy_policy, .empty_dir = ext4_empty_dir, .has_stable_inodes = ext4_has_stable_inodes, - .get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits, }; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9418359b1d9d..8da5fb680210 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -891,10 +891,13 @@ do { \ (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (ts).tv_sec, S32_MIN, S32_MAX)); \ } while (0) -#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ - EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, (inode)->xtime) +#define EXT4_INODE_SET_ATIME(inode, raw_inode) \ + EXT4_INODE_SET_XTIME_VAL(i_atime, inode, raw_inode, inode_get_atime(inode)) -#define EXT4_INODE_SET_CTIME(inode, raw_inode) \ +#define EXT4_INODE_SET_MTIME(inode, raw_inode) \ + EXT4_INODE_SET_XTIME_VAL(i_mtime, inode, raw_inode, inode_get_mtime(inode)) + +#define EXT4_INODE_SET_CTIME(inode, raw_inode) \ EXT4_INODE_SET_XTIME_VAL(i_ctime, inode, raw_inode, inode_get_ctime(inode)) #define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ @@ -910,9 +913,16 @@ do { \ .tv_sec = (signed)le32_to_cpu((raw_inode)->xtime) \ }) -#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \ +#define EXT4_INODE_GET_ATIME(inode, raw_inode) \ +do { \ + inode_set_atime_to_ts(inode, \ + EXT4_INODE_GET_XTIME_VAL(i_atime, inode, raw_inode)); \ +} while (0) + +#define EXT4_INODE_GET_MTIME(inode, raw_inode) \ do { \ - (inode)->xtime = EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode); \ + inode_set_mtime_to_ts(inode, \ + EXT4_INODE_GET_XTIME_VAL(i_mtime, inode, raw_inode)); \ } while (0) #define EXT4_INODE_GET_CTIME(inode, raw_inode) \ @@ -1537,7 +1547,7 @@ struct ext4_sb_info { unsigned long s_commit_interval; u32 s_max_batch_time; u32 s_min_batch_time; - struct block_device *s_journal_bdev; + struct bdev_handle *s_journal_bdev_handle; #ifdef CONFIG_QUOTA /* Names of quota files with journalled quota */ char __rcu *s_qf_names[EXT4_MAXQUOTAS]; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 202c76996b62..4c4176ee1749 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4481,7 +4481,8 @@ retry: if (epos > new_size) epos = new_size; if (ext4_update_inode_size(inode, epos) & 0x1) - inode->i_mtime = inode_get_ctime(inode); + inode_set_mtime_to_ts(inode, + inode_get_ctime(inode)); } ret2 = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); @@ -4617,7 +4618,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, /* Now release the pages and zero block aligned part of pages */ truncate_pagecache_range(inode, start, end - 1); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); @@ -4642,7 +4643,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, goto out_mutex; } - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (new_size) ext4_update_inode_size(inode, new_size); ret = ext4_mark_inode_dirty(handle, inode); @@ -5378,7 +5379,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); @@ -5488,7 +5489,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; EXT4_I(inode)->i_disksize += len; - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = ext4_mark_inode_dirty(handle, inode); if (ret) goto out_stop; diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index cdf9bfe10137..11e6f33677a2 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -576,8 +576,9 @@ static bool ext4_getfsmap_is_valid_device(struct super_block *sb, if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX || fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev)) return true; - if (EXT4_SB(sb)->s_journal_bdev && - fm->fmr_device == new_encode_dev(EXT4_SB(sb)->s_journal_bdev->bd_dev)) + if (EXT4_SB(sb)->s_journal_bdev_handle && + fm->fmr_device == + new_encode_dev(EXT4_SB(sb)->s_journal_bdev_handle->bdev->bd_dev)) return true; return false; } @@ -647,9 +648,9 @@ int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head, memset(handlers, 0, sizeof(handlers)); handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev); handlers[0].gfd_fn = ext4_getfsmap_datadev; - if (EXT4_SB(sb)->s_journal_bdev) { + if (EXT4_SB(sb)->s_journal_bdev_handle) { handlers[1].gfd_dev = new_encode_dev( - EXT4_SB(sb)->s_journal_bdev->bd_dev); + EXT4_SB(sb)->s_journal_bdev_handle->bdev->bd_dev); handlers[1].gfd_fn = ext4_getfsmap_logdev; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index b65058d972f9..e9bbb1da2d0a 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1250,8 +1250,8 @@ got: inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); - ei->i_crtime = inode->i_mtime; + simple_inode_init_ts(inode); + ei->i_crtime = inode_get_mtime(inode); memset(ei->i_data, 0, sizeof(ei->i_data)); ei->i_dir_start_lookup = 0; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 012d9259ff53..9a84a5f9fef4 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1037,7 +1037,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, * happen is that the times are slightly out of date * and/or different from the directory change time. */ - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); ext4_update_dx_flag(dir); inode_inc_iversion(dir); return 1; @@ -1991,7 +1991,7 @@ out: ext4_orphan_del(handle, inode); if (err == 0) { - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); err = ext4_mark_inode_dirty(handle, inode); if (IS_SYNC(inode)) ext4_handle_sync(handle); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4ce35f1c8b0a..08cb5c0e0d51 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4020,7 +4020,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) if (IS_SYNC(inode)) ext4_handle_sync(handle); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret2)) ret = ret2; @@ -4180,7 +4180,7 @@ out_stop: if (inode->i_nlink) ext4_orphan_del(handle, inode); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); err2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(err2 && !err)) err = err2; @@ -4284,8 +4284,8 @@ static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); EXT4_INODE_SET_CTIME(inode, raw_inode); - EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); - EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); + EXT4_INODE_SET_MTIME(inode, raw_inode); + EXT4_INODE_SET_ATIME(inode, raw_inode); EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); @@ -4893,8 +4893,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, } EXT4_INODE_GET_CTIME(inode, raw_inode); - EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); - EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); + EXT4_INODE_GET_ATIME(inode, raw_inode); + EXT4_INODE_GET_MTIME(inode, raw_inode); EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { @@ -5019,8 +5019,8 @@ static void __ext4_update_other_inode_time(struct super_block *sb, spin_lock(&ei->i_raw_lock); EXT4_INODE_SET_CTIME(inode, raw_inode); - EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); - EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); + EXT4_INODE_SET_MTIME(inode, raw_inode); + EXT4_INODE_SET_ATIME(inode, raw_inode); ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); trace_ext4_other_inode_update_time(inode, orig_ino); @@ -5413,7 +5413,8 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, * update c/mtime in shrink case below */ if (!shrink) - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, + inode_set_ctime_current(inode)); if (shrink) ext4_fc_track_range(handle, inode, diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 0bfe2ce589e2..4f931f80cb34 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -312,13 +312,22 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) struct ext4_inode_info *ei1; struct ext4_inode_info *ei2; unsigned long tmp; + struct timespec64 ts1, ts2; ei1 = EXT4_I(inode1); ei2 = EXT4_I(inode2); swap(inode1->i_version, inode2->i_version); - swap(inode1->i_atime, inode2->i_atime); - swap(inode1->i_mtime, inode2->i_mtime); + + ts1 = inode_get_atime(inode1); + ts2 = inode_get_atime(inode2); + inode_set_atime_to_ts(inode1, ts2); + inode_set_atime_to_ts(inode2, ts1); + + ts1 = inode_get_mtime(inode1); + ts2 = inode_get_mtime(inode2); + inode_set_mtime_to_ts(inode1, ts2); + inode_set_mtime_to_ts(inode2, ts1); memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); tmp = ei1->i_flags & EXT4_FL_SHOULD_SWAP; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c91db9f57524..1e599305d85f 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -16,6 +16,7 @@ #include <linux/slab.h> #include <linux/nospec.h> #include <linux/backing-dev.h> +#include <linux/freezer.h> #include <trace/events/ext4.h> /* @@ -6906,6 +6907,21 @@ __acquires(bitlock) return ret; } +static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb, + ext4_group_t grp) +{ + if (grp < ext4_get_groups_count(sb)) + return EXT4_CLUSTERS_PER_GROUP(sb) - 1; + return (ext4_blocks_count(EXT4_SB(sb)->s_es) - + ext4_group_first_block_no(sb, grp) - 1) >> + EXT4_CLUSTER_BITS(sb); +} + +static bool ext4_trim_interrupted(void) +{ + return fatal_signal_pending(current) || freezing(current); +} + static int ext4_try_to_trim_range(struct super_block *sb, struct ext4_buddy *e4b, ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) @@ -6913,9 +6929,12 @@ __acquires(ext4_group_lock_ptr(sb, e4b->bd_group)) __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) { ext4_grpblk_t next, count, free_count; + bool set_trimmed = false; void *bitmap; bitmap = e4b->bd_bitmap; + if (start == 0 && max >= ext4_last_grp_cluster(sb, e4b->bd_group)) + set_trimmed = true; start = max(e4b->bd_info->bb_first_free, start); count = 0; free_count = 0; @@ -6930,16 +6949,14 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) int ret = ext4_trim_extent(sb, start, next - start, e4b); if (ret && ret != -EOPNOTSUPP) - break; + return count; count += next - start; } free_count += next - start; start = next + 1; - if (fatal_signal_pending(current)) { - count = -ERESTARTSYS; - break; - } + if (ext4_trim_interrupted()) + return count; if (need_resched()) { ext4_unlock_group(sb, e4b->bd_group); @@ -6951,6 +6968,9 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) break; } + if (set_trimmed) + EXT4_MB_GRP_SET_TRIMMED(e4b->bd_info); + return count; } @@ -6961,7 +6981,6 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) * @start: first group block to examine * @max: last group block to examine * @minblocks: minimum extent block count - * @set_trimmed: set the trimmed flag if at least one block is trimmed * * ext4_trim_all_free walks through group's block bitmap searching for free * extents. When the free extent is found, mark it as used in group buddy @@ -6971,7 +6990,7 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) static ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t max, - ext4_grpblk_t minblocks, bool set_trimmed) + ext4_grpblk_t minblocks) { struct ext4_buddy e4b; int ret; @@ -6988,13 +7007,10 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_lock_group(sb, group); if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) || - minblocks < EXT4_SB(sb)->s_last_trim_minblks) { + minblocks < EXT4_SB(sb)->s_last_trim_minblks) ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks); - if (ret >= 0 && set_trimmed) - EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); - } else { + else ret = 0; - } ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); @@ -7027,7 +7043,6 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) ext4_fsblk_t first_data_blk = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); - bool whole_group, eof = false; int ret = 0; start = range->start >> sb->s_blocksize_bits; @@ -7046,10 +7061,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) if (minlen > EXT4_CLUSTERS_PER_GROUP(sb)) goto out; } - if (end >= max_blks - 1) { + if (end >= max_blks - 1) end = max_blks - 1; - eof = true; - } if (end <= first_data_blk) goto out; if (start < first_data_blk) @@ -7063,9 +7076,10 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) /* end now represents the last cluster to discard in this group */ end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; - whole_group = true; for (group = first_group; group <= last_group; group++) { + if (ext4_trim_interrupted()) + break; grp = ext4_get_group_info(sb, group); if (!grp) continue; @@ -7082,13 +7096,11 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) * change it for the last group, note that last_cluster is * already computed earlier by ext4_get_group_no_and_offset() */ - if (group == last_group) { + if (group == last_group) end = last_cluster; - whole_group = eof ? true : end == EXT4_CLUSTERS_PER_GROUP(sb) - 1; - } if (grp->bb_free >= minlen) { cnt = ext4_trim_all_free(sb, group, first_cluster, - end, minlen, whole_group); + end, minlen); if (cnt < 0) { ret = cnt; break; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 41a6411c600b..057d74467293 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -343,17 +343,17 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode, struct buffer_head *bh) { struct ext4_dir_entry_tail *t; + int blocksize = EXT4_BLOCK_SIZE(inode->i_sb); #ifdef PARANOID struct ext4_dir_entry *d, *top; d = (struct ext4_dir_entry *)bh->b_data; top = (struct ext4_dir_entry *)(bh->b_data + - (EXT4_BLOCK_SIZE(inode->i_sb) - - sizeof(struct ext4_dir_entry_tail))); - while (d < top && d->rec_len) + (blocksize - sizeof(struct ext4_dir_entry_tail))); + while (d < top && ext4_rec_len_from_disk(d->rec_len, blocksize)) d = (struct ext4_dir_entry *)(((void *)d) + - le16_to_cpu(d->rec_len)); + ext4_rec_len_from_disk(d->rec_len, blocksize)); if (d != top) return NULL; @@ -364,7 +364,8 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode, #endif if (t->det_reserved_zero1 || - le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) || + (ext4_rec_len_from_disk(t->det_rec_len, blocksize) != + sizeof(struct ext4_dir_entry_tail)) || t->det_reserved_zero2 || t->det_reserved_ft != EXT4_FT_DIR_CSUM) return NULL; @@ -445,13 +446,14 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode, struct ext4_dir_entry *dp; struct dx_root_info *root; int count_offset; + int blocksize = EXT4_BLOCK_SIZE(inode->i_sb); + unsigned int rlen = ext4_rec_len_from_disk(dirent->rec_len, blocksize); - if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb)) + if (rlen == blocksize) count_offset = 8; - else if (le16_to_cpu(dirent->rec_len) == 12) { + else if (rlen == 12) { dp = (struct ext4_dir_entry *)(((void *)dirent) + 12); - if (le16_to_cpu(dp->rec_len) != - EXT4_BLOCK_SIZE(inode->i_sb) - 12) + if (ext4_rec_len_from_disk(dp->rec_len, blocksize) != blocksize - 12) return NULL; root = (struct dx_root_info *)(((void *)dp + 12)); if (root->reserved_zero || @@ -1315,6 +1317,7 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh, unsigned int buflen = bh->b_size; char *base = bh->b_data; struct dx_hash_info h = *hinfo; + int blocksize = EXT4_BLOCK_SIZE(dir->i_sb); if (ext4_has_metadata_csum(dir->i_sb)) buflen -= sizeof(struct ext4_dir_entry_tail); @@ -1335,11 +1338,12 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh, map_tail--; map_tail->hash = h.hash; map_tail->offs = ((char *) de - base)>>2; - map_tail->size = le16_to_cpu(de->rec_len); + map_tail->size = ext4_rec_len_from_disk(de->rec_len, + blocksize); count++; cond_resched(); } - de = ext4_next_entry(de, dir->i_sb->s_blocksize); + de = ext4_next_entry(de, blocksize); } return count; } @@ -2203,7 +2207,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, * happen is that the times are slightly out of date * and/or different from the directory change time. */ - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); ext4_update_dx_flag(dir); inode_inc_iversion(dir); err2 = ext4_mark_inode_dirty(handle, dir); @@ -3198,7 +3202,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) * recovery. */ inode->i_size = 0; ext4_orphan_add(handle, inode); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); inode_set_ctime_current(inode); retval = ext4_mark_inode_dirty(handle, inode); if (retval) @@ -3273,7 +3277,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name, retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto out_handle; - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); ext4_update_dx_flag(dir); retval = ext4_mark_inode_dirty(handle, dir); if (retval) @@ -3644,7 +3648,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent, if (ext4_has_feature_filetype(ent->dir->i_sb)) ent->de->file_type = file_type; inode_inc_iversion(ent->dir); - ent->dir->i_mtime = inode_set_ctime_current(ent->dir); + inode_set_mtime_to_ts(ent->dir, inode_set_ctime_current(ent->dir)); retval = ext4_mark_inode_dirty(handle, ent->dir); BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata"); if (!ent->inlined) { @@ -3959,7 +3963,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, ext4_dec_count(new.inode); inode_set_ctime_current(new.inode); } - old.dir->i_mtime = inode_set_ctime_current(old.dir); + inode_set_mtime_to_ts(old.dir, inode_set_ctime_current(old.dir)); ext4_update_dx_flag(old.dir); if (old.dir_bh) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 38217422f938..42a44990d99c 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1351,14 +1351,14 @@ static void ext4_put_super(struct super_block *sb) sync_blockdev(sb->s_bdev); invalidate_bdev(sb->s_bdev); - if (sbi->s_journal_bdev) { + if (sbi->s_journal_bdev_handle) { /* * Invalidate the journal device's buffers. We don't want them * floating about in memory - the physical journal device may * hotswapped, and it breaks the `ro-after' testing code. */ - sync_blockdev(sbi->s_journal_bdev); - invalidate_bdev(sbi->s_journal_bdev); + sync_blockdev(sbi->s_journal_bdev_handle->bdev); + invalidate_bdev(sbi->s_journal_bdev_handle->bdev); } ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); @@ -4233,7 +4233,7 @@ int ext4_calculate_overhead(struct super_block *sb) * Add the internal journal blocks whether the journal has been * loaded or not */ - if (sbi->s_journal && !sbi->s_journal_bdev) + if (sbi->s_journal && !sbi->s_journal_bdev_handle) overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_total_len); else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) { /* j_inum for internal journal is non-zero */ @@ -5670,9 +5670,9 @@ failed_mount: #endif fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); brelse(sbi->s_sbh); - if (sbi->s_journal_bdev) { - invalidate_bdev(sbi->s_journal_bdev); - blkdev_put(sbi->s_journal_bdev, sb); + if (sbi->s_journal_bdev_handle) { + invalidate_bdev(sbi->s_journal_bdev_handle->bdev); + bdev_release(sbi->s_journal_bdev_handle); } out_fail: invalidate_bdev(sb->s_bdev); @@ -5842,12 +5842,13 @@ static journal_t *ext4_open_inode_journal(struct super_block *sb, return journal; } -static struct block_device *ext4_get_journal_blkdev(struct super_block *sb, +static struct bdev_handle *ext4_get_journal_blkdev(struct super_block *sb, dev_t j_dev, ext4_fsblk_t *j_start, ext4_fsblk_t *j_len) { struct buffer_head *bh; struct block_device *bdev; + struct bdev_handle *bdev_handle; int hblock, blocksize; ext4_fsblk_t sb_block; unsigned long offset; @@ -5856,16 +5857,17 @@ static struct block_device *ext4_get_journal_blkdev(struct super_block *sb, /* see get_tree_bdev why this is needed and safe */ up_write(&sb->s_umount); - bdev = blkdev_get_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb, - &fs_holder_ops); + bdev_handle = bdev_open_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE, + sb, &fs_holder_ops); down_write(&sb->s_umount); - if (IS_ERR(bdev)) { + if (IS_ERR(bdev_handle)) { ext4_msg(sb, KERN_ERR, "failed to open journal device unknown-block(%u,%u) %ld", - MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev)); - return ERR_CAST(bdev); + MAJOR(j_dev), MINOR(j_dev), PTR_ERR(bdev_handle)); + return bdev_handle; } + bdev = bdev_handle->bdev; blocksize = sb->s_blocksize; hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { @@ -5912,12 +5914,12 @@ static struct block_device *ext4_get_journal_blkdev(struct super_block *sb, *j_start = sb_block + 1; *j_len = ext4_blocks_count(es); brelse(bh); - return bdev; + return bdev_handle; out_bh: brelse(bh); out_bdev: - blkdev_put(bdev, sb); + bdev_release(bdev_handle); return ERR_PTR(errno); } @@ -5927,14 +5929,14 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb, journal_t *journal; ext4_fsblk_t j_start; ext4_fsblk_t j_len; - struct block_device *journal_bdev; + struct bdev_handle *bdev_handle; int errno = 0; - journal_bdev = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len); - if (IS_ERR(journal_bdev)) - return ERR_CAST(journal_bdev); + bdev_handle = ext4_get_journal_blkdev(sb, j_dev, &j_start, &j_len); + if (IS_ERR(bdev_handle)) + return ERR_CAST(bdev_handle); - journal = jbd2_journal_init_dev(journal_bdev, sb->s_bdev, j_start, + journal = jbd2_journal_init_dev(bdev_handle->bdev, sb->s_bdev, j_start, j_len, sb->s_blocksize); if (IS_ERR(journal)) { ext4_msg(sb, KERN_ERR, "failed to create device journal"); @@ -5949,14 +5951,14 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb, goto out_journal; } journal->j_private = sb; - EXT4_SB(sb)->s_journal_bdev = journal_bdev; + EXT4_SB(sb)->s_journal_bdev_handle = bdev_handle; ext4_init_journal_params(sb, journal); return journal; out_journal: jbd2_journal_destroy(journal); out_bdev: - blkdev_put(journal_bdev, sb); + bdev_release(bdev_handle); return ERR_PTR(errno); } @@ -7127,7 +7129,7 @@ static int ext4_quota_off(struct super_block *sb, int type) } EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL); inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); err = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); out_unlock: @@ -7300,12 +7302,12 @@ static inline int ext3_feature_set_ok(struct super_block *sb) static void ext4_kill_sb(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); - struct block_device *journal_bdev = sbi ? sbi->s_journal_bdev : NULL; + struct bdev_handle *handle = sbi ? sbi->s_journal_bdev_handle : NULL; kill_block_super(sb); - if (journal_bdev) - blkdev_put(journal_bdev, sb); + if (handle) + bdev_release(handle); } static struct file_system_type ext4_fs_type = { @@ -7314,7 +7316,7 @@ static struct file_system_type ext4_fs_type = { .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, .kill_sb = ext4_kill_sb, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("ext4"); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 92ba28cebac6..82dc5e673d5c 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -98,7 +98,7 @@ static const struct xattr_handler * const ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_HURD] = &ext4_xattr_hurd_handler, }; -const struct xattr_handler *ext4_xattr_handlers[] = { +const struct xattr_handler * const ext4_xattr_handlers[] = { &ext4_xattr_user_handler, &ext4_xattr_trusted_handler, #ifdef CONFIG_EXT4_FS_SECURITY @@ -356,7 +356,7 @@ ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size) static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode) { - return ((u64) inode_get_ctime(ea_inode).tv_sec << 32) | + return ((u64) inode_get_ctime_sec(ea_inode) << 32) | (u32) inode_peek_iversion_raw(ea_inode); } @@ -368,12 +368,12 @@ static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count) static u32 ext4_xattr_inode_get_hash(struct inode *ea_inode) { - return (u32)ea_inode->i_atime.tv_sec; + return (u32) inode_get_atime_sec(ea_inode); } static void ext4_xattr_inode_set_hash(struct inode *ea_inode, u32 hash) { - ea_inode->i_atime.tv_sec = hash; + inode_set_atime(ea_inode, hash, 0); } /* @@ -418,7 +418,7 @@ free_bhs: return ret; } -#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec) +#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode_get_mtime_sec(inode))) static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, u32 ea_inode_hash, struct inode **ea_inode) diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 824faf0b15a8..bd97c4aa8177 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -193,7 +193,7 @@ extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle); extern void ext4_evict_ea_inode(struct inode *inode); -extern const struct xattr_handler *ext4_xattr_handlers[]; +extern const struct xattr_handler * const ext4_xattr_handlers[]; extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 8aa29fe2e87b..042593aed1ec 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -455,7 +455,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, de->file_type = fs_umode_to_ftype(inode->i_mode); set_page_dirty(page); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); f2fs_mark_inode_dirty_sync(dir, false); f2fs_put_page(page, 1); } @@ -609,7 +609,7 @@ void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode, f2fs_i_links_write(dir, true); clear_inode_flag(inode, FI_NEW_INODE); } - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); f2fs_mark_inode_dirty_sync(dir, false); if (F2FS_I(dir)->i_current_depth != current_depth) @@ -919,7 +919,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, } f2fs_put_page(page, 1); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); f2fs_mark_inode_dirty_sync(dir, false); if (inode) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6d688e42d89c..9043cedfa12b 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1234,6 +1234,7 @@ struct f2fs_bio_info { #define FDEV(i) (sbi->devs[i]) #define RDEV(i) (raw_super->devs[i]) struct f2fs_dev_info { + struct bdev_handle *bdev_handle; struct block_device *bdev; char path[MAX_PATH_LEN]; unsigned int total_segments; @@ -3317,13 +3318,15 @@ static inline void clear_file(struct inode *inode, int type) static inline bool f2fs_is_time_consistent(struct inode *inode) { - struct timespec64 ctime = inode_get_ctime(inode); + struct timespec64 ts = inode_get_atime(inode); - if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime)) + if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &ts)) return false; - if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &ctime)) + ts = inode_get_ctime(inode); + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &ts)) return false; - if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime)) + ts = inode_get_mtime(inode); + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &ts)) return false; return true; } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ca5904129b16..dd99abbb7186 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -798,7 +798,7 @@ int f2fs_truncate(struct inode *inode) if (err) return err; - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); f2fs_mark_inode_dirty_sync(inode, false); return 0; } @@ -905,9 +905,9 @@ static void __setattr_copy(struct mnt_idmap *idmap, i_uid_update(idmap, attr, inode); i_gid_update(idmap, attr, inode); if (ia_valid & ATTR_ATIME) - inode->i_atime = attr->ia_atime; + inode_set_atime_to_ts(inode, attr->ia_atime); if (ia_valid & ATTR_MTIME) - inode->i_mtime = attr->ia_mtime; + inode_set_mtime_to_ts(inode, attr->ia_mtime); if (ia_valid & ATTR_CTIME) inode_set_ctime_to_ts(inode, attr->ia_ctime); if (ia_valid & ATTR_MODE) { @@ -1012,7 +1012,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, return err; spin_lock(&F2FS_I(inode)->i_size_lock); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); F2FS_I(inode)->last_disk_size = i_size_read(inode); spin_unlock(&F2FS_I(inode)->i_size_lock); } @@ -1840,7 +1840,7 @@ static long f2fs_fallocate(struct file *file, int mode, } if (!ret) { - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); f2fs_mark_inode_dirty_sync(inode, false); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } @@ -2888,10 +2888,10 @@ out_src: if (ret) goto out_unlock; - src->i_mtime = inode_set_ctime_current(src); + inode_set_mtime_to_ts(src, inode_set_ctime_current(src)); f2fs_mark_inode_dirty_sync(src, false); if (src != dst) { - dst->i_mtime = inode_set_ctime_current(dst); + inode_set_mtime_to_ts(dst, inode_set_ctime_current(dst)); f2fs_mark_inode_dirty_sync(dst, false); } f2fs_update_time(sbi, REQ_TIME); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 2fe25619ccb5..ac00423f117b 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -699,7 +699,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, set_page_dirty(page); f2fs_put_page(page, 1); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); f2fs_mark_inode_dirty_sync(dir, false); if (inode) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index cde243840abd..5779c7edd49b 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -386,9 +386,9 @@ static void init_idisk_time(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); - fi->i_disk_time[0] = inode->i_atime; + fi->i_disk_time[0] = inode_get_atime(inode); fi->i_disk_time[1] = inode_get_ctime(inode); - fi->i_disk_time[2] = inode->i_mtime; + fi->i_disk_time[2] = inode_get_mtime(inode); } static int do_read_inode(struct inode *inode) @@ -417,12 +417,12 @@ static int do_read_inode(struct inode *inode) inode->i_size = le64_to_cpu(ri->i_size); inode->i_blocks = SECTOR_FROM_BLOCK(le64_to_cpu(ri->i_blocks) - 1); - inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime); + inode_set_atime(inode, le64_to_cpu(ri->i_atime), + le32_to_cpu(ri->i_atime_nsec)); inode_set_ctime(inode, le64_to_cpu(ri->i_ctime), le32_to_cpu(ri->i_ctime_nsec)); - inode->i_mtime.tv_sec = le64_to_cpu(ri->i_mtime); - inode->i_atime.tv_nsec = le32_to_cpu(ri->i_atime_nsec); - inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); + inode_set_mtime(inode, le64_to_cpu(ri->i_mtime), + le32_to_cpu(ri->i_mtime_nsec)); inode->i_generation = le32_to_cpu(ri->i_generation); if (S_ISDIR(inode->i_mode)) fi->i_current_depth = le32_to_cpu(ri->i_current_depth); @@ -698,12 +698,12 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) } set_raw_inline(inode, ri); - ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); - ri->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec); - ri->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); - ri->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); - ri->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); - ri->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + ri->i_atime = cpu_to_le64(inode_get_atime_sec(inode)); + ri->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); + ri->i_mtime = cpu_to_le64(inode_get_mtime_sec(inode)); + ri->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode)); + ri->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); + ri->i_mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode)); if (S_ISDIR(inode->i_mode)) ri->i_current_depth = cpu_to_le32(F2FS_I(inode)->i_current_depth); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 193b22a2d6bf..d0053b0284d8 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -243,8 +243,8 @@ static struct inode *f2fs_new_inode(struct mnt_idmap *idmap, inode->i_ino = ino; inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); - F2FS_I(inode)->i_crtime = inode->i_mtime; + simple_inode_init_ts(inode); + F2FS_I(inode)->i_crtime = inode_get_mtime(inode); inode->i_generation = get_random_u32(); if (S_ISDIR(inode->i_mode)) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 7be60df277a5..b56d0f1078a7 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -320,12 +320,12 @@ static int recover_inode(struct inode *inode, struct page *page) } f2fs_i_size_write(inode, le64_to_cpu(raw->i_size)); - inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime); + inode_set_atime(inode, le64_to_cpu(raw->i_atime), + le32_to_cpu(raw->i_atime_nsec)); inode_set_ctime(inode, le64_to_cpu(raw->i_ctime), le32_to_cpu(raw->i_ctime_nsec)); - inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime); - inode->i_atime.tv_nsec = le32_to_cpu(raw->i_atime_nsec); - inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + inode_set_mtime(inode, le64_to_cpu(raw->i_mtime), + le32_to_cpu(raw->i_mtime_nsec)); F2FS_I(inode)->i_advise = raw->i_advise; F2FS_I(inode)->i_flags = le32_to_cpu(raw->i_flags); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a8c8232852bb..be17d77513d5 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1562,7 +1562,7 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) for (i = 0; i < sbi->s_ndevs; i++) { if (i > 0) - blkdev_put(FDEV(i).bdev, sbi->sb); + bdev_release(FDEV(i).bdev_handle); #ifdef CONFIG_BLK_DEV_ZONED kvfree(FDEV(i).blkz_seq); #endif @@ -2710,7 +2710,7 @@ retry: if (len == towrite) return err; - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); f2fs_mark_inode_dirty_sync(inode, false); return len - towrite; } @@ -3203,13 +3203,6 @@ static bool f2fs_has_stable_inodes(struct super_block *sb) return true; } -static void f2fs_get_ino_and_lblk_bits(struct super_block *sb, - int *ino_bits_ret, int *lblk_bits_ret) -{ - *ino_bits_ret = 8 * sizeof(nid_t); - *lblk_bits_ret = 8 * sizeof(block_t); -} - static struct block_device **f2fs_get_devices(struct super_block *sb, unsigned int *num_devs) { @@ -3231,13 +3224,15 @@ static struct block_device **f2fs_get_devices(struct super_block *sb, } static const struct fscrypt_operations f2fs_cryptops = { - .key_prefix = "f2fs:", + .needs_bounce_pages = 1, + .has_32bit_inodes = 1, + .supports_subblock_data_units = 1, + .legacy_key_prefix = "f2fs:", .get_context = f2fs_get_context, .set_context = f2fs_set_context, .get_dummy_policy = f2fs_get_dummy_policy, .empty_dir = f2fs_empty_dir, .has_stable_inodes = f2fs_has_stable_inodes, - .get_ino_and_lblk_bits = f2fs_get_ino_and_lblk_bits, .get_devices = f2fs_get_devices, }; #endif @@ -4198,7 +4193,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) for (i = 0; i < max_devices; i++) { if (i == 0) - FDEV(0).bdev = sbi->sb->s_bdev; + FDEV(0).bdev_handle = sbi->sb->s_bdev_handle; else if (!RDEV(i).path[0]) break; @@ -4218,13 +4213,14 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) FDEV(i).end_blk = FDEV(i).start_blk + (FDEV(i).total_segments << sbi->log_blocks_per_seg) - 1; - FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, - mode, sbi->sb, NULL); + FDEV(i).bdev_handle = bdev_open_by_path( + FDEV(i).path, mode, sbi->sb, NULL); } } - if (IS_ERR(FDEV(i).bdev)) - return PTR_ERR(FDEV(i).bdev); + if (IS_ERR(FDEV(i).bdev_handle)) + return PTR_ERR(FDEV(i).bdev_handle); + FDEV(i).bdev = FDEV(i).bdev_handle->bdev; /* to release errored devices */ sbi->s_ndevs = i + 1; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index a657284faee3..4314456854f6 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -189,7 +189,7 @@ const struct xattr_handler f2fs_xattr_security_handler = { .set = f2fs_xattr_generic_set, }; -static const struct xattr_handler *f2fs_xattr_handler_map[] = { +static const struct xattr_handler * const f2fs_xattr_handler_map[] = { [F2FS_XATTR_INDEX_USER] = &f2fs_xattr_user_handler, #ifdef CONFIG_F2FS_FS_POSIX_ACL [F2FS_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, @@ -202,7 +202,7 @@ static const struct xattr_handler *f2fs_xattr_handler_map[] = { [F2FS_XATTR_INDEX_ADVISE] = &f2fs_xattr_advise_handler, }; -const struct xattr_handler *f2fs_xattr_handlers[] = { +const struct xattr_handler * const f2fs_xattr_handlers[] = { &f2fs_xattr_user_handler, &f2fs_xattr_trusted_handler, #ifdef CONFIG_F2FS_FS_SECURITY diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index b1811c392e6f..a005ffdcf717 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -125,7 +125,7 @@ extern const struct xattr_handler f2fs_xattr_trusted_handler; extern const struct xattr_handler f2fs_xattr_advise_handler; extern const struct xattr_handler f2fs_xattr_security_handler; -extern const struct xattr_handler *f2fs_xattr_handlers[]; +extern const struct xattr_handler * const f2fs_xattr_handlers[]; extern int f2fs_setxattr(struct inode *, int, const char *, const void *, size_t, struct page *, int); diff --git a/fs/fat/inode.c b/fs/fat/inode.c index cdd39b6020f3..1fac3dabf130 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -512,6 +512,7 @@ static int fat_validate_dir(struct inode *dir) int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + struct timespec64 mtime; int error; MSDOS_I(inode)->i_pos = 0; @@ -561,14 +562,18 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) inode->i_blocks = ((inode->i_size + (sbi->cluster_size - 1)) & ~((loff_t)sbi->cluster_size - 1)) >> 9; - fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0); - inode_set_ctime_to_ts(inode, inode->i_mtime); + fat_time_fat2unix(sbi, &mtime, de->time, de->date, 0); + inode_set_mtime_to_ts(inode, mtime); + inode_set_ctime_to_ts(inode, mtime); if (sbi->options.isvfat) { - fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0); + struct timespec64 atime; + + fat_time_fat2unix(sbi, &atime, 0, de->adate, 0); + inode_set_atime_to_ts(inode, atime); fat_time_fat2unix(sbi, &MSDOS_I(inode)->i_crtime, de->ctime, de->cdate, de->ctime_cs); } else - inode->i_atime = fat_truncate_atime(sbi, &inode->i_mtime); + inode_set_atime_to_ts(inode, fat_truncate_atime(sbi, &mtime)); return 0; } @@ -849,6 +854,7 @@ static int __fat_write_inode(struct inode *inode, int wait) struct msdos_sb_info *sbi = MSDOS_SB(sb); struct buffer_head *bh; struct msdos_dir_entry *raw_entry; + struct timespec64 mtime; loff_t i_pos; sector_t blocknr; int err, offset; @@ -882,12 +888,14 @@ retry: raw_entry->size = cpu_to_le32(inode->i_size); raw_entry->attr = fat_make_attrs(inode); fat_set_start(raw_entry, MSDOS_I(inode)->i_logstart); - fat_time_unix2fat(sbi, &inode->i_mtime, &raw_entry->time, + mtime = inode_get_mtime(inode); + fat_time_unix2fat(sbi, &mtime, &raw_entry->time, &raw_entry->date, NULL); if (sbi->options.isvfat) { + struct timespec64 ts = inode_get_atime(inode); __le16 atime; - fat_time_unix2fat(sbi, &inode->i_atime, &atime, - &raw_entry->adate, NULL); + + fat_time_unix2fat(sbi, &ts, &atime, &raw_entry->adate, NULL); fat_time_unix2fat(sbi, &MSDOS_I(inode)->i_crtime, &raw_entry->ctime, &raw_entry->cdate, &raw_entry->ctime_cs); } @@ -1407,7 +1415,8 @@ static int fat_read_root(struct inode *inode) MSDOS_I(inode)->mmu_private = inode->i_size; fat_save_attrs(inode, ATTR_DIR); - inode->i_mtime = inode->i_atime = inode_set_ctime(inode, 0, 0); + inode_set_mtime_to_ts(inode, + inode_set_atime_to_ts(inode, inode_set_ctime(inode, 0, 0))); set_nlink(inode, fat_subdirs(inode)+2); return 0; diff --git a/fs/fat/misc.c b/fs/fat/misc.c index f2304a1054aa..c7a2d27120ba 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -325,15 +325,15 @@ int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags) } if (flags & S_ATIME) - inode->i_atime = fat_truncate_atime(sbi, now); + inode_set_atime_to_ts(inode, fat_truncate_atime(sbi, now)); /* * ctime and mtime share the same on-disk field, and should be * identical in memory. all mtime updates will be applied to ctime, * but ctime updates are ignored. */ if (flags & S_MTIME) - inode->i_mtime = inode_set_ctime_to_ts(inode, - fat_truncate_mtime(sbi, now)); + inode_set_mtime_to_ts(inode, + inode_set_ctime_to_ts(inode, fat_truncate_mtime(sbi, now))); return 0; } diff --git a/fs/file.c b/fs/file.c index 3e4a4dfa38fc..5fb0b146e79e 100644 --- a/fs/file.c +++ b/fs/file.c @@ -604,6 +604,9 @@ void fd_install(unsigned int fd, struct file *file) struct files_struct *files = current->files; struct fdtable *fdt; + if (WARN_ON_ONCE(unlikely(file->f_mode & FMODE_BACKING))) + return; + rcu_read_lock_sched(); if (unlikely(files->resize_in_progress)) { @@ -853,8 +856,104 @@ void do_close_on_exec(struct files_struct *files) spin_unlock(&files->file_lock); } +static struct file *__get_file_rcu(struct file __rcu **f) +{ + struct file __rcu *file; + struct file __rcu *file_reloaded; + struct file __rcu *file_reloaded_cmp; + + file = rcu_dereference_raw(*f); + if (!file) + return NULL; + + if (unlikely(!atomic_long_inc_not_zero(&file->f_count))) + return ERR_PTR(-EAGAIN); + + file_reloaded = rcu_dereference_raw(*f); + + /* + * Ensure that all accesses have a dependency on the load from + * rcu_dereference_raw() above so we get correct ordering + * between reuse/allocation and the pointer check below. + */ + file_reloaded_cmp = file_reloaded; + OPTIMIZER_HIDE_VAR(file_reloaded_cmp); + + /* + * atomic_long_inc_not_zero() above provided a full memory + * barrier when we acquired a reference. + * + * This is paired with the write barrier from assigning to the + * __rcu protected file pointer so that if that pointer still + * matches the current file, we know we have successfully + * acquired a reference to the right file. + * + * If the pointers don't match the file has been reallocated by + * SLAB_TYPESAFE_BY_RCU. + */ + if (file == file_reloaded_cmp) + return file_reloaded; + + fput(file); + return ERR_PTR(-EAGAIN); +} + +/** + * get_file_rcu - try go get a reference to a file under rcu + * @f: the file to get a reference on + * + * This function tries to get a reference on @f carefully verifying that + * @f hasn't been reused. + * + * This function should rarely have to be used and only by users who + * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. + * + * Return: Returns @f with the reference count increased or NULL. + */ +struct file *get_file_rcu(struct file __rcu **f) +{ + for (;;) { + struct file __rcu *file; + + file = __get_file_rcu(f); + if (unlikely(!file)) + return NULL; + + if (unlikely(IS_ERR(file))) + continue; + + return file; + } +} +EXPORT_SYMBOL_GPL(get_file_rcu); + +/** + * get_file_active - try go get a reference to a file + * @f: the file to get a reference on + * + * In contast to get_file_rcu() the pointer itself isn't part of the + * reference counting. + * + * This function should rarely have to be used and only by users who + * understand the implications of SLAB_TYPESAFE_BY_RCU. Try to avoid it. + * + * Return: Returns @f with the reference count increased or NULL. + */ +struct file *get_file_active(struct file **f) +{ + struct file __rcu *file; + + rcu_read_lock(); + file = __get_file_rcu(f); + rcu_read_unlock(); + if (IS_ERR(file)) + file = NULL; + return file; +} +EXPORT_SYMBOL_GPL(get_file_active); + static inline struct file *__fget_files_rcu(struct files_struct *files, - unsigned int fd, fmode_t mask) + unsigned int fd, fmode_t mask) { for (;;) { struct file *file; @@ -865,12 +964,6 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, return NULL; fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds); - file = rcu_dereference_raw(*fdentry); - if (unlikely(!file)) - return NULL; - - if (unlikely(file->f_mode & mask)) - return NULL; /* * Ok, we have a file pointer. However, because we do @@ -879,10 +972,15 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, * * Such a race can take two forms: * - * (a) the file ref already went down to zero, - * and get_file_rcu() fails. Just try again: + * (a) the file ref already went down to zero and the + * file hasn't been reused yet or the file count + * isn't zero but the file has already been reused. */ - if (unlikely(!get_file_rcu(file))) + file = __get_file_rcu(fdentry); + if (unlikely(!file)) + return NULL; + + if (unlikely(IS_ERR(file))) continue; /* @@ -893,13 +991,21 @@ static inline struct file *__fget_files_rcu(struct files_struct *files, * * If so, we need to put our ref and try again. */ - if (unlikely(rcu_dereference_raw(files->fdt) != fdt) || - unlikely(rcu_dereference_raw(*fdentry) != file)) { + if (unlikely(rcu_dereference_raw(files->fdt) != fdt)) { fput(file); continue; } /* + * This isn't the file we're looking for or we're not + * allowed to get a reference to it. + */ + if (unlikely(file->f_mode & mask)) { + fput(file); + return NULL; + } + + /* * Ok, we have a ref to the file, and checked that it * still exists. */ @@ -948,7 +1054,14 @@ struct file *fget_task(struct task_struct *task, unsigned int fd) return file; } -struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd) +struct file *lookup_fdget_rcu(unsigned int fd) +{ + return __fget_files_rcu(current->files, fd, 0); + +} +EXPORT_SYMBOL_GPL(lookup_fdget_rcu); + +struct file *task_lookup_fdget_rcu(struct task_struct *task, unsigned int fd) { /* Must be called with rcu_read_lock held */ struct files_struct *files; @@ -957,13 +1070,13 @@ struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd) task_lock(task); files = task->files; if (files) - file = files_lookup_fd_rcu(files, fd); + file = __fget_files_rcu(files, fd, 0); task_unlock(task); return file; } -struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd) +struct file *task_lookup_next_fdget_rcu(struct task_struct *task, unsigned int *ret_fd) { /* Must be called with rcu_read_lock held */ struct files_struct *files; @@ -974,7 +1087,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret files = task->files; if (files) { for (; fd < files_fdtable(files)->max_fds; fd++) { - file = files_lookup_fd_rcu(files, fd); + file = __fget_files_rcu(files, fd, 0); if (file) break; } @@ -983,7 +1096,7 @@ struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret *ret_fd = fd; return file; } -EXPORT_SYMBOL(task_lookup_next_fd_rcu); +EXPORT_SYMBOL(task_lookup_next_fdget_rcu); /* * Lightweight file lookup - no refcnt increment if fd table isn't shared. @@ -1272,12 +1385,16 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) { if (unlikely(newfd == oldfd)) { /* corner case */ struct files_struct *files = current->files; + struct file *f; int retval = oldfd; rcu_read_lock(); - if (!files_lookup_fd_rcu(files, oldfd)) + f = __fget_files_rcu(files, oldfd, 0); + if (!f) retval = -EBADF; rcu_read_unlock(); + if (f) + fput(f); return retval; } return ksys_dup3(oldfd, newfd, 0); diff --git a/fs/file_table.c b/fs/file_table.c index ee21b3da9d08..fa92743ba6a9 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -44,10 +44,10 @@ static struct kmem_cache *filp_cachep __read_mostly; static struct percpu_counter nr_files __cacheline_aligned_in_smp; -/* Container for backing file with optional real path */ +/* Container for backing file with optional user path */ struct backing_file { struct file file; - struct path real_path; + struct path user_path; }; static inline struct backing_file *backing_file(struct file *f) @@ -55,31 +55,36 @@ static inline struct backing_file *backing_file(struct file *f) return container_of(f, struct backing_file, file); } -struct path *backing_file_real_path(struct file *f) +struct path *backing_file_user_path(struct file *f) { - return &backing_file(f)->real_path; + return &backing_file(f)->user_path; } -EXPORT_SYMBOL_GPL(backing_file_real_path); +EXPORT_SYMBOL_GPL(backing_file_user_path); -static void file_free_rcu(struct rcu_head *head) +static inline void file_free(struct file *f) { - struct file *f = container_of(head, struct file, f_rcuhead); - + security_file_free(f); + if (likely(!(f->f_mode & FMODE_NOACCOUNT))) + percpu_counter_dec(&nr_files); put_cred(f->f_cred); - if (unlikely(f->f_mode & FMODE_BACKING)) + if (unlikely(f->f_mode & FMODE_BACKING)) { + path_put(backing_file_user_path(f)); kfree(backing_file(f)); - else + } else { kmem_cache_free(filp_cachep, f); + } } -static inline void file_free(struct file *f) +void release_empty_file(struct file *f) { - security_file_free(f); - if (unlikely(f->f_mode & FMODE_BACKING)) - path_put(backing_file_real_path(f)); - if (likely(!(f->f_mode & FMODE_NOACCOUNT))) - percpu_counter_dec(&nr_files); - call_rcu(&f->f_rcuhead, file_free_rcu); + WARN_ON_ONCE(f->f_mode & (FMODE_BACKING | FMODE_OPENED)); + if (atomic_long_dec_and_test(&f->f_count)) { + security_file_free(f); + put_cred(f->f_cred); + if (likely(!(f->f_mode & FMODE_NOACCOUNT))) + percpu_counter_dec(&nr_files); + kmem_cache_free(filp_cachep, f); + } } /* @@ -164,7 +169,6 @@ static int init_file(struct file *f, int flags, const struct cred *cred) return error; } - atomic_long_set(&f->f_count, 1); rwlock_init(&f->f_owner.lock); spin_lock_init(&f->f_lock); mutex_init(&f->f_pos_lock); @@ -172,6 +176,12 @@ static int init_file(struct file *f, int flags, const struct cred *cred) f->f_mode = OPEN_FMODE(flags); /* f->f_version: 0 */ + /* + * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While + * fget-rcu pattern users need to be able to handle spurious + * refcount bumps we should reinitialize the reused file first. + */ + atomic_long_set(&f->f_count, 1); return 0; } @@ -471,7 +481,8 @@ EXPORT_SYMBOL(__fput_sync); void __init files_init(void) { filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, - SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); + SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN | + SLAB_PANIC | SLAB_ACCOUNT, NULL); percpu_counter_init(&nr_files, 0, GFP_KERNEL); } diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index ac5d43b164b5..20600e9ea202 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -109,11 +109,9 @@ static inline void dip2vip_cpy(struct vxfs_sb_info *sbi, set_nlink(inode, vip->vii_nlink); inode->i_size = vip->vii_size; - inode->i_atime.tv_sec = vip->vii_atime; + inode_set_atime(inode, vip->vii_atime, 0); inode_set_ctime(inode, vip->vii_ctime, 0); - inode->i_mtime.tv_sec = vip->vii_mtime; - inode->i_atime.tv_nsec = 0; - inode->i_mtime.tv_nsec = 0; + inode_set_mtime(inode, vip->vii_mtime, 0); inode->i_blocks = vip->vii_blocks; inode->i_generation = vip->vii_gen; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 969ce991b0b0..1767493dffda 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -613,6 +613,24 @@ out_free: kfree(isw); } +static bool isw_prepare_wbs_switch(struct inode_switch_wbs_context *isw, + struct list_head *list, int *nr) +{ + struct inode *inode; + + list_for_each_entry(inode, list, i_io_list) { + if (!inode_prepare_wbs_switch(inode, isw->new_wb)) + continue; + + isw->inodes[*nr] = inode; + (*nr)++; + + if (*nr >= WB_MAX_INODES_PER_ISW - 1) + return true; + } + return false; +} + /** * cleanup_offline_cgwb - detach associated inodes * @wb: target wb @@ -625,7 +643,6 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) { struct cgroup_subsys_state *memcg_css; struct inode_switch_wbs_context *isw; - struct inode *inode; int nr; bool restart = false; @@ -647,17 +664,17 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) nr = 0; spin_lock(&wb->list_lock); - list_for_each_entry(inode, &wb->b_attached, i_io_list) { - if (!inode_prepare_wbs_switch(inode, isw->new_wb)) - continue; - - isw->inodes[nr++] = inode; - - if (nr >= WB_MAX_INODES_PER_ISW - 1) { - restart = true; - break; - } - } + /* + * In addition to the inodes that have completed writeback, also switch + * cgwbs for those inodes only with dirty timestamps. Otherwise, those + * inodes won't be written back for a long time when lazytime is + * enabled, and thus pinning the dying cgwbs. It won't break the + * bandwidth restrictions, as writeback of inode metadata is not + * accounted for. + */ + restart = isw_prepare_wbs_switch(isw, &wb->b_attached, &nr); + if (!restart) + restart = isw_prepare_wbs_switch(isw, &wb->b_dirty_time, &nr); spin_unlock(&wb->list_lock); /* no attached inodes? bail out */ @@ -1535,10 +1552,15 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, if (wbc->pages_skipped) { /* - * writeback is not making progress due to locked - * buffers. Skip this inode for now. + * Writeback is not making progress due to locked buffers. + * Skip this inode for now. Although having skipped pages + * is odd for clean inodes, it can happen for some + * filesystems so handle that gracefully. */ - redirty_tail_locked(inode, wb); + if (inode->i_state & I_DIRTY_ALL) + redirty_tail_locked(inode, wb); + else + inode_cgwb_move_to_attached(inode, wb); return; } diff --git a/fs/fs_context.c b/fs/fs_context.c index a0ad7a0c4680..98589aae5208 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -192,17 +192,19 @@ int vfs_parse_fs_string(struct fs_context *fc, const char *key, EXPORT_SYMBOL(vfs_parse_fs_string); /** - * generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data + * vfs_parse_monolithic_sep - Parse key[=val][,key[=val]]* mount data * @fc: The superblock configuration to fill in. * @data: The data to parse + * @sep: callback for separating next option * - * Parse a blob of data that's in key[=val][,key[=val]]* form. This can be - * called from the ->monolithic_mount_data() fs_context operation. + * Parse a blob of data that's in key[=val][,key[=val]]* form with a custom + * option separator callback. * * Returns 0 on success or the error returned by the ->parse_option() fs_context * operation on failure. */ -int generic_parse_monolithic(struct fs_context *fc, void *data) +int vfs_parse_monolithic_sep(struct fs_context *fc, void *data, + char *(*sep)(char **)) { char *options = data, *key; int ret = 0; @@ -214,7 +216,7 @@ int generic_parse_monolithic(struct fs_context *fc, void *data) if (ret) return ret; - while ((key = strsep(&options, ",")) != NULL) { + while ((key = sep(&options)) != NULL) { if (*key) { size_t v_len = 0; char *value = strchr(key, '='); @@ -233,6 +235,28 @@ int generic_parse_monolithic(struct fs_context *fc, void *data) return ret; } +EXPORT_SYMBOL(vfs_parse_monolithic_sep); + +static char *vfs_parse_comma_sep(char **s) +{ + return strsep(s, ","); +} + +/** + * generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data + * @fc: The superblock configuration to fill in. + * @data: The data to parse + * + * Parse a blob of data that's in key[=val][,key[=val]]* form. This can be + * called from the ->monolithic_mount_data() fs_context operation. + * + * Returns 0 on success or the error returned by the ->parse_option() fs_context + * operation on failure. + */ +int generic_parse_monolithic(struct fs_context *fc, void *data) +{ + return vfs_parse_monolithic_sep(fc, data, vfs_parse_comma_sep); +} EXPORT_SYMBOL(generic_parse_monolithic); /** diff --git a/fs/fsopen.c b/fs/fsopen.c index ce03f6521c88..6593ae518115 100644 --- a/fs/fsopen.c +++ b/fs/fsopen.c @@ -465,6 +465,7 @@ SYSCALL_DEFINE5(fsconfig, param.file = fget(aux); if (!param.file) goto out_key; + param.dirfd = aux; break; default: break; diff --git a/fs/fuse/control.c b/fs/fuse/control.c index ab62e4624256..284a35006462 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -235,7 +235,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, inode->i_mode = mode; inode->i_uid = fc->user_id; inode->i_gid = fc->group_id; - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); /* setting ->i_op to NULL is not allowed */ if (iop) inode->i_op = iop; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index d707e6987da9..d19cbf34c634 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1812,12 +1812,12 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff) memset(&outarg, 0, sizeof(outarg)); inarg.valid = FATTR_MTIME; - inarg.mtime = inode->i_mtime.tv_sec; - inarg.mtimensec = inode->i_mtime.tv_nsec; + inarg.mtime = inode_get_mtime_sec(inode); + inarg.mtimensec = inode_get_mtime_nsec(inode); if (fm->fc->minor >= 23) { inarg.valid |= FATTR_CTIME; - inarg.ctime = inode_get_ctime(inode).tv_sec; - inarg.ctimensec = inode_get_ctime(inode).tv_nsec; + inarg.ctime = inode_get_ctime_sec(inode); + inarg.ctimensec = inode_get_ctime_nsec(inode); } if (ff) { inarg.valid |= FATTR_FH; @@ -1956,7 +1956,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, /* the kernel maintains i_mtime locally */ if (trust_local_cmtime) { if (attr->ia_valid & ATTR_MTIME) - inode->i_mtime = attr->ia_mtime; + inode_set_mtime_to_ts(inode, attr->ia_mtime); if (attr->ia_valid & ATTR_CTIME) inode_set_ctime_to_ts(inode, attr->ia_ctime); /* FIXME: clear I_DIRTY_SYNC? */ diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index bf0b85d0b95c..6e6e721f421b 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1284,7 +1284,7 @@ ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, size_t size); ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size); int fuse_removexattr(struct inode *inode, const char *name); -extern const struct xattr_handler *fuse_xattr_handlers[]; +extern const struct xattr_handler * const fuse_xattr_handlers[]; struct posix_acl; struct posix_acl *fuse_get_inode_acl(struct inode *inode, int type, bool rcu); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2e4eb7cf26fb..caa8121ad99c 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -188,12 +188,10 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, attr->mtimensec = min_t(u32, attr->mtimensec, NSEC_PER_SEC - 1); attr->ctimensec = min_t(u32, attr->ctimensec, NSEC_PER_SEC - 1); - inode->i_atime.tv_sec = attr->atime; - inode->i_atime.tv_nsec = attr->atimensec; + inode_set_atime(inode, attr->atime, attr->atimensec); /* mtime from server may be stale due to local buffered write */ if (!(cache_mask & STATX_MTIME)) { - inode->i_mtime.tv_sec = attr->mtime; - inode->i_mtime.tv_nsec = attr->mtimensec; + inode_set_mtime(inode, attr->mtime, attr->mtimensec); } if (!(cache_mask & STATX_CTIME)) { inode_set_ctime(inode, attr->ctime, attr->ctimensec); @@ -276,12 +274,12 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, attr->size = i_size_read(inode); if (cache_mask & STATX_MTIME) { - attr->mtime = inode->i_mtime.tv_sec; - attr->mtimensec = inode->i_mtime.tv_nsec; + attr->mtime = inode_get_mtime_sec(inode); + attr->mtimensec = inode_get_mtime_nsec(inode); } if (cache_mask & STATX_CTIME) { - attr->ctime = inode_get_ctime(inode).tv_sec; - attr->ctimensec = inode_get_ctime(inode).tv_nsec; + attr->ctime = inode_get_ctime_sec(inode); + attr->ctimensec = inode_get_ctime_nsec(inode); } if ((attr_version != 0 && fi->attr_version > attr_version) || @@ -290,7 +288,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, return; } - old_mtime = inode->i_mtime; + old_mtime = inode_get_mtime(inode); fuse_change_attributes_common(inode, attr, sx, attr_valid, cache_mask); oldsize = inode->i_size; @@ -337,8 +335,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr, { inode->i_mode = attr->mode & S_IFMT; inode->i_size = attr->size; - inode->i_mtime.tv_sec = attr->mtime; - inode->i_mtime.tv_nsec = attr->mtimensec; + inode_set_mtime(inode, attr->mtime, attr->mtimensec); inode_set_ctime(inode, attr->ctime, attr->ctimensec); if (S_ISREG(inode->i_mode)) { fuse_init_common(inode); @@ -1423,17 +1420,19 @@ EXPORT_SYMBOL_GPL(fuse_dev_free); static void fuse_fill_attr_from_inode(struct fuse_attr *attr, const struct fuse_inode *fi) { + struct timespec64 atime = inode_get_atime(&fi->inode); + struct timespec64 mtime = inode_get_mtime(&fi->inode); struct timespec64 ctime = inode_get_ctime(&fi->inode); *attr = (struct fuse_attr){ .ino = fi->inode.i_ino, .size = fi->inode.i_size, .blocks = fi->inode.i_blocks, - .atime = fi->inode.i_atime.tv_sec, - .mtime = fi->inode.i_mtime.tv_sec, + .atime = atime.tv_sec, + .mtime = mtime.tv_sec, .ctime = ctime.tv_sec, - .atimensec = fi->inode.i_atime.tv_nsec, - .mtimensec = fi->inode.i_mtime.tv_nsec, + .atimensec = atime.tv_nsec, + .mtimensec = mtime.tv_nsec, .ctimensec = ctime.tv_nsec, .mode = fi->inode.i_mode, .nlink = fi->inode.i_nlink, diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 9e6d587b3e67..c66a54d6c7d3 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -476,7 +476,7 @@ retry_locked: if (!fi->rdc.cached) { /* Starting cache? Set cache mtime. */ if (!ctx->pos && !fi->rdc.size) { - fi->rdc.mtime = inode->i_mtime; + fi->rdc.mtime = inode_get_mtime(inode); fi->rdc.iversion = inode_query_iversion(inode); } spin_unlock(&fi->rdc.lock); @@ -488,8 +488,10 @@ retry_locked: * changed, and reset the cache if so. */ if (!ctx->pos) { + struct timespec64 mtime = inode_get_mtime(inode); + if (inode_peek_iversion(inode) != fi->rdc.iversion || - !timespec64_equal(&fi->rdc.mtime, &inode->i_mtime)) { + !timespec64_equal(&fi->rdc.mtime, &mtime)) { fuse_rdc_reset(inode); goto retry_locked; } diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 49c01559580f..5b423fdbb13f 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -209,7 +209,7 @@ static const struct xattr_handler fuse_xattr_handler = { .set = fuse_xattr_set, }; -const struct xattr_handler *fuse_xattr_handlers[] = { +const struct xattr_handler * const fuse_xattr_handlers[] = { &fuse_xattr_handler, NULL }; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index ef7017fb6951..011cd992e0e6 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1386,7 +1386,7 @@ static int trunc_start(struct inode *inode, u64 newsize) ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG; i_size_write(inode, newsize); - ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode); + inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode)); gfs2_dinode_out(ip, dibh->b_data); if (journaled) @@ -1583,7 +1583,7 @@ out_unlock: /* Every transaction boundary, we rewrite the dinode to keep its di_blocks current in case of failure. */ - ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode); + inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode)); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -1949,7 +1949,7 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length) gfs2_statfs_change(sdp, 0, +btotal, 0); gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid, ip->i_inode.i_gid); - ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode); + inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode)); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); up_write(&ip->i_rw_mutex); @@ -1992,7 +1992,7 @@ static int trunc_end(struct gfs2_inode *ip) gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); gfs2_ordered_del_inode(ip); } - ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode); + inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode)); ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG; gfs2_trans_add_meta(ip->i_gl, dibh); @@ -2093,7 +2093,7 @@ static int do_grow(struct inode *inode, u64 size) goto do_end_trans; truncate_setsize(inode, size); - ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode); + inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode)); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 1a2afa88f8be..61ddd03ea111 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -130,7 +130,7 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf, memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); if (ip->i_inode.i_size < offset + size) i_size_write(&ip->i_inode, offset + size); - ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode); + inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode)); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -227,7 +227,7 @@ out: if (ip->i_inode.i_size < offset + copied) i_size_write(&ip->i_inode, offset + copied); - ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode); + inode_set_mtime_to_ts(&ip->i_inode, inode_set_ctime_current(&ip->i_inode)); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); @@ -1825,7 +1825,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, da->bh = NULL; brelse(bh); ip->i_entries++; - ip->i_inode.i_mtime = tv; + inode_set_mtime_to_ts(&ip->i_inode, tv); if (S_ISDIR(nip->i_inode.i_mode)) inc_nlink(&ip->i_inode); mark_inode_dirty(inode); @@ -1911,7 +1911,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) if (!dip->i_entries) gfs2_consist_inode(dip); dip->i_entries--; - dip->i_inode.i_mtime = tv; + inode_set_mtime_to_ts(&dip->i_inode, tv); if (d_is_dir(dentry)) drop_nlink(&dip->i_inode); mark_inode_dirty(&dip->i_inode); @@ -1952,7 +1952,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, dent->de_type = cpu_to_be16(new_type); brelse(bh); - dip->i_inode.i_mtime = inode_set_ctime_current(&dip->i_inode); + inode_set_mtime_to_ts(&dip->i_inode, inode_set_ctime_current(&dip->i_inode)); mark_inode_dirty_sync(&dip->i_inode); return 0; } diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 9cbf8d98489a..3772a5d9e85c 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -2010,7 +2010,9 @@ static long gfs2_scan_glock_lru(int nr) if (!test_bit(GLF_LOCK, &gl->gl_flags)) { if (!spin_trylock(&gl->gl_lockref.lock)) continue; - if (!gl->gl_lockref.count) { + if (gl->gl_lockref.count <= 1 && + (gl->gl_state == LM_ST_UNLOCKED || + demote_ok(gl))) { list_move(&gl->gl_lru, &dispose); atomic_dec(&lru_count); freed++; @@ -2717,16 +2719,19 @@ static struct file *gfs2_glockfd_next_file(struct gfs2_glockfd_iter *i) for(;; i->fd++) { struct inode *inode; - i->file = task_lookup_next_fd_rcu(i->task, &i->fd); + i->file = task_lookup_next_fdget_rcu(i->task, &i->fd); if (!i->file) { i->fd = 0; break; } + inode = file_inode(i->file); - if (inode->i_sb != i->sb) - continue; - if (get_file_rcu(i->file)) + if (inode->i_sb == i->sb) break; + + rcu_read_unlock(); + fput(i->file); + rcu_read_lock(); } rcu_read_unlock(); return i->file; diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index d26759a98b10..e7d334c277a1 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -403,7 +403,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); const struct gfs2_dinode *str = buf; - struct timespec64 atime; + struct timespec64 atime, iatime; u16 height, depth; umode_t mode = be32_to_cpu(str->di_mode); struct inode *inode = &ip->i_inode; @@ -433,10 +433,11 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) gfs2_set_inode_blocks(inode, be64_to_cpu(str->di_blocks)); atime.tv_sec = be64_to_cpu(str->di_atime); atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); - if (timespec64_compare(&inode->i_atime, &atime) < 0) - inode->i_atime = atime; - inode->i_mtime.tv_sec = be64_to_cpu(str->di_mtime); - inode->i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec); + iatime = inode_get_atime(inode); + if (timespec64_compare(&iatime, &atime) < 0) + inode_set_atime_to_ts(inode, atime); + inode_set_mtime(inode, be64_to_cpu(str->di_mtime), + be32_to_cpu(str->di_mtime_nsec)); inode_set_ctime(inode, be64_to_cpu(str->di_ctime), be32_to_cpu(str->di_ctime_nsec)); @@ -567,15 +568,16 @@ static void freeze_go_callback(struct gfs2_glock *gl, bool remote) struct super_block *sb = sdp->sd_vfs; if (!remote || - gl->gl_state != LM_ST_SHARED || + (gl->gl_state != LM_ST_SHARED && + gl->gl_state != LM_ST_UNLOCKED) || gl->gl_demote_state != LM_ST_UNLOCKED) return; /* * Try to get an active super block reference to prevent racing with - * unmount (see trylock_super()). But note that unmount isn't the only - * place where a write lock on s_umount is taken, and we can fail here - * because of things like remount as well. + * unmount (see super_trylock_shared()). But note that unmount isn't + * the only place where a write lock on s_umount is taken, and we can + * fail here because of things like remount as well. */ if (down_read_trylock(&sb->s_umount)) { atomic_inc(&sb->s_active); diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 0eac04507904..7fe77bc771e5 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -185,8 +185,9 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, set_bit(GLF_INSTANTIATE_NEEDED, &ip->i_gl->gl_flags); /* Lowest possible timestamp; will be overwritten in gfs2_dinode_in. */ - inode->i_atime.tv_sec = 1LL << (8 * sizeof(inode->i_atime.tv_sec) - 1); - inode->i_atime.tv_nsec = 0; + inode_set_atime(inode, + 1LL << (8 * sizeof(inode_get_atime_sec(inode)) - 1), + 0); glock_set_object(ip->i_gl, ip); @@ -696,7 +697,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, set_nlink(inode, S_ISDIR(mode) ? 2 : 1); inode->i_rdev = dev; inode->i_size = size; - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); munge_mode_uid_gid(dip, inode); check_and_update_goal(dip); ip->i_goal = dip->i_goal; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 171b2713d2e5..d9854aece15b 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -886,7 +886,7 @@ static int gfs2_adjust_quota(struct gfs2_sbd *sdp, loff_t loc, size = loc + sizeof(struct gfs2_quota); if (size > inode->i_size) i_size_write(inode, size); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); set_bit(QDF_REFRESH, &qd->qd_flags); } diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 21ada332d555..1429945215a0 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -50,7 +50,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip, ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); if (ret) return ret; - if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) + if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON && + sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET) return 0; ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid, ap); if (ret) diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 02d93da21b2b..52a878fa7139 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -410,9 +410,9 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) str->di_nlink = cpu_to_be32(inode->i_nlink); str->di_size = cpu_to_be64(i_size_read(inode)); str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(inode)); - str->di_atime = cpu_to_be64(inode->i_atime.tv_sec); - str->di_mtime = cpu_to_be64(inode->i_mtime.tv_sec); - str->di_ctime = cpu_to_be64(inode_get_ctime(inode).tv_sec); + str->di_atime = cpu_to_be64(inode_get_atime_sec(inode)); + str->di_mtime = cpu_to_be64(inode_get_mtime_sec(inode)); + str->di_ctime = cpu_to_be64(inode_get_ctime_sec(inode)); str->di_goal_meta = cpu_to_be64(ip->i_goal); str->di_goal_data = cpu_to_be64(ip->i_goal); @@ -427,9 +427,9 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) str->di_entries = cpu_to_be32(ip->i_entries); str->di_eattr = cpu_to_be64(ip->i_eattr); - str->di_atime_nsec = cpu_to_be32(inode->i_atime.tv_nsec); - str->di_mtime_nsec = cpu_to_be32(inode->i_mtime.tv_nsec); - str->di_ctime_nsec = cpu_to_be32(inode_get_ctime(inode).tv_nsec); + str->di_atime_nsec = cpu_to_be32(inode_get_atime_nsec(inode)); + str->di_mtime_nsec = cpu_to_be32(inode_get_mtime_nsec(inode)); + str->di_ctime_nsec = cpu_to_be32(inode_get_ctime_nsec(inode)); } /** diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index ab9c83106932..b4ddf6244586 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -60,8 +60,8 @@ extern const struct export_operations gfs2_export_ops; extern const struct super_operations gfs2_super_ops; extern const struct dentry_operations gfs2_dops; -extern const struct xattr_handler *gfs2_xattr_handlers_max[]; -extern const struct xattr_handler **gfs2_xattr_handlers_min; +extern const struct xattr_handler * const gfs2_xattr_handlers_max[]; +extern const struct xattr_handler * const *gfs2_xattr_handlers_min; #endif /* __SUPER_DOT_H__ */ diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 4fea70c0fe3d..79d5c5559512 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1494,7 +1494,7 @@ static const struct xattr_handler gfs2_xattr_trusted_handler = { .set = gfs2_xattr_set, }; -const struct xattr_handler *gfs2_xattr_handlers_max[] = { +const struct xattr_handler * const gfs2_xattr_handlers_max[] = { /* GFS2_FS_FORMAT_MAX */ &gfs2_xattr_trusted_handler, @@ -1504,4 +1504,4 @@ const struct xattr_handler *gfs2_xattr_handlers_max[] = { NULL, }; -const struct xattr_handler **gfs2_xattr_handlers_min = gfs2_xattr_handlers_max + 1; +const struct xattr_handler * const *gfs2_xattr_handlers_min = gfs2_xattr_handlers_max + 1; diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c index 6341bb248247..f8395cdd1adf 100644 --- a/fs/hfs/attr.c +++ b/fs/hfs/attr.c @@ -146,7 +146,7 @@ static const struct xattr_handler hfs_type_handler = { .set = hfs_xattr_set, }; -const struct xattr_handler *hfs_xattr_handlers[] = { +const struct xattr_handler * const hfs_xattr_handlers[] = { &hfs_creator_handler, &hfs_type_handler, NULL diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index 632c226a3972..d63880e7d9d6 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -133,7 +133,7 @@ int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct i goto err1; dir->i_size++; - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); hfs_find_exit(&fd); return 0; @@ -269,7 +269,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str) } dir->i_size--; - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); res = 0; out: @@ -337,7 +337,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name, if (err) goto out; dst_dir->i_size++; - dst_dir->i_mtime = inode_set_ctime_current(dst_dir); + inode_set_mtime_to_ts(dst_dir, inode_set_ctime_current(dst_dir)); mark_inode_dirty(dst_dir); /* finally remove the old entry */ @@ -349,7 +349,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name, if (err) goto out; src_dir->i_size--; - src_dir->i_mtime = inode_set_ctime_current(src_dir); + inode_set_mtime_to_ts(src_dir, inode_set_ctime_current(src_dir)); mark_inode_dirty(src_dir); type = entry.type; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 49d02524e667..b5a6ad5df357 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -215,7 +215,7 @@ extern void hfs_evict_inode(struct inode *); extern void hfs_delete_inode(struct inode *); /* attr.c */ -extern const struct xattr_handler *hfs_xattr_handlers[]; +extern const struct xattr_handler * const hfs_xattr_handlers[]; /* mdb.c */ extern int hfs_mdb_get(struct super_block *); diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index ee349b72cfb3..a7bc4690a780 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -200,7 +200,7 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); set_nlink(inode, 1); - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); HFS_I(inode)->flags = 0; HFS_I(inode)->rsrc_inode = NULL; HFS_I(inode)->fs_blocks = 0; @@ -355,8 +355,8 @@ static int hfs_read_inode(struct inode *inode, void *data) inode->i_mode |= S_IWUGO; inode->i_mode &= ~hsb->s_file_umask; inode->i_mode |= S_IFREG; - inode->i_atime = inode->i_mtime = inode_set_ctime_to_ts(inode, - hfs_m_to_utime(rec->file.MdDat)); + inode_set_mtime_to_ts(inode, + inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, hfs_m_to_utime(rec->file.MdDat)))); inode->i_op = &hfs_file_inode_operations; inode->i_fop = &hfs_file_operations; inode->i_mapping->a_ops = &hfs_aops; @@ -366,8 +366,8 @@ static int hfs_read_inode(struct inode *inode, void *data) inode->i_size = be16_to_cpu(rec->dir.Val) + 2; HFS_I(inode)->fs_blocks = 0; inode->i_mode = S_IFDIR | (S_IRWXUGO & ~hsb->s_dir_umask); - inode->i_atime = inode->i_mtime = inode_set_ctime_to_ts(inode, - hfs_m_to_utime(rec->dir.MdDat)); + inode_set_mtime_to_ts(inode, + inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, hfs_m_to_utime(rec->dir.MdDat)))); inode->i_op = &hfs_dir_inode_operations; inode->i_fop = &hfs_dir_operations; break; @@ -474,7 +474,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) be32_to_cpu(rec.dir.DirID) != inode->i_ino) { } - rec.dir.MdDat = hfs_u_to_mtime(inode->i_mtime); + rec.dir.MdDat = hfs_u_to_mtime(inode_get_mtime(inode)); rec.dir.Val = cpu_to_be16(inode->i_size - 2); hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, @@ -502,7 +502,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) else rec.file.Flags |= HFS_FIL_LOCK; hfs_inode_write_fork(inode, rec.file.ExtRec, &rec.file.LgLen, &rec.file.PyLen); - rec.file.MdDat = hfs_u_to_mtime(inode->i_mtime); + rec.file.MdDat = hfs_u_to_mtime(inode_get_mtime(inode)); hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_file)); @@ -654,7 +654,7 @@ int hfs_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, truncate_setsize(inode, attr->ia_size); hfs_file_truncate(inode); - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); } setattr_copy(&nop_mnt_idmap, inode, attr); diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c index dc27d418fbcd..76fa02e3835b 100644 --- a/fs/hfs/sysdep.c +++ b/fs/hfs/sysdep.c @@ -28,11 +28,13 @@ static int hfs_revalidate_dentry(struct dentry *dentry, unsigned int flags) /* fix up inode on a timezone change */ diff = sys_tz.tz_minuteswest * 60 - HFS_I(inode)->tz_secondswest; if (diff) { - struct timespec64 ctime = inode_get_ctime(inode); + struct timespec64 ts = inode_get_ctime(inode); - inode_set_ctime(inode, ctime.tv_sec + diff, ctime.tv_nsec); - inode->i_atime.tv_sec += diff; - inode->i_mtime.tv_sec += diff; + inode_set_ctime(inode, ts.tv_sec + diff, ts.tv_nsec); + ts = inode_get_atime(inode); + inode_set_atime(inode, ts.tv_sec + diff, ts.tv_nsec); + ts = inode_get_mtime(inode); + inode_set_mtime(inode, ts.tv_sec + diff, ts.tv_nsec); HFS_I(inode)->tz_secondswest += diff; } return 1; diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index e71ae2537eaa..1995bafee839 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -312,7 +312,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, dir->i_size++; if (S_ISDIR(inode->i_mode)) hfsplus_subfolders_inc(dir); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY); hfs_find_exit(&fd); @@ -417,7 +417,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str) dir->i_size--; if (type == HFSPLUS_FOLDER) hfsplus_subfolders_dec(dir); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY); if (type == HFSPLUS_FILE || type == HFSPLUS_FOLDER) { @@ -494,7 +494,7 @@ int hfsplus_rename_cat(u32 cnid, dst_dir->i_size++; if (type == HFSPLUS_FOLDER) hfsplus_subfolders_inc(dst_dir); - dst_dir->i_mtime = inode_set_ctime_current(dst_dir); + inode_set_mtime_to_ts(dst_dir, inode_set_ctime_current(dst_dir)); /* finally remove the old entry */ err = hfsplus_cat_build_key(sb, src_fd.search_key, @@ -511,7 +511,7 @@ int hfsplus_rename_cat(u32 cnid, src_dir->i_size--; if (type == HFSPLUS_FOLDER) hfsplus_subfolders_dec(src_dir); - src_dir->i_mtime = inode_set_ctime_current(src_dir); + inode_set_mtime_to_ts(src_dir, inode_set_ctime_current(src_dir)); /* remove old thread entry */ hfsplus_cat_build_key_with_cnid(sb, src_fd.search_key, cnid); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index c65c8c4b03dd..702a0663b1d8 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -267,7 +267,7 @@ static int hfsplus_setattr(struct mnt_idmap *idmap, } truncate_setsize(inode, attr->ia_size); hfsplus_file_truncate(inode); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); } setattr_copy(&nop_mnt_idmap, inode, attr); @@ -392,7 +392,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir, inode->i_ino = sbi->next_cnid++; inode_init_owner(&nop_mnt_idmap, inode, dir, mode); set_nlink(inode, 1); - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); hip = HFSPLUS_I(inode); INIT_LIST_HEAD(&hip->open_dir_list); @@ -521,8 +521,9 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) hfsplus_get_perms(inode, &folder->permissions, 1); set_nlink(inode, 1); inode->i_size = 2 + be32_to_cpu(folder->valence); - inode->i_atime = hfsp_mt2ut(folder->access_date); - inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); + inode_set_atime_to_ts(inode, hfsp_mt2ut(folder->access_date)); + inode_set_mtime_to_ts(inode, + hfsp_mt2ut(folder->content_mod_date)); inode_set_ctime_to_ts(inode, hfsp_mt2ut(folder->attribute_mod_date)); HFSPLUS_I(inode)->create_date = folder->create_date; @@ -563,8 +564,9 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) init_special_inode(inode, inode->i_mode, be32_to_cpu(file->permissions.dev)); } - inode->i_atime = hfsp_mt2ut(file->access_date); - inode->i_mtime = hfsp_mt2ut(file->content_mod_date); + inode_set_atime_to_ts(inode, hfsp_mt2ut(file->access_date)); + inode_set_mtime_to_ts(inode, + hfsp_mt2ut(file->content_mod_date)); inode_set_ctime_to_ts(inode, hfsp_mt2ut(file->attribute_mod_date)); HFSPLUS_I(inode)->create_date = file->create_date; @@ -609,8 +611,8 @@ int hfsplus_cat_write_inode(struct inode *inode) sizeof(struct hfsplus_cat_folder)); /* simple node checks? */ hfsplus_cat_set_perms(inode, &folder->permissions); - folder->access_date = hfsp_ut2mt(inode->i_atime); - folder->content_mod_date = hfsp_ut2mt(inode->i_mtime); + folder->access_date = hfsp_ut2mt(inode_get_atime(inode)); + folder->content_mod_date = hfsp_ut2mt(inode_get_mtime(inode)); folder->attribute_mod_date = hfsp_ut2mt(inode_get_ctime(inode)); folder->valence = cpu_to_be32(inode->i_size - 2); if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) { @@ -644,8 +646,8 @@ int hfsplus_cat_write_inode(struct inode *inode) file->flags |= cpu_to_be16(HFSPLUS_FILE_LOCKED); else file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED); - file->access_date = hfsp_ut2mt(inode->i_atime); - file->content_mod_date = hfsp_ut2mt(inode->i_mtime); + file->access_date = hfsp_ut2mt(inode_get_atime(inode)); + file->content_mod_date = hfsp_ut2mt(inode_get_mtime(inode)); file->attribute_mod_date = hfsp_ut2mt(inode_get_ctime(inode)); hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, sizeof(struct hfsplus_cat_file)); diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index 58021e73c00b..9c9ff6b8c6f7 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -13,7 +13,7 @@ static int hfsplus_removexattr(struct inode *inode, const char *name); -const struct xattr_handler *hfsplus_xattr_handlers[] = { +const struct xattr_handler * const hfsplus_xattr_handlers[] = { &hfsplus_xattr_osx_handler, &hfsplus_xattr_user_handler, &hfsplus_xattr_trusted_handler, diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h index d14e362b3eba..15cc55e41410 100644 --- a/fs/hfsplus/xattr.h +++ b/fs/hfsplus/xattr.h @@ -17,7 +17,7 @@ extern const struct xattr_handler hfsplus_xattr_user_handler; extern const struct xattr_handler hfsplus_xattr_trusted_handler; extern const struct xattr_handler hfsplus_xattr_security_handler; -extern const struct xattr_handler *hfsplus_xattr_handlers[]; +extern const struct xattr_handler * const hfsplus_xattr_handlers[]; int __hfsplus_setxattr(struct inode *inode, const char *name, const void *value, size_t size, int flags); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index dc5a5cea5fae..ea87f24c6c3f 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -513,10 +513,14 @@ static int hostfs_inode_update(struct inode *ino, const struct hostfs_stat *st) set_nlink(ino, st->nlink); i_uid_write(ino, st->uid); i_gid_write(ino, st->gid); - ino->i_atime = - (struct timespec64){ st->atime.tv_sec, st->atime.tv_nsec }; - ino->i_mtime = - (struct timespec64){ st->mtime.tv_sec, st->mtime.tv_nsec }; + inode_set_atime_to_ts(ino, (struct timespec64){ + st->atime.tv_sec, + st->atime.tv_nsec, + }); + inode_set_mtime_to_ts(ino, (struct timespec64){ + st->mtime.tv_sec, + st->mtime.tv_nsec, + }); inode_set_ctime(ino, st->ctime.tv_sec, st->ctime.tv_nsec); ino->i_size = st->size; ino->i_blocks = st->blocks; diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index f36566d61215..49dd585c2b17 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -277,14 +277,16 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, unsigned in * inode. */ - if (!inode_get_ctime(result).tv_sec) { + if (!inode_get_ctime_sec(result)) { time64_t csec = local_to_gmt(dir->i_sb, le32_to_cpu(de->creation_date)); inode_set_ctime(result, csec ? csec : 1, 0); - result->i_mtime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->write_date)); - result->i_mtime.tv_nsec = 0; - result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->read_date)); - result->i_atime.tv_nsec = 0; + inode_set_mtime(result, + local_to_gmt(dir->i_sb, le32_to_cpu(de->write_date)), + 0); + inode_set_atime(result, + local_to_gmt(dir->i_sb, le32_to_cpu(de->read_date)), + 0); hpfs_result->i_ea_size = le32_to_cpu(de->ea_size); if (!hpfs_result->i_ea_mode && de->read_only) result->i_mode &= ~0222; diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 479166378bae..a59e8fa630db 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -37,8 +37,8 @@ void hpfs_init_inode(struct inode *i) hpfs_inode->i_dirty = 0; inode_set_ctime(i, 0, 0); - i->i_mtime.tv_sec = i->i_mtime.tv_nsec = 0; - i->i_atime.tv_sec = i->i_atime.tv_nsec = 0; + inode_set_mtime(i, 0, 0); + inode_set_atime(i, 0, 0); } void hpfs_read_inode(struct inode *i) @@ -230,9 +230,9 @@ void hpfs_write_inode_nolock(struct inode *i) } hpfs_write_inode_ea(i, fnode); if (de) { - de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec)); - de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec)); - de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime(i).tv_sec)); + de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_mtime_sec(i))); + de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_atime_sec(i))); + de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime_sec(i))); de->read_only = !(i->i_mode & 0222); de->ea_size = cpu_to_le32(hpfs_inode->i_ea_size); hpfs_mark_4buffers_dirty(&qbh); @@ -240,9 +240,9 @@ void hpfs_write_inode_nolock(struct inode *i) } if (S_ISDIR(i->i_mode)) { if ((de = map_dirent(i, hpfs_inode->i_dno, "\001\001", 2, NULL, &qbh))) { - de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec)); - de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec)); - de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime(i).tv_sec)); + de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_mtime_sec(i))); + de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_atime_sec(i))); + de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime_sec(i))); de->read_only = !(i->i_mode & 0222); de->ea_size = cpu_to_le32(/*hpfs_inode->i_ea_size*/0); de->file_size = cpu_to_le32(0); diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index f4eb8d6f5989..9184b4584b01 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -12,10 +12,10 @@ static void hpfs_update_directory_times(struct inode *dir) { time64_t t = local_to_gmt(dir->i_sb, local_get_seconds(dir->i_sb)); - if (t == dir->i_mtime.tv_sec && - t == inode_get_ctime(dir).tv_sec) + if (t == inode_get_mtime_sec(dir) && + t == inode_get_ctime_sec(dir)) return; - dir->i_mtime = inode_set_ctime(dir, t, 0); + inode_set_mtime_to_ts(dir, inode_set_ctime(dir, t, 0)); hpfs_write_inode_nolock(dir); } @@ -58,8 +58,8 @@ static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, result->i_ino = fno; hpfs_i(result)->i_parent_dir = dir->i_ino; hpfs_i(result)->i_dno = dno; - result->i_mtime = result->i_atime = - inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0); + inode_set_mtime_to_ts(result, + inode_set_atime_to_ts(result, inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0))); hpfs_i(result)->i_ea_size = 0; result->i_mode |= S_IFDIR; result->i_op = &hpfs_dir_iops; @@ -164,8 +164,8 @@ static int hpfs_create(struct mnt_idmap *idmap, struct inode *dir, result->i_fop = &hpfs_file_ops; set_nlink(result, 1); hpfs_i(result)->i_parent_dir = dir->i_ino; - result->i_mtime = result->i_atime = - inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0); + inode_set_mtime_to_ts(result, + inode_set_atime_to_ts(result, inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0))); hpfs_i(result)->i_ea_size = 0; if (dee.read_only) result->i_mode &= ~0222; @@ -245,8 +245,8 @@ static int hpfs_mknod(struct mnt_idmap *idmap, struct inode *dir, hpfs_init_inode(result); result->i_ino = fno; hpfs_i(result)->i_parent_dir = dir->i_ino; - result->i_mtime = result->i_atime = - inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0); + inode_set_mtime_to_ts(result, + inode_set_atime_to_ts(result, inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0))); hpfs_i(result)->i_ea_size = 0; result->i_uid = current_fsuid(); result->i_gid = current_fsgid(); @@ -319,8 +319,8 @@ static int hpfs_symlink(struct mnt_idmap *idmap, struct inode *dir, result->i_ino = fno; hpfs_init_inode(result); hpfs_i(result)->i_parent_dir = dir->i_ino; - result->i_mtime = result->i_atime = - inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0); + inode_set_mtime_to_ts(result, + inode_set_atime_to_ts(result, inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0))); hpfs_i(result)->i_ea_size = 0; result->i_mode = S_IFLNK | 0777; result->i_uid = current_fsuid(); diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 758a51564124..6b0ba3c1efba 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -725,10 +725,12 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) if (!de) hpfs_error(s, "unable to find root dir"); else { - root->i_atime.tv_sec = local_to_gmt(s, le32_to_cpu(de->read_date)); - root->i_atime.tv_nsec = 0; - root->i_mtime.tv_sec = local_to_gmt(s, le32_to_cpu(de->write_date)); - root->i_mtime.tv_nsec = 0; + inode_set_atime(root, + local_to_gmt(s, le32_to_cpu(de->read_date)), + 0); + inode_set_mtime(root, + local_to_gmt(s, le32_to_cpu(de->write_date)), + 0); inode_set_ctime(root, local_to_gmt(s, le32_to_cpu(de->creation_date)), 0); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 316c4cebd3f3..da217eaba102 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -980,7 +980,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, inode->i_mode = S_IFDIR | ctx->mode; inode->i_uid = ctx->uid; inode->i_gid = ctx->gid; - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); inode->i_op = &hugetlbfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ @@ -1024,7 +1024,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, &hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); inode->i_mapping->private_data = resv_map; info->seals = F_SEAL_SEAL; switch (mode & S_IFMT) { @@ -1067,7 +1067,7 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir, inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev); if (!inode) return -ENOSPC; - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); d_instantiate(dentry, inode); dget(dentry);/* Extra count - pin the dentry in core */ return 0; @@ -1099,7 +1099,7 @@ static int hugetlbfs_tmpfile(struct mnt_idmap *idmap, inode = hugetlbfs_get_inode(dir->i_sb, dir, mode | S_IFREG, 0); if (!inode) return -ENOSPC; - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); d_tmpfile(file, inode); return finish_open_simple(file, 0); } @@ -1121,7 +1121,7 @@ static int hugetlbfs_symlink(struct mnt_idmap *idmap, } else iput(inode); } - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); return error; } diff --git a/fs/init.c b/fs/init.c index 9684406a8416..e9387b6c4f30 100644 --- a/fs/init.c +++ b/fs/init.c @@ -153,8 +153,7 @@ int __init init_mknod(const char *filename, umode_t mode, unsigned int dev) if (IS_ERR(dentry)) return PTR_ERR(dentry); - if (!IS_POSIXACL(path.dentry->d_inode)) - mode &= ~current_umask(); + mode = mode_strip_umask(d_inode(path.dentry), mode); error = security_path_mknod(&path, dentry, mode, dev); if (!error) error = vfs_mknod(mnt_idmap(path.mnt), path.dentry->d_inode, @@ -229,8 +228,7 @@ int __init init_mkdir(const char *pathname, umode_t mode) dentry = kern_path_create(AT_FDCWD, pathname, &path, LOOKUP_DIRECTORY); if (IS_ERR(dentry)) return PTR_ERR(dentry); - if (!IS_POSIXACL(path.dentry->d_inode)) - mode &= ~current_umask(); + mode = mode_strip_umask(d_inode(path.dentry), mode); error = security_path_mkdir(&path, dentry, mode); if (!error) error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, diff --git a/fs/inode.c b/fs/inode.c index 35fd688168c5..4f8984b97df0 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1837,27 +1837,29 @@ EXPORT_SYMBOL(bmap); static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, struct timespec64 now) { - struct timespec64 ctime; + struct timespec64 atime, mtime, ctime; if (!(mnt->mnt_flags & MNT_RELATIME)) return 1; /* * Is mtime younger than or equal to atime? If yes, update atime: */ - if (timespec64_compare(&inode->i_mtime, &inode->i_atime) >= 0) + atime = inode_get_atime(inode); + mtime = inode_get_mtime(inode); + if (timespec64_compare(&mtime, &atime) >= 0) return 1; /* * Is ctime younger than or equal to atime? If yes, update atime: */ ctime = inode_get_ctime(inode); - if (timespec64_compare(&ctime, &inode->i_atime) >= 0) + if (timespec64_compare(&ctime, &atime) >= 0) return 1; /* * Is the previous atime value older than a day? If yes, * update atime: */ - if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60) + if ((long)(now.tv_sec - atime.tv_sec) >= 24*60*60) return 1; /* * Good, we can skip the atime update: @@ -1888,12 +1890,13 @@ int inode_update_timestamps(struct inode *inode, int flags) if (flags & (S_MTIME|S_CTIME|S_VERSION)) { struct timespec64 ctime = inode_get_ctime(inode); + struct timespec64 mtime = inode_get_mtime(inode); now = inode_set_ctime_current(inode); if (!timespec64_equal(&now, &ctime)) updated |= S_CTIME; - if (!timespec64_equal(&now, &inode->i_mtime)) { - inode->i_mtime = now; + if (!timespec64_equal(&now, &mtime)) { + inode_set_mtime_to_ts(inode, now); updated |= S_MTIME; } if (IS_I_VERSION(inode) && inode_maybe_inc_iversion(inode, updated)) @@ -1903,8 +1906,10 @@ int inode_update_timestamps(struct inode *inode, int flags) } if (flags & S_ATIME) { - if (!timespec64_equal(&now, &inode->i_atime)) { - inode->i_atime = now; + struct timespec64 atime = inode_get_atime(inode); + + if (!timespec64_equal(&now, &atime)) { + inode_set_atime_to_ts(inode, now); updated |= S_ATIME; } } @@ -1963,7 +1968,7 @@ EXPORT_SYMBOL(inode_update_time); bool atime_needs_update(const struct path *path, struct inode *inode) { struct vfsmount *mnt = path->mnt; - struct timespec64 now; + struct timespec64 now, atime; if (inode->i_flags & S_NOATIME) return false; @@ -1989,7 +1994,8 @@ bool atime_needs_update(const struct path *path, struct inode *inode) if (!relatime_need_update(mnt, inode, now)) return false; - if (timespec64_equal(&inode->i_atime, &now)) + atime = inode_get_atime(inode); + if (timespec64_equal(&atime, &now)) return false; return true; @@ -2006,7 +2012,7 @@ void touch_atime(const struct path *path) if (!sb_start_write_trylock(inode->i_sb)) return; - if (__mnt_want_write(mnt) != 0) + if (mnt_get_write_access(mnt) != 0) goto skip_update; /* * File systems can error out when updating inodes if they need to @@ -2018,7 +2024,7 @@ void touch_atime(const struct path *path) * of the fs read only, e.g. subvolumes in Btrfs. */ inode_update_time(inode, S_ATIME); - __mnt_drop_write(mnt); + mnt_put_write_access(mnt); skip_update: sb_end_write(inode->i_sb); } @@ -2102,63 +2108,22 @@ int file_remove_privs(struct file *file) } EXPORT_SYMBOL(file_remove_privs); -/** - * current_mgtime - Return FS time (possibly fine-grained) - * @inode: inode. - * - * Return the current time truncated to the time granularity supported by - * the fs, as suitable for a ctime/mtime change. If the ctime is flagged - * as having been QUERIED, get a fine-grained timestamp. - */ -struct timespec64 current_mgtime(struct inode *inode) -{ - struct timespec64 now, ctime; - atomic_long_t *pnsec = (atomic_long_t *)&inode->__i_ctime.tv_nsec; - long nsec = atomic_long_read(pnsec); - - if (nsec & I_CTIME_QUERIED) { - ktime_get_real_ts64(&now); - return timestamp_truncate(now, inode); - } - - ktime_get_coarse_real_ts64(&now); - now = timestamp_truncate(now, inode); - - /* - * If we've recently fetched a fine-grained timestamp - * then the coarse-grained one may still be earlier than the - * existing ctime. Just keep the existing value if so. - */ - ctime = inode_get_ctime(inode); - if (timespec64_compare(&ctime, &now) > 0) - now = ctime; - - return now; -} -EXPORT_SYMBOL(current_mgtime); - -static struct timespec64 current_ctime(struct inode *inode) -{ - if (is_mgtime(inode)) - return current_mgtime(inode); - return current_time(inode); -} - static int inode_needs_update_time(struct inode *inode) { int sync_it = 0; - struct timespec64 now = current_ctime(inode); - struct timespec64 ctime; + struct timespec64 now = current_time(inode); + struct timespec64 ts; /* First try to exhaust all avenues to not sync */ if (IS_NOCMTIME(inode)) return 0; - if (!timespec64_equal(&inode->i_mtime, &now)) + ts = inode_get_mtime(inode); + if (!timespec64_equal(&ts, &now)) sync_it = S_MTIME; - ctime = inode_get_ctime(inode); - if (!timespec64_equal(&ctime, &now)) + ts = inode_get_ctime(inode); + if (!timespec64_equal(&ts, &now)) sync_it |= S_CTIME; if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode)) @@ -2173,9 +2138,9 @@ static int __file_update_time(struct file *file, int sync_mode) struct inode *inode = file_inode(file); /* try to update time settings */ - if (!__mnt_want_write_file(file)) { + if (!mnt_get_write_access_file(file)) { ret = inode_update_time(inode, sync_mode); - __mnt_drop_write_file(file); + mnt_put_write_access_file(file); } return ret; @@ -2578,43 +2543,9 @@ EXPORT_SYMBOL(current_time); */ struct timespec64 inode_set_ctime_current(struct inode *inode) { - struct timespec64 now; - struct timespec64 ctime; - - ctime.tv_nsec = READ_ONCE(inode->__i_ctime.tv_nsec); - if (!(ctime.tv_nsec & I_CTIME_QUERIED)) { - now = current_time(inode); + struct timespec64 now = current_time(inode); - /* Just copy it into place if it's not multigrain */ - if (!is_mgtime(inode)) { - inode_set_ctime_to_ts(inode, now); - return now; - } - - /* - * If we've recently updated with a fine-grained timestamp, - * then the coarse-grained one may still be earlier than the - * existing ctime. Just keep the existing value if so. - */ - ctime.tv_sec = inode->__i_ctime.tv_sec; - if (timespec64_compare(&ctime, &now) > 0) - return ctime; - - /* - * Ctime updates are usually protected by the inode_lock, but - * we can still race with someone setting the QUERIED flag. - * Try to swap the new nsec value into place. If it's changed - * in the interim, then just go with a fine-grained timestamp. - */ - if (cmpxchg(&inode->__i_ctime.tv_nsec, ctime.tv_nsec, - now.tv_nsec) != ctime.tv_nsec) - goto fine_grained; - inode->__i_ctime.tv_sec = now.tv_sec; - return now; - } -fine_grained: - ktime_get_real_ts64(&now); - inode_set_ctime_to_ts(inode, timestamp_truncate(now, inode)); + inode_set_ctime(inode, now.tv_sec, now.tv_nsec); return now; } EXPORT_SYMBOL(inode_set_ctime_current); diff --git a/fs/internal.h b/fs/internal.h index d64ae03998cc..58e43341aebf 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -73,8 +73,8 @@ extern int sb_prepare_remount_readonly(struct super_block *); extern void __init mnt_init(void); -extern int __mnt_want_write_file(struct file *); -extern void __mnt_drop_write_file(struct file *); +int mnt_get_write_access_file(struct file *file); +void mnt_put_write_access_file(struct file *file); extern void dissolve_on_fput(struct vfsmount *); extern bool may_mount(void); @@ -94,14 +94,22 @@ extern void chroot_fs_refs(const struct path *, const struct path *); struct file *alloc_empty_file(int flags, const struct cred *cred); struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred); struct file *alloc_empty_backing_file(int flags, const struct cred *cred); +void release_empty_file(struct file *f); + +static inline void file_put_write_access(struct file *file) +{ + put_write_access(file->f_inode); + mnt_put_write_access(file->f_path.mnt); + if (unlikely(file->f_mode & FMODE_BACKING)) + mnt_put_write_access(backing_file_user_path(file)->mnt); +} static inline void put_file_access(struct file *file) { if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { i_readcount_dec(file->f_inode); } else if (file->f_mode & FMODE_WRITER) { - put_write_access(file->f_inode); - __mnt_drop_write(file->f_path.mnt); + file_put_write_access(file); } } @@ -130,9 +138,9 @@ static inline void sb_start_ro_state_change(struct super_block *sb) * mnt_is_readonly() making sure if mnt_is_readonly() sees SB_RDONLY * cleared, it will see s_readonly_remount set. * For RW->RO transition, the barrier pairs with the barrier in - * __mnt_want_write() before the mnt_is_readonly() check. The barrier - * makes sure if __mnt_want_write() sees MNT_WRITE_HOLD already - * cleared, it will see s_readonly_remount set. + * mnt_get_write_access() before the mnt_is_readonly() check. + * The barrier makes sure if mnt_get_write_access() sees MNT_WRITE_HOLD + * already cleared, it will see s_readonly_remount set. */ smp_wmb(); } diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index ae8673ce08b1..2bc0aa23fde3 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -640,11 +640,13 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, size_t poff, plen; /* - * If the write completely overlaps the current folio, then + * If the write or zeroing completely overlaps the current folio, then * entire folio will be dirtied so there is no need for * per-block state tracking structures to be attached to this folio. + * For the unshare case, we must read in the ondisk contents because we + * are not changing pagecache contents. */ - if (pos <= folio_pos(folio) && + if (!(iter->flags & IOMAP_UNSHARE) && pos <= folio_pos(folio) && pos + len >= folio_pos(folio) + folio_size(folio)) return 0; @@ -879,8 +881,10 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) size_t bytes; /* Bytes to write to folio */ size_t copied; /* Bytes copied from user */ + bytes = iov_iter_count(i); +retry: offset = pos & (chunk - 1); - bytes = min(chunk - offset, iov_iter_count(i)); + bytes = min(chunk - offset, bytes); status = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags); if (unlikely(status)) @@ -931,10 +935,12 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) * halfway through, might be a race with munmap, * might be severe memory pressure. */ - if (copied) - bytes = copied; if (chunk > PAGE_SIZE) chunk /= 2; + if (copied) { + bytes = copied; + goto retry; + } } else { pos += status; written += status; @@ -1047,7 +1053,7 @@ static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio, /* * Scan the data range passed to us for dirty page cache folios. If we find a - * dirty folio, punch out the preceeding range and update the offset from which + * dirty folio, punch out the preceding range and update the offset from which * the next punch will start from. * * We can punch out storage reservations under clean pages because they either @@ -1261,7 +1267,6 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) const struct iomap *srcmap = iomap_iter_srcmap(iter); loff_t pos = iter->pos; loff_t length = iomap_length(iter); - long status = 0; loff_t written = 0; /* don't bother with blocks that are not shared to start with */ @@ -1272,28 +1277,33 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) return length; do { - unsigned long offset = offset_in_page(pos); - unsigned long bytes = min_t(loff_t, PAGE_SIZE - offset, length); struct folio *folio; + int status; + size_t offset; + size_t bytes = min_t(u64, SIZE_MAX, length); status = iomap_write_begin(iter, pos, bytes, &folio); if (unlikely(status)) return status; - if (iter->iomap.flags & IOMAP_F_STALE) + if (iomap->flags & IOMAP_F_STALE) break; - status = iomap_write_end(iter, pos, bytes, bytes, folio); - if (WARN_ON_ONCE(status == 0)) + offset = offset_in_folio(folio, pos); + if (bytes > folio_size(folio) - offset) + bytes = folio_size(folio) - offset; + + bytes = iomap_write_end(iter, pos, bytes, bytes, folio); + if (WARN_ON_ONCE(bytes == 0)) return -EIO; cond_resched(); - pos += status; - written += status; - length -= status; + pos += bytes; + written += bytes; + length -= bytes; balance_dirty_pages_ratelimited(iter->inode->i_mapping); - } while (length); + } while (length > 0); return written; } diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 2ee21286ac8f..3e4d53e26f94 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1422,8 +1422,8 @@ static int isofs_read_inode(struct inode *inode, int relocated) inode->i_ino, de->flags[-high_sierra]); } #endif - inode->i_mtime = inode->i_atime = - inode_set_ctime(inode, iso_date(de->date, high_sierra), 0); + inode_set_mtime_to_ts(inode, + inode_set_atime_to_ts(inode, inode_set_ctime(inode, iso_date(de->date, high_sierra), 0))); ei->i_first_extent = (isonum_733(de->extent) + isonum_711(de->ext_attr_length)); diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index 348783a70f57..d6c17ad69dee 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -426,16 +426,14 @@ repeat: 0); } if (rr->u.TF.flags & TF_MODIFY) { - inode->i_mtime.tv_sec = - iso_date(rr->u.TF.times[cnt++].time, - 0); - inode->i_mtime.tv_nsec = 0; + inode_set_mtime(inode, + iso_date(rr->u.TF.times[cnt++].time, 0), + 0); } if (rr->u.TF.flags & TF_ACCESS) { - inode->i_atime.tv_sec = - iso_date(rr->u.TF.times[cnt++].time, - 0); - inode->i_atime.tv_nsec = 0; + inode_set_atime(inode, + iso_date(rr->u.TF.times[cnt++].time, 0), + 0); } if (rr->u.TF.flags & TF_ATTRIBUTES) { inode_set_ctime(inode, @@ -531,9 +529,9 @@ repeat: inode->i_rdev = reloc->i_rdev; inode->i_size = reloc->i_size; inode->i_blocks = reloc->i_blocks; - inode->i_atime = reloc->i_atime; + inode_set_atime_to_ts(inode, inode_get_atime(reloc)); inode_set_ctime_to_ts(inode, inode_get_ctime(reloc)); - inode->i_mtime = reloc->i_mtime; + inode_set_mtime_to_ts(inode, inode_get_mtime(reloc)); iput(reloc); break; #ifdef CONFIG_ZISOFS diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 1073259902a6..8d6f934c3d95 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -298,14 +298,12 @@ static int journal_finish_inode_data_buffers(journal_t *journal, static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) { - struct page *page = bh->b_page; char *addr; __u32 checksum; - addr = kmap_atomic(page); - checksum = crc32_be(crc32_sum, - (void *)(addr + offset_in_page(bh->b_data)), bh->b_size); - kunmap_atomic(addr); + addr = kmap_local_folio(bh->b_folio, bh_offset(bh)); + checksum = crc32_be(crc32_sum, addr, bh->b_size); + kunmap_local(addr); return checksum; } @@ -322,7 +320,6 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, struct buffer_head *bh, __u32 sequence) { journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag; - struct page *page = bh->b_page; __u8 *addr; __u32 csum32; __be32 seq; @@ -331,11 +328,10 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, return; seq = cpu_to_be32(sequence); - addr = kmap_atomic(page); + addr = kmap_local_folio(bh->b_folio, bh_offset(bh)); csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); - csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data), - bh->b_size); - kunmap_atomic(addr); + csum32 = jbd2_chksum(j, csum32, addr, bh->b_size); + kunmap_local(addr); if (jbd2_has_feature_csum3(j)) tag3->t_checksum = cpu_to_be32(csum32); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 768fa05bcbed..30dec2bd2ecc 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1601,6 +1601,8 @@ static journal_t *journal_init_common(struct block_device *bdev, err_cleanup: percpu_counter_destroy(&journal->j_checkpoint_jh_count); + if (journal->j_chksum_driver) + crypto_free_shash(journal->j_chksum_driver); kfree(journal->j_wbuf); jbd2_journal_destroy_revoke(journal); journal_fail_superblock(journal); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 4d1fda1f7143..5f08b5fd105a 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -935,19 +935,15 @@ static void warn_dirty_buffer(struct buffer_head *bh) /* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */ static void jbd2_freeze_jh_data(struct journal_head *jh) { - struct page *page; - int offset; char *source; struct buffer_head *bh = jh2bh(jh); J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n"); - page = bh->b_page; - offset = offset_in_page(bh->b_data); - source = kmap_atomic(page); + source = kmap_local_folio(bh->b_folio, bh_offset(bh)); /* Fire data frozen trigger just before we copy the data */ - jbd2_buffer_frozen_trigger(jh, source + offset, jh->b_triggers); - memcpy(jh->b_frozen_data, source + offset, bh->b_size); - kunmap_atomic(source); + jbd2_buffer_frozen_trigger(jh, source, jh->b_triggers); + memcpy(jh->b_frozen_data, source, bh->b_size); + kunmap_local(source); /* * Now that the frozen data is saved off, we need to store any matching diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 091ab0eaabbe..2b2938970da3 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -204,8 +204,8 @@ static int jffs2_create(struct mnt_idmap *idmap, struct inode *dir_i, if (ret) goto fail; - dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, - ITIME(je32_to_cpu(ri->ctime))); + inode_set_mtime_to_ts(dir_i, + inode_set_ctime_to_ts(dir_i, ITIME(je32_to_cpu(ri->ctime)))); jffs2_free_raw_inode(ri); @@ -238,7 +238,8 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry) if (dead_f->inocache) set_nlink(d_inode(dentry), dead_f->inocache->pino_nlink); if (!ret) - dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, ITIME(now)); + inode_set_mtime_to_ts(dir_i, + inode_set_ctime_to_ts(dir_i, ITIME(now))); return ret; } /***********************************************************************/ @@ -272,7 +273,8 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de set_nlink(d_inode(old_dentry), ++f->inocache->pino_nlink); mutex_unlock(&f->sem); d_instantiate(dentry, d_inode(old_dentry)); - dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, ITIME(now)); + inode_set_mtime_to_ts(dir_i, + inode_set_ctime_to_ts(dir_i, ITIME(now))); ihold(d_inode(old_dentry)); } return ret; @@ -423,8 +425,8 @@ static int jffs2_symlink (struct mnt_idmap *idmap, struct inode *dir_i, goto fail; } - dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, - ITIME(je32_to_cpu(rd->mctime))); + inode_set_mtime_to_ts(dir_i, + inode_set_ctime_to_ts(dir_i, ITIME(je32_to_cpu(rd->mctime)))); jffs2_free_raw_dirent(rd); @@ -568,8 +570,8 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i, goto fail; } - dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, - ITIME(je32_to_cpu(rd->mctime))); + inode_set_mtime_to_ts(dir_i, + inode_set_ctime_to_ts(dir_i, ITIME(je32_to_cpu(rd->mctime)))); inc_nlink(dir_i); jffs2_free_raw_dirent(rd); @@ -610,7 +612,8 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name, dentry->d_name.len, f, now); if (!ret) { - dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, ITIME(now)); + inode_set_mtime_to_ts(dir_i, + inode_set_ctime_to_ts(dir_i, ITIME(now))); clear_nlink(d_inode(dentry)); drop_nlink(dir_i); } @@ -746,8 +749,8 @@ static int jffs2_mknod (struct mnt_idmap *idmap, struct inode *dir_i, goto fail; } - dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, - ITIME(je32_to_cpu(rd->mctime))); + inode_set_mtime_to_ts(dir_i, + inode_set_ctime_to_ts(dir_i, ITIME(je32_to_cpu(rd->mctime)))); jffs2_free_raw_dirent(rd); @@ -868,16 +871,18 @@ static int jffs2_rename (struct mnt_idmap *idmap, * caller won't do it on its own since we are returning an error. */ d_invalidate(new_dentry); - new_dir_i->i_mtime = inode_set_ctime_to_ts(new_dir_i, - ITIME(now)); + inode_set_mtime_to_ts(new_dir_i, + inode_set_ctime_to_ts(new_dir_i, ITIME(now))); return ret; } if (d_is_dir(old_dentry)) drop_nlink(old_dir_i); - old_dir_i->i_mtime = inode_set_ctime_to_ts(old_dir_i, ITIME(now)); - new_dir_i->i_mtime = inode_set_ctime_to_ts(new_dir_i, ITIME(now)); + inode_set_mtime_to_ts(old_dir_i, + inode_set_ctime_to_ts(old_dir_i, ITIME(now))); + inode_set_mtime_to_ts(new_dir_i, + inode_set_ctime_to_ts(new_dir_i, ITIME(now))); return 0; } diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 11c66793960e..62ea76da7fdf 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -317,8 +317,8 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, inode->i_size = pos + writtenlen; inode->i_blocks = (inode->i_size + 511) >> 9; - inode->i_mtime = inode_set_ctime_to_ts(inode, - ITIME(je32_to_cpu(ri->ctime))); + inode_set_mtime_to_ts(inode, + inode_set_ctime_to_ts(inode, ITIME(je32_to_cpu(ri->ctime)))); } } diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 0403efab4089..d175cccb7c55 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -113,8 +113,8 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) ri->isize = cpu_to_je32((ivalid & ATTR_SIZE)?iattr->ia_size:inode->i_size); - ri->atime = cpu_to_je32(I_SEC((ivalid & ATTR_ATIME)?iattr->ia_atime:inode->i_atime)); - ri->mtime = cpu_to_je32(I_SEC((ivalid & ATTR_MTIME)?iattr->ia_mtime:inode->i_mtime)); + ri->atime = cpu_to_je32(I_SEC((ivalid & ATTR_ATIME)?iattr->ia_atime:inode_get_atime(inode))); + ri->mtime = cpu_to_je32(I_SEC((ivalid & ATTR_MTIME)?iattr->ia_mtime:inode_get_mtime(inode))); ri->ctime = cpu_to_je32(I_SEC((ivalid & ATTR_CTIME)?iattr->ia_ctime:inode_get_ctime(inode))); ri->offset = cpu_to_je32(0); @@ -147,9 +147,9 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) return PTR_ERR(new_metadata); } /* It worked. Update the inode */ - inode->i_atime = ITIME(je32_to_cpu(ri->atime)); + inode_set_atime_to_ts(inode, ITIME(je32_to_cpu(ri->atime))); inode_set_ctime_to_ts(inode, ITIME(je32_to_cpu(ri->ctime))); - inode->i_mtime = ITIME(je32_to_cpu(ri->mtime)); + inode_set_mtime_to_ts(inode, ITIME(je32_to_cpu(ri->mtime))); inode->i_mode = jemode_to_cpu(ri->mode); i_uid_write(inode, je16_to_cpu(ri->uid)); i_gid_write(inode, je16_to_cpu(ri->gid)); @@ -282,8 +282,8 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) i_uid_write(inode, je16_to_cpu(latest_node.uid)); i_gid_write(inode, je16_to_cpu(latest_node.gid)); inode->i_size = je32_to_cpu(latest_node.isize); - inode->i_atime = ITIME(je32_to_cpu(latest_node.atime)); - inode->i_mtime = ITIME(je32_to_cpu(latest_node.mtime)); + inode_set_atime_to_ts(inode, ITIME(je32_to_cpu(latest_node.atime))); + inode_set_mtime_to_ts(inode, ITIME(je32_to_cpu(latest_node.mtime))); inode_set_ctime_to_ts(inode, ITIME(je32_to_cpu(latest_node.ctime))); set_nlink(inode, f->inocache->pino_nlink); @@ -386,8 +386,8 @@ void jffs2_dirty_inode(struct inode *inode, int flags) iattr.ia_mode = inode->i_mode; iattr.ia_uid = inode->i_uid; iattr.ia_gid = inode->i_gid; - iattr.ia_atime = inode->i_atime; - iattr.ia_mtime = inode->i_mtime; + iattr.ia_atime = inode_get_atime(inode); + iattr.ia_mtime = inode_get_mtime(inode); iattr.ia_ctime = inode_get_ctime(inode); jffs2_do_setattr(inode, &iattr); @@ -475,8 +475,8 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r inode->i_mode = jemode_to_cpu(ri->mode); i_gid_write(inode, je16_to_cpu(ri->gid)); i_uid_write(inode, je16_to_cpu(ri->uid)); - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); - ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode->i_mtime)); + simple_inode_init_ts(inode); + ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode_get_mtime(inode))); inode->i_blocks = 0; inode->i_size = 0; diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 50727a1ff931..86ab014a349c 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -36,8 +36,8 @@ struct kvec; #define JFFS2_NOW() JFFS2_CLAMP_TIME(ktime_get_real_seconds()) #define I_SEC(tv) JFFS2_CLAMP_TIME((tv).tv_sec) #define JFFS2_F_I_CTIME(f) I_SEC(inode_get_ctime(OFNI_EDONI_2SFFJ(f))) -#define JFFS2_F_I_MTIME(f) I_SEC(OFNI_EDONI_2SFFJ(f)->i_mtime) -#define JFFS2_F_I_ATIME(f) I_SEC(OFNI_EDONI_2SFFJ(f)->i_atime) +#define JFFS2_F_I_MTIME(f) I_SEC(inode_get_mtime(OFNI_EDONI_2SFFJ(f))) +#define JFFS2_F_I_ATIME(f) I_SEC(inode_get_atime(OFNI_EDONI_2SFFJ(f))) #define sleep_on_spinunlock(wq, s) \ do { \ DECLARE_WAITQUEUE(__wait, current); \ diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index 3b6bdc9a49e1..00224f3a8d6e 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -920,7 +920,7 @@ struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c, * do_jffs2_setxattr(inode, xprefix, xname, buffer, size, flags) * is an implementation of setxattr handler on jffs2. * -------------------------------------------------- */ -const struct xattr_handler *jffs2_xattr_handlers[] = { +const struct xattr_handler * const jffs2_xattr_handlers[] = { &jffs2_user_xattr_handler, #ifdef CONFIG_JFFS2_FS_SECURITY &jffs2_security_xattr_handler, diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h index 1b5030a3349d..7e7de093ec0a 100644 --- a/fs/jffs2/xattr.h +++ b/fs/jffs2/xattr.h @@ -94,7 +94,7 @@ extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname extern int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname, const char *buffer, size_t size, int flags); -extern const struct xattr_handler *jffs2_xattr_handlers[]; +extern const struct xattr_handler * const jffs2_xattr_handlers[]; extern const struct xattr_handler jffs2_user_xattr_handler; extern const struct xattr_handler jffs2_trusted_xattr_handler; diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 920d58a1566b..1a6b5921d17a 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -393,7 +393,7 @@ void jfs_truncate_nolock(struct inode *ip, loff_t length) break; } - ip->i_mtime = inode_set_ctime_current(ip); + inode_set_mtime_to_ts(ip, inode_set_ctime_current(ip)); mark_inode_dirty(ip); txCommit(tid, 1, &ip, 0); diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 923a58422c46..8e87264e56ce 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -3061,10 +3061,10 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip) } ip->i_size = le64_to_cpu(dip->di_size); - ip->i_atime.tv_sec = le32_to_cpu(dip->di_atime.tv_sec); - ip->i_atime.tv_nsec = le32_to_cpu(dip->di_atime.tv_nsec); - ip->i_mtime.tv_sec = le32_to_cpu(dip->di_mtime.tv_sec); - ip->i_mtime.tv_nsec = le32_to_cpu(dip->di_mtime.tv_nsec); + inode_set_atime(ip, le32_to_cpu(dip->di_atime.tv_sec), + le32_to_cpu(dip->di_atime.tv_nsec)); + inode_set_mtime(ip, le32_to_cpu(dip->di_mtime.tv_sec), + le32_to_cpu(dip->di_mtime.tv_nsec)); inode_set_ctime(ip, le32_to_cpu(dip->di_ctime.tv_sec), le32_to_cpu(dip->di_ctime.tv_nsec)); ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks)); @@ -3138,12 +3138,12 @@ static void copy_to_dinode(struct dinode * dip, struct inode *ip) else /* Leave the original permissions alone */ dip->di_mode = cpu_to_le32(jfs_ip->mode2); - dip->di_atime.tv_sec = cpu_to_le32(ip->i_atime.tv_sec); - dip->di_atime.tv_nsec = cpu_to_le32(ip->i_atime.tv_nsec); - dip->di_ctime.tv_sec = cpu_to_le32(inode_get_ctime(ip).tv_sec); - dip->di_ctime.tv_nsec = cpu_to_le32(inode_get_ctime(ip).tv_nsec); - dip->di_mtime.tv_sec = cpu_to_le32(ip->i_mtime.tv_sec); - dip->di_mtime.tv_nsec = cpu_to_le32(ip->i_mtime.tv_nsec); + dip->di_atime.tv_sec = cpu_to_le32(inode_get_atime_sec(ip)); + dip->di_atime.tv_nsec = cpu_to_le32(inode_get_atime_nsec(ip)); + dip->di_ctime.tv_sec = cpu_to_le32(inode_get_ctime_sec(ip)); + dip->di_ctime.tv_nsec = cpu_to_le32(inode_get_ctime_nsec(ip)); + dip->di_mtime.tv_sec = cpu_to_le32(inode_get_mtime_sec(ip)); + dip->di_mtime.tv_nsec = cpu_to_le32(inode_get_mtime_nsec(ip)); dip->di_ixpxd = jfs_ip->ixpxd; /* in-memory pxd's are little-endian */ dip->di_acl = jfs_ip->acl; /* as are dxd's */ dip->di_ea = jfs_ip->ea; diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 87594efa7f7c..f10f295d1502 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -97,8 +97,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode) jfs_inode->mode2 |= inode->i_mode; inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); - jfs_inode->otime = inode_get_ctime(inode).tv_sec; + simple_inode_init_ts(inode); + jfs_inode->otime = inode_get_ctime_sec(inode); inode->i_generation = JFS_SBI(sb)->gengen++; jfs_inode->cflag = 0; diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index e855b8fde76c..cb6d1fda66a7 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1058,7 +1058,7 @@ void jfs_syncpt(struct jfs_log *log, int hard_sync) int lmLogOpen(struct super_block *sb) { int rc; - struct block_device *bdev; + struct bdev_handle *bdev_handle; struct jfs_log *log; struct jfs_sb_info *sbi = JFS_SBI(sb); @@ -1070,7 +1070,7 @@ int lmLogOpen(struct super_block *sb) mutex_lock(&jfs_log_mutex); list_for_each_entry(log, &jfs_external_logs, journal_list) { - if (log->bdev->bd_dev == sbi->logdev) { + if (log->bdev_handle->bdev->bd_dev == sbi->logdev) { if (!uuid_equal(&log->uuid, &sbi->loguuid)) { jfs_warn("wrong uuid on JFS journal"); mutex_unlock(&jfs_log_mutex); @@ -1100,14 +1100,14 @@ int lmLogOpen(struct super_block *sb) * file systems to log may have n-to-1 relationship; */ - bdev = blkdev_get_by_dev(sbi->logdev, BLK_OPEN_READ | BLK_OPEN_WRITE, - log, NULL); - if (IS_ERR(bdev)) { - rc = PTR_ERR(bdev); + bdev_handle = bdev_open_by_dev(sbi->logdev, + BLK_OPEN_READ | BLK_OPEN_WRITE, log, NULL); + if (IS_ERR(bdev_handle)) { + rc = PTR_ERR(bdev_handle); goto free; } - log->bdev = bdev; + log->bdev_handle = bdev_handle; uuid_copy(&log->uuid, &sbi->loguuid); /* @@ -1141,7 +1141,7 @@ journal_found: lbmLogShutdown(log); close: /* close external log device */ - blkdev_put(bdev, log); + bdev_release(bdev_handle); free: /* free log descriptor */ mutex_unlock(&jfs_log_mutex); @@ -1162,7 +1162,7 @@ static int open_inline_log(struct super_block *sb) init_waitqueue_head(&log->syncwait); set_bit(log_INLINELOG, &log->flag); - log->bdev = sb->s_bdev; + log->bdev_handle = sb->s_bdev_handle; log->base = addressPXD(&JFS_SBI(sb)->logpxd); log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >> (L2LOGPSIZE - sb->s_blocksize_bits); @@ -1436,7 +1436,7 @@ int lmLogClose(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_log *log = sbi->log; - struct block_device *bdev; + struct bdev_handle *bdev_handle; int rc = 0; jfs_info("lmLogClose: log:0x%p", log); @@ -1482,10 +1482,10 @@ int lmLogClose(struct super_block *sb) * external log as separate logical volume */ list_del(&log->journal_list); - bdev = log->bdev; + bdev_handle = log->bdev_handle; rc = lmLogShutdown(log); - blkdev_put(bdev, log); + bdev_release(bdev_handle); kfree(log); @@ -1972,7 +1972,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bp->l_flag |= lbmREAD; - bio = bio_alloc(log->bdev, 1, REQ_OP_READ, GFP_NOFS); + bio = bio_alloc(log->bdev_handle->bdev, 1, REQ_OP_READ, GFP_NOFS); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); __bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); @@ -2110,10 +2110,15 @@ static void lbmStartIO(struct lbuf * bp) { struct bio *bio; struct jfs_log *log = bp->l_log; + struct block_device *bdev = NULL; jfs_info("lbmStartIO"); - bio = bio_alloc(log->bdev, 1, REQ_OP_WRITE | REQ_SYNC, GFP_NOFS); + if (!log->no_integrity) + bdev = log->bdev_handle->bdev; + + bio = bio_alloc(bdev, 1, REQ_OP_WRITE | REQ_SYNC, + GFP_NOFS); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); __bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); diff --git a/fs/jfs/jfs_logmgr.h b/fs/jfs/jfs_logmgr.h index 805877ce5020..84aa2d253907 100644 --- a/fs/jfs/jfs_logmgr.h +++ b/fs/jfs/jfs_logmgr.h @@ -356,7 +356,7 @@ struct jfs_log { * before writing syncpt. */ struct list_head journal_list; /* Global list */ - struct block_device *bdev; /* 4: log lv pointer */ + struct bdev_handle *bdev_handle; /* 4: log lv pointer */ int serial; /* 4: log mount serial number */ s64 base; /* @8: log extent address (inline log ) */ diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c index b83aae56a1f2..415eb65a36ff 100644 --- a/fs/jfs/jfs_mount.c +++ b/fs/jfs/jfs_mount.c @@ -430,7 +430,8 @@ int updateSuper(struct super_block *sb, uint state) if (state == FM_MOUNT) { /* record log's dev_t and mount serial number */ - j_sb->s_logdev = cpu_to_le32(new_encode_dev(sbi->log->bdev->bd_dev)); + j_sb->s_logdev = cpu_to_le32( + new_encode_dev(sbi->log->bdev_handle->bdev->bd_dev)); j_sb->s_logserial = cpu_to_le32(sbi->log->serial); } else if (state == FM_CLEAN) { /* diff --git a/fs/jfs/jfs_xattr.h b/fs/jfs/jfs_xattr.h index 0d33816d251d..ec67d8554d2c 100644 --- a/fs/jfs/jfs_xattr.h +++ b/fs/jfs/jfs_xattr.h @@ -46,7 +46,7 @@ extern int __jfs_setxattr(tid_t, struct inode *, const char *, const void *, extern ssize_t __jfs_getxattr(struct inode *, const char *, void *, size_t); extern ssize_t jfs_listxattr(struct dentry *, char *, size_t); -extern const struct xattr_handler *jfs_xattr_handlers[]; +extern const struct xattr_handler * const jfs_xattr_handlers[]; #ifdef CONFIG_JFS_SECURITY extern int jfs_init_security(tid_t, struct inode *, struct inode *, diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 57d7a4300210..d68a4e6ac345 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -149,7 +149,7 @@ static int jfs_create(struct mnt_idmap *idmap, struct inode *dip, mark_inode_dirty(ip); - dip->i_mtime = inode_set_ctime_current(dip); + inode_set_mtime_to_ts(dip, inode_set_ctime_current(dip)); mark_inode_dirty(dip); @@ -284,7 +284,7 @@ static int jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip, /* update parent directory inode */ inc_nlink(dip); /* for '..' from child directory */ - dip->i_mtime = inode_set_ctime_current(dip); + inode_set_mtime_to_ts(dip, inode_set_ctime_current(dip)); mark_inode_dirty(dip); rc = txCommit(tid, 2, &iplist[0], 0); @@ -390,7 +390,7 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) /* update parent directory's link count corresponding * to ".." entry of the target directory deleted */ - dip->i_mtime = inode_set_ctime_current(dip); + inode_set_mtime_to_ts(dip, inode_set_ctime_current(dip)); inode_dec_link_count(dip); /* @@ -512,7 +512,8 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) ASSERT(ip->i_nlink); - dip->i_mtime = inode_set_ctime_to_ts(dip, inode_set_ctime_current(ip)); + inode_set_mtime_to_ts(dip, + inode_set_ctime_to_ts(dip, inode_set_ctime_current(ip))); mark_inode_dirty(dip); /* update target's inode */ @@ -828,7 +829,7 @@ static int jfs_link(struct dentry *old_dentry, /* update object inode */ inc_nlink(ip); /* for new link */ inode_set_ctime_current(ip); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); ihold(ip); @@ -1028,7 +1029,7 @@ static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip, mark_inode_dirty(ip); - dip->i_mtime = inode_set_ctime_current(dip); + inode_set_mtime_to_ts(dip, inode_set_ctime_current(dip)); mark_inode_dirty(dip); /* * commit update of parent directory and link object @@ -1271,7 +1272,7 @@ static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, inode_set_ctime_current(old_ip); mark_inode_dirty(old_ip); - new_dir->i_mtime = inode_set_ctime_current(new_dir); + inode_set_mtime_to_ts(new_dir, inode_set_ctime_current(new_dir)); mark_inode_dirty(new_dir); /* Build list of inodes modified by this transaction */ @@ -1283,7 +1284,8 @@ static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (old_dir != new_dir) { iplist[ipcount++] = new_dir; - old_dir->i_mtime = inode_set_ctime_current(old_dir); + inode_set_mtime_to_ts(old_dir, + inode_set_ctime_current(old_dir)); mark_inode_dirty(old_dir); } @@ -1416,7 +1418,7 @@ static int jfs_mknod(struct mnt_idmap *idmap, struct inode *dir, mark_inode_dirty(ip); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 2e2f7f6d36a0..966826c394ee 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -818,7 +818,7 @@ out: } if (inode->i_size < off+len-towrite) i_size_write(inode, off+len-towrite); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); inode_unlock(inode); return len - towrite; diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 8577ad494e05..0fb7afac298e 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -985,7 +985,7 @@ static const struct xattr_handler jfs_trusted_xattr_handler = { .set = jfs_xattr_set, }; -const struct xattr_handler *jfs_xattr_handlers[] = { +const struct xattr_handler * const jfs_xattr_handlers[] = { &jfs_os2_xattr_handler, &jfs_user_xattr_handler, &jfs_security_xattr_handler, diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 922719a343a7..b83054da68b3 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -151,7 +151,7 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) static inline void set_default_inode_attr(struct inode *inode, umode_t mode) { inode->i_mode = mode; - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); } static inline void set_inode_attr(struct inode *inode, @@ -159,8 +159,8 @@ static inline void set_inode_attr(struct inode *inode, { inode->i_uid = attrs->ia_uid; inode->i_gid = attrs->ia_gid; - inode->i_atime = attrs->ia_atime; - inode->i_mtime = attrs->ia_mtime; + inode_set_atime_to_ts(inode, attrs->ia_atime); + inode_set_mtime_to_ts(inode, attrs->ia_mtime); inode_set_ctime_to_ts(inode, attrs->ia_ctime); } @@ -445,7 +445,7 @@ static const struct xattr_handler kernfs_user_xattr_handler = { .set = kernfs_vfs_user_xattr_set, }; -const struct xattr_handler *kernfs_xattr_handlers[] = { +const struct xattr_handler * const kernfs_xattr_handlers[] = { &kernfs_trusted_xattr_handler, &kernfs_security_xattr_handler, &kernfs_user_xattr_handler, diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index a9b854cdfdb5..237f2764b941 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -127,7 +127,7 @@ extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache; /* * inode.c */ -extern const struct xattr_handler *kernfs_xattr_handlers[]; +extern const struct xattr_handler * const kernfs_xattr_handlers[]; void kernfs_evict_inode(struct inode *inode); int kernfs_iop_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); diff --git a/fs/libfs.c b/fs/libfs.c index a4eb12757886..abe2b5a40ba1 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -541,7 +541,8 @@ void simple_recursive_removal(struct dentry *dentry, dput(victim); // unpin it } if (victim == dentry) { - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, + inode_set_ctime_current(inode)); if (d_is_dir(dentry)) drop_nlink(inode); inode_unlock(inode); @@ -582,7 +583,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc) */ root->i_ino = 1; root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; - root->i_atime = root->i_mtime = inode_set_ctime_current(root); + simple_inode_init_ts(root); s->s_root = d_make_root(root); if (!s->s_root) return -ENOMEM; @@ -638,8 +639,8 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den { struct inode *inode = d_inode(old_dentry); - dir->i_mtime = inode_set_ctime_to_ts(dir, - inode_set_ctime_current(inode)); + inode_set_mtime_to_ts(dir, + inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); inc_nlink(inode); ihold(inode); dget(dentry); @@ -673,8 +674,8 @@ int simple_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); - dir->i_mtime = inode_set_ctime_to_ts(dir, - inode_set_ctime_current(inode)); + inode_set_mtime_to_ts(dir, + inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); drop_nlink(inode); dput(dentry); return 0; @@ -709,9 +710,10 @@ void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry, { struct inode *newino = d_inode(new_dentry); - old_dir->i_mtime = inode_set_ctime_current(old_dir); + inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir)); if (new_dir != old_dir) - new_dir->i_mtime = inode_set_ctime_current(new_dir); + inode_set_mtime_to_ts(new_dir, + inode_set_ctime_current(new_dir)); inode_set_ctime_current(d_inode(old_dentry)); if (newino) inode_set_ctime_current(newino); @@ -926,7 +928,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic, */ inode->i_ino = 1; inode->i_mode = S_IFDIR | 0755; - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; set_nlink(inode, 2); @@ -952,7 +954,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic, goto out; } inode->i_mode = S_IFREG | files->mode; - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); inode->i_fop = files->ops; inode->i_ino = i; d_add(dentry, inode); @@ -1520,7 +1522,7 @@ struct inode *alloc_anon_inode(struct super_block *s) inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_flags |= S_PRIVATE; - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); return inode; } EXPORT_SYMBOL(alloc_anon_inode); @@ -1903,6 +1905,7 @@ ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter, * We don't know how much we wrote, so just return the number of * bytes which were direct-written */ + iocb->ki_pos -= buffered_written; if (direct_written) return direct_written; return err; @@ -1911,3 +1914,20 @@ ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter, return direct_written + buffered_written; } EXPORT_SYMBOL_GPL(direct_write_fallback); + +/** + * simple_inode_init_ts - initialize the timestamps for a new inode + * @inode: inode to be initialized + * + * When a new inode is created, most filesystems set the timestamps to the + * current time. Add a helper to do this. + */ +struct timespec64 simple_inode_init_ts(struct inode *inode) +{ + struct timespec64 ts = inode_set_ctime_current(inode); + + inode_set_atime_to_ts(inode, ts); + inode_set_mtime_to_ts(inode, ts); + return ts; +} +EXPORT_SYMBOL(simple_inode_init_ts); diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 6579948070a4..81be07c1d3d1 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -24,7 +24,6 @@ #include <linux/uio.h> #include <linux/smp.h> #include <linux/mutex.h> -#include <linux/kthread.h> #include <linux/freezer.h> #include <linux/inetdevice.h> @@ -135,11 +134,11 @@ lockd(void *vrqstp) * The main request loop. We don't terminate until the last * NFS mount or NFS daemon has gone away. */ - while (!kthread_should_stop()) { + while (!svc_thread_should_stop(rqstp)) { /* update sv_maxconn if it has changed */ rqstp->rq_server->sv_maxconn = nlm_max_connections; - nlmsvc_retry_blocked(); + nlmsvc_retry_blocked(rqstp); svc_recv(rqstp); } if (nlmsvc_ops) @@ -373,7 +372,9 @@ static void lockd_put(void) unregister_inet6addr_notifier(&lockd_inet6addr_notifier); #endif + svc_get(nlmsvc_serv); svc_set_num_threads(nlmsvc_serv, NULL, 0); + svc_put(nlmsvc_serv); timer_delete_sync(&nlmsvc_retry); nlmsvc_serv = NULL; dprintk("lockd_down: service destroyed\n"); diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 43aeba9de55c..2dc10900ad1c 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -30,7 +30,6 @@ #include <linux/sunrpc/svc_xprt.h> #include <linux/lockd/nlm.h> #include <linux/lockd/lockd.h> -#include <linux/kthread.h> #include <linux/exportfs.h> #define NLMDBG_FACILITY NLMDBG_SVCLOCK @@ -481,9 +480,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_host *host, struct nlm_lock *lock, int wait, struct nlm_cookie *cookie, int reclaim) { -#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) struct inode *inode = nlmsvc_file_inode(file); -#endif struct nlm_block *block = NULL; int error; int mode; @@ -497,7 +494,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, (long long)lock->fl.fl_end, wait); - if (nlmsvc_file_file(file)->f_op->lock) { + if (!exportfs_lock_op_is_async(inode->i_sb->s_export_op)) { async_block = wait; wait = 0; } @@ -543,6 +540,25 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } + spin_lock(&nlm_blocked_lock); + /* + * If this is a lock request for an already pending + * lock request we return nlm_lck_blocked without calling + * vfs_lock_file() again. Otherwise we have two pending + * requests on the underlaying ->lock() implementation but + * only one nlm_block to being granted by lm_grant(). + */ + if (exportfs_lock_op_is_async(inode->i_sb->s_export_op) && + !list_empty(&block->b_list)) { + spin_unlock(&nlm_blocked_lock); + ret = nlm_lck_blocked; + goto out; + } + + /* Append to list of blocked */ + nlmsvc_insert_block_locked(block, NLM_NEVER); + spin_unlock(&nlm_blocked_lock); + if (!wait) lock->fl.fl_flags &= ~FL_SLEEP; mode = lock_to_openmode(&lock->fl); @@ -552,16 +568,12 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, dprintk("lockd: vfs_lock_file returned %d\n", error); switch (error) { case 0: + nlmsvc_remove_block(block); ret = nlm_granted; goto out; case -EAGAIN: - /* - * If this is a blocking request for an - * already pending lock request then we need - * to put it back on lockd's block list - */ - if (wait) - break; + if (!wait) + nlmsvc_remove_block(block); ret = async_block ? nlm_lck_blocked : nlm_lck_denied; goto out; case FILE_LOCK_DEFERRED: @@ -572,17 +584,16 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, ret = nlmsvc_defer_lock_rqst(rqstp, block); goto out; case -EDEADLK: + nlmsvc_remove_block(block); ret = nlm_deadlock; goto out; default: /* includes ENOLCK */ + nlmsvc_remove_block(block); ret = nlm_lck_denied_nolocks; goto out; } ret = nlm_lck_blocked; - - /* Append to list of blocked */ - nlmsvc_insert_block(block, NLM_NEVER); out: mutex_unlock(&file->f_mutex); nlmsvc_release_block(block); @@ -1020,13 +1031,13 @@ retry_deferred_block(struct nlm_block *block) * be retransmitted. */ void -nlmsvc_retry_blocked(void) +nlmsvc_retry_blocked(struct svc_rqst *rqstp) { unsigned long timeout = MAX_SCHEDULE_TIMEOUT; struct nlm_block *block; spin_lock(&nlm_blocked_lock); - while (!list_empty(&nlm_blocked) && !kthread_should_stop()) { + while (!list_empty(&nlm_blocked) && !svc_thread_should_stop(rqstp)) { block = list_entry(nlm_blocked.next, struct nlm_block, b_list); if (block->b_when == NLM_NEVER) diff --git a/fs/locks.c b/fs/locks.c index 76ad05f8070a..d4e49a990a8d 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2264,11 +2264,13 @@ out: * To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX * locks, the ->lock() interface may return asynchronously, before the lock has * been granted or denied by the underlying filesystem, if (and only if) - * lm_grant is set. Callers expecting ->lock() to return asynchronously - * will only use F_SETLK, not F_SETLKW; they will set FL_SLEEP if (and only if) - * the request is for a blocking lock. When ->lock() does return asynchronously, - * it must return FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock - * request completes. + * lm_grant is set. Additionally EXPORT_OP_ASYNC_LOCK in export_operations + * flags need to be set. + * + * Callers expecting ->lock() to return asynchronously will only use F_SETLK, + * not F_SETLKW; they will set FL_SLEEP if (and only if) the request is for a + * blocking lock. When ->lock() does return asynchronously, it must return + * FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock request completes. * If the request is for non-blocking lock the file system should return * FILE_LOCK_DEFERRED then try to get the lock and call the callback routine * with the result. If the request timed out the callback routine will return a diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index 25c08fbfcb9d..7da66ca184f4 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c @@ -251,7 +251,7 @@ struct inode *minix_new_inode(const struct inode *dir, umode_t mode) } inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_ino = j; - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); inode->i_blocks = 0; memset(&minix_i(inode)->u, 0, sizeof(minix_i(inode)->u)); insert_inode_hash(inode); diff --git a/fs/minix/dir.c b/fs/minix/dir.c index 20f23e6e58ad..62c313fc9a49 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -281,7 +281,7 @@ got_it: de->inode = inode->i_ino; } dir_commit_chunk(page, pos, sbi->s_dirsize); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); err = minix_handle_dirsync(dir); out_put: @@ -313,7 +313,7 @@ int minix_delete_entry(struct minix_dir_entry *de, struct page *page) else de->inode = 0; dir_commit_chunk(page, pos, len); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); return minix_handle_dirsync(inode); } @@ -436,7 +436,7 @@ int minix_set_link(struct minix_dir_entry *de, struct page *page, else de->inode = inode->i_ino; dir_commit_chunk(page, pos, sbi->s_dirsize); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); return minix_handle_dirsync(dir); } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index df575473c1cc..f8af6c3ae336 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -501,7 +501,8 @@ static struct inode *V1_minix_iget(struct inode *inode) i_gid_write(inode, raw_inode->i_gid); set_nlink(inode, raw_inode->i_nlinks); inode->i_size = raw_inode->i_size; - inode->i_mtime = inode->i_atime = inode_set_ctime(inode, raw_inode->i_time, 0); + inode_set_mtime_to_ts(inode, + inode_set_atime_to_ts(inode, inode_set_ctime(inode, raw_inode->i_time, 0))); inode->i_blocks = 0; for (i = 0; i < 9; i++) minix_inode->u.i1_data[i] = raw_inode->i_zone[i]; @@ -538,11 +539,9 @@ static struct inode *V2_minix_iget(struct inode *inode) i_gid_write(inode, raw_inode->i_gid); set_nlink(inode, raw_inode->i_nlinks); inode->i_size = raw_inode->i_size; - inode->i_mtime.tv_sec = raw_inode->i_mtime; - inode->i_atime.tv_sec = raw_inode->i_atime; + inode_set_mtime(inode, raw_inode->i_mtime, 0); + inode_set_atime(inode, raw_inode->i_atime, 0); inode_set_ctime(inode, raw_inode->i_ctime, 0); - inode->i_mtime.tv_nsec = 0; - inode->i_atime.tv_nsec = 0; inode->i_blocks = 0; for (i = 0; i < 10; i++) minix_inode->u.i2_data[i] = raw_inode->i_zone[i]; @@ -589,7 +588,7 @@ static struct buffer_head * V1_minix_update_inode(struct inode * inode) raw_inode->i_gid = fs_high2lowgid(i_gid_read(inode)); raw_inode->i_nlinks = inode->i_nlink; raw_inode->i_size = inode->i_size; - raw_inode->i_time = inode->i_mtime.tv_sec; + raw_inode->i_time = inode_get_mtime_sec(inode); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) raw_inode->i_zone[0] = old_encode_dev(inode->i_rdev); else for (i = 0; i < 9; i++) @@ -616,9 +615,9 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode) raw_inode->i_gid = fs_high2lowgid(i_gid_read(inode)); raw_inode->i_nlinks = inode->i_nlink; raw_inode->i_size = inode->i_size; - raw_inode->i_mtime = inode->i_mtime.tv_sec; - raw_inode->i_atime = inode->i_atime.tv_sec; - raw_inode->i_ctime = inode_get_ctime(inode).tv_sec; + raw_inode->i_mtime = inode_get_mtime_sec(inode); + raw_inode->i_atime = inode_get_atime_sec(inode); + raw_inode->i_ctime = inode_get_ctime_sec(inode); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) raw_inode->i_zone[0] = old_encode_dev(inode->i_rdev); else for (i = 0; i < 10; i++) diff --git a/fs/minix/itree_common.c b/fs/minix/itree_common.c index ce18ae37c29d..dad131e30c05 100644 --- a/fs/minix/itree_common.c +++ b/fs/minix/itree_common.c @@ -350,7 +350,7 @@ do_indirects: } first_whole++; } - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); } diff --git a/fs/namei.c b/fs/namei.c index 567ee547492b..71c13b2990b4 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -188,7 +188,7 @@ getname_flags(const char __user *filename, int flags, int *empty) } } - result->refcnt = 1; + atomic_set(&result->refcnt, 1); /* The empty path is special. */ if (unlikely(!len)) { if (empty) @@ -249,7 +249,7 @@ getname_kernel(const char * filename) memcpy((char *)result->name, filename, len); result->uptr = NULL; result->aname = NULL; - result->refcnt = 1; + atomic_set(&result->refcnt, 1); audit_getname(result); return result; @@ -261,9 +261,10 @@ void putname(struct filename *name) if (IS_ERR(name)) return; - BUG_ON(name->refcnt <= 0); + if (WARN_ON_ONCE(!atomic_read(&name->refcnt))) + return; - if (--name->refcnt > 0) + if (!atomic_dec_and_test(&name->refcnt)) return; if (name->name != name->iname) { @@ -3104,25 +3105,6 @@ void unlock_rename(struct dentry *p1, struct dentry *p2) EXPORT_SYMBOL(unlock_rename); /** - * mode_strip_umask - handle vfs umask stripping - * @dir: parent directory of the new inode - * @mode: mode of the new inode to be created in @dir - * - * Umask stripping depends on whether or not the filesystem supports POSIX - * ACLs. If the filesystem doesn't support it umask stripping is done directly - * in here. If the filesystem does support POSIX ACLs umask stripping is - * deferred until the filesystem calls posix_acl_create(). - * - * Returns: mode - */ -static inline umode_t mode_strip_umask(const struct inode *dir, umode_t mode) -{ - if (!IS_POSIXACL(dir)) - mode &= ~current_umask(); - return mode; -} - -/** * vfs_prepare_mode - prepare the mode to be used for a new inode * @idmap: idmap of the mount the inode was found from * @dir: parent directory of the new inode @@ -3535,7 +3517,8 @@ static const char *open_last_lookups(struct nameidata *nd, if (likely(dentry)) goto finish_lookup; - BUG_ON(nd->flags & LOOKUP_RCU); + if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU)) + return ERR_PTR(-ECHILD); } else { /* create side of things */ if (nd->flags & LOOKUP_RCU) { @@ -3802,7 +3785,10 @@ static struct file *path_openat(struct nameidata *nd, WARN_ON(1); error = -EINVAL; } - fput(file); + if (unlikely(file->f_mode & FMODE_OPENED)) + fput(file); + else + release_empty_file(file); if (error == -EOPENSTALE) { if (flags & LOOKUP_RCU) error = -ECHILD; @@ -4386,11 +4372,9 @@ retry_deleg: if (!IS_ERR(dentry)) { /* Why not before? Because we want correct error value */ - if (last.name[last.len]) + if (last.name[last.len] || d_is_negative(dentry)) goto slashes; inode = dentry->d_inode; - if (d_is_negative(dentry)) - goto slashes; ihold(inode); error = security_path_unlink(&path, dentry); if (error) diff --git a/fs/namespace.c b/fs/namespace.c index e157efc54023..6bde71735efa 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -330,16 +330,16 @@ static int mnt_is_readonly(struct vfsmount *mnt) * can determine when writes are able to occur to a filesystem. */ /** - * __mnt_want_write - get write access to a mount without freeze protection + * mnt_get_write_access - get write access to a mount without freeze protection * @m: the mount on which to take a write * * This tells the low-level filesystem that a write is about to be performed to * it, and makes sure that writes are allowed (mnt it read-write) before * returning success. This operation does not protect against filesystem being - * frozen. When the write operation is finished, __mnt_drop_write() must be + * frozen. When the write operation is finished, mnt_put_write_access() must be * called. This is effectively a refcount. */ -int __mnt_want_write(struct vfsmount *m) +int mnt_get_write_access(struct vfsmount *m) { struct mount *mnt = real_mount(m); int ret = 0; @@ -386,6 +386,7 @@ int __mnt_want_write(struct vfsmount *m) return ret; } +EXPORT_SYMBOL_GPL(mnt_get_write_access); /** * mnt_want_write - get write access to a mount @@ -401,7 +402,7 @@ int mnt_want_write(struct vfsmount *m) int ret; sb_start_write(m->mnt_sb); - ret = __mnt_want_write(m); + ret = mnt_get_write_access(m); if (ret) sb_end_write(m->mnt_sb); return ret; @@ -409,15 +410,15 @@ int mnt_want_write(struct vfsmount *m) EXPORT_SYMBOL_GPL(mnt_want_write); /** - * __mnt_want_write_file - get write access to a file's mount + * mnt_get_write_access_file - get write access to a file's mount * @file: the file who's mount on which to take a write * - * This is like __mnt_want_write, but if the file is already open for writing it + * This is like mnt_get_write_access, but if @file is already open for write it * skips incrementing mnt_writers (since the open file already has a reference) * and instead only does the check for emergency r/o remounts. This must be - * paired with __mnt_drop_write_file. + * paired with mnt_put_write_access_file. */ -int __mnt_want_write_file(struct file *file) +int mnt_get_write_access_file(struct file *file) { if (file->f_mode & FMODE_WRITER) { /* @@ -428,7 +429,7 @@ int __mnt_want_write_file(struct file *file) return -EROFS; return 0; } - return __mnt_want_write(file->f_path.mnt); + return mnt_get_write_access(file->f_path.mnt); } /** @@ -445,7 +446,7 @@ int mnt_want_write_file(struct file *file) int ret; sb_start_write(file_inode(file)->i_sb); - ret = __mnt_want_write_file(file); + ret = mnt_get_write_access_file(file); if (ret) sb_end_write(file_inode(file)->i_sb); return ret; @@ -453,19 +454,20 @@ int mnt_want_write_file(struct file *file) EXPORT_SYMBOL_GPL(mnt_want_write_file); /** - * __mnt_drop_write - give up write access to a mount + * mnt_put_write_access - give up write access to a mount * @mnt: the mount on which to give up write access * * Tells the low-level filesystem that we are done * performing writes to it. Must be matched with - * __mnt_want_write() call above. + * mnt_get_write_access() call above. */ -void __mnt_drop_write(struct vfsmount *mnt) +void mnt_put_write_access(struct vfsmount *mnt) { preempt_disable(); mnt_dec_writers(real_mount(mnt)); preempt_enable(); } +EXPORT_SYMBOL_GPL(mnt_put_write_access); /** * mnt_drop_write - give up write access to a mount @@ -477,20 +479,20 @@ void __mnt_drop_write(struct vfsmount *mnt) */ void mnt_drop_write(struct vfsmount *mnt) { - __mnt_drop_write(mnt); + mnt_put_write_access(mnt); sb_end_write(mnt->mnt_sb); } EXPORT_SYMBOL_GPL(mnt_drop_write); -void __mnt_drop_write_file(struct file *file) +void mnt_put_write_access_file(struct file *file) { if (!(file->f_mode & FMODE_WRITER)) - __mnt_drop_write(file->f_path.mnt); + mnt_put_write_access(file->f_path.mnt); } void mnt_drop_write_file(struct file *file) { - __mnt_drop_write_file(file); + mnt_put_write_access_file(file); sb_end_write(file_inode(file)->i_sb); } EXPORT_SYMBOL(mnt_drop_write_file); @@ -1344,9 +1346,9 @@ void mntput(struct vfsmount *mnt) { if (mnt) { struct mount *m = real_mount(mnt); - /* avoid cacheline pingpong, hope gcc doesn't get "smart" */ + /* avoid cacheline pingpong */ if (unlikely(m->mnt_expiry_mark)) - m->mnt_expiry_mark = 0; + WRITE_ONCE(m->mnt_expiry_mark, 0); mntput_no_expire(m); } } diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 3404707ddbe7..2cd3ccf4c439 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -47,12 +47,14 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) xas_for_each(&xas, folio, last_page) { loff_t pg_end; bool pg_failed = false; + bool folio_started; if (xas_retry(&xas, folio)) continue; pg_end = folio_pos(folio) + folio_size(folio) - 1; + folio_started = false; for (;;) { loff_t sreq_end; @@ -60,8 +62,10 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) pg_failed = true; break; } - if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) + if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) { folio_start_fscache(folio); + folio_started = true; + } pg_failed |= subreq_failed; sreq_end = subreq->start + subreq->len - 1; if (pg_end < sreq_end) diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 716bc75e9ed2..b4294a8aa2d4 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -108,7 +108,7 @@ struct pnfs_block_dev { struct pnfs_block_dev *children; u64 chunk_size; - struct block_device *bdev; + struct bdev_handle *bdev_handle; u64 disk_offset; u64 pr_key; diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index 65cbb5607a5f..f318a05a80e1 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -25,17 +25,17 @@ bl_free_device(struct pnfs_block_dev *dev) } else { if (dev->pr_registered) { const struct pr_ops *ops = - dev->bdev->bd_disk->fops->pr_ops; + dev->bdev_handle->bdev->bd_disk->fops->pr_ops; int error; - error = ops->pr_register(dev->bdev, dev->pr_key, 0, - false); + error = ops->pr_register(dev->bdev_handle->bdev, + dev->pr_key, 0, false); if (error) pr_err("failed to unregister PR key.\n"); } - if (dev->bdev) - blkdev_put(dev->bdev, NULL); + if (dev->bdev_handle) + bdev_release(dev->bdev_handle); } } @@ -169,7 +169,7 @@ static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset, map->start = dev->start; map->len = dev->len; map->disk_offset = dev->disk_offset; - map->bdev = dev->bdev; + map->bdev = dev->bdev_handle->bdev; return true; } @@ -236,28 +236,26 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d, struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) { struct pnfs_block_volume *v = &volumes[idx]; - struct block_device *bdev; + struct bdev_handle *bdev_handle; dev_t dev; dev = bl_resolve_deviceid(server, v, gfp_mask); if (!dev) return -EIO; - bdev = blkdev_get_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, NULL, - NULL); - if (IS_ERR(bdev)) { + bdev_handle = bdev_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, + NULL, NULL); + if (IS_ERR(bdev_handle)) { printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", - MAJOR(dev), MINOR(dev), PTR_ERR(bdev)); - return PTR_ERR(bdev); + MAJOR(dev), MINOR(dev), PTR_ERR(bdev_handle)); + return PTR_ERR(bdev_handle); } - d->bdev = bdev; - - - d->len = bdev_nr_bytes(d->bdev); + d->bdev_handle = bdev_handle; + d->len = bdev_nr_bytes(bdev_handle->bdev); d->map = bl_map_simple; printk(KERN_INFO "pNFS: using block device %s\n", - d->bdev->bd_disk->disk_name); + bdev_handle->bdev->bd_disk->disk_name); return 0; } @@ -302,10 +300,10 @@ bl_validate_designator(struct pnfs_block_volume *v) } } -static struct block_device * +static struct bdev_handle * bl_open_path(struct pnfs_block_volume *v, const char *prefix) { - struct block_device *bdev; + struct bdev_handle *bdev_handle; const char *devname; devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/%s%*phN", @@ -313,15 +311,15 @@ bl_open_path(struct pnfs_block_volume *v, const char *prefix) if (!devname) return ERR_PTR(-ENOMEM); - bdev = blkdev_get_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE, NULL, - NULL); - if (IS_ERR(bdev)) { + bdev_handle = bdev_open_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE, + NULL, NULL); + if (IS_ERR(bdev_handle)) { pr_warn("pNFS: failed to open device %s (%ld)\n", - devname, PTR_ERR(bdev)); + devname, PTR_ERR(bdev_handle)); } kfree(devname); - return bdev; + return bdev_handle; } static int @@ -329,7 +327,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) { struct pnfs_block_volume *v = &volumes[idx]; - struct block_device *bdev; + struct bdev_handle *bdev_handle; const struct pr_ops *ops; int error; @@ -342,32 +340,32 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, * On other distributions like Debian, the default SCSI by-id path will * point to the dm-multipath device if one exists. */ - bdev = bl_open_path(v, "dm-uuid-mpath-0x"); - if (IS_ERR(bdev)) - bdev = bl_open_path(v, "wwn-0x"); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); - d->bdev = bdev; - - d->len = bdev_nr_bytes(d->bdev); + bdev_handle = bl_open_path(v, "dm-uuid-mpath-0x"); + if (IS_ERR(bdev_handle)) + bdev_handle = bl_open_path(v, "wwn-0x"); + if (IS_ERR(bdev_handle)) + return PTR_ERR(bdev_handle); + d->bdev_handle = bdev_handle; + + d->len = bdev_nr_bytes(d->bdev_handle->bdev); d->map = bl_map_simple; d->pr_key = v->scsi.pr_key; pr_info("pNFS: using block device %s (reservation key 0x%llx)\n", - d->bdev->bd_disk->disk_name, d->pr_key); + d->bdev_handle->bdev->bd_disk->disk_name, d->pr_key); - ops = d->bdev->bd_disk->fops->pr_ops; + ops = d->bdev_handle->bdev->bd_disk->fops->pr_ops; if (!ops) { pr_err("pNFS: block device %s does not support reservations.", - d->bdev->bd_disk->disk_name); + d->bdev_handle->bdev->bd_disk->disk_name); error = -EINVAL; goto out_blkdev_put; } - error = ops->pr_register(d->bdev, 0, d->pr_key, true); + error = ops->pr_register(d->bdev_handle->bdev, 0, d->pr_key, true); if (error) { pr_err("pNFS: failed to register key for block device %s.", - d->bdev->bd_disk->disk_name); + d->bdev_handle->bdev->bd_disk->disk_name); goto out_blkdev_put; } @@ -375,7 +373,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, return 0; out_blkdev_put: - blkdev_put(d->bdev, NULL); + bdev_release(d->bdev_handle); return error; } diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 466ebf1d41b2..4ffa1f469e90 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -78,7 +78,7 @@ nfs4_callback_svc(void *vrqstp) set_freezable(); - while (!kthread_freezable_should_stop(NULL)) + while (!svc_thread_should_stop(rqstp)) svc_recv(rqstp); svc_exit_thread(rqstp); @@ -86,45 +86,6 @@ nfs4_callback_svc(void *vrqstp) } #if defined(CONFIG_NFS_V4_1) -/* - * The callback service for NFSv4.1 callbacks - */ -static int -nfs41_callback_svc(void *vrqstp) -{ - struct svc_rqst *rqstp = vrqstp; - struct svc_serv *serv = rqstp->rq_server; - struct rpc_rqst *req; - int error; - DEFINE_WAIT(wq); - - set_freezable(); - - while (!kthread_freezable_should_stop(NULL)) { - prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_IDLE); - spin_lock_bh(&serv->sv_cb_lock); - if (!list_empty(&serv->sv_cb_list)) { - req = list_first_entry(&serv->sv_cb_list, - struct rpc_rqst, rq_bc_list); - list_del(&req->rq_bc_list); - spin_unlock_bh(&serv->sv_cb_lock); - finish_wait(&serv->sv_cb_waitq, &wq); - dprintk("Invoking bc_svc_process()\n"); - error = bc_svc_process(serv, req, rqstp); - dprintk("bc_svc_process() returned w/ error code= %d\n", - error); - } else { - spin_unlock_bh(&serv->sv_cb_lock); - if (!kthread_should_stop()) - schedule(); - finish_wait(&serv->sv_cb_waitq, &wq); - } - } - - svc_exit_thread(rqstp); - return 0; -} - static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt, struct svc_serv *serv) { @@ -237,10 +198,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) cb_info->users); threadfn = nfs4_callback_svc; -#if defined(CONFIG_NFS_V4_1) - if (minorversion) - threadfn = nfs41_callback_svc; -#else +#if !defined(CONFIG_NFS_V4_1) if (minorversion) return ERR_PTR(-ENOTSUPP); #endif diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 6bed1394d748..96a4923080ae 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -60,7 +60,7 @@ __be32 nfs4_callback_getattr(void *argp, void *resp, if (nfs_have_writebacks(inode)) res->change_attr++; res->ctime = inode_get_ctime(inode); - res->mtime = inode->i_mtime; + res->mtime = inode_get_mtime(inode); res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) & args->bitmap[0]; res->bitmap[1] = (FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY) & diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 47d892a1d363..f6c74f424691 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -93,12 +93,10 @@ nfs_direct_handle_truncated(struct nfs_direct_req *dreq, dreq->max_count = dreq_len; if (dreq->count > dreq_len) dreq->count = dreq_len; - - if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) - dreq->error = hdr->error; - else /* Clear outstanding error if this is EOF */ - dreq->error = 0; } + + if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && !dreq->error) + dreq->error = hdr->error; } static void @@ -120,6 +118,18 @@ nfs_direct_count_bytes(struct nfs_direct_req *dreq, dreq->count = dreq_len; } +static void nfs_direct_truncate_request(struct nfs_direct_req *dreq, + struct nfs_page *req) +{ + loff_t offs = req_offset(req); + size_t req_start = (size_t)(offs - dreq->io_start); + + if (req_start < dreq->max_count) + dreq->max_count = req_start; + if (req_start < dreq->count) + dreq->count = req_start; +} + /** * nfs_swap_rw - NFS address space operation for swap I/O * @iocb: target I/O control block @@ -488,7 +498,9 @@ static void nfs_direct_add_page_head(struct list_head *list, kref_get(&head->wb_kref); } -static void nfs_direct_join_group(struct list_head *list, struct inode *inode) +static void nfs_direct_join_group(struct list_head *list, + struct nfs_commit_info *cinfo, + struct inode *inode) { struct nfs_page *req, *subreq; @@ -510,7 +522,7 @@ static void nfs_direct_join_group(struct list_head *list, struct inode *inode) nfs_release_request(subreq); } } while ((subreq = subreq->wb_this_page) != req); - nfs_join_page_group(req, inode); + nfs_join_page_group(req, cinfo, inode); } } @@ -528,20 +540,15 @@ nfs_direct_write_scan_commit_list(struct inode *inode, static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) { struct nfs_pageio_descriptor desc; - struct nfs_page *req, *tmp; + struct nfs_page *req; LIST_HEAD(reqs); struct nfs_commit_info cinfo; - LIST_HEAD(failed); nfs_init_cinfo_from_dreq(&cinfo, dreq); nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo); - nfs_direct_join_group(&reqs, dreq->inode); + nfs_direct_join_group(&reqs, &cinfo, dreq->inode); - dreq->count = 0; - dreq->max_count = 0; - list_for_each_entry(req, &reqs, wb_list) - dreq->max_count += req->wb_bytes; nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo); get_dreq(dreq); @@ -549,27 +556,40 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; - list_for_each_entry_safe(req, tmp, &reqs, wb_list) { + while (!list_empty(&reqs)) { + req = nfs_list_entry(reqs.next); /* Bump the transmission count */ req->wb_nio++; if (!nfs_pageio_add_request(&desc, req)) { - nfs_list_move_request(req, &failed); - spin_lock(&cinfo.inode->i_lock); - dreq->flags = 0; - if (desc.pg_error < 0) + spin_lock(&dreq->lock); + if (dreq->error < 0) { + desc.pg_error = dreq->error; + } else if (desc.pg_error != -EAGAIN) { + dreq->flags = 0; + if (!desc.pg_error) + desc.pg_error = -EIO; dreq->error = desc.pg_error; - else - dreq->error = -EIO; - spin_unlock(&cinfo.inode->i_lock); + } else + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + spin_unlock(&dreq->lock); + break; } nfs_release_request(req); } nfs_pageio_complete(&desc); - while (!list_empty(&failed)) { - req = nfs_list_entry(failed.next); + while (!list_empty(&reqs)) { + req = nfs_list_entry(reqs.next); nfs_list_remove_request(req); nfs_unlock_and_release_request(req); + if (desc.pg_error == -EAGAIN) { + nfs_mark_request_commit(req, NULL, &cinfo, 0); + } else { + spin_lock(&dreq->lock); + nfs_direct_truncate_request(dreq, req); + spin_unlock(&dreq->lock); + nfs_release_request(req); + } } if (put_dreq(dreq)) @@ -589,8 +609,6 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) if (status < 0) { /* Errors in commit are fatal */ dreq->error = status; - dreq->max_count = 0; - dreq->count = 0; dreq->flags = NFS_ODIRECT_DONE; } else { status = dreq->error; @@ -601,7 +619,12 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); - if (status >= 0 && !nfs_write_match_verf(verf, req)) { + if (status < 0) { + spin_lock(&dreq->lock); + nfs_direct_truncate_request(dreq, req); + spin_unlock(&dreq->lock); + nfs_release_request(req); + } else if (!nfs_write_match_verf(verf, req)) { dreq->flags = NFS_ODIRECT_RESCHED_WRITES; /* * Despite the reboot, the write was successful, @@ -609,7 +632,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) */ req->wb_nio = 0; nfs_mark_request_commit(req, NULL, &cinfo, 0); - } else /* Error or match */ + } else nfs_release_request(req); nfs_unlock_and_release_request(req); } @@ -662,6 +685,7 @@ static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq) while (!list_empty(&reqs)) { req = nfs_list_entry(reqs.next); nfs_list_remove_request(req); + nfs_direct_truncate_request(dreq, req); nfs_release_request(req); nfs_unlock_and_release_request(req); } @@ -711,7 +735,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) } nfs_direct_count_bytes(dreq, hdr); - if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags)) { + if (test_bit(NFS_IOHDR_UNSTABLE_WRITES, &hdr->flags) && + !test_bit(NFS_IOHDR_ERROR, &hdr->flags)) { if (!dreq->flags) dreq->flags = NFS_ODIRECT_DO_COMMIT; flags = dreq->flags; @@ -755,18 +780,23 @@ static void nfs_write_sync_pgio_error(struct list_head *head, int error) static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr) { struct nfs_direct_req *dreq = hdr->dreq; + struct nfs_page *req; + struct nfs_commit_info cinfo; trace_nfs_direct_write_reschedule_io(dreq); + nfs_init_cinfo_from_dreq(&cinfo, dreq); spin_lock(&dreq->lock); - if (dreq->error == 0) { + if (dreq->error == 0) dreq->flags = NFS_ODIRECT_RESCHED_WRITES; - /* fake unstable write to let common nfs resend pages */ - hdr->verf.committed = NFS_UNSTABLE; - hdr->good_bytes = hdr->args.offset + hdr->args.count - - hdr->io_start; - } + set_bit(NFS_IOHDR_REDO, &hdr->flags); spin_unlock(&dreq->lock); + while (!list_empty(&hdr->pages)) { + req = nfs_list_entry(hdr->pages.next); + nfs_list_remove_request(req); + nfs_unlock_request(req); + nfs_mark_request_commit(req, NULL, &cinfo, 0); + } } static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { @@ -794,9 +824,11 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, { struct nfs_pageio_descriptor desc; struct inode *inode = dreq->inode; + struct nfs_commit_info cinfo; ssize_t result = 0; size_t requested_bytes = 0; size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE); + bool defer = false; trace_nfs_direct_write_schedule_iovec(dreq); @@ -837,17 +869,37 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, break; } - nfs_lock_request(req); - if (!nfs_pageio_add_request(&desc, req)) { - result = desc.pg_error; - nfs_unlock_and_release_request(req); - break; - } pgbase = 0; bytes -= req_len; requested_bytes += req_len; pos += req_len; dreq->bytes_left -= req_len; + + if (defer) { + nfs_mark_request_commit(req, NULL, &cinfo, 0); + continue; + } + + nfs_lock_request(req); + if (nfs_pageio_add_request(&desc, req)) + continue; + + /* Exit on hard errors */ + if (desc.pg_error < 0 && desc.pg_error != -EAGAIN) { + result = desc.pg_error; + nfs_unlock_and_release_request(req); + break; + } + + /* If the error is soft, defer remaining requests */ + nfs_init_cinfo_from_dreq(&cinfo, dreq); + spin_lock(&dreq->lock); + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + spin_unlock(&dreq->lock); + nfs_unlock_request(req); + nfs_mark_request_commit(req, NULL, &cinfo, 0); + desc.pg_error = 0; + defer = true; } nfs_direct_release_pages(pagevec, npages); kvfree(pagevec); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 7deb3cd76abe..ef817a0475ff 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1235,6 +1235,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, case -EPFNOSUPPORT: case -EPROTONOSUPPORT: case -EOPNOTSUPP: + case -EINVAL: case -ECONNREFUSED: case -ECONNRESET: case -EHOSTDOWN: @@ -2519,9 +2520,9 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, return i; } -static int -ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) +static int ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) { + struct pnfs_layout_hdr *lo; struct nfs4_flexfile_layout *ff_layout; const int dev_count = PNFS_LAYOUTSTATS_MAXDEV; @@ -2532,11 +2533,14 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) return -ENOMEM; spin_lock(&args->inode->i_lock); - ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout); - args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr, - &args->devinfo[0], - dev_count, - NFS4_FF_OP_LAYOUTSTATS); + lo = NFS_I(args->inode)->layout; + if (lo && pnfs_layout_is_valid(lo)) { + ff_layout = FF_LAYOUT_FROM_HDR(lo); + args->num_dev = ff_layout_mirror_prepare_stats( + &ff_layout->generic_hdr, &args->devinfo[0], dev_count, + NFS4_FF_OP_LAYOUTSTATS); + } else + args->num_dev = 0; spin_unlock(&args->inode->i_lock); if (!args->num_dev) { kfree(args->devinfo); diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index 2dc64454492b..5407ab8c8783 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -114,8 +114,8 @@ static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata * struct inode *inode) { memset(auxdata, 0, sizeof(*auxdata)); - auxdata->mtime_sec = inode->i_mtime.tv_sec; - auxdata->mtime_nsec = inode->i_mtime.tv_nsec; + auxdata->mtime_sec = inode_get_mtime(inode).tv_sec; + auxdata->mtime_nsec = inode_get_mtime(inode).tv_nsec; auxdata->ctime_sec = inode_get_ctime(inode).tv_sec; auxdata->ctime_nsec = inode_get_ctime(inode).tv_nsec; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index e21c073158e5..ebb8d60e1152 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -512,8 +512,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) } else init_special_inode(inode, inode->i_mode, fattr->rdev); - memset(&inode->i_atime, 0, sizeof(inode->i_atime)); - memset(&inode->i_mtime, 0, sizeof(inode->i_mtime)); + inode_set_atime(inode, 0, 0); + inode_set_mtime(inode, 0, 0); inode_set_ctime(inode, 0, 0); inode_set_iversion_raw(inode, 0); inode->i_size = 0; @@ -527,11 +527,11 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) nfsi->read_cache_jiffies = fattr->time_start; nfsi->attr_gencount = fattr->gencount; if (fattr->valid & NFS_ATTR_FATTR_ATIME) - inode->i_atime = fattr->atime; + inode_set_atime_to_ts(inode, fattr->atime); else if (fattr_supported & NFS_ATTR_FATTR_ATIME) nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); if (fattr->valid & NFS_ATTR_FATTR_MTIME) - inode->i_mtime = fattr->mtime; + inode_set_mtime_to_ts(inode, fattr->mtime); else if (fattr_supported & NFS_ATTR_FATTR_MTIME) nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) @@ -742,9 +742,9 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_ATIME | NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_ATIME) - inode->i_atime = fattr->atime; + inode_set_atime_to_ts(inode, fattr->atime); else if (attr->ia_valid & ATTR_ATIME_SET) - inode->i_atime = attr->ia_atime; + inode_set_atime_to_ts(inode, attr->ia_atime); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); @@ -758,9 +758,9 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_MTIME | NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_MTIME) - inode->i_mtime = fattr->mtime; + inode_set_mtime_to_ts(inode, fattr->mtime); else if (attr->ia_valid & ATTR_MTIME_SET) - inode->i_mtime = attr->ia_mtime; + inode_set_mtime_to_ts(inode, attr->ia_mtime); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); @@ -1451,11 +1451,11 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode_set_ctime_to_ts(inode, fattr->ctime); } - ts = inode->i_mtime; + ts = inode_get_mtime(inode); if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) && (fattr->valid & NFS_ATTR_FATTR_MTIME) && timespec64_equal(&ts, &fattr->pre_mtime)) { - inode->i_mtime = fattr->mtime; + inode_set_mtime_to_ts(inode, fattr->mtime); } if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) && (fattr->valid & NFS_ATTR_FATTR_SIZE) @@ -1506,7 +1506,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && !inode_eq_iversion_raw(inode, fattr->change_attr)) invalid |= NFS_INO_INVALID_CHANGE; - ts = inode->i_mtime; + ts = inode_get_mtime(inode); if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec64_equal(&ts, &fattr->mtime)) invalid |= NFS_INO_INVALID_MTIME; @@ -1534,7 +1534,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink) invalid |= NFS_INO_INVALID_NLINK; - ts = inode->i_atime; + ts = inode_get_atime(inode); if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec64_equal(&ts, &fattr->atime)) invalid |= NFS_INO_INVALID_ATIME; @@ -2002,7 +2002,7 @@ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fa } if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 && (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) { - fattr->pre_mtime = inode->i_mtime; + fattr->pre_mtime = inode_get_mtime(inode); fattr->valid |= NFS_ATTR_FATTR_PREMTIME; } if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 && @@ -2184,7 +2184,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } if (fattr->valid & NFS_ATTR_FATTR_MTIME) - inode->i_mtime = fattr->mtime; + inode_set_mtime_to_ts(inode, fattr->mtime); else if (fattr_supported & NFS_ATTR_FATTR_MTIME) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_MTIME; @@ -2220,7 +2220,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) save_cache_validity & NFS_INO_INVALID_SIZE; if (fattr->valid & NFS_ATTR_FATTR_ATIME) - inode->i_atime = fattr->atime; + inode_set_atime_to_ts(inode, fattr->atime); else if (fattr_supported & NFS_ATTR_FATTR_ATIME) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATIME; diff --git a/fs/nfs/nfs.h b/fs/nfs/nfs.h index 5ba00610aede..0d3ce0460e35 100644 --- a/fs/nfs/nfs.h +++ b/fs/nfs/nfs.h @@ -18,7 +18,7 @@ struct nfs_subversion { const struct rpc_version *rpc_vers; /* NFS version information */ const struct nfs_rpc_ops *rpc_ops; /* NFS operations */ const struct super_operations *sops; /* NFS Super operations */ - const struct xattr_handler **xattr; /* NFS xattr handlers */ + const struct xattr_handler * const *xattr; /* NFS xattr handlers */ struct list_head list; /* List of NFS versions */ }; diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 063e00aff87e..28704f924612 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -81,7 +81,8 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, if (status == 0) { if (nfs_should_remove_suid(inode)) { spin_lock(&inode->i_lock); - nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE); + nfs_set_cache_invalid(inode, + NFS_INO_REVAL_FORCED | NFS_INO_INVALID_MODE); spin_unlock(&inode->i_lock); } status = nfs_post_op_update_inode_force_wcc(inode, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 47c5c1f86d66..827d00e2f094 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -315,7 +315,7 @@ extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct nfs_fh *, struct nfs_fattr *); extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *); -extern const struct xattr_handler *nfs4_xattr_handlers[]; +extern const struct xattr_handler * const nfs4_xattr_handlers[]; extern int nfs4_set_rw_stateid(nfs4_stateid *stateid, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx, diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 27fb25567ce7..11e3a285594c 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -417,6 +417,8 @@ static void nfs4_add_trunk(struct nfs_client *clp, struct nfs_client *old) .net = old->cl_net, .servername = old->cl_hostname, }; + int max_connect = test_bit(NFS_CS_PNFS, &clp->cl_flags) ? + clp->cl_max_connect : old->cl_max_connect; if (clp->cl_proto != old->cl_proto) return; @@ -430,7 +432,7 @@ static void nfs4_add_trunk(struct nfs_client *clp, struct nfs_client *old) xprt_args.addrlen = clp_salen; rpc_clnt_add_xprt(old->cl_rpcclient, &xprt_args, - rpc_clnt_test_and_add_xprt, NULL); + rpc_clnt_test_and_add_xprt, &max_connect); } /** @@ -1010,6 +1012,8 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, __set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); __set_bit(NFS_CS_DS, &cl_init.init_flags); + __set_bit(NFS_CS_PNFS, &cl_init.init_flags); + cl_init.max_connect = NFS_MAX_TRANSPORTS; /* * Set an authflavor equual to the MDS value. Use the MDS nfs_client * cl_ipaddr so as to use the same EXCHANGE_ID co_ownerid as the MDS diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 794343790ea8..a654d7234f51 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2703,8 +2703,12 @@ static int _nfs4_proc_open(struct nfs4_opendata *data, return status; } if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) { + struct nfs_fh *fh = &o_res->fh; + nfs4_sequence_free_slot(&o_res->seq_res); - nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, NULL); + if (o_arg->claim == NFS4_OPEN_CLAIM_FH) + fh = NFS_FH(d_inode(data->dentry)); + nfs4_proc_getattr(server, fh, o_res->f_attr, NULL); } return 0; } @@ -8866,8 +8870,6 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cre /* Save the EXCHANGE_ID verifier session trunk tests */ memcpy(clp->cl_confirm.data, argp->verifier.data, sizeof(clp->cl_confirm.data)); - if (resp->flags & EXCHGID4_FLAG_USE_PNFS_DS) - set_bit(NFS_CS_DS, &clp->cl_flags); out: trace_nfs4_exchange_id(clp, status); rpc_put_task(task); @@ -10618,7 +10620,9 @@ static void nfs4_disable_swap(struct inode *inode) */ struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; - nfs4_schedule_state_manager(clp); + set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); + wake_up_var(&clp->cl_state); } static const struct inode_operations nfs4_dir_inode_operations = { @@ -10733,7 +10737,7 @@ static const struct xattr_handler nfs4_xattr_nfs4_user_handler = { }; #endif -const struct xattr_handler *nfs4_xattr_handlers[] = { +const struct xattr_handler * const nfs4_xattr_handlers[] = { &nfs4_xattr_nfs4_acl_handler, #if defined(CONFIG_NFS_V4_1) &nfs4_xattr_nfs4_dacl_handler, diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index e079987af4a3..9a5d911a7edc 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1209,16 +1209,26 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) { struct task_struct *task; char buf[INET6_ADDRSTRLEN + sizeof("-manager") + 1]; + struct rpc_clnt *clnt = clp->cl_rpcclient; + bool swapon = false; - if (clp->cl_rpcclient->cl_shutdown) + if (clnt->cl_shutdown) return; set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state); - if (test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) != 0) { - wake_up_var(&clp->cl_state); - return; + + if (atomic_read(&clnt->cl_swapper)) { + swapon = !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, + &clp->cl_state); + if (!swapon) { + wake_up_var(&clp->cl_state); + return; + } } - set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); + + if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) + return; + __module_get(THIS_MODULE); refcount_inc(&clp->cl_count); @@ -1235,8 +1245,9 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) __func__, PTR_ERR(task)); if (!nfs_client_init_is_complete(clp)) nfs_mark_client_ready(clp, PTR_ERR(task)); + if (swapon) + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); nfs4_clear_state_manager_bit(clp); - clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); nfs_put_client(clp); module_put(THIS_MODULE); } @@ -2703,6 +2714,13 @@ static void nfs4_state_manager(struct nfs_client *clp) nfs4_end_drain_session(clp); nfs4_clear_state_manager_bit(clp); + if (test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) && + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, + &clp->cl_state)) { + memflags = memalloc_nofs_save(); + continue; + } + if (!test_and_set_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state)) { if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) { nfs_client_return_marked_delegations(clp); @@ -2741,22 +2759,25 @@ static int nfs4_run_state_manager(void *ptr) allow_signal(SIGKILL); again: - set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state); nfs4_state_manager(clp); - if (atomic_read(&cl->cl_swapper)) { + + if (test_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state) && + !test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) { wait_var_event_interruptible(&clp->cl_state, test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state)); - if (atomic_read(&cl->cl_swapper) && - test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state)) + if (!atomic_read(&cl->cl_swapper)) + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); + if (refcount_read(&clp->cl_count) > 1 && !signalled() && + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) goto again; /* Either no longer a swapper, or were signalled */ + clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); } - clear_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state); if (refcount_read(&clp->cl_count) > 1 && !signalled() && test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) && - !test_and_set_bit(NFS4CLNT_MANAGER_AVAILABLE, &clp->cl_state)) + !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state)) goto again; nfs_put_client(clp); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 306cba0b9e69..84343aefbbd6 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2634,31 +2634,44 @@ pnfs_should_return_unused_layout(struct pnfs_layout_hdr *lo, return mode == 0; } -static int -pnfs_layout_return_unused_byserver(struct nfs_server *server, void *data) +static int pnfs_layout_return_unused_byserver(struct nfs_server *server, + void *data) { const struct pnfs_layout_range *range = data; + const struct cred *cred; struct pnfs_layout_hdr *lo; struct inode *inode; + nfs4_stateid stateid; + enum pnfs_iomode iomode; + restart: rcu_read_lock(); list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) { - if (!pnfs_layout_can_be_returned(lo) || + inode = lo->plh_inode; + if (!inode || !pnfs_layout_can_be_returned(lo) || test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) continue; - inode = lo->plh_inode; spin_lock(&inode->i_lock); - if (!pnfs_should_return_unused_layout(lo, range)) { + if (!lo->plh_inode || + !pnfs_should_return_unused_layout(lo, range)) { spin_unlock(&inode->i_lock); continue; } + pnfs_get_layout_hdr(lo); + pnfs_set_plh_return_info(lo, range->iomode, 0); + if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, + range, 0) != 0 || + !pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode)) { + spin_unlock(&inode->i_lock); + rcu_read_unlock(); + pnfs_put_layout_hdr(lo); + cond_resched(); + goto restart; + } spin_unlock(&inode->i_lock); - inode = pnfs_grab_inode_layout_hdr(lo); - if (!inode) - continue; rcu_read_unlock(); - pnfs_mark_layout_for_return(inode, range); - iput(inode); + pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false); + pnfs_put_layout_hdr(lo); cond_resched(); goto restart; } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 0d6473cb00cb..9b1cfca8112a 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1071,7 +1071,7 @@ static void nfs_fill_super(struct super_block *sb, struct nfs_fs_context *ctx) sb->s_export_op = &nfs_export_ops; break; case 4: - sb->s_flags |= SB_POSIXACL; + sb->s_iflags |= SB_I_NOUMASK; sb->s_time_gran = 1; sb->s_time_min = S64_MIN; sb->s_time_max = S64_MAX; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f4cca8f00c0c..9d82d50ce0b1 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -59,7 +59,8 @@ static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops; static const struct nfs_commit_completion_ops nfs_commit_completion_ops; static const struct nfs_rw_ops nfs_rw_write_ops; static void nfs_inode_remove_request(struct nfs_page *req); -static void nfs_clear_request_commit(struct nfs_page *req); +static void nfs_clear_request_commit(struct nfs_commit_info *cinfo, + struct nfs_page *req); static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, struct inode *inode); static struct nfs_page * @@ -502,8 +503,8 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, * the (former) group. All subrequests are removed from any write or commit * lists, unlinked from the group and destroyed. */ -void -nfs_join_page_group(struct nfs_page *head, struct inode *inode) +void nfs_join_page_group(struct nfs_page *head, struct nfs_commit_info *cinfo, + struct inode *inode) { struct nfs_page *subreq; struct nfs_page *destroy_list = NULL; @@ -533,7 +534,7 @@ nfs_join_page_group(struct nfs_page *head, struct inode *inode) * Commit list removal accounting is done after locks are dropped */ subreq = head; do { - nfs_clear_request_commit(subreq); + nfs_clear_request_commit(cinfo, subreq); subreq = subreq->wb_this_page; } while (subreq != head); @@ -566,8 +567,10 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio) { struct inode *inode = folio_file_mapping(folio)->host; struct nfs_page *head; + struct nfs_commit_info cinfo; int ret; + nfs_init_cinfo_from_inode(&cinfo, inode); /* * A reference is taken only on the head request which acts as a * reference to the whole page group - the group will not be destroyed @@ -584,7 +587,7 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio) return ERR_PTR(ret); } - nfs_join_page_group(head, inode); + nfs_join_page_group(head, &cinfo, inode); return head; } @@ -785,6 +788,8 @@ static void nfs_inode_add_request(struct nfs_page *req) */ static void nfs_inode_remove_request(struct nfs_page *req) { + struct nfs_inode *nfsi = NFS_I(nfs_page_to_inode(req)); + if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) { struct folio *folio = nfs_page_to_folio(req->wb_head); struct address_space *mapping = folio_file_mapping(folio); @@ -799,8 +804,8 @@ static void nfs_inode_remove_request(struct nfs_page *req) } if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) { + atomic_long_dec(&nfsi->nrequests); nfs_release_request(req); - atomic_long_dec(&NFS_I(nfs_page_to_inode(req))->nrequests); } } @@ -955,18 +960,16 @@ static void nfs_folio_clear_commit(struct folio *folio) } /* Called holding the request lock on @req */ -static void -nfs_clear_request_commit(struct nfs_page *req) +static void nfs_clear_request_commit(struct nfs_commit_info *cinfo, + struct nfs_page *req) { if (test_bit(PG_CLEAN, &req->wb_flags)) { struct nfs_open_context *ctx = nfs_req_openctx(req); struct inode *inode = d_inode(ctx->dentry); - struct nfs_commit_info cinfo; - nfs_init_cinfo_from_inode(&cinfo, inode); mutex_lock(&NFS_I(inode)->commit_mutex); - if (!pnfs_clear_request_commit(req, &cinfo)) { - nfs_request_remove_commit_list(req, &cinfo); + if (!pnfs_clear_request_commit(req, cinfo)) { + nfs_request_remove_commit_list(req, cinfo); } mutex_unlock(&NFS_I(inode)->commit_mutex); nfs_folio_clear_commit(nfs_page_to_folio(req)); diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index 6fffc8f03f74..b8736a82e57c 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -12,7 +12,8 @@ nfsd-y += trace.o nfsd-y += nfssvc.o nfsctl.o nfsfh.o vfs.o \ export.o auth.o lockd.o nfscache.o \ - stats.o filecache.o nfs3proc.o nfs3xdr.o + stats.o filecache.o nfs3proc.o nfs3xdr.o \ + netlink.o nfsd-$(CONFIG_NFSD_V2) += nfsproc.o nfsxdr.o nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 01d7fd108cf3..46fd74d91ea9 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -117,12 +117,13 @@ static __be32 nfsd4_block_commit_blocks(struct inode *inode, struct nfsd4_layoutcommit *lcp, struct iomap *iomaps, int nr_iomaps) { + struct timespec64 mtime = inode_get_mtime(inode); loff_t new_size = lcp->lc_last_wr + 1; struct iattr iattr = { .ia_valid = 0 }; int error; if (lcp->lc_mtime.tv_nsec == UTIME_NOW || - timespec64_compare(&lcp->lc_mtime, &inode->i_mtime) < 0) + timespec64_compare(&lcp->lc_mtime, &mtime) < 0) lcp->lc_mtime = current_time(inode); iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME; iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime; diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c index 1ed2f691ebb9..ce78f74715ee 100644 --- a/fs/nfsd/blocklayoutxdr.c +++ b/fs/nfsd/blocklayoutxdr.c @@ -16,9 +16,9 @@ __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr, - struct nfsd4_layoutget *lgp) + const struct nfsd4_layoutget *lgp) { - struct pnfs_block_extent *b = lgp->lg_content; + const struct pnfs_block_extent *b = lgp->lg_content; int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32); __be32 *p; @@ -77,7 +77,7 @@ nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) __be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, - struct nfsd4_getdeviceinfo *gdp) + const struct nfsd4_getdeviceinfo *gdp) { struct pnfs_block_deviceaddr *dev = gdp->gd_device; int len = sizeof(__be32), ret, i; diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h index bc5166bfe46b..b0361e8aa9a7 100644 --- a/fs/nfsd/blocklayoutxdr.h +++ b/fs/nfsd/blocklayoutxdr.h @@ -51,9 +51,9 @@ struct pnfs_block_deviceaddr { }; __be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, - struct nfsd4_getdeviceinfo *gdp); + const struct nfsd4_getdeviceinfo *gdp); __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr, - struct nfsd4_layoutget *lgp); + const struct nfsd4_layoutget *lgp); int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, u32 block_size); int nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 11a0eaa2f914..b7da17e53007 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -339,12 +339,16 @@ static int export_stats_init(struct export_stats *stats) static void export_stats_reset(struct export_stats *stats) { - nfsd_percpu_counters_reset(stats->counter, EXP_STATS_COUNTERS_NUM); + if (stats) + nfsd_percpu_counters_reset(stats->counter, + EXP_STATS_COUNTERS_NUM); } static void export_stats_destroy(struct export_stats *stats) { - nfsd_percpu_counters_destroy(stats->counter, EXP_STATS_COUNTERS_NUM); + if (stats) + nfsd_percpu_counters_destroy(stats->counter, + EXP_STATS_COUNTERS_NUM); } static void svc_export_put(struct kref *ref) @@ -353,7 +357,8 @@ static void svc_export_put(struct kref *ref) path_put(&exp->ex_path); auth_domain_put(exp->ex_client); nfsd4_fslocs_free(&exp->ex_fslocs); - export_stats_destroy(&exp->ex_stats); + export_stats_destroy(exp->ex_stats); + kfree(exp->ex_stats); kfree(exp->ex_uuid); kfree_rcu(exp, ex_rcu); } @@ -767,13 +772,15 @@ static int svc_export_show(struct seq_file *m, seq_putc(m, '\t'); seq_escape(m, exp->ex_client->name, " \t\n\\"); if (export_stats) { - seq_printf(m, "\t%lld\n", exp->ex_stats.start_time); + struct percpu_counter *counter = exp->ex_stats->counter; + + seq_printf(m, "\t%lld\n", exp->ex_stats->start_time); seq_printf(m, "\tfh_stale: %lld\n", - percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_FH_STALE])); + percpu_counter_sum_positive(&counter[EXP_STATS_FH_STALE])); seq_printf(m, "\tio_read: %lld\n", - percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_IO_READ])); + percpu_counter_sum_positive(&counter[EXP_STATS_IO_READ])); seq_printf(m, "\tio_write: %lld\n", - percpu_counter_sum_positive(&exp->ex_stats.counter[EXP_STATS_IO_WRITE])); + percpu_counter_sum_positive(&counter[EXP_STATS_IO_WRITE])); seq_putc(m, '\n'); return 0; } @@ -819,7 +826,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) new->ex_layout_types = 0; new->ex_uuid = NULL; new->cd = item->cd; - export_stats_reset(&new->ex_stats); + export_stats_reset(new->ex_stats); } static void export_update(struct cache_head *cnew, struct cache_head *citem) @@ -856,7 +863,14 @@ static struct cache_head *svc_export_alloc(void) if (!i) return NULL; - if (export_stats_init(&i->ex_stats)) { + i->ex_stats = kmalloc(sizeof(*(i->ex_stats)), GFP_KERNEL); + if (!i->ex_stats) { + kfree(i); + return NULL; + } + + if (export_stats_init(i->ex_stats)) { + kfree(i->ex_stats); kfree(i); return NULL; } diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index 2df8ae25aad3..ca9dc230ae3d 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -64,10 +64,10 @@ struct svc_export { struct cache_head h; struct auth_domain * ex_client; int ex_flags; + int ex_fsid; struct path ex_path; kuid_t ex_anon_uid; kgid_t ex_anon_gid; - int ex_fsid; unsigned char * ex_uuid; /* 16 byte fsid */ struct nfsd4_fs_locations ex_fslocs; uint32_t ex_nflavors; @@ -76,8 +76,8 @@ struct svc_export { struct nfsd4_deviceid_map *ex_devid_map; struct cache_detail *cd; struct rcu_head ex_rcu; - struct export_stats ex_stats; unsigned long ex_xprtsec_modes; + struct export_stats *ex_stats; }; /* an "export key" (expkey) maps a filehandlefragement to an diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index ee9c923192e0..07bf219f9ae4 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -989,22 +989,21 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned char need = may_flags & NFSD_FILE_MAY_MASK; struct net *net = SVC_NET(rqstp); struct nfsd_file *new, *nf; - const struct cred *cred; + bool stale_retry = true; bool open_retry = true; struct inode *inode; __be32 status; int ret; +retry: status = fh_verify(rqstp, fhp, S_IFREG, may_flags|NFSD_MAY_OWNER_OVERRIDE); if (status != nfs_ok) return status; inode = d_inode(fhp->fh_dentry); - cred = get_current_cred(); -retry: rcu_read_lock(); - nf = nfsd_file_lookup_locked(net, cred, inode, need, want_gc); + nf = nfsd_file_lookup_locked(net, current_cred(), inode, need, want_gc); rcu_read_unlock(); if (nf) { @@ -1026,7 +1025,7 @@ retry: rcu_read_lock(); spin_lock(&inode->i_lock); - nf = nfsd_file_lookup_locked(net, cred, inode, need, want_gc); + nf = nfsd_file_lookup_locked(net, current_cred(), inode, need, want_gc); if (unlikely(nf)) { spin_unlock(&inode->i_lock); rcu_read_unlock(); @@ -1058,6 +1057,7 @@ wait_for_construction: goto construction_err; } open_retry = false; + fh_put(fhp); goto retry; } this_cpu_inc(nfsd_file_cache_hits); @@ -1074,7 +1074,6 @@ out: nfsd_file_check_write_error(nf); *pnf = nf; } - put_cred(cred); trace_nfsd_file_acquire(rqstp, inode, may_flags, nf, status); return status; @@ -1088,8 +1087,20 @@ open_file: status = nfs_ok; trace_nfsd_file_opened(nf, status); } else { - status = nfsd_open_verified(rqstp, fhp, may_flags, - &nf->nf_file); + ret = nfsd_open_verified(rqstp, fhp, may_flags, + &nf->nf_file); + if (ret == -EOPENSTALE && stale_retry) { + stale_retry = false; + nfsd_file_unhash(nf); + clear_and_wake_up_bit(NFSD_FILE_PENDING, + &nf->nf_flags); + if (refcount_dec_and_test(&nf->nf_ref)) + nfsd_file_free(nf); + nf = NULL; + fh_put(fhp); + goto retry; + } + status = nfserrno(ret); trace_nfsd_file_open(nf, status); } } else diff --git a/fs/nfsd/flexfilelayoutxdr.c b/fs/nfsd/flexfilelayoutxdr.c index bb205328e043..aeb71c10ff1b 100644 --- a/fs/nfsd/flexfilelayoutxdr.c +++ b/fs/nfsd/flexfilelayoutxdr.c @@ -17,9 +17,9 @@ struct ff_idmap { __be32 nfsd4_ff_encode_layoutget(struct xdr_stream *xdr, - struct nfsd4_layoutget *lgp) + const struct nfsd4_layoutget *lgp) { - struct pnfs_ff_layout *fl = lgp->lg_content; + const struct pnfs_ff_layout *fl = lgp->lg_content; int len, mirror_len, ds_len, fh_len; __be32 *p; @@ -77,7 +77,7 @@ nfsd4_ff_encode_layoutget(struct xdr_stream *xdr, __be32 nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr, - struct nfsd4_getdeviceinfo *gdp) + const struct nfsd4_getdeviceinfo *gdp) { struct pnfs_ff_device_addr *da = gdp->gd_device; int len; diff --git a/fs/nfsd/flexfilelayoutxdr.h b/fs/nfsd/flexfilelayoutxdr.h index 8e195aeca023..6d5a1066a903 100644 --- a/fs/nfsd/flexfilelayoutxdr.h +++ b/fs/nfsd/flexfilelayoutxdr.h @@ -43,8 +43,8 @@ struct pnfs_ff_layout { }; __be32 nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr, - struct nfsd4_getdeviceinfo *gdp); + const struct nfsd4_getdeviceinfo *gdp); __be32 nfsd4_ff_encode_layoutget(struct xdr_stream *xdr, - struct nfsd4_layoutget *lgp); + const struct nfsd4_layoutget *lgp); #endif /* _NFSD_FLEXFILELAYOUTXDR_H */ diff --git a/fs/nfsd/netlink.c b/fs/nfsd/netlink.c new file mode 100644 index 000000000000..0e1d635ec5f9 --- /dev/null +++ b/fs/nfsd/netlink.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/nfsd.yaml */ +/* YNL-GEN kernel source */ + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include "netlink.h" + +#include <uapi/linux/nfsd_netlink.h> + +/* Ops table for nfsd */ +static const struct genl_split_ops nfsd_nl_ops[] = { + { + .cmd = NFSD_CMD_RPC_STATUS_GET, + .start = nfsd_nl_rpc_status_get_start, + .dumpit = nfsd_nl_rpc_status_get_dumpit, + .done = nfsd_nl_rpc_status_get_done, + .flags = GENL_CMD_CAP_DUMP, + }, +}; + +struct genl_family nfsd_nl_family __ro_after_init = { + .name = NFSD_FAMILY_NAME, + .version = NFSD_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = nfsd_nl_ops, + .n_split_ops = ARRAY_SIZE(nfsd_nl_ops), +}; diff --git a/fs/nfsd/netlink.h b/fs/nfsd/netlink.h new file mode 100644 index 000000000000..d83dd6bdee92 --- /dev/null +++ b/fs/nfsd/netlink.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/nfsd.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_NFSD_GEN_H +#define _LINUX_NFSD_GEN_H + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include <uapi/linux/nfsd_netlink.h> + +int nfsd_nl_rpc_status_get_start(struct netlink_callback *cb); +int nfsd_nl_rpc_status_get_done(struct netlink_callback *cb); + +int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb); + +extern struct genl_family nfsd_nl_family; + +#endif /* _LINUX_NFSD_GEN_H */ diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 268ef57751c4..b78eceebd945 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -171,7 +171,8 @@ nfsd3_proc_read(struct svc_rqst *rqstp) * + 1 (xdr opaque byte count) = 26 */ resp->count = argp->count; - svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4); + svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3) << 2) + + resp->count + 4); fh_copy(&resp->fh, &argp->fh); resp->status = nfsd_read(rqstp, &resp->fh, argp->offset, @@ -194,7 +195,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp) SVCFH_fmt(&argp->fh), argp->len, (unsigned long long) argp->offset, - argp->stable? " stable" : ""); + argp->stable ? " stable" : ""); resp->status = nfserr_fbig; if (argp->offset > (u64)OFFSET_MAX || @@ -294,8 +295,8 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, status = nfserr_exist; break; case NFS3_CREATE_EXCLUSIVE: - if (d_inode(child)->i_mtime.tv_sec == v_mtime && - d_inode(child)->i_atime.tv_sec == v_atime && + if (inode_get_mtime_sec(d_inode(child)) == v_mtime && + inode_get_atime_sec(d_inode(child)) == v_atime && d_inode(child)->i_size == 0) { break; } diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 4039ffcf90ba..92bc109dabe6 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -84,7 +84,21 @@ static void encode_uint32(struct xdr_stream *xdr, u32 n) static void encode_bitmap4(struct xdr_stream *xdr, const __u32 *bitmap, size_t len) { - WARN_ON_ONCE(xdr_stream_encode_uint32_array(xdr, bitmap, len) < 0); + xdr_stream_encode_uint32_array(xdr, bitmap, len); +} + +static int decode_cb_fattr4(struct xdr_stream *xdr, uint32_t *bitmap, + struct nfs4_cb_fattr *fattr) +{ + fattr->ncf_cb_change = 0; + fattr->ncf_cb_fsize = 0; + if (bitmap[0] & FATTR4_WORD0_CHANGE) + if (xdr_stream_decode_u64(xdr, &fattr->ncf_cb_change) < 0) + return -NFSERR_BAD_XDR; + if (bitmap[0] & FATTR4_WORD0_SIZE) + if (xdr_stream_decode_u64(xdr, &fattr->ncf_cb_fsize) < 0) + return -NFSERR_BAD_XDR; + return 0; } /* @@ -358,6 +372,30 @@ encode_cb_recallany4args(struct xdr_stream *xdr, } /* + * CB_GETATTR4args + * struct CB_GETATTR4args { + * nfs_fh4 fh; + * bitmap4 attr_request; + * }; + * + * The size and change attributes are the only one + * guaranteed to be serviced by the client. + */ +static void +encode_cb_getattr4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr, + struct nfs4_cb_fattr *fattr) +{ + struct nfs4_delegation *dp = + container_of(fattr, struct nfs4_delegation, dl_cb_fattr); + struct knfsd_fh *fh = &dp->dl_stid.sc_file->fi_fhandle; + + encode_nfs_cb_opnum4(xdr, OP_CB_GETATTR); + encode_nfs_fh4(xdr, fh); + encode_bitmap4(xdr, fattr->ncf_cb_bmap, ARRAY_SIZE(fattr->ncf_cb_bmap)); + hdr->nops++; +} + +/* * CB_SEQUENCE4args * * struct CB_SEQUENCE4args { @@ -493,6 +531,26 @@ static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr, } /* + * 20.1. Operation 3: CB_GETATTR - Get Attributes + */ +static void nfs4_xdr_enc_cb_getattr(struct rpc_rqst *req, + struct xdr_stream *xdr, const void *data) +{ + const struct nfsd4_callback *cb = data; + struct nfs4_cb_fattr *ncf = + container_of(cb, struct nfs4_cb_fattr, ncf_getattr); + struct nfs4_cb_compound_hdr hdr = { + .ident = cb->cb_clp->cl_cb_ident, + .minorversion = cb->cb_clp->cl_minorversion, + }; + + encode_cb_compound4args(xdr, &hdr); + encode_cb_sequence4args(xdr, cb, &hdr); + encode_cb_getattr4args(xdr, &hdr, ncf); + encode_cb_nops(&hdr); +} + +/* * 20.2. Operation 4: CB_RECALL - Recall a Delegation */ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr, @@ -548,6 +606,42 @@ static int nfs4_xdr_dec_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr, } /* + * 20.1. Operation 3: CB_GETATTR - Get Attributes + */ +static int nfs4_xdr_dec_cb_getattr(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfsd4_callback *cb = data; + struct nfs4_cb_compound_hdr hdr; + int status; + u32 bitmap[3] = {0}; + u32 attrlen; + struct nfs4_cb_fattr *ncf = + container_of(cb, struct nfs4_cb_fattr, ncf_getattr); + + status = decode_cb_compound4res(xdr, &hdr); + if (unlikely(status)) + return status; + + status = decode_cb_sequence4res(xdr, cb); + if (unlikely(status || cb->cb_seq_status)) + return status; + + status = decode_cb_op_status(xdr, OP_CB_GETATTR, &cb->cb_status); + if (status) + return status; + if (xdr_stream_decode_uint32_array(xdr, bitmap, 3) < 0) + return -NFSERR_BAD_XDR; + if (xdr_stream_decode_u32(xdr, &attrlen) < 0) + return -NFSERR_BAD_XDR; + if (attrlen > (sizeof(ncf->ncf_cb_change) + sizeof(ncf->ncf_cb_fsize))) + return -NFSERR_BAD_XDR; + status = decode_cb_fattr4(xdr, bitmap, ncf); + return status; +} + +/* * 20.2. Operation 4: CB_RECALL - Recall a Delegation */ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, @@ -855,6 +949,7 @@ static const struct rpc_procinfo nfs4_cb_procedures[] = { PROC(CB_NOTIFY_LOCK, COMPOUND, cb_notify_lock, cb_notify_lock), PROC(CB_OFFLOAD, COMPOUND, cb_offload, cb_offload), PROC(CB_RECALL_ANY, COMPOUND, cb_recall_any, cb_recall_any), + PROC(CB_GETATTR, COMPOUND, cb_getattr, cb_getattr), }; static unsigned int nfs4_cb_counts[ARRAY_SIZE(nfs4_cb_procedures)]; diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index e8a80052cb1b..5e8096bc5eaa 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -515,11 +515,11 @@ nfsd4_return_file_layouts(struct svc_rqst *rqstp, if (!list_empty(&ls->ls_layouts)) { if (found) nfs4_inc_and_copy_stateid(&lrp->lr_sid, &ls->ls_stid); - lrp->lrs_present = 1; + lrp->lrs_present = true; } else { trace_nfsd_layoutstate_unhash(&ls->ls_stid.sc_stateid); nfs4_unhash_stid(&ls->ls_stid); - lrp->lrs_present = 0; + lrp->lrs_present = false; } spin_unlock(&ls->ls_lock); @@ -539,7 +539,7 @@ nfsd4_return_client_layouts(struct svc_rqst *rqstp, struct nfs4_layout *lp, *t; LIST_HEAD(reaplist); - lrp->lrs_present = 0; + lrp->lrs_present = false; spin_lock(&clp->cl_lock); list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) { diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 5ca748309c26..6f2d4aa4970d 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -322,8 +322,8 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, status = nfserr_exist; break; case NFS4_CREATE_EXCLUSIVE: - if (d_inode(child)->i_mtime.tv_sec == v_mtime && - d_inode(child)->i_atime.tv_sec == v_atime && + if (inode_get_mtime_sec(d_inode(child)) == v_mtime && + inode_get_atime_sec(d_inode(child)) == v_atime && d_inode(child)->i_size == 0) { open->op_created = true; break; /* subtle */ @@ -331,8 +331,8 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, status = nfserr_exist; break; case NFS4_CREATE_EXCLUSIVE4_1: - if (d_inode(child)->i_mtime.tv_sec == v_mtime && - d_inode(child)->i_atime.tv_sec == v_atime && + if (inode_get_mtime_sec(d_inode(child)) == v_mtime && + inode_get_atime_sec(d_inode(child)) == v_atime && d_inode(child)->i_size == 0) { open->op_created = true; goto set_attr; /* subtle */ @@ -1058,8 +1058,8 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, rename->rn_tname, rename->rn_tnamelen); if (status) return status; - set_change_info(&rename->rn_sinfo, &cstate->current_fh); - set_change_info(&rename->rn_tinfo, &cstate->save_fh); + set_change_info(&rename->rn_sinfo, &cstate->save_fh); + set_change_info(&rename->rn_tinfo, &cstate->current_fh); return nfs_ok; } @@ -1329,7 +1329,8 @@ extern void nfs_sb_deactive(struct super_block *sb); * setup a work entry in the ssc delayed unmount list. */ static __be32 nfsd4_ssc_setup_dul(struct nfsd_net *nn, char *ipaddr, - struct nfsd4_ssc_umount_item **nsui) + struct nfsd4_ssc_umount_item **nsui, + struct svc_rqst *rqstp) { struct nfsd4_ssc_umount_item *ni = NULL; struct nfsd4_ssc_umount_item *work = NULL; @@ -1351,7 +1352,7 @@ try_again: spin_unlock(&nn->nfsd_ssc_lock); /* allow 20secs for mount/unmount for now - revisit */ - if (kthread_should_stop() || + if (svc_thread_should_stop(rqstp) || (schedule_timeout(20*HZ) == 0)) { finish_wait(&nn->nfsd_ssc_waitq, &wait); kfree(work); @@ -1467,7 +1468,7 @@ nfsd4_interssc_connect(struct nl4_server *nss, struct svc_rqst *rqstp, goto out_free_rawdata; snprintf(dev_name, len + 5, "%s%s%s:/", startsep, ipaddr, endsep); - status = nfsd4_ssc_setup_dul(nn, ipaddr, nsui); + status = nfsd4_ssc_setup_dul(nn, ipaddr, nsui, rqstp); if (status) goto out_free_devname; if ((*nsui)->nsui_vfsmount) @@ -1642,6 +1643,7 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy, if (bytes_total == 0) bytes_total = ULLONG_MAX; do { + /* Only async copies can be stopped here */ if (kthread_should_stop()) break; bytes_copied = nfsd_copy_file_range(src, src_pos, dst, dst_pos, @@ -1760,6 +1762,7 @@ static int nfsd4_do_async_copy(void *data) struct nfsd4_copy *copy = (struct nfsd4_copy *)data; __be32 nfserr; + trace_nfsd_copy_do_async(copy); if (nfsd4_ssc_is_inter(copy)) { struct file *filp; @@ -1798,21 +1801,27 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __be32 status; struct nfsd4_copy *async_copy = NULL; + copy->cp_clp = cstate->clp; if (nfsd4_ssc_is_inter(copy)) { + trace_nfsd_copy_inter(copy); if (!inter_copy_offload_enable || nfsd4_copy_is_sync(copy)) { status = nfserr_notsupp; goto out; } status = nfsd4_setup_inter_ssc(rqstp, cstate, copy); - if (status) + if (status) { + trace_nfsd_copy_done(copy, status); return nfserr_offload_denied; + } } else { + trace_nfsd_copy_intra(copy); status = nfsd4_setup_intra_ssc(rqstp, cstate, copy); - if (status) + if (status) { + trace_nfsd_copy_done(copy, status); return status; + } } - copy->cp_clp = cstate->clp; memcpy(©->fh, &cstate->current_fh.fh_handle, sizeof(struct knfsd_fh)); if (nfsd4_copy_is_async(copy)) { @@ -1847,6 +1856,7 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, copy->nf_dst->nf_file, true); } out: + trace_nfsd_copy_done(copy, status); release_copy_files(copy); return status; out_err: @@ -1929,8 +1939,8 @@ nfsd4_copy_notify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) return status; - cn->cpn_sec = nn->nfsd4_lease; - cn->cpn_nsec = 0; + cn->cpn_lease_time.tv_sec = nn->nfsd4_lease; + cn->cpn_lease_time.tv_nsec = 0; status = nfserrno(-ENOMEM); cps = nfs4_alloc_init_cpntf_state(nn, stid); @@ -2347,10 +2357,10 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp, mutex_unlock(&ls->ls_mutex); if (new_size > i_size_read(inode)) { - lcp->lc_size_chg = 1; + lcp->lc_size_chg = true; lcp->lc_newsize = new_size; } else { - lcp->lc_size_chg = 0; + lcp->lc_size_chg = false; } nfserr = ops->proc_layoutcommit(inode, lcp); @@ -3200,6 +3210,7 @@ static const struct nfsd4_operation nfsd4_ops[] = { }, [OP_LOCK] = { .op_func = nfsd4_lock, + .op_release = nfsd4_lock_release, .op_flags = OP_MODIFIES_SOMETHING | OP_NONTRIVIAL_ERROR_ENCODE, .op_name = "OP_LOCK", @@ -3208,6 +3219,7 @@ static const struct nfsd4_operation nfsd4_ops[] = { }, [OP_LOCKT] = { .op_func = nfsd4_lockt, + .op_release = nfsd4_lockt_release, .op_flags = OP_NONTRIVIAL_ERROR_ENCODE, .op_name = "OP_LOCKT", .op_rsize_bop = nfsd4_lock_rsize, diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8534693eb6a4..65fd5510323a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -59,7 +59,7 @@ #define NFSDDBG_FACILITY NFSDDBG_PROC -#define all_ones {{~0,~0},~0} +#define all_ones {{ ~0, ~0}, ~0} static const stateid_t one_stateid = { .si_generation = ~0, .si_opaque = all_ones, @@ -127,6 +127,7 @@ static void free_session(struct nfsd4_session *); static const struct nfsd4_callback_ops nfsd4_cb_recall_ops; static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops; +static const struct nfsd4_callback_ops nfsd4_cb_getattr_ops; static struct workqueue_struct *laundry_wq; @@ -297,7 +298,7 @@ find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh, nbl = find_blocked_lock(lo, fh, nn); if (!nbl) { - nbl= kmalloc(sizeof(*nbl), GFP_KERNEL); + nbl = kmalloc(sizeof(*nbl), GFP_KERNEL); if (nbl) { INIT_LIST_HEAD(&nbl->nbl_list); INIT_LIST_HEAD(&nbl->nbl_lru); @@ -1159,6 +1160,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate, u32 dl_type) { struct nfs4_delegation *dp; + struct nfs4_stid *stid; long n; dprintk("NFSD alloc_init_deleg\n"); @@ -1167,9 +1169,10 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp, goto out_dec; if (delegation_blocked(&fp->fi_fhandle)) goto out_dec; - dp = delegstateid(nfs4_alloc_stid(clp, deleg_slab, nfs4_free_deleg)); - if (dp == NULL) + stid = nfs4_alloc_stid(clp, deleg_slab, nfs4_free_deleg); + if (stid == NULL) goto out_dec; + dp = delegstateid(stid); /* * delegation seqid's are never incremented. The 4.1 special @@ -1187,6 +1190,10 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp, dp->dl_recalled = false; nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client, &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL); + nfsd4_init_cb(&dp->dl_cb_fattr.ncf_getattr, dp->dl_stid.sc_client, + &nfsd4_cb_getattr_ops, NFSPROC4_CLNT_CB_GETATTR); + dp->dl_cb_fattr.ncf_file_modified = false; + dp->dl_cb_fattr.ncf_cb_bmap[0] = FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE; get_nfs4_file(fp); dp->dl_stid.sc_file = fp; return dp; @@ -2894,11 +2901,56 @@ nfsd4_cb_recall_any_release(struct nfsd4_callback *cb) spin_unlock(&nn->client_lock); } +static int +nfsd4_cb_getattr_done(struct nfsd4_callback *cb, struct rpc_task *task) +{ + struct nfs4_cb_fattr *ncf = + container_of(cb, struct nfs4_cb_fattr, ncf_getattr); + + ncf->ncf_cb_status = task->tk_status; + switch (task->tk_status) { + case -NFS4ERR_DELAY: + rpc_delay(task, 2 * HZ); + return 0; + default: + return 1; + } +} + +static void +nfsd4_cb_getattr_release(struct nfsd4_callback *cb) +{ + struct nfs4_cb_fattr *ncf = + container_of(cb, struct nfs4_cb_fattr, ncf_getattr); + struct nfs4_delegation *dp = + container_of(ncf, struct nfs4_delegation, dl_cb_fattr); + + nfs4_put_stid(&dp->dl_stid); + clear_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags); + wake_up_bit(&ncf->ncf_cb_flags, CB_GETATTR_BUSY); +} + static const struct nfsd4_callback_ops nfsd4_cb_recall_any_ops = { .done = nfsd4_cb_recall_any_done, .release = nfsd4_cb_recall_any_release, }; +static const struct nfsd4_callback_ops nfsd4_cb_getattr_ops = { + .done = nfsd4_cb_getattr_done, + .release = nfsd4_cb_getattr_release, +}; + +void nfs4_cb_getattr(struct nfs4_cb_fattr *ncf) +{ + struct nfs4_delegation *dp = + container_of(ncf, struct nfs4_delegation, dl_cb_fattr); + + if (test_and_set_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags)) + return; + refcount_inc(&dp->dl_stid.sc_count); + nfsd4_run_cb(&ncf->ncf_getattr); +} + static struct nfs4_client *create_client(struct xdr_netobj name, struct svc_rqst *rqstp, nfs4_verifier *verf) { @@ -5634,13 +5686,15 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, struct svc_fh *parent = NULL; int cb_up; int status = 0; + struct kstat stat; + struct path path; cb_up = nfsd4_cb_channel_good(oo->oo_owner.so_client); - open->op_recall = 0; + open->op_recall = false; switch (open->op_claim_type) { case NFS4_OPEN_CLAIM_PREVIOUS: if (!cb_up) - open->op_recall = 1; + open->op_recall = true; break; case NFS4_OPEN_CLAIM_NULL: parent = currentfh; @@ -5671,6 +5725,18 @@ nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) { open->op_delegate_type = NFS4_OPEN_DELEGATE_WRITE; trace_nfsd_deleg_write(&dp->dl_stid.sc_stateid); + path.mnt = currentfh->fh_export->ex_path.mnt; + path.dentry = currentfh->fh_dentry; + if (vfs_getattr(&path, &stat, + (STATX_SIZE | STATX_CTIME | STATX_CHANGE_COOKIE), + AT_STATX_SYNC_AS_STAT)) { + nfs4_put_stid(&dp->dl_stid); + destroy_delegation(dp); + goto out_no_deleg; + } + dp->dl_cb_fattr.ncf_cur_fsize = stat.size; + dp->dl_cb_fattr.ncf_initial_cinfo = + nfsd4_change_attribute(&stat, d_inode(currentfh->fh_dentry)); } else { open->op_delegate_type = NFS4_OPEN_DELEGATE_READ; trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid); @@ -5682,7 +5748,7 @@ out_no_deleg: if (open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS && open->op_delegate_type != NFS4_OPEN_DELEGATE_NONE) { dprintk("NFSD: WARNING: refusing delegation reclaim\n"); - open->op_recall = 1; + open->op_recall = true; } /* 4.1 client asking for a delegation? */ @@ -7487,6 +7553,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_blocked_lock *nbl = NULL; struct file_lock *file_lock = NULL; struct file_lock *conflock = NULL; + struct super_block *sb; __be32 status = 0; int lkflg; int err; @@ -7508,6 +7575,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, dprintk("NFSD: nfsd4_lock: permission denied!\n"); return status; } + sb = cstate->current_fh.fh_dentry->d_sb; if (lock->lk_is_new) { if (nfsd4_has_session(cstate)) @@ -7559,7 +7627,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fp = lock_stp->st_stid.sc_file; switch (lock->lk_type) { case NFS4_READW_LT: - if (nfsd4_has_session(cstate)) + if (nfsd4_has_session(cstate) || + exportfs_lock_op_is_async(sb->s_export_op)) fl_flags |= FL_SLEEP; fallthrough; case NFS4_READ_LT: @@ -7571,7 +7640,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, fl_type = F_RDLCK; break; case NFS4_WRITEW_LT: - if (nfsd4_has_session(cstate)) + if (nfsd4_has_session(cstate) || + exportfs_lock_op_is_async(sb->s_export_op)) fl_flags |= FL_SLEEP; fallthrough; case NFS4_WRITE_LT: @@ -7599,7 +7669,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * for file locks), so don't attempt blocking lock notifications * on those filesystems: */ - if (nf->nf_file->f_op->lock) + if (!exportfs_lock_op_is_async(sb->s_export_op)) fl_flags &= ~FL_SLEEP; nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn); @@ -7705,6 +7775,14 @@ out: return status; } +void nfsd4_lock_release(union nfsd4_op_u *u) +{ + struct nfsd4_lock *lock = &u->lock; + struct nfsd4_lock_denied *deny = &lock->lk_denied; + + kfree(deny->ld_owner.data); +} + /* * The NFSv4 spec allows a client to do a LOCKT without holding an OPEN, * so we do a temporary open here just to get an open file to pass to @@ -7810,6 +7888,14 @@ out: return status; } +void nfsd4_lockt_release(union nfsd4_op_u *u) +{ + struct nfsd4_lockt *lockt = &u->lockt; + struct nfsd4_lock_denied *deny = &lockt->lt_denied; + + kfree(deny->ld_owner.data); +} + __be32 nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) @@ -8403,6 +8489,8 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, * nfsd4_deleg_getattr_conflict - Recall if GETATTR causes conflict * @rqstp: RPC transaction context * @inode: file to be checked for a conflict + * @modified: return true if file was modified + * @size: new size of file if modified is true * * This function is called when there is a conflict between a write * delegation and a change/size GETATTR from another client. The server @@ -8411,21 +8499,23 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, * delegation before replying to the GETATTR. See RFC 8881 section * 18.7.4. * - * The current implementation does not support CB_GETATTR yet. However - * this can avoid recalling the delegation could be added in follow up - * work. - * * Returns 0 if there is no conflict; otherwise an nfs_stat * code is returned. */ __be32 -nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode) +nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode, + bool *modified, u64 *size) { - __be32 status; struct file_lock_context *ctx; - struct file_lock *fl; struct nfs4_delegation *dp; + struct nfs4_cb_fattr *ncf; + struct file_lock *fl; + struct iattr attrs; + __be32 status; + might_sleep(); + + *modified = false; ctx = locks_inode_context(inode); if (!ctx) return 0; @@ -8452,10 +8542,34 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct inode *inode) break_lease: spin_unlock(&ctx->flc_lock); nfsd_stats_wdeleg_getattr_inc(); - status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); - if (status != nfserr_jukebox || - !nfsd_wait_for_delegreturn(rqstp, inode)) - return status; + + dp = fl->fl_owner; + ncf = &dp->dl_cb_fattr; + nfs4_cb_getattr(&dp->dl_cb_fattr); + wait_on_bit(&ncf->ncf_cb_flags, CB_GETATTR_BUSY, TASK_INTERRUPTIBLE); + if (ncf->ncf_cb_status) { + status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); + if (status != nfserr_jukebox || + !nfsd_wait_for_delegreturn(rqstp, inode)) + return status; + } + if (!ncf->ncf_file_modified && + (ncf->ncf_initial_cinfo != ncf->ncf_cb_change || + ncf->ncf_cur_fsize != ncf->ncf_cb_fsize)) + ncf->ncf_file_modified = true; + if (ncf->ncf_file_modified) { + /* + * The server would not update the file's metadata + * with the client's modified size. + */ + attrs.ia_mtime = attrs.ia_ctime = current_time(inode); + attrs.ia_valid = ATTR_MTIME | ATTR_CTIME; + setattr_copy(&nop_mnt_idmap, inode, &attrs); + mark_inode_dirty(inode); + ncf->ncf_cur_fsize = ncf->ncf_cb_fsize; + *size = ncf->ncf_cur_fsize; + *modified = true; + } return 0; } break; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 2e40c74d2f72..ec4ed6206df1 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2530,66 +2530,62 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) return true; } -static __be32 *encode_change(__be32 *p, struct kstat *stat, struct inode *inode, - struct svc_export *exp) +static __be32 nfsd4_encode_nfs_fh4(struct xdr_stream *xdr, + struct knfsd_fh *fh_handle) { - if (exp->ex_flags & NFSEXP_V4ROOT) { - *p++ = cpu_to_be32(convert_to_wallclock(exp->cd->flush_time)); - *p++ = 0; - } else - p = xdr_encode_hyper(p, nfsd4_change_attribute(stat, inode)); - return p; + return nfsd4_encode_opaque(xdr, fh_handle->fh_raw, fh_handle->fh_size); } +/* This is a frequently-encoded type; open-coded for speed */ static __be32 nfsd4_encode_nfstime4(struct xdr_stream *xdr, - struct timespec64 *tv) + const struct timespec64 *tv) { __be32 *p; p = xdr_reserve_space(xdr, XDR_UNIT * 3); if (!p) return nfserr_resource; - - p = xdr_encode_hyper(p, (s64)tv->tv_sec); + p = xdr_encode_hyper(p, tv->tv_sec); *p = cpu_to_be32(tv->tv_nsec); return nfs_ok; } -/* - * ctime (in NFSv4, time_metadata) is not writeable, and the client - * doesn't really care what resolution could theoretically be stored by - * the filesystem. - * - * The client cares how close together changes can be while still - * guaranteeing ctime changes. For most filesystems (which have - * timestamps with nanosecond fields) that is limited by the resolution - * of the time returned from current_time() (which I'm assuming to be - * 1/HZ). - */ -static __be32 *encode_time_delta(__be32 *p, struct inode *inode) +static __be32 nfsd4_encode_specdata4(struct xdr_stream *xdr, + unsigned int major, unsigned int minor) { - struct timespec64 ts; - u32 ns; + __be32 status; - ns = max_t(u32, NSEC_PER_SEC/HZ, inode->i_sb->s_time_gran); - ts = ns_to_timespec64(ns); + status = nfsd4_encode_uint32_t(xdr, major); + if (status != nfs_ok) + return status; + return nfsd4_encode_uint32_t(xdr, minor); +} - p = xdr_encode_hyper(p, ts.tv_sec); - *p++ = cpu_to_be32(ts.tv_nsec); +static __be32 +nfsd4_encode_change_info4(struct xdr_stream *xdr, const struct nfsd4_change_info *c) +{ + __be32 status; - return p; + status = nfsd4_encode_bool(xdr, c->atomic); + if (status != nfs_ok) + return status; + status = nfsd4_encode_changeid4(xdr, c->before_change); + if (status != nfs_ok) + return status; + return nfsd4_encode_changeid4(xdr, c->after_change); } -static __be32 -nfsd4_encode_change_info4(struct xdr_stream *xdr, struct nfsd4_change_info *c) +static __be32 nfsd4_encode_netaddr4(struct xdr_stream *xdr, + const struct nfs42_netaddr *addr) { - if (xdr_stream_encode_bool(xdr, c->atomic) < 0) - return nfserr_resource; - if (xdr_stream_encode_u64(xdr, c->before_change) < 0) - return nfserr_resource; - if (xdr_stream_encode_u64(xdr, c->after_change) < 0) - return nfserr_resource; - return nfs_ok; + __be32 status; + + /* na_r_netid */ + status = nfsd4_encode_opaque(xdr, addr->netid, addr->netid_len); + if (status != nfs_ok) + return status; + /* na_r_addr */ + return nfsd4_encode_opaque(xdr, addr->addr, addr->addr_len); } /* Encode as an array of strings the string given with components @@ -2661,9 +2657,6 @@ static __be32 nfsd4_encode_components(struct xdr_stream *xdr, char sep, return nfsd4_encode_components_esc(xdr, sep, components, 0, 0); } -/* - * encode a location element of a fs_locations structure - */ static __be32 nfsd4_encode_fs_location4(struct xdr_stream *xdr, struct nfsd4_fs_location *location) { @@ -2676,15 +2669,12 @@ static __be32 nfsd4_encode_fs_location4(struct xdr_stream *xdr, status = nfsd4_encode_components(xdr, '/', location->path); if (status) return status; - return 0; + return nfs_ok; } -/* - * Encode a path in RFC3530 'pathname4' format - */ -static __be32 nfsd4_encode_path(struct xdr_stream *xdr, - const struct path *root, - const struct path *path) +static __be32 nfsd4_encode_pathname4(struct xdr_stream *xdr, + const struct path *root, + const struct path *path) { struct path cur = *path; __be32 *p; @@ -2752,89 +2742,59 @@ out_free: return err; } -static __be32 nfsd4_encode_fsloc_fsroot(struct xdr_stream *xdr, - struct svc_rqst *rqstp, const struct path *path) +static __be32 nfsd4_encode_fs_locations4(struct xdr_stream *xdr, + struct svc_rqst *rqstp, + struct svc_export *exp) { + struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs; struct svc_export *exp_ps; - __be32 res; + unsigned int i; + __be32 status; + /* fs_root */ exp_ps = rqst_find_fsidzero_export(rqstp); if (IS_ERR(exp_ps)) return nfserrno(PTR_ERR(exp_ps)); - res = nfsd4_encode_path(xdr, &exp_ps->ex_path, path); + status = nfsd4_encode_pathname4(xdr, &exp_ps->ex_path, &exp->ex_path); exp_put(exp_ps); - return res; -} - -/* - * encode a fs_locations structure - */ -static __be32 nfsd4_encode_fs_locations(struct xdr_stream *xdr, - struct svc_rqst *rqstp, struct svc_export *exp) -{ - __be32 status; - int i; - __be32 *p; - struct nfsd4_fs_locations *fslocs = &exp->ex_fslocs; - - status = nfsd4_encode_fsloc_fsroot(xdr, rqstp, &exp->ex_path); - if (status) + if (status != nfs_ok) return status; - p = xdr_reserve_space(xdr, 4); - if (!p) + + /* locations<> */ + if (xdr_stream_encode_u32(xdr, fslocs->locations_count) != XDR_UNIT) return nfserr_resource; - *p++ = cpu_to_be32(fslocs->locations_count); - for (i=0; i<fslocs->locations_count; i++) { + for (i = 0; i < fslocs->locations_count; i++) { status = nfsd4_encode_fs_location4(xdr, &fslocs->locations[i]); - if (status) + if (status != nfs_ok) return status; } - return 0; -} -static u32 nfs4_file_type(umode_t mode) -{ - switch (mode & S_IFMT) { - case S_IFIFO: return NF4FIFO; - case S_IFCHR: return NF4CHR; - case S_IFDIR: return NF4DIR; - case S_IFBLK: return NF4BLK; - case S_IFLNK: return NF4LNK; - case S_IFREG: return NF4REG; - case S_IFSOCK: return NF4SOCK; - default: return NF4BAD; - } + return nfs_ok; } -static inline __be32 -nfsd4_encode_aclname(struct xdr_stream *xdr, struct svc_rqst *rqstp, - struct nfs4_ace *ace) +static __be32 nfsd4_encode_nfsace4(struct xdr_stream *xdr, struct svc_rqst *rqstp, + struct nfs4_ace *ace) { + __be32 status; + + /* type */ + status = nfsd4_encode_acetype4(xdr, ace->type); + if (status != nfs_ok) + return nfserr_resource; + /* flag */ + status = nfsd4_encode_aceflag4(xdr, ace->flag); + if (status != nfs_ok) + return nfserr_resource; + /* access mask */ + status = nfsd4_encode_acemask4(xdr, ace->access_mask & NFS4_ACE_MASK_ALL); + if (status != nfs_ok) + return nfserr_resource; + /* who */ if (ace->whotype != NFS4_ACL_WHO_NAMED) return nfs4_acl_write_who(xdr, ace->whotype); - else if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) + if (ace->flag & NFS4_ACE_IDENTIFIER_GROUP) return nfsd4_encode_group(xdr, rqstp, ace->who_gid); - else - return nfsd4_encode_user(xdr, rqstp, ace->who_uid); -} - -static inline __be32 -nfsd4_encode_layout_types(struct xdr_stream *xdr, u32 layout_types) -{ - __be32 *p; - unsigned long i = hweight_long(layout_types); - - p = xdr_reserve_space(xdr, 4 + 4 * i); - if (!p) - return nfserr_resource; - - *p++ = cpu_to_be32(i); - - for (i = LAYOUT_NFSV4_1_FILES; i < LAYOUT_TYPE_MAX; ++i) - if (layout_types & (1 << i)) - *p++ = cpu_to_be32(i); - - return 0; + return nfsd4_encode_user(xdr, rqstp, ace->who_uid); } #define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \ @@ -2906,12 +2866,12 @@ static int nfsd4_get_mounted_on_ino(struct svc_export *exp, u64 *pino) } static __be32 -nfsd4_encode_bitmap(struct xdr_stream *xdr, u32 bmval0, u32 bmval1, u32 bmval2) +nfsd4_encode_bitmap4(struct xdr_stream *xdr, u32 bmval0, u32 bmval1, u32 bmval2) { __be32 *p; if (bmval2) { - p = xdr_reserve_space(xdr, 16); + p = xdr_reserve_space(xdr, XDR_UNIT * 4); if (!p) goto out_resource; *p++ = cpu_to_be32(3); @@ -2919,94 +2879,687 @@ nfsd4_encode_bitmap(struct xdr_stream *xdr, u32 bmval0, u32 bmval1, u32 bmval2) *p++ = cpu_to_be32(bmval1); *p++ = cpu_to_be32(bmval2); } else if (bmval1) { - p = xdr_reserve_space(xdr, 12); + p = xdr_reserve_space(xdr, XDR_UNIT * 3); if (!p) goto out_resource; *p++ = cpu_to_be32(2); *p++ = cpu_to_be32(bmval0); *p++ = cpu_to_be32(bmval1); } else { - p = xdr_reserve_space(xdr, 8); + p = xdr_reserve_space(xdr, XDR_UNIT * 2); if (!p) goto out_resource; *p++ = cpu_to_be32(1); *p++ = cpu_to_be32(bmval0); } - return 0; + return nfs_ok; out_resource: return nfserr_resource; } +struct nfsd4_fattr_args { + struct svc_rqst *rqstp; + struct svc_fh *fhp; + struct svc_export *exp; + struct dentry *dentry; + struct kstat stat; + struct kstatfs statfs; + struct nfs4_acl *acl; + u64 size; +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL + void *context; + int contextlen; +#endif + u32 rdattr_err; + bool contextsupport; + bool ignore_crossmnt; +}; + +typedef __be32(*nfsd4_enc_attr)(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args); + +static __be32 nfsd4_encode_fattr4__noop(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfs_ok; +} + +static __be32 nfsd4_encode_fattr4__true(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_bool(xdr, true); +} + +static __be32 nfsd4_encode_fattr4__false(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_bool(xdr, false); +} + +static __be32 nfsd4_encode_fattr4_supported_attrs(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + struct nfsd4_compoundres *resp = args->rqstp->rq_resp; + u32 minorversion = resp->cstate.minorversion; + u32 supp[3]; + + memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp)); + if (!IS_POSIXACL(d_inode(args->dentry))) + supp[0] &= ~FATTR4_WORD0_ACL; + if (!args->contextsupport) + supp[2] &= ~FATTR4_WORD2_SECURITY_LABEL; + + return nfsd4_encode_bitmap4(xdr, supp[0], supp[1], supp[2]); +} + +static __be32 nfsd4_encode_fattr4_type(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, XDR_UNIT); + if (!p) + return nfserr_resource; + + switch (args->stat.mode & S_IFMT) { + case S_IFIFO: + *p = cpu_to_be32(NF4FIFO); + break; + case S_IFCHR: + *p = cpu_to_be32(NF4CHR); + break; + case S_IFDIR: + *p = cpu_to_be32(NF4DIR); + break; + case S_IFBLK: + *p = cpu_to_be32(NF4BLK); + break; + case S_IFLNK: + *p = cpu_to_be32(NF4LNK); + break; + case S_IFREG: + *p = cpu_to_be32(NF4REG); + break; + case S_IFSOCK: + *p = cpu_to_be32(NF4SOCK); + break; + default: + return nfserr_serverfault; + } + + return nfs_ok; +} + +static __be32 nfsd4_encode_fattr4_fh_expire_type(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + u32 mask; + + mask = NFS4_FH_PERSISTENT; + if (!(args->exp->ex_flags & NFSEXP_NOSUBTREECHECK)) + mask |= NFS4_FH_VOL_RENAME; + return nfsd4_encode_uint32_t(xdr, mask); +} + +static __be32 nfsd4_encode_fattr4_change(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + const struct svc_export *exp = args->exp; + u64 c; + + if (unlikely(exp->ex_flags & NFSEXP_V4ROOT)) { + u32 flush_time = convert_to_wallclock(exp->cd->flush_time); + + if (xdr_stream_encode_u32(xdr, flush_time) != XDR_UNIT) + return nfserr_resource; + if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT) + return nfserr_resource; + return nfs_ok; + } + + c = nfsd4_change_attribute(&args->stat, d_inode(args->dentry)); + return nfsd4_encode_changeid4(xdr, c); +} + +static __be32 nfsd4_encode_fattr4_size(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint64_t(xdr, args->size); +} + +static __be32 nfsd4_encode_fattr4_fsid(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, XDR_UNIT * 2 + XDR_UNIT * 2); + if (!p) + return nfserr_resource; + + if (unlikely(args->exp->ex_fslocs.migrated)) { + p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MAJOR); + xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MINOR); + return nfs_ok; + } + switch (fsid_source(args->fhp)) { + case FSIDSOURCE_FSID: + p = xdr_encode_hyper(p, (u64)args->exp->ex_fsid); + xdr_encode_hyper(p, (u64)0); + break; + case FSIDSOURCE_DEV: + *p++ = xdr_zero; + *p++ = cpu_to_be32(MAJOR(args->stat.dev)); + *p++ = xdr_zero; + *p = cpu_to_be32(MINOR(args->stat.dev)); + break; + case FSIDSOURCE_UUID: + xdr_encode_opaque_fixed(p, args->exp->ex_uuid, EX_UUID_LEN); + break; + } + + return nfs_ok; +} + +static __be32 nfsd4_encode_fattr4_lease_time(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + struct nfsd_net *nn = net_generic(SVC_NET(args->rqstp), nfsd_net_id); + + return nfsd4_encode_nfs_lease4(xdr, nn->nfsd4_lease); +} + +static __be32 nfsd4_encode_fattr4_rdattr_error(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint32_t(xdr, args->rdattr_err); +} + +static __be32 nfsd4_encode_fattr4_aclsupport(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + u32 mask; + + mask = 0; + if (IS_POSIXACL(d_inode(args->dentry))) + mask = ACL4_SUPPORT_ALLOW_ACL | ACL4_SUPPORT_DENY_ACL; + return nfsd4_encode_uint32_t(xdr, mask); +} + +static __be32 nfsd4_encode_fattr4_acl(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + struct nfs4_acl *acl = args->acl; + struct nfs4_ace *ace; + __be32 status; + + /* nfsace4<> */ + if (!acl) { + if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT) + return nfserr_resource; + } else { + if (xdr_stream_encode_u32(xdr, acl->naces) != XDR_UNIT) + return nfserr_resource; + for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) { + status = nfsd4_encode_nfsace4(xdr, args->rqstp, ace); + if (status != nfs_ok) + return status; + } + } + return nfs_ok; +} + +static __be32 nfsd4_encode_fattr4_filehandle(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_nfs_fh4(xdr, &args->fhp->fh_handle); +} + +static __be32 nfsd4_encode_fattr4_fileid(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint64_t(xdr, args->stat.ino); +} + +static __be32 nfsd4_encode_fattr4_files_avail(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint64_t(xdr, args->statfs.f_ffree); +} + +static __be32 nfsd4_encode_fattr4_files_free(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint64_t(xdr, args->statfs.f_ffree); +} + +static __be32 nfsd4_encode_fattr4_files_total(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint64_t(xdr, args->statfs.f_files); +} + +static __be32 nfsd4_encode_fattr4_fs_locations(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_fs_locations4(xdr, args->rqstp, args->exp); +} + +static __be32 nfsd4_encode_fattr4_maxfilesize(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + struct super_block *sb = args->exp->ex_path.mnt->mnt_sb; + + return nfsd4_encode_uint64_t(xdr, sb->s_maxbytes); +} + +static __be32 nfsd4_encode_fattr4_maxlink(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint32_t(xdr, 255); +} + +static __be32 nfsd4_encode_fattr4_maxname(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint32_t(xdr, args->statfs.f_namelen); +} + +static __be32 nfsd4_encode_fattr4_maxread(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint64_t(xdr, svc_max_payload(args->rqstp)); +} + +static __be32 nfsd4_encode_fattr4_maxwrite(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint64_t(xdr, svc_max_payload(args->rqstp)); +} + +static __be32 nfsd4_encode_fattr4_mode(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_mode4(xdr, args->stat.mode & S_IALLUGO); +} + +static __be32 nfsd4_encode_fattr4_numlinks(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint32_t(xdr, args->stat.nlink); +} + +static __be32 nfsd4_encode_fattr4_owner(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_user(xdr, args->rqstp, args->stat.uid); +} + +static __be32 nfsd4_encode_fattr4_owner_group(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_group(xdr, args->rqstp, args->stat.gid); +} + +static __be32 nfsd4_encode_fattr4_rawdev(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_specdata4(xdr, MAJOR(args->stat.rdev), + MINOR(args->stat.rdev)); +} + +static __be32 nfsd4_encode_fattr4_space_avail(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + u64 avail = (u64)args->statfs.f_bavail * (u64)args->statfs.f_bsize; + + return nfsd4_encode_uint64_t(xdr, avail); +} + +static __be32 nfsd4_encode_fattr4_space_free(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + u64 free = (u64)args->statfs.f_bfree * (u64)args->statfs.f_bsize; + + return nfsd4_encode_uint64_t(xdr, free); +} + +static __be32 nfsd4_encode_fattr4_space_total(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + u64 total = (u64)args->statfs.f_blocks * (u64)args->statfs.f_bsize; + + return nfsd4_encode_uint64_t(xdr, total); +} + +static __be32 nfsd4_encode_fattr4_space_used(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint64_t(xdr, (u64)args->stat.blocks << 9); +} + +static __be32 nfsd4_encode_fattr4_time_access(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_nfstime4(xdr, &args->stat.atime); +} + +static __be32 nfsd4_encode_fattr4_time_create(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_nfstime4(xdr, &args->stat.btime); +} + +/* + * ctime (in NFSv4, time_metadata) is not writeable, and the client + * doesn't really care what resolution could theoretically be stored by + * the filesystem. + * + * The client cares how close together changes can be while still + * guaranteeing ctime changes. For most filesystems (which have + * timestamps with nanosecond fields) that is limited by the resolution + * of the time returned from current_time() (which I'm assuming to be + * 1/HZ). + */ +static __be32 nfsd4_encode_fattr4_time_delta(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + const struct inode *inode = d_inode(args->dentry); + u32 ns = max_t(u32, NSEC_PER_SEC/HZ, inode->i_sb->s_time_gran); + struct timespec64 ts = ns_to_timespec64(ns); + + return nfsd4_encode_nfstime4(xdr, &ts); +} + +static __be32 nfsd4_encode_fattr4_time_metadata(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_nfstime4(xdr, &args->stat.ctime); +} + +static __be32 nfsd4_encode_fattr4_time_modify(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_nfstime4(xdr, &args->stat.mtime); +} + +static __be32 nfsd4_encode_fattr4_mounted_on_fileid(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + u64 ino; + int err; + + if (!args->ignore_crossmnt && + args->dentry == args->exp->ex_path.mnt->mnt_root) { + err = nfsd4_get_mounted_on_ino(args->exp, &ino); + if (err) + return nfserrno(err); + } else + ino = args->stat.ino; + + return nfsd4_encode_uint64_t(xdr, ino); +} + +#ifdef CONFIG_NFSD_PNFS + +static __be32 nfsd4_encode_fattr4_fs_layout_types(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + unsigned long mask = args->exp->ex_layout_types; + int i; + + /* Hamming weight of @mask is the number of layout types to return */ + if (xdr_stream_encode_u32(xdr, hweight_long(mask)) != XDR_UNIT) + return nfserr_resource; + for (i = LAYOUT_NFSV4_1_FILES; i < LAYOUT_TYPE_MAX; ++i) + if (mask & BIT(i)) { + /* layouttype4 */ + if (xdr_stream_encode_u32(xdr, i) != XDR_UNIT) + return nfserr_resource; + } + return nfs_ok; +} + +static __be32 nfsd4_encode_fattr4_layout_types(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + unsigned long mask = args->exp->ex_layout_types; + int i; + + /* Hamming weight of @mask is the number of layout types to return */ + if (xdr_stream_encode_u32(xdr, hweight_long(mask)) != XDR_UNIT) + return nfserr_resource; + for (i = LAYOUT_NFSV4_1_FILES; i < LAYOUT_TYPE_MAX; ++i) + if (mask & BIT(i)) { + /* layouttype4 */ + if (xdr_stream_encode_u32(xdr, i) != XDR_UNIT) + return nfserr_resource; + } + return nfs_ok; +} + +static __be32 nfsd4_encode_fattr4_layout_blksize(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_uint32_t(xdr, args->stat.blksize); +} + +#endif + +static __be32 nfsd4_encode_fattr4_suppattr_exclcreat(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + struct nfsd4_compoundres *resp = args->rqstp->rq_resp; + u32 supp[3]; + + memcpy(supp, nfsd_suppattrs[resp->cstate.minorversion], sizeof(supp)); + supp[0] &= NFSD_SUPPATTR_EXCLCREAT_WORD0; + supp[1] &= NFSD_SUPPATTR_EXCLCREAT_WORD1; + supp[2] &= NFSD_SUPPATTR_EXCLCREAT_WORD2; + + return nfsd4_encode_bitmap4(xdr, supp[0], supp[1], supp[2]); +} + +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL +static __be32 nfsd4_encode_fattr4_sec_label(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + return nfsd4_encode_security_label(xdr, args->rqstp, + args->context, args->contextlen); +} +#endif + +static __be32 nfsd4_encode_fattr4_xattr_support(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + int err = xattr_supports_user_prefix(d_inode(args->dentry)); + + return nfsd4_encode_bool(xdr, err == 0); +} + +static const nfsd4_enc_attr nfsd4_enc_fattr4_encode_ops[] = { + [FATTR4_SUPPORTED_ATTRS] = nfsd4_encode_fattr4_supported_attrs, + [FATTR4_TYPE] = nfsd4_encode_fattr4_type, + [FATTR4_FH_EXPIRE_TYPE] = nfsd4_encode_fattr4_fh_expire_type, + [FATTR4_CHANGE] = nfsd4_encode_fattr4_change, + [FATTR4_SIZE] = nfsd4_encode_fattr4_size, + [FATTR4_LINK_SUPPORT] = nfsd4_encode_fattr4__true, + [FATTR4_SYMLINK_SUPPORT] = nfsd4_encode_fattr4__true, + [FATTR4_NAMED_ATTR] = nfsd4_encode_fattr4__false, + [FATTR4_FSID] = nfsd4_encode_fattr4_fsid, + [FATTR4_UNIQUE_HANDLES] = nfsd4_encode_fattr4__true, + [FATTR4_LEASE_TIME] = nfsd4_encode_fattr4_lease_time, + [FATTR4_RDATTR_ERROR] = nfsd4_encode_fattr4_rdattr_error, + [FATTR4_ACL] = nfsd4_encode_fattr4_acl, + [FATTR4_ACLSUPPORT] = nfsd4_encode_fattr4_aclsupport, + [FATTR4_ARCHIVE] = nfsd4_encode_fattr4__noop, + [FATTR4_CANSETTIME] = nfsd4_encode_fattr4__true, + [FATTR4_CASE_INSENSITIVE] = nfsd4_encode_fattr4__false, + [FATTR4_CASE_PRESERVING] = nfsd4_encode_fattr4__true, + [FATTR4_CHOWN_RESTRICTED] = nfsd4_encode_fattr4__true, + [FATTR4_FILEHANDLE] = nfsd4_encode_fattr4_filehandle, + [FATTR4_FILEID] = nfsd4_encode_fattr4_fileid, + [FATTR4_FILES_AVAIL] = nfsd4_encode_fattr4_files_avail, + [FATTR4_FILES_FREE] = nfsd4_encode_fattr4_files_free, + [FATTR4_FILES_TOTAL] = nfsd4_encode_fattr4_files_total, + [FATTR4_FS_LOCATIONS] = nfsd4_encode_fattr4_fs_locations, + [FATTR4_HIDDEN] = nfsd4_encode_fattr4__noop, + [FATTR4_HOMOGENEOUS] = nfsd4_encode_fattr4__true, + [FATTR4_MAXFILESIZE] = nfsd4_encode_fattr4_maxfilesize, + [FATTR4_MAXLINK] = nfsd4_encode_fattr4_maxlink, + [FATTR4_MAXNAME] = nfsd4_encode_fattr4_maxname, + [FATTR4_MAXREAD] = nfsd4_encode_fattr4_maxread, + [FATTR4_MAXWRITE] = nfsd4_encode_fattr4_maxwrite, + [FATTR4_MIMETYPE] = nfsd4_encode_fattr4__noop, + [FATTR4_MODE] = nfsd4_encode_fattr4_mode, + [FATTR4_NO_TRUNC] = nfsd4_encode_fattr4__true, + [FATTR4_NUMLINKS] = nfsd4_encode_fattr4_numlinks, + [FATTR4_OWNER] = nfsd4_encode_fattr4_owner, + [FATTR4_OWNER_GROUP] = nfsd4_encode_fattr4_owner_group, + [FATTR4_QUOTA_AVAIL_HARD] = nfsd4_encode_fattr4__noop, + [FATTR4_QUOTA_AVAIL_SOFT] = nfsd4_encode_fattr4__noop, + [FATTR4_QUOTA_USED] = nfsd4_encode_fattr4__noop, + [FATTR4_RAWDEV] = nfsd4_encode_fattr4_rawdev, + [FATTR4_SPACE_AVAIL] = nfsd4_encode_fattr4_space_avail, + [FATTR4_SPACE_FREE] = nfsd4_encode_fattr4_space_free, + [FATTR4_SPACE_TOTAL] = nfsd4_encode_fattr4_space_total, + [FATTR4_SPACE_USED] = nfsd4_encode_fattr4_space_used, + [FATTR4_SYSTEM] = nfsd4_encode_fattr4__noop, + [FATTR4_TIME_ACCESS] = nfsd4_encode_fattr4_time_access, + [FATTR4_TIME_ACCESS_SET] = nfsd4_encode_fattr4__noop, + [FATTR4_TIME_BACKUP] = nfsd4_encode_fattr4__noop, + [FATTR4_TIME_CREATE] = nfsd4_encode_fattr4_time_create, + [FATTR4_TIME_DELTA] = nfsd4_encode_fattr4_time_delta, + [FATTR4_TIME_METADATA] = nfsd4_encode_fattr4_time_metadata, + [FATTR4_TIME_MODIFY] = nfsd4_encode_fattr4_time_modify, + [FATTR4_TIME_MODIFY_SET] = nfsd4_encode_fattr4__noop, + [FATTR4_MOUNTED_ON_FILEID] = nfsd4_encode_fattr4_mounted_on_fileid, + [FATTR4_DIR_NOTIF_DELAY] = nfsd4_encode_fattr4__noop, + [FATTR4_DIRENT_NOTIF_DELAY] = nfsd4_encode_fattr4__noop, + [FATTR4_DACL] = nfsd4_encode_fattr4__noop, + [FATTR4_SACL] = nfsd4_encode_fattr4__noop, + [FATTR4_CHANGE_POLICY] = nfsd4_encode_fattr4__noop, + [FATTR4_FS_STATUS] = nfsd4_encode_fattr4__noop, + +#ifdef CONFIG_NFSD_PNFS + [FATTR4_FS_LAYOUT_TYPES] = nfsd4_encode_fattr4_fs_layout_types, + [FATTR4_LAYOUT_HINT] = nfsd4_encode_fattr4__noop, + [FATTR4_LAYOUT_TYPES] = nfsd4_encode_fattr4_layout_types, + [FATTR4_LAYOUT_BLKSIZE] = nfsd4_encode_fattr4_layout_blksize, + [FATTR4_LAYOUT_ALIGNMENT] = nfsd4_encode_fattr4__noop, +#else + [FATTR4_FS_LAYOUT_TYPES] = nfsd4_encode_fattr4__noop, + [FATTR4_LAYOUT_HINT] = nfsd4_encode_fattr4__noop, + [FATTR4_LAYOUT_TYPES] = nfsd4_encode_fattr4__noop, + [FATTR4_LAYOUT_BLKSIZE] = nfsd4_encode_fattr4__noop, + [FATTR4_LAYOUT_ALIGNMENT] = nfsd4_encode_fattr4__noop, +#endif + + [FATTR4_FS_LOCATIONS_INFO] = nfsd4_encode_fattr4__noop, + [FATTR4_MDSTHRESHOLD] = nfsd4_encode_fattr4__noop, + [FATTR4_RETENTION_GET] = nfsd4_encode_fattr4__noop, + [FATTR4_RETENTION_SET] = nfsd4_encode_fattr4__noop, + [FATTR4_RETENTEVT_GET] = nfsd4_encode_fattr4__noop, + [FATTR4_RETENTEVT_SET] = nfsd4_encode_fattr4__noop, + [FATTR4_RETENTION_HOLD] = nfsd4_encode_fattr4__noop, + [FATTR4_MODE_SET_MASKED] = nfsd4_encode_fattr4__noop, + [FATTR4_SUPPATTR_EXCLCREAT] = nfsd4_encode_fattr4_suppattr_exclcreat, + [FATTR4_FS_CHARSET_CAP] = nfsd4_encode_fattr4__noop, + [FATTR4_CLONE_BLKSIZE] = nfsd4_encode_fattr4__noop, + [FATTR4_SPACE_FREED] = nfsd4_encode_fattr4__noop, + [FATTR4_CHANGE_ATTR_TYPE] = nfsd4_encode_fattr4__noop, + +#ifdef CONFIG_NFSD_V4_SECURITY_LABEL + [FATTR4_SEC_LABEL] = nfsd4_encode_fattr4_sec_label, +#else + [FATTR4_SEC_LABEL] = nfsd4_encode_fattr4__noop, +#endif + + [FATTR4_MODE_UMASK] = nfsd4_encode_fattr4__noop, + [FATTR4_XATTR_SUPPORT] = nfsd4_encode_fattr4_xattr_support, +}; + /* * Note: @fhp can be NULL; in this case, we might have to compose the filehandle * ourselves. */ static __be32 -nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, - struct svc_export *exp, - struct dentry *dentry, u32 *bmval, - struct svc_rqst *rqstp, int ignore_crossmnt) +nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr, + struct svc_fh *fhp, struct svc_export *exp, + struct dentry *dentry, const u32 *bmval, + int ignore_crossmnt) { - u32 bmval0 = bmval[0]; - u32 bmval1 = bmval[1]; - u32 bmval2 = bmval[2]; - struct kstat stat; + struct nfsd4_fattr_args args; struct svc_fh *tempfh = NULL; - struct kstatfs statfs; - __be32 *p, *attrlen_p; int starting_len = xdr->buf->len; + __be32 *attrlen_p, status; int attrlen_offset; - u32 dummy; - u64 dummy64; - u32 rdattr_err = 0; - __be32 status; int err; - struct nfs4_acl *acl = NULL; -#ifdef CONFIG_NFSD_V4_SECURITY_LABEL - void *context = NULL; - int contextlen; -#endif - bool contextsupport = false; struct nfsd4_compoundres *resp = rqstp->rq_resp; u32 minorversion = resp->cstate.minorversion; struct path path = { .mnt = exp->ex_path.mnt, .dentry = dentry, }; - struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + union { + u32 attrmask[3]; + unsigned long mask[2]; + } u; + bool file_modified; + unsigned long bit; + u64 size = 0; + + WARN_ON_ONCE(bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1); + WARN_ON_ONCE(!nfsd_attrs_supported(minorversion, bmval)); + + args.rqstp = rqstp; + args.exp = exp; + args.dentry = dentry; + args.ignore_crossmnt = (ignore_crossmnt != 0); - BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1); - BUG_ON(!nfsd_attrs_supported(minorversion, bmval)); + /* + * Make a local copy of the attribute bitmap that can be modified. + */ + memset(&u, 0, sizeof(u)); + u.attrmask[0] = bmval[0]; + u.attrmask[1] = bmval[1]; + u.attrmask[2] = bmval[2]; + args.rdattr_err = 0; if (exp->ex_fslocs.migrated) { - status = fattr_handle_absent_fs(&bmval0, &bmval1, &bmval2, &rdattr_err); + status = fattr_handle_absent_fs(&u.attrmask[0], &u.attrmask[1], + &u.attrmask[2], &args.rdattr_err); if (status) goto out; } - if (bmval0 & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) { - status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry)); + args.size = 0; + if (u.attrmask[0] & (FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE)) { + status = nfsd4_deleg_getattr_conflict(rqstp, d_inode(dentry), + &file_modified, &size); if (status) goto out; } - err = vfs_getattr(&path, &stat, + err = vfs_getattr(&path, &args.stat, STATX_BASIC_STATS | STATX_BTIME | STATX_CHANGE_COOKIE, AT_STATX_SYNC_AS_STAT); if (err) goto out_nfserr; - if (!(stat.result_mask & STATX_BTIME)) + args.size = file_modified ? size : args.stat.size; + + if (!(args.stat.result_mask & STATX_BTIME)) /* underlying FS does not offer btime so we can't share it */ - bmval1 &= ~FATTR4_WORD1_TIME_CREATE; - if ((bmval0 & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE | + u.attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE; + if ((u.attrmask[0] & (FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE | FATTR4_WORD0_FILES_TOTAL | FATTR4_WORD0_MAXNAME)) || - (bmval1 & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | + (u.attrmask[1] & (FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL))) { - err = vfs_statfs(&path, &statfs); + err = vfs_statfs(&path, &args.statfs); if (err) goto out_nfserr; } - if ((bmval0 & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && !fhp) { + if ((u.attrmask[0] & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) && + !fhp) { tempfh = kmalloc(sizeof(struct svc_fh), GFP_KERNEL); status = nfserr_jukebox; if (!tempfh) @@ -3015,12 +3568,15 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, status = fh_compose(tempfh, exp, dentry, NULL); if (status) goto out; - fhp = tempfh; - } - if (bmval0 & FATTR4_WORD0_ACL) { - err = nfsd4_get_nfs4_acl(rqstp, dentry, &acl); + args.fhp = tempfh; + } else + args.fhp = fhp; + + args.acl = NULL; + if (u.attrmask[0] & FATTR4_WORD0_ACL) { + err = nfsd4_get_nfs4_acl(rqstp, dentry, &args.acl); if (err == -EOPNOTSUPP) - bmval0 &= ~FATTR4_WORD0_ACL; + u.attrmask[0] &= ~FATTR4_WORD0_ACL; else if (err == -EINVAL) { status = nfserr_attrnotsupp; goto out; @@ -3028,452 +3584,53 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, goto out_nfserr; } + args.contextsupport = false; + #ifdef CONFIG_NFSD_V4_SECURITY_LABEL - if ((bmval2 & FATTR4_WORD2_SECURITY_LABEL) || - bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { + args.context = NULL; + if ((u.attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) || + u.attrmask[0] & FATTR4_WORD0_SUPPORTED_ATTRS) { if (exp->ex_flags & NFSEXP_SECURITY_LABEL) err = security_inode_getsecctx(d_inode(dentry), - &context, &contextlen); + &args.context, &args.contextlen); else err = -EOPNOTSUPP; - contextsupport = (err == 0); - if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { + args.contextsupport = (err == 0); + if (u.attrmask[2] & FATTR4_WORD2_SECURITY_LABEL) { if (err == -EOPNOTSUPP) - bmval2 &= ~FATTR4_WORD2_SECURITY_LABEL; + u.attrmask[2] &= ~FATTR4_WORD2_SECURITY_LABEL; else if (err) goto out_nfserr; } } #endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ - status = nfsd4_encode_bitmap(xdr, bmval0, bmval1, bmval2); + /* attrmask */ + status = nfsd4_encode_bitmap4(xdr, u.attrmask[0], + u.attrmask[1], u.attrmask[2]); if (status) goto out; + /* attr_vals */ attrlen_offset = xdr->buf->len; attrlen_p = xdr_reserve_space(xdr, XDR_UNIT); if (!attrlen_p) goto out_resource; - - if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { - u32 supp[3]; - - memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp)); - - if (!IS_POSIXACL(dentry->d_inode)) - supp[0] &= ~FATTR4_WORD0_ACL; - if (!contextsupport) - supp[2] &= ~FATTR4_WORD2_SECURITY_LABEL; - if (!supp[2]) { - p = xdr_reserve_space(xdr, 12); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(2); - *p++ = cpu_to_be32(supp[0]); - *p++ = cpu_to_be32(supp[1]); - } else { - p = xdr_reserve_space(xdr, 16); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(3); - *p++ = cpu_to_be32(supp[0]); - *p++ = cpu_to_be32(supp[1]); - *p++ = cpu_to_be32(supp[2]); - } - } - if (bmval0 & FATTR4_WORD0_TYPE) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - dummy = nfs4_file_type(stat.mode); - if (dummy == NF4BAD) { - status = nfserr_serverfault; + for_each_set_bit(bit, (const unsigned long *)&u.mask, + ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops)) { + status = nfsd4_enc_fattr4_encode_ops[bit](xdr, &args); + if (status != nfs_ok) goto out; - } - *p++ = cpu_to_be32(dummy); - } - if (bmval0 & FATTR4_WORD0_FH_EXPIRE_TYPE) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - if (exp->ex_flags & NFSEXP_NOSUBTREECHECK) - *p++ = cpu_to_be32(NFS4_FH_PERSISTENT); - else - *p++ = cpu_to_be32(NFS4_FH_PERSISTENT| - NFS4_FH_VOL_RENAME); } - if (bmval0 & FATTR4_WORD0_CHANGE) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - p = encode_change(p, &stat, d_inode(dentry), exp); - } - if (bmval0 & FATTR4_WORD0_SIZE) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, stat.size); - } - if (bmval0 & FATTR4_WORD0_LINK_SUPPORT) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(1); - } - if (bmval0 & FATTR4_WORD0_SYMLINK_SUPPORT) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(1); - } - if (bmval0 & FATTR4_WORD0_NAMED_ATTR) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(0); - } - if (bmval0 & FATTR4_WORD0_FSID) { - p = xdr_reserve_space(xdr, 16); - if (!p) - goto out_resource; - if (exp->ex_fslocs.migrated) { - p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MAJOR); - p = xdr_encode_hyper(p, NFS4_REFERRAL_FSID_MINOR); - } else switch(fsid_source(fhp)) { - case FSIDSOURCE_FSID: - p = xdr_encode_hyper(p, (u64)exp->ex_fsid); - p = xdr_encode_hyper(p, (u64)0); - break; - case FSIDSOURCE_DEV: - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(MAJOR(stat.dev)); - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(MINOR(stat.dev)); - break; - case FSIDSOURCE_UUID: - p = xdr_encode_opaque_fixed(p, exp->ex_uuid, - EX_UUID_LEN); - break; - } - } - if (bmval0 & FATTR4_WORD0_UNIQUE_HANDLES) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(0); - } - if (bmval0 & FATTR4_WORD0_LEASE_TIME) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(nn->nfsd4_lease); - } - if (bmval0 & FATTR4_WORD0_RDATTR_ERROR) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(rdattr_err); - } - if (bmval0 & FATTR4_WORD0_ACL) { - struct nfs4_ace *ace; - - if (acl == NULL) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - - *p++ = cpu_to_be32(0); - goto out_acl; - } - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(acl->naces); - - for (ace = acl->aces; ace < acl->aces + acl->naces; ace++) { - p = xdr_reserve_space(xdr, 4*3); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(ace->type); - *p++ = cpu_to_be32(ace->flag); - *p++ = cpu_to_be32(ace->access_mask & - NFS4_ACE_MASK_ALL); - status = nfsd4_encode_aclname(xdr, rqstp, ace); - if (status) - goto out; - } - } -out_acl: - if (bmval0 & FATTR4_WORD0_ACLSUPPORT) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(IS_POSIXACL(dentry->d_inode) ? - ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL : 0); - } - if (bmval0 & FATTR4_WORD0_CANSETTIME) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(1); - } - if (bmval0 & FATTR4_WORD0_CASE_INSENSITIVE) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(0); - } - if (bmval0 & FATTR4_WORD0_CASE_PRESERVING) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(1); - } - if (bmval0 & FATTR4_WORD0_CHOWN_RESTRICTED) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(1); - } - if (bmval0 & FATTR4_WORD0_FILEHANDLE) { - p = xdr_reserve_space(xdr, fhp->fh_handle.fh_size + 4); - if (!p) - goto out_resource; - p = xdr_encode_opaque(p, &fhp->fh_handle.fh_raw, - fhp->fh_handle.fh_size); - } - if (bmval0 & FATTR4_WORD0_FILEID) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, stat.ino); - } - if (bmval0 & FATTR4_WORD0_FILES_AVAIL) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (u64) statfs.f_ffree); - } - if (bmval0 & FATTR4_WORD0_FILES_FREE) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (u64) statfs.f_ffree); - } - if (bmval0 & FATTR4_WORD0_FILES_TOTAL) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (u64) statfs.f_files); - } - if (bmval0 & FATTR4_WORD0_FS_LOCATIONS) { - status = nfsd4_encode_fs_locations(xdr, rqstp, exp); - if (status) - goto out; - } - if (bmval0 & FATTR4_WORD0_HOMOGENEOUS) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(1); - } - if (bmval0 & FATTR4_WORD0_MAXFILESIZE) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, exp->ex_path.mnt->mnt_sb->s_maxbytes); - } - if (bmval0 & FATTR4_WORD0_MAXLINK) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(255); - } - if (bmval0 & FATTR4_WORD0_MAXNAME) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(statfs.f_namelen); - } - if (bmval0 & FATTR4_WORD0_MAXREAD) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp)); - } - if (bmval0 & FATTR4_WORD0_MAXWRITE) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - p = xdr_encode_hyper(p, (u64) svc_max_payload(rqstp)); - } - if (bmval1 & FATTR4_WORD1_MODE) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(stat.mode & S_IALLUGO); - } - if (bmval1 & FATTR4_WORD1_NO_TRUNC) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(1); - } - if (bmval1 & FATTR4_WORD1_NUMLINKS) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(stat.nlink); - } - if (bmval1 & FATTR4_WORD1_OWNER) { - status = nfsd4_encode_user(xdr, rqstp, stat.uid); - if (status) - goto out; - } - if (bmval1 & FATTR4_WORD1_OWNER_GROUP) { - status = nfsd4_encode_group(xdr, rqstp, stat.gid); - if (status) - goto out; - } - if (bmval1 & FATTR4_WORD1_RAWDEV) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - *p++ = cpu_to_be32((u32) MAJOR(stat.rdev)); - *p++ = cpu_to_be32((u32) MINOR(stat.rdev)); - } - if (bmval1 & FATTR4_WORD1_SPACE_AVAIL) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - dummy64 = (u64)statfs.f_bavail * (u64)statfs.f_bsize; - p = xdr_encode_hyper(p, dummy64); - } - if (bmval1 & FATTR4_WORD1_SPACE_FREE) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - dummy64 = (u64)statfs.f_bfree * (u64)statfs.f_bsize; - p = xdr_encode_hyper(p, dummy64); - } - if (bmval1 & FATTR4_WORD1_SPACE_TOTAL) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - dummy64 = (u64)statfs.f_blocks * (u64)statfs.f_bsize; - p = xdr_encode_hyper(p, dummy64); - } - if (bmval1 & FATTR4_WORD1_SPACE_USED) { - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - dummy64 = (u64)stat.blocks << 9; - p = xdr_encode_hyper(p, dummy64); - } - if (bmval1 & FATTR4_WORD1_TIME_ACCESS) { - status = nfsd4_encode_nfstime4(xdr, &stat.atime); - if (status) - goto out; - } - if (bmval1 & FATTR4_WORD1_TIME_CREATE) { - status = nfsd4_encode_nfstime4(xdr, &stat.btime); - if (status) - goto out; - } - if (bmval1 & FATTR4_WORD1_TIME_DELTA) { - p = xdr_reserve_space(xdr, 12); - if (!p) - goto out_resource; - p = encode_time_delta(p, d_inode(dentry)); - } - if (bmval1 & FATTR4_WORD1_TIME_METADATA) { - status = nfsd4_encode_nfstime4(xdr, &stat.ctime); - if (status) - goto out; - } - if (bmval1 & FATTR4_WORD1_TIME_MODIFY) { - status = nfsd4_encode_nfstime4(xdr, &stat.mtime); - if (status) - goto out; - } - if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) { - u64 ino = stat.ino; - - p = xdr_reserve_space(xdr, 8); - if (!p) - goto out_resource; - /* - * Get ino of mountpoint in parent filesystem, if not ignoring - * crossmount and this is the root of a cross-mounted - * filesystem. - */ - if (ignore_crossmnt == 0 && - dentry == exp->ex_path.mnt->mnt_root) { - err = nfsd4_get_mounted_on_ino(exp, &ino); - if (err) - goto out_nfserr; - } - p = xdr_encode_hyper(p, ino); - } -#ifdef CONFIG_NFSD_PNFS - if (bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) { - status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types); - if (status) - goto out; - } - - if (bmval2 & FATTR4_WORD2_LAYOUT_TYPES) { - status = nfsd4_encode_layout_types(xdr, exp->ex_layout_types); - if (status) - goto out; - } - - if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - *p++ = cpu_to_be32(stat.blksize); - } -#endif /* CONFIG_NFSD_PNFS */ - if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { - u32 supp[3]; - - memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp)); - supp[0] &= NFSD_SUPPATTR_EXCLCREAT_WORD0; - supp[1] &= NFSD_SUPPATTR_EXCLCREAT_WORD1; - supp[2] &= NFSD_SUPPATTR_EXCLCREAT_WORD2; - - status = nfsd4_encode_bitmap(xdr, supp[0], supp[1], supp[2]); - if (status) - goto out; - } - -#ifdef CONFIG_NFSD_V4_SECURITY_LABEL - if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { - status = nfsd4_encode_security_label(xdr, rqstp, context, - contextlen); - if (status) - goto out; - } -#endif - - if (bmval2 & FATTR4_WORD2_XATTR_SUPPORT) { - p = xdr_reserve_space(xdr, 4); - if (!p) - goto out_resource; - err = xattr_supports_user_prefix(d_inode(dentry)); - *p++ = cpu_to_be32(err == 0); - } - *attrlen_p = cpu_to_be32(xdr->buf->len - attrlen_offset - XDR_UNIT); status = nfs_ok; out: #ifdef CONFIG_NFSD_V4_SECURITY_LABEL - if (context) - security_release_secctx(context, contextlen); + if (args.context) + security_release_secctx(args.context, args.contextlen); #endif /* CONFIG_NFSD_V4_SECURITY_LABEL */ - kfree(acl); + kfree(args.acl); if (tempfh) { fh_put(tempfh); kfree(tempfh); @@ -3514,12 +3671,28 @@ __be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words, __be32 ret; svcxdr_init_encode_from_buffer(&xdr, &dummy, *p, words << 2); - ret = nfsd4_encode_fattr(&xdr, fhp, exp, dentry, bmval, rqstp, - ignore_crossmnt); + ret = nfsd4_encode_fattr4(rqstp, &xdr, fhp, exp, dentry, bmval, + ignore_crossmnt); *p = xdr.p; return ret; } +/* + * The buffer space for this field was reserved during a previous + * call to nfsd4_encode_entry4(). + */ +static void nfsd4_encode_entry4_nfs_cookie4(const struct nfsd4_readdir *readdir, + u64 offset) +{ + __be64 cookie = cpu_to_be64(offset); + struct xdr_stream *xdr = readdir->xdr; + + if (!readdir->cookie_offset) + return; + write_bytes_to_xdr_buf(xdr->buf, readdir->cookie_offset, &cookie, + sizeof(cookie)); +} + static inline int attributes_need_mount(u32 *bmval) { if (bmval[0] & ~(FATTR4_WORD0_RDATTR_ERROR | FATTR4_WORD0_LEASE_TIME)) @@ -3530,8 +3703,8 @@ static inline int attributes_need_mount(u32 *bmval) } static __be32 -nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd, - const char *name, int namlen) +nfsd4_encode_entry4_fattr(struct nfsd4_readdir *cd, const char *name, + int namlen) { struct svc_export *exp = cd->rd_fhp->fh_export; struct dentry *dentry; @@ -3574,33 +3747,34 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd, } out_encode: - nfserr = nfsd4_encode_fattr(xdr, NULL, exp, dentry, cd->rd_bmval, - cd->rd_rqstp, ignore_crossmnt); + nfserr = nfsd4_encode_fattr4(cd->rd_rqstp, cd->xdr, NULL, exp, dentry, + cd->rd_bmval, ignore_crossmnt); out_put: dput(dentry); exp_put(exp); return nfserr; } -static __be32 * -nfsd4_encode_rdattr_error(struct xdr_stream *xdr, __be32 nfserr) +static __be32 +nfsd4_encode_entry4_rdattr_error(struct xdr_stream *xdr, __be32 nfserr) { - __be32 *p; - - p = xdr_reserve_space(xdr, 20); - if (!p) - return NULL; - *p++ = htonl(2); - *p++ = htonl(FATTR4_WORD0_RDATTR_ERROR); /* bmval0 */ - *p++ = htonl(0); /* bmval1 */ + __be32 status; - *p++ = htonl(4); /* attribute length */ - *p++ = nfserr; /* no htonl */ - return p; + /* attrmask */ + status = nfsd4_encode_bitmap4(xdr, FATTR4_WORD0_RDATTR_ERROR, 0, 0); + if (status != nfs_ok) + return status; + /* attr_vals */ + if (xdr_stream_encode_u32(xdr, XDR_UNIT) != XDR_UNIT) + return nfserr_resource; + /* rdattr_error */ + if (xdr_stream_encode_be32(xdr, nfserr) != XDR_UNIT) + return nfserr_resource; + return nfs_ok; } static int -nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, +nfsd4_encode_entry4(void *ccdv, const char *name, int namlen, loff_t offset, u64 ino, unsigned int d_type) { struct readdir_cd *ccd = ccdv; @@ -3611,8 +3785,6 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, u32 name_and_cookie; int entry_bytes; __be32 nfserr = nfserr_toosmall; - __be64 wire_offset; - __be32 *p; /* In nfsv4, "." and ".." never make it onto the wire.. */ if (name && isdotent(name, namlen)) { @@ -3620,24 +3792,19 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, return 0; } - if (cd->cookie_offset) { - wire_offset = cpu_to_be64(offset); - write_bytes_to_xdr_buf(xdr->buf, cd->cookie_offset, - &wire_offset, 8); - } + /* Encode the previous entry's cookie value */ + nfsd4_encode_entry4_nfs_cookie4(cd, offset); - p = xdr_reserve_space(xdr, 4); - if (!p) + if (xdr_stream_encode_item_present(xdr) != XDR_UNIT) goto fail; - *p++ = xdr_one; /* mark entry present */ + + /* Reserve send buffer space for this entry's cookie value. */ cookie_offset = xdr->buf->len; - p = xdr_reserve_space(xdr, 3*4 + namlen); - if (!p) + if (nfsd4_encode_nfs_cookie4(xdr, OFFSET_MAX) != nfs_ok) goto fail; - p = xdr_encode_hyper(p, OFFSET_MAX); /* offset of next entry */ - p = xdr_encode_array(p, name, namlen); /* name length & name */ - - nfserr = nfsd4_encode_dirent_fattr(xdr, cd, name, namlen); + if (nfsd4_encode_component4(xdr, name, namlen) != nfs_ok) + goto fail; + nfserr = nfsd4_encode_entry4_fattr(cd, name, namlen); switch (nfserr) { case nfs_ok: break; @@ -3668,8 +3835,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, */ if (!(cd->rd_bmval[0] & FATTR4_WORD0_RDATTR_ERROR)) goto fail; - p = nfsd4_encode_rdattr_error(xdr, nfserr); - if (p == NULL) { + if (nfsd4_encode_entry4_rdattr_error(xdr, nfserr)) { nfserr = nfserr_toosmall; goto fail; } @@ -3727,18 +3893,26 @@ nfsd4_encode_clientid4(struct xdr_stream *xdr, const clientid_t *clientid) return nfs_ok; } +/* This is a frequently-encoded item; open-coded for speed */ static __be32 -nfsd4_encode_stateid(struct xdr_stream *xdr, stateid_t *sid) +nfsd4_encode_stateid4(struct xdr_stream *xdr, const stateid_t *sid) { __be32 *p; - p = xdr_reserve_space(xdr, sizeof(stateid_t)); + p = xdr_reserve_space(xdr, NFS4_STATEID_SIZE); if (!p) return nfserr_resource; *p++ = cpu_to_be32(sid->si_generation); - p = xdr_encode_opaque_fixed(p, &sid->si_opaque, - sizeof(stateid_opaque_t)); - return 0; + memcpy(p, &sid->si_opaque, sizeof(sid->si_opaque)); + return nfs_ok; +} + +static __be32 +nfsd4_encode_sessionid4(struct xdr_stream *xdr, + const struct nfs4_sessionid *sessionid) +{ + return nfsd4_encode_opaque_fixed(xdr, sessionid->data, + NFS4_MAX_SESSIONID_LEN); } static __be32 @@ -3747,14 +3921,14 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_access *access = &u->access; struct xdr_stream *xdr = resp->xdr; - __be32 *p; + __be32 status; - p = xdr_reserve_space(xdr, 8); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(access->ac_supported); - *p++ = cpu_to_be32(access->ac_resp_access); - return 0; + /* supported */ + status = nfsd4_encode_uint32_t(xdr, access->ac_supported); + if (status != nfs_ok) + return status; + /* access */ + return nfsd4_encode_uint32_t(xdr, access->ac_resp_access); } static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, @@ -3762,17 +3936,16 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, { struct nfsd4_bind_conn_to_session *bcts = &u->bind_conn_to_session; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 8); - if (!p) + /* bctsr_sessid */ + nfserr = nfsd4_encode_sessionid4(xdr, &bcts->sessionid); + if (nfserr != nfs_ok) + return nfserr; + /* bctsr_dir */ + if (xdr_stream_encode_u32(xdr, bcts->dir) != XDR_UNIT) return nfserr_resource; - p = xdr_encode_opaque_fixed(p, bcts->sessionid.data, - NFS4_MAX_SESSIONID_LEN); - *p++ = cpu_to_be32(bcts->dir); - /* Upshifting from TCP to RDMA is not supported */ - *p++ = cpu_to_be32(0); - return 0; + /* bctsr_use_conn_in_rdma_mode */ + return nfsd4_encode_bool(xdr, false); } static __be32 @@ -3782,7 +3955,8 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_close *close = &u->close; struct xdr_stream *xdr = resp->xdr; - return nfsd4_encode_stateid(xdr, &close->cl_stateid); + /* open_stateid */ + return nfsd4_encode_stateid4(xdr, &close->cl_stateid); } @@ -3802,11 +3976,13 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_create *create = &u->create; struct xdr_stream *xdr = resp->xdr; + /* cinfo */ nfserr = nfsd4_encode_change_info4(xdr, &create->cr_cinfo); if (nfserr) return nfserr; - return nfsd4_encode_bitmap(xdr, create->cr_bmval[0], - create->cr_bmval[1], create->cr_bmval[2]); + /* attrset */ + return nfsd4_encode_bitmap4(xdr, create->cr_bmval[0], + create->cr_bmval[1], create->cr_bmval[2]); } static __be32 @@ -3817,65 +3993,56 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh *fhp = getattr->ga_fhp; struct xdr_stream *xdr = resp->xdr; - return nfsd4_encode_fattr(xdr, fhp, fhp->fh_export, fhp->fh_dentry, - getattr->ga_bmval, resp->rqstp, 0); + /* obj_attributes */ + return nfsd4_encode_fattr4(resp->rqstp, xdr, fhp, fhp->fh_export, + fhp->fh_dentry, getattr->ga_bmval, 0); } static __be32 nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, union nfsd4_op_u *u) { - struct svc_fh **fhpp = &u->getfh; struct xdr_stream *xdr = resp->xdr; - struct svc_fh *fhp = *fhpp; - unsigned int len; - __be32 *p; + struct svc_fh *fhp = u->getfh; - len = fhp->fh_handle.fh_size; - p = xdr_reserve_space(xdr, len + 4); - if (!p) - return nfserr_resource; - p = xdr_encode_opaque(p, &fhp->fh_handle.fh_raw, len); - return 0; + /* object */ + return nfsd4_encode_nfs_fh4(xdr, &fhp->fh_handle); } -/* -* Including all fields other than the name, a LOCK4denied structure requires -* 8(clientid) + 4(namelen) + 8(offset) + 8(length) + 4(type) = 32 bytes. -*/ static __be32 -nfsd4_encode_lock_denied(struct xdr_stream *xdr, struct nfsd4_lock_denied *ld) +nfsd4_encode_lock_owner4(struct xdr_stream *xdr, const clientid_t *clientid, + const struct xdr_netobj *owner) { - struct xdr_netobj *conf = &ld->ld_owner; - __be32 *p; + __be32 status; -again: - p = xdr_reserve_space(xdr, 32 + XDR_LEN(conf->len)); - if (!p) { - /* - * Don't fail to return the result just because we can't - * return the conflicting open: - */ - if (conf->len) { - kfree(conf->data); - conf->len = 0; - conf->data = NULL; - goto again; - } + /* clientid */ + status = nfsd4_encode_clientid4(xdr, clientid); + if (status != nfs_ok) + return status; + /* owner */ + return nfsd4_encode_opaque(xdr, owner->data, owner->len); +} + +static __be32 +nfsd4_encode_lock4denied(struct xdr_stream *xdr, + const struct nfsd4_lock_denied *ld) +{ + __be32 status; + + /* offset */ + status = nfsd4_encode_offset4(xdr, ld->ld_start); + if (status != nfs_ok) + return status; + /* length */ + status = nfsd4_encode_length4(xdr, ld->ld_length); + if (status != nfs_ok) + return status; + /* locktype */ + if (xdr_stream_encode_u32(xdr, ld->ld_type) != XDR_UNIT) return nfserr_resource; - } - p = xdr_encode_hyper(p, ld->ld_start); - p = xdr_encode_hyper(p, ld->ld_length); - *p++ = cpu_to_be32(ld->ld_type); - if (conf->len) { - p = xdr_encode_opaque_fixed(p, &ld->ld_clientid, 8); - p = xdr_encode_opaque(p, conf->data, conf->len); - kfree(conf->data); - } else { /* non - nfsv4 lock in conflict, no clientid nor owner */ - p = xdr_encode_hyper(p, (u64)0); /* clientid */ - *p++ = cpu_to_be32(0); /* length of owner name */ - } - return nfserr_denied; + /* owner */ + return nfsd4_encode_lock_owner4(xdr, &ld->ld_clientid, + &ld->ld_owner); } static __be32 @@ -3884,13 +4051,21 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_lock *lock = &u->lock; struct xdr_stream *xdr = resp->xdr; + __be32 status; - if (!nfserr) - nfserr = nfsd4_encode_stateid(xdr, &lock->lk_resp_stateid); - else if (nfserr == nfserr_denied) - nfserr = nfsd4_encode_lock_denied(xdr, &lock->lk_denied); - - return nfserr; + switch (nfserr) { + case nfs_ok: + /* resok4 */ + status = nfsd4_encode_stateid4(xdr, &lock->lk_resp_stateid); + break; + case nfserr_denied: + /* denied */ + status = nfsd4_encode_lock4denied(xdr, &lock->lk_denied); + break; + default: + return nfserr; + } + return status != nfs_ok ? status : nfserr; } static __be32 @@ -3899,9 +4074,14 @@ nfsd4_encode_lockt(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_lockt *lockt = &u->lockt; struct xdr_stream *xdr = resp->xdr; + __be32 status; - if (nfserr == nfserr_denied) - nfsd4_encode_lock_denied(xdr, &lockt->lt_denied); + if (nfserr == nfserr_denied) { + /* denied */ + status = nfsd4_encode_lock4denied(xdr, &lockt->lt_denied); + if (status != nfs_ok) + return status; + } return nfserr; } @@ -3912,7 +4092,8 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_locku *locku = &u->locku; struct xdr_stream *xdr = resp->xdr; - return nfsd4_encode_stateid(xdr, &locku->lu_stateid); + /* lock_stateid */ + return nfsd4_encode_stateid4(xdr, &locku->lu_stateid); } @@ -3926,104 +4107,159 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, return nfsd4_encode_change_info4(xdr, &link->li_cinfo); } +/* + * This implementation does not yet support returning an ACE in an + * OPEN that offers a delegation. + */ +static __be32 +nfsd4_encode_open_nfsace4(struct xdr_stream *xdr) +{ + __be32 status; + + /* type */ + status = nfsd4_encode_acetype4(xdr, NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE); + if (status != nfs_ok) + return nfserr_resource; + /* flag */ + status = nfsd4_encode_aceflag4(xdr, 0); + if (status != nfs_ok) + return nfserr_resource; + /* access mask */ + status = nfsd4_encode_acemask4(xdr, 0); + if (status != nfs_ok) + return nfserr_resource; + /* who - empty for now */ + if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT) + return nfserr_resource; + return nfs_ok; +} static __be32 -nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, - union nfsd4_op_u *u) +nfsd4_encode_open_read_delegation4(struct xdr_stream *xdr, struct nfsd4_open *open) { - struct nfsd4_open *open = &u->open; - struct xdr_stream *xdr = resp->xdr; - __be32 *p; + __be32 status; - nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid); - if (nfserr) - return nfserr; - nfserr = nfsd4_encode_change_info4(xdr, &open->op_cinfo); - if (nfserr) - return nfserr; - if (xdr_stream_encode_u32(xdr, open->op_rflags) < 0) + /* stateid */ + status = nfsd4_encode_stateid4(xdr, &open->op_delegate_stateid); + if (status != nfs_ok) + return status; + /* recall */ + status = nfsd4_encode_bool(xdr, open->op_recall); + if (status != nfs_ok) + return status; + /* permissions */ + return nfsd4_encode_open_nfsace4(xdr); +} + +static __be32 +nfsd4_encode_nfs_space_limit4(struct xdr_stream *xdr, u64 filesize) +{ + /* limitby */ + if (xdr_stream_encode_u32(xdr, NFS4_LIMIT_SIZE) != XDR_UNIT) return nfserr_resource; + /* filesize */ + return nfsd4_encode_uint64_t(xdr, filesize); +} - nfserr = nfsd4_encode_bitmap(xdr, open->op_bmval[0], open->op_bmval[1], - open->op_bmval[2]); - if (nfserr) - return nfserr; +static __be32 +nfsd4_encode_open_write_delegation4(struct xdr_stream *xdr, + struct nfsd4_open *open) +{ + __be32 status; - p = xdr_reserve_space(xdr, 4); - if (!p) + /* stateid */ + status = nfsd4_encode_stateid4(xdr, &open->op_delegate_stateid); + if (status != nfs_ok) + return status; + /* recall */ + status = nfsd4_encode_bool(xdr, open->op_recall); + if (status != nfs_ok) + return status; + /* space_limit */ + status = nfsd4_encode_nfs_space_limit4(xdr, 0); + if (status != nfs_ok) + return status; + return nfsd4_encode_open_nfsace4(xdr); +} + +static __be32 +nfsd4_encode_open_none_delegation4(struct xdr_stream *xdr, + struct nfsd4_open *open) +{ + __be32 status = nfs_ok; + + /* ond_why */ + if (xdr_stream_encode_u32(xdr, open->op_why_no_deleg) != XDR_UNIT) return nfserr_resource; + switch (open->op_why_no_deleg) { + case WND4_CONTENTION: + /* ond_server_will_push_deleg */ + status = nfsd4_encode_bool(xdr, false); + break; + case WND4_RESOURCE: + /* ond_server_will_signal_avail */ + status = nfsd4_encode_bool(xdr, false); + } + return status; +} + +static __be32 +nfsd4_encode_open_delegation4(struct xdr_stream *xdr, struct nfsd4_open *open) +{ + __be32 status; - *p++ = cpu_to_be32(open->op_delegate_type); + /* delegation_type */ + if (xdr_stream_encode_u32(xdr, open->op_delegate_type) != XDR_UNIT) + return nfserr_resource; switch (open->op_delegate_type) { case NFS4_OPEN_DELEGATE_NONE: + status = nfs_ok; break; case NFS4_OPEN_DELEGATE_READ: - nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid); - if (nfserr) - return nfserr; - p = xdr_reserve_space(xdr, 20); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(open->op_recall); - - /* - * TODO: ACE's in delegations - */ - *p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE); - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(0); /* XXX: is NULL principal ok? */ + /* read */ + status = nfsd4_encode_open_read_delegation4(xdr, open); break; case NFS4_OPEN_DELEGATE_WRITE: - nfserr = nfsd4_encode_stateid(xdr, &open->op_delegate_stateid); - if (nfserr) - return nfserr; - - p = xdr_reserve_space(xdr, XDR_UNIT * 8); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(open->op_recall); - - /* - * Always flush on close - * - * TODO: space_limit's in delegations - */ - *p++ = cpu_to_be32(NFS4_LIMIT_SIZE); - *p++ = xdr_zero; - *p++ = xdr_zero; - - /* - * TODO: ACE's in delegations - */ - *p++ = cpu_to_be32(NFS4_ACE_ACCESS_ALLOWED_ACE_TYPE); - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(0); /* XXX: is NULL principal ok? */ + /* write */ + status = nfsd4_encode_open_write_delegation4(xdr, open); break; - case NFS4_OPEN_DELEGATE_NONE_EXT: /* 4.1 */ - switch (open->op_why_no_deleg) { - case WND4_CONTENTION: - case WND4_RESOURCE: - p = xdr_reserve_space(xdr, 8); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(open->op_why_no_deleg); - /* deleg signaling not supported yet: */ - *p++ = cpu_to_be32(0); - break; - default: - p = xdr_reserve_space(xdr, 4); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(open->op_why_no_deleg); - } + case NFS4_OPEN_DELEGATE_NONE_EXT: + /* od_whynone */ + status = nfsd4_encode_open_none_delegation4(xdr, open); break; default: - BUG(); + status = nfserr_serverfault; } - /* XXX save filehandle here */ - return 0; + + return status; +} + +static __be32 +nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) +{ + struct nfsd4_open *open = &u->open; + struct xdr_stream *xdr = resp->xdr; + + /* stateid */ + nfserr = nfsd4_encode_stateid4(xdr, &open->op_stateid); + if (nfserr != nfs_ok) + return nfserr; + /* cinfo */ + nfserr = nfsd4_encode_change_info4(xdr, &open->op_cinfo); + if (nfserr != nfs_ok) + return nfserr; + /* rflags */ + nfserr = nfsd4_encode_uint32_t(xdr, open->op_rflags); + if (nfserr != nfs_ok) + return nfserr; + /* attrset */ + nfserr = nfsd4_encode_bitmap4(xdr, open->op_bmval[0], + open->op_bmval[1], open->op_bmval[2]); + if (nfserr != nfs_ok) + return nfserr; + /* delegation */ + return nfsd4_encode_open_delegation4(xdr, open); } static __be32 @@ -4033,7 +4269,8 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc = &u->open_confirm; struct xdr_stream *xdr = resp->xdr; - return nfsd4_encode_stateid(xdr, &oc->oc_resp_stateid); + /* open_stateid */ + return nfsd4_encode_stateid4(xdr, &oc->oc_resp_stateid); } static __be32 @@ -4043,7 +4280,8 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od = &u->open_downgrade; struct xdr_stream *xdr = resp->xdr; - return nfsd4_encode_stateid(xdr, &od->od_stateid); + /* open_stateid */ + return nfsd4_encode_stateid4(xdr, &od->od_stateid); } /* @@ -4113,6 +4351,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, struct file *file, unsigned long maxcount) { struct xdr_stream *xdr = resp->xdr; + unsigned int base = xdr->buf->page_len & ~PAGE_MASK; unsigned int starting_len = xdr->buf->len; __be32 zero = xdr_zero; __be32 nfserr; @@ -4121,8 +4360,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, return nfserr_resource; nfserr = nfsd_iter_read(resp->rqstp, read->rd_fhp, file, - read->rd_offset, &maxcount, - xdr->buf->page_len & ~PAGE_MASK, + read->rd_offset, &maxcount, base, &read->rd_eof); read->rd_length = maxcount; if (nfserr) @@ -4227,90 +4465,83 @@ out_err: return nfserr; } -static __be32 -nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, - union nfsd4_op_u *u) +static __be32 nfsd4_encode_dirlist4(struct xdr_stream *xdr, + struct nfsd4_readdir *readdir, + u32 max_payload) { - struct nfsd4_readdir *readdir = &u->readdir; - int maxcount; - int bytes_left; + int bytes_left, maxcount, starting_len = xdr->buf->len; loff_t offset; - __be64 wire_offset; - struct xdr_stream *xdr = resp->xdr; - int starting_len = xdr->buf->len; - __be32 *p; - - nfserr = nfsd4_encode_verifier4(xdr, &readdir->rd_verf); - if (nfserr != nfs_ok) - return nfserr; + __be32 status; /* * Number of bytes left for directory entries allowing for the - * final 8 bytes of the readdir and a following failed op: + * final 8 bytes of the readdir and a following failed op. */ - bytes_left = xdr->buf->buflen - xdr->buf->len - - COMPOUND_ERR_SLACK_SPACE - 8; - if (bytes_left < 0) { - nfserr = nfserr_resource; - goto err_no_verf; - } - maxcount = svc_max_payload(resp->rqstp); - maxcount = min_t(u32, readdir->rd_maxcount, maxcount); + bytes_left = xdr->buf->buflen - xdr->buf->len - + COMPOUND_ERR_SLACK_SPACE - XDR_UNIT * 2; + if (bytes_left < 0) + return nfserr_resource; + maxcount = min_t(u32, readdir->rd_maxcount, max_payload); + /* - * Note the rfc defines rd_maxcount as the size of the - * READDIR4resok structure, which includes the verifier above - * and the 8 bytes encoded at the end of this function: + * The RFC defines rd_maxcount as the size of the + * READDIR4resok structure, which includes the verifier + * and the 8 bytes encoded at the end of this function. */ - if (maxcount < 16) { - nfserr = nfserr_toosmall; - goto err_no_verf; - } - maxcount = min_t(int, maxcount-16, bytes_left); + if (maxcount < XDR_UNIT * 4) + return nfserr_toosmall; + maxcount = min_t(int, maxcount - XDR_UNIT * 4, bytes_left); - /* RFC 3530 14.2.24 allows us to ignore dircount when it's 0: */ + /* RFC 3530 14.2.24 allows us to ignore dircount when it's 0 */ if (!readdir->rd_dircount) - readdir->rd_dircount = svc_max_payload(resp->rqstp); + readdir->rd_dircount = max_payload; + /* *entries */ readdir->xdr = xdr; readdir->rd_maxcount = maxcount; readdir->common.err = 0; readdir->cookie_offset = 0; - offset = readdir->rd_cookie; - nfserr = nfsd_readdir(readdir->rd_rqstp, readdir->rd_fhp, - &offset, - &readdir->common, nfsd4_encode_dirent); - if (nfserr == nfs_ok && - readdir->common.err == nfserr_toosmall && - xdr->buf->len == starting_len + 8) { - /* nothing encoded; which limit did we hit?: */ - if (maxcount - 16 < bytes_left) - /* It was the fault of rd_maxcount: */ - nfserr = nfserr_toosmall; - else - /* We ran out of buffer space: */ - nfserr = nfserr_resource; + status = nfsd_readdir(readdir->rd_rqstp, readdir->rd_fhp, &offset, + &readdir->common, nfsd4_encode_entry4); + if (status) + return status; + if (readdir->common.err == nfserr_toosmall && + xdr->buf->len == starting_len) { + /* No entries were encoded. Which limit did we hit? */ + if (maxcount - XDR_UNIT * 4 < bytes_left) + /* It was the fault of rd_maxcount */ + return nfserr_toosmall; + /* We ran out of buffer space */ + return nfserr_resource; } - if (nfserr) - goto err_no_verf; + /* Encode the final entry's cookie value */ + nfsd4_encode_entry4_nfs_cookie4(readdir, offset); + /* No entries follow */ + if (xdr_stream_encode_item_absent(xdr) != XDR_UNIT) + return nfserr_resource; - if (readdir->cookie_offset) { - wire_offset = cpu_to_be64(offset); - write_bytes_to_xdr_buf(xdr->buf, readdir->cookie_offset, - &wire_offset, 8); - } + /* eof */ + return nfsd4_encode_bool(xdr, readdir->common.err == nfserr_eof); +} - p = xdr_reserve_space(xdr, 8); - if (!p) { - WARN_ON_ONCE(1); - goto err_no_verf; - } - *p++ = 0; /* no more entries */ - *p++ = htonl(readdir->common.err == nfserr_eof); +static __be32 +nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) +{ + struct nfsd4_readdir *readdir = &u->readdir; + struct xdr_stream *xdr = resp->xdr; + int starting_len = xdr->buf->len; - return 0; -err_no_verf: - xdr_truncate_encode(xdr, starting_len); + /* cookieverf */ + nfserr = nfsd4_encode_verifier4(xdr, &readdir->rd_verf); + if (nfserr != nfs_ok) + return nfserr; + + /* reply */ + nfserr = nfsd4_encode_dirlist4(xdr, readdir, svc_max_payload(resp->rqstp)); + if (nfserr != nfs_ok) + xdr_truncate_encode(xdr, starting_len); return nfserr; } @@ -4338,13 +4569,34 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, } static __be32 +nfsd4_encode_rpcsec_gss_info(struct xdr_stream *xdr, + struct rpcsec_gss_info *info) +{ + __be32 status; + + /* oid */ + if (xdr_stream_encode_opaque(xdr, info->oid.data, info->oid.len) < 0) + return nfserr_resource; + /* qop */ + status = nfsd4_encode_qop4(xdr, info->qop); + if (status != nfs_ok) + return status; + /* service */ + if (xdr_stream_encode_u32(xdr, info->service) != XDR_UNIT) + return nfserr_resource; + + return nfs_ok; +} + +static __be32 nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp) { u32 i, nflavs, supported; struct exp_flavor_info *flavs; struct exp_flavor_info def_flavs[2]; - __be32 *p, *flavorsp; static bool report = true; + __be32 *flavorsp; + __be32 status; if (exp->ex_nflavors) { flavs = exp->ex_flavors; @@ -4367,10 +4619,9 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp) } supported = 0; - p = xdr_reserve_space(xdr, 4); - if (!p) + flavorsp = xdr_reserve_space(xdr, XDR_UNIT); + if (!flavorsp) return nfserr_resource; - flavorsp = p++; /* to be backfilled later */ for (i = 0; i < nflavs; i++) { rpc_authflavor_t pf = flavs[i].pseudoflavor; @@ -4378,20 +4629,22 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp) if (rpcauth_get_gssinfo(pf, &info) == 0) { supported++; - p = xdr_reserve_space(xdr, 4 + 4 + - XDR_LEN(info.oid.len) + 4 + 4); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(RPC_AUTH_GSS); - p = xdr_encode_opaque(p, info.oid.data, info.oid.len); - *p++ = cpu_to_be32(info.qop); - *p++ = cpu_to_be32(info.service); + + /* flavor */ + status = nfsd4_encode_uint32_t(xdr, RPC_AUTH_GSS); + if (status != nfs_ok) + return status; + /* flavor_info */ + status = nfsd4_encode_rpcsec_gss_info(xdr, &info); + if (status != nfs_ok) + return status; } else if (pf < RPC_AUTH_MAXFLAVOR) { supported++; - p = xdr_reserve_space(xdr, 4); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(pf); + + /* flavor */ + status = nfsd4_encode_uint32_t(xdr, pf); + if (status != nfs_ok) + return status; } else { if (report) pr_warn("NFS: SECINFO: security flavor %u " @@ -4401,7 +4654,7 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp) if (nflavs != supported) report = false; - *flavorsp = htonl(supported); + *flavorsp = cpu_to_be32(supported); return 0; } @@ -4425,34 +4678,25 @@ nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, return nfsd4_do_encode_secinfo(xdr, secinfo->sin_exp); } -/* - * The SETATTR encode routine is special -- it always encodes a bitmap, - * regardless of the error status. - */ static __be32 nfsd4_encode_setattr(struct nfsd4_compoundres *resp, __be32 nfserr, union nfsd4_op_u *u) { struct nfsd4_setattr *setattr = &u->setattr; - struct xdr_stream *xdr = resp->xdr; - __be32 *p; + __be32 status; - p = xdr_reserve_space(xdr, 16); - if (!p) - return nfserr_resource; - if (nfserr) { - *p++ = cpu_to_be32(3); - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(0); - *p++ = cpu_to_be32(0); - } - else { - *p++ = cpu_to_be32(3); - *p++ = cpu_to_be32(setattr->sa_bmval[0]); - *p++ = cpu_to_be32(setattr->sa_bmval[1]); - *p++ = cpu_to_be32(setattr->sa_bmval[2]); + switch (nfserr) { + case nfs_ok: + /* attrsset */ + status = nfsd4_encode_bitmap4(resp->xdr, setattr->sa_bmval[0], + setattr->sa_bmval[1], + setattr->sa_bmval[2]); + break; + default: + /* attrsset */ + status = nfsd4_encode_bitmap4(resp->xdr, 0, 0, 0); } - return nfserr; + return status != nfs_ok ? status : nfserr; } static __be32 @@ -4488,86 +4732,148 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, union nfsd4_op_u *u) { struct nfsd4_write *write = &u->write; + struct xdr_stream *xdr = resp->xdr; - if (xdr_stream_encode_u32(resp->xdr, write->wr_bytes_written) < 0) - return nfserr_resource; - if (xdr_stream_encode_u32(resp->xdr, write->wr_how_written) < 0) + /* count */ + nfserr = nfsd4_encode_count4(xdr, write->wr_bytes_written); + if (nfserr) + return nfserr; + /* committed */ + if (xdr_stream_encode_u32(xdr, write->wr_how_written) != XDR_UNIT) return nfserr_resource; - return nfsd4_encode_verifier4(resp->xdr, &write->wr_verifier); + /* writeverf */ + return nfsd4_encode_verifier4(xdr, &write->wr_verifier); } static __be32 -nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, - union nfsd4_op_u *u) +nfsd4_encode_state_protect_ops4(struct xdr_stream *xdr, + struct nfsd4_exchange_id *exid) { - struct nfsd4_exchange_id *exid = &u->exchange_id; - struct xdr_stream *xdr = resp->xdr; - __be32 *p; - char *major_id; - char *server_scope; - int major_id_sz; - int server_scope_sz; - uint64_t minor_id = 0; - struct nfsd_net *nn = net_generic(SVC_NET(resp->rqstp), nfsd_net_id); + __be32 status; - major_id = nn->nfsd_name; - major_id_sz = strlen(nn->nfsd_name); - server_scope = nn->nfsd_name; - server_scope_sz = strlen(nn->nfsd_name); + /* spo_must_enforce */ + status = nfsd4_encode_bitmap4(xdr, exid->spo_must_enforce[0], + exid->spo_must_enforce[1], + exid->spo_must_enforce[2]); + if (status != nfs_ok) + return status; + /* spo_must_allow */ + return nfsd4_encode_bitmap4(xdr, exid->spo_must_allow[0], + exid->spo_must_allow[1], + exid->spo_must_allow[2]); +} - if (nfsd4_encode_clientid4(xdr, &exid->clientid) != nfs_ok) - return nfserr_resource; - if (xdr_stream_encode_u32(xdr, exid->seqid) < 0) - return nfserr_resource; - if (xdr_stream_encode_u32(xdr, exid->flags) < 0) - return nfserr_resource; +static __be32 +nfsd4_encode_state_protect4_r(struct xdr_stream *xdr, struct nfsd4_exchange_id *exid) +{ + __be32 status; - if (xdr_stream_encode_u32(xdr, exid->spa_how) < 0) + if (xdr_stream_encode_u32(xdr, exid->spa_how) != XDR_UNIT) return nfserr_resource; switch (exid->spa_how) { case SP4_NONE: + status = nfs_ok; break; case SP4_MACH_CRED: - /* spo_must_enforce bitmap: */ - nfserr = nfsd4_encode_bitmap(xdr, - exid->spo_must_enforce[0], - exid->spo_must_enforce[1], - exid->spo_must_enforce[2]); - if (nfserr) - return nfserr; - /* spo_must_allow bitmap: */ - nfserr = nfsd4_encode_bitmap(xdr, - exid->spo_must_allow[0], - exid->spo_must_allow[1], - exid->spo_must_allow[2]); - if (nfserr) - return nfserr; + /* spr_mach_ops */ + status = nfsd4_encode_state_protect_ops4(xdr, exid); break; default: - WARN_ON_ONCE(1); + status = nfserr_serverfault; } + return status; +} - p = xdr_reserve_space(xdr, - 8 /* so_minor_id */ + - 4 /* so_major_id.len */ + - (XDR_QUADLEN(major_id_sz) * 4) + - 4 /* eir_server_scope.len */ + - (XDR_QUADLEN(server_scope_sz) * 4) + - 4 /* eir_server_impl_id.count (0) */); - if (!p) +static __be32 +nfsd4_encode_server_owner4(struct xdr_stream *xdr, struct svc_rqst *rqstp) +{ + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); + __be32 status; + + /* so_minor_id */ + status = nfsd4_encode_uint64_t(xdr, 0); + if (status != nfs_ok) + return status; + /* so_major_id */ + return nfsd4_encode_opaque(xdr, nn->nfsd_name, strlen(nn->nfsd_name)); +} + +static __be32 +nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) +{ + struct nfsd_net *nn = net_generic(SVC_NET(resp->rqstp), nfsd_net_id); + struct nfsd4_exchange_id *exid = &u->exchange_id; + struct xdr_stream *xdr = resp->xdr; + + /* eir_clientid */ + nfserr = nfsd4_encode_clientid4(xdr, &exid->clientid); + if (nfserr != nfs_ok) + return nfserr; + /* eir_sequenceid */ + nfserr = nfsd4_encode_sequenceid4(xdr, exid->seqid); + if (nfserr != nfs_ok) + return nfserr; + /* eir_flags */ + nfserr = nfsd4_encode_uint32_t(xdr, exid->flags); + if (nfserr != nfs_ok) + return nfserr; + /* eir_state_protect */ + nfserr = nfsd4_encode_state_protect4_r(xdr, exid); + if (nfserr != nfs_ok) + return nfserr; + /* eir_server_owner */ + nfserr = nfsd4_encode_server_owner4(xdr, resp->rqstp); + if (nfserr != nfs_ok) + return nfserr; + /* eir_server_scope */ + nfserr = nfsd4_encode_opaque(xdr, nn->nfsd_name, + strlen(nn->nfsd_name)); + if (nfserr != nfs_ok) + return nfserr; + /* eir_server_impl_id<1> */ + if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT) return nfserr_resource; - /* The server_owner struct */ - p = xdr_encode_hyper(p, minor_id); /* Minor id */ - /* major id */ - p = xdr_encode_opaque(p, major_id, major_id_sz); + return nfs_ok; +} - /* Server scope */ - p = xdr_encode_opaque(p, server_scope, server_scope_sz); +static __be32 +nfsd4_encode_channel_attrs4(struct xdr_stream *xdr, + const struct nfsd4_channel_attrs *attrs) +{ + __be32 status; - /* Implementation id */ - *p++ = cpu_to_be32(0); /* zero length nfs_impl_id4 array */ - return 0; + /* ca_headerpadsize */ + status = nfsd4_encode_count4(xdr, 0); + if (status != nfs_ok) + return status; + /* ca_maxrequestsize */ + status = nfsd4_encode_count4(xdr, attrs->maxreq_sz); + if (status != nfs_ok) + return status; + /* ca_maxresponsesize */ + status = nfsd4_encode_count4(xdr, attrs->maxresp_sz); + if (status != nfs_ok) + return status; + /* ca_maxresponsesize_cached */ + status = nfsd4_encode_count4(xdr, attrs->maxresp_cached); + if (status != nfs_ok) + return status; + /* ca_maxoperations */ + status = nfsd4_encode_count4(xdr, attrs->maxops); + if (status != nfs_ok) + return status; + /* ca_maxrequests */ + status = nfsd4_encode_count4(xdr, attrs->maxreqs); + if (status != nfs_ok) + return status; + /* ca_rdma_ird<1> */ + if (xdr_stream_encode_u32(xdr, attrs->nr_rdma_attrs) != XDR_UNIT) + return nfserr_resource; + if (attrs->nr_rdma_attrs) + return nfsd4_encode_uint32_t(xdr, attrs->rdma_attrs); + return nfs_ok; } static __be32 @@ -4576,52 +4882,25 @@ nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_create_session *sess = &u->create_session; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - - p = xdr_reserve_space(xdr, 24); - if (!p) - return nfserr_resource; - p = xdr_encode_opaque_fixed(p, sess->sessionid.data, - NFS4_MAX_SESSIONID_LEN); - *p++ = cpu_to_be32(sess->seqid); - *p++ = cpu_to_be32(sess->flags); - p = xdr_reserve_space(xdr, 28); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(0); /* headerpadsz */ - *p++ = cpu_to_be32(sess->fore_channel.maxreq_sz); - *p++ = cpu_to_be32(sess->fore_channel.maxresp_sz); - *p++ = cpu_to_be32(sess->fore_channel.maxresp_cached); - *p++ = cpu_to_be32(sess->fore_channel.maxops); - *p++ = cpu_to_be32(sess->fore_channel.maxreqs); - *p++ = cpu_to_be32(sess->fore_channel.nr_rdma_attrs); - - if (sess->fore_channel.nr_rdma_attrs) { - p = xdr_reserve_space(xdr, 4); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(sess->fore_channel.rdma_attrs); - } - - p = xdr_reserve_space(xdr, 28); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(0); /* headerpadsz */ - *p++ = cpu_to_be32(sess->back_channel.maxreq_sz); - *p++ = cpu_to_be32(sess->back_channel.maxresp_sz); - *p++ = cpu_to_be32(sess->back_channel.maxresp_cached); - *p++ = cpu_to_be32(sess->back_channel.maxops); - *p++ = cpu_to_be32(sess->back_channel.maxreqs); - *p++ = cpu_to_be32(sess->back_channel.nr_rdma_attrs); - - if (sess->back_channel.nr_rdma_attrs) { - p = xdr_reserve_space(xdr, 4); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(sess->back_channel.rdma_attrs); - } - return 0; + /* csr_sessionid */ + nfserr = nfsd4_encode_sessionid4(xdr, &sess->sessionid); + if (nfserr != nfs_ok) + return nfserr; + /* csr_sequence */ + nfserr = nfsd4_encode_sequenceid4(xdr, sess->seqid); + if (nfserr != nfs_ok) + return nfserr; + /* csr_flags */ + nfserr = nfsd4_encode_uint32_t(xdr, sess->flags); + if (nfserr != nfs_ok) + return nfserr; + /* csr_fore_chan_attrs */ + nfserr = nfsd4_encode_channel_attrs4(xdr, &sess->fore_channel); + if (nfserr != nfs_ok) + return nfserr; + /* csr_back_chan_attrs */ + return nfsd4_encode_channel_attrs4(xdr, &sess->back_channel); } static __be32 @@ -4630,22 +4909,35 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_sequence *seq = &u->sequence; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 20); - if (!p) - return nfserr_resource; - p = xdr_encode_opaque_fixed(p, seq->sessionid.data, - NFS4_MAX_SESSIONID_LEN); - *p++ = cpu_to_be32(seq->seqid); - *p++ = cpu_to_be32(seq->slotid); + /* sr_sessionid */ + nfserr = nfsd4_encode_sessionid4(xdr, &seq->sessionid); + if (nfserr != nfs_ok) + return nfserr; + /* sr_sequenceid */ + nfserr = nfsd4_encode_sequenceid4(xdr, seq->seqid); + if (nfserr != nfs_ok) + return nfserr; + /* sr_slotid */ + nfserr = nfsd4_encode_slotid4(xdr, seq->slotid); + if (nfserr != nfs_ok) + return nfserr; /* Note slotid's are numbered from zero: */ - *p++ = cpu_to_be32(seq->maxslots - 1); /* sr_highest_slotid */ - *p++ = cpu_to_be32(seq->maxslots - 1); /* sr_target_highest_slotid */ - *p++ = cpu_to_be32(seq->status_flags); + /* sr_highest_slotid */ + nfserr = nfsd4_encode_slotid4(xdr, seq->maxslots - 1); + if (nfserr != nfs_ok) + return nfserr; + /* sr_target_highest_slotid */ + nfserr = nfsd4_encode_slotid4(xdr, seq->maxslots - 1); + if (nfserr != nfs_ok) + return nfserr; + /* sr_status_flags */ + nfserr = nfsd4_encode_uint32_t(xdr, seq->status_flags); + if (nfserr != nfs_ok) + return nfserr; resp->cstate.data_offset = xdr->buf->len; /* DRC cache data pointer */ - return 0; + return nfs_ok; } static __be32 @@ -4653,125 +4945,132 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, union nfsd4_op_u *u) { struct nfsd4_test_stateid *test_stateid = &u->test_stateid; - struct xdr_stream *xdr = resp->xdr; struct nfsd4_test_stateid_id *stateid, *next; - __be32 *p; + struct xdr_stream *xdr = resp->xdr; - p = xdr_reserve_space(xdr, 4 + (4 * test_stateid->ts_num_ids)); - if (!p) + /* tsr_status_codes<> */ + if (xdr_stream_encode_u32(xdr, test_stateid->ts_num_ids) != XDR_UNIT) return nfserr_resource; - *p++ = htonl(test_stateid->ts_num_ids); - - list_for_each_entry_safe(stateid, next, &test_stateid->ts_stateid_list, ts_id_list) { - *p++ = stateid->ts_id_status; + list_for_each_entry_safe(stateid, next, + &test_stateid->ts_stateid_list, ts_id_list) { + if (xdr_stream_encode_be32(xdr, stateid->ts_id_status) != XDR_UNIT) + return nfserr_resource; } - - return 0; + return nfs_ok; } #ifdef CONFIG_NFSD_PNFS static __be32 -nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, - union nfsd4_op_u *u) +nfsd4_encode_device_addr4(struct xdr_stream *xdr, + const struct nfsd4_getdeviceinfo *gdev) { - struct nfsd4_getdeviceinfo *gdev = &u->getdeviceinfo; - struct xdr_stream *xdr = resp->xdr; + u32 needed_len, starting_len = xdr->buf->len; const struct nfsd4_layout_ops *ops; - u32 starting_len = xdr->buf->len, needed_len; - __be32 *p; + __be32 status; - p = xdr_reserve_space(xdr, 4); - if (!p) + /* da_layout_type */ + if (xdr_stream_encode_u32(xdr, gdev->gd_layout_type) != XDR_UNIT) return nfserr_resource; - - *p++ = cpu_to_be32(gdev->gd_layout_type); - + /* da_addr_body */ ops = nfsd4_layout_ops[gdev->gd_layout_type]; - nfserr = ops->encode_getdeviceinfo(xdr, gdev); - if (nfserr) { + status = ops->encode_getdeviceinfo(xdr, gdev); + if (status != nfs_ok) { /* - * We don't bother to burden the layout drivers with - * enforcing gd_maxcount, just tell the client to - * come back with a bigger buffer if it's not enough. + * Don't burden the layout drivers with enforcing + * gd_maxcount. Just tell the client to come back + * with a bigger buffer if it's not enough. */ - if (xdr->buf->len + 4 > gdev->gd_maxcount) + if (xdr->buf->len + XDR_UNIT > gdev->gd_maxcount) goto toosmall; - return nfserr; + return status; } - if (gdev->gd_notify_types) { - p = xdr_reserve_space(xdr, 4 + 4); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(1); /* bitmap length */ - *p++ = cpu_to_be32(gdev->gd_notify_types); - } else { - p = xdr_reserve_space(xdr, 4); - if (!p) - return nfserr_resource; - *p++ = 0; - } + return nfs_ok; - return 0; toosmall: - dprintk("%s: maxcount too small\n", __func__); - needed_len = xdr->buf->len + 4 /* notifications */; + needed_len = xdr->buf->len + XDR_UNIT; /* notifications */ xdr_truncate_encode(xdr, starting_len); - p = xdr_reserve_space(xdr, 4); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(needed_len); + + status = nfsd4_encode_count4(xdr, needed_len); + if (status != nfs_ok) + return status; return nfserr_toosmall; } static __be32 -nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, +nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, union nfsd4_op_u *u) { - struct nfsd4_layoutget *lgp = &u->layoutget; + struct nfsd4_getdeviceinfo *gdev = &u->getdeviceinfo; struct xdr_stream *xdr = resp->xdr; - const struct nfsd4_layout_ops *ops; - __be32 *p; - - p = xdr_reserve_space(xdr, 36 + sizeof(stateid_opaque_t)); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(1); /* we always set return-on-close */ - *p++ = cpu_to_be32(lgp->lg_sid.si_generation); - p = xdr_encode_opaque_fixed(p, &lgp->lg_sid.si_opaque, - sizeof(stateid_opaque_t)); + /* gdir_device_addr */ + nfserr = nfsd4_encode_device_addr4(xdr, gdev); + if (nfserr) + return nfserr; + /* gdir_notification */ + return nfsd4_encode_bitmap4(xdr, gdev->gd_notify_types, 0, 0); +} - *p++ = cpu_to_be32(1); /* we always return a single layout */ - p = xdr_encode_hyper(p, lgp->lg_seg.offset); - p = xdr_encode_hyper(p, lgp->lg_seg.length); - *p++ = cpu_to_be32(lgp->lg_seg.iomode); - *p++ = cpu_to_be32(lgp->lg_layout_type); +static __be32 +nfsd4_encode_layout4(struct xdr_stream *xdr, const struct nfsd4_layoutget *lgp) +{ + const struct nfsd4_layout_ops *ops = nfsd4_layout_ops[lgp->lg_layout_type]; + __be32 status; - ops = nfsd4_layout_ops[lgp->lg_layout_type]; + /* lo_offset */ + status = nfsd4_encode_offset4(xdr, lgp->lg_seg.offset); + if (status != nfs_ok) + return status; + /* lo_length */ + status = nfsd4_encode_length4(xdr, lgp->lg_seg.length); + if (status != nfs_ok) + return status; + /* lo_iomode */ + if (xdr_stream_encode_u32(xdr, lgp->lg_seg.iomode) != XDR_UNIT) + return nfserr_resource; + /* lo_content */ + if (xdr_stream_encode_u32(xdr, lgp->lg_layout_type) != XDR_UNIT) + return nfserr_resource; return ops->encode_layoutget(xdr, lgp); } static __be32 +nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) +{ + struct nfsd4_layoutget *lgp = &u->layoutget; + struct xdr_stream *xdr = resp->xdr; + + /* logr_return_on_close */ + nfserr = nfsd4_encode_bool(xdr, true); + if (nfserr != nfs_ok) + return nfserr; + /* logr_stateid */ + nfserr = nfsd4_encode_stateid4(xdr, &lgp->lg_sid); + if (nfserr != nfs_ok) + return nfserr; + /* logr_layout<> */ + if (xdr_stream_encode_u32(xdr, 1) != XDR_UNIT) + return nfserr_resource; + return nfsd4_encode_layout4(xdr, lgp); +} + +static __be32 nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, union nfsd4_op_u *u) { struct nfsd4_layoutcommit *lcp = &u->layoutcommit; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, 4); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(lcp->lc_size_chg); - if (lcp->lc_size_chg) { - p = xdr_reserve_space(xdr, 8); - if (!p) - return nfserr_resource; - p = xdr_encode_hyper(p, lcp->lc_newsize); - } - - return 0; + /* ns_sizechanged */ + nfserr = nfsd4_encode_bool(xdr, lcp->lc_size_chg); + if (nfserr != nfs_ok) + return nfserr; + if (lcp->lc_size_chg) + /* ns_size */ + return nfsd4_encode_length4(xdr, lcp->lc_newsize); + return nfs_ok; } static __be32 @@ -4780,103 +5079,108 @@ nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_layoutreturn *lrp = &u->layoutreturn; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, 4); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(lrp->lrs_present); + /* lrs_present */ + nfserr = nfsd4_encode_bool(xdr, lrp->lrs_present); + if (nfserr != nfs_ok) + return nfserr; if (lrp->lrs_present) - return nfsd4_encode_stateid(xdr, &lrp->lr_sid); - return 0; + /* lrs_stateid */ + return nfsd4_encode_stateid4(xdr, &lrp->lr_sid); + return nfs_ok; } #endif /* CONFIG_NFSD_PNFS */ static __be32 -nfsd42_encode_write_res(struct nfsd4_compoundres *resp, - struct nfsd42_write_res *write, bool sync) +nfsd4_encode_write_response4(struct xdr_stream *xdr, + const struct nfsd4_copy *copy) { - __be32 *p; - p = xdr_reserve_space(resp->xdr, 4); - if (!p) - return nfserr_resource; + const struct nfsd42_write_res *write = ©->cp_res; + u32 count = nfsd4_copy_is_sync(copy) ? 0 : 1; + __be32 status; - if (sync) - *p++ = cpu_to_be32(0); - else { - __be32 nfserr; - *p++ = cpu_to_be32(1); - nfserr = nfsd4_encode_stateid(resp->xdr, &write->cb_stateid); - if (nfserr) - return nfserr; + /* wr_callback_id<1> */ + if (xdr_stream_encode_u32(xdr, count) != XDR_UNIT) + return nfserr_resource; + if (count) { + status = nfsd4_encode_stateid4(xdr, &write->cb_stateid); + if (status != nfs_ok) + return status; } - p = xdr_reserve_space(resp->xdr, 8 + 4 + NFS4_VERIFIER_SIZE); - if (!p) + + /* wr_count */ + status = nfsd4_encode_length4(xdr, write->wr_bytes_written); + if (status != nfs_ok) + return status; + /* wr_committed */ + if (xdr_stream_encode_u32(xdr, write->wr_stable_how) != XDR_UNIT) return nfserr_resource; + /* wr_writeverf */ + return nfsd4_encode_verifier4(xdr, &write->wr_verifier); +} - p = xdr_encode_hyper(p, write->wr_bytes_written); - *p++ = cpu_to_be32(write->wr_stable_how); - p = xdr_encode_opaque_fixed(p, write->wr_verifier.data, - NFS4_VERIFIER_SIZE); - return nfs_ok; +static __be32 nfsd4_encode_copy_requirements4(struct xdr_stream *xdr, + const struct nfsd4_copy *copy) +{ + __be32 status; + + /* cr_consecutive */ + status = nfsd4_encode_bool(xdr, true); + if (status != nfs_ok) + return status; + /* cr_synchronous */ + return nfsd4_encode_bool(xdr, nfsd4_copy_is_sync(copy)); } static __be32 -nfsd42_encode_nl4_server(struct nfsd4_compoundres *resp, struct nl4_server *ns) +nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { - struct xdr_stream *xdr = resp->xdr; - struct nfs42_netaddr *addr; - __be32 *p; + struct nfsd4_copy *copy = &u->copy; - p = xdr_reserve_space(xdr, 4); - *p++ = cpu_to_be32(ns->nl4_type); + nfserr = nfsd4_encode_write_response4(resp->xdr, copy); + if (nfserr != nfs_ok) + return nfserr; + return nfsd4_encode_copy_requirements4(resp->xdr, copy); +} +static __be32 +nfsd4_encode_netloc4(struct xdr_stream *xdr, const struct nl4_server *ns) +{ + __be32 status; + + if (xdr_stream_encode_u32(xdr, ns->nl4_type) != XDR_UNIT) + return nfserr_resource; switch (ns->nl4_type) { case NL4_NETADDR: - addr = &ns->u.nl4_addr; - - /* netid_len, netid, uaddr_len, uaddr (port included - * in RPCBIND_MAXUADDRLEN) - */ - p = xdr_reserve_space(xdr, - 4 /* netid len */ + - (XDR_QUADLEN(addr->netid_len) * 4) + - 4 /* uaddr len */ + - (XDR_QUADLEN(addr->addr_len) * 4)); - if (!p) - return nfserr_resource; - - *p++ = cpu_to_be32(addr->netid_len); - p = xdr_encode_opaque_fixed(p, addr->netid, - addr->netid_len); - *p++ = cpu_to_be32(addr->addr_len); - p = xdr_encode_opaque_fixed(p, addr->addr, - addr->addr_len); + /* nl_addr */ + status = nfsd4_encode_netaddr4(xdr, &ns->u.nl4_addr); break; default: - WARN_ON_ONCE(ns->nl4_type != NL4_NETADDR); - return nfserr_inval; + status = nfserr_serverfault; } - - return 0; + return status; } static __be32 -nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, - union nfsd4_op_u *u) +nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr, + union nfsd4_op_u *u) { - struct nfsd4_copy *copy = &u->copy; - __be32 *p; + struct nfsd4_copy_notify *cn = &u->copy_notify; + struct xdr_stream *xdr = resp->xdr; - nfserr = nfsd42_encode_write_res(resp, ©->cp_res, - nfsd4_copy_is_sync(copy)); + /* cnr_lease_time */ + nfserr = nfsd4_encode_nfstime4(xdr, &cn->cpn_lease_time); if (nfserr) return nfserr; - - p = xdr_reserve_space(resp->xdr, 4 + 4); - *p++ = xdr_one; /* cr_consecutive */ - *p = nfsd4_copy_is_sync(copy) ? xdr_one : xdr_zero; - return 0; + /* cnr_stateid */ + nfserr = nfsd4_encode_stateid4(xdr, &cn->cpn_cnr_stateid); + if (nfserr) + return nfserr; + /* cnr_source_server<> */ + if (xdr_stream_encode_u32(xdr, 1) != XDR_UNIT) + return nfserr_resource; + return nfsd4_encode_netloc4(xdr, cn->cpn_src); } static __be32 @@ -4885,14 +5189,15 @@ nfsd4_encode_offload_status(struct nfsd4_compoundres *resp, __be32 nfserr, { struct nfsd4_offload_status *os = &u->offload_status; struct xdr_stream *xdr = resp->xdr; - __be32 *p; - p = xdr_reserve_space(xdr, 8 + 4); - if (!p) + /* osr_count */ + nfserr = nfsd4_encode_length4(xdr, os->count); + if (nfserr != nfs_ok) + return nfserr; + /* osr_complete<1> */ + if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT) return nfserr_resource; - p = xdr_encode_hyper(p, os->count); - *p++ = cpu_to_be32(0); - return nfserr; + return nfs_ok; } static __be32 @@ -4970,53 +5275,18 @@ out: } static __be32 -nfsd4_encode_copy_notify(struct nfsd4_compoundres *resp, __be32 nfserr, - union nfsd4_op_u *u) -{ - struct nfsd4_copy_notify *cn = &u->copy_notify; - struct xdr_stream *xdr = resp->xdr; - __be32 *p; - - if (nfserr) - return nfserr; - - /* 8 sec, 4 nsec */ - p = xdr_reserve_space(xdr, 12); - if (!p) - return nfserr_resource; - - /* cnr_lease_time */ - p = xdr_encode_hyper(p, cn->cpn_sec); - *p++ = cpu_to_be32(cn->cpn_nsec); - - /* cnr_stateid */ - nfserr = nfsd4_encode_stateid(xdr, &cn->cpn_cnr_stateid); - if (nfserr) - return nfserr; - - /* cnr_src.nl_nsvr */ - p = xdr_reserve_space(xdr, 4); - if (!p) - return nfserr_resource; - - *p++ = cpu_to_be32(1); - - nfserr = nfsd42_encode_nl4_server(resp, cn->cpn_src); - return nfserr; -} - -static __be32 nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, union nfsd4_op_u *u) { struct nfsd4_seek *seek = &u->seek; - __be32 *p; - - p = xdr_reserve_space(resp->xdr, 4 + 8); - *p++ = cpu_to_be32(seek->seek_eof); - p = xdr_encode_hyper(p, seek->seek_pos); + struct xdr_stream *xdr = resp->xdr; - return 0; + /* sr_eof */ + nfserr = nfsd4_encode_bool(xdr, seek->seek_eof); + if (nfserr != nfs_ok) + return nfserr; + /* sr_offset */ + return nfsd4_encode_offset4(xdr, seek->seek_pos); } static __be32 diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 7ed02fb88a36..3e15b72f421d 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -26,6 +26,7 @@ #include "pnfs.h" #include "filecache.h" #include "trace.h" +#include "netlink.h" /* * We have a single directory with several nodes in it. @@ -1132,7 +1133,7 @@ static struct inode *nfsd_get_inode(struct super_block *sb, umode_t mode) /* Following advice from simple_fill_super documentation: */ inode->i_ino = iunique(sb, NFSD_MaxReserved); inode->i_mode = mode; - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); switch (mode & S_IFMT) { case S_IFDIR: inode->i_fop = &simple_dir_operations; @@ -1496,6 +1497,203 @@ static int create_proc_exports_entry(void) unsigned int nfsd_net_id; /** + * nfsd_nl_rpc_status_get_start - Prepare rpc_status_get dumpit + * @cb: netlink metadata and command arguments + * + * Return values: + * %0: The rpc_status_get command may proceed + * %-ENODEV: There is no NFSD running in this namespace + */ +int nfsd_nl_rpc_status_get_start(struct netlink_callback *cb) +{ + struct nfsd_net *nn = net_generic(sock_net(cb->skb->sk), nfsd_net_id); + int ret = -ENODEV; + + mutex_lock(&nfsd_mutex); + if (nn->nfsd_serv) { + svc_get(nn->nfsd_serv); + ret = 0; + } + mutex_unlock(&nfsd_mutex); + + return ret; +} + +static int nfsd_genl_rpc_status_compose_msg(struct sk_buff *skb, + struct netlink_callback *cb, + struct nfsd_genl_rqstp *rqstp) +{ + void *hdr; + u32 i; + + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + &nfsd_nl_family, 0, NFSD_CMD_RPC_STATUS_GET); + if (!hdr) + return -ENOBUFS; + + if (nla_put_be32(skb, NFSD_A_RPC_STATUS_XID, rqstp->rq_xid) || + nla_put_u32(skb, NFSD_A_RPC_STATUS_FLAGS, rqstp->rq_flags) || + nla_put_u32(skb, NFSD_A_RPC_STATUS_PROG, rqstp->rq_prog) || + nla_put_u32(skb, NFSD_A_RPC_STATUS_PROC, rqstp->rq_proc) || + nla_put_u8(skb, NFSD_A_RPC_STATUS_VERSION, rqstp->rq_vers) || + nla_put_s64(skb, NFSD_A_RPC_STATUS_SERVICE_TIME, + ktime_to_us(rqstp->rq_stime), + NFSD_A_RPC_STATUS_PAD)) + return -ENOBUFS; + + switch (rqstp->rq_saddr.sa_family) { + case AF_INET: { + const struct sockaddr_in *s_in, *d_in; + + s_in = (const struct sockaddr_in *)&rqstp->rq_saddr; + d_in = (const struct sockaddr_in *)&rqstp->rq_daddr; + if (nla_put_in_addr(skb, NFSD_A_RPC_STATUS_SADDR4, + s_in->sin_addr.s_addr) || + nla_put_in_addr(skb, NFSD_A_RPC_STATUS_DADDR4, + d_in->sin_addr.s_addr) || + nla_put_be16(skb, NFSD_A_RPC_STATUS_SPORT, + s_in->sin_port) || + nla_put_be16(skb, NFSD_A_RPC_STATUS_DPORT, + d_in->sin_port)) + return -ENOBUFS; + break; + } + case AF_INET6: { + const struct sockaddr_in6 *s_in, *d_in; + + s_in = (const struct sockaddr_in6 *)&rqstp->rq_saddr; + d_in = (const struct sockaddr_in6 *)&rqstp->rq_daddr; + if (nla_put_in6_addr(skb, NFSD_A_RPC_STATUS_SADDR6, + &s_in->sin6_addr) || + nla_put_in6_addr(skb, NFSD_A_RPC_STATUS_DADDR6, + &d_in->sin6_addr) || + nla_put_be16(skb, NFSD_A_RPC_STATUS_SPORT, + s_in->sin6_port) || + nla_put_be16(skb, NFSD_A_RPC_STATUS_DPORT, + d_in->sin6_port)) + return -ENOBUFS; + break; + } + } + + for (i = 0; i < rqstp->rq_opcnt; i++) + if (nla_put_u32(skb, NFSD_A_RPC_STATUS_COMPOUND_OPS, + rqstp->rq_opnum[i])) + return -ENOBUFS; + + genlmsg_end(skb, hdr); + return 0; +} + +/** + * nfsd_nl_rpc_status_get_dumpit - Handle rpc_status_get dumpit + * @skb: reply buffer + * @cb: netlink metadata and command arguments + * + * Returns the size of the reply or a negative errno. + */ +int nfsd_nl_rpc_status_get_dumpit(struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct nfsd_net *nn = net_generic(sock_net(skb->sk), nfsd_net_id); + int i, ret, rqstp_index = 0; + + rcu_read_lock(); + + for (i = 0; i < nn->nfsd_serv->sv_nrpools; i++) { + struct svc_rqst *rqstp; + + if (i < cb->args[0]) /* already consumed */ + continue; + + rqstp_index = 0; + list_for_each_entry_rcu(rqstp, + &nn->nfsd_serv->sv_pools[i].sp_all_threads, + rq_all) { + struct nfsd_genl_rqstp genl_rqstp; + unsigned int status_counter; + + if (rqstp_index++ < cb->args[1]) /* already consumed */ + continue; + /* + * Acquire rq_status_counter before parsing the rqst + * fields. rq_status_counter is set to an odd value in + * order to notify the consumers the rqstp fields are + * meaningful. + */ + status_counter = + smp_load_acquire(&rqstp->rq_status_counter); + if (!(status_counter & 1)) + continue; + + genl_rqstp.rq_xid = rqstp->rq_xid; + genl_rqstp.rq_flags = rqstp->rq_flags; + genl_rqstp.rq_vers = rqstp->rq_vers; + genl_rqstp.rq_prog = rqstp->rq_prog; + genl_rqstp.rq_proc = rqstp->rq_proc; + genl_rqstp.rq_stime = rqstp->rq_stime; + genl_rqstp.rq_opcnt = 0; + memcpy(&genl_rqstp.rq_daddr, svc_daddr(rqstp), + sizeof(struct sockaddr)); + memcpy(&genl_rqstp.rq_saddr, svc_addr(rqstp), + sizeof(struct sockaddr)); + +#ifdef CONFIG_NFSD_V4 + if (rqstp->rq_vers == NFS4_VERSION && + rqstp->rq_proc == NFSPROC4_COMPOUND) { + /* NFSv4 compound */ + struct nfsd4_compoundargs *args; + int j; + + args = rqstp->rq_argp; + genl_rqstp.rq_opcnt = args->opcnt; + for (j = 0; j < genl_rqstp.rq_opcnt; j++) + genl_rqstp.rq_opnum[j] = + args->ops[j].opnum; + } +#endif /* CONFIG_NFSD_V4 */ + + /* + * Acquire rq_status_counter before reporting the rqst + * fields to the user. + */ + if (smp_load_acquire(&rqstp->rq_status_counter) != + status_counter) + continue; + + ret = nfsd_genl_rpc_status_compose_msg(skb, cb, + &genl_rqstp); + if (ret) + goto out; + } + } + + cb->args[0] = i; + cb->args[1] = rqstp_index; + ret = skb->len; +out: + rcu_read_unlock(); + + return ret; +} + +/** + * nfsd_nl_rpc_status_get_done - rpc_status_get dumpit post-processing + * @cb: netlink metadata and command arguments + * + * Return values: + * %0: Success + */ +int nfsd_nl_rpc_status_get_done(struct netlink_callback *cb) +{ + mutex_lock(&nfsd_mutex); + nfsd_put(sock_net(cb->skb->sk)); + mutex_unlock(&nfsd_mutex); + + return 0; +} + +/** * nfsd_net_init - Prepare the nfsd_net portion of a new net namespace * @net: a freshly-created network namespace * @@ -1589,6 +1787,10 @@ static int __init init_nfsd(void) retval = register_filesystem(&nfsd_fs_type); if (retval) goto out_free_all; + retval = genl_register_family(&nfsd_nl_family); + if (retval) + goto out_free_all; + return 0; out_free_all: nfsd4_destroy_laundry_wq(); @@ -1613,6 +1815,7 @@ out_free_slabs: static void __exit exit_nfsd(void) { + genl_unregister_family(&nfsd_nl_family); unregister_filesystem(&nfsd_fs_type); nfsd4_destroy_laundry_wq(); unregister_cld_notifier(); diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 11c14faa6c67..f5ff42f41ee7 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -62,6 +62,23 @@ struct readdir_cd { __be32 err; /* 0, nfserr, or nfserr_eof */ }; +/* Maximum number of operations per session compound */ +#define NFSD_MAX_OPS_PER_COMPOUND 50 + +struct nfsd_genl_rqstp { + struct sockaddr rq_daddr; + struct sockaddr rq_saddr; + unsigned long rq_flags; + ktime_t rq_stime; + __be32 rq_xid; + u32 rq_vers; + u32 rq_prog; + u32 rq_proc; + + /* NFSv4 compound */ + u32 rq_opcnt; + u32 rq_opnum[NFSD_MAX_OPS_PER_COMPOUND]; +}; extern struct svc_program nfsd_program; extern const struct svc_version nfsd_version2, nfsd_version3, nfsd_version4; diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 355bf0db3235..dbfa0ac13564 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -771,7 +771,7 @@ enum fsid_source fsid_source(const struct svc_fh *fhp) * assume that the new change attr is always logged to stable storage in some * fashion before the results can be seen. */ -u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode) +u64 nfsd4_change_attribute(const struct kstat *stat, const struct inode *inode) { u64 chattr; diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 40426f899e76..6ebdf7ea27bf 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -293,7 +293,8 @@ static inline void fh_clear_pre_post_attrs(struct svc_fh *fhp) fhp->fh_pre_saved = false; } -u64 nfsd4_change_attribute(struct kstat *stat, struct inode *inode); +u64 nfsd4_change_attribute(const struct kstat *stat, + const struct inode *inode); __be32 __must_check fh_fill_pre_attrs(struct svc_fh *fhp); __be32 fh_fill_post_attrs(struct svc_fh *fhp); __be32 __must_check fh_fill_both_attrs(struct svc_fh *fhp); diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 1582af33e204..d6122bb2d167 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -572,7 +572,6 @@ static void nfsd_last_thread(struct net *net) return; nfsd_shutdown_net(net); - pr_info("nfsd: last server has exited, flushing export cache\n"); nfsd_export_flush(net); } @@ -713,14 +712,13 @@ int nfsd_nrpools(struct net *net) int nfsd_get_nrthreads(int n, int *nthreads, struct net *net) { - int i = 0; struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct svc_serv *serv = nn->nfsd_serv; + int i; - if (nn->nfsd_serv != NULL) { - for (i = 0; i < nn->nfsd_serv->sv_nrpools && i < n; i++) - nthreads[i] = nn->nfsd_serv->sv_pools[i].sp_nrthreads; - } - + if (serv) + for (i = 0; i < serv->sv_nrpools && i < n; i++) + nthreads[i] = atomic_read(&serv->sv_pools[i].sp_nrthreads); return 0; } @@ -787,7 +785,6 @@ int nfsd_svc(int nrservs, struct net *net, const struct cred *cred) { int error; - bool nfsd_up_before; struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct svc_serv *serv; @@ -807,8 +804,6 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred) error = nfsd_create_serv(net); if (error) goto out; - - nfsd_up_before = nn->nfsd_net_up; serv = nn->nfsd_serv; error = nfsd_startup_net(net, cred); @@ -816,17 +811,15 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred) goto out_put; error = svc_set_num_threads(serv, NULL, nrservs); if (error) - goto out_shutdown; + goto out_put; error = serv->sv_nrthreads; - if (error == 0) - nfsd_last_thread(net); -out_shutdown: - if (error < 0 && !nfsd_up_before) - nfsd_shutdown_net(net); out_put: /* Threads now hold service active */ if (xchg(&nn->keep_active, 0)) svc_put(serv); + + if (serv->sv_nrthreads == 0) + nfsd_last_thread(net); svc_put(serv); out: mutex_unlock(&nfsd_mutex); @@ -957,7 +950,7 @@ nfsd(void *vrqstp) /* * The main request loop */ - while (!kthread_should_stop()) { + while (!svc_thread_should_stop(rqstp)) { /* Update sv_maxconn if it has changed */ rqstp->rq_server->sv_maxconn = nn->max_connections; @@ -998,6 +991,15 @@ int nfsd_dispatch(struct svc_rqst *rqstp) if (!proc->pc_decode(rqstp, &rqstp->rq_arg_stream)) goto out_decode_err; + /* + * Release rq_status_counter setting it to an odd value after the rpc + * request has been properly parsed. rq_status_counter is used to + * notify the consumers if the rqstp fields are stable + * (rq_status_counter is odd) or not meaningful (rq_status_counter + * is even). + */ + smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter | 1); + rp = NULL; switch (nfsd_cache_lookup(rqstp, &rp)) { case RC_DOIT: @@ -1015,6 +1017,12 @@ int nfsd_dispatch(struct svc_rqst *rqstp) if (!proc->pc_encode(rqstp, &rqstp->rq_res_stream)) goto out_encode_err; + /* + * Release rq_status_counter setting it to an even value after the rpc + * request has been properly processed. + */ + smp_store_release(&rqstp->rq_status_counter, rqstp->rq_status_counter + 1); + nfsd_cache_update(rqstp, rp, rqstp->rq_cachetype, statp + 1); out_cached_reply: return 1; @@ -1082,11 +1090,12 @@ int nfsd_pool_stats_open(struct inode *inode, struct file *file) int nfsd_pool_stats_release(struct inode *inode, struct file *file) { + struct seq_file *seq = file->private_data; + struct svc_serv *serv = seq->private; int ret = seq_release(inode, file); - struct net *net = inode->i_sb->s_fs_info; mutex_lock(&nfsd_mutex); - nfsd_put(net); + svc_put(serv); mutex_unlock(&nfsd_mutex); return ret; } diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h index 4f4282d4eeca..de1e0dfed06a 100644 --- a/fs/nfsd/pnfs.h +++ b/fs/nfsd/pnfs.h @@ -27,12 +27,12 @@ struct nfsd4_layout_ops { struct nfs4_client *clp, struct nfsd4_getdeviceinfo *gdevp); __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr, - struct nfsd4_getdeviceinfo *gdevp); + const struct nfsd4_getdeviceinfo *gdevp); __be32 (*proc_layoutget)(struct inode *, const struct svc_fh *fhp, struct nfsd4_layoutget *lgp); - __be32 (*encode_layoutget)(struct xdr_stream *, - struct nfsd4_layoutget *lgp); + __be32 (*encode_layoutget)(struct xdr_stream *xdr, + const struct nfsd4_layoutget *lgp); __be32 (*proc_layoutcommit)(struct inode *inode, struct nfsd4_layoutcommit *lcp); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index cbddcf484dba..f96eaa8e9413 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -117,6 +117,24 @@ struct nfs4_cpntf_state { time64_t cpntf_time; /* last time stateid used */ }; +struct nfs4_cb_fattr { + struct nfsd4_callback ncf_getattr; + u32 ncf_cb_status; + u32 ncf_cb_bmap[1]; + + /* from CB_GETATTR reply */ + u64 ncf_cb_change; + u64 ncf_cb_fsize; + + unsigned long ncf_cb_flags; + bool ncf_file_modified; + u64 ncf_initial_cinfo; + u64 ncf_cur_fsize; +}; + +/* bits for ncf_cb_flags */ +#define CB_GETATTR_BUSY 0 + /* * Represents a delegation stateid. The nfs4_client holds references to these * and they are put when it is being destroyed or when the delegation is @@ -150,6 +168,9 @@ struct nfs4_delegation { int dl_retries; struct nfsd4_callback dl_recall; bool dl_recalled; + + /* for CB_GETATTR */ + struct nfs4_cb_fattr dl_cb_fattr; }; #define cb_to_delegation(cb) \ @@ -174,8 +195,6 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s) /* Maximum number of slots per session. 160 is useful for long haul TCP */ #define NFSD_MAX_SLOTS_PER_SESSION 160 -/* Maximum number of operations per session compound */ -#define NFSD_MAX_OPS_PER_COMPOUND 50 /* Maximum session per slot cache size */ #define NFSD_SLOT_CACHE_SIZE 2048 /* Maximum number of NFSD_SLOT_CACHE_SIZE slots per session */ @@ -642,6 +661,7 @@ enum nfsd4_cb_op { NFSPROC4_CLNT_CB_SEQUENCE, NFSPROC4_CLNT_CB_NOTIFY_LOCK, NFSPROC4_CLNT_CB_RECALL_ANY, + NFSPROC4_CLNT_CB_GETATTR, }; /* Returns true iff a is later than b: */ @@ -734,5 +754,6 @@ static inline bool try_to_expire_client(struct nfs4_client *clp) } extern __be32 nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, - struct inode *inode); + struct inode *inode, bool *file_modified, u64 *size); +extern void nfs4_cb_getattr(struct nfs4_cb_fattr *ncf); #endif /* NFSD4_STATE_H */ diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index 63797635e1c3..12d79f5d4eb1 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -60,7 +60,7 @@ static int nfsd_show(struct seq_file *seq, void *v) #ifdef CONFIG_NFSD_V4 /* Show count for individual nfsv4 operations */ /* Writing operation numbers 0 1 2 also for maintaining uniformity */ - seq_printf(seq,"proc4ops %u", LAST_NFS4_OP + 1); + seq_printf(seq, "proc4ops %u", LAST_NFS4_OP + 1); for (i = 0; i <= LAST_NFS4_OP; i++) { seq_printf(seq, " %lld", percpu_counter_sum_positive(&nfsdstats.counter[NFSD_STATS_NFS4_OP(i)])); @@ -76,7 +76,7 @@ static int nfsd_show(struct seq_file *seq, void *v) DEFINE_PROC_SHOW_ATTRIBUTE(nfsd); -int nfsd_percpu_counters_init(struct percpu_counter counters[], int num) +int nfsd_percpu_counters_init(struct percpu_counter *counters, int num) { int i, err = 0; diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index cf5524e7ca06..14f50c660b61 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -37,9 +37,9 @@ extern struct nfsd_stats nfsdstats; extern struct svc_stat nfsd_svcstats; -int nfsd_percpu_counters_init(struct percpu_counter counters[], int num); -void nfsd_percpu_counters_reset(struct percpu_counter counters[], int num); -void nfsd_percpu_counters_destroy(struct percpu_counter counters[], int num); +int nfsd_percpu_counters_init(struct percpu_counter *counters, int num); +void nfsd_percpu_counters_reset(struct percpu_counter *counters, int num); +void nfsd_percpu_counters_destroy(struct percpu_counter *counters, int num); int nfsd_stat_init(void); void nfsd_stat_shutdown(void); @@ -61,22 +61,22 @@ static inline void nfsd_stats_rc_nocache_inc(void) static inline void nfsd_stats_fh_stale_inc(struct svc_export *exp) { percpu_counter_inc(&nfsdstats.counter[NFSD_STATS_FH_STALE]); - if (exp) - percpu_counter_inc(&exp->ex_stats.counter[EXP_STATS_FH_STALE]); + if (exp && exp->ex_stats) + percpu_counter_inc(&exp->ex_stats->counter[EXP_STATS_FH_STALE]); } static inline void nfsd_stats_io_read_add(struct svc_export *exp, s64 amount) { percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_READ], amount); - if (exp) - percpu_counter_add(&exp->ex_stats.counter[EXP_STATS_IO_READ], amount); + if (exp && exp->ex_stats) + percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_READ], amount); } static inline void nfsd_stats_io_write_add(struct svc_export *exp, s64 amount) { percpu_counter_add(&nfsdstats.counter[NFSD_STATS_IO_WRITE], amount); - if (exp) - percpu_counter_add(&exp->ex_stats.counter[EXP_STATS_IO_WRITE], amount); + if (exp && exp->ex_stats) + percpu_counter_add(&exp->ex_stats->counter[EXP_STATS_IO_WRITE], amount); } static inline void nfsd_stats_payload_misses_inc(struct nfsd_net *nn) diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 803904348871..fbc0ccb40424 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -1863,6 +1863,93 @@ TRACE_EVENT(nfsd_end_grace, ) ); +DECLARE_EVENT_CLASS(nfsd_copy_class, + TP_PROTO( + const struct nfsd4_copy *copy + ), + TP_ARGS(copy), + TP_STRUCT__entry( + __field(bool, intra) + __field(bool, async) + __field(u32, src_cl_boot) + __field(u32, src_cl_id) + __field(u32, src_so_id) + __field(u32, src_si_generation) + __field(u32, dst_cl_boot) + __field(u32, dst_cl_id) + __field(u32, dst_so_id) + __field(u32, dst_si_generation) + __field(u64, src_cp_pos) + __field(u64, dst_cp_pos) + __field(u64, cp_count) + __sockaddr(addr, sizeof(struct sockaddr_in6)) + ), + TP_fast_assign( + const stateid_t *src_stp = ©->cp_src_stateid; + const stateid_t *dst_stp = ©->cp_dst_stateid; + + __entry->intra = test_bit(NFSD4_COPY_F_INTRA, ©->cp_flags); + __entry->async = !test_bit(NFSD4_COPY_F_SYNCHRONOUS, ©->cp_flags); + __entry->src_cl_boot = src_stp->si_opaque.so_clid.cl_boot; + __entry->src_cl_id = src_stp->si_opaque.so_clid.cl_id; + __entry->src_so_id = src_stp->si_opaque.so_id; + __entry->src_si_generation = src_stp->si_generation; + __entry->dst_cl_boot = dst_stp->si_opaque.so_clid.cl_boot; + __entry->dst_cl_id = dst_stp->si_opaque.so_clid.cl_id; + __entry->dst_so_id = dst_stp->si_opaque.so_id; + __entry->dst_si_generation = dst_stp->si_generation; + __entry->src_cp_pos = copy->cp_src_pos; + __entry->dst_cp_pos = copy->cp_dst_pos; + __entry->cp_count = copy->cp_count; + __assign_sockaddr(addr, ©->cp_clp->cl_addr, + sizeof(struct sockaddr_in6)); + ), + TP_printk("client=%pISpc intra=%d async=%d " + "src_stateid[si_generation:0x%x cl_boot:0x%x cl_id:0x%x so_id:0x%x] " + "dst_stateid[si_generation:0x%x cl_boot:0x%x cl_id:0x%x so_id:0x%x] " + "cp_src_pos=%llu cp_dst_pos=%llu cp_count=%llu", + __get_sockaddr(addr), __entry->intra, __entry->async, + __entry->src_si_generation, __entry->src_cl_boot, + __entry->src_cl_id, __entry->src_so_id, + __entry->dst_si_generation, __entry->dst_cl_boot, + __entry->dst_cl_id, __entry->dst_so_id, + __entry->src_cp_pos, __entry->dst_cp_pos, __entry->cp_count + ) +); + +#define DEFINE_COPY_EVENT(name) \ +DEFINE_EVENT(nfsd_copy_class, nfsd_copy_##name, \ + TP_PROTO(const struct nfsd4_copy *copy), \ + TP_ARGS(copy)) + +DEFINE_COPY_EVENT(inter); +DEFINE_COPY_EVENT(intra); +DEFINE_COPY_EVENT(do_async); + +TRACE_EVENT(nfsd_copy_done, + TP_PROTO( + const struct nfsd4_copy *copy, + __be32 status + ), + TP_ARGS(copy, status), + TP_STRUCT__entry( + __field(int, status) + __field(bool, intra) + __field(bool, async) + __sockaddr(addr, sizeof(struct sockaddr_in6)) + ), + TP_fast_assign( + __entry->status = be32_to_cpu(status); + __entry->intra = test_bit(NFSD4_COPY_F_INTRA, ©->cp_flags); + __entry->async = !test_bit(NFSD4_COPY_F_SYNCHRONOUS, ©->cp_flags); + __assign_sockaddr(addr, ©->cp_clp->cl_addr, + sizeof(struct sockaddr_in6)); + ), + TP_printk("addr=%pISpc status=%d intra=%d async=%d ", + __get_sockaddr(addr), __entry->status, __entry->intra, __entry->async + ) +); + #endif /* _NFSD_TRACE_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 48260cf68fde..fbbea7498f02 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -337,6 +337,24 @@ out: return err; } +static void +commit_reset_write_verifier(struct nfsd_net *nn, struct svc_rqst *rqstp, + int err) +{ + switch (err) { + case -EAGAIN: + case -ESTALE: + /* + * Neither of these are the result of a problem with + * durable storage, so avoid a write verifier reset. + */ + break; + default: + nfsd_reset_write_verifier(nn); + trace_nfsd_writeverf_reset(nn, rqstp, err); + } +} + /* * Commit metadata changes to stable storage. */ @@ -520,7 +538,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, nfsd_sanitize_attrs(inode, iap); - if (check_guard && guardtime != inode_get_ctime(inode).tv_sec) + if (check_guard && guardtime != inode_get_ctime_sec(inode)) return nfserr_notsync; /* @@ -647,8 +665,7 @@ __be32 nfsd4_clone_file_range(struct svc_rqst *rqstp, &nfsd4_get_cstate(rqstp)->current_fh, dst_pos, count, status); - nfsd_reset_write_verifier(nn); - trace_nfsd_writeverf_reset(nn, rqstp, status); + commit_reset_write_verifier(nn, rqstp, status); ret = nfserrno(status); } } @@ -823,7 +840,7 @@ int nfsd_open_break_lease(struct inode *inode, int access) * and additional flags. * N.B. After this call fhp needs an fh_put */ -static __be32 +static int __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int may_flags, struct file **filp) { @@ -831,14 +848,12 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, struct inode *inode; struct file *file; int flags = O_RDONLY|O_LARGEFILE; - __be32 err; - int host_err = 0; + int host_err = -EPERM; path.mnt = fhp->fh_export->ex_path.mnt; path.dentry = fhp->fh_dentry; inode = d_inode(path.dentry); - err = nfserr_perm; if (IS_APPEND(inode) && (may_flags & NFSD_MAY_WRITE)) goto out; @@ -847,7 +862,7 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, host_err = nfsd_open_break_lease(inode, may_flags); if (host_err) /* NOMEM or WOULDBLOCK */ - goto out_nfserr; + goto out; if (may_flags & NFSD_MAY_WRITE) { if (may_flags & NFSD_MAY_READ) @@ -859,13 +874,13 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, file = dentry_open(&path, flags, current_cred()); if (IS_ERR(file)) { host_err = PTR_ERR(file); - goto out_nfserr; + goto out; } host_err = ima_file_check(file, may_flags); if (host_err) { fput(file); - goto out_nfserr; + goto out; } if (may_flags & NFSD_MAY_64BIT_COOKIE) @@ -874,10 +889,8 @@ __nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, file->f_mode |= FMODE_32BITHASH; *filp = file; -out_nfserr: - err = nfserrno(host_err); out: - return err; + return host_err; } __be32 @@ -885,6 +898,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int may_flags, struct file **filp) { __be32 err; + int host_err; bool retried = false; validate_process_creds(); @@ -904,12 +918,13 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, retry: err = fh_verify(rqstp, fhp, type, may_flags); if (!err) { - err = __nfsd_open(rqstp, fhp, type, may_flags, filp); - if (err == nfserr_stale && !retried) { + host_err = __nfsd_open(rqstp, fhp, type, may_flags, filp); + if (host_err == -EOPENSTALE && !retried) { retried = true; fh_put(fhp); goto retry; } + err = nfserrno(host_err); } validate_process_creds(); return err; @@ -922,13 +937,13 @@ retry: * @may_flags: internal permission flags * @filp: OUT: open "struct file *" * - * Returns an nfsstat value in network byte order. + * Returns zero on success, or a negative errno value. */ -__be32 +int nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, int may_flags, struct file **filp) { - __be32 err; + int err; validate_process_creds(); err = __nfsd_open(rqstp, fhp, S_IFREG, may_flags, filp); @@ -1172,8 +1187,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, host_err = vfs_iter_write(file, &iter, &pos, flags); file_end_write(file); if (host_err < 0) { - nfsd_reset_write_verifier(nn); - trace_nfsd_writeverf_reset(nn, rqstp, host_err); + commit_reset_write_verifier(nn, rqstp, host_err); goto out_nfserr; } *cnt = host_err; @@ -1185,10 +1199,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, if (stable && use_wgather) { host_err = wait_for_concurrent_writes(file); - if (host_err < 0) { - nfsd_reset_write_verifier(nn); - trace_nfsd_writeverf_reset(nn, rqstp, host_err); - } + if (host_err < 0) + commit_reset_write_verifier(nn, rqstp, host_err); } out_nfserr: @@ -1331,8 +1343,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, err = nfserr_notsupp; break; default: - nfsd_reset_write_verifier(nn); - trace_nfsd_writeverf_reset(nn, rqstp, err2); + commit_reset_write_verifier(nn, rqstp, err2); err = nfserrno(err2); } } else @@ -1788,6 +1799,12 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen)) goto out; + err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev; + if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) + goto out; + if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) + goto out; + retry: host_err = fh_want_write(ffhp); if (host_err) { @@ -1823,12 +1840,6 @@ retry: if (ndentry == trap) goto out_dput_new; - host_err = -EXDEV; - if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) - goto out_dput_new; - if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry) - goto out_dput_new; - if ((ndentry->d_sb->s_export_op->flags & EXPORT_OP_CLOSE_BEFORE_UNLINK) && nfsd_has_cached_files(ndentry)) { close_cached = true; diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index a6890ea7b765..e3c29596f4df 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -104,8 +104,8 @@ __be32 nfsd_setxattr(struct svc_rqst *rqstp, struct svc_fh *fhp, int nfsd_open_break_lease(struct inode *, int); __be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); -__be32 nfsd_open_verified(struct svc_rqst *, struct svc_fh *, - int, struct file **); +int nfsd_open_verified(struct svc_rqst *rqstp, struct svc_fh *fhp, + int may_flags, struct file **filp); __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, unsigned long *count, diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 9d918a79dc16..80e859dc84d8 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -50,6 +50,134 @@ #define HAS_CSTATE_FLAG(c, f) ((c)->sid_flags & (f)) #define CLEAR_CSTATE_FLAG(c, f) ((c)->sid_flags &= ~(f)) +/** + * nfsd4_encode_bool - Encode an XDR bool type result + * @xdr: target XDR stream + * @val: boolean value to encode + * + * Return values: + * %nfs_ok: @val encoded; @xdr advanced to next position + * %nfserr_resource: stream buffer space exhausted + */ +static __always_inline __be32 +nfsd4_encode_bool(struct xdr_stream *xdr, bool val) +{ + __be32 *p = xdr_reserve_space(xdr, XDR_UNIT); + + if (unlikely(p == NULL)) + return nfserr_resource; + *p = val ? xdr_one : xdr_zero; + return nfs_ok; +} + +/** + * nfsd4_encode_uint32_t - Encode an XDR uint32_t type result + * @xdr: target XDR stream + * @val: integer value to encode + * + * Return values: + * %nfs_ok: @val encoded; @xdr advanced to next position + * %nfserr_resource: stream buffer space exhausted + */ +static __always_inline __be32 +nfsd4_encode_uint32_t(struct xdr_stream *xdr, u32 val) +{ + __be32 *p = xdr_reserve_space(xdr, XDR_UNIT); + + if (unlikely(p == NULL)) + return nfserr_resource; + *p = cpu_to_be32(val); + return nfs_ok; +} + +#define nfsd4_encode_aceflag4(x, v) nfsd4_encode_uint32_t(x, v) +#define nfsd4_encode_acemask4(x, v) nfsd4_encode_uint32_t(x, v) +#define nfsd4_encode_acetype4(x, v) nfsd4_encode_uint32_t(x, v) +#define nfsd4_encode_count4(x, v) nfsd4_encode_uint32_t(x, v) +#define nfsd4_encode_mode4(x, v) nfsd4_encode_uint32_t(x, v) +#define nfsd4_encode_nfs_lease4(x, v) nfsd4_encode_uint32_t(x, v) +#define nfsd4_encode_qop4(x, v) nfsd4_encode_uint32_t(x, v) +#define nfsd4_encode_sequenceid4(x, v) nfsd4_encode_uint32_t(x, v) +#define nfsd4_encode_slotid4(x, v) nfsd4_encode_uint32_t(x, v) + +/** + * nfsd4_encode_uint64_t - Encode an XDR uint64_t type result + * @xdr: target XDR stream + * @val: integer value to encode + * + * Return values: + * %nfs_ok: @val encoded; @xdr advanced to next position + * %nfserr_resource: stream buffer space exhausted + */ +static __always_inline __be32 +nfsd4_encode_uint64_t(struct xdr_stream *xdr, u64 val) +{ + __be32 *p = xdr_reserve_space(xdr, XDR_UNIT * 2); + + if (unlikely(p == NULL)) + return nfserr_resource; + put_unaligned_be64(val, p); + return nfs_ok; +} + +#define nfsd4_encode_changeid4(x, v) nfsd4_encode_uint64_t(x, v) +#define nfsd4_encode_nfs_cookie4(x, v) nfsd4_encode_uint64_t(x, v) +#define nfsd4_encode_length4(x, v) nfsd4_encode_uint64_t(x, v) +#define nfsd4_encode_offset4(x, v) nfsd4_encode_uint64_t(x, v) + +/** + * nfsd4_encode_opaque_fixed - Encode a fixed-length XDR opaque type result + * @xdr: target XDR stream + * @data: pointer to data + * @size: length of data in bytes + * + * Return values: + * %nfs_ok: @data encoded; @xdr advanced to next position + * %nfserr_resource: stream buffer space exhausted + */ +static __always_inline __be32 +nfsd4_encode_opaque_fixed(struct xdr_stream *xdr, const void *data, + size_t size) +{ + __be32 *p = xdr_reserve_space(xdr, xdr_align_size(size)); + size_t pad = xdr_pad_size(size); + + if (unlikely(p == NULL)) + return nfserr_resource; + memcpy(p, data, size); + if (pad) + memset((char *)p + size, 0, pad); + return nfs_ok; +} + +/** + * nfsd4_encode_opaque - Encode a variable-length XDR opaque type result + * @xdr: target XDR stream + * @data: pointer to data + * @size: length of data in bytes + * + * Return values: + * %nfs_ok: @data encoded; @xdr advanced to next position + * %nfserr_resource: stream buffer space exhausted + */ +static __always_inline __be32 +nfsd4_encode_opaque(struct xdr_stream *xdr, const void *data, size_t size) +{ + size_t pad = xdr_pad_size(size); + __be32 *p; + + p = xdr_reserve_space(xdr, XDR_UNIT + xdr_align_size(size)); + if (unlikely(p == NULL)) + return nfserr_resource; + *p++ = cpu_to_be32(size); + memcpy(p, data, size); + if (pad) + memset((char *)p + size, 0, pad); + return nfs_ok; +} + +#define nfsd4_encode_component4(x, d, s) nfsd4_encode_opaque(x, d, s) + struct nfsd4_compound_state { struct svc_fh current_fh; struct svc_fh save_fh; @@ -170,12 +298,8 @@ struct nfsd4_lock { } v; /* response */ - union { - struct { - stateid_t stateid; - } ok; - struct nfsd4_lock_denied denied; - } u; + stateid_t lk_resp_stateid; + struct nfsd4_lock_denied lk_denied; }; #define lk_new_open_seqid v.new.open_seqid #define lk_new_open_stateid v.new.open_stateid @@ -185,20 +309,15 @@ struct nfsd4_lock { #define lk_old_lock_stateid v.old.lock_stateid #define lk_old_lock_seqid v.old.lock_seqid -#define lk_resp_stateid u.ok.stateid -#define lk_denied u.denied - - struct nfsd4_lockt { u32 lt_type; clientid_t lt_clientid; struct xdr_netobj lt_owner; u64 lt_offset; u64 lt_length; - struct nfsd4_lock_denied lt_denied; + struct nfsd4_lock_denied lt_denied; }; - struct nfsd4_locku { u32 lu_type; u32 lu_seqid; @@ -267,9 +386,9 @@ struct nfsd4_open { u32 op_deleg_want; /* request */ stateid_t op_stateid; /* response */ __be32 op_xdr_error; /* see nfsd4_open_omfg() */ - u32 op_recall; /* recall */ struct nfsd4_change_info op_cinfo; /* response */ u32 op_rflags; /* response */ + bool op_recall; /* response */ bool op_truncate; /* used during processing */ bool op_created; /* used during processing */ struct nfs4_openowner *op_openowner; /* used during processing */ @@ -496,7 +615,7 @@ struct nfsd4_layoutcommit { u32 lc_layout_type; /* request */ u32 lc_up_len; /* layout length */ void *lc_up_layout; /* decoded by callback */ - u32 lc_size_chg; /* boolean for response */ + bool lc_size_chg; /* response */ u64 lc_newsize; /* response */ }; @@ -508,7 +627,7 @@ struct nfsd4_layoutreturn { u32 lrf_body_len; /* request */ void *lrf_body; /* request */ stateid_t lr_sid; /* request/response */ - u32 lrs_present; /* response */ + bool lrs_present; /* response */ }; struct nfsd4_fallocate { @@ -626,8 +745,7 @@ struct nfsd4_copy_notify { /* response */ stateid_t cpn_cnr_stateid; - u64 cpn_sec; - u32 cpn_nsec; + struct timespec64 cpn_lease_time; struct nl4_server *cpn_src; }; @@ -820,8 +938,10 @@ extern __be32 nfsd4_open_downgrade(struct svc_rqst *rqstp, struct nfsd4_compound_state *, union nfsd4_op_u *u); extern __be32 nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *, union nfsd4_op_u *u); +extern void nfsd4_lock_release(union nfsd4_op_u *u); extern __be32 nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *, union nfsd4_op_u *u); +extern void nfsd4_lockt_release(union nfsd4_op_u *u); extern __be32 nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *, union nfsd4_op_u *u); extern __be32 diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h index 0d39af1b00a0..e8b00309c449 100644 --- a/fs/nfsd/xdr4cb.h +++ b/fs/nfsd/xdr4cb.h @@ -54,3 +54,21 @@ #define NFS4_dec_cb_recall_any_sz (cb_compound_dec_hdr_sz + \ cb_sequence_dec_sz + \ op_dec_sz) + +/* + * 1: CB_GETATTR opcode (32-bit) + * N: file_handle + * 1: number of entry in attribute array (32-bit) + * 1: entry 0 in attribute array (32-bit) + */ +#define NFS4_enc_cb_getattr_sz (cb_compound_enc_hdr_sz + \ + cb_sequence_enc_sz + \ + 1 + enc_nfs4_fh_sz + 1 + 1) +/* + * 4: fattr_bitmap_maxsz + * 1: attribute array len + * 2: change attr (64-bit) + * 2: size (64-bit) + */ +#define NFS4_dec_cb_getattr_sz (cb_compound_dec_hdr_sz + \ + cb_sequence_dec_sz + 4 + 1 + 2 + 2 + op_dec_sz) diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index bce734b68f08..de2073c47651 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -429,7 +429,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, nilfs_set_de_type(de, inode); nilfs_commit_chunk(page, mapping, from, to); nilfs_put_page(page); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); } /* @@ -519,7 +519,7 @@ got_it: de->inode = cpu_to_le64(inode->i_ino); nilfs_set_de_type(de, inode); nilfs_commit_chunk(page, page->mapping, from, to); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); nilfs_mark_inode_dirty(dir); /* OFFSET_CACHE */ out_put: @@ -567,7 +567,7 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) pde->rec_len = nilfs_rec_len_to_disk(to - from); dir->inode = 0; nilfs_commit_chunk(page, mapping, from, to); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); out: nilfs_put_page(page); return err; diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 48fe71d309cb..8beb2730929d 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -73,10 +73,8 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, struct the_nilfs *nilfs = inode->i_sb->s_fs_info; err = nilfs_dat_translate(nilfs->ns_dat, vbn, &pbn); - if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */ - brelse(bh); + if (unlikely(err)) /* -EIO, -ENOMEM, -ENOENT */ goto failed; - } } lock_buffer(bh); @@ -102,6 +100,8 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, failed: unlock_page(bh->b_page); put_page(bh->b_page); + if (unlikely(err)) + brelse(bh); return err; } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 1a8bd5993476..f861f3a0bf5c 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -366,7 +366,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) atomic64_inc(&root->inodes_count); inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_ino = ino; - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { err = nilfs_bmap_read(ii->i_bmap, NULL); @@ -449,12 +449,12 @@ int nilfs_read_inode_common(struct inode *inode, i_gid_write(inode, le32_to_cpu(raw_inode->i_gid)); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le64_to_cpu(raw_inode->i_size); - inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); + inode_set_atime(inode, le64_to_cpu(raw_inode->i_mtime), + le32_to_cpu(raw_inode->i_mtime_nsec)); inode_set_ctime(inode, le64_to_cpu(raw_inode->i_ctime), le32_to_cpu(raw_inode->i_ctime_nsec)); - inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime); - inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); - inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); + inode_set_mtime(inode, le64_to_cpu(raw_inode->i_mtime), + le32_to_cpu(raw_inode->i_mtime_nsec)); if (nilfs_is_metadata_file_inode(inode) && !S_ISREG(inode->i_mode)) return -EIO; /* this inode is for metadata and corrupted */ if (inode->i_nlink == 0) @@ -768,10 +768,10 @@ void nilfs_write_inode_common(struct inode *inode, raw_inode->i_gid = cpu_to_le32(i_gid_read(inode)); raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); raw_inode->i_size = cpu_to_le64(inode->i_size); - raw_inode->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec); - raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); - raw_inode->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); - raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + raw_inode->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); + raw_inode->i_mtime = cpu_to_le64(inode_get_mtime_sec(inode)); + raw_inode->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); + raw_inode->i_mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode)); raw_inode->i_blocks = cpu_to_le64(inode->i_blocks); raw_inode->i_flags = cpu_to_le32(ii->i_flags); @@ -875,7 +875,7 @@ void nilfs_truncate(struct inode *inode) nilfs_truncate_bmap(ii, blkoff); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (IS_SYNC(inode)) nilfs_set_transaction_flag(NILFS_TI_SYNC); diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index ebdcc25df0f7..869b016014d2 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -265,7 +265,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg) struct dnotify_struct *dn; struct inode *inode; fl_owner_t id = current->files; - struct file *f; + struct file *f = NULL; int destroy = 0, error = 0; __u32 mask; @@ -345,7 +345,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg) } rcu_read_lock(); - f = lookup_fd_rcu(fd); + f = lookup_fdget_rcu(fd); rcu_read_unlock(); /* if (f != filp) means that we lost a race and another task/thread @@ -392,6 +392,8 @@ out_err: fsnotify_put_mark(new_fsn_mark); if (dn) kmem_cache_free(dnotify_struct_cache, dn); + if (f) + fput(f); return error; } diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index f69c451018e3..62fe0b679e58 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1585,16 +1585,25 @@ static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid) } /* Check if filesystem can encode a unique fid */ -static int fanotify_test_fid(struct dentry *dentry) +static int fanotify_test_fid(struct dentry *dentry, unsigned int flags) { + unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; + const struct export_operations *nop = dentry->d_sb->s_export_op; + + /* + * We need to make sure that the filesystem supports encoding of + * file handles so user can use name_to_handle_at() to compare fids + * reported with events to the file handle of watched objects. + */ + if (!nop) + return -EOPNOTSUPP; + /* - * We need to make sure that the file system supports at least - * encoding a file handle so user can use name_to_handle_at() to - * compare fid returned with event to the file handle of watched - * objects. However, even the relaxed AT_HANDLE_FID flag requires - * at least empty export_operations for ecoding unique file ids. + * For sb/mount mark, we also need to make sure that the filesystem + * supports decoding file handles, so user has a way to map back the + * reported fids to filesystem objects. */ - if (!dentry->d_sb->s_export_op) + if (mark_type != FAN_MARK_INODE && !nop->fh_to_dentry) return -EOPNOTSUPP; return 0; @@ -1812,7 +1821,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, if (ret) goto path_put_and_out; - ret = fanotify_test_fid(path.dentry); + ret = fanotify_test_fid(path.dentry, flags); if (ret) goto path_put_and_out; diff --git a/fs/nsfs.c b/fs/nsfs.c index 647a22433bd8..9a4b228d42fa 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -84,7 +84,7 @@ slow: return -ENOMEM; } inode->i_ino = ns->inum; - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); inode->i_flags |= S_IMMUTABLE; inode->i_mode = S_IFREG | S_IRUGO; inode->i_fop = &ns_file_operations; diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 99ac6ea277c4..aba1e22db4e9 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -648,7 +648,7 @@ static int ntfs_read_locked_inode(struct inode *vi) * mtime is the last change of the data within the file. Not changed * when only metadata is changed, e.g. a rename doesn't affect mtime. */ - vi->i_mtime = ntfs2utc(si->last_data_change_time); + inode_set_mtime_to_ts(vi, ntfs2utc(si->last_data_change_time)); /* * ctime is the last change of the metadata of the file. This obviously * always changes, when mtime is changed. ctime can be changed on its @@ -659,7 +659,7 @@ static int ntfs_read_locked_inode(struct inode *vi) * Last access to the data within the file. Not changed during a rename * for example but changed whenever the file is written to. */ - vi->i_atime = ntfs2utc(si->last_access_time); + inode_set_atime_to_ts(vi, ntfs2utc(si->last_access_time)); /* Find the attribute list attribute if present. */ ntfs_attr_reinit_search_ctx(ctx); @@ -1217,9 +1217,9 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi) vi->i_uid = base_vi->i_uid; vi->i_gid = base_vi->i_gid; set_nlink(vi, base_vi->i_nlink); - vi->i_mtime = base_vi->i_mtime; + inode_set_mtime_to_ts(vi, inode_get_mtime(base_vi)); inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi)); - vi->i_atime = base_vi->i_atime; + inode_set_atime_to_ts(vi, inode_get_atime(base_vi)); vi->i_generation = ni->seq_no = base_ni->seq_no; /* Set inode type to zero but preserve permissions. */ @@ -1483,9 +1483,9 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) vi->i_uid = base_vi->i_uid; vi->i_gid = base_vi->i_gid; set_nlink(vi, base_vi->i_nlink); - vi->i_mtime = base_vi->i_mtime; + inode_set_mtime_to_ts(vi, inode_get_mtime(base_vi)); inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi)); - vi->i_atime = base_vi->i_atime; + inode_set_atime_to_ts(vi, inode_get_atime(base_vi)); vi->i_generation = ni->seq_no = base_ni->seq_no; /* Set inode type to zero but preserve permissions. */ vi->i_mode = base_vi->i_mode & ~S_IFMT; @@ -2805,13 +2805,14 @@ done: if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) { struct timespec64 now = current_time(VFS_I(base_ni)); struct timespec64 ctime = inode_get_ctime(VFS_I(base_ni)); + struct timespec64 mtime = inode_get_mtime(VFS_I(base_ni)); int sync_it = 0; - if (!timespec64_equal(&VFS_I(base_ni)->i_mtime, &now) || + if (!timespec64_equal(&mtime, &now) || !timespec64_equal(&ctime, &now)) sync_it = 1; inode_set_ctime_to_ts(VFS_I(base_ni), now); - VFS_I(base_ni)->i_mtime = now; + inode_set_mtime_to_ts(VFS_I(base_ni), now); if (sync_it) mark_inode_dirty_sync(VFS_I(base_ni)); @@ -2925,9 +2926,9 @@ int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, } } if (ia_valid & ATTR_ATIME) - vi->i_atime = attr->ia_atime; + inode_set_atime_to_ts(vi, attr->ia_atime); if (ia_valid & ATTR_MTIME) - vi->i_mtime = attr->ia_mtime; + inode_set_mtime_to_ts(vi, attr->ia_mtime); if (ia_valid & ATTR_CTIME) inode_set_ctime_to_ts(vi, attr->ia_ctime); mark_inode_dirty(vi); @@ -2996,7 +2997,7 @@ int __ntfs_write_inode(struct inode *vi, int sync) si = (STANDARD_INFORMATION*)((u8*)ctx->attr + le16_to_cpu(ctx->attr->data.resident.value_offset)); /* Update the access times if they have changed. */ - nt = utc2ntfs(vi->i_mtime); + nt = utc2ntfs(inode_get_mtime(vi)); if (si->last_data_change_time != nt) { ntfs_debug("Updating mtime for inode 0x%lx: old = 0x%llx, " "new = 0x%llx", vi->i_ino, (long long) @@ -3014,7 +3015,7 @@ int __ntfs_write_inode(struct inode *vi, int sync) si->last_mft_change_time = nt; modified = true; } - nt = utc2ntfs(vi->i_atime); + nt = utc2ntfs(inode_get_atime(vi)); if (si->last_access_time != nt) { ntfs_debug("Updating atime for inode 0x%lx: old = 0x%llx, " "new = 0x%llx", vi->i_ino, diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index ad1a8f72da22..6fd1dc4b08c8 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -2682,7 +2682,7 @@ mft_rec_already_initialized: vi->i_mode &= ~S_IWUGO; /* Set the inode times to the current time. */ - vi->i_atime = vi->i_mtime = inode_set_ctime_current(vi); + simple_inode_init_ts(vi); /* * Set the file size to 0, the ntfs inode sizes are set to 0 by * the call to ntfs_init_big_inode() below. diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c index a9d82bbb4729..63f70259edc0 100644 --- a/fs/ntfs3/attrib.c +++ b/fs/ntfs3/attrib.c @@ -1106,10 +1106,10 @@ repack: } } - /* + /* * The code below may require additional cluster (to extend attribute list) - * and / or one MFT record - * It is too complex to undo operations if -ENOSPC occurs deep inside + * and / or one MFT record + * It is too complex to undo operations if -ENOSPC occurs deep inside * in 'ni_insert_nonresident'. * Return in advance -ENOSPC here if there are no free cluster and no free MFT. */ @@ -1736,10 +1736,8 @@ repack: le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); - if (!attr_b) { - err = -ENOENT; - goto out; - } + if (!attr_b) + return -ENOENT; attr = attr_b; le = le_b; diff --git a/fs/ntfs3/attrlist.c b/fs/ntfs3/attrlist.c index 42631b31adf1..7c01735d1219 100644 --- a/fs/ntfs3/attrlist.c +++ b/fs/ntfs3/attrlist.c @@ -52,7 +52,8 @@ int ntfs_load_attr_list(struct ntfs_inode *ni, struct ATTRIB *attr) if (!attr->non_res) { lsize = le32_to_cpu(attr->res.data_size); - le = kmalloc(al_aligned(lsize), GFP_NOFS | __GFP_NOWARN); + /* attr is resident: lsize < record_size (1K or 4K) */ + le = kvmalloc(al_aligned(lsize), GFP_KERNEL); if (!le) { err = -ENOMEM; goto out; @@ -80,7 +81,17 @@ int ntfs_load_attr_list(struct ntfs_inode *ni, struct ATTRIB *attr) if (err < 0) goto out; - le = kmalloc(al_aligned(lsize), GFP_NOFS | __GFP_NOWARN); + /* attr is nonresident. + * The worst case: + * 1T (2^40) extremely fragmented file. + * cluster = 4K (2^12) => 2^28 fragments + * 2^9 fragments per one record => 2^19 records + * 2^5 bytes of ATTR_LIST_ENTRY per one record => 2^24 bytes. + * + * the result is 16M bytes per attribute list. + * Use kvmalloc to allocate in range [several Kbytes - dozen Mbytes] + */ + le = kvmalloc(al_aligned(lsize), GFP_KERNEL); if (!le) { err = -ENOMEM; goto out; diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c index 107e808e06ea..63f14a0232f6 100644 --- a/fs/ntfs3/bitmap.c +++ b/fs/ntfs3/bitmap.c @@ -125,6 +125,7 @@ void wnd_close(struct wnd_bitmap *wnd) struct rb_node *node, *next; kfree(wnd->free_bits); + wnd->free_bits = NULL; run_close(&wnd->run); node = rb_first(&wnd->start_tree); @@ -659,7 +660,8 @@ int wnd_init(struct wnd_bitmap *wnd, struct super_block *sb, size_t nbits) wnd->bits_last = wbits; wnd->free_bits = - kcalloc(wnd->nwnd, sizeof(u16), GFP_NOFS | __GFP_NOWARN); + kvmalloc_array(wnd->nwnd, sizeof(u16), GFP_KERNEL | __GFP_ZERO); + if (!wnd->free_bits) return -ENOMEM; diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c index 063a6654199b..ec0566b322d5 100644 --- a/fs/ntfs3/dir.c +++ b/fs/ntfs3/dir.c @@ -309,7 +309,11 @@ static inline int ntfs_filldir(struct ntfs_sb_info *sbi, struct ntfs_inode *ni, return 0; } - dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG; + /* NTFS: symlinks are "dir + reparse" or "file + reparse" */ + if (fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT) + dt_type = DT_LNK; + else + dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG; return !dir_emit(ctx, (s8 *)name, name_len, ino, dt_type); } diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 962f12ce6c0a..ad4a70b5d432 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -342,7 +342,7 @@ static int ntfs_extend(struct inode *inode, loff_t pos, size_t count, err = 0; } - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); if (IS_SYNC(inode)) { @@ -400,7 +400,7 @@ static int ntfs_truncate(struct inode *inode, loff_t new_size) ni_unlock(ni); ni->std_fa |= FILE_ATTRIBUTE_ARCHIVE; - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (!IS_DIRSYNC(inode)) { dirty = 1; } else { @@ -642,7 +642,7 @@ out: filemap_invalidate_unlock(mapping); if (!err) { - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); } @@ -745,8 +745,8 @@ static ssize_t ntfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) } static ssize_t ntfs_file_splice_read(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, - size_t len, unsigned int flags) + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { struct inode *inode = in->f_mapping->host; struct ntfs_inode *ni = ntfs_i(inode); diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 2b85cb10f0be..3df2d9e34b91 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -2148,7 +2148,7 @@ out1: for (i = 0; i < pages_per_frame; i++) { pg = pages[i]; - if (i == idx) + if (i == idx || !pg) continue; unlock_page(pg); put_page(pg); @@ -3208,6 +3208,12 @@ static bool ni_update_parent(struct ntfs_inode *ni, struct NTFS_DUP_INFO *dup, if (!fname || !memcmp(&fname->dup, dup, sizeof(fname->dup))) continue; + /* Check simple case when parent inode equals current inode. */ + if (ino_get(&fname->home) == ni->vfs_inode.i_ino) { + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + continue; + } + /* ntfs_iget5 may sleep. */ dir = ntfs_iget5(sb, &fname->home, NULL); if (IS_ERR(dir)) { @@ -3265,7 +3271,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint) if (is_rec_inuse(ni->mi.mrec) && !(sbi->flags & NTFS_FLAGS_LOG_REPLAYING) && inode->i_nlink) { bool modified = false; - struct timespec64 ctime = inode_get_ctime(inode); + struct timespec64 ts; /* Update times in standard attribute. */ std = ni_std(ni); @@ -3275,19 +3281,22 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint) } /* Update the access times if they have changed. */ - dup.m_time = kernel2nt(&inode->i_mtime); + ts = inode_get_mtime(inode); + dup.m_time = kernel2nt(&ts); if (std->m_time != dup.m_time) { std->m_time = dup.m_time; modified = true; } - dup.c_time = kernel2nt(&ctime); + ts = inode_get_mtime(inode); + dup.c_time = kernel2nt(&ts); if (std->c_time != dup.c_time) { std->c_time = dup.c_time; modified = true; } - dup.a_time = kernel2nt(&inode->i_atime); + ts = inode_get_atime(inode); + dup.a_time = kernel2nt(&ts); if (std->a_time != dup.a_time) { std->a_time = dup.a_time; modified = true; diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c index 12f28cdf5c83..98ccb6650858 100644 --- a/fs/ntfs3/fslog.c +++ b/fs/ntfs3/fslog.c @@ -2168,8 +2168,10 @@ file_is_valid: if (!page) { page = kmalloc(log->page_size, GFP_NOFS); - if (!page) - return -ENOMEM; + if (!page) { + err = -ENOMEM; + goto out; + } } /* diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index 33afee0f5559..fbfe21dbb425 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -983,18 +983,11 @@ out: if (err) return err; - mark_inode_dirty(&ni->vfs_inode); + mark_inode_dirty_sync(&ni->vfs_inode); /* verify(!ntfs_update_mftmirr()); */ - /* - * If we used wait=1, sync_inode_metadata waits for the io for the - * inode to finish. It hangs when media is removed. - * So wait=0 is sent down to sync_inode_metadata - * and filemap_fdatawrite is used for the data blocks. - */ - err = sync_inode_metadata(&ni->vfs_inode, 0); - if (!err) - err = filemap_fdatawrite(ni->vfs_inode.i_mapping); + /* write mft record on disk. */ + err = _ni_write_inode(&ni->vfs_inode, 1); return err; } @@ -2461,10 +2454,12 @@ void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim) { CLST end, i, zone_len, zlen; struct wnd_bitmap *wnd = &sbi->used.bitmap; + bool dirty = false; down_write_nested(&wnd->rw_lock, BITMAP_MUTEX_CLUSTERS); if (!wnd_is_used(wnd, lcn, len)) { - ntfs_set_state(sbi, NTFS_DIRTY_ERROR); + /* mark volume as dirty out of wnd->rw_lock */ + dirty = true; end = lcn + len; len = 0; @@ -2518,6 +2513,8 @@ void mark_as_free_ex(struct ntfs_sb_info *sbi, CLST lcn, CLST len, bool trim) out: up_write(&wnd->rw_lock); + if (dirty) + ntfs_set_state(sbi, NTFS_DIRTY_ERROR); } /* diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c index 124c6e822623..cf92b2433f7a 100644 --- a/fs/ntfs3/index.c +++ b/fs/ntfs3/index.c @@ -729,6 +729,9 @@ static struct NTFS_DE *hdr_find_e(const struct ntfs_index *indx, u32 total = le32_to_cpu(hdr->total); u16 offs[128]; + if (unlikely(!cmp)) + return NULL; + fill_table: if (end > total) return NULL; diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index eb2ed0701495..5e3d71374918 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -44,7 +44,7 @@ static struct inode *ntfs_read_mft(struct inode *inode, u64 t64; struct MFT_REC *rec; struct runs_tree *run; - struct timespec64 ctime; + struct timespec64 ts; inode->i_op = NULL; /* Setup 'uid' and 'gid' */ @@ -169,10 +169,12 @@ next_attr: #ifdef STATX_BTIME nt2kernel(std5->cr_time, &ni->i_crtime); #endif - nt2kernel(std5->a_time, &inode->i_atime); - ctime = inode_get_ctime(inode); - nt2kernel(std5->c_time, &ctime); - nt2kernel(std5->m_time, &inode->i_mtime); + nt2kernel(std5->a_time, &ts); + inode_set_atime_to_ts(inode, ts); + nt2kernel(std5->c_time, &ts); + inode_set_ctime_to_ts(inode, ts); + nt2kernel(std5->m_time, &ts); + inode_set_mtime_to_ts(inode, ts); ni->std_fa = std5->fa; @@ -960,7 +962,8 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, if (err >= 0) { if (!(ni->std_fa & FILE_ATTRIBUTE_ARCHIVE)) { - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, + inode_set_ctime_current(inode)); ni->std_fa |= FILE_ATTRIBUTE_ARCHIVE; dirty = true; } @@ -1660,8 +1663,11 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir, d_instantiate(dentry, inode); /* Set original time. inode times (i_ctime) may be changed in ntfs_init_acl. */ - inode->i_atime = inode->i_mtime = inode_set_ctime_to_ts(inode, ni->i_crtime); - dir->i_mtime = inode_set_ctime_to_ts(dir, ni->i_crtime); + inode_set_atime_to_ts(inode, ni->i_crtime); + inode_set_ctime_to_ts(inode, ni->i_crtime); + inode_set_mtime_to_ts(inode, ni->i_crtime); + inode_set_mtime_to_ts(dir, ni->i_crtime); + inode_set_ctime_to_ts(dir, ni->i_crtime); mark_inode_dirty(dir); mark_inode_dirty(inode); @@ -1767,7 +1773,7 @@ int ntfs_unlink_inode(struct inode *dir, const struct dentry *dentry) if (!err) { drop_nlink(inode); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); if (inode->i_nlink) diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index ad430d50bd79..ee3093be5170 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -156,8 +156,8 @@ static int ntfs_link(struct dentry *ode, struct inode *dir, struct dentry *de) err = ntfs_link_inode(inode, de); if (!err) { - dir->i_mtime = inode_set_ctime_to_ts(inode, - inode_set_ctime_current(dir)); + inode_set_ctime_current(inode); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(inode); mark_inode_dirty(dir); d_instantiate(de, inode); @@ -373,7 +373,7 @@ static int ntfs_atomic_open(struct inode *dir, struct dentry *dentry, #ifdef CONFIG_NTFS3_FS_POSIX_ACL if (IS_POSIXACL(dir)) { - /* + /* * Load in cache current acl to avoid ni_lock(dir): * ntfs_create_inode -> ntfs_init_acl -> posix_acl_create -> * ntfs_get_acl -> ntfs_get_acl_ex -> ni_lock diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h index 98b76d1b09e7..86aecbb01a92 100644 --- a/fs/ntfs3/ntfs.h +++ b/fs/ntfs3/ntfs.h @@ -847,7 +847,7 @@ struct OBJECT_ID { // Birth Volume Id is the Object Id of the Volume on. // which the Object Id was allocated. It never changes. struct GUID BirthVolumeId; //0x10: - + // Birth Object Id is the first Object Id that was // ever assigned to this MFT Record. I.e. If the Object Id // is changed for some reason, this field will reflect the diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 629403ede6e5..f6706143d14b 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -42,9 +42,11 @@ enum utf16_endian; #define MINUS_ONE_T ((size_t)(-1)) /* Biggest MFT / smallest cluster */ #define MAXIMUM_BYTES_PER_MFT 4096 +#define MAXIMUM_SHIFT_BYTES_PER_MFT 12 #define NTFS_BLOCKS_PER_MFT_RECORD (MAXIMUM_BYTES_PER_MFT / 512) #define MAXIMUM_BYTES_PER_INDEX 4096 +#define MAXIMUM_SHIFT_BYTES_PER_INDEX 12 #define NTFS_BLOCKS_PER_INODE (MAXIMUM_BYTES_PER_INDEX / 512) /* NTFS specific error code when fixup failed. */ @@ -495,8 +497,6 @@ int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, u32 flags); int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); -void ntfs_sparse_cluster(struct inode *inode, struct page *page0, CLST vcn, - CLST len); int ntfs_file_open(struct inode *inode, struct file *file); int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); @@ -872,7 +872,7 @@ int ntfs_init_acl(struct mnt_idmap *idmap, struct inode *inode, int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry); ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size); -extern const struct xattr_handler *ntfs_xattr_handlers[]; +extern const struct xattr_handler * const ntfs_xattr_handlers[]; int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size); void ntfs_get_wsl_perm(struct inode *inode); diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c index c12ebffc94da..53629b1f65e9 100644 --- a/fs/ntfs3/record.c +++ b/fs/ntfs3/record.c @@ -189,12 +189,19 @@ out: return err; } +/* + * mi_enum_attr - start/continue attributes enumeration in record. + * + * NOTE: mi->mrec - memory of size sbi->record_size + * here we sure that mi->mrec->total == sbi->record_size (see mi_read) + */ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) { const struct MFT_REC *rec = mi->mrec; u32 used = le32_to_cpu(rec->used); - u32 t32, off, asize; + u32 t32, off, asize, prev_type; u16 t16; + u64 data_size, alloc_size, tot_size; if (!attr) { u32 total = le32_to_cpu(rec->total); @@ -213,6 +220,7 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) if (!is_rec_inuse(rec)) return NULL; + prev_type = 0; attr = Add2Ptr(rec, off); } else { /* Check if input attr inside record. */ @@ -226,11 +234,11 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) return NULL; } - if (off + asize < off) { - /* Overflow check. */ + /* Overflow check. */ + if (off + asize < off) return NULL; - } + prev_type = le32_to_cpu(attr->type); attr = Add2Ptr(attr, asize); off += asize; } @@ -250,7 +258,11 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) /* 0x100 is last known attribute for now. */ t32 = le32_to_cpu(attr->type); - if ((t32 & 0xf) || (t32 > 0x100)) + if (!t32 || (t32 & 0xf) || (t32 > 0x100)) + return NULL; + + /* attributes in record must be ordered by type */ + if (t32 < prev_type) return NULL; /* Check overflow and boundary. */ @@ -259,16 +271,15 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) /* Check size of attribute. */ if (!attr->non_res) { + /* Check resident fields. */ if (asize < SIZEOF_RESIDENT) return NULL; t16 = le16_to_cpu(attr->res.data_off); - if (t16 > asize) return NULL; - t32 = le32_to_cpu(attr->res.data_size); - if (t16 + t32 > asize) + if (t16 + le32_to_cpu(attr->res.data_size) > asize) return NULL; t32 = sizeof(short) * attr->name_len; @@ -278,21 +289,52 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) return attr; } - /* Check some nonresident fields. */ - if (attr->name_len && - le16_to_cpu(attr->name_off) + sizeof(short) * attr->name_len > - le16_to_cpu(attr->nres.run_off)) { + /* Check nonresident fields. */ + if (attr->non_res != 1) + return NULL; + + t16 = le16_to_cpu(attr->nres.run_off); + if (t16 > asize) + return NULL; + + t32 = sizeof(short) * attr->name_len; + if (t32 && le16_to_cpu(attr->name_off) + t32 > t16) + return NULL; + + /* Check start/end vcn. */ + if (le64_to_cpu(attr->nres.svcn) > le64_to_cpu(attr->nres.evcn) + 1) return NULL; - } - if (attr->nres.svcn || !is_attr_ext(attr)) { + data_size = le64_to_cpu(attr->nres.data_size); + if (le64_to_cpu(attr->nres.valid_size) > data_size) + return NULL; + + alloc_size = le64_to_cpu(attr->nres.alloc_size); + if (data_size > alloc_size) + return NULL; + + t32 = mi->sbi->cluster_mask; + if (alloc_size & t32) + return NULL; + + if (!attr->nres.svcn && is_attr_ext(attr)) { + /* First segment of sparse/compressed attribute */ + if (asize + 8 < SIZEOF_NONRESIDENT_EX) + return NULL; + + tot_size = le64_to_cpu(attr->nres.total_size); + if (tot_size & t32) + return NULL; + + if (tot_size > alloc_size) + return NULL; + } else { if (asize + 8 < SIZEOF_NONRESIDENT) return NULL; if (attr->nres.c_unit) return NULL; - } else if (asize + 8 < SIZEOF_NONRESIDENT_EX) - return NULL; + } return attr; } diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index cfec5e0c7f66..f763e3256ccc 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -453,15 +453,23 @@ static struct proc_dir_entry *proc_info_root; * ntfs3.1 * cluster size * number of clusters + * total number of mft records + * number of used mft records ~= number of files + folders + * real state of ntfs "dirty"/"clean" + * current state of ntfs "dirty"/"clean" */ static int ntfs3_volinfo(struct seq_file *m, void *o) { struct super_block *sb = m->private; struct ntfs_sb_info *sbi = sb->s_fs_info; - seq_printf(m, "ntfs%d.%d\n%u\n%zu\n", sbi->volume.major_ver, - sbi->volume.minor_ver, sbi->cluster_size, - sbi->used.bitmap.nbits); + seq_printf(m, "ntfs%d.%d\n%u\n%zu\n\%zu\n%zu\n%s\n%s\n", + sbi->volume.major_ver, sbi->volume.minor_ver, + sbi->cluster_size, sbi->used.bitmap.nbits, + sbi->mft.bitmap.nbits, + sbi->mft.bitmap.nbits - wnd_zeroes(&sbi->mft.bitmap), + sbi->volume.real_dirty ? "dirty" : "clean", + (sbi->volume.flags & VOLUME_FLAG_DIRTY) ? "dirty" : "clean"); return 0; } @@ -488,9 +496,13 @@ static ssize_t ntfs3_label_write(struct file *file, const char __user *buffer, { int err; struct super_block *sb = pde_data(file_inode(file)); - struct ntfs_sb_info *sbi = sb->s_fs_info; ssize_t ret = count; - u8 *label = kmalloc(count, GFP_NOFS); + u8 *label; + + if (sb_rdonly(sb)) + return -EROFS; + + label = kmalloc(count, GFP_NOFS); if (!label) return -ENOMEM; @@ -502,7 +514,7 @@ static ssize_t ntfs3_label_write(struct file *file, const char __user *buffer, while (ret > 0 && label[ret - 1] == '\n') ret -= 1; - err = ntfs_set_label(sbi, label, ret); + err = ntfs_set_label(sb->s_fs_info, label, ret); if (err < 0) { ntfs_err(sb, "failed (%d) to write label", err); @@ -576,20 +588,30 @@ static noinline void ntfs3_put_sbi(struct ntfs_sb_info *sbi) wnd_close(&sbi->mft.bitmap); wnd_close(&sbi->used.bitmap); - if (sbi->mft.ni) + if (sbi->mft.ni) { iput(&sbi->mft.ni->vfs_inode); + sbi->mft.ni = NULL; + } - if (sbi->security.ni) + if (sbi->security.ni) { iput(&sbi->security.ni->vfs_inode); + sbi->security.ni = NULL; + } - if (sbi->reparse.ni) + if (sbi->reparse.ni) { iput(&sbi->reparse.ni->vfs_inode); + sbi->reparse.ni = NULL; + } - if (sbi->objid.ni) + if (sbi->objid.ni) { iput(&sbi->objid.ni->vfs_inode); + sbi->objid.ni = NULL; + } - if (sbi->volume.ni) + if (sbi->volume.ni) { iput(&sbi->volume.ni->vfs_inode); + sbi->volume.ni = NULL; + } ntfs_update_mftmirr(sbi, 0); @@ -836,7 +858,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, struct ntfs_sb_info *sbi = sb->s_fs_info; int err; u32 mb, gb, boot_sector_size, sct_per_clst, record_size; - u64 sectors, clusters, mlcn, mlcn2; + u64 sectors, clusters, mlcn, mlcn2, dev_size0; struct NTFS_BOOT *boot; struct buffer_head *bh; struct MFT_REC *rec; @@ -845,6 +867,9 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, u32 boot_off = 0; const char *hint = "Primary boot"; + /* Save original dev_size. Used with alternative boot. */ + dev_size0 = dev_size; + sbi->volume.blocks = dev_size >> PAGE_SHIFT; bh = ntfs_bread(sb, 0); @@ -853,6 +878,11 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, check_boot: err = -EINVAL; + + /* Corrupted image; do not read OOB */ + if (bh->b_size - sizeof(*boot) < boot_off) + goto out; + boot = (struct NTFS_BOOT *)Add2Ptr(bh->b_data, boot_off); if (memcmp(boot->system_id, "NTFS ", sizeof("NTFS ") - 1)) { @@ -899,9 +929,17 @@ check_boot: goto out; } - sbi->record_size = record_size = - boot->record_size < 0 ? 1 << (-boot->record_size) : - (u32)boot->record_size << cluster_bits; + if (boot->record_size >= 0) { + record_size = (u32)boot->record_size << cluster_bits; + } else if (-boot->record_size <= MAXIMUM_SHIFT_BYTES_PER_MFT) { + record_size = 1u << (-boot->record_size); + } else { + ntfs_err(sb, "%s: invalid record size %d.", hint, + boot->record_size); + goto out; + } + + sbi->record_size = record_size; sbi->record_bits = blksize_bits(record_size); sbi->attr_size_tr = (5 * record_size >> 4); // ~320 bytes @@ -918,9 +956,15 @@ check_boot: goto out; } - sbi->index_size = boot->index_size < 0 ? - 1u << (-boot->index_size) : - (u32)boot->index_size << cluster_bits; + if (boot->index_size >= 0) { + sbi->index_size = (u32)boot->index_size << cluster_bits; + } else if (-boot->index_size <= MAXIMUM_SHIFT_BYTES_PER_INDEX) { + sbi->index_size = 1u << (-boot->index_size); + } else { + ntfs_err(sb, "%s: invalid index size %d.", hint, + boot->index_size); + goto out; + } /* Check index record size. */ if (sbi->index_size < SECTOR_SIZE || !is_power_of_2(sbi->index_size)) { @@ -1055,17 +1099,17 @@ check_boot: if (bh->b_blocknr && !sb_rdonly(sb)) { /* - * Alternative boot is ok but primary is not ok. - * Do not update primary boot here 'cause it may be faked boot. - * Let ntfs to be mounted and update boot later. - */ + * Alternative boot is ok but primary is not ok. + * Do not update primary boot here 'cause it may be faked boot. + * Let ntfs to be mounted and update boot later. + */ *boot2 = kmemdup(boot, sizeof(*boot), GFP_NOFS | __GFP_NOWARN); } out: - if (err == -EINVAL && !bh->b_blocknr && dev_size > PAGE_SHIFT) { + if (err == -EINVAL && !bh->b_blocknr && dev_size0 > PAGE_SHIFT) { u32 block_size = min_t(u32, sector_size, PAGE_SIZE); - u64 lbo = dev_size - sizeof(*boot); + u64 lbo = dev_size0 - sizeof(*boot); /* * Try alternative boot (last sector) @@ -1079,6 +1123,7 @@ out: boot_off = lbo & (block_size - 1); hint = "Alternative boot"; + dev_size = dev_size0; /* restore original size. */ goto check_boot; } brelse(bh); @@ -1367,7 +1412,7 @@ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) } bytes = inode->i_size; - sbi->def_table = t = kmalloc(bytes, GFP_NOFS | __GFP_NOWARN); + sbi->def_table = t = kvmalloc(bytes, GFP_KERNEL); if (!t) { err = -ENOMEM; goto put_inode_out; @@ -1521,9 +1566,9 @@ load_root: if (boot2) { /* - * Alternative boot is ok but primary is not ok. - * Volume is recognized as NTFS. Update primary boot. - */ + * Alternative boot is ok but primary is not ok. + * Volume is recognized as NTFS. Update primary boot. + */ struct buffer_head *bh0 = sb_getblk(sb, 0); if (bh0) { if (buffer_locked(bh0)) @@ -1562,7 +1607,9 @@ load_root: put_inode_out: iput(inode); out: + ntfs3_put_sbi(sbi); kfree(boot2); + ntfs3_put_sbi(sbi); return err; } @@ -1756,7 +1803,6 @@ static int __init init_ntfs_fs(void) if (IS_ENABLED(CONFIG_NTFS3_LZX_XPRESS)) pr_info("ntfs3: Read-only LZX/Xpress compression included\n"); - #ifdef CONFIG_PROC_FS /* Create "/proc/fs/ntfs3" */ proc_info_root = proc_mkdir("fs/ntfs3", NULL); @@ -1798,7 +1844,6 @@ static void __exit exit_ntfs_fs(void) if (proc_info_root) remove_proc_entry("fs/ntfs3", NULL); #endif - } MODULE_LICENSE("GPL"); diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index 29fd391899e5..4274b6f31cfa 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -211,7 +211,8 @@ static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer, size = le32_to_cpu(info->size); /* Enumerate all xattrs. */ - for (ret = 0, off = 0; off < size; off += ea_size) { + ret = 0; + for (off = 0; off + sizeof(struct EA_FULL) < size; off += ea_size) { ea = Add2Ptr(ea_all, off); ea_size = unpacked_ea_size(ea); @@ -219,6 +220,10 @@ static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer, break; if (buffer) { + /* Check if we can use field ea->name */ + if (off + ea_size > size) + break; + if (ret + ea->name_len + 1 > bytes_per_buffer) { err = -ERANGE; goto out; @@ -1016,7 +1021,7 @@ static const struct xattr_handler ntfs_other_xattr_handler = { .list = ntfs_xattr_user_list, }; -const struct xattr_handler *ntfs_xattr_handlers[] = { +const struct xattr_handler * const ntfs_xattr_handlers[] = { &ntfs_other_xattr_handler, NULL, }; diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index e75137a8e7cb..62464d194da3 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -193,8 +193,8 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh, inode->i_mode = new_mode; inode_set_ctime_current(inode); di->i_mode = cpu_to_le16(inode->i_mode); - di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); + di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index aef58f1395c8..f0937902f7b4 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -7436,10 +7436,10 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, } inode->i_blocks = ocfs2_inode_sector_count(inode); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); - di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(inode).tv_sec); - di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); + di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(inode)); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 0fdba30740ab..6ab03494fc6e 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2048,9 +2048,9 @@ out_write_size: } inode->i_blocks = ocfs2_inode_sector_count(inode); di->i_size = cpu_to_le64((u64)i_size_read(inode)); - inode->i_mtime = inode_set_ctime_current(inode); - di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); - di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + di->i_mtime = di->i_ctime = cpu_to_le64(inode_get_mtime_sec(inode)); + di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode)); if (handle) ocfs2_update_inode_fsync_trans(handle, inode, 1); } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 21472e3ed182..4d7efefa98c5 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -213,7 +213,7 @@ struct o2hb_region { unsigned int hr_num_pages; struct page **hr_slot_data; - struct block_device *hr_bdev; + struct bdev_handle *hr_bdev_handle; struct o2hb_disk_slot *hr_slots; /* live node map of this region */ @@ -261,6 +261,11 @@ struct o2hb_region { int hr_last_hb_status; }; +static inline struct block_device *reg_bdev(struct o2hb_region *reg) +{ + return reg->hr_bdev_handle ? reg->hr_bdev_handle->bdev : NULL; +} + struct o2hb_bio_wait_ctxt { atomic_t wc_num_reqs; struct completion wc_io_complete; @@ -286,7 +291,7 @@ static void o2hb_write_timeout(struct work_struct *work) hr_write_timeout_work.work); mlog(ML_ERROR, "Heartbeat write timeout to device %pg after %u " - "milliseconds\n", reg->hr_bdev, + "milliseconds\n", reg_bdev(reg), jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); if (o2hb_global_heartbeat_active()) { @@ -383,7 +388,7 @@ static void o2hb_nego_timeout(struct work_struct *work) if (!test_bit(master_node, reg->hr_nego_node_bitmap)) { printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%pg).\n", o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, - config_item_name(®->hr_item), reg->hr_bdev); + config_item_name(®->hr_item), reg_bdev(reg)); set_bit(master_node, reg->hr_nego_node_bitmap); } if (!bitmap_equal(reg->hr_nego_node_bitmap, live_node_bitmap, @@ -398,7 +403,8 @@ static void o2hb_nego_timeout(struct work_struct *work) } printk(KERN_NOTICE "o2hb: all nodes hb write hung, maybe region %s (%pg) is down.\n", - config_item_name(®->hr_item), reg->hr_bdev); + config_item_name(®->hr_item), + reg_bdev(reg)); /* approve negotiate timeout request. */ o2hb_arm_timeout(reg); @@ -419,7 +425,7 @@ static void o2hb_nego_timeout(struct work_struct *work) /* negotiate timeout with master node. */ printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%pg), negotiate timeout with node %d.\n", o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, config_item_name(®->hr_item), - reg->hr_bdev, master_node); + reg_bdev(reg), master_node); ret = o2hb_send_nego_msg(reg->hr_key, O2HB_NEGO_TIMEOUT_MSG, master_node); if (ret) @@ -436,7 +442,8 @@ static int o2hb_nego_timeout_handler(struct o2net_msg *msg, u32 len, void *data, nego_msg = (struct o2hb_nego_msg *)msg->buf; printk(KERN_NOTICE "o2hb: receive negotiate timeout message from node %d on region %s (%pg).\n", - nego_msg->node_num, config_item_name(®->hr_item), reg->hr_bdev); + nego_msg->node_num, config_item_name(®->hr_item), + reg_bdev(reg)); if (nego_msg->node_num < O2NM_MAX_NODES) set_bit(nego_msg->node_num, reg->hr_nego_node_bitmap); else @@ -451,7 +458,7 @@ static int o2hb_nego_approve_handler(struct o2net_msg *msg, u32 len, void *data, struct o2hb_region *reg = data; printk(KERN_NOTICE "o2hb: negotiate timeout approved by master node on region %s (%pg).\n", - config_item_name(®->hr_item), reg->hr_bdev); + config_item_name(®->hr_item), reg_bdev(reg)); o2hb_arm_timeout(reg); return 0; } @@ -515,7 +522,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, * GFP_KERNEL that the local node can get fenced. It would be * nicest if we could pre-allocate these bios and avoid this * all together. */ - bio = bio_alloc(reg->hr_bdev, 16, opf, GFP_ATOMIC); + bio = bio_alloc(reg_bdev(reg), 16, opf, GFP_ATOMIC); if (!bio) { mlog(ML_ERROR, "Could not alloc slots BIO!\n"); bio = ERR_PTR(-ENOMEM); @@ -687,7 +694,7 @@ static int o2hb_check_own_slot(struct o2hb_region *reg) errstr = ERRSTR3; mlog(ML_ERROR, "%s (%pg): expected(%u:0x%llx, 0x%llx), " - "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg->hr_bdev, + "ondisk(%u:0x%llx, 0x%llx)\n", errstr, reg_bdev(reg), slot->ds_node_num, (unsigned long long)slot->ds_last_generation, (unsigned long long)slot->ds_last_time, hb_block->hb_node, (unsigned long long)le64_to_cpu(hb_block->hb_generation), @@ -861,7 +868,7 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg) goto unlock; printk(KERN_NOTICE "o2hb: Region %s (%pg) is now a quorum device\n", - config_item_name(®->hr_item), reg->hr_bdev); + config_item_name(®->hr_item), reg_bdev(reg)); set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); @@ -920,7 +927,7 @@ static int o2hb_check_slot(struct o2hb_region *reg, * consider it a transient miss but don't populate any * other values as they may be junk. */ mlog(ML_ERROR, "Node %d has written a bad crc to %pg\n", - slot->ds_node_num, reg->hr_bdev); + slot->ds_node_num, reg_bdev(reg)); o2hb_dump_slot(hb_block); slot->ds_equal_samples++; @@ -1003,8 +1010,8 @@ fire_callbacks: "of %u ms, but our count is %u ms.\n" "Please double check your configuration values " "for 'O2CB_HEARTBEAT_THRESHOLD'\n", - slot->ds_node_num, reg->hr_bdev, slot_dead_ms, - dead_ms); + slot->ds_node_num, reg_bdev(reg), + slot_dead_ms, dead_ms); } goto out; } @@ -1143,7 +1150,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) * can't be sure that the new block ever made it to * disk */ mlog(ML_ERROR, "Write error %d on device \"%pg\"\n", - write_wc.wc_error, reg->hr_bdev); + write_wc.wc_error, reg_bdev(reg)); ret = write_wc.wc_error; goto bail; } @@ -1169,7 +1176,7 @@ bail: printk(KERN_NOTICE "o2hb: Unable to stabilize " "heartbeat on region %s (%pg)\n", config_item_name(®->hr_item), - reg->hr_bdev); + reg_bdev(reg)); atomic_set(®->hr_steady_iterations, 0); reg->hr_aborted_start = 1; wake_up(&o2hb_steady_queue); @@ -1489,7 +1496,7 @@ static void o2hb_region_release(struct config_item *item) struct page *page; struct o2hb_region *reg = to_o2hb_region(item); - mlog(ML_HEARTBEAT, "hb region release (%pg)\n", reg->hr_bdev); + mlog(ML_HEARTBEAT, "hb region release (%pg)\n", reg_bdev(reg)); kfree(reg->hr_tmp_block); @@ -1502,8 +1509,8 @@ static void o2hb_region_release(struct config_item *item) kfree(reg->hr_slot_data); } - if (reg->hr_bdev) - blkdev_put(reg->hr_bdev, NULL); + if (reg->hr_bdev_handle) + bdev_release(reg->hr_bdev_handle); kfree(reg->hr_slots); @@ -1562,7 +1569,7 @@ static ssize_t o2hb_region_block_bytes_store(struct config_item *item, unsigned long block_bytes; unsigned int block_bits; - if (reg->hr_bdev) + if (reg->hr_bdev_handle) return -EINVAL; status = o2hb_read_block_input(reg, page, &block_bytes, @@ -1591,7 +1598,7 @@ static ssize_t o2hb_region_start_block_store(struct config_item *item, char *p = (char *)page; ssize_t ret; - if (reg->hr_bdev) + if (reg->hr_bdev_handle) return -EINVAL; ret = kstrtoull(p, 0, &tmp); @@ -1616,7 +1623,7 @@ static ssize_t o2hb_region_blocks_store(struct config_item *item, unsigned long tmp; char *p = (char *)page; - if (reg->hr_bdev) + if (reg->hr_bdev_handle) return -EINVAL; tmp = simple_strtoul(p, &p, 0); @@ -1635,8 +1642,8 @@ static ssize_t o2hb_region_dev_show(struct config_item *item, char *page) { unsigned int ret = 0; - if (to_o2hb_region(item)->hr_bdev) - ret = sprintf(page, "%pg\n", to_o2hb_region(item)->hr_bdev); + if (to_o2hb_region(item)->hr_bdev_handle) + ret = sprintf(page, "%pg\n", reg_bdev(to_o2hb_region(item))); return ret; } @@ -1745,7 +1752,10 @@ out: return ret; } -/* this is acting as commit; we set up all of hr_bdev and hr_task or nothing */ +/* + * this is acting as commit; we set up all of hr_bdev_handle and hr_task or + * nothing + */ static ssize_t o2hb_region_dev_store(struct config_item *item, const char *page, size_t count) @@ -1759,7 +1769,7 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, ssize_t ret = -EINVAL; int live_threshold; - if (reg->hr_bdev) + if (reg->hr_bdev_handle) goto out; /* We can't heartbeat without having had our node number @@ -1785,16 +1795,15 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, if (!S_ISBLK(f.file->f_mapping->host->i_mode)) goto out2; - reg->hr_bdev = blkdev_get_by_dev(f.file->f_mapping->host->i_rdev, - BLK_OPEN_WRITE | BLK_OPEN_READ, NULL, - NULL); - if (IS_ERR(reg->hr_bdev)) { - ret = PTR_ERR(reg->hr_bdev); - reg->hr_bdev = NULL; + reg->hr_bdev_handle = bdev_open_by_dev(f.file->f_mapping->host->i_rdev, + BLK_OPEN_WRITE | BLK_OPEN_READ, NULL, NULL); + if (IS_ERR(reg->hr_bdev_handle)) { + ret = PTR_ERR(reg->hr_bdev_handle); + reg->hr_bdev_handle = NULL; goto out2; } - sectsize = bdev_logical_block_size(reg->hr_bdev); + sectsize = bdev_logical_block_size(reg_bdev(reg)); if (sectsize != reg->hr_block_bytes) { mlog(ML_ERROR, "blocksize %u incorrect for device, expected %d", @@ -1890,12 +1899,12 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, if (hb_task && o2hb_global_heartbeat_active()) printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%pg)\n", - config_item_name(®->hr_item), reg->hr_bdev); + config_item_name(®->hr_item), reg_bdev(reg)); out3: if (ret < 0) { - blkdev_put(reg->hr_bdev, NULL); - reg->hr_bdev = NULL; + bdev_release(reg->hr_bdev_handle); + reg->hr_bdev_handle = NULL; } out2: fdput(f); @@ -2085,7 +2094,7 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%pg)\n", ((atomic_read(®->hr_steady_iterations) == 0) ? "stopped" : "start aborted"), config_item_name(item), - reg->hr_bdev); + reg_bdev(reg)); } /* diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 8b123d543e6e..a14c8fee6ee5 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1658,7 +1658,8 @@ int __ocfs2_add_entry(handle_t *handle, offset, ocfs2_dir_trailer_blk_off(dir->i_sb)); if (ocfs2_dirent_would_fit(de, rec_len)) { - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, + inode_set_ctime_current(dir)); retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); if (retval < 0) { mlog_errno(retval); @@ -2962,11 +2963,11 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, ocfs2_dinode_new_extent_list(dir, di); i_size_write(dir, sb->s_blocksize); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); di->i_size = cpu_to_le64(sb->s_blocksize); - di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(dir).tv_sec); - di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(dir).tv_nsec); + di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(dir)); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(dir)); ocfs2_update_inode_fsync_trans(handle, dir, 1); /* diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 81265123ce6c..9b57d012fd5c 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -337,7 +337,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb) if (inode) { inode->i_ino = get_next_ino(); inode_init_owner(&nop_mnt_idmap, inode, NULL, mode); - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); inc_nlink(inode); inode->i_fop = &simple_dir_operations; @@ -360,7 +360,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent, inode->i_ino = get_next_ino(); inode_init_owner(&nop_mnt_idmap, inode, parent, mode); - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); ip = DLMFS_I(inode); ip->ip_conn = DLMFS_I(parent)->ip_conn; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index c3e2961ee5db..64a6ef638495 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2162,7 +2162,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; struct ocfs2_meta_lvb *lvb; - struct timespec64 ctime = inode_get_ctime(inode); + struct timespec64 ts; lvb = ocfs2_dlm_lvb(&lockres->l_lksb); @@ -2183,12 +2183,12 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) lvb->lvb_igid = cpu_to_be32(i_gid_read(inode)); lvb->lvb_imode = cpu_to_be16(inode->i_mode); lvb->lvb_inlink = cpu_to_be16(inode->i_nlink); - lvb->lvb_iatime_packed = - cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime)); - lvb->lvb_ictime_packed = - cpu_to_be64(ocfs2_pack_timespec(&ctime)); - lvb->lvb_imtime_packed = - cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime)); + ts = inode_get_atime(inode); + lvb->lvb_iatime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts)); + ts = inode_get_ctime(inode); + lvb->lvb_ictime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts)); + ts = inode_get_mtime(inode); + lvb->lvb_imtime_packed = cpu_to_be64(ocfs2_pack_timespec(&ts)); lvb->lvb_iattr = cpu_to_be32(oi->ip_attr); lvb->lvb_idynfeatures = cpu_to_be16(oi->ip_dyn_features); lvb->lvb_igeneration = cpu_to_be32(inode->i_generation); @@ -2209,7 +2209,7 @@ static int ocfs2_refresh_inode_from_lvb(struct inode *inode) struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; struct ocfs2_meta_lvb *lvb; - struct timespec64 ctime; + struct timespec64 ts; mlog_meta_lvb(0, lockres); @@ -2236,13 +2236,12 @@ static int ocfs2_refresh_inode_from_lvb(struct inode *inode) i_gid_write(inode, be32_to_cpu(lvb->lvb_igid)); inode->i_mode = be16_to_cpu(lvb->lvb_imode); set_nlink(inode, be16_to_cpu(lvb->lvb_inlink)); - ocfs2_unpack_timespec(&inode->i_atime, - be64_to_cpu(lvb->lvb_iatime_packed)); - ocfs2_unpack_timespec(&inode->i_mtime, - be64_to_cpu(lvb->lvb_imtime_packed)); - ocfs2_unpack_timespec(&ctime, - be64_to_cpu(lvb->lvb_ictime_packed)); - inode_set_ctime_to_ts(inode, ctime); + ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_iatime_packed)); + inode_set_atime_to_ts(inode, ts); + ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_imtime_packed)); + inode_set_mtime_to_ts(inode, ts); + ocfs2_unpack_timespec(&ts, be64_to_cpu(lvb->lvb_ictime_packed)); + inode_set_ctime_to_ts(inode, ts); spin_unlock(&oi->ip_lock); return 0; } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index c45596c25c66..94e2a1244442 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -233,16 +233,18 @@ int ocfs2_should_update_atime(struct inode *inode, if (vfsmnt->mnt_flags & MNT_RELATIME) { struct timespec64 ctime = inode_get_ctime(inode); + struct timespec64 atime = inode_get_atime(inode); + struct timespec64 mtime = inode_get_mtime(inode); - if ((timespec64_compare(&inode->i_atime, &inode->i_mtime) <= 0) || - (timespec64_compare(&inode->i_atime, &ctime) <= 0)) + if ((timespec64_compare(&atime, &mtime) <= 0) || + (timespec64_compare(&atime, &ctime) <= 0)) return 1; return 0; } now = current_time(inode); - if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum)) + if ((now.tv_sec - inode_get_atime_sec(inode) <= osb->s_atime_quantum)) return 0; else return 1; @@ -275,9 +277,9 @@ int ocfs2_update_inode_atime(struct inode *inode, * have i_rwsem to guard against concurrent changes to other * inode fields. */ - inode->i_atime = current_time(inode); - di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); - di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); + inode_set_atime_to_ts(inode, current_time(inode)); + di->i_atime = cpu_to_le64(inode_get_atime_sec(inode)); + di->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode)); ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, bh); @@ -296,7 +298,7 @@ int ocfs2_set_inode_size(handle_t *handle, i_size_write(inode, new_i_size); inode->i_blocks = ocfs2_inode_sector_count(inode); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); if (status < 0) { @@ -417,12 +419,12 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, } i_size_write(inode, new_i_size); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); di = (struct ocfs2_dinode *) fe_bh->b_data; di->i_size = cpu_to_le64(new_i_size); - di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(inode).tv_sec); - di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); + di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime_sec(inode)); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, fe_bh); @@ -821,9 +823,9 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, i_size_write(inode, abs_to); inode->i_blocks = ocfs2_inode_sector_count(inode); di->i_size = cpu_to_le64((u64)i_size_read(inode)); - inode->i_mtime = inode_set_ctime_current(inode); - di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + di->i_mtime = di->i_ctime = cpu_to_le64(inode_get_mtime_sec(inode)); + di->i_ctime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode)); di->i_mtime_nsec = di->i_ctime_nsec; if (handle) { ocfs2_journal_dirty(handle, di_bh); @@ -2040,7 +2042,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, goto out_inode_unlock; } - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); if (ret < 0) mlog_errno(ret); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index e8771600b930..999111bfc271 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -302,10 +302,10 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, inode->i_blocks = ocfs2_inode_sector_count(inode); inode->i_mapping->a_ops = &ocfs2_aops; } - inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); - inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); - inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); - inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); + inode_set_atime(inode, le64_to_cpu(fe->i_atime), + le32_to_cpu(fe->i_atime_nsec)); + inode_set_mtime(inode, le64_to_cpu(fe->i_mtime), + le32_to_cpu(fe->i_mtime_nsec)); inode_set_ctime(inode, le64_to_cpu(fe->i_ctime), le32_to_cpu(fe->i_ctime_nsec)); @@ -1312,12 +1312,12 @@ int ocfs2_mark_inode_dirty(handle_t *handle, fe->i_uid = cpu_to_le32(i_uid_read(inode)); fe->i_gid = cpu_to_le32(i_gid_read(inode)); fe->i_mode = cpu_to_le16(inode->i_mode); - fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec); - fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); - fe->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec); - fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); - fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); - fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + fe->i_atime = cpu_to_le64(inode_get_atime_sec(inode)); + fe->i_atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode)); + fe->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); + fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); + fe->i_mtime = cpu_to_le64(inode_get_mtime_sec(inode)); + fe->i_mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode)); ocfs2_journal_dirty(handle, bh); ocfs2_update_inode_fsync_trans(handle, inode, 1); @@ -1348,10 +1348,10 @@ void ocfs2_refresh_inode(struct inode *inode, inode->i_blocks = 0; else inode->i_blocks = ocfs2_inode_sector_count(inode); - inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime); - inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); - inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); - inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); + inode_set_atime(inode, le64_to_cpu(fe->i_atime), + le32_to_cpu(fe->i_atime_nsec)); + inode_set_mtime(inode, le64_to_cpu(fe->i_mtime), + le32_to_cpu(fe->i_mtime_nsec)); inode_set_ctime(inode, le64_to_cpu(fe->i_ctime), le32_to_cpu(fe->i_ctime_nsec)); diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 05d67968a3a9..1f9ed117e78b 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -951,8 +951,8 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) di = (struct ocfs2_dinode *)di_bh->b_data; inode_set_ctime_current(inode); - di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); + di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 5cd6d7771cea..681e9501cdd3 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -795,8 +795,8 @@ static int ocfs2_link(struct dentry *old_dentry, inc_nlink(inode); inode_set_ctime_current(inode); ocfs2_set_links_count(fe, inode->i_nlink); - fe->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec); - fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); + fe->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); + fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); ocfs2_journal_dirty(handle, fe_bh); err = ocfs2_add_entry(handle, dentry, inode, @@ -995,7 +995,7 @@ static int ocfs2_unlink(struct inode *dir, ocfs2_set_links_count(fe, inode->i_nlink); ocfs2_journal_dirty(handle, fe_bh); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); if (S_ISDIR(inode->i_mode)) drop_nlink(dir); @@ -1550,8 +1550,8 @@ static int ocfs2_rename(struct mnt_idmap *idmap, if (status >= 0) { old_di = (struct ocfs2_dinode *) old_inode_bh->b_data; - old_di->i_ctime = cpu_to_le64(inode_get_ctime(old_inode).tv_sec); - old_di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(old_inode).tv_nsec); + old_di->i_ctime = cpu_to_le64(inode_get_ctime_sec(old_inode)); + old_di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(old_inode)); ocfs2_journal_dirty(handle, old_inode_bh); } else mlog_errno(status); @@ -1592,7 +1592,7 @@ static int ocfs2_rename(struct mnt_idmap *idmap, drop_nlink(new_inode); inode_set_ctime_current(new_inode); } - old_dir->i_mtime = inode_set_ctime_current(old_dir); + inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir)); if (update_dot_dot) { status = ocfs2_update_entry(old_inode, handle, @@ -1614,8 +1614,8 @@ static int ocfs2_rename(struct mnt_idmap *idmap, if (old_dir != new_dir) { /* Keep the same times on both directories.*/ - new_dir->i_mtime = inode_set_ctime_to_ts(new_dir, - inode_get_ctime(old_dir)); + inode_set_mtime_to_ts(new_dir, + inode_set_ctime_to_ts(new_dir, inode_get_ctime(old_dir))); /* * This will also pick up the i_nlink change from the diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 25c8ec3c8c3a..3f80a56d0d60 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3751,8 +3751,8 @@ static int ocfs2_change_ctime(struct inode *inode, } inode_set_ctime_current(inode); - di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); + di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); ocfs2_journal_dirty(handle, di_bh); @@ -4075,10 +4075,10 @@ static int ocfs2_complete_reflink(struct inode *s_inode, */ inode_set_ctime_current(t_inode); - di->i_ctime = cpu_to_le64(inode_get_ctime(t_inode).tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(t_inode).tv_nsec); + di->i_ctime = cpu_to_le64(inode_get_ctime_sec(t_inode)); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(t_inode)); - t_inode->i_mtime = s_inode->i_mtime; + inode_set_mtime_to_ts(t_inode, inode_get_mtime(s_inode)); di->i_mtime = s_di->i_mtime; di->i_mtime_nsec = s_di->i_mtime_nsec; } @@ -4456,7 +4456,7 @@ int ocfs2_reflink_update_dest(struct inode *dest, if (newlen > i_size_read(dest)) i_size_write(dest, newlen); spin_unlock(&OCFS2_I(dest)->ip_lock); - dest->i_mtime = inode_set_ctime_current(dest); + inode_set_mtime_to_ts(dest, inode_set_ctime_current(dest)); ret = ocfs2_mark_inode_dirty(handle, dest, d_bh); if (ret) { diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 6510ad783c91..3b81213ed7b8 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -87,14 +87,14 @@ static struct ocfs2_xattr_def_value_root def_xv = { .xv.xr_list.l_count = cpu_to_le16(1), }; -const struct xattr_handler *ocfs2_xattr_handlers[] = { +const struct xattr_handler * const ocfs2_xattr_handlers[] = { &ocfs2_xattr_user_handler, &ocfs2_xattr_trusted_handler, &ocfs2_xattr_security_handler, NULL }; -static const struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { +static const struct xattr_handler * const ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] = &nop_posix_acl_access, [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &nop_posix_acl_default, @@ -3422,8 +3422,8 @@ static int __ocfs2_xattr_set_handle(struct inode *inode, } inode_set_ctime_current(inode); - di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); + di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode)); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); ocfs2_journal_dirty(ctxt->handle, xis->inode_bh); } out: diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 00308b57f64f..65e9aa743919 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -30,7 +30,7 @@ struct ocfs2_security_xattr_info { extern const struct xattr_handler ocfs2_xattr_user_handler; extern const struct xattr_handler ocfs2_xattr_trusted_handler; extern const struct xattr_handler ocfs2_xattr_security_handler; -extern const struct xattr_handler *ocfs2_xattr_handlers[]; +extern const struct xattr_handler * const ocfs2_xattr_handlers[]; ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int, diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 2f8c1882f45c..d6cd81163030 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -51,7 +51,7 @@ struct inode *omfs_new_inode(struct inode *dir, umode_t mode) inode_init_owner(&nop_mnt_idmap, inode, NULL, mode); inode->i_mapping->a_ops = &omfs_aops; - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); switch (mode & S_IFMT) { case S_IFDIR: inode->i_op = &omfs_dir_inops; @@ -134,8 +134,8 @@ static int __omfs_write_inode(struct inode *inode, int wait) oi->i_head.h_magic = OMFS_IMAGIC; oi->i_size = cpu_to_be64(inode->i_size); - ctime = inode_get_ctime(inode).tv_sec * 1000LL + - ((inode_get_ctime(inode).tv_nsec + 999)/1000); + ctime = inode_get_ctime_sec(inode) * 1000LL + + ((inode_get_ctime_nsec(inode) + 999)/1000); oi->i_ctime = cpu_to_be64(ctime); omfs_update_checksums(oi); @@ -230,11 +230,9 @@ struct inode *omfs_iget(struct super_block *sb, ino_t ino) ctime = be64_to_cpu(oi->i_ctime); nsecs = do_div(ctime, 1000) * 1000L; - inode->i_atime.tv_sec = ctime; - inode->i_mtime.tv_sec = ctime; + inode_set_atime(inode, ctime, nsecs); + inode_set_mtime(inode, ctime, nsecs); inode_set_ctime(inode, ctime, nsecs); - inode->i_atime.tv_nsec = nsecs; - inode->i_mtime.tv_nsec = nsecs; inode->i_mapping->a_ops = &omfs_aops; diff --git a/fs/open.c b/fs/open.c index 98f6601fbac6..02dc608d40d8 100644 --- a/fs/open.c +++ b/fs/open.c @@ -870,6 +870,30 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) return ksys_fchown(fd, user, group); } +static inline int file_get_write_access(struct file *f) +{ + int error; + + error = get_write_access(f->f_inode); + if (unlikely(error)) + return error; + error = mnt_get_write_access(f->f_path.mnt); + if (unlikely(error)) + goto cleanup_inode; + if (unlikely(f->f_mode & FMODE_BACKING)) { + error = mnt_get_write_access(backing_file_user_path(f)->mnt); + if (unlikely(error)) + goto cleanup_mnt; + } + return 0; + +cleanup_mnt: + mnt_put_write_access(f->f_path.mnt); +cleanup_inode: + put_write_access(f->f_inode); + return error; +} + static int do_dentry_open(struct file *f, struct inode *inode, int (*open)(struct inode *, struct file *)) @@ -892,14 +916,9 @@ static int do_dentry_open(struct file *f, if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) { i_readcount_inc(inode); } else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) { - error = get_write_access(inode); + error = file_get_write_access(f); if (unlikely(error)) goto cleanup_file; - error = __mnt_want_write(f->f_path.mnt); - if (unlikely(error)) { - put_write_access(inode); - goto cleanup_file; - } f->f_mode |= FMODE_WRITER; } @@ -1163,20 +1182,19 @@ EXPORT_SYMBOL_GPL(kernel_file_open); /** * backing_file_open - open a backing file for kernel internal use - * @path: path of the file to open + * @user_path: path that the user reuqested to open * @flags: open flags * @real_path: path of the backing file * @cred: credentials for open * * Open a backing file for a stackable filesystem (e.g., overlayfs). - * @path may be on the stackable filesystem and backing inode on the - * underlying filesystem. In this case, we want to be able to return - * the @real_path of the backing inode. This is done by embedding the - * returned file into a container structure that also stores the path of - * the backing inode on the underlying filesystem, which can be - * retrieved using backing_file_real_path(). + * @user_path may be on the stackable filesystem and @real_path on the + * underlying filesystem. In this case, we want to be able to return the + * @user_path of the stackable filesystem. This is done by embedding the + * returned file into a container structure that also stores the stacked + * file's path, which can be retrieved using backing_file_user_path(). */ -struct file *backing_file_open(const struct path *path, int flags, +struct file *backing_file_open(const struct path *user_path, int flags, const struct path *real_path, const struct cred *cred) { @@ -1187,9 +1205,9 @@ struct file *backing_file_open(const struct path *path, int flags, if (IS_ERR(f)) return f; - f->f_path = *path; - path_get(real_path); - *backing_file_real_path(f) = *real_path; + path_get(user_path); + *backing_file_user_path(f) = *user_path; + f->f_path = *real_path; error = do_dentry_open(f, d_inode(real_path->dentry), NULL); if (error) { fput(f); diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index b2457cb97fa0..c4b65a6d41cc 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -237,7 +237,7 @@ found: if (IS_ERR(inode)) return ERR_CAST(inode); if (inode->i_state & I_NEW) { - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); ent_oi = OP_I(inode); ent_oi->type = ent_type; ent_oi->u = ent_data; @@ -387,7 +387,7 @@ static int openprom_fill_super(struct super_block *s, struct fs_context *fc) goto out_no_root; } - root_inode->i_mtime = root_inode->i_atime = inode_set_ctime_current(root_inode); + simple_inode_init_ts(root_inode); root_inode->i_op = &openprom_inode_operations; root_inode->i_fop = &openprom_operations; root_inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index b711654ca18a..926d9c0a428a 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -103,7 +103,7 @@ enum orangefs_vfs_op_states { #define ORANGEFS_CACHE_CREATE_FLAGS 0 #endif -extern const struct xattr_handler *orangefs_xattr_handlers[]; +extern const struct xattr_handler * const orangefs_xattr_handlers[]; extern struct posix_acl *orangefs_get_acl(struct inode *inode, int type, bool rcu); extern int orangefs_set_acl(struct mnt_idmap *idmap, diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index 0a9fcfdf552f..0fdceb00ca07 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -155,14 +155,14 @@ static inline void copy_attributes_from_inode(struct inode *inode, if (orangefs_inode->attr_valid & ATTR_ATIME) { attrs->mask |= ORANGEFS_ATTR_SYS_ATIME; if (orangefs_inode->attr_valid & ATTR_ATIME_SET) { - attrs->atime = (time64_t)inode->i_atime.tv_sec; + attrs->atime = (time64_t) inode_get_atime_sec(inode); attrs->mask |= ORANGEFS_ATTR_SYS_ATIME_SET; } } if (orangefs_inode->attr_valid & ATTR_MTIME) { attrs->mask |= ORANGEFS_ATTR_SYS_MTIME; if (orangefs_inode->attr_valid & ATTR_MTIME_SET) { - attrs->mtime = (time64_t)inode->i_mtime.tv_sec; + attrs->mtime = (time64_t) inode_get_mtime_sec(inode); attrs->mask |= ORANGEFS_ATTR_SYS_MTIME_SET; } } @@ -357,15 +357,15 @@ again2: downcall.resp.getattr.attributes.owner); inode->i_gid = make_kgid(&init_user_ns, new_op-> downcall.resp.getattr.attributes.group); - inode->i_atime.tv_sec = (time64_t)new_op-> - downcall.resp.getattr.attributes.atime; - inode->i_mtime.tv_sec = (time64_t)new_op-> - downcall.resp.getattr.attributes.mtime; + inode_set_atime(inode, + (time64_t)new_op->downcall.resp.getattr.attributes.atime, + 0); + inode_set_mtime(inode, + (time64_t)new_op->downcall.resp.getattr.attributes.mtime, + 0); inode_set_ctime(inode, (time64_t)new_op->downcall.resp.getattr.attributes.ctime, 0); - inode->i_atime.tv_nsec = 0; - inode->i_mtime.tv_nsec = 0; /* special case: mark the root inode as sticky */ inode->i_mode = type | (is_root_handle(inode) ? S_ISVTX : 0) | diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c index 68b62689a63e..74ef75586f38 100644 --- a/fs/orangefs/xattr.c +++ b/fs/orangefs/xattr.c @@ -554,7 +554,7 @@ static const struct xattr_handler orangefs_xattr_default_handler = { .set = orangefs_xattr_set_default, }; -const struct xattr_handler *orangefs_xattr_handlers[] = { +const struct xattr_handler * const orangefs_xattr_handlers[] = { &orangefs_xattr_default_handler, NULL }; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index bae404a1bad4..ada3fcc9c6d5 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -337,7 +337,7 @@ static int ovl_set_timestamps(struct ovl_fs *ofs, struct dentry *upperdentry, { struct iattr attr = { .ia_valid = - ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET, + ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET | ATTR_CTIME, .ia_atime = stat->atime, .ia_mtime = stat->mtime, }; @@ -618,7 +618,8 @@ static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp) if (err) return err; - if (inode->i_flags & OVL_COPY_I_FLAGS_MASK) { + if (inode->i_flags & OVL_COPY_I_FLAGS_MASK && + (S_ISREG(c->stat.mode) || S_ISDIR(c->stat.mode))) { /* * Copy the fileattr inode flags that are the source of already * copied i_flags diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index c8c8588bd98c..26b782c53910 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -188,7 +188,7 @@ static int ovl_check_encode_origin(struct dentry *dentry) /* Lower file handle for non-upper non-decodable */ if (!ovl_dentry_upper(dentry) && !decodable) - return 0; + return 1; /* Upper file handle for pure upper */ if (!ovl_dentry_lower(dentry)) diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 3b4cc633d763..ec3671ca140c 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -19,7 +19,6 @@ struct ovl_aio_req { struct kiocb iocb; refcount_t ref; struct kiocb *orig_iocb; - struct fd fd; }; static struct kmem_cache *ovl_aio_request_cachep; @@ -240,6 +239,7 @@ static void ovl_file_accessed(struct file *file) { struct inode *inode, *upperinode; struct timespec64 ctime, uctime; + struct timespec64 mtime, umtime; if (file->f_flags & O_NOATIME) return; @@ -252,9 +252,11 @@ static void ovl_file_accessed(struct file *file) ctime = inode_get_ctime(inode); uctime = inode_get_ctime(upperinode); - if ((!timespec64_equal(&inode->i_mtime, &upperinode->i_mtime) || - !timespec64_equal(&ctime, &uctime))) { - inode->i_mtime = upperinode->i_mtime; + mtime = inode_get_mtime(inode); + umtime = inode_get_mtime(upperinode); + if ((!timespec64_equal(&mtime, &umtime)) || + !timespec64_equal(&ctime, &uctime)) { + inode_set_mtime_to_ts(inode, inode_get_mtime(upperinode)); inode_set_ctime_to_ts(inode, uctime); } @@ -280,7 +282,7 @@ static rwf_t ovl_iocb_to_rwf(int ifl) static inline void ovl_aio_put(struct ovl_aio_req *aio_req) { if (refcount_dec_and_test(&aio_req->ref)) { - fdput(aio_req->fd); + fput(aio_req->iocb.ki_filp); kmem_cache_free(ovl_aio_request_cachep, aio_req); } } @@ -342,10 +344,8 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (!aio_req) goto out; - aio_req->fd = real; - real.flags = 0; aio_req->orig_iocb = iocb; - kiocb_clone(&aio_req->iocb, iocb, real.file); + kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); aio_req->iocb.ki_complete = ovl_aio_rw_complete; refcount_set(&aio_req->ref, 2); ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter); @@ -393,6 +393,12 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) if (!ovl_should_sync(OVL_FS(inode->i_sb))) ifl &= ~(IOCB_DSYNC | IOCB_SYNC); + /* + * Overlayfs doesn't support deferred completions, don't copy + * this property in case it is set by the issuer. + */ + ifl &= ~IOCB_DIO_CALLER_COMP; + old_cred = ovl_override_creds(file_inode(file)->i_sb); if (is_sync_kiocb(iocb)) { file_start_write(real.file); @@ -409,10 +415,8 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) if (!aio_req) goto out; - aio_req->fd = real; - real.flags = 0; aio_req->orig_iocb = iocb; - kiocb_clone(&aio_req->iocb, iocb, real.file); + kiocb_clone(&aio_req->iocb, iocb, get_file(real.file)); aio_req->iocb.ki_flags = ifl; aio_req->iocb.ki_complete = ovl_aio_rw_complete; refcount_set(&aio_req->ref, 2); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 83ef66644c21..b6e98a7d36ce 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -704,7 +704,8 @@ int ovl_update_time(struct inode *inode, int flags) if (upperpath.dentry) { touch_atime(&upperpath); - inode->i_atime = d_inode(upperpath.dentry)->i_atime; + inode_set_atime_to_ts(inode, + inode_get_atime(d_inode(upperpath.dentry))); } } return 0; diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index e9539f98e86a..d82d2a043da2 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -8,6 +8,7 @@ struct ovl_config { char *upperdir; char *workdir; + char **lowerdirs; bool default_permissions; int redirect_mode; int verity_mode; @@ -39,17 +40,8 @@ struct ovl_layer { int idx; /* One fsid per unique underlying sb (upper fsid == 0) */ int fsid; - char *name; }; -/* - * ovl_free_fs() relies on @mnt being the first member when unmounting - * the private mounts created for each layer. Let's check both the - * offset and type. - */ -static_assert(offsetof(struct ovl_layer, mnt) == 0); -static_assert(__same_type(typeof_member(struct ovl_layer, mnt), struct vfsmount *)); - struct ovl_path { const struct ovl_layer *layer; struct dentry *dentry; diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index b9355bb6d75a..f6ff23fd101c 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -157,6 +157,34 @@ const struct fs_parameter_spec ovl_parameter_spec[] = { {} }; +static char *ovl_next_opt(char **s) +{ + char *sbegin = *s; + char *p; + + if (sbegin == NULL) + return NULL; + + for (p = sbegin; *p; p++) { + if (*p == '\\') { + p++; + if (!*p) + break; + } else if (*p == ',') { + *p = '\0'; + *s = p + 1; + return sbegin; + } + } + *s = NULL; + return sbegin; +} + +static int ovl_parse_monolithic(struct fs_context *fc, void *data) +{ + return vfs_parse_monolithic_sep(fc, data, ovl_next_opt); +} + static ssize_t ovl_parse_param_split_lowerdirs(char *str) { ssize_t nr_layers = 1, nr_colons = 0; @@ -164,7 +192,8 @@ static ssize_t ovl_parse_param_split_lowerdirs(char *str) for (s = d = str;; s++, d++) { if (*s == '\\') { - s++; + /* keep esc chars in split lowerdir */ + *d++ = *s++; } else if (*s == ':') { bool next_colon = (*(s + 1) == ':'); @@ -239,7 +268,7 @@ static void ovl_unescape(char *s) } } -static int ovl_mount_dir(const char *name, struct path *path) +static int ovl_mount_dir(const char *name, struct path *path, bool upper) { int err = -ENOMEM; char *tmp = kstrdup(name, GFP_KERNEL); @@ -248,7 +277,7 @@ static int ovl_mount_dir(const char *name, struct path *path) ovl_unescape(tmp); err = ovl_mount_dir_noesc(tmp, path); - if (!err && path->dentry->d_flags & DCACHE_OP_REAL) { + if (!err && upper && path->dentry->d_flags & DCACHE_OP_REAL) { pr_err("filesystem on '%s' not supported as upperdir\n", tmp); path_put_init(path); @@ -269,7 +298,7 @@ static int ovl_parse_param_upperdir(const char *name, struct fs_context *fc, struct path path; char *dup; - err = ovl_mount_dir(name, &path); + err = ovl_mount_dir(name, &path, true); if (err) return err; @@ -321,12 +350,6 @@ static void ovl_parse_param_drop_lowerdir(struct ovl_fs_context *ctx) * Set "/lower1", "/lower2", and "/lower3" as lower layers and * "/data1" and "/data2" as data lower layers. Any existing lower * layers are replaced. - * (2) lowerdir=:/lower4 - * Append "/lower4" to current stack of lower layers. This requires - * that there already is at least one lower layer configured. - * (3) lowerdir=::/lower5 - * Append data "/lower5" as data lower layer. This requires that - * there's at least one regular lower layer present. */ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) { @@ -348,49 +371,9 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) return 0; } - if (strncmp(name, "::", 2) == 0) { - /* - * This is a data layer. - * There must be at least one regular lower layer - * specified. - */ - if (ctx->nr == 0) { - pr_err("data lower layers without regular lower layers not allowed"); - return -EINVAL; - } - - /* Skip the leading "::". */ - name += 2; - data_layer = true; - /* - * A data layer is automatically an append as there - * must've been at least one regular lower layer. - */ - append = true; - } else if (*name == ':') { - /* - * This is a regular lower layer. - * If users want to append a layer enforce that they - * have already specified a first layer before. It's - * better to be strict. - */ - if (ctx->nr == 0) { - pr_err("cannot append layer if no previous layer has been specified"); - return -EINVAL; - } - - /* - * Once a sequence of data layers has started regular - * lower layers are forbidden. - */ - if (ctx->nr_data > 0) { - pr_err("regular lower layers cannot follow data lower layers"); - return -EINVAL; - } - - /* Skip the leading ":". */ - name++; - append = true; + if (*name == ':') { + pr_err("cannot append lower layer"); + return -EINVAL; } dup = kstrdup(name, GFP_KERNEL); @@ -472,7 +455,7 @@ static int ovl_parse_param_lowerdir(const char *name, struct fs_context *fc) l = &ctx->lower[nr]; memset(l, 0, sizeof(*l)); - err = ovl_mount_dir_noesc(dup_iter, &l->path); + err = ovl_mount_dir(dup_iter, &l->path, false); if (err) goto out_put; @@ -682,6 +665,7 @@ static int ovl_reconfigure(struct fs_context *fc) } static const struct fs_context_operations ovl_context_ops = { + .parse_monolithic = ovl_parse_monolithic, .parse_param = ovl_parse_param, .get_tree = ovl_get_tree, .reconfigure = ovl_reconfigure, @@ -752,12 +736,12 @@ void ovl_free_fs(struct ovl_fs *ofs) if (ofs->upperdir_locked) ovl_inuse_unlock(ovl_upper_mnt(ofs)->mnt_root); - /* Hack! Reuse ofs->layers as a vfsmount array before freeing it */ - mounts = (struct vfsmount **) ofs->layers; + /* Reuse ofs->config.lowerdirs as a vfsmount array before freeing it */ + mounts = (struct vfsmount **) ofs->config.lowerdirs; for (i = 0; i < ofs->numlayer; i++) { iput(ofs->layers[i].trap); + kfree(ofs->config.lowerdirs[i]); mounts[i] = ofs->layers[i].mnt; - kfree(ofs->layers[i].name); } kern_unmount_array(mounts, ofs->numlayer); kfree(ofs->layers); @@ -765,6 +749,7 @@ void ovl_free_fs(struct ovl_fs *ofs) free_anon_bdev(ofs->fs[i].pseudo_dev); kfree(ofs->fs); + kfree(ofs->config.lowerdirs); kfree(ofs->config.upperdir); kfree(ofs->config.workdir); if (ofs->creator_cred) @@ -949,16 +934,23 @@ int ovl_show_options(struct seq_file *m, struct dentry *dentry) struct super_block *sb = dentry->d_sb; struct ovl_fs *ofs = OVL_FS(sb); size_t nr, nr_merged_lower = ofs->numlayer - ofs->numdatalayer; - const struct ovl_layer *data_layers = &ofs->layers[nr_merged_lower]; - - /* ofs->layers[0] is the upper layer */ - seq_printf(m, ",lowerdir=%s", ofs->layers[1].name); - /* dump regular lower layers */ - for (nr = 2; nr < nr_merged_lower; nr++) - seq_printf(m, ":%s", ofs->layers[nr].name); - /* dump data lower layers */ - for (nr = 0; nr < ofs->numdatalayer; nr++) - seq_printf(m, "::%s", data_layers[nr].name); + + /* + * lowerdirs[] starts from offset 1, then + * >= 0 regular lower layers prefixed with : and + * >= 0 data-only lower layers prefixed with :: + * + * we need to escase comma and space like seq_show_option() does and + * we also need to escape the colon separator from lowerdir paths. + */ + seq_puts(m, ",lowerdir="); + for (nr = 1; nr < ofs->numlayer; nr++) { + if (nr > 1) + seq_putc(m, ':'); + if (nr >= nr_merged_lower) + seq_putc(m, ':'); + seq_escape(m, ofs->config.lowerdirs[nr], ":, \t\n\\"); + } if (ofs->config.upperdir) { seq_show_option(m, "upperdir", ofs->config.upperdir); seq_show_option(m, "workdir", ofs->config.workdir); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index def266b5e2a3..6cd949c59fed 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -34,14 +34,22 @@ static struct dentry *ovl_d_real(struct dentry *dentry, struct dentry *real = NULL, *lower; int err; - /* It's an overlay file */ + /* + * vfs is only expected to call d_real() with NULL from d_real_inode() + * and with overlay inode from file_dentry() on an overlay file. + * + * TODO: remove @inode argument from d_real() API, remove code in this + * function that deals with non-NULL @inode and remove d_real() call + * from file_dentry(). + */ if (inode && d_inode(dentry) == inode) return dentry; + else if (inode) + goto bug; if (!d_is_reg(dentry)) { - if (!inode || inode == d_inode(dentry)) - return dentry; - goto bug; + /* d_real_inode() is only relevant for regular files */ + return dentry; } real = ovl_dentry_upper(dentry); @@ -104,8 +112,8 @@ static int ovl_revalidate_real(struct dentry *d, unsigned int flags, bool weak) static int ovl_dentry_revalidate_common(struct dentry *dentry, unsigned int flags, bool weak) { - struct ovl_entry *oe = OVL_E(dentry); - struct ovl_path *lowerstack = ovl_lowerstack(oe); + struct ovl_entry *oe; + struct ovl_path *lowerstack; struct inode *inode = d_inode_rcu(dentry); struct dentry *upper; unsigned int i; @@ -115,6 +123,8 @@ static int ovl_dentry_revalidate_common(struct dentry *dentry, if (!inode) return -ECHILD; + oe = OVL_I_E(inode); + lowerstack = ovl_lowerstack(oe); upper = ovl_i_dentry_upper(inode); if (upper) ret = ovl_revalidate_real(upper, flags, weak); @@ -167,6 +177,7 @@ static void ovl_free_inode(struct inode *inode) struct ovl_inode *oi = OVL_I(inode); kfree(oi->redirect); + kfree(oi->oe); mutex_destroy(&oi->lock); kmem_cache_free(ovl_inode_cachep, oi); } @@ -176,7 +187,7 @@ static void ovl_destroy_inode(struct inode *inode) struct ovl_inode *oi = OVL_I(inode); dput(oi->__upperdentry); - ovl_free_entry(oi->oe); + ovl_stack_put(ovl_lowerstack(oi->oe), ovl_numlower(oi->oe)); if (S_ISDIR(inode->i_mode)) ovl_dir_cache_free(inode); else @@ -484,13 +495,13 @@ static const struct xattr_handler ovl_other_xattr_handler = { .set = ovl_other_xattr_set, }; -static const struct xattr_handler *ovl_trusted_xattr_handlers[] = { +static const struct xattr_handler * const ovl_trusted_xattr_handlers[] = { &ovl_own_trusted_xattr_handler, &ovl_other_xattr_handler, NULL }; -static const struct xattr_handler *ovl_user_xattr_handlers[] = { +static const struct xattr_handler * const ovl_user_xattr_handlers[] = { &ovl_own_user_xattr_handler, &ovl_other_xattr_handler, NULL @@ -569,11 +580,6 @@ static int ovl_get_upper(struct super_block *sb, struct ovl_fs *ofs, upper_layer->idx = 0; upper_layer->fsid = 0; - err = -ENOMEM; - upper_layer->name = kstrdup(ofs->config.upperdir, GFP_KERNEL); - if (!upper_layer->name) - goto out; - /* * Inherit SB_NOSEC flag from upperdir. * @@ -1122,7 +1128,8 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, layers[ofs->numlayer].idx = ofs->numlayer; layers[ofs->numlayer].fsid = fsid; layers[ofs->numlayer].fs = &ofs->fs[fsid]; - layers[ofs->numlayer].name = l->name; + /* Store for printing lowerdir=... in ovl_show_options() */ + ofs->config.lowerdirs[ofs->numlayer] = l->name; l->name = NULL; ofs->numlayer++; ofs->fs[fsid].is_lower = true; @@ -1367,8 +1374,16 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) if (!layers) goto out_err; + ofs->config.lowerdirs = kcalloc(ctx->nr + 1, sizeof(char *), GFP_KERNEL); + if (!ofs->config.lowerdirs) { + kfree(layers); + goto out_err; + } ofs->layers = layers; - /* Layer 0 is reserved for upper even if there's no upper */ + /* + * Layer 0 is reserved for upper even if there's no upper. + * For consistency, config.lowerdirs[0] is NULL. + */ ofs->numlayer = 1; sb->s_stack_depth = 0; @@ -1481,8 +1496,16 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_xattr = ofs->config.userxattr ? ovl_user_xattr_handlers : ovl_trusted_xattr_handlers; sb->s_fs_info = ofs; +#ifdef CONFIG_FS_POSIX_ACL sb->s_flags |= SB_POSIXACL; +#endif sb->s_iflags |= SB_I_SKIP_SYNC | SB_I_IMA_UNVERIFIABLE_SIGNATURE; + /* + * Ensure that umask handling is done by the filesystems used + * for the the upper layer instead of overlayfs as that would + * lead to unexpected results. + */ + sb->s_iflags |= SB_I_NOUMASK; err = -ENOMEM; root_dentry = ovl_get_root(sb, ctx->upper.dentry, oe); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 89e0d60d35b6..868afd8834c3 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -1409,8 +1409,8 @@ void ovl_copyattr(struct inode *inode) inode->i_uid = vfsuid_into_kuid(vfsuid); inode->i_gid = vfsgid_into_kgid(vfsgid); inode->i_mode = realinode->i_mode; - inode->i_atime = realinode->i_atime; - inode->i_mtime = realinode->i_mtime; + inode_set_atime_to_ts(inode, inode_get_atime(realinode)); + inode_set_mtime_to_ts(inode, inode_get_mtime(realinode)); inode_set_ctime_to_ts(inode, inode_get_ctime(realinode)); i_size_write(inode, i_size_read(realinode)); } diff --git a/fs/pipe.c b/fs/pipe.c index 6c1a9b1db907..8916c455a469 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -227,6 +227,36 @@ static inline bool pipe_readable(const struct pipe_inode_info *pipe) return !pipe_empty(head, tail) || !writers; } +static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, + unsigned int tail) +{ + pipe_buf_release(pipe, buf); + + /* + * If the pipe has a watch_queue, we need additional protection + * by the spinlock because notifications get posted with only + * this spinlock, no mutex + */ + if (pipe_has_watch_queue(pipe)) { + spin_lock_irq(&pipe->rd_wait.lock); +#ifdef CONFIG_WATCH_QUEUE + if (buf->flags & PIPE_BUF_FLAG_LOSS) + pipe->note_loss = true; +#endif + pipe->tail = ++tail; + spin_unlock_irq(&pipe->rd_wait.lock); + return tail; + } + + /* + * Without a watch_queue, we can simply increment the tail + * without the spinlock - the mutex is enough. + */ + pipe->tail = ++tail; + return tail; +} + static ssize_t pipe_read(struct kiocb *iocb, struct iov_iter *to) { @@ -320,17 +350,8 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) buf->len = 0; } - if (!buf->len) { - pipe_buf_release(pipe, buf); - spin_lock_irq(&pipe->rd_wait.lock); -#ifdef CONFIG_WATCH_QUEUE - if (buf->flags & PIPE_BUF_FLAG_LOSS) - pipe->note_loss = true; -#endif - tail++; - pipe->tail = tail; - spin_unlock_irq(&pipe->rd_wait.lock); - } + if (!buf->len) + tail = pipe_update_tail(pipe, buf, tail); total_len -= chars; if (!total_len) break; /* common path: read succeeded */ @@ -437,12 +458,10 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) goto out; } -#ifdef CONFIG_WATCH_QUEUE - if (pipe->watch_queue) { + if (pipe_has_watch_queue(pipe)) { ret = -EXDEV; goto out; } -#endif /* * If it wasn't empty we try to merge new data into @@ -507,16 +526,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) * it, either the reader will consume it or it'll still * be there for the next write. */ - spin_lock_irq(&pipe->rd_wait.lock); - - head = pipe->head; - if (pipe_full(head, pipe->tail, pipe->max_usage)) { - spin_unlock_irq(&pipe->rd_wait.lock); - continue; - } - pipe->head = head + 1; - spin_unlock_irq(&pipe->rd_wait.lock); /* Insert it into the buffer array */ buf = &pipe->bufs[head & mask]; @@ -537,7 +547,6 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) break; } ret += copied; - buf->offset = 0; buf->len = copied; if (!iov_iter_count(from)) @@ -899,7 +908,7 @@ static struct inode * get_pipe_inode(void) inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); return inode; @@ -1325,10 +1334,8 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg) unsigned int nr_slots, size; long ret = 0; -#ifdef CONFIG_WATCH_QUEUE - if (pipe->watch_queue) + if (pipe_has_watch_queue(pipe)) return -EBUSY; -#endif size = round_pipe_size(arg); nr_slots = size >> PAGE_SHIFT; @@ -1380,10 +1387,8 @@ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice) if (file->f_op != &pipefifo_fops || !pipe) return NULL; -#ifdef CONFIG_WATCH_QUEUE - if (for_splice && pipe->watch_queue) + if (for_splice && pipe_has_watch_queue(pipe)) return NULL; -#endif return pipe; } diff --git a/fs/proc/base.c b/fs/proc/base.c index ffd54617c354..83396ab14998 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1902,7 +1902,7 @@ struct inode *proc_pid_make_inode(struct super_block *sb, ei = PROC_I(inode); inode->i_mode = mode; inode->i_ino = get_next_ino(); - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); inode->i_op = &proc_def_inode_operations; /* @@ -2218,7 +2218,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path) rc = -ENOENT; vma = find_exact_vma(mm, vm_start, vm_end); if (vma && vma->vm_file) { - *path = vma->vm_file->f_path; + *path = *file_user_path(vma->vm_file); path_get(path); rc = 0; } diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 6276b3938842..6e72e5ad42bc 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -113,10 +113,12 @@ static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode) struct file *file; rcu_read_lock(); - file = task_lookup_fd_rcu(task, fd); - if (file) - *mode = file->f_mode; + file = task_lookup_fdget_rcu(task, fd); rcu_read_unlock(); + if (file) { + *mode = file->f_mode; + fput(file); + } return !!file; } @@ -259,12 +261,13 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, char name[10 + 1]; unsigned int len; - f = task_lookup_next_fd_rcu(p, &fd); + f = task_lookup_next_fdget_rcu(p, &fd); ctx->pos = fd + 2LL; if (!f) break; data.mode = f->f_mode; rcu_read_unlock(); + fput(f); data.fd = fd; len = snprintf(name, sizeof(name), "%u", fd); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 532dc9d240f7..592ed2516f47 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -660,7 +660,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) inode->i_private = de->data; inode->i_ino = de->low_ino; - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); PROC_I(inode)->pde = de; if (is_empty_pde(de)) { make_empty_dir_inode(inode); diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 9dda7e54b2d0..9a8f32f21ff5 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -289,9 +289,7 @@ struct proc_maps_private { struct inode *inode; struct task_struct *task; struct mm_struct *mm; -#ifdef CONFIG_MMU struct vma_iterator iter; -#endif #ifdef CONFIG_NUMA struct mempolicy *task_mempolicy; #endif diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 4d3493579458..c6e7ebc63756 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -58,7 +58,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) if (file) { seq_pad(m, ' '); - seq_file_path(m, file, ""); + seq_path(m, file_user_path(file), ""); } seq_putc(m, '\n'); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index c88854df0b62..bc9a2db89cfa 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -465,7 +465,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, head->count++; spin_unlock(&sysctl_lock); - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); inode->i_mode = table->mode; if (!S_ISDIR(table->mode)) { inode->i_mode |= S_IFREG; diff --git a/fs/proc/self.c b/fs/proc/self.c index ecc4da8d265e..b46fbfd22681 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -46,7 +46,7 @@ int proc_setup_self(struct super_block *s) struct inode *inode = new_inode(s); if (inode) { inode->i_ino = self_inum; - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3dd5be96691b..1593940ca01e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -296,7 +296,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) if (anon_name) seq_printf(m, "[anon_shmem:%s]", anon_name->name); else - seq_file_path(m, file, "\n"); + seq_path(m, file_user_path(file), "\n"); goto done; } @@ -1967,7 +1967,7 @@ static int show_numa_map(struct seq_file *m, void *v) if (file) { seq_puts(m, " file="); - seq_file_path(m, file, "\n\t= "); + seq_path(m, file_user_path(file), "\n\t= "); } else if (vma_is_initial_heap(vma)) { seq_puts(m, " heap"); } else if (vma_is_initial_stack(vma)) { diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index a8ac0dd8041e..bce674533000 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -157,7 +157,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) if (file) { seq_pad(m, ' '); - seq_file_path(m, file, ""); + seq_path(m, file_user_path(file), ""); } else if (mm && vma_is_initial_stack(vma)) { seq_pad(m, ' '); seq_puts(m, "[stack]"); @@ -175,15 +175,28 @@ static int show_map(struct seq_file *m, void *_p) return nommu_vma_show(m, _p); } -static void *m_start(struct seq_file *m, loff_t *pos) +static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, + loff_t *ppos) +{ + struct vm_area_struct *vma = vma_next(&priv->iter); + + if (vma) { + *ppos = vma->vm_start; + } else { + *ppos = -1UL; + } + + return vma; +} + +static void *m_start(struct seq_file *m, loff_t *ppos) { struct proc_maps_private *priv = m->private; + unsigned long last_addr = *ppos; struct mm_struct *mm; - struct vm_area_struct *vma; - unsigned long addr = *pos; - /* See m_next(). Zero at the start or after lseek. */ - if (addr == -1UL) + /* See proc_get_vma(). Zero at the start or after lseek. */ + if (last_addr == -1UL) return NULL; /* pin the task and mm whilst we play with them */ @@ -192,44 +205,41 @@ static void *m_start(struct seq_file *m, loff_t *pos) return ERR_PTR(-ESRCH); mm = priv->mm; - if (!mm || !mmget_not_zero(mm)) + if (!mm || !mmget_not_zero(mm)) { + put_task_struct(priv->task); + priv->task = NULL; return NULL; + } if (mmap_read_lock_killable(mm)) { mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; return ERR_PTR(-EINTR); } - /* start the next element from addr */ - vma = find_vma(mm, addr); - if (vma) - return vma; + vma_iter_init(&priv->iter, mm, last_addr); - mmap_read_unlock(mm); - mmput(mm); - return NULL; + return proc_get_vma(priv, ppos); } -static void m_stop(struct seq_file *m, void *_vml) +static void m_stop(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; + struct mm_struct *mm = priv->mm; - if (!IS_ERR_OR_NULL(_vml)) { - mmap_read_unlock(priv->mm); - mmput(priv->mm); - } - if (priv->task) { - put_task_struct(priv->task); - priv->task = NULL; - } + if (!priv->task) + return; + + mmap_read_unlock(mm); + mmput(mm); + put_task_struct(priv->task); + priv->task = NULL; } -static void *m_next(struct seq_file *m, void *_p, loff_t *pos) +static void *m_next(struct seq_file *m, void *_p, loff_t *ppos) { - struct vm_area_struct *vma = _p; - - *pos = vma->vm_end; - return find_vma(vma->vm_mm, vma->vm_end); + return proc_get_vma(m->private, ppos); } static const struct seq_operations proc_pid_maps_ops = { diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index 63ac1f93289f..0e5050d6ab64 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -46,7 +46,7 @@ int proc_setup_thread_self(struct super_block *s) struct inode *inode = new_inode(s); if (inode) { inode->i_ino = thread_self_inum; - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 585360706b33..d41c20d1b5e8 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -223,7 +223,7 @@ static struct inode *pstore_get_inode(struct super_block *sb) struct inode *inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); } return inode; } @@ -390,7 +390,8 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) inode->i_private = private; if (record->time.tv_sec) - inode->i_mtime = inode_set_ctime_to_ts(inode, record->time); + inode_set_mtime_to_ts(inode, + inode_set_ctime_to_ts(inode, record->time)); d_add(dentry, inode); diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index a7171f5532a1..6eb9bb369b57 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -301,10 +301,8 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino) i_gid_write(inode, (gid_t)le16_to_cpu(raw_inode->di_gid)); set_nlink(inode, le16_to_cpu(raw_inode->di_nlink)); inode->i_size = le32_to_cpu(raw_inode->di_size); - inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->di_mtime); - inode->i_mtime.tv_nsec = 0; - inode->i_atime.tv_sec = le32_to_cpu(raw_inode->di_atime); - inode->i_atime.tv_nsec = 0; + inode_set_mtime(inode, le32_to_cpu(raw_inode->di_mtime), 0); + inode_set_atime(inode, le32_to_cpu(raw_inode->di_atime), 0); inode_set_ctime(inode, le32_to_cpu(raw_inode->di_ctime), 0); inode->i_blocks = le32_to_cpu(raw_inode->di_first_xtnt.xtnt_size); diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 21f90d519f1a..a286c545717f 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -558,10 +558,8 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino) i_uid_write(inode, (uid_t)fs32_to_cpu(sbi, raw_inode->di_uid)); i_gid_write(inode, (gid_t)fs32_to_cpu(sbi, raw_inode->di_gid)); inode->i_size = fs64_to_cpu(sbi, raw_inode->di_size); - inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->di_mtime); - inode->i_mtime.tv_nsec = 0; - inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->di_atime); - inode->i_atime.tv_nsec = 0; + inode_set_mtime(inode, fs32_to_cpu(sbi, raw_inode->di_mtime), 0); + inode_set_atime(inode, fs32_to_cpu(sbi, raw_inode->di_atime), 0); inode_set_ctime(inode, fs32_to_cpu(sbi, raw_inode->di_ctime), 0); /* calc blocks based on 512 byte blocksize */ diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 9e72bfe8bbad..31e897ad5e6a 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -233,19 +233,18 @@ static void put_quota_format(struct quota_format_type *fmt) * All dquots are placed to the end of inuse_list when first created, and this * list is used for invalidate operation, which must look at every dquot. * - * When the last reference of a dquot will be dropped, the dquot will be - * added to releasing_dquots. We'd then queue work item which would call + * When the last reference of a dquot is dropped, the dquot is added to + * releasing_dquots. We'll then queue work item which will call * synchronize_srcu() and after that perform the final cleanup of all the - * dquots on the list. Both releasing_dquots and free_dquots use the - * dq_free list_head in the dquot struct. When a dquot is removed from - * releasing_dquots, a reference count is always subtracted, and if - * dq_count == 0 at that point, the dquot will be added to the free_dquots. + * dquots on the list. Each cleaned up dquot is moved to free_dquots list. + * Both releasing_dquots and free_dquots use the dq_free list_head in the dquot + * struct. * - * Unused dquots (dq_count == 0) are added to the free_dquots list when freed, - * and this list is searched whenever we need an available dquot. Dquots are - * removed from the list as soon as they are used again, and - * dqstats.free_dquots gives the number of dquots on the list. When - * dquot is invalidated it's completely released from memory. + * Unused and cleaned up dquots are in the free_dquots list and this list is + * searched whenever we need an available dquot. Dquots are removed from the + * list as soon as they are used again and dqstats.free_dquots gives the number + * of dquots on the list. When dquot is invalidated it's completely released + * from memory. * * Dirty dquots are added to the dqi_dirty_list of quota_info when mark * dirtied, and this list is searched when writing dirty dquots back to @@ -321,6 +320,7 @@ static inline void put_dquot_last(struct dquot *dquot) static inline void put_releasing_dquots(struct dquot *dquot) { list_add_tail(&dquot->dq_free, &releasing_dquots); + set_bit(DQ_RELEASING_B, &dquot->dq_flags); } static inline void remove_free_dquot(struct dquot *dquot) @@ -328,8 +328,10 @@ static inline void remove_free_dquot(struct dquot *dquot) if (list_empty(&dquot->dq_free)) return; list_del_init(&dquot->dq_free); - if (!atomic_read(&dquot->dq_count)) + if (!test_bit(DQ_RELEASING_B, &dquot->dq_flags)) dqstats_dec(DQST_FREE_DQUOTS); + else + clear_bit(DQ_RELEASING_B, &dquot->dq_flags); } static inline void put_inuse(struct dquot *dquot) @@ -581,12 +583,6 @@ restart: continue; /* Wait for dquot users */ if (atomic_read(&dquot->dq_count)) { - /* dquot in releasing_dquots, flush and retry */ - if (!list_empty(&dquot->dq_free)) { - spin_unlock(&dq_list_lock); - goto restart; - } - atomic_inc(&dquot->dq_count); spin_unlock(&dq_list_lock); /* @@ -606,6 +602,15 @@ restart: goto restart; } /* + * The last user already dropped its reference but dquot didn't + * get fully cleaned up yet. Restart the scan which flushes the + * work cleaning up released dquots. + */ + if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) { + spin_unlock(&dq_list_lock); + goto restart; + } + /* * Quota now has no users and it has been written on last * dqput() */ @@ -696,6 +701,13 @@ int dquot_writeback_dquots(struct super_block *sb, int type) dq_dirty); WARN_ON(!dquot_active(dquot)); + /* If the dquot is releasing we should not touch it */ + if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) { + spin_unlock(&dq_list_lock); + flush_delayed_work("a_release_work); + spin_lock(&dq_list_lock); + continue; + } /* Now we have active dquot from which someone is * holding reference so we can safely just increase @@ -809,18 +821,18 @@ static void quota_release_workfn(struct work_struct *work) /* Exchange the list head to avoid livelock. */ list_replace_init(&releasing_dquots, &rls_head); spin_unlock(&dq_list_lock); + synchronize_srcu(&dquot_srcu); restart: - synchronize_srcu(&dquot_srcu); spin_lock(&dq_list_lock); while (!list_empty(&rls_head)) { dquot = list_first_entry(&rls_head, struct dquot, dq_free); - /* Dquot got used again? */ - if (atomic_read(&dquot->dq_count) > 1) { - remove_free_dquot(dquot); - atomic_dec(&dquot->dq_count); - continue; - } + WARN_ON_ONCE(atomic_read(&dquot->dq_count)); + /* + * Note that DQ_RELEASING_B protects us from racing with + * invalidate_dquots() calls so we are safe to work with the + * dquot even after we drop dq_list_lock. + */ if (dquot_dirty(dquot)) { spin_unlock(&dq_list_lock); /* Commit dquot before releasing */ @@ -834,7 +846,6 @@ restart: } /* Dquot is inactive and clean, now move it to free list */ remove_free_dquot(dquot); - atomic_dec(&dquot->dq_count); put_dquot_last(dquot); } spin_unlock(&dq_list_lock); @@ -875,6 +886,7 @@ void dqput(struct dquot *dquot) BUG_ON(!list_empty(&dquot->dq_free)); #endif put_releasing_dquots(dquot); + atomic_dec(&dquot->dq_count); spin_unlock(&dq_list_lock); queue_delayed_work(system_unbound_wq, "a_release_work, 1); } @@ -963,7 +975,7 @@ we_slept: dqstats_inc(DQST_LOOKUPS); } /* Wait for dq_lock - after this we know that either dquot_release() is - * already finished or it will be canceled due to dq_count > 1 test */ + * already finished or it will be canceled due to dq_count > 0 test */ wait_on_dquot(dquot); /* Read the dquot / allocate space in quota file */ if (!dquot_active(dquot)) { diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 18e8387cab41..4ac05a9e25bc 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -65,7 +65,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, inode->i_mapping->a_ops = &ram_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); @@ -105,7 +105,7 @@ ramfs_mknod(struct mnt_idmap *idmap, struct inode *dir, d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ error = 0; - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); } return error; } @@ -138,7 +138,8 @@ static int ramfs_symlink(struct mnt_idmap *idmap, struct inode *dir, if (!error) { d_instantiate(dentry, inode); dget(dentry); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, + inode_set_ctime_current(dir)); } else iput(inode); } diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 86e55d4bb10d..c8572346556f 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1257,11 +1257,9 @@ static void init_inode(struct inode *inode, struct treepath *path) i_uid_write(inode, sd_v1_uid(sd)); i_gid_write(inode, sd_v1_gid(sd)); inode->i_size = sd_v1_size(sd); - inode->i_atime.tv_sec = sd_v1_atime(sd); - inode->i_mtime.tv_sec = sd_v1_mtime(sd); + inode_set_atime(inode, sd_v1_atime(sd), 0); + inode_set_mtime(inode, sd_v1_mtime(sd), 0); inode_set_ctime(inode, sd_v1_ctime(sd), 0); - inode->i_atime.tv_nsec = 0; - inode->i_mtime.tv_nsec = 0; inode->i_blocks = sd_v1_blocks(sd); inode->i_generation = le32_to_cpu(INODE_PKEY(inode)->k_dir_id); @@ -1311,11 +1309,9 @@ static void init_inode(struct inode *inode, struct treepath *path) i_uid_write(inode, sd_v2_uid(sd)); inode->i_size = sd_v2_size(sd); i_gid_write(inode, sd_v2_gid(sd)); - inode->i_mtime.tv_sec = sd_v2_mtime(sd); - inode->i_atime.tv_sec = sd_v2_atime(sd); + inode_set_mtime(inode, sd_v2_mtime(sd), 0); + inode_set_atime(inode, sd_v2_atime(sd), 0); inode_set_ctime(inode, sd_v2_ctime(sd), 0); - inode->i_mtime.tv_nsec = 0; - inode->i_atime.tv_nsec = 0; inode->i_blocks = sd_v2_blocks(sd); rdev = sd_v2_rdev(sd); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) @@ -1370,9 +1366,9 @@ static void inode2sd(void *sd, struct inode *inode, loff_t size) set_sd_v2_uid(sd_v2, i_uid_read(inode)); set_sd_v2_size(sd_v2, size); set_sd_v2_gid(sd_v2, i_gid_read(inode)); - set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec); - set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec); - set_sd_v2_ctime(sd_v2, inode_get_ctime(inode).tv_sec); + set_sd_v2_mtime(sd_v2, inode_get_mtime_sec(inode)); + set_sd_v2_atime(sd_v2, inode_get_atime_sec(inode)); + set_sd_v2_ctime(sd_v2, inode_get_ctime_sec(inode)); set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE)); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev)); @@ -1391,9 +1387,9 @@ static void inode2sd_v1(void *sd, struct inode *inode, loff_t size) set_sd_v1_gid(sd_v1, i_gid_read(inode)); set_sd_v1_nlink(sd_v1, inode->i_nlink); set_sd_v1_size(sd_v1, size); - set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec); - set_sd_v1_ctime(sd_v1, inode_get_ctime(inode).tv_sec); - set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec); + set_sd_v1_atime(sd_v1, inode_get_atime_sec(inode)); + set_sd_v1_ctime(sd_v1, inode_get_ctime_sec(inode)); + set_sd_v1_mtime(sd_v1, inode_get_mtime_sec(inode)); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev)); @@ -1984,7 +1980,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, /* uid and gid must already be set by the caller for quota init */ - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); inode->i_size = i_size; inode->i_blocks = 0; inode->i_bytes = 0; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 015bfe4e4524..171c912af50f 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -90,8 +90,7 @@ static int flush_commit_list(struct super_block *s, static int can_dirty(struct reiserfs_journal_cnode *cn); static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *sb); -static void release_journal_dev(struct super_block *super, - struct reiserfs_journal *journal); +static void release_journal_dev(struct reiserfs_journal *journal); static void dirty_one_transaction(struct super_block *s, struct reiserfs_journal_list *jl); static void flush_async_commits(struct work_struct *work); @@ -1893,7 +1892,7 @@ static void free_journal_ram(struct super_block *sb) * j_header_bh is on the journal dev, make sure * not to release the journal dev until we brelse j_header_bh */ - release_journal_dev(sb, journal); + release_journal_dev(journal); vfree(journal); } @@ -2387,7 +2386,7 @@ static int journal_read(struct super_block *sb) cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb); reiserfs_info(sb, "checking transaction log (%pg)\n", - journal->j_dev_bd); + journal->j_bdev_handle->bdev); start = ktime_get_seconds(); /* @@ -2448,7 +2447,7 @@ static int journal_read(struct super_block *sb) * device and journal device to be the same */ d_bh = - reiserfs_breada(journal->j_dev_bd, cur_dblock, + reiserfs_breada(journal->j_bdev_handle->bdev, cur_dblock, sb->s_blocksize, SB_ONDISK_JOURNAL_1st_BLOCK(sb) + SB_ONDISK_JOURNAL_SIZE(sb)); @@ -2587,17 +2586,11 @@ static void journal_list_init(struct super_block *sb) SB_JOURNAL(sb)->j_current_jl = alloc_journal_list(sb); } -static void release_journal_dev(struct super_block *super, - struct reiserfs_journal *journal) +static void release_journal_dev(struct reiserfs_journal *journal) { - if (journal->j_dev_bd != NULL) { - void *holder = NULL; - - if (journal->j_dev_bd->bd_dev != super->s_dev) - holder = journal; - - blkdev_put(journal->j_dev_bd, holder); - journal->j_dev_bd = NULL; + if (journal->j_bdev_handle) { + bdev_release(journal->j_bdev_handle); + journal->j_bdev_handle = NULL; } } @@ -2612,7 +2605,7 @@ static int journal_init_dev(struct super_block *super, result = 0; - journal->j_dev_bd = NULL; + journal->j_bdev_handle = NULL; jdev = SB_ONDISK_JOURNAL_DEVICE(super) ? new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev; @@ -2623,36 +2616,37 @@ static int journal_init_dev(struct super_block *super, if ((!jdev_name || !jdev_name[0])) { if (jdev == super->s_dev) holder = NULL; - journal->j_dev_bd = blkdev_get_by_dev(jdev, blkdev_mode, holder, - NULL); - if (IS_ERR(journal->j_dev_bd)) { - result = PTR_ERR(journal->j_dev_bd); - journal->j_dev_bd = NULL; + journal->j_bdev_handle = bdev_open_by_dev(jdev, blkdev_mode, + holder, NULL); + if (IS_ERR(journal->j_bdev_handle)) { + result = PTR_ERR(journal->j_bdev_handle); + journal->j_bdev_handle = NULL; reiserfs_warning(super, "sh-458", "cannot init journal device unknown-block(%u,%u): %i", MAJOR(jdev), MINOR(jdev), result); return result; } else if (jdev != super->s_dev) - set_blocksize(journal->j_dev_bd, super->s_blocksize); + set_blocksize(journal->j_bdev_handle->bdev, + super->s_blocksize); return 0; } - journal->j_dev_bd = blkdev_get_by_path(jdev_name, blkdev_mode, holder, - NULL); - if (IS_ERR(journal->j_dev_bd)) { - result = PTR_ERR(journal->j_dev_bd); - journal->j_dev_bd = NULL; + journal->j_bdev_handle = bdev_open_by_path(jdev_name, blkdev_mode, + holder, NULL); + if (IS_ERR(journal->j_bdev_handle)) { + result = PTR_ERR(journal->j_bdev_handle); + journal->j_bdev_handle = NULL; reiserfs_warning(super, "sh-457", "journal_init_dev: Cannot open '%s': %i", jdev_name, result); return result; } - set_blocksize(journal->j_dev_bd, super->s_blocksize); + set_blocksize(journal->j_bdev_handle->bdev, super->s_blocksize); reiserfs_info(super, "journal_init_dev: journal device: %pg\n", - journal->j_dev_bd); + journal->j_bdev_handle->bdev); return 0; } @@ -2810,7 +2804,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name, "journal header magic %x (device %pg) does " "not match to magic found in super block %x", jh->jh_journal.jp_journal_magic, - journal->j_dev_bd, + journal->j_bdev_handle->bdev, sb_jp_journal_magic(rs)); brelse(bhjh); goto free_and_return; @@ -2834,7 +2828,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name, reiserfs_info(sb, "journal params: device %pg, size %u, " "journal first block %u, max trans len %u, max batch %u, " "max commit age %u, max trans age %u\n", - journal->j_dev_bd, + journal->j_bdev_handle->bdev, SB_ONDISK_JOURNAL_SIZE(sb), SB_ONDISK_JOURNAL_1st_BLOCK(sb), journal->j_trans_max, diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 9c5704be2435..994d6e6995ab 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -572,7 +572,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, } dir->i_size += paste_size; - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); if (!S_ISDIR(inode->i_mode) && visible) /* reiserfs_mkdir or reiserfs_rename will do that by itself */ reiserfs_update_sd(th, dir); @@ -966,8 +966,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) inode->i_nlink); clear_nlink(inode); - dir->i_mtime = inode_set_ctime_to_ts(dir, - inode_set_ctime_current(inode)); + inode_set_mtime_to_ts(dir, + inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); reiserfs_update_sd(&th, inode); DEC_DIR_INODE_NLINK(dir) @@ -1075,7 +1075,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) reiserfs_update_sd(&th, inode); dir->i_size -= (de.de_entrylen + DEH_SIZE); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); reiserfs_update_sd(&th, dir); if (!savelink) diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 3dba8acf4e83..83cb9402e0f9 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -354,7 +354,7 @@ static int show_journal(struct seq_file *m, void *unused) "prepare: \t%12lu\n" "prepare_retry: \t%12lu\n", DJP(jp_journal_1st_block), - SB_JOURNAL(sb)->j_dev_bd, + SB_JOURNAL(sb)->j_bdev_handle->bdev, DJP(jp_journal_dev), DJP(jp_journal_size), DJP(jp_journal_trans_max), diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index b81749492ef9..725667880e62 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -299,7 +299,7 @@ struct reiserfs_journal { /* oldest journal block. start here for traverse */ struct reiserfs_journal_cnode *j_first; - struct block_device *j_dev_bd; + struct bdev_handle *j_bdev_handle; /* first block on s_dev of reserved area journal */ int j_1st_reserved_block; @@ -1165,7 +1165,7 @@ static inline int bmap_would_wrap(unsigned bmap_nr) return bmap_nr > ((1LL << 16) - 1); } -extern const struct xattr_handler *reiserfs_xattr_handlers[]; +extern const struct xattr_handler * const reiserfs_xattr_handlers[]; /* * this says about version of key of all items (but stat data) the @@ -2699,7 +2699,7 @@ struct reiserfs_iget_args { #define get_journal_desc_magic(bh) (bh->b_data + bh->b_size - 12) #define journal_trans_half(blocksize) \ - ((blocksize - sizeof (struct reiserfs_journal_desc) + sizeof (__u32) - 12) / sizeof (__u32)) + ((blocksize - sizeof(struct reiserfs_journal_desc) - 12) / sizeof(__u32)) /* journal.c see journal.c for all the comments here */ @@ -2711,7 +2711,7 @@ struct reiserfs_journal_desc { __le32 j_len; __le32 j_mount_id; /* mount id of this trans */ - __le32 j_realblock[1]; /* real locations for each block */ + __le32 j_realblock[]; /* real locations for each block */ }; #define get_desc_trans_id(d) le32_to_cpu((d)->j_trans_id) @@ -2726,7 +2726,7 @@ struct reiserfs_journal_desc { struct reiserfs_journal_commit { __le32 j_trans_id; /* must match j_trans_id from the desc block */ __le32 j_len; /* ditto */ - __le32 j_realblock[1]; /* real locations for each block */ + __le32 j_realblock[]; /* real locations for each block */ }; #define get_commit_trans_id(c) le32_to_cpu((c)->j_trans_id) @@ -2809,9 +2809,12 @@ struct reiserfs_journal_header { #define journal_hash(t,sb,block) ((t)[_jhashfn((sb),(block)) & JBH_HASH_MASK]) /* We need these to make journal.c code more readable */ -#define journal_find_get_block(s, block) __find_get_block(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize) -#define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize) -#define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize) +#define journal_find_get_block(s, block) __find_get_block(\ + SB_JOURNAL(s)->j_bdev_handle->bdev, block, s->s_blocksize) +#define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_bdev_handle->bdev,\ + block, s->s_blocksize) +#define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_bdev_handle->bdev,\ + block, s->s_blocksize) enum reiserfs_bh_state_bits { BH_JDirty = BH_PrivateStart, /* buffer is in current transaction */ diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index 3676e02a0232..2138ee7d271d 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -2003,7 +2003,8 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, pathrelse(&s_search_path); if (update_timestamps) { - inode->i_mtime = current_time(inode); + inode_set_mtime_to_ts(inode, + current_time(inode)); inode_set_ctime_current(inode); } reiserfs_update_sd(th, inode); @@ -2028,7 +2029,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, update_and_out: if (update_timestamps) { /* this is truncate, not file closing */ - inode->i_mtime = current_time(inode); + inode_set_mtime_to_ts(inode, current_time(inode)); inode_set_ctime_current(inode); } reiserfs_update_sd(th, inode); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 7eaf36b3de12..67b5510beded 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2587,7 +2587,7 @@ out: return err; if (inode->i_size < off + len - towrite) i_size_write(inode, off + len - towrite); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); return len - towrite; } diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 6000964c2b80..998035a6388e 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -780,7 +780,7 @@ static inline bool reiserfs_posix_acl_list(const char *name, } /* This is the implementation for the xattr plugin infrastructure */ -static inline bool reiserfs_xattr_list(const struct xattr_handler **handlers, +static inline bool reiserfs_xattr_list(const struct xattr_handler * const *handlers, const char *name, struct dentry *dentry) { if (handlers) { @@ -911,7 +911,7 @@ static int create_privroot(struct dentry *dentry) { return 0; } #endif /* Actual operations that are exported to VFS-land */ -const struct xattr_handler *reiserfs_xattr_handlers[] = { +const struct xattr_handler * const reiserfs_xattr_handlers[] = { #ifdef CONFIG_REISERFS_FS_XATTR &reiserfs_xattr_user_handler, &reiserfs_xattr_trusted_handler, diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 5c35f6c76037..545ad44f96b8 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -322,7 +322,8 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) set_nlink(i, 1); /* Hard to decide.. */ i->i_size = be32_to_cpu(ri.size); - i->i_mtime = i->i_atime = inode_set_ctime(i, 0, 0); + inode_set_mtime_to_ts(i, + inode_set_atime_to_ts(i, inode_set_ctime(i, 0, 0))); /* set up mode and ops */ mode = romfs_modemap[nextfh & ROMFH_TYPE]; @@ -593,7 +594,7 @@ static void romfs_kill_sb(struct super_block *sb) #ifdef CONFIG_ROMFS_ON_BLOCK if (sb->s_bdev) { sync_blockdev(sb->s_bdev); - blkdev_put(sb->s_bdev, sb); + bdev_release(sb->s_bdev_handle); } #endif } diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index b17f067e4ada..fe1bf5b6e0cb 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -15,6 +15,7 @@ static struct cached_fid *init_cached_dir(const char *path); static void free_cached_dir(struct cached_fid *cfid); static void smb2_close_cached_fid(struct kref *ref); +static void cfids_laundromat_worker(struct work_struct *work); static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids, const char *path, @@ -169,15 +170,18 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, return -ENOENT; } /* - * At this point we either have a lease already and we can just - * return it. If not we are guaranteed to be the only thread accessing - * this cfid. + * Return cached fid if it has a lease. Otherwise, it is either a new + * entry or laundromat worker removed it from @cfids->entries. Caller + * will put last reference if the latter. */ + spin_lock(&cfids->cfid_list_lock); if (cfid->has_lease) { + spin_unlock(&cfids->cfid_list_lock); *ret_cfid = cfid; kfree(utf16_path); return 0; } + spin_unlock(&cfids->cfid_list_lock); /* * Skip any prefix paths in @path as lookup_positive_unlocked() ends up @@ -294,9 +298,11 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, goto oshr_free; } } + spin_lock(&cfids->cfid_list_lock); cfid->dentry = dentry; cfid->time = jiffies; cfid->has_lease = true; + spin_unlock(&cfids->cfid_list_lock); oshr_free: kfree(utf16_path); @@ -305,24 +311,28 @@ oshr_free: free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); spin_lock(&cfids->cfid_list_lock); - if (rc && !cfid->has_lease) { - if (cfid->on_list) { - list_del(&cfid->entry); - cfid->on_list = false; - cfids->num_entries--; + if (!cfid->has_lease) { + if (rc) { + if (cfid->on_list) { + list_del(&cfid->entry); + cfid->on_list = false; + cfids->num_entries--; + } + rc = -ENOENT; + } else { + /* + * We are guaranteed to have two references at this + * point. One for the caller and one for a potential + * lease. Release the Lease-ref so that the directory + * will be closed when the caller closes the cached + * handle. + */ + spin_unlock(&cfids->cfid_list_lock); + kref_put(&cfid->refcount, smb2_close_cached_fid); + goto out; } - rc = -ENOENT; } spin_unlock(&cfids->cfid_list_lock); - if (!rc && !cfid->has_lease) { - /* - * We are guaranteed to have two references at this point. - * One for the caller and one for a potential lease. - * Release the Lease-ref so that the directory will be closed - * when the caller closes the cached handle. - */ - kref_put(&cfid->refcount, smb2_close_cached_fid); - } if (rc) { if (cfid->is_open) SMB2_close(0, cfid->tcon, cfid->fid.persistent_fid, @@ -330,7 +340,7 @@ oshr_free: free_cached_dir(cfid); cfid = NULL; } - +out: if (rc == 0) { *ret_cfid = cfid; atomic_inc(&tcon->num_remote_opens); @@ -452,6 +462,9 @@ void invalidate_all_cached_dirs(struct cifs_tcon *tcon) struct cached_fid *cfid, *q; LIST_HEAD(entry); + if (cfids == NULL) + return; + spin_lock(&cfids->cfid_list_lock); list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { list_move(&cfid->entry, &entry); @@ -569,53 +582,51 @@ static void free_cached_dir(struct cached_fid *cfid) kfree(cfid); } -static int -cifs_cfids_laundromat_thread(void *p) +static void cfids_laundromat_worker(struct work_struct *work) { - struct cached_fids *cfids = p; + struct cached_fids *cfids; struct cached_fid *cfid, *q; - struct list_head entry; + LIST_HEAD(entry); - while (!kthread_should_stop()) { - ssleep(1); - INIT_LIST_HEAD(&entry); - if (kthread_should_stop()) - return 0; - spin_lock(&cfids->cfid_list_lock); - list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { - if (time_after(jiffies, cfid->time + HZ * dir_cache_timeout)) { - list_del(&cfid->entry); - list_add(&cfid->entry, &entry); - cfids->num_entries--; - } - } - spin_unlock(&cfids->cfid_list_lock); + cfids = container_of(work, struct cached_fids, laundromat_work.work); - list_for_each_entry_safe(cfid, q, &entry, entry) { + spin_lock(&cfids->cfid_list_lock); + list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { + if (cfid->time && + time_after(jiffies, cfid->time + HZ * dir_cache_timeout)) { cfid->on_list = false; - list_del(&cfid->entry); + list_move(&cfid->entry, &entry); + cfids->num_entries--; + /* To prevent race with smb2_cached_lease_break() */ + kref_get(&cfid->refcount); + } + } + spin_unlock(&cfids->cfid_list_lock); + + list_for_each_entry_safe(cfid, q, &entry, entry) { + list_del(&cfid->entry); + /* + * Cancel and wait for the work to finish in case we are racing + * with it. + */ + cancel_work_sync(&cfid->lease_break); + if (cfid->has_lease) { /* - * Cancel, and wait for the work to finish in - * case we are racing with it. + * Our lease has not yet been cancelled from the server + * so we need to drop the reference. */ - cancel_work_sync(&cfid->lease_break); - if (cfid->has_lease) { - /* - * We lease has not yet been cancelled from - * the server so we need to drop the reference. - */ - spin_lock(&cfids->cfid_list_lock); - cfid->has_lease = false; - spin_unlock(&cfids->cfid_list_lock); - kref_put(&cfid->refcount, smb2_close_cached_fid); - } + spin_lock(&cfids->cfid_list_lock); + cfid->has_lease = false; + spin_unlock(&cfids->cfid_list_lock); + kref_put(&cfid->refcount, smb2_close_cached_fid); } + /* Drop the extra reference opened above */ + kref_put(&cfid->refcount, smb2_close_cached_fid); } - - return 0; + queue_delayed_work(cifsiod_wq, &cfids->laundromat_work, + dir_cache_timeout * HZ); } - struct cached_fids *init_cached_dirs(void) { struct cached_fids *cfids; @@ -626,19 +637,10 @@ struct cached_fids *init_cached_dirs(void) spin_lock_init(&cfids->cfid_list_lock); INIT_LIST_HEAD(&cfids->entries); - /* - * since we're in a cifs function already, we know that - * this will succeed. No need for try_module_get(). - */ - __module_get(THIS_MODULE); - cfids->laundromat = kthread_run(cifs_cfids_laundromat_thread, - cfids, "cifsd-cfid-laundromat"); - if (IS_ERR(cfids->laundromat)) { - cifs_dbg(VFS, "Failed to start cfids laundromat thread.\n"); - kfree(cfids); - module_put(THIS_MODULE); - return NULL; - } + INIT_DELAYED_WORK(&cfids->laundromat_work, cfids_laundromat_worker); + queue_delayed_work(cifsiod_wq, &cfids->laundromat_work, + dir_cache_timeout * HZ); + return cfids; } @@ -651,11 +653,10 @@ void free_cached_dirs(struct cached_fids *cfids) struct cached_fid *cfid, *q; LIST_HEAD(entry); - if (cfids->laundromat) { - kthread_stop(cfids->laundromat); - cfids->laundromat = NULL; - module_put(THIS_MODULE); - } + if (cfids == NULL) + return; + + cancel_delayed_work_sync(&cfids->laundromat_work); spin_lock(&cfids->cfid_list_lock); list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { diff --git a/fs/smb/client/cached_dir.h b/fs/smb/client/cached_dir.h index a82ff2cea789..81ba0fd5cc16 100644 --- a/fs/smb/client/cached_dir.h +++ b/fs/smb/client/cached_dir.h @@ -57,7 +57,7 @@ struct cached_fids { spinlock_t cfid_list_lock; int num_entries; struct list_head entries; - struct task_struct *laundromat; + struct delayed_work laundromat_work; }; extern struct cached_fids *init_cached_dirs(void); diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h index 41daebd220ff..8ca3d7606bb4 100644 --- a/fs/smb/client/cifsfs.h +++ b/fs/smb/client/cifsfs.h @@ -127,7 +127,7 @@ extern int cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, struct dentry *direntry, const char *symname); #ifdef CONFIG_CIFS_XATTR -extern const struct xattr_handler *cifs_xattr_handlers[]; +extern const struct xattr_handler * const cifs_xattr_handlers[]; extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); #else # define cifs_xattr_handlers NULL diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 032d8716f671..02082621d8e0 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -1807,6 +1807,7 @@ static inline bool is_retryable_error(int error) #define MID_RETRY_NEEDED 8 /* session closed while this request out */ #define MID_RESPONSE_MALFORMED 0x10 #define MID_SHUTDOWN 0x20 +#define MID_RESPONSE_READY 0x40 /* ready for other process handle the rsp */ /* Flags */ #define MID_WAIT_CANCELLED 1 /* Cancelled while waiting for response */ @@ -1943,7 +1944,7 @@ require use of the stronger protocol */ * cifsInodeInfo->lock_sem cifsInodeInfo->llist cifs_init_once * ->can_cache_brlcks * cifsInodeInfo->deferred_lock cifsInodeInfo->deferred_closes cifsInodeInfo_alloc - * cached_fid->fid_mutex cifs_tcon->crfid tconInfoAlloc + * cached_fid->fid_mutex cifs_tcon->crfid tcon_info_alloc * cifsFileInfo->fh_mutex cifsFileInfo cifs_new_fileinfo * cifsFileInfo->file_info_lock cifsFileInfo->count cifs_new_fileinfo * ->invalidHandle initiate_cifs_search diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index 7d8035846680..0c37eefa18a5 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -512,7 +512,7 @@ extern int CIFSSMBLogoff(const unsigned int xid, struct cifs_ses *ses); extern struct cifs_ses *sesInfoAlloc(void); extern void sesInfoFree(struct cifs_ses *); -extern struct cifs_tcon *tconInfoAlloc(void); +extern struct cifs_tcon *tcon_info_alloc(bool dir_leases_enabled); extern void tconInfoFree(struct cifs_tcon *); extern int cifs_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server, diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 687754791bf0..7b923e36501b 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -1882,7 +1882,8 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb3_fs_context *ctx) } } - tcon = tconInfoAlloc(); + /* no need to setup directory caching on IPC share, so pass in false */ + tcon = tcon_info_alloc(false); if (tcon == NULL) return -ENOMEM; @@ -2473,8 +2474,9 @@ cifs_put_tcon(struct cifs_tcon *tcon) static struct cifs_tcon * cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) { - int rc, xid; struct cifs_tcon *tcon; + bool nohandlecache; + int rc, xid; tcon = cifs_find_tcon(ses, ctx); if (tcon) { @@ -2492,11 +2494,17 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) goto out_fail; } - tcon = tconInfoAlloc(); + if (ses->server->dialect >= SMB20_PROT_ID && + (ses->server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING)) + nohandlecache = ctx->nohandlecache; + else + nohandlecache = true; + tcon = tcon_info_alloc(!nohandlecache); if (tcon == NULL) { rc = -ENOMEM; goto out_fail; } + tcon->nohandlecache = nohandlecache; if (ctx->snapshot_time) { if (ses->server->vals->protocol_id == 0) { @@ -2658,10 +2666,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) tcon->nocase = ctx->nocase; tcon->broken_sparse_sup = ctx->no_sparse; tcon->max_cached_dirs = ctx->max_cached_dirs; - if (ses->server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING) - tcon->nohandlecache = ctx->nohandlecache; - else - tcon->nohandlecache = true; tcon->nodelete = ctx->nodelete; tcon->local_lease = ctx->local_lease; INIT_LIST_HEAD(&tcon->pending_opens); @@ -2891,9 +2895,9 @@ bind_socket(struct TCP_Server_Info *server) if (server->srcaddr.ss_family != AF_UNSPEC) { /* Bind to the specified local IP address */ struct socket *socket = server->ssocket; - rc = socket->ops->bind(socket, - (struct sockaddr *) &server->srcaddr, - sizeof(server->srcaddr)); + rc = kernel_bind(socket, + (struct sockaddr *) &server->srcaddr, + sizeof(server->srcaddr)); if (rc < 0) { struct sockaddr_in *saddr4; struct sockaddr_in6 *saddr6; @@ -3042,8 +3046,8 @@ generic_ip_connect(struct TCP_Server_Info *server) socket->sk->sk_sndbuf, socket->sk->sk_rcvbuf, socket->sk->sk_rcvtimeo); - rc = socket->ops->connect(socket, saddr, slen, - server->noblockcnt ? O_NONBLOCK : 0); + rc = kernel_connect(socket, saddr, slen, + server->noblockcnt ? O_NONBLOCK : 0); /* * When mounting SMB root file systems, we do not want to block in * connect. Otherwise bail out and then let cifs_reconnect() perform diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 2108b3b40ce9..cf17e3dd703e 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -1085,7 +1085,8 @@ int cifs_close(struct inode *inode, struct file *file) !test_bit(CIFS_INO_CLOSE_ON_LOCK, &cinode->flags) && dclose) { if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) { - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, + inode_set_ctime_current(inode)); } spin_lock(&cinode->deferred_lock); cifs_add_deferred_close(cfile, dclose); @@ -2596,7 +2597,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) write_data, to - from, &offset); cifsFileInfo_put(open_file); /* Does mm or vfs already set times? */ - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); if ((bytes_written > 0) && (offset)) rc = 0; else if (bytes_written < 0) @@ -4647,11 +4648,13 @@ static void cifs_readahead(struct readahead_control *ractl) static int cifs_readpage_worker(struct file *file, struct page *page, loff_t *poffset) { + struct inode *inode = file_inode(file); + struct timespec64 atime, mtime; char *read_data; int rc; /* Is the page cached? */ - rc = cifs_readpage_from_fscache(file_inode(file), page); + rc = cifs_readpage_from_fscache(inode, page); if (rc == 0) goto read_complete; @@ -4666,11 +4669,10 @@ static int cifs_readpage_worker(struct file *file, struct page *page, cifs_dbg(FYI, "Bytes read %d\n", rc); /* we do not want atime to be less than mtime, it broke some apps */ - file_inode(file)->i_atime = current_time(file_inode(file)); - if (timespec64_compare(&(file_inode(file)->i_atime), &(file_inode(file)->i_mtime))) - file_inode(file)->i_atime = file_inode(file)->i_mtime; - else - file_inode(file)->i_atime = current_time(file_inode(file)); + atime = inode_set_atime_to_ts(inode, current_time(inode)); + mtime = inode_get_mtime(inode); + if (timespec64_compare(&atime, &mtime)) + inode_set_atime_to_ts(inode, inode_get_mtime(inode)); if (PAGE_SIZE > rc) memset(read_data + rc, 0, PAGE_SIZE - rc); diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index e45ce31bbda7..a3493da12ad1 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -1541,6 +1541,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, cifs_parse_mount_err: kfree_sensitive(ctx->password); + ctx->password = NULL; return -EINVAL; } diff --git a/fs/smb/client/fscache.h b/fs/smb/client/fscache.h index 84f3b09367d2..a3d73720914f 100644 --- a/fs/smb/client/fscache.h +++ b/fs/smb/client/fscache.h @@ -49,12 +49,12 @@ static inline void cifs_fscache_fill_coherency(struct inode *inode, struct cifs_fscache_inode_coherency_data *cd) { - struct cifsInodeInfo *cifsi = CIFS_I(inode); struct timespec64 ctime = inode_get_ctime(inode); + struct timespec64 mtime = inode_get_mtime(inode); memset(cd, 0, sizeof(*cd)); - cd->last_write_time_sec = cpu_to_le64(cifsi->netfs.inode.i_mtime.tv_sec); - cd->last_write_time_nsec = cpu_to_le32(cifsi->netfs.inode.i_mtime.tv_nsec); + cd->last_write_time_sec = cpu_to_le64(mtime.tv_sec); + cd->last_write_time_nsec = cpu_to_le32(mtime.tv_nsec); cd->last_change_time_sec = cpu_to_le64(ctime.tv_sec); cd->last_change_time_nsec = cpu_to_le32(ctime.tv_nsec); } diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index de2dfbaae821..3abfe77bfa46 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -82,6 +82,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) { struct cifs_fscache_inode_coherency_data cd; struct cifsInodeInfo *cifs_i = CIFS_I(inode); + struct timespec64 mtime; cifs_dbg(FYI, "%s: revalidating inode %llu\n", __func__, cifs_i->uniqueid); @@ -101,7 +102,8 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr) /* revalidate if mtime or size have changed */ fattr->cf_mtime = timestamp_truncate(fattr->cf_mtime, inode); - if (timespec64_equal(&inode->i_mtime, &fattr->cf_mtime) && + mtime = inode_get_mtime(inode); + if (timespec64_equal(&mtime, &fattr->cf_mtime) && cifs_i->server_eof == fattr->cf_eof) { cifs_dbg(FYI, "%s: inode %llu is unchanged\n", __func__, cifs_i->uniqueid); @@ -164,10 +166,10 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) fattr->cf_ctime = timestamp_truncate(fattr->cf_ctime, inode); /* we do not want atime to be less than mtime, it broke some apps */ if (timespec64_compare(&fattr->cf_atime, &fattr->cf_mtime) < 0) - inode->i_atime = fattr->cf_mtime; + inode_set_atime_to_ts(inode, fattr->cf_mtime); else - inode->i_atime = fattr->cf_atime; - inode->i_mtime = fattr->cf_mtime; + inode_set_atime_to_ts(inode, fattr->cf_atime); + inode_set_mtime_to_ts(inode, fattr->cf_mtime); inode_set_ctime_to_ts(inode, fattr->cf_ctime); inode->i_rdev = fattr->cf_rdev; cifs_nlink_fattr_to_inode(inode, fattr); @@ -1816,7 +1818,7 @@ out_reval: when needed */ inode_set_ctime_current(inode); } - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); cifs_inode = CIFS_I(dir); CIFS_I(dir)->time = 0; /* force revalidate of dir as well */ unlink_out: @@ -2131,7 +2133,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) cifsInode->time = 0; inode_set_ctime_current(d_inode(direntry)); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); rmdir_exit: free_dentry_path(page); @@ -2337,9 +2339,6 @@ unlink_target: /* force revalidate to go get info when needed */ CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0; - source_dir->i_mtime = target_dir->i_mtime = inode_set_ctime_to_ts(source_dir, - inode_set_ctime_current(target_dir)); - cifs_rename_exit: kfree(info_buf_source); free_dentry_path(page2); @@ -2680,7 +2679,7 @@ int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start, } cifsFileInfo_put(cfile); - return -ENOTSUPP; + return -EOPNOTSUPP; } int cifs_truncate_page(struct address_space *mapping, loff_t from) diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index 366b755ca913..35b176457bbe 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -113,18 +113,22 @@ sesInfoFree(struct cifs_ses *buf_to_free) } struct cifs_tcon * -tconInfoAlloc(void) +tcon_info_alloc(bool dir_leases_enabled) { struct cifs_tcon *ret_buf; ret_buf = kzalloc(sizeof(*ret_buf), GFP_KERNEL); if (!ret_buf) return NULL; - ret_buf->cfids = init_cached_dirs(); - if (!ret_buf->cfids) { - kfree(ret_buf); - return NULL; + + if (dir_leases_enabled == true) { + ret_buf->cfids = init_cached_dirs(); + if (!ret_buf->cfids) { + kfree(ret_buf); + return NULL; + } } + /* else ret_buf->cfids is already set to NULL above */ atomic_inc(&tconInfoAllocCount); ret_buf->status = TID_NEW; diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index b41e2e872b22..0b89f7008ac0 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -539,6 +539,9 @@ static int parse_create_response(struct cifs_open_info_data *data, int rc = 0; switch (rsp->hdr.Status) { + case STATUS_IO_REPARSE_TAG_NOT_HANDLED: + reparse_point = true; + break; case STATUS_STOPPED_ON_SYMLINK: rc = smb2_parse_symlink_response(cifs_sb, iov, &data->symlink_target); diff --git a/fs/smb/client/smb2maperror.c b/fs/smb/client/smb2maperror.c index 194799ddd382..1a90dd78b238 100644 --- a/fs/smb/client/smb2maperror.c +++ b/fs/smb/client/smb2maperror.c @@ -877,8 +877,6 @@ static const struct status_to_posix_error smb2_error_map_table[] = { "STATUS_IO_REPARSE_TAG_MISMATCH"}, {STATUS_IO_REPARSE_DATA_INVALID, -EIO, "STATUS_IO_REPARSE_DATA_INVALID"}, - {STATUS_IO_REPARSE_TAG_NOT_HANDLED, -EIO, - "STATUS_IO_REPARSE_TAG_NOT_HANDLED"}, {STATUS_REPARSE_POINT_NOT_RESOLVED, -EIO, "STATUS_REPARSE_POINT_NOT_RESOLVED"}, {STATUS_DIRECTORY_IS_A_REPARSE_POINT, -EIO, diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index d9eda2e958b4..f4849a8ad40b 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -297,7 +297,7 @@ smb2_adjust_credits(struct TCP_Server_Info *server, cifs_server_dbg(VFS, "request has less credits (%d) than required (%d)", credits->value, new_val); - return -ENOTSUPP; + return -EOPNOTSUPP; } spin_lock(&server->req_lock); @@ -1161,7 +1161,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, /* Use a fudge factor of 256 bytes in case we collide * with a different set_EAs command. */ - if(CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE - + if (CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE - MAX_SMB2_CLOSE_RESPONSE_SIZE - 256 < used_len + ea_name_len + ea_value_len + 1) { rc = -ENOSPC; @@ -1403,12 +1403,14 @@ smb2_close_getattr(const unsigned int xid, struct cifs_tcon *tcon, /* Creation time should not need to be updated on close */ if (file_inf.LastWriteTime) - inode->i_mtime = cifs_NTtimeToUnix(file_inf.LastWriteTime); + inode_set_mtime_to_ts(inode, + cifs_NTtimeToUnix(file_inf.LastWriteTime)); if (file_inf.ChangeTime) inode_set_ctime_to_ts(inode, cifs_NTtimeToUnix(file_inf.ChangeTime)); if (file_inf.LastAccessTime) - inode->i_atime = cifs_NTtimeToUnix(file_inf.LastAccessTime); + inode_set_atime_to_ts(inode, + cifs_NTtimeToUnix(file_inf.LastAccessTime)); /* * i_blocks is not related to (i_size / i_blksize), @@ -4591,7 +4593,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, if (shdr->Command != SMB2_READ) { cifs_server_dbg(VFS, "only big read responses are supported\n"); - return -ENOTSUPP; + return -EOPNOTSUPP; } if (server->ops->is_session_expired && diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 092b0087c9dc..c75a80bb6d9e 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -89,20 +89,26 @@ smb2_hdr_assemble(struct smb2_hdr *shdr, __le16 smb2_cmd, struct TCP_Server_Info *server) { struct smb3_hdr_req *smb3_hdr; + shdr->ProtocolId = SMB2_PROTO_NUMBER; shdr->StructureSize = cpu_to_le16(64); shdr->Command = smb2_cmd; - if (server->dialect >= SMB30_PROT_ID) { - /* After reconnect SMB3 must set ChannelSequence on subsequent reqs */ - smb3_hdr = (struct smb3_hdr_req *)shdr; - /* if primary channel is not set yet, use default channel for chan sequence num */ - if (SERVER_IS_CHAN(server)) - smb3_hdr->ChannelSequence = - cpu_to_le16(server->primary_server->channel_sequence_num); - else - smb3_hdr->ChannelSequence = cpu_to_le16(server->channel_sequence_num); - } + if (server) { + /* After reconnect SMB3 must set ChannelSequence on subsequent reqs */ + if (server->dialect >= SMB30_PROT_ID) { + smb3_hdr = (struct smb3_hdr_req *)shdr; + /* + * if primary channel is not set yet, use default + * channel for chan sequence num + */ + if (SERVER_IS_CHAN(server)) + smb3_hdr->ChannelSequence = + cpu_to_le16(server->primary_server->channel_sequence_num); + else + smb3_hdr->ChannelSequence = + cpu_to_le16(server->channel_sequence_num); + } spin_lock(&server->req_lock); /* Request up to 10 credits but don't go over the limit. */ if (server->credits >= server->max_credits) @@ -842,7 +848,7 @@ add_posix_context(struct kvec *iov, unsigned int *num_iovec, umode_t mode) iov[num].iov_base = create_posix_buf(mode); if (mode == ACL_NO_MODE) - cifs_dbg(FYI, "Invalid mode\n"); + cifs_dbg(FYI, "%s: no mode\n", __func__); if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = sizeof(struct create_posix); @@ -2234,7 +2240,7 @@ create_durable_v2_buf(struct cifs_open_parms *oparms) * (most servers default to 120 seconds) and most clients default to 0. * This can be overridden at mount ("handletimeout=") if the user wants * a different persistent (or resilient) handle timeout for all opens - * opens on a particular SMB3 mount. + * on a particular SMB3 mount. */ buf->dcontext.Timeout = cpu_to_le32(oparms->tcon->handle_timeout); buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT); @@ -2379,7 +2385,7 @@ add_twarp_context(struct kvec *iov, unsigned int *num_iovec, __u64 timewarp) return 0; } -/* See See http://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx */ +/* See http://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx */ static void setup_owner_group_sids(char *buf) { struct owner_group_sids *sids = (struct owner_group_sids *)buf; @@ -3124,6 +3130,7 @@ void SMB2_ioctl_free(struct smb_rqst *rqst) { int i; + if (rqst && rqst->rq_iov) { cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */ for (i = 1; i < rqst->rq_nvec; i++) @@ -3871,7 +3878,7 @@ void smb2_reconnect_server(struct work_struct *work) goto done; /* allocate a dummy tcon struct used for reconnect */ - tcon = tconInfoAlloc(); + tcon = tcon_info_alloc(false); if (!tcon) { resched = true; list_for_each_entry_safe(ses, ses2, &tmp_ses_list, rlist) { diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 2a2aec8c6112..94df9eec3d8d 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -1401,10 +1401,13 @@ create_conn: server->smbd_conn = smbd_get_connection( server, (struct sockaddr *) &server->dstaddr); - if (server->smbd_conn) + if (server->smbd_conn) { cifs_dbg(VFS, "RDMA transport re-established\n"); - - return server->smbd_conn ? 0 : -ENOENT; + trace_smb3_smbd_connect_done(server->hostname, server->conn_id, &server->dstaddr); + return 0; + } + trace_smb3_smbd_connect_err(server->hostname, server->conn_id, &server->dstaddr); + return -ENOENT; } static void destroy_caches_and_workqueue(struct smbd_connection *info) diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index a7e4755bed0f..de199ec9f726 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -935,6 +935,8 @@ DEFINE_EVENT(smb3_connect_class, smb3_##name, \ TP_ARGS(hostname, conn_id, addr)) DEFINE_SMB3_CONNECT_EVENT(connect_done); +DEFINE_SMB3_CONNECT_EVENT(smbd_connect_done); +DEFINE_SMB3_CONNECT_EVENT(smbd_connect_err); DECLARE_EVENT_CLASS(smb3_connect_err_class, TP_PROTO(char *hostname, __u64 conn_id, diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index 1b5d9794ed5b..14710afdc2a3 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -18,7 +18,7 @@ #include <linux/bvec.h> #include <linux/highmem.h> #include <linux/uaccess.h> -#include <asm/processor.h> +#include <linux/processor.h> #include <linux/mempool.h> #include <linux/sched/signal.h> #include <linux/task_io_accounting_ops.h> @@ -35,6 +35,8 @@ void cifs_wake_up_task(struct mid_q_entry *mid) { + if (mid->mid_state == MID_RESPONSE_RECEIVED) + mid->mid_state = MID_RESPONSE_READY; wake_up_process(mid->callback_data); } @@ -87,7 +89,8 @@ static void __release_mid(struct kref *refcount) struct TCP_Server_Info *server = midEntry->server; if (midEntry->resp_buf && (midEntry->mid_flags & MID_WAIT_CANCELLED) && - midEntry->mid_state == MID_RESPONSE_RECEIVED && + (midEntry->mid_state == MID_RESPONSE_RECEIVED || + midEntry->mid_state == MID_RESPONSE_READY) && server->ops->handle_cancelled_mid) server->ops->handle_cancelled_mid(midEntry, server); @@ -737,7 +740,8 @@ wait_for_response(struct TCP_Server_Info *server, struct mid_q_entry *midQ) int error; error = wait_event_state(server->response_q, - midQ->mid_state != MID_REQUEST_SUBMITTED, + midQ->mid_state != MID_REQUEST_SUBMITTED && + midQ->mid_state != MID_RESPONSE_RECEIVED, (TASK_KILLABLE|TASK_FREEZABLE_UNSAFE)); if (error < 0) return -ERESTARTSYS; @@ -890,7 +894,7 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) spin_lock(&server->mid_lock); switch (mid->mid_state) { - case MID_RESPONSE_RECEIVED: + case MID_RESPONSE_READY: spin_unlock(&server->mid_lock); return rc; case MID_RETRY_NEEDED: @@ -989,6 +993,9 @@ cifs_compound_callback(struct mid_q_entry *mid) credits.instance = server->reconnect_instance; add_credits(server, &credits, mid->optype); + + if (mid->mid_state == MID_RESPONSE_RECEIVED) + mid->mid_state = MID_RESPONSE_READY; } static void @@ -1209,7 +1216,8 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, send_cancel(server, &rqst[i], midQ[i]); spin_lock(&server->mid_lock); midQ[i]->mid_flags |= MID_WAIT_CANCELLED; - if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED) { + if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED || + midQ[i]->mid_state == MID_RESPONSE_RECEIVED) { midQ[i]->callback = cifs_cancelled_callback; cancelled_mid[i] = true; credits[i].value = 0; @@ -1230,7 +1238,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, } if (!midQ[i]->resp_buf || - midQ[i]->mid_state != MID_RESPONSE_RECEIVED) { + midQ[i]->mid_state != MID_RESPONSE_READY) { rc = -EIO; cifs_dbg(FYI, "Bad MID state?\n"); goto out; @@ -1417,7 +1425,8 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, if (rc != 0) { send_cancel(server, &rqst, midQ); spin_lock(&server->mid_lock); - if (midQ->mid_state == MID_REQUEST_SUBMITTED) { + if (midQ->mid_state == MID_REQUEST_SUBMITTED || + midQ->mid_state == MID_RESPONSE_RECEIVED) { /* no longer considered to be "in-flight" */ midQ->callback = release_mid; spin_unlock(&server->mid_lock); @@ -1434,7 +1443,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses, } if (!midQ->resp_buf || !out_buf || - midQ->mid_state != MID_RESPONSE_RECEIVED) { + midQ->mid_state != MID_RESPONSE_READY) { rc = -EIO; cifs_server_dbg(VFS, "Bad MID state?\n"); goto out; @@ -1558,14 +1567,16 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, /* Wait for a reply - allow signals to interrupt. */ rc = wait_event_interruptible(server->response_q, - (!(midQ->mid_state == MID_REQUEST_SUBMITTED)) || + (!(midQ->mid_state == MID_REQUEST_SUBMITTED || + midQ->mid_state == MID_RESPONSE_RECEIVED)) || ((server->tcpStatus != CifsGood) && (server->tcpStatus != CifsNew))); /* Were we interrupted by a signal ? */ spin_lock(&server->srv_lock); if ((rc == -ERESTARTSYS) && - (midQ->mid_state == MID_REQUEST_SUBMITTED) && + (midQ->mid_state == MID_REQUEST_SUBMITTED || + midQ->mid_state == MID_RESPONSE_RECEIVED) && ((server->tcpStatus == CifsGood) || (server->tcpStatus == CifsNew))) { spin_unlock(&server->srv_lock); @@ -1596,7 +1607,8 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, if (rc) { send_cancel(server, &rqst, midQ); spin_lock(&server->mid_lock); - if (midQ->mid_state == MID_REQUEST_SUBMITTED) { + if (midQ->mid_state == MID_REQUEST_SUBMITTED || + midQ->mid_state == MID_RESPONSE_RECEIVED) { /* no longer considered to be "in-flight" */ midQ->callback = release_mid; spin_unlock(&server->mid_lock); @@ -1616,7 +1628,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon, return rc; /* rcvd frame is ok */ - if (out_buf == NULL || midQ->mid_state != MID_RESPONSE_RECEIVED) { + if (out_buf == NULL || midQ->mid_state != MID_RESPONSE_READY) { rc = -EIO; cifs_tcon_dbg(VFS, "Bad MID state?\n"); goto out; diff --git a/fs/smb/client/xattr.c b/fs/smb/client/xattr.c index 4ad5531686d8..ac199160bce6 100644 --- a/fs/smb/client/xattr.c +++ b/fs/smb/client/xattr.c @@ -478,7 +478,7 @@ static const struct xattr_handler smb3_ntsd_full_xattr_handler = { .set = cifs_xattr_set, }; -const struct xattr_handler *cifs_xattr_handlers[] = { +const struct xattr_handler * const cifs_xattr_handlers[] = { &cifs_user_xattr_handler, &cifs_os2_xattr_handler, &cifs_cifs_acl_xattr_handler, diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index 0d990c2f33cd..4b38c3a285f6 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -84,6 +84,8 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) spin_lock_init(&conn->llist_lock); INIT_LIST_HEAD(&conn->lock_list); + init_rwsem(&conn->session_lock); + down_write(&conn_list_lock); list_add(&conn->conns_list, &conn_list); up_write(&conn_list_lock); @@ -197,6 +199,9 @@ int ksmbd_conn_write(struct ksmbd_work *work) if (work->send_no_response) return 0; + if (!work->iov_idx) + return -EINVAL; + ksmbd_conn_lock(conn); sent = conn->transport->ops->writev(conn->transport, work->iov, work->iov_cnt, diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index ab2583f030ce..3c005246a32e 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -50,6 +50,7 @@ struct ksmbd_conn { struct nls_table *local_nls; struct unicode_map *um; struct list_head conns_list; + struct rw_semaphore session_lock; /* smb session 1 per user */ struct xarray sessions; unsigned long last_active; diff --git a/fs/smb/server/mgmt/tree_connect.c b/fs/smb/server/mgmt/tree_connect.c index 408cddf2f094..d2c81a8a11dd 100644 --- a/fs/smb/server/mgmt/tree_connect.c +++ b/fs/smb/server/mgmt/tree_connect.c @@ -73,7 +73,10 @@ ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, tree_conn->user = sess->user; tree_conn->share_conf = sc; + tree_conn->t_state = TREE_NEW; status.tree_conn = tree_conn; + atomic_set(&tree_conn->refcount, 1); + init_waitqueue_head(&tree_conn->refcount_q); ret = xa_err(xa_store(&sess->tree_conns, tree_conn->id, tree_conn, GFP_KERNEL)); @@ -93,14 +96,33 @@ out_error: return status; } +void ksmbd_tree_connect_put(struct ksmbd_tree_connect *tcon) +{ + /* + * Checking waitqueue to releasing tree connect on + * tree disconnect. waitqueue_active is safe because it + * uses atomic operation for condition. + */ + if (!atomic_dec_return(&tcon->refcount) && + waitqueue_active(&tcon->refcount_q)) + wake_up(&tcon->refcount_q); +} + int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess, struct ksmbd_tree_connect *tree_conn) { int ret; + write_lock(&sess->tree_conns_lock); + xa_erase(&sess->tree_conns, tree_conn->id); + write_unlock(&sess->tree_conns_lock); + + if (!atomic_dec_and_test(&tree_conn->refcount)) + wait_event(tree_conn->refcount_q, + atomic_read(&tree_conn->refcount) == 0); + ret = ksmbd_ipc_tree_disconnect_request(sess->id, tree_conn->id); ksmbd_release_tree_conn_id(sess, tree_conn->id); - xa_erase(&sess->tree_conns, tree_conn->id); ksmbd_share_config_put(tree_conn->share_conf); kfree(tree_conn); return ret; @@ -111,11 +133,15 @@ struct ksmbd_tree_connect *ksmbd_tree_conn_lookup(struct ksmbd_session *sess, { struct ksmbd_tree_connect *tcon; + read_lock(&sess->tree_conns_lock); tcon = xa_load(&sess->tree_conns, id); if (tcon) { - if (test_bit(TREE_CONN_EXPIRE, &tcon->status)) + if (tcon->t_state != TREE_CONNECTED) + tcon = NULL; + else if (!atomic_inc_not_zero(&tcon->refcount)) tcon = NULL; } + read_unlock(&sess->tree_conns_lock); return tcon; } @@ -129,8 +155,18 @@ int ksmbd_tree_conn_session_logoff(struct ksmbd_session *sess) if (!sess) return -EINVAL; - xa_for_each(&sess->tree_conns, id, tc) + xa_for_each(&sess->tree_conns, id, tc) { + write_lock(&sess->tree_conns_lock); + if (tc->t_state == TREE_DISCONNECTED) { + write_unlock(&sess->tree_conns_lock); + ret = -ENOENT; + continue; + } + tc->t_state = TREE_DISCONNECTED; + write_unlock(&sess->tree_conns_lock); + ret |= ksmbd_tree_conn_disconnect(sess, tc); + } xa_destroy(&sess->tree_conns); return ret; } diff --git a/fs/smb/server/mgmt/tree_connect.h b/fs/smb/server/mgmt/tree_connect.h index 562d647ad9fa..6377a70b811c 100644 --- a/fs/smb/server/mgmt/tree_connect.h +++ b/fs/smb/server/mgmt/tree_connect.h @@ -14,7 +14,11 @@ struct ksmbd_share_config; struct ksmbd_user; struct ksmbd_conn; -#define TREE_CONN_EXPIRE 1 +enum { + TREE_NEW = 0, + TREE_CONNECTED, + TREE_DISCONNECTED +}; struct ksmbd_tree_connect { int id; @@ -27,7 +31,9 @@ struct ksmbd_tree_connect { int maximal_access; bool posix_extensions; - unsigned long status; + atomic_t refcount; + wait_queue_head_t refcount_q; + unsigned int t_state; }; struct ksmbd_tree_conn_status { @@ -46,6 +52,7 @@ struct ksmbd_session; struct ksmbd_tree_conn_status ksmbd_tree_conn_connect(struct ksmbd_conn *conn, struct ksmbd_session *sess, const char *share_name); +void ksmbd_tree_connect_put(struct ksmbd_tree_connect *tcon); int ksmbd_tree_conn_disconnect(struct ksmbd_session *sess, struct ksmbd_tree_connect *tree_conn); diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c index 8a5dcab05614..15f68ee05089 100644 --- a/fs/smb/server/mgmt/user_session.c +++ b/fs/smb/server/mgmt/user_session.c @@ -174,7 +174,7 @@ static void ksmbd_expire_session(struct ksmbd_conn *conn) unsigned long id; struct ksmbd_session *sess; - down_write(&sessions_table_lock); + down_write(&conn->session_lock); xa_for_each(&conn->sessions, id, sess) { if (sess->state != SMB2_SESSION_VALID || time_after(jiffies, @@ -185,7 +185,7 @@ static void ksmbd_expire_session(struct ksmbd_conn *conn) continue; } } - up_write(&sessions_table_lock); + up_write(&conn->session_lock); } int ksmbd_session_register(struct ksmbd_conn *conn, @@ -227,7 +227,9 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn) } } } + up_write(&sessions_table_lock); + down_write(&conn->session_lock); xa_for_each(&conn->sessions, id, sess) { unsigned long chann_id; struct channel *chann; @@ -244,7 +246,7 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn) ksmbd_session_destroy(sess); } } - up_write(&sessions_table_lock); + up_write(&conn->session_lock); } struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn, @@ -252,9 +254,11 @@ struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn, { struct ksmbd_session *sess; + down_read(&conn->session_lock); sess = xa_load(&conn->sessions, id); if (sess) sess->last_active = jiffies; + up_read(&conn->session_lock); return sess; } @@ -351,6 +355,7 @@ static struct ksmbd_session *__session_create(int protocol) xa_init(&sess->ksmbd_chann_list); xa_init(&sess->rpc_handle_list); sess->sequence_number = 1; + rwlock_init(&sess->tree_conns_lock); ret = __init_smb2_session(sess); if (ret) diff --git a/fs/smb/server/mgmt/user_session.h b/fs/smb/server/mgmt/user_session.h index f99d475b28db..63cb08fffde8 100644 --- a/fs/smb/server/mgmt/user_session.h +++ b/fs/smb/server/mgmt/user_session.h @@ -60,6 +60,7 @@ struct ksmbd_session { struct ksmbd_file_table file_table; unsigned long last_active; + rwlock_t tree_conns_lock; }; static inline int test_session_flag(struct ksmbd_session *sess, int bit) diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index 5ab2f52f9b35..3079e607c5fe 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -115,8 +115,10 @@ static int __process_request(struct ksmbd_work *work, struct ksmbd_conn *conn, if (check_conn_state(work)) return SERVER_HANDLER_CONTINUE; - if (ksmbd_verify_smb_message(work)) + if (ksmbd_verify_smb_message(work)) { + conn->ops->set_rsp_status(work, STATUS_INVALID_PARAMETER); return SERVER_HANDLER_ABORT; + } command = conn->ops->get_cmd_val(work); *cmd = command; @@ -239,6 +241,8 @@ static void __handle_ksmbd_work(struct ksmbd_work *work, } while (is_chained == true); send: + if (work->tcon) + ksmbd_tree_connect_put(work->tcon); smb3_preauth_hash_rsp(work); if (work->sess && work->sess->enc && work->encrypted && conn->ops->encrypt_resp) { diff --git a/fs/smb/server/smb2misc.c b/fs/smb/server/smb2misc.c index e881df1d10cb..23bd3d1209df 100644 --- a/fs/smb/server/smb2misc.c +++ b/fs/smb/server/smb2misc.c @@ -440,10 +440,8 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) validate_credit: if ((work->conn->vals->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU) && - smb2_validate_credit_charge(work->conn, hdr)) { - work->conn->ops->set_rsp_status(work, STATUS_INVALID_PARAMETER); + smb2_validate_credit_charge(work->conn, hdr)) return 1; - } return 0; } diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 749660110878..658209839729 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -231,11 +231,12 @@ void set_smb2_rsp_status(struct ksmbd_work *work, __le32 err) { struct smb2_hdr *rsp_hdr; - if (work->next_smb2_rcv_hdr_off) - rsp_hdr = ksmbd_resp_buf_next(work); - else - rsp_hdr = smb2_get_msg(work->response_buf); + rsp_hdr = smb2_get_msg(work->response_buf); rsp_hdr->Status = err; + + work->iov_idx = 0; + work->iov_cnt = 0; + work->next_smb2_rcv_hdr_off = 0; smb2_set_err_rsp(work); } @@ -1993,6 +1994,9 @@ int smb2_tree_connect(struct ksmbd_work *work) if (conn->posix_ext_supported) status.tree_conn->posix_extensions = true; + write_lock(&sess->tree_conns_lock); + status.tree_conn->t_state = TREE_CONNECTED; + write_unlock(&sess->tree_conns_lock); rsp->StructureSize = cpu_to_le16(16); out_err1: rsp->Capabilities = 0; @@ -2122,27 +2126,50 @@ int smb2_tree_disconnect(struct ksmbd_work *work) ksmbd_debug(SMB, "request\n"); + if (!tcon) { + ksmbd_debug(SMB, "Invalid tid %d\n", req->hdr.Id.SyncId.TreeId); + + rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED; + err = -ENOENT; + goto err_out; + } + + ksmbd_close_tree_conn_fds(work); + + write_lock(&sess->tree_conns_lock); + if (tcon->t_state == TREE_DISCONNECTED) { + write_unlock(&sess->tree_conns_lock); + rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED; + err = -ENOENT; + goto err_out; + } + + WARN_ON_ONCE(atomic_dec_and_test(&tcon->refcount)); + tcon->t_state = TREE_DISCONNECTED; + write_unlock(&sess->tree_conns_lock); + + err = ksmbd_tree_conn_disconnect(sess, tcon); + if (err) { + rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED; + goto err_out; + } + + work->tcon = NULL; + rsp->StructureSize = cpu_to_le16(4); err = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_tree_disconnect_rsp)); if (err) { rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; - smb2_set_err_rsp(work); - return err; + goto err_out; } - if (!tcon || test_and_set_bit(TREE_CONN_EXPIRE, &tcon->status)) { - ksmbd_debug(SMB, "Invalid tid %d\n", req->hdr.Id.SyncId.TreeId); + return 0; - rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED; - smb2_set_err_rsp(work); - return -ENOENT; - } +err_out: + smb2_set_err_rsp(work); + return err; - ksmbd_close_tree_conn_fds(work); - ksmbd_tree_conn_disconnect(sess, tcon); - work->tcon = NULL; - return 0; } /** @@ -2164,17 +2191,17 @@ int smb2_session_logoff(struct ksmbd_work *work) ksmbd_debug(SMB, "request\n"); - sess_id = le64_to_cpu(req->hdr.SessionId); - - rsp->StructureSize = cpu_to_le16(4); - err = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_logoff_rsp)); - if (err) { - rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; + ksmbd_conn_lock(conn); + if (!ksmbd_conn_good(conn)) { + ksmbd_conn_unlock(conn); + rsp->hdr.Status = STATUS_NETWORK_NAME_DELETED; smb2_set_err_rsp(work); - return err; + return -ENOENT; } - + sess_id = le64_to_cpu(req->hdr.SessionId); ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_RECONNECT); + ksmbd_conn_unlock(conn); + ksmbd_close_session_fds(work); ksmbd_conn_wait_idle(conn, sess_id); @@ -2196,6 +2223,14 @@ int smb2_session_logoff(struct ksmbd_work *work) ksmbd_free_user(sess->user); sess->user = NULL; ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_NEGOTIATE); + + rsp->StructureSize = cpu_to_le16(4); + err = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_logoff_rsp)); + if (err) { + rsp->hdr.Status = STATUS_INSUFFICIENT_RESOURCES; + smb2_set_err_rsp(work); + return err; + } return 0; } @@ -3370,8 +3405,10 @@ err_out: } ksmbd_revert_fsids(work); err_out1: - if (!rc) + if (!rc) { + ksmbd_update_fstate(&work->sess->file_table, fp, FP_INITED); rc = ksmbd_iov_pin_rsp(work, (void *)rsp, iov_len); + } if (rc) { if (rc == -EINVAL) rsp->hdr.Status = STATUS_INVALID_PARAMETER; @@ -4797,9 +4834,9 @@ static void find_file_posix_info(struct smb2_query_info_rsp *rsp, file_info = (struct smb311_posix_qinfo *)rsp->Buffer; file_info->CreationTime = cpu_to_le64(fp->create_time); - time = ksmbd_UnixTimeToNT(inode->i_atime); + time = ksmbd_UnixTimeToNT(inode_get_atime(inode)); file_info->LastAccessTime = cpu_to_le64(time); - time = ksmbd_UnixTimeToNT(inode->i_mtime); + time = ksmbd_UnixTimeToNT(inode_get_mtime(inode)); file_info->LastWriteTime = cpu_to_le64(time); time = ksmbd_UnixTimeToNT(inode_get_ctime(inode)); file_info->ChangeTime = cpu_to_le64(time); @@ -5406,9 +5443,9 @@ int smb2_close(struct ksmbd_work *work) rsp->EndOfFile = cpu_to_le64(inode->i_size); rsp->Attributes = fp->f_ci->m_fattr; rsp->CreationTime = cpu_to_le64(fp->create_time); - time = ksmbd_UnixTimeToNT(inode->i_atime); + time = ksmbd_UnixTimeToNT(inode_get_atime(inode)); rsp->LastAccessTime = cpu_to_le64(time); - time = ksmbd_UnixTimeToNT(inode->i_mtime); + time = ksmbd_UnixTimeToNT(inode_get_mtime(inode)); rsp->LastWriteTime = cpu_to_le64(time); time = ksmbd_UnixTimeToNT(inode_get_ctime(inode)); rsp->ChangeTime = cpu_to_le64(time); @@ -6115,12 +6152,12 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work) memcpy(aux_payload_buf, rpc_resp->payload, rpc_resp->payload_sz); nbytes = rpc_resp->payload_sz; - kvfree(rpc_resp); err = ksmbd_iov_pin_rsp_read(work, (void *)rsp, offsetof(struct smb2_read_rsp, Buffer), aux_payload_buf, nbytes); if (err) goto out; + kvfree(rpc_resp); } else { err = ksmbd_iov_pin_rsp(work, (void *)rsp, offsetof(struct smb2_read_rsp, Buffer)); @@ -6312,7 +6349,7 @@ int smb2_read(struct ksmbd_work *work) aux_payload_buf, nbytes); kvfree(aux_payload_buf); - + aux_payload_buf = NULL; nbytes = 0; if (remain_bytes < 0) { err = (int)remain_bytes; @@ -7028,10 +7065,6 @@ skip: ksmbd_debug(SMB, "would have to wait for getting lock\n"); - spin_lock(&work->conn->llist_lock); - list_add_tail(&smb_lock->clist, - &work->conn->lock_list); - spin_unlock(&work->conn->llist_lock); list_add(&smb_lock->llist, &rollback_list); argv = kmalloc(sizeof(void *), GFP_KERNEL); @@ -7062,9 +7095,6 @@ skip: if (work->state != KSMBD_WORK_ACTIVE) { list_del(&smb_lock->llist); - spin_lock(&work->conn->llist_lock); - list_del(&smb_lock->clist); - spin_unlock(&work->conn->llist_lock); locks_free_lock(flock); if (work->state == KSMBD_WORK_CANCELLED) { @@ -7084,19 +7114,16 @@ skip: } list_del(&smb_lock->llist); - spin_lock(&work->conn->llist_lock); - list_del(&smb_lock->clist); - spin_unlock(&work->conn->llist_lock); release_async_work(work); goto retry; } else if (!rc) { + list_add(&smb_lock->llist, &rollback_list); spin_lock(&work->conn->llist_lock); list_add_tail(&smb_lock->clist, &work->conn->lock_list); list_add_tail(&smb_lock->flist, &fp->lock_list); spin_unlock(&work->conn->llist_lock); - list_add(&smb_lock->llist, &rollback_list); ksmbd_debug(SMB, "successful in taking lock\n"); } else { goto out; @@ -8036,10 +8063,10 @@ static void smb20_oplock_break_ack(struct ksmbd_work *work) goto err_out; } - opinfo_put(opinfo); - ksmbd_fd_put(work, fp); opinfo->op_state = OPLOCK_STATE_NONE; wake_up_interruptible_all(&opinfo->oplock_q); + opinfo_put(opinfo); + ksmbd_fd_put(work, fp); rsp->StructureSize = cpu_to_le16(24); rsp->OplockLevel = rsp_oplevel; diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c index e5e438bf5499..6c0305be895e 100644 --- a/fs/smb/server/smbacl.c +++ b/fs/smb/server/smbacl.c @@ -1420,7 +1420,6 @@ int set_info_sec(struct ksmbd_conn *conn, struct ksmbd_tree_connect *tcon, out: posix_acl_release(fattr.cf_acls); posix_acl_release(fattr.cf_dacls); - mark_inode_dirty(inode); return rc; } diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c index f41f8d6108ce..c91eac6514dd 100644 --- a/fs/smb/server/vfs_cache.c +++ b/fs/smb/server/vfs_cache.c @@ -106,7 +106,7 @@ int ksmbd_query_inode_status(struct inode *inode) ci = __ksmbd_inode_lookup(inode); if (ci) { ret = KSMBD_INODE_STATUS_OK; - if (ci->m_flags & S_DEL_PENDING) + if (ci->m_flags & (S_DEL_PENDING | S_DEL_ON_CLS)) ret = KSMBD_INODE_STATUS_PENDING_DELETE; atomic_dec(&ci->m_count); } @@ -116,7 +116,7 @@ int ksmbd_query_inode_status(struct inode *inode) bool ksmbd_inode_pending_delete(struct ksmbd_file *fp) { - return (fp->f_ci->m_flags & S_DEL_PENDING); + return (fp->f_ci->m_flags & (S_DEL_PENDING | S_DEL_ON_CLS)); } void ksmbd_set_inode_pending_delete(struct ksmbd_file *fp) @@ -333,6 +333,9 @@ static void __ksmbd_close_fd(struct ksmbd_file_table *ft, struct ksmbd_file *fp) static struct ksmbd_file *ksmbd_fp_get(struct ksmbd_file *fp) { + if (fp->f_state != FP_INITED) + return NULL; + if (!atomic_inc_not_zero(&fp->refcount)) return NULL; return fp; @@ -382,15 +385,20 @@ int ksmbd_close_fd(struct ksmbd_work *work, u64 id) return 0; ft = &work->sess->file_table; - read_lock(&ft->lock); + write_lock(&ft->lock); fp = idr_find(ft->idr, id); if (fp) { set_close_state_blocked_works(fp); - if (!atomic_dec_and_test(&fp->refcount)) + if (fp->f_state != FP_INITED) fp = NULL; + else { + fp->f_state = FP_CLOSED; + if (!atomic_dec_and_test(&fp->refcount)) + fp = NULL; + } } - read_unlock(&ft->lock); + write_unlock(&ft->lock); if (!fp) return -EINVAL; @@ -570,6 +578,7 @@ struct ksmbd_file *ksmbd_open_fd(struct ksmbd_work *work, struct file *filp) fp->tcon = work->tcon; fp->volatile_id = KSMBD_NO_FID; fp->persistent_id = KSMBD_NO_FID; + fp->f_state = FP_NEW; fp->f_ci = ksmbd_inode_get(fp); if (!fp->f_ci) { @@ -591,6 +600,17 @@ err_out: return ERR_PTR(ret); } +void ksmbd_update_fstate(struct ksmbd_file_table *ft, struct ksmbd_file *fp, + unsigned int state) +{ + if (!fp) + return; + + write_lock(&ft->lock); + fp->f_state = state; + write_unlock(&ft->lock); +} + static int __close_file_table_ids(struct ksmbd_file_table *ft, struct ksmbd_tree_connect *tcon, diff --git a/fs/smb/server/vfs_cache.h b/fs/smb/server/vfs_cache.h index fcb13413fa8d..03d0bf941216 100644 --- a/fs/smb/server/vfs_cache.h +++ b/fs/smb/server/vfs_cache.h @@ -60,6 +60,12 @@ struct ksmbd_inode { __le32 m_fattr; }; +enum { + FP_NEW = 0, + FP_INITED, + FP_CLOSED +}; + struct ksmbd_file { struct file *filp; u64 persistent_id; @@ -98,6 +104,7 @@ struct ksmbd_file { /* if ls is happening on directory, below is valid*/ struct ksmbd_readdir_data readdir_data; int dot_dotdot[2]; + unsigned int f_state; }; static inline void set_ctx_actor(struct dir_context *ctx, @@ -142,6 +149,8 @@ int ksmbd_close_inode_fds(struct ksmbd_work *work, struct inode *inode); int ksmbd_init_global_file_table(void); void ksmbd_free_global_file_table(void); void ksmbd_set_fd_limit(unsigned long limit); +void ksmbd_update_fstate(struct ksmbd_file_table *ft, struct ksmbd_file *fp, + unsigned int state); /* * INODE hash diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index c6e626b00546..aa3411354e66 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -59,9 +59,9 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode, i_uid_write(inode, i_uid); i_gid_write(inode, i_gid); inode->i_ino = le32_to_cpu(sqsh_ino->inode_number); - inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime); - inode->i_atime.tv_sec = inode->i_mtime.tv_sec; - inode_set_ctime(inode, inode->i_mtime.tv_sec, 0); + inode_set_mtime(inode, le32_to_cpu(sqsh_ino->mtime), 0); + inode_set_atime(inode, inode_get_mtime_sec(inode), 0); + inode_set_ctime(inode, inode_get_mtime_sec(inode), 0); inode->i_mode = le16_to_cpu(sqsh_ino->mode); inode->i_size = 0; diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index a6164fdf9435..5a756e6790b5 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -111,4 +111,4 @@ extern const struct address_space_operations squashfs_symlink_aops; extern const struct inode_operations squashfs_symlink_inode_ops; /* xattr.c */ -extern const struct xattr_handler *squashfs_xattr_handlers[]; +extern const struct xattr_handler * const squashfs_xattr_handlers[]; diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c index e1e3f3dd5a06..ce6608cabd49 100644 --- a/fs/squashfs/xattr.c +++ b/fs/squashfs/xattr.c @@ -262,7 +262,7 @@ static const struct xattr_handler *squashfs_xattr_handler(int type) } } -const struct xattr_handler *squashfs_xattr_handlers[] = { +const struct xattr_handler * const squashfs_xattr_handlers[] = { &squashfs_xattr_user_handler, &squashfs_xattr_trusted_handler, &squashfs_xattr_security_handler, diff --git a/fs/stack.c b/fs/stack.c index b5e01bdb5f5f..f18920119944 100644 --- a/fs/stack.c +++ b/fs/stack.c @@ -66,8 +66,8 @@ void fsstack_copy_attr_all(struct inode *dest, const struct inode *src) dest->i_uid = src->i_uid; dest->i_gid = src->i_gid; dest->i_rdev = src->i_rdev; - dest->i_atime = src->i_atime; - dest->i_mtime = src->i_mtime; + inode_set_atime_to_ts(dest, inode_get_atime(src)); + inode_set_mtime_to_ts(dest, inode_get_mtime(src)); inode_set_ctime_to_ts(dest, inode_get_ctime(src)); dest->i_blkbits = src->i_blkbits; dest->i_flags = src->i_flags; diff --git a/fs/stat.c b/fs/stat.c index 6822ac77aec2..24bb0209e459 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -27,37 +27,6 @@ #include "mount.h" /** - * fill_mg_cmtime - Fill in the mtime and ctime and flag ctime as QUERIED - * @stat: where to store the resulting values - * @request_mask: STATX_* values requested - * @inode: inode from which to grab the c/mtime - * - * Given @inode, grab the ctime and mtime out if it and store the result - * in @stat. When fetching the value, flag it as queried so the next write - * will use a fine-grained timestamp. - */ -void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode) -{ - atomic_long_t *pnsec = (atomic_long_t *)&inode->__i_ctime.tv_nsec; - - /* If neither time was requested, then don't report them */ - if (!(request_mask & (STATX_CTIME|STATX_MTIME))) { - stat->result_mask &= ~(STATX_CTIME|STATX_MTIME); - return; - } - - stat->mtime = inode->i_mtime; - stat->ctime.tv_sec = inode->__i_ctime.tv_sec; - /* - * Atomically set the QUERIED flag and fetch the new value with - * the flag masked off. - */ - stat->ctime.tv_nsec = atomic_long_fetch_or(I_CTIME_QUERIED, pnsec) & - ~I_CTIME_QUERIED; -} -EXPORT_SYMBOL(fill_mg_cmtime); - -/** * generic_fillattr - Fill in the basic attributes from the inode struct * @idmap: idmap of the mount the inode was found from * @request_mask: statx request_mask @@ -88,15 +57,9 @@ void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask, stat->gid = vfsgid_into_kgid(vfsgid); stat->rdev = inode->i_rdev; stat->size = i_size_read(inode); - stat->atime = inode->i_atime; - - if (is_mgtime(inode)) { - fill_mg_cmtime(stat, request_mask, inode); - } else { - stat->mtime = inode->i_mtime; - stat->ctime = inode_get_ctime(inode); - } - + stat->atime = inode_get_atime(inode); + stat->mtime = inode_get_mtime(inode); + stat->ctime = inode_get_ctime(inode); stat->blksize = i_blocksize(inode); stat->blocks = inode->i_blocks; @@ -419,12 +382,6 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat #ifdef __ARCH_WANT_NEW_STAT -#if BITS_PER_LONG == 32 -# define choose_32_64(a,b) a -#else -# define choose_32_64(a,b) b -#endif - #ifndef INIT_STRUCT_STAT_PADDING # define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st)) #endif diff --git a/fs/super.c b/fs/super.c index 2d762ce67f6e..c7b452e12e4c 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1419,32 +1419,48 @@ EXPORT_SYMBOL(sget_dev); #ifdef CONFIG_BLOCK /* - * Lock a super block that the callers holds a reference to. + * Lock the superblock that is holder of the bdev. Returns the superblock + * pointer if we successfully locked the superblock and it is alive. Otherwise + * we return NULL and just unlock bdev->bd_holder_lock. * - * The caller needs to ensure that the super_block isn't being freed while - * calling this function, e.g. by holding a lock over the call to this function - * and the place that clears the pointer to the superblock used by this function - * before freeing the superblock. + * The function must be called with bdev->bd_holder_lock and releases it. */ -static bool super_lock_shared_active(struct super_block *sb) +static struct super_block *bdev_super_lock_shared(struct block_device *bdev) + __releases(&bdev->bd_holder_lock) { - bool born = super_lock_shared(sb); + struct super_block *sb = bdev->bd_holder; + bool born; + + lockdep_assert_held(&bdev->bd_holder_lock); + lockdep_assert_not_held(&sb->s_umount); + lockdep_assert_not_held(&bdev->bd_disk->open_mutex); + + /* Make sure sb doesn't go away from under us */ + spin_lock(&sb_lock); + sb->s_count++; + spin_unlock(&sb_lock); + mutex_unlock(&bdev->bd_holder_lock); + born = super_lock_shared(sb); if (!born || !sb->s_root || !(sb->s_flags & SB_ACTIVE)) { super_unlock_shared(sb); - return false; + put_super(sb); + return NULL; } - return true; + /* + * The superblock is active and we hold s_umount, we can drop our + * temporary reference now. + */ + put_super(sb); + return sb; } static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise) { - struct super_block *sb = bdev->bd_holder; - - /* bd_holder_lock ensures that the sb isn't freed */ - lockdep_assert_held(&bdev->bd_holder_lock); + struct super_block *sb; - if (!super_lock_shared_active(sb)) + sb = bdev_super_lock_shared(bdev); + if (!sb) return; if (!surprise) @@ -1459,11 +1475,10 @@ static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise) static void fs_bdev_sync(struct block_device *bdev) { - struct super_block *sb = bdev->bd_holder; - - lockdep_assert_held(&bdev->bd_holder_lock); + struct super_block *sb; - if (!super_lock_shared_active(sb)) + sb = bdev_super_lock_shared(bdev); + if (!sb) return; sync_filesystem(sb); super_unlock_shared(sb); @@ -1479,14 +1494,16 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, struct fs_context *fc) { blk_mode_t mode = sb_open_mode(sb_flags); + struct bdev_handle *bdev_handle; struct block_device *bdev; - bdev = blkdev_get_by_dev(sb->s_dev, mode, sb, &fs_holder_ops); - if (IS_ERR(bdev)) { + bdev_handle = bdev_open_by_dev(sb->s_dev, mode, sb, &fs_holder_ops); + if (IS_ERR(bdev_handle)) { if (fc) errorf(fc, "%s: Can't open blockdev", fc->source); - return PTR_ERR(bdev); + return PTR_ERR(bdev_handle); } + bdev = bdev_handle->bdev; /* * This really should be in blkdev_get_by_dev, but right now can't due @@ -1494,7 +1511,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, * writable from userspace even for a read-only block device. */ if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) { - blkdev_put(bdev, sb); + bdev_release(bdev_handle); return -EACCES; } @@ -1510,10 +1527,11 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, mutex_unlock(&bdev->bd_fsfreeze_mutex); if (fc) warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); - blkdev_put(bdev, sb); + bdev_release(bdev_handle); return -EBUSY; } spin_lock(&sb_lock); + sb->s_bdev_handle = bdev_handle; sb->s_bdev = bdev; sb->s_bdi = bdi_get(bdev->bd_disk->bdi); if (bdev_stable_writes(bdev)) @@ -1646,7 +1664,7 @@ void kill_block_super(struct super_block *sb) generic_shutdown_super(sb); if (bdev) { sync_blockdev(bdev); - blkdev_put(bdev, sb); + bdev_release(sb->s_bdev_handle); } } diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 2f5ead88d00b..2e126d72d619 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -224,7 +224,7 @@ got_it: memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2); de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); dir_commit_chunk(page, pos, SYSV_DIRSIZE); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); err = sysv_handle_dirsync(dir); out_page: @@ -249,7 +249,7 @@ int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page) } de->inode = 0; dir_commit_chunk(page, pos, SYSV_DIRSIZE); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); return sysv_handle_dirsync(inode); } @@ -346,7 +346,7 @@ int sysv_set_link(struct sysv_dir_entry *de, struct page *page, } de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); dir_commit_chunk(page, pos, SYSV_DIRSIZE); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); return sysv_handle_dirsync(inode); } diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c index 6719da5889d9..269df6d49815 100644 --- a/fs/sysv/ialloc.c +++ b/fs/sysv/ialloc.c @@ -165,7 +165,7 @@ struct inode * sysv_new_inode(const struct inode * dir, umode_t mode) dirty_sb(sb); inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_ino = fs16_to_cpu(sbi, ino); - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); inode->i_blocks = 0; memset(SYSV_I(inode)->i_data, 0, sizeof(SYSV_I(inode)->i_data)); SYSV_I(inode)->i_dir_start_lookup = 0; diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 0aa3827d8178..5a915b2e68f5 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -200,11 +200,9 @@ struct inode *sysv_iget(struct super_block *sb, unsigned int ino) i_gid_write(inode, (gid_t)fs16_to_cpu(sbi, raw_inode->i_gid)); set_nlink(inode, fs16_to_cpu(sbi, raw_inode->i_nlink)); inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size); - inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_atime); - inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_mtime); + inode_set_atime(inode, fs32_to_cpu(sbi, raw_inode->i_atime), 0); + inode_set_mtime(inode, fs32_to_cpu(sbi, raw_inode->i_mtime), 0); inode_set_ctime(inode, fs32_to_cpu(sbi, raw_inode->i_ctime), 0); - inode->i_atime.tv_nsec = 0; - inode->i_mtime.tv_nsec = 0; inode->i_blocks = 0; si = SYSV_I(inode); @@ -253,9 +251,9 @@ static int __sysv_write_inode(struct inode *inode, int wait) raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(i_gid_read(inode))); raw_inode->i_nlink = cpu_to_fs16(sbi, inode->i_nlink); raw_inode->i_size = cpu_to_fs32(sbi, inode->i_size); - raw_inode->i_atime = cpu_to_fs32(sbi, inode->i_atime.tv_sec); - raw_inode->i_mtime = cpu_to_fs32(sbi, inode->i_mtime.tv_sec); - raw_inode->i_ctime = cpu_to_fs32(sbi, inode_get_ctime(inode).tv_sec); + raw_inode->i_atime = cpu_to_fs32(sbi, inode_get_atime_sec(inode)); + raw_inode->i_mtime = cpu_to_fs32(sbi, inode_get_mtime_sec(inode)); + raw_inode->i_ctime = cpu_to_fs32(sbi, inode_get_ctime_sec(inode)); si = SYSV_I(inode); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index edb94e55de8e..725981474e5f 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -423,7 +423,7 @@ do_indirects: } n++; } - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (IS_SYNC(inode)) sysv_sync_inode (inode); else diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 237c6f370ad9..8c8d64e76103 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -70,6 +70,7 @@ static struct dentry *eventfs_root_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); static int dcache_dir_open_wrapper(struct inode *inode, struct file *file); +static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx); static int eventfs_release(struct inode *inode, struct file *file); static const struct inode_operations eventfs_root_dir_inode_operations = { @@ -79,7 +80,7 @@ static const struct inode_operations eventfs_root_dir_inode_operations = { static const struct file_operations eventfs_file_operations = { .open = dcache_dir_open_wrapper, .read = generic_read_dir, - .iterate_shared = dcache_readdir, + .iterate_shared = dcache_readdir_wrapper, .llseek = generic_file_llseek, .release = eventfs_release, }; @@ -185,17 +186,49 @@ static struct dentry *create_dir(const char *name, struct dentry *parent, void * /** * eventfs_set_ef_status_free - set the ef->status to free + * @ti: the tracefs_inode of the dentry * @dentry: dentry who's status to be freed * * eventfs_set_ef_status_free will be called if no more * references remain */ -void eventfs_set_ef_status_free(struct dentry *dentry) +void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry) { struct tracefs_inode *ti_parent; - struct eventfs_file *ef; + struct eventfs_inode *ei; + struct eventfs_file *ef, *tmp; + + /* The top level events directory may be freed by this */ + if (unlikely(ti->flags & TRACEFS_EVENT_TOP_INODE)) { + LIST_HEAD(ef_del_list); + + mutex_lock(&eventfs_mutex); + + ei = ti->private; + + /* Record all the top level files */ + list_for_each_entry_srcu(ef, &ei->e_top_files, list, + lockdep_is_held(&eventfs_mutex)) { + list_add_tail(&ef->del_list, &ef_del_list); + } + + /* Nothing should access this, but just in case! */ + ti->private = NULL; + + mutex_unlock(&eventfs_mutex); + + /* Now safely free the top level files and their children */ + list_for_each_entry_safe(ef, tmp, &ef_del_list, del_list) { + list_del(&ef->del_list); + eventfs_remove(ef); + } + + kfree(ei); + return; + } mutex_lock(&eventfs_mutex); + ti_parent = get_tracefs(dentry->d_parent->d_inode); if (!ti_parent || !(ti_parent->flags & TRACEFS_EVENT_INODE)) goto out; @@ -364,6 +397,11 @@ static struct dentry *eventfs_root_lookup(struct inode *dir, return ret; } +struct dentry_list { + void *cursor; + struct dentry **dentries; +}; + /** * eventfs_release - called to release eventfs file/dir * @inode: inode to be released @@ -372,26 +410,25 @@ static struct dentry *eventfs_root_lookup(struct inode *dir, static int eventfs_release(struct inode *inode, struct file *file) { struct tracefs_inode *ti; - struct eventfs_inode *ei; - struct eventfs_file *ef; - struct dentry *dentry; - int idx; + struct dentry_list *dlist = file->private_data; + void *cursor; + int i; ti = get_tracefs(inode); if (!(ti->flags & TRACEFS_EVENT_INODE)) return -EINVAL; - ei = ti->private; - idx = srcu_read_lock(&eventfs_srcu); - list_for_each_entry_srcu(ef, &ei->e_top_files, list, - srcu_read_lock_held(&eventfs_srcu)) { - mutex_lock(&eventfs_mutex); - dentry = ef->dentry; - mutex_unlock(&eventfs_mutex); - if (dentry) - dput(dentry); + if (WARN_ON_ONCE(!dlist)) + return -EINVAL; + + for (i = 0; dlist->dentries && dlist->dentries[i]; i++) { + dput(dlist->dentries[i]); } - srcu_read_unlock(&eventfs_srcu, idx); + + cursor = dlist->cursor; + kfree(dlist->dentries); + kfree(dlist); + file->private_data = cursor; return dcache_dir_close(inode, file); } @@ -410,21 +447,70 @@ static int dcache_dir_open_wrapper(struct inode *inode, struct file *file) struct tracefs_inode *ti; struct eventfs_inode *ei; struct eventfs_file *ef; + struct dentry_list *dlist; + struct dentry **dentries = NULL; struct dentry *dentry = file_dentry(file); + struct dentry *d; struct inode *f_inode = file_inode(file); + int cnt = 0; int idx; + int ret; ti = get_tracefs(f_inode); if (!(ti->flags & TRACEFS_EVENT_INODE)) return -EINVAL; + if (WARN_ON_ONCE(file->private_data)) + return -EINVAL; + + dlist = kmalloc(sizeof(*dlist), GFP_KERNEL); + if (!dlist) + return -ENOMEM; + ei = ti->private; idx = srcu_read_lock(&eventfs_srcu); - list_for_each_entry_rcu(ef, &ei->e_top_files, list) { - create_dentry(ef, dentry, false); + list_for_each_entry_srcu(ef, &ei->e_top_files, list, + srcu_read_lock_held(&eventfs_srcu)) { + d = create_dentry(ef, dentry, false); + if (d) { + struct dentry **tmp; + + tmp = krealloc(dentries, sizeof(d) * (cnt + 2), GFP_KERNEL); + if (!tmp) + break; + tmp[cnt] = d; + tmp[cnt + 1] = NULL; + cnt++; + dentries = tmp; + } } srcu_read_unlock(&eventfs_srcu, idx); - return dcache_dir_open(inode, file); + ret = dcache_dir_open(inode, file); + + /* + * dcache_dir_open() sets file->private_data to a dentry cursor. + * Need to save that but also save all the dentries that were + * opened by this function. + */ + dlist->cursor = file->private_data; + dlist->dentries = dentries; + file->private_data = dlist; + return ret; +} + +/* + * This just sets the file->private_data back to the cursor and back. + */ +static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx) +{ + struct dentry_list *dlist = file->private_data; + int ret; + + file->private_data = dlist->cursor; + ret = dcache_readdir(file, ctx); + dlist->cursor = file->private_data; + file->private_data = dlist; + return ret; } /** @@ -491,6 +577,9 @@ struct dentry *eventfs_create_events_dir(const char *name, struct tracefs_inode *ti; struct inode *inode; + if (security_locked_down(LOCKDOWN_TRACEFS)) + return NULL; + if (IS_ERR(dentry)) return dentry; @@ -507,7 +596,7 @@ struct dentry *eventfs_create_events_dir(const char *name, INIT_LIST_HEAD(&ei->e_top_files); ti = get_tracefs(inode); - ti->flags |= TRACEFS_EVENT_INODE; + ti->flags |= TRACEFS_EVENT_INODE | TRACEFS_EVENT_TOP_INODE; ti->private = ei; inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; @@ -538,6 +627,9 @@ struct eventfs_file *eventfs_add_subsystem_dir(const char *name, struct eventfs_inode *ei_parent; struct eventfs_file *ef; + if (security_locked_down(LOCKDOWN_TRACEFS)) + return NULL; + if (!parent) return ERR_PTR(-EINVAL); @@ -569,6 +661,9 @@ struct eventfs_file *eventfs_add_dir(const char *name, { struct eventfs_file *ef; + if (security_locked_down(LOCKDOWN_TRACEFS)) + return NULL; + if (!ef_parent) return ERR_PTR(-EINVAL); @@ -606,6 +701,9 @@ int eventfs_add_events_file(const char *name, umode_t mode, struct eventfs_inode *ei; struct eventfs_file *ef; + if (security_locked_down(LOCKDOWN_TRACEFS)) + return -ENODEV; + if (!parent) return -EINVAL; @@ -654,6 +752,9 @@ int eventfs_add_file(const char *name, umode_t mode, { struct eventfs_file *ef; + if (security_locked_down(LOCKDOWN_TRACEFS)) + return -ENODEV; + if (!ef_parent) return -EINVAL; @@ -791,7 +892,6 @@ void eventfs_remove(struct eventfs_file *ef) void eventfs_remove_events_dir(struct dentry *dentry) { struct tracefs_inode *ti; - struct eventfs_inode *ei; if (!dentry || !dentry->d_inode) return; @@ -800,8 +900,6 @@ void eventfs_remove_events_dir(struct dentry *dentry) if (!ti || !(ti->flags & TRACEFS_EVENT_INODE)) return; - ei = ti->private; d_invalidate(dentry); dput(dentry); - kfree(ei); } diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index de5b72216b1a..429603d865a9 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -152,7 +152,7 @@ struct inode *tracefs_get_inode(struct super_block *sb) struct inode *inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); - inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); } return inode; } @@ -385,7 +385,7 @@ static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode) ti = get_tracefs(inode); if (ti && ti->flags & TRACEFS_EVENT_INODE) - eventfs_set_ef_status_free(dentry); + eventfs_set_ef_status_free(ti, dentry); iput(inode); } @@ -673,6 +673,9 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent, */ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent) { + if (security_locked_down(LOCKDOWN_TRACEFS)) + return NULL; + return __create_dir(name, parent, &simple_dir_inode_operations); } diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h index 69c2b1d87c46..4f2e49e2197b 100644 --- a/fs/tracefs/internal.h +++ b/fs/tracefs/internal.h @@ -3,7 +3,8 @@ #define _TRACEFS_INTERNAL_H enum { - TRACEFS_EVENT_INODE = BIT(1), + TRACEFS_EVENT_INODE = BIT(1), + TRACEFS_EVENT_TOP_INODE = BIT(2), }; struct tracefs_inode { @@ -24,6 +25,6 @@ struct inode *tracefs_get_inode(struct super_block *sb); struct dentry *eventfs_start_creating(const char *name, struct dentry *parent); struct dentry *eventfs_failed_creating(struct dentry *dentry); struct dentry *eventfs_end_creating(struct dentry *dentry); -void eventfs_set_ef_status_free(struct dentry *dentry); +void eventfs_set_ef_status_free(struct tracefs_inode *ti, struct dentry *dentry); #endif /* _TRACEFS_INTERNAL_H */ diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c index 3125e76376ee..921f9033d0d2 100644 --- a/fs/ubifs/crypto.c +++ b/fs/ubifs/crypto.c @@ -88,8 +88,7 @@ int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn, } const struct fscrypt_operations ubifs_crypt_operations = { - .flags = FS_CFLG_OWN_PAGES, - .key_prefix = "ubifs:", + .legacy_key_prefix = "ubifs:", .get_context = ubifs_crypt_get_context, .set_context = ubifs_crypt_set_context, .empty_dir = ubifs_crypt_empty_dir, diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index eef9e527d9ff..d013c5b3f1ed 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -237,14 +237,14 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode) pr_err("\tuid %u\n", (unsigned int)i_uid_read(inode)); pr_err("\tgid %u\n", (unsigned int)i_gid_read(inode)); pr_err("\tatime %u.%u\n", - (unsigned int)inode->i_atime.tv_sec, - (unsigned int)inode->i_atime.tv_nsec); + (unsigned int) inode_get_atime_sec(inode), + (unsigned int) inode_get_atime_nsec(inode)); pr_err("\tmtime %u.%u\n", - (unsigned int)inode->i_mtime.tv_sec, - (unsigned int)inode->i_mtime.tv_nsec); + (unsigned int) inode_get_mtime_sec(inode), + (unsigned int) inode_get_mtime_nsec(inode)); pr_err("\tctime %u.%u\n", - (unsigned int) inode_get_ctime(inode).tv_sec, - (unsigned int) inode_get_ctime(inode).tv_nsec); + (unsigned int) inode_get_ctime_sec(inode), + (unsigned int) inode_get_ctime_nsec(inode)); pr_err("\tcreat_sqnum %llu\n", ui->creat_sqnum); pr_err("\txattr_size %u\n", ui->xattr_size); pr_err("\txattr_cnt %u\n", ui->xattr_cnt); diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 2f48c58d47cd..7af442de44c3 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -96,7 +96,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, inode->i_flags |= S_NOCMTIME; inode_init_owner(&nop_mnt_idmap, inode, dir, mode); - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); inode->i_mapping->nrpages = 0; if (!is_xattr) { @@ -324,7 +324,8 @@ static int ubifs_create(struct mnt_idmap *idmap, struct inode *dir, mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; - dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode)); + inode_set_mtime_to_ts(dir, + inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) goto out_cancel; @@ -767,7 +768,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, inode_set_ctime_current(inode); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; - dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode)); + inode_set_mtime_to_ts(dir, + inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) goto out_cancel; @@ -841,7 +843,8 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) drop_nlink(inode); dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; - dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode)); + inode_set_mtime_to_ts(dir, + inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); if (err) goto out_cancel; @@ -944,7 +947,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) drop_nlink(dir); dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; - dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode)); + inode_set_mtime_to_ts(dir, + inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); if (err) goto out_cancel; @@ -1018,7 +1022,8 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir, inc_nlink(dir); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; - dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode)); + inode_set_mtime_to_ts(dir, + inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) { ubifs_err(c, "cannot create directory, error %d", err); @@ -1109,7 +1114,8 @@ static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir, mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; - dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode)); + inode_set_mtime_to_ts(dir, + inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) goto out_cancel; @@ -1209,7 +1215,8 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir, mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; - dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode)); + inode_set_mtime_to_ts(dir, + inode_set_ctime_to_ts(dir, inode_get_ctime(inode))); err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) goto out_cancel; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index e5382f0b2587..2e65fd2dbdc3 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1088,9 +1088,9 @@ static void do_attr_changes(struct inode *inode, const struct iattr *attr) if (attr->ia_valid & ATTR_GID) inode->i_gid = attr->ia_gid; if (attr->ia_valid & ATTR_ATIME) - inode->i_atime = attr->ia_atime; + inode_set_atime_to_ts(inode, attr->ia_atime); if (attr->ia_valid & ATTR_MTIME) - inode->i_mtime = attr->ia_mtime; + inode_set_mtime_to_ts(inode, attr->ia_mtime); if (attr->ia_valid & ATTR_CTIME) inode_set_ctime_to_ts(inode, attr->ia_ctime); if (attr->ia_valid & ATTR_MODE) { @@ -1192,7 +1192,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, mutex_lock(&ui->ui_mutex); ui->ui_size = inode->i_size; /* Truncation changes inode [mc]time */ - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); /* Other attributes may be changed at the same time as well */ do_attr_changes(inode, attr); err = ubifs_jnl_truncate(c, inode, old_size, new_size); @@ -1239,7 +1239,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, mutex_lock(&ui->ui_mutex); if (attr->ia_valid & ATTR_SIZE) { /* Truncation changes inode [mc]time */ - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); /* 'truncate_setsize()' changed @i_size, update @ui_size */ ui->ui_size = inode->i_size; } @@ -1365,9 +1365,9 @@ static inline int mctime_update_needed(const struct inode *inode, const struct timespec64 *now) { struct timespec64 ctime = inode_get_ctime(inode); + struct timespec64 mtime = inode_get_mtime(inode); - if (!timespec64_equal(&inode->i_mtime, now) || - !timespec64_equal(&ctime, now)) + if (!timespec64_equal(&mtime, now) || !timespec64_equal(&ctime, now)) return 1; return 0; } @@ -1429,7 +1429,7 @@ static int update_mctime(struct inode *inode) return err; mutex_lock(&ui->ui_mutex); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); release = ui->dirty; mark_inode_dirty_sync(inode); mutex_unlock(&ui->ui_mutex); @@ -1567,7 +1567,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) struct ubifs_inode *ui = ubifs_inode(inode); mutex_lock(&ui->ui_mutex); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); release = ui->dirty; mark_inode_dirty_sync(inode); mutex_unlock(&ui->ui_mutex); diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index ffc9beee7be6..d69d2154645b 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -452,12 +452,12 @@ static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino, ino->ch.node_type = UBIFS_INO_NODE; ino_key_init_flash(c, &ino->key, inode->i_ino); ino->creat_sqnum = cpu_to_le64(ui->creat_sqnum); - ino->atime_sec = cpu_to_le64(inode->i_atime.tv_sec); - ino->atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); - ino->ctime_sec = cpu_to_le64(inode_get_ctime(inode).tv_sec); - ino->ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); - ino->mtime_sec = cpu_to_le64(inode->i_mtime.tv_sec); - ino->mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + ino->atime_sec = cpu_to_le64(inode_get_atime_sec(inode)); + ino->atime_nsec = cpu_to_le32(inode_get_atime_nsec(inode)); + ino->ctime_sec = cpu_to_le64(inode_get_ctime_sec(inode)); + ino->ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode)); + ino->mtime_sec = cpu_to_le64(inode_get_mtime_sec(inode)); + ino->mtime_nsec = cpu_to_le32(inode_get_mtime_nsec(inode)); ino->uid = cpu_to_le32(i_uid_read(inode)); ino->gid = cpu_to_le32(i_gid_read(inode)); ino->mode = cpu_to_le32(inode->i_mode); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index b08fb28d16b5..366941d4a18a 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -142,10 +142,10 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) set_nlink(inode, le32_to_cpu(ino->nlink)); i_uid_write(inode, le32_to_cpu(ino->uid)); i_gid_write(inode, le32_to_cpu(ino->gid)); - inode->i_atime.tv_sec = (int64_t)le64_to_cpu(ino->atime_sec); - inode->i_atime.tv_nsec = le32_to_cpu(ino->atime_nsec); - inode->i_mtime.tv_sec = (int64_t)le64_to_cpu(ino->mtime_sec); - inode->i_mtime.tv_nsec = le32_to_cpu(ino->mtime_nsec); + inode_set_atime(inode, (int64_t)le64_to_cpu(ino->atime_sec), + le32_to_cpu(ino->atime_nsec)); + inode_set_mtime(inode, (int64_t)le64_to_cpu(ino->mtime_sec), + le32_to_cpu(ino->mtime_nsec)); inode_set_ctime(inode, (int64_t)le64_to_cpu(ino->ctime_sec), le32_to_cpu(ino->ctime_nsec)); inode->i_mode = le32_to_cpu(ino->mode); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index ebb3ad6b5e7e..62633816d7d0 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -2043,7 +2043,7 @@ ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf, size_t size); #ifdef CONFIG_UBIFS_FS_XATTR -extern const struct xattr_handler *ubifs_xattr_handlers[]; +extern const struct xattr_handler * const ubifs_xattr_handlers[]; ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size); void ubifs_evict_xattr_inode(struct ubifs_info *c, ino_t xattr_inum); int ubifs_purge_xattrs(struct inode *host); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 406c82eab513..0847db521984 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -735,7 +735,7 @@ static const struct xattr_handler ubifs_security_xattr_handler = { }; #endif -const struct xattr_handler *ubifs_xattr_handlers[] = { +const struct xattr_handler * const ubifs_xattr_handlers[] = { &ubifs_user_xattr_handler, &ubifs_trusted_xattr_handler, #ifdef CONFIG_UBIFS_FS_SECURITY diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 6b558cbbeb6b..5f1f969f4134 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -100,8 +100,8 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; else iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); - iinfo->i_crtime = inode->i_mtime; + simple_inode_init_ts(inode); + iinfo->i_crtime = inode_get_mtime(inode); if (unlikely(insert_inode_locked(inode) < 0)) { make_bad_inode(inode); iput(inode); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index a17a6184cc39..d8493449d4c5 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1296,7 +1296,7 @@ set_size: goto out_unlock; } update_time: - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); if (IS_SYNC(inode)) udf_sync_inode(inode); else @@ -1327,7 +1327,7 @@ static int udf_read_inode(struct inode *inode, bool hidden_inode) int bs = inode->i_sb->s_blocksize; int ret = -EIO; uint32_t uid, gid; - struct timespec64 ctime; + struct timespec64 ts; reread: if (iloc->partitionReferenceNum >= sbi->s_partitions) { @@ -1504,10 +1504,12 @@ reread: inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) << (inode->i_sb->s_blocksize_bits - 9); - udf_disk_stamp_to_time(&inode->i_atime, fe->accessTime); - udf_disk_stamp_to_time(&inode->i_mtime, fe->modificationTime); - udf_disk_stamp_to_time(&ctime, fe->attrTime); - inode_set_ctime_to_ts(inode, ctime); + udf_disk_stamp_to_time(&ts, fe->accessTime); + inode_set_atime_to_ts(inode, ts); + udf_disk_stamp_to_time(&ts, fe->modificationTime); + inode_set_mtime_to_ts(inode, ts); + udf_disk_stamp_to_time(&ts, fe->attrTime); + inode_set_ctime_to_ts(inode, ts); iinfo->i_unique = le64_to_cpu(fe->uniqueID); iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr); @@ -1519,11 +1521,13 @@ reread: inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) << (inode->i_sb->s_blocksize_bits - 9); - udf_disk_stamp_to_time(&inode->i_atime, efe->accessTime); - udf_disk_stamp_to_time(&inode->i_mtime, efe->modificationTime); + udf_disk_stamp_to_time(&ts, efe->accessTime); + inode_set_atime_to_ts(inode, ts); + udf_disk_stamp_to_time(&ts, efe->modificationTime); + inode_set_mtime_to_ts(inode, ts); + udf_disk_stamp_to_time(&ts, efe->attrTime); + inode_set_ctime_to_ts(inode, ts); udf_disk_stamp_to_time(&iinfo->i_crtime, efe->createTime); - udf_disk_stamp_to_time(&ctime, efe->attrTime); - inode_set_ctime_to_ts(inode, ctime); iinfo->i_unique = le64_to_cpu(efe->uniqueID); iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr); @@ -1798,8 +1802,8 @@ static int udf_update_inode(struct inode *inode, int do_sync) inode->i_sb->s_blocksize - sizeof(struct fileEntry)); fe->logicalBlocksRecorded = cpu_to_le64(lb_recorded); - udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime); - udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime); + udf_time_to_disk_stamp(&fe->accessTime, inode_get_atime(inode)); + udf_time_to_disk_stamp(&fe->modificationTime, inode_get_mtime(inode)); udf_time_to_disk_stamp(&fe->attrTime, inode_get_ctime(inode)); memset(&(fe->impIdent), 0, sizeof(struct regid)); strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER); @@ -1829,12 +1833,14 @@ static int udf_update_inode(struct inode *inode, int do_sync) cpu_to_le32(inode->i_sb->s_blocksize); } - udf_adjust_time(iinfo, inode->i_atime); - udf_adjust_time(iinfo, inode->i_mtime); + udf_adjust_time(iinfo, inode_get_atime(inode)); + udf_adjust_time(iinfo, inode_get_mtime(inode)); udf_adjust_time(iinfo, inode_get_ctime(inode)); - udf_time_to_disk_stamp(&efe->accessTime, inode->i_atime); - udf_time_to_disk_stamp(&efe->modificationTime, inode->i_mtime); + udf_time_to_disk_stamp(&efe->accessTime, + inode_get_atime(inode)); + udf_time_to_disk_stamp(&efe->modificationTime, + inode_get_mtime(inode)); udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime); udf_time_to_disk_stamp(&efe->attrTime, inode_get_ctime(inode)); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index ae55ab8859b6..3508ac484da3 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -365,7 +365,7 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode) *(__le32 *)((struct allocDescImpUse *)iter.fi.icb.impUse)->impUse = cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL); udf_fiiter_write_fi(&iter, NULL); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); udf_fiiter_release(&iter); udf_add_fid_counter(dir->i_sb, false, 1); @@ -471,7 +471,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir, udf_fiiter_release(&iter); udf_add_fid_counter(dir->i_sb, true, 1); inc_nlink(dir); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); d_instantiate_new(dentry, inode); @@ -523,8 +523,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) inode->i_size = 0; inode_dec_link_count(dir); udf_add_fid_counter(dir->i_sb, true, -1); - dir->i_mtime = inode_set_ctime_to_ts(dir, - inode_set_ctime_current(inode)); + inode_set_mtime_to_ts(dir, + inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); mark_inode_dirty(dir); ret = 0; end_rmdir: @@ -555,7 +555,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) set_nlink(inode, 1); } udf_fiiter_delete_entry(&iter); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); inode_dec_link_count(inode); udf_add_fid_counter(dir->i_sb, false, -1); @@ -748,7 +748,7 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, udf_add_fid_counter(dir->i_sb, false, 1); inode_set_ctime_current(inode); mark_inode_dirty(inode); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); ihold(inode); d_instantiate(dentry, inode); @@ -866,8 +866,8 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, udf_add_fid_counter(old_dir->i_sb, S_ISDIR(new_inode->i_mode), -1); } - old_dir->i_mtime = inode_set_ctime_current(old_dir); - new_dir->i_mtime = inode_set_ctime_current(new_dir); + inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir)); + inode_set_mtime_to_ts(new_dir, inode_set_ctime_current(new_dir)); mark_inode_dirty(old_dir); mark_inode_dirty(new_dir); diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index fd57f03b6c93..27c85d92d1dc 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -107,7 +107,7 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, ufs_commit_chunk(page, pos, len); ufs_put_page(page); if (update_times) - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); ufs_handle_dirsync(dir); } @@ -397,7 +397,7 @@ got_it: ufs_set_de_type(sb, de, inode->i_mode); ufs_commit_chunk(page, pos, rec_len); - dir->i_mtime = inode_set_ctime_current(dir); + inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); err = ufs_handle_dirsync(dir); @@ -539,7 +539,7 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir, pde->d_reclen = cpu_to_fs16(sb, to - from); dir->d_ino = 0; ufs_commit_chunk(page, pos, to - from); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); err = ufs_handle_dirsync(inode); out: diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index a1e7bd9d1f98..73531827ecee 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -292,7 +292,7 @@ cg_found: inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_blocks = 0; inode->i_generation = 0; - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); ufsi->i_flags = UFS_I(dir)->i_flags; ufsi->i_lastfrag = 0; ufsi->i_shadow = 0; diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 21a4779a2de5..338e4b97312f 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -579,13 +579,15 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) i_gid_write(inode, ufs_get_inode_gid(sb, ufs_inode)); inode->i_size = fs64_to_cpu(sb, ufs_inode->ui_size); - inode->i_atime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec); + inode_set_atime(inode, + (signed)fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec), + 0); inode_set_ctime(inode, (signed)fs32_to_cpu(sb, ufs_inode->ui_ctime.tv_sec), 0); - inode->i_mtime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec); - inode->i_mtime.tv_nsec = 0; - inode->i_atime.tv_nsec = 0; + inode_set_mtime(inode, + (signed)fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec), + 0); inode->i_blocks = fs32_to_cpu(sb, ufs_inode->ui_blocks); inode->i_generation = fs32_to_cpu(sb, ufs_inode->ui_gen); ufsi->i_flags = fs32_to_cpu(sb, ufs_inode->ui_flags); @@ -626,12 +628,12 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode) i_gid_write(inode, fs32_to_cpu(sb, ufs2_inode->ui_gid)); inode->i_size = fs64_to_cpu(sb, ufs2_inode->ui_size); - inode->i_atime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_atime); + inode_set_atime(inode, fs64_to_cpu(sb, ufs2_inode->ui_atime), + fs32_to_cpu(sb, ufs2_inode->ui_atimensec)); inode_set_ctime(inode, fs64_to_cpu(sb, ufs2_inode->ui_ctime), fs32_to_cpu(sb, ufs2_inode->ui_ctimensec)); - inode->i_mtime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_mtime); - inode->i_atime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_atimensec); - inode->i_mtime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_mtimensec); + inode_set_mtime(inode, fs64_to_cpu(sb, ufs2_inode->ui_mtime), + fs32_to_cpu(sb, ufs2_inode->ui_mtimensec)); inode->i_blocks = fs64_to_cpu(sb, ufs2_inode->ui_blocks); inode->i_generation = fs32_to_cpu(sb, ufs2_inode->ui_gen); ufsi->i_flags = fs32_to_cpu(sb, ufs2_inode->ui_flags); @@ -725,12 +727,14 @@ static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode) ufs_set_inode_gid(sb, ufs_inode, i_gid_read(inode)); ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size); - ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb, inode->i_atime.tv_sec); + ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb, + inode_get_atime_sec(inode)); ufs_inode->ui_atime.tv_usec = 0; ufs_inode->ui_ctime.tv_sec = cpu_to_fs32(sb, - inode_get_ctime(inode).tv_sec); + inode_get_ctime_sec(inode)); ufs_inode->ui_ctime.tv_usec = 0; - ufs_inode->ui_mtime.tv_sec = cpu_to_fs32(sb, inode->i_mtime.tv_sec); + ufs_inode->ui_mtime.tv_sec = cpu_to_fs32(sb, + inode_get_mtime_sec(inode)); ufs_inode->ui_mtime.tv_usec = 0; ufs_inode->ui_blocks = cpu_to_fs32(sb, inode->i_blocks); ufs_inode->ui_flags = cpu_to_fs32(sb, ufsi->i_flags); @@ -770,13 +774,15 @@ static void ufs2_update_inode(struct inode *inode, struct ufs2_inode *ufs_inode) ufs_inode->ui_gid = cpu_to_fs32(sb, i_gid_read(inode)); ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size); - ufs_inode->ui_atime = cpu_to_fs64(sb, inode->i_atime.tv_sec); - ufs_inode->ui_atimensec = cpu_to_fs32(sb, inode->i_atime.tv_nsec); - ufs_inode->ui_ctime = cpu_to_fs64(sb, inode_get_ctime(inode).tv_sec); + ufs_inode->ui_atime = cpu_to_fs64(sb, inode_get_atime_sec(inode)); + ufs_inode->ui_atimensec = cpu_to_fs32(sb, + inode_get_atime_nsec(inode)); + ufs_inode->ui_ctime = cpu_to_fs64(sb, inode_get_ctime_sec(inode)); ufs_inode->ui_ctimensec = cpu_to_fs32(sb, - inode_get_ctime(inode).tv_nsec); - ufs_inode->ui_mtime = cpu_to_fs64(sb, inode->i_mtime.tv_sec); - ufs_inode->ui_mtimensec = cpu_to_fs32(sb, inode->i_mtime.tv_nsec); + inode_get_ctime_nsec(inode)); + ufs_inode->ui_mtime = cpu_to_fs64(sb, inode_get_mtime_sec(inode)); + ufs_inode->ui_mtimensec = cpu_to_fs32(sb, + inode_get_mtime_nsec(inode)); ufs_inode->ui_blocks = cpu_to_fs64(sb, inode->i_blocks); ufs_inode->ui_flags = cpu_to_fs32(sb, ufsi->i_flags); @@ -1208,7 +1214,7 @@ static int ufs_truncate(struct inode *inode, loff_t size) truncate_setsize(inode, size); ufs_truncate_blocks(inode); - inode->i_mtime = inode_set_ctime_current(inode); + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); out: UFSD("EXIT: err %d\n", err); diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c index 83f20dd15522..72ac9320e6a3 100644 --- a/fs/vboxsf/utils.c +++ b/fs/vboxsf/utils.c @@ -126,12 +126,12 @@ int vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode, do_div(allocated, 512); inode->i_blocks = allocated; - inode->i_atime = ns_to_timespec64( - info->access_time.ns_relative_to_unix_epoch); + inode_set_atime_to_ts(inode, + ns_to_timespec64(info->access_time.ns_relative_to_unix_epoch)); inode_set_ctime_to_ts(inode, ns_to_timespec64(info->change_time.ns_relative_to_unix_epoch)); - inode->i_mtime = ns_to_timespec64( - info->modification_time.ns_relative_to_unix_epoch); + inode_set_mtime_to_ts(inode, + ns_to_timespec64(info->modification_time.ns_relative_to_unix_epoch)); return 0; } @@ -194,7 +194,7 @@ int vboxsf_inode_revalidate(struct dentry *dentry) struct vboxsf_sbi *sbi; struct vboxsf_inode *sf_i; struct shfl_fsobjinfo info; - struct timespec64 prev_mtime; + struct timespec64 mtime, prev_mtime; struct inode *inode; int err; @@ -202,7 +202,7 @@ int vboxsf_inode_revalidate(struct dentry *dentry) return -EINVAL; inode = d_inode(dentry); - prev_mtime = inode->i_mtime; + prev_mtime = inode_get_mtime(inode); sf_i = VBOXSF_I(inode); sbi = VBOXSF_SBI(dentry->d_sb); if (!sf_i->force_restat) { @@ -225,7 +225,8 @@ int vboxsf_inode_revalidate(struct dentry *dentry) * page-cache for it. Note this also gets triggered by our own writes, * this is unavoidable. */ - if (timespec64_compare(&inode->i_mtime, &prev_mtime) > 0) + mtime = inode_get_mtime(inode); + if (timespec64_compare(&mtime, &prev_mtime) > 0) invalidate_inode_pages2(inode->i_mapping); return 0; diff --git a/fs/xattr.c b/fs/xattr.c index efd4736bc94b..09d927603433 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -56,7 +56,7 @@ strcmp_prefix(const char *a, const char *a_prefix) static const struct xattr_handler * xattr_resolve_name(struct inode *inode, const char **name) { - const struct xattr_handler **handlers = inode->i_sb->s_xattr; + const struct xattr_handler * const *handlers = inode->i_sb->s_xattr; const struct xattr_handler *handler; if (!(inode->i_opflags & IOP_XATTR)) { @@ -162,7 +162,7 @@ xattr_permission(struct mnt_idmap *idmap, struct inode *inode, int xattr_supports_user_prefix(struct inode *inode) { - const struct xattr_handler **handlers = inode->i_sb->s_xattr; + const struct xattr_handler * const *handlers = inode->i_sb->s_xattr; const struct xattr_handler *handler; if (!(inode->i_opflags & IOP_XATTR)) { @@ -999,7 +999,7 @@ int xattr_list_one(char **buffer, ssize_t *remaining_size, const char *name) ssize_t generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { - const struct xattr_handler *handler, **handlers = dentry->d_sb->s_xattr; + const struct xattr_handler *handler, * const *handlers = dentry->d_sb->s_xattr; ssize_t remaining_size = buffer_size; int err = 0; diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index c9d653168ad0..ed0bc8cbc703 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -147,7 +147,7 @@ config XFS_ONLINE_SCRUB_STATS bool "XFS online metadata check usage data collection" default y depends on XFS_ONLINE_SCRUB - select FS_DEBUG + select XFS_DEBUG help If you say Y here, the kernel will gather usage data about the online metadata check subsystem. This includes the number diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index e9cc481b4ddf..f9f4d694640d 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -1001,6 +1001,12 @@ xfs_ag_shrink_space( error = -ENOSPC; goto resv_init_out; } + + /* Update perag geometry */ + pag->block_count -= delta; + __xfs_agino_range(pag->pag_mount, pag->block_count, &pag->agino_min, + &pag->agino_max); + xfs_ialloc_log_agi(*tpp, agibp, XFS_AGI_LENGTH); xfs_alloc_log_agf(*tpp, agfbp, XFS_AGF_LENGTH); return 0; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index a35781577cad..543f3748c2a3 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -220,8 +220,10 @@ xfs_inode_from_disk( * a time before epoch is converted to a time long after epoch * on 64 bit systems. */ - inode->i_atime = xfs_inode_from_disk_ts(from, from->di_atime); - inode->i_mtime = xfs_inode_from_disk_ts(from, from->di_mtime); + inode_set_atime_to_ts(inode, + xfs_inode_from_disk_ts(from, from->di_atime)); + inode_set_mtime_to_ts(inode, + xfs_inode_from_disk_ts(from, from->di_mtime)); inode_set_ctime_to_ts(inode, xfs_inode_from_disk_ts(from, from->di_ctime)); @@ -315,8 +317,8 @@ xfs_inode_to_disk( to->di_projid_lo = cpu_to_be16(ip->i_projid & 0xffff); to->di_projid_hi = cpu_to_be16(ip->i_projid >> 16); - to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime); - to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime); + to->di_atime = xfs_inode_to_disk_ts(ip, inode_get_atime(inode)); + to->di_mtime = xfs_inode_to_disk_ts(ip, inode_get_mtime(inode)); to->di_ctime = xfs_inode_to_disk_ts(ip, inode_get_ctime(inode)); to->di_nlink = cpu_to_be32(inode->i_nlink); to->di_gen = cpu_to_be32(inode->i_generation); diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 2420865f3007..a5100a11faf9 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -131,4 +131,26 @@ void xlog_check_buf_cancel_table(struct xlog *log); #define xlog_check_buf_cancel_table(log) do { } while (0) #endif +/* + * Transform a regular reservation into one suitable for recovery of a log + * intent item. + * + * Intent recovery only runs a single step of the transaction chain and defers + * the rest to a separate transaction. Therefore, we reduce logcount to 1 here + * to avoid livelocks if the log grant space is nearly exhausted due to the + * recovered intent pinning the tail. Keep the same logflags to avoid tripping + * asserts elsewhere. Struct copies abound below. + */ +static inline struct xfs_trans_res +xlog_recover_resv(const struct xfs_trans_res *r) +{ + struct xfs_trans_res ret = { + .tr_logres = r->tr_logres, + .tr_logcount = 1, + .tr_logflags = r->tr_logflags, + }; + + return ret; +} + #endif /* __XFS_LOG_RECOVER_H__ */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index fa180ab66b73..396648acb5be 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -970,6 +970,7 @@ xfs_rtfree_extent( xfs_mount_t *mp; /* file system mount structure */ xfs_fsblock_t sb; /* summary file block number */ struct xfs_buf *sumbp = NULL; /* summary file block buffer */ + struct timespec64 atime; mp = tp->t_mountp; @@ -999,7 +1000,10 @@ xfs_rtfree_extent( mp->m_sb.sb_rextents) { if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM; - *(uint64_t *)&VFS_I(mp->m_rbmip)->i_atime = 0; + + atime = inode_get_atime(VFS_I(mp->m_rbmip)); + *((uint64_t *)&atime) = 0; + inode_set_atime_to_ts(VFS_I(mp->m_rbmip), atime); xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); } return 0; diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 5e174685a77c..6264daaab37b 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -266,7 +266,8 @@ xfs_validate_sb_write( return -EFSCORRUPTED; } - if (xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { + if (!xfs_is_readonly(mp) && + xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { xfs_alert(mp, "Corruption detected in superblock read-only compatible features (0x%x)!", (sbp->sb_features_ro_compat & diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index ad22656376d3..70e97ea6eee7 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -62,12 +62,12 @@ xfs_trans_ichgtime( ASSERT(tp); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - /* If the mtime changes, then ctime must also change */ - ASSERT(flags & XFS_ICHGTIME_CHG); + tv = current_time(inode); - tv = inode_set_ctime_current(inode); if (flags & XFS_ICHGTIME_MOD) - inode->i_mtime = tv; + inode_set_mtime_to_ts(inode, tv); + if (flags & XFS_ICHGTIME_CHG) + inode_set_ctime_to_ts(inode, tv); if (flags & XFS_ICHGTIME_CREATE) ip->i_crtime = tv; } diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 7d3aa14d81b5..4849efcaa33a 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -588,6 +588,8 @@ out_nofix: out_teardown: error = xchk_teardown(sc, error); out_sc: + if (error != -ENOENT) + xchk_stats_merge(mp, sm, &run); kfree(sc); out: trace_xchk_done(XFS_I(file_inode(file)), sm, error); @@ -595,8 +597,6 @@ out: sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; error = 0; } - if (error != -ENOENT) - xchk_stats_merge(mp, sm, &run); return error; need_drain: error = xchk_teardown(sc, 0); diff --git a/fs/xfs/scrub/stats.c b/fs/xfs/scrub/stats.c index aeb92624176b..cd91db4a5548 100644 --- a/fs/xfs/scrub/stats.c +++ b/fs/xfs/scrub/stats.c @@ -185,7 +185,10 @@ xchk_stats_merge_one( { struct xchk_scrub_stats *css; - ASSERT(sm->sm_type < XFS_SCRUB_TYPE_NR); + if (sm->sm_type >= XFS_SCRUB_TYPE_NR) { + ASSERT(sm->sm_type < XFS_SCRUB_TYPE_NR); + return; + } css = &cs->cs_stats[sm->sm_type]; spin_lock(&css->css_lock); diff --git a/fs/xfs/scrub/xfile.c b/fs/xfs/scrub/xfile.c index d98e8e77c684..090c3ead43fd 100644 --- a/fs/xfs/scrub/xfile.c +++ b/fs/xfs/scrub/xfile.c @@ -10,7 +10,6 @@ #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" -#include "xfs_format.h" #include "scrub/xfile.h" #include "scrub/xfarray.h" #include "scrub/scrub.h" diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 5db87b34fb6e..89c7a9f4f930 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -333,7 +333,6 @@ xfs_attr_inactive( int error = 0; mp = dp->i_mount; - ASSERT(! XFS_NOT_DQATTACHED(mp, dp)); xfs_ilock(dp, lock_mode); if (!xfs_inode_has_attr_fork(dp)) diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c index 2788a6f2edcd..36fe2abb16e6 100644 --- a/fs/xfs/xfs_attr_item.c +++ b/fs/xfs/xfs_attr_item.c @@ -547,7 +547,7 @@ xfs_attri_item_recover( struct xfs_inode *ip; struct xfs_da_args *args; struct xfs_trans *tp; - struct xfs_trans_res tres; + struct xfs_trans_res resv; struct xfs_attri_log_format *attrp; struct xfs_attri_log_nameval *nv = attrip->attri_nameval; int error; @@ -618,8 +618,9 @@ xfs_attri_item_recover( goto out; } - xfs_init_attr_trans(args, &tres, &total); - error = xfs_trans_alloc(mp, &tres, total, 0, XFS_TRANS_RESERVE, &tp); + xfs_init_attr_trans(args, &resv, &total); + resv = xlog_recover_resv(&resv); + error = xfs_trans_alloc(mp, &resv, total, 0, XFS_TRANS_RESERVE, &tp); if (error) goto out; diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 7551c3ec4ea5..e736a0844c89 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -490,6 +490,7 @@ xfs_bui_item_recover( struct list_head *capture_list) { struct xfs_bmap_intent fake = { }; + struct xfs_trans_res resv; struct xfs_bui_log_item *buip = BUI_ITEM(lip); struct xfs_trans *tp; struct xfs_inode *ip = NULL; @@ -515,7 +516,8 @@ xfs_bui_item_recover( return error; /* Allocate transaction and do the work. */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, + resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); + error = xfs_trans_alloc(mp, &resv, XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp); if (error) goto err_rele; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index fcefab687285..40e0a1f1f753 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1644,7 +1644,7 @@ xfs_swap_extents( uint64_t f; int resblks = 0; unsigned int flags = 0; - struct timespec64 ctime; + struct timespec64 ctime, mtime; /* * Lock the inodes against other IO, page faults and truncate to @@ -1758,10 +1758,11 @@ xfs_swap_extents( * under it. */ ctime = inode_get_ctime(VFS_I(ip)); + mtime = inode_get_mtime(VFS_I(ip)); if ((sbp->bs_ctime.tv_sec != ctime.tv_sec) || (sbp->bs_ctime.tv_nsec != ctime.tv_nsec) || - (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || - (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { + (sbp->bs_mtime.tv_sec != mtime.tv_sec) || + (sbp->bs_mtime.tv_nsec != mtime.tv_nsec)) { error = -EBUSY; goto out_trans_cancel; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index c1ece4a08ff4..003e157241da 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1945,8 +1945,6 @@ void xfs_free_buftarg( struct xfs_buftarg *btp) { - struct block_device *bdev = btp->bt_bdev; - unregister_shrinker(&btp->bt_shrinker); ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); percpu_counter_destroy(&btp->bt_io_count); @@ -1954,8 +1952,8 @@ xfs_free_buftarg( fs_put_dax(btp->bt_daxdev, btp->bt_mount); /* the main block device is closed by kill_block_super */ - if (bdev != btp->bt_mount->m_super->s_bdev) - blkdev_put(bdev, btp->bt_mount->m_super); + if (btp->bt_bdev != btp->bt_mount->m_super->s_bdev) + bdev_release(btp->bt_bdev_handle); kmem_free(btp); } @@ -1990,16 +1988,15 @@ xfs_setsize_buftarg( */ STATIC int xfs_setsize_buftarg_early( - xfs_buftarg_t *btp, - struct block_device *bdev) + xfs_buftarg_t *btp) { - return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev)); + return xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev)); } struct xfs_buftarg * xfs_alloc_buftarg( struct xfs_mount *mp, - struct block_device *bdev) + struct bdev_handle *bdev_handle) { xfs_buftarg_t *btp; const struct dax_holder_operations *ops = NULL; @@ -2010,9 +2007,10 @@ xfs_alloc_buftarg( btp = kmem_zalloc(sizeof(*btp), KM_NOFS); btp->bt_mount = mp; - btp->bt_dev = bdev->bd_dev; - btp->bt_bdev = bdev; - btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off, + btp->bt_bdev_handle = bdev_handle; + btp->bt_dev = bdev_handle->bdev->bd_dev; + btp->bt_bdev = bdev_handle->bdev; + btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off, mp, ops); /* @@ -2022,7 +2020,7 @@ xfs_alloc_buftarg( ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ, DEFAULT_RATELIMIT_BURST); - if (xfs_setsize_buftarg_early(btp, bdev)) + if (xfs_setsize_buftarg_early(btp)) goto error_free; if (list_lru_init(&btp->bt_lru)) diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index df8f47953bb4..ada9d310b7d3 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -98,6 +98,7 @@ typedef unsigned int xfs_buf_flags_t; */ typedef struct xfs_buftarg { dev_t bt_dev; + struct bdev_handle *bt_bdev_handle; struct block_device *bt_bdev; struct dax_device *bt_daxdev; u64 bt_dax_part_off; @@ -364,7 +365,7 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) * Handling of buftargs. */ struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp, - struct block_device *bdev); + struct bdev_handle *bdev_handle); extern void xfs_free_buftarg(struct xfs_buftarg *); extern void xfs_buftarg_wait(struct xfs_buftarg *); extern void xfs_buftarg_drain(struct xfs_buftarg *); diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index afc4c78b9eed..d5787991bb5b 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Copyright (C) 2010 Red Hat, Inc. + * Copyright (C) 2010, 2023 Red Hat, Inc. * All Rights Reserved. */ #include "xfs.h" @@ -19,21 +19,147 @@ #include "xfs_log.h" #include "xfs_ag.h" -STATIC int -xfs_trim_extents( +/* + * Notes on an efficient, low latency fstrim algorithm + * + * We need to walk the filesystem free space and issue discards on the free + * space that meet the search criteria (size and location). We cannot issue + * discards on extents that might be in use, or are so recently in use they are + * still marked as busy. To serialise against extent state changes whilst we are + * gathering extents to trim, we must hold the AGF lock to lock out other + * allocations and extent free operations that might change extent state. + * + * However, we cannot just hold the AGF for the entire AG free space walk whilst + * we issue discards on each free space that is found. Storage devices can have + * extremely slow discard implementations (e.g. ceph RBD) and so walking a + * couple of million free extents and issuing synchronous discards on each + * extent can take a *long* time. Whilst we are doing this walk, nothing else + * can access the AGF, and we can stall transactions and hence the log whilst + * modifications wait for the AGF lock to be released. This can lead hung tasks + * kicking the hung task timer and rebooting the system. This is bad. + * + * Hence we need to take a leaf from the bulkstat playbook. It takes the AGI + * lock, gathers a range of inode cluster buffers that are allocated, drops the + * AGI lock and then reads all the inode cluster buffers and processes them. It + * loops doing this, using a cursor to keep track of where it is up to in the AG + * for each iteration to restart the INOBT lookup from. + * + * We can't do this exactly with free space - once we drop the AGF lock, the + * state of the free extent is out of our control and we cannot run a discard + * safely on it in this situation. Unless, of course, we've marked the free + * extent as busy and undergoing a discard operation whilst we held the AGF + * locked. + * + * This is exactly how online discard works - free extents are marked busy when + * they are freed, and once the extent free has been committed to the journal, + * the busy extent record is marked as "undergoing discard" and the discard is + * then issued on the free extent. Once the discard completes, the busy extent + * record is removed and the extent is able to be allocated again. + * + * In the context of fstrim, if we find a free extent we need to discard, we + * don't have to discard it immediately. All we need to do it record that free + * extent as being busy and under discard, and all the allocation routines will + * now avoid trying to allocate it. Hence if we mark the extent as busy under + * the AGF lock, we can safely discard it without holding the AGF lock because + * nothing will attempt to allocate that free space until the discard completes. + * + * This also allows us to issue discards asynchronously like we do with online + * discard, and so for fast devices fstrim will run much faster as we can have + * multiple discard operations in flight at once, as well as pipeline the free + * extent search so that it overlaps in flight discard IO. + */ + +struct workqueue_struct *xfs_discard_wq; + +static void +xfs_discard_endio_work( + struct work_struct *work) +{ + struct xfs_busy_extents *extents = + container_of(work, struct xfs_busy_extents, endio_work); + + xfs_extent_busy_clear(extents->mount, &extents->extent_list, false); + kmem_free(extents->owner); +} + +/* + * Queue up the actual completion to a thread to avoid IRQ-safe locking for + * pagb_lock. + */ +static void +xfs_discard_endio( + struct bio *bio) +{ + struct xfs_busy_extents *extents = bio->bi_private; + + INIT_WORK(&extents->endio_work, xfs_discard_endio_work); + queue_work(xfs_discard_wq, &extents->endio_work); + bio_put(bio); +} + +/* + * Walk the discard list and issue discards on all the busy extents in the + * list. We plug and chain the bios so that we only need a single completion + * call to clear all the busy extents once the discards are complete. + */ +int +xfs_discard_extents( + struct xfs_mount *mp, + struct xfs_busy_extents *extents) +{ + struct xfs_extent_busy *busyp; + struct bio *bio = NULL; + struct blk_plug plug; + int error = 0; + + blk_start_plug(&plug); + list_for_each_entry(busyp, &extents->extent_list, list) { + trace_xfs_discard_extent(mp, busyp->agno, busyp->bno, + busyp->length); + + error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, + XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), + XFS_FSB_TO_BB(mp, busyp->length), + GFP_NOFS, &bio); + if (error && error != -EOPNOTSUPP) { + xfs_info(mp, + "discard failed for extent [0x%llx,%u], error %d", + (unsigned long long)busyp->bno, + busyp->length, + error); + break; + } + } + + if (bio) { + bio->bi_private = extents; + bio->bi_end_io = xfs_discard_endio; + submit_bio(bio); + } else { + xfs_discard_endio_work(&extents->endio_work); + } + blk_finish_plug(&plug); + + return error; +} + + +static int +xfs_trim_gather_extents( struct xfs_perag *pag, xfs_daddr_t start, xfs_daddr_t end, xfs_daddr_t minlen, + struct xfs_alloc_rec_incore *tcur, + struct xfs_busy_extents *extents, uint64_t *blocks_trimmed) { struct xfs_mount *mp = pag->pag_mount; - struct block_device *bdev = mp->m_ddev_targp->bt_bdev; struct xfs_btree_cur *cur; struct xfs_buf *agbp; - struct xfs_agf *agf; int error; int i; + int batch = 100; /* * Force out the log. This means any transactions that might have freed @@ -45,20 +171,28 @@ xfs_trim_extents( error = xfs_alloc_read_agf(pag, NULL, 0, &agbp); if (error) return error; - agf = agbp->b_addr; cur = xfs_allocbt_init_cursor(mp, NULL, agbp, pag, XFS_BTNUM_CNT); /* - * Look up the longest btree in the AGF and start with it. + * Look up the extent length requested in the AGF and start with it. */ - error = xfs_alloc_lookup_ge(cur, 0, be32_to_cpu(agf->agf_longest), &i); + if (tcur->ar_startblock == NULLAGBLOCK) + error = xfs_alloc_lookup_ge(cur, 0, tcur->ar_blockcount, &i); + else + error = xfs_alloc_lookup_le(cur, tcur->ar_startblock, + tcur->ar_blockcount, &i); if (error) goto out_del_cursor; + if (i == 0) { + /* nothing of that length left in the AG, we are done */ + tcur->ar_blockcount = 0; + goto out_del_cursor; + } /* * Loop until we are done with all extents that are large - * enough to be worth discarding. + * enough to be worth discarding or we hit batch limits. */ while (i) { xfs_agblock_t fbno; @@ -73,7 +207,16 @@ xfs_trim_extents( error = -EFSCORRUPTED; break; } - ASSERT(flen <= be32_to_cpu(agf->agf_longest)); + + if (--batch <= 0) { + /* + * Update the cursor to point at this extent so we + * restart the next batch from this extent. + */ + tcur->ar_startblock = fbno; + tcur->ar_blockcount = flen; + break; + } /* * use daddr format for all range/len calculations as that is @@ -88,6 +231,7 @@ xfs_trim_extents( */ if (dlen < minlen) { trace_xfs_discard_toosmall(mp, pag->pag_agno, fbno, flen); + tcur->ar_blockcount = 0; break; } @@ -110,29 +254,103 @@ xfs_trim_extents( goto next_extent; } - trace_xfs_discard_extent(mp, pag->pag_agno, fbno, flen); - error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS); - if (error) - break; + xfs_extent_busy_insert_discard(pag, fbno, flen, + &extents->extent_list); *blocks_trimmed += flen; - next_extent: error = xfs_btree_decrement(cur, 0, &i); if (error) break; - if (fatal_signal_pending(current)) { - error = -ERESTARTSYS; - break; - } + /* + * If there's no more records in the tree, we are done. Set the + * cursor block count to 0 to indicate to the caller that there + * is no more extents to search. + */ + if (i == 0) + tcur->ar_blockcount = 0; } + /* + * If there was an error, release all the gathered busy extents because + * we aren't going to issue a discard on them any more. + */ + if (error) + xfs_extent_busy_clear(mp, &extents->extent_list, false); out_del_cursor: xfs_btree_del_cursor(cur, error); xfs_buf_relse(agbp); return error; } +static bool +xfs_trim_should_stop(void) +{ + return fatal_signal_pending(current) || freezing(current); +} + +/* + * Iterate the free list gathering extents and discarding them. We need a cursor + * for the repeated iteration of gather/discard loop, so use the longest extent + * we found in the last batch as the key to start the next. + */ +static int +xfs_trim_extents( + struct xfs_perag *pag, + xfs_daddr_t start, + xfs_daddr_t end, + xfs_daddr_t minlen, + uint64_t *blocks_trimmed) +{ + struct xfs_alloc_rec_incore tcur = { + .ar_blockcount = pag->pagf_longest, + .ar_startblock = NULLAGBLOCK, + }; + int error = 0; + + do { + struct xfs_busy_extents *extents; + + extents = kzalloc(sizeof(*extents), GFP_KERNEL); + if (!extents) { + error = -ENOMEM; + break; + } + + extents->mount = pag->pag_mount; + extents->owner = extents; + INIT_LIST_HEAD(&extents->extent_list); + + error = xfs_trim_gather_extents(pag, start, end, minlen, + &tcur, extents, blocks_trimmed); + if (error) { + kfree(extents); + break; + } + + /* + * We hand the extent list to the discard function here so the + * discarded extents can be removed from the busy extent list. + * This allows the discards to run asynchronously with gathering + * the next round of extents to discard. + * + * However, we must ensure that we do not reference the extent + * list after this function call, as it may have been freed by + * the time control returns to us. + */ + error = xfs_discard_extents(pag->pag_mount, extents); + if (error) + break; + + if (xfs_trim_should_stop()) + break; + + } while (tcur.ar_blockcount != 0); + + return error; + +} + /* * trim a range of the filesystem. * @@ -195,12 +413,12 @@ xfs_ioc_trim( for_each_perag_range(mp, agno, xfs_daddr_to_agno(mp, end), pag) { error = xfs_trim_extents(pag, start, end, minlen, &blocks_trimmed); - if (error) { + if (error) last_error = error; - if (error == -ERESTARTSYS) { - xfs_perag_rele(pag); - break; - } + + if (xfs_trim_should_stop()) { + xfs_perag_rele(pag); + break; } } diff --git a/fs/xfs/xfs_discard.h b/fs/xfs/xfs_discard.h index de92d9cc958f..2b1a85223a56 100644 --- a/fs/xfs/xfs_discard.h +++ b/fs/xfs/xfs_discard.h @@ -3,8 +3,10 @@ #define XFS_DISCARD_H 1 struct fstrim_range; -struct list_head; +struct xfs_mount; +struct xfs_busy_extents; -extern int xfs_ioc_trim(struct xfs_mount *, struct fstrim_range __user *); +int xfs_discard_extents(struct xfs_mount *mp, struct xfs_busy_extents *busy); +int xfs_ioc_trim(struct xfs_mount *mp, struct fstrim_range __user *fstrim); #endif /* XFS_DISCARD_H */ diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 1064c2342876..7cd09c3a82cb 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -146,6 +146,20 @@ xfs_nfs_get_inode( return ERR_PTR(error); } + /* + * Reload the incore unlinked list to avoid failure in inodegc. + * Use an unlocked check here because unrecovered unlinked inodes + * should be somewhat rare. + */ + if (xfs_inode_unlinked_incomplete(ip)) { + error = xfs_inode_reload_unlinked(ip); + if (error) { + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + xfs_irele(ip); + return ERR_PTR(error); + } + } + if (VFS_I(ip)->i_generation != generation) { xfs_irele(ip); return ERR_PTR(-ESTALE); diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 7c2fdc71e42d..9ecfdcdc752f 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -19,13 +19,13 @@ #include "xfs_log.h" #include "xfs_ag.h" -void -xfs_extent_busy_insert( - struct xfs_trans *tp, +static void +xfs_extent_busy_insert_list( struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len, - unsigned int flags) + unsigned int flags, + struct list_head *busy_list) { struct xfs_extent_busy *new; struct xfs_extent_busy *busyp; @@ -40,7 +40,7 @@ xfs_extent_busy_insert( new->flags = flags; /* trace before insert to be able to see failed inserts */ - trace_xfs_extent_busy(tp->t_mountp, pag->pag_agno, bno, len); + trace_xfs_extent_busy(pag->pag_mount, pag->pag_agno, bno, len); spin_lock(&pag->pagb_lock); rbp = &pag->pagb_tree.rb_node; @@ -62,10 +62,33 @@ xfs_extent_busy_insert( rb_link_node(&new->rb_node, parent, rbp); rb_insert_color(&new->rb_node, &pag->pagb_tree); - list_add(&new->list, &tp->t_busy); + /* always process discard lists in fifo order */ + list_add_tail(&new->list, busy_list); spin_unlock(&pag->pagb_lock); } +void +xfs_extent_busy_insert( + struct xfs_trans *tp, + struct xfs_perag *pag, + xfs_agblock_t bno, + xfs_extlen_t len, + unsigned int flags) +{ + xfs_extent_busy_insert_list(pag, bno, len, flags, &tp->t_busy); +} + +void +xfs_extent_busy_insert_discard( + struct xfs_perag *pag, + xfs_agblock_t bno, + xfs_extlen_t len, + struct list_head *busy_list) +{ + xfs_extent_busy_insert_list(pag, bno, len, XFS_EXTENT_BUSY_DISCARDED, + busy_list); +} + /* * Search for a busy extent within the range of the extent we are about to * allocate. You need to be holding the busy extent tree lock when calling diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h index c37bf87e6781..0639aab336f3 100644 --- a/fs/xfs/xfs_extent_busy.h +++ b/fs/xfs/xfs_extent_busy.h @@ -16,9 +16,6 @@ struct xfs_alloc_arg; /* * Busy block/extent entry. Indexed by a rbtree in perag to mark blocks that * have been freed but whose transactions aren't committed to disk yet. - * - * Note that we use the transaction ID to record the transaction, not the - * transaction structure itself. See xfs_extent_busy_insert() for details. */ struct xfs_extent_busy { struct rb_node rb_node; /* ag by-bno indexed search tree */ @@ -31,11 +28,32 @@ struct xfs_extent_busy { #define XFS_EXTENT_BUSY_SKIP_DISCARD 0x02 /* do not discard */ }; +/* + * List used to track groups of related busy extents all the way through + * to discard completion. + */ +struct xfs_busy_extents { + struct xfs_mount *mount; + struct list_head extent_list; + struct work_struct endio_work; + + /* + * Owner is the object containing the struct xfs_busy_extents to free + * once the busy extents have been processed. If only the + * xfs_busy_extents object needs freeing, then point this at itself. + */ + void *owner; +}; + void xfs_extent_busy_insert(struct xfs_trans *tp, struct xfs_perag *pag, xfs_agblock_t bno, xfs_extlen_t len, unsigned int flags); void +xfs_extent_busy_insert_discard(struct xfs_perag *pag, xfs_agblock_t bno, + xfs_extlen_t len, struct list_head *busy_list); + +void xfs_extent_busy_clear(struct xfs_mount *mp, struct list_head *list, bool do_discard); diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index f1a5ecf099aa..3fa8789820ad 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -660,6 +660,7 @@ xfs_efi_item_recover( struct xfs_log_item *lip, struct list_head *capture_list) { + struct xfs_trans_res resv; struct xfs_efi_log_item *efip = EFI_ITEM(lip); struct xfs_mount *mp = lip->li_log->l_mp; struct xfs_efd_log_item *efdp; @@ -683,7 +684,8 @@ xfs_efi_item_recover( } } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); + error = xfs_trans_alloc(mp, &resv, 0, 0, 0, &tp); if (error) return error; efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 10403ba9b58f..736e5545f584 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -565,6 +565,19 @@ err: } #endif /* CONFIG_XFS_RT */ +static inline bool +rmap_not_shareable(struct xfs_mount *mp, const struct xfs_rmap_irec *r) +{ + if (!xfs_has_reflink(mp)) + return true; + if (XFS_RMAP_NON_INODE_OWNER(r->rm_owner)) + return true; + if (r->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK | + XFS_RMAP_UNWRITTEN)) + return true; + return false; +} + /* Execute a getfsmap query against the regular data device. */ STATIC int __xfs_getfsmap_datadev( @@ -598,7 +611,6 @@ __xfs_getfsmap_datadev( * low to the fsmap low key and max out the high key to the end * of the AG. */ - info->low.rm_startblock = XFS_FSB_TO_AGBNO(mp, start_fsb); info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset); error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]); if (error) @@ -608,12 +620,9 @@ __xfs_getfsmap_datadev( /* Adjust the low key if we are continuing from where we left off. */ if (info->low.rm_blockcount == 0) { - /* empty */ - } else if (XFS_RMAP_NON_INODE_OWNER(info->low.rm_owner) || - (info->low.rm_flags & (XFS_RMAP_ATTR_FORK | - XFS_RMAP_BMBT_BLOCK | - XFS_RMAP_UNWRITTEN))) { - info->low.rm_startblock += info->low.rm_blockcount; + /* No previous record from which to continue */ + } else if (rmap_not_shareable(mp, &info->low)) { + /* Last record seen was an unshareable extent */ info->low.rm_owner = 0; info->low.rm_offset = 0; @@ -621,8 +630,10 @@ __xfs_getfsmap_datadev( if (XFS_FSB_TO_DADDR(mp, start_fsb) >= eofs) return 0; } else { + /* Last record seen was a shareable file data extent */ info->low.rm_offset += info->low.rm_blockcount; } + info->low.rm_startblock = XFS_FSB_TO_AGBNO(mp, start_fsb); info->high.rm_startblock = -1U; info->high.rm_owner = ULLONG_MAX; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index e541f5c0bc25..3c210ac83713 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -113,7 +113,7 @@ xfs_inode_alloc( INIT_LIST_HEAD(&ip->i_ioend_list); spin_lock_init(&ip->i_ioend_lock); ip->i_next_unlinked = NULLAGINO; - ip->i_prev_unlinked = NULLAGINO; + ip->i_prev_unlinked = 0; return ip; } @@ -443,7 +443,7 @@ xfs_inodegc_queue_all( int cpu; bool ret = false; - for_each_online_cpu(cpu) { + for_each_cpu(cpu, &mp->m_inodegc_cpumask) { gc = per_cpu_ptr(mp->m_inodegc, cpu); if (!llist_empty(&gc->list)) { mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0); @@ -463,7 +463,7 @@ xfs_inodegc_wait_all( int error = 0; flush_workqueue(mp->m_inodegc_wq); - for_each_online_cpu(cpu) { + for_each_cpu(cpu, &mp->m_inodegc_cpumask) { struct xfs_inodegc *gc; gc = per_cpu_ptr(mp->m_inodegc, cpu); @@ -1845,9 +1845,17 @@ xfs_inodegc_worker( struct xfs_inodegc, work); struct llist_node *node = llist_del_all(&gc->list); struct xfs_inode *ip, *n; + struct xfs_mount *mp = gc->mp; unsigned int nofs_flag; - ASSERT(gc->cpu == smp_processor_id()); + /* + * Clear the cpu mask bit and ensure that we have seen the latest + * update of the gc structure associated with this CPU. This matches + * with the release semantics used when setting the cpumask bit in + * xfs_inodegc_queue. + */ + cpumask_clear_cpu(gc->cpu, &mp->m_inodegc_cpumask); + smp_mb__after_atomic(); WRITE_ONCE(gc->items, 0); @@ -1862,7 +1870,7 @@ xfs_inodegc_worker( nofs_flag = memalloc_nofs_save(); ip = llist_entry(node, struct xfs_inode, i_gclist); - trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits)); + trace_xfs_inodegc_worker(mp, READ_ONCE(gc->shrinker_hits)); WRITE_ONCE(gc->shrinker_hits, 0); llist_for_each_entry_safe(ip, n, node, i_gclist) { @@ -2057,6 +2065,7 @@ xfs_inodegc_queue( struct xfs_inodegc *gc; int items; unsigned int shrinker_hits; + unsigned int cpu_nr; unsigned long queue_delay = 1; trace_xfs_inode_set_need_inactive(ip); @@ -2064,18 +2073,28 @@ xfs_inodegc_queue( ip->i_flags |= XFS_NEED_INACTIVE; spin_unlock(&ip->i_flags_lock); - gc = get_cpu_ptr(mp->m_inodegc); + cpu_nr = get_cpu(); + gc = this_cpu_ptr(mp->m_inodegc); llist_add(&ip->i_gclist, &gc->list); items = READ_ONCE(gc->items); WRITE_ONCE(gc->items, items + 1); shrinker_hits = READ_ONCE(gc->shrinker_hits); /* + * Ensure the list add is always seen by anyone who finds the cpumask + * bit set. This effectively gives the cpumask bit set operation + * release ordering semantics. + */ + smp_mb__before_atomic(); + if (!cpumask_test_cpu(cpu_nr, &mp->m_inodegc_cpumask)) + cpumask_test_and_set_cpu(cpu_nr, &mp->m_inodegc_cpumask); + + /* * We queue the work while holding the current CPU so that the work * is scheduled to run on this CPU. */ if (!xfs_is_inodegc_enabled(mp)) { - put_cpu_ptr(gc); + put_cpu(); return; } @@ -2085,7 +2104,7 @@ xfs_inodegc_queue( trace_xfs_inodegc_queue(mp, __return_address); mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work, queue_delay); - put_cpu_ptr(gc); + put_cpu(); if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) { trace_xfs_inodegc_throttle(mp, __return_address); @@ -2094,47 +2113,6 @@ xfs_inodegc_queue( } /* - * Fold the dead CPU inodegc queue into the current CPUs queue. - */ -void -xfs_inodegc_cpu_dead( - struct xfs_mount *mp, - unsigned int dead_cpu) -{ - struct xfs_inodegc *dead_gc, *gc; - struct llist_node *first, *last; - unsigned int count = 0; - - dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu); - cancel_delayed_work_sync(&dead_gc->work); - - if (llist_empty(&dead_gc->list)) - return; - - first = dead_gc->list.first; - last = first; - while (last->next) { - last = last->next; - count++; - } - dead_gc->list.first = NULL; - dead_gc->items = 0; - - /* Add pending work to current CPU */ - gc = get_cpu_ptr(mp->m_inodegc); - llist_add_batch(first, last, &gc->list); - count += READ_ONCE(gc->items); - WRITE_ONCE(gc->items, count); - - if (xfs_is_inodegc_enabled(mp)) { - trace_xfs_inodegc_queue(mp, __return_address); - mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work, - 0); - } - put_cpu_ptr(gc); -} - -/* * We set the inode flag atomically with the radix tree tag. Once we get tag * lookups on the radix tree, this inode flag can go away. * @@ -2195,7 +2173,7 @@ xfs_inodegc_shrinker_count( if (!xfs_is_inodegc_enabled(mp)) return 0; - for_each_online_cpu(cpu) { + for_each_cpu(cpu, &mp->m_inodegc_cpumask) { gc = per_cpu_ptr(mp->m_inodegc, cpu); if (!llist_empty(&gc->list)) return XFS_INODEGC_SHRINKER_COUNT; @@ -2220,7 +2198,7 @@ xfs_inodegc_shrinker_scan( trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address); - for_each_online_cpu(cpu) { + for_each_cpu(cpu, &mp->m_inodegc_cpumask) { gc = per_cpu_ptr(mp->m_inodegc, cpu); if (!llist_empty(&gc->list)) { unsigned int h = READ_ONCE(gc->shrinker_hits); diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 2fa6f2e09d07..905944dafbe5 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -79,7 +79,6 @@ void xfs_inodegc_push(struct xfs_mount *mp); int xfs_inodegc_flush(struct xfs_mount *mp); void xfs_inodegc_stop(struct xfs_mount *mp); void xfs_inodegc_start(struct xfs_mount *mp); -void xfs_inodegc_cpu_dead(struct xfs_mount *mp, unsigned int cpu); int xfs_inodegc_register_shrinker(struct xfs_mount *mp); #endif diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 360fe83a334f..36f5cf802c07 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -844,8 +844,8 @@ xfs_init_new_inode( ASSERT(ip->i_nblocks == 0); tv = inode_set_ctime_current(inode); - inode->i_mtime = tv; - inode->i_atime = tv; + inode_set_mtime_to_ts(inode, tv); + inode_set_atime_to_ts(inode, tv); ip->i_extsize = 0; ip->i_diflags = 0; @@ -1642,8 +1642,11 @@ xfs_inode_needs_inactive( if (VFS_I(ip)->i_mode == 0) return false; - /* If this is a read-only mount, don't do this (would generate I/O) */ - if (xfs_is_readonly(mp)) + /* + * If this is a read-only mount, don't do this (would generate I/O) + * unless we're in log recovery and cleaning the iunlinked list. + */ + if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log)) return false; /* If the log isn't running, push inodes straight to reclaim. */ @@ -1703,8 +1706,11 @@ xfs_inactive( mp = ip->i_mount; ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY)); - /* If this is a read-only mount, don't do this (would generate I/O) */ - if (xfs_is_readonly(mp)) + /* + * If this is a read-only mount, don't do this (would generate I/O) + * unless we're in log recovery and cleaning the iunlinked list. + */ + if (xfs_is_readonly(mp) && !xlog_recovery_needed(mp->m_log)) goto out; /* Metadata inodes require explicit resource cleanup. */ @@ -1736,9 +1742,21 @@ xfs_inactive( ip->i_df.if_nextents > 0 || ip->i_delayed_blks > 0)) truncate = 1; - error = xfs_qm_dqattach(ip); - if (error) - goto out; + if (xfs_iflags_test(ip, XFS_IQUOTAUNCHECKED)) { + /* + * If this inode is being inactivated during a quotacheck and + * has not yet been scanned by quotacheck, we /must/ remove + * the dquots from the inode before inactivation changes the + * block and inode counts. Most probably this is a result of + * reloading the incore iunlinked list to purge unrecovered + * unlinked inodes. + */ + xfs_qm_dqdetach(ip); + } else { + error = xfs_qm_dqattach(ip); + if (error) + goto out; + } if (S_ISLNK(VFS_I(ip)->i_mode)) error = xfs_inactive_symlink(ip); @@ -1822,12 +1840,17 @@ xfs_iunlink_lookup( rcu_read_lock(); ip = radix_tree_lookup(&pag->pag_ici_root, agino); + if (!ip) { + /* Caller can handle inode not being in memory. */ + rcu_read_unlock(); + return NULL; + } /* - * Inode not in memory or in RCU freeing limbo should not happen. - * Warn about this and let the caller handle the failure. + * Inode in RCU freeing limbo should not happen. Warn about this and + * let the caller handle the failure. */ - if (WARN_ON_ONCE(!ip || !ip->i_ino)) { + if (WARN_ON_ONCE(!ip->i_ino)) { rcu_read_unlock(); return NULL; } @@ -1836,7 +1859,10 @@ xfs_iunlink_lookup( return ip; } -/* Update the prev pointer of the next agino. */ +/* + * Update the prev pointer of the next agino. Returns -ENOLINK if the inode + * is not in cache. + */ static int xfs_iunlink_update_backref( struct xfs_perag *pag, @@ -1851,7 +1877,8 @@ xfs_iunlink_update_backref( ip = xfs_iunlink_lookup(pag, next_agino); if (!ip) - return -EFSCORRUPTED; + return -ENOLINK; + ip->i_prev_unlinked = prev_agino; return 0; } @@ -1895,6 +1922,64 @@ xfs_iunlink_update_bucket( return 0; } +/* + * Load the inode @next_agino into the cache and set its prev_unlinked pointer + * to @prev_agino. Caller must hold the AGI to synchronize with other changes + * to the unlinked list. + */ +STATIC int +xfs_iunlink_reload_next( + struct xfs_trans *tp, + struct xfs_buf *agibp, + xfs_agino_t prev_agino, + xfs_agino_t next_agino) +{ + struct xfs_perag *pag = agibp->b_pag; + struct xfs_mount *mp = pag->pag_mount; + struct xfs_inode *next_ip = NULL; + xfs_ino_t ino; + int error; + + ASSERT(next_agino != NULLAGINO); + +#ifdef DEBUG + rcu_read_lock(); + next_ip = radix_tree_lookup(&pag->pag_ici_root, next_agino); + ASSERT(next_ip == NULL); + rcu_read_unlock(); +#endif + + xfs_info_ratelimited(mp, + "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating recovery.", + next_agino, pag->pag_agno); + + /* + * Use an untrusted lookup just to be cautious in case the AGI has been + * corrupted and now points at a free inode. That shouldn't happen, + * but we'd rather shut down now since we're already running in a weird + * situation. + */ + ino = XFS_AGINO_TO_INO(mp, pag->pag_agno, next_agino); + error = xfs_iget(mp, tp, ino, XFS_IGET_UNTRUSTED, 0, &next_ip); + if (error) + return error; + + /* If this is not an unlinked inode, something is very wrong. */ + if (VFS_I(next_ip)->i_nlink != 0) { + error = -EFSCORRUPTED; + goto rele; + } + + next_ip->i_prev_unlinked = prev_agino; + trace_xfs_iunlink_reload_next(next_ip); +rele: + ASSERT(!(VFS_I(next_ip)->i_state & I_DONTCACHE)); + if (xfs_is_quotacheck_running(mp) && next_ip) + xfs_iflags_set(next_ip, XFS_IQUOTAUNCHECKED); + xfs_irele(next_ip); + return error; +} + static int xfs_iunlink_insert_inode( struct xfs_trans *tp, @@ -1926,6 +2011,8 @@ xfs_iunlink_insert_inode( * inode. */ error = xfs_iunlink_update_backref(pag, agino, next_agino); + if (error == -ENOLINK) + error = xfs_iunlink_reload_next(tp, agibp, agino, next_agino); if (error) return error; @@ -1941,6 +2028,7 @@ xfs_iunlink_insert_inode( } /* Point the head of the list to point to this inode. */ + ip->i_prev_unlinked = NULLAGINO; return xfs_iunlink_update_bucket(tp, pag, agibp, bucket_index, agino); } @@ -2020,6 +2108,9 @@ xfs_iunlink_remove_inode( */ error = xfs_iunlink_update_backref(pag, ip->i_prev_unlinked, ip->i_next_unlinked); + if (error == -ENOLINK) + error = xfs_iunlink_reload_next(tp, agibp, ip->i_prev_unlinked, + ip->i_next_unlinked); if (error) return error; @@ -2040,7 +2131,7 @@ xfs_iunlink_remove_inode( } ip->i_next_unlinked = NULLAGINO; - ip->i_prev_unlinked = NULLAGINO; + ip->i_prev_unlinked = 0; return error; } @@ -3529,3 +3620,117 @@ xfs_iunlock2_io_mmap( if (ip1 != ip2) inode_unlock(VFS_I(ip1)); } + +/* + * Reload the incore inode list for this inode. Caller should ensure that + * the link count cannot change, either by taking ILOCK_SHARED or otherwise + * preventing other threads from executing. + */ +int +xfs_inode_reload_unlinked_bucket( + struct xfs_trans *tp, + struct xfs_inode *ip) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_buf *agibp; + struct xfs_agi *agi; + struct xfs_perag *pag; + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); + xfs_agino_t prev_agino, next_agino; + unsigned int bucket; + bool foundit = false; + int error; + + /* Grab the first inode in the list */ + pag = xfs_perag_get(mp, agno); + error = xfs_ialloc_read_agi(pag, tp, &agibp); + xfs_perag_put(pag); + if (error) + return error; + + /* + * We've taken ILOCK_SHARED and the AGI buffer lock to stabilize the + * incore unlinked list pointers for this inode. Check once more to + * see if we raced with anyone else to reload the unlinked list. + */ + if (!xfs_inode_unlinked_incomplete(ip)) { + foundit = true; + goto out_agibp; + } + + bucket = agino % XFS_AGI_UNLINKED_BUCKETS; + agi = agibp->b_addr; + + trace_xfs_inode_reload_unlinked_bucket(ip); + + xfs_info_ratelimited(mp, + "Found unrecovered unlinked inode 0x%x in AG 0x%x. Initiating list recovery.", + agino, agno); + + prev_agino = NULLAGINO; + next_agino = be32_to_cpu(agi->agi_unlinked[bucket]); + while (next_agino != NULLAGINO) { + struct xfs_inode *next_ip = NULL; + + /* Found this caller's inode, set its backlink. */ + if (next_agino == agino) { + next_ip = ip; + next_ip->i_prev_unlinked = prev_agino; + foundit = true; + goto next_inode; + } + + /* Try in-memory lookup first. */ + next_ip = xfs_iunlink_lookup(pag, next_agino); + if (next_ip) + goto next_inode; + + /* Inode not in memory, try reloading it. */ + error = xfs_iunlink_reload_next(tp, agibp, prev_agino, + next_agino); + if (error) + break; + + /* Grab the reloaded inode. */ + next_ip = xfs_iunlink_lookup(pag, next_agino); + if (!next_ip) { + /* No incore inode at all? We reloaded it... */ + ASSERT(next_ip != NULL); + error = -EFSCORRUPTED; + break; + } + +next_inode: + prev_agino = next_agino; + next_agino = next_ip->i_next_unlinked; + } + +out_agibp: + xfs_trans_brelse(tp, agibp); + /* Should have found this inode somewhere in the iunlinked bucket. */ + if (!error && !foundit) + error = -EFSCORRUPTED; + return error; +} + +/* Decide if this inode is missing its unlinked list and reload it. */ +int +xfs_inode_reload_unlinked( + struct xfs_inode *ip) +{ + struct xfs_trans *tp; + int error; + + error = xfs_trans_alloc_empty(ip->i_mount, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (xfs_inode_unlinked_incomplete(ip)) + error = xfs_inode_reload_unlinked_bucket(tp, ip); + xfs_iunlock(ip, XFS_ILOCK_SHARED); + xfs_trans_cancel(tp); + + return error; +} diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 7547caf2f2ab..0c5bdb91152e 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -68,8 +68,21 @@ typedef struct xfs_inode { uint64_t i_diflags2; /* XFS_DIFLAG2_... */ struct timespec64 i_crtime; /* time created */ - /* unlinked list pointers */ + /* + * Unlinked list pointers. These point to the next and previous inodes + * in the AGI unlinked bucket list, respectively. These fields can + * only be updated with the AGI locked. + * + * i_next_unlinked caches di_next_unlinked. + */ xfs_agino_t i_next_unlinked; + + /* + * If the inode is not on an unlinked list, this field is zero. If the + * inode is the first element in an unlinked list, this field is + * NULLAGINO. Otherwise, i_prev_unlinked points to the previous inode + * in the unlinked list. + */ xfs_agino_t i_prev_unlinked; /* VFS inode */ @@ -81,6 +94,11 @@ typedef struct xfs_inode { struct list_head i_ioend_list; } xfs_inode_t; +static inline bool xfs_inode_on_unlinked_list(const struct xfs_inode *ip) +{ + return ip->i_prev_unlinked != 0; +} + static inline bool xfs_inode_has_attr_fork(struct xfs_inode *ip) { return ip->i_forkoff > 0; @@ -326,6 +344,9 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) */ #define XFS_INACTIVATING (1 << 13) +/* Quotacheck is running but inode has not been added to quota counts. */ +#define XFS_IQUOTAUNCHECKED (1 << 14) + /* All inode state flags related to inode reclaim. */ #define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \ XFS_IRECLAIM | \ @@ -340,7 +361,7 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) #define XFS_IRECLAIM_RESET_FLAGS \ (XFS_IRECLAIMABLE | XFS_IRECLAIM | \ XFS_IDIRTY_RELEASE | XFS_ITRUNCATED | XFS_NEED_INACTIVE | \ - XFS_INACTIVATING) + XFS_INACTIVATING | XFS_IQUOTAUNCHECKED) /* * Flags for inode locking. @@ -575,4 +596,13 @@ void xfs_end_io(struct work_struct *work); int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); +static inline bool +xfs_inode_unlinked_incomplete( + struct xfs_inode *ip) +{ + return VFS_I(ip)->i_nlink == 0 && !xfs_inode_on_unlinked_list(ip); +} +int xfs_inode_reload_unlinked_bucket(struct xfs_trans *tp, struct xfs_inode *ip); +int xfs_inode_reload_unlinked(struct xfs_inode *ip); + #endif /* __XFS_INODE_H__ */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 127b2410eb20..17c51804f9c6 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -526,8 +526,8 @@ xfs_inode_to_log_dinode( to->di_projid_hi = ip->i_projid >> 16; memset(to->di_pad3, 0, sizeof(to->di_pad3)); - to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime); - to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime); + to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode_get_atime(inode)); + to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode_get_mtime(inode)); to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode_get_ctime(inode)); to->di_nlink = inode->i_nlink; to->di_gen = inode->i_generation; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 2ededd3f6b8c..fdfda4fba12b 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -572,11 +572,11 @@ xfs_vn_getattr( stat->uid = vfsuid_into_kuid(vfsuid); stat->gid = vfsgid_into_kgid(vfsgid); stat->ino = ip->i_ino; - stat->atime = inode->i_atime; + stat->atime = inode_get_atime(inode); + stat->mtime = inode_get_mtime(inode); + stat->ctime = inode_get_ctime(inode); stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks); - fill_mg_cmtime(stat, request_mask, inode); - if (xfs_has_v3inodes(mp)) { if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; @@ -584,6 +584,11 @@ xfs_vn_getattr( } } + if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) { + stat->change_cookie = inode_query_iversion(inode); + stat->result_mask |= STATX_CHANGE_COOKIE; + } + /* * Note: If you add another clause to set an attribute flag, please * update attributes_mask below. @@ -917,7 +922,7 @@ xfs_setattr_size( if (newsize != oldsize && !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) { iattr->ia_ctime = iattr->ia_mtime = - current_mgtime(inode); + current_time(inode); iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME; } @@ -1062,9 +1067,9 @@ xfs_vn_update_time( now = current_time(inode); if (flags & S_MTIME) - inode->i_mtime = now; + inode_set_mtime_to_ts(inode, now); if (flags & S_ATIME) - inode->i_atime = now; + inode_set_atime_to_ts(inode, now); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, log_flags); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index c2093cb56092..14462614fcc8 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -80,6 +80,17 @@ xfs_bulkstat_one_int( if (error) goto out; + /* Reload the incore unlinked list to avoid failure in inodegc. */ + if (xfs_inode_unlinked_incomplete(ip)) { + error = xfs_inode_reload_unlinked_bucket(tp, ip); + if (error) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + xfs_irele(ip); + return error; + } + } + ASSERT(ip != NULL); ASSERT(ip->i_imap.im_blkno != 0); inode = VFS_I(ip); @@ -96,12 +107,12 @@ xfs_bulkstat_one_int( buf->bs_size = ip->i_disk_size; buf->bs_nlink = inode->i_nlink; - buf->bs_atime = inode->i_atime.tv_sec; - buf->bs_atime_nsec = inode->i_atime.tv_nsec; - buf->bs_mtime = inode->i_mtime.tv_sec; - buf->bs_mtime_nsec = inode->i_mtime.tv_nsec; - buf->bs_ctime = inode_get_ctime(inode).tv_sec; - buf->bs_ctime_nsec = inode_get_ctime(inode).tv_nsec; + buf->bs_atime = inode_get_atime_sec(inode); + buf->bs_atime_nsec = inode_get_atime_nsec(inode); + buf->bs_mtime = inode_get_mtime_sec(inode); + buf->bs_mtime_nsec = inode_get_mtime_nsec(inode); + buf->bs_ctime = inode_get_ctime_sec(inode); + buf->bs_ctime_nsec = inode_get_ctime_nsec(inode); buf->bs_gen = inode->i_generation; buf->bs_mode = inode->i_mode; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 79004d193e54..51c100c86177 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -715,15 +715,7 @@ xfs_log_mount( * just worked. */ if (!xfs_has_norecovery(mp)) { - /* - * log recovery ignores readonly state and so we need to clear - * mount-based read only state so it can write to disk. - */ - bool readonly = test_and_clear_bit(XFS_OPSTATE_READONLY, - &mp->m_opstate); error = xlog_recover(log); - if (readonly) - set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); if (error) { xfs_warn(mp, "log mount/recovery failed: error %d", error); @@ -772,7 +764,6 @@ xfs_log_mount_finish( struct xfs_mount *mp) { struct xlog *log = mp->m_log; - bool readonly; int error = 0; if (xfs_has_norecovery(mp)) { @@ -781,12 +772,6 @@ xfs_log_mount_finish( } /* - * log recovery ignores readonly state and so we need to clear - * mount-based read only state so it can write to disk. - */ - readonly = test_and_clear_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); - - /* * During the second phase of log recovery, we need iget and * iput to behave like they do for an active filesystem. * xfs_fs_drop_inode needs to be able to prevent the deletion @@ -835,8 +820,6 @@ xfs_log_mount_finish( xfs_buftarg_drain(mp->m_ddev_targp); clear_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate); - if (readonly) - set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate); /* Make sure the log is dead if we're returning failure. */ ASSERT(!error || xlog_is_shutdown(log)); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index eccbfb99e894..67a99d94701e 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -16,8 +16,7 @@ #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_trace.h" - -struct workqueue_struct *xfs_discard_wq; +#include "xfs_discard.h" /* * Allocate a new ticket. Failing to get a new ticket makes it really hard to @@ -103,7 +102,7 @@ xlog_cil_ctx_alloc(void) ctx = kmem_zalloc(sizeof(*ctx), KM_NOFS); INIT_LIST_HEAD(&ctx->committing); - INIT_LIST_HEAD(&ctx->busy_extents); + INIT_LIST_HEAD(&ctx->busy_extents.extent_list); INIT_LIST_HEAD(&ctx->log_items); INIT_LIST_HEAD(&ctx->lv_chain); INIT_WORK(&ctx->push_work, xlog_cil_push_work); @@ -124,7 +123,7 @@ xlog_cil_push_pcp_aggregate( struct xlog_cil_pcp *cilpcp; int cpu; - for_each_online_cpu(cpu) { + for_each_cpu(cpu, &ctx->cil_pcpmask) { cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); ctx->ticket->t_curr_res += cilpcp->space_reserved; @@ -132,7 +131,7 @@ xlog_cil_push_pcp_aggregate( if (!list_empty(&cilpcp->busy_extents)) { list_splice_init(&cilpcp->busy_extents, - &ctx->busy_extents); + &ctx->busy_extents.extent_list); } if (!list_empty(&cilpcp->log_items)) list_splice_init(&cilpcp->log_items, &ctx->log_items); @@ -165,7 +164,13 @@ xlog_cil_insert_pcp_aggregate( if (!test_and_clear_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags)) return; - for_each_online_cpu(cpu) { + /* + * We can race with other cpus setting cil_pcpmask. However, we've + * atomically cleared PCP_SPACE which forces other threads to add to + * the global space used count. cil_pcpmask is a superset of cilpcp + * structures that could have a nonzero space_used. + */ + for_each_cpu(cpu, &ctx->cil_pcpmask) { int old, prev; cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); @@ -554,6 +559,7 @@ xlog_cil_insert_items( int iovhdr_res = 0, split_res = 0, ctx_res = 0; int space_used; int order; + unsigned int cpu_nr; struct xlog_cil_pcp *cilpcp; ASSERT(tp); @@ -577,7 +583,12 @@ xlog_cil_insert_items( * can't be scheduled away between split sample/update operations that * are done without outside locking to serialise them. */ - cilpcp = get_cpu_ptr(cil->xc_pcp); + cpu_nr = get_cpu(); + cilpcp = this_cpu_ptr(cil->xc_pcp); + + /* Tell the future push that there was work added by this CPU. */ + if (!cpumask_test_cpu(cpu_nr, &ctx->cil_pcpmask)) + cpumask_test_and_set_cpu(cpu_nr, &ctx->cil_pcpmask); /* * We need to take the CIL checkpoint unit reservation on the first @@ -663,7 +674,7 @@ xlog_cil_insert_items( continue; list_add_tail(&lip->li_cil, &cilpcp->log_items); } - put_cpu_ptr(cilpcp); + put_cpu(); /* * If we've overrun the reservation, dump the tx details before we move @@ -696,76 +707,6 @@ xlog_cil_free_logvec( } } -static void -xlog_discard_endio_work( - struct work_struct *work) -{ - struct xfs_cil_ctx *ctx = - container_of(work, struct xfs_cil_ctx, discard_endio_work); - struct xfs_mount *mp = ctx->cil->xc_log->l_mp; - - xfs_extent_busy_clear(mp, &ctx->busy_extents, false); - kmem_free(ctx); -} - -/* - * Queue up the actual completion to a thread to avoid IRQ-safe locking for - * pagb_lock. Note that we need a unbounded workqueue, otherwise we might - * get the execution delayed up to 30 seconds for weird reasons. - */ -static void -xlog_discard_endio( - struct bio *bio) -{ - struct xfs_cil_ctx *ctx = bio->bi_private; - - INIT_WORK(&ctx->discard_endio_work, xlog_discard_endio_work); - queue_work(xfs_discard_wq, &ctx->discard_endio_work); - bio_put(bio); -} - -static void -xlog_discard_busy_extents( - struct xfs_mount *mp, - struct xfs_cil_ctx *ctx) -{ - struct list_head *list = &ctx->busy_extents; - struct xfs_extent_busy *busyp; - struct bio *bio = NULL; - struct blk_plug plug; - int error = 0; - - ASSERT(xfs_has_discard(mp)); - - blk_start_plug(&plug); - list_for_each_entry(busyp, list, list) { - trace_xfs_discard_extent(mp, busyp->agno, busyp->bno, - busyp->length); - - error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev, - XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno), - XFS_FSB_TO_BB(mp, busyp->length), - GFP_NOFS, &bio); - if (error && error != -EOPNOTSUPP) { - xfs_info(mp, - "discard failed for extent [0x%llx,%u], error %d", - (unsigned long long)busyp->bno, - busyp->length, - error); - break; - } - } - - if (bio) { - bio->bi_private = ctx; - bio->bi_end_io = xlog_discard_endio; - submit_bio(bio); - } else { - xlog_discard_endio_work(&ctx->discard_endio_work); - } - blk_finish_plug(&plug); -} - /* * Mark all items committed and clear busy extents. We free the log vector * chains in a separate pass so that we unpin the log items as quickly as @@ -795,8 +736,8 @@ xlog_cil_committed( xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, &ctx->lv_chain, ctx->start_lsn, abort); - xfs_extent_busy_sort(&ctx->busy_extents); - xfs_extent_busy_clear(mp, &ctx->busy_extents, + xfs_extent_busy_sort(&ctx->busy_extents.extent_list); + xfs_extent_busy_clear(mp, &ctx->busy_extents.extent_list, xfs_has_discard(mp) && !abort); spin_lock(&ctx->cil->xc_push_lock); @@ -805,10 +746,14 @@ xlog_cil_committed( xlog_cil_free_logvec(&ctx->lv_chain); - if (!list_empty(&ctx->busy_extents)) - xlog_discard_busy_extents(mp, ctx); - else - kmem_free(ctx); + if (!list_empty(&ctx->busy_extents.extent_list)) { + ctx->busy_extents.mount = mp; + ctx->busy_extents.owner = ctx; + xfs_discard_extents(mp, &ctx->busy_extents); + return; + } + + kmem_free(ctx); } void @@ -1791,38 +1736,6 @@ out_shutdown: } /* - * Move dead percpu state to the relevant CIL context structures. - * - * We have to lock the CIL context here to ensure that nothing is modifying - * the percpu state, either addition or removal. Both of these are done under - * the CIL context lock, so grabbing that exclusively here will ensure we can - * safely drain the cilpcp for the CPU that is dying. - */ -void -xlog_cil_pcp_dead( - struct xlog *log, - unsigned int cpu) -{ - struct xfs_cil *cil = log->l_cilp; - struct xlog_cil_pcp *cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); - struct xfs_cil_ctx *ctx; - - down_write(&cil->xc_ctx_lock); - ctx = cil->xc_ctx; - if (ctx->ticket) - ctx->ticket->t_curr_res += cilpcp->space_reserved; - cilpcp->space_reserved = 0; - - if (!list_empty(&cilpcp->log_items)) - list_splice_init(&cilpcp->log_items, &ctx->log_items); - if (!list_empty(&cilpcp->busy_extents)) - list_splice_init(&cilpcp->busy_extents, &ctx->busy_extents); - atomic_add(cilpcp->space_used, &ctx->space_used); - cilpcp->space_used = 0; - up_write(&cil->xc_ctx_lock); -} - -/* * Perform initial CIL structure initialisation. */ int diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 1bd2963e8fbd..fa3ad1d7b31c 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -6,6 +6,8 @@ #ifndef __XFS_LOG_PRIV_H__ #define __XFS_LOG_PRIV_H__ +#include "xfs_extent_busy.h" /* for struct xfs_busy_extents */ + struct xfs_buf; struct xlog; struct xlog_ticket; @@ -223,14 +225,19 @@ struct xfs_cil_ctx { struct xlog_in_core *commit_iclog; struct xlog_ticket *ticket; /* chkpt ticket */ atomic_t space_used; /* aggregate size of regions */ - struct list_head busy_extents; /* busy extents in chkpt */ + struct xfs_busy_extents busy_extents; struct list_head log_items; /* log items in chkpt */ struct list_head lv_chain; /* logvecs being pushed */ struct list_head iclog_entry; struct list_head committing; /* ctx committing list */ - struct work_struct discard_endio_work; struct work_struct push_work; atomic_t order_id; + + /* + * CPUs that could have added items to the percpu CIL data. Access is + * coordinated with xc_ctx_lock. + */ + struct cpumask cil_pcpmask; }; /* @@ -278,9 +285,6 @@ struct xfs_cil { wait_queue_head_t xc_push_wait; /* background push throttle */ void __percpu *xc_pcp; /* percpu CIL structures */ -#ifdef CONFIG_HOTPLUG_CPU - struct list_head xc_pcp_list; -#endif } ____cacheline_aligned_in_smp; /* xc_flags bit values */ @@ -705,9 +709,4 @@ xlog_kvmalloc( return p; } -/* - * CIL CPU dead notifier - */ -void xlog_cil_pcp_dead(struct xlog *log, unsigned int cpu); - #endif /* __XFS_LOG_PRIV_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 82c81d20459d..13b94d2e605b 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -329,7 +329,7 @@ xlog_find_verify_cycle( * try a smaller size. We need to be able to read at least * a log sector, or we're out of luck. */ - bufblks = 1 << ffs(nbblks); + bufblks = roundup_pow_of_two(nbblks); while (bufblks > log->l_logBBsize) bufblks >>= 1; while (!(buffer = xlog_alloc_buffer(log, bufblks))) { @@ -1528,7 +1528,7 @@ xlog_write_log_records( * a smaller size. We need to be able to write at least a * log sector, or we're out of luck. */ - bufblks = 1 << ffs(blocks); + bufblks = roundup_pow_of_two(blocks); while (bufblks > log->l_logBBsize) bufblks >>= 1; while (!(buffer = xlog_alloc_buffer(log, bufblks))) { diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a25eece3be2b..d19cca099bc3 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -60,6 +60,7 @@ struct xfs_error_cfg { * Per-cpu deferred inode inactivation GC lists. */ struct xfs_inodegc { + struct xfs_mount *mp; struct llist_head list; struct delayed_work work; int error; @@ -67,9 +68,7 @@ struct xfs_inodegc { /* approximate count of inodes in the list */ unsigned int items; unsigned int shrinker_hits; -#if defined(DEBUG) || defined(XFS_WARN) unsigned int cpu; -#endif }; /* @@ -98,7 +97,6 @@ typedef struct xfs_mount { xfs_buftarg_t *m_ddev_targp; /* saves taking the address */ xfs_buftarg_t *m_logdev_targp;/* ptr to log device */ xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */ - struct list_head m_mount_list; /* global mount list */ void __percpu *m_inodegc; /* percpu inodegc structures */ /* @@ -249,6 +247,9 @@ typedef struct xfs_mount { unsigned int *m_errortag; struct xfs_kobj m_errortag_kobj; #endif + + /* cpus that have inodes queued for inactivation */ + struct cpumask m_inodegc_cpumask; } xfs_mount_t; #define M_IGEO(mp) (&(mp)->m_ino_geo) @@ -404,6 +405,8 @@ __XFS_HAS_FEAT(nouuid, NOUUID) #define XFS_OPSTATE_WARNED_SHRINK 8 /* Kernel has logged a warning about logged xattr updates being used. */ #define XFS_OPSTATE_WARNED_LARP 9 +/* Mount time quotacheck is running */ +#define XFS_OPSTATE_QUOTACHECK_RUNNING 10 #define __XFS_IS_OPSTATE(name, NAME) \ static inline bool xfs_is_ ## name (struct xfs_mount *mp) \ @@ -426,6 +429,11 @@ __XFS_IS_OPSTATE(inode32, INODE32) __XFS_IS_OPSTATE(readonly, READONLY) __XFS_IS_OPSTATE(inodegc_enabled, INODEGC_ENABLED) __XFS_IS_OPSTATE(blockgc_enabled, BLOCKGC_ENABLED) +#ifdef CONFIG_XFS_QUOTA +__XFS_IS_OPSTATE(quotacheck_running, QUOTACHECK_RUNNING) +#else +# define xfs_is_quotacheck_running(mp) (false) +#endif static inline bool xfs_should_warn(struct xfs_mount *mp, long nr) @@ -443,7 +451,8 @@ xfs_should_warn(struct xfs_mount *mp, long nr) { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }, \ { (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \ { (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \ - { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" } + { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }, \ + { (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING), "quotacheck" } /* * Max and min values for mount-option defined I/O diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index 4a9bbd3fe120..a7daa522e00f 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -126,8 +126,8 @@ xfs_dax_notify_ddev_failure( struct xfs_rmap_irec ri_low = { }; struct xfs_rmap_irec ri_high; struct xfs_agf *agf; - xfs_agblock_t agend; struct xfs_perag *pag; + xfs_agblock_t range_agend; pag = xfs_perag_get(mp, agno); error = xfs_alloc_read_agf(pag, tp, 0, &agf_bp); @@ -148,10 +148,10 @@ xfs_dax_notify_ddev_failure( ri_high.rm_startblock = XFS_FSB_TO_AGBNO(mp, end_fsbno); agf = agf_bp->b_addr; - agend = min(be32_to_cpu(agf->agf_length), + range_agend = min(be32_to_cpu(agf->agf_length) - 1, ri_high.rm_startblock); notify.startblock = ri_low.rm_startblock; - notify.blockcount = agend - ri_low.rm_startblock; + notify.blockcount = range_agend + 1 - ri_low.rm_startblock; error = xfs_rmap_query_range(cur, &ri_low, &ri_high, xfs_dax_failure_fn, ¬ify); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 6abcc34fafd8..086e78a6143a 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1160,6 +1160,19 @@ xfs_qm_dqusage_adjust( if (error) return error; + /* + * Reload the incore unlinked list to avoid failure in inodegc. + * Use an unlocked check here because unrecovered unlinked inodes + * should be somewhat rare. + */ + if (xfs_inode_unlinked_incomplete(ip)) { + error = xfs_inode_reload_unlinked(ip); + if (error) { + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + goto error0; + } + } + ASSERT(ip->i_delayed_blks == 0); if (XFS_IS_REALTIME_INODE(ip)) { @@ -1173,6 +1186,7 @@ xfs_qm_dqusage_adjust( } nblks = (xfs_qcnt_t)ip->i_nblocks - rtblks; + xfs_iflags_clear(ip, XFS_IQUOTAUNCHECKED); /* * Add the (disk blocks and inode) resources occupied by this @@ -1319,8 +1333,10 @@ xfs_qm_quotacheck( flags |= XFS_PQUOTA_CHKD; } + xfs_set_quotacheck_running(mp); error = xfs_iwalk_threaded(mp, 0, 0, xfs_qm_dqusage_adjust, 0, true, NULL); + xfs_clear_quotacheck_running(mp); /* * On error, the inode walk may have partially populated the dquot diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index edd8587658d5..2d4444d61e98 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -477,6 +477,7 @@ xfs_cui_item_recover( struct xfs_log_item *lip, struct list_head *capture_list) { + struct xfs_trans_res resv; struct xfs_cui_log_item *cuip = CUI_ITEM(lip); struct xfs_cud_log_item *cudp; struct xfs_trans *tp; @@ -514,8 +515,9 @@ xfs_cui_item_recover( * doesn't fit. We need to reserve enough blocks to handle a * full btree split on either end of the refcount range. */ - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, - mp->m_refc_maxlevels * 2, 0, XFS_TRANS_RESERVE, &tp); + resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); + error = xfs_trans_alloc(mp, &resv, mp->m_refc_maxlevels * 2, 0, + XFS_TRANS_RESERVE, &tp); if (error) return error; diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 520c7ebdfed8..0e0e747028da 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -507,6 +507,7 @@ xfs_rui_item_recover( struct xfs_log_item *lip, struct list_head *capture_list) { + struct xfs_trans_res resv; struct xfs_rui_log_item *ruip = RUI_ITEM(lip); struct xfs_rud_log_item *rudp; struct xfs_trans *tp; @@ -530,8 +531,9 @@ xfs_rui_item_recover( } } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, - mp->m_rmap_maxlevels, 0, XFS_TRANS_RESERVE, &tp); + resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate); + error = xfs_trans_alloc(mp, &resv, mp->m_rmap_maxlevels, 0, + XFS_TRANS_RESERVE, &tp); if (error) return error; rudp = xfs_trans_get_rud(tp, ruip); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 16534e9873f6..2e1a4e5cd03d 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1420,25 +1420,26 @@ xfs_rtunmount_inodes( */ int /* error */ xfs_rtpick_extent( - xfs_mount_t *mp, /* file system mount point */ - xfs_trans_t *tp, /* transaction pointer */ - xfs_extlen_t len, /* allocation length (rtextents) */ - xfs_rtblock_t *pick) /* result rt extent */ -{ - xfs_rtblock_t b; /* result block */ - int log2; /* log of sequence number */ - uint64_t resid; /* residual after log removed */ - uint64_t seq; /* sequence number of file creation */ - uint64_t *seqp; /* pointer to seqno in inode */ + xfs_mount_t *mp, /* file system mount point */ + xfs_trans_t *tp, /* transaction pointer */ + xfs_extlen_t len, /* allocation length (rtextents) */ + xfs_rtblock_t *pick) /* result rt extent */ + { + xfs_rtblock_t b; /* result block */ + int log2; /* log of sequence number */ + uint64_t resid; /* residual after log removed */ + uint64_t seq; /* sequence number of file creation */ + struct timespec64 ts; /* temporary timespec64 storage */ ASSERT(xfs_isilocked(mp->m_rbmip, XFS_ILOCK_EXCL)); - seqp = (uint64_t *)&VFS_I(mp->m_rbmip)->i_atime; if (!(mp->m_rbmip->i_diflags & XFS_DIFLAG_NEWRTBM)) { mp->m_rbmip->i_diflags |= XFS_DIFLAG_NEWRTBM; - *seqp = 0; + seq = 0; + } else { + ts = inode_get_atime(VFS_I(mp->m_rbmip)); + seq = (uint64_t)ts.tv_sec; } - seq = *seqp; if ((log2 = xfs_highbit64(seq)) == -1) b = 0; else { @@ -1450,7 +1451,8 @@ xfs_rtpick_extent( if (b + len > mp->m_sb.sb_rextents) b = mp->m_sb.sb_rextents - len; } - *seqp = seq + 1; + ts.tv_sec = (time64_t)seq + 1; + inode_set_atime_to_ts(VFS_I(mp->m_rbmip), ts); xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); *pick = b; return 0; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 1f77014c6e1a..f0ae07828153 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -56,28 +56,6 @@ static struct kset *xfs_kset; /* top-level xfs sysfs dir */ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ #endif -#ifdef CONFIG_HOTPLUG_CPU -static LIST_HEAD(xfs_mount_list); -static DEFINE_SPINLOCK(xfs_mount_list_lock); - -static inline void xfs_mount_list_add(struct xfs_mount *mp) -{ - spin_lock(&xfs_mount_list_lock); - list_add(&mp->m_mount_list, &xfs_mount_list); - spin_unlock(&xfs_mount_list_lock); -} - -static inline void xfs_mount_list_del(struct xfs_mount *mp) -{ - spin_lock(&xfs_mount_list_lock); - list_del(&mp->m_mount_list); - spin_unlock(&xfs_mount_list_lock); -} -#else /* !CONFIG_HOTPLUG_CPU */ -static inline void xfs_mount_list_add(struct xfs_mount *mp) {} -static inline void xfs_mount_list_del(struct xfs_mount *mp) {} -#endif - enum xfs_dax_mode { XFS_DAX_INODE = 0, XFS_DAX_ALWAYS = 1, @@ -383,14 +361,15 @@ STATIC int xfs_blkdev_get( xfs_mount_t *mp, const char *name, - struct block_device **bdevp) + struct bdev_handle **handlep) { int error = 0; - *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE, - mp->m_super, &fs_holder_ops); - if (IS_ERR(*bdevp)) { - error = PTR_ERR(*bdevp); + *handlep = bdev_open_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE, + mp->m_super, &fs_holder_ops); + if (IS_ERR(*handlep)) { + error = PTR_ERR(*handlep); + *handlep = NULL; xfs_warn(mp, "Invalid device [%s], error=%d", name, error); } @@ -455,7 +434,7 @@ xfs_open_devices( { struct super_block *sb = mp->m_super; struct block_device *ddev = sb->s_bdev; - struct block_device *logdev = NULL, *rtdev = NULL; + struct bdev_handle *logdev_handle = NULL, *rtdev_handle = NULL; int error; /* @@ -468,17 +447,19 @@ xfs_open_devices( * Open real time and log devices - order is important. */ if (mp->m_logname) { - error = xfs_blkdev_get(mp, mp->m_logname, &logdev); + error = xfs_blkdev_get(mp, mp->m_logname, &logdev_handle); if (error) goto out_relock; } if (mp->m_rtname) { - error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev); + error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev_handle); if (error) goto out_close_logdev; - if (rtdev == ddev || rtdev == logdev) { + if (rtdev_handle->bdev == ddev || + (logdev_handle && + rtdev_handle->bdev == logdev_handle->bdev)) { xfs_warn(mp, "Cannot mount filesystem with identical rtdev and ddev/logdev."); error = -EINVAL; @@ -490,22 +471,25 @@ xfs_open_devices( * Setup xfs_mount buffer target pointers */ error = -ENOMEM; - mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev); + mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_handle); if (!mp->m_ddev_targp) goto out_close_rtdev; - if (rtdev) { - mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev); + if (rtdev_handle) { + mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_handle); if (!mp->m_rtdev_targp) goto out_free_ddev_targ; } - if (logdev && logdev != ddev) { - mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev); + if (logdev_handle && logdev_handle->bdev != ddev) { + mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_handle); if (!mp->m_logdev_targp) goto out_free_rtdev_targ; } else { mp->m_logdev_targp = mp->m_ddev_targp; + /* Handle won't be used, drop it */ + if (logdev_handle) + bdev_release(logdev_handle); } error = 0; @@ -519,11 +503,11 @@ out_relock: out_free_ddev_targ: xfs_free_buftarg(mp->m_ddev_targp); out_close_rtdev: - if (rtdev) - blkdev_put(rtdev, sb); + if (rtdev_handle) + bdev_release(rtdev_handle); out_close_logdev: - if (logdev && logdev != ddev) - blkdev_put(logdev, sb); + if (logdev_handle) + bdev_release(logdev_handle); goto out_relock; } @@ -1135,9 +1119,8 @@ xfs_inodegc_init_percpu( for_each_possible_cpu(cpu) { gc = per_cpu_ptr(mp->m_inodegc, cpu); -#if defined(DEBUG) || defined(XFS_WARN) gc->cpu = cpu; -#endif + gc->mp = mp; init_llist_head(&gc->list); gc->items = 0; gc->error = 0; @@ -1168,7 +1151,6 @@ xfs_fs_put_super( xfs_freesb(mp); xchk_mount_stats_free(mp); free_percpu(mp->m_stats.xs_stats); - xfs_mount_list_del(mp); xfs_inodegc_free_percpu(mp); xfs_destroy_percpu_counters(mp); xfs_destroy_mount_workqueues(mp); @@ -1577,13 +1559,6 @@ xfs_fs_fill_super( if (error) goto out_destroy_counters; - /* - * All percpu data structures requiring cleanup when a cpu goes offline - * must be allocated before adding this @mp to the cpu-dead handler's - * mount list. - */ - xfs_mount_list_add(mp); - /* Allocate stats memory before we do operations that might use it */ mp->m_stats.xs_stats = alloc_percpu(struct xfsstats); if (!mp->m_stats.xs_stats) { @@ -1781,7 +1756,6 @@ xfs_fs_fill_super( out_free_stats: free_percpu(mp->m_stats.xs_stats); out_destroy_inodegc: - xfs_mount_list_del(mp); xfs_inodegc_free_percpu(mp); out_destroy_counters: xfs_destroy_percpu_counters(mp); @@ -2065,7 +2039,7 @@ static struct file_system_type xfs_fs_type = { .init_fs_context = xfs_init_fs_context, .parameters = xfs_fs_parameters, .kill_sb = xfs_kill_sb, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("xfs"); @@ -2326,49 +2300,6 @@ xfs_destroy_workqueues(void) destroy_workqueue(xfs_alloc_wq); } -#ifdef CONFIG_HOTPLUG_CPU -static int -xfs_cpu_dead( - unsigned int cpu) -{ - struct xfs_mount *mp, *n; - - spin_lock(&xfs_mount_list_lock); - list_for_each_entry_safe(mp, n, &xfs_mount_list, m_mount_list) { - spin_unlock(&xfs_mount_list_lock); - xfs_inodegc_cpu_dead(mp, cpu); - xlog_cil_pcp_dead(mp->m_log, cpu); - spin_lock(&xfs_mount_list_lock); - } - spin_unlock(&xfs_mount_list_lock); - return 0; -} - -static int __init -xfs_cpu_hotplug_init(void) -{ - int error; - - error = cpuhp_setup_state_nocalls(CPUHP_XFS_DEAD, "xfs:dead", NULL, - xfs_cpu_dead); - if (error < 0) - xfs_alert(NULL, -"Failed to initialise CPU hotplug, error %d. XFS is non-functional.", - error); - return error; -} - -static void -xfs_cpu_hotplug_destroy(void) -{ - cpuhp_remove_state_nocalls(CPUHP_XFS_DEAD); -} - -#else /* !CONFIG_HOTPLUG_CPU */ -static inline int xfs_cpu_hotplug_init(void) { return 0; } -static inline void xfs_cpu_hotplug_destroy(void) {} -#endif - STATIC int __init init_xfs_fs(void) { @@ -2385,13 +2316,9 @@ init_xfs_fs(void) xfs_dir_startup(); - error = xfs_cpu_hotplug_init(); - if (error) - goto out; - error = xfs_init_caches(); if (error) - goto out_destroy_hp; + goto out; error = xfs_init_workqueues(); if (error) @@ -2475,8 +2402,6 @@ init_xfs_fs(void) xfs_destroy_workqueues(); out_destroy_caches: xfs_destroy_caches(); - out_destroy_hp: - xfs_cpu_hotplug_destroy(); out: return error; } @@ -2500,7 +2425,6 @@ exit_xfs_fs(void) xfs_destroy_workqueues(); xfs_destroy_caches(); xfs_uuid_table_free(); - xfs_cpu_hotplug_destroy(); } module_init(init_xfs_fs); diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 902c7f67a117..3926cf7f2a6e 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -3824,6 +3824,51 @@ TRACE_EVENT(xfs_iunlink_update_dinode, __entry->new_ptr) ); +TRACE_EVENT(xfs_iunlink_reload_next, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(xfs_agino_t, prev_agino) + __field(xfs_agino_t, next_agino) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); + __entry->prev_agino = ip->i_prev_unlinked; + __entry->next_agino = ip->i_next_unlinked; + ), + TP_printk("dev %d:%d agno 0x%x agino 0x%x prev_unlinked 0x%x next_unlinked 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino, + __entry->prev_agino, + __entry->next_agino) +); + +TRACE_EVENT(xfs_inode_reload_unlinked_bucket, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + ), + TP_fast_assign( + __entry->dev = ip->i_mount->m_super->s_dev; + __entry->agno = XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino); + __entry->agino = XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino); + ), + TP_printk("dev %d:%d agno 0x%x agino 0x%x bucket %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agino, + __entry->agino % XFS_AGI_UNLINKED_BUCKETS) +); + DECLARE_EVENT_CLASS(xfs_ag_inode_class, TP_PROTO(struct xfs_inode *ip), TP_ARGS(ip), diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 43e5c219aaed..987843f84d03 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -46,6 +46,17 @@ xfs_attr_grab_log_assist( if (xfs_sb_version_haslogxattrs(&mp->m_sb)) return 0; + /* + * Check if the filesystem featureset is new enough to set this log + * incompat feature bit. Strictly speaking, the minimum requirement is + * a V5 filesystem for the superblock field, but we'll require rmap + * or reflink to avoid having to deal with really old kernels. + */ + if (!xfs_has_reflink(mp) && !xfs_has_rmapbt(mp)) { + error = -EOPNOTSUPP; + goto drop_incompat; + } + /* Enable log-assisted xattrs. */ error = xfs_add_incompat_log_feature(mp, XFS_SB_FEAT_INCOMPAT_LOG_XATTRS); @@ -175,7 +186,7 @@ static const struct xattr_handler xfs_xattr_security_handler = { .set = xfs_xattr_set, }; -const struct xattr_handler *xfs_xattr_handlers[] = { +const struct xattr_handler * const xfs_xattr_handlers[] = { &xfs_xattr_user_handler, &xfs_xattr_trusted_handler, &xfs_xattr_security_handler, diff --git a/fs/xfs/xfs_xattr.h b/fs/xfs/xfs_xattr.h index 2b09133b1b9b..cec766cad26c 100644 --- a/fs/xfs/xfs_xattr.h +++ b/fs/xfs/xfs_xattr.h @@ -8,6 +8,6 @@ int xfs_attr_change(struct xfs_da_args *args); -extern const struct xattr_handler *xfs_xattr_handlers[]; +extern const struct xattr_handler * const xfs_xattr_handlers[]; #endif /* __XFS_XATTR_H__ */ diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 9d1a9808fbbb..e6a75401677d 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -658,8 +658,8 @@ static struct inode *zonefs_get_file_inode(struct inode *dir, inode->i_ino = ino; inode->i_mode = z->z_mode; - inode->i_mtime = inode->i_atime = inode_set_ctime_to_ts(inode, - inode_get_ctime(dir)); + inode_set_mtime_to_ts(inode, + inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, inode_get_ctime(dir)))); inode->i_uid = z->z_uid; inode->i_gid = z->z_gid; inode->i_size = z->z_wpoffset; @@ -695,8 +695,8 @@ static struct inode *zonefs_get_zgroup_inode(struct super_block *sb, inode->i_ino = ino; inode_init_owner(&nop_mnt_idmap, inode, root, S_IFDIR | 0555); inode->i_size = sbi->s_zgroup[ztype].g_nr_zones; - inode->i_mtime = inode->i_atime = inode_set_ctime_to_ts(inode, - inode_get_ctime(root)); + inode_set_mtime_to_ts(inode, + inode_set_atime_to_ts(inode, inode_set_ctime_to_ts(inode, inode_get_ctime(root)))); inode->i_private = &sbi->s_zgroup[ztype]; set_nlink(inode, 2); @@ -1319,7 +1319,7 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) inode->i_ino = bdev_nr_zones(sb->s_bdev); inode->i_mode = S_IFDIR | 0555; - inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); + simple_inode_init_ts(inode); inode->i_op = &zonefs_dir_inode_operations; inode->i_fop = &zonefs_dir_operations; inode->i_size = 2; |