diff options
Diffstat (limited to 'fs')
349 files changed, 6189 insertions, 4301 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 950cf61f118b..0d28ecf668d0 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -260,7 +260,7 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, inode_init_owner(&nop_mnt_idmap, inode, NULL, mode); inode->i_blocks = 0; inode->i_rdev = rdev; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); inode->i_mapping->a_ops = &v9fs_addr_operations; inode->i_private = NULL; @@ -1011,7 +1011,7 @@ v9fs_vfs_getattr(struct mnt_idmap *idmap, const struct path *path, p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } else if (v9ses->cache & CACHE_WRITEBACK) { if (S_ISREG(inode->i_mode)) { @@ -1032,7 +1032,7 @@ v9fs_vfs_getattr(struct mnt_idmap *idmap, const struct path *path, return PTR_ERR(st); v9fs_stat2inode(st, d_inode(dentry), dentry->d_sb, 0); - generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), stat); p9stat_free(st); kfree(st); @@ -1152,7 +1152,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, inode->i_atime.tv_sec = stat->atime; inode->i_mtime.tv_sec = stat->mtime; - inode->i_ctime.tv_sec = stat->mtime; + inode_set_ctime(inode, stat->mtime, 0); inode->i_uid = v9ses->dfltuid; inode->i_gid = v9ses->dfltgid; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 14510872ecc3..1312f68965ac 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -450,7 +450,7 @@ v9fs_vfs_getattr_dotl(struct mnt_idmap *idmap, p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) { - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } else if (v9ses->cache) { if (S_ISREG(inode->i_mode)) { @@ -475,7 +475,7 @@ v9fs_vfs_getattr_dotl(struct mnt_idmap *idmap, return PTR_ERR(st); v9fs_stat2inode_dotl(st, d_inode(dentry), 0); - generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), stat); /* Change block size to what the server returned */ stat->blksize = st->st_blksize; @@ -645,8 +645,8 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, inode->i_atime.tv_nsec = stat->st_atime_nsec; inode->i_mtime.tv_sec = stat->st_mtime_sec; inode->i_mtime.tv_nsec = stat->st_mtime_nsec; - inode->i_ctime.tv_sec = stat->st_ctime_sec; - inode->i_ctime.tv_nsec = stat->st_ctime_nsec; + inode_set_ctime(inode, stat->st_ctime_sec, + stat->st_ctime_nsec); inode->i_uid = stat->st_uid; inode->i_gid = stat->st_gid; set_nlink(inode, stat->st_nlink); @@ -668,8 +668,8 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, inode->i_mtime.tv_nsec = stat->st_mtime_nsec; } if (stat->st_result_mask & P9_STATS_CTIME) { - inode->i_ctime.tv_sec = stat->st_ctime_sec; - inode->i_ctime.tv_nsec = stat->st_ctime_nsec; + inode_set_ctime(inode, stat->st_ctime_sec, + stat->st_ctime_nsec); } if (stat->st_result_mask & P9_STATS_UID) inode->i_uid = stat->st_uid; diff --git a/fs/Kconfig b/fs/Kconfig index 18d034ec7953..7da21f563192 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -205,8 +205,8 @@ config TMPFS_XATTR Extended attributes are name:value pairs associated with inodes by the kernel or by users (see the attr(5) manual page for details). - Currently this enables support for the trusted.* and - security.* namespaces. + This enables support for the trusted.*, security.* and user.* + namespaces. You need this for POSIX ACL support on tmpfs. @@ -233,6 +233,18 @@ config TMPFS_INODE64 If unsure, say N. +config TMPFS_QUOTA + bool "Tmpfs quota support" + depends on TMPFS + select QUOTA + help + Quota support allows to set per user and group limits for tmpfs + usage. Say Y to enable quota support. Once enabled you can control + user and group quota enforcement with quota, usrquota and grpquota + mount options. + + If unsure, say N. + config ARCH_SUPPORTS_HUGETLBFS def_bool n diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index c3ac613d0975..20963002578a 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -270,7 +270,7 @@ adfs_iget(struct super_block *sb, struct object_info *obj) inode->i_mode = adfs_atts2mode(sb, inode); adfs_adfs2unix_time(&inode->i_mtime, inode); inode->i_atime = inode->i_mtime; - inode->i_ctime = inode->i_mtime; + inode_set_ctime_to_ts(inode, inode->i_mtime); if (S_ISDIR(inode->i_mode)) { inode->i_op = &adfs_dir_inode_operations; @@ -331,7 +331,7 @@ adfs_notify_change(struct mnt_idmap *idmap, struct dentry *dentry, if (ia_valid & ATTR_ATIME) inode->i_atime = attr->ia_atime; if (ia_valid & ATTR_CTIME) - inode->i_ctime = attr->ia_ctime; + inode_set_ctime_to_ts(inode, attr->ia_ctime); if (ia_valid & ATTR_MODE) { ADFS_I(inode)->attr = adfs_mode2atts(sb, inode, attr->ia_mode); inode->i_mode = adfs_atts2mode(sb, inode); diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 29f11e10a7c7..7ba93efc1143 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -60,7 +60,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh) mark_buffer_dirty_inode(dir_bh, dir); affs_brelse(dir_bh); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); inode_inc_iversion(dir); mark_inode_dirty(dir); @@ -114,7 +114,7 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh) affs_brelse(bh); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); inode_inc_iversion(dir); mark_inode_dirty(dir); @@ -315,7 +315,7 @@ affs_remove_header(struct dentry *dentry) else clear_nlink(inode); affs_unlock_link(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); done: diff --git a/fs/affs/file.c b/fs/affs/file.c index e43f2f007ac1..472e2bdd5349 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -15,6 +15,7 @@ #include <linux/uio.h> #include <linux/blkdev.h> +#include <linux/mpage.h> #include "affs.h" static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext); @@ -370,9 +371,10 @@ err_alloc: return -ENOSPC; } -static int affs_writepage(struct page *page, struct writeback_control *wbc) +static int affs_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - return block_write_full_page(page, affs_get_block, wbc); + return mpage_writepages(mapping, wbc, affs_get_block); } static int affs_read_folio(struct file *file, struct folio *folio) @@ -456,10 +458,11 @@ const struct address_space_operations affs_aops = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = affs_read_folio, - .writepage = affs_writepage, + .writepages = affs_writepages, .write_begin = affs_write_begin, .write_end = affs_write_end, .direct_IO = affs_direct_IO, + .migrate_folio = buffer_migrate_folio, .bmap = _affs_bmap }; @@ -835,9 +838,10 @@ const struct address_space_operations affs_aops_ofs = { .dirty_folio = block_dirty_folio, .invalidate_folio = block_invalidate_folio, .read_folio = affs_read_folio_ofs, - //.writepage = affs_writepage_ofs, + //.writepages = affs_writepages_ofs, .write_begin = affs_write_begin_ofs, - .write_end = affs_write_end_ofs + .write_end = affs_write_end_ofs, + .migrate_folio = filemap_migrate_folio, }; /* Free any preallocated blocks. */ diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 27f77a52c5c8..060746c63151 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -149,13 +149,13 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) break; } - inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec - = (be32_to_cpu(tail->change.days) * 86400LL + - be32_to_cpu(tail->change.mins) * 60 + - be32_to_cpu(tail->change.ticks) / 50 + - AFFS_EPOCH_DELTA) + - sys_tz.tz_minuteswest * 60; - inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_atime.tv_nsec = 0; + inode->i_mtime.tv_sec = inode->i_atime.tv_sec = + inode_set_ctime(inode, + (be32_to_cpu(tail->change.days) * 86400LL + + be32_to_cpu(tail->change.mins) * 60 + + be32_to_cpu(tail->change.ticks) / 50 + AFFS_EPOCH_DELTA) + + sys_tz.tz_minuteswest * 60, 0).tv_sec; + inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = 0; affs_brelse(bh); unlock_new_inode(inode); return inode; @@ -314,7 +314,7 @@ affs_new_inode(struct inode *dir) inode->i_gid = current_fsgid(); inode->i_ino = block; set_nlink(inode, 1); - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); atomic_set(&AFFS_I(inode)->i_opencnt, 0); AFFS_I(inode)->i_blkcnt = 0; AFFS_I(inode)->i_lc = NULL; diff --git a/fs/affs/namei.c b/fs/affs/namei.c index d12ccfd2a83d..2fe4a5832fcf 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -43,7 +43,7 @@ affs_get_toupper(struct super_block *sb) * Note: the dentry argument is the parent dentry. */ static inline int -__affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr, toupper_t toupper, bool notruncate) +__affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr, toupper_t fn, bool notruncate) { const u8 *name = qstr->name; unsigned long hash; @@ -57,7 +57,7 @@ __affs_hash_dentry(const struct dentry *dentry, struct qstr *qstr, toupper_t tou hash = init_name_hash(dentry); len = min(qstr->len, AFFSNAMEMAX); for (; len > 0; name++, len--) - hash = partial_name_hash(toupper(*name), hash); + hash = partial_name_hash(fn(*name), hash); qstr->hash = end_name_hash(hash); return 0; @@ -80,7 +80,7 @@ affs_intl_hash_dentry(const struct dentry *dentry, struct qstr *qstr) } static inline int __affs_compare_dentry(unsigned int len, - const char *str, const struct qstr *name, toupper_t toupper, + const char *str, const struct qstr *name, toupper_t fn, bool notruncate) { const u8 *aname = str; @@ -106,7 +106,7 @@ static inline int __affs_compare_dentry(unsigned int len, return 1; for (; len > 0; len--) - if (toupper(*aname++) != toupper(*bname++)) + if (fn(*aname++) != fn(*bname++)) return 1; return 0; @@ -135,7 +135,7 @@ affs_intl_compare_dentry(const struct dentry *dentry, */ static inline int -affs_match(struct dentry *dentry, const u8 *name2, toupper_t toupper) +affs_match(struct dentry *dentry, const u8 *name2, toupper_t fn) { const u8 *name = dentry->d_name.name; int len = dentry->d_name.len; @@ -148,7 +148,7 @@ affs_match(struct dentry *dentry, const u8 *name2, toupper_t toupper) return 0; for (name2++; len > 0; len--) - if (toupper(*name++) != toupper(*name2++)) + if (fn(*name++) != fn(*name2++)) return 0; return 1; } @@ -156,12 +156,12 @@ affs_match(struct dentry *dentry, const u8 *name2, toupper_t toupper) int affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len) { - toupper_t toupper = affs_get_toupper(sb); + toupper_t fn = affs_get_toupper(sb); u32 hash; hash = len = min(len, AFFSNAMEMAX); for (; len > 0; len--) - hash = (hash * 13 + toupper(*name++)) & 0x7ff; + hash = (hash * 13 + fn(*name++)) & 0x7ff; return hash % AFFS_SB(sb)->s_hashsize; } @@ -171,7 +171,7 @@ affs_find_entry(struct inode *dir, struct dentry *dentry) { struct super_block *sb = dir->i_sb; struct buffer_head *bh; - toupper_t toupper = affs_get_toupper(sb); + toupper_t fn = affs_get_toupper(sb); u32 key; pr_debug("%s(\"%pd\")\n", __func__, dentry); @@ -189,7 +189,7 @@ affs_find_entry(struct inode *dir, struct dentry *dentry) bh = affs_bread(sb, key); if (!bh) return ERR_PTR(-EIO); - if (affs_match(dentry, AFFS_TAIL(sb, bh)->name, toupper)) + if (affs_match(dentry, AFFS_TAIL(sb, bh)->name, fn)) return bh; key = be32_to_cpu(AFFS_TAIL(sb, bh)->hash_chain); } diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index d7d9402ff718..95bcbd7654d1 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -88,7 +88,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) set_nlink(inode, 2); inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; - inode->i_ctime = inode->i_atime = inode->i_mtime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); inode->i_blocks = 0; inode->i_generation = 0; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 866bab860a88..1c794a1896aa 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -90,7 +90,7 @@ static int afs_inode_init_from_status(struct afs_operation *op, vnode->status = *status; t = status->mtime_client; - inode->i_ctime = t; + inode_set_ctime_to_ts(inode, t); inode->i_mtime = t; inode->i_atime = t; inode->i_flags |= S_NOATIME; @@ -206,7 +206,7 @@ static void afs_apply_status(struct afs_operation *op, t = status->mtime_client; inode->i_mtime = t; if (vp->update_ctime) - inode->i_ctime = op->ctime; + inode_set_ctime_to_ts(inode, op->ctime); if (vnode->status.data_version != status->data_version) data_changed = true; @@ -252,7 +252,7 @@ static void afs_apply_status(struct afs_operation *op, vnode->netfs.remote_i_size = status->size; if (change_size) { afs_set_i_size(vnode, status->size); - inode->i_ctime = t; + inode_set_ctime_to_ts(inode, t); inode->i_atime = t; } } @@ -773,7 +773,7 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path, do { read_seqbegin_or_lock(&vnode->cb_lock, &seq); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) && stat->nlink > 0) stat->nlink -= 1; @@ -1447,13 +1447,8 @@ static void aio_complete_rw(struct kiocb *kiocb, long res) if (kiocb->ki_flags & IOCB_WRITE) { struct inode *inode = file_inode(kiocb->ki_filp); - /* - * Tell lockdep we inherited freeze protection from submission - * thread. - */ if (S_ISREG(inode->i_mode)) - __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); - file_end_write(kiocb->ki_filp); + kiocb_end_write(kiocb); } iocb->ki_res.res = res; @@ -1581,17 +1576,8 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb, return ret; ret = rw_verify_area(WRITE, file, &req->ki_pos, iov_iter_count(&iter)); if (!ret) { - /* - * Open-code file_start_write here to grab freeze protection, - * which will be released by another thread in - * aio_complete_rw(). Fool lockdep by telling it the lock got - * released so that it doesn't complain about the held lock when - * we return to userspace. - */ - if (S_ISREG(file_inode(file)->i_mode)) { - sb_start_write(file_inode(file)->i_sb); - __sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE); - } + if (S_ISREG(file_inode(file)->i_mode)) + kiocb_start_write(req); req->ki_flags |= IOCB_WRITE; aio_rw_done(req, call_write_iter(file, req, &iter)); } diff --git a/fs/attr.c b/fs/attr.c index d60dc1edb526..a8ae5f6d9b16 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -312,7 +312,7 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode, if (ia_valid & ATTR_MTIME) inode->i_mtime = attr->ia_mtime; if (ia_valid & ATTR_CTIME) - inode->i_ctime = attr->ia_ctime; + inode_set_ctime_to_ts(inode, attr->ia_ctime); if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; if (!in_group_or_capable(idmap, inode, @@ -394,9 +394,25 @@ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry, return error; if ((ia_valid & ATTR_MODE)) { - umode_t amode = attr->ia_mode; + /* + * Don't allow changing the mode of symlinks: + * + * (1) The vfs doesn't take the mode of symlinks into account + * during permission checking. + * (2) This has never worked correctly. Most major filesystems + * did return EOPNOTSUPP due to interactions with POSIX ACLs + * but did still updated the mode of the symlink. + * This inconsistency led system call wrapper providers such + * as libc to block changing the mode of symlinks with + * EOPNOTSUPP already. + * (3) To even do this in the first place one would have to use + * specific file descriptors and quite some effort. + */ + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + /* Flag setting protected by i_mutex */ - if (is_sxid(amode)) + if (is_sxid(attr->ia_mode)) inode->i_flags &= ~S_NOSEC; } diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index affa70360b1f..2b49662ed237 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -370,7 +370,7 @@ struct inode *autofs_get_inode(struct super_block *sb, umode_t mode) inode->i_uid = d_inode(sb->s_root)->i_uid; inode->i_gid = d_inode(sb->s_root)->i_gid; } - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); inode->i_ino = get_next_ino(); if (S_ISDIR(mode)) { diff --git a/fs/autofs/root.c b/fs/autofs/root.c index 93046c9dc461..512b9a26c63d 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -600,7 +600,7 @@ static int autofs_dir_symlink(struct mnt_idmap *idmap, p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count++; - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); return 0; } @@ -633,7 +633,7 @@ static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry) d_inode(dentry)->i_size = 0; clear_nlink(d_inode(dentry)); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); spin_lock(&sbi->lookup_lock); __autofs_add_expiring(dentry); @@ -749,7 +749,7 @@ static int autofs_dir_mkdir(struct mnt_idmap *idmap, p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count++; inc_nlink(dir); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); return 0; } diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c index 54c1f8b8b075..33dd4660d82f 100644 --- a/fs/autofs/waitq.c +++ b/fs/autofs/waitq.c @@ -32,8 +32,9 @@ void autofs_catatonic_mode(struct autofs_sb_info *sbi) wq->status = -ENOENT; /* Magic is gone - report failure */ kfree(wq->name.name - wq->offset); wq->name.name = NULL; - wq->wait_ctr--; - wake_up_interruptible(&wq->queue); + wake_up(&wq->queue); + if (!--wq->wait_ctr) + kfree(wq); wq = nwq; } fput(sbi->pipe); /* Close the pipe */ diff --git a/fs/bad_inode.c b/fs/bad_inode.c index db649487d58c..83f9566c973b 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -133,8 +133,7 @@ static int bad_inode_fiemap(struct inode *inode, return -EIO; } -static int bad_inode_update_time(struct inode *inode, struct timespec64 *time, - int flags) +static int bad_inode_update_time(struct inode *inode, int flags) { return -EIO; } @@ -209,8 +208,7 @@ void make_bad_inode(struct inode *inode) remove_inode_hash(inode); inode->i_mode = S_IFREG; - inode->i_atime = inode->i_mtime = inode->i_ctime = - current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); inode->i_op = &bad_inode_ops; inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &bad_file_ops; diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index eee9237386e2..9a16a51fbb88 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -363,7 +363,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) inode->i_mtime.tv_sec = fs64_to_cpu(sb, raw_inode->last_modified_time) >> 16; inode->i_mtime.tv_nsec = 0; /* lower 16 bits are not a time */ - inode->i_ctime = inode->i_mtime; + inode_set_ctime_to_ts(inode, inode->i_mtime); inode->i_atime = inode->i_mtime; befs_ino->i_inode_num = fsrun_to_cpu(sb, raw_inode->inode_num); diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 040d5140e426..12b8af04dcb3 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -97,7 +97,7 @@ static int bfs_create(struct mnt_idmap *idmap, struct inode *dir, set_bit(ino, info->si_imap); info->si_freei--; inode_init_owner(&nop_mnt_idmap, inode, dir, mode); - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); inode->i_blocks = 0; inode->i_op = &bfs_file_inops; inode->i_fop = &bfs_file_operations; @@ -158,7 +158,7 @@ static int bfs_link(struct dentry *old, struct inode *dir, return err; } inc_nlink(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); ihold(inode); d_instantiate(new, inode); @@ -187,9 +187,9 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry) } de->ino = 0; mark_buffer_dirty_inode(bh, dir); - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); - inode->i_ctime = dir->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); inode_dec_link_count(inode); error = 0; @@ -240,10 +240,10 @@ static int bfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, goto end_rename; } old_de->ino = 0; - old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); + old_dir->i_mtime = inode_set_ctime_current(old_dir); mark_inode_dirty(old_dir); if (new_inode) { - new_inode->i_ctime = current_time(new_inode); + inode_set_ctime_current(new_inode); inode_dec_link_count(new_inode); } mark_buffer_dirty_inode(old_bh, old_dir); @@ -292,9 +292,9 @@ static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino) pos = (block - sblock) * BFS_BSIZE + off; if (pos >= dir->i_size) { dir->i_size += BFS_DIRENT_SIZE; - dir->i_ctime = current_time(dir); + inode_set_ctime_current(dir); } - dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); de->ino = cpu_to_le16((u16)ino); for (i = 0; i < BFS_NAMELEN; i++) diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 1926bec2c850..e6a76ae9eb44 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -82,10 +82,9 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino) inode->i_blocks = BFS_FILEBLOCKS(di); inode->i_atime.tv_sec = le32_to_cpu(di->i_atime); inode->i_mtime.tv_sec = le32_to_cpu(di->i_mtime); - inode->i_ctime.tv_sec = le32_to_cpu(di->i_ctime); + inode_set_ctime(inode, le32_to_cpu(di->i_ctime), 0); inode->i_atime.tv_nsec = 0; inode->i_mtime.tv_nsec = 0; - inode->i_ctime.tv_nsec = 0; brelse(bh); unlock_new_inode(inode); @@ -143,7 +142,7 @@ static int bfs_write_inode(struct inode *inode, struct writeback_control *wbc) di->i_nlink = cpu_to_le32(inode->i_nlink); di->i_atime = cpu_to_le32(inode->i_atime.tv_sec); di->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); - di->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); + di->i_ctime = cpu_to_le32(inode_get_ctime(inode).tv_sec); i_sblock = BFS_I(inode)->i_sblock; di->i_sblock = cpu_to_le32(i_sblock); di->i_eblock = cpu_to_le32(BFS_I(inode)->i_eblock); diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index bb202ad369d5..e0108d17b085 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -547,8 +547,7 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode) if (inode) { inode->i_ino = get_next_ino(); inode->i_mode = mode; - inode->i_atime = inode->i_mtime = inode->i_ctime = - current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); } return inode; } diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 66fa9ab2c046..3282adc84d52 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -49,9 +49,11 @@ config BTRFS_FS_POSIX_ACL If you don't know what Access Control Lists are, say N config BTRFS_FS_CHECK_INTEGRITY - bool "Btrfs with integrity check tool compiled in (DANGEROUS)" + bool "Btrfs with integrity check tool compiled in (DEPRECATED)" depends on BTRFS_FS help + This feature has been deprecated and will be removed in 6.7. + Adds code that examines all block write requests (including writes of the super block). The goal is to verify that the state of the filesystem on disk is always consistent, i.e., diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index ceadfc5d6c66..8cfc8214109c 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -3,6 +3,8 @@ #ifndef BTRFS_ACCESSORS_H #define BTRFS_ACCESSORS_H +#include <linux/stddef.h> + struct btrfs_map_token { struct extent_buffer *eb; char *kaddr; @@ -34,13 +36,13 @@ static inline void put_unaligned_le8(u8 val, void *p) read_extent_buffer(eb, (char *)(result), \ ((unsigned long)(ptr)) + \ offsetof(type, member), \ - sizeof(((type *)0)->member))) + sizeof_field(type, member))) #define write_eb_member(eb, ptr, type, member, result) (\ write_extent_buffer(eb, (char *)(result), \ ((unsigned long)(ptr)) + \ offsetof(type, member), \ - sizeof(((type *)0)->member))) + sizeof_field(type, member))) #define DECLARE_BTRFS_SETGET_BITS(bits) \ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ @@ -62,25 +64,25 @@ DECLARE_BTRFS_SETGET_BITS(64) static inline u##bits btrfs_##name(const struct extent_buffer *eb, \ const type *s) \ { \ - static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ return btrfs_get_##bits(eb, s, offsetof(type, member)); \ } \ static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \ u##bits val) \ { \ - static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ btrfs_set_##bits(eb, s, offsetof(type, member), val); \ } \ static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \ const type *s) \ { \ - static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ return btrfs_get_token_##bits(token, s, offsetof(type, member));\ } \ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ type *s, u##bits val) \ { \ - static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof_field(type, member)); \ btrfs_set_token_##bits(token, s, offsetof(type, member), val); \ } @@ -111,17 +113,14 @@ static inline void btrfs_set_##name(type *s, u##bits val) \ static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s) { - static_assert(sizeof(u64) == - sizeof(((struct btrfs_dev_item *)0))->total_bytes); - return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, - total_bytes)); + static_assert(sizeof(u64) == sizeof_field(struct btrfs_dev_item, total_bytes)); + return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes)); } static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s, u64 val) { - static_assert(sizeof(u64) == - sizeof(((struct btrfs_dev_item *)0))->total_bytes); + static_assert(sizeof(u64) == sizeof_field(struct btrfs_dev_item, total_bytes)); WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); } diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 79336fa853db..b7d54efb4728 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -3373,7 +3373,6 @@ int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache, struct btrfs_key *node_key, struct btrfs_backref_node *cur) { - struct btrfs_fs_info *fs_info = cache->fs_info; struct btrfs_backref_edge *edge; struct btrfs_backref_node *exist; int ret; @@ -3462,25 +3461,21 @@ int btrfs_backref_add_tree_node(struct btrfs_backref_cache *cache, ret = handle_direct_tree_backref(cache, &key, cur); if (ret < 0) goto out; - continue; - } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { - ret = -EINVAL; - btrfs_print_v0_err(fs_info); - btrfs_handle_fs_error(fs_info, ret, NULL); - goto out; - } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { - continue; + } else if (key.type == BTRFS_TREE_BLOCK_REF_KEY) { + /* + * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref + * offset means the root objectid. We need to search + * the tree to get its parent bytenr. + */ + ret = handle_indirect_tree_backref(cache, path, &key, node_key, + cur); + if (ret < 0) + goto out; } - /* - * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset - * means the root objectid. We need to search the tree to get - * its parent bytenr. + * Unrecognized tree backref items (if it can pass tree-checker) + * would be ignored. */ - ret = handle_indirect_tree_backref(cache, path, &key, node_key, - cur); - if (ret < 0) - goto out; } ret = 0; cur->checked = 1; diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 82324c327a50..0cb1dee965a0 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -504,13 +504,20 @@ static void fragment_free_space(struct btrfs_block_group *block_group) #endif /* - * This is only called by btrfs_cache_block_group, since we could have freed - * extents we need to check the pinned_extents for any extents that can't be - * used yet since their free space will be released as soon as the transaction - * commits. + * Add a free space range to the in memory free space cache of a block group. + * This checks if the range contains super block locations and any such + * locations are not added to the free space cache. + * + * @block_group: The target block group. + * @start: Start offset of the range. + * @end: End offset of the range (exclusive). + * @total_added_ret: Optional pointer to return the total amount of space + * added to the block group's free space cache. + * + * Returns 0 on success or < 0 on error. */ -int add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end, - u64 *total_added_ret) +int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start, + u64 end, u64 *total_added_ret) { struct btrfs_fs_info *info = block_group->fs_info; u64 extent_start, extent_end, size; @@ -520,11 +527,10 @@ int add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end *total_added_ret = 0; while (start < end) { - ret = find_first_extent_bit(&info->excluded_extents, start, - &extent_start, &extent_end, - EXTENT_DIRTY | EXTENT_UPTODATE, - NULL); - if (ret) + if (!find_first_extent_bit(&info->excluded_extents, start, + &extent_start, &extent_end, + EXTENT_DIRTY | EXTENT_UPTODATE, + NULL)) break; if (extent_start <= start) { @@ -799,8 +805,8 @@ next: key.type == BTRFS_METADATA_ITEM_KEY) { u64 space_added; - ret = add_new_free_space(block_group, last, key.objectid, - &space_added); + ret = btrfs_add_new_free_space(block_group, last, + key.objectid, &space_added); if (ret) goto out; total_found += space_added; @@ -821,14 +827,20 @@ next: path->slots[0]++; } - ret = add_new_free_space(block_group, last, - block_group->start + block_group->length, - NULL); + ret = btrfs_add_new_free_space(block_group, last, + block_group->start + block_group->length, + NULL); out: btrfs_free_path(path); return ret; } +static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg) +{ + clear_extent_bits(&bg->fs_info->excluded_extents, bg->start, + bg->start + bg->length - 1, EXTENT_UPTODATE); +} + static noinline void caching_thread(struct btrfs_work *work) { struct btrfs_block_group *block_group; @@ -2098,8 +2110,9 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) if (cache->start < BTRFS_SUPER_INFO_OFFSET) { stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start; cache->bytes_super += stripe_len; - ret = btrfs_add_excluded_extent(fs_info, cache->start, - stripe_len); + ret = set_extent_bit(&fs_info->excluded_extents, cache->start, + cache->start + stripe_len - 1, + EXTENT_UPTODATE, NULL); if (ret) return ret; } @@ -2125,8 +2138,9 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) cache->start + cache->length - logical[nr]); cache->bytes_super += len; - ret = btrfs_add_excluded_extent(fs_info, logical[nr], - len); + ret = set_extent_bit(&fs_info->excluded_extents, logical[nr], + logical[nr] + len - 1, + EXTENT_UPTODATE, NULL); if (ret) { kfree(logical); return ret; @@ -2319,8 +2333,8 @@ static int read_one_block_group(struct btrfs_fs_info *info, btrfs_free_excluded_extents(cache); } else if (cache->used == 0) { cache->cached = BTRFS_CACHE_FINISHED; - ret = add_new_free_space(cache, cache->start, - cache->start + cache->length, NULL); + ret = btrfs_add_new_free_space(cache, cache->start, + cache->start + cache->length, NULL); btrfs_free_excluded_extents(cache); if (ret) goto error; @@ -2767,7 +2781,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran return ERR_PTR(ret); } - ret = add_new_free_space(cache, chunk_offset, chunk_offset + size, NULL); + ret = btrfs_add_new_free_space(cache, chunk_offset, chunk_offset + size, NULL); btrfs_free_excluded_extents(cache); if (ret) { btrfs_put_block_group(cache); @@ -4075,7 +4089,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, if (IS_ERR(ret_bg)) { ret = PTR_ERR(ret_bg); - } else if (from_extent_allocation) { + } else if (from_extent_allocation && (flags & BTRFS_BLOCK_GROUP_DATA)) { /* * New block group is likely to be used soon. Try to activate * it now. Failure is OK for now. @@ -4273,6 +4287,17 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) struct btrfs_caching_control *caching_ctl; struct rb_node *n; + if (btrfs_is_zoned(info)) { + if (info->active_meta_bg) { + btrfs_put_block_group(info->active_meta_bg); + info->active_meta_bg = NULL; + } + if (info->active_system_bg) { + btrfs_put_block_group(info->active_system_bg); + info->active_system_bg = NULL; + } + } + write_lock(&info->block_group_cache_lock); while (!list_empty(&info->caching_block_groups)) { caching_ctl = list_entry(info->caching_block_groups.next, diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 74b61e663028..2bdbcb834f95 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -291,8 +291,8 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait); void btrfs_put_caching_control(struct btrfs_caching_control *ctl); struct btrfs_caching_control *btrfs_get_caching_control( struct btrfs_block_group *cache); -int add_new_free_space(struct btrfs_block_group *block_group, - u64 start, u64 end, u64 *total_added_ret); +int btrfs_add_new_free_space(struct btrfs_block_group *block_group, + u64 start, u64 end, u64 *total_added_ret); struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( struct btrfs_fs_info *fs_info, const u64 chunk_offset); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index d47a927b3504..bda1fdbba666 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -498,12 +498,8 @@ int btrfs_prealloc_file_range_trans(struct inode *inode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint); int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, struct writeback_control *wbc); + u64 start, u64 end, struct writeback_control *wbc); int btrfs_writepage_cow_fixup(struct page *page); -void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, - struct page *page, u64 start, - u64 end, bool uptodate); int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type); int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 6d51db066503..53c1211dd60b 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1736,9 +1736,6 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, int over = 0; unsigned char d_type; - if (list_empty(ins_list)) - return 0; - /* * Changing the data of the delayed item is impossible. So * we needn't lock them. And we have held i_mutex of the @@ -1809,9 +1806,9 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans, inode->i_mtime.tv_nsec); btrfs_set_stack_timespec_sec(&inode_item->ctime, - inode->i_ctime.tv_sec); + inode_get_ctime(inode).tv_sec); btrfs_set_stack_timespec_nsec(&inode_item->ctime, - inode->i_ctime.tv_nsec); + inode_get_ctime(inode).tv_nsec); btrfs_set_stack_timespec_sec(&inode_item->otime, BTRFS_I(inode)->i_otime.tv_sec); @@ -1862,8 +1859,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime); inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime); - inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(&inode_item->ctime); - inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->ctime); + inode_set_ctime(inode, btrfs_stack_timespec_sec(&inode_item->ctime), + btrfs_stack_timespec_nsec(&inode_item->ctime)); BTRFS_I(inode)->i_otime.tv_sec = btrfs_stack_timespec_sec(&inode_item->otime); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 5f10965fd72b..fff22ed55c42 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -792,9 +792,9 @@ static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev, lockdep_assert_held(&srcdev->fs_info->chunk_mutex); - while (!find_first_extent_bit(&srcdev->alloc_state, start, - &found_start, &found_end, - CHUNK_ALLOCATED, &cached_state)) { + while (find_first_extent_bit(&srcdev->alloc_state, start, + &found_start, &found_end, + CHUNK_ALLOCATED, &cached_state)) { ret = set_extent_bit(&tgtdev->alloc_state, found_start, found_end, CHUNK_ALLOCATED, NULL); if (ret) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a9a2c5446c18..0a96ea8c1d3a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -313,21 +313,16 @@ static bool check_tree_block_fsid(struct extent_buffer *eb) struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; u8 fsid[BTRFS_FSID_SIZE]; - u8 *metadata_uuid; read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid), BTRFS_FSID_SIZE); + /* - * Checking the incompat flag is only valid for the current fs. For - * seed devices it's forbidden to have their uuid changed so reading - * ->fsid in this case is fine + * alloc_fs_devices() copies the fsid into metadata_uuid if the + * metadata_uuid is unset in the superblock, including for a seed device. + * So, we can use fs_devices->metadata_uuid. */ - if (btrfs_fs_incompat(fs_info, METADATA_UUID)) - metadata_uuid = fs_devices->metadata_uuid; - else - metadata_uuid = fs_devices->fsid; - - if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) + if (memcmp(fsid, fs_info->fs_devices->metadata_uuid, BTRFS_FSID_SIZE) == 0) return false; list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) @@ -2384,21 +2379,18 @@ int btrfs_validate_super(struct btrfs_fs_info *fs_info, ret = -EINVAL; } - if (memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid, - BTRFS_FSID_SIZE)) { + if (memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, "superblock fsid doesn't match fsid of fs_devices: %pU != %pU", - fs_info->super_copy->fsid, fs_info->fs_devices->fsid); + sb->fsid, fs_info->fs_devices->fsid); ret = -EINVAL; } - if (btrfs_fs_incompat(fs_info, METADATA_UUID) && - memcmp(fs_info->fs_devices->metadata_uuid, - fs_info->super_copy->metadata_uuid, BTRFS_FSID_SIZE)) { + if (memcmp(fs_info->fs_devices->metadata_uuid, btrfs_sb_fsid_ptr(sb), + BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, "superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU", - fs_info->super_copy->metadata_uuid, - fs_info->fs_devices->metadata_uuid); + btrfs_sb_fsid_ptr(sb), fs_info->fs_devices->metadata_uuid); ret = -EINVAL; } @@ -2869,6 +2861,56 @@ static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info) return 0; } +static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) +{ + u64 root_objectid = 0; + struct btrfs_root *gang[8]; + int i = 0; + int err = 0; + unsigned int ret = 0; + + while (1) { + spin_lock(&fs_info->fs_roots_radix_lock); + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, root_objectid, + ARRAY_SIZE(gang)); + if (!ret) { + spin_unlock(&fs_info->fs_roots_radix_lock); + break; + } + root_objectid = gang[ret - 1]->root_key.objectid + 1; + + for (i = 0; i < ret; i++) { + /* Avoid to grab roots in dead_roots. */ + if (btrfs_root_refs(&gang[i]->root_item) == 0) { + gang[i] = NULL; + continue; + } + /* Grab all the search result for later use. */ + gang[i] = btrfs_grab_root(gang[i]); + } + spin_unlock(&fs_info->fs_roots_radix_lock); + + for (i = 0; i < ret; i++) { + if (!gang[i]) + continue; + root_objectid = gang[i]->root_key.objectid; + err = btrfs_orphan_cleanup(gang[i]); + if (err) + goto out; + btrfs_put_root(gang[i]); + } + root_objectid++; + } +out: + /* Release the uncleaned roots due to error. */ + for (; i < ret; i++) { + if (gang[i]) + btrfs_put_root(gang[i]); + } + return err; +} + /* * Some options only have meaning at mount time and shouldn't persist across * remounts, or be displayed. Clear these at the end of mount and remount @@ -3222,7 +3264,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device /* check FS state, whether FS is broken. */ if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) - set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); + WRITE_ONCE(fs_info->fs_error, -EUCLEAN); /* * In the long term, we'll store the compression type in the super @@ -3417,6 +3459,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device btrfs_free_zone_cache(fs_info); + btrfs_check_active_zone_reservation(fs_info); + if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices && !btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, @@ -4136,56 +4180,6 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, btrfs_put_root(root); } -int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) -{ - u64 root_objectid = 0; - struct btrfs_root *gang[8]; - int i = 0; - int err = 0; - unsigned int ret = 0; - - while (1) { - spin_lock(&fs_info->fs_roots_radix_lock); - ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, - (void **)gang, root_objectid, - ARRAY_SIZE(gang)); - if (!ret) { - spin_unlock(&fs_info->fs_roots_radix_lock); - break; - } - root_objectid = gang[ret - 1]->root_key.objectid + 1; - - for (i = 0; i < ret; i++) { - /* Avoid to grab roots in dead_roots */ - if (btrfs_root_refs(&gang[i]->root_item) == 0) { - gang[i] = NULL; - continue; - } - /* grab all the search result for later use */ - gang[i] = btrfs_grab_root(gang[i]); - } - spin_unlock(&fs_info->fs_roots_radix_lock); - - for (i = 0; i < ret; i++) { - if (!gang[i]) - continue; - root_objectid = gang[i]->root_key.objectid; - err = btrfs_orphan_cleanup(gang[i]); - if (err) - goto out; - btrfs_put_root(gang[i]); - } - root_objectid++; - } -out: - /* release the uncleaned roots due to error */ - for (; i < ret; i++) { - if (gang[i]) - btrfs_put_root(gang[i]); - } - return err; -} - int btrfs_commit_super(struct btrfs_fs_info *fs_info) { struct btrfs_root *root = fs_info->tree_root; @@ -4228,7 +4222,7 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) u64 found_end; found = true; - while (!find_first_extent_bit(&trans->dirty_pages, cur, + while (find_first_extent_bit(&trans->dirty_pages, cur, &found_start, &found_end, EXTENT_DIRTY, &cached)) { dirty_bytes += found_end + 1 - found_start; cur = found_end + 1; @@ -4552,9 +4546,7 @@ static void btrfs_destroy_ordered_extents(struct btrfs_root *root) static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; - struct list_head splice; - - INIT_LIST_HEAD(&splice); + LIST_HEAD(splice); spin_lock(&fs_info->ordered_root_lock); list_splice_init(&fs_info->ordered_roots, &splice); @@ -4660,9 +4652,7 @@ static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) { struct btrfs_inode *btrfs_inode; - struct list_head splice; - - INIT_LIST_HEAD(&splice); + LIST_HEAD(splice); spin_lock(&root->delalloc_lock); list_splice_init(&root->delalloc_inodes, &splice); @@ -4695,9 +4685,7 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; - struct list_head splice; - - INIT_LIST_HEAD(&splice); + LIST_HEAD(splice); spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); @@ -4716,21 +4704,16 @@ static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->delalloc_root_lock); } -static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, - struct extent_io_tree *dirty_pages, - int mark) +static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, + struct extent_io_tree *dirty_pages, + int mark) { - int ret; struct extent_buffer *eb; u64 start = 0; u64 end; - while (1) { - ret = find_first_extent_bit(dirty_pages, start, &start, &end, - mark, NULL); - if (ret) - break; - + while (find_first_extent_bit(dirty_pages, start, &start, &end, + mark, NULL)) { clear_extent_bits(dirty_pages, start, end, mark); while (start <= end) { eb = find_extent_buffer(fs_info, start); @@ -4746,16 +4729,13 @@ static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, free_extent_buffer_stale(eb); } } - - return ret; } -static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, - struct extent_io_tree *unpin) +static void btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, + struct extent_io_tree *unpin) { u64 start; u64 end; - int ret; while (1) { struct extent_state *cached_state = NULL; @@ -4767,9 +4747,8 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, * the same extent range. */ mutex_lock(&fs_info->unused_bg_unpin_mutex); - ret = find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY, &cached_state); - if (ret) { + if (!find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY, &cached_state)) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; } @@ -4780,8 +4759,6 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); } - - return 0; } static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index b03767f4d7ed..02b645744a82 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -77,7 +77,6 @@ struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr); struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info); void btrfs_free_fs_info(struct btrfs_fs_info *fs_info); -int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index a2315a4b8b75..ff8e117a1ace 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -831,15 +831,15 @@ static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *t * * Note: If there are multiple bits set in @bits, any of them will match. * - * Return 0 if we find something, and update @start_ret and @end_ret. - * Return 1 if we found nothing. + * Return true if we find something, and update @start_ret and @end_ret. + * Return false if we found nothing. */ -int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits, - struct extent_state **cached_state) +bool find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits, + struct extent_state **cached_state) { struct extent_state *state; - int ret = 1; + bool ret = false; spin_lock(&tree->lock); if (cached_state && *cached_state) { @@ -863,7 +863,7 @@ got_it: cache_state_if_flags(state, cached_state, 0); *start_ret = state->start; *end_ret = state->end; - ret = 0; + ret = true; } out: spin_unlock(&tree->lock); diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index fbd3b275ab1c..28c23a23d121 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -182,9 +182,9 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, u32 clear_bits, struct extent_state **cached_state); -int find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits, - struct extent_state **cached_state); +bool find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits, + struct extent_state **cached_state); void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, u32 bits); int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f396a9afa403..f356f08b55cb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -69,27 +69,6 @@ static int block_group_bits(struct btrfs_block_group *cache, u64 bits) return (cache->flags & bits) == bits; } -int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 num_bytes) -{ - u64 end = start + num_bytes - 1; - set_extent_bit(&fs_info->excluded_extents, start, end, - EXTENT_UPTODATE, NULL); - return 0; -} - -void btrfs_free_excluded_extents(struct btrfs_block_group *cache) -{ - struct btrfs_fs_info *fs_info = cache->fs_info; - u64 start, end; - - start = cache->start; - end = start + cache->length - 1; - - clear_extent_bits(&fs_info->excluded_extents, start, end, - EXTENT_UPTODATE); -} - /* simple helper to search for an existing data extent at a given offset */ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) { @@ -187,8 +166,10 @@ search_again: num_refs = btrfs_extent_refs(leaf, ei); extent_flags = btrfs_extent_flags(leaf, ei); } else { - ret = -EINVAL; - btrfs_print_v0_err(fs_info); + ret = -EUCLEAN; + btrfs_err(fs_info, + "unexpected extent item size, has %u expect >= %zu", + item_size, sizeof(*ei)); if (trans) btrfs_abort_transaction(trans, ret); else @@ -402,11 +383,11 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, } } + WARN_ON(1); btrfs_print_leaf(eb); btrfs_err(eb->fs_info, "eb %llu iref 0x%lx invalid extent inline ref type %d", eb->start, (unsigned long)iref, type); - WARN_ON(1); return BTRFS_REF_TYPE_INVALID; } @@ -624,12 +605,12 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, ref2 = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_shared_data_ref); num_refs = btrfs_shared_data_ref_count(leaf, ref2); - } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) { - btrfs_print_v0_err(trans->fs_info); - btrfs_abort_transaction(trans, -EINVAL); - return -EINVAL; } else { - BUG(); + btrfs_err(trans->fs_info, + "unrecognized backref key (%llu %u %llu)", + key.objectid, key.type, key.offset); + btrfs_abort_transaction(trans, -EUCLEAN); + return -EUCLEAN; } BUG_ON(num_refs < refs_to_drop); @@ -660,7 +641,6 @@ static noinline u32 extent_data_ref_count(struct btrfs_path *path, leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); if (iref) { /* * If type is invalid, we should have bailed out earlier than @@ -869,6 +849,11 @@ again: err = -ENOENT; goto out; } else if (WARN_ON(ret)) { + btrfs_print_leaf(path->nodes[0]); + btrfs_err(fs_info, +"extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu", + bytenr, num_bytes, parent, root_objectid, owner, + offset); err = -EIO; goto out; } @@ -876,8 +861,10 @@ again: leaf = path->nodes[0]; item_size = btrfs_item_size(leaf, path->slots[0]); if (unlikely(item_size < sizeof(*ei))) { - err = -EINVAL; - btrfs_print_v0_err(fs_info); + err = -EUCLEAN; + btrfs_err(fs_info, + "unexpected extent item size, has %llu expect >= %zu", + item_size, sizeof(*ei)); btrfs_abort_transaction(trans, err); goto out; } @@ -1079,13 +1066,13 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans, /* * helper to update/remove inline back ref */ -static noinline_for_stack -void update_inline_extent_backref(struct btrfs_path *path, +static noinline_for_stack int update_inline_extent_backref(struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_mod, struct btrfs_delayed_extent_op *extent_op) { struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_extent_item *ei; struct btrfs_extent_data_ref *dref = NULL; struct btrfs_shared_data_ref *sref = NULL; @@ -1098,18 +1085,33 @@ void update_inline_extent_backref(struct btrfs_path *path, ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, ei); - WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0); + if (unlikely(refs_to_mod < 0 && refs + refs_to_mod <= 0)) { + struct btrfs_key key; + u32 extent_size; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.type == BTRFS_METADATA_ITEM_KEY) + extent_size = fs_info->nodesize; + else + extent_size = key.offset; + btrfs_print_leaf(leaf); + btrfs_err(fs_info, + "invalid refs_to_mod for extent %llu num_bytes %u, has %d expect >= -%llu", + key.objectid, extent_size, refs_to_mod, refs); + return -EUCLEAN; + } refs += refs_to_mod; btrfs_set_extent_refs(leaf, ei, refs); if (extent_op) __run_delayed_extent_op(extent_op, leaf, ei); + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); /* - * If type is invalid, we should have bailed out after - * lookup_inline_extent_backref(). + * Function btrfs_get_extent_inline_ref_type() has already printed + * error messages. */ - type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); - ASSERT(type != BTRFS_REF_TYPE_INVALID); + if (unlikely(type == BTRFS_REF_TYPE_INVALID)) + return -EUCLEAN; if (type == BTRFS_EXTENT_DATA_REF_KEY) { dref = (struct btrfs_extent_data_ref *)(&iref->offset); @@ -1119,10 +1121,43 @@ void update_inline_extent_backref(struct btrfs_path *path, refs = btrfs_shared_data_ref_count(leaf, sref); } else { refs = 1; - BUG_ON(refs_to_mod != -1); + /* + * For tree blocks we can only drop one ref for it, and tree + * blocks should not have refs > 1. + * + * Furthermore if we're inserting a new inline backref, we + * won't reach this path either. That would be + * setup_inline_extent_backref(). + */ + if (unlikely(refs_to_mod != -1)) { + struct btrfs_key key; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + btrfs_print_leaf(leaf); + btrfs_err(fs_info, + "invalid refs_to_mod for tree block %llu, has %d expect -1", + key.objectid, refs_to_mod); + return -EUCLEAN; + } } - BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod); + if (unlikely(refs_to_mod < 0 && refs < -refs_to_mod)) { + struct btrfs_key key; + u32 extent_size; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.type == BTRFS_METADATA_ITEM_KEY) + extent_size = fs_info->nodesize; + else + extent_size = key.offset; + btrfs_print_leaf(leaf); + btrfs_err(fs_info, +"invalid refs_to_mod for backref entry, iref %lu extent %llu num_bytes %u, has %d expect >= -%llu", + (unsigned long)iref, key.objectid, extent_size, + refs_to_mod, refs); + return -EUCLEAN; + } refs += refs_to_mod; if (refs > 0) { @@ -1142,6 +1177,7 @@ void update_inline_extent_backref(struct btrfs_path *path, btrfs_truncate_item(path, item_size, 1); } btrfs_mark_buffer_dirty(leaf); + return 0; } static noinline_for_stack @@ -1170,7 +1206,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, bytenr, num_bytes, root_objectid, path->slots[0]); return -EUCLEAN; } - update_inline_extent_backref(path, iref, refs_to_add, extent_op); + ret = update_inline_extent_backref(path, iref, refs_to_add, extent_op); } else if (ret == -ENOENT) { setup_inline_extent_backref(trans->fs_info, path, iref, parent, root_objectid, owner, offset, @@ -1190,7 +1226,7 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, BUG_ON(!is_data && refs_to_drop != 1); if (iref) - update_inline_extent_backref(path, iref, -refs_to_drop, NULL); + ret = update_inline_extent_backref(path, iref, -refs_to_drop, NULL); else if (is_data) ret = remove_extent_data_ref(trans, root, path, refs_to_drop); else @@ -1629,8 +1665,10 @@ again: item_size = btrfs_item_size(leaf, path->slots[0]); if (unlikely(item_size < sizeof(*ei))) { - err = -EINVAL; - btrfs_print_v0_err(fs_info); + err = -EUCLEAN; + btrfs_err(fs_info, + "unexpected extent item size, has %u expect >= %zu", + item_size, sizeof(*ei)); btrfs_abort_transaction(trans, err); goto out; } @@ -2751,9 +2789,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) struct extent_state *cached_state = NULL; mutex_lock(&fs_info->unused_bg_unpin_mutex); - ret = find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY, &cached_state); - if (ret) { + if (!find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY, &cached_state)) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; } @@ -3059,8 +3096,10 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; item_size = btrfs_item_size(leaf, extent_slot); if (unlikely(item_size < sizeof(*ei))) { - ret = -EINVAL; - btrfs_print_v0_err(info); + ret = -EUCLEAN; + btrfs_err(trans->fs_info, + "unexpected extent item size, has %u expect >= %zu", + item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); goto out; } @@ -3351,11 +3390,38 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) } enum btrfs_loop_type { + /* + * Start caching block groups but do not wait for progress or for them + * to be done. + */ LOOP_CACHING_NOWAIT, + + /* + * Wait for the block group free_space >= the space we're waiting for if + * the block group isn't cached. + */ LOOP_CACHING_WAIT, + + /* + * Allow allocations to happen from block groups that do not yet have a + * size classification. + */ LOOP_UNSET_SIZE_CLASS, + + /* + * Allocate a chunk and then retry the allocation. + */ LOOP_ALLOC_CHUNK, + + /* + * Ignore the size class restrictions for this allocation. + */ LOOP_WRONG_SIZE_CLASS, + + /* + * Ignore the empty size, only try to allocate the number of bytes + * needed for this allocation. + */ LOOP_NO_EMPTY_SIZE, }; @@ -3427,7 +3493,6 @@ btrfs_release_block_group(struct btrfs_block_group *cache, * Helper function for find_free_extent(). * * Return -ENOENT to inform caller that we need fallback to unclustered mode. - * Return -EAGAIN to inform caller that we need to re-search this block group * Return >0 to inform caller that we find nothing * Return 0 means we have found a location and set ffe_ctl->found_offset. */ @@ -3508,14 +3573,6 @@ refill_cluster: trace_btrfs_reserve_extent_cluster(bg, ffe_ctl); return 0; } - } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && - !ffe_ctl->retry_clustered) { - spin_unlock(&last_ptr->refill_lock); - - ffe_ctl->retry_clustered = true; - btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + - ffe_ctl->empty_cluster + ffe_ctl->empty_size); - return -EAGAIN; } /* * At this point we either didn't find a cluster or we weren't able to @@ -3530,7 +3587,6 @@ refill_cluster: /* * Return >0 to inform caller that we find nothing * Return 0 when we found an free extent and set ffe_ctrl->found_offset - * Return -EAGAIN to inform caller that we need to re-search this block group */ static int find_free_extent_unclustered(struct btrfs_block_group *bg, struct find_free_extent_ctl *ffe_ctl) @@ -3568,25 +3624,8 @@ static int find_free_extent_unclustered(struct btrfs_block_group *bg, offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start, ffe_ctl->num_bytes, ffe_ctl->empty_size, &ffe_ctl->max_extent_size); - - /* - * If we didn't find a chunk, and we haven't failed on this block group - * before, and this block group is in the middle of caching and we are - * ok with waiting, then go ahead and wait for progress to be made, and - * set @retry_unclustered to true. - * - * If @retry_unclustered is true then we've already waited on this - * block group once and should move on to the next block group. - */ - if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached && - ffe_ctl->loop > LOOP_CACHING_NOWAIT) { - btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes + - ffe_ctl->empty_size); - ffe_ctl->retry_unclustered = true; - return -EAGAIN; - } else if (!offset) { + if (!offset) return 1; - } ffe_ctl->found_offset = offset; return 0; } @@ -3600,7 +3639,7 @@ static int do_allocation_clustered(struct btrfs_block_group *block_group, /* We want to try and use the cluster allocator, so lets look there */ if (ffe_ctl->last_ptr && ffe_ctl->use_cluster) { ret = find_free_extent_clustered(block_group, ffe_ctl, bg_ret); - if (ret >= 0 || ret == -EAGAIN) + if (ret >= 0) return ret; /* ret == -ENOENT case falls through */ } @@ -3685,7 +3724,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, } spin_unlock(&block_group->lock); - if (!ret && !btrfs_zone_activate(block_group)) { + /* Metadata block group is activated at write time. */ + if (!ret && (block_group->flags & BTRFS_BLOCK_GROUP_DATA) && + !btrfs_zone_activate(block_group)) { ret = 1; /* * May need to clear fs_info->{treelog,data_reloc}_bg. @@ -3709,7 +3750,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, fs_info->data_reloc_bg == 0); if (block_group->ro || - test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { + (!ffe_ctl->for_data_reloc && + test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags))) { ret = 1; goto out; } @@ -3752,8 +3794,26 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, if (ffe_ctl->for_treelog && !fs_info->treelog_bg) fs_info->treelog_bg = block_group->start; - if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg) - fs_info->data_reloc_bg = block_group->start; + if (ffe_ctl->for_data_reloc) { + if (!fs_info->data_reloc_bg) + fs_info->data_reloc_bg = block_group->start; + /* + * Do not allow allocations from this block group, unless it is + * for data relocation. Compared to increasing the ->ro, setting + * the ->zoned_data_reloc_ongoing flag still allows nocow + * writers to come in. See btrfs_inc_nocow_writers(). + * + * We need to disable an allocation to avoid an allocation of + * regular (non-relocation data) extent. With mix of relocation + * extents and regular extents, we can dispatch WRITE commands + * (for relocation extents) and ZONE APPEND commands (for + * regular extents) at the same time to the same zone, which + * easily break the write pointer. + * + * Also, this flag avoids this block group to be zone finished. + */ + set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags); + } ffe_ctl->found_offset = start + block_group->alloc_offset; block_group->alloc_offset += num_bytes; @@ -3771,24 +3831,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, out: if (ret && ffe_ctl->for_treelog) fs_info->treelog_bg = 0; - if (ret && ffe_ctl->for_data_reloc && - fs_info->data_reloc_bg == block_group->start) { - /* - * Do not allow further allocations from this block group. - * Compared to increasing the ->ro, setting the - * ->zoned_data_reloc_ongoing flag still allows nocow - * writers to come in. See btrfs_inc_nocow_writers(). - * - * We need to disable an allocation to avoid an allocation of - * regular (non-relocation data) extent. With mix of relocation - * extents and regular extents, we can dispatch WRITE commands - * (for relocation extents) and ZONE APPEND commands (for - * regular extents) at the same time to the same zone, which - * easily break the write pointer. - */ - set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags); + if (ret && ffe_ctl->for_data_reloc) fs_info->data_reloc_bg = 0; - } spin_unlock(&fs_info->relocation_bg_lock); spin_unlock(&fs_info->treelog_bg_lock); spin_unlock(&block_group->lock); @@ -3816,8 +3860,7 @@ static void release_block_group(struct btrfs_block_group *block_group, { switch (ffe_ctl->policy) { case BTRFS_EXTENT_ALLOC_CLUSTERED: - ffe_ctl->retry_clustered = false; - ffe_ctl->retry_unclustered = false; + ffe_ctl->retry_uncached = false; break; case BTRFS_EXTENT_ALLOC_ZONED: /* Nothing to do */ @@ -3861,6 +3904,10 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl, static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info, struct find_free_extent_ctl *ffe_ctl) { + /* Block group's activeness is not a requirement for METADATA block groups. */ + if (!(ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA)) + return 0; + /* If we can activate new zone, just allocate a chunk and use it */ if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) return 0; @@ -3949,15 +3996,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, if (ffe_ctl->index < BTRFS_NR_RAID_TYPES) return 1; - /* - * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking - * caching kthreads as we move along - * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching - * LOOP_UNSET_SIZE_CLASS, allow unset size class - * LOOP_ALLOC_CHUNK, force a chunk allocation and try again - * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try - * again - */ + /* See the comments for btrfs_loop_type for an explanation of the phases. */ if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { ffe_ctl->index = 0; /* @@ -4168,9 +4207,7 @@ static noinline int find_free_extent(struct btrfs_root *root, ffe_ctl->orig_have_caching_bg = false; ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags); ffe_ctl->loop = 0; - /* For clustered allocation */ - ffe_ctl->retry_clustered = false; - ffe_ctl->retry_unclustered = false; + ffe_ctl->retry_uncached = false; ffe_ctl->cached = 0; ffe_ctl->max_extent_size = 0; ffe_ctl->total_free_space = 0; @@ -4321,16 +4358,12 @@ have_block_group: bg_ret = NULL; ret = do_allocation(block_group, ffe_ctl, &bg_ret); - if (ret == 0) { - if (bg_ret && bg_ret != block_group) { - btrfs_release_block_group(block_group, - ffe_ctl->delalloc); - block_group = bg_ret; - } - } else if (ret == -EAGAIN) { - goto have_block_group; - } else if (ret > 0) { + if (ret > 0) goto loop; + + if (bg_ret && bg_ret != block_group) { + btrfs_release_block_group(block_group, ffe_ctl->delalloc); + block_group = bg_ret; } /* Checks */ @@ -4371,6 +4404,15 @@ have_block_group: btrfs_release_block_group(block_group, ffe_ctl->delalloc); break; loop: + if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT && + !ffe_ctl->retry_uncached) { + ffe_ctl->retry_uncached = true; + btrfs_wait_block_group_cache_progress(block_group, + ffe_ctl->num_bytes + + ffe_ctl->empty_cluster + + ffe_ctl->empty_size); + goto have_block_group; + } release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc); cond_resched(); } diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 429d5c570061..88c249c37516 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -48,16 +48,11 @@ struct find_free_extent_ctl { int loop; /* - * Whether we're refilling a cluster, if true we need to re-search - * current block group but don't try to refill the cluster again. + * Set to true if we're retrying the allocation on this block group + * after waiting for caching progress, this is so that we retry only + * once before moving on to another block group. */ - bool retry_clustered; - - /* - * Whether we're updating free space cache, if true we need to re-search - * current block group but don't try updating free space cache again. - */ - bool retry_unclustered; + bool retry_uncached; /* If current block group is cached */ int cached; @@ -96,9 +91,6 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, enum btrfs_inline_ref_type is_data); u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); -int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 num_bytes); -void btrfs_free_excluded_extents(struct btrfs_block_group *cache); int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count); void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 90ad3006ef3a..ac3fca5a5e41 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -181,34 +181,9 @@ void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) } } -void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) -{ - struct address_space *mapping = inode->i_mapping; - unsigned long index = start >> PAGE_SHIFT; - unsigned long end_index = end >> PAGE_SHIFT; - struct folio *folio; - - while (index <= end_index) { - folio = filemap_get_folio(mapping, index); - filemap_dirty_folio(mapping, folio); - folio_account_redirty(folio); - index += folio_nr_pages(folio); - folio_put(folio); - } -} - -/* - * Process one page for __process_pages_contig(). - * - * Return >0 if we hit @page == @locked_page. - * Return 0 if we updated the page status. - * Return -EGAIN if the we need to try again. - * (For PAGE_LOCK case but got dirty page or page not belong to mapping) - */ -static int process_one_page(struct btrfs_fs_info *fs_info, - struct address_space *mapping, - struct page *page, struct page *locked_page, - unsigned long page_ops, u64 start, u64 end) +static void process_one_page(struct btrfs_fs_info *fs_info, + struct page *page, struct page *locked_page, + unsigned long page_ops, u64 start, u64 end) { u32 len; @@ -224,94 +199,36 @@ static int process_one_page(struct btrfs_fs_info *fs_info, if (page_ops & PAGE_END_WRITEBACK) btrfs_page_clamp_clear_writeback(fs_info, page, start, len); - if (page == locked_page) - return 1; - - if (page_ops & PAGE_LOCK) { - int ret; - - ret = btrfs_page_start_writer_lock(fs_info, page, start, len); - if (ret) - return ret; - if (!PageDirty(page) || page->mapping != mapping) { - btrfs_page_end_writer_lock(fs_info, page, start, len); - return -EAGAIN; - } - } - if (page_ops & PAGE_UNLOCK) + if (page != locked_page && (page_ops & PAGE_UNLOCK)) btrfs_page_end_writer_lock(fs_info, page, start, len); - return 0; } -static int __process_pages_contig(struct address_space *mapping, - struct page *locked_page, - u64 start, u64 end, unsigned long page_ops, - u64 *processed_end) +static void __process_pages_contig(struct address_space *mapping, + struct page *locked_page, u64 start, u64 end, + unsigned long page_ops) { struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); pgoff_t start_index = start >> PAGE_SHIFT; pgoff_t end_index = end >> PAGE_SHIFT; pgoff_t index = start_index; - unsigned long pages_processed = 0; struct folio_batch fbatch; - int err = 0; int i; - if (page_ops & PAGE_LOCK) { - ASSERT(page_ops == PAGE_LOCK); - ASSERT(processed_end && *processed_end == start); - } - folio_batch_init(&fbatch); while (index <= end_index) { int found_folios; found_folios = filemap_get_folios_contig(mapping, &index, end_index, &fbatch); - - if (found_folios == 0) { - /* - * Only if we're going to lock these pages, we can find - * nothing at @index. - */ - ASSERT(page_ops & PAGE_LOCK); - err = -EAGAIN; - goto out; - } - for (i = 0; i < found_folios; i++) { - int process_ret; struct folio *folio = fbatch.folios[i]; - process_ret = process_one_page(fs_info, mapping, - &folio->page, locked_page, page_ops, - start, end); - if (process_ret < 0) { - err = -EAGAIN; - folio_batch_release(&fbatch); - goto out; - } - pages_processed += folio_nr_pages(folio); + + process_one_page(fs_info, &folio->page, locked_page, + page_ops, start, end); } folio_batch_release(&fbatch); cond_resched(); } -out: - if (err && processed_end) { - /* - * Update @processed_end. I know this is awful since it has - * two different return value patterns (inclusive vs exclusive). - * - * But the exclusive pattern is necessary if @start is 0, or we - * underflow and check against processed_end won't work as - * expected. - */ - if (pages_processed) - *processed_end = min(end, - ((u64)(start_index + pages_processed) << PAGE_SHIFT) - 1); - else - *processed_end = start; - } - return err; } static noinline void __unlock_for_delalloc(struct inode *inode, @@ -326,29 +243,63 @@ static noinline void __unlock_for_delalloc(struct inode *inode, return; __process_pages_contig(inode->i_mapping, locked_page, start, end, - PAGE_UNLOCK, NULL); + PAGE_UNLOCK); } static noinline int lock_delalloc_pages(struct inode *inode, struct page *locked_page, - u64 delalloc_start, - u64 delalloc_end) + u64 start, + u64 end) { - unsigned long index = delalloc_start >> PAGE_SHIFT; - unsigned long end_index = delalloc_end >> PAGE_SHIFT; - u64 processed_end = delalloc_start; - int ret; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct address_space *mapping = inode->i_mapping; + pgoff_t start_index = start >> PAGE_SHIFT; + pgoff_t end_index = end >> PAGE_SHIFT; + pgoff_t index = start_index; + u64 processed_end = start; + struct folio_batch fbatch; - ASSERT(locked_page); if (index == locked_page->index && index == end_index) return 0; - ret = __process_pages_contig(inode->i_mapping, locked_page, delalloc_start, - delalloc_end, PAGE_LOCK, &processed_end); - if (ret == -EAGAIN && processed_end > delalloc_start) - __unlock_for_delalloc(inode, locked_page, delalloc_start, - processed_end); - return ret; + folio_batch_init(&fbatch); + while (index <= end_index) { + unsigned int found_folios, i; + + found_folios = filemap_get_folios_contig(mapping, &index, + end_index, &fbatch); + if (found_folios == 0) + goto out; + + for (i = 0; i < found_folios; i++) { + struct page *page = &fbatch.folios[i]->page; + u32 len = end + 1 - start; + + if (page == locked_page) + continue; + + if (btrfs_page_start_writer_lock(fs_info, page, start, + len)) + goto out; + + if (!PageDirty(page) || page->mapping != mapping) { + btrfs_page_end_writer_lock(fs_info, page, start, + len); + goto out; + } + + processed_end = page_offset(page) + PAGE_SIZE - 1; + } + folio_batch_release(&fbatch); + cond_resched(); + } + + return 0; +out: + folio_batch_release(&fbatch); + if (processed_end > start) + __unlock_for_delalloc(inode, locked_page, start, processed_end); + return -EAGAIN; } /* @@ -467,7 +418,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, clear_extent_bit(&inode->io_tree, start, end, clear_bits, NULL); __process_pages_contig(inode->vfs_inode.i_mapping, locked_page, - start, end, page_ops, NULL); + start, end, page_ops); } static bool btrfs_verify_page(struct page *page, u64 start) @@ -497,31 +448,6 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) btrfs_subpage_end_reader(fs_info, page, start, len); } -/* lots and lots of room for performance fixes in the end_bio funcs */ - -void end_extent_writepage(struct page *page, int err, u64 start, u64 end) -{ - struct btrfs_inode *inode; - const bool uptodate = (err == 0); - int ret = 0; - - ASSERT(page && page->mapping); - inode = BTRFS_I(page->mapping->host); - btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate); - - if (!uptodate) { - const struct btrfs_fs_info *fs_info = inode->root->fs_info; - u32 len; - - ASSERT(end + 1 - start <= U32_MAX); - len = end + 1 - start; - - btrfs_page_clear_uptodate(fs_info, page, start, len); - ret = err < 0 ? err : -EIO; - mapping_set_error(page->mapping, ret); - } -} - /* * after a writepage IO is done, we need to: * clear the uptodate bits on error @@ -1243,38 +1169,45 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, struct page *page, struct writeback_control *wbc) { - const u64 page_end = page_offset(page) + PAGE_SIZE - 1; - u64 delalloc_start = page_offset(page); + const u64 page_start = page_offset(page); + const u64 page_end = page_start + PAGE_SIZE - 1; + u64 delalloc_start = page_start; + u64 delalloc_end = page_end; u64 delalloc_to_write = 0; - /* How many pages are started by btrfs_run_delalloc_range() */ - unsigned long nr_written = 0; - int ret; - int page_started = 0; + int ret = 0; while (delalloc_start < page_end) { - u64 delalloc_end = page_end; - bool found; - - found = find_lock_delalloc_range(&inode->vfs_inode, page, - &delalloc_start, - &delalloc_end); - if (!found) { + delalloc_end = page_end; + if (!find_lock_delalloc_range(&inode->vfs_inode, page, + &delalloc_start, &delalloc_end)) { delalloc_start = delalloc_end + 1; continue; } + ret = btrfs_run_delalloc_range(inode, page, delalloc_start, - delalloc_end, &page_started, &nr_written, wbc); - if (ret) + delalloc_end, wbc); + if (ret < 0) return ret; - /* - * delalloc_end is already one less than the total length, so - * we don't subtract one from PAGE_SIZE - */ - delalloc_to_write += (delalloc_end - delalloc_start + - PAGE_SIZE) >> PAGE_SHIFT; delalloc_start = delalloc_end + 1; } + + /* + * delalloc_end is already one less than the total length, so + * we don't subtract one from PAGE_SIZE + */ + delalloc_to_write += + DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE); + + /* + * If btrfs_run_dealloc_range() already started I/O and unlocked + * the pages, we just need to account for them here. + */ + if (ret == 1) { + wbc->nr_to_write -= delalloc_to_write; + return 1; + } + if (wbc->nr_to_write < delalloc_to_write) { int thresh = 8192; @@ -1284,16 +1217,6 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, thresh); } - /* Did btrfs_run_dealloc_range() already unlock and start the IO? */ - if (page_started) { - /* - * We've unlocked the page, so we can't update the mapping's - * writeback index, just update nr_to_write. - */ - wbc->nr_to_write -= nr_written; - return 1; - } - return 0; } @@ -1382,6 +1305,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, bio_ctrl->end_io_func = end_bio_extent_writepage; while (cur <= end) { + u32 len = end - cur + 1; u64 disk_bytenr; u64 em_end; u64 dirty_range_start = cur; @@ -1389,8 +1313,8 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, u32 iosize; if (cur >= i_size) { - btrfs_writepage_endio_finish_ordered(inode, page, cur, - end, true); + btrfs_mark_ordered_io_finished(inode, page, cur, len, + true); /* * This range is beyond i_size, thus we don't need to * bother writing back. @@ -1399,7 +1323,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * writeback the sectors with subpage dirty bits, * causing writeback without ordered extent. */ - btrfs_page_clear_dirty(fs_info, page, cur, end + 1 - cur); + btrfs_page_clear_dirty(fs_info, page, cur, len); break; } @@ -1410,7 +1334,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, continue; } - em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1); + em = btrfs_get_extent(inode, NULL, 0, cur, len); if (IS_ERR(em)) { ret = PTR_ERR_OR_ZERO(em); goto out_error; @@ -1486,7 +1410,6 @@ static int __extent_writepage(struct page *page, struct btrfs_bio_ctrl *bio_ctrl struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; const u64 page_start = page_offset(page); - const u64 page_end = page_start + PAGE_SIZE - 1; int ret; int nr = 0; size_t pg_offset; @@ -1530,8 +1453,13 @@ done: set_page_writeback(page); end_page_writeback(page); } - if (ret) - end_extent_writepage(page, ret, page_start, page_end); + if (ret) { + btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, page_start, + PAGE_SIZE, !ret); + btrfs_page_clear_uptodate(btrfs_sb(inode->i_sb), page, + page_start, PAGE_SIZE); + mapping_set_error(page->mapping, ret); + } unlock_page(page); ASSERT(ret <= 0); return ret; @@ -1877,11 +1805,10 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) * previous call. * Return <0 for fatal error. */ -static int submit_eb_page(struct page *page, struct writeback_control *wbc, - struct extent_buffer **eb_context) +static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) { + struct writeback_control *wbc = ctx->wbc; struct address_space *mapping = page->mapping; - struct btrfs_block_group *cache = NULL; struct extent_buffer *eb; int ret; @@ -1908,7 +1835,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, return 0; } - if (eb == *eb_context) { + if (eb == ctx->eb) { spin_unlock(&mapping->private_lock); return 0; } @@ -1917,34 +1844,25 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, if (!ret) return 0; - if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) { - /* - * If for_sync, this hole will be filled with - * trasnsaction commit. - */ - if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) - ret = -EAGAIN; - else + ctx->eb = eb; + + ret = btrfs_check_meta_write_pointer(eb->fs_info, ctx); + if (ret) { + if (ret == -EBUSY) ret = 0; free_extent_buffer(eb); return ret; } - *eb_context = eb; - if (!lock_extent_buffer_for_io(eb, wbc)) { - btrfs_revert_meta_write_pointer(cache, eb); - if (cache) - btrfs_put_block_group(cache); free_extent_buffer(eb); return 0; } - if (cache) { - /* - * Implies write in zoned mode. Mark the last eb in a block group. - */ - btrfs_schedule_zone_finish_bg(cache, eb); - btrfs_put_block_group(cache); + /* Implies write in zoned mode. */ + if (ctx->zoned_bg) { + /* Mark the last eb in the block group. */ + btrfs_schedule_zone_finish_bg(ctx->zoned_bg, eb); + ctx->zoned_bg->meta_write_pointer += eb->len; } write_one_eb(eb, wbc); free_extent_buffer(eb); @@ -1954,7 +1872,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc) { - struct extent_buffer *eb_context = NULL; + struct btrfs_eb_write_context ctx = { .wbc = wbc }; struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; int ret = 0; int done = 0; @@ -1996,7 +1914,7 @@ retry: for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; - ret = submit_eb_page(&folio->page, wbc, &eb_context); + ret = submit_eb_page(&folio->page, &ctx); if (ret == 0) continue; if (ret < 0) { @@ -2057,6 +1975,9 @@ retry: ret = 0; if (!ret && BTRFS_FS_ERROR(fs_info)) ret = -EROFS; + + if (ctx.zoned_bg) + btrfs_put_block_group(ctx.zoned_bg); btrfs_zoned_meta_io_unlock(fs_info); return ret; } @@ -2150,7 +2071,7 @@ retry: for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; - done_index = folio->index + folio_nr_pages(folio); + done_index = folio_next_index(folio); /* * At this point we hold neither the i_pages lock nor * the page lock: the page may be truncated or @@ -2233,11 +2154,11 @@ retry: * already been ran (aka, ordered extent inserted) and all pages are still * locked. */ -int extent_write_locked_range(struct inode *inode, u64 start, u64 end, - struct writeback_control *wbc) +void extent_write_locked_range(struct inode *inode, struct page *locked_page, + u64 start, u64 end, struct writeback_control *wbc, + bool pages_dirty) { bool found_error = false; - int first_error = 0; int ret = 0; struct address_space *mapping = inode->i_mapping; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2256,18 +2177,16 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, while (cur <= end) { u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); + u32 cur_len = cur_end + 1 - cur; struct page *page; int nr = 0; page = find_get_page(mapping, cur >> PAGE_SHIFT); - /* - * All pages in the range are locked since - * btrfs_run_delalloc_range(), thus there is no way to clear - * the page dirty flag. - */ ASSERT(PageLocked(page)); - ASSERT(PageDirty(page)); - clear_page_dirty_for_io(page); + if (pages_dirty && page != locked_page) { + ASSERT(PageDirty(page)); + clear_page_dirty_for_io(page); + } ret = __extent_writepage_io(BTRFS_I(inode), page, &bio_ctrl, i_size, &nr); @@ -2279,23 +2198,21 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, set_page_writeback(page); end_page_writeback(page); } - if (ret) - end_extent_writepage(page, ret, cur, cur_end); - btrfs_page_unlock_writer(fs_info, page, cur, cur_end + 1 - cur); - if (ret < 0) { - found_error = true; - first_error = ret; + if (ret) { + btrfs_mark_ordered_io_finished(BTRFS_I(inode), page, + cur, cur_len, !ret); + btrfs_page_clear_uptodate(fs_info, page, cur, cur_len); + mapping_set_error(page->mapping, ret); } + btrfs_page_unlock_writer(fs_info, page, cur, cur_len); + if (ret < 0) + found_error = true; next_page: put_page(page); cur = cur_end + 1; } submit_write_bio(&bio_ctrl, found_error ? ret : 0); - - if (found_error) - return first_error; - return ret; } int extent_writepages(struct address_space *mapping, @@ -3315,8 +3232,8 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) return NULL; } WARN_ON(PageDirty(p)); - copy_page(page_address(p), page_address(src->pages[i])); } + copy_extent_buffer_full(new, src); set_extent_buffer_uptodate(new); return new; @@ -3559,6 +3476,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, struct extent_buffer *exists = NULL; struct page *p; struct address_space *mapping = fs_info->btree_inode->i_mapping; + struct btrfs_subpage *prealloc = NULL; u64 lockdep_owner = owner_root; int uptodate = 1; int ret; @@ -3595,36 +3513,30 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level); num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++, index++) { - struct btrfs_subpage *prealloc = NULL; + /* + * Preallocate page->private for subpage case, so that we won't + * allocate memory with private_lock nor page lock hold. + * + * The memory will be freed by attach_extent_buffer_page() or freed + * manually if we exit earlier. + */ + if (fs_info->nodesize < PAGE_SIZE) { + prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); + if (IS_ERR(prealloc)) { + exists = ERR_CAST(prealloc); + goto free_eb; + } + } + + for (i = 0; i < num_pages; i++, index++) { p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL); if (!p) { exists = ERR_PTR(-ENOMEM); + btrfs_free_subpage(prealloc); goto free_eb; } - /* - * Preallocate page->private for subpage case, so that we won't - * allocate memory with private_lock hold. The memory will be - * freed by attach_extent_buffer_page() or freed manually if - * we exit earlier. - * - * Although we have ensured one subpage eb can only have one - * page, but it may change in the future for 16K page size - * support, so we still preallocate the memory in the loop. - */ - if (fs_info->nodesize < PAGE_SIZE) { - prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); - if (IS_ERR(prealloc)) { - ret = PTR_ERR(prealloc); - unlock_page(p); - put_page(p); - exists = ERR_PTR(ret); - goto free_eb; - } - } - spin_lock(&mapping->private_lock); exists = grab_extent_buffer(fs_info, p); if (exists) { @@ -4210,30 +4122,9 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb, } } -void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb, - const void *srcv) -{ - char *kaddr; - - assert_eb_page_uptodate(eb, eb->pages[0]); - kaddr = page_address(eb->pages[0]) + - get_eb_offset_in_page(eb, offsetof(struct btrfs_header, - chunk_tree_uuid)); - memcpy(kaddr, srcv, BTRFS_FSID_SIZE); -} - -void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv) -{ - char *kaddr; - - assert_eb_page_uptodate(eb, eb->pages[0]); - kaddr = page_address(eb->pages[0]) + - get_eb_offset_in_page(eb, offsetof(struct btrfs_header, fsid)); - memcpy(kaddr, srcv, BTRFS_FSID_SIZE); -} - -void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, - unsigned long start, unsigned long len) +static void __write_extent_buffer(const struct extent_buffer *eb, + const void *srcv, unsigned long start, + unsigned long len, bool use_memmove) { size_t cur; size_t offset; @@ -4241,6 +4132,8 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, char *kaddr; char *src = (char *)srcv; unsigned long i = get_eb_page_index(start); + /* For unmapped (dummy) ebs, no need to check their uptodate status. */ + const bool check_uptodate = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)); @@ -4251,11 +4144,15 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, while (len > 0) { page = eb->pages[i]; - assert_eb_page_uptodate(eb, page); + if (check_uptodate) + assert_eb_page_uptodate(eb, page); cur = min(len, PAGE_SIZE - offset); kaddr = page_address(page); - memcpy(kaddr + offset, src, cur); + if (use_memmove) + memmove(kaddr + offset, src, cur); + else + memcpy(kaddr + offset, src, cur); src += cur; len -= cur; @@ -4264,55 +4161,54 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, } } -void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, - unsigned long len) +void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, + unsigned long start, unsigned long len) { - size_t cur; - size_t offset; - struct page *page; - char *kaddr; - unsigned long i = get_eb_page_index(start); + return __write_extent_buffer(eb, srcv, start, len, false); +} - if (check_eb_range(eb, start, len)) - return; +static void memset_extent_buffer(const struct extent_buffer *eb, int c, + unsigned long start, unsigned long len) +{ + unsigned long cur = start; - offset = get_eb_offset_in_page(eb, start); + while (cur < start + len) { + unsigned long index = get_eb_page_index(cur); + unsigned int offset = get_eb_offset_in_page(eb, cur); + unsigned int cur_len = min(start + len - cur, PAGE_SIZE - offset); + struct page *page = eb->pages[index]; - while (len > 0) { - page = eb->pages[i]; assert_eb_page_uptodate(eb, page); + memset(page_address(page) + offset, c, cur_len); - cur = min(len, PAGE_SIZE - offset); - kaddr = page_address(page); - memset(kaddr + offset, 0, cur); - - len -= cur; - offset = 0; - i++; + cur += cur_len; } } +void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, + unsigned long len) +{ + if (check_eb_range(eb, start, len)) + return; + return memset_extent_buffer(eb, 0, start, len); +} + void copy_extent_buffer_full(const struct extent_buffer *dst, const struct extent_buffer *src) { - int i; - int num_pages; + unsigned long cur = 0; ASSERT(dst->len == src->len); - if (dst->fs_info->nodesize >= PAGE_SIZE) { - num_pages = num_extent_pages(dst); - for (i = 0; i < num_pages; i++) - copy_page(page_address(dst->pages[i]), - page_address(src->pages[i])); - } else { - size_t src_offset = get_eb_offset_in_page(src, 0); - size_t dst_offset = get_eb_offset_in_page(dst, 0); + while (cur < src->len) { + unsigned long index = get_eb_page_index(cur); + unsigned long offset = get_eb_offset_in_page(src, cur); + unsigned long cur_len = min(src->len, PAGE_SIZE - offset); + void *addr = page_address(src->pages[index]) + offset; + + write_extent_buffer(dst, addr, cur, cur_len); - ASSERT(src->fs_info->nodesize < PAGE_SIZE); - memcpy(page_address(dst->pages[0]) + dst_offset, - page_address(src->pages[0]) + src_offset, - src->len); + cur += cur_len; } } @@ -4406,6 +4302,15 @@ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); } +static u8 *extent_buffer_get_byte(const struct extent_buffer *eb, unsigned long bytenr) +{ + unsigned long index = get_eb_page_index(bytenr); + + if (check_eb_range(eb, bytenr, 1)) + return NULL; + return page_address(eb->pages[index]) + get_eb_offset_in_page(eb, bytenr); +} + /* * Set an area of a bitmap to 1. * @@ -4417,35 +4322,28 @@ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len) { + unsigned int first_byte = start + BIT_BYTE(pos); + unsigned int last_byte = start + BIT_BYTE(pos + len - 1); + const bool same_byte = (first_byte == last_byte); + u8 mask = BITMAP_FIRST_BYTE_MASK(pos); u8 *kaddr; - struct page *page; - unsigned long i; - size_t offset; - const unsigned int size = pos + len; - int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE); - u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos); - eb_bitmap_offset(eb, start, pos, &i, &offset); - page = eb->pages[i]; - assert_eb_page_uptodate(eb, page); - kaddr = page_address(page); + if (same_byte) + mask &= BITMAP_LAST_BYTE_MASK(pos + len); - while (len >= bits_to_set) { - kaddr[offset] |= mask_to_set; - len -= bits_to_set; - bits_to_set = BITS_PER_BYTE; - mask_to_set = ~0; - if (++offset >= PAGE_SIZE && len > 0) { - offset = 0; - page = eb->pages[++i]; - assert_eb_page_uptodate(eb, page); - kaddr = page_address(page); - } - } - if (len) { - mask_to_set &= BITMAP_LAST_BYTE_MASK(size); - kaddr[offset] |= mask_to_set; - } + /* Handle the first byte. */ + kaddr = extent_buffer_get_byte(eb, first_byte); + *kaddr |= mask; + if (same_byte) + return; + + /* Handle the byte aligned part. */ + ASSERT(first_byte + 1 <= last_byte); + memset_extent_buffer(eb, 0xff, first_byte + 1, last_byte - first_byte - 1); + + /* Handle the last byte. */ + kaddr = extent_buffer_get_byte(eb, last_byte); + *kaddr |= BITMAP_LAST_BYTE_MASK(pos + len); } @@ -4461,35 +4359,28 @@ void extent_buffer_bitmap_clear(const struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len) { + unsigned int first_byte = start + BIT_BYTE(pos); + unsigned int last_byte = start + BIT_BYTE(pos + len - 1); + const bool same_byte = (first_byte == last_byte); + u8 mask = BITMAP_FIRST_BYTE_MASK(pos); u8 *kaddr; - struct page *page; - unsigned long i; - size_t offset; - const unsigned int size = pos + len; - int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE); - u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos); - eb_bitmap_offset(eb, start, pos, &i, &offset); - page = eb->pages[i]; - assert_eb_page_uptodate(eb, page); - kaddr = page_address(page); + if (same_byte) + mask &= BITMAP_LAST_BYTE_MASK(pos + len); - while (len >= bits_to_clear) { - kaddr[offset] &= ~mask_to_clear; - len -= bits_to_clear; - bits_to_clear = BITS_PER_BYTE; - mask_to_clear = ~0; - if (++offset >= PAGE_SIZE && len > 0) { - offset = 0; - page = eb->pages[++i]; - assert_eb_page_uptodate(eb, page); - kaddr = page_address(page); - } - } - if (len) { - mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); - kaddr[offset] &= ~mask_to_clear; - } + /* Handle the first byte. */ + kaddr = extent_buffer_get_byte(eb, first_byte); + *kaddr &= ~mask; + if (same_byte) + return; + + /* Handle the byte aligned part. */ + ASSERT(first_byte + 1 <= last_byte); + memset_extent_buffer(eb, 0, first_byte + 1, last_byte - first_byte - 1); + + /* Handle the last byte. */ + kaddr = extent_buffer_get_byte(eb, last_byte); + *kaddr &= ~BITMAP_LAST_BYTE_MASK(pos + len); } static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) @@ -4498,60 +4389,29 @@ static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned return distance < len; } -static void copy_pages(struct page *dst_page, struct page *src_page, - unsigned long dst_off, unsigned long src_off, - unsigned long len) -{ - char *dst_kaddr = page_address(dst_page); - char *src_kaddr; - int must_memmove = 0; - - if (dst_page != src_page) { - src_kaddr = page_address(src_page); - } else { - src_kaddr = dst_kaddr; - if (areas_overlap(src_off, dst_off, len)) - must_memmove = 1; - } - - if (must_memmove) - memmove(dst_kaddr + dst_off, src_kaddr + src_off, len); - else - memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); -} - void memcpy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - size_t cur; - size_t dst_off_in_page; - size_t src_off_in_page; - unsigned long dst_i; - unsigned long src_i; + unsigned long cur_off = 0; if (check_eb_range(dst, dst_offset, len) || check_eb_range(dst, src_offset, len)) return; - while (len > 0) { - dst_off_in_page = get_eb_offset_in_page(dst, dst_offset); - src_off_in_page = get_eb_offset_in_page(dst, src_offset); - - dst_i = get_eb_page_index(dst_offset); - src_i = get_eb_page_index(src_offset); - - cur = min(len, (unsigned long)(PAGE_SIZE - - src_off_in_page)); - cur = min_t(unsigned long, cur, - (unsigned long)(PAGE_SIZE - dst_off_in_page)); - - copy_pages(dst->pages[dst_i], dst->pages[src_i], - dst_off_in_page, src_off_in_page, cur); - - src_offset += cur; - dst_offset += cur; - len -= cur; + while (cur_off < len) { + unsigned long cur_src = cur_off + src_offset; + unsigned long pg_index = get_eb_page_index(cur_src); + unsigned long pg_off = get_eb_offset_in_page(dst, cur_src); + unsigned long cur_len = min(src_offset + len - cur_src, + PAGE_SIZE - pg_off); + void *src_addr = page_address(dst->pages[pg_index]) + pg_off; + const bool use_memmove = areas_overlap(src_offset + cur_off, + dst_offset + cur_off, cur_len); + + __write_extent_buffer(dst, src_addr, dst_offset + cur_off, cur_len, + use_memmove); + cur_off += cur_len; } } @@ -4559,23 +4419,26 @@ void memmove_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - size_t cur; - size_t dst_off_in_page; - size_t src_off_in_page; unsigned long dst_end = dst_offset + len - 1; unsigned long src_end = src_offset + len - 1; - unsigned long dst_i; - unsigned long src_i; if (check_eb_range(dst, dst_offset, len) || check_eb_range(dst, src_offset, len)) return; + if (dst_offset < src_offset) { memcpy_extent_buffer(dst, dst_offset, src_offset, len); return; } + while (len > 0) { - dst_i = get_eb_page_index(dst_end); + unsigned long src_i; + size_t cur; + size_t dst_off_in_page; + size_t src_off_in_page; + void *src_addr; + bool use_memmove; + src_i = get_eb_page_index(src_end); dst_off_in_page = get_eb_offset_in_page(dst, dst_end); @@ -4583,9 +4446,14 @@ void memmove_extent_buffer(const struct extent_buffer *dst, cur = min_t(unsigned long, len, src_off_in_page + 1); cur = min(cur, dst_off_in_page + 1); - copy_pages(dst->pages[dst_i], dst->pages[src_i], - dst_off_in_page - cur + 1, - src_off_in_page - cur + 1, cur); + + src_addr = page_address(dst->pages[src_i]) + src_off_in_page - + cur + 1; + use_memmove = areas_overlap(src_end - cur + 1, dst_end - cur + 1, + cur); + + __write_extent_buffer(dst, src_addr, dst_end - cur + 1, cur, + use_memmove); dst_end -= cur; src_end -= cur; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index c5fae3a7d911..68368ba99321 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -40,7 +40,6 @@ enum { ENUM_BIT(PAGE_START_WRITEBACK), ENUM_BIT(PAGE_END_WRITEBACK), ENUM_BIT(PAGE_SET_ORDERED), - ENUM_BIT(PAGE_LOCK), }; /* @@ -94,6 +93,13 @@ struct extent_buffer { #endif }; +struct btrfs_eb_write_context { + struct writeback_control *wbc; + struct extent_buffer *eb; + /* Block group @eb resides in. Only used for zoned mode. */ + struct btrfs_block_group *zoned_bg; +}; + /* * Get the correct offset inside the page of extent buffer. * @@ -178,8 +184,9 @@ int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); int btrfs_read_folio(struct file *file, struct folio *folio); -int extent_write_locked_range(struct inode *inode, u64 start, u64 end, - struct writeback_control *wbc); +void extent_write_locked_range(struct inode *inode, struct page *locked_page, + u64 start, u64 end, struct writeback_control *wbc, + bool pages_dirty); int extent_writepages(struct address_space *mapping, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping, @@ -236,11 +243,24 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dst, int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, void __user *dst, unsigned long start, unsigned long len); -void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *src); -void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb, - const void *src); void write_extent_buffer(const struct extent_buffer *eb, const void *src, unsigned long start, unsigned long len); + +static inline void write_extent_buffer_chunk_tree_uuid( + const struct extent_buffer *eb, const void *chunk_tree_uuid) +{ + write_extent_buffer(eb, chunk_tree_uuid, + offsetof(struct btrfs_header, chunk_tree_uuid), + BTRFS_FSID_SIZE); +} + +static inline void write_extent_buffer_fsid(const struct extent_buffer *eb, + const void *fsid) +{ + write_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid), + BTRFS_FSID_SIZE); +} + void copy_extent_buffer_full(const struct extent_buffer *dst, const struct extent_buffer *src); void copy_extent_buffer(const struct extent_buffer *dst, @@ -266,7 +286,6 @@ void set_extent_buffer_dirty(struct extent_buffer *eb); void set_extent_buffer_uptodate(struct extent_buffer *eb); void clear_extent_buffer_uptodate(struct extent_buffer *eb); void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); -void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, u32 bits_to_clear, unsigned long page_ops); @@ -277,8 +296,6 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); -void end_extent_writepage(struct page *page, int err, u64 start, u64 end); - #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, struct page *locked_page, u64 *start, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 696bf695d8eb..1ce5dd154499 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -597,29 +597,37 @@ fail: * Each bit represents a sector. Thus caller should ensure @csum_buf passed * in is large enough to contain all csums. */ -int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, - u8 *csum_buf, unsigned long *csum_bitmap, - bool search_commit) +int btrfs_lookup_csums_bitmap(struct btrfs_root *root, struct btrfs_path *path, + u64 start, u64 end, u8 *csum_buf, + unsigned long *csum_bitmap) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; - struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_csum_item *item; const u64 orig_start = start; + bool free_path = false; int ret; ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(end + 1, fs_info->sectorsize)); - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (!path) { + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + free_path = true; + } - if (search_commit) { - path->skip_locking = 1; - path->reada = READA_FORWARD; - path->search_commit_root = 1; + /* Check if we can reuse the previous path. */ + if (path->nodes[0]) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && + key.type == BTRFS_EXTENT_CSUM_KEY && + key.offset <= start) + goto search_forward; + btrfs_release_path(path); } key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; @@ -656,6 +664,7 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, } } +search_forward: while (start <= end) { u64 csum_end; @@ -712,7 +721,8 @@ int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, } ret = 0; fail: - btrfs_free_path(path); + if (free_path) + btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 4ec669b69008..04bd2d34efb1 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -57,9 +57,9 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit, bool nowait); -int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, - u8 *csum_buf, unsigned long *csum_bitmap, - bool search_commit); +int btrfs_lookup_csums_bitmap(struct btrfs_root *root, struct btrfs_path *path, + u64 start, u64 end, u8 *csum_buf, + unsigned long *csum_bitmap); void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, struct btrfs_file_extent_item *fi, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index fd03e689a6be..ca46a529d56b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -876,9 +876,9 @@ static int prepare_uptodate_page(struct inode *inode, return 0; } -static unsigned int get_prepare_fgp_flags(bool nowait) +static fgf_t get_prepare_fgp_flags(bool nowait) { - unsigned int fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT; + fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT; if (nowait) fgp_flags |= FGP_NOWAIT; @@ -910,7 +910,7 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages, int i; unsigned long index = pos >> PAGE_SHIFT; gfp_t mask = get_prepare_gfp_flags(inode, nowait); - unsigned int fgp_flags = get_prepare_fgp_flags(nowait); + fgf_t fgp_flags = get_prepare_fgp_flags(nowait); int err = 0; int faili; @@ -1106,24 +1106,6 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode) btrfs_drew_write_unlock(&inode->root->snapshot_lock); } -static void update_time_for_write(struct inode *inode) -{ - struct timespec64 now; - - if (IS_NOCMTIME(inode)) - return; - - now = current_time(inode); - if (!timespec64_equal(&inode->i_mtime, &now)) - inode->i_mtime = now; - - if (!timespec64_equal(&inode->i_ctime, &now)) - inode->i_ctime = now; - - if (IS_I_VERSION(inode)) - inode_inc_iversion(inode); -} - static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, size_t count) { @@ -1155,7 +1137,10 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, * need to start yet another transaction to update the inode as we will * update the inode when we finish writing whatever data we write. */ - update_time_for_write(inode); + if (!IS_NOCMTIME(inode)) { + inode->i_mtime = inode_set_ctime_current(inode); + inode_inc_iversion(inode); + } start_pos = round_down(pos, fs_info->sectorsize); oldsize = i_size_read(inode); @@ -2459,10 +2444,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, */ inode_inc_iversion(&inode->vfs_inode); - if (!extent_info || extent_info->update_times) { - inode->vfs_inode.i_mtime = current_time(&inode->vfs_inode); - inode->vfs_inode.i_ctime = inode->vfs_inode.i_mtime; - } + if (!extent_info || extent_info->update_times) + inode->vfs_inode.i_mtime = inode_set_ctime_current(&inode->vfs_inode); ret = btrfs_update_inode(trans, root, inode); if (ret) @@ -2703,8 +2686,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) ASSERT(trans != NULL); inode_inc_iversion(inode); - inode->i_mtime = current_time(inode); - inode->i_ctime = inode->i_mtime; + inode->i_mtime = inode_set_ctime_current(inode); ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); updated_inode = true; btrfs_end_transaction(trans); @@ -2721,11 +2703,10 @@ out_only_mutex: * for detecting, at fsync time, if the inode isn't yet in the * log tree or it's there but not up to date. */ - struct timespec64 now = current_time(inode); + struct timespec64 now = inode_set_ctime_current(inode); inode_inc_iversion(inode); inode->i_mtime = now; - inode->i_ctime = now; trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -2796,7 +2777,7 @@ static int btrfs_fallocate_update_isize(struct inode *inode, if (IS_ERR(trans)) return PTR_ERR(trans); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); i_size_write(inode, end); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); @@ -3018,7 +2999,7 @@ static long btrfs_fallocate(struct file *file, int mode, struct extent_changeset *data_reserved = NULL; struct falloc_range *range; struct falloc_range *tmp; - struct list_head reserve_list; + LIST_HEAD(reserve_list); u64 cur_offset; u64 last_byte; u64 alloc_start; @@ -3110,7 +3091,6 @@ static long btrfs_fallocate(struct file *file, int mode, btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end); /* First, check if we exceed the qgroup limit */ - INIT_LIST_HEAD(&reserve_list); while (cur_offset < alloc_end) { em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, alloc_end - cur_offset); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 880800418075..27fad70451aa 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1219,10 +1219,9 @@ static noinline_for_stack int write_pinned_extent_entries( start = block_group->start; while (start < block_group->start + block_group->length) { - ret = find_first_extent_bit(unpin, start, - &extent_start, &extent_end, - EXTENT_DIRTY, NULL); - if (ret) + if (!find_first_extent_bit(unpin, start, + &extent_start, &extent_end, + EXTENT_DIRTY, NULL)) return 0; /* This pinned extent is out of our range */ @@ -2705,13 +2704,8 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); spin_lock(&ctl->tree_lock); - /* Count initial region as zone_unusable until it gets activated. */ if (!used) to_free = size; - else if (initial && - test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &block_group->fs_info->flags) && - (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))) - to_free = 0; else if (initial) to_free = block_group->zone_capacity; else if (offset >= block_group->alloc_offset) @@ -2739,8 +2733,7 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, reclaimable_unusable = block_group->zone_unusable - (block_group->length - block_group->zone_capacity); /* All the region is now unusable. Mark it as unused and reclaim */ - if (block_group->zone_unusable == block_group->length && - block_group->alloc_offset) { + if (block_group->zone_unusable == block_group->length) { btrfs_mark_bg_unused(block_group); } else if (bg_reclaim_threshold && reclaimable_unusable >= @@ -2944,7 +2937,8 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group, btrfs_info(fs_info, "block group has cluster?: %s", list_empty(&block_group->cluster_list) ? "no" : "yes"); btrfs_info(fs_info, - "%d blocks of free space at or bigger than bytes is", count); + "%d free space entries at or bigger than %llu bytes", + count, bytes); } void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group, diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index f169378e2ca6..c0e734082dcc 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1517,8 +1517,10 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, } else if (prev_bit == 1 && bit == 0) { u64 space_added; - ret = add_new_free_space(block_group, extent_start, - offset, &space_added); + ret = btrfs_add_new_free_space(block_group, + extent_start, + offset, + &space_added); if (ret) goto out; total_found += space_added; @@ -1533,7 +1535,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, } } if (prev_bit == 1) { - ret = add_new_free_space(block_group, extent_start, end, NULL); + ret = btrfs_add_new_free_space(block_group, extent_start, end, NULL); if (ret) goto out; extent_count++; @@ -1590,8 +1592,9 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY); ASSERT(key.objectid < end && key.objectid + key.offset <= end); - ret = add_new_free_space(block_group, key.objectid, - key.objectid + key.offset, &space_added); + ret = btrfs_add_new_free_space(block_group, key.objectid, + key.objectid + key.offset, + &space_added); if (ret) goto out; total_found += space_added; diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 203d2a267828..a523d64d5491 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -46,8 +46,6 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); * Runtime (in-memory) states of filesystem */ enum { - /* Global indicator of serious filesystem errors */ - BTRFS_FS_STATE_ERROR, /* * Filesystem is being remounted, allow to skip some operations, like * defrag @@ -686,6 +684,12 @@ struct btrfs_fs_info { bool qgroup_rescan_running; u8 qgroup_drop_subtree_thres; + /* + * If this is not 0, then it indicates a serious filesystem error has + * happened and it contains that error (negative errno value). + */ + int fs_error; + /* Filesystem state */ unsigned long fs_state; @@ -766,6 +770,9 @@ struct btrfs_fs_info { u64 data_reloc_bg; struct mutex zoned_data_reloc_io_lock; + struct btrfs_block_group *active_meta_bg; + struct btrfs_block_group *active_system_bg; + u64 nr_global_roots; spinlock_t zone_active_bgs_lock; @@ -962,8 +969,8 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); } -#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ - &(fs_info)->fs_state))) +#define BTRFS_FS_ERROR(fs_info) (READ_ONCE((fs_info)->fs_error)) + #define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ &(fs_info)->fs_state))) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index aa090b0b5d29..f09fbdc43f0f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -124,11 +124,11 @@ static struct kmem_cache *btrfs_inode_cachep; static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback); -static noinline int cow_file_range(struct btrfs_inode *inode, - struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, int unlock, - u64 *done_offset); + +static noinline int run_delalloc_cow(struct btrfs_inode *inode, + struct page *locked_page, u64 start, + u64 end, struct writeback_control *wbc, + bool pages_dirty); static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, u64 orig_block_len, @@ -423,11 +423,10 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, while (index <= end_index) { /* - * For locked page, we will call end_extent_writepage() on it - * in run_delalloc_range() for the error handling. That - * end_extent_writepage() function will call - * btrfs_mark_ordered_io_finished() to clear page Ordered and - * run the ordered extent accounting. + * For locked page, we will call btrfs_mark_ordered_io_finished + * through btrfs_mark_ordered_io_finished() on it + * in run_delalloc_range() for the error handling, which will + * clear page Ordered and run the ordered extent accounting. * * Here we can't just clear the Ordered bit, or * btrfs_mark_ordered_io_finished() would skip the accounting @@ -815,24 +814,22 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, } /* - * we create compressed extents in two phases. The first - * phase compresses a range of pages that have already been - * locked (both pages and state bits are locked). + * Work queue call back to started compression on a file and pages. * - * This is done inside an ordered work queue, and the compression - * is spread across many cpus. The actual IO submission is step - * two, and the ordered work queue takes care of making sure that - * happens in the same order things were put onto the queue by - * writepages and friends. + * This is done inside an ordered work queue, and the compression is spread + * across many cpus. The actual IO submission is step two, and the ordered work + * queue takes care of making sure that happens in the same order things were + * put onto the queue by writepages and friends. * - * If this code finds it can't get good compression, it puts an - * entry onto the work queue to write the uncompressed bytes. This - * makes sure that both compressed inodes and uncompressed inodes - * are written in the same order that the flusher thread sent them - * down. + * If this code finds it can't get good compression, it puts an entry onto the + * work queue to write the uncompressed bytes. This makes sure that both + * compressed inodes and uncompressed inodes are written in the same order that + * the flusher thread sent them down. */ -static noinline int compress_file_range(struct async_chunk *async_chunk) +static void compress_file_range(struct btrfs_work *work) { + struct async_chunk *async_chunk = + container_of(work, struct async_chunk, work); struct btrfs_inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct address_space *mapping = inode->vfs_inode.i_mapping; @@ -842,19 +839,24 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) u64 actual_end; u64 i_size; int ret = 0; - struct page **pages = NULL; + struct page **pages; unsigned long nr_pages; unsigned long total_compressed = 0; unsigned long total_in = 0; + unsigned int poff; int i; - int will_compress; int compress_type = fs_info->compress_type; - int compressed_extents = 0; - int redirty = 0; inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); /* + * We need to call clear_page_dirty_for_io on each page in the range. + * Otherwise applications with the file mmap'd can wander in and change + * the page contents while we are compressing them. + */ + extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); + + /* * We need to save i_size before now because it could change in between * us evaluating the size and assigning it. This is because we lock and * unlock the page in truncate and fallocate, and then modify the i_size @@ -868,7 +870,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) barrier(); actual_end = min_t(u64, i_size, end + 1); again: - will_compress = 0; + pages = NULL; nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES); @@ -912,78 +914,57 @@ again: ret = 0; /* - * we do compression for mount -o compress and when the - * inode has not been flagged as nocompress. This flag can - * change at any time if we discover bad compression ratios. + * We do compression for mount -o compress and when the inode has not + * been flagged as NOCOMPRESS. This flag can change at any time if we + * discover bad compression ratios. */ - if (inode_need_compress(inode, start, end)) { - WARN_ON(pages); - pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (!pages) { - /* just bail out to the uncompressed code */ - nr_pages = 0; - goto cont; - } - - if (inode->defrag_compress) - compress_type = inode->defrag_compress; - else if (inode->prop_compress) - compress_type = inode->prop_compress; + if (!inode_need_compress(inode, start, end)) + goto cleanup_and_bail_uncompressed; + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) { /* - * we need to call clear_page_dirty_for_io on each - * page in the range. Otherwise applications with the file - * mmap'd can wander in and change the page contents while - * we are compressing them. - * - * If the compression fails for any reason, we set the pages - * dirty again later on. - * - * Note that the remaining part is redirtied, the start pointer - * has moved, the end is the original one. + * Memory allocation failure is not a fatal error, we can fall + * back to uncompressed code. */ - if (!redirty) { - extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); - redirty = 1; - } + goto cleanup_and_bail_uncompressed; + } - /* Compression level is applied here and only here */ - ret = btrfs_compress_pages( - compress_type | (fs_info->compress_level << 4), - mapping, start, - pages, - &nr_pages, - &total_in, - &total_compressed); + if (inode->defrag_compress) + compress_type = inode->defrag_compress; + else if (inode->prop_compress) + compress_type = inode->prop_compress; + + /* Compression level is applied here. */ + ret = btrfs_compress_pages(compress_type | (fs_info->compress_level << 4), + mapping, start, pages, &nr_pages, &total_in, + &total_compressed); + if (ret) + goto mark_incompressible; - if (!ret) { - unsigned long offset = offset_in_page(total_compressed); - struct page *page = pages[nr_pages - 1]; + /* + * Zero the tail end of the last page, as we might be sending it down + * to disk. + */ + poff = offset_in_page(total_compressed); + if (poff) + memzero_page(pages[nr_pages - 1], poff, PAGE_SIZE - poff); - /* zero the tail end of the last page, we might be - * sending it down to disk - */ - if (offset) - memzero_page(page, offset, PAGE_SIZE - offset); - will_compress = 1; - } - } -cont: /* + * Try to create an inline extent. + * + * If we didn't compress the entire range, try to create an uncompressed + * inline extent, else a compressed one. + * * Check cow_file_range() for why we don't even try to create inline - * extent for subpage case. + * extent for the subpage case. */ if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { - /* lets try to make an inline extent */ - if (ret || total_in < actual_end) { - /* we didn't compress the entire range, try - * to make an uncompressed inline extent. - */ - ret = cow_file_range_inline(inode, actual_end, - 0, BTRFS_COMPRESS_NONE, - NULL, false); + if (total_in < actual_end) { + ret = cow_file_range_inline(inode, actual_end, 0, + BTRFS_COMPRESS_NONE, NULL, + false); } else { - /* try making a compressed inline extent */ ret = cow_file_range_inline(inode, actual_end, total_compressed, compress_type, pages, @@ -1013,99 +994,52 @@ cont: PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); - - /* - * Ensure we only free the compressed pages if we have - * them allocated, as we can still reach here with - * inode_need_compress() == false. - */ - if (pages) { - for (i = 0; i < nr_pages; i++) { - WARN_ON(pages[i]->mapping); - put_page(pages[i]); - } - kfree(pages); - } - return 0; + goto free_pages; } } - if (will_compress) { - /* - * we aren't doing an inline extent round the compressed size - * up to a block size boundary so the allocator does sane - * things - */ - total_compressed = ALIGN(total_compressed, blocksize); + /* + * We aren't doing an inline extent. Round the compressed size up to a + * block size boundary so the allocator does sane things. + */ + total_compressed = ALIGN(total_compressed, blocksize); - /* - * one last check to make sure the compression is really a - * win, compare the page count read with the blocks on disk, - * compression must free at least one sector size - */ - total_in = round_up(total_in, fs_info->sectorsize); - if (total_compressed + blocksize <= total_in) { - compressed_extents++; + /* + * One last check to make sure the compression is really a win, compare + * the page count read with the blocks on disk, compression must free at + * least one sector. + */ + total_in = round_up(total_in, fs_info->sectorsize); + if (total_compressed + blocksize > total_in) + goto mark_incompressible; - /* - * The async work queues will take care of doing actual - * allocation on disk for these compressed pages, and - * will submit them to the elevator. - */ - add_async_extent(async_chunk, start, total_in, - total_compressed, pages, nr_pages, - compress_type); - - if (start + total_in < end) { - start += total_in; - pages = NULL; - cond_resched(); - goto again; - } - return compressed_extents; - } + /* + * The async work queues will take care of doing actual allocation on + * disk for these compressed pages, and will submit the bios. + */ + add_async_extent(async_chunk, start, total_in, total_compressed, pages, + nr_pages, compress_type); + if (start + total_in < end) { + start += total_in; + cond_resched(); + goto again; } + return; + +mark_incompressible: + if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress) + inode->flags |= BTRFS_INODE_NOCOMPRESS; +cleanup_and_bail_uncompressed: + add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, + BTRFS_COMPRESS_NONE); +free_pages: if (pages) { - /* - * the compression code ran but failed to make things smaller, - * free any pages it allocated and our page pointer array - */ for (i = 0; i < nr_pages; i++) { WARN_ON(pages[i]->mapping); put_page(pages[i]); } kfree(pages); - pages = NULL; - total_compressed = 0; - nr_pages = 0; - - /* flag the file so we don't compress in the future */ - if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && - !(inode->prop_compress)) { - inode->flags |= BTRFS_INODE_NOCOMPRESS; - } - } -cleanup_and_bail_uncompressed: - /* - * No compression, but we still need to write the pages in the file - * we've been given so far. redirty the locked page if it corresponds - * to our extent and set things up for the async work queue to run - * cow_file_range to do the normal delalloc dance. - */ - if (async_chunk->locked_page && - (page_offset(async_chunk->locked_page) >= start && - page_offset(async_chunk->locked_page)) <= end) { - __set_page_dirty_nobuffers(async_chunk->locked_page); - /* unlocked later on in the async handlers */ } - - if (redirty) - extent_range_redirty_for_io(&inode->vfs_inode, start, end); - add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, - BTRFS_COMPRESS_NONE); - compressed_extents++; - - return compressed_extents; } static void free_async_extent_pages(struct async_extent *async_extent) @@ -1124,14 +1058,12 @@ static void free_async_extent_pages(struct async_extent *async_extent) async_extent->pages = NULL; } -static int submit_uncompressed_range(struct btrfs_inode *inode, - struct async_extent *async_extent, - struct page *locked_page) +static void submit_uncompressed_range(struct btrfs_inode *inode, + struct async_extent *async_extent, + struct page *locked_page) { u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; - unsigned long nr_written = 0; - int page_started = 0; int ret; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, @@ -1140,45 +1072,33 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, .no_cgroup_owner = 1, }; - /* - * Call cow_file_range() to run the delalloc range directly, since we - * won't go to NOCOW or async path again. - * - * Also we call cow_file_range() with @unlock_page == 0, so that we - * can directly submit them without interruption. - */ - ret = cow_file_range(inode, locked_page, start, end, &page_started, - &nr_written, 0, NULL); - /* Inline extent inserted, page gets unlocked and everything is done */ - if (page_started) - return 0; - + wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); + ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false); + wbc_detach_inode(&wbc); if (ret < 0) { btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); if (locked_page) { const u64 page_start = page_offset(locked_page); - const u64 page_end = page_start + PAGE_SIZE - 1; set_page_writeback(locked_page); end_page_writeback(locked_page); - end_extent_writepage(locked_page, ret, page_start, page_end); + btrfs_mark_ordered_io_finished(inode, locked_page, + page_start, PAGE_SIZE, + !ret); + btrfs_page_clear_uptodate(inode->root->fs_info, + locked_page, page_start, + PAGE_SIZE); + mapping_set_error(locked_page->mapping, ret); unlock_page(locked_page); } - return ret; } - - /* All pages will be unlocked, including @locked_page */ - wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); - ret = extent_write_locked_range(&inode->vfs_inode, start, end, &wbc); - wbc_detach_inode(&wbc); - return ret; } -static int submit_one_async_extent(struct btrfs_inode *inode, - struct async_chunk *async_chunk, - struct async_extent *async_extent, - u64 *alloc_hint) +static void submit_one_async_extent(struct async_chunk *async_chunk, + struct async_extent *async_extent, + u64 *alloc_hint) { + struct btrfs_inode *inode = async_chunk->inode; struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -1206,9 +1126,8 @@ static int submit_one_async_extent(struct btrfs_inode *inode, } lock_extent(io_tree, start, end, NULL); - /* We have fall back to uncompressed write */ - if (!async_extent->pages) { - ret = submit_uncompressed_range(inode, async_extent, locked_page); + if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { + submit_uncompressed_range(inode, async_extent, locked_page); goto done; } @@ -1217,7 +1136,6 @@ static int submit_one_async_extent(struct btrfs_inode *inode, async_extent->compressed_size, 0, *alloc_hint, &ins, 1, 1); if (ret) { - free_async_extent_pages(async_extent); /* * Here we used to try again by going back to non-compressed * path for ENOSPC. But we can't reserve space even for @@ -1272,7 +1190,7 @@ done: if (async_chunk->blkcg_css) kthread_associate_blkcg(NULL); kfree(async_extent); - return ret; + return; out_free_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); @@ -1286,39 +1204,13 @@ out_free: PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); free_async_extent_pages(async_extent); - goto done; -} - -/* - * Phase two of compressed writeback. This is the ordered portion of the code, - * which only gets called in the order the work was queued. We walk all the - * async extents created by compress_file_range and send them down to the disk. - */ -static noinline void submit_compressed_extents(struct async_chunk *async_chunk) -{ - struct btrfs_inode *inode = async_chunk->inode; - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct async_extent *async_extent; - u64 alloc_hint = 0; - int ret = 0; - - while (!list_empty(&async_chunk->extents)) { - u64 extent_start; - u64 ram_size; - - async_extent = list_entry(async_chunk->extents.next, - struct async_extent, list); - list_del(&async_extent->list); - extent_start = async_extent->start; - ram_size = async_extent->ram_size; - - ret = submit_one_async_extent(inode, async_chunk, async_extent, - &alloc_hint); - btrfs_debug(fs_info, + if (async_chunk->blkcg_css) + kthread_associate_blkcg(NULL); + btrfs_debug(fs_info, "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d", - inode->root->root_key.objectid, - btrfs_ino(inode), extent_start, ram_size, ret); - } + root->root_key.objectid, btrfs_ino(inode), start, + async_extent->ram_size, ret); + kfree(async_extent); } static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, @@ -1362,25 +1254,18 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * locked_page is the page that writepage had locked already. We use * it to make sure we don't do extra locks or unlocks. * - * *page_started is set to one if we unlock locked_page and do everything - * required to start IO on it. It may be clean and already done with - * IO when we return. - * - * When unlock == 1, we unlock the pages in successfully allocated regions. - * When unlock == 0, we leave them locked for writing them out. + * When this function fails, it unlocks all pages except @locked_page. * - * However, we unlock all the pages except @locked_page in case of failure. + * When this function successfully creates an inline extent, it returns 1 and + * unlocks all pages including locked_page and starts I/O on them. + * (In reality inline extents are limited to a single page, so locked_page is + * the only page handled anyway). * - * In summary, page locking state will be as follow: + * When this function succeed and creates a normal extent, the page locking + * status depends on the passed in flags: * - * - page_started == 1 (return value) - * - All the pages are unlocked. IO is started. - * - Note that this can happen only on success - * - unlock == 1 - * - All the pages except @locked_page are unlocked in any case - * - unlock == 0 - * - On success, all the pages are locked for writing out them - * - On failure, all the pages except @locked_page are unlocked + * - If @keep_locked is set, all pages are kept locked. + * - Else all pages except for @locked_page are unlocked. * * When a failure happens in the second or later iteration of the * while-loop, the ordered extents created in previous iterations are kept @@ -1389,10 +1274,9 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * example. */ static noinline int cow_file_range(struct btrfs_inode *inode, - struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written, int unlock, - u64 *done_offset) + struct page *locked_page, u64 start, u64 end, + u64 *done_offset, + bool keep_locked, bool no_inline) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -1431,7 +1315,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * This means we can trigger inline extent even if we didn't want to. * So here we skip inline extent creation completely. */ - if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { + if (start == 0 && fs_info->sectorsize == PAGE_SIZE && !no_inline) { u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode), end + 1); @@ -1451,9 +1335,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); - *nr_written = *nr_written + - (end - start + PAGE_SIZE) / PAGE_SIZE; - *page_started = 1; /* * locked_page is locked by the caller of * writepage_delalloc(), not locked by @@ -1463,11 +1344,12 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * as it doesn't have any subpage::writers recorded. * * Here we manually unlock the page, since the caller - * can't use page_started to determine if it's an - * inline extent or a compressed extent. + * can't determine if it's an inline extent or a + * compressed extent. */ unlock_page(locked_page); - goto out; + ret = 1; + goto done; } else if (ret < 0) { goto out_unlock; } @@ -1498,6 +1380,31 @@ static noinline int cow_file_range(struct btrfs_inode *inode, ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, min_alloc_size, 0, alloc_hint, &ins, 1, 1); + if (ret == -EAGAIN) { + /* + * btrfs_reserve_extent only returns -EAGAIN for zoned + * file systems, which is an indication that there are + * no active zones to allocate from at the moment. + * + * If this is the first loop iteration, wait for at + * least one zone to finish before retrying the + * allocation. Otherwise ask the caller to write out + * the already allocated blocks before coming back to + * us, or return -ENOSPC if it can't handle retries. + */ + ASSERT(btrfs_is_zoned(fs_info)); + if (start == orig_start) { + wait_on_bit_io(&inode->root->fs_info->flags, + BTRFS_FS_NEED_ZONE_FINISH, + TASK_UNINTERRUPTIBLE); + continue; + } + if (done_offset) { + *done_offset = start - 1; + return 0; + } + ret = -ENOSPC; + } if (ret < 0) goto out_unlock; cur_alloc_size = ins.offset; @@ -1558,7 +1465,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * Do set the Ordered (Private2) bit so we know this page was * properly setup for writepage. */ - page_ops = unlock ? PAGE_UNLOCK : 0; + page_ops = (keep_locked ? 0 : PAGE_UNLOCK); page_ops |= PAGE_SET_ORDERED; extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, @@ -1581,7 +1488,9 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (ret) goto out_unlock; } -out: +done: + if (done_offset) + *done_offset = end; return ret; out_drop_extent_cache: @@ -1591,21 +1500,6 @@ out_reserve: btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_unlock: /* - * If done_offset is non-NULL and ret == -EAGAIN, we expect the - * caller to write out the successfully allocated region and retry. - */ - if (done_offset && ret == -EAGAIN) { - if (orig_start < start) - *done_offset = start - 1; - else - *done_offset = start; - return ret; - } else if (ret == -EAGAIN) { - /* Convert to -ENOSPC since the caller cannot retry. */ - ret = -ENOSPC; - } - - /* * Now, we have three regions to clean up: * * |-------(1)----|---(2)---|-------------(3)----------| @@ -1627,10 +1521,10 @@ out_unlock: * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup * function. * - * However, in case of unlock == 0, we still need to unlock the pages + * However, in case of @keep_locked, we still need to unlock the pages * (except @locked_page) to ensure all the pages are unlocked. */ - if (!unlock && orig_start < start) { + if (keep_locked && orig_start < start) { if (!locked_page) mapping_set_error(inode->vfs_inode.i_mapping, ret); extent_clear_unlock_delalloc(inode, orig_start, start - 1, @@ -1671,43 +1565,28 @@ out_unlock: } /* - * work queue call back to started compression on a file and pages - */ -static noinline void async_cow_start(struct btrfs_work *work) -{ - struct async_chunk *async_chunk; - int compressed_extents; - - async_chunk = container_of(work, struct async_chunk, work); - - compressed_extents = compress_file_range(async_chunk); - if (compressed_extents == 0) { - btrfs_add_delayed_iput(async_chunk->inode); - async_chunk->inode = NULL; - } -} - -/* - * work queue call back to submit previously compressed pages + * Phase two of compressed writeback. This is the ordered portion of the code, + * which only gets called in the order the work was queued. We walk all the + * async extents created by compress_file_range and send them down to the disk. */ -static noinline void async_cow_submit(struct btrfs_work *work) +static noinline void submit_compressed_extents(struct btrfs_work *work) { struct async_chunk *async_chunk = container_of(work, struct async_chunk, work); struct btrfs_fs_info *fs_info = btrfs_work_owner(work); + struct async_extent *async_extent; unsigned long nr_pages; + u64 alloc_hint = 0; nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >> PAGE_SHIFT; - /* - * ->inode could be NULL if async_chunk_start has failed to compress, - * in which case we don't have anything to submit, yet we need to - * always adjust ->async_delalloc_pages as its paired with the init - * happening in run_delalloc_compressed - */ - if (async_chunk->inode) - submit_compressed_extents(async_chunk); + while (!list_empty(&async_chunk->extents)) { + async_extent = list_entry(async_chunk->extents.next, + struct async_extent, list); + list_del(&async_extent->list); + submit_one_async_extent(async_chunk, async_extent, &alloc_hint); + } /* atomic_sub_return implies a barrier */ if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) < @@ -1721,8 +1600,7 @@ static noinline void async_cow_free(struct btrfs_work *work) struct async_cow *async_cow; async_chunk = container_of(work, struct async_chunk, work); - if (async_chunk->inode) - btrfs_add_delayed_iput(async_chunk->inode); + btrfs_add_delayed_iput(async_chunk->inode); if (async_chunk->blkcg_css) css_put(async_chunk->blkcg_css); @@ -1732,10 +1610,8 @@ static noinline void async_cow_free(struct btrfs_work *work) } static bool run_delalloc_compressed(struct btrfs_inode *inode, - struct writeback_control *wbc, - struct page *locked_page, - u64 start, u64 end, int *page_started, - unsigned long *nr_written) + struct page *locked_page, u64 start, + u64 end, struct writeback_control *wbc) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc); @@ -1809,65 +1685,42 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, async_chunk[i].blkcg_css = NULL; } - btrfs_init_work(&async_chunk[i].work, async_cow_start, - async_cow_submit, async_cow_free); + btrfs_init_work(&async_chunk[i].work, compress_file_range, + submit_compressed_extents, async_cow_free); nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE); atomic_add(nr_pages, &fs_info->async_delalloc_pages); btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work); - *nr_written += nr_pages; start = cur_end + 1; } - *page_started = 1; return true; } -static noinline int run_delalloc_zoned(struct btrfs_inode *inode, - struct page *locked_page, u64 start, - u64 end, int *page_started, - unsigned long *nr_written, - struct writeback_control *wbc) +/* + * Run the delalloc range from start to end, and write back any dirty pages + * covered by the range. + */ +static noinline int run_delalloc_cow(struct btrfs_inode *inode, + struct page *locked_page, u64 start, + u64 end, struct writeback_control *wbc, + bool pages_dirty) { u64 done_offset = end; int ret; - bool locked_page_done = false; while (start <= end) { - ret = cow_file_range(inode, locked_page, start, end, page_started, - nr_written, 0, &done_offset); - if (ret && ret != -EAGAIN) + ret = cow_file_range(inode, locked_page, start, end, &done_offset, + true, false); + if (ret) return ret; - - if (*page_started) { - ASSERT(ret == 0); - return 0; - } - - if (ret == 0) - done_offset = end; - - if (done_offset == start) { - wait_on_bit_io(&inode->root->fs_info->flags, - BTRFS_FS_NEED_ZONE_FINISH, - TASK_UNINTERRUPTIBLE); - continue; - } - - if (!locked_page_done) { - __set_page_dirty_nobuffers(locked_page); - account_page_redirty(locked_page); - } - locked_page_done = true; - extent_write_locked_range(&inode->vfs_inode, start, done_offset, - wbc); + extent_write_locked_range(&inode->vfs_inode, locked_page, start, + done_offset, wbc, pages_dirty); start = done_offset + 1; } - *page_started = 1; - - return 0; + return 1; } static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, @@ -1894,8 +1747,7 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, } static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, - const u64 start, const u64 end, - int *page_started, unsigned long *nr_written) + const u64 start, const u64 end) { const bool is_space_ino = btrfs_is_free_space_inode(inode); const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root); @@ -1903,6 +1755,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, struct extent_io_tree *io_tree = &inode->io_tree; u64 range_start = start; u64 count; + int ret; /* * If EXTENT_NORESERVE is set it means that when the buffered write was @@ -1955,8 +1808,14 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, NULL); } - return cow_file_range(inode, locked_page, start, end, page_started, - nr_written, 1, NULL); + /* + * Don't try to create inline extents, as a mix of inline extent that + * is written out and unlocked directly and a normal NOCOW extent + * doesn't work. + */ + ret = cow_file_range(inode, locked_page, start, end, NULL, false, true); + ASSERT(ret != 1); + return ret; } struct can_nocow_file_extent_args { @@ -2105,9 +1964,7 @@ static int can_nocow_file_extent(struct btrfs_path *path, */ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, struct page *locked_page, - const u64 start, const u64 end, - int *page_started, - unsigned long *nr_written) + const u64 start, const u64 end) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; @@ -2117,25 +1974,26 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, int ret; bool check_prev = true; u64 ino = btrfs_ino(inode); - struct btrfs_block_group *bg; - bool nocow = false; struct can_nocow_file_extent_args nocow_args = { 0 }; + /* + * Normally on a zoned device we're only doing COW writes, but in case + * of relocation on a zoned filesystem serializes I/O so that we're only + * writing sequentially and can end up here as well. + */ + ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root)); + path = btrfs_alloc_path(); if (!path) { - extent_clear_unlock_delalloc(inode, start, end, locked_page, - EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, PAGE_UNLOCK | - PAGE_START_WRITEBACK | - PAGE_END_WRITEBACK); - return -ENOMEM; + ret = -ENOMEM; + goto error; } nocow_args.end = end; nocow_args.writeback_path = true; while (1) { + struct btrfs_block_group *nocow_bg = NULL; struct btrfs_ordered_extent *ordered; struct btrfs_key found_key; struct btrfs_file_extent_item *fi; @@ -2146,8 +2004,6 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, int extent_type; bool is_prealloc; - nocow = false; - ret = btrfs_lookup_file_extent(NULL, root, path, ino, cur_offset, 0); if (ret < 0) @@ -2172,11 +2028,8 @@ next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); - if (ret < 0) { - if (cow_start != (u64)-1) - cur_offset = cow_start; + if (ret < 0) goto error; - } if (ret > 0) break; leaf = path->nodes[0]; @@ -2209,7 +2062,7 @@ next_slot: if (found_key.offset > cur_offset) { extent_end = found_key.offset; extent_type = 0; - goto out_check; + goto must_cow; } /* @@ -2239,24 +2092,22 @@ next_slot: nocow_args.start = cur_offset; ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args); - if (ret < 0) { - if (cow_start != (u64)-1) - cur_offset = cow_start; + if (ret < 0) goto error; - } else if (ret == 0) { - goto out_check; - } + if (ret == 0) + goto must_cow; ret = 0; - bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr); - if (bg) - nocow = true; -out_check: - /* - * If nocow is false then record the beginning of the range - * that needs to be COWed - */ - if (!nocow) { + nocow_bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr); + if (!nocow_bg) { +must_cow: + /* + * If we can't perform NOCOW writeback for the range, + * then record the beginning of the range that needs to + * be COWed. It will be written out before the next + * NOCOW range if we find one, or when exiting this + * loop. + */ if (cow_start == (u64)-1) cow_start = cur_offset; cur_offset = extent_end; @@ -2275,11 +2126,12 @@ out_check: */ if (cow_start != (u64)-1) { ret = fallback_to_cow(inode, locked_page, - cow_start, found_key.offset - 1, - page_started, nr_written); - if (ret) - goto error; + cow_start, found_key.offset - 1); cow_start = (u64)-1; + if (ret) { + btrfs_dec_nocow_writers(nocow_bg); + goto error; + } } nocow_end = cur_offset + nocow_args.num_bytes - 1; @@ -2296,6 +2148,7 @@ out_check: ram_bytes, BTRFS_COMPRESS_NONE, BTRFS_ORDERED_PREALLOC); if (IS_ERR(em)) { + btrfs_dec_nocow_writers(nocow_bg); ret = PTR_ERR(em); goto error; } @@ -2309,6 +2162,7 @@ out_check: ? (1 << BTRFS_ORDERED_PREALLOC) : (1 << BTRFS_ORDERED_NOCOW), BTRFS_COMPRESS_NONE); + btrfs_dec_nocow_writers(nocow_bg); if (IS_ERR(ordered)) { if (is_prealloc) { btrfs_drop_extent_map_range(inode, cur_offset, @@ -2318,11 +2172,6 @@ out_check: goto error; } - if (nocow) { - btrfs_dec_nocow_writers(bg); - nocow = false; - } - if (btrfs_is_data_reloc_root(root)) /* * Error handled later, as we must prevent @@ -2357,17 +2206,24 @@ out_check: if (cow_start != (u64)-1) { cur_offset = end; - ret = fallback_to_cow(inode, locked_page, cow_start, end, - page_started, nr_written); + ret = fallback_to_cow(inode, locked_page, cow_start, end); + cow_start = (u64)-1; if (ret) goto error; } -error: - if (nocow) - btrfs_dec_nocow_writers(bg); + btrfs_free_path(path); + return 0; - if (ret && cur_offset < end) +error: + /* + * If an error happened while a COW region is outstanding, cur_offset + * needs to be reset to cow_start to ensure the COW region is unlocked + * as well. + */ + if (cow_start != (u64)-1) + cur_offset = cow_start; + if (cur_offset < end) extent_clear_unlock_delalloc(inode, cur_offset, end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | @@ -2395,49 +2251,37 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) * being touched for the first time. */ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, unsigned long *nr_written, - struct writeback_control *wbc) + u64 start, u64 end, struct writeback_control *wbc) { - int ret = 0; const bool zoned = btrfs_is_zoned(inode->root->fs_info); + int ret; /* - * The range must cover part of the @locked_page, or the returned - * @page_started can confuse the caller. + * The range must cover part of the @locked_page, or a return of 1 + * can confuse the caller. */ ASSERT(!(end <= page_offset(locked_page) || start >= page_offset(locked_page) + PAGE_SIZE)); if (should_nocow(inode, start, end)) { - /* - * Normally on a zoned device we're only doing COW writes, but - * in case of relocation on a zoned filesystem we have taken - * precaution, that we're only writing sequentially. It's safe - * to use run_delalloc_nocow() here, like for regular - * preallocated inodes. - */ - ASSERT(!zoned || btrfs_is_data_reloc_root(inode->root)); - ret = run_delalloc_nocow(inode, locked_page, start, end, - page_started, nr_written); + ret = run_delalloc_nocow(inode, locked_page, start, end); goto out; } if (btrfs_inode_can_compress(inode) && inode_need_compress(inode, start, end) && - run_delalloc_compressed(inode, wbc, locked_page, start, - end, page_started, nr_written)) - goto out; + run_delalloc_compressed(inode, locked_page, start, end, wbc)) + return 1; if (zoned) - ret = run_delalloc_zoned(inode, locked_page, start, end, - page_started, nr_written, wbc); + ret = run_delalloc_cow(inode, locked_page, start, end, wbc, + true); else - ret = cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 1, NULL); + ret = cow_file_range(inode, locked_page, start, end, NULL, + false, false); out: - ASSERT(ret <= 0); - if (ret) + if (ret < 0) btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); return ret; @@ -2840,23 +2684,19 @@ struct btrfs_writepage_fixup { static void btrfs_writepage_fixup_worker(struct btrfs_work *work) { - struct btrfs_writepage_fixup *fixup; + struct btrfs_writepage_fixup *fixup = + container_of(work, struct btrfs_writepage_fixup, work); struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; - struct page *page; - struct btrfs_inode *inode; - u64 page_start; - u64 page_end; + struct page *page = fixup->page; + struct btrfs_inode *inode = fixup->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; + u64 page_start = page_offset(page); + u64 page_end = page_offset(page) + PAGE_SIZE - 1; int ret = 0; bool free_delalloc_space = true; - fixup = container_of(work, struct btrfs_writepage_fixup, work); - page = fixup->page; - inode = fixup->inode; - page_start = page_offset(page); - page_end = page_offset(page) + PAGE_SIZE - 1; - /* * This is similar to page_mkwrite, we need to reserve the space before * we take the page lock. @@ -2949,10 +2789,12 @@ out_page: * to reflect the errors and clean the page. */ mapping_set_error(page->mapping, ret); - end_extent_writepage(page, ret, page_start, page_end); + btrfs_mark_ordered_io_finished(inode, page, page_start, + PAGE_SIZE, !ret); + btrfs_page_clear_uptodate(fs_info, page, page_start, PAGE_SIZE); clear_page_dirty_for_io(page); } - btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE); + btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); unlock_page(page); put_page(page); kfree(fixup); @@ -3359,6 +3201,13 @@ out: btrfs_free_reserved_extent(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes, 1); + /* + * Actually free the qgroup rsv which was released when + * the ordered extent was created. + */ + btrfs_qgroup_free_refroot(fs_info, inode->root->root_key.objectid, + ordered_extent->qgroup_rsv, + BTRFS_QGROUP_RSV_DATA); } } @@ -3384,15 +3233,6 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) return btrfs_finish_one_ordered(ordered); } -void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, - struct page *page, u64 start, - u64 end, bool uptodate) -{ - trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate); - - btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, uptodate); -} - /* * Verify the checksum for a single sector without any extra action that depend * on the type of I/O. @@ -3662,9 +3502,16 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) */ if (found_key.offset == last_objectid) { + /* + * We found the same inode as before. This means we were + * not able to remove its items via eviction triggered + * by an iput(). A transaction abort may have happened, + * due to -ENOSPC for example, so try to grab the error + * that lead to a transaction abort, if any. + */ btrfs_err(fs_info, "Error removing orphan entry, stopping orphan cleanup"); - ret = -EINVAL; + ret = BTRFS_FS_ERROR(fs_info) ?: -EINVAL; goto out; } @@ -3917,8 +3764,8 @@ static int btrfs_read_locked_inode(struct inode *inode, inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime); inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime); - inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime); - inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime); + inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime), + btrfs_timespec_nsec(leaf, &inode_item->ctime)); BTRFS_I(inode)->i_otime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->otime); @@ -4089,9 +3936,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, inode->i_mtime.tv_nsec); btrfs_set_token_timespec_sec(&token, &item->ctime, - inode->i_ctime.tv_sec); + inode_get_ctime(inode).tv_sec); btrfs_set_token_timespec_nsec(&token, &item->ctime, - inode->i_ctime.tv_nsec); + inode_get_ctime(inode).tv_nsec); btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime.tv_sec); @@ -4289,9 +4136,8 @@ err: btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2); inode_inc_iversion(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); - inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode); - dir->vfs_inode.i_mtime = inode->vfs_inode.i_ctime; - dir->vfs_inode.i_ctime = inode->vfs_inode.i_ctime; + inode_set_ctime_current(&inode->vfs_inode); + dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode); ret = btrfs_update_inode(trans, root, dir); out: return ret; @@ -4464,8 +4310,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2); inode_inc_iversion(&dir->vfs_inode); - dir->vfs_inode.i_mtime = current_time(&dir->vfs_inode); - dir->vfs_inode.i_ctime = dir->vfs_inode.i_mtime; + dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode); ret = btrfs_update_inode_fallback(trans, root, dir); if (ret) btrfs_abort_transaction(trans, ret); @@ -5115,8 +4960,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) if (newsize != oldsize) { inode_inc_iversion(inode); if (!(mask & (ATTR_CTIME | ATTR_MTIME))) { - inode->i_mtime = current_time(inode); - inode->i_ctime = inode->i_mtime; + inode->i_mtime = inode_set_ctime_current(inode); } } @@ -5738,11 +5582,11 @@ struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root return btrfs_iget_path(s, ino, root, NULL); } -static struct inode *new_simple_dir(struct super_block *s, +static struct inode *new_simple_dir(struct inode *dir, struct btrfs_key *key, struct btrfs_root *root) { - struct inode *inode = new_inode(s); + struct inode *inode = new_inode(dir->i_sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -5760,10 +5604,11 @@ static struct inode *new_simple_dir(struct super_block *s, inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &simple_dir_operations; inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; - inode->i_mtime = current_time(inode); - inode->i_atime = inode->i_mtime; - inode->i_ctime = inode->i_mtime; + inode->i_mtime = inode_set_ctime_current(inode); + inode->i_atime = dir->i_atime; BTRFS_I(inode)->i_otime = inode->i_mtime; + inode->i_uid = dir->i_uid; + inode->i_gid = dir->i_gid; return inode; } @@ -5822,7 +5667,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (ret != -ENOENT) inode = ERR_PTR(ret); else - inode = new_simple_dir(dir->i_sb, &location, root); + inode = new_simple_dir(dir, &location, root); } else { inode = btrfs_iget(dir->i_sb, location.objectid, sub_root); btrfs_put_root(sub_root); @@ -6007,8 +5852,8 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) struct btrfs_key found_key; struct btrfs_path *path; void *addr; - struct list_head ins_list; - struct list_head del_list; + LIST_HEAD(ins_list); + LIST_HEAD(del_list); int ret; char *name_ptr; int name_len; @@ -6027,8 +5872,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) addr = private->filldir_buf; path->reada = READA_FORWARD; - INIT_LIST_HEAD(&ins_list); - INIT_LIST_HEAD(&del_list); put = btrfs_readdir_get_delayed_items(inode, private->last_index, &ins_list, &del_list); @@ -6165,8 +6008,7 @@ static int btrfs_dirty_inode(struct btrfs_inode *inode) * This is a copy of file_update_time. We need this so we can return error on * ENOSPC for updating the inode in the case of file write and mmap writes. */ -static int btrfs_update_time(struct inode *inode, struct timespec64 *now, - int flags) +static int btrfs_update_time(struct inode *inode, int flags) { struct btrfs_root *root = BTRFS_I(inode)->root; bool dirty = flags & ~S_VERSION; @@ -6174,14 +6016,7 @@ static int btrfs_update_time(struct inode *inode, struct timespec64 *now, if (btrfs_root_readonly(root)) return -EROFS; - if (flags & S_VERSION) - dirty |= inode_maybe_inc_iversion(inode, dirty); - if (flags & S_CTIME) - inode->i_ctime = *now; - if (flags & S_MTIME) - inode->i_mtime = *now; - if (flags & S_ATIME) - inode->i_atime = *now; + dirty = inode_update_timestamps(inode, flags); return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0; } @@ -6429,9 +6264,8 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, goto discard; } - inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); inode->i_atime = inode->i_mtime; - inode->i_ctime = inode->i_mtime; BTRFS_I(inode)->i_otime = inode->i_mtime; /* @@ -6596,12 +6430,10 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, * log replay procedure is responsible for setting them to their correct * values (the ones it had when the fsync was done). */ - if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) { - struct timespec64 now = current_time(&parent_inode->vfs_inode); + if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) + parent_inode->vfs_inode.i_mtime = + inode_set_ctime_current(&parent_inode->vfs_inode); - parent_inode->vfs_inode.i_mtime = now; - parent_inode->vfs_inode.i_ctime = now; - } ret = btrfs_update_inode(trans, root, parent_inode); if (ret) btrfs_abort_transaction(trans, ret); @@ -6741,7 +6573,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, BTRFS_I(inode)->dir_index = 0ULL; inc_nlink(inode); inode_inc_iversion(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); ihold(inode); set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); @@ -8807,7 +8639,7 @@ static int btrfs_getattr(struct mnt_idmap *idmap, STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(idmap, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; spin_lock(&BTRFS_I(inode)->lock); @@ -8831,7 +8663,6 @@ static int btrfs_rename_exchange(struct inode *old_dir, struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = new_dentry->d_inode; struct inode *old_inode = old_dentry->d_inode; - struct timespec64 ctime = current_time(old_inode); struct btrfs_rename_ctx old_rename_ctx; struct btrfs_rename_ctx new_rename_ctx; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); @@ -8962,12 +8793,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); inode_inc_iversion(new_inode); - old_dir->i_mtime = ctime; - old_dir->i_ctime = ctime; - new_dir->i_mtime = ctime; - new_dir->i_ctime = ctime; - old_inode->i_ctime = ctime; - new_inode->i_ctime = ctime; + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); if (old_dentry->d_parent != new_dentry->d_parent) { btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), @@ -9231,11 +9057,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); - old_dir->i_mtime = current_time(old_dir); - old_dir->i_ctime = old_dir->i_mtime; - new_dir->i_mtime = old_dir->i_mtime; - new_dir->i_ctime = old_dir->i_mtime; - old_inode->i_ctime = old_dir->i_mtime; + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), @@ -9257,7 +9079,6 @@ static int btrfs_rename(struct mnt_idmap *idmap, if (new_inode) { inode_inc_iversion(new_inode); - new_inode->i_ctime = current_time(new_inode); if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); @@ -9390,14 +9211,11 @@ static int start_delalloc_inodes(struct btrfs_root *root, struct btrfs_inode *binode; struct inode *inode; struct btrfs_delalloc_work *work, *next; - struct list_head works; - struct list_head splice; + LIST_HEAD(works); + LIST_HEAD(splice); int ret = 0; bool full_flush = wbc->nr_to_write == LONG_MAX; - INIT_LIST_HEAD(&works); - INIT_LIST_HEAD(&splice); - mutex_lock(&root->delalloc_mutex); spin_lock(&root->delalloc_lock); list_splice_init(&root->delalloc_inodes, &splice); @@ -9485,14 +9303,12 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, .range_end = LLONG_MAX, }; struct btrfs_root *root; - struct list_head splice; + LIST_HEAD(splice); int ret; if (BTRFS_FS_ERROR(fs_info)) return -EROFS; - INIT_LIST_HEAD(&splice); - mutex_lock(&fs_info->delalloc_root_mutex); spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); @@ -9797,7 +9613,7 @@ next: *alloc_hint = ins.objectid + ins.offset; inode_inc_iversion(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; if (!(mode & FALLOC_FL_KEEP_SIZE) && (actual_len > inode->i_size) && diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a895d105464b..a18ee7b5a166 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -384,7 +384,7 @@ update_flags: binode->flags = binode_flags; btrfs_sync_inode_flags_to_i_flags(inode); inode_inc_iversion(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); out_end_trans: diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index 23fc11af498a..7695decc7243 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -10,14 +10,13 @@ #ifdef CONFIG_PRINTK #define STATE_STRING_PREFACE ": state " -#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT) +#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT + 1) /* * Characters to print to indicate error conditions or uncommon filesystem state. * RO is not an error. */ static const char fs_state_chars[] = { - [BTRFS_FS_STATE_ERROR] = 'E', [BTRFS_FS_STATE_REMOUNTING] = 'M', [BTRFS_FS_STATE_RO] = 0, [BTRFS_FS_STATE_TRANS_ABORTED] = 'A', @@ -37,6 +36,11 @@ static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf) memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE)); curr += sizeof(STATE_STRING_PREFACE) - 1; + if (BTRFS_FS_ERROR(info)) { + *curr++ = 'E'; + states_printed = true; + } + for_each_set_bit(bit, &fs_state, sizeof(fs_state)) { WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT); if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) { @@ -155,7 +159,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function * Today we only save the error info to memory. Long term we'll also * send it down to the disk. */ - set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); + WRITE_ONCE(fs_info->fs_error, errno); /* Don't go through full error handling during mount. */ if (!(sb->s_flags & SB_BORN)) @@ -252,12 +256,6 @@ void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, } #endif -void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info) -{ - btrfs_err(fs_info, -"Unsupported V0 extent filesystem detected. Aborting. Please re-create your filesystem with a newer kernel"); -} - #if BITS_PER_LONG == 32 void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info) { diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index deedc1a168e2..1ae6f8e23e07 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -181,8 +181,6 @@ do { \ #define ASSERT(expr) (void)(expr) #endif -void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info); - __printf(5, 6) __cold void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index a629532283bc..b46ab348e8e5 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -410,6 +410,10 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, unsigned long flags; u64 cur = file_offset; + trace_btrfs_writepage_end_io_hook(inode, file_offset, + file_offset + num_bytes - 1, + uptodate); + spin_lock_irqsave(&tree->lock, flags); while (cur < file_offset + num_bytes) { u64 entry_end; @@ -736,11 +740,9 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, const u64 range_start, const u64 range_len) { struct btrfs_root *root; - struct list_head splice; + LIST_HEAD(splice); u64 done; - INIT_LIST_HEAD(&splice); - mutex_lock(&fs_info->ordered_operations_mutex); spin_lock(&fs_info->ordered_root_lock); list_splice_init(&fs_info->ordered_roots, &splice); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index aa06d9ca911d..0c93439e929f 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -95,8 +95,10 @@ static void print_extent_item(const struct extent_buffer *eb, int slot, int type int ref_index = 0; if (unlikely(item_size < sizeof(*ei))) { - btrfs_print_v0_err(eb->fs_info); - btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); + btrfs_err(eb->fs_info, + "unexpected extent item size, has %u expect >= %zu", + item_size, sizeof(*ei)); + btrfs_handle_fs_error(eb->fs_info, -EUCLEAN, NULL); } ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item); @@ -291,10 +293,6 @@ void btrfs_print_leaf(const struct extent_buffer *l) btrfs_file_extent_num_bytes(l, fi), btrfs_file_extent_ram_bytes(l, fi)); break; - case BTRFS_EXTENT_REF_V0_KEY: - btrfs_print_v0_err(fs_info); - btrfs_handle_fs_error(fs_info, -EINVAL, NULL); - break; case BTRFS_BLOCK_GROUP_ITEM_KEY: bi = btrfs_item_ptr(l, i, struct btrfs_block_group_item); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 2637d6b157ff..b99230db3c82 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3590,15 +3590,16 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info) * going to clear all tracking information for a clean start. */ - trans = btrfs_join_transaction(fs_info->fs_root); - if (IS_ERR(trans)) { + trans = btrfs_attach_transaction_barrier(fs_info->fs_root); + if (IS_ERR(trans) && trans != ERR_PTR(-ENOENT)) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; return PTR_ERR(trans); - } - ret = btrfs_commit_transaction(trans); - if (ret) { - fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; - return ret; + } else if (trans != ERR_PTR(-ENOENT)) { + ret = btrfs_commit_transaction(trans); + if (ret) { + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; + return ret; + } } qgroup_rescan_zero_tracking(fs_info); @@ -3757,9 +3758,11 @@ static int try_flush_qgroup(struct btrfs_root *root) goto out; btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); - trans = btrfs_join_transaction(root); + trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); + if (ret == -ENOENT) + ret = 0; goto out; } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 0249ea52bb80..3e014b9370a3 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -584,8 +584,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, if (last->operation == BTRFS_RBIO_PARITY_SCRUB) return 0; - if (last->operation == BTRFS_RBIO_REBUILD_MISSING || - last->operation == BTRFS_RBIO_READ_REBUILD) + if (last->operation == BTRFS_RBIO_READ_REBUILD) return 0; return 1; @@ -784,10 +783,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) spin_unlock(&rbio->bio_list_lock); spin_unlock(&h->lock); - if (next->operation == BTRFS_RBIO_READ_REBUILD) - start_async_work(next, recover_rbio_work_locked); - else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) { - steal_rbio(rbio, next); + if (next->operation == BTRFS_RBIO_READ_REBUILD) { start_async_work(next, recover_rbio_work_locked); } else if (next->operation == BTRFS_RBIO_WRITE) { steal_rbio(rbio, next); @@ -1517,11 +1513,11 @@ static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio, while ((bio = bio_list_pop(bio_list))) { bio->bi_end_io = raid_wait_read_end_io; - if (trace_raid56_scrub_read_recover_enabled()) { + if (trace_raid56_read_enabled()) { struct raid56_bio_trace_info trace_info = { 0 }; bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_scrub_read_recover(rbio, bio, &trace_info); + trace_raid56_read(rbio, bio, &trace_info); } submit_bio(bio); } @@ -1698,8 +1694,7 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio, * If we're rebuilding a read, we have to use pages from the * bio list if possible. */ - if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) { + if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); } else { sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); @@ -1763,8 +1758,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, * If we're rebuilding a read, we have to use pages from the * bio list if possible. */ - if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) { + if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); } else { sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); @@ -1897,8 +1891,7 @@ static int recover_sectors(struct btrfs_raid_bio *rbio) goto out; } - if (rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { + if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { spin_lock(&rbio->bio_list_lock); set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); spin_unlock(&rbio->bio_list_lock); @@ -2112,8 +2105,8 @@ static void fill_data_csums(struct btrfs_raid_bio *rbio) goto error; } - ret = btrfs_lookup_csums_bitmap(csum_root, start, start + len - 1, - rbio->csum_buf, rbio->csum_bitmap, false); + ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1, + rbio->csum_buf, rbio->csum_bitmap); if (ret < 0) goto error; if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits)) @@ -2198,11 +2191,11 @@ static void submit_write_bios(struct btrfs_raid_bio *rbio, while ((bio = bio_list_pop(bio_list))) { bio->bi_end_io = raid_wait_write_end_io; - if (trace_raid56_write_stripe_enabled()) { + if (trace_raid56_write_enabled()) { struct raid56_bio_trace_info trace_info = { 0 }; bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_write_stripe(rbio, bio, &trace_info); + trace_raid56_write(rbio, bio, &trace_info); } submit_bio(bio); } diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 0e84c9c9293f..45e6ff78316f 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -14,7 +14,6 @@ enum btrfs_rbio_ops { BTRFS_RBIO_WRITE, BTRFS_RBIO_READ_REBUILD, BTRFS_RBIO_PARITY_SCRUB, - BTRFS_RBIO_REBUILD_MISSING, }; struct btrfs_raid_bio { diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 0474bbe39da7..65d2bd6910f2 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -30,8 +30,7 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, inode_inc_iversion(inode); if (!no_time_update) { - inode->i_mtime = current_time(inode); - inode->i_ctime = inode->i_mtime; + inode->i_mtime = inode_set_ctime_current(inode); } /* * We round up to the block size at eof when determining which diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 46c3c1d57266..9951a0caf5bb 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3006,9 +3006,6 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, if (!page) return -ENOMEM; } - ret = set_page_extent_mapped(page); - if (ret < 0) - goto release_page; if (PageReadahead(page)) page_cache_async_readahead(inode->i_mapping, ra, NULL, @@ -3024,6 +3021,15 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, } } + /* + * We could have lost page private when we dropped the lock to read the + * page above, make sure we set_page_extent_mapped here so we have any + * of the subpage blocksize stuff we need in place. + */ + ret = set_page_extent_mapped(page); + if (ret < 0) + goto release_page; + page_start = page_offset(page); page_end = page_start + PAGE_SIZE - 1; @@ -3250,12 +3256,13 @@ static int add_tree_block(struct reloc_control *rc, if (type == BTRFS_TREE_BLOCK_REF_KEY) owner = btrfs_extent_inline_ref_offset(eb, iref); } - } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) { - btrfs_print_v0_err(eb->fs_info); - btrfs_handle_fs_error(eb->fs_info, -EINVAL, NULL); - return -EINVAL; } else { - BUG(); + btrfs_print_leaf(eb); + btrfs_err(rc->block_group->fs_info, + "unrecognized tree backref at tree block %llu slot %u", + eb->start, path->slots[0]); + btrfs_release_path(path); + return -EUCLEAN; } btrfs_release_path(path); @@ -3498,6 +3505,8 @@ int find_next_extent(struct reloc_control *rc, struct btrfs_path *path, last = rc->block_group->start + rc->block_group->length; while (1) { + bool block_found; + cond_resched(); if (rc->search_start >= last) { ret = 1; @@ -3548,11 +3557,11 @@ next: goto next; } - ret = find_first_extent_bit(&rc->processed_blocks, - key.objectid, &start, &end, - EXTENT_DIRTY, NULL); + block_found = find_first_extent_bit(&rc->processed_blocks, + key.objectid, &start, &end, + EXTENT_DIRTY, NULL); - if (ret == 0 && start <= key.objectid) { + if (block_found && start <= key.objectid) { btrfs_release_path(path); rc->search_start = end + 1; } else { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 7289f5bff397..b877203f1dc5 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -43,9 +43,20 @@ struct scrub_ctx; /* * The following value only influences the performance. * - * This determines the batch size for stripe submitted in one go. + * This detemines how many stripes would be submitted in one go, + * which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP). */ -#define SCRUB_STRIPES_PER_SCTX 8 /* That would be 8 64K stripe per-device. */ +#define SCRUB_STRIPES_PER_GROUP 8 + +/* + * How many groups we have for each sctx. + * + * This would be 8M per device, the same value as the old scrub in-flight bios + * size limit. + */ +#define SCRUB_GROUPS_PER_SCTX 16 + +#define SCRUB_TOTAL_STRIPES (SCRUB_GROUPS_PER_SCTX * SCRUB_STRIPES_PER_GROUP) /* * The following value times PAGE_SIZE needs to be large enough to match the @@ -172,9 +183,11 @@ struct scrub_stripe { }; struct scrub_ctx { - struct scrub_stripe stripes[SCRUB_STRIPES_PER_SCTX]; + struct scrub_stripe stripes[SCRUB_TOTAL_STRIPES]; struct scrub_stripe *raid56_data_stripes; struct btrfs_fs_info *fs_info; + struct btrfs_path extent_path; + struct btrfs_path csum_path; int first_free; int cur_stripe; atomic_t cancel_req; @@ -315,10 +328,10 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) if (!sctx) return; - for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) + for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) release_scrub_stripe(&sctx->stripes[i]); - kfree(sctx); + kvfree(sctx); } static void scrub_put_ctx(struct scrub_ctx *sctx) @@ -333,13 +346,20 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( struct scrub_ctx *sctx; int i; - sctx = kzalloc(sizeof(*sctx), GFP_KERNEL); + /* Since sctx has inline 128 stripes, it can go beyond 64K easily. Use + * kvzalloc(). + */ + sctx = kvzalloc(sizeof(*sctx), GFP_KERNEL); if (!sctx) goto nomem; refcount_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; sctx->fs_info = fs_info; - for (i = 0; i < SCRUB_STRIPES_PER_SCTX; i++) { + sctx->extent_path.search_commit_root = 1; + sctx->extent_path.skip_locking = 1; + sctx->csum_path.search_commit_root = 1; + sctx->csum_path.skip_locking = 1; + for (i = 0; i < SCRUB_TOTAL_STRIPES; i++) { int ret; ret = init_scrub_stripe(fs_info, &sctx->stripes[i]); @@ -970,6 +990,9 @@ skip: spin_unlock(&sctx->stat_lock); } +static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *stripe, + unsigned long write_bitmap, bool dev_replace); + /* * The main entrance for all read related scrub work, including: * @@ -978,13 +1001,16 @@ skip: * - Go through the remaining mirrors and try to read as large blocksize as * possible * - Go through all mirrors (including the failed mirror) sector-by-sector + * - Submit writeback for repaired sectors * - * Writeback does not happen here, it needs extra synchronization. + * Writeback for dev-replace does not happen here, it needs extra + * synchronization for zoned devices. */ static void scrub_stripe_read_repair_worker(struct work_struct *work) { struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work); - struct btrfs_fs_info *fs_info = stripe->bg->fs_info; + struct scrub_ctx *sctx = stripe->sctx; + struct btrfs_fs_info *fs_info = sctx->fs_info; int num_copies = btrfs_num_copies(fs_info, stripe->bg->start, stripe->bg->length); int mirror; @@ -1049,7 +1075,23 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work) goto out; } out: - scrub_stripe_report_errors(stripe->sctx, stripe); + /* + * Submit the repaired sectors. For zoned case, we cannot do repair + * in-place, but queue the bg to be relocated. + */ + if (btrfs_is_zoned(fs_info)) { + if (!bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) + btrfs_repair_one_zone(fs_info, sctx->stripes[0].bg->start); + } else if (!sctx->readonly) { + unsigned long repaired; + + bitmap_andnot(&repaired, &stripe->init_error_bitmap, + &stripe->error_bitmap, stripe->nr_sectors); + scrub_write_sectors(sctx, stripe, repaired, false); + wait_scrub_stripe_io(stripe); + } + + scrub_stripe_report_errors(sctx, stripe); set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state); wake_up(&stripe->repair_wait); } @@ -1262,7 +1304,6 @@ static int get_raid56_logic_offset(u64 physical, int num, /* Work out the disk rotation on this stripe-set */ rot = stripe_nr % map->num_stripes; - stripe_nr /= map->num_stripes; /* calculate which stripe this data locates */ rot += i; stripe_index = rot % map->num_stripes; @@ -1468,6 +1509,8 @@ static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe) * Return <0 for error. */ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, + struct btrfs_path *extent_path, + struct btrfs_path *csum_path, struct btrfs_device *dev, u64 physical, int mirror_num, u64 logical_start, u32 logical_len, @@ -1477,7 +1520,6 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, struct btrfs_root *extent_root = btrfs_extent_root(fs_info, bg->start); struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bg->start); const u64 logical_end = logical_start + logical_len; - struct btrfs_path path = { 0 }; u64 cur_logical = logical_start; u64 stripe_end; u64 extent_start; @@ -1493,14 +1535,13 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, /* The range must be inside the bg. */ ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); - path.search_commit_root = 1; - path.skip_locking = 1; - - ret = find_first_extent_item(extent_root, &path, logical_start, logical_len); + ret = find_first_extent_item(extent_root, extent_path, logical_start, + logical_len); /* Either error or not found. */ if (ret) goto out; - get_extent_info(&path, &extent_start, &extent_len, &extent_flags, &extent_gen); + get_extent_info(extent_path, &extent_start, &extent_len, &extent_flags, + &extent_gen); if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) stripe->nr_meta_extents++; if (extent_flags & BTRFS_EXTENT_FLAG_DATA) @@ -1528,7 +1569,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, /* Fill the extent info for the remaining sectors. */ while (cur_logical <= stripe_end) { - ret = find_first_extent_item(extent_root, &path, cur_logical, + ret = find_first_extent_item(extent_root, extent_path, cur_logical, stripe_end - cur_logical + 1); if (ret < 0) goto out; @@ -1536,7 +1577,7 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, ret = 0; break; } - get_extent_info(&path, &extent_start, &extent_len, + get_extent_info(extent_path, &extent_start, &extent_len, &extent_flags, &extent_gen); if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) stripe->nr_meta_extents++; @@ -1561,9 +1602,9 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, */ ASSERT(BITS_PER_LONG >= BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits); - ret = btrfs_lookup_csums_bitmap(csum_root, stripe->logical, - stripe_end, stripe->csums, - &csum_bitmap, true); + ret = btrfs_lookup_csums_bitmap(csum_root, csum_path, + stripe->logical, stripe_end, + stripe->csums, &csum_bitmap); if (ret < 0) goto out; if (ret > 0) @@ -1576,7 +1617,6 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, } set_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state); out: - btrfs_release_path(&path); return ret; } @@ -1654,6 +1694,28 @@ static bool stripe_has_metadata_error(struct scrub_stripe *stripe) return false; } +static void submit_initial_group_read(struct scrub_ctx *sctx, + unsigned int first_slot, + unsigned int nr_stripes) +{ + struct blk_plug plug; + + ASSERT(first_slot < SCRUB_TOTAL_STRIPES); + ASSERT(first_slot + nr_stripes <= SCRUB_TOTAL_STRIPES); + + scrub_throttle_dev_io(sctx, sctx->stripes[0].dev, + btrfs_stripe_nr_to_offset(nr_stripes)); + blk_start_plug(&plug); + for (int i = 0; i < nr_stripes; i++) { + struct scrub_stripe *stripe = &sctx->stripes[first_slot + i]; + + /* Those stripes should be initialized. */ + ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &stripe->state)); + scrub_submit_initial_read(sctx, stripe); + } + blk_finish_plug(&plug); +} + static int flush_scrub_stripes(struct scrub_ctx *sctx) { struct btrfs_fs_info *fs_info = sctx->fs_info; @@ -1666,11 +1728,11 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) ASSERT(test_bit(SCRUB_STRIPE_FLAG_INITIALIZED, &sctx->stripes[0].state)); - scrub_throttle_dev_io(sctx, sctx->stripes[0].dev, - btrfs_stripe_nr_to_offset(nr_stripes)); - for (int i = 0; i < nr_stripes; i++) { - stripe = &sctx->stripes[i]; - scrub_submit_initial_read(sctx, stripe); + /* Submit the stripes which are populated but not submitted. */ + if (nr_stripes % SCRUB_STRIPES_PER_GROUP) { + const int first_slot = round_down(nr_stripes, SCRUB_STRIPES_PER_GROUP); + + submit_initial_group_read(sctx, first_slot, nr_stripes - first_slot); } for (int i = 0; i < nr_stripes; i++) { @@ -1680,32 +1742,6 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) test_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state)); } - /* - * Submit the repaired sectors. For zoned case, we cannot do repair - * in-place, but queue the bg to be relocated. - */ - if (btrfs_is_zoned(fs_info)) { - for (int i = 0; i < nr_stripes; i++) { - stripe = &sctx->stripes[i]; - - if (!bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) { - btrfs_repair_one_zone(fs_info, - sctx->stripes[0].bg->start); - break; - } - } - } else if (!sctx->readonly) { - for (int i = 0; i < nr_stripes; i++) { - unsigned long repaired; - - stripe = &sctx->stripes[i]; - - bitmap_andnot(&repaired, &stripe->init_error_bitmap, - &stripe->error_bitmap, stripe->nr_sectors); - scrub_write_sectors(sctx, stripe, repaired, false); - } - } - /* Submit for dev-replace. */ if (sctx->is_dev_replace) { /* @@ -1750,28 +1786,40 @@ static void raid56_scrub_wait_endio(struct bio *bio) static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, struct btrfs_device *dev, int mirror_num, - u64 logical, u32 length, u64 physical) + u64 logical, u32 length, u64 physical, + u64 *found_logical_ret) { struct scrub_stripe *stripe; int ret; - /* No available slot, submit all stripes and wait for them. */ - if (sctx->cur_stripe >= SCRUB_STRIPES_PER_SCTX) { - ret = flush_scrub_stripes(sctx); - if (ret < 0) - return ret; - } + /* + * There should always be one slot left, as caller filling the last + * slot should flush them all. + */ + ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES); stripe = &sctx->stripes[sctx->cur_stripe]; - - /* We can queue one stripe using the remaining slot. */ scrub_reset_stripe(stripe); - ret = scrub_find_fill_first_stripe(bg, dev, physical, mirror_num, - logical, length, stripe); + ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path, + &sctx->csum_path, dev, physical, + mirror_num, logical, length, stripe); /* Either >0 as no more extents or <0 for error. */ if (ret) return ret; + if (found_logical_ret) + *found_logical_ret = stripe->logical; sctx->cur_stripe++; + + /* We filled one group, submit it. */ + if (sctx->cur_stripe % SCRUB_STRIPES_PER_GROUP == 0) { + const int first_slot = sctx->cur_stripe - SCRUB_STRIPES_PER_GROUP; + + submit_initial_group_read(sctx, first_slot, SCRUB_STRIPES_PER_GROUP); + } + + /* Last slot used, flush them all. */ + if (sctx->cur_stripe == SCRUB_TOTAL_STRIPES) + return flush_scrub_stripes(sctx); return 0; } @@ -1785,6 +1833,8 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_raid_bio *rbio; struct btrfs_io_context *bioc = NULL; + struct btrfs_path extent_path = { 0 }; + struct btrfs_path csum_path = { 0 }; struct bio *bio; struct scrub_stripe *stripe; bool all_empty = true; @@ -1795,6 +1845,16 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, ASSERT(sctx->raid56_data_stripes); + /* + * For data stripe search, we cannot re-use the same extent/csum paths, + * as the data stripe bytenr may be smaller than previous extent. Thus + * we have to use our own extent/csum paths. + */ + extent_path.search_commit_root = 1; + extent_path.skip_locking = 1; + csum_path.search_commit_root = 1; + csum_path.skip_locking = 1; + for (int i = 0; i < data_stripes; i++) { int stripe_index; int rot; @@ -1809,7 +1869,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, scrub_reset_stripe(stripe); set_bit(SCRUB_STRIPE_FLAG_NO_REPORT, &stripe->state); - ret = scrub_find_fill_first_stripe(bg, + ret = scrub_find_fill_first_stripe(bg, &extent_path, &csum_path, map->stripes[stripe_index].dev, physical, 1, full_stripe_start + btrfs_stripe_nr_to_offset(i), BTRFS_STRIPE_LEN, stripe); @@ -1854,24 +1914,6 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, /* For now, no zoned support for RAID56. */ ASSERT(!btrfs_is_zoned(sctx->fs_info)); - /* Writeback for the repaired sectors. */ - for (int i = 0; i < data_stripes; i++) { - unsigned long repaired; - - stripe = &sctx->raid56_data_stripes[i]; - - bitmap_andnot(&repaired, &stripe->init_error_bitmap, - &stripe->error_bitmap, stripe->nr_sectors); - scrub_write_sectors(sctx, stripe, repaired, false); - } - - /* Wait for the above writebacks to finish. */ - for (int i = 0; i < data_stripes; i++) { - stripe = &sctx->raid56_data_stripes[i]; - - wait_scrub_stripe_io(stripe); - } - /* * Now all data stripes are properly verified. Check if we have any * unrepaired, if so abort immediately or we could further corrupt the @@ -1937,6 +1979,8 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, bio_put(bio); btrfs_bio_counter_dec(fs_info); + btrfs_release_path(&extent_path); + btrfs_release_path(&csum_path); out: return ret; } @@ -1958,18 +2002,15 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, { struct btrfs_fs_info *fs_info = sctx->fs_info; const u64 logical_end = logical_start + logical_length; - /* An artificial limit, inherit from old scrub behavior */ - struct btrfs_path path = { 0 }; u64 cur_logical = logical_start; int ret; /* The range must be inside the bg */ ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); - path.search_commit_root = 1; - path.skip_locking = 1; /* Go through each extent items inside the logical range */ while (cur_logical < logical_end) { + u64 found_logical; u64 cur_physical = physical + cur_logical - logical_start; /* Canceled? */ @@ -1994,7 +2035,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, ret = queue_scrub_stripe(sctx, bg, device, mirror_num, cur_logical, logical_end - cur_logical, - cur_physical); + cur_physical, &found_logical); if (ret > 0) { /* No more extent, just update the accounting */ sctx->stat.last_physical = physical + logical_length; @@ -2004,14 +2045,11 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, if (ret < 0) break; - ASSERT(sctx->cur_stripe > 0); - cur_logical = sctx->stripes[sctx->cur_stripe - 1].logical - + BTRFS_STRIPE_LEN; + cur_logical = found_logical + BTRFS_STRIPE_LEN; /* Don't hold CPU for too long time */ cond_resched(); } - btrfs_release_path(&path); return ret; } @@ -2109,6 +2147,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, u64 stripe_logical; int stop_loop = 0; + /* Extent_path should be released by now. */ + ASSERT(sctx->extent_path.nodes[0] == NULL); + scrub_blocked_if_needed(fs_info); if (sctx->is_dev_replace && @@ -2227,6 +2268,9 @@ out: ret2 = flush_scrub_stripes(sctx); if (!ret) ret = ret2; + btrfs_release_path(&sctx->extent_path); + btrfs_release_path(&sctx->csum_path); + if (sctx->raid56_data_stripes) { for (int i = 0; i < nr_data_stripes(map); i++) release_scrub_stripe(&sctx->raid56_data_stripes[i]); @@ -2711,8 +2755,7 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info) /* * get a reference count on fs_info->scrub_workers. start worker if necessary */ -static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, - int is_dev_replace) +static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info) { struct workqueue_struct *scrub_workers = NULL; unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; @@ -2722,10 +2765,7 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt)) return 0; - if (is_dev_replace) - scrub_workers = alloc_ordered_workqueue("btrfs-scrub", flags); - else - scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active); + scrub_workers = alloc_workqueue("btrfs-scrub", flags, max_active); if (!scrub_workers) return -ENOMEM; @@ -2777,7 +2817,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (IS_ERR(sctx)) return PTR_ERR(sctx); - ret = scrub_workers_get(fs_info, is_dev_replace); + ret = scrub_workers_get(fs_info); if (ret) goto out_free_ctx; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 8bfd44750efe..3a566150c531 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3685,7 +3685,7 @@ static void tail_append_pending_moves(struct send_ctx *sctx, static int apply_children_dir_moves(struct send_ctx *sctx) { struct pending_dir_move *pm; - struct list_head stack; + LIST_HEAD(stack); u64 parent_ino = sctx->cur_ino; int ret = 0; @@ -3693,7 +3693,6 @@ static int apply_children_dir_moves(struct send_ctx *sctx) if (!pm) return 0; - INIT_LIST_HEAD(&stack); tail_append_pending_moves(sctx, pm, &stack); while (!list_empty(&stack)) { @@ -4165,7 +4164,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) int ret = 0; struct recorded_ref *cur; struct recorded_ref *cur2; - struct list_head check_dirs; + LIST_HEAD(check_dirs); struct fs_path *valid_path = NULL; u64 ow_inode = 0; u64 ow_gen; @@ -4184,7 +4183,6 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * which is always '..' */ BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID); - INIT_LIST_HEAD(&check_dirs); valid_path = fs_path_alloc(); if (!valid_path) { diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 75e7fa337e66..d7e8cd4f140c 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -389,11 +389,7 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, return 0; used = btrfs_space_info_used(space_info, true); - if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags) && - (space_info->flags & BTRFS_BLOCK_GROUP_METADATA)) - avail = 0; - else - avail = calc_available_free_space(fs_info, space_info, flush); + avail = calc_available_free_space(fs_info, space_info, flush); if (used + bytes < space_info->total_bytes + avail) return 1; @@ -510,6 +506,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, int dump_block_groups) { struct btrfs_block_group *cache; + u64 total_avail = 0; int index = 0; spin_lock(&info->lock); @@ -523,18 +520,27 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, down_read(&info->groups_sem); again: list_for_each_entry(cache, &info->block_groups[index], list) { + u64 avail; + spin_lock(&cache->lock); + avail = cache->length - cache->used - cache->pinned - + cache->reserved - cache->delalloc_bytes - + cache->bytes_super - cache->zone_unusable; btrfs_info(fs_info, - "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu zone_unusable %s", - cache->start, cache->length, cache->used, cache->pinned, - cache->reserved, cache->zone_unusable, - cache->ro ? "[readonly]" : ""); +"block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu delalloc %llu super %llu zone_unusable (%llu bytes available) %s", + cache->start, cache->length, cache->used, cache->pinned, + cache->reserved, cache->delalloc_bytes, + cache->bytes_super, cache->zone_unusable, + avail, cache->ro ? "[readonly]" : ""); spin_unlock(&cache->lock); btrfs_dump_free_space(cache, bytes); + total_avail += avail; } if (++index < BTRFS_NR_RAID_TYPES) goto again; up_read(&info->groups_sem); + + btrfs_info(fs_info, "%llu bytes available across all block groups", total_avail); } static inline u64 calc_reclaim_items_nr(const struct btrfs_fs_info *fs_info, @@ -715,9 +721,11 @@ static void flush_space(struct btrfs_fs_info *fs_info, else nr = -1; - trans = btrfs_join_transaction(root); + trans = btrfs_join_transaction_nostart(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); + if (ret == -ENOENT) + ret = 0; break; } ret = btrfs_run_delayed_items_nr(trans, nr); @@ -733,9 +741,11 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; case FLUSH_DELAYED_REFS_NR: case FLUSH_DELAYED_REFS: - trans = btrfs_join_transaction(root); + trans = btrfs_join_transaction_nostart(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); + if (ret == -ENOENT) + ret = 0; break; } if (state == FLUSH_DELAYED_REFS_NR) @@ -747,18 +757,6 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; case ALLOC_CHUNK: case ALLOC_CHUNK_FORCE: - /* - * For metadata space on zoned filesystem, reaching here means we - * don't have enough space left in active_total_bytes. Try to - * activate a block group first, because we may have inactive - * block group already allocated. - */ - ret = btrfs_zoned_activate_one_bg(fs_info, space_info, false); - if (ret < 0) - break; - else if (ret == 1) - break; - trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -770,22 +768,6 @@ static void flush_space(struct btrfs_fs_info *fs_info, CHUNK_ALLOC_FORCE); btrfs_end_transaction(trans); - /* - * For metadata space on zoned filesystem, allocating a new chunk - * is not enough. We still need to activate the block * group. - * Active the newly allocated block group by (maybe) finishing - * a block group. - */ - if (ret == 1) { - ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true); - /* - * Revert to the original ret regardless we could finish - * one block group or not. - */ - if (ret >= 0) - ret = 1; - } - if (ret > 0 || ret == -ENOSPC) ret = 0; break; @@ -800,9 +782,18 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; case COMMIT_TRANS: ASSERT(current->journal_info == NULL); - trans = btrfs_join_transaction(root); + /* + * We don't want to start a new transaction, just attach to the + * current one or wait it fully commits in case its commit is + * happening at the moment. Note: we don't use a nostart join + * because that does not wait for a transaction to fully commit + * (only for it to be unblocked, state TRANS_STATE_UNBLOCKED). + */ + trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); + if (ret == -ENOENT) + ret = 0; break; } ret = btrfs_commit_transaction(trans); @@ -1408,8 +1399,18 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, } } - /* Attempt to steal from the global rsv if we can. */ - if (!steal_from_global_rsv(fs_info, space_info, ticket)) { + /* + * Attempt to steal from the global rsv if we can, except if the fs was + * turned into error mode due to a transaction abort when flushing space + * above, in that case fail with the abort error instead of returning + * success to the caller if we can steal from the global rsv - this is + * just to have caller fail immeditelly instead of later when trying to + * modify the fs, making it easier to debug -ENOSPC problems. + */ + if (BTRFS_FS_ERROR(fs_info)) { + ticket->error = BTRFS_FS_ERROR(fs_info); + remove_ticket(space_info, ticket); + } else if (!steal_from_global_rsv(fs_info, space_info, ticket)) { ticket->error = -ENOSPC; remove_ticket(space_info, ticket); } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f1dd172d8d5b..09bfe68d2ea3 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -709,12 +709,16 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY case Opt_check_integrity_including_extent_data: + btrfs_warn(info, + "integrity checker is deprecated and will be removed in 6.7"); btrfs_info(info, "enabling check integrity including extent data"); btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY_DATA); btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); break; case Opt_check_integrity: + btrfs_warn(info, + "integrity checker is deprecated and will be removed in 6.7"); btrfs_info(info, "enabling check integrity"); btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY); break; @@ -727,6 +731,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, goto out; } info->check_integrity_print_mask = intarg; + btrfs_warn(info, + "integrity checker is deprecated and will be removed in 6.7"); btrfs_info(info, "check_integrity_print_mask 0x%x", info->check_integrity_print_mask); break; @@ -2144,7 +2150,7 @@ static struct file_system_type btrfs_fs_type = { .name = "btrfs", .mount = btrfs_mount, .kill_sb = btrfs_kill_super, - .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA, + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_MGTIME, }; static struct file_system_type btrfs_root_fs_type = { @@ -2152,7 +2158,8 @@ static struct file_system_type btrfs_root_fs_type = { .name = "btrfs", .mount = btrfs_mount_root, .kill_sb = btrfs_kill_super, - .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP, + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | + FS_ALLOW_IDMAP | FS_MGTIME, }; MODULE_ALIAS_FS("btrfs"); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 25294e624851..b1d1ac25237b 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -414,6 +414,12 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj, BTRFS_ATTR(static_feature, supported_sectorsizes, supported_sectorsizes_show); +static ssize_t acl_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) +{ + return sysfs_emit(buf, "%d\n", !!IS_ENABLED(CONFIG_BTRFS_FS_POSIX_ACL)); +} +BTRFS_ATTR(static_feature, acl, acl_show); + /* * Features which only depend on kernel version. * @@ -421,6 +427,7 @@ BTRFS_ATTR(static_feature, supported_sectorsizes, * btrfs_supported_feature_attrs. */ static struct attribute *btrfs_supported_static_feature_attrs[] = { + BTRFS_ATTR_PTR(static_feature, acl), BTRFS_ATTR_PTR(static_feature, rmdir_subvol), BTRFS_ATTR_PTR(static_feature, supported_checksums), BTRFS_ATTR_PTR(static_feature, send_stream_version), diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index f6bc6d738555..1cc86af97dc6 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -319,86 +319,139 @@ out: return ret; } -static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb, - unsigned long len) +static int check_eb_bitmap(unsigned long *bitmap, struct extent_buffer *eb) { unsigned long i; - for (i = 0; i < len * BITS_PER_BYTE; i++) { + for (i = 0; i < eb->len * BITS_PER_BYTE; i++) { int bit, bit1; bit = !!test_bit(i, bitmap); bit1 = !!extent_buffer_test_bit(eb, 0, i); if (bit1 != bit) { - test_err("bits do not match"); + u8 has; + u8 expect; + + read_extent_buffer(eb, &has, i / BITS_PER_BYTE, 1); + expect = bitmap_get_value8(bitmap, ALIGN(i, BITS_PER_BYTE)); + + test_err( + "bits do not match, start byte 0 bit %lu, byte %lu has 0x%02x expect 0x%02x", + i, i / BITS_PER_BYTE, has, expect); return -EINVAL; } bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE, i % BITS_PER_BYTE); if (bit1 != bit) { - test_err("offset bits do not match"); + u8 has; + u8 expect; + + read_extent_buffer(eb, &has, i / BITS_PER_BYTE, 1); + expect = bitmap_get_value8(bitmap, ALIGN(i, BITS_PER_BYTE)); + + test_err( + "bits do not match, start byte %lu bit %lu, byte %lu has 0x%02x expect 0x%02x", + i / BITS_PER_BYTE, i % BITS_PER_BYTE, + i / BITS_PER_BYTE, has, expect); return -EINVAL; } } return 0; } -static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, - unsigned long len) +static int test_bitmap_set(const char *name, unsigned long *bitmap, + struct extent_buffer *eb, + unsigned long byte_start, unsigned long bit_start, + unsigned long bit_len) +{ + int ret; + + bitmap_set(bitmap, byte_start * BITS_PER_BYTE + bit_start, bit_len); + extent_buffer_bitmap_set(eb, byte_start, bit_start, bit_len); + ret = check_eb_bitmap(bitmap, eb); + if (ret < 0) + test_err("%s test failed", name); + return ret; +} + +static int test_bitmap_clear(const char *name, unsigned long *bitmap, + struct extent_buffer *eb, + unsigned long byte_start, unsigned long bit_start, + unsigned long bit_len) +{ + int ret; + + bitmap_clear(bitmap, byte_start * BITS_PER_BYTE + bit_start, bit_len); + extent_buffer_bitmap_clear(eb, byte_start, bit_start, bit_len); + ret = check_eb_bitmap(bitmap, eb); + if (ret < 0) + test_err("%s test failed", name); + return ret; +} +static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb) { unsigned long i, j; + unsigned long byte_len = eb->len; u32 x; int ret; - memset(bitmap, 0, len); - memzero_extent_buffer(eb, 0, len); - if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { - test_err("bitmap was not zeroed"); - return -EINVAL; - } + ret = test_bitmap_clear("clear all run 1", bitmap, eb, 0, 0, + byte_len * BITS_PER_BYTE); + if (ret < 0) + return ret; - bitmap_set(bitmap, 0, len * BITS_PER_BYTE); - extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); - ret = check_eb_bitmap(bitmap, eb, len); - if (ret) { - test_err("setting all bits failed"); + ret = test_bitmap_set("set all", bitmap, eb, 0, 0, byte_len * BITS_PER_BYTE); + if (ret < 0) return ret; - } - bitmap_clear(bitmap, 0, len * BITS_PER_BYTE); - extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE); - ret = check_eb_bitmap(bitmap, eb, len); - if (ret) { - test_err("clearing all bits failed"); + ret = test_bitmap_clear("clear all run 2", bitmap, eb, 0, 0, + byte_len * BITS_PER_BYTE); + if (ret < 0) + return ret; + + ret = test_bitmap_set("same byte set", bitmap, eb, 0, 2, 4); + if (ret < 0) + return ret; + + ret = test_bitmap_clear("same byte partial clear", bitmap, eb, 0, 4, 1); + if (ret < 0) + return ret; + + ret = test_bitmap_set("cross byte set", bitmap, eb, 2, 4, 8); + if (ret < 0) + return ret; + + ret = test_bitmap_set("cross multi byte set", bitmap, eb, 4, 4, 24); + if (ret < 0) + return ret; + + ret = test_bitmap_clear("cross byte clear", bitmap, eb, 2, 6, 4); + if (ret < 0) + return ret; + + ret = test_bitmap_clear("cross multi byte clear", bitmap, eb, 4, 6, 20); + if (ret < 0) return ret; - } /* Straddling pages test */ - if (len > PAGE_SIZE) { - bitmap_set(bitmap, - (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, - sizeof(long) * BITS_PER_BYTE); - extent_buffer_bitmap_set(eb, PAGE_SIZE - sizeof(long) / 2, 0, - sizeof(long) * BITS_PER_BYTE); - ret = check_eb_bitmap(bitmap, eb, len); - if (ret) { - test_err("setting straddling pages failed"); + if (byte_len > PAGE_SIZE) { + ret = test_bitmap_set("cross page set", bitmap, eb, + PAGE_SIZE - sizeof(long) / 2, 0, + sizeof(long) * BITS_PER_BYTE); + if (ret < 0) + return ret; + + ret = test_bitmap_set("cross page set all", bitmap, eb, 0, 0, + byte_len * BITS_PER_BYTE); + if (ret < 0) return ret; - } - bitmap_set(bitmap, 0, len * BITS_PER_BYTE); - bitmap_clear(bitmap, - (PAGE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, - sizeof(long) * BITS_PER_BYTE); - extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); - extent_buffer_bitmap_clear(eb, PAGE_SIZE - sizeof(long) / 2, 0, + ret = test_bitmap_clear("cross page clear", bitmap, eb, + PAGE_SIZE - sizeof(long) / 2, 0, sizeof(long) * BITS_PER_BYTE); - ret = check_eb_bitmap(bitmap, eb, len); - if (ret) { - test_err("clearing straddling pages failed"); + if (ret < 0) return ret; - } } /* @@ -406,9 +459,12 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, * something repetitive that could miss some hypothetical off-by-n bug. */ x = 0; - bitmap_clear(bitmap, 0, len * BITS_PER_BYTE); - extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE); - for (i = 0; i < len * BITS_PER_BYTE / 32; i++) { + ret = test_bitmap_clear("clear all run 3", bitmap, eb, 0, 0, + byte_len * BITS_PER_BYTE); + if (ret < 0) + return ret; + + for (i = 0; i < byte_len * BITS_PER_BYTE / 32; i++) { x = (0x19660dULL * (u64)x + 0x3c6ef35fULL) & 0xffffffffU; for (j = 0; j < 32; j++) { if (x & (1U << j)) { @@ -418,7 +474,7 @@ static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, } } - ret = check_eb_bitmap(bitmap, eb, len); + ret = check_eb_bitmap(bitmap, eb); if (ret) { test_err("random bit pattern failed"); return ret; @@ -456,7 +512,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) goto out; } - ret = __test_eb_bitmaps(bitmap, eb, nodesize); + ret = __test_eb_bitmaps(bitmap, eb); if (ret) goto out; @@ -473,7 +529,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) goto out; } - ret = __test_eb_bitmaps(bitmap, eb, nodesize); + ret = __test_eb_bitmaps(bitmap, eb); out: free_extent_buffer(eb); kfree(bitmap); @@ -592,6 +648,146 @@ out: return ret; } +static void dump_eb_and_memory_contents(struct extent_buffer *eb, void *memory, + const char *test_name) +{ + for (int i = 0; i < eb->len; i++) { + struct page *page = eb->pages[i >> PAGE_SHIFT]; + void *addr = page_address(page) + offset_in_page(i); + + if (memcmp(addr, memory + i, 1) != 0) { + test_err("%s failed", test_name); + test_err("eb and memory diffs at byte %u, eb has 0x%02x memory has 0x%02x", + i, *(u8 *)addr, *(u8 *)(memory + i)); + return; + } + } +} + +static int verify_eb_and_memory(struct extent_buffer *eb, void *memory, + const char *test_name) +{ + for (int i = 0; i < (eb->len >> PAGE_SHIFT); i++) { + void *eb_addr = page_address(eb->pages[i]); + + if (memcmp(memory + (i << PAGE_SHIFT), eb_addr, PAGE_SIZE) != 0) { + dump_eb_and_memory_contents(eb, memory, test_name); + return -EUCLEAN; + } + } + return 0; +} + +/* + * Init both memory and extent buffer contents to the same randomly generated + * contents. + */ +static void init_eb_and_memory(struct extent_buffer *eb, void *memory) +{ + get_random_bytes(memory, eb->len); + write_extent_buffer(eb, memory, 0, eb->len); +} + +static int test_eb_mem_ops(u32 sectorsize, u32 nodesize) +{ + struct btrfs_fs_info *fs_info; + struct extent_buffer *eb = NULL; + void *memory = NULL; + int ret; + + test_msg("running extent buffer memory operation tests"); + + fs_info = btrfs_alloc_dummy_fs_info(nodesize, sectorsize); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + return -ENOMEM; + } + + memory = kvzalloc(nodesize, GFP_KERNEL); + if (!memory) { + test_err("failed to allocate memory"); + ret = -ENOMEM; + goto out; + } + + eb = __alloc_dummy_extent_buffer(fs_info, SZ_1M, nodesize); + if (!eb) { + test_std_err(TEST_ALLOC_EXTENT_BUFFER); + ret = -ENOMEM; + goto out; + } + + init_eb_and_memory(eb, memory); + ret = verify_eb_and_memory(eb, memory, "full eb write"); + if (ret < 0) + goto out; + + memcpy(memory, memory + 16, 16); + memcpy_extent_buffer(eb, 0, 16, 16); + ret = verify_eb_and_memory(eb, memory, "same page non-overlapping memcpy 1"); + if (ret < 0) + goto out; + + memcpy(memory, memory + 2048, 16); + memcpy_extent_buffer(eb, 0, 2048, 16); + ret = verify_eb_and_memory(eb, memory, "same page non-overlapping memcpy 2"); + if (ret < 0) + goto out; + memcpy(memory, memory + 2048, 2048); + memcpy_extent_buffer(eb, 0, 2048, 2048); + ret = verify_eb_and_memory(eb, memory, "same page non-overlapping memcpy 3"); + if (ret < 0) + goto out; + + memmove(memory + 512, memory + 256, 512); + memmove_extent_buffer(eb, 512, 256, 512); + ret = verify_eb_and_memory(eb, memory, "same page overlapping memcpy 1"); + if (ret < 0) + goto out; + + memmove(memory + 2048, memory + 512, 2048); + memmove_extent_buffer(eb, 2048, 512, 2048); + ret = verify_eb_and_memory(eb, memory, "same page overlapping memcpy 2"); + if (ret < 0) + goto out; + memmove(memory + 512, memory + 2048, 2048); + memmove_extent_buffer(eb, 512, 2048, 2048); + ret = verify_eb_and_memory(eb, memory, "same page overlapping memcpy 3"); + if (ret < 0) + goto out; + + if (nodesize > PAGE_SIZE) { + memcpy(memory, memory + 4096 - 128, 256); + memcpy_extent_buffer(eb, 0, 4096 - 128, 256); + ret = verify_eb_and_memory(eb, memory, "cross page non-overlapping memcpy 1"); + if (ret < 0) + goto out; + + memcpy(memory + 4096 - 128, memory + 4096 + 128, 256); + memcpy_extent_buffer(eb, 4096 - 128, 4096 + 128, 256); + ret = verify_eb_and_memory(eb, memory, "cross page non-overlapping memcpy 2"); + if (ret < 0) + goto out; + + memmove(memory + 4096 - 128, memory + 4096 - 64, 256); + memmove_extent_buffer(eb, 4096 - 128, 4096 - 64, 256); + ret = verify_eb_and_memory(eb, memory, "cross page overlapping memcpy 1"); + if (ret < 0) + goto out; + + memmove(memory + 4096 - 64, memory + 4096 - 128, 256); + memmove_extent_buffer(eb, 4096 - 64, 4096 - 128, 256); + ret = verify_eb_and_memory(eb, memory, "cross page overlapping memcpy 2"); + if (ret < 0) + goto out; + } +out: + free_extent_buffer(eb); + kvfree(memory); + btrfs_free_dummy_fs_info(fs_info); + return ret; +} + int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) { int ret; @@ -607,6 +803,10 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize) goto out; ret = test_eb_bitmaps(sectorsize, nodesize); + if (ret) + goto out; + + ret = test_eb_mem_ops(sectorsize, nodesize); out: return ret; } diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index ed0f36ae5346..29bdd08b241f 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -6,6 +6,7 @@ #include <linux/types.h> #include "btrfs-tests.h" #include "../ctree.h" +#include "../btrfs_inode.h" #include "../volumes.h" #include "../disk-io.h" #include "../block-group.h" @@ -442,6 +443,406 @@ static int test_case_4(struct btrfs_fs_info *fs_info, return ret; } +static int add_compressed_extent(struct extent_map_tree *em_tree, + u64 start, u64 len, u64 block_start) +{ + struct extent_map *em; + int ret; + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } + + em->start = start; + em->len = len; + em->block_start = block_start; + em->block_len = SZ_4K; + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); + free_extent_map(em); + if (ret < 0) { + test_err("cannot add extent map [%llu, %llu)", start, start + len); + return ret; + } + + return 0; +} + +struct extent_range { + u64 start; + u64 len; +}; + +/* The valid states of the tree after every drop, as described below. */ +struct extent_range valid_ranges[][7] = { + { + { .start = 0, .len = SZ_8K }, /* [0, 8K) */ + { .start = SZ_4K * 3, .len = SZ_4K * 3}, /* [12k, 24k) */ + { .start = SZ_4K * 6, .len = SZ_4K * 3}, /* [24k, 36k) */ + { .start = SZ_32K + SZ_4K, .len = SZ_4K}, /* [36k, 40k) */ + { .start = SZ_4K * 10, .len = SZ_4K * 6}, /* [40k, 64k) */ + }, + { + { .start = 0, .len = SZ_8K }, /* [0, 8K) */ + { .start = SZ_4K * 5, .len = SZ_4K}, /* [20k, 24k) */ + { .start = SZ_4K * 6, .len = SZ_4K * 3}, /* [24k, 36k) */ + { .start = SZ_32K + SZ_4K, .len = SZ_4K}, /* [36k, 40k) */ + { .start = SZ_4K * 10, .len = SZ_4K * 6}, /* [40k, 64k) */ + }, + { + { .start = 0, .len = SZ_8K }, /* [0, 8K) */ + { .start = SZ_4K * 5, .len = SZ_4K}, /* [20k, 24k) */ + { .start = SZ_4K * 6, .len = SZ_4K}, /* [24k, 28k) */ + { .start = SZ_32K, .len = SZ_4K}, /* [32k, 36k) */ + { .start = SZ_32K + SZ_4K, .len = SZ_4K}, /* [36k, 40k) */ + { .start = SZ_4K * 10, .len = SZ_4K * 6}, /* [40k, 64k) */ + }, + { + { .start = 0, .len = SZ_8K}, /* [0, 8K) */ + { .start = SZ_4K * 5, .len = SZ_4K}, /* [20k, 24k) */ + { .start = SZ_4K * 6, .len = SZ_4K}, /* [24k, 28k) */ + } +}; + +static int validate_range(struct extent_map_tree *em_tree, int index) +{ + struct rb_node *n; + int i; + + for (i = 0, n = rb_first_cached(&em_tree->map); + valid_ranges[index][i].len && n; + i++, n = rb_next(n)) { + struct extent_map *entry = rb_entry(n, struct extent_map, rb_node); + + if (entry->start != valid_ranges[index][i].start) { + test_err("mapping has start %llu expected %llu", + entry->start, valid_ranges[index][i].start); + return -EINVAL; + } + + if (entry->len != valid_ranges[index][i].len) { + test_err("mapping has len %llu expected %llu", + entry->len, valid_ranges[index][i].len); + return -EINVAL; + } + } + + /* + * We exited because we don't have any more entries in the extent_map + * but we still expect more valid entries. + */ + if (valid_ranges[index][i].len) { + test_err("missing an entry"); + return -EINVAL; + } + + /* We exited the loop but still have entries in the extent map. */ + if (n) { + test_err("we have a left over entry in the extent map we didn't expect"); + return -EINVAL; + } + + return 0; +} + +/* + * Test scenario: + * + * Test the various edge cases of btrfs_drop_extent_map_range, create the + * following ranges + * + * [0, 12k)[12k, 24k)[24k, 36k)[36k, 40k)[40k,64k) + * + * And then we'll drop: + * + * [8k, 12k) - test the single front split + * [12k, 20k) - test the single back split + * [28k, 32k) - test the double split + * [32k, 64k) - test whole em dropping + * + * They'll have the EXTENT_FLAG_COMPRESSED flag set to keep the em tree from + * merging the em's. + */ +static int test_case_5(void) +{ + struct extent_map_tree *em_tree; + struct inode *inode; + u64 start, end; + int ret; + + test_msg("Running btrfs_drop_extent_map_range tests"); + + inode = btrfs_new_test_inode(); + if (!inode) { + test_std_err(TEST_ALLOC_INODE); + return -ENOMEM; + } + + em_tree = &BTRFS_I(inode)->extent_tree; + + /* [0, 12k) */ + ret = add_compressed_extent(em_tree, 0, SZ_4K * 3, 0); + if (ret) { + test_err("cannot add extent range [0, 12K)"); + goto out; + } + + /* [12k, 24k) */ + ret = add_compressed_extent(em_tree, SZ_4K * 3, SZ_4K * 3, SZ_4K); + if (ret) { + test_err("cannot add extent range [12k, 24k)"); + goto out; + } + + /* [24k, 36k) */ + ret = add_compressed_extent(em_tree, SZ_4K * 6, SZ_4K * 3, SZ_8K); + if (ret) { + test_err("cannot add extent range [12k, 24k)"); + goto out; + } + + /* [36k, 40k) */ + ret = add_compressed_extent(em_tree, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3); + if (ret) { + test_err("cannot add extent range [12k, 24k)"); + goto out; + } + + /* [40k, 64k) */ + ret = add_compressed_extent(em_tree, SZ_4K * 10, SZ_4K * 6, SZ_16K); + if (ret) { + test_err("cannot add extent range [12k, 24k)"); + goto out; + } + + /* Drop [8k, 12k) */ + start = SZ_8K; + end = (3 * SZ_4K) - 1; + btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false); + ret = validate_range(&BTRFS_I(inode)->extent_tree, 0); + if (ret) + goto out; + + /* Drop [12k, 20k) */ + start = SZ_4K * 3; + end = SZ_16K + SZ_4K - 1; + btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false); + ret = validate_range(&BTRFS_I(inode)->extent_tree, 1); + if (ret) + goto out; + + /* Drop [28k, 32k) */ + start = SZ_32K - SZ_4K; + end = SZ_32K - 1; + btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false); + ret = validate_range(&BTRFS_I(inode)->extent_tree, 2); + if (ret) + goto out; + + /* Drop [32k, 64k) */ + start = SZ_32K; + end = SZ_64K - 1; + btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, false); + ret = validate_range(&BTRFS_I(inode)->extent_tree, 3); + if (ret) + goto out; +out: + iput(inode); + return ret; +} + +/* + * Test the btrfs_add_extent_mapping helper which will attempt to create an em + * for areas between two existing ems. Validate it doesn't do this when there + * are two unmerged em's side by side. + */ +static int test_case_6(struct btrfs_fs_info *fs_info, struct extent_map_tree *em_tree) +{ + struct extent_map *em = NULL; + int ret; + + ret = add_compressed_extent(em_tree, 0, SZ_4K, 0); + if (ret) + goto out; + + ret = add_compressed_extent(em_tree, SZ_4K, SZ_4K, 0); + if (ret) + goto out; + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } + + em->start = SZ_4K; + em->len = SZ_4K; + em->block_start = SZ_16K; + em->block_len = SZ_16K; + write_lock(&em_tree->lock); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, 0, SZ_8K); + write_unlock(&em_tree->lock); + + if (ret != 0) { + test_err("got an error when adding our em: %d", ret); + goto out; + } + + ret = -EINVAL; + if (em->start != 0) { + test_err("unexpected em->start at %llu, wanted 0", em->start); + goto out; + } + if (em->len != SZ_4K) { + test_err("unexpected em->len %llu, expected 4K", em->len); + goto out; + } + ret = 0; +out: + free_extent_map(em); + free_extent_map_tree(em_tree); + return ret; +} + +/* + * Regression test for btrfs_drop_extent_map_range. Calling with skip_pinned == + * true would mess up the start/end calculations and subsequent splits would be + * incorrect. + */ +static int test_case_7(void) +{ + struct extent_map_tree *em_tree; + struct extent_map *em; + struct inode *inode; + int ret; + + test_msg("Running btrfs_drop_extent_cache with pinned"); + + inode = btrfs_new_test_inode(); + if (!inode) { + test_std_err(TEST_ALLOC_INODE); + return -ENOMEM; + } + + em_tree = &BTRFS_I(inode)->extent_tree; + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; + goto out; + } + + /* [0, 16K), pinned */ + em->start = 0; + em->len = SZ_16K; + em->block_start = 0; + em->block_len = SZ_4K; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); + if (ret < 0) { + test_err("couldn't add extent map"); + goto out; + } + free_extent_map(em); + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + ret = -ENOMEM; + goto out; + } + + /* [32K, 48K), not pinned */ + em->start = SZ_32K; + em->len = SZ_16K; + em->block_start = SZ_32K; + em->block_len = SZ_16K; + write_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em, 0); + write_unlock(&em_tree->lock); + if (ret < 0) { + test_err("couldn't add extent map"); + goto out; + } + free_extent_map(em); + + /* + * Drop [0, 36K) This should skip the [0, 4K) extent and then split the + * [32K, 48K) extent. + */ + btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (36 * SZ_1K) - 1, true); + + /* Make sure our extent maps look sane. */ + ret = -EINVAL; + + em = lookup_extent_mapping(em_tree, 0, SZ_16K); + if (!em) { + test_err("didn't find an em at 0 as expected"); + goto out; + } + + if (em->start != 0) { + test_err("em->start is %llu, expected 0", em->start); + goto out; + } + + if (em->len != SZ_16K) { + test_err("em->len is %llu, expected 16K", em->len); + goto out; + } + + free_extent_map(em); + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, SZ_16K, SZ_16K); + read_unlock(&em_tree->lock); + if (em) { + test_err("found an em when we weren't expecting one"); + goto out; + } + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, SZ_32K, SZ_16K); + read_unlock(&em_tree->lock); + if (!em) { + test_err("didn't find an em at 32K as expected"); + goto out; + } + + if (em->start != (36 * SZ_1K)) { + test_err("em->start is %llu, expected 36K", em->start); + goto out; + } + + if (em->len != (12 * SZ_1K)) { + test_err("em->len is %llu, expected 12K", em->len); + goto out; + } + + free_extent_map(em); + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, 48 * SZ_1K, (u64)-1); + read_unlock(&em_tree->lock); + if (em) { + test_err("found an unexpected em above 48K"); + goto out; + } + + ret = 0; +out: + free_extent_map(em); + iput(inode); + return ret; +} + struct rmap_test_vector { u64 raid_type; u64 physical_start; @@ -619,6 +1020,17 @@ int btrfs_test_extent_map(void) if (ret) goto out; ret = test_case_4(fs_info, em_tree); + if (ret) + goto out; + ret = test_case_5(); + if (ret) + goto out; + ret = test_case_6(fs_info, em_tree); + if (ret) + goto out; + ret = test_case_7(); + if (ret) + goto out; test_msg("running rmap tests"); for (i = 0; i < ARRAY_SIZE(rmap_tests); i++) { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 91b6c2fdc420..874e4394df86 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -292,10 +292,11 @@ loop: spin_unlock(&fs_info->trans_lock); /* - * If we are ATTACH, we just want to catch the current transaction, - * and commit it. If there is no transaction, just return ENOENT. + * If we are ATTACH or TRANS_JOIN_NOSTART, we just want to catch the + * current transaction, and commit it. If there is no transaction, just + * return ENOENT. */ - if (type == TRANS_ATTACH) + if (type == TRANS_ATTACH || type == TRANS_JOIN_NOSTART) return -ENOENT; /* @@ -591,8 +592,13 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, u64 delayed_refs_bytes = 0; qgroup_reserved = num_items * fs_info->nodesize; - ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved, - enforce_qgroups); + /* + * Use prealloc for now, as there might be a currently running + * transaction that could free this reserved space prematurely + * by committing. + */ + ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserved, + enforce_qgroups, false); if (ret) return ERR_PTR(ret); @@ -705,6 +711,14 @@ again: h->reloc_reserved = reloc_reserved; } + /* + * Now that we have found a transaction to be a part of, convert the + * qgroup reservation from prealloc to pertrans. A different transaction + * can't race in and free our pertrans out from under us. + */ + if (qgroup_reserved) + btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); + got_it: if (!current->journal_info) current->journal_info = h; @@ -752,7 +766,7 @@ alloc_fail: btrfs_block_rsv_release(fs_info, &fs_info->trans_block_rsv, num_bytes, NULL); reserve_fail: - btrfs_qgroup_free_meta_pertrans(root, qgroup_reserved); + btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); return ERR_PTR(ret); } @@ -785,7 +799,10 @@ struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root * /* * Similar to regular join but it never starts a transaction when none is - * running or after waiting for the current one to finish. + * running or when there's a running one at a state >= TRANS_STATE_UNBLOCKED. + * This is similar to btrfs_attach_transaction() but it allows the join to + * happen if the transaction commit already started but it's not yet in the + * "doing" phase (the state is < TRANS_STATE_COMMIT_DOING). */ struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root) { @@ -1060,8 +1077,8 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, u64 start = 0; u64 end; - while (!find_first_extent_bit(dirty_pages, start, &start, &end, - mark, &cached_state)) { + while (find_first_extent_bit(dirty_pages, start, &start, &end, + mark, &cached_state)) { bool wait_writeback = false; err = convert_extent_bit(dirty_pages, start, end, @@ -1114,8 +1131,8 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, u64 start = 0; u64 end; - while (!find_first_extent_bit(dirty_pages, start, &start, &end, - EXTENT_NEED_WAIT, &cached_state)) { + while (find_first_extent_bit(dirty_pages, start, &start, &end, + EXTENT_NEED_WAIT, &cached_state)) { /* * Ignore -ENOMEM errors returned by clear_extent_bit(). * When committing the transaction, we'll remove any entries @@ -1837,8 +1854,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + fname.disk_name.len * 2); - parent_inode->i_mtime = current_time(parent_inode); - parent_inode->i_ctime = parent_inode->i_mtime; + parent_inode->i_mtime = inode_set_ctime_current(parent_inode); ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode)); if (ret) { btrfs_abort_transaction(trans, ret); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 365a1cc0a3c3..d1e46b839519 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4148,9 +4148,9 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, inode->i_mtime.tv_nsec); btrfs_set_token_timespec_sec(&token, &item->ctime, - inode->i_ctime.tv_sec); + inode_get_ctime(inode).tv_sec); btrfs_set_token_timespec_nsec(&token, &item->ctime, - inode->i_ctime.tv_nsec); + inode_get_ctime(inode).tv_nsec); /* * We do not need to set the nbytes field, in fact during a fast fsync @@ -4841,13 +4841,11 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_ordered_extent *ordered; struct btrfs_ordered_extent *tmp; struct extent_map *em, *n; - struct list_head extents; + LIST_HEAD(extents); struct extent_map_tree *tree = &inode->extent_tree; int ret = 0; int num = 0; - INIT_LIST_HEAD(&extents); - write_lock(&tree->lock); list_for_each_entry_safe(em, n, &tree->modified_extents, list) { @@ -6794,8 +6792,8 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, while (true) { struct btrfs_fs_info *fs_info = root->fs_info; - struct extent_buffer *leaf = path->nodes[0]; - int slot = path->slots[0]; + struct extent_buffer *leaf; + int slot; struct btrfs_key search_key; struct inode *inode; u64 ino; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6aa9bf3661ac..9621455edebc 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -681,6 +681,14 @@ error_free_page: return -EINVAL; } +u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb) +{ + bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) & + BTRFS_FEATURE_INCOMPAT_METADATA_UUID); + + return has_metadata_uuid ? sb->metadata_uuid : sb->fsid; +} + /* * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices * being created with a disk that has already completed its fsid change. Such @@ -833,15 +841,8 @@ static noinline struct btrfs_device *device_list_add(const char *path, found_transid > fs_devices->latest_generation) { memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); - - if (has_metadata_uuid) - memcpy(fs_devices->metadata_uuid, - disk_super->metadata_uuid, - BTRFS_FSID_SIZE); - else - memcpy(fs_devices->metadata_uuid, - disk_super->fsid, BTRFS_FSID_SIZE); - + memcpy(fs_devices->metadata_uuid, + btrfs_sb_fsid_ptr(disk_super), BTRFS_FSID_SIZE); fs_devices->fsid_change = false; } } @@ -851,8 +852,9 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (fs_devices->opened) { btrfs_err(NULL, - "device %s belongs to fsid %pU, and the fs is already mounted", - path, fs_devices->fsid); +"device %s belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)", + path, fs_devices->fsid, current->comm, + task_pid_nr(current)); mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(-EBUSY); } @@ -1424,9 +1426,9 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start, lockdep_assert_held(&device->fs_info->chunk_mutex); - if (!find_first_extent_bit(&device->alloc_state, *start, - &physical_start, &physical_end, - CHUNK_ALLOCATED, NULL)) { + if (find_first_extent_bit(&device->alloc_state, *start, + &physical_start, &physical_end, + CHUNK_ALLOCATED, NULL)) { if (in_range(physical_start, *start, len) || in_range(*start, physical_start, @@ -1438,18 +1440,18 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start, return false; } -static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) +static u64 dev_extent_search_start(struct btrfs_device *device) { switch (device->fs_devices->chunk_alloc_policy) { case BTRFS_CHUNK_ALLOC_REGULAR: - return max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED); + return BTRFS_DEVICE_RANGE_RESERVED; case BTRFS_CHUNK_ALLOC_ZONED: /* * We don't care about the starting region like regular * allocator, because we anyway use/reserve the first two zones * for superblock logging. */ - return ALIGN(start, device->zone_info->zone_size); + return 0; default: BUG(); } @@ -1581,15 +1583,15 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, * correct usable device space, as device extent freed in current transaction * is not reported as available. */ -static int find_free_dev_extent_start(struct btrfs_device *device, - u64 num_bytes, u64 search_start, u64 *start, - u64 *len) +static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, + u64 *start, u64 *len) { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; struct btrfs_key key; struct btrfs_dev_extent *dev_extent; struct btrfs_path *path; + u64 search_start; u64 hole_size; u64 max_hole_start; u64 max_hole_size; @@ -1599,7 +1601,7 @@ static int find_free_dev_extent_start(struct btrfs_device *device, int slot; struct extent_buffer *l; - search_start = dev_extent_search_start(device, search_start); + search_start = dev_extent_search_start(device); WARN_ON(device->zone_info && !IS_ALIGNED(num_bytes, device->zone_info->zone_size)); @@ -1725,13 +1727,6 @@ out: return ret; } -int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, - u64 *start, u64 *len) -{ - /* FIXME use last free of some kind */ - return find_free_dev_extent_start(device, num_bytes, 0, start, len); -} - static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 start, u64 *dev_extent_len) @@ -1917,15 +1912,13 @@ out: static void update_dev_time(const char *device_path) { struct path path; - struct timespec64 now; int ret; ret = kern_path(device_path, LOOKUP_FOLLOW, &path); if (ret) return; - now = current_time(d_inode(path.dentry)); - inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME | S_VERSION); + inode_update_time(d_inode(path.dentry), S_MTIME | S_CTIME | S_VERSION); path_put(&path); } @@ -6219,6 +6212,45 @@ static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup * stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr); } +/* + * Map one logical range to one or more physical ranges. + * + * @length: (Mandatory) mapped length of this run. + * One logical range can be split into different segments + * due to factors like zones and RAID0/5/6/10 stripe + * boundaries. + * + * @bioc_ret: (Mandatory) returned btrfs_io_context structure. + * which has one or more physical ranges (btrfs_io_stripe) + * recorded inside. + * Caller should call btrfs_put_bioc() to free it after use. + * + * @smap: (Optional) single physical range optimization. + * If the map request can be fulfilled by one single + * physical range, and this is parameter is not NULL, + * then @bioc_ret would be NULL, and @smap would be + * updated. + * + * @mirror_num_ret: (Mandatory) returned mirror number if the original + * value is 0. + * + * Mirror number 0 means to choose any live mirrors. + * + * For non-RAID56 profiles, non-zero mirror_num means + * the Nth mirror. (e.g. mirror_num 1 means the first + * copy). + * + * For RAID56 profile, mirror 1 means rebuild from P and + * the remaining data stripes. + * + * For RAID6 profile, mirror > 2 means mark another + * data/P stripe error and rebuild from the remaining + * stripes.. + * + * @need_raid_map: (Used only for integrity checker) whether the map wants + * a full stripe map (including all data and P/Q stripes) + * for RAID56. Should always be 1 except integrity checker. + */ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret, @@ -6393,9 +6425,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * I/O context structure. */ if (smap && num_alloc_stripes == 1 && - !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) && - (op == BTRFS_MAP_READ || !dev_replace_is_ongoing || - !dev_replace->tgtdev)) { + !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)) { set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr); if (mirror_num_ret) *mirror_num_ret = mirror_num; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index b8c51f16ba86..2128a032c3b7 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -650,8 +650,6 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); int btrfs_uuid_scan_kthread(void *data); bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset); -int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, - u64 *start, u64 *max_avail); void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_get_dev_stats *stats); @@ -749,5 +747,6 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); +u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb); #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index fc4b20c2688a..96828a13dd43 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -264,7 +264,7 @@ int btrfs_setxattr_trans(struct inode *inode, const char *name, goto out; inode_inc_iversion(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret) btrfs_abort_transaction(trans, ret); @@ -407,7 +407,7 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, ret = btrfs_set_prop(trans, inode, name, value, size, flags); if (!ret) { inode_inc_iversion(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); if (ret) btrfs_abort_transaction(trans, ret); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 72b90bc19a19..09bc325d075d 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -65,6 +65,9 @@ #define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT) +static void wait_eb_writebacks(struct btrfs_block_group *block_group); +static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written); + static inline bool sb_zone_is_full(const struct blk_zone *zone) { return (zone->cond == BLK_ZONE_COND_FULL) || @@ -465,8 +468,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) * use the cache. */ if (populate_cache && bdev_is_zoned(device->bdev)) { - zone_info->zone_cache = vzalloc(sizeof(struct blk_zone) * - zone_info->nr_zones); + zone_info->zone_cache = vcalloc(zone_info->nr_zones, + sizeof(struct blk_zone)); if (!zone_info->zone_cache) { btrfs_err_in_rcu(device->fs_info, "zoned: failed to allocate zone cache for %s", @@ -1583,19 +1586,9 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) return; WARN_ON(cache->bytes_super != 0); - - /* Check for block groups never get activated */ - if (test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &cache->fs_info->flags) && - cache->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM) && - !test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &cache->runtime_flags) && - cache->alloc_offset == 0) { - unusable = cache->length; - free = 0; - } else { - unusable = (cache->alloc_offset - cache->used) + - (cache->length - cache->zone_capacity); - free = cache->zone_capacity - cache->alloc_offset; - } + unusable = (cache->alloc_offset - cache->used) + + (cache->length - cache->zone_capacity); + free = cache->zone_capacity - cache->alloc_offset; /* We only need ->free_space in ALLOC_SEQ block groups */ cache->cached = BTRFS_CACHE_FINISHED; @@ -1707,10 +1700,21 @@ void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered) { struct btrfs_inode *inode = BTRFS_I(ordered->inode); struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_ordered_sum *sum = - list_first_entry(&ordered->list, typeof(*sum), list); - u64 logical = sum->logical; - u64 len = sum->len; + struct btrfs_ordered_sum *sum; + u64 logical, len; + + /* + * Write to pre-allocated region is for the data relocation, and so + * it should use WRITE operation. No split/rewrite are necessary. + */ + if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) + return; + + ASSERT(!list_empty(&ordered->list)); + /* The ordered->list can be empty in the above pre-alloc case. */ + sum = list_first_entry(&ordered->list, struct btrfs_ordered_sum, list); + logical = sum->logical; + len = sum->len; while (len < ordered->disk_num_bytes) { sum = list_next_entry(sum, list); @@ -1747,41 +1751,121 @@ out: } } -bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, - struct btrfs_block_group **cache_ret) +static bool check_bg_is_active(struct btrfs_eb_write_context *ctx, + struct btrfs_block_group **active_bg) { - struct btrfs_block_group *cache; - bool ret = true; + const struct writeback_control *wbc = ctx->wbc; + struct btrfs_block_group *block_group = ctx->zoned_bg; + struct btrfs_fs_info *fs_info = block_group->fs_info; - if (!btrfs_is_zoned(fs_info)) + if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) return true; - cache = btrfs_lookup_block_group(fs_info, eb->start); - if (!cache) - return true; + if (fs_info->treelog_bg == block_group->start) { + if (!btrfs_zone_activate(block_group)) { + int ret_fin = btrfs_zone_finish_one_bg(fs_info); - if (cache->meta_write_pointer != eb->start) { - btrfs_put_block_group(cache); - cache = NULL; - ret = false; - } else { - cache->meta_write_pointer = eb->start + eb->len; - } + if (ret_fin != 1 || !btrfs_zone_activate(block_group)) + return false; + } + } else if (*active_bg != block_group) { + struct btrfs_block_group *tgt = *active_bg; - *cache_ret = cache; + /* zoned_meta_io_lock protects fs_info->active_{meta,system}_bg. */ + lockdep_assert_held(&fs_info->zoned_meta_io_lock); - return ret; + if (tgt) { + /* + * If there is an unsent IO left in the allocated area, + * we cannot wait for them as it may cause a deadlock. + */ + if (tgt->meta_write_pointer < tgt->start + tgt->alloc_offset) { + if (wbc->sync_mode == WB_SYNC_NONE || + (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)) + return false; + } + + /* Pivot active metadata/system block group. */ + btrfs_zoned_meta_io_unlock(fs_info); + wait_eb_writebacks(tgt); + do_zone_finish(tgt, true); + btrfs_zoned_meta_io_lock(fs_info); + if (*active_bg == tgt) { + btrfs_put_block_group(tgt); + *active_bg = NULL; + } + } + if (!btrfs_zone_activate(block_group)) + return false; + if (*active_bg != block_group) { + ASSERT(*active_bg == NULL); + *active_bg = block_group; + btrfs_get_block_group(block_group); + } + } + + return true; } -void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, - struct extent_buffer *eb) +/* + * Check if @ctx->eb is aligned to the write pointer. + * + * Return: + * 0: @ctx->eb is at the write pointer. You can write it. + * -EAGAIN: There is a hole. The caller should handle the case. + * -EBUSY: There is a hole, but the caller can just bail out. + */ +int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, + struct btrfs_eb_write_context *ctx) { - if (!btrfs_is_zoned(eb->fs_info) || !cache) - return; + const struct writeback_control *wbc = ctx->wbc; + const struct extent_buffer *eb = ctx->eb; + struct btrfs_block_group *block_group = ctx->zoned_bg; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + if (block_group) { + if (block_group->start > eb->start || + block_group->start + block_group->length <= eb->start) { + btrfs_put_block_group(block_group); + block_group = NULL; + ctx->zoned_bg = NULL; + } + } + + if (!block_group) { + block_group = btrfs_lookup_block_group(fs_info, eb->start); + if (!block_group) + return 0; + ctx->zoned_bg = block_group; + } + + if (block_group->meta_write_pointer == eb->start) { + struct btrfs_block_group **tgt; + + if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags)) + return 0; - ASSERT(cache->meta_write_pointer == eb->start + eb->len); - cache->meta_write_pointer = eb->start; + if (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) + tgt = &fs_info->active_system_bg; + else + tgt = &fs_info->active_meta_bg; + if (check_bg_is_active(ctx, tgt)) + return 0; + } + + /* + * Since we may release fs_info->zoned_meta_io_lock, someone can already + * start writing this eb. In that case, we can just bail out. + */ + if (block_group->meta_write_pointer > eb->start) + return -EBUSY; + + /* If for_sync, this hole will be filled with trasnsaction commit. */ + if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) + return -EAGAIN; + return -EBUSY; } int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length) @@ -1879,10 +1963,10 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, bool btrfs_zone_activate(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; - struct btrfs_space_info *space_info = block_group->space_info; struct map_lookup *map; struct btrfs_device *device; u64 physical; + const bool is_data = (block_group->flags & BTRFS_BLOCK_GROUP_DATA); bool ret; int i; @@ -1891,7 +1975,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) map = block_group->physical_map; - spin_lock(&space_info->lock); spin_lock(&block_group->lock); if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) { ret = true; @@ -1904,30 +1987,44 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) goto out_unlock; } + spin_lock(&fs_info->zone_active_bgs_lock); for (i = 0; i < map->num_stripes; i++) { + struct btrfs_zoned_device_info *zinfo; + int reserved = 0; + device = map->stripes[i].dev; physical = map->stripes[i].physical; + zinfo = device->zone_info; - if (device->zone_info->max_active_zones == 0) + if (zinfo->max_active_zones == 0) continue; + if (is_data) + reserved = zinfo->reserved_active_zones; + /* + * For the data block group, leave active zones for one + * metadata block group and one system block group. + */ + if (atomic_read(&zinfo->active_zones_left) <= reserved) { + ret = false; + spin_unlock(&fs_info->zone_active_bgs_lock); + goto out_unlock; + } + if (!btrfs_dev_set_active_zone(device, physical)) { /* Cannot activate the zone */ ret = false; + spin_unlock(&fs_info->zone_active_bgs_lock); goto out_unlock; } + if (!is_data) + zinfo->reserved_active_zones--; } + spin_unlock(&fs_info->zone_active_bgs_lock); /* Successfully activated all the zones */ set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); - WARN_ON(block_group->alloc_offset != 0); - if (block_group->zone_unusable == block_group->length) { - block_group->zone_unusable = block_group->length - block_group->zone_capacity; - space_info->bytes_zone_unusable -= block_group->zone_capacity; - } spin_unlock(&block_group->lock); - btrfs_try_granting_tickets(fs_info, space_info); - spin_unlock(&space_info->lock); /* For the active block group list */ btrfs_get_block_group(block_group); @@ -1940,7 +2037,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) out_unlock: spin_unlock(&block_group->lock); - spin_unlock(&space_info->lock); return ret; } @@ -2006,6 +2102,10 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ * and block_group->meta_write_pointer for metadata. */ if (!fully_written) { + if (test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { + spin_unlock(&block_group->lock); + return -EAGAIN; + } spin_unlock(&block_group->lock); ret = btrfs_inc_block_group_ro(block_group, false); @@ -2034,7 +2134,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ return 0; } - if (block_group->reserved) { + if (block_group->reserved || + test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, + &block_group->runtime_flags)) { spin_unlock(&block_group->lock); btrfs_dec_block_group_ro(block_group); return -EAGAIN; @@ -2043,6 +2145,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ clear_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags); block_group->alloc_offset = block_group->zone_capacity; + if (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) + block_group->meta_write_pointer = block_group->start + + block_group->zone_capacity; block_group->free_space_ctl->free_space = 0; btrfs_clear_treelog_bg(block_group); btrfs_clear_data_reloc_bg(block_group); @@ -2052,18 +2157,21 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev; const u64 physical = map->stripes[i].physical; + struct btrfs_zoned_device_info *zinfo = device->zone_info; - if (device->zone_info->max_active_zones == 0) + if (zinfo->max_active_zones == 0) continue; ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, physical >> SECTOR_SHIFT, - device->zone_info->zone_size >> SECTOR_SHIFT, + zinfo->zone_size >> SECTOR_SHIFT, GFP_NOFS); if (ret) return ret; + if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA)) + zinfo->reserved_active_zones++; btrfs_dev_clear_active_zone(device, physical); } @@ -2102,8 +2210,10 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) /* Check if there is a device with active zones left */ mutex_lock(&fs_info->chunk_mutex); + spin_lock(&fs_info->zone_active_bgs_lock); list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { struct btrfs_zoned_device_info *zinfo = device->zone_info; + int reserved = 0; if (!device->bdev) continue; @@ -2113,17 +2223,21 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) break; } + if (flags & BTRFS_BLOCK_GROUP_DATA) + reserved = zinfo->reserved_active_zones; + switch (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) { case 0: /* single */ - ret = (atomic_read(&zinfo->active_zones_left) >= 1); + ret = (atomic_read(&zinfo->active_zones_left) >= (1 + reserved)); break; case BTRFS_BLOCK_GROUP_DUP: - ret = (atomic_read(&zinfo->active_zones_left) >= 2); + ret = (atomic_read(&zinfo->active_zones_left) >= (2 + reserved)); break; } if (ret) break; } + spin_unlock(&fs_info->zone_active_bgs_lock); mutex_unlock(&fs_info->chunk_mutex); if (!ret) @@ -2265,7 +2379,10 @@ void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logica /* All relocation extents are written. */ if (block_group->start + block_group->alloc_offset == logical + length) { - /* Now, release this block group for further allocations. */ + /* + * Now, release this block group for further allocations and + * zone finish. + */ clear_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags); } @@ -2289,7 +2406,8 @@ int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) spin_lock(&block_group->lock); if (block_group->reserved || block_group->alloc_offset == 0 || - (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) { + (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM) || + test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) { spin_unlock(&block_group->lock); continue; } @@ -2365,3 +2483,55 @@ int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, return 0; } + +/* + * Reserve zones for one metadata block group, one tree-log block group, and one + * system block group. + */ +void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_block_group *block_group; + struct btrfs_device *device; + /* Reserve zones for normal SINGLE metadata and tree-log block group. */ + unsigned int metadata_reserve = 2; + /* Reserve a zone for SINGLE system block group. */ + unsigned int system_reserve = 1; + + if (!test_bit(BTRFS_FS_ACTIVE_ZONE_TRACKING, &fs_info->flags)) + return; + + /* + * This function is called from the mount context. So, there is no + * parallel process touching the bits. No need for read_seqretry(). + */ + if (fs_info->avail_metadata_alloc_bits & BTRFS_BLOCK_GROUP_DUP) + metadata_reserve = 4; + if (fs_info->avail_system_alloc_bits & BTRFS_BLOCK_GROUP_DUP) + system_reserve = 2; + + /* Apply the reservation on all the devices. */ + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (!device->bdev) + continue; + + device->zone_info->reserved_active_zones = + metadata_reserve + system_reserve; + } + mutex_unlock(&fs_devices->device_list_mutex); + + /* Release reservation for currently active block groups. */ + spin_lock(&fs_info->zone_active_bgs_lock); + list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { + struct map_lookup *map = block_group->physical_map; + + if (!(block_group->flags & + (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))) + continue; + + for (int i = 0; i < map->num_stripes; i++) + map->stripes[i].dev->zone_info->reserved_active_zones--; + } + spin_unlock(&fs_info->zone_active_bgs_lock); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 27322b926038..b9cec523b778 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -22,6 +22,11 @@ struct btrfs_zoned_device_info { u8 zone_size_shift; u32 nr_zones; unsigned int max_active_zones; + /* + * Reserved active zones for one metadata and one system block group. + * It can vary per-device depending on the allocation status. + */ + int reserved_active_zones; atomic_t active_zones_left; unsigned long *seq_zones; unsigned long *empty_zones; @@ -58,11 +63,8 @@ void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb); bool btrfs_use_zone_append(struct btrfs_bio *bbio); void btrfs_record_physical_zoned(struct btrfs_bio *bbio); -bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, - struct btrfs_block_group **cache_ret); -void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache, - struct extent_buffer *eb); +int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, + struct btrfs_eb_write_context *ctx); int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length); int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, u64 physical_start, u64 physical_pos); @@ -81,6 +83,7 @@ void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logica int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info); int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, bool do_finish); +void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -189,17 +192,10 @@ static inline void btrfs_record_physical_zoned(struct btrfs_bio *bbio) { } -static inline bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, - struct btrfs_block_group **cache_ret) -{ - return true; -} - -static inline void btrfs_revert_meta_write_pointer( - struct btrfs_block_group *cache, - struct extent_buffer *eb) +static inline int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, + struct btrfs_eb_write_context *ctx) { + return 0; } static inline int btrfs_zoned_issue_zeroout(struct btrfs_device *device, @@ -262,6 +258,8 @@ static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, return 0; } +static inline void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) { } + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) diff --git a/fs/buffer.c b/fs/buffer.c index bd091329026c..084a6ade108a 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -49,6 +49,7 @@ #include <trace/events/block.h> #include <linux/fscrypt.h> #include <linux/fsverity.h> +#include <linux/sched/isolation.h> #include "internal.h" @@ -1225,19 +1226,14 @@ EXPORT_SYMBOL(mark_buffer_dirty); void mark_buffer_write_io_error(struct buffer_head *bh) { - struct super_block *sb; - set_buffer_write_io_error(bh); /* FIXME: do we need to set this in both places? */ if (bh->b_folio && bh->b_folio->mapping) mapping_set_error(bh->b_folio->mapping, -EIO); - if (bh->b_assoc_map) + if (bh->b_assoc_map) { mapping_set_error(bh->b_assoc_map, -EIO); - rcu_read_lock(); - sb = READ_ONCE(bh->b_bdev->bd_super); - if (sb) - errseq_set(&sb->s_wb_err, -EIO); - rcu_read_unlock(); + errseq_set(&bh->b_assoc_map->host->i_sb->s_wb_err, -EIO); + } } EXPORT_SYMBOL(mark_buffer_write_io_error); @@ -1352,7 +1348,7 @@ static void bh_lru_install(struct buffer_head *bh) * failing page migration. * Skip putting upcoming bh into bh_lru until migration is done. */ - if (lru_cache_disabled()) { + if (lru_cache_disabled() || cpu_is_isolated(smp_processor_id())) { bh_lru_unlock(); return; } @@ -1382,6 +1378,10 @@ lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) check_irqs_on(); bh_lru_lock(); + if (cpu_is_isolated(smp_processor_id())) { + bh_lru_unlock(); + return NULL; + } for (i = 0; i < BH_LRU_SIZE; i++) { struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]); diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 175a25fcade8..009d23cd435b 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -259,9 +259,7 @@ static void cachefiles_write_complete(struct kiocb *iocb, long ret) _enter("%ld", ret); - /* Tell lockdep we inherited freeze protection from submission thread */ - __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); - __sb_end_write(inode->i_sb, SB_FREEZE_WRITE); + kiocb_end_write(iocb); if (ret < 0) trace_cachefiles_io_error(object, inode, ret, @@ -286,7 +284,6 @@ int __cachefiles_write(struct cachefiles_object *object, { struct cachefiles_cache *cache; struct cachefiles_kiocb *ki; - struct inode *inode; unsigned int old_nofs; ssize_t ret; size_t len = iov_iter_count(iter); @@ -322,19 +319,12 @@ int __cachefiles_write(struct cachefiles_object *object, ki->iocb.ki_complete = cachefiles_write_complete; atomic_long_add(ki->b_writing, &cache->b_writing); - /* Open-code file_start_write here to grab freeze protection, which - * will be released by another thread in aio_complete_rw(). Fool - * lockdep by telling it the lock got released so that it doesn't - * complain about the held lock when we return to userspace. - */ - inode = file_inode(file); - __sb_start_write(inode->i_sb, SB_FREEZE_WRITE); - __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE); + kiocb_start_write(&ki->iocb); get_file(ki->iocb.ki_filp); cachefiles_grab_object(object, cachefiles_obj_get_ioreq); - trace_cachefiles_write(object, inode, ki->iocb.ki_pos, len); + trace_cachefiles_write(object, file_inode(file), ki->iocb.ki_pos, len); old_nofs = memalloc_nofs_save(); ret = cachefiles_inject_write_error(); if (ret == 0) diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 6945a938d396..c91b293267d7 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -93,7 +93,7 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, char *value = NULL; struct iattr newattrs; struct inode *inode = d_inode(dentry); - struct timespec64 old_ctime = inode->i_ctime; + struct timespec64 old_ctime = inode_get_ctime(inode); umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; if (ceph_snap(inode) != CEPH_NOSNAP) { diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index e2bb0d0072da..09cd6d334604 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1400,7 +1400,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, arg->mtime = inode->i_mtime; arg->atime = inode->i_atime; - arg->ctime = inode->i_ctime; + arg->ctime = inode_get_ctime(inode); arg->btime = ci->i_btime; arg->change_attr = inode_peek_iversion_raw(inode); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 8e5f41d45283..fd05d68e2990 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -100,7 +100,7 @@ struct inode *ceph_get_snapdir(struct inode *parent) inode->i_uid = parent->i_uid; inode->i_gid = parent->i_gid; inode->i_mtime = parent->i_mtime; - inode->i_ctime = parent->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(parent)); inode->i_atime = parent->i_atime; ci->i_rbytes = 0; ci->i_btime = ceph_inode(parent)->i_btime; @@ -688,6 +688,7 @@ void ceph_fill_file_time(struct inode *inode, int issued, struct timespec64 *mtime, struct timespec64 *atime) { struct ceph_inode_info *ci = ceph_inode(inode); + struct timespec64 ictime = inode_get_ctime(inode); int warn = 0; if (issued & (CEPH_CAP_FILE_EXCL| @@ -696,11 +697,11 @@ void ceph_fill_file_time(struct inode *inode, int issued, CEPH_CAP_AUTH_EXCL| CEPH_CAP_XATTR_EXCL)) { if (ci->i_version == 0 || - timespec64_compare(ctime, &inode->i_ctime) > 0) { + timespec64_compare(ctime, &ictime) > 0) { dout("ctime %lld.%09ld -> %lld.%09ld inc w/ cap\n", - inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, + ictime.tv_sec, ictime.tv_nsec, ctime->tv_sec, ctime->tv_nsec); - inode->i_ctime = *ctime; + inode_set_ctime_to_ts(inode, *ctime); } if (ci->i_version == 0 || ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) { @@ -738,7 +739,7 @@ void ceph_fill_file_time(struct inode *inode, int issued, } else { /* we have no write|excl caps; whatever the MDS says is true */ if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { - inode->i_ctime = *ctime; + inode_set_ctime_to_ts(inode, *ctime); inode->i_mtime = *mtime; inode->i_atime = *atime; ci->i_time_warp_seq = time_warp_seq; @@ -2166,7 +2167,8 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME| ATTR_MODE|ATTR_UID|ATTR_GID)) == 0; dout("setattr %p ctime %lld.%ld -> %lld.%ld (%s)\n", inode, - inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, + inode_get_ctime(inode).tv_sec, + inode_get_ctime(inode).tv_nsec, attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec, only ? "ctime only" : "ignored"); if (only) { @@ -2191,7 +2193,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) if (dirtied) { inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied, &prealloc_cf); - inode->i_ctime = attr->ia_ctime; + inode_set_ctime_to_ts(inode, attr->ia_ctime); inode_inc_iversion_raw(inode); } @@ -2465,7 +2467,7 @@ int ceph_getattr(struct mnt_idmap *idmap, const struct path *path, return err; } - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->ino = ceph_present_inode(inode); /* diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 343d738448dc..c9920ade15f5 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -660,7 +660,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, capsnap->size = i_size_read(inode); capsnap->mtime = inode->i_mtime; capsnap->atime = inode->i_atime; - capsnap->ctime = inode->i_ctime; + capsnap->ctime = inode_get_ctime(inode); capsnap->btime = ci->i_btime; capsnap->change_attr = inode_peek_iversion_raw(inode); capsnap->time_warp_seq = ci->i_time_warp_seq; diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 806183959c47..1cbd84cc82a8 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1238,7 +1238,7 @@ retry: dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL, &prealloc_cf); ci->i_xattrs.dirty = true; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); } spin_unlock(&ci->i_ceph_lock); diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index 903ca8fa4b9b..ae023853a98f 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -127,7 +127,8 @@ void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr) if (attr->va_mtime.tv_sec != -1) inode->i_mtime = coda_to_timespec64(attr->va_mtime); if (attr->va_ctime.tv_sec != -1) - inode->i_ctime = coda_to_timespec64(attr->va_ctime); + inode_set_ctime_to_ts(inode, + coda_to_timespec64(attr->va_ctime)); } diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 1b960de2bf39..cb512b10473b 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -111,7 +111,7 @@ static inline void coda_dir_update_mtime(struct inode *dir) /* optimistically we can also act as if our nose bleeds. The * granularity of the mtime is coarse anyways so we might actually be * right most of the time. Note: we only do this for directories. */ - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); #endif } diff --git a/fs/coda/file.c b/fs/coda/file.c index 12b26bd13564..42346618b4ed 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -84,7 +84,7 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos, 0); coda_inode->i_size = file_inode(host_file)->i_size; coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; - coda_inode->i_mtime = coda_inode->i_ctime = current_time(coda_inode); + coda_inode->i_mtime = inode_set_ctime_current(coda_inode); inode_unlock(coda_inode); file_end_write(host_file); diff --git a/fs/coda/inode.c b/fs/coda/inode.c index d661e6cf17ac..0c7c2528791e 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -256,7 +256,8 @@ int coda_getattr(struct mnt_idmap *idmap, const struct path *path, { int err = coda_revalidate_inode(d_inode(path->dentry)); if (!err) - generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat); + generic_fillattr(&nop_mnt_idmap, request_mask, + d_inode(path->dentry), stat); return err; } @@ -269,7 +270,7 @@ int coda_setattr(struct mnt_idmap *idmap, struct dentry *de, memset(&vattr, 0, sizeof(vattr)); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); coda_iattr_to_vattr(iattr, &vattr); vattr.va_type = C_VNON; /* cannot set type */ diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 1c15edbe70ff..fbdcb3582926 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -88,8 +88,7 @@ int configfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, static inline void set_default_inode_attr(struct inode * inode, umode_t mode) { inode->i_mode = mode; - inode->i_atime = inode->i_mtime = - inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); } static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) @@ -99,7 +98,7 @@ static inline void set_inode_attr(struct inode * inode, struct iattr * iattr) inode->i_gid = iattr->ia_gid; inode->i_atime = iattr->ia_atime; inode->i_mtime = iattr->ia_mtime; - inode->i_ctime = iattr->ia_ctime; + inode_set_ctime_to_ts(inode, iattr->ia_ctime); } struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd, @@ -172,7 +171,7 @@ struct inode *configfs_create(struct dentry *dentry, umode_t mode) return ERR_PTR(-ENOMEM); p_inode = d_inode(dentry->d_parent); - p_inode->i_mtime = p_inode->i_ctime = current_time(p_inode); + p_inode->i_mtime = inode_set_ctime_current(p_inode); configfs_set_inode_lock_class(sd, inode); return inode; } diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 27c6597aa1be..5ee7d7bbb361 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -133,7 +133,8 @@ static struct inode *get_cramfs_inode(struct super_block *sb, } /* Struct copy intentional */ - inode->i_mtime = inode->i_atime = inode->i_ctime = zerotime; + inode->i_mtime = inode->i_atime = inode_set_ctime_to_ts(inode, + zerotime); /* inode->i_nlink is left 1 - arguably wrong for directories, but it's the best we can do without reading the directory contents. 1 yields the right result in GNU find, even @@ -485,12 +486,16 @@ static void cramfs_kill_sb(struct super_block *sb) { struct cramfs_sb_info *sbi = CRAMFS_SB(sb); + generic_shutdown_super(sb); + if (IS_ENABLED(CONFIG_CRAMFS_MTD) && sb->s_mtd) { if (sbi && sbi->mtd_point_size) mtd_unpoint(sb->s_mtd, 0, sbi->mtd_point_size); - kill_mtd_super(sb); + put_mtd_device(sb->s_mtd); + sb->s_mtd = NULL; } else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) { - kill_block_super(sb); + sync_blockdev(sb->s_bdev); + blkdev_put(sb->s_bdev, sb); } kfree(sbi); } diff --git a/fs/dcache.c b/fs/dcache.c index 52e6d5fdab6b..25ac74d30bff 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1664,7 +1664,7 @@ static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) if (dentry == _data && dentry->d_lockref.count == 1) return D_WALK_CONTINUE; - printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} " + WARN(1, "BUG: Dentry %p{i=%lx,n=%pd} " " still in use (%d) [unmount of %s %s]\n", dentry, dentry->d_inode ? @@ -1673,7 +1673,6 @@ static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) dentry->d_lockref.count, dentry->d_sb->s_type->name, dentry->d_sb->s_id); - WARN_ON(1); return D_WALK_CONTINUE; } @@ -3247,8 +3246,6 @@ void d_genocide(struct dentry *parent) d_walk(parent, parent, d_genocide_kill); } -EXPORT_SYMBOL(d_genocide); - void d_tmpfile(struct file *file, struct inode *inode) { struct dentry *dentry = file->f_path.dentry; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 3f81f73c241a..83e57e9f9fa0 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -72,8 +72,7 @@ static struct inode *debugfs_get_inode(struct super_block *sb) struct inode *inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); - inode->i_atime = inode->i_mtime = - inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); } return inode; } diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index fe3db0eda8e4..299c295a27a0 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -338,7 +338,7 @@ static int mknod_ptmx(struct super_block *sb) } inode->i_ino = 2; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); mode = S_IFCHR|opts->ptmxmode; init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2)); @@ -451,7 +451,7 @@ devpts_fill_super(struct super_block *s, void *data, int silent) if (!inode) goto fail; inode->i_ino = 1; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; @@ -534,12 +534,12 @@ void devpts_kill_index(struct pts_fs_info *fsi, int idx) /** * devpts_pty_new -- create a new inode in /dev/pts/ - * @ptmx_inode: inode of the master - * @device: major+minor of the node to be created + * @fsi: Filesystem info for this instance. * @index: used as a name of the node * @priv: what's given back by devpts_get_priv * - * The created inode is returned. Remove it from /dev/pts/ by devpts_pty_kill. + * The dentry for the created inode is returned. + * Remove it from /dev/pts/ with devpts_pty_kill(). */ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv) { @@ -560,7 +560,7 @@ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv) inode->i_ino = index + 3; inode->i_uid = opts->setuid ? opts->uid : current_fsuid(); inode->i_gid = opts->setgid ? opts->gid : current_fsgid(); - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); init_special_inode(inode, S_IFCHR|opts->mode, MKDEV(UNIX98_PTY_SLAVE_MAJOR, index)); sprintf(s, "%d", index); @@ -580,7 +580,7 @@ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv) /** * devpts_get_priv -- get private data for a slave - * @pts_inode: inode of the slave + * @dentry: dentry of the slave * * Returns whatever was passed as priv in devpts_pty_new for a given inode. */ @@ -593,7 +593,7 @@ void *devpts_get_priv(struct dentry *dentry) /** * devpts_pty_kill -- remove inode form /dev/pts/ - * @inode: inode of the slave to be removed + * @dentry: dentry of the slave to be removed * * This is an inverse operation of devpts_pty_new. */ diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index c16f0d660cb7..03bd55069d86 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -441,10 +441,10 @@ int ecryptfs_encrypt_page(struct page *page) } lower_offset = lower_offset_for_page(crypt_stat, page); - enc_extent_virt = kmap(enc_extent_page); + enc_extent_virt = kmap_local_page(enc_extent_page); rc = ecryptfs_write_lower(ecryptfs_inode, enc_extent_virt, lower_offset, PAGE_SIZE); - kunmap(enc_extent_page); + kunmap_local(enc_extent_virt); if (rc < 0) { ecryptfs_printk(KERN_ERR, "Error attempting to write lower page; rc = [%d]\n", @@ -490,10 +490,10 @@ int ecryptfs_decrypt_page(struct page *page) BUG_ON(!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)); lower_offset = lower_offset_for_page(crypt_stat, page); - page_virt = kmap(page); + page_virt = kmap_local_page(page); rc = ecryptfs_read_lower(page_virt, lower_offset, PAGE_SIZE, ecryptfs_inode); - kunmap(page); + kunmap_local(page_virt); if (rc < 0) { ecryptfs_printk(KERN_ERR, "Error attempting to read lower page; rc = [%d]\n", diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 83274915ba6d..992d9c7e64ae 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -148,7 +148,7 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry, } fsstack_copy_attr_times(dir, lower_dir); set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink); - inode->i_ctime = dir->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); out_unlock: dput(lower_dentry); inode_unlock(lower_dir); @@ -982,7 +982,7 @@ static int ecryptfs_getattr_link(struct mnt_idmap *idmap, mount_crypt_stat = &ecryptfs_superblock_to_private( dentry->d_sb)->mount_crypt_stat; - generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), stat); if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) { char *target; size_t targetsiz; @@ -1011,7 +1011,8 @@ static int ecryptfs_getattr(struct mnt_idmap *idmap, if (!rc) { fsstack_copy_attr_all(d_inode(dentry), ecryptfs_inode_to_lower(d_inode(dentry))); - generic_fillattr(&nop_mnt_idmap, d_inode(dentry), stat); + generic_fillattr(&nop_mnt_idmap, request_mask, + d_inode(dentry), stat); stat->blocks = lower_stat.blocks; } return rc; diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 373c3e5747e6..e2483acc4366 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -125,7 +125,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, /* This is a header extent */ char *page_virt; - page_virt = kmap_atomic(page); + page_virt = kmap_local_page(page); memset(page_virt, 0, PAGE_SIZE); /* TODO: Support more than one header extent */ if (view_extent_num == 0) { @@ -138,7 +138,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, crypt_stat, &written); } - kunmap_atomic(page_virt); + kunmap_local(page_virt); flush_dcache_page(page); if (rc) { printk(KERN_ERR "%s: Error reading xattr " @@ -255,7 +255,6 @@ out: * @mapping: The eCryptfs object * @pos: The file offset at which to start writing * @len: Length of the write - * @flags: Various flags * @pagep: Pointer to return the page * @fsdata: Pointer to return fs data (unused) * diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 60bdcaddcbe5..3458f153a588 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -64,11 +64,11 @@ int ecryptfs_write_lower_page_segment(struct inode *ecryptfs_inode, offset = ((((loff_t)page_for_lower->index) << PAGE_SHIFT) + offset_in_page); - virt = kmap(page_for_lower); + virt = kmap_local_page(page_for_lower); rc = ecryptfs_write_lower(ecryptfs_inode, virt, offset, size); if (rc > 0) rc = 0; - kunmap(page_for_lower); + kunmap_local(virt); return rc; } @@ -140,7 +140,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, ecryptfs_page_idx, rc); goto out; } - ecryptfs_page_virt = kmap_atomic(ecryptfs_page); + ecryptfs_page_virt = kmap_local_page(ecryptfs_page); /* * pos: where we're now writing, offset: where the request was @@ -163,7 +163,7 @@ int ecryptfs_write(struct inode *ecryptfs_inode, char *data, loff_t offset, (data + data_offset), num_bytes); data_offset += num_bytes; } - kunmap_atomic(ecryptfs_page_virt); + kunmap_local(ecryptfs_page_virt); flush_dcache_page(ecryptfs_page); SetPageUptodate(ecryptfs_page); unlock_page(ecryptfs_page); @@ -253,11 +253,11 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, int rc; offset = ((((loff_t)page_index) << PAGE_SHIFT) + offset_in_page); - virt = kmap(page_for_ecryptfs); + virt = kmap_local_page(page_for_ecryptfs); rc = ecryptfs_read_lower(virt, offset, size, ecryptfs_inode); if (rc > 0) rc = 0; - kunmap(page_for_ecryptfs); + kunmap_local(virt); flush_dcache_page(page_for_ecryptfs); return rc; } diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index d57ee15874f9..59b52718a3a2 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -51,7 +51,7 @@ static ssize_t efivarfs_file_write(struct file *file, } else { inode_lock(inode); i_size_write(inode, datasize + sizeof(attributes)); - inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); inode_unlock(inode); } diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index b973a2c03dde..db9231f0e77b 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -25,7 +25,7 @@ struct inode *efivarfs_get_inode(struct super_block *sb, if (inode) { inode->i_ino = get_next_ino(); inode->i_mode = mode; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); inode->i_flags = is_removable ? 0 : S_IMMUTABLE; switch (mode & S_IFMT) { case S_IFREG: diff --git a/fs/efs/inode.c b/fs/efs/inode.c index 3ba94bb005a6..3789d22ba501 100644 --- a/fs/efs/inode.c +++ b/fs/efs/inode.c @@ -105,8 +105,8 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) inode->i_size = be32_to_cpu(efs_inode->di_size); inode->i_atime.tv_sec = be32_to_cpu(efs_inode->di_atime); inode->i_mtime.tv_sec = be32_to_cpu(efs_inode->di_mtime); - inode->i_ctime.tv_sec = be32_to_cpu(efs_inode->di_ctime); - inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0; + inode_set_ctime(inode, be32_to_cpu(efs_inode->di_ctime), 0); + inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0; /* this is the number of blocks in the file */ if (inode->i_size == 0) { diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index f259d92c9720..f6dc961e6c2b 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -38,6 +38,7 @@ config EROFS_FS_DEBUG config EROFS_FS_XATTR bool "EROFS extended attributes" depends on EROFS_FS + select XXHASH default y help Extended attributes are name:value pairs associated with inodes by @@ -99,6 +100,21 @@ config EROFS_FS_ZIP_LZMA If unsure, say N. +config EROFS_FS_ZIP_DEFLATE + bool "EROFS DEFLATE compressed data support" + depends on EROFS_FS_ZIP + select ZLIB_INFLATE + help + Saying Y here includes support for reading EROFS file systems + containing DEFLATE compressed data. It gives better compression + ratios than the default LZ4 format, while it costs more CPU + overhead. + + DEFLATE support is an experimental feature for now and so most + file systems will be readable without selecting this option. + + If unsure, say N. + config EROFS_FS_ONDEMAND bool "EROFS fscache-based on-demand read support" depends on CACHEFILES_ONDEMAND && (EROFS_FS=m && FSCACHE || EROFS_FS=y && FSCACHE=y) diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index a3a98fc3e481..994d0b9deddf 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -5,4 +5,5 @@ erofs-objs := super.o inode.o data.o namei.o dir.o utils.o sysfs.o erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o pcpubuf.o erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o +erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index b1b846504027..349c3316ae6b 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -94,4 +94,6 @@ extern const struct z_erofs_decompressor erofs_decompressors[]; /* prototypes for specific algorithms */ int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, struct page **pagepool); +int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq, + struct page **pagepool); #endif diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index cfad1eac7fd9..332ec5f74002 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -379,4 +379,10 @@ const struct z_erofs_decompressor erofs_decompressors[] = { .name = "lzma" }, #endif +#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE + [Z_EROFS_COMPRESSION_DEFLATE] = { + .decompress = z_erofs_deflate_decompress, + .name = "deflate" + }, +#endif }; diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c new file mode 100644 index 000000000000..19e5bdeb30b6 --- /dev/null +++ b/fs/erofs/decompressor_deflate.c @@ -0,0 +1,247 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/module.h> +#include <linux/zlib.h> +#include "compress.h" + +struct z_erofs_deflate { + struct z_erofs_deflate *next; + struct z_stream_s z; + u8 bounce[PAGE_SIZE]; +}; + +static DEFINE_SPINLOCK(z_erofs_deflate_lock); +static unsigned int z_erofs_deflate_nstrms, z_erofs_deflate_avail_strms; +static struct z_erofs_deflate *z_erofs_deflate_head; +static DECLARE_WAIT_QUEUE_HEAD(z_erofs_deflate_wq); + +module_param_named(deflate_streams, z_erofs_deflate_nstrms, uint, 0444); + +void z_erofs_deflate_exit(void) +{ + /* there should be no running fs instance */ + while (z_erofs_deflate_avail_strms) { + struct z_erofs_deflate *strm; + + spin_lock(&z_erofs_deflate_lock); + strm = z_erofs_deflate_head; + if (!strm) { + spin_unlock(&z_erofs_deflate_lock); + continue; + } + z_erofs_deflate_head = NULL; + spin_unlock(&z_erofs_deflate_lock); + + while (strm) { + struct z_erofs_deflate *n = strm->next; + + vfree(strm->z.workspace); + kfree(strm); + --z_erofs_deflate_avail_strms; + strm = n; + } + } +} + +int __init z_erofs_deflate_init(void) +{ + /* by default, use # of possible CPUs instead */ + if (!z_erofs_deflate_nstrms) + z_erofs_deflate_nstrms = num_possible_cpus(); + + for (; z_erofs_deflate_avail_strms < z_erofs_deflate_nstrms; + ++z_erofs_deflate_avail_strms) { + struct z_erofs_deflate *strm; + + strm = kzalloc(sizeof(*strm), GFP_KERNEL); + if (!strm) + goto out_failed; + + /* XXX: in-kernel zlib cannot shrink windowbits currently */ + strm->z.workspace = vmalloc(zlib_inflate_workspacesize()); + if (!strm->z.workspace) { + kfree(strm); + goto out_failed; + } + + spin_lock(&z_erofs_deflate_lock); + strm->next = z_erofs_deflate_head; + z_erofs_deflate_head = strm; + spin_unlock(&z_erofs_deflate_lock); + } + return 0; + +out_failed: + pr_err("failed to allocate zlib workspace\n"); + z_erofs_deflate_exit(); + return -ENOMEM; +} + +int z_erofs_load_deflate_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_deflate_cfgs *dfl, int size) +{ + if (!dfl || size < sizeof(struct z_erofs_deflate_cfgs)) { + erofs_err(sb, "invalid deflate cfgs, size=%u", size); + return -EINVAL; + } + + if (dfl->windowbits > MAX_WBITS) { + erofs_err(sb, "unsupported windowbits %u", dfl->windowbits); + return -EOPNOTSUPP; + } + + erofs_info(sb, "EXPERIMENTAL DEFLATE feature in use. Use at your own risk!"); + return 0; +} + +int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq, + struct page **pagepool) +{ + const unsigned int nrpages_out = + PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + const unsigned int nrpages_in = + PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; + struct super_block *sb = rq->sb; + unsigned int insz, outsz, pofs; + struct z_erofs_deflate *strm; + u8 *kin, *kout = NULL; + bool bounced = false; + int no = -1, ni = 0, j = 0, zerr, err; + + /* 1. get the exact DEFLATE compressed size */ + kin = kmap_local_page(*rq->in); + err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in, + min_t(unsigned int, rq->inputsize, + sb->s_blocksize - rq->pageofs_in)); + if (err) { + kunmap_local(kin); + return err; + } + + /* 2. get an available DEFLATE context */ +again: + spin_lock(&z_erofs_deflate_lock); + strm = z_erofs_deflate_head; + if (!strm) { + spin_unlock(&z_erofs_deflate_lock); + wait_event(z_erofs_deflate_wq, READ_ONCE(z_erofs_deflate_head)); + goto again; + } + z_erofs_deflate_head = strm->next; + spin_unlock(&z_erofs_deflate_lock); + + /* 3. multi-call decompress */ + insz = rq->inputsize; + outsz = rq->outputsize; + zerr = zlib_inflateInit2(&strm->z, -MAX_WBITS); + if (zerr != Z_OK) { + err = -EIO; + goto failed_zinit; + } + + pofs = rq->pageofs_out; + strm->z.avail_in = min_t(u32, insz, PAGE_SIZE - rq->pageofs_in); + insz -= strm->z.avail_in; + strm->z.next_in = kin + rq->pageofs_in; + strm->z.avail_out = 0; + + while (1) { + if (!strm->z.avail_out) { + if (++no >= nrpages_out || !outsz) { + erofs_err(sb, "insufficient space for decompressed data"); + err = -EFSCORRUPTED; + break; + } + + if (kout) + kunmap_local(kout); + strm->z.avail_out = min_t(u32, outsz, PAGE_SIZE - pofs); + outsz -= strm->z.avail_out; + if (!rq->out[no]) { + rq->out[no] = erofs_allocpage(pagepool, + GFP_KERNEL | __GFP_NOFAIL); + set_page_private(rq->out[no], + Z_EROFS_SHORTLIVED_PAGE); + } + kout = kmap_local_page(rq->out[no]); + strm->z.next_out = kout + pofs; + pofs = 0; + } + + if (!strm->z.avail_in && insz) { + if (++ni >= nrpages_in) { + erofs_err(sb, "invalid compressed data"); + err = -EFSCORRUPTED; + break; + } + + if (kout) { /* unlike kmap(), take care of the orders */ + j = strm->z.next_out - kout; + kunmap_local(kout); + } + kunmap_local(kin); + strm->z.avail_in = min_t(u32, insz, PAGE_SIZE); + insz -= strm->z.avail_in; + kin = kmap_local_page(rq->in[ni]); + strm->z.next_in = kin; + bounced = false; + if (kout) { + kout = kmap_local_page(rq->out[no]); + strm->z.next_out = kout + j; + } + } + + /* + * Handle overlapping: Use bounced buffer if the compressed + * data is under processing; Or use short-lived pages from the + * on-stack pagepool where pages share among the same request + * and not _all_ inplace I/O pages are needed to be doubled. + */ + if (!bounced && rq->out[no] == rq->in[ni]) { + memcpy(strm->bounce, strm->z.next_in, strm->z.avail_in); + strm->z.next_in = strm->bounce; + bounced = true; + } + + for (j = ni + 1; j < nrpages_in; ++j) { + struct page *tmppage; + + if (rq->out[no] != rq->in[j]) + continue; + + DBG_BUGON(erofs_page_is_managed(EROFS_SB(sb), + rq->in[j])); + tmppage = erofs_allocpage(pagepool, + GFP_KERNEL | __GFP_NOFAIL); + set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE); + copy_highpage(tmppage, rq->in[j]); + rq->in[j] = tmppage; + } + + zerr = zlib_inflate(&strm->z, Z_SYNC_FLUSH); + if (zerr != Z_OK || !(outsz + strm->z.avail_out)) { + if (zerr == Z_OK && rq->partial_decoding) + break; + if (zerr == Z_STREAM_END && !outsz) + break; + erofs_err(sb, "failed to decompress %d in[%u] out[%u]", + zerr, rq->inputsize, rq->outputsize); + err = -EFSCORRUPTED; + break; + } + } + + if (zlib_inflateEnd(&strm->z) != Z_OK && !err) + err = -EIO; + if (kout) + kunmap_local(kout); +failed_zinit: + kunmap_local(kin); + /* 4. push back DEFLATE stream context to the global list */ + spin_lock(&z_erofs_deflate_lock); + strm->next = z_erofs_deflate_head; + z_erofs_deflate_head = strm; + spin_unlock(&z_erofs_deflate_lock); + wake_up(&z_erofs_deflate_wq); + return err; +} diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 2c7b16e340fe..a03ec70ba6f2 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -13,6 +13,7 @@ #define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001 #define EROFS_FEATURE_COMPAT_MTIME 0x00000002 +#define EROFS_FEATURE_COMPAT_XATTR_FILTER 0x00000004 /* * Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should @@ -81,7 +82,8 @@ struct erofs_super_block { __u8 xattr_prefix_count; /* # of long xattr name prefixes */ __le32 xattr_prefix_start; /* start of long xattr prefixes */ __le64 packed_nid; /* nid of the special packed inode */ - __u8 reserved2[24]; + __u8 xattr_filter_reserved; /* reserved for xattr name filter */ + __u8 reserved2[23]; }; /* @@ -200,7 +202,7 @@ struct erofs_inode_extended { * for read-only fs, no need to introduce h_refcount */ struct erofs_xattr_ibody_header { - __le32 h_reserved; + __le32 h_name_filter; /* bit value 1 indicates not-present */ __u8 h_shared_count; __u8 h_reserved2[7]; __le32 h_shared_xattrs[]; /* shared xattr id array */ @@ -221,6 +223,10 @@ struct erofs_xattr_ibody_header { #define EROFS_XATTR_LONG_PREFIX 0x80 #define EROFS_XATTR_LONG_PREFIX_MASK 0x7f +#define EROFS_XATTR_FILTER_BITS 32 +#define EROFS_XATTR_FILTER_DEFAULT UINT32_MAX +#define EROFS_XATTR_FILTER_SEED 0x25BBE08F + /* xattr entry (for both inline & shared xattrs) */ struct erofs_xattr_entry { __u8 e_name_len; /* length of name */ @@ -289,6 +295,7 @@ struct erofs_dirent { enum { Z_EROFS_COMPRESSION_LZ4 = 0, Z_EROFS_COMPRESSION_LZMA = 1, + Z_EROFS_COMPRESSION_DEFLATE = 2, Z_EROFS_COMPRESSION_MAX }; #define Z_EROFS_ALL_COMPR_ALGS ((1 << Z_EROFS_COMPRESSION_MAX) - 1) @@ -309,6 +316,12 @@ struct z_erofs_lzma_cfgs { #define Z_EROFS_LZMA_MAX_DICT_SIZE (8 * Z_EROFS_PCLUSTER_MAX_SIZE) +/* 6 bytes (+ length field = 8 bytes) */ +struct z_erofs_deflate_cfgs { + u8 windowbits; /* 8..15 for DEFLATE */ + u8 reserved[5]; +} __packed; + /* * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on) * e.g. for 4k logical cluster size, 4B if compacted 2B is off; diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index e12592727a54..edc8ec7581b8 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -105,8 +105,8 @@ static void *erofs_read_inode(struct erofs_buf *buf, set_nlink(inode, le32_to_cpu(die->i_nlink)); /* extended inode has its own timestamp */ - inode->i_ctime.tv_sec = le64_to_cpu(die->i_mtime); - inode->i_ctime.tv_nsec = le32_to_cpu(die->i_mtime_nsec); + inode_set_ctime(inode, le64_to_cpu(die->i_mtime), + le32_to_cpu(die->i_mtime_nsec)); inode->i_size = le64_to_cpu(die->i_size); @@ -148,8 +148,7 @@ static void *erofs_read_inode(struct erofs_buf *buf, set_nlink(inode, le16_to_cpu(dic->i_nlink)); /* use build time for compact inodes */ - inode->i_ctime.tv_sec = sbi->build_time; - inode->i_ctime.tv_nsec = sbi->build_time_nsec; + inode_set_ctime(inode, sbi->build_time, sbi->build_time_nsec); inode->i_size = le32_to_cpu(dic->i_size); if (erofs_inode_is_data_compressed(vi->datalayout)) @@ -176,10 +175,7 @@ static void *erofs_read_inode(struct erofs_buf *buf, vi->chunkbits = sb->s_blocksize_bits + (vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK); } - inode->i_mtime.tv_sec = inode->i_ctime.tv_sec; - inode->i_atime.tv_sec = inode->i_ctime.tv_sec; - inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec; - inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec; + inode->i_mtime = inode->i_atime = inode_get_ctime(inode); inode->i_flags &= ~S_DAX; if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) && @@ -373,7 +369,7 @@ int erofs_getattr(struct mnt_idmap *idmap, const struct path *path, stat->attributes_mask |= (STATX_ATTR_COMPRESSED | STATX_ATTR_IMMUTABLE); - generic_fillattr(idmap, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); return 0; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 36e32fa542f0..4ff88d0dd980 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -151,6 +151,7 @@ struct erofs_sb_info { u32 xattr_prefix_start; u8 xattr_prefix_count; struct erofs_xattr_prefix_item *xattr_prefixes; + unsigned int xattr_filter_reserved; #endif u16 device_id_mask; /* valid bits of device id to be used */ @@ -251,6 +252,7 @@ EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS) EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE) EROFS_FEATURE_FUNCS(xattr_prefixes, incompat, INCOMPAT_XATTR_PREFIXES) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) +EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER) /* atomic flag definitions */ #define EROFS_I_EA_INITED_BIT 0 @@ -270,6 +272,7 @@ struct erofs_inode { unsigned char inode_isize; unsigned int xattr_isize; + unsigned int xattr_name_filter; unsigned int xattr_shared_count; unsigned int *xattr_shared_xattrs; @@ -519,6 +522,26 @@ static inline int z_erofs_load_lzma_config(struct super_block *sb, } #endif /* !CONFIG_EROFS_FS_ZIP_LZMA */ +#ifdef CONFIG_EROFS_FS_ZIP_DEFLATE +int __init z_erofs_deflate_init(void); +void z_erofs_deflate_exit(void); +int z_erofs_load_deflate_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_deflate_cfgs *dfl, int size); +#else +static inline int z_erofs_deflate_init(void) { return 0; } +static inline int z_erofs_deflate_exit(void) { return 0; } +static inline int z_erofs_load_deflate_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_deflate_cfgs *dfl, int size) { + if (dfl) { + erofs_err(sb, "deflate algorithm isn't enabled"); + return -EINVAL; + } + return 0; +} +#endif /* !CONFIG_EROFS_FS_ZIP_DEFLATE */ + #ifdef CONFIG_EROFS_FS_ONDEMAND int erofs_fscache_register_fs(struct super_block *sb); void erofs_fscache_unregister_fs(struct super_block *sb); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 566f68ddfa36..44a24d573f1f 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -19,10 +19,8 @@ #include <trace/events/erofs.h> static struct kmem_cache *erofs_inode_cachep __read_mostly; -struct file_system_type erofs_fs_type; -void _erofs_err(struct super_block *sb, const char *function, - const char *fmt, ...) +void _erofs_err(struct super_block *sb, const char *func, const char *fmt, ...) { struct va_format vaf; va_list args; @@ -32,12 +30,11 @@ void _erofs_err(struct super_block *sb, const char *function, vaf.fmt = fmt; vaf.va = &args; - pr_err("(device %s): %s: %pV", sb->s_id, function, &vaf); + pr_err("(device %s): %s: %pV", sb->s_id, func, &vaf); va_end(args); } -void _erofs_info(struct super_block *sb, const char *function, - const char *fmt, ...) +void _erofs_info(struct super_block *sb, const char *func, const char *fmt, ...) { struct va_format vaf; va_list args; @@ -102,11 +99,9 @@ static void erofs_free_inode(struct inode *inode) { struct erofs_inode *vi = EROFS_I(inode); - /* be careful of RCU symlink path */ if (inode->i_op == &erofs_fast_symlink_iops) kfree(inode->i_link); kfree(vi->xattr_shared_xattrs); - kmem_cache_free(erofs_inode_cachep, vi); } @@ -119,8 +114,7 @@ static bool check_layout_compatibility(struct super_block *sb, /* check if current kernel meets all mandatory requirements */ if (feature & (~EROFS_ALL_FEATURE_INCOMPAT)) { - erofs_err(sb, - "unidentified incompatible feature %x, please upgrade kernel version", + erofs_err(sb, "unidentified incompatible feature %x, please upgrade kernel", feature & ~EROFS_ALL_FEATURE_INCOMPAT); return false; } @@ -201,6 +195,9 @@ static int erofs_load_compr_cfgs(struct super_block *sb, case Z_EROFS_COMPRESSION_LZMA: ret = z_erofs_load_lzma_config(sb, dsb, data, size); break; + case Z_EROFS_COMPRESSION_DEFLATE: + ret = z_erofs_load_deflate_config(sb, dsb, data, size); + break; default: DBG_BUGON(1); ret = -EFAULT; @@ -388,6 +385,7 @@ static int erofs_read_superblock(struct super_block *sb) sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); sbi->xattr_prefix_start = le32_to_cpu(dsb->xattr_prefix_start); sbi->xattr_prefix_count = dsb->xattr_prefix_count; + sbi->xattr_filter_reserved = dsb->xattr_filter_reserved; #endif sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact)); sbi->root_nid = le16_to_cpu(dsb->root_nid); @@ -420,16 +418,11 @@ static int erofs_read_superblock(struct super_block *sb) if (erofs_is_fscache_mode(sb)) erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!"); - if (erofs_sb_has_fragments(sbi)) - erofs_info(sb, "EXPERIMENTAL compressed fragments feature in use. Use at your own risk!"); - if (erofs_sb_has_dedupe(sbi)) - erofs_info(sb, "EXPERIMENTAL global deduplication feature in use. Use at your own risk!"); out: erofs_put_metabuf(&buf); return ret; } -/* set up default EROFS parameters */ static void erofs_default_options(struct erofs_fs_context *ctx) { #ifdef CONFIG_EROFS_FS_ZIP @@ -731,7 +724,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) xa_init(&sbi->managed_pslots); #endif - /* get the root inode */ inode = erofs_iget(sb, ROOT_NID(sbi)); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -748,7 +740,6 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) return -ENOMEM; erofs_shrinker_register(sb); - /* sb->s_umount is already locked, SB_ACTIVE and SB_BORN are not set */ if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) { sbi->packed_inode = erofs_iget(sb, sbi->packed_nid); if (IS_ERR(sbi->packed_inode)) { @@ -881,10 +872,6 @@ static int erofs_init_fs_context(struct fs_context *fc) return 0; } -/* - * could be triggered after deactivate_locked_super() - * is called, thus including umount and failed to initialize. - */ static void erofs_kill_sb(struct super_block *sb) { struct erofs_sb_info *sbi; @@ -913,7 +900,6 @@ static void erofs_kill_sb(struct super_block *sb) sb->s_fs_info = NULL; } -/* called when ->s_root is non-NULL */ static void erofs_put_super(struct super_block *sb) { struct erofs_sb_info *const sbi = EROFS_SB(sb); @@ -950,9 +936,9 @@ static int __init erofs_module_init(void) erofs_check_ondisk_layout_definitions(); erofs_inode_cachep = kmem_cache_create("erofs_inode", - sizeof(struct erofs_inode), 0, - SLAB_RECLAIM_ACCOUNT, - erofs_inode_init_once); + sizeof(struct erofs_inode), 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT, + erofs_inode_init_once); if (!erofs_inode_cachep) return -ENOMEM; @@ -964,6 +950,10 @@ static int __init erofs_module_init(void) if (err) goto lzma_err; + err = z_erofs_deflate_init(); + if (err) + goto deflate_err; + erofs_pcpubuf_init(); err = z_erofs_init_zip_subsystem(); if (err) @@ -984,6 +974,8 @@ fs_err: sysfs_err: z_erofs_exit_zip_subsystem(); zip_err: + z_erofs_deflate_exit(); +deflate_err: z_erofs_lzma_exit(); lzma_err: erofs_exit_shrinker(); @@ -1001,13 +993,13 @@ static void __exit erofs_module_exit(void) erofs_exit_sysfs(); z_erofs_exit_zip_subsystem(); + z_erofs_deflate_exit(); z_erofs_lzma_exit(); erofs_exit_shrinker(); kmem_cache_destroy(erofs_inode_cachep); erofs_pcpubuf_exit(); } -/* get filesystem statistics */ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 40178b6e0688..09d341675e89 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -5,6 +5,7 @@ * Copyright (C) 2021-2022, Alibaba Cloud */ #include <linux/security.h> +#include <linux/xxhash.h> #include "xattr.h" struct erofs_xattr_iter { @@ -87,6 +88,7 @@ static int erofs_init_inode_xattrs(struct inode *inode) } ih = it.kaddr + erofs_blkoff(sb, it.pos); + vi->xattr_name_filter = le32_to_cpu(ih->h_name_filter); vi->xattr_shared_count = ih->h_shared_count; vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count, sizeof(uint), GFP_KERNEL); @@ -392,7 +394,10 @@ int erofs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size) { int ret; + unsigned int hashbit; struct erofs_xattr_iter it; + struct erofs_inode *vi = EROFS_I(inode); + struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); if (!name) return -EINVAL; @@ -401,6 +406,15 @@ int erofs_getxattr(struct inode *inode, int index, const char *name, if (ret) return ret; + /* reserved flag is non-zero if there's any change of on-disk format */ + if (erofs_sb_has_xattr_filter(sbi) && !sbi->xattr_filter_reserved) { + hashbit = xxh32(name, strlen(name), + EROFS_XATTR_FILTER_SEED + index); + hashbit &= EROFS_XATTR_FILTER_BITS - 1; + if (vi->xattr_name_filter & (1U << hashbit)) + return -ENOATTR; + } + it.index = index; it.name = (struct qstr)QSTR_INIT(name, strlen(name)); if (it.name.len > EROFS_NAME_LEN) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index de4f12152b62..036f610e044b 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -143,22 +143,17 @@ static inline void z_erofs_onlinepage_split(struct page *page) atomic_inc((atomic_t *)&page->private); } -static inline void z_erofs_page_mark_eio(struct page *page) +static void z_erofs_onlinepage_endio(struct page *page, int err) { - int orig; + int orig, v; + + DBG_BUGON(!PagePrivate(page)); do { orig = atomic_read((atomic_t *)&page->private); - } while (atomic_cmpxchg((atomic_t *)&page->private, orig, - orig | Z_EROFS_PAGE_EIO) != orig); -} - -static inline void z_erofs_onlinepage_endio(struct page *page) -{ - unsigned int v; + v = (orig - 1) | (err ? Z_EROFS_PAGE_EIO : 0); + } while (atomic_cmpxchg((atomic_t *)&page->private, orig, v) != orig); - DBG_BUGON(!PagePrivate(page)); - v = atomic_dec_return((atomic_t *)&page->private); if (!(v & ~Z_EROFS_PAGE_EIO)) { set_page_private(page, 0); ClearPagePrivate(page); @@ -507,19 +502,17 @@ enum z_erofs_pclustermode { */ Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE, /* - * The current collection has been linked with the owned chain, and - * could also be linked with the remaining collections, which means - * if the processing page is the tail page of the collection, thus - * the current collection can safely use the whole page (since - * the previous collection is under control) for in-place I/O, as - * illustrated below: - * ________________________________________________________________ - * | tail (partial) page | head (partial) page | - * | (of the current cl) | (of the previous collection) | - * | | | - * |__PCLUSTER_FOLLOWED___|___________PCLUSTER_FOLLOWED____________| + * The pcluster was just linked to a decompression chain by us. It can + * also be linked with the remaining pclusters, which means if the + * processing page is the tail page of a pcluster, this pcluster can + * safely use the whole page (since the previous pcluster is within the + * same chain) for in-place I/O, as illustrated below: + * ___________________________________________________ + * | tail (partial) page | head (partial) page | + * | (of the current pcl) | (of the previous pcl) | + * |___PCLUSTER_FOLLOWED___|_____PCLUSTER_FOLLOWED_____| * - * [ (*) the above page can be used as inplace I/O. ] + * [ (*) the page above can be used as inplace I/O. ] */ Z_EROFS_PCLUSTER_FOLLOWED, }; @@ -535,8 +528,6 @@ struct z_erofs_decompress_frontend { z_erofs_next_pcluster_t owned_head; enum z_erofs_pclustermode mode; - /* used for applying cache strategy on the fly */ - bool backmost; erofs_off_t headoffset; /* a pointer used to pick up inplace I/O pages */ @@ -545,7 +536,7 @@ struct z_erofs_decompress_frontend { #define DECOMPRESS_FRONTEND_INIT(__i) { \ .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ - .mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true } + .mode = Z_EROFS_PCLUSTER_FOLLOWED } static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) { @@ -554,7 +545,7 @@ static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED) return false; - if (fe->backmost) + if (!(fe->map.m_flags & EROFS_MAP_FULL_MAPPED)) return true; if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND && @@ -851,9 +842,11 @@ err_out: return err; } -static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) +static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) { struct erofs_map_blocks *map = &fe->map; + struct super_block *sb = fe->inode->i_sb; + erofs_blk_t blknr = erofs_blknr(sb, map->m_pa); struct erofs_workgroup *grp = NULL; int ret; @@ -863,8 +856,7 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); if (!(map->m_flags & EROFS_MAP_META)) { - grp = erofs_find_workgroup(fe->inode->i_sb, - map->m_pa >> PAGE_SHIFT); + grp = erofs_find_workgroup(sb, blknr); } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { DBG_BUGON(1); return -EFSCORRUPTED; @@ -883,9 +875,26 @@ static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) } else if (ret) { return ret; } + z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); - /* since file-backed online pages are traversed in reverse order */ + if (!z_erofs_is_inline_pcluster(fe->pcl)) { + /* bind cache first when cached decompression is preferred */ + z_erofs_bind_cache(fe); + } else { + void *mptr; + + mptr = erofs_read_metabuf(&map->buf, sb, blknr, EROFS_NO_KMAP); + if (IS_ERR(mptr)) { + ret = PTR_ERR(mptr); + erofs_err(sb, "failed to get inline data %d", ret); + return ret; + } + get_page(map->buf.page); + WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page); + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; + } + /* file-backed inplace I/O pages are traversed in reverse order */ fe->icur = z_erofs_pclusterpages(fe->pcl); return 0; } @@ -908,12 +917,12 @@ void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) call_rcu(&pcl->rcu, z_erofs_rcu_callback); } -static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) +static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe) { struct z_erofs_pcluster *pcl = fe->pcl; if (!pcl) - return false; + return; z_erofs_bvec_iter_end(&fe->biter); mutex_unlock(&pcl->lock); @@ -929,37 +938,29 @@ static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) erofs_workgroup_put(&pcl->obj); fe->pcl = NULL; - return true; } -static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, - struct page *page, unsigned int pageofs, - unsigned int len) +static int z_erofs_read_fragment(struct super_block *sb, struct page *page, + unsigned int cur, unsigned int end, erofs_off_t pos) { - struct super_block *sb = inode->i_sb; - struct inode *packed_inode = EROFS_I_SB(inode)->packed_inode; + struct inode *packed_inode = EROFS_SB(sb)->packed_inode; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - u8 *src, *dst; - unsigned int i, cnt; + unsigned int cnt; + u8 *src; if (!packed_inode) return -EFSCORRUPTED; buf.inode = packed_inode; - pos += EROFS_I(inode)->z_fragmentoff; - for (i = 0; i < len; i += cnt) { - cnt = min_t(unsigned int, len - i, + for (; cur < end; cur += cnt, pos += cnt) { + cnt = min_t(unsigned int, end - cur, sb->s_blocksize - erofs_blkoff(sb, pos)); src = erofs_bread(&buf, erofs_blknr(sb, pos), EROFS_KMAP); if (IS_ERR(src)) { erofs_put_metabuf(&buf); return PTR_ERR(src); } - - dst = kmap_local_page(page); - memcpy(dst + pageofs + i, src + erofs_blkoff(sb, pos), cnt); - kunmap_local(dst); - pos += cnt; + memcpy_to_page(page, cur, src + erofs_blkoff(sb, pos), cnt); } erofs_put_metabuf(&buf); return 0; @@ -972,94 +973,60 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); bool tight = true, exclusive; - unsigned int cur, end, spiltted; + unsigned int cur, end, len, split; int err = 0; - /* register locked file pages as online pages in pack */ z_erofs_onlinepage_init(page); - spiltted = 0; + split = 0; end = PAGE_SIZE; repeat: - cur = end - 1; - - if (offset + cur < map->m_la || - offset + cur >= map->m_la + map->m_llen) { - if (z_erofs_collector_end(fe)) - fe->backmost = false; - map->m_la = offset + cur; + if (offset + end - 1 < map->m_la || + offset + end - 1 >= map->m_la + map->m_llen) { + z_erofs_pcluster_end(fe); + map->m_la = offset + end - 1; map->m_llen = 0; err = z_erofs_map_blocks_iter(inode, map, 0); if (err) goto out; - } else { - if (fe->pcl) - goto hitted; - /* didn't get a valid pcluster previously (very rare) */ } - if (!(map->m_flags & EROFS_MAP_MAPPED) || - map->m_flags & EROFS_MAP_FRAGMENT) - goto hitted; - - err = z_erofs_collector_begin(fe); - if (err) - goto out; - - if (z_erofs_is_inline_pcluster(fe->pcl)) { - void *mp; - - mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb, - erofs_blknr(inode->i_sb, map->m_pa), - EROFS_NO_KMAP); - if (IS_ERR(mp)) { - err = PTR_ERR(mp); - erofs_err(inode->i_sb, - "failed to get inline page, err %d", err); - goto out; - } - get_page(fe->map.buf.page); - WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, - fe->map.buf.page); - fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; - } else { - /* bind cache first when cached decompression is preferred */ - z_erofs_bind_cache(fe); - } -hitted: - /* - * Ensure the current partial page belongs to this submit chain rather - * than other concurrent submit chains or the noio(bypass) chain since - * those chains are handled asynchronously thus the page cannot be used - * for inplace I/O or bvpage (should be processed in a strict order.) - */ - tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); + cur = offset > map->m_la ? 0 : map->m_la - offset; + /* bump split parts first to avoid several separate cases */ + ++split; - cur = end - min_t(erofs_off_t, offset + end - map->m_la, end); if (!(map->m_flags & EROFS_MAP_MAPPED)) { zero_user_segment(page, cur, end); + tight = false; goto next_part; } + if (map->m_flags & EROFS_MAP_FRAGMENT) { - unsigned int pageofs, skip, len; + erofs_off_t fpos = offset + cur - map->m_la; - if (offset > map->m_la) { - pageofs = 0; - skip = offset - map->m_la; - } else { - pageofs = map->m_la & ~PAGE_MASK; - skip = 0; - } - len = min_t(unsigned int, map->m_llen - skip, end - cur); - err = z_erofs_read_fragment(inode, skip, page, pageofs, len); + len = min_t(unsigned int, map->m_llen - fpos, end - cur); + err = z_erofs_read_fragment(inode->i_sb, page, cur, cur + len, + EROFS_I(inode)->z_fragmentoff + fpos); if (err) goto out; - ++spiltted; tight = false; goto next_part; } - exclusive = (!cur && (!spiltted || tight)); + if (!fe->pcl) { + err = z_erofs_pcluster_begin(fe); + if (err) + goto out; + } + + /* + * Ensure the current partial page belongs to this submit chain rather + * than other concurrent submit chains or the noio(bypass) chain since + * those chains are handled asynchronously thus the page cannot be used + * for inplace I/O or bvpage (should be processed in a strict order.) + */ + tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); + exclusive = (!cur && ((split <= 1) || tight)); if (cur) tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); @@ -1072,8 +1039,6 @@ hitted: goto out; z_erofs_onlinepage_split(page); - /* bump up the number of spiltted parts of a page */ - ++spiltted; if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) fe->pcl->multibases = true; if (fe->pcl->length < offset + end - map->m_la) { @@ -1094,9 +1059,7 @@ next_part: goto repeat; out: - if (err) - z_erofs_page_mark_eio(page); - z_erofs_onlinepage_endio(page); + z_erofs_onlinepage_endio(page, err); return err; } @@ -1199,9 +1162,7 @@ static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, cur += len; } kunmap_local(dst); - if (err) - z_erofs_page_mark_eio(bvi->bvec.page); - z_erofs_onlinepage_endio(bvi->bvec.page); + z_erofs_onlinepage_endio(bvi->bvec.page, err); list_del(p); kfree(bvi); } @@ -1372,9 +1333,7 @@ out: /* recycle all individual short-lived pages */ if (z_erofs_put_shortlivedpage(be->pagepool, page)) continue; - if (err) - z_erofs_page_mark_eio(page); - z_erofs_onlinepage_endio(page); + z_erofs_onlinepage_endio(page, err); } if (be->decompressed_pages != be->onstack_pages) @@ -1410,7 +1369,10 @@ static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, owned = READ_ONCE(be.pcl->next); z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0); - erofs_workgroup_put(&be.pcl->obj); + if (z_erofs_is_inline_pcluster(be.pcl)) + z_erofs_free_pcluster(be.pcl); + else + erofs_workgroup_put(&be.pcl->obj); } } @@ -1848,15 +1810,10 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, page = erofs_grab_cache_page_nowait(inode->i_mapping, index); if (page) { - if (PageUptodate(page)) { + if (PageUptodate(page)) unlock_page(page); - } else { - err = z_erofs_do_read_page(f, page); - if (err) - erofs_err(inode->i_sb, - "readmore error at page %lu @ nid %llu", - index, EROFS_I(inode)->nid); - } + else + (void)z_erofs_do_read_page(f, page); put_page(page); } @@ -1868,25 +1825,25 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, static int z_erofs_read_folio(struct file *file, struct folio *folio) { - struct page *page = &folio->page; - struct inode *const inode = page->mapping->host; + struct inode *const inode = folio->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); int err; - trace_erofs_readpage(page, false); - f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; + trace_erofs_read_folio(folio, false); + f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT; z_erofs_pcluster_readmore(&f, NULL, true); - err = z_erofs_do_read_page(&f, page); + err = z_erofs_do_read_page(&f, &folio->page); z_erofs_pcluster_readmore(&f, NULL, false); - (void)z_erofs_collector_end(&f); + z_erofs_pcluster_end(&f); /* if some compressed cluster ready, need submit them anyway */ z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false); - if (err) - erofs_err(inode->i_sb, "failed to read, err [%d]", err); + if (err && err != -EINTR) + erofs_err(inode->i_sb, "read error %d @ %lu of nid %llu", + err, folio->index, EROFS_I(inode)->nid); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&f.pagepool); @@ -1898,38 +1855,35 @@ static void z_erofs_readahead(struct readahead_control *rac) struct inode *const inode = rac->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - struct page *head = NULL, *page; - unsigned int nr_pages; + struct folio *head = NULL, *folio; + unsigned int nr_folios; + int err; f.headoffset = readahead_pos(rac); z_erofs_pcluster_readmore(&f, rac, true); - nr_pages = readahead_count(rac); - trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); + nr_folios = readahead_count(rac); + trace_erofs_readpages(inode, readahead_index(rac), nr_folios, false); - while ((page = readahead_page(rac))) { - set_page_private(page, (unsigned long)head); - head = page; + while ((folio = readahead_folio(rac))) { + folio->private = head; + head = folio; } + /* traverse in reverse order for best metadata I/O performance */ while (head) { - struct page *page = head; - int err; - - /* traversal in reverse order */ - head = (void *)page_private(page); + folio = head; + head = folio_get_private(folio); - err = z_erofs_do_read_page(&f, page); - if (err) - erofs_err(inode->i_sb, - "readahead error at page %lu @ nid %llu", - page->index, EROFS_I(inode)->nid); - put_page(page); + err = z_erofs_do_read_page(&f, &folio->page); + if (err && err != -EINTR) + erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu", + folio->index, EROFS_I(inode)->nid); } z_erofs_pcluster_readmore(&f, rac, false); - (void)z_erofs_collector_end(&f); + z_erofs_pcluster_end(&f); - z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_pages), true); + z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_folios), true); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&f.pagepool); } diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 1909ddafd9c7..7b55111fd533 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -561,8 +561,9 @@ static int z_erofs_do_map_blocks(struct inode *inode, if ((flags & EROFS_GET_BLOCKS_FIEMAP) || ((flags & EROFS_GET_BLOCKS_READMORE) && - map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA && - map->m_llen >= i_blocksize(inode))) { + (map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA || + map->m_algorithmformat == Z_EROFS_COMPRESSION_DEFLATE) && + map->m_llen >= i_blocksize(inode))) { err = z_erofs_get_extent_decompressedlen(&m); if (!err) map->m_flags |= EROFS_MAP_FULL_MAPPED; diff --git a/fs/eventfd.c b/fs/eventfd.c index 8aa36cd37351..33a918f9566c 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -189,7 +189,7 @@ void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) { lockdep_assert_held(&ctx->wqh.lock); - *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; + *cnt = ((ctx->flags & EFD_SEMAPHORE) && ctx->count) ? 1 : ctx->count; ctx->count -= *cnt; } EXPORT_SYMBOL_GPL(eventfd_ctx_do_read); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 4b1b3362f697..1d9a71a0c4c1 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -975,15 +975,11 @@ again: static int ep_alloc(struct eventpoll **pep) { - int error; - struct user_struct *user; struct eventpoll *ep; - user = get_current_user(); - error = -ENOMEM; ep = kzalloc(sizeof(*ep), GFP_KERNEL); if (unlikely(!ep)) - goto free_uid; + return -ENOMEM; mutex_init(&ep->mtx); rwlock_init(&ep->lock); @@ -992,16 +988,12 @@ static int ep_alloc(struct eventpoll **pep) INIT_LIST_HEAD(&ep->rdllist); ep->rbr = RB_ROOT_CACHED; ep->ovflist = EP_UNACTIVE_PTR; - ep->user = user; + ep->user = get_current_user(); refcount_set(&ep->refcount, 1); *pep = ep; return 0; - -free_uid: - free_uid(user); - return error; } /* diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 729ada9e26e8..f55498e5c23d 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -273,8 +273,6 @@ struct exfat_sb_info { spinlock_t inode_hash_lock; struct hlist_head inode_hashtable[EXFAT_HASH_SIZE]; - - struct rcu_head rcu; }; #define EXFAT_CACHE_VALID 0 diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 3cbd270e0cba..32395ef686a2 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -22,7 +22,7 @@ static int exfat_cont_expand(struct inode *inode, loff_t size) if (err) return err; - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); mark_inode_dirty(inode); if (!IS_SYNC(inode)) @@ -232,7 +232,7 @@ int exfat_getattr(struct mnt_idmap *idmap, const struct path *path, struct inode *inode = d_backing_inode(path->dentry); struct exfat_inode_info *ei = EXFAT_I(inode); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); exfat_truncate_atime(&stat->atime); stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = ei->i_crtime.tv_sec; @@ -290,7 +290,7 @@ int exfat_setattr(struct mnt_idmap *idmap, struct dentry *dentry, } if (attr->ia_valid & ATTR_SIZE) - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); setattr_copy(&nop_mnt_idmap, inode, attr); exfat_truncate_atime(&inode->i_atime); diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 481dd338f2b8..13329baeafbc 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -355,7 +355,7 @@ static void exfat_write_failed(struct address_space *mapping, loff_t to) if (to > i_size_read(inode)) { truncate_pagecache(inode, i_size_read(inode)); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); exfat_truncate(inode); } } @@ -398,7 +398,7 @@ static int exfat_write_end(struct file *file, struct address_space *mapping, exfat_write_failed(mapping, pos+len); if (!(err < 0) && !(ei->attr & ATTR_ARCHIVE)) { - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); ei->attr |= ATTR_ARCHIVE; mark_inode_dirty(inode); } @@ -577,7 +577,7 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info) inode->i_blocks = round_up(i_size_read(inode), sbi->cluster_size) >> 9; inode->i_mtime = info->mtime; - inode->i_ctime = info->mtime; + inode_set_ctime_to_ts(inode, info->mtime); ei->i_crtime = info->crtime; inode->i_atime = info->atime; diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index e0ff9d156f6f..1b9f587f6cca 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -569,7 +569,7 @@ static int exfat_create(struct mnt_idmap *idmap, struct inode *dir, goto unlock; inode_inc_iversion(dir); - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); if (IS_DIRSYNC(dir)) exfat_sync_inode(dir); else @@ -582,8 +582,7 @@ static int exfat_create(struct mnt_idmap *idmap, struct inode *dir, goto unlock; inode_inc_iversion(inode); - inode->i_mtime = inode->i_atime = inode->i_ctime = - EXFAT_I(inode)->i_crtime = current_time(inode); + inode->i_mtime = inode->i_atime = EXFAT_I(inode)->i_crtime = inode_set_ctime_current(inode); exfat_truncate_atime(&inode->i_atime); /* timestamp is already written, so mark_inode_dirty() is unneeded. */ @@ -817,7 +816,7 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry) ei->dir.dir = DIR_DELETED; inode_inc_iversion(dir); - dir->i_mtime = dir->i_atime = current_time(dir); + dir->i_mtime = dir->i_atime = inode_set_ctime_current(dir); exfat_truncate_atime(&dir->i_atime); if (IS_DIRSYNC(dir)) exfat_sync_inode(dir); @@ -825,7 +824,7 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry) mark_inode_dirty(dir); clear_nlink(inode); - inode->i_mtime = inode->i_atime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); exfat_truncate_atime(&inode->i_atime); exfat_unhash_inode(inode); exfat_d_version_set(dentry, inode_query_iversion(dir)); @@ -852,7 +851,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, goto unlock; inode_inc_iversion(dir); - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); if (IS_DIRSYNC(dir)) exfat_sync_inode(dir); else @@ -866,8 +865,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, goto unlock; inode_inc_iversion(inode); - inode->i_mtime = inode->i_atime = inode->i_ctime = - EXFAT_I(inode)->i_crtime = current_time(inode); + inode->i_mtime = inode->i_atime = EXFAT_I(inode)->i_crtime = inode_set_ctime_current(inode); exfat_truncate_atime(&inode->i_atime); /* timestamp is already written, so mark_inode_dirty() is unneeded. */ @@ -979,7 +977,7 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry) ei->dir.dir = DIR_DELETED; inode_inc_iversion(dir); - dir->i_mtime = dir->i_atime = current_time(dir); + dir->i_mtime = dir->i_atime = inode_set_ctime_current(dir); exfat_truncate_atime(&dir->i_atime); if (IS_DIRSYNC(dir)) exfat_sync_inode(dir); @@ -988,7 +986,7 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry) drop_nlink(dir); clear_nlink(inode); - inode->i_mtime = inode->i_atime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); exfat_truncate_atime(&inode->i_atime); exfat_unhash_inode(inode); exfat_d_version_set(dentry, inode_query_iversion(dir)); @@ -1312,8 +1310,8 @@ static int exfat_rename(struct mnt_idmap *idmap, goto unlock; inode_inc_iversion(new_dir); - new_dir->i_ctime = new_dir->i_mtime = new_dir->i_atime = - EXFAT_I(new_dir)->i_crtime = current_time(new_dir); + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); + EXFAT_I(new_dir)->i_crtime = current_time(new_dir); exfat_truncate_atime(&new_dir->i_atime); if (IS_DIRSYNC(new_dir)) exfat_sync_inode(new_dir); @@ -1336,7 +1334,6 @@ static int exfat_rename(struct mnt_idmap *idmap, } inode_inc_iversion(old_dir); - old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); if (IS_DIRSYNC(old_dir)) exfat_sync_inode(old_dir); else @@ -1354,8 +1351,7 @@ static int exfat_rename(struct mnt_idmap *idmap, exfat_warn(sb, "abnormal access to an inode dropped"); WARN_ON(new_inode->i_nlink == 0); } - new_inode->i_ctime = EXFAT_I(new_inode)->i_crtime = - current_time(new_inode); + EXFAT_I(new_inode)->i_crtime = current_time(new_inode); } unlock: diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 8c32460e031e..2778bd9b631e 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -31,16 +31,6 @@ static void exfat_free_iocharset(struct exfat_sb_info *sbi) kfree(sbi->options.iocharset); } -static void exfat_delayed_free(struct rcu_head *p) -{ - struct exfat_sb_info *sbi = container_of(p, struct exfat_sb_info, rcu); - - unload_nls(sbi->nls_io); - exfat_free_iocharset(sbi); - exfat_free_upcase_table(sbi); - kfree(sbi); -} - static void exfat_put_super(struct super_block *sb) { struct exfat_sb_info *sbi = EXFAT_SB(sb); @@ -50,7 +40,8 @@ static void exfat_put_super(struct super_block *sb) brelse(sbi->boot_bh); mutex_unlock(&sbi->s_lock); - call_rcu(&sbi->rcu, exfat_delayed_free); + unload_nls(sbi->nls_io); + exfat_free_upcase_table(sbi); } static int exfat_sync_fs(struct super_block *sb, int wait) @@ -379,8 +370,7 @@ static int exfat_read_root(struct inode *inode) ei->i_size_ondisk = i_size_read(inode); exfat_save_attr(inode, ATTR_SUBDIR); - inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = - current_time(inode); + inode->i_mtime = inode->i_atime = ei->i_crtime = inode_set_ctime_current(inode); exfat_truncate_atime(&inode->i_atime); return 0; } @@ -710,9 +700,6 @@ free_table: check_nls_io: unload_nls(sbi->nls_io); - exfat_free_iocharset(sbi); - sb->s_fs_info = NULL; - kfree(sbi); return err; } @@ -721,14 +708,18 @@ static int exfat_get_tree(struct fs_context *fc) return get_tree_bdev(fc, exfat_fill_super); } +static void exfat_free_sbi(struct exfat_sb_info *sbi) +{ + exfat_free_iocharset(sbi); + kfree(sbi); +} + static void exfat_free(struct fs_context *fc) { struct exfat_sb_info *sbi = fc->s_fs_info; - if (sbi) { - exfat_free_iocharset(sbi); - kfree(sbi); - } + if (sbi) + exfat_free_sbi(sbi); } static int exfat_reconfigure(struct fs_context *fc) @@ -773,12 +764,21 @@ static int exfat_init_fs_context(struct fs_context *fc) return 0; } +static void exfat_kill_sb(struct super_block *sb) +{ + struct exfat_sb_info *sbi = sb->s_fs_info; + + kill_block_super(sb); + if (sbi) + exfat_free_sbi(sbi); +} + static struct file_system_type exfat_fs_type = { .owner = THIS_MODULE, .name = "exfat", .init_fs_context = exfat_init_fs_context, .parameters = exfat_parameters, - .kill_sb = kill_block_super, + .kill_sb = exfat_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index 82b17d7fc93f..7e54c31589c7 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -237,7 +237,7 @@ ext2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, error = __ext2_set_acl(inode, acl, type); if (!error && update_mode) { inode->i_mode = mode; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); } return error; diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 42db804794bd..b335f17f682f 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -468,7 +468,7 @@ int ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, ext2_set_de_type(de, inode); ext2_commit_chunk(page, pos, len); if (update_times) - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(dir); return ext2_handle_dirsync(dir); @@ -555,7 +555,7 @@ got_it: de->inode = cpu_to_le32(inode->i_ino); ext2_set_de_type (de, inode); ext2_commit_chunk(page, pos, rec_len); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); EXT2_I(dir)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(dir); err = ext2_handle_dirsync(dir); @@ -606,7 +606,7 @@ int ext2_delete_entry(struct ext2_dir_entry_2 *dir, struct page *page) pde->rec_len = ext2_rec_len_to_disk(to - from); dir->inode = 0; ext2_commit_chunk(page, pos, to - from); - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); EXT2_I(inode)->i_flags &= ~EXT2_BTREE_FL; mark_inode_dirty(inode); return ext2_handle_dirsync(inode); diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index a4e1d7a9c544..124df89689e1 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -549,7 +549,7 @@ got: inode->i_ino = ino; inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); memset(ei->i_data, 0, sizeof(ei->i_data)); ei->i_flags = ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED); diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 75983215c7a1..acbab27fe957 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -595,7 +595,7 @@ static void ext2_splice_branch(struct inode *inode, if (where->bh) mark_buffer_dirty_inode(where->bh, inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); } @@ -1287,7 +1287,7 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) __ext2_truncate_blocks(inode, newsize); filemap_invalidate_unlock(inode->i_mapping); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); if (inode_needs_sync(inode)) { sync_mapping_buffers(inode->i_mapping); sync_inode_metadata(inode, 1); @@ -1409,9 +1409,9 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le32_to_cpu(raw_inode->i_size); inode->i_atime.tv_sec = (signed)le32_to_cpu(raw_inode->i_atime); - inode->i_ctime.tv_sec = (signed)le32_to_cpu(raw_inode->i_ctime); + inode_set_ctime(inode, (signed)le32_to_cpu(raw_inode->i_ctime), 0); inode->i_mtime.tv_sec = (signed)le32_to_cpu(raw_inode->i_mtime); - inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = 0; + inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. * This is needed because nfsd might try to access dead inodes @@ -1541,7 +1541,7 @@ static int __ext2_write_inode(struct inode *inode, int do_sync) raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); raw_inode->i_size = cpu_to_le32(inode->i_size); raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec); - raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec); + raw_inode->i_ctime = cpu_to_le32(inode_get_ctime(inode).tv_sec); raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec); raw_inode->i_blocks = cpu_to_le32(inode->i_blocks); @@ -1628,7 +1628,7 @@ int ext2_getattr(struct mnt_idmap *idmap, const struct path *path, STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index cc87d413eb43..44e04484e570 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -44,7 +44,7 @@ int ext2_fileattr_set(struct mnt_idmap *idmap, (fa->flags & EXT2_FL_USER_MODIFIABLE); ext2_set_inode_flags(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); return 0; @@ -77,7 +77,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } inode_lock(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode->i_generation = generation; inode_unlock(inode); diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 937dd8f60f96..059517068adc 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -211,7 +211,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir, if (err) return err; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_link_count(inode); ihold(inode); @@ -291,7 +291,7 @@ static int ext2_unlink(struct inode *dir, struct dentry *dentry) if (err) goto out; - inode->i_ctime = dir->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); inode_dec_link_count(inode); err = 0; out: @@ -367,7 +367,7 @@ static int ext2_rename (struct mnt_idmap * idmap, ext2_put_page(new_page, new_de); if (err) goto out_dir; - new_inode->i_ctime = current_time(new_inode); + inode_set_ctime_current(new_inode); if (dir_de) drop_nlink(new_inode); inode_dec_link_count(new_inode); @@ -383,7 +383,7 @@ static int ext2_rename (struct mnt_idmap * idmap, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old_inode->i_ctime = current_time(old_inode); + inode_set_ctime_current(old_inode); mark_inode_dirty(old_inode); err = ext2_delete_entry(old_de, old_page); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 2959afc7541c..aaf3e3e88cb2 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1572,7 +1572,7 @@ out: if (inode->i_size < off+len-towrite) i_size_write(inode, off+len-towrite); inode_inc_iversion(inode); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); mark_inode_dirty(inode); return len - towrite; } diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 8906ba479aaf..1c9187188d68 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -773,7 +773,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, /* Update the inode. */ EXT2_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); if (IS_SYNC(inode)) { error = sync_inode_metadata(inode, 1); /* In case sync failed due to ENOSPC the inode was actually diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 27fcbddfb148..3bffe862f954 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -259,7 +259,7 @@ retry: error = __ext4_set_acl(handle, inode, type, acl, 0 /* xattr_flags */); if (!error && update_mode) { inode->i_mode = mode; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); error = ext4_mark_inode_dirty(handle, inode); } out_stop: diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0a2d55faa095..1e2259d9967d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -868,64 +868,70 @@ struct ext4_inode { * affected filesystem before 2242. */ -static inline __le32 ext4_encode_extra_time(struct timespec64 *time) +static inline __le32 ext4_encode_extra_time(struct timespec64 ts) { - u32 extra =((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK; - return cpu_to_le32(extra | (time->tv_nsec << EXT4_EPOCH_BITS)); + u32 extra = ((ts.tv_sec - (s32)ts.tv_sec) >> 32) & EXT4_EPOCH_MASK; + return cpu_to_le32(extra | (ts.tv_nsec << EXT4_EPOCH_BITS)); } -static inline void ext4_decode_extra_time(struct timespec64 *time, - __le32 extra) +static inline struct timespec64 ext4_decode_extra_time(__le32 base, + __le32 extra) { + struct timespec64 ts = { .tv_sec = (signed)le32_to_cpu(base) }; + if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) - time->tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; - time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; + ts.tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; + ts.tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; + return ts; } -#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ +#define EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, ts) \ do { \ - if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {\ - (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ - (raw_inode)->xtime ## _extra = \ - ext4_encode_extra_time(&(inode)->xtime); \ - } \ - else \ - (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (inode)->xtime.tv_sec, S32_MIN, S32_MAX)); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \ + (raw_inode)->xtime = cpu_to_le32((ts).tv_sec); \ + (raw_inode)->xtime ## _extra = ext4_encode_extra_time(ts); \ + } else \ + (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (ts).tv_sec, S32_MIN, S32_MAX)); \ } while (0) -#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ -do { \ - if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ - (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \ - if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ - (raw_inode)->xtime ## _extra = \ - ext4_encode_extra_time(&(einode)->xtime); \ -} while (0) +#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ + EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, (inode)->xtime) + +#define EXT4_INODE_SET_CTIME(inode, raw_inode) \ + EXT4_INODE_SET_XTIME_VAL(i_ctime, inode, raw_inode, inode_get_ctime(inode)) + +#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + EXT4_INODE_SET_XTIME_VAL(xtime, &((einode)->vfs_inode), \ + raw_inode, (einode)->xtime) + +#define EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode) \ + (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra) ? \ + ext4_decode_extra_time((raw_inode)->xtime, \ + (raw_inode)->xtime ## _extra) : \ + (struct timespec64) { \ + .tv_sec = (signed)le32_to_cpu((raw_inode)->xtime) \ + }) #define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \ do { \ - (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ - if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \ - ext4_decode_extra_time(&(inode)->xtime, \ - raw_inode->xtime ## _extra); \ - } \ - else \ - (inode)->xtime.tv_nsec = 0; \ + (inode)->xtime = EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode); \ } while (0) +#define EXT4_INODE_GET_CTIME(inode, raw_inode) \ +do { \ + inode_set_ctime_to_ts(inode, \ + EXT4_INODE_GET_XTIME_VAL(i_ctime, inode, raw_inode)); \ +} while (0) -#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ -do { \ - if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ - (einode)->xtime.tv_sec = \ - (signed)le32_to_cpu((raw_inode)->xtime); \ - else \ - (einode)->xtime.tv_sec = 0; \ - if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ - ext4_decode_extra_time(&(einode)->xtime, \ - raw_inode->xtime ## _extra); \ - else \ - (einode)->xtime.tv_nsec = 0; \ +#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (einode)->xtime = \ + EXT4_INODE_GET_XTIME_VAL(xtime, &(einode->vfs_inode), \ + raw_inode); \ + else \ + (einode)->xtime = (struct timespec64){0, 0}; \ } while (0) #define i_disk_version osd1.linux1.l_i_version diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 77f318ec8abb..b38d59581411 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -234,8 +234,7 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line, might_sleep(); - if (bh->b_bdev->bd_super) - ext4_check_bdev_write_error(bh->b_bdev->bd_super); + ext4_check_bdev_write_error(sb); if (ext4_handle_valid(handle)) { err = jbd2_journal_get_write_access(handle, bh); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e4115d338f10..202c76996b62 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4476,12 +4476,12 @@ retry: map.m_lblk += ret; map.m_len = len = len - ret; epos = (loff_t)map.m_lblk << inode->i_blkbits; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); if (new_size) { if (epos > new_size) epos = new_size; if (ext4_update_inode_size(inode, epos) & 0x1) - inode->i_mtime = inode->i_ctime; + inode->i_mtime = inode_get_ctime(inode); } ret2 = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); @@ -4617,7 +4617,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, /* Now release the pages and zero block aligned part of pages */ truncate_pagecache_range(inode, start, end - 1); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); @@ -4642,7 +4642,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, goto out_mutex; } - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); if (new_size) ext4_update_inode_size(inode, new_size); ret = ext4_mark_inode_dirty(handle, inode); @@ -5378,7 +5378,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); ret = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); @@ -5488,7 +5488,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; EXT4_I(inode)->i_disksize += len; - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); ret = ext4_mark_inode_dirty(handle, inode); if (ret) goto out_stop; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 754f961cd9fd..48abef5f23e7 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1250,7 +1250,7 @@ got: inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb); /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); ei->i_crtime = inode->i_mtime; memset(ei->i_data, 0, sizeof(ei->i_data)); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index a4b7e4bc32d4..003861037374 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1037,7 +1037,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, * happen is that the times are slightly out of date * and/or different from the directory change time. */ - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); ext4_update_dx_flag(dir); inode_inc_iversion(dir); return 1; @@ -1991,7 +1991,7 @@ out: ext4_orphan_del(handle, inode); if (err == 0) { - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); err = ext4_mark_inode_dirty(handle, inode); if (IS_SYNC(inode)) ext4_handle_sync(handle); diff --git a/fs/ext4/inode-test.c b/fs/ext4/inode-test.c index 7935ea6cf92c..f0c0fd507fbc 100644 --- a/fs/ext4/inode-test.c +++ b/fs/ext4/inode-test.c @@ -245,9 +245,9 @@ static void inode_test_xtimestamp_decoding(struct kunit *test) struct timestamp_expectation *test_param = (struct timestamp_expectation *)(test->param_value); - timestamp.tv_sec = get_32bit_time(test_param); - ext4_decode_extra_time(×tamp, - cpu_to_le32(test_param->extra_bits)); + timestamp = ext4_decode_extra_time( + cpu_to_le32(get_32bit_time(test_param)), + cpu_to_le32(test_param->extra_bits)); KUNIT_EXPECT_EQ_MSG(test, test_param->expected.tv_sec, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 43775a6ca505..6683076ecb2f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3986,7 +3986,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) if (IS_SYNC(inode)) ext4_handle_sync(handle); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); ret2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret2)) ret = ret2; @@ -4146,7 +4146,7 @@ out_stop: if (inode->i_nlink) ext4_orphan_del(handle, inode); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); err2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(err2 && !err)) err = err2; @@ -4249,7 +4249,7 @@ static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode } raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); - EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); + EXT4_INODE_SET_CTIME(inode, raw_inode); EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); @@ -4858,7 +4858,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, } } - EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode); + EXT4_INODE_GET_CTIME(inode, raw_inode); EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode); EXT4_INODE_GET_XTIME(i_atime, inode, raw_inode); EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); @@ -4981,7 +4981,7 @@ static void __ext4_update_other_inode_time(struct super_block *sb, spin_unlock(&inode->i_lock); spin_lock(&ei->i_raw_lock); - EXT4_INODE_SET_XTIME(i_ctime, inode, raw_inode); + EXT4_INODE_SET_CTIME(inode, raw_inode); EXT4_INODE_SET_XTIME(i_mtime, inode, raw_inode); EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); ext4_inode_csum_set(inode, raw_inode, ei); @@ -5376,10 +5376,8 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, * Update c/mtime on truncate up, ext4_truncate() will * update c/mtime in shrink case below */ - if (!shrink) { - inode->i_mtime = current_time(inode); - inode->i_ctime = inode->i_mtime; - } + if (!shrink) + inode->i_mtime = inode_set_ctime_current(inode); if (shrink) ext4_fc_track_range(handle, inode, @@ -5537,7 +5535,7 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, STATX_ATTR_NODUMP | STATX_ATTR_VERITY); - generic_fillattr(idmap, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); return 0; } diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 331859511f80..b0349f451863 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -449,7 +449,8 @@ static long swap_inode_boot_loader(struct super_block *sb, diff = size - size_bl; swap_inode_data(inode, inode_bl); - inode->i_ctime = inode_bl->i_ctime = current_time(inode); + inode_set_ctime_current(inode); + inode_set_ctime_current(inode_bl); inode_inc_iversion(inode); inode->i_generation = get_random_u32(); @@ -663,7 +664,7 @@ static int ext4_ioctl_setflags(struct inode *inode, ext4_set_inode_flags(inode, false); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_iversion(inode); err = ext4_mark_iloc_dirty(handle, inode, &iloc); @@ -774,7 +775,7 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid) } EXT4_I(inode)->i_projid = kprojid; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_iversion(inode); out_dirty: rc = ext4_mark_iloc_dirty(handle, inode, &iloc); @@ -1266,7 +1267,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } err = ext4_reserve_inode_write(handle, inode, &iloc); if (err == 0) { - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_iversion(inode); inode->i_generation = generation; err = ext4_mark_iloc_dirty(handle, inode, &iloc); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 0caf6c730ce3..933ad03f4f58 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2203,7 +2203,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, * happen is that the times are slightly out of date * and/or different from the directory change time. */ - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); ext4_update_dx_flag(dir); inode_inc_iversion(dir); err2 = ext4_mark_inode_dirty(handle, dir); @@ -3197,7 +3197,8 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) * recovery. */ inode->i_size = 0; ext4_orphan_add(handle, inode); - inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); + dir->i_mtime = inode_set_ctime_current(dir); + inode_set_ctime_current(inode); retval = ext4_mark_inode_dirty(handle, inode); if (retval) goto end_rmdir; @@ -3271,7 +3272,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name, retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto out_handle; - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); ext4_update_dx_flag(dir); retval = ext4_mark_inode_dirty(handle, dir); if (retval) @@ -3286,7 +3287,7 @@ int __ext4_unlink(struct inode *dir, const struct qstr *d_name, drop_nlink(inode); if (!inode->i_nlink) ext4_orphan_add(handle, inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); retval = ext4_mark_inode_dirty(handle, inode); if (dentry && !retval) ext4_fc_track_unlink(handle, dentry); @@ -3463,7 +3464,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); ext4_inc_count(inode); ihold(inode); @@ -3641,8 +3642,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent, if (ext4_has_feature_filetype(ent->dir->i_sb)) ent->de->file_type = file_type; inode_inc_iversion(ent->dir); - ent->dir->i_ctime = ent->dir->i_mtime = - current_time(ent->dir); + ent->dir->i_mtime = inode_set_ctime_current(ent->dir); retval = ext4_mark_inode_dirty(handle, ent->dir); BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata"); if (!ent->inlined) { @@ -3941,7 +3941,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old.inode->i_ctime = current_time(old.inode); + inode_set_ctime_current(old.inode); retval = ext4_mark_inode_dirty(handle, old.inode); if (unlikely(retval)) goto end_rename; @@ -3955,9 +3955,9 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (new.inode) { ext4_dec_count(new.inode); - new.inode->i_ctime = current_time(new.inode); + inode_set_ctime_current(new.inode); } - old.dir->i_ctime = old.dir->i_mtime = current_time(old.dir); + old.dir->i_mtime = inode_set_ctime_current(old.dir); ext4_update_dx_flag(old.dir); if (old.dir_bh) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); @@ -4053,7 +4053,6 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, }; u8 new_file_type; int retval; - struct timespec64 ctime; if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) && !projid_eq(EXT4_I(new_dir)->i_projid, @@ -4147,9 +4146,8 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - ctime = current_time(old.inode); - old.inode->i_ctime = ctime; - new.inode->i_ctime = ctime; + inode_set_ctime_current(old.inode); + inode_set_ctime_current(new.inode); retval = ext4_mark_inode_dirty(handle, old.inode); if (unlikely(retval)) goto end_rename; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c94ebf704616..73547d2334fd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -93,6 +93,7 @@ static int ext4_get_tree(struct fs_context *fc); static int ext4_reconfigure(struct fs_context *fc); static void ext4_fc_free(struct fs_context *fc); static int ext4_init_fs_context(struct fs_context *fc); +static void ext4_kill_sb(struct super_block *sb); static const struct fs_parameter_spec ext4_param_specs[]; /* @@ -135,12 +136,12 @@ static struct file_system_type ext2_fs_type = { .name = "ext2", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, - .kill_sb = kill_block_super, + .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext2"); MODULE_ALIAS("ext2"); -#define IS_EXT2_SB(sb) ((sb)->s_bdev->bd_holder == &ext2_fs_type) +#define IS_EXT2_SB(sb) ((sb)->s_type == &ext2_fs_type) #else #define IS_EXT2_SB(sb) (0) #endif @@ -151,12 +152,12 @@ static struct file_system_type ext3_fs_type = { .name = "ext3", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, - .kill_sb = kill_block_super, + .kill_sb = ext4_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ext3"); MODULE_ALIAS("ext3"); -#define IS_EXT3_SB(sb) ((sb)->s_bdev->bd_holder == &ext3_fs_type) +#define IS_EXT3_SB(sb) ((sb)->s_type == &ext3_fs_type) static inline void __ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, @@ -1096,15 +1097,6 @@ void ext4_update_dynamic_rev(struct super_block *sb) */ } -static void ext4_bdev_mark_dead(struct block_device *bdev) -{ - ext4_force_shutdown(bdev->bd_holder, EXT4_GOING_FLAGS_NOLOGFLUSH); -} - -static const struct blk_holder_ops ext4_holder_ops = { - .mark_dead = ext4_bdev_mark_dead, -}; - /* * Open the external journal device */ @@ -1113,7 +1105,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb) struct block_device *bdev; bdev = blkdev_get_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE, sb, - &ext4_holder_ops); + &fs_holder_ops); if (IS_ERR(bdev)) goto fail; return bdev; @@ -1125,25 +1117,6 @@ fail: return NULL; } -/* - * Release the journal device - */ -static void ext4_blkdev_remove(struct ext4_sb_info *sbi) -{ - struct block_device *bdev; - bdev = sbi->s_journal_bdev; - if (bdev) { - /* - * Invalidate the journal device's buffers. We don't want them - * floating about in memory - the physical journal device may - * hotswapped, and it breaks the `ro-after' testing code. - */ - invalidate_bdev(bdev); - blkdev_put(bdev, sbi->s_sb); - sbi->s_journal_bdev = NULL; - } -} - static inline struct inode *orphan_list_entry(struct list_head *l) { return &list_entry(l, struct ext4_inode_info, i_orphan)->vfs_inode; @@ -1339,8 +1312,13 @@ static void ext4_put_super(struct super_block *sb) sync_blockdev(sb->s_bdev); invalidate_bdev(sb->s_bdev); if (sbi->s_journal_bdev) { + /* + * Invalidate the journal device's buffers. We don't want them + * floating about in memory - the physical journal device may + * hotswapped, and it breaks the `ro-after' testing code. + */ sync_blockdev(sbi->s_journal_bdev); - ext4_blkdev_remove(sbi); + invalidate_bdev(sbi->s_journal_bdev); } ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); @@ -5572,7 +5550,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) spin_lock_init(&sbi->s_bdev_wb_lock); errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err, &sbi->s_bdev_wb_err); - sb->s_bdev->bd_super = sb; EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; @@ -5664,9 +5641,11 @@ failed_mount: kfree(get_qf_name(sb, sbi, i)); #endif fscrypt_free_dummy_policy(&sbi->s_dummy_enc_policy); - /* ext4_blkdev_remove() calls kill_bdev(), release bh before it. */ brelse(sbi->s_sbh); - ext4_blkdev_remove(sbi); + if (sbi->s_journal_bdev) { + invalidate_bdev(sbi->s_journal_bdev); + blkdev_put(sbi->s_journal_bdev, sb); + } out_fail: invalidate_bdev(sb->s_bdev); sb->s_fs_info = NULL; @@ -5854,7 +5833,10 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, if (WARN_ON_ONCE(!ext4_has_feature_journal(sb))) return NULL; + /* see get_tree_bdev why this is needed and safe */ + up_write(&sb->s_umount); bdev = ext4_blkdev_get(j_dev, sb); + down_write(&sb->s_umount); if (bdev == NULL) return NULL; @@ -7103,7 +7085,7 @@ static int ext4_quota_off(struct super_block *sb, int type) } EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL); inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); err = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); out_unlock: @@ -7273,13 +7255,24 @@ static inline int ext3_feature_set_ok(struct super_block *sb) return 1; } +static void ext4_kill_sb(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct block_device *journal_bdev = sbi ? sbi->s_journal_bdev : NULL; + + kill_block_super(sb); + + if (journal_bdev) + blkdev_put(journal_bdev, sb); +} + static struct file_system_type ext4_fs_type = { .owner = THIS_MODULE, .name = "ext4", .init_fs_context = ext4_init_fs_context, .parameters = ext4_param_specs, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, + .kill_sb = ext4_kill_sb, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, }; MODULE_ALIAS_FS("ext4"); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 05151d61b00b..281e1bfbbe3e 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -356,13 +356,13 @@ ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size) static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode) { - return ((u64)ea_inode->i_ctime.tv_sec << 32) | + return ((u64) inode_get_ctime(ea_inode).tv_sec << 32) | (u32) inode_peek_iversion_raw(ea_inode); } static void ext4_xattr_inode_set_ref(struct inode *ea_inode, u64 ref_count) { - ea_inode->i_ctime.tv_sec = (u32)(ref_count >> 32); + inode_set_ctime(ea_inode, (u32)(ref_count >> 32), 0); inode_set_iversion_raw(ea_inode, ref_count & 0xffffffff); } @@ -2473,7 +2473,7 @@ retry_inode: } if (!error) { ext4_xattr_update_super_block(handle, inode->i_sb); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_iversion(inode); if (!value) no_expand = 0; diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 236d890f560b..0f7df9c11af3 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1045,7 +1045,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, struct address_space *mapping = cc->inode->i_mapping; struct page *page; sector_t last_block_in_bio; - unsigned fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT; + fgf_t fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT; pgoff_t start_idx = start_idx_of_cluster(cc); int i, ret; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index d635c58cf5a3..8aa29fe2e87b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -455,7 +455,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, de->file_type = fs_umode_to_ftype(inode->i_mode); set_page_dirty(page); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); f2fs_mark_inode_dirty_sync(dir, false); f2fs_put_page(page, 1); } @@ -609,7 +609,7 @@ void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode, f2fs_i_links_write(dir, true); clear_inode_flag(inode, FI_NEW_INODE); } - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); f2fs_mark_inode_dirty_sync(dir, false); if (F2FS_I(dir)->i_current_depth != current_depth) @@ -858,7 +858,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode) if (S_ISDIR(inode->i_mode)) f2fs_i_links_write(dir, false); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); f2fs_i_links_write(inode, false); if (S_ISDIR(inode->i_mode)) { @@ -919,7 +919,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, } f2fs_put_page(page, 1); - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); f2fs_mark_inode_dirty_sync(dir, false); if (inode) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c7cb2177b252..613132339d72 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2736,7 +2736,7 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, static inline struct page *f2fs_pagecache_get_page( struct address_space *mapping, pgoff_t index, - int fgp_flags, gfp_t gfp_mask) + fgf_t fgp_flags, gfp_t gfp_mask) { if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) return NULL; @@ -3303,9 +3303,11 @@ static inline void clear_file(struct inode *inode, int type) static inline bool f2fs_is_time_consistent(struct inode *inode) { + struct timespec64 ctime = inode_get_ctime(inode); + if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime)) return false; - if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime)) + if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &ctime)) return false; if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime)) return false; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 093039dee992..35886a52edfb 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -794,7 +794,7 @@ int f2fs_truncate(struct inode *inode) if (err) return err; - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); f2fs_mark_inode_dirty_sync(inode, false); return 0; } @@ -882,7 +882,7 @@ int f2fs_getattr(struct mnt_idmap *idmap, const struct path *path, STATX_ATTR_NODUMP | STATX_ATTR_VERITY); - generic_fillattr(idmap, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); /* we need to show initial sectors used for inline_data/dentries */ if ((S_ISREG(inode->i_mode) && f2fs_has_inline_data(inode)) || @@ -905,7 +905,7 @@ static void __setattr_copy(struct mnt_idmap *idmap, if (ia_valid & ATTR_MTIME) inode->i_mtime = attr->ia_mtime; if (ia_valid & ATTR_CTIME) - inode->i_ctime = attr->ia_ctime; + inode_set_ctime_to_ts(inode, attr->ia_ctime); if (ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); @@ -1008,7 +1008,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, return err; spin_lock(&F2FS_I(inode)->i_size_lock); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); F2FS_I(inode)->last_disk_size = i_size_read(inode); spin_unlock(&F2FS_I(inode)->i_size_lock); } @@ -1835,7 +1835,7 @@ static long f2fs_fallocate(struct file *file, int mode, } if (!ret) { - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); f2fs_mark_inode_dirty_sync(inode, false); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } @@ -1937,7 +1937,7 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) else clear_inode_flag(inode, FI_PROJ_INHERIT); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); f2fs_set_inode_flags(inode); f2fs_mark_inode_dirty_sync(inode, true); return 0; @@ -2874,10 +2874,10 @@ out_src: if (ret) goto out_unlock; - src->i_mtime = src->i_ctime = current_time(src); + src->i_mtime = inode_set_ctime_current(src); f2fs_mark_inode_dirty_sync(src, false); if (src != dst) { - dst->i_mtime = dst->i_ctime = current_time(dst); + dst->i_mtime = inode_set_ctime_current(dst); f2fs_mark_inode_dirty_sync(dst, false); } f2fs_update_time(sbi, REQ_TIME); @@ -3073,7 +3073,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) goto out_unlock; fi->i_projid = kprojid; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); f2fs_mark_inode_dirty_sync(inode, true); out_unlock: f2fs_unlock_op(sbi); @@ -3511,7 +3511,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) } set_inode_flag(inode, FI_COMPRESS_RELEASED); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); f2fs_mark_inode_dirty_sync(inode, true); f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); @@ -3710,7 +3710,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) if (ret >= 0) { clear_inode_flag(inode, FI_COMPRESS_RELEASED); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); f2fs_mark_inode_dirty_sync(inode, true); } unlock_inode: diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 01effd3fcb6c..a1ca394bc327 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -2181,12 +2181,14 @@ out_drop_write: if (err) return err; - err = freeze_super(sbi->sb); + err = freeze_super(sbi->sb, FREEZE_HOLDER_USERSPACE); if (err) return err; if (f2fs_readonly(sbi->sb)) { - thaw_super(sbi->sb); + err = thaw_super(sbi->sb, FREEZE_HOLDER_USERSPACE); + if (err) + return err; return -EROFS; } @@ -2240,6 +2242,6 @@ recover_out: out_err: f2fs_up_write(&sbi->cp_global_sem); f2fs_up_write(&sbi->gc_lock); - thaw_super(sbi->sb); + thaw_super(sbi->sb, FREEZE_HOLDER_USERSPACE); return err; } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 4638fee16a91..88fc9208ffa7 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -698,7 +698,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, set_page_dirty(page); f2fs_put_page(page, 1); - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); f2fs_mark_inode_dirty_sync(dir, false); if (inode) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 09e986b050c6..c1c2ba9f28e5 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -403,7 +403,7 @@ static void init_idisk_time(struct inode *inode) struct f2fs_inode_info *fi = F2FS_I(inode); fi->i_disk_time[0] = inode->i_atime; - fi->i_disk_time[1] = inode->i_ctime; + fi->i_disk_time[1] = inode_get_ctime(inode); fi->i_disk_time[2] = inode->i_mtime; } @@ -434,10 +434,10 @@ static int do_read_inode(struct inode *inode) inode->i_blocks = SECTOR_FROM_BLOCK(le64_to_cpu(ri->i_blocks) - 1); inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime); - inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime); + inode_set_ctime(inode, le64_to_cpu(ri->i_ctime), + le32_to_cpu(ri->i_ctime_nsec)); inode->i_mtime.tv_sec = le64_to_cpu(ri->i_mtime); inode->i_atime.tv_nsec = le32_to_cpu(ri->i_atime_nsec); - inode->i_ctime.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); inode->i_generation = le32_to_cpu(ri->i_generation); if (S_ISDIR(inode->i_mode)) @@ -714,10 +714,10 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) set_raw_inline(inode, ri); ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); - ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + ri->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec); ri->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); ri->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); - ri->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ri->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); ri->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); if (S_ISDIR(inode->i_mode)) ri->i_current_depth = diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index bee0568888da..193b22a2d6bf 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -243,7 +243,7 @@ static struct inode *f2fs_new_inode(struct mnt_idmap *idmap, inode->i_ino = ino; inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); F2FS_I(inode)->i_crtime = inode->i_mtime; inode->i_generation = get_random_u32(); @@ -420,7 +420,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, f2fs_balance_fs(sbi, true); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); ihold(inode); set_inode_flag(inode, FI_INC_LINK); @@ -1052,7 +1052,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, f2fs_set_link(new_dir, new_entry, new_page, old_inode); new_page = NULL; - new_inode->i_ctime = current_time(new_inode); + inode_set_ctime_current(new_inode); f2fs_down_write(&F2FS_I(new_inode)->i_sem); if (old_dir_entry) f2fs_i_links_write(new_inode, false); @@ -1086,7 +1086,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, f2fs_i_pino_write(old_inode, new_dir->i_ino); f2fs_up_write(&F2FS_I(old_inode)->i_sem); - old_inode->i_ctime = current_time(old_inode); + inode_set_ctime_current(old_inode); f2fs_mark_inode_dirty_sync(old_inode, false); f2fs_delete_entry(old_entry, old_page, old_dir, NULL); @@ -1251,7 +1251,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_pino_write(old_inode, new_dir->i_ino); f2fs_up_write(&F2FS_I(old_inode)->i_sem); - old_dir->i_ctime = current_time(old_dir); + inode_set_ctime_current(old_dir); if (old_nlink) { f2fs_down_write(&F2FS_I(old_dir)->i_sem); f2fs_i_links_write(old_dir, old_nlink > 0); @@ -1270,7 +1270,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_pino_write(new_inode, old_dir->i_ino); f2fs_up_write(&F2FS_I(new_inode)->i_sem); - new_dir->i_ctime = current_time(new_dir); + inode_set_ctime_current(new_dir); if (new_nlink) { f2fs_down_write(&F2FS_I(new_dir)->i_sem); f2fs_i_links_write(new_dir, new_nlink > 0); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 4e7d4ceeb084..b8637e88d94f 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -321,10 +321,10 @@ static int recover_inode(struct inode *inode, struct page *page) f2fs_i_size_write(inode, le64_to_cpu(raw->i_size)); inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime); - inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime); + inode_set_ctime(inode, le64_to_cpu(raw->i_ctime), + le32_to_cpu(raw->i_ctime_nsec)); inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime); inode->i_atime.tv_nsec = le32_to_cpu(raw->i_atime_nsec); - inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); F2FS_I(inode)->i_advise = raw->i_advise; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ca31163da00a..aa1f9a3a8037 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1561,7 +1561,7 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) int i; for (i = 0; i < sbi->s_ndevs; i++) { - blkdev_put(FDEV(i).bdev, sbi->sb->s_type); + blkdev_put(FDEV(i).bdev, sbi->sb); #ifdef CONFIG_BLK_DEV_ZONED kvfree(FDEV(i).blkz_seq); #endif @@ -2703,7 +2703,7 @@ retry: if (len == towrite) return err; - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); f2fs_mark_inode_dirty_sync(inode, false); return len - towrite; } @@ -4198,7 +4198,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) /* Single zoned block device mount */ FDEV(0).bdev = blkdev_get_by_dev(sbi->sb->s_bdev->bd_dev, mode, - sbi->sb->s_type, NULL); + sbi->sb, NULL); } else { /* Multi-device mount */ memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); @@ -4217,8 +4217,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) sbi->log_blocks_per_seg) - 1; } FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, mode, - sbi->sb->s_type, - NULL); + sbi->sb, NULL); } if (IS_ERR(FDEV(i).bdev)) return PTR_ERR(FDEV(i).bdev); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 476b186b90a6..4ae93e1df421 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -764,7 +764,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, same: if (is_inode_flag_set(inode, FI_ACL_MODE)) { inode->i_mode = F2FS_I(inode)->i_acl_mode; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); clear_inode_flag(inode, FI_ACL_MODE); } diff --git a/fs/fat/fat.h b/fs/fat/fat.h index e3b690b48e3e..66cf4778cf3b 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -460,8 +460,7 @@ extern struct timespec64 fat_truncate_mtime(const struct msdos_sb_info *sbi, const struct timespec64 *ts); extern int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags); -extern int fat_update_time(struct inode *inode, struct timespec64 *now, - int flags); +extern int fat_update_time(struct inode *inode, int flags); extern int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs); int fat_cache_init(void); diff --git a/fs/fat/file.c b/fs/fat/file.c index 456477946dd9..e887e9ab7472 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -401,7 +401,7 @@ int fat_getattr(struct mnt_idmap *idmap, const struct path *path, struct inode *inode = d_inode(path->dentry); struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); - generic_fillattr(idmap, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); stat->blksize = sbi->cluster_size; if (sbi->options.nfs == FAT_NFS_NOSTALE_RO) { diff --git a/fs/fat/inode.c b/fs/fat/inode.c index d99b8549ec8f..cdd39b6020f3 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -562,7 +562,7 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) & ~((loff_t)sbi->cluster_size - 1)) >> 9; fat_time_fat2unix(sbi, &inode->i_mtime, de->time, de->date, 0); - inode->i_ctime = inode->i_mtime; + inode_set_ctime_to_ts(inode, inode->i_mtime); if (sbi->options.isvfat) { fat_time_fat2unix(sbi, &inode->i_atime, 0, de->adate, 0); fat_time_fat2unix(sbi, &MSDOS_I(inode)->i_crtime, de->ctime, @@ -1407,8 +1407,7 @@ static int fat_read_root(struct inode *inode) MSDOS_I(inode)->mmu_private = inode->i_size; fat_save_attrs(inode, ATTR_DIR); - inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = 0; - inode->i_mtime.tv_nsec = inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = 0; + inode->i_mtime = inode->i_atime = inode_set_ctime(inode, 0, 0); set_nlink(inode, fat_subdirs(inode)+2); return 0; diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 7e5d6ae305f2..f2304a1054aa 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -332,13 +332,14 @@ int fat_truncate_time(struct inode *inode, struct timespec64 *now, int flags) * but ctime updates are ignored. */ if (flags & S_MTIME) - inode->i_mtime = inode->i_ctime = fat_truncate_mtime(sbi, now); + inode->i_mtime = inode_set_ctime_to_ts(inode, + fat_truncate_mtime(sbi, now)); return 0; } EXPORT_SYMBOL_GPL(fat_truncate_time); -int fat_update_time(struct inode *inode, struct timespec64 *now, int flags) +int fat_update_time(struct inode *inode, int flags) { int dirty_flags = 0; @@ -346,16 +347,13 @@ int fat_update_time(struct inode *inode, struct timespec64 *now, int flags) return 0; if (flags & (S_ATIME | S_CTIME | S_MTIME)) { - fat_truncate_time(inode, now, flags); + fat_truncate_time(inode, NULL, flags); if (inode->i_sb->s_flags & SB_LAZYTIME) dirty_flags |= I_DIRTY_TIME; else dirty_flags |= I_DIRTY_SYNC; } - if ((flags & S_VERSION) && inode_maybe_inc_iversion(inode, false)) - dirty_flags |= I_DIRTY_SYNC; - __mark_inode_dirty(inode, dirty_flags); return 0; } diff --git a/fs/fcntl.c b/fs/fcntl.c index b622be119706..e871009f6c88 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -34,7 +34,7 @@ #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) -static int setfl(int fd, struct file * filp, unsigned long arg) +static int setfl(int fd, struct file * filp, unsigned int arg) { struct inode * inode = file_inode(filp); int error = 0; @@ -112,11 +112,11 @@ void __f_setown(struct file *filp, struct pid *pid, enum pid_type type, } EXPORT_SYMBOL(__f_setown); -int f_setown(struct file *filp, unsigned long arg, int force) +int f_setown(struct file *filp, int who, int force) { enum pid_type type; struct pid *pid = NULL; - int who = arg, ret = 0; + int ret = 0; type = PIDTYPE_TGID; if (who < 0) { @@ -317,28 +317,29 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, struct file *filp) { void __user *argp = (void __user *)arg; + int argi = (int)arg; struct flock flock; long err = -EINVAL; switch (cmd) { case F_DUPFD: - err = f_dupfd(arg, filp, 0); + err = f_dupfd(argi, filp, 0); break; case F_DUPFD_CLOEXEC: - err = f_dupfd(arg, filp, O_CLOEXEC); + err = f_dupfd(argi, filp, O_CLOEXEC); break; case F_GETFD: err = get_close_on_exec(fd) ? FD_CLOEXEC : 0; break; case F_SETFD: err = 0; - set_close_on_exec(fd, arg & FD_CLOEXEC); + set_close_on_exec(fd, argi & FD_CLOEXEC); break; case F_GETFL: err = filp->f_flags; break; case F_SETFL: - err = setfl(fd, filp, arg); + err = setfl(fd, filp, argi); break; #if BITS_PER_LONG != 32 /* 32-bit arches must use fcntl64() */ @@ -375,7 +376,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, force_successful_syscall_return(); break; case F_SETOWN: - err = f_setown(filp, arg, 1); + err = f_setown(filp, argi, 1); break; case F_GETOWN_EX: err = f_getown_ex(filp, arg); @@ -391,28 +392,28 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, break; case F_SETSIG: /* arg == 0 restores default behaviour. */ - if (!valid_signal(arg)) { + if (!valid_signal(argi)) { break; } err = 0; - filp->f_owner.signum = arg; + filp->f_owner.signum = argi; break; case F_GETLEASE: err = fcntl_getlease(filp); break; case F_SETLEASE: - err = fcntl_setlease(fd, filp, arg); + err = fcntl_setlease(fd, filp, argi); break; case F_NOTIFY: - err = fcntl_dirnotify(fd, filp, arg); + err = fcntl_dirnotify(fd, filp, argi); break; case F_SETPIPE_SZ: case F_GETPIPE_SZ: - err = pipe_fcntl(filp, cmd, arg); + err = pipe_fcntl(filp, cmd, argi); break; case F_ADD_SEALS: case F_GET_SEALS: - err = memfd_fcntl(filp, cmd, arg); + err = memfd_fcntl(filp, cmd, argi); break; case F_GET_RW_HINT: case F_SET_RW_HINT: diff --git a/fs/file.c b/fs/file.c index 3fd003a8604f..3e4a4dfa38fc 100644 --- a/fs/file.c +++ b/fs/file.c @@ -668,7 +668,7 @@ EXPORT_SYMBOL(close_fd); /* for ksys_close() */ /** * last_fd - return last valid index into fd table - * @cur_fds: files struct + * @fdt: File descriptor table. * * Context: Either rcu read lock or files_lock must be held. * @@ -693,29 +693,30 @@ static inline void __range_cloexec(struct files_struct *cur_fds, spin_unlock(&cur_fds->file_lock); } -static inline void __range_close(struct files_struct *cur_fds, unsigned int fd, +static inline void __range_close(struct files_struct *files, unsigned int fd, unsigned int max_fd) { + struct file *file; unsigned n; - rcu_read_lock(); - n = last_fd(files_fdtable(cur_fds)); - rcu_read_unlock(); + spin_lock(&files->file_lock); + n = last_fd(files_fdtable(files)); max_fd = min(max_fd, n); - while (fd <= max_fd) { - struct file *file; - - spin_lock(&cur_fds->file_lock); - file = pick_file(cur_fds, fd++); - spin_unlock(&cur_fds->file_lock); - + for (; fd <= max_fd; fd++) { + file = pick_file(files, fd); if (file) { - /* found a valid file to close */ - filp_close(file, cur_fds); + spin_unlock(&files->file_lock); + filp_close(file, files); cond_resched(); + spin_lock(&files->file_lock); + } else if (need_resched()) { + spin_unlock(&files->file_lock); + cond_resched(); + spin_lock(&files->file_lock); } } + spin_unlock(&files->file_lock); } /** @@ -723,6 +724,7 @@ static inline void __range_close(struct files_struct *cur_fds, unsigned int fd, * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close + * @flags: CLOSE_RANGE flags. * * This closes a range of file descriptors. All file descriptors * from @fd up to and including @max_fd are closed. diff --git a/fs/file_table.c b/fs/file_table.c index fc7d677ff5ad..ee21b3da9d08 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -461,11 +461,8 @@ void fput(struct file *file) */ void __fput_sync(struct file *file) { - if (atomic_long_dec_and_test(&file->f_count)) { - struct task_struct *task = current; - BUG_ON(!(task->flags & PF_KTHREAD)); + if (atomic_long_dec_and_test(&file->f_count)) __fput(file); - } } EXPORT_SYMBOL(fput); diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index ceb6a12649ba..ac5d43b164b5 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -110,10 +110,9 @@ static inline void dip2vip_cpy(struct vxfs_sb_info *sbi, inode->i_size = vip->vii_size; inode->i_atime.tv_sec = vip->vii_atime; - inode->i_ctime.tv_sec = vip->vii_ctime; + inode_set_ctime(inode, vip->vii_ctime, 0); inode->i_mtime.tv_sec = vip->vii_mtime; inode->i_atime.tv_nsec = 0; - inode->i_ctime.tv_nsec = 0; inode->i_mtime.tv_nsec = 0; inode->i_blocks = vip->vii_blocks; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index aca4b4811394..969ce991b0b0 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1953,9 +1953,9 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb, struct inode *inode = wb_inode(wb->b_io.prev); struct super_block *sb = inode->i_sb; - if (!trylock_super(sb)) { + if (!super_trylock_shared(sb)) { /* - * trylock_super() may fail consistently due to + * super_trylock_shared() may fail consistently due to * s_umount being grabbed by someone else. Don't use * requeue_io() to avoid busy retrying the inode/sb. */ diff --git a/fs/fs_context.c b/fs/fs_context.c index 851214d1d013..a0ad7a0c4680 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -162,6 +162,10 @@ EXPORT_SYMBOL(vfs_parse_fs_param); /** * vfs_parse_fs_string - Convenience function to just parse a string. + * @fc: Filesystem context. + * @key: Parameter name. + * @value: Default value. + * @v_size: Maximum number of bytes in the value. */ int vfs_parse_fs_string(struct fs_context *fc, const char *key, const char *value, size_t v_size) @@ -189,7 +193,7 @@ EXPORT_SYMBOL(vfs_parse_fs_string); /** * generic_parse_monolithic - Parse key[=val][,key[=val]]* mount data - * @ctx: The superblock configuration to fill in. + * @fc: The superblock configuration to fill in. * @data: The data to parse * * Parse a blob of data that's in key[=val][,key[=val]]* form. This can be @@ -315,10 +319,31 @@ struct fs_context *fs_context_for_reconfigure(struct dentry *dentry, } EXPORT_SYMBOL(fs_context_for_reconfigure); +/** + * fs_context_for_submount: allocate a new fs_context for a submount + * @type: file_system_type of the new context + * @reference: reference dentry from which to copy relevant info + * + * Allocate a new fs_context suitable for a submount. This also ensures that + * the fc->security object is inherited from @reference (if needed). + */ struct fs_context *fs_context_for_submount(struct file_system_type *type, struct dentry *reference) { - return alloc_fs_context(type, reference, 0, 0, FS_CONTEXT_FOR_SUBMOUNT); + struct fs_context *fc; + int ret; + + fc = alloc_fs_context(type, reference, 0, 0, FS_CONTEXT_FOR_SUBMOUNT); + if (IS_ERR(fc)) + return fc; + + ret = security_fs_context_submount(fc, reference->d_sb); + if (ret) { + put_fs_context(fc); + return ERR_PTR(ret); + } + + return fc; } EXPORT_SYMBOL(fs_context_for_submount); @@ -333,7 +358,7 @@ void fc_drop_locked(struct fs_context *fc) static void legacy_fs_context_free(struct fs_context *fc); /** - * vfs_dup_fc_config: Duplicate a filesystem context. + * vfs_dup_fs_context - Duplicate a filesystem context. * @src_fc: The context to copy. */ struct fs_context *vfs_dup_fs_context(struct fs_context *src_fc) @@ -379,7 +404,9 @@ EXPORT_SYMBOL(vfs_dup_fs_context); /** * logfc - Log a message to a filesystem context - * @fc: The filesystem context to log to. + * @log: The filesystem context to log to, or NULL to use printk. + * @prefix: A string to prefix the output with, or NULL. + * @level: 'w' for a warning, 'e' for an error. Anything else is a notice. * @fmt: The format of the buffer. */ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, ...) @@ -692,6 +719,7 @@ void vfs_clean_context(struct fs_context *fc) security_free_mnt_opts(&fc->security); kfree(fc->source); fc->source = NULL; + fc->exclusive = false; fc->purpose = FS_CONTEXT_FOR_RECONFIGURE; fc->phase = FS_CONTEXT_AWAITING_RECONF; diff --git a/fs/fsopen.c b/fs/fsopen.c index fc9d2d9fd234..ce03f6521c88 100644 --- a/fs/fsopen.c +++ b/fs/fsopen.c @@ -209,6 +209,72 @@ err: return ret; } +static int vfs_cmd_create(struct fs_context *fc, bool exclusive) +{ + struct super_block *sb; + int ret; + + if (fc->phase != FS_CONTEXT_CREATE_PARAMS) + return -EBUSY; + + if (!mount_capable(fc)) + return -EPERM; + + /* require the new mount api */ + if (exclusive && fc->ops == &legacy_fs_context_ops) + return -EOPNOTSUPP; + + fc->phase = FS_CONTEXT_CREATING; + fc->exclusive = exclusive; + + ret = vfs_get_tree(fc); + if (ret) { + fc->phase = FS_CONTEXT_FAILED; + return ret; + } + + sb = fc->root->d_sb; + ret = security_sb_kern_mount(sb); + if (unlikely(ret)) { + fc_drop_locked(fc); + fc->phase = FS_CONTEXT_FAILED; + return ret; + } + + /* vfs_get_tree() callchains will have grabbed @s_umount */ + up_write(&sb->s_umount); + fc->phase = FS_CONTEXT_AWAITING_MOUNT; + return 0; +} + +static int vfs_cmd_reconfigure(struct fs_context *fc) +{ + struct super_block *sb; + int ret; + + if (fc->phase != FS_CONTEXT_RECONF_PARAMS) + return -EBUSY; + + fc->phase = FS_CONTEXT_RECONFIGURING; + + sb = fc->root->d_sb; + if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { + fc->phase = FS_CONTEXT_FAILED; + return -EPERM; + } + + down_write(&sb->s_umount); + ret = reconfigure_super(fc); + up_write(&sb->s_umount); + if (ret) { + fc->phase = FS_CONTEXT_FAILED; + return ret; + } + + vfs_clean_context(fc); + return 0; +} + /* * Check the state and apply the configuration. Note that this function is * allowed to 'steal' the value by setting param->xxx to NULL before returning. @@ -216,7 +282,6 @@ err: static int vfs_fsconfig_locked(struct fs_context *fc, int cmd, struct fs_parameter *param) { - struct super_block *sb; int ret; ret = finish_clean_context(fc); @@ -224,39 +289,11 @@ static int vfs_fsconfig_locked(struct fs_context *fc, int cmd, return ret; switch (cmd) { case FSCONFIG_CMD_CREATE: - if (fc->phase != FS_CONTEXT_CREATE_PARAMS) - return -EBUSY; - if (!mount_capable(fc)) - return -EPERM; - fc->phase = FS_CONTEXT_CREATING; - ret = vfs_get_tree(fc); - if (ret) - break; - sb = fc->root->d_sb; - ret = security_sb_kern_mount(sb); - if (unlikely(ret)) { - fc_drop_locked(fc); - break; - } - up_write(&sb->s_umount); - fc->phase = FS_CONTEXT_AWAITING_MOUNT; - return 0; + return vfs_cmd_create(fc, false); + case FSCONFIG_CMD_CREATE_EXCL: + return vfs_cmd_create(fc, true); case FSCONFIG_CMD_RECONFIGURE: - if (fc->phase != FS_CONTEXT_RECONF_PARAMS) - return -EBUSY; - fc->phase = FS_CONTEXT_RECONFIGURING; - sb = fc->root->d_sb; - if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { - ret = -EPERM; - break; - } - down_write(&sb->s_umount); - ret = reconfigure_super(fc); - up_write(&sb->s_umount); - if (ret) - break; - vfs_clean_context(fc); - return 0; + return vfs_cmd_reconfigure(fc); default: if (fc->phase != FS_CONTEXT_CREATE_PARAMS && fc->phase != FS_CONTEXT_RECONF_PARAMS) @@ -264,8 +301,6 @@ static int vfs_fsconfig_locked(struct fs_context *fc, int cmd, return vfs_parse_fs_param(fc, param); } - fc->phase = FS_CONTEXT_FAILED; - return ret; } /** @@ -353,6 +388,7 @@ SYSCALL_DEFINE5(fsconfig, return -EINVAL; break; case FSCONFIG_CMD_CREATE: + case FSCONFIG_CMD_CREATE_EXCL: case FSCONFIG_CMD_RECONFIGURE: if (_key || _value || aux) return -EINVAL; diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 247ef4f76761..ab62e4624256 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -235,7 +235,7 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, inode->i_mode = mode; inode->i_uid = fc->user_id; inode->i_gid = fc->group_id; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); /* setting ->i_op to NULL is not allowed */ if (iop) inode->i_op = iop; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index f67bef9d83c4..881524b9a55a 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -933,7 +933,7 @@ void fuse_flush_time_update(struct inode *inode) static void fuse_update_ctime_in_cache(struct inode *inode) { if (!IS_NOCMTIME(inode)) { - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty_sync(inode); fuse_flush_time_update(inode); } @@ -1222,7 +1222,7 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, forget_all_cached_acls(inode); err = fuse_do_getattr(inode, stat, file); } else if (stat) { - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->mode = fi->orig_i_mode; stat->ino = fi->orig_ino; } @@ -1715,8 +1715,8 @@ int fuse_flush_times(struct inode *inode, struct fuse_file *ff) inarg.mtimensec = inode->i_mtime.tv_nsec; if (fm->fc->minor >= 23) { inarg.valid |= FATTR_CTIME; - inarg.ctime = inode->i_ctime.tv_sec; - inarg.ctimensec = inode->i_ctime.tv_nsec; + inarg.ctime = inode_get_ctime(inode).tv_sec; + inarg.ctimensec = inode_get_ctime(inode).tv_nsec; } if (ff) { inarg.valid |= FATTR_FH; @@ -1857,7 +1857,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, if (attr->ia_valid & ATTR_MTIME) inode->i_mtime = attr->ia_mtime; if (attr->ia_valid & ATTR_CTIME) - inode->i_ctime = attr->ia_ctime; + inode_set_ctime_to_ts(inode, attr->ia_ctime); /* FIXME: clear I_DIRTY_SYNC? */ } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index f19d748890f0..549358ffea8b 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -194,8 +194,7 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_mtime.tv_nsec = attr->mtimensec; } if (!(cache_mask & STATX_CTIME)) { - inode->i_ctime.tv_sec = attr->ctime; - inode->i_ctime.tv_nsec = attr->ctimensec; + inode_set_ctime(inode, attr->ctime, attr->ctimensec); } if (attr->blksize != 0) @@ -259,8 +258,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, attr->mtimensec = inode->i_mtime.tv_nsec; } if (cache_mask & STATX_CTIME) { - attr->ctime = inode->i_ctime.tv_sec; - attr->ctimensec = inode->i_ctime.tv_nsec; + attr->ctime = inode_get_ctime(inode).tv_sec; + attr->ctimensec = inode_get_ctime(inode).tv_nsec; } if ((attr_version != 0 && fi->attr_version > attr_version) || @@ -318,8 +317,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr, inode->i_size = attr->size; inode->i_mtime.tv_sec = attr->mtime; inode->i_mtime.tv_nsec = attr->mtimensec; - inode->i_ctime.tv_sec = attr->ctime; - inode->i_ctime.tv_nsec = attr->ctimensec; + inode_set_ctime(inode, attr->ctime, attr->ctimensec); if (S_ISREG(inode->i_mode)) { fuse_init_common(inode); fuse_init_file_inode(inode, attr->flags); @@ -1401,16 +1399,18 @@ EXPORT_SYMBOL_GPL(fuse_dev_free); static void fuse_fill_attr_from_inode(struct fuse_attr *attr, const struct fuse_inode *fi) { + struct timespec64 ctime = inode_get_ctime(&fi->inode); + *attr = (struct fuse_attr){ .ino = fi->inode.i_ino, .size = fi->inode.i_size, .blocks = fi->inode.i_blocks, .atime = fi->inode.i_atime.tv_sec, .mtime = fi->inode.i_mtime.tv_sec, - .ctime = fi->inode.i_ctime.tv_sec, + .ctime = ctime.tv_sec, .atimensec = fi->inode.i_atime.tv_nsec, .mtimensec = fi->inode.i_mtime.tv_nsec, - .ctimensec = fi->inode.i_ctime.tv_nsec, + .ctimensec = ctime.tv_nsec, .mode = fi->inode.i_mode, .nlink = fi->inode.i_nlink, .uid = fi->inode.i_uid.val, diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index a392aa0f041d..443640e6fb9c 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -142,7 +142,7 @@ int gfs2_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, ret = __gfs2_set_acl(inode, acl, type); if (!ret && mode != inode->i_mode) { - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode->i_mode = mode; mark_inode_dirty(inode); } diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index ae49256b7c8c..9c4b26aec580 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -747,7 +747,7 @@ static const struct address_space_operations gfs2_aops = { .writepages = gfs2_writepages, .read_folio = gfs2_read_folio, .readahead = gfs2_readahead, - .dirty_folio = filemap_dirty_folio, + .dirty_folio = iomap_dirty_folio, .release_folio = iomap_release_folio, .invalidate_folio = iomap_invalidate_folio, .bmap = gfs2_bmap, diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 8d611fbcf0bd..f62366be7587 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -971,7 +971,7 @@ gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len) if (status) return ERR_PTR(status); - folio = iomap_get_folio(iter, pos); + folio = iomap_get_folio(iter, pos, len); if (IS_ERR(folio)) gfs2_trans_end(sdp); return folio; @@ -1386,7 +1386,7 @@ static int trunc_start(struct inode *inode, u64 newsize) ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG; i_size_write(inode, newsize); - ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); + ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode); gfs2_dinode_out(ip, dibh->b_data); if (journaled) @@ -1583,8 +1583,7 @@ out_unlock: /* Every transaction boundary, we rewrite the dinode to keep its di_blocks current in case of failure. */ - ip->i_inode.i_mtime = ip->i_inode.i_ctime = - current_time(&ip->i_inode); + ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -1950,7 +1949,7 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length) gfs2_statfs_change(sdp, 0, +btotal, 0); gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid, ip->i_inode.i_gid); - ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); + ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); up_write(&ip->i_rw_mutex); @@ -1993,7 +1992,7 @@ static int trunc_end(struct gfs2_inode *ip) gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); gfs2_ordered_del_inode(ip); } - ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); + ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode); ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG; gfs2_trans_add_meta(ip->i_gl, dibh); @@ -2094,7 +2093,7 @@ static int do_grow(struct inode *inode, u64 size) goto do_end_trans; truncate_setsize(inode, size); - ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); + ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 54a6d17b8c25..1a2afa88f8be 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -130,7 +130,7 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf, memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); if (ip->i_inode.i_size < offset + size) i_size_write(&ip->i_inode, offset + size); - ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); + ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode); gfs2_dinode_out(ip, dibh->b_data); brelse(dibh); @@ -227,7 +227,7 @@ out: if (ip->i_inode.i_size < offset + copied) i_size_write(&ip->i_inode, offset + copied); - ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); + ip->i_inode.i_mtime = inode_set_ctime_current(&ip->i_inode); gfs2_trans_add_meta(ip->i_gl, dibh); gfs2_dinode_out(ip, dibh->b_data); @@ -1814,7 +1814,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, gfs2_inum_out(nip, dent); dent->de_type = cpu_to_be16(IF2DT(nip->i_inode.i_mode)); dent->de_rahead = cpu_to_be16(gfs2_inode_ra_len(nip)); - tv = current_time(&ip->i_inode); + tv = inode_set_ctime_current(&ip->i_inode); if (ip->i_diskflags & GFS2_DIF_EXHASH) { leaf = (struct gfs2_leaf *)bh->b_data; be16_add_cpu(&leaf->lf_entries, 1); @@ -1825,7 +1825,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, da->bh = NULL; brelse(bh); ip->i_entries++; - ip->i_inode.i_mtime = ip->i_inode.i_ctime = tv; + ip->i_inode.i_mtime = tv; if (S_ISDIR(nip->i_inode.i_mode)) inc_nlink(&ip->i_inode); mark_inode_dirty(inode); @@ -1876,7 +1876,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) const struct qstr *name = &dentry->d_name; struct gfs2_dirent *dent, *prev = NULL; struct buffer_head *bh; - struct timespec64 tv = current_time(&dip->i_inode); + struct timespec64 tv; /* Returns _either_ the entry (if its first in block) or the previous entry otherwise */ @@ -1896,6 +1896,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) } dirent_del(dip, bh, prev, dent); + tv = inode_set_ctime_current(&dip->i_inode); if (dip->i_diskflags & GFS2_DIF_EXHASH) { struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data; u16 entries = be16_to_cpu(leaf->lf_entries); @@ -1910,7 +1911,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct dentry *dentry) if (!dip->i_entries) gfs2_consist_inode(dip); dip->i_entries--; - dip->i_inode.i_mtime = dip->i_inode.i_ctime = tv; + dip->i_inode.i_mtime = tv; if (d_is_dir(dentry)) drop_nlink(&dip->i_inode); mark_inode_dirty(&dip->i_inode); @@ -1951,7 +1952,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, dent->de_type = cpu_to_be16(new_type); brelse(bh); - dip->i_inode.i_mtime = dip->i_inode.i_ctime = current_time(&dip->i_inode); + dip->i_inode.i_mtime = inode_set_ctime_current(&dip->i_inode); mark_inode_dirty_sync(&dip->i_inode); return 0; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index b43fa8b8fc05..766186c80682 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -260,7 +260,7 @@ static int do_gfs2_set_flags(struct inode *inode, u32 reqflags, u32 mask) error = gfs2_meta_inode_buffer(ip, &bh); if (error) goto out_trans_end; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); gfs2_trans_add_meta(ip->i_gl, bh); ip->i_diskflags = new_flags; gfs2_dinode_out(ip, bh->b_data); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 54319328b16b..aecdac3cfbe1 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -437,8 +437,8 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) inode->i_atime = atime; inode->i_mtime.tv_sec = be64_to_cpu(str->di_mtime); inode->i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec); - inode->i_ctime.tv_sec = be64_to_cpu(str->di_ctime); - inode->i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec); + inode_set_ctime(inode, be64_to_cpu(str->di_ctime), + be32_to_cpu(str->di_ctime_nsec)); ip->i_goal = be64_to_cpu(str->di_goal_meta); ip->i_generation = be64_to_cpu(str->di_generation); diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 17c994a0c0d0..a21ac41d6669 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -690,7 +690,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, set_nlink(inode, S_ISDIR(mode) ? 2 : 1); inode->i_rdev = dev; inode->i_size = size; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); munge_mode_uid_gid(dip, inode); check_and_update_goal(dip); ip->i_goal = dip->i_goal; @@ -1029,7 +1029,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, gfs2_trans_add_meta(ip->i_gl, dibh); inc_nlink(&ip->i_inode); - ip->i_inode.i_ctime = current_time(&ip->i_inode); + inode_set_ctime_current(&ip->i_inode); ihold(inode); d_instantiate(dentry, inode); mark_inode_dirty(inode); @@ -1114,7 +1114,7 @@ static int gfs2_unlink_inode(struct gfs2_inode *dip, return error; ip->i_entries = 0; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); if (S_ISDIR(inode->i_mode)) clear_nlink(inode); else @@ -1371,7 +1371,7 @@ static int update_moved_ino(struct gfs2_inode *ip, struct gfs2_inode *ndip, if (dir_rename) return gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR); - ip->i_inode.i_ctime = current_time(&ip->i_inode); + inode_set_ctime_current(&ip->i_inode); mark_inode_dirty_sync(&ip->i_inode); return 0; } @@ -2071,7 +2071,7 @@ static int gfs2_getattr(struct mnt_idmap *idmap, STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (gfs2_holder_initialized(&gh)) gfs2_glock_dq_uninit(&gh); @@ -2139,8 +2139,7 @@ loff_t gfs2_seek_hole(struct file *file, loff_t offset) return vfs_setpos(file, ret, inode->i_sb->s_maxbytes); } -static int gfs2_update_time(struct inode *inode, struct timespec64 *time, - int flags) +static int gfs2_update_time(struct inode *inode, int flags) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_glock *gl = ip->i_gl; @@ -2155,7 +2154,8 @@ static int gfs2_update_time(struct inode *inode, struct timespec64 *time, if (error) return error; } - return generic_update_time(inode, time, flags); + generic_update_time(inode, flags); + return 0; } static const struct inode_operations gfs2_file_iops = { diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 704192b73605..aa5fd06d47bc 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -871,7 +871,7 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, size = loc + sizeof(struct gfs2_quota); if (size > inode->i_size) i_size_write(inode, size); - inode->i_mtime = inode->i_atime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); mark_inode_dirty(inode); set_bit(QDF_REFRESH, &qd->qd_flags); } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 9f4d5d6549ee..2f701335e8ee 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -412,7 +412,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(inode)); str->di_atime = cpu_to_be64(inode->i_atime.tv_sec); str->di_mtime = cpu_to_be64(inode->i_mtime.tv_sec); - str->di_ctime = cpu_to_be64(inode->i_ctime.tv_sec); + str->di_ctime = cpu_to_be64(inode_get_ctime(inode).tv_sec); str->di_goal_meta = cpu_to_be64(ip->i_goal); str->di_goal_data = cpu_to_be64(ip->i_goal); @@ -429,7 +429,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) str->di_eattr = cpu_to_be64(ip->i_eattr); str->di_atime_nsec = cpu_to_be32(inode->i_atime.tv_nsec); str->di_mtime_nsec = cpu_to_be32(inode->i_mtime.tv_nsec); - str->di_ctime_nsec = cpu_to_be32(inode->i_ctime.tv_nsec); + str->di_ctime_nsec = cpu_to_be32(inode_get_ctime(inode).tv_nsec); } /** @@ -689,7 +689,7 @@ static int gfs2_freeze_locally(struct gfs2_sbd *sdp) struct super_block *sb = sdp->sd_vfs; int error; - error = freeze_super(sb); + error = freeze_super(sb, FREEZE_HOLDER_USERSPACE); if (error) return error; @@ -697,7 +697,9 @@ static int gfs2_freeze_locally(struct gfs2_sbd *sdp) gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE | GFS2_LFC_FREEZE_GO_SYNC); if (gfs2_withdrawn(sdp)) { - thaw_super(sb); + error = thaw_super(sb, FREEZE_HOLDER_USERSPACE); + if (error) + return error; return -EIO; } } @@ -712,7 +714,7 @@ static int gfs2_do_thaw(struct gfs2_sbd *sdp) error = gfs2_freeze_lock_shared(sdp); if (error) goto fail; - error = thaw_super(sb); + error = thaw_super(sb, FREEZE_HOLDER_USERSPACE); if (!error) return 0; @@ -761,7 +763,7 @@ out: * */ -static int gfs2_freeze_super(struct super_block *sb) +static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who) { struct gfs2_sbd *sdp = sb->s_fs_info; int error; @@ -816,7 +818,7 @@ out: * */ -static int gfs2_thaw_super(struct super_block *sb) +static int gfs2_thaw_super(struct super_block *sb, enum freeze_holder who) { struct gfs2_sbd *sdp = sb->s_fs_info; int error; diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 2dfbe2f188dd..c60bc7f628e1 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -168,10 +168,10 @@ static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) switch (n) { case 0: - error = thaw_super(sdp->sd_vfs); + error = thaw_super(sdp->sd_vfs, FREEZE_HOLDER_USERSPACE); break; case 1: - error = freeze_super(sdp->sd_vfs); + error = freeze_super(sdp->sd_vfs, FREEZE_HOLDER_USERSPACE); break; default: return -EINVAL; diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 93b36d026bb4..4fea70c0fe3d 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -311,7 +311,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, ea->ea_num_ptrs = 0; } - ip->i_inode.i_ctime = current_time(&ip->i_inode); + inode_set_ctime_current(&ip->i_inode); __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC); gfs2_trans_end(sdp); @@ -763,7 +763,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, if (error) goto out_end_trans; - ip->i_inode.i_ctime = current_time(&ip->i_inode); + inode_set_ctime_current(&ip->i_inode); __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC); out_end_trans: @@ -888,7 +888,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, if (es->es_el) ea_set_remove_stuffed(ip, es->es_el); - ip->i_inode.i_ctime = current_time(&ip->i_inode); + inode_set_ctime_current(&ip->i_inode); __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC); gfs2_trans_end(GFS2_SB(&ip->i_inode)); @@ -1106,7 +1106,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) ea->ea_type = GFS2_EATYPE_UNUSED; } - ip->i_inode.i_ctime = current_time(&ip->i_inode); + inode_set_ctime_current(&ip->i_inode); __mark_inode_dirty(&ip->i_inode, I_DIRTY_DATASYNC); gfs2_trans_end(GFS2_SB(&ip->i_inode)); diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index d365bf0b8c77..632c226a3972 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -133,7 +133,7 @@ int hfs_cat_create(u32 cnid, struct inode *dir, const struct qstr *str, struct i goto err1; dir->i_size++; - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); hfs_find_exit(&fd); return 0; @@ -269,7 +269,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, const struct qstr *str) } dir->i_size--; - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); res = 0; out: @@ -337,7 +337,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name, if (err) goto out; dst_dir->i_size++; - dst_dir->i_mtime = dst_dir->i_ctime = current_time(dst_dir); + dst_dir->i_mtime = inode_set_ctime_current(dst_dir); mark_inode_dirty(dst_dir); /* finally remove the old entry */ @@ -349,7 +349,7 @@ int hfs_cat_move(u32 cnid, struct inode *src_dir, const struct qstr *src_name, if (err) goto out; src_dir->i_size--; - src_dir->i_mtime = src_dir->i_ctime = current_time(src_dir); + src_dir->i_mtime = inode_set_ctime_current(src_dir); mark_inode_dirty(src_dir); type = entry.type; diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 3e1e3dcf0b48..b75c26045df4 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -263,7 +263,7 @@ static int hfs_remove(struct inode *dir, struct dentry *dentry) if (res) return res; clear_nlink(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); hfs_delete_inode(inode); mark_inode_dirty(inode); return 0; diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 441d7fc952e3..ee349b72cfb3 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -200,7 +200,7 @@ struct inode *hfs_new_inode(struct inode *dir, const struct qstr *name, umode_t inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); set_nlink(inode, 1); - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); HFS_I(inode)->flags = 0; HFS_I(inode)->rsrc_inode = NULL; HFS_I(inode)->fs_blocks = 0; @@ -355,8 +355,8 @@ static int hfs_read_inode(struct inode *inode, void *data) inode->i_mode |= S_IWUGO; inode->i_mode &= ~hsb->s_file_umask; inode->i_mode |= S_IFREG; - inode->i_ctime = inode->i_atime = inode->i_mtime = - hfs_m_to_utime(rec->file.MdDat); + inode->i_atime = inode->i_mtime = inode_set_ctime_to_ts(inode, + hfs_m_to_utime(rec->file.MdDat)); inode->i_op = &hfs_file_inode_operations; inode->i_fop = &hfs_file_operations; inode->i_mapping->a_ops = &hfs_aops; @@ -366,8 +366,8 @@ static int hfs_read_inode(struct inode *inode, void *data) inode->i_size = be16_to_cpu(rec->dir.Val) + 2; HFS_I(inode)->fs_blocks = 0; inode->i_mode = S_IFDIR | (S_IRWXUGO & ~hsb->s_dir_umask); - inode->i_ctime = inode->i_atime = inode->i_mtime = - hfs_m_to_utime(rec->dir.MdDat); + inode->i_atime = inode->i_mtime = inode_set_ctime_to_ts(inode, + hfs_m_to_utime(rec->dir.MdDat)); inode->i_op = &hfs_dir_inode_operations; inode->i_fop = &hfs_dir_operations; break; @@ -654,8 +654,7 @@ int hfs_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, truncate_setsize(inode, attr->ia_size); hfs_file_truncate(inode); - inode->i_atime = inode->i_mtime = inode->i_ctime = - current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); } setattr_copy(&nop_mnt_idmap, inode, attr); diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c index 2875961fdc10..dc27d418fbcd 100644 --- a/fs/hfs/sysdep.c +++ b/fs/hfs/sysdep.c @@ -28,7 +28,9 @@ static int hfs_revalidate_dentry(struct dentry *dentry, unsigned int flags) /* fix up inode on a timezone change */ diff = sys_tz.tz_minuteswest * 60 - HFS_I(inode)->tz_secondswest; if (diff) { - inode->i_ctime.tv_sec += diff; + struct timespec64 ctime = inode_get_ctime(inode); + + inode_set_ctime(inode, ctime.tv_sec + diff, ctime.tv_nsec); inode->i_atime.tv_sec += diff; inode->i_mtime.tv_sec += diff; HFS_I(inode)->tz_secondswest += diff; diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index 35472cba750e..e71ae2537eaa 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -312,7 +312,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, dir->i_size++; if (S_ISDIR(inode->i_mode)) hfsplus_subfolders_inc(dir); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY); hfs_find_exit(&fd); @@ -417,7 +417,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, const struct qstr *str) dir->i_size--; if (type == HFSPLUS_FOLDER) hfsplus_subfolders_dec(dir); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); hfsplus_mark_inode_dirty(dir, HFSPLUS_I_CAT_DIRTY); if (type == HFSPLUS_FILE || type == HFSPLUS_FOLDER) { @@ -494,7 +494,7 @@ int hfsplus_rename_cat(u32 cnid, dst_dir->i_size++; if (type == HFSPLUS_FOLDER) hfsplus_subfolders_inc(dst_dir); - dst_dir->i_mtime = dst_dir->i_ctime = current_time(dst_dir); + dst_dir->i_mtime = inode_set_ctime_current(dst_dir); /* finally remove the old entry */ err = hfsplus_cat_build_key(sb, src_fd.search_key, @@ -511,7 +511,7 @@ int hfsplus_rename_cat(u32 cnid, src_dir->i_size--; if (type == HFSPLUS_FOLDER) hfsplus_subfolders_dec(src_dir); - src_dir->i_mtime = src_dir->i_ctime = current_time(src_dir); + src_dir->i_mtime = inode_set_ctime_current(src_dir); /* remove old thread entry */ hfsplus_cat_build_key_with_cnid(sb, src_fd.search_key, cnid); diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 56fb5f1312e7..f5c4b3e31a1c 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -346,7 +346,7 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, inc_nlink(inode); hfsplus_instantiate(dst_dentry, inode, cnid); ihold(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); sbi->file_count++; hfsplus_mark_mdb_dirty(dst_dir->i_sb); @@ -405,7 +405,7 @@ static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) hfsplus_delete_inode(inode); } else sbi->file_count--; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); out: mutex_unlock(&sbi->vh_mutex); @@ -426,7 +426,7 @@ static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry) if (res) goto out; clear_nlink(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); hfsplus_delete_inode(inode); mark_inode_dirty(inode); out: diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 7d1a675e037d..c65c8c4b03dd 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -267,7 +267,7 @@ static int hfsplus_setattr(struct mnt_idmap *idmap, } truncate_setsize(inode, attr->ia_size); hfsplus_file_truncate(inode); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); } setattr_copy(&nop_mnt_idmap, inode, attr); @@ -298,7 +298,7 @@ int hfsplus_getattr(struct mnt_idmap *idmap, const struct path *path, stat->attributes_mask |= STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP; - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } @@ -392,7 +392,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, struct inode *dir, inode->i_ino = sbi->next_cnid++; inode_init_owner(&nop_mnt_idmap, inode, dir, mode); set_nlink(inode, 1); - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); hip = HFSPLUS_I(inode); INIT_LIST_HEAD(&hip->open_dir_list); @@ -523,7 +523,8 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) inode->i_size = 2 + be32_to_cpu(folder->valence); inode->i_atime = hfsp_mt2ut(folder->access_date); inode->i_mtime = hfsp_mt2ut(folder->content_mod_date); - inode->i_ctime = hfsp_mt2ut(folder->attribute_mod_date); + inode_set_ctime_to_ts(inode, + hfsp_mt2ut(folder->attribute_mod_date)); HFSPLUS_I(inode)->create_date = folder->create_date; HFSPLUS_I(inode)->fs_blocks = 0; if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) { @@ -564,7 +565,8 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) } inode->i_atime = hfsp_mt2ut(file->access_date); inode->i_mtime = hfsp_mt2ut(file->content_mod_date); - inode->i_ctime = hfsp_mt2ut(file->attribute_mod_date); + inode_set_ctime_to_ts(inode, + hfsp_mt2ut(file->attribute_mod_date)); HFSPLUS_I(inode)->create_date = file->create_date; } else { pr_err("bad catalog entry used to create inode\n"); @@ -609,7 +611,7 @@ int hfsplus_cat_write_inode(struct inode *inode) hfsplus_cat_set_perms(inode, &folder->permissions); folder->access_date = hfsp_ut2mt(inode->i_atime); folder->content_mod_date = hfsp_ut2mt(inode->i_mtime); - folder->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); + folder->attribute_mod_date = hfsp_ut2mt(inode_get_ctime(inode)); folder->valence = cpu_to_be32(inode->i_size - 2); if (folder->flags & cpu_to_be16(HFSPLUS_HAS_FOLDER_COUNT)) { folder->subfolders = @@ -644,7 +646,7 @@ int hfsplus_cat_write_inode(struct inode *inode) file->flags &= cpu_to_be16(~HFSPLUS_FILE_LOCKED); file->access_date = hfsp_ut2mt(inode->i_atime); file->content_mod_date = hfsp_ut2mt(inode->i_mtime); - file->attribute_mod_date = hfsp_ut2mt(inode->i_ctime); + file->attribute_mod_date = hfsp_ut2mt(inode_get_ctime(inode)); hfs_bnode_write(fd.bnode, &entry, fd.entryoffset, sizeof(struct hfsplus_cat_file)); } @@ -700,7 +702,7 @@ int hfsplus_fileattr_set(struct mnt_idmap *idmap, else hip->userflags &= ~HFSPLUS_FLG_NODUMP; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); return 0; diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 46387090eb76..dc5a5cea5fae 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -517,8 +517,7 @@ static int hostfs_inode_update(struct inode *ino, const struct hostfs_stat *st) (struct timespec64){ st->atime.tv_sec, st->atime.tv_nsec }; ino->i_mtime = (struct timespec64){ st->mtime.tv_sec, st->mtime.tv_nsec }; - ino->i_ctime = - (struct timespec64){ st->ctime.tv_sec, st->ctime.tv_nsec }; + inode_set_ctime(ino, st->ctime.tv_sec, st->ctime.tv_nsec); ino->i_size = st->size; ino->i_blocks = st->blocks; return 0; diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index f32f15669996..f36566d61215 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -277,10 +277,10 @@ struct dentry *hpfs_lookup(struct inode *dir, struct dentry *dentry, unsigned in * inode. */ - if (!result->i_ctime.tv_sec) { - if (!(result->i_ctime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->creation_date)))) - result->i_ctime.tv_sec = 1; - result->i_ctime.tv_nsec = 0; + if (!inode_get_ctime(result).tv_sec) { + time64_t csec = local_to_gmt(dir->i_sb, le32_to_cpu(de->creation_date)); + + inode_set_ctime(result, csec ? csec : 1, 0); result->i_mtime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->write_date)); result->i_mtime.tv_nsec = 0; result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(de->read_date)); diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index e50e92a42432..479166378bae 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -36,7 +36,7 @@ void hpfs_init_inode(struct inode *i) hpfs_inode->i_rddir_off = NULL; hpfs_inode->i_dirty = 0; - i->i_ctime.tv_sec = i->i_ctime.tv_nsec = 0; + inode_set_ctime(i, 0, 0); i->i_mtime.tv_sec = i->i_mtime.tv_nsec = 0; i->i_atime.tv_sec = i->i_atime.tv_nsec = 0; } @@ -232,7 +232,7 @@ void hpfs_write_inode_nolock(struct inode *i) if (de) { de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec)); de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec)); - de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_ctime.tv_sec)); + de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime(i).tv_sec)); de->read_only = !(i->i_mode & 0222); de->ea_size = cpu_to_le32(hpfs_inode->i_ea_size); hpfs_mark_4buffers_dirty(&qbh); @@ -242,7 +242,7 @@ void hpfs_write_inode_nolock(struct inode *i) if ((de = map_dirent(i, hpfs_inode->i_dno, "\001\001", 2, NULL, &qbh))) { de->write_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_mtime.tv_sec)); de->read_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_atime.tv_sec)); - de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, i->i_ctime.tv_sec)); + de->creation_date = cpu_to_le32(gmt_to_local(i->i_sb, inode_get_ctime(i).tv_sec)); de->read_only = !(i->i_mode & 0222); de->ea_size = cpu_to_le32(/*hpfs_inode->i_ea_size*/0); de->file_size = cpu_to_le32(0); diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index 69fb40b2c99a..f4eb8d6f5989 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -13,10 +13,9 @@ static void hpfs_update_directory_times(struct inode *dir) { time64_t t = local_to_gmt(dir->i_sb, local_get_seconds(dir->i_sb)); if (t == dir->i_mtime.tv_sec && - t == dir->i_ctime.tv_sec) + t == inode_get_ctime(dir).tv_sec) return; - dir->i_mtime.tv_sec = dir->i_ctime.tv_sec = t; - dir->i_mtime.tv_nsec = dir->i_ctime.tv_nsec = 0; + dir->i_mtime = inode_set_ctime(dir, t, 0); hpfs_write_inode_nolock(dir); } @@ -59,10 +58,8 @@ static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, result->i_ino = fno; hpfs_i(result)->i_parent_dir = dir->i_ino; hpfs_i(result)->i_dno = dno; - result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); - result->i_ctime.tv_nsec = 0; - result->i_mtime.tv_nsec = 0; - result->i_atime.tv_nsec = 0; + result->i_mtime = result->i_atime = + inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0); hpfs_i(result)->i_ea_size = 0; result->i_mode |= S_IFDIR; result->i_op = &hpfs_dir_iops; @@ -167,10 +164,8 @@ static int hpfs_create(struct mnt_idmap *idmap, struct inode *dir, result->i_fop = &hpfs_file_ops; set_nlink(result, 1); hpfs_i(result)->i_parent_dir = dir->i_ino; - result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); - result->i_ctime.tv_nsec = 0; - result->i_mtime.tv_nsec = 0; - result->i_atime.tv_nsec = 0; + result->i_mtime = result->i_atime = + inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0); hpfs_i(result)->i_ea_size = 0; if (dee.read_only) result->i_mode &= ~0222; @@ -250,10 +245,8 @@ static int hpfs_mknod(struct mnt_idmap *idmap, struct inode *dir, hpfs_init_inode(result); result->i_ino = fno; hpfs_i(result)->i_parent_dir = dir->i_ino; - result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); - result->i_ctime.tv_nsec = 0; - result->i_mtime.tv_nsec = 0; - result->i_atime.tv_nsec = 0; + result->i_mtime = result->i_atime = + inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0); hpfs_i(result)->i_ea_size = 0; result->i_uid = current_fsuid(); result->i_gid = current_fsgid(); @@ -326,10 +319,8 @@ static int hpfs_symlink(struct mnt_idmap *idmap, struct inode *dir, result->i_ino = fno; hpfs_init_inode(result); hpfs_i(result)->i_parent_dir = dir->i_ino; - result->i_ctime.tv_sec = result->i_mtime.tv_sec = result->i_atime.tv_sec = local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)); - result->i_ctime.tv_nsec = 0; - result->i_mtime.tv_nsec = 0; - result->i_atime.tv_nsec = 0; + result->i_mtime = result->i_atime = + inode_set_ctime(result, local_to_gmt(dir->i_sb, le32_to_cpu(dee.creation_date)), 0); hpfs_i(result)->i_ea_size = 0; result->i_mode = S_IFLNK | 0777; result->i_uid = current_fsuid(); diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 1cb89595b875..758a51564124 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -729,8 +729,9 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) root->i_atime.tv_nsec = 0; root->i_mtime.tv_sec = local_to_gmt(s, le32_to_cpu(de->write_date)); root->i_mtime.tv_nsec = 0; - root->i_ctime.tv_sec = local_to_gmt(s, le32_to_cpu(de->creation_date)); - root->i_ctime.tv_nsec = 0; + inode_set_ctime(root, + local_to_gmt(s, le32_to_cpu(de->creation_date)), + 0); hpfs_i(root)->i_ea_size = le32_to_cpu(de->ea_size); hpfs_i(root)->i_parent_dir = root->i_ino; if (root->i_size == -1) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 7b17ccfa039d..93d3bcfd4fc8 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -887,7 +887,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) i_size_write(inode, offset + len); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); out: inode_unlock(inode); return error; @@ -935,7 +935,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, inode->i_mode = S_IFDIR | ctx->mode; inode->i_uid = ctx->uid; inode->i_gid = ctx->gid; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); inode->i_op = &hugetlbfs_dir_inode_operations; inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ @@ -979,7 +979,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, &hugetlbfs_i_mmap_rwsem_key); inode->i_mapping->a_ops = &hugetlbfs_aops; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); inode->i_mapping->private_data = resv_map; info->seals = F_SEAL_SEAL; switch (mode & S_IFMT) { @@ -1022,7 +1022,7 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir, inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev); if (!inode) return -ENOSPC; - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); d_instantiate(dentry, inode); dget(dentry);/* Extra count - pin the dentry in core */ return 0; @@ -1054,7 +1054,7 @@ static int hugetlbfs_tmpfile(struct mnt_idmap *idmap, inode = hugetlbfs_get_inode(dir->i_sb, dir, mode | S_IFREG, 0); if (!inode) return -ENOSPC; - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); d_tmpfile(file, inode); return finish_open_simple(file, 0); } @@ -1076,7 +1076,7 @@ static int hugetlbfs_symlink(struct mnt_idmap *idmap, } else iput(inode); } - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); return error; } diff --git a/fs/inode.c b/fs/inode.c index 67611a360031..35fd688168c5 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -751,16 +751,11 @@ EXPORT_SYMBOL_GPL(evict_inodes); /** * invalidate_inodes - attempt to free all inodes on a superblock * @sb: superblock to operate on - * @kill_dirty: flag to guide handling of dirty inodes * - * Attempts to free all inodes for a given superblock. If there were any - * busy inodes return a non-zero value, else zero. - * If @kill_dirty is set, discard dirty inodes too, otherwise treat - * them as busy. + * Attempts to free all inodes (including dirty inodes) for a given superblock. */ -int invalidate_inodes(struct super_block *sb, bool kill_dirty) +void invalidate_inodes(struct super_block *sb) { - int busy = 0; struct inode *inode, *next; LIST_HEAD(dispose); @@ -772,14 +767,8 @@ again: spin_unlock(&inode->i_lock); continue; } - if (inode->i_state & I_DIRTY_ALL && !kill_dirty) { - spin_unlock(&inode->i_lock); - busy = 1; - continue; - } if (atomic_read(&inode->i_count)) { spin_unlock(&inode->i_lock); - busy = 1; continue; } @@ -797,8 +786,6 @@ again: spin_unlock(&sb->s_inode_list_lock); dispose_list(&dispose); - - return busy; } /* @@ -1850,6 +1837,7 @@ EXPORT_SYMBOL(bmap); static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, struct timespec64 now) { + struct timespec64 ctime; if (!(mnt->mnt_flags & MNT_RELATIME)) return 1; @@ -1861,7 +1849,8 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, /* * Is ctime younger than or equal to atime? If yes, update atime: */ - if (timespec64_compare(&inode->i_ctime, &inode->i_atime) >= 0) + ctime = inode_get_ctime(inode); + if (timespec64_compare(&ctime, &inode->i_atime) >= 0) return 1; /* @@ -1876,29 +1865,76 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode, return 0; } -int generic_update_time(struct inode *inode, struct timespec64 *time, int flags) +/** + * inode_update_timestamps - update the timestamps on the inode + * @inode: inode to be updated + * @flags: S_* flags that needed to be updated + * + * The update_time function is called when an inode's timestamps need to be + * updated for a read or write operation. This function handles updating the + * actual timestamps. It's up to the caller to ensure that the inode is marked + * dirty appropriately. + * + * In the case where any of S_MTIME, S_CTIME, or S_VERSION need to be updated, + * attempt to update all three of them. S_ATIME updates can be handled + * independently of the rest. + * + * Returns a set of S_* flags indicating which values changed. + */ +int inode_update_timestamps(struct inode *inode, int flags) { - int dirty_flags = 0; + int updated = 0; + struct timespec64 now; + + if (flags & (S_MTIME|S_CTIME|S_VERSION)) { + struct timespec64 ctime = inode_get_ctime(inode); - if (flags & (S_ATIME | S_CTIME | S_MTIME)) { - if (flags & S_ATIME) - inode->i_atime = *time; - if (flags & S_CTIME) - inode->i_ctime = *time; - if (flags & S_MTIME) - inode->i_mtime = *time; - - if (inode->i_sb->s_flags & SB_LAZYTIME) - dirty_flags |= I_DIRTY_TIME; - else - dirty_flags |= I_DIRTY_SYNC; + now = inode_set_ctime_current(inode); + if (!timespec64_equal(&now, &ctime)) + updated |= S_CTIME; + if (!timespec64_equal(&now, &inode->i_mtime)) { + inode->i_mtime = now; + updated |= S_MTIME; + } + if (IS_I_VERSION(inode) && inode_maybe_inc_iversion(inode, updated)) + updated |= S_VERSION; + } else { + now = current_time(inode); } - if ((flags & S_VERSION) && inode_maybe_inc_iversion(inode, false)) - dirty_flags |= I_DIRTY_SYNC; + if (flags & S_ATIME) { + if (!timespec64_equal(&now, &inode->i_atime)) { + inode->i_atime = now; + updated |= S_ATIME; + } + } + return updated; +} +EXPORT_SYMBOL(inode_update_timestamps); +/** + * generic_update_time - update the timestamps on the inode + * @inode: inode to be updated + * @flags: S_* flags that needed to be updated + * + * The update_time function is called when an inode's timestamps need to be + * updated for a read or write operation. In the case where any of S_MTIME, S_CTIME, + * or S_VERSION need to be updated we attempt to update all three of them. S_ATIME + * updates can be handled done independently of the rest. + * + * Returns a S_* mask indicating which fields were updated. + */ +int generic_update_time(struct inode *inode, int flags) +{ + int updated = inode_update_timestamps(inode, flags); + int dirty_flags = 0; + + if (updated & (S_ATIME|S_MTIME|S_CTIME)) + dirty_flags = inode->i_sb->s_flags & SB_LAZYTIME ? I_DIRTY_TIME : I_DIRTY_SYNC; + if (updated & S_VERSION) + dirty_flags |= I_DIRTY_SYNC; __mark_inode_dirty(inode, dirty_flags); - return 0; + return updated; } EXPORT_SYMBOL(generic_update_time); @@ -1906,11 +1942,12 @@ EXPORT_SYMBOL(generic_update_time); * This does the actual work of updating an inodes time or version. Must have * had called mnt_want_write() before calling this. */ -int inode_update_time(struct inode *inode, struct timespec64 *time, int flags) +int inode_update_time(struct inode *inode, int flags) { if (inode->i_op->update_time) - return inode->i_op->update_time(inode, time, flags); - return generic_update_time(inode, time, flags); + return inode->i_op->update_time(inode, flags); + generic_update_time(inode, flags); + return 0; } EXPORT_SYMBOL(inode_update_time); @@ -1962,7 +1999,6 @@ void touch_atime(const struct path *path) { struct vfsmount *mnt = path->mnt; struct inode *inode = d_inode(path->dentry); - struct timespec64 now; if (!atime_needs_update(path, inode)) return; @@ -1981,8 +2017,7 @@ void touch_atime(const struct path *path) * We may also fail on filesystems that have the ability to make parts * of the fs read only, e.g. subvolumes in Btrfs. */ - now = current_time(inode); - inode_update_time(inode, &now, S_ATIME); + inode_update_time(inode, S_ATIME); __mnt_drop_write(mnt); skip_update: sb_end_write(inode->i_sb); @@ -2067,18 +2102,63 @@ int file_remove_privs(struct file *file) } EXPORT_SYMBOL(file_remove_privs); -static int inode_needs_update_time(struct inode *inode, struct timespec64 *now) +/** + * current_mgtime - Return FS time (possibly fine-grained) + * @inode: inode. + * + * Return the current time truncated to the time granularity supported by + * the fs, as suitable for a ctime/mtime change. If the ctime is flagged + * as having been QUERIED, get a fine-grained timestamp. + */ +struct timespec64 current_mgtime(struct inode *inode) +{ + struct timespec64 now, ctime; + atomic_long_t *pnsec = (atomic_long_t *)&inode->__i_ctime.tv_nsec; + long nsec = atomic_long_read(pnsec); + + if (nsec & I_CTIME_QUERIED) { + ktime_get_real_ts64(&now); + return timestamp_truncate(now, inode); + } + + ktime_get_coarse_real_ts64(&now); + now = timestamp_truncate(now, inode); + + /* + * If we've recently fetched a fine-grained timestamp + * then the coarse-grained one may still be earlier than the + * existing ctime. Just keep the existing value if so. + */ + ctime = inode_get_ctime(inode); + if (timespec64_compare(&ctime, &now) > 0) + now = ctime; + + return now; +} +EXPORT_SYMBOL(current_mgtime); + +static struct timespec64 current_ctime(struct inode *inode) +{ + if (is_mgtime(inode)) + return current_mgtime(inode); + return current_time(inode); +} + +static int inode_needs_update_time(struct inode *inode) { int sync_it = 0; + struct timespec64 now = current_ctime(inode); + struct timespec64 ctime; /* First try to exhaust all avenues to not sync */ if (IS_NOCMTIME(inode)) return 0; - if (!timespec64_equal(&inode->i_mtime, now)) + if (!timespec64_equal(&inode->i_mtime, &now)) sync_it = S_MTIME; - if (!timespec64_equal(&inode->i_ctime, now)) + ctime = inode_get_ctime(inode); + if (!timespec64_equal(&ctime, &now)) sync_it |= S_CTIME; if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode)) @@ -2087,15 +2167,14 @@ static int inode_needs_update_time(struct inode *inode, struct timespec64 *now) return sync_it; } -static int __file_update_time(struct file *file, struct timespec64 *now, - int sync_mode) +static int __file_update_time(struct file *file, int sync_mode) { int ret = 0; struct inode *inode = file_inode(file); /* try to update time settings */ if (!__mnt_want_write_file(file)) { - ret = inode_update_time(inode, now, sync_mode); + ret = inode_update_time(inode, sync_mode); __mnt_drop_write_file(file); } @@ -2120,13 +2199,12 @@ int file_update_time(struct file *file) { int ret; struct inode *inode = file_inode(file); - struct timespec64 now = current_time(inode); - ret = inode_needs_update_time(inode, &now); + ret = inode_needs_update_time(inode); if (ret <= 0) return ret; - return __file_update_time(file, &now, ret); + return __file_update_time(file, ret); } EXPORT_SYMBOL(file_update_time); @@ -2149,7 +2227,6 @@ static int file_modified_flags(struct file *file, int flags) { int ret; struct inode *inode = file_inode(file); - struct timespec64 now = current_time(inode); /* * Clear the security bits if the process is not being run by root. @@ -2162,13 +2239,13 @@ static int file_modified_flags(struct file *file, int flags) if (unlikely(file->f_mode & FMODE_NOCMTIME)) return 0; - ret = inode_needs_update_time(inode, &now); + ret = inode_needs_update_time(inode); if (ret <= 0) return ret; if (flags & IOCB_NOWAIT) return -EAGAIN; - return __file_update_time(file, &now, ret); + return __file_update_time(file, ret); } /** @@ -2488,15 +2565,59 @@ struct timespec64 current_time(struct inode *inode) struct timespec64 now; ktime_get_coarse_real_ts64(&now); + return timestamp_truncate(now, inode); +} +EXPORT_SYMBOL(current_time); - if (unlikely(!inode->i_sb)) { - WARN(1, "current_time() called with uninitialized super_block in the inode"); +/** + * inode_set_ctime_current - set the ctime to current_time + * @inode: inode + * + * Set the inode->i_ctime to the current value for the inode. Returns + * the current value that was assigned to i_ctime. + */ +struct timespec64 inode_set_ctime_current(struct inode *inode) +{ + struct timespec64 now; + struct timespec64 ctime; + + ctime.tv_nsec = READ_ONCE(inode->__i_ctime.tv_nsec); + if (!(ctime.tv_nsec & I_CTIME_QUERIED)) { + now = current_time(inode); + + /* Just copy it into place if it's not multigrain */ + if (!is_mgtime(inode)) { + inode_set_ctime_to_ts(inode, now); + return now; + } + + /* + * If we've recently updated with a fine-grained timestamp, + * then the coarse-grained one may still be earlier than the + * existing ctime. Just keep the existing value if so. + */ + ctime.tv_sec = inode->__i_ctime.tv_sec; + if (timespec64_compare(&ctime, &now) > 0) + return ctime; + + /* + * Ctime updates are usually protected by the inode_lock, but + * we can still race with someone setting the QUERIED flag. + * Try to swap the new nsec value into place. If it's changed + * in the interim, then just go with a fine-grained timestamp. + */ + if (cmpxchg(&inode->__i_ctime.tv_nsec, ctime.tv_nsec, + now.tv_nsec) != ctime.tv_nsec) + goto fine_grained; + inode->__i_ctime.tv_sec = now.tv_sec; return now; } - - return timestamp_truncate(now, inode); +fine_grained: + ktime_get_real_ts64(&now); + inode_set_ctime_to_ts(inode, timestamp_truncate(now, inode)); + return now; } -EXPORT_SYMBOL(current_time); +EXPORT_SYMBOL(inode_set_ctime_current); /** * in_group_or_capable - check whether caller is CAP_FSETID privileged diff --git a/fs/internal.h b/fs/internal.h index f7a3dc111026..74d3b161dd2c 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -115,7 +115,7 @@ static inline void put_file_access(struct file *file) * super.c */ extern int reconfigure_super(struct fs_context *); -extern bool trylock_super(struct super_block *sb); +extern bool super_trylock_shared(struct super_block *sb); struct super_block *user_get_super(dev_t, bool excl); void put_super(struct super_block *sb); extern bool mount_capable(struct fs_context *); @@ -201,7 +201,7 @@ void lock_two_inodes(struct inode *inode1, struct inode *inode2, * fs-writeback.c */ extern long get_nr_dirty_inodes(void); -extern int invalidate_inodes(struct super_block *, bool); +void invalidate_inodes(struct super_block *sb); /* * dcache.c diff --git a/fs/ioctl.c b/fs/ioctl.c index 5b2481cd4750..f5fd99d6b0d4 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -109,9 +109,6 @@ static int ioctl_fibmap(struct file *filp, int __user *p) * Returns 0 on success, -errno on error, 1 if this was the last * extent that will fit in user array. */ -#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC) -#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED) -#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE) int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, u64 phys, u64 len, u32 flags) { @@ -127,6 +124,10 @@ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max) return 1; +#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC) +#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED) +#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE) + if (flags & SET_UNKNOWN_FLAGS) flags |= FIEMAP_EXTENT_UNKNOWN; if (flags & SET_NO_UNMOUNTED_IO_FLAGS) @@ -396,8 +397,8 @@ static int ioctl_fsfreeze(struct file *filp) /* Freeze */ if (sb->s_op->freeze_super) - return sb->s_op->freeze_super(sb); - return freeze_super(sb); + return sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE); + return freeze_super(sb, FREEZE_HOLDER_USERSPACE); } static int ioctl_fsthaw(struct file *filp) @@ -409,8 +410,8 @@ static int ioctl_fsthaw(struct file *filp) /* Thaw */ if (sb->s_op->thaw_super) - return sb->s_op->thaw_super(sb); - return thaw_super(sb); + return sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE); + return thaw_super(sb, FREEZE_HOLDER_USERSPACE); } static int ioctl_file_dedupe_range(struct file *file, @@ -877,6 +878,9 @@ out: #ifdef CONFIG_COMPAT /** * compat_ptr_ioctl - generic implementation of .compat_ioctl file operation + * @file: The file to operate on. + * @cmd: The ioctl command number. + * @arg: The argument to the ioctl. * * This is not normally called as a function, but instead set in struct * file_operations as diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index aa8967cca1a3..283fb96f6609 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -23,65 +23,169 @@ #define IOEND_BATCH_SIZE 4096 +typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length); /* - * Structure allocated for each folio when block size < folio size - * to track sub-folio uptodate status and I/O completions. + * Structure allocated for each folio to track per-block uptodate, dirty state + * and I/O completions. */ -struct iomap_page { +struct iomap_folio_state { atomic_t read_bytes_pending; atomic_t write_bytes_pending; - spinlock_t uptodate_lock; - unsigned long uptodate[]; + spinlock_t state_lock; + + /* + * Each block has two bits in this bitmap: + * Bits [0..blocks_per_folio) has the uptodate status. + * Bits [b_p_f...(2*b_p_f)) has the dirty status. + */ + unsigned long state[]; }; -static inline struct iomap_page *to_iomap_page(struct folio *folio) +static struct bio_set iomap_ioend_bioset; + +static inline bool ifs_is_fully_uptodate(struct folio *folio, + struct iomap_folio_state *ifs) { - if (folio_test_private(folio)) - return folio_get_private(folio); - return NULL; + struct inode *inode = folio->mapping->host; + + return bitmap_full(ifs->state, i_blocks_per_folio(inode, folio)); } -static struct bio_set iomap_ioend_bioset; +static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs, + unsigned int block) +{ + return test_bit(block, ifs->state); +} + +static void ifs_set_range_uptodate(struct folio *folio, + struct iomap_folio_state *ifs, size_t off, size_t len) +{ + struct inode *inode = folio->mapping->host; + unsigned int first_blk = off >> inode->i_blkbits; + unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; + unsigned int nr_blks = last_blk - first_blk + 1; + unsigned long flags; + + spin_lock_irqsave(&ifs->state_lock, flags); + bitmap_set(ifs->state, first_blk, nr_blks); + if (ifs_is_fully_uptodate(folio, ifs)) + folio_mark_uptodate(folio); + spin_unlock_irqrestore(&ifs->state_lock, flags); +} + +static void iomap_set_range_uptodate(struct folio *folio, size_t off, + size_t len) +{ + struct iomap_folio_state *ifs = folio->private; + + if (ifs) + ifs_set_range_uptodate(folio, ifs, off, len); + else + folio_mark_uptodate(folio); +} + +static inline bool ifs_block_is_dirty(struct folio *folio, + struct iomap_folio_state *ifs, int block) +{ + struct inode *inode = folio->mapping->host; + unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); + + return test_bit(block + blks_per_folio, ifs->state); +} + +static void ifs_clear_range_dirty(struct folio *folio, + struct iomap_folio_state *ifs, size_t off, size_t len) +{ + struct inode *inode = folio->mapping->host; + unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); + unsigned int first_blk = (off >> inode->i_blkbits); + unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; + unsigned int nr_blks = last_blk - first_blk + 1; + unsigned long flags; -static struct iomap_page * -iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags) + spin_lock_irqsave(&ifs->state_lock, flags); + bitmap_clear(ifs->state, first_blk + blks_per_folio, nr_blks); + spin_unlock_irqrestore(&ifs->state_lock, flags); +} + +static void iomap_clear_range_dirty(struct folio *folio, size_t off, size_t len) { - struct iomap_page *iop = to_iomap_page(folio); + struct iomap_folio_state *ifs = folio->private; + + if (ifs) + ifs_clear_range_dirty(folio, ifs, off, len); +} + +static void ifs_set_range_dirty(struct folio *folio, + struct iomap_folio_state *ifs, size_t off, size_t len) +{ + struct inode *inode = folio->mapping->host; + unsigned int blks_per_folio = i_blocks_per_folio(inode, folio); + unsigned int first_blk = (off >> inode->i_blkbits); + unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; + unsigned int nr_blks = last_blk - first_blk + 1; + unsigned long flags; + + spin_lock_irqsave(&ifs->state_lock, flags); + bitmap_set(ifs->state, first_blk + blks_per_folio, nr_blks); + spin_unlock_irqrestore(&ifs->state_lock, flags); +} + +static void iomap_set_range_dirty(struct folio *folio, size_t off, size_t len) +{ + struct iomap_folio_state *ifs = folio->private; + + if (ifs) + ifs_set_range_dirty(folio, ifs, off, len); +} + +static struct iomap_folio_state *ifs_alloc(struct inode *inode, + struct folio *folio, unsigned int flags) +{ + struct iomap_folio_state *ifs = folio->private; unsigned int nr_blocks = i_blocks_per_folio(inode, folio); gfp_t gfp; - if (iop || nr_blocks <= 1) - return iop; + if (ifs || nr_blocks <= 1) + return ifs; if (flags & IOMAP_NOWAIT) gfp = GFP_NOWAIT; else gfp = GFP_NOFS | __GFP_NOFAIL; - iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)), - gfp); - if (iop) { - spin_lock_init(&iop->uptodate_lock); - if (folio_test_uptodate(folio)) - bitmap_fill(iop->uptodate, nr_blocks); - folio_attach_private(folio, iop); - } - return iop; + /* + * ifs->state tracks two sets of state flags when the + * filesystem block size is smaller than the folio size. + * The first state tracks per-block uptodate and the + * second tracks per-block dirty state. + */ + ifs = kzalloc(struct_size(ifs, state, + BITS_TO_LONGS(2 * nr_blocks)), gfp); + if (!ifs) + return ifs; + + spin_lock_init(&ifs->state_lock); + if (folio_test_uptodate(folio)) + bitmap_set(ifs->state, 0, nr_blocks); + if (folio_test_dirty(folio)) + bitmap_set(ifs->state, nr_blocks, nr_blocks); + folio_attach_private(folio, ifs); + + return ifs; } -static void iomap_page_release(struct folio *folio) +static void ifs_free(struct folio *folio) { - struct iomap_page *iop = folio_detach_private(folio); - struct inode *inode = folio->mapping->host; - unsigned int nr_blocks = i_blocks_per_folio(inode, folio); + struct iomap_folio_state *ifs = folio_detach_private(folio); - if (!iop) + if (!ifs) return; - WARN_ON_ONCE(atomic_read(&iop->read_bytes_pending)); - WARN_ON_ONCE(atomic_read(&iop->write_bytes_pending)); - WARN_ON_ONCE(bitmap_full(iop->uptodate, nr_blocks) != + WARN_ON_ONCE(atomic_read(&ifs->read_bytes_pending)); + WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending)); + WARN_ON_ONCE(ifs_is_fully_uptodate(folio, ifs) != folio_test_uptodate(folio)); - kfree(iop); + kfree(ifs); } /* @@ -90,7 +194,7 @@ static void iomap_page_release(struct folio *folio) static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, loff_t *pos, loff_t length, size_t *offp, size_t *lenp) { - struct iomap_page *iop = to_iomap_page(folio); + struct iomap_folio_state *ifs = folio->private; loff_t orig_pos = *pos; loff_t isize = i_size_read(inode); unsigned block_bits = inode->i_blkbits; @@ -105,12 +209,12 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, * per-block uptodate status and adjust the offset and length if needed * to avoid reading in already uptodate ranges. */ - if (iop) { + if (ifs) { unsigned int i; /* move forward for each leading block marked uptodate */ for (i = first; i <= last; i++) { - if (!test_bit(i, iop->uptodate)) + if (!ifs_block_is_uptodate(ifs, i)) break; *pos += block_size; poff += block_size; @@ -120,7 +224,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, /* truncate len if we find any trailing uptodate block(s) */ for ( ; i <= last; i++) { - if (test_bit(i, iop->uptodate)) { + if (ifs_block_is_uptodate(ifs, i)) { plen -= (last - i + 1) * block_size; last = i - 1; break; @@ -144,43 +248,19 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, *lenp = plen; } -static void iomap_iop_set_range_uptodate(struct folio *folio, - struct iomap_page *iop, size_t off, size_t len) -{ - struct inode *inode = folio->mapping->host; - unsigned first = off >> inode->i_blkbits; - unsigned last = (off + len - 1) >> inode->i_blkbits; - unsigned long flags; - - spin_lock_irqsave(&iop->uptodate_lock, flags); - bitmap_set(iop->uptodate, first, last - first + 1); - if (bitmap_full(iop->uptodate, i_blocks_per_folio(inode, folio))) - folio_mark_uptodate(folio); - spin_unlock_irqrestore(&iop->uptodate_lock, flags); -} - -static void iomap_set_range_uptodate(struct folio *folio, - struct iomap_page *iop, size_t off, size_t len) -{ - if (iop) - iomap_iop_set_range_uptodate(folio, iop, off, len); - else - folio_mark_uptodate(folio); -} - static void iomap_finish_folio_read(struct folio *folio, size_t offset, size_t len, int error) { - struct iomap_page *iop = to_iomap_page(folio); + struct iomap_folio_state *ifs = folio->private; if (unlikely(error)) { folio_clear_uptodate(folio); folio_set_error(folio); } else { - iomap_set_range_uptodate(folio, iop, offset, len); + iomap_set_range_uptodate(folio, offset, len); } - if (!iop || atomic_sub_and_test(len, &iop->read_bytes_pending)) + if (!ifs || atomic_sub_and_test(len, &ifs->read_bytes_pending)) folio_unlock(folio); } @@ -213,7 +293,6 @@ struct iomap_readpage_ctx { static int iomap_read_inline_data(const struct iomap_iter *iter, struct folio *folio) { - struct iomap_page *iop; const struct iomap *iomap = iomap_iter_srcmap(iter); size_t size = i_size_read(iter->inode) - iomap->offset; size_t poff = offset_in_page(iomap->offset); @@ -231,15 +310,13 @@ static int iomap_read_inline_data(const struct iomap_iter *iter, if (WARN_ON_ONCE(size > iomap->length)) return -EIO; if (offset > 0) - iop = iomap_page_create(iter->inode, folio, iter->flags); - else - iop = to_iomap_page(folio); + ifs_alloc(iter->inode, folio, iter->flags); addr = kmap_local_folio(folio, offset); memcpy(addr, iomap->inline_data, size); memset(addr + size, 0, PAGE_SIZE - poff - size); kunmap_local(addr); - iomap_set_range_uptodate(folio, iop, offset, PAGE_SIZE - poff); + iomap_set_range_uptodate(folio, offset, PAGE_SIZE - poff); return 0; } @@ -260,7 +337,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, loff_t pos = iter->pos + offset; loff_t length = iomap_length(iter) - offset; struct folio *folio = ctx->cur_folio; - struct iomap_page *iop; + struct iomap_folio_state *ifs; loff_t orig_pos = pos; size_t poff, plen; sector_t sector; @@ -269,20 +346,20 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, return iomap_read_inline_data(iter, folio); /* zero post-eof blocks as the page may be mapped */ - iop = iomap_page_create(iter->inode, folio, iter->flags); + ifs = ifs_alloc(iter->inode, folio, iter->flags); iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen); if (plen == 0) goto done; if (iomap_block_needs_zeroing(iter, pos)) { folio_zero_range(folio, poff, plen); - iomap_set_range_uptodate(folio, iop, poff, plen); + iomap_set_range_uptodate(folio, poff, plen); goto done; } ctx->cur_folio_in_bio = true; - if (iop) - atomic_add(plen, &iop->read_bytes_pending); + if (ifs) + atomic_add(plen, &ifs->read_bytes_pending); sector = iomap_sector(iomap, pos); if (!ctx->bio || @@ -436,11 +513,11 @@ EXPORT_SYMBOL_GPL(iomap_readahead); */ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) { - struct iomap_page *iop = to_iomap_page(folio); + struct iomap_folio_state *ifs = folio->private; struct inode *inode = folio->mapping->host; unsigned first, last, i; - if (!iop) + if (!ifs) return false; /* Caller's range may extend past the end of this folio */ @@ -451,7 +528,7 @@ bool iomap_is_partially_uptodate(struct folio *folio, size_t from, size_t count) last = (from + count - 1) >> inode->i_blkbits; for (i = first; i <= last; i++) - if (!test_bit(i, iop->uptodate)) + if (!ifs_block_is_uptodate(ifs, i)) return false; return true; } @@ -461,16 +538,18 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate); * iomap_get_folio - get a folio reference for writing * @iter: iteration structure * @pos: start offset of write + * @len: Suggested size of folio to create. * * Returns a locked reference to the folio at @pos, or an error pointer if the * folio could not be obtained. */ -struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos) +struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len) { - unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS; + fgf_t fgp = FGP_WRITEBEGIN | FGP_NOFS; if (iter->flags & IOMAP_NOWAIT) fgp |= FGP_NOWAIT; + fgp |= fgf_set_order(len); return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, fgp, mapping_gfp_mask(iter->inode->i_mapping)); @@ -483,14 +562,13 @@ bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags) folio_size(folio)); /* - * mm accommodates an old ext3 case where clean folios might - * not have had the dirty bit cleared. Thus, it can send actual - * dirty folios to ->release_folio() via shrink_active_list(); - * skip those here. + * If the folio is dirty, we refuse to release our metadata because + * it may be partially dirty. Once we track per-block dirty state, + * we can release the metadata if every block is dirty. */ - if (folio_test_dirty(folio) || folio_test_writeback(folio)) + if (folio_test_dirty(folio)) return false; - iomap_page_release(folio); + ifs_free(folio); return true; } EXPORT_SYMBOL_GPL(iomap_release_folio); @@ -507,16 +585,22 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len) if (offset == 0 && len == folio_size(folio)) { WARN_ON_ONCE(folio_test_writeback(folio)); folio_cancel_dirty(folio); - iomap_page_release(folio); - } else if (folio_test_large(folio)) { - /* Must release the iop so the page can be split */ - WARN_ON_ONCE(!folio_test_uptodate(folio) && - folio_test_dirty(folio)); - iomap_page_release(folio); + ifs_free(folio); } } EXPORT_SYMBOL_GPL(iomap_invalidate_folio); +bool iomap_dirty_folio(struct address_space *mapping, struct folio *folio) +{ + struct inode *inode = mapping->host; + size_t len = folio_size(folio); + + ifs_alloc(inode, folio, 0); + iomap_set_range_dirty(folio, 0, len); + return filemap_dirty_folio(mapping, folio); +} +EXPORT_SYMBOL_GPL(iomap_dirty_folio); + static void iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) { @@ -547,7 +631,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, size_t len, struct folio *folio) { const struct iomap *srcmap = iomap_iter_srcmap(iter); - struct iomap_page *iop; + struct iomap_folio_state *ifs; loff_t block_size = i_blocksize(iter->inode); loff_t block_start = round_down(pos, block_size); loff_t block_end = round_up(pos + len, block_size); @@ -555,14 +639,23 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, size_t from = offset_in_folio(folio, pos), to = from + len; size_t poff, plen; - if (folio_test_uptodate(folio)) + /* + * If the write completely overlaps the current folio, then + * entire folio will be dirtied so there is no need for + * per-block state tracking structures to be attached to this folio. + */ + if (pos <= folio_pos(folio) && + pos + len >= folio_pos(folio) + folio_size(folio)) return 0; - folio_clear_error(folio); - iop = iomap_page_create(iter->inode, folio, iter->flags); - if ((iter->flags & IOMAP_NOWAIT) && !iop && nr_blocks > 1) + ifs = ifs_alloc(iter->inode, folio, iter->flags); + if ((iter->flags & IOMAP_NOWAIT) && !ifs && nr_blocks > 1) return -EAGAIN; + if (folio_test_uptodate(folio)) + return 0; + folio_clear_error(folio); + do { iomap_adjust_read_range(iter->inode, folio, &block_start, block_end - block_start, &poff, &plen); @@ -589,7 +682,7 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, if (status) return status; } - iomap_set_range_uptodate(folio, iop, poff, plen); + iomap_set_range_uptodate(folio, poff, plen); } while ((block_start += plen) < block_end); return 0; @@ -603,7 +696,7 @@ static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos, if (folio_ops && folio_ops->get_folio) return folio_ops->get_folio(iter, pos, len); else - return iomap_get_folio(iter, pos); + return iomap_get_folio(iter, pos, len); } static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret, @@ -696,7 +789,6 @@ out_unlock: static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, size_t copied, struct folio *folio) { - struct iomap_page *iop = to_iomap_page(folio); flush_dcache_folio(folio); /* @@ -712,7 +804,8 @@ static size_t __iomap_write_end(struct inode *inode, loff_t pos, size_t len, */ if (unlikely(copied < len && !folio_test_uptodate(folio))) return 0; - iomap_set_range_uptodate(folio, iop, offset_in_folio(folio, pos), len); + iomap_set_range_uptodate(folio, offset_in_folio(folio, pos), len); + iomap_set_range_dirty(folio, offset_in_folio(folio, pos), copied); filemap_dirty_folio(inode->i_mapping, folio); return copied; } @@ -773,6 +866,7 @@ static size_t iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) { loff_t length = iomap_length(iter); + size_t chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER; loff_t pos = iter->pos; ssize_t written = 0; long status = 0; @@ -781,15 +875,12 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) do { struct folio *folio; - struct page *page; - unsigned long offset; /* Offset into pagecache page */ - unsigned long bytes; /* Bytes to write to page */ + size_t offset; /* Offset into folio */ + size_t bytes; /* Bytes to write to folio */ size_t copied; /* Bytes copied from user */ - offset = offset_in_page(pos); - bytes = min_t(unsigned long, PAGE_SIZE - offset, - iov_iter_count(i)); -again: + offset = pos & (chunk - 1); + bytes = min(chunk - offset, iov_iter_count(i)); status = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags); if (unlikely(status)) @@ -819,12 +910,14 @@ again: if (iter->iomap.flags & IOMAP_F_STALE) break; - page = folio_file_page(folio, pos >> PAGE_SHIFT); - if (mapping_writably_mapped(mapping)) - flush_dcache_page(page); + offset = offset_in_folio(folio, pos); + if (bytes > folio_size(folio) - offset) + bytes = folio_size(folio) - offset; - copied = copy_page_from_iter_atomic(page, offset, bytes, i); + if (mapping_writably_mapped(mapping)) + flush_dcache_folio(folio); + copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); status = iomap_write_end(iter, pos, bytes, copied, folio); if (unlikely(copied != status)) @@ -840,11 +933,13 @@ again: */ if (copied) bytes = copied; - goto again; + if (chunk > PAGE_SIZE) + chunk /= 2; + } else { + pos += status; + written += status; + length -= status; } - pos += status; - written += status; - length -= status; } while (iov_iter_count(i) && length); if (status == -EAGAIN) { @@ -880,6 +975,76 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, } EXPORT_SYMBOL_GPL(iomap_file_buffered_write); +static int iomap_write_delalloc_ifs_punch(struct inode *inode, + struct folio *folio, loff_t start_byte, loff_t end_byte, + iomap_punch_t punch) +{ + unsigned int first_blk, last_blk, i; + loff_t last_byte; + u8 blkbits = inode->i_blkbits; + struct iomap_folio_state *ifs; + int ret = 0; + + /* + * When we have per-block dirty tracking, there can be + * blocks within a folio which are marked uptodate + * but not dirty. In that case it is necessary to punch + * out such blocks to avoid leaking any delalloc blocks. + */ + ifs = folio->private; + if (!ifs) + return ret; + + last_byte = min_t(loff_t, end_byte - 1, + folio_pos(folio) + folio_size(folio) - 1); + first_blk = offset_in_folio(folio, start_byte) >> blkbits; + last_blk = offset_in_folio(folio, last_byte) >> blkbits; + for (i = first_blk; i <= last_blk; i++) { + if (!ifs_block_is_dirty(folio, ifs, i)) { + ret = punch(inode, folio_pos(folio) + (i << blkbits), + 1 << blkbits); + if (ret) + return ret; + } + } + + return ret; +} + + +static int iomap_write_delalloc_punch(struct inode *inode, struct folio *folio, + loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte, + iomap_punch_t punch) +{ + int ret = 0; + + if (!folio_test_dirty(folio)) + return ret; + + /* if dirty, punch up to offset */ + if (start_byte > *punch_start_byte) { + ret = punch(inode, *punch_start_byte, + start_byte - *punch_start_byte); + if (ret) + return ret; + } + + /* Punch non-dirty blocks within folio */ + ret = iomap_write_delalloc_ifs_punch(inode, folio, start_byte, + end_byte, punch); + if (ret) + return ret; + + /* + * Make sure the next punch start is correctly bound to + * the end of this data range, not the end of the folio. + */ + *punch_start_byte = min_t(loff_t, end_byte, + folio_pos(folio) + folio_size(folio)); + + return ret; +} + /* * Scan the data range passed to us for dirty page cache folios. If we find a * dirty folio, punch out the preceeding range and update the offset from which @@ -899,10 +1064,11 @@ EXPORT_SYMBOL_GPL(iomap_file_buffered_write); */ static int iomap_write_delalloc_scan(struct inode *inode, loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte, - int (*punch)(struct inode *inode, loff_t offset, loff_t length)) + iomap_punch_t punch) { while (start_byte < end_byte) { struct folio *folio; + int ret; /* grab locked page */ folio = filemap_lock_folio(inode->i_mapping, @@ -913,26 +1079,12 @@ static int iomap_write_delalloc_scan(struct inode *inode, continue; } - /* if dirty, punch up to offset */ - if (folio_test_dirty(folio)) { - if (start_byte > *punch_start_byte) { - int error; - - error = punch(inode, *punch_start_byte, - start_byte - *punch_start_byte); - if (error) { - folio_unlock(folio); - folio_put(folio); - return error; - } - } - - /* - * Make sure the next punch start is correctly bound to - * the end of this data range, not the end of the folio. - */ - *punch_start_byte = min_t(loff_t, end_byte, - folio_next_index(folio) << PAGE_SHIFT); + ret = iomap_write_delalloc_punch(inode, folio, punch_start_byte, + start_byte, end_byte, punch); + if (ret) { + folio_unlock(folio); + folio_put(folio); + return ret; } /* move offset to start of next folio in range */ @@ -977,8 +1129,7 @@ static int iomap_write_delalloc_scan(struct inode *inode, * the code to subtle off-by-one bugs.... */ static int iomap_write_delalloc_release(struct inode *inode, - loff_t start_byte, loff_t end_byte, - int (*punch)(struct inode *inode, loff_t pos, loff_t length)) + loff_t start_byte, loff_t end_byte, iomap_punch_t punch) { loff_t punch_start_byte = start_byte; loff_t scan_end_byte = min(i_size_read(inode), end_byte); @@ -1071,8 +1222,7 @@ out_unlock: */ int iomap_file_buffered_write_punch_delalloc(struct inode *inode, struct iomap *iomap, loff_t pos, loff_t length, - ssize_t written, - int (*punch)(struct inode *inode, loff_t pos, loff_t length)) + ssize_t written, iomap_punch_t punch) { loff_t start_byte; loff_t end_byte; @@ -1293,17 +1443,17 @@ EXPORT_SYMBOL_GPL(iomap_page_mkwrite); static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, size_t len, int error) { - struct iomap_page *iop = to_iomap_page(folio); + struct iomap_folio_state *ifs = folio->private; if (error) { folio_set_error(folio); mapping_set_error(inode->i_mapping, error); } - WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !iop); - WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) <= 0); + WARN_ON_ONCE(i_blocks_per_folio(inode, folio) > 1 && !ifs); + WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) <= 0); - if (!iop || atomic_sub_and_test(len, &iop->write_bytes_pending)) + if (!ifs || atomic_sub_and_test(len, &ifs->write_bytes_pending)) folio_end_writeback(folio); } @@ -1570,7 +1720,7 @@ iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t offset, */ static void iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio, - struct iomap_page *iop, struct iomap_writepage_ctx *wpc, + struct iomap_folio_state *ifs, struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct list_head *iolist) { sector_t sector = iomap_sector(&wpc->iomap, pos); @@ -1588,8 +1738,8 @@ iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio, bio_add_folio_nofail(wpc->ioend->io_bio, folio, len, poff); } - if (iop) - atomic_add(len, &iop->write_bytes_pending); + if (ifs) + atomic_add(len, &ifs->write_bytes_pending); wpc->ioend->io_size += len; wbc_account_cgroup_owner(wbc, &folio->page, len); } @@ -1615,7 +1765,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, struct writeback_control *wbc, struct inode *inode, struct folio *folio, u64 end_pos) { - struct iomap_page *iop = iomap_page_create(inode, folio, 0); + struct iomap_folio_state *ifs = folio->private; struct iomap_ioend *ioend, *next; unsigned len = i_blocksize(inode); unsigned nblocks = i_blocks_per_folio(inode, folio); @@ -1623,7 +1773,14 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, int error = 0, count = 0, i; LIST_HEAD(submit_list); - WARN_ON_ONCE(iop && atomic_read(&iop->write_bytes_pending) != 0); + WARN_ON_ONCE(end_pos <= pos); + + if (!ifs && nblocks > 1) { + ifs = ifs_alloc(inode, folio, 0); + iomap_set_range_dirty(folio, 0, end_pos - pos); + } + + WARN_ON_ONCE(ifs && atomic_read(&ifs->write_bytes_pending) != 0); /* * Walk through the folio to find areas to write back. If we @@ -1631,7 +1788,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, * invalid, grab a new one. */ for (i = 0; i < nblocks && pos < end_pos; i++, pos += len) { - if (iop && !test_bit(i, iop->uptodate)) + if (ifs && !ifs_block_is_dirty(folio, ifs, i)) continue; error = wpc->ops->map_blocks(wpc, inode, pos); @@ -1642,7 +1799,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, continue; if (wpc->iomap.type == IOMAP_HOLE) continue; - iomap_add_to_ioend(inode, pos, folio, iop, wpc, wbc, + iomap_add_to_ioend(inode, pos, folio, ifs, wpc, wbc, &submit_list); count++; } @@ -1675,6 +1832,12 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, } } + /* + * We can have dirty bits set past end of file in page_mkwrite path + * while mapping the last partial folio. Hence it's better to clear + * all the dirty bits in the folio here. + */ + iomap_clear_range_dirty(folio, 0, folio_size(folio)); folio_start_writeback(folio); folio_unlock(folio); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index ea3b868c8355..bcd3f8cf5ea4 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -20,10 +20,12 @@ * Private flags for iomap_dio, must not overlap with the public ones in * iomap.h: */ -#define IOMAP_DIO_WRITE_FUA (1 << 28) -#define IOMAP_DIO_NEED_SYNC (1 << 29) -#define IOMAP_DIO_WRITE (1 << 30) -#define IOMAP_DIO_DIRTY (1 << 31) +#define IOMAP_DIO_CALLER_COMP (1U << 26) +#define IOMAP_DIO_INLINE_COMP (1U << 27) +#define IOMAP_DIO_WRITE_THROUGH (1U << 28) +#define IOMAP_DIO_NEED_SYNC (1U << 29) +#define IOMAP_DIO_WRITE (1U << 30) +#define IOMAP_DIO_DIRTY (1U << 31) struct iomap_dio { struct kiocb *iocb; @@ -41,7 +43,6 @@ struct iomap_dio { struct { struct iov_iter *iter; struct task_struct *waiter; - struct bio *poll_bio; } submit; /* used for aio completion: */ @@ -63,12 +64,14 @@ static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter, static void iomap_dio_submit_bio(const struct iomap_iter *iter, struct iomap_dio *dio, struct bio *bio, loff_t pos) { + struct kiocb *iocb = dio->iocb; + atomic_inc(&dio->ref); /* Sync dio can't be polled reliably */ - if ((dio->iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(dio->iocb)) { - bio_set_polled(bio, dio->iocb); - dio->submit.poll_bio = bio; + if ((iocb->ki_flags & IOCB_HIPRI) && !is_sync_kiocb(iocb)) { + bio_set_polled(bio, iocb); + WRITE_ONCE(iocb->private, bio); } if (dio->dops && dio->dops->submit_io) @@ -130,6 +133,11 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) } EXPORT_SYMBOL_GPL(iomap_dio_complete); +static ssize_t iomap_dio_deferred_complete(void *data) +{ + return iomap_dio_complete(data); +} + static void iomap_dio_complete_work(struct work_struct *work) { struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); @@ -152,27 +160,69 @@ void iomap_dio_bio_end_io(struct bio *bio) { struct iomap_dio *dio = bio->bi_private; bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); + struct kiocb *iocb = dio->iocb; if (bio->bi_status) iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); + if (!atomic_dec_and_test(&dio->ref)) + goto release_bio; - if (atomic_dec_and_test(&dio->ref)) { - if (dio->wait_for_completion) { - struct task_struct *waiter = dio->submit.waiter; - WRITE_ONCE(dio->submit.waiter, NULL); - blk_wake_io_task(waiter); - } else if (dio->flags & IOMAP_DIO_WRITE) { - struct inode *inode = file_inode(dio->iocb->ki_filp); - - WRITE_ONCE(dio->iocb->private, NULL); - INIT_WORK(&dio->aio.work, iomap_dio_complete_work); - queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work); - } else { - WRITE_ONCE(dio->iocb->private, NULL); - iomap_dio_complete_work(&dio->aio.work); - } + /* + * Synchronous dio, task itself will handle any completion work + * that needs after IO. All we need to do is wake the task. + */ + if (dio->wait_for_completion) { + struct task_struct *waiter = dio->submit.waiter; + + WRITE_ONCE(dio->submit.waiter, NULL); + blk_wake_io_task(waiter); + goto release_bio; + } + + /* + * Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline + */ + if (dio->flags & IOMAP_DIO_INLINE_COMP) { + WRITE_ONCE(iocb->private, NULL); + iomap_dio_complete_work(&dio->aio.work); + goto release_bio; + } + + /* + * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule + * our completion that way to avoid an async punt to a workqueue. + */ + if (dio->flags & IOMAP_DIO_CALLER_COMP) { + /* only polled IO cares about private cleared */ + iocb->private = dio; + iocb->dio_complete = iomap_dio_deferred_complete; + + /* + * Invoke ->ki_complete() directly. We've assigned our + * dio_complete callback handler, and since the issuer set + * IOCB_DIO_CALLER_COMP, we know their ki_complete handler will + * notice ->dio_complete being set and will defer calling that + * handler until it can be done from a safe task context. + * + * Note that the 'res' being passed in here is not important + * for this case. The actual completion value of the request + * will be gotten from dio_complete when that is run by the + * issuer. + */ + iocb->ki_complete(iocb, 0); + goto release_bio; } + /* + * Async DIO completion that requires filesystem level completion work + * gets punted to a work queue to complete as the operation may require + * more IO to be issued to finalise filesystem metadata changes or + * guarantee data integrity. + */ + INIT_WORK(&dio->aio.work, iomap_dio_complete_work); + queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq, + &dio->aio.work); +release_bio: if (should_dirty) { bio_check_pages_dirty(bio); } else { @@ -203,7 +253,7 @@ static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, /* * Figure out the bio's operation flags from the dio request, the * mapping, and whether or not we want FUA. Note that we can end up - * clearing the WRITE_FUA flag in the dio request. + * clearing the WRITE_THROUGH flag in the dio request. */ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, const struct iomap *iomap, bool use_fua) @@ -217,7 +267,7 @@ static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, if (use_fua) opflags |= REQ_FUA; else - dio->flags &= ~IOMAP_DIO_WRITE_FUA; + dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; return opflags; } @@ -257,12 +307,19 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, * Use a FUA write if we need datasync semantics, this is a pure * data IO that doesn't require any metadata updates (including * after IO completion such as unwritten extent conversion) and - * the underlying device supports FUA. This allows us to avoid - * cache flushes on IO completion. + * the underlying device either supports FUA or doesn't have + * a volatile write cache. This allows us to avoid cache flushes + * on IO completion. If we can't use writethrough and need to + * sync, disable in-task completions as dio completion will + * need to call generic_write_sync() which will do a blocking + * fsync / cache flush call. */ if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && - (dio->flags & IOMAP_DIO_WRITE_FUA) && bdev_fua(iomap->bdev)) + (dio->flags & IOMAP_DIO_WRITE_THROUGH) && + (bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev))) use_fua = true; + else if (dio->flags & IOMAP_DIO_NEED_SYNC) + dio->flags &= ~IOMAP_DIO_CALLER_COMP; } /* @@ -277,10 +334,23 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, goto out; /* - * We can only poll for single bio I/Os. + * We can only do deferred completion for pure overwrites that + * don't require additional IO at completion. This rules out + * writes that need zeroing or extent conversion, extend + * the file size, or issue journal IO or cache flushes + * during completion processing. */ if (need_zeroout || + ((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) || ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) + dio->flags &= ~IOMAP_DIO_CALLER_COMP; + + /* + * The rules for polled IO completions follow the guidelines as the + * ones we set for inline and deferred completions. If none of those + * are available for this IO, clear the polled flag. + */ + if (!(dio->flags & (IOMAP_DIO_INLINE_COMP|IOMAP_DIO_CALLER_COMP))) dio->iocb->ki_flags &= ~IOCB_HIPRI; if (need_zeroout) { @@ -505,12 +575,14 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->submit.iter = iter; dio->submit.waiter = current; - dio->submit.poll_bio = NULL; if (iocb->ki_flags & IOCB_NOWAIT) iomi.flags |= IOMAP_NOWAIT; if (iov_iter_rw(iter) == READ) { + /* reads can always complete inline */ + dio->flags |= IOMAP_DIO_INLINE_COMP; + if (iomi.pos >= dio->i_size) goto out_free_dio; @@ -524,6 +596,15 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, iomi.flags |= IOMAP_WRITE; dio->flags |= IOMAP_DIO_WRITE; + /* + * Flag as supporting deferred completions, if the issuer + * groks it. This can avoid a workqueue punt for writes. + * We may later clear this flag if we need to do other IO + * as part of this IO completion. + */ + if (iocb->ki_flags & IOCB_DIO_CALLER_COMP) + dio->flags |= IOMAP_DIO_CALLER_COMP; + if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) { ret = -EAGAIN; if (iomi.pos >= dio->i_size || @@ -537,13 +618,16 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->flags |= IOMAP_DIO_NEED_SYNC; /* - * For datasync only writes, we optimistically try - * using FUA for this IO. Any non-FUA write that - * occurs will clear this flag, hence we know before - * completion whether a cache flush is necessary. + * For datasync only writes, we optimistically try using + * WRITE_THROUGH for this IO. This flag requires either + * FUA writes through the device's write cache, or a + * normal write to a device without a volatile write + * cache. For the former, Any non-FUA write that occurs + * will clear this flag, hence we know before completion + * whether a cache flush is necessary. */ if (!(iocb->ki_flags & IOCB_SYNC)) - dio->flags |= IOMAP_DIO_WRITE_FUA; + dio->flags |= IOMAP_DIO_WRITE_THROUGH; } /* @@ -605,14 +689,13 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, iomap_dio_set_error(dio, ret); /* - * If all the writes we issued were FUA, we don't need to flush the - * cache on IO completion. Clear the sync flag for this case. + * If all the writes we issued were already written through to the + * media, we don't need to flush the cache on IO completion. Clear the + * sync flag for this case. */ - if (dio->flags & IOMAP_DIO_WRITE_FUA) + if (dio->flags & IOMAP_DIO_WRITE_THROUGH) dio->flags &= ~IOMAP_DIO_NEED_SYNC; - WRITE_ONCE(iocb->private, dio->submit.poll_bio); - /* * We are about to drop our additional submission reference, which * might be the last reference to the dio. There are three different diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index df9d70588b60..2ee21286ac8f 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1422,13 +1422,8 @@ static int isofs_read_inode(struct inode *inode, int relocated) inode->i_ino, de->flags[-high_sierra]); } #endif - - inode->i_mtime.tv_sec = - inode->i_atime.tv_sec = - inode->i_ctime.tv_sec = iso_date(de->date, high_sierra); - inode->i_mtime.tv_nsec = - inode->i_atime.tv_nsec = - inode->i_ctime.tv_nsec = 0; + inode->i_mtime = inode->i_atime = + inode_set_ctime(inode, iso_date(de->date, high_sierra), 0); ei->i_first_extent = (isonum_733(de->extent) + isonum_711(de->ext_attr_length)); diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index 48f58c6c9e69..348783a70f57 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -421,10 +421,9 @@ repeat: /* Rock ridge never appears on a High Sierra disk */ cnt = 0; if (rr->u.TF.flags & TF_CREATE) { - inode->i_ctime.tv_sec = - iso_date(rr->u.TF.times[cnt++].time, - 0); - inode->i_ctime.tv_nsec = 0; + inode_set_ctime(inode, + iso_date(rr->u.TF.times[cnt++].time, 0), + 0); } if (rr->u.TF.flags & TF_MODIFY) { inode->i_mtime.tv_sec = @@ -439,10 +438,9 @@ repeat: inode->i_atime.tv_nsec = 0; } if (rr->u.TF.flags & TF_ATTRIBUTES) { - inode->i_ctime.tv_sec = - iso_date(rr->u.TF.times[cnt++].time, - 0); - inode->i_ctime.tv_nsec = 0; + inode_set_ctime(inode, + iso_date(rr->u.TF.times[cnt++].time, 0), + 0); } break; case SIG('S', 'L'): @@ -534,7 +532,7 @@ repeat: inode->i_size = reloc->i_size; inode->i_blocks = reloc->i_blocks; inode->i_atime = reloc->i_atime; - inode->i_ctime = reloc->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(reloc)); inode->i_mtime = reloc->i_mtime; iput(reloc); break; diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 5075a0a6d594..091ab0eaabbe 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -204,7 +204,8 @@ static int jffs2_create(struct mnt_idmap *idmap, struct inode *dir_i, if (ret) goto fail; - dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime)); + dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, + ITIME(je32_to_cpu(ri->ctime))); jffs2_free_raw_inode(ri); @@ -237,7 +238,7 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry) if (dead_f->inocache) set_nlink(d_inode(dentry), dead_f->inocache->pino_nlink); if (!ret) - dir_i->i_mtime = dir_i->i_ctime = ITIME(now); + dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, ITIME(now)); return ret; } /***********************************************************************/ @@ -271,7 +272,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de set_nlink(d_inode(old_dentry), ++f->inocache->pino_nlink); mutex_unlock(&f->sem); d_instantiate(dentry, d_inode(old_dentry)); - dir_i->i_mtime = dir_i->i_ctime = ITIME(now); + dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, ITIME(now)); ihold(d_inode(old_dentry)); } return ret; @@ -422,7 +423,8 @@ static int jffs2_symlink (struct mnt_idmap *idmap, struct inode *dir_i, goto fail; } - dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); + dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, + ITIME(je32_to_cpu(rd->mctime))); jffs2_free_raw_dirent(rd); @@ -566,7 +568,8 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i, goto fail; } - dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); + dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, + ITIME(je32_to_cpu(rd->mctime))); inc_nlink(dir_i); jffs2_free_raw_dirent(rd); @@ -607,7 +610,7 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name, dentry->d_name.len, f, now); if (!ret) { - dir_i->i_mtime = dir_i->i_ctime = ITIME(now); + dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, ITIME(now)); clear_nlink(d_inode(dentry)); drop_nlink(dir_i); } @@ -743,7 +746,8 @@ static int jffs2_mknod (struct mnt_idmap *idmap, struct inode *dir_i, goto fail; } - dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); + dir_i->i_mtime = inode_set_ctime_to_ts(dir_i, + ITIME(je32_to_cpu(rd->mctime))); jffs2_free_raw_dirent(rd); @@ -864,14 +868,16 @@ static int jffs2_rename (struct mnt_idmap *idmap, * caller won't do it on its own since we are returning an error. */ d_invalidate(new_dentry); - new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now); + new_dir_i->i_mtime = inode_set_ctime_to_ts(new_dir_i, + ITIME(now)); return ret; } if (d_is_dir(old_dentry)) drop_nlink(old_dir_i); - new_dir_i->i_mtime = new_dir_i->i_ctime = old_dir_i->i_mtime = old_dir_i->i_ctime = ITIME(now); + old_dir_i->i_mtime = inode_set_ctime_to_ts(old_dir_i, ITIME(now)); + new_dir_i->i_mtime = inode_set_ctime_to_ts(new_dir_i, ITIME(now)); return 0; } diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 2345ca3f09ee..11c66793960e 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -317,7 +317,8 @@ static int jffs2_write_end(struct file *filp, struct address_space *mapping, inode->i_size = pos + writtenlen; inode->i_blocks = (inode->i_size + 511) >> 9; - inode->i_ctime = inode->i_mtime = ITIME(je32_to_cpu(ri->ctime)); + inode->i_mtime = inode_set_ctime_to_ts(inode, + ITIME(je32_to_cpu(ri->ctime))); } } diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 038516bee1ab..0403efab4089 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -115,7 +115,7 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) ri->isize = cpu_to_je32((ivalid & ATTR_SIZE)?iattr->ia_size:inode->i_size); ri->atime = cpu_to_je32(I_SEC((ivalid & ATTR_ATIME)?iattr->ia_atime:inode->i_atime)); ri->mtime = cpu_to_je32(I_SEC((ivalid & ATTR_MTIME)?iattr->ia_mtime:inode->i_mtime)); - ri->ctime = cpu_to_je32(I_SEC((ivalid & ATTR_CTIME)?iattr->ia_ctime:inode->i_ctime)); + ri->ctime = cpu_to_je32(I_SEC((ivalid & ATTR_CTIME)?iattr->ia_ctime:inode_get_ctime(inode))); ri->offset = cpu_to_je32(0); ri->csize = ri->dsize = cpu_to_je32(mdatalen); @@ -148,7 +148,7 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) } /* It worked. Update the inode */ inode->i_atime = ITIME(je32_to_cpu(ri->atime)); - inode->i_ctime = ITIME(je32_to_cpu(ri->ctime)); + inode_set_ctime_to_ts(inode, ITIME(je32_to_cpu(ri->ctime))); inode->i_mtime = ITIME(je32_to_cpu(ri->mtime)); inode->i_mode = jemode_to_cpu(ri->mode); i_uid_write(inode, je16_to_cpu(ri->uid)); @@ -284,7 +284,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) inode->i_size = je32_to_cpu(latest_node.isize); inode->i_atime = ITIME(je32_to_cpu(latest_node.atime)); inode->i_mtime = ITIME(je32_to_cpu(latest_node.mtime)); - inode->i_ctime = ITIME(je32_to_cpu(latest_node.ctime)); + inode_set_ctime_to_ts(inode, ITIME(je32_to_cpu(latest_node.ctime))); set_nlink(inode, f->inocache->pino_nlink); @@ -388,7 +388,7 @@ void jffs2_dirty_inode(struct inode *inode, int flags) iattr.ia_gid = inode->i_gid; iattr.ia_atime = inode->i_atime; iattr.ia_mtime = inode->i_mtime; - iattr.ia_ctime = inode->i_ctime; + iattr.ia_ctime = inode_get_ctime(inode); jffs2_do_setattr(inode, &iattr); } @@ -475,7 +475,7 @@ struct inode *jffs2_new_inode (struct inode *dir_i, umode_t mode, struct jffs2_r inode->i_mode = jemode_to_cpu(ri->mode); i_gid_write(inode, je16_to_cpu(ri->gid)); i_uid_write(inode, je16_to_cpu(ri->uid)); - inode->i_atime = inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); ri->atime = ri->mtime = ri->ctime = cpu_to_je32(I_SEC(inode->i_mtime)); inode->i_blocks = 0; diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 8da19766c101..50727a1ff931 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -35,7 +35,7 @@ struct kvec; #define ITIME(sec) ((struct timespec64){sec, 0}) #define JFFS2_NOW() JFFS2_CLAMP_TIME(ktime_get_real_seconds()) #define I_SEC(tv) JFFS2_CLAMP_TIME((tv).tv_sec) -#define JFFS2_F_I_CTIME(f) I_SEC(OFNI_EDONI_2SFFJ(f)->i_ctime) +#define JFFS2_F_I_CTIME(f) I_SEC(inode_get_ctime(OFNI_EDONI_2SFFJ(f))) #define JFFS2_F_I_MTIME(f) I_SEC(OFNI_EDONI_2SFFJ(f)->i_mtime) #define JFFS2_F_I_ATIME(f) I_SEC(OFNI_EDONI_2SFFJ(f)->i_atime) #define sleep_on_spinunlock(wq, s) \ diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index fb96f872d207..1de3602c98de 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -116,7 +116,7 @@ int jfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, if (!rc) { if (update_mode) { inode->i_mode = mode; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); } rc = txCommit(tid, 1, &inode, 0); diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 8ac10e396050..920d58a1566b 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -393,7 +393,7 @@ void jfs_truncate_nolock(struct inode *ip, loff_t length) break; } - ip->i_mtime = ip->i_ctime = current_time(ip); + ip->i_mtime = inode_set_ctime_current(ip); mark_inode_dirty(ip); txCommit(tid, 1, &ip, 0); diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index ed7989bc2db1..f7bd7e8f5be4 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -96,7 +96,7 @@ int jfs_fileattr_set(struct mnt_idmap *idmap, jfs_inode->mode2 = flags; jfs_set_inode_flags(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); return 0; diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 390cbfce391f..a40383aa6c84 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -3064,8 +3064,8 @@ static int copy_from_dinode(struct dinode * dip, struct inode *ip) ip->i_atime.tv_nsec = le32_to_cpu(dip->di_atime.tv_nsec); ip->i_mtime.tv_sec = le32_to_cpu(dip->di_mtime.tv_sec); ip->i_mtime.tv_nsec = le32_to_cpu(dip->di_mtime.tv_nsec); - ip->i_ctime.tv_sec = le32_to_cpu(dip->di_ctime.tv_sec); - ip->i_ctime.tv_nsec = le32_to_cpu(dip->di_ctime.tv_nsec); + inode_set_ctime(ip, le32_to_cpu(dip->di_ctime.tv_sec), + le32_to_cpu(dip->di_ctime.tv_nsec)); ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks)); ip->i_generation = le32_to_cpu(dip->di_gen); @@ -3139,8 +3139,8 @@ static void copy_to_dinode(struct dinode * dip, struct inode *ip) dip->di_atime.tv_sec = cpu_to_le32(ip->i_atime.tv_sec); dip->di_atime.tv_nsec = cpu_to_le32(ip->i_atime.tv_nsec); - dip->di_ctime.tv_sec = cpu_to_le32(ip->i_ctime.tv_sec); - dip->di_ctime.tv_nsec = cpu_to_le32(ip->i_ctime.tv_nsec); + dip->di_ctime.tv_sec = cpu_to_le32(inode_get_ctime(ip).tv_sec); + dip->di_ctime.tv_nsec = cpu_to_le32(inode_get_ctime(ip).tv_nsec); dip->di_mtime.tv_sec = cpu_to_le32(ip->i_mtime.tv_sec); dip->di_mtime.tv_nsec = cpu_to_le32(ip->i_mtime.tv_nsec); dip->di_ixpxd = jfs_ip->ixpxd; /* in-memory pxd's are little-endian */ diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 9e1f02767201..87594efa7f7c 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -97,8 +97,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode) jfs_inode->mode2 |= inode->i_mode; inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); - jfs_inode->otime = inode->i_ctime.tv_sec; + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); + jfs_inode->otime = inode_get_ctime(inode).tv_sec; inode->i_generation = JFS_SBI(sb)->gengen++; jfs_inode->cflag = 0; diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index e98ddb2b1cf2..029d47065600 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -149,7 +149,7 @@ static int jfs_create(struct mnt_idmap *idmap, struct inode *dip, mark_inode_dirty(ip); - dip->i_ctime = dip->i_mtime = current_time(dip); + dip->i_mtime = inode_set_ctime_current(dip); mark_inode_dirty(dip); @@ -284,7 +284,7 @@ static int jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip, /* update parent directory inode */ inc_nlink(dip); /* for '..' from child directory */ - dip->i_ctime = dip->i_mtime = current_time(dip); + dip->i_mtime = inode_set_ctime_current(dip); mark_inode_dirty(dip); rc = txCommit(tid, 2, &iplist[0], 0); @@ -390,7 +390,7 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) /* update parent directory's link count corresponding * to ".." entry of the target directory deleted */ - dip->i_ctime = dip->i_mtime = current_time(dip); + dip->i_mtime = inode_set_ctime_current(dip); inode_dec_link_count(dip); /* @@ -512,7 +512,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) ASSERT(ip->i_nlink); - ip->i_ctime = dip->i_ctime = dip->i_mtime = current_time(ip); + dip->i_mtime = inode_set_ctime_to_ts(dip, inode_set_ctime_current(ip)); mark_inode_dirty(dip); /* update target's inode */ @@ -827,8 +827,8 @@ static int jfs_link(struct dentry *old_dentry, /* update object inode */ inc_nlink(ip); /* for new link */ - ip->i_ctime = current_time(ip); - dir->i_ctime = dir->i_mtime = current_time(dir); + inode_set_ctime_current(ip); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); ihold(ip); @@ -1028,7 +1028,7 @@ static int jfs_symlink(struct mnt_idmap *idmap, struct inode *dip, mark_inode_dirty(ip); - dip->i_ctime = dip->i_mtime = current_time(dip); + dip->i_mtime = inode_set_ctime_current(dip); mark_inode_dirty(dip); /* * commit update of parent directory and link object @@ -1205,7 +1205,7 @@ static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, tblk->xflag |= COMMIT_DELETE; tblk->u.ip = new_ip; } else { - new_ip->i_ctime = current_time(new_ip); + inode_set_ctime_current(new_ip); mark_inode_dirty(new_ip); } } else { @@ -1268,10 +1268,10 @@ static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, /* * Update ctime on changed/moved inodes & mark dirty */ - old_ip->i_ctime = current_time(old_ip); + inode_set_ctime_current(old_ip); mark_inode_dirty(old_ip); - new_dir->i_ctime = new_dir->i_mtime = current_time(new_dir); + new_dir->i_mtime = inode_set_ctime_current(new_dir); mark_inode_dirty(new_dir); /* Build list of inodes modified by this transaction */ @@ -1283,7 +1283,7 @@ static int jfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (old_dir != new_dir) { iplist[ipcount++] = new_dir; - old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); + old_dir->i_mtime = inode_set_ctime_current(old_dir); mark_inode_dirty(old_dir); } @@ -1416,7 +1416,7 @@ static int jfs_mknod(struct mnt_idmap *idmap, struct inode *dir, mark_inode_dirty(ip); - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); diff --git a/fs/jfs/super.c b/fs/jfs/super.c index d2f82cb7db1b..2e2f7f6d36a0 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -818,7 +818,7 @@ out: } if (inode->i_size < off+len-towrite) i_size_write(inode, off+len-towrite); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); mark_inode_dirty(inode); inode_unlock(inode); return len - towrite; diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 931e50018f88..8577ad494e05 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -647,7 +647,7 @@ static int ea_put(tid_t tid, struct inode *inode, struct ea_buffer *ea_buf, if (old_blocks) dquot_free_block(inode, old_blocks); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); return 0; } diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c index 5d826274570c..c429c42a6867 100644 --- a/fs/kernel_read_file.c +++ b/fs/kernel_read_file.c @@ -8,16 +8,16 @@ /** * kernel_read_file() - read file contents into a kernel buffer * - * @file file to read from - * @offset where to start reading from (see below). - * @buf pointer to a "void *" buffer for reading into (if + * @file: file to read from + * @offset: where to start reading from (see below). + * @buf: pointer to a "void *" buffer for reading into (if * *@buf is NULL, a buffer will be allocated, and * @buf_size will be ignored) - * @buf_size size of buf, if already allocated. If @buf not + * @buf_size: size of buf, if already allocated. If @buf not * allocated, this is the largest size to allocate. - * @file_size if non-NULL, the full size of @file will be + * @file_size: if non-NULL, the full size of @file will be * written here. - * @id the kernel_read_file_id identifying the type of + * @id: the kernel_read_file_id identifying the type of * file contents being read (for LSMs to examine) * * @offset must be 0 unless both @buf and @file_size are non-NULL diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 5a1a4af9d3d2..660995856a04 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -556,7 +556,7 @@ void kernfs_put(struct kernfs_node *kn) kfree_const(kn->name); if (kn->iattr) { - simple_xattrs_free(&kn->iattr->xattrs); + simple_xattrs_free(&kn->iattr->xattrs, NULL); kmem_cache_free(kernfs_iattrs_cache, kn->iattr); } spin_lock(&kernfs_idr_lock); diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index b22b74d1a115..922719a343a7 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -151,8 +151,7 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) static inline void set_default_inode_attr(struct inode *inode, umode_t mode) { inode->i_mode = mode; - inode->i_atime = inode->i_mtime = - inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); } static inline void set_inode_attr(struct inode *inode, @@ -162,7 +161,7 @@ static inline void set_inode_attr(struct inode *inode, inode->i_gid = attrs->ia_gid; inode->i_atime = attrs->ia_atime; inode->i_mtime = attrs->ia_mtime; - inode->i_ctime = attrs->ia_ctime; + inode_set_ctime_to_ts(inode, attrs->ia_ctime); } static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) @@ -191,7 +190,7 @@ int kernfs_iop_getattr(struct mnt_idmap *idmap, down_read(&root->kernfs_iattr_rwsem); kernfs_refresh_inode(kn, inode); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); up_read(&root->kernfs_iattr_rwsem); return 0; @@ -306,11 +305,17 @@ int kernfs_xattr_get(struct kernfs_node *kn, const char *name, int kernfs_xattr_set(struct kernfs_node *kn, const char *name, const void *value, size_t size, int flags) { + struct simple_xattr *old_xattr; struct kernfs_iattrs *attrs = kernfs_iattrs(kn); if (!attrs) return -ENOMEM; - return simple_xattr_set(&attrs->xattrs, name, value, size, flags, NULL); + old_xattr = simple_xattr_set(&attrs->xattrs, name, value, size, flags); + if (IS_ERR(old_xattr)) + return PTR_ERR(old_xattr); + + simple_xattr_free(old_xattr); + return 0; } static int kernfs_vfs_xattr_get(const struct xattr_handler *handler, @@ -342,7 +347,7 @@ static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn, { atomic_t *sz = &kn->iattr->user_xattr_size; atomic_t *nr = &kn->iattr->nr_user_xattrs; - ssize_t removed_size; + struct simple_xattr *old_xattr; int ret; if (atomic_inc_return(nr) > KERNFS_MAX_USER_XATTRS) { @@ -355,13 +360,18 @@ static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn, goto dec_size_out; } - ret = simple_xattr_set(xattrs, full_name, value, size, flags, - &removed_size); - - if (!ret && removed_size >= 0) - size = removed_size; - else if (!ret) + old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags); + if (!old_xattr) return 0; + + if (IS_ERR(old_xattr)) { + ret = PTR_ERR(old_xattr); + goto dec_size_out; + } + + ret = 0; + size = old_xattr->size; + simple_xattr_free(old_xattr); dec_size_out: atomic_sub(size, sz); dec_count_out: @@ -376,18 +386,19 @@ static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn, { atomic_t *sz = &kn->iattr->user_xattr_size; atomic_t *nr = &kn->iattr->nr_user_xattrs; - ssize_t removed_size; - int ret; + struct simple_xattr *old_xattr; - ret = simple_xattr_set(xattrs, full_name, value, size, flags, - &removed_size); + old_xattr = simple_xattr_set(xattrs, full_name, value, size, flags); + if (!old_xattr) + return 0; - if (removed_size >= 0) { - atomic_sub(removed_size, sz); - atomic_dec(nr); - } + if (IS_ERR(old_xattr)) + return PTR_ERR(old_xattr); - return ret; + atomic_sub(old_xattr->size, sz); + atomic_dec(nr); + simple_xattr_free(old_xattr); + return 0; } static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler, diff --git a/fs/libfs.c b/fs/libfs.c index 5b851315eeed..da78eb64831e 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -33,7 +33,7 @@ int simple_getattr(struct mnt_idmap *idmap, const struct path *path, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9); return 0; } @@ -239,6 +239,254 @@ const struct inode_operations simple_dir_inode_operations = { }; EXPORT_SYMBOL(simple_dir_inode_operations); +static void offset_set(struct dentry *dentry, u32 offset) +{ + dentry->d_fsdata = (void *)((uintptr_t)(offset)); +} + +static u32 dentry2offset(struct dentry *dentry) +{ + return (u32)((uintptr_t)(dentry->d_fsdata)); +} + +static struct lock_class_key simple_offset_xa_lock; + +/** + * simple_offset_init - initialize an offset_ctx + * @octx: directory offset map to be initialized + * + */ +void simple_offset_init(struct offset_ctx *octx) +{ + xa_init_flags(&octx->xa, XA_FLAGS_ALLOC1); + lockdep_set_class(&octx->xa.xa_lock, &simple_offset_xa_lock); + + /* 0 is '.', 1 is '..', so always start with offset 2 */ + octx->next_offset = 2; +} + +/** + * simple_offset_add - Add an entry to a directory's offset map + * @octx: directory offset ctx to be updated + * @dentry: new dentry being added + * + * Returns zero on success. @so_ctx and the dentry offset are updated. + * Otherwise, a negative errno value is returned. + */ +int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry) +{ + static const struct xa_limit limit = XA_LIMIT(2, U32_MAX); + u32 offset; + int ret; + + if (dentry2offset(dentry) != 0) + return -EBUSY; + + ret = xa_alloc_cyclic(&octx->xa, &offset, dentry, limit, + &octx->next_offset, GFP_KERNEL); + if (ret < 0) + return ret; + + offset_set(dentry, offset); + return 0; +} + +/** + * simple_offset_remove - Remove an entry to a directory's offset map + * @octx: directory offset ctx to be updated + * @dentry: dentry being removed + * + */ +void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry) +{ + u32 offset; + + offset = dentry2offset(dentry); + if (offset == 0) + return; + + xa_erase(&octx->xa, offset); + offset_set(dentry, 0); +} + +/** + * simple_offset_rename_exchange - exchange rename with directory offsets + * @old_dir: parent of dentry being moved + * @old_dentry: dentry being moved + * @new_dir: destination parent + * @new_dentry: destination dentry + * + * Returns zero on success. Otherwise a negative errno is returned and the + * rename is rolled back. + */ +int simple_offset_rename_exchange(struct inode *old_dir, + struct dentry *old_dentry, + struct inode *new_dir, + struct dentry *new_dentry) +{ + struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir); + struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir); + u32 old_index = dentry2offset(old_dentry); + u32 new_index = dentry2offset(new_dentry); + int ret; + + simple_offset_remove(old_ctx, old_dentry); + simple_offset_remove(new_ctx, new_dentry); + + ret = simple_offset_add(new_ctx, old_dentry); + if (ret) + goto out_restore; + + ret = simple_offset_add(old_ctx, new_dentry); + if (ret) { + simple_offset_remove(new_ctx, old_dentry); + goto out_restore; + } + + ret = simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); + if (ret) { + simple_offset_remove(new_ctx, old_dentry); + simple_offset_remove(old_ctx, new_dentry); + goto out_restore; + } + return 0; + +out_restore: + offset_set(old_dentry, old_index); + xa_store(&old_ctx->xa, old_index, old_dentry, GFP_KERNEL); + offset_set(new_dentry, new_index); + xa_store(&new_ctx->xa, new_index, new_dentry, GFP_KERNEL); + return ret; +} + +/** + * simple_offset_destroy - Release offset map + * @octx: directory offset ctx that is about to be destroyed + * + * During fs teardown (eg. umount), a directory's offset map might still + * contain entries. xa_destroy() cleans out anything that remains. + */ +void simple_offset_destroy(struct offset_ctx *octx) +{ + xa_destroy(&octx->xa); +} + +/** + * offset_dir_llseek - Advance the read position of a directory descriptor + * @file: an open directory whose position is to be updated + * @offset: a byte offset + * @whence: enumerator describing the starting position for this update + * + * SEEK_END, SEEK_DATA, and SEEK_HOLE are not supported for directories. + * + * Returns the updated read position if successful; otherwise a + * negative errno is returned and the read position remains unchanged. + */ +static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence) +{ + switch (whence) { + case SEEK_CUR: + offset += file->f_pos; + fallthrough; + case SEEK_SET: + if (offset >= 0) + break; + fallthrough; + default: + return -EINVAL; + } + + return vfs_setpos(file, offset, U32_MAX); +} + +static struct dentry *offset_find_next(struct xa_state *xas) +{ + struct dentry *child, *found = NULL; + + rcu_read_lock(); + child = xas_next_entry(xas, U32_MAX); + if (!child) + goto out; + spin_lock(&child->d_lock); + if (simple_positive(child)) + found = dget_dlock(child); + spin_unlock(&child->d_lock); +out: + rcu_read_unlock(); + return found; +} + +static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry) +{ + u32 offset = dentry2offset(dentry); + struct inode *inode = d_inode(dentry); + + return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset, + inode->i_ino, fs_umode_to_dtype(inode->i_mode)); +} + +static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx) +{ + struct offset_ctx *so_ctx = inode->i_op->get_offset_ctx(inode); + XA_STATE(xas, &so_ctx->xa, ctx->pos); + struct dentry *dentry; + + while (true) { + dentry = offset_find_next(&xas); + if (!dentry) + break; + + if (!offset_dir_emit(ctx, dentry)) { + dput(dentry); + break; + } + + dput(dentry); + ctx->pos = xas.xa_index + 1; + } +} + +/** + * offset_readdir - Emit entries starting at offset @ctx->pos + * @file: an open directory to iterate over + * @ctx: directory iteration context + * + * Caller must hold @file's i_rwsem to prevent insertion or removal of + * entries during this call. + * + * On entry, @ctx->pos contains an offset that represents the first entry + * to be read from the directory. + * + * The operation continues until there are no more entries to read, or + * until the ctx->actor indicates there is no more space in the caller's + * output buffer. + * + * On return, @ctx->pos contains an offset that will read the next entry + * in this directory when offset_readdir() is called again with @ctx. + * + * Return values: + * %0 - Complete + */ +static int offset_readdir(struct file *file, struct dir_context *ctx) +{ + struct dentry *dir = file->f_path.dentry; + + lockdep_assert_held(&d_inode(dir)->i_rwsem); + + if (!dir_emit_dots(file, ctx)) + return 0; + + offset_iterate_dir(d_inode(dir), ctx); + return 0; +} + +const struct file_operations simple_offset_dir_operations = { + .llseek = offset_dir_llseek, + .iterate_shared = offset_readdir, + .read = generic_read_dir, + .fsync = noop_fsync, +}; + static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev) { struct dentry *child = NULL; @@ -275,7 +523,7 @@ void simple_recursive_removal(struct dentry *dentry, while ((child = find_next_child(this, victim)) == NULL) { // kill and ascend // update metadata while it's still locked - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); clear_nlink(inode); inode_unlock(inode); victim = this; @@ -293,8 +541,7 @@ void simple_recursive_removal(struct dentry *dentry, dput(victim); // unpin it } if (victim == dentry) { - inode->i_ctime = inode->i_mtime = - current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); if (d_is_dir(dentry)) drop_nlink(inode); inode_unlock(inode); @@ -335,7 +582,7 @@ static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc) */ root->i_ino = 1; root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; - root->i_atime = root->i_mtime = root->i_ctime = current_time(root); + root->i_atime = root->i_mtime = inode_set_ctime_current(root); s->s_root = d_make_root(root); if (!s->s_root) return -ENOMEM; @@ -391,7 +638,8 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den { struct inode *inode = d_inode(old_dentry); - inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); + dir->i_mtime = inode_set_ctime_to_ts(dir, + inode_set_ctime_current(inode)); inc_nlink(inode); ihold(inode); dget(dentry); @@ -425,7 +673,8 @@ int simple_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); - inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); + dir->i_mtime = inode_set_ctime_to_ts(dir, + inode_set_ctime_current(inode)); drop_nlink(inode); dput(dentry); return 0; @@ -444,6 +693,31 @@ int simple_rmdir(struct inode *dir, struct dentry *dentry) } EXPORT_SYMBOL(simple_rmdir); +/** + * simple_rename_timestamp - update the various inode timestamps for rename + * @old_dir: old parent directory + * @old_dentry: dentry that is being renamed + * @new_dir: new parent directory + * @new_dentry: target for rename + * + * POSIX mandates that the old and new parent directories have their ctime and + * mtime updated, and that inodes of @old_dentry and @new_dentry (if any), have + * their ctime updated. + */ +void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct inode *newino = d_inode(new_dentry); + + old_dir->i_mtime = inode_set_ctime_current(old_dir); + if (new_dir != old_dir) + new_dir->i_mtime = inode_set_ctime_current(new_dir); + inode_set_ctime_current(d_inode(old_dentry)); + if (newino) + inode_set_ctime_current(newino); +} +EXPORT_SYMBOL_GPL(simple_rename_timestamp); + int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { @@ -459,11 +733,7 @@ int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, inc_nlink(old_dir); } } - old_dir->i_ctime = old_dir->i_mtime = - new_dir->i_ctime = new_dir->i_mtime = - d_inode(old_dentry)->i_ctime = - d_inode(new_dentry)->i_ctime = current_time(old_dir); - + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); return 0; } EXPORT_SYMBOL_GPL(simple_rename_exchange); @@ -472,7 +742,6 @@ int simple_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - struct inode *inode = d_inode(old_dentry); int they_are_dirs = d_is_dir(old_dentry); if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) @@ -495,9 +764,7 @@ int simple_rename(struct mnt_idmap *idmap, struct inode *old_dir, inc_nlink(new_dir); } - old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime = - new_dir->i_mtime = inode->i_ctime = current_time(old_dir); - + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); return 0; } EXPORT_SYMBOL(simple_rename); @@ -548,21 +815,20 @@ int simple_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, void **fsdata) { - struct page *page; - pgoff_t index; - - index = pos >> PAGE_SHIFT; + struct folio *folio; - page = grab_cache_page_write_begin(mapping, index); - if (!page) - return -ENOMEM; + folio = __filemap_get_folio(mapping, pos / PAGE_SIZE, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); + if (IS_ERR(folio)) + return PTR_ERR(folio); - *pagep = page; + *pagep = &folio->page; - if (!PageUptodate(page) && (len != PAGE_SIZE)) { - unsigned from = pos & (PAGE_SIZE - 1); + if (!folio_test_uptodate(folio) && (len != folio_size(folio))) { + size_t from = offset_in_folio(folio, pos); - zero_user_segments(page, 0, from, from + len, PAGE_SIZE); + folio_zero_segments(folio, 0, from, + from + len, folio_size(folio)); } return 0; } @@ -594,17 +860,18 @@ static int simple_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { - struct inode *inode = page->mapping->host; + struct folio *folio = page_folio(page); + struct inode *inode = folio->mapping->host; loff_t last_pos = pos + copied; - /* zero the stale part of the page if we did a short copy */ - if (!PageUptodate(page)) { + /* zero the stale part of the folio if we did a short copy */ + if (!folio_test_uptodate(folio)) { if (copied < len) { - unsigned from = pos & (PAGE_SIZE - 1); + size_t from = offset_in_folio(folio, pos); - zero_user(page, from + copied, len - copied); + folio_zero_range(folio, from + copied, len - copied); } - SetPageUptodate(page); + folio_mark_uptodate(folio); } /* * No need to use i_size_read() here, the i_size @@ -613,9 +880,9 @@ static int simple_write_end(struct file *file, struct address_space *mapping, if (last_pos > inode->i_size) i_size_write(inode, last_pos); - set_page_dirty(page); - unlock_page(page); - put_page(page); + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); return copied; } @@ -659,7 +926,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic, */ inode->i_ino = 1; inode->i_mode = S_IFDIR | 0755; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; set_nlink(inode, 2); @@ -685,7 +952,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic, goto out; } inode->i_mode = S_IFREG | files->mode; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); inode->i_fop = files->ops; inode->i_ino = i; d_add(dentry, inode); @@ -1253,7 +1520,7 @@ struct inode *alloc_anon_inode(struct super_block *s) inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); inode->i_flags |= S_PRIVATE; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); return inode; } EXPORT_SYMBOL(alloc_anon_inode); @@ -1269,7 +1536,7 @@ EXPORT_SYMBOL(alloc_anon_inode); * All arguments are ignored and it just returns -EINVAL. */ int -simple_nosetlease(struct file *filp, long arg, struct file_lock **flp, +simple_nosetlease(struct file *filp, int arg, struct file_lock **flp, void **priv) { return -EINVAL; @@ -1315,7 +1582,7 @@ static int empty_dir_getattr(struct mnt_idmap *idmap, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } diff --git a/fs/locks.c b/fs/locks.c index df8b26a42524..a45efc16945d 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -438,7 +438,7 @@ static void flock_make_lock(struct file *filp, struct file_lock *fl, int type) fl->fl_end = OFFSET_MAX; } -static int assign_type(struct file_lock *fl, long type) +static int assign_type(struct file_lock *fl, int type) { switch (type) { case F_RDLCK: @@ -549,7 +549,7 @@ static const struct lock_manager_operations lease_manager_ops = { /* * Initialize a lease, use the default lock manager operations */ -static int lease_init(struct file *filp, long type, struct file_lock *fl) +static int lease_init(struct file *filp, int type, struct file_lock *fl) { if (assign_type(fl, type) != 0) return -EINVAL; @@ -567,7 +567,7 @@ static int lease_init(struct file *filp, long type, struct file_lock *fl) } /* Allocate a file_lock initialised to this type of lease */ -static struct file_lock *lease_alloc(struct file *filp, long type) +static struct file_lock *lease_alloc(struct file *filp, int type) { struct file_lock *fl = locks_alloc_lock(); int error = -ENOMEM; @@ -868,6 +868,21 @@ static bool posix_locks_conflict(struct file_lock *caller_fl, return locks_conflict(caller_fl, sys_fl); } +/* Determine if lock sys_fl blocks lock caller_fl. Used on xx_GETLK + * path so checks for additional GETLK-specific things like F_UNLCK. + */ +static bool posix_test_locks_conflict(struct file_lock *caller_fl, + struct file_lock *sys_fl) +{ + /* F_UNLCK checks any locks on the same fd. */ + if (caller_fl->fl_type == F_UNLCK) { + if (!posix_same_owner(caller_fl, sys_fl)) + return false; + return locks_overlap(caller_fl, sys_fl); + } + return posix_locks_conflict(caller_fl, sys_fl); +} + /* Determine if lock sys_fl blocks lock caller_fl. FLOCK specific * checking before calling the locks_conflict(). */ @@ -901,7 +916,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl) retry: spin_lock(&ctx->flc_lock); list_for_each_entry(cfl, &ctx->flc_posix, fl_list) { - if (!posix_locks_conflict(fl, cfl)) + if (!posix_test_locks_conflict(fl, cfl)) continue; if (cfl->fl_lmops && cfl->fl_lmops->lm_lock_expirable && (*cfl->fl_lmops->lm_lock_expirable)(cfl)) { @@ -1301,6 +1316,7 @@ retry: out: spin_unlock(&ctx->flc_lock); percpu_up_read(&file_rwsem); + trace_posix_lock_inode(inode, request, error); /* * Free any unused locks. */ @@ -1309,7 +1325,6 @@ retry: if (new_fl2) locks_free_lock(new_fl2); locks_dispose_list(&dispose); - trace_posix_lock_inode(inode, request, error); return error; } @@ -1666,7 +1681,7 @@ int fcntl_getlease(struct file *filp) * conflict with the lease we're trying to set. */ static int -check_conflicting_open(struct file *filp, const long arg, int flags) +check_conflicting_open(struct file *filp, const int arg, int flags) { struct inode *inode = file_inode(filp); int self_wcount = 0, self_rcount = 0; @@ -1701,7 +1716,7 @@ check_conflicting_open(struct file *filp, const long arg, int flags) } static int -generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv) +generic_add_lease(struct file *filp, int arg, struct file_lock **flp, void **priv) { struct file_lock *fl, *my_fl = NULL, *lease; struct inode *inode = file_inode(filp); @@ -1859,7 +1874,7 @@ static int generic_delete_lease(struct file *filp, void *owner) * The (input) flp->fl_lmops->lm_break function is required * by break_lease(). */ -int generic_setlease(struct file *filp, long arg, struct file_lock **flp, +int generic_setlease(struct file *filp, int arg, struct file_lock **flp, void **priv) { struct inode *inode = file_inode(filp); @@ -1906,7 +1921,7 @@ lease_notifier_chain_init(void) } static inline void -setlease_notifier(long arg, struct file_lock *lease) +setlease_notifier(int arg, struct file_lock *lease) { if (arg != F_UNLCK) srcu_notifier_call_chain(&lease_notifier_chain, arg, lease); @@ -1942,7 +1957,7 @@ EXPORT_SYMBOL_GPL(lease_unregister_notifier); * may be NULL if the lm_setup operation doesn't require it. */ int -vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv) +vfs_setlease(struct file *filp, int arg, struct file_lock **lease, void **priv) { if (lease) setlease_notifier(arg, *lease); @@ -1953,7 +1968,7 @@ vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv) } EXPORT_SYMBOL_GPL(vfs_setlease); -static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) +static int do_fcntl_add_lease(unsigned int fd, struct file *filp, int arg) { struct file_lock *fl; struct fasync_struct *new; @@ -1988,7 +2003,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) * Note that you also need to call %F_SETSIG to * receive a signal when the lease is broken. */ -int fcntl_setlease(unsigned int fd, struct file *filp, long arg) +int fcntl_setlease(unsigned int fd, struct file *filp, int arg) { if (arg == F_UNLCK) return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp); @@ -2136,7 +2151,7 @@ EXPORT_SYMBOL_GPL(vfs_test_lock); * @fl: The file_lock who's fl_pid should be translated * @ns: The namespace into which the pid should be translated * - * Used to tranlate a fl_pid into a namespace virtual pid number + * Used to translate a fl_pid into a namespace virtual pid number */ static pid_t locks_translate_pid(struct file_lock *fl, struct pid_namespace *ns) { @@ -2207,7 +2222,8 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock) if (fl == NULL) return -ENOMEM; error = -EINVAL; - if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK) + if (cmd != F_OFD_GETLK && flock->l_type != F_RDLCK + && flock->l_type != F_WRLCK) goto out; error = flock_to_posix_lock(filp, fl, flock); @@ -2414,7 +2430,8 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock) return -ENOMEM; error = -EINVAL; - if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK) + if (cmd != F_OFD_GETLK && flock->l_type != F_RDLCK + && flock->l_type != F_WRLCK) goto out; error = flock64_to_posix_lock(filp, fl, flock); diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index 870207ba23f1..25c08fbfcb9d 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c @@ -251,7 +251,7 @@ struct inode *minix_new_inode(const struct inode *dir, umode_t mode) } inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_ino = j; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); inode->i_blocks = 0; memset(&minix_i(inode)->u, 0, sizeof(minix_i(inode)->u)); insert_inode_hash(inode); diff --git a/fs/minix/dir.c b/fs/minix/dir.c index bf9858f76b6a..20f23e6e58ad 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -281,7 +281,7 @@ got_it: de->inode = inode->i_ino; } dir_commit_chunk(page, pos, sbi->s_dirsize); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); err = minix_handle_dirsync(dir); out_put: @@ -313,7 +313,7 @@ int minix_delete_entry(struct minix_dir_entry *de, struct page *page) else de->inode = 0; dir_commit_chunk(page, pos, len); - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); mark_inode_dirty(inode); return minix_handle_dirsync(inode); } @@ -436,7 +436,7 @@ int minix_set_link(struct minix_dir_entry *de, struct page *page, else de->inode = inode->i_ino; dir_commit_chunk(page, pos, sbi->s_dirsize); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); return minix_handle_dirsync(dir); } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index e9fbb5303a22..df575473c1cc 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -501,10 +501,7 @@ static struct inode *V1_minix_iget(struct inode *inode) i_gid_write(inode, raw_inode->i_gid); set_nlink(inode, raw_inode->i_nlinks); inode->i_size = raw_inode->i_size; - inode->i_mtime.tv_sec = inode->i_atime.tv_sec = inode->i_ctime.tv_sec = raw_inode->i_time; - inode->i_mtime.tv_nsec = 0; - inode->i_atime.tv_nsec = 0; - inode->i_ctime.tv_nsec = 0; + inode->i_mtime = inode->i_atime = inode_set_ctime(inode, raw_inode->i_time, 0); inode->i_blocks = 0; for (i = 0; i < 9; i++) minix_inode->u.i1_data[i] = raw_inode->i_zone[i]; @@ -543,10 +540,9 @@ static struct inode *V2_minix_iget(struct inode *inode) inode->i_size = raw_inode->i_size; inode->i_mtime.tv_sec = raw_inode->i_mtime; inode->i_atime.tv_sec = raw_inode->i_atime; - inode->i_ctime.tv_sec = raw_inode->i_ctime; + inode_set_ctime(inode, raw_inode->i_ctime, 0); inode->i_mtime.tv_nsec = 0; inode->i_atime.tv_nsec = 0; - inode->i_ctime.tv_nsec = 0; inode->i_blocks = 0; for (i = 0; i < 10; i++) minix_inode->u.i2_data[i] = raw_inode->i_zone[i]; @@ -622,7 +618,7 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode) raw_inode->i_size = inode->i_size; raw_inode->i_mtime = inode->i_mtime.tv_sec; raw_inode->i_atime = inode->i_atime.tv_sec; - raw_inode->i_ctime = inode->i_ctime.tv_sec; + raw_inode->i_ctime = inode_get_ctime(inode).tv_sec; if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) raw_inode->i_zone[0] = old_encode_dev(inode->i_rdev); else for (i = 0; i < 10; i++) @@ -660,7 +656,7 @@ int minix_getattr(struct mnt_idmap *idmap, const struct path *path, struct super_block *sb = path->dentry->d_sb; struct inode *inode = d_inode(path->dentry); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (INODE_VERSION(inode) == MINIX_V1) stat->blocks = (BLOCK_SIZE / 512) * V1_minix_blocks(stat->size, sb); else diff --git a/fs/minix/itree_common.c b/fs/minix/itree_common.c index 446148792f41..ce18ae37c29d 100644 --- a/fs/minix/itree_common.c +++ b/fs/minix/itree_common.c @@ -131,7 +131,7 @@ static inline int splice_branch(struct inode *inode, /* We are done with atomic stuff, now do the rest of housekeeping */ - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); /* had we spliced it onto indirect block? */ if (where->bh) @@ -350,7 +350,7 @@ do_indirects: } first_whole++; } - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); mark_inode_dirty(inode); } diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 956d5183828d..114084d5636a 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -98,7 +98,7 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir, { struct inode *inode = d_inode(old_dentry); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_link_count(inode); ihold(inode); return add_nondir(dentry, inode); @@ -154,7 +154,7 @@ static int minix_unlink(struct inode * dir, struct dentry *dentry) if (err) return err; - inode->i_ctime = dir->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); inode_dec_link_count(inode); return 0; } @@ -218,7 +218,7 @@ static int minix_rename(struct mnt_idmap *idmap, put_page(new_page); if (err) goto out_dir; - new_inode->i_ctime = current_time(new_inode); + inode_set_ctime_current(new_inode); if (dir_de) drop_nlink(new_inode); inode_dec_link_count(new_inode); diff --git a/fs/namei.c b/fs/namei.c index e56ff39a79bc..567ee547492b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -643,6 +643,8 @@ static bool nd_alloc_stack(struct nameidata *nd) /** * path_connected - Verify that a dentry is below mnt.mnt_root + * @mnt: The mountpoint to check. + * @dentry: The dentry to check. * * Rename can sometimes move a file or directory outside of a bind * mount, path_connected allows those cases to be detected. @@ -1083,6 +1085,7 @@ fs_initcall(init_fs_namei_sysctls); /** * may_follow_link - Check symlink following for unsafe situations * @nd: nameidata pathwalk data + * @inode: Used for idmapping. * * In the case of the sysctl_protected_symlinks sysctl being enabled, * CAP_DAC_OVERRIDE needs to be specifically ignored if the symlink is @@ -2890,7 +2893,7 @@ int path_pts(struct path *path) dput(path->dentry); path->dentry = parent; child = d_hash_and_lookup(parent, &this); - if (!child) + if (IS_ERR_OR_NULL(child)) return -ENOENT; path->dentry = child; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index c1eda73254e1..6bed1394d748 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -59,7 +59,7 @@ __be32 nfs4_callback_getattr(void *argp, void *resp, res->change_attr = delegation->change_attr; if (nfs_have_writebacks(inode)) res->change_attr++; - res->ctime = inode->i_ctime; + res->ctime = inode_get_ctime(inode); res->mtime = inode->i_mtime; res->bitmap[0] = (FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE) & args->bitmap[0]; diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h index e1706e736c64..2dc64454492b 100644 --- a/fs/nfs/fscache.h +++ b/fs/nfs/fscache.h @@ -116,8 +116,8 @@ static inline void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata * memset(auxdata, 0, sizeof(*auxdata)); auxdata->mtime_sec = inode->i_mtime.tv_sec; auxdata->mtime_nsec = inode->i_mtime.tv_nsec; - auxdata->ctime_sec = inode->i_ctime.tv_sec; - auxdata->ctime_nsec = inode->i_ctime.tv_nsec; + auxdata->ctime_sec = inode_get_ctime(inode).tv_sec; + auxdata->ctime_nsec = inode_get_ctime(inode).tv_nsec; if (NFS_SERVER(inode)->nfs_client->rpc_ops->version == 4) auxdata->change_attr = inode_peek_iversion_raw(inode); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 8172dd4135a1..e21c073158e5 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -514,7 +514,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) memset(&inode->i_atime, 0, sizeof(inode->i_atime)); memset(&inode->i_mtime, 0, sizeof(inode->i_mtime)); - memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); + inode_set_ctime(inode, 0, 0); inode_set_iversion_raw(inode, 0); inode->i_size = 0; clear_nlink(inode); @@ -535,7 +535,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) else if (fattr_supported & NFS_ATTR_FATTR_MTIME) nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = fattr->ctime; + inode_set_ctime_to_ts(inode, fattr->ctime); else if (fattr_supported & NFS_ATTR_FATTR_CTIME) nfs_set_cache_invalid(inode, NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_CHANGE) @@ -731,7 +731,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, if ((attr->ia_valid & ATTR_GID) != 0) inode->i_gid = attr->ia_gid; if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = fattr->ctime; + inode_set_ctime_to_ts(inode, fattr->ctime); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME); @@ -749,7 +749,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = fattr->ctime; + inode_set_ctime_to_ts(inode, fattr->ctime); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME); @@ -765,7 +765,7 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = fattr->ctime; + inode_set_ctime_to_ts(inode, fattr->ctime); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME); @@ -912,7 +912,7 @@ out_no_revalidate: /* Only return attributes that were revalidated. */ stat->result_mask = nfs_get_valid_attrmask(inode) | request_mask; - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); stat->change_cookie = inode_peek_iversion_raw(inode); stat->attributes_mask |= STATX_ATTR_CHANGE_MONOTONIC; @@ -1444,11 +1444,11 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR); } /* If we have atomic WCC data, we may update some attributes */ - ts = inode->i_ctime; + ts = inode_get_ctime(inode); if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) && (fattr->valid & NFS_ATTR_FATTR_CTIME) && timespec64_equal(&ts, &fattr->pre_ctime)) { - inode->i_ctime = fattr->ctime; + inode_set_ctime_to_ts(inode, fattr->ctime); } ts = inode->i_mtime; @@ -1510,7 +1510,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec64_equal(&ts, &fattr->mtime)) invalid |= NFS_INO_INVALID_MTIME; - ts = inode->i_ctime; + ts = inode_get_ctime(inode); if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec64_equal(&ts, &fattr->ctime)) invalid |= NFS_INO_INVALID_CTIME; @@ -1997,7 +1997,7 @@ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fa } if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) { - fattr->pre_ctime = inode->i_ctime; + fattr->pre_ctime = inode_get_ctime(inode); fattr->valid |= NFS_ATTR_FATTR_PRECTIME; } if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 && @@ -2190,7 +2190,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) save_cache_validity & NFS_INO_INVALID_MTIME; if (fattr->valid & NFS_ATTR_FATTR_CTIME) - inode->i_ctime = fattr->ctime; + inode_set_ctime_to_ts(inode, fattr->ctime); else if (fattr_supported & NFS_ATTR_FATTR_CTIME) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_CTIME; diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 19d51ebf842c..e7494cdd957e 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -215,7 +215,8 @@ nfs_namespace_getattr(struct mnt_idmap *idmap, if (NFS_FH(d_inode(path->dentry))->size != 0) return nfs_getattr(idmap, path, stat, request_mask, query_flags); - generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat); + generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry), + stat); return 0; } diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 4c9f8bd866ab..47c5c1f86d66 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -328,7 +328,7 @@ extern int update_open_stateid(struct nfs4_state *state, const nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode); -extern int nfs4_proc_setlease(struct file *file, long arg, +extern int nfs4_proc_setlease(struct file *file, int arg, struct file_lock **lease, void **priv); extern int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 4aeadd6e1a6d..02788c3c85e5 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -438,7 +438,7 @@ void nfs42_ssc_unregister_ops(void) } #endif /* CONFIG_NFS_V4_2 */ -static int nfs4_setlease(struct file *file, long arg, struct file_lock **lease, +static int nfs4_setlease(struct file *file, int arg, struct file_lock **lease, void **priv) { return nfs4_proc_setlease(file, arg, lease, priv); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 832fa226b8f2..d57aaf0cc577 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7579,7 +7579,7 @@ static int nfs4_delete_lease(struct file *file, void **priv) return generic_setlease(file, F_UNLCK, NULL, priv); } -static int nfs4_add_lease(struct file *file, long arg, struct file_lock **lease, +static int nfs4_add_lease(struct file *file, int arg, struct file_lock **lease, void **priv) { struct inode *inode = file_inode(file); @@ -7597,7 +7597,7 @@ static int nfs4_add_lease(struct file *file, long arg, struct file_lock **lease, return -EAGAIN; } -int nfs4_proc_setlease(struct file *file, long arg, struct file_lock **lease, +int nfs4_proc_setlease(struct file *file, int arg, struct file_lock **lease, void **priv) { switch (arg) { diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3aefbad4cc09..daf305daa751 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1354,9 +1354,9 @@ static void revoke_delegation(struct nfs4_delegation *dp) trace_nfsd_stid_revoke(&dp->dl_stid); if (clp->cl_minorversion) { + spin_lock(&clp->cl_lock); dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID; refcount_inc(&dp->dl_stid.sc_count); - spin_lock(&clp->cl_lock); list_add(&dp->dl_recall_lru, &clp->cl_revoked); spin_unlock(&clp->cl_lock); } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 1b8b1aab9a15..3709830f90a6 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1105,6 +1105,7 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size) if (!nn->nfsd_serv) return -EBUSY; trace_nfsd_end_grace(netns(file)); + nfsd4_end_grace(nn); break; default: return -EINVAL; @@ -1131,7 +1132,7 @@ static struct inode *nfsd_get_inode(struct super_block *sb, umode_t mode) /* Following advice from simple_fill_super documentation: */ inode->i_ino = iunique(sb, NFSD_MaxReserved); inode->i_mode = mode; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); switch (mode & S_IFMT) { case S_IFDIR: inode->i_fop = &simple_dir_operations; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 2c9074ab2315..9b7acba382fe 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -520,7 +520,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, nfsd_sanitize_attrs(inode, iap); - if (check_guard && guardtime != inode->i_ctime.tv_sec) + if (check_guard && guardtime != inode_get_ctime(inode).tv_sec) return nfserr_notsync; /* diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index decd6471300b..bce734b68f08 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -429,7 +429,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, nilfs_set_de_type(de, inode); nilfs_commit_chunk(page, mapping, from, to); nilfs_put_page(page); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); } /* @@ -519,7 +519,7 @@ got_it: de->inode = cpu_to_le64(inode->i_ino); nilfs_set_de_type(de, inode); nilfs_commit_chunk(page, page->mapping, from, to); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); nilfs_mark_inode_dirty(dir); /* OFFSET_CACHE */ out_put: @@ -567,7 +567,7 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) pde->rec_len = nilfs_rec_len_to_disk(to - from); dir->inode = 0; nilfs_commit_chunk(page, mapping, from, to); - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); out: nilfs_put_page(page); return err; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 35bc79305318..d588c719d743 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -366,7 +366,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) atomic64_inc(&root->inodes_count); inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_ino = ino; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { err = nilfs_bmap_read(ii->i_bmap, NULL); @@ -450,10 +450,10 @@ int nilfs_read_inode_common(struct inode *inode, set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); inode->i_size = le64_to_cpu(raw_inode->i_size); inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); - inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); + inode_set_ctime(inode, le64_to_cpu(raw_inode->i_ctime), + le32_to_cpu(raw_inode->i_ctime_nsec)); inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime); inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); - inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); if (nilfs_is_metadata_file_inode(inode) && !S_ISREG(inode->i_mode)) return -EIO; /* this inode is for metadata and corrupted */ @@ -768,9 +768,9 @@ void nilfs_write_inode_common(struct inode *inode, raw_inode->i_gid = cpu_to_le32(i_gid_read(inode)); raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); raw_inode->i_size = cpu_to_le64(inode->i_size); - raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + raw_inode->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec); raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); - raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + raw_inode->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); raw_inode->i_blocks = cpu_to_le64(inode->i_blocks); @@ -875,7 +875,7 @@ void nilfs_truncate(struct inode *inode) nilfs_truncate_bmap(ii, blkoff); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); if (IS_SYNC(inode)) nilfs_set_transaction_flag(NILFS_TI_SYNC); diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 1dfbc0c34513..40ffade49f38 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -149,7 +149,7 @@ int nilfs_fileattr_set(struct mnt_idmap *idmap, NILFS_I(inode)->i_flags = oldflags | (flags & FS_FL_USER_MODIFIABLE); nilfs_set_inode_flags(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); if (IS_SYNC(inode)) nilfs_set_transaction_flag(NILFS_TI_SYNC); diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index c7024da8f1e2..2a4e7f4a8102 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -185,7 +185,7 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir, if (err) return err; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_link_count(inode); ihold(inode); @@ -283,7 +283,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) if (err) goto out; - inode->i_ctime = dir->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); drop_nlink(inode); err = 0; out: @@ -387,7 +387,7 @@ static int nilfs_rename(struct mnt_idmap *idmap, goto out_dir; nilfs_set_link(new_dir, new_de, new_page, old_inode); nilfs_mark_inode_dirty(new_dir); - new_inode->i_ctime = current_time(new_inode); + inode_set_ctime_current(new_inode); if (dir_de) drop_nlink(new_inode); drop_nlink(new_inode); @@ -406,7 +406,7 @@ static int nilfs_rename(struct mnt_idmap *idmap, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old_inode->i_ctime = current_time(old_inode); + inode_set_ctime_current(old_inode); nilfs_delete_entry(old_de, old_page); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 581691e4be49..7ec16879756e 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -725,6 +725,11 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, struct folio *folio = fbatch.folios[i]; folio_lock(folio); + if (unlikely(folio->mapping != mapping)) { + /* Exclude folios removed from the address space */ + folio_unlock(folio); + continue; + } head = folio_buffers(folio); if (!head) { create_empty_buffers(&folio->page, i_blocksize(inode), 0); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 0ef8c71bde8e..a5d1fa4e7552 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -35,6 +35,7 @@ #include <linux/writeback.h> #include <linux/seq_file.h> #include <linux/mount.h> +#include <linux/fs_context.h> #include "nilfs.h" #include "export.h" #include "mdt.h" @@ -1216,7 +1217,6 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) } struct nilfs_super_data { - struct block_device *bdev; __u64 cno; int flags; }; @@ -1283,64 +1283,49 @@ static int nilfs_identify(char *data, struct nilfs_super_data *sd) static int nilfs_set_bdev_super(struct super_block *s, void *data) { - s->s_bdev = data; - s->s_dev = s->s_bdev->bd_dev; + s->s_dev = *(dev_t *)data; return 0; } static int nilfs_test_bdev_super(struct super_block *s, void *data) { - return (void *)s->s_bdev == data; + return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data; } static struct dentry * nilfs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - struct nilfs_super_data sd; + struct nilfs_super_data sd = { .flags = flags }; struct super_block *s; - struct dentry *root_dentry; - int err, s_new = false; + dev_t dev; + int err; - sd.bdev = blkdev_get_by_path(dev_name, sb_open_mode(flags), fs_type, - NULL); - if (IS_ERR(sd.bdev)) - return ERR_CAST(sd.bdev); + if (nilfs_identify(data, &sd)) + return ERR_PTR(-EINVAL); - sd.cno = 0; - sd.flags = flags; - if (nilfs_identify((char *)data, &sd)) { - err = -EINVAL; - goto failed; - } + err = lookup_bdev(dev_name, &dev); + if (err) + return ERR_PTR(err); - /* - * once the super is inserted into the list by sget, s_umount - * will protect the lockfs code from trying to start a snapshot - * while we are mounting - */ - mutex_lock(&sd.bdev->bd_fsfreeze_mutex); - if (sd.bdev->bd_fsfreeze_count > 0) { - mutex_unlock(&sd.bdev->bd_fsfreeze_mutex); - err = -EBUSY; - goto failed; - } s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, flags, - sd.bdev); - mutex_unlock(&sd.bdev->bd_fsfreeze_mutex); - if (IS_ERR(s)) { - err = PTR_ERR(s); - goto failed; - } + &dev); + if (IS_ERR(s)) + return ERR_CAST(s); if (!s->s_root) { - s_new = true; - - /* New superblock instance created */ - snprintf(s->s_id, sizeof(s->s_id), "%pg", sd.bdev); - sb_set_blocksize(s, block_size(sd.bdev)); - - err = nilfs_fill_super(s, data, flags & SB_SILENT ? 1 : 0); + /* + * We drop s_umount here because we need to open the bdev and + * bdev->open_mutex ranks above s_umount (blkdev_put() -> + * __invalidate_device()). It is safe because we have active sb + * reference and SB_BORN is not set yet. + */ + up_write(&s->s_umount); + err = setup_bdev_super(s, flags, NULL); + down_write(&s->s_umount); + if (!err) + err = nilfs_fill_super(s, data, + flags & SB_SILENT ? 1 : 0); if (err) goto failed_super; @@ -1366,24 +1351,18 @@ nilfs_mount(struct file_system_type *fs_type, int flags, } if (sd.cno) { + struct dentry *root_dentry; + err = nilfs_attach_snapshot(s, sd.cno, &root_dentry); if (err) goto failed_super; - } else { - root_dentry = dget(s->s_root); + return root_dentry; } - if (!s_new) - blkdev_put(sd.bdev, fs_type); - - return root_dentry; + return dget(s->s_root); failed_super: deactivate_locked_super(s); - - failed: - if (!s_new) - blkdev_put(sd.bdev, fs_type); return ERR_PTR(err); } diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 190aa717fa32..ebdcc25df0f7 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -199,7 +199,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id) } /* this conversion is done only at watch creation */ -static __u32 convert_arg(unsigned long arg) +static __u32 convert_arg(unsigned int arg) { __u32 new_mask = FS_EVENT_ON_CHILD; @@ -258,7 +258,7 @@ static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark *dn_mark, * up here. Allocate both a mark for fsnotify to add and a dnotify_struct to be * attached to the fsnotify_mark. */ -int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) +int fcntl_dirnotify(int fd, struct file *filp, unsigned int arg) { struct dnotify_mark *new_dn_mark, *dn_mark; struct fsnotify_mark *new_fsn_mark, *fsn_mark; diff --git a/fs/nsfs.c b/fs/nsfs.c index f602a96a1afe..647a22433bd8 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -84,7 +84,7 @@ slow: return -ENOMEM; } inode->i_ino = ns->inum; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); inode->i_flags |= S_IMMUTABLE; inode->i_mode = S_IFREG | S_IRUGO; inode->i_fop = &ns_file_operations; diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 6c3f38d66579..99ac6ea277c4 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -654,7 +654,7 @@ static int ntfs_read_locked_inode(struct inode *vi) * always changes, when mtime is changed. ctime can be changed on its * own, mtime is then not changed, e.g. when a file is renamed. */ - vi->i_ctime = ntfs2utc(si->last_mft_change_time); + inode_set_ctime_to_ts(vi, ntfs2utc(si->last_mft_change_time)); /* * Last access to the data within the file. Not changed during a rename * for example but changed whenever the file is written to. @@ -1218,7 +1218,7 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi) vi->i_gid = base_vi->i_gid; set_nlink(vi, base_vi->i_nlink); vi->i_mtime = base_vi->i_mtime; - vi->i_ctime = base_vi->i_ctime; + inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi)); vi->i_atime = base_vi->i_atime; vi->i_generation = ni->seq_no = base_ni->seq_no; @@ -1484,7 +1484,7 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) vi->i_gid = base_vi->i_gid; set_nlink(vi, base_vi->i_nlink); vi->i_mtime = base_vi->i_mtime; - vi->i_ctime = base_vi->i_ctime; + inode_set_ctime_to_ts(vi, inode_get_ctime(base_vi)); vi->i_atime = base_vi->i_atime; vi->i_generation = ni->seq_no = base_ni->seq_no; /* Set inode type to zero but preserve permissions. */ @@ -2804,13 +2804,14 @@ done: */ if (!IS_NOCMTIME(VFS_I(base_ni)) && !IS_RDONLY(VFS_I(base_ni))) { struct timespec64 now = current_time(VFS_I(base_ni)); + struct timespec64 ctime = inode_get_ctime(VFS_I(base_ni)); int sync_it = 0; if (!timespec64_equal(&VFS_I(base_ni)->i_mtime, &now) || - !timespec64_equal(&VFS_I(base_ni)->i_ctime, &now)) + !timespec64_equal(&ctime, &now)) sync_it = 1; + inode_set_ctime_to_ts(VFS_I(base_ni), now); VFS_I(base_ni)->i_mtime = now; - VFS_I(base_ni)->i_ctime = now; if (sync_it) mark_inode_dirty_sync(VFS_I(base_ni)); @@ -2928,7 +2929,7 @@ int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (ia_valid & ATTR_MTIME) vi->i_mtime = attr->ia_mtime; if (ia_valid & ATTR_CTIME) - vi->i_ctime = attr->ia_ctime; + inode_set_ctime_to_ts(vi, attr->ia_ctime); mark_inode_dirty(vi); out: return err; @@ -3004,7 +3005,7 @@ int __ntfs_write_inode(struct inode *vi, int sync) si->last_data_change_time = nt; modified = true; } - nt = utc2ntfs(vi->i_ctime); + nt = utc2ntfs(inode_get_ctime(vi)); if (si->last_mft_change_time != nt) { ntfs_debug("Updating ctime for inode 0x%lx: old = 0x%llx, " "new = 0x%llx", vi->i_ino, (long long) diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 0155f106ec34..ad1a8f72da22 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -2682,8 +2682,7 @@ mft_rec_already_initialized: vi->i_mode &= ~S_IWUGO; /* Set the inode times to the current time. */ - vi->i_atime = vi->i_mtime = vi->i_ctime = - current_time(vi); + vi->i_atime = vi->i_mtime = inode_set_ctime_current(vi); /* * Set the file size to 0, the ntfs inode sizes are set to 0 by * the call to ntfs_init_big_inode() below. diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 1d6c824246c4..962f12ce6c0a 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -85,7 +85,7 @@ int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path, stat->attributes_mask |= STATX_ATTR_COMPRESSED | STATX_ATTR_ENCRYPTED; - generic_fillattr(idmap, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); stat->result_mask |= STATX_BTIME; stat->btime = ni->i_crtime; @@ -342,7 +342,7 @@ static int ntfs_extend(struct inode *inode, loff_t pos, size_t count, err = 0; } - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); mark_inode_dirty(inode); if (IS_SYNC(inode)) { @@ -400,7 +400,7 @@ static int ntfs_truncate(struct inode *inode, loff_t new_size) ni_unlock(ni); ni->std_fa |= FILE_ATTRIBUTE_ARCHIVE; - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); if (!IS_DIRSYNC(inode)) { dirty = 1; } else { @@ -642,7 +642,7 @@ out: filemap_invalidate_unlock(mapping); if (!err) { - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); mark_inode_dirty(inode); } diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 16bd9faa2d28..2b85cb10f0be 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -3265,6 +3265,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint) if (is_rec_inuse(ni->mi.mrec) && !(sbi->flags & NTFS_FLAGS_LOG_REPLAYING) && inode->i_nlink) { bool modified = false; + struct timespec64 ctime = inode_get_ctime(inode); /* Update times in standard attribute. */ std = ni_std(ni); @@ -3280,7 +3281,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint) modified = true; } - dup.c_time = kernel2nt(&inode->i_ctime); + dup.c_time = kernel2nt(&ctime); if (std->c_time != dup.c_time) { std->c_time = dup.c_time; modified = true; diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index dc7e7ab701c6..4123e126c4d0 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -44,6 +44,7 @@ static struct inode *ntfs_read_mft(struct inode *inode, u64 t64; struct MFT_REC *rec; struct runs_tree *run; + struct timespec64 ctime; inode->i_op = NULL; /* Setup 'uid' and 'gid' */ @@ -169,7 +170,8 @@ next_attr: nt2kernel(std5->cr_time, &ni->i_crtime); #endif nt2kernel(std5->a_time, &inode->i_atime); - nt2kernel(std5->c_time, &inode->i_ctime); + ctime = inode_get_ctime(inode); + nt2kernel(std5->c_time, &ctime); nt2kernel(std5->m_time, &inode->i_mtime); ni->std_fa = std5->fa; @@ -958,7 +960,7 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, if (err >= 0) { if (!(ni->std_fa & FILE_ATTRIBUTE_ARCHIVE)) { - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); ni->std_fa |= FILE_ATTRIBUTE_ARCHIVE; dirty = true; } @@ -1658,8 +1660,8 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir, d_instantiate(dentry, inode); /* Set original time. inode times (i_ctime) may be changed in ntfs_init_acl. */ - inode->i_atime = inode->i_mtime = inode->i_ctime = dir->i_mtime = - dir->i_ctime = ni->i_crtime; + inode->i_atime = inode->i_mtime = inode_set_ctime_to_ts(inode, ni->i_crtime); + dir->i_mtime = inode_set_ctime_to_ts(dir, ni->i_crtime); mark_inode_dirty(dir); mark_inode_dirty(inode); @@ -1765,9 +1767,9 @@ int ntfs_unlink_inode(struct inode *dir, const struct dentry *dentry) if (!err) { drop_nlink(inode); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); - inode->i_ctime = dir->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); if (inode->i_nlink) mark_inode_dirty(inode); } else if (!ni_remove_name_undo(dir_ni, ni, de, de2, undo_remove)) { diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index 70f8c859e0ad..ad430d50bd79 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -156,8 +156,8 @@ static int ntfs_link(struct dentry *ode, struct inode *dir, struct dentry *de) err = ntfs_link_inode(inode, de); if (!err) { - dir->i_ctime = dir->i_mtime = inode->i_ctime = - current_time(dir); + dir->i_mtime = inode_set_ctime_to_ts(inode, + inode_set_ctime_current(dir)); mark_inode_dirty(inode); mark_inode_dirty(dir); d_instantiate(de, inode); @@ -324,14 +324,11 @@ static int ntfs_rename(struct mnt_idmap *idmap, struct inode *dir, /* Restore after failed rename failed too. */ _ntfs_bad_inode(inode); } else if (!err) { - inode->i_ctime = dir->i_ctime = dir->i_mtime = - current_time(dir); + simple_rename_timestamp(dir, dentry, new_dir, new_dentry); mark_inode_dirty(inode); mark_inode_dirty(dir); - if (dir != new_dir) { - new_dir->i_mtime = new_dir->i_ctime = dir->i_ctime; + if (dir != new_dir) mark_inode_dirty(new_dir); - } if (IS_DIRSYNC(dir)) ntfs_sync_inode(dir); diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 1a02072b6b0e..5fffddea554f 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -569,9 +569,9 @@ static void init_once(void *foo) } /* - * put_ntfs - Noinline to reduce binary size. + * Noinline to reduce binary size. */ -static noinline void put_ntfs(struct ntfs_sb_info *sbi) +static noinline void ntfs3_free_sbi(struct ntfs_sb_info *sbi) { kfree(sbi->new_rec); kvfree(ntfs_put_shared(sbi->upcase)); @@ -625,12 +625,6 @@ static void ntfs_put_super(struct super_block *sb) /* Mark rw ntfs as clear, if possible. */ ntfs_set_state(sbi, NTFS_DIRTY_CLEAR); - - put_mount_options(sbi->options); - put_ntfs(sbi); - sb->s_fs_info = NULL; - - sync_blockdev(sb->s_bdev); } static int ntfs_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -1564,15 +1558,7 @@ load_root: put_inode_out: iput(inode); out: - /* - * Free resources here. - * ntfs_fs_free will be called with fc->s_fs_info = NULL - */ - put_mount_options(sbi->options); - put_ntfs(sbi); - sb->s_fs_info = NULL; kfree(boot2); - return err; } @@ -1659,7 +1645,7 @@ static void ntfs_fs_free(struct fs_context *fc) struct ntfs_sb_info *sbi = fc->s_fs_info; if (sbi) - put_ntfs(sbi); + ntfs3_free_sbi(sbi); if (opts) put_mount_options(opts); @@ -1728,13 +1714,24 @@ free_opts: return -ENOMEM; } +static void ntfs3_kill_sb(struct super_block *sb) +{ + struct ntfs_sb_info *sbi = sb->s_fs_info; + + kill_block_super(sb); + + if (sbi->options) + put_mount_options(sbi->options); + ntfs3_free_sbi(sbi); +} + // clang-format off static struct file_system_type ntfs_fs_type = { .owner = THIS_MODULE, .name = "ntfs3", .init_fs_context = ntfs_init_fs_context, .parameters = ntfs_fs_parameters, - .kill_sb = kill_block_super, + .kill_sb = ntfs3_kill_sb, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; // clang-format on diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index 023f314e8950..29fd391899e5 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -637,7 +637,7 @@ static noinline int ntfs_set_acl_ex(struct mnt_idmap *idmap, if (!err) { set_cached_acl(inode, type, acl); inode->i_mode = mode; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); } @@ -924,7 +924,7 @@ set_new_fa: NULL); out: - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); return err; diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 9fd03eaf15f8..e75137a8e7cb 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -191,10 +191,10 @@ static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh, } inode->i_mode = new_mode; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); di->i_mode = cpu_to_le16(inode->i_mode); - di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 51c93929a146..aef58f1395c8 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -7436,10 +7436,10 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, } inode->i_blocks = ocfs2_inode_sector_count(inode); - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); - di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(inode).tv_sec); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 8dfc284e85f0..0fdba30740ab 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2048,7 +2048,7 @@ out_write_size: } inode->i_blocks = ocfs2_inode_sector_count(inode); di->i_size = cpu_to_le64((u64)i_size_read(inode)); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); if (handle) diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 694471fc46b8..8b123d543e6e 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1658,7 +1658,7 @@ int __ocfs2_add_entry(handle_t *handle, offset, ocfs2_dir_trailer_blk_off(dir->i_sb)); if (ocfs2_dirent_would_fit(de, rec_len)) { - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); if (retval < 0) { mlog_errno(retval); @@ -2962,11 +2962,11 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, ocfs2_dinode_new_extent_list(dir, di); i_size_write(dir, sb->s_blocksize); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); di->i_size = cpu_to_le64(sb->s_blocksize); - di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec); - di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec); + di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(dir).tv_sec); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(dir).tv_nsec); ocfs2_update_inode_fsync_trans(handle, dir, 1); /* diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index ba26c5567cff..81265123ce6c 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -337,7 +337,7 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb) if (inode) { inode->i_ino = get_next_ino(); inode_init_owner(&nop_mnt_idmap, inode, NULL, mode); - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); inc_nlink(inode); inode->i_fop = &simple_dir_operations; @@ -360,7 +360,7 @@ static struct inode *dlmfs_get_inode(struct inode *parent, inode->i_ino = get_next_ino(); inode_init_owner(&nop_mnt_idmap, inode, parent, mode); - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); ip = DLMFS_I(inode); ip->ip_conn = DLMFS_I(parent)->ip_conn; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index c28bc983a7b1..c3e2961ee5db 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2162,6 +2162,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; struct ocfs2_meta_lvb *lvb; + struct timespec64 ctime = inode_get_ctime(inode); lvb = ocfs2_dlm_lvb(&lockres->l_lksb); @@ -2185,7 +2186,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) lvb->lvb_iatime_packed = cpu_to_be64(ocfs2_pack_timespec(&inode->i_atime)); lvb->lvb_ictime_packed = - cpu_to_be64(ocfs2_pack_timespec(&inode->i_ctime)); + cpu_to_be64(ocfs2_pack_timespec(&ctime)); lvb->lvb_imtime_packed = cpu_to_be64(ocfs2_pack_timespec(&inode->i_mtime)); lvb->lvb_iattr = cpu_to_be32(oi->ip_attr); @@ -2208,6 +2209,7 @@ static int ocfs2_refresh_inode_from_lvb(struct inode *inode) struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres; struct ocfs2_meta_lvb *lvb; + struct timespec64 ctime; mlog_meta_lvb(0, lockres); @@ -2238,8 +2240,9 @@ static int ocfs2_refresh_inode_from_lvb(struct inode *inode) be64_to_cpu(lvb->lvb_iatime_packed)); ocfs2_unpack_timespec(&inode->i_mtime, be64_to_cpu(lvb->lvb_imtime_packed)); - ocfs2_unpack_timespec(&inode->i_ctime, + ocfs2_unpack_timespec(&ctime, be64_to_cpu(lvb->lvb_ictime_packed)); + inode_set_ctime_to_ts(inode, ctime); spin_unlock(&oi->ip_lock); return 0; } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index bf2c17ea96a0..3b91b4cc7c6a 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -232,8 +232,10 @@ int ocfs2_should_update_atime(struct inode *inode, return 0; if (vfsmnt->mnt_flags & MNT_RELATIME) { + struct timespec64 ctime = inode_get_ctime(inode); + if ((timespec64_compare(&inode->i_atime, &inode->i_mtime) <= 0) || - (timespec64_compare(&inode->i_atime, &inode->i_ctime) <= 0)) + (timespec64_compare(&inode->i_atime, &ctime) <= 0)) return 1; return 0; @@ -294,7 +296,7 @@ int ocfs2_set_inode_size(handle_t *handle, i_size_write(inode, new_i_size); inode->i_blocks = ocfs2_inode_sector_count(inode); - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); if (status < 0) { @@ -415,12 +417,12 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, } i_size_write(inode, new_i_size); - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); di = (struct ocfs2_dinode *) fe_bh->b_data; di->i_size = cpu_to_le64(new_i_size); - di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + di->i_ctime = di->i_mtime = cpu_to_le64(inode_get_ctime(inode).tv_sec); + di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, fe_bh); @@ -824,7 +826,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, i_size_write(inode, abs_to); inode->i_blocks = ocfs2_inode_sector_count(inode); di->i_size = cpu_to_le64((u64)i_size_read(inode)); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); di->i_mtime_nsec = di->i_ctime_nsec; @@ -1317,7 +1319,7 @@ int ocfs2_getattr(struct mnt_idmap *idmap, const struct path *path, goto bail; } - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); /* * If there is inline data in the inode, the inode will normally not * have data blocks allocated (it may have an external xattr block). @@ -2043,7 +2045,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, goto out_inode_unlock; } - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); if (ret < 0) mlog_errno(ret); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index bb116c39b581..e8771600b930 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -306,8 +306,8 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); - inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime); - inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec); + inode_set_ctime(inode, le64_to_cpu(fe->i_ctime), + le32_to_cpu(fe->i_ctime_nsec)); if (OCFS2_I(inode)->ip_blkno != le64_to_cpu(fe->i_blkno)) mlog(ML_ERROR, @@ -1314,8 +1314,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle, fe->i_mode = cpu_to_le16(inode->i_mode); fe->i_atime = cpu_to_le64(inode->i_atime.tv_sec); fe->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); - fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + fe->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec); + fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); fe->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec); fe->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); @@ -1352,8 +1352,8 @@ void ocfs2_refresh_inode(struct inode *inode, inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec); inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime); inode->i_mtime.tv_nsec = le32_to_cpu(fe->i_mtime_nsec); - inode->i_ctime.tv_sec = le64_to_cpu(fe->i_ctime); - inode->i_ctime.tv_nsec = le32_to_cpu(fe->i_ctime_nsec); + inode_set_ctime(inode, le64_to_cpu(fe->i_ctime), + le32_to_cpu(fe->i_ctime_nsec)); spin_unlock(&OCFS2_I(inode)->ip_lock); } diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 25d8072ccfce..c19c730c26e2 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -557,7 +557,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers, (unsigned long)bh, (unsigned long long)bh->b_blocknr); - ocfs2_error(bh->b_bdev->bd_super, + ocfs2_error(bh->b_assoc_map->host->i_sb, "JBD2 has aborted our journal, ocfs2 cannot continue\n"); } @@ -780,14 +780,14 @@ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh) mlog_errno(status); if (!is_handle_aborted(handle)) { journal_t *journal = handle->h_transaction->t_journal; - struct super_block *sb = bh->b_bdev->bd_super; mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed. " "Aborting transaction and journal.\n"); handle->h_err = status; jbd2_journal_abort_handle(handle); jbd2_journal_abort(journal, status); - ocfs2_abort(sb, "Journal already aborted.\n"); + ocfs2_abort(bh->b_assoc_map->host->i_sb, + "Journal already aborted.\n"); } } } diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index b1e32ec4a9d4..05d67968a3a9 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -950,9 +950,9 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) } di = (struct ocfs2_dinode *)di_bh->b_data; - inode->i_ctime = current_time(inode); - di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + inode_set_ctime_current(inode); + di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); ocfs2_update_inode_fsync_trans(handle, inode, 0); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 17c52225b87d..e4a684d45308 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -793,10 +793,10 @@ static int ocfs2_link(struct dentry *old_dentry, } inc_nlink(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); ocfs2_set_links_count(fe, inode->i_nlink); - fe->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - fe->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + fe->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec); + fe->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); ocfs2_journal_dirty(handle, fe_bh); err = ocfs2_add_entry(handle, dentry, inode, @@ -995,7 +995,7 @@ static int ocfs2_unlink(struct inode *dir, ocfs2_set_links_count(fe, inode->i_nlink); ocfs2_journal_dirty(handle, fe_bh); - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); if (S_ISDIR(inode->i_mode)) drop_nlink(dir); @@ -1537,7 +1537,7 @@ static int ocfs2_rename(struct mnt_idmap *idmap, new_dir_bh, &target_insert); } - old_inode->i_ctime = current_time(old_inode); + inode_set_ctime_current(old_inode); mark_inode_dirty(old_inode); status = ocfs2_journal_access_di(handle, INODE_CACHE(old_inode), @@ -1546,8 +1546,8 @@ static int ocfs2_rename(struct mnt_idmap *idmap, if (status >= 0) { old_di = (struct ocfs2_dinode *) old_inode_bh->b_data; - old_di->i_ctime = cpu_to_le64(old_inode->i_ctime.tv_sec); - old_di->i_ctime_nsec = cpu_to_le32(old_inode->i_ctime.tv_nsec); + old_di->i_ctime = cpu_to_le64(inode_get_ctime(old_inode).tv_sec); + old_di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(old_inode).tv_nsec); ocfs2_journal_dirty(handle, old_inode_bh); } else mlog_errno(status); @@ -1586,9 +1586,9 @@ static int ocfs2_rename(struct mnt_idmap *idmap, if (new_inode) { drop_nlink(new_inode); - new_inode->i_ctime = current_time(new_inode); + inode_set_ctime_current(new_inode); } - old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); + old_dir->i_mtime = inode_set_ctime_current(old_dir); if (update_dot_dot) { status = ocfs2_update_entry(old_inode, handle, @@ -1610,7 +1610,8 @@ static int ocfs2_rename(struct mnt_idmap *idmap, if (old_dir != new_dir) { /* Keep the same times on both directories.*/ - new_dir->i_ctime = new_dir->i_mtime = old_dir->i_ctime; + new_dir->i_mtime = inode_set_ctime_to_ts(new_dir, + inode_get_ctime(old_dir)); /* * This will also pick up the i_nlink change from the diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 564ab48d03ef..25c8ec3c8c3a 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3750,9 +3750,9 @@ static int ocfs2_change_ctime(struct inode *inode, goto out_commit; } - inode->i_ctime = current_time(inode); - di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + inode_set_ctime_current(inode); + di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); ocfs2_journal_dirty(handle, di_bh); @@ -4073,10 +4073,10 @@ static int ocfs2_complete_reflink(struct inode *s_inode, * we want mtime to appear identical to the source and * update ctime. */ - t_inode->i_ctime = current_time(t_inode); + inode_set_ctime_current(t_inode); - di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec); + di->i_ctime = cpu_to_le64(inode_get_ctime(t_inode).tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(t_inode).tv_nsec); t_inode->i_mtime = s_inode->i_mtime; di->i_mtime = s_di->i_mtime; @@ -4456,7 +4456,7 @@ int ocfs2_reflink_update_dest(struct inode *dest, if (newlen > i_size_read(dest)) i_size_write(dest, newlen); spin_unlock(&OCFS2_I(dest)->ip_lock); - dest->i_ctime = dest->i_mtime = current_time(dest); + dest->i_mtime = inode_set_ctime_current(dest); ret = ocfs2_mark_inode_dirty(handle, dest, d_bh); if (ret) { diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 4ac77ff6e676..6510ad783c91 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -3421,9 +3421,9 @@ static int __ocfs2_xattr_set_handle(struct inode *inode, goto out; } - inode->i_ctime = current_time(inode); - di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + inode_set_ctime_current(inode); + di->i_ctime = cpu_to_le64(inode_get_ctime(inode).tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); ocfs2_journal_dirty(ctxt->handle, xis->inode_bh); } out: diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index 82cf7e9a665f..6bda275826d6 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -143,7 +143,7 @@ static int omfs_add_link(struct dentry *dentry, struct inode *inode) mark_buffer_dirty(bh); brelse(bh); - dir->i_ctime = current_time(dir); + inode_set_ctime_current(dir); /* mark affected inodes dirty to rebuild checksums */ mark_inode_dirty(dir); @@ -399,7 +399,7 @@ static int omfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (err) goto out; - old_inode->i_ctime = current_time(old_inode); + inode_set_ctime_current(old_inode); mark_inode_dirty(old_inode); out: return err; diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index c4c79e07efc7..2f8c1882f45c 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -51,7 +51,7 @@ struct inode *omfs_new_inode(struct inode *dir, umode_t mode) inode_init_owner(&nop_mnt_idmap, inode, NULL, mode); inode->i_mapping->a_ops = &omfs_aops; - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); switch (mode & S_IFMT) { case S_IFDIR: inode->i_op = &omfs_dir_inops; @@ -134,8 +134,8 @@ static int __omfs_write_inode(struct inode *inode, int wait) oi->i_head.h_magic = OMFS_IMAGIC; oi->i_size = cpu_to_be64(inode->i_size); - ctime = inode->i_ctime.tv_sec * 1000LL + - ((inode->i_ctime.tv_nsec + 999)/1000); + ctime = inode_get_ctime(inode).tv_sec * 1000LL + + ((inode_get_ctime(inode).tv_nsec + 999)/1000); oi->i_ctime = cpu_to_be64(ctime); omfs_update_checksums(oi); @@ -232,10 +232,9 @@ struct inode *omfs_iget(struct super_block *sb, ino_t ino) inode->i_atime.tv_sec = ctime; inode->i_mtime.tv_sec = ctime; - inode->i_ctime.tv_sec = ctime; + inode_set_ctime(inode, ctime, nsecs); inode->i_atime.tv_nsec = nsecs; inode->i_mtime.tv_nsec = nsecs; - inode->i_ctime.tv_nsec = nsecs; inode->i_mapping->a_ops = &omfs_aops; diff --git a/fs/open.c b/fs/open.c index e6ead0f19964..98f6601fbac6 100644 --- a/fs/open.c +++ b/fs/open.c @@ -671,11 +671,20 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, umode_t, mode) return err; } -static int do_fchmodat(int dfd, const char __user *filename, umode_t mode) +static int do_fchmodat(int dfd, const char __user *filename, umode_t mode, + unsigned int flags) { struct path path; int error; - unsigned int lookup_flags = LOOKUP_FOLLOW; + unsigned int lookup_flags; + + if (unlikely(flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH))) + return -EINVAL; + + lookup_flags = (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + retry: error = user_path_at(dfd, filename, lookup_flags, &path); if (!error) { @@ -689,15 +698,21 @@ retry: return error; } +SYSCALL_DEFINE4(fchmodat2, int, dfd, const char __user *, filename, + umode_t, mode, unsigned int, flags) +{ + return do_fchmodat(dfd, filename, mode, flags); +} + SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename, umode_t, mode) { - return do_fchmodat(dfd, filename, mode); + return do_fchmodat(dfd, filename, mode, 0); } SYSCALL_DEFINE2(chmod, const char __user *, filename, umode_t, mode) { - return do_fchmodat(AT_FDCWD, filename, mode); + return do_fchmodat(AT_FDCWD, filename, mode, 0); } /* @@ -1150,7 +1165,7 @@ EXPORT_SYMBOL_GPL(kernel_file_open); * backing_file_open - open a backing file for kernel internal use * @path: path of the file to open * @flags: open flags - * @path: path of the backing file + * @real_path: path of the backing file * @cred: credentials for open * * Open a backing file for a stackable filesystem (e.g., overlayfs). @@ -1503,7 +1518,7 @@ SYSCALL_DEFINE2(creat, const char __user *, pathname, umode_t, mode) * "id" is the POSIX thread ID. We use the * files pointer for this.. */ -int filp_close(struct file *filp, fl_owner_t id) +static int filp_flush(struct file *filp, fl_owner_t id) { int retval = 0; @@ -1520,10 +1535,18 @@ int filp_close(struct file *filp, fl_owner_t id) dnotify_flush(filp, id); locks_remove_posix(filp, id); } - fput(filp); return retval; } +int filp_close(struct file *filp, fl_owner_t id) +{ + int retval; + + retval = filp_flush(filp, id); + fput(filp); + + return retval; +} EXPORT_SYMBOL(filp_close); /* @@ -1533,7 +1556,20 @@ EXPORT_SYMBOL(filp_close); */ SYSCALL_DEFINE1(close, unsigned int, fd) { - int retval = close_fd(fd); + int retval; + struct file *file; + + file = close_fd_get_file(fd); + if (!file) + return -EBADF; + + retval = filp_flush(file, current->files); + + /* + * We're returning to user space. Don't bother + * with any delayed fput() cases. + */ + __fput_sync(file); /* can't restart close syscall because file table entry was cleared */ if (unlikely(retval == -ERESTARTSYS || @@ -1546,7 +1582,7 @@ SYSCALL_DEFINE1(close, unsigned int, fd) } /** - * close_range() - Close all file descriptors in a given range. + * sys_close_range() - Close all file descriptors in a given range. * * @fd: starting file descriptor to close * @max_fd: last file descriptor to close diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index f0b7f4d51a17..b2457cb97fa0 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -237,7 +237,7 @@ found: if (IS_ERR(inode)) return ERR_CAST(inode); if (inode->i_state & I_NEW) { - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); ent_oi = OP_I(inode); ent_oi->type = ent_type; ent_oi->u = ent_data; @@ -387,8 +387,7 @@ static int openprom_fill_super(struct super_block *s, struct fs_context *fc) goto out_no_root; } - root_inode->i_mtime = root_inode->i_atime = - root_inode->i_ctime = current_time(root_inode); + root_inode->i_mtime = root_inode->i_atime = inode_set_ctime_current(root_inode); root_inode->i_op = &openprom_inode_operations; root_inode->i_fop = &openprom_operations; root_inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 9014bbcc8031..085912268442 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -871,7 +871,7 @@ int orangefs_getattr(struct mnt_idmap *idmap, const struct path *path, ret = orangefs_inode_getattr(inode, request_mask & STATX_SIZE ? ORANGEFS_GETATTR_SIZE : 0); if (ret == 0) { - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); /* override block size reported to stat */ if (!(request_mask & STATX_SIZE)) @@ -900,12 +900,13 @@ int orangefs_permission(struct mnt_idmap *idmap, return generic_permission(&nop_mnt_idmap, inode, mask); } -int orangefs_update_time(struct inode *inode, struct timespec64 *time, int flags) +int orangefs_update_time(struct inode *inode, int flags) { struct iattr iattr; + gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_update_time: %pU\n", get_khandle_from_ino(inode)); - generic_update_time(inode, time, flags); + flags = generic_update_time(inode, flags); memset(&iattr, 0, sizeof iattr); if (flags & S_ATIME) iattr.ia_valid |= ATTR_ATIME; diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 77518e248cf7..c9dfd5c6a097 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -421,7 +421,7 @@ static int orangefs_rename(struct mnt_idmap *idmap, ret); if (new_dentry->d_inode) - new_dentry->d_inode->i_ctime = current_time(new_dentry->d_inode); + inode_set_ctime_current(d_inode(new_dentry)); op_release(new_op); return ret; diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index ce20d3443869..b711654ca18a 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -370,7 +370,7 @@ int orangefs_getattr(struct mnt_idmap *idmap, const struct path *path, int orangefs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask); -int orangefs_update_time(struct inode *, struct timespec64 *, int); +int orangefs_update_time(struct inode *, int); /* * defined in xattr.c diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index 46b7dcff18ac..0a9fcfdf552f 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -361,11 +361,11 @@ again2: downcall.resp.getattr.attributes.atime; inode->i_mtime.tv_sec = (time64_t)new_op-> downcall.resp.getattr.attributes.mtime; - inode->i_ctime.tv_sec = (time64_t)new_op-> - downcall.resp.getattr.attributes.ctime; + inode_set_ctime(inode, + (time64_t)new_op->downcall.resp.getattr.attributes.ctime, + 0); inode->i_atime.tv_nsec = 0; inode->i_mtime.tv_nsec = 0; - inode->i_ctime.tv_nsec = 0; /* special case: mark the root inode as sticky */ inode->i_mode = type | (is_root_handle(inode) ? S_ISVTX : 0) | diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 21245b00722a..eaa1e6b3e04a 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -239,6 +239,7 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence) static void ovl_file_accessed(struct file *file) { struct inode *inode, *upperinode; + struct timespec64 ctime, uctime; if (file->f_flags & O_NOATIME) return; @@ -249,10 +250,12 @@ static void ovl_file_accessed(struct file *file) if (!upperinode) return; + ctime = inode_get_ctime(inode); + uctime = inode_get_ctime(upperinode); if ((!timespec64_equal(&inode->i_mtime, &upperinode->i_mtime) || - !timespec64_equal(&inode->i_ctime, &upperinode->i_ctime))) { + !timespec64_equal(&ctime, &uctime))) { inode->i_mtime = upperinode->i_mtime; - inode->i_ctime = upperinode->i_ctime; + inode_set_ctime_to_ts(inode, uctime); } touch_atime(&file->f_path); @@ -290,10 +293,7 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) if (iocb->ki_flags & IOCB_WRITE) { struct inode *inode = file_inode(orig_iocb->ki_filp); - /* Actually acquired in ovl_write_iter() */ - __sb_writers_acquired(file_inode(iocb->ki_filp)->i_sb, - SB_FREEZE_WRITE); - file_end_write(iocb->ki_filp); + kiocb_end_write(iocb); ovl_copyattr(inode); } @@ -409,10 +409,6 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) if (!aio_req) goto out; - file_start_write(real.file); - /* Pacify lockdep, same trick as done in aio_write() */ - __sb_writers_release(file_inode(real.file)->i_sb, - SB_FREEZE_WRITE); aio_req->fd = real; real.flags = 0; aio_req->orig_iocb = iocb; @@ -420,6 +416,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) aio_req->iocb.ki_flags = ifl; aio_req->iocb.ki_complete = ovl_aio_rw_complete; refcount_set(&aio_req->ref, 2); + kiocb_start_write(&aio_req->iocb); ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter); ovl_aio_put(aio_req); if (ret != -EIOCBQUEUED) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index a63e57447be9..f22e27b78025 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -693,7 +693,7 @@ int ovl_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, } #endif -int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags) +int ovl_update_time(struct inode *inode, int flags) { if (flags & S_ATIME) { struct ovl_fs *ofs = inode->i_sb->s_fs_info; diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 9402591f12aa..8bbe6173bef4 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -665,7 +665,7 @@ static inline struct posix_acl *ovl_get_acl_path(const struct path *path, } #endif -int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags); +int ovl_update_time(struct inode *inode, int flags); bool ovl_is_private_xattr(struct super_block *sb, const char *name); struct ovl_inode_params { diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 7ef9e13c404a..c210b5d496a8 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -1202,6 +1202,6 @@ void ovl_copyattr(struct inode *inode) inode->i_mode = realinode->i_mode; inode->i_atime = realinode->i_atime; inode->i_mtime = realinode->i_mtime; - inode->i_ctime = realinode->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(realinode)); i_size_write(inode, i_size_read(realinode)); } diff --git a/fs/pipe.c b/fs/pipe.c index 2d88f73f585a..6c1a9b1db907 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -489,7 +489,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) head = pipe->head; if (!pipe_full(head, pipe->tail, pipe->max_usage)) { unsigned int mask = pipe->ring_size - 1; - struct pipe_buffer *buf = &pipe->bufs[head & mask]; + struct pipe_buffer *buf; struct page *page = pipe->tmp_page; int copied; @@ -899,7 +899,7 @@ static struct inode * get_pipe_inode(void) inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); return inode; @@ -1236,7 +1236,7 @@ const struct file_operations pipefifo_fops = { * Currently we rely on the pipe array holding a power-of-2 number * of pages. Returns 0 on error. */ -unsigned int round_pipe_size(unsigned long size) +unsigned int round_pipe_size(unsigned int size) { if (size > (1U << 31)) return 0; @@ -1319,7 +1319,7 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) * Allocate a new array of pipe buffers and copy the info over. Returns the * pipe size if successful, or return -ERROR on error. */ -static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) +static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg) { unsigned long user_bufs; unsigned int nr_slots, size; @@ -1387,7 +1387,7 @@ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice) return pipe; } -long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg) { struct pipe_inode_info *pipe; long ret; diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 7fa1b738bbab..a05fe94970ce 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -1027,7 +1027,7 @@ int simple_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, return error; } - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); if (IS_I_VERSION(inode)) inode_inc_iversion(inode); set_cached_acl(inode, type, acl); diff --git a/fs/proc/base.c b/fs/proc/base.c index 9df3f4839662..7576effe8d52 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1902,7 +1902,7 @@ struct inode *proc_pid_make_inode(struct super_block *sb, ei = PROC_I(inode); inode->i_mode = mode; inode->i_ino = get_next_ino(); - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); inode->i_op = &proc_def_inode_operations; /* @@ -1966,7 +1966,7 @@ int pid_getattr(struct mnt_idmap *idmap, const struct path *path, struct proc_fs_info *fs_info = proc_sb_info(inode->i_sb); struct task_struct *task; - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->uid = GLOBAL_ROOT_UID; stat->gid = GLOBAL_ROOT_GID; @@ -3583,7 +3583,8 @@ static int proc_tid_comm_permission(struct mnt_idmap *idmap, } static const struct inode_operations proc_tid_comm_inode_operations = { - .permission = proc_tid_comm_permission, + .setattr = proc_setattr, + .permission = proc_tid_comm_permission, }; /* @@ -3899,7 +3900,7 @@ static int proc_task_getattr(struct mnt_idmap *idmap, { struct inode *inode = d_inode(path->dentry); struct task_struct *p = get_proc_task(inode); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (p) { stat->nlink += get_nr_threads(p); diff --git a/fs/proc/fd.c b/fs/proc/fd.c index b3140deebbbf..6276b3938842 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -352,7 +352,7 @@ static int proc_fd_getattr(struct mnt_idmap *idmap, struct inode *inode = d_inode(path->dentry); int rv = 0; - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); /* If it's a directory, put the number of open fds there */ if (S_ISDIR(inode->i_mode)) { diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 42ae38ff6e7e..775ce0bcf08c 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -146,7 +146,7 @@ static int proc_getattr(struct mnt_idmap *idmap, } } - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); return 0; } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 67b09a1d9433..532dc9d240f7 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -660,7 +660,7 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) inode->i_private = de->data; inode->i_ino = de->low_ino; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); PROC_I(inode)->pde = de; if (is_empty_pde(de)) { make_empty_dir_inode(inode); diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index a0c0419872e3..2ba31b6d68c0 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -308,7 +308,7 @@ static int proc_tgid_net_getattr(struct mnt_idmap *idmap, net = get_proc_task_net(inode); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (net != NULL) { stat->nlink = net->proc_net->nlink; @@ -321,6 +321,7 @@ static int proc_tgid_net_getattr(struct mnt_idmap *idmap, const struct inode_operations proc_net_inode_operations = { .lookup = proc_tgid_net_lookup, .getattr = proc_tgid_net_getattr, + .setattr = proc_setattr, }; static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 5ea42653126e..bf06344a42cc 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -463,7 +463,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, head->count++; spin_unlock(&sysctl_lock); - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); inode->i_mode = table->mode; if (!S_ISDIR(table->mode)) { inode->i_mode |= S_IFREG; @@ -849,7 +849,7 @@ static int proc_sys_getattr(struct mnt_idmap *idmap, if (IS_ERR(head)) return PTR_ERR(head); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); if (table) stat->mode = (stat->mode & S_IFMT) | table->mode; diff --git a/fs/proc/root.c b/fs/proc/root.c index a86e65a608da..9191248f2dac 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -314,7 +314,8 @@ static int proc_root_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { - generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat); + generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry), + stat); stat->nlink = proc_root.nlink + nr_processes(); return 0; } diff --git a/fs/proc/self.c b/fs/proc/self.c index 72cd69bcaf4a..ecc4da8d265e 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -46,7 +46,7 @@ int proc_setup_self(struct super_block *s) struct inode *inode = new_inode(s); if (inode) { inode->i_ino = self_inum; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 507cd4e59d07..fafff1bd34cd 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -587,8 +587,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, bool migration = false; if (pmd_present(*pmd)) { - /* FOLL_DUMP will return -EFAULT on huge zero page */ - page = follow_trans_huge_pmd(vma, addr, pmd, FOLL_DUMP); + page = vm_normal_page_pmd(vma, addr, *pmd); } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) { swp_entry_t entry = pmd_to_swp_entry(*pmd); @@ -758,12 +757,14 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, static const struct mm_walk_ops smaps_walk_ops = { .pmd_entry = smaps_pte_range, .hugetlb_entry = smaps_hugetlb_range, + .walk_lock = PGWALK_RDLOCK, }; static const struct mm_walk_ops smaps_shmem_walk_ops = { .pmd_entry = smaps_pte_range, .hugetlb_entry = smaps_hugetlb_range, .pte_hole = smaps_pte_hole, + .walk_lock = PGWALK_RDLOCK, }; /* @@ -1245,6 +1246,7 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end, static const struct mm_walk_ops clear_refs_walk_ops = { .pmd_entry = clear_refs_pte_range, .test_walk = clear_refs_test_walk, + .walk_lock = PGWALK_WRLOCK, }; static ssize_t clear_refs_write(struct file *file, const char __user *buf, @@ -1622,6 +1624,7 @@ static const struct mm_walk_ops pagemap_ops = { .pmd_entry = pagemap_pmd_range, .pte_hole = pagemap_pte_hole, .hugetlb_entry = pagemap_hugetlb_range, + .walk_lock = PGWALK_RDLOCK, }; /* @@ -1935,6 +1938,7 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask, static const struct mm_walk_ops show_numa_ops = { .hugetlb_entry = gather_hugetlb_stats, .pmd_entry = gather_pte_stats, + .walk_lock = PGWALK_RDLOCK, }; /* diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index a553273fbd41..63ac1f93289f 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -46,7 +46,7 @@ int proc_setup_thread_self(struct super_block *s) struct inode *inode = new_inode(s); if (inode) { inode->i_ino = thread_self_inum; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index c49d554cc9ae..3acc38600cd1 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only config PSTORE tristate "Persistent store support" - select CRYPTO if PSTORE_COMPRESS default n help This option enables generic access to platform level @@ -22,99 +21,18 @@ config PSTORE_DEFAULT_KMSG_BYTES Defines default size of pstore kernel log storage. Can be enlarged if needed, not recommended to shrink it. -config PSTORE_DEFLATE_COMPRESS - tristate "DEFLATE (ZLIB) compression" - default y - depends on PSTORE - select CRYPTO_DEFLATE - help - This option enables DEFLATE (also known as ZLIB) compression - algorithm support. - -config PSTORE_LZO_COMPRESS - tristate "LZO compression" - depends on PSTORE - select CRYPTO_LZO - help - This option enables LZO compression algorithm support. - -config PSTORE_LZ4_COMPRESS - tristate "LZ4 compression" - depends on PSTORE - select CRYPTO_LZ4 - help - This option enables LZ4 compression algorithm support. - -config PSTORE_LZ4HC_COMPRESS - tristate "LZ4HC compression" - depends on PSTORE - select CRYPTO_LZ4HC - help - This option enables LZ4HC (high compression) mode algorithm. - -config PSTORE_842_COMPRESS - bool "842 compression" - depends on PSTORE - select CRYPTO_842 - help - This option enables 842 compression algorithm support. - -config PSTORE_ZSTD_COMPRESS - bool "zstd compression" - depends on PSTORE - select CRYPTO_ZSTD - help - This option enables zstd compression algorithm support. - config PSTORE_COMPRESS - def_bool y + bool "Pstore compression (deflate)" depends on PSTORE - depends on PSTORE_DEFLATE_COMPRESS || PSTORE_LZO_COMPRESS || \ - PSTORE_LZ4_COMPRESS || PSTORE_LZ4HC_COMPRESS || \ - PSTORE_842_COMPRESS || PSTORE_ZSTD_COMPRESS - -choice - prompt "Default pstore compression algorithm" - depends on PSTORE_COMPRESS + select ZLIB_INFLATE + select ZLIB_DEFLATE + default y help - This option chooses the default active compression algorithm. - This change be changed at boot with "pstore.compress=..." on - the kernel command line. - - Currently, pstore has support for 6 compression algorithms: - deflate, lzo, lz4, lz4hc, 842 and zstd. - - The default compression algorithm is deflate. - - config PSTORE_DEFLATE_COMPRESS_DEFAULT - bool "deflate" if PSTORE_DEFLATE_COMPRESS - - config PSTORE_LZO_COMPRESS_DEFAULT - bool "lzo" if PSTORE_LZO_COMPRESS - - config PSTORE_LZ4_COMPRESS_DEFAULT - bool "lz4" if PSTORE_LZ4_COMPRESS - - config PSTORE_LZ4HC_COMPRESS_DEFAULT - bool "lz4hc" if PSTORE_LZ4HC_COMPRESS - - config PSTORE_842_COMPRESS_DEFAULT - bool "842" if PSTORE_842_COMPRESS - - config PSTORE_ZSTD_COMPRESS_DEFAULT - bool "zstd" if PSTORE_ZSTD_COMPRESS - -endchoice - -config PSTORE_COMPRESS_DEFAULT - string - depends on PSTORE_COMPRESS - default "deflate" if PSTORE_DEFLATE_COMPRESS_DEFAULT - default "lzo" if PSTORE_LZO_COMPRESS_DEFAULT - default "lz4" if PSTORE_LZ4_COMPRESS_DEFAULT - default "lz4hc" if PSTORE_LZ4HC_COMPRESS_DEFAULT - default "842" if PSTORE_842_COMPRESS_DEFAULT - default "zstd" if PSTORE_ZSTD_COMPRESS_DEFAULT + Whether pstore records should be compressed before being written to + the backing store. This is implemented using the zlib 'deflate' + algorithm, using the library implementation instead of using the full + blown crypto API. This reduces the risk of secondary oopses or other + problems while pstore is recording panic metadata. config PSTORE_CONSOLE bool "Log kernel console messages" diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index ffbadb8b3032..585360706b33 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -54,7 +54,7 @@ static void free_pstore_private(struct pstore_private *private) if (!private) return; if (private->record) { - kfree(private->record->buf); + kvfree(private->record->buf); kfree(private->record->priv); kfree(private->record); } @@ -223,7 +223,7 @@ static struct inode *pstore_get_inode(struct super_block *sb) struct inode *inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); } return inode; } @@ -390,7 +390,7 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) inode->i_private = private; if (record->time.tv_sec) - inode->i_mtime = inode->i_ctime = record->time; + inode->i_mtime = inode_set_ctime_to_ts(inode, record->time); d_add(dentry, inode); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index cbc0b468c1ab..62356d542ef6 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -14,24 +14,17 @@ #include <linux/init.h> #include <linux/kmsg_dump.h> #include <linux/console.h> +#include <linux/mm.h> #include <linux/module.h> #include <linux/pstore.h> -#if IS_ENABLED(CONFIG_PSTORE_LZO_COMPRESS) -#include <linux/lzo.h> -#endif -#if IS_ENABLED(CONFIG_PSTORE_LZ4_COMPRESS) || IS_ENABLED(CONFIG_PSTORE_LZ4HC_COMPRESS) -#include <linux/lz4.h> -#endif -#if IS_ENABLED(CONFIG_PSTORE_ZSTD_COMPRESS) -#include <linux/zstd.h> -#endif -#include <linux/crypto.h> #include <linux/string.h> #include <linux/timer.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/jiffies.h> +#include <linux/vmalloc.h> #include <linux/workqueue.h> +#include <linux/zlib.h> #include "internal.h" @@ -80,12 +73,21 @@ static char *backend; module_param(backend, charp, 0444); MODULE_PARM_DESC(backend, "specific backend to use"); -static char *compress = -#ifdef CONFIG_PSTORE_COMPRESS_DEFAULT - CONFIG_PSTORE_COMPRESS_DEFAULT; -#else - NULL; -#endif +/* + * pstore no longer implements compression via the crypto API, and only + * supports zlib deflate compression implemented using the zlib library + * interface. This removes additional complexity which is hard to justify for a + * diagnostic facility that has to operate in conditions where the system may + * have become unstable. Zlib deflate is comparatively small in terms of code + * size, and compresses ASCII text comparatively well. In terms of compression + * speed, deflate is not the best performer but for recording the log output on + * a kernel panic, this is not considered critical. + * + * The only remaining arguments supported by the compress= module parameter are + * 'deflate' and 'none'. To retain compatibility with existing installations, + * all other values are logged and replaced with 'deflate'. + */ +static char *compress = "deflate"; module_param(compress, charp, 0444); MODULE_PARM_DESC(compress, "compression to use"); @@ -94,16 +96,9 @@ unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES; module_param(kmsg_bytes, ulong, 0444); MODULE_PARM_DESC(kmsg_bytes, "amount of kernel log to snapshot (in bytes)"); -/* Compression parameters */ -static struct crypto_comp *tfm; - -struct pstore_zbackend { - int (*zbufsize)(size_t size); - const char *name; -}; +static void *compress_workspace; static char *big_oops_buf; -static size_t big_oops_buf_sz; void pstore_set_kmsg_bytes(int bytes) { @@ -168,206 +163,89 @@ static bool pstore_cannot_block_path(enum kmsg_dump_reason reason) } } -#if IS_ENABLED(CONFIG_PSTORE_DEFLATE_COMPRESS) -static int zbufsize_deflate(size_t size) -{ - size_t cmpr; - - switch (size) { - /* buffer range for efivars */ - case 1000 ... 2000: - cmpr = 56; - break; - case 2001 ... 3000: - cmpr = 54; - break; - case 3001 ... 3999: - cmpr = 52; - break; - /* buffer range for nvram, erst */ - case 4000 ... 10000: - cmpr = 45; - break; - default: - cmpr = 60; - break; - } - - return (size * 100) / cmpr; -} -#endif - -#if IS_ENABLED(CONFIG_PSTORE_LZO_COMPRESS) -static int zbufsize_lzo(size_t size) -{ - return lzo1x_worst_compress(size); -} -#endif - -#if IS_ENABLED(CONFIG_PSTORE_LZ4_COMPRESS) || IS_ENABLED(CONFIG_PSTORE_LZ4HC_COMPRESS) -static int zbufsize_lz4(size_t size) -{ - return LZ4_compressBound(size); -} -#endif - -#if IS_ENABLED(CONFIG_PSTORE_842_COMPRESS) -static int zbufsize_842(size_t size) -{ - return size; -} -#endif - -#if IS_ENABLED(CONFIG_PSTORE_ZSTD_COMPRESS) -static int zbufsize_zstd(size_t size) -{ - return zstd_compress_bound(size); -} -#endif - -static const struct pstore_zbackend *zbackend __ro_after_init; - -static const struct pstore_zbackend zbackends[] = { -#if IS_ENABLED(CONFIG_PSTORE_DEFLATE_COMPRESS) - { - .zbufsize = zbufsize_deflate, - .name = "deflate", - }, -#endif -#if IS_ENABLED(CONFIG_PSTORE_LZO_COMPRESS) - { - .zbufsize = zbufsize_lzo, - .name = "lzo", - }, -#endif -#if IS_ENABLED(CONFIG_PSTORE_LZ4_COMPRESS) - { - .zbufsize = zbufsize_lz4, - .name = "lz4", - }, -#endif -#if IS_ENABLED(CONFIG_PSTORE_LZ4HC_COMPRESS) - { - .zbufsize = zbufsize_lz4, - .name = "lz4hc", - }, -#endif -#if IS_ENABLED(CONFIG_PSTORE_842_COMPRESS) - { - .zbufsize = zbufsize_842, - .name = "842", - }, -#endif -#if IS_ENABLED(CONFIG_PSTORE_ZSTD_COMPRESS) - { - .zbufsize = zbufsize_zstd, - .name = "zstd", - }, -#endif - { } -}; - static int pstore_compress(const void *in, void *out, unsigned int inlen, unsigned int outlen) { + struct z_stream_s zstream = { + .next_in = in, + .avail_in = inlen, + .next_out = out, + .avail_out = outlen, + .workspace = compress_workspace, + }; int ret; if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS)) return -EINVAL; - ret = crypto_comp_compress(tfm, in, inlen, out, &outlen); - if (ret) { - pr_err("crypto_comp_compress failed, ret = %d!\n", ret); - return ret; - } + ret = zlib_deflateInit2(&zstream, Z_DEFAULT_COMPRESSION, Z_DEFLATED, + -MAX_WBITS, DEF_MEM_LEVEL, Z_DEFAULT_STRATEGY); + if (ret != Z_OK) + return -EINVAL; + + ret = zlib_deflate(&zstream, Z_FINISH); + if (ret != Z_STREAM_END) + return -EINVAL; + + ret = zlib_deflateEnd(&zstream); + if (ret != Z_OK) + pr_warn_once("zlib_deflateEnd() failed: %d\n", ret); - return outlen; + return zstream.total_out; } static void allocate_buf_for_compression(void) { - struct crypto_comp *ctx; - int size; char *buf; - /* Skip if not built-in or compression backend not selected yet. */ - if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !zbackend) - return; - - /* Skip if no pstore backend yet or compression init already done. */ - if (!psinfo || tfm) - return; - - if (!crypto_has_comp(zbackend->name, 0, 0)) { - pr_err("Unknown compression: %s\n", zbackend->name); + /* Skip if not built-in or compression disabled. */ + if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !compress || + !strcmp(compress, "none")) { + compress = NULL; return; } - size = zbackend->zbufsize(psinfo->bufsize); - if (size <= 0) { - pr_err("Invalid compression size for %s: %d\n", - zbackend->name, size); - return; + if (strcmp(compress, "deflate")) { + pr_err("Unsupported compression '%s', falling back to deflate\n", + compress); + compress = "deflate"; } - buf = kmalloc(size, GFP_KERNEL); + /* + * The compression buffer only needs to be as large as the maximum + * uncompressed record size, since any record that would be expanded by + * compression is just stored uncompressed. + */ + buf = kvzalloc(psinfo->bufsize, GFP_KERNEL); if (!buf) { - pr_err("Failed %d byte compression buffer allocation for: %s\n", - size, zbackend->name); + pr_err("Failed %zu byte compression buffer allocation for: %s\n", + psinfo->bufsize, compress); return; } - ctx = crypto_alloc_comp(zbackend->name, 0, 0); - if (IS_ERR_OR_NULL(ctx)) { - kfree(buf); - pr_err("crypto_alloc_comp('%s') failed: %ld\n", zbackend->name, - PTR_ERR(ctx)); + compress_workspace = + vmalloc(zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL)); + if (!compress_workspace) { + pr_err("Failed to allocate zlib deflate workspace\n"); + kvfree(buf); return; } /* A non-NULL big_oops_buf indicates compression is available. */ - tfm = ctx; - big_oops_buf_sz = size; big_oops_buf = buf; - pr_info("Using crash dump compression: %s\n", zbackend->name); + pr_info("Using crash dump compression: %s\n", compress); } static void free_buf_for_compression(void) { - if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && tfm) { - crypto_free_comp(tfm); - tfm = NULL; + if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && compress_workspace) { + vfree(compress_workspace); + compress_workspace = NULL; } - kfree(big_oops_buf); - big_oops_buf = NULL; - big_oops_buf_sz = 0; -} -/* - * Called when compression fails, since the printk buffer - * would be fetched for compression calling it again when - * compression fails would have moved the iterator of - * printk buffer which results in fetching old contents. - * Copy the recent messages from big_oops_buf to psinfo->buf - */ -static size_t copy_kmsg_to_buffer(int hsize, size_t len) -{ - size_t total_len; - size_t diff; - - total_len = hsize + len; - - if (total_len > psinfo->bufsize) { - diff = total_len - psinfo->bufsize + hsize; - memcpy(psinfo->buf, big_oops_buf, hsize); - memcpy(psinfo->buf + hsize, big_oops_buf + diff, - psinfo->bufsize - hsize); - total_len = psinfo->bufsize; - } else - memcpy(psinfo->buf, big_oops_buf, total_len); - - return total_len; + kvfree(big_oops_buf); + big_oops_buf = NULL; } void pstore_record_init(struct pstore_record *record, @@ -426,13 +304,8 @@ static void pstore_dump(struct kmsg_dumper *dumper, record.part = part; record.buf = psinfo->buf; - if (big_oops_buf) { - dst = big_oops_buf; - dst_size = big_oops_buf_sz; - } else { - dst = psinfo->buf; - dst_size = psinfo->bufsize; - } + dst = big_oops_buf ?: psinfo->buf; + dst_size = psinfo->bufsize; /* Write dump header. */ header_size = snprintf(dst, dst_size, "%s#%d Part%u\n", why, @@ -453,8 +326,8 @@ static void pstore_dump(struct kmsg_dumper *dumper, record.compressed = true; record.size = zipped_len; } else { - record.size = copy_kmsg_to_buffer(header_size, - dump_size); + record.size = header_size + dump_size; + memcpy(psinfo->buf, dst, record.size); } } else { record.size = header_size + dump_size; @@ -549,7 +422,7 @@ static int pstore_write_user_compat(struct pstore_record *record, if (record->buf) return -EINVAL; - record->buf = memdup_user(buf, record->size); + record->buf = vmemdup_user(buf, record->size); if (IS_ERR(record->buf)) { ret = PTR_ERR(record->buf); goto out; @@ -557,7 +430,7 @@ static int pstore_write_user_compat(struct pstore_record *record, ret = record->psi->write(record); - kfree(record->buf); + kvfree(record->buf); out: record->buf = NULL; @@ -681,7 +554,8 @@ void pstore_unregister(struct pstore_info *psi) } EXPORT_SYMBOL_GPL(pstore_unregister); -static void decompress_record(struct pstore_record *record) +static void decompress_record(struct pstore_record *record, + struct z_stream_s *zstream) { int ret; int unzipped_len; @@ -697,40 +571,50 @@ static void decompress_record(struct pstore_record *record) } /* Missing compression buffer means compression was not initialized. */ - if (!big_oops_buf) { + if (!zstream->workspace) { pr_warn("no decompression method initialized!\n"); return; } + ret = zlib_inflateReset(zstream); + if (ret != Z_OK) { + pr_err("zlib_inflateReset() failed, ret = %d!\n", ret); + return; + } + /* Allocate enough space to hold max decompression and ECC. */ - unzipped_len = big_oops_buf_sz; - workspace = kmalloc(unzipped_len + record->ecc_notice_size, - GFP_KERNEL); + workspace = kvzalloc(psinfo->bufsize + record->ecc_notice_size, + GFP_KERNEL); if (!workspace) return; - /* After decompression "unzipped_len" is almost certainly smaller. */ - ret = crypto_comp_decompress(tfm, record->buf, record->size, - workspace, &unzipped_len); - if (ret) { - pr_err("crypto_comp_decompress failed, ret = %d!\n", ret); - kfree(workspace); + zstream->next_in = record->buf; + zstream->avail_in = record->size; + zstream->next_out = workspace; + zstream->avail_out = psinfo->bufsize; + + ret = zlib_inflate(zstream, Z_FINISH); + if (ret != Z_STREAM_END) { + pr_err("zlib_inflate() failed, ret = %d!\n", ret); + kvfree(workspace); return; } + unzipped_len = zstream->total_out; + /* Append ECC notice to decompressed buffer. */ memcpy(workspace + unzipped_len, record->buf + record->size, record->ecc_notice_size); /* Copy decompressed contents into an minimum-sized allocation. */ - unzipped = kmemdup(workspace, unzipped_len + record->ecc_notice_size, - GFP_KERNEL); - kfree(workspace); + unzipped = kvmemdup(workspace, unzipped_len + record->ecc_notice_size, + GFP_KERNEL); + kvfree(workspace); if (!unzipped) return; /* Swap out compressed contents with decompressed contents. */ - kfree(record->buf); + kvfree(record->buf); record->buf = unzipped; record->size = unzipped_len; record->compressed = false; @@ -747,10 +631,17 @@ void pstore_get_backend_records(struct pstore_info *psi, { int failed = 0; unsigned int stop_loop = 65536; + struct z_stream_s zstream = {}; if (!psi || !root) return; + if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && compress) { + zstream.workspace = kvmalloc(zlib_inflate_workspacesize(), + GFP_KERNEL); + zlib_inflateInit2(&zstream, -DEF_WBITS); + } + mutex_lock(&psi->read_mutex); if (psi->open && psi->open(psi)) goto out; @@ -779,11 +670,11 @@ void pstore_get_backend_records(struct pstore_info *psi, break; } - decompress_record(record); + decompress_record(record, &zstream); rc = pstore_mkfile(root, record); if (rc) { /* pstore_mkfile() did not take record, so free it. */ - kfree(record->buf); + kvfree(record->buf); kfree(record->priv); kfree(record); if (rc != -EEXIST || !quiet) @@ -795,6 +686,12 @@ void pstore_get_backend_records(struct pstore_info *psi, out: mutex_unlock(&psi->read_mutex); + if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && compress) { + if (zlib_inflateEnd(&zstream) != Z_OK) + pr_warn("zlib_inflateEnd() failed\n"); + kvfree(zstream.workspace); + } + if (failed) pr_warn("failed to create %d record(s) from '%s'\n", failed, psi->name); @@ -818,34 +715,10 @@ static void pstore_timefunc(struct timer_list *unused) pstore_timer_kick(); } -static void __init pstore_choose_compression(void) -{ - const struct pstore_zbackend *step; - - if (!compress) - return; - - for (step = zbackends; step->name; step++) { - if (!strcmp(compress, step->name)) { - zbackend = step; - return; - } - } -} - static int __init pstore_init(void) { int ret; - pstore_choose_compression(); - - /* - * Check if any pstore backends registered earlier but did not - * initialize compression because crypto was not ready. If so, - * initialize compression now. - */ - allocate_buf_for_compression(); - ret = pstore_init_fs(); if (ret) free_buf_for_compression(); diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 2f625e1fa8d8..d36702c7ab3c 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -20,6 +20,7 @@ #include <linux/compiler.h> #include <linux/of.h> #include <linux/of_address.h> +#include <linux/mm.h> #include "internal.h" #include "ram_internal.h" @@ -268,7 +269,7 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record) /* ECC correction notice */ record->ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0); - record->buf = kmalloc(size + record->ecc_notice_size + 1, GFP_KERNEL); + record->buf = kvzalloc(size + record->ecc_notice_size + 1, GFP_KERNEL); if (record->buf == NULL) { size = -ENOMEM; goto out; @@ -282,7 +283,7 @@ static ssize_t ramoops_pstore_read(struct pstore_record *record) out: if (free_prz) { - kfree(prz->old_log); + kvfree(prz->old_log); kfree(prz); } @@ -833,7 +834,7 @@ static int ramoops_probe(struct platform_device *pdev) */ if (cxt->pstore.flags & PSTORE_FLAGS_DMESG) { cxt->pstore.bufsize = cxt->dprzs[0]->buffer_size; - cxt->pstore.buf = kzalloc(cxt->pstore.bufsize, GFP_KERNEL); + cxt->pstore.buf = kvzalloc(cxt->pstore.bufsize, GFP_KERNEL); if (!cxt->pstore.buf) { pr_err("cannot allocate pstore crash dump buffer\n"); err = -ENOMEM; @@ -866,7 +867,7 @@ static int ramoops_probe(struct platform_device *pdev) return 0; fail_buf: - kfree(cxt->pstore.buf); + kvfree(cxt->pstore.buf); fail_clear: cxt->pstore.bufsize = 0; fail_init: @@ -881,7 +882,7 @@ static void ramoops_remove(struct platform_device *pdev) pstore_unregister(&cxt->pstore); - kfree(cxt->pstore.buf); + kvfree(cxt->pstore.buf); cxt->pstore.bufsize = 0; ramoops_free_przs(cxt); diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 85aaf0fc6d7d..650e437b55e6 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -17,6 +17,7 @@ #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> +#include <linux/mm.h> #include <asm/page.h> #include "ram_internal.h" @@ -24,12 +25,10 @@ /** * struct persistent_ram_buffer - persistent circular RAM buffer * - * @sig: - * signature to indicate header (PERSISTENT_RAM_SIG xor PRZ-type value) - * @start: - * offset into @data where the beginning of the stored bytes begin - * @size: - * number of valid bytes stored in @data + * @sig: Signature to indicate header (PERSISTENT_RAM_SIG xor PRZ-type value) + * @start: First valid byte in the buffer. + * @size: Number of valid bytes in the buffer. + * @data: The contents of the buffer. */ struct persistent_ram_buffer { uint32_t sig; @@ -301,7 +300,7 @@ void persistent_ram_save_old(struct persistent_ram_zone *prz) if (!prz->old_log) { persistent_ram_ecc_old(prz); - prz->old_log = kmalloc(size, GFP_KERNEL); + prz->old_log = kvzalloc(size, GFP_KERNEL); } if (!prz->old_log) { pr_err("failed to allocate buffer\n"); @@ -385,7 +384,7 @@ void *persistent_ram_old(struct persistent_ram_zone *prz) void persistent_ram_free_old(struct persistent_ram_zone *prz) { - kfree(prz->old_log); + kvfree(prz->old_log); prz->old_log = NULL; prz->old_log_size = 0; } @@ -519,7 +518,7 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, sig ^= PERSISTENT_RAM_SIG; if (prz->buffer->sig == sig) { - if (buffer_size(prz) == 0) { + if (buffer_size(prz) == 0 && buffer_start(prz) == 0) { pr_debug("found existing empty buffer\n"); return 0; } diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 391ea402920d..a7171f5532a1 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -305,8 +305,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino) inode->i_mtime.tv_nsec = 0; inode->i_atime.tv_sec = le32_to_cpu(raw_inode->di_atime); inode->i_atime.tv_nsec = 0; - inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->di_ctime); - inode->i_ctime.tv_nsec = 0; + inode_set_ctime(inode, le32_to_cpu(raw_inode->di_ctime), 0); inode->i_blocks = le32_to_cpu(raw_inode->di_first_xtnt.xtnt_size); memcpy(qnx4_inode, raw_inode, QNX4_DIR_ENTRY_SIZE); diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 85b2fa3b211c..21f90d519f1a 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -562,8 +562,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino) inode->i_mtime.tv_nsec = 0; inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->di_atime); inode->i_atime.tv_nsec = 0; - inode->i_ctime.tv_sec = fs32_to_cpu(sbi, raw_inode->di_ctime); - inode->i_ctime.tv_nsec = 0; + inode_set_ctime(inode, fs32_to_cpu(sbi, raw_inode->di_ctime), 0); /* calc blocks based on 512 byte blocksize */ inode->i_blocks = (inode->i_size + 511) >> 9; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index e3e4f4047657..4d826c369da2 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2367,7 +2367,7 @@ int dquot_load_quota_sb(struct super_block *sb, int type, int format_id, if (!fmt) return -ESRCH; - if (!sb->s_op->quota_write || !sb->s_op->quota_read || + if (!sb->dq_op || !sb->s_qcop || (type == PRJQUOTA && sb->dq_op->get_projid == NULL)) { error = -EINVAL; goto out_fmt; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index fef477c78107..18e8387cab41 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -65,7 +65,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, inode->i_mapping->a_ops = &ram_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); mapping_set_unevictable(inode->i_mapping); - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); @@ -105,7 +105,7 @@ ramfs_mknod(struct mnt_idmap *idmap, struct inode *dir, d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ error = 0; - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); } return error; } @@ -138,7 +138,7 @@ static int ramfs_symlink(struct mnt_idmap *idmap, struct inode *dir, if (!error) { d_instantiate(dentry, inode); dget(dentry); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); } else iput(inode); } diff --git a/fs/read_write.c b/fs/read_write.c index b07de77ef126..4771701c896b 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -71,7 +71,7 @@ EXPORT_SYMBOL(vfs_setpos); * @file: file structure to seek on * @offset: file offset to seek to * @whence: type of seek - * @size: max size of this file in file system + * @maxsize: max size of this file in file system * @eof: offset used for SEEK_END position * * This is a variant of generic_file_llseek that allows passing in a custom diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 77bd3b27059f..86e55d4bb10d 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1259,9 +1259,8 @@ static void init_inode(struct inode *inode, struct treepath *path) inode->i_size = sd_v1_size(sd); inode->i_atime.tv_sec = sd_v1_atime(sd); inode->i_mtime.tv_sec = sd_v1_mtime(sd); - inode->i_ctime.tv_sec = sd_v1_ctime(sd); + inode_set_ctime(inode, sd_v1_ctime(sd), 0); inode->i_atime.tv_nsec = 0; - inode->i_ctime.tv_nsec = 0; inode->i_mtime.tv_nsec = 0; inode->i_blocks = sd_v1_blocks(sd); @@ -1314,8 +1313,7 @@ static void init_inode(struct inode *inode, struct treepath *path) i_gid_write(inode, sd_v2_gid(sd)); inode->i_mtime.tv_sec = sd_v2_mtime(sd); inode->i_atime.tv_sec = sd_v2_atime(sd); - inode->i_ctime.tv_sec = sd_v2_ctime(sd); - inode->i_ctime.tv_nsec = 0; + inode_set_ctime(inode, sd_v2_ctime(sd), 0); inode->i_mtime.tv_nsec = 0; inode->i_atime.tv_nsec = 0; inode->i_blocks = sd_v2_blocks(sd); @@ -1374,7 +1372,7 @@ static void inode2sd(void *sd, struct inode *inode, loff_t size) set_sd_v2_gid(sd_v2, i_gid_read(inode)); set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec); set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec); - set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec); + set_sd_v2_ctime(sd_v2, inode_get_ctime(inode).tv_sec); set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE)); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev)); @@ -1394,7 +1392,7 @@ static void inode2sd_v1(void *sd, struct inode *inode, loff_t size) set_sd_v1_nlink(sd_v1, inode->i_nlink); set_sd_v1_size(sd_v1, size); set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec); - set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec); + set_sd_v1_ctime(sd_v1, inode_get_ctime(inode).tv_sec); set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) @@ -1986,7 +1984,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, /* uid and gid must already be set by the caller for quota init */ - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); inode->i_size = i_size; inode->i_blocks = 0; inode->i_bytes = 0; diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 6bf9b54e58ca..dd33f8cc6eda 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -55,7 +55,7 @@ int reiserfs_fileattr_set(struct mnt_idmap *idmap, } sd_attrs_to_i_attrs(flags, inode); REISERFS_I(inode)->i_attrs = flags; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); err = 0; unlock: @@ -107,7 +107,7 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) err = -EFAULT; goto setversion_out; } - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); setversion_out: mnt_drop_write_file(filp); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 479aa4a57602..015bfe4e4524 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2326,7 +2326,7 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev, int i, j; bh = __getblk(dev, block, bufsize); - if (buffer_uptodate(bh)) + if (!bh || buffer_uptodate(bh)) return (bh); if (block + BUFNR > max_block) { @@ -2336,6 +2336,8 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev, j = 1; for (i = 1; i < blocks; i++) { bh = __getblk(dev, block + i, bufsize); + if (!bh) + break; if (buffer_uptodate(bh)) { brelse(bh); break; diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 52240cc891cf..9c5704be2435 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -572,7 +572,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, } dir->i_size += paste_size; - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); if (!S_ISDIR(inode->i_mode) && visible) /* reiserfs_mkdir or reiserfs_rename will do that by itself */ reiserfs_update_sd(th, dir); @@ -966,7 +966,8 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) inode->i_nlink); clear_nlink(inode); - inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_to_ts(dir, + inode_set_ctime_current(inode)); reiserfs_update_sd(&th, inode); DEC_DIR_INODE_NLINK(dir) @@ -1070,11 +1071,11 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) inc_nlink(inode); goto end_unlink; } - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); reiserfs_update_sd(&th, inode); dir->i_size -= (de.de_entrylen + DEH_SIZE); - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); reiserfs_update_sd(&th, dir); if (!savelink) @@ -1250,7 +1251,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, return err ? err : retval; } - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); reiserfs_update_sd(&th, inode); ihold(inode); @@ -1325,7 +1326,6 @@ static int reiserfs_rename(struct mnt_idmap *idmap, int jbegin_count; umode_t old_inode_mode; unsigned long savelink = 1; - struct timespec64 ctime; if (flags & ~RENAME_NOREPLACE) return -EINVAL; @@ -1576,14 +1576,11 @@ static int reiserfs_rename(struct mnt_idmap *idmap, mark_de_hidden(old_de.de_deh + old_de.de_entry_num); journal_mark_dirty(&th, old_de.de_bh); - ctime = current_time(old_dir); - old_dir->i_ctime = old_dir->i_mtime = ctime; - new_dir->i_ctime = new_dir->i_mtime = ctime; /* * thanks to Alex Adriaanse <alex_a@caltech.edu> for patch * which adds ctime update of renamed object */ - old_inode->i_ctime = ctime; + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); if (new_dentry_inode) { /* adjust link number of the victim */ @@ -1592,7 +1589,6 @@ static int reiserfs_rename(struct mnt_idmap *idmap, } else { drop_nlink(new_dentry_inode); } - new_dentry_inode->i_ctime = ctime; savelink = new_dentry_inode->i_nlink; } diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index ce5003986789..3676e02a0232 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -2004,7 +2004,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, if (update_timestamps) { inode->i_mtime = current_time(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); } reiserfs_update_sd(th, inode); @@ -2029,7 +2029,7 @@ update_and_out: if (update_timestamps) { /* this is truncate, not file closing */ inode->i_mtime = current_time(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); } reiserfs_update_sd(th, inode); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 929acce6e731..7eaf36b3de12 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2587,7 +2587,7 @@ out: return err; if (inode->i_size < off + len - towrite) i_size_write(inode, off + len - towrite); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); mark_inode_dirty(inode); return len - towrite; } diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 651027967159..6000964c2b80 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -466,12 +466,13 @@ int reiserfs_commit_write(struct file *f, struct page *page, static void update_ctime(struct inode *inode) { struct timespec64 now = current_time(inode); + struct timespec64 ctime = inode_get_ctime(inode); if (inode_unhashed(inode) || !inode->i_nlink || - timespec64_equal(&inode->i_ctime, &now)) + timespec64_equal(&ctime, &now)) return; - inode->i_ctime = current_time(inode); + inode_set_ctime_to_ts(inode, now); mark_inode_dirty(inode); } diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 138060452678..064264992b49 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -285,7 +285,7 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, if (error == -ENODATA) { error = 0; if (type == ACL_TYPE_ACCESS) { - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); } } diff --git a/fs/romfs/super.c b/fs/romfs/super.c index c59b230d55b4..5c35f6c76037 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -322,8 +322,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) set_nlink(i, 1); /* Hard to decide.. */ i->i_size = be32_to_cpu(ri.size); - i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0; - i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; + i->i_mtime = i->i_atime = inode_set_ctime(i, 0, 0); /* set up mode and ops */ mode = romfs_modemap[nextfh & ROMFH_TYPE]; @@ -583,16 +582,18 @@ static int romfs_init_fs_context(struct fs_context *fc) */ static void romfs_kill_sb(struct super_block *sb) { + generic_shutdown_super(sb); + #ifdef CONFIG_ROMFS_ON_MTD if (sb->s_mtd) { - kill_mtd_super(sb); - return; + put_mtd_device(sb->s_mtd); + sb->s_mtd = NULL; } #endif #ifdef CONFIG_ROMFS_ON_BLOCK if (sb->s_bdev) { - kill_block_super(sb); - return; + sync_blockdev(sb->s_bdev); + blkdev_put(sb->s_bdev, sb); } #endif } diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index a4d8b0ea1c8c..6fc8f43b1c9d 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -1077,7 +1077,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence) } static int -cifs_setlease(struct file *file, long arg, struct file_lock **lease, void **priv) +cifs_setlease(struct file *file, int arg, struct file_lock **lease, void **priv) { /* * Note that this is called by vfs setlease with i_lock held to diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 6bc44f79d2e9..2108b3b40ce9 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -1085,7 +1085,7 @@ int cifs_close(struct inode *inode, struct file *file) !test_bit(CIFS_INO_CLOSE_ON_LOCK, &cinode->flags) && dclose) { if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) { - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); } spin_lock(&cinode->deferred_lock); cifs_add_deferred_close(cfile, dclose); @@ -2596,7 +2596,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) write_data, to - from, &offset); cifsFileInfo_put(open_file); /* Does mm or vfs already set times? */ - inode->i_atime = inode->i_mtime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); if ((bytes_written > 0) && (offset)) rc = 0; else if (bytes_written < 0) diff --git a/fs/smb/client/fscache.h b/fs/smb/client/fscache.h index 173999610997..84f3b09367d2 100644 --- a/fs/smb/client/fscache.h +++ b/fs/smb/client/fscache.h @@ -50,12 +50,13 @@ void cifs_fscache_fill_coherency(struct inode *inode, struct cifs_fscache_inode_coherency_data *cd) { struct cifsInodeInfo *cifsi = CIFS_I(inode); + struct timespec64 ctime = inode_get_ctime(inode); memset(cd, 0, sizeof(*cd)); cd->last_write_time_sec = cpu_to_le64(cifsi->netfs.inode.i_mtime.tv_sec); cd->last_write_time_nsec = cpu_to_le32(cifsi->netfs.inode.i_mtime.tv_nsec); - cd->last_change_time_sec = cpu_to_le64(cifsi->netfs.inode.i_ctime.tv_sec); - cd->last_change_time_nsec = cpu_to_le32(cifsi->netfs.inode.i_ctime.tv_nsec); + cd->last_change_time_sec = cpu_to_le64(ctime.tv_sec); + cd->last_change_time_nsec = cpu_to_le32(ctime.tv_nsec); } diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index c3eeae07e139..93fe43789d7a 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -172,7 +172,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) else inode->i_atime = fattr->cf_atime; inode->i_mtime = fattr->cf_mtime; - inode->i_ctime = fattr->cf_ctime; + inode_set_ctime_to_ts(inode, fattr->cf_ctime); inode->i_rdev = fattr->cf_rdev; cifs_nlink_fattr_to_inode(inode, fattr); inode->i_uid = fattr->cf_uid; @@ -1744,9 +1744,9 @@ out_reval: cifs_inode = CIFS_I(inode); cifs_inode->time = 0; /* will force revalidate to get info when needed */ - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); } - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); cifs_inode = CIFS_I(dir); CIFS_I(dir)->time = 0; /* force revalidate of dir as well */ unlink_out: @@ -2060,8 +2060,8 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) */ cifsInode->time = 0; - d_inode(direntry)->i_ctime = inode->i_ctime = inode->i_mtime = - current_time(inode); + inode_set_ctime_current(d_inode(direntry)); + inode->i_mtime = inode_set_ctime_current(inode); rmdir_exit: free_dentry_path(page); @@ -2267,8 +2267,8 @@ unlink_target: /* force revalidate to go get info when needed */ CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0; - source_dir->i_ctime = source_dir->i_mtime = target_dir->i_ctime = - target_dir->i_mtime = current_time(source_dir); + source_dir->i_mtime = target_dir->i_mtime = inode_set_ctime_to_ts(source_dir, + inode_set_ctime_current(target_dir)); cifs_rename_exit: kfree(info_buf_source); @@ -2540,7 +2540,7 @@ int cifs_getattr(struct mnt_idmap *idmap, const struct path *path, return rc; } - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->blksize = cifs_sb->ctx->bsize; stat->ino = CIFS_I(inode)->uniqueid; diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 0f62bc373ad0..182e2e879ecf 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -1396,7 +1396,8 @@ smb2_close_getattr(const unsigned int xid, struct cifs_tcon *tcon, if (file_inf.LastWriteTime) inode->i_mtime = cifs_NTtimeToUnix(file_inf.LastWriteTime); if (file_inf.ChangeTime) - inode->i_ctime = cifs_NTtimeToUnix(file_inf.ChangeTime); + inode_set_ctime_to_ts(inode, + cifs_NTtimeToUnix(file_inf.ChangeTime)); if (file_inf.LastAccessTime) inode->i_atime = cifs_NTtimeToUnix(file_inf.LastAccessTime); diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 7cc1b0c47d0a..a947c18915c2 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -4402,8 +4402,8 @@ static int get_file_basic_info(struct smb2_query_info_rsp *rsp, } basic_info = (struct smb2_file_basic_info *)rsp->Buffer; - generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp), - &stat); + generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, + file_inode(fp->filp), &stat); basic_info->CreationTime = cpu_to_le64(fp->create_time); time = ksmbd_UnixTimeToNT(stat.atime); basic_info->LastAccessTime = cpu_to_le64(time); @@ -4428,7 +4428,7 @@ static void get_file_standard_info(struct smb2_query_info_rsp *rsp, struct kstat stat; inode = file_inode(fp->filp); - generic_fillattr(file_mnt_idmap(fp->filp), inode, &stat); + generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, inode, &stat); sinfo = (struct smb2_file_standard_info *)rsp->Buffer; delete_pending = ksmbd_inode_pending_delete(fp); @@ -4482,7 +4482,7 @@ static int get_file_all_info(struct ksmbd_work *work, return PTR_ERR(filename); inode = file_inode(fp->filp); - generic_fillattr(file_mnt_idmap(fp->filp), inode, &stat); + generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, inode, &stat); ksmbd_debug(SMB, "filename = %s\n", filename); delete_pending = ksmbd_inode_pending_delete(fp); @@ -4559,8 +4559,8 @@ static void get_file_stream_info(struct ksmbd_work *work, int buf_free_len; struct smb2_query_info_req *req = ksmbd_req_buf_next(work); - generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp), - &stat); + generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, + file_inode(fp->filp), &stat); file_info = (struct smb2_file_stream_info *)rsp->Buffer; buf_free_len = @@ -4650,8 +4650,8 @@ static void get_file_internal_info(struct smb2_query_info_rsp *rsp, struct smb2_file_internal_info *file_info; struct kstat stat; - generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp), - &stat); + generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, + file_inode(fp->filp), &stat); file_info = (struct smb2_file_internal_info *)rsp->Buffer; file_info->IndexNumber = cpu_to_le64(stat.ino); rsp->OutputBufferLength = @@ -4676,7 +4676,7 @@ static int get_file_network_open_info(struct smb2_query_info_rsp *rsp, file_info = (struct smb2_file_ntwrk_info *)rsp->Buffer; inode = file_inode(fp->filp); - generic_fillattr(file_mnt_idmap(fp->filp), inode, &stat); + generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, inode, &stat); file_info->CreationTime = cpu_to_le64(fp->create_time); time = ksmbd_UnixTimeToNT(stat.atime); @@ -4737,8 +4737,8 @@ static void get_file_compression_info(struct smb2_query_info_rsp *rsp, struct smb2_file_comp_info *file_info; struct kstat stat; - generic_fillattr(file_mnt_idmap(fp->filp), file_inode(fp->filp), - &stat); + generic_fillattr(file_mnt_idmap(fp->filp), STATX_BASIC_STATS, + file_inode(fp->filp), &stat); file_info = (struct smb2_file_comp_info *)rsp->Buffer; file_info->CompressedFileSize = cpu_to_le64(stat.blocks << 9); @@ -4790,7 +4790,7 @@ static int find_file_posix_info(struct smb2_query_info_rsp *rsp, file_info->LastAccessTime = cpu_to_le64(time); time = ksmbd_UnixTimeToNT(inode->i_mtime); file_info->LastWriteTime = cpu_to_le64(time); - time = ksmbd_UnixTimeToNT(inode->i_ctime); + time = ksmbd_UnixTimeToNT(inode_get_ctime(inode)); file_info->ChangeTime = cpu_to_le64(time); file_info->DosAttributes = fp->f_ci->m_fattr; file_info->Inode = cpu_to_le64(inode->i_ino); @@ -5433,7 +5433,7 @@ int smb2_close(struct ksmbd_work *work) rsp->LastAccessTime = cpu_to_le64(time); time = ksmbd_UnixTimeToNT(inode->i_mtime); rsp->LastWriteTime = cpu_to_le64(time); - time = ksmbd_UnixTimeToNT(inode->i_ctime); + time = ksmbd_UnixTimeToNT(inode_get_ctime(inode)); rsp->ChangeTime = cpu_to_le64(time); ksmbd_fd_put(work, fp); } else { @@ -5656,7 +5656,7 @@ static int set_file_basic_info(struct ksmbd_file *fp, if (file_info->ChangeTime) attrs.ia_ctime = ksmbd_NTtimeToUnix(file_info->ChangeTime); else - attrs.ia_ctime = inode->i_ctime; + attrs.ia_ctime = inode_get_ctime(inode); if (file_info->LastWriteTime) { attrs.ia_mtime = ksmbd_NTtimeToUnix(file_info->LastWriteTime); @@ -5701,7 +5701,7 @@ static int set_file_basic_info(struct ksmbd_file *fp, return -EACCES; inode_lock(inode); - inode->i_ctime = attrs.ia_ctime; + inode_set_ctime_to_ts(inode, attrs.ia_ctime); attrs.ia_valid &= ~ATTR_CTIME; rc = notify_change(idmap, dentry, &attrs, NULL); inode_unlock(inode); diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 3d5d652153a5..d48756a339a5 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -1659,7 +1659,8 @@ int ksmbd_vfs_fill_dentry_attrs(struct ksmbd_work *work, u64 time; int rc; - generic_fillattr(idmap, d_inode(dentry), ksmbd_kstat->kstat); + generic_fillattr(idmap, STATX_BASIC_STATS, d_inode(dentry), + ksmbd_kstat->kstat); time = ksmbd_UnixTimeToNT(ksmbd_kstat->kstat->ctime); ksmbd_kstat->create_time = time; diff --git a/fs/splice.c b/fs/splice.c index 3e2a31e1ce6a..02631013b09f 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -120,17 +120,17 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - struct page *page = buf->page; + struct folio *folio = page_folio(buf->page); int err; - if (!PageUptodate(page)) { - lock_page(page); + if (!folio_test_uptodate(folio)) { + folio_lock(folio); /* - * Page got truncated/unhashed. This will cause a 0-byte + * Folio got truncated/unhashed. This will cause a 0-byte * splice, if this is the first page. */ - if (!page->mapping) { + if (!folio->mapping) { err = -ENODATA; goto error; } @@ -138,20 +138,18 @@ static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, /* * Uh oh, read-error from disk. */ - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { err = -EIO; goto error; } - /* - * Page is ok afterall, we are done. - */ - unlock_page(page); + /* Folio is ok after all, we are done */ + folio_unlock(folio); } return 0; error: - unlock_page(page); + folio_unlock(folio); return err; } @@ -1269,10 +1267,8 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out, if ((in->f_flags | out->f_flags) & O_NONBLOCK) flags |= SPLICE_F_NONBLOCK; - return splice_pipe_to_pipe(ipipe, opipe, len, flags); - } - - if (ipipe) { + ret = splice_pipe_to_pipe(ipipe, opipe, len, flags); + } else if (ipipe) { if (off_in) return -ESPIPE; if (off_out) { @@ -1297,18 +1293,11 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out, ret = do_splice_from(ipipe, out, &offset, len, flags); file_end_write(out); - if (ret > 0) - fsnotify_modify(out); - if (!off_out) out->f_pos = offset; else *off_out = offset; - - return ret; - } - - if (opipe) { + } else if (opipe) { if (off_out) return -ESPIPE; if (off_in) { @@ -1324,18 +1313,25 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out, ret = splice_file_to_pipe(in, opipe, &offset, len, flags); - if (ret > 0) - fsnotify_access(in); - if (!off_in) in->f_pos = offset; else *off_in = offset; + } else { + ret = -EINVAL; + } - return ret; + if (ret > 0) { + /* + * Generate modify out before access in: + * do_splice_from() may've already sent modify out, + * and this ensures the events get merged. + */ + fsnotify_modify(out); + fsnotify_access(in); } - return -EINVAL; + return ret; } static long __do_splice(struct file *in, loff_t __user *off_in, @@ -1464,6 +1460,9 @@ static long vmsplice_to_user(struct file *file, struct iov_iter *iter, pipe_unlock(pipe); } + if (ret > 0) + fsnotify_access(file); + return ret; } @@ -1493,8 +1492,10 @@ static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter, if (!ret) ret = iter_to_pipe(iter, pipe, buf_flag); pipe_unlock(pipe); - if (ret > 0) + if (ret > 0) { wakeup_pipe_readers(pipe); + fsnotify_modify(file); + } return ret; } @@ -1928,6 +1929,11 @@ long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags) } } + if (ret > 0) { + fsnotify_access(in); + fsnotify_modify(out); + } + return ret; } diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index 24463145b351..c6e626b00546 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -61,7 +61,7 @@ static int squashfs_new_inode(struct super_block *sb, struct inode *inode, inode->i_ino = le32_to_cpu(sqsh_ino->inode_number); inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime); inode->i_atime.tv_sec = inode->i_mtime.tv_sec; - inode->i_ctime.tv_sec = inode->i_mtime.tv_sec; + inode_set_ctime(inode, inode->i_mtime.tv_sec, 0); inode->i_mode = le16_to_cpu(sqsh_ino->mode); inode->i_size = 0; diff --git a/fs/stack.c b/fs/stack.c index c9830924eb12..b5e01bdb5f5f 100644 --- a/fs/stack.c +++ b/fs/stack.c @@ -68,7 +68,7 @@ void fsstack_copy_attr_all(struct inode *dest, const struct inode *src) dest->i_rdev = src->i_rdev; dest->i_atime = src->i_atime; dest->i_mtime = src->i_mtime; - dest->i_ctime = src->i_ctime; + inode_set_ctime_to_ts(dest, inode_get_ctime(src)); dest->i_blkbits = src->i_blkbits; dest->i_flags = src->i_flags; set_nlink(dest, src->i_nlink); diff --git a/fs/stat.c b/fs/stat.c index 7c238da22ef0..136711ae72fb 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -27,10 +27,42 @@ #include "mount.h" /** + * fill_mg_cmtime - Fill in the mtime and ctime and flag ctime as QUERIED + * @stat: where to store the resulting values + * @request_mask: STATX_* values requested + * @inode: inode from which to grab the c/mtime + * + * Given @inode, grab the ctime and mtime out if it and store the result + * in @stat. When fetching the value, flag it as queried so the next write + * will use a fine-grained timestamp. + */ +void fill_mg_cmtime(struct kstat *stat, u32 request_mask, struct inode *inode) +{ + atomic_long_t *pnsec = (atomic_long_t *)&inode->__i_ctime.tv_nsec; + + /* If neither time was requested, then don't report them */ + if (!(request_mask & (STATX_CTIME|STATX_MTIME))) { + stat->result_mask &= ~(STATX_CTIME|STATX_MTIME); + return; + } + + stat->mtime = inode->i_mtime; + stat->ctime.tv_sec = inode->__i_ctime.tv_sec; + /* + * Atomically set the QUERIED flag and fetch the new value with + * the flag masked off. + */ + stat->ctime.tv_nsec = atomic_long_fetch_or(I_CTIME_QUERIED, pnsec) & + ~I_CTIME_QUERIED; +} +EXPORT_SYMBOL(fill_mg_cmtime); + +/** * generic_fillattr - Fill in the basic attributes from the inode struct - * @idmap: idmap of the mount the inode was found from - * @inode: Inode to use as the source - * @stat: Where to fill in the attributes + * @idmap: idmap of the mount the inode was found from + * @request_mask: statx request_mask + * @inode: Inode to use as the source + * @stat: Where to fill in the attributes * * Fill in the basic attributes in the kstat structure from data that's to be * found on the VFS inode structure. This is the default if no getattr inode @@ -42,8 +74,8 @@ * uid and gid filds. On non-idmapped mounts or if permission checking is to be * performed on the raw inode simply passs @nop_mnt_idmap. */ -void generic_fillattr(struct mnt_idmap *idmap, struct inode *inode, - struct kstat *stat) +void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask, + struct inode *inode, struct kstat *stat) { vfsuid_t vfsuid = i_uid_into_vfsuid(idmap, inode); vfsgid_t vfsgid = i_gid_into_vfsgid(idmap, inode); @@ -57,10 +89,22 @@ void generic_fillattr(struct mnt_idmap *idmap, struct inode *inode, stat->rdev = inode->i_rdev; stat->size = i_size_read(inode); stat->atime = inode->i_atime; - stat->mtime = inode->i_mtime; - stat->ctime = inode->i_ctime; + + if (is_mgtime(inode)) { + fill_mg_cmtime(stat, request_mask, inode); + } else { + stat->mtime = inode->i_mtime; + stat->ctime = inode_get_ctime(inode); + } + stat->blksize = i_blocksize(inode); stat->blocks = inode->i_blocks; + + if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) { + stat->result_mask |= STATX_CHANGE_COOKIE; + stat->change_cookie = inode_query_iversion(inode); + } + } EXPORT_SYMBOL(generic_fillattr); @@ -123,17 +167,12 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, stat->attributes_mask |= (STATX_ATTR_AUTOMOUNT | STATX_ATTR_DAX); - if ((request_mask & STATX_CHANGE_COOKIE) && IS_I_VERSION(inode)) { - stat->result_mask |= STATX_CHANGE_COOKIE; - stat->change_cookie = inode_query_iversion(inode); - } - idmap = mnt_idmap(path->mnt); if (inode->i_op->getattr) return inode->i_op->getattr(idmap, path, stat, request_mask, query_flags); - generic_fillattr(idmap, inode, stat); + generic_fillattr(idmap, request_mask, inode, stat); return 0; } EXPORT_SYMBOL(vfs_getattr_nosec); diff --git a/fs/super.c b/fs/super.c index e781226e2880..692654c2af36 100644 --- a/fs/super.c +++ b/fs/super.c @@ -39,7 +39,7 @@ #include <uapi/linux/mount.h> #include "internal.h" -static int thaw_super_locked(struct super_block *sb); +static int thaw_super_locked(struct super_block *sb, enum freeze_holder who); static LIST_HEAD(super_blocks); static DEFINE_SPINLOCK(sb_lock); @@ -50,6 +50,130 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = { "sb_internal", }; +static inline void __super_lock(struct super_block *sb, bool excl) +{ + if (excl) + down_write(&sb->s_umount); + else + down_read(&sb->s_umount); +} + +static inline void super_unlock(struct super_block *sb, bool excl) +{ + if (excl) + up_write(&sb->s_umount); + else + up_read(&sb->s_umount); +} + +static inline void __super_lock_excl(struct super_block *sb) +{ + __super_lock(sb, true); +} + +static inline void super_unlock_excl(struct super_block *sb) +{ + super_unlock(sb, true); +} + +static inline void super_unlock_shared(struct super_block *sb) +{ + super_unlock(sb, false); +} + +static inline bool wait_born(struct super_block *sb) +{ + unsigned int flags; + + /* + * Pairs with smp_store_release() in super_wake() and ensures + * that we see SB_BORN or SB_DYING after we're woken. + */ + flags = smp_load_acquire(&sb->s_flags); + return flags & (SB_BORN | SB_DYING); +} + +/** + * super_lock - wait for superblock to become ready and lock it + * @sb: superblock to wait for + * @excl: whether exclusive access is required + * + * If the superblock has neither passed through vfs_get_tree() or + * generic_shutdown_super() yet wait for it to happen. Either superblock + * creation will succeed and SB_BORN is set by vfs_get_tree() or we're + * woken and we'll see SB_DYING. + * + * The caller must have acquired a temporary reference on @sb->s_count. + * + * Return: This returns true if SB_BORN was set, false if SB_DYING was + * set. The function acquires s_umount and returns with it held. + */ +static __must_check bool super_lock(struct super_block *sb, bool excl) +{ + + lockdep_assert_not_held(&sb->s_umount); + +relock: + __super_lock(sb, excl); + + /* + * Has gone through generic_shutdown_super() in the meantime. + * @sb->s_root is NULL and @sb->s_active is 0. No one needs to + * grab a reference to this. Tell them so. + */ + if (sb->s_flags & SB_DYING) + return false; + + /* Has called ->get_tree() successfully. */ + if (sb->s_flags & SB_BORN) + return true; + + super_unlock(sb, excl); + + /* wait until the superblock is ready or dying */ + wait_var_event(&sb->s_flags, wait_born(sb)); + + /* + * Neither SB_BORN nor SB_DYING are ever unset so we never loop. + * Just reacquire @sb->s_umount for the caller. + */ + goto relock; +} + +/* wait and acquire read-side of @sb->s_umount */ +static inline bool super_lock_shared(struct super_block *sb) +{ + return super_lock(sb, false); +} + +/* wait and acquire write-side of @sb->s_umount */ +static inline bool super_lock_excl(struct super_block *sb) +{ + return super_lock(sb, true); +} + +/* wake waiters */ +#define SUPER_WAKE_FLAGS (SB_BORN | SB_DYING | SB_DEAD) +static void super_wake(struct super_block *sb, unsigned int flag) +{ + WARN_ON_ONCE((flag & ~SUPER_WAKE_FLAGS)); + WARN_ON_ONCE(hweight32(flag & SUPER_WAKE_FLAGS) > 1); + + /* + * Pairs with smp_load_acquire() in super_lock() to make sure + * all initializations in the superblock are seen by the user + * seeing SB_BORN sent. + */ + smp_store_release(&sb->s_flags, sb->s_flags | flag); + /* + * Pairs with the barrier in prepare_to_wait_event() to make sure + * ___wait_var_event() either sees SB_BORN set or + * waitqueue_active() check in wake_up_var() sees the waiter. + */ + smp_mb(); + wake_up_var(&sb->s_flags); +} + /* * One thing we have to be careful of with a per-sb shrinker is that we don't * drop the last active reference to the superblock from within the shrinker. @@ -76,7 +200,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink, if (!(sc->gfp_mask & __GFP_FS)) return SHRINK_STOP; - if (!trylock_super(sb)) + if (!super_trylock_shared(sb)) return SHRINK_STOP; if (sb->s_op->nr_cached_objects) @@ -110,7 +234,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink, freed += sb->s_op->free_cached_objects(sb, sc); } - up_read(&sb->s_umount); + super_unlock_shared(sb); return freed; } @@ -123,17 +247,17 @@ static unsigned long super_cache_count(struct shrinker *shrink, sb = container_of(shrink, struct super_block, s_shrink); /* - * We don't call trylock_super() here as it is a scalability bottleneck, - * so we're exposed to partial setup state. The shrinker rwsem does not - * protect filesystem operations backing list_lru_shrink_count() or - * s_op->nr_cached_objects(). Counts can change between - * super_cache_count and super_cache_scan, so we really don't need locks - * here. + * We don't call super_trylock_shared() here as it is a scalability + * bottleneck, so we're exposed to partial setup state. The shrinker + * rwsem does not protect filesystem operations backing + * list_lru_shrink_count() or s_op->nr_cached_objects(). Counts can + * change between super_cache_count and super_cache_scan, so we really + * don't need locks here. * * However, if we are currently mounting the superblock, the underlying * filesystem might be in a state of partial construction and hence it - * is dangerous to access it. trylock_super() uses a SB_BORN check to - * avoid this situation, so do the same here. The memory barrier is + * is dangerous to access it. super_trylock_shared() uses a SB_BORN check + * to avoid this situation, so do the same here. The memory barrier is * matched with the one in mount_fs() as we don't hold locks here. */ if (!(sb->s_flags & SB_BORN)) @@ -176,7 +300,7 @@ static void destroy_unused_super(struct super_block *s) { if (!s) return; - up_write(&s->s_umount); + super_unlock_excl(s); list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); security_sb_free(s); @@ -337,10 +461,29 @@ void deactivate_locked_super(struct super_block *s) list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); + /* + * Remove it from @fs_supers so it isn't found by new + * sget{_fc}() walkers anymore. Any concurrent mounter still + * managing to grab a temporary reference is guaranteed to + * already see SB_DYING and will wait until we notify them about + * SB_DEAD. + */ + spin_lock(&sb_lock); + hlist_del_init(&s->s_instances); + spin_unlock(&sb_lock); + + /* + * Let concurrent mounts know that this thing is really dead. + * We don't need @sb->s_umount here as every concurrent caller + * will see SB_DYING and either discard the superblock or wait + * for SB_DEAD. + */ + super_wake(s, SB_DEAD); + put_filesystem(fs); put_super(s); } else { - up_write(&s->s_umount); + super_unlock_excl(s); } } @@ -357,7 +500,7 @@ EXPORT_SYMBOL(deactivate_locked_super); void deactivate_super(struct super_block *s) { if (!atomic_add_unless(&s->s_active, -1, 1)) { - down_write(&s->s_umount); + __super_lock_excl(s); deactivate_locked_super(s); } } @@ -379,20 +522,61 @@ EXPORT_SYMBOL(deactivate_super); */ static int grab_super(struct super_block *s) __releases(sb_lock) { + bool born; + s->s_count++; spin_unlock(&sb_lock); - down_write(&s->s_umount); - if ((s->s_flags & SB_BORN) && atomic_inc_not_zero(&s->s_active)) { + born = super_lock_excl(s); + if (born && atomic_inc_not_zero(&s->s_active)) { put_super(s); return 1; } - up_write(&s->s_umount); + super_unlock_excl(s); put_super(s); return 0; } +static inline bool wait_dead(struct super_block *sb) +{ + unsigned int flags; + + /* + * Pairs with memory barrier in super_wake() and ensures + * that we see SB_DEAD after we're woken. + */ + flags = smp_load_acquire(&sb->s_flags); + return flags & SB_DEAD; +} + +/** + * grab_super_dead - acquire an active reference to a superblock + * @sb: superblock to acquire + * + * Acquire a temporary reference on a superblock and try to trade it for + * an active reference. This is used in sget{_fc}() to wait for a + * superblock to either become SB_BORN or for it to pass through + * sb->kill() and be marked as SB_DEAD. + * + * Return: This returns true if an active reference could be acquired, + * false if not. + */ +static bool grab_super_dead(struct super_block *sb) +{ + + sb->s_count++; + if (grab_super(sb)) { + put_super(sb); + lockdep_assert_held(&sb->s_umount); + return true; + } + wait_var_event(&sb->s_flags, wait_dead(sb)); + put_super(sb); + lockdep_assert_not_held(&sb->s_umount); + return false; +} + /* - * trylock_super - try to grab ->s_umount shared + * super_trylock_shared - try to grab ->s_umount shared * @sb: reference we are trying to grab * * Try to prevent fs shutdown. This is used in places where we @@ -408,13 +592,13 @@ static int grab_super(struct super_block *s) __releases(sb_lock) * of down_read(). There's a couple of places that are OK with that, but * it's very much not a general-purpose interface. */ -bool trylock_super(struct super_block *sb) +bool super_trylock_shared(struct super_block *sb) { if (down_read_trylock(&sb->s_umount)) { - if (!hlist_unhashed(&sb->s_instances) && - sb->s_root && (sb->s_flags & SB_BORN)) + if (!(sb->s_flags & SB_DYING) && sb->s_root && + (sb->s_flags & SB_BORN)) return true; - up_read(&sb->s_umount); + super_unlock_shared(sb); } return false; @@ -439,13 +623,13 @@ bool trylock_super(struct super_block *sb) void retire_super(struct super_block *sb) { WARN_ON(!sb->s_bdev); - down_write(&sb->s_umount); + __super_lock_excl(sb); if (sb->s_iflags & SB_I_PERSB_BDI) { bdi_unregister(sb->s_bdi); sb->s_iflags &= ~SB_I_PERSB_BDI; } sb->s_iflags |= SB_I_RETIRED; - up_write(&sb->s_umount); + super_unlock_excl(sb); } EXPORT_SYMBOL(retire_super); @@ -517,11 +701,17 @@ void generic_shutdown_super(struct super_block *sb) spin_unlock(&sb->s_inode_list_lock); } } - spin_lock(&sb_lock); - /* should be initialized for __put_super_and_need_restart() */ - hlist_del_init(&sb->s_instances); - spin_unlock(&sb_lock); - up_write(&sb->s_umount); + /* + * Broadcast to everyone that grabbed a temporary reference to this + * superblock before we removed it from @fs_supers that the superblock + * is dying. Every walker of @fs_supers outside of sget{_fc}() will now + * discard this superblock and treat it as dead. + * + * We leave the superblock on @fs_supers so it can be found by + * sget{_fc}() until we passed sb->kill_sb(). + */ + super_wake(sb, SB_DYING); + super_unlock_excl(sb); if (sb->s_bdi != &noop_backing_dev_info) { if (sb->s_iflags & SB_I_PERSB_BDI) bdi_unregister(sb->s_bdi); @@ -546,17 +736,31 @@ bool mount_capable(struct fs_context *fc) * @test: Comparison callback * @set: Setup callback * - * Find or create a superblock using the parameters stored in the filesystem - * context and the two callback functions. + * Create a new superblock or find an existing one. + * + * The @test callback is used to find a matching existing superblock. + * Whether or not the requested parameters in @fc are taken into account + * is specific to the @test callback that is used. They may even be + * completely ignored. + * + * If an extant superblock is matched, it will be returned unless: * - * If an extant superblock is matched, then that will be returned with an - * elevated reference count that the caller must transfer or discard. + * (1) the namespace the filesystem context @fc and the extant + * superblock's namespace differ + * + * (2) the filesystem context @fc has requested that reusing an extant + * superblock is not allowed + * + * In both cases EBUSY will be returned. * * If no match is made, a new superblock will be allocated and basic - * initialisation will be performed (s_type, s_fs_info and s_id will be set and - * the set() callback will be invoked), the superblock will be published and it - * will be returned in a partially constructed state with SB_BORN and SB_ACTIVE - * as yet unset. + * initialisation will be performed (s_type, s_fs_info and s_id will be + * set and the @set callback will be invoked), the superblock will be + * published and it will be returned in a partially constructed state + * with SB_BORN and SB_ACTIVE as yet unset. + * + * Return: On success, an extant or newly created superblock is + * returned. On failure an error pointer is returned. */ struct super_block *sget_fc(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), @@ -595,6 +799,11 @@ retry: s->s_type = fc->fs_type; s->s_iflags |= fc->s_iflags; strscpy(s->s_id, s->s_type->name, sizeof(s->s_id)); + /* + * Make the superblock visible on @super_blocks and @fs_supers. + * It's in a nascent state and users should wait on SB_BORN or + * SB_DYING to be set. + */ list_add_tail(&s->s_list, &super_blocks); hlist_add_head(&s->s_instances, &s->s_type->fs_supers); spin_unlock(&sb_lock); @@ -603,12 +812,16 @@ retry: return s; share_extant_sb: - if (user_ns != old->s_user_ns) { + if (user_ns != old->s_user_ns || fc->exclusive) { spin_unlock(&sb_lock); destroy_unused_super(s); + if (fc->exclusive) + warnfc(fc, "reusing existing filesystem not allowed"); + else + warnfc(fc, "reusing existing filesystem in another namespace not allowed"); return ERR_PTR(-EBUSY); } - if (!grab_super(old)) + if (!grab_super_dead(old)) goto retry; destroy_unused_super(s); return old; @@ -652,7 +865,7 @@ retry: destroy_unused_super(s); return ERR_PTR(-EBUSY); } - if (!grab_super(old)) + if (!grab_super_dead(old)) goto retry; destroy_unused_super(s); return old; @@ -685,7 +898,7 @@ EXPORT_SYMBOL(sget); void drop_super(struct super_block *sb) { - up_read(&sb->s_umount); + super_unlock_shared(sb); put_super(sb); } @@ -693,7 +906,7 @@ EXPORT_SYMBOL(drop_super); void drop_super_exclusive(struct super_block *sb) { - up_write(&sb->s_umount); + super_unlock_excl(sb); put_super(sb); } EXPORT_SYMBOL(drop_super_exclusive); @@ -704,7 +917,8 @@ static void __iterate_supers(void (*f)(struct super_block *)) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - if (hlist_unhashed(&sb->s_instances)) + /* Pairs with memory marrier in super_wake(). */ + if (smp_load_acquire(&sb->s_flags) & SB_DYING) continue; sb->s_count++; spin_unlock(&sb_lock); @@ -734,15 +948,15 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - if (hlist_unhashed(&sb->s_instances)) - continue; + bool born; + sb->s_count++; spin_unlock(&sb_lock); - down_read(&sb->s_umount); - if (sb->s_root && (sb->s_flags & SB_BORN)) + born = super_lock_shared(sb); + if (born && sb->s_root) f(sb, arg); - up_read(&sb->s_umount); + super_unlock_shared(sb); spin_lock(&sb_lock); if (p) @@ -770,13 +984,15 @@ void iterate_supers_type(struct file_system_type *type, spin_lock(&sb_lock); hlist_for_each_entry(sb, &type->fs_supers, s_instances) { + bool born; + sb->s_count++; spin_unlock(&sb_lock); - down_read(&sb->s_umount); - if (sb->s_root && (sb->s_flags & SB_BORN)) + born = super_lock_shared(sb); + if (born && sb->s_root) f(sb, arg); - up_read(&sb->s_umount); + super_unlock_shared(sb); spin_lock(&sb_lock); if (p) @@ -791,43 +1007,6 @@ void iterate_supers_type(struct file_system_type *type, EXPORT_SYMBOL(iterate_supers_type); /** - * get_super - get the superblock of a device - * @bdev: device to get the superblock for - * - * Scans the superblock list and finds the superblock of the file system - * mounted on the device given. %NULL is returned if no match is found. - */ -struct super_block *get_super(struct block_device *bdev) -{ - struct super_block *sb; - - if (!bdev) - return NULL; - - spin_lock(&sb_lock); -rescan: - list_for_each_entry(sb, &super_blocks, s_list) { - if (hlist_unhashed(&sb->s_instances)) - continue; - if (sb->s_bdev == bdev) { - sb->s_count++; - spin_unlock(&sb_lock); - down_read(&sb->s_umount); - /* still alive? */ - if (sb->s_root && (sb->s_flags & SB_BORN)) - return sb; - up_read(&sb->s_umount); - /* nope, got unmounted */ - spin_lock(&sb_lock); - __put_super(sb); - goto rescan; - } - } - spin_unlock(&sb_lock); - return NULL; -} - -/** * get_active_super - get an active reference to the superblock of a device * @bdev: device to get the superblock for * @@ -842,15 +1021,12 @@ struct super_block *get_active_super(struct block_device *bdev) if (!bdev) return NULL; -restart: spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - if (hlist_unhashed(&sb->s_instances)) - continue; if (sb->s_bdev == bdev) { if (!grab_super(sb)) - goto restart; - up_write(&sb->s_umount); + return NULL; + super_unlock_excl(sb); return sb; } } @@ -863,28 +1039,21 @@ struct super_block *user_get_super(dev_t dev, bool excl) struct super_block *sb; spin_lock(&sb_lock); -rescan: list_for_each_entry(sb, &super_blocks, s_list) { - if (hlist_unhashed(&sb->s_instances)) - continue; if (sb->s_dev == dev) { + bool born; + sb->s_count++; spin_unlock(&sb_lock); - if (excl) - down_write(&sb->s_umount); - else - down_read(&sb->s_umount); /* still alive? */ - if (sb->s_root && (sb->s_flags & SB_BORN)) + born = super_lock(sb, excl); + if (born && sb->s_root) return sb; - if (excl) - up_write(&sb->s_umount); - else - up_read(&sb->s_umount); + super_unlock(sb, excl); /* nope, got unmounted */ spin_lock(&sb_lock); __put_super(sb); - goto rescan; + break; } } spin_unlock(&sb_lock); @@ -926,9 +1095,9 @@ int reconfigure_super(struct fs_context *fc) if (remount_ro) { if (!hlist_empty(&sb->s_pins)) { - up_write(&sb->s_umount); + super_unlock_excl(sb); group_pin_kill(&sb->s_pins); - down_write(&sb->s_umount); + __super_lock_excl(sb); if (!sb->s_root) return 0; if (sb->s_writers.frozen != SB_UNFROZEN) @@ -991,9 +1160,9 @@ cancel_readonly: static void do_emergency_remount_callback(struct super_block *sb) { - down_write(&sb->s_umount); - if (sb->s_root && sb->s_bdev && (sb->s_flags & SB_BORN) && - !sb_rdonly(sb)) { + bool born = super_lock_excl(sb); + + if (born && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) { struct fs_context *fc; fc = fs_context_for_reconfigure(sb->s_root, @@ -1004,7 +1173,7 @@ static void do_emergency_remount_callback(struct super_block *sb) put_fs_context(fc); } } - up_write(&sb->s_umount); + super_unlock_excl(sb); } static void do_emergency_remount(struct work_struct *work) @@ -1027,12 +1196,13 @@ void emergency_remount(void) static void do_thaw_all_callback(struct super_block *sb) { - down_write(&sb->s_umount); - if (sb->s_root && sb->s_flags & SB_BORN) { + bool born = super_lock_excl(sb); + + if (born && sb->s_root) { emergency_thaw_bdev(sb); - thaw_super_locked(sb); + thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE); } else { - up_write(&sb->s_umount); + super_unlock_excl(sb); } } @@ -1136,7 +1306,7 @@ static int test_single_super(struct super_block *s, struct fs_context *fc) return 1; } -static int vfs_get_super(struct fs_context *fc, bool reconf, +static int vfs_get_super(struct fs_context *fc, int (*test)(struct super_block *, struct fs_context *), int (*fill_super)(struct super_block *sb, struct fs_context *fc)) @@ -1154,19 +1324,9 @@ static int vfs_get_super(struct fs_context *fc, bool reconf, goto error; sb->s_flags |= SB_ACTIVE; - fc->root = dget(sb->s_root); - } else { - fc->root = dget(sb->s_root); - if (reconf) { - err = reconfigure_super(fc); - if (err < 0) { - dput(fc->root); - fc->root = NULL; - goto error; - } - } } + fc->root = dget(sb->s_root); return 0; error: @@ -1178,7 +1338,7 @@ int get_tree_nodev(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { - return vfs_get_super(fc, false, NULL, fill_super); + return vfs_get_super(fc, NULL, fill_super); } EXPORT_SYMBOL(get_tree_nodev); @@ -1186,54 +1346,81 @@ int get_tree_single(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc)) { - return vfs_get_super(fc, false, test_single_super, fill_super); + return vfs_get_super(fc, test_single_super, fill_super); } EXPORT_SYMBOL(get_tree_single); -int get_tree_single_reconf(struct fs_context *fc, - int (*fill_super)(struct super_block *sb, - struct fs_context *fc)) -{ - return vfs_get_super(fc, true, test_single_super, fill_super); -} -EXPORT_SYMBOL(get_tree_single_reconf); - int get_tree_keyed(struct fs_context *fc, int (*fill_super)(struct super_block *sb, struct fs_context *fc), void *key) { fc->s_fs_info = key; - return vfs_get_super(fc, false, test_keyed_super, fill_super); + return vfs_get_super(fc, test_keyed_super, fill_super); } EXPORT_SYMBOL(get_tree_keyed); #ifdef CONFIG_BLOCK -static void fs_mark_dead(struct block_device *bdev) +/* + * Lock a super block that the callers holds a reference to. + * + * The caller needs to ensure that the super_block isn't being freed while + * calling this function, e.g. by holding a lock over the call to this function + * and the place that clears the pointer to the superblock used by this function + * before freeing the superblock. + */ +static bool super_lock_shared_active(struct super_block *sb) { - struct super_block *sb; + bool born = super_lock_shared(sb); + + if (!born || !sb->s_root || !(sb->s_flags & SB_ACTIVE)) { + super_unlock_shared(sb); + return false; + } + return true; +} + +static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise) +{ + struct super_block *sb = bdev->bd_holder; + + /* bd_holder_lock ensures that the sb isn't freed */ + lockdep_assert_held(&bdev->bd_holder_lock); - sb = get_super(bdev); - if (!sb) + if (!super_lock_shared_active(sb)) return; + if (!surprise) + sync_filesystem(sb); + shrink_dcache_sb(sb); + invalidate_inodes(sb); if (sb->s_op->shutdown) sb->s_op->shutdown(sb); - drop_super(sb); + + super_unlock_shared(sb); +} + +static void fs_bdev_sync(struct block_device *bdev) +{ + struct super_block *sb = bdev->bd_holder; + + lockdep_assert_held(&bdev->bd_holder_lock); + + if (!super_lock_shared_active(sb)) + return; + sync_filesystem(sb); + super_unlock_shared(sb); } -static const struct blk_holder_ops fs_holder_ops = { - .mark_dead = fs_mark_dead, +const struct blk_holder_ops fs_holder_ops = { + .mark_dead = fs_bdev_mark_dead, + .sync = fs_bdev_sync, }; +EXPORT_SYMBOL_GPL(fs_holder_ops); static int set_bdev_super(struct super_block *s, void *data) { - s->s_bdev = data; - s->s_dev = s->s_bdev->bd_dev; - s->s_bdi = bdi_get(s->s_bdev->bd_disk->bdi); - - if (bdev_stable_writes(s->s_bdev)) - s->s_iflags |= SB_I_STABLE_WRITES; + s->s_dev = *(dev_t *)data; return 0; } @@ -1244,8 +1431,63 @@ static int set_bdev_super_fc(struct super_block *s, struct fs_context *fc) static int test_bdev_super_fc(struct super_block *s, struct fs_context *fc) { - return !(s->s_iflags & SB_I_RETIRED) && s->s_bdev == fc->sget_key; + return !(s->s_iflags & SB_I_RETIRED) && + s->s_dev == *(dev_t *)fc->sget_key; +} + +int setup_bdev_super(struct super_block *sb, int sb_flags, + struct fs_context *fc) +{ + blk_mode_t mode = sb_open_mode(sb_flags); + struct block_device *bdev; + + bdev = blkdev_get_by_dev(sb->s_dev, mode, sb, &fs_holder_ops); + if (IS_ERR(bdev)) { + if (fc) + errorf(fc, "%s: Can't open blockdev", fc->source); + return PTR_ERR(bdev); + } + + /* + * This really should be in blkdev_get_by_dev, but right now can't due + * to legacy issues that require us to allow opening a block device node + * writable from userspace even for a read-only block device. + */ + if ((mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) { + blkdev_put(bdev, sb); + return -EACCES; + } + + /* + * Until SB_BORN flag is set, there can be no active superblock + * references and thus no filesystem freezing. get_active_super() will + * just loop waiting for SB_BORN so even freeze_bdev() cannot proceed. + * + * It is enough to check bdev was not frozen before we set s_bdev. + */ + mutex_lock(&bdev->bd_fsfreeze_mutex); + if (bdev->bd_fsfreeze_count > 0) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + if (fc) + warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); + blkdev_put(bdev, sb); + return -EBUSY; + } + spin_lock(&sb_lock); + sb->s_bdev = bdev; + sb->s_bdi = bdi_get(bdev->bd_disk->bdi); + if (bdev_stable_writes(bdev)) + sb->s_iflags |= SB_I_STABLE_WRITES; + spin_unlock(&sb_lock); + mutex_unlock(&bdev->bd_fsfreeze_mutex); + + snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); + shrinker_debugfs_rename(&sb->s_shrink, "sb-%s:%s", sb->s_type->name, + sb->s_id); + sb_set_blocksize(sb, block_size(bdev)); + return 0; } +EXPORT_SYMBOL_GPL(setup_bdev_super); /** * get_tree_bdev - Get a superblock based on a single block device @@ -1256,73 +1498,49 @@ int get_tree_bdev(struct fs_context *fc, int (*fill_super)(struct super_block *, struct fs_context *)) { - struct block_device *bdev; struct super_block *s; int error = 0; + dev_t dev; if (!fc->source) return invalf(fc, "No source specified"); - bdev = blkdev_get_by_path(fc->source, sb_open_mode(fc->sb_flags), - fc->fs_type, &fs_holder_ops); - if (IS_ERR(bdev)) { - errorf(fc, "%s: Can't open blockdev", fc->source); - return PTR_ERR(bdev); - } - - /* Once the superblock is inserted into the list by sget_fc(), s_umount - * will protect the lockfs code from trying to start a snapshot while - * we are mounting - */ - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (bdev->bd_fsfreeze_count > 0) { - mutex_unlock(&bdev->bd_fsfreeze_mutex); - warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev); - blkdev_put(bdev, fc->fs_type); - return -EBUSY; + error = lookup_bdev(fc->source, &dev); + if (error) { + errorf(fc, "%s: Can't lookup blockdev", fc->source); + return error; } fc->sb_flags |= SB_NOSEC; - fc->sget_key = bdev; + fc->sget_key = &dev; s = sget_fc(fc, test_bdev_super_fc, set_bdev_super_fc); - mutex_unlock(&bdev->bd_fsfreeze_mutex); - if (IS_ERR(s)) { - blkdev_put(bdev, fc->fs_type); + if (IS_ERR(s)) return PTR_ERR(s); - } if (s->s_root) { /* Don't summarily change the RO/RW state. */ if ((fc->sb_flags ^ s->s_flags) & SB_RDONLY) { - warnf(fc, "%pg: Can't mount, would change RO state", bdev); + warnf(fc, "%pg: Can't mount, would change RO state", s->s_bdev); deactivate_locked_super(s); - blkdev_put(bdev, fc->fs_type); return -EBUSY; } - + } else { /* - * s_umount nests inside open_mutex during - * __invalidate_device(). blkdev_put() acquires - * open_mutex and can't be called under s_umount. Drop - * s_umount temporarily. This is safe as we're - * holding an active reference. + * We drop s_umount here because we need to open the bdev and + * bdev->open_mutex ranks above s_umount (blkdev_put() -> + * bdev_mark_dead()). It is safe because we have active sb + * reference and SB_BORN is not set yet. */ - up_write(&s->s_umount); - blkdev_put(bdev, fc->fs_type); - down_write(&s->s_umount); - } else { - snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); - shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", - fc->fs_type->name, s->s_id); - sb_set_blocksize(s, block_size(bdev)); - error = fill_super(s, fc); + super_unlock_excl(s); + error = setup_bdev_super(s, fc->sb_flags, fc); + __super_lock_excl(s); + if (!error) + error = fill_super(s, fc); if (error) { deactivate_locked_super(s); return error; } - s->s_flags |= SB_ACTIVE; - bdev->bd_super = s; } BUG_ON(fc->root); @@ -1333,79 +1551,52 @@ EXPORT_SYMBOL(get_tree_bdev); static int test_bdev_super(struct super_block *s, void *data) { - return !(s->s_iflags & SB_I_RETIRED) && (void *)s->s_bdev == data; + return !(s->s_iflags & SB_I_RETIRED) && s->s_dev == *(dev_t *)data; } struct dentry *mount_bdev(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, int (*fill_super)(struct super_block *, void *, int)) { - struct block_device *bdev; struct super_block *s; - int error = 0; + int error; + dev_t dev; - bdev = blkdev_get_by_path(dev_name, sb_open_mode(flags), fs_type, - &fs_holder_ops); - if (IS_ERR(bdev)) - return ERR_CAST(bdev); + error = lookup_bdev(dev_name, &dev); + if (error) + return ERR_PTR(error); - /* - * once the super is inserted into the list by sget, s_umount - * will protect the lockfs code from trying to start a snapshot - * while we are mounting - */ - mutex_lock(&bdev->bd_fsfreeze_mutex); - if (bdev->bd_fsfreeze_count > 0) { - mutex_unlock(&bdev->bd_fsfreeze_mutex); - error = -EBUSY; - goto error_bdev; - } - s = sget(fs_type, test_bdev_super, set_bdev_super, flags | SB_NOSEC, - bdev); - mutex_unlock(&bdev->bd_fsfreeze_mutex); + flags |= SB_NOSEC; + s = sget(fs_type, test_bdev_super, set_bdev_super, flags, &dev); if (IS_ERR(s)) - goto error_s; + return ERR_CAST(s); if (s->s_root) { if ((flags ^ s->s_flags) & SB_RDONLY) { deactivate_locked_super(s); - error = -EBUSY; - goto error_bdev; + return ERR_PTR(-EBUSY); } - + } else { /* - * s_umount nests inside open_mutex during - * __invalidate_device(). blkdev_put() acquires - * open_mutex and can't be called under s_umount. Drop - * s_umount temporarily. This is safe as we're - * holding an active reference. + * We drop s_umount here because we need to open the bdev and + * bdev->open_mutex ranks above s_umount (blkdev_put() -> + * bdev_mark_dead()). It is safe because we have active sb + * reference and SB_BORN is not set yet. */ - up_write(&s->s_umount); - blkdev_put(bdev, fs_type); - down_write(&s->s_umount); - } else { - snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); - shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", - fs_type->name, s->s_id); - sb_set_blocksize(s, block_size(bdev)); - error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); + super_unlock_excl(s); + error = setup_bdev_super(s, flags, NULL); + __super_lock_excl(s); + if (!error) + error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (error) { deactivate_locked_super(s); - goto error; + return ERR_PTR(error); } s->s_flags |= SB_ACTIVE; - bdev->bd_super = s; } return dget(s->s_root); - -error_s: - error = PTR_ERR(s); -error_bdev: - blkdev_put(bdev, fs_type); -error: - return ERR_PTR(error); } EXPORT_SYMBOL(mount_bdev); @@ -1413,10 +1604,11 @@ void kill_block_super(struct super_block *sb) { struct block_device *bdev = sb->s_bdev; - bdev->bd_super = NULL; generic_shutdown_super(sb); - sync_blockdev(bdev); - blkdev_put(bdev, sb->s_type); + if (bdev) { + sync_blockdev(bdev); + blkdev_put(bdev, sb); + } } EXPORT_SYMBOL(kill_block_super); @@ -1533,13 +1725,13 @@ int vfs_get_tree(struct fs_context *fc) WARN_ON(!sb->s_bdi); /* - * Write barrier is for super_cache_count(). We place it before setting - * SB_BORN as the data dependency between the two functions is the - * superblock structure contents that we just set up, not the SB_BORN - * flag. + * super_wake() contains a memory barrier which also care of + * ordering for super_cache_count(). We place it before setting + * SB_BORN as the data dependency between the two functions is + * the superblock structure contents that we just set up, not + * the SB_BORN flag. */ - smp_wmb(); - sb->s_flags |= SB_BORN; + super_wake(sb, SB_BORN); error = security_sb_set_mnt_opts(sb, fc->security, 0, NULL); if (unlikely(error)) { @@ -1644,14 +1836,43 @@ static void sb_freeze_unlock(struct super_block *sb, int level) percpu_up_write(sb->s_writers.rw_sem + level); } +static int wait_for_partially_frozen(struct super_block *sb) +{ + int ret = 0; + + do { + unsigned short old = sb->s_writers.frozen; + + up_write(&sb->s_umount); + ret = wait_var_event_killable(&sb->s_writers.frozen, + sb->s_writers.frozen != old); + down_write(&sb->s_umount); + } while (ret == 0 && + sb->s_writers.frozen != SB_UNFROZEN && + sb->s_writers.frozen != SB_FREEZE_COMPLETE); + + return ret; +} + /** * freeze_super - lock the filesystem and force it into a consistent state * @sb: the super to lock + * @who: context that wants to freeze * * Syncs the super to make sure the filesystem is consistent and calls the fs's - * freeze_fs. Subsequent calls to this without first thawing the fs will return + * freeze_fs. Subsequent calls to this without first thawing the fs may return * -EBUSY. * + * @who should be: + * * %FREEZE_HOLDER_USERSPACE if userspace wants to freeze the fs; + * * %FREEZE_HOLDER_KERNEL if the kernel wants to freeze the fs. + * + * The @who argument distinguishes between the kernel and userspace trying to + * freeze the filesystem. Although there cannot be multiple kernel freezes or + * multiple userspace freezes in effect at any given time, the kernel and + * userspace can both hold a filesystem frozen. The filesystem remains frozen + * until there are no kernel or userspace freezes in effect. + * * During this function, sb->s_writers.frozen goes through these values: * * SB_UNFROZEN: File system is normal, all writes progress as usual. @@ -1677,34 +1898,62 @@ static void sb_freeze_unlock(struct super_block *sb, int level) * * sb->s_writers.frozen is protected by sb->s_umount. */ -int freeze_super(struct super_block *sb) +int freeze_super(struct super_block *sb, enum freeze_holder who) { int ret; atomic_inc(&sb->s_active); - down_write(&sb->s_umount); + if (!super_lock_excl(sb)) + WARN(1, "Dying superblock while freezing!"); + +retry: + if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) { + if (sb->s_writers.freeze_holders & who) { + deactivate_locked_super(sb); + return -EBUSY; + } + + WARN_ON(sb->s_writers.freeze_holders == 0); + + /* + * Someone else already holds this type of freeze; share the + * freeze and assign the active ref to the freeze. + */ + sb->s_writers.freeze_holders |= who; + super_unlock_excl(sb); + return 0; + } + if (sb->s_writers.frozen != SB_UNFROZEN) { - deactivate_locked_super(sb); - return -EBUSY; + ret = wait_for_partially_frozen(sb); + if (ret) { + deactivate_locked_super(sb); + return ret; + } + + goto retry; } if (!(sb->s_flags & SB_BORN)) { - up_write(&sb->s_umount); + super_unlock_excl(sb); return 0; /* sic - it's "nothing to do" */ } if (sb_rdonly(sb)) { /* Nothing to do really... */ + sb->s_writers.freeze_holders |= who; sb->s_writers.frozen = SB_FREEZE_COMPLETE; - up_write(&sb->s_umount); + wake_up_var(&sb->s_writers.frozen); + super_unlock_excl(sb); return 0; } sb->s_writers.frozen = SB_FREEZE_WRITE; /* Release s_umount to preserve sb_start_write -> s_umount ordering */ - up_write(&sb->s_umount); + super_unlock_excl(sb); sb_wait_write(sb, SB_FREEZE_WRITE); - down_write(&sb->s_umount); + if (!super_lock_excl(sb)) + WARN(1, "Dying superblock while freezing!"); /* Now we go and block page faults... */ sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; @@ -1715,6 +1964,7 @@ int freeze_super(struct super_block *sb) if (ret) { sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_PAGEFAULT); + wake_up_var(&sb->s_writers.frozen); deactivate_locked_super(sb); return ret; } @@ -1730,6 +1980,7 @@ int freeze_super(struct super_block *sb) "VFS:Filesystem freeze failed\n"); sb->s_writers.frozen = SB_UNFROZEN; sb_freeze_unlock(sb, SB_FREEZE_FS); + wake_up_var(&sb->s_writers.frozen); deactivate_locked_super(sb); return ret; } @@ -1738,24 +1989,50 @@ int freeze_super(struct super_block *sb) * For debugging purposes so that fs can warn if it sees write activity * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super(). */ + sb->s_writers.freeze_holders |= who; sb->s_writers.frozen = SB_FREEZE_COMPLETE; + wake_up_var(&sb->s_writers.frozen); lockdep_sb_freeze_release(sb); - up_write(&sb->s_umount); + super_unlock_excl(sb); return 0; } EXPORT_SYMBOL(freeze_super); -static int thaw_super_locked(struct super_block *sb) +/* + * Undoes the effect of a freeze_super_locked call. If the filesystem is + * frozen both by userspace and the kernel, a thaw call from either source + * removes that state without releasing the other state or unlocking the + * filesystem. + */ +static int thaw_super_locked(struct super_block *sb, enum freeze_holder who) { int error; - if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) { - up_write(&sb->s_umount); + if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) { + if (!(sb->s_writers.freeze_holders & who)) { + super_unlock_excl(sb); + return -EINVAL; + } + + /* + * Freeze is shared with someone else. Release our hold and + * drop the active ref that freeze_super assigned to the + * freezer. + */ + if (sb->s_writers.freeze_holders & ~who) { + sb->s_writers.freeze_holders &= ~who; + deactivate_locked_super(sb); + return 0; + } + } else { + super_unlock_excl(sb); return -EINVAL; } if (sb_rdonly(sb)) { + sb->s_writers.freeze_holders &= ~who; sb->s_writers.frozen = SB_UNFROZEN; + wake_up_var(&sb->s_writers.frozen); goto out; } @@ -1764,15 +2041,16 @@ static int thaw_super_locked(struct super_block *sb) if (sb->s_op->unfreeze_fs) { error = sb->s_op->unfreeze_fs(sb); if (error) { - printk(KERN_ERR - "VFS:Filesystem thaw failed\n"); + printk(KERN_ERR "VFS:Filesystem thaw failed\n"); lockdep_sb_freeze_release(sb); - up_write(&sb->s_umount); + super_unlock_excl(sb); return error; } } + sb->s_writers.freeze_holders &= ~who; sb->s_writers.frozen = SB_UNFROZEN; + wake_up_var(&sb->s_writers.frozen); sb_freeze_unlock(sb, SB_FREEZE_FS); out: deactivate_locked_super(sb); @@ -1782,13 +2060,20 @@ out: /** * thaw_super -- unlock filesystem * @sb: the super to thaw + * @who: context that wants to freeze + * + * Unlocks the filesystem and marks it writeable again after freeze_super() + * if there are no remaining freezes on the filesystem. * - * Unlocks the filesystem and marks it writeable again after freeze_super(). + * @who should be: + * * %FREEZE_HOLDER_USERSPACE if userspace wants to thaw the fs; + * * %FREEZE_HOLDER_KERNEL if the kernel wants to thaw the fs. */ -int thaw_super(struct super_block *sb) +int thaw_super(struct super_block *sb, enum freeze_holder who) { - down_write(&sb->s_umount); - return thaw_super_locked(sb); + if (!super_lock_excl(sb)) + WARN(1, "Dying superblock while thawing!"); + return thaw_super_locked(sb, who); } EXPORT_SYMBOL(thaw_super); diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 0140010aa0c3..2f5ead88d00b 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -224,7 +224,7 @@ got_it: memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2); de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); dir_commit_chunk(page, pos, SYSV_DIRSIZE); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); err = sysv_handle_dirsync(dir); out_page: @@ -249,7 +249,7 @@ int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page) } de->inode = 0; dir_commit_chunk(page, pos, SYSV_DIRSIZE); - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); mark_inode_dirty(inode); return sysv_handle_dirsync(inode); } @@ -346,7 +346,7 @@ int sysv_set_link(struct sysv_dir_entry *de, struct page *page, } de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); dir_commit_chunk(page, pos, SYSV_DIRSIZE); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); return sysv_handle_dirsync(inode); } diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c index e732879036ab..6719da5889d9 100644 --- a/fs/sysv/ialloc.c +++ b/fs/sysv/ialloc.c @@ -165,7 +165,7 @@ struct inode * sysv_new_inode(const struct inode * dir, umode_t mode) dirty_sb(sb); inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_ino = fs16_to_cpu(sbi, ino); - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); inode->i_blocks = 0; memset(SYSV_I(inode)->i_data, 0, sizeof(SYSV_I(inode)->i_data)); SYSV_I(inode)->i_dir_start_lookup = 0; diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 9e8d4a6fb2f3..0aa3827d8178 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -202,8 +202,7 @@ struct inode *sysv_iget(struct super_block *sb, unsigned int ino) inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size); inode->i_atime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_atime); inode->i_mtime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_mtime); - inode->i_ctime.tv_sec = fs32_to_cpu(sbi, raw_inode->i_ctime); - inode->i_ctime.tv_nsec = 0; + inode_set_ctime(inode, fs32_to_cpu(sbi, raw_inode->i_ctime), 0); inode->i_atime.tv_nsec = 0; inode->i_mtime.tv_nsec = 0; inode->i_blocks = 0; @@ -256,7 +255,7 @@ static int __sysv_write_inode(struct inode *inode, int wait) raw_inode->i_size = cpu_to_fs32(sbi, inode->i_size); raw_inode->i_atime = cpu_to_fs32(sbi, inode->i_atime.tv_sec); raw_inode->i_mtime = cpu_to_fs32(sbi, inode->i_mtime.tv_sec); - raw_inode->i_ctime = cpu_to_fs32(sbi, inode->i_ctime.tv_sec); + raw_inode->i_ctime = cpu_to_fs32(sbi, inode_get_ctime(inode).tv_sec); si = SYSV_I(inode); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index 58d7f43a1371..edb94e55de8e 100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -183,7 +183,7 @@ static inline int splice_branch(struct inode *inode, *where->p = where->key; write_unlock(&pointers_lock); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); /* had we spliced it onto indirect block? */ if (where->bh) @@ -423,7 +423,7 @@ do_indirects: } n++; } - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); if (IS_SYNC(inode)) sysv_sync_inode (inode); else @@ -449,7 +449,8 @@ int sysv_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct super_block *s = path->dentry->d_sb; - generic_fillattr(&nop_mnt_idmap, d_inode(path->dentry), stat); + generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry), + stat); stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size); stat->blksize = s->s_blocksize; return 0; diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index fcf163fea3ad..d6b73798071b 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -103,7 +103,7 @@ static int sysv_link(struct dentry * old_dentry, struct inode * dir, { struct inode *inode = d_inode(old_dentry); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_link_count(inode); ihold(inode); @@ -161,7 +161,7 @@ static int sysv_unlink(struct inode * dir, struct dentry * dentry) err = sysv_delete_entry(de, page); if (!err) { - inode->i_ctime = dir->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); inode_dec_link_count(inode); } unmap_and_put_page(page, de); @@ -230,7 +230,7 @@ static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, unmap_and_put_page(new_page, new_de); if (err) goto out_dir; - new_inode->i_ctime = current_time(new_inode); + inode_set_ctime_current(new_inode); if (dir_de) drop_nlink(new_inode); inode_dec_link_count(new_inode); diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 57ac8aa4a724..2feb6c58648c 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -132,7 +132,7 @@ static struct inode *tracefs_get_inode(struct super_block *sb) struct inode *inode = new_inode(sb); if (inode) { inode->i_ino = get_next_ino(); - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_atime = inode->i_mtime = inode_set_ctime_current(inode); } return inode; } diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 9c9d3f0e36a4..eef9e527d9ff 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -243,8 +243,8 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode) (unsigned int)inode->i_mtime.tv_sec, (unsigned int)inode->i_mtime.tv_nsec); pr_err("\tctime %u.%u\n", - (unsigned int)inode->i_ctime.tv_sec, - (unsigned int)inode->i_ctime.tv_nsec); + (unsigned int) inode_get_ctime(inode).tv_sec, + (unsigned int) inode_get_ctime(inode).tv_nsec); pr_err("\tcreat_sqnum %llu\n", ui->creat_sqnum); pr_err("\txattr_size %u\n", ui->xattr_size); pr_err("\txattr_cnt %u\n", ui->xattr_cnt); diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index ef0499edc248..2f48c58d47cd 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -96,8 +96,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, inode->i_flags |= S_NOCMTIME; inode_init_owner(&nop_mnt_idmap, inode, dir, mode); - inode->i_mtime = inode->i_atime = inode->i_ctime = - current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); inode->i_mapping->nrpages = 0; if (!is_xattr) { @@ -325,7 +324,7 @@ static int ubifs_create(struct mnt_idmap *idmap, struct inode *dir, mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; - dir->i_mtime = dir->i_ctime = inode->i_ctime; + dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode)); err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) goto out_cancel; @@ -765,10 +764,10 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, inc_nlink(inode); ihold(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; - dir->i_mtime = dir->i_ctime = inode->i_ctime; + dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode)); err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) goto out_cancel; @@ -838,11 +837,11 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) } lock_2_inodes(dir, inode); - inode->i_ctime = current_time(dir); + inode_set_ctime_current(inode); drop_nlink(inode); dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; - dir->i_mtime = dir->i_ctime = inode->i_ctime; + dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode)); err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); if (err) goto out_cancel; @@ -940,12 +939,12 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) } lock_2_inodes(dir, inode); - inode->i_ctime = current_time(dir); + inode_set_ctime_current(inode); clear_nlink(inode); drop_nlink(dir); dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; - dir->i_mtime = dir->i_ctime = inode->i_ctime; + dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode)); err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); if (err) goto out_cancel; @@ -1019,7 +1018,7 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir, inc_nlink(dir); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; - dir->i_mtime = dir->i_ctime = inode->i_ctime; + dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode)); err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) { ubifs_err(c, "cannot create directory, error %d", err); @@ -1110,7 +1109,7 @@ static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir, mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; - dir->i_mtime = dir->i_ctime = inode->i_ctime; + dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode)); err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) goto out_cancel; @@ -1210,7 +1209,7 @@ static int ubifs_symlink(struct mnt_idmap *idmap, struct inode *dir, mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; - dir->i_mtime = dir->i_ctime = inode->i_ctime; + dir->i_mtime = inode_set_ctime_to_ts(dir, inode_get_ctime(inode)); err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) goto out_cancel; @@ -1298,7 +1297,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, struct ubifs_budget_req ino_req = { .dirtied_ino = 1, .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; struct ubifs_budget_req wht_req; - struct timespec64 time; unsigned int saved_nlink; struct fscrypt_name old_nm, new_nm; @@ -1414,8 +1412,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, * Like most other Unix systems, set the @i_ctime for inodes on a * rename. */ - time = current_time(old_dir); - old_inode->i_ctime = time; + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); /* We must adjust parent link count when renaming directories */ if (is_dir) { @@ -1444,13 +1441,11 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, old_dir->i_size -= old_sz; ubifs_inode(old_dir)->ui_size = old_dir->i_size; - old_dir->i_mtime = old_dir->i_ctime = time; - new_dir->i_mtime = new_dir->i_ctime = time; /* * And finally, if we unlinked a direntry which happened to have the * same name as the moved direntry, we have to decrement @i_nlink of - * the unlinked inode and change its ctime. + * the unlinked inode. */ if (unlink) { /* @@ -1462,7 +1457,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, clear_nlink(new_inode); else drop_nlink(new_inode); - new_inode->i_ctime = time; } else { new_dir->i_size += new_sz; ubifs_inode(new_dir)->ui_size = new_dir->i_size; @@ -1557,7 +1551,6 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, int sync = IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir); struct inode *fst_inode = d_inode(old_dentry); struct inode *snd_inode = d_inode(new_dentry); - struct timespec64 time; int err; struct fscrypt_name fst_nm, snd_nm; @@ -1588,11 +1581,7 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, lock_4_inodes(old_dir, new_dir, NULL, NULL); - time = current_time(old_dir); - fst_inode->i_ctime = time; - snd_inode->i_ctime = time; - old_dir->i_mtime = old_dir->i_ctime = time; - new_dir->i_mtime = new_dir->i_ctime = time; + simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); if (old_dir != new_dir) { if (S_ISDIR(fst_inode->i_mode) && !S_ISDIR(snd_inode->i_mode)) { @@ -1665,7 +1654,7 @@ int ubifs_getattr(struct mnt_idmap *idmap, const struct path *path, STATX_ATTR_ENCRYPTED | STATX_ATTR_IMMUTABLE); - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); stat->blksize = UBIFS_BLOCK_SIZE; stat->size = ui->ui_size; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 6738fe43040b..e5382f0b2587 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1092,7 +1092,7 @@ static void do_attr_changes(struct inode *inode, const struct iattr *attr) if (attr->ia_valid & ATTR_MTIME) inode->i_mtime = attr->ia_mtime; if (attr->ia_valid & ATTR_CTIME) - inode->i_ctime = attr->ia_ctime; + inode_set_ctime_to_ts(inode, attr->ia_ctime); if (attr->ia_valid & ATTR_MODE) { umode_t mode = attr->ia_mode; @@ -1192,7 +1192,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, mutex_lock(&ui->ui_mutex); ui->ui_size = inode->i_size; /* Truncation changes inode [mc]time */ - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); /* Other attributes may be changed at the same time as well */ do_attr_changes(inode, attr); err = ubifs_jnl_truncate(c, inode, old_size, new_size); @@ -1239,7 +1239,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, mutex_lock(&ui->ui_mutex); if (attr->ia_valid & ATTR_SIZE) { /* Truncation changes inode [mc]time */ - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); /* 'truncate_setsize()' changed @i_size, update @ui_size */ ui->ui_size = inode->i_size; } @@ -1364,8 +1364,10 @@ out: static inline int mctime_update_needed(const struct inode *inode, const struct timespec64 *now) { + struct timespec64 ctime = inode_get_ctime(inode); + if (!timespec64_equal(&inode->i_mtime, now) || - !timespec64_equal(&inode->i_ctime, now)) + !timespec64_equal(&ctime, now)) return 1; return 0; } @@ -1376,8 +1378,7 @@ static inline int mctime_update_needed(const struct inode *inode, * * This function updates time of the inode. */ -int ubifs_update_time(struct inode *inode, struct timespec64 *time, - int flags) +int ubifs_update_time(struct inode *inode, int flags) { struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_info *c = inode->i_sb->s_fs_info; @@ -1385,21 +1386,17 @@ int ubifs_update_time(struct inode *inode, struct timespec64 *time, .dirtied_ino_d = ALIGN(ui->data_len, 8) }; int err, release; - if (!IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT)) - return generic_update_time(inode, time, flags); + if (!IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT)) { + generic_update_time(inode, flags); + return 0; + } err = ubifs_budget_space(c, &req); if (err) return err; mutex_lock(&ui->ui_mutex); - if (flags & S_ATIME) - inode->i_atime = *time; - if (flags & S_CTIME) - inode->i_ctime = *time; - if (flags & S_MTIME) - inode->i_mtime = *time; - + inode_update_timestamps(inode, flags); release = ui->dirty; __mark_inode_dirty(inode, I_DIRTY_SYNC); mutex_unlock(&ui->ui_mutex); @@ -1432,7 +1429,7 @@ static int update_mctime(struct inode *inode) return err; mutex_lock(&ui->ui_mutex); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); release = ui->dirty; mark_inode_dirty_sync(inode); mutex_unlock(&ui->ui_mutex); @@ -1570,7 +1567,7 @@ static vm_fault_t ubifs_vm_page_mkwrite(struct vm_fault *vmf) struct ubifs_inode *ui = ubifs_inode(inode); mutex_lock(&ui->ui_mutex); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); release = ui->dirty; mark_inode_dirty_sync(inode); mutex_unlock(&ui->ui_mutex); diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 67c5108abd89..d79cabe193c3 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -118,7 +118,7 @@ static int setflags(struct inode *inode, int flags) ui->flags &= ~ioctl2ubifs(UBIFS_SETTABLE_IOCTL_FLAGS); ui->flags |= ioctl2ubifs(flags); ubifs_set_inode_flags(inode); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); release = ui->dirty; mark_inode_dirty_sync(inode); mutex_unlock(&ui->ui_mutex); diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index dc52ac0f4a34..ffc9beee7be6 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -454,8 +454,8 @@ static void pack_inode(struct ubifs_info *c, struct ubifs_ino_node *ino, ino->creat_sqnum = cpu_to_le64(ui->creat_sqnum); ino->atime_sec = cpu_to_le64(inode->i_atime.tv_sec); ino->atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); - ino->ctime_sec = cpu_to_le64(inode->i_ctime.tv_sec); - ino->ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ino->ctime_sec = cpu_to_le64(inode_get_ctime(inode).tv_sec); + ino->ctime_nsec = cpu_to_le32(inode_get_ctime(inode).tv_nsec); ino->mtime_sec = cpu_to_le64(inode->i_mtime.tv_sec); ino->mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); ino->uid = cpu_to_le32(i_uid_read(inode)); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 32cb14759796..b08fb28d16b5 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -146,8 +146,8 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) inode->i_atime.tv_nsec = le32_to_cpu(ino->atime_nsec); inode->i_mtime.tv_sec = (int64_t)le64_to_cpu(ino->mtime_sec); inode->i_mtime.tv_nsec = le32_to_cpu(ino->mtime_nsec); - inode->i_ctime.tv_sec = (int64_t)le64_to_cpu(ino->ctime_sec); - inode->i_ctime.tv_nsec = le32_to_cpu(ino->ctime_nsec); + inode_set_ctime(inode, (int64_t)le64_to_cpu(ino->ctime_sec), + le32_to_cpu(ino->ctime_nsec)); inode->i_mode = le32_to_cpu(ino->mode); inode->i_size = le64_to_cpu(ino->size); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 4c36044140e7..ebb3ad6b5e7e 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -2027,7 +2027,7 @@ int ubifs_calc_dark(const struct ubifs_info *c, int spc); int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync); int ubifs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr); -int ubifs_update_time(struct inode *inode, struct timespec64 *time, int flags); +int ubifs_update_time(struct inode *inode, int flags); /* dir.c */ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index 349228dd1191..406c82eab513 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -134,7 +134,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, ui->data_len = size; mutex_lock(&host_ui->ui_mutex); - host->i_ctime = current_time(host); + inode_set_ctime_current(host); host_ui->xattr_cnt += 1; host_ui->xattr_size += CALC_DENT_SIZE(fname_len(nm)); host_ui->xattr_size += CALC_XATTR_BYTES(size); @@ -215,7 +215,7 @@ static int change_xattr(struct ubifs_info *c, struct inode *host, ui->data_len = size; mutex_lock(&host_ui->ui_mutex); - host->i_ctime = current_time(host); + inode_set_ctime_current(host); host_ui->xattr_size -= CALC_XATTR_BYTES(old_size); host_ui->xattr_size += CALC_XATTR_BYTES(size); @@ -474,7 +474,7 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host, return err; mutex_lock(&host_ui->ui_mutex); - host->i_ctime = current_time(host); + inode_set_ctime_current(host); host_ui->xattr_cnt -= 1; host_ui->xattr_size -= CALC_DENT_SIZE(fname_len(nm)); host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 5f7ac8c84798..6b558cbbeb6b 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -100,7 +100,7 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; else iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); iinfo->i_crtime = inode->i_mtime; if (unlikely(insert_inode_locked(inode) < 0)) { make_bad_inode(inode); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 28cdfc57d946..d089795074e8 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -910,7 +910,7 @@ static int inode_getblk(struct inode *inode, struct udf_map_rq *map) map->oflags = UDF_BLK_NEW | UDF_BLK_MAPPED; iinfo->i_next_alloc_block = map->lblk + 1; iinfo->i_next_alloc_goal = newblocknum + 1; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); if (IS_SYNC(inode)) udf_sync_inode(inode); @@ -1298,7 +1298,7 @@ set_size: goto out_unlock; } update_time: - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); if (IS_SYNC(inode)) udf_sync_inode(inode); else @@ -1329,6 +1329,7 @@ static int udf_read_inode(struct inode *inode, bool hidden_inode) int bs = inode->i_sb->s_blocksize; int ret = -EIO; uint32_t uid, gid; + struct timespec64 ctime; reread: if (iloc->partitionReferenceNum >= sbi->s_partitions) { @@ -1507,7 +1508,8 @@ reread: udf_disk_stamp_to_time(&inode->i_atime, fe->accessTime); udf_disk_stamp_to_time(&inode->i_mtime, fe->modificationTime); - udf_disk_stamp_to_time(&inode->i_ctime, fe->attrTime); + udf_disk_stamp_to_time(&ctime, fe->attrTime); + inode_set_ctime_to_ts(inode, ctime); iinfo->i_unique = le64_to_cpu(fe->uniqueID); iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr); @@ -1522,7 +1524,8 @@ reread: udf_disk_stamp_to_time(&inode->i_atime, efe->accessTime); udf_disk_stamp_to_time(&inode->i_mtime, efe->modificationTime); udf_disk_stamp_to_time(&iinfo->i_crtime, efe->createTime); - udf_disk_stamp_to_time(&inode->i_ctime, efe->attrTime); + udf_disk_stamp_to_time(&ctime, efe->attrTime); + inode_set_ctime_to_ts(inode, ctime); iinfo->i_unique = le64_to_cpu(efe->uniqueID); iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr); @@ -1799,7 +1802,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime); udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime); - udf_time_to_disk_stamp(&fe->attrTime, inode->i_ctime); + udf_time_to_disk_stamp(&fe->attrTime, inode_get_ctime(inode)); memset(&(fe->impIdent), 0, sizeof(struct regid)); strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER); fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; @@ -1830,12 +1833,12 @@ static int udf_update_inode(struct inode *inode, int do_sync) udf_adjust_time(iinfo, inode->i_atime); udf_adjust_time(iinfo, inode->i_mtime); - udf_adjust_time(iinfo, inode->i_ctime); + udf_adjust_time(iinfo, inode_get_ctime(inode)); udf_time_to_disk_stamp(&efe->accessTime, inode->i_atime); udf_time_to_disk_stamp(&efe->modificationTime, inode->i_mtime); udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime); - udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime); + udf_time_to_disk_stamp(&efe->attrTime, inode_get_ctime(inode)); memset(&(efe->impIdent), 0, sizeof(efe->impIdent)); strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index a95579b043ab..ae55ab8859b6 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -365,7 +365,7 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode) *(__le32 *)((struct allocDescImpUse *)iter.fi.icb.impUse)->impUse = cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL); udf_fiiter_write_fi(&iter, NULL); - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); udf_fiiter_release(&iter); udf_add_fid_counter(dir->i_sb, false, 1); @@ -471,7 +471,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir, udf_fiiter_release(&iter); udf_add_fid_counter(dir->i_sb, true, 1); inc_nlink(dir); - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); d_instantiate_new(dentry, inode); @@ -523,8 +523,8 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) inode->i_size = 0; inode_dec_link_count(dir); udf_add_fid_counter(dir->i_sb, true, -1); - inode->i_ctime = dir->i_ctime = dir->i_mtime = - current_time(inode); + dir->i_mtime = inode_set_ctime_to_ts(dir, + inode_set_ctime_current(inode)); mark_inode_dirty(dir); ret = 0; end_rmdir: @@ -555,11 +555,11 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) set_nlink(inode, 1); } udf_fiiter_delete_entry(&iter); - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); inode_dec_link_count(inode); udf_add_fid_counter(dir->i_sb, false, -1); - inode->i_ctime = dir->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); ret = 0; end_unlink: udf_fiiter_release(&iter); @@ -746,9 +746,9 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, inc_nlink(inode); udf_add_fid_counter(dir->i_sb, false, 1); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); - dir->i_ctime = dir->i_mtime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); ihold(inode); d_instantiate(dentry, inode); @@ -833,7 +833,7 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old_inode->i_ctime = current_time(old_inode); + inode_set_ctime_current(old_inode); mark_inode_dirty(old_inode); /* @@ -861,13 +861,13 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir, } if (new_inode) { - new_inode->i_ctime = current_time(new_inode); + inode_set_ctime_current(new_inode); inode_dec_link_count(new_inode); udf_add_fid_counter(old_dir->i_sb, S_ISDIR(new_inode->i_mode), -1); } - old_dir->i_ctime = old_dir->i_mtime = current_time(old_dir); - new_dir->i_ctime = new_dir->i_mtime = current_time(new_dir); + old_dir->i_mtime = inode_set_ctime_current(old_dir); + new_dir->i_mtime = inode_set_ctime_current(new_dir); mark_inode_dirty(old_dir); mark_inode_dirty(new_dir); diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index 779b5c2c75f6..f7eaf7b14594 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -149,7 +149,7 @@ static int udf_symlink_getattr(struct mnt_idmap *idmap, struct inode *inode = d_backing_inode(dentry); struct page *page; - generic_fillattr(&nop_mnt_idmap, inode, stat); + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); page = read_mapping_page(inode->i_mapping, 0, NULL); if (IS_ERR(page)) return PTR_ERR(page); diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 379d75796a5c..fd57f03b6c93 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -107,7 +107,7 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, ufs_commit_chunk(page, pos, len); ufs_put_page(page); if (update_times) - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); ufs_handle_dirsync(dir); } @@ -397,7 +397,7 @@ got_it: ufs_set_de_type(sb, de, inode->i_mode); ufs_commit_chunk(page, pos, rec_len); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = inode_set_ctime_current(dir); mark_inode_dirty(dir); err = ufs_handle_dirsync(dir); @@ -539,7 +539,7 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir, pde->d_reclen = cpu_to_fs16(sb, to - from); dir->d_ino = 0; ufs_commit_chunk(page, pos, to - from); - inode->i_ctime = inode->i_mtime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); mark_inode_dirty(inode); err = ufs_handle_dirsync(inode); out: diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 06bd84d555bd..a1e7bd9d1f98 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -292,7 +292,7 @@ cg_found: inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_blocks = 0; inode->i_generation = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); ufsi->i_flags = UFS_I(dir)->i_flags; ufsi->i_lastfrag = 0; ufsi->i_shadow = 0; diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index a4246c83a8cd..21a4779a2de5 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -296,7 +296,7 @@ ufs_inode_getfrag(struct inode *inode, unsigned index, if (new) *new = 1; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); if (IS_SYNC(inode)) ufs_sync_inode (inode); mark_inode_dirty(inode); @@ -378,7 +378,7 @@ ufs_inode_getblock(struct inode *inode, u64 ind_block, mark_buffer_dirty(bh); if (IS_SYNC(inode)) sync_dirty_buffer(bh); - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); mark_inode_dirty(inode); out: brelse (bh); @@ -580,11 +580,12 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) inode->i_size = fs64_to_cpu(sb, ufs_inode->ui_size); inode->i_atime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_atime.tv_sec); - inode->i_ctime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_ctime.tv_sec); + inode_set_ctime(inode, + (signed)fs32_to_cpu(sb, ufs_inode->ui_ctime.tv_sec), + 0); inode->i_mtime.tv_sec = (signed)fs32_to_cpu(sb, ufs_inode->ui_mtime.tv_sec); inode->i_mtime.tv_nsec = 0; inode->i_atime.tv_nsec = 0; - inode->i_ctime.tv_nsec = 0; inode->i_blocks = fs32_to_cpu(sb, ufs_inode->ui_blocks); inode->i_generation = fs32_to_cpu(sb, ufs_inode->ui_gen); ufsi->i_flags = fs32_to_cpu(sb, ufs_inode->ui_flags); @@ -626,10 +627,10 @@ static int ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode) inode->i_size = fs64_to_cpu(sb, ufs2_inode->ui_size); inode->i_atime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_atime); - inode->i_ctime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_ctime); + inode_set_ctime(inode, fs64_to_cpu(sb, ufs2_inode->ui_ctime), + fs32_to_cpu(sb, ufs2_inode->ui_ctimensec)); inode->i_mtime.tv_sec = fs64_to_cpu(sb, ufs2_inode->ui_mtime); inode->i_atime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_atimensec); - inode->i_ctime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_ctimensec); inode->i_mtime.tv_nsec = fs32_to_cpu(sb, ufs2_inode->ui_mtimensec); inode->i_blocks = fs64_to_cpu(sb, ufs2_inode->ui_blocks); inode->i_generation = fs32_to_cpu(sb, ufs2_inode->ui_gen); @@ -726,7 +727,8 @@ static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode) ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size); ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb, inode->i_atime.tv_sec); ufs_inode->ui_atime.tv_usec = 0; - ufs_inode->ui_ctime.tv_sec = cpu_to_fs32(sb, inode->i_ctime.tv_sec); + ufs_inode->ui_ctime.tv_sec = cpu_to_fs32(sb, + inode_get_ctime(inode).tv_sec); ufs_inode->ui_ctime.tv_usec = 0; ufs_inode->ui_mtime.tv_sec = cpu_to_fs32(sb, inode->i_mtime.tv_sec); ufs_inode->ui_mtime.tv_usec = 0; @@ -770,8 +772,9 @@ static void ufs2_update_inode(struct inode *inode, struct ufs2_inode *ufs_inode) ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size); ufs_inode->ui_atime = cpu_to_fs64(sb, inode->i_atime.tv_sec); ufs_inode->ui_atimensec = cpu_to_fs32(sb, inode->i_atime.tv_nsec); - ufs_inode->ui_ctime = cpu_to_fs64(sb, inode->i_ctime.tv_sec); - ufs_inode->ui_ctimensec = cpu_to_fs32(sb, inode->i_ctime.tv_nsec); + ufs_inode->ui_ctime = cpu_to_fs64(sb, inode_get_ctime(inode).tv_sec); + ufs_inode->ui_ctimensec = cpu_to_fs32(sb, + inode_get_ctime(inode).tv_nsec); ufs_inode->ui_mtime = cpu_to_fs64(sb, inode->i_mtime.tv_sec); ufs_inode->ui_mtimensec = cpu_to_fs32(sb, inode->i_mtime.tv_nsec); @@ -1205,7 +1208,7 @@ static int ufs_truncate(struct inode *inode, loff_t size) truncate_setsize(inode, size); ufs_truncate_blocks(inode); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = inode_set_ctime_current(inode); mark_inode_dirty(inode); out: UFSD("EXIT: err %d\n", err); diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 36154b5aca6d..9cad29463791 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -153,7 +153,7 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, struct inode *inode = d_inode(old_dentry); int error; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); inode_inc_link_count(inode); ihold(inode); @@ -220,7 +220,7 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry) if (err) goto out; - inode->i_ctime = dir->i_ctime; + inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); inode_dec_link_count(inode); err = 0; out: @@ -282,7 +282,7 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (!new_de) goto out_dir; ufs_set_link(new_dir, new_de, new_page, old_inode, 1); - new_inode->i_ctime = current_time(new_inode); + inode_set_ctime_current(new_inode); if (dir_de) drop_nlink(new_inode); inode_dec_link_count(new_inode); @@ -298,7 +298,7 @@ static int ufs_rename(struct mnt_idmap *idmap, struct inode *old_dir, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old_inode->i_ctime = current_time(old_inode); + inode_set_ctime_current(old_inode); ufs_delete_entry(old_dir, old_de, old_page); mark_inode_dirty(old_inode); diff --git a/fs/vboxsf/utils.c b/fs/vboxsf/utils.c index dd0ae1188e87..83f20dd15522 100644 --- a/fs/vboxsf/utils.c +++ b/fs/vboxsf/utils.c @@ -128,8 +128,8 @@ int vboxsf_init_inode(struct vboxsf_sbi *sbi, struct inode *inode, inode->i_atime = ns_to_timespec64( info->access_time.ns_relative_to_unix_epoch); - inode->i_ctime = ns_to_timespec64( - info->change_time.ns_relative_to_unix_epoch); + inode_set_ctime_to_ts(inode, + ns_to_timespec64(info->change_time.ns_relative_to_unix_epoch)); inode->i_mtime = ns_to_timespec64( info->modification_time.ns_relative_to_unix_epoch); return 0; @@ -252,7 +252,7 @@ int vboxsf_getattr(struct mnt_idmap *idmap, const struct path *path, if (err) return err; - generic_fillattr(&nop_mnt_idmap, d_inode(dentry), kstat); + generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(dentry), kstat); return 0; } diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index 49bf3a1eb2a0..d071a6e32581 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -118,16 +118,16 @@ void fsverity_free_info(struct fsverity_info *vi); int fsverity_get_descriptor(struct inode *inode, struct fsverity_descriptor **desc_ret); -int __init fsverity_init_info_cache(void); -void __init fsverity_exit_info_cache(void); +void __init fsverity_init_info_cache(void); /* signature.c */ #ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES +extern int fsverity_require_signatures; int fsverity_verify_signature(const struct fsverity_info *vi, const u8 *signature, size_t sig_size); -int __init fsverity_init_signature(void); +void __init fsverity_init_signature(void); #else /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */ static inline int fsverity_verify_signature(const struct fsverity_info *vi, @@ -136,15 +136,13 @@ fsverity_verify_signature(const struct fsverity_info *vi, return 0; } -static inline int fsverity_init_signature(void) +static inline void fsverity_init_signature(void) { - return 0; } #endif /* !CONFIG_FS_VERITY_BUILTIN_SIGNATURES */ /* verify.c */ -int __init fsverity_init_workqueue(void); -void __init fsverity_exit_workqueue(void); +void __init fsverity_init_workqueue(void); #endif /* _FSVERITY_PRIVATE_H */ diff --git a/fs/verity/hash_algs.c b/fs/verity/hash_algs.c index c598d2035476..6b08b1d9a7d7 100644 --- a/fs/verity/hash_algs.c +++ b/fs/verity/hash_algs.c @@ -226,6 +226,14 @@ void __init fsverity_check_hash_algs(void) if (!alg->name) continue; + /* + * 0 must never be allocated as an FS_VERITY_HASH_ALG_* value, + * as it is reserved for users that use 0 to mean unspecified or + * a default value. fs/verity/ itself doesn't care and doesn't + * have a default algorithm, but some users make use of this. + */ + BUG_ON(i == 0); + BUG_ON(alg->digest_size > FS_VERITY_MAX_DIGEST_SIZE); /* diff --git a/fs/verity/init.c b/fs/verity/init.c index 023905151035..a29f062f6047 100644 --- a/fs/verity/init.c +++ b/fs/verity/init.c @@ -9,6 +9,37 @@ #include <linux/ratelimit.h> +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *fsverity_sysctl_header; + +static struct ctl_table fsverity_sysctl_table[] = { +#ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES + { + .procname = "require_signatures", + .data = &fsverity_require_signatures, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif + { } +}; + +static void __init fsverity_init_sysctl(void) +{ + fsverity_sysctl_header = register_sysctl("fs/verity", + fsverity_sysctl_table); + if (!fsverity_sysctl_header) + panic("fsverity sysctl registration failed"); +} +#else /* CONFIG_SYSCTL */ +static inline void fsverity_init_sysctl(void) +{ +} +#endif /* !CONFIG_SYSCTL */ + void fsverity_msg(const struct inode *inode, const char *level, const char *fmt, ...) { @@ -33,28 +64,11 @@ void fsverity_msg(const struct inode *inode, const char *level, static int __init fsverity_init(void) { - int err; - fsverity_check_hash_algs(); - - err = fsverity_init_info_cache(); - if (err) - return err; - - err = fsverity_init_workqueue(); - if (err) - goto err_exit_info_cache; - - err = fsverity_init_signature(); - if (err) - goto err_exit_workqueue; - + fsverity_init_info_cache(); + fsverity_init_workqueue(); + fsverity_init_sysctl(); + fsverity_init_signature(); return 0; - -err_exit_workqueue: - fsverity_exit_workqueue(); -err_exit_info_cache: - fsverity_exit_info_cache(); - return err; } late_initcall(fsverity_init) diff --git a/fs/verity/open.c b/fs/verity/open.c index 1db5106a9c38..6c31a871b84b 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -408,18 +408,10 @@ void __fsverity_cleanup_inode(struct inode *inode) } EXPORT_SYMBOL_GPL(__fsverity_cleanup_inode); -int __init fsverity_init_info_cache(void) +void __init fsverity_init_info_cache(void) { - fsverity_info_cachep = KMEM_CACHE_USERCOPY(fsverity_info, - SLAB_RECLAIM_ACCOUNT, - file_digest); - if (!fsverity_info_cachep) - return -ENOMEM; - return 0; -} - -void __init fsverity_exit_info_cache(void) -{ - kmem_cache_destroy(fsverity_info_cachep); - fsverity_info_cachep = NULL; + fsverity_info_cachep = KMEM_CACHE_USERCOPY( + fsverity_info, + SLAB_RECLAIM_ACCOUNT | SLAB_PANIC, + file_digest); } diff --git a/fs/verity/signature.c b/fs/verity/signature.c index 72034bc71c9d..90c07573dd77 100644 --- a/fs/verity/signature.c +++ b/fs/verity/signature.c @@ -24,7 +24,7 @@ * /proc/sys/fs/verity/require_signatures * If 1, all verity files must have a valid builtin signature. */ -static int fsverity_require_signatures; +int fsverity_require_signatures; /* * Keyring that contains the trusted X.509 certificates. @@ -62,6 +62,22 @@ int fsverity_verify_signature(const struct fsverity_info *vi, return 0; } + if (fsverity_keyring->keys.nr_leaves_on_tree == 0) { + /* + * The ".fs-verity" keyring is empty, due to builtin signatures + * being supported by the kernel but not actually being used. + * In this case, verify_pkcs7_signature() would always return an + * error, usually ENOKEY. It could also be EBADMSG if the + * PKCS#7 is malformed, but that isn't very important to + * distinguish. So, just skip to ENOKEY to avoid the attack + * surface of the PKCS#7 parser, which would otherwise be + * reachable by any task able to execute FS_IOC_ENABLE_VERITY. + */ + fsverity_err(inode, + "fs-verity keyring is empty, rejecting signed file!"); + return -ENOKEY; + } + d = kzalloc(sizeof(*d) + hash_alg->digest_size, GFP_KERNEL); if (!d) return -ENOMEM; @@ -93,59 +109,14 @@ int fsverity_verify_signature(const struct fsverity_info *vi, return 0; } -#ifdef CONFIG_SYSCTL -static struct ctl_table_header *fsverity_sysctl_header; - -static struct ctl_table fsverity_sysctl_table[] = { - { - .procname = "require_signatures", - .data = &fsverity_require_signatures, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { } -}; - -static int __init fsverity_sysctl_init(void) -{ - fsverity_sysctl_header = register_sysctl("fs/verity", fsverity_sysctl_table); - if (!fsverity_sysctl_header) { - pr_err("sysctl registration failed!\n"); - return -ENOMEM; - } - return 0; -} -#else /* !CONFIG_SYSCTL */ -static inline int __init fsverity_sysctl_init(void) +void __init fsverity_init_signature(void) { - return 0; -} -#endif /* !CONFIG_SYSCTL */ - -int __init fsverity_init_signature(void) -{ - struct key *ring; - int err; - - ring = keyring_alloc(".fs-verity", KUIDT_INIT(0), KGIDT_INIT(0), - current_cred(), KEY_POS_SEARCH | + fsverity_keyring = + keyring_alloc(".fs-verity", KUIDT_INIT(0), KGIDT_INIT(0), + current_cred(), KEY_POS_SEARCH | KEY_USR_VIEW | KEY_USR_READ | KEY_USR_WRITE | KEY_USR_SEARCH | KEY_USR_SETATTR, - KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); - if (IS_ERR(ring)) - return PTR_ERR(ring); - - err = fsverity_sysctl_init(); - if (err) - goto err_put_ring; - - fsverity_keyring = ring; - return 0; - -err_put_ring: - key_put(ring); - return err; + KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); + if (IS_ERR(fsverity_keyring)) + panic("failed to allocate \".fs-verity\" keyring"); } diff --git a/fs/verity/verify.c b/fs/verity/verify.c index 433cef51f5f6..904ccd7e8e16 100644 --- a/fs/verity/verify.c +++ b/fs/verity/verify.c @@ -346,7 +346,7 @@ void fsverity_enqueue_verify_work(struct work_struct *work) } EXPORT_SYMBOL_GPL(fsverity_enqueue_verify_work); -int __init fsverity_init_workqueue(void) +void __init fsverity_init_workqueue(void) { /* * Use a high-priority workqueue to prioritize verification work, which @@ -360,12 +360,5 @@ int __init fsverity_init_workqueue(void) WQ_HIGHPRI, num_online_cpus()); if (!fsverity_read_workqueue) - return -ENOMEM; - return 0; -} - -void __init fsverity_exit_workqueue(void) -{ - destroy_workqueue(fsverity_read_workqueue); - fsverity_read_workqueue = NULL; + panic("failed to allocate fsverity_read_queue"); } diff --git a/fs/xattr.c b/fs/xattr.c index e7bbb7f57557..efd4736bc94b 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -1040,12 +1040,32 @@ const char *xattr_full_name(const struct xattr_handler *handler, EXPORT_SYMBOL(xattr_full_name); /** - * free_simple_xattr - free an xattr object + * simple_xattr_space - estimate the memory used by a simple xattr + * @name: the full name of the xattr + * @size: the size of its value + * + * This takes no account of how much larger the two slab objects actually are: + * that would depend on the slab implementation, when what is required is a + * deterministic number, which grows with name length and size and quantity. + * + * Return: The approximate number of bytes of memory used by such an xattr. + */ +size_t simple_xattr_space(const char *name, size_t size) +{ + /* + * Use "40" instead of sizeof(struct simple_xattr), to return the + * same result on 32-bit and 64-bit, and even if simple_xattr grows. + */ + return 40 + size + strlen(name); +} + +/** + * simple_xattr_free - free an xattr object * @xattr: the xattr object * * Free the xattr object. Can handle @xattr being NULL. */ -static inline void free_simple_xattr(struct simple_xattr *xattr) +void simple_xattr_free(struct simple_xattr *xattr) { if (xattr) kfree(xattr->name); @@ -1073,7 +1093,7 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) if (len < sizeof(*new_xattr)) return NULL; - new_xattr = kvmalloc(len, GFP_KERNEL); + new_xattr = kvmalloc(len, GFP_KERNEL_ACCOUNT); if (!new_xattr) return NULL; @@ -1164,7 +1184,6 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, * @value: the value to store along the xattr * @size: the size of @value * @flags: the flags determining how to set the xattr - * @removed_size: the size of the removed xattr * * Set a new xattr object. * If @value is passed a new xattr object will be allocated. If XATTR_REPLACE @@ -1181,29 +1200,27 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, * nothing if XATTR_CREATE is specified in @flags or @flags is zero. For * XATTR_REPLACE we fail as mentioned above. * - * Return: On success zero and on error a negative error code is returned. + * Return: On success, the removed or replaced xattr is returned, to be freed + * by the caller; or NULL if none. On failure a negative error code is returned. */ -int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, - const void *value, size_t size, int flags, - ssize_t *removed_size) +struct simple_xattr *simple_xattr_set(struct simple_xattrs *xattrs, + const char *name, const void *value, + size_t size, int flags) { - struct simple_xattr *xattr = NULL, *new_xattr = NULL; + struct simple_xattr *old_xattr = NULL, *new_xattr = NULL; struct rb_node *parent = NULL, **rbp; int err = 0, ret; - if (removed_size) - *removed_size = -1; - /* value == NULL means remove */ if (value) { new_xattr = simple_xattr_alloc(value, size); if (!new_xattr) - return -ENOMEM; + return ERR_PTR(-ENOMEM); - new_xattr->name = kstrdup(name, GFP_KERNEL); + new_xattr->name = kstrdup(name, GFP_KERNEL_ACCOUNT); if (!new_xattr->name) { - free_simple_xattr(new_xattr); - return -ENOMEM; + simple_xattr_free(new_xattr); + return ERR_PTR(-ENOMEM); } } @@ -1217,12 +1234,12 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, else if (ret > 0) rbp = &(*rbp)->rb_right; else - xattr = rb_entry(*rbp, struct simple_xattr, rb_node); - if (xattr) + old_xattr = rb_entry(*rbp, struct simple_xattr, rb_node); + if (old_xattr) break; } - if (xattr) { + if (old_xattr) { /* Fail if XATTR_CREATE is requested and the xattr exists. */ if (flags & XATTR_CREATE) { err = -EEXIST; @@ -1230,12 +1247,10 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, } if (new_xattr) - rb_replace_node(&xattr->rb_node, &new_xattr->rb_node, - &xattrs->rb_root); + rb_replace_node(&old_xattr->rb_node, + &new_xattr->rb_node, &xattrs->rb_root); else - rb_erase(&xattr->rb_node, &xattrs->rb_root); - if (!err && removed_size) - *removed_size = xattr->size; + rb_erase(&old_xattr->rb_node, &xattrs->rb_root); } else { /* Fail if XATTR_REPLACE is requested but no xattr is found. */ if (flags & XATTR_REPLACE) { @@ -1260,12 +1275,10 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, out_unlock: write_unlock(&xattrs->lock); - if (err) - free_simple_xattr(new_xattr); - else - free_simple_xattr(xattr); - return err; - + if (!err) + return old_xattr; + simple_xattr_free(new_xattr); + return ERR_PTR(err); } static bool xattr_is_trusted(const char *name) @@ -1370,14 +1383,17 @@ void simple_xattrs_init(struct simple_xattrs *xattrs) /** * simple_xattrs_free - free xattrs * @xattrs: xattr header whose xattrs to destroy + * @freed_space: approximate number of bytes of memory freed from @xattrs * * Destroy all xattrs in @xattr. When this is called no one can hold a * reference to any of the xattrs anymore. */ -void simple_xattrs_free(struct simple_xattrs *xattrs) +void simple_xattrs_free(struct simple_xattrs *xattrs, size_t *freed_space) { struct rb_node *rbp; + if (freed_space) + *freed_space = 0; rbp = rb_first(&xattrs->rb_root); while (rbp) { struct simple_xattr *xattr; @@ -1386,7 +1402,10 @@ void simple_xattrs_free(struct simple_xattrs *xattrs) rbp_next = rb_next(rbp); xattr = rb_entry(rbp, struct simple_xattr, rb_node); rb_erase(&xattr->rb_node, &xattrs->rb_root); - free_simple_xattr(xattr); + if (freed_space) + *freed_space += simple_xattr_space(xattr->name, + xattr->size); + simple_xattr_free(xattr); rbp = rbp_next; } } diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 758aacd8166b..a35781577cad 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -222,7 +222,8 @@ xfs_inode_from_disk( */ inode->i_atime = xfs_inode_from_disk_ts(from, from->di_atime); inode->i_mtime = xfs_inode_from_disk_ts(from, from->di_mtime); - inode->i_ctime = xfs_inode_from_disk_ts(from, from->di_ctime); + inode_set_ctime_to_ts(inode, + xfs_inode_from_disk_ts(from, from->di_ctime)); ip->i_disk_size = be64_to_cpu(from->di_size); ip->i_nblocks = be64_to_cpu(from->di_nblocks); @@ -316,7 +317,7 @@ xfs_inode_to_disk( to->di_atime = xfs_inode_to_disk_ts(ip, inode->i_atime); to->di_mtime = xfs_inode_to_disk_ts(ip, inode->i_mtime); - to->di_ctime = xfs_inode_to_disk_ts(ip, inode->i_ctime); + to->di_ctime = xfs_inode_to_disk_ts(ip, inode_get_ctime(inode)); to->di_nlink = cpu_to_be32(inode->i_nlink); to->di_gen = cpu_to_be32(inode->i_generation); to->di_mode = cpu_to_be16(inode->i_mode); diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c index cb4796b6e693..ad22656376d3 100644 --- a/fs/xfs/libxfs/xfs_trans_inode.c +++ b/fs/xfs/libxfs/xfs_trans_inode.c @@ -62,12 +62,12 @@ xfs_trans_ichgtime( ASSERT(tp); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - tv = current_time(inode); + /* If the mtime changes, then ctime must also change */ + ASSERT(flags & XFS_ICHGTIME_CHG); + tv = inode_set_ctime_current(inode); if (flags & XFS_ICHGTIME_MOD) inode->i_mtime = tv; - if (flags & XFS_ICHGTIME_CHG) - inode->i_ctime = tv; if (flags & XFS_ICHGTIME_CREATE) ip->i_crtime = tv; } diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index e382a35e98d8..05be757668bb 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0+ +// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2019-2023 Oracle. All Rights Reserved. * Author: Darrick J. Wong <djwong@kernel.org> @@ -8,6 +8,8 @@ #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_trans_resv.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" #include "xfs_mount.h" #include "xfs_alloc.h" #include "xfs_ialloc.h" @@ -16,6 +18,7 @@ #include "xfs_ag.h" #include "xfs_rtalloc.h" #include "xfs_inode.h" +#include "xfs_icache.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -53,6 +56,7 @@ struct xchk_fscounters { uint64_t frextents; unsigned long long icount_min; unsigned long long icount_max; + bool frozen; }; /* @@ -123,6 +127,82 @@ xchk_fscount_warmup( return error; } +static inline int +xchk_fsfreeze( + struct xfs_scrub *sc) +{ + int error; + + error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL); + trace_xchk_fsfreeze(sc, error); + return error; +} + +static inline int +xchk_fsthaw( + struct xfs_scrub *sc) +{ + int error; + + /* This should always succeed, we have a kernel freeze */ + error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL); + trace_xchk_fsthaw(sc, error); + return error; +} + +/* + * We couldn't stabilize the filesystem long enough to sample all the variables + * that comprise the summary counters and compare them to the percpu counters. + * We need to disable all writer threads, which means taking the first two + * freeze levels to put userspace to sleep, and the third freeze level to + * prevent background threads from starting new transactions. Take one level + * more to prevent other callers from unfreezing the filesystem while we run. + */ +STATIC int +xchk_fscounters_freeze( + struct xfs_scrub *sc) +{ + struct xchk_fscounters *fsc = sc->buf; + int error = 0; + + if (sc->flags & XCHK_HAVE_FREEZE_PROT) { + sc->flags &= ~XCHK_HAVE_FREEZE_PROT; + mnt_drop_write_file(sc->file); + } + + /* Try to grab a kernel freeze. */ + while ((error = xchk_fsfreeze(sc)) == -EBUSY) { + if (xchk_should_terminate(sc, &error)) + return error; + + delay(HZ / 10); + } + if (error) + return error; + + fsc->frozen = true; + return 0; +} + +/* Thaw the filesystem after checking or repairing fscounters. */ +STATIC void +xchk_fscounters_cleanup( + void *buf) +{ + struct xchk_fscounters *fsc = buf; + struct xfs_scrub *sc = fsc->sc; + int error; + + if (!fsc->frozen) + return; + + error = xchk_fsthaw(sc); + if (error) + xfs_emerg(sc->mp, "still frozen after scrub, err=%d", error); + else + fsc->frozen = false; +} + int xchk_setup_fscounters( struct xfs_scrub *sc) @@ -140,6 +220,7 @@ xchk_setup_fscounters( sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS); if (!sc->buf) return -ENOMEM; + sc->buf_cleanup = xchk_fscounters_cleanup; fsc = sc->buf; fsc->sc = sc; @@ -150,7 +231,18 @@ xchk_setup_fscounters( if (error) return error; - return xchk_trans_alloc(sc, 0); + /* + * Pause all writer activity in the filesystem while we're scrubbing to + * reduce the likelihood of background perturbations to the counters + * throwing off our calculations. + */ + if (sc->flags & XCHK_TRY_HARDER) { + error = xchk_fscounters_freeze(sc); + if (error) + return error; + } + + return xfs_trans_alloc_empty(sc->mp, &sc->tp); } /* @@ -290,8 +382,7 @@ retry: if (fsc->ifree > fsc->icount) { if (tries--) goto retry; - xchk_set_incomplete(sc); - return 0; + return -EDEADLOCK; } return 0; @@ -367,6 +458,8 @@ xchk_fscount_count_frextents( * Otherwise, we /might/ have a problem. If the change in the summations is * more than we want to tolerate, the filesystem is probably busy and we should * just send back INCOMPLETE and see if userspace will try again. + * + * If we're repairing then we require an exact match. */ static inline bool xchk_fscount_within_range( @@ -396,21 +489,7 @@ xchk_fscount_within_range( if (expected >= min_value && expected <= max_value) return true; - /* - * If the difference between the two summations is too large, the fs - * might just be busy and so we'll mark the scrub incomplete. Return - * true here so that we don't mark the counter corrupt. - * - * XXX: In the future when userspace can grant scrub permission to - * quiesce the filesystem to solve the outsized variance problem, this - * check should be moved up and the return code changed to signal to - * userspace that we need quiesce permission. - */ - if (max_value - min_value >= XCHK_FSCOUNT_MIN_VARIANCE) { - xchk_set_incomplete(sc); - return true; - } - + /* Everything else is bad. */ return false; } @@ -422,6 +501,7 @@ xchk_fscounters( struct xfs_mount *mp = sc->mp; struct xchk_fscounters *fsc = sc->buf; int64_t icount, ifree, fdblocks, frextents; + bool try_again = false; int error; /* Snapshot the percpu counters. */ @@ -431,9 +511,26 @@ xchk_fscounters( frextents = percpu_counter_sum(&mp->m_frextents); /* No negative values, please! */ - if (icount < 0 || ifree < 0 || fdblocks < 0 || frextents < 0) + if (icount < 0 || ifree < 0) xchk_set_corrupt(sc); + /* + * If the filesystem is not frozen, the counter summation calls above + * can race with xfs_mod_freecounter, which subtracts a requested space + * reservation from the counter and undoes the subtraction if that made + * the counter go negative. Therefore, it's possible to see negative + * values here, and we should only flag that as a corruption if we + * froze the fs. This is much more likely to happen with frextents + * since there are no reserved pools. + */ + if (fdblocks < 0 || frextents < 0) { + if (!fsc->frozen) + return -EDEADLOCK; + + xchk_set_corrupt(sc); + return 0; + } + /* See if icount is obviously wrong. */ if (icount < fsc->icount_min || icount > fsc->icount_max) xchk_set_corrupt(sc); @@ -447,12 +544,6 @@ xchk_fscounters( xchk_set_corrupt(sc); /* - * XXX: We can't quiesce percpu counter updates, so exit early. - * This can be re-enabled when we gain exclusive freeze functionality. - */ - return 0; - - /* * If ifree exceeds icount by more than the minimum variance then * something's probably wrong with the counters. */ @@ -463,8 +554,6 @@ xchk_fscounters( error = xchk_fscount_aggregate_agcounts(sc, fsc); if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error)) return error; - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) - return 0; /* Count the free extents counter for rt volumes. */ error = xchk_fscount_count_frextents(sc, fsc); @@ -473,20 +562,45 @@ xchk_fscounters( if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) return 0; - /* Compare the in-core counters with whatever we counted. */ - if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount)) - xchk_set_corrupt(sc); + /* + * Compare the in-core counters with whatever we counted. If the fs is + * frozen, we treat the discrepancy as a corruption because the freeze + * should have stabilized the counter values. Otherwise, we need + * userspace to call us back having granted us freeze permission. + */ + if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, + fsc->icount)) { + if (fsc->frozen) + xchk_set_corrupt(sc); + else + try_again = true; + } - if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) - xchk_set_corrupt(sc); + if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) { + if (fsc->frozen) + xchk_set_corrupt(sc); + else + try_again = true; + } if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks, - fsc->fdblocks)) - xchk_set_corrupt(sc); + fsc->fdblocks)) { + if (fsc->frozen) + xchk_set_corrupt(sc); + else + try_again = true; + } if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents, - fsc->frextents)) - xchk_set_corrupt(sc); + fsc->frextents)) { + if (fsc->frozen) + xchk_set_corrupt(sc); + else + try_again = true; + } + + if (try_again) + return -EDEADLOCK; return 0; } diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 3d98f604765e..a0fffbcd022b 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -184,8 +184,10 @@ xchk_teardown( xchk_irele(sc, sc->ip); sc->ip = NULL; } - if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + if (sc->flags & XCHK_HAVE_FREEZE_PROT) { + sc->flags &= ~XCHK_HAVE_FREEZE_PROT; mnt_drop_write_file(sc->file); + } if (sc->buf) { if (sc->buf_cleanup) sc->buf_cleanup(sc->buf); @@ -505,6 +507,8 @@ retry_op: error = mnt_want_write_file(sc->file); if (error) goto out_sc; + + sc->flags |= XCHK_HAVE_FREEZE_PROT; } /* Set up for the operation. */ diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index e113f2f5c254..f8ba00e51ca9 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -106,6 +106,7 @@ struct xfs_scrub { /* XCHK state flags grow up from zero, XREP state flags grown down from 2^31 */ #define XCHK_TRY_HARDER (1U << 0) /* can't get resources, try again */ +#define XCHK_HAVE_FREEZE_PROT (1U << 1) /* do we have freeze protection? */ #define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */ #define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */ #define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */ diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index b3894daeb86a..0b54f1a1cf0c 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -98,6 +98,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS); #define XFS_SCRUB_STATE_STRINGS \ { XCHK_TRY_HARDER, "try_harder" }, \ + { XCHK_HAVE_FREEZE_PROT, "nofreeze" }, \ { XCHK_FSGATES_DRAIN, "fsgates_drain" }, \ { XCHK_NEED_DRAIN, "need_drain" }, \ { XREP_ALREADY_FIXED, "already_fixed" } @@ -693,6 +694,31 @@ TRACE_EVENT(xchk_fscounters_within_range, __entry->old_value) ) +DECLARE_EVENT_CLASS(xchk_fsfreeze_class, + TP_PROTO(struct xfs_scrub *sc, int error), + TP_ARGS(sc, error), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, type) + __field(int, error) + ), + TP_fast_assign( + __entry->dev = sc->mp->m_super->s_dev; + __entry->type = sc->sm->sm_type; + __entry->error = error; + ), + TP_printk("dev %d:%d type %s error %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->type, XFS_SCRUB_TYPE_STRINGS), + __entry->error) +); +#define DEFINE_XCHK_FSFREEZE_EVENT(name) \ +DEFINE_EVENT(xchk_fsfreeze_class, name, \ + TP_PROTO(struct xfs_scrub *sc, int error), \ + TP_ARGS(sc, error)) +DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsfreeze); +DEFINE_XCHK_FSFREEZE_EVENT(xchk_fsthaw); + TRACE_EVENT(xchk_refcount_incorrect, TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *irec, xfs_nlink_t seen), diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 791db7d9c849..6b840301817a 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -233,7 +233,7 @@ xfs_acl_set_mode( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); inode->i_mode = mode; - inode->i_ctime = current_time(inode); + inode_set_ctime_current(inode); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (xfs_has_wsync(mp)) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 451942fb38ec..2fca4b4e7fd8 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -578,7 +578,7 @@ const struct address_space_operations xfs_address_space_operations = { .read_folio = xfs_vm_read_folio, .readahead = xfs_vm_readahead, .writepages = xfs_vm_writepages, - .dirty_folio = filemap_dirty_folio, + .dirty_folio = iomap_dirty_folio, .release_folio = iomap_release_folio, .invalidate_folio = iomap_invalidate_folio, .bmap = xfs_vm_bmap, diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index fbb675563208..fcefab687285 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1644,6 +1644,7 @@ xfs_swap_extents( uint64_t f; int resblks = 0; unsigned int flags = 0; + struct timespec64 ctime; /* * Lock the inodes against other IO, page faults and truncate to @@ -1756,8 +1757,9 @@ xfs_swap_extents( * process that the file was not changed out from * under it. */ - if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) || - (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) || + ctime = inode_get_ctime(VFS_I(ip)); + if ((sbp->bs_ctime.tv_sec != ctime.tv_sec) || + (sbp->bs_ctime.tv_nsec != ctime.tv_nsec) || (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) || (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) { error = -EBUSY; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 15d1e5a7c2d3..3b903f6bce98 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1938,14 +1938,17 @@ void xfs_free_buftarg( struct xfs_buftarg *btp) { + struct block_device *bdev = btp->bt_bdev; + unregister_shrinker(&btp->bt_shrinker); ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); percpu_counter_destroy(&btp->bt_io_count); list_lru_destroy(&btp->bt_lru); - blkdev_issue_flush(btp->bt_bdev); - invalidate_bdev(btp->bt_bdev); fs_put_dax(btp->bt_daxdev, btp->bt_mount); + /* the main block device is closed by kill_block_super */ + if (bdev != btp->bt_mount->m_super->s_bdev) + blkdev_put(bdev, btp->bt_mount->m_super); kmem_free(btp); } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 9e62cc500140..360fe83a334f 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -843,10 +843,9 @@ xfs_init_new_inode( ip->i_df.if_nextents = 0; ASSERT(ip->i_nblocks == 0); - tv = current_time(inode); + tv = inode_set_ctime_current(inode); inode->i_mtime = tv; inode->i_atime = tv; - inode->i_ctime = tv; ip->i_extsize = 0; ip->i_diflags = 0; diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 91c847a84e10..127b2410eb20 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -528,7 +528,7 @@ xfs_inode_to_log_dinode( memset(to->di_pad3, 0, sizeof(to->di_pad3)); to->di_atime = xfs_inode_to_log_dinode_ts(ip, inode->i_atime); to->di_mtime = xfs_inode_to_log_dinode_ts(ip, inode->i_mtime); - to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode->i_ctime); + to->di_ctime = xfs_inode_to_log_dinode_ts(ip, inode_get_ctime(inode)); to->di_nlink = inode->i_nlink; to->di_gen = inode->i_generation; to->di_mode = inode->i_mode; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 24718adb3c16..2ededd3f6b8c 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -573,10 +573,10 @@ xfs_vn_getattr( stat->gid = vfsgid_into_kgid(vfsgid); stat->ino = ip->i_ino; stat->atime = inode->i_atime; - stat->mtime = inode->i_mtime; - stat->ctime = inode->i_ctime; stat->blocks = XFS_FSB_TO_BB(mp, ip->i_nblocks + ip->i_delayed_blks); + fill_mg_cmtime(stat, request_mask, inode); + if (xfs_has_v3inodes(mp)) { if (request_mask & STATX_BTIME) { stat->result_mask |= STATX_BTIME; @@ -917,7 +917,7 @@ xfs_setattr_size( if (newsize != oldsize && !(iattr->ia_valid & (ATTR_CTIME | ATTR_MTIME))) { iattr->ia_ctime = iattr->ia_mtime = - current_time(inode); + current_mgtime(inode); iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME; } @@ -1029,7 +1029,6 @@ xfs_vn_setattr( STATIC int xfs_vn_update_time( struct inode *inode, - struct timespec64 *now, int flags) { struct xfs_inode *ip = XFS_I(inode); @@ -1037,13 +1036,16 @@ xfs_vn_update_time( int log_flags = XFS_ILOG_TIMESTAMP; struct xfs_trans *tp; int error; + struct timespec64 now; trace_xfs_update_time(ip); if (inode->i_sb->s_flags & SB_LAZYTIME) { if (!((flags & S_VERSION) && - inode_maybe_inc_iversion(inode, false))) - return generic_update_time(inode, now, flags); + inode_maybe_inc_iversion(inode, false))) { + generic_update_time(inode, flags); + return 0; + } /* Capture the iversion update that just occurred */ log_flags |= XFS_ILOG_CORE; @@ -1054,12 +1056,15 @@ xfs_vn_update_time( return error; xfs_ilock(ip, XFS_ILOCK_EXCL); - if (flags & S_CTIME) - inode->i_ctime = *now; + if (flags & (S_CTIME|S_MTIME)) + now = inode_set_ctime_current(inode); + else + now = current_time(inode); + if (flags & S_MTIME) - inode->i_mtime = *now; + inode->i_mtime = now; if (flags & S_ATIME) - inode->i_atime = *now; + inode->i_atime = now; xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, log_flags); diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index f225413a993c..c2093cb56092 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -100,8 +100,8 @@ xfs_bulkstat_one_int( buf->bs_atime_nsec = inode->i_atime.tv_nsec; buf->bs_mtime = inode->i_mtime.tv_sec; buf->bs_mtime_nsec = inode->i_mtime.tv_nsec; - buf->bs_ctime = inode->i_ctime.tv_sec; - buf->bs_ctime_nsec = inode->i_ctime.tv_nsec; + buf->bs_ctime = inode_get_ctime(inode).tv_sec; + buf->bs_ctime_nsec = inode_get_ctime(inode).tv_nsec; buf->bs_gen = inode->i_generation; buf->bs_mode = inode->i_mode; diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 818510243130..c79eac048456 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -377,17 +377,6 @@ disable_dax: return 0; } -static void -xfs_bdev_mark_dead( - struct block_device *bdev) -{ - xfs_force_shutdown(bdev->bd_holder, SHUTDOWN_DEVICE_REMOVED); -} - -static const struct blk_holder_ops xfs_holder_ops = { - .mark_dead = xfs_bdev_mark_dead, -}; - STATIC int xfs_blkdev_get( xfs_mount_t *mp, @@ -396,8 +385,8 @@ xfs_blkdev_get( { int error = 0; - *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE, mp, - &xfs_holder_ops); + *bdevp = blkdev_get_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE, + mp->m_super, &fs_holder_ops); if (IS_ERR(*bdevp)) { error = PTR_ERR(*bdevp); xfs_warn(mp, "Invalid device [%s], error=%d", name, error); @@ -407,31 +396,45 @@ xfs_blkdev_get( } STATIC void -xfs_blkdev_put( - struct xfs_mount *mp, - struct block_device *bdev) -{ - if (bdev) - blkdev_put(bdev, mp); -} - -STATIC void -xfs_close_devices( +xfs_shutdown_devices( struct xfs_mount *mp) { + /* + * Udev is triggered whenever anyone closes a block device or unmounts + * a file systemm on a block device. + * The default udev rules invoke blkid to read the fs super and create + * symlinks to the bdev under /dev/disk. For this, it uses buffered + * reads through the page cache. + * + * xfs_db also uses buffered reads to examine metadata. There is no + * coordination between xfs_db and udev, which means that they can run + * concurrently. Note there is no coordination between the kernel and + * blkid either. + * + * On a system with 64k pages, the page cache can cache the superblock + * and the root inode (and hence the root directory) with the same 64k + * page. If udev spawns blkid after the mkfs and the system is busy + * enough that it is still running when xfs_db starts up, they'll both + * read from the same page in the pagecache. + * + * The unmount writes updated inode metadata to disk directly. The XFS + * buffer cache does not use the bdev pagecache, so it needs to + * invalidate that pagecache on unmount. If the above scenario occurs, + * the pagecache no longer reflects what's on disk, xfs_db reads the + * stale metadata, and fails to find /a. Most of the time this succeeds + * because closing a bdev invalidates the page cache, but when processes + * race, everyone loses. + */ if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { - struct block_device *logdev = mp->m_logdev_targp->bt_bdev; - - xfs_free_buftarg(mp->m_logdev_targp); - xfs_blkdev_put(mp, logdev); + blkdev_issue_flush(mp->m_logdev_targp->bt_bdev); + invalidate_bdev(mp->m_logdev_targp->bt_bdev); } if (mp->m_rtdev_targp) { - struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; - - xfs_free_buftarg(mp->m_rtdev_targp); - xfs_blkdev_put(mp, rtdev); + blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev); + invalidate_bdev(mp->m_rtdev_targp->bt_bdev); } - xfs_free_buftarg(mp->m_ddev_targp); + blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); + invalidate_bdev(mp->m_ddev_targp->bt_bdev); } /* @@ -448,17 +451,24 @@ STATIC int xfs_open_devices( struct xfs_mount *mp) { - struct block_device *ddev = mp->m_super->s_bdev; + struct super_block *sb = mp->m_super; + struct block_device *ddev = sb->s_bdev; struct block_device *logdev = NULL, *rtdev = NULL; int error; /* + * blkdev_put() can't be called under s_umount, see the comment + * in get_tree_bdev() for more details + */ + up_write(&sb->s_umount); + + /* * Open real time and log devices - order is important. */ if (mp->m_logname) { error = xfs_blkdev_get(mp, mp->m_logname, &logdev); if (error) - return error; + goto out_relock; } if (mp->m_rtname) { @@ -496,7 +506,10 @@ xfs_open_devices( mp->m_logdev_targp = mp->m_ddev_targp; } - return 0; + error = 0; +out_relock: + down_write(&sb->s_umount); + return error; out_free_rtdev_targ: if (mp->m_rtdev_targp) @@ -504,11 +517,12 @@ xfs_open_devices( out_free_ddev_targ: xfs_free_buftarg(mp->m_ddev_targp); out_close_rtdev: - xfs_blkdev_put(mp, rtdev); + if (rtdev) + blkdev_put(rtdev, sb); out_close_logdev: if (logdev && logdev != ddev) - xfs_blkdev_put(mp, logdev); - return error; + blkdev_put(logdev, sb); + goto out_relock; } /* @@ -758,6 +772,17 @@ static void xfs_mount_free( struct xfs_mount *mp) { + /* + * Free the buftargs here because blkdev_put needs to be called outside + * of sb->s_umount, which is held around the call to ->put_super. + */ + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) + xfs_free_buftarg(mp->m_logdev_targp); + if (mp->m_rtdev_targp) + xfs_free_buftarg(mp->m_rtdev_targp); + if (mp->m_ddev_targp) + xfs_free_buftarg(mp->m_ddev_targp); + kfree(mp->m_rtname); kfree(mp->m_logname); kmem_free(mp); @@ -1133,10 +1158,6 @@ xfs_fs_put_super( { struct xfs_mount *mp = XFS_M(sb); - /* if ->fill_super failed, we have no mount to tear down */ - if (!sb->s_fs_info) - return; - xfs_notice(mp, "Unmounting Filesystem %pU", &mp->m_sb.sb_uuid); xfs_filestream_unmount(mp); xfs_unmountfs(mp); @@ -1147,10 +1168,7 @@ xfs_fs_put_super( xfs_inodegc_free_percpu(mp); xfs_destroy_percpu_counters(mp); xfs_destroy_mount_workqueues(mp); - xfs_close_devices(mp); - - sb->s_fs_info = NULL; - xfs_mount_free(mp); + xfs_shutdown_devices(mp); } static long @@ -1492,7 +1510,7 @@ xfs_fs_fill_super( error = xfs_fs_validate_params(mp); if (error) - goto out_free_names; + return error; sb_min_blocksize(sb, BBSIZE); sb->s_xattr = xfs_xattr_handlers; @@ -1519,11 +1537,11 @@ xfs_fs_fill_super( error = xfs_open_devices(mp); if (error) - goto out_free_names; + return error; error = xfs_init_mount_workqueues(mp); if (error) - goto out_close_devices; + goto out_shutdown_devices; error = xfs_init_percpu_counters(mp); if (error) @@ -1737,11 +1755,8 @@ xfs_fs_fill_super( xfs_destroy_percpu_counters(mp); out_destroy_workqueues: xfs_destroy_mount_workqueues(mp); - out_close_devices: - xfs_close_devices(mp); - out_free_names: - sb->s_fs_info = NULL; - xfs_mount_free(mp); + out_shutdown_devices: + xfs_shutdown_devices(mp); return error; out_unmount: @@ -1934,7 +1949,8 @@ xfs_fs_reconfigure( return 0; } -static void xfs_fs_free( +static void +xfs_fs_free( struct fs_context *fc) { struct xfs_mount *mp = fc->s_fs_info; @@ -2003,13 +2019,21 @@ static int xfs_init_fs_context( return 0; } +static void +xfs_kill_sb( + struct super_block *sb) +{ + kill_block_super(sb); + xfs_mount_free(XFS_M(sb)); +} + static struct file_system_type xfs_fs_type = { .owner = THIS_MODULE, .name = "xfs", .init_fs_context = xfs_init_fs_context, .parameters = xfs_fs_parameters, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, + .kill_sb = xfs_kill_sb, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, }; MODULE_ALIAS_FS("xfs"); diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 789cfb74c146..b2c9b35df8f7 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -175,7 +175,7 @@ const struct address_space_operations zonefs_file_aops = { .read_folio = zonefs_read_folio, .readahead = zonefs_readahead, .writepages = zonefs_writepages, - .dirty_folio = filemap_dirty_folio, + .dirty_folio = iomap_dirty_folio, .release_folio = iomap_release_folio, .invalidate_folio = iomap_invalidate_folio, .migrate_folio = filemap_migrate_folio, diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 9350221abfc5..9d1a9808fbbb 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -658,7 +658,8 @@ static struct inode *zonefs_get_file_inode(struct inode *dir, inode->i_ino = ino; inode->i_mode = z->z_mode; - inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime; + inode->i_mtime = inode->i_atime = inode_set_ctime_to_ts(inode, + inode_get_ctime(dir)); inode->i_uid = z->z_uid; inode->i_gid = z->z_gid; inode->i_size = z->z_wpoffset; @@ -694,7 +695,8 @@ static struct inode *zonefs_get_zgroup_inode(struct super_block *sb, inode->i_ino = ino; inode_init_owner(&nop_mnt_idmap, inode, root, S_IFDIR | 0555); inode->i_size = sbi->s_zgroup[ztype].g_nr_zones; - inode->i_ctime = inode->i_mtime = inode->i_atime = root->i_ctime; + inode->i_mtime = inode->i_atime = inode_set_ctime_to_ts(inode, + inode_get_ctime(root)); inode->i_private = &sbi->s_zgroup[ztype]; set_nlink(inode, 2); @@ -1317,7 +1319,7 @@ static int zonefs_fill_super(struct super_block *sb, void *data, int silent) inode->i_ino = bdev_nr_zones(sb->s_bdev); inode->i_mode = S_IFDIR | 0555; - inode->i_ctime = inode->i_mtime = inode->i_atime = current_time(inode); + inode->i_mtime = inode->i_atime = inode_set_ctime_current(inode); inode->i_op = &zonefs_dir_inode_operations; inode->i_fop = &zonefs_dir_operations; inode->i_size = 2; |