Diffstat (limited to 'fs')
334 files changed, 9495 insertions, 5766 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c index 103ca5e1267b..64c58eb26159 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -151,34 +151,6 @@ fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data, return FSCACHE_CHECKAUX_OKAY; } -static void v9fs_cache_inode_now_uncached(void *cookie_netfs_data) -{ - struct v9fs_inode *v9inode = cookie_netfs_data; - struct pagevec pvec; - pgoff_t first; - int loop, nr_pages; - - pagevec_init(&pvec, 0); - first = 0; - - for (;;) { - nr_pages = pagevec_lookup(&pvec, v9inode->vfs_inode.i_mapping, - first, - PAGEVEC_SIZE - pagevec_count(&pvec)); - if (!nr_pages) - break; - - for (loop = 0; loop < nr_pages; loop++) - ClearPageFsCache(pvec.pages[loop]); - - first = pvec.pages[nr_pages - 1]->index + 1; - - pvec.nr = nr_pages; - pagevec_release(&pvec); - cond_resched(); - } -} - const struct fscache_cookie_def v9fs_cache_inode_index_def = { .name = "9p.inode", .type = FSCACHE_COOKIE_TYPE_DATAFILE, @@ -186,7 +158,6 @@ const struct fscache_cookie_def v9fs_cache_inode_index_def = { .get_attr = v9fs_cache_inode_get_attr, .get_aux = v9fs_cache_inode_get_aux, .check_aux = v9fs_cache_inode_check_aux, - .now_uncached = v9fs_cache_inode_now_uncached, }; void v9fs_cache_inode_get_cookie(struct inode *inode) diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 3de3b4a89d89..03c9e325bfbc 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -288,7 +288,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) fl->fl_end = OFFSET_MAX; else fl->fl_end = glock.start + glock.length - 1; - fl->fl_pid = glock.proc_id; + fl->fl_pid = -glock.proc_id; } kfree(glock.client_id); return res; @@ -445,7 +445,7 @@ static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end, struct p9_wstat wstat; int retval; - retval = filemap_write_and_wait_range(inode->i_mapping, start, end); + retval = file_write_and_wait_range(filp, start, end); if (retval) return retval; @@ -468,7 +468,7 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, struct inode *inode = filp->f_mapping->host; int retval; - retval = filemap_write_and_wait_range(inode->i_mapping, start, end); + retval = file_write_and_wait_range(filp, start, end); if (retval) return retval; diff --git a/fs/affs/file.c b/fs/affs/file.c index 196ee7f6fdc4..00331810f690 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -954,7 +954,7 @@ int affs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) struct inode *inode = filp->f_mapping->host; int ret, err; - err = filemap_write_and_wait_range(inode->i_mapping, start, end); + err = file_write_and_wait_range(filp, start, end); if (err) return err; diff --git a/fs/afs/cache.c b/fs/afs/cache.c index 577763c3d88b..1fe855191261 100644 --- a/fs/afs/cache.c +++ b/fs/afs/cache.c @@ -39,7 +39,6 @@ static uint16_t afs_vnode_cache_get_aux(const void *cookie_netfs_data, static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, const void *buffer, uint16_t buflen); -static void afs_vnode_cache_now_uncached(void *cookie_netfs_data); struct fscache_netfs afs_cache_netfs = { .name = "afs", @@ -75,7 +74,6 @@ struct fscache_cookie_def afs_vnode_cache_index_def = { .get_attr = afs_vnode_cache_get_attr, .get_aux = afs_vnode_cache_get_aux, .check_aux = afs_vnode_cache_check_aux, - .now_uncached = afs_vnode_cache_now_uncached, }; /* @@ -359,44 +357,3 @@ static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, _leave(" = SUCCESS"); return FSCACHE_CHECKAUX_OKAY; } - -/* - * indication the cookie 
is no longer uncached - * - this function is called when the backing store currently caching a cookie - * is removed - * - the netfs should use this to clean up any markers indicating cached pages - * - this is mandatory for any object that may have data - */ -static void afs_vnode_cache_now_uncached(void *cookie_netfs_data) -{ - struct afs_vnode *vnode = cookie_netfs_data; - struct pagevec pvec; - pgoff_t first; - int loop, nr_pages; - - _enter("{%x,%x,%Lx}", - vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version); - - pagevec_init(&pvec, 0); - first = 0; - - for (;;) { - /* grab a bunch of pages to clean */ - nr_pages = pagevec_lookup(&pvec, vnode->vfs_inode.i_mapping, - first, - PAGEVEC_SIZE - pagevec_count(&pvec)); - if (!nr_pages) - break; - - for (loop = 0; loop < nr_pages; loop++) - ClearPageFsCache(pvec.pages[loop]); - - first = pvec.pages[nr_pages - 1]->index + 1; - - pvec.nr = nr_pages; - pagevec_release(&pvec); - cond_resched(); - } - - _leave(""); -} diff --git a/fs/afs/misc.c b/fs/afs/misc.c index 100b207efc9e..c05f1f1c0d41 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -12,7 +12,6 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/errno.h> -#include <rxrpc/packet.h> #include "internal.h" #include "afs_fs.h" diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 02781e78ffb6..0bf191f0dbaf 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -14,7 +14,6 @@ #include <net/sock.h> #include <net/af_rxrpc.h> -#include <rxrpc/packet.h> #include "internal.h" #include "afs_cm.h" @@ -293,6 +292,19 @@ static void afs_load_bvec(struct afs_call *call, struct msghdr *msg, } /* + * Advance the AFS call state when the RxRPC call ends the transmit phase. + */ +static void afs_notify_end_request_tx(struct sock *sock, + struct rxrpc_call *rxcall, + unsigned long call_user_ID) +{ + struct afs_call *call = (struct afs_call *)call_user_ID; + + if (call->state == AFS_CALL_REQUESTING) + call->state = AFS_CALL_AWAIT_REPLY; +} + +/* * attach the data from a bunch of pages on an inode to a call */ static int afs_send_pages(struct afs_call *call, struct msghdr *msg) @@ -311,14 +323,8 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg) bytes = msg->msg_iter.count; nr = msg->msg_iter.nr_segs; - /* Have to change the state *before* sending the last - * packet as RxRPC might give us the reply before it - * returns from sending the request. - */ - if (first + nr - 1 >= last) - call->state = AFS_CALL_AWAIT_REPLY; - ret = rxrpc_kernel_send_data(afs_socket, call->rxcall, - msg, bytes); + ret = rxrpc_kernel_send_data(afs_socket, call->rxcall, msg, + bytes, afs_notify_end_request_tx); for (loop = 0; loop < nr; loop++) put_page(bv[loop].bv_page); if (ret < 0) @@ -410,7 +416,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, if (!call->send_pages) call->state = AFS_CALL_AWAIT_REPLY; ret = rxrpc_kernel_send_data(afs_socket, rxcall, - &msg, call->request_size); + &msg, call->request_size, + afs_notify_end_request_tx); if (ret < 0) goto error_do_abort; @@ -742,6 +749,20 @@ static int afs_deliver_cm_op_id(struct afs_call *call) } /* + * Advance the AFS call state when an RxRPC service call ends the transmit + * phase. 
+ */ +static void afs_notify_end_reply_tx(struct sock *sock, + struct rxrpc_call *rxcall, + unsigned long call_user_ID) +{ + struct afs_call *call = (struct afs_call *)call_user_ID; + + if (call->state == AFS_CALL_REPLYING) + call->state = AFS_CALL_AWAIT_ACK; +} + +/* * send an empty reply */ void afs_send_empty_reply(struct afs_call *call) @@ -760,7 +781,8 @@ void afs_send_empty_reply(struct afs_call *call) msg.msg_flags = 0; call->state = AFS_CALL_AWAIT_ACK; - switch (rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, 0)) { + switch (rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, 0, + afs_notify_end_reply_tx)) { case 0: _leave(" [replied]"); return; @@ -798,7 +820,8 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) msg.msg_flags = 0; call->state = AFS_CALL_AWAIT_ACK; - n = rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, len); + n = rxrpc_kernel_send_data(afs_socket, call->rxcall, &msg, len, + afs_notify_end_reply_tx); if (n >= 0) { /* Success */ _leave(" [replied]"); diff --git a/fs/afs/write.c b/fs/afs/write.c index 2d2fccd5044b..106e43db1115 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -714,7 +714,7 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) vnode->fid.vid, vnode->fid.vnode, file, datasync); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = file_write_and_wait_range(file, start, end); if (ret) return ret; inode_lock(inode); @@ -373,6 +373,14 @@ static int aio_migratepage(struct address_space *mapping, struct page *new, pgoff_t idx; int rc; + /* + * We cannot support the _NO_COPY case here, because copy needs to + * happen under the ctx->completion_lock. That does not work with the + * migration workflow of MIGRATE_SYNC_NO_COPY. + */ + if (mode == MIGRATE_SYNC_NO_COPY) + return -EINVAL; + rc = 0; /* mapping->private_lock here protects against the kioctx teardown. */ @@ -441,10 +449,9 @@ static const struct address_space_operations aio_ctx_aops = { #endif }; -static int aio_setup_ring(struct kioctx *ctx) +static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events) { struct aio_ring *ring; - unsigned nr_events = ctx->max_reqs; struct mm_struct *mm = current->mm; unsigned long size, unused; int nr_pages; @@ -707,6 +714,12 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) int err = -ENOMEM; /* + * Store the original nr_events -- what userspace passed to io_setup(), + * for counting against the global limit -- before it changes. + */ + unsigned int max_reqs = nr_events; + + /* * We keep track of the number of available ringbuffer slots, to prevent * overflow (reqs_available), and we also use percpu counters for this. 
* @@ -724,14 +737,14 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) return ERR_PTR(-EINVAL); } - if (!nr_events || (unsigned long)nr_events > (aio_max_nr * 2UL)) + if (!nr_events || (unsigned long)max_reqs > aio_max_nr) return ERR_PTR(-EAGAIN); ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL); if (!ctx) return ERR_PTR(-ENOMEM); - ctx->max_reqs = nr_events; + ctx->max_reqs = max_reqs; spin_lock_init(&ctx->ctx_lock); spin_lock_init(&ctx->completion_lock); @@ -753,7 +766,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) if (!ctx->cpu) goto err; - err = aio_setup_ring(ctx); + err = aio_setup_ring(ctx, nr_events); if (err < 0) goto err; @@ -764,8 +777,8 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) /* limit the number of system wide aios */ spin_lock(&aio_nr_lock); - if (aio_nr + nr_events > (aio_max_nr * 2UL) || - aio_nr + nr_events < aio_nr) { + if (aio_nr + ctx->max_reqs > aio_max_nr || + aio_nr + ctx->max_reqs < aio_nr) { spin_unlock(&aio_nr_lock); err = -EAGAIN; goto err_ctx; diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index beef981aa54f..4737615f0eaa 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -11,10 +11,21 @@ #include <linux/auto_fs4.h> #include <linux/auto_dev-ioctl.h> + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/time.h> +#include <linux/string.h> +#include <linux/wait.h> +#include <linux/sched.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/uaccess.h> #include <linux/mutex.h> #include <linux/spinlock.h> #include <linux/list.h> #include <linux/completion.h> +#include <asm/current.h> /* This is the range of ioctl() numbers we claim as ours */ #define AUTOFS_IOC_FIRST AUTOFS_IOC_READY @@ -24,17 +35,6 @@ #define AUTOFS_DEV_IOCTL_IOC_COUNT \ (AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD - AUTOFS_DEV_IOCTL_VERSION_CMD) -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/time.h> -#include <linux/string.h> -#include <linux/wait.h> -#include <linux/sched.h> -#include <linux/mount.h> -#include <linux/namei.h> -#include <asm/current.h> -#include <linux/uaccess.h> - #ifdef pr_fmt #undef pr_fmt #endif diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index dd9f1bebb5a3..b7c816f39404 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -93,17 +93,17 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param) * at the end of the struct. 
*/ static struct autofs_dev_ioctl * - copy_dev_ioctl(struct autofs_dev_ioctl __user *in) +copy_dev_ioctl(struct autofs_dev_ioctl __user *in) { struct autofs_dev_ioctl tmp, *res; - if (copy_from_user(&tmp, in, sizeof(tmp))) + if (copy_from_user(&tmp, in, AUTOFS_DEV_IOCTL_SIZE)) return ERR_PTR(-EFAULT); - if (tmp.size < sizeof(tmp)) + if (tmp.size < AUTOFS_DEV_IOCTL_SIZE) return ERR_PTR(-EINVAL); - if (tmp.size > (PATH_MAX + sizeof(tmp))) + if (tmp.size > AUTOFS_DEV_IOCTL_SIZE + PATH_MAX) return ERR_PTR(-ENAMETOOLONG); res = memdup_user(in, tmp.size); @@ -133,8 +133,8 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) goto out; } - if (param->size > sizeof(*param)) { - err = invalid_str(param->path, param->size - sizeof(*param)); + if (param->size > AUTOFS_DEV_IOCTL_SIZE) { + err = invalid_str(param->path, param->size - AUTOFS_DEV_IOCTL_SIZE); if (err) { pr_warn( "path string terminator missing for cmd(0x%08x)\n", @@ -258,11 +258,6 @@ static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid) if (err) goto out; - /* - * Find autofs super block that has the device number - * corresponding to the autofs fs we want to open. - */ - filp = dentry_open(&path, O_RDONLY, current_cred()); path_put(&path); if (IS_ERR(filp)) { @@ -451,7 +446,7 @@ static int autofs_dev_ioctl_requester(struct file *fp, dev_t devid; int err = -ENOENT; - if (param->size <= sizeof(*param)) { + if (param->size <= AUTOFS_DEV_IOCTL_SIZE) { err = -EINVAL; goto out; } @@ -539,7 +534,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, unsigned int devid, magic; int err = -ENOENT; - if (param->size <= sizeof(*param)) { + if (param->size <= AUTOFS_DEV_IOCTL_SIZE) { err = -EINVAL; goto out; } @@ -628,10 +623,6 @@ static int _autofs_dev_ioctl(unsigned int command, ioctl_fn fn = NULL; int err = 0; - /* only root can play with this */ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - cmd_first = _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST); cmd = _IOC_NR(command); @@ -640,6 +631,14 @@ static int _autofs_dev_ioctl(unsigned int command, return -ENOTTY; } + /* Only root can use ioctls other than AUTOFS_DEV_IOCTL_VERSION_CMD + * and AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD + */ + if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD && + cmd != AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + /* Copy the parameters into kernel space. 
*/ param = copy_dev_ioctl(user); if (IS_ERR(param)) @@ -706,7 +705,8 @@ out: return err; } -static long autofs_dev_ioctl(struct file *file, uint command, ulong u) +static long autofs_dev_ioctl(struct file *file, unsigned int command, + unsigned long u) { int err; @@ -715,9 +715,10 @@ static long autofs_dev_ioctl(struct file *file, uint command, ulong u) } #ifdef CONFIG_COMPAT -static long autofs_dev_ioctl_compat(struct file *file, uint command, ulong u) +static long autofs_dev_ioctl_compat(struct file *file, unsigned int command, + unsigned long u) { - return (long) autofs_dev_ioctl(file, command, (ulong) compat_ptr(u)); + return autofs_dev_ioctl(file, command, (unsigned long) compat_ptr(u)); } #else #define autofs_dev_ioctl_compat NULL @@ -733,7 +734,8 @@ static const struct file_operations _dev_ioctl_fops = { static struct miscdevice _autofs_dev_ioctl_misc = { .minor = AUTOFS_MINOR, .name = AUTOFS_DEVICE_NAME, - .fops = &_dev_ioctl_fops + .fops = &_dev_ioctl_fops, + .mode = 0644, }; MODULE_ALIAS_MISCDEV(AUTOFS_MINOR); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 2f928b87c90e..73b01e474fdc 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -252,7 +252,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, NEW_AUX_ENT(AT_EUID, from_kuid_munged(cred->user_ns, cred->euid)); NEW_AUX_ENT(AT_GID, from_kgid_munged(cred->user_ns, cred->gid)); NEW_AUX_ENT(AT_EGID, from_kgid_munged(cred->user_ns, cred->egid)); - NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); + NEW_AUX_ENT(AT_SECURE, bprm->secureexec); NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes); #ifdef ELF_HWCAP2 NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index b4ebfe203a68..e70c039ac190 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -651,7 +651,7 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, NEW_AUX_ENT(AT_EUID, (elf_addr_t) from_kuid_munged(cred->user_ns, cred->euid)); NEW_AUX_ENT(AT_GID, (elf_addr_t) from_kgid_munged(cred->user_ns, cred->gid)); NEW_AUX_ENT(AT_EGID, (elf_addr_t) from_kgid_munged(cred->user_ns, cred->egid)); - NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); + NEW_AUX_ENT(AT_SECURE, bprm->secureexec); NEW_AUX_ENT(AT_EXECFN, bprm->exec); #ifdef ARCH_DLINFO diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index afb7e9d521d2..475d083f8088 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -187,13 +187,11 @@ static int decompress_exec(struct linux_binprm *bprm, loff_t fpos, char *dst, memset(&strm, 0, sizeof(strm)); strm.workspace = kmalloc(zlib_inflate_workspacesize(), GFP_KERNEL); - if (strm.workspace == NULL) { - pr_debug("no memory for decompress workspace\n"); + if (!strm.workspace) return -ENOMEM; - } + buf = kmalloc(LBUFSIZE, GFP_KERNEL); - if (buf == NULL) { - pr_debug("no memory for read buffer\n"); + if (!buf) { retval = -ENOMEM; goto out_free; } @@ -882,7 +880,7 @@ static int load_flat_shared_library(int id, struct lib_info *libs) * as we're past the point of no return and are dealing with shared * libraries. 
*/ - bprm.cred_prepared = 1; + bprm.called_set_creds = 1; res = prepare_binprm(&bprm); diff --git a/fs/block_dev.c b/fs/block_dev.c index 9941dc8342df..bb715b2fcfb8 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -223,7 +223,7 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, } bio_init(&bio, vecs, nr_pages); - bio.bi_bdev = bdev; + bio_set_dev(&bio, bdev); bio.bi_iter.bi_sector = pos >> 9; bio.bi_write_hint = iocb->ki_hint; bio.bi_private = current; @@ -362,7 +362,7 @@ __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) blk_start_plug(&plug); for (;;) { - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = pos >> 9; bio->bi_write_hint = iocb->ki_hint; bio->bi_private = dio; @@ -1451,6 +1451,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = disk; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; + bdev->bd_partno = partno; if (!partno) { ret = -ENXIO; diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 80e9c18ea64f..a26c63b4ad68 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -6,6 +6,8 @@ config BTRFS_FS select ZLIB_DEFLATE select LZO_COMPRESS select LZO_DECOMPRESS + select ZSTD_COMPRESS + select ZSTD_DECOMPRESS select RAID6_PQ select XOR_BLOCKS select SRCU diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 128ce17a80b0..962a95aefb81 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -6,7 +6,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ transaction.o inode.o file.o tree-defrag.o \ extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ - export.o tree-log.o free-space-cache.o zlib.o lzo.o \ + export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o hash.o free-space-tree.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 8d8370ddb6b2..1ba49ebe67da 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -114,13 +114,17 @@ out: int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { int ret; + umode_t old_mode = inode->i_mode; if (type == ACL_TYPE_ACCESS && acl) { ret = posix_acl_update_mode(inode, &inode->i_mode, &acl); if (ret) return ret; } - return __btrfs_set_acl(NULL, inode, acl, type); + ret = __btrfs_set_acl(NULL, inode, acl, type); + if (ret) + inode->i_mode = old_mode; + return ret; } /* diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index ff0b0be92d61..e00c8a9fd5bb 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -75,18 +75,18 @@ void btrfs_##name(struct work_struct *arg) \ } struct btrfs_fs_info * -btrfs_workqueue_owner(struct __btrfs_workqueue *wq) +btrfs_workqueue_owner(const struct __btrfs_workqueue *wq) { return wq->fs_info; } struct btrfs_fs_info * -btrfs_work_owner(struct btrfs_work *work) +btrfs_work_owner(const struct btrfs_work *work) { return work->wq->fs_info; } -bool btrfs_workqueue_normal_congested(struct btrfs_workqueue *wq) +bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq) { /* * We could compare wq->normal->pending with num_online_cpus() diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 1f9597355c9d..fc957e00cef1 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -82,7 +82,7 @@ void btrfs_queue_work(struct btrfs_workqueue *wq, void 
btrfs_destroy_workqueue(struct btrfs_workqueue *wq); void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max); void btrfs_set_work_high_priority(struct btrfs_work *work); -struct btrfs_fs_info *btrfs_work_owner(struct btrfs_work *work); -struct btrfs_fs_info *btrfs_workqueue_owner(struct __btrfs_workqueue *wq); -bool btrfs_workqueue_normal_congested(struct btrfs_workqueue *wq); +struct btrfs_fs_info *btrfs_work_owner(const struct btrfs_work *work); +struct btrfs_fs_info *btrfs_workqueue_owner(const struct __btrfs_workqueue *wq); +bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq); #endif diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f723c11bb763..b517ef1477ea 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -18,6 +18,7 @@ #include <linux/mm.h> #include <linux/rbtree.h> +#include <trace/events/btrfs.h> #include "ctree.h" #include "disk-io.h" #include "backref.h" @@ -26,11 +27,6 @@ #include "delayed-ref.h" #include "locking.h" -enum merge_mode { - MERGE_IDENTICAL_KEYS = 1, - MERGE_IDENTICAL_PARENTS, -}; - /* Just an arbitrary number so we can be sure this happened */ #define BACKREF_FOUND_SHARED 6 @@ -40,269 +36,11 @@ struct extent_inode_elem { struct extent_inode_elem *next; }; -/* - * ref_root is used as the root of the ref tree that hold a collection - * of unique references. - */ -struct ref_root { - struct rb_root rb_root; - - /* - * The unique_refs represents the number of ref_nodes with a positive - * count stored in the tree. Even if a ref_node (the count is greater - * than one) is added, the unique_refs will only increase by one. - */ - unsigned int unique_refs; -}; - -/* ref_node is used to store a unique reference to the ref tree. */ -struct ref_node { - struct rb_node rb_node; - - /* For NORMAL_REF, otherwise all these fields should be set to 0 */ - u64 root_id; - u64 object_id; - u64 offset; - - /* For SHARED_REF, otherwise parent field should be set to 0 */ - u64 parent; - - /* Ref to the ref_mod of btrfs_delayed_ref_node */ - int ref_mod; -}; - -/* Dynamically allocate and initialize a ref_root */ -static struct ref_root *ref_root_alloc(void) -{ - struct ref_root *ref_tree; - - ref_tree = kmalloc(sizeof(*ref_tree), GFP_NOFS); - if (!ref_tree) - return NULL; - - ref_tree->rb_root = RB_ROOT; - ref_tree->unique_refs = 0; - - return ref_tree; -} - -/* Free all nodes in the ref tree, and reinit ref_root */ -static void ref_root_fini(struct ref_root *ref_tree) -{ - struct ref_node *node; - struct rb_node *next; - - while ((next = rb_first(&ref_tree->rb_root)) != NULL) { - node = rb_entry(next, struct ref_node, rb_node); - rb_erase(next, &ref_tree->rb_root); - kfree(node); - } - - ref_tree->rb_root = RB_ROOT; - ref_tree->unique_refs = 0; -} - -static void ref_root_free(struct ref_root *ref_tree) -{ - if (!ref_tree) - return; - - ref_root_fini(ref_tree); - kfree(ref_tree); -} - -/* - * Compare ref_node with (root_id, object_id, offset, parent) - * - * The function compares two ref_node a and b. It returns an integer less - * than, equal to, or greater than zero , respectively, to be less than, to - * equal, or be greater than b. 
- */ -static int ref_node_cmp(struct ref_node *a, struct ref_node *b) -{ - if (a->root_id < b->root_id) - return -1; - else if (a->root_id > b->root_id) - return 1; - - if (a->object_id < b->object_id) - return -1; - else if (a->object_id > b->object_id) - return 1; - - if (a->offset < b->offset) - return -1; - else if (a->offset > b->offset) - return 1; - - if (a->parent < b->parent) - return -1; - else if (a->parent > b->parent) - return 1; - - return 0; -} - -/* - * Search ref_node with (root_id, object_id, offset, parent) in the tree - * - * if found, the pointer of the ref_node will be returned; - * if not found, NULL will be returned and pos will point to the rb_node for - * insert, pos_parent will point to pos'parent for insert; -*/ -static struct ref_node *__ref_tree_search(struct ref_root *ref_tree, - struct rb_node ***pos, - struct rb_node **pos_parent, - u64 root_id, u64 object_id, - u64 offset, u64 parent) -{ - struct ref_node *cur = NULL; - struct ref_node entry; - int ret; - - entry.root_id = root_id; - entry.object_id = object_id; - entry.offset = offset; - entry.parent = parent; - - *pos = &ref_tree->rb_root.rb_node; - - while (**pos) { - *pos_parent = **pos; - cur = rb_entry(*pos_parent, struct ref_node, rb_node); - - ret = ref_node_cmp(cur, &entry); - if (ret > 0) - *pos = &(**pos)->rb_left; - else if (ret < 0) - *pos = &(**pos)->rb_right; - else - return cur; - } - - return NULL; -} - -/* - * Insert a ref_node to the ref tree - * @pos used for specifiy the position to insert - * @pos_parent for specifiy pos's parent - * - * success, return 0; - * ref_node already exists, return -EEXIST; -*/ -static int ref_tree_insert(struct ref_root *ref_tree, struct rb_node **pos, - struct rb_node *pos_parent, struct ref_node *ins) -{ - struct rb_node **p = NULL; - struct rb_node *parent = NULL; - struct ref_node *cur = NULL; - - if (!pos) { - cur = __ref_tree_search(ref_tree, &p, &parent, ins->root_id, - ins->object_id, ins->offset, - ins->parent); - if (cur) - return -EEXIST; - } else { - p = pos; - parent = pos_parent; - } - - rb_link_node(&ins->rb_node, parent, p); - rb_insert_color(&ins->rb_node, &ref_tree->rb_root); - - return 0; -} - -/* Erase and free ref_node, caller should update ref_root->unique_refs */ -static void ref_tree_remove(struct ref_root *ref_tree, struct ref_node *node) -{ - rb_erase(&node->rb_node, &ref_tree->rb_root); - kfree(node); -} - -/* - * Update ref_root->unique_refs - * - * Call __ref_tree_search - * 1. if ref_node doesn't exist, ref_tree_insert this node, and update - * ref_root->unique_refs: - * if ref_node->ref_mod > 0, ref_root->unique_refs++; - * if ref_node->ref_mod < 0, do noting; - * - * 2. if ref_node is found, then get origin ref_node->ref_mod, and update - * ref_node->ref_mod. - * if ref_node->ref_mod is equal to 0,then call ref_tree_remove - * - * according to origin_mod and new_mod, update ref_root->items - * +----------------+--------------+-------------+ - * | |new_count <= 0|new_count > 0| - * +----------------+--------------+-------------+ - * |origin_count < 0| 0 | 1 | - * +----------------+--------------+-------------+ - * |origin_count > 0| -1 | 0 | - * +----------------+--------------+-------------+ - * - * In case of allocation failure, -ENOMEM is returned and the ref_tree stays - * unaltered. 
- * Success, return 0 - */ -static int ref_tree_add(struct ref_root *ref_tree, u64 root_id, u64 object_id, - u64 offset, u64 parent, int count) -{ - struct ref_node *node = NULL; - struct rb_node **pos = NULL; - struct rb_node *pos_parent = NULL; - int origin_count; - int ret; - - if (!count) - return 0; - - node = __ref_tree_search(ref_tree, &pos, &pos_parent, root_id, - object_id, offset, parent); - if (node == NULL) { - node = kmalloc(sizeof(*node), GFP_NOFS); - if (!node) - return -ENOMEM; - - node->root_id = root_id; - node->object_id = object_id; - node->offset = offset; - node->parent = parent; - node->ref_mod = count; - - ret = ref_tree_insert(ref_tree, pos, pos_parent, node); - ASSERT(!ret); - if (ret) { - kfree(node); - return ret; - } - - ref_tree->unique_refs += node->ref_mod > 0 ? 1 : 0; - - return 0; - } - - origin_count = node->ref_mod; - node->ref_mod += count; - - if (node->ref_mod > 0) - ref_tree->unique_refs += origin_count > 0 ? 0 : 1; - else if (node->ref_mod <= 0) - ref_tree->unique_refs += origin_count > 0 ? -1 : 0; - - if (!node->ref_mod) - ref_tree_remove(ref_tree, node); - - return 0; -} - -static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb, - struct btrfs_file_extent_item *fi, - u64 extent_item_pos, - struct extent_inode_elem **eie) +static int check_extent_in_eb(const struct btrfs_key *key, + const struct extent_buffer *eb, + const struct btrfs_file_extent_item *fi, + u64 extent_item_pos, + struct extent_inode_elem **eie) { u64 offset = 0; struct extent_inode_elem *e; @@ -344,9 +82,9 @@ static void free_inode_elem_list(struct extent_inode_elem *eie) } } -static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte, - u64 extent_item_pos, - struct extent_inode_elem **eie) +static int find_extent_in_eb(const struct extent_buffer *eb, + u64 wanted_disk_byte, u64 extent_item_pos, + struct extent_inode_elem **eie) { u64 disk_byte; struct btrfs_key key; @@ -383,26 +121,44 @@ static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte, return 0; } +struct preftree { + struct rb_root root; + unsigned int count; +}; + +#define PREFTREE_INIT { .root = RB_ROOT, .count = 0 } + +struct preftrees { + struct preftree direct; /* BTRFS_SHARED_[DATA|BLOCK]_REF_KEY */ + struct preftree indirect; /* BTRFS_[TREE_BLOCK|EXTENT_DATA]_REF_KEY */ + struct preftree indirect_missing_keys; +}; + /* - * this structure records all encountered refs on the way up to the root + * Checks for a shared extent during backref search. + * + * The share_count tracks prelim_refs (direct and indirect) having a + * ref->count >0: + * - incremented when a ref->count transitions to >0 + * - decremented when a ref->count transitions to <1 */ -struct __prelim_ref { - struct list_head list; - u64 root_id; - struct btrfs_key key_for_search; - int level; - int count; - struct extent_inode_elem *inode_list; - u64 parent; - u64 wanted_disk_byte; +struct share_check { + u64 root_objectid; + u64 inum; + int share_count; }; +static inline int extent_is_shared(struct share_check *sc) +{ + return (sc && sc->share_count > 1) ? 
BACKREF_FOUND_SHARED : 0; +} + static struct kmem_cache *btrfs_prelim_ref_cache; int __init btrfs_prelim_ref_init(void) { btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref", - sizeof(struct __prelim_ref), + sizeof(struct prelim_ref), 0, SLAB_MEM_SPREAD, NULL); @@ -416,6 +172,134 @@ void btrfs_prelim_ref_exit(void) kmem_cache_destroy(btrfs_prelim_ref_cache); } +static void free_pref(struct prelim_ref *ref) +{ + kmem_cache_free(btrfs_prelim_ref_cache, ref); +} + +/* + * Return 0 when both refs are for the same block (and can be merged). + * A -1 return indicates ref1 is a 'lower' block than ref2, while 1 + * indicates a 'higher' block. + */ +static int prelim_ref_compare(struct prelim_ref *ref1, + struct prelim_ref *ref2) +{ + if (ref1->level < ref2->level) + return -1; + if (ref1->level > ref2->level) + return 1; + if (ref1->root_id < ref2->root_id) + return -1; + if (ref1->root_id > ref2->root_id) + return 1; + if (ref1->key_for_search.type < ref2->key_for_search.type) + return -1; + if (ref1->key_for_search.type > ref2->key_for_search.type) + return 1; + if (ref1->key_for_search.objectid < ref2->key_for_search.objectid) + return -1; + if (ref1->key_for_search.objectid > ref2->key_for_search.objectid) + return 1; + if (ref1->key_for_search.offset < ref2->key_for_search.offset) + return -1; + if (ref1->key_for_search.offset > ref2->key_for_search.offset) + return 1; + if (ref1->parent < ref2->parent) + return -1; + if (ref1->parent > ref2->parent) + return 1; + + return 0; +} + +void update_share_count(struct share_check *sc, int oldcount, int newcount) +{ + if ((!sc) || (oldcount == 0 && newcount < 1)) + return; + + if (oldcount > 0 && newcount < 1) + sc->share_count--; + else if (oldcount < 1 && newcount > 0) + sc->share_count++; +} + +/* + * Add @newref to the @root rbtree, merging identical refs. + * + * Callers should assume that newref has been freed after calling. + */ +static void prelim_ref_insert(const struct btrfs_fs_info *fs_info, + struct preftree *preftree, + struct prelim_ref *newref, + struct share_check *sc) +{ + struct rb_root *root; + struct rb_node **p; + struct rb_node *parent = NULL; + struct prelim_ref *ref; + int result; + + root = &preftree->root; + p = &root->rb_node; + + while (*p) { + parent = *p; + ref = rb_entry(parent, struct prelim_ref, rbnode); + result = prelim_ref_compare(ref, newref); + if (result < 0) { + p = &(*p)->rb_left; + } else if (result > 0) { + p = &(*p)->rb_right; + } else { + /* Identical refs, merge them and free @newref */ + struct extent_inode_elem *eie = ref->inode_list; + + while (eie && eie->next) + eie = eie->next; + + if (!eie) + ref->inode_list = newref->inode_list; + else + eie->next = newref->inode_list; + trace_btrfs_prelim_ref_merge(fs_info, ref, newref, + preftree->count); + /* + * A delayed ref can have newref->count < 0. + * The ref->count is updated to follow any + * BTRFS_[ADD|DROP]_DELAYED_REF actions. + */ + update_share_count(sc, ref->count, + ref->count + newref->count); + ref->count += newref->count; + free_pref(newref); + return; + } + } + + update_share_count(sc, 0, newref->count); + preftree->count++; + trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count); + rb_link_node(&newref->rbnode, parent, p); + rb_insert_color(&newref->rbnode, root); +} + +/* + * Release the entire tree. We don't care about internal consistency so + * just free everything and then reset the tree root. 
+ */ +static void prelim_release(struct preftree *preftree) +{ + struct prelim_ref *ref, *next_ref; + + rbtree_postorder_for_each_entry_safe(ref, next_ref, &preftree->root, + rbnode) + free_pref(ref); + + preftree->root = RB_ROOT; + preftree->count = 0; +} + /* * the rules for all callers of this function are: * - obtaining the parent is the goal @@ -448,19 +332,19 @@ void btrfs_prelim_ref_exit(void) * * - column 1, 3: we've the parent -> done * - column 2: we take the first key from the block to find the parent - * (see __add_missing_keys) + * (see add_missing_keys) * - column 4: we use the key to find the parent * * additional information that's available but not required to find the parent * block might help in merging entries to gain some speed. */ - -static int __add_prelim_ref(struct list_head *head, u64 root_id, - struct btrfs_key *key, int level, - u64 parent, u64 wanted_disk_byte, int count, - gfp_t gfp_mask) +static int add_prelim_ref(const struct btrfs_fs_info *fs_info, + struct preftree *preftree, u64 root_id, + const struct btrfs_key *key, int level, u64 parent, + u64 wanted_disk_byte, int count, + struct share_check *sc, gfp_t gfp_mask) { - struct __prelim_ref *ref; + struct prelim_ref *ref; if (root_id == BTRFS_DATA_RELOC_TREE_OBJECTID) return 0; @@ -503,13 +387,37 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id, ref->count = count; ref->parent = parent; ref->wanted_disk_byte = wanted_disk_byte; - list_add_tail(&ref->list, head); + prelim_ref_insert(fs_info, preftree, ref, sc); + return extent_is_shared(sc); +} - return 0; +/* direct refs use root == 0, key == NULL */ +static int add_direct_ref(const struct btrfs_fs_info *fs_info, + struct preftrees *preftrees, int level, u64 parent, + u64 wanted_disk_byte, int count, + struct share_check *sc, gfp_t gfp_mask) +{ + return add_prelim_ref(fs_info, &preftrees->direct, 0, NULL, level, + parent, wanted_disk_byte, count, sc, gfp_mask); +} + +/* indirect refs use parent == 0 */ +static int add_indirect_ref(const struct btrfs_fs_info *fs_info, + struct preftrees *preftrees, u64 root_id, + const struct btrfs_key *key, int level, + u64 wanted_disk_byte, int count, + struct share_check *sc, gfp_t gfp_mask) +{ + struct preftree *tree = &preftrees->indirect; + + if (!key) + tree = &preftrees->indirect_missing_keys; + return add_prelim_ref(fs_info, tree, root_id, key, level, 0, + wanted_disk_byte, count, sc, gfp_mask); } static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, - struct ulist *parents, struct __prelim_ref *ref, + struct ulist *parents, struct prelim_ref *ref, int level, u64 time_seq, const u64 *extent_item_pos, u64 total_refs) { @@ -599,11 +507,10 @@ next: * resolve an indirect backref in the form (root_id, key, level) * to a logical address */ -static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 time_seq, - struct __prelim_ref *ref, - struct ulist *parents, - const u64 *extent_item_pos, u64 total_refs) +static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 time_seq, + struct prelim_ref *ref, struct ulist *parents, + const u64 *extent_item_pos, u64 total_refs) { struct btrfs_root *root; struct btrfs_key root_key; @@ -681,52 +588,90 @@ out: return ret; } +static struct extent_inode_elem * +unode_aux_to_inode_list(struct ulist_node *node) +{ + if (!node) + return NULL; + return (struct extent_inode_elem *)(uintptr_t)node->aux; +} + /* - * resolve all indirect backrefs from the list + * We maintain three 
seperate rbtrees: one for direct refs, one for + * indirect refs which have a key, and one for indirect refs which do not + * have a key. Each tree does merge on insertion. + * + * Once all of the references are located, we iterate over the tree of + * indirect refs with missing keys. An appropriate key is located and + * the ref is moved onto the tree for indirect refs. After all missing + * keys are thus located, we iterate over the indirect ref tree, resolve + * each reference, and then insert the resolved reference onto the + * direct tree (merging there too). + * + * New backrefs (i.e., for parent nodes) are added to the appropriate + * rbtree as they are encountered. The new backrefs are subsequently + * resolved as above. */ -static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 time_seq, - struct list_head *head, - const u64 *extent_item_pos, u64 total_refs, - u64 root_objectid) +static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 time_seq, + struct preftrees *preftrees, + const u64 *extent_item_pos, u64 total_refs, + struct share_check *sc) { int err; int ret = 0; - struct __prelim_ref *ref; - struct __prelim_ref *ref_safe; - struct __prelim_ref *new_ref; struct ulist *parents; struct ulist_node *node; struct ulist_iterator uiter; + struct rb_node *rnode; parents = ulist_alloc(GFP_NOFS); if (!parents) return -ENOMEM; /* - * _safe allows us to insert directly after the current item without - * iterating over the newly inserted items. - * we're also allowed to re-assign ref during iteration. + * We could trade memory usage for performance here by iterating + * the tree, allocating new refs for each insertion, and then + * freeing the entire indirect tree when we're done. In some test + * cases, the tree can grow quite large (~200k objects). */ - list_for_each_entry_safe(ref, ref_safe, head, list) { - if (ref->parent) /* already direct */ - continue; - if (ref->count == 0) + while ((rnode = rb_first(&preftrees->indirect.root))) { + struct prelim_ref *ref; + + ref = rb_entry(rnode, struct prelim_ref, rbnode); + if (WARN(ref->parent, + "BUG: direct ref found in indirect tree")) { + ret = -EINVAL; + goto out; + } + + rb_erase(&ref->rbnode, &preftrees->indirect.root); + preftrees->indirect.count--; + + if (ref->count == 0) { + free_pref(ref); continue; - if (root_objectid && ref->root_id != root_objectid) { + } + + if (sc && sc->root_objectid && + ref->root_id != sc->root_objectid) { + free_pref(ref); ret = BACKREF_FOUND_SHARED; goto out; } - err = __resolve_indirect_ref(fs_info, path, time_seq, ref, - parents, extent_item_pos, - total_refs); + err = resolve_indirect_ref(fs_info, path, time_seq, ref, + parents, extent_item_pos, + total_refs); /* * we can only tolerate ENOENT,otherwise,we should catch error * and return directly. */ if (err == -ENOENT) { + prelim_ref_insert(fs_info, &preftrees->direct, ref, + NULL); continue; } else if (err) { + free_pref(ref); ret = err; goto out; } @@ -735,68 +680,65 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, ULIST_ITER_INIT(&uiter); node = ulist_next(parents, &uiter); ref->parent = node ? node->val : 0; - ref->inode_list = node ? - (struct extent_inode_elem *)(uintptr_t)node->aux : NULL; + ref->inode_list = unode_aux_to_inode_list(node); - /* additional parents require new refs being added here */ + /* Add a prelim_ref(s) for any other parent(s). 
*/ while ((node = ulist_next(parents, &uiter))) { + struct prelim_ref *new_ref; + new_ref = kmem_cache_alloc(btrfs_prelim_ref_cache, GFP_NOFS); if (!new_ref) { + free_pref(ref); ret = -ENOMEM; goto out; } memcpy(new_ref, ref, sizeof(*ref)); new_ref->parent = node->val; - new_ref->inode_list = (struct extent_inode_elem *) - (uintptr_t)node->aux; - list_add(&new_ref->list, &ref->list); + new_ref->inode_list = unode_aux_to_inode_list(node); + prelim_ref_insert(fs_info, &preftrees->direct, + new_ref, NULL); } + + /* + * Now it's a direct ref, put it in the the direct tree. We must + * do this last because the ref could be merged/freed here. + */ + prelim_ref_insert(fs_info, &preftrees->direct, ref, NULL); + ulist_reinit(parents); + cond_resched(); } out: ulist_free(parents); return ret; } -static inline int ref_for_same_block(struct __prelim_ref *ref1, - struct __prelim_ref *ref2) -{ - if (ref1->level != ref2->level) - return 0; - if (ref1->root_id != ref2->root_id) - return 0; - if (ref1->key_for_search.type != ref2->key_for_search.type) - return 0; - if (ref1->key_for_search.objectid != ref2->key_for_search.objectid) - return 0; - if (ref1->key_for_search.offset != ref2->key_for_search.offset) - return 0; - if (ref1->parent != ref2->parent) - return 0; - - return 1; -} - /* * read tree blocks and add keys where required. */ -static int __add_missing_keys(struct btrfs_fs_info *fs_info, - struct list_head *head) +static int add_missing_keys(struct btrfs_fs_info *fs_info, + struct preftrees *preftrees) { - struct __prelim_ref *ref; + struct prelim_ref *ref; struct extent_buffer *eb; + struct preftree *tree = &preftrees->indirect_missing_keys; + struct rb_node *node; - list_for_each_entry(ref, head, list) { - if (ref->parent) - continue; - if (ref->key_for_search.type) - continue; + while ((node = rb_first(&tree->root))) { + ref = rb_entry(node, struct prelim_ref, rbnode); + rb_erase(node, &tree->root); + + BUG_ON(ref->parent); /* should not be a direct ref */ + BUG_ON(ref->key_for_search.type); BUG_ON(!ref->wanted_disk_byte); + eb = read_tree_block(fs_info, ref->wanted_disk_byte, 0); if (IS_ERR(eb)) { + free_pref(ref); return PTR_ERR(eb); } else if (!extent_buffer_uptodate(eb)) { + free_pref(ref); free_extent_buffer(eb); return -EIO; } @@ -807,73 +749,33 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0); btrfs_tree_read_unlock(eb); free_extent_buffer(eb); + prelim_ref_insert(fs_info, &preftrees->indirect, ref, NULL); + cond_resched(); } return 0; } /* - * merge backrefs and adjust counts accordingly - * - * FIXME: For MERGE_IDENTICAL_KEYS, if we add more keys in __add_prelim_ref - * then we can merge more here. Additionally, we could even add a key - * range for the blocks we looked into to merge even more (-> replace - * unresolved refs by those having a parent). 
- */ -static void __merge_refs(struct list_head *head, enum merge_mode mode) -{ - struct __prelim_ref *pos1; - - list_for_each_entry(pos1, head, list) { - struct __prelim_ref *pos2 = pos1, *tmp; - - list_for_each_entry_safe_continue(pos2, tmp, head, list) { - struct __prelim_ref *ref1 = pos1, *ref2 = pos2; - struct extent_inode_elem *eie; - - if (!ref_for_same_block(ref1, ref2)) - continue; - if (mode == MERGE_IDENTICAL_KEYS) { - if (!ref1->parent && ref2->parent) - swap(ref1, ref2); - } else { - if (ref1->parent != ref2->parent) - continue; - } - - eie = ref1->inode_list; - while (eie && eie->next) - eie = eie->next; - if (eie) - eie->next = ref2->inode_list; - else - ref1->inode_list = ref2->inode_list; - ref1->count += ref2->count; - - list_del(&ref2->list); - kmem_cache_free(btrfs_prelim_ref_cache, ref2); - cond_resched(); - } - - } -} - -/* * add all currently queued delayed refs from this head whose seq nr is * smaller or equal that seq to the list */ -static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, - struct list_head *prefs, u64 *total_refs, - u64 inum) +static int add_delayed_refs(const struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_head *head, u64 seq, + struct preftrees *preftrees, u64 *total_refs, + struct share_check *sc) { struct btrfs_delayed_ref_node *node; struct btrfs_delayed_extent_op *extent_op = head->extent_op; struct btrfs_key key; - struct btrfs_key op_key = {0}; - int sgn; + struct btrfs_key tmp_op_key; + struct btrfs_key *op_key = NULL; + int count; int ret = 0; - if (extent_op && extent_op->update_key) - btrfs_disk_key_to_cpu(&op_key, &extent_op->key); + if (extent_op && extent_op->update_key) { + btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key); + op_key = &tmp_op_key; + } spin_lock(&head->lock); list_for_each_entry(node, &head->ref_list, list) { @@ -886,36 +788,40 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, WARN_ON(1); continue; case BTRFS_ADD_DELAYED_REF: - sgn = 1; + count = node->ref_mod; break; case BTRFS_DROP_DELAYED_REF: - sgn = -1; + count = node->ref_mod * -1; break; default: BUG_ON(1); } - *total_refs += (node->ref_mod * sgn); + *total_refs += count; switch (node->type) { case BTRFS_TREE_BLOCK_REF_KEY: { + /* NORMAL INDIRECT METADATA backref */ struct btrfs_delayed_tree_ref *ref; ref = btrfs_delayed_node_to_tree_ref(node); - ret = __add_prelim_ref(prefs, ref->root, &op_key, - ref->level + 1, 0, node->bytenr, - node->ref_mod * sgn, GFP_ATOMIC); + ret = add_indirect_ref(fs_info, preftrees, ref->root, + &tmp_op_key, ref->level + 1, + node->bytenr, count, sc, + GFP_ATOMIC); break; } case BTRFS_SHARED_BLOCK_REF_KEY: { + /* SHARED DIRECT METADATA backref */ struct btrfs_delayed_tree_ref *ref; ref = btrfs_delayed_node_to_tree_ref(node); - ret = __add_prelim_ref(prefs, 0, NULL, - ref->level + 1, ref->parent, - node->bytenr, - node->ref_mod * sgn, GFP_ATOMIC); + + ret = add_direct_ref(fs_info, preftrees, ref->level + 1, + ref->parent, node->bytenr, count, + sc, GFP_ATOMIC); break; } case BTRFS_EXTENT_DATA_REF_KEY: { + /* NORMAL INDIRECT DATA backref */ struct btrfs_delayed_data_ref *ref; ref = btrfs_delayed_node_to_data_ref(node); @@ -927,42 +833,53 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, * Found a inum that doesn't match our known inum, we * know it's shared. 
*/ - if (inum && ref->objectid != inum) { + if (sc && sc->inum && ref->objectid != sc->inum) { ret = BACKREF_FOUND_SHARED; - break; + goto out; } - ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0, - node->bytenr, - node->ref_mod * sgn, GFP_ATOMIC); + ret = add_indirect_ref(fs_info, preftrees, ref->root, + &key, 0, node->bytenr, count, sc, + GFP_ATOMIC); break; } case BTRFS_SHARED_DATA_REF_KEY: { + /* SHARED DIRECT FULL backref */ struct btrfs_delayed_data_ref *ref; ref = btrfs_delayed_node_to_data_ref(node); - ret = __add_prelim_ref(prefs, 0, NULL, 0, - ref->parent, node->bytenr, - node->ref_mod * sgn, GFP_ATOMIC); + + ret = add_direct_ref(fs_info, preftrees, 0, ref->parent, + node->bytenr, count, sc, + GFP_ATOMIC); break; } default: WARN_ON(1); } - if (ret) + /* + * We must ignore BACKREF_FOUND_SHARED until all delayed + * refs have been checked. + */ + if (ret && (ret != BACKREF_FOUND_SHARED)) break; } + if (!ret) + ret = extent_is_shared(sc); +out: spin_unlock(&head->lock); return ret; } /* * add all inline backrefs for bytenr to the list + * + * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED. */ -static int __add_inline_refs(struct btrfs_path *path, u64 bytenr, - int *info_level, struct list_head *prefs, - struct ref_root *ref_tree, - u64 *total_refs, u64 inum) +static int add_inline_refs(const struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 bytenr, + int *info_level, struct preftrees *preftrees, + u64 *total_refs, struct share_check *sc) { int ret = 0; int slot; @@ -1012,14 +929,18 @@ static int __add_inline_refs(struct btrfs_path *path, u64 bytenr, int type; iref = (struct btrfs_extent_inline_ref *)ptr; - type = btrfs_extent_inline_ref_type(leaf, iref); + type = btrfs_get_extent_inline_ref_type(leaf, iref, + BTRFS_REF_TYPE_ANY); + if (type == BTRFS_REF_TYPE_INVALID) + return -EINVAL; + offset = btrfs_extent_inline_ref_offset(leaf, iref); switch (type) { case BTRFS_SHARED_BLOCK_REF_KEY: - ret = __add_prelim_ref(prefs, 0, NULL, - *info_level + 1, offset, - bytenr, 1, GFP_NOFS); + ret = add_direct_ref(fs_info, preftrees, + *info_level + 1, offset, + bytenr, 1, NULL, GFP_NOFS); break; case BTRFS_SHARED_DATA_REF_KEY: { struct btrfs_shared_data_ref *sdref; @@ -1027,21 +948,15 @@ static int __add_inline_refs(struct btrfs_path *path, u64 bytenr, sdref = (struct btrfs_shared_data_ref *)(iref + 1); count = btrfs_shared_data_ref_count(leaf, sdref); - ret = __add_prelim_ref(prefs, 0, NULL, 0, offset, - bytenr, count, GFP_NOFS); - if (ref_tree) { - if (!ret) - ret = ref_tree_add(ref_tree, 0, 0, 0, - bytenr, count); - if (!ret && ref_tree->unique_refs > 1) - ret = BACKREF_FOUND_SHARED; - } + + ret = add_direct_ref(fs_info, preftrees, 0, offset, + bytenr, count, sc, GFP_NOFS); break; } case BTRFS_TREE_BLOCK_REF_KEY: - ret = __add_prelim_ref(prefs, offset, NULL, - *info_level + 1, 0, - bytenr, 1, GFP_NOFS); + ret = add_indirect_ref(fs_info, preftrees, offset, + NULL, *info_level + 1, + bytenr, 1, NULL, GFP_NOFS); break; case BTRFS_EXTENT_DATA_REF_KEY: { struct btrfs_extent_data_ref *dref; @@ -1055,23 +970,16 @@ static int __add_inline_refs(struct btrfs_path *path, u64 bytenr, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); - if (inum && key.objectid != inum) { + if (sc && sc->inum && key.objectid != sc->inum) { ret = BACKREF_FOUND_SHARED; break; } root = btrfs_extent_data_ref_root(leaf, dref); - ret = __add_prelim_ref(prefs, root, &key, 0, 0, - bytenr, count, GFP_NOFS); - if (ref_tree) { - if (!ret) - ret = ref_tree_add(ref_tree, 
root, - key.objectid, - key.offset, 0, - count); - if (!ret && ref_tree->unique_refs > 1) - ret = BACKREF_FOUND_SHARED; - } + + ret = add_indirect_ref(fs_info, preftrees, root, + &key, 0, bytenr, count, + sc, GFP_NOFS); break; } default: @@ -1087,11 +995,13 @@ static int __add_inline_refs(struct btrfs_path *path, u64 bytenr, /* * add all non-inline backrefs for bytenr to the list + * + * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED. */ -static int __add_keyed_refs(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 bytenr, - int info_level, struct list_head *prefs, - struct ref_root *ref_tree, u64 inum) +static int add_keyed_refs(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, u64 bytenr, + int info_level, struct preftrees *preftrees, + struct share_check *sc) { struct btrfs_root *extent_root = fs_info->extent_root; int ret; @@ -1121,34 +1031,32 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, switch (key.type) { case BTRFS_SHARED_BLOCK_REF_KEY: - ret = __add_prelim_ref(prefs, 0, NULL, - info_level + 1, key.offset, - bytenr, 1, GFP_NOFS); + /* SHARED DIRECT METADATA backref */ + ret = add_direct_ref(fs_info, preftrees, + info_level + 1, key.offset, + bytenr, 1, NULL, GFP_NOFS); break; case BTRFS_SHARED_DATA_REF_KEY: { + /* SHARED DIRECT FULL backref */ struct btrfs_shared_data_ref *sdref; int count; sdref = btrfs_item_ptr(leaf, slot, struct btrfs_shared_data_ref); count = btrfs_shared_data_ref_count(leaf, sdref); - ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset, - bytenr, count, GFP_NOFS); - if (ref_tree) { - if (!ret) - ret = ref_tree_add(ref_tree, 0, 0, 0, - bytenr, count); - if (!ret && ref_tree->unique_refs > 1) - ret = BACKREF_FOUND_SHARED; - } + ret = add_direct_ref(fs_info, preftrees, 0, + key.offset, bytenr, count, + sc, GFP_NOFS); break; } case BTRFS_TREE_BLOCK_REF_KEY: - ret = __add_prelim_ref(prefs, key.offset, NULL, - info_level + 1, 0, - bytenr, 1, GFP_NOFS); + /* NORMAL INDIRECT METADATA backref */ + ret = add_indirect_ref(fs_info, preftrees, key.offset, + NULL, info_level + 1, bytenr, + 1, NULL, GFP_NOFS); break; case BTRFS_EXTENT_DATA_REF_KEY: { + /* NORMAL INDIRECT DATA backref */ struct btrfs_extent_data_ref *dref; int count; u64 root; @@ -1161,23 +1069,15 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); - if (inum && key.objectid != inum) { + if (sc && sc->inum && key.objectid != sc->inum) { ret = BACKREF_FOUND_SHARED; break; } root = btrfs_extent_data_ref_root(leaf, dref); - ret = __add_prelim_ref(prefs, root, &key, 0, 0, - bytenr, count, GFP_NOFS); - if (ref_tree) { - if (!ret) - ret = ref_tree_add(ref_tree, root, - key.objectid, - key.offset, 0, - count); - if (!ret && ref_tree->unique_refs > 1) - ret = BACKREF_FOUND_SHARED; - } + ret = add_indirect_ref(fs_info, preftrees, root, + &key, 0, bytenr, count, + sc, GFP_NOFS); break; } default: @@ -1197,15 +1097,15 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, * indirect refs to their parent bytenr. * When roots are found, they're added to the roots list * - * NOTE: This can return values > 0 - * * If time_seq is set to SEQ_LAST, it will not search delayed_refs, and behave * much like trans == NULL case, the difference only lies in it will not * commit root. * The special case is for qgroup to search roots in commit_transaction(). 
* - * If check_shared is set to 1, any extent has more than one ref item, will - * be returned BACKREF_FOUND_SHARED immediately. + * @sc - if !NULL, then immediately return BACKREF_FOUND_SHARED when a + * shared extent is detected. + * + * Otherwise this returns 0 for success and <0 for an error. * * FIXME some caching might speed things up */ @@ -1213,7 +1113,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist *refs, struct ulist *roots, const u64 *extent_item_pos, - u64 root_objectid, u64 inum, int check_shared) + struct share_check *sc) { struct btrfs_key key; struct btrfs_path *path; @@ -1221,15 +1121,16 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *head; int info_level = 0; int ret; - struct list_head prefs_delayed; - struct list_head prefs; - struct __prelim_ref *ref; + struct prelim_ref *ref; + struct rb_node *node; struct extent_inode_elem *eie = NULL; - struct ref_root *ref_tree = NULL; + /* total of both direct AND indirect refs! */ u64 total_refs = 0; - - INIT_LIST_HEAD(&prefs); - INIT_LIST_HEAD(&prefs_delayed); + struct preftrees preftrees = { + .direct = PREFTREE_INIT, + .indirect = PREFTREE_INIT, + .indirect_missing_keys = PREFTREE_INIT + }; key.objectid = bytenr; key.offset = (u64)-1; @@ -1257,18 +1158,6 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, again: head = NULL; - if (check_shared) { - if (!ref_tree) { - ref_tree = ref_root_alloc(); - if (!ref_tree) { - ret = -ENOMEM; - goto out; - } - } else { - ref_root_fini(ref_tree); - } - } - ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0); if (ret < 0) goto out; @@ -1304,45 +1193,14 @@ again: goto again; } spin_unlock(&delayed_refs->lock); - ret = __add_delayed_refs(head, time_seq, - &prefs_delayed, &total_refs, - inum); + ret = add_delayed_refs(fs_info, head, time_seq, + &preftrees, &total_refs, sc); mutex_unlock(&head->mutex); if (ret) goto out; } else { spin_unlock(&delayed_refs->lock); } - - if (check_shared && !list_empty(&prefs_delayed)) { - /* - * Add all delay_ref to the ref_tree and check if there - * are multiple ref items added. 
- */ - list_for_each_entry(ref, &prefs_delayed, list) { - if (ref->key_for_search.type) { - ret = ref_tree_add(ref_tree, - ref->root_id, - ref->key_for_search.objectid, - ref->key_for_search.offset, - 0, ref->count); - if (ret) - goto out; - } else { - ret = ref_tree_add(ref_tree, 0, 0, 0, - ref->parent, ref->count); - if (ret) - goto out; - } - - } - - if (ref_tree->unique_refs > 1) { - ret = BACKREF_FOUND_SHARED; - goto out; - } - - } } if (path->slots[0]) { @@ -1356,42 +1214,48 @@ again: if (key.objectid == bytenr && (key.type == BTRFS_EXTENT_ITEM_KEY || key.type == BTRFS_METADATA_ITEM_KEY)) { - ret = __add_inline_refs(path, bytenr, - &info_level, &prefs, - ref_tree, &total_refs, - inum); + ret = add_inline_refs(fs_info, path, bytenr, + &info_level, &preftrees, + &total_refs, sc); if (ret) goto out; - ret = __add_keyed_refs(fs_info, path, bytenr, - info_level, &prefs, - ref_tree, inum); + ret = add_keyed_refs(fs_info, path, bytenr, info_level, + &preftrees, sc); if (ret) goto out; } } - btrfs_release_path(path); - list_splice_init(&prefs_delayed, &prefs); + btrfs_release_path(path); - ret = __add_missing_keys(fs_info, &prefs); + ret = add_missing_keys(fs_info, &preftrees); if (ret) goto out; - __merge_refs(&prefs, MERGE_IDENTICAL_KEYS); + WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root)); - ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs, - extent_item_pos, total_refs, - root_objectid); + ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees, + extent_item_pos, total_refs, sc); if (ret) goto out; - __merge_refs(&prefs, MERGE_IDENTICAL_PARENTS); + WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root)); - while (!list_empty(&prefs)) { - ref = list_first_entry(&prefs, struct __prelim_ref, list); + /* + * This walks the tree of merged and resolved refs. Tree blocks are + * read in as needed. Unique entries are added to the ulist, and + * the list of found roots is updated. + * + * We release the entire tree in one go before returning. 
+ */ + node = rb_first(&preftrees.direct.root); + while (node) { + ref = rb_entry(node, struct prelim_ref, rbnode); + node = rb_next(&ref->rbnode); WARN_ON(ref->count < 0); if (roots && ref->count && ref->root_id && ref->parent == 0) { - if (root_objectid && ref->root_id != root_objectid) { + if (sc && sc->root_objectid && + ref->root_id != sc->root_objectid) { ret = BACKREF_FOUND_SHARED; goto out; } @@ -1442,24 +1306,16 @@ again: } eie = NULL; } - list_del(&ref->list); - kmem_cache_free(btrfs_prelim_ref_cache, ref); + cond_resched(); } out: btrfs_free_path(path); - ref_root_free(ref_tree); - while (!list_empty(&prefs)) { - ref = list_first_entry(&prefs, struct __prelim_ref, list); - list_del(&ref->list); - kmem_cache_free(btrfs_prelim_ref_cache, ref); - } - while (!list_empty(&prefs_delayed)) { - ref = list_first_entry(&prefs_delayed, struct __prelim_ref, - list); - list_del(&ref->list); - kmem_cache_free(btrfs_prelim_ref_cache, ref); - } + + prelim_release(&preftrees.direct); + prelim_release(&preftrees.indirect); + prelim_release(&preftrees.indirect_missing_keys); + if (ret < 0) free_inode_elem_list(eie); return ret; @@ -1475,7 +1331,7 @@ static void free_leaf_list(struct ulist *blocks) while ((node = ulist_next(blocks, &uiter))) { if (!node->aux) continue; - eie = (struct extent_inode_elem *)(uintptr_t)node->aux; + eie = unode_aux_to_inode_list(node); free_inode_elem_list(eie); node->aux = 0; } @@ -1503,7 +1359,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, return -ENOMEM; ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, - *leafs, NULL, extent_item_pos, 0, 0, 0); + *leafs, NULL, extent_item_pos, NULL); if (ret < 0 && ret != -ENOENT) { free_leaf_list(*leafs); return ret; @@ -1525,9 +1381,9 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, * * returns 0 on success, < 0 on error. */ -static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots) +static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 time_seq, struct ulist **roots) { struct ulist *tmp; struct ulist_node *node = NULL; @@ -1546,7 +1402,7 @@ static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans, ULIST_ITER_INIT(&uiter); while (1) { ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, - tmp, *roots, NULL, 0, 0, 0); + tmp, *roots, NULL, NULL); if (ret < 0 && ret != -ENOENT) { ulist_free(tmp); ulist_free(*roots); @@ -1571,7 +1427,8 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, if (!trans) down_read(&fs_info->commit_root_sem); - ret = __btrfs_find_all_roots(trans, fs_info, bytenr, time_seq, roots); + ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr, + time_seq, roots); if (!trans) up_read(&fs_info->commit_root_sem); return ret; @@ -1580,26 +1437,32 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, /** * btrfs_check_shared - tell us whether an extent is shared * - * @trans: optional trans handle - * * btrfs_check_shared uses the backref walking code but will short * circuit as soon as it finds a root or inode that doesn't match the * one passed in. This provides a significant performance benefit for * callers (such as fiemap) which want to know whether the extent is * shared but do not need a ref count. * + * This attempts to allocate a transaction in order to account for + * delayed refs, but continues on even when the alloc fails. 
+ * * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error. */ -int btrfs_check_shared(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 root_objectid, - u64 inum, u64 bytenr) +int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr) { + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_trans_handle *trans; struct ulist *tmp = NULL; struct ulist *roots = NULL; struct ulist_iterator uiter; struct ulist_node *node; struct seq_list elem = SEQ_LIST_INIT(elem); int ret = 0; + struct share_check shared = { + .root_objectid = root->objectid, + .inum = inum, + .share_count = 0, + }; tmp = ulist_alloc(GFP_NOFS); roots = ulist_alloc(GFP_NOFS); @@ -1609,14 +1472,18 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans, return -ENOMEM; } - if (trans) - btrfs_get_tree_mod_seq(fs_info, &elem); - else + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + trans = NULL; down_read(&fs_info->commit_root_sem); + } else { + btrfs_get_tree_mod_seq(fs_info, &elem); + } + ULIST_ITER_INIT(&uiter); while (1) { ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp, - roots, NULL, root_objectid, inum, 1); + roots, NULL, &shared); if (ret == BACKREF_FOUND_SHARED) { /* this is the only condition under which we return 1 */ ret = 1; @@ -1631,10 +1498,13 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans, bytenr = node->val; cond_resched(); } - if (trans) + + if (trans) { btrfs_put_tree_mod_seq(fs_info, &elem); - else + btrfs_end_transaction(trans); + } else { up_read(&fs_info->commit_root_sem); + } ulist_free(tmp); ulist_free(roots); return ret; @@ -1649,7 +1519,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, struct btrfs_key key; struct btrfs_key found_key; struct btrfs_inode_extref *extref; - struct extent_buffer *leaf; + const struct extent_buffer *leaf; unsigned long ptr; key.objectid = inode_objectid; @@ -1806,7 +1676,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, u64 flags; u64 size = 0; u32 item_size; - struct extent_buffer *eb; + const struct extent_buffer *eb; struct btrfs_extent_item *ei; struct btrfs_key key; @@ -1870,15 +1740,17 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, * helper function to iterate extent inline refs. ptr must point to a 0 value * for the first call and may be modified. it is used to track state. * if more refs exist, 0 is returned and the next call to - * __get_extent_inline_ref must pass the modified ptr parameter to get the + * get_extent_inline_ref must pass the modified ptr parameter to get the * next ref. after the last ref was processed, 1 is returned. 
* returns <0 on error */ -static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb, - struct btrfs_key *key, - struct btrfs_extent_item *ei, u32 item_size, - struct btrfs_extent_inline_ref **out_eiref, - int *out_type) +static int get_extent_inline_ref(unsigned long *ptr, + const struct extent_buffer *eb, + const struct btrfs_key *key, + const struct btrfs_extent_item *ei, + u32 item_size, + struct btrfs_extent_inline_ref **out_eiref, + int *out_type) { unsigned long end; u64 flags; @@ -1908,7 +1780,10 @@ static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb, end = (unsigned long)ei + item_size; *out_eiref = (struct btrfs_extent_inline_ref *)(*ptr); - *out_type = btrfs_extent_inline_ref_type(eb, *out_eiref); + *out_type = btrfs_get_extent_inline_ref_type(eb, *out_eiref, + BTRFS_REF_TYPE_ANY); + if (*out_type == BTRFS_REF_TYPE_INVALID) + return -EINVAL; *ptr += btrfs_extent_inline_ref_size(*out_type); WARN_ON(*ptr > end); @@ -1921,7 +1796,7 @@ static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb, /* * reads the tree block backref for an extent. tree level and root are returned * through out_level and out_root. ptr must point to a 0 value for the first - * call and may be modified (see __get_extent_inline_ref comment). + * call and may be modified (see get_extent_inline_ref comment). * returns 0 if data was provided, 1 if there was no more data to provide or * <0 on error. */ @@ -1937,7 +1812,7 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, return 1; while (1) { - ret = __get_extent_inline_ref(ptr, eb, key, ei, item_size, + ret = get_extent_inline_ref(ptr, eb, key, ei, item_size, &eiref, &type); if (ret < 0) return ret; @@ -2034,8 +1909,8 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { - ret = __btrfs_find_all_roots(trans, fs_info, ref_node->val, - tree_mod_seq_elem.seq, &roots); + ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val, + tree_mod_seq_elem.seq, &roots); if (ret) break; ULIST_ITER_INIT(&root_uiter); diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 9c41fbac3009..e410335841aa 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -68,10 +68,20 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, u64 start_off, struct btrfs_path *path, struct btrfs_inode_extref **ret_extref, u64 *found_off); -int btrfs_check_shared(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 root_objectid, - u64 inum, u64 bytenr); +int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr); int __init btrfs_prelim_ref_init(void); void btrfs_prelim_ref_exit(void); + +struct prelim_ref { + struct rb_node rbnode; + u64 root_id; + struct btrfs_key key_for_search; + int level; + int count; + struct extent_inode_elem *inode_list; + u64 parent; + u64 wanted_disk_byte; +}; + #endif diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index d87ac27a5f2b..eccadb5f62a5 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -179,9 +179,14 @@ struct btrfs_inode { unsigned reserved_extents; /* - * always compress this one file + * Cached values of inode properties */ - unsigned force_compress; + unsigned prop_compress; /* per-file compression algorithm */ + /* + * Force compression on the file using the defrag ioctl, could be + * different from prop_compress and takes precedence if set + */ + unsigned defrag_compress; 
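
/*
 * [Editorial sketch, not part of the diff above] The two new fields in
 * struct btrfs_inode suggest a simple precedence when choosing a
 * compression algorithm for an inode: a value requested through the
 * defrag ioctl (defrag_compress) overrides the per-file property
 * (prop_compress), which in turn overrides the filesystem-wide default.
 * The helper below is only a hypothetical illustration of that ordering;
 * the name btrfs_pick_compress_type and the fs_default parameter are
 * invented for this note, and the real decision logic lives elsewhere in
 * the patch set (not shown in this hunk).
 */
static inline unsigned btrfs_pick_compress_type(const struct btrfs_inode *bi,
						unsigned fs_default)
{
	if (bi->defrag_compress)	/* set via the defrag ioctl, wins if nonzero */
		return bi->defrag_compress;
	if (bi->prop_compress)		/* per-file compression property */
		return bi->prop_compress;
	return fs_default;		/* e.g. the mount-wide compress_type */
}
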
struct btrfs_delayed_node *delayed_node; @@ -207,7 +212,7 @@ struct btrfs_inode { extern unsigned char btrfs_filetype_table[]; -static inline struct btrfs_inode *BTRFS_I(struct inode *inode) +static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) { return container_of(inode, struct btrfs_inode, vfs_inode); } @@ -231,7 +236,7 @@ static inline void btrfs_insert_inode_hash(struct inode *inode) __insert_inode_hash(inode, h); } -static inline u64 btrfs_ino(struct btrfs_inode *inode) +static inline u64 btrfs_ino(const struct btrfs_inode *inode) { u64 ino = inode->location.objectid; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 11d37c94ce05..7d5a9b51f0d7 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -296,8 +296,7 @@ static void btrfsic_dev_state_hashtable_add( struct btrfsic_dev_state *ds, struct btrfsic_dev_state_hashtable *h); static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds); -static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( - struct block_device *bdev, +static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev, struct btrfsic_dev_state_hashtable *h); static struct btrfsic_stack_frame *btrfsic_stack_frame_alloc(void); static void btrfsic_stack_frame_free(struct btrfsic_stack_frame *sf); @@ -385,8 +384,7 @@ static int btrfsic_process_superblock_dev_mirror( int superblock_mirror_num, struct btrfsic_dev_state **selected_dev_state, struct btrfs_super_block *selected_super); -static struct btrfsic_dev_state *btrfsic_dev_state_lookup( - struct block_device *bdev); +static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev); static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, u64 bytenr, struct btrfsic_dev_state *dev_state, @@ -626,17 +624,15 @@ static void btrfsic_dev_state_hashtable_remove(struct btrfsic_dev_state *ds) list_del(&ds->collision_resolving_node); } -static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( - struct block_device *bdev, +static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev, struct btrfsic_dev_state_hashtable *h) { const unsigned int hashval = - (((unsigned int)((uintptr_t)bdev)) & - (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); + dev & (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1); struct btrfsic_dev_state *ds; list_for_each_entry(ds, h->table + hashval, collision_resolving_node) { - if (ds->bdev == bdev) + if (ds->bdev->bd_dev == dev) return ds; } @@ -668,7 +664,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, if (!device->bdev || !device->name) continue; - dev_state = btrfsic_dev_state_lookup(device->bdev); + dev_state = btrfsic_dev_state_lookup(device->bdev->bd_dev); BUG_ON(NULL == dev_state); for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { ret = btrfsic_process_superblock_dev_mirror( @@ -795,12 +791,12 @@ static int btrfsic_process_superblock_dev_mirror( dev_bytenr = btrfs_sb_offset(superblock_mirror_num); if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes) return -1; - bh = __bread(superblock_bdev, dev_bytenr / 4096, + bh = __bread(superblock_bdev, dev_bytenr / BTRFS_BDEV_BLOCKSIZE, BTRFS_SUPER_INFO_SIZE); if (NULL == bh) return -1; super_tmp = (struct btrfs_super_block *) - (bh->b_data + (dev_bytenr & 4095)); + (bh->b_data + (dev_bytenr & (BTRFS_BDEV_BLOCKSIZE - 1))); if (btrfs_super_bytenr(super_tmp) != dev_bytenr || btrfs_super_magic(super_tmp) != BTRFS_MAGIC || @@ -1556,7 +1552,7 @@ static int btrfsic_map_block(struct btrfsic_state 
*state, u64 bytenr, u32 len, } device = multi->stripes[0].dev; - block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev); + block_ctx_out->dev = btrfsic_dev_state_lookup(device->bdev->bd_dev); block_ctx_out->dev_bytenr = multi->stripes[0].physical; block_ctx_out->start = bytenr; block_ctx_out->len = len; @@ -1639,7 +1635,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, unsigned int j; bio = btrfs_io_bio_alloc(num_pages - i); - bio->bi_bdev = block_ctx->dev->bdev; + bio_set_dev(bio, block_ctx->dev->bdev); bio->bi_iter.bi_sector = dev_bytenr >> 9; bio_set_op_attrs(bio, REQ_OP_READ, 0); @@ -1732,7 +1728,7 @@ static int btrfsic_test_for_metadata(struct btrfsic_state *state, num_pages = state->metablock_size >> PAGE_SHIFT; h = (struct btrfs_header *)datav[0]; - if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) + if (memcmp(h->fsid, fs_info->fsid, BTRFS_FSID_SIZE)) return 1; for (i = 0; i < num_pages; i++) { @@ -2654,7 +2650,7 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add( pr_info("btrfsic: error, kmalloc failed!\n"); return NULL; } - dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev); + dev_state = btrfsic_dev_state_lookup(block_ctx->dev->bdev->bd_dev); if (NULL == dev_state) { pr_info("btrfsic: error, lookup dev_state failed!\n"); btrfsic_block_free(block); @@ -2734,10 +2730,9 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, } } -static struct btrfsic_dev_state *btrfsic_dev_state_lookup( - struct block_device *bdev) +static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev) { - return btrfsic_dev_state_hashtable_lookup(bdev, + return btrfsic_dev_state_hashtable_lookup(dev, &btrfsic_dev_state_hashtable); } @@ -2751,14 +2746,14 @@ int btrfsic_submit_bh(int op, int op_flags, struct buffer_head *bh) mutex_lock(&btrfsic_mutex); /* since btrfsic_submit_bh() might also be called before * btrfsic_mount(), this might return NULL */ - dev_state = btrfsic_dev_state_lookup(bh->b_bdev); + dev_state = btrfsic_dev_state_lookup(bh->b_bdev->bd_dev); /* Only called to write the superblock (incl. 
FLUSH/FUA) */ if (NULL != dev_state && (op == REQ_OP_WRITE) && bh->b_size > 0) { u64 dev_bytenr; - dev_bytenr = 4096 * bh->b_blocknr; + dev_bytenr = BTRFS_BDEV_BLOCKSIZE * bh->b_blocknr; if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) pr_info("submit_bh(op=0x%x,0x%x, blocknr=%llu (bytenr %llu), size=%zu, data=%p, bdev=%p)\n", @@ -2808,7 +2803,7 @@ static void __btrfsic_submit_bio(struct bio *bio) mutex_lock(&btrfsic_mutex); /* since btrfsic_submit_bio() is also called before * btrfsic_mount(), this might return NULL */ - dev_state = btrfsic_dev_state_lookup(bio->bi_bdev); + dev_state = btrfsic_dev_state_lookup(bio_dev(bio)); if (NULL != dev_state && (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) { unsigned int i = 0; @@ -2824,10 +2819,10 @@ static void __btrfsic_submit_bio(struct bio *bio) bio_is_patched = 0; if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", + pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_disk=%p)\n", bio_op(bio), bio->bi_opf, segs, (unsigned long long)bio->bi_iter.bi_sector, - dev_bytenr, bio->bi_bdev); + dev_bytenr, bio->bi_disk); mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS); @@ -2856,8 +2851,8 @@ static void __btrfsic_submit_bio(struct bio *bio) } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) { if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n", - bio_op(bio), bio->bi_opf, bio->bi_bdev); + pr_info("submit_bio(rw=%d,0x%x FLUSH, disk=%p)\n", + bio_op(bio), bio->bi_opf, bio->bi_disk); if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { if ((dev_state->state->print_mask & (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | @@ -2998,7 +2993,7 @@ void btrfsic_unmount(struct btrfs_fs_devices *fs_devices) continue; ds = btrfsic_dev_state_hashtable_lookup( - device->bdev, + device->bdev->bd_dev, &btrfsic_dev_state_hashtable); if (NULL != ds) { state = ds->state; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index d2ef9ac2a630..b51d23f5cafa 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -704,6 +704,7 @@ static struct { static const struct btrfs_compress_op * const btrfs_compress_op[] = { &btrfs_zlib_compress, &btrfs_lzo_compress, + &btrfs_zstd_compress, }; void __init btrfs_init_compress(void) @@ -825,7 +826,7 @@ static void free_workspace(int type, struct list_head *workspace) int *free_ws = &btrfs_comp_ws[idx].free_ws; spin_lock(ws_lock); - if (*free_ws < num_online_cpus()) { + if (*free_ws <= num_online_cpus()) { list_add(workspace, idle_ws); (*free_ws)++; spin_unlock(ws_lock); @@ -1047,3 +1048,36 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, return 1; } + +/* + * Compression heuristic. + * + * For now is's a naive and optimistic 'return true', we'll extend the logic to + * quickly (compared to direct compression) detect data characteristics + * (compressible/uncompressible) to avoid wasting CPU time on uncompressible + * data. + * + * The following types of analysis can be performed: + * - detect mostly zero data + * - detect data with low "byte set" size (text, etc) + * - detect data with low/high "core byte" set + * + * Return non-zero if the compression should be done, 0 otherwise. 
+ */ +int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end) +{ + u64 index = start >> PAGE_SHIFT; + u64 end_index = end >> PAGE_SHIFT; + struct page *page; + int ret = 1; + + while (index <= end_index) { + page = find_get_page(inode->i_mapping, index); + kmap(page); + kunmap(page); + put_page(page); + index++; + } + + return ret; +} diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 87f6d3332163..d2781ff8f994 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -99,8 +99,8 @@ enum btrfs_compression_type { BTRFS_COMPRESS_NONE = 0, BTRFS_COMPRESS_ZLIB = 1, BTRFS_COMPRESS_LZO = 2, - BTRFS_COMPRESS_TYPES = 2, - BTRFS_COMPRESS_LAST = 3, + BTRFS_COMPRESS_ZSTD = 3, + BTRFS_COMPRESS_TYPES = 3, }; struct btrfs_compress_op { @@ -128,5 +128,8 @@ struct btrfs_compress_op { extern const struct btrfs_compress_op btrfs_zlib_compress; extern const struct btrfs_compress_op btrfs_lzo_compress; +extern const struct btrfs_compress_op btrfs_zstd_compress; + +int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end); #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 3f4daa9d6e2c..6d49db7d86be 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4650,7 +4650,7 @@ void btrfs_truncate_item(struct btrfs_fs_info *fs_info, btrfs_mark_buffer_dirty(leaf); if (btrfs_leaf_free_space(fs_info, leaf) < 0) { - btrfs_print_leaf(fs_info, leaf); + btrfs_print_leaf(leaf); BUG(); } } @@ -4679,7 +4679,7 @@ void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path, data_end = leaf_data_end(fs_info, leaf); if (btrfs_leaf_free_space(fs_info, leaf) < data_size) { - btrfs_print_leaf(fs_info, leaf); + btrfs_print_leaf(leaf); BUG(); } slot = path->slots[0]; @@ -4687,7 +4687,7 @@ void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path, BUG_ON(slot < 0); if (slot >= nritems) { - btrfs_print_leaf(fs_info, leaf); + btrfs_print_leaf(leaf); btrfs_crit(fs_info, "slot %d too large, nritems %d", slot, nritems); BUG_ON(1); @@ -4718,7 +4718,7 @@ void btrfs_extend_item(struct btrfs_fs_info *fs_info, struct btrfs_path *path, btrfs_mark_buffer_dirty(leaf); if (btrfs_leaf_free_space(fs_info, leaf) < 0) { - btrfs_print_leaf(fs_info, leaf); + btrfs_print_leaf(leaf); BUG(); } } @@ -4757,7 +4757,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, data_end = leaf_data_end(fs_info, leaf); if (btrfs_leaf_free_space(fs_info, leaf) < total_size) { - btrfs_print_leaf(fs_info, leaf); + btrfs_print_leaf(leaf); btrfs_crit(fs_info, "not enough freespace need %u have %d", total_size, btrfs_leaf_free_space(fs_info, leaf)); BUG(); @@ -4767,7 +4767,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, unsigned int old_data = btrfs_item_end_nr(leaf, slot); if (old_data < data_end) { - btrfs_print_leaf(fs_info, leaf); + btrfs_print_leaf(leaf); btrfs_crit(fs_info, "slot %d old_data %d data_end %d", slot, old_data, data_end); BUG_ON(1); @@ -4811,7 +4811,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, btrfs_mark_buffer_dirty(leaf); if (btrfs_leaf_free_space(fs_info, leaf) < 0) { - btrfs_print_leaf(fs_info, leaf); + btrfs_print_leaf(leaf); BUG(); } } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3f3eb7b17cac..5a8933da39a7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -270,6 +270,7 @@ struct btrfs_super_block { BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ + 
BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ BTRFS_FEATURE_INCOMPAT_RAID56 | \ BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ @@ -470,8 +471,8 @@ struct btrfs_block_rsv { /* * free clusters are used to claim free space in relatively large chunks, - * allowing us to do less seeky writes. They are used for all metadata - * allocations and data allocations in ssd mode. + * allowing us to do less seeky writes. They are used for all metadata + * allocations. In ssd_spread mode they are also used for data allocations. */ struct btrfs_free_cluster { spinlock_t lock; @@ -558,7 +559,6 @@ struct btrfs_block_group_cache { u64 bytes_super; u64 flags; u64 cache_generation; - u32 sectorsize; /* * If the free space extent count exceeds this number, convert the block @@ -968,7 +968,7 @@ struct btrfs_fs_info { struct reloc_control *reloc_ctl; - /* data_alloc_cluster is only used in ssd mode */ + /* data_alloc_cluster is only used in ssd_spread mode */ struct btrfs_free_cluster data_alloc_cluster; /* all metadata allocations go through this cluster */ @@ -1072,8 +1072,6 @@ struct btrfs_fs_info { /* next backup root to be overwritten */ int backup_root_index; - int num_tolerated_disk_barrier_failures; - /* device replace state */ struct btrfs_dev_replace dev_replace; @@ -1261,12 +1259,17 @@ struct btrfs_root { */ int send_in_progress; struct btrfs_subvolume_writers *subv_writers; - atomic_t will_be_snapshoted; + atomic_t will_be_snapshotted; /* For qgroup metadata space reserve */ atomic64_t qgroup_meta_rsv; }; +struct btrfs_file_private { + struct btrfs_trans_handle *trans; + void *filldir_buf; +}; + static inline u32 btrfs_inode_sectorsize(const struct inode *inode) { return btrfs_sb(inode->i_sb)->sectorsize; @@ -1435,7 +1438,7 @@ do { \ #define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31) struct btrfs_map_token { - struct extent_buffer *eb; + const struct extent_buffer *eb; char *kaddr; unsigned long offset; }; @@ -1469,18 +1472,19 @@ static inline void btrfs_init_map_token (struct btrfs_map_token *token) sizeof(((type *)0)->member))) #define DECLARE_BTRFS_SETGET_BITS(bits) \ -u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \ - unsigned long off, \ - struct btrfs_map_token *token); \ -void btrfs_set_token_##bits(struct extent_buffer *eb, void *ptr, \ +u##bits btrfs_get_token_##bits(const struct extent_buffer *eb, \ + const void *ptr, unsigned long off, \ + struct btrfs_map_token *token); \ +void btrfs_set_token_##bits(struct extent_buffer *eb, const void *ptr, \ unsigned long off, u##bits val, \ struct btrfs_map_token *token); \ -static inline u##bits btrfs_get_##bits(struct extent_buffer *eb, void *ptr, \ +static inline u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ + const void *ptr, \ unsigned long off) \ { \ return btrfs_get_token_##bits(eb, ptr, off, NULL); \ } \ -static inline void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \ +static inline void btrfs_set_##bits(struct extent_buffer *eb, void *ptr,\ unsigned long off, u##bits val) \ { \ btrfs_set_token_##bits(eb, ptr, off, val, NULL); \ @@ -1492,7 +1496,8 @@ DECLARE_BTRFS_SETGET_BITS(32) DECLARE_BTRFS_SETGET_BITS(64) #define BTRFS_SETGET_FUNCS(name, type, member, bits) \ -static inline u##bits btrfs_##name(struct extent_buffer *eb, type *s) \ +static inline u##bits btrfs_##name(const struct extent_buffer *eb, \ + const type *s) \ { \ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ return btrfs_get_##bits(eb, s, offsetof(type, member)); \ @@ -1503,7 +1508,8 @@ static 
inline void btrfs_set_##name(struct extent_buffer *eb, type *s, \ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ btrfs_set_##bits(eb, s, offsetof(type, member), val); \ } \ -static inline u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, \ +static inline u##bits btrfs_token_##name(const struct extent_buffer *eb,\ + const type *s, \ struct btrfs_map_token *token) \ { \ BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ @@ -1518,9 +1524,9 @@ static inline void btrfs_set_token_##name(struct extent_buffer *eb, \ } #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ -static inline u##bits btrfs_##name(struct extent_buffer *eb) \ +static inline u##bits btrfs_##name(const struct extent_buffer *eb) \ { \ - type *p = page_address(eb->pages[0]); \ + const type *p = page_address(eb->pages[0]); \ u##bits res = le##bits##_to_cpu(p->member); \ return res; \ } \ @@ -1532,7 +1538,7 @@ static inline void btrfs_set_##name(struct extent_buffer *eb, \ } #define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ -static inline u##bits btrfs_##name(type *s) \ +static inline u##bits btrfs_##name(const type *s) \ { \ return le##bits##_to_cpu(s->member); \ } \ @@ -1799,7 +1805,6 @@ static inline u32 btrfs_extent_inline_ref_size(int type) if (type == BTRFS_EXTENT_DATA_REF_KEY) return sizeof(struct btrfs_extent_data_ref) + offsetof(struct btrfs_extent_inline_ref, offset); - BUG(); return 0; } @@ -1857,7 +1862,7 @@ static inline unsigned long btrfs_node_key_ptr_offset(int nr) sizeof(struct btrfs_key_ptr) * nr; } -void btrfs_node_key(struct extent_buffer *eb, +void btrfs_node_key(const struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr); static inline void btrfs_set_node_key(struct extent_buffer *eb, @@ -1886,28 +1891,28 @@ static inline struct btrfs_item *btrfs_item_nr(int nr) return (struct btrfs_item *)btrfs_item_nr_offset(nr); } -static inline u32 btrfs_item_end(struct extent_buffer *eb, +static inline u32 btrfs_item_end(const struct extent_buffer *eb, struct btrfs_item *item) { return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item); } -static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr) +static inline u32 btrfs_item_end_nr(const struct extent_buffer *eb, int nr) { return btrfs_item_end(eb, btrfs_item_nr(nr)); } -static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr) +static inline u32 btrfs_item_offset_nr(const struct extent_buffer *eb, int nr) { return btrfs_item_offset(eb, btrfs_item_nr(nr)); } -static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr) +static inline u32 btrfs_item_size_nr(const struct extent_buffer *eb, int nr) { return btrfs_item_size(eb, btrfs_item_nr(nr)); } -static inline void btrfs_item_key(struct extent_buffer *eb, +static inline void btrfs_item_key(const struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { struct btrfs_item *item = btrfs_item_nr(nr); @@ -1943,8 +1948,8 @@ BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item, BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item, transid, 64); -static inline void btrfs_dir_item_key(struct extent_buffer *eb, - struct btrfs_dir_item *item, +static inline void btrfs_dir_item_key(const struct extent_buffer *eb, + const struct btrfs_dir_item *item, struct btrfs_disk_key *key) { read_eb_member(eb, item, struct btrfs_dir_item, location, key); @@ -1952,7 +1957,7 @@ static inline void btrfs_dir_item_key(struct extent_buffer *eb, static inline void btrfs_set_dir_item_key(struct 
extent_buffer *eb, struct btrfs_dir_item *item, - struct btrfs_disk_key *key) + const struct btrfs_disk_key *key) { write_eb_member(eb, item, struct btrfs_dir_item, location, key); } @@ -1964,8 +1969,8 @@ BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header, BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header, generation, 64); -static inline void btrfs_free_space_key(struct extent_buffer *eb, - struct btrfs_free_space_header *h, +static inline void btrfs_free_space_key(const struct extent_buffer *eb, + const struct btrfs_free_space_header *h, struct btrfs_disk_key *key) { read_eb_member(eb, h, struct btrfs_free_space_header, location, key); @@ -1973,7 +1978,7 @@ static inline void btrfs_free_space_key(struct extent_buffer *eb, static inline void btrfs_set_free_space_key(struct extent_buffer *eb, struct btrfs_free_space_header *h, - struct btrfs_disk_key *key) + const struct btrfs_disk_key *key) { write_eb_member(eb, h, struct btrfs_free_space_header, location, key); } @@ -2000,25 +2005,25 @@ static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, disk->objectid = cpu_to_le64(cpu->objectid); } -static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb, - struct btrfs_key *key, int nr) +static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *key, int nr) { struct btrfs_disk_key disk_key; btrfs_node_key(eb, &disk_key, nr); btrfs_disk_key_to_cpu(key, &disk_key); } -static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb, - struct btrfs_key *key, int nr) +static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *key, int nr) { struct btrfs_disk_key disk_key; btrfs_item_key(eb, &disk_key, nr); btrfs_disk_key_to_cpu(key, &disk_key); } -static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb, - struct btrfs_dir_item *item, - struct btrfs_key *key) +static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, + const struct btrfs_dir_item *item, + struct btrfs_key *key) { struct btrfs_disk_key disk_key; btrfs_dir_item_key(eb, item, &disk_key); @@ -2050,7 +2055,7 @@ BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header, nritems, 32); BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64); -static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag) +static inline int btrfs_header_flag(const struct extent_buffer *eb, u64 flag) { return (btrfs_header_flags(eb) & flag) == flag; } @@ -2069,7 +2074,7 @@ static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) return (flags & flag) == flag; } -static inline int btrfs_header_backref_rev(struct extent_buffer *eb) +static inline int btrfs_header_backref_rev(const struct extent_buffer *eb) { u64 flags = btrfs_header_flags(eb); return flags >> BTRFS_BACKREF_REV_SHIFT; @@ -2089,12 +2094,12 @@ static inline unsigned long btrfs_header_fsid(void) return offsetof(struct btrfs_header, fsid); } -static inline unsigned long btrfs_header_chunk_tree_uuid(struct extent_buffer *eb) +static inline unsigned long btrfs_header_chunk_tree_uuid(const struct extent_buffer *eb) { return offsetof(struct btrfs_header, chunk_tree_uuid); } -static inline int btrfs_is_leaf(struct extent_buffer *eb) +static inline int btrfs_is_leaf(const struct extent_buffer *eb) { return btrfs_header_level(eb) == 0; } @@ -2128,12 +2133,12 @@ BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item, BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct 
btrfs_root_item, rtransid, 64); -static inline bool btrfs_root_readonly(struct btrfs_root *root) +static inline bool btrfs_root_readonly(const struct btrfs_root *root) { return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; } -static inline bool btrfs_root_dead(struct btrfs_root *root) +static inline bool btrfs_root_dead(const struct btrfs_root *root) { return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; } @@ -2190,51 +2195,51 @@ BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, /* struct btrfs_balance_item */ BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); -static inline void btrfs_balance_data(struct extent_buffer *eb, - struct btrfs_balance_item *bi, +static inline void btrfs_balance_data(const struct extent_buffer *eb, + const struct btrfs_balance_item *bi, struct btrfs_disk_balance_args *ba) { read_eb_member(eb, bi, struct btrfs_balance_item, data, ba); } static inline void btrfs_set_balance_data(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) + struct btrfs_balance_item *bi, + const struct btrfs_disk_balance_args *ba) { write_eb_member(eb, bi, struct btrfs_balance_item, data, ba); } -static inline void btrfs_balance_meta(struct extent_buffer *eb, - struct btrfs_balance_item *bi, +static inline void btrfs_balance_meta(const struct extent_buffer *eb, + const struct btrfs_balance_item *bi, struct btrfs_disk_balance_args *ba) { read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); } static inline void btrfs_set_balance_meta(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) + struct btrfs_balance_item *bi, + const struct btrfs_disk_balance_args *ba) { write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); } -static inline void btrfs_balance_sys(struct extent_buffer *eb, - struct btrfs_balance_item *bi, +static inline void btrfs_balance_sys(const struct extent_buffer *eb, + const struct btrfs_balance_item *bi, struct btrfs_disk_balance_args *ba) { read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); } static inline void btrfs_set_balance_sys(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) + struct btrfs_balance_item *bi, + const struct btrfs_disk_balance_args *ba) { write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); } static inline void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, - struct btrfs_disk_balance_args *disk) + const struct btrfs_disk_balance_args *disk) { memset(cpu, 0, sizeof(*cpu)); @@ -2254,7 +2259,7 @@ btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, static inline void btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, - struct btrfs_balance_args *cpu) + const struct btrfs_balance_args *cpu) { memset(disk, 0, sizeof(*disk)); @@ -2322,7 +2327,7 @@ BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, uuid_tree_generation, 64); -static inline int btrfs_super_csum_size(struct btrfs_super_block *s) +static inline int btrfs_super_csum_size(const struct btrfs_super_block *s) { u16 t = btrfs_super_csum_type(s); /* @@ -2337,8 +2342,8 @@ static inline int btrfs_super_csum_size(struct btrfs_super_block *s) * this returns the address of the start of the last item, * which is the stop of the leaf data stack */ -static inline unsigned int leaf_data_end(struct btrfs_fs_info 
*fs_info, - struct extent_buffer *leaf) +static inline unsigned int leaf_data_end(const struct btrfs_fs_info *fs_info, + const struct extent_buffer *leaf) { u32 nr = btrfs_header_nritems(leaf); @@ -2363,7 +2368,7 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, struct btrfs_file_extent_item, compression, 8); static inline unsigned long -btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e) +btrfs_file_extent_inline_start(const struct btrfs_file_extent_item *e) { return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START; } @@ -2397,8 +2402,9 @@ BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, * size of any extent headers. If a file is compressed on disk, this is * the compressed size */ -static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, - struct btrfs_item *e) +static inline u32 btrfs_file_extent_inline_item_len( + const struct extent_buffer *eb, + struct btrfs_item *e) { return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START; } @@ -2406,9 +2412,9 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, /* this returns the number of file bytes represented by the inline item. * If an item is compressed, this is the uncompressed size */ -static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb, - int slot, - struct btrfs_file_extent_item *fi) +static inline u32 btrfs_file_extent_inline_len(const struct extent_buffer *eb, + int slot, + const struct btrfs_file_extent_item *fi) { struct btrfs_map_token token; @@ -2430,8 +2436,8 @@ static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb, /* btrfs_dev_stats_item */ -static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb, - struct btrfs_dev_stats_item *ptr, +static inline u64 btrfs_dev_stats_value(const struct extent_buffer *eb, + const struct btrfs_dev_stats_item *ptr, int index) { u64 val; @@ -2561,6 +2567,17 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) /* extent-tree.c */ +enum btrfs_inline_ref_type { + BTRFS_REF_TYPE_INVALID = 0, + BTRFS_REF_TYPE_BLOCK = 1, + BTRFS_REF_TYPE_DATA = 2, + BTRFS_REF_TYPE_ANY = 3, +}; + +int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, + struct btrfs_extent_inline_ref *iref, + enum btrfs_inline_ref_type is_data); + u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes); static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_fs_info *fs_info, @@ -2670,8 +2687,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info); int btrfs_can_relocate(struct btrfs_fs_info *fs_info, u64 bytenr); int btrfs_make_block_group(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytes_used, - u64 type, u64 chunk_objectid, u64 chunk_offset, - u64 size); + u64 type, u64 chunk_offset, u64 size); struct btrfs_trans_handle *btrfs_start_trans_remove_block_group( struct btrfs_fs_info *fs_info, const u64 chunk_offset); @@ -2772,8 +2788,8 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int __get_raid_index(u64 flags); -int btrfs_start_write_no_snapshoting(struct btrfs_root *root); -void btrfs_end_write_no_snapshoting(struct btrfs_root *root); +int btrfs_start_write_no_snapshotting(struct btrfs_root *root); +void btrfs_end_write_no_snapshotting(struct btrfs_root *root); void btrfs_wait_for_snapshot_creation(struct btrfs_root *root); void check_system_chunk(struct 
btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, const u64 type); @@ -2973,8 +2989,8 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, const char *name, int name_len); -int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const struct btrfs_key *key); +int btrfs_del_root(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, const struct btrfs_key *key); int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_root_item *item); @@ -3135,21 +3151,6 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes); -/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */ -#if defined(ClearPageFsMisc) && !defined(ClearPageChecked) -#define ClearPageChecked ClearPageFsMisc -#define SetPageChecked SetPageFsMisc -#define PageChecked PageFsMisc -#endif - -/* This forces readahead on a given range of bytes in an inode */ -static inline void btrfs_force_ra(struct address_space *mapping, - struct file_ra_state *ra, struct file *file, - pgoff_t offset, unsigned long req_size) -{ - page_cache_sync_readahead(mapping, ra, file, offset, req_size); -} - struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); int btrfs_unlink_inode(struct btrfs_trans_handle *trans, @@ -3229,7 +3230,6 @@ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_update_iflags(struct inode *inode); -void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); int btrfs_is_empty_uuid(u8 *uuid); int btrfs_defrag_file(struct inode *inode, struct file *file, struct btrfs_ioctl_defrag_range_args *range, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 8ae409b5a61d..19e4ad2f3f2e 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1727,6 +1727,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, if (over) return 1; + ctx->pos++; } return 0; } diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index bee3edeea7a3..7a93a3e1a847 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -639,11 +639,39 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( write_unlock(&em_tree->lock); } +/* + * Read progress of device replace status according to the state and last + * stored position. 
The value format is the same as for + * btrfs_dev_replace::progress_1000 + */ +static u64 btrfs_dev_replace_progress(struct btrfs_fs_info *fs_info) +{ + struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + u64 ret = 0; + + switch (dev_replace->replace_state) { + case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: + ret = 0; + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: + ret = 1000; + break; + case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: + case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + ret = div64_u64(dev_replace->cursor_left, + div_u64(btrfs_device_get_total_bytes( + dev_replace->srcdev), 1000)); + break; + } + + return ret; +} + void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args) { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - struct btrfs_device *srcdev; btrfs_dev_replace_lock(dev_replace, 0); /* even if !dev_replace_is_valid, the values are good enough for @@ -656,21 +684,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, atomic64_read(&dev_replace->num_write_errors); args->status.num_uncorrectable_read_errors = atomic64_read(&dev_replace->num_uncorrectable_read_errors); - switch (dev_replace->replace_state) { - case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: - case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: - args->status.progress_1000 = 0; - break; - case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: - args->status.progress_1000 = 1000; - break; - case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: - case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: - srcdev = dev_replace->srcdev; - args->status.progress_1000 = div64_u64(dev_replace->cursor_left, - div_u64(btrfs_device_get_total_bytes(srcdev), 1000)); - break; - } + args->status.progress_1000 = btrfs_dev_replace_progress(fs_info); btrfs_dev_replace_unlock(dev_replace, 0); } @@ -795,25 +809,19 @@ static int btrfs_dev_replace_kthread(void *data) { struct btrfs_fs_info *fs_info = data; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - struct btrfs_ioctl_dev_replace_args *status_args; u64 progress; - status_args = kzalloc(sizeof(*status_args), GFP_KERNEL); - if (status_args) { - btrfs_dev_replace_status(fs_info, status_args); - progress = status_args->status.progress_1000; - kfree(status_args); - progress = div_u64(progress, 10); - btrfs_info_in_rcu(fs_info, - "continuing dev_replace from %s (devid %llu) to %s @%u%%", - dev_replace->srcdev->missing ? "<missing disk>" : - rcu_str_deref(dev_replace->srcdev->name), - dev_replace->srcdev->devid, - dev_replace->tgtdev ? - rcu_str_deref(dev_replace->tgtdev->name) : - "<missing target disk>", - (unsigned int)progress); - } + progress = btrfs_dev_replace_progress(fs_info); + progress = div_u64(progress, 10); + btrfs_info_in_rcu(fs_info, + "continuing dev_replace from %s (devid %llu) to %s @%u%%", + dev_replace->srcdev->missing ? "<missing disk>" + : rcu_str_deref(dev_replace->srcdev->name), + dev_replace->srcdev->devid, + dev_replace->tgtdev ? 
rcu_str_deref(dev_replace->tgtdev->name) + : "<missing target disk>", + (unsigned int)progress); + btrfs_dev_replace_continue_on_mount(fs_info); clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f45b61fe9a9a..b6dc1d179d23 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -529,7 +529,7 @@ static int check_tree_block_fsid(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - u8 fsid[BTRFS_UUID_SIZE]; + u8 fsid[BTRFS_FSID_SIZE]; int ret = 1; read_extent_buffer(eb, fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); @@ -1343,7 +1343,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, atomic_set(&root->log_batch, 0); atomic_set(&root->orphan_inodes, 0); refcount_set(&root->refs, 1); - atomic_set(&root->will_be_snapshoted, 0); + atomic_set(&root->will_be_snapshotted, 0); atomic64_set(&root->qgroup_meta_rsv, 0); root->log_transid = 0; root->log_transid_committed = -1; @@ -2694,8 +2694,8 @@ int open_ctree(struct super_block *sb, btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work); - sb->s_blocksize = 4096; - sb->s_blocksize_bits = blksize_bits(4096); + sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE; + sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE); btrfs_init_btree_inode(fs_info); @@ -2828,6 +2828,8 @@ int open_ctree(struct super_block *sb, features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; if (fs_info->compress_type == BTRFS_COMPRESS_LZO) features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; + else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD) + features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD; if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) btrfs_info(fs_info, "has skinny extents"); @@ -3035,15 +3037,10 @@ retry_root_backup: btrfs_err(fs_info, "failed to read block groups: %d", ret); goto fail_sysfs; } - fs_info->num_tolerated_disk_barrier_failures = - btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); - if (fs_info->fs_devices->missing_devices > - fs_info->num_tolerated_disk_barrier_failures && - !(sb->s_flags & MS_RDONLY)) { + + if (!(sb->s_flags & MS_RDONLY) && !btrfs_check_rw_degradable(fs_info)) { btrfs_warn(fs_info, -"missing devices (%llu) exceeds the limit (%d), writeable mount is not allowed", - fs_info->fs_devices->missing_devices, - fs_info->num_tolerated_disk_barrier_failures); + "writeable mount is not allowed due to too many missing devices"); goto fail_sysfs; } @@ -3058,11 +3055,9 @@ retry_root_backup: if (IS_ERR(fs_info->transaction_kthread)) goto fail_cleaner; - if (!btrfs_test_opt(fs_info, SSD) && - !btrfs_test_opt(fs_info, NOSSD) && + if (!btrfs_test_opt(fs_info, NOSSD) && !fs_info->fs_devices->rotating) { - btrfs_info(fs_info, "detected SSD devices, enabling SSD mode"); - btrfs_set_opt(fs_info->mount_opt, SSD); + btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations"); } /* @@ -3321,7 +3316,7 @@ int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num, if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode)) return -EINVAL; - bh = __bread(bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE); + bh = __bread(bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, BTRFS_SUPER_INFO_SIZE); /* * If we fail to read from the underlying devices, as of now * the best option we have is to mark it EIO. @@ -3378,19 +3373,17 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) } /* - * this should be called twice, once with wait == 0 and - * once with wait == 1. 
When wait == 0 is done, all the buffer heads - * we write are pinned. + * Write superblock @sb to the @device. Do not wait for completion, all the + * buffer heads we write are pinned. * - * They are released when wait == 1 is done. - * max_mirrors must be the same for both runs, and it indicates how - * many supers on this one device should be written. + * Write @max_mirrors copies of the superblock, where 0 means default that fit + * the expected device size at commit time. Note that max_mirrors must be + * same for write and wait phases. * - * max_mirrors == 0 means to write them all. + * Return number of errors when buffer head is not found or submission fails. */ static int write_dev_supers(struct btrfs_device *device, - struct btrfs_super_block *sb, - int wait, int max_mirrors) + struct btrfs_super_block *sb, int max_mirrors) { struct buffer_head *bh; int i; @@ -3408,57 +3401,33 @@ static int write_dev_supers(struct btrfs_device *device, device->commit_total_bytes) break; - if (wait) { - bh = __find_get_block(device->bdev, bytenr / 4096, - BTRFS_SUPER_INFO_SIZE); - if (!bh) { - errors++; - continue; - } - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) - errors++; + btrfs_set_super_bytenr(sb, bytenr); - /* drop our reference */ - brelse(bh); + crc = ~(u32)0; + crc = btrfs_csum_data((const char *)sb + BTRFS_CSUM_SIZE, crc, + BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE); + btrfs_csum_final(crc, sb->csum); - /* drop the reference from the wait == 0 run */ - brelse(bh); + /* One reference for us, and we leave it for the caller */ + bh = __getblk(device->bdev, bytenr / BTRFS_BDEV_BLOCKSIZE, + BTRFS_SUPER_INFO_SIZE); + if (!bh) { + btrfs_err(device->fs_info, + "couldn't get super buffer head for bytenr %llu", + bytenr); + errors++; continue; - } else { - btrfs_set_super_bytenr(sb, bytenr); - - crc = ~(u32)0; - crc = btrfs_csum_data((const char *)sb + - BTRFS_CSUM_SIZE, crc, - BTRFS_SUPER_INFO_SIZE - - BTRFS_CSUM_SIZE); - btrfs_csum_final(crc, sb->csum); - - /* - * one reference for us, and we leave it for the - * caller - */ - bh = __getblk(device->bdev, bytenr / 4096, - BTRFS_SUPER_INFO_SIZE); - if (!bh) { - btrfs_err(device->fs_info, - "couldn't get super buffer head for bytenr %llu", - bytenr); - errors++; - continue; - } + } - memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); + memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); - /* one reference for submit_bh */ - get_bh(bh); + /* one reference for submit_bh */ + get_bh(bh); - set_buffer_uptodate(bh); - lock_buffer(bh); - bh->b_end_io = btrfs_end_buffer_write_sync; - bh->b_private = device; - } + set_buffer_uptodate(bh); + lock_buffer(bh); + bh->b_end_io = btrfs_end_buffer_write_sync; + bh->b_private = device; /* * we fua the first super. The others we allow @@ -3466,9 +3435,10 @@ static int write_dev_supers(struct btrfs_device *device, */ if (i == 0) { ret = btrfsic_submit_bh(REQ_OP_WRITE, - REQ_SYNC | REQ_FUA, bh); + REQ_SYNC | REQ_FUA | REQ_META | REQ_PRIO, bh); } else { - ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); + ret = btrfsic_submit_bh(REQ_OP_WRITE, + REQ_SYNC | REQ_META | REQ_PRIO, bh); } if (ret) errors++; @@ -3477,6 +3447,50 @@ static int write_dev_supers(struct btrfs_device *device, } /* + * Wait for write completion of superblocks done by write_dev_supers, + * @max_mirrors same for write and wait phases. + * + * Return number of errors when buffer head is not found or not marked up to + * date. 
+ */ +static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) +{ + struct buffer_head *bh; + int i; + int errors = 0; + u64 bytenr; + + if (max_mirrors == 0) + max_mirrors = BTRFS_SUPER_MIRROR_MAX; + + for (i = 0; i < max_mirrors; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= + device->commit_total_bytes) + break; + + bh = __find_get_block(device->bdev, + bytenr / BTRFS_BDEV_BLOCKSIZE, + BTRFS_SUPER_INFO_SIZE); + if (!bh) { + errors++; + continue; + } + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + errors++; + + /* drop our reference */ + brelse(bh); + + /* drop the reference from the writing run */ + brelse(bh); + } + + return errors < i ? 0 : -1; +} + +/* * endio for the write_dev_flush, this will wake anyone waiting * for the barrier when it is done */ @@ -3499,12 +3513,12 @@ static void write_dev_flush(struct btrfs_device *device) bio_reset(bio); bio->bi_end_io = btrfs_end_empty_barrier; - bio->bi_bdev = device->bdev; + bio_set_dev(bio, device->bdev); bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; - submit_bio(bio); + btrfsic_submit_bio(bio); device->flush_bio_sent = 1; } @@ -3524,20 +3538,10 @@ static blk_status_t wait_dev_flush(struct btrfs_device *device) return bio->bi_status; } -static int check_barrier_error(struct btrfs_fs_devices *fsdevs) +static int check_barrier_error(struct btrfs_fs_info *fs_info) { - int dev_flush_error = 0; - struct btrfs_device *dev; - - list_for_each_entry_rcu(dev, &fsdevs->devices, dev_list) { - if (!dev->bdev || dev->last_flush_error) - dev_flush_error++; - } - - if (dev_flush_error > - fsdevs->fs_info->num_tolerated_disk_barrier_failures) + if (!btrfs_check_rw_degradable(fs_info)) return -EIO; - return 0; } @@ -3592,7 +3596,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info) * to arrive at the volume status. So error checking * is being pushed to a separate loop. 
*/ - return check_barrier_error(info->fs_devices); + return check_barrier_error(info); } return 0; } @@ -3626,60 +3630,6 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) return min_tolerated; } -int btrfs_calc_num_tolerated_disk_barrier_failures( - struct btrfs_fs_info *fs_info) -{ - struct btrfs_ioctl_space_info space; - struct btrfs_space_info *sinfo; - u64 types[] = {BTRFS_BLOCK_GROUP_DATA, - BTRFS_BLOCK_GROUP_SYSTEM, - BTRFS_BLOCK_GROUP_METADATA, - BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA}; - int i; - int c; - int num_tolerated_disk_barrier_failures = - (int)fs_info->fs_devices->num_devices; - - for (i = 0; i < ARRAY_SIZE(types); i++) { - struct btrfs_space_info *tmp; - - sinfo = NULL; - rcu_read_lock(); - list_for_each_entry_rcu(tmp, &fs_info->space_info, list) { - if (tmp->flags == types[i]) { - sinfo = tmp; - break; - } - } - rcu_read_unlock(); - - if (!sinfo) - continue; - - down_read(&sinfo->groups_sem); - for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) { - u64 flags; - - if (list_empty(&sinfo->block_groups[c])) - continue; - - btrfs_get_block_group_info(&sinfo->block_groups[c], - &space); - if (space.total_bytes == 0 || space.used_bytes == 0) - continue; - flags = space.flags; - - num_tolerated_disk_barrier_failures = min( - num_tolerated_disk_barrier_failures, - btrfs_get_num_tolerated_disk_barrier_failures( - flags)); - } - up_read(&sinfo->groups_sem); - } - - return num_tolerated_disk_barrier_failures; -} - int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) { struct list_head *head; @@ -3732,12 +3682,12 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) btrfs_set_stack_device_io_width(dev_item, dev->io_width); btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE); - memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE); + memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_FSID_SIZE); flags = btrfs_super_flags(sb); btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); - ret = write_dev_supers(dev, sb, 0, max_mirrors); + ret = write_dev_supers(dev, sb, max_mirrors); if (ret) total_errors++; } @@ -3760,7 +3710,7 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) if (!dev->in_fs_metadata || !dev->writeable) continue; - ret = write_dev_supers(dev, sb, 1, max_mirrors); + ret = wait_dev_supers(dev, max_mirrors); if (ret) total_errors++; } @@ -3995,7 +3945,6 @@ void close_ctree(struct btrfs_fs_info *fs_info) __btrfs_free_block_rsv(root->orphan_block_rsv); root->orphan_block_rsv = NULL; - mutex_lock(&fs_info->chunk_mutex); while (!list_empty(&fs_info->pinned_chunks)) { struct extent_map *em; @@ -4004,7 +3953,6 @@ void close_ctree(struct btrfs_fs_info *fs_info) list_del_init(&em->list); free_extent_map(em); } - mutex_unlock(&fs_info->chunk_mutex); } int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, @@ -4053,7 +4001,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) fs_info->dirty_metadata_batch); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) { - btrfs_print_leaf(fs_info, buf); + btrfs_print_leaf(buf); ASSERT(0); } #endif @@ -4173,7 +4121,7 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info) ret = -EINVAL; } - if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) { + if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_FSID_SIZE) != 0) { btrfs_err(fs_info, "dev_item UUID does not match fsid: %pU != %pU", fs_info->fsid, 
sb->dev_item.fsid); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 0a634d3ffc16..7f7c35d6347a 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -25,6 +25,14 @@ #define BTRFS_SUPER_MIRROR_MAX 3 #define BTRFS_SUPER_MIRROR_SHIFT 12 +/* + * Fixed blocksize for all devices, applies to specific ways of reading + * metadata like superblock. Must meet the set_blocksize requirements. + * + * Do not change. + */ +#define BTRFS_BDEV_BLOCKSIZE (4096) + enum btrfs_wq_endio_type { BTRFS_WQ_ENDIO_DATA = 0, BTRFS_WQ_ENDIO_METADATA = 1, @@ -142,8 +150,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); -int btrfs_calc_num_tolerated_disk_barrier_failures( - struct btrfs_fs_info *fs_info); int __init btrfs_end_io_wq_init(void); void btrfs_end_io_wq_exit(void); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e3b0b4196d3d..e2d7e86b51d1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1148,6 +1148,64 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans, } #endif +/* + * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required, + * is_data == BTRFS_REF_TYPE_DATA, data type is requried, + * is_data == BTRFS_REF_TYPE_ANY, either type is OK. + */ +int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, + struct btrfs_extent_inline_ref *iref, + enum btrfs_inline_ref_type is_data) +{ + int type = btrfs_extent_inline_ref_type(eb, iref); + u64 offset = btrfs_extent_inline_ref_offset(eb, iref); + + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY || + type == BTRFS_SHARED_DATA_REF_KEY || + type == BTRFS_EXTENT_DATA_REF_KEY) { + if (is_data == BTRFS_REF_TYPE_BLOCK) { + if (type == BTRFS_TREE_BLOCK_REF_KEY) + return type; + if (type == BTRFS_SHARED_BLOCK_REF_KEY) { + ASSERT(eb->fs_info); + /* + * Every shared one has parent tree + * block, which must be aligned to + * nodesize. + */ + if (offset && + IS_ALIGNED(offset, eb->fs_info->nodesize)) + return type; + } + } else if (is_data == BTRFS_REF_TYPE_DATA) { + if (type == BTRFS_EXTENT_DATA_REF_KEY) + return type; + if (type == BTRFS_SHARED_DATA_REF_KEY) { + ASSERT(eb->fs_info); + /* + * Every shared one has parent tree + * block, which must be aligned to + * nodesize. + */ + if (offset && + IS_ALIGNED(offset, eb->fs_info->nodesize)) + return type; + } + } else { + ASSERT(is_data == BTRFS_REF_TYPE_ANY); + return type; + } + } + + btrfs_print_leaf((struct extent_buffer *)eb); + btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d", + eb->start, type); + WARN_ON(1); + + return BTRFS_REF_TYPE_INVALID; +} + static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) { u32 high_crc = ~(u32)0; @@ -1417,12 +1475,18 @@ static noinline u32 extent_data_ref_count(struct btrfs_path *path, struct btrfs_extent_data_ref *ref1; struct btrfs_shared_data_ref *ref2; u32 num_refs = 0; + int type; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (iref) { - if (btrfs_extent_inline_ref_type(leaf, iref) == - BTRFS_EXTENT_DATA_REF_KEY) { + /* + * If type is invalid, we should have bailed out earlier than + * this call. 
+ */ + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); + ASSERT(type != BTRFS_REF_TYPE_INVALID); + if (type == BTRFS_EXTENT_DATA_REF_KEY) { ref1 = (struct btrfs_extent_data_ref *)(&iref->offset); num_refs = btrfs_extent_data_ref_count(leaf, ref1); } else { @@ -1583,6 +1647,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans, int ret; int err = 0; bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); + int needed; key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; @@ -1674,6 +1739,11 @@ again: BUG_ON(ptr > end); } + if (owner >= BTRFS_FIRST_FREE_OBJECTID) + needed = BTRFS_REF_TYPE_DATA; + else + needed = BTRFS_REF_TYPE_BLOCK; + err = -ENOENT; while (1) { if (ptr >= end) { @@ -1681,7 +1751,12 @@ again: break; } iref = (struct btrfs_extent_inline_ref *)ptr; - type = btrfs_extent_inline_ref_type(leaf, iref); + type = btrfs_get_extent_inline_ref_type(leaf, iref, needed); + if (type == BTRFS_REF_TYPE_INVALID) { + err = -EINVAL; + goto out; + } + if (want < type) break; if (want > type) { @@ -1873,7 +1948,12 @@ void update_inline_extent_backref(struct btrfs_fs_info *fs_info, if (extent_op) __run_delayed_extent_op(extent_op, leaf, ei); - type = btrfs_extent_inline_ref_type(leaf, iref); + /* + * If type is invalid, we should have bailed out after + * lookup_inline_extent_backref(). + */ + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY); + ASSERT(type != BTRFS_REF_TYPE_INVALID); if (type == BTRFS_EXTENT_DATA_REF_KEY) { dref = (struct btrfs_extent_data_ref *)(&iref->offset); @@ -3158,6 +3238,7 @@ static noinline int check_committed_ref(struct btrfs_root *root, struct btrfs_extent_item *ei; struct btrfs_key key; u32 item_size; + int type; int ret; key.objectid = bytenr; @@ -3199,8 +3280,9 @@ static noinline int check_committed_ref(struct btrfs_root *root, goto out; iref = (struct btrfs_extent_inline_ref *)(ei + 1); - if (btrfs_extent_inline_ref_type(leaf, iref) != - BTRFS_EXTENT_DATA_REF_KEY) + + type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA); + if (type != BTRFS_EXTENT_DATA_REF_KEY) goto out; ref = (struct btrfs_extent_data_ref *)(&iref->offset); @@ -4199,9 +4281,9 @@ static u64 btrfs_space_info_used(struct btrfs_space_info *s_info, int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) { - struct btrfs_space_info *data_sinfo; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; u64 used; int ret = 0; int need_commit = 2; @@ -4215,10 +4297,6 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) ASSERT(current->journal_info); } - data_sinfo = fs_info->data_sinfo; - if (!data_sinfo) - goto alloc; - again: /* make sure we have enough space to handle the data first */ spin_lock(&data_sinfo->lock); @@ -4236,7 +4314,7 @@ again: data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; spin_unlock(&data_sinfo->lock); -alloc: + alloc_target = btrfs_data_alloc_profile(fs_info); /* * It is ugly that we don't call nolock join @@ -4264,9 +4342,6 @@ alloc: } } - if (!data_sinfo) - data_sinfo = fs_info->data_sinfo; - goto again; } @@ -4425,8 +4500,7 @@ static int should_alloc_chunk(struct btrfs_fs_info *fs_info, struct btrfs_space_info *sinfo, int force) { struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; - u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; + u64 bytes_used = 
btrfs_space_info_used(sinfo, false); u64 thresh; if (force == CHUNK_ALLOC_FORCE) @@ -4438,7 +4512,7 @@ static int should_alloc_chunk(struct btrfs_fs_info *fs_info, * global_rsv, it doesn't change except when the transaction commits. */ if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA) - num_allocated += calc_global_rsv_need_space(global_rsv); + bytes_used += calc_global_rsv_need_space(global_rsv); /* * in limited mode, we want to have some free space up to @@ -4448,11 +4522,11 @@ static int should_alloc_chunk(struct btrfs_fs_info *fs_info, thresh = btrfs_super_total_bytes(fs_info->super_copy); thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); - if (num_bytes - num_allocated < thresh) + if (sinfo->total_bytes - bytes_used < thresh) return 1; } - if (num_allocated + SZ_2M < div_factor(num_bytes, 8)) + if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8)) return 0; return 1; } @@ -4904,9 +4978,14 @@ struct reserve_ticket { wait_queue_head_t wait; }; -static int flush_space(struct btrfs_fs_info *fs_info, +/* + * Try to flush some data based on policy set by @state. This is only advisory + * and may fail for various reasons. The caller is supposed to examine the + * state of @space_info to detect the outcome. + */ +static void flush_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 num_bytes, - u64 orig_bytes, int state) + int state) { struct btrfs_root *root = fs_info->extent_root; struct btrfs_trans_handle *trans; @@ -4931,7 +5010,7 @@ static int flush_space(struct btrfs_fs_info *fs_info, break; case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: - shrink_delalloc(fs_info, num_bytes * 2, orig_bytes, + shrink_delalloc(fs_info, num_bytes * 2, num_bytes, state == FLUSH_DELALLOC_WAIT); break; case ALLOC_CHUNK: @@ -4949,16 +5028,16 @@ static int flush_space(struct btrfs_fs_info *fs_info, break; case COMMIT_TRANS: ret = may_commit_transaction(fs_info, space_info, - orig_bytes, 0); + num_bytes, 0); break; default: ret = -ENOSPC; break; } - trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, - orig_bytes, state, ret); - return ret; + trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state, + ret); + return; } static inline u64 @@ -5060,11 +5139,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) flush_state = FLUSH_DELAYED_ITEMS_NR; do { - struct reserve_ticket *ticket; - int ret; - - ret = flush_space(fs_info, space_info, to_reclaim, to_reclaim, - flush_state); + flush_space(fs_info, space_info, to_reclaim, flush_state); spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { space_info->flush = 0; @@ -5074,8 +5149,6 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, false); - ticket = list_first_entry(&space_info->tickets, - struct reserve_ticket, list); if (last_tickets_id == space_info->tickets_id) { flush_state++; } else { @@ -5120,8 +5193,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, spin_unlock(&space_info->lock); do { - flush_space(fs_info, space_info, to_reclaim, to_reclaim, - flush_state); + flush_space(fs_info, space_info, to_reclaim, flush_state); flush_state++; spin_lock(&space_info->lock); if (ticket->bytes == 0) { @@ -6664,19 +6736,20 @@ fetch_cluster_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 *empty_cluster) { struct btrfs_free_cluster *ret = NULL; - bool ssd = btrfs_test_opt(fs_info, SSD); *empty_cluster = 0; if 
(btrfs_mixed_space_info(space_info)) return ret; - if (ssd) - *empty_cluster = SZ_2M; if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { ret = &fs_info->meta_alloc_cluster; - if (!ssd) + if (btrfs_test_opt(fs_info, SSD)) + *empty_cluster = SZ_2M; + else *empty_cluster = SZ_64K; - } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) { + } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && + btrfs_test_opt(fs_info, SSD_SPREAD)) { + *empty_cluster = SZ_2M; ret = &fs_info->data_alloc_cluster; } @@ -6755,7 +6828,7 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, if (!readonly && return_free_space && global_rsv->space_info == space_info) { u64 to_add = len; - WARN_ON(!return_free_space); + spin_lock(&global_rsv->lock); if (!global_rsv->full) { to_add = min(len, global_rsv->size - @@ -6841,7 +6914,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, if (ret) { const char *errstr = btrfs_decode_error(ret); btrfs_warn(fs_info, - "Discard failed while removing blockgroup: errno=%d %s\n", + "discard failed while removing blockgroup: errno=%d %s", ret, errstr); } } @@ -6969,7 +7042,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, "umm, got %d back from search, was looking for %llu", ret, bytenr); if (ret > 0) - btrfs_print_leaf(info, path->nodes[0]); + btrfs_print_leaf(path->nodes[0]); } if (ret < 0) { btrfs_abort_transaction(trans, ret); @@ -6978,7 +7051,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, extent_slot = path->slots[0]; } } else if (WARN_ON(ret == -ENOENT)) { - btrfs_print_leaf(info, path->nodes[0]); + btrfs_print_leaf(path->nodes[0]); btrfs_err(info, "unable to find ref byte nr %llu parent %llu root %llu owner %llu offset %llu", bytenr, parent, root_objectid, owner_objectid, @@ -7015,7 +7088,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_err(info, "umm, got %d back from search, was looking for %llu", ret, bytenr); - btrfs_print_leaf(info, path->nodes[0]); + btrfs_print_leaf(path->nodes[0]); } if (ret < 0) { btrfs_abort_transaction(trans, ret); @@ -9193,7 +9266,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, if (err) goto out_end_trans; - ret = btrfs_del_root(trans, tree_root, &root->root_key); + ret = btrfs_del_root(trans, fs_info, &root->root_key); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -9952,11 +10025,8 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info, cache->key.offset = size; cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - cache->sectorsize = fs_info->sectorsize; cache->fs_info = fs_info; - cache->full_stripe_len = btrfs_full_stripe_len(fs_info, - &fs_info->mapping_tree, - start); + cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start); set_free_space_tree_thresholds(cache); atomic_set(&cache->count, 1); @@ -10192,8 +10262,7 @@ next: int btrfs_make_block_group(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytes_used, - u64 type, u64 chunk_objectid, u64 chunk_offset, - u64 size) + u64 type, u64 chunk_offset, u64 size) { struct btrfs_block_group_cache *cache; int ret; @@ -10205,7 +10274,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, return -ENOMEM; btrfs_set_block_group_used(&cache->item, bytes_used); - btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); + btrfs_set_block_group_chunk_objectid(&cache->item, + BTRFS_FIRST_CHUNK_TREE_OBJECTID); btrfs_set_block_group_flags(&cache->item, type); cache->flags = type; @@ -11001,14 +11071,14 @@ int 
btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range) } /* - * btrfs_{start,end}_write_no_snapshoting() are similar to + * btrfs_{start,end}_write_no_snapshotting() are similar to * mnt_{want,drop}_write(), they are used to prevent some tasks from writing * data into the page cache through nocow before the subvolume is snapshoted, * but flush the data into disk after the snapshot creation, or to prevent - * operations while snapshoting is ongoing and that cause the snapshot to be + * operations while snapshotting is ongoing and that cause the snapshot to be * inconsistent (writes followed by expanding truncates for example). */ -void btrfs_end_write_no_snapshoting(struct btrfs_root *root) +void btrfs_end_write_no_snapshotting(struct btrfs_root *root) { percpu_counter_dec(&root->subv_writers->counter); /* @@ -11019,9 +11089,9 @@ void btrfs_end_write_no_snapshoting(struct btrfs_root *root) wake_up(&root->subv_writers->wait); } -int btrfs_start_write_no_snapshoting(struct btrfs_root *root) +int btrfs_start_write_no_snapshotting(struct btrfs_root *root) { - if (atomic_read(&root->will_be_snapshoted)) + if (atomic_read(&root->will_be_snapshotted)) return 0; percpu_counter_inc(&root->subv_writers->counter); @@ -11029,14 +11099,14 @@ int btrfs_start_write_no_snapshoting(struct btrfs_root *root) * Make sure counter is updated before we check for snapshot creation. */ smp_mb(); - if (atomic_read(&root->will_be_snapshoted)) { - btrfs_end_write_no_snapshoting(root); + if (atomic_read(&root->will_be_snapshotted)) { + btrfs_end_write_no_snapshotting(root); return 0; } return 1; } -static int wait_snapshoting_atomic_t(atomic_t *a) +static int wait_snapshotting_atomic_t(atomic_t *a) { schedule(); return 0; @@ -11047,11 +11117,11 @@ void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) while (true) { int ret; - ret = btrfs_start_write_no_snapshoting(root); + ret = btrfs_start_write_no_snapshotting(root); if (ret) break; - wait_on_atomic_t(&root->will_be_snapshoted, - wait_snapshoting_atomic_t, + wait_on_atomic_t(&root->will_be_snapshotted, + wait_snapshotting_atomic_t, TASK_UNINTERRUPTIBLE); } } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0aff9b278c19..0f077c5db58e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -20,7 +20,6 @@ #include "locking.h" #include "rcu-string.h" #include "backref.h" -#include "transaction.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -1998,7 +1997,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, * read repair operation. 
*/ btrfs_bio_counter_inc_blocked(fs_info); - if (btrfs_is_parity_mirror(fs_info, logical, length, mirror_num)) { + if (btrfs_is_parity_mirror(fs_info, logical, length)) { /* * Note that we don't use BTRFS_MAP_WRITE because it's supposed * to update all raid stripes, but here we just want to correct @@ -2033,7 +2032,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, bio_put(bio); return -EIO; } - bio->bi_bdev = dev->bdev; + bio_set_dev(bio, dev->bdev); bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; bio_add_page(bio, page, length, pg_offset); @@ -2335,7 +2334,7 @@ struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, bio = btrfs_io_bio_alloc(1); bio->bi_end_io = endio_func; bio->bi_iter.bi_sector = failrec->logical >> 9; - bio->bi_bdev = fs_info->fs_devices->latest_bdev; + bio_set_dev(bio, fs_info->fs_devices->latest_bdev); bio->bi_iter.bi_size = 0; bio->bi_private = data; @@ -2675,7 +2674,7 @@ struct bio *btrfs_bio_alloc(struct block_device *bdev, u64 first_byte) struct bio *bio; bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, btrfs_bioset); - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = first_byte >> 9; btrfs_io_bio_init(btrfs_io_bio(bio)); return bio; @@ -2757,7 +2756,10 @@ static int merge_bio(struct extent_io_tree *tree, struct page *page, } -static int submit_extent_page(int op, int op_flags, struct extent_io_tree *tree, +/* + * @opf: bio REQ_OP_* and REQ_* flags as one value + */ +static int submit_extent_page(unsigned int opf, struct extent_io_tree *tree, struct writeback_control *wbc, struct page *page, sector_t sector, size_t size, unsigned long offset, @@ -2804,7 +2806,7 @@ static int submit_extent_page(int op, int op_flags, struct extent_io_tree *tree, bio->bi_end_io = end_io_func; bio->bi_private = tree; bio->bi_write_hint = page->mapping->host->i_write_hint; - bio_set_op_attrs(bio, op, op_flags); + bio->bi_opf = opf; if (wbc) { wbc_init_bio(wbc, bio); wbc_account_io(wbc, page, page_size); @@ -2878,7 +2880,7 @@ static int __do_readpage(struct extent_io_tree *tree, get_extent_t *get_extent, struct extent_map **em_cached, struct bio **bio, int mirror_num, - unsigned long *bio_flags, int read_flags, + unsigned long *bio_flags, unsigned int read_flags, u64 *prev_em_start) { struct inode *inode = page->mapping->host; @@ -3059,7 +3061,7 @@ static int __do_readpage(struct extent_io_tree *tree, continue; } - ret = submit_extent_page(REQ_OP_READ, read_flags, tree, NULL, + ret = submit_extent_page(REQ_OP_READ | read_flags, tree, NULL, page, sector, disk_io_size, pg_offset, bdev, bio, end_bio_extent_readpage, mirror_num, @@ -3164,7 +3166,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, struct bio **bio, int mirror_num, - unsigned long *bio_flags, int read_flags) + unsigned long *bio_flags, + unsigned int read_flags) { struct inode *inode = page->mapping->host; struct btrfs_ordered_extent *ordered; @@ -3311,7 +3314,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, struct extent_page_data *epd, loff_t i_size, unsigned long nr_written, - int write_flags, int *nr_ret) + unsigned int write_flags, int *nr_ret) { struct extent_io_tree *tree = epd->tree; u64 start = page_offset(page); @@ -3427,7 +3430,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, page->index, cur, end); } - ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc, + ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, 
page, sector, iosize, pg_offset, bdev, &epd->bio, end_bio_extent_writepage, @@ -3465,7 +3468,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, size_t pg_offset = 0; loff_t i_size = i_size_read(inode); unsigned long end_index = i_size >> PAGE_SHIFT; - int write_flags = 0; + unsigned int write_flags = 0; unsigned long nr_written = 0; if (wbc->sync_mode == WB_SYNC_ALL) @@ -3715,7 +3718,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, unsigned long i, num_pages; unsigned long bio_flags = 0; unsigned long start, end; - int write_flags = (epd->sync_io ? REQ_SYNC : 0) | REQ_META; + unsigned int write_flags = (epd->sync_io ? REQ_SYNC : 0) | REQ_META; int ret = 0; clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); @@ -3745,7 +3748,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); - ret = submit_extent_page(REQ_OP_WRITE, write_flags, tree, wbc, + ret = submit_extent_page(REQ_OP_WRITE | write_flags, tree, wbc, p, offset >> 9, PAGE_SIZE, 0, bdev, &epd->bio, end_bio_extent_buffer_writepage, @@ -4606,36 +4609,21 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, flags |= (FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN); } else if (fieinfo->fi_extents_max) { - struct btrfs_trans_handle *trans; - u64 bytenr = em->block_start - (em->start - em->orig_start); disko = em->block_start + offset_in_extent; /* - * We need a trans handle to get delayed refs - */ - trans = btrfs_join_transaction(root); - /* - * It's OK if we can't start a trans we can still check - * from commit_root - */ - if (IS_ERR(trans)) - trans = NULL; - - /* * As btrfs supports shared space, this information * can be exported to userspace tools via * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0 * then we're just getting a count and we can skip the * lookup stuff. */ - ret = btrfs_check_shared(trans, root->fs_info, - root->objectid, - btrfs_ino(BTRFS_I(inode)), bytenr); - if (trans) - btrfs_end_transaction(trans); + ret = btrfs_check_shared(root, + btrfs_ino(BTRFS_I(inode)), + bytenr); if (ret < 0) goto out_free; if (ret) @@ -5405,9 +5393,8 @@ unlock_exit: return ret; } -void read_extent_buffer(struct extent_buffer *eb, void *dstv, - unsigned long start, - unsigned long len) +void read_extent_buffer(const struct extent_buffer *eb, void *dstv, + unsigned long start, unsigned long len) { size_t cur; size_t offset; @@ -5417,8 +5404,12 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, size_t start_offset = eb->start & ((u64)PAGE_SIZE - 1); unsigned long i = (start_offset + start) >> PAGE_SHIFT; - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); + if (start + len > eb->len) { + WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n", + eb->start, eb->len, start, len); + memset(dst, 0, len); + return; + } offset = (start_offset + start) & (PAGE_SIZE - 1); @@ -5436,9 +5427,9 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, } } -int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv, - unsigned long start, - unsigned long len) +int read_extent_buffer_to_user(const struct extent_buffer *eb, + void __user *dstv, + unsigned long start, unsigned long len) { size_t cur; size_t offset; @@ -5478,10 +5469,10 @@ int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv, * return 1 if the item spans two pages. * return -EINVAL otherwise. 
*/ -int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, - unsigned long min_len, char **map, - unsigned long *map_start, - unsigned long *map_len) +int map_private_extent_buffer(const struct extent_buffer *eb, + unsigned long start, unsigned long min_len, + char **map, unsigned long *map_start, + unsigned long *map_len) { size_t offset = start & (PAGE_SIZE - 1); char *kaddr; @@ -5491,6 +5482,12 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, unsigned long end_i = (start_offset + start + min_len - 1) >> PAGE_SHIFT; + if (start + min_len > eb->len) { + WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n", + eb->start, eb->len, start, min_len); + return -EINVAL; + } + if (i != end_i) return 1; @@ -5502,12 +5499,6 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, *map_start = ((u64)i << PAGE_SHIFT) - start_offset; } - if (start + min_len > eb->len) { - WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n", - eb->start, eb->len, start, min_len); - return -EINVAL; - } - p = eb->pages[i]; kaddr = page_address(p); *map = kaddr + offset; @@ -5515,9 +5506,8 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, return 0; } -int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, - unsigned long start, - unsigned long len) +int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, + unsigned long start, unsigned long len) { size_t cur; size_t offset; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4f030912f3ef..faffa28ba707 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -449,14 +449,13 @@ static inline void extent_buffer_get(struct extent_buffer *eb) atomic_inc(&eb->refs); } -int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, - unsigned long start, - unsigned long len); -void read_extent_buffer(struct extent_buffer *eb, void *dst, +int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, + unsigned long start, unsigned long len); +void read_extent_buffer(const struct extent_buffer *eb, void *dst, unsigned long start, unsigned long len); -int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dst, - unsigned long start, +int read_extent_buffer_to_user(const struct extent_buffer *eb, + void __user *dst, unsigned long start, unsigned long len); void write_extent_buffer_fsid(struct extent_buffer *eb, const void *src); void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb, @@ -486,10 +485,10 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb); void clear_extent_buffer_uptodate(struct extent_buffer *eb); int extent_buffer_uptodate(struct extent_buffer *eb); int extent_buffer_under_io(struct extent_buffer *eb); -int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, - unsigned long min_len, char **map, - unsigned long *map_start, - unsigned long *map_len); +int map_private_extent_buffer(const struct extent_buffer *eb, + unsigned long offset, unsigned long min_len, + char **map, unsigned long *map_start, + unsigned long *map_len); void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9e75d8a39aac..74fd7756cff3 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1536,7 
+1536,7 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, u64 num_bytes; int ret; - ret = btrfs_start_write_no_snapshoting(root); + ret = btrfs_start_write_no_snapshotting(root); if (!ret) return -ENOSPC; @@ -1561,7 +1561,7 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos, NULL, NULL, NULL); if (ret <= 0) { ret = 0; - btrfs_end_write_no_snapshoting(root); + btrfs_end_write_no_snapshotting(root); } else { *write_bytes = min_t(size_t, *write_bytes , num_bytes - pos + lockstart); @@ -1664,7 +1664,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, data_reserved, pos, write_bytes); else - btrfs_end_write_no_snapshoting(root); + btrfs_end_write_no_snapshotting(root); break; } @@ -1767,7 +1767,7 @@ again: release_bytes = 0; if (only_release_metadata) - btrfs_end_write_no_snapshoting(root); + btrfs_end_write_no_snapshotting(root); if (only_release_metadata && copied > 0) { lockstart = round_down(pos, @@ -1797,7 +1797,7 @@ again: if (release_bytes) { if (only_release_metadata) { - btrfs_end_write_no_snapshoting(root); + btrfs_end_write_no_snapshotting(root); btrfs_delalloc_release_metadata(BTRFS_I(inode), release_bytes); } else { @@ -1990,8 +1990,15 @@ out: int btrfs_release_file(struct inode *inode, struct file *filp) { - if (filp->private_data) + struct btrfs_file_private *private = filp->private_data; + + if (private && private->trans) btrfs_ioctl_trans_end(filp); + if (private && private->filldir_buf) + kfree(private->filldir_buf); + kfree(private); + filp->private_data = NULL; + /* * ordered_data_close is set by settattr when we are about to truncate * a file from a non-zero size to a zero size. This tries to diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index c5e6180cdb8c..cdc9f4015ec3 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -709,7 +709,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, if (!BTRFS_I(inode)->generation) { btrfs_info(fs_info, - "The free space cache file (%llu) is invalid. skip it\n", + "the free space cache file (%llu) is invalid, skip it", offset); return 0; } diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index a5e34de06c2f..684f12247db7 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1257,7 +1257,7 @@ int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) if (ret) goto abort; - ret = btrfs_del_root(trans, tree_root, &free_space_root->root_key); + ret = btrfs_del_root(trans, fs_info, &free_space_root->root_key); if (ret) goto abort; diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h index 54ffced3bce8..ba3787df43c3 100644 --- a/fs/btrfs/free-space-tree.h +++ b/fs/btrfs/free-space-tree.h @@ -44,7 +44,7 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 start, u64 size); -/* Exposed for testing. 
*/ +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_free_space_info * search_free_space_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, @@ -68,5 +68,6 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, struct btrfs_path *path); int free_space_test_bit(struct btrfs_block_group_cache *block_group, struct btrfs_path *path, u64 offset); +#endif #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 24bcd5cd9cf2..17ad018da0a2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -392,20 +392,23 @@ static noinline int add_async_extent(struct async_cow *cow, return 0; } -static inline int inode_need_compress(struct inode *inode) +static inline int inode_need_compress(struct inode *inode, u64 start, u64 end) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); /* force compress */ if (btrfs_test_opt(fs_info, FORCE_COMPRESS)) return 1; + /* defrag ioctl */ + if (BTRFS_I(inode)->defrag_compress) + return 1; /* bad compression ratios */ if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) return 0; if (btrfs_test_opt(fs_info, COMPRESS) || BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS || - BTRFS_I(inode)->force_compress) - return 1; + BTRFS_I(inode)->prop_compress) + return btrfs_compress_heuristic(inode, start, end); return 0; } @@ -503,7 +506,7 @@ again: * inode has not been flagged as nocompress. This flag can * change at any time if we discover bad compression ratios. */ - if (inode_need_compress(inode)) { + if (inode_need_compress(inode, start, end)) { WARN_ON(pages); pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) { @@ -511,8 +514,10 @@ again: goto cont; } - if (BTRFS_I(inode)->force_compress) - compress_type = BTRFS_I(inode)->force_compress; + if (BTRFS_I(inode)->defrag_compress) + compress_type = BTRFS_I(inode)->defrag_compress; + else if (BTRFS_I(inode)->prop_compress) + compress_type = BTRFS_I(inode)->prop_compress; /* * we need to call clear_page_dirty_for_io on each @@ -645,7 +650,7 @@ cont: /* flag the file so we don't compress in the future */ if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && - !(BTRFS_I(inode)->force_compress)) { + !(BTRFS_I(inode)->prop_compress)) { BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; } } @@ -1381,7 +1386,7 @@ next_slot: * we fall into common COW way. 
*/ if (!nolock) { - err = btrfs_start_write_no_snapshoting(root); + err = btrfs_start_write_no_snapshotting(root); if (!err) goto out_check; } @@ -1393,12 +1398,12 @@ next_slot: if (csum_exist_in_range(fs_info, disk_bytenr, num_bytes)) { if (!nolock) - btrfs_end_write_no_snapshoting(root); + btrfs_end_write_no_snapshotting(root); goto out_check; } if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) { if (!nolock) - btrfs_end_write_no_snapshoting(root); + btrfs_end_write_no_snapshotting(root); goto out_check; } nocow = 1; @@ -1415,7 +1420,7 @@ out_check: if (extent_end <= start) { path->slots[0]++; if (!nolock && nocow) - btrfs_end_write_no_snapshoting(root); + btrfs_end_write_no_snapshotting(root); if (nocow) btrfs_dec_nocow_writers(fs_info, disk_bytenr); goto next_slot; @@ -1438,7 +1443,7 @@ out_check: NULL); if (ret) { if (!nolock && nocow) - btrfs_end_write_no_snapshoting(root); + btrfs_end_write_no_snapshotting(root); if (nocow) btrfs_dec_nocow_writers(fs_info, disk_bytenr); @@ -1459,7 +1464,7 @@ out_check: BTRFS_ORDERED_PREALLOC); if (IS_ERR(em)) { if (!nolock && nocow) - btrfs_end_write_no_snapshoting(root); + btrfs_end_write_no_snapshotting(root); if (nocow) btrfs_dec_nocow_writers(fs_info, disk_bytenr); @@ -1499,7 +1504,7 @@ out_check: PAGE_UNLOCK | PAGE_SET_PRIVATE2); if (!nolock && nocow) - btrfs_end_write_no_snapshoting(root); + btrfs_end_write_no_snapshotting(root); cur_offset = extent_end; /* @@ -1576,7 +1581,7 @@ static int run_delalloc_range(void *private_data, struct page *locked_page, } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) { ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); - } else if (!inode_need_compress(inode)) { + } else if (!inode_need_compress(inode, start, end)) { ret = cow_file_range(inode, locked_page, start, end, end, page_started, nr_written, 1, NULL); } else { @@ -1796,10 +1801,11 @@ static void btrfs_clear_bit_hook(void *private_data, u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(len); - spin_lock(&inode->lock); - if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) + if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) { + spin_lock(&inode->lock); inode->defrag_bytes -= len; - spin_unlock(&inode->lock); + spin_unlock(&inode->lock); + } /* * set_bit and clear bit hooks normally require _irqsave/restore @@ -3159,8 +3165,6 @@ zeroit: memset(kaddr + pgoff, 1, len); flush_dcache_page(page); kunmap_atomic(kaddr); - if (csum_expected == 0) - return 0; return -EIO; } @@ -5055,7 +5059,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) if (newsize > oldsize) { /* - * Don't do an expanding truncate while snapshoting is ongoing. + * Don't do an expanding truncate while snapshotting is ongoing. 
* This is to ensure the snapshot captures a fully consistent * state of this file - if the snapshot captures this expanding * truncation, it must capture all writes that happened before @@ -5064,13 +5068,13 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) btrfs_wait_for_snapshot_creation(root); ret = btrfs_cont_expand(inode, oldsize, newsize); if (ret) { - btrfs_end_write_no_snapshoting(root); + btrfs_end_write_no_snapshotting(root); return ret; } trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { - btrfs_end_write_no_snapshoting(root); + btrfs_end_write_no_snapshotting(root); return PTR_ERR(trans); } @@ -5078,7 +5082,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL); pagecache_isize_extended(inode, oldsize, newsize); ret = btrfs_update_inode(trans, root, inode); - btrfs_end_write_no_snapshoting(root); + btrfs_end_write_no_snapshotting(root); btrfs_end_transaction(trans); } else { @@ -5873,25 +5877,74 @@ unsigned char btrfs_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK }; +/* + * All this infrastructure exists because dir_emit can fault, and we are holding + * the tree lock when doing readdir. For now just allocate a buffer and copy + * our information into that, and then dir_emit from the buffer. This is + * similar to what NFS does, only we don't keep the buffer around in pagecache + * because I'm afraid I'll mess that up. Long term we need to make filldir do + * copy_to_user_inatomic so we don't have to worry about page faulting under the + * tree lock. + */ +static int btrfs_opendir(struct inode *inode, struct file *file) +{ + struct btrfs_file_private *private; + + private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL); + if (!private) + return -ENOMEM; + private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!private->filldir_buf) { + kfree(private); + return -ENOMEM; + } + file->private_data = private; + return 0; +} + +struct dir_entry { + u64 ino; + u64 offset; + unsigned type; + int name_len; +}; + +static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx) +{ + while (entries--) { + struct dir_entry *entry = addr; + char *name = (char *)(entry + 1); + + ctx->pos = entry->offset; + if (!dir_emit(ctx, name, entry->name_len, entry->ino, + entry->type)) + return 1; + addr += sizeof(struct dir_entry) + entry->name_len; + ctx->pos++; + } + return 0; +} + static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_file_private *private = file->private_data; struct btrfs_dir_item *di; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_path *path; + void *addr; struct list_head ins_list; struct list_head del_list; int ret; struct extent_buffer *leaf; int slot; - unsigned char d_type; - int over = 0; - char tmp_name[32]; char *name_ptr; int name_len; + int entries = 0; + int total_len = 0; bool put = false; struct btrfs_key location; @@ -5902,12 +5955,14 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) if (!path) return -ENOMEM; + addr = private->filldir_buf; path->reada = READA_FORWARD; INIT_LIST_HEAD(&ins_list); INIT_LIST_HEAD(&del_list); put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list); +again: key.type = BTRFS_DIR_INDEX_KEY; key.offset = ctx->pos; key.objectid = 
btrfs_ino(BTRFS_I(inode)); @@ -5917,6 +5972,8 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) goto err; while (1) { + struct dir_entry *entry; + leaf = path->nodes[0]; slot = path->slots[0]; if (slot >= btrfs_header_nritems(leaf)) { @@ -5938,41 +5995,43 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) goto next; if (btrfs_should_delete_dir_index(&del_list, found_key.offset)) goto next; - - ctx->pos = found_key.offset; - di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); if (verify_dir_item(fs_info, leaf, slot, di)) goto next; name_len = btrfs_dir_name_len(leaf, di); - if (name_len <= sizeof(tmp_name)) { - name_ptr = tmp_name; - } else { - name_ptr = kmalloc(name_len, GFP_KERNEL); - if (!name_ptr) { - ret = -ENOMEM; - goto err; - } + if ((total_len + sizeof(struct dir_entry) + name_len) >= + PAGE_SIZE) { + btrfs_release_path(path); + ret = btrfs_filldir(private->filldir_buf, entries, ctx); + if (ret) + goto nopos; + addr = private->filldir_buf; + entries = 0; + total_len = 0; + goto again; } + + entry = addr; + entry->name_len = name_len; + name_ptr = (char *)(entry + 1); read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1), name_len); - - d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; + entry->type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; btrfs_dir_item_key_to_cpu(leaf, di, &location); - - over = !dir_emit(ctx, name_ptr, name_len, location.objectid, - d_type); - - if (name_ptr != tmp_name) - kfree(name_ptr); - - if (over) - goto nopos; - ctx->pos++; + entry->ino = location.objectid; + entry->offset = found_key.offset; + entries++; + addr += sizeof(struct dir_entry) + name_len; + total_len += sizeof(struct dir_entry) + name_len; next: path->slots[0]++; } + btrfs_release_path(path); + + ret = btrfs_filldir(private->filldir_buf, entries, ctx); + if (ret) + goto nopos; ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list); if (ret) @@ -6185,6 +6244,37 @@ static int btrfs_insert_inode_locked(struct inode *inode) btrfs_find_actor, &args); } +/* + * Inherit flags from the parent inode. + * + * Currently only the compression flags and the cow flags are inherited. 
+ */ +static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) +{ + unsigned int flags; + + if (!dir) + return; + + flags = BTRFS_I(dir)->flags; + + if (flags & BTRFS_INODE_NOCOMPRESS) { + BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; + BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + } else if (flags & BTRFS_INODE_COMPRESS) { + BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; + BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; + } + + if (flags & BTRFS_INODE_NODATACOW) { + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; + if (S_ISREG(inode->i_mode)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; + } + + btrfs_update_iflags(inode); +} + static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *dir, @@ -7991,7 +8081,7 @@ static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio, struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct bio *bio; int isector; - int read_mode = 0; + unsigned int read_mode = 0; int segs; int ret; blk_status_t status; @@ -8021,7 +8111,7 @@ static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio, bio_set_op_attrs(bio, REQ_OP_READ, read_mode); btrfs_debug(BTRFS_I(inode)->root->fs_info, - "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n", + "repair DIO read error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d", read_mode, failrec->this_mirror, failrec->in_validation); status = submit_dio_repair_bio(inode, bio, failrec->this_mirror); @@ -8106,7 +8196,7 @@ next_block_or_try_again: goto next; } - wait_for_completion(&done.done); + wait_for_completion_io(&done.done); if (!done.uptodate) { /* We might have another mirror, so try again */ @@ -8221,7 +8311,7 @@ try_again: goto next; } - wait_for_completion(&done.done); + wait_for_completion_io(&done.done); if (!done.uptodate) { /* We might have another mirror, so try again */ @@ -8428,7 +8518,7 @@ static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode, static inline blk_status_t __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset, - int skip_sum, int async_submit) + int async_submit) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_private *dip = bio->bi_private; @@ -8446,7 +8536,7 @@ __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset, goto err; } - if (skip_sum) + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) goto map; if (write && async_submit) { @@ -8476,8 +8566,7 @@ err: return ret; } -static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, - int skip_sum) +static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip) { struct inode *inode = dip->inode; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -8541,7 +8630,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, */ atomic_inc(&dip->pending_bios); - status = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, + status = __btrfs_submit_dio_bio(bio, inode, file_offset, async_submit); if (status) { bio_put(bio); @@ -8561,8 +8650,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, } while (submit_len > 0); submit: - status = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum, - async_submit); + status = __btrfs_submit_dio_bio(bio, inode, file_offset, async_submit); if (!status) return 0; @@ -8587,12 +8675,9 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, struct 
btrfs_dio_private *dip = NULL; struct bio *bio = NULL; struct btrfs_io_bio *io_bio; - int skip_sum; bool write = (bio_op(dio_bio) == REQ_OP_WRITE); int ret = 0; - skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - bio = btrfs_bio_clone(dio_bio); dip = kzalloc(sizeof(*dip), GFP_NOFS); @@ -8635,7 +8720,7 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, dio_data->unsubmitted_oe_range_end; } - ret = btrfs_submit_direct_hook(dip, skip_sum); + ret = btrfs_submit_direct_hook(dip); if (!ret) return; @@ -8735,7 +8820,6 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return 0; inode_dio_begin(inode); - smp_mb__after_atomic(); /* * The generic stuff only does filemap_write_and_wait_range, which @@ -9408,7 +9492,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->reserved_extents = 0; ei->runtime_flags = 0; - ei->force_compress = BTRFS_COMPRESS_NONE; + ei->prop_compress = BTRFS_COMPRESS_NONE; + ei->defrag_compress = BTRFS_COMPRESS_NONE; ei->delayed_node = NULL; @@ -10748,6 +10833,7 @@ static const struct file_operations btrfs_dir_file_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = btrfs_real_readdir, + .open = btrfs_opendir, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_compat_ioctl, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fa1b78cf25f6..0ebaf5d116bc 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -156,37 +156,6 @@ void btrfs_update_iflags(struct inode *inode) new_fl); } -/* - * Inherit flags from the parent inode. - * - * Currently only the compression flags and the cow flags are inherited. - */ -void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) -{ - unsigned int flags; - - if (!dir) - return; - - flags = BTRFS_I(dir)->flags; - - if (flags & BTRFS_INODE_NOCOMPRESS) { - BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; - } else if (flags & BTRFS_INODE_COMPRESS) { - BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; - BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; - } - - if (flags & BTRFS_INODE_NODATACOW) { - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; - if (S_ISREG(inode->i_mode)) - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; - } - - btrfs_update_iflags(inode); -} - static int btrfs_ioctl_getflags(struct file *file, void __user *arg) { struct btrfs_inode *ip = BTRFS_I(file_inode(file)); @@ -327,8 +296,10 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (fs_info->compress_type == BTRFS_COMPRESS_LZO) comp = "lzo"; - else + else if (fs_info->compress_type == BTRFS_COMPRESS_ZLIB) comp = "zlib"; + else + comp = "zstd"; ret = btrfs_set_prop(inode, "btrfs.compression", comp, strlen(comp), 0); if (ret) @@ -638,7 +609,7 @@ fail_free: return ret; } -static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root) +static void btrfs_wait_for_no_snapshotting_writes(struct btrfs_root *root) { s64 writers; DEFINE_WAIT(wait); @@ -681,9 +652,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, goto free_pending; } - atomic_inc(&root->will_be_snapshoted); + atomic_inc(&root->will_be_snapshotted); smp_mb__after_atomic(); - btrfs_wait_for_no_snapshoting_writes(root); + btrfs_wait_for_no_snapshotting_writes(root); ret = btrfs_start_delalloc_inodes(root, 0); if (ret) @@ -754,8 +725,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, fail: btrfs_subvolume_release_metadata(fs_info, 
&pending_snapshot->block_rsv); dec_and_free: - if (atomic_dec_and_test(&root->will_be_snapshoted)) - wake_up_atomic_t(&root->will_be_snapshoted); + if (atomic_dec_and_test(&root->will_be_snapshotted)) + wake_up_atomic_t(&root->will_be_snapshotted); free_pending: kfree(pending_snapshot->root_item); btrfs_free_path(pending_snapshot->path); @@ -1286,6 +1257,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, unsigned long cluster = max_cluster; u64 new_align = ~((u64)SZ_128K - 1); struct page **pages = NULL; + bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS; if (isize == 0) return 0; @@ -1293,7 +1265,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (range->start >= isize) return -EINVAL; - if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) { + if (do_compress) { if (range->compress_type > BTRFS_COMPRESS_TYPES) return -EINVAL; if (range->compress_type) @@ -1304,20 +1276,19 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, extent_thresh = SZ_256K; /* - * if we were not given a file, allocate a readahead - * context + * If we were not given a file, allocate a readahead context. As + * readahead is just an optimization, defrag will work without it so + * we don't error out. */ if (!file) { - ra = kzalloc(sizeof(*ra), GFP_NOFS); - if (!ra) - return -ENOMEM; - file_ra_state_init(ra, inode->i_mapping); + ra = kzalloc(sizeof(*ra), GFP_KERNEL); + if (ra) + file_ra_state_init(ra, inode->i_mapping); } else { ra = &file->f_ra; } - pages = kmalloc_array(max_cluster, sizeof(struct page *), - GFP_NOFS); + pages = kmalloc_array(max_cluster, sizeof(struct page *), GFP_KERNEL); if (!pages) { ret = -ENOMEM; goto out_ra; @@ -1373,8 +1344,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT, extent_thresh, &last_len, &skip, - &defrag_end, range->flags & - BTRFS_DEFRAG_RANGE_COMPRESS)) { + &defrag_end, do_compress)){ unsigned long next; /* * the should_defrag function tells us how much to skip @@ -1395,14 +1365,15 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (i + cluster > ra_index) { ra_index = max(i, ra_index); - btrfs_force_ra(inode->i_mapping, ra, file, ra_index, - cluster); + if (ra) + page_cache_sync_readahead(inode->i_mapping, ra, + file, ra_index, cluster); ra_index += cluster; } inode_lock(inode); - if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) - BTRFS_I(inode)->force_compress = compress_type; + if (do_compress) + BTRFS_I(inode)->defrag_compress = compress_type; ret = cluster_pages_for_defrag(inode, pages, i, cluster); if (ret < 0) { inode_unlock(inode); @@ -1449,7 +1420,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, filemap_flush(inode->i_mapping); } - if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) { + if (do_compress) { /* the filemap_flush will queue IO into the worker threads, but * we have to make sure the IO is actually started and that * ordered extents get created before we return @@ -1466,14 +1437,16 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (range->compress_type == BTRFS_COMPRESS_LZO) { btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); + } else if (range->compress_type == BTRFS_COMPRESS_ZSTD) { + btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); } ret = defrag_count; out_ra: - if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) { + if (do_compress) { inode_lock(inode); - BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE; + BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; inode_unlock(inode); } if 
(!file) @@ -1600,8 +1573,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, goto out_free; } - new_size = div_u64(new_size, fs_info->sectorsize); - new_size *= fs_info->sectorsize; + new_size = round_down(new_size, fs_info->sectorsize); btrfs_info_in_rcu(fs_info, "new size for %s is %llu", rcu_str_deref(device->name), new_size); @@ -2201,9 +2173,6 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file, buf_size = args.buf_size; - if (buf_size < sizeof(struct btrfs_ioctl_search_header)) - return -EOVERFLOW; - /* limit result size to 16MB */ if (buf_size > buf_limit) buf_size = buf_limit; @@ -3998,15 +3967,35 @@ static long btrfs_ioctl_trans_start(struct file *file) struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; + struct btrfs_file_private *private; int ret; + static bool warned = false; ret = -EPERM; if (!capable(CAP_SYS_ADMIN)) goto out; + if (!warned) { + btrfs_warn(fs_info, + "Userspace transaction mechanism is considered " + "deprecated and slated to be removed in 4.17. " + "If you have a valid use case please " + "speak up on the mailing list"); + WARN_ON(1); + warned = true; + } + ret = -EINPROGRESS; - if (file->private_data) + private = file->private_data; + if (private && private->trans) goto out; + if (!private) { + private = kzalloc(sizeof(struct btrfs_file_private), + GFP_KERNEL); + if (!private) + return -ENOMEM; + file->private_data = private; + } ret = -EROFS; if (btrfs_root_readonly(root)) @@ -4023,7 +4012,7 @@ static long btrfs_ioctl_trans_start(struct file *file) if (IS_ERR(trans)) goto out_drop; - file->private_data = trans; + private->trans = trans; return 0; out_drop: @@ -4278,14 +4267,13 @@ long btrfs_ioctl_trans_end(struct file *file) { struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; + struct btrfs_file_private *private = file->private_data; - trans = file->private_data; - if (!trans) + if (!private || !private->trans) return -EINVAL; - file->private_data = NULL; - btrfs_end_transaction(trans); + btrfs_end_transaction(private->trans); + private->trans = NULL; atomic_dec(&root->fs_info->open_ioctl_trans); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index fcae61e175f3..569205e651c7 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -44,7 +44,7 @@ static void print_dev_item(struct extent_buffer *eb, static void print_extent_data_ref(struct extent_buffer *eb, struct btrfs_extent_data_ref *ref) { - pr_info("\t\textent data backref root %llu objectid %llu offset %llu count %u\n", + pr_cont("extent data backref root %llu objectid %llu offset %llu count %u\n", btrfs_extent_data_ref_root(eb, ref), btrfs_extent_data_ref_objectid(eb, ref), btrfs_extent_data_ref_offset(eb, ref), @@ -63,6 +63,7 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type) u32 item_size = btrfs_item_size_nr(eb, slot); u64 flags; u64 offset; + int ref_index = 0; if (item_size < sizeof(*ei)) { #ifdef BTRFS_COMPAT_EXTENT_TREE_V0 @@ -104,12 +105,20 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type) iref = (struct btrfs_extent_inline_ref *)ptr; type = btrfs_extent_inline_ref_type(eb, iref); offset = btrfs_extent_inline_ref_offset(eb, iref); + pr_info("\t\tref#%d: ", ref_index++); switch (type) { case BTRFS_TREE_BLOCK_REF_KEY: - pr_info("\t\ttree block backref root %llu\n", offset); + pr_cont("tree block backref root %llu\n", offset); break; 
case BTRFS_SHARED_BLOCK_REF_KEY: - pr_info("\t\tshared block backref parent %llu\n", offset); + pr_cont("shared block backref parent %llu\n", offset); + /* + * offset is supposed to be a tree block which + * must be aligned to nodesize. + */ + if (!IS_ALIGNED(offset, eb->fs_info->nodesize)) + pr_info("\t\t\t(parent %llu is NOT ALIGNED to nodesize %llu)\n", + offset, (unsigned long long)eb->fs_info->nodesize); break; case BTRFS_EXTENT_DATA_REF_KEY: dref = (struct btrfs_extent_data_ref *)(&iref->offset); @@ -117,11 +126,20 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type) break; case BTRFS_SHARED_DATA_REF_KEY: sref = (struct btrfs_shared_data_ref *)(iref + 1); - pr_info("\t\tshared data backref parent %llu count %u\n", + pr_cont("shared data backref parent %llu count %u\n", offset, btrfs_shared_data_ref_count(eb, sref)); + /* + * offset is supposed to be a tree block which + * must be aligned to nodesize. + */ + if (!IS_ALIGNED(offset, eb->fs_info->nodesize)) + pr_info("\t\t\t(parent %llu is NOT ALIGNED to nodesize %llu)\n", + offset, (unsigned long long)eb->fs_info->nodesize); break; default: - BUG(); + pr_cont("(extent %llu has INVALID ref type %d)\n", + eb->start, type); + return; } ptr += btrfs_extent_inline_ref_size(type); } @@ -161,8 +179,9 @@ static void print_uuid_item(struct extent_buffer *l, unsigned long offset, } } -void btrfs_print_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *l) +void btrfs_print_leaf(struct extent_buffer *l) { + struct btrfs_fs_info *fs_info; int i; u32 type, nr; struct btrfs_item *item; @@ -180,6 +199,7 @@ void btrfs_print_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *l) if (!l) return; + fs_info = l->fs_info; nr = btrfs_header_nritems(l); btrfs_info(fs_info, "leaf %llu total ptrs %d free space %d", @@ -318,18 +338,20 @@ void btrfs_print_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *l) } } -void btrfs_print_tree(struct btrfs_fs_info *fs_info, struct extent_buffer *c) +void btrfs_print_tree(struct extent_buffer *c) { + struct btrfs_fs_info *fs_info; int i; u32 nr; struct btrfs_key key; int level; if (!c) return; + fs_info = c->fs_info; nr = btrfs_header_nritems(c); level = btrfs_header_level(c); if (level == 0) { - btrfs_print_leaf(fs_info, c); + btrfs_print_leaf(c); return; } btrfs_info(fs_info, @@ -359,7 +381,7 @@ void btrfs_print_tree(struct btrfs_fs_info *fs_info, struct extent_buffer *c) if (btrfs_header_level(next) != level - 1) BUG(); - btrfs_print_tree(fs_info, next); + btrfs_print_tree(next); free_extent_buffer(next); } } diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index 4f2e0ea0e95a..3afd508ed8c5 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -18,6 +18,6 @@ #ifndef __PRINT_TREE_ #define __PRINT_TREE_ -void btrfs_print_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *l); -void btrfs_print_tree(struct btrfs_fs_info *fs_info, struct extent_buffer *c); +void btrfs_print_leaf(struct extent_buffer *l); +void btrfs_print_tree(struct extent_buffer *c); #endif diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 4b23ae5d0e5c..f6a05f836629 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -390,6 +390,8 @@ static int prop_compression_validate(const char *value, size_t len) return 0; else if (!strncmp("zlib", value, len)) return 0; + else if (!strncmp("zstd", value, len)) + return 0; return -EINVAL; } @@ -403,32 +405,36 @@ static int prop_compression_apply(struct inode *inode, if (len == 0) { BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; 
BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE; + BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE; return 0; } - if (!strncmp("lzo", value, len)) + if (!strncmp("lzo", value, 3)) type = BTRFS_COMPRESS_LZO; - else if (!strncmp("zlib", value, len)) + else if (!strncmp("zlib", value, 4)) type = BTRFS_COMPRESS_ZLIB; + else if (!strncmp("zstd", value, len)) + type = BTRFS_COMPRESS_ZSTD; else return -EINVAL; BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->force_compress = type; + BTRFS_I(inode)->prop_compress = type; return 0; } static const char *prop_compression_extract(struct inode *inode) { - switch (BTRFS_I(inode)->force_compress) { + switch (BTRFS_I(inode)->prop_compress) { case BTRFS_COMPRESS_ZLIB: return "zlib"; case BTRFS_COMPRESS_LZO: return "lzo"; + case BTRFS_COMPRESS_ZSTD: + return "zstd"; } return NULL; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 4ce351efe281..5c8b61c86e61 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -946,7 +946,6 @@ out: int btrfs_quota_disable(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { - struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root *quota_root; int ret = 0; @@ -968,7 +967,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = btrfs_del_root(trans, tree_root, "a_root->root_key); + ret = btrfs_del_root(trans, fs_info, "a_root->root_key); if (ret) goto out; @@ -1603,7 +1602,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *eb = root_eb; struct btrfs_path *path = NULL; - BUG_ON(root_level < 0 || root_level > BTRFS_MAX_LEVEL); + BUG_ON(root_level < 0 || root_level >= BTRFS_MAX_LEVEL); BUG_ON(root_eb == NULL); if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) @@ -2646,7 +2645,7 @@ out: if (IS_ERR(trans)) { err = PTR_ERR(trans); btrfs_err(fs_info, - "fail to start transaction for status update: %d\n", + "fail to start transaction for status update: %d", err); goto done; } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 2cf6ba40f7c4..24a62224b24b 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1090,7 +1090,8 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, */ if (last_end == disk_start && stripe->dev->bdev && !last->bi_status && - last->bi_bdev == stripe->dev->bdev) { + last->bi_disk == stripe->dev->bdev->bd_disk && + last->bi_partno == stripe->dev->bdev->bd_partno) { ret = bio_add_page(last, page, PAGE_SIZE, 0); if (ret == PAGE_SIZE) return 0; @@ -1100,7 +1101,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, /* put a new bio on the list */ bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1); bio->bi_iter.bi_size = 0; - bio->bi_bdev = stripe->dev->bdev; + bio_set_dev(bio, stripe->dev->bdev); bio->bi_iter.bi_sector = disk_start >> 9; bio_add_page(bio, page, PAGE_SIZE, 0); @@ -1347,7 +1348,8 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, stripe_start = stripe->physical; if (physical >= stripe_start && physical < stripe_start + rbio->stripe_len && - bio->bi_bdev == stripe->dev->bdev) { + bio->bi_disk == stripe->dev->bdev->bd_disk && + bio->bi_partno == stripe->dev->bdev->bd_partno) { return i; } } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 65661d1aae4e..3a49a3c2fca4 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -32,6 +32,7 @@ #include "free-space-cache.h" #include "inode-map.h" #include 
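prop_compression_apply() now compares "lzo" and "zlib" over the literal's full length rather than the caller-supplied value length. A short demonstration of the pitfall the fixed lengths appear to guard against: bounding strncmp() by the value's length lets any prefix of a name match it, which matters once "zlib" and "zstd" share a leading character.

/* Sketch of the prefix-match pitfall with length-bounded comparison. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	const char *value = "z";		/* hypothetical short property value */
	size_t len = strlen(value);

	/* Bounded by the value length: "z" looks like "zlib". */
	printf("len-bounded match: %d\n", strncmp("zlib", value, len) == 0);

	/* Bounded by the literal length: "z" no longer matches. */
	printf("fixed-length match: %d\n", strncmp("zlib", value, 4) == 0);
	return 0;
}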
"qgroup.h" +#include "print-tree.h" /* * backref_node, mapping_node and tree_block start with this @@ -799,9 +800,17 @@ again: if (ptr < end) { /* update key for inline back ref */ struct btrfs_extent_inline_ref *iref; + int type; iref = (struct btrfs_extent_inline_ref *)ptr; - key.type = btrfs_extent_inline_ref_type(eb, iref); + type = btrfs_get_extent_inline_ref_type(eb, iref, + BTRFS_REF_TYPE_BLOCK); + if (type == BTRFS_REF_TYPE_INVALID) { + err = -EINVAL; + goto out; + } + key.type = type; key.offset = btrfs_extent_inline_ref_offset(eb, iref); + WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY && key.type != BTRFS_SHARED_BLOCK_REF_KEY); } @@ -1308,8 +1317,6 @@ static int __must_check __add_reloc_root(struct btrfs_root *root) btrfs_panic(fs_info, -EEXIST, "Duplicate root found for start=%llu while inserting into relocation tree", node->bytenr); - kfree(node); - return -EEXIST; } list_add_tail(&root->root_list, &rc->reloc_roots); @@ -3477,7 +3484,16 @@ again: goto again; } } - BUG_ON(ret); + if (ret) { + ASSERT(ret == 1); + btrfs_print_leaf(path->nodes[0]); + btrfs_err(fs_info, + "tree block extent item (%llu) is not found in extent tree", + bytenr); + WARN_ON(1); + ret = -EINVAL; + goto out; + } ret = add_tree_block(rc, &key, path, blocks); out: @@ -3755,7 +3771,8 @@ int add_data_references(struct reloc_control *rc, while (ptr < end) { iref = (struct btrfs_extent_inline_ref *)ptr; - key.type = btrfs_extent_inline_ref_type(eb, iref); + key.type = btrfs_get_extent_inline_ref_type(eb, iref, + BTRFS_REF_TYPE_DATA); if (key.type == BTRFS_SHARED_DATA_REF_KEY) { key.offset = btrfs_extent_inline_ref_offset(eb, iref); ret = __add_tree_block(rc, key.offset, blocksize, @@ -3765,7 +3782,10 @@ int add_data_references(struct reloc_control *rc, ret = find_data_references(rc, extent_key, eb, dref, blocks); } else { - BUG(); + ret = -EINVAL; + btrfs_err(rc->extent_root->fs_info, + "extent %llu slot %d has an invalid inline ref type", + eb->start, path->slots[0]); } if (ret) { err = ret; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 460db0cb2d07..9fb9896610e0 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -151,7 +151,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root } if (ret != 0) { - btrfs_print_leaf(fs_info, path->nodes[0]); + btrfs_print_leaf(path->nodes[0]); btrfs_crit(fs_info, "unable to update root key %llu %u %llu", key->objectid, key->type, key->offset); BUG_ON(1); @@ -335,10 +335,11 @@ int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info) return err; } -/* drop the root item for 'key' from 'root' */ -int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const struct btrfs_key *key) +/* drop the root item for 'key' from the tree root */ +int btrfs_del_root(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, const struct btrfs_key *key) { + struct btrfs_root *root = fs_info->tree_root; struct btrfs_path *path; int ret; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6f1e4c984b94..e3f6c49e5c4d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -182,8 +182,8 @@ struct scrub_ctx { struct scrub_bio *wr_curr_bio; struct mutex wr_lock; int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */ - atomic_t flush_all_writes; struct btrfs_device *wr_tgtdev; + bool flush_all_writes; /* * statistics @@ -717,7 +717,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) WARN_ON(!fs_info->dev_replace.tgtdev); sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; 
sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; - atomic_set(&sctx->flush_all_writes, 0); + sctx->flush_all_writes = false; } return sctx; @@ -1704,7 +1704,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, if (ret) return ret; - wait_for_completion(&done.event); + wait_for_completion_io(&done.event); if (done.status) return -EIO; @@ -1738,7 +1738,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, WARN_ON(!page->page); bio = btrfs_io_bio_alloc(1); - bio->bi_bdev = page->dev->bdev; + bio_set_dev(bio, page->dev->bdev); bio_add_page(bio, page->page, PAGE_SIZE, 0); if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) { @@ -1769,7 +1769,7 @@ static inline int scrub_check_fsid(u8 fsid[], struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices; int ret; - ret = memcmp(fsid, fs_devices->fsid, BTRFS_UUID_SIZE); + ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE); return !ret; } @@ -1826,7 +1826,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, } bio = btrfs_io_bio_alloc(1); - bio->bi_bdev = page_bad->dev->bdev; + bio_set_dev(bio, page_bad->dev->bdev); bio->bi_iter.bi_sector = page_bad->physical >> 9; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); @@ -1921,7 +1921,7 @@ again: bio->bi_private = sbio; bio->bi_end_io = scrub_wr_bio_end_io; - bio->bi_bdev = sbio->dev->bdev; + bio_set_dev(bio, sbio->dev->bdev); bio->bi_iter.bi_sector = sbio->physical >> 9; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); sbio->status = 0; @@ -1964,7 +1964,7 @@ static void scrub_wr_submit(struct scrub_ctx *sctx) sbio = sctx->wr_curr_bio; sctx->wr_curr_bio = NULL; - WARN_ON(!sbio->bio->bi_bdev); + WARN_ON(!sbio->bio->bi_disk); scrub_pending_bio_inc(sctx); /* process all writes in a single worker thread. Then the block layer * orders the requests before sending them to the driver which @@ -2321,7 +2321,7 @@ again: bio->bi_private = sbio; bio->bi_end_io = scrub_bio_end_io; - bio->bi_bdev = sbio->dev->bdev; + bio_set_dev(bio, sbio->dev->bdev); bio->bi_iter.bi_sector = sbio->physical >> 9; bio_set_op_attrs(bio, REQ_OP_READ, 0); sbio->status = 0; @@ -2402,8 +2402,7 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work) scrub_block_put(sblock); - if (sctx->is_dev_replace && - atomic_read(&sctx->flush_all_writes)) { + if (sctx->is_dev_replace && sctx->flush_all_writes) { mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); mutex_unlock(&sctx->wr_lock); @@ -2607,8 +2606,7 @@ static void scrub_bio_end_io_worker(struct btrfs_work *work) sctx->first_free = sbio->index; spin_unlock(&sctx->list_lock); - if (sctx->is_dev_replace && - atomic_read(&sctx->flush_all_writes)) { + if (sctx->is_dev_replace && sctx->flush_all_writes) { mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); mutex_unlock(&sctx->wr_lock); @@ -2622,7 +2620,8 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, u64 start, u64 len) { u64 offset; - int nsectors; + u64 nsectors64; + u32 nsectors; int sectorsize = sparity->sctx->fs_info->sectorsize; if (len >= sparity->stripe_len) { @@ -2633,7 +2632,10 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, start -= sparity->logic_start; start = div64_u64_rem(start, sparity->stripe_len, &offset); offset = div_u64(offset, sectorsize); - nsectors = (int)len / sectorsize; + nsectors64 = div_u64(len, sectorsize); + + ASSERT(nsectors64 < UINT_MAX); + nsectors = (u32)nsectors64; if (offset + nsectors <= sparity->nsectors) { bitmap_set(bitmap, offset, nsectors); @@ -2706,7 +2708,9 @@ static int 
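The scrub helpers also stop doing 32-bit arithmetic on 64-bit byte counts: the length is divided with div_u64() and the quotient range-checked before it is narrowed. A userspace sketch of the truncation this avoids, with plain 64-bit division standing in for div_u64():

/* Casting a 64-bit byte count to int before dividing loses the high bits;
 * dividing in 64 bits and checking the quotient's range does not. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t len = 6ULL << 30;		/* 6 GiB, larger than INT_MAX bytes */
	int sectorsize = 4096;

	int truncated = (int)len / sectorsize;	/* high bits already discarded */

	uint64_t nsectors64 = len / sectorsize;	/* stand-in for div_u64() */
	assert(nsectors64 < UINT32_MAX);
	uint32_t nsectors = (uint32_t)nsectors64;

	printf("truncated: %d correct: %u\n", truncated, nsectors);
	return 0;
}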
scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum) if (!sum) return 0; - index = ((u32)(logical - sum->bytenr)) / sctx->fs_info->sectorsize; + index = div_u64(logical - sum->bytenr, sctx->fs_info->sectorsize); + ASSERT(index < UINT_MAX); + num_sectors = sum->len / sctx->fs_info->sectorsize; memcpy(csum, sum->sums + index, sctx->csum_size); if (index == num_sectors - 1) { @@ -3440,14 +3444,14 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, */ if (atomic_read(&fs_info->scrub_pause_req)) { /* push queued extents */ - atomic_set(&sctx->flush_all_writes, 1); + sctx->flush_all_writes = true; scrub_submit(sctx); mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); mutex_unlock(&sctx->wr_lock); wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); - atomic_set(&sctx->flush_all_writes, 0); + sctx->flush_all_writes = false; scrub_blocked_if_needed(fs_info); } @@ -3869,8 +3873,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, ro_set = 0; } else { btrfs_warn(fs_info, - "failed setting block group ro, ret=%d\n", - ret); + "failed setting block group ro: %d", ret); btrfs_put_block_group(cache); break; } @@ -3893,7 +3896,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, * write requests are really completed when bios_in_flight * changes to 0. */ - atomic_set(&sctx->flush_all_writes, 1); + sctx->flush_all_writes = true; scrub_submit(sctx); mutex_lock(&sctx->wr_lock); scrub_wr_submit(sctx); @@ -3911,7 +3914,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, */ wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0); - atomic_set(&sctx->flush_all_writes, 0); + sctx->flush_all_writes = false; scrub_pause_off(fs_info); @@ -4012,14 +4015,8 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, int max_active = fs_info->thread_pool_size; if (fs_info->scrub_workers_refcnt == 0) { - if (is_dev_replace) - fs_info->scrub_workers = - btrfs_alloc_workqueue(fs_info, "scrub", flags, - 1, 4); - else - fs_info->scrub_workers = - btrfs_alloc_workqueue(fs_info, "scrub", flags, - max_active, 4); + fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", + flags, is_dev_replace ? 1 : max_active, 4); if (!fs_info->scrub_workers) goto fail_scrub_workers; @@ -4627,7 +4624,7 @@ static int write_page_nocow(struct scrub_ctx *sctx, bio = btrfs_io_bio_alloc(1); bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; - bio->bi_bdev = dev->bdev; + bio_set_dev(bio, dev->bdev); bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; ret = bio_add_page(bio, page, PAGE_SIZE, 0); if (ret != PAGE_SIZE) { diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 24b989fd130c..32b043ef8ac9 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4723,7 +4723,7 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) /* initial readahead */ memset(&sctx->ra, 0, sizeof(struct file_ra_state)); file_ra_state_init(&sctx->ra, inode->i_mapping); - btrfs_force_ra(inode->i_mapping, &sctx->ra, NULL, index, + page_cache_sync_readahead(inode->i_mapping, &sctx->ra, NULL, index, last_index - index + 1); while (index <= last_index) { @@ -4982,6 +4982,25 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_key key; int ret; + /* + * Prevent cloning from a zero offset with a length matching the sector + * size because in some scenarios this will make the receiver fail. 
+ * + * For example, if in the source filesystem the extent at offset 0 + * has a length of sectorsize and it was written using direct IO, then + * it can never be an inline extent (even if compression is enabled). + * Then this extent can be cloned in the original filesystem to a non + * zero file offset, but it may not be possible to clone in the + * destination filesystem because it can be inlined due to compression + * on the destination filesystem (as the receiver's write operations are + * always done using buffered IO). The same happens when the original + * filesystem does not have compression enabled but the destination + * filesystem has. + */ + if (clone_root->offset == 0 && + len == sctx->send_root->fs_info->sectorsize) + return send_extent_data(sctx, offset, len); + path = alloc_path_for_send(); if (!path) return -ENOMEM; diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index 875c757e73e2..5e2b92d83617 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -50,8 +50,8 @@ static inline void put_unaligned_le8(u8 val, void *p) */ #define DEFINE_BTRFS_SETGET_BITS(bits) \ -u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \ - unsigned long off, \ +u##bits btrfs_get_token_##bits(const struct extent_buffer *eb, \ + const void *ptr, unsigned long off, \ struct btrfs_map_token *token) \ { \ unsigned long part_offset = (unsigned long)ptr; \ @@ -90,7 +90,8 @@ u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr, \ return res; \ } \ void btrfs_set_token_##bits(struct extent_buffer *eb, \ - void *ptr, unsigned long off, u##bits val, \ + const void *ptr, unsigned long off, \ + u##bits val, \ struct btrfs_map_token *token) \ { \ unsigned long part_offset = (unsigned long)ptr; \ @@ -133,7 +134,7 @@ DEFINE_BTRFS_SETGET_BITS(16) DEFINE_BTRFS_SETGET_BITS(32) DEFINE_BTRFS_SETGET_BITS(64) -void btrfs_node_key(struct extent_buffer *eb, +void btrfs_node_key(const struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { unsigned long ptr = btrfs_node_key_ptr_offset(nr); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 12540b6104b5..2b13d1a69f0b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -61,6 +61,7 @@ #include "tests/btrfs-tests.h" #include "qgroup.h" +#include "backref.h" #define CREATE_TRACE_POINTS #include <trace/events/btrfs.h> @@ -425,7 +426,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, * strsep changes the string, duplicate it because parse_options * gets called twice */ - options = kstrdup(options, GFP_NOFS); + options = kstrdup(options, GFP_KERNEL); if (!options) return -ENOMEM; @@ -498,14 +499,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_test_opt(info, FORCE_COMPRESS); if (token == Opt_compress || token == Opt_compress_force || - strcmp(args[0].from, "zlib") == 0) { + strncmp(args[0].from, "zlib", 4) == 0) { compress_type = "zlib"; info->compress_type = BTRFS_COMPRESS_ZLIB; btrfs_set_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); no_compress = 0; - } else if (strcmp(args[0].from, "lzo") == 0) { + } else if (strncmp(args[0].from, "lzo", 3) == 0) { compress_type = "lzo"; info->compress_type = BTRFS_COMPRESS_LZO; btrfs_set_opt(info->mount_opt, COMPRESS); @@ -513,6 +514,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_clear_opt(info->mount_opt, NODATASUM); btrfs_set_fs_incompat(info, COMPRESS_LZO); no_compress = 0; + } else if 
(strcmp(args[0].from, "zstd") == 0) { + compress_type = "zstd"; + info->compress_type = BTRFS_COMPRESS_ZSTD; + btrfs_set_opt(info->mount_opt, COMPRESS); + btrfs_clear_opt(info->mount_opt, NODATACOW); + btrfs_clear_opt(info->mount_opt, NODATASUM); + btrfs_set_fs_incompat(info, COMPRESS_ZSTD); + no_compress = 0; } else if (strncmp(args[0].from, "no", 2) == 0) { compress_type = "no"; btrfs_clear_opt(info->mount_opt, COMPRESS); @@ -548,20 +557,22 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_ssd: btrfs_set_and_info(info, SSD, - "use ssd allocation scheme"); + "enabling ssd optimizations"); btrfs_clear_opt(info->mount_opt, NOSSD); break; case Opt_ssd_spread: + btrfs_set_and_info(info, SSD, + "enabling ssd optimizations"); btrfs_set_and_info(info, SSD_SPREAD, - "use spread ssd allocation scheme"); - btrfs_set_opt(info->mount_opt, SSD); + "using spread ssd allocation scheme"); btrfs_clear_opt(info->mount_opt, NOSSD); break; case Opt_nossd: - btrfs_set_and_info(info, NOSSD, - "not using ssd allocation scheme"); - btrfs_clear_opt(info->mount_opt, SSD); - btrfs_clear_opt(info->mount_opt, SSD_SPREAD); + btrfs_set_opt(info->mount_opt, NOSSD); + btrfs_clear_and_info(info, SSD, + "not using ssd optimizations"); + btrfs_clear_and_info(info, SSD_SPREAD, + "not using spread ssd allocation scheme"); break; case Opt_barrier: btrfs_clear_and_info(info, NOBARRIER, @@ -949,7 +960,7 @@ static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, } path->leave_spinning = 1; - name = kmalloc(PATH_MAX, GFP_NOFS); + name = kmalloc(PATH_MAX, GFP_KERNEL); if (!name) { ret = -ENOMEM; goto err; @@ -1227,8 +1238,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) if (btrfs_test_opt(info, COMPRESS)) { if (info->compress_type == BTRFS_COMPRESS_ZLIB) compress_type = "zlib"; - else + else if (info->compress_type == BTRFS_COMPRESS_LZO) compress_type = "lzo"; + else + compress_type = "zstd"; if (btrfs_test_opt(info, FORCE_COMPRESS)) seq_printf(seq, ",compress-force=%s", compress_type); else @@ -1335,10 +1348,11 @@ static char *setup_root_args(char *args) char *buf, *dst, *sep; if (!args) - return kstrdup("subvolid=0", GFP_NOFS); + return kstrdup("subvolid=0", GFP_KERNEL); /* The worst case is that we add ",subvolid=0" to the end. */ - buf = dst = kmalloc(strlen(args) + strlen(",subvolid=0") + 1, GFP_NOFS); + buf = dst = kmalloc(strlen(args) + strlen(",subvolid=0") + 1, + GFP_KERNEL); if (!buf) return NULL; @@ -1567,7 +1581,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, * it for searching for existing supers, so this lets us do that and * then open_ctree will properly initialize everything later. 
*/ - fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); + fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL); if (!fs_info) { error = -ENOMEM; goto error_sec_opts; @@ -1575,8 +1589,8 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, fs_info->fs_devices = fs_devices; - fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); - fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); + fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); + fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); security_init_mnt_opts(&fs_info->security_opts); if (!fs_info->super_copy || !fs_info->super_for_commit) { error = -ENOMEM; @@ -1780,8 +1794,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; } - if (fs_info->fs_devices->missing_devices > - fs_info->num_tolerated_disk_barrier_failures) { + if (!btrfs_check_rw_degradable(fs_info)) { btrfs_warn(fs_info, "too many missing devices, writeable remount is not allowed"); ret = -EACCES; @@ -1814,6 +1827,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; } + btrfs_qgroup_rescan_resume(fs_info); + if (!fs_info->uuid_root) { btrfs_info(fs_info, "creating UUID tree"); ret = btrfs_create_uuid_tree(fs_info); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index c2d5f3580b4c..2b6d37c09a81 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -200,6 +200,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(mixed_backref, MIXED_BACKREF); BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL); BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS); BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO); +BTRFS_FEAT_ATTR_INCOMPAT(compress_zstd, COMPRESS_ZSTD); BTRFS_FEAT_ATTR_INCOMPAT(big_metadata, BIG_METADATA); BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF); BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56); @@ -212,6 +213,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(default_subvol), BTRFS_FEAT_ATTR_PTR(mixed_groups), BTRFS_FEAT_ATTR_PTR(compress_lzo), + BTRFS_FEAT_ATTR_PTR(compress_zstd), BTRFS_FEAT_ATTR_PTR(big_metadata), BTRFS_FEAT_ATTR_PTR(extended_iref), BTRFS_FEAT_ATTR_PTR(raid56), diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index b18ab8f327a5..d3f25376a0f8 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -211,7 +211,6 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, cache->key.objectid = 0; cache->key.offset = length; cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - cache->sectorsize = fs_info->sectorsize; cache->full_stripe_len = fs_info->sectorsize; cache->fs_info = fs_info; diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index b29954c01673..1458bb0ea124 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -81,7 +81,7 @@ static int __check_free_space_extents(struct btrfs_trans_handle *trans, i++; } prev_bit = bit; - offset += cache->sectorsize; + offset += fs_info->sectorsize; } } if (prev_bit == 1) { diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 3a11ae63676e..ad7f4bab640b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1143,8 +1143,6 @@ again: goto again; } kfree(victim_name); - if (ret) - return ret; next: cur_offset += victim_name_len + sizeof(*extref); } @@ -3690,7 +3688,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, src_offset = 
btrfs_item_ptr_offset(src, start_slot + i); - if ((i == (nr - 1))) + if (i == nr - 1) last_key = ins_keys[i]; if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { @@ -4450,7 +4448,10 @@ static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, len = btrfs_file_extent_inline_len(leaf, path->slots[0], extent); - ASSERT(len == i_size); + ASSERT(len == i_size || + (len == fs_info->sectorsize && + btrfs_file_extent_compression(leaf, extent) != + BTRFS_COMPRESS_NONE)); return 0; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bd679bc7a1a9..c188256a367c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -152,7 +152,15 @@ struct list_head *btrfs_get_fs_uuids(void) return &fs_uuids; } -static struct btrfs_fs_devices *__alloc_fs_devices(void) +/* + * alloc_fs_devices - allocate struct btrfs_fs_devices + * @fsid: if not NULL, copy the uuid to fs_devices::fsid + * + * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR(). + * The returned struct is not linked onto any lists and can be destroyed with + * kfree() right away. + */ +static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) { struct btrfs_fs_devices *fs_devs; @@ -166,31 +174,8 @@ static struct btrfs_fs_devices *__alloc_fs_devices(void) INIT_LIST_HEAD(&fs_devs->resized_devices); INIT_LIST_HEAD(&fs_devs->alloc_list); INIT_LIST_HEAD(&fs_devs->list); - - return fs_devs; -} - -/** - * alloc_fs_devices - allocate struct btrfs_fs_devices - * @fsid: a pointer to UUID for this FS. If NULL a new UUID is - * generated. - * - * Return: a pointer to a new &struct btrfs_fs_devices on success; - * ERR_PTR() on error. Returned struct is not linked onto any lists and - * can be destroyed with kfree() right away. - */ -static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid) -{ - struct btrfs_fs_devices *fs_devs; - - fs_devs = __alloc_fs_devices(); - if (IS_ERR(fs_devs)) - return fs_devs; - if (fsid) memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); - else - generate_random_uuid(fs_devs->fsid); return fs_devs; } @@ -269,9 +254,17 @@ static struct btrfs_device *__alloc_device(void) return dev; } -static noinline struct btrfs_device *__find_device(struct list_head *head, - u64 devid, u8 *uuid) +/* + * Find a device specified by @devid or @uuid in the list of @fs_devices, or + * return NULL. + * + * If devid and uuid are both specified, the match must be exact, otherwise + * only devid is used. 
+ */ +static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices, + u64 devid, const u8 *uuid) { + struct list_head *head = &fs_devices->devices; struct btrfs_device *dev; list_for_each_entry(dev, head, dev_list) { @@ -310,7 +303,7 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, if (flush) filemap_write_and_wait((*bdev)->bd_inode->i_mapping); - ret = set_blocksize(*bdev, 4096); + ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE); if (ret) { blkdev_put(*bdev, flags); goto error; @@ -636,8 +629,8 @@ static noinline int device_list_add(const char *path, device = NULL; } else { - device = __find_device(&fs_devices->devices, devid, - disk_super->dev_item.uuid); + device = find_device(fs_devices, devid, + disk_super->dev_item.uuid); } if (!device) { @@ -1578,7 +1571,6 @@ out: static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, - u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 start, u64 num_bytes) { int ret; @@ -1606,12 +1598,12 @@ static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); - btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree); - btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); + btrfs_set_dev_extent_chunk_tree(leaf, extent, + BTRFS_CHUNK_TREE_OBJECTID); + btrfs_set_dev_extent_chunk_objectid(leaf, extent, + BTRFS_FIRST_CHUNK_TREE_OBJECTID); btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); - write_extent_buffer_chunk_tree_uuid(leaf, fs_info->chunk_tree_uuid); - btrfs_set_dev_extent_length(leaf, extent, num_bytes); btrfs_mark_buffer_dirty(leaf); out: @@ -1726,7 +1718,7 @@ static int btrfs_add_device(struct btrfs_trans_handle *trans, ptr = btrfs_device_uuid(dev_item); write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); ptr = btrfs_device_fsid(dev_item); - write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_UUID_SIZE); + write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_FSID_SIZE); btrfs_mark_buffer_dirty(leaf); ret = 0; @@ -1872,7 +1864,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, struct btrfs_fs_devices *cur_devices; u64 num_devices; int ret = 0; - bool clear_super = false; mutex_lock(&uuid_mutex); @@ -1908,7 +1899,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, list_del_init(&device->dev_alloc_list); device->fs_devices->rw_devices--; mutex_unlock(&fs_info->chunk_mutex); - clear_super = true; } mutex_unlock(&uuid_mutex); @@ -1987,9 +1977,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, free_fs_devices(cur_devices); } - fs_info->num_tolerated_disk_barrier_failures = - btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); - out: mutex_unlock(&uuid_mutex); return ret; @@ -2202,7 +2189,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) if (!fs_devices->seeding) return -EINVAL; - seed_devices = __alloc_fs_devices(); + seed_devices = alloc_fs_devices(NULL); if (IS_ERR(seed_devices)) return PTR_ERR(seed_devices); @@ -2261,7 +2248,7 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, struct btrfs_dev_item *dev_item; struct btrfs_device *device; struct btrfs_key key; - u8 fs_uuid[BTRFS_UUID_SIZE]; + u8 fs_uuid[BTRFS_FSID_SIZE]; u8 dev_uuid[BTRFS_UUID_SIZE]; u64 devid; int ret; @@ -2304,7 +2291,7 @@ next_slot: read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), BTRFS_UUID_SIZE); 
read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), - BTRFS_UUID_SIZE); + BTRFS_FSID_SIZE); device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); BUG_ON(!device); /* Logic error */ @@ -2407,7 +2394,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path device->is_tgtdev_for_dev_replace = 0; device->mode = FMODE_EXCL; device->dev_stats_valid = 1; - set_blocksize(device->bdev, 4096); + set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); if (seeding_dev) { sb->s_flags &= ~MS_RDONLY; @@ -2487,8 +2474,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path "sysfs: failed to create fsid for sprout"); } - fs_info->num_tolerated_disk_barrier_failures = - btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); ret = btrfs_commit_transaction(trans); if (seeding_dev) { @@ -2612,7 +2597,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, device->is_tgtdev_for_dev_replace = 1; device->mode = FMODE_EXCL; device->dev_stats_valid = 1; - set_blocksize(device->bdev, 4096); + set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); device->fs_devices = fs_info->fs_devices; list_add(&device->dev_list, &fs_info->fs_devices->devices); fs_info->fs_devices->num_devices++; @@ -2728,8 +2713,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, } static int btrfs_free_chunk(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 chunk_objectid, - u64 chunk_offset) + struct btrfs_fs_info *fs_info, u64 chunk_offset) { struct btrfs_root *root = fs_info->chunk_root; int ret; @@ -2740,7 +2724,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - key.objectid = chunk_objectid; + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; key.offset = chunk_offset; key.type = BTRFS_CHUNK_ITEM_KEY; @@ -2763,8 +2747,7 @@ out: return ret; } -static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, - u64 chunk_objectid, u64 chunk_offset) +static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) { struct btrfs_super_block *super_copy = fs_info->super_copy; struct btrfs_disk_key *disk_key; @@ -2797,7 +2780,7 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, ret = -EIO; break; } - if (key.objectid == chunk_objectid && + if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID && key.offset == chunk_offset) { memmove(ptr, ptr + len, array_size - (cur + len)); array_size -= len; @@ -2846,7 +2829,6 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, struct extent_map *em; struct map_lookup *map; u64 dev_extent_len = 0; - u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; int i, ret = 0; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; @@ -2902,7 +2884,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, } mutex_unlock(&fs_devices->device_list_mutex); - ret = btrfs_free_chunk(trans, fs_info, chunk_objectid, chunk_offset); + ret = btrfs_free_chunk(trans, fs_info, chunk_offset); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -2911,8 +2893,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len); if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { - ret = btrfs_del_sys_chunk(fs_info, chunk_objectid, - chunk_offset); + ret = btrfs_del_sys_chunk(fs_info, chunk_offset); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -3312,7 +3293,6 @@ static int chunk_devid_filter(struct extent_buffer *leaf, /* [pstart, pend) */ static int 
chunk_drange_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk, - u64 chunk_offset, struct btrfs_balance_args *bargs) { struct btrfs_stripe *stripe; @@ -3439,7 +3419,7 @@ static int should_balance_chunk(struct btrfs_fs_info *fs_info, /* drange filter, makes sense only with devid filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && - chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) { + chunk_drange_filter(leaf, chunk, bargs)) { return 0; } @@ -3898,13 +3878,6 @@ int btrfs_balance(struct btrfs_balance_control *bctl, meta_target, data_target); } - if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { - fs_info->num_tolerated_disk_barrier_failures = min( - btrfs_calc_num_tolerated_disk_barrier_failures(fs_info), - btrfs_get_num_tolerated_disk_barrier_failures( - bctl->sys.target)); - } - ret = insert_balance_item(fs_info, bctl); if (ret && ret != -EEXIST) goto out; @@ -3927,11 +3900,6 @@ int btrfs_balance(struct btrfs_balance_control *bctl, mutex_lock(&fs_info->balance_mutex); atomic_dec(&fs_info->balance_running); - if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { - fs_info->num_tolerated_disk_barrier_failures = - btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); - } - if (bargs) { memset(bargs, 0, sizeof(*bargs)); update_ioctl_balance_args(fs_info, 0, bargs); @@ -4127,7 +4095,6 @@ static int btrfs_uuid_scan_kthread(void *data) struct btrfs_fs_info *fs_info = data; struct btrfs_root *root = fs_info->tree_root; struct btrfs_key key; - struct btrfs_key max_key; struct btrfs_path *path = NULL; int ret = 0; struct extent_buffer *eb; @@ -4146,10 +4113,6 @@ static int btrfs_uuid_scan_kthread(void *data) key.type = BTRFS_ROOT_ITEM_KEY; key.offset = 0; - max_key.objectid = (u64)-1; - max_key.type = BTRFS_ROOT_ITEM_KEY; - max_key.offset = (u64)-1; - while (1) { ret = btrfs_search_forward(root, &key, path, 0); if (ret) { @@ -4601,12 +4564,6 @@ static int btrfs_cmp_device_info(const void *a, const void *b) return 0; } -static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) -{ - /* TODO allow them to set a preferred stripe size */ - return SZ_64K; -} - static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) { if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) @@ -4629,7 +4586,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_fs_devices *fs_devices = info->fs_devices; - struct list_head *cur; + struct btrfs_device *device; struct map_lookup *map = NULL; struct extent_map_tree *em_tree; struct extent_map *em; @@ -4649,7 +4606,6 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 max_chunk_size; u64 stripe_size; u64 num_bytes; - u64 raid_stripe_len = BTRFS_STRIPE_LEN; int ndevs; int i; int j; @@ -4703,22 +4659,15 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (!devices_info) return -ENOMEM; - cur = fs_devices->alloc_list.next; - /* * in the first pass through the devices list, we gather information * about the available holes on each device. 
*/ ndevs = 0; - while (cur != &fs_devices->alloc_list) { - struct btrfs_device *device; + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { u64 max_avail; u64 dev_offset; - device = list_entry(cur, struct btrfs_device, dev_alloc_list); - - cur = cur->next; - if (!device->writeable) { WARN(1, KERN_ERR "BTRFS: read-only device in alloc_list\n"); @@ -4769,15 +4718,15 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, btrfs_cmp_device_info, NULL); /* round down to number of usable stripes */ - ndevs -= ndevs % devs_increment; + ndevs = round_down(ndevs, devs_increment); if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) { ret = -ENOSPC; goto error; } - if (devs_max && ndevs > devs_max) - ndevs = devs_max; + ndevs = min(ndevs, devs_max); + /* * the primary goal is to maximize the number of stripes, so use as many * devices as possible, even if the stripes are not maximum sized. @@ -4791,16 +4740,11 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, */ data_stripes = num_stripes / ncopies; - if (type & BTRFS_BLOCK_GROUP_RAID5) { - raid_stripe_len = find_raid56_stripe_len(ndevs - 1, - info->stripesize); + if (type & BTRFS_BLOCK_GROUP_RAID5) data_stripes = num_stripes - 1; - } - if (type & BTRFS_BLOCK_GROUP_RAID6) { - raid_stripe_len = find_raid56_stripe_len(ndevs - 2, - info->stripesize); + + if (type & BTRFS_BLOCK_GROUP_RAID6) data_stripes = num_stripes - 2; - } /* * Use the number of data stripes to figure out how big this chunk @@ -4825,8 +4769,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, stripe_size = div_u64(stripe_size, dev_stripes); /* align to BTRFS_STRIPE_LEN */ - stripe_size = div64_u64(stripe_size, raid_stripe_len); - stripe_size *= raid_stripe_len; + stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN); map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); if (!map) { @@ -4843,10 +4786,9 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, j * stripe_size; } } - map->sector_size = info->sectorsize; - map->stripe_len = raid_stripe_len; - map->io_align = raid_stripe_len; - map->io_width = raid_stripe_len; + map->stripe_len = BTRFS_STRIPE_LEN; + map->io_align = BTRFS_STRIPE_LEN; + map->io_width = BTRFS_STRIPE_LEN; map->type = type; map->sub_stripes = sub_stripes; @@ -4881,9 +4823,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, goto error; } - ret = btrfs_make_block_group(trans, info, 0, type, - BTRFS_FIRST_CHUNK_TREE_OBJECTID, - start, num_bytes); + ret = btrfs_make_block_group(trans, info, 0, type, start, num_bytes); if (ret) goto error_del_extent; @@ -4963,11 +4903,8 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, ret = btrfs_update_device(trans, device); if (ret) break; - ret = btrfs_alloc_dev_extent(trans, device, - chunk_root->root_key.objectid, - BTRFS_FIRST_CHUNK_TREE_OBJECTID, - chunk_offset, dev_offset, - stripe_size); + ret = btrfs_alloc_dev_extent(trans, device, chunk_offset, + dev_offset, stripe_size); if (ret) break; } @@ -5172,7 +5109,6 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) } unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, - struct btrfs_mapping_tree *map_tree, u64 logical) { struct extent_map *em; @@ -5180,29 +5116,30 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, unsigned long len = fs_info->sectorsize; em = get_chunk_map(fs_info, logical, len); - WARN_ON(IS_ERR(em)); - map = em->map_lookup; - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) - len 
= map->stripe_len * nr_data_stripes(map); - free_extent_map(em); + if (!WARN_ON(IS_ERR(em))) { + map = em->map_lookup; + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) + len = map->stripe_len * nr_data_stripes(map); + free_extent_map(em); + } return len; } -int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, - u64 logical, u64 len, int mirror_num) +int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { struct extent_map *em; struct map_lookup *map; int ret = 0; em = get_chunk_map(fs_info, logical, len); - WARN_ON(IS_ERR(em)); - map = em->map_lookup; - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) - ret = 1; - free_extent_map(em); + if(!WARN_ON(IS_ERR(em))) { + map = em->map_lookup; + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) + ret = 1; + free_extent_map(em); + } return ret; } @@ -6188,7 +6125,7 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, rcu_read_unlock(); } #endif - bio->bi_bdev = dev->bdev; + bio_set_dev(bio, dev->bdev); btrfs_bio_counter_inc_noblocked(fs_info); @@ -6295,9 +6232,8 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, cur_devices = fs_info->fs_devices; while (cur_devices) { if (!fsid || - !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { - device = __find_device(&cur_devices->devices, - devid, uuid); + !memcmp(cur_devices->fsid, fsid, BTRFS_FSID_SIZE)) { + device = find_device(cur_devices, devid, uuid); if (device) return device; } @@ -6450,7 +6386,6 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, struct extent_map *em; u64 logical; u64 length; - u64 stripe_len; u64 devid; u8 uuid[BTRFS_UUID_SIZE]; int num_stripes; @@ -6459,7 +6394,6 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, logical = key->offset; length = btrfs_chunk_length(leaf, chunk); - stripe_len = btrfs_chunk_stripe_len(leaf, chunk); num_stripes = btrfs_chunk_num_stripes(leaf, chunk); ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical); @@ -6498,7 +6432,6 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, map->num_stripes = num_stripes; map->io_width = btrfs_chunk_io_width(leaf, chunk); map->io_align = btrfs_chunk_io_align(leaf, chunk); - map->sector_size = btrfs_chunk_sector_size(leaf, chunk); map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); map->type = btrfs_chunk_type(leaf, chunk); map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); @@ -6514,6 +6447,7 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, if (!map->stripes[i].dev && !btrfs_test_opt(fs_info, DEGRADED)) { free_extent_map(em); + btrfs_report_missing_device(fs_info, devid, uuid); return -EIO; } if (!map->stripes[i].dev) { @@ -6524,8 +6458,7 @@ static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key, free_extent_map(em); return -EIO; } - btrfs_warn(fs_info, "devid %llu uuid %pU is missing", - devid, uuid); + btrfs_report_missing_device(fs_info, devid, uuid); } map->stripes[i].dev->in_fs_metadata = 1; } @@ -6569,10 +6502,11 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, int ret; BUG_ON(!mutex_is_locked(&uuid_mutex)); + ASSERT(fsid); fs_devices = fs_info->fs_devices->seed; while (fs_devices) { - if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) + if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) return fs_devices; fs_devices = fs_devices->seed; @@ -6625,16 +6559,16 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, struct 
btrfs_device *device; u64 devid; int ret; - u8 fs_uuid[BTRFS_UUID_SIZE]; + u8 fs_uuid[BTRFS_FSID_SIZE]; u8 dev_uuid[BTRFS_UUID_SIZE]; devid = btrfs_device_id(leaf, dev_item); read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), BTRFS_UUID_SIZE); read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), - BTRFS_UUID_SIZE); + BTRFS_FSID_SIZE); - if (memcmp(fs_uuid, fs_info->fsid, BTRFS_UUID_SIZE)) { + if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) { fs_devices = open_seed_devices(fs_info, fs_uuid); if (IS_ERR(fs_devices)) return PTR_ERR(fs_devices); @@ -6642,17 +6576,21 @@ static int read_one_dev(struct btrfs_fs_info *fs_info, device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid); if (!device) { - if (!btrfs_test_opt(fs_info, DEGRADED)) + if (!btrfs_test_opt(fs_info, DEGRADED)) { + btrfs_report_missing_device(fs_info, devid, dev_uuid); return -EIO; + } device = add_missing_dev(fs_devices, devid, dev_uuid); if (!device) return -ENOMEM; - btrfs_warn(fs_info, "devid %llu uuid %pU missing", - devid, dev_uuid); + btrfs_report_missing_device(fs_info, devid, dev_uuid); } else { - if (!device->bdev && !btrfs_test_opt(fs_info, DEGRADED)) - return -EIO; + if (!device->bdev) { + btrfs_report_missing_device(fs_info, devid, dev_uuid); + if (!btrfs_test_opt(fs_info, DEGRADED)) + return -EIO; + } if(!device->bdev && !device->missing) { /* @@ -6818,6 +6756,70 @@ out_short_read: return -EIO; } +void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, u64 devid, + u8 *uuid) +{ + btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", devid, uuid); +} + +/* + * Check if all chunks in the fs are OK for read-write degraded mount + * + * Return true if all chunks meet the minimal RW mount requirements. + * Return false if any chunk doesn't meet the minimal RW mount requirements. + */ +bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info) +{ + struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; + struct extent_map *em; + u64 next_start = 0; + bool ret = true; + + read_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1); + read_unlock(&map_tree->map_tree.lock); + /* No chunk at all? 
Return false anyway */ + if (!em) { + ret = false; + goto out; + } + while (em) { + struct map_lookup *map; + int missing = 0; + int max_tolerated; + int i; + + map = em->map_lookup; + max_tolerated = + btrfs_get_num_tolerated_disk_barrier_failures( + map->type); + for (i = 0; i < map->num_stripes; i++) { + struct btrfs_device *dev = map->stripes[i].dev; + + if (!dev || !dev->bdev || dev->missing || + dev->last_flush_error) + missing++; + } + if (missing > max_tolerated) { + btrfs_warn(fs_info, + "chunk %llu missing %d devices, max tolerance is %d for writeable mount", + em->start, missing, max_tolerated); + free_extent_map(em); + ret = false; + goto out; + } + next_start = extent_map_end(em); + free_extent_map(em); + + read_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, next_start, + (u64)(-1) - next_start); + read_unlock(&map_tree->map_tree.lock); + } +out: + return ret; +} + int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) { struct btrfs_root *root = fs_info->chunk_root; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 93277fc60930..6108fdfec67f 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -353,7 +353,6 @@ struct map_lookup { int io_align; int io_width; u64 stripe_len; - int sector_size; int num_stripes; int sub_stripes; struct btrfs_bio_stripe stripes[]; @@ -481,9 +480,8 @@ void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path); int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, - u64 logical, u64 len, int mirror_num); + u64 logical, u64 len); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, - struct btrfs_mapping_tree *map_tree, u64 logical); int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, @@ -543,4 +541,8 @@ struct list_head *btrfs_get_fs_uuids(void); void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); +bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info); +void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, u64 devid, + u8 *uuid); + #endif diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c new file mode 100644 index 000000000000..607ce47b483a --- /dev/null +++ b/fs/btrfs/zstd.c @@ -0,0 +1,432 @@ +/* + * Copyright (c) 2016-present, Facebook, Inc. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
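btrfs_check_rw_degradable() replaces the single num_tolerated_disk_barrier_failures figure with a per-chunk test: a writeable mount is refused if any chunk has more missing devices than its profile can lose. A rough userspace sketch of that loop; the profiles and tolerance values below are illustrative assumptions, in the kernel they come from btrfs_get_num_tolerated_disk_barrier_failures():

/* Illustrative only: per-chunk writeable-mount check modeled on
 * btrfs_check_rw_degradable(); tolerance values are assumed, not quoted. */
#include <stdbool.h>
#include <stdio.h>

struct chunk {
	const char *profile;
	int missing;		/* devices of this chunk that are absent */
	int max_tolerated;	/* how many the profile can lose */
};

static bool rw_degradable(const struct chunk *c, int nr)
{
	for (int i = 0; i < nr; i++) {
		if (c[i].missing > c[i].max_tolerated) {
			printf("chunk %s: %d missing, tolerance %d -> refuse rw mount\n",
			       c[i].profile, c[i].missing, c[i].max_tolerated);
			return false;
		}
	}
	return true;
}

int main(void)
{
	struct chunk chunks[] = {
		{ "RAID1 metadata", 1, 1 },	/* one lost mirror is survivable */
		{ "RAID0 data",     1, 0 },	/* no redundancy, one loss is fatal */
	};

	printf("writeable degraded mount ok: %d\n", rw_degradable(chunks, 2));
	return 0;
}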
+ */ +#include <linux/bio.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/pagemap.h> +#include <linux/refcount.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/zstd.h> +#include "compression.h" + +#define ZSTD_BTRFS_MAX_WINDOWLOG 17 +#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG) +#define ZSTD_BTRFS_DEFAULT_LEVEL 3 + +static ZSTD_parameters zstd_get_btrfs_parameters(size_t src_len) +{ + ZSTD_parameters params = ZSTD_getParams(ZSTD_BTRFS_DEFAULT_LEVEL, + src_len, 0); + + if (params.cParams.windowLog > ZSTD_BTRFS_MAX_WINDOWLOG) + params.cParams.windowLog = ZSTD_BTRFS_MAX_WINDOWLOG; + WARN_ON(src_len > ZSTD_BTRFS_MAX_INPUT); + return params; +} + +struct workspace { + void *mem; + size_t size; + char *buf; + struct list_head list; +}; + +static void zstd_free_workspace(struct list_head *ws) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + + kvfree(workspace->mem); + kfree(workspace->buf); + kfree(workspace); +} + +static struct list_head *zstd_alloc_workspace(void) +{ + ZSTD_parameters params = + zstd_get_btrfs_parameters(ZSTD_BTRFS_MAX_INPUT); + struct workspace *workspace; + + workspace = kzalloc(sizeof(*workspace), GFP_KERNEL); + if (!workspace) + return ERR_PTR(-ENOMEM); + + workspace->size = max_t(size_t, + ZSTD_CStreamWorkspaceBound(params.cParams), + ZSTD_DStreamWorkspaceBound(ZSTD_BTRFS_MAX_INPUT)); + workspace->mem = kvmalloc(workspace->size, GFP_KERNEL); + workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!workspace->mem || !workspace->buf) + goto fail; + + INIT_LIST_HEAD(&workspace->list); + + return &workspace->list; +fail: + zstd_free_workspace(&workspace->list); + return ERR_PTR(-ENOMEM); +} + +static int zstd_compress_pages(struct list_head *ws, + struct address_space *mapping, + u64 start, + struct page **pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + ZSTD_CStream *stream; + int ret = 0; + int nr_pages = 0; + struct page *in_page = NULL; /* The current page to read */ + struct page *out_page = NULL; /* The current page to write to */ + ZSTD_inBuffer in_buf = { NULL, 0, 0 }; + ZSTD_outBuffer out_buf = { NULL, 0, 0 }; + unsigned long tot_in = 0; + unsigned long tot_out = 0; + unsigned long len = *total_out; + const unsigned long nr_dest_pages = *out_pages; + unsigned long max_out = nr_dest_pages * PAGE_SIZE; + ZSTD_parameters params = zstd_get_btrfs_parameters(len); + + *out_pages = 0; + *total_out = 0; + *total_in = 0; + + /* Initialize the stream */ + stream = ZSTD_initCStream(params, len, workspace->mem, + workspace->size); + if (!stream) { + pr_warn("BTRFS: ZSTD_initCStream failed\n"); + ret = -EIO; + goto out; + } + + /* map in the first page of input data */ + in_page = find_get_page(mapping, start >> PAGE_SHIFT); + in_buf.src = kmap(in_page); + in_buf.pos = 0; + in_buf.size = min_t(size_t, len, PAGE_SIZE); + + + /* Allocate and map in the output buffer */ + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + pages[nr_pages++] = out_page; + out_buf.dst = kmap(out_page); + out_buf.pos = 0; + out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + + while (1) { + size_t ret2; + + ret2 = ZSTD_compressStream(stream, &out_buf, &in_buf); + if (ZSTD_isError(ret2)) { + pr_debug("BTRFS: ZSTD_compressStream returned %d\n", + ZSTD_getErrorCode(ret2)); + ret = -EIO; + goto 
out; + } + + /* Check to see if we are making it bigger */ + if (tot_in + in_buf.pos > 8192 && + tot_in + in_buf.pos < + tot_out + out_buf.pos) { + ret = -E2BIG; + goto out; + } + + /* We've reached the end of our output range */ + if (out_buf.pos >= max_out) { + tot_out += out_buf.pos; + ret = -E2BIG; + goto out; + } + + /* Check if we need more output space */ + if (out_buf.pos == out_buf.size) { + tot_out += PAGE_SIZE; + max_out -= PAGE_SIZE; + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -E2BIG; + goto out; + } + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + pages[nr_pages++] = out_page; + out_buf.dst = kmap(out_page); + out_buf.pos = 0; + out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + } + + /* We've reached the end of the input */ + if (in_buf.pos >= len) { + tot_in += in_buf.pos; + break; + } + + /* Check if we need more input */ + if (in_buf.pos == in_buf.size) { + tot_in += PAGE_SIZE; + kunmap(in_page); + put_page(in_page); + + start += PAGE_SIZE; + len -= PAGE_SIZE; + in_page = find_get_page(mapping, start >> PAGE_SHIFT); + in_buf.src = kmap(in_page); + in_buf.pos = 0; + in_buf.size = min_t(size_t, len, PAGE_SIZE); + } + } + while (1) { + size_t ret2; + + ret2 = ZSTD_endStream(stream, &out_buf); + if (ZSTD_isError(ret2)) { + pr_debug("BTRFS: ZSTD_endStream returned %d\n", + ZSTD_getErrorCode(ret2)); + ret = -EIO; + goto out; + } + if (ret2 == 0) { + tot_out += out_buf.pos; + break; + } + if (out_buf.pos >= max_out) { + tot_out += out_buf.pos; + ret = -E2BIG; + goto out; + } + + tot_out += PAGE_SIZE; + max_out -= PAGE_SIZE; + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -E2BIG; + goto out; + } + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + pages[nr_pages++] = out_page; + out_buf.dst = kmap(out_page); + out_buf.pos = 0; + out_buf.size = min_t(size_t, max_out, PAGE_SIZE); + } + + if (tot_out >= tot_in) { + ret = -E2BIG; + goto out; + } + + ret = 0; + *total_in = tot_in; + *total_out = tot_out; +out: + *out_pages = nr_pages; + /* Cleanup */ + if (in_page) { + kunmap(in_page); + put_page(in_page); + } + if (out_page) + kunmap(out_page); + return ret; +} + +static int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + struct page **pages_in = cb->compressed_pages; + u64 disk_start = cb->start; + struct bio *orig_bio = cb->orig_bio; + size_t srclen = cb->compressed_len; + ZSTD_DStream *stream; + int ret = 0; + unsigned long page_in_index = 0; + unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_SIZE); + unsigned long buf_start; + unsigned long total_out = 0; + ZSTD_inBuffer in_buf = { NULL, 0, 0 }; + ZSTD_outBuffer out_buf = { NULL, 0, 0 }; + + stream = ZSTD_initDStream( + ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); + if (!stream) { + pr_debug("BTRFS: ZSTD_initDStream failed\n"); + ret = -EIO; + goto done; + } + + in_buf.src = kmap(pages_in[page_in_index]); + in_buf.pos = 0; + in_buf.size = min_t(size_t, srclen, PAGE_SIZE); + + out_buf.dst = workspace->buf; + out_buf.pos = 0; + out_buf.size = PAGE_SIZE; + + while (1) { + size_t ret2; + + ret2 = ZSTD_decompressStream(stream, &out_buf, &in_buf); + if (ZSTD_isError(ret2)) { + pr_debug("BTRFS: ZSTD_decompressStream returned %d\n", + ZSTD_getErrorCode(ret2)); + ret = -EIO; + goto done; + } + buf_start = total_out; + total_out += 
out_buf.pos; + out_buf.pos = 0; + + ret = btrfs_decompress_buf2page(out_buf.dst, buf_start, + total_out, disk_start, orig_bio); + if (ret == 0) + break; + + if (in_buf.pos >= srclen) + break; + + /* Check if we've hit the end of a frame */ + if (ret2 == 0) + break; + + if (in_buf.pos == in_buf.size) { + kunmap(pages_in[page_in_index++]); + if (page_in_index >= total_pages_in) { + in_buf.src = NULL; + ret = -EIO; + goto done; + } + srclen -= PAGE_SIZE; + in_buf.src = kmap(pages_in[page_in_index]); + in_buf.pos = 0; + in_buf.size = min_t(size_t, srclen, PAGE_SIZE); + } + } + ret = 0; + zero_fill_bio(orig_bio); +done: + if (in_buf.src) + kunmap(pages_in[page_in_index]); + return ret; +} + +static int zstd_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen) +{ + struct workspace *workspace = list_entry(ws, struct workspace, list); + ZSTD_DStream *stream; + int ret = 0; + size_t ret2; + ZSTD_inBuffer in_buf = { NULL, 0, 0 }; + ZSTD_outBuffer out_buf = { NULL, 0, 0 }; + unsigned long total_out = 0; + unsigned long pg_offset = 0; + char *kaddr; + + stream = ZSTD_initDStream( + ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); + if (!stream) { + pr_warn("BTRFS: ZSTD_initDStream failed\n"); + ret = -EIO; + goto finish; + } + + destlen = min_t(size_t, destlen, PAGE_SIZE); + + in_buf.src = data_in; + in_buf.pos = 0; + in_buf.size = srclen; + + out_buf.dst = workspace->buf; + out_buf.pos = 0; + out_buf.size = PAGE_SIZE; + + ret2 = 1; + while (pg_offset < destlen && in_buf.pos < in_buf.size) { + unsigned long buf_start; + unsigned long buf_offset; + unsigned long bytes; + + /* Check if the frame is over and we still need more input */ + if (ret2 == 0) { + pr_debug("BTRFS: ZSTD_decompressStream ended early\n"); + ret = -EIO; + goto finish; + } + ret2 = ZSTD_decompressStream(stream, &out_buf, &in_buf); + if (ZSTD_isError(ret2)) { + pr_debug("BTRFS: ZSTD_decompressStream returned %d\n", + ZSTD_getErrorCode(ret2)); + ret = -EIO; + goto finish; + } + + buf_start = total_out; + total_out += out_buf.pos; + out_buf.pos = 0; + + if (total_out <= start_byte) + continue; + + if (total_out > start_byte && buf_start < start_byte) + buf_offset = start_byte - buf_start; + else + buf_offset = 0; + + bytes = min_t(unsigned long, destlen - pg_offset, + out_buf.size - buf_offset); + + kaddr = kmap_atomic(dest_page); + memcpy(kaddr + pg_offset, out_buf.dst + buf_offset, bytes); + kunmap_atomic(kaddr); + + pg_offset += bytes; + } + ret = 0; +finish: + if (pg_offset < destlen) { + kaddr = kmap_atomic(dest_page); + memset(kaddr + pg_offset, 0, destlen - pg_offset); + kunmap_atomic(kaddr); + } + return ret; +} + +const struct btrfs_compress_op btrfs_zstd_compress = { + .alloc_workspace = zstd_alloc_workspace, + .free_workspace = zstd_free_workspace, + .compress_pages = zstd_compress_pages, + .decompress_bio = zstd_decompress_bio, + .decompress = zstd_decompress, +}; diff --git a/fs/buffer.c b/fs/buffer.c index 5715dac7821f..170df856bdb9 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1627,20 +1627,17 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) struct pagevec pvec; pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits); pgoff_t end; - int i; + int i, count; struct buffer_head *bh; struct buffer_head *head; end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits); pagevec_init(&pvec, 0); - while (index <= end && pagevec_lookup(&pvec, bd_mapping, index, - min(end - index, 
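The new fs/btrfs/zstd.c drives the kernel's streaming zstd interface with a level-3 parameter set (ZSTD_BTRFS_DEFAULT_LEVEL) and a window log capped at 17, i.e. a 128KiB window matching the largest compressed extent. For comparison only, a userspace one-shot compression at the same level using the ordinary libzstd API (link with -lzstd); this is not the interface the kernel file uses:

/* Userspace analogue: one-shot level-3 zstd compression via libzstd. */
#include <stdio.h>
#include <stdlib.h>
#include <zstd.h>

int main(void)
{
	const char src[] = "btrfs btrfs btrfs btrfs btrfs btrfs btrfs btrfs";
	size_t bound = ZSTD_compressBound(sizeof(src));
	void *dst = malloc(bound);

	if (!dst)
		return 1;
	size_t csize = ZSTD_compress(dst, bound, src, sizeof(src), 3 /* level */);
	if (ZSTD_isError(csize)) {
		fprintf(stderr, "compress failed: %s\n", ZSTD_getErrorName(csize));
		free(dst);
		return 1;
	}
	printf("%zu -> %zu bytes at level 3\n", sizeof(src), csize);
	free(dst);
	return 0;
}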
(pgoff_t)PAGEVEC_SIZE - 1) + 1)) { - for (i = 0; i < pagevec_count(&pvec); i++) { + while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) { + count = pagevec_count(&pvec); + for (i = 0; i < count; i++) { struct page *page = pvec.pages[i]; - index = page->index; - if (index > end) - break; if (!page_has_buffers(page)) continue; /* @@ -1670,7 +1667,9 @@ unlock_page: } pagevec_release(&pvec); cond_resched(); - index++; + /* End of range already reached? */ + if (index > end || !index) + break; } } EXPORT_SYMBOL(clean_bdev_aliases); @@ -3057,7 +3056,7 @@ void guard_bio_eod(int op, struct bio *bio) struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; unsigned truncated_bytes; - maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; + maxsector = get_capacity(bio->bi_disk); if (!maxsector) return; @@ -3116,7 +3115,7 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, } bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio->bi_bdev = bh->b_bdev; + bio_set_dev(bio, bh->b_bdev); bio->bi_write_hint = write_hint; bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); @@ -3549,10 +3548,10 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, pagevec_init(&pvec, 0); do { - unsigned want, nr_pages, i; + unsigned nr_pages, i; - want = min_t(unsigned, end - index, PAGEVEC_SIZE); - nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, want); + nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index, + end - 1); if (nr_pages == 0) break; @@ -3573,10 +3572,6 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, lastoff < page_offset(page)) goto check_range; - /* Searching done if the page index is out of range. */ - if (page->index >= end) - goto not_found; - lock_page(page); if (likely(page->mapping == inode->i_mapping) && page_has_buffers(page)) { @@ -3589,12 +3584,6 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, unlock_page(page); lastoff = page_offset(page) + PAGE_SIZE; } - - /* Searching done if fewer pages returned than wanted. */ - if (nr_pages < want) - break; - - index = pvec.pages[i - 1]->index + 1; pagevec_release(&pvec); } while (index < end); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 50836280a6f8..b3e3edc09d80 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -152,17 +152,10 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset, ceph_invalidate_fscache_page(inode, page); + WARN_ON(!PageLocked(page)); if (!PagePrivate(page)) return; - /* - * We can get non-dirty pages here due to races between - * set_page_dirty and truncate_complete_page; just spit out a - * warning, in case we end up with accounting problems later. - */ - if (!PageDirty(page)) - pr_err("%p invalidatepage %p page not dirty\n", inode, page); - ClearPageChecked(page); dout("%p invalidatepage %p idx %lu full dirty page\n", @@ -189,7 +182,7 @@ static int ceph_releasepage(struct page *page, gfp_t g) /* * read a single page, without unlocking it. 
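
/*
 * Aside on the fs/buffer.c conversion above, not part of the patch:
 * pagevec_lookup_range() advances *index past the last page it returned and
 * never returns pages beyond 'end', so callers no longer track the next index
 * by hand or re-check page->index against the end of the range.  A minimal
 * sketch of the resulting iteration pattern (helper name is illustrative):
 */
static void walk_mapping_range(struct address_space *mapping,
			       pgoff_t index, pgoff_t end)
{
	struct pagevec pvec;
	unsigned i;

	pagevec_init(&pvec, 0);
	while (pagevec_lookup_range(&pvec, mapping, &index, end)) {
		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];

			/* every page here satisfies page->index <= end */
			(void)page;
		}
		pagevec_release(&pvec);
		cond_resched();
		/* index has been advanced for us; stop on wrap or past end */
		if (index > end || !index)
			break;
	}
}
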
*/ -static int readpage_nounlock(struct file *filp, struct page *page) +static int ceph_do_readpage(struct file *filp, struct page *page) { struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); @@ -219,7 +212,7 @@ static int readpage_nounlock(struct file *filp, struct page *page) err = ceph_readpage_from_fscache(inode, page); if (err == 0) - goto out; + return -EINPROGRESS; dout("readpage inode %p file %p page %p index %lu\n", inode, filp, page, page->index); @@ -249,8 +242,11 @@ out: static int ceph_readpage(struct file *filp, struct page *page) { - int r = readpage_nounlock(filp, page); - unlock_page(page); + int r = ceph_do_readpage(filp, page); + if (r != -EINPROGRESS) + unlock_page(page); + else + r = 0; return r; } @@ -452,13 +448,9 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, if (rc == 0) goto out; - if (fsc->mount_options->rsize >= PAGE_SIZE) - max = (fsc->mount_options->rsize + PAGE_SIZE - 1) - >> PAGE_SHIFT; - - dout("readpages %p file %p nr_pages %d max %d\n", inode, - file, nr_pages, - max); + max = fsc->mount_options->rsize >> PAGE_SHIFT; + dout("readpages %p file %p nr_pages %d max %d\n", + inode, file, nr_pages, max); while (!list_empty(page_list)) { rc = start_read(inode, page_list, max); if (rc < 0) @@ -471,14 +463,22 @@ out: return rc; } +struct ceph_writeback_ctl +{ + loff_t i_size; + u64 truncate_size; + u32 truncate_seq; + bool size_stable; + bool head_snapc; +}; + /* * Get ref for the oldest snapc for an inode with dirty data... that is, the * only snap context we are allowed to write back. */ -static struct ceph_snap_context *get_oldest_context(struct inode *inode, - loff_t *snap_size, - u64 *truncate_size, - u32 *truncate_seq) +static struct ceph_snap_context * +get_oldest_context(struct inode *inode, struct ceph_writeback_ctl *ctl, + struct ceph_snap_context *page_snapc) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc = NULL; @@ -488,30 +488,78 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode, list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, capsnap->context, capsnap->dirty_pages); - if (capsnap->dirty_pages) { - snapc = ceph_get_snap_context(capsnap->context); - if (snap_size) - *snap_size = capsnap->size; - if (truncate_size) - *truncate_size = capsnap->truncate_size; - if (truncate_seq) - *truncate_seq = capsnap->truncate_seq; - break; + if (!capsnap->dirty_pages) + continue; + + /* get i_size, truncate_{seq,size} for page_snapc? 
*/ + if (snapc && capsnap->context != page_snapc) + continue; + + if (ctl) { + if (capsnap->writing) { + ctl->i_size = i_size_read(inode); + ctl->size_stable = false; + } else { + ctl->i_size = capsnap->size; + ctl->size_stable = true; + } + ctl->truncate_size = capsnap->truncate_size; + ctl->truncate_seq = capsnap->truncate_seq; + ctl->head_snapc = false; } + + if (snapc) + break; + + snapc = ceph_get_snap_context(capsnap->context); + if (!page_snapc || + page_snapc == snapc || + page_snapc->seq > snapc->seq) + break; } if (!snapc && ci->i_wrbuffer_ref_head) { snapc = ceph_get_snap_context(ci->i_head_snapc); dout(" head snapc %p has %d dirty pages\n", snapc, ci->i_wrbuffer_ref_head); - if (truncate_size) - *truncate_size = ci->i_truncate_size; - if (truncate_seq) - *truncate_seq = ci->i_truncate_seq; + if (ctl) { + ctl->i_size = i_size_read(inode); + ctl->truncate_size = ci->i_truncate_size; + ctl->truncate_seq = ci->i_truncate_seq; + ctl->size_stable = false; + ctl->head_snapc = true; + } } spin_unlock(&ci->i_ceph_lock); return snapc; } +static u64 get_writepages_data_length(struct inode *inode, + struct page *page, u64 start) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_snap_context *snapc = page_snap_context(page); + struct ceph_cap_snap *capsnap = NULL; + u64 end = i_size_read(inode); + + if (snapc != ci->i_head_snapc) { + bool found = false; + spin_lock(&ci->i_ceph_lock); + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { + if (capsnap->context == snapc) { + if (!capsnap->writing) + end = capsnap->size; + found = true; + break; + } + } + spin_unlock(&ci->i_ceph_lock); + WARN_ON(!found); + } + if (end > page_offset(page) + PAGE_SIZE) + end = page_offset(page) + PAGE_SIZE; + return end > start ? end - start : 0; +} + /* * Write a single page, but leave the page locked. * @@ -523,30 +571,25 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) struct inode *inode; struct ceph_inode_info *ci; struct ceph_fs_client *fsc; - struct ceph_osd_client *osdc; struct ceph_snap_context *snapc, *oldest; loff_t page_off = page_offset(page); - loff_t snap_size = -1; long writeback_stat; - u64 truncate_size; - u32 truncate_seq; int err, len = PAGE_SIZE; + struct ceph_writeback_ctl ceph_wbc; dout("writepage %p idx %lu\n", page, page->index); inode = page->mapping->host; ci = ceph_inode(inode); fsc = ceph_inode_to_client(inode); - osdc = &fsc->client->osdc; /* verify this is a writeable snap context */ snapc = page_snap_context(page); - if (snapc == NULL) { + if (!snapc) { dout("writepage %p page %p not dirty?\n", inode, page); return 0; } - oldest = get_oldest_context(inode, &snap_size, - &truncate_size, &truncate_seq); + oldest = get_oldest_context(inode, &ceph_wbc, snapc); if (snapc->seq > oldest->seq) { dout("writepage %p page %p snapc %p not writeable - noop\n", inode, page, snapc); @@ -558,20 +601,18 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) } ceph_put_snap_context(oldest); - if (snap_size == -1) - snap_size = i_size_read(inode); - /* is this a partial page at end of file? 
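
/*
 * Aside, not part of the patch: struct ceph_writeback_ctl bundles what used
 * to be three separate out-parameters (snap_size, truncate_size,
 * truncate_seq) plus the size_stable/head_snapc hints.  A writeback path
 * fills it once via get_oldest_context() and then clamps each page against
 * ctl->i_size, as the writepage hunk that follows does.  Roughly (helper
 * name is illustrative):
 */
static u64 clamp_page_write_len(struct page *page,
				struct ceph_writeback_ctl *ctl)
{
	u64 page_off = page_offset(page);
	u64 len = PAGE_SIZE;

	if (page_off >= ctl->i_size)
		return 0;			/* page fully past EOF */
	if (ctl->i_size < page_off + len)
		len = ctl->i_size - page_off;	/* partial page at EOF */
	return len;
}
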
*/ - if (page_off >= snap_size) { - dout("%p page eof %llu\n", page, snap_size); + if (page_off >= ceph_wbc.i_size) { + dout("%p page eof %llu\n", page, ceph_wbc.i_size); + page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); return 0; } - if (snap_size < page_off + len) - len = snap_size - page_off; + if (ceph_wbc.i_size < page_off + len) + len = ceph_wbc.i_size - page_off; - dout("writepage %p page %p index %lu on %llu~%u snapc %p\n", - inode, page, page->index, page_off, len, snapc); + dout("writepage %p page %p index %lu on %llu~%u snapc %p seq %lld\n", + inode, page, page->index, page_off, len, snapc, snapc->seq); writeback_stat = atomic_long_inc_return(&fsc->writeback_count); if (writeback_stat > @@ -579,10 +620,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); set_page_writeback(page); - err = ceph_osdc_writepages(osdc, ceph_vino(inode), - &ci->i_layout, snapc, - page_off, len, - truncate_seq, truncate_size, + err = ceph_osdc_writepages(&fsc->client->osdc, ceph_vino(inode), + &ci->i_layout, snapc, page_off, len, + ceph_wbc.truncate_seq, + ceph_wbc.truncate_size, &inode->i_mtime, &page, 1); if (err < 0) { struct writeback_control tmp_wbc; @@ -743,31 +784,17 @@ static int ceph_writepages_start(struct address_space *mapping, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_vino vino = ceph_vino(inode); - pgoff_t index, start, end; - int range_whole = 0; - int should_loop = 1; - pgoff_t max_pages = 0, max_pages_ever = 0; + pgoff_t index, start_index, end = -1; struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; struct pagevec pvec; - int done = 0; int rc = 0; unsigned int wsize = i_blocksize(inode); struct ceph_osd_request *req = NULL; - int do_sync = 0; - loff_t snap_size, i_size; - u64 truncate_size; - u32 truncate_seq; + struct ceph_writeback_ctl ceph_wbc; + bool should_loop, range_whole = false; + bool stop, done = false; - /* - * Include a 'sync' in the OSD request if this is a data - * integrity write (e.g., O_SYNC write or fsync()), or if our - * cap is being revoked. - */ - if ((wbc->sync_mode == WB_SYNC_ALL) || - ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER)) - do_sync = 1; - dout("writepages_start %p dosync=%d (mode=%s)\n", - inode, do_sync, + dout("writepages_start %p (mode=%s)\n", inode, wbc->sync_mode == WB_SYNC_NONE ? "NONE" : (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); @@ -780,35 +807,17 @@ static int ceph_writepages_start(struct address_space *mapping, mapping_set_error(mapping, -EIO); return -EIO; /* we're in a forced umount, don't write! */ } - if (fsc->mount_options->wsize && fsc->mount_options->wsize < wsize) + if (fsc->mount_options->wsize < wsize) wsize = fsc->mount_options->wsize; - if (wsize < PAGE_SIZE) - wsize = PAGE_SIZE; - max_pages_ever = wsize >> PAGE_SHIFT; pagevec_init(&pvec, 0); - /* where to start/end? */ - if (wbc->range_cyclic) { - start = mapping->writeback_index; /* Start from prev offset */ - end = -1; - dout(" cyclic, start at %lu\n", start); - } else { - start = wbc->range_start >> PAGE_SHIFT; - end = wbc->range_end >> PAGE_SHIFT; - if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) - range_whole = 1; - should_loop = 0; - dout(" not cyclic, %lu to %lu\n", start, end); - } - index = start; + start_index = wbc->range_cyclic ? 
mapping->writeback_index : 0; + index = start_index; retry: /* find oldest snap context with dirty data */ - ceph_put_snap_context(snapc); - snap_size = -1; - snapc = get_oldest_context(inode, &snap_size, - &truncate_size, &truncate_seq); + snapc = get_oldest_context(inode, &ceph_wbc, NULL); if (!snapc) { /* hmm, why does writepages get called when there is no dirty data? */ @@ -818,40 +827,56 @@ retry: dout(" oldest snapc is %p seq %lld (%d snaps)\n", snapc, snapc->seq, snapc->num_snaps); - i_size = i_size_read(inode); - - if (last_snapc && snapc != last_snapc) { - /* if we switched to a newer snapc, restart our scan at the - * start of the original file range. */ - dout(" snapc differs from last pass, restarting at %lu\n", - index); - index = start; + should_loop = false; + if (ceph_wbc.head_snapc && snapc != last_snapc) { + /* where to start/end? */ + if (wbc->range_cyclic) { + index = start_index; + end = -1; + if (index > 0) + should_loop = true; + dout(" cyclic, start at %lu\n", index); + } else { + index = wbc->range_start >> PAGE_SHIFT; + end = wbc->range_end >> PAGE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = true; + dout(" not cyclic, %lu to %lu\n", index, end); + } + } else if (!ceph_wbc.head_snapc) { + /* Do not respect wbc->range_{start,end}. Dirty pages + * in that range can be associated with newer snapc. + * They are not writeable until we write all dirty pages + * associated with 'snapc' get written */ + if (index > 0 || wbc->sync_mode != WB_SYNC_NONE) + should_loop = true; + dout(" non-head snapc, range whole\n"); } + + ceph_put_snap_context(last_snapc); last_snapc = snapc; - while (!done && index <= end) { - unsigned i; - int first; - pgoff_t strip_unit_end = 0; + stop = false; + while (!stop && index <= end) { int num_ops = 0, op_idx; - int pvec_pages, locked_pages = 0; + unsigned i, pvec_pages, max_pages, locked_pages = 0; struct page **pages = NULL, **data_pages; mempool_t *pool = NULL; /* Becomes non-null if mempool used */ struct page *page; - int want; + pgoff_t strip_unit_end = 0; u64 offset = 0, len = 0; - max_pages = max_pages_ever; + max_pages = wsize >> PAGE_SHIFT; get_more_pages: - first = -1; - want = min(end - index, - min((pgoff_t)PAGEVEC_SIZE, - max_pages - (pgoff_t)locked_pages) - 1) - + 1; + pvec_pages = min_t(unsigned, PAGEVEC_SIZE, + max_pages - locked_pages); + if (end - index < (u64)(pvec_pages - 1)) + pvec_pages = (unsigned)(end - index) + 1; + pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, - want); + pvec_pages); dout("pagevec_lookup_tag got %d\n", pvec_pages); if (!pvec_pages && !locked_pages) break; @@ -868,11 +893,15 @@ get_more_pages: unlikely(page->mapping != mapping)) { dout("!dirty or !mapping %p\n", page); unlock_page(page); - break; + continue; } - if (!wbc->range_cyclic && page->index > end) { + if (page->index > end) { dout("end of range %p\n", page); - done = 1; + /* can't be range_cyclic (1st pass) because + * end == -1 in that case. */ + stop = true; + if (ceph_wbc.head_snapc) + done = true; unlock_page(page); break; } @@ -881,39 +910,37 @@ get_more_pages: unlock_page(page); break; } - if (wbc->sync_mode != WB_SYNC_NONE) { - dout("waiting on writeback %p\n", page); - wait_on_page_writeback(page); - } - if (page_offset(page) >= - (snap_size == -1 ? i_size : snap_size)) { - dout("%p page eof %llu\n", page, - (snap_size == -1 ? 
i_size : snap_size)); - done = 1; + if (page_offset(page) >= ceph_wbc.i_size) { + dout("%p page eof %llu\n", + page, ceph_wbc.i_size); + /* not done if range_cyclic */ + stop = true; unlock_page(page); break; } if (PageWriteback(page)) { - dout("%p under writeback\n", page); - unlock_page(page); - break; + if (wbc->sync_mode == WB_SYNC_NONE) { + dout("%p under writeback\n", page); + unlock_page(page); + continue; + } + dout("waiting on writeback %p\n", page); + wait_on_page_writeback(page); } /* only if matching snap context */ pgsnapc = page_snap_context(page); - if (pgsnapc->seq > snapc->seq) { - dout("page snapc %p %lld > oldest %p %lld\n", + if (pgsnapc != snapc) { + dout("page snapc %p %lld != oldest %p %lld\n", pgsnapc, pgsnapc->seq, snapc, snapc->seq); unlock_page(page); - if (!locked_pages) - continue; /* keep looking for snap */ - break; + continue; } if (!clear_page_dirty_for_io(page)) { dout("%p !clear_page_dirty_for_io\n", page); unlock_page(page); - break; + continue; } /* @@ -939,7 +966,7 @@ get_more_pages: break; } - num_ops = 1 + do_sync; + num_ops = 1; strip_unit_end = page->index + ((len - 1) >> PAGE_SHIFT); @@ -969,8 +996,6 @@ get_more_pages: } /* note position of first page in pvec */ - if (first < 0) - first = i; dout("%p will write page %p idx %lu\n", inode, page, page->index); @@ -981,8 +1006,10 @@ get_more_pages: BLK_RW_ASYNC); } - pages[locked_pages] = page; - locked_pages++; + + pages[locked_pages++] = page; + pvec.pages[i] = NULL; + len += PAGE_SIZE; } @@ -990,23 +1017,23 @@ get_more_pages: if (!locked_pages) goto release_pvec_pages; if (i) { - int j; - BUG_ON(!locked_pages || first < 0); + unsigned j, n = 0; + /* shift unused page to beginning of pvec */ + for (j = 0; j < pvec_pages; j++) { + if (!pvec.pages[j]) + continue; + if (n < j) + pvec.pages[n] = pvec.pages[j]; + n++; + } + pvec.nr = n; if (pvec_pages && i == pvec_pages && locked_pages < max_pages) { dout("reached end pvec, trying for more\n"); - pagevec_reinit(&pvec); + pagevec_release(&pvec); goto get_more_pages; } - - /* shift unused pages over in the pvec... we - * will need to release them below. 
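
/*
 * Aside, not part of the patch: pages handed off to the OSD request above are
 * removed from the pagevec by NULLing their slots, and the survivors are then
 * packed to the front so that pagevec_release() only drops references the
 * caller still owns.  The compaction idiom in isolation (helper name
 * illustrative):
 */
static void pvec_compact(struct pagevec *pvec)
{
	unsigned j, n = 0;

	for (j = 0; j < pagevec_count(pvec); j++) {
		if (!pvec->pages[j])
			continue;		/* consumed by the request */
		if (n < j)
			pvec->pages[n] = pvec->pages[j];
		n++;
	}
	pvec->nr = n;
}
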
*/ - for (j = i; j < pvec_pages; j++) { - dout(" pvec leftover page %p\n", pvec.pages[j]); - pvec.pages[j-i+first] = pvec.pages[j]; - } - pvec.nr -= i-first; } new_request: @@ -1016,10 +1043,9 @@ new_request: req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, offset, &len, 0, num_ops, - CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_WRITE, - snapc, truncate_seq, - truncate_size, false); + CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, + snapc, ceph_wbc.truncate_seq, + ceph_wbc.truncate_size, false); if (IS_ERR(req)) { req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, @@ -1028,8 +1054,8 @@ new_request: CEPH_OSD_SLAB_OPS), CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, - snapc, truncate_seq, - truncate_size, true); + snapc, ceph_wbc.truncate_seq, + ceph_wbc.truncate_size, true); BUG_ON(IS_ERR(req)); } BUG_ON(len < page_offset(pages[locked_pages - 1]) + @@ -1045,7 +1071,7 @@ new_request: for (i = 0; i < locked_pages; i++) { u64 cur_offset = page_offset(pages[i]); if (offset + len != cur_offset) { - if (op_idx + do_sync + 1 == req->r_num_ops) + if (op_idx + 1 == req->r_num_ops) break; osd_req_op_extent_dup_last(req, op_idx, cur_offset - offset); @@ -1066,14 +1092,15 @@ new_request: len += PAGE_SIZE; } - if (snap_size != -1) { - len = min(len, snap_size - offset); + if (ceph_wbc.size_stable) { + len = min(len, ceph_wbc.i_size - offset); } else if (i == locked_pages) { /* writepages_finish() clears writeback pages * according to the data length, so make sure * data length covers all locked pages */ u64 min_len = len + 1 - PAGE_SIZE; - len = min(len, (u64)i_size_read(inode) - offset); + len = get_writepages_data_length(inode, pages[i - 1], + offset); len = max(len, min_len); } dout("writepages got pages at %llu~%llu\n", offset, len); @@ -1082,17 +1109,12 @@ new_request: 0, !!pool, false); osd_req_op_extent_update(req, op_idx, len); - if (do_sync) { - op_idx++; - osd_req_op_init(req, op_idx, CEPH_OSD_OP_STARTSYNC, 0); - } BUG_ON(op_idx + 1 != req->r_num_ops); pool = NULL; if (i < locked_pages) { BUG_ON(num_ops <= req->r_num_ops); num_ops -= req->r_num_ops; - num_ops += do_sync; locked_pages -= i; /* allocate new pages array for next request */ @@ -1124,22 +1146,50 @@ new_request: if (pages) goto new_request; - if (wbc->nr_to_write <= 0) - done = 1; + /* + * We stop writing back only if we are not doing + * integrity sync. In case of integrity sync we have to + * keep going until we have written all the pages + * we tagged for writeback prior to entering this loop. + */ + if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) + done = stop = true; release_pvec_pages: dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr, pvec.nr ? 
pvec.pages[0] : NULL); pagevec_release(&pvec); - - if (locked_pages && !done) - goto retry; } if (should_loop && !done) { /* more to do; loop back to beginning of file */ dout("writepages looping back to beginning of file\n"); - should_loop = 0; + end = start_index - 1; /* OK even when start_index == 0 */ + + /* to write dirty pages associated with next snapc, + * we need to wait until current writes complete */ + if (wbc->sync_mode != WB_SYNC_NONE && + start_index == 0 && /* all dirty pages were checked */ + !ceph_wbc.head_snapc) { + struct page *page; + unsigned i, nr; + index = 0; + while ((index <= end) && + (nr = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_WRITEBACK, + PAGEVEC_SIZE))) { + for (i = 0; i < nr; i++) { + page = pvec.pages[i]; + if (page_snap_context(page) != snapc) + continue; + wait_on_page_writeback(page); + } + pagevec_release(&pvec); + cond_resched(); + } + } + + start_index = 0; index = 0; goto retry; } @@ -1149,8 +1199,8 @@ release_pvec_pages: out: ceph_osdc_put_request(req); - ceph_put_snap_context(snapc); - dout("writepages done, rc = %d\n", rc); + ceph_put_snap_context(last_snapc); + dout("writepages dend - startone, rc = %d\n", rc); return rc; } @@ -1162,8 +1212,7 @@ out: static int context_is_writeable_or_written(struct inode *inode, struct ceph_snap_context *snapc) { - struct ceph_snap_context *oldest = get_oldest_context(inode, NULL, - NULL, NULL); + struct ceph_snap_context *oldest = get_oldest_context(inode, NULL, NULL); int ret = !oldest || snapc->seq <= oldest->seq; ceph_put_snap_context(oldest); @@ -1208,8 +1257,7 @@ retry_locked: * this page is already dirty in another (older) snap * context! is it writeable now? */ - oldest = get_oldest_context(inode, NULL, NULL, NULL); - + oldest = get_oldest_context(inode, NULL, NULL); if (snapc->seq > oldest->seq) { ceph_put_snap_context(oldest); dout(" page %p snapc %p not current or oldest\n", @@ -1237,7 +1285,7 @@ retry_locked: goto retry_locked; r = writepage_nounlock(page, NULL); if (r < 0) - goto fail_nosnap; + goto fail_unlock; goto retry_locked; } @@ -1265,11 +1313,14 @@ retry_locked: } /* we need to read it. 
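
/*
 * Aside, not part of the patch: ceph_do_readpage() now returns -EINPROGRESS
 * when fscache owns the read and its completion callback will unlock the
 * page, so callers that normally unlock on return must special-case it (the
 * readpage caller treats it as success, the write_begin caller below turns it
 * into -EAGAIN).  Sketch of the readpage-style caller (name illustrative):
 */
static int read_page_maybe_async(struct file *filp, struct page *page)
{
	int r = ceph_do_readpage(filp, page);

	if (r == -EINPROGRESS)
		return 0;	/* fscache completion will unlock the page */
	unlock_page(page);	/* synchronous path: we still own the lock */
	return r;
}
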
*/ - r = readpage_nounlock(file, page); - if (r < 0) - goto fail_nosnap; + r = ceph_do_readpage(file, page); + if (r < 0) { + if (r == -EINPROGRESS) + return -EAGAIN; + goto fail_unlock; + } goto retry_locked; -fail_nosnap: +fail_unlock: unlock_page(page); return r; } diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index fd1172823f86..a3ab265d3215 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -194,36 +194,6 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux( return FSCACHE_CHECKAUX_OKAY; } -static void ceph_fscache_inode_now_uncached(void* cookie_netfs_data) -{ - struct ceph_inode_info* ci = cookie_netfs_data; - struct pagevec pvec; - pgoff_t first; - int loop, nr_pages; - - pagevec_init(&pvec, 0); - first = 0; - - dout("ceph inode 0x%p now uncached", ci); - - while (1) { - nr_pages = pagevec_lookup(&pvec, ci->vfs_inode.i_mapping, first, - PAGEVEC_SIZE - pagevec_count(&pvec)); - - if (!nr_pages) - break; - - for (loop = 0; loop < nr_pages; loop++) - ClearPageFsCache(pvec.pages[loop]); - - first = pvec.pages[nr_pages - 1]->index + 1; - - pvec.nr = nr_pages; - pagevec_release(&pvec); - cond_resched(); - } -} - static const struct fscache_cookie_def ceph_fscache_inode_object_def = { .name = "CEPH.inode", .type = FSCACHE_COOKIE_TYPE_DATAFILE, @@ -231,7 +201,6 @@ static const struct fscache_cookie_def ceph_fscache_inode_object_def = { .get_attr = ceph_fscache_inode_get_attr, .get_aux = ceph_fscache_inode_get_aux, .check_aux = ceph_fscache_inode_check_aux, - .now_uncached = ceph_fscache_inode_now_uncached, }; void ceph_fscache_register_inode_cookie(struct inode *inode) @@ -240,7 +209,7 @@ void ceph_fscache_register_inode_cookie(struct inode *inode) struct ceph_fs_client *fsc = ceph_inode_to_client(inode); /* No caching for filesystem */ - if (fsc->fscache == NULL) + if (!fsc->fscache) return; /* Only cache for regular files that are read only */ @@ -297,13 +266,7 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp) } } -static void ceph_vfs_readpage_complete(struct page *page, void *data, int error) -{ - if (!error) - SetPageUptodate(page); -} - -static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int error) +static void ceph_readpage_from_fscache_complete(struct page *page, void *data, int error) { if (!error) SetPageUptodate(page); @@ -331,7 +294,7 @@ int ceph_readpage_from_fscache(struct inode *inode, struct page *page) return -ENOBUFS; ret = fscache_read_or_alloc_page(ci->fscache, page, - ceph_vfs_readpage_complete, NULL, + ceph_readpage_from_fscache_complete, NULL, GFP_KERNEL); switch (ret) { @@ -360,7 +323,7 @@ int ceph_readpages_from_fscache(struct inode *inode, return -ENOBUFS; ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages, - ceph_vfs_readpage_complete_unlock, + ceph_readpage_from_fscache_complete, NULL, mapping_gfp_mask(mapping)); switch (ret) { diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 7007ae2a5ad2..157fe59fbabe 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -490,13 +490,14 @@ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, } /* - * if we are newly issued FILE_SHARED, mark dir not complete; we - * don't know what happened to this directory while we didn't - * have the cap. + * If FILE_SHARED is newly issued, mark dir not complete. We don't + * know what happened to this directory while we didn't have the cap. + * If FILE_SHARED is being revoked, also mark dir not complete. It + * stops on-going cached readdir. 
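
/*
 * Aside, not part of the patch: the __check_cap_issue() test just below fires
 * on any change of CEPH_CAP_FILE_SHARED -- grant or revocation -- while
 * i_shared_gen is still bumped only when the cap is newly granted.  The same
 * condition written as an explicit "bit changed" helper (illustrative):
 */
static inline bool cap_bit_changed(int issued, int had, int bit)
{
	return (issued & bit) != (had & bit);
}
/* usage: if (cap_bit_changed(issued, had, CEPH_CAP_FILE_SHARED)) ... */
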
*/ - if ((issued & CEPH_CAP_FILE_SHARED) && - (had & CEPH_CAP_FILE_SHARED) == 0) { - ci->i_shared_gen++; + if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) { + if (issued & CEPH_CAP_FILE_SHARED) + ci->i_shared_gen++; if (S_ISDIR(ci->vfs_inode.i_mode)) { dout(" marking %p NOT complete\n", &ci->vfs_inode); __ceph_dir_clear_complete(ci); @@ -611,7 +612,7 @@ void ceph_add_cap(struct inode *inode, } if (flags & CEPH_CAP_FLAG_AUTH) { - if (ci->i_auth_cap == NULL || + if (!ci->i_auth_cap || ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) { ci->i_auth_cap = cap; cap->mds_wanted = wanted; @@ -728,7 +729,7 @@ static void __touch_cap(struct ceph_cap *cap) struct ceph_mds_session *s = cap->session; spin_lock(&s->s_cap_lock); - if (s->s_cap_iterator == NULL) { + if (!s->s_cap_iterator) { dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap, s->s_mds); list_move_tail(&cap->session_caps, &s->s_caps); @@ -1248,7 +1249,10 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, arg.mode = inode->i_mode; arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE; - arg.flags = 0; + if (list_empty(&ci->i_cap_snaps)) + arg.flags = CEPH_CLIENT_CAPS_NO_CAPSNAP; + else + arg.flags = CEPH_CLIENT_CAPS_PENDING_CAPSNAP; if (sync) arg.flags |= CEPH_CLIENT_CAPS_SYNC; @@ -1454,13 +1458,19 @@ retry: goto retry; } + // make sure flushsnap messages are sent in proper order. + if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { + __kick_flushing_caps(mdsc, session, ci, 0); + ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; + } + __ceph_flush_snaps(ci, session); out: spin_unlock(&ci->i_ceph_lock); if (psession) { *psession = session; - } else { + } else if (session) { mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); } @@ -1901,11 +1911,7 @@ ack: (ci->i_ceph_flags & (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) { if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { - spin_lock(&mdsc->cap_dirty_lock); - oldest_flush_tid = __get_oldest_flush_tid(mdsc); - spin_unlock(&mdsc->cap_dirty_lock); - __kick_flushing_caps(mdsc, session, ci, - oldest_flush_tid); + __kick_flushing_caps(mdsc, session, ci, 0); ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; } if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) @@ -2110,7 +2116,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) dout("fsync %p%s\n", inode, datasync ? 
" datasync" : ""); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = file_write_and_wait_range(file, start, end); if (ret < 0) goto out; @@ -3422,7 +3428,7 @@ retry: tcap = __get_cap_for_mds(ci, target); if (tcap) { /* already have caps from the target */ - if (tcap->cap_id != t_cap_id || + if (tcap->cap_id == t_cap_id && ceph_seq_cmp(tcap->seq, t_seq) < 0) { dout(" updating import cap %p mds%d\n", tcap, target); tcap->cap_id = t_cap_id; diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 4e2d112c982f..d635496ea189 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -24,7 +24,7 @@ static int mdsmap_show(struct seq_file *s, void *p) struct ceph_fs_client *fsc = s->private; struct ceph_mdsmap *mdsmap; - if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL) + if (!fsc->mdsc || !fsc->mdsc->mdsmap) return 0; mdsmap = fsc->mdsc->mdsmap; seq_printf(s, "epoch %d\n", mdsmap->m_epoch); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index ef7240ace576..019c2036d36f 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -377,8 +377,10 @@ more: } /* hints to request -> mds selection code */ req->r_direct_mode = USE_AUTH_MDS; - req->r_direct_hash = ceph_frag_value(frag); - __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); + if (op == CEPH_MDS_OP_READDIR) { + req->r_direct_hash = ceph_frag_value(frag); + __set_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags); + } if (fi->last_name) { req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL); if (!req->r_path2) { diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3d48c415f3cb..65a6fa12c857 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -175,7 +175,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) dout("init_file %p %p 0%o (regular)\n", inode, file, inode->i_mode); cf = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL); - if (cf == NULL) { + if (!cf) { ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ return -ENOMEM; } @@ -562,8 +562,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, ssize_t ret; size_t len = iov_iter_count(to); - dout("sync_read on file %p %llu~%u %s\n", file, off, - (unsigned)len, + dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); if (!len) @@ -788,7 +787,7 @@ static void ceph_aio_retry_work(struct work_struct *work) goto out; } - req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE; + req->r_flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; ceph_oloc_copy(&req->r_base_oloc, &orig_req->r_base_oloc); ceph_oid_copy(&req->r_base_oid, &orig_req->r_base_oid); @@ -800,7 +799,6 @@ static void ceph_aio_retry_work(struct work_struct *work) } req->r_ops[0] = orig_req->r_ops[0]; - osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); req->r_mtime = aio_req->mtime; req->r_data_offset = req->r_ops[0].extent.offset; @@ -847,8 +845,9 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; - dout("sync_direct_read_write (%s) on file %p %lld~%u\n", - (write ? "write" : "read"), file, pos, (unsigned)count); + dout("sync_direct_%s on file %p %lld~%u snapc %p seq %lld\n", + (write ? 
"write" : "read"), file, pos, (unsigned)count, + snapc, snapc->seq); ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); if (ret < 0) @@ -861,7 +860,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, if (ret2 < 0) dout("invalidate_inode_pages2_range returned %d\n", ret2); - flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE; + flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; } else { flags = CEPH_OSD_FLAG_READ; } @@ -874,8 +873,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, pos, &size, 0, - /*include a 'startsync' command*/ - write ? 2 : 1, + 1, write ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ, flags, snapc, @@ -887,6 +885,11 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, break; } + if (write) + size = min_t(u64, size, fsc->mount_options->wsize); + else + size = min_t(u64, size, fsc->mount_options->rsize); + len = size; pages = dio_get_pages_alloc(iter, len, &start, &num_pages); if (IS_ERR(pages)) { @@ -922,7 +925,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, truncate_inode_pages_range(inode->i_mapping, pos, (pos+len) | (PAGE_SIZE - 1)); - osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); req->r_mtime = mtime; } @@ -1048,7 +1050,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; - dout("sync_write on file %p %lld~%u\n", file, pos, (unsigned)count); + dout("sync_write on file %p %lld~%u snapc %p seq %lld\n", + file, pos, (unsigned)count, snapc, snapc->seq); ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); if (ret < 0) @@ -1060,7 +1063,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, if (ret < 0) dout("invalidate_inode_pages2_range returned %d\n", ret); - flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_WRITE; + flags = /* CEPH_OSD_FLAG_ORDERSNAP | */ CEPH_OSD_FLAG_WRITE; while ((len = iov_iter_count(from)) > 0) { size_t left; @@ -1307,6 +1310,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) if (!prealloc_cf) return -ENOMEM; +retry_snap: inode_lock(inode); /* We can write back this queue in page reclaim */ @@ -1338,7 +1342,6 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out; } -retry_snap: /* FIXME: not complete since it doesn't account for being at quota */ if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) { err = -ENOSPC; @@ -1387,14 +1390,6 @@ retry_snap: &prealloc_cf); else written = ceph_sync_write(iocb, &data, pos, snapc); - if (written == -EOLDSNAPC) { - dout("aio_write %p %llx.%llx %llu~%u" - "got EOLDSNAPC, retrying\n", - inode, ceph_vinop(inode), - pos, (unsigned)count); - inode_lock(inode); - goto retry_snap; - } if (written > 0) iov_iter_advance(from, written); ceph_put_snap_context(snapc); @@ -1428,10 +1423,15 @@ retry_snap: ceph_cap_string(got)); ceph_put_cap_refs(ci, got); + if (written == -EOLDSNAPC) { + dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n", + inode, ceph_vinop(inode), pos, (unsigned)count); + goto retry_snap; + } + if (written >= 0) { if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_NEARFULL)) iocb->ki_flags |= IOCB_DSYNC; - written = generic_write_sync(iocb, written); } @@ -1481,13 +1481,13 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) offset += file->f_pos; break; case SEEK_DATA: - if (offset >= i_size) { + if (offset 
< 0 || offset >= i_size) { ret = -ENXIO; goto out; } break; case SEEK_HOLE: - if (offset >= i_size) { + if (offset < 0 || offset >= i_size) { ret = -ENXIO; goto out; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 220dfd87cbfa..373dab5173ca 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -52,7 +52,7 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) ino_t t = ceph_vino_to_ino(vino); inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino); - if (inode == NULL) + if (!inode) return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { dout("get_inode created new inode %p %llx.%llx ino %llx\n", @@ -133,12 +133,9 @@ static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci, } frag = kmalloc(sizeof(*frag), GFP_NOFS); - if (!frag) { - pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx " - "frag %x\n", &ci->vfs_inode, - ceph_vinop(&ci->vfs_inode), f); + if (!frag) return ERR_PTR(-ENOMEM); - } + frag->frag = f; frag->split_by = 0; frag->mds = -1; @@ -1070,7 +1067,6 @@ out_unlock: spin_unlock(&dentry->d_lock); if (old_lease_session) ceph_put_mds_session(old_lease_session); - return; } /* @@ -1177,7 +1173,7 @@ retry_lookup: dn = d_alloc(parent, &dname); dout("d_alloc %p '%.*s' = %p\n", parent, dname.len, dname.name, dn); - if (dn == NULL) { + if (!dn) { dput(parent); err = -ENOMEM; goto done; @@ -1477,7 +1473,6 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct dentry *dn; struct inode *in; int err = 0, skipped = 0, ret, i; - struct inode *snapdir = NULL; struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; u32 frag = le32_to_cpu(rhead->args.readdir.frag); u32 last_hash = 0; @@ -1510,8 +1505,6 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, } if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { - snapdir = ceph_get_snapdir(d_inode(parent)); - parent = d_find_alias(snapdir); dout("readdir_prepopulate %d items under SNAPDIR dn %p\n", rinfo->dir_nr, parent); } else { @@ -1519,15 +1512,18 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, rinfo->dir_nr, parent); if (rinfo->dir_dir) ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir); - } - if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2 && - !(rinfo->hash_order && last_hash)) { - /* note dir version at start of readdir so we can tell - * if any dentries get dropped */ - req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); - req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count); - req->r_readdir_cache_idx = 0; + if (ceph_frag_is_leftmost(frag) && + req->r_readdir_offset == 2 && + !(rinfo->hash_order && last_hash)) { + /* note dir version at start of readdir so we can + * tell if any dentries get dropped */ + req->r_dir_release_cnt = + atomic64_read(&ci->i_release_count); + req->r_dir_ordered_cnt = + atomic64_read(&ci->i_ordered_count); + req->r_readdir_cache_idx = 0; + } } cache_ctl.index = req->r_readdir_cache_idx; @@ -1566,7 +1562,7 @@ retry_lookup: dn = d_alloc(parent, &dname); dout("d_alloc %p '%.*s' = %p\n", parent, dname.len, dname.name, dn); - if (dn == NULL) { + if (!dn) { dout("d_alloc badness\n"); err = -ENOMEM; goto out; @@ -1650,10 +1646,6 @@ out: req->r_readdir_cache_idx = cache_ctl.index; } ceph_readdir_cache_release(&cache_ctl); - if (snapdir) { - iput(snapdir); - dput(parent); - } dout("readdir_prepopulate done\n"); return err; } @@ -1841,9 +1833,20 @@ retry: * possibly truncate them.. so write AND block! 
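
/*
 * Aside, not part of the patch: the __do_pending_vmtruncate() hunk that
 * follows picks the largest size any dirty cap snapshot still needs before
 * calling truncate_pagecache(), so a pending snapshot flush cannot lose
 * pages.  The size selection in isolation (helper name illustrative; caller
 * holds i_ceph_lock):
 */
static u64 snap_safe_truncate_size(struct ceph_inode_info *ci)
{
	struct ceph_cap_snap *capsnap;
	u64 to = ci->i_truncate_size;

	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
		if (capsnap->dirty_pages && capsnap->size > to)
			to = capsnap->size;
	}
	return to;
}
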
*/ if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) { + struct ceph_cap_snap *capsnap; + to = ci->i_truncate_size; + list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { + // MDS should have revoked Frw caps + WARN_ON_ONCE(capsnap->writing); + if (capsnap->dirty_pages && capsnap->size > to) + to = capsnap->size; + } + spin_unlock(&ci->i_ceph_lock); dout("__do_pending_vmtruncate %p flushing snaps first\n", inode); - spin_unlock(&ci->i_ceph_lock); + + truncate_pagecache(inode, to); + filemap_write_and_wait_range(&inode->i_data, 0, inode->i_sb->s_maxbytes); goto retry; diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 64ae74472046..8cd63e8123d8 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -79,7 +79,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, err = ceph_mdsc_do_request(mdsc, inode, req); if (operation == CEPH_MDS_OP_GETFILELOCK) { - fl->fl_pid = le64_to_cpu(req->r_reply_info.filelock_reply->pid); + fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid); if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type) fl->fl_type = F_RDLCK; else if (CEPH_LOCK_EXCL == req->r_reply_info.filelock_reply->type) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 666a9f274832..9dd6b836ac9e 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -408,7 +408,7 @@ struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc, { struct ceph_mds_session *session; - if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL) + if (mds >= mdsc->max_sessions || !mdsc->sessions[mds]) return NULL; session = mdsc->sessions[mds]; dout("lookup_mds_session %p %d\n", session, @@ -483,7 +483,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, dout("register_session realloc to %d\n", newmax); sa = kcalloc(newmax, sizeof(void *), GFP_NOFS); - if (sa == NULL) + if (!sa) goto fail_realloc; if (mdsc->sessions) { memcpy(sa, mdsc->sessions, @@ -731,9 +731,16 @@ static int __choose_mds(struct ceph_mds_client *mdsc, inode = NULL; if (req->r_inode) { - inode = req->r_inode; - ihold(inode); - } else if (req->r_dentry) { + if (ceph_snap(req->r_inode) != CEPH_SNAPDIR) { + inode = req->r_inode; + ihold(inode); + } else { + /* req->r_dentry is non-null for LSSNAP request. 
+ * fall-thru */ + WARN_ON_ONCE(!req->r_dentry); + } + } + if (!inode && req->r_dentry) { /* ignore race with rename; old or new d_parent is okay */ struct dentry *parent; struct inode *dir; @@ -886,7 +893,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 /* Calculate serialized length of metadata */ metadata_bytes = 4; /* map length */ - for (i = 0; metadata[i][0] != NULL; ++i) { + for (i = 0; metadata[i][0]; ++i) { metadata_bytes += 8 + strlen(metadata[i][0]) + strlen(metadata[i][1]); metadata_key_count++; @@ -919,7 +926,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 ceph_encode_32(&p, metadata_key_count); /* Two length-prefixed strings for each entry in the map */ - for (i = 0; metadata[i][0] != NULL; ++i) { + for (i = 0; metadata[i][0]; ++i) { size_t const key_len = strlen(metadata[i][0]); size_t const val_len = strlen(metadata[i][1]); @@ -1122,7 +1129,7 @@ static int iterate_session_caps(struct ceph_mds_session *session, spin_lock(&session->s_cap_lock); p = p->next; - if (cap->ci == NULL) { + if (!cap->ci) { dout("iterate_session_caps finishing cap %p removal\n", cap); BUG_ON(cap->session != session); @@ -1748,7 +1755,7 @@ char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, int len, pos; unsigned seq; - if (dentry == NULL) + if (!dentry) return ERR_PTR(-EINVAL); retry: @@ -1771,7 +1778,7 @@ retry: len--; /* no leading '/' */ path = kmalloc(len+1, GFP_NOFS); - if (path == NULL) + if (!path) return ERR_PTR(-ENOMEM); pos = len; path[pos] = 0; /* trailing null */ @@ -2875,7 +2882,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, } if (list_empty(&ci->i_cap_snaps)) { - snap_follows = 0; + snap_follows = ci->i_head_snapc ? ci->i_head_snapc->seq : 0; } else { struct ceph_cap_snap *capsnap = list_first_entry(&ci->i_cap_snaps, @@ -3133,7 +3140,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, newmap->m_epoch, oldmap->m_epoch); for (i = 0; i < oldmap->m_num_mds && i < mdsc->max_sessions; i++) { - if (mdsc->sessions[i] == NULL) + if (!mdsc->sessions[i]) continue; s = mdsc->sessions[i]; oldstate = ceph_mdsmap_get_state(oldmap, i); @@ -3280,7 +3287,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, mutex_lock(&session->s_mutex); session->s_seq++; - if (inode == NULL) { + if (!inode) { dout("handle_lease no inode %llx\n", vino.ino); goto release; } @@ -3438,7 +3445,7 @@ static void delayed_work(struct work_struct *work) for (i = 0; i < mdsc->max_sessions; i++) { struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i); - if (s == NULL) + if (!s) continue; if (s->s_state == CEPH_MDS_SESSION_CLOSING) { dout("resending session close request for mds%d\n", @@ -3490,7 +3497,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) fsc->mdsc = mdsc; mutex_init(&mdsc->mutex); mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS); - if (mdsc->mdsmap == NULL) { + if (!mdsc->mdsmap) { kfree(mdsc); return -ENOMEM; } diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 1a748cf88535..33ced4c22732 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -112,7 +112,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) u16 mdsmap_ev; m = kzalloc(sizeof(*m), GFP_NOFS); - if (m == NULL) + if (!m) return ERR_PTR(-ENOMEM); ceph_decode_need(p, end, 1 + 1, bad); @@ -138,7 +138,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_num_mds = m->m_max_mds; m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS); - if (m->m_info == NULL) + if 
(!m->m_info) goto nomem; /* pick out active nodes from mds_info (state > 0) */ @@ -232,7 +232,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) if (num_export_targets) { info->export_targets = kcalloc(num_export_targets, sizeof(u32), GFP_NOFS); - if (info->export_targets == NULL) + if (!info->export_targets) goto nomem; for (j = 0; j < num_export_targets; j++) info->export_targets[j] = diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index dab5d6732345..1ffc8b426c1c 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -299,7 +299,8 @@ static int cmpu64_rev(const void *a, const void *b) /* * build the snap context for a given realm. */ -static int build_snap_context(struct ceph_snap_realm *realm) +static int build_snap_context(struct ceph_snap_realm *realm, + struct list_head* dirty_realms) { struct ceph_snap_realm *parent = realm->parent; struct ceph_snap_context *snapc; @@ -313,7 +314,7 @@ static int build_snap_context(struct ceph_snap_realm *realm) */ if (parent) { if (!parent->cached_context) { - err = build_snap_context(parent); + err = build_snap_context(parent, dirty_realms); if (err) goto fail; } @@ -332,7 +333,7 @@ static int build_snap_context(struct ceph_snap_realm *realm) " (unchanged)\n", realm->ino, realm, realm->cached_context, realm->cached_context->seq, - (unsigned int) realm->cached_context->num_snaps); + (unsigned int)realm->cached_context->num_snaps); return 0; } @@ -373,7 +374,11 @@ static int build_snap_context(struct ceph_snap_realm *realm) realm->ino, realm, snapc, snapc->seq, (unsigned int) snapc->num_snaps); - ceph_put_snap_context(realm->cached_context); + if (realm->cached_context) { + ceph_put_snap_context(realm->cached_context); + /* queue realm for cap_snap creation */ + list_add_tail(&realm->dirty_item, dirty_realms); + } realm->cached_context = snapc; return 0; @@ -394,15 +399,16 @@ fail: /* * rebuild snap context for the given realm and all of its children. 
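
/*
 * Aside, not part of the patch: realms whose snap context actually changed
 * are queued on the caller-provided dirty_realms list above and only drained
 * once the whole snap trace has been parsed, which keeps
 * queue_realm_cap_snaps() out of the recursive rebuild.  The drain side of
 * that pattern (helper name illustrative):
 */
static void drain_dirty_realms(struct list_head *dirty_realms)
{
	struct ceph_snap_realm *realm;

	while (!list_empty(dirty_realms)) {
		realm = list_first_entry(dirty_realms,
					 struct ceph_snap_realm, dirty_item);
		list_del_init(&realm->dirty_item);
		queue_realm_cap_snaps(realm);
	}
}
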
*/ -static void rebuild_snap_realms(struct ceph_snap_realm *realm) +static void rebuild_snap_realms(struct ceph_snap_realm *realm, + struct list_head *dirty_realms) { struct ceph_snap_realm *child; dout("rebuild_snap_realms %llx %p\n", realm->ino, realm); - build_snap_context(realm); + build_snap_context(realm, dirty_realms); list_for_each_entry(child, &realm->children, child_item) - rebuild_snap_realms(child); + rebuild_snap_realms(child, dirty_realms); } @@ -624,13 +630,11 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) { struct ceph_inode_info *ci; struct inode *lastinode = NULL; - struct ceph_snap_realm *child; dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino); spin_lock(&realm->inodes_with_caps_lock); - list_for_each_entry(ci, &realm->inodes_with_caps, - i_snap_realm_item) { + list_for_each_entry(ci, &realm->inodes_with_caps, i_snap_realm_item) { struct inode *inode = igrab(&ci->vfs_inode); if (!inode) continue; @@ -643,14 +647,6 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) spin_unlock(&realm->inodes_with_caps_lock); iput(lastinode); - list_for_each_entry(child, &realm->children, child_item) { - dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n", - realm, realm->ino, child, child->ino); - list_del_init(&child->dirty_item); - list_add(&child->dirty_item, &realm->dirty_item); - } - - list_del_init(&realm->dirty_item); dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); } @@ -721,8 +717,6 @@ more: if (err < 0) goto fail; - /* queue realm for cap_snap creation */ - list_add(&realm->dirty_item, &dirty_realms); if (realm->seq > mdsc->last_snap_seq) mdsc->last_snap_seq = realm->seq; @@ -741,7 +735,7 @@ more: /* invalidate when we reach the _end_ (root) of the trace */ if (invalidate && p >= e) - rebuild_snap_realms(realm); + rebuild_snap_realms(realm, &dirty_realms); if (!first_realm) first_realm = realm; @@ -758,6 +752,7 @@ more: while (!list_empty(&dirty_realms)) { realm = list_first_entry(&dirty_realms, struct ceph_snap_realm, dirty_item); + list_del_init(&realm->dirty_item); queue_realm_cap_snaps(realm); } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index aa06a8c24792..e4082afedcb1 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -49,9 +49,16 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) struct ceph_statfs st; u64 fsid; int err; + u64 data_pool; + + if (fsc->mdsc->mdsmap->m_num_data_pg_pools == 1) { + data_pool = fsc->mdsc->mdsmap->m_data_pg_pools[0]; + } else { + data_pool = CEPH_NOPOOL; + } dout("statfs\n"); - err = ceph_monc_do_statfs(&fsc->client->monc, &st); + err = ceph_monc_do_statfs(&fsc->client->monc, data_pool, &st); if (err < 0) return err; @@ -113,7 +120,6 @@ enum { Opt_rasize, Opt_caps_wanted_delay_min, Opt_caps_wanted_delay_max, - Opt_cap_release_safety, Opt_readdir_max_entries, Opt_readdir_max_bytes, Opt_congestion_kb, @@ -152,7 +158,6 @@ static match_table_t fsopt_tokens = { {Opt_rasize, "rasize=%d"}, {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"}, {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"}, - {Opt_cap_release_safety, "cap_release_safety=%d"}, {Opt_readdir_max_entries, "readdir_max_entries=%d"}, {Opt_readdir_max_bytes, "readdir_max_bytes=%d"}, {Opt_congestion_kb, "write_congestion_kb=%d"}, @@ -235,27 +240,43 @@ static int parse_fsopt_token(char *c, void *private) break; /* misc */ case Opt_wsize: - fsopt->wsize = intval; + if (intval < PAGE_SIZE || intval > CEPH_MAX_WRITE_SIZE) + return -EINVAL; + fsopt->wsize = ALIGN(intval, PAGE_SIZE); 
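
/*
 * Aside, not part of the patch: wsize/rsize parsing above now rejects values
 * outside [PAGE_SIZE, CEPH_MAX_*_SIZE] and rounds the rest up to a whole page
 * with ALIGN(), e.g. ALIGN(4097, 4096) == 8192.  The same check as a helper
 * (names illustrative):
 */
static int parse_size_opt(int intval, int max, int *out)
{
	if (intval < (int)PAGE_SIZE || intval > max)
		return -EINVAL;
	*out = ALIGN(intval, PAGE_SIZE);
	return 0;
}
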
break; case Opt_rsize: - fsopt->rsize = intval; + if (intval < PAGE_SIZE || intval > CEPH_MAX_READ_SIZE) + return -EINVAL; + fsopt->rsize = ALIGN(intval, PAGE_SIZE); break; case Opt_rasize: - fsopt->rasize = intval; + if (intval < 0) + return -EINVAL; + fsopt->rasize = ALIGN(intval + PAGE_SIZE - 1, PAGE_SIZE); break; case Opt_caps_wanted_delay_min: + if (intval < 1) + return -EINVAL; fsopt->caps_wanted_delay_min = intval; break; case Opt_caps_wanted_delay_max: + if (intval < 1) + return -EINVAL; fsopt->caps_wanted_delay_max = intval; break; case Opt_readdir_max_entries: + if (intval < 1) + return -EINVAL; fsopt->max_readdir = intval; break; case Opt_readdir_max_bytes: + if (intval < PAGE_SIZE && intval != 0) + return -EINVAL; fsopt->max_readdir_bytes = intval; break; case Opt_congestion_kb: + if (intval < 1024) /* at least 1M */ + return -EINVAL; fsopt->congestion_kb = intval; break; case Opt_dirstat: @@ -392,7 +413,8 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, fsopt->sb_flags = flags; fsopt->flags = CEPH_MOUNT_OPT_DEFAULT; - fsopt->rsize = CEPH_RSIZE_DEFAULT; + fsopt->wsize = CEPH_MAX_WRITE_SIZE; + fsopt->rsize = CEPH_MAX_READ_SIZE; fsopt->rasize = CEPH_RASIZE_DEFAULT; fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); if (!fsopt->snapdir_name) { @@ -402,7 +424,6 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; - fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT; fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT; fsopt->congestion_kb = default_congestion_kb(); @@ -508,7 +529,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",mds_namespace=%s", fsopt->mds_namespace); if (fsopt->wsize) seq_printf(m, ",wsize=%d", fsopt->wsize); - if (fsopt->rsize != CEPH_RSIZE_DEFAULT) + if (fsopt->rsize != CEPH_MAX_READ_SIZE) seq_printf(m, ",rsize=%d", fsopt->rsize); if (fsopt->rasize != CEPH_RASIZE_DEFAULT) seq_printf(m, ",rasize=%d", fsopt->rasize); @@ -520,9 +541,6 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if (fsopt->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT) seq_printf(m, ",caps_wanted_delay_max=%d", fsopt->caps_wanted_delay_max); - if (fsopt->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT) - seq_printf(m, ",cap_release_safety=%d", - fsopt->cap_release_safety); if (fsopt->max_readdir != CEPH_MAX_READDIR_DEFAULT) seq_printf(m, ",readdir_max_entries=%d", fsopt->max_readdir); if (fsopt->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT) @@ -576,7 +594,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, } fsc->client->extra_mon_dispatch = extra_mon_dispatch; - if (fsopt->mds_namespace == NULL) { + if (!fsopt->mds_namespace) { ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true); } else { @@ -597,13 +615,13 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, * to be processed in parallel, limit concurrency. 
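
/*
 * Aside, not part of the patch: the workqueue allocations just below pass 1
 * as the third argument of alloc_workqueue(), which is max_active -- the cap
 * on work items from that queue running concurrently (per CPU for a regular
 * bound queue such as these).  Minimal usage sketch (names illustrative):
 */
static struct workqueue_struct *example_wq;

static int example_wq_init(void)
{
	example_wq = alloc_workqueue("example-limited", 0, 1);
	if (!example_wq)
		return -ENOMEM;
	return 0;
}
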
*/ fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1); - if (fsc->wb_wq == NULL) + if (!fsc->wb_wq) goto fail_client; fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1); - if (fsc->pg_inv_wq == NULL) + if (!fsc->pg_inv_wq) goto fail_wb_wq; fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1); - if (fsc->trunc_wq == NULL) + if (!fsc->trunc_wq) goto fail_pg_inv_wq; /* set up mempools */ @@ -674,26 +692,26 @@ static int __init init_caches(void) __alignof__(struct ceph_inode_info), SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| SLAB_ACCOUNT, ceph_inode_init_once); - if (ceph_inode_cachep == NULL) + if (!ceph_inode_cachep) return -ENOMEM; ceph_cap_cachep = KMEM_CACHE(ceph_cap, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); - if (ceph_cap_cachep == NULL) + if (!ceph_cap_cachep) goto bad_cap; ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); - if (ceph_cap_flush_cachep == NULL) + if (!ceph_cap_flush_cachep) goto bad_cap_flush; ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); - if (ceph_dentry_cachep == NULL) + if (!ceph_dentry_cachep) goto bad_dentry; ceph_file_cachep = KMEM_CACHE(ceph_file_info, SLAB_MEM_SPREAD); - if (ceph_file_cachep == NULL) + if (!ceph_file_cachep) goto bad_file; if ((error = ceph_fscache_register())) @@ -947,20 +965,10 @@ static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc) return err; /* set ra_pages based on rasize mount option? */ - if (fsc->mount_options->rasize >= PAGE_SIZE) - sb->s_bdi->ra_pages = - (fsc->mount_options->rasize + PAGE_SIZE - 1) - >> PAGE_SHIFT; - else - sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; - - if (fsc->mount_options->rsize > fsc->mount_options->rasize && - fsc->mount_options->rsize >= PAGE_SIZE) - sb->s_bdi->io_pages = - (fsc->mount_options->rsize + PAGE_SIZE - 1) - >> PAGE_SHIFT; - else if (fsc->mount_options->rsize == 0) - sb->s_bdi->io_pages = ULONG_MAX; + sb->s_bdi->ra_pages = fsc->mount_options->rasize >> PAGE_SHIFT; + + /* set io_pages based on max osd read size */ + sb->s_bdi->io_pages = fsc->mount_options->rsize >> PAGE_SHIFT; return 0; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index f02a2225fe42..279a2f401cf5 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -46,12 +46,25 @@ #define ceph_test_mount_opt(fsc, opt) \ (!!((fsc)->mount_options->flags & CEPH_MOUNT_OPT_##opt)) -#define CEPH_RSIZE_DEFAULT (64*1024*1024) /* max read size */ +/* max size of osd read request, limited by libceph */ +#define CEPH_MAX_READ_SIZE CEPH_MSG_MAX_DATA_LEN +/* osd has a configurable limitaion of max write size. + * CEPH_MSG_MAX_DATA_LEN should be small enough. */ +#define CEPH_MAX_WRITE_SIZE CEPH_MSG_MAX_DATA_LEN #define CEPH_RASIZE_DEFAULT (8192*1024) /* max readahead */ #define CEPH_MAX_READDIR_DEFAULT 1024 #define CEPH_MAX_READDIR_BYTES_DEFAULT (512*1024) #define CEPH_SNAPDIRNAME_DEFAULT ".snap" +/* + * Delay telling the MDS we no longer want caps, in case we reopen + * the file. Delay a minimum amount of time, even if we send a cap + * message for some other reason. Otherwise, take the oppotunity to + * update the mds to avoid sending another message later. 
+ */ +#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */ +#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */ + struct ceph_mount_options { int flags; int sb_flags; @@ -61,7 +74,6 @@ struct ceph_mount_options { int rasize; /* max readahead */ int congestion_kb; /* max writeback in flight */ int caps_wanted_delay_min, caps_wanted_delay_max; - int cap_release_safety; int max_readdir; /* max readdir result (entires) */ int max_readdir_bytes; /* max readdir result (bytes) */ diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 11263f102e4c..3542b2c364cf 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -777,7 +777,7 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, spin_unlock(&ci->i_ceph_lock); /* security module gets xattr while filling trace */ - if (current->journal_info != NULL) { + if (current->journal_info) { pr_warn_ratelimited("sync getxattr %p " "during filling trace\n", inode); return -EBUSY; @@ -809,7 +809,7 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, memcpy(value, xattr->val, xattr->val_len); - if (current->journal_info != NULL && + if (current->journal_info && !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) ci->i_ceph_flags |= CEPH_I_SEC_INITED; out: @@ -1058,7 +1058,7 @@ do_sync_unlocked: up_read(&mdsc->snap_rwsem); /* security module set xattr while filling trace */ - if (current->journal_info != NULL) { + if (current->journal_info) { pr_warn_ratelimited("sync setxattr %p " "during filling trace\n", inode); err = -EBUSY; @@ -1108,7 +1108,7 @@ bool ceph_security_xattr_deadlock(struct inode *in) { struct ceph_inode_info *ci; bool ret; - if (in->i_security == NULL) + if (!in->i_security) return false; ci = ceph_inode(in); spin_lock(&ci->i_ceph_lock); diff --git a/fs/char_dev.c b/fs/char_dev.c index fb8507f521b2..ebcc8fb3fa66 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -28,6 +28,8 @@ static struct kobj_map *cdev_map; static DEFINE_MUTEX(chrdevs_lock); +#define CHRDEV_MAJOR_HASH_SIZE 255 + static struct char_device_struct { struct char_device_struct *next; unsigned int major; @@ -49,16 +51,39 @@ void chrdev_show(struct seq_file *f, off_t offset) { struct char_device_struct *cd; - if (offset < CHRDEV_MAJOR_HASH_SIZE) { - mutex_lock(&chrdevs_lock); - for (cd = chrdevs[offset]; cd; cd = cd->next) + mutex_lock(&chrdevs_lock); + for (cd = chrdevs[major_to_index(offset)]; cd; cd = cd->next) { + if (cd->major == offset) seq_printf(f, "%3d %s\n", cd->major, cd->name); - mutex_unlock(&chrdevs_lock); } + mutex_unlock(&chrdevs_lock); } #endif /* CONFIG_PROC_FS */ +static int find_dynamic_major(void) +{ + int i; + struct char_device_struct *cd; + + for (i = ARRAY_SIZE(chrdevs)-1; i > CHRDEV_MAJOR_DYN_END; i--) { + if (chrdevs[i] == NULL) + return i; + } + + for (i = CHRDEV_MAJOR_DYN_EXT_START; + i > CHRDEV_MAJOR_DYN_EXT_END; i--) { + for (cd = chrdevs[major_to_index(i)]; cd; cd = cd->next) + if (cd->major == i) + break; + + if (cd == NULL || cd->major != i) + return i; + } + + return -EBUSY; +} + /* * Register a single major with a specified minor range. 
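
/*
 * Aside, not part of the patch: with find_dynamic_major() above, a request
 * for major 0 is satisfied first from the classic dynamic range and then from
 * an extended range instead of failing outright.  Drivers keep using the
 * long-standing API; a sketch of a typical caller (names illustrative):
 */
static dev_t example_devt;

static int example_chrdev_init(void)
{
	/* base minor 0, one minor, major chosen dynamically by the kernel */
	int ret = alloc_chrdev_region(&example_devt, 0, 1, "example");

	if (ret)
		return ret;
	pr_info("example: got dynamic major %u\n", MAJOR(example_devt));
	return 0;
}
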
* @@ -84,22 +109,21 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, mutex_lock(&chrdevs_lock); - /* temporary */ if (major == 0) { - for (i = ARRAY_SIZE(chrdevs)-1; i > 0; i--) { - if (chrdevs[i] == NULL) - break; - } - - if (i < CHRDEV_MAJOR_DYN_END) - pr_warn("CHRDEV \"%s\" major number %d goes below the dynamic allocation range\n", - name, i); - - if (i == 0) { - ret = -EBUSY; + ret = find_dynamic_major(); + if (ret < 0) { + pr_err("CHRDEV \"%s\" dynamic allocation region is full\n", + name); goto out; } - major = i; + major = ret; + } + + if (major >= CHRDEV_MAJOR_MAX) { + pr_err("CHRDEV \"%s\" major requested (%d) is greater than the maximum (%d)\n", + name, major, CHRDEV_MAJOR_MAX); + ret = -EINVAL; + goto out; } cd->major = major; diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c index 6c665bf4a27c..2c14020e5e1d 100644 --- a/fs/cifs/cache.c +++ b/fs/cifs/cache.c @@ -292,36 +292,6 @@ fscache_checkaux cifs_fscache_inode_check_aux(void *cookie_netfs_data, return FSCACHE_CHECKAUX_OKAY; } -static void cifs_fscache_inode_now_uncached(void *cookie_netfs_data) -{ - struct cifsInodeInfo *cifsi = cookie_netfs_data; - struct pagevec pvec; - pgoff_t first; - int loop, nr_pages; - - pagevec_init(&pvec, 0); - first = 0; - - cifs_dbg(FYI, "%s: cifs inode 0x%p now uncached\n", __func__, cifsi); - - for (;;) { - nr_pages = pagevec_lookup(&pvec, - cifsi->vfs_inode.i_mapping, first, - PAGEVEC_SIZE - pagevec_count(&pvec)); - if (!nr_pages) - break; - - for (loop = 0; loop < nr_pages; loop++) - ClearPageFsCache(pvec.pages[loop]); - - first = pvec.pages[nr_pages - 1]->index + 1; - - pvec.nr = nr_pages; - pagevec_release(&pvec); - cond_resched(); - } -} - const struct fscache_cookie_def cifs_fscache_inode_object_def = { .name = "CIFS.uniqueid", .type = FSCACHE_COOKIE_TYPE_DATAFILE, @@ -329,5 +299,4 @@ const struct fscache_cookie_def cifs_fscache_inode_object_def = { .get_attr = cifs_fscache_inode_get_attr, .get_aux = cifs_fscache_inode_get_aux, .check_aux = cifs_fscache_inode_check_aux, - .now_uncached = cifs_fscache_inode_now_uncached, }; diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 221693fe49ec..808486c29f0d 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -421,7 +421,7 @@ struct smb_version_operations { size_t, struct cifs_sb_info *); int (*set_EA)(const unsigned int, struct cifs_tcon *, const char *, const char *, const void *, const __u16, - const struct nls_table *, int); + const struct nls_table *, struct cifs_sb_info *); struct cifs_ntsd * (*get_acl)(struct cifs_sb_info *, struct inode *, const char *, u32 *); struct cifs_ntsd * (*get_acl_by_fid)(struct cifs_sb_info *, diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 6eb3147132e3..4143c9dec463 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -484,7 +484,8 @@ extern ssize_t CIFSSMBQAllEAs(const unsigned int xid, struct cifs_tcon *tcon, extern int CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon, const char *fileName, const char *ea_name, const void *ea_value, const __u16 ea_value_len, - const struct nls_table *nls_codepage, int remap_special_chars); + const struct nls_table *nls_codepage, + struct cifs_sb_info *cifs_sb); extern int CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, struct cifs_ntsd **acl_inf, __u32 *buflen); extern int CIFSSMBSetCIFSACL(const unsigned int, struct cifs_tcon *, __u16, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 72a53bd19865..35dc5bf01ee2 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ 
-178,6 +178,18 @@ cifs_reconnect_tcon(struct cifs_tcon *tcon, int smb_command) * reconnect the same SMB session */ mutex_lock(&ses->session_mutex); + + /* + * Recheck after acquire mutex. If another thread is negotiating + * and the server never sends an answer the socket will be closed + * and tcpStatus set to reconnect. + */ + if (server->tcpStatus == CifsNeedReconnect) { + rc = -EHOSTDOWN; + mutex_unlock(&ses->session_mutex); + goto out; + } + rc = cifs_negotiate_protocol(0, ses); if (rc == 0 && ses->need_reconnect) rc = cifs_setup_session(0, ses, nls_codepage); @@ -2522,7 +2534,7 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon, pLockData->fl_start = le64_to_cpu(parm_data->start); pLockData->fl_end = pLockData->fl_start + le64_to_cpu(parm_data->length) - 1; - pLockData->fl_pid = le32_to_cpu(parm_data->pid); + pLockData->fl_pid = -le32_to_cpu(parm_data->pid); } } @@ -6264,7 +6276,7 @@ int CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon, const char *fileName, const char *ea_name, const void *ea_value, const __u16 ea_value_len, const struct nls_table *nls_codepage, - int remap) + struct cifs_sb_info *cifs_sb) { struct smb_com_transaction2_spi_req *pSMB = NULL; struct smb_com_transaction2_spi_rsp *pSMBr = NULL; @@ -6273,6 +6285,7 @@ CIFSSMBSetEA(const unsigned int xid, struct cifs_tcon *tcon, int rc = 0; int bytes_returned = 0; __u16 params, param_offset, byte_count, offset, count; + int remap = cifs_remap(cifs_sb); cifs_dbg(FYI, "In SetEA\n"); SetEARetry: diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 59647eb72c5f..5aa2d278ca84 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -509,7 +509,8 @@ server_unresponsive(struct TCP_Server_Info *server) * 65s kernel_recvmsg times out, and we see that we haven't gotten * a response in >60s. */ - if (server->tcpStatus == CifsGood && + if ((server->tcpStatus == CifsGood || + server->tcpStatus == CifsNeedNegotiate) && time_after(jiffies, server->lstrp + 2 * server->echo_interval)) { cifs_dbg(VFS, "Server %s has not responded in %lu seconds. Reconnecting...\n", server->hostname, (2 * server->echo_interval) / HZ); @@ -1223,6 +1224,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, char *tmp_end, *value; char delim; bool got_ip = false; + bool got_version = false; unsigned short port = 0; struct sockaddr *dstaddr = (struct sockaddr *)&vol->dstaddr; @@ -1874,24 +1876,35 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, pr_warn("CIFS: server netbiosname longer than 15 truncated.\n"); break; case Opt_ver: + /* version of mount userspace tools, not dialect */ string = match_strdup(args); if (string == NULL) goto out_nomem; + /* If interface changes in mount.cifs bump to new ver */ if (strncasecmp(string, "1", 1) == 0) { + if (strlen(string) > 1) { + pr_warn("Bad mount helper ver=%s. 
Did " + "you want SMB1 (CIFS) dialect " + "and mean to type vers=1.0 " + "instead?\n", string); + goto cifs_parse_mount_err; + } /* This is the default */ break; } /* For all other value, error */ - pr_warn("CIFS: Invalid version specified\n"); + pr_warn("CIFS: Invalid mount helper version specified\n"); goto cifs_parse_mount_err; case Opt_vers: + /* protocol version (dialect) */ string = match_strdup(args); if (string == NULL) goto out_nomem; if (cifs_parse_smb_version(string, vol) != 0) goto cifs_parse_mount_err; + got_version = true; break; case Opt_sec: string = match_strdup(args); @@ -1973,6 +1986,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, else if (override_gid == 1) pr_notice("CIFS: ignoring forcegid mount option specified with no gid= option.\n"); + if (got_version == false) + pr_warn("No dialect specified on mount. Default has changed to " + "a more secure dialect, SMB3 (vers=3.0), from CIFS " + "(SMB1). To use the less secure SMB1 dialect to access " + "old servers which do not support SMB3 specify vers=1.0" + " on mount. For somewhat newer servers such as Windows " + "7 try vers=2.1.\n"); + kfree(mountdata_copy); return 0; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 569d3fb736be..e702d48bd023 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -205,7 +205,7 @@ check_name(struct dentry *direntry, struct cifs_tcon *tcon) int i; if (unlikely(direntry->d_name.len > - tcon->fsAttrInfo.MaxPathNameComponentLength)) + le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength))) return -ENAMETOOLONG; if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)) { diff --git a/fs/cifs/file.c b/fs/cifs/file.c index bc09df6b473a..0786f19d288f 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2329,7 +2329,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, struct inode *inode = file_inode(file); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - rc = filemap_write_and_wait_range(inode->i_mapping, start, end); + rc = file_write_and_wait_range(file, start, end); if (rc) return rc; inode_lock(inode); @@ -2371,7 +2371,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); struct inode *inode = file->f_mapping->host; - rc = filemap_write_and_wait_range(inode->i_mapping, start, end); + rc = file_write_and_wait_range(file, start, end); if (rc) return rc; inode_lock(inode); diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index cfacf2c97e94..fb2934b9b97c 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -426,6 +426,194 @@ smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, return rc; } +static ssize_t +move_smb2_ea_to_cifs(char *dst, size_t dst_size, + struct smb2_file_full_ea_info *src, size_t src_size, + const unsigned char *ea_name) +{ + int rc = 0; + unsigned int ea_name_len = ea_name ? 
strlen(ea_name) : 0; + char *name, *value; + size_t name_len, value_len, user_name_len; + + while (src_size > 0) { + name = &src->ea_data[0]; + name_len = (size_t)src->ea_name_length; + value = &src->ea_data[src->ea_name_length + 1]; + value_len = (size_t)le16_to_cpu(src->ea_value_length); + + if (name_len == 0) { + break; + } + + if (src_size < 8 + name_len + 1 + value_len) { + cifs_dbg(FYI, "EA entry goes beyond length of list\n"); + rc = -EIO; + goto out; + } + + if (ea_name) { + if (ea_name_len == name_len && + memcmp(ea_name, name, name_len) == 0) { + rc = value_len; + if (dst_size == 0) + goto out; + if (dst_size < value_len) { + rc = -ERANGE; + goto out; + } + memcpy(dst, value, value_len); + goto out; + } + } else { + /* 'user.' plus a terminating null */ + user_name_len = 5 + 1 + name_len; + + rc += user_name_len; + + if (dst_size >= user_name_len) { + dst_size -= user_name_len; + memcpy(dst, "user.", 5); + dst += 5; + memcpy(dst, src->ea_data, name_len); + dst += name_len; + *dst = 0; + ++dst; + } else if (dst_size == 0) { + /* skip copy - calc size only */ + } else { + /* stop before overrun buffer */ + rc = -ERANGE; + break; + } + } + + if (!src->next_entry_offset) + break; + + if (src_size < le32_to_cpu(src->next_entry_offset)) { + /* stop before overrun buffer */ + rc = -ERANGE; + break; + } + src_size -= le32_to_cpu(src->next_entry_offset); + src = (void *)((char *)src + + le32_to_cpu(src->next_entry_offset)); + } + + /* didn't find the named attribute */ + if (ea_name) + rc = -ENODATA; + +out: + return (ssize_t)rc; +} + +static ssize_t +smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, + const unsigned char *path, const unsigned char *ea_name, + char *ea_data, size_t buf_size, + struct cifs_sb_info *cifs_sb) +{ + int rc; + __le16 *utf16_path; + __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + struct cifs_open_parms oparms; + struct cifs_fid fid; + struct smb2_file_full_ea_info *smb2_data; + + utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); + if (!utf16_path) + return -ENOMEM; + + oparms.tcon = tcon; + oparms.desired_access = FILE_READ_EA; + oparms.disposition = FILE_OPEN; + oparms.create_options = 0; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); + kfree(utf16_path); + if (rc) { + cifs_dbg(FYI, "open failed rc=%d\n", rc); + return rc; + } + + smb2_data = kzalloc(SMB2_MAX_EA_BUF, GFP_KERNEL); + if (smb2_data == NULL) { + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + return -ENOMEM; + } + + rc = SMB2_query_eas(xid, tcon, fid.persistent_fid, fid.volatile_fid, + smb2_data); + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + + if (!rc) + rc = move_smb2_ea_to_cifs(ea_data, buf_size, smb2_data, + SMB2_MAX_EA_BUF, ea_name); + + kfree(smb2_data); + return rc; +} + + +static int +smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, + const char *path, const char *ea_name, const void *ea_value, + const __u16 ea_value_len, const struct nls_table *nls_codepage, + struct cifs_sb_info *cifs_sb) +{ + int rc; + __le16 *utf16_path; + __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + struct cifs_open_parms oparms; + struct cifs_fid fid; + struct smb2_file_full_ea_info *ea; + int ea_name_len = strlen(ea_name); + int len; + + if (ea_name_len > 255) + return -EINVAL; + + utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); + if (!utf16_path) + return -ENOMEM; + + oparms.tcon = tcon; + oparms.desired_access = FILE_WRITE_EA; + oparms.disposition = FILE_OPEN; + oparms.create_options = 0; 
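+ /* open the existing file with FILE_WRITE_EA access; the handle is closed again right after SMB2_set_ea() below */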
+ oparms.fid = &fid; + oparms.reconnect = false; + + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); + kfree(utf16_path); + if (rc) { + cifs_dbg(FYI, "open failed rc=%d\n", rc); + return rc; + } + + len = sizeof(ea) + ea_name_len + ea_value_len + 1; + ea = kzalloc(len, GFP_KERNEL); + if (ea == NULL) { + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + return -ENOMEM; + } + + ea->ea_name_length = ea_name_len; + ea->ea_value_length = cpu_to_le16(ea_value_len); + memcpy(ea->ea_data, ea_name, ea_name_len + 1); + memcpy(ea->ea_data + ea_name_len + 1, ea_value, ea_value_len); + + rc = SMB2_set_ea(xid, tcon, fid.persistent_fid, fid.volatile_fid, ea, + len); + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + + return rc; +} + static bool smb2_can_echo(struct TCP_Server_Info *server) { @@ -2572,6 +2760,10 @@ struct smb_version_operations smb20_operations = { .dir_needs_close = smb2_dir_needs_close, .get_dfs_refer = smb2_get_dfs_refer, .select_sectype = smb2_select_sectype, +#ifdef CONFIG_CIFS_XATTR + .query_all_EAs = smb2_query_eas, + .set_EA = smb2_set_ea, +#endif /* CIFS_XATTR */ #ifdef CONFIG_CIFS_ACL .get_acl = get_smb2_acl, .get_acl_by_fid = get_smb2_acl_by_fid, @@ -2662,6 +2854,10 @@ struct smb_version_operations smb21_operations = { .enum_snapshots = smb3_enum_snapshots, .get_dfs_refer = smb2_get_dfs_refer, .select_sectype = smb2_select_sectype, +#ifdef CONFIG_CIFS_XATTR + .query_all_EAs = smb2_query_eas, + .set_EA = smb2_set_ea, +#endif /* CIFS_XATTR */ #ifdef CONFIG_CIFS_ACL .get_acl = get_smb2_acl, .get_acl_by_fid = get_smb2_acl_by_fid, @@ -2762,6 +2958,10 @@ struct smb_version_operations smb30_operations = { .receive_transform = smb3_receive_transform, .get_dfs_refer = smb2_get_dfs_refer, .select_sectype = smb2_select_sectype, +#ifdef CONFIG_CIFS_XATTR + .query_all_EAs = smb2_query_eas, + .set_EA = smb2_set_ea, +#endif /* CIFS_XATTR */ #ifdef CONFIG_CIFS_ACL .get_acl = get_smb2_acl, .get_acl_by_fid = get_smb2_acl_by_fid, @@ -2863,6 +3063,10 @@ struct smb_version_operations smb311_operations = { .receive_transform = smb3_receive_transform, .get_dfs_refer = smb2_get_dfs_refer, .select_sectype = smb2_select_sectype, +#ifdef CONFIG_CIFS_XATTR + .query_all_EAs = smb2_query_eas, + .set_EA = smb2_set_ea, +#endif /* CIFS_XATTR */ }; #endif /* CIFS_SMB311 */ diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 97edb4d376cd..5531e7ee1210 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -238,6 +238,18 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) * the same SMB session */ mutex_lock(&tcon->ses->session_mutex); + + /* + * Recheck after acquire mutex. If another thread is negotiating + * and the server never sends an answer the socket will be closed + * and tcpStatus set to reconnect. + */ + if (server->tcpStatus == CifsNeedReconnect) { + rc = -EHOSTDOWN; + mutex_unlock(&tcon->ses->session_mutex); + goto out; + } + rc = cifs_negotiate_protocol(0, tcon->ses); if (!rc && tcon->ses->need_reconnect) rc = cifs_setup_session(0, tcon->ses, nls_codepage); @@ -514,7 +526,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) * No tcon so can't do * cifs_stats_inc(&tcon->stats.smb2_stats.smb2_com_fail[SMB2...]); */ - if (rc != 0) + if (rc == -EOPNOTSUPP) { + cifs_dbg(VFS, "Dialect not supported by server. 
Consider " + "specifying vers=1.0 or vers=2.1 on mount for accessing" + " older servers\n"); + goto neg_exit; + } else if (rc != 0) goto neg_exit; cifs_dbg(FYI, "mode 0x%x\n", rsp->SecurityMode); @@ -2140,6 +2157,18 @@ qinf_exit: return rc; } +int SMB2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, + struct smb2_file_full_ea_info *data) +{ + return query_info(xid, tcon, persistent_fid, volatile_fid, + FILE_FULL_EA_INFORMATION, SMB2_O_INFO_FILE, 0, + SMB2_MAX_EA_BUF, + sizeof(struct smb2_file_full_ea_info), + (void **)&data, + NULL); +} + int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, struct smb2_file_all_info *data) { @@ -3180,6 +3209,16 @@ SMB2_set_acl(const unsigned int xid, struct cifs_tcon *tcon, } int +SMB2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, + struct smb2_file_full_ea_info *buf, int len) +{ + return send_set_info(xid, tcon, persistent_fid, volatile_fid, + current->tgid, FILE_FULL_EA_INFORMATION, SMB2_O_INFO_FILE, + 0, 1, (void **)&buf, &len); +} + +int SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, const u64 persistent_fid, const u64 volatile_fid, __u8 oplock_level) diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 18700fd25a0b..393ed5f4e1b6 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -84,8 +84,8 @@ #define NUMBER_OF_SMB2_COMMANDS 0x0013 -/* BB FIXME - analyze following length BB */ -#define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */ +/* 4 len + 52 transform hdr + 64 hdr + 56 create rsp */ +#define MAX_SMB2_HDR_SIZE 0x00b0 #define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) #define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd) @@ -1178,6 +1178,16 @@ struct smb2_file_link_info { /* encoding of request for level 11 */ char FileName[0]; /* Name to be assigned to new link */ } __packed; /* level 11 Set */ +#define SMB2_MAX_EA_BUF 2048 + +struct smb2_file_full_ea_info { /* encoding of response for level 15 */ + __le32 next_entry_offset; + __u8 flags; + __u8 ea_name_length; + __le16 ea_value_length; + char ea_data[0]; /* \0 terminated name plus value */ +} __packed; /* level 15 Set */ + /* * This level 18, although with struct with same name is different from cifs * level 0x107. 
Level 0x107 has an extra u64 between AccessFlags and diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 1cadaf9f3c58..003217099ef3 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -132,6 +132,9 @@ extern int SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id); extern int SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id); +extern int SMB2_query_eas(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_file_id, u64 volatile_file_id, + struct smb2_file_full_ea_info *data); extern int SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id, struct smb2_file_all_info *data); @@ -169,6 +172,9 @@ extern int SMB2_set_info(const unsigned int xid, struct cifs_tcon *tcon, extern int SMB2_set_acl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, struct cifs_ntsd *pnntsd, int pacllen, int aclflag); +extern int SMB2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, + struct smb2_file_full_ea_info *buf, int len); extern int SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid); extern int SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index de50e749ff05..52f975d848a0 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -84,7 +84,7 @@ static int cifs_xattr_set(const struct xattr_handler *handler, if (pTcon->ses->server->ops->set_EA) rc = pTcon->ses->server->ops->set_EA(xid, pTcon, full_path, name, value, (__u16)size, - cifs_sb->local_nls, cifs_remap(cifs_sb)); + cifs_sb->local_nls, cifs_sb); break; case XATTR_CIFS_ACL: { diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 2dd4a7af7dd7..d27b326d96f4 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1331,8 +1331,6 @@ COMPATIBLE_IOCTL(DMX_SET_FILTER) COMPATIBLE_IOCTL(DMX_SET_PES_FILTER) COMPATIBLE_IOCTL(DMX_SET_BUFFER_SIZE) COMPATIBLE_IOCTL(DMX_GET_PES_PIDS) -COMPATIBLE_IOCTL(DMX_GET_CAPS) -COMPATIBLE_IOCTL(DMX_SET_SOURCE) COMPATIBLE_IOCTL(DMX_GET_STC) COMPATIBLE_IOCTL(FE_GET_INFO) COMPATIBLE_IOCTL(FE_DISEQC_RESET_OVERLOAD) diff --git a/fs/coredump.c b/fs/coredump.c index 592683711c64..0eec03696707 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -161,7 +161,7 @@ static int cn_print_exe_file(struct core_name *cn) if (!exe_file) return cn_esc_printf(cn, "%s (path unknown)", current->comm); - pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY); + pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); if (!pathbuf) { ret = -ENOMEM; goto put_exe_file; diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 6181e9526860..483784d5eb73 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -115,7 +115,7 @@ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, err = -ENOMEM; goto errout; } - bio->bi_bdev = inode->i_sb->s_bdev; + bio_set_dev(bio, inode->i_sb->s_bdev); bio->bi_iter.bi_sector = pblk << (inode->i_sb->s_blocksize_bits - 9); bio_set_op_attrs(bio, REQ_OP_WRITE, 0); @@ -42,6 +42,9 @@ #define DAX_WAIT_TABLE_BITS 12 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) +/* The 'colour' (ie low bits) within a PMD of a page offset. 
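+ * (For example, with 2MiB PMDs and 4KiB pages this works out to a mask of 511.)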
*/ +#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) + static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; static int __init init_dax_wait_table(void) @@ -54,6 +57,40 @@ static int __init init_dax_wait_table(void) } fs_initcall(init_dax_wait_table); +/* + * We use lowest available bit in exceptional entry for locking, one bit for + * the entry size (PMD) and two more to tell us if the entry is a zero page or + * an empty entry that is just used for locking. In total four special bits. + * + * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE + * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem + * block allocation. + */ +#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4) +#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT) +#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) +#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) +#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)) + +static unsigned long dax_radix_sector(void *entry) +{ + return (unsigned long)entry >> RADIX_DAX_SHIFT; +} + +static void *dax_radix_locked_entry(sector_t sector, unsigned long flags) +{ + return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags | + ((unsigned long)sector << RADIX_DAX_SHIFT) | + RADIX_DAX_ENTRY_LOCK); +} + +static unsigned int dax_radix_order(void *entry) +{ + if ((unsigned long)entry & RADIX_DAX_PMD) + return PMD_SHIFT - PAGE_SHIFT; + return 0; +} + static int dax_is_pmd_entry(void *entry) { return (unsigned long)entry & RADIX_DAX_PMD; @@ -66,7 +103,7 @@ static int dax_is_pte_entry(void *entry) static int dax_is_zero_entry(void *entry) { - return (unsigned long)entry & RADIX_DAX_HZP; + return (unsigned long)entry & RADIX_DAX_ZERO_PAGE; } static int dax_is_empty_entry(void *entry) @@ -98,7 +135,7 @@ static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, * the range covered by the PMD map to the same bit lock. */ if (dax_is_pmd_entry(entry)) - index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1); + index &= ~PG_PMD_COLOUR; key->mapping = mapping; key->entry_start = index; @@ -121,6 +158,31 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo } /* + * We do not necessarily hold the mapping->tree_lock when we call this + * function so it is possible that 'entry' is no longer a valid item in the + * radix tree. This is okay because all we really need to do is to find the + * correct waitqueue where tasks might be waiting for that old 'entry' and + * wake them. + */ +static void dax_wake_mapping_entry_waiter(struct address_space *mapping, + pgoff_t index, void *entry, bool wake_all) +{ + struct exceptional_entry_key key; + wait_queue_head_t *wq; + + wq = dax_entry_waitqueue(mapping, index, entry, &key); + + /* + * Checking for locked entry and prepare_to_wait_exclusive() happens + * under mapping->tree_lock, ditto for entry handling in our callers. + * So at this point all tasks that could have seen our entry locked + * must be in the waitqueue and the following check will see them. + */ + if (waitqueue_active(wq)) + __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); +} + +/* * Check whether the given slot is locked. 
The function must be called with * mapping->tree_lock held */ @@ -181,7 +243,8 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, for (;;) { entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); - if (!entry || !radix_tree_exceptional_entry(entry) || + if (!entry || + WARN_ON_ONCE(!radix_tree_exceptional_entry(entry)) || !slot_locked(mapping, slot)) { if (slotp) *slotp = slot; @@ -216,14 +279,9 @@ static void dax_unlock_mapping_entry(struct address_space *mapping, } static void put_locked_mapping_entry(struct address_space *mapping, - pgoff_t index, void *entry) + pgoff_t index) { - if (!radix_tree_exceptional_entry(entry)) { - unlock_page(entry); - put_page(entry); - } else { - dax_unlock_mapping_entry(mapping, index); - } + dax_unlock_mapping_entry(mapping, index); } /* @@ -233,7 +291,7 @@ static void put_locked_mapping_entry(struct address_space *mapping, static void put_unlocked_mapping_entry(struct address_space *mapping, pgoff_t index, void *entry) { - if (!radix_tree_exceptional_entry(entry)) + if (!entry) return; /* We have to wake up next waiter for the radix tree entry lock */ @@ -241,15 +299,15 @@ static void put_unlocked_mapping_entry(struct address_space *mapping, } /* - * Find radix tree entry at given index. If it points to a page, return with - * the page locked. If it points to the exceptional entry, return with the - * radix tree entry locked. If the radix tree doesn't contain given index, - * create empty exceptional entry for the index and return with it locked. + * Find radix tree entry at given index. If it points to an exceptional entry, + * return it with the radix tree entry locked. If the radix tree doesn't + * contain given index, create an empty exceptional entry for the index and + * return with it locked. * * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will * either return that locked entry or will return an error. This error will - * happen if there are any 4k entries (either zero pages or DAX entries) - * within the 2MiB range that we are requesting. + * happen if there are any 4k entries within the 2MiB range that we are + * requesting. * * We always favor 4k entries over 2MiB entries. There isn't a flow where we * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB @@ -276,18 +334,21 @@ restart: spin_lock_irq(&mapping->tree_lock); entry = get_unlocked_mapping_entry(mapping, index, &slot); + if (WARN_ON_ONCE(entry && !radix_tree_exceptional_entry(entry))) { + entry = ERR_PTR(-EIO); + goto out_unlock; + } + if (entry) { if (size_flag & RADIX_DAX_PMD) { - if (!radix_tree_exceptional_entry(entry) || - dax_is_pte_entry(entry)) { + if (dax_is_pte_entry(entry)) { put_unlocked_mapping_entry(mapping, index, entry); entry = ERR_PTR(-EEXIST); goto out_unlock; } } else { /* trying to grab a PTE entry */ - if (radix_tree_exceptional_entry(entry) && - dax_is_pmd_entry(entry) && + if (dax_is_pmd_entry(entry) && (dax_is_zero_entry(entry) || dax_is_empty_entry(entry))) { pmd_downgrade = true; @@ -321,7 +382,7 @@ restart: mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); if (err) { if (pmd_downgrade) - put_locked_mapping_entry(mapping, index, entry); + put_locked_mapping_entry(mapping, index); return ERR_PTR(err); } spin_lock_irq(&mapping->tree_lock); @@ -371,52 +432,12 @@ restart: spin_unlock_irq(&mapping->tree_lock); return entry; } - /* Normal page in radix tree? 
*/ - if (!radix_tree_exceptional_entry(entry)) { - struct page *page = entry; - - get_page(page); - spin_unlock_irq(&mapping->tree_lock); - lock_page(page); - /* Page got truncated? Retry... */ - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - put_page(page); - goto restart; - } - return page; - } entry = lock_slot(mapping, slot); out_unlock: spin_unlock_irq(&mapping->tree_lock); return entry; } -/* - * We do not necessarily hold the mapping->tree_lock when we call this - * function so it is possible that 'entry' is no longer a valid item in the - * radix tree. This is okay because all we really need to do is to find the - * correct waitqueue where tasks might be waiting for that old 'entry' and - * wake them. - */ -void dax_wake_mapping_entry_waiter(struct address_space *mapping, - pgoff_t index, void *entry, bool wake_all) -{ - struct exceptional_entry_key key; - wait_queue_head_t *wq; - - wq = dax_entry_waitqueue(mapping, index, entry, &key); - - /* - * Checking for locked entry and prepare_to_wait_exclusive() happens - * under mapping->tree_lock, ditto for entry handling in our callers. - * So at this point all tasks that could have seen our entry locked - * must be in the waitqueue and the following check will see them. - */ - if (waitqueue_active(wq)) - __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); -} - static int __dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index, bool trunc) { @@ -426,7 +447,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping, spin_lock_irq(&mapping->tree_lock); entry = get_unlocked_mapping_entry(mapping, index, NULL); - if (!entry || !radix_tree_exceptional_entry(entry)) + if (!entry || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry))) goto out; if (!trunc && (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || @@ -468,50 +489,6 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping, return __dax_invalidate_mapping_entry(mapping, index, false); } -/* - * The user has performed a load from a hole in the file. Allocating - * a new page in the file would cause excessive storage usage for - * workloads with sparse files. We allocate a page cache page instead. - * We'll kick it out of the page cache if it's ever written to, - * otherwise it will simply fall out of the page cache under memory - * pressure without ever having been dirtied. - */ -static int dax_load_hole(struct address_space *mapping, void **entry, - struct vm_fault *vmf) -{ - struct inode *inode = mapping->host; - struct page *page; - int ret; - - /* Hole page already exists? Return it... 
*/ - if (!radix_tree_exceptional_entry(*entry)) { - page = *entry; - goto finish_fault; - } - - /* This will replace locked radix tree entry with a hole page */ - page = find_or_create_page(mapping, vmf->pgoff, - vmf->gfp_mask | __GFP_ZERO); - if (!page) { - ret = VM_FAULT_OOM; - goto out; - } - -finish_fault: - vmf->page = page; - ret = finish_fault(vmf); - vmf->page = NULL; - *entry = page; - if (!ret) { - /* Grab reference for PTE that is now referencing the page */ - get_page(page); - ret = VM_FAULT_NOPAGE; - } -out: - trace_dax_load_hole(inode, vmf, ret); - return ret; -} - static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, sector_t sector, size_t size, struct page *to, unsigned long vaddr) @@ -552,47 +529,27 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, unsigned long flags) { struct radix_tree_root *page_tree = &mapping->page_tree; - int error = 0; - bool hole_fill = false; void *new_entry; pgoff_t index = vmf->pgoff; if (vmf->flags & FAULT_FLAG_WRITE) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - /* Replacing hole page with block mapping? */ - if (!radix_tree_exceptional_entry(entry)) { - hole_fill = true; - /* - * Unmap the page now before we remove it from page cache below. - * The page is locked so it cannot be faulted in again. - */ - unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, - PAGE_SIZE, 0); - error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM); - if (error) - return ERR_PTR(error); - } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) { - /* replacing huge zero page with PMD block mapping */ - unmap_mapping_range(mapping, - (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); + if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { + /* we are replacing a zero page with block mapping */ + if (dax_is_pmd_entry(entry)) + unmap_mapping_range(mapping, + (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, + PMD_SIZE, 0); + else /* pte entry */ + unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, + PAGE_SIZE, 0); } spin_lock_irq(&mapping->tree_lock); new_entry = dax_radix_locked_entry(sector, flags); - if (hole_fill) { - __delete_from_page_cache(entry, NULL); - /* Drop pagecache reference */ - put_page(entry); - error = __radix_tree_insert(page_tree, index, - dax_radix_order(new_entry), new_entry); - if (error) { - new_entry = ERR_PTR(error); - goto unlock; - } - mapping->nrexceptional++; - } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { + if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { /* * Only swap our new entry into the radix tree if the current * entry is a zero page or an empty entry. If a normal PTE or @@ -609,23 +566,14 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, WARN_ON_ONCE(ret != entry); __radix_tree_replace(page_tree, node, slot, new_entry, NULL, NULL); + entry = new_entry; } + if (vmf->flags & FAULT_FLAG_WRITE) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); - unlock: + spin_unlock_irq(&mapping->tree_lock); - if (hole_fill) { - radix_tree_preload_end(); - /* - * We don't need hole page anymore, it has been replaced with - * locked radix tree entry now. 
- */ - if (mapping->a_ops->freepage) - mapping->a_ops->freepage(entry); - unlock_page(entry); - put_page(entry); - } - return new_entry; + return entry; } static inline unsigned long @@ -646,11 +594,10 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, pte_t pte, *ptep = NULL; pmd_t *pmdp = NULL; spinlock_t *ptl; - bool changed; i_mmap_lock_read(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { - unsigned long address; + unsigned long address, start, end; cond_resched(); @@ -658,8 +605,13 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, continue; address = pgoff_address(index, vma); - changed = false; - if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl)) + + /* + * Note because we provide start/end to follow_pte_pmd it will + * call mmu_notifier_invalidate_range_start() on our behalf + * before taking any lock. + */ + if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl)) continue; if (pmdp) { @@ -676,7 +628,7 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, pmd = pmd_wrprotect(pmd); pmd = pmd_mkclean(pmd); set_pmd_at(vma->vm_mm, address, pmdp, pmd); - changed = true; + mmu_notifier_invalidate_range(vma->vm_mm, start, end); unlock_pmd: spin_unlock(ptl); #endif @@ -691,13 +643,12 @@ unlock_pmd: pte = pte_wrprotect(pte); pte = pte_mkclean(pte); set_pte_at(vma->vm_mm, address, ptep, pte); - changed = true; + mmu_notifier_invalidate_range(vma->vm_mm, start, end); unlock_pte: pte_unmap_unlock(ptep, ptl); } - if (changed) - mmu_notifier_invalidate_page(vma->vm_mm, address); + mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); } i_mmap_unlock_read(mapping); } @@ -724,7 +675,7 @@ static int dax_writeback_one(struct block_device *bdev, spin_lock_irq(&mapping->tree_lock); entry2 = get_unlocked_mapping_entry(mapping, index, &slot); /* Entry got punched out / reallocated? */ - if (!entry2 || !radix_tree_exceptional_entry(entry2)) + if (!entry2 || WARN_ON_ONCE(!radix_tree_exceptional_entry(entry2))) goto put_unlocked; /* * Entry got reallocated elsewhere? No need to writeback. We have to @@ -783,7 +734,7 @@ static int dax_writeback_one(struct block_device *bdev, } dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn)); - dax_flush(dax_dev, pgoff, kaddr, size); + dax_flush(dax_dev, kaddr, size); /* * After we have flushed the cache, we can clear the dirty tag. 
There * cannot be new dirty data in the pfn after the flush has completed as @@ -796,7 +747,7 @@ static int dax_writeback_one(struct block_device *bdev, trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); dax_unlock: dax_read_unlock(id); - put_locked_mapping_entry(mapping, index, entry); + put_locked_mapping_entry(mapping, index); return ret; put_unlocked: @@ -871,11 +822,10 @@ EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); static int dax_insert_mapping(struct address_space *mapping, struct block_device *bdev, struct dax_device *dax_dev, - sector_t sector, size_t size, void **entryp, + sector_t sector, size_t size, void *entry, struct vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = vmf->address; - void *entry = *entryp; void *ret, *kaddr; pgoff_t pgoff; int id, rc; @@ -896,47 +846,48 @@ static int dax_insert_mapping(struct address_space *mapping, ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0); if (IS_ERR(ret)) return PTR_ERR(ret); - *entryp = ret; trace_dax_insert_mapping(mapping->host, vmf, ret); - return vm_insert_mixed(vma, vaddr, pfn); + if (vmf->flags & FAULT_FLAG_WRITE) + return vm_insert_mixed_mkwrite(vma, vaddr, pfn); + else + return vm_insert_mixed(vma, vaddr, pfn); } -/** - * dax_pfn_mkwrite - handle first write to DAX page - * @vmf: The description of the fault +/* + * The user has performed a load from a hole in the file. Allocating a new + * page in the file would cause excessive storage usage for workloads with + * sparse files. Instead we insert a read-only mapping of the 4k zero page. + * If this page is ever written to we will re-fault and change the mapping to + * point to real DAX storage instead. */ -int dax_pfn_mkwrite(struct vm_fault *vmf) +static int dax_load_hole(struct address_space *mapping, void *entry, + struct vm_fault *vmf) { - struct file *file = vmf->vma->vm_file; - struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - void *entry, **slot; - pgoff_t index = vmf->pgoff; + unsigned long vaddr = vmf->address; + int ret = VM_FAULT_NOPAGE; + struct page *zero_page; + void *entry2; - spin_lock_irq(&mapping->tree_lock); - entry = get_unlocked_mapping_entry(mapping, index, &slot); - if (!entry || !radix_tree_exceptional_entry(entry)) { - if (entry) - put_unlocked_mapping_entry(mapping, index, entry); - spin_unlock_irq(&mapping->tree_lock); - trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE); - return VM_FAULT_NOPAGE; + zero_page = ZERO_PAGE(0); + if (unlikely(!zero_page)) { + ret = VM_FAULT_OOM; + goto out; } - radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); - entry = lock_slot(mapping, slot); - spin_unlock_irq(&mapping->tree_lock); - /* - * If we race with somebody updating the PTE and finish_mkwrite_fault() - * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry - * the fault in either case. 
- */ - finish_mkwrite_fault(vmf); - put_locked_mapping_entry(mapping, index, entry); - trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE); - return VM_FAULT_NOPAGE; + + entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0, + RADIX_DAX_ZERO_PAGE); + if (IS_ERR(entry2)) { + ret = VM_FAULT_SIGBUS; + goto out; + } + + vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page)); +out: + trace_dax_load_hole(inode, vmf, ret); + return ret; } -EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); static bool dax_range_is_aligned(struct block_device *bdev, unsigned int offset, unsigned int length) @@ -978,7 +929,7 @@ int __dax_zero_page_range(struct block_device *bdev, return rc; } memset(kaddr + offset, 0, size); - dax_flush(dax_dev, pgoff, kaddr + offset, size); + dax_flush(dax_dev, kaddr + offset, size); dax_read_unlock(id); } return 0; @@ -1056,6 +1007,11 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (map_len > end - pos) map_len = end - pos; + /* + * The userspace address for the memory copy has already been + * validated via access_ok() in either vfs_read() or + * vfs_write(), depending on which operation we are doing. + */ if (iov_iter_rw(iter) == WRITE) map_len = dax_copy_from_iter(dax_dev, pgoff, kaddr, map_len, iter); @@ -1220,7 +1176,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, major = VM_FAULT_MAJOR; } error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, - sector, PAGE_SIZE, &entry, vmf->vma, vmf); + sector, PAGE_SIZE, entry, vmf->vma, vmf); /* -EBUSY is fine, somebody else faulted on the same PTE */ if (error == -EBUSY) error = 0; @@ -1228,7 +1184,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (!(vmf->flags & FAULT_FLAG_WRITE)) { - vmf_ret = dax_load_hole(mapping, &entry, vmf); + vmf_ret = dax_load_hole(mapping, entry, vmf); goto finish_iomap; } /*FALLTHRU*/ @@ -1255,21 +1211,15 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); } unlock_entry: - put_locked_mapping_entry(mapping, vmf->pgoff, entry); + put_locked_mapping_entry(mapping, vmf->pgoff); out: trace_dax_pte_fault_done(inode, vmf, vmf_ret); return vmf_ret; } #ifdef CONFIG_FS_DAX_PMD -/* - * The 'colour' (ie low bits) within a PMD of a page offset. This comes up - * more often than one might expect in the below functions. 
- */ -#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) - static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, - loff_t pos, void **entryp) + loff_t pos, void *entry) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; const sector_t sector = dax_iomap_sector(iomap, pos); @@ -1280,7 +1230,7 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, void *ret = NULL, *kaddr; long length = 0; pgoff_t pgoff; - pfn_t pfn; + pfn_t pfn = {}; int id; if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0) @@ -1300,11 +1250,10 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, goto unlock_fallback; dax_read_unlock(id); - ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector, + ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, RADIX_DAX_PMD); if (IS_ERR(ret)) goto fallback; - *entryp = ret; trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret); return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, @@ -1318,7 +1267,7 @@ fallback: } static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, - void **entryp) + void *entry) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; unsigned long pmd_addr = vmf->address & PMD_MASK; @@ -1333,11 +1282,10 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, if (unlikely(!zero_page)) goto fallback; - ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0, - RADIX_DAX_PMD | RADIX_DAX_HZP); + ret = dax_insert_mapping_entry(mapping, vmf, entry, 0, + RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE); if (IS_ERR(ret)) goto fallback; - *entryp = ret; ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); if (!pmd_none(*(vmf->pmd))) { @@ -1413,10 +1361,10 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, goto fallback; /* - * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX - * PMD or a HZP entry. If it can't (because a 4k page is already in - * the tree, for instance), it will return -EEXIST and we just fall - * back to 4k entries. + * grab_mapping_entry() will make sure we get a 2MiB empty entry, a + * 2MiB zero page entry or a DAX PMD. If it can't (because a 4k page + * is already in the tree, for instance), it will return -EEXIST and + * we just fall back to 4k entries. 
*/ entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); if (IS_ERR(entry)) @@ -1449,13 +1397,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, switch (iomap.type) { case IOMAP_MAPPED: - result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry); + result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry); break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (WARN_ON_ONCE(write)) break; - result = dax_pmd_load_hole(vmf, &iomap, &entry); + result = dax_pmd_load_hole(vmf, &iomap, entry); break; default: WARN_ON_ONCE(1); @@ -1478,7 +1426,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, &iomap); } unlock_entry: - put_locked_mapping_entry(mapping, pgoff, entry); + put_locked_mapping_entry(mapping, pgoff); fallback: if (result == VM_FAULT_FALLBACK) { split_huge_pmd(vma, vmf->pmd, vmf->address); diff --git a/fs/direct-io.c b/fs/direct-io.c index 08cf27811e5a..5fa2211e49ae 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -111,7 +111,7 @@ struct dio { int op; int op_flags; blk_qc_t bio_cookie; - struct block_device *bio_bdev; + struct gendisk *bio_disk; struct inode *inode; loff_t i_size; /* i_size when submitted */ dio_iodone_t *end_io; /* IO completion function */ @@ -377,7 +377,7 @@ dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, */ bio = bio_alloc(GFP_KERNEL, nr_vecs); - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = first_sector; bio_set_op_attrs(bio, dio->op, dio->op_flags); if (dio->is_async) @@ -412,7 +412,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) bio_set_pages_dirty(bio); - dio->bio_bdev = bio->bi_bdev; + dio->bio_disk = bio->bi_disk; if (sdio->submit_io) { sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio); @@ -458,7 +458,7 @@ static struct bio *dio_await_one(struct dio *dio) dio->waiter = current; spin_unlock_irqrestore(&dio->bio_lock, flags); if (!(dio->iocb->ki_flags & IOCB_HIPRI) || - !blk_mq_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie)) + !blk_mq_poll(dio->bio_disk->queue, dio->bio_cookie)) io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index ca7089aeadab..fa08448e35dd 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -68,7 +68,7 @@ static void print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb, if (lkb->lkb_wait_type) seq_printf(s, " wait_type: %d", lkb->lkb_wait_type); - seq_puts(s, "\n"); + seq_putc(s, '\n'); } static void print_format1(struct dlm_rsb *res, struct seq_file *s) @@ -111,7 +111,7 @@ static void print_format1(struct dlm_rsb *res, struct seq_file *s) } if (rsb_flag(res, RSB_VALNOTVALID)) seq_puts(s, " (INVALID)"); - seq_puts(s, "\n"); + seq_putc(s, '\n'); if (seq_has_overflowed(s)) goto out; } @@ -156,7 +156,7 @@ static void print_format1(struct dlm_rsb *res, struct seq_file *s) lkb->lkb_id, print_lockmode(lkb->lkb_rqmode)); if (lkb->lkb_wait_type) seq_printf(s, " wait_type: %d", lkb->lkb_wait_type); - seq_puts(s, "\n"); + seq_putc(s, '\n'); if (seq_has_overflowed(s)) goto out; } @@ -287,7 +287,7 @@ static void print_format3(struct dlm_rsb *r, struct seq_file *s) else seq_printf(s, " %02x", (unsigned char)r->res_name[i]); } - seq_puts(s, "\n"); + seq_putc(s, '\n'); if (seq_has_overflowed(s)) goto out; @@ -298,7 +298,7 @@ static void print_format3(struct dlm_rsb *r, struct seq_file *s) for (i = 0; i < lvblen; i++) seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]); - seq_puts(s, 
"\n"); + seq_putc(s, '\n'); if (seq_has_overflowed(s)) goto out; @@ -361,8 +361,7 @@ static void print_format4(struct dlm_rsb *r, struct seq_file *s) else seq_printf(s, " %02x", (unsigned char)r->res_name[i]); } - seq_puts(s, "\n"); - + seq_putc(s, '\n'); unlock_rsb(r); } @@ -436,7 +435,7 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos) if (bucket >= ls->ls_rsbtbl_size) return NULL; - ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_NOFS); + ri = kzalloc(sizeof(*ri), GFP_NOFS); if (!ri) return NULL; if (n == 0) @@ -742,7 +741,7 @@ void dlm_delete_debug_file(struct dlm_ls *ls) int dlm_create_debug_file(struct dlm_ls *ls) { - char name[DLM_LOCKSPACE_LEN+8]; + char name[DLM_LOCKSPACE_LEN + 8]; /* format 1 */ @@ -757,7 +756,7 @@ int dlm_create_debug_file(struct dlm_ls *ls) /* format 2 */ memset(name, 0, sizeof(name)); - snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_locks", ls->ls_name); + snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_locks", ls->ls_name); ls->ls_debug_locks_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, @@ -770,7 +769,7 @@ int dlm_create_debug_file(struct dlm_ls *ls) /* format 3 */ memset(name, 0, sizeof(name)); - snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_all", ls->ls_name); + snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_all", ls->ls_name); ls->ls_debug_all_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, @@ -783,7 +782,7 @@ int dlm_create_debug_file(struct dlm_ls *ls) /* format 4 */ memset(name, 0, sizeof(name)); - snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_toss", ls->ls_name); + snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_toss", ls->ls_name); ls->ls_debug_toss_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, @@ -794,7 +793,7 @@ int dlm_create_debug_file(struct dlm_ls *ls) goto fail; memset(name, 0, sizeof(name)); - snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name); + snprintf(name, DLM_LOCKSPACE_LEN + 8, "%s_waiters", ls->ls_name); ls->ls_debug_waiters_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO, diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 6df332296c66..d4aaddec1b16 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1426,7 +1426,7 @@ void dlm_scan_waiters(struct dlm_ls *ls) if (!num_nodes) { num_nodes = ls->ls_num_nodes; - warned = kzalloc(num_nodes * sizeof(int), GFP_KERNEL); + warned = kcalloc(num_nodes, sizeof(int), GFP_KERNEL); } if (!warned) continue; @@ -5119,11 +5119,9 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls) int wait_type, stub_unlock_result, stub_cancel_result; int dir_nodeid; - ms_stub = kmalloc(sizeof(struct dlm_message), GFP_KERNEL); - if (!ms_stub) { - log_error(ls, "dlm_recover_waiters_pre no mem"); + ms_stub = kmalloc(sizeof(*ms_stub), GFP_KERNEL); + if (!ms_stub) return; - } mutex_lock(&ls->ls_waiters_mutex); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 91592b75c309..78a7c855b06b 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -235,7 +235,7 @@ static int dlm_uevent(struct kset *kset, struct kobject *kobj, return 0; } -static struct kset_uevent_ops dlm_uevent_ops = { +static const struct kset_uevent_ops dlm_uevent_ops = { .uevent = dlm_uevent, }; @@ -453,9 +453,14 @@ static int new_lockspace(const char *name, const char *cluster, *ops_result = 0; } + if (!cluster) + log_print("dlm cluster name '%s' is being used without an application provided cluster name", + dlm_config.ci_cluster_name); + if (dlm_config.ci_recover_callbacks && cluster && strncmp(cluster, dlm_config.ci_cluster_name, DLM_LOCKSPACE_LEN)) { - log_print("dlm cluster name %s mismatch %s", + log_print("dlm cluster name '%s' 
does not match " + "the application cluster name '%s'", dlm_config.ci_cluster_name, cluster); error = -EBADR; goto out; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 9382db998ec9..4813d0e0cd9b 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -729,7 +729,7 @@ static int tcp_accept_from_sock(struct connection *con) mutex_unlock(&connections_lock); memset(&peeraddr, 0, sizeof(peeraddr)); - result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + result = sock_create_lite(dlm_local_addr[0]->ss_family, SOCK_STREAM, IPPROTO_TCP, &newsock); if (result < 0) return -ENOMEM; diff --git a/fs/dlm/member.c b/fs/dlm/member.c index 9c47f1c14a8b..3fda3832cf6a 100644 --- a/fs/dlm/member.c +++ b/fs/dlm/member.c @@ -217,8 +217,7 @@ int dlm_slots_assign(struct dlm_ls *ls, int *num_slots, int *slots_size, } array_size = max + need; - - array = kzalloc(array_size * sizeof(struct dlm_slot), GFP_NOFS); + array = kcalloc(array_size, sizeof(*array), GFP_NOFS); if (!array) return -ENOMEM; @@ -319,7 +318,7 @@ static int dlm_add_member(struct dlm_ls *ls, struct dlm_config_node *node) struct dlm_member *memb; int error; - memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS); + memb = kzalloc(sizeof(*memb), GFP_NOFS); if (!memb) return -ENOMEM; @@ -405,8 +404,7 @@ static void make_member_array(struct dlm_ls *ls) } ls->ls_total_weight = total; - - array = kmalloc(sizeof(int) * total, GFP_NOFS); + array = kmalloc_array(total, sizeof(*array), GFP_NOFS); if (!array) return; @@ -492,8 +490,7 @@ void dlm_lsop_recover_done(struct dlm_ls *ls) return; num = ls->ls_num_nodes; - - slots = kzalloc(num * sizeof(struct dlm_slot), GFP_KERNEL); + slots = kcalloc(num, sizeof(*slots), GFP_KERNEL); if (!slots) return; @@ -673,11 +670,11 @@ int dlm_ls_stop(struct dlm_ls *ls) int dlm_ls_start(struct dlm_ls *ls) { - struct dlm_recover *rv = NULL, *rv_old; + struct dlm_recover *rv, *rv_old; struct dlm_config_node *nodes; int error, count; - rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS); + rv = kzalloc(sizeof(*rv), GFP_NOFS); if (!rv) return -ENOMEM; diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index d401425f602a..e631b1689228 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -367,7 +367,7 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file, locks_init_lock(fl); fl->fl_type = (op->info.ex) ? 
F_WRLCK : F_RDLCK; fl->fl_flags = FL_POSIX; - fl->fl_pid = op->info.pid; + fl->fl_pid = -op->info.pid; fl->fl_start = op->info.start; fl->fl_end = op->info.end; rv = 0; diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 23488f559cf9..d18e7a539f11 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -123,6 +123,8 @@ static void compat_input(struct dlm_write_request *kb, static void compat_output(struct dlm_lock_result *res, struct dlm_lock_result32 *res32) { + memset(res32, 0, sizeof(*res32)); + res32->version[0] = res->version[0]; res32->version[1] = res->version[1]; res32->version[2] = res->version[2]; @@ -355,6 +357,10 @@ static int dlm_device_register(struct dlm_ls *ls, char *name) error = misc_register(&ls->ls_device); if (error) { kfree(ls->ls_device.name); + /* this has to be set to NULL + * to avoid a double-free in dlm_device_deregister + */ + ls->ls_device.name = NULL; } fail: return error; diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index ca4e83750214..c74ed3ca3372 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -328,7 +328,7 @@ ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { int rc; - rc = filemap_write_and_wait(file->f_mapping); + rc = file_write_and_wait(file); if (rc) return rc; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index e767e4389cb1..2fabd19cdeea 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -205,7 +205,7 @@ struct eventpoll { struct list_head rdllist; /* RB tree root used to store monitored fd structs */ - struct rb_root rbr; + struct rb_root_cached rbr; /* * This is a single linked list that chains all the "struct epitem" that @@ -600,8 +600,13 @@ static void ep_remove_wait_queue(struct eppoll_entry *pwq) wait_queue_head_t *whead; rcu_read_lock(); - /* If it is cleared by POLLFREE, it should be rcu-safe */ - whead = rcu_dereference(pwq->whead); + /* + * If it is cleared by POLLFREE, it should be rcu-safe. + * If we read NULL we need a barrier paired with + * smp_store_release() in ep_poll_callback(), otherwise + * we rely on whead->lock. + */ + whead = smp_load_acquire(&pwq->whead); if (whead) remove_wait_queue(whead, &pwq->wait); rcu_read_unlock(); @@ -791,7 +796,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) list_del_rcu(&epi->fllink); spin_unlock(&file->f_lock); - rb_erase(&epi->rbn, &ep->rbr); + rb_erase_cached(&epi->rbn, &ep->rbr); spin_lock_irqsave(&ep->lock, flags); if (ep_is_linked(&epi->rdllink)) @@ -835,7 +840,7 @@ static void ep_free(struct eventpoll *ep) /* * Walks through the whole tree by unregistering poll callbacks. */ - for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); ep_unregister_pollwait(ep, epi); @@ -851,7 +856,7 @@ static void ep_free(struct eventpoll *ep) * a lockdep warning. 
*/ mutex_lock(&ep->mtx); - while ((rbp = rb_first(&ep->rbr)) != NULL) { + while ((rbp = rb_first_cached(&ep->rbr)) != NULL) { epi = rb_entry(rbp, struct epitem, rbn); ep_remove(ep, epi); cond_resched(); @@ -958,7 +963,7 @@ static void ep_show_fdinfo(struct seq_file *m, struct file *f) struct rb_node *rbp; mutex_lock(&ep->mtx); - for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { struct epitem *epi = rb_entry(rbp, struct epitem, rbn); struct inode *inode = file_inode(epi->ffd.file); @@ -1035,7 +1040,7 @@ static int ep_alloc(struct eventpoll **pep) init_waitqueue_head(&ep->wq); init_waitqueue_head(&ep->poll_wait); INIT_LIST_HEAD(&ep->rdllist); - ep->rbr = RB_ROOT; + ep->rbr = RB_ROOT_CACHED; ep->ovflist = EP_UNACTIVE_PTR; ep->user = user; @@ -1061,7 +1066,7 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) struct epoll_filefd ffd; ep_set_ffd(&ffd, file, fd); - for (rbp = ep->rbr.rb_node; rbp; ) { + for (rbp = ep->rbr.rb_root.rb_node; rbp; ) { epi = rb_entry(rbp, struct epitem, rbn); kcmp = ep_cmp_ffd(&ffd, &epi->ffd); if (kcmp > 0) @@ -1083,7 +1088,7 @@ static struct epitem *ep_find_tfd(struct eventpoll *ep, int tfd, unsigned long t struct rb_node *rbp; struct epitem *epi; - for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (epi->ffd.fd == tfd) { if (toff == 0) @@ -1134,17 +1139,6 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v struct eventpoll *ep = epi->ep; int ewake = 0; - if ((unsigned long)key & POLLFREE) { - ep_pwq_from_wait(wait)->whead = NULL; - /* - * whead = NULL above can race with ep_remove_wait_queue() - * which can do another remove_wait_queue() after us, so we - * can't use __remove_wait_queue(). whead->lock is held by - * the caller. - */ - list_del_init(&wait->entry); - } - spin_lock_irqsave(&ep->lock, flags); ep_set_busy_poll_napi_id(epi); @@ -1228,10 +1222,26 @@ out_unlock: if (pwake) ep_poll_safewake(&ep->poll_wait); - if (epi->event.events & EPOLLEXCLUSIVE) - return ewake; + if (!(epi->event.events & EPOLLEXCLUSIVE)) + ewake = 1; - return 1; + if ((unsigned long)key & POLLFREE) { + /* + * If we race with ep_remove_wait_queue() it can miss + * ->whead = NULL and do another remove_wait_queue() after + * us, so we can't use __remove_wait_queue(). + */ + list_del_init(&wait->entry); + /* + * ->whead != NULL protects us from the race with ep_free() + * or ep_remove(), ep_remove_wait_queue() takes whead->lock + * held by the caller. Once we nullify it, nothing protects + * ep/epi or even wait. 
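+ * The smp_store_release() below pairs with the smp_load_acquire() of pwq->whead in ep_remove_wait_queue() above.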
+ */ + smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL); + } + + return ewake; } /* @@ -1263,20 +1273,22 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) { int kcmp; - struct rb_node **p = &ep->rbr.rb_node, *parent = NULL; + struct rb_node **p = &ep->rbr.rb_root.rb_node, *parent = NULL; struct epitem *epic; + bool leftmost = true; while (*p) { parent = *p; epic = rb_entry(parent, struct epitem, rbn); kcmp = ep_cmp_ffd(&epi->ffd, &epic->ffd); - if (kcmp > 0) + if (kcmp > 0) { p = &parent->rb_right; - else + leftmost = false; + } else p = &parent->rb_left; } rb_link_node(&epi->rbn, parent, p); - rb_insert_color(&epi->rbn, &ep->rbr); + rb_insert_color_cached(&epi->rbn, &ep->rbr, leftmost); } @@ -1520,7 +1532,7 @@ error_remove_epi: list_del_rcu(&epi->fllink); spin_unlock(&tfile->f_lock); - rb_erase(&epi->rbn, &ep->rbr); + rb_erase_cached(&epi->rbn, &ep->rbr); error_unregister: ep_unregister_pollwait(ep, epi); @@ -1868,7 +1880,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) mutex_lock_nested(&ep->mtx, call_nests + 1); ep->visited = 1; list_add(&ep->visited_list_link, &visited_list); - for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { + for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (unlikely(is_file_epoll(epi->ffd.file))) { ep_tovisit = epi->ffd.file->private_data; diff --git a/fs/exec.c b/fs/exec.c index 15fb4d56cc43..69a543259aa5 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1240,6 +1240,12 @@ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) perf_event_comm(tsk, exec); } +/* + * Calling this is the point of no return. None of the failures will be + * seen by userspace since either the process is already taking a fatal + * signal (via de_thread() or coredump), or will have SEGV raised + * (after exec_mmap()) by search_binary_handlers (see below). + */ int flush_old_exec(struct linux_binprm * bprm) { int retval; @@ -1267,7 +1273,13 @@ int flush_old_exec(struct linux_binprm * bprm) if (retval) goto out; - bprm->mm = NULL; /* We're using it now */ + /* + * After clearing bprm->mm (to mark that current is using the + * prepared mm now), we have nothing left of the original + * process. If anything from here on returns an error, the check + * in search_binary_handler() will SEGV current. + */ + bprm->mm = NULL; set_fs(USER_DS); current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | @@ -1312,15 +1324,38 @@ EXPORT_SYMBOL(would_dump); void setup_new_exec(struct linux_binprm * bprm) { + /* + * Once here, prepare_binrpm() will not be called any more, so + * the final state of setuid/setgid/fscaps can be merged into the + * secureexec flag. + */ + bprm->secureexec |= bprm->cap_elevated; + + if (bprm->secureexec) { + /* Make sure parent cannot signal privileged process. */ + current->pdeath_signal = 0; + + /* + * For secureexec, reset the stack limit to sane default to + * avoid bad behavior from the prior rlimits. This has to + * happen before arch_pick_mmap_layout(), which examines + * RLIMIT_STACK, but after the point of no return to avoid + * needing to clean up the change on failure. 
+ */ + if (current->signal->rlim[RLIMIT_STACK].rlim_cur > _STK_LIM) + current->signal->rlim[RLIMIT_STACK].rlim_cur = _STK_LIM; + } + arch_pick_mmap_layout(current->mm); - /* This is the point of no return */ current->sas_ss_sp = current->sas_ss_size = 0; - if (uid_eq(current_euid(), current_uid()) && gid_eq(current_egid(), current_gid())) - set_dumpable(current->mm, SUID_DUMP_USER); - else + /* Figure out dumpability. */ + if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP || + bprm->secureexec) set_dumpable(current->mm, suid_dumpable); + else + set_dumpable(current->mm, SUID_DUMP_USER); arch_setup_new_exec(); perf_event_exec(); @@ -1332,15 +1367,6 @@ void setup_new_exec(struct linux_binprm * bprm) */ current->mm->task_size = TASK_SIZE; - /* install the new credentials */ - if (!uid_eq(bprm->cred->uid, current_euid()) || - !gid_eq(bprm->cred->gid, current_egid())) { - current->pdeath_signal = 0; - } else { - if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) - set_dumpable(current->mm, suid_dumpable); - } - /* An exec changes our domain. We are no longer part of the thread group */ current->self_exec_id++; @@ -1530,7 +1556,7 @@ int prepare_binprm(struct linux_binprm *bprm) retval = security_bprm_set_creds(bprm); if (retval) return retval; - bprm->cred_prepared = 1; + bprm->called_set_creds = 1; memset(bprm->buf, 0, BINPRM_BUF_SIZE); return kernel_read(bprm->file, bprm->buf, BINPRM_BUF_SIZE, &pos); @@ -1719,9 +1745,9 @@ static int do_execveat_common(int fd, struct filename *filename, bprm->filename = filename->name; } else { if (filename->name[0] == '\0') - pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d", fd); + pathbuf = kasprintf(GFP_KERNEL, "/dev/fd/%d", fd); else - pathbuf = kasprintf(GFP_TEMPORARY, "/dev/fd/%d/%s", + pathbuf = kasprintf(GFP_KERNEL, "/dev/fd/%d/%s", fd, filename->name); if (!pathbuf) { retval = -ENOMEM; diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 28645f0640f7..a94594ea2aa3 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -48,7 +48,7 @@ static int exofs_file_fsync(struct file *filp, loff_t start, loff_t end, struct inode *inode = filp->f_mapping->host; int ret; - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = file_write_and_wait_range(filp, start, end); if (ret) return ret; diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index 8bb72807e70d..3c6a9c156b7a 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -869,7 +869,7 @@ static int _write_mirror(struct ore_io_state *ios, int cur_comp) goto out; } - bio->bi_bdev = NULL; + bio->bi_disk = NULL; bio->bi_next = NULL; per_dev->offset = master_dev->offset; per_dev->length = master_dev->length; diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 23ebb92484c6..28de3edd4f4d 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -114,6 +114,7 @@ struct ext2_sb_info { */ spinlock_t s_lock; struct mb_cache *s_ea_block_cache; + struct dax_device *s_daxdev; }; static inline spinlock_t * diff --git a/fs/ext2/file.c b/fs/ext2/file.c index d34d32bdc944..ff3a3636a5ca 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -107,29 +107,6 @@ static int ext2_dax_fault(struct vm_fault *vmf) return ret; } -static int ext2_dax_pfn_mkwrite(struct vm_fault *vmf) -{ - struct inode *inode = file_inode(vmf->vma->vm_file); - struct ext2_inode_info *ei = EXT2_I(inode); - loff_t size; - int ret; - - sb_start_pagefault(inode->i_sb); - file_update_time(vmf->vma->vm_file); - down_read(&ei->dax_sem); - - /* check that the faulting page hasn't raced with truncate */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> 
PAGE_SHIFT; - if (vmf->pgoff >= size) - ret = VM_FAULT_SIGBUS; - else - ret = dax_pfn_mkwrite(vmf); - - up_read(&ei->dax_sem); - sb_end_pagefault(inode->i_sb); - return ret; -} - static const struct vm_operations_struct ext2_dax_vm_ops = { .fault = ext2_dax_fault, /* @@ -138,7 +115,7 @@ static const struct vm_operations_struct ext2_dax_vm_ops = { * will always fail and fail back to regular faults. */ .page_mkwrite = ext2_dax_fault, - .pfn_mkwrite = ext2_dax_pfn_mkwrite, + .pfn_mkwrite = ext2_dax_fault, }; static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 30163d007b2f..4dca6f348714 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -800,10 +800,10 @@ int ext2_get_block(struct inode *inode, sector_t iblock, static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap) { - struct block_device *bdev; unsigned int blkbits = inode->i_blkbits; unsigned long first_block = offset >> blkbits; unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits; + struct ext2_sb_info *sbi = EXT2_SB(inode->i_sb); bool new = false, boundary = false; u32 bno; int ret; @@ -814,13 +814,9 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, return ret; iomap->flags = 0; - bdev = inode->i_sb->s_bdev; - iomap->bdev = bdev; + iomap->bdev = inode->i_sb->s_bdev; iomap->offset = (u64)first_block << blkbits; - if (blk_queue_dax(bdev->bd_queue)) - iomap->dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name); - else - iomap->dax_dev = NULL; + iomap->dax_dev = sbi->s_daxdev; if (ret == 0) { iomap->type = IOMAP_HOLE; @@ -842,7 +838,6 @@ static int ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { - fs_put_dax(iomap->dax_dev); if (iomap->type == IOMAP_MAPPED && written < length && (flags & IOMAP_WRITE)) diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 7b1bc9059863..fc18edd81815 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -171,6 +171,7 @@ static void ext2_put_super (struct super_block * sb) brelse (sbi->s_sbh); sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); + fs_put_dax(sbi->s_daxdev); kfree(sbi); } @@ -813,6 +814,7 @@ static unsigned long descriptor_loc(struct super_block *sb, static int ext2_fill_super(struct super_block *sb, void *data, int silent) { + struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev); struct buffer_head * bh; struct ext2_sb_info * sbi; struct ext2_super_block * es; @@ -842,6 +844,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) } sb->s_fs_info = sbi; sbi->s_sb_block = sb_block; + sbi->s_daxdev = dax_dev; spin_lock_init(&sbi->s_lock); @@ -1200,6 +1203,7 @@ failed_sbi: kfree(sbi->s_blockgroup_lock); kfree(sbi); failed: + fs_put_dax(dax_dev); return ret; } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index e8b365000d73..b04e882179c6 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -411,7 +411,7 @@ static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp, { struct dir_private_info *p; - p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL); + p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) return NULL; p->curr_hash = pos2maj_hash(filp, pos); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a2bb7d2870e4..e2abe01c8c6b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -838,13 +838,11 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) { if 
(unlikely(sizeof(time->tv_sec) > 4 && (extra & cpu_to_le32(EXT4_EPOCH_MASK)))) { -#if LINUX_VERSION_CODE < KERNEL_VERSION(4,20,0) + +#if 1 /* Handle legacy encoding of pre-1970 dates with epoch - * bits 1,1. We assume that by kernel version 4.20, - * everyone will have run fsck over the affected - * filesystems to correct the problem. (This - * backwards compatibility may be removed before this - * time, at the discretion of the ext4 developers.) + * bits 1,1. (This backwards compatibility may be removed + * at the discretion of the ext4 developers.) */ u64 extra_bits = le32_to_cpu(extra) & EXT4_EPOCH_MASK; if (extra_bits == 3 && ((time->tv_sec) & 0x80000000) != 0) @@ -1528,6 +1526,7 @@ struct ext4_sb_info { /* Barrier between changing inodes' journal flags and writepages ops. */ struct percpu_rw_semaphore s_journal_flag_rwsem; + struct dax_device *s_daxdev; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1567,6 +1566,7 @@ enum { nolocking */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ + EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 0d7cf0cc9b87..57dcaea762c3 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -279,7 +279,20 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, handle_t *handle = NULL; struct inode *inode = file_inode(vmf->vma->vm_file); struct super_block *sb = inode->i_sb; - bool write = vmf->flags & FAULT_FLAG_WRITE; + + /* + * We have to distinguish real writes from writes which will result in a + * COW page; COW writes should *not* poke the journal (the file will not + * be changed). Doing so would cause unintended failures when mounted + * read-only. + * + * We check for VM_SHARED rather than vmf->cow_page since the latter is + * unset for pe_size != PE_SIZE_PTE (i.e. only in do_cow_fault); for + * other sizes, dax_iomap_fault will handle splitting / fallback so that + * we eventually come back with a COW page. + */ + bool write = (vmf->flags & FAULT_FLAG_WRITE) && + (vmf->vma->vm_flags & VM_SHARED); if (write) { sb_start_pagefault(sb); @@ -311,41 +324,11 @@ static int ext4_dax_fault(struct vm_fault *vmf) return ext4_dax_huge_fault(vmf, PE_SIZE_PTE); } -/* - * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault() - * handler we check for races agaist truncate. Note that since we cycle through - * i_mmap_sem, we are sure that also any hole punching that began before we - * were called is finished by now and so if it included part of the file we - * are working on, our pte will get unmapped and the check for pte_same() in - * wp_pfn_shared() fails. Thus fault gets retried and things work out as - * desired. 
- */ -static int ext4_dax_pfn_mkwrite(struct vm_fault *vmf) -{ - struct inode *inode = file_inode(vmf->vma->vm_file); - struct super_block *sb = inode->i_sb; - loff_t size; - int ret; - - sb_start_pagefault(sb); - file_update_time(vmf->vma->vm_file); - down_read(&EXT4_I(inode)->i_mmap_sem); - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (vmf->pgoff >= size) - ret = VM_FAULT_SIGBUS; - else - ret = dax_pfn_mkwrite(vmf); - up_read(&EXT4_I(inode)->i_mmap_sem); - sb_end_pagefault(sb); - - return ret; -} - static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, .huge_fault = ext4_dax_huge_fault, .page_mkwrite = ext4_dax_fault, - .pfn_mkwrite = ext4_dax_pfn_mkwrite, + .pfn_mkwrite = ext4_dax_fault, }; #else #define ext4_dax_vm_ops ext4_file_vm_ops @@ -494,12 +477,11 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, pagevec_init(&pvec, 0); do { - int i, num; + int i; unsigned long nr_pages; - num = min_t(pgoff_t, end - index, PAGEVEC_SIZE - 1) + 1; - nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index, - (pgoff_t)num); + nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, + &index, end); if (nr_pages == 0) break; @@ -518,9 +500,6 @@ static int ext4_find_unwritten_pgoff(struct inode *inode, goto out; } - if (page->index > end) - goto out; - lock_page(page); if (unlikely(page->mapping != inode->i_mapping)) { @@ -563,14 +542,10 @@ next: unlock_page(page); } - /* The no. of pages is less than our desired, we are done. */ - if (nr_pages < num) - break; - - index = pvec.pages[i - 1]->index + 1; pagevec_release(&pvec); } while (index <= end); + /* There are no pages upto endoff - that would be a hole in there. */ if (whence == SEEK_HOLE && lastoff < endoff) { found = 1; *offset = lastoff; @@ -595,7 +570,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) inode_lock(inode); isize = i_size_read(inode); - if (offset >= isize) { + if (offset < 0 || offset >= isize) { inode_unlock(inode); return -ENXIO; } @@ -658,7 +633,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) inode_lock(inode); isize = i_size_read(inode); - if (offset >= isize) { + if (offset < 0 || offset >= isize) { inode_unlock(inode); return -ENXIO; } diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 38b8a96eb97c..00c6dd29e621 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -148,8 +148,6 @@ static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num) if (len > num*4) len = num * 4; for (i = 0; i < len; i++) { - if ((i % 4) == 0) - val = pad; val = ((int) scp[i]) + (val << 8); if ((i % 4) == 3) { *buf++ = val; @@ -176,8 +174,6 @@ static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) if (len > num*4) len = num * 4; for (i = 0; i < len; i++) { - if ((i % 4) == 0) - val = pad; val = ((int) ucp[i]) + (val << 8); if ((i % 4) == 3) { *buf++ = val; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 507bfb3344d4..71e93a23cec3 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -692,24 +692,25 @@ static int find_group_other(struct super_block *sb, struct inode *parent, * somewhat arbitrary...) 
*/ #define RECENTCY_MIN 5 -#define RECENTCY_DIRTY 30 +#define RECENTCY_DIRTY 300 static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) { struct ext4_group_desc *gdp; struct ext4_inode *raw_inode; struct buffer_head *bh; - unsigned long dtime, now; - int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; - int offset, ret = 0, recentcy = RECENTCY_MIN; + int inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; + int offset, ret = 0; + int recentcy = RECENTCY_MIN; + u32 dtime, now; gdp = ext4_get_group_desc(sb, group, NULL); if (unlikely(!gdp)) return 0; - bh = sb_getblk(sb, ext4_inode_table(sb, gdp) + + bh = sb_find_get_block(sb, ext4_inode_table(sb, gdp) + (ino / inodes_per_block)); - if (unlikely(!bh) || !buffer_uptodate(bh)) + if (!bh || !buffer_uptodate(bh)) /* * If the block is not in the buffer cache, then it * must have been written out. @@ -718,18 +719,45 @@ static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) offset = (ino % inodes_per_block) * EXT4_INODE_SIZE(sb); raw_inode = (struct ext4_inode *) (bh->b_data + offset); + + /* i_dtime is only 32 bits on disk, but we only care about relative + * times in the range of a few minutes (i.e. long enough to sync a + * recently-deleted inode to disk), so using the low 32 bits of the + * clock (a 68 year range) is enough, see time_before32() */ dtime = le32_to_cpu(raw_inode->i_dtime); - now = get_seconds(); + now = ktime_get_real_seconds(); if (buffer_dirty(bh)) recentcy += RECENTCY_DIRTY; - if (dtime && (dtime < now) && (now < dtime + recentcy)) + if (dtime && time_before32(dtime, now) && + time_before32(now, dtime + recentcy)) ret = 1; out: brelse(bh); return ret; } +static int find_inode_bit(struct super_block *sb, ext4_group_t group, + struct buffer_head *bitmap, unsigned long *ino) +{ +next: + *ino = ext4_find_next_zero_bit((unsigned long *) + bitmap->b_data, + EXT4_INODES_PER_GROUP(sb), *ino); + if (*ino >= EXT4_INODES_PER_GROUP(sb)) + return 0; + + if ((EXT4_SB(sb)->s_journal == NULL) && + recently_deleted(sb, group, *ino)) { + *ino = *ino + 1; + if (*ino < EXT4_INODES_PER_GROUP(sb)) + goto next; + return 0; + } + + return 1; +} + /* * There are two policies for allocating an inode. If the new inode is * a directory, then a forward search is made for a block group with both @@ -892,19 +920,13 @@ got_group: /* * Check free inodes count before loading bitmap. 
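
The recently_deleted() rework above compares i_dtime, which is only 32 bits on disk, against the low 32 bits of ktime_get_real_seconds(); that is safe for the few-minute windows involved as long as the comparison tolerates wraparound, which is what time_before32()/time_after32() provide via a signed difference. A self-contained sketch of that idiom (it mirrors the usual jiffies-style helpers rather than quoting the kernel header verbatim):

    #include <stdbool.h>
    #include <stdint.h>

    /* "a is later than b", correct even if the 32-bit counter wrapped,
     * provided the real distance is well under 2^31 seconds (~68 years) */
    static bool after32(uint32_t a, uint32_t b)
    {
            return (int32_t)(b - a) < 0;
    }

    static bool before32(uint32_t a, uint32_t b)
    {
            return after32(b, a);
    }

    /* e.g. dtime && before32(dtime, now) && before32(now, dtime + recentcy)
     * behaves correctly across a wrap of the low 32 bits of the clock. */
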
*/ - if (ext4_free_inodes_count(sb, gdp) == 0) { - if (++group == ngroups) - group = 0; - continue; - } + if (ext4_free_inodes_count(sb, gdp) == 0) + goto next_group; grp = ext4_get_group_info(sb, group); /* Skip groups with already-known suspicious inode tables */ - if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) { - if (++group == ngroups) - group = 0; - continue; - } + if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) + goto next_group; brelse(inode_bitmap_bh); inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); @@ -912,27 +934,20 @@ got_group: if (EXT4_MB_GRP_IBITMAP_CORRUPT(grp) || IS_ERR(inode_bitmap_bh)) { inode_bitmap_bh = NULL; - if (++group == ngroups) - group = 0; - continue; + goto next_group; } repeat_in_this_group: - ino = ext4_find_next_zero_bit((unsigned long *) - inode_bitmap_bh->b_data, - EXT4_INODES_PER_GROUP(sb), ino); - if (ino >= EXT4_INODES_PER_GROUP(sb)) + ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino); + if (!ret2) goto next_group; - if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) { + + if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) { ext4_error(sb, "reserved inode found cleared - " "inode=%lu", ino + 1); - continue; - } - if ((EXT4_SB(sb)->s_journal == NULL) && - recently_deleted(sb, group, ino)) { - ino++; - goto next_inode; + goto next_group; } + if (!handle) { BUG_ON(nblocks <= 0); handle = __ext4_journal_start_sb(dir->i_sb, line_no, @@ -952,11 +967,23 @@ repeat_in_this_group: } ext4_lock_group(sb, group); ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data); + if (ret2) { + /* Someone already took the bit. Repeat the search + * with lock held. + */ + ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino); + if (ret2) { + ext4_set_bit(ino, inode_bitmap_bh->b_data); + ret2 = 0; + } else { + ret2 = 1; /* we didn't grab the inode */ + } + } ext4_unlock_group(sb, group); ino++; /* the inode bitmap is zero-based */ if (!ret2) goto got; /* we grabbed the inode! 
*/ -next_inode: + if (ino < EXT4_INODES_PER_GROUP(sb)) goto repeat_in_this_group; next_group: diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c774bdc22759..31db875bc7a1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1720,13 +1720,12 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, pagevec_init(&pvec, 0); while (index <= end) { - nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - if (page->index > end) - break; + BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); if (invalidate) { @@ -1737,7 +1736,6 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, } unlock_page(page); } - index = pvec.pages[nr_pages - 1]->index + 1; pagevec_release(&pvec); } } @@ -2348,17 +2346,13 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) pagevec_init(&pvec, 0); while (start <= end) { - nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start, - PAGEVEC_SIZE); + nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, + &start, end); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - if (page->index > end) - break; - /* Up to 'end' pages must be contiguous */ - BUG_ON(page->index != start); bh = head = page_buffers(page); do { if (lblk < mpd->map.m_lblk) @@ -2403,7 +2397,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) pagevec_release(&pvec); return err; } - start++; } pagevec_release(&pvec); } @@ -3404,7 +3397,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait) static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap) { - struct block_device *bdev; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned int blkbits = inode->i_blkbits; unsigned long first_block = offset >> blkbits; unsigned long last_block = (offset + length - 1) >> blkbits; @@ -3473,12 +3466,8 @@ retry: } iomap->flags = 0; - bdev = inode->i_sb->s_bdev; - iomap->bdev = bdev; - if (blk_queue_dax(bdev->bd_queue)) - iomap->dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name); - else - iomap->dax_dev = NULL; + iomap->bdev = inode->i_sb->s_bdev; + iomap->dax_dev = sbi->s_daxdev; iomap->offset = first_block << blkbits; if (ret == 0) { @@ -3511,7 +3500,6 @@ static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, int blkbits = inode->i_blkbits; bool truncate = false; - fs_put_dax(iomap->dax_dev); if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT)) return 0; @@ -4897,14 +4885,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) brelse(iloc.bh); ext4_set_inode_flags(inode); - if (ei->i_flags & EXT4_EA_INODE_FL) { - ext4_xattr_inode_set_class(inode); - - inode_lock(inode); - inode->i_flags |= S_NOQUOTA; - inode_unlock(inode); - } - unlock_new_inode(inode); return inode; diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index eb9835638680..77cdce1f17ce 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -367,7 +367,7 @@ skip: goto failed; } - mmpd_data = kmalloc(sizeof(struct mmpd_data), GFP_KERNEL); + mmpd_data = kmalloc(sizeof(*mmpd_data), GFP_KERNEL); if (!mmpd_data) { ext4_warning(sb, "not enough memory for mmpd_data"); goto failed; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index c2fce4478cca..55ad7dd149d0 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -300,7 +300,7 @@ static void ext4_end_bio(struct bio *bio) char 
b[BDEVNAME_SIZE]; if (WARN_ONCE(!io_end, "io_end is NULL: %s: sector %Lu len %u err %d\n", - bdevname(bio->bi_bdev, b), + bio_devname(bio, b), (long long) bio->bi_iter.bi_sector, (unsigned) bio_sectors(bio), bio->bi_status)) { @@ -375,7 +375,7 @@ static int io_submit_init_bio(struct ext4_io_submit *io, return -ENOMEM; wbc_init_bio(io->io_wbc, bio); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio->bi_bdev = bh->b_bdev; + bio_set_dev(bio, bh->b_bdev); bio->bi_end_io = ext4_end_bio; bio->bi_private = ext4_get_io_end(io->io_end); io->io_bio = bio; diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 40a5497b0f60..04c90643af7a 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -254,7 +254,7 @@ int ext4_mpage_readpages(struct address_space *mapping, fscrypt_release_ctx(ctx); goto set_error_page; } - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); bio->bi_end_io = mpage_end_io; bio->bi_private = ctx; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d61a70e2193a..71b9a667e1bc 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -951,6 +951,7 @@ static void ext4_put_super(struct super_block *sb) if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->s_blockgroup_lock); + fs_put_dax(sbi->s_daxdev); kfree(sbi); } @@ -2404,6 +2405,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, unsigned int s_flags = sb->s_flags; int ret, nr_orphans = 0, nr_truncates = 0; #ifdef CONFIG_QUOTA + int quota_update = 0; int i; #endif if (!es->s_last_orphan) { @@ -2442,14 +2444,32 @@ static void ext4_orphan_cleanup(struct super_block *sb, #ifdef CONFIG_QUOTA /* Needed for iput() to work correctly and not trash data */ sb->s_flags |= MS_ACTIVE; - /* Turn on quotas so that they are updated correctly */ + + /* + * Turn on quotas which were not enabled for read-only mounts if + * filesystem has quota feature, so that they are updated correctly. 
+ */ + if (ext4_has_feature_quota(sb) && (s_flags & MS_RDONLY)) { + int ret = ext4_enable_quotas(sb); + + if (!ret) + quota_update = 1; + else + ext4_msg(sb, KERN_ERR, + "Cannot turn on quotas: error %d", ret); + } + + /* Turn on journaled quotas used for old sytle */ for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (EXT4_SB(sb)->s_qf_names[i]) { int ret = ext4_quota_on_mount(sb, i); - if (ret < 0) + + if (!ret) + quota_update = 1; + else ext4_msg(sb, KERN_ERR, "Cannot turn on journaled " - "quota: error %d", ret); + "quota: type %d: error %d", i, ret); } } #endif @@ -2510,10 +2530,12 @@ static void ext4_orphan_cleanup(struct super_block *sb, ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", PLURAL(nr_truncates)); #ifdef CONFIG_QUOTA - /* Turn quotas off */ - for (i = 0; i < EXT4_MAXQUOTAS; i++) { - if (sb_dqopt(sb)->files[i]) - dquot_quota_off(sb, i); + /* Turn off quotas if they were enabled for orphan cleanup */ + if (quota_update) { + for (i = 0; i < EXT4_MAXQUOTAS; i++) { + if (sb_dqopt(sb)->files[i]) + dquot_quota_off(sb, i); + } } #endif sb->s_flags = s_flags; /* Restore MS_RDONLY status */ @@ -3377,6 +3399,7 @@ static void ext4_set_resv_clusters(struct super_block *sb) static int ext4_fill_super(struct super_block *sb, void *data, int silent) { + struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev); char *orig_data = kstrdup(data, GFP_KERNEL); struct buffer_head *bh; struct ext4_super_block *es = NULL; @@ -3402,6 +3425,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if ((data && !orig_data) || !sbi) goto out_free_base; + sbi->s_daxdev = dax_dev; sbi->s_blockgroup_lock = kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); if (!sbi->s_blockgroup_lock) @@ -4378,6 +4402,7 @@ out_fail: out_free_base: kfree(sbi); kfree(orig_data); + fs_put_dax(dax_dev); return err ? err : ret; } @@ -5194,7 +5219,7 @@ static int ext4_statfs_project(struct super_block *sb, dquot = dqget(sb, qid); if (IS_ERR(dquot)) return PTR_ERR(dquot); - spin_lock(&dq_data_lock); + spin_lock(&dquot->dq_dqb_lock); limit = (dquot->dq_dqb.dqb_bsoftlimit ? dquot->dq_dqb.dqb_bsoftlimit : @@ -5217,7 +5242,7 @@ static int ext4_statfs_project(struct super_block *sb, (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; } - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); dqput(dquot); return 0; } @@ -5263,18 +5288,13 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } -/* Helper function for writing quotas on sync - we need to start transaction - * before quota file is locked for write. Otherwise the are possible deadlocks: - * Process 1 Process 2 - * ext4_create() quota_sync() - * jbd2_journal_start() write_dquot() - * dquot_initialize() down(dqio_mutex) - * down(dqio_mutex) jbd2_journal_start() - * - */ #ifdef CONFIG_QUOTA +/* + * Helper functions so that transaction is started before we acquire dqio_sem + * to keep correct lock ordering of transaction > dqio_sem + */ static inline struct inode *dquot_to_inode(struct dquot *dquot) { return sb_dqopt(dquot->dq_sb)->files[dquot->dq_id.type]; @@ -5409,6 +5429,13 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, ext4_msg(sb, KERN_WARNING, "Quota file not on filesystem root. " "Journaled quota will not work"); + sb_dqopt(sb)->flags |= DQUOT_NOLIST_DIRTY; + } else { + /* + * Clear the flag just in case mount options changed since + * last time. 
+ */ + sb_dqopt(sb)->flags &= ~DQUOT_NOLIST_DIRTY; } /* @@ -5505,13 +5532,16 @@ static int ext4_enable_quotas(struct super_block *sb) test_opt(sb, PRJQUOTA), }; - sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY; for (type = 0; type < EXT4_MAXQUOTAS; type++) { if (qf_inums[type]) { err = ext4_quota_enable(sb, type, QFMT_VFS_V1, DQUOT_USAGE_ENABLED | (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0)); if (err) { + for (type--; type >= 0; type--) + dquot_quota_off(sb, type); + ext4_warning(sb, "Failed to enable quota tracking " "(type=%d, err=%d). Please run " diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 3dd970168448..3b69330a4250 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -354,8 +354,10 @@ free_bhs: return ret; } +#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec) + static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, - struct inode **ea_inode) + u32 ea_inode_hash, struct inode **ea_inode) { struct inode *inode; int err; @@ -385,6 +387,24 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, goto error; } + ext4_xattr_inode_set_class(inode); + + /* + * Check whether this is an old Lustre-style xattr inode. Lustre + * implementation does not have hash validation, rather it has a + * backpointer from ea_inode to the parent inode. + */ + if (ea_inode_hash != ext4_xattr_inode_get_hash(inode) && + EXT4_XATTR_INODE_GET_PARENT(inode) == parent->i_ino && + inode->i_generation == parent->i_generation) { + ext4_set_inode_state(inode, EXT4_STATE_LUSTRE_EA_INODE); + ext4_xattr_inode_set_ref(inode, 1); + } else { + inode_lock(inode); + inode->i_flags |= S_NOQUOTA; + inode_unlock(inode); + } + *ea_inode = inode; return 0; error: @@ -417,8 +437,6 @@ ext4_xattr_inode_verify_hashes(struct inode *ea_inode, return 0; } -#define EXT4_XATTR_INODE_GET_PARENT(inode) ((__u32)(inode)->i_mtime.tv_sec) - /* * Read xattr value from the EA inode. */ @@ -431,7 +449,7 @@ ext4_xattr_inode_get(struct inode *inode, struct ext4_xattr_entry *entry, int err; err = ext4_xattr_inode_iget(inode, le32_to_cpu(entry->e_value_inum), - &ea_inode); + le32_to_cpu(entry->e_hash), &ea_inode); if (err) { ea_inode = NULL; goto out; @@ -449,29 +467,20 @@ ext4_xattr_inode_get(struct inode *inode, struct ext4_xattr_entry *entry, if (err) goto out; - err = ext4_xattr_inode_verify_hashes(ea_inode, entry, buffer, size); - /* - * Compatibility check for old Lustre ea_inode implementation. Old - * version does not have hash validation, but it has a backpointer - * from ea_inode to the parent inode. - */ - if (err == -EFSCORRUPTED) { - if (EXT4_XATTR_INODE_GET_PARENT(ea_inode) != inode->i_ino || - ea_inode->i_generation != inode->i_generation) { + if (!ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE)) { + err = ext4_xattr_inode_verify_hashes(ea_inode, entry, buffer, + size); + if (err) { ext4_warning_inode(ea_inode, "EA inode hash validation failed"); goto out; } - /* Do not add ea_inode to the cache. 
*/ - ea_inode_cache = NULL; - err = 0; - } else if (err) - goto out; - if (ea_inode_cache) - mb_cache_entry_create(ea_inode_cache, GFP_NOFS, - ext4_xattr_inode_get_hash(ea_inode), - ea_inode->i_ino, true /* reusable */); + if (ea_inode_cache) + mb_cache_entry_create(ea_inode_cache, GFP_NOFS, + ext4_xattr_inode_get_hash(ea_inode), + ea_inode->i_ino, true /* reusable */); + } out: iput(ea_inode); return err; @@ -838,10 +847,15 @@ static int ext4_xattr_inode_alloc_quota(struct inode *inode, size_t len) return err; } -static void ext4_xattr_inode_free_quota(struct inode *inode, size_t len) +static void ext4_xattr_inode_free_quota(struct inode *parent, + struct inode *ea_inode, + size_t len) { - dquot_free_space_nodirty(inode, round_up_cluster(inode, len)); - dquot_free_inode(inode); + if (ea_inode && + ext4_test_inode_state(ea_inode, EXT4_STATE_LUSTRE_EA_INODE)) + return; + dquot_free_space_nodirty(parent, round_up_cluster(parent, len)); + dquot_free_inode(parent); } int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, @@ -1071,7 +1085,9 @@ static int ext4_xattr_inode_inc_ref_all(handle_t *handle, struct inode *parent, if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); - err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode); + err = ext4_xattr_inode_iget(parent, ea_ino, + le32_to_cpu(entry->e_hash), + &ea_inode); if (err) goto cleanup; err = ext4_xattr_inode_inc_ref(handle, ea_inode); @@ -1093,7 +1109,9 @@ cleanup: if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); - err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode); + err = ext4_xattr_inode_iget(parent, ea_ino, + le32_to_cpu(entry->e_hash), + &ea_inode); if (err) { ext4_warning(parent->i_sb, "cleanup ea_ino %u iget error %d", ea_ino, @@ -1131,7 +1149,9 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, if (!entry->e_value_inum) continue; ea_ino = le32_to_cpu(entry->e_value_inum); - err = ext4_xattr_inode_iget(parent, ea_ino, &ea_inode); + err = ext4_xattr_inode_iget(parent, ea_ino, + le32_to_cpu(entry->e_hash), + &ea_inode); if (err) continue; @@ -1159,7 +1179,7 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, } if (!skip_quota) - ext4_xattr_inode_free_quota(parent, + ext4_xattr_inode_free_quota(parent, ea_inode, le32_to_cpu(entry->e_value_size)); /* @@ -1591,6 +1611,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, if (!s->not_found && here->e_value_inum) { ret = ext4_xattr_inode_iget(inode, le32_to_cpu(here->e_value_inum), + le32_to_cpu(here->e_hash), &old_ea_inode); if (ret) { old_ea_inode = NULL; @@ -1609,7 +1630,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, &new_ea_inode); if (ret) { new_ea_inode = NULL; - ext4_xattr_inode_free_quota(inode, i->value_len); + ext4_xattr_inode_free_quota(inode, NULL, i->value_len); goto out; } } @@ -1628,13 +1649,13 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, ext4_warning_inode(new_ea_inode, "dec ref new_ea_inode err=%d", err); - ext4_xattr_inode_free_quota(inode, + ext4_xattr_inode_free_quota(inode, new_ea_inode, i->value_len); } goto out; } - ext4_xattr_inode_free_quota(inode, + ext4_xattr_inode_free_quota(inode, old_ea_inode, le32_to_cpu(here->e_value_size)); } @@ -1805,8 +1826,10 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, struct mb_cache_entry *ce = NULL; int error = 0; struct mb_cache *ea_block_cache = EA_BLOCK_CACHE(inode); - struct inode *ea_inode = NULL; - size_t old_ea_inode_size = 0; + struct inode 
*ea_inode = NULL, *tmp_inode; + size_t old_ea_inode_quota = 0; + unsigned int ea_ino; + #define header(x) ((struct ext4_xattr_header *)(x)) @@ -1865,12 +1888,24 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, * like it has an empty value. */ if (!s->not_found && s->here->e_value_inum) { - /* - * Defer quota free call for previous inode - * until success is guaranteed. - */ - old_ea_inode_size = le32_to_cpu( + ea_ino = le32_to_cpu(s->here->e_value_inum); + error = ext4_xattr_inode_iget(inode, ea_ino, + le32_to_cpu(s->here->e_hash), + &tmp_inode); + if (error) + goto cleanup; + + if (!ext4_test_inode_state(tmp_inode, + EXT4_STATE_LUSTRE_EA_INODE)) { + /* + * Defer quota free call for previous + * inode until success is guaranteed. + */ + old_ea_inode_quota = le32_to_cpu( s->here->e_value_size); + } + iput(tmp_inode); + s->here->e_value_inum = 0; s->here->e_value_size = 0; } @@ -1897,8 +1932,6 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, goto cleanup; if (i->value && s->here->e_value_inum) { - unsigned int ea_ino; - /* * A ref count on ea_inode has been taken as part of the call to * ext4_xattr_set_entry() above. We would like to drop this @@ -1906,7 +1939,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, * initialized and has its own ref count on the ea_inode. */ ea_ino = le32_to_cpu(s->here->e_value_inum); - error = ext4_xattr_inode_iget(inode, ea_ino, &ea_inode); + error = ext4_xattr_inode_iget(inode, ea_ino, + le32_to_cpu(s->here->e_hash), + &ea_inode); if (error) { ea_inode = NULL; goto cleanup; @@ -2056,8 +2091,8 @@ getblk_failed: } } - if (old_ea_inode_size) - ext4_xattr_inode_free_quota(inode, old_ea_inode_size); + if (old_ea_inode_quota) + ext4_xattr_inode_free_quota(inode, NULL, old_ea_inode_quota); /* Update the inode. */ EXT4_I(inode)->i_file_acl = new_bh ? new_bh->b_blocknr : 0; @@ -2084,7 +2119,7 @@ cleanup: /* If there was an error, revert the quota charge. 
*/ if (error) - ext4_xattr_inode_free_quota(inode, + ext4_xattr_inode_free_quota(inode, ea_inode, i_size_read(ea_inode)); iput(ea_inode); } @@ -2800,6 +2835,7 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, struct ext4_xattr_ibody_header *header; struct ext4_iloc iloc = { .bh = NULL }; struct ext4_xattr_entry *entry; + struct inode *ea_inode; int error; error = ext4_xattr_ensure_credits(handle, inode, extra_credits, @@ -2854,10 +2890,19 @@ int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, if (ext4_has_feature_ea_inode(inode->i_sb)) { for (entry = BFIRST(bh); !IS_LAST_ENTRY(entry); - entry = EXT4_XATTR_NEXT(entry)) - if (entry->e_value_inum) - ext4_xattr_inode_free_quota(inode, + entry = EXT4_XATTR_NEXT(entry)) { + if (!entry->e_value_inum) + continue; + error = ext4_xattr_inode_iget(inode, + le32_to_cpu(entry->e_value_inum), + le32_to_cpu(entry->e_hash), + &ea_inode); + if (error) + continue; + ext4_xattr_inode_free_quota(inode, ea_inode, le32_to_cpu(entry->e_value_size)); + iput(ea_inode); + } } diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index b4b8438c42ef..436b3a1464d9 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -207,15 +207,16 @@ static int __f2fs_set_acl(struct inode *inode, int type, void *value = NULL; size_t size = 0; int error; + umode_t mode = inode->i_mode; switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl && !ipage) { - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + error = posix_acl_update_mode(inode, &mode, &acl); if (error) return error; - set_acl_inode(inode, inode->i_mode); + set_acl_inode(inode, mode); } break; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 5b876f6d3f6b..04fe1df052b2 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -230,8 +230,9 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); } -static int f2fs_write_meta_page(struct page *page, - struct writeback_control *wbc) +static int __f2fs_write_meta_page(struct page *page, + struct writeback_control *wbc, + enum iostat_type io_type) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); @@ -244,7 +245,7 @@ static int f2fs_write_meta_page(struct page *page, if (unlikely(f2fs_cp_error(sbi))) goto redirty_out; - write_meta_page(sbi, page); + write_meta_page(sbi, page, io_type); dec_page_count(sbi, F2FS_DIRTY_META); if (wbc->for_reclaim) @@ -263,6 +264,12 @@ redirty_out: return AOP_WRITEPAGE_ACTIVATE; } +static int f2fs_write_meta_page(struct page *page, + struct writeback_control *wbc) +{ + return __f2fs_write_meta_page(page, wbc, FS_META_IO); +} + static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -283,7 +290,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping, trace_f2fs_writepages(mapping->host, wbc, META); diff = nr_pages_to_write(sbi, META, wbc); - written = sync_meta_pages(sbi, META, wbc->nr_to_write); + written = sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO); mutex_unlock(&sbi->cp_mutex); wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); return 0; @@ -295,7 +302,7 @@ skip_write: } long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, - long nr_to_write) + long nr_to_write, enum iostat_type io_type) { struct address_space *mapping = META_MAPPING(sbi); pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX; @@ -346,7 +353,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - if 
(mapping->a_ops->writepage(page, &wbc)) { + if (__f2fs_write_meta_page(page, &wbc, io_type)) { unlock_page(page); break; } @@ -581,11 +588,24 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) int recover_orphan_inodes(struct f2fs_sb_info *sbi) { block_t start_blk, orphan_blocks, i, j; - int err; + unsigned int s_flags = sbi->sb->s_flags; + int err = 0; if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return 0; + if (s_flags & MS_RDONLY) { + f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); + sbi->sb->s_flags &= ~MS_RDONLY; + } + +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sbi->sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ + f2fs_enable_quota_files(sbi); +#endif + start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); @@ -601,14 +621,21 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) err = recover_orphan_inode(sbi, ino); if (err) { f2fs_put_page(page, 1); - return err; + goto out; } } f2fs_put_page(page, 1); } /* clear Orphan Flag */ clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG); - return 0; +out: +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + f2fs_quota_off_umount(sbi->sb); +#endif + sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + + return err; } static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) @@ -904,7 +931,14 @@ retry: if (inode) { unsigned long cur_ino = inode->i_ino; + if (is_dir) + F2FS_I(inode)->cp_task = current; + filemap_fdatawrite(inode->i_mapping); + + if (is_dir) + F2FS_I(inode)->cp_task = NULL; + iput(inode); /* We need to give cpu to another writers. */ if (ino == cur_ino) { @@ -1017,7 +1051,7 @@ retry_flush_nodes: if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); - err = sync_node_pages(sbi, &wbc); + err = sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO); if (err) { up_write(&sbi->node_change); f2fs_unlock_all(sbi); @@ -1115,7 +1149,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { - sync_meta_pages(sbi, META, LONG_MAX); + sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); if (unlikely(f2fs_cp_error(sbi))) return -EIO; } @@ -1194,7 +1228,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Flush all the NAT BITS pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { - sync_meta_pages(sbi, META, LONG_MAX); + sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); if (unlikely(f2fs_cp_error(sbi))) return -EIO; } @@ -1249,7 +1283,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) percpu_counter_set(&sbi->alloc_valid_block_count, 0); /* Here, we only have one bio having CP pack */ - sync_meta_pages(sbi, META_FLUSH, LONG_MAX); + sync_meta_pages(sbi, META_FLUSH, LONG_MAX, FS_CP_META_IO); /* wait for previous submitted meta pages writeback */ wait_on_all_pages_writeback(sbi); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 87c1f4150c64..36b535207c88 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -142,7 +142,7 @@ struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, } } if (bio) { - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); } return bdev; @@ -161,7 +161,8 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) static bool __same_bdev(struct f2fs_sb_info *sbi, block_t blk_addr, 
struct bio *bio) { - return f2fs_target_device(sbi, blk_addr, NULL) == bio->bi_bdev; + struct block_device *b = f2fs_target_device(sbi, blk_addr, NULL); + return bio->bi_disk == b->bd_disk && bio->bi_partno == b->bd_partno; } /* @@ -456,14 +457,65 @@ out_fail: return err; } +static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, + unsigned nr_pages) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct fscrypt_ctx *ctx = NULL; + struct bio *bio; + + if (f2fs_encrypted_file(inode)) { + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return ERR_CAST(ctx); + + /* wait the page to be moved by cleaning */ + f2fs_wait_on_block_writeback(sbi, blkaddr); + } + + bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); + if (!bio) { + if (ctx) + fscrypt_release_ctx(ctx); + return ERR_PTR(-ENOMEM); + } + f2fs_target_device(sbi, blkaddr, bio); + bio->bi_end_io = f2fs_read_end_io; + bio->bi_private = ctx; + bio_set_op_attrs(bio, REQ_OP_READ, 0); + + return bio; +} + +/* This can handle encryption stuffs */ +static int f2fs_submit_page_read(struct inode *inode, struct page *page, + block_t blkaddr) +{ + struct bio *bio = f2fs_grab_read_bio(inode, blkaddr, 1); + + if (IS_ERR(bio)) + return PTR_ERR(bio); + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + bio_put(bio); + return -EFAULT; + } + __submit_bio(F2FS_I_SB(inode), bio, DATA); + return 0; +} + static void __set_data_blkaddr(struct dnode_of_data *dn) { struct f2fs_node *rn = F2FS_NODE(dn->node_page); __le32 *addr_array; + int base = 0; + + if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) + base = get_extra_isize(dn->inode); /* Get physical address of data block */ addr_array = blkaddr_in_node(rn); - addr_array[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); + addr_array[base + dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); } /* @@ -507,8 +559,8 @@ int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) f2fs_wait_on_page_writeback(dn->node_page, NODE, true); for (; count > 0; dn->ofs_in_node++) { - block_t blkaddr = - datablock_addr(dn->node_page, dn->ofs_in_node); + block_t blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); if (blkaddr == NULL_ADDR) { dn->data_blkaddr = NEW_ADDR; __set_data_blkaddr(dn); @@ -569,16 +621,6 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, struct page *page; struct extent_info ei = {0,0,0}; int err; - struct f2fs_io_info fio = { - .sbi = F2FS_I_SB(inode), - .type = DATA, - .op = REQ_OP_READ, - .op_flags = op_flags, - .encrypted_page = NULL, - }; - - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - return read_mapping_page(mapping, index, NULL); page = f2fs_grab_cache_page(mapping, index, for_write); if (!page) @@ -619,9 +661,7 @@ got_it: return page; } - fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; - fio.page = page; - err = f2fs_submit_page_bio(&fio); + err = f2fs_submit_page_read(inode, page, dn.data_blkaddr); if (err) goto put_err; return page; @@ -755,7 +795,8 @@ static int __allocate_data_block(struct dnode_of_data *dn) if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); if (dn->data_blkaddr == NEW_ADDR) goto alloc; @@ -781,7 +822,7 @@ alloc: static inline bool __force_buffered_io(struct inode *inode, int rw) { - return ((f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) || + return 
(f2fs_encrypted_file(inode) || (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || F2FS_I_SB(inode)->s_ndevs); } @@ -813,7 +854,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO); } - if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { + if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) { err = f2fs_convert_inline_inode(inode); if (err) return err; @@ -902,7 +943,7 @@ next_dnode: end_offset = ADDRS_PER_PAGE(dn.node_page, inode); next_block: - blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { if (create) { @@ -1039,7 +1080,7 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DIO, NULL); + F2FS_GET_BLOCK_DEFAULT, NULL); } static int get_data_block_bmap(struct inode *inode, sector_t iblock, @@ -1145,35 +1186,6 @@ out: return ret; } -static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, - unsigned nr_pages) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct fscrypt_ctx *ctx = NULL; - struct bio *bio; - - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - ctx = fscrypt_get_ctx(inode, GFP_NOFS); - if (IS_ERR(ctx)) - return ERR_CAST(ctx); - - /* wait the page to be moved by cleaning */ - f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); - } - - bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); - if (!bio) { - if (ctx) - fscrypt_release_ctx(ctx); - return ERR_PTR(-ENOMEM); - } - f2fs_target_device(sbi, blkaddr, bio); - bio->bi_end_io = f2fs_read_end_io; - bio->bi_private = ctx; - - return bio; -} - /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. 
@@ -1239,7 +1251,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, map.m_len = last_block - block_in_file; if (f2fs_map_blocks(inode, &map, 0, - F2FS_GET_BLOCK_READ)) + F2FS_GET_BLOCK_DEFAULT)) goto set_error_page; } got_it: @@ -1270,12 +1282,11 @@ submit_and_realloc: bio = NULL; } if (bio == NULL) { - bio = f2fs_grab_bio(inode, block_nr, nr_pages); + bio = f2fs_grab_read_bio(inode, block_nr, nr_pages); if (IS_ERR(bio)) { bio = NULL; goto set_error_page; } - bio_set_op_attrs(bio, REQ_OP_READ, 0); } if (bio_add_page(bio, page, blocksize, 0) < blocksize) @@ -1340,11 +1351,11 @@ static int encrypt_one_page(struct f2fs_io_info *fio) struct inode *inode = fio->page->mapping->host; gfp_t gfp_flags = GFP_NOFS; - if (!f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode)) + if (!f2fs_encrypted_file(inode)) return 0; /* wait for GCed encrypted page writeback */ - f2fs_wait_on_encrypted_page_writeback(fio->sbi, fio->old_blkaddr); + f2fs_wait_on_block_writeback(fio->sbi, fio->old_blkaddr); retry_encrypt: fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, @@ -1470,7 +1481,8 @@ out: } static int __write_data_page(struct page *page, bool *submitted, - struct writeback_control *wbc) + struct writeback_control *wbc, + enum iostat_type io_type) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1491,6 +1503,7 @@ static int __write_data_page(struct page *page, bool *submitted, .encrypted_page = NULL, .submitted = false, .need_lock = LOCK_RETRY, + .io_type = io_type, }; trace_f2fs_writepage(page, DATA); @@ -1597,7 +1610,7 @@ redirty_out: static int f2fs_write_data_page(struct page *page, struct writeback_control *wbc) { - return __write_data_page(page, NULL, wbc); + return __write_data_page(page, NULL, wbc, FS_DATA_IO); } /* @@ -1606,7 +1619,8 @@ static int f2fs_write_data_page(struct page *page, * warm/hot data page. */ static int f2fs_write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc) + struct writeback_control *wbc, + enum iostat_type io_type) { int ret = 0; int done = 0; @@ -1696,7 +1710,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = __write_data_page(page, &submitted, wbc); + ret = __write_data_page(page, &submitted, wbc, io_type); if (unlikely(ret)) { /* * keep nr_to_write, since vfs uses this to @@ -1751,8 +1765,9 @@ continue_unlock: return ret; } -static int f2fs_write_data_pages(struct address_space *mapping, - struct writeback_control *wbc) +int __f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc, + enum iostat_type io_type) { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1789,7 +1804,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, goto skip_write; blk_start_plug(&plug); - ret = f2fs_write_cache_pages(mapping, wbc); + ret = f2fs_write_cache_pages(mapping, wbc, io_type); blk_finish_plug(&plug); if (wbc->sync_mode == WB_SYNC_ALL) @@ -1808,6 +1823,16 @@ skip_write: return 0; } +static int f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + + return __f2fs_write_data_pages(mapping, wbc, + F2FS_I(inode)->cp_task == current ? 
+ FS_CP_DATA_IO : FS_DATA_IO); +} + static void f2fs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; @@ -1857,7 +1882,7 @@ restart: set_new_dnode(&dn, inode, ipage, ipage, 0); if (f2fs_has_inline_data(inode)) { - if (pos + len <= MAX_INLINE_DATA) { + if (pos + len <= MAX_INLINE_DATA(inode)) { read_inline_data(page, ipage); set_inode_flag(inode, FI_DATA_EXIST); if (inode->i_nlink) @@ -1955,8 +1980,8 @@ repeat: f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); + if (f2fs_encrypted_file(inode)) + f2fs_wait_on_block_writeback(sbi, blkaddr); if (len == PAGE_SIZE || PageUptodate(page)) return 0; @@ -1970,21 +1995,9 @@ repeat: zero_user_segment(page, 0, PAGE_SIZE); SetPageUptodate(page); } else { - struct bio *bio; - - bio = f2fs_grab_bio(inode, blkaddr, 1); - if (IS_ERR(bio)) { - err = PTR_ERR(bio); - goto fail; - } - bio->bi_opf = REQ_OP_READ; - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - bio_put(bio); - err = -EFAULT; + err = f2fs_submit_page_read(inode, page, blkaddr); + if (err) goto fail; - } - - __submit_bio(sbi, bio, DATA); lock_page(page); if (unlikely(page->mapping != mapping)) { @@ -2074,10 +2087,13 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) up_read(&F2FS_I(inode)->dio_rwsem[rw]); if (rw == WRITE) { - if (err > 0) + if (err > 0) { + f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, + err); set_inode_flag(inode, FI_UPDATE_WRITE); - else if (err < 0) + } else if (err < 0) { f2fs_write_failed(mapping, offset + count); + } } trace_f2fs_direct_IO_exit(inode, offset, count, rw, err); @@ -2252,7 +2268,10 @@ int f2fs_migrate_page(struct address_space *mapping, SetPagePrivate(newpage); set_page_private(newpage, page_private(page)); - migrate_page_copy(newpage, page); + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); + else + migrate_page_states(newpage, page); return MIGRATEPAGE_SUCCESS; } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 37f9c7f55605..c0c933ad43c8 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -705,6 +705,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, struct f2fs_dentry_block *dentry_blk; unsigned int bit_pos; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); + struct address_space *mapping = page_mapping(page); + unsigned long flags; int i; f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); @@ -735,6 +737,11 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (bit_pos == NR_DENTRY_IN_BLOCK && !truncate_hole(dir, page->index, page->index + 1)) { + spin_lock_irqsave(&mapping->tree_lock, flags); + radix_tree_tag_clear(&mapping->page_tree, page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + clear_page_dirty_for_io(page); ClearPagePrivate(page); ClearPageUptodate(page); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 94a88b233e98..9a7c90386947 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -91,6 +91,8 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_LFS 0x00040000 #define F2FS_MOUNT_USRQUOTA 0x00080000 #define F2FS_MOUNT_GRPQUOTA 0x00100000 +#define F2FS_MOUNT_PRJQUOTA 0x00200000 +#define F2FS_MOUNT_QUOTA 0x00400000 #define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) @@ -110,8 +112,12 
@@ struct f2fs_mount_info { unsigned int opt; }; -#define F2FS_FEATURE_ENCRYPT 0x0001 -#define F2FS_FEATURE_BLKZONED 0x0002 +#define F2FS_FEATURE_ENCRYPT 0x0001 +#define F2FS_FEATURE_BLKZONED 0x0002 +#define F2FS_FEATURE_ATOMIC_WRITE 0x0004 +#define F2FS_FEATURE_EXTRA_ATTR 0x0008 +#define F2FS_FEATURE_PRJQUOTA 0x0010 +#define F2FS_FEATURE_INODE_CHKSUM 0x0020 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -142,6 +148,8 @@ enum { (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) #define DISCARD_ISSUE_RATE 8 +#define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ +#define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ @@ -190,11 +198,18 @@ struct discard_entry { unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */ }; +/* default discard granularity of inner discard thread, unit: block count */ +#define DEFAULT_DISCARD_GRANULARITY 16 + /* max discard pend list number */ #define MAX_PLIST_NUM 512 #define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \ (MAX_PLIST_NUM - 1) : (blk_num - 1)) +#define P_ACTIVE 0x01 +#define P_TRIM 0x02 +#define plist_issue(tag) (((tag) & P_ACTIVE) || ((tag) & P_TRIM)) + enum { D_PREP, D_SUBMIT, @@ -230,11 +245,14 @@ struct discard_cmd_control { struct task_struct *f2fs_issue_discard; /* discard thread */ struct list_head entry_list; /* 4KB discard entry list */ struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */ + unsigned char pend_list_tag[MAX_PLIST_NUM];/* tag for pending entries */ struct list_head wait_list; /* store on-flushing entries */ wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ + unsigned int discard_wake; /* to wake up discard thread */ struct mutex cmd_lock; unsigned int nr_discards; /* # of discards in the list */ unsigned int max_discards; /* max. 
discards to be issued */ + unsigned int discard_granularity; /* discard granularity */ unsigned int undiscard_blks; /* # of undiscard blocks */ atomic_t issued_discard; /* # of issued discard */ atomic_t issing_discard; /* # of issing discard */ @@ -308,6 +326,7 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, struct f2fs_flush_device) #define F2FS_IOC_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11, \ struct f2fs_gc_range) +#define F2FS_IOC_GET_FEATURES _IOR(F2FS_IOCTL_MAGIC, 12, __u32) #define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY @@ -332,6 +351,9 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION #endif +#define F2FS_IOC_FSGETXATTR FS_IOC_FSGETXATTR +#define F2FS_IOC_FSSETXATTR FS_IOC_FSSETXATTR + struct f2fs_gc_range { u32 sync; u64 start; @@ -355,16 +377,36 @@ struct f2fs_flush_device { u32 segments; /* # of segments to flush */ }; +/* for inline stuff */ +#define DEF_INLINE_RESERVED_SIZE 1 +static inline int get_extra_isize(struct inode *inode); +#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ + (CUR_ADDRS_PER_INODE(inode) - \ + DEF_INLINE_RESERVED_SIZE - \ + F2FS_INLINE_XATTR_ADDRS)) + +/* for inline dir */ +#define NR_INLINE_DENTRY(inode) (MAX_INLINE_DATA(inode) * BITS_PER_BYTE / \ + ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ + BITS_PER_BYTE + 1)) +#define INLINE_DENTRY_BITMAP_SIZE(inode) ((NR_INLINE_DENTRY(inode) + \ + BITS_PER_BYTE - 1) / BITS_PER_BYTE) +#define INLINE_RESERVED_SIZE(inode) (MAX_INLINE_DATA(inode) - \ + ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ + NR_INLINE_DENTRY(inode) + \ + INLINE_DENTRY_BITMAP_SIZE(inode))) + /* * For INODE and NODE manager */ /* for directory operations */ struct f2fs_dentry_ptr { struct inode *inode; - const void *bitmap; + void *bitmap; struct f2fs_dir_entry *dentry; __u8 (*filename)[F2FS_SLOT_LEN]; int max; + int nr_bitmap; }; static inline void make_dentry_ptr_block(struct inode *inode, @@ -372,19 +414,26 @@ static inline void make_dentry_ptr_block(struct inode *inode, { d->inode = inode; d->max = NR_DENTRY_IN_BLOCK; + d->nr_bitmap = SIZE_OF_DENTRY_BITMAP; d->bitmap = &t->dentry_bitmap; d->dentry = t->dentry; d->filename = t->filename; } static inline void make_dentry_ptr_inline(struct inode *inode, - struct f2fs_dentry_ptr *d, struct f2fs_inline_dentry *t) + struct f2fs_dentry_ptr *d, void *t) { + int entry_cnt = NR_INLINE_DENTRY(inode); + int bitmap_size = INLINE_DENTRY_BITMAP_SIZE(inode); + int reserved_size = INLINE_RESERVED_SIZE(inode); + d->inode = inode; - d->max = NR_INLINE_DENTRY; - d->bitmap = &t->dentry_bitmap; - d->dentry = t->dentry; - d->filename = t->filename; + d->max = entry_cnt; + d->nr_bitmap = bitmap_size; + d->bitmap = t; + d->dentry = t + bitmap_size + reserved_size; + d->filename = t + bitmap_size + reserved_size + + SIZE_OF_DIR_ENTRY * entry_cnt; } /* @@ -473,12 +522,13 @@ struct f2fs_map_blocks { }; /* for flag in get_data_block */ -#define F2FS_GET_BLOCK_READ 0 -#define F2FS_GET_BLOCK_DIO 1 -#define F2FS_GET_BLOCK_FIEMAP 2 -#define F2FS_GET_BLOCK_BMAP 3 -#define F2FS_GET_BLOCK_PRE_DIO 4 -#define F2FS_GET_BLOCK_PRE_AIO 5 +enum { + F2FS_GET_BLOCK_DEFAULT, + F2FS_GET_BLOCK_FIEMAP, + F2FS_GET_BLOCK_BMAP, + F2FS_GET_BLOCK_PRE_DIO, + F2FS_GET_BLOCK_PRE_AIO, +}; /* * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. 
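[Illustrative aside, not part of the patch] The MAX_INLINE_DATA()/NR_INLINE_DENTRY()/INLINE_DENTRY_BITMAP_SIZE()/INLINE_RESERVED_SIZE() macros added above size the inline area from the inode's remaining i_addr slots instead of using fixed constants, which is what lets inodes with an extra-attribute area keep working. A minimal standalone C sketch of the same arithmetic follows; the numeric constants are assumed from the f2fs on-disk headers (they are not shown in this hunk), and the 36-byte case is a purely hypothetical i_extra_isize chosen only for comparison.

/*
 * Sketch only: recompute the inline-data / inline-dentry geometry for a
 * given i_extra_isize, mirroring the macros introduced in this hunk.
 * All constants are assumptions taken from the f2fs headers.
 */
#include <stdio.h>

#define DEF_ADDRS_PER_INODE		923	/* assumed: i_addr slots in an inode */
#define F2FS_INLINE_XATTR_ADDRS		50	/* assumed: slots kept for inline xattrs */
#define DEF_INLINE_RESERVED_SIZE	1	/* one reserved __le32, as in the patch */
#define SIZE_OF_DIR_ENTRY		11	/* assumed on-disk dentry size */
#define F2FS_SLOT_LEN			8	/* filename slot length */
#define BITS_PER_BYTE			8

static void show_inline_geometry(int extra_isize_bytes)
{
	int cur_addrs = DEF_ADDRS_PER_INODE - extra_isize_bytes / 4;
	int max_inline = 4 * (cur_addrs - DEF_INLINE_RESERVED_SIZE -
			      F2FS_INLINE_XATTR_ADDRS);
	int nr_dentry = max_inline * BITS_PER_BYTE /
			((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * BITS_PER_BYTE + 1);
	int bitmap = (nr_dentry + BITS_PER_BYTE - 1) / BITS_PER_BYTE;
	int reserved = max_inline -
		       ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * nr_dentry + bitmap);

	printf("extra_isize=%3d: MAX_INLINE_DATA=%d NR_INLINE_DENTRY=%d bitmap=%d reserved=%d\n",
	       extra_isize_bytes, max_inline, nr_dentry, bitmap, reserved);
}

int main(void)
{
	show_inline_geometry(0);	/* legacy inode, no extra attributes */
	show_inline_geometry(36);	/* hypothetical i_extra_isize, for comparison */
	return 0;
}

Assuming those constants, the zero-extra-attribute case reproduces the old fixed geometry (3488 bytes of inline data, 182 inline dentries), which is why existing images keep the same layout after this change.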
@@ -521,6 +571,7 @@ struct f2fs_inode_info { f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ struct task_struct *task; /* lookup and create consistency */ + struct task_struct *cp_task; /* separate cp/wb IO stats*/ nid_t i_xattr_nid; /* node id that contains xattrs */ loff_t last_disk_size; /* lastly written file size */ @@ -533,10 +584,15 @@ struct f2fs_inode_info { struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ + struct task_struct *inmem_task; /* store inmemory task */ struct mutex inmem_lock; /* lock for inmemory pages */ struct extent_tree *extent_tree; /* cached extent_tree entry */ struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */ struct rw_semaphore i_mmap_sem; + struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */ + + int i_extra_isize; /* size of extra space located in i_addr */ + kprojid_t i_projid; /* id for project quota */ }; static inline void get_extent_info(struct extent_info *ext, @@ -823,6 +879,23 @@ enum need_lock_type { LOCK_RETRY, }; +enum iostat_type { + APP_DIRECT_IO, /* app direct IOs */ + APP_BUFFERED_IO, /* app buffered IOs */ + APP_WRITE_IO, /* app write IOs */ + APP_MAPPED_IO, /* app mapped IOs */ + FS_DATA_IO, /* data IOs from kworker/fsync/reclaimer */ + FS_NODE_IO, /* node IOs from kworker/fsync/reclaimer */ + FS_META_IO, /* meta IOs from kworker/reclaimer */ + FS_GC_DATA_IO, /* data IOs from forground gc */ + FS_GC_NODE_IO, /* node IOs from forground gc */ + FS_CP_DATA_IO, /* data IOs from checkpoint */ + FS_CP_NODE_IO, /* node IOs from checkpoint */ + FS_CP_META_IO, /* meta IOs from checkpoint */ + FS_DISCARD, /* discard */ + NR_IO_TYPE, +}; + struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ @@ -837,6 +910,7 @@ struct f2fs_io_info { bool submitted; /* indicate IO submission */ int need_lock; /* indicate we need to lock cp_rwsem */ bool in_list; /* indicate fio is in io_list */ + enum iostat_type io_type; /* io type */ }; #define is_read_io(rw) ((rw) == READ) @@ -1028,6 +1102,11 @@ struct f2fs_sb_info { #endif spinlock_t stat_lock; /* lock for stat operations */ + /* For app/fs IO statistics */ + spinlock_t iostat_lock; + unsigned long long write_iostat[NR_IO_TYPE]; + bool iostat_enable; + /* For sysfs suppport */ struct kobject s_kobj; struct completion s_kobj_unregister; @@ -1046,10 +1125,19 @@ struct f2fs_sb_info { /* Reference to checksum algorithm driver via cryptoapi */ struct crypto_shash *s_chksum_driver; + /* Precomputed FS UUID checksum for seeding other checksums */ + __u32 s_chksum_seed; + /* For fault injection */ #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info fault_info; #endif + +#ifdef CONFIG_QUOTA + /* Names of quota files with journalled quota */ + char *s_qf_names[MAXQUOTAS]; + int s_jquota_fmt; /* Format of quota to use */ +#endif }; #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -1137,6 +1225,27 @@ static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, return f2fs_crc32(sbi, buf, buf_size) == blk_crc; } +static inline u32 f2fs_chksum(struct f2fs_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[4]; + } desc; + int err; + + BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver) != sizeof(desc.ctx)); + 
+ desc.shash.tfm = sbi->s_chksum_driver; + desc.shash.flags = 0; + *(u32 *)desc.ctx = crc; + + err = crypto_shash_update(&desc.shash, address, length); + BUG_ON(err); + + return *(u32 *)desc.ctx; +} + static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) { return container_of(inode, struct f2fs_inode_info, vfs_inode); @@ -1760,20 +1869,38 @@ static inline bool IS_INODE(struct page *page) return RAW_IS_INODE(p); } +static inline int offset_in_addr(struct f2fs_inode *i) +{ + return (i->i_inline & F2FS_EXTRA_ATTR) ? + (le16_to_cpu(i->i_extra_isize) / sizeof(__le32)) : 0; +} + static inline __le32 *blkaddr_in_node(struct f2fs_node *node) { return RAW_IS_INODE(node) ? node->i.i_addr : node->dn.addr; } -static inline block_t datablock_addr(struct page *node_page, - unsigned int offset) +static inline int f2fs_has_extra_attr(struct inode *inode); +static inline block_t datablock_addr(struct inode *inode, + struct page *node_page, unsigned int offset) { struct f2fs_node *raw_node; __le32 *addr_array; + int base = 0; + bool is_inode = IS_INODE(node_page); raw_node = F2FS_NODE(node_page); + + /* from GC path only */ + if (!inode) { + if (is_inode) + base = offset_in_addr(&raw_node->i); + } else if (f2fs_has_extra_attr(inode) && is_inode) { + base = get_extra_isize(inode); + } + addr_array = blkaddr_in_node(raw_node); - return le32_to_cpu(addr_array[offset]); + return le32_to_cpu(addr_array[base + offset]); } static inline int f2fs_test_bit(unsigned int nr, char *addr) @@ -1836,6 +1963,20 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) *addr ^= mask; } +#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) +#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) +#define F2FS_FL_INHERITED (FS_PROJINHERIT_FL) + +static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & F2FS_REG_FLMASK; + else + return flags & F2FS_OTHER_FLMASK; +} + /* used for f2fs_inode_info->flags */ enum { FI_NEW_INODE, /* indicate newly allocated inode */ @@ -1864,6 +2005,8 @@ enum { FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ FI_HOT_DATA, /* indicate file is hot */ + FI_EXTRA_ATTR, /* indicate file has extra attribute */ + FI_PROJ_INHERIT, /* indicate file inherits projectid */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, @@ -1983,6 +2126,8 @@ static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) set_bit(FI_DATA_EXIST, &fi->flags); if (ri->i_inline & F2FS_INLINE_DOTS) set_bit(FI_INLINE_DOTS, &fi->flags); + if (ri->i_inline & F2FS_EXTRA_ATTR) + set_bit(FI_EXTRA_ATTR, &fi->flags); } static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) @@ -1999,6 +2144,13 @@ static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) ri->i_inline |= F2FS_DATA_EXIST; if (is_inode_flag_set(inode, FI_INLINE_DOTS)) ri->i_inline |= F2FS_INLINE_DOTS; + if (is_inode_flag_set(inode, FI_EXTRA_ATTR)) + ri->i_inline |= F2FS_EXTRA_ATTR; +} + +static inline int f2fs_has_extra_attr(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_EXTRA_ATTR); } static inline int f2fs_has_inline_xattr(struct inode *inode) @@ -2009,8 +2161,8 @@ static inline int f2fs_has_inline_xattr(struct inode *inode) static inline unsigned int addrs_per_inode(struct inode *inode) { if (f2fs_has_inline_xattr(inode)) - return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS; - return 
DEF_ADDRS_PER_INODE; + return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS; + return CUR_ADDRS_PER_INODE(inode); } static inline void *inline_xattr_addr(struct page *page) @@ -2069,11 +2221,12 @@ static inline bool f2fs_is_drop_cache(struct inode *inode) return is_inode_flag_set(inode, FI_DROP_CACHE); } -static inline void *inline_data_addr(struct page *page) +static inline void *inline_data_addr(struct inode *inode, struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); + int extra_size = get_extra_isize(inode); - return (void *)&(ri->i_addr[1]); + return (void *)&(ri->i_addr[extra_size + DEF_INLINE_RESERVED_SIZE]); } static inline int f2fs_has_inline_dentry(struct inode *inode) @@ -2164,10 +2317,50 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, return kmalloc(size, flags); } +static inline int get_extra_isize(struct inode *inode) +{ + return F2FS_I(inode)->i_extra_isize / sizeof(__le32); +} + #define get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) +#define F2FS_TOTAL_EXTRA_ATTR_SIZE \ + (offsetof(struct f2fs_inode, i_extra_end) - \ + offsetof(struct f2fs_inode, i_extra_isize)) \ + +#define F2FS_OLD_ATTRIBUTE_SIZE (offsetof(struct f2fs_inode, i_addr)) +#define F2FS_FITS_IN_INODE(f2fs_inode, extra_isize, field) \ + ((offsetof(typeof(*f2fs_inode), field) + \ + sizeof((f2fs_inode)->field)) \ + <= (F2FS_OLD_ATTRIBUTE_SIZE + extra_isize)) \ + +static inline void f2fs_reset_iostat(struct f2fs_sb_info *sbi) +{ + int i; + + spin_lock(&sbi->iostat_lock); + for (i = 0; i < NR_IO_TYPE; i++) + sbi->write_iostat[i] = 0; + spin_unlock(&sbi->iostat_lock); +} + +static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, + enum iostat_type type, unsigned long long io_bytes) +{ + if (!sbi->iostat_enable) + return; + spin_lock(&sbi->iostat_lock); + sbi->write_iostat[type] += io_bytes; + + if (type == APP_WRITE_IO || type == APP_DIRECT_IO) + sbi->write_iostat[APP_BUFFERED_IO] = + sbi->write_iostat[APP_WRITE_IO] - + sbi->write_iostat[APP_DIRECT_IO]; + spin_unlock(&sbi->iostat_lock); +} + /* * file.c */ @@ -2187,6 +2380,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); * inode.c */ void f2fs_set_inode_flags(struct inode *inode); +bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page); +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page); struct inode *f2fs_iget(struct super_block *sb, unsigned long ino); struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino); int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); @@ -2255,6 +2450,8 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) */ int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); +void f2fs_enable_quota_files(struct f2fs_sb_info *sbi); +void f2fs_quota_off_umount(struct super_block *sb); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); extern __printf(3, 4) @@ -2285,15 +2482,15 @@ int truncate_xattr_node(struct inode *inode, struct page *page); int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino); int remove_inode_page(struct inode *inode); struct page *new_inode_page(struct inode *inode); -struct page *new_node_page(struct dnode_of_data *dn, - unsigned int ofs, struct page *ipage); +struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs); void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); 
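[Illustrative aside, not part of the patch] Note how f2fs_update_iostat() above never increments APP_BUFFERED_IO directly: buffered bytes are rederived as APP_WRITE_IO minus APP_DIRECT_IO while iostat_lock is held, so the three app-level counters stay mutually consistent. A minimal userspace sketch of that bookkeeping, with a pthread mutex standing in for the sbi spinlock (the names mirror the kernel enum, but nothing below is kernel code):

#include <pthread.h>
#include <stdio.h>

enum iostat_type { APP_DIRECT_IO, APP_BUFFERED_IO, APP_WRITE_IO, NR_IO_TYPE };

static pthread_mutex_t iostat_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long long write_iostat[NR_IO_TYPE];

/* account bytes for one type, then rederive the buffered total under the lock */
static void update_iostat(enum iostat_type type, unsigned long long bytes)
{
	pthread_mutex_lock(&iostat_lock);
	write_iostat[type] += bytes;
	if (type == APP_WRITE_IO || type == APP_DIRECT_IO)
		write_iostat[APP_BUFFERED_IO] =
			write_iostat[APP_WRITE_IO] - write_iostat[APP_DIRECT_IO];
	pthread_mutex_unlock(&iostat_lock);
}

int main(void)
{
	update_iostat(APP_WRITE_IO, 8192);	/* all app writes, as in the write_iter path */
	update_iostat(APP_DIRECT_IO, 4096);	/* the direct-IO portion of those writes */
	printf("buffered bytes = %llu\n", write_iostat[APP_BUFFERED_IO]);	/* prints 4096 */
	return 0;
}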
struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); struct page *get_node_page_ra(struct page *parent, int start); void move_node_page(struct page *node_page, int gc_type); int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic); -int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc); +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, + bool do_balance, enum iostat_type io_type); void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount); bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); @@ -2314,6 +2511,7 @@ void destroy_node_manager_caches(void); /* * segment.c */ +bool need_SSR(struct f2fs_sb_info *sbi); void register_inmem_page(struct inode *inode, struct page *page); void drop_inmem_pages(struct inode *inode); void drop_inmem_page(struct inode *inode, struct page *page); @@ -2336,7 +2534,8 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc); struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno); void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr); -void write_meta_page(struct f2fs_sb_info *sbi, struct page *page); +void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, + enum iostat_type io_type); void write_node_page(unsigned int nid, struct f2fs_io_info *fio); void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio); int rewrite_data_page(struct f2fs_io_info *fio); @@ -2353,8 +2552,7 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_io_info *fio, bool add_list); void f2fs_wait_on_page_writeback(struct page *page, enum page_type type, bool ordered); -void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, - block_t blkaddr); +void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr); void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); int lookup_journal_in_cursum(struct f2fs_journal *journal, int type, @@ -2377,7 +2575,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync); void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, - long nr_to_write); + long nr_to_write, enum iostat_type io_type); void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); void release_ino_entry(struct f2fs_sb_info *sbi, bool all); @@ -2430,6 +2628,9 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); void f2fs_set_page_dirty_nobuffers(struct page *page); +int __f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc, + enum iostat_type io_type); void f2fs_invalidate_page(struct page *page, unsigned int offset, unsigned int length); int f2fs_release_page(struct page *page, gfp_t wait); @@ -2726,10 +2927,10 @@ void destroy_extent_cache(void); /* * sysfs.c */ -int __init f2fs_register_sysfs(void); -void f2fs_unregister_sysfs(void); -int f2fs_init_sysfs(struct f2fs_sb_info *sbi); -void f2fs_exit_sysfs(struct f2fs_sb_info *sbi); +int __init 
f2fs_init_sysfs(void); +void f2fs_exit_sysfs(void); +int f2fs_register_sysfs(struct f2fs_sb_info *sbi); +void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi); /* * crypto support @@ -2739,6 +2940,11 @@ static inline bool f2fs_encrypted_inode(struct inode *inode) return file_is_encrypt(inode); } +static inline bool f2fs_encrypted_file(struct inode *inode) +{ + return f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode); +} + static inline void f2fs_set_encrypted_inode(struct inode *inode) { #ifdef CONFIG_F2FS_FS_ENCRYPTION @@ -2761,6 +2967,21 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); } +static inline int f2fs_sb_has_extra_attr(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_EXTRA_ATTR); +} + +static inline int f2fs_sb_has_project_quota(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_PRJQUOTA); +} + +static inline int f2fs_sb_has_inode_chksum(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CHKSUM); +} + #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkaddr) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 2706130c261b..517e112c8a9a 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -98,14 +98,16 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf) if (!PageUptodate(page)) SetPageUptodate(page); + f2fs_update_iostat(sbi, APP_MAPPED_IO, F2FS_BLKSIZE); + trace_f2fs_vm_page_mkwrite(page, DATA); mapped: /* fill the page */ f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); + if (f2fs_encrypted_file(inode)) + f2fs_wait_on_block_writeback(sbi, dn.data_blkaddr); out_sem: up_read(&F2FS_I(inode)->i_mmap_sem); @@ -206,7 +208,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, /* if fdatasync is triggered, let's do in-place-update */ if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) set_inode_flag(inode, FI_NEED_IPU); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = file_write_and_wait_range(file, start, end); clear_inode_flag(inode, FI_NEED_IPU); if (ret) { @@ -274,9 +276,19 @@ sync_nodes: goto sync_nodes; } - ret = wait_on_node_pages_writeback(sbi, ino); - if (ret) - goto out; + /* + * If it's atomic_write, it's just fine to keep write ordering. So + * here we don't need to wait for node write completion, since we use + * node chain which serializes node blocks. If one of node writes are + * reordered, we can see simply broken chain, resulting in stopping + * roll-forward recovery. It means we'll recover all or none node blocks + * given fsync mark. 
+ */ + if (!atomic) { + ret = wait_on_node_pages_writeback(sbi, ino); + if (ret) + goto out; + } /* once recovery info is written, don't need to tack this */ remove_ino_entry(sbi, ino, APPEND_INO); @@ -382,7 +394,8 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) dn.ofs_in_node++, pgofs++, data_ofs = (loff_t)pgofs << PAGE_SHIFT) { block_t blkaddr; - blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); if (__found_offset(blkaddr, dirty, pgofs, whence)) { f2fs_put_dnode(&dn); @@ -467,9 +480,13 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) struct f2fs_node *raw_node; int nr_free = 0, ofs = dn->ofs_in_node, len = count; __le32 *addr; + int base = 0; + + if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) + base = get_extra_isize(dn->inode); raw_node = F2FS_NODE(dn->node_page); - addr = blkaddr_in_node(raw_node) + ofs; + addr = blkaddr_in_node(raw_node) + base + ofs; for (; count > 0; count--, addr++, dn->ofs_in_node++) { block_t blkaddr = le32_to_cpu(*addr); @@ -647,7 +664,7 @@ int f2fs_getattr(const struct path *path, struct kstat *stat, struct f2fs_inode_info *fi = F2FS_I(inode); unsigned int flags; - flags = fi->i_flags & FS_FL_USER_VISIBLE; + flags = fi->i_flags & (FS_FL_USER_VISIBLE | FS_PROJINHERIT_FL); if (flags & FS_APPEND_FL) stat->attributes |= STATX_ATTR_APPEND; if (flags & FS_COMPR_FL) @@ -927,7 +944,8 @@ next_dnode: done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) - dn.ofs_in_node, len); for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { - *blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + *blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); if (!is_checkpointed_data(sbi, *blkaddr)) { if (test_opt(sbi, LFS)) { @@ -1003,8 +1021,8 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, ADDRS_PER_PAGE(dn.node_page, dst_inode) - dn.ofs_in_node, len - i); do { - dn.data_blkaddr = datablock_addr(dn.node_page, - dn.ofs_in_node); + dn.data_blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); truncate_data_blocks_range(&dn, 1); if (do_replace[i]) { @@ -1173,7 +1191,8 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, int ret; for (; index < end; index++, dn->ofs_in_node++) { - if (datablock_addr(dn->node_page, dn->ofs_in_node) == NULL_ADDR) + if (datablock_addr(dn->inode, dn->node_page, + dn->ofs_in_node) == NULL_ADDR) count++; } @@ -1184,8 +1203,8 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, dn->ofs_in_node = ofs_in_node; for (index = start; index < end; index++, dn->ofs_in_node++) { - dn->data_blkaddr = - datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); /* * reserve_new_blocks will not guarantee entire block * allocation. @@ -1495,33 +1514,67 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) return 0; } -#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) -#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) - -static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) +static int f2fs_file_flush(struct file *file, fl_owner_t id) { - if (S_ISDIR(mode)) - return flags; - else if (S_ISREG(mode)) - return flags & F2FS_REG_FLMASK; - else - return flags & F2FS_OTHER_FLMASK; + struct inode *inode = file_inode(file); + + /* + * If the process doing a transaction is crashed, we should do + * roll-back. 
Otherwise, other reader/write can see corrupted database + * until all the writers close its file. Since this should be done + * before dropping file lock, it needs to do in ->flush. + */ + if (f2fs_is_atomic_file(inode) && + F2FS_I(inode)->inmem_task == current) + drop_inmem_pages(inode); + return 0; } static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_inode_info *fi = F2FS_I(inode); - unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE; + unsigned int flags = fi->i_flags & + (FS_FL_USER_VISIBLE | FS_PROJINHERIT_FL); return put_user(flags, (int __user *)arg); } +static int __f2fs_ioc_setflags(struct inode *inode, unsigned int flags) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + unsigned int oldflags; + + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) + return -EPERM; + + flags = f2fs_mask_flags(inode->i_mode, flags); + + oldflags = fi->i_flags; + + if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) + if (!capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; + + flags = flags & (FS_FL_USER_MODIFIABLE | FS_PROJINHERIT_FL); + flags |= oldflags & ~(FS_FL_USER_MODIFIABLE | FS_PROJINHERIT_FL); + fi->i_flags = flags; + + if (fi->i_flags & FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + else + clear_inode_flag(inode, FI_PROJ_INHERIT); + + inode->i_ctime = current_time(inode); + f2fs_set_inode_flags(inode); + f2fs_mark_inode_dirty_sync(inode, false); + return 0; +} + static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); - struct f2fs_inode_info *fi = F2FS_I(inode); unsigned int flags; - unsigned int oldflags; int ret; if (!inode_owner_or_capable(inode)) @@ -1536,31 +1589,8 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) inode_lock(inode); - /* Is it quota file? 
Do not allow user to mess with it */ - if (IS_NOQUOTA(inode)) { - ret = -EPERM; - goto unlock_out; - } - - flags = f2fs_mask_flags(inode->i_mode, flags); - - oldflags = fi->i_flags; - - if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) { - ret = -EPERM; - goto unlock_out; - } - } - - flags = flags & FS_FL_USER_MODIFIABLE; - flags |= oldflags & ~FS_FL_USER_MODIFIABLE; - fi->i_flags = flags; + ret = __f2fs_ioc_setflags(inode, flags); - inode->i_ctime = current_time(inode); - f2fs_set_inode_flags(inode); - f2fs_mark_inode_dirty_sync(inode, false); -unlock_out: inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1610,10 +1640,12 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) { clear_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_HOT_DATA); goto out; } inc_stat: + F2FS_I(inode)->inmem_task = current; stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); out: @@ -1647,10 +1679,11 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) { clear_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_HOT_DATA); stat_dec_atomic_write(inode); } } else { - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } err_out: inode_unlock(inode); @@ -1786,7 +1819,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_METAFLUSH: - sync_meta_pages(sbi, META, LONG_MAX); + sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); f2fs_stop_checkpoint(sbi, false); break; default: @@ -2043,7 +2076,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, */ while (map.m_lblk < pg_end) { map.m_len = pg_end - map.m_lblk; - err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); if (err) goto out; @@ -2085,7 +2118,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, do_map: map.m_len = pg_end - map.m_lblk; - err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); if (err) goto clear_out; @@ -2384,6 +2417,210 @@ out: return ret; } +static int f2fs_ioc_get_features(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + u32 sb_feature = le32_to_cpu(F2FS_I_SB(inode)->raw_super->feature); + + /* Must validate to set it with SQLite behavior in Android. */ + sb_feature |= F2FS_FEATURE_ATOMIC_WRITE; + + return put_user(sb_feature, (u32 __user *)arg); +} + +#ifdef CONFIG_QUOTA +static int f2fs_ioc_setproject(struct file *filp, __u32 projid) +{ + struct inode *inode = file_inode(filp); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct super_block *sb = sbi->sb; + struct dquot *transfer_to[MAXQUOTAS] = {}; + struct page *ipage; + kprojid_t kprojid; + int err; + + if (!f2fs_sb_has_project_quota(sb)) { + if (projid != F2FS_DEF_PROJID) + return -EOPNOTSUPP; + else + return 0; + } + + if (!f2fs_has_extra_attr(inode)) + return -EOPNOTSUPP; + + kprojid = make_kprojid(&init_user_ns, (projid_t)projid); + + if (projid_eq(kprojid, F2FS_I(inode)->i_projid)) + return 0; + + err = mnt_want_write_file(filp); + if (err) + return err; + + err = -EPERM; + inode_lock(inode); + + /* Is it quota file? 
Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) + goto out_unlock; + + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto out_unlock; + } + + if (!F2FS_FITS_IN_INODE(F2FS_INODE(ipage), fi->i_extra_isize, + i_projid)) { + err = -EOVERFLOW; + f2fs_put_page(ipage, 1); + goto out_unlock; + } + f2fs_put_page(ipage, 1); + + dquot_initialize(inode); + + transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); + if (!IS_ERR(transfer_to[PRJQUOTA])) { + err = __dquot_transfer(inode, transfer_to); + dqput(transfer_to[PRJQUOTA]); + if (err) + goto out_dirty; + } + + F2FS_I(inode)->i_projid = kprojid; + inode->i_ctime = current_time(inode); +out_dirty: + f2fs_mark_inode_dirty_sync(inode, true); +out_unlock: + inode_unlock(inode); + mnt_drop_write_file(filp); + return err; +} +#else +static int f2fs_ioc_setproject(struct file *filp, __u32 projid) +{ + if (projid != F2FS_DEF_PROJID) + return -EOPNOTSUPP; + return 0; +} +#endif + +/* Transfer internal flags to xflags */ +static inline __u32 f2fs_iflags_to_xflags(unsigned long iflags) +{ + __u32 xflags = 0; + + if (iflags & FS_SYNC_FL) + xflags |= FS_XFLAG_SYNC; + if (iflags & FS_IMMUTABLE_FL) + xflags |= FS_XFLAG_IMMUTABLE; + if (iflags & FS_APPEND_FL) + xflags |= FS_XFLAG_APPEND; + if (iflags & FS_NODUMP_FL) + xflags |= FS_XFLAG_NODUMP; + if (iflags & FS_NOATIME_FL) + xflags |= FS_XFLAG_NOATIME; + if (iflags & FS_PROJINHERIT_FL) + xflags |= FS_XFLAG_PROJINHERIT; + return xflags; +} + +#define F2FS_SUPPORTED_FS_XFLAGS (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | \ + FS_XFLAG_APPEND | FS_XFLAG_NODUMP | \ + FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT) + +/* Flags we can manipulate with through EXT4_IOC_FSSETXATTR */ +#define F2FS_FL_XFLAG_VISIBLE (FS_SYNC_FL | \ + FS_IMMUTABLE_FL | \ + FS_APPEND_FL | \ + FS_NODUMP_FL | \ + FS_NOATIME_FL | \ + FS_PROJINHERIT_FL) + +/* Transfer xflags flags to internal */ +static inline unsigned long f2fs_xflags_to_iflags(__u32 xflags) +{ + unsigned long iflags = 0; + + if (xflags & FS_XFLAG_SYNC) + iflags |= FS_SYNC_FL; + if (xflags & FS_XFLAG_IMMUTABLE) + iflags |= FS_IMMUTABLE_FL; + if (xflags & FS_XFLAG_APPEND) + iflags |= FS_APPEND_FL; + if (xflags & FS_XFLAG_NODUMP) + iflags |= FS_NODUMP_FL; + if (xflags & FS_XFLAG_NOATIME) + iflags |= FS_NOATIME_FL; + if (xflags & FS_XFLAG_PROJINHERIT) + iflags |= FS_PROJINHERIT_FL; + + return iflags; +} + +static int f2fs_ioc_fsgetxattr(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct fsxattr fa; + + memset(&fa, 0, sizeof(struct fsxattr)); + fa.fsx_xflags = f2fs_iflags_to_xflags(fi->i_flags & + (FS_FL_USER_VISIBLE | FS_PROJINHERIT_FL)); + + if (f2fs_sb_has_project_quota(inode->i_sb)) + fa.fsx_projid = (__u32)from_kprojid(&init_user_ns, + fi->i_projid); + + if (copy_to_user((struct fsxattr __user *)arg, &fa, sizeof(fa))) + return -EFAULT; + return 0; +} + +static int f2fs_ioc_fssetxattr(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct fsxattr fa; + unsigned int flags; + int err; + + if (copy_from_user(&fa, (struct fsxattr __user *)arg, sizeof(fa))) + return -EFAULT; + + /* Make sure caller has proper permission */ + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (fa.fsx_xflags & ~F2FS_SUPPORTED_FS_XFLAGS) + return -EOPNOTSUPP; + + flags = f2fs_xflags_to_iflags(fa.fsx_xflags); + if (f2fs_mask_flags(inode->i_mode, flags) != flags) + return 
-EOPNOTSUPP; + + err = mnt_want_write_file(filp); + if (err) + return err; + + inode_lock(inode); + flags = (fi->i_flags & ~F2FS_FL_XFLAG_VISIBLE) | + (flags & F2FS_FL_XFLAG_VISIBLE); + err = __f2fs_ioc_setflags(inode, flags); + inode_unlock(inode); + mnt_drop_write_file(filp); + if (err) + return err; + + err = f2fs_ioc_setproject(filp, fa.fsx_projid); + if (err) + return err; + + return 0; +} long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { @@ -2426,6 +2663,12 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_move_range(filp, arg); case F2FS_IOC_FLUSH_DEVICE: return f2fs_ioc_flush_device(filp, arg); + case F2FS_IOC_GET_FEATURES: + return f2fs_ioc_get_features(filp, arg); + case F2FS_IOC_FSGETXATTR: + return f2fs_ioc_fsgetxattr(filp, arg); + case F2FS_IOC_FSSETXATTR: + return f2fs_ioc_fssetxattr(filp, arg); default: return -ENOTTY; } @@ -2455,6 +2698,9 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = __generic_file_write_iter(iocb, from); blk_finish_plug(&plug); clear_inode_flag(inode, FI_NO_PREALLOC); + + if (ret > 0) + f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); } inode_unlock(inode); @@ -2491,6 +2737,9 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_DEFRAGMENT: case F2FS_IOC_MOVE_RANGE: case F2FS_IOC_FLUSH_DEVICE: + case F2FS_IOC_GET_FEATURES: + case F2FS_IOC_FSGETXATTR: + case F2FS_IOC_FSSETXATTR: break; default: return -ENOIOCTLCMD; @@ -2506,6 +2755,7 @@ const struct file_operations f2fs_file_operations = { .open = f2fs_file_open, .release = f2fs_release_file, .mmap = f2fs_file_mmap, + .flush = f2fs_file_flush, .fsync = f2fs_sync_file, .fallocate = f2fs_fallocate, .unlocked_ioctl = f2fs_ioctl, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index fa3d2e2df8e7..bfe6a8ccc3a0 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -28,16 +28,21 @@ static int gc_thread_func(void *data) struct f2fs_sb_info *sbi = data; struct f2fs_gc_kthread *gc_th = sbi->gc_thread; wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; - long wait_ms; + unsigned int wait_ms; wait_ms = gc_th->min_sleep_time; set_freezable(); do { wait_event_interruptible_timeout(*wq, - kthread_should_stop() || freezing(current), + kthread_should_stop() || freezing(current) || + gc_th->gc_wake, msecs_to_jiffies(wait_ms)); + /* give it a try one time */ + if (gc_th->gc_wake) + gc_th->gc_wake = 0; + if (try_to_freeze()) continue; if (kthread_should_stop()) @@ -55,6 +60,9 @@ static int gc_thread_func(void *data) } #endif + if (!sb_start_write_trylock(sbi->sb)) + continue; + /* * [GC triggering condition] * 0. GC is not conducted currently. @@ -69,19 +77,24 @@ static int gc_thread_func(void *data) * So, I'd like to wait some time to collect dirty segments. 
*/ if (!mutex_trylock(&sbi->gc_mutex)) - continue; + goto next; + + if (gc_th->gc_urgent) { + wait_ms = gc_th->urgent_sleep_time; + goto do_gc; + } if (!is_idle(sbi)) { increase_sleep_time(gc_th, &wait_ms); mutex_unlock(&sbi->gc_mutex); - continue; + goto next; } if (has_enough_invalid_blocks(sbi)) decrease_sleep_time(gc_th, &wait_ms); else increase_sleep_time(gc_th, &wait_ms); - +do_gc: stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ @@ -93,6 +106,8 @@ static int gc_thread_func(void *data) /* balancing f2fs's metadata periodically */ f2fs_balance_fs_bg(sbi); +next: + sb_end_write(sbi->sb); } while (!kthread_should_stop()); return 0; @@ -110,11 +125,14 @@ int start_gc_thread(struct f2fs_sb_info *sbi) goto out; } + gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME; gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME; gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME; gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME; gc_th->gc_idle = 0; + gc_th->gc_urgent = 0; + gc_th->gc_wake= 0; sbi->gc_thread = gc_th; init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); @@ -259,20 +277,11 @@ static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi, valid_blocks * 2 : valid_blocks; } -static unsigned int get_ssr_cost(struct f2fs_sb_info *sbi, - unsigned int segno) -{ - struct seg_entry *se = get_seg_entry(sbi, segno); - - return se->ckpt_valid_blocks > se->valid_blocks ? - se->ckpt_valid_blocks : se->valid_blocks; -} - static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, struct victim_sel_policy *p) { if (p->alloc_mode == SSR) - return get_ssr_cost(sbi, segno); + return get_seg_entry(sbi, segno)->ckpt_valid_blocks; /* alloc_mode == LFS */ if (p->gc_mode == GC_GREEDY) @@ -582,7 +591,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, } *nofs = ofs_of_node(node_page); - source_blkaddr = datablock_addr(node_page, ofs_in_node); + source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node); f2fs_put_page(node_page, 1); if (source_blkaddr != blkaddr) @@ -590,8 +599,12 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return true; } -static void move_encrypted_block(struct inode *inode, block_t bidx, - unsigned int segno, int off) +/* + * Move data block via META_MAPPING while keeping locked data page. + * This can be used to move blocks, aka LBAs, directly on disk. 
+ */ +static void move_data_block(struct inode *inode, block_t bidx, + unsigned int segno, int off) { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), @@ -684,6 +697,8 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, fio.new_blkaddr = newaddr; f2fs_submit_page_write(&fio); + f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE); + f2fs_update_data_blkaddr(&dn, newaddr); set_inode_flag(inode, FI_APPEND_WRITE); if (page->index == 0) @@ -731,6 +746,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, .page = page, .encrypted_page = NULL, .need_lock = LOCK_REQ, + .io_type = FS_GC_DATA_IO, }; bool is_dirty = PageDirty(page); int err; @@ -819,8 +835,7 @@ next_step: continue; /* if encrypted inode, let's go phase 3 */ - if (f2fs_encrypted_inode(inode) && - S_ISREG(inode->i_mode)) { + if (f2fs_encrypted_file(inode)) { add_gc_inode(gc_list, inode); continue; } @@ -854,14 +869,18 @@ next_step: continue; } locked = true; + + /* wait for all inflight aio data */ + inode_dio_wait(inode); } start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - move_encrypted_block(inode, start_bidx, segno, off); + if (f2fs_encrypted_file(inode)) + move_data_block(inode, start_bidx, segno, off); else - move_data_page(inode, start_bidx, gc_type, segno, off); + move_data_page(inode, start_bidx, gc_type, + segno, off); if (locked) { up_write(&fi->dio_rwsem[WRITE]); @@ -898,7 +917,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, struct blk_plug plug; unsigned int segno = start_segno; unsigned int end_segno = start_segno + sbi->segs_per_sec; - int sec_freed = 0; + int seg_freed = 0; unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? SUM_TYPE_DATA : SUM_TYPE_NODE; @@ -944,6 +963,10 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, gc_type); stat_inc_seg_count(sbi, type, gc_type); + + if (gc_type == FG_GC && + get_valid_blocks(sbi, segno, false) == 0) + seg_freed++; next: f2fs_put_page(sum_page, 0); } @@ -954,21 +977,17 @@ next: blk_finish_plug(&plug); - if (gc_type == FG_GC && - get_valid_blocks(sbi, start_segno, true) == 0) - sec_freed = 1; - stat_inc_call_count(sbi->stat_info); - return sec_freed; + return seg_freed; } int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, unsigned int segno) { int gc_type = sync ? FG_GC : BG_GC; - int sec_freed = 0; - int ret; + int sec_freed = 0, seg_freed = 0, total_freed = 0; + int ret = 0; struct cp_control cpc; unsigned int init_segno = segno; struct gc_inode_list gc_list = { @@ -976,6 +995,15 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, .iroot = RADIX_TREE_INIT(GFP_NOFS), }; + trace_f2fs_gc_begin(sbi->sb, sync, background, + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_IMETA), + free_sections(sbi), + free_segments(sbi), + reserved_segments(sbi), + prefree_segments(sbi)); + cpc.reason = __get_cp_reason(sbi); gc_more: if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) { @@ -1002,17 +1030,20 @@ gc_more: gc_type = FG_GC; } - ret = -EINVAL; /* f2fs_balance_fs doesn't need to do BG_GC in critical path. 
*/ - if (gc_type == BG_GC && !background) + if (gc_type == BG_GC && !background) { + ret = -EINVAL; goto stop; - if (!__get_victim(sbi, &segno, gc_type)) + } + if (!__get_victim(sbi, &segno, gc_type)) { + ret = -ENODATA; goto stop; - ret = 0; + } - if (do_garbage_collect(sbi, segno, &gc_list, gc_type) && - gc_type == FG_GC) + seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type); + if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec) sec_freed++; + total_freed += seg_freed; if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; @@ -1029,6 +1060,16 @@ gc_more: stop: SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; + + trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed, + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_IMETA), + free_sections(sbi), + free_segments(sbi), + reserved_segments(sbi), + prefree_segments(sbi)); + mutex_unlock(&sbi->gc_mutex); put_gc_inode(&gc_list); diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index a993967dcdb9..9325191fab2d 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -13,6 +13,7 @@ * whether IO subsystem is idle * or not */ +#define DEF_GC_THREAD_URGENT_SLEEP_TIME 500 /* 500 ms */ #define DEF_GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */ #define DEF_GC_THREAD_MAX_SLEEP_TIME 60000 #define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */ @@ -27,12 +28,15 @@ struct f2fs_gc_kthread { wait_queue_head_t gc_wait_queue_head; /* for gc sleep time */ + unsigned int urgent_sleep_time; unsigned int min_sleep_time; unsigned int max_sleep_time; unsigned int no_gc_sleep_time; /* for changing gc mode */ unsigned int gc_idle; + unsigned int gc_urgent; + unsigned int gc_wake; }; struct gc_inode_list { @@ -65,25 +69,32 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi) } static inline void increase_sleep_time(struct f2fs_gc_kthread *gc_th, - long *wait) + unsigned int *wait) { + unsigned int min_time = gc_th->min_sleep_time; + unsigned int max_time = gc_th->max_sleep_time; + if (*wait == gc_th->no_gc_sleep_time) return; - *wait += gc_th->min_sleep_time; - if (*wait > gc_th->max_sleep_time) - *wait = gc_th->max_sleep_time; + if ((long long)*wait + (long long)min_time > (long long)max_time) + *wait = max_time; + else + *wait += min_time; } static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th, - long *wait) + unsigned int *wait) { + unsigned int min_time = gc_th->min_sleep_time; + if (*wait == gc_th->no_gc_sleep_time) *wait = gc_th->max_sleep_time; - *wait -= gc_th->min_sleep_time; - if (*wait <= gc_th->min_sleep_time) - *wait = gc_th->min_sleep_time; + if ((long long)*wait - (long long)min_time < (long long)min_time) + *wait = min_time; + else + *wait -= min_time; } static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index e0fd4376e6fb..8322e4e7bb3f 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -22,10 +22,10 @@ bool f2fs_may_inline_data(struct inode *inode) if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return false; - if (i_size_read(inode) > MAX_INLINE_DATA) + if (i_size_read(inode) > MAX_INLINE_DATA(inode)) return false; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + if (f2fs_encrypted_file(inode)) return false; return true; @@ -44,6 +44,7 @@ bool f2fs_may_inline_dentry(struct inode *inode) void read_inline_data(struct page *page, struct page *ipage) { + struct inode *inode = page->mapping->host; void *src_addr, *dst_addr; 
if (PageUptodate(page)) @@ -51,12 +52,12 @@ void read_inline_data(struct page *page, struct page *ipage) f2fs_bug_on(F2FS_P_SB(page), page->index); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA(inode), PAGE_SIZE); /* Copy the whole inline data block */ - src_addr = inline_data_addr(ipage); + src_addr = inline_data_addr(inode, ipage); dst_addr = kmap_atomic(page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); flush_dcache_page(page); kunmap_atomic(dst_addr); if (!PageUptodate(page)) @@ -67,13 +68,13 @@ void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from) { void *addr; - if (from >= MAX_INLINE_DATA) + if (from >= MAX_INLINE_DATA(inode)) return; - addr = inline_data_addr(ipage); + addr = inline_data_addr(inode, ipage); f2fs_wait_on_page_writeback(ipage, NODE, true); - memset(addr + from, 0, MAX_INLINE_DATA - from); + memset(addr + from, 0, MAX_INLINE_DATA(inode) - from); set_page_dirty(ipage); if (from == 0) @@ -116,6 +117,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) .op_flags = REQ_SYNC | REQ_PRIO, .page = page, .encrypted_page = NULL, + .io_type = FS_DATA_IO, }; int dirty, err; @@ -200,6 +202,8 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) { void *src_addr, *dst_addr; struct dnode_of_data dn; + struct address_space *mapping = page_mapping(page); + unsigned long flags; int err; set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -216,11 +220,16 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) f2fs_wait_on_page_writeback(dn.inode_page, NODE, true); src_addr = kmap_atomic(page); - dst_addr = inline_data_addr(dn.inode_page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + dst_addr = inline_data_addr(inode, dn.inode_page); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); kunmap_atomic(src_addr); set_page_dirty(dn.inode_page); + spin_lock_irqsave(&mapping->tree_lock, flags); + radix_tree_tag_clear(&mapping->page_tree, page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); @@ -255,9 +264,9 @@ process_inline: f2fs_wait_on_page_writeback(ipage, NODE, true); - src_addr = inline_data_addr(npage); - dst_addr = inline_data_addr(ipage); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + src_addr = inline_data_addr(inode, npage); + dst_addr = inline_data_addr(inode, ipage); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); set_inode_flag(inode, FI_INLINE_DATA); set_inode_flag(inode, FI_DATA_EXIST); @@ -285,11 +294,11 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, struct fscrypt_name *fname, struct page **res_page) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); - struct f2fs_inline_dentry *inline_dentry; struct qstr name = FSTR_TO_QSTR(&fname->disk_name); struct f2fs_dir_entry *de; struct f2fs_dentry_ptr d; struct page *ipage; + void *inline_dentry; f2fs_hash_t namehash; ipage = get_node_page(sbi, dir->i_ino); @@ -300,9 +309,9 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, namehash = f2fs_dentry_hash(&name, fname); - inline_dentry = inline_data_addr(ipage); + inline_dentry = inline_data_addr(dir, ipage); - make_dentry_ptr_inline(NULL, &d, inline_dentry); + make_dentry_ptr_inline(dir, &d, inline_dentry); de = find_target_dentry(fname, namehash, NULL, &d); unlock_page(ipage); if (de) @@ -316,19 +325,19 @@ struct f2fs_dir_entry 
*find_in_inline_dir(struct inode *dir, int make_empty_inline_dir(struct inode *inode, struct inode *parent, struct page *ipage) { - struct f2fs_inline_dentry *inline_dentry; struct f2fs_dentry_ptr d; + void *inline_dentry; - inline_dentry = inline_data_addr(ipage); + inline_dentry = inline_data_addr(inode, ipage); - make_dentry_ptr_inline(NULL, &d, inline_dentry); + make_dentry_ptr_inline(inode, &d, inline_dentry); do_make_empty_dir(inode, parent, &d); set_page_dirty(ipage); /* update i_size to MAX_INLINE_DATA */ - if (i_size_read(inode) < MAX_INLINE_DATA) - f2fs_i_size_write(inode, MAX_INLINE_DATA); + if (i_size_read(inode) < MAX_INLINE_DATA(inode)) + f2fs_i_size_write(inode, MAX_INLINE_DATA(inode)); return 0; } @@ -337,11 +346,12 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, * release ipage in this function. */ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { struct page *page; struct dnode_of_data dn; struct f2fs_dentry_block *dentry_blk; + struct f2fs_dentry_ptr src, dst; int err; page = f2fs_grab_cache_page(dir->i_mapping, 0, false); @@ -356,25 +366,24 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, goto out; f2fs_wait_on_page_writeback(page, DATA, true); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA(dir), PAGE_SIZE); dentry_blk = kmap_atomic(page); + make_dentry_ptr_inline(dir, &src, inline_dentry); + make_dentry_ptr_block(dir, &dst, dentry_blk); + /* copy data from inline dentry block to new dentry block */ - memcpy(dentry_blk->dentry_bitmap, inline_dentry->dentry_bitmap, - INLINE_DENTRY_BITMAP_SIZE); - memset(dentry_blk->dentry_bitmap + INLINE_DENTRY_BITMAP_SIZE, 0, - SIZE_OF_DENTRY_BITMAP - INLINE_DENTRY_BITMAP_SIZE); + memcpy(dst.bitmap, src.bitmap, src.nr_bitmap); + memset(dst.bitmap + src.nr_bitmap, 0, dst.nr_bitmap - src.nr_bitmap); /* * we do not need to zero out remainder part of dentry and filename * field, since we have used bitmap for marking the usage status of * them, besides, we can also ignore copying/zeroing reserved space * of dentry block, because them haven't been used so far. 
*/ - memcpy(dentry_blk->dentry, inline_dentry->dentry, - sizeof(struct f2fs_dir_entry) * NR_INLINE_DENTRY); - memcpy(dentry_blk->filename, inline_dentry->filename, - NR_INLINE_DENTRY * F2FS_SLOT_LEN); + memcpy(dst.dentry, src.dentry, SIZE_OF_DIR_ENTRY * src.max); + memcpy(dst.filename, src.filename, src.max * F2FS_SLOT_LEN); kunmap_atomic(dentry_blk); if (!PageUptodate(page)) @@ -395,14 +404,13 @@ out: return err; } -static int f2fs_add_inline_entries(struct inode *dir, - struct f2fs_inline_dentry *inline_dentry) +static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) { struct f2fs_dentry_ptr d; unsigned long bit_pos = 0; int err = 0; - make_dentry_ptr_inline(NULL, &d, inline_dentry); + make_dentry_ptr_inline(dir, &d, inline_dentry); while (bit_pos < d.max) { struct f2fs_dir_entry *de; @@ -444,19 +452,19 @@ punch_dentry_pages: } static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { - struct f2fs_inline_dentry *backup_dentry; + void *backup_dentry; int err; backup_dentry = f2fs_kmalloc(F2FS_I_SB(dir), - sizeof(struct f2fs_inline_dentry), GFP_F2FS_ZERO); + MAX_INLINE_DATA(dir), GFP_F2FS_ZERO); if (!backup_dentry) { f2fs_put_page(ipage, 1); return -ENOMEM; } - memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA); + memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA(dir)); truncate_inline_inode(dir, ipage, 0); unlock_page(ipage); @@ -473,9 +481,9 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, return 0; recover: lock_page(ipage); - memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA); + memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA(dir)); f2fs_i_depth_write(dir, 0); - f2fs_i_size_write(dir, MAX_INLINE_DATA); + f2fs_i_size_write(dir, MAX_INLINE_DATA(dir)); set_page_dirty(ipage); f2fs_put_page(ipage, 1); @@ -484,7 +492,7 @@ recover: } static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { if (!F2FS_I(dir)->i_dir_level) return f2fs_move_inline_dirents(dir, ipage, inline_dentry); @@ -500,7 +508,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, struct page *ipage; unsigned int bit_pos; f2fs_hash_t name_hash; - struct f2fs_inline_dentry *inline_dentry = NULL; + void *inline_dentry = NULL; struct f2fs_dentry_ptr d; int slots = GET_DENTRY_SLOTS(new_name->len); struct page *page = NULL; @@ -510,10 +518,11 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, if (IS_ERR(ipage)) return PTR_ERR(ipage); - inline_dentry = inline_data_addr(ipage); - bit_pos = room_for_filename(&inline_dentry->dentry_bitmap, - slots, NR_INLINE_DENTRY); - if (bit_pos >= NR_INLINE_DENTRY) { + inline_dentry = inline_data_addr(dir, ipage); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = room_for_filename(d.bitmap, slots, d.max); + if (bit_pos >= d.max) { err = f2fs_convert_inline_dir(dir, ipage, inline_dentry); if (err) return err; @@ -534,7 +543,6 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, f2fs_wait_on_page_writeback(ipage, NODE, true); name_hash = f2fs_dentry_hash(new_name, NULL); - make_dentry_ptr_inline(NULL, &d, inline_dentry); f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); set_page_dirty(ipage); @@ -557,7 +565,8 @@ out: void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, struct inode *dir, struct inode *inode) { - struct f2fs_inline_dentry 
*inline_dentry; + struct f2fs_dentry_ptr d; + void *inline_dentry; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); unsigned int bit_pos; int i; @@ -565,11 +574,12 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, lock_page(page); f2fs_wait_on_page_writeback(page, NODE, true); - inline_dentry = inline_data_addr(page); - bit_pos = dentry - inline_dentry->dentry; + inline_dentry = inline_data_addr(dir, page); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = dentry - d.dentry; for (i = 0; i < slots; i++) - __clear_bit_le(bit_pos + i, - &inline_dentry->dentry_bitmap); + __clear_bit_le(bit_pos + i, d.bitmap); set_page_dirty(page); f2fs_put_page(page, 1); @@ -586,20 +596,21 @@ bool f2fs_empty_inline_dir(struct inode *dir) struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct page *ipage; unsigned int bit_pos = 2; - struct f2fs_inline_dentry *inline_dentry; + void *inline_dentry; + struct f2fs_dentry_ptr d; ipage = get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return false; - inline_dentry = inline_data_addr(ipage); - bit_pos = find_next_bit_le(&inline_dentry->dentry_bitmap, - NR_INLINE_DENTRY, - bit_pos); + inline_dentry = inline_data_addr(dir, ipage); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = find_next_bit_le(d.bitmap, d.max, bit_pos); f2fs_put_page(ipage, 1); - if (bit_pos < NR_INLINE_DENTRY) + if (bit_pos < d.max) return false; return true; @@ -609,25 +620,27 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, struct fscrypt_str *fstr) { struct inode *inode = file_inode(file); - struct f2fs_inline_dentry *inline_dentry = NULL; struct page *ipage = NULL; struct f2fs_dentry_ptr d; + void *inline_dentry = NULL; int err; - if (ctx->pos == NR_INLINE_DENTRY) + make_dentry_ptr_inline(inode, &d, inline_dentry); + + if (ctx->pos == d.max) return 0; ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); - inline_dentry = inline_data_addr(ipage); + inline_dentry = inline_data_addr(inode, ipage); make_dentry_ptr_inline(inode, &d, inline_dentry); err = f2fs_fill_dentries(ctx, &d, 0, fstr); if (!err) - ctx->pos = NR_INLINE_DENTRY; + ctx->pos = d.max; f2fs_put_page(ipage, 1); return err < 0 ? 
err : 0; @@ -652,7 +665,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, goto out; } - ilen = min_t(size_t, MAX_INLINE_DATA, i_size_read(inode)); + ilen = min_t(size_t, MAX_INLINE_DATA(inode), i_size_read(inode)); if (start >= ilen) goto out; if (start + len < ilen) @@ -661,7 +674,8 @@ int f2fs_inline_data_fiemap(struct inode *inode, get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits; - byteaddr += (char *)inline_data_addr(ipage) - (char *)F2FS_INODE(ipage); + byteaddr += (char *)inline_data_addr(inode, ipage) - + (char *)F2FS_INODE(ipage); err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags); out: f2fs_put_page(ipage, 1); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 6cd312a17c69..50c88e37ed66 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -49,20 +49,22 @@ void f2fs_set_inode_flags(struct inode *inode) static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) { + int extra_size = get_extra_isize(inode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { - if (ri->i_addr[0]) - inode->i_rdev = - old_decode_dev(le32_to_cpu(ri->i_addr[0])); + if (ri->i_addr[extra_size]) + inode->i_rdev = old_decode_dev( + le32_to_cpu(ri->i_addr[extra_size])); else - inode->i_rdev = - new_decode_dev(le32_to_cpu(ri->i_addr[1])); + inode->i_rdev = new_decode_dev( + le32_to_cpu(ri->i_addr[extra_size + 1])); } } static bool __written_first_block(struct f2fs_inode *ri) { - block_t addr = le32_to_cpu(ri->i_addr[0]); + block_t addr = le32_to_cpu(ri->i_addr[offset_in_addr(ri)]); if (addr != NEW_ADDR && addr != NULL_ADDR) return true; @@ -71,25 +73,27 @@ static bool __written_first_block(struct f2fs_inode *ri) static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) { + int extra_size = get_extra_isize(inode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { if (old_valid_dev(inode->i_rdev)) { - ri->i_addr[0] = + ri->i_addr[extra_size] = cpu_to_le32(old_encode_dev(inode->i_rdev)); - ri->i_addr[1] = 0; + ri->i_addr[extra_size + 1] = 0; } else { - ri->i_addr[0] = 0; - ri->i_addr[1] = + ri->i_addr[extra_size] = 0; + ri->i_addr[extra_size + 1] = cpu_to_le32(new_encode_dev(inode->i_rdev)); - ri->i_addr[2] = 0; + ri->i_addr[extra_size + 2] = 0; } } } static void __recover_inline_status(struct inode *inode, struct page *ipage) { - void *inline_data = inline_data_addr(ipage); + void *inline_data = inline_data_addr(inode, ipage); __le32 *start = inline_data; - __le32 *end = start + MAX_INLINE_DATA / sizeof(__le32); + __le32 *end = start + MAX_INLINE_DATA(inode) / sizeof(__le32); while (start < end) { if (*start++) { @@ -104,12 +108,84 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage) return; } +static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri = &F2FS_NODE(page)->i; + int extra_isize = le32_to_cpu(ri->i_extra_isize); + + if (!f2fs_sb_has_inode_chksum(sbi->sb)) + return false; + + if (!RAW_IS_INODE(F2FS_NODE(page)) || !(ri->i_inline & F2FS_EXTRA_ATTR)) + return false; + + if (!F2FS_FITS_IN_INODE(ri, extra_isize, i_inode_checksum)) + return false; + + return true; +} + +static __u32 f2fs_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_node *node = F2FS_NODE(page); + struct f2fs_inode *ri = &node->i; + __le32 ino = node->footer.ino; + __le32 gen = ri->i_generation; + __u32 chksum, chksum_seed; + __u32 dummy_cs = 0; + 
unsigned int offset = offsetof(struct f2fs_inode, i_inode_checksum); + unsigned int cs_size = sizeof(dummy_cs); + + chksum = f2fs_chksum(sbi, sbi->s_chksum_seed, (__u8 *)&ino, + sizeof(ino)); + chksum_seed = f2fs_chksum(sbi, chksum, (__u8 *)&gen, sizeof(gen)); + + chksum = f2fs_chksum(sbi, chksum_seed, (__u8 *)ri, offset); + chksum = f2fs_chksum(sbi, chksum, (__u8 *)&dummy_cs, cs_size); + offset += cs_size; + chksum = f2fs_chksum(sbi, chksum, (__u8 *)ri + offset, + F2FS_BLKSIZE - offset); + return chksum; +} + +bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri; + __u32 provided, calculated; + + if (!f2fs_enable_inode_chksum(sbi, page) || + PageDirty(page) || PageWriteback(page)) + return true; + + ri = &F2FS_NODE(page)->i; + provided = le32_to_cpu(ri->i_inode_checksum); + calculated = f2fs_inode_chksum(sbi, page); + + if (provided != calculated) + f2fs_msg(sbi->sb, KERN_WARNING, + "checksum invalid, ino = %x, %x vs. %x", + ino_of_node(page), provided, calculated); + + return provided == calculated; +} + +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri = &F2FS_NODE(page)->i; + + if (!f2fs_enable_inode_chksum(sbi, page)) + return; + + ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page)); +} + static int do_read_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); struct page *node_page; struct f2fs_inode *ri; + projid_t i_projid; /* Check if ino is within scope */ if (check_nid_range(sbi, inode->i_ino)) { @@ -153,6 +229,9 @@ static int do_read_inode(struct inode *inode) get_inline_info(inode, ri); + fi->i_extra_isize = f2fs_has_extra_attr(inode) ? + le16_to_cpu(ri->i_extra_isize) : 0; + /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) __recover_inline_status(inode, node_page); @@ -166,6 +245,16 @@ static int do_read_inode(struct inode *inode) if (!need_inode_block_update(sbi, inode->i_ino)) fi->last_disk_size = inode->i_size; + if (fi->i_flags & FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_project_quota(sbi->sb) && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid)) + i_projid = (projid_t)le32_to_cpu(ri->i_projid); + else + i_projid = F2FS_DEF_PROJID; + fi->i_projid = make_kprojid(&init_user_ns, i_projid); + f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -292,6 +381,20 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_generation = cpu_to_le32(inode->i_generation); ri->i_dir_level = F2FS_I(inode)->i_dir_level; + if (f2fs_has_extra_attr(inode)) { + ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize); + + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) && + F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, + i_projid)) { + projid_t i_projid; + + i_projid = from_kprojid(&init_user_ns, + F2FS_I(inode)->i_projid); + ri->i_projid = cpu_to_le32(i_projid); + } + } + __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); @@ -416,6 +519,9 @@ no_delete: stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); + if (!is_set_ckpt_flags(sbi, CP_ERROR_FLAG)) + f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE)); + /* ino == 0, if f2fs_new_inode() was failed t*/ if (inode->i_ino) invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 760d85223c81..a4dab98c4b7b 100644 --- a/fs/f2fs/namei.c +++ 
b/fs/f2fs/namei.c @@ -58,6 +58,13 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) goto fail; } + if (f2fs_sb_has_project_quota(sbi->sb) && + (F2FS_I(dir)->i_flags & FS_PROJINHERIT_FL)) + F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid; + else + F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns, + F2FS_DEF_PROJID); + err = dquot_initialize(inode); if (err) goto fail_drop; @@ -72,6 +79,11 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) set_inode_flag(inode, FI_NEW_INODE); + if (f2fs_sb_has_extra_attr(sbi->sb)) { + set_inode_flag(inode, FI_EXTRA_ATTR); + F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE; + } + if (test_opt(sbi, INLINE_XATTR)) set_inode_flag(inode, FI_INLINE_XATTR); if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) @@ -85,6 +97,15 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); + F2FS_I(inode)->i_flags = + f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); + + if (S_ISDIR(inode->i_mode)) + F2FS_I(inode)->i_flags |= FS_INDEX_FL; + + if (F2FS_I(inode)->i_flags & FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + trace_f2fs_new_inode(inode, 0); return inode; @@ -204,6 +225,11 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, !fscrypt_has_permitted_context(dir, inode)) return -EPERM; + if (is_inode_flag_set(dir, FI_PROJ_INHERIT) && + (!projid_eq(F2FS_I(dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + err = dquot_initialize(dir); if (err) return err; @@ -261,6 +287,10 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) return 0; } + err = dquot_initialize(dir); + if (err) + return err; + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); @@ -724,6 +754,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; } + if (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + (!projid_eq(F2FS_I(new_dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + err = dquot_initialize(old_dir); if (err) goto out; @@ -912,6 +947,14 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, !fscrypt_has_permitted_context(old_dir, new_inode))) return -EPERM; + if ((is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + !projid_eq(F2FS_I(new_dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid)) || + (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + !projid_eq(F2FS_I(old_dir)->i_projid, + F2FS_I(new_dentry->d_inode)->i_projid))) + return -EXDEV; + err = dquot_initialize(old_dir); if (err) goto out; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d53fe620939e..fca87835a1da 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -19,6 +19,7 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include "xattr.h" #include "trace.h" #include <trace/events/f2fs.h> @@ -554,7 +555,7 @@ static int get_node_path(struct inode *inode, long block, level = 3; goto got; } else { - BUG(); + return -E2BIG; } got: return level; @@ -578,6 +579,8 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) int err = 0; level = get_node_path(dn->inode, index, offset, noffset); + if (level < 0) + return level; nids[0] = dn->inode->i_ino; npage[0] = dn->inode_page; @@ -613,7 +616,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) } dn->nid = nids[i]; - npage[i] = new_node_page(dn, noffset[i], NULL); + npage[i] = new_node_page(dn, noffset[i]); if (IS_ERR(npage[i])) { 
alloc_nid_failed(sbi, nids[i]); err = PTR_ERR(npage[i]); @@ -654,7 +657,8 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) dn->nid = nids[level]; dn->ofs_in_node = offset[level]; dn->node_page = npage[level]; - dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); return 0; release_pages: @@ -876,6 +880,8 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) trace_f2fs_truncate_inode_blocks_enter(inode, from); level = get_node_path(inode, from, offset, noffset); + if (level < 0) + return level; page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { @@ -1022,11 +1028,10 @@ struct page *new_inode_page(struct inode *inode) set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); /* caller should f2fs_put_page(page, 1); */ - return new_node_page(&dn, 0, NULL); + return new_node_page(&dn, 0); } -struct page *new_node_page(struct dnode_of_data *dn, - unsigned int ofs, struct page *ipage) +struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct node_info new_ni; @@ -1170,6 +1175,11 @@ repeat: err = -EIO; goto out_err; } + + if (!f2fs_inode_chksum_verify(sbi, page)) { + err = -EBADMSG; + goto out_err; + } page_hit: if(unlikely(nid != nid_of_node(page))) { f2fs_msg(sbi->sb, KERN_WARNING, "inconsistent node block, " @@ -1177,9 +1187,9 @@ page_hit: nid, nid_of_node(page), ino_of_node(page), ofs_of_node(page), cpver_of_node(page), next_blkaddr_of_node(page)); - ClearPageUptodate(page); err = -EINVAL; out_err: + ClearPageUptodate(page); f2fs_put_page(page, 1); return ERR_PTR(err); } @@ -1326,7 +1336,8 @@ continue_unlock: } static int __write_node_page(struct page *page, bool atomic, bool *submitted, - struct writeback_control *wbc) + struct writeback_control *wbc, bool do_balance, + enum iostat_type io_type) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); nid_t nid; @@ -1339,6 +1350,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, .page = page, .encrypted_page = NULL, .submitted = false, + .io_type = io_type, }; trace_f2fs_writepage(page, NODE); @@ -1395,6 +1407,8 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, if (submitted) *submitted = fio.submitted; + if (do_balance) + f2fs_balance_fs(sbi, false); return 0; redirty_out: @@ -1405,7 +1419,7 @@ redirty_out: static int f2fs_write_node_page(struct page *page, struct writeback_control *wbc) { - return __write_node_page(page, false, NULL, wbc); + return __write_node_page(page, false, NULL, wbc, false, FS_NODE_IO); } int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, @@ -1493,7 +1507,8 @@ continue_unlock: ret = __write_node_page(page, atomic && page == last_page, - &submitted, wbc); + &submitted, wbc, true, + FS_NODE_IO); if (ret) { unlock_page(page); f2fs_put_page(last_page, 0); @@ -1530,7 +1545,8 @@ out: return ret ? 
-EIO: 0; } -int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc) +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, + bool do_balance, enum iostat_type io_type) { pgoff_t index, end; struct pagevec pvec; @@ -1608,7 +1624,8 @@ continue_unlock: set_fsync_mark(page, 0); set_dentry_mark(page, 0); - ret = __write_node_page(page, false, &submitted, wbc); + ret = __write_node_page(page, false, &submitted, + wbc, do_balance, io_type); if (ret) unlock_page(page); else if (submitted) @@ -1697,7 +1714,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, diff = nr_pages_to_write(sbi, NODE, wbc); wbc->sync_mode = WB_SYNC_NONE; blk_start_plug(&plug); - sync_node_pages(sbi, wbc); + sync_node_pages(sbi, wbc, true, FS_NODE_IO); blk_finish_plug(&plug); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return 0; @@ -2191,7 +2208,8 @@ int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; - nid_t new_xnid = nid_of_node(page); + nid_t new_xnid; + struct dnode_of_data dn; struct node_info ni; struct page *xpage; @@ -2207,22 +2225,22 @@ int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) recover_xnid: /* 2: update xattr nid in inode */ - remove_free_nid(sbi, new_xnid); - f2fs_i_xnid_write(inode, new_xnid); - if (unlikely(inc_valid_node_count(sbi, inode, false))) - f2fs_bug_on(sbi, 1); + if (!alloc_nid(sbi, &new_xnid)) + return -ENOSPC; + + set_new_dnode(&dn, inode, NULL, NULL, new_xnid); + xpage = new_node_page(&dn, XATTR_NODE_OFFSET); + if (IS_ERR(xpage)) { + alloc_nid_failed(sbi, new_xnid); + return PTR_ERR(xpage); + } + + alloc_nid_done(sbi, new_xnid); update_inode_page(inode); /* 3: update and set xattr node page dirty */ - xpage = grab_cache_page(NODE_MAPPING(sbi), new_xnid); - if (!xpage) - return -ENOMEM; - - memcpy(F2FS_NODE(xpage), F2FS_NODE(page), PAGE_SIZE); + memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE); - get_node_info(sbi, new_xnid, &ni); - ni.ino = inode->i_ino; - set_node_addr(sbi, &ni, NEW_ADDR, false); set_page_dirty(xpage); f2fs_put_page(xpage, 1); @@ -2262,7 +2280,14 @@ retry: dst->i_blocks = cpu_to_le64(1); dst->i_links = cpu_to_le32(1); dst->i_xattr_nid = 0; - dst->i_inline = src->i_inline & F2FS_INLINE_XATTR; + dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR); + if (dst->i_inline & F2FS_EXTRA_ATTR) { + dst->i_extra_isize = src->i_extra_isize; + if (f2fs_sb_has_project_quota(sbi->sb) && + F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), + i_projid)) + dst->i_projid = src->i_projid; + } new_ni = old_ni; new_ni.ino = ino; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 907d6b7dde6a..9626758bc762 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -69,20 +69,34 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, } static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi, - struct list_head *head, nid_t ino) + struct list_head *head, nid_t ino, bool quota_inode) { struct inode *inode; struct fsync_inode_entry *entry; + int err; inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); + err = dquot_initialize(inode); + if (err) + goto err_out; + + if (quota_inode) { + err = dquot_alloc_inode(inode); + if (err) + goto err_out; + } + entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); entry->inode = inode; list_add_tail(&entry->list, 
head); return entry; +err_out: + iput(inode); + return ERR_PTR(err); } static void del_fsync_inode(struct fsync_inode_entry *entry) @@ -107,7 +121,8 @@ static int recover_dentry(struct inode *inode, struct page *ipage, entry = get_fsync_inode(dir_list, pino); if (!entry) { - entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, pino); + entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, + pino, false); if (IS_ERR(entry)) { dir = ERR_CAST(entry); err = PTR_ERR(entry); @@ -140,6 +155,13 @@ retry: err = -EEXIST; goto out_unmap_put; } + + err = dquot_initialize(einode); + if (err) { + iput(einode); + goto out_unmap_put; + } + err = acquire_orphan_inode(F2FS_I_SB(inode)); if (err) { iput(einode); @@ -226,18 +248,22 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, entry = get_fsync_inode(head, ino_of_node(page)); if (!entry) { + bool quota_inode = false; + if (!check_only && IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) break; + quota_inode = true; } /* * CP | dnode(F) | inode(DF) * For this case, we should not give up now. */ - entry = add_fsync_inode(sbi, head, ino_of_node(page)); + entry = add_fsync_inode(sbi, head, ino_of_node(page), + quota_inode); if (IS_ERR(entry)) { err = PTR_ERR(entry); if (err == -ENOENT) { @@ -291,7 +317,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, return 0; /* Get the previous summary */ - for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) { + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); if (curseg->segno == segno) { sum = curseg->sum_blk->entries[blkoff]; @@ -328,10 +354,18 @@ got_it: f2fs_put_page(node_page, 1); if (ino != dn->inode->i_ino) { + int ret; + /* Deallocate previous index in the node page */ inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) return PTR_ERR(inode); + + ret = dquot_initialize(inode); + if (ret) { + iput(inode); + return ret; + } } else { inode = dn->inode; } @@ -361,7 +395,8 @@ out: return 0; truncate_out: - if (datablock_addr(tdn.node_page, tdn.ofs_in_node) == blkaddr) + if (datablock_addr(tdn.inode, tdn.node_page, + tdn.ofs_in_node) == blkaddr) truncate_data_blocks_range(&tdn, 1); if (dn->inode->i_ino == nid && !dn->inode_page_locked) unlock_page(dn->inode_page); @@ -414,8 +449,8 @@ retry_dn: for (; start < end; start++, dn.ofs_in_node++) { block_t src, dest; - src = datablock_addr(dn.node_page, dn.ofs_in_node); - dest = datablock_addr(page, dn.ofs_in_node); + src = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); + dest = datablock_addr(dn.inode, page, dn.ofs_in_node); /* skip recovering if dest is the same as src */ if (src == dest) @@ -557,12 +592,27 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) struct list_head dir_list; int err; int ret = 0; + unsigned long s_flags = sbi->sb->s_flags; bool need_writecp = false; + if (s_flags & MS_RDONLY) { + f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); + sbi->sb->s_flags &= ~MS_RDONLY; + } + +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sbi->sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ + f2fs_enable_quota_files(sbi); +#endif + fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", sizeof(struct fsync_inode_entry)); - if (!fsync_entry_slab) - return -ENOMEM; + if (!fsync_entry_slab) { + err = -ENOMEM; + goto out; + } INIT_LIST_HEAD(&inode_list); INIT_LIST_HEAD(&dir_list); @@ -573,11 +623,11 @@ int 
recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) /* step #1: find fsynced inode numbers */ err = find_fsync_dnodes(sbi, &inode_list, check_only); if (err || list_empty(&inode_list)) - goto out; + goto skip; if (check_only) { ret = 1; - goto out; + goto skip; } need_writecp = true; @@ -586,7 +636,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) err = recover_data(sbi, &inode_list, &dir_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); -out: +skip: destroy_fsync_dnodes(&inode_list); /* truncate meta pages to be used by the recovery */ @@ -599,8 +649,6 @@ out: } clear_sbi_flag(sbi, SBI_POR_DOING); - if (err) - set_ckpt_flags(sbi, CP_ERROR_FLAG); mutex_unlock(&sbi->cp_mutex); /* let's drop all the directory inodes for clean checkpoint */ @@ -614,5 +662,12 @@ out: } kmem_cache_destroy(fsync_entry_slab); +out: +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + f2fs_quota_off_umount(sbi->sb); +#endif + sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + return ret ? ret: err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f964b68718c1..621b9b3d320b 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -17,10 +17,12 @@ #include <linux/swap.h> #include <linux/timer.h> #include <linux/freezer.h> +#include <linux/sched/signal.h> #include "f2fs.h" #include "segment.h" #include "node.h" +#include "gc.h" #include "trace.h" #include <trace/events/f2fs.h> @@ -167,6 +169,21 @@ found: return result - size + __reverse_ffz(tmp); } +bool need_SSR(struct f2fs_sb_info *sbi) +{ + int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); + int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + + if (test_opt(sbi, LFS)) + return false; + if (sbi->gc_thread && sbi->gc_thread->gc_urgent) + return true; + + return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + + 2 * reserved_sections(sbi)); +} + void register_inmem_page(struct inode *inode, struct page *page) { struct f2fs_inode_info *fi = F2FS_I(inode); @@ -213,9 +230,15 @@ static int __revoke_inmem_pages(struct inode *inode, struct node_info ni; trace_f2fs_commit_inmem_page(page, INMEM_REVOKE); - +retry: set_new_dnode(&dn, inode, NULL, NULL, 0); - if (get_dnode_of_data(&dn, page->index, LOOKUP_NODE)) { + err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); + if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + goto retry; + } err = -EAGAIN; goto next; } @@ -248,6 +271,7 @@ void drop_inmem_pages(struct inode *inode) mutex_unlock(&fi->inmem_lock); clear_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_HOT_DATA); stat_dec_atomic_write(inode); } @@ -292,6 +316,7 @@ static int __commit_inmem_pages(struct inode *inode, .type = DATA, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_PRIO, + .io_type = FS_DATA_IO, }; pgoff_t last_idx = ULONG_MAX; int err = 0; @@ -309,17 +334,21 @@ static int __commit_inmem_pages(struct inode *inode, inode_dec_dirty_pages(inode); remove_dirty_inode(inode); } - +retry: fio.page = page; fio.old_blkaddr = NULL_ADDR; fio.encrypted_page = NULL; fio.need_lock = LOCK_DONE; err = do_write_data_page(&fio); if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + goto retry; + } unlock_page(page); break; } - /* record old blkaddr for revoking */ cur->old_addr = fio.old_blkaddr; last_idx = page->index; @@ -447,7 +476,7 @@ static int __submit_flush_wait(struct f2fs_sb_info *sbi, int ret; bio->bi_opf = 
REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); ret = submit_bio_wait(bio); bio_put(bio); @@ -481,6 +510,8 @@ repeat: if (kthread_should_stop()) return 0; + sb_start_intwrite(sbi->sb); + if (!llist_empty(&fcc->issue_list)) { struct flush_cmd *cmd, *next; int ret; @@ -499,6 +530,8 @@ repeat: fcc->dispatch_list = NULL; } + sb_end_intwrite(sbi->sb); + wait_event_interruptible(*q, kthread_should_stop() || !llist_empty(&fcc->issue_list)); goto repeat; @@ -519,8 +552,7 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) return ret; } - if (!atomic_read(&fcc->issing_flush)) { - atomic_inc(&fcc->issing_flush); + if (atomic_inc_return(&fcc->issing_flush) == 1) { ret = submit_flush_wait(sbi); atomic_dec(&fcc->issing_flush); @@ -530,18 +562,39 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) init_completion(&cmd.wait); - atomic_inc(&fcc->issing_flush); llist_add(&cmd.llnode, &fcc->issue_list); - if (!fcc->dispatch_list) + /* update issue_list before we wake up issue_flush thread */ + smp_mb(); + + if (waitqueue_active(&fcc->flush_wait_queue)) wake_up(&fcc->flush_wait_queue); if (fcc->f2fs_issue_flush) { wait_for_completion(&cmd.wait); atomic_dec(&fcc->issing_flush); } else { - llist_del_all(&fcc->issue_list); - atomic_set(&fcc->issing_flush, 0); + struct llist_node *list; + + list = llist_del_all(&fcc->issue_list); + if (!list) { + wait_for_completion(&cmd.wait); + atomic_dec(&fcc->issing_flush); + } else { + struct flush_cmd *tmp, *next; + + ret = submit_flush_wait(sbi); + + llist_for_each_entry_safe(tmp, next, list, llnode) { + if (tmp == &cmd) { + cmd.ret = ret; + atomic_dec(&fcc->issing_flush); + continue; + } + tmp->ret = ret; + complete(&tmp->wait); + } + } } return cmd.ret; @@ -778,11 +831,14 @@ void __check_sit_bitmap(struct f2fs_sb_info *sbi, sentry = get_seg_entry(sbi, segno); offset = GET_BLKOFF_FROM_SEG0(sbi, blk); - size = min((unsigned long)(end - blk), max_blocks); + if (end < START_BLOCK(sbi, segno + 1)) + size = GET_BLKOFF_FROM_SEG0(sbi, end); + else + size = max_blocks; map = (unsigned long *)(sentry->cur_valid_map); offset = __find_rev_next_bit(map, size, offset); f2fs_bug_on(sbi, offset != size); - blk += size; + blk = START_BLOCK(sbi, segno + 1); } #endif } @@ -815,6 +871,8 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi, submit_bio(bio); list_move_tail(&dc->list, &dcc->wait_list); __check_sit_bitmap(sbi, dc->start, dc->start + dc->len); + + f2fs_update_iostat(sbi, FS_DISCARD, 1); } } else { __remove_discard_cmd(sbi, dc); @@ -996,32 +1054,81 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi, return 0; } -static void __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) +static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *pend_list; struct discard_cmd *dc, *tmp; struct blk_plug plug; - int i, iter = 0; + int iter = 0, issued = 0; + int i; + bool io_interrupted = false; mutex_lock(&dcc->cmd_lock); f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); blk_start_plug(&plug); - for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + for (i = MAX_PLIST_NUM - 1; + i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { pend_list = &dcc->pend_list[i]; list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); - if (!issue_cond || is_idle(sbi)) + /* Hurry up to finish fstrim */ + if (dcc->pend_list_tag[i] & P_TRIM) { + __submit_discard_cmd(sbi, dc); + issued++; + + if 
(fatal_signal_pending(current)) + break; + continue; + } + + if (!issue_cond) { __submit_discard_cmd(sbi, dc); - if (issue_cond && iter++ > DISCARD_ISSUE_RATE) + issued++; + continue; + } + + if (is_idle(sbi)) { + __submit_discard_cmd(sbi, dc); + issued++; + } else { + io_interrupted = true; + } + + if (++iter >= DISCARD_ISSUE_RATE) goto out; } + if (list_empty(pend_list) && dcc->pend_list_tag[i] & P_TRIM) + dcc->pend_list_tag[i] &= (~P_TRIM); } out: blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); + + if (!issued && io_interrupted) + issued = -1; + + return issued; +} + +static void __drop_discard_cmd(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + int i; + + mutex_lock(&dcc->cmd_lock); + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) { + f2fs_bug_on(sbi, dc->state != D_PREP); + __remove_discard_cmd(sbi, dc); + } + } + mutex_unlock(&dcc->cmd_lock); } static void __wait_one_discard_bio(struct f2fs_sb_info *sbi, @@ -1102,34 +1209,63 @@ void stop_discard_thread(struct f2fs_sb_info *sbi) } } -/* This comes from f2fs_put_super */ +/* This comes from f2fs_put_super and f2fs_trim_fs */ void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) { __issue_discard_cmd(sbi, false); + __drop_discard_cmd(sbi); __wait_discard_cmd(sbi, false); } +static void mark_discard_range_all(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int i; + + mutex_lock(&dcc->cmd_lock); + for (i = 0; i < MAX_PLIST_NUM; i++) + dcc->pend_list_tag[i] |= P_TRIM; + mutex_unlock(&dcc->cmd_lock); +} + static int issue_discard_thread(void *data) { struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; + unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + int issued; set_freezable(); do { - wait_event_interruptible(*q, kthread_should_stop() || - freezing(current) || - atomic_read(&dcc->discard_cmd_cnt)); + wait_event_interruptible_timeout(*q, + kthread_should_stop() || freezing(current) || + dcc->discard_wake, + msecs_to_jiffies(wait_ms)); if (try_to_freeze()) continue; if (kthread_should_stop()) return 0; - __issue_discard_cmd(sbi, true); - __wait_discard_cmd(sbi, true); + if (dcc->discard_wake) { + dcc->discard_wake = 0; + if (sbi->gc_thread && sbi->gc_thread->gc_urgent) + mark_discard_range_all(sbi); + } + + sb_start_intwrite(sbi->sb); + + issued = __issue_discard_cmd(sbi, true); + if (issued) { + __wait_discard_cmd(sbi, true); + wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + } else { + wait_ms = DEF_MAX_DISCARD_ISSUE_TIME; + } + + sb_end_intwrite(sbi->sb); - congestion_wait(BLK_RW_SYNC, HZ/50); } while (!kthread_should_stop()); return 0; } @@ -1320,7 +1456,8 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) { - struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *head = &dcc->entry_list; struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; @@ -1402,11 +1539,11 @@ skip: goto find_next; list_del(&entry->list); - SM_I(sbi)->dcc_info->nr_discards -= total_len; + dcc->nr_discards -= total_len; kmem_cache_free(discard_entry_slab, entry); } - 
wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue); + wake_up_discard_thread(sbi, false); } static int create_discard_cmd_control(struct f2fs_sb_info *sbi) @@ -1424,9 +1561,13 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) if (!dcc) return -ENOMEM; + dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; INIT_LIST_HEAD(&dcc->entry_list); - for (i = 0; i < MAX_PLIST_NUM; i++) + for (i = 0; i < MAX_PLIST_NUM; i++) { INIT_LIST_HEAD(&dcc->pend_list[i]); + if (i >= dcc->discard_granularity - 1) + dcc->pend_list_tag[i] |= P_ACTIVE; + } INIT_LIST_HEAD(&dcc->wait_list); mutex_init(&dcc->cmd_lock); atomic_set(&dcc->issued_discard, 0); @@ -1491,6 +1632,10 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) struct seg_entry *se; unsigned int segno, offset; long int new_vblocks; + bool exist; +#ifdef CONFIG_F2FS_CHECK_FS + bool mir_exist; +#endif segno = GET_SEGNO(sbi, blkaddr); @@ -1507,17 +1652,25 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) /* Update valid block bitmap */ if (del > 0) { - if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) { + exist = f2fs_test_and_set_bit(offset, se->cur_valid_map); #ifdef CONFIG_F2FS_CHECK_FS - if (f2fs_test_and_set_bit(offset, - se->cur_valid_map_mir)) - f2fs_bug_on(sbi, 1); - else - WARN_ON(1); -#else + mir_exist = f2fs_test_and_set_bit(offset, + se->cur_valid_map_mir); + if (unlikely(exist != mir_exist)) { + f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent error " + "when setting bitmap, blk:%u, old bit:%d", + blkaddr, exist); f2fs_bug_on(sbi, 1); + } #endif + if (unlikely(exist)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Bitmap was wrongly set, blk:%u", blkaddr); + f2fs_bug_on(sbi, 1); + se->valid_blocks--; + del = 0; } + if (f2fs_discard_en(sbi) && !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; @@ -1528,17 +1681,25 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) se->ckpt_valid_blocks++; } } else { - if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) { + exist = f2fs_test_and_clear_bit(offset, se->cur_valid_map); #ifdef CONFIG_F2FS_CHECK_FS - if (!f2fs_test_and_clear_bit(offset, - se->cur_valid_map_mir)) - f2fs_bug_on(sbi, 1); - else - WARN_ON(1); -#else + mir_exist = f2fs_test_and_clear_bit(offset, + se->cur_valid_map_mir); + if (unlikely(exist != mir_exist)) { + f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent error " + "when clearing bitmap, blk:%u, old bit:%d", + blkaddr, exist); f2fs_bug_on(sbi, 1); + } #endif + if (unlikely(!exist)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Bitmap was wrongly cleared, blk:%u", blkaddr); + f2fs_bug_on(sbi, 1); + se->valid_blocks++; + del = 0; } + if (f2fs_discard_en(sbi) && f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; @@ -1900,7 +2061,7 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ -static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) +static void change_curseg(struct f2fs_sb_info *sbi, int type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -1921,12 +2082,10 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) curseg->alloc_type = SSR; __next_free_blkoff(sbi, curseg, 0); - if (reuse) { - sum_page = get_sum_page(sbi, new_segno); - sum_node = (struct f2fs_summary_block *)page_address(sum_page); 
- memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); - f2fs_put_page(sum_page, 1); - } + sum_page = get_sum_page(sbi, new_segno); + sum_node = (struct f2fs_summary_block *)page_address(sum_page); + memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); + f2fs_put_page(sum_page, 1); } static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) @@ -1990,7 +2149,7 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) new_curseg(sbi, type, false); else if (need_SSR(sbi) && get_ssr_segment(sbi, type)) - change_curseg(sbi, type, true); + change_curseg(sbi, type); else new_curseg(sbi, type, false); @@ -2083,6 +2242,9 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) schedule(); } + /* It's time to issue all the filed discards */ + mark_discard_range_all(sbi); + f2fs_wait_discard_bios(sbi); out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); return err; @@ -2202,9 +2364,12 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, mutex_unlock(&sit_i->sentry_lock); - if (page && IS_NODESEG(type)) + if (page && IS_NODESEG(type)) { fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); + f2fs_inode_chksum_set(sbi, page); + } + if (add_list) { struct f2fs_bio_info *io; @@ -2236,7 +2401,8 @@ reallocate: } } -void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) +void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, + enum iostat_type io_type) { struct f2fs_io_info fio = { .sbi = sbi, @@ -2255,6 +2421,8 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) set_page_writeback(page); f2fs_submit_page_write(&fio); + + f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE); } void write_node_page(unsigned int nid, struct f2fs_io_info *fio) @@ -2263,6 +2431,8 @@ void write_node_page(unsigned int nid, struct f2fs_io_info *fio) set_summary(&sum, nid, 0, 0); do_write_page(&sum, fio); + + f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); } void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) @@ -2276,13 +2446,22 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); + + f2fs_update_iostat(sbi, fio->io_type, F2FS_BLKSIZE); } int rewrite_data_page(struct f2fs_io_info *fio) { + int err; + fio->new_blkaddr = fio->old_blkaddr; stat_inc_inplace_blocks(fio->sbi); - return f2fs_submit_page_bio(fio); + + err = f2fs_submit_page_bio(fio); + + f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + + return err; } void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, @@ -2324,7 +2503,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, /* change the current segment */ if (segno != curseg->segno) { curseg->next_segno = segno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); @@ -2343,7 +2522,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (recover_curseg) { if (old_cursegno != curseg->segno) { curseg->next_segno = old_cursegno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = old_blkoff; } @@ -2382,8 +2561,7 @@ void f2fs_wait_on_page_writeback(struct page *page, } } -void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, - block_t blkaddr) +void 
f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr) { struct page *cpage; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 6b871b492fd5..e0a6cc23ace3 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -492,29 +492,11 @@ static inline int overprovision_segments(struct f2fs_sb_info *sbi) return SM_I(sbi)->ovp_segments; } -static inline int overprovision_sections(struct f2fs_sb_info *sbi) -{ - return GET_SEC_FROM_SEG(sbi, (unsigned int)overprovision_segments(sbi)); -} - static inline int reserved_sections(struct f2fs_sb_info *sbi) { return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi)); } -static inline bool need_SSR(struct f2fs_sb_info *sbi) -{ - int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); - int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); - - if (test_opt(sbi, LFS)) - return false; - - return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + - 2 * reserved_sections(sbi)); -} - static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed, int needed) { @@ -577,6 +559,10 @@ static inline bool need_inplace_update_policy(struct inode *inode, if (test_opt(sbi, LFS)) return false; + /* if this is cold file, we should overwrite to avoid fragmentation */ + if (file_is_cold(inode)) + return true; + if (policy & (0x1 << F2FS_IPU_FORCE)) return true; if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi)) @@ -799,3 +785,28 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, wbc->nr_to_write = desired; return desired - nr_to_write; } + +static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + bool wakeup = false; + int i; + + if (force) + goto wake_up; + + mutex_lock(&dcc->cmd_lock); + for (i = MAX_PLIST_NUM - 1; + i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { + if (!list_empty(&dcc->pend_list[i])) { + wakeup = true; + break; + } + } + mutex_unlock(&dcc->cmd_lock); + if (!wakeup) + return; +wake_up: + dcc->discard_wake = 1; + wake_up_interruptible_all(&dcc->discard_wait_queue); +} diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 32e4c025e97e..89f61eb3d167 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -25,6 +25,7 @@ #include <linux/quotaops.h> #include <linux/f2fs_fs.h> #include <linux/sysfs.h> +#include <linux/quota.h> #include "f2fs.h" #include "node.h" @@ -107,8 +108,20 @@ enum { Opt_fault_injection, Opt_lazytime, Opt_nolazytime, + Opt_quota, + Opt_noquota, Opt_usrquota, Opt_grpquota, + Opt_prjquota, + Opt_usrjquota, + Opt_grpjquota, + Opt_prjjquota, + Opt_offusrjquota, + Opt_offgrpjquota, + Opt_offprjjquota, + Opt_jqfmt_vfsold, + Opt_jqfmt_vfsv0, + Opt_jqfmt_vfsv1, Opt_err, }; @@ -144,8 +157,20 @@ static match_table_t f2fs_tokens = { {Opt_fault_injection, "fault_injection=%u"}, {Opt_lazytime, "lazytime"}, {Opt_nolazytime, "nolazytime"}, + {Opt_quota, "quota"}, + {Opt_noquota, "noquota"}, {Opt_usrquota, "usrquota"}, {Opt_grpquota, "grpquota"}, + {Opt_prjquota, "prjquota"}, + {Opt_usrjquota, "usrjquota=%s"}, + {Opt_grpjquota, "grpjquota=%s"}, + {Opt_prjjquota, "prjjquota=%s"}, + {Opt_offusrjquota, "usrjquota="}, + {Opt_offgrpjquota, "grpjquota="}, + {Opt_offprjjquota, "prjjquota="}, + {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, + {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, + {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, {Opt_err, NULL}, }; @@ -157,7 +182,7 @@ void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) 
va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); + printk_ratelimited("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); va_end(args); } @@ -168,6 +193,104 @@ static void init_once(void *foo) inode_init_once(&fi->vfs_inode); } +#ifdef CONFIG_QUOTA +static const char * const quotatypes[] = INITQFNAMES; +#define QTYPE2NAME(t) (quotatypes[t]) +static int f2fs_set_qf_name(struct super_block *sb, int qtype, + substring_t *args) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + char *qname; + int ret = -EINVAL; + + if (sb_any_quota_loaded(sb) && !sbi->s_qf_names[qtype]) { + f2fs_msg(sb, KERN_ERR, + "Cannot change journaled " + "quota options when quota turned on"); + return -EINVAL; + } + qname = match_strdup(args); + if (!qname) { + f2fs_msg(sb, KERN_ERR, + "Not enough memory for storing quotafile name"); + return -EINVAL; + } + if (sbi->s_qf_names[qtype]) { + if (strcmp(sbi->s_qf_names[qtype], qname) == 0) + ret = 0; + else + f2fs_msg(sb, KERN_ERR, + "%s quota file already specified", + QTYPE2NAME(qtype)); + goto errout; + } + if (strchr(qname, '/')) { + f2fs_msg(sb, KERN_ERR, + "quotafile must be on filesystem root"); + goto errout; + } + sbi->s_qf_names[qtype] = qname; + set_opt(sbi, QUOTA); + return 0; +errout: + kfree(qname); + return ret; +} + +static int f2fs_clear_qf_name(struct super_block *sb, int qtype) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (sb_any_quota_loaded(sb) && sbi->s_qf_names[qtype]) { + f2fs_msg(sb, KERN_ERR, "Cannot change journaled quota options" + " when quota turned on"); + return -EINVAL; + } + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + return 0; +} + +static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) +{ + /* + * We do the test below only for project quotas. 'usrquota' and + * 'grpquota' mount options are allowed even without quota feature + * to support legacy quotas in quota files. + */ + if (test_opt(sbi, PRJQUOTA) && !f2fs_sb_has_project_quota(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_ERR, "Project quota feature not enabled. 
" + "Cannot enable project quota enforcement."); + return -1; + } + if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA] || + sbi->s_qf_names[PRJQUOTA]) { + if (test_opt(sbi, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) + clear_opt(sbi, USRQUOTA); + + if (test_opt(sbi, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) + clear_opt(sbi, GRPQUOTA); + + if (test_opt(sbi, PRJQUOTA) && sbi->s_qf_names[PRJQUOTA]) + clear_opt(sbi, PRJQUOTA); + + if (test_opt(sbi, GRPQUOTA) || test_opt(sbi, USRQUOTA) || + test_opt(sbi, PRJQUOTA)) { + f2fs_msg(sbi->sb, KERN_ERR, "old and new quota " + "format mixing"); + return -1; + } + + if (!sbi->s_jquota_fmt) { + f2fs_msg(sbi->sb, KERN_ERR, "journaled quota format " + "not specified"); + return -1; + } + } + return 0; +} +#endif + static int parse_options(struct super_block *sb, char *options) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -175,6 +298,9 @@ static int parse_options(struct super_block *sb, char *options) substring_t args[MAX_OPT_ARGS]; char *p, *name; int arg = 0; +#ifdef CONFIG_QUOTA + int ret; +#endif if (!options) return 0; @@ -386,15 +512,76 @@ static int parse_options(struct super_block *sb, char *options) sb->s_flags &= ~MS_LAZYTIME; break; #ifdef CONFIG_QUOTA + case Opt_quota: case Opt_usrquota: set_opt(sbi, USRQUOTA); break; case Opt_grpquota: set_opt(sbi, GRPQUOTA); break; + case Opt_prjquota: + set_opt(sbi, PRJQUOTA); + break; + case Opt_usrjquota: + ret = f2fs_set_qf_name(sb, USRQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_grpjquota: + ret = f2fs_set_qf_name(sb, GRPQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_prjjquota: + ret = f2fs_set_qf_name(sb, PRJQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_offusrjquota: + ret = f2fs_clear_qf_name(sb, USRQUOTA); + if (ret) + return ret; + break; + case Opt_offgrpjquota: + ret = f2fs_clear_qf_name(sb, GRPQUOTA); + if (ret) + return ret; + break; + case Opt_offprjjquota: + ret = f2fs_clear_qf_name(sb, PRJQUOTA); + if (ret) + return ret; + break; + case Opt_jqfmt_vfsold: + sbi->s_jquota_fmt = QFMT_VFS_OLD; + break; + case Opt_jqfmt_vfsv0: + sbi->s_jquota_fmt = QFMT_VFS_V0; + break; + case Opt_jqfmt_vfsv1: + sbi->s_jquota_fmt = QFMT_VFS_V1; + break; + case Opt_noquota: + clear_opt(sbi, QUOTA); + clear_opt(sbi, USRQUOTA); + clear_opt(sbi, GRPQUOTA); + clear_opt(sbi, PRJQUOTA); + break; #else + case Opt_quota: case Opt_usrquota: case Opt_grpquota: + case Opt_prjquota: + case Opt_usrjquota: + case Opt_grpjquota: + case Opt_prjjquota: + case Opt_offusrjquota: + case Opt_offgrpjquota: + case Opt_offprjjquota: + case Opt_jqfmt_vfsold: + case Opt_jqfmt_vfsv0: + case Opt_jqfmt_vfsv1: + case Opt_noquota: f2fs_msg(sb, KERN_INFO, "quota operations not supported"); break; @@ -406,6 +593,10 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; } } +#ifdef CONFIG_QUOTA + if (f2fs_check_quota_options(sbi)) + return -EINVAL; +#endif if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) { f2fs_msg(sb, KERN_ERR, @@ -439,6 +630,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_rwsem(&fi->dio_rwsem[READ]); init_rwsem(&fi->dio_rwsem[WRITE]); init_rwsem(&fi->i_mmap_sem); + init_rwsem(&fi->i_xattr_sem); #ifdef CONFIG_QUOTA memset(&fi->i_dquot, 0, sizeof(fi->i_dquot)); @@ -446,6 +638,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) #endif /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; + return &fi->vfs_inode; } @@ -584,7 +777,6 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) 
kfree(sbi->devs); } -static void f2fs_quota_off_umount(struct super_block *sb); static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -642,7 +834,7 @@ static void f2fs_put_super(struct super_block *sb) kfree(sbi->ckpt); - f2fs_exit_sysfs(sbi); + f2fs_unregister_sysfs(sbi); sb->s_fs_info = NULL; if (sbi->s_chksum_driver) @@ -651,6 +843,10 @@ static void f2fs_put_super(struct super_block *sb) destroy_device_list(sbi); mempool_destroy(sbi->write_io_dummy); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif destroy_percpu_info(sbi); for (i = 0; i < NR_PAGE_TYPE; i++) kfree(sbi->write_io[i]); @@ -664,6 +860,9 @@ int f2fs_sync_fs(struct super_block *sb, int sync) trace_f2fs_sync_fs(sb, sync); + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + return -EAGAIN; + if (sync) { struct cp_control cpc; @@ -698,6 +897,48 @@ static int f2fs_unfreeze(struct super_block *sb) return 0; } +#ifdef CONFIG_QUOTA +static int f2fs_statfs_project(struct super_block *sb, + kprojid_t projid, struct kstatfs *buf) +{ + struct kqid qid; + struct dquot *dquot; + u64 limit; + u64 curblock; + + qid = make_kqid_projid(projid); + dquot = dqget(sb, qid); + if (IS_ERR(dquot)) + return PTR_ERR(dquot); + spin_lock(&dq_data_lock); + + limit = (dquot->dq_dqb.dqb_bsoftlimit ? + dquot->dq_dqb.dqb_bsoftlimit : + dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits; + if (limit && buf->f_blocks > limit) { + curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits; + buf->f_blocks = limit; + buf->f_bfree = buf->f_bavail = + (buf->f_blocks > curblock) ? + (buf->f_blocks - curblock) : 0; + } + + limit = dquot->dq_dqb.dqb_isoftlimit ? + dquot->dq_dqb.dqb_isoftlimit : + dquot->dq_dqb.dqb_ihardlimit; + if (limit && buf->f_files > limit) { + buf->f_files = limit; + buf->f_ffree = + (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? 
+ (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; + } + + spin_unlock(&dq_data_lock); + dqput(dquot); + return 0; +} +#endif + static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; @@ -733,9 +974,49 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); +#ifdef CONFIG_QUOTA + if (is_inode_flag_set(dentry->d_inode, FI_PROJ_INHERIT) && + sb_has_quota_limits_enabled(sb, PRJQUOTA)) { + f2fs_statfs_project(sb, F2FS_I(dentry->d_inode)->i_projid, buf); + } +#endif return 0; } +static inline void f2fs_show_quota_options(struct seq_file *seq, + struct super_block *sb) +{ +#ifdef CONFIG_QUOTA + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (sbi->s_jquota_fmt) { + char *fmtname = ""; + + switch (sbi->s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } + + if (sbi->s_qf_names[USRQUOTA]) + seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]); + + if (sbi->s_qf_names[GRPQUOTA]) + seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]); + + if (sbi->s_qf_names[PRJQUOTA]) + seq_show_option(seq, "prjjquota", sbi->s_qf_names[PRJQUOTA]); +#endif +} + static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); @@ -809,11 +1090,16 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) sbi->fault_info.inject_rate); #endif #ifdef CONFIG_QUOTA + if (test_opt(sbi, QUOTA)) + seq_puts(seq, ",quota"); if (test_opt(sbi, USRQUOTA)) seq_puts(seq, ",usrquota"); if (test_opt(sbi, GRPQUOTA)) seq_puts(seq, ",grpquota"); + if (test_opt(sbi, PRJQUOTA)) + seq_puts(seq, ",prjquota"); #endif + f2fs_show_quota_options(seq, sbi->sb); return 0; } @@ -862,6 +1148,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info ffi = sbi->fault_info; #endif +#ifdef CONFIG_QUOTA + int s_jquota_fmt; + char *s_qf_names[MAXQUOTAS]; + int i, j; +#endif /* * Save the old mount options in case we @@ -871,6 +1162,23 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) old_sb_flags = sb->s_flags; active_logs = sbi->active_logs; +#ifdef CONFIG_QUOTA + s_jquota_fmt = sbi->s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + if (sbi->s_qf_names[i]) { + s_qf_names[i] = kstrdup(sbi->s_qf_names[i], + GFP_KERNEL); + if (!s_qf_names[i]) { + for (j = 0; j < i; j++) + kfree(s_qf_names[j]); + return -ENOMEM; + } + } else { + s_qf_names[i] = NULL; + } + } +#endif + /* recover superblocks we couldn't write due to previous RO mount */ if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { err = f2fs_commit_super(sbi, false); @@ -952,6 +1260,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_gc; } skip: +#ifdef CONFIG_QUOTA + /* Release old quota file names */ + for (i = 0; i < MAXQUOTAS; i++) + kfree(s_qf_names[i]); +#endif /* Update the POSIXACL Flag */ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? 
MS_POSIXACL : 0); @@ -966,6 +1279,13 @@ restore_gc: stop_gc_thread(sbi); } restore_opts: +#ifdef CONFIG_QUOTA + sbi->s_jquota_fmt = s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + kfree(sbi->s_qf_names[i]); + sbi->s_qf_names[i] = s_qf_names[i]; + } +#endif sbi->mount_opt = org_mount_opt; sbi->active_logs = active_logs; sb->s_flags = old_sb_flags; @@ -1065,7 +1385,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type, } if (len == towrite) - return err; + return 0; inode->i_version++; inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, false); @@ -1082,6 +1402,27 @@ static qsize_t *f2fs_get_reserved_space(struct inode *inode) return &F2FS_I(inode)->i_reserved_quota; } +static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type) +{ + return dquot_quota_on_mount(sbi->sb, sbi->s_qf_names[type], + sbi->s_jquota_fmt, type); +} + +void f2fs_enable_quota_files(struct f2fs_sb_info *sbi) +{ + int i, ret; + + for (i = 0; i < MAXQUOTAS; i++) { + if (sbi->s_qf_names[i]) { + ret = f2fs_quota_on_mount(sbi, i); + if (ret < 0) + f2fs_msg(sbi->sb, KERN_ERR, + "Cannot turn on journaled " + "quota: error %d", ret); + } + } +} + static int f2fs_quota_sync(struct super_block *sb, int type) { struct quota_info *dqopt = sb_dqopt(sb); @@ -1119,7 +1460,7 @@ static int f2fs_quota_on(struct super_block *sb, int type, int format_id, struct inode *inode; int err; - err = f2fs_quota_sync(sb, -1); + err = f2fs_quota_sync(sb, type); if (err) return err; @@ -1147,7 +1488,7 @@ static int f2fs_quota_off(struct super_block *sb, int type) if (!inode || !igrab(inode)) return dquot_quota_off(sb, type); - f2fs_quota_sync(sb, -1); + f2fs_quota_sync(sb, type); err = dquot_quota_off(sb, type); if (err) @@ -1163,7 +1504,7 @@ out_put: return err; } -static void f2fs_quota_off_umount(struct super_block *sb) +void f2fs_quota_off_umount(struct super_block *sb) { int type; @@ -1171,6 +1512,12 @@ static void f2fs_quota_off_umount(struct super_block *sb) f2fs_quota_off(sb, type); } +int f2fs_get_projid(struct inode *inode, kprojid_t *projid) +{ + *projid = F2FS_I(inode)->i_projid; + return 0; +} + static const struct dquot_operations f2fs_quota_operations = { .get_reserved_space = f2fs_get_reserved_space, .write_dquot = dquot_commit, @@ -1180,6 +1527,7 @@ static const struct dquot_operations f2fs_quota_operations = { .write_info = dquot_commit_info, .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, + .get_projid = f2fs_get_projid, .get_next_id = dquot_get_next_id, }; @@ -1194,12 +1542,12 @@ static const struct quotactl_ops f2fs_quotactl_ops = { .get_nextdqblk = dquot_get_next_dqblk, }; #else -static inline void f2fs_quota_off_umount(struct super_block *sb) +void f2fs_quota_off_umount(struct super_block *sb) { } #endif -static struct super_operations f2fs_sops = { +static const struct super_operations f2fs_sops = { .alloc_inode = f2fs_alloc_inode, .drop_inode = f2fs_drop_inode, .destroy_inode = f2fs_destroy_inode, @@ -1303,9 +1651,16 @@ static const struct export_operations f2fs_export_ops = { static loff_t max_file_blocks(void) { - loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS); + loff_t result = 0; loff_t leaf_count = ADDRS_PER_BLOCK; + /* + * note: previously, result is equal to (DEF_ADDRS_PER_INODE - + * F2FS_INLINE_XATTR_ADDRS), but now f2fs try to reserve more + * space in inode.i_addr, it will be more safe to reassign + * result as zero. 
+ */ + /* two direct node blocks */ result += (leaf_count * 2); @@ -1922,6 +2277,11 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; + /* precompute checksum seed for metadata */ + if (f2fs_sb_has_inode_chksum(sb)) + sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, + sizeof(raw_super->uuid)); + /* * The BLKZONED feature indicates that the drive was formatted with * zone alignment optimization. This is optional for host-aware @@ -1956,7 +2316,7 @@ try_onemore: #ifdef CONFIG_QUOTA sb->dq_op = &f2fs_quota_operations; sb->s_qcop = &f2fs_quotactl_ops; - sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif sb->s_op = &f2fs_sops; @@ -1980,6 +2340,10 @@ try_onemore: set_sbi_flag(sbi, SBI_POR_DOING); spin_lock_init(&sbi->stat_lock); + /* init iostat info */ + spin_lock_init(&sbi->iostat_lock); + sbi->iostat_enable = false; + for (i = 0; i < NR_PAGE_TYPE; i++) { int n = (i == META) ? 1: NR_TEMP_TYPE; int j; @@ -2098,11 +2462,6 @@ try_onemore: if (err) goto free_nm; - /* if there are nt orphan nodes free them */ - err = recover_orphan_inodes(sbi); - if (err) - goto free_node_inode; - /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); if (IS_ERR(root)) { @@ -2122,10 +2481,15 @@ try_onemore: goto free_root_inode; } - err = f2fs_init_sysfs(sbi); + err = f2fs_register_sysfs(sbi); if (err) goto free_root_inode; + /* if there are nt orphan nodes free them */ + err = recover_orphan_inodes(sbi); + if (err) + goto free_sysfs; + /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { /* @@ -2135,7 +2499,7 @@ try_onemore: if (bdev_read_only(sb->s_bdev) && !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { err = -EROFS; - goto free_sysfs; + goto free_meta; } if (need_fsck) @@ -2149,7 +2513,7 @@ try_onemore: need_fsck = true; f2fs_msg(sb, KERN_ERR, "Cannot recover all fsync data errno=%d", err); - goto free_sysfs; + goto free_meta; } } else { err = recover_fsync_data(sbi, true); @@ -2173,7 +2537,7 @@ skip_recovery: /* After POR, we can run background GC thread.*/ err = start_gc_thread(sbi); if (err) - goto free_sysfs; + goto free_meta; } kfree(options); @@ -2191,9 +2555,17 @@ skip_recovery: f2fs_update_time(sbi, REQ_TIME); return 0; -free_sysfs: +free_meta: f2fs_sync_inode_meta(sbi); - f2fs_exit_sysfs(sbi); + /* + * Some dirty meta pages can be produced by recover_orphan_inodes() + * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() + * followed by write_checkpoint() through f2fs_write_node_pages(), which + * falls into an infinite loop in sync_meta_pages(). + */ + truncate_inode_pages_final(META_MAPPING(sbi)); +free_sysfs: + f2fs_unregister_sysfs(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; @@ -2202,13 +2574,6 @@ free_node_inode: mutex_lock(&sbi->umount_mutex); release_ino_entry(sbi, true); f2fs_leave_shrinker(sbi); - /* - * Some dirty meta pages can be produced by recover_orphan_inodes() - * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() - * followed by write_checkpoint() through f2fs_write_node_pages(), which - * falls into an infinite loop in sync_meta_pages(). 
- */ - truncate_inode_pages_final(META_MAPPING(sbi)); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); f2fs_destroy_stats(sbi); @@ -2228,6 +2593,10 @@ free_options: for (i = 0; i < NR_PAGE_TYPE; i++) kfree(sbi->write_io[i]); destroy_percpu_info(sbi); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif kfree(options); free_sb_buf: kfree(raw_super); @@ -2311,7 +2680,7 @@ static int __init init_f2fs_fs(void) err = create_extent_cache(); if (err) goto free_checkpoint_caches; - err = f2fs_register_sysfs(); + err = f2fs_init_sysfs(); if (err) goto free_extent_cache; err = register_shrinker(&f2fs_shrinker_info); @@ -2330,7 +2699,7 @@ free_filesystem: free_shrinker: unregister_shrinker(&f2fs_shrinker_info); free_sysfs: - f2fs_unregister_sysfs(); + f2fs_exit_sysfs(); free_extent_cache: destroy_extent_cache(); free_checkpoint_caches: @@ -2350,7 +2719,7 @@ static void __exit exit_f2fs_fs(void) f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); unregister_shrinker(&f2fs_shrinker_info); - f2fs_unregister_sysfs(); + f2fs_exit_sysfs(); destroy_extent_cache(); destroy_checkpoint_caches(); destroy_segment_manager_caches(); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 71191d89917d..e2c258f717cd 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -18,7 +18,6 @@ #include "gc.h" static struct proc_dir_entry *f2fs_proc_root; -static struct kset *f2fs_kset; /* Sysfs support for f2fs */ enum { @@ -41,6 +40,7 @@ struct f2fs_attr { const char *, size_t); int struct_type; int offset; + int id; }; static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) @@ -76,6 +76,34 @@ static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, BD_PART_WRITTEN(sbi))); } +static ssize_t features_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->sb; + int len = 0; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + + if (f2fs_sb_has_crypto(sb)) + len += snprintf(buf, PAGE_SIZE - len, "%s", + "encryption"); + if (f2fs_sb_mounted_blkzoned(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "blkzoned"); + if (f2fs_sb_has_extra_attr(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "extra_attr"); + if (f2fs_sb_has_project_quota(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "projquota"); + if (f2fs_sb_has_inode_chksum(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? 
", " : "", "inode_checksum"); + len += snprintf(buf + len, PAGE_SIZE - len, "\n"); + return len; +} + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -124,7 +152,39 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, spin_unlock(&sbi->stat_lock); return count; } + + if (!strcmp(a->attr.name, "discard_granularity")) { + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int i; + + if (t == 0 || t > MAX_PLIST_NUM) + return -EINVAL; + if (t == *ui) + return count; + + mutex_lock(&dcc->cmd_lock); + for (i = 0; i < MAX_PLIST_NUM; i++) { + if (i >= t - 1) + dcc->pend_list_tag[i] |= P_ACTIVE; + else + dcc->pend_list_tag[i] &= (~P_ACTIVE); + } + mutex_unlock(&dcc->cmd_lock); + + *ui = t; + return count; + } + *ui = t; + + if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) + f2fs_reset_iostat(sbi); + if (!strcmp(a->attr.name, "gc_urgent") && t == 1 && sbi->gc_thread) { + sbi->gc_thread->gc_wake = 1; + wake_up_interruptible_all(&sbi->gc_thread->gc_wait_queue_head); + wake_up_discard_thread(sbi, true); + } + return count; } @@ -155,6 +215,30 @@ static void f2fs_sb_release(struct kobject *kobj) complete(&sbi->s_kobj_unregister); } +enum feat_id { + FEAT_CRYPTO = 0, + FEAT_BLKZONED, + FEAT_ATOMIC_WRITE, + FEAT_EXTRA_ATTR, + FEAT_PROJECT_QUOTA, + FEAT_INODE_CHECKSUM, +}; + +static ssize_t f2fs_feature_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + switch (a->id) { + case FEAT_CRYPTO: + case FEAT_BLKZONED: + case FEAT_ATOMIC_WRITE: + case FEAT_EXTRA_ATTR: + case FEAT_PROJECT_QUOTA: + case FEAT_INODE_CHECKSUM: + return snprintf(buf, PAGE_SIZE, "supported\n"); + } + return 0; +} + #define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ static struct f2fs_attr f2fs_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = _mode }, \ @@ -172,12 +256,23 @@ static struct f2fs_attr f2fs_attr_##_name = { \ #define F2FS_GENERAL_RO_ATTR(name) \ static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) +#define F2FS_FEATURE_RO_ATTR(_name, _id) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = f2fs_feature_show, \ + .id = _id, \ +} + +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent_sleep_time, + urgent_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent, gc_urgent); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); @@ -191,20 +286,36 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); #ifdef 
CONFIG_F2FS_FAULT_INJECTION F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); #endif F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); +F2FS_GENERAL_RO_ATTR(features); + +#ifdef CONFIG_F2FS_FS_ENCRYPTION +F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); +#endif +#ifdef CONFIG_BLK_DEV_ZONED +F2FS_FEATURE_RO_ATTR(block_zoned, FEAT_BLKZONED); +#endif +F2FS_FEATURE_RO_ATTR(atomic_write, FEAT_ATOMIC_WRITE); +F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR); +F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA); +F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { + ATTR_LIST(gc_urgent_sleep_time), ATTR_LIST(gc_min_sleep_time), ATTR_LIST(gc_max_sleep_time), ATTR_LIST(gc_no_gc_sleep_time), ATTR_LIST(gc_idle), + ATTR_LIST(gc_urgent), ATTR_LIST(reclaim_segments), ATTR_LIST(max_small_discards), + ATTR_LIST(discard_granularity), ATTR_LIST(batched_trim_sections), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), @@ -217,26 +328,59 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(dirty_nats_ratio), ATTR_LIST(cp_interval), ATTR_LIST(idle_interval), + ATTR_LIST(iostat_enable), #ifdef CONFIG_F2FS_FAULT_INJECTION ATTR_LIST(inject_rate), ATTR_LIST(inject_type), #endif ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(features), ATTR_LIST(reserved_blocks), NULL, }; +static struct attribute *f2fs_feat_attrs[] = { +#ifdef CONFIG_F2FS_FS_ENCRYPTION + ATTR_LIST(encryption), +#endif +#ifdef CONFIG_BLK_DEV_ZONED + ATTR_LIST(block_zoned), +#endif + ATTR_LIST(atomic_write), + ATTR_LIST(extra_attr), + ATTR_LIST(project_quota), + ATTR_LIST(inode_checksum), + NULL, +}; + static const struct sysfs_ops f2fs_attr_ops = { .show = f2fs_attr_show, .store = f2fs_attr_store, }; -static struct kobj_type f2fs_ktype = { +static struct kobj_type f2fs_sb_ktype = { .default_attrs = f2fs_attrs, .sysfs_ops = &f2fs_attr_ops, .release = f2fs_sb_release, }; +static struct kobj_type f2fs_ktype = { + .sysfs_ops = &f2fs_attr_ops, +}; + +static struct kset f2fs_kset = { + .kobj = {.ktype = &f2fs_ktype}, +}; + +static struct kobj_type f2fs_feat_ktype = { + .default_attrs = f2fs_feat_attrs, + .sysfs_ops = &f2fs_attr_ops, +}; + +static struct kobject f2fs_feat = { + .kset = &f2fs_kset, +}; + static int segment_info_seq_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; @@ -288,6 +432,48 @@ static int segment_bits_seq_show(struct seq_file *seq, void *offset) return 0; } +static int iostat_info_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + time64_t now = ktime_get_real_seconds(); + + if (!sbi->iostat_enable) + return 0; + + seq_printf(seq, "time: %-16llu\n", now); + + /* print app IOs */ + seq_printf(seq, "app buffered: %-16llu\n", + sbi->write_iostat[APP_BUFFERED_IO]); + seq_printf(seq, "app direct: %-16llu\n", + sbi->write_iostat[APP_DIRECT_IO]); + seq_printf(seq, "app mapped: %-16llu\n", + sbi->write_iostat[APP_MAPPED_IO]); + + /* print fs IOs */ + seq_printf(seq, "fs data: %-16llu\n", + sbi->write_iostat[FS_DATA_IO]); + seq_printf(seq, "fs node: %-16llu\n", + sbi->write_iostat[FS_NODE_IO]); + seq_printf(seq, "fs meta: %-16llu\n", + sbi->write_iostat[FS_META_IO]); + seq_printf(seq, "fs gc data: %-16llu\n", + sbi->write_iostat[FS_GC_DATA_IO]); + seq_printf(seq, "fs gc node: %-16llu\n", + sbi->write_iostat[FS_GC_NODE_IO]); + seq_printf(seq, 
"fs cp data: %-16llu\n", + sbi->write_iostat[FS_CP_DATA_IO]); + seq_printf(seq, "fs cp node: %-16llu\n", + sbi->write_iostat[FS_CP_NODE_IO]); + seq_printf(seq, "fs cp meta: %-16llu\n", + sbi->write_iostat[FS_CP_META_IO]); + seq_printf(seq, "fs discard: %-16llu\n", + sbi->write_iostat[FS_DISCARD]); + + return 0; +} + #define F2FS_PROC_FILE_DEF(_name) \ static int _name##_open_fs(struct inode *inode, struct file *file) \ { \ @@ -303,28 +489,47 @@ static const struct file_operations f2fs_seq_##_name##_fops = { \ F2FS_PROC_FILE_DEF(segment_info); F2FS_PROC_FILE_DEF(segment_bits); +F2FS_PROC_FILE_DEF(iostat_info); -int __init f2fs_register_sysfs(void) +int __init f2fs_init_sysfs(void) { - f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + int ret; - f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); - if (!f2fs_kset) - return -ENOMEM; - return 0; + kobject_set_name(&f2fs_kset.kobj, "f2fs"); + f2fs_kset.kobj.parent = fs_kobj; + ret = kset_register(&f2fs_kset); + if (ret) + return ret; + + ret = kobject_init_and_add(&f2fs_feat, &f2fs_feat_ktype, + NULL, "features"); + if (ret) + kset_unregister(&f2fs_kset); + else + f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + return ret; } -void f2fs_unregister_sysfs(void) +void f2fs_exit_sysfs(void) { - kset_unregister(f2fs_kset); + kobject_put(&f2fs_feat); + kset_unregister(&f2fs_kset); remove_proc_entry("fs/f2fs", NULL); + f2fs_proc_root = NULL; } -int f2fs_init_sysfs(struct f2fs_sb_info *sbi) +int f2fs_register_sysfs(struct f2fs_sb_info *sbi) { struct super_block *sb = sbi->sb; int err; + sbi->s_kobj.kset = &f2fs_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL, + "%s", sb->s_id); + if (err) + return err; + if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); @@ -333,33 +538,19 @@ int f2fs_init_sysfs(struct f2fs_sb_info *sbi) &f2fs_seq_segment_info_fops, sb); proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, &f2fs_seq_segment_bits_fops, sb); + proc_create_data("iostat_info", S_IRUGO, sbi->s_proc, + &f2fs_seq_iostat_info_fops, sb); } - - sbi->s_kobj.kset = f2fs_kset; - init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, - "%s", sb->s_id); - if (err) - goto err_out; return 0; -err_out: - if (sbi->s_proc) { - remove_proc_entry("segment_info", sbi->s_proc); - remove_proc_entry("segment_bits", sbi->s_proc); - remove_proc_entry(sb->s_id, f2fs_proc_root); - } - return err; } -void f2fs_exit_sysfs(struct f2fs_sb_info *sbi) +void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) { - kobject_del(&sbi->s_kobj); - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); - if (sbi->s_proc) { + remove_proc_entry("iostat_info", sbi->s_proc); remove_proc_entry("segment_info", sbi->s_proc); remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); } + kobject_del(&sbi->s_kobj); } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 832c5110abab..7c65540148f8 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -442,7 +442,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, } else { struct dnode_of_data dn; set_new_dnode(&dn, inode, NULL, NULL, new_nid); - xpage = new_node_page(&dn, XATTR_NODE_OFFSET, ipage); + xpage = new_node_page(&dn, XATTR_NODE_OFFSET); if (IS_ERR(xpage)) { alloc_nid_failed(sbi, new_nid); return PTR_ERR(xpage); @@ -473,8 +473,10 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (len > F2FS_NAME_LEN) return 
-ERANGE; + down_read(&F2FS_I(inode)->i_xattr_sem); error = lookup_all_xattrs(inode, ipage, index, len, name, &entry, &base_addr); + up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; @@ -503,7 +505,9 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) int error = 0; size_t rest = buffer_size; + down_read(&F2FS_I(inode)->i_xattr_sem); error = read_all_xattrs(inode, NULL, &base_addr); + up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; @@ -686,7 +690,9 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, f2fs_lock_op(sbi); /* protect xattr_ver */ down_write(&F2FS_I(inode)->i_sem); + down_write(&F2FS_I(inode)->i_xattr_sem); err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags); + up_write(&F2FS_I(inode)->i_xattr_sem); up_write(&F2FS_I(inode)->i_sem); f2fs_unlock_op(sbi); diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 6a7152d0c250..02c066663a3a 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -19,6 +19,8 @@ #include <linux/ctype.h> #include <linux/slab.h> #include <linux/namei.h> +#include <linux/kernel.h> + #include "fat.h" static inline unsigned long vfat_d_version(struct dentry *dentry) @@ -510,10 +512,8 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname, struct nls_table *nls) { const unsigned char *ip; - unsigned char nc; unsigned char *op; - unsigned int ec; - int i, k, fill; + int i, fill; int charlen; if (utf8) { @@ -530,33 +530,22 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname, i < len && *outlen < FAT_LFN_LEN; *outlen += 1) { if (escape && (*ip == ':')) { + u8 uc[2]; + if (i > len - 5) return -EINVAL; - ec = 0; - for (k = 1; k < 5; k++) { - nc = ip[k]; - ec <<= 4; - if (nc >= '0' && nc <= '9') { - ec |= nc - '0'; - continue; - } - if (nc >= 'a' && nc <= 'f') { - ec |= nc - ('a' - 10); - continue; - } - if (nc >= 'A' && nc <= 'F') { - ec |= nc - ('A' - 10); - continue; - } + + if (hex2bin(uc, ip + 1, 2) < 0) return -EINVAL; - } - *op++ = ec & 0xFF; - *op++ = ec >> 8; + + *(wchar_t *)op = uc[0] << 8 | uc[1]; + + op += 2; ip += 5; i += 5; } else { charlen = nls->char2uni(ip, len - i, - (wchar_t *)op); + (wchar_t *)op); if (charlen < 0) return -EINVAL; ip += charlen; diff --git a/fs/fcntl.c b/fs/fcntl.c index 3b01b646e528..0491da3b28c3 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -741,10 +741,21 @@ static void send_sigio_to_task(struct task_struct *p, si.si_signo = signum; si.si_errno = 0; si.si_code = reason; + /* + * Posix definies POLL_IN and friends to be signal + * specific si_codes for SIG_POLL. Linux extended + * these si_codes to other signals in a way that is + * ambiguous if other signals also have signal + * specific si_codes. In that case use SI_SIGIO instead + * to remove the ambiguity. + */ + if (sig_specific_sicodes(signum)) + si.si_code = SI_SIGIO; + /* Make sure we are called with one of the POLL_* reasons, otherwise we could leak kernel stack into userspace. 
*/ - BUG_ON((reason & __SI_MASK) != __SI_POLL); + BUG_ON((reason < POLL_IN) || ((reason - POLL_IN) >= NSIGPOLL)); if (reason - POLL_IN >= NSIGPOLL) si.si_band = ~0L; else diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index 67f940892ef8..b5ab06fabc60 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -262,7 +262,8 @@ static int fscache_objlist_show(struct seq_file *m, void *v) type = "DT"; break; default: - sprintf(_type, "%02u", cookie->def->type); + snprintf(_type, sizeof(_type), "%02u", + cookie->def->type); type = _type; break; } diff --git a/fs/fscache/page.c b/fs/fscache/page.c index c8c4f79c7ce1..0ad3fd3ad0b4 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -1178,11 +1178,10 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, pagevec_init(&pvec, 0); next = 0; do { - if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) + if (!pagevec_lookup(&pvec, mapping, &next)) break; for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - next = page->index; if (PageFsCache(page)) { __fscache_wait_on_page_write(cookie, page); __fscache_uncache_page(cookie, page); @@ -1190,7 +1189,7 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, } pagevec_release(&pvec); cond_resched(); - } while (++next); + } while (next); _leave(""); } diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index c5b6b7165489..e9e97803442a 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -90,7 +90,7 @@ static struct list_head *cuse_conntbl_head(dev_t devt) static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to) { - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb->ki_filp); + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb); loff_t pos = 0; return fuse_direct_io(&io, to, &pos, FUSE_DIO_CUSE); @@ -98,7 +98,7 @@ static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to) static ssize_t cuse_write_iter(struct kiocb *kiocb, struct iov_iter *from) { - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb->ki_filp); + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(kiocb); loff_t pos = 0; /* * No locking or generic_write_checks(), the server is diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c16d00e53264..13c65dd2d37d 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1222,9 +1222,6 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, struct fuse_in *in; unsigned reqsize; - if (task_active_pid_ns(current) != fc->pid_ns) - return -EIO; - restart: spin_lock(&fiq->waitq.lock); err = -EAGAIN; @@ -1262,6 +1259,13 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, in = &req->in; reqsize = in->h.len; + + if (task_active_pid_ns(current) != fc->pid_ns) { + rcu_read_lock(); + in->h.pid = pid_vnr(find_pid_ns(in->h.pid, fc->pid_ns)); + rcu_read_unlock(); + } + /* If request is too large, reply with an error and restart the read */ if (nbytes < reqsize) { req->out.h.error = -EIO; @@ -1823,9 +1827,6 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, struct fuse_req *req; struct fuse_out_header oh; - if (task_active_pid_ns(current) != fc->pid_ns) - return -EIO; - if (nbytes < sizeof(struct fuse_out_header)) return -EINVAL; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 00800c07ba1c..622081b97426 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -923,33 +923,29 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, return err; } -int fuse_update_attributes(struct inode *inode, struct kstat *stat, - struct file *file, bool *refreshed) +static 
int fuse_update_get_attr(struct inode *inode, struct file *file, + struct kstat *stat) { struct fuse_inode *fi = get_fuse_inode(inode); - int err; - bool r; + int err = 0; if (time_before64(fi->i_time, get_jiffies_64())) { - r = true; forget_all_cached_acls(inode); err = fuse_do_getattr(inode, stat, file); - } else { - r = false; - err = 0; - if (stat) { - generic_fillattr(inode, stat); - stat->mode = fi->orig_i_mode; - stat->ino = fi->orig_ino; - } + } else if (stat) { + generic_fillattr(inode, stat); + stat->mode = fi->orig_i_mode; + stat->ino = fi->orig_ino; } - if (refreshed != NULL) - *refreshed = r; - return err; } +int fuse_update_attributes(struct inode *inode, struct file *file) +{ + return fuse_update_get_attr(inode, file, NULL); +} + int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, u64 child_nodeid, struct qstr *name) { @@ -1786,7 +1782,7 @@ static int fuse_getattr(const struct path *path, struct kstat *stat, if (!fuse_allow_current_process(fc)) return -EACCES; - return fuse_update_attributes(inode, stat, NULL, NULL); + return fuse_update_get_attr(inode, NULL, stat); } static const struct inode_operations fuse_dir_inode_operations = { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ab60051be6e5..cb7dff5c45d7 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -457,7 +457,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, * wait for all outstanding writes, before sending the FSYNC * request. */ - err = filemap_write_and_wait_range(inode->i_mapping, start, end); + err = file_write_and_wait_range(file, start, end); if (err) goto out; @@ -465,10 +465,10 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, /* * Due to implementation of fuse writeback - * filemap_write_and_wait_range() does not catch errors. + * file_write_and_wait_range() does not catch errors. 
* We have to do this directly after fuse_sync_writes() */ - err = filemap_check_errors(file->f_mapping); + err = file_check_and_advance_wb_err(file); if (err) goto out; @@ -645,7 +645,7 @@ static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req, static size_t fuse_send_read(struct fuse_req *req, struct fuse_io_priv *io, loff_t pos, size_t count, fl_owner_t owner) { - struct file *file = io->file; + struct file *file = io->iocb->ki_filp; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; @@ -707,7 +707,8 @@ static void fuse_short_read(struct fuse_req *req, struct inode *inode, static int fuse_do_readpage(struct file *file, struct page *page) { - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file); + struct kiocb iocb; + struct fuse_io_priv io; struct inode *inode = page->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_req *req; @@ -735,6 +736,8 @@ static int fuse_do_readpage(struct file *file, struct page *page) req->num_pages = 1; req->pages[0] = page; req->page_descs[0].length = count; + init_sync_kiocb(&iocb, file); + io = (struct fuse_io_priv) FUSE_IO_PRIV_SYNC(&iocb); num_read = fuse_send_read(req, &io, pos, count, NULL); err = req->out.h.error; @@ -923,7 +926,7 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (fc->auto_inval_data || (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) { int err; - err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL); + err = fuse_update_attributes(inode, iocb->ki_filp); if (err) return err; } @@ -957,13 +960,18 @@ static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff, static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io, loff_t pos, size_t count, fl_owner_t owner) { - struct file *file = io->file; + struct kiocb *iocb = io->iocb; + struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; struct fuse_write_in *inarg = &req->misc.write.in; fuse_write_fill(req, ff, pos, count); inarg->flags = file->f_flags; + if (iocb->ki_flags & IOCB_DSYNC) + inarg->flags |= O_DSYNC; + if (iocb->ki_flags & IOCB_SYNC) + inarg->flags |= O_SYNC; if (owner != NULL) { inarg->write_flags |= FUSE_WRITE_LOCKOWNER; inarg->lock_owner = fuse_lock_owner_id(fc, owner); @@ -993,14 +1001,14 @@ bool fuse_write_update_size(struct inode *inode, loff_t pos) return ret; } -static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, +static size_t fuse_send_write_pages(struct fuse_req *req, struct kiocb *iocb, struct inode *inode, loff_t pos, size_t count) { size_t res; unsigned offset; unsigned i; - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file); + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); for (i = 0; i < req->num_pages; i++) fuse_wait_on_page_writeback(inode, req->pages[i]->index); @@ -1100,7 +1108,7 @@ static inline unsigned fuse_wr_pages(loff_t pos, size_t len) FUSE_MAX_PAGES_PER_REQ); } -static ssize_t fuse_perform_write(struct file *file, +static ssize_t fuse_perform_write(struct kiocb *iocb, struct address_space *mapping, struct iov_iter *ii, loff_t pos) { @@ -1133,7 +1141,7 @@ static ssize_t fuse_perform_write(struct file *file, } else { size_t num_written; - num_written = fuse_send_write_pages(req, file, inode, + num_written = fuse_send_write_pages(req, iocb, inode, pos, count); err = req->out.h.error; if (!err) { @@ -1169,7 +1177,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if 
(get_fuse_conn(inode)->writeback_cache) { /* Update size (EOF optimization) and mode (SUID clearing) */ - err = fuse_update_attributes(mapping->host, NULL, file, NULL); + err = fuse_update_attributes(mapping->host, file); if (err) return err; @@ -1201,7 +1209,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) pos += written; - written_buffered = fuse_perform_write(file, mapping, from, pos); + written_buffered = fuse_perform_write(iocb, mapping, from, pos); if (written_buffered < 0) { err = written_buffered; goto out; @@ -1220,13 +1228,15 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) written += written_buffered; iocb->ki_pos = pos + written_buffered; } else { - written = fuse_perform_write(file, mapping, from, iocb->ki_pos); + written = fuse_perform_write(iocb, mapping, from, iocb->ki_pos); if (written >= 0) iocb->ki_pos += written; } out: current->backing_dev_info = NULL; inode_unlock(inode); + if (written > 0) + written = generic_write_sync(iocb, written); return written ? written : err; } @@ -1317,7 +1327,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, { int write = flags & FUSE_DIO_WRITE; int cuse = flags & FUSE_DIO_CUSE; - struct file *file = io->file; + struct file *file = io->iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; @@ -1399,8 +1409,7 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io, loff_t *ppos) { ssize_t res; - struct file *file = io->file; - struct inode *inode = file_inode(file); + struct inode *inode = file_inode(io->iocb->ki_filp); if (is_bad_inode(inode)) return -EIO; @@ -1414,15 +1423,14 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io, static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) { - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb->ki_filp); + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); return __fuse_direct_read(&io, to, &iocb->ki_pos); } static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct file *file = iocb->ki_filp; - struct inode *inode = file_inode(file); - struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(file); + struct inode *inode = file_inode(iocb->ki_filp); + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); ssize_t res; if (is_bad_inode(inode)) @@ -2102,11 +2110,11 @@ static int convert_fuse_file_lock(struct fuse_conn *fc, fl->fl_end = ffl->end; /* - * Convert pid into the caller's pid namespace. If the pid - * does not map into the namespace fl_pid will get set to 0. + * Convert pid into init's pid namespace. The locks API will + * translate it into the caller's pid namespace. 
*/ rcu_read_lock(); - fl->fl_pid = pid_vnr(find_pid_ns(ffl->pid, fc->pid_ns)); + fl->fl_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns); rcu_read_unlock(); break; @@ -2181,9 +2189,6 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX) return 0; - if (pid && pid_nr == 0) - return -EOVERFLOW; - fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg); err = fuse_simple_request(fc, &args); @@ -2303,7 +2308,7 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes); fallback: - err = fuse_update_attributes(inode, NULL, file, NULL); + err = fuse_update_attributes(inode, file); if (!err) return generic_file_llseek(file, offset, whence); else @@ -2323,7 +2328,7 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) break; case SEEK_END: inode_lock(inode); - retval = fuse_update_attributes(inode, NULL, file, NULL); + retval = fuse_update_attributes(inode, file); if (!retval) retval = generic_file_llseek(file, offset, whence); inode_unlock(inode); @@ -2874,7 +2879,6 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) io->offset = offset; io->write = (iov_iter_rw(iter) == WRITE); io->err = 0; - io->file = file; /* * By default, we want to optimize all I/Os with async request * submission to the client filesystem if supported. diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index bd4d2a3e1ec1..d5773ca67ad2 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -252,16 +252,15 @@ struct fuse_io_priv { bool should_dirty; int err; struct kiocb *iocb; - struct file *file; struct completion *done; bool blocking; }; -#define FUSE_IO_PRIV_SYNC(f) \ +#define FUSE_IO_PRIV_SYNC(i) \ { \ .refcnt = KREF_INIT(1), \ .async = 0, \ - .file = f, \ + .iocb = i, \ } /** @@ -905,8 +904,7 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); void fuse_update_ctime(struct inode *inode); -int fuse_update_attributes(struct inode *inode, struct kstat *stat, - struct file *file, bool *refreshed); +int fuse_update_attributes(struct inode *inode, struct file *file); void fuse_flush_writepages(struct inode *inode); diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 2524807ee070..9d5eecb123de 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -86,19 +86,6 @@ int __gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) char *data; const char *name = gfs2_acl_name(type); - if (acl && acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode))) - return -E2BIG; - - if (type == ACL_TYPE_ACCESS) { - umode_t mode = inode->i_mode; - - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (error) - return error; - if (mode != inode->i_mode) - mark_inode_dirty(inode); - } - if (acl) { len = posix_acl_to_xattr(&init_user_ns, acl, NULL, 0); if (len == 0) @@ -129,6 +116,10 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) struct gfs2_holder gh; bool need_unlock = false; int ret; + umode_t mode; + + if (acl && acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode))) + return -E2BIG; ret = gfs2_rsqa_alloc(ip); if (ret) @@ -140,7 +131,20 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) return ret; need_unlock = true; } + + mode = inode->i_mode; + if (type == ACL_TYPE_ACCESS && acl) { + ret = posix_acl_update_mode(inode, &mode, &acl); + if (ret) + goto unlock; + } + ret = __gfs2_set_acl(inode, acl, type); + if (!ret && mode != inode->i_mode) { + inode->i_mode = mode; 
+ mark_inode_dirty(inode); + } +unlock: if (need_unlock) gfs2_glock_dq_uninit(&gh); return ret; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index ed7a2e252ad8..68ed06962537 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -234,7 +234,19 @@ out: static int gfs2_writepages(struct address_space *mapping, struct writeback_control *wbc) { - return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); + struct gfs2_sbd *sdp = gfs2_mapping2sbd(mapping); + int ret = mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); + + /* + * Even if we didn't write any pages here, we might still be holding + * dirty pages in the ail. We forcibly flush the ail because we don't + * want balance_dirty_pages() to loop indefinitely trying to write out + * pages held in the ail that it can't find. + */ + if (ret == 0) + set_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags); + + return ret; } /** diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 9fa3aef9a5b3..3dd0cceefa43 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -291,8 +291,9 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl, if (trylock_buffer(rabh)) { if (!buffer_uptodate(rabh)) { rabh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, REQ_RAHEAD | REQ_META, - rabh); + submit_bh(REQ_OP_READ, + REQ_RAHEAD | REQ_META | REQ_PRIO, + rabh); continue; } unlock_buffer(rabh); @@ -1103,8 +1104,15 @@ static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp, while (true) { ptr = metapointer(h, mp); - if (*ptr) /* if we have a non-null pointer */ + if (*ptr) { /* if we have a non-null pointer */ + /* Now zero the metapath after the current height. */ + h++; + if (h < GFS2_MAX_META_HEIGHT) + memset(&mp->mp_list[h], 0, + (GFS2_MAX_META_HEIGHT - h) * + sizeof(mp->mp_list[0])); return true; + } if (mp->mp_list[h] < ptrs) mp->mp_list[h]++; @@ -1120,6 +1128,13 @@ enum dealloc_states { DEALLOC_DONE = 3, /* process complete */ }; +static bool mp_eq_to_hgt(struct metapath *mp, __u16 *nbof, unsigned int h) +{ + if (memcmp(mp->mp_list, nbof, h * sizeof(mp->mp_list[0]))) + return false; + return true; +} + /** * trunc_dealloc - truncate a file down to a desired size * @ip: inode to truncate @@ -1197,8 +1212,7 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize) /* If we're truncating to a non-zero size and the mp is at the beginning of file for the strip height, we need to preserve the first metadata pointer. */ - preserve1 = (newsize && - (mp.mp_list[mp_h] == nbof[mp_h])); + preserve1 = (newsize && mp_eq_to_hgt(&mp, nbof, mp_h)); bh = mp.mp_bh[mp_h]; gfs2_assert_withdraw(sdp, bh); if (gfs2_assert_withdraw(sdp, diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 5ee2e2f8576c..06a0d1947c77 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1513,7 +1513,9 @@ static void gfs2_dir_readahead(struct inode *inode, unsigned hsize, u32 index, continue; } bh->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, REQ_RAHEAD | REQ_META, bh); + submit_bh(REQ_OP_READ, + REQ_RAHEAD | REQ_META | REQ_PRIO, + bh); continue; } brelse(bh); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index c2062a108d19..33a0cb5701a3 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -668,12 +668,14 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, if (ret) return ret; if (gfs2_is_jdata(ip)) - filemap_write_and_wait(mapping); + ret = file_write_and_wait(file); + if (ret) + return ret; gfs2_ail_flush(ip->i_gl, 1); } if (mapping->nrpages) - ret = filemap_fdatawait_range(mapping, start, end); + ret = file_fdatawait_range(file, start, end); return ret ? 
ret : ret1; } @@ -1030,8 +1032,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) mutex_lock(&fp->f_fl_mutex); - gl = fl_gh->gh_gl; - if (gl) { + if (gfs2_holder_initialized(fl_gh)) { if (fl_gh->gh_state == state) goto out; locks_lock_file_wait(file, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index c38ab6c81898..98e845b7841b 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -15,6 +15,7 @@ #include <linux/buffer_head.h> #include <linux/delay.h> #include <linux/sort.h> +#include <linux/hash.h> #include <linux/jhash.h> #include <linux/kallsyms.h> #include <linux/gfs2_ondisk.h> @@ -71,7 +72,7 @@ static DEFINE_SPINLOCK(lru_lock); #define GFS2_GL_HASH_SHIFT 15 #define GFS2_GL_HASH_SIZE BIT(GFS2_GL_HASH_SHIFT) -static struct rhashtable_params ht_parms = { +static const struct rhashtable_params ht_parms = { .nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4, .key_len = offsetofend(struct lm_lockname, ln_type), .key_offset = offsetof(struct gfs2_glock, gl_name), @@ -80,6 +81,49 @@ static struct rhashtable_params ht_parms = { static struct rhashtable gl_hash_table; +#define GLOCK_WAIT_TABLE_BITS 12 +#define GLOCK_WAIT_TABLE_SIZE (1 << GLOCK_WAIT_TABLE_BITS) +static wait_queue_head_t glock_wait_table[GLOCK_WAIT_TABLE_SIZE] __cacheline_aligned; + +struct wait_glock_queue { + struct lm_lockname *name; + wait_queue_entry_t wait; +}; + +static int glock_wake_function(wait_queue_entry_t *wait, unsigned int mode, + int sync, void *key) +{ + struct wait_glock_queue *wait_glock = + container_of(wait, struct wait_glock_queue, wait); + struct lm_lockname *wait_name = wait_glock->name; + struct lm_lockname *wake_name = key; + + if (wake_name->ln_sbd != wait_name->ln_sbd || + wake_name->ln_number != wait_name->ln_number || + wake_name->ln_type != wait_name->ln_type) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} + +static wait_queue_head_t *glock_waitqueue(struct lm_lockname *name) +{ + u32 hash = jhash2((u32 *)name, sizeof(*name) / 4, 0); + + return glock_wait_table + hash_32(hash, GLOCK_WAIT_TABLE_BITS); +} + +/** + * wake_up_glock - Wake up waiters on a glock + * @gl: the glock + */ +static void wake_up_glock(struct gfs2_glock *gl) +{ + wait_queue_head_t *wq = glock_waitqueue(&gl->gl_name); + + if (waitqueue_active(wq)) + __wake_up(wq, TASK_NORMAL, 1, &gl->gl_name); +} + static void gfs2_glock_dealloc(struct rcu_head *rcu) { struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu); @@ -96,6 +140,9 @@ void gfs2_glock_free(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms); + smp_mb(); + wake_up_glock(gl); call_rcu(&gl->gl_rcu, gfs2_glock_dealloc); if (atomic_dec_and_test(&sdp->sd_glock_disposal)) wake_up(&sdp->sd_glock_wait); @@ -107,7 +154,7 @@ void gfs2_glock_free(struct gfs2_glock *gl) * */ -static void gfs2_glock_hold(struct gfs2_glock *gl) +void gfs2_glock_hold(struct gfs2_glock *gl) { GLOCK_BUG_ON(gl, __lockref_is_dead(&gl->gl_lockref)); lockref_get(&gl->gl_lockref); @@ -150,6 +197,9 @@ void gfs2_glock_add_to_lru(struct gfs2_glock *gl) static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl) { + if (!(gl->gl_ops->go_flags & GLOF_LRU)) + return; + spin_lock(&lru_lock); if (!list_empty(&gl->gl_lru)) { list_del_init(&gl->gl_lru); @@ -191,13 +241,20 @@ static void __gfs2_glock_put(struct gfs2_glock *gl) gfs2_glock_remove_from_lru(gl); spin_unlock(&gl->gl_lockref.lock); - rhashtable_remove_fast(&gl_hash_table, &gl->gl_node, ht_parms); GLOCK_BUG_ON(gl, 
!list_empty(&gl->gl_holders)); GLOCK_BUG_ON(gl, mapping && mapping->nrpages); trace_gfs2_glock_put(gl); sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); } +/* + * Cause the glock to be put in work queue context. + */ +void gfs2_glock_queue_put(struct gfs2_glock *gl) +{ + gfs2_glock_queue_work(gl, 0); +} + /** * gfs2_glock_put() - Decrement reference count on glock * @gl: The glock to put @@ -676,6 +733,40 @@ static void glock_work_func(struct work_struct *work) spin_unlock(&gl->gl_lockref.lock); } +static struct gfs2_glock *find_insert_glock(struct lm_lockname *name, + struct gfs2_glock *new) +{ + struct wait_glock_queue wait; + wait_queue_head_t *wq = glock_waitqueue(name); + struct gfs2_glock *gl; + + wait.name = name; + init_wait(&wait.wait); + wait.wait.func = glock_wake_function; + +again: + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + rcu_read_lock(); + if (new) { + gl = rhashtable_lookup_get_insert_fast(&gl_hash_table, + &new->gl_node, ht_parms); + if (IS_ERR(gl)) + goto out; + } else { + gl = rhashtable_lookup_fast(&gl_hash_table, + name, ht_parms); + } + if (gl && !lockref_get_not_dead(&gl->gl_lockref)) { + rcu_read_unlock(); + schedule(); + goto again; + } +out: + rcu_read_unlock(); + finish_wait(wq, &wait.wait); + return gl; +} + /** * gfs2_glock_get() - Get a glock, or create one if one doesn't exist * @sdp: The GFS2 superblock @@ -702,15 +793,11 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, struct kmem_cache *cachep; int ret = 0; - rcu_read_lock(); - gl = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms); - if (gl && !lockref_get_not_dead(&gl->gl_lockref)) - gl = NULL; - rcu_read_unlock(); - - *glp = gl; - if (gl) + gl = find_insert_glock(&name, NULL); + if (gl) { + *glp = gl; return 0; + } if (!create) return -ENOENT; @@ -764,10 +851,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, mapping->writeback_index = 0; } -again: - rcu_read_lock(); - tmp = rhashtable_lookup_get_insert_fast(&gl_hash_table, &gl->gl_node, - ht_parms); + tmp = find_insert_glock(&name, gl); if (!tmp) { *glp = gl; goto out; @@ -776,13 +860,7 @@ again: ret = PTR_ERR(tmp); goto out_free; } - if (lockref_get_not_dead(&tmp->gl_lockref)) { - *glp = tmp; - goto out_free; - } - rcu_read_unlock(); - cond_resched(); - goto again; + *glp = tmp; out_free: kfree(gl->gl_lksb.sb_lvbptr); @@ -790,7 +868,6 @@ out_free: atomic_dec(&sdp->sd_glock_disposal); out: - rcu_read_unlock(); return ret; } @@ -1473,14 +1550,15 @@ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) do { gl = ERR_PTR(rhashtable_walk_start(&iter)); - if (gl) - continue; + if (IS_ERR(gl)) + goto walk_stop; while ((gl = rhashtable_walk_next(&iter)) && !IS_ERR(gl)) - if ((gl->gl_name.ln_sbd == sdp) && + if (gl->gl_name.ln_sbd == sdp && lockref_get_not_dead(&gl->gl_lockref)) examiner(gl); +walk_stop: rhashtable_walk_stop(&iter); } while (cond_resched(), gl == ERR_PTR(-EAGAIN)); @@ -1803,7 +1881,7 @@ static int gfs2_sbstats_seq_show(struct seq_file *seq, void *iter_ptr) int __init gfs2_glock_init(void) { - int ret; + int i, ret; ret = rhashtable_init(&gl_hash_table, &ht_parms); if (ret < 0) @@ -1832,6 +1910,9 @@ int __init gfs2_glock_init(void) return ret; } + for (i = 0; i < GLOCK_WAIT_TABLE_SIZE; i++) + init_waitqueue_head(glock_wait_table + i); + return 0; } @@ -1860,6 +1941,7 @@ static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi) } static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(RCU) { struct gfs2_glock_iter *gi = seq->private; loff_t n = *pos; @@ 
-1892,6 +1974,7 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr, } static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) + __releases(RCU) { struct gfs2_glock_iter *gi = seq->private; diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 9ad4a6ac6c84..5e12220cc0c2 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -13,6 +13,7 @@ #include <linux/sched.h> #include <linux/parser.h> #include "incore.h" +#include "util.h" /* Options for hostdata parser */ @@ -181,7 +182,9 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl) extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, int create, struct gfs2_glock **glp); +extern void gfs2_glock_hold(struct gfs2_glock *gl); extern void gfs2_glock_put(struct gfs2_glock *gl); +extern void gfs2_glock_queue_put(struct gfs2_glock *gl); extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags, struct gfs2_holder *gh); extern void gfs2_holder_reinit(unsigned int state, u16 flags, @@ -257,11 +260,44 @@ static inline bool gfs2_holder_initialized(struct gfs2_holder *gh) return gh->gh_gl; } +/** + * glock_set_object - set the gl_object field of a glock + * @gl: the glock + * @object: the object + */ static inline void glock_set_object(struct gfs2_glock *gl, void *object) { spin_lock(&gl->gl_lockref.lock); + if (gfs2_assert_warn(gl->gl_name.ln_sbd, gl->gl_object == NULL)) + gfs2_dump_glock(NULL, gl); gl->gl_object = object; spin_unlock(&gl->gl_lockref.lock); } +/** + * glock_clear_object - clear the gl_object field of a glock + * @gl: the glock + * @object: the object + * + * I'd love to similarly add this: + * else if (gfs2_assert_warn(gl->gl_sbd, gl->gl_object == object)) + * gfs2_dump_glock(NULL, gl); + * Unfortunately, that's not possible because as soon as gfs2_delete_inode + * frees the block in the rgrp, another process can reassign it for an I_NEW + * inode in gfs2_create_inode because that calls new_inode, not gfs2_iget. + * That means gfs2_delete_inode may subsequently try to call this function + * for a glock that's already pointing to a brand new inode. If we clear the + * new inode's gl_object, we'll introduce metadata corruption. Function + * gfs2_delete_inode calls clear_inode which calls gfs2_clear_inode which also + * tries to clear gl_object, so it's more than just gfs2_delete_inode. + * + */ +static inline void glock_clear_object(struct gfs2_glock *gl, void *object) +{ + spin_lock(&gl->gl_lockref.lock); + if (gl->gl_object == object) + gl->gl_object = NULL; + spin_unlock(&gl->gl_lockref.lock); +} + #endif /* __GLOCK_DOT_H__ */ diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 5e69636d4dd3..dac6559e2195 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -329,32 +329,6 @@ static int inode_go_demote_ok(const struct gfs2_glock *gl) return 1; } -/** - * gfs2_set_nlink - Set the inode's link count based on on-disk info - * @inode: The inode in question - * @nlink: The link count - * - * If the link count has hit zero, it must never be raised, whatever the - * on-disk inode might say. When new struct inodes are created the link - * count is set to 1, so that we can safely use this test even when reading - * in on disk information for the first time. - */ - -static void gfs2_set_nlink(struct inode *inode, u32 nlink) -{ - /* - * We will need to review setting the nlink count here in the - * light of the forthcoming ro bind mount work. This is a reminder - * to do that. 
- */ - if ((inode->i_nlink != nlink) && (inode->i_nlink != 0)) { - if (nlink == 0) - clear_nlink(inode); - else - set_nlink(inode, nlink); - } -} - static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) { const struct gfs2_dinode *str = buf; @@ -376,7 +350,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) i_uid_write(&ip->i_inode, be32_to_cpu(str->di_uid)); i_gid_write(&ip->i_inode, be32_to_cpu(str->di_gid)); - gfs2_set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink)); + set_nlink(&ip->i_inode, be32_to_cpu(str->di_nlink)); i_size_write(&ip->i_inode, be64_to_cpu(str->di_size)); gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); atime.tv_sec = be64_to_cpu(str->di_atime); @@ -470,7 +444,7 @@ static int inode_go_lock(struct gfs2_holder *gh) (gh->gh_state == LM_ST_EXCLUSIVE)) { spin_lock(&sdp->sd_trunc_lock); if (list_empty(&ip->i_trunc_list)) - list_add(&sdp->sd_trunc_list, &ip->i_trunc_list); + list_add(&ip->i_trunc_list, &sdp->sd_trunc_list); spin_unlock(&sdp->sd_trunc_lock); wake_up(&sdp->sd_quota_wait); return 1; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 73fce76e67ee..6e18e9793ec4 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -606,6 +606,7 @@ enum { SDF_NOJOURNALID = 6, SDF_RORECOVERY = 7, /* read only recovery */ SDF_SKIP_DLM_UNLOCK = 8, + SDF_FORCE_AIL_FLUSH = 9, }; enum gfs2_freeze_state { @@ -816,6 +817,7 @@ struct gfs2_sbd { atomic_t sd_log_in_flight; struct bio *sd_log_bio; wait_queue_head_t sd_log_flush_wait; + int sd_log_error; atomic_t sd_reserving_log; wait_queue_head_t sd_reserving_log_wait; @@ -831,7 +833,7 @@ struct gfs2_sbd { atomic_t sd_freeze_state; struct mutex sd_freeze_mutex; - char sd_fsname[GFS2_FSNAME_LEN]; + char sd_fsname[GFS2_FSNAME_LEN + 3 * sizeof(int) + 2]; char sd_table_name[GFS2_FSNAME_LEN]; char sd_proto_name[GFS2_FSNAME_LEN]; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index acca501f8110..863749e29bf9 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -109,7 +109,7 @@ static void gfs2_set_iop(struct inode *inode) * @no_addr: The inode number * @no_formal_ino: The inode generation number * @blktype: Requested block type (GFS2_BLKST_DINODE or GFS2_BLKST_UNLINKED; - * GFS2_BLKST_FREE do indicate not to verify) + * GFS2_BLKST_FREE to indicate not to verify) * * If @type is DT_UNKNOWN, the inode type is fetched from disk. 
* @@ -145,7 +145,6 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, if (unlikely(error)) goto fail; flush_delayed_work(&ip->i_gl->gl_work); - glock_set_object(ip->i_gl, ip); error = gfs2_glock_get(sdp, no_addr, &gfs2_iopen_glops, CREATE, &io_gl); if (unlikely(error)) @@ -170,11 +169,11 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, } } + glock_set_object(ip->i_gl, ip); set_bit(GIF_INVALID, &ip->i_flags); error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (unlikely(error)) goto fail_put; - flush_delayed_work(&ip->i_iopen_gh.gh_gl->gl_work); glock_set_object(ip->i_iopen_gh.gh_gl, ip); gfs2_glock_put(io_gl); io_gl = NULL; @@ -202,14 +201,14 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, fail_refresh: ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - glock_set_object(ip->i_iopen_gh.gh_gl, NULL); + glock_clear_object(ip->i_iopen_gh.gh_gl, ip); gfs2_glock_dq_uninit(&ip->i_iopen_gh); fail_put: if (io_gl) gfs2_glock_put(io_gl); + glock_clear_object(ip->i_gl, ip); if (gfs2_holder_initialized(&i_gh)) gfs2_glock_dq_uninit(&i_gh); - glock_set_object(ip->i_gl, NULL); fail: iget_failed(inode); return ERR_PTR(error); @@ -706,8 +705,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); if (error) goto fail_free_inode; - + flush_delayed_work(&ip->i_gl->gl_work); glock_set_object(ip->i_gl, ip); + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); if (error) goto fail_free_inode; @@ -775,14 +775,17 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, return error; fail_gunlock3: + glock_clear_object(io_gl, ip); gfs2_glock_dq_uninit(&ip->i_iopen_gh); gfs2_glock_put(io_gl); fail_gunlock2: if (io_gl) clear_bit(GLF_INODE_CREATING, &io_gl->gl_flags); fail_free_inode: - if (ip->i_gl) + if (ip->i_gl) { + glock_clear_object(ip->i_gl, ip); gfs2_glock_put(ip->i_gl); + } gfs2_rsqa_delete(ip, NULL); fail_free_acls: if (default_acl) diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 0515f0a68637..65f33a0ac190 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -23,8 +23,6 @@ #include "sys.h" #include "trace_gfs2.h" -extern struct workqueue_struct *gfs2_control_wq; - /** * gfs2_update_stats - Update time based stats * @mv: Pointer to mean/variance structure to update @@ -1059,6 +1057,7 @@ static void free_recover_size(struct lm_lockstruct *ls) ls->ls_recover_submit = NULL; ls->ls_recover_result = NULL; ls->ls_recover_size = 0; + ls->ls_lvb_bits = NULL; } /* dlm calls before it does lock recovery */ @@ -1175,7 +1174,7 @@ static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid, spin_unlock(&ls->ls_recover_spin); } -const struct dlm_lockspace_ops gdlm_lockspace_ops = { +static const struct dlm_lockspace_ops gdlm_lockspace_ops = { .recover_prep = gdlm_recover_prep, .recover_slot = gdlm_recover_slot, .recover_done = gdlm_recover_done, diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 9a624f694400..f72c44231406 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -898,6 +898,10 @@ static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp) static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp) { unsigned int used_blocks = sdp->sd_jdesc->jd_blocks - atomic_read(&sdp->sd_log_blks_free); + + if (test_and_clear_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags)) + return 1; + return used_blocks + atomic_read(&sdp->sd_log_blks_needed) >= 
atomic_read(&sdp->sd_log_thresh2); } @@ -919,6 +923,15 @@ int gfs2_logd(void *data) while (!kthread_should_stop()) { + /* Check for errors writing to the journal */ + if (sdp->sd_log_error) { + gfs2_lm_withdraw(sdp, + "GFS2: fsid=%s: error %d: " + "withdrawing the file system to " + "prevent further damage.\n", + sdp->sd_fsname, sdp->sd_log_error); + } + did_flush = false; if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { gfs2_ail1_empty(sdp); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 3010f9edd177..c8ff7b7954f0 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -207,8 +207,11 @@ static void gfs2_end_log_write(struct bio *bio) struct page *page; int i; - if (bio->bi_status) - fs_err(sdp, "Error %d writing to log\n", bio->bi_status); + if (bio->bi_status) { + fs_err(sdp, "Error %d writing to journal, jid=%u\n", + bio->bi_status, sdp->sd_jdesc->jd_jid); + wake_up(&sdp->sd_logd_waitq); + } bio_for_each_segment_all(bvec, bio, i) { page = bvec->bv_page; @@ -265,7 +268,7 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno) bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9); - bio->bi_bdev = sb->s_bdev; + bio_set_dev(bio, sb->s_bdev); bio->bi_end_io = gfs2_end_log_write; bio->bi_private = sdp; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index fabe1614f879..52de1036d9f9 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -221,7 +221,7 @@ static void gfs2_submit_bhs(int op, int op_flags, struct buffer_head *bhs[], bio = bio_alloc(GFP_NOIO, num); bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio->bi_bdev = bh->b_bdev; + bio_set_dev(bio, bh->b_bdev); while (num > 0) { bh = *bhs; if (!bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh))) { @@ -419,8 +419,9 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) { brelse(bh); ret = -EIO; + } else { + *bhp = bh; } - *bhp = bh; return ret; } @@ -452,7 +453,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (buffer_uptodate(first_bh)) goto out; if (!buffer_locked(first_bh)) - ll_rw_block(REQ_OP_READ, REQ_META, 1, &first_bh); + ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &first_bh); dblock++; extlen--; @@ -461,7 +462,9 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) bh = gfs2_getbuf(gl, dblock, CREATE); if (!buffer_uptodate(bh) && !buffer_locked(bh)) - ll_rw_block(REQ_OP_READ, REQ_RAHEAD | REQ_META, 1, &bh); + ll_rw_block(REQ_OP_READ, + REQ_RAHEAD | REQ_META | REQ_PRIO, + 1, &bh); brelse(bh); dblock++; extlen--; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index e76058d34b74..84593587691d 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -242,7 +242,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) bio = bio_alloc(GFP_NOFS, 1); bio->bi_iter.bi_sector = sector * (sb->s_blocksize >> 9); - bio->bi_bdev = sb->s_bdev; + bio_set_dev(bio, sb->s_bdev); bio_add_page(bio, page, PAGE_SIZE, 0); bio->bi_end_io = end_bio_io_page; @@ -1113,7 +1113,7 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent return error; } - snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s", sdp->sd_table_name); + snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s", sdp->sd_table_name); error = gfs2_sys_fs_add(sdp); /* @@ -1159,10 +1159,10 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent } if (sdp->sd_args.ar_spectator) - 
snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", + snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s.s", sdp->sd_table_name); else - snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", + snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s.%u", sdp->sd_table_name, sdp->sd_lockstruct.ls_jid); error = init_inodes(sdp, DO); @@ -1388,7 +1388,6 @@ static void gfs2_kill_sb(struct super_block *sb) sdp->sd_root_dir = NULL; sdp->sd_master_dir = NULL; shrink_dcache_sb(sb); - gfs2_delete_debugfs_file(sdp); free_percpu(sdp->sd_lkstats); kill_block_super(sb); } diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index c2ca9566b764..e647938432bd 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -730,7 +730,7 @@ static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index, if (PageUptodate(page)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { - ll_rw_block(REQ_OP_READ, REQ_META, 1, &bh); + ll_rw_block(REQ_OP_READ, REQ_META | REQ_PRIO, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) goto unlock_out; @@ -1474,8 +1474,11 @@ static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error) { if (error == 0 || error == -EROFS) return; - if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) { fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error); + sdp->sd_log_error = error; + wake_up(&sdp->sd_logd_waitq); + } } static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg, diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 836e38ba5d0a..95b2a57ded33 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -705,8 +705,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) rb_erase(n, &sdp->sd_rindex_tree); if (gl) { - glock_set_object(gl, NULL); - gfs2_glock_add_to_lru(gl); + glock_clear_object(gl, rgd); gfs2_glock_put(gl); } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index fdedec379b78..769841185ce5 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -924,6 +924,7 @@ restart: gfs2_jindex_free(sdp); /* Take apart glock structures and buffer lists */ gfs2_gl_hash_clear(sdp); + gfs2_delete_debugfs_file(sdp); /* Unmount the locking protocol */ gfs2_lm_unmount(sdp); @@ -943,9 +944,9 @@ static int gfs2_sync_fs(struct super_block *sb, int wait) struct gfs2_sbd *sdp = sb->s_fs_info; gfs2_quota_sync(sb, -1); - if (wait && sdp) + if (wait) gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); - return 0; + return sdp->sd_log_error; } void gfs2_freeze_func(struct work_struct *work) @@ -1295,7 +1296,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) * gfs2_drop_inode - Drop an inode (test for remote unlink) * @inode: The inode to drop * - * If we've received a callback on an iopen lock then its because a + * If we've received a callback on an iopen lock then it's because a * remote node tried to deallocate the inode but failed due to this node * still having the inode open. Here we mark the link count zero * since we know that it must have reached zero if the GLF_DEMOTE flag @@ -1317,6 +1318,23 @@ static int gfs2_drop_inode(struct inode *inode) if (test_bit(GLF_DEMOTE, &gl->gl_flags)) clear_nlink(inode); } + + /* + * When under memory pressure when an inode's link count has dropped to + * zero, defer deleting the inode to the delete workqueue. This avoids + * calling into DLM under memory pressure, which can deadlock. 
+ */ + if (!inode->i_nlink && + unlikely(current->flags & PF_MEMALLOC) && + gfs2_holder_initialized(&ip->i_iopen_gh)) { + struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; + + gfs2_glock_hold(gl); + if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) + gfs2_glock_queue_put(gl); + return false; + } + return generic_drop_inode(inode); } @@ -1501,6 +1519,22 @@ out_qs: } /** + * gfs2_glock_put_eventually + * @gl: The glock to put + * + * When under memory pressure, trigger a deferred glock put to make sure we + * won't call into DLM and deadlock. Otherwise, put the glock directly. + */ + +static void gfs2_glock_put_eventually(struct gfs2_glock *gl) +{ + if (current->flags & PF_MEMALLOC) + gfs2_glock_queue_put(gl); + else + gfs2_glock_put(gl); +} + +/** * gfs2_evict_inode - Remove an inode from cache * @inode: The inode to evict * @@ -1544,9 +1578,14 @@ static void gfs2_evict_inode(struct inode *inode) goto alloc_failed; } + /* Deletes should never happen under memory pressure anymore. */ + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) + goto out; + /* Must not read inode block until block type has been verified */ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); if (unlikely(error)) { + glock_clear_object(ip->i_iopen_gh.gh_gl, ip); ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_uninit(&ip->i_iopen_gh); goto out; @@ -1562,6 +1601,12 @@ static void gfs2_evict_inode(struct inode *inode) goto out_truncate; } + /* + * The inode may have been recreated in the meantime. + */ + if (inode->i_nlink) + goto out_truncate; + alloc_failed: if (gfs2_holder_initialized(&ip->i_iopen_gh) && test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { @@ -1595,6 +1640,11 @@ alloc_failed: goto out_unlock; } + /* We're about to clear the bitmap for the dinode, but as soon as we + do, gfs2_create_inode can create another inode at the same block + location and try to set gl_object again. We clear gl_object here so + that subsequent inode creates don't see an old gl_object. 
*/ + glock_clear_object(ip->i_gl, ip); error = gfs2_dinode_dealloc(ip); goto out_unlock; @@ -1623,14 +1673,17 @@ out_unlock: gfs2_rs_deltree(&ip->i_res); if (gfs2_holder_initialized(&ip->i_iopen_gh)) { + glock_clear_object(ip->i_iopen_gh.gh_gl, ip); if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq(&ip->i_iopen_gh); } gfs2_holder_uninit(&ip->i_iopen_gh); } - if (gfs2_holder_initialized(&gh)) + if (gfs2_holder_initialized(&gh)) { + glock_clear_object(ip->i_gl, ip); gfs2_glock_dq_uninit(&gh); + } if (error && error != GLR_TRYFAILED && error != -EROFS) fs_warn(sdp, "gfs2_evict_inode: %d\n", error); out: @@ -1640,15 +1693,19 @@ out: gfs2_ordered_del_inode(ip); clear_inode(inode); gfs2_dir_hash_inval(ip); - glock_set_object(ip->i_gl, NULL); + glock_clear_object(ip->i_gl, ip); wait_on_bit_io(&ip->i_flags, GIF_GLOP_PENDING, TASK_UNINTERRUPTIBLE); gfs2_glock_add_to_lru(ip->i_gl); - gfs2_glock_put(ip->i_gl); + gfs2_glock_put_eventually(ip->i_gl); ip->i_gl = NULL; if (gfs2_holder_initialized(&ip->i_iopen_gh)) { - glock_set_object(ip->i_iopen_gh.gh_gl, NULL); + struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; + + glock_clear_object(gl, ip); ip->i_iopen_gh.gh_flags |= GL_NOCACHE; + gfs2_glock_hold(gl); gfs2_glock_dq_uninit(&ip->i_iopen_gh); + gfs2_glock_put_eventually(gl); } } diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index c81295f407f6..3926f95a6eb7 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -151,6 +151,7 @@ extern struct kmem_cache *gfs2_rgrpd_cachep; extern struct kmem_cache *gfs2_quotad_cachep; extern struct kmem_cache *gfs2_qadata_cachep; extern mempool_t *gfs2_page_pool; +extern struct workqueue_struct *gfs2_control_wq; static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, unsigned int *p) diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 54179554c7d2..ea09e41dbb49 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -25,6 +25,7 @@ #include "meta_io.h" #include "quota.h" #include "rgrp.h" +#include "super.h" #include "trans.h" #include "util.h" @@ -1209,8 +1210,12 @@ int __gfs2_xattr_set(struct inode *inode, const char *name, if (namel > GFS2_EA_MAX_NAME_LEN) return -ERANGE; - if (value == NULL) - return gfs2_xattr_remove(ip, type, name); + if (value == NULL) { + error = gfs2_xattr_remove(ip, type, name); + if (error == -ENODATA && !(flags & XATTR_REPLACE)) + error = 0; + return error; + } if (ea_check_size(sdp, namel, size)) return -ERANGE; diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index bfbba799430f..2538b49cc349 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -656,7 +656,7 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end, struct super_block * sb; int ret, err; - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = file_write_and_wait_range(filp, start, end); if (ret) return ret; inode_lock(inode); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index e8638d528195..4f26b6877130 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -283,7 +283,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); int error = 0, error2; - error = filemap_write_and_wait_range(inode->i_mapping, start, end); + error = file_write_and_wait_range(file, start, end); if (error) return error; inode_lock(inode); diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index e254fa0f0697..10032b919a85 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -65,7 +65,7 @@ int hfsplus_submit_bio(struct 
super_block *sb, sector_t sector, bio = bio_alloc(GFP_NOIO, 1); bio->bi_iter.bi_sector = sector; - bio->bi_bdev = sb->s_bdev; + bio_set_dev(bio, sb->s_bdev); bio_set_op_attrs(bio, op, op_flags); if (op != WRITE && data) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index e61261a7417e..c148e7f4f451 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -374,7 +374,7 @@ static int hostfs_fsync(struct file *file, loff_t start, loff_t end, struct inode *inode = file->f_mapping->host; int ret; - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = file_write_and_wait_range(file, start, end); if (ret) return ret; diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index b3be1b5a62e2..f26138425b16 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -24,7 +24,7 @@ int hpfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) struct inode *inode = file->f_mapping->host; int ret; - ret = filemap_write_and_wait_range(file->f_mapping, start, end); + ret = file_write_and_wait_range(file, start, end); if (ret) return ret; return sync_blockdev(inode->i_sb->s_bdev); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 28d2753be094..59073e9f01a4 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -334,7 +334,7 @@ static void remove_huge_page(struct page *page) } static void -hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end) +hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end) { struct vm_area_struct *vma; @@ -401,9 +401,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, const pgoff_t end = lend >> huge_page_shift(h); struct vm_area_struct pseudo_vma; struct pagevec pvec; - pgoff_t next; + pgoff_t next, index; int i, freed = 0; - long lookup_nr = PAGEVEC_SIZE; bool truncate_op = (lend == LLONG_MAX); memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); @@ -412,33 +411,19 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, next = start; while (next < end) { /* - * Don't grab more pages than the number left in the range. - */ - if (end - next < lookup_nr) - lookup_nr = end - next; - - /* * When no more pages are found, we are done. */ - if (!pagevec_lookup(&pvec, mapping, next, lookup_nr)) + if (!pagevec_lookup_range(&pvec, mapping, &next, end - 1)) break; for (i = 0; i < pagevec_count(&pvec); ++i) { struct page *page = pvec.pages[i]; u32 hash; - /* - * The page (index) could be beyond end. This is - * only possible in the punch hole case as end is - * max page offset in the truncate case. 
- */ - next = page->index; - if (next >= end) - break; - + index = page->index; hash = hugetlb_fault_mutex_hash(h, current->mm, &pseudo_vma, - mapping, next, 0); + mapping, index, 0); mutex_lock(&hugetlb_fault_mutex_table[hash]); /* @@ -455,8 +440,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, i_mmap_lock_write(mapping); hugetlb_vmdelete_list(&mapping->i_mmap, - next * pages_per_huge_page(h), - (next + 1) * pages_per_huge_page(h)); + index * pages_per_huge_page(h), + (index + 1) * pages_per_huge_page(h)); i_mmap_unlock_write(mapping); } @@ -475,14 +460,13 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, freed++; if (!truncate_op) { if (unlikely(hugetlb_unreserve_pages(inode, - next, next + 1, 1))) + index, index + 1, 1))) hugetlb_fix_reserve_counts(inode); } unlock_page(page); mutex_unlock(&hugetlb_fault_mutex_table[hash]); } - ++next; huge_pagevec_release(&pvec); cond_resched(); } @@ -514,7 +498,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) i_size_write(inode, offset); i_mmap_lock_write(mapping); - if (!RB_EMPTY_ROOT(&mapping->i_mmap)) + if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0); i_mmap_unlock_write(mapping); remove_inode_hugepages(inode, offset, LLONG_MAX); @@ -539,7 +523,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) inode_lock(inode); i_mmap_lock_write(mapping); - if (!RB_EMPTY_ROOT(&mapping->i_mmap)) + if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, hole_start >> PAGE_SHIFT, hole_end >> PAGE_SHIFT); @@ -846,7 +830,10 @@ static int hugetlbfs_migrate_page(struct address_space *mapping, rc = migrate_huge_page_move_mapping(mapping, newpage, page); if (rc != MIGRATEPAGE_SUCCESS) return rc; - migrate_page_copy(newpage, page); + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); + else + migrate_page_states(newpage, page); return MIGRATEPAGE_SUCCESS; } diff --git a/fs/inode.c b/fs/inode.c index 50370599e371..d1e35b53bb23 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -353,7 +353,7 @@ void address_space_init_once(struct address_space *mapping) init_rwsem(&mapping->i_mmap_rwsem); INIT_LIST_HEAD(&mapping->private_list); spin_lock_init(&mapping->private_lock); - mapping->i_mmap = RB_ROOT; + mapping->i_mmap = RB_ROOT_CACHED; } EXPORT_SYMBOL(address_space_init_once); @@ -637,6 +637,7 @@ again: dispose_list(&dispose); } +EXPORT_SYMBOL_GPL(evict_inodes); /** * invalidate_inodes - attempt to free all inodes on a superblock @@ -1569,11 +1570,24 @@ EXPORT_SYMBOL(bmap); static void update_ovl_inode_times(struct dentry *dentry, struct inode *inode, bool rcu) { - if (!rcu) { - struct inode *realinode = d_real_inode(dentry); + struct dentry *upperdentry; - if (unlikely(inode != realinode) && - (!timespec_equal(&inode->i_mtime, &realinode->i_mtime) || + /* + * Nothing to do if in rcu or if non-overlayfs + */ + if (rcu || likely(!(dentry->d_flags & DCACHE_OP_REAL))) + return; + + upperdentry = d_real(dentry, NULL, 0, D_REAL_UPPER); + + /* + * If file is on lower then we can't update atime, so no worries about + * stale mtime/ctime. 
+ */ + if (upperdentry) { + struct inode *realinode = d_inode(upperdentry); + + if ((!timespec_equal(&inode->i_mtime, &realinode->i_mtime) || !timespec_equal(&inode->i_ctime, &realinode->i_ctime))) { inode->i_mtime = realinode->i_mtime; inode->i_ctime = realinode->i_ctime; diff --git a/fs/internal.h b/fs/internal.h index 9676fe11c093..48cee21b4f14 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -71,8 +71,10 @@ extern void __init mnt_init(void); extern int __mnt_want_write(struct vfsmount *); extern int __mnt_want_write_file(struct file *); +extern int mnt_want_write_file_path(struct file *); extern void __mnt_drop_write(struct vfsmount *); extern void __mnt_drop_write_file(struct file *); +extern void mnt_drop_write_file_path(struct file *); /* * fs_struct.c @@ -132,7 +134,6 @@ static inline bool atime_needs_update_rcu(const struct path *path, extern void inode_io_list_del(struct inode *inode); extern long get_nr_dirty_inodes(void); -extern void evict_inodes(struct super_block *); extern int invalidate_inodes(struct super_block *, bool); /* diff --git a/fs/iomap.c b/fs/iomap.c index 59cc98ad7577..269b24a01f32 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -477,10 +477,10 @@ int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) set_page_dirty(page); wait_for_stable_page(page); - return 0; + return VM_FAULT_LOCKED; out_unlock: unlock_page(page); - return ret; + return block_page_mkwrite_return(ret); } EXPORT_SYMBOL_GPL(iomap_page_mkwrite); @@ -805,7 +805,7 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, struct bio *bio; bio = bio_alloc(GFP_KERNEL, 1); - bio->bi_bdev = iomap->bdev; + bio_set_dev(bio, iomap->bdev); bio->bi_iter.bi_sector = iomap->blkno + ((pos - iomap->offset) >> 9); bio->bi_private = dio; @@ -884,7 +884,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, return 0; bio = bio_alloc(GFP_KERNEL, nr_pages); - bio->bi_bdev = iomap->bdev; + bio_set_dev(bio, iomap->bdev); bio->bi_iter.bi_sector = iomap->blkno + ((pos - iomap->offset) >> 9); bio->bi_write_hint = dio->iocb->ki_hint; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 217a5e7815da..f1ed935322db 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -96,7 +96,7 @@ static int __init init_inodecache(void) 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); - if (isofs_inode_cachep == NULL) + if (!isofs_inode_cachep) return -ENOMEM; return 0; } @@ -678,7 +678,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent) if (isonum_711(vdp->type) == ISO_VD_END) break; if (isonum_711(vdp->type) == ISO_VD_PRIMARY) { - if (pri == NULL) { + if (!pri) { pri = (struct iso_primary_descriptor *)vdp; /* Save the buffer in case we need it ... */ pri_bh = bh; @@ -742,7 +742,7 @@ root_found: goto out_freebh; } - if (joliet_level && (pri == NULL || !opt.rock)) { + if (joliet_level && (!pri || !opt.rock)) { /* This is the case of Joliet with the norock mount flag. 
* A disc with both Joliet and Rock Ridge is handled later */ @@ -1298,7 +1298,7 @@ static int isofs_read_inode(struct inode *inode, int relocated) unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); unsigned long block; int high_sierra = sbi->s_high_sierra; - struct buffer_head *bh = NULL; + struct buffer_head *bh; struct iso_directory_record *de; struct iso_directory_record *tmpde = NULL; unsigned int de_len; @@ -1320,8 +1320,7 @@ static int isofs_read_inode(struct inode *inode, int relocated) int frag1 = bufsize - offset; tmpde = kmalloc(de_len, GFP_KERNEL); - if (tmpde == NULL) { - printk(KERN_INFO "%s: out of memory\n", __func__); + if (!tmpde) { ret = -ENOMEM; goto fail; } diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index c12476e309c6..bd0428bebe9b 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -35,7 +35,7 @@ int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync) struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); int ret; - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); + ret = file_write_and_wait_range(filp, start, end); if (ret) return ret; diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index b25d28a21212..48d9522e209c 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -17,7 +17,7 @@ #include <linux/slab.h> #include <linux/mtd/mtd.h> #include <linux/crc32.h> -#include <linux/mtd/nand.h> +#include <linux/mtd/rawnand.h> #include <linux/jiffies.h> #include <linux/sched.h> #include <linux/writeback.h> diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 739492c7a3fd..36665fd37095 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -34,7 +34,7 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) struct inode *inode = file->f_mapping->host; int rc = 0; - rc = filemap_write_and_wait_range(inode->i_mapping, start, end); + rc = file_write_and_wait_range(file, start, end); if (rc) return rc; diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index a21f0e9eecd4..0e5d412c0b01 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1995,7 +1995,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bio = bio_alloc(GFP_NOFS, 1); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); - bio->bi_bdev = log->bdev; + bio_set_dev(bio, log->bdev); bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); @@ -2139,7 +2139,7 @@ static void lbmStartIO(struct lbuf * bp) bio = bio_alloc(GFP_NOFS, 1); bio->bi_iter.bi_sector = bp->l_blkno << (log->l2bsize - 9); - bio->bi_bdev = log->bdev; + bio_set_dev(bio, log->bdev); bio_add_page(bio, bp->l_page, LOGPSIZE, bp->l_offset); BUG_ON(bio->bi_iter.bi_size != LOGPSIZE); diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 65120a471729..1c4b9ad4d7ab 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -430,7 +430,7 @@ static int metapage_writepage(struct page *page, struct writeback_control *wbc) len = min(xlen, (int)JFS_SBI(inode->i_sb)->nbperpage); bio = bio_alloc(GFP_NOFS, 1); - bio->bi_bdev = inode->i_sb->s_bdev; + bio_set_dev(bio, inode->i_sb->s_bdev); bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9); bio->bi_end_io = metapage_write_end_io; bio->bi_private = page; @@ -510,7 +510,7 @@ static int metapage_readpage(struct file *fp, struct page *page) submit_bio(bio); bio = bio_alloc(GFP_NOFS, 1); - bio->bi_bdev = inode->i_sb->s_bdev; + bio_set_dev(bio, inode->i_sb->s_bdev); bio->bi_iter.bi_sector = pblock << (inode->i_blkbits - 9); bio->bi_end_io = metapage_read_end_io; diff --git 
a/fs/jfs/super.c b/fs/jfs/super.c index 78b41e1d5c67..60726ae7cf26 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -619,16 +619,10 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) if (!sb->s_root) goto out_no_root; - /* logical blocks are represented by 40 bits in pxd_t, etc. */ - sb->s_maxbytes = ((u64) sb->s_blocksize) << 40; -#if BITS_PER_LONG == 32 - /* - * Page cache is indexed by long. - * I would use MAX_LFS_FILESIZE, but it's only half as big + /* logical blocks are represented by 40 bits in pxd_t, etc. + * and page cache is indexed by long */ - sb->s_maxbytes = min(((u64) PAGE_SIZE << 32) - 1, - (u64)sb->s_maxbytes); -#endif + sb->s_maxbytes = min(((loff_t)sb->s_blocksize) << 40, MAX_LFS_FILESIZE); sb->s_time_gran = 1; return 0; diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index db5900aaa55a..89d1dc19340b 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -21,6 +21,7 @@ DEFINE_MUTEX(kernfs_mutex); static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */ static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */ +static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) @@ -507,6 +508,10 @@ void kernfs_put(struct kernfs_node *kn) struct kernfs_node *parent; struct kernfs_root *root; + /* + * kernfs_node is freed with ->count 0, kernfs_find_and_get_node_by_ino + * depends on this to filter reused stale node + */ if (!kn || !atomic_dec_and_test(&kn->count)) return; root = kernfs_root(kn); @@ -533,7 +538,9 @@ void kernfs_put(struct kernfs_node *kn) simple_xattrs_free(&kn->iattr->xattrs); } kfree(kn->iattr); - ida_simple_remove(&root->ino_ida, kn->ino); + spin_lock(&kernfs_idr_lock); + idr_remove(&root->ino_idr, kn->id.ino); + spin_unlock(&kernfs_idr_lock); kmem_cache_free(kernfs_node_cache, kn); kn = parent; @@ -542,7 +549,7 @@ void kernfs_put(struct kernfs_node *kn) goto repeat; } else { /* just released the root kn, free @root too */ - ida_destroy(&root->ino_ida); + idr_destroy(&root->ino_idr); kfree(root); } } @@ -559,7 +566,7 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) if (d_really_is_negative(dentry)) goto out_bad_unlocked; - kn = dentry->d_fsdata; + kn = kernfs_dentry_node(dentry); mutex_lock(&kernfs_mutex); /* The kernfs node has been deactivated */ @@ -567,7 +574,7 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) goto out_bad; /* The kernfs node has been moved? 
*/ - if (dentry->d_parent->d_fsdata != kn->parent) + if (kernfs_dentry_node(dentry->d_parent) != kn->parent) goto out_bad; /* The kernfs node has been renamed */ @@ -587,14 +594,8 @@ out_bad_unlocked: return 0; } -static void kernfs_dop_release(struct dentry *dentry) -{ - kernfs_put(dentry->d_fsdata); -} - const struct dentry_operations kernfs_dops = { .d_revalidate = kernfs_dop_revalidate, - .d_release = kernfs_dop_release, }; /** @@ -610,8 +611,9 @@ const struct dentry_operations kernfs_dops = { */ struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) { - if (dentry->d_sb->s_op == &kernfs_sops) - return dentry->d_fsdata; + if (dentry->d_sb->s_op == &kernfs_sops && + !d_really_is_negative(dentry)) + return kernfs_dentry_node(dentry); return NULL; } @@ -620,6 +622,8 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, unsigned flags) { struct kernfs_node *kn; + u32 gen; + int cursor; int ret; name = kstrdup_const(name, GFP_KERNEL); @@ -630,11 +634,25 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, if (!kn) goto err_out1; - ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL); + idr_preload(GFP_KERNEL); + spin_lock(&kernfs_idr_lock); + cursor = idr_get_cursor(&root->ino_idr); + ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); + if (ret >= 0 && ret < cursor) + root->next_generation++; + gen = root->next_generation; + spin_unlock(&kernfs_idr_lock); + idr_preload_end(); if (ret < 0) goto err_out2; - kn->ino = ret; + kn->id.ino = ret; + kn->id.generation = gen; + /* + * set ino first. This barrier is paired with atomic_inc_not_zero in + * kernfs_find_and_get_node_by_ino + */ + smp_mb__before_atomic(); atomic_set(&kn->count, 1); atomic_set(&kn->active, KN_DEACTIVATED_BIAS); RB_CLEAR_NODE(&kn->rb); @@ -666,6 +684,54 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, return kn; } +/* + * kernfs_find_and_get_node_by_ino - get kernfs_node from inode number + * @root: the kernfs root + * @ino: inode number + * + * RETURNS: + * NULL on failure. Return a kernfs node with reference counter incremented + */ +struct kernfs_node *kernfs_find_and_get_node_by_ino(struct kernfs_root *root, + unsigned int ino) +{ + struct kernfs_node *kn; + + rcu_read_lock(); + kn = idr_find(&root->ino_idr, ino); + if (!kn) + goto out; + + /* + * Since kernfs_node is freed in RCU, it's possible an old node for ino + * is freed, but reused before RCU grace period. But a freed node (see + * kernfs_put) or an incompletedly initialized node (see + * __kernfs_new_node) should have 'count' 0. We can use this fact to + * filter out such node. + */ + if (!atomic_inc_not_zero(&kn->count)) { + kn = NULL; + goto out; + } + + /* + * The node could be a new node or a reused node. If it's a new node, + * we are ok. If it's reused because of RCU (because of + * SLAB_TYPESAFE_BY_RCU), the __kernfs_new_node always sets its 'ino' + * before 'count'. So if 'count' is uptodate, 'ino' should be uptodate, + * hence we can use 'ino' to filter stale node. 
+ */ + if (kn->id.ino != ino) + goto out; + rcu_read_unlock(); + + return kn; +out: + rcu_read_unlock(); + kernfs_put(kn); + return NULL; +} + /** * kernfs_add_one - add kernfs_node to parent without warning * @kn: kernfs_node to be added @@ -875,13 +941,14 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, if (!root) return ERR_PTR(-ENOMEM); - ida_init(&root->ino_ida); + idr_init(&root->ino_idr); INIT_LIST_HEAD(&root->supers); + root->next_generation = 1; kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO, KERNFS_DIR); if (!kn) { - ida_destroy(&root->ino_ida); + idr_destroy(&root->ino_idr); kfree(root); return ERR_PTR(-ENOMEM); } @@ -984,7 +1051,7 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, unsigned int flags) { struct dentry *ret; - struct kernfs_node *parent = dentry->d_parent->d_fsdata; + struct kernfs_node *parent = dir->i_private; struct kernfs_node *kn; struct inode *inode; const void *ns = NULL; @@ -1001,8 +1068,6 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, ret = NULL; goto out_unlock; } - kernfs_get(kn); - dentry->d_fsdata = kn; /* attach dentry and inode */ inode = kernfs_get_inode(dir->i_sb, kn); @@ -1039,7 +1104,7 @@ static int kernfs_iop_mkdir(struct inode *dir, struct dentry *dentry, static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) { - struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_node *kn = kernfs_dentry_node(dentry); struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; int ret; @@ -1059,7 +1124,7 @@ static int kernfs_iop_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - struct kernfs_node *kn = old_dentry->d_fsdata; + struct kernfs_node *kn = kernfs_dentry_node(old_dentry); struct kernfs_node *new_parent = new_dir->i_private; struct kernfs_syscall_ops *scops = kernfs_root(kn)->syscall_ops; int ret; @@ -1572,7 +1637,7 @@ static struct kernfs_node *kernfs_dir_next_pos(const void *ns, static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; - struct kernfs_node *parent = dentry->d_fsdata; + struct kernfs_node *parent = kernfs_dentry_node(dentry); struct kernfs_node *pos = file->private_data; const void *ns = NULL; @@ -1589,7 +1654,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) const char *name = pos->name; unsigned int type = dt_type(pos); int len = strlen(name); - ino_t ino = pos->ino; + ino_t ino = pos->id.ino; ctx->pos = pos->hash; file->private_data = pos; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index ac2dfe0c5a9c..9698e51656b1 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -616,7 +616,7 @@ static void kernfs_put_open_node(struct kernfs_node *kn, static int kernfs_fop_open(struct inode *inode, struct file *file) { - struct kernfs_node *kn = file->f_path.dentry->d_fsdata; + struct kernfs_node *kn = inode->i_private; struct kernfs_root *root = kernfs_root(kn); const struct kernfs_ops *ops; struct kernfs_open_file *of; @@ -768,7 +768,7 @@ static void kernfs_release_file(struct kernfs_node *kn, static int kernfs_fop_release(struct inode *inode, struct file *filp) { - struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; + struct kernfs_node *kn = inode->i_private; struct kernfs_open_file *of = kernfs_of(filp); if (kn->flags & KERNFS_HAS_RELEASE) { @@ -835,7 +835,7 @@ void kernfs_drain_open_files(struct kernfs_node *kn) static unsigned int kernfs_fop_poll(struct file 
*filp, poll_table *wait) { struct kernfs_open_file *of = kernfs_of(filp); - struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; + struct kernfs_node *kn = kernfs_dentry_node(filp->f_path.dentry); struct kernfs_open_node *on = kn->attr.open; if (!kernfs_get_active(kn)) @@ -895,7 +895,7 @@ repeat: * have the matching @file available. Look up the inodes * and generate the events manually. */ - inode = ilookup(info->sb, kn->ino); + inode = ilookup(info->sb, kn->id.ino); if (!inode) continue; @@ -903,7 +903,7 @@ repeat: if (parent) { struct inode *p_inode; - p_inode = ilookup(info->sb, parent->ino); + p_inode = ilookup(info->sb, parent->id.ino); if (p_inode) { fsnotify(p_inode, FS_MODIFY | FS_EVENT_ON_CHILD, inode, FSNOTIFY_EVENT_INODE, kn->name, 0); @@ -997,7 +997,7 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, #ifdef CONFIG_DEBUG_LOCK_ALLOC if (key) { - lockdep_init_map(&kn->dep_map, "s_active", key, 0); + lockdep_init_map(&kn->dep_map, "kn->count", key, 0); kn->flags |= KERNFS_LOCKDEP; } #endif diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index fb4b4a79a0d6..a34303981deb 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -112,7 +112,7 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr) { struct inode *inode = d_inode(dentry); - struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_node *kn = inode->i_private; int error; if (!kn) @@ -154,7 +154,7 @@ static int kernfs_node_setsecdata(struct kernfs_iattrs *attrs, void **secdata, ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) { - struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_node *kn = kernfs_dentry_node(dentry); struct kernfs_iattrs *attrs; attrs = kernfs_iattrs(kn); @@ -203,8 +203,8 @@ static void kernfs_refresh_inode(struct kernfs_node *kn, struct inode *inode) int kernfs_iop_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { - struct kernfs_node *kn = path->dentry->d_fsdata; struct inode *inode = d_inode(path->dentry); + struct kernfs_node *kn = inode->i_private; mutex_lock(&kernfs_mutex); kernfs_refresh_inode(kn, inode); @@ -220,6 +220,7 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) inode->i_private = kn; inode->i_mapping->a_ops = &kernfs_aops; inode->i_op = &kernfs_iops; + inode->i_generation = kn->id.generation; set_default_inode_attr(inode, kn->mode); kernfs_refresh_inode(kn, inode); @@ -265,7 +266,7 @@ struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) { struct inode *inode; - inode = iget_locked(sb, kn->ino); + inode = iget_locked(sb, kn->id.ino); if (inode && (inode->i_state & I_NEW)) kernfs_init_inode(kn, inode); diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 2d5144ab4251..0f260dcca177 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -70,6 +70,13 @@ struct kernfs_super_info { }; #define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) +static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry) +{ + if (d_really_is_negative(dentry)) + return NULL; + return d_inode(dentry)->i_private; +} + extern const struct super_operations kernfs_sops; extern struct kmem_cache *kernfs_node_cache; @@ -98,6 +105,8 @@ int kernfs_add_one(struct kernfs_node *kn); struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, const char *name, umode_t mode, unsigned flags); +struct 
kernfs_node *kernfs_find_and_get_node_by_ino(struct kernfs_root *root, + unsigned int ino); /* * file.c diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index d5b149a45be1..95a7c88baed9 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -16,6 +16,7 @@ #include <linux/pagemap.h> #include <linux/namei.h> #include <linux/seq_file.h> +#include <linux/exportfs.h> #include "kernfs-internal.h" @@ -33,7 +34,7 @@ static int kernfs_sop_remount_fs(struct super_block *sb, int *flags, char *data) static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry) { - struct kernfs_root *root = kernfs_root(dentry->d_fsdata); + struct kernfs_root *root = kernfs_root(kernfs_dentry_node(dentry)); struct kernfs_syscall_ops *scops = root->syscall_ops; if (scops && scops->show_options) @@ -43,7 +44,7 @@ static int kernfs_sop_show_options(struct seq_file *sf, struct dentry *dentry) static int kernfs_sop_show_path(struct seq_file *sf, struct dentry *dentry) { - struct kernfs_node *node = dentry->d_fsdata; + struct kernfs_node *node = kernfs_dentry_node(dentry); struct kernfs_root *root = kernfs_root(node); struct kernfs_syscall_ops *scops = root->syscall_ops; @@ -64,6 +65,78 @@ const struct super_operations kernfs_sops = { .show_path = kernfs_sop_show_path, }; +/* + * Similar to kernfs_fh_get_inode, this one gets kernfs node from inode + * number and generation + */ +struct kernfs_node *kernfs_get_node_by_id(struct kernfs_root *root, + const union kernfs_node_id *id) +{ + struct kernfs_node *kn; + + kn = kernfs_find_and_get_node_by_ino(root, id->ino); + if (!kn) + return NULL; + if (kn->id.generation != id->generation) { + kernfs_put(kn); + return NULL; + } + return kn; +} + +static struct inode *kernfs_fh_get_inode(struct super_block *sb, + u64 ino, u32 generation) +{ + struct kernfs_super_info *info = kernfs_info(sb); + struct inode *inode; + struct kernfs_node *kn; + + if (ino == 0) + return ERR_PTR(-ESTALE); + + kn = kernfs_find_and_get_node_by_ino(info->root, ino); + if (!kn) + return ERR_PTR(-ESTALE); + inode = kernfs_get_inode(sb, kn); + kernfs_put(kn); + if (!inode) + return ERR_PTR(-ESTALE); + + if (generation && inode->i_generation != generation) { + /* we didn't find the right inode.. 
*/ + iput(inode); + return ERR_PTR(-ESTALE); + } + return inode; +} + +static struct dentry *kernfs_fh_to_dentry(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_dentry(sb, fid, fh_len, fh_type, + kernfs_fh_get_inode); +} + +static struct dentry *kernfs_fh_to_parent(struct super_block *sb, struct fid *fid, + int fh_len, int fh_type) +{ + return generic_fh_to_parent(sb, fid, fh_len, fh_type, + kernfs_fh_get_inode); +} + +static struct dentry *kernfs_get_parent_dentry(struct dentry *child) +{ + struct kernfs_node *kn = kernfs_dentry_node(child); + + return d_obtain_alias(kernfs_get_inode(child->d_sb, kn->parent)); +} + +static const struct export_operations kernfs_export_ops = { + .fh_to_dentry = kernfs_fh_to_dentry, + .fh_to_parent = kernfs_fh_to_parent, + .get_parent = kernfs_get_parent_dentry, +}; + /** * kernfs_root_from_sb - determine kernfs_root associated with a super_block * @sb: the super_block in question @@ -159,6 +232,8 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) sb->s_magic = magic; sb->s_op = &kernfs_sops; sb->s_xattr = kernfs_xattr_handlers; + if (info->root->flags & KERNFS_ROOT_SUPPORT_EXPORTOP) + sb->s_export_op = &kernfs_export_ops; sb->s_time_gran = 1; /* get root inode, initialize and unlock it */ @@ -176,8 +251,6 @@ static int kernfs_fill_super(struct super_block *sb, unsigned long magic) pr_debug("%s: could not get root dentry!\n", __func__); return -ENOMEM; } - kernfs_get(info->root->kn); - root->d_fsdata = info->root->kn; sb->s_root = root; sb->s_d_op = &kernfs_dops; return 0; @@ -283,7 +356,6 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, void kernfs_kill_sb(struct super_block *sb) { struct kernfs_super_info *info = kernfs_info(sb); - struct kernfs_node *root_kn = sb->s_root->d_fsdata; mutex_lock(&kernfs_mutex); list_del(&info->node); @@ -295,7 +367,6 @@ void kernfs_kill_sb(struct super_block *sb) */ kill_anon_super(sb); kfree(info); - kernfs_put(root_kn); } /** @@ -330,7 +401,16 @@ struct super_block *kernfs_pin_sb(struct kernfs_root *root, const void *ns) void __init kernfs_init(void) { + + /* + * the slab is freed in RCU context, so kernfs_find_and_get_node_by_ino + * can access the slab lock free. This could introduce stale nodes, + * please see how kernfs_find_and_get_node_by_ino filters out stale + * nodes. 
+ */ kernfs_node_cache = kmem_cache_create("kernfs_node_cache", sizeof(struct kernfs_node), - 0, SLAB_PANIC, NULL); + 0, + SLAB_PANIC | SLAB_TYPESAFE_BY_RCU, + NULL); } diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 1684af4a8b9b..08ccabd7047f 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -98,9 +98,9 @@ static int kernfs_get_target_path(struct kernfs_node *parent, return 0; } -static int kernfs_getlink(struct dentry *dentry, char *path) +static int kernfs_getlink(struct inode *inode, char *path) { - struct kernfs_node *kn = dentry->d_fsdata; + struct kernfs_node *kn = inode->i_private; struct kernfs_node *parent = kn->parent; struct kernfs_node *target = kn->symlink.target_kn; int error; @@ -124,7 +124,7 @@ static const char *kernfs_iop_get_link(struct dentry *dentry, body = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!body) return ERR_PTR(-ENOMEM); - error = kernfs_getlink(dentry, body); + error = kernfs_getlink(inode, body); if (unlikely(error < 0)) { kfree(body); return ERR_PTR(error); diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 27d577dbe51a..96c1d14c18f1 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -235,12 +235,8 @@ reclaimer(void *ptr) struct net *net = host->net; req = kmalloc(sizeof(*req), GFP_KERNEL); - if (!req) { - printk(KERN_ERR "lockd: reclaimer unable to alloc memory." - " Locks for %s won't be reclaimed!\n", - host->h_name); + if (!req) return 0; - } allow_signal(SIGKILL); diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 726b6cecf430..b995bdc13976 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -396,7 +396,7 @@ out_rqst: return error; } -static struct svc_serv_ops lockd_sv_ops = { +static const struct svc_serv_ops lockd_sv_ops = { .svo_shutdown = svc_rpcb_cleanup, .svo_enqueue_xprt = svc_xprt_do_enqueue, }; diff --git a/fs/locks.c b/fs/locks.c index afefeb4ad6de..1bd71c4d663a 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -137,6 +137,7 @@ #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) #define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT)) #define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK) +#define IS_REMOTELCK(fl) (fl->fl_pid <= 0) static inline bool is_remote_lock(struct file *filp) { @@ -270,6 +271,22 @@ locks_check_ctx_lists(struct inode *inode) } } +static void +locks_check_ctx_file_list(struct file *filp, struct list_head *list, + char *list_type) +{ + struct file_lock *fl; + struct inode *inode = locks_inode(filp); + + list_for_each_entry(fl, list, fl_list) + if (fl->fl_file == filp) + pr_warn("Leaked %s lock on dev=0x%x:0x%x ino=0x%lx " + " fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n", + list_type, MAJOR(inode->i_sb->s_dev), + MINOR(inode->i_sb->s_dev), inode->i_ino, + fl->fl_owner, fl->fl_flags, fl->fl_type, fl->fl_pid); +} + void locks_free_lock_context(struct inode *inode) { @@ -733,7 +750,6 @@ static void locks_wake_up_blocks(struct file_lock *blocker) static void locks_insert_lock_ctx(struct file_lock *fl, struct list_head *before) { - fl->fl_nspid = get_pid(task_tgid(current)); list_add_tail(&fl->fl_list, before); locks_insert_global_locks(fl); } @@ -743,10 +759,6 @@ locks_unlink_lock_ctx(struct file_lock *fl) { locks_delete_global_locks(fl); list_del_init(&fl->fl_list); - if (fl->fl_nspid) { - put_pid(fl->fl_nspid); - fl->fl_nspid = NULL; - } locks_wake_up_blocks(fl); } @@ -823,8 +835,6 @@ posix_test_lock(struct file *filp, struct file_lock *fl) list_for_each_entry(cfl, &ctx->flc_posix, fl_list) { if (posix_locks_conflict(fl, cfl)) { locks_copy_conflock(fl, cfl); - if (cfl->fl_nspid) - 
fl->fl_pid = pid_vnr(cfl->fl_nspid); goto out; } } @@ -2048,9 +2058,33 @@ int vfs_test_lock(struct file *filp, struct file_lock *fl) } EXPORT_SYMBOL_GPL(vfs_test_lock); +/** + * locks_translate_pid - translate a file_lock's fl_pid number into a namespace + * @fl: The file_lock who's fl_pid should be translated + * @ns: The namespace into which the pid should be translated + * + * Used to tranlate a fl_pid into a namespace virtual pid number + */ +static pid_t locks_translate_pid(struct file_lock *fl, struct pid_namespace *ns) +{ + pid_t vnr; + struct pid *pid; + + if (IS_OFDLCK(fl)) + return -1; + if (IS_REMOTELCK(fl)) + return fl->fl_pid; + + rcu_read_lock(); + pid = find_pid_ns(fl->fl_pid, &init_pid_ns); + vnr = pid_nr_ns(pid, ns); + rcu_read_unlock(); + return vnr; +} + static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) { - flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid; + flock->l_pid = locks_translate_pid(fl, task_active_pid_ns(current)); #if BITS_PER_LONG == 32 /* * Make sure we can represent the posix lock via @@ -2072,7 +2106,7 @@ static int posix_lock_to_flock(struct flock *flock, struct file_lock *fl) #if BITS_PER_LONG == 32 static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) { - flock->l_pid = IS_OFDLCK(fl) ? -1 : fl->fl_pid; + flock->l_pid = locks_translate_pid(fl, task_active_pid_ns(current)); flock->l_start = fl->fl_start; flock->l_len = fl->fl_end == OFFSET_MAX ? 0 : fl->fl_end - fl->fl_start + 1; @@ -2086,14 +2120,17 @@ static void posix_lock_to_flock64(struct flock64 *flock, struct file_lock *fl) */ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock) { - struct file_lock file_lock; + struct file_lock *fl; int error; + fl = locks_alloc_lock(); + if (fl == NULL) + return -ENOMEM; error = -EINVAL; if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK) goto out; - error = flock_to_posix_lock(filp, &file_lock, flock); + error = flock_to_posix_lock(filp, fl, flock); if (error) goto out; @@ -2103,23 +2140,22 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock *flock) goto out; cmd = F_GETLK; - file_lock.fl_flags |= FL_OFDLCK; - file_lock.fl_owner = filp; + fl->fl_flags |= FL_OFDLCK; + fl->fl_owner = filp; } - error = vfs_test_lock(filp, &file_lock); + error = vfs_test_lock(filp, fl); if (error) goto out; - flock->l_type = file_lock.fl_type; - if (file_lock.fl_type != F_UNLCK) { - error = posix_lock_to_flock(flock, &file_lock); + flock->l_type = fl->fl_type; + if (fl->fl_type != F_UNLCK) { + error = posix_lock_to_flock(flock, fl); if (error) - goto rel_priv; + goto out; } -rel_priv: - locks_release_private(&file_lock); out: + locks_free_lock(fl); return error; } @@ -2298,14 +2334,18 @@ out: */ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock) { - struct file_lock file_lock; + struct file_lock *fl; int error; + fl = locks_alloc_lock(); + if (fl == NULL) + return -ENOMEM; + error = -EINVAL; if (flock->l_type != F_RDLCK && flock->l_type != F_WRLCK) goto out; - error = flock64_to_posix_lock(filp, &file_lock, flock); + error = flock64_to_posix_lock(filp, fl, flock); if (error) goto out; @@ -2315,20 +2355,20 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 *flock) goto out; cmd = F_GETLK64; - file_lock.fl_flags |= FL_OFDLCK; - file_lock.fl_owner = filp; + fl->fl_flags |= FL_OFDLCK; + fl->fl_owner = filp; } - error = vfs_test_lock(filp, &file_lock); + error = vfs_test_lock(filp, fl); if (error) goto out; - flock->l_type = file_lock.fl_type; - if 
(file_lock.fl_type != F_UNLCK) - posix_lock_to_flock64(flock, &file_lock); + flock->l_type = fl->fl_type; + if (fl->fl_type != F_UNLCK) + posix_lock_to_flock64(flock, fl); - locks_release_private(&file_lock); out: + locks_free_lock(fl); return error; } @@ -2525,6 +2565,12 @@ void locks_remove_file(struct file *filp) /* remove any leases */ locks_remove_lease(filp, ctx); + + spin_lock(&ctx->flc_lock); + locks_check_ctx_file_list(filp, &ctx->flc_posix, "POSIX"); + locks_check_ctx_file_list(filp, &ctx->flc_flock, "FLOCK"); + locks_check_ctx_file_list(filp, &ctx->flc_lease, "LEASE"); + spin_unlock(&ctx->flc_lock); } /** @@ -2578,22 +2624,16 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, { struct inode *inode = NULL; unsigned int fl_pid; + struct pid_namespace *proc_pidns = file_inode(f->file)->i_sb->s_fs_info; - if (fl->fl_nspid) { - struct pid_namespace *proc_pidns = file_inode(f->file)->i_sb->s_fs_info; - - /* Don't let fl_pid change based on who is reading the file */ - fl_pid = pid_nr_ns(fl->fl_nspid, proc_pidns); - - /* - * If there isn't a fl_pid don't display who is waiting on - * the lock if we are called from locks_show, or if we are - * called from __show_fd_info - skip lock entirely - */ - if (fl_pid == 0) - return; - } else - fl_pid = fl->fl_pid; + fl_pid = locks_translate_pid(fl, proc_pidns); + /* + * If there isn't a fl_pid don't display who is waiting on + * the lock if we are called from locks_show, or if we are + * called from __show_fd_info - skip lock entirely + */ + if (fl_pid == 0) + return; if (fl->fl_file != NULL) inode = locks_inode(fl->fl_file); @@ -2668,7 +2708,7 @@ static int locks_show(struct seq_file *f, void *v) fl = hlist_entry(v, struct file_lock, fl_link); - if (fl->fl_nspid && !pid_nr_ns(fl->fl_nspid, proc_pidns)) + if (locks_translate_pid(fl, proc_pidns) == 0) return 0; lock_get_status(f, fl, iter->li_pos, ""); diff --git a/fs/mpage.c b/fs/mpage.c index 2e4c41ccb5c9..37bb77c1302c 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -83,7 +83,7 @@ mpage_alloc(struct block_device *bdev, } if (bio) { - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = first_sector; } return bio; diff --git a/fs/namei.c b/fs/namei.c index ddb6a7c2b3d4..1180f9c58093 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1129,9 +1129,18 @@ static int follow_automount(struct path *path, struct nameidata *nd, * of the daemon to instantiate them before they can be used. */ if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | - LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && - path->dentry->d_inode) - return -EISDIR; + LOOKUP_OPEN | LOOKUP_CREATE | + LOOKUP_AUTOMOUNT))) { + /* Positive dentry that isn't meant to trigger an + * automount, EISDIR will allow it to be used, + * otherwise there's no mount here "now" so return + * ENOENT. 
+ */ + if (path->dentry->d_inode) + return -EISDIR; + else + return -ENOENT; + } if (path->dentry->d_sb->s_user_ns != &init_user_ns) return -EACCES; diff --git a/fs/namespace.c b/fs/namespace.c index f8893dc6a989..df0f7521979a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -431,13 +431,18 @@ int __mnt_want_write_file(struct file *file) } /** - * mnt_want_write_file - get write access to a file's mount + * mnt_want_write_file_path - get write access to a file's mount * @file: the file who's mount on which to take a write * * This is like mnt_want_write, but it takes a file and can * do some optimisations if the file is open for write already + * + * Called by the vfs for cases when we have an open file at hand, but will do an + * inode operation on it (important distinction for files opened on overlayfs, + * since the file operations will come from the real underlying file, while + * inode operations come from the overlay). */ -int mnt_want_write_file(struct file *file) +int mnt_want_write_file_path(struct file *file) { int ret; @@ -447,6 +452,53 @@ int mnt_want_write_file(struct file *file) sb_end_write(file->f_path.mnt->mnt_sb); return ret; } + +static inline int may_write_real(struct file *file) +{ + struct dentry *dentry = file->f_path.dentry; + struct dentry *upperdentry; + + /* Writable file? */ + if (file->f_mode & FMODE_WRITER) + return 0; + + /* Not overlayfs? */ + if (likely(!(dentry->d_flags & DCACHE_OP_REAL))) + return 0; + + /* File refers to upper, writable layer? */ + upperdentry = d_real(dentry, NULL, 0, D_REAL_UPPER); + if (upperdentry && file_inode(file) == d_inode(upperdentry)) + return 0; + + /* Lower layer: can't write to real file, sorry... */ + return -EPERM; +} + +/** + * mnt_want_write_file - get write access to a file's mount + * @file: the file who's mount on which to take a write + * + * This is like mnt_want_write, but it takes a file and can + * do some optimisations if the file is open for write already + * + * Mostly called by filesystems from their ioctl operation before performing + * modification. On overlayfs this needs to check if the file is on a read-only + * lower layer and deny access in that case. 
+ */ +int mnt_want_write_file(struct file *file) +{ + int ret; + + ret = may_write_real(file); + if (!ret) { + sb_start_write(file_inode(file)->i_sb); + ret = __mnt_want_write_file(file); + if (ret) + sb_end_write(file_inode(file)->i_sb); + } + return ret; +} EXPORT_SYMBOL_GPL(mnt_want_write_file); /** @@ -484,10 +536,16 @@ void __mnt_drop_write_file(struct file *file) __mnt_drop_write(file->f_path.mnt); } -void mnt_drop_write_file(struct file *file) +void mnt_drop_write_file_path(struct file *file) { mnt_drop_write(file->f_path.mnt); } + +void mnt_drop_write_file(struct file *file) +{ + __mnt_drop_write(file->f_path.mnt); + sb_end_write(file_inode(file)->i_sb); +} EXPORT_SYMBOL(mnt_drop_write_file); static int mnt_make_readonly(struct mount *mnt) diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 76965e772264..a06c07619ee6 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -23,7 +23,7 @@ static int ncp_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - return filemap_write_and_wait_range(file->f_mapping, start, end); + return file_write_and_wait_range(file, start, end); } /* diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index d8863a804b15..995d707537da 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -130,7 +130,7 @@ bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector, if (bio) { bio->bi_iter.bi_sector = disk_sector; - bio->bi_bdev = bdev; + bio_set_dev(bio, bdev); bio->bi_end_io = end_io; bio->bi_private = par; } diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 34323877ec13..2cddf7f437e6 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -226,26 +226,26 @@ err_bind: return ret; } -static struct svc_serv_ops nfs40_cb_sv_ops = { +static const struct svc_serv_ops nfs40_cb_sv_ops = { .svo_function = nfs4_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, .svo_setup = svc_set_num_threads_sync, .svo_module = THIS_MODULE, }; #if defined(CONFIG_NFS_V4_1) -static struct svc_serv_ops nfs41_cb_sv_ops = { +static const struct svc_serv_ops nfs41_cb_sv_ops = { .svo_function = nfs41_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, .svo_setup = svc_set_num_threads_sync, .svo_module = THIS_MODULE, }; -static struct svc_serv_ops *nfs4_cb_sv_ops[] = { +static const struct svc_serv_ops *nfs4_cb_sv_ops[] = { [0] = &nfs40_cb_sv_ops, [1] = &nfs41_cb_sv_ops, }; #else -static struct svc_serv_ops *nfs4_cb_sv_ops[] = { +static const struct svc_serv_ops *nfs4_cb_sv_ops[] = { [0] = &nfs40_cb_sv_ops, [1] = NULL, }; @@ -254,8 +254,8 @@ static struct svc_serv_ops *nfs4_cb_sv_ops[] = { static struct svc_serv *nfs_callback_create_svc(int minorversion) { struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion]; + const struct svc_serv_ops *sv_ops; struct svc_serv *serv; - struct svc_serv_ops *sv_ops; /* * Check whether we're already up and running. 
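
A brief illustration of the mnt_want_write_file() rework above, seen from a caller's side. This is an editorial sketch, not part of the patch: the ioctl helper below is hypothetical, while mnt_want_write_file(), mnt_drop_write_file(), file_inode(), inode_lock()/inode_unlock(), current_time() and mark_inode_dirty() are existing kernel interfaces. With the overlayfs-aware version, the first call is expected to return -EPERM when the file sits on a read-only lower layer, so the modification never runs; note also that write access is now taken against file_inode(file)->i_sb rather than file->f_path.mnt->mnt_sb, which is the distinction that matters on overlayfs.

/*
 * Minimal sketch (assumed, not from the patch): the usual bracket a
 * filesystem ioctl puts around a metadata update.
 */
#include <linux/fs.h>
#include <linux/mount.h>

static long example_ioctl_touch(struct file *file)
{
	struct inode *inode = file_inode(file);
	int ret;

	/* Take write access to the mount; fails with -EPERM on a
	 * read-only overlayfs lower layer with the new helper. */
	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	/* Stand-in for a real modification, done under the inode lock. */
	inode_lock(inode);
	inode->i_ctime = current_time(inode);
	mark_inode_dirty(inode);
	inode_unlock(inode);

	mnt_drop_write_file(file);
	return 0;
}
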
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 5427cdf04c5a..14358de173fb 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -51,7 +51,7 @@ __be32 nfs4_callback_getattr(void *argp, void *resp, goto out_iput; res->size = i_size_read(inode); res->change_attr = delegation->change_attr; - if (nfsi->nrequests != 0) + if (nfs_have_writebacks(inode)) res->change_attr++; res->ctime = inode->i_ctime; res->mtime = inode->i_mtime; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index d7df5e67b0c1..606dd3871f66 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -1089,7 +1089,7 @@ bool nfs4_delegation_flush_on_close(const struct inode *inode) delegation = rcu_dereference(nfsi->delegation); if (delegation == NULL || !(delegation->type & FMODE_WRITE)) goto out; - if (nfsi->nrequests < delegation->pagemod_limit) + if (atomic_long_read(&nfsi->nrequests) < delegation->pagemod_limit) ret = false; out: rcu_read_unlock(); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 3522b1249019..5ceaeb1f6fb6 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2260,7 +2260,6 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str spin_lock(&inode->i_lock); retry = false; } - res->jiffies = cache->jiffies; res->cred = cache->cred; res->mask = cache->mask; list_move_tail(&cache->lru, &nfsi->access_cache_entry_lru); @@ -2296,7 +2295,6 @@ static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred, goto out; if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS)) goto out; - res->jiffies = cache->jiffies; res->cred = cache->cred; res->mask = cache->mask; err = 0; @@ -2344,7 +2342,6 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) if (cache == NULL) return; RB_CLEAR_NODE(&cache->rb_node); - cache->jiffies = set->jiffies; cache->cred = get_rpccred(set->cred); cache->mask = set->mask; @@ -2432,7 +2429,6 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) cache.mask = NFS_MAY_LOOKUP | NFS_MAY_EXECUTE | NFS_MAY_WRITE | NFS_MAY_READ; cache.cred = cred; - cache.jiffies = jiffies; status = NFS_PROTO(inode)->access(inode, &cache); if (status != 0) { if (status == -ESTALE) { diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 6fb9fad2d1e6..d2972d537469 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -616,13 +616,13 @@ nfs_direct_write_scan_commit_list(struct inode *inode, struct list_head *list, struct nfs_commit_info *cinfo) { - spin_lock(&cinfo->inode->i_lock); + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); #ifdef CONFIG_NFS_V4_1 if (cinfo->ds != NULL && cinfo->ds->nwritten != 0) NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo); #endif nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0); - spin_unlock(&cinfo->inode->i_lock); + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); } static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index af330c31f627..a385d1c3f146 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -631,11 +631,11 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) if (result <= 0) goto out; - result = generic_write_sync(iocb, result); - if (result < 0) - goto out; written = result; iocb->ki_pos += written; + result = generic_write_sync(iocb, written); + if (result < 0) + goto out; /* Return error values */ if (nfs_need_check_write(file, inode)) { @@ -744,15 +744,18 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) goto out; /* - * 
Revalidate the cache if the server has time stamps granular - * enough to detect subsecond changes. Otherwise, clear the - * cache to prevent missing any changes. + * Invalidate cache to prevent missing any changes. If + * the file is mapped, clear the page cache as well so + * those mappings will be loaded. * * This makes locking act as a cache coherency point. */ nfs_sync_mapping(filp->f_mapping); - if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) + if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) { nfs_zap_caches(inode); + if (mapping_mapped(filp->f_mapping)) + nfs_revalidate_mapping(inode, filp->f_mapping); + } out: return status; } diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c index 777b055063f6..3025fe8584a0 100644 --- a/fs/nfs/fscache-index.c +++ b/fs/nfs/fscache-index.c @@ -252,45 +252,6 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data, } /* - * Indication from FS-Cache that the cookie is no longer cached - * - This function is called when the backing store currently caching a cookie - * is removed - * - The netfs should use this to clean up any markers indicating cached pages - * - This is mandatory for any object that may have data - */ -static void nfs_fscache_inode_now_uncached(void *cookie_netfs_data) -{ - struct nfs_inode *nfsi = cookie_netfs_data; - struct pagevec pvec; - pgoff_t first; - int loop, nr_pages; - - pagevec_init(&pvec, 0); - first = 0; - - dprintk("NFS: nfs_inode_now_uncached: nfs_inode 0x%p\n", nfsi); - - for (;;) { - /* grab a bunch of pages to unmark */ - nr_pages = pagevec_lookup(&pvec, - nfsi->vfs_inode.i_mapping, - first, - PAGEVEC_SIZE - pagevec_count(&pvec)); - if (!nr_pages) - break; - - for (loop = 0; loop < nr_pages; loop++) - ClearPageFsCache(pvec.pages[loop]); - - first = pvec.pages[nr_pages - 1]->index + 1; - - pvec.nr = nr_pages; - pagevec_release(&pvec); - cond_resched(); - } -} - -/* * Get an extra reference on a read context. * - This function can be absent if the completion function doesn't require a * context. @@ -330,7 +291,6 @@ const struct fscache_cookie_def nfs_fscache_inode_object_def = { .get_attr = nfs_fscache_inode_get_attr, .get_aux = nfs_fscache_inode_get_aux, .check_aux = nfs_fscache_inode_check_aux, - .now_uncached = nfs_fscache_inode_now_uncached, .get_context = nfs_fh_get_context, .put_context = nfs_fh_put_context, }; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 109279d6d91b..134d9f560240 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1285,7 +1285,6 @@ static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi) static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { - struct nfs_inode *nfsi = NFS_I(inode); unsigned long ret = 0; if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) @@ -1315,7 +1314,7 @@ static unsigned long nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) && (fattr->valid & NFS_ATTR_FATTR_SIZE) && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) - && nfsi->nrequests == 0) { + && !nfs_have_writebacks(inode)) { i_size_write(inode, nfs_size_to_loff_t(fattr->size)); ret |= NFS_INO_INVALID_ATTR; } @@ -1823,7 +1822,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (new_isize != cur_isize) { /* Do we perhaps have any outstanding writes, or has * the file grown beyond our last write? 
*/ - if (nfsi->nrequests == 0 || new_isize > cur_isize) { + if (!nfs_have_writebacks(inode) || new_isize > cur_isize) { i_size_write(inode, new_isize); if (!have_writers) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; @@ -2012,10 +2011,11 @@ static void init_once(void *foo) INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); INIT_LIST_HEAD(&nfsi->commit_info.list); - nfsi->nrequests = 0; - nfsi->commit_info.ncommit = 0; + atomic_long_set(&nfsi->nrequests, 0); + atomic_long_set(&nfsi->commit_info.ncommit, 0); atomic_set(&nfsi->commit_info.rpcs_out, 0); init_rwsem(&nfsi->rmdir_sem); + mutex_init(&nfsi->commit_mutex); nfs4_init_once(nfsi); } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index dc456416d2be..68cc22083639 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -251,7 +251,6 @@ int nfs_iocounter_wait(struct nfs_lock_context *l_ctx); extern const struct nfs_pageio_ops nfs_pgio_rw_ops; struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *); void nfs_pgio_header_free(struct nfs_pgio_header *); -void nfs_pgio_data_destroy(struct nfs_pgio_header *); int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *); int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 40bd05f05e74..ac4f10b7f6c1 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -303,6 +303,17 @@ _nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode, struct rpc_cred *newcred = NULL; rpc_authflavor_t flavor; + if (sp4_mode == NFS_SP4_MACH_CRED_CLEANUP || + sp4_mode == NFS_SP4_MACH_CRED_PNFS_CLEANUP) { + /* Using machine creds for cleanup operations + * is only relevent if the client credentials + * might expire. So don't bother for + * RPC_AUTH_UNIX. If file was only exported to + * sec=sys, the PUTFH would fail anyway. 
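A note on the nrequests conversion visible in the fs/nfs/inode.c init_once() hunk above: the per-inode request counter moves from a plain long guarded by i_lock to an atomic_long_t, with a new commit_mutex initialised alongside it, and callers switch to nfs_have_writebacks() instead of peeking at the raw field. As a rough userspace analogy only (demo_inode, demo_have_writebacks and friends are invented names, and C11 atomics stand in for the kernel's atomic_long_t), the pattern looks like this:

#include <stdatomic.h>
#include <stdio.h>

struct demo_inode {
	atomic_long nrequests;		/* was: long guarded by a spinlock */
};

static void demo_init_once(struct demo_inode *di)
{
	atomic_init(&di->nrequests, 0);
}

static void demo_page_group_init(struct demo_inode *di)
{
	/* lock-free increment, no lock/unlock round trip needed */
	atomic_fetch_add(&di->nrequests, 1);
}

static int demo_have_writebacks(struct demo_inode *di)
{
	return atomic_load(&di->nrequests) != 0;
}

int main(void)
{
	struct demo_inode di;

	demo_init_once(&di);
	demo_page_group_init(&di);
	printf("writebacks pending: %d\n", demo_have_writebacks(&di));
	return 0;
}

The practical effect in the patch is the same as in this toy: readers and writers of the counter no longer need to take the inode spinlock just to bump or test it.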
+ */ + if ((*clntp)->cl_auth->au_flavor == RPC_AUTH_UNIX) + return false; + } if (test_bit(sp4_mode, &clp->cl_sp4_flags)) { spin_lock(&clp->cl_lock); if (clp->cl_machine_cred != NULL) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d90132642340..6c61e2b99635 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1659,12 +1659,52 @@ update: return state; } +static struct inode * +nfs4_opendata_get_inode(struct nfs4_opendata *data) +{ + struct inode *inode; + + switch (data->o_arg.claim) { + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_DELEGATE_CUR: + case NFS4_OPEN_CLAIM_DELEGATE_PREV: + if (!(data->f_attr.valid & NFS_ATTR_FATTR)) + return ERR_PTR(-EAGAIN); + inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, + &data->f_attr, data->f_label); + break; + default: + inode = d_inode(data->dentry); + ihold(inode); + nfs_refresh_inode(inode, &data->f_attr); + } + return inode; +} + static struct nfs4_state * -_nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) +nfs4_opendata_find_nfs4_state(struct nfs4_opendata *data) { + struct nfs4_state *state; struct inode *inode; - struct nfs4_state *state = NULL; - int ret; + + inode = nfs4_opendata_get_inode(data); + if (IS_ERR(inode)) + return ERR_CAST(inode); + if (data->state != NULL && data->state->inode == inode) { + state = data->state; + atomic_inc(&state->count); + } else + state = nfs4_get_open_state(inode, data->owner); + iput(inode); + if (state == NULL) + state = ERR_PTR(-ENOMEM); + return state; +} + +static struct nfs4_state * +_nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) +{ + struct nfs4_state *state; if (!data->rpc_done) { state = nfs4_try_open_cached(data); @@ -1672,29 +1712,17 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) goto out; } - ret = -EAGAIN; - if (!(data->f_attr.valid & NFS_ATTR_FATTR)) - goto err; - inode = nfs_fhget(data->dir->d_sb, &data->o_res.fh, &data->f_attr, data->f_label); - ret = PTR_ERR(inode); - if (IS_ERR(inode)) - goto err; - ret = -ENOMEM; - state = nfs4_get_open_state(inode, data->owner); - if (state == NULL) - goto err_put_inode; + state = nfs4_opendata_find_nfs4_state(data); + if (IS_ERR(state)) + goto out; + if (data->o_res.delegation_type != 0) nfs4_opendata_check_deleg(data, state); update_open_stateid(state, &data->o_res.stateid, NULL, data->o_arg.fmode); - iput(inode); out: nfs_release_seqid(data->o_arg.seqid); return state; -err_put_inode: - iput(inode); -err: - return ERR_PTR(ret); } static struct nfs4_state * @@ -2071,7 +2099,6 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) data->o_arg.open_bitmap = &nfs4_open_noattr_bitmap[0]; case NFS4_OPEN_CLAIM_FH: task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; - nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); } data->timestamp = jiffies; if (nfs4_setup_sequence(data->o_arg.server->nfs_client, @@ -2258,7 +2285,6 @@ static int nfs4_opendata_access(struct rpc_cred *cred, mask = NFS4_ACCESS_READ; cache.cred = cred; - cache.jiffies = jiffies; nfs_access_set_mask(&cache, opendata->o_res.access_result); nfs_access_add_cache(state->inode, &cache); @@ -7318,7 +7344,9 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp, 1 << (OP_DESTROY_SESSION - 32) | 1 << (OP_DESTROY_CLIENTID - 32) }; + unsigned long flags = 0; unsigned int i; + int ret = 0; if (sp->how == SP4_MACH_CRED) { /* Print state protect result */ @@ -7334,7 +7362,8 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp, for (i = 0; i < NFS4_OP_MAP_NUM_WORDS; i++) { if (sp->enforce.u.words[i] & 
~supported_enforce[i]) { dfprintk(MOUNT, "sp4_mach_cred: disabled\n"); - return -EINVAL; + ret = -EINVAL; + goto out; } } @@ -7353,10 +7382,11 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp, test_bit(OP_DESTROY_CLIENTID, sp->enforce.u.longs)) { dfprintk(MOUNT, "sp4_mach_cred:\n"); dfprintk(MOUNT, " minimal mode enabled\n"); - set_bit(NFS_SP4_MACH_CRED_MINIMAL, &clp->cl_sp4_flags); + __set_bit(NFS_SP4_MACH_CRED_MINIMAL, &flags); } else { dfprintk(MOUNT, "sp4_mach_cred: disabled\n"); - return -EINVAL; + ret = -EINVAL; + goto out; } if (test_bit(OP_CLOSE, sp->allow.u.longs) && @@ -7364,110 +7394,46 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp, test_bit(OP_DELEGRETURN, sp->allow.u.longs) && test_bit(OP_LOCKU, sp->allow.u.longs)) { dfprintk(MOUNT, " cleanup mode enabled\n"); - set_bit(NFS_SP4_MACH_CRED_CLEANUP, &clp->cl_sp4_flags); + __set_bit(NFS_SP4_MACH_CRED_CLEANUP, &flags); } if (test_bit(OP_LAYOUTRETURN, sp->allow.u.longs)) { dfprintk(MOUNT, " pnfs cleanup mode enabled\n"); - set_bit(NFS_SP4_MACH_CRED_PNFS_CLEANUP, - &clp->cl_sp4_flags); + __set_bit(NFS_SP4_MACH_CRED_PNFS_CLEANUP, &flags); } if (test_bit(OP_SECINFO, sp->allow.u.longs) && test_bit(OP_SECINFO_NO_NAME, sp->allow.u.longs)) { dfprintk(MOUNT, " secinfo mode enabled\n"); - set_bit(NFS_SP4_MACH_CRED_SECINFO, &clp->cl_sp4_flags); + __set_bit(NFS_SP4_MACH_CRED_SECINFO, &flags); } if (test_bit(OP_TEST_STATEID, sp->allow.u.longs) && test_bit(OP_FREE_STATEID, sp->allow.u.longs)) { dfprintk(MOUNT, " stateid mode enabled\n"); - set_bit(NFS_SP4_MACH_CRED_STATEID, &clp->cl_sp4_flags); + __set_bit(NFS_SP4_MACH_CRED_STATEID, &flags); } if (test_bit(OP_WRITE, sp->allow.u.longs)) { dfprintk(MOUNT, " write mode enabled\n"); - set_bit(NFS_SP4_MACH_CRED_WRITE, &clp->cl_sp4_flags); + __set_bit(NFS_SP4_MACH_CRED_WRITE, &flags); } if (test_bit(OP_COMMIT, sp->allow.u.longs)) { dfprintk(MOUNT, " commit mode enabled\n"); - set_bit(NFS_SP4_MACH_CRED_COMMIT, &clp->cl_sp4_flags); + __set_bit(NFS_SP4_MACH_CRED_COMMIT, &flags); } } - +out: + clp->cl_sp4_flags = flags; return 0; } struct nfs41_exchange_id_data { struct nfs41_exchange_id_res res; struct nfs41_exchange_id_args args; - struct rpc_xprt *xprt; - int rpc_status; }; -static void nfs4_exchange_id_done(struct rpc_task *task, void *data) -{ - struct nfs41_exchange_id_data *cdata = - (struct nfs41_exchange_id_data *)data; - struct nfs_client *clp = cdata->args.client; - int status = task->tk_status; - - trace_nfs4_exchange_id(clp, status); - - if (status == 0) - status = nfs4_check_cl_exchange_flags(cdata->res.flags); - - if (cdata->xprt && status == 0) { - status = nfs4_detect_session_trunking(clp, &cdata->res, - cdata->xprt); - goto out; - } - - if (status == 0) - status = nfs4_sp4_select_mode(clp, &cdata->res.state_protect); - - if (status == 0) { - clp->cl_clientid = cdata->res.clientid; - clp->cl_exchange_flags = cdata->res.flags; - clp->cl_seqid = cdata->res.seqid; - /* Client ID is not confirmed */ - if (!(cdata->res.flags & EXCHGID4_FLAG_CONFIRMED_R)) - clear_bit(NFS4_SESSION_ESTABLISHED, - &clp->cl_session->session_state); - - kfree(clp->cl_serverowner); - clp->cl_serverowner = cdata->res.server_owner; - cdata->res.server_owner = NULL; - - /* use the most recent implementation id */ - kfree(clp->cl_implid); - clp->cl_implid = cdata->res.impl_id; - cdata->res.impl_id = NULL; - - if (clp->cl_serverscope != NULL && - !nfs41_same_server_scope(clp->cl_serverscope, - cdata->res.server_scope)) { - dprintk("%s: server_scope mismatch detected\n", - __func__); - 
set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state); - kfree(clp->cl_serverscope); - clp->cl_serverscope = NULL; - } - - if (clp->cl_serverscope == NULL) { - clp->cl_serverscope = cdata->res.server_scope; - cdata->res.server_scope = NULL; - } - /* Save the EXCHANGE_ID verifier session trunk tests */ - memcpy(clp->cl_confirm.data, cdata->args.verifier.data, - sizeof(clp->cl_confirm.data)); - } -out: - cdata->rpc_status = status; - return; -} - static void nfs4_exchange_id_release(void *data) { struct nfs41_exchange_id_data *cdata = @@ -7481,7 +7447,6 @@ static void nfs4_exchange_id_release(void *data) } static const struct rpc_call_ops nfs4_exchange_id_call_ops = { - .rpc_call_done = nfs4_exchange_id_done, .rpc_release = nfs4_exchange_id_release, }; @@ -7490,7 +7455,8 @@ static const struct rpc_call_ops nfs4_exchange_id_call_ops = { * * Wrapper for EXCHANGE_ID operation. */ -static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, +static struct rpc_task * +nfs4_run_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, u32 sp4_how, struct rpc_xprt *xprt) { struct rpc_message msg = { @@ -7504,17 +7470,15 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, .flags = RPC_TASK_TIMEOUT, }; struct nfs41_exchange_id_data *calldata; - struct rpc_task *task; int status; if (!atomic_inc_not_zero(&clp->cl_count)) - return -EIO; + return ERR_PTR(-EIO); + status = -ENOMEM; calldata = kzalloc(sizeof(*calldata), GFP_NOFS); - if (!calldata) { - nfs_put_client(clp); - return -ENOMEM; - } + if (!calldata) + goto out; nfs4_init_boot_verifier(clp, &calldata->args.verifier); @@ -7553,34 +7517,22 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, goto out_impl_id; } if (xprt) { - calldata->xprt = xprt; task_setup_data.rpc_xprt = xprt; task_setup_data.flags |= RPC_TASK_SOFTCONN; memcpy(calldata->args.verifier.data, clp->cl_confirm.data, sizeof(calldata->args.verifier.data)); } calldata->args.client = clp; -#ifdef CONFIG_NFS_V4_1_MIGRATION calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | - EXCHGID4_FLAG_BIND_PRINC_STATEID | - EXCHGID4_FLAG_SUPP_MOVED_MIGR, -#else - calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER | - EXCHGID4_FLAG_BIND_PRINC_STATEID, + EXCHGID4_FLAG_BIND_PRINC_STATEID; +#ifdef CONFIG_NFS_V4_1_MIGRATION + calldata->args.flags |= EXCHGID4_FLAG_SUPP_MOVED_MIGR; #endif msg.rpc_argp = &calldata->args; msg.rpc_resp = &calldata->res; task_setup_data.callback_data = calldata; - task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) - return PTR_ERR(task); - - status = calldata->rpc_status; - - rpc_put_task(task); -out: - return status; + return rpc_run_task(&task_setup_data); out_impl_id: kfree(calldata->res.impl_id); @@ -7590,8 +7542,69 @@ out_server_owner: kfree(calldata->res.server_owner); out_calldata: kfree(calldata); +out: nfs_put_client(clp); - goto out; + return ERR_PTR(status); +} + +/* + * _nfs4_proc_exchange_id() + * + * Wrapper for EXCHANGE_ID operation. 
+ */ +static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, + u32 sp4_how) +{ + struct rpc_task *task; + struct nfs41_exchange_id_args *argp; + struct nfs41_exchange_id_res *resp; + int status; + + task = nfs4_run_exchange_id(clp, cred, sp4_how, NULL); + if (IS_ERR(task)) + return PTR_ERR(task); + + argp = task->tk_msg.rpc_argp; + resp = task->tk_msg.rpc_resp; + status = task->tk_status; + if (status != 0) + goto out; + + status = nfs4_check_cl_exchange_flags(resp->flags); + if (status != 0) + goto out; + + status = nfs4_sp4_select_mode(clp, &resp->state_protect); + if (status != 0) + goto out; + + clp->cl_clientid = resp->clientid; + clp->cl_exchange_flags = resp->flags; + clp->cl_seqid = resp->seqid; + /* Client ID is not confirmed */ + if (!(resp->flags & EXCHGID4_FLAG_CONFIRMED_R)) + clear_bit(NFS4_SESSION_ESTABLISHED, + &clp->cl_session->session_state); + + if (clp->cl_serverscope != NULL && + !nfs41_same_server_scope(clp->cl_serverscope, + resp->server_scope)) { + dprintk("%s: server_scope mismatch detected\n", + __func__); + set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state); + } + + swap(clp->cl_serverowner, resp->server_owner); + swap(clp->cl_serverscope, resp->server_scope); + swap(clp->cl_implid, resp->impl_id); + + /* Save the EXCHANGE_ID verifier session trunk tests */ + memcpy(clp->cl_confirm.data, argp->verifier.data, + sizeof(clp->cl_confirm.data)); +out: + trace_nfs4_exchange_id(clp, status); + rpc_put_task(task); + return status; } /* @@ -7614,13 +7627,13 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred) /* try SP4_MACH_CRED if krb5i/p */ if (authflavor == RPC_AUTH_GSS_KRB5I || authflavor == RPC_AUTH_GSS_KRB5P) { - status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED, NULL); + status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED); if (!status) return 0; } /* try SP4_NONE */ - return _nfs4_proc_exchange_id(clp, cred, SP4_NONE, NULL); + return _nfs4_proc_exchange_id(clp, cred, SP4_NONE); } /** @@ -7642,6 +7655,9 @@ int nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *data) { struct nfs4_add_xprt_data *adata = (struct nfs4_add_xprt_data *)data; + struct rpc_task *task; + int status; + u32 sp4_how; dprintk("--> %s try %s\n", __func__, @@ -7650,7 +7666,17 @@ int nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt, sp4_how = (adata->clp->cl_sp4_flags == 0 ? SP4_NONE : SP4_MACH_CRED); /* Test connection for session trunking. 
Async exchange_id call */ - return _nfs4_proc_exchange_id(adata->clp, adata->cred, sp4_how, xprt); + task = nfs4_run_exchange_id(adata->clp, adata->cred, sp4_how, xprt); + if (IS_ERR(task)) + return PTR_ERR(task); + + status = task->tk_status; + if (status == 0) + status = nfs4_detect_session_trunking(adata->clp, + task->tk_msg.rpc_resp, xprt); + + rpc_put_task(task); + return status; } EXPORT_SYMBOL_GPL(nfs4_test_session_trunk); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index de9066a92c0d..bec120ec1967 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -134,19 +134,14 @@ EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait); /* * nfs_page_group_lock - lock the head of the page group * @req - request in group that is to be locked - * @nonblock - if true don't block waiting for lock * - * this lock must be held if modifying the page group list + * this lock must be held when traversing or modifying the page + * group list * - * return 0 on success, < 0 on error: -EDELAY if nonblocking or the - * result from wait_on_bit_lock - * - * NOTE: calling with nonblock=false should always have set the - * lock bit (see fs/buffer.c and other uses of wait_on_bit_lock - * with TASK_UNINTERRUPTIBLE), so there is no need to check the result. + * return 0 on success, < 0 on error */ int -nfs_page_group_lock(struct nfs_page *req, bool nonblock) +nfs_page_group_lock(struct nfs_page *req) { struct nfs_page *head = req->wb_head; @@ -155,35 +150,10 @@ nfs_page_group_lock(struct nfs_page *req, bool nonblock) if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags)) return 0; - if (!nonblock) { - set_bit(PG_CONTENDED1, &head->wb_flags); - smp_mb__after_atomic(); - return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, - TASK_UNINTERRUPTIBLE); - } - - return -EAGAIN; -} - -/* - * nfs_page_group_lock_wait - wait for the lock to clear, but don't grab it - * @req - a request in the group - * - * This is a blocking call to wait for the group lock to be cleared. 
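The fs/nfs/pagelist.c changes around this point drop the nonblock argument to nfs_page_group_lock() and remove the separate nfs_page_group_lock_wait() helper: the lock path now tries test_and_set_bit() on PG_HEADLOCK and, on contention, marks PG_CONTENDED1 and sleeps in wait_on_bit_lock(). A very rough userspace sketch of that bit-lock shape, assuming C11 atomics and using sched_yield() where the kernel actually sleeps (the PG_HEADLOCK name is only borrowed for illustration):

#include <stdatomic.h>
#include <sched.h>
#include <stdio.h>

#define PG_HEADLOCK 0UL

static void group_lock(atomic_ulong *flags)
{
	/* fast path mirrors test_and_set_bit(); on contention we just
	 * yield and retry, where the kernel sleeps until the bit clears */
	while (atomic_fetch_or(flags, 1UL << PG_HEADLOCK) & (1UL << PG_HEADLOCK))
		sched_yield();
}

static void group_unlock(atomic_ulong *flags)
{
	atomic_fetch_and(flags, ~(1UL << PG_HEADLOCK));
}

int main(void)
{
	atomic_ulong wb_flags = 0;

	group_lock(&wb_flags);
	puts("page group head locked");
	group_unlock(&wb_flags);
	return 0;
}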
- */ -void -nfs_page_group_lock_wait(struct nfs_page *req) -{ - struct nfs_page *head = req->wb_head; - - WARN_ON_ONCE(head != head->wb_head); - - if (!test_bit(PG_HEADLOCK, &head->wb_flags)) - return; set_bit(PG_CONTENDED1, &head->wb_flags); smp_mb__after_atomic(); - wait_on_bit(&head->wb_flags, PG_HEADLOCK, - TASK_UNINTERRUPTIBLE); + return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, + TASK_UNINTERRUPTIBLE); } /* @@ -246,7 +216,7 @@ bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit) { bool ret; - nfs_page_group_lock(req, false); + nfs_page_group_lock(req); ret = nfs_page_group_sync_on_bit_locked(req, bit); nfs_page_group_unlock(req); @@ -288,9 +258,7 @@ nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev) inode = page_file_mapping(req->wb_page)->host; set_bit(PG_INODE_REF, &req->wb_flags); kref_get(&req->wb_kref); - spin_lock(&inode->i_lock); - NFS_I(inode)->nrequests++; - spin_unlock(&inode->i_lock); + atomic_long_inc(&NFS_I(inode)->nrequests); } } } @@ -306,14 +274,11 @@ static void nfs_page_group_destroy(struct kref *kref) { struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); + struct nfs_page *head = req->wb_head; struct nfs_page *tmp, *next; - /* subrequests must release the ref on the head request */ - if (req->wb_head != req) - nfs_release_request(req->wb_head); - if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN)) - return; + goto out; tmp = req; do { @@ -324,6 +289,10 @@ nfs_page_group_destroy(struct kref *kref) nfs_free_request(tmp); tmp = next; } while (tmp != req); +out: + /* subrequests must release the ref on the head request */ + if (head != req) + nfs_release_request(head); } /** @@ -465,6 +434,7 @@ void nfs_release_request(struct nfs_page *req) { kref_put(&req->wb_kref, nfs_page_group_destroy); } +EXPORT_SYMBOL_GPL(nfs_release_request); /** * nfs_wait_on_request - Wait for a request to complete. 
@@ -483,6 +453,7 @@ nfs_wait_on_request(struct nfs_page *req) return wait_on_bit_io(&req->wb_flags, PG_BUSY, TASK_UNINTERRUPTIBLE); } +EXPORT_SYMBOL_GPL(nfs_wait_on_request); /* * nfs_generic_pg_test - determine if requests can be coalesced @@ -530,16 +501,6 @@ struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *ops) } EXPORT_SYMBOL_GPL(nfs_pgio_header_alloc); -/* - * nfs_pgio_header_free - Free a read or write header - * @hdr: The header to free - */ -void nfs_pgio_header_free(struct nfs_pgio_header *hdr) -{ - hdr->rw_ops->rw_free_header(hdr); -} -EXPORT_SYMBOL_GPL(nfs_pgio_header_free); - /** * nfs_pgio_data_destroy - make @hdr suitable for reuse * @@ -548,14 +509,24 @@ EXPORT_SYMBOL_GPL(nfs_pgio_header_free); * * @hdr: A header that has had nfs_generic_pgio called */ -void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr) +static void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr) { if (hdr->args.context) put_nfs_open_context(hdr->args.context); if (hdr->page_array.pagevec != hdr->page_array.page_array) kfree(hdr->page_array.pagevec); } -EXPORT_SYMBOL_GPL(nfs_pgio_data_destroy); + +/* + * nfs_pgio_header_free - Free a read or write header + * @hdr: The header to free + */ +void nfs_pgio_header_free(struct nfs_pgio_header *hdr) +{ + nfs_pgio_data_destroy(hdr); + hdr->rw_ops->rw_free_header(hdr); +} +EXPORT_SYMBOL_GPL(nfs_pgio_header_free); /** * nfs_pgio_rpcsetup - Set up arguments for a pageio call @@ -669,7 +640,6 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio); static void nfs_pgio_error(struct nfs_pgio_header *hdr) { set_bit(NFS_IOHDR_REDO, &hdr->flags); - nfs_pgio_data_destroy(hdr); hdr->completion_ops->completion(hdr); } @@ -680,7 +650,6 @@ static void nfs_pgio_error(struct nfs_pgio_header *hdr) static void nfs_pgio_release(void *calldata) { struct nfs_pgio_header *hdr = calldata; - nfs_pgio_data_destroy(hdr); hdr->completion_ops->completion(hdr); } @@ -711,12 +680,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, const struct nfs_pgio_completion_ops *compl_ops, const struct nfs_rw_ops *rw_ops, size_t bsize, - int io_flags, - gfp_t gfp_flags) + int io_flags) { - struct nfs_pgio_mirror *new; - int i; - desc->pg_moreio = 0; desc->pg_inode = inode; desc->pg_ops = pg_ops; @@ -732,23 +697,10 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, desc->pg_mirror_count = 1; desc->pg_mirror_idx = 0; - if (pg_ops->pg_get_mirror_count) { - /* until we have a request, we don't have an lseg and no - * idea how many mirrors there will be */ - new = kcalloc(NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX, - sizeof(struct nfs_pgio_mirror), gfp_flags); - desc->pg_mirrors_dynamic = new; - desc->pg_mirrors = new; - - for (i = 0; i < NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX; i++) - nfs_pageio_mirror_init(&desc->pg_mirrors[i], bsize); - } else { - desc->pg_mirrors_dynamic = NULL; - desc->pg_mirrors = desc->pg_mirrors_static; - nfs_pageio_mirror_init(&desc->pg_mirrors[0], bsize); - } + desc->pg_mirrors_dynamic = NULL; + desc->pg_mirrors = desc->pg_mirrors_static; + nfs_pageio_mirror_init(&desc->pg_mirrors[0], bsize); } -EXPORT_SYMBOL_GPL(nfs_pageio_init); /** * nfs_pgio_result - Basic pageio error handling @@ -865,32 +817,52 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) return ret; } +static struct nfs_pgio_mirror * +nfs_pageio_alloc_mirrors(struct nfs_pageio_descriptor *desc, + unsigned int mirror_count) +{ + struct nfs_pgio_mirror *ret; + unsigned int i; + + kfree(desc->pg_mirrors_dynamic); + desc->pg_mirrors_dynamic = NULL; + if (mirror_count == 1) + return 
desc->pg_mirrors_static; + ret = kmalloc_array(mirror_count, sizeof(*ret), GFP_NOFS); + if (ret != NULL) { + for (i = 0; i < mirror_count; i++) + nfs_pageio_mirror_init(&ret[i], desc->pg_bsize); + desc->pg_mirrors_dynamic = ret; + } + return ret; +} + /* * nfs_pageio_setup_mirroring - determine if mirroring is to be used * by calling the pg_get_mirror_count op */ -static int nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio, +static void nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - int mirror_count = 1; - - if (!pgio->pg_ops->pg_get_mirror_count) - return 0; + unsigned int mirror_count = 1; - mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req); - - if (pgio->pg_error < 0) - return pgio->pg_error; - - if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX) - return -EINVAL; + if (pgio->pg_ops->pg_get_mirror_count) + mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req); + if (mirror_count == pgio->pg_mirror_count || pgio->pg_error < 0) + return; - if (WARN_ON_ONCE(!pgio->pg_mirrors_dynamic)) - return -EINVAL; + if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX) { + pgio->pg_error = -EINVAL; + return; + } + pgio->pg_mirrors = nfs_pageio_alloc_mirrors(pgio, mirror_count); + if (pgio->pg_mirrors == NULL) { + pgio->pg_error = -ENOMEM; + pgio->pg_mirrors = pgio->pg_mirrors_static; + mirror_count = 1; + } pgio->pg_mirror_count = mirror_count; - - return 0; } /* @@ -1036,7 +1008,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, unsigned int bytes_left = 0; unsigned int offset, pgbase; - nfs_page_group_lock(req, false); + nfs_page_group_lock(req); subreq = req; bytes_left = subreq->wb_bytes; @@ -1058,7 +1030,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, if (mirror->pg_recoalesce) return 0; /* retry add_request for this subreq */ - nfs_page_group_lock(req, false); + nfs_page_group_lock(req); continue; } @@ -1155,7 +1127,7 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, for (midx = 0; midx < desc->pg_mirror_count; midx++) { if (midx) { - nfs_page_group_lock(req, false); + nfs_page_group_lock(req); /* find the last request */ for (lastreq = req->wb_head; diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index c383d0913b54..7879ed8ceb76 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -529,47 +529,6 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg) } EXPORT_SYMBOL_GPL(pnfs_put_lseg); -static void pnfs_free_lseg_async_work(struct work_struct *work) -{ - struct pnfs_layout_segment *lseg; - struct pnfs_layout_hdr *lo; - - lseg = container_of(work, struct pnfs_layout_segment, pls_work); - lo = lseg->pls_layout; - - pnfs_free_lseg(lseg); - pnfs_put_layout_hdr(lo); -} - -static void pnfs_free_lseg_async(struct pnfs_layout_segment *lseg) -{ - INIT_WORK(&lseg->pls_work, pnfs_free_lseg_async_work); - schedule_work(&lseg->pls_work); -} - -void -pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg) -{ - if (!lseg) - return; - - assert_spin_locked(&lseg->pls_layout->plh_inode->i_lock); - - dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, - atomic_read(&lseg->pls_refcount), - test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); - if (atomic_dec_and_test(&lseg->pls_refcount)) { - struct pnfs_layout_hdr *lo = lseg->pls_layout; - if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) - return; - pnfs_layout_remove_lseg(lo, lseg); - if (!pnfs_cache_lseg_for_layoutreturn(lo, lseg)) { - pnfs_get_layout_hdr(lo); - pnfs_free_lseg_async(lseg); - } - } -} - /* 
* is l2 fully contained in l1? * start1 end1 @@ -2274,7 +2233,6 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, nfs_pageio_reset_write_mds(desc); mirror->pg_recoalesce = 1; } - nfs_pgio_data_destroy(hdr); hdr->release(hdr); } @@ -2398,7 +2356,6 @@ pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, nfs_pageio_reset_read_mds(desc); mirror->pg_recoalesce = 1; } - nfs_pgio_data_destroy(hdr); hdr->release(hdr); } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 99731e3e332f..87f144f14d1e 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -67,7 +67,6 @@ struct pnfs_layout_segment { u32 pls_seq; unsigned long pls_flags; struct pnfs_layout_hdr *pls_layout; - struct work_struct pls_work; }; enum pnfs_try_status { @@ -230,7 +229,6 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync); /* pnfs.c */ void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_put_lseg(struct pnfs_layout_segment *lseg); -void pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg); void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *); void unset_pnfs_layoutdriver(struct nfs_server *); diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 25f28fa64c57..60da59be83b6 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -83,34 +83,11 @@ pnfs_generic_clear_request_commit(struct nfs_page *req, } out: nfs_request_remove_commit_list(req, cinfo); - pnfs_put_lseg_locked(freeme); + pnfs_put_lseg(freeme); } EXPORT_SYMBOL_GPL(pnfs_generic_clear_request_commit); static int -pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst, - struct nfs_commit_info *cinfo, int max) -{ - struct nfs_page *req, *tmp; - int ret = 0; - - list_for_each_entry_safe(req, tmp, src, wb_list) { - if (!nfs_lock_request(req)) - continue; - kref_get(&req->wb_kref); - if (cond_resched_lock(&cinfo->inode->i_lock)) - list_safe_reset_next(req, tmp, wb_list); - nfs_request_remove_commit_list(req, cinfo); - clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); - nfs_list_add_request(req, dst); - ret++; - if ((ret == max) && !cinfo->dreq) - break; - } - return ret; -} - -static int pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, struct nfs_commit_info *cinfo, int max) @@ -119,15 +96,15 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket, struct list_head *dst = &bucket->committing; int ret; - lockdep_assert_held(&cinfo->inode->i_lock); - ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max); + lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex); + ret = nfs_scan_commit_list(src, dst, cinfo, max); if (ret) { cinfo->ds->nwritten -= ret; cinfo->ds->ncommitting += ret; if (bucket->clseg == NULL) bucket->clseg = pnfs_get_lseg(bucket->wlseg); if (list_empty(src)) { - pnfs_put_lseg_locked(bucket->wlseg); + pnfs_put_lseg(bucket->wlseg); bucket->wlseg = NULL; } } @@ -142,7 +119,7 @@ int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, { int i, rv = 0, cnt; - lockdep_assert_held(&cinfo->inode->i_lock); + lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex); for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) { cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i], cinfo, max); @@ -162,11 +139,10 @@ void pnfs_generic_recover_commit_reqs(struct list_head *dst, int nwritten; int i; - lockdep_assert_held(&cinfo->inode->i_lock); + lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex); restart: for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) { - nwritten = 
pnfs_generic_transfer_commit_list(&b->written, - dst, cinfo, 0); + nwritten = nfs_scan_commit_list(&b->written, dst, cinfo, 0); if (!nwritten) continue; cinfo->ds->nwritten -= nwritten; @@ -953,12 +929,12 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, struct list_head *list; struct pnfs_commit_bucket *buckets; - spin_lock(&cinfo->inode->i_lock); + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); buckets = cinfo->ds->buckets; list = &buckets[ds_commit_idx].written; if (list_empty(list)) { if (!pnfs_is_valid_lseg(lseg)) { - spin_unlock(&cinfo->inode->i_lock); + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); cinfo->completion_ops->resched_write(cinfo, req); return; } @@ -975,7 +951,7 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, cinfo->ds->nwritten++; nfs_request_add_commit_list_locked(req, list, cinfo); - spin_unlock(&cinfo->inode->i_lock); + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); nfs_mark_page_unstable(req->wb_page, cinfo); } EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index a8421d9dab6a..0d42573d423d 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -68,7 +68,7 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, pg_ops = server->pnfs_curr_ld->pg_read_ops; #endif nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_read_ops, - server->rsize, 0, GFP_KERNEL); + server->rsize, 0); } EXPORT_SYMBOL_GPL(nfs_pageio_init_read); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index d828ef88e7db..6b179af59b92 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1691,8 +1691,8 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args, rpc_authflavor_t *server_authlist, unsigned int count) { rpc_authflavor_t flavor = RPC_AUTH_MAXFLAVOR; + bool found_auth_null = false; unsigned int i; - int use_auth_null = false; /* * If the sec= mount option is used, the specified flavor or AUTH_NULL @@ -1701,6 +1701,10 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args, * AUTH_NULL has a special meaning when it's in the server list - it * means that the server will ignore the rpc creds, so any flavor * can be used but still use the sec= that was specified. + * + * Note also that the MNT procedure in MNTv1 does not return a list + * of supported security flavors. In this case, nfs_mount() fabricates + * a security flavor list containing just AUTH_NULL. 
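For reference, the nfs_verify_authflavors() change just below swaps the fallback: when the server's list contains AUTH_NULL (meaning it will accept any credential), the client now keeps the flavor requested with sec= instead of downgrading to AUTH_NULL itself. A small self-contained sketch of that selection logic (pick_auth_flavor is an invented name and the RPC_AUTH_* constants are redefined locally for the example):

#include <stdio.h>
#include <stddef.h>

#define RPC_AUTH_NULL	0u
#define RPC_AUTH_UNIX	1u

/* returns the flavor to use, or -1 if the sec= request cannot be honoured */
static int pick_auth_flavor(const unsigned int *server_list, size_t count,
			    unsigned int requested)
{
	int found_auth_null = 0;
	size_t i;

	for (i = 0; i < count; i++) {
		if (server_list[i] == requested)
			return (int)requested;		/* exact match wins */
		if (server_list[i] == RPC_AUTH_NULL)
			found_auth_null = 1;
	}
	/* AUTH_NULL in the server list means "any flavor is accepted",
	 * so keep the flavor the admin asked for rather than AUTH_NULL */
	if (found_auth_null)
		return (int)requested;
	return -1;
}

int main(void)
{
	unsigned int server_list[] = { RPC_AUTH_NULL };

	printf("chosen flavor: %d\n",
	       pick_auth_flavor(server_list, 1, RPC_AUTH_UNIX));
	return 0;
}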
*/ for (i = 0; i < count; i++) { flavor = server_authlist[i]; @@ -1709,11 +1713,11 @@ static int nfs_verify_authflavors(struct nfs_parsed_mount_data *args, goto out; if (flavor == RPC_AUTH_NULL) - use_auth_null = true; + found_auth_null = true; } - if (use_auth_null) { - flavor = RPC_AUTH_NULL; + if (found_auth_null) { + flavor = args->auth_info.flavors[0]; goto out; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b1af5dee5e0a..f68083db63c8 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -102,10 +102,8 @@ static struct nfs_pgio_header *nfs_writehdr_alloc(void) { struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); - if (p) { - memset(p, 0, sizeof(*p)); - p->rw_mode = FMODE_WRITE; - } + memset(p, 0, sizeof(*p)); + p->rw_mode = FMODE_WRITE; return p; } @@ -154,6 +152,14 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); } +static struct nfs_page * +nfs_page_private_request(struct page *page) +{ + if (!PagePrivate(page)) + return NULL; + return (struct nfs_page *)page_private(page); +} + /* * nfs_page_find_head_request_locked - find head request associated with @page * @@ -162,21 +168,41 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) * returns matching head request with reference held, or NULL if not found. */ static struct nfs_page * -nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page) +nfs_page_find_private_request(struct page *page) { - struct nfs_page *req = NULL; - - if (PagePrivate(page)) - req = (struct nfs_page *)page_private(page); - else if (unlikely(PageSwapCache(page))) - req = nfs_page_search_commits_for_head_request_locked(nfsi, - page); + struct address_space *mapping = page_file_mapping(page); + struct nfs_page *req; + if (!PagePrivate(page)) + return NULL; + spin_lock(&mapping->private_lock); + req = nfs_page_private_request(page); if (req) { WARN_ON_ONCE(req->wb_head != req); kref_get(&req->wb_kref); } + spin_unlock(&mapping->private_lock); + return req; +} +static struct nfs_page * +nfs_page_find_swap_request(struct page *page) +{ + struct inode *inode = page_file_mapping(page)->host; + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_page *req = NULL; + if (!PageSwapCache(page)) + return NULL; + mutex_lock(&nfsi->commit_mutex); + if (PageSwapCache(page)) { + req = nfs_page_search_commits_for_head_request_locked(nfsi, + page); + if (req) { + WARN_ON_ONCE(req->wb_head != req); + kref_get(&req->wb_kref); + } + } + mutex_unlock(&nfsi->commit_mutex); return req; } @@ -187,12 +213,11 @@ nfs_page_find_head_request_locked(struct nfs_inode *nfsi, struct page *page) */ static struct nfs_page *nfs_page_find_head_request(struct page *page) { - struct inode *inode = page_file_mapping(page)->host; - struct nfs_page *req = NULL; + struct nfs_page *req; - spin_lock(&inode->i_lock); - req = nfs_page_find_head_request_locked(NFS_I(inode), page); - spin_unlock(&inode->i_lock); + req = nfs_page_find_private_request(page); + if (!req) + req = nfs_page_find_swap_request(page); return req; } @@ -241,9 +266,6 @@ nfs_page_group_search_locked(struct nfs_page *head, unsigned int page_offset) { struct nfs_page *req; - WARN_ON_ONCE(head != head->wb_head); - WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_head->wb_flags)); - req = head; do { if (page_offset >= req->wb_pgbase && @@ -269,20 +291,17 @@ static bool nfs_page_group_covers_page(struct nfs_page *req) unsigned int pos = 0; unsigned int len = nfs_page_length(req->wb_page); - 
nfs_page_group_lock(req, false); + nfs_page_group_lock(req); - do { + for (;;) { tmp = nfs_page_group_search_locked(req->wb_head, pos); - if (tmp) { - /* no way this should happen */ - WARN_ON_ONCE(tmp->wb_pgbase != pos); - pos += tmp->wb_bytes - (pos - tmp->wb_pgbase); - } - } while (tmp && pos < len); + if (!tmp) + break; + pos = tmp->wb_pgbase + tmp->wb_bytes; + } nfs_page_group_unlock(req); - WARN_ON_ONCE(pos > len); - return pos == len; + return pos >= len; } /* We can set the PG_uptodate flag if we see that a write request @@ -333,8 +352,11 @@ static void nfs_end_page_writeback(struct nfs_page *req) { struct inode *inode = page_file_mapping(req->wb_page)->host; struct nfs_server *nfss = NFS_SERVER(inode); + bool is_done; - if (!nfs_page_group_sync_on_bit(req, PG_WB_END)) + is_done = nfs_page_group_sync_on_bit(req, PG_WB_END); + nfs_unlock_request(req); + if (!is_done) return; end_page_writeback(req->wb_page); @@ -342,22 +364,6 @@ static void nfs_end_page_writeback(struct nfs_page *req) clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); } - -/* nfs_page_group_clear_bits - * @req - an nfs request - * clears all page group related bits from @req - */ -static void -nfs_page_group_clear_bits(struct nfs_page *req) -{ - clear_bit(PG_TEARDOWN, &req->wb_flags); - clear_bit(PG_UNLOCKPAGE, &req->wb_flags); - clear_bit(PG_UPTODATE, &req->wb_flags); - clear_bit(PG_WB_END, &req->wb_flags); - clear_bit(PG_REMOVE, &req->wb_flags); -} - - /* * nfs_unroll_locks_and_wait - unlock all newly locked reqs and wait on @req * @@ -366,43 +372,24 @@ nfs_page_group_clear_bits(struct nfs_page *req) * @inode - inode associated with request page group, must be holding inode lock * @head - head request of page group, must be holding head lock * @req - request that couldn't lock and needs to wait on the req bit lock - * @nonblock - if true, don't actually wait * - * NOTE: this must be called holding page_group bit lock and inode spin lock - * and BOTH will be released before returning. + * NOTE: this must be called holding page_group bit lock + * which will be released before returning. * * returns 0 on success, < 0 on error. 
*/ -static int -nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head, - struct nfs_page *req, bool nonblock) - __releases(&inode->i_lock) +static void +nfs_unroll_locks(struct inode *inode, struct nfs_page *head, + struct nfs_page *req) { struct nfs_page *tmp; - int ret; /* relinquish all the locks successfully grabbed this run */ - for (tmp = head ; tmp != req; tmp = tmp->wb_this_page) - nfs_unlock_request(tmp); - - WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags)); - - /* grab a ref on the request that will be waited on */ - kref_get(&req->wb_kref); - - nfs_page_group_unlock(head); - spin_unlock(&inode->i_lock); - - /* release ref from nfs_page_find_head_request_locked */ - nfs_release_request(head); - - if (!nonblock) - ret = nfs_wait_on_request(req); - else - ret = -EAGAIN; - nfs_release_request(req); - - return ret; + for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) { + if (!kref_read(&tmp->wb_kref)) + continue; + nfs_unlock_and_release_request(tmp); + } } /* @@ -417,7 +404,8 @@ nfs_unroll_locks_and_wait(struct inode *inode, struct nfs_page *head, */ static void nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, - struct nfs_page *old_head) + struct nfs_page *old_head, + struct inode *inode) { while (destroy_list) { struct nfs_page *subreq = destroy_list; @@ -428,33 +416,28 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, WARN_ON_ONCE(old_head != subreq->wb_head); /* make sure old group is not used */ - subreq->wb_head = subreq; subreq->wb_this_page = subreq; - /* subreq is now totally disconnected from page group or any - * write / commit lists. last chance to wake any waiters */ - nfs_unlock_request(subreq); + clear_bit(PG_REMOVE, &subreq->wb_flags); - if (!test_bit(PG_TEARDOWN, &subreq->wb_flags)) { - /* release ref on old head request */ - nfs_release_request(old_head); + /* Note: races with nfs_page_group_destroy() */ + if (!kref_read(&subreq->wb_kref)) { + /* Check if we raced with nfs_page_group_destroy() */ + if (test_and_clear_bit(PG_TEARDOWN, &subreq->wb_flags)) + nfs_free_request(subreq); + continue; + } - nfs_page_group_clear_bits(subreq); + subreq->wb_head = subreq; - /* release the PG_INODE_REF reference */ - if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) - nfs_release_request(subreq); - else - WARN_ON_ONCE(1); - } else { - WARN_ON_ONCE(test_bit(PG_CLEAN, &subreq->wb_flags)); - /* zombie requests have already released the last - * reference and were waiting on the rest of the - * group to complete. Since it's no longer part of a - * group, simply free the request */ - nfs_page_group_clear_bits(subreq); - nfs_free_request(subreq); + if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) { + nfs_release_request(subreq); + atomic_long_dec(&NFS_I(inode)->nrequests); } + + /* subreq is now totally disconnected from page group or any + * write / commit lists. last chance to wake any waiters */ + nfs_unlock_and_release_request(subreq); } } @@ -464,7 +447,6 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, * operations for this page. * * @page - the page used to lookup the "page group" of nfs_page structures - * @nonblock - if true, don't block waiting for request locks * * This function joins all sub requests to the head request by first * locking all requests in the group, cancelling any pending operations @@ -478,7 +460,7 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, * error was encountered. 
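The rewritten nfs_lock_and_join_requests() below, together with the nfs_unroll_locks() helper that replaces nfs_unroll_locks_and_wait(), follows an all-or-nothing locking shape: lock the head, then each sub-request in turn; if one cannot be locked, drop the group lock and wait, and on error unroll the locks already taken in this pass before bailing out. A minimal userspace sketch of that shape, assuming POSIX threads (lock_all is an invented name; the kernel version also juggles krefs and the page-group bit lock, which are left out here):

#include <pthread.h>
#include <stdio.h>

/* try to lock every entry; on the first failure, drop what was grabbed
 * in this pass and report failure so the caller can wait and retry */
static int lock_all(pthread_mutex_t *locks, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		if (pthread_mutex_trylock(&locks[i]) != 0) {
			while (--i >= 0)
				pthread_mutex_unlock(&locks[i]);
			return -1;
		}
	}
	return 0;
}

int main(void)
{
	pthread_mutex_t locks[3] = {
		PTHREAD_MUTEX_INITIALIZER,
		PTHREAD_MUTEX_INITIALIZER,
		PTHREAD_MUTEX_INITIALIZER,
	};

	printf("lock_all: %d\n", lock_all(locks, 3));
	return 0;
}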
*/ static struct nfs_page * -nfs_lock_and_join_requests(struct page *page, bool nonblock) +nfs_lock_and_join_requests(struct page *page) { struct inode *inode = page_file_mapping(page)->host; struct nfs_page *head, *subreq; @@ -487,43 +469,63 @@ nfs_lock_and_join_requests(struct page *page, bool nonblock) int ret; try_again: - total_bytes = 0; - - WARN_ON_ONCE(destroy_list); - - spin_lock(&inode->i_lock); - /* * A reference is taken only on the head request which acts as a * reference to the whole page group - the group will not be destroyed * until the head reference is released. */ - head = nfs_page_find_head_request_locked(NFS_I(inode), page); - - if (!head) { - spin_unlock(&inode->i_lock); + head = nfs_page_find_head_request(page); + if (!head) return NULL; - } - /* holding inode lock, so always make a non-blocking call to try the - * page group lock */ - ret = nfs_page_group_lock(head, true); - if (ret < 0) { - spin_unlock(&inode->i_lock); + /* lock the page head first in order to avoid an ABBA inefficiency */ + if (!nfs_lock_request(head)) { + ret = nfs_wait_on_request(head); + nfs_release_request(head); + if (ret < 0) + return ERR_PTR(ret); + goto try_again; + } - if (!nonblock && ret == -EAGAIN) { - nfs_page_group_lock_wait(head); - nfs_release_request(head); - goto try_again; - } + /* Ensure that nobody removed the request before we locked it */ + if (head != nfs_page_private_request(page) && !PageSwapCache(page)) { + nfs_unlock_and_release_request(head); + goto try_again; + } - nfs_release_request(head); + ret = nfs_page_group_lock(head); + if (ret < 0) { + nfs_unlock_and_release_request(head); return ERR_PTR(ret); } /* lock each request in the page group */ - subreq = head; - do { + total_bytes = head->wb_bytes; + for (subreq = head->wb_this_page; subreq != head; + subreq = subreq->wb_this_page) { + + if (!kref_get_unless_zero(&subreq->wb_kref)) { + if (subreq->wb_offset == head->wb_offset + total_bytes) + total_bytes += subreq->wb_bytes; + continue; + } + + while (!nfs_lock_request(subreq)) { + /* + * Unlock page to allow nfs_page_group_sync_on_bit() + * to succeed + */ + nfs_page_group_unlock(head); + ret = nfs_wait_on_request(subreq); + if (!ret) + ret = nfs_page_group_lock(head); + if (ret < 0) { + nfs_unroll_locks(inode, head, subreq); + nfs_release_request(subreq); + nfs_unlock_and_release_request(head); + return ERR_PTR(ret); + } + } /* * Subrequests are always contiguous, non overlapping * and in order - but may be repeated (mirrored writes). @@ -535,24 +537,12 @@ try_again: ((subreq->wb_offset + subreq->wb_bytes) > (head->wb_offset + total_bytes)))) { nfs_page_group_unlock(head); - spin_unlock(&inode->i_lock); + nfs_unroll_locks(inode, head, subreq); + nfs_unlock_and_release_request(subreq); + nfs_unlock_and_release_request(head); return ERR_PTR(-EIO); } - - if (!nfs_lock_request(subreq)) { - /* releases page group bit lock and - * inode spin lock and all references */ - ret = nfs_unroll_locks_and_wait(inode, head, - subreq, nonblock); - - if (ret == 0) - goto try_again; - - return ERR_PTR(ret); - } - - subreq = subreq->wb_this_page; - } while (subreq != head); + } /* Now that all requests are locked, make sure they aren't on any list. 
* Commit list removal accounting is done after locks are dropped */ @@ -573,34 +563,30 @@ try_again: head->wb_bytes = total_bytes; } - /* - * prepare head request to be added to new pgio descriptor - */ - nfs_page_group_clear_bits(head); - - /* - * some part of the group was still on the inode list - otherwise - * the group wouldn't be involved in async write. - * grab a reference for the head request, iff it needs one. - */ - if (!test_and_set_bit(PG_INODE_REF, &head->wb_flags)) + /* Postpone destruction of this request */ + if (test_and_clear_bit(PG_REMOVE, &head->wb_flags)) { + set_bit(PG_INODE_REF, &head->wb_flags); kref_get(&head->wb_kref); + atomic_long_inc(&NFS_I(inode)->nrequests); + } nfs_page_group_unlock(head); - /* drop lock to clean uprequests on destroy list */ - spin_unlock(&inode->i_lock); + nfs_destroy_unlinked_subrequests(destroy_list, head, inode); - nfs_destroy_unlinked_subrequests(destroy_list, head); + /* Did we lose a race with nfs_inode_remove_request()? */ + if (!(PagePrivate(page) || PageSwapCache(page))) { + nfs_unlock_and_release_request(head); + return NULL; + } - /* still holds ref on head from nfs_page_find_head_request_locked + /* still holds ref on head from nfs_page_find_head_request * and still has lock on head from lock loop */ return head; } static void nfs_write_error_remove_page(struct nfs_page *req) { - nfs_unlock_request(req); nfs_end_page_writeback(req); generic_error_remove_page(page_file_mapping(req->wb_page), req->wb_page); @@ -624,12 +610,12 @@ nfs_error_is_fatal_on_server(int err) * May return an error if the user signalled nfs_wait_on_request(). */ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, - struct page *page, bool nonblock) + struct page *page) { struct nfs_page *req; int ret = 0; - req = nfs_lock_and_join_requests(page, nonblock); + req = nfs_lock_and_join_requests(page); if (!req) goto out; ret = PTR_ERR(req); @@ -672,7 +658,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, int ret; nfs_pageio_cond_complete(pgio, page_index(page)); - ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); + ret = nfs_page_async_flush(pgio, page); if (ret == -EAGAIN) { redirty_page_for_writepage(wbc, page); ret = 0; @@ -759,6 +745,7 @@ out_err: */ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) { + struct address_space *mapping = page_file_mapping(req->wb_page); struct nfs_inode *nfsi = NFS_I(inode); WARN_ON_ONCE(req->wb_this_page != req); @@ -766,27 +753,30 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) /* Lock the request! */ nfs_lock_request(req); - spin_lock(&inode->i_lock); - if (!nfsi->nrequests && - NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) - inode->i_version++; /* * Swap-space should not get truncated. Hence no need to plug the race * with invalidate/truncate. */ + spin_lock(&mapping->private_lock); + if (!nfs_have_writebacks(inode) && + NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) { + spin_lock(&inode->i_lock); + inode->i_version++; + spin_unlock(&inode->i_lock); + } if (likely(!PageSwapCache(req->wb_page))) { set_bit(PG_MAPPED, &req->wb_flags); SetPagePrivate(req->wb_page); set_page_private(req->wb_page, (unsigned long)req); } - nfsi->nrequests++; + spin_unlock(&mapping->private_lock); + atomic_long_inc(&nfsi->nrequests); /* this a head request for a page group - mark it as having an * extra reference so sub groups can follow suit. 
* This flag also informs pgio layer when to bump nrequests when * adding subrequests. */ WARN_ON(test_and_set_bit(PG_INODE_REF, &req->wb_flags)); kref_get(&req->wb_kref); - spin_unlock(&inode->i_lock); } /* @@ -794,25 +784,22 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) */ static void nfs_inode_remove_request(struct nfs_page *req) { - struct inode *inode = d_inode(req->wb_context->dentry); + struct address_space *mapping = page_file_mapping(req->wb_page); + struct inode *inode = mapping->host; struct nfs_inode *nfsi = NFS_I(inode); struct nfs_page *head; + atomic_long_dec(&nfsi->nrequests); if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) { head = req->wb_head; - spin_lock(&inode->i_lock); + spin_lock(&mapping->private_lock); if (likely(head->wb_page && !PageSwapCache(head->wb_page))) { set_page_private(head->wb_page, 0); ClearPagePrivate(head->wb_page); clear_bit(PG_MAPPED, &head->wb_flags); } - nfsi->nrequests--; - spin_unlock(&inode->i_lock); - } else { - spin_lock(&inode->i_lock); - nfsi->nrequests--; - spin_unlock(&inode->i_lock); + spin_unlock(&mapping->private_lock); } if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) @@ -868,7 +855,8 @@ nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, * number of outstanding requests requiring a commit as well as * the MM page stats. * - * The caller must hold cinfo->inode->i_lock, and the nfs_page lock. + * The caller must hold NFS_I(cinfo->inode)->commit_mutex, and the + * nfs_page lock. */ void nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst, @@ -876,7 +864,7 @@ nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst, { set_bit(PG_CLEAN, &req->wb_flags); nfs_list_add_request(req, dst); - cinfo->mds->ncommit++; + atomic_long_inc(&cinfo->mds->ncommit); } EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked); @@ -896,9 +884,9 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked); void nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo) { - spin_lock(&cinfo->inode->i_lock); + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo); - spin_unlock(&cinfo->inode->i_lock); + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); if (req->wb_page) nfs_mark_page_unstable(req->wb_page, cinfo); } @@ -922,7 +910,7 @@ nfs_request_remove_commit_list(struct nfs_page *req, if (!test_and_clear_bit(PG_CLEAN, &(req)->wb_flags)) return; nfs_list_remove_request(req); - cinfo->mds->ncommit--; + atomic_long_dec(&cinfo->mds->ncommit); } EXPORT_SYMBOL_GPL(nfs_request_remove_commit_list); @@ -967,7 +955,7 @@ nfs_clear_page_commit(struct page *page) WB_RECLAIMABLE); } -/* Called holding inode (/cinfo) lock */ +/* Called holding the request lock on @req */ static void nfs_clear_request_commit(struct nfs_page *req) { @@ -976,9 +964,11 @@ nfs_clear_request_commit(struct nfs_page *req) struct nfs_commit_info cinfo; nfs_init_cinfo_from_inode(&cinfo, inode); + mutex_lock(&NFS_I(inode)->commit_mutex); if (!pnfs_clear_request_commit(req, &cinfo)) { nfs_request_remove_commit_list(req, &cinfo); } + mutex_unlock(&NFS_I(inode)->commit_mutex); nfs_clear_page_commit(req->wb_page); } } @@ -1023,7 +1013,6 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) remove_req: nfs_inode_remove_request(req); next: - nfs_unlock_request(req); nfs_end_page_writeback(req); nfs_release_request(req); } @@ -1035,10 +1024,10 @@ out: unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo) { 
- return cinfo->mds->ncommit; + return atomic_long_read(&cinfo->mds->ncommit); } -/* cinfo->inode->i_lock held by caller */ +/* NFS_I(cinfo->inode)->commit_mutex held by caller */ int nfs_scan_commit_list(struct list_head *src, struct list_head *dst, struct nfs_commit_info *cinfo, int max) @@ -1046,20 +1035,37 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst, struct nfs_page *req, *tmp; int ret = 0; +restart: list_for_each_entry_safe(req, tmp, src, wb_list) { - if (!nfs_lock_request(req)) - continue; kref_get(&req->wb_kref); - if (cond_resched_lock(&cinfo->inode->i_lock)) - list_safe_reset_next(req, tmp, wb_list); + if (!nfs_lock_request(req)) { + int status; + + /* Prevent deadlock with nfs_lock_and_join_requests */ + if (!list_empty(dst)) { + nfs_release_request(req); + continue; + } + /* Ensure we make progress to prevent livelock */ + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); + status = nfs_wait_on_request(req); + nfs_release_request(req); + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); + if (status < 0) + break; + goto restart; + } nfs_request_remove_commit_list(req, cinfo); + clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); nfs_list_add_request(req, dst); ret++; if ((ret == max) && !cinfo->dreq) break; + cond_resched(); } return ret; } +EXPORT_SYMBOL_GPL(nfs_scan_commit_list); /* * nfs_scan_commit - Scan an inode for commit requests @@ -1076,15 +1082,17 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, { int ret = 0; - spin_lock(&cinfo->inode->i_lock); - if (cinfo->mds->ncommit > 0) { + if (!atomic_long_read(&cinfo->mds->ncommit)) + return 0; + mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); + if (atomic_long_read(&cinfo->mds->ncommit) > 0) { const int max = INT_MAX; ret = nfs_scan_commit_list(&cinfo->mds->list, dst, cinfo, max); ret += pnfs_scan_commit_lists(inode, cinfo, max - ret); } - spin_unlock(&cinfo->inode->i_lock); + mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); return ret; } @@ -1105,43 +1113,21 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, unsigned int end; int error; - if (!PagePrivate(page)) - return NULL; - end = offset + bytes; - spin_lock(&inode->i_lock); - - for (;;) { - req = nfs_page_find_head_request_locked(NFS_I(inode), page); - if (req == NULL) - goto out_unlock; - /* should be handled by nfs_flush_incompatible */ - WARN_ON_ONCE(req->wb_head != req); - WARN_ON_ONCE(req->wb_this_page != req); - - rqend = req->wb_offset + req->wb_bytes; - /* - * Tell the caller to flush out the request if - * the offsets are non-contiguous. - * Note: nfs_flush_incompatible() will already - * have flushed out requests having wrong owners. - */ - if (offset > rqend - || end < req->wb_offset) - goto out_flushme; - - if (nfs_lock_request(req)) - break; + req = nfs_lock_and_join_requests(page); + if (IS_ERR_OR_NULL(req)) + return req; - /* The request is locked, so wait and then retry */ - spin_unlock(&inode->i_lock); - error = nfs_wait_on_request(req); - nfs_release_request(req); - if (error != 0) - goto out_err; - spin_lock(&inode->i_lock); - } + rqend = req->wb_offset + req->wb_bytes; + /* + * Tell the caller to flush out the request if + * the offsets are non-contiguous. + * Note: nfs_flush_incompatible() will already + * have flushed out requests having wrong owners. + */ + if (offset > rqend || end < req->wb_offset) + goto out_flushme; /* Okay, the request matches. 
Update the region */ if (offset < req->wb_offset) { @@ -1152,17 +1138,17 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, req->wb_bytes = end - req->wb_offset; else req->wb_bytes = rqend - req->wb_offset; -out_unlock: - if (req) - nfs_clear_request_commit(req); - spin_unlock(&inode->i_lock); return req; out_flushme: - spin_unlock(&inode->i_lock); - nfs_release_request(req); + /* + * Note: we mark the request dirty here because + * nfs_lock_and_join_requests() cannot preserve + * commit flags, so we have to replay the write. + */ + nfs_mark_request_dirty(req); + nfs_unlock_and_release_request(req); error = nfs_wb_page(inode, page); -out_err: - return ERR_PTR(error); + return (error < 0) ? ERR_PTR(error) : NULL; } /* @@ -1227,8 +1213,6 @@ int nfs_flush_incompatible(struct file *file, struct page *page) l_ctx = req->wb_lock_context; do_flush = req->wb_page != page || !nfs_match_open_context(req->wb_context, ctx); - /* for now, flush if more than 1 request in page_group */ - do_flush |= req->wb_this_page != req; if (l_ctx && flctx && !(list_empty_careful(&flctx->flc_posix) && list_empty_careful(&flctx->flc_flock))) { @@ -1412,7 +1396,6 @@ static void nfs_redirty_request(struct nfs_page *req) { nfs_mark_request_dirty(req); set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags); - nfs_unlock_request(req); nfs_end_page_writeback(req); nfs_release_request(req); } @@ -1452,7 +1435,7 @@ void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, pg_ops = server->pnfs_curr_ld->pg_write_ops; #endif nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops, - server->wsize, ioflags, GFP_NOIO); + server->wsize, ioflags); } EXPORT_SYMBOL_GPL(nfs_pageio_init_write); @@ -1934,7 +1917,7 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) int ret = 0; /* no commits means nothing needs to be done */ - if (!nfsi->commit_info.ncommit) + if (!atomic_long_read(&nfsi->commit_info.ncommit)) return ret; if (wbc->sync_mode == WB_SYNC_NONE) { @@ -2015,7 +1998,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) /* blocking call to cancel all requests and join to a single (head) * request */ - req = nfs_lock_and_join_requests(page, false); + req = nfs_lock_and_join_requests(page); if (IS_ERR(req)) { ret = PTR_ERR(req); diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index d27e75ad25e3..3c69db7d4905 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -784,6 +784,14 @@ out: return status; } + +static void +nfsd4_read_release(union nfsd4_op_u *u) +{ + if (u->read.rd_filp) + fput(u->read.rd_filp); +} + static __be32 nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) @@ -912,6 +920,13 @@ nfsd4_secinfo_no_name(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstat return nfs_ok; } +static void +nfsd4_secinfo_release(union nfsd4_op_u *u) +{ + if (u->secinfo.si_exp) + exp_put(u->secinfo.si_exp); +} + static __be32 nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) @@ -1335,6 +1350,12 @@ out: return nfserr; } +static void +nfsd4_getdeviceinfo_release(union nfsd4_op_u *u) +{ + kfree(u->getdeviceinfo.gd_device); +} + static __be32 nfsd4_layoutget(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) @@ -1415,6 +1436,12 @@ out: return nfserr; } +static void +nfsd4_layoutget_release(union nfsd4_op_u *u) +{ + kfree(u->layoutget.lg_content); +} + static __be32 nfsd4_layoutcommit(struct svc_rqst *rqstp, struct 
nfsd4_compound_state *cstate, union nfsd4_op_u *u) @@ -1541,49 +1568,6 @@ static inline void nfsd4_increment_op_stats(u32 opnum) nfsdstats.nfs4_opcount[opnum]++; } -enum nfsd4_op_flags { - ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */ - ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */ - ALLOWED_AS_FIRST_OP = 1 << 2, /* ops reqired first in compound */ - /* For rfc 5661 section 2.6.3.1.1: */ - OP_HANDLES_WRONGSEC = 1 << 3, - OP_IS_PUTFH_LIKE = 1 << 4, - /* - * These are the ops whose result size we estimate before - * encoding, to avoid performing an op then not being able to - * respond or cache a response. This includes writes and setattrs - * as well as the operations usually called "nonidempotent": - */ - OP_MODIFIES_SOMETHING = 1 << 5, - /* - * Cache compounds containing these ops in the xid-based drc: - * We use the DRC for compounds containing non-idempotent - * operations, *except* those that are 4.1-specific (since - * sessions provide their own EOS), and except for stateful - * operations other than setclientid and setclientid_confirm - * (since sequence numbers provide EOS for open, lock, etc in - * the v4.0 case). - */ - OP_CACHEME = 1 << 6, - /* - * These are ops which clear current state id. - */ - OP_CLEAR_STATEID = 1 << 7, -}; - -struct nfsd4_operation { - __be32 (*op_func)(struct svc_rqst *, struct nfsd4_compound_state *, - union nfsd4_op_u *); - u32 op_flags; - char *op_name; - /* Try to get response size before operation */ - u32 (*op_rsize_bop)(struct svc_rqst *, struct nfsd4_op *); - void (*op_get_currentstateid)(struct nfsd4_compound_state *, - union nfsd4_op_u *); - void (*op_set_currentstateid)(struct nfsd4_compound_state *, - union nfsd4_op_u *); -}; - static const struct nfsd4_operation nfsd4_ops[]; static const char *nfsd4_op_name(unsigned opnum); @@ -1621,7 +1605,7 @@ static __be32 nfs41_check_op_ordering(struct nfsd4_compoundargs *args) return nfs_ok; } -static inline const struct nfsd4_operation *OPDESC(struct nfsd4_op *op) +const struct nfsd4_operation *OPDESC(struct nfsd4_op *op) { return &nfsd4_ops[op->opnum]; } @@ -1694,7 +1678,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) struct nfsd4_compoundargs *args = rqstp->rq_argp; struct nfsd4_compoundres *resp = rqstp->rq_resp; struct nfsd4_op *op; - const struct nfsd4_operation *opdesc; struct nfsd4_compound_state *cstate = &resp->cstate; struct svc_fh *current_fh = &cstate->current_fh; struct svc_fh *save_fh = &cstate->save_fh; @@ -1747,15 +1730,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) goto encode_op; } - opdesc = OPDESC(op); - if (!current_fh->fh_dentry) { - if (!(opdesc->op_flags & ALLOWED_WITHOUT_FH)) { + if (!(op->opdesc->op_flags & ALLOWED_WITHOUT_FH)) { op->status = nfserr_nofilehandle; goto encode_op; } } else if (current_fh->fh_export->ex_fslocs.migrated && - !(opdesc->op_flags & ALLOWED_ON_ABSENT_FS)) { + !(op->opdesc->op_flags & ALLOWED_ON_ABSENT_FS)) { op->status = nfserr_moved; goto encode_op; } @@ -1763,12 +1744,12 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) fh_clear_wcc(current_fh); /* If op is non-idempotent */ - if (opdesc->op_flags & OP_MODIFIES_SOMETHING) { + if (op->opdesc->op_flags & OP_MODIFIES_SOMETHING) { /* * Don't execute this op if we couldn't encode a * succesful reply: */ - u32 plen = opdesc->op_rsize_bop(rqstp, op); + u32 plen = op->opdesc->op_rsize_bop(rqstp, op); /* * Plus if there's another operation, make sure * we'll have space to at least encode an error: @@ -1781,9 +1762,9 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) if 
(op->status) goto encode_op; - if (opdesc->op_get_currentstateid) - opdesc->op_get_currentstateid(cstate, &op->u); - op->status = opdesc->op_func(rqstp, cstate, &op->u); + if (op->opdesc->op_get_currentstateid) + op->opdesc->op_get_currentstateid(cstate, &op->u); + op->status = op->opdesc->op_func(rqstp, cstate, &op->u); /* Only from SEQUENCE */ if (cstate->status == nfserr_replay_cache) { @@ -1792,10 +1773,10 @@ nfsd4_proc_compound(struct svc_rqst *rqstp) goto out; } if (!op->status) { - if (opdesc->op_set_currentstateid) - opdesc->op_set_currentstateid(cstate, &op->u); + if (op->opdesc->op_set_currentstateid) + op->opdesc->op_set_currentstateid(cstate, &op->u); - if (opdesc->op_flags & OP_CLEAR_STATEID) + if (op->opdesc->op_flags & OP_CLEAR_STATEID) clear_current_stateid(cstate); if (need_wrongsec_check(rqstp)) @@ -2160,13 +2141,15 @@ static const struct nfsd4_operation nfsd4_ops[] = { }, [OP_LOCK] = { .op_func = nfsd4_lock, - .op_flags = OP_MODIFIES_SOMETHING, + .op_flags = OP_MODIFIES_SOMETHING | + OP_NONTRIVIAL_ERROR_ENCODE, .op_name = "OP_LOCK", .op_rsize_bop = nfsd4_lock_rsize, .op_set_currentstateid = nfsd4_set_lockstateid, }, [OP_LOCKT] = { .op_func = nfsd4_lockt, + .op_flags = OP_NONTRIVIAL_ERROR_ENCODE, .op_name = "OP_LOCKT", .op_rsize_bop = nfsd4_lock_rsize, }, @@ -2238,6 +2221,7 @@ static const struct nfsd4_operation nfsd4_ops[] = { }, [OP_READ] = { .op_func = nfsd4_read, + .op_release = nfsd4_read_release, .op_name = "OP_READ", .op_rsize_bop = nfsd4_read_rsize, .op_get_currentstateid = nfsd4_get_readstateid, @@ -2287,6 +2271,7 @@ static const struct nfsd4_operation nfsd4_ops[] = { }, [OP_SECINFO] = { .op_func = nfsd4_secinfo, + .op_release = nfsd4_secinfo_release, .op_flags = OP_HANDLES_WRONGSEC, .op_name = "OP_SECINFO", .op_rsize_bop = nfsd4_secinfo_rsize, @@ -2294,14 +2279,16 @@ static const struct nfsd4_operation nfsd4_ops[] = { [OP_SETATTR] = { .op_func = nfsd4_setattr, .op_name = "OP_SETATTR", - .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME + | OP_NONTRIVIAL_ERROR_ENCODE, .op_rsize_bop = nfsd4_setattr_rsize, .op_get_currentstateid = nfsd4_get_setattrstateid, }, [OP_SETCLIENTID] = { .op_func = nfsd4_setclientid, .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS - | OP_MODIFIES_SOMETHING | OP_CACHEME, + | OP_MODIFIES_SOMETHING | OP_CACHEME + | OP_NONTRIVIAL_ERROR_ENCODE, .op_name = "OP_SETCLIENTID", .op_rsize_bop = nfsd4_setclientid_rsize, }, @@ -2388,6 +2375,7 @@ static const struct nfsd4_operation nfsd4_ops[] = { }, [OP_SECINFO_NO_NAME] = { .op_func = nfsd4_secinfo_no_name, + .op_release = nfsd4_secinfo_release, .op_flags = OP_HANDLES_WRONGSEC, .op_name = "OP_SECINFO_NO_NAME", .op_rsize_bop = nfsd4_secinfo_rsize, @@ -2408,12 +2396,14 @@ static const struct nfsd4_operation nfsd4_ops[] = { #ifdef CONFIG_NFSD_PNFS [OP_GETDEVICEINFO] = { .op_func = nfsd4_getdeviceinfo, + .op_release = nfsd4_getdeviceinfo_release, .op_flags = ALLOWED_WITHOUT_FH, .op_name = "OP_GETDEVICEINFO", .op_rsize_bop = nfsd4_getdeviceinfo_rsize, }, [OP_LAYOUTGET] = { .op_func = nfsd4_layoutget, + .op_release = nfsd4_layoutget_release, .op_flags = OP_MODIFIES_SOMETHING, .op_name = "OP_LAYOUTGET", .op_rsize_bop = nfsd4_layoutget_rsize, diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 5f940d2a136b..2c61c6b8ae09 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -159,6 +159,25 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes) */ unsigned int avail = (char *)argp->end - (char *)argp->p; __be32 *p; + + if 
(argp->pagelen == 0) { + struct kvec *vec = &argp->rqstp->rq_arg.tail[0]; + + if (!argp->tail) { + argp->tail = true; + avail = vec->iov_len; + argp->p = vec->iov_base; + argp->end = vec->iov_base + avail; + } + + if (avail < nbytes) + return NULL; + + p = argp->p; + argp->p += XDR_QUADLEN(nbytes); + return p; + } + if (avail + argp->pagelen < nbytes) return NULL; if (avail + PAGE_SIZE < nbytes) /* need more than a page !! */ @@ -1778,7 +1797,7 @@ nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p) typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *); -static nfsd4_dec nfsd4_dec_ops[] = { +static const nfsd4_dec nfsd4_dec_ops[] = { [OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access, [OP_CLOSE] = (nfsd4_dec)nfsd4_decode_close, [OP_COMMIT] = (nfsd4_dec)nfsd4_decode_commit, @@ -1927,6 +1946,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) op->opnum = OP_ILLEGAL; op->status = nfserr_op_illegal; } + op->opdesc = OPDESC(op); /* * We'll try to cache the result in the DRC if any one * op in the compound wants to be cached: @@ -3102,14 +3122,12 @@ nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ struct xdr_stream *xdr = &resp->xdr; __be32 *p; - if (!nfserr) { - p = xdr_reserve_space(xdr, 8); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(access->ac_supported); - *p++ = cpu_to_be32(access->ac_resp_access); - } - return nfserr; + p = xdr_reserve_space(xdr, 8); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(access->ac_supported); + *p++ = cpu_to_be32(access->ac_resp_access); + return 0; } static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_bind_conn_to_session *bcts) @@ -3117,17 +3135,15 @@ static __be32 nfsd4_encode_bind_conn_to_session(struct nfsd4_compoundres *resp, struct xdr_stream *xdr = &resp->xdr; __be32 *p; - if (!nfserr) { - p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 8); - if (!p) - return nfserr_resource; - p = xdr_encode_opaque_fixed(p, bcts->sessionid.data, - NFS4_MAX_SESSIONID_LEN); - *p++ = cpu_to_be32(bcts->dir); - /* Upshifting from TCP to RDMA is not supported */ - *p++ = cpu_to_be32(0); - } - return nfserr; + p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 8); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque_fixed(p, bcts->sessionid.data, + NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(bcts->dir); + /* Upshifting from TCP to RDMA is not supported */ + *p++ = cpu_to_be32(0); + return 0; } static __be32 @@ -3135,10 +3151,7 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c { struct xdr_stream *xdr = &resp->xdr; - if (!nfserr) - nfserr = nfsd4_encode_stateid(xdr, &close->cl_stateid); - - return nfserr; + return nfsd4_encode_stateid(xdr, &close->cl_stateid); } @@ -3148,14 +3161,12 @@ nfsd4_encode_commit(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ struct xdr_stream *xdr = &resp->xdr; __be32 *p; - if (!nfserr) { - p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); - if (!p) - return nfserr_resource; - p = xdr_encode_opaque_fixed(p, commit->co_verf.data, + p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque_fixed(p, commit->co_verf.data, NFS4_VERIFIER_SIZE); - } - return nfserr; + return 0; } static __be32 @@ -3164,15 +3175,13 @@ nfsd4_encode_create(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ struct xdr_stream *xdr = &resp->xdr; __be32 *p; - if (!nfserr) { - p = xdr_reserve_space(xdr, 20); - if (!p) - return 
nfserr_resource; - encode_cinfo(p, &create->cr_cinfo); - nfserr = nfsd4_encode_bitmap(xdr, create->cr_bmval[0], - create->cr_bmval[1], create->cr_bmval[2]); - } - return nfserr; + p = xdr_reserve_space(xdr, 20); + if (!p) + return nfserr_resource; + encode_cinfo(p, &create->cr_cinfo); + nfserr = nfsd4_encode_bitmap(xdr, create->cr_bmval[0], + create->cr_bmval[1], create->cr_bmval[2]); + return 0; } static __be32 @@ -3181,13 +3190,8 @@ nfsd4_encode_getattr(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 struct svc_fh *fhp = getattr->ga_fhp; struct xdr_stream *xdr = &resp->xdr; - if (nfserr) - return nfserr; - - nfserr = nfsd4_encode_fattr(xdr, fhp, fhp->fh_export, fhp->fh_dentry, - getattr->ga_bmval, - resp->rqstp, 0); - return nfserr; + return nfsd4_encode_fattr(xdr, fhp, fhp->fh_export, fhp->fh_dentry, + getattr->ga_bmval, resp->rqstp, 0); } static __be32 @@ -3198,14 +3202,12 @@ nfsd4_encode_getfh(struct nfsd4_compoundres *resp, __be32 nfserr, struct svc_fh unsigned int len; __be32 *p; - if (!nfserr) { - len = fhp->fh_handle.fh_size; - p = xdr_reserve_space(xdr, len + 4); - if (!p) - return nfserr_resource; - p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base, len); - } - return nfserr; + len = fhp->fh_handle.fh_size; + p = xdr_reserve_space(xdr, len + 4); + if (!p) + return nfserr_resource; + p = xdr_encode_opaque(p, &fhp->fh_handle.fh_base, len); + return 0; } /* @@ -3275,10 +3277,7 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l { struct xdr_stream *xdr = &resp->xdr; - if (!nfserr) - nfserr = nfsd4_encode_stateid(xdr, &locku->lu_stateid); - - return nfserr; + return nfsd4_encode_stateid(xdr, &locku->lu_stateid); } @@ -3288,13 +3287,11 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li struct xdr_stream *xdr = &resp->xdr; __be32 *p; - if (!nfserr) { - p = xdr_reserve_space(xdr, 20); - if (!p) - return nfserr_resource; - p = encode_cinfo(p, &link->li_cinfo); - } - return nfserr; + p = xdr_reserve_space(xdr, 20); + if (!p) + return nfserr_resource; + p = encode_cinfo(p, &link->li_cinfo); + return 0; } @@ -3304,12 +3301,9 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op struct xdr_stream *xdr = &resp->xdr; __be32 *p; - if (nfserr) - goto out; - nfserr = nfsd4_encode_stateid(xdr, &open->op_stateid); if (nfserr) - goto out; + return nfserr; p = xdr_reserve_space(xdr, 24); if (!p) return nfserr_resource; @@ -3319,7 +3313,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op nfserr = nfsd4_encode_bitmap(xdr, open->op_bmval[0], open->op_bmval[1], open->op_bmval[2]); if (nfserr) - goto out; + return nfserr; p = xdr_reserve_space(xdr, 4); if (!p) @@ -3392,8 +3386,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op BUG(); } /* XXX save filehandle here */ -out: - return nfserr; + return 0; } static __be32 @@ -3401,10 +3394,7 @@ nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct { struct xdr_stream *xdr = &resp->xdr; - if (!nfserr) - nfserr = nfsd4_encode_stateid(xdr, &oc->oc_resp_stateid); - - return nfserr; + return nfsd4_encode_stateid(xdr, &oc->oc_resp_stateid); } static __be32 @@ -3412,10 +3402,7 @@ nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struc { struct xdr_stream *xdr = &resp->xdr; - if (!nfserr) - nfserr = nfsd4_encode_stateid(xdr, &od->od_stateid); - - return nfserr; + return nfsd4_encode_stateid(xdr, &od->od_stateid); } static __be32 
nfsd4_encode_splice_read( @@ -3552,20 +3539,15 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, struct raparms *ra = NULL; __be32 *p; - if (nfserr) - goto out; - p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */ if (!p) { WARN_ON_ONCE(test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)); - nfserr = nfserr_resource; - goto out; + return nfserr_resource; } if (resp->xdr.buf->page_len && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) { WARN_ON_ONCE(1); - nfserr = nfserr_resource; - goto out; + return nfserr_resource; } xdr_commit_encode(xdr); @@ -3589,9 +3571,6 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, if (nfserr) xdr_truncate_encode(xdr, starting_len); -out: - if (file) - fput(file); return nfserr; } @@ -3605,9 +3584,6 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd int length_offset = xdr->buf->len; __be32 *p; - if (nfserr) - return nfserr; - p = xdr_reserve_space(xdr, 4); if (!p) return nfserr_resource; @@ -3651,9 +3627,6 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 int starting_len = xdr->buf->len; __be32 *p; - if (nfserr) - return nfserr; - p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE); if (!p) return nfserr_resource; @@ -3739,13 +3712,11 @@ nfsd4_encode_remove(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ struct xdr_stream *xdr = &resp->xdr; __be32 *p; - if (!nfserr) { - p = xdr_reserve_space(xdr, 20); - if (!p) - return nfserr_resource; - p = encode_cinfo(p, &remove->rm_cinfo); - } - return nfserr; + p = xdr_reserve_space(xdr, 20); + if (!p) + return nfserr_resource; + p = encode_cinfo(p, &remove->rm_cinfo); + return 0; } static __be32 @@ -3754,19 +3725,16 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ struct xdr_stream *xdr = &resp->xdr; __be32 *p; - if (!nfserr) { - p = xdr_reserve_space(xdr, 40); - if (!p) - return nfserr_resource; - p = encode_cinfo(p, &rename->rn_sinfo); - p = encode_cinfo(p, &rename->rn_tinfo); - } - return nfserr; + p = xdr_reserve_space(xdr, 40); + if (!p) + return nfserr_resource; + p = encode_cinfo(p, &rename->rn_sinfo); + p = encode_cinfo(p, &rename->rn_tinfo); + return 0; } static __be32 -nfsd4_do_encode_secinfo(struct xdr_stream *xdr, - __be32 nfserr, struct svc_export *exp) +nfsd4_do_encode_secinfo(struct xdr_stream *xdr, struct svc_export *exp) { u32 i, nflavs, supported; struct exp_flavor_info *flavs; @@ -3774,9 +3742,6 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, __be32 *p, *flavorsp; static bool report = true; - if (nfserr) - goto out; - nfserr = nfserr_resource; if (exp->ex_nflavors) { flavs = exp->ex_flavors; nflavs = exp->ex_nflavors; @@ -3800,7 +3765,7 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, supported = 0; p = xdr_reserve_space(xdr, 4); if (!p) - goto out; + return nfserr_resource; flavorsp = p++; /* to be backfilled later */ for (i = 0; i < nflavs; i++) { @@ -3812,7 +3777,7 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, p = xdr_reserve_space(xdr, 4 + 4 + XDR_LEN(info.oid.len) + 4 + 4); if (!p) - goto out; + return nfserr_resource; *p++ = cpu_to_be32(RPC_AUTH_GSS); p = xdr_encode_opaque(p, info.oid.data, info.oid.len); *p++ = cpu_to_be32(info.qop); @@ -3821,7 +3786,7 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, supported++; p = xdr_reserve_space(xdr, 4); if (!p) - goto out; + return nfserr_resource; *p++ = cpu_to_be32(pf); } else { if (report) @@ -3833,11 +3798,7 @@ nfsd4_do_encode_secinfo(struct xdr_stream *xdr, if (nflavs != supported) report 
= false; *flavorsp = htonl(supported); - nfserr = 0; -out: - if (exp) - exp_put(exp); - return nfserr; + return 0; } static __be32 @@ -3846,7 +3807,7 @@ nfsd4_encode_secinfo(struct nfsd4_compoundres *resp, __be32 nfserr, { struct xdr_stream *xdr = &resp->xdr; - return nfsd4_do_encode_secinfo(xdr, nfserr, secinfo->si_exp); + return nfsd4_do_encode_secinfo(xdr, secinfo->si_exp); } static __be32 @@ -3855,7 +3816,7 @@ nfsd4_encode_secinfo_no_name(struct nfsd4_compoundres *resp, __be32 nfserr, { struct xdr_stream *xdr = &resp->xdr; - return nfsd4_do_encode_secinfo(xdr, nfserr, secinfo->sin_exp); + return nfsd4_do_encode_secinfo(xdr, secinfo->sin_exp); } /* @@ -3916,16 +3877,14 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w struct xdr_stream *xdr = &resp->xdr; __be32 *p; - if (!nfserr) { - p = xdr_reserve_space(xdr, 16); - if (!p) - return nfserr_resource; - *p++ = cpu_to_be32(write->wr_bytes_written); - *p++ = cpu_to_be32(write->wr_how_written); - p = xdr_encode_opaque_fixed(p, write->wr_verifier.data, - NFS4_VERIFIER_SIZE); - } - return nfserr; + p = xdr_reserve_space(xdr, 16); + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(write->wr_bytes_written); + *p++ = cpu_to_be32(write->wr_how_written); + p = xdr_encode_opaque_fixed(p, write->wr_verifier.data, + NFS4_VERIFIER_SIZE); + return 0; } static __be32 @@ -3938,12 +3897,8 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, char *server_scope; int major_id_sz; int server_scope_sz; - int status = 0; uint64_t minor_id = 0; - if (nfserr) - return nfserr; - major_id = utsname()->nodename; major_id_sz = strlen(major_id); server_scope = utsname()->nodename; @@ -3968,19 +3923,19 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, break; case SP4_MACH_CRED: /* spo_must_enforce bitmap: */ - status = nfsd4_encode_bitmap(xdr, + nfserr = nfsd4_encode_bitmap(xdr, exid->spo_must_enforce[0], exid->spo_must_enforce[1], exid->spo_must_enforce[2]); - if (status) - goto out; + if (nfserr) + return nfserr; /* spo_must_allow bitmap: */ - status = nfsd4_encode_bitmap(xdr, + nfserr = nfsd4_encode_bitmap(xdr, exid->spo_must_allow[0], exid->spo_must_allow[1], exid->spo_must_allow[2]); - if (status) - goto out; + if (nfserr) + return nfserr; break; default: WARN_ON_ONCE(1); @@ -4007,8 +3962,6 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr, /* Implementation id */ *p++ = cpu_to_be32(0); /* zero length nfs_impl_id4 array */ return 0; -out: - return status; } static __be32 @@ -4018,9 +3971,6 @@ nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr, struct xdr_stream *xdr = &resp->xdr; __be32 *p; - if (nfserr) - return nfserr; - p = xdr_reserve_space(xdr, 24); if (!p) return nfserr_resource; @@ -4074,9 +4024,6 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr, struct xdr_stream *xdr = &resp->xdr; __be32 *p; - if (nfserr) - return nfserr; - p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN + 20); if (!p) return nfserr_resource; @@ -4101,9 +4048,6 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_test_stateid_id *stateid, *next; __be32 *p; - if (nfserr) - return nfserr; - p = xdr_reserve_space(xdr, 4 + (4 * test_stateid->ts_num_ids)); if (!p) return nfserr_resource; @@ -4113,7 +4057,7 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, *p++ = stateid->ts_id_status; } - return nfserr; + return 0; } #ifdef CONFIG_NFSD_PNFS @@ -4126,14 +4070,9 @@ 
nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, u32 starting_len = xdr->buf->len, needed_len; __be32 *p; - dprintk("%s: err %d\n", __func__, be32_to_cpu(nfserr)); - if (nfserr) - goto out; - - nfserr = nfserr_resource; p = xdr_reserve_space(xdr, 4); if (!p) - goto out; + return nfserr_resource; *p++ = cpu_to_be32(gdev->gd_layout_type); @@ -4149,42 +4088,33 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, */ if (xdr->buf->len + 4 > gdev->gd_maxcount) goto toosmall; - goto out; + return nfserr; } } - nfserr = nfserr_resource; if (gdev->gd_notify_types) { p = xdr_reserve_space(xdr, 4 + 4); if (!p) - goto out; + return nfserr_resource; *p++ = cpu_to_be32(1); /* bitmap length */ *p++ = cpu_to_be32(gdev->gd_notify_types); } else { p = xdr_reserve_space(xdr, 4); if (!p) - goto out; + return nfserr_resource; *p++ = 0; } - nfserr = 0; -out: - kfree(gdev->gd_device); - dprintk("%s: done: %d\n", __func__, be32_to_cpu(nfserr)); - return nfserr; - + return 0; toosmall: dprintk("%s: maxcount too small\n", __func__); needed_len = xdr->buf->len + 4 /* notifications */; xdr_truncate_encode(xdr, starting_len); p = xdr_reserve_space(xdr, 4); - if (!p) { - nfserr = nfserr_resource; - } else { - *p++ = cpu_to_be32(needed_len); - nfserr = nfserr_toosmall; - } - goto out; + if (!p) + return nfserr_resource; + *p++ = cpu_to_be32(needed_len); + return nfserr_toosmall; } static __be32 @@ -4195,14 +4125,9 @@ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, const struct nfsd4_layout_ops *ops; __be32 *p; - dprintk("%s: err %d\n", __func__, nfserr); - if (nfserr) - goto out; - - nfserr = nfserr_resource; p = xdr_reserve_space(xdr, 36 + sizeof(stateid_opaque_t)); if (!p) - goto out; + return nfserr_resource; *p++ = cpu_to_be32(1); /* we always set return-on-close */ *p++ = cpu_to_be32(lgp->lg_sid.si_generation); @@ -4216,10 +4141,7 @@ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, *p++ = cpu_to_be32(lgp->lg_layout_type); ops = nfsd4_layout_ops[lgp->lg_layout_type]; - nfserr = ops->encode_layoutget(xdr, lgp); -out: - kfree(lgp->lg_content); - return nfserr; + return ops->encode_layoutget(xdr, lgp); } static __be32 @@ -4229,9 +4151,6 @@ nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, struct xdr_stream *xdr = &resp->xdr; __be32 *p; - if (nfserr) - return nfserr; - p = xdr_reserve_space(xdr, 4); if (!p) return nfserr_resource; @@ -4243,7 +4162,7 @@ nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr, p = xdr_encode_hyper(p, lcp->lc_newsize); } - return nfs_ok; + return 0; } static __be32 @@ -4253,16 +4172,13 @@ nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr, struct xdr_stream *xdr = &resp->xdr; __be32 *p; - if (nfserr) - return nfserr; - p = xdr_reserve_space(xdr, 4); if (!p) return nfserr_resource; *p++ = cpu_to_be32(lrp->lrs_present); if (lrp->lrs_present) return nfsd4_encode_stateid(xdr, &lrp->lr_sid); - return nfs_ok; + return 0; } #endif /* CONFIG_NFSD_PNFS */ @@ -4289,16 +4205,14 @@ nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr, { __be32 *p; - if (!nfserr) { - nfserr = nfsd42_encode_write_res(resp, ©->cp_res); - if (nfserr) - return nfserr; + nfserr = nfsd42_encode_write_res(resp, ©->cp_res); + if (nfserr) + return nfserr; - p = xdr_reserve_space(&resp->xdr, 4 + 4); - *p++ = cpu_to_be32(copy->cp_consecutive); - *p++ = cpu_to_be32(copy->cp_synchronous); - } - return nfserr; + p = xdr_reserve_space(&resp->xdr, 4 + 4); + *p++ = 
cpu_to_be32(copy->cp_consecutive); + *p++ = cpu_to_be32(copy->cp_synchronous); + return 0; } static __be32 @@ -4307,14 +4221,11 @@ nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, { __be32 *p; - if (nfserr) - return nfserr; - p = xdr_reserve_space(&resp->xdr, 4 + 8); *p++ = cpu_to_be32(seek->seek_eof); p = xdr_encode_hyper(p, seek->seek_pos); - return nfserr; + return 0; } static __be32 @@ -4330,7 +4241,7 @@ typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); * since we don't need to filter out obsolete ops as this is * done in the decoding phase. */ -static nfsd4_enc nfsd4_enc_ops[] = { +static const nfsd4_enc nfsd4_enc_ops[] = { [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access, [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close, [OP_COMMIT] = (nfsd4_enc)nfsd4_encode_commit, @@ -4449,6 +4360,7 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) struct xdr_stream *xdr = &resp->xdr; struct nfs4_stateowner *so = resp->cstate.replay_owner; struct svc_rqst *rqstp = resp->rqstp; + const struct nfsd4_operation *opdesc = op->opdesc; int post_err_offset; nfsd4_enc encoder; __be32 *p; @@ -4463,10 +4375,15 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) if (op->opnum == OP_ILLEGAL) goto status; + if (op->status && opdesc && + !(opdesc->op_flags & OP_NONTRIVIAL_ERROR_ENCODE)) + goto status; BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || !nfsd4_enc_ops[op->opnum]); encoder = nfsd4_enc_ops[op->opnum]; op->status = encoder(resp, op->status, &op->u); + if (opdesc && opdesc->op_release) + opdesc->op_release(&op->u); xdr_commit_encode(xdr); /* nfsd4_check_resp_size guarantees enough room for error status */ @@ -4573,6 +4490,7 @@ nfs4svc_decode_compoundargs(struct svc_rqst *rqstp, __be32 *p) args->end = rqstp->rq_arg.head[0].iov_base + rqstp->rq_arg.head[0].iov_len; args->pagelist = rqstp->rq_arg.pages; args->pagelen = rqstp->rq_arg.page_len; + args->tail = false; args->tmpp = NULL; args->to_free = NULL; args->ops = args->iops; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 063ae7de2c12..7e3af3ef0917 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -475,7 +475,7 @@ static int nfsd_get_default_max_blksize(void) return ret; } -static struct svc_serv_ops nfsd_thread_sv_ops = { +static const struct svc_serv_ops nfsd_thread_sv_ops = { .svo_shutdown = nfsd_last_thread, .svo_function = nfsd, .svo_enqueue_xprt = svc_xprt_do_enqueue, diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 38d0383dc7f9..bc69d40c4e8b 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -969,7 +969,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, int use_wgather; loff_t pos = offset; unsigned int pflags = current->flags; - int flags = 0; + rwf_t flags = 0; if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) /* diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 72c6ad136107..1e4edbf70052 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -538,6 +538,7 @@ struct nfsd4_seek { struct nfsd4_op { int opnum; + const struct nfsd4_operation * opdesc; __be32 status; union nfsd4_op_u { struct nfsd4_access access; @@ -614,6 +615,7 @@ struct nfsd4_compoundargs { __be32 * end; struct page ** pagelist; int pagelen; + bool tail; __be32 tmp[8]; __be32 * tmpp; struct svcxdr_tmpbuf *to_free; @@ -661,6 +663,7 @@ static inline bool nfsd4_last_compound_op(struct svc_rqst *rqstp) return argp->opcnt == resp->opcnt; } +const struct nfsd4_operation *OPDESC(struct nfsd4_op *op); int nfsd4_max_reply(struct svc_rqst *rqstp, struct 
nfsd4_op *op); void warn_on_nonidempotent_op(struct nfsd4_op *op); @@ -748,6 +751,53 @@ extern __be32 nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *, union nfsd4_op_u *); extern void nfsd4_bump_seqid(struct nfsd4_compound_state *, __be32 nfserr); +enum nfsd4_op_flags { + ALLOWED_WITHOUT_FH = 1 << 0, /* No current filehandle required */ + ALLOWED_ON_ABSENT_FS = 1 << 1, /* ops processed on absent fs */ + ALLOWED_AS_FIRST_OP = 1 << 2, /* ops reqired first in compound */ + /* For rfc 5661 section 2.6.3.1.1: */ + OP_HANDLES_WRONGSEC = 1 << 3, + OP_IS_PUTFH_LIKE = 1 << 4, + /* + * These are the ops whose result size we estimate before + * encoding, to avoid performing an op then not being able to + * respond or cache a response. This includes writes and setattrs + * as well as the operations usually called "nonidempotent": + */ + OP_MODIFIES_SOMETHING = 1 << 5, + /* + * Cache compounds containing these ops in the xid-based drc: + * We use the DRC for compounds containing non-idempotent + * operations, *except* those that are 4.1-specific (since + * sessions provide their own EOS), and except for stateful + * operations other than setclientid and setclientid_confirm + * (since sequence numbers provide EOS for open, lock, etc in + * the v4.0 case). + */ + OP_CACHEME = 1 << 6, + /* + * These are ops which clear current state id. + */ + OP_CLEAR_STATEID = 1 << 7, + /* Most ops return only an error on failure; some may do more: */ + OP_NONTRIVIAL_ERROR_ENCODE = 1 << 8, +}; + +struct nfsd4_operation { + __be32 (*op_func)(struct svc_rqst *, struct nfsd4_compound_state *, + union nfsd4_op_u *); + void (*op_release)(union nfsd4_op_u *); + u32 op_flags; + char *op_name; + /* Try to get response size before operation */ + u32 (*op_rsize_bop)(struct svc_rqst *, struct nfsd4_op *); + void (*op_get_currentstateid)(struct nfsd4_compound_state *, + union nfsd4_op_u *); + void (*op_set_currentstateid)(struct nfsd4_compound_state *, + union nfsd4_op_u *); +}; + + #endif /* diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index f11a3ad2df0c..8616c46d33da 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -312,10 +312,9 @@ void nilfs_copy_back_pages(struct address_space *dmap, pagevec_init(&pvec, 0); repeat: - n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE); + n = pagevec_lookup(&pvec, smap, &index); if (!n) return; - index = pvec.pages[n - 1]->index + 1; for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i], *dpage; diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index e73c86d9855c..6c5009cc4e6f 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -400,7 +400,7 @@ static struct bio *nilfs_alloc_seg_bio(struct the_nilfs *nilfs, sector_t start, bio = bio_alloc(GFP_NOIO, nr_vecs); } if (likely(bio)) { - bio->bi_bdev = nilfs->ns_bdev; + bio_set_dev(bio, nilfs->ns_bdev); bio->bi_iter.bi_sector = start << (nilfs->ns_blocksize_bits - 9); } diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 2430a0415995..cba328315929 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -133,7 +133,7 @@ static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) kmem_cache_free(dnotify_mark_cache, dn_mark); } -static struct fsnotify_ops dnotify_fsnotify_ops = { +static const struct fsnotify_ops dnotify_fsnotify_ops = { .handle_event = dnotify_handle_event, .free_mark = dnotify_free_mark, }; diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 0ee19ecc982d..1a24be9e8405 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ 
-1506,7 +1506,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); - err = filemap_write_and_wait_range(vi->i_mapping, start, end); + err = file_write_and_wait_range(filp, start, end); if (err) return err; inode_lock(vi); diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index c4f68c338735..331910fa8442 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1989,7 +1989,7 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); - err = filemap_write_and_wait_range(vi->i_mapping, start, end); + err = file_write_and_wait_range(filp, start, end); if (err) return err; inode_lock(vi); diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index e50a387959bf..40b5cc97f7b0 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -221,7 +221,7 @@ out: /* * Set the access or default ACL of an inode. */ -int ocfs2_set_acl(handle_t *handle, +static int ocfs2_set_acl(handle_t *handle, struct inode *inode, struct buffer_head *di_bh, int type, diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 2783a75b3999..7be0bb756286 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -28,13 +28,6 @@ struct ocfs2_acl_entry { struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type); int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type); -int ocfs2_set_acl(handle_t *handle, - struct inode *inode, - struct buffer_head *di_bh, - int type, - struct posix_acl *acl, - struct ocfs2_alloc_context *meta_ac, - struct ocfs2_alloc_context *data_ac); extern int ocfs2_acl_chmod(struct inode *, struct buffer_head *); extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, struct buffer_head *, struct buffer_head *, diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index fb15a96df0b6..a177eae3aa1a 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -955,8 +955,7 @@ int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno, /* * How many free extents have we got before we need more meta data? */ -int ocfs2_num_free_extents(struct ocfs2_super *osb, - struct ocfs2_extent_tree *et) +int ocfs2_num_free_extents(struct ocfs2_extent_tree *et) { int retval; struct ocfs2_extent_list *el = NULL; @@ -1933,14 +1932,12 @@ out: * the new changes. * * left_rec: the record on the left. - * left_child_el: is the child list pointed to by left_rec * right_rec: the record to the right of left_rec * right_child_el: is the child list pointed to by right_rec * * By definition, this only works on interior nodes. 
*/ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, - struct ocfs2_extent_list *left_child_el, struct ocfs2_extent_rec *right_rec, struct ocfs2_extent_list *right_child_el) { @@ -2003,7 +2000,7 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el, */ BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1)); - ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el, + ocfs2_adjust_adjacent_records(&root_el->l_recs[i], &root_el->l_recs[i + 1], right_el); } @@ -2060,8 +2057,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle, el = right_path->p_node[i].el; right_rec = &el->l_recs[0]; - ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, - right_el); + ocfs2_adjust_adjacent_records(left_rec, right_rec, right_el); ocfs2_journal_dirty(handle, left_path->p_node[i].bh); ocfs2_journal_dirty(handle, right_path->p_node[i].bh); @@ -2509,7 +2505,7 @@ out_ret_path: static int ocfs2_update_edge_lengths(handle_t *handle, struct ocfs2_extent_tree *et, - int subtree_index, struct ocfs2_path *path) + struct ocfs2_path *path) { int i, idx, ret; struct ocfs2_extent_rec *rec; @@ -2755,8 +2751,7 @@ static int ocfs2_rotate_subtree_left(handle_t *handle, if (del_right_subtree) { ocfs2_unlink_subtree(handle, et, left_path, right_path, subtree_index, dealloc); - ret = ocfs2_update_edge_lengths(handle, et, subtree_index, - left_path); + ret = ocfs2_update_edge_lengths(handle, et, left_path); if (ret) { mlog_errno(ret); goto out; @@ -3060,8 +3055,7 @@ static int ocfs2_remove_rightmost_path(handle_t *handle, ocfs2_unlink_subtree(handle, et, left_path, path, subtree_index, dealloc); - ret = ocfs2_update_edge_lengths(handle, et, subtree_index, - left_path); + ret = ocfs2_update_edge_lengths(handle, et, left_path); if (ret) { mlog_errno(ret); goto out; @@ -4790,7 +4784,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle, if (mark_unwritten) flags = OCFS2_EXT_UNWRITTEN; - free_extents = ocfs2_num_free_extents(osb, et); + free_extents = ocfs2_num_free_extents(et); if (free_extents < 0) { status = free_extents; mlog_errno(status); @@ -5668,7 +5662,7 @@ static int ocfs2_reserve_blocks_for_rec_trunc(struct inode *inode, *ac = NULL; - num_free_extents = ocfs2_num_free_extents(osb, et); + num_free_extents = ocfs2_num_free_extents(et); if (num_free_extents < 0) { ret = num_free_extents; mlog_errno(ret); diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 4a5152ec88a3..27b75cf32cfa 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -144,8 +144,7 @@ int ocfs2_remove_btree_range(struct inode *inode, struct ocfs2_cached_dealloc_ctxt *dealloc, u64 refcount_loc, bool refcount_tree_locked); -int ocfs2_num_free_extents(struct ocfs2_super *osb, - struct ocfs2_extent_tree *et); +int ocfs2_num_free_extents(struct ocfs2_extent_tree *et); /* * how many new metadata chunks would an allocation need at maximum? diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index ffe003982d95..d0206042d068 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -505,8 +505,7 @@ static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc, } } -static void o2hb_wait_on_io(struct o2hb_region *reg, - struct o2hb_bio_wait_ctxt *wc) +static void o2hb_wait_on_io(struct o2hb_bio_wait_ctxt *wc) { o2hb_bio_wait_dec(wc, 1); wait_for_completion(&wc->wc_io_complete); @@ -554,7 +553,7 @@ static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg, /* Must put everything in 512 byte sectors for the bio... 
*/ bio->bi_iter.bi_sector = (reg->hr_start_block + cs) << (bits - 9); - bio->bi_bdev = reg->hr_bdev; + bio_set_dev(bio, reg->hr_bdev); bio->bi_private = wc; bio->bi_end_io = o2hb_bio_end_io; bio_set_op_attrs(bio, op, op_flags); @@ -608,7 +607,7 @@ static int o2hb_read_slots(struct o2hb_region *reg, status = 0; bail_and_wait: - o2hb_wait_on_io(reg, &wc); + o2hb_wait_on_io(&wc); if (wc.wc_error && !status) status = wc.wc_error; @@ -1162,7 +1161,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) * before we can go to steady state. This ensures that * people we find in our steady state have seen us. */ - o2hb_wait_on_io(reg, &write_wc); + o2hb_wait_on_io(&write_wc); if (write_wc.wc_error) { /* Do not re-arm the write timeout on I/O error - we * can't be sure that the new block ever made it to @@ -1275,7 +1274,7 @@ static int o2hb_thread(void *data) o2hb_prepare_block(reg, 0); ret = o2hb_issue_node_write(reg, &write_wc); if (ret == 0) - o2hb_wait_on_io(reg, &write_wc); + o2hb_wait_on_io(&write_wc); else mlog_errno(ret); } @@ -2576,22 +2575,6 @@ void o2hb_unregister_callback(const char *region_uuid, } EXPORT_SYMBOL_GPL(o2hb_unregister_callback); -int o2hb_check_node_heartbeating(u8 node_num) -{ - unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; - - o2hb_fill_node_map(testing_map, sizeof(testing_map)); - if (!test_bit(node_num, testing_map)) { - mlog(ML_HEARTBEAT, - "node (%u) does not have heartbeating enabled.\n", - node_num); - return 0; - } - - return 1; -} -EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating); - int o2hb_check_node_heartbeating_no_sem(u8 node_num) { unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; @@ -2626,23 +2609,6 @@ int o2hb_check_node_heartbeating_from_callback(u8 node_num) } EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_from_callback); -/* Makes sure our local node is configured with a node number, and is - * heartbeating. */ -int o2hb_check_local_node_heartbeating(void) -{ - u8 node_num; - - /* if this node was set then we have networking */ - node_num = o2nm_this_node(); - if (node_num == O2NM_MAX_NODES) { - mlog(ML_HEARTBEAT, "this node has not been configured.\n"); - return 0; - } - - return o2hb_check_node_heartbeating(node_num); -} -EXPORT_SYMBOL_GPL(o2hb_check_local_node_heartbeating); - /* * this is just a hack until we get the plumbing which flips file systems * read only and drops the hb ref instead of killing the node dead. 
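The nilfs2 segbuf and ocfs2 heartbeat hunks above both replace direct assignment of bio->bi_bdev with the bio_set_dev() helper. As a minimal caller-side sketch (not taken from this commit; my_fs_submit_block(), its arguments, and the surrounding filesystem are hypothetical, and it assumes the 4.13/4.14-era block APIs bio_alloc(), bio_set_dev(), bio_add_page(), bio_set_op_attrs() and submit_bio()), a filesystem targeting a bio at its backing device after this conversion might look like:

#include <linux/bio.h>
#include <linux/blk_types.h>
#include <linux/fs.h>

/*
 * Illustrative sketch only: shows the bi_bdev -> bio_set_dev() pattern
 * seen in the hunks above. my_fs_submit_block() is a made-up helper,
 * not part of the patch.
 */
static void my_fs_submit_block(struct super_block *sb, sector_t sector,
			       struct page *page, bio_end_io_t *end_io)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);

	if (!bio)
		return;

	/* Previously open-coded as: bio->bi_bdev = sb->s_bdev; */
	bio_set_dev(bio, sb->s_bdev);
	bio->bi_iter.bi_sector = sector;
	bio->bi_end_io = end_io;
	bio->bi_private = page;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	bio_set_op_attrs(bio, REQ_OP_READ, 0);
	submit_bio(bio);
}
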
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 3ecb9f337b7d..febe6312ceff 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -3249,7 +3249,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, spin_unlock(&OCFS2_I(dir)->ip_lock); ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(dir), parent_fe_bh); - num_free_extents = ocfs2_num_free_extents(osb, &et); + num_free_extents = ocfs2_num_free_extents(&et); if (num_free_extents < 0) { status = num_free_extents; mlog_errno(status); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index bfeb647459d9..6e41fc8fabbe 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -196,7 +196,7 @@ static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end, if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return -EROFS; - err = filemap_write_and_wait_range(inode->i_mapping, start, end); + err = file_write_and_wait_range(file, start, end); if (err) return err; @@ -713,13 +713,6 @@ leave: return status; } -int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, - u32 clusters_to_add, int mark_unwritten) -{ - return __ocfs2_extend_allocation(inode, logical_start, - clusters_to_add, mark_unwritten); -} - /* * While a write will already be ordering the data, a truncate will not. * Thus, we need to explicitly order the zeroed pages. diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index d5e5fa7f0743..36304434eacf 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1348,7 +1348,6 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) ocfs2_schedule_truncate_log_flush(osb, 0); osb->local_alloc_copy = NULL; - osb->dirty = 0; /* queue to recover orphan slots for all offline slots */ ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index e52a2852d50d..7eb3b0a6347e 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -175,7 +175,7 @@ static int ocfs2_lock_allocators_move_extents(struct inode *inode, unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - num_free_extents = ocfs2_num_free_extents(osb, et); + num_free_extents = ocfs2_num_free_extents(et); if (num_free_extents < 0) { ret = num_free_extents; mlog_errno(ret); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 0c39d71c67a1..9a50f222ac97 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -320,7 +320,6 @@ struct ocfs2_super u64 system_dir_blkno; u64 bitmap_blkno; u32 bitmap_cpg; - u8 *uuid; char *uuid_str; u32 uuid_hash; u8 *vol_label; @@ -388,9 +387,8 @@ struct ocfs2_super unsigned int osb_resv_level; unsigned int osb_dir_resv_level; - /* Next three fields are for local node slot recovery during + /* Next two fields are for local node slot recovery during * mount. */ - int dirty; struct ocfs2_dinode *local_alloc_copy; struct ocfs2_quota_recovery *quota_rec; diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index cec495a921e3..c94b6baaa551 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -33,7 +33,7 @@ * Locking of quotas with OCFS2 is rather complex. Here are rules that * should be obeyed by all the functions: * - any write of quota structure (either to local or global file) is protected - * by dqio_mutex or dquot->dq_lock. + * by dqio_sem or dquot->dq_lock. * - any modification of global quota file holds inode cluster lock, i_mutex, * and ip_alloc_sem of the global quota file (achieved by * ocfs2_lock_global_qf). It also has to hold qinfo_lock. 
@@ -42,9 +42,9 @@ * * A rough sketch of locking dependencies (lf = local file, gf = global file): * Normal filesystem operation: - * start_trans -> dqio_mutex -> write to lf + * start_trans -> dqio_sem -> write to lf * Syncing of local and global file: - * ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock -> + * ocfs2_lock_global_qf -> start_trans -> dqio_sem -> qinfo_lock -> * write to gf * -> write to lf * Acquire dquot for the first time: @@ -60,7 +60,7 @@ * Recovery: * inode cluster lock of recovered lf * -> read bitmaps -> ip_alloc_sem of lf - * -> ocfs2_lock_global_qf -> start_trans -> dqio_mutex -> qinfo_lock -> + * -> ocfs2_lock_global_qf -> start_trans -> dqio_sem -> qinfo_lock -> * write to gf */ @@ -443,13 +443,17 @@ static int __ocfs2_global_write_info(struct super_block *sb, int type) int ocfs2_global_write_info(struct super_block *sb, int type) { int err; - struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; + struct quota_info *dqopt = sb_dqopt(sb); + struct ocfs2_mem_dqinfo *info = dqopt->info[type].dqi_priv; + down_write(&dqopt->dqio_sem); err = ocfs2_qinfo_lock(info, 1); if (err < 0) - return err; + goto out_sem; err = __ocfs2_global_write_info(sb, type); ocfs2_qinfo_unlock(info, 1); +out_sem: + up_write(&dqopt->dqio_sem); return err; } @@ -500,7 +504,7 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing) /* Update space and inode usage. Get also other information from * global quota file so that we don't overwrite any changes there. * We are */ - spin_lock(&dq_data_lock); + spin_lock(&dquot->dq_dqb_lock); spacechange = dquot->dq_dqb.dqb_curspace - OCFS2_DQUOT(dquot)->dq_origspace; inodechange = dquot->dq_dqb.dqb_curinodes - @@ -556,7 +560,7 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing) __clear_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes; - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); err = ocfs2_qinfo_lock(info, freeing); if (err < 0) { mlog(ML_ERROR, "Failed to lock quota info, losing quota write" @@ -611,7 +615,7 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type) mlog_errno(status); goto out_ilock; } - mutex_lock(&sb_dqopt(sb)->dqio_mutex); + down_write(&sb_dqopt(sb)->dqio_sem); status = ocfs2_sync_dquot(dquot); if (status < 0) mlog_errno(status); @@ -619,7 +623,7 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type) status = ocfs2_local_write_dquot(dquot); if (status < 0) mlog_errno(status); - mutex_unlock(&sb_dqopt(sb)->dqio_mutex); + up_write(&sb_dqopt(sb)->dqio_sem); ocfs2_commit_trans(osb, handle); out_ilock: ocfs2_unlock_global_qf(oinfo, 1); @@ -666,9 +670,9 @@ static int ocfs2_write_dquot(struct dquot *dquot) mlog_errno(status); goto out; } - mutex_lock(&sb_dqopt(dquot->dq_sb)->dqio_mutex); + down_write(&sb_dqopt(dquot->dq_sb)->dqio_sem); status = ocfs2_local_write_dquot(dquot); - mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex); + up_write(&sb_dqopt(dquot->dq_sb)->dqio_sem); ocfs2_commit_trans(osb, handle); out: return status; @@ -920,10 +924,10 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot) /* In case user set some limits, sync dquot immediately to global * quota file so that information propagates quicker */ - spin_lock(&dq_data_lock); + spin_lock(&dquot->dq_dqb_lock); if (dquot->dq_flags & mask) sync = 1; - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); /* This is a slight hack but 
we can't afford getting global quota * lock if we already have a transaction started. */ if (!sync || journal_current_handle()) { @@ -939,7 +943,7 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot) mlog_errno(status); goto out_ilock; } - mutex_lock(&sb_dqopt(sb)->dqio_mutex); + down_write(&sb_dqopt(sb)->dqio_sem); status = ocfs2_sync_dquot(dquot); if (status < 0) { mlog_errno(status); @@ -948,7 +952,7 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot) /* Now write updated local dquot structure */ status = ocfs2_local_write_dquot(dquot); out_dlock: - mutex_unlock(&sb_dqopt(sb)->dqio_mutex); + up_write(&sb_dqopt(sb)->dqio_sem); ocfs2_commit_trans(osb, handle); out_ilock: ocfs2_unlock_global_qf(oinfo, 1); diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 32c5a40c1257..aa700fd10610 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -520,8 +520,8 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, mlog_errno(status); goto out_drop_lock; } - mutex_lock(&sb_dqopt(sb)->dqio_mutex); - spin_lock(&dq_data_lock); + down_write(&sb_dqopt(sb)->dqio_sem); + spin_lock(&dquot->dq_dqb_lock); /* Add usage from quota entry into quota changes * of our node. Auxiliary variables are important * due to signedness */ @@ -529,7 +529,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, inodechange = le64_to_cpu(dqblk->dqb_inodemod); dquot->dq_dqb.dqb_curspace += spacechange; dquot->dq_dqb.dqb_curinodes += inodechange; - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); /* We want to drop reference held by the crashed * node. Since we have our own reference we know * global structure actually won't be freed. */ @@ -553,7 +553,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode, unlock_buffer(qbh); ocfs2_journal_dirty(handle, qbh); out_commit: - mutex_unlock(&sb_dqopt(sb)->dqio_mutex); + up_write(&sb_dqopt(sb)->dqio_sem); ocfs2_commit_trans(OCFS2_SB(sb), handle); out_drop_lock: ocfs2_unlock_global_qf(oinfo, 1); @@ -691,9 +691,6 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) struct ocfs2_quota_recovery *rec; int locked = 0; - /* We don't need the lock and we have to acquire quota file locks - * which will later depend on this lock */ - mutex_unlock(&sb_dqopt(sb)->dqio_mutex); info->dqi_max_spc_limit = 0x7fffffffffffffffLL; info->dqi_max_ino_limit = 0x7fffffffffffffffLL; oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS); @@ -772,7 +769,6 @@ static int ocfs2_local_read_info(struct super_block *sb, int type) goto out_err; } - mutex_lock(&sb_dqopt(sb)->dqio_mutex); return 0; out_err: if (oinfo) { @@ -786,7 +782,6 @@ out_err: kfree(oinfo); } brelse(bh); - mutex_lock(&sb_dqopt(sb)->dqio_mutex); return -1; } @@ -882,12 +877,12 @@ static void olq_set_dquot(struct buffer_head *bh, void *private) dqblk->dqb_id = cpu_to_le64(from_kqid(&init_user_ns, od->dq_dquot.dq_id)); - spin_lock(&dq_data_lock); + spin_lock(&od->dq_dquot.dq_dqb_lock); dqblk->dqb_spacemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curspace - od->dq_origspace); dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes - od->dq_originodes); - spin_unlock(&dq_data_lock); + spin_unlock(&od->dq_dquot.dq_dqb_lock); trace_olq_set_dquot( (unsigned long long)le64_to_cpu(dqblk->dqb_spacemod), (unsigned long long)le64_to_cpu(dqblk->dqb_inodemod), diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index f8933cb53d68..ab156e35ec00 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -2851,7 +2851,7 
@@ static int ocfs2_lock_refcount_allocators(struct super_block *sb, int *credits) { int ret = 0, meta_add = 0; - int num_free_extents = ocfs2_num_free_extents(OCFS2_SB(sb), et); + int num_free_extents = ocfs2_num_free_extents(et); if (num_free_extents < 0) { ret = num_free_extents; diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 6ad3533940ba..71f22c8fbffd 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -2700,7 +2700,7 @@ int ocfs2_lock_allocators(struct inode *inode, BUG_ON(clusters_to_add != 0 && data_ac == NULL); - num_free_extents = ocfs2_num_free_extents(osb, et); + num_free_extents = ocfs2_num_free_extents(et); if (num_free_extents < 0) { ret = num_free_extents; mlog_errno(ret); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 83005f486451..3f936be379a9 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2486,7 +2486,6 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) if (dirty) { /* Recovery will be completed after we've mounted the * rest of the volume. */ - osb->dirty = 1; osb->local_alloc_copy = local_alloc; local_alloc = NULL; } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index f70c3778d600..5fdf269ba82e 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -6800,7 +6800,7 @@ static int ocfs2_lock_reflink_xattr_rec_allocators( *credits += 1; /* count in the xattr tree change. */ - num_free_extents = ocfs2_num_free_extents(osb, xt_et); + num_free_extents = ocfs2_num_free_extents(xt_et); if (num_free_extents < 0) { ret = num_free_extents; mlog_errno(ret); diff --git a/fs/open.c b/fs/open.c index 35bb784763a4..7ea118471dce 100644 --- a/fs/open.c +++ b/fs/open.c @@ -96,7 +96,7 @@ long vfs_truncate(const struct path *path, loff_t length) * write access on the upper inode, not on the overlay inode. For * non-overlay filesystems d_real() is an identity function. 
*/ - upperdentry = d_real(path->dentry, NULL, O_WRONLY); + upperdentry = d_real(path->dentry, NULL, O_WRONLY, 0); error = PTR_ERR(upperdentry); if (IS_ERR(upperdentry)) goto mnt_drop_write_and_out; @@ -670,12 +670,12 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group) if (!f.file) goto out; - error = mnt_want_write_file(f.file); + error = mnt_want_write_file_path(f.file); if (error) goto out_fput; audit_file(f.file); error = chown_common(&f.file->f_path, user, group); - mnt_drop_write_file(f.file); + mnt_drop_write_file_path(f.file); out_fput: fdput(f); out: @@ -857,7 +857,7 @@ EXPORT_SYMBOL(file_path); int vfs_open(const struct path *path, struct file *file, const struct cred *cred) { - struct dentry *dentry = d_real(path->dentry, NULL, file->f_flags); + struct dentry *dentry = d_real(path->dentry, NULL, file->f_flags, 0); if (IS_ERR(dentry)) return PTR_ERR(dentry); diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index acb6f97deb97..aad97b30d5e6 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -241,7 +241,7 @@ struct ovl_fh *ovl_encode_fh(struct dentry *lower, bool is_upper) int buflen = MAX_HANDLE_SZ; uuid_t *uuid = &lower->d_sb->s_uuid; - buf = kmalloc(buflen, GFP_TEMPORARY); + buf = kmalloc(buflen, GFP_KERNEL); if (!buf) return ERR_PTR(-ENOMEM); diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 48b70e6490f3..3309b1912241 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -155,7 +155,7 @@ static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry) static void ovl_instantiate(struct dentry *dentry, struct inode *inode, struct dentry *newdentry, bool hardlink) { - ovl_dentry_version_inc(dentry->d_parent); + ovl_dentry_version_inc(dentry->d_parent, false); ovl_dentry_set_upper_alias(dentry); if (!hardlink) { ovl_inode_update(inode, newdentry); @@ -692,7 +692,7 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir) if (flags) ovl_cleanup(wdir, upper); - ovl_dentry_version_inc(dentry->d_parent); + ovl_dentry_version_inc(dentry->d_parent, true); out_d_drop: d_drop(dentry); dput(whiteout); @@ -742,7 +742,7 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) err = vfs_rmdir(dir, upper); else err = vfs_unlink(dir, upper, NULL); - ovl_dentry_version_inc(dentry->d_parent); + ovl_dentry_version_inc(dentry->d_parent, ovl_type_origin(dentry)); /* * Keeping this dentry hashed would mean having to release @@ -833,7 +833,7 @@ static char *ovl_get_redirect(struct dentry *dentry, bool samedir) goto out; } - buf = ret = kmalloc(buflen, GFP_TEMPORARY); + buf = ret = kmalloc(buflen, GFP_KERNEL); if (!buf) goto out; @@ -1089,8 +1089,9 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, drop_nlink(d_inode(new)); } - ovl_dentry_version_inc(old->d_parent); - ovl_dentry_version_inc(new->d_parent); + ovl_dentry_version_inc(old->d_parent, + !overwrite && ovl_type_origin(new)); + ovl_dentry_version_inc(new->d_parent, ovl_type_origin(old)); out_dput: dput(newdentry); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 5bc71642b226..a619addecafc 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -498,6 +498,9 @@ static int ovl_set_nlink_common(struct dentry *dentry, len = snprintf(buf, sizeof(buf), format, (int) (inode->i_nlink - realinode->i_nlink)); + if (WARN_ON(len >= sizeof(buf))) + return -EIO; + return ovl_do_setxattr(ovl_dentry_upper(dentry), OVL_XATTR_NLINK, buf, len, 0); } @@ -576,10 +579,13 @@ static int ovl_inode_set(struct inode *inode, void *data) 
static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry, struct dentry *upperdentry) { - struct inode *lowerinode = lowerdentry ? d_inode(lowerdentry) : NULL; - - /* Lower (origin) inode must match, even if NULL */ - if (ovl_inode_lower(inode) != lowerinode) + /* + * Allow non-NULL lower inode in ovl_inode even if lowerdentry is NULL. + * This happens when finding a copied up overlay inode for a renamed + * or hardlinked overlay dentry and lower dentry cannot be followed + * by origin because lower fs does not support file handles. + */ + if (lowerdentry && ovl_inode_lower(inode) != d_inode(lowerdentry)) return false; /* diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 8aef2b304b2d..c3addd1114f1 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -38,7 +38,7 @@ static int ovl_check_redirect(struct dentry *dentry, struct ovl_lookup_data *d, return 0; goto fail; } - buf = kzalloc(prelen + res + strlen(post) + 1, GFP_TEMPORARY); + buf = kzalloc(prelen + res + strlen(post) + 1, GFP_KERNEL); if (!buf) return -ENOMEM; @@ -103,7 +103,7 @@ static struct ovl_fh *ovl_get_origin_fh(struct dentry *dentry) if (res == 0) return NULL; - fh = kzalloc(res, GFP_TEMPORARY); + fh = kzalloc(res, GFP_KERNEL); if (!fh) return ERR_PTR(-ENOMEM); @@ -309,7 +309,7 @@ static int ovl_check_origin(struct dentry *upperdentry, BUG_ON(*ctrp); if (!*stackp) - *stackp = kmalloc(sizeof(struct path), GFP_TEMPORARY); + *stackp = kmalloc(sizeof(struct path), GFP_KERNEL); if (!*stackp) { dput(origin); return -ENOMEM; @@ -418,7 +418,7 @@ int ovl_verify_index(struct dentry *index, struct path *lowerstack, err = -ENOMEM; len = index->d_name.len / 2; - fh = kzalloc(len, GFP_TEMPORARY); + fh = kzalloc(len, GFP_KERNEL); if (!fh) goto fail; @@ -478,7 +478,7 @@ int ovl_get_index_name(struct dentry *origin, struct qstr *name) return PTR_ERR(fh); err = -ENOMEM; - n = kzalloc(fh->len * 2, GFP_TEMPORARY); + n = kzalloc(fh->len * 2, GFP_KERNEL); if (n) { s = bin2hex(n, fh, fh->len); *name = (struct qstr) QSTR_INIT(n, s - n); @@ -646,7 +646,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (!d.stop && poe->numlower) { err = -ENOMEM; stack = kcalloc(ofs->numlower, sizeof(struct path), - GFP_TEMPORARY); + GFP_KERNEL); if (!stack) goto out_put_upper; } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index e927a62c97ae..d4e8c1a08fb0 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -204,8 +204,8 @@ struct dentry *ovl_i_dentry_upper(struct inode *inode); struct inode *ovl_inode_upper(struct inode *inode); struct inode *ovl_inode_lower(struct inode *inode); struct inode *ovl_inode_real(struct inode *inode); -struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry); -void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache); +struct ovl_dir_cache *ovl_dir_cache(struct inode *inode); +void ovl_set_dir_cache(struct inode *inode, struct ovl_dir_cache *cache); bool ovl_dentry_is_opaque(struct dentry *dentry); bool ovl_dentry_is_whiteout(struct dentry *dentry); void ovl_dentry_set_opaque(struct dentry *dentry); @@ -217,7 +217,7 @@ void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect); void ovl_inode_init(struct inode *inode, struct dentry *upperdentry, struct dentry *lowerdentry); void ovl_inode_update(struct inode *inode, struct dentry *upperdentry); -void ovl_dentry_version_inc(struct dentry *dentry); +void ovl_dentry_version_inc(struct dentry *dentry, bool impurity); u64 ovl_dentry_version_get(struct 
dentry *dentry); bool ovl_is_whiteout(struct dentry *dentry); struct file *ovl_path_open(struct path *path, int flags); @@ -229,6 +229,7 @@ int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, int xerr); int ovl_set_impure(struct dentry *dentry, struct dentry *upperdentry); void ovl_set_flag(unsigned long flag, struct inode *inode); +void ovl_clear_flag(unsigned long flag, struct inode *inode); bool ovl_test_flag(unsigned long flag, struct inode *inode); bool ovl_inuse_trylock(struct dentry *dentry); void ovl_inuse_unlock(struct dentry *dentry); @@ -256,6 +257,7 @@ extern const struct file_operations ovl_dir_operations; int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list); void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list); void ovl_cache_free(struct list_head *list); +void ovl_dir_cache_free(struct inode *inode); int ovl_check_d_type_supported(struct path *realpath); void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, struct dentry *dentry, int level); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 3d424a51cabb..62e9b22a2077 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -15,11 +15,13 @@ #include <linux/rbtree.h> #include <linux/security.h> #include <linux/cred.h> +#include <linux/ratelimit.h> #include "overlayfs.h" struct ovl_cache_entry { unsigned int len; unsigned int type; + u64 real_ino; u64 ino; struct list_head l_node; struct rb_node node; @@ -32,18 +34,20 @@ struct ovl_dir_cache { long refcount; u64 version; struct list_head entries; + struct rb_root root; }; struct ovl_readdir_data { struct dir_context ctx; struct dentry *dentry; bool is_lowest; - struct rb_root root; + struct rb_root *root; struct list_head *list; struct list_head middle; struct ovl_cache_entry *first_maybe_whiteout; int count; int err; + bool is_upper; bool d_type_supported; }; @@ -58,7 +62,33 @@ struct ovl_dir_file { static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n) { - return container_of(n, struct ovl_cache_entry, node); + return rb_entry(n, struct ovl_cache_entry, node); +} + +static bool ovl_cache_entry_find_link(const char *name, int len, + struct rb_node ***link, + struct rb_node **parent) +{ + bool found = false; + struct rb_node **newp = *link; + + while (!found && *newp) { + int cmp; + struct ovl_cache_entry *tmp; + + *parent = *newp; + tmp = ovl_cache_entry_from_node(*newp); + cmp = strncmp(name, tmp->name, len); + if (cmp > 0) + newp = &tmp->node.rb_right; + else if (cmp < 0 || len < tmp->len) + newp = &tmp->node.rb_left; + else + found = true; + } + *link = newp; + + return found; } static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root, @@ -82,6 +112,32 @@ static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root, return NULL; } +static bool ovl_calc_d_ino(struct ovl_readdir_data *rdd, + struct ovl_cache_entry *p) +{ + /* Don't care if not doing ovl_iter() */ + if (!rdd->dentry) + return false; + + /* Always recalc d_ino for parent */ + if (strcmp(p->name, "..") == 0) + return true; + + /* If this is lower, then native d_ino will do */ + if (!rdd->is_upper) + return false; + + /* + * Recalc d_ino for '.' and for all entries if dir is impure (contains + * copied up entries) + */ + if ((p->name[0] == '.' 
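ovl_cache_entry_find_link() factors the rbtree descent out of ovl_cache_entry_add_rb() so the same walk can be reused when linking entries into the new impure-dir cache further down. The ordering it implements compares at most the probe's length and then breaks ties on length, which is a total order as long as names contain no embedded NULs. A small userspace sketch of just that comparison rule (the sample names are made up):

    #include <stdio.h>
    #include <string.h>

    /* Same ordering rule as the overlay readdir cache: prefix-compare up to
     * the probe's length, then treat the shorter name as smaller. */
    static int ovl_name_cmp(const char *name, size_t len,
                            const char *node_name, size_t node_len)
    {
            int cmp = strncmp(name, node_name, len);

            if (cmp > 0)
                    return 1;               /* descend right */
            if (cmp < 0 || len < node_len)
                    return -1;              /* descend left */
            return 0;                       /* exact match */
    }

    int main(void)
    {
            printf("%d\n", ovl_name_cmp("foo", 3, "foobar", 6));   /* -1: "foo" sorts first */
            printf("%d\n", ovl_name_cmp("foobar", 6, "foo", 3));   /*  1 */
            printf("%d\n", ovl_name_cmp("foo", 3, "foo", 3));      /*  0 */
            return 0;
    }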
&& p->len == 1) || + ovl_test_flag(OVL_IMPURE, d_inode(rdd->dentry))) + return true; + + return false; +} + static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, const char *name, int len, u64 ino, unsigned int d_type) @@ -97,7 +153,11 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, p->name[len] = '\0'; p->len = len; p->type = d_type; + p->real_ino = ino; p->ino = ino; + /* Defer setting d_ino for upper entry to ovl_iterate() */ + if (ovl_calc_d_ino(rdd, p)) + p->ino = 0; p->is_whiteout = false; if (d_type == DT_CHR) { @@ -111,32 +171,22 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, const char *name, int len, u64 ino, unsigned int d_type) { - struct rb_node **newp = &rdd->root.rb_node; + struct rb_node **newp = &rdd->root->rb_node; struct rb_node *parent = NULL; struct ovl_cache_entry *p; - while (*newp) { - int cmp; - struct ovl_cache_entry *tmp; - - parent = *newp; - tmp = ovl_cache_entry_from_node(*newp); - cmp = strncmp(name, tmp->name, len); - if (cmp > 0) - newp = &tmp->node.rb_right; - else if (cmp < 0 || len < tmp->len) - newp = &tmp->node.rb_left; - else - return 0; - } + if (ovl_cache_entry_find_link(name, len, &newp, &parent)) + return 0; p = ovl_cache_entry_new(rdd, name, len, ino, d_type); - if (p == NULL) + if (p == NULL) { + rdd->err = -ENOMEM; return -ENOMEM; + } list_add_tail(&p->l_node, rdd->list); rb_link_node(&p->node, parent, newp); - rb_insert_color(&p->node, &rdd->root); + rb_insert_color(&p->node, rdd->root); return 0; } @@ -147,7 +197,7 @@ static int ovl_fill_lowest(struct ovl_readdir_data *rdd, { struct ovl_cache_entry *p; - p = ovl_cache_entry_find(&rdd->root, name, namelen); + p = ovl_cache_entry_find(rdd->root, name, namelen); if (p) { list_move_tail(&p->l_node, &rdd->middle); } else { @@ -172,6 +222,16 @@ void ovl_cache_free(struct list_head *list) INIT_LIST_HEAD(list); } +void ovl_dir_cache_free(struct inode *inode) +{ + struct ovl_dir_cache *cache = ovl_dir_cache(inode); + + if (cache) { + ovl_cache_free(&cache->entries); + kfree(cache); + } +} + static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry) { struct ovl_dir_cache *cache = od->cache; @@ -179,8 +239,8 @@ static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry) WARN_ON(cache->refcount <= 0); cache->refcount--; if (!cache->refcount) { - if (ovl_dir_cache(dentry) == cache) - ovl_set_dir_cache(dentry, NULL); + if (ovl_dir_cache(d_inode(dentry)) == cache) + ovl_set_dir_cache(d_inode(dentry), NULL); ovl_cache_free(&cache->entries); kfree(cache); @@ -273,7 +333,8 @@ static void ovl_dir_reset(struct file *file) od->is_real = false; } -static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list) +static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list, + struct rb_root *root) { int err; struct path realpath; @@ -281,13 +342,14 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list) .ctx.actor = ovl_fill_merge, .dentry = dentry, .list = list, - .root = RB_ROOT, + .root = root, .is_lowest = false, }; int idx, next; for (idx = 0; idx != -1; idx = next) { next = ovl_path_next(idx, dentry, &realpath); + rdd.is_upper = ovl_dentry_upper(dentry) == realpath.dentry; if (next != -1) { err = ovl_dir_read(&realpath, &rdd); @@ -326,12 +388,13 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) int res; struct ovl_dir_cache *cache; - cache = ovl_dir_cache(dentry); + cache = ovl_dir_cache(d_inode(dentry)); if (cache && 
ovl_dentry_version_get(dentry) == cache->version) { + WARN_ON(!cache->refcount); cache->refcount++; return cache; } - ovl_set_dir_cache(dentry, NULL); + ovl_set_dir_cache(d_inode(dentry), NULL); cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL); if (!cache) @@ -339,31 +402,276 @@ static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) cache->refcount = 1; INIT_LIST_HEAD(&cache->entries); + cache->root = RB_ROOT; + + res = ovl_dir_read_merged(dentry, &cache->entries, &cache->root); + if (res) { + ovl_cache_free(&cache->entries); + kfree(cache); + return ERR_PTR(res); + } + + cache->version = ovl_dentry_version_get(dentry); + ovl_set_dir_cache(d_inode(dentry), cache); + + return cache; +} + +/* + * Set d_ino for upper entries. Non-upper entries should always report + * the uppermost real inode ino and should not call this function. + * + * When not all layer are on same fs, report real ino also for upper. + * + * When all layers are on the same fs, and upper has a reference to + * copy up origin, call vfs_getattr() on the overlay entry to make + * sure that d_ino will be consistent with st_ino from stat(2). + */ +static int ovl_cache_update_ino(struct path *path, struct ovl_cache_entry *p) + +{ + struct dentry *dir = path->dentry; + struct dentry *this = NULL; + enum ovl_path_type type; + u64 ino = p->real_ino; + int err = 0; + + if (!ovl_same_sb(dir->d_sb)) + goto out; + + if (p->name[0] == '.') { + if (p->len == 1) { + this = dget(dir); + goto get; + } + if (p->len == 2 && p->name[1] == '.') { + /* we shall not be moved */ + this = dget(dir->d_parent); + goto get; + } + } + this = lookup_one_len(p->name, dir, p->len); + if (IS_ERR_OR_NULL(this) || !this->d_inode) { + if (IS_ERR(this)) { + err = PTR_ERR(this); + this = NULL; + goto fail; + } + goto out; + } + +get: + type = ovl_path_type(this); + if (OVL_TYPE_ORIGIN(type)) { + struct kstat stat; + struct path statpath = *path; + + statpath.dentry = this; + err = vfs_getattr(&statpath, &stat, STATX_INO, 0); + if (err) + goto fail; + + WARN_ON_ONCE(dir->d_sb->s_dev != stat.dev); + ino = stat.ino; + } + +out: + p->ino = ino; + dput(this); + return err; - res = ovl_dir_read_merged(dentry, &cache->entries); +fail: + pr_warn_ratelimited("overlay: failed to look up (%s) for ino (%i)\n", + p->name, err); + goto out; +} + +static int ovl_fill_plain(struct dir_context *ctx, const char *name, + int namelen, loff_t offset, u64 ino, + unsigned int d_type) +{ + struct ovl_cache_entry *p; + struct ovl_readdir_data *rdd = + container_of(ctx, struct ovl_readdir_data, ctx); + + rdd->count++; + p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type); + if (p == NULL) { + rdd->err = -ENOMEM; + return -ENOMEM; + } + list_add_tail(&p->l_node, rdd->list); + + return 0; +} + +static int ovl_dir_read_impure(struct path *path, struct list_head *list, + struct rb_root *root) +{ + int err; + struct path realpath; + struct ovl_cache_entry *p, *n; + struct ovl_readdir_data rdd = { + .ctx.actor = ovl_fill_plain, + .list = list, + .root = root, + }; + + INIT_LIST_HEAD(list); + *root = RB_ROOT; + ovl_path_upper(path->dentry, &realpath); + + err = ovl_dir_read(&realpath, &rdd); + if (err) + return err; + + list_for_each_entry_safe(p, n, list, l_node) { + if (strcmp(p->name, ".") != 0 && + strcmp(p->name, "..") != 0) { + err = ovl_cache_update_ino(path, p); + if (err) + return err; + } + if (p->ino == p->real_ino) { + list_del(&p->l_node); + kfree(p); + } else { + struct rb_node **newp = &root->rb_node; + struct rb_node *parent = NULL; + + if 
(WARN_ON(ovl_cache_entry_find_link(p->name, p->len, + &newp, &parent))) + return -EIO; + + rb_link_node(&p->node, parent, newp); + rb_insert_color(&p->node, root); + } + } + return 0; +} + +static struct ovl_dir_cache *ovl_cache_get_impure(struct path *path) +{ + int res; + struct dentry *dentry = path->dentry; + struct ovl_dir_cache *cache; + + cache = ovl_dir_cache(d_inode(dentry)); + if (cache && ovl_dentry_version_get(dentry) == cache->version) + return cache; + + /* Impure cache is not refcounted, free it here */ + ovl_dir_cache_free(d_inode(dentry)); + ovl_set_dir_cache(d_inode(dentry), NULL); + + cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL); + if (!cache) + return ERR_PTR(-ENOMEM); + + res = ovl_dir_read_impure(path, &cache->entries, &cache->root); if (res) { ovl_cache_free(&cache->entries); kfree(cache); return ERR_PTR(res); } + if (list_empty(&cache->entries)) { + /* Good oportunity to get rid of an unnecessary "impure" flag */ + ovl_do_removexattr(ovl_dentry_upper(dentry), OVL_XATTR_IMPURE); + ovl_clear_flag(OVL_IMPURE, d_inode(dentry)); + kfree(cache); + return NULL; + } cache->version = ovl_dentry_version_get(dentry); - ovl_set_dir_cache(dentry, cache); + ovl_set_dir_cache(d_inode(dentry), cache); return cache; } +struct ovl_readdir_translate { + struct dir_context *orig_ctx; + struct ovl_dir_cache *cache; + struct dir_context ctx; + u64 parent_ino; +}; + +static int ovl_fill_real(struct dir_context *ctx, const char *name, + int namelen, loff_t offset, u64 ino, + unsigned int d_type) +{ + struct ovl_readdir_translate *rdt = + container_of(ctx, struct ovl_readdir_translate, ctx); + struct dir_context *orig_ctx = rdt->orig_ctx; + + if (rdt->parent_ino && strcmp(name, "..") == 0) + ino = rdt->parent_ino; + else if (rdt->cache) { + struct ovl_cache_entry *p; + + p = ovl_cache_entry_find(&rdt->cache->root, name, namelen); + if (p) + ino = p->ino; + } + + return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type); +} + +static int ovl_iterate_real(struct file *file, struct dir_context *ctx) +{ + int err; + struct ovl_dir_file *od = file->private_data; + struct dentry *dir = file->f_path.dentry; + struct ovl_readdir_translate rdt = { + .ctx.actor = ovl_fill_real, + .orig_ctx = ctx, + }; + + if (OVL_TYPE_MERGE(ovl_path_type(dir->d_parent))) { + struct kstat stat; + struct path statpath = file->f_path; + + statpath.dentry = dir->d_parent; + err = vfs_getattr(&statpath, &stat, STATX_INO, 0); + if (err) + return err; + + WARN_ON_ONCE(dir->d_sb->s_dev != stat.dev); + rdt.parent_ino = stat.ino; + } + + if (ovl_test_flag(OVL_IMPURE, d_inode(dir))) { + rdt.cache = ovl_cache_get_impure(&file->f_path); + if (IS_ERR(rdt.cache)) + return PTR_ERR(rdt.cache); + } + + return iterate_dir(od->realfile, &rdt.ctx); +} + + static int ovl_iterate(struct file *file, struct dir_context *ctx) { struct ovl_dir_file *od = file->private_data; struct dentry *dentry = file->f_path.dentry; struct ovl_cache_entry *p; + int err; if (!ctx->pos) ovl_dir_reset(file); - if (od->is_real) + if (od->is_real) { + /* + * If parent is merge, then need to adjust d_ino for '..', if + * dir is impure then need to adjust d_ino for copied up + * entries. 
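ovl_cache_update_ino() exists so that, when all layers sit on the same filesystem, the d_ino that getdents returns for a copied-up entry matches the st_ino that stat(2) reports for the same name. A small userspace checker for that invariant (run it against any directory; on an overlay mount with same-fs layers the mismatches this prints are what the series above eliminates):

    #include <dirent.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/stat.h>

    /* Compare readdir()'s d_ino with stat()'s st_ino for every entry of a dir. */
    int main(int argc, char **argv)
    {
            const char *path = argc > 1 ? argv[1] : ".";
            DIR *dir = opendir(path);
            struct dirent *de;

            if (!dir) {
                    perror("opendir");
                    return 1;
            }
            while ((de = readdir(dir)) != NULL) {
                    struct stat st;

                    if (fstatat(dirfd(dir), de->d_name, &st, AT_SYMLINK_NOFOLLOW))
                            continue;
                    if ((ino_t)de->d_ino != st.st_ino)
                            printf("mismatch: %-20s d_ino=%llu st_ino=%llu\n",
                                   de->d_name,
                                   (unsigned long long)de->d_ino,
                                   (unsigned long long)st.st_ino);
            }
            closedir(dir);
            return 0;
    }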
+ */ + if (ovl_same_sb(dentry->d_sb) && + (ovl_test_flag(OVL_IMPURE, d_inode(dentry)) || + OVL_TYPE_MERGE(ovl_path_type(dentry->d_parent)))) { + return ovl_iterate_real(file, ctx); + } return iterate_dir(od->realfile, ctx); + } if (!od->cache) { struct ovl_dir_cache *cache; @@ -378,9 +686,15 @@ static int ovl_iterate(struct file *file, struct dir_context *ctx) while (od->cursor != &od->cache->entries) { p = list_entry(od->cursor, struct ovl_cache_entry, l_node); - if (!p->is_whiteout) + if (!p->is_whiteout) { + if (!p->ino) { + err = ovl_cache_update_ino(&file->f_path, p); + if (err) + return err; + } if (!dir_emit(ctx, p->name, p->len, p->ino, p->type)) break; + } od->cursor = p->l_node.next; ctx->pos++; } @@ -446,14 +760,14 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, ovl_path_upper(dentry, &upperpath); realfile = ovl_path_open(&upperpath, O_RDONLY); - smp_mb__before_spinlock(); + inode_lock(inode); if (!od->upperfile) { if (IS_ERR(realfile)) { inode_unlock(inode); return PTR_ERR(realfile); } - od->upperfile = realfile; + smp_store_release(&od->upperfile, realfile); } else { /* somebody has beaten us to it */ if (!IS_ERR(realfile)) @@ -522,8 +836,9 @@ int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) { int err; struct ovl_cache_entry *p; + struct rb_root root = RB_ROOT; - err = ovl_dir_read_merged(dentry, list); + err = ovl_dir_read_merged(dentry, list, &root); if (err) return err; @@ -612,12 +927,13 @@ static void ovl_workdir_cleanup_recurse(struct path *path, int level) int err; struct inode *dir = path->dentry->d_inode; LIST_HEAD(list); + struct rb_root root = RB_ROOT; struct ovl_cache_entry *p; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_merge, .dentry = NULL, .list = &list, - .root = RB_ROOT, + .root = &root, .is_lowest = false, }; @@ -675,12 +991,13 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, struct inode *dir = dentry->d_inode; struct path path = { .mnt = mnt, .dentry = dentry }; LIST_HEAD(list); + struct rb_root root = RB_ROOT; struct ovl_cache_entry *p; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_merge, .dentry = NULL, .list = &list, - .root = RB_ROOT, + .root = &root, .is_lowest = false, }; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index d86e89f97201..cd49c0298ddf 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -70,20 +70,20 @@ static int ovl_check_append_only(struct inode *inode, int flag) static struct dentry *ovl_d_real(struct dentry *dentry, const struct inode *inode, - unsigned int open_flags) + unsigned int open_flags, unsigned int flags) { struct dentry *real; int err; + if (flags & D_REAL_UPPER) + return ovl_dentry_upper(dentry); + if (!d_is_reg(dentry)) { if (!inode || inode == d_inode(dentry)) return dentry; goto bug; } - if (d_is_negative(dentry)) - return dentry; - if (open_flags) { err = ovl_open_maybe_copy_up(dentry, open_flags); if (err) @@ -105,7 +105,7 @@ static struct dentry *ovl_d_real(struct dentry *dentry, goto bug; /* Handle recursion */ - real = d_real(real, inode, open_flags); + real = d_real(real, inode, open_flags, 0); if (!inode || inode == d_inode(real)) return real; @@ -198,6 +198,7 @@ static void ovl_destroy_inode(struct inode *inode) dput(oi->__upperdentry); kfree(oi->redirect); + ovl_dir_cache_free(inode); mutex_destroy(&oi->lock); call_rcu(&inode->i_rcu, ovl_i_callback); diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index f46ad75dc96a..117794582f9f 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ 
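The ovl_dir_fsync() hunk above replaces smp_mb__before_spinlock() with taking the inode lock and publishing the upper file through smp_store_release(), so a lockless reader that observes the od->upperfile pointer also observes the fully set up struct file behind it. A hedged C11 analogue of the same publish/consume pattern (the struct, field and value below are invented for illustration):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct upperfile { int fd; };

    static _Atomic(struct upperfile *) upperfile;   /* analogue of od->upperfile */
    static pthread_mutex_t publish_lock = PTHREAD_MUTEX_INITIALIZER;

    static struct upperfile *get_upperfile(void)
    {
            /* Fast path: acquire-load pairs with the release-store below. */
            struct upperfile *f = atomic_load_explicit(&upperfile, memory_order_acquire);

            if (f)
                    return f;

            pthread_mutex_lock(&publish_lock);
            f = atomic_load_explicit(&upperfile, memory_order_relaxed);
            if (!f) {
                    f = malloc(sizeof(*f));
                    if (f) {
                            f->fd = 42;     /* fully initialize first ... */
                            atomic_store_explicit(&upperfile, f,
                                                  memory_order_release);
                    }                       /* ... then publish */
            }
            pthread_mutex_unlock(&publish_lock);
            return f;
    }

    int main(void)
    {
            struct upperfile *f = get_upperfile();

            if (f)
                    printf("fd = %d\n", f->fd);
            return 0;
    }

The mutex serializes would-be initializers ("somebody has beaten us to it" in the patch), while the release/acquire pair is what keeps the lockless fast path safe.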
-180,14 +180,14 @@ struct inode *ovl_inode_real(struct inode *inode) } -struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry) +struct ovl_dir_cache *ovl_dir_cache(struct inode *inode) { - return OVL_I(d_inode(dentry))->cache; + return OVL_I(inode)->cache; } -void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache) +void ovl_set_dir_cache(struct inode *inode, struct ovl_dir_cache *cache) { - OVL_I(d_inode(dentry))->cache = cache; + OVL_I(inode)->cache = cache; } bool ovl_dentry_is_opaque(struct dentry *dentry) @@ -275,12 +275,19 @@ void ovl_inode_update(struct inode *inode, struct dentry *upperdentry) } } -void ovl_dentry_version_inc(struct dentry *dentry) +void ovl_dentry_version_inc(struct dentry *dentry, bool impurity) { struct inode *inode = d_inode(dentry); WARN_ON(!inode_is_locked(inode)); - OVL_I(inode)->version++; + /* + * Version is used by readdir code to keep cache consistent. For merge + * dirs all changes need to be noted. For non-merge dirs, cache only + * contains impure (ones which have been copied up and have origins) + * entries, so only need to note changes to impure entries. + */ + if (OVL_TYPE_MERGE(ovl_path_type(dentry)) || impurity) + OVL_I(inode)->version++; } u64 ovl_dentry_version_get(struct dentry *dentry) @@ -382,6 +389,11 @@ void ovl_set_flag(unsigned long flag, struct inode *inode) set_bit(flag, &OVL_I(inode)->flags); } +void ovl_clear_flag(unsigned long flag, struct inode *inode) +{ + clear_bit(flag, &OVL_I(inode)->flags); +} + bool ovl_test_flag(unsigned long flag, struct inode *inode) { return test_bit(flag, &OVL_I(inode)->flags); diff --git a/fs/proc/base.c b/fs/proc/base.c index 719c2e943ea1..ad3b0762cc3e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -232,7 +232,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, goto out_mmput; } - page = (char *)__get_free_page(GFP_TEMPORARY); + page = (char *)__get_free_page(GFP_KERNEL); if (!page) { rv = -ENOMEM; goto out_mmput; @@ -813,7 +813,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf, if (!mm) return 0; - page = (char *)__get_free_page(GFP_TEMPORARY); + page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; @@ -918,7 +918,7 @@ static ssize_t environ_read(struct file *file, char __user *buf, if (!mm || !mm->env_end) return 0; - page = (char *)__get_free_page(GFP_TEMPORARY); + page = (char *)__get_free_page(GFP_KERNEL); if (!page) return -ENOMEM; @@ -1408,12 +1408,13 @@ static const struct file_operations proc_fail_nth_operations = { static int sched_show(struct seq_file *m, void *v) { struct inode *inode = m->private; + struct pid_namespace *ns = inode->i_sb->s_fs_info; struct task_struct *p; p = get_proc_task(inode); if (!p) return -ESRCH; - proc_sched_show_task(p, m); + proc_sched_show_task(p, ns, m); put_task_struct(p); @@ -1629,7 +1630,7 @@ out: static int do_proc_readlink(struct path *path, char __user *buffer, int buflen) { - char *tmp = (char*)__get_free_page(GFP_TEMPORARY); + char *tmp = (char *)__get_free_page(GFP_KERNEL); char *pathname; int len; @@ -2930,6 +2931,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, proc_clear_refs_operations), REG("smaps", S_IRUGO, proc_pid_smaps_operations), + REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY @@ -3323,6 +3325,7 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", 
S_IWUSR, proc_clear_refs_operations), REG("smaps", S_IRUGO, proc_tid_smaps_operations), + REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations), REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY diff --git a/fs/proc/devices.c b/fs/proc/devices.c index 50493edc30e5..e5709343feb7 100644 --- a/fs/proc/devices.c +++ b/fs/proc/devices.c @@ -7,14 +7,14 @@ static int devinfo_show(struct seq_file *f, void *v) { int i = *(loff_t *) v; - if (i < CHRDEV_MAJOR_HASH_SIZE) { + if (i < CHRDEV_MAJOR_MAX) { if (i == 0) seq_puts(f, "Character devices:\n"); chrdev_show(f, i); } #ifdef CONFIG_BLOCK else { - i -= CHRDEV_MAJOR_HASH_SIZE; + i -= CHRDEV_MAJOR_MAX; if (i == 0) seq_puts(f, "\nBlock devices:\n"); blkdev_show(f, i); @@ -25,7 +25,7 @@ static int devinfo_show(struct seq_file *f, void *v) static void *devinfo_start(struct seq_file *f, loff_t *pos) { - if (*pos < (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE)) + if (*pos < (BLKDEV_MAJOR_MAX + CHRDEV_MAJOR_MAX)) return pos; return NULL; } @@ -33,7 +33,7 @@ static void *devinfo_start(struct seq_file *f, loff_t *pos) static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos) { (*pos)++; - if (*pos >= (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE)) + if (*pos >= (BLKDEV_MAJOR_MAX + CHRDEV_MAJOR_MAX)) return NULL; return pos; } diff --git a/fs/proc/generic.c b/fs/proc/generic.c index e3cda0b5968f..793a67574668 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -40,8 +40,8 @@ static int proc_match(unsigned int len, const char *name, struct proc_dir_entry static struct proc_dir_entry *pde_subdir_first(struct proc_dir_entry *dir) { - return rb_entry_safe(rb_first(&dir->subdir), struct proc_dir_entry, - subdir_node); + return rb_entry_safe(rb_first_cached(&dir->subdir), + struct proc_dir_entry, subdir_node); } static struct proc_dir_entry *pde_subdir_next(struct proc_dir_entry *dir) @@ -54,7 +54,7 @@ static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir, const char *name, unsigned int len) { - struct rb_node *node = dir->subdir.rb_node; + struct rb_node *node = dir->subdir.rb_root.rb_node; while (node) { struct proc_dir_entry *de = rb_entry(node, @@ -75,8 +75,9 @@ static struct proc_dir_entry *pde_subdir_find(struct proc_dir_entry *dir, static bool pde_subdir_insert(struct proc_dir_entry *dir, struct proc_dir_entry *de) { - struct rb_root *root = &dir->subdir; - struct rb_node **new = &root->rb_node, *parent = NULL; + struct rb_root_cached *root = &dir->subdir; + struct rb_node **new = &root->rb_root.rb_node, *parent = NULL; + bool leftmost = true; /* Figure out where to put new node */ while (*new) { @@ -88,15 +89,16 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir, parent = *new; if (result < 0) new = &(*new)->rb_left; - else if (result > 0) + else if (result > 0) { new = &(*new)->rb_right; - else + leftmost = false; + } else return false; } /* Add new node and rebalance tree. 
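pde_subdir_insert() now tracks, during the ordinary descent, whether the new entry only ever went left; that boolean is what the cached variant of rb_insert_color() needs to keep the rb_root_cached leftmost pointer valid, which in turn makes pde_subdir_first() an O(1) lookup. A hedged sketch of the same bookkeeping on a plain, unbalanced binary search tree (invented node type, no rebalancing):

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct node { long key; struct node *left, *right; };

    struct cached_tree {
            struct node *root;
            struct node *leftmost;  /* analogue of rb_root_cached's cached node */
    };

    static bool insert(struct cached_tree *tree, long key)
    {
            struct node **link = &tree->root;
            bool leftmost = true;
            struct node *n;

            while (*link) {
                    if (key < (*link)->key) {
                            link = &(*link)->left;
                    } else if (key > (*link)->key) {
                            link = &(*link)->right;
                            leftmost = false;   /* went right once: not the minimum */
                    } else {
                            return false;       /* duplicate */
                    }
            }
            n = calloc(1, sizeof(*n));
            if (!n)
                    return false;
            n->key = key;
            *link = n;
            if (leftmost)
                    tree->leftmost = n;         /* first entry found without walking */
            return true;
    }

    int main(void)
    {
            struct cached_tree t = { 0 };

            insert(&t, 30);
            insert(&t, 10);
            insert(&t, 20);
            printf("smallest key: %ld\n", t.leftmost->key);     /* prints 10 */
            return 0;
    }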
*/ rb_link_node(&de->subdir_node, parent, new); - rb_insert_color(&de->subdir_node, root); + rb_insert_color_cached(&de->subdir_node, root, leftmost); return true; } @@ -369,7 +371,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, ent->namelen = qstr.len; ent->mode = mode; ent->nlink = nlink; - ent->subdir = RB_ROOT; + ent->subdir = RB_ROOT_CACHED; atomic_set(&ent->count, 1); spin_lock_init(&ent->pde_unload_lock); INIT_LIST_HEAD(&ent->pde_openers); @@ -499,6 +501,14 @@ out: } EXPORT_SYMBOL(proc_create_data); +struct proc_dir_entry *proc_create(const char *name, umode_t mode, + struct proc_dir_entry *parent, + const struct file_operations *proc_fops) +{ + return proc_create_data(name, mode, parent, proc_fops, NULL); +} +EXPORT_SYMBOL(proc_create); + void proc_set_size(struct proc_dir_entry *de, loff_t size) { de->size = size; @@ -545,7 +555,7 @@ void remove_proc_entry(const char *name, struct proc_dir_entry *parent) de = pde_subdir_find(parent, fn, len); if (de) - rb_erase(&de->subdir_node, &parent->subdir); + rb_erase_cached(&de->subdir_node, &parent->subdir); write_unlock(&proc_subdir_lock); if (!de) { WARN(1, "name '%s'\n", name); @@ -582,13 +592,13 @@ int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) write_unlock(&proc_subdir_lock); return -ENOENT; } - rb_erase(&root->subdir_node, &parent->subdir); + rb_erase_cached(&root->subdir_node, &parent->subdir); de = root; while (1) { next = pde_subdir_first(de); if (next) { - rb_erase(&next->subdir_node, &de->subdir); + rb_erase_cached(&next->subdir_node, &de->subdir); de = next; continue; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index aa2b89071630..a34195e92b20 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -40,7 +40,7 @@ struct proc_dir_entry { const struct inode_operations *proc_iops; const struct file_operations *proc_fops; struct proc_dir_entry *parent; - struct rb_root subdir; + struct rb_root_cached subdir; struct rb_node subdir_node; void *data; atomic_t count; /* use count */ @@ -269,10 +269,12 @@ extern int proc_remount(struct super_block *, int *, char *); /* * task_[no]mmu.c */ +struct mem_size_stats; struct proc_maps_private { struct inode *inode; struct task_struct *task; struct mm_struct *mm; + struct mem_size_stats *rollup; #ifdef CONFIG_MMU struct vm_area_struct *tail_vma; #endif @@ -288,6 +290,7 @@ extern const struct file_operations proc_tid_maps_operations; extern const struct file_operations proc_pid_numa_maps_operations; extern const struct file_operations proc_tid_numa_maps_operations; extern const struct file_operations proc_pid_smaps_operations; +extern const struct file_operations proc_pid_smaps_rollup_operations; extern const struct file_operations proc_tid_smaps_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 509a61668d90..cdd979724c74 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -80,7 +80,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]); show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]); show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]); - show_val_kb(m, "Mlocked: ", global_page_state(NR_MLOCK)); + show_val_kb(m, "Mlocked: ", global_zone_page_state(NR_MLOCK)); #ifdef CONFIG_HIGHMEM show_val_kb(m, "HighTotal: ", i.totalhigh); @@ -114,9 +114,9 @@ static int meminfo_proc_show(struct seq_file *m, 
void *v) show_val_kb(m, "SUnreclaim: ", global_node_page_state(NR_SLAB_UNRECLAIMABLE)); seq_printf(m, "KernelStack: %8lu kB\n", - global_page_state(NR_KERNEL_STACK_KB)); + global_zone_page_state(NR_KERNEL_STACK_KB)); show_val_kb(m, "PageTables: ", - global_page_state(NR_PAGETABLE)); + global_zone_page_state(NR_PAGETABLE)); #ifdef CONFIG_QUICKLIST show_val_kb(m, "Quicklists: ", quicklist_total_size()); #endif @@ -124,7 +124,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "NFS_Unstable: ", global_node_page_state(NR_UNSTABLE_NFS)); show_val_kb(m, "Bounce: ", - global_page_state(NR_BOUNCE)); + global_zone_page_state(NR_BOUNCE)); show_val_kb(m, "WritebackTmp: ", global_node_page_state(NR_WRITEBACK_TEMP)); show_val_kb(m, "CommitLimit: ", vm_commit_limit()); @@ -151,7 +151,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_CMA show_val_kb(m, "CmaTotal: ", totalcma_pages); show_val_kb(m, "CmaFree: ", - global_page_state(NR_FREE_CMA_PAGES)); + global_zone_page_state(NR_FREE_CMA_PAGES)); #endif hugetlb_report_meminfo(m); diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index d72fc40241d9..a2bf369c923d 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -196,7 +196,7 @@ static __net_init int proc_net_ns_init(struct net *net) if (!netd) goto out; - netd->subdir = RB_ROOT; + netd->subdir = RB_ROOT_CACHED; netd->data = net; netd->nlink = 2; netd->namelen = 3; diff --git a/fs/proc/root.c b/fs/proc/root.c index deecb397daa3..926fb27f4ca2 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -210,7 +210,7 @@ struct proc_dir_entry proc_root = { .proc_iops = &proc_root_inode_operations, .proc_fops = &proc_root_operations, .parent = &proc_root, - .subdir = RB_ROOT, + .subdir = RB_ROOT_CACHED, .name = "/proc", }; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index fe8f3265e877..5589b4bd4b85 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -253,6 +253,7 @@ static int proc_map_release(struct inode *inode, struct file *file) if (priv->mm) mmdrop(priv->mm); + kfree(priv->rollup); return seq_release_private(inode, file); } @@ -267,8 +268,7 @@ static int do_maps_open(struct inode *inode, struct file *file, * Indicate if the VMA is a stack for the given task; for * /proc/PID/maps that is the stack of the main task. */ -static int is_stack(struct proc_maps_private *priv, - struct vm_area_struct *vma) +static int is_stack(struct vm_area_struct *vma) { /* * We make no effort to guess what a given thread considers to be @@ -279,12 +279,28 @@ static int is_stack(struct proc_maps_private *priv, vma->vm_end >= vma->vm_mm->start_stack; } +static void show_vma_header_prefix(struct seq_file *m, + unsigned long start, unsigned long end, + vm_flags_t flags, unsigned long long pgoff, + dev_t dev, unsigned long ino) +{ + seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); + seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", + start, + end, + flags & VM_READ ? 'r' : '-', + flags & VM_WRITE ? 'w' : '-', + flags & VM_EXEC ? 'x' : '-', + flags & VM_MAYSHARE ? 
's' : 'p', + pgoff, + MAJOR(dev), MINOR(dev), ino); +} + static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) { struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; - struct proc_maps_private *priv = m->private; vm_flags_t flags = vma->vm_flags; unsigned long ino = 0; unsigned long long pgoff = 0; @@ -301,17 +317,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) start = vma->vm_start; end = vma->vm_end; - - seq_setwidth(m, 25 + sizeof(void *) * 6 - 1); - seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu ", - start, - end, - flags & VM_READ ? 'r' : '-', - flags & VM_WRITE ? 'w' : '-', - flags & VM_EXEC ? 'x' : '-', - flags & VM_MAYSHARE ? 's' : 'p', - pgoff, - MAJOR(dev), MINOR(dev), ino); + show_vma_header_prefix(m, start, end, flags, pgoff, dev, ino); /* * Print the dentry name for named mappings, and a @@ -342,7 +348,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) goto done; } - if (is_stack(priv, vma)) + if (is_stack(vma)) name = "[stack]"; } @@ -430,6 +436,7 @@ const struct file_operations proc_tid_maps_operations = { #ifdef CONFIG_PROC_PAGE_MONITOR struct mem_size_stats { + bool first; unsigned long resident; unsigned long shared_clean; unsigned long shared_dirty; @@ -443,7 +450,9 @@ struct mem_size_stats { unsigned long swap; unsigned long shared_hugetlb; unsigned long private_hugetlb; + unsigned long first_vma_start; u64 pss; + u64 pss_locked; u64 swap_pss; bool check_shmem_swap; }; @@ -538,6 +547,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } } else if (is_migration_entry(swpent)) page = migration_entry_to_page(swpent); + else if (is_device_private_entry(swpent)) + page = device_private_entry_to_page(swpent); } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap && pte_none(*pte))) { page = find_get_entry(vma->vm_file->f_mapping, @@ -597,13 +608,14 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { - smaps_pmd_entry(pmd, addr, walk); + if (pmd_present(*pmd)) + smaps_pmd_entry(pmd, addr, walk); spin_unlock(ptl); - return 0; + goto out; } if (pmd_trans_unstable(pmd)) - return 0; + goto out; /* * The mmap_sem held all the way back in m_start() is what * keeps khugepaged out of here and from collapsing things @@ -613,6 +625,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, for (; addr != end; pte++, addr += PAGE_SIZE) smaps_pte_entry(pte, addr, walk); pte_unmap_unlock(pte - 1, ptl); +out: cond_resched(); return 0; } @@ -652,6 +665,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_NORESERVE)] = "nr", [ilog2(VM_HUGETLB)] = "ht", [ilog2(VM_ARCH_1)] = "ar", + [ilog2(VM_WIPEONFORK)] = "wf", [ilog2(VM_DONTDUMP)] = "dd", #ifdef CONFIG_MEM_SOFT_DIRTY [ilog2(VM_SOFTDIRTY)] = "sd", @@ -700,6 +714,8 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, if (is_migration_entry(swpent)) page = migration_entry_to_page(swpent); + else if (is_device_private_entry(swpent)) + page = device_private_entry_to_page(swpent); } if (page) { int mapcount = page_mapcount(page); @@ -719,18 +735,36 @@ void __weak arch_show_smap(struct seq_file *m, struct vm_area_struct *vma) static int show_smap(struct seq_file *m, void *v, int is_pid) { + struct proc_maps_private *priv = m->private; struct vm_area_struct *vma = v; - struct mem_size_stats mss; + struct mem_size_stats mss_stack; + struct mem_size_stats 
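show_vma_header_prefix() is split out because the rollup mode below wants to emit the same "start-end perms offset dev:ino" prefix once for a synthetic range instead of once per VMA. Given the seq_printf format above, a userspace parser for that prefix can be as simple as this sketch (pathname handling is deliberately crude):

    #include <stdio.h>

    /* Parse the fixed prefix of each /proc/PID/maps (or smaps) header line. */
    int main(void)
    {
            FILE *f = fopen("/proc/self/maps", "r");
            char line[512];

            if (!f) {
                    perror("fopen");
                    return 1;
            }
            while (fgets(line, sizeof(line), f)) {
                    unsigned long start, end, inode;
                    unsigned long long pgoff;
                    unsigned int major, minor;
                    char perms[5];
                    int consumed = 0;

                    if (sscanf(line, "%lx-%lx %4s %llx %x:%x %lu %n",
                               &start, &end, perms, &pgoff, &major, &minor,
                               &inode, &consumed) < 7)
                            continue;
                    printf("%#lx-%#lx %s ino=%lu name=%s", start, end, perms,
                           inode, line[consumed] ? line + consumed : "\n");
            }
            fclose(f);
            return 0;
    }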
*mss; struct mm_walk smaps_walk = { .pmd_entry = smaps_pte_range, #ifdef CONFIG_HUGETLB_PAGE .hugetlb_entry = smaps_hugetlb_range, #endif .mm = vma->vm_mm, - .private = &mss, }; + int ret = 0; + bool rollup_mode; + bool last_vma; + + if (priv->rollup) { + rollup_mode = true; + mss = priv->rollup; + if (mss->first) { + mss->first_vma_start = vma->vm_start; + mss->first = false; + } + last_vma = !m_next_vma(priv, vma); + } else { + rollup_mode = false; + memset(&mss_stack, 0, sizeof(mss_stack)); + mss = &mss_stack; + } - memset(&mss, 0, sizeof mss); + smaps_walk.private = mss; #ifdef CONFIG_SHMEM if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { @@ -748,9 +782,9 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) if (!shmem_swapped || (vma->vm_flags & VM_SHARED) || !(vma->vm_flags & VM_WRITE)) { - mss.swap = shmem_swapped; + mss->swap = shmem_swapped; } else { - mss.check_shmem_swap = true; + mss->check_shmem_swap = true; smaps_walk.pte_hole = smaps_pte_hole; } } @@ -758,54 +792,71 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) /* mmap_sem is held in m_start */ walk_page_vma(vma, &smaps_walk); + if (vma->vm_flags & VM_LOCKED) + mss->pss_locked += mss->pss; + + if (!rollup_mode) { + show_map_vma(m, vma, is_pid); + } else if (last_vma) { + show_vma_header_prefix( + m, mss->first_vma_start, vma->vm_end, 0, 0, 0, 0); + seq_pad(m, ' '); + seq_puts(m, "[rollup]\n"); + } else { + ret = SEQ_SKIP; + } - show_map_vma(m, vma, is_pid); - - seq_printf(m, - "Size: %8lu kB\n" - "Rss: %8lu kB\n" - "Pss: %8lu kB\n" - "Shared_Clean: %8lu kB\n" - "Shared_Dirty: %8lu kB\n" - "Private_Clean: %8lu kB\n" - "Private_Dirty: %8lu kB\n" - "Referenced: %8lu kB\n" - "Anonymous: %8lu kB\n" - "LazyFree: %8lu kB\n" - "AnonHugePages: %8lu kB\n" - "ShmemPmdMapped: %8lu kB\n" - "Shared_Hugetlb: %8lu kB\n" - "Private_Hugetlb: %7lu kB\n" - "Swap: %8lu kB\n" - "SwapPss: %8lu kB\n" - "KernelPageSize: %8lu kB\n" - "MMUPageSize: %8lu kB\n" - "Locked: %8lu kB\n", - (vma->vm_end - vma->vm_start) >> 10, - mss.resident >> 10, - (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), - mss.shared_clean >> 10, - mss.shared_dirty >> 10, - mss.private_clean >> 10, - mss.private_dirty >> 10, - mss.referenced >> 10, - mss.anonymous >> 10, - mss.lazyfree >> 10, - mss.anonymous_thp >> 10, - mss.shmem_thp >> 10, - mss.shared_hugetlb >> 10, - mss.private_hugetlb >> 10, - mss.swap >> 10, - (unsigned long)(mss.swap_pss >> (10 + PSS_SHIFT)), - vma_kernel_pagesize(vma) >> 10, - vma_mmu_pagesize(vma) >> 10, - (vma->vm_flags & VM_LOCKED) ? 
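The rollup totals keep the same fixed-point representation as the per-VMA numbers: each page contributes its size shifted left by PSS_SHIFT and divided by its map count, and the printed kB figure is recovered with ">> (10 + PSS_SHIFT)", exactly as in the seq_printf calls here. A small arithmetic sketch, assuming PSS_SHIFT is 12 (its value in this file) and a 4 KiB page:

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE       4096UL
    #define PSS_SHIFT       12      /* assumed; matches fs/proc/task_mmu.c */

    int main(void)
    {
            uint64_t pss = 0;

            /* one private page (mapcount 1) and one page shared by 3 processes */
            pss += (uint64_t)PAGE_SIZE << PSS_SHIFT;
            pss += ((uint64_t)PAGE_SIZE << PSS_SHIFT) / 3;

            /* same conversion the kernel uses when printing "Pss:" in kB */
            printf("Pss: %lu kB\n", (unsigned long)(pss >> (10 + PSS_SHIFT)));
            return 0;
    }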
- (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); - - arch_show_smap(m, vma); - show_smap_vma_flags(m, vma); + if (!rollup_mode) + seq_printf(m, + "Size: %8lu kB\n" + "KernelPageSize: %8lu kB\n" + "MMUPageSize: %8lu kB\n", + (vma->vm_end - vma->vm_start) >> 10, + vma_kernel_pagesize(vma) >> 10, + vma_mmu_pagesize(vma) >> 10); + + + if (!rollup_mode || last_vma) + seq_printf(m, + "Rss: %8lu kB\n" + "Pss: %8lu kB\n" + "Shared_Clean: %8lu kB\n" + "Shared_Dirty: %8lu kB\n" + "Private_Clean: %8lu kB\n" + "Private_Dirty: %8lu kB\n" + "Referenced: %8lu kB\n" + "Anonymous: %8lu kB\n" + "LazyFree: %8lu kB\n" + "AnonHugePages: %8lu kB\n" + "ShmemPmdMapped: %8lu kB\n" + "Shared_Hugetlb: %8lu kB\n" + "Private_Hugetlb: %7lu kB\n" + "Swap: %8lu kB\n" + "SwapPss: %8lu kB\n" + "Locked: %8lu kB\n", + mss->resident >> 10, + (unsigned long)(mss->pss >> (10 + PSS_SHIFT)), + mss->shared_clean >> 10, + mss->shared_dirty >> 10, + mss->private_clean >> 10, + mss->private_dirty >> 10, + mss->referenced >> 10, + mss->anonymous >> 10, + mss->lazyfree >> 10, + mss->anonymous_thp >> 10, + mss->shmem_thp >> 10, + mss->shared_hugetlb >> 10, + mss->private_hugetlb >> 10, + mss->swap >> 10, + (unsigned long)(mss->swap_pss >> (10 + PSS_SHIFT)), + (unsigned long)(mss->pss >> (10 + PSS_SHIFT))); + + if (!rollup_mode) { + arch_show_smap(m, vma); + show_smap_vma_flags(m, vma); + } m_cache_vma(m, vma); - return 0; + return ret; } static int show_pid_smap(struct seq_file *m, void *v) @@ -837,6 +888,25 @@ static int pid_smaps_open(struct inode *inode, struct file *file) return do_maps_open(inode, file, &proc_pid_smaps_op); } +static int pid_smaps_rollup_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + struct proc_maps_private *priv; + int ret = do_maps_open(inode, file, &proc_pid_smaps_op); + + if (ret < 0) + return ret; + seq = file->private_data; + priv = seq->private; + priv->rollup = kzalloc(sizeof(*priv->rollup), GFP_KERNEL); + if (!priv->rollup) { + proc_map_release(inode, file); + return -ENOMEM; + } + priv->rollup->first = true; + return 0; +} + static int tid_smaps_open(struct inode *inode, struct file *file) { return do_maps_open(inode, file, &proc_tid_smaps_op); @@ -849,6 +919,13 @@ const struct file_operations proc_pid_smaps_operations = { .release = proc_map_release, }; +const struct file_operations proc_pid_smaps_rollup_operations = { + .open = pid_smaps_rollup_open, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_map_release, +}; + const struct file_operations proc_tid_smaps_operations = { .open = tid_smaps_open, .read = seq_read, @@ -904,17 +981,22 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, { pmd_t pmd = *pmdp; - /* See comment in change_huge_pmd() */ - pmdp_invalidate(vma, addr, pmdp); - if (pmd_dirty(*pmdp)) - pmd = pmd_mkdirty(pmd); - if (pmd_young(*pmdp)) - pmd = pmd_mkyoung(pmd); - - pmd = pmd_wrprotect(pmd); - pmd = pmd_clear_soft_dirty(pmd); - - set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + if (pmd_present(pmd)) { + /* See comment in change_huge_pmd() */ + pmdp_invalidate(vma, addr, pmdp); + if (pmd_dirty(*pmdp)) + pmd = pmd_mkdirty(pmd); + if (pmd_young(*pmdp)) + pmd = pmd_mkyoung(pmd); + + pmd = pmd_wrprotect(pmd); + pmd = pmd_clear_soft_dirty(pmd); + + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { + pmd = pmd_swp_clear_soft_dirty(pmd); + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + } } #else static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, @@ -939,6 +1021,9 @@ static int 
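The new /proc/<pid>/smaps_rollup file served by proc_pid_smaps_rollup_operations reports one pre-summed set of the counters above rather than one block per VMA, which is far cheaper for memory monitors that only want whole-process totals. A minimal reader, assuming a kernel that carries this patch:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            FILE *f = fopen("/proc/self/smaps_rollup", "r");
            char line[256];

            if (!f) {
                    perror("smaps_rollup not available");
                    return 1;
            }
            while (fgets(line, sizeof(line), f)) {
                    /* Print just the whole-process proportional set size. */
                    if (strncmp(line, "Pss:", 4) == 0)
                            fputs(line, stdout);
            }
            fclose(f);
            return 0;
    }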
clear_refs_pte_range(pmd_t *pmd, unsigned long addr, goto out; } + if (!pmd_present(*pmd)) + goto out; + page = pmd_page(*pmd); /* Clear accessed and referenced bits. */ @@ -1181,7 +1266,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, if (pm->show_pfn) frame = pte_pfn(pte); flags |= PM_PRESENT; - page = vm_normal_page(vma, addr, pte); + page = _vm_normal_page(vma, addr, pte, true); if (pte_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; } else if (is_swap_pte(pte)) { @@ -1194,6 +1279,9 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, flags |= PM_SWAP; if (is_migration_entry(entry)) page = migration_entry_to_page(entry); + + if (is_device_private_entry(entry)) + page = device_private_entry_to_page(entry); } if (page && !PageAnon(page)) @@ -1220,27 +1308,33 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, if (ptl) { u64 flags = 0, frame = 0; pmd_t pmd = *pmdp; + struct page *page = NULL; if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(pmd)) flags |= PM_SOFT_DIRTY; - /* - * Currently pmd for thp is always present because thp - * can not be swapped-out, migrated, or HWPOISONed - * (split in such cases instead.) - * This if-check is just to prepare for future implementation. - */ if (pmd_present(pmd)) { - struct page *page = pmd_page(pmd); - - if (page_mapcount(page) == 1) - flags |= PM_MMAP_EXCLUSIVE; + page = pmd_page(pmd); flags |= PM_PRESENT; if (pm->show_pfn) frame = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); } +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + else if (is_swap_pmd(pmd)) { + swp_entry_t entry = pmd_to_swp_entry(pmd); + + frame = swp_type(entry) | + (swp_offset(entry) << MAX_SWAPFILES_SHIFT); + flags |= PM_SWAP; + VM_BUG_ON(!is_pmd_migration_entry(pmd)); + page = migration_entry_to_page(entry); + } +#endif + + if (page && page_mapcount(page) == 1) + flags |= PM_MMAP_EXCLUSIVE; for (; addr != end; addr += PAGE_SIZE) { pagemap_entry_t pme = make_pme(frame, flags); @@ -1380,7 +1474,7 @@ static ssize_t pagemap_read(struct file *file, char __user *buf, pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN); pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT); - pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_TEMPORARY); + pm.buffer = kmalloc(pm.len * PM_ENTRY_BYTES, GFP_KERNEL); ret = -ENOMEM; if (!pm.buffer) goto out_mm; @@ -1673,7 +1767,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) seq_file_path(m, file, "\n\t= "); } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { seq_puts(m, " heap"); - } else if (is_stack(proc_priv, vma)) { + } else if (is_stack(vma)) { seq_puts(m, " stack"); } diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 23266694db11..b00b766098fa 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -125,8 +125,7 @@ unsigned long task_statm(struct mm_struct *mm, return size; } -static int is_stack(struct proc_maps_private *priv, - struct vm_area_struct *vma) +static int is_stack(struct vm_area_struct *vma) { struct mm_struct *mm = vma->vm_mm; @@ -146,7 +145,6 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, int is_pid) { struct mm_struct *mm = vma->vm_mm; - struct proc_maps_private *priv = m->private; unsigned long ino = 0; struct file *file; dev_t dev = 0; @@ -178,7 +176,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, if (file) { seq_pad(m, ' '); seq_file_path(m, file, ""); - } else if (mm && is_stack(priv, vma)) { + } else if (mm && is_stack(vma)) { 
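pagemap_pmd_range() and pte_to_pagemap_entry() fill the 64-bit records userspace reads from /proc/<pid>/pagemap: bit 63 is "present", bit 62 "swapped", bit 56 "exclusively mapped", and the low 55 bits carry the PFN, which is reported as zero unless the reader has CAP_SYS_ADMIN, as the show_pfn check above enforces. A sketch that looks up the entry for one address in the calling process:

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            long page_size = sysconf(_SC_PAGESIZE);
            int probe = 0;                  /* any mapped address will do */
            uintptr_t vaddr = (uintptr_t)&probe;
            uint64_t entry;
            int fd = open("/proc/self/pagemap", O_RDONLY);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (pread(fd, &entry, sizeof(entry),
                      (off_t)(vaddr / (unsigned long)page_size) * sizeof(entry))
                != sizeof(entry)) {
                    perror("pread");
                    return 1;
            }
            printf("present=%d swapped=%d exclusive=%d pfn=%llu\n",
                   (int)((entry >> 63) & 1), (int)((entry >> 62) & 1),
                   (int)((entry >> 56) & 1),
                   (unsigned long long)(entry & ((1ULL << 55) - 1)));
            close(fd);
            return 0;
    }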
seq_pad(m, ' '); seq_printf(m, "[stack]"); } diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index fefd22611cf6..d814723fb27d 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -36,7 +36,6 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/uaccess.h> -#include <linux/syslog.h> #include "internal.h" @@ -132,18 +131,6 @@ static const struct seq_operations pstore_ftrace_seq_ops = { .show = pstore_ftrace_seq_show, }; -static int pstore_check_syslog_permissions(struct pstore_private *ps) -{ - switch (ps->record->type) { - case PSTORE_TYPE_DMESG: - case PSTORE_TYPE_CONSOLE: - return check_syslog_permissions(SYSLOG_ACTION_READ_ALL, - SYSLOG_FROM_READER); - default: - return 0; - } -} - static ssize_t pstore_file_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { @@ -163,10 +150,6 @@ static int pstore_file_open(struct inode *inode, struct file *file) int err; const struct seq_operations *sops = NULL; - err = pstore_check_syslog_permissions(ps); - if (err) - return err; - if (ps->record->type == PSTORE_TYPE_FTRACE) sops = &pstore_ftrace_seq_ops; @@ -204,11 +187,6 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) { struct pstore_private *p = d_inode(dentry)->i_private; struct pstore_record *record = p->record; - int err; - - err = pstore_check_syslog_permissions(p); - if (err) - return err; if (!record->psi->erase) return -EPERM; @@ -471,7 +449,7 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent) inode = pstore_get_inode(sb); if (inode) { - inode->i_mode = S_IFDIR | 0755; + inode->i_mode = S_IFDIR | 0750; inode->i_op = &pstore_dir_inode_operations; inode->i_fop = &simple_dir_operations; inc_nlink(inode); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 566e6ef99f07..8381db9db6d9 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -82,16 +82,19 @@ #include <linux/uaccess.h> /* - * There are three quota SMP locks. dq_list_lock protects all lists with quotas - * and quota formats. - * dq_data_lock protects data from dq_dqb and also mem_dqinfo structures and - * also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes. - * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly - * in inode_add_bytes() and inode_sub_bytes(). dq_state_lock protects - * modifications of quota state (on quotaon and quotaoff) and readers who care - * about latest values take it as well. + * There are five quota SMP locks: + * * dq_list_lock protects all lists with quotas and quota formats. + * * dquot->dq_dqb_lock protects data from dq_dqb + * * inode->i_lock protects inode->i_blocks, i_bytes and also guards + * consistency of dquot->dq_dqb with inode->i_blocks, i_bytes so that + * dquot_transfer() can stabilize amount it transfers + * * dq_data_lock protects mem_dqinfo structures and modifications of dquot + * pointers in the inode + * * dq_state_lock protects modifications of quota state (on quotaon and + * quotaoff) and readers who care about latest values take it as well. * - * The spinlock ordering is hence: dq_data_lock > dq_list_lock > i_lock, + * The spinlock ordering is hence: + * dq_data_lock > dq_list_lock > i_lock > dquot->dq_dqb_lock, * dq_list_lock > dq_state_lock * * Note that some things (eg. sb pointer, type, id) doesn't change during @@ -110,17 +113,14 @@ * sure they cannot race with quotaon which first sets S_NOQUOTA flag and * then drops all pointers to dquots from an inode. * - * Each dquot has its dq_lock mutex. 
Locked dquots might not be referenced - * from inodes (dquot_alloc_space() and such don't check the dq_lock). - * Currently dquot is locked only when it is being read to memory (or space for - * it is being allocated) on the first dqget() and when it is being released on - * the last dqput(). The allocation and release oparations are serialized by - * the dq_lock and by checking the use count in dquot_release(). Write - * operations on dquots don't hold dq_lock as they copy data under dq_data_lock - * spinlock to internal buffers before writing. + * Each dquot has its dq_lock mutex. Dquot is locked when it is being read to + * memory (or space for it is being allocated) on the first dqget(), when it is + * being written out, and when it is being released on the last dqput(). The + * allocation and release operations are serialized by the dq_lock and by + * checking the use count in dquot_release(). * * Lock ordering (including related VFS locks) is the following: - * s_umount > i_mutex > journal_lock > dquot->dq_lock > dqio_mutex + * s_umount > i_mutex > journal_lock > dquot->dq_lock > dqio_sem */ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock); @@ -129,6 +129,8 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_data_lock); EXPORT_SYMBOL(dq_data_lock); DEFINE_STATIC_SRCU(dquot_srcu); +static DECLARE_WAIT_QUEUE_HEAD(dquot_ref_wq); + void __quota_error(struct super_block *sb, const char *func, const char *fmt, ...) { @@ -247,6 +249,7 @@ struct dqstats dqstats; EXPORT_SYMBOL(dqstats); static qsize_t inode_get_rsv_space(struct inode *inode); +static qsize_t __inode_get_rsv_space(struct inode *inode); static int __dquot_initialize(struct inode *inode, int type); static inline unsigned int @@ -342,6 +345,12 @@ int dquot_mark_dquot_dirty(struct dquot *dquot) { int ret = 1; + if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) + return 0; + + if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NOLIST_DIRTY) + return test_and_set_bit(DQ_MOD_B, &dquot->dq_flags); + /* If quota is dirty already, we don't have to acquire dq_list_lock */ if (test_bit(DQ_MOD_B, &dquot->dq_flags)) return 1; @@ -381,18 +390,26 @@ static inline void dqput_all(struct dquot **dquot) dqput(dquot[cnt]); } -/* This function needs dq_list_lock */ static inline int clear_dquot_dirty(struct dquot *dquot) { - if (!test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags)) + if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NOLIST_DIRTY) + return test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags); + + spin_lock(&dq_list_lock); + if (!test_and_clear_bit(DQ_MOD_B, &dquot->dq_flags)) { + spin_unlock(&dq_list_lock); return 0; + } list_del_init(&dquot->dq_dirty); + spin_unlock(&dq_list_lock); return 1; } void mark_info_dirty(struct super_block *sb, int type) { - set_bit(DQF_INFO_DIRTY_B, &sb_dqopt(sb)->info[type].dqi_flags); + spin_lock(&dq_data_lock); + sb_dqopt(sb)->info[type].dqi_flags |= DQF_INFO_DIRTY; + spin_unlock(&dq_data_lock); } EXPORT_SYMBOL(mark_info_dirty); @@ -406,7 +423,6 @@ int dquot_acquire(struct dquot *dquot) struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); mutex_lock(&dquot->dq_lock); - mutex_lock(&dqopt->dqio_mutex); if (!test_bit(DQ_READ_B, &dquot->dq_flags)) ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot); if (ret < 0) @@ -436,7 +452,6 @@ int dquot_acquire(struct dquot *dquot) smp_mb__before_atomic(); set_bit(DQ_ACTIVE_B, &dquot->dq_flags); out_iolock: - mutex_unlock(&dqopt->dqio_mutex); mutex_unlock(&dquot->dq_lock); return ret; } @@ -450,21 +465,17 @@ int dquot_commit(struct dquot *dquot) int ret = 0; struct quota_info *dqopt = 
sb_dqopt(dquot->dq_sb); - mutex_lock(&dqopt->dqio_mutex); - spin_lock(&dq_list_lock); - if (!clear_dquot_dirty(dquot)) { - spin_unlock(&dq_list_lock); - goto out_sem; - } - spin_unlock(&dq_list_lock); + mutex_lock(&dquot->dq_lock); + if (!clear_dquot_dirty(dquot)) + goto out_lock; /* Inactive dquot can be only if there was error during read/init * => we have better not writing it */ if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot); else ret = -EIO; -out_sem: - mutex_unlock(&dqopt->dqio_mutex); +out_lock: + mutex_unlock(&dquot->dq_lock); return ret; } EXPORT_SYMBOL(dquot_commit); @@ -481,7 +492,6 @@ int dquot_release(struct dquot *dquot) /* Check whether we are not racing with some other dqget() */ if (atomic_read(&dquot->dq_count) > 1) goto out_dqlock; - mutex_lock(&dqopt->dqio_mutex); if (dqopt->ops[dquot->dq_id.type]->release_dqblk) { ret = dqopt->ops[dquot->dq_id.type]->release_dqblk(dquot); /* Write the info */ @@ -493,7 +503,6 @@ int dquot_release(struct dquot *dquot) ret = ret2; } clear_bit(DQ_ACTIVE_B, &dquot->dq_flags); - mutex_unlock(&dqopt->dqio_mutex); out_dqlock: mutex_unlock(&dquot->dq_lock); return ret; @@ -530,22 +539,18 @@ restart: continue; /* Wait for dquot users */ if (atomic_read(&dquot->dq_count)) { - DEFINE_WAIT(wait); - dqgrab(dquot); - prepare_to_wait(&dquot->dq_wait_unused, &wait, - TASK_UNINTERRUPTIBLE); spin_unlock(&dq_list_lock); - /* Once dqput() wakes us up, we know it's time to free + /* + * Once dqput() wakes us up, we know it's time to free * the dquot. * IMPORTANT: we rely on the fact that there is always * at most one process waiting for dquot to free. * Otherwise dq_count would be > 1 and we would never * wake up. */ - if (atomic_read(&dquot->dq_count) > 1) - schedule(); - finish_wait(&dquot->dq_wait_unused, &wait); + wait_event(dquot_ref_wq, + atomic_read(&dquot->dq_count) == 1); dqput(dquot); /* At this moment dquot() need not exist (it could be * reclaimed by prune_dqcache(). Hence we must @@ -629,11 +634,9 @@ int dquot_writeback_dquots(struct super_block *sb, int type) while (!list_empty(dirty)) { dquot = list_first_entry(dirty, struct dquot, dq_dirty); - /* Dirty and inactive can be only bad dquot... */ - if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) { - clear_dquot_dirty(dquot); - continue; - } + + WARN_ON(!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)); + /* Now we have active dquot from which someone is * holding reference so we can safely just increase * use count */ @@ -759,12 +762,12 @@ we_slept: /* Releasing dquot during quotaoff phase? */ if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_id.type) && atomic_read(&dquot->dq_count) == 1) - wake_up(&dquot->dq_wait_unused); + wake_up(&dquot_ref_wq); spin_unlock(&dq_list_lock); return; } /* Need to release dquot? 
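The per-dquot dq_wait_unused waitqueue becomes one global dquot_ref_wq plus a plain wait_event() on dq_count dropping to 1; as the comment notes, at most one process (the quotaoff path) ever sleeps on a given dquot, so a shared queue is enough. A pthread sketch of the same "wait until only our reference remains" idea (types, counts and thread roles are invented):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t ref_wq = PTHREAD_COND_INITIALIZER;   /* analogue of dquot_ref_wq */
    static int dq_count = 2;        /* our reference plus one user */

    static void *user(void *arg)
    {
            pthread_mutex_lock(&lock);
            dq_count--;                     /* dqput(): drop the user's reference ... */
            if (dq_count == 1)
                    pthread_cond_signal(&ref_wq);   /* ... and wake the waiter */
            pthread_mutex_unlock(&lock);
            return NULL;
    }

    int main(void)
    {
            pthread_t t;

            pthread_create(&t, NULL, user, NULL);
            pthread_mutex_lock(&lock);
            while (dq_count != 1)           /* wait_event(dquot_ref_wq, dq_count == 1) */
                    pthread_cond_wait(&ref_wq, &lock);
            pthread_mutex_unlock(&lock);
            pthread_join(t, NULL);
            printf("only our reference remains, safe to free\n");
            return 0;
    }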
*/ - if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && dquot_dirty(dquot)) { + if (dquot_dirty(dquot)) { spin_unlock(&dq_list_lock); /* Commit dquot before releasing */ ret = dquot->dq_sb->dq_op->write_dquot(dquot); @@ -776,14 +779,10 @@ we_slept: * We clear dirty bit anyway, so that we avoid * infinite loop here */ - spin_lock(&dq_list_lock); clear_dquot_dirty(dquot); - spin_unlock(&dq_list_lock); } goto we_slept; } - /* Clear flag in case dquot was inactive (something bad happened) */ - clear_dquot_dirty(dquot); if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) { spin_unlock(&dq_list_lock); dquot->dq_sb->dq_op->release_dquot(dquot); @@ -818,10 +817,10 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type) INIT_LIST_HEAD(&dquot->dq_inuse); INIT_HLIST_NODE(&dquot->dq_hash); INIT_LIST_HEAD(&dquot->dq_dirty); - init_waitqueue_head(&dquot->dq_wait_unused); dquot->dq_sb = sb; dquot->dq_id = make_kqid_invalid(type); atomic_set(&dquot->dq_count, 1); + spin_lock_init(&dquot->dq_dqb_lock); return dquot; } @@ -1079,42 +1078,6 @@ static void drop_dquot_ref(struct super_block *sb, int type) } } -static inline void dquot_incr_inodes(struct dquot *dquot, qsize_t number) -{ - dquot->dq_dqb.dqb_curinodes += number; -} - -static inline void dquot_incr_space(struct dquot *dquot, qsize_t number) -{ - dquot->dq_dqb.dqb_curspace += number; -} - -static inline void dquot_resv_space(struct dquot *dquot, qsize_t number) -{ - dquot->dq_dqb.dqb_rsvspace += number; -} - -/* - * Claim reserved quota space - */ -static void dquot_claim_reserved_space(struct dquot *dquot, qsize_t number) -{ - if (dquot->dq_dqb.dqb_rsvspace < number) { - WARN_ON_ONCE(1); - number = dquot->dq_dqb.dqb_rsvspace; - } - dquot->dq_dqb.dqb_curspace += number; - dquot->dq_dqb.dqb_rsvspace -= number; -} - -static void dquot_reclaim_reserved_space(struct dquot *dquot, qsize_t number) -{ - if (WARN_ON_ONCE(dquot->dq_dqb.dqb_curspace < number)) - number = dquot->dq_dqb.dqb_curspace; - dquot->dq_dqb.dqb_rsvspace += number; - dquot->dq_dqb.dqb_curspace -= number; -} - static inline void dquot_free_reserved_space(struct dquot *dquot, qsize_t number) { @@ -1278,21 +1241,24 @@ static int ignore_hardlimit(struct dquot *dquot) !(info->dqi_flags & DQF_ROOT_SQUASH)); } -/* needs dq_data_lock */ -static int check_idq(struct dquot *dquot, qsize_t inodes, - struct dquot_warn *warn) +static int dquot_add_inodes(struct dquot *dquot, qsize_t inodes, + struct dquot_warn *warn) { - qsize_t newinodes = dquot->dq_dqb.dqb_curinodes + inodes; + qsize_t newinodes; + int ret = 0; + spin_lock(&dquot->dq_dqb_lock); + newinodes = dquot->dq_dqb.dqb_curinodes + inodes; if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_id.type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) - return 0; + goto add; if (dquot->dq_dqb.dqb_ihardlimit && newinodes > dquot->dq_dqb.dqb_ihardlimit && !ignore_hardlimit(dquot)) { prepare_warning(warn, dquot, QUOTA_NL_IHARDWARN); - return -EDQUOT; + ret = -EDQUOT; + goto out; } if (dquot->dq_dqb.dqb_isoftlimit && @@ -1301,7 +1267,8 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, ktime_get_real_seconds() >= dquot->dq_dqb.dqb_itime && !ignore_hardlimit(dquot)) { prepare_warning(warn, dquot, QUOTA_NL_ISOFTLONGWARN); - return -EDQUOT; + ret = -EDQUOT; + goto out; } if (dquot->dq_dqb.dqb_isoftlimit && @@ -1311,30 +1278,40 @@ static int check_idq(struct dquot *dquot, qsize_t inodes, dquot->dq_dqb.dqb_itime = ktime_get_real_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type].dqi_igrace; } +add: + dquot->dq_dqb.dqb_curinodes = 
newinodes; - return 0; +out: + spin_unlock(&dquot->dq_dqb_lock); + return ret; } -/* needs dq_data_lock */ -static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, - struct dquot_warn *warn) +static int dquot_add_space(struct dquot *dquot, qsize_t space, + qsize_t rsv_space, unsigned int flags, + struct dquot_warn *warn) { qsize_t tspace; struct super_block *sb = dquot->dq_sb; + int ret = 0; + spin_lock(&dquot->dq_dqb_lock); if (!sb_has_quota_limits_enabled(sb, dquot->dq_id.type) || test_bit(DQ_FAKE_B, &dquot->dq_flags)) - return 0; + goto add; tspace = dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace - + space; + + space + rsv_space; + + if (flags & DQUOT_SPACE_NOFAIL) + goto add; if (dquot->dq_dqb.dqb_bhardlimit && tspace > dquot->dq_dqb.dqb_bhardlimit && !ignore_hardlimit(dquot)) { - if (!prealloc) + if (flags & DQUOT_SPACE_WARN) prepare_warning(warn, dquot, QUOTA_NL_BHARDWARN); - return -EDQUOT; + ret = -EDQUOT; + goto out; } if (dquot->dq_dqb.dqb_bsoftlimit && @@ -1342,28 +1319,34 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, dquot->dq_dqb.dqb_btime && ktime_get_real_seconds() >= dquot->dq_dqb.dqb_btime && !ignore_hardlimit(dquot)) { - if (!prealloc) + if (flags & DQUOT_SPACE_WARN) prepare_warning(warn, dquot, QUOTA_NL_BSOFTLONGWARN); - return -EDQUOT; + ret = -EDQUOT; + goto out; } if (dquot->dq_dqb.dqb_bsoftlimit && tspace > dquot->dq_dqb.dqb_bsoftlimit && dquot->dq_dqb.dqb_btime == 0) { - if (!prealloc) { + if (flags & DQUOT_SPACE_WARN) { prepare_warning(warn, dquot, QUOTA_NL_BSOFTWARN); dquot->dq_dqb.dqb_btime = ktime_get_real_seconds() + sb_dqopt(sb)->info[dquot->dq_id.type].dqi_bgrace; - } - else + } else { /* * We don't allow preallocation to exceed softlimit so exceeding will * be always printed */ - return -EDQUOT; + ret = -EDQUOT; + goto out; + } } - - return 0; +add: + dquot->dq_dqb.dqb_rsvspace += rsv_space; + dquot->dq_dqb.dqb_curspace += space; +out: + spin_unlock(&dquot->dq_dqb_lock); + return ret; } static int info_idq_free(struct dquot *dquot, qsize_t inodes) @@ -1502,8 +1485,15 @@ static int __dquot_initialize(struct inode *inode, int type) * did a write before quota was turned on */ rsv = inode_get_rsv_space(inode); - if (unlikely(rsv)) - dquot_resv_space(dquots[cnt], rsv); + if (unlikely(rsv)) { + spin_lock(&inode->i_lock); + /* Get reservation again under proper lock */ + rsv = __inode_get_rsv_space(inode); + spin_lock(&dquots[cnt]->dq_dqb_lock); + dquots[cnt]->dq_dqb.dqb_rsvspace += rsv; + spin_unlock(&dquots[cnt]->dq_dqb_lock); + spin_unlock(&inode->i_lock); + } } } out_lock: @@ -1598,39 +1588,12 @@ static qsize_t *inode_reserved_space(struct inode * inode) return inode->i_sb->dq_op->get_reserved_space(inode); } -void inode_add_rsv_space(struct inode *inode, qsize_t number) -{ - spin_lock(&inode->i_lock); - *inode_reserved_space(inode) += number; - spin_unlock(&inode->i_lock); -} -EXPORT_SYMBOL(inode_add_rsv_space); - -void inode_claim_rsv_space(struct inode *inode, qsize_t number) +static qsize_t __inode_get_rsv_space(struct inode *inode) { - spin_lock(&inode->i_lock); - *inode_reserved_space(inode) -= number; - __inode_add_bytes(inode, number); - spin_unlock(&inode->i_lock); -} -EXPORT_SYMBOL(inode_claim_rsv_space); - -void inode_reclaim_rsv_space(struct inode *inode, qsize_t number) -{ - spin_lock(&inode->i_lock); - *inode_reserved_space(inode) += number; - __inode_sub_bytes(inode, number); - spin_unlock(&inode->i_lock); -} -EXPORT_SYMBOL(inode_reclaim_rsv_space); - -void inode_sub_rsv_space(struct inode 
*inode, qsize_t number) -{ - spin_lock(&inode->i_lock); - *inode_reserved_space(inode) -= number; - spin_unlock(&inode->i_lock); + if (!inode->i_sb->dq_op->get_reserved_space) + return 0; + return *inode_reserved_space(inode); } -EXPORT_SYMBOL(inode_sub_rsv_space); static qsize_t inode_get_rsv_space(struct inode *inode) { @@ -1639,28 +1602,11 @@ static qsize_t inode_get_rsv_space(struct inode *inode) if (!inode->i_sb->dq_op->get_reserved_space) return 0; spin_lock(&inode->i_lock); - ret = *inode_reserved_space(inode); + ret = __inode_get_rsv_space(inode); spin_unlock(&inode->i_lock); return ret; } -static void inode_incr_space(struct inode *inode, qsize_t number, - int reserve) -{ - if (reserve) - inode_add_rsv_space(inode, number); - else - inode_add_bytes(inode, number); -} - -static void inode_decr_space(struct inode *inode, qsize_t number, int reserve) -{ - if (reserve) - inode_sub_rsv_space(inode, number); - else - inode_sub_bytes(inode, number); -} - /* * This functions updates i_blocks+i_bytes fields and quota information * (together with appropriate checks). @@ -1682,7 +1628,13 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) struct dquot **dquots; if (!dquot_active(inode)) { - inode_incr_space(inode, number, reserve); + if (reserve) { + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) += number; + spin_unlock(&inode->i_lock); + } else { + inode_add_bytes(inode, number); + } goto out; } @@ -1691,27 +1643,41 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); - spin_lock(&dq_data_lock); + spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!dquots[cnt]) continue; - ret = check_bdq(dquots[cnt], number, - !(flags & DQUOT_SPACE_WARN), &warn[cnt]); - if (ret && !(flags & DQUOT_SPACE_NOFAIL)) { - spin_unlock(&dq_data_lock); + if (flags & DQUOT_SPACE_RESERVE) { + ret = dquot_add_space(dquots[cnt], 0, number, flags, + &warn[cnt]); + } else { + ret = dquot_add_space(dquots[cnt], number, 0, flags, + &warn[cnt]); + } + if (ret) { + /* Back out changes we already did */ + for (cnt--; cnt >= 0; cnt--) { + if (!dquots[cnt]) + continue; + spin_lock(&dquots[cnt]->dq_dqb_lock); + if (flags & DQUOT_SPACE_RESERVE) { + dquots[cnt]->dq_dqb.dqb_rsvspace -= + number; + } else { + dquots[cnt]->dq_dqb.dqb_curspace -= + number; + } + spin_unlock(&dquots[cnt]->dq_dqb_lock); + } + spin_unlock(&inode->i_lock); goto out_flush_warn; } } - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (!dquots[cnt]) - continue; - if (reserve) - dquot_resv_space(dquots[cnt], number); - else - dquot_incr_space(dquots[cnt], number); - } - inode_incr_space(inode, number, reserve); - spin_unlock(&dq_data_lock); + if (reserve) + *inode_reserved_space(inode) += number; + else + __inode_add_bytes(inode, number); + spin_unlock(&inode->i_lock); if (reserve) goto out_flush_warn; @@ -1740,23 +1706,26 @@ int dquot_alloc_inode(struct inode *inode) dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); - spin_lock(&dq_data_lock); + spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!dquots[cnt]) continue; - ret = check_idq(dquots[cnt], 1, &warn[cnt]); - if (ret) + ret = dquot_add_inodes(dquots[cnt], 1, &warn[cnt]); + if (ret) { + for (cnt--; cnt >= 0; cnt--) { + if (!dquots[cnt]) + continue; + /* Back out changes we already did */ + spin_lock(&dquots[cnt]->dq_dqb_lock); + dquots[cnt]->dq_dqb.dqb_curinodes--; + spin_unlock(&dquots[cnt]->dq_dqb_lock); + } goto warn_put_all; - } - - 
for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (!dquots[cnt]) - continue; - dquot_incr_inodes(dquots[cnt], 1); + } } warn_put_all: - spin_unlock(&dq_data_lock); + spin_unlock(&inode->i_lock); if (ret == 0) mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); @@ -1774,21 +1743,33 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) int cnt, index; if (!dquot_active(inode)) { - inode_claim_rsv_space(inode, number); + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) -= number; + __inode_add_bytes(inode, number); + spin_unlock(&inode->i_lock); return 0; } dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); - spin_lock(&dq_data_lock); + spin_lock(&inode->i_lock); /* Claim reserved quotas to allocated quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (dquots[cnt]) - dquot_claim_reserved_space(dquots[cnt], number); + if (dquots[cnt]) { + struct dquot *dquot = dquots[cnt]; + + spin_lock(&dquot->dq_dqb_lock); + if (WARN_ON_ONCE(dquot->dq_dqb.dqb_rsvspace < number)) + number = dquot->dq_dqb.dqb_rsvspace; + dquot->dq_dqb.dqb_curspace += number; + dquot->dq_dqb.dqb_rsvspace -= number; + spin_unlock(&dquot->dq_dqb_lock); + } } /* Update inode bytes */ - inode_claim_rsv_space(inode, number); - spin_unlock(&dq_data_lock); + *inode_reserved_space(inode) -= number; + __inode_add_bytes(inode, number); + spin_unlock(&inode->i_lock); mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); return 0; @@ -1804,21 +1785,33 @@ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) int cnt, index; if (!dquot_active(inode)) { - inode_reclaim_rsv_space(inode, number); + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) += number; + __inode_sub_bytes(inode, number); + spin_unlock(&inode->i_lock); return; } dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); - spin_lock(&dq_data_lock); + spin_lock(&inode->i_lock); /* Claim reserved quotas to allocated quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (dquots[cnt]) - dquot_reclaim_reserved_space(dquots[cnt], number); + if (dquots[cnt]) { + struct dquot *dquot = dquots[cnt]; + + spin_lock(&dquot->dq_dqb_lock); + if (WARN_ON_ONCE(dquot->dq_dqb.dqb_curspace < number)) + number = dquot->dq_dqb.dqb_curspace; + dquot->dq_dqb.dqb_rsvspace += number; + dquot->dq_dqb.dqb_curspace -= number; + spin_unlock(&dquot->dq_dqb_lock); + } } /* Update inode bytes */ - inode_reclaim_rsv_space(inode, number); - spin_unlock(&dq_data_lock); + *inode_reserved_space(inode) += number; + __inode_sub_bytes(inode, number); + spin_unlock(&inode->i_lock); mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); return; @@ -1836,19 +1829,26 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags) int reserve = flags & DQUOT_SPACE_RESERVE, index; if (!dquot_active(inode)) { - inode_decr_space(inode, number, reserve); + if (reserve) { + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) -= number; + spin_unlock(&inode->i_lock); + } else { + inode_sub_bytes(inode, number); + } return; } dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); - spin_lock(&dq_data_lock); + spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { int wtype; warn[cnt].w_type = QUOTA_NL_NOWARN; if (!dquots[cnt]) continue; + spin_lock(&dquots[cnt]->dq_dqb_lock); wtype = info_bdq_free(dquots[cnt], number); if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn[cnt], dquots[cnt], wtype); @@ -1856,9 +1856,13 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int 
flags) dquot_free_reserved_space(dquots[cnt], number); else dquot_decr_space(dquots[cnt], number); + spin_unlock(&dquots[cnt]->dq_dqb_lock); } - inode_decr_space(inode, number, reserve); - spin_unlock(&dq_data_lock); + if (reserve) + *inode_reserved_space(inode) -= number; + else + __inode_sub_bytes(inode, number); + spin_unlock(&inode->i_lock); if (reserve) goto out_unlock; @@ -1884,19 +1888,21 @@ void dquot_free_inode(struct inode *inode) dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); - spin_lock(&dq_data_lock); + spin_lock(&inode->i_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { int wtype; warn[cnt].w_type = QUOTA_NL_NOWARN; if (!dquots[cnt]) continue; + spin_lock(&dquots[cnt]->dq_dqb_lock); wtype = info_idq_free(dquots[cnt], 1); if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn[cnt], dquots[cnt], wtype); dquot_decr_inodes(dquots[cnt], 1); + spin_unlock(&dquots[cnt]->dq_dqb_lock); } - spin_unlock(&dq_data_lock); + spin_unlock(&inode->i_lock); mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); flush_warnings(warn); @@ -1917,7 +1923,7 @@ EXPORT_SYMBOL(dquot_free_inode); */ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) { - qsize_t space, cur_space; + qsize_t cur_space; qsize_t rsv_space = 0; qsize_t inode_usage = 1; struct dquot *transfer_from[MAXQUOTAS] = {}; @@ -1944,14 +1950,18 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) } spin_lock(&dq_data_lock); + spin_lock(&inode->i_lock); if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ + spin_unlock(&inode->i_lock); spin_unlock(&dq_data_lock); return 0; } - cur_space = inode_get_bytes(inode); - rsv_space = inode_get_rsv_space(inode); - space = cur_space + rsv_space; - /* Build the transfer_from list and check the limits */ + cur_space = __inode_get_bytes(inode); + rsv_space = __inode_get_rsv_space(inode); + /* + * Build the transfer_from list, check limits, and update usage in + * the target structures. + */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { /* * Skip changes for same uid or gid or for turned off quota-type. 
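The dquot.c hunks above replace the global dq_data_lock protection of per-dquot usage counters with a per-dquot dq_dqb_lock nested inside inode->i_lock: dquot_add_space() and dquot_add_inodes() now check limits and update usage in a single critical section, and callers such as __dquot_alloc_space(), dquot_alloc_inode() and __dquot_transfer() back out the dquots already charged when a later one returns -EDQUOT. A simplified user-space sketch of that charge-then-rollback pattern follows; the struct and helpers are illustrative stand-ins, not kernel APIs.

#include <pthread.h>

struct dq {
        pthread_mutex_t lock;   /* stands in for dq_dqb_lock */
        long long cur;          /* current usage */
        long long hard;         /* hard limit, 0 == unlimited */
};

/* Check the limit and account for 'nr' in one critical section. */
int dq_charge(struct dq *dq, long long nr)
{
        int ret = 0;

        pthread_mutex_lock(&dq->lock);
        if (dq->hard && dq->cur + nr > dq->hard)
                ret = -1;               /* would be -EDQUOT in the kernel */
        else
                dq->cur += nr;
        pthread_mutex_unlock(&dq->lock);
        return ret;
}

void dq_uncharge(struct dq *dq, long long nr)
{
        pthread_mutex_lock(&dq->lock);
        dq->cur -= nr;
        pthread_mutex_unlock(&dq->lock);
}

/* Charge every quota; on failure, roll back the ones already charged. */
int dq_charge_all(struct dq **dqs, int n, long long nr)
{
        int i, ret = 0;

        for (i = 0; i < n; i++) {
                if (!dqs[i])
                        continue;
                ret = dq_charge(dqs[i], nr);
                if (!ret)
                        continue;
                for (i--; i >= 0; i--)          /* back out earlier charges */
                        if (dqs[i])
                                dq_uncharge(dqs[i], nr);
                break;
        }
        return ret;
}

The kernel version additionally distinguishes reserved space from current space and queues quota warnings, but the locking and rollback shape is the one shown.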
@@ -1963,28 +1973,33 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) continue; is_valid[cnt] = 1; transfer_from[cnt] = i_dquot(inode)[cnt]; - ret = check_idq(transfer_to[cnt], inode_usage, &warn_to[cnt]); + ret = dquot_add_inodes(transfer_to[cnt], inode_usage, + &warn_to[cnt]); if (ret) goto over_quota; - ret = check_bdq(transfer_to[cnt], space, 0, &warn_to[cnt]); - if (ret) + ret = dquot_add_space(transfer_to[cnt], cur_space, rsv_space, 0, + &warn_to[cnt]); + if (ret) { + dquot_decr_inodes(transfer_to[cnt], inode_usage); goto over_quota; + } } - /* - * Finally perform the needed transfer from transfer_from to transfer_to - */ + /* Decrease usage for source structures and update quota pointers */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (!is_valid[cnt]) continue; /* Due to IO error we might not have transfer_from[] structure */ if (transfer_from[cnt]) { int wtype; + + spin_lock(&transfer_from[cnt]->dq_dqb_lock); wtype = info_idq_free(transfer_from[cnt], inode_usage); if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn_from_inodes[cnt], transfer_from[cnt], wtype); - wtype = info_bdq_free(transfer_from[cnt], space); + wtype = info_bdq_free(transfer_from[cnt], + cur_space + rsv_space); if (wtype != QUOTA_NL_NOWARN) prepare_warning(&warn_from_space[cnt], transfer_from[cnt], wtype); @@ -1992,14 +2007,11 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) dquot_decr_space(transfer_from[cnt], cur_space); dquot_free_reserved_space(transfer_from[cnt], rsv_space); + spin_unlock(&transfer_from[cnt]->dq_dqb_lock); } - - dquot_incr_inodes(transfer_to[cnt], inode_usage); - dquot_incr_space(transfer_to[cnt], cur_space); - dquot_resv_space(transfer_to[cnt], rsv_space); - i_dquot(inode)[cnt] = transfer_to[cnt]; } + spin_unlock(&inode->i_lock); spin_unlock(&dq_data_lock); mark_all_dquot_dirty(transfer_from); @@ -2013,6 +2025,17 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to) transfer_to[cnt] = transfer_from[cnt]; return 0; over_quota: + /* Back out changes we already did */ + for (cnt--; cnt >= 0; cnt--) { + if (!is_valid[cnt]) + continue; + spin_lock(&transfer_to[cnt]->dq_dqb_lock); + dquot_decr_inodes(transfer_to[cnt], inode_usage); + dquot_decr_space(transfer_to[cnt], cur_space); + dquot_free_reserved_space(transfer_to[cnt], rsv_space); + spin_unlock(&transfer_to[cnt]->dq_dqb_lock); + } + spin_unlock(&inode->i_lock); spin_unlock(&dq_data_lock); flush_warnings(warn_to); return ret; @@ -2066,29 +2089,21 @@ EXPORT_SYMBOL(dquot_transfer); */ int dquot_commit_info(struct super_block *sb, int type) { - int ret; struct quota_info *dqopt = sb_dqopt(sb); - mutex_lock(&dqopt->dqio_mutex); - ret = dqopt->ops[type]->write_file_info(sb, type); - mutex_unlock(&dqopt->dqio_mutex); - return ret; + return dqopt->ops[type]->write_file_info(sb, type); } EXPORT_SYMBOL(dquot_commit_info); int dquot_get_next_id(struct super_block *sb, struct kqid *qid) { struct quota_info *dqopt = sb_dqopt(sb); - int err; if (!sb_has_quota_active(sb, qid->type)) return -ESRCH; if (!dqopt->ops[qid->type]->get_next_id) return -ENOSYS; - mutex_lock(&dqopt->dqio_mutex); - err = dqopt->ops[qid->type]->get_next_id(sb, qid); - mutex_unlock(&dqopt->dqio_mutex); - return err; + return dqopt->ops[qid->type]->get_next_id(sb, qid); } EXPORT_SYMBOL(dquot_get_next_id); @@ -2337,15 +2352,14 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, dqopt->info[type].dqi_format = fmt; dqopt->info[type].dqi_fmt_id = format_id; 
INIT_LIST_HEAD(&dqopt->info[type].dqi_dirty_list); - mutex_lock(&dqopt->dqio_mutex); error = dqopt->ops[type]->read_file_info(sb, type); - if (error < 0) { - mutex_unlock(&dqopt->dqio_mutex); + if (error < 0) goto out_file_init; - } - if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) + if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) { + spin_lock(&dq_data_lock); dqopt->info[type].dqi_flags |= DQF_SYS_FILE; - mutex_unlock(&dqopt->dqio_mutex); + spin_unlock(&dq_data_lock); + } spin_lock(&dq_state_lock); dqopt->flags |= dquot_state_flag(flags, type); spin_unlock(&dq_state_lock); @@ -2572,7 +2586,7 @@ static void do_get_dqblk(struct dquot *dquot, struct qc_dqblk *di) struct mem_dqblk *dm = &dquot->dq_dqb; memset(di, 0, sizeof(*di)); - spin_lock(&dq_data_lock); + spin_lock(&dquot->dq_dqb_lock); di->d_spc_hardlimit = dm->dqb_bhardlimit; di->d_spc_softlimit = dm->dqb_bsoftlimit; di->d_ino_hardlimit = dm->dqb_ihardlimit; @@ -2581,7 +2595,7 @@ static void do_get_dqblk(struct dquot *dquot, struct qc_dqblk *di) di->d_ino_count = dm->dqb_curinodes; di->d_spc_timer = dm->dqb_btime; di->d_ino_timer = dm->dqb_itime; - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); } int dquot_get_dqblk(struct super_block *sb, struct kqid qid, @@ -2645,7 +2659,7 @@ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di) (di->d_ino_hardlimit > dqi->dqi_max_ino_limit))) return -ERANGE; - spin_lock(&dq_data_lock); + spin_lock(&dquot->dq_dqb_lock); if (di->d_fieldmask & QC_SPACE) { dm->dqb_curspace = di->d_space - dm->dqb_rsvspace; check_blim = 1; @@ -2711,7 +2725,7 @@ static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di) clear_bit(DQ_FAKE_B, &dquot->dq_flags); else set_bit(DQ_FAKE_B, &dquot->dq_flags); - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); mark_dquot_dirty(dquot); return 0; diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index 0738972e8d3f..bb3f59bcfcf5 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -379,7 +379,7 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) if (!ddquot) return -ENOMEM; - /* dq_off is guarded by dqio_mutex */ + /* dq_off is guarded by dqio_sem */ if (!dquot->dq_off) { ret = dq_insert_tree(info, dquot); if (ret < 0) { @@ -389,9 +389,9 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) return ret; } } - spin_lock(&dq_data_lock); + spin_lock(&dquot->dq_dqb_lock); info->dqi_ops->mem2disk_dqblk(ddquot, dquot); - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size, dquot->dq_off); if (ret != info->dqi_entry_size) { @@ -649,14 +649,14 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) kfree(ddquot); goto out; } - spin_lock(&dq_data_lock); + spin_lock(&dquot->dq_dqb_lock); info->dqi_ops->disk2mem_dqblk(dquot, ddquot); if (!dquot->dq_dqb.dqb_bhardlimit && !dquot->dq_dqb.dqb_bsoftlimit && !dquot->dq_dqb.dqb_ihardlimit && !dquot->dq_dqb.dqb_isoftlimit) set_bit(DQ_FAKE_B, &dquot->dq_flags); - spin_unlock(&dq_data_lock); + spin_unlock(&dquot->dq_dqb_lock); kfree(ddquot); out: dqstats_inc(DQST_READS); diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c index 8fe79beced5c..7ac5298aba70 100644 --- a/fs/quota/quota_v1.c +++ b/fs/quota/quota_v1.c @@ -56,8 +56,9 @@ static int v1_read_dqblk(struct dquot *dquot) { int type = dquot->dq_id.type; struct v1_disk_dqblk dqblk; + struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); - if (!sb_dqopt(dquot->dq_sb)->files[type]) + if 
(!dqopt->files[type]) return -EINVAL; /* Set structure to 0s in case read fails/is after end of file */ @@ -160,6 +161,7 @@ static int v1_read_file_info(struct super_block *sb, int type) struct v1_disk_dqblk dqblk; int ret; + down_read(&dqopt->dqio_sem); ret = sb->s_op->quota_read(sb, type, (char *)&dqblk, sizeof(struct v1_disk_dqblk), v1_dqoff(0)); if (ret != sizeof(struct v1_disk_dqblk)) { @@ -176,6 +178,7 @@ static int v1_read_file_info(struct super_block *sb, int type) dqopt->info[type].dqi_bgrace = dqblk.dqb_btime ? dqblk.dqb_btime : MAX_DQ_TIME; out: + up_read(&dqopt->dqio_sem); return ret; } @@ -185,7 +188,7 @@ static int v1_write_file_info(struct super_block *sb, int type) struct v1_disk_dqblk dqblk; int ret; - dqopt->info[type].dqi_flags &= ~DQF_INFO_DIRTY; + down_write(&dqopt->dqio_sem); ret = sb->s_op->quota_read(sb, type, (char *)&dqblk, sizeof(struct v1_disk_dqblk), v1_dqoff(0)); if (ret != sizeof(struct v1_disk_dqblk)) { @@ -193,8 +196,11 @@ static int v1_write_file_info(struct super_block *sb, int type) ret = -EIO; goto out; } + spin_lock(&dq_data_lock); + dqopt->info[type].dqi_flags &= ~DQF_INFO_DIRTY; dqblk.dqb_itime = dqopt->info[type].dqi_igrace; dqblk.dqb_btime = dqopt->info[type].dqi_bgrace; + spin_unlock(&dq_data_lock); ret = sb->s_op->quota_write(sb, type, (char *)&dqblk, sizeof(struct v1_disk_dqblk), v1_dqoff(0)); if (ret == sizeof(struct v1_disk_dqblk)) @@ -202,6 +208,7 @@ static int v1_write_file_info(struct super_block *sb, int type) else if (ret > 0) ret = -EIO; out: + up_write(&dqopt->dqio_sem); return ret; } diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index ca71bf881ad1..c0187cda2c1e 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -65,9 +65,11 @@ static int v2_read_header(struct super_block *sb, int type, if (size != sizeof(struct v2_disk_dqheader)) { quota_error(sb, "Failed header read: expected=%zd got=%zd", sizeof(struct v2_disk_dqheader), size); - return 0; + if (size < 0) + return size; + return -EIO; } - return 1; + return 0; } /* Check whether given file is really vfsv0 quotafile */ @@ -77,7 +79,7 @@ static int v2_check_quota_file(struct super_block *sb, int type) static const uint quota_magics[] = V2_INITQMAGICS; static const uint quota_versions[] = V2_INITQVERSIONS; - if (!v2_read_header(sb, type, &dqhead)) + if (v2_read_header(sb, type, &dqhead)) return 0; if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] || le32_to_cpu(dqhead.dqh_version) > quota_versions[type]) @@ -90,29 +92,38 @@ static int v2_read_file_info(struct super_block *sb, int type) { struct v2_disk_dqinfo dinfo; struct v2_disk_dqheader dqhead; - struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct quota_info *dqopt = sb_dqopt(sb); + struct mem_dqinfo *info = &dqopt->info[type]; struct qtree_mem_dqinfo *qinfo; ssize_t size; unsigned int version; + int ret; - if (!v2_read_header(sb, type, &dqhead)) - return -1; + down_read(&dqopt->dqio_sem); + ret = v2_read_header(sb, type, &dqhead); + if (ret < 0) + goto out; version = le32_to_cpu(dqhead.dqh_version); if ((info->dqi_fmt_id == QFMT_VFS_V0 && version != 0) || - (info->dqi_fmt_id == QFMT_VFS_V1 && version != 1)) - return -1; + (info->dqi_fmt_id == QFMT_VFS_V1 && version != 1)) { + ret = -EINVAL; + goto out; + } size = sb->s_op->quota_read(sb, type, (char *)&dinfo, sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); if (size != sizeof(struct v2_disk_dqinfo)) { quota_error(sb, "Can't read info structure"); - return -1; + if (size < 0) + ret = size; + else + ret = -EIO; + goto out; } info->dqi_priv = 
kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS); if (!info->dqi_priv) { - printk(KERN_WARNING - "Not enough memory for quota information structure.\n"); - return -ENOMEM; + ret = -ENOMEM; + goto out; } qinfo = info->dqi_priv; if (version == 0) { @@ -147,17 +158,22 @@ static int v2_read_file_info(struct super_block *sb, int type) qinfo->dqi_entry_size = sizeof(struct v2r1_disk_dqblk); qinfo->dqi_ops = &v2r1_qtree_ops; } - return 0; + ret = 0; +out: + up_read(&dqopt->dqio_sem); + return ret; } /* Write information header to quota file */ static int v2_write_file_info(struct super_block *sb, int type) { struct v2_disk_dqinfo dinfo; - struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct quota_info *dqopt = sb_dqopt(sb); + struct mem_dqinfo *info = &dqopt->info[type]; struct qtree_mem_dqinfo *qinfo = info->dqi_priv; ssize_t size; + down_write(&dqopt->dqio_sem); spin_lock(&dq_data_lock); info->dqi_flags &= ~DQF_INFO_DIRTY; dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace); @@ -170,6 +186,7 @@ static int v2_write_file_info(struct super_block *sb, int type) dinfo.dqi_free_entry = cpu_to_le32(qinfo->dqi_free_entry); size = sb->s_op->quota_write(sb, type, (char *)&dinfo, sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); + up_write(&dqopt->dqio_sem); if (size != sizeof(struct v2_disk_dqinfo)) { quota_error(sb, "Can't write info structure"); return -1; @@ -285,17 +302,51 @@ static int v2r1_is_id(void *dp, struct dquot *dquot) static int v2_read_dquot(struct dquot *dquot) { - return qtree_read_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot); + struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); + int ret; + + down_read(&dqopt->dqio_sem); + ret = qtree_read_dquot( + sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, + dquot); + up_read(&dqopt->dqio_sem); + return ret; } static int v2_write_dquot(struct dquot *dquot) { - return qtree_write_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot); + struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); + int ret; + bool alloc = false; + + /* + * If space for dquot is already allocated, we don't need any + * protection as we'll only overwrite the place of dquot. We are + * still protected by concurrent writes of the same dquot by + * dquot->dq_lock. 
+ */ + if (!dquot->dq_off) { + alloc = true; + down_write(&dqopt->dqio_sem); + } + ret = qtree_write_dquot( + sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, + dquot); + if (alloc) + up_write(&dqopt->dqio_sem); + return ret; } static int v2_release_dquot(struct dquot *dquot) { - return qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot); + struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); + int ret; + + down_write(&dqopt->dqio_sem); + ret = qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_id.type)->dqi_priv, dquot); + up_write(&dqopt->dqio_sem); + + return ret; } static int v2_free_file_info(struct super_block *sb, int type) @@ -306,7 +357,13 @@ static int v2_free_file_info(struct super_block *sb, int type) static int v2_get_next_id(struct super_block *sb, struct kqid *qid) { - return qtree_get_next_id(sb_dqinfo(sb, qid->type)->dqi_priv, qid); + struct quota_info *dqopt = sb_dqopt(sb); + int ret; + + down_read(&dqopt->dqio_sem); + ret = qtree_get_next_id(sb_dqinfo(sb, qid->type)->dqi_priv, qid); + up_read(&dqopt->dqio_sem); + return ret; } static const struct quota_format_ops v2_format_ops = { diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 2ef7ce75c062..3ac1f2387083 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -228,7 +228,7 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, if (!pages) goto out_free; - nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages); + nr = find_get_pages(inode->i_mapping, &pgoff, lpages, pages); if (nr != lpages) goto out_free_pages; /* leave if some pages were missing */ diff --git a/fs/read_write.c b/fs/read_write.c index cbdccaf032c6..a2b9a47235c5 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -33,7 +33,7 @@ const struct file_operations generic_ro_fops = { EXPORT_SYMBOL(generic_ro_fops); -static inline int unsigned_offsets(struct file *file) +static inline bool unsigned_offsets(struct file *file) { return file->f_mode & FMODE_UNSIGNED_OFFSET; } @@ -656,7 +656,7 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) EXPORT_SYMBOL(iov_shorten); static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, - loff_t *ppos, int type, int flags) + loff_t *ppos, int type, rwf_t flags) { struct kiocb kiocb; ssize_t ret; @@ -678,7 +678,7 @@ static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, /* Do it by hand, with file-ops */ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, - loff_t *ppos, int type, int flags) + loff_t *ppos, int type, rwf_t flags) { ssize_t ret = 0; @@ -894,7 +894,7 @@ out: #endif static ssize_t do_iter_read(struct file *file, struct iov_iter *iter, - loff_t *pos, int flags) + loff_t *pos, rwf_t flags) { size_t tot_len; ssize_t ret = 0; @@ -922,7 +922,7 @@ out: } ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, - int flags) + rwf_t flags) { if (!file->f_op->read_iter) return -EINVAL; @@ -931,7 +931,7 @@ ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos, EXPORT_SYMBOL(vfs_iter_read); static ssize_t do_iter_write(struct file *file, struct iov_iter *iter, - loff_t *pos, int flags) + loff_t *pos, rwf_t flags) { size_t tot_len; ssize_t ret = 0; @@ -958,7 +958,7 @@ static ssize_t do_iter_write(struct file *file, struct iov_iter *iter, } ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, - int flags) + rwf_t flags) { if (!file->f_op->write_iter) return -EINVAL; @@ 
-967,7 +967,7 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos, EXPORT_SYMBOL(vfs_iter_write); ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, - unsigned long vlen, loff_t *pos, int flags) + unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; @@ -984,7 +984,7 @@ ssize_t vfs_readv(struct file *file, const struct iovec __user *vec, } static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, - unsigned long vlen, loff_t *pos, int flags) + unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; @@ -1002,7 +1002,7 @@ static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec, } static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, - unsigned long vlen, int flags) + unsigned long vlen, rwf_t flags) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; @@ -1022,7 +1022,7 @@ static ssize_t do_readv(unsigned long fd, const struct iovec __user *vec, } static ssize_t do_writev(unsigned long fd, const struct iovec __user *vec, - unsigned long vlen, int flags) + unsigned long vlen, rwf_t flags) { struct fd f = fdget_pos(fd); ssize_t ret = -EBADF; @@ -1048,7 +1048,7 @@ static inline loff_t pos_from_hilo(unsigned long high, unsigned long low) } static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec, - unsigned long vlen, loff_t pos, int flags) + unsigned long vlen, loff_t pos, rwf_t flags) { struct fd f; ssize_t ret = -EBADF; @@ -1071,7 +1071,7 @@ static ssize_t do_preadv(unsigned long fd, const struct iovec __user *vec, } static ssize_t do_pwritev(unsigned long fd, const struct iovec __user *vec, - unsigned long vlen, loff_t pos, int flags) + unsigned long vlen, loff_t pos, rwf_t flags) { struct fd f; ssize_t ret = -EBADF; @@ -1115,7 +1115,7 @@ SYSCALL_DEFINE5(preadv, unsigned long, fd, const struct iovec __user *, vec, SYSCALL_DEFINE6(preadv2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, - int, flags) + rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); @@ -1135,7 +1135,7 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, unsigned long, vlen, unsigned long, pos_l, unsigned long, pos_h, - int, flags) + rwf_t, flags) { loff_t pos = pos_from_hilo(pos_h, pos_l); @@ -1148,7 +1148,7 @@ SYSCALL_DEFINE6(pwritev2, unsigned long, fd, const struct iovec __user *, vec, #ifdef CONFIG_COMPAT static size_t compat_readv(struct file *file, const struct compat_iovec __user *vec, - unsigned long vlen, loff_t *pos, int flags) + unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; @@ -1168,7 +1168,7 @@ static size_t compat_readv(struct file *file, static size_t do_compat_readv(compat_ulong_t fd, const struct compat_iovec __user *vec, - compat_ulong_t vlen, int flags) + compat_ulong_t vlen, rwf_t flags) { struct fd f = fdget_pos(fd); ssize_t ret; @@ -1194,7 +1194,7 @@ COMPAT_SYSCALL_DEFINE3(readv, compat_ulong_t, fd, static long do_compat_preadv64(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, loff_t pos, int flags) + unsigned long vlen, loff_t pos, rwf_t flags) { struct fd f; ssize_t ret; @@ -1232,7 +1232,7 @@ COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, #ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2 
COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, const struct compat_iovec __user *,vec, - unsigned long, vlen, loff_t, pos, int, flags) + unsigned long, vlen, loff_t, pos, rwf_t, flags) { return do_compat_preadv64(fd, vec, vlen, pos, flags); } @@ -1241,7 +1241,7 @@ COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, const struct compat_iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, - int, flags) + rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; @@ -1253,7 +1253,7 @@ COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, static size_t compat_writev(struct file *file, const struct compat_iovec __user *vec, - unsigned long vlen, loff_t *pos, int flags) + unsigned long vlen, loff_t *pos, rwf_t flags) { struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; @@ -1275,7 +1275,7 @@ static size_t compat_writev(struct file *file, static size_t do_compat_writev(compat_ulong_t fd, const struct compat_iovec __user* vec, - compat_ulong_t vlen, int flags) + compat_ulong_t vlen, rwf_t flags) { struct fd f = fdget_pos(fd); ssize_t ret; @@ -1300,7 +1300,7 @@ COMPAT_SYSCALL_DEFINE3(writev, compat_ulong_t, fd, static long do_compat_pwritev64(unsigned long fd, const struct compat_iovec __user *vec, - unsigned long vlen, loff_t pos, int flags) + unsigned long vlen, loff_t pos, rwf_t flags) { struct fd f; ssize_t ret; @@ -1338,7 +1338,7 @@ COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, #ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, const struct compat_iovec __user *,vec, - unsigned long, vlen, loff_t, pos, int, flags) + unsigned long, vlen, loff_t, pos, rwf_t, flags) { return do_compat_pwritev64(fd, vec, vlen, pos, flags); } @@ -1346,7 +1346,7 @@ COMPAT_SYSCALL_DEFINE5(pwritev64v2, unsigned long, fd, COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, const struct compat_iovec __user *,vec, - compat_ulong_t, vlen, u32, pos_low, u32, pos_high, int, flags) + compat_ulong_t, vlen, u32, pos_low, u32, pos_high, rwf_t, flags) { loff_t pos = ((loff_t)pos_high << 32) | pos_low; diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 45aa05e2232f..5b50689d8539 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -34,7 +34,7 @@ static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end, struct inode *inode = filp->f_mapping->host; int err; - err = filemap_write_and_wait_range(inode->i_mapping, start, end); + err = file_write_and_wait_range(filp, start, end); if (err) return err; diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index b396eb09f288..843aadcc123c 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -154,7 +154,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end, int err; int barrier_done; - err = filemap_write_and_wait_range(inode->i_mapping, start, end); + err = file_write_and_wait_range(filp, start, end); if (err) return err; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index a11d773e5ff3..0c882a0e2a6e 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1481,7 +1481,7 @@ static int flush_journal_list(struct super_block *s, if ((!was_jwait) && !buffer_locked(saved_bh)) { reiserfs_warning(s, "journal-813", "BAD! buffer %llu %cdirty %cjwait, " - "not in a newer tranasction", + "not in a newer transaction", (unsigned long long)saved_bh-> b_blocknr, was_dirty ? ' ' : '!', was_jwait ? 
' ' : '!'); diff --git a/fs/select.c b/fs/select.c index 9d5f15ed87fe..c6362e38ae92 100644 --- a/fs/select.c +++ b/fs/select.c @@ -1164,11 +1164,7 @@ int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, if (ufdset) { return compat_get_bitmap(fdset, ufdset, nr); } else { - /* Tricky, must clear full unsigned long in the - * kernel fdset at the end, ALIGN makes sure that - * actually happens. - */ - memset(fdset, 0, ALIGN(nr, BITS_PER_LONG)); + zero_fd_set(nr, fdset); return 0; } } diff --git a/fs/signalfd.c b/fs/signalfd.c index 593b022ac11b..d2c434112f42 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -95,23 +95,23 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, */ err |= __put_user(kinfo->si_signo, &uinfo->ssi_signo); err |= __put_user(kinfo->si_errno, &uinfo->ssi_errno); - err |= __put_user((short) kinfo->si_code, &uinfo->ssi_code); - switch (kinfo->si_code & __SI_MASK) { - case __SI_KILL: + err |= __put_user(kinfo->si_code, &uinfo->ssi_code); + switch (siginfo_layout(kinfo->si_signo, kinfo->si_code)) { + case SIL_KILL: err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid); break; - case __SI_TIMER: + case SIL_TIMER: err |= __put_user(kinfo->si_tid, &uinfo->ssi_tid); err |= __put_user(kinfo->si_overrun, &uinfo->ssi_overrun); err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr); err |= __put_user(kinfo->si_int, &uinfo->ssi_int); break; - case __SI_POLL: + case SIL_POLL: err |= __put_user(kinfo->si_band, &uinfo->ssi_band); err |= __put_user(kinfo->si_fd, &uinfo->ssi_fd); break; - case __SI_FAULT: + case SIL_FAULT: err |= __put_user((long) kinfo->si_addr, &uinfo->ssi_addr); #ifdef __ARCH_SI_TRAPNO err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno); @@ -128,20 +128,14 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, &uinfo->ssi_addr_lsb); #endif break; - case __SI_CHLD: + case SIL_CHLD: err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid); err |= __put_user(kinfo->si_status, &uinfo->ssi_status); err |= __put_user(kinfo->si_utime, &uinfo->ssi_utime); err |= __put_user(kinfo->si_stime, &uinfo->ssi_stime); break; - case __SI_RT: /* This is not generated by the kernel as of now. */ - case __SI_MESGQ: /* But this is */ - err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); - err |= __put_user(kinfo->si_uid, &uinfo->ssi_uid); - err |= __put_user((long) kinfo->si_ptr, &uinfo->ssi_ptr); - err |= __put_user(kinfo->si_int, &uinfo->ssi_int); - break; + case SIL_RT: default: /* * This case catches also the signals queued by sigqueue(). diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index ffb093e72b6c..1adb3346b9d6 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -165,6 +165,20 @@ config SQUASHFS_XZ If unsure, say N. +config SQUASHFS_ZSTD + bool "Include support for ZSTD compressed file systems" + depends on SQUASHFS + select ZSTD_DECOMPRESS + help + Saying Y here includes support for reading Squashfs file systems + compressed with ZSTD compression. ZSTD gives better compression than + the default ZLIB compression, while using less CPU. + + ZSTD is not the standard compression used in Squashfs and so most + file systems will be readable without selecting this option. + + If unsure, say N. + config SQUASHFS_4K_DEVBLK_SIZE bool "Use 4K device block size?" 
depends on SQUASHFS diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile index 246a6f329d89..6655631c53ae 100644 --- a/fs/squashfs/Makefile +++ b/fs/squashfs/Makefile @@ -15,3 +15,4 @@ squashfs-$(CONFIG_SQUASHFS_LZ4) += lz4_wrapper.o squashfs-$(CONFIG_SQUASHFS_LZO) += lzo_wrapper.o squashfs-$(CONFIG_SQUASHFS_XZ) += xz_wrapper.o squashfs-$(CONFIG_SQUASHFS_ZLIB) += zlib_wrapper.o +squashfs-$(CONFIG_SQUASHFS_ZSTD) += zstd_wrapper.o diff --git a/fs/squashfs/decompressor.c b/fs/squashfs/decompressor.c index d2bc13636f79..836639810ea0 100644 --- a/fs/squashfs/decompressor.c +++ b/fs/squashfs/decompressor.c @@ -65,6 +65,12 @@ static const struct squashfs_decompressor squashfs_zlib_comp_ops = { }; #endif +#ifndef CONFIG_SQUASHFS_ZSTD +static const struct squashfs_decompressor squashfs_zstd_comp_ops = { + NULL, NULL, NULL, NULL, ZSTD_COMPRESSION, "zstd", 0 +}; +#endif + static const struct squashfs_decompressor squashfs_unknown_comp_ops = { NULL, NULL, NULL, NULL, 0, "unknown", 0 }; @@ -75,6 +81,7 @@ static const struct squashfs_decompressor *decompressor[] = { &squashfs_lzo_comp_ops, &squashfs_xz_comp_ops, &squashfs_lzma_unsupported_comp_ops, + &squashfs_zstd_comp_ops, &squashfs_unknown_comp_ops }; diff --git a/fs/squashfs/decompressor.h b/fs/squashfs/decompressor.h index a25713c031a5..0f5a8e4e58da 100644 --- a/fs/squashfs/decompressor.h +++ b/fs/squashfs/decompressor.h @@ -58,4 +58,8 @@ extern const struct squashfs_decompressor squashfs_lzo_comp_ops; extern const struct squashfs_decompressor squashfs_zlib_comp_ops; #endif +#ifdef CONFIG_SQUASHFS_ZSTD +extern const struct squashfs_decompressor squashfs_zstd_comp_ops; +#endif + #endif diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h index 506f4ba5b983..24d12fd14177 100644 --- a/fs/squashfs/squashfs_fs.h +++ b/fs/squashfs/squashfs_fs.h @@ -241,6 +241,7 @@ struct meta_index { #define LZO_COMPRESSION 3 #define XZ_COMPRESSION 4 #define LZ4_COMPRESSION 5 +#define ZSTD_COMPRESSION 6 struct squashfs_super_block { __le32 s_magic; diff --git a/fs/squashfs/zstd_wrapper.c b/fs/squashfs/zstd_wrapper.c new file mode 100644 index 000000000000..eeaabf881159 --- /dev/null +++ b/fs/squashfs/zstd_wrapper.c @@ -0,0 +1,151 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2016-present, Facebook, Inc. + * All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * zstd_wrapper.c + */ + +#include <linux/mutex.h> +#include <linux/buffer_head.h> +#include <linux/slab.h> +#include <linux/zstd.h> +#include <linux/vmalloc.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs.h" +#include "decompressor.h" +#include "page_actor.h" + +struct workspace { + void *mem; + size_t mem_size; + size_t window_size; +}; + +static void *zstd_init(struct squashfs_sb_info *msblk, void *buff) +{ + struct workspace *wksp = kmalloc(sizeof(*wksp), GFP_KERNEL); + + if (wksp == NULL) + goto failed; + wksp->window_size = max_t(size_t, + msblk->block_size, SQUASHFS_METADATA_SIZE); + wksp->mem_size = ZSTD_DStreamWorkspaceBound(wksp->window_size); + wksp->mem = vmalloc(wksp->mem_size); + if (wksp->mem == NULL) + goto failed; + + return wksp; + +failed: + ERROR("Failed to allocate zstd workspace\n"); + kfree(wksp); + return ERR_PTR(-ENOMEM); +} + + +static void zstd_free(void *strm) +{ + struct workspace *wksp = strm; + + if (wksp) + vfree(wksp->mem); + kfree(wksp); +} + + +static int zstd_uncompress(struct squashfs_sb_info *msblk, void *strm, + struct buffer_head **bh, int b, int offset, int length, + struct squashfs_page_actor *output) +{ + struct workspace *wksp = strm; + ZSTD_DStream *stream; + size_t total_out = 0; + size_t zstd_err; + int k = 0; + ZSTD_inBuffer in_buf = { NULL, 0, 0 }; + ZSTD_outBuffer out_buf = { NULL, 0, 0 }; + + stream = ZSTD_initDStream(wksp->window_size, wksp->mem, wksp->mem_size); + + if (!stream) { + ERROR("Failed to initialize zstd decompressor\n"); + goto out; + } + + out_buf.size = PAGE_SIZE; + out_buf.dst = squashfs_first_page(output); + + do { + if (in_buf.pos == in_buf.size && k < b) { + int avail = min(length, msblk->devblksize - offset); + + length -= avail; + in_buf.src = bh[k]->b_data + offset; + in_buf.size = avail; + in_buf.pos = 0; + offset = 0; + } + + if (out_buf.pos == out_buf.size) { + out_buf.dst = squashfs_next_page(output); + if (out_buf.dst == NULL) { + /* Shouldn't run out of pages + * before stream is done. 
+ */ + squashfs_finish_page(output); + goto out; + } + out_buf.pos = 0; + out_buf.size = PAGE_SIZE; + } + + total_out -= out_buf.pos; + zstd_err = ZSTD_decompressStream(stream, &out_buf, &in_buf); + total_out += out_buf.pos; /* add the additional data produced */ + + if (in_buf.pos == in_buf.size && k < b) + put_bh(bh[k++]); + } while (zstd_err != 0 && !ZSTD_isError(zstd_err)); + + squashfs_finish_page(output); + + if (ZSTD_isError(zstd_err)) { + ERROR("zstd decompression error: %d\n", + (int)ZSTD_getErrorCode(zstd_err)); + goto out; + } + + if (k < b) + goto out; + + return (int)total_out; + +out: + for (; k < b; k++) + put_bh(bh[k]); + + return -EIO; +} + +const struct squashfs_decompressor squashfs_zstd_comp_ops = { + .init = zstd_init, + .free = zstd_free, + .decompress = zstd_uncompress, + .id = ZSTD_COMPRESSION, + .name = "zstd", + .supported = 1 +}; diff --git a/fs/stat.c b/fs/stat.c index c35610845ab1..8a6aa8caf891 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -710,7 +710,7 @@ loff_t inode_get_bytes(struct inode *inode) loff_t ret; spin_lock(&inode->i_lock); - ret = (((loff_t)inode->i_blocks) << 9) + inode->i_bytes; + ret = __inode_get_bytes(inode); spin_unlock(&inode->i_lock); return ret; } diff --git a/fs/super.c b/fs/super.c index 6bc3352adcf3..221cfa1f4e92 100644 --- a/fs/super.c +++ b/fs/super.c @@ -242,7 +242,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, atomic_set(&s->s_active, 1); mutex_init(&s->s_vfs_rename_mutex); lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key); - mutex_init(&s->s_dquot.dqio_mutex); + init_rwsem(&s->s_dquot.dqio_sem); s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; diff --git a/fs/sync.c b/fs/sync.c index 2a54c1f22035..2e3fd7d94d2d 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -335,14 +335,9 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, goto out_put; mapping = f.file->f_mapping; - if (!mapping) { - ret = -EINVAL; - goto out_put; - } - ret = 0; if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { - ret = filemap_fdatawait_range(mapping, offset, endbyte); + ret = file_fdatawait_range(f.file, offset, endbyte); if (ret < 0) goto out_put; } @@ -355,7 +350,7 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, } if (flags & SYNC_FILE_RANGE_WAIT_AFTER) - ret = filemap_fdatawait_range(mapping, offset, endbyte); + ret = file_fdatawait_range(f.file, offset, endbyte); out_put: fdput(f); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 8cad0b19b404..a02aa59d1e24 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1337,7 +1337,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) */ return 0; - err = filemap_write_and_wait_range(inode->i_mapping, start, end); + err = file_write_and_wait_range(file, start, end); if (err) return err; inode_lock(inode); @@ -1490,7 +1490,10 @@ static int ubifs_migrate_page(struct address_space *mapping, SetPagePrivate(newpage); } - migrate_page_copy(newpage, page); + if (mode != MIGRATE_SYNC_NO_COPY) + migrate_page_copy(newpage, page); + else + migrate_page_states(newpage, page); return MIGRATEPAGE_SUCCESS; } #endif diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 18fdb9d90812..8dacf4f57414 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -114,7 +114,7 @@ static void udf_update_extent_cache(struct inode *inode, loff_t estart, __udf_clear_extent_cache(inode); if (pos->bh) get_bh(pos->bh); - memcpy(&iinfo->cached_extent.epos, pos, sizeof(struct extent_position)); + 
memcpy(&iinfo->cached_extent.epos, pos, sizeof(*pos)); iinfo->cached_extent.lstart = estart; switch (iinfo->i_alloc_type) { case ICBTAG_FLAG_AD_SHORT: @@ -1572,13 +1572,8 @@ static int udf_alloc_i_data(struct inode *inode, size_t size) { struct udf_inode_info *iinfo = UDF_I(inode); iinfo->i_ext.i_data = kmalloc(size, GFP_KERNEL); - - if (!iinfo->i_ext.i_data) { - udf_err(inode->i_sb, "(ino %ld) no free memory\n", - inode->i_ino); + if (!iinfo->i_ext.i_data) return -ENOMEM; - } - return 0; } @@ -1703,7 +1698,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) dsea->impUseLength = cpu_to_le32(sizeof(struct regid)); } eid = (struct regid *)dsea->impUse; - memset(eid, 0, sizeof(struct regid)); + memset(eid, 0, sizeof(*eid)); strcpy(eid->ident, UDF_ID_DEVELOPER); eid->identSuffix[0] = UDF_OS_CLASS_UNIX; eid->identSuffix[1] = UDF_OS_ID_LINUX; @@ -1754,7 +1749,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime); udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime); - memset(&(efe->impIdent), 0, sizeof(struct regid)); + memset(&(efe->impIdent), 0, sizeof(efe->impIdent)); strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER); efe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; efe->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 385ee89d5824..885198dfd9f8 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -1184,7 +1184,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, */ ncfi.fileVersionNum = ocfi.fileVersionNum; ncfi.fileCharacteristics = ocfi.fileCharacteristics; - memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(struct long_ad)); + memcpy(&(ncfi.icb), &(ocfi.icb), sizeof(ocfi.icb)); udf_write_fi(new_dir, &ncfi, nfi, &nfibh, NULL, NULL); /* The old fid may have moved - find it again */ diff --git a/fs/udf/super.c b/fs/udf/super.c index 462ac2e9258c..93c59630512b 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -266,11 +266,8 @@ static int udf_sb_alloc_partition_maps(struct super_block *sb, u32 count) { struct udf_sb_info *sbi = UDF_SB(sb); - sbi->s_partmaps = kcalloc(count, sizeof(struct udf_part_map), - GFP_KERNEL); + sbi->s_partmaps = kcalloc(count, sizeof(*sbi->s_partmaps), GFP_KERNEL); if (!sbi->s_partmaps) { - udf_err(sb, "Unable to allocate space for %d partition maps\n", - count); sbi->s_partitions = 0; return -ENOMEM; } @@ -324,7 +321,8 @@ static void udf_sb_free_partitions(struct super_block *sb) { struct udf_sb_info *sbi = UDF_SB(sb); int i; - if (sbi->s_partmaps == NULL) + + if (!sbi->s_partmaps) return; for (i = 0; i < sbi->s_partitions; i++) udf_free_partition(&sbi->s_partmaps[i]); @@ -1071,7 +1069,7 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index) else bitmap = vzalloc(size); /* TODO: get rid of vzalloc */ - if (bitmap == NULL) + if (!bitmap) return NULL; bitmap->s_nr_groups = nr_groups; @@ -2099,7 +2097,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) uopt.fmode = UDF_INVALID_MODE; uopt.dmode = UDF_INVALID_MODE; - sbi = kzalloc(sizeof(struct udf_sb_info), GFP_KERNEL); + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index b0d5897bc4e6..ef4b48d1ea42 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -109,27 +109,24 @@ static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode, goto out; WRITE_ONCE(uwq->waken, true); /* - * The implicit smp_mb__before_spinlock in try_to_wake_up() - * 
renders uwq->waken visible to other CPUs before the task is - * waken. + * The Program-Order guarantees provided by the scheduler + * ensure uwq->waken is visible before the task is woken. */ ret = wake_up_state(wq->private, mode); - if (ret) + if (ret) { /* * Wake only once, autoremove behavior. * - * After the effect of list_del_init is visible to the - * other CPUs, the waitqueue may disappear from under - * us, see the !list_empty_careful() in - * handle_userfault(). try_to_wake_up() has an - * implicit smp_mb__before_spinlock, and the - * wq->private is read before calling the extern - * function "wake_up_state" (which in turns calls - * try_to_wake_up). While the spin_lock;spin_unlock; - * wouldn't be enough, the smp_mb__before_spinlock is - * enough to avoid an explicit smp_mb() here. + * After the effect of list_del_init is visible to the other + * CPUs, the waitqueue may disappear from under us, see the + * !list_empty_careful() in handle_userfault(). + * + * try_to_wake_up() has an implicit smp_mb(), and the + * wq->private is read before calling the extern function + * "wake_up_state" (which in turns calls try_to_wake_up). */ list_del_init(&wq->entry); + } out: return ret; } @@ -181,7 +178,8 @@ static inline void msg_init(struct uffd_msg *msg) static inline struct uffd_msg userfault_msg(unsigned long address, unsigned int flags, - unsigned long reason) + unsigned long reason, + unsigned int features) { struct uffd_msg msg; msg_init(&msg); @@ -205,6 +203,8 @@ static inline struct uffd_msg userfault_msg(unsigned long address, * write protect fault. */ msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP; + if (features & UFFD_FEATURE_THREAD_ID) + msg.arg.pagefault.feat.ptid = task_pid_vnr(current); return msg; } @@ -373,13 +373,34 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) VM_BUG_ON(reason & ~(VM_UFFD_MISSING|VM_UFFD_WP)); VM_BUG_ON(!(reason & VM_UFFD_MISSING) ^ !!(reason & VM_UFFD_WP)); + if (ctx->features & UFFD_FEATURE_SIGBUS) + goto out; + /* * If it's already released don't get it. This avoids to loop * in __get_user_pages if userfaultfd_release waits on the * caller of handle_userfault to release the mmap_sem. */ - if (unlikely(ACCESS_ONCE(ctx->released))) + if (unlikely(ACCESS_ONCE(ctx->released))) { + /* + * Don't return VM_FAULT_SIGBUS in this case, so a non + * cooperative manager can close the uffd after the + * last UFFDIO_COPY, without risking to trigger an + * involuntary SIGBUS if the process was starting the + * userfaultfd while the userfaultfd was still armed + * (but after the last UFFDIO_COPY). If the uffd + * wasn't already closed when the userfault reached + * this point, that would normally be solved by + * userfaultfd_must_wait returning 'false'. + * + * If we were to return VM_FAULT_SIGBUS here, the non + * cooperative manager would be instead forced to + * always call UFFDIO_UNREGISTER before it can safely + * close the uffd. + */ + ret = VM_FAULT_NOPAGE; goto out; + } /* * Check that we can return VM_FAULT_RETRY. 
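The handle_userfault() and userfault_msg() hunks above add two opt-in features negotiated through UFFDIO_API: UFFD_FEATURE_THREAD_ID, which reports the faulting thread's pid in the pagefault message, and UFFD_FEATURE_SIGBUS, which delivers SIGBUS to the faulting task instead of queuing a userfault message. A minimal user-space sketch of the handshake, with error handling trimmed (this program is an illustration, not part of the patch):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

int main(void)
{
        struct uffdio_api api = {
                .api = UFFD_API,
                .features = UFFD_FEATURE_SIGBUS,        /* request SIGBUS-on-fault */
        };
        int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

        if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api) < 0) {
                perror("userfaultfd/UFFDIO_API");
                return 1;
        }
        printf("features supported by this kernel: 0x%llx\n",
               (unsigned long long)api.features);
        return 0;
}

On a kernel without the feature, the UFFDIO_API ioctl should fail with EINVAL, so a manager can fall back to the ordinary message-based protocol.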
@@ -422,7 +443,8 @@ int handle_userfault(struct vm_fault *vmf, unsigned long reason) init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; - uwq.msg = userfault_msg(vmf->address, vmf->flags, reason); + uwq.msg = userfault_msg(vmf->address, vmf->flags, reason, + ctx->features); uwq.ctx = ctx; uwq.waken = false; @@ -1197,7 +1219,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, struct uffdio_register __user *user_uffdio_register; unsigned long vm_flags, new_flags; bool found; - bool non_anon_pages; + bool basic_ioctls; unsigned long start, end, vma_end; user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1263,7 +1285,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, * Search for not compatible vmas. */ found = false; - non_anon_pages = false; + basic_ioctls = false; for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { cond_resched(); @@ -1302,8 +1324,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, /* * Note vmas containing huge pages */ - if (is_vm_hugetlb_page(cur) || vma_is_shmem(cur)) - non_anon_pages = true; + if (is_vm_hugetlb_page(cur)) + basic_ioctls = true; found = true; } @@ -1374,7 +1396,7 @@ out_unlock: * userland which ioctls methods are guaranteed to * succeed on this range. */ - if (put_user(non_anon_pages ? UFFD_API_RANGE_IOCTLS_BASIC : + if (put_user(basic_ioctls ? UFFD_API_RANGE_IOCTLS_BASIC : UFFD_API_RANGE_IOCTLS, &user_uffdio_register->ioctls)) ret = -EFAULT; diff --git a/fs/utimes.c b/fs/utimes.c index 6571d8c848a0..51edb9f9507c 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -22,7 +22,7 @@ */ SYSCALL_DEFINE2(utime, char __user *, filename, struct utimbuf __user *, times) { - struct timespec tv[2]; + struct timespec64 tv[2]; if (times) { if (get_user(tv[0].tv_sec, ×->actime) || @@ -44,7 +44,7 @@ static bool nsec_valid(long nsec) return nsec >= 0 && nsec <= 999999999; } -static int utimes_common(const struct path *path, struct timespec *times) +static int utimes_common(const struct path *path, struct timespec64 *times) { int error; struct iattr newattrs; @@ -115,7 +115,7 @@ out: * must be owner or have write permission. * Else, update from *times, must be owner or super user. */ -long do_utimes(int dfd, const char __user *filename, struct timespec *times, +long do_utimes(int dfd, const char __user *filename, struct timespec64 *times, int flags) { int error = -EINVAL; @@ -167,10 +167,11 @@ out: SYSCALL_DEFINE4(utimensat, int, dfd, const char __user *, filename, struct timespec __user *, utimes, int, flags) { - struct timespec tstimes[2]; + struct timespec64 tstimes[2]; if (utimes) { - if (copy_from_user(&tstimes, utimes, sizeof(tstimes))) + if ((get_timespec64(&tstimes[0], &utimes[0]) || + get_timespec64(&tstimes[1], &utimes[1]))) return -EFAULT; /* Nothing to do, we must not even check the path. 
*/ @@ -186,7 +187,7 @@ SYSCALL_DEFINE3(futimesat, int, dfd, const char __user *, filename, struct timeval __user *, utimes) { struct timeval times[2]; - struct timespec tstimes[2]; + struct timespec64 tstimes[2]; if (utimes) { if (copy_from_user(×, utimes, sizeof(times))) @@ -224,7 +225,7 @@ SYSCALL_DEFINE2(utimes, char __user *, filename, COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename, struct compat_utimbuf __user *, t) { - struct timespec tv[2]; + struct timespec64 tv[2]; if (t) { if (get_user(tv[0].tv_sec, &t->actime) || @@ -238,11 +239,11 @@ COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename, COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct compat_timespec __user *, t, int, flags) { - struct timespec tv[2]; + struct timespec64 tv[2]; if (t) { - if (compat_get_timespec(&tv[0], &t[0]) || - compat_get_timespec(&tv[1], &t[1])) + if (compat_get_timespec64(&tv[0], &t[0]) || + compat_get_timespec64(&tv[1], &t[1])) return -EFAULT; if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT) @@ -253,7 +254,7 @@ COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filena COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, struct compat_timeval __user *, t) { - struct timespec tv[2]; + struct timespec64 tv[2]; if (t) { if (get_user(tv[0].tv_sec, &t[0].tv_sec) || diff --git a/fs/xattr.c b/fs/xattr.c index 464c94bf65f9..4424f7fecf14 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -23,6 +23,7 @@ #include <linux/posix_acl_xattr.h> #include <linux/uaccess.h> +#include "internal.h" static const char * strcmp_prefix(const char *a, const char *a_prefix) @@ -441,6 +442,12 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value, if ((strcmp(kname, XATTR_NAME_POSIX_ACL_ACCESS) == 0) || (strcmp(kname, XATTR_NAME_POSIX_ACL_DEFAULT) == 0)) posix_acl_fix_xattr_from_user(kvalue, size); + else if (strcmp(kname, XATTR_NAME_CAPS) == 0) { + error = cap_convert_nscap(d, &kvalue, size); + if (error < 0) + goto out; + size = error; + } } error = vfs_setxattr(d, kname, kvalue, size, flags); @@ -496,10 +503,10 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, if (!f.file) return error; audit_file(f.file); - error = mnt_want_write_file(f.file); + error = mnt_want_write_file_path(f.file); if (!error) { error = setxattr(f.file->f_path.dentry, name, value, size, flags); - mnt_drop_write_file(f.file); + mnt_drop_write_file_path(f.file); } fdput(f); return error; @@ -728,10 +735,10 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) if (!f.file) return error; audit_file(f.file); - error = mnt_want_write_file(f.file); + error = mnt_want_write_file_path(f.file); if (!error) { error = removexattr(f.file->f_path.dentry, name); - mnt_drop_write_file(f.file); + mnt_drop_write_file_path(f.file); } fdput(f); return error; diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index de7b9bd30bec..6249c92671de 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -328,20 +328,19 @@ xfs_attr_set( */ xfs_defer_init(args.dfops, args.firstblock); error = xfs_attr_shortform_to_leaf(&args); - if (!error) - error = xfs_defer_finish(&args.trans, args.dfops, dp); - if (error) { - args.trans = NULL; - xfs_defer_cancel(&dfops); - goto out; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args.dfops, dp); + error = xfs_defer_finish(&args.trans, args.dfops); + if (error) + goto out_defer_cancel; /* * Commit the leaf transformation. 
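For the utimensat() conversion earlier in this series of hunks, the user-visible semantics are unchanged: UTIME_NOW and UTIME_OMIT keep their meaning, and the both-UTIME_OMIT case still returns before the path is examined. A short userspace refresher of those flags (the file name is a placeholder):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <time.h>

int main(void)
{
	struct timespec ts[2] = {
		{ .tv_nsec = UTIME_OMIT },	/* leave atime untouched */
		{ .tv_nsec = UTIME_NOW },	/* set mtime to "now" */
	};

	/* tv_sec is ignored for UTIME_NOW/UTIME_OMIT entries. */
	if (utimensat(AT_FDCWD, "somefile", ts, 0) != 0) {
		perror("utimensat");
		return 1;
	}
	return 0;
}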
We'll need another (linked) * transaction to add the new attribute to the leaf. */ - error = xfs_trans_roll(&args.trans, dp); + error = xfs_trans_roll_inode(&args.trans, dp); if (error) goto out; @@ -373,6 +372,9 @@ xfs_attr_set( return error; +out_defer_cancel: + xfs_defer_cancel(&dfops); + args.trans = NULL; out: if (args.trans) xfs_trans_cancel(args.trans); @@ -593,19 +595,18 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) */ xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_node(args); - if (!error) - error = xfs_defer_finish(&args->trans, args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - return error; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; /* * Commit the current trans (including the inode) and start * a new one. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); if (error) return error; @@ -620,7 +621,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) * Commit the transaction that added the attr name so that * later routines can manage their own transactions. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); if (error) return error; @@ -684,20 +685,18 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (!error) - error = xfs_defer_finish(&args->trans, - args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - return error; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; } /* * Commit the remove and start the next trans in series. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); } else if (args->rmtblkno > 0) { /* @@ -706,6 +705,10 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) error = xfs_attr3_leaf_clearflag(args); } return error; +out_defer_cancel: + xfs_defer_cancel(args->dfops); + args->trans = NULL; + return error; } /* @@ -747,15 +750,18 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (!error) - error = xfs_defer_finish(&args->trans, args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - return error; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; } return 0; +out_defer_cancel: + xfs_defer_cancel(args->dfops); + args->trans = NULL; + return error; } /* @@ -872,20 +878,18 @@ restart: state = NULL; xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_node(args); - if (!error) - error = xfs_defer_finish(&args->trans, - args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - goto out; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; /* * Commit the node conversion and start the next * trans in the chain. 
*/ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); if (error) goto out; @@ -900,13 +904,12 @@ restart: */ xfs_defer_init(args->dfops, args->firstblock); error = xfs_da3_split(state); - if (!error) - error = xfs_defer_finish(&args->trans, args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - goto out; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; } else { /* * Addition succeeded, update Btree hashvals. @@ -925,7 +928,7 @@ restart: * Commit the leaf addition or btree split and start the next * trans in the chain. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); if (error) goto out; @@ -999,20 +1002,18 @@ restart: if (retval && (state->path.active > 1)) { xfs_defer_init(args->dfops, args->firstblock); error = xfs_da3_join(state); - if (!error) - error = xfs_defer_finish(&args->trans, - args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - goto out; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; } /* * Commit and start the next trans in the chain. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); if (error) goto out; @@ -1032,6 +1033,10 @@ out: if (error) return error; return retval; +out_defer_cancel: + xfs_defer_cancel(args->dfops); + args->trans = NULL; + goto out; } /* @@ -1122,17 +1127,16 @@ xfs_attr_node_removename(xfs_da_args_t *args) if (retval && (state->path.active > 1)) { xfs_defer_init(args->dfops, args->firstblock); error = xfs_da3_join(state); - if (!error) - error = xfs_defer_finish(&args->trans, args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - goto out; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; /* * Commit the Btree join operation and start a new trans. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); if (error) goto out; } @@ -1156,14 +1160,12 @@ xfs_attr_node_removename(xfs_da_args_t *args) xfs_defer_init(args->dfops, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (!error) - error = xfs_defer_finish(&args->trans, - args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - goto out; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; } else xfs_trans_brelse(args->trans, bp); } @@ -1172,6 +1174,10 @@ xfs_attr_node_removename(xfs_da_args_t *args) out: xfs_da_state_free(state); return error; +out_defer_cancel: + xfs_defer_cancel(args->dfops); + args->trans = NULL; + goto out; } /* diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index c6c15e5717e4..5c16db86b38f 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -2608,7 +2608,7 @@ xfs_attr3_leaf_clearflag( /* * Commit the flag value change and start the next trans in series. 
*/ - return xfs_trans_roll(&args->trans, args->dp); + return xfs_trans_roll_inode(&args->trans, args->dp); } /* @@ -2659,7 +2659,7 @@ xfs_attr3_leaf_setflag( /* * Commit the flag value change and start the next trans in series. */ - return xfs_trans_roll(&args->trans, args->dp); + return xfs_trans_roll_inode(&args->trans, args->dp); } /* @@ -2777,7 +2777,7 @@ xfs_attr3_leaf_flipflags( /* * Commit the flag value change and start the next trans in series. */ - error = xfs_trans_roll(&args->trans, args->dp); + error = xfs_trans_roll_inode(&args->trans, args->dp); return error; } diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 5236d8e45146..d56caf037ca0 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -467,13 +467,12 @@ xfs_attr_rmtval_set( error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, blkcnt, XFS_BMAPI_ATTRFORK, args->firstblock, args->total, &map, &nmap, args->dfops); - if (!error) - error = xfs_defer_finish(&args->trans, args->dfops, dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - return error; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && @@ -484,7 +483,7 @@ xfs_attr_rmtval_set( /* * Start the next trans in the chain. */ - error = xfs_trans_roll(&args->trans, dp); + error = xfs_trans_roll_inode(&args->trans, dp); if (error) return error; } @@ -539,6 +538,10 @@ xfs_attr_rmtval_set( } ASSERT(valuelen == 0); return 0; +out_defer_cancel: + xfs_defer_cancel(args->dfops); + args->trans = NULL; + return error; } /* @@ -609,21 +612,23 @@ xfs_attr_rmtval_remove( error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, XFS_BMAPI_ATTRFORK, 1, args->firstblock, args->dfops, &done); - if (!error) - error = xfs_defer_finish(&args->trans, args->dfops, - args->dp); - if (error) { - args->trans = NULL; - xfs_defer_cancel(args->dfops); - return error; - } + if (error) + goto out_defer_cancel; + xfs_defer_ijoin(args->dfops, args->dp); + error = xfs_defer_finish(&args->trans, args->dfops); + if (error) + goto out_defer_cancel; /* * Close out trans and start the next one in the chain. */ - error = xfs_trans_roll(&args->trans, args->dp); + error = xfs_trans_roll_inode(&args->trans, args->dp); if (error) return error; } return 0; +out_defer_cancel: + xfs_defer_cancel(args->dfops); + args->trans = NULL; + return error; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index c09c16b1ad3b..459f4b4f08fe 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -579,7 +579,7 @@ xfs_bmap_validate_ret( #else #define xfs_bmap_check_leaf_extents(cur, ip, whichfork) do { } while (0) -#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) +#define xfs_bmap_validate_ret(bno,len,flags,mval,onmap,nmap) do { } while (0) #endif /* DEBUG */ /* @@ -880,7 +880,7 @@ xfs_bmap_local_to_extents( xfs_ifork_t *ifp; /* inode fork pointer */ xfs_alloc_arg_t args; /* allocation arguments */ xfs_buf_t *bp; /* buffer for extent block */ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ + struct xfs_bmbt_irec rec; /* * We don't want to deal with the case of keeping inode data inline yet. 
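The xfs_attr.c, xfs_attr_leaf.c and xfs_attr_remote.c hunks above all converge on one error-handling shape. Condensed into a single sketch below; the function name is made up for illustration, but every call in the body appears verbatim in the hunks:

/*
 * The inode is joined to the deferred ops explicitly via xfs_defer_ijoin(),
 * xfs_defer_finish() no longer takes the inode, and every failure funnels
 * through a single out_defer_cancel label that also clears args->trans.
 */
STATIC int
xfs_attr_example_step(
	struct xfs_da_args	*args,
	struct xfs_inode	*dp)
{
	int			error;

	xfs_defer_init(args->dfops, args->firstblock);

	error = xfs_attr3_leaf_to_node(args);	/* or any other deferred step */
	if (error)
		goto out_defer_cancel;

	xfs_defer_ijoin(args->dfops, dp);
	error = xfs_defer_finish(&args->trans, args->dfops);
	if (error)
		goto out_defer_cancel;

	/* commit this step and start the next transaction in the chain */
	return xfs_trans_roll_inode(&args->trans, dp);

out_defer_cancel:
	xfs_defer_cancel(args->dfops);
	args->trans = NULL;
	return error;
}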
@@ -943,9 +943,12 @@ xfs_bmap_local_to_extents( xfs_bmap_local_to_extents_empty(ip, whichfork); flags |= XFS_ILOG_CORE; - xfs_iext_add(ifp, 0, 1); - ep = xfs_iext_get_ext(ifp, 0); - xfs_bmbt_set_allf(ep, 0, args.fsbno, 1, XFS_EXT_NORM); + rec.br_startoff = 0; + rec.br_startblock = args.fsbno; + rec.br_blockcount = 1; + rec.br_state = XFS_EXT_NORM; + xfs_iext_insert(ip, 0, 1, &rec, 0); + trace_xfs_bmap_post_update(ip, 0, whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0, _THIS_IP_); @@ -1196,7 +1199,7 @@ xfs_bmap_add_attrfork( xfs_log_sb(tp); } - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto bmap_cancel; error = xfs_trans_commit(tp); @@ -1356,7 +1359,6 @@ xfs_bmap_first_unused( xfs_fileoff_t lastaddr; /* last block number seen */ xfs_fileoff_t lowest; /* lowest useful block */ xfs_fileoff_t max; /* starting useful block */ - xfs_fileoff_t off; /* offset for this block */ xfs_extnum_t nextents; /* number of extent entries */ ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE || @@ -1373,16 +1375,19 @@ xfs_bmap_first_unused( lowest = *first_unused; nextents = xfs_iext_count(ifp); for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) { - xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); - off = xfs_bmbt_get_startoff(ep); + struct xfs_bmbt_irec got; + + xfs_iext_get_extent(ifp, idx, &got); + /* * See if the hole before this extent will work. */ - if (off >= lowest + len && off - max >= len) { + if (got.br_startoff >= lowest + len && + got.br_startoff - max >= len) { *first_unused = max; return 0; } - lastaddr = off + xfs_bmbt_get_blockcount(ep); + lastaddr = got.br_startoff + got.br_blockcount; max = XFS_FILEOFF_MAX(lastaddr, lowest); } *first_unused = max; @@ -4918,7 +4923,7 @@ xfs_bmap_del_extent_delay( da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, got->br_blockcount), da_old); got->br_startblock = nullstartblock((int)da_new); - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + xfs_iext_update_extent(ifp, *idx, got); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); break; case BMAP_RIGHT_CONTIG: @@ -4930,7 +4935,7 @@ xfs_bmap_del_extent_delay( da_new = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, got->br_blockcount), da_old); got->br_startblock = nullstartblock((int)da_new); - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + xfs_iext_update_extent(ifp, *idx, got); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); break; case 0: @@ -4956,7 +4961,7 @@ xfs_bmap_del_extent_delay( del->br_blockcount); got->br_startblock = nullstartblock((int)got_indlen); - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + xfs_iext_update_extent(ifp, *idx, got); trace_xfs_bmap_post_update(ip, *idx, 0, _THIS_IP_); new.br_startoff = del_endoff; @@ -5026,7 +5031,7 @@ xfs_bmap_del_extent_cow( got->br_startoff = del_endoff; got->br_blockcount -= del->br_blockcount; got->br_startblock = del->br_startblock + del->br_blockcount; - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + xfs_iext_update_extent(ifp, *idx, got); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); break; case BMAP_RIGHT_CONTIG: @@ -5035,7 +5040,7 @@ xfs_bmap_del_extent_cow( */ trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); got->br_blockcount -= del->br_blockcount; - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + xfs_iext_update_extent(ifp, *idx, got); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); break; case 0: @@ -5044,7 +5049,7 @@ xfs_bmap_del_extent_cow( */ trace_xfs_bmap_pre_update(ip, *idx, state, 
_THIS_IP_); got->br_blockcount = del->br_startoff - got->br_startoff; - xfs_bmbt_set_all(xfs_iext_get_ext(ifp, *idx), got); + xfs_iext_update_extent(ifp, *idx, got); trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); new.br_startoff = del_endoff; @@ -5876,32 +5881,26 @@ xfs_bmse_merge( int whichfork, xfs_fileoff_t shift, /* shift fsb */ int current_ext, /* idx of gotp */ - struct xfs_bmbt_rec_host *gotp, /* extent to shift */ - struct xfs_bmbt_rec_host *leftp, /* preceding extent */ + struct xfs_bmbt_irec *got, /* extent to shift */ + struct xfs_bmbt_irec *left, /* preceding extent */ struct xfs_btree_cur *cur, - int *logflags) /* output */ + int *logflags, /* output */ + struct xfs_defer_ops *dfops) { - struct xfs_bmbt_irec got; - struct xfs_bmbt_irec left; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_bmbt_irec new; xfs_filblks_t blockcount; int error, i; struct xfs_mount *mp = ip->i_mount; - xfs_bmbt_get_all(gotp, &got); - xfs_bmbt_get_all(leftp, &left); - blockcount = left.br_blockcount + got.br_blockcount; + blockcount = left->br_blockcount + got->br_blockcount; ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(xfs_bmse_can_merge(&left, &got, shift)); + ASSERT(xfs_bmse_can_merge(left, got, shift)); - /* - * Merge the in-core extents. Note that the host record pointers and - * current_ext index are invalid once the extent has been removed via - * xfs_iext_remove(). - */ - xfs_bmbt_set_blockcount(leftp, blockcount); - xfs_iext_remove(ip, current_ext, 1, 0); + new = *left; + new.br_blockcount = blockcount; /* * Update the on-disk extent count, the btree if necessary and log the @@ -5912,12 +5911,12 @@ xfs_bmse_merge( *logflags |= XFS_ILOG_CORE; if (!cur) { *logflags |= XFS_ILOG_DEXT; - return 0; + goto done; } /* lookup and remove the extent to merge */ - error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock, - got.br_blockcount, &i); + error = xfs_bmbt_lookup_eq(cur, got->br_startoff, got->br_startblock, + got->br_blockcount, &i); if (error) return error; XFS_WANT_CORRUPTED_RETURN(mp, i == 1); @@ -5928,16 +5927,28 @@ xfs_bmse_merge( XFS_WANT_CORRUPTED_RETURN(mp, i == 1); /* lookup and update size of the previous extent */ - error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock, - left.br_blockcount, &i); + error = xfs_bmbt_lookup_eq(cur, left->br_startoff, left->br_startblock, + left->br_blockcount, &i); if (error) return error; XFS_WANT_CORRUPTED_RETURN(mp, i == 1); - left.br_blockcount = blockcount; + error = xfs_bmbt_update(cur, new.br_startoff, new.br_startblock, + new.br_blockcount, new.br_state); + if (error) + return error; + +done: + xfs_iext_update_extent(ifp, current_ext - 1, &new); + xfs_iext_remove(ip, current_ext, 1, 0); - return xfs_bmbt_update(cur, left.br_startoff, left.br_startblock, - left.br_blockcount, left.br_state); + /* update reverse mapping. 
rmap functions merge the rmaps for us */ + error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, got); + if (error) + return error; + memcpy(&new, got, sizeof(new)); + new.br_startoff = left->br_startoff + left->br_blockcount; + return xfs_rmap_map_extent(mp, dfops, ip, whichfork, &new); } /* @@ -5949,7 +5960,7 @@ xfs_bmse_shift_one( int whichfork, xfs_fileoff_t offset_shift_fsb, int *current_ext, - struct xfs_bmbt_rec_host *gotp, + struct xfs_bmbt_irec *got, struct xfs_btree_cur *cur, int *logflags, enum shift_direction direction, @@ -5958,9 +5969,7 @@ xfs_bmse_shift_one( struct xfs_ifork *ifp; struct xfs_mount *mp; xfs_fileoff_t startoff; - struct xfs_bmbt_rec_host *adj_irecp; - struct xfs_bmbt_irec got; - struct xfs_bmbt_irec adj_irec; + struct xfs_bmbt_irec adj_irec, new; int error; int i; int total_extents; @@ -5969,13 +5978,11 @@ xfs_bmse_shift_one( ifp = XFS_IFORK_PTR(ip, whichfork); total_extents = xfs_iext_count(ifp); - xfs_bmbt_get_all(gotp, &got); - /* delalloc extents should be prevented by caller */ - XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock)); + XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got->br_startblock)); if (direction == SHIFT_LEFT) { - startoff = got.br_startoff - offset_shift_fsb; + startoff = got->br_startoff - offset_shift_fsb; /* * Check for merge if we've got an extent to the left, @@ -5983,46 +5990,39 @@ xfs_bmse_shift_one( * of the file for the shift. */ if (!*current_ext) { - if (got.br_startoff < offset_shift_fsb) + if (got->br_startoff < offset_shift_fsb) return -EINVAL; goto update_current_ext; } + /* - * grab the left extent and check for a large - * enough hole. + * grab the left extent and check for a large enough hole. */ - adj_irecp = xfs_iext_get_ext(ifp, *current_ext - 1); - xfs_bmbt_get_all(adj_irecp, &adj_irec); - - if (startoff < - adj_irec.br_startoff + adj_irec.br_blockcount) + xfs_iext_get_extent(ifp, *current_ext - 1, &adj_irec); + if (startoff < adj_irec.br_startoff + adj_irec.br_blockcount) return -EINVAL; /* check whether to merge the extent or shift it down */ - if (xfs_bmse_can_merge(&adj_irec, &got, - offset_shift_fsb)) { - error = xfs_bmse_merge(ip, whichfork, offset_shift_fsb, - *current_ext, gotp, adj_irecp, - cur, logflags); - if (error) - return error; - adj_irec = got; - goto update_rmap; + if (xfs_bmse_can_merge(&adj_irec, got, offset_shift_fsb)) { + return xfs_bmse_merge(ip, whichfork, offset_shift_fsb, + *current_ext, got, &adj_irec, + cur, logflags, dfops); } } else { - startoff = got.br_startoff + offset_shift_fsb; + startoff = got->br_startoff + offset_shift_fsb; /* nothing to move if this is the last extent */ if (*current_ext >= (total_extents - 1)) goto update_current_ext; + /* * If this is not the last extent in the file, make sure there * is enough room between current extent and next extent for * accommodating the shift. */ - adj_irecp = xfs_iext_get_ext(ifp, *current_ext + 1); - xfs_bmbt_get_all(adj_irecp, &adj_irec); - if (startoff + got.br_blockcount > adj_irec.br_startoff) + xfs_iext_get_extent(ifp, *current_ext + 1, &adj_irec); + if (startoff + got->br_blockcount > adj_irec.br_startoff) return -EINVAL; + /* * Unlike a left shift (which involves a hole punch), * a right shift does not modify extent neighbors @@ -6030,45 +6030,48 @@ xfs_bmse_shift_one( * in this scenario. Check anyways and warn if we * encounter two extents that could be one. 
*/ - if (xfs_bmse_can_merge(&got, &adj_irec, offset_shift_fsb)) + if (xfs_bmse_can_merge(got, &adj_irec, offset_shift_fsb)) WARN_ON_ONCE(1); } + /* * Increment the extent index for the next iteration, update the start * offset of the in-core extent and update the btree if applicable. */ update_current_ext: - if (direction == SHIFT_LEFT) - (*current_ext)++; - else - (*current_ext)--; - xfs_bmbt_set_startoff(gotp, startoff); *logflags |= XFS_ILOG_CORE; - adj_irec = got; - if (!cur) { + + new = *got; + new.br_startoff = startoff; + + if (cur) { + error = xfs_bmbt_lookup_eq(cur, got->br_startoff, + got->br_startblock, got->br_blockcount, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + + error = xfs_bmbt_update(cur, new.br_startoff, + new.br_startblock, new.br_blockcount, + new.br_state); + if (error) + return error; + } else { *logflags |= XFS_ILOG_DEXT; - goto update_rmap; } - error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock, - got.br_blockcount, &i); - if (error) - return error; - XFS_WANT_CORRUPTED_RETURN(mp, i == 1); + xfs_iext_update_extent(ifp, *current_ext, &new); - got.br_startoff = startoff; - error = xfs_bmbt_update(cur, got.br_startoff, got.br_startblock, - got.br_blockcount, got.br_state); - if (error) - return error; + if (direction == SHIFT_LEFT) + (*current_ext)++; + else + (*current_ext)--; -update_rmap: /* update reverse mapping */ - error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, &adj_irec); + error = xfs_rmap_unmap_extent(mp, dfops, ip, whichfork, got); if (error) return error; - adj_irec.br_startoff = startoff; - return xfs_rmap_map_extent(mp, dfops, ip, whichfork, &adj_irec); + return xfs_rmap_map_extent(mp, dfops, ip, whichfork, &new); } /* @@ -6095,7 +6098,6 @@ xfs_bmap_shift_extents( int num_exts) { struct xfs_btree_cur *cur = NULL; - struct xfs_bmbt_rec_host *gotp; struct xfs_bmbt_irec got; struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; @@ -6122,7 +6124,6 @@ xfs_bmap_shift_extents( ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT); - ASSERT(*next_fsb != NULLFSBLOCK || direction == SHIFT_RIGHT); ifp = XFS_IFORK_PTR(ip, whichfork); if (!(ifp->if_flags & XFS_IFEXTENTS)) { @@ -6154,10 +6155,26 @@ xfs_bmap_shift_extents( * In case of first right shift, we need to initialize next_fsb */ if (*next_fsb == NULLFSBLOCK) { - gotp = xfs_iext_get_ext(ifp, total_extents - 1); - xfs_bmbt_get_all(gotp, &got); + ASSERT(direction == SHIFT_RIGHT); + + current_ext = total_extents - 1; + xfs_iext_get_extent(ifp, current_ext, &got); + if (stop_fsb > got.br_startoff) { + *done = 1; + goto del_cursor; + } *next_fsb = got.br_startoff; - if (stop_fsb > *next_fsb) { + } else { + /* + * Look up the extent index for the fsb where we start shifting. We can + * henceforth iterate with current_ext as extent list changes are locked + * out via ilock. + * + * If next_fsb lies in a hole beyond which there are no extents we are + * done. 
+ */ + if (!xfs_iext_lookup_extent(ip, ifp, *next_fsb, ¤t_ext, + &got)) { *done = 1; goto del_cursor; } @@ -6165,37 +6182,26 @@ xfs_bmap_shift_extents( /* Lookup the extent index at which we have to stop */ if (direction == SHIFT_RIGHT) { - gotp = xfs_iext_bno_to_ext(ifp, stop_fsb, &stop_extent); + struct xfs_bmbt_irec s; + + xfs_iext_lookup_extent(ip, ifp, stop_fsb, &stop_extent, &s); /* Make stop_extent exclusive of shift range */ stop_extent--; - } else + if (current_ext <= stop_extent) { + error = -EIO; + goto del_cursor; + } + } else { stop_extent = total_extents; - - /* - * Look up the extent index for the fsb where we start shifting. We can - * henceforth iterate with current_ext as extent list changes are locked - * out via ilock. - * - * gotp can be null in 2 cases: 1) if there are no extents or 2) - * *next_fsb lies in a hole beyond which there are no extents. Either - * way, we are done. - */ - gotp = xfs_iext_bno_to_ext(ifp, *next_fsb, ¤t_ext); - if (!gotp) { - *done = 1; - goto del_cursor; - } - - /* some sanity checking before we finally start shifting extents */ - if ((direction == SHIFT_LEFT && current_ext >= stop_extent) || - (direction == SHIFT_RIGHT && current_ext <= stop_extent)) { - error = -EIO; - goto del_cursor; + if (current_ext >= stop_extent) { + error = -EIO; + goto del_cursor; + } } while (nexts++ < num_exts) { error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb, - ¤t_ext, gotp, cur, &logflags, + ¤t_ext, &got, cur, &logflags, direction, dfops); if (error) goto del_cursor; @@ -6213,13 +6219,11 @@ xfs_bmap_shift_extents( *next_fsb = NULLFSBLOCK; break; } - gotp = xfs_iext_get_ext(ifp, current_ext); + xfs_iext_get_extent(ifp, current_ext, &got); } - if (!*done) { - xfs_bmbt_get_all(gotp, &got); + if (!*done) *next_fsb = got.br_startoff; - } del_cursor: if (cur) @@ -6248,7 +6252,6 @@ xfs_bmap_split_extent_at( { int whichfork = XFS_DATA_FORK; struct xfs_btree_cur *cur = NULL; - struct xfs_bmbt_rec_host *gotp; struct xfs_bmbt_irec got; struct xfs_bmbt_irec new; /* split extent */ struct xfs_mount *mp = ip->i_mount; @@ -6280,21 +6283,10 @@ xfs_bmap_split_extent_at( } /* - * gotp can be null in 2 cases: 1) if there are no extents - * or 2) split_fsb lies in a hole beyond which there are - * no extents. Either way, we are done. - */ - gotp = xfs_iext_bno_to_ext(ifp, split_fsb, ¤t_ext); - if (!gotp) - return 0; - - xfs_bmbt_get_all(gotp, &got); - - /* - * Check split_fsb lies in a hole or the start boundary offset - * of the extent. + * If there are not extents, or split_fsb lies in a hole we are done. 
*/ - if (got.br_startoff >= split_fsb) + if (!xfs_iext_lookup_extent(ip, ifp, split_fsb, ¤t_ext, &got) || + got.br_startoff >= split_fsb) return 0; gotblkcnt = split_fsb - got.br_startoff; @@ -6317,8 +6309,8 @@ xfs_bmap_split_extent_at( XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor); } - xfs_bmbt_set_blockcount(gotp, gotblkcnt); got.br_blockcount = gotblkcnt; + xfs_iext_update_extent(ifp, current_ext, &got); logflags = XFS_ILOG_CORE; if (cur) { @@ -6402,7 +6394,7 @@ xfs_bmap_split_extent( if (error) goto out; - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out; @@ -6452,7 +6444,7 @@ __xfs_bmap_add( bi->bi_whichfork = whichfork; bi->bi_bmap = *bmap; - error = xfs_defer_join(dfops, bi->bi_owner); + error = xfs_defer_ijoin(dfops, bi->bi_owner); if (error) { kmem_free(bi); return error; diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 85de22513014..a6331ffa51e3 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -858,6 +858,7 @@ xfs_bmbt_change_owner( cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork); if (!cur) return -ENOMEM; + cur->bc_private.b.flags |= XFS_BTCUR_BPRV_INVALID_OWNER; error = xfs_btree_change_owner(cur, new_owner, buffer_list); xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index e0bcc4a59efd..5bfb88261c7e 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1791,6 +1791,7 @@ xfs_btree_lookup_get_block( /* Check the inode owner since the verifiers don't. */ if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) && + !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_INVALID_OWNER) && (cur->bc_flags & XFS_BTREE_LONG_PTRS) && be64_to_cpu((*blkp)->bb_u.l.bb_owner) != cur->bc_private.b.ip->i_ino) @@ -4451,10 +4452,15 @@ xfs_btree_block_change_owner( /* modify the owner */ block = xfs_btree_get_block(cur, level, &bp); - if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { + if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner)) + return 0; block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner); - else + } else { + if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner)) + return 0; block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner); + } /* * If the block is a root block hosted in an inode, we might not have a @@ -4463,16 +4469,19 @@ xfs_btree_block_change_owner( * block is formatted into the on-disk inode fork. We still change it, * though, so everything is consistent in memory. 
*/ - if (bp) { - if (cur->bc_tp) { - xfs_trans_ordered_buf(cur->bc_tp, bp); + if (!bp) { + ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); + ASSERT(level == cur->bc_nlevels - 1); + return 0; + } + + if (cur->bc_tp) { + if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) { xfs_btree_log_block(cur, bp, XFS_BB_OWNER); - } else { - xfs_buf_delwri_queue(bp, bbcoi->buffer_list); + return -EAGAIN; } } else { - ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE); - ASSERT(level == cur->bc_nlevels - 1); + xfs_buf_delwri_queue(bp, bbcoi->buffer_list); } return 0; diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 9c95e965cfe5..f2a88c3b1159 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -233,7 +233,8 @@ typedef struct xfs_btree_cur short forksize; /* fork's inode space */ char whichfork; /* data or attr fork */ char flags; /* flags */ -#define XFS_BTCUR_BPRV_WASDEL 1 /* was delayed */ +#define XFS_BTCUR_BPRV_WASDEL (1<<0) /* was delayed */ +#define XFS_BTCUR_BPRV_INVALID_OWNER (1<<1) /* for ext swap */ } b; } bc_private; /* per-btree type data */ } xfs_btree_cur_t; diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 5c2929f94bd3..072ebfe1d6ae 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -240,23 +240,19 @@ xfs_defer_trans_abort( STATIC int xfs_defer_trans_roll( struct xfs_trans **tp, - struct xfs_defer_ops *dop, - struct xfs_inode *ip) + struct xfs_defer_ops *dop) { int i; int error; - /* Log all the joined inodes except the one we passed in. */ - for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) { - if (dop->dop_inodes[i] == ip) - continue; + /* Log all the joined inodes. */ + for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) xfs_trans_log_inode(*tp, dop->dop_inodes[i], XFS_ILOG_CORE); - } trace_xfs_defer_trans_roll((*tp)->t_mountp, dop); /* Roll the transaction. */ - error = xfs_trans_roll(tp, ip); + error = xfs_trans_roll(tp); if (error) { trace_xfs_defer_trans_roll_error((*tp)->t_mountp, dop, error); xfs_defer_trans_abort(*tp, dop, error); @@ -264,12 +260,9 @@ xfs_defer_trans_roll( } dop->dop_committed = true; - /* Rejoin the joined inodes except the one we passed in. */ - for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) { - if (dop->dop_inodes[i] == ip) - continue; + /* Rejoin the joined inodes. */ + for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) xfs_trans_ijoin(*tp, dop->dop_inodes[i], 0); - } return error; } @@ -284,11 +277,10 @@ xfs_defer_has_unfinished_work( /* * Add this inode to the deferred op. Each joined inode is relogged - * each time we roll the transaction, in addition to any inode passed - * to xfs_defer_finish(). + * each time we roll the transaction. */ int -xfs_defer_join( +xfs_defer_ijoin( struct xfs_defer_ops *dop, struct xfs_inode *ip) { @@ -317,8 +309,7 @@ xfs_defer_join( int xfs_defer_finish( struct xfs_trans **tp, - struct xfs_defer_ops *dop, - struct xfs_inode *ip) + struct xfs_defer_ops *dop) { struct xfs_defer_pending *dfp; struct list_head *li; @@ -337,7 +328,7 @@ xfs_defer_finish( xfs_defer_intake_work(*tp, dop); /* Roll the transaction. 
*/ - error = xfs_defer_trans_roll(tp, dop, ip); + error = xfs_defer_trans_roll(tp, dop); if (error) goto out; diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index f6e93ef0bffe..d4f046dd44bd 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -72,12 +72,11 @@ struct xfs_defer_ops { void xfs_defer_add(struct xfs_defer_ops *dop, enum xfs_defer_ops_type type, struct list_head *h); -int xfs_defer_finish(struct xfs_trans **tp, struct xfs_defer_ops *dop, - struct xfs_inode *ip); +int xfs_defer_finish(struct xfs_trans **tp, struct xfs_defer_ops *dop); void xfs_defer_cancel(struct xfs_defer_ops *dop); void xfs_defer_init(struct xfs_defer_ops *dop, xfs_fsblock_t *fbp); bool xfs_defer_has_unfinished_work(struct xfs_defer_ops *dop); -int xfs_defer_join(struct xfs_defer_ops *dop, struct xfs_inode *ip); +int xfs_defer_ijoin(struct xfs_defer_ops *dop, struct xfs_inode *ip); /* Description of a deferred type. */ struct xfs_defer_op_type { diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index abf5beaae907..988bb3f31446 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -378,8 +378,6 @@ xfs_ialloc_inode_init( * transaction and pin the log appropriately. */ xfs_trans_ordered_buf(tp, fbuf); - xfs_trans_log_buf(tp, fbuf, 0, - BBTOB(fbuf->b_length) - 1); } } else { fbuf->b_flags |= XBF_DONE; @@ -1133,6 +1131,7 @@ xfs_dialloc_ag_inobt( int error; int offset; int i, j; + int searchdistance = 10; pag = xfs_perag_get(mp, agno); @@ -1159,7 +1158,6 @@ xfs_dialloc_ag_inobt( if (pagno == agno) { int doneleft; /* done, to the left */ int doneright; /* done, to the right */ - int searchdistance = 10; error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i); if (error) @@ -1220,21 +1218,9 @@ xfs_dialloc_ag_inobt( /* * Loop until we find an inode chunk with a free inode. */ - while (!doneleft || !doneright) { + while (--searchdistance > 0 && (!doneleft || !doneright)) { int useleft; /* using left inode chunk this time */ - if (!--searchdistance) { - /* - * Not in range - save last search - * location and allocate a new inode - */ - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - pag->pagl_leftrec = trec.ir_startino; - pag->pagl_rightrec = rec.ir_startino; - pag->pagl_pagino = pagino; - goto newino; - } - /* figure out the closer block if both are valid. */ if (!doneleft && !doneright) { useleft = pagino - @@ -1278,26 +1264,37 @@ xfs_dialloc_ag_inobt( goto error1; } - /* - * We've reached the end of the btree. because - * we are only searching a small chunk of the - * btree each search, there is obviously free - * inodes closer to the parent inode than we - * are now. restart the search again. - */ - pag->pagl_pagino = NULLAGINO; - pag->pagl_leftrec = NULLAGINO; - pag->pagl_rightrec = NULLAGINO; - xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); - xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); - goto restart_pagno; + if (searchdistance <= 0) { + /* + * Not in range - save last search + * location and allocate a new inode + */ + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + pag->pagl_leftrec = trec.ir_startino; + pag->pagl_rightrec = rec.ir_startino; + pag->pagl_pagino = pagino; + + } else { + /* + * We've reached the end of the btree. because + * we are only searching a small chunk of the + * btree each search, there is obviously free + * inodes closer to the parent inode than we + * are now. restart the search again. 
+ */ + pag->pagl_pagino = NULLAGINO; + pag->pagl_leftrec = NULLAGINO; + pag->pagl_rightrec = NULLAGINO; + xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + goto restart_pagno; + } } /* * In a different AG from the parent. * See if the most recently allocated block has any free. */ -newino: if (agi->agi_newino != cpu_to_be32(NULLAGINO)) { error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino), XFS_LOOKUP_EQ, &i); diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 0e80f34fe97c..31840ca24018 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -1499,14 +1499,11 @@ xfs_iext_realloc_indirect( xfs_ifork_t *ifp, /* inode fork pointer */ int new_size) /* new indirection array size */ { - int nlists; /* number of irec's (ex lists) */ - int size; /* current indirection array size */ - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - size = nlists * sizeof(xfs_ext_irec_t); ASSERT(ifp->if_real_bytes); - ASSERT((new_size >= 0) && (new_size != size)); + ASSERT((new_size >= 0) && + (new_size != ((ifp->if_real_bytes / XFS_IEXT_BUFSZ) * + sizeof(xfs_ext_irec_t)))); if (new_size == 0) { xfs_iext_destroy(ifp); } else { @@ -2023,3 +2020,15 @@ xfs_iext_get_extent( xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), gotp); return true; } + +void +xfs_iext_update_extent( + struct xfs_ifork *ifp, + xfs_extnum_t idx, + struct xfs_bmbt_irec *gotp) +{ + ASSERT(idx >= 0); + ASSERT(idx < xfs_iext_count(ifp)); + + xfs_bmbt_set_all(xfs_iext_get_ext(ifp, idx), gotp); +} diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index 7fb8365326d1..11af705219f6 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -187,6 +187,8 @@ bool xfs_iext_lookup_extent(struct xfs_inode *ip, xfs_extnum_t *idxp, struct xfs_bmbt_irec *gotp); bool xfs_iext_get_extent(struct xfs_ifork *ifp, xfs_extnum_t idx, struct xfs_bmbt_irec *gotp); +void xfs_iext_update_extent(struct xfs_ifork *ifp, xfs_extnum_t idx, + struct xfs_bmbt_irec *gotp); extern struct kmem_zone *xfs_ifork_zone; diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 45b1c3b4e047..9d5406b4f663 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -1679,7 +1679,7 @@ xfs_refcount_recover_cow_leftovers( xfs_bmap_add_free(mp, &dfops, fsb, rr->rr_rrec.rc_blockcount, NULL); - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_defer; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 6bf120bb1a17..29172609f2a3 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -80,16 +80,29 @@ xfs_find_bdev_for_inode( return mp->m_ddev_targp->bt_bdev; } +struct dax_device * +xfs_find_daxdev_for_inode( + struct inode *inode) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + + if (XFS_IS_REALTIME_INODE(ip)) + return mp->m_rtdev_targp->bt_daxdev; + else + return mp->m_ddev_targp->bt_daxdev; +} + /* * We're now finished for good with this page. Update the page state via the * associated buffer_heads, paying attention to the start and end offsets that * we need to process on the page. * - * Landmine Warning: bh->b_end_io() will call end_page_writeback() on the last - * buffer in the IO. 
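xfs_iext_update_extent(), added above, pairs with xfs_iext_get_extent() so callers manipulate a decoded struct xfs_bmbt_irec rather than the raw xfs_bmbt_rec_host accessors being removed from xfs_bmap.c. The read-modify-write sequence those hunks now follow, reduced to a stripped-down sketch (the helper name and the "trim" operation are illustrative only):

/*
 * Read extent 'idx' into an incore irec, adjust it, and write it back.
 * Replaces the old xfs_iext_get_ext()/xfs_bmbt_set_*() pairs.
 */
static void
example_trim_extent(
	struct xfs_ifork	*ifp,
	xfs_extnum_t		idx,
	xfs_filblks_t		trim)
{
	struct xfs_bmbt_irec	got;

	if (!xfs_iext_get_extent(ifp, idx, &got))
		return;				/* no such extent */

	got.br_blockcount -= trim;
	xfs_iext_update_extent(ifp, idx, &got);
}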
Once it does this, it is unsafe to access the bufferhead or - * the page at all, as we may be racing with memory reclaim and it can free both - * the bufferhead chain and the page as it will see the page as clean and - * unused. + * Note that we open code the action in end_buffer_async_write here so that we + * only have to iterate over the buffers attached to the page once. This is not + * only more efficient, but also ensures that we only calls end_page_writeback + * at the end of the iteration, and thus avoids the pitfall of having the page + * and buffers potentially freed after every call to end_buffer_async_write. */ static void xfs_finish_page_writeback( @@ -97,29 +110,44 @@ xfs_finish_page_writeback( struct bio_vec *bvec, int error) { - unsigned int end = bvec->bv_offset + bvec->bv_len - 1; - struct buffer_head *head, *bh, *next; + struct buffer_head *head = page_buffers(bvec->bv_page), *bh = head; + bool busy = false; unsigned int off = 0; - unsigned int bsize; + unsigned long flags; ASSERT(bvec->bv_offset < PAGE_SIZE); ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0); - ASSERT(end < PAGE_SIZE); + ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE); ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0); - bh = head = page_buffers(bvec->bv_page); - - bsize = bh->b_size; + local_irq_save(flags); + bit_spin_lock(BH_Uptodate_Lock, &head->b_state); do { - if (off > end) - break; - next = bh->b_this_page; - if (off < bvec->bv_offset) - goto next_bh; - bh->b_end_io(bh, !error); -next_bh: - off += bsize; - } while ((bh = next) != head); + if (off >= bvec->bv_offset && + off < bvec->bv_offset + bvec->bv_len) { + ASSERT(buffer_async_write(bh)); + ASSERT(bh->b_end_io == NULL); + + if (error) { + mark_buffer_write_io_error(bh); + clear_buffer_uptodate(bh); + SetPageError(bvec->bv_page); + } else { + set_buffer_uptodate(bh); + } + clear_buffer_async_write(bh); + unlock_buffer(bh); + } else if (buffer_async_write(bh)) { + ASSERT(buffer_locked(bh)); + busy = true; + } + off += bh->b_size; + } while ((bh = bh->b_this_page) != head); + bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); + local_irq_restore(flags); + + if (!busy) + end_page_writeback(bvec->bv_page); } /* @@ -133,8 +161,10 @@ xfs_destroy_ioend( int error) { struct inode *inode = ioend->io_inode; - struct bio *last = ioend->io_bio; - struct bio *bio, *next; + struct bio *bio = &ioend->io_inline_bio; + struct bio *last = ioend->io_bio, *next; + u64 start = bio->bi_iter.bi_sector; + bool quiet = bio_flagged(bio, BIO_QUIET); for (bio = &ioend->io_inline_bio; bio; bio = next) { struct bio_vec *bvec; @@ -155,6 +185,11 @@ xfs_destroy_ioend( bio_put(bio); } + + if (unlikely(error && !quiet)) { + xfs_err_ratelimited(XFS_I(inode)->i_mount, + "writeback error on sector %llu", start); + } } /* @@ -423,7 +458,8 @@ xfs_start_buffer_writeback( ASSERT(!buffer_delay(bh)); ASSERT(!buffer_unwritten(bh)); - mark_buffer_async_write(bh); + bh->b_end_io = NULL; + set_buffer_async_write(bh); set_buffer_uptodate(bh); clear_buffer_dirty(bh); } @@ -517,7 +553,7 @@ xfs_init_bio_from_bh( struct buffer_head *bh) { bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); - bio->bi_bdev = bh->b_bdev; + bio_set_dev(bio, bh->b_bdev); } static struct xfs_ioend * diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index cc174ec6c2fd..88c85ea63da0 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -59,5 +59,6 @@ int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); extern void xfs_count_page_state(struct page *, int *, int *); 
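The xfs_init_bio_from_bh() change above (and the matching xfs_buf.c hunk further down) swaps the direct bi_bdev assignment for bio_set_dev(), which derives the bio's disk/partition fields from the block device in one place. The conversion is mechanical; a hypothetical submitter is shown here only to make the old-vs-new pattern explicit:

#include <linux/bio.h>
#include <linux/buffer_head.h>

static void example_init_bio(struct bio *bio, struct buffer_head *bh)
{
	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	bio_set_dev(bio, bh->b_bdev);	/* was: bio->bi_bdev = bh->b_bdev */
}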
extern struct block_device *xfs_find_bdev_for_inode(struct inode *); +extern struct dax_device *xfs_find_daxdev_for_inode(struct inode *); #endif /* __XFS_AOPS_H__ */ diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index be0b79d8900f..ebd66b19fbfc 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -97,7 +97,7 @@ xfs_attr3_leaf_freextent( /* * Roll to next transaction. */ - error = xfs_trans_roll(trans, dp); + error = xfs_trans_roll_inode(trans, dp); if (error) return error; } @@ -308,7 +308,7 @@ xfs_attr3_node_inactive( /* * Atomically commit the whole invalidate stuff. */ - error = xfs_trans_roll(trans, dp); + error = xfs_trans_roll_inode(trans, dp); if (error) return error; } @@ -375,7 +375,7 @@ xfs_attr3_root_inactive( /* * Commit the invalidate and start the next transaction. */ - error = xfs_trans_roll(trans, dp); + error = xfs_trans_roll_inode(trans, dp); return error; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 88073910fa5d..dd136f7275e4 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -502,7 +502,7 @@ xfs_bui_recover( } /* Finish transaction, free inodes. */ - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto err_dfops; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 93e955262d07..cd9a5400ba4f 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -222,22 +222,21 @@ xfs_bmap_eof( * Count leaf blocks given a range of extent records. Delayed allocation * extents are not counted towards the totals. */ -STATIC void +xfs_extnum_t xfs_bmap_count_leaves( struct xfs_ifork *ifp, - xfs_extnum_t *numrecs, xfs_filblks_t *count) { - xfs_extnum_t i; - xfs_extnum_t nr_exts = xfs_iext_count(ifp); - - for (i = 0; i < nr_exts; i++) { - xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, i); - if (!isnullstartblock(xfs_bmbt_get_startblock(frp))) { - (*numrecs)++; - *count += xfs_bmbt_get_blockcount(frp); + struct xfs_bmbt_irec got; + xfs_extnum_t numrecs = 0, i = 0; + + while (xfs_iext_get_extent(ifp, i++, &got)) { + if (!isnullstartblock(got.br_startblock)) { + *count += got.br_blockcount; + numrecs++; } } + return numrecs; } /* @@ -370,7 +369,7 @@ xfs_bmap_count_blocks( switch (XFS_IFORK_FORMAT(ip, whichfork)) { case XFS_DINODE_FMT_EXTENTS: - xfs_bmap_count_leaves(ifp, nextents, count); + *nextents = xfs_bmap_count_leaves(ifp, count); return 0; case XFS_DINODE_FMT_BTREE: if (!(ifp->if_flags & XFS_IFEXTENTS)) { @@ -1136,7 +1135,7 @@ xfs_alloc_file_space( /* * Complete the transaction */ - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto error0; @@ -1202,7 +1201,8 @@ xfs_unmap_extent( if (error) goto out_bmap_cancel; - error = xfs_defer_finish(&tp, &dfops, ip); + xfs_defer_ijoin(&dfops, ip); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; @@ -1496,7 +1496,7 @@ xfs_shift_file_space( if (error) goto out_bmap_cancel; - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; @@ -1777,7 +1777,8 @@ xfs_swap_extent_rmap( if (error) goto out_defer; - error = xfs_defer_finish(tpp, &dfops, ip); + xfs_defer_ijoin(&dfops, ip); + error = xfs_defer_finish(tpp, &dfops); if (error) goto out_defer; @@ -1840,29 +1841,18 @@ xfs_swap_extent_forks( } /* - * Before we've swapped the forks, lets set the owners of the forks - * appropriately. 
We have to do this as we are demand paging the btree - * buffers, and so the validation done on read will expect the owner - * field to be correctly set. Once we change the owners, we can swap the - * inode forks. + * Btree format (v3) inodes have the inode number stamped in the bmbt + * block headers. We can't start changing the bmbt blocks until the + * inode owner change is logged so recovery does the right thing in the + * event of a crash. Set the owner change log flags now and leave the + * bmbt scan as the last step. */ if (ip->i_d.di_version == 3 && - ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + ip->i_d.di_format == XFS_DINODE_FMT_BTREE) (*target_log_flags) |= XFS_ILOG_DOWNER; - error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, - tip->i_ino, NULL); - if (error) - return error; - } - if (tip->i_d.di_version == 3 && - tip->i_d.di_format == XFS_DINODE_FMT_BTREE) { + tip->i_d.di_format == XFS_DINODE_FMT_BTREE) (*src_log_flags) |= XFS_ILOG_DOWNER; - error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK, - ip->i_ino, NULL); - if (error) - return error; - } /* * Swap the data forks of the inodes @@ -1940,6 +1930,48 @@ xfs_swap_extent_forks( return 0; } +/* + * Fix up the owners of the bmbt blocks to refer to the current inode. The + * change owner scan attempts to order all modified buffers in the current + * transaction. In the event of ordered buffer failure, the offending buffer is + * physically logged as a fallback and the scan returns -EAGAIN. We must roll + * the transaction in this case to replenish the fallback log reservation and + * restart the scan. This process repeats until the scan completes. + */ +static int +xfs_swap_change_owner( + struct xfs_trans **tpp, + struct xfs_inode *ip, + struct xfs_inode *tmpip) +{ + int error; + struct xfs_trans *tp = *tpp; + + do { + error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK, ip->i_ino, + NULL); + /* success or fatal error */ + if (error != -EAGAIN) + break; + + error = xfs_trans_roll(tpp); + if (error) + break; + tp = *tpp; + + /* + * Redirty both inodes so they can relog and keep the log tail + * moving forward. + */ + xfs_trans_ijoin(tp, ip, 0); + xfs_trans_ijoin(tp, tmpip, 0); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + xfs_trans_log_inode(tp, tmpip, XFS_ILOG_CORE); + } while (true); + + return error; +} + int xfs_swap_extents( struct xfs_inode *ip, /* target inode */ @@ -1954,7 +1986,7 @@ xfs_swap_extents( int lock_flags; struct xfs_ifork *cowfp; uint64_t f; - int resblks; + int resblks = 0; /* * Lock the inodes against other IO, page faults and truncate to @@ -2002,11 +2034,8 @@ xfs_swap_extents( XFS_SWAP_RMAP_SPACE_RES(mp, XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK), XFS_DATA_FORK); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, - 0, 0, &tp); - } else - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, - 0, 0, &tp); + } + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp); if (error) goto out_unlock; @@ -2092,6 +2121,23 @@ xfs_swap_extents( xfs_trans_log_inode(tp, tip, target_log_flags); /* + * The extent forks have been swapped, but crc=1,rmapbt=0 filesystems + * have inode number owner values in the bmbt blocks that still refer to + * the old inode. Scan each bmbt to fix up the owner values with the + * inode number of the current inode. 
+ */ + if (src_log_flags & XFS_ILOG_DOWNER) { + error = xfs_swap_change_owner(&tp, ip, tip); + if (error) + goto out_trans_cancel; + } + if (target_log_flags & XFS_ILOG_DOWNER) { + error = xfs_swap_change_owner(&tp, tip, ip); + if (error) + goto out_trans_cancel; + } + + /* * If this is a synchronous mount, make sure that the * transaction goes to disk before returning to the user. */ diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 0cede1043571..0eaa81dc49be 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -70,6 +70,7 @@ int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip, xfs_daddr_t xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb); +xfs_extnum_t xfs_bmap_count_leaves(struct xfs_ifork *ifp, xfs_filblks_t *count); int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, xfs_extnum_t *nextents, xfs_filblks_t *count); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 72f038492ba8..da14658da310 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1281,7 +1281,7 @@ next_chunk: nr_pages = min(total_nr_pages, BIO_MAX_PAGES); bio = bio_alloc(GFP_NOIO, nr_pages); - bio->bi_bdev = bp->b_target->bt_bdev; + bio_set_dev(bio, bp->b_target->bt_bdev); bio->bi_iter.bi_sector = sector; bio->bi_end_io = xfs_buf_bio_end_io; bio->bi_private = bp; @@ -1802,7 +1802,8 @@ xfs_setsize_buftarg_early( xfs_buftarg_t * xfs_alloc_buftarg( struct xfs_mount *mp, - struct block_device *bdev) + struct block_device *bdev, + struct dax_device *dax_dev) { xfs_buftarg_t *btp; @@ -1811,6 +1812,7 @@ xfs_alloc_buftarg( btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; btp->bt_bdev = bdev; + btp->bt_daxdev = dax_dev; if (xfs_setsize_buftarg_early(btp, bdev)) goto error; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 20721261dae5..bf71507ddb16 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -108,6 +108,7 @@ typedef unsigned int xfs_buf_flags_t; typedef struct xfs_buftarg { dev_t bt_dev; struct block_device *bt_bdev; + struct dax_device *bt_daxdev; struct xfs_mount *bt_mount; unsigned int bt_meta_sectorsize; size_t bt_meta_sectormask; @@ -385,7 +386,7 @@ xfs_buf_update_cksum(struct xfs_buf *bp, unsigned long cksum_offset) * Handling of buftargs. */ extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *, - struct block_device *); + struct block_device *, struct dax_device *); extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index f6a8422e9562..e0a0af0946f2 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -29,6 +29,7 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_log.h" +#include "xfs_inode.h" kmem_zone_t *xfs_buf_item_zone; @@ -322,6 +323,8 @@ xfs_buf_item_format( ASSERT((bip->bli_flags & XFS_BLI_STALE) || (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF)); + ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED) || + (bip->bli_flags & XFS_BLI_STALE)); /* @@ -346,16 +349,6 @@ xfs_buf_item_format( bip->bli_flags &= ~XFS_BLI_INODE_BUF; } - if ((bip->bli_flags & (XFS_BLI_ORDERED|XFS_BLI_STALE)) == - XFS_BLI_ORDERED) { - /* - * The buffer has been logged just to order it. It is not being - * included in the transaction commit, so don't format it. 
- */ - trace_xfs_buf_item_format_ordered(bip); - return; - } - for (i = 0; i < bip->bli_format_count; i++) { xfs_buf_item_format_segment(bip, lv, &vecp, offset, &bip->bli_formats[i]); @@ -574,26 +567,20 @@ xfs_buf_item_unlock( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - bool clean; - bool aborted; - int flags; + bool aborted = !!(lip->li_flags & XFS_LI_ABORTED); + bool hold = !!(bip->bli_flags & XFS_BLI_HOLD); + bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY); +#if defined(DEBUG) || defined(XFS_WARN) + bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED); +#endif /* Clear the buffer's association with this transaction. */ bp->b_transp = NULL; /* - * If this is a transaction abort, don't return early. Instead, allow - * the brelse to happen. Normally it would be done for stale - * (cancelled) buffers at unpin time, but we'll never go through the - * pin/unpin cycle if we abort inside commit. + * The per-transaction state has been copied above so clear it from the + * bli. */ - aborted = (lip->li_flags & XFS_LI_ABORTED) ? true : false; - /* - * Before possibly freeing the buf item, copy the per-transaction state - * so we can reference it safely later after clearing it from the - * buffer log item. - */ - flags = bip->bli_flags; bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED); /* @@ -601,7 +588,7 @@ xfs_buf_item_unlock( * unlock the buffer and free the buf item when the buffer is unpinned * for the last time. */ - if (flags & XFS_BLI_STALE) { + if (bip->bli_flags & XFS_BLI_STALE) { trace_xfs_buf_item_unlock_stale(bip); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); if (!aborted) { @@ -619,20 +606,11 @@ xfs_buf_item_unlock( * regardless of whether it is dirty or not. A dirty abort implies a * shutdown, anyway. * - * Ordered buffers are dirty but may have no recorded changes, so ensure - * we only release clean items here. + * The bli dirty state should match whether the blf has logged segments + * except for ordered buffers, where only the bli should be dirty. */ - clean = (flags & XFS_BLI_DIRTY) ? false : true; - if (clean) { - int i; - for (i = 0; i < bip->bli_format_count; i++) { - if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, - bip->bli_formats[i].blf_map_size)) { - clean = false; - break; - } - } - } + ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) || + (ordered && dirty && !xfs_buf_item_dirty_format(bip))); /* * Clean buffers, by definition, cannot be in the AIL. However, aborted @@ -651,11 +629,11 @@ xfs_buf_item_unlock( ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); - } else if (clean) + } else if (!dirty) xfs_buf_item_relse(bp); } - if (!(flags & XFS_BLI_HOLD)) + if (!hold) xfs_buf_relse(bp); } @@ -945,14 +923,22 @@ xfs_buf_item_log( /* - * Return 1 if the buffer has been logged or ordered in a transaction (at any - * point, not just the current transaction) and 0 if not. + * Return true if the buffer has any ranges logged/dirtied by a transaction, + * false otherwise. 
*/ -uint -xfs_buf_item_dirty( - xfs_buf_log_item_t *bip) +bool +xfs_buf_item_dirty_format( + struct xfs_buf_log_item *bip) { - return (bip->bli_flags & XFS_BLI_DIRTY); + int i; + + for (i = 0; i < bip->bli_format_count; i++) { + if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, + bip->bli_formats[i].blf_map_size)) + return true; + } + + return false; } STATIC void @@ -1054,6 +1040,31 @@ xfs_buf_do_callbacks( } } +/* + * Invoke the error state callback for each log item affected by the failed I/O. + * + * If a metadata buffer write fails with a non-permanent error, the buffer is + * eventually resubmitted and so the completion callbacks are not run. The error + * state may need to be propagated to the log items attached to the buffer, + * however, so the next AIL push of the item knows hot to handle it correctly. + */ +STATIC void +xfs_buf_do_callbacks_fail( + struct xfs_buf *bp) +{ + struct xfs_log_item *next; + struct xfs_log_item *lip = bp->b_fspriv; + struct xfs_ail *ailp = lip->li_ailp; + + spin_lock(&ailp->xa_lock); + for (; lip; lip = next) { + next = lip->li_bio_list; + if (lip->li_ops->iop_error) + lip->li_ops->iop_error(lip, bp); + } + spin_unlock(&ailp->xa_lock); +} + static bool xfs_buf_iodone_callback_error( struct xfs_buf *bp) @@ -1123,7 +1134,11 @@ xfs_buf_iodone_callback_error( if ((mp->m_flags & XFS_MOUNT_UNMOUNTING) && mp->m_fail_unmount) goto permanent_error; - /* still a transient error, higher layers will retry */ + /* + * Still a transient error, run IO completion failure callbacks and let + * the higher layers retry the buffer. + */ + xfs_buf_do_callbacks_fail(bp); xfs_buf_ioerror(bp, 0); xfs_buf_relse(bp); return true; @@ -1204,3 +1219,31 @@ xfs_buf_iodone( xfs_trans_ail_delete(ailp, lip, SHUTDOWN_CORRUPT_INCORE); xfs_buf_item_free(BUF_ITEM(lip)); } + +/* + * Requeue a failed buffer for writeback + * + * Return true if the buffer has been re-queued properly, false otherwise + */ +bool +xfs_buf_resubmit_failed_buffers( + struct xfs_buf *bp, + struct xfs_log_item *lip, + struct list_head *buffer_list) +{ + struct xfs_log_item *next; + + /* + * Clear XFS_LI_FAILED flag from all items before resubmit + * + * XFS_LI_FAILED set/clear is protected by xa_lock, caller this + * function already have it acquired + */ + for (; lip; lip = next) { + next = lip->li_bio_list; + xfs_clear_li_failed(lip); + } + + /* Add this buffer back to the delayed write list */ + return xfs_buf_delwri_queue(bp, buffer_list); +} diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index f7eba99d19dd..9690ce62c9a7 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -64,12 +64,15 @@ typedef struct xfs_buf_log_item { int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); void xfs_buf_item_relse(struct xfs_buf *); void xfs_buf_item_log(xfs_buf_log_item_t *, uint, uint); -uint xfs_buf_item_dirty(xfs_buf_log_item_t *); +bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *); void xfs_buf_attach_iodone(struct xfs_buf *, void(*)(struct xfs_buf *, xfs_log_item_t *), xfs_log_item_t *); void xfs_buf_iodone_callbacks(struct xfs_buf *); void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); +bool xfs_buf_resubmit_failed_buffers(struct xfs_buf *, + struct xfs_log_item *, + struct list_head *); extern kmem_zone_t *xfs_buf_item_zone; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index fd2ef8c2c9a7..cd82429d8df7 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -383,7 +383,7 @@ xfs_qm_dqalloc( xfs_trans_bhold(tp, bp); - error = xfs_defer_finish(tpp, &dfops, 
NULL); + error = xfs_defer_finish(tpp, &dfops); if (error) goto error1; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 2f4feb959bfb..bd786a9ac2c3 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -57,6 +57,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_AG_RESV_CRITICAL, XFS_RANDOM_DROP_WRITES, XFS_RANDOM_LOG_BAD_CRC, + XFS_RANDOM_LOG_ITEM_PIN, }; struct xfs_errortag_attr { @@ -161,6 +162,7 @@ XFS_ERRORTAG_ATTR_RW(bmap_finish_one, XFS_ERRTAG_BMAP_FINISH_ONE); XFS_ERRORTAG_ATTR_RW(ag_resv_critical, XFS_ERRTAG_AG_RESV_CRITICAL); XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES); XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC); +XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -193,6 +195,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(ag_resv_critical), XFS_ERRORTAG_ATTR_LIST(drop_writes), XFS_ERRORTAG_ATTR_LIST(log_bad_crc), + XFS_ERRORTAG_ATTR_LIST(log_item_pin), NULL, }; diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 7577be5f09bc..7c4bef3bddb7 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -106,7 +106,8 @@ extern void xfs_verifier_error(struct xfs_buf *bp); */ #define XFS_ERRTAG_DROP_WRITES 28 #define XFS_ERRTAG_LOG_BAD_CRC 29 -#define XFS_ERRTAG_MAX 30 +#define XFS_ERRTAG_LOG_ITEM_PIN 30 +#define XFS_ERRTAG_MAX 31 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -141,6 +142,7 @@ extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_RANDOM_AG_RESV_CRITICAL 4 #define XFS_RANDOM_DROP_WRITES 1 #define XFS_RANDOM_LOG_BAD_CRC 1 +#define XFS_RANDOM_LOG_ITEM_PIN 1 #ifdef DEBUG extern int xfs_errortag_init(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index c4893e226fd8..ec3e44fcf771 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1011,96 +1011,67 @@ xfs_file_llseek( * page_lock (MM) * i_lock (XFS - extent map serialisation) */ - -/* - * mmap()d file has taken write protection fault and is being made writable. We - * can set the page state up correctly for a writable page, which means we can - * do correct delalloc accounting (ENOSPC checking!) and unwritten extent - * mapping. 
- */ -STATIC int -xfs_filemap_page_mkwrite( - struct vm_fault *vmf) +static int +__xfs_filemap_fault( + struct vm_fault *vmf, + enum page_entry_size pe_size, + bool write_fault) { struct inode *inode = file_inode(vmf->vma->vm_file); + struct xfs_inode *ip = XFS_I(inode); int ret; - trace_xfs_filemap_page_mkwrite(XFS_I(inode)); + trace_xfs_filemap_fault(ip, pe_size, write_fault); - sb_start_pagefault(inode->i_sb); - file_update_time(vmf->vma->vm_file); - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + if (write_fault) { + sb_start_pagefault(inode->i_sb); + file_update_time(vmf->vma->vm_file); + } + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops); + ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops); } else { - ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); - ret = block_page_mkwrite_return(ret); + if (write_fault) + ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); + else + ret = filemap_fault(vmf); } - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - sb_end_pagefault(inode->i_sb); + if (write_fault) + sb_end_pagefault(inode->i_sb); return ret; } -STATIC int +static int xfs_filemap_fault( struct vm_fault *vmf) { - struct inode *inode = file_inode(vmf->vma->vm_file); - int ret; - - trace_xfs_filemap_fault(XFS_I(inode)); - /* DAX can shortcut the normal fault path on write faults! */ - if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(inode)) - return xfs_filemap_page_mkwrite(vmf); - - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - if (IS_DAX(inode)) - ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops); - else - ret = filemap_fault(vmf); - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - - return ret; + return __xfs_filemap_fault(vmf, PE_SIZE_PTE, + IS_DAX(file_inode(vmf->vma->vm_file)) && + (vmf->flags & FAULT_FLAG_WRITE)); } -/* - * Similar to xfs_filemap_fault(), the DAX fault path can call into here on - * both read and write faults. Hence we need to handle both cases. There is no - * ->huge_mkwrite callout for huge pages, so we have a single function here to - * handle both cases here. @flags carries the information on the type of fault - * occuring. - */ -STATIC int +static int xfs_filemap_huge_fault( struct vm_fault *vmf, enum page_entry_size pe_size) { - struct inode *inode = file_inode(vmf->vma->vm_file); - struct xfs_inode *ip = XFS_I(inode); - int ret; - - if (!IS_DAX(inode)) + if (!IS_DAX(file_inode(vmf->vma->vm_file))) return VM_FAULT_FALLBACK; - trace_xfs_filemap_huge_fault(ip); - - if (vmf->flags & FAULT_FLAG_WRITE) { - sb_start_pagefault(inode->i_sb); - file_update_time(vmf->vma->vm_file); - } - - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops); - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - - if (vmf->flags & FAULT_FLAG_WRITE) - sb_end_pagefault(inode->i_sb); + /* DAX can shortcut the normal fault path on write faults! 
*/ + return __xfs_filemap_fault(vmf, pe_size, + (vmf->flags & FAULT_FLAG_WRITE)); +} - return ret; +static int +xfs_filemap_page_mkwrite( + struct vm_fault *vmf) +{ + return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); } /* @@ -1130,7 +1101,7 @@ xfs_filemap_pfn_mkwrite( if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; else if (IS_DAX(inode)) - ret = dax_pfn_mkwrite(vmf); + ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops); xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); sb_end_pagefault(inode->i_sb); return ret; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 0a9e6985a0d0..34227115a5d6 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -1124,11 +1124,11 @@ reclaim: * Because we use RCU freeing we need to ensure the inode always appears * to be reclaimed with an invalid inode number when in the free state. * We do this as early as possible under the ILOCK so that - * xfs_iflush_cluster() can be guaranteed to detect races with us here. - * By doing this, we guarantee that once xfs_iflush_cluster has locked - * XFS_ILOCK that it will see either a valid, flushable inode that will - * serialise correctly, or it will see a clean (and invalid) inode that - * it can skip. + * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to + * detect races with us here. By doing this, we guarantee that once + * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that + * it will see either a valid inode that will serialise correctly, or it + * will see an invalid inode that it can skip. */ spin_lock(&ip->i_flags_lock); ip->i_flags = XFS_IRECLAIM; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ff48f0096810..5599dda4727a 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1055,7 +1055,7 @@ xfs_dir_ialloc( tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY); } - code = xfs_trans_roll(&tp, NULL); + code = xfs_trans_roll(&tp); if (committed != NULL) *committed = 1; @@ -1285,7 +1285,7 @@ xfs_create( */ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; @@ -1513,7 +1513,7 @@ xfs_link( if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) { xfs_defer_cancel(&dfops); goto error_return; @@ -1607,11 +1607,12 @@ xfs_itruncate_extents( * Duplicate the transaction that has the permanent * reservation and commit the old transaction. */ - error = xfs_defer_finish(&tp, &dfops, ip); + xfs_defer_ijoin(&dfops, ip); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; - error = xfs_trans_roll(&tp, ip); + error = xfs_trans_roll_inode(&tp, ip); if (error) goto out; } @@ -1855,7 +1856,7 @@ xfs_inactive_ifree( * Just ignore errors at this point. There is nothing we can do except * to try to keep going. Make sure it's not a silent error. */ - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) { xfs_notice(mp, "%s: xfs_defer_finish returned error %d", __func__, error); @@ -2359,11 +2360,24 @@ retry: * already marked stale. If we can't lock it, back off * and retry. 
*/ - if (ip != free_ip && - !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { - rcu_read_unlock(); - delay(1); - goto retry; + if (ip != free_ip) { + if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { + rcu_read_unlock(); + delay(1); + goto retry; + } + + /* + * Check the inode number again in case we're + * racing with freeing in xfs_reclaim_inode(). + * See the comments in that function for more + * information as to why the initial check is + * not sufficient. + */ + if (ip->i_ino != inum + i) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + continue; + } } rcu_read_unlock(); @@ -2637,7 +2651,7 @@ xfs_remove( if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; @@ -2723,7 +2737,7 @@ xfs_finish_rename( if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - error = xfs_defer_finish(&tp, dfops, NULL); + error = xfs_defer_finish(&tp, dfops); if (error) { xfs_defer_cancel(dfops); xfs_trans_cancel(tp); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 013cc78d7daf..6d0f74ec31e8 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -27,6 +27,7 @@ #include "xfs_error.h" #include "xfs_trace.h" #include "xfs_trans_priv.h" +#include "xfs_buf_item.h" #include "xfs_log.h" @@ -475,6 +476,23 @@ xfs_inode_item_unpin( wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); } +/* + * Callback used to mark a buffer with XFS_LI_FAILED when items in the buffer + * have been failed during writeback + * + * This informs the AIL that the inode is already flush locked on the next push, + * and acquires a hold on the buffer to ensure that it isn't reclaimed before + * dirty data makes it to disk. + */ +STATIC void +xfs_inode_item_error( + struct xfs_log_item *lip, + struct xfs_buf *bp) +{ + ASSERT(xfs_isiflocked(INODE_ITEM(lip)->ili_inode)); + xfs_set_li_failed(lip, bp); +} + STATIC uint xfs_inode_item_push( struct xfs_log_item *lip, @@ -484,13 +502,28 @@ xfs_inode_item_push( { struct xfs_inode_log_item *iip = INODE_ITEM(lip); struct xfs_inode *ip = iip->ili_inode; - struct xfs_buf *bp = NULL; + struct xfs_buf *bp = lip->li_buf; uint rval = XFS_ITEM_SUCCESS; int error; if (xfs_ipincount(ip) > 0) return XFS_ITEM_PINNED; + /* + * The buffer containing this item failed to be written back + * previously. Resubmit the buffer for IO. + */ + if (lip->li_flags & XFS_LI_FAILED) { + if (!xfs_buf_trylock(bp)) + return XFS_ITEM_LOCKED; + + if (!xfs_buf_resubmit_failed_buffers(bp, lip, buffer_list)) + rval = XFS_ITEM_FLUSHING; + + xfs_buf_unlock(bp); + return rval; + } + if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) return XFS_ITEM_LOCKED; @@ -622,7 +655,8 @@ static const struct xfs_item_ops xfs_inode_item_ops = { .iop_unlock = xfs_inode_item_unlock, .iop_committed = xfs_inode_item_committed, .iop_push = xfs_inode_item_push, - .iop_committing = xfs_inode_item_committing + .iop_committing = xfs_inode_item_committing, + .iop_error = xfs_inode_item_error }; @@ -710,7 +744,8 @@ xfs_iflush_done( * the AIL lock. */ iip = INODE_ITEM(blip); - if (iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) + if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) || + lip->li_flags & XFS_LI_FAILED) need_ail++; blip = next; @@ -718,7 +753,8 @@ xfs_iflush_done( /* make sure we capture the state of the initial inode. 
*/ iip = INODE_ITEM(lip); - if (iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) + if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) || + lip->li_flags & XFS_LI_FAILED) need_ail++; /* @@ -739,6 +775,9 @@ xfs_iflush_done( if (INODE_ITEM(blip)->ili_logged && blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) mlip_changed |= xfs_ail_delete_one(ailp, blip); + else { + xfs_clear_li_failed(blip); + } } if (mlip_changed) { diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 9c0c7a920304..5049e8ab6e30 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -931,16 +931,15 @@ xfs_ioc_fsgetxattr( return 0; } -STATIC void -xfs_set_diflags( +STATIC uint16_t +xfs_flags2diflags( struct xfs_inode *ip, unsigned int xflags) { - unsigned int di_flags; - uint64_t di_flags2; - /* can't set PREALLOC this way, just preserve it */ - di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); + uint16_t di_flags = + (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); + if (xflags & FS_XFLAG_IMMUTABLE) di_flags |= XFS_DIFLAG_IMMUTABLE; if (xflags & FS_XFLAG_APPEND) @@ -970,19 +969,24 @@ xfs_set_diflags( if (xflags & FS_XFLAG_EXTSIZE) di_flags |= XFS_DIFLAG_EXTSIZE; } - ip->i_d.di_flags = di_flags; - /* diflags2 only valid for v3 inodes. */ - if (ip->i_d.di_version < 3) - return; + return di_flags; +} + +STATIC uint64_t +xfs_flags2diflags2( + struct xfs_inode *ip, + unsigned int xflags) +{ + uint64_t di_flags2 = + (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK); - di_flags2 = (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK); if (xflags & FS_XFLAG_DAX) di_flags2 |= XFS_DIFLAG2_DAX; if (xflags & FS_XFLAG_COWEXTSIZE) di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; - ip->i_d.di_flags2 = di_flags2; + return di_flags2; } STATIC void @@ -1008,11 +1012,12 @@ xfs_diflags_to_linux( inode->i_flags |= S_NOATIME; else inode->i_flags &= ~S_NOATIME; +#if 0 /* disabled until the flag switching races are sorted out */ if (xflags & FS_XFLAG_DAX) inode->i_flags |= S_DAX; else inode->i_flags &= ~S_DAX; - +#endif } static int @@ -1022,6 +1027,7 @@ xfs_ioctl_setattr_xflags( struct fsxattr *fa) { struct xfs_mount *mp = ip->i_mount; + uint64_t di_flags2; /* Can't change realtime flag if any extents are allocated. */ if ((ip->i_d.di_nextents || ip->i_delayed_blks) && @@ -1052,7 +1058,14 @@ xfs_ioctl_setattr_xflags( !capable(CAP_LINUX_IMMUTABLE)) return -EPERM; - xfs_set_diflags(ip, fa->fsx_xflags); + /* diflags2 only valid for v3 inodes. 
*/ + di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags); + if (di_flags2 && ip->i_d.di_version < 3) + return -EINVAL; + + ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags); + ip->i_d.di_flags2 = di_flags2; + xfs_diflags_to_linux(ip); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 813394c62849..a1909bc064e9 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -69,6 +69,7 @@ xfs_bmbt_to_iomap( iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); + iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip)); } xfs_extlen_t @@ -274,7 +275,7 @@ xfs_iomap_write_direct( /* * Complete the transaction */ - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; @@ -520,7 +521,6 @@ xfs_file_iomap_begin_delay( struct inode *inode, loff_t offset, loff_t count, - unsigned flags, struct iomap *iomap) { struct xfs_inode *ip = XFS_I(inode); @@ -784,7 +784,7 @@ xfs_iomap_write_allocate( if (error) goto trans_cancel; - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto trans_cancel; @@ -906,7 +906,7 @@ xfs_iomap_write_unwritten( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto error_on_bmapi_transaction; @@ -976,7 +976,6 @@ xfs_file_iomap_begin( int nimaps = 1, error = 0; bool shared = false, trimmed = false; unsigned lockmode; - struct block_device *bdev; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -984,8 +983,7 @@ xfs_file_iomap_begin( if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) && !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) { /* Reserve delalloc blocks for regular writeback. */ - return xfs_file_iomap_begin_delay(inode, offset, length, flags, - iomap); + return xfs_file_iomap_begin_delay(inode, offset, length, iomap); } if (need_excl_ilock(ip, flags)) { @@ -1087,13 +1085,6 @@ xfs_file_iomap_begin( xfs_bmbt_to_iomap(ip, iomap, &imap); - /* optionally associate a dax device with the iomap bdev */ - bdev = iomap->bdev; - if (blk_queue_dax(bdev->bd_queue)) - iomap->dax_dev = fs_dax_get_by_host(bdev->bd_disk->disk_name); - else - iomap->dax_dev = NULL; - if (shared) iomap->flags |= IOMAP_F_SHARED; return 0; @@ -1171,7 +1162,6 @@ xfs_file_iomap_end( unsigned flags, struct iomap *iomap) { - fs_put_dax(iomap->dax_dev); if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC) return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, length, written, iomap); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 469c9fa4c178..17081c77ef86 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -817,7 +817,7 @@ xfs_vn_setattr_nonsize( * Caution: The caller of this function is responsible for calling * setattr_prepare() or otherwise verifying the change is fine. 
*/ -int +STATIC int xfs_setattr_size( struct xfs_inode *ip, struct iattr *iattr) diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 9301c5a6060b..dcd1292664b3 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -270,7 +270,14 @@ static inline uint64_t howmany_64(uint64_t x, uint32_t y) #endif /* DEBUG */ #ifdef CONFIG_XFS_RT -#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) + +/* + * make sure we ignore the inode flag if the filesystem doesn't have a + * configured realtime device. + */ +#define XFS_IS_REALTIME_INODE(ip) \ + (((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) && \ + (ip)->i_mount->m_rtdev_targp) #else #define XFS_IS_REALTIME_INODE(ip) (0) #endif diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 4ebd0bafc914..c5107c7bc4bf 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -743,10 +743,14 @@ xfs_log_mount_finish( struct xfs_mount *mp) { int error = 0; + bool readonly = (mp->m_flags & XFS_MOUNT_RDONLY); if (mp->m_flags & XFS_MOUNT_NORECOVERY) { ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); return 0; + } else if (readonly) { + /* Allow unlinked processing to proceed */ + mp->m_flags &= ~XFS_MOUNT_RDONLY; } /* @@ -757,12 +761,27 @@ xfs_log_mount_finish( * inodes. Turn it off immediately after recovery finishes * so that we don't leak the quota inodes if subsequent mount * activities fail. + * + * We let all inodes involved in redo item processing end up on + * the LRU instead of being evicted immediately so that if we do + * something to an unlinked inode, the irele won't cause + * premature truncation and freeing of the inode, which results + * in log recovery failure. We have to evict the unreferenced + * lru inodes after clearing MS_ACTIVE because we don't + * otherwise clean up the lru if there's a subsequent failure in + * xfs_mountfs, which leads to us leaking the inodes if nothing + * else (e.g. quotacheck) references the inodes before the + * mount failure occurs. */ mp->m_super->s_flags |= MS_ACTIVE; error = xlog_recover_finish(mp->m_log); if (!error) xfs_log_work_queue(mp); mp->m_super->s_flags &= ~MS_ACTIVE; + evict_inodes(mp->m_super); + + if (readonly) + mp->m_flags |= XFS_MOUNT_RDONLY; return error; } @@ -812,11 +831,14 @@ xfs_log_unmount_write(xfs_mount_t *mp) int error; /* - * Don't write out unmount record on read-only mounts. + * Don't write out unmount record on norecovery mounts or ro devices. * Or, if we are doing a forced umount (typically because of IO errors). */ - if (mp->m_flags & XFS_MOUNT_RDONLY) + if (mp->m_flags & XFS_MOUNT_NORECOVERY || + xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { + ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); return 0; + } error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log))); @@ -3353,8 +3375,6 @@ maybe_sleep: */ if (iclog->ic_state & XLOG_STATE_IOERROR) return -EIO; - if (log_flushed) - *log_flushed = 1; } else { no_sleep: @@ -3458,8 +3478,6 @@ try_again: xlog_wait(&iclog->ic_prev->ic_write_wait, &log->l_icloglock); - if (log_flushed) - *log_flushed = 1; already_slept = 1; goto try_again; } @@ -3493,9 +3511,6 @@ try_again: */ if (iclog->ic_state & XLOG_STATE_IOERROR) return -EIO; - - if (log_flushed) - *log_flushed = 1; } else { /* just return */ spin_unlock(&log->l_icloglock); } diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 9549188f5a36..ee34899396b2 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1029,61 +1029,106 @@ out_error: } /* - * Check the log tail for torn writes. 
This is required when torn writes are - * detected at the head and the head had to be walked back to a previous record. - * The tail of the previous record must now be verified to ensure the torn - * writes didn't corrupt the previous tail. + * Calculate distance from head to tail (i.e., unused space in the log). + */ +static inline int +xlog_tail_distance( + struct xlog *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk) +{ + if (head_blk < tail_blk) + return tail_blk - head_blk; + + return tail_blk + (log->l_logBBsize - head_blk); +} + +/* + * Verify the log tail. This is particularly important when torn or incomplete + * writes have been detected near the front of the log and the head has been + * walked back accordingly. + * + * We also have to handle the case where the tail was pinned and the head + * blocked behind the tail right before a crash. If the tail had been pushed + * immediately prior to the crash and the subsequent checkpoint was only + * partially written, it's possible it overwrote the last referenced tail in the + * log with garbage. This is not a coherency problem because the tail must have + * been pushed before it can be overwritten, but appears as log corruption to + * recovery because we have no way to know the tail was updated if the + * subsequent checkpoint didn't write successfully. * - * Return an error if CRC verification fails as recovery cannot proceed. + * Therefore, CRC check the log from tail to head. If a failure occurs and the + * offending record is within max iclog bufs from the head, walk the tail + * forward and retry until a valid tail is found or corruption is detected out + * of the range of a possible overwrite. */ STATIC int xlog_verify_tail( struct xlog *log, xfs_daddr_t head_blk, - xfs_daddr_t tail_blk) + xfs_daddr_t *tail_blk, + int hsize) { struct xlog_rec_header *thead; struct xfs_buf *bp; xfs_daddr_t first_bad; - int count; int error = 0; bool wrapped; - xfs_daddr_t tmp_head; + xfs_daddr_t tmp_tail; + xfs_daddr_t orig_tail = *tail_blk; bp = xlog_get_bp(log, 1); if (!bp) return -ENOMEM; /* - * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get - * a temporary head block that points after the last possible - * concurrently written record of the tail. + * Make sure the tail points to a record (returns positive count on + * success). */ - count = xlog_seek_logrec_hdr(log, head_blk, tail_blk, - XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead, - &wrapped); - if (count < 0) { - error = count; + error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, bp, + &tmp_tail, &thead, &wrapped); + if (error < 0) goto out; - } - - /* - * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran - * into the actual log head. tmp_head points to the start of the record - * so update it to the actual head block. - */ - if (count < XLOG_MAX_ICLOGS + 1) - tmp_head = head_blk; + if (*tail_blk != tmp_tail) + *tail_blk = tmp_tail; /* - * We now have a tail and temporary head block that covers at least - * XLOG_MAX_ICLOGS records from the tail. We need to verify that these - * records were completely written. Run a CRC verification pass from - * tail to head and return the result. + * Run a CRC check from the tail to the head. We can't just check + * MAX_ICLOGS records past the tail because the tail may point to stale + * blocks cleared during the search for the head/tail. These blocks are + * overwritten with zero-length records and thus record count is not a + * reliable indicator of the iclog state before a crash. 
*/ - error = xlog_do_recovery_pass(log, tmp_head, tail_blk, + first_bad = 0; + error = xlog_do_recovery_pass(log, head_blk, *tail_blk, XLOG_RECOVER_CRCPASS, &first_bad); + while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) { + int tail_distance; + + /* + * Is corruption within range of the head? If so, retry from + * the next record. Otherwise return an error. + */ + tail_distance = xlog_tail_distance(log, head_blk, first_bad); + if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize)) + break; + /* skip to the next record; returns positive count on success */ + error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2, bp, + &tmp_tail, &thead, &wrapped); + if (error < 0) + goto out; + + *tail_blk = tmp_tail; + first_bad = 0; + error = xlog_do_recovery_pass(log, head_blk, *tail_blk, + XLOG_RECOVER_CRCPASS, &first_bad); + } + + if (!error && *tail_blk != orig_tail) + xfs_warn(log->l_mp, + "Tail block (0x%llx) overwrite detected. Updated to 0x%llx", + orig_tail, *tail_blk); out: xlog_put_bp(bp); return error; @@ -1143,7 +1188,7 @@ xlog_verify_head( */ error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk, XLOG_RECOVER_CRCPASS, &first_bad); - if (error == -EFSBADCRC) { + if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) { /* * We've hit a potential torn write. Reset the error and warn * about it. @@ -1183,31 +1228,12 @@ xlog_verify_head( ASSERT(0); return 0; } - - /* - * Now verify the tail based on the updated head. This is - * required because the torn writes trimmed from the head could - * have been written over the tail of a previous record. Return - * any errors since recovery cannot proceed if the tail is - * corrupt. - * - * XXX: This leaves a gap in truly robust protection from torn - * writes in the log. If the head is behind the tail, the tail - * pushes forward to create some space and then a crash occurs - * causing the writes into the previous record's tail region to - * tear, log recovery isn't able to recover. - * - * How likely is this to occur? If possible, can we do something - * more intelligent here? Is it safe to push the tail forward if - * we can determine that the tail is within the range of the - * torn write (e.g., the kernel can only overwrite the tail if - * it has actually been pushed forward)? Alternatively, could we - * somehow prevent this condition at runtime? - */ - error = xlog_verify_tail(log, *head_blk, *tail_blk); } + if (error) + return error; - return error; + return xlog_verify_tail(log, *head_blk, tail_blk, + be32_to_cpu((*rhead)->h_size)); } /* @@ -4801,12 +4827,16 @@ xlog_recover_process_intents( int error = 0; struct xfs_ail_cursor cur; struct xfs_ail *ailp; +#if defined(DEBUG) || defined(XFS_WARN) xfs_lsn_t last_lsn; +#endif ailp = log->l_ailp; spin_lock(&ailp->xa_lock); lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); +#if defined(DEBUG) || defined(XFS_WARN) last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); +#endif while (lip != NULL) { /* * We're done when we see something other than an intent. 
@@ -5218,7 +5248,7 @@ xlog_do_recovery_pass( xfs_daddr_t *first_bad) /* out: first bad log rec */ { xlog_rec_header_t *rhead; - xfs_daddr_t blk_no; + xfs_daddr_t blk_no, rblk_no; xfs_daddr_t rhead_blk; char *offset; xfs_buf_t *hbp, *dbp; @@ -5231,7 +5261,7 @@ xlog_do_recovery_pass( LIST_HEAD (buffer_list); ASSERT(head_blk != tail_blk); - rhead_blk = 0; + blk_no = rhead_blk = tail_blk; for (i = 0; i < XLOG_RHASH_SIZE; i++) INIT_HLIST_HEAD(&rhash[i]); @@ -5309,7 +5339,6 @@ xlog_do_recovery_pass( } memset(rhash, 0, sizeof(rhash)); - blk_no = rhead_blk = tail_blk; if (tail_blk > head_blk) { /* * Perform recovery around the end of the physical log. @@ -5371,9 +5400,19 @@ xlog_do_recovery_pass( bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); blk_no += hblks; - /* Read in data for log record */ - if (blk_no + bblks <= log->l_logBBsize) { - error = xlog_bread(log, blk_no, bblks, dbp, + /* + * Read the log record data in multiple reads if it + * wraps around the end of the log. Note that if the + * header already wrapped, blk_no could point past the + * end of the log. The record data is contiguous in + * that case. + */ + if (blk_no + bblks <= log->l_logBBsize || + blk_no >= log->l_logBBsize) { + /* mod blk_no in case the header wrapped and + * pushed it beyond the end of the log */ + rblk_no = do_mod(blk_no, log->l_logBBsize); + error = xlog_bread(log, rblk_no, bblks, dbp, &offset); if (error) goto bread_err2; @@ -5563,6 +5602,8 @@ xlog_do_recover( xfs_buf_t *bp; xfs_sb_t *sbp; + trace_xfs_log_recover(log, head_blk, tail_blk); + /* * First replay the images in the log. */ diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 15751dc2a27d..010a13a201aa 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -31,6 +31,7 @@ #include "xfs_error.h" #include "xfs_bmap.h" #include "xfs_bmap_btree.h" +#include "xfs_bmap_util.h" #include "xfs_trans.h" #include "xfs_trans_space.h" #include "xfs_qm.h" @@ -1120,31 +1121,6 @@ xfs_qm_quotacheck_dqadjust( return 0; } -STATIC int -xfs_qm_get_rtblks( - xfs_inode_t *ip, - xfs_qcnt_t *O_rtblks) -{ - xfs_filblks_t rtblks; /* total rt blks */ - xfs_extnum_t idx; /* extent record index */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t nextents; /* number of extent entries */ - int error; - - ASSERT(XFS_IS_REALTIME_INODE(ip)); - ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); - if (!(ifp->if_flags & XFS_IFEXTENTS)) { - if ((error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK))) - return error; - } - rtblks = 0; - nextents = xfs_iext_count(ifp); - for (idx = 0; idx < nextents; idx++) - rtblks += xfs_bmbt_get_blockcount(xfs_iext_get_ext(ifp, idx)); - *O_rtblks = (xfs_qcnt_t)rtblks; - return 0; -} - /* * callback routine supplied to bulkstat(). Given an inumber, find its * dquots and update them to account for resources taken by that inode. @@ -1160,7 +1136,8 @@ xfs_qm_dqusage_adjust( int *res) /* result code value */ { xfs_inode_t *ip; - xfs_qcnt_t nblks, rtblks = 0; + xfs_qcnt_t nblks; + xfs_filblks_t rtblks = 0; /* total rt blks */ int error; ASSERT(XFS_IS_QUOTA_RUNNING(mp)); @@ -1190,12 +1167,15 @@ xfs_qm_dqusage_adjust( ASSERT(ip->i_delayed_blks == 0); if (XFS_IS_REALTIME_INODE(ip)) { - /* - * Walk thru the extent list and count the realtime blocks. 
- */ - error = xfs_qm_get_rtblks(ip, &rtblks); - if (error) - goto error0; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + goto error0; + } + + xfs_bmap_count_leaves(ifp, &rtblks); } nblks = (xfs_qcnt_t)ip->i_d.di_nblocks - rtblks; diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 96fe209b5eb6..8f2e2fac4255 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -525,7 +525,7 @@ xfs_cui_recover( } xfs_refcount_finish_one_cleanup(tp, rcur, error); - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto abort_defer; set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index f45fbf0db9bb..3246815c24d6 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -464,7 +464,7 @@ retry: goto out_bmap_cancel; /* Finish up. */ - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; @@ -602,7 +602,8 @@ xfs_reflink_cancel_cow_blocks( -(long)del.br_blockcount); /* Roll the transaction */ - error = xfs_defer_finish(tpp, &dfops, ip); + xfs_defer_ijoin(&dfops, ip); + error = xfs_defer_finish(tpp, &dfops); if (error) { xfs_defer_cancel(&dfops); break; @@ -791,7 +792,8 @@ xfs_reflink_end_cow( /* Remove the mapping from the CoW fork. */ xfs_bmap_del_extent_cow(ip, &idx, &got, &del); - error = xfs_defer_finish(&tp, &dfops, ip); + xfs_defer_ijoin(&dfops, ip); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_defer; next_extent: @@ -1152,7 +1154,8 @@ xfs_reflink_remap_extent( next_extent: /* Process all the deferred stuff. */ - error = xfs_defer_finish(&tp, &dfops, ip); + xfs_defer_ijoin(&dfops, ip); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_defer; } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 91472193643b..488719d43ca8 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -810,7 +810,7 @@ xfs_growfs_rt_alloc( /* * Free any blocks freed up in the transaction, then commit. 
*/ - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; error = xfs_trans_commit(tp); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 38aaacdbb8b3..3008f31753df 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -714,17 +714,26 @@ STATIC void xfs_close_devices( struct xfs_mount *mp) { + struct dax_device *dax_ddev = mp->m_ddev_targp->bt_daxdev; + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { struct block_device *logdev = mp->m_logdev_targp->bt_bdev; + struct dax_device *dax_logdev = mp->m_logdev_targp->bt_daxdev; + xfs_free_buftarg(mp, mp->m_logdev_targp); xfs_blkdev_put(logdev); + fs_put_dax(dax_logdev); } if (mp->m_rtdev_targp) { struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; + struct dax_device *dax_rtdev = mp->m_rtdev_targp->bt_daxdev; + xfs_free_buftarg(mp, mp->m_rtdev_targp); xfs_blkdev_put(rtdev); + fs_put_dax(dax_rtdev); } xfs_free_buftarg(mp, mp->m_ddev_targp); + fs_put_dax(dax_ddev); } /* @@ -742,6 +751,8 @@ xfs_open_devices( struct xfs_mount *mp) { struct block_device *ddev = mp->m_super->s_bdev; + struct dax_device *dax_ddev = fs_dax_get_by_bdev(ddev); + struct dax_device *dax_logdev = NULL, *dax_rtdev = NULL; struct block_device *logdev = NULL, *rtdev = NULL; int error; @@ -752,6 +763,7 @@ xfs_open_devices( error = xfs_blkdev_get(mp, mp->m_logname, &logdev); if (error) goto out; + dax_logdev = fs_dax_get_by_bdev(logdev); } if (mp->m_rtname) { @@ -765,24 +777,25 @@ xfs_open_devices( error = -EINVAL; goto out_close_rtdev; } + dax_rtdev = fs_dax_get_by_bdev(rtdev); } /* * Setup xfs_mount buffer target pointers */ error = -ENOMEM; - mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev); + mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, dax_ddev); if (!mp->m_ddev_targp) goto out_close_rtdev; if (rtdev) { - mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev); + mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, dax_rtdev); if (!mp->m_rtdev_targp) goto out_free_ddev_targ; } if (logdev && logdev != ddev) { - mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev); + mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, dax_logdev); if (!mp->m_logdev_targp) goto out_free_rtdev_targ; } else { @@ -798,10 +811,14 @@ xfs_open_devices( xfs_free_buftarg(mp, mp->m_ddev_targp); out_close_rtdev: xfs_blkdev_put(rtdev); + fs_put_dax(dax_rtdev); out_close_logdev: - if (logdev && logdev != ddev) + if (logdev && logdev != ddev) { xfs_blkdev_put(logdev); + fs_put_dax(dax_logdev); + } out: + fs_put_dax(dax_ddev); return error; } @@ -1220,7 +1237,7 @@ xfs_test_remount_options( tmp_mp->m_super = sb; error = xfs_parseargs(tmp_mp, options); xfs_free_fsname(tmp_mp); - kfree(tmp_mp); + kmem_free(tmp_mp); return error; } diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 23a50d7aa46a..68d3ca2c4968 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -378,7 +378,7 @@ xfs_symlink( xfs_trans_set_sync(tp); } - error = xfs_defer_finish(&tp, &dfops, NULL); + error = xfs_defer_finish(&tp, &dfops); if (error) goto out_bmap_cancel; @@ -497,7 +497,8 @@ xfs_inactive_symlink_rmt( /* * Commit the first transaction. This logs the EFI and the inode. 
*/ - error = xfs_defer_finish(&tp, &dfops, ip); + xfs_defer_ijoin(&dfops, ip); + error = xfs_defer_finish(&tp, &dfops); if (error) goto error_bmap_cancel; /* diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index bcc3cdf8e1c5..bb5514688d47 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -517,7 +517,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format); -DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_ordered); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); @@ -689,11 +688,34 @@ DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid); -DEFINE_INODE_EVENT(xfs_filemap_fault); -DEFINE_INODE_EVENT(xfs_filemap_huge_fault); -DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite); DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite); +TRACE_EVENT(xfs_filemap_fault, + TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size, + bool write_fault), + TP_ARGS(ip, pe_size, write_fault), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(enum page_entry_size, pe_size) + __field(bool, write_fault) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->pe_size = pe_size; + __entry->write_fault = write_fault; + ), + TP_printk("dev %d:%d ino 0x%llx %s write_fault %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_symbolic(__entry->pe_size, + { PE_SIZE_PTE, "PTE" }, + { PE_SIZE_PMD, "PMD" }, + { PE_SIZE_PUD, "PUD" }), + __entry->write_fault) +) + DECLARE_EVENT_CLASS(xfs_iref_class, TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), TP_ARGS(ip, caller_ip), @@ -1963,6 +1985,24 @@ DEFINE_EVENT(xfs_swap_extent_class, name, \ DEFINE_SWAPEXT_EVENT(xfs_swap_extent_before); DEFINE_SWAPEXT_EVENT(xfs_swap_extent_after); +TRACE_EVENT(xfs_log_recover, + TP_PROTO(struct xlog *log, xfs_daddr_t headblk, xfs_daddr_t tailblk), + TP_ARGS(log, headblk, tailblk), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, headblk) + __field(xfs_daddr_t, tailblk) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->headblk = headblk; + __entry->tailblk = tailblk; + ), + TP_printk("dev %d:%d headblk 0x%llx tailblk 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->headblk, + __entry->tailblk) +) + TRACE_EVENT(xfs_log_recover_record, TP_PROTO(struct xlog *log, struct xlog_rec_header *rhead, int pass), TP_ARGS(log, rhead, pass), diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 2011620008de..a87f657f59c9 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1035,25 +1035,18 @@ xfs_trans_cancel( */ int xfs_trans_roll( - struct xfs_trans **tpp, - struct xfs_inode *dp) + struct xfs_trans **tpp) { - struct xfs_trans *trans; + struct xfs_trans *trans = *tpp; struct xfs_trans_res tres; int error; /* - * Ensure that the inode is always logged. - */ - trans = *tpp; - if (dp) - xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); - - /* * Copy the critical parameters from one trans to the next. */ tres.tr_logres = trans->t_log_res; tres.tr_logcount = trans->t_log_count; + *tpp = xfs_trans_dup(trans); /* @@ -1067,10 +1060,8 @@ xfs_trans_roll( if (error) return error; - trans = *tpp; - /* - * Reserve space in the log for th next transaction. 
+ * Reserve space in the log for the next transaction. * This also pushes items in the "AIL", the list of logged items, * out to disk if they are taking up space at the tail of the log * that we want to use. This requires that either nothing be locked @@ -1078,14 +1069,5 @@ xfs_trans_roll( * the prior and the next transactions. */ tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - error = xfs_trans_reserve(trans, &tres, 0, 0); - /* - * Ensure that the inode is in the new transaction and locked. - */ - if (error) - return error; - - if (dp) - xfs_trans_ijoin(trans, dp, 0); - return 0; + return xfs_trans_reserve(*tpp, &tres, 0, 0); } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 6bdad6f58934..815b53d20e26 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -49,6 +49,7 @@ typedef struct xfs_log_item { struct xfs_ail *li_ailp; /* ptr to AIL */ uint li_type; /* item type */ uint li_flags; /* misc flags */ + struct xfs_buf *li_buf; /* real buffer pointer */ struct xfs_log_item *li_bio_list; /* buffer item list */ void (*li_cb)(struct xfs_buf *, struct xfs_log_item *); @@ -64,11 +65,13 @@ typedef struct xfs_log_item { } xfs_log_item_t; #define XFS_LI_IN_AIL 0x1 -#define XFS_LI_ABORTED 0x2 +#define XFS_LI_ABORTED 0x2 +#define XFS_LI_FAILED 0x4 #define XFS_LI_FLAGS \ { XFS_LI_IN_AIL, "IN_AIL" }, \ - { XFS_LI_ABORTED, "ABORTED" } + { XFS_LI_ABORTED, "ABORTED" }, \ + { XFS_LI_FAILED, "FAILED" } struct xfs_item_ops { void (*iop_size)(xfs_log_item_t *, int *, int *); @@ -79,6 +82,7 @@ struct xfs_item_ops { void (*iop_unlock)(xfs_log_item_t *); xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); + void (*iop_error)(xfs_log_item_t *, xfs_buf_t *); }; void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, @@ -208,12 +212,14 @@ void xfs_trans_bhold_release(xfs_trans_t *, struct xfs_buf *); void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *); void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); -void xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *); +bool xfs_trans_ordered_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_ichgtime(struct xfs_trans *, struct xfs_inode *, int); void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint); -void xfs_trans_log_buf(xfs_trans_t *, struct xfs_buf *, uint, uint); +void xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint, + uint); +void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *); void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint); void xfs_extent_free_init_defer_op(void); @@ -224,7 +230,8 @@ int xfs_trans_free_extent(struct xfs_trans *, struct xfs_efd_log_item *, xfs_fsblock_t, xfs_extlen_t, struct xfs_owner_info *); int xfs_trans_commit(struct xfs_trans *); -int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); +int xfs_trans_roll(struct xfs_trans **); +int xfs_trans_roll_inode(struct xfs_trans **, struct xfs_inode *); void xfs_trans_cancel(xfs_trans_t *); int xfs_trans_ail_init(struct xfs_mount *); void xfs_trans_ail_destroy(struct xfs_mount *); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 9056c0f34a3c..354368a906e5 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -325,6 +325,21 @@ xfs_ail_delete( xfs_trans_ail_cursor_clear(ailp, lip); } +static inline uint 
+xfsaild_push_item( + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + /* + * If log item pinning is enabled, skip the push and track the item as + * pinned. This can help induce head-behind-tail conditions. + */ + if (XFS_TEST_ERROR(false, ailp->xa_mount, XFS_ERRTAG_LOG_ITEM_PIN)) + return XFS_ITEM_PINNED; + + return lip->li_ops->iop_push(lip, &ailp->xa_buf_list); +} + static long xfsaild_push( struct xfs_ail *ailp) @@ -382,7 +397,7 @@ xfsaild_push( * rely on the AIL cursor implementation to be able to deal with * the dropped lock. */ - lock_result = lip->li_ops->iop_push(lip, &ailp->xa_buf_list); + lock_result = xfsaild_push_item(ailp, lip); switch (lock_result) { case XFS_ITEM_SUCCESS: XFS_STATS_INC(mp, xs_push_ail_success); @@ -687,12 +702,13 @@ xfs_trans_ail_update_bulk( bool xfs_ail_delete_one( struct xfs_ail *ailp, - struct xfs_log_item *lip) + struct xfs_log_item *lip) { struct xfs_log_item *mlip = xfs_ail_min(ailp); trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); xfs_ail_delete(ailp, lip); + xfs_clear_li_failed(lip); lip->li_flags &= ~XFS_LI_IN_AIL; lip->li_lsn = 0; diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 86987d823d76..3ba7a96a8abd 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -435,7 +435,7 @@ xfs_trans_brelse(xfs_trans_t *tp, if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) { xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR); xfs_buf_item_relse(bp); - } else if (!xfs_buf_item_dirty(bip)) { + } else if (!(bip->bli_flags & XFS_BLI_DIRTY)) { /*** ASSERT(bp->b_pincount == 0); ***/ @@ -493,25 +493,17 @@ xfs_trans_bhold_release(xfs_trans_t *tp, } /* - * This is called to mark bytes first through last inclusive of the given - * buffer as needing to be logged when the transaction is committed. - * The buffer must already be associated with the given transaction. - * - * First and last are numbers relative to the beginning of this buffer, - * so the first byte in the buffer is numbered 0 regardless of the - * value of b_blkno. + * Mark a buffer dirty in the transaction. */ void -xfs_trans_log_buf(xfs_trans_t *tp, - xfs_buf_t *bp, - uint first, - uint last) +xfs_trans_dirty_buf( + struct xfs_trans *tp, + struct xfs_buf *bp) { - xfs_buf_log_item_t *bip = bp->b_fspriv; + struct xfs_buf_log_item *bip = bp->b_fspriv; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); - ASSERT(first <= last && last < BBTOB(bp->b_length)); ASSERT(bp->b_iodone == NULL || bp->b_iodone == xfs_buf_iodone_callbacks); @@ -531,8 +523,6 @@ xfs_trans_log_buf(xfs_trans_t *tp, bp->b_iodone = xfs_buf_iodone_callbacks; bip->bli_item.li_cb = xfs_buf_iodone; - trace_xfs_trans_log_buf(bip); - /* * If we invalidated the buffer within this transaction, then * cancel the invalidation now that we're dirtying the buffer @@ -545,17 +535,37 @@ xfs_trans_log_buf(xfs_trans_t *tp, bp->b_flags &= ~XBF_STALE; bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL; } + bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; tp->t_flags |= XFS_TRANS_DIRTY; bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; +} - /* - * If we have an ordered buffer we are not logging any dirty range but - * it still needs to be marked dirty and that it has been logged. - */ - bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; - if (!(bip->bli_flags & XFS_BLI_ORDERED)) - xfs_buf_item_log(bip, first, last); +/* + * This is called to mark bytes first through last inclusive of the given + * buffer as needing to be logged when the transaction is committed. 
+ * The buffer must already be associated with the given transaction. + * + * First and last are numbers relative to the beginning of this buffer, + * so the first byte in the buffer is numbered 0 regardless of the + * value of b_blkno. + */ +void +xfs_trans_log_buf( + struct xfs_trans *tp, + struct xfs_buf *bp, + uint first, + uint last) +{ + struct xfs_buf_log_item *bip = bp->b_fspriv; + + ASSERT(first <= last && last < BBTOB(bp->b_length)); + ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED)); + + xfs_trans_dirty_buf(tp, bp); + + trace_xfs_trans_log_buf(bip); + xfs_buf_item_log(bip, first, last); } @@ -708,14 +718,13 @@ xfs_trans_inode_alloc_buf( } /* - * Mark the buffer as ordered for this transaction. This means - * that the contents of the buffer are not recorded in the transaction - * but it is tracked in the AIL as though it was. This allows us - * to record logical changes in transactions rather than the physical - * changes we make to the buffer without changing writeback ordering - * constraints of metadata buffers. + * Mark the buffer as ordered for this transaction. This means that the contents + * of the buffer are not recorded in the transaction but it is tracked in the + * AIL as though it was. This allows us to record logical changes in + * transactions rather than the physical changes we make to the buffer without + * changing writeback ordering constraints of metadata buffers. */ -void +bool xfs_trans_ordered_buf( struct xfs_trans *tp, struct xfs_buf *bp) @@ -726,8 +735,18 @@ xfs_trans_ordered_buf( ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); + if (xfs_buf_item_dirty_format(bip)) + return false; + bip->bli_flags |= XFS_BLI_ORDERED; trace_xfs_buf_item_ordered(bip); + + /* + * We don't log a dirty range of an ordered buffer but it still needs + * to be marked dirty and that it has been logged. + */ + xfs_trans_dirty_buf(tp, bp); + return true; } /* diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index dab8daa676f9..daa7615497f9 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -134,3 +134,17 @@ xfs_trans_log_inode( flags |= ip->i_itemp->ili_last_fields; ip->i_itemp->ili_fields |= flags; } + +int +xfs_trans_roll_inode( + struct xfs_trans **tpp, + struct xfs_inode *ip) +{ + int error; + + xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); + error = xfs_trans_roll(tpp); + if (!error) + xfs_trans_ijoin(*tpp, ip, 0); + return error; +} diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index d91706c56c63..b317a3644c00 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -164,4 +164,35 @@ xfs_trans_ail_copy_lsn( *dst = *src; } #endif + +static inline void +xfs_clear_li_failed( + struct xfs_log_item *lip) +{ + struct xfs_buf *bp = lip->li_buf; + + ASSERT(lip->li_flags & XFS_LI_IN_AIL); + lockdep_assert_held(&lip->li_ailp->xa_lock); + + if (lip->li_flags & XFS_LI_FAILED) { + lip->li_flags &= ~XFS_LI_FAILED; + lip->li_buf = NULL; + xfs_buf_rele(bp); + } +} + +static inline void +xfs_set_li_failed( + struct xfs_log_item *lip, + struct xfs_buf *bp) +{ + lockdep_assert_held(&lip->li_ailp->xa_lock); + + if (!(lip->li_flags & XFS_LI_FAILED)) { + xfs_buf_hold(bp); + lip->li_flags |= XFS_LI_FAILED; + lip->li_buf = bp; + } +} + #endif /* __XFS_TRANS_PRIV_H__ */ |
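The XFS_LI_FAILED helpers added to xfs_trans_priv.h above pair a flag transition with a reference on the backing buffer: setting the failed state takes a hold on the buffer and caches it in li_buf, clearing it drops that hold, and both transitions run under the AIL lock so the flag and the cached pointer cannot go out of sync. The sketch below is a minimal standalone model of that pairing, not kernel code: the model_* names, the pthread mutex standing in for xa_lock, and the plain counter standing in for xfs_buf_hold()/xfs_buf_rele() are all assumptions made to keep the example self-contained and compilable.

#include <assert.h>
#include <pthread.h>
#include <stddef.h>

#define MODEL_LI_FAILED	0x4

/* Simplified stand-ins; the real kernel types carry far more state. */
struct model_buf {
	int holds;			/* stands in for xfs_buf_hold()/xfs_buf_rele() */
};

struct model_log_item {
	unsigned int	 flags;
	struct model_buf *buf;		/* stands in for li_buf */
};

static pthread_mutex_t ail_lock = PTHREAD_MUTEX_INITIALIZER;	/* ~ xa_lock */

/* Mark the item failed: take a hold on the buffer and remember it. */
static void model_set_li_failed(struct model_log_item *lip, struct model_buf *bp)
{
	pthread_mutex_lock(&ail_lock);
	if (!(lip->flags & MODEL_LI_FAILED)) {
		bp->holds++;				/* xfs_buf_hold() */
		lip->flags |= MODEL_LI_FAILED;
		lip->buf = bp;
	}
	pthread_mutex_unlock(&ail_lock);
}

/* Clear the failed state: drop the hold taken above. */
static void model_clear_li_failed(struct model_log_item *lip)
{
	pthread_mutex_lock(&ail_lock);
	if (lip->flags & MODEL_LI_FAILED) {
		struct model_buf *bp = lip->buf;

		lip->flags &= ~MODEL_LI_FAILED;
		lip->buf = NULL;
		bp->holds--;				/* xfs_buf_rele() */
	}
	pthread_mutex_unlock(&ail_lock);
}

int main(void)
{
	struct model_buf bp = { .holds = 1 };
	struct model_log_item lip = { 0 };

	model_set_li_failed(&lip, &bp);
	model_set_li_failed(&lip, &bp);		/* idempotent: only one extra hold */
	assert(bp.holds == 2);

	model_clear_li_failed(&lip);
	assert(bp.holds == 1 && lip.buf == NULL);
	return 0;
}

Holding the buffer while the flag is set mirrors the comment added at xfs_inode_item_error(): the next AIL push may need the buffer to resubmit the failed write, so it must not be reclaimed while any attached log item is still marked failed.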
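The reworked xlog_verify_tail() in fs/xfs/xfs_log_recover.c above decides whether a CRC failure is close enough to the head to plausibly be a torn overwrite by comparing xlog_tail_distance() against BTOBB(XLOG_MAX_ICLOGS * hsize). The distance itself is plain circular-buffer arithmetic on basic blocks; the fragment below restates it outside the kernel with hypothetical block numbers purely to illustrate the wrapped and unwrapped cases.

#include <assert.h>

/* Unused space from head to tail in a circular log of log_size blocks. */
static int tail_distance(int log_size, int head_blk, int tail_blk)
{
	if (head_blk < tail_blk)
		return tail_blk - head_blk;

	return tail_blk + (log_size - head_blk);	/* head has wrapped */
}

int main(void)
{
	/* hypothetical 1000-block log */
	assert(tail_distance(1000, 100, 400) == 300);	/* head behind tail */
	assert(tail_distance(1000, 900, 200) == 300);	/* head wrapped past the end */
	return 0;
}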