From e02ba72aabfade4c9cd6e3263e9b57bf890ad25c Mon Sep 17 00:00:00 2001 From: Anatol Pomozov Date: Tue, 15 Apr 2014 11:31:33 -0700 Subject: aio: block io_destroy() until all context requests are completed deletes aio context and all resources related to. It makes sense that no IO operations connected to the context should be running after the context is destroyed. As we removed io_context we have no chance to get requests status or call io_getevents(). man page for io_destroy says that this function may block until all context's requests are completed. Before kernel 3.11 io_destroy() blocked indeed, but since aio refactoring in 3.11 it is not true anymore. Here is a pseudo-code that shows a testcase for a race condition discovered in 3.11: initialize io_context io_submit(read to buffer) io_destroy() // context is destroyed so we can free the resources free(buffers); // if the buffer is allocated by some other user he'll be surprised // to learn that the buffer still filled by an outstanding operation // from the destroyed io_context The fix is straight-forward - add a completion struct and wait on it in io_destroy, complete() should be called when number of in-fligh requests reaches zero. If two or more io_destroy() called for the same context simultaneously then only the first one waits for IO completion, other calls behaviour is undefined. Tested: ran http://pastebin.com/LrPsQ4RL testcase for several hours and do not see the race condition anymore. Signed-off-by: Anatol Pomozov Signed-off-by: Benjamin LaHaise --- fs/aio.c | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 12a3de0ee6da..2adbb0398ab9 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -112,6 +112,11 @@ struct kioctx { struct work_struct free_work; + /* + * signals when all in-flight requests are done + */ + struct completion *requests_done; + struct { /* * This counts the number of available slots in the ringbuffer, @@ -508,6 +513,10 @@ static void free_ioctx_reqs(struct percpu_ref *ref) { struct kioctx *ctx = container_of(ref, struct kioctx, reqs); + /* At this point we know that there are no any in-flight requests */ + if (ctx->requests_done) + complete(ctx->requests_done); + INIT_WORK(&ctx->free_work, free_ioctx); schedule_work(&ctx->free_work); } @@ -718,7 +727,8 @@ err: * when the processes owning a context have all exited to encourage * the rapid destruction of the kioctx. */ -static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx) +static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, + struct completion *requests_done) { if (!atomic_xchg(&ctx->dead, 1)) { struct kioctx_table *table; @@ -747,7 +757,11 @@ static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx) if (ctx->mmap_size) vm_munmap(ctx->mmap_base, ctx->mmap_size); + ctx->requests_done = requests_done; percpu_ref_kill(&ctx->users); + } else { + if (requests_done) + complete(requests_done); } } @@ -809,7 +823,7 @@ void exit_aio(struct mm_struct *mm) */ ctx->mmap_size = 0; - kill_ioctx(mm, ctx); + kill_ioctx(mm, ctx, NULL); } } @@ -1185,7 +1199,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); if (ret) - kill_ioctx(current->mm, ioctx); + kill_ioctx(current->mm, ioctx, NULL); percpu_ref_put(&ioctx->users); } @@ -1203,8 +1217,22 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) { struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { - kill_ioctx(current->mm, ioctx); + struct completion requests_done = + COMPLETION_INITIALIZER_ONSTACK(requests_done); + + /* Pass requests_done to kill_ioctx() where it can be set + * in a thread-safe way. If we try to set it here then we have + * a race condition if two io_destroy() called simultaneously. + */ + kill_ioctx(current->mm, ioctx, &requests_done); percpu_ref_put(&ioctx->users); + + /* Wait until all IO for the context are done. Otherwise kernel + * keep using user-space buffers even if user thinks the context + * is destroyed. + */ + wait_for_completion(&requests_done); + return 0; } pr_debug("EINVAL: io_destroy: invalid context id\n"); -- cgit v1.2.3 From 22213318af7ae265bc6cd8aef2febbc2d69a2440 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Apr 2014 12:30:58 -0400 Subject: fix races between __d_instantiate() and checks of dentry flags in non-lazy walk we need to be careful about dentry switching from negative to positive - both ->d_flags and ->d_inode are updated, and in some places we might see only one store. The cases where dentry has been obtained by dcache lookup with ->i_mutex held on parent are safe - ->d_lock and ->i_mutex provide all the barriers we need. However, there are several places where we run into trouble: * do_last() fetches ->d_inode, then checks ->d_flags and assumes that inode won't be NULL unless d_is_negative() is true. Race with e.g. creat() - we might have fetched the old value of ->d_inode (still NULL) and new value of ->d_flags (already not DCACHE_MISS_TYPE). Lin Ming has observed and reported the resulting oops. * a bunch of places checks ->d_inode for being non-NULL, then checks ->d_flags for "is it a symlink". Race with symlink(2) in case if our CPU sees ->d_inode update first - we see non-NULL there, but ->d_flags still contains DCACHE_MISS_TYPE instead of DCACHE_SYMLINK_TYPE. Result: false negative on "should we follow link here?", with subsequent unpleasantness. Cc: stable@vger.kernel.org # 3.13 and 3.14 need that one Reported-and-tested-by: Lin Ming Signed-off-by: Al Viro --- fs/dcache.c | 3 +-- fs/namei.c | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 40707d88a945..494a9def5dce 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1647,8 +1647,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) unsigned add_flags = d_flags_for_inode(inode); spin_lock(&dentry->d_lock); - dentry->d_flags &= ~DCACHE_ENTRY_TYPE; - dentry->d_flags |= add_flags; + __d_set_type(dentry, add_flags); if (inode) hlist_add_head(&dentry->d_alias, &inode->i_dentry); dentry->d_inode = inode; diff --git a/fs/namei.c b/fs/namei.c index c6157c894fce..80168273396b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1542,7 +1542,7 @@ static inline int walk_component(struct nameidata *nd, struct path *path, inode = path->dentry->d_inode; } err = -ENOENT; - if (!inode) + if (!inode || d_is_negative(path->dentry)) goto out_path_put; if (should_follow_link(path->dentry, follow)) { @@ -2249,7 +2249,7 @@ mountpoint_last(struct nameidata *nd, struct path *path) mutex_unlock(&dir->d_inode->i_mutex); done: - if (!dentry->d_inode) { + if (!dentry->d_inode || d_is_negative(dentry)) { error = -ENOENT; dput(dentry); goto out; @@ -2994,7 +2994,7 @@ retry_lookup: finish_lookup: /* we _can_ be in RCU mode here */ error = -ENOENT; - if (d_is_negative(path->dentry)) { + if (!inode || d_is_negative(path->dentry)) { path_to_nameidata(path, nd); goto out; } -- cgit v1.2.3 From a87c9ad956676d84d459739fc14ec5a3c3565717 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 26 Mar 2014 07:24:23 -0700 Subject: cifs: fix actimeo=0 corner case when cifs_i->time == jiffies actimeo=0 is supposed to be a special case that ensures that inode attributes are always refetched from the server instead of trusting the cache. The cifs code however uses time_in_range() to determine whether the attributes have timed out. In the case where cifs_i->time equals jiffies, this leads to the cifs code not refetching the inode attributes when it should. Fix this by explicitly testing for actimeo=0, and handling it as a special case. Reported-and-tested-by: Tetsuo Handa Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/inode.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index aadc2b68678b..a22d667f1069 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1737,6 +1737,9 @@ cifs_inode_needs_reval(struct inode *inode) if (cifs_i->time == 0) return true; + if (!cifs_sb->actimeo) + return true; + if (!time_in_range(jiffies, cifs_i->time, cifs_i->time + cifs_sb->actimeo)) return true; -- cgit v1.2.3 From 7736e8cc51bbfdbd538c1870c314eb3483fe04ed Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 23 Apr 2014 18:14:42 +0200 Subject: fuse: add __exit to fuse_ctl_cleanup fuse_ctl_cleanup is only called by __exit fuse_exit Signed-off-by: Fabian Frederick Signed-off-by: Miklos Szeredi --- fs/fuse/control.c | 2 +- fs/fuse/fuse_i.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fuse/control.c b/fs/fuse/control.c index a0b0855d00a9..205e0d5d5307 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -348,7 +348,7 @@ int __init fuse_ctl_init(void) return register_filesystem(&fuse_ctl_fs_type); } -void fuse_ctl_cleanup(void) +void __exit fuse_ctl_cleanup(void) { unregister_filesystem(&fuse_ctl_fs_type); } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index a257ed8ebee6..adfa2d505c1a 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -725,7 +725,7 @@ int fuse_dev_init(void); void fuse_dev_cleanup(void); int fuse_ctl_init(void); -void fuse_ctl_cleanup(void); +void __exit fuse_ctl_cleanup(void); /** * Allocate a request -- cgit v1.2.3 From 4adb83029de8ef5144a14dbb5c21de0f156c1a03 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 28 Apr 2014 14:19:21 +0200 Subject: fuse: check fallocate mode Don't allow new fallocate modes until we figure out what (if anything) that takes. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 13f8bdec5110..982288a1c6ab 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2972,6 +2972,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || (mode & FALLOC_FL_PUNCH_HOLE); + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + return -EOPNOTSUPP; + if (fc->no_fallocate) return -EOPNOTSUPP; -- cgit v1.2.3 From aeb4eb6b55261f44d3705f0a3b9e4906bfa5e416 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 28 Apr 2014 14:19:21 +0200 Subject: fuse: fix mtime update error in fsync Bad case of shadowing. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 982288a1c6ab..7ae1e6972caa 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -501,7 +501,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, fuse_sync_writes(inode); if (test_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state)) { - int err = fuse_flush_mtime(file, false); + err = fuse_flush_mtime(file, false); if (err) goto out; } -- cgit v1.2.3 From d31433c8b06d44e27f7637574137dc4b5e6fd1d1 Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Mon, 28 Apr 2014 14:19:21 +0200 Subject: fuse: do not use uninitialized i_mode When inode is in I_NEW state, inode->i_mode is not initialized yet. Do not use it before fuse_init_inode() is called. Signed-off-by: Maxim Patlasov Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 8d611696fcad..299e553fcdfd 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -303,7 +303,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, if ((inode->i_state & I_NEW)) { inode->i_flags |= S_NOATIME; - if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) + if (!fc->writeback_cache || !S_ISREG(attr->mode)) inode->i_flags |= S_NOCMTIME; inode->i_generation = generation; inode->i_data.backing_dev_info = &fc->bdi; -- cgit v1.2.3 From 009dd694e82098a703b8b5c8dd9f54c131dbb9b3 Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Mon, 28 Apr 2014 14:19:22 +0200 Subject: fuse: update mtime on truncate(2) Handling truncate(2), VFS doesn't set ATTR_MTIME bit in iattr structure; only ATTR_SIZE bit is set. In-kernel fuse must handle the case by setting mtime fields of struct fuse_setattr_in to "now" and set FATTR_MTIME bit even though ATTR_MTIME was not set. Signed-off-by: Maxim Patlasov Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 5b4e035b364c..5e361b122526 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1678,6 +1678,8 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, if (is_truncate) { fuse_set_nowrite(inode); set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); + if (trust_local_mtime && attr->ia_size != inode->i_size) + attr->ia_valid |= ATTR_MTIME; } memset(&inarg, 0, sizeof(inarg)); -- cgit v1.2.3 From 75caeecdf9c7151af5f7d972e2dabbff1bef30a7 Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Mon, 28 Apr 2014 14:19:22 +0200 Subject: fuse: update mtime on open(O_TRUNC) in atomic_o_trunc mode In case of fc->atomic_o_trunc is set, fuse does nothing in fuse_do_setattr() while handling open(O_TRUNC). Hence, i_mtime must be updated explicitly in fuse_finish_open(). The patch also adds extra locking encompassing open(O_TRUNC) operation to avoid races between the truncation and updating i_mtime. Signed-off-by: Maxim Patlasov Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 7ae1e6972caa..e68d8c3f063a 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -223,6 +223,8 @@ void fuse_finish_open(struct inode *inode, struct file *file) i_size_write(inode, 0); spin_unlock(&fc->lock); fuse_invalidate_attr(inode); + if (fc->writeback_cache) + file_update_time(file); } if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache) fuse_link_write_file(file); @@ -232,18 +234,26 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) { struct fuse_conn *fc = get_fuse_conn(inode); int err; + bool lock_inode = (file->f_flags & O_TRUNC) && + fc->atomic_o_trunc && + fc->writeback_cache; err = generic_file_open(inode, file); if (err) return err; + if (lock_inode) + mutex_lock(&inode->i_mutex); + err = fuse_do_open(fc, get_node_id(inode), file, isdir); - if (err) - return err; - fuse_finish_open(inode, file); + if (!err) + fuse_finish_open(inode, file); - return 0; + if (lock_inode) + mutex_unlock(&inode->i_mutex); + + return err; } static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode) -- cgit v1.2.3 From 93d2269d2ffb871fdfc5555cb5d4a7c0fc56e7fe Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 28 Apr 2014 14:19:22 +0200 Subject: fuse: fuse: fallocate: use file_update_time() in preparation for getting rid of FUSE_I_MTIME_DIRTY. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e68d8c3f063a..3391f6a840bd 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -3030,12 +3030,8 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE)) { bool changed = fuse_write_update_size(inode, offset + length); - if (changed && fc->writeback_cache) { - struct fuse_inode *fi = get_fuse_inode(inode); - - inode->i_mtime = current_fs_time(inode->i_sb); - set_bit(FUSE_I_MTIME_DIRTY, &fi->state); - } + if (changed && fc->writeback_cache) + file_update_time(file); } if (mode & FALLOC_FL_PUNCH_HOLE) -- cgit v1.2.3 From 22401e7b7a686bff02549cd648ba6f73f8dba868 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 28 Apr 2014 14:19:23 +0200 Subject: fuse: clean up fsync Don't need to start I/O twice (once without i_mutex and one within). Also make sure that even if the userspace filesystem doesn't support FSYNC we do all the steps other than sending the message. Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 3391f6a840bd..65586a567d0d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -490,13 +490,6 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, if (is_bad_inode(inode)) return -EIO; - err = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (err) - return err; - - if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) - return 0; - mutex_lock(&inode->i_mutex); /* @@ -504,7 +497,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, * wait for all outstanding writes, before sending the FSYNC * request. */ - err = write_inode_now(inode, 0); + err = filemap_write_and_wait_range(inode->i_mapping, start, end); if (err) goto out; @@ -515,6 +508,8 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, if (err) goto out; } + if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) + goto out; req = fuse_get_req_nopages(fc); if (IS_ERR(req)) { -- cgit v1.2.3 From 1e18bda86e2dcc4ecb176213ee34649c93ad1396 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 28 Apr 2014 14:19:23 +0200 Subject: fuse: add .write_inode ...and flush mtime from this. This allows us to use the kernel infrastructure for writing out dirty metadata (mtime at this point, but ctime in the next patches and also maybe atime). Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 28 +++++++++++----------------- fs/fuse/file.c | 44 +++++++++++++++++++++++++++++++------------- fs/fuse/fuse_i.h | 5 ++--- fs/fuse/inode.c | 1 + 4 files changed, 45 insertions(+), 33 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 5e361b122526..8c233834591f 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1597,23 +1597,17 @@ static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_req *req, /* * Flush inode->i_mtime to the server */ -int fuse_flush_mtime(struct file *file, bool nofail) +int fuse_flush_mtime(struct inode *inode, struct fuse_file *ff) { - struct inode *inode = file->f_mapping->host; - struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = NULL; + struct fuse_req *req; struct fuse_setattr_in inarg; struct fuse_attr_out outarg; int err; - if (nofail) { - req = fuse_get_req_nofail_nopages(fc, file); - } else { - req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); - } + req = fuse_get_req_nopages(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); @@ -1621,15 +1615,15 @@ int fuse_flush_mtime(struct file *file, bool nofail) inarg.valid |= FATTR_MTIME; inarg.mtime = inode->i_mtime.tv_sec; inarg.mtimensec = inode->i_mtime.tv_nsec; - + if (ff) { + inarg.valid |= FATTR_FH; + inarg.fh = ff->fh; + } fuse_setattr_fill(fc, req, inode, &inarg, &outarg); fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); - if (!err) - clear_bit(FUSE_I_MTIME_DIRTY, &fi->state); - return err; } @@ -1715,7 +1709,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, /* the kernel maintains i_mtime locally */ if (trust_local_mtime && (attr->ia_valid & ATTR_MTIME)) { inode->i_mtime = attr->ia_mtime; - clear_bit(FUSE_I_MTIME_DIRTY, &fi->state); + /* FIXME: clear I_DIRTY_SYNC? */ } fuse_change_attributes_common(inode, &outarg.attr, @@ -1953,7 +1947,7 @@ static int fuse_update_time(struct inode *inode, struct timespec *now, { if (flags & S_MTIME) { inode->i_mtime = *now; - set_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state); + mark_inode_dirty_sync(inode); BUG_ON(!S_ISREG(inode->i_mode)); } return 0; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 65586a567d0d..d228c3962ffd 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -324,10 +324,7 @@ static int fuse_release(struct inode *inode, struct file *file) /* see fuse_vma_close() for !writeback_cache case */ if (fc->writeback_cache) - filemap_write_and_wait(file->f_mapping); - - if (test_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state)) - fuse_flush_mtime(file, true); + write_inode_now(inode, 1); fuse_release_common(file, FUSE_RELEASE); @@ -449,7 +446,7 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (fc->no_flush) return 0; - err = filemap_write_and_wait(file->f_mapping); + err = write_inode_now(inode, 1); if (err) return err; @@ -502,12 +499,10 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, goto out; fuse_sync_writes(inode); + err = sync_inode_metadata(inode, 1); + if (err) + goto out; - if (test_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state)) { - err = fuse_flush_mtime(file, false); - if (err) - goto out; - } if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) goto out; @@ -1664,13 +1659,13 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) fuse_writepage_free(fc, req); } -static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, - struct fuse_inode *fi) +static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc, + struct fuse_inode *fi) { struct fuse_file *ff = NULL; spin_lock(&fc->lock); - if (!WARN_ON(list_empty(&fi->write_files))) { + if (!list_empty(&fi->write_files)) { ff = list_entry(fi->write_files.next, struct fuse_file, write_entry); fuse_file_get(ff); @@ -1680,6 +1675,29 @@ static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, return ff; } +static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc, + struct fuse_inode *fi) +{ + struct fuse_file *ff = __fuse_write_file_get(fc, fi); + WARN_ON(!ff); + return ff; +} + +int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_file *ff; + int err; + + ff = __fuse_write_file_get(fc, fi); + err = fuse_flush_mtime(inode, ff); + if (ff) + fuse_file_put(ff, 0); + + return err; +} + static int fuse_writepage_locked(struct page *page) { struct address_space *mapping = page->mapping; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index adfa2d505c1a..d2f10054b9a1 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -119,8 +119,6 @@ enum { FUSE_I_INIT_RDPLUS, /** An operation changing file size is in progress */ FUSE_I_SIZE_UNSTABLE, - /** i_mtime has been updated locally; a flush to userspace needed */ - FUSE_I_MTIME_DIRTY, }; struct fuse_conn; @@ -891,7 +889,8 @@ int fuse_dev_release(struct inode *inode, struct file *file); bool fuse_write_update_size(struct inode *inode, loff_t pos); -int fuse_flush_mtime(struct file *file, bool nofail); +int fuse_flush_mtime(struct inode *inode, struct fuse_file *ff); +int fuse_write_inode(struct inode *inode, struct writeback_control *wbc); int fuse_do_setattr(struct inode *inode, struct iattr *attr, struct file *file); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 299e553fcdfd..5997e4940512 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -788,6 +788,7 @@ static const struct super_operations fuse_super_operations = { .alloc_inode = fuse_alloc_inode, .destroy_inode = fuse_destroy_inode, .evict_inode = fuse_evict_inode, + .write_inode = fuse_write_inode, .drop_inode = generic_delete_inode, .remount_fs = fuse_remount_fs, .put_super = fuse_put_super, -- cgit v1.2.3 From e27c9d3877a0d0479711a55f5cdd7ee91442da53 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 28 Apr 2014 14:19:23 +0200 Subject: fuse: fuse: add time_gran to INIT_OUT Allow userspace fs to specify time granularity. This is needed because with writeback_cache mode the kernel is responsible for generating mtime and ctime, but if the underlying filesystem doesn't support nanosecond granularity then the cache will contain a different value from the one stored on the filesystem resulting in a change of times after a cache flush. Make the default granularity 1s. Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 5997e4940512..560eafcdd6a7 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -891,6 +891,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->async_dio = 1; if (arg->flags & FUSE_WRITEBACK_CACHE) fc->writeback_cache = 1; + if (arg->time_gran && arg->time_gran <= 1000000000) + fc->sb->s_time_gran = arg->time_gran; + else + fc->sb->s_time_gran = 1000000000; + } else { ra_pages = fc->max_read / PAGE_CACHE_SIZE; fc->no_lock = 1; -- cgit v1.2.3 From ab9e13f7c771b511d8f71666e83cb27bcc635b98 Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Mon, 28 Apr 2014 14:19:24 +0200 Subject: fuse: allow ctime flushing to userspace The patch extends fuse_setattr_in, and extends the flush procedure (fuse_flush_times()) called on ->write_inode() to send the ctime as well as mtime. Signed-off-by: Maxim Patlasov Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 9 +++++++-- fs/fuse/file.c | 2 +- fs/fuse/fuse_i.h | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 8c233834591f..e6ba579e2937 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1597,7 +1597,7 @@ static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_req *req, /* * Flush inode->i_mtime to the server */ -int fuse_flush_mtime(struct inode *inode, struct fuse_file *ff) +int fuse_flush_times(struct inode *inode, struct fuse_file *ff) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_req *req; @@ -1612,9 +1612,14 @@ int fuse_flush_mtime(struct inode *inode, struct fuse_file *ff) memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); - inarg.valid |= FATTR_MTIME; + inarg.valid = FATTR_MTIME; inarg.mtime = inode->i_mtime.tv_sec; inarg.mtimensec = inode->i_mtime.tv_nsec; + if (fc->minor >= 23) { + inarg.valid |= FATTR_CTIME; + inarg.ctime = inode->i_ctime.tv_sec; + inarg.ctimensec = inode->i_ctime.tv_nsec; + } if (ff) { inarg.valid |= FATTR_FH; inarg.fh = ff->fh; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d228c3962ffd..96d513e01a5d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1691,7 +1691,7 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) int err; ff = __fuse_write_file_get(fc, fi); - err = fuse_flush_mtime(inode, ff); + err = fuse_flush_times(inode, ff); if (ff) fuse_file_put(ff, 0); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index d2f10054b9a1..40677e33504f 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -889,7 +889,7 @@ int fuse_dev_release(struct inode *inode, struct file *file); bool fuse_write_update_size(struct inode *inode, loff_t pos); -int fuse_flush_mtime(struct inode *inode, struct fuse_file *ff); +int fuse_flush_times(struct inode *inode, struct fuse_file *ff); int fuse_write_inode(struct inode *inode, struct writeback_control *wbc); int fuse_do_setattr(struct inode *inode, struct iattr *attr, -- cgit v1.2.3 From 8b47e73e91c064cd75e8bf458ce738e1bfe2e700 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 28 Apr 2014 14:19:24 +0200 Subject: fuse: remove .update_time This implements updating ctime as well as mtime on file_update_time(). Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index e6ba579e2937..65357bd1d17b 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1947,17 +1947,6 @@ static int fuse_removexattr(struct dentry *entry, const char *name) return err; } -static int fuse_update_time(struct inode *inode, struct timespec *now, - int flags) -{ - if (flags & S_MTIME) { - inode->i_mtime = *now; - mark_inode_dirty_sync(inode); - BUG_ON(!S_ISREG(inode->i_mode)); - } - return 0; -} - static const struct inode_operations fuse_dir_inode_operations = { .lookup = fuse_lookup, .mkdir = fuse_mkdir, @@ -1997,7 +1986,6 @@ static const struct inode_operations fuse_common_inode_operations = { .getxattr = fuse_getxattr, .listxattr = fuse_listxattr, .removexattr = fuse_removexattr, - .update_time = fuse_update_time, }; static const struct inode_operations fuse_symlink_inode_operations = { -- cgit v1.2.3 From 31f3267b4ba16b12fb9dd3b1953ea0f221cc2ab4 Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Mon, 28 Apr 2014 14:19:24 +0200 Subject: fuse: trust kernel i_ctime only Let the kernel maintain i_ctime locally: update i_ctime explicitly on truncate, fallocate, open(O_TRUNC), setxattr, removexattr, link, rename, unlink. The inode flag I_DIRTY_SYNC serves as indication that local i_ctime should be flushed to the server eventually. The patch sets the flag and updates i_ctime in course of operations listed above. Signed-off-by: Maxim Patlasov Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 22 ++++++++++++++++++++-- fs/fuse/inode.c | 6 ++++-- 2 files changed, 24 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 65357bd1d17b..f62ab8eedb5f 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -679,6 +679,14 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, return create_new_entry(fc, req, dir, entry, S_IFLNK); } +static inline void fuse_update_ctime(struct inode *inode) +{ + if (!IS_NOCMTIME(inode)) { + inode->i_ctime = current_fs_time(inode->i_sb); + mark_inode_dirty_sync(inode); + } +} + static int fuse_unlink(struct inode *dir, struct dentry *entry) { int err; @@ -713,6 +721,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) fuse_invalidate_attr(inode); fuse_invalidate_attr(dir); fuse_invalidate_entry_cache(entry); + fuse_update_ctime(inode); } else if (err == -EINTR) fuse_invalidate_entry(entry); return err; @@ -771,6 +780,7 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, if (!err) { /* ctime changes */ fuse_invalidate_attr(oldent->d_inode); + fuse_update_ctime(oldent->d_inode); fuse_invalidate_attr(olddir); if (olddir != newdir) @@ -780,6 +790,7 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, if (newent->d_inode) { fuse_invalidate_attr(newent->d_inode); fuse_invalidate_entry_cache(newent); + fuse_update_ctime(newent->d_inode); } } else if (err == -EINTR) { /* If request was interrupted, DEITY only knows if the @@ -829,6 +840,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, inc_nlink(inode); spin_unlock(&fc->lock); fuse_invalidate_attr(inode); + fuse_update_ctime(inode); } else if (err == -EINTR) { fuse_invalidate_attr(inode); } @@ -846,6 +858,8 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, attr->size = i_size_read(inode); attr->mtime = inode->i_mtime.tv_sec; attr->mtimensec = inode->i_mtime.tv_nsec; + attr->ctime = inode->i_ctime.tv_sec; + attr->ctimensec = inode->i_ctime.tv_nsec; } stat->dev = inode->i_sb->s_dev; @@ -1811,8 +1825,10 @@ static int fuse_setxattr(struct dentry *entry, const char *name, fc->no_setxattr = 1; err = -EOPNOTSUPP; } - if (!err) + if (!err) { fuse_invalidate_attr(inode); + fuse_update_ctime(inode); + } return err; } @@ -1942,8 +1958,10 @@ static int fuse_removexattr(struct dentry *entry, const char *name) fc->no_removexattr = 1; err = -EOPNOTSUPP; } - if (!err) + if (!err) { fuse_invalidate_attr(inode); + fuse_update_ctime(inode); + } return err; } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 560eafcdd6a7..e9ecb1878109 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -175,9 +175,9 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) { inode->i_mtime.tv_sec = attr->mtime; inode->i_mtime.tv_nsec = attr->mtimensec; + inode->i_ctime.tv_sec = attr->ctime; + inode->i_ctime.tv_nsec = attr->ctimensec; } - inode->i_ctime.tv_sec = attr->ctime; - inode->i_ctime.tv_nsec = attr->ctimensec; if (attr->blksize != 0) inode->i_blkbits = ilog2(attr->blksize); @@ -256,6 +256,8 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) inode->i_size = attr->size; inode->i_mtime.tv_sec = attr->mtime; inode->i_mtime.tv_nsec = attr->mtimensec; + inode->i_ctime.tv_sec = attr->ctime; + inode->i_ctime.tv_nsec = attr->ctimensec; if (S_ISREG(inode->i_mode)) { fuse_init_common(inode); fuse_init_file_inode(inode); -- cgit v1.2.3 From 3ad22c62dd23ad26c8737c300f455de60ba01f40 Mon Sep 17 00:00:00 2001 From: Maxim Patlasov Date: Mon, 28 Apr 2014 14:19:25 +0200 Subject: fuse: clear FUSE_I_CTIME_DIRTY flag on setattr The patch addresses two use-cases when the flag may be safely cleared: 1. fuse_do_setattr() is called with ATTR_CTIME flag set in attr->ia_valid. In this case attr->ia_ctime bears actual value. In-kernel fuse must send it to the userspace server and then assign the value to inode->i_ctime. 2. fuse_do_setattr() is called with ATTR_SIZE flag set in attr->ia_valid, whereas ATTR_CTIME is not set (truncate(2)). In this case in-kernel fuse must sent "now" to the userspace server and then assign the value to inode->i_ctime. In both cases we could clear I_DIRTY_SYNC, but that needs more thought. Signed-off-by: Maxim Patlasov Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index f62ab8eedb5f..843dcf1222eb 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1518,7 +1518,7 @@ static bool update_mtime(unsigned ivalid, bool trust_local_mtime) } static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg, - bool trust_local_mtime) + bool trust_local_cmtime) { unsigned ivalid = iattr->ia_valid; @@ -1537,13 +1537,18 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg, if (!(ivalid & ATTR_ATIME_SET)) arg->valid |= FATTR_ATIME_NOW; } - if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_mtime)) { + if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_cmtime)) { arg->valid |= FATTR_MTIME; arg->mtime = iattr->ia_mtime.tv_sec; arg->mtimensec = iattr->ia_mtime.tv_nsec; - if (!(ivalid & ATTR_MTIME_SET) && !trust_local_mtime) + if (!(ivalid & ATTR_MTIME_SET) && !trust_local_cmtime) arg->valid |= FATTR_MTIME_NOW; } + if ((ivalid & ATTR_CTIME) && trust_local_cmtime) { + arg->valid |= FATTR_CTIME; + arg->ctime = iattr->ia_ctime.tv_sec; + arg->ctimensec = iattr->ia_ctime.tv_nsec; + } } /* @@ -1666,7 +1671,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, bool is_wb = fc->writeback_cache; loff_t oldsize; int err; - bool trust_local_mtime = is_wb && S_ISREG(inode->i_mode); + bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode); if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) attr->ia_valid |= ATTR_FORCE; @@ -1691,13 +1696,13 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, if (is_truncate) { fuse_set_nowrite(inode); set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); - if (trust_local_mtime && attr->ia_size != inode->i_size) - attr->ia_valid |= ATTR_MTIME; + if (trust_local_cmtime && attr->ia_size != inode->i_size) + attr->ia_valid |= ATTR_MTIME | ATTR_CTIME; } memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); - iattr_to_fattr(attr, &inarg, trust_local_mtime); + iattr_to_fattr(attr, &inarg, trust_local_cmtime); if (file) { struct fuse_file *ff = file->private_data; inarg.valid |= FATTR_FH; @@ -1726,8 +1731,11 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr, spin_lock(&fc->lock); /* the kernel maintains i_mtime locally */ - if (trust_local_mtime && (attr->ia_valid & ATTR_MTIME)) { - inode->i_mtime = attr->ia_mtime; + if (trust_local_cmtime) { + if (attr->ia_valid & ATTR_MTIME) + inode->i_mtime = attr->ia_mtime; + if (attr->ia_valid & ATTR_CTIME) + inode->i_ctime = attr->ia_ctime; /* FIXME: clear I_DIRTY_SYNC? */ } -- cgit v1.2.3 From 4ace1f85a7cdab5453c2e12029ff978dd4cd6155 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 28 Apr 2014 14:19:25 +0200 Subject: fuse: clear MS_I_VERSION Fuse doesn't support i_version (yet). Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index e9ecb1878109..754dcf23de8a 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1004,7 +1004,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_flags & MS_MANDLOCK) goto err; - sb->s_flags &= ~MS_NOSEC; + sb->s_flags &= ~(MS_NOSEC | MS_I_VERSION); if (!parse_fuse_opt((char *) data, &d, is_bdev)) goto err; -- cgit v1.2.3 From 1560c974dcd40a8d3f193283acd7cc6aee13dc13 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 28 Apr 2014 16:43:44 +0200 Subject: fuse: add renameat2 support Support RENAME_EXCHANGE and RENAME_NOREPLACE flags on the userspace ABI. Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++-------- fs/fuse/fuse_i.h | 3 +++ 2 files changed, 50 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 843dcf1222eb..42198359fa1b 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -752,23 +752,26 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) return err; } -static int fuse_rename(struct inode *olddir, struct dentry *oldent, - struct inode *newdir, struct dentry *newent) +static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags, int opcode, size_t argsize) { int err; - struct fuse_rename_in inarg; + struct fuse_rename2_in inarg; struct fuse_conn *fc = get_fuse_conn(olddir); - struct fuse_req *req = fuse_get_req_nopages(fc); + struct fuse_req *req; + req = fuse_get_req_nopages(fc); if (IS_ERR(req)) return PTR_ERR(req); - memset(&inarg, 0, sizeof(inarg)); + memset(&inarg, 0, argsize); inarg.newdir = get_node_id(newdir); - req->in.h.opcode = FUSE_RENAME; + inarg.flags = flags; + req->in.h.opcode = opcode; req->in.h.nodeid = get_node_id(olddir); req->in.numargs = 3; - req->in.args[0].size = sizeof(inarg); + req->in.args[0].size = argsize; req->in.args[0].value = &inarg; req->in.args[1].size = oldent->d_name.len + 1; req->in.args[1].value = oldent->d_name.name; @@ -782,12 +785,17 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, fuse_invalidate_attr(oldent->d_inode); fuse_update_ctime(oldent->d_inode); + if (flags & RENAME_EXCHANGE) { + fuse_invalidate_attr(newent->d_inode); + fuse_update_ctime(newent->d_inode); + } + fuse_invalidate_attr(olddir); if (olddir != newdir) fuse_invalidate_attr(newdir); /* newent will end up negative */ - if (newent->d_inode) { + if (!(flags & RENAME_EXCHANGE) && newent->d_inode) { fuse_invalidate_attr(newent->d_inode); fuse_invalidate_entry_cache(newent); fuse_update_ctime(newent->d_inode); @@ -806,6 +814,36 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, return err; } +static int fuse_rename(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent) +{ + return fuse_rename_common(olddir, oldent, newdir, newent, 0, + FUSE_RENAME, sizeof(struct fuse_rename_in)); +} + +static int fuse_rename2(struct inode *olddir, struct dentry *oldent, + struct inode *newdir, struct dentry *newent, + unsigned int flags) +{ + struct fuse_conn *fc = get_fuse_conn(olddir); + int err; + + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + if (fc->no_rename2 || fc->minor < 23) + return -EINVAL; + + err = fuse_rename_common(olddir, oldent, newdir, newent, flags, + FUSE_RENAME2, sizeof(struct fuse_rename2_in)); + if (err == -ENOSYS) { + fc->no_rename2 = 1; + err = -EINVAL; + } + return err; + +} + static int fuse_link(struct dentry *entry, struct inode *newdir, struct dentry *newent) { @@ -1980,6 +2018,7 @@ static const struct inode_operations fuse_dir_inode_operations = { .unlink = fuse_unlink, .rmdir = fuse_rmdir, .rename = fuse_rename, + .rename2 = fuse_rename2, .link = fuse_link, .setattr = fuse_setattr, .create = fuse_create, diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 40677e33504f..7aa5c75e0de1 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -542,6 +542,9 @@ struct fuse_conn { /** Is fallocate not implemented by fs? */ unsigned no_fallocate:1; + /** Is rename with flags implemented by fs? */ + unsigned no_rename2:1; + /** Use enhanced/automatic page cache invalidation. */ unsigned auto_inval_data:1; -- cgit v1.2.3 From 0081bd83c089ef3d0c9a4e4e869e2ab75f2cb379 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 8 Apr 2014 21:42:59 +0800 Subject: ceph: check directory's completeness before emitting directory entry Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/dir.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 766410a12c2c..8c7f90b96913 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -182,9 +182,16 @@ more: spin_unlock(&dentry->d_lock); spin_unlock(&parent->d_lock); + /* make sure a dentry wasn't dropped while we didn't have parent lock */ + if (!ceph_dir_is_complete(dir)) { + dout(" lost dir complete on %p; falling back to mds\n", dir); + dput(dentry); + err = -EAGAIN; + goto out; + } + dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos, dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode); - ctx->pos = di->offset; if (!dir_emit(ctx, dentry->d_name.name, dentry->d_name.len, ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino), @@ -198,19 +205,12 @@ more: return 0; } + ctx->pos = di->offset + 1; + if (last) dput(last); last = dentry; - ctx->pos++; - - /* make sure a dentry wasn't dropped while we didn't have parent lock */ - if (!ceph_dir_is_complete(dir)) { - dout(" lost dir complete on %p; falling back to mds\n", dir); - err = -EAGAIN; - goto out; - } - spin_lock(&parent->d_lock); p = p->prev; /* advance to next dentry */ goto more; @@ -296,6 +296,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) err = __dcache_readdir(file, ctx, shared_gen); if (err != -EAGAIN) return err; + frag = fpos_frag(ctx->pos); + off = fpos_off(ctx->pos); } else { spin_unlock(&ci->i_ceph_lock); } -- cgit v1.2.3 From 6da5246dd4b077ab229481ca342802f7fdcdab59 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Wed, 9 Apr 2014 09:35:19 +0800 Subject: ceph: use fpos_cmp() to compare dentry positions Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 8c7f90b96913..fb4f7a203eda 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -141,7 +141,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, /* start at beginning? */ if (ctx->pos == 2 || last == NULL || - ctx->pos < ceph_dentry(last)->offset) { + fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) { if (list_empty(&parent->d_subdirs)) goto out_unlock; p = parent->d_subdirs.prev; -- cgit v1.2.3 From 0a8a70f96fe1bd3e07c15bb86fd247e76102398a Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 14 Apr 2014 13:13:02 +0800 Subject: ceph: clear directory's completeness when creating file When creating a file, ceph_set_dentry_offset() puts the new dentry at the end of directory's d_subdirs, then set the dentry's offset based on directory's max offset. The offset does not reflect the real postion of the dentry in directory. Later readdir reply from MDS may change the dentry's position/offset. This inconsistency can cause missing/duplicate entries in readdir result if readdir is partly satisfied by dcache_readdir(). The fix is clear directory's completeness after creating/renaming file. It prevents later readdir from using dcache_readdir(). Fixes: http://tracker.ceph.com/issues/8025 Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/dir.c | 9 ++++---- fs/ceph/inode.c | 71 +++++++++++++-------------------------------------------- fs/ceph/super.h | 1 - 3 files changed, 21 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index fb4f7a203eda..c29d6ae68874 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -448,7 +448,6 @@ more: if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { dout(" marking %p complete\n", inode); __ceph_dir_set_complete(ci, fi->dir_release_count); - ci->i_max_offset = ctx->pos; } spin_unlock(&ci->i_ceph_lock); @@ -937,14 +936,16 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, * to do it here. */ - /* d_move screws up d_subdirs order */ - ceph_dir_clear_complete(new_dir); - d_move(old_dentry, new_dentry); /* ensure target dentry is invalidated, despite rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(new_dentry); + + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_complete(old_dir); + ceph_dir_clear_complete(new_dir); + } ceph_mdsc_put_request(req); return err; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 0b0728e5be2d..233c6f96910a 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -744,7 +744,6 @@ static int fill_inode(struct inode *inode, !__ceph_dir_is_complete(ci)) { dout(" marking %p complete (empty)\n", inode); __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count)); - ci->i_max_offset = 2; } no_change: /* only update max_size on auth cap */ @@ -889,41 +888,6 @@ out_unlock: return; } -/* - * Set dentry's directory position based on the current dir's max, and - * order it in d_subdirs, so that dcache_readdir behaves. - * - * Always called under directory's i_mutex. - */ -static void ceph_set_dentry_offset(struct dentry *dn) -{ - struct dentry *dir = dn->d_parent; - struct inode *inode = dir->d_inode; - struct ceph_inode_info *ci; - struct ceph_dentry_info *di; - - BUG_ON(!inode); - - ci = ceph_inode(inode); - di = ceph_dentry(dn); - - spin_lock(&ci->i_ceph_lock); - if (!__ceph_dir_is_complete(ci)) { - spin_unlock(&ci->i_ceph_lock); - return; - } - di->offset = ceph_inode(inode)->i_max_offset++; - spin_unlock(&ci->i_ceph_lock); - - spin_lock(&dir->d_lock); - spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); - list_move(&dn->d_u.d_child, &dir->d_subdirs); - dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, - dn->d_u.d_child.prev, dn->d_u.d_child.next); - spin_unlock(&dn->d_lock); - spin_unlock(&dir->d_lock); -} - /* * splice a dentry to an inode. * caller must hold directory i_mutex for this to be safe. @@ -933,7 +897,7 @@ static void ceph_set_dentry_offset(struct dentry *dn) * the caller) if we fail. */ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, - bool *prehash, bool set_offset) + bool *prehash) { struct dentry *realdn; @@ -965,8 +929,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, } if ((!prehash || *prehash) && d_unhashed(dn)) d_rehash(dn); - if (set_offset) - ceph_set_dentry_offset(dn); out: return dn; } @@ -987,7 +949,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, { struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; struct inode *in = NULL; - struct ceph_mds_reply_inode *ininfo; struct ceph_vino vino; struct ceph_fs_client *fsc = ceph_sb_to_client(sb); int err = 0; @@ -1161,6 +1122,9 @@ retry_lookup: /* rename? */ if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) { + struct inode *olddir = req->r_old_dentry_dir; + BUG_ON(!olddir); + dout(" src %p '%.*s' dst %p '%.*s'\n", req->r_old_dentry, req->r_old_dentry->d_name.len, @@ -1180,13 +1144,10 @@ retry_lookup: rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(dn); - /* - * d_move() puts the renamed dentry at the end of - * d_subdirs. We need to assign it an appropriate - * directory offset so we can behave when dir is - * complete. - */ - ceph_set_dentry_offset(req->r_old_dentry); + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_complete(dir); + ceph_dir_clear_complete(olddir); + dout("dn %p gets new offset %lld\n", req->r_old_dentry, ceph_dentry(req->r_old_dentry)->offset); @@ -1213,8 +1174,9 @@ retry_lookup: /* attach proper inode */ if (!dn->d_inode) { + ceph_dir_clear_complete(dir); ihold(in); - dn = splice_dentry(dn, in, &have_lease, true); + dn = splice_dentry(dn, in, &have_lease); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; @@ -1235,17 +1197,16 @@ retry_lookup: (req->r_op == CEPH_MDS_OP_LOOKUPSNAP || req->r_op == CEPH_MDS_OP_MKSNAP)) { struct dentry *dn = req->r_dentry; + struct inode *dir = req->r_locked_dir; /* fill out a snapdir LOOKUPSNAP dentry */ BUG_ON(!dn); - BUG_ON(!req->r_locked_dir); - BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR); - ininfo = rinfo->targeti.in; - vino.ino = le64_to_cpu(ininfo->ino); - vino.snap = le64_to_cpu(ininfo->snapid); + BUG_ON(!dir); + BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR); dout(" linking snapped dir %p to dn %p\n", in, dn); + ceph_dir_clear_complete(dir); ihold(in); - dn = splice_dentry(dn, in, NULL, true); + dn = splice_dentry(dn, in, NULL); if (IS_ERR(dn)) { err = PTR_ERR(dn); goto done; @@ -1407,7 +1368,7 @@ retry_lookup: } if (!dn->d_inode) { - dn = splice_dentry(dn, in, NULL, false); + dn = splice_dentry(dn, in, NULL); if (IS_ERR(dn)) { err = PTR_ERR(dn); dn = NULL; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 7866cd05a6bb..ead05cc1f447 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -266,7 +266,6 @@ struct ceph_inode_info { struct timespec i_rctime; u64 i_rbytes, i_rfiles, i_rsubdirs; u64 i_files, i_subdirs; - u64 i_max_offset; /* largest readdir offset, set with complete dir */ struct rb_root i_fragtree; struct mutex i_fragtree_mutex; -- cgit v1.2.3 From fd7b95cd1b58171a0b931b2063729a032bec4ac2 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Thu, 17 Apr 2014 08:02:02 +0800 Subject: ceph: avoid releasing caps that are being used To avoid releasing caps that are being used, encode_inode_release() should send implemented caps to MDS. Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/caps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 2e5e648eb5c3..c561b628ebce 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3261,7 +3261,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, rel->seq = cpu_to_le32(cap->seq); rel->issue_seq = cpu_to_le32(cap->issue_seq), rel->mseq = cpu_to_le32(cap->mseq); - rel->caps = cpu_to_le32(cap->issued); + rel->caps = cpu_to_le32(cap->implemented); rel->wanted = cpu_to_le32(cap->mds_wanted); rel->dname_len = 0; rel->dname_seq = 0; -- cgit v1.2.3 From 3bd58143bafc56dbc07f4f085e4d7e018d332674 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Sun, 27 Apr 2014 09:17:45 +0800 Subject: ceph: reserve caps for file layout/lock MDS requests Signed-off-by: Yan, Zheng Reviewed-by: Sage Weil --- fs/ceph/ioctl.c | 3 +++ fs/ceph/locks.c | 1 + 2 files changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index efbe08289292..2042fd16d1a8 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -110,6 +110,8 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) return PTR_ERR(req); req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; + req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL; req->r_args.setlayout.layout.fl_stripe_unit = @@ -154,6 +156,7 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg) return PTR_ERR(req); req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; req->r_args.setlayout.layout.fl_stripe_unit = cpu_to_le32(l.stripe_unit); diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index d94ba0df9f4d..191398852a2e 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -45,6 +45,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, return PTR_ERR(req); req->r_inode = inode; ihold(inode); + req->r_num_caps = 1; /* mds requires start and length rather than start and end */ if (LLONG_MAX == fl->fl_end) -- cgit v1.2.3 From 03b3b889e79cdb6b806fc0ba9be0d71c186bbfaa Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 29 Apr 2014 15:45:28 -0400 Subject: fold d_kill() and d_free() Signed-off-by: Al Viro --- fs/dcache.c | 76 +++++++++++++++++++------------------------------------------ 1 file changed, 24 insertions(+), 52 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 494a9def5dce..9b15c5c37277 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -246,23 +246,6 @@ static void __d_free(struct rcu_head *head) kmem_cache_free(dentry_cache, dentry); } -/* - * no locks, please. - */ -static void d_free(struct dentry *dentry) -{ - BUG_ON((int)dentry->d_lockref.count > 0); - this_cpu_dec(nr_dentry); - if (dentry->d_op && dentry->d_op->d_release) - dentry->d_op->d_release(dentry); - - /* if dentry was never visible to RCU, immediate free is OK */ - if (!(dentry->d_flags & DCACHE_RCUACCESS)) - __d_free(&dentry->d_u.d_rcu); - else - call_rcu(&dentry->d_u.d_rcu, __d_free); -} - /** * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups * @dentry: the target dentry @@ -419,40 +402,6 @@ static void dentry_lru_del(struct dentry *dentry) } } -/** - * d_kill - kill dentry and return parent - * @dentry: dentry to kill - * @parent: parent dentry - * - * The dentry must already be unhashed and removed from the LRU. - * - * If this is the root of the dentry tree, return NULL. - * - * dentry->d_lock and parent->d_lock must be held by caller, and are dropped by - * d_kill. - */ -static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent) - __releases(dentry->d_lock) - __releases(parent->d_lock) - __releases(dentry->d_inode->i_lock) -{ - list_del(&dentry->d_u.d_child); - /* - * Inform d_walk() that we are no longer attached to the - * dentry tree - */ - dentry->d_flags |= DCACHE_DENTRY_KILLED; - if (parent) - spin_unlock(&parent->d_lock); - dentry_iput(dentry); - /* - * dentry_iput drops the locks, at which point nobody (except - * transient RCU lookups) can reach this dentry. - */ - d_free(dentry); - return parent; -} - /** * d_drop - drop a dentry * @dentry: dentry to drop @@ -546,7 +495,30 @@ relock: dentry_lru_del(dentry); /* if it was on the hash then remove it */ __d_drop(dentry); - return d_kill(dentry, parent); + list_del(&dentry->d_u.d_child); + /* + * Inform d_walk() that we are no longer attached to the + * dentry tree + */ + dentry->d_flags |= DCACHE_DENTRY_KILLED; + if (parent) + spin_unlock(&parent->d_lock); + dentry_iput(dentry); + /* + * dentry_iput drops the locks, at which point nobody (except + * transient RCU lookups) can reach this dentry. + */ + BUG_ON((int)dentry->d_lockref.count > 0); + this_cpu_dec(nr_dentry); + if (dentry->d_op && dentry->d_op->d_release) + dentry->d_op->d_release(dentry); + + /* if dentry was never visible to RCU, immediate free is OK */ + if (!(dentry->d_flags & DCACHE_RCUACCESS)) + __d_free(&dentry->d_u.d_rcu); + else + call_rcu(&dentry->d_u.d_rcu, __d_free); + return parent; } /* -- cgit v1.2.3 From 5c47e6d0ad608987b91affbcf7d1fc12dfbe8fb4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 29 Apr 2014 16:13:18 -0400 Subject: fold try_prune_one_dentry() Signed-off-by: Al Viro --- fs/dcache.c | 75 +++++++++++++++++++++---------------------------------------- 1 file changed, 25 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 9b15c5c37277..a5540d491954 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -787,47 +787,9 @@ restart: } EXPORT_SYMBOL(d_prune_aliases); -/* - * Try to throw away a dentry - free the inode, dput the parent. - * Requires dentry->d_lock is held, and dentry->d_count == 0. - * Releases dentry->d_lock. - * - * This may fail if locks cannot be acquired no problem, just try again. - */ -static struct dentry * try_prune_one_dentry(struct dentry *dentry) - __releases(dentry->d_lock) -{ - struct dentry *parent; - - parent = dentry_kill(dentry, 0); - /* - * If dentry_kill returns NULL, we have nothing more to do. - * if it returns the same dentry, trylocks failed. In either - * case, just loop again. - * - * Otherwise, we need to prune ancestors too. This is necessary - * to prevent quadratic behavior of shrink_dcache_parent(), but - * is also expected to be beneficial in reducing dentry cache - * fragmentation. - */ - if (!parent) - return NULL; - if (parent == dentry) - return dentry; - - /* Prune ancestors. */ - dentry = parent; - while (dentry) { - if (lockref_put_or_lock(&dentry->d_lockref)) - return NULL; - dentry = dentry_kill(dentry, 1); - } - return NULL; -} - static void shrink_dentry_list(struct list_head *list) { - struct dentry *dentry; + struct dentry *dentry, *parent; rcu_read_lock(); for (;;) { @@ -863,22 +825,35 @@ static void shrink_dentry_list(struct list_head *list) } rcu_read_unlock(); + parent = dentry_kill(dentry, 0); /* - * If 'try_to_prune()' returns a dentry, it will - * be the same one we passed in, and d_lock will - * have been held the whole time, so it will not - * have been added to any other lists. We failed - * to get the inode lock. - * - * We just add it back to the shrink list. + * If dentry_kill returns NULL, we have nothing more to do. */ - dentry = try_prune_one_dentry(dentry); - - rcu_read_lock(); - if (dentry) { + if (!parent) { + rcu_read_lock(); + continue; + } + if (unlikely(parent == dentry)) { + /* + * trylocks have failed and d_lock has been held the + * whole time, so it could not have been added to any + * other lists. Just add it back to the shrink list. + */ + rcu_read_lock(); d_shrink_add(dentry, list); spin_unlock(&dentry->d_lock); + continue; } + /* + * We need to prune ancestors too. This is necessary to prevent + * quadratic behavior of shrink_dcache_parent(), but is also + * expected to be beneficial in reducing dentry cache + * fragmentation. + */ + dentry = parent; + while (dentry && !lockref_put_or_lock(&dentry->d_lockref)) + dentry = dentry_kill(dentry, 1); + rcu_read_lock(); } rcu_read_unlock(); } -- cgit v1.2.3 From b4f0354e968f5fabd39bc85b99fedae4a97589fe Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 29 Apr 2014 23:40:14 -0400 Subject: new helper: dentry_free() The part of old d_free() that dealt with actual freeing of dentry. Taken out of dentry_kill() into a separate function. Signed-off-by: Al Viro --- fs/dcache.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index a5540d491954..dab7db10d685 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -246,6 +246,15 @@ static void __d_free(struct rcu_head *head) kmem_cache_free(dentry_cache, dentry); } +static void dentry_free(struct dentry *dentry) +{ + /* if dentry was never visible to RCU, immediate free is OK */ + if (!(dentry->d_flags & DCACHE_RCUACCESS)) + __d_free(&dentry->d_u.d_rcu); + else + call_rcu(&dentry->d_u.d_rcu, __d_free); +} + /** * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups * @dentry: the target dentry @@ -513,11 +522,7 @@ relock: if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); - /* if dentry was never visible to RCU, immediate free is OK */ - if (!(dentry->d_flags & DCACHE_RCUACCESS)) - __d_free(&dentry->d_u.d_rcu); - else - call_rcu(&dentry->d_u.d_rcu, __d_free); + dentry_free(dentry); return parent; } -- cgit v1.2.3 From 01b6035190b024240a43ac1d8e9c6f964f5f1c63 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 29 Apr 2014 23:42:52 -0400 Subject: expand the call of dentry_lru_del() in dentry_kill() Signed-off-by: Al Viro --- fs/dcache.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index dab7db10d685..e482775343a0 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -501,7 +501,12 @@ relock: if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry)) dentry->d_op->d_prune(dentry); - dentry_lru_del(dentry); + if (dentry->d_flags & DCACHE_LRU_LIST) { + if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) + d_lru_del(dentry); + else + d_shrink_del(dentry); + } /* if it was on the hash then remove it */ __d_drop(dentry); list_del(&dentry->d_u.d_child); -- cgit v1.2.3 From 754320d6e166d3a12cb4810a452bde00afbd4e9a Mon Sep 17 00:00:00 2001 From: Leon Yu Date: Thu, 1 May 2014 03:31:28 +0000 Subject: aio: fix potential leak in aio_run_iocb(). iovec should be reclaimed whenever caller of rw_copy_check_uvector() returns, but it doesn't hold when failure happens right after aio_setup_vectored_rw(). Fix that in a such way to avoid hairy goto. Signed-off-by: Leon Yu Signed-off-by: Benjamin LaHaise Cc: stable@vger.kernel.org --- fs/aio.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 2adbb0398ab9..a0ed6c7d2cd2 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1327,10 +1327,8 @@ rw_common: &iovec, compat) : aio_setup_single_vector(req, rw, buf, &nr_segs, iovec); - if (ret) - return ret; - - ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); + if (!ret) + ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); if (ret < 0) { if (iovec != &inline_vec) kfree(iovec); -- cgit v1.2.3 From 41edf278fc2f042f4e22a12ed87d19c5201210e1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 1 May 2014 10:30:00 -0400 Subject: dentry_kill(): don't try to remove from shrink list If the victim in on the shrink list, don't remove it from there. If shrink_dentry_list() manages to remove it from the list before we are done - fine, we'll just free it as usual. If not - mark it with new flag (DCACHE_MAY_FREE) and leave it there. Eventually, shrink_dentry_list() will get to it, remove the sucker from shrink list and call dentry_kill(dentry, 0). Which is where we'll deal with freeing. Since now dentry_kill(dentry, 0) may happen after or during dentry_kill(dentry, 1), we need to recognize that (by seeing DCACHE_DENTRY_KILLED already set), unlock everything and either free the sucker (in case DCACHE_MAY_FREE has been set) or leave it for ongoing dentry_kill(dentry, 1) to deal with. Signed-off-by: Al Viro --- fs/dcache.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index e482775343a0..58e26bee7ef4 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -468,7 +468,14 @@ dentry_kill(struct dentry *dentry, int unlock_on_failure) __releases(dentry->d_lock) { struct inode *inode; - struct dentry *parent; + struct dentry *parent = NULL; + bool can_free = true; + + if (unlikely(dentry->d_flags & DCACHE_DENTRY_KILLED)) { + can_free = dentry->d_flags & DCACHE_MAY_FREE; + spin_unlock(&dentry->d_lock); + goto out; + } inode = dentry->d_inode; if (inode && !spin_trylock(&inode->i_lock)) { @@ -479,9 +486,7 @@ relock: } return dentry; /* try again with same dentry */ } - if (IS_ROOT(dentry)) - parent = NULL; - else + if (!IS_ROOT(dentry)) parent = dentry->d_parent; if (parent && !spin_trylock(&parent->d_lock)) { if (inode) @@ -504,8 +509,6 @@ relock: if (dentry->d_flags & DCACHE_LRU_LIST) { if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) d_lru_del(dentry); - else - d_shrink_del(dentry); } /* if it was on the hash then remove it */ __d_drop(dentry); @@ -527,7 +530,15 @@ relock: if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); - dentry_free(dentry); + spin_lock(&dentry->d_lock); + if (dentry->d_flags & DCACHE_SHRINK_LIST) { + dentry->d_flags |= DCACHE_MAY_FREE; + can_free = false; + } + spin_unlock(&dentry->d_lock); +out: + if (likely(can_free)) + dentry_free(dentry); return parent; } @@ -829,7 +840,7 @@ static void shrink_dentry_list(struct list_head *list) * We found an inuse dentry which was not removed from * the LRU because of laziness during lookup. Do not free it. */ - if (dentry->d_lockref.count) { + if ((int)dentry->d_lockref.count > 0) { spin_unlock(&dentry->d_lock); continue; } -- cgit v1.2.3 From fe91522a7ba82ca1a51b07e19954b3825e4aaa22 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 3 May 2014 00:02:25 -0400 Subject: don't remove from shrink list in select_collect() If we find something already on a shrink list, just increment data->found and do nothing else. Loops in shrink_dcache_parent() and check_submounts_and_drop() will do the right thing - everything we did put into our list will be evicted and if there had been nothing, but data->found got non-zero, well, we have somebody else shrinking those guys; just try again. Signed-off-by: Al Viro --- fs/dcache.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 58e26bee7ef4..f39a6f5a1220 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1229,34 +1229,23 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) if (data->start == dentry) goto out; - /* - * move only zero ref count dentries to the dispose list. - * - * Those which are presently on the shrink list, being processed - * by shrink_dentry_list(), shouldn't be moved. Otherwise the - * loop in shrink_dcache_parent() might not make any progress - * and loop forever. - */ - if (dentry->d_lockref.count) { - dentry_lru_del(dentry); - } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { - /* - * We can't use d_lru_shrink_move() because we - * need to get the global LRU lock and do the - * LRU accounting. - */ - d_lru_del(dentry); - d_shrink_add(dentry, &data->dispose); + if (dentry->d_flags & DCACHE_SHRINK_LIST) { data->found++; - ret = D_WALK_NORETRY; + } else { + if (dentry->d_flags & DCACHE_LRU_LIST) + d_lru_del(dentry); + if (!dentry->d_lockref.count) { + d_shrink_add(dentry, &data->dispose); + data->found++; + } } /* * We can return to the caller if we have found some (this * ensures forward progress). We'll be coming back to find * the rest. */ - if (data->found && need_resched()) - ret = D_WALK_QUIT; + if (!list_empty(&data->dispose)) + ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY; out: return ret; } -- cgit v1.2.3 From 9c8c10e262e0f62cb2530f1b076de979123183dd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 2 May 2014 20:36:10 -0400 Subject: more graceful recovery in umount_collect() Start with shrink_dcache_parent(), then scan what remains. First of all, BUG() is very much an overkill here; we are holding ->s_umount, and hitting BUG() means that a lot of interesting stuff will be hanging after that point (sync(2), for example). Moreover, in cases when there had been more than one leak, we'll be better off reporting all of them. And more than just the last component of pathname - %pd is there for just such uses... That was the last user of dentry_lru_del(), so kill it off... Signed-off-by: Al Viro --- fs/dcache.c | 101 +++++++++++++++--------------------------------------------- 1 file changed, 25 insertions(+), 76 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index f39a6f5a1220..2321e1a861f6 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -395,22 +395,6 @@ static void dentry_lru_add(struct dentry *dentry) d_lru_add(dentry); } -/* - * Remove a dentry with references from the LRU. - * - * If we are on the shrink list, then we can get to try_prune_one_dentry() and - * lose our last reference through the parent walk. In this case, we need to - * remove ourselves from the shrink list, not the LRU. - */ -static void dentry_lru_del(struct dentry *dentry) -{ - if (dentry->d_flags & DCACHE_LRU_LIST) { - if (dentry->d_flags & DCACHE_SHRINK_LIST) - return d_shrink_del(dentry); - d_lru_del(dentry); - } -} - /** * d_drop - drop a dentry * @dentry: dentry to drop @@ -1275,45 +1259,35 @@ void shrink_dcache_parent(struct dentry *parent) } EXPORT_SYMBOL(shrink_dcache_parent); -static enum d_walk_ret umount_collect(void *_data, struct dentry *dentry) +static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) { - struct select_data *data = _data; - enum d_walk_ret ret = D_WALK_CONTINUE; + /* it has busy descendents; complain about those instead */ + if (!list_empty(&dentry->d_subdirs)) + return D_WALK_CONTINUE; - if (dentry->d_lockref.count) { - dentry_lru_del(dentry); - if (likely(!list_empty(&dentry->d_subdirs))) - goto out; - if (dentry == data->start && dentry->d_lockref.count == 1) - goto out; - printk(KERN_ERR - "BUG: Dentry %p{i=%lx,n=%s}" - " still in use (%d)" - " [unmount of %s %s]\n", + /* root with refcount 1 is fine */ + if (dentry == _data && dentry->d_lockref.count == 1) + return D_WALK_CONTINUE; + + printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} " + " still in use (%d) [unmount of %s %s]\n", dentry, dentry->d_inode ? dentry->d_inode->i_ino : 0UL, - dentry->d_name.name, + dentry, dentry->d_lockref.count, dentry->d_sb->s_type->name, dentry->d_sb->s_id); - BUG(); - } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) { - /* - * We can't use d_lru_shrink_move() because we - * need to get the global LRU lock and do the - * LRU accounting. - */ - if (dentry->d_flags & DCACHE_LRU_LIST) - d_lru_del(dentry); - d_shrink_add(dentry, &data->dispose); - data->found++; - ret = D_WALK_NORETRY; - } -out: - if (data->found && need_resched()) - ret = D_WALK_QUIT; - return ret; + WARN_ON(1); + return D_WALK_CONTINUE; +} + +static void do_one_tree(struct dentry *dentry) +{ + shrink_dcache_parent(dentry); + d_walk(dentry, dentry, umount_check, NULL); + d_drop(dentry); + dput(dentry); } /* @@ -1323,40 +1297,15 @@ void shrink_dcache_for_umount(struct super_block *sb) { struct dentry *dentry; - if (down_read_trylock(&sb->s_umount)) - BUG(); + WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked"); dentry = sb->s_root; sb->s_root = NULL; - for (;;) { - struct select_data data; - - INIT_LIST_HEAD(&data.dispose); - data.start = dentry; - data.found = 0; - - d_walk(dentry, &data, umount_collect, NULL); - if (!data.found) - break; - - shrink_dentry_list(&data.dispose); - cond_resched(); - } - d_drop(dentry); - dput(dentry); + do_one_tree(dentry); while (!hlist_bl_empty(&sb->s_anon)) { - struct select_data data; - dentry = hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash); - - INIT_LIST_HEAD(&data.dispose); - data.start = NULL; - data.found = 0; - - d_walk(dentry, &data, umount_collect, NULL); - if (data.found) - shrink_dentry_list(&data.dispose); - cond_resched(); + dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash)); + do_one_tree(dentry); } } -- cgit v1.2.3 From 60942f2f235ce7b817166cdf355eed729094834d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 2 May 2014 15:38:39 -0400 Subject: dcache: don't need rcu in shrink_dentry_list() Since now the shrink list is private and nobody can free the dentry while it is on the shrink list, we can remove RCU protection from this. Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/dcache.c | 27 ++++----------------------- 1 file changed, 4 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 2321e1a861f6..42ae01eefc07 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -796,23 +796,9 @@ static void shrink_dentry_list(struct list_head *list) { struct dentry *dentry, *parent; - rcu_read_lock(); - for (;;) { - dentry = list_entry_rcu(list->prev, struct dentry, d_lru); - if (&dentry->d_lru == list) - break; /* empty */ - - /* - * Get the dentry lock, and re-verify that the dentry is - * this on the shrinking list. If it is, we know that - * DCACHE_SHRINK_LIST and DCACHE_LRU_LIST are set. - */ + while (!list_empty(list)) { + dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); - if (dentry != list_entry(list->prev, struct dentry, d_lru)) { - spin_unlock(&dentry->d_lock); - continue; - } - /* * The dispose list is isolated and dentries are not accounted * to the LRU here, so we can simply remove it from the list @@ -828,23 +814,20 @@ static void shrink_dentry_list(struct list_head *list) spin_unlock(&dentry->d_lock); continue; } - rcu_read_unlock(); parent = dentry_kill(dentry, 0); /* * If dentry_kill returns NULL, we have nothing more to do. */ - if (!parent) { - rcu_read_lock(); + if (!parent) continue; - } + if (unlikely(parent == dentry)) { /* * trylocks have failed and d_lock has been held the * whole time, so it could not have been added to any * other lists. Just add it back to the shrink list. */ - rcu_read_lock(); d_shrink_add(dentry, list); spin_unlock(&dentry->d_lock); continue; @@ -858,9 +841,7 @@ static void shrink_dentry_list(struct list_head *list) dentry = parent; while (dentry && !lockref_put_or_lock(&dentry->d_lockref)) dentry = dentry_kill(dentry, 1); - rcu_read_lock(); } - rcu_read_unlock(); } static enum lru_status -- cgit v1.2.3 From c99d609a16506602a7398eea7d12b13513f3d889 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 5 May 2014 16:18:37 +1000 Subject: xfs: fully support v5 format filesystems We have had this code in the kernel for over a year now and have shaken all the known issues out of the code over the past few releases. It's now time to remove the experimental warnings during mount and fully support the new filesystem format in production systems. Remove the experimental warning, and add a version number to the initial "mounting filesystem" message to tell use what type of filesystem is being mounted. Also, remove the temporary inode cluster size output at mount time now we know that this code works fine. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_log.c | 10 ++++++---- fs/xfs/xfs_mount.c | 2 -- fs/xfs/xfs_sb.c | 4 ---- 3 files changed, 6 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 08624dc67317..a5f8bd9899d3 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -616,11 +616,13 @@ xfs_log_mount( int error = 0; int min_logfsbs; - if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) - xfs_notice(mp, "Mounting Filesystem"); - else { + if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { + xfs_notice(mp, "Mounting V%d Filesystem", + XFS_SB_VERSION_NUM(&mp->m_sb)); + } else { xfs_notice(mp, -"Mounting filesystem in no-recovery mode. Filesystem will be inconsistent."); +"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.", + XFS_SB_VERSION_NUM(&mp->m_sb)); ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 993cb19e7d39..944f3d9456a8 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -743,8 +743,6 @@ xfs_mountfs( new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE; if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size)) mp->m_inode_cluster_size = new_size; - xfs_info(mp, "Using inode cluster size of %d bytes", - mp->m_inode_cluster_size); } /* diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c index 0c0e41bbe4e3..8baf61afae1d 100644 --- a/fs/xfs/xfs_sb.c +++ b/fs/xfs/xfs_sb.c @@ -201,10 +201,6 @@ xfs_mount_validate_sb( * write validation, we don't need to check feature masks. */ if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) { - xfs_alert(mp, -"Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n" -"Use of these features in this kernel is at your own risk!"); - if (xfs_sb_has_compat_feature(sbp, XFS_SB_FEAT_COMPAT_UNKNOWN)) { xfs_warn(mp, -- cgit v1.2.3 From fcdd57c8902a936a616df2066d873b38ef3ed364 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 7 Apr 2014 13:39:06 +0300 Subject: UBIFS: fix remount error path Dan's "smatch" checker found out that there was a bug in the error path of the 'ubifs_remount_rw()' function. Instead of jumping to the "out" label which cleans-things up, we just returned. This patch fixes the problem. Reported-by: Dan Carpenter Signed-off-by: Artem Bityutskiy --- fs/ubifs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index a1266089eca1..a81c7b556896 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1556,7 +1556,7 @@ static int ubifs_remount_rw(struct ubifs_info *c) if (c->space_fixup) { err = ubifs_fixup_free_space(c); if (err) - return err; + goto out; } err = check_free_space(c); -- cgit v1.2.3 From d540e43b0ab134b22f015f725ce6e070d12b0244 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Tue, 6 May 2014 07:34:28 +1000 Subject: xfs: initialize default acls for ->tmpfile() The current tmpfile handler does not initialize default ACLs. Doing so within xfs_vn_tmpfile() makes it roughly equivalent to xfs_vn_mknod(), which is already used as a common create handler. xfs_vn_mknod() does not currently have a mechanism to determine whether to link the file into the namespace. Therefore, further abstract xfs_vn_mknod() into a new xfs_generic_create() handler with a tmpfile parameter. This new handler calls xfs_create_tmpfile() and d_tmpfile() on the dentry when called via ->tmpfile(). Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/xfs_iops.c | 55 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index ef1ca010f417..301ecbfcc0be 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -124,15 +124,15 @@ xfs_cleanup_inode( xfs_dentry_to_name(&teardown, dentry, 0); xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); - iput(inode); } STATIC int -xfs_vn_mknod( +xfs_generic_create( struct inode *dir, struct dentry *dentry, umode_t mode, - dev_t rdev) + dev_t rdev, + bool tmpfile) /* unnamed file */ { struct inode *inode; struct xfs_inode *ip = NULL; @@ -156,8 +156,12 @@ xfs_vn_mknod( if (error) return error; - xfs_dentry_to_name(&name, dentry, mode); - error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); + if (!tmpfile) { + xfs_dentry_to_name(&name, dentry, mode); + error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); + } else { + error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip); + } if (unlikely(error)) goto out_free_acl; @@ -180,7 +184,11 @@ xfs_vn_mknod( } #endif - d_instantiate(dentry, inode); + if (tmpfile) + d_tmpfile(dentry, inode); + else + d_instantiate(dentry, inode); + out_free_acl: if (default_acl) posix_acl_release(default_acl); @@ -189,10 +197,22 @@ xfs_vn_mknod( return -error; out_cleanup_inode: - xfs_cleanup_inode(dir, inode, dentry); + if (!tmpfile) + xfs_cleanup_inode(dir, inode, dentry); + iput(inode); goto out_free_acl; } +STATIC int +xfs_vn_mknod( + struct inode *dir, + struct dentry *dentry, + umode_t mode, + dev_t rdev) +{ + return xfs_generic_create(dir, dentry, mode, rdev, false); +} + STATIC int xfs_vn_create( struct inode *dir, @@ -353,6 +373,7 @@ xfs_vn_symlink( out_cleanup_inode: xfs_cleanup_inode(dir, inode, dentry); + iput(inode); out: return -error; } @@ -1053,25 +1074,7 @@ xfs_vn_tmpfile( struct dentry *dentry, umode_t mode) { - int error; - struct xfs_inode *ip; - struct inode *inode; - - error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip); - if (unlikely(error)) - return -error; - - inode = VFS_I(ip); - - error = xfs_init_security(inode, dir, &dentry->d_name); - if (unlikely(error)) { - iput(inode); - return -error; - } - - d_tmpfile(dentry, inode); - - return 0; + return xfs_generic_create(dir, dentry, mode, 0, true); } static const struct inode_operations xfs_inode_operations = { -- cgit v1.2.3 From 8275cdd0e7ac550dcce2b3ef6d2fb3b808c1ae59 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 6 May 2014 07:37:31 +1000 Subject: xfs: remote attribute overwrite causes transaction overrun Commit e461fcb ("xfs: remote attribute lookups require the value length") passes the remote attribute length in the xfs_da_args structure on lookup so that CRC calculations and validity checking can be performed correctly by related code. This, unfortunately has the side effect of changing the args->valuelen parameter in cases where it shouldn't. That is, when we replace a remote attribute, the incoming replacement stores the value and length in args->value and args->valuelen, but then the lookup which finds the existing remote attribute overwrites args->valuelen with the length of the remote attribute being replaced. Hence when we go to create the new attribute, we create it of the size of the existing remote attribute, not the size it is supposed to be. When the new attribute is much smaller than the old attribute, this results in a transaction overrun and an ASSERT() failure on a debug kernel: XFS: Assertion failed: tp->t_blk_res_used <= tp->t_blk_res, file: fs/xfs/xfs_trans.c, line: 331 Fix this by keeping the remote attribute value length separate to the attribute value length in the xfs_da_args structure. The enables us to pass the length of the remote attribute to be removed without overwriting the new attribute's length. Also, ensure that when we save remote block contexts for a later rename we zero the original state variables so that we don't confuse the state of the attribute to be removes with the state of the new attribute that we just added. [Spotted by Brain Foster.] Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Dave Chinner --- fs/xfs/xfs_attr.c | 24 +++++++++++++++++++++++- fs/xfs/xfs_attr_leaf.c | 21 +++++++++++---------- fs/xfs/xfs_attr_list.c | 1 + fs/xfs/xfs_attr_remote.c | 8 +++++--- fs/xfs/xfs_da_btree.h | 2 ++ 5 files changed, 42 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 01b6a0102fbd..abda1124a70f 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -213,7 +213,7 @@ xfs_attr_calc_size( * Out of line attribute, cannot double split, but * make room for the attribute value itself. */ - uint dblocks = XFS_B_TO_FSB(mp, valuelen); + uint dblocks = xfs_attr3_rmt_blocks(mp, valuelen); nblks += dblocks; nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK); } @@ -698,11 +698,22 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) trace_xfs_attr_leaf_replace(args); + /* save the attribute state for later removal*/ args->op_flags |= XFS_DA_OP_RENAME; /* an atomic rename */ args->blkno2 = args->blkno; /* set 2nd entry info*/ args->index2 = args->index; args->rmtblkno2 = args->rmtblkno; args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtvaluelen2 = args->rmtvaluelen; + + /* + * clear the remote attr state now that it is saved so that the + * values reflect the state of the attribute we are about to + * add, not the attribute we just found and will remove later. + */ + args->rmtblkno = 0; + args->rmtblkcnt = 0; + args->rmtvaluelen = 0; } /* @@ -794,6 +805,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) args->blkno = args->blkno2; args->rmtblkno = args->rmtblkno2; args->rmtblkcnt = args->rmtblkcnt2; + args->rmtvaluelen = args->rmtvaluelen2; if (args->rmtblkno) { error = xfs_attr_rmtval_remove(args); if (error) @@ -999,13 +1011,22 @@ restart: trace_xfs_attr_node_replace(args); + /* save the attribute state for later removal*/ args->op_flags |= XFS_DA_OP_RENAME; /* atomic rename op */ args->blkno2 = args->blkno; /* set 2nd entry info*/ args->index2 = args->index; args->rmtblkno2 = args->rmtblkno; args->rmtblkcnt2 = args->rmtblkcnt; + args->rmtvaluelen2 = args->rmtvaluelen; + + /* + * clear the remote attr state now that it is saved so that the + * values reflect the state of the attribute we are about to + * add, not the attribute we just found and will remove later. + */ args->rmtblkno = 0; args->rmtblkcnt = 0; + args->rmtvaluelen = 0; } retval = xfs_attr3_leaf_add(blk->bp, state->args); @@ -1133,6 +1154,7 @@ restart: args->blkno = args->blkno2; args->rmtblkno = args->rmtblkno2; args->rmtblkcnt = args->rmtblkcnt2; + args->rmtvaluelen = args->rmtvaluelen2; if (args->rmtblkno) { error = xfs_attr_rmtval_remove(args); if (error) diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index fe9587fab17a..511c283459b1 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -1229,6 +1229,7 @@ xfs_attr3_leaf_add_work( name_rmt->valueblk = 0; args->rmtblkno = 1; args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); + args->rmtvaluelen = args->valuelen; } xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), @@ -2167,11 +2168,11 @@ xfs_attr3_leaf_lookup_int( if (!xfs_attr_namesp_match(args->flags, entry->flags)) continue; args->index = probe; - args->valuelen = be32_to_cpu(name_rmt->valuelen); + args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); args->rmtblkcnt = xfs_attr3_rmt_blocks( args->dp->i_mount, - args->valuelen); + args->rmtvaluelen); return XFS_ERROR(EEXIST); } } @@ -2220,19 +2221,19 @@ xfs_attr3_leaf_getvalue( name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); ASSERT(name_rmt->namelen == args->namelen); ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); - valuelen = be32_to_cpu(name_rmt->valuelen); + args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, - valuelen); + args->rmtvaluelen); if (args->flags & ATTR_KERNOVAL) { - args->valuelen = valuelen; + args->valuelen = args->rmtvaluelen; return 0; } - if (args->valuelen < valuelen) { - args->valuelen = valuelen; + if (args->valuelen < args->rmtvaluelen) { + args->valuelen = args->rmtvaluelen; return XFS_ERROR(ERANGE); } - args->valuelen = valuelen; + args->valuelen = args->rmtvaluelen; } return 0; } @@ -2519,7 +2520,7 @@ xfs_attr3_leaf_clearflag( ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0); name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); - name_rmt->valuelen = cpu_to_be32(args->valuelen); + name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen); xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt))); } @@ -2677,7 +2678,7 @@ xfs_attr3_leaf_flipflags( ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0); name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index); name_rmt->valueblk = cpu_to_be32(args->rmtblkno); - name_rmt->valuelen = cpu_to_be32(args->valuelen); + name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen); xfs_trans_log_buf(args->trans, bp1, XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt))); } diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 01db96f60cf0..833fe5d98d80 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -447,6 +447,7 @@ xfs_attr3_leaf_list_int( args.dp = context->dp; args.whichfork = XFS_ATTR_FORK; args.valuelen = valuelen; + args.rmtvaluelen = valuelen; args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); args.rmtblkno = be32_to_cpu(name_rmt->valueblk); args.rmtblkcnt = xfs_attr3_rmt_blocks( diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c index 6e37823e2932..d2e6e948cec7 100644 --- a/fs/xfs/xfs_attr_remote.c +++ b/fs/xfs/xfs_attr_remote.c @@ -337,7 +337,7 @@ xfs_attr_rmtval_get( struct xfs_buf *bp; xfs_dablk_t lblkno = args->rmtblkno; __uint8_t *dst = args->value; - int valuelen = args->valuelen; + int valuelen; int nmap; int error; int blkcnt = args->rmtblkcnt; @@ -347,7 +347,9 @@ xfs_attr_rmtval_get( trace_xfs_attr_rmtval_get(args); ASSERT(!(args->flags & ATTR_KERNOVAL)); + ASSERT(args->rmtvaluelen == args->valuelen); + valuelen = args->rmtvaluelen; while (valuelen > 0) { nmap = ATTR_RMTVALUE_MAPSIZE; error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, @@ -415,7 +417,7 @@ xfs_attr_rmtval_set( * attributes have headers, we can't just do a straight byte to FSB * conversion and have to take the header space into account. */ - blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); + blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen); error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, XFS_ATTR_FORK); if (error) @@ -480,7 +482,7 @@ xfs_attr_rmtval_set( */ lblkno = args->rmtblkno; blkcnt = args->rmtblkcnt; - valuelen = args->valuelen; + valuelen = args->rmtvaluelen; while (valuelen > 0) { struct xfs_buf *bp; xfs_daddr_t dblkno; diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 6e95ea79f5d7..201c6091d26a 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h @@ -60,10 +60,12 @@ typedef struct xfs_da_args { int index; /* index of attr of interest in blk */ xfs_dablk_t rmtblkno; /* remote attr value starting blkno */ int rmtblkcnt; /* remote attr value block count */ + int rmtvaluelen; /* remote attr value length in bytes */ xfs_dablk_t blkno2; /* blkno of 2nd attr leaf of interest */ int index2; /* index of 2nd attr in blk */ xfs_dablk_t rmtblkno2; /* remote attr value starting blkno */ int rmtblkcnt2; /* remote attr value block count */ + int rmtvaluelen2; /* remote attr value length in bytes */ int op_flags; /* operation flags */ enum xfs_dacmp cmpresult; /* name compare result for lookups */ } xfs_da_args_t; -- cgit v1.2.3 From 5694c93e6c4954fa9424c215f75eeb919bddad64 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 18 Apr 2014 14:43:56 -0400 Subject: NFSd: Move default initialisers from create_client() to alloc_client() Aside from making it clearer what is non-trivial in create_client(), it also fixes a bug whereby we can call free_client() before idr_init() has been called. Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 3ba65979a3cd..230d21cb1717 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1078,6 +1078,18 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) return NULL; } clp->cl_name.len = name.len; + INIT_LIST_HEAD(&clp->cl_sessions); + idr_init(&clp->cl_stateids); + atomic_set(&clp->cl_refcount, 0); + clp->cl_cb_state = NFSD4_CB_UNKNOWN; + INIT_LIST_HEAD(&clp->cl_idhash); + INIT_LIST_HEAD(&clp->cl_openowners); + INIT_LIST_HEAD(&clp->cl_delegations); + INIT_LIST_HEAD(&clp->cl_lru); + INIT_LIST_HEAD(&clp->cl_callbacks); + INIT_LIST_HEAD(&clp->cl_revoked); + spin_lock_init(&clp->cl_lock); + rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); return clp; } @@ -1347,7 +1359,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name, if (clp == NULL) return NULL; - INIT_LIST_HEAD(&clp->cl_sessions); ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred); if (ret) { spin_lock(&nn->client_lock); @@ -1355,20 +1366,9 @@ static struct nfs4_client *create_client(struct xdr_netobj name, spin_unlock(&nn->client_lock); return NULL; } - idr_init(&clp->cl_stateids); - atomic_set(&clp->cl_refcount, 0); - clp->cl_cb_state = NFSD4_CB_UNKNOWN; - INIT_LIST_HEAD(&clp->cl_idhash); - INIT_LIST_HEAD(&clp->cl_openowners); - INIT_LIST_HEAD(&clp->cl_delegations); - INIT_LIST_HEAD(&clp->cl_lru); - INIT_LIST_HEAD(&clp->cl_callbacks); - INIT_LIST_HEAD(&clp->cl_revoked); - spin_lock_init(&clp->cl_lock); nfsd4_init_callback(&clp->cl_cb_null); clp->cl_time = get_seconds(); clear_bit(0, &clp->cl_cb_slot_busy); - rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); copy_verf(clp, verf); rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa); gen_confirm(clp); -- cgit v1.2.3 From 4cb57e3032d4e4bf5e97780e9907da7282b02b0c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 18 Apr 2014 14:43:57 -0400 Subject: NFSd: call rpc_destroy_wait_queue() from free_client() Mainly to ensure that we don't leave any hanging timers. Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 230d21cb1717..32b699bebb9c 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1107,6 +1107,7 @@ free_client(struct nfs4_client *clp) WARN_ON_ONCE(atomic_read(&ses->se_ref)); free_session(ses); } + rpc_destroy_wait_queue(&clp->cl_cb_waitq); free_svc_cred(&clp->cl_cred); kfree(clp->cl_name.data); idr_destroy(&clp->cl_stateids); -- cgit v1.2.3 From 50c6e282bdf5e8dabf8d7cf7b162545a55645fd9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 4 May 2014 13:03:32 +0200 Subject: posix_acl: handle NULL ACL in posix_acl_equiv_mode Various filesystems don't bother checking for a NULL ACL in posix_acl_equiv_mode, and thus can dereference a NULL pointer when it gets passed one. This usually happens from the NFS server, as the ACL tools never pass a NULL ACL, but instead of one representing the mode bits. Instead of adding boilerplat to all filesystems put this check into one place, which will allow us to remove the check from other filesystems as well later on. Signed-off-by: Christoph Hellwig Reported-by: Ben Greear Reported-by: Marco Munderloh , Cc: Chuck Lever Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/posix_acl.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 9e363e41dacc..0855f772cd41 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -246,6 +246,12 @@ posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p) umode_t mode = 0; int not_equiv = 0; + /* + * A null ACL can always be presented as mode bits. + */ + if (!acl) + return 0; + FOREACH_ACL_ENTRY(pa, acl, pe) { switch (pa->e_tag) { case ACL_USER_OBJ: -- cgit v1.2.3 From 457c1b27ed56ec472d202731b12417bff023594a Mon Sep 17 00:00:00 2001 From: Nishanth Aravamudan Date: Tue, 6 May 2014 12:50:00 -0700 Subject: hugetlb: ensure hugepage access is denied if hugepages are not supported Currently, I am seeing the following when I `mount -t hugetlbfs /none /dev/hugetlbfs`, and then simply do a `ls /dev/hugetlbfs`. I think it's related to the fact that hugetlbfs is properly not correctly setting itself up in this state?: Unable to handle kernel paging request for data at address 0x00000031 Faulting instruction address: 0xc000000000245710 Oops: Kernel access of bad area, sig: 11 [#1] SMP NR_CPUS=2048 NUMA pSeries .... In KVM guests on Power, in a guest not backed by hugepages, we see the following: AnonHugePages: 0 kB HugePages_Total: 0 HugePages_Free: 0 HugePages_Rsvd: 0 HugePages_Surp: 0 Hugepagesize: 64 kB HPAGE_SHIFT == 0 in this configuration, which indicates that hugepages are not supported at boot-time, but this is only checked in hugetlb_init(). Extract the check to a helper function, and use it in a few relevant places. This does make hugetlbfs not supported (not registered at all) in this environment. I believe this is fine, as there are no valid hugepages and that won't change at runtime. [akpm@linux-foundation.org: use pr_info(), per Mel] [akpm@linux-foundation.org: fix build when HPAGE_SHIFT is undefined] Signed-off-by: Nishanth Aravamudan Reviewed-by: Aneesh Kumar K.V Acked-by: Mel Gorman Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 204027520937..e19d4c0cacae 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1030,6 +1030,11 @@ static int __init init_hugetlbfs_fs(void) int error; int i; + if (!hugepages_supported()) { + pr_info("hugetlbfs: disabling because there are no supported hugepage sizes\n"); + return -ENOTSUPP; + } + error = bdi_init(&hugetlbfs_backing_dev_info); if (error) return error; -- cgit v1.2.3 From 6b6751f7feba68d8f5c72b72cc69a1c5a625529c Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Tue, 6 May 2014 12:50:06 -0700 Subject: autofs: fix lockref lookup autofs needs to be able to see private data dentry flags for its dentrys that are being created but not yet hashed and for its dentrys that have been rmdir()ed but not yet freed. It needs to do this so it can block processes in these states until a status has been returned to indicate the given operation is complete. It does this by keeping two lists, active and expring, of dentrys in this state and uses ->d_release() to keep them stable while it checks the reference count to determine if they should be used. But with the recent lockref changes dentrys being freed sometimes don't transition to a reference count of 0 before being freed so autofs can occassionally use a dentry that is invalid which can lead to a panic. Signed-off-by: Ian Kent Cc: Al Viro Cc: Linus Torvalds Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/root.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 2caf36ac3e93..cc87c1abac97 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -179,7 +179,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry) spin_lock(&active->d_lock); /* Already gone? */ - if (!d_count(active)) + if ((int) d_count(active) <= 0) goto next; qstr = &active->d_name; @@ -230,7 +230,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry) spin_lock(&expiring->d_lock); - /* Bad luck, we've already been dentry_iput */ + /* We've already been dentry_iput or unlinked */ if (!expiring->d_inode) goto next; -- cgit v1.2.3 From 1e2ee49f7f1b79f0b14884fe6a602f0411b39552 Mon Sep 17 00:00:00 2001 From: Will Woods Date: Tue, 6 May 2014 12:50:10 -0700 Subject: fanotify: fix -EOVERFLOW with large files on 64-bit On 64-bit systems, O_LARGEFILE is automatically added to flags inside the open() syscall (also openat(), blkdev_open(), etc). Userspace therefore defines O_LARGEFILE to be 0 - you can use it, but it's a no-op. Everything should be O_LARGEFILE by default. But: when fanotify does create_fd() it uses dentry_open(), which skips all that. And userspace can't set O_LARGEFILE in fanotify_init() because it's defined to 0. So if fanotify gets an event regarding a large file, the read() will just fail with -EOVERFLOW. This patch adds O_LARGEFILE to fanotify_init()'s event_f_flags on 64-bit systems, using the same test as open()/openat()/etc. Addresses https://bugzilla.redhat.com/show_bug.cgi?id=696821 Signed-off-by: Will Woods Acked-by: Eric Paris Reviewed-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fanotify/fanotify_user.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 4e565c814309..732648b270dc 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -698,6 +698,8 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) } group->overflow_event = &oevent->fse; + if (force_o_largefile()) + event_f_flags |= O_LARGEFILE; group->fanotify_data.f_flags = event_f_flags; #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS spin_lock_init(&group->fanotify_data.access_lock); -- cgit v1.2.3 From d353efd02357a74753cd45f367a2d3d357fd6904 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 6 May 2014 12:50:11 -0700 Subject: fs/affs/super.c: bugfix / double free Commit 842a859db26b ("affs: use ->kill_sb() to simplify ->put_super() and failure exits of ->mount()") adds .kill_sb which frees sbi but doesn't remove sbi free in case of parse_options error causing double free+random crash. Signed-off-by: Fabian Frederick Cc: Alexander Viro Cc: [3.14.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/affs/super.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/affs/super.c b/fs/affs/super.c index 6d589f28bf9b..895ac7dc9dbf 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -340,8 +340,6 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) &blocksize,&sbi->s_prefix, sbi->s_volume, &mount_flags)) { printk(KERN_ERR "AFFS: Error parsing options\n"); - kfree(sbi->s_prefix); - kfree(sbi); return -EINVAL; } /* N.B. after this point s_prefix must be released */ -- cgit v1.2.3 From aa07c713ecfc0522916f3cd57ac628ea6127c0ec Mon Sep 17 00:00:00 2001 From: Kinglong Mee Date: Fri, 18 Apr 2014 20:49:04 +0800 Subject: NFSD: Call ->set_acl with a NULL ACL structure if no entries After setting ACL for directory, I got two problems that caused by the cached zero-length default posix acl. This patch make sure nfsd4_set_nfs4_acl calls ->set_acl with a NULL ACL structure if there are no entries. Thanks for Christoph Hellwig's advice. First problem: ............ hang ........... Second problem: [ 1610.167668] ------------[ cut here ]------------ [ 1610.168320] kernel BUG at /root/nfs/linux/fs/nfsd/nfs4acl.c:239! [ 1610.168320] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC [ 1610.168320] Modules linked in: nfsv4(OE) nfs(OE) nfsd(OE) rpcsec_gss_krb5 fscache ip6t_rpfilter ip6t_REJECT cfg80211 xt_conntrack rfkill ebtable_nat ebtable_broute bridge stp llc ebtable_filter ebtables ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security ip6table_raw ip6table_filter ip6_tables iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw auth_rpcgss nfs_acl snd_intel8x0 ppdev lockd snd_ac97_codec ac97_bus snd_pcm snd_timer e1000 pcspkr parport_pc snd parport serio_raw joydev i2c_piix4 sunrpc(OE) microcode soundcore i2c_core ata_generic pata_acpi [last unloaded: nfsd] [ 1610.168320] CPU: 0 PID: 27397 Comm: nfsd Tainted: G OE 3.15.0-rc1+ #15 [ 1610.168320] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 [ 1610.168320] task: ffff88005ab653d0 ti: ffff88005a944000 task.ti: ffff88005a944000 [ 1610.168320] RIP: 0010:[] [] _posix_to_nfsv4_one+0x3cd/0x3d0 [nfsd] [ 1610.168320] RSP: 0018:ffff88005a945b00 EFLAGS: 00010293 [ 1610.168320] RAX: 0000000000000001 RBX: ffff88006700bac0 RCX: 0000000000000000 [ 1610.168320] RDX: 0000000000000000 RSI: ffff880067c83f00 RDI: ffff880068233300 [ 1610.168320] RBP: ffff88005a945b48 R08: ffffffff81c64830 R09: 0000000000000000 [ 1610.168320] R10: ffff88004ea85be0 R11: 000000000000f475 R12: ffff880068233300 [ 1610.168320] R13: 0000000000000003 R14: 0000000000000002 R15: ffff880068233300 [ 1610.168320] FS: 0000000000000000(0000) GS:ffff880077800000(0000) knlGS:0000000000000000 [ 1610.168320] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 1610.168320] CR2: 00007f5bcbd3b0b9 CR3: 0000000001c0f000 CR4: 00000000000006f0 [ 1610.168320] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1610.168320] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1610.168320] Stack: [ 1610.168320] ffffffff00000000 0000000b67c83500 000000076700bac0 0000000000000000 [ 1610.168320] ffff88006700bac0 ffff880068233300 ffff88005a945c08 0000000000000002 [ 1610.168320] 0000000000000000 ffff88005a945b88 ffffffffa034e2d5 000000065a945b68 [ 1610.168320] Call Trace: [ 1610.168320] [] nfsd4_get_nfs4_acl+0x95/0x150 [nfsd] [ 1610.168320] [] nfsd4_encode_fattr+0x646/0x1e70 [nfsd] [ 1610.168320] [] ? kmemleak_alloc+0x4e/0xb0 [ 1610.168320] [] ? nfsd_setuser_and_check_port+0x52/0x80 [nfsd] [ 1610.168320] [] ? selinux_cred_prepare+0x1b/0x30 [ 1610.168320] [] nfsd4_encode_getattr+0x5a/0x60 [nfsd] [ 1610.168320] [] nfsd4_encode_operation+0x67/0x110 [nfsd] [ 1610.168320] [] nfsd4_proc_compound+0x21d/0x810 [nfsd] [ 1610.168320] [] nfsd_dispatch+0xbb/0x200 [nfsd] [ 1610.168320] [] svc_process_common+0x46d/0x6d0 [sunrpc] [ 1610.168320] [] svc_process+0x103/0x170 [sunrpc] [ 1610.168320] [] nfsd+0xbf/0x130 [nfsd] [ 1610.168320] [] ? nfsd_destroy+0x80/0x80 [nfsd] [ 1610.168320] [] kthread+0xd2/0xf0 [ 1610.168320] [] ? insert_kthread_work+0x40/0x40 [ 1610.168320] [] ret_from_fork+0x7c/0xb0 [ 1610.168320] [] ? insert_kthread_work+0x40/0x40 [ 1610.168320] Code: 78 02 e9 e7 fc ff ff 31 c0 31 d2 31 c9 66 89 45 ce 41 8b 04 24 66 89 55 d0 66 89 4d d2 48 8d 04 80 49 8d 5c 84 04 e9 37 fd ff ff <0f> 0b 90 0f 1f 44 00 00 55 8b 56 08 c7 07 00 00 00 00 8b 46 0c [ 1610.168320] RIP [] _posix_to_nfsv4_one+0x3cd/0x3d0 [nfsd] [ 1610.168320] RSP [ 1610.257313] ---[ end trace 838254e3e352285b ]--- Signed-off-by: Kinglong Mee Cc: stable@vger.kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4acl.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 6f3f392d48af..b6f46013dddf 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -402,8 +402,10 @@ sort_pacl(struct posix_acl *pacl) * by uid/gid. */ int i, j; - if (pacl->a_count <= 4) - return; /* no users or groups */ + /* no users or groups */ + if (!pacl || pacl->a_count <= 4) + return; + i = 1; while (pacl->a_entries[i].e_tag == ACL_USER) i++; @@ -530,13 +532,12 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) /* * ACLs with no ACEs are treated differently in the inheritable - * and effective cases: when there are no inheritable ACEs, we - * set a zero-length default posix acl: + * and effective cases: when there are no inheritable ACEs, + * calls ->set_acl with a NULL ACL structure. */ - if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) { - pacl = posix_acl_alloc(0, GFP_KERNEL); - return pacl ? pacl : ERR_PTR(-ENOMEM); - } + if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) + return NULL; + /* * When there are no effective ACEs, the following will end * up setting a 3-element effective posix ACL with all -- cgit v1.2.3 From cf01f4eef9fe367ec0d85b38dd7214e29e376cdb Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 9 May 2014 11:41:54 -0400 Subject: locks: only validate the lock vs. f_mode in F_SETLK codepaths v2: replace missing break in switch statement (as pointed out by Dave Jones) commit bce7560d4946 (locks: consolidate checks for compatible filp->f_mode values in setlk handlers) introduced a regression in the F_GETLK handler. flock64_to_posix_lock is a shared codepath between F_GETLK and F_SETLK, but the f_mode checks should only be applicable to the F_SETLK codepaths according to POSIX. Instead of just reverting the patch, add a new function to do this checking and have the F_SETLK handlers call it. Cc: Dave Jones Reported-and-Tested-by: Reuben Farrelly Signed-off-by: Jeff Layton --- fs/locks.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index e663aeac579e..e390bd9ae068 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -389,18 +389,6 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl, fl->fl_ops = NULL; fl->fl_lmops = NULL; - /* Ensure that fl->fl_filp has compatible f_mode */ - switch (l->l_type) { - case F_RDLCK: - if (!(filp->f_mode & FMODE_READ)) - return -EBADF; - break; - case F_WRLCK: - if (!(filp->f_mode & FMODE_WRITE)) - return -EBADF; - break; - } - return assign_type(fl, l->l_type); } @@ -2034,6 +2022,22 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd, return error; } +/* Ensure that fl->fl_filp has compatible f_mode for F_SETLK calls */ +static int +check_fmode_for_setlk(struct file_lock *fl) +{ + switch (fl->fl_type) { + case F_RDLCK: + if (!(fl->fl_file->f_mode & FMODE_READ)) + return -EBADF; + break; + case F_WRLCK: + if (!(fl->fl_file->f_mode & FMODE_WRITE)) + return -EBADF; + } + return 0; +} + /* Apply the lock described by l to an open file descriptor. * This implements both the F_SETLK and F_SETLKW commands of fcntl(). */ @@ -2071,6 +2075,10 @@ again: if (error) goto out; + error = check_fmode_for_setlk(file_lock); + if (error) + goto out; + /* * If the cmd is requesting file-private locks, then set the * FL_OFDLCK flag and override the owner. @@ -2206,6 +2214,10 @@ again: if (error) goto out; + error = check_fmode_for_setlk(file_lock); + if (error) + goto out; + /* * If the cmd is requesting file-private locks, then set the * FL_OFDLCK flag and override the owner. -- cgit v1.2.3 From 555724a831b4a146e7bdf16ecc989cda032b076d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 12 May 2014 13:56:27 -0400 Subject: kernfs, sysfs, cgroup: restrict extra perm check on open to sysfs The kernfs open method - kernfs_fop_open() - inherited extra permission checks from sysfs. While the vfs layer allows ignoring the read/write permissions checks if the issuer has CAP_DAC_OVERRIDE, sysfs explicitly denied open regardless of the cap if the file doesn't have any of the UGO perms of the requested access or doesn't implement the requested operation. It can be debated whether this was a good idea or not but the behavior is too subtle and dangerous to change at this point. After cgroup got converted to kernfs, this extra perm check also got applied to cgroup breaking libcgroup which opens write-only files with O_RDWR as root. This patch gates the extra open permission check with a new flag KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK and enables it for sysfs. For sysfs, nothing changes. For cgroup, root now can perform any operation regardless of the permissions as it was before kernfs conversion. Note that kernfs still fails unimplemented operations with -EINVAL. While at it, add comments explaining KERNFS_ROOT flags. Signed-off-by: Tejun Heo Reported-by: Andrey Wagin Tested-by: Andrey Wagin Cc: Li Zefan References: http://lkml.kernel.org/g/CANaxB-xUm3rJ-Cbp72q-rQJO5mZe1qK6qXsQM=vh0U8upJ44+A@mail.gmail.com Fixes: 2bd59d48ebfb ("cgroup: convert to kernfs") Signed-off-by: Greg Kroah-Hartman --- fs/kernfs/file.c | 17 ++++++++++------- fs/sysfs/mount.c | 3 ++- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index e01ea4a14a01..5e9a80cfc3d8 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -610,6 +610,7 @@ static void kernfs_put_open_node(struct kernfs_node *kn, static int kernfs_fop_open(struct inode *inode, struct file *file) { struct kernfs_node *kn = file->f_path.dentry->d_fsdata; + struct kernfs_root *root = kernfs_root(kn); const struct kernfs_ops *ops; struct kernfs_open_file *of; bool has_read, has_write, has_mmap; @@ -624,14 +625,16 @@ static int kernfs_fop_open(struct inode *inode, struct file *file) has_write = ops->write || ops->mmap; has_mmap = ops->mmap; - /* check perms and supported operations */ - if ((file->f_mode & FMODE_WRITE) && - (!(inode->i_mode & S_IWUGO) || !has_write)) - goto err_out; + /* see the flag definition for details */ + if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) { + if ((file->f_mode & FMODE_WRITE) && + (!(inode->i_mode & S_IWUGO) || !has_write)) + goto err_out; - if ((file->f_mode & FMODE_READ) && - (!(inode->i_mode & S_IRUGO) || !has_read)) - goto err_out; + if ((file->f_mode & FMODE_READ) && + (!(inode->i_mode & S_IRUGO) || !has_read)) + goto err_out; + } /* allocate a kernfs_open_file for the file */ error = -ENOMEM; diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index a66ad6196f59..8794423f7efb 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -63,7 +63,8 @@ int __init sysfs_init(void) { int err; - sysfs_root = kernfs_create_root(NULL, 0, NULL); + sysfs_root = kernfs_create_root(NULL, KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, + NULL); if (IS_ERR(sysfs_root)) return PTR_ERR(sysfs_root); -- cgit v1.2.3 From d71f290b4e98a39f49f2595a13be3b4d5ce8e1f1 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 13 May 2014 23:58:24 +0100 Subject: metag: Reduce maximum stack size to 256MB Specify the maximum stack size for arches where the stack grows upward (parisc and metag) in asm/processor.h rather than hard coding in fs/exec.c so that metag can specify a smaller value of 256MB rather than 1GB. This fixes a BUG on metag if the RLIMIT_STACK hard limit is increased beyond a safe value by root. E.g. when starting a process after running "ulimit -H -s unlimited" it will then attempt to use a stack size of the maximum 1GB which is far too big for metag's limited user virtual address space (stack_top is usually 0x3ffff000): BUG: failure at fs/exec.c:589/shift_arg_pages()! Signed-off-by: James Hogan Cc: Helge Deller Cc: "James E.J. Bottomley" Cc: linux-parisc@vger.kernel.org Cc: linux-metag@vger.kernel.org Cc: John David Anglin Cc: stable@vger.kernel.org # only needed for >= v3.9 (arch/metag) --- fs/exec.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 476f3ebf437e..238b7aa26f68 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -657,10 +657,10 @@ int setup_arg_pages(struct linux_binprm *bprm, unsigned long rlim_stack; #ifdef CONFIG_STACK_GROWSUP - /* Limit stack size to 1GB */ + /* Limit stack size */ stack_base = rlimit_max(RLIMIT_STACK); - if (stack_base > (1 << 30)) - stack_base = 1 << 30; + if (stack_base > STACK_SIZE_MAX) + stack_base = STACK_SIZE_MAX; /* Make sure we didn't let the argument array grow too large. */ if (vma->vm_end - vma->vm_start > stack_base) -- cgit v1.2.3 From 43ec1460a2189fbee87980dd3d3e64cba2f11e1f Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 15 May 2014 09:21:11 +1000 Subject: xfs: xfs_dir_fsync() returns positive errno And it should be negative. Signed-off-by: Dave Chinner Reviewed-by: Jie Liu Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 951a2321ee01..f499e47d44d8 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -155,7 +155,7 @@ xfs_dir_fsync( if (!lsn) return 0; - return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); } STATIC int -- cgit v1.2.3 From 8ff1e6705a4c711247708f23881feea169e6fc3c Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 15 May 2014 09:21:37 +1000 Subject: xfs: fix incorrect error sign in xfs_file_aio_read Signed-off-by: Dave Chinner Reviewed-by: Jie Liu Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f499e47d44d8..37f98c6d81aa 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -295,7 +295,7 @@ xfs_file_aio_read( xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); if (inode->i_mapping->nrpages) { - ret = -filemap_write_and_wait_range( + ret = filemap_write_and_wait_range( VFS_I(ip)->i_mapping, pos, -1); if (ret) { -- cgit v1.2.3 From b38a134b22fbd0bed90f3e079bbf8cb2962a52be Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 15 May 2014 09:21:52 +1000 Subject: xfs: xfs_commit_metadata returns wrong errno Invert it. Signed-off-by: Dave Chinner Reviewed-by: Jie Liu Signed-off-by: Dave Chinner --- fs/xfs/xfs_export.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 1399e187d425..753e467aa1a5 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -237,7 +237,7 @@ xfs_fs_nfs_commit_metadata( if (!lsn) return 0; - return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); + return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL); } const struct export_operations xfs_export_operations = { -- cgit v1.2.3 From 65149e3fab7f053396d09a429085f3071fa50825 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 15 May 2014 09:22:07 +1000 Subject: xfs: correct error sign on COLLAPSE_RANGE errors Signed-off-by: Dave Chinner Reviewed-by: Jie Liu Signed-off-by: Dave Chinner --- fs/xfs/xfs_file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 37f98c6d81aa..830c1c937b88 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -837,7 +837,7 @@ xfs_file_fallocate( unsigned blksize_mask = (1 << inode->i_blkbits) - 1; if (offset & blksize_mask || len & blksize_mask) { - error = -EINVAL; + error = EINVAL; goto out_unlock; } @@ -846,7 +846,7 @@ xfs_file_fallocate( * in which case it is effectively a truncate operation */ if (offset + len >= i_size_read(inode)) { - error = -EINVAL; + error = EINVAL; goto out_unlock; } -- cgit v1.2.3 From a5a14de22e8afd771775c7106b3b081c23bac783 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 15 May 2014 09:22:21 +1000 Subject: xfs: fix wrong errno from xfs_initxattrs Signed-off-by: Dave Chinner Reviewed-by: Jie Liu Signed-off-by: Dave Chinner --- fs/xfs/xfs_iops.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 301ecbfcc0be..adfb18e19992 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -72,8 +72,8 @@ xfs_initxattrs( int error = 0; for (xattr = xattr_array; xattr->name != NULL; xattr++) { - error = xfs_attr_set(ip, xattr->name, xattr->value, - xattr->value_len, ATTR_SECURE); + error = -xfs_attr_set(ip, xattr->name, xattr->value, + xattr->value_len, ATTR_SECURE); if (error < 0) break; } @@ -93,8 +93,8 @@ xfs_init_security( struct inode *dir, const struct qstr *qstr) { - return security_inode_init_security(inode, dir, qstr, - &xfs_initxattrs, NULL); + return -security_inode_init_security(inode, dir, qstr, + &xfs_initxattrs, NULL); } static void -- cgit v1.2.3 From 6670232b48848e5ad949dbd4866850aec7035f32 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 15 May 2014 09:22:37 +1000 Subject: xfs: fix wrong err sign on xfs_set_acl() Signed-off-by: Dave Chinner Reviewed-by: Jie Liu Signed-off-by: Dave Chinner --- fs/xfs/xfs_iops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index adfb18e19992..36d630319a27 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -173,12 +173,12 @@ xfs_generic_create( #ifdef CONFIG_XFS_POSIX_ACL if (default_acl) { - error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); + error = -xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); if (error) goto out_cleanup_inode; } if (acl) { - error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); + error = -xfs_set_acl(inode, acl, ACL_TYPE_ACCESS); if (error) goto out_cleanup_inode; } -- cgit v1.2.3 From 45687642e43d5a3b700d6c1df0e07b3976c90c08 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 15 May 2014 09:22:53 +1000 Subject: xfs: negate mount workqueue init error value Signed-off-by: Dave Chinner Reviewed-by: Jie Liu Signed-off-by: Dave Chinner --- fs/xfs/xfs_super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 205376776377..e1597f2e58da 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1433,7 +1433,7 @@ xfs_fs_fill_super( if (error) goto out_free_fsname; - error = xfs_init_mount_workqueues(mp); + error = -xfs_init_mount_workqueues(mp); if (error) goto out_close_devices; -- cgit v1.2.3 From bc147822d5ada188fff792691efffe89589e2e19 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 15 May 2014 09:23:07 +1000 Subject: xfs: negate xfs_icsb_init_counters error value Signed-off-by: Dave Chinner Reviewed-by: Jie Liu Signed-off-by: Dave Chinner --- fs/xfs/xfs_super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index e1597f2e58da..3494eff8e4eb 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1437,7 +1437,7 @@ xfs_fs_fill_super( if (error) goto out_close_devices; - error = xfs_icsb_init_counters(mp); + error = -xfs_icsb_init_counters(mp); if (error) goto out_destroy_workqueues; -- cgit v1.2.3 From ee4eec478be4677b93775d17bc079efb5922b276 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 15 May 2014 09:23:24 +1000 Subject: xfs: list_lru_init returns a negative error And we don't invert it properly when initialising the dquot lru list. Signed-off-by: Dave Chinner Reviewed-by: Jie Liu Signed-off-by: Dave Chinner --- fs/xfs/xfs_qm.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 348e4d2ed6e6..dc977b6e6a36 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -843,22 +843,17 @@ xfs_qm_init_quotainfo( qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP); - if ((error = list_lru_init(&qinf->qi_lru))) { - kmem_free(qinf); - mp->m_quotainfo = NULL; - return error; - } + error = -list_lru_init(&qinf->qi_lru); + if (error) + goto out_free_qinf; /* * See if quotainodes are setup, and if not, allocate them, * and change the superblock accordingly. */ - if ((error = xfs_qm_init_quotainos(mp))) { - list_lru_destroy(&qinf->qi_lru); - kmem_free(qinf); - mp->m_quotainfo = NULL; - return error; - } + error = xfs_qm_init_quotainos(mp); + if (error) + goto out_free_lru; INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS); INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS); @@ -918,7 +913,7 @@ xfs_qm_init_quotainfo( qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); - + xfs_qm_dqdestroy(dqp); } else { qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; @@ -935,6 +930,13 @@ xfs_qm_init_quotainfo( qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; register_shrinker(&qinf->qi_shrinker); return 0; + +out_free_lru: + list_lru_destroy(&qinf->qi_lru); +out_free_qinf: + kmem_free(qinf); + mp->m_quotainfo = NULL; + return error; } -- cgit v1.2.3 From f5c16f29bf5e57ba4051fc7785ba7f035f798c71 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 19 May 2014 15:52:10 -0400 Subject: sysfs: make sure read buffer is zeroed 13c589d5b0ac ("sysfs: use seq_file when reading regular files") switched sysfs from custom read implementation to seq_file to enable later transition to kernfs. After the change, the buffer passed to ->show() is acquired through seq_get_buf(); unfortunately, this introduces a subtle behavior change. Before the commit, the buffer passed to ->show() was always zero as it was allocated using get_zeroed_page(). Because seq_file doesn't clear buffers on allocation and neither does seq_get_buf(), after the commit, depending on the behavior of ->show(), we may end up exposing uninitialized data to userland thus possibly altering userland visible behavior and leaking information. Fix it by explicitly clearing the buffer. Signed-off-by: Tejun Heo Reported-by: Ron Fixes: 13c589d5b0ac ("sysfs: use seq_file when reading regular files") Cc: stable # 3.13+ Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 28cc1acd5439..e9ef59b3abb1 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -47,12 +47,13 @@ static int sysfs_kf_seq_show(struct seq_file *sf, void *v) ssize_t count; char *buf; - /* acquire buffer and ensure that it's >= PAGE_SIZE */ + /* acquire buffer and ensure that it's >= PAGE_SIZE and clear */ count = seq_get_buf(sf, &buf); if (count < PAGE_SIZE) { seq_commit(sf, -1); return 0; } + memset(buf, 0, PAGE_SIZE); /* * Invoke show(). Control may reach here via seq file lseek even -- cgit v1.2.3 From d3ecfcdf9108c833e4e501bfa02ecf673a0ace59 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 9 May 2014 10:01:02 +0800 Subject: Btrfs: fix EIO on reading file after ioctl clone works on it For inline data extent, we need to make its length aligned, otherwise, we can get a phantom extent map which confuses readpages() to return -EIO. This can be detected by xfstests/btrfs/035. Reported-by: David Disseldorp Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7b001abc73c7..e1b47ef380d6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3142,6 +3142,8 @@ process_slot: } else if (type == BTRFS_FILE_EXTENT_INLINE) { u64 skip = 0; u64 trim = 0; + u64 aligned_end = 0; + if (off > key.offset) { skip = off - key.offset; new_key.offset += skip; @@ -3158,9 +3160,11 @@ process_slot: size -= skip + trim; datal -= skip + trim; + aligned_end = ALIGN(new_key.offset + datal, + root->sectorsize); ret = btrfs_drop_extents(trans, root, inode, new_key.offset, - new_key.offset + datal, + aligned_end, 1); if (ret) { if (ret != -EOPNOTSUPP) -- cgit v1.2.3 From 51a60253a58514524b7a347c4e68553821a79d04 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 13 May 2014 22:01:02 +0100 Subject: Btrfs: send, fix incorrect ref access when using extrefs When running send, if an inode only has extended reference items associated to it and no regular references, send.c:get_first_ref() was incorrectly assuming the reference it found was of type BTRFS_INODE_REF_KEY due to use of the wrong key variable. This caused weird behaviour when using the found item has a regular reference, such as weird path string, and occasionally (when lucky) a crash: [ 190.600652] general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC [ 190.600994] Modules linked in: btrfs xor raid6_pq binfmt_misc nfsd auth_rpcgss oid_registry nfs_acl nfs lockd fscache sunrpc psmouse serio_raw evbug pcspkr i2c_piix4 e1000 floppy [ 190.602565] CPU: 2 PID: 14520 Comm: btrfs Not tainted 3.13.0-fdm-btrfs-next-26+ #1 [ 190.602728] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 190.602868] task: ffff8800d447c920 ti: ffff8801fa79e000 task.ti: ffff8801fa79e000 [ 190.603030] RIP: 0010:[] [] memcpy+0x54/0x110 [ 190.603262] RSP: 0018:ffff8801fa79f880 EFLAGS: 00010202 [ 190.603395] RAX: ffff8800d4326e3f RBX: 000000000000036a RCX: ffff880000000000 [ 190.603553] RDX: 000000000000032a RSI: ffe708844042936a RDI: ffff8800d43271a9 [ 190.603710] RBP: ffff8801fa79f8c8 R08: 00000000003a4ef0 R09: 0000000000000000 [ 190.603867] R10: 793a4ef09f000000 R11: 9f0000000053726f R12: ffff8800d43271a9 [ 190.604020] R13: 0000160000000000 R14: ffff8802110134f0 R15: 000000000000036a [ 190.604020] FS: 00007fb423d09b80(0000) GS:ffff880216200000(0000) knlGS:0000000000000000 [ 190.604020] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 190.604020] CR2: 00007fb4229d4b78 CR3: 00000001f5d76000 CR4: 00000000000006e0 [ 190.604020] Stack: [ 190.604020] ffffffffa01f4d49 ffff8801fa79f8f0 00000000000009f9 ffff8801fa79f8c8 [ 190.604020] 00000000000009f9 ffff880211013260 000000000000f971 ffff88021147dba8 [ 190.604020] 00000000000009f9 ffff8801fa79f918 ffffffffa02367f5 ffff8801fa79f928 [ 190.604020] Call Trace: [ 190.604020] [] ? read_extent_buffer+0xb9/0x120 [btrfs] [ 190.604020] [] fs_path_add_from_extent_buffer+0x45/0x60 [btrfs] [ 190.604020] [] get_first_ref+0x1f6/0x210 [btrfs] [ 190.604020] [] __get_cur_name_and_parent+0x174/0x3a0 [btrfs] [ 190.604020] [] ? kmem_cache_alloc_trace+0x11d/0x1e0 [ 190.604020] [] ? fs_path_alloc+0x24/0x60 [btrfs] [ 190.604020] [] get_cur_path+0xd1/0x240 [btrfs] (...) Steps to reproduce (either crash or some weirdness like an odd path string): mkfs.btrfs -f -O extref /dev/sdd mount /dev/sdd /mnt mkdir /mnt/testdir touch /mnt/testdir/foobar for i in `seq 1 2550`; do ln /mnt/testdir/foobar /mnt/testdir/foobar_link_`printf "%04d" $i` done ln /mnt/testdir/foobar /mnt/testdir/final_foobar_name rm -f /mnt/testdir/foobar for i in `seq 1 2550`; do rm -f /mnt/testdir/foobar_link_`printf "%04d" $i` done btrfs subvolume snapshot -r /mnt /mnt/mysnap btrfs send /mnt/mysnap -f /tmp/mysnap.send Signed-off-by: Filipe David Borba Manana Signed-off-by: Chris Mason Reviewed-by: Liu Bo --- fs/btrfs/send.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index eb6537a08c1b..fd38b5053479 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1668,7 +1668,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, goto out; } - if (key.type == BTRFS_INODE_REF_KEY) { + if (found_key.type == BTRFS_INODE_REF_KEY) { struct btrfs_inode_ref *iref; iref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); -- cgit v1.2.3