From ac45d61357e86b9a0cf14e45e8e09dfb626970ef Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 5 Mar 2012 15:48:11 +0100 Subject: fuse: fix nlink after unlink Anand Avati reports that the following sequence of system calls fail on a fuse filesystem: create("filename") => 0 link("filename", "linkname") => 0 unlink("filename") => 0 link("linkname", "filename") => -ENOENT ### BUG ### vfs_link() fails with ENOENT if i_nlink is zero, this is done to prevent resurrecting already deleted files. Fuse clears i_nlink on unlink even if there are other links pointing to the file. Reported-by: Anand Avati Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 206632887bb4..da379070fab5 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -644,13 +644,12 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) fuse_put_request(fc, req); if (!err) { struct inode *inode = entry->d_inode; + struct fuse_inode *fi = get_fuse_inode(inode); - /* - * Set nlink to zero so the inode can be cleared, if the inode - * does have more links this will be discovered at the next - * lookup/getattr. - */ - clear_nlink(inode); + spin_lock(&fc->lock); + fi->attr_version = ++fc->attr_version; + drop_nlink(inode); + spin_unlock(&fc->lock); fuse_invalidate_attr(inode); fuse_invalidate_attr(dir); fuse_invalidate_entry_cache(entry); @@ -762,8 +761,17 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, will reflect changes in the backing inode (link count, etc.) */ - if (!err || err == -EINTR) + if (!err) { + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + fi->attr_version = ++fc->attr_version; + inc_nlink(inode); + spin_unlock(&fc->lock); fuse_invalidate_attr(inode); + } else if (err == -EINTR) { + fuse_invalidate_attr(inode); + } return err; } -- cgit v1.2.3 From 4273b793ec68753cc3fcf5be7cbfd88c2be2058d Mon Sep 17 00:00:00 2001 From: Anand Avati Date: Fri, 17 Feb 2012 12:46:25 -0500 Subject: fuse: O_DIRECT support for files Implement ->direct_IO() method in aops. The ->direct_IO() method combines the existing fuse_direct_read/fuse_direct_write methods to implement O_DIRECT functionality. Reaching ->direct_IO() in the read path via generic_file_aio_read ensures proper synchronization with page cache with its existing framework. Reaching ->direct_IO() in the write path via fuse_file_aio_write is made to come via generic_file_direct_write() which makes it play nice with the page cache w.r.t other mmap pages etc. On files marked 'direct_io' by the filesystem server, IO always follows the fuse_direct_read/write path. There is no effect of fcntl(O_DIRECT) and it always succeeds. On files not marked with 'direct_io' by the filesystem server, the IO path depends on O_DIRECT flag by the application. This can be passed at the time of open() as well as via fcntl(). Note that asynchronous O_DIRECT iocb jobs are completed synchronously always (this has been the case with FUSE even before this patch) Signed-off-by: Anand Avati Reviewed-by: Jeff Moyer Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 3 -- fs/fuse/file.c | 129 +++++++++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 112 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index da379070fab5..df5ac048dc74 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -387,9 +387,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, if (fc->no_create) return -ENOSYS; - if (flags & O_DIRECT) - return -EINVAL; - forget = fuse_alloc_forget(); if (!forget) return -ENOMEM; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4a199fd93fbd..8ca007d09c96 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -194,10 +194,6 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) struct fuse_conn *fc = get_fuse_conn(inode); int err; - /* VFS checks this, but only _after_ ->open() */ - if (file->f_flags & O_DIRECT) - return -EINVAL; - err = generic_file_open(inode, file); if (err) return err; @@ -932,17 +928,23 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; size_t count = 0; + size_t ocount = 0; ssize_t written = 0; + ssize_t written_buffered = 0; struct inode *inode = mapping->host; ssize_t err; struct iov_iter i; + loff_t endbyte = 0; WARN_ON(iocb->ki_pos != pos); - err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); + ocount = 0; + err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); if (err) return err; + count = ocount; + mutex_lock(&inode->i_mutex); vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); @@ -962,11 +964,41 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, file_update_time(file); - iov_iter_init(&i, iov, nr_segs, count, 0); - written = fuse_perform_write(file, mapping, &i, pos); - if (written >= 0) - iocb->ki_pos = pos + written; + if (file->f_flags & O_DIRECT) { + written = generic_file_direct_write(iocb, iov, &nr_segs, + pos, &iocb->ki_pos, + count, ocount); + if (written < 0 || written == count) + goto out; + + pos += written; + count -= written; + iov_iter_init(&i, iov, nr_segs, count, written); + written_buffered = fuse_perform_write(file, mapping, &i, pos); + if (written_buffered < 0) { + err = written_buffered; + goto out; + } + endbyte = pos + written_buffered - 1; + + err = filemap_write_and_wait_range(file->f_mapping, pos, + endbyte); + if (err) + goto out; + + invalidate_mapping_pages(file->f_mapping, + pos >> PAGE_CACHE_SHIFT, + endbyte >> PAGE_CACHE_SHIFT); + + written += written_buffered; + iocb->ki_pos = pos + written_buffered; + } else { + iov_iter_init(&i, iov, nr_segs, count, 0); + written = fuse_perform_write(file, mapping, &i, pos); + if (written >= 0) + iocb->ki_pos = pos + written; + } out: current->backing_dev_info = NULL; mutex_unlock(&inode->i_mutex); @@ -1101,30 +1133,41 @@ static ssize_t fuse_direct_read(struct file *file, char __user *buf, return res; } -static ssize_t fuse_direct_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t __fuse_direct_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) { struct inode *inode = file->f_path.dentry->d_inode; ssize_t res; - if (is_bad_inode(inode)) - return -EIO; - - /* Don't allow parallel writes to the same file */ - mutex_lock(&inode->i_mutex); res = generic_write_checks(file, ppos, &count, 0); if (!res) { res = fuse_direct_io(file, buf, count, ppos, 1); if (res > 0) fuse_write_update_size(inode, *ppos); } - mutex_unlock(&inode->i_mutex); fuse_invalidate_attr(inode); return res; } +static ssize_t fuse_direct_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct inode *inode = file->f_path.dentry->d_inode; + ssize_t res; + + if (is_bad_inode(inode)) + return -EIO; + + /* Don't allow parallel writes to the same file */ + mutex_lock(&inode->i_mutex); + res = __fuse_direct_write(file, buf, count, ppos); + mutex_unlock(&inode->i_mutex); + + return res; +} + static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) { __free_page(req->pages[0]); @@ -2077,6 +2120,57 @@ int fuse_notify_poll_wakeup(struct fuse_conn *fc, return 0; } +static ssize_t fuse_loop_dio(struct file *filp, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos, int rw) +{ + const struct iovec *vector = iov; + ssize_t ret = 0; + + while (nr_segs > 0) { + void __user *base; + size_t len; + ssize_t nr; + + base = vector->iov_base; + len = vector->iov_len; + vector++; + nr_segs--; + + if (rw == WRITE) + nr = __fuse_direct_write(filp, base, len, ppos); + else + nr = fuse_direct_read(filp, base, len, ppos); + + if (nr < 0) { + if (!ret) + ret = nr; + break; + } + ret += nr; + if (nr != len) + break; + } + + return ret; +} + + +static ssize_t +fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, + loff_t offset, unsigned long nr_segs) +{ + ssize_t ret = 0; + struct file *file = NULL; + loff_t pos = 0; + + file = iocb->ki_filp; + pos = offset; + + ret = fuse_loop_dio(file, iov, nr_segs, &pos, rw); + + return ret; +} + static const struct file_operations fuse_file_operations = { .llseek = fuse_file_llseek, .read = do_sync_read, @@ -2120,6 +2214,7 @@ static const struct address_space_operations fuse_file_aops = { .readpages = fuse_readpages, .set_page_dirty = __set_page_dirty_nobuffers, .bmap = fuse_bmap, + .direct_IO = fuse_direct_IO, }; void fuse_init_file_inode(struct inode *inode) -- cgit v1.2.3 From c1ac539ed43f273cd4d92bf7350ffd783b920184 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Thu, 22 Mar 2012 08:58:30 -0400 Subject: GFS2: put glock reference in error patch of read_rindex_entry This patch fixes the error path of function read_rindex_entry so that it correctly gives up its glock reference in cases where there is a race to re-read the rindex after gfs2_grow. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/rgrp.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 19bde40b4864..19354a20e5b1 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -640,6 +640,7 @@ static int read_rindex_entry(struct gfs2_inode *ip, return 0; error = 0; /* someone else read in the rgrp; free it and ignore it */ + gfs2_glock_put(rgd->rd_gl); fail: kfree(rgd->rd_bits); -- cgit v1.2.3 From 97cc008aaa8c1f02699b478ca890e81810244131 Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Fri, 23 Mar 2012 18:06:18 -0400 Subject: GFS2: use depends instead of select in kconfig Avoids having to duplicate the dependencies of what is 'select'ed (and on down...) Those dependencies are currently incomplete, leading to broken builds with GFS2_FS_LOCKING_DLM=y and IP_SCTP=n. Signed-off-by: Benjamin Poirier Signed-off-by: Steven Whitehouse --- fs/gfs2/Kconfig | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index c465ae066c62..eb08c9e43c2a 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -1,10 +1,6 @@ config GFS2_FS tristate "GFS2 file system support" depends on (64BIT || LBDAF) - select DLM if GFS2_FS_LOCKING_DLM - select CONFIGFS_FS if GFS2_FS_LOCKING_DLM - select SYSFS if GFS2_FS_LOCKING_DLM - select IP_SCTP if DLM_SCTP select FS_POSIX_ACL select CRC32 select QUOTACTL @@ -29,7 +25,8 @@ config GFS2_FS config GFS2_FS_LOCKING_DLM bool "GFS2 DLM locking" - depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && HOTPLUG + depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && \ + HOTPLUG && DLM && CONFIGFS_FS && SYSFS help Multiple node locking module for GFS2 -- cgit v1.2.3 From cb85a6ed67e979c59a29b7b4e8217e755b951cf4 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 30 Mar 2012 12:23:08 +0200 Subject: proc: stats: Use arch_idle_time for idle and iowait times if available Git commit a25cac5198d4ff28 "proc: Consider NO_HZ when printing idle and iowait times" changes the code for /proc/stat to use get_cpu_idle_time_us and get_cpu_iowait_time_us if the system is running with nohz enabled. For architectures which define arch_idle_time (currently s390 only) this is a change for the worse. The result of arch_idle_time is supposed to be the exact sleep time of the target cpu and should be used instead of the value kept by the scheduler. Signed-off-by: Martin Schwidefsky Reviewed-by: Michal Hocko Reviewed-by: Srivatsa S. Bhat Link: http://lkml.kernel.org/r/20120330122308.18720283@de.ibm.com Signed-off-by: Thomas Gleixner --- fs/proc/stat.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 6a0c62d6e442..64c3b3172367 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -18,19 +18,39 @@ #ifndef arch_irq_stat #define arch_irq_stat() 0 #endif -#ifndef arch_idle_time -#define arch_idle_time(cpu) 0 -#endif + +#ifdef arch_idle_time + +static cputime64_t get_idle_time(int cpu) +{ + cputime64_t idle; + + idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; + if (cpu_online(cpu) && !nr_iowait_cpu(cpu)) + idle += arch_idle_time(cpu); + return idle; +} + +static cputime64_t get_iowait_time(int cpu) +{ + cputime64_t iowait; + + iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT]; + if (cpu_online(cpu) && nr_iowait_cpu(cpu)) + iowait += arch_idle_time(cpu); + return iowait; +} + +#else static u64 get_idle_time(int cpu) { u64 idle, idle_time = get_cpu_idle_time_us(cpu, NULL); - if (idle_time == -1ULL) { + if (idle_time == -1ULL) /* !NO_HZ so we can rely on cpustat.idle */ idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE]; - idle += arch_idle_time(cpu); - } else + else idle = usecs_to_cputime64(idle_time); return idle; @@ -49,6 +69,8 @@ static u64 get_iowait_time(int cpu) return iowait; } +#endif + static int show_stat(struct seq_file *p, void *v) { int i, j; -- cgit v1.2.3 From 5e2f7d617b574dadf3ad125e4821ce1b180b1626 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 4 Apr 2012 22:11:16 -0400 Subject: GFS2: Make sure rindex is uptodate before starting transactions This patch removes the call from gfs2_blk2rgrd to function gfs2_rindex_update and replaces it with individual calls. The former way turned out to be too problematic. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 6 +++++- fs/gfs2/dir.c | 4 ++++ fs/gfs2/inode.c | 13 +++++++++++-- fs/gfs2/rgrp.c | 7 ++++--- fs/gfs2/xattr.c | 12 ++++++++++++ 5 files changed, 36 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 197c5c47e577..03c04febe26f 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -724,7 +724,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, int metadata; unsigned int revokes = 0; int x; - int error = 0; + int error; + + error = gfs2_rindex_update(sdp); + if (error) + return error; if (!*top) sm->sm_first = 0; diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index c35573abd371..a836056343f0 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -1844,6 +1844,10 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, unsigned int x, size = len * sizeof(u64); int error; + error = gfs2_rindex_update(sdp); + if (error) + return error; + memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); ht = kzalloc(size, GFP_NOFS); diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index c98a60ee6dfd..a9ba2444e077 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1031,7 +1031,13 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) struct buffer_head *bh; struct gfs2_holder ghs[3]; struct gfs2_rgrpd *rgd; - int error = -EROFS; + int error; + + error = gfs2_rindex_update(sdp); + if (error) + return error; + + error = -EROFS; gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); @@ -1224,6 +1230,10 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, return 0; } + error = gfs2_rindex_update(sdp); + if (error) + return error; + if (odip != ndip) { error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, 0, &r_gh); @@ -1345,7 +1355,6 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, error = alloc_required; if (error < 0) goto out_gunlock; - error = 0; if (alloc_required) { struct gfs2_qadata *qa = gfs2_qadata_get(ndip); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 19354a20e5b1..3df65c9ab73b 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -332,9 +332,6 @@ struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact) struct rb_node *n, *next; struct gfs2_rgrpd *cur; - if (gfs2_rindex_update(sdp)) - return NULL; - spin_lock(&sdp->sd_rindex_spin); n = sdp->sd_rindex_tree.rb_node; while (n) { @@ -928,6 +925,10 @@ int gfs2_fitrim(struct file *filp, void __user *argp) } else if (copy_from_user(&r, argp, sizeof(r))) return -EFAULT; + ret = gfs2_rindex_update(sdp); + if (ret) + return ret; + rgd = gfs2_blk2rgrpd(sdp, r.start, 0); rgd_end = gfs2_blk2rgrpd(sdp, r.start + r.len, 0); diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 2e5ba425cae7..927f4df874ae 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -238,6 +238,10 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, unsigned int x; int error; + error = gfs2_rindex_update(sdp); + if (error) + return error; + if (GFS2_EA_IS_STUFFED(ea)) return 0; @@ -1330,6 +1334,10 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) unsigned int x; int error; + error = gfs2_rindex_update(sdp); + if (error) + return error; + memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh); @@ -1439,6 +1447,10 @@ static int ea_dealloc_block(struct gfs2_inode *ip) struct gfs2_holder gh; int error; + error = gfs2_rindex_update(sdp); + if (error) + return error; + rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr, 1); if (!rgd) { gfs2_consist_inode(ip); -- cgit v1.2.3 From 640946f20390e492694f9d7470656f2262385951 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 2 Apr 2012 19:22:25 -0400 Subject: dentry leak in simple_fill_super() failure exit d_genocide() does _not_ evict dentries; it just removes extra ref pinning each of those. Normally it's followed by shrinking the tree (it's done just before generic_shutdown_super() by kill_litter_super()), but in case of simple_fill_super() nothing of that kind will follow. Just do shrink_dcache_parent() manually. Signed-off-by: Al Viro --- fs/libfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index 358094f0433d..18d08f5db53a 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -529,6 +529,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic, return 0; out: d_genocide(root); + shrink_dcache_parent(root); dput(root); return -ENOMEM; } -- cgit v1.2.3 From 70fa4a62e913dde2d100e0be2711562742f58bee Mon Sep 17 00:00:00 2001 From: Tom Goff Date: Wed, 4 Apr 2012 12:06:20 -0700 Subject: sysfs: Update the name hash for an entry after changing the namespace This is needed to allow renaming network devices that have been moved to another network namespace. Signed-off-by: Tom Goff Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 2a7a3f5d1ca6..8ddc1021c79a 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -878,7 +878,6 @@ int sysfs_rename(struct sysfs_dirent *sd, dup_name = sd->s_name; sd->s_name = new_name; - sd->s_hash = sysfs_name_hash(sd->s_ns, sd->s_name); } /* Move to the appropriate place in the appropriate directories rbtree. */ @@ -886,6 +885,7 @@ int sysfs_rename(struct sysfs_dirent *sd, sysfs_get(new_parent_sd); sysfs_put(sd->s_parent); sd->s_ns = new_ns; + sd->s_hash = sysfs_name_hash(sd->s_ns, sd->s_name); sd->s_parent = new_parent_sd; sysfs_link_sibling(sd); -- cgit v1.2.3 From ca9248d8337d525c2d2b26a1d8314478d15707fb Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 10 Apr 2012 08:56:04 -0400 Subject: GFS2: Allow caching of rindex glock This patch allows caching of the rindex glock. We were previously setting the GL_NOCACHE bit when the glock was released. That forced the rindex inode to be invalidated, which caused us to re-read rindex at the next access. However, it caused the glock to be unnecessarily bounced around the cluster. This patch allows the glock to remain cached, but it still causes the rindex to be re-read once it has been written to by gfs2_grow. Ben and I have tested single-node gfs2_grow cases and I've tested clustered gfs2_grow cases on my four-node cluster. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/aops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 38b7a74a0f91..9b2ff0e851b1 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -807,7 +807,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, if (inode == sdp->sd_rindex) { adjust_fs_space(inode); - ip->i_gh.gh_flags |= GL_NOCACHE; + sdp->sd_rindex_uptodate = 0; } brelse(dibh); @@ -873,7 +873,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, if (inode == sdp->sd_rindex) { adjust_fs_space(inode); - ip->i_gh.gh_flags |= GL_NOCACHE; + sdp->sd_rindex_uptodate = 0; } brelse(dibh); -- cgit v1.2.3 From 5631f2c18f4b2845b3e97df1c659c5094a17605f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bruno=20Pr=C3=A9mont?= Date: Tue, 3 Apr 2012 09:59:48 +0200 Subject: sysfs: Prevent crash on unset sysfs group attributes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Do not let the kernel crash when a device is registered with sysfs while group attributes are not set (aka NULL). Warn about the offender with some information about the offending device. This would warn instead of trying NULL pointer deref like: BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] internal_create_group+0x83/0x1a0 PGD 0 Oops: 0000 [#1] SMP CPU 0 Modules linked in: Pid: 1, comm: swapper/0 Not tainted 3.4.0-rc1-x86_64 #3 HP ProLiant DL360 G4 RIP: 0010:[] [] internal_create_group+0x83/0x1a0 RSP: 0018:ffff88019485fd70 EFLAGS: 00010202 RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000001 RDX: ffff880192e99908 RSI: ffff880192e99630 RDI: ffffffff81a26c60 RBP: ffff88019485fdc0 R08: 0000000000000000 R09: 0000000000000000 R10: ffff880192e99908 R11: 0000000000000000 R12: ffffffff81a16a00 R13: ffff880192e99908 R14: ffffffff81a16900 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff88019bc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000000 CR3: 0000000001a0c000 CR4: 00000000000007f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process swapper/0 (pid: 1, threadinfo ffff88019485e000, task ffff880194878000) Stack: ffff88019485fdd0 ffff880192da9d60 0000000000000000 ffff880192e99908 ffff880192e995d8 0000000000000001 ffffffff81a16a00 ffff880192da9d60 0000000000000000 0000000000000000 ffff88019485fdd0 ffffffff811527be Call Trace: [] sysfs_create_group+0xe/0x10 [] device_add_groups+0x46/0x80 [] device_add+0x46d/0x6a0 ... Signed-off-by: Bruno Prémont Acked-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/group.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index dd1701caecc9..2df555c66d57 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -67,7 +67,11 @@ static int internal_create_group(struct kobject *kobj, int update, /* Updates may happen before the object has been instantiated */ if (unlikely(update && !kobj->sd)) return -EINVAL; - + if (!grp->attrs) { + WARN(1, "sysfs: attrs not set by subsystem for group: %s/%s\n", + kobj->name, grp->name ? "" : grp->name); + return -EINVAL; + } if (grp->name) { error = sysfs_create_subdir(kobj, grp->name, &sd); if (error) -- cgit v1.2.3 From 3a198886ab5f228fcbebb9ace803d8b99721d49a Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 6 Apr 2012 13:41:06 -0700 Subject: sysfs: handle 'parent deleted before child added' In scsi at least two cases of the parent device being deleted before the child is added have been observed. 1/ scsi is performing async scans and the device is removed prior to the async can thread running (can happen with an in-opportune / unlikely unplug during initial scan). 2/ libsas discovery event running after the parent port has been torn down (this is a bug in libsas). Result in crash signatures like: BUG: unable to handle kernel NULL pointer dereference at 0000000000000098 IP: [] sysfs_create_dir+0x32/0xb6 ... Process scsi_scan_8 (pid: 5417, threadinfo ffff88080bd16000, task ffff880801b8a0b0) Stack: 00000000fffffffe ffff880813470628 ffff88080bd17cd0 ffff88080614b7e8 ffff88080b45c108 00000000fffffffe ffff88080bd17d20 ffffffff8125e4a8 ffff88080bd17cf0 ffffffff81075149 ffff88080bd17d30 ffff88080614b7e8 Call Trace: [] kobject_add_internal+0x120/0x1e3 [] ? trace_hardirqs_on+0xd/0xf [] kobject_add_varg+0x41/0x50 [] kobject_add+0x64/0x66 [] device_add+0x12d/0x63a In this scenario the parent is still valid (because we have a reference), but it has been device_del()'d which means its kobj->sd pointer is NULL'd via: device_del()->kobject_del()->sysfs_remove_dir() ...and then sysfs_create_dir() (without this fix) goes ahead and de-references parent_sd via sysfs_ns_type(): return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT; This scenario is being fixed in scsi/libsas, but if other subsystems present the same ordering the system need not immediately crash. Cc: Greg Kroah-Hartman Cc: James Bottomley Signed-off-by: Dan Williams Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/dir.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 8ddc1021c79a..35a36d39fa2c 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -729,6 +729,9 @@ int sysfs_create_dir(struct kobject * kobj) else parent_sd = &sysfs_root; + if (!parent_sd) + return -ENOENT; + if (sysfs_ns_type(parent_sd)) ns = kobj->ktype->namespace(kobj); type = sysfs_read_ns_type(kobj); -- cgit v1.2.3 From 0a2da9b2ef2ef76c09397597f260245b020e6522 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 11 Apr 2012 11:45:06 +0200 Subject: fuse: allow nanosecond granularity Derrik Pates reports that an utimensat with a NULL argument results in the current time being sent from the kernel with 1 second granularity. Reported-by: Derrik Pates Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 64cf8d07393e..c9a0c972a4b2 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -947,6 +947,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = FUSE_SUPER_MAGIC; sb->s_op = &fuse_super_operations; sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_time_gran = 1; sb->s_export_op = &fuse_export_operations; file = fget(d.fd); -- cgit v1.2.3 From 9dc4e6c4d1182d34604ea40fef641775f5b15456 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 9 Apr 2012 18:06:49 -0400 Subject: nfsd: don't fail unchecked creates of non-special files Allow a v3 unchecked open of a non-regular file succeed as if it were a lookup; typically a client in such a case will want to fall back on a local open, so succeeding and giving it the filehandle is more useful than failing with nfserr_exist, which makes it appear that nothing at all exists by that name. Similarly for v4, on an open-create, return the same errors we would on an attempt to open a non-regular file, instead of returning nfserr_exist. This fixes a problem found doing a v4 open of a symlink with O_RDONLY|O_CREAT, which resulted in the current client returning EEXIST. Thanks also to Trond for analysis. Cc: stable@kernel.org Reported-by: Orion Poplawski Tested-by: Orion Poplawski Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4proc.c | 8 ++++---- fs/nfsd/vfs.c | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 2ed14dfd00a2..8256efda3638 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -235,15 +235,15 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o */ if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0) open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS | - FATTR4_WORD1_TIME_MODIFY); + FATTR4_WORD1_TIME_MODIFY); } else { status = nfsd_lookup(rqstp, current_fh, open->op_fname.data, open->op_fname.len, resfh); fh_unlock(current_fh); - if (status) - goto out; - status = nfsd_check_obj_isreg(resfh); } + if (status) + goto out; + status = nfsd_check_obj_isreg(resfh); if (status) goto out; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 296d671654d6..568666156ea4 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1458,7 +1458,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, switch (createmode) { case NFS3_CREATE_UNCHECKED: if (! S_ISREG(dchild->d_inode->i_mode)) - err = nfserr_exist; + goto out; else if (truncp) { /* in nfsv4, we need to treat this case a little * differently. we don't want to truncate the -- cgit v1.2.3 From 4fe9e9639d95cd11de63afa353f2de320f26033a Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Tue, 10 Apr 2012 18:12:27 +0100 Subject: Cleanup handling of NULL value passed for a mount option Allow blank user= and ip= mount option. Also clean up redundant checks for NULL values since the token parser will not actually match mount options with NULL values unless explicitly specified. Signed-off-by: Sachin Prabhu Reported-by: Chris Clayton Acked-by: Jeff Layton Tested-by: Chris Clayton Signed-off-by: Steve French --- fs/cifs/connect.c | 80 +++++++++++++------------------------------------------ 1 file changed, 19 insertions(+), 61 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index d81e933a796b..6a86f3d68182 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -109,6 +109,8 @@ enum { /* Options which could be blank */ Opt_blank_pass, + Opt_blank_user, + Opt_blank_ip, Opt_err }; @@ -183,11 +185,15 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_wsize, "wsize=%s" }, { Opt_actimeo, "actimeo=%s" }, + { Opt_blank_user, "user=" }, + { Opt_blank_user, "username=" }, { Opt_user, "user=%s" }, { Opt_user, "username=%s" }, { Opt_blank_pass, "pass=" }, { Opt_pass, "pass=%s" }, { Opt_pass, "password=%s" }, + { Opt_blank_ip, "ip=" }, + { Opt_blank_ip, "addr=" }, { Opt_ip, "ip=%s" }, { Opt_ip, "addr=%s" }, { Opt_unc, "unc=%s" }, @@ -1534,15 +1540,17 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, /* String Arguments */ + case Opt_blank_user: + /* null user, ie. anonymous authentication */ + vol->nullauth = 1; + vol->username = NULL; + break; case Opt_user: string = match_strdup(args); if (string == NULL) goto out_nomem; - if (!*string) { - /* null user, ie. anonymous authentication */ - vol->nullauth = 1; - } else if (strnlen(string, MAX_USERNAME_SIZE) > + if (strnlen(string, MAX_USERNAME_SIZE) > MAX_USERNAME_SIZE) { printk(KERN_WARNING "CIFS: username too long\n"); goto cifs_parse_mount_err; @@ -1611,14 +1619,15 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, } vol->password[j] = '\0'; break; + case Opt_blank_ip: + vol->UNCip = NULL; + break; case Opt_ip: string = match_strdup(args); if (string == NULL) goto out_nomem; - if (!*string) { - vol->UNCip = NULL; - } else if (strnlen(string, INET6_ADDRSTRLEN) > + if (strnlen(string, INET6_ADDRSTRLEN) > INET6_ADDRSTRLEN) { printk(KERN_WARNING "CIFS: ip address " "too long\n"); @@ -1636,12 +1645,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (string == NULL) goto out_nomem; - if (!*string) { - printk(KERN_WARNING "CIFS: invalid path to " - "network resource\n"); - goto cifs_parse_mount_err; - } - temp_len = strnlen(string, 300); if (temp_len == 300) { printk(KERN_WARNING "CIFS: UNC name too long\n"); @@ -1670,11 +1673,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (string == NULL) goto out_nomem; - if (!*string) { - printk(KERN_WARNING "CIFS: invalid domain" - " name\n"); - goto cifs_parse_mount_err; - } else if (strnlen(string, 256) == 256) { + if (strnlen(string, 256) == 256) { printk(KERN_WARNING "CIFS: domain name too" " long\n"); goto cifs_parse_mount_err; @@ -1693,11 +1692,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (string == NULL) goto out_nomem; - if (!*string) { - printk(KERN_WARNING "CIFS: srcaddr value not" - " specified\n"); - goto cifs_parse_mount_err; - } else if (!cifs_convert_address( + if (!cifs_convert_address( (struct sockaddr *)&vol->srcaddr, string, strlen(string))) { printk(KERN_WARNING "CIFS: Could not parse" @@ -1710,11 +1705,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (string == NULL) goto out_nomem; - if (!*string) { - printk(KERN_WARNING "CIFS: Invalid path" - " prefix\n"); - goto cifs_parse_mount_err; - } temp_len = strnlen(string, 1024); if (string[0] != '/') temp_len++; /* missing leading slash */ @@ -1742,11 +1732,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (string == NULL) goto out_nomem; - if (!*string) { - printk(KERN_WARNING "CIFS: Invalid iocharset" - " specified\n"); - goto cifs_parse_mount_err; - } else if (strnlen(string, 1024) >= 65) { + if (strnlen(string, 1024) >= 65) { printk(KERN_WARNING "CIFS: iocharset name " "too long.\n"); goto cifs_parse_mount_err; @@ -1771,11 +1757,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (string == NULL) goto out_nomem; - if (!*string) { - printk(KERN_WARNING "CIFS: No socket option" - " specified\n"); - goto cifs_parse_mount_err; - } if (strnicmp(string, "TCP_NODELAY", 11) == 0) vol->sockopt_tcp_nodelay = 1; break; @@ -1784,12 +1765,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (string == NULL) goto out_nomem; - if (!*string) { - printk(KERN_WARNING "CIFS: Invalid (empty)" - " netbiosname\n"); - break; - } - memset(vol->source_rfc1001_name, 0x20, RFC1001_NAME_LEN); /* @@ -1817,11 +1792,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (string == NULL) goto out_nomem; - if (!*string) { - printk(KERN_WARNING "CIFS: Empty server" - " netbiosname specified\n"); - break; - } /* last byte, type, is 0x20 for servr type */ memset(vol->target_rfc1001_name, 0x20, RFC1001_NAME_LEN_WITH_NULL); @@ -1848,12 +1818,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (string == NULL) goto out_nomem; - if (!*string) { - cERROR(1, "no protocol version specified" - " after vers= mount option"); - goto cifs_parse_mount_err; - } - if (strnicmp(string, "cifs", 4) == 0 || strnicmp(string, "1", 1) == 0) { /* This is the default */ @@ -1868,12 +1832,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (string == NULL) goto out_nomem; - if (!*string) { - printk(KERN_WARNING "CIFS: no security flavor" - " specified\n"); - break; - } - if (cifs_parse_security_flavors(string, vol) != 0) goto cifs_parse_mount_err; break; -- cgit v1.2.3 From 8e62c2de6e23e5c1fee04f59de51b54cc2868ca5 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 12 Apr 2012 13:46:48 -0400 Subject: Revert "Btrfs: increase the global block reserve estimates" This reverts commit 5500cdbe14d7435e04f66ff3cfb8ecd8b8e44ebf. We've had a number of complaints of early enospc that bisect down to this patch. We'll hae to fix the reservations differently. CC: stable@kernel.org Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a84420491c11..ace5e8cef03e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4205,7 +4205,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) num_bytes += div64_u64(data_used + meta_used, 50); if (num_bytes * 3 > meta_used) - num_bytes = div64_u64(meta_used, 3) * 2; + num_bytes = div64_u64(meta_used, 3); return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10); } -- cgit v1.2.3 From d95603b262edb53d6016a8df0c150371d4d61e67 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 12 Apr 2012 15:55:15 -0400 Subject: Btrfs: fix uninit variable in repair_eb_io_failure We'd have to be passing bogus extent buffers for this uninit variable to actually be used, but set it to zero just in case. Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0c3ec003f273..59ec105444fe 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1937,7 +1937,7 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; u64 start = eb->start; unsigned long i, num_pages = num_extent_pages(eb->start, eb->len); - int ret; + int ret = 0; for (i = 0; i < num_pages; i++) { struct page *p = extent_buffer_page(eb, i); -- cgit v1.2.3 From b89203f74bdfcb15407d54d3f257b16a2ea19e62 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 12 Apr 2012 16:03:56 -0400 Subject: Btrfs: fix eof while discarding extents We miscalculate the length of extents we're discarding, and it leads to an eof of device. Reported-by: Daniel Blueman Signed-off-by: Liu Bo Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a872b48be0ae..759d02486d7c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3833,6 +3833,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, int sub_stripes = 0; u64 stripes_per_dev = 0; u32 remaining_stripes = 0; + u32 last_stripe = 0; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { @@ -3846,6 +3847,8 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, stripe_nr_orig, factor, &remaining_stripes); + div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); + last_stripe *= sub_stripes; } for (i = 0; i < num_stripes; i++) { @@ -3858,16 +3861,29 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, BTRFS_BLOCK_GROUP_RAID10)) { bbio->stripes[i].length = stripes_per_dev * map->stripe_len; + if (i / sub_stripes < remaining_stripes) bbio->stripes[i].length += map->stripe_len; + + /* + * Special for the first stripe and + * the last stripe: + * + * |-------|...|-------| + * |----------| + * off end_off + */ if (i < sub_stripes) bbio->stripes[i].length -= stripe_offset; - if ((i / sub_stripes + 1) % - sub_stripes == remaining_stripes) + + if (stripe_index >= last_stripe && + stripe_index <= (last_stripe + + sub_stripes - 1)) bbio->stripes[i].length -= stripe_end_offset; + if (i == sub_stripes - 1) stripe_offset = 0; } else -- cgit v1.2.3 From c6664b42c4e567792abdb17c958fb01c5bcfcb3a Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 12 Apr 2012 16:03:56 -0400 Subject: Btrfs: remove lock assert from get_restripe_target() This fixes a regression introduced by fc67c450. spin_is_locked() always returns 0 on UP kernels, which caused assert in get_restripe_target() to be fired on every call from btrfs_reduce_alloc_profile() on UP systems. Remove it completely for now, it's not clear if it's going to be needed in future. Reported-by: Bobby Powers Reported-by: Mitch Harder Tested-by: Mitch Harder Signed-off-by: Ilya Dryomov Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index ace5e8cef03e..a2134d8141c1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3152,15 +3152,14 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) /* * returns target flags in extended format or 0 if restripe for this * chunk_type is not in progress + * + * should be called with either volume_mutex or balance_lock held */ static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags) { struct btrfs_balance_control *bctl = fs_info->balance_ctl; u64 target = 0; - BUG_ON(!mutex_is_locked(&fs_info->volume_mutex) && - !spin_is_locked(&fs_info->balance_lock)); - if (!bctl) return 0; -- cgit v1.2.3 From e627ee7bcd42b4e3a03ca01a8e46dcb4033c5ae0 Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Thu, 12 Apr 2012 16:03:56 -0400 Subject: Btrfs: check return value of bio_alloc() properly bio_alloc() has the possibility of returning NULL. So, it is necessary to check the return value. Signed-off-by: Tsutomu Itoh Signed-off-by: Chris Mason --- fs/btrfs/compression.c | 2 ++ fs/btrfs/extent_io.c | 4 ++++ fs/btrfs/scrub.c | 4 ++++ 3 files changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index d11afa67c7d8..646f5e6f2566 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -405,6 +405,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_put(bio); bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + BUG_ON(!bio); bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); @@ -687,6 +688,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); + BUG_ON(!comp_bio); comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 59ec105444fe..4789770f8eaf 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2180,6 +2180,10 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page, } bio = bio_alloc(GFP_NOFS, 1); + if (!bio) { + free_io_failure(inode, failrec, 0); + return -EIO; + } bio->bi_private = state; bio->bi_end_io = failed_bio->bi_end_io; bio->bi_sector = failrec->logical >> 9; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index c9a2c1aef4bd..60f0e28db31e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1044,6 +1044,8 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info, BUG_ON(!page->page); bio = bio_alloc(GFP_NOFS, 1); + if (!bio) + return -EIO; bio->bi_bdev = page->bdev; bio->bi_sector = page->physical >> 9; bio->bi_end_io = scrub_complete_bio_end_io; @@ -1172,6 +1174,8 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, DECLARE_COMPLETION_ONSTACK(complete); bio = bio_alloc(GFP_NOFS, 1); + if (!bio) + return -EIO; bio->bi_bdev = page_bad->bdev; bio->bi_sector = page_bad->physical >> 9; bio->bi_end_io = scrub_complete_bio_end_io; -- cgit v1.2.3 From 4edc2ca388d62abffe38149f6ac00e749ea721c5 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 12 Apr 2012 16:03:56 -0400 Subject: Btrfs: fix use-after-free in __btrfs_end_transaction 49b25e0540904be0bf558b84475c69d72e4de66e introduced a use-after-free bug that caused spurious -EIO's to be returned. Do the check before we free the transaction. Cc: David Sterba Cc: Jeff Mahoney Signed-off-by: Dave Jones Signed-off-by: Chris Mason --- fs/btrfs/transaction.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 8da29e8e4de1..11b77a59db62 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -480,6 +480,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_fs_info *info = root->fs_info; int count = 0; + int err = 0; if (--trans->use_count) { trans->block_rsv = trans->orig_rsv; @@ -532,18 +533,18 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (current->journal_info == trans) current->journal_info = NULL; - memset(trans, 0, sizeof(*trans)); - kmem_cache_free(btrfs_trans_handle_cachep, trans); if (throttle) btrfs_run_delayed_iputs(root); if (trans->aborted || root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { - return -EIO; + err = -EIO; } - return 0; + memset(trans, 0, sizeof(*trans)); + kmem_cache_free(btrfs_trans_handle_cachep, trans); + return err; } int btrfs_end_transaction(struct btrfs_trans_handle *trans, -- cgit v1.2.3 From d53ba47484ed6245e640ee4bfe9d21e9bfc15765 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 12 Apr 2012 16:03:57 -0400 Subject: Btrfs: use commit root when loading free space cache A user reported that booting his box up with btrfs root on 3.4 was way slower than on 3.3 because I removed the ideal caching code. It turns out that we don't load the free space cache if we're in a commit for deadlock reasons, but since we're reading the cache and it hasn't changed yet we are safe reading the inode and free space item from the commit root, so do that and remove all of the deadlock checks so we don't unnecessarily skip loading the free space cache. The user reported this fixed the slowness. Thanks, Tested-by: Calvin Walton Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 4 +--- fs/btrfs/free-space-cache.c | 9 ++------- 2 files changed, 3 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a2134d8141c1..2b35f8d14bb9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -529,9 +529,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, * allocate blocks for the tree root we can't do the fast caching since * we likely hold important locks. */ - if (trans && (!trans->transaction->in_commit) && - (root && root != root->fs_info->tree_root) && - btrfs_test_opt(root, SPACE_CACHE)) { + if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { ret = load_free_space_cache(fs_info, cache); spin_lock(&cache->lock); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 054707ed5791..baaa518baaf8 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -747,13 +747,6 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, bool matched; u64 used = btrfs_block_group_used(&block_group->item); - /* - * If we're unmounting then just return, since this does a search on the - * normal root and not the commit root and we could deadlock. - */ - if (btrfs_fs_closing(fs_info)) - return 0; - /* * If this block group has been marked to be cleared for one reason or * another then we can't trust the on disk cache, so just return. @@ -768,6 +761,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info, path = btrfs_alloc_path(); if (!path) return 0; + path->search_commit_root = 1; + path->skip_locking = 1; inode = lookup_free_space_inode(root, block_group, path); if (IS_ERR(inode)) { -- cgit v1.2.3 From 96f6f98501196d46ce52c2697dd758d9300c63f5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 12 Apr 2012 23:47:00 -0400 Subject: nfsd: fix b0rken error value for setattr on read-only mount ..._want_write() returns -EROFS on failure, _not_ an NFS error value. Signed-off-by: Al Viro --- fs/nfsd/nfs4proc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 2ed14dfd00a2..694d526c8f19 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -841,6 +841,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_setattr *setattr) { __be32 status = nfs_ok; + int err; if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { nfs4_lock_state(); @@ -852,9 +853,9 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; } } - status = fh_want_write(&cstate->current_fh); - if (status) - return status; + err = fh_want_write(&cstate->current_fh); + if (err) + return nfserrno(err); status = nfs_ok; status = check_attr_support(rqstp, cstate, setattr->sa_bmval, -- cgit v1.2.3 From 04da6e9d63427b2d0fd04766712200c250b3278f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 13 Apr 2012 00:00:04 -0400 Subject: nfsd: fix error values returned by nfsd4_lockt() when nfsd_open() fails nfsd_open() already returns an NFS error value; only vfs_test_lock() result needs to be fed through nfserrno(). Broken by commit 55ef12 (nfsd: Ensure nfsv4 calls the underlying filesystem on LOCKT) three years ago... Signed-off-by: Al Viro --- fs/nfsd/nfs4state.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1841f8bf845e..7f71c69cdcdf 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4211,16 +4211,14 @@ out: * vfs_test_lock. (Arguably perhaps test_lock should be done with an * inode operation.) */ -static int nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock) +static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock) { struct file *file; - int err; - - err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); - if (err) - return err; - err = vfs_test_lock(file, lock); - nfsd_close(file); + __be32 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); + if (!err) { + err = nfserrno(vfs_test_lock(file, lock)); + nfsd_close(file); + } return err; } @@ -4234,7 +4232,6 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct inode *inode; struct file_lock file_lock; struct nfs4_lockowner *lo; - int error; __be32 status; if (locks_in_grace()) @@ -4280,12 +4277,10 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfs4_transform_lock_offset(&file_lock); - status = nfs_ok; - error = nfsd_test_lock(rqstp, &cstate->current_fh, &file_lock); - if (error) { - status = nfserrno(error); + status = nfsd_test_lock(rqstp, &cstate->current_fh, &file_lock); + if (status) goto out; - } + if (file_lock.fl_type != F_UNLCK) { status = nfserr_denied; nfs4_set_lock_denied(&file_lock, &lockt->lt_denied); -- cgit v1.2.3 From 02f5fde5df0ea930e70f93763dd48beff182b208 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 13 Apr 2012 00:10:34 -0400 Subject: nfsd: fix endianness breakage in TEST_STATEID handling ->ts_id_status gets nfs errno, i.e. it's already big-endian; no need to apply htonl() to it. Broken by commit 174568 (NFSD: Added TEST_STATEID operation) last year... Signed-off-by: Al Viro --- fs/nfsd/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index bcd8904ab1e3..07a99b6d4bbe 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3410,7 +3410,7 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr, *p++ = htonl(test_stateid->ts_num_ids); list_for_each_entry_safe(stateid, next, &test_stateid->ts_stateid_list, ts_id_list) { - *p++ = htonl(stateid->ts_id_status); + *p++ = stateid->ts_id_status; } ADJUST_ARGS(); -- cgit v1.2.3 From afcf6792afd66209161495f691e19d4fc5460a93 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 13 Apr 2012 00:15:37 -0400 Subject: nfsd: fix error value on allocation failure in nfsd4_decode_test_stateid() PTR_ERR(NULL) is going to be 0... Signed-off-by: Al Viro --- fs/nfsd/nfs4xdr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 07a99b6d4bbe..74c00bc92b9a 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1392,7 +1392,7 @@ nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_sta for (i = 0; i < test_stateid->ts_num_ids; i++) { stateid = kmalloc(sizeof(struct nfsd4_test_stateid_id), GFP_KERNEL); if (!stateid) { - status = PTR_ERR(stateid); + status = nfserrno(-ENOMEM); goto out; } -- cgit v1.2.3 From efe39651f08813180f37dc508d950fc7d92b29a8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 13 Apr 2012 00:32:14 -0400 Subject: nfsd: fix compose_entry_fh() failure exits Restore the original logics ("fail on mountpoints, negatives and in case of fh_compose() failures"). Since commit 8177e (nfsd: clean up readdirplus encoding) that got broken - rv = fh_compose(fhp, exp, dchild, &cd->fh); if (rv) goto out; if (!dchild->d_inode) goto out; rv = 0; out: is equivalent to rv = fh_compose(fhp, exp, dchild, &cd->fh); out: and the second check has no effect whatsoever... Signed-off-by: Al Viro --- fs/nfsd/nfs3xdr.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 08c6e36ab2eb..43f46cd9edea 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -803,13 +803,13 @@ encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, return p; } -static int +static __be32 compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, const char *name, int namlen) { struct svc_export *exp; struct dentry *dparent, *dchild; - int rv = 0; + __be32 rv = nfserr_noent; dparent = cd->fh.fh_dentry; exp = cd->fh.fh_export; @@ -817,26 +817,20 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, if (isdotent(name, namlen)) { if (namlen == 2) { dchild = dget_parent(dparent); - if (dchild == dparent) { - /* filesystem root - cannot return filehandle for ".." */ - dput(dchild); - return -ENOENT; - } + /* filesystem root - cannot return filehandle for ".." */ + if (dchild == dparent) + goto out; } else dchild = dget(dparent); } else dchild = lookup_one_len(name, dparent, namlen); if (IS_ERR(dchild)) - return -ENOENT; - rv = -ENOENT; + return rv; if (d_mountpoint(dchild)) goto out; - rv = fh_compose(fhp, exp, dchild, &cd->fh); - if (rv) - goto out; if (!dchild->d_inode) goto out; - rv = 0; + rv = fh_compose(fhp, exp, dchild, &cd->fh); out: dput(dchild); return rv; @@ -845,7 +839,7 @@ out: static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen) { struct svc_fh fh; - int err; + __be32 err; fh_init(&fh, NFS3_FHSIZE); err = compose_entry_fh(cd, &fh, name, namlen); -- cgit v1.2.3 From af1584f570b19b0285e4402a0b54731495d31784 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 12 Apr 2012 20:32:25 -0400 Subject: ext4: fix endianness breakage in ext4_split_extent_at() ->ee_len is __le16, so assigning cpu_to_le32() to it is going to do Bad Things(tm) on big-endian hosts... Signed-off-by: Al Viro --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 1421938e6792..1d418387ffb0 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2882,7 +2882,7 @@ static int ext4_split_extent_at(handle_t *handle, if (err) goto fix_extent_len; /* update the extent length and mark as initialized */ - ex->ee_len = cpu_to_le32(ee_len); + ex->ee_len = cpu_to_le16(ee_len); ext4_ext_try_to_merge(inode, path, ex); err = ext4_ext_dirty(handle, inode, path + depth); goto out; -- cgit v1.2.3 From bfa890a3cdeed29eef53d54cd7f80cec0fd46b11 Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Fri, 13 Apr 2012 14:04:32 +0100 Subject: Fix number parsing in cifs_parse_mount_options The function kstrtoul() used to parse number strings in the mount option parser is set to expect a base 10 number . This treats the octal numbers passed for mount options such as file_mode as base10 numbers leading to incorrect behavior. Change the 'base' argument passed to kstrtoul from 10 to 0 to allow it to auto-detect the base of the number passed. Signed-off-by: Sachin Prabhu Acked-by: Jeff Layton Reported-by: Chris Clayton Signed-off-by: Steve French --- fs/cifs/connect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 6a86f3d68182..f31dc9ac37b7 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1123,7 +1123,7 @@ static int get_option_ul(substring_t args[], unsigned long *option) string = match_strdup(args); if (string == NULL) return -ENOMEM; - rc = kstrtoul(string, 10, option); + rc = kstrtoul(string, 0, option); kfree(string); return rc; -- cgit v1.2.3 From 6ed3cf2cdfce4c9f1d73171bd3f27d9cb77b734e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 13 Apr 2012 11:49:04 -0400 Subject: btrfs: btrfs_root_readonly() broken on big-endian ->root_flags is __le64 and all accesses to it go through the helpers that do proper conversions. Except for btrfs_root_readonly(), which checks bit 0 as in host-endian... Signed-off-by: Al Viro --- fs/btrfs/ctree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5b8ef8eb3521..3f65a812e282 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2166,7 +2166,7 @@ BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, static inline bool btrfs_root_readonly(struct btrfs_root *root) { - return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY; + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; } /* struct btrfs_root_backup */ -- cgit v1.2.3 From 3a251f04fe97c3d335b745c98e4b377e3c3899f2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 13 Apr 2012 12:22:00 -0400 Subject: ocfs2: ->l_next_free_req breakage on big-endian It's le16, not le32... Signed-off-by: Al Viro --- fs/ocfs2/alloc.c | 2 +- fs/ocfs2/refcounttree.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 3165aebb43c8..31b9463fba1f 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -1134,7 +1134,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle, } el = path_leaf_el(path); - rec = &el->l_recs[le32_to_cpu(el->l_next_free_rec) - 1]; + rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec) - 1]; ocfs2_adjust_rightmost_records(handle, et, path, rec); diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index cf7823382664..a7b7217062b7 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -1036,14 +1036,14 @@ static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci, tmp_el = left_path->p_node[subtree_root].el; blkno = left_path->p_node[subtree_root+1].bh->b_blocknr; - for (i = 0; i < le32_to_cpu(tmp_el->l_next_free_rec); i++) { + for (i = 0; i < le16_to_cpu(tmp_el->l_next_free_rec); i++) { if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) { *cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos); break; } } - BUG_ON(i == le32_to_cpu(tmp_el->l_next_free_rec)); + BUG_ON(i == le16_to_cpu(tmp_el->l_next_free_rec)); out: ocfs2_free_path(left_path); -- cgit v1.2.3 From e1bf4cc620fd143766ddfcee3b004a1d1bb34fd0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 13 Apr 2012 12:27:11 -0400 Subject: ocfs: ->rl_used breakage on big-endian it's le16, not le32 or le64... Signed-off-by: Al Viro --- fs/ocfs2/refcounttree.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index a7b7217062b7..bc90ebcec169 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -1468,7 +1468,7 @@ static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh, trace_ocfs2_divide_leaf_refcount_block( (unsigned long long)ref_leaf_bh->b_blocknr, - le32_to_cpu(rl->rl_count), le32_to_cpu(rl->rl_used)); + le32_to_cpu(rl->rl_count), le16_to_cpu(rl->rl_used)); /* * XXX: Improvement later. @@ -2411,7 +2411,7 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, rb = (struct ocfs2_refcount_block *) prev_bh->b_data; - if (le64_to_cpu(rb->rf_records.rl_used) + + if (le16_to_cpu(rb->rf_records.rl_used) + recs_add > le16_to_cpu(rb->rf_records.rl_count)) ref_blocks++; @@ -2476,7 +2476,7 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb, if (prev_bh) { rb = (struct ocfs2_refcount_block *)prev_bh->b_data; - if (le64_to_cpu(rb->rf_records.rl_used) + recs_add > + if (le16_to_cpu(rb->rf_records.rl_used) + recs_add > le16_to_cpu(rb->rf_records.rl_count)) ref_blocks++; @@ -3629,7 +3629,7 @@ int ocfs2_refcounted_xattr_delete_need(struct inode *inode, * one will split a refcount rec, so totally we need * clusters * 2 new refcount rec. */ - if (le64_to_cpu(rb->rf_records.rl_used) + clusters * 2 > + if (le16_to_cpu(rb->rf_records.rl_used) + clusters * 2 > le16_to_cpu(rb->rf_records.rl_count)) ref_blocks++; -- cgit v1.2.3 From 28748b325dc2d730ccc312830a91c4ae0c0d9379 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 13 Apr 2012 12:28:21 -0400 Subject: ocfs2: ->rl_count endianness breakage le16, not le32... Signed-off-by: Al Viro --- fs/ocfs2/refcounttree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index bc90ebcec169..9f32d7cbb7a3 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -1468,7 +1468,7 @@ static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh, trace_ocfs2_divide_leaf_refcount_block( (unsigned long long)ref_leaf_bh->b_blocknr, - le32_to_cpu(rl->rl_count), le16_to_cpu(rl->rl_used)); + le16_to_cpu(rl->rl_count), le16_to_cpu(rl->rl_used)); /* * XXX: Improvement later. -- cgit v1.2.3 From 72094e43e3af5020510f920321d71f1798fa896d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 13 Apr 2012 12:30:02 -0400 Subject: ocfs2: ->e_leaf_clusters endianness breakage le16, not le32... Signed-off-by: Al Viro --- fs/ocfs2/suballoc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index ba5d97e4a73e..f169da4624fd 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -600,7 +600,7 @@ static void ocfs2_bg_alloc_cleanup(handle_t *handle, ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode, cluster_ac->ac_bh, le64_to_cpu(rec->e_blkno), - le32_to_cpu(rec->e_leaf_clusters)); + le16_to_cpu(rec->e_leaf_clusters)); if (ret) mlog_errno(ret); /* Try all the clusters to free */ @@ -1628,7 +1628,7 @@ static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res, { unsigned int bpc = le16_to_cpu(cl->cl_bpc); unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc; - unsigned int bitcount = le32_to_cpu(rec->e_leaf_clusters) * bpc; + unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc; if (res->sr_bit_offset < bitoff) return 0; -- cgit v1.2.3 From e847469bf77a1d339274074ed068d461f0c872bc Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 13 Apr 2012 13:49:47 -0400 Subject: lockd: fix the endianness bug comparing be32 values for < is not doing the right thing... Signed-off-by: Al Viro --- fs/lockd/clnt4xdr.c | 2 +- fs/lockd/clntxdr.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c index 3ddcbb1c0a43..13ad1539fbf2 100644 --- a/fs/lockd/clnt4xdr.c +++ b/fs/lockd/clnt4xdr.c @@ -241,7 +241,7 @@ static int decode_nlm4_stat(struct xdr_stream *xdr, __be32 *stat) p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) goto out_overflow; - if (unlikely(*p > nlm4_failed)) + if (unlikely(ntohl(*p) > ntohl(nlm4_failed))) goto out_bad_xdr; *stat = *p; return 0; diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c index 3d35e3e80c1c..d269ada7670e 100644 --- a/fs/lockd/clntxdr.c +++ b/fs/lockd/clntxdr.c @@ -236,7 +236,7 @@ static int decode_nlm_stat(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 4); if (unlikely(p == NULL)) goto out_overflow; - if (unlikely(*p > nlm_lck_denied_grace_period)) + if (unlikely(ntohl(*p) > ntohl(nlm_lck_denied_grace_period))) goto out_enum; *stat = *p; return 0; -- cgit v1.2.3 From 9cd70b347e9761ea2d2ac3d758c529a48a8193e6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 16 Apr 2012 12:16:20 -0400 Subject: ext4: address scalability issue by removing extent cache statistics Andi Kleen and Tim Chen have reported that under certain circumstances the extent cache statistics are causing scalability problems due to cache line bounces. Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/ext4.h | 3 --- fs/ext4/extents.c | 4 ---- fs/ext4/super.c | 16 ---------------- 3 files changed, 23 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ab2594a30f86..0e01e90add8b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1203,9 +1203,6 @@ struct ext4_sb_info { unsigned long s_ext_blocks; unsigned long s_ext_extents; #endif - /* ext4 extent cache stats */ - unsigned long extent_cache_hits; - unsigned long extent_cache_misses; /* for buddy allocator */ struct ext4_group_info ***s_group_info; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 1421938e6792..8a12cd207679 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2066,10 +2066,6 @@ static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block, ret = 1; } errout: - if (!ret) - sbi->extent_cache_misses++; - else - sbi->extent_cache_hits++; trace_ext4_ext_in_cache(inode, block, ret); spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); return ret; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ceebaf853beb..ef7db5044262 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2366,18 +2366,6 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, EXT4_SB(sb)->s_sectors_written_start) >> 1))); } -static ssize_t extent_cache_hits_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_hits); -} - -static ssize_t extent_cache_misses_show(struct ext4_attr *a, - struct ext4_sb_info *sbi, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_misses); -} - static ssize_t inode_readahead_blks_store(struct ext4_attr *a, struct ext4_sb_info *sbi, const char *buf, size_t count) @@ -2435,8 +2423,6 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) EXT4_RO_ATTR(delayed_allocation_blocks); EXT4_RO_ATTR(session_write_kbytes); EXT4_RO_ATTR(lifetime_write_kbytes); -EXT4_RO_ATTR(extent_cache_hits); -EXT4_RO_ATTR(extent_cache_misses); EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show, inode_readahead_blks_store, s_inode_readahead_blks); EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal); @@ -2452,8 +2438,6 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(delayed_allocation_blocks), ATTR_LIST(session_write_kbytes), ATTR_LIST(lifetime_write_kbytes), - ATTR_LIST(extent_cache_hits), - ATTR_LIST(extent_cache_misses), ATTR_LIST(inode_readahead_blks), ATTR_LIST(inode_goal), ATTR_LIST(mb_stats), -- cgit v1.2.3 From 57f73c2c89a5d3b2ed87201c8100d1fa989a1a65 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 16 Apr 2012 18:55:26 -0400 Subject: ext4: fix handling of journalled quota options Commit 26092bf5 broke handling of journalled quota mount options by trying to parse argument of every mount option as a number. Fix this by dealing with the quota options before we call match_int(). Thanks to Jan Kara for discovering this regression. Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara --- fs/ext4/super.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ef7db5044262..6da193564e43 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1305,20 +1305,20 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) ext4_msg(sb, KERN_ERR, "Cannot change journaled " "quota options when quota turned on"); - return 0; + return -1; } qname = match_strdup(args); if (!qname) { ext4_msg(sb, KERN_ERR, "Not enough memory for storing quotafile name"); - return 0; + return -1; } if (sbi->s_qf_names[qtype] && strcmp(sbi->s_qf_names[qtype], qname)) { ext4_msg(sb, KERN_ERR, "%s quota file already specified", QTYPE2NAME(qtype)); kfree(qname); - return 0; + return -1; } sbi->s_qf_names[qtype] = qname; if (strchr(sbi->s_qf_names[qtype], '/')) { @@ -1326,7 +1326,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args) "quotafile must be on filesystem root"); kfree(sbi->s_qf_names[qtype]); sbi->s_qf_names[qtype] = NULL; - return 0; + return -1; } set_opt(sb, QUOTA); return 1; @@ -1341,7 +1341,7 @@ static int clear_qf_name(struct super_block *sb, int qtype) sbi->s_qf_names[qtype]) { ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options" " when quota turned on"); - return 0; + return -1; } /* * The space will be released later when all options are confirmed @@ -1450,6 +1450,16 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, const struct mount_opts *m; int arg = 0; +#ifdef CONFIG_QUOTA + if (token == Opt_usrjquota) + return set_qf_name(sb, USRQUOTA, &args[0]); + else if (token == Opt_grpjquota) + return set_qf_name(sb, GRPQUOTA, &args[0]); + else if (token == Opt_offusrjquota) + return clear_qf_name(sb, USRQUOTA); + else if (token == Opt_offgrpjquota) + return clear_qf_name(sb, GRPQUOTA); +#endif if (args->from && match_int(args, &arg)) return -1; switch (token) { @@ -1549,18 +1559,6 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, sbi->s_mount_opt |= m->mount_opt; } #ifdef CONFIG_QUOTA - } else if (token == Opt_usrjquota) { - if (!set_qf_name(sb, USRQUOTA, &args[0])) - return -1; - } else if (token == Opt_grpjquota) { - if (!set_qf_name(sb, GRPQUOTA, &args[0])) - return -1; - } else if (token == Opt_offusrjquota) { - if (!clear_qf_name(sb, USRQUOTA)) - return -1; - } else if (token == Opt_offgrpjquota) { - if (!clear_qf_name(sb, GRPQUOTA)) - return -1; } else if (m->flags & MOPT_QFMT) { if (sb_any_quota_loaded(sb) && sbi->s_jquota_fmt != m->mount_opt) { -- cgit v1.2.3 From ca138f368a36dba40d3ef4a53d64af2011cda3c7 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Thu, 5 Apr 2012 15:26:36 -0400 Subject: NFS: check for req==NULL in nfs_try_to_update_request cleanup Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 2c68818f68ac..9b8d4d42a8af 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -682,7 +682,8 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, req->wb_bytes = rqend - req->wb_offset; out_unlock: spin_unlock(&inode->i_lock); - nfs_clear_request_commit(req); + if (req) + nfs_clear_request_commit(req); return req; out_flushme: spin_unlock(&inode->i_lock); -- cgit v1.2.3 From 848cce0d4102b5b4b26b0987b43e1919d462afe2 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 21 Feb 2012 17:04:28 +0800 Subject: Btrfs: avoid setting ->d_op twice Follow those instructions, and you'll trigger a warning in the beginning of d_set_d_op(): # mkfs.btrfs /dev/loop3 # mount /dev/loop3 /mnt # btrfs sub create /mnt/sub # btrfs sub snap /mnt /mnt/snap # touch /mnt/snap/sub touch: cannot touch `tmp': Permission denied __d_alloc() set d_op to sb->s_d_op (btrfs_dentry_operations), and then simple_lookup() reset it to simple_dentry_operations, which triggered the warning. Signed-off-by: Li Zefan --- fs/btrfs/inode.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1be31368e881..a682c267576d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4069,7 +4069,7 @@ static struct inode *new_simple_dir(struct super_block *s, BTRFS_I(inode)->dummy_inode = 1; inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; - inode->i_op = &simple_dir_inode_operations; + inode->i_op = &btrfs_dir_ro_inode_operations; inode->i_fop = &simple_dir_operations; inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; @@ -4140,14 +4140,18 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) static int btrfs_dentry_delete(const struct dentry *dentry) { struct btrfs_root *root; + struct inode *inode = dentry->d_inode; - if (!dentry->d_inode && !IS_ROOT(dentry)) - dentry = dentry->d_parent; + if (!inode && !IS_ROOT(dentry)) + inode = dentry->d_parent->d_inode; - if (dentry->d_inode) { - root = BTRFS_I(dentry->d_inode)->root; + if (inode) { + root = BTRFS_I(inode)->root; if (btrfs_root_refs(&root->root_item) == 0) return 1; + + if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) + return 1; } return 0; } -- cgit v1.2.3 From 8c9c2bf7a3c4f7e9d158c0be9c49f372fb943ad2 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Sat, 25 Feb 2012 09:09:30 +0100 Subject: btrfs: fix race in reada When inserting into the radix tree returns EEXIST, get the existing entry without giving up the spinlock in between. There was a race for both the zones trees and the extent tree. Signed-off-by: Arne Jansen --- fs/btrfs/inode.c | 8 +++++++- fs/btrfs/reada.c | 35 ++++++++++++++++------------------- 2 files changed, 23 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a682c267576d..98ee5a51aa29 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4332,7 +4332,13 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, } no_dentry: /* is this a reference to our own snapshot? If so - * skip it + * skip it. + * + * In contrast to old kernels, we insert the snapshot's + * dir item and dir index after it has been created, so + * we won't find a reference to our own snapshot. We + * still keep the following code for backward + * compatibility. */ if (location.type == BTRFS_ROOT_ITEM_KEY && location.objectid == root->root_key.objectid) { diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index dc5d33146fdb..8dec650099c8 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -250,14 +250,12 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, struct btrfs_bio *bbio) { int ret; - int looped = 0; struct reada_zone *zone; struct btrfs_block_group_cache *cache = NULL; u64 start; u64 end; int i; -again: zone = NULL; spin_lock(&fs_info->reada_lock); ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, @@ -274,9 +272,6 @@ again: spin_unlock(&fs_info->reada_lock); } - if (looped) - return NULL; - cache = btrfs_lookup_block_group(fs_info, logical); if (!cache) return NULL; @@ -307,13 +302,15 @@ again: ret = radix_tree_insert(&dev->reada_zones, (unsigned long)(zone->end >> PAGE_CACHE_SHIFT), zone); - spin_unlock(&fs_info->reada_lock); - if (ret) { + if (ret == -EEXIST) { kfree(zone); - looped = 1; - goto again; + ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone, + logical >> PAGE_CACHE_SHIFT, 1); + if (ret == 1) + kref_get(&zone->refcnt); } + spin_unlock(&fs_info->reada_lock); return zone; } @@ -323,8 +320,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, struct btrfs_key *top, int level) { int ret; - int looped = 0; struct reada_extent *re = NULL; + struct reada_extent *re_exist = NULL; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; struct btrfs_bio *bbio = NULL; @@ -335,14 +332,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, int i; unsigned long index = logical >> PAGE_CACHE_SHIFT; -again: spin_lock(&fs_info->reada_lock); re = radix_tree_lookup(&fs_info->reada_tree, index); if (re) kref_get(&re->refcnt); spin_unlock(&fs_info->reada_lock); - if (re || looped) + if (re) return re; re = kzalloc(sizeof(*re), GFP_NOFS); @@ -398,12 +394,15 @@ again: /* insert extent in reada_tree + all per-device trees, all or nothing */ spin_lock(&fs_info->reada_lock); ret = radix_tree_insert(&fs_info->reada_tree, index, re); + if (ret == -EEXIST) { + re_exist = radix_tree_lookup(&fs_info->reada_tree, index); + BUG_ON(!re_exist); + kref_get(&re_exist->refcnt); + spin_unlock(&fs_info->reada_lock); + goto error; + } if (ret) { spin_unlock(&fs_info->reada_lock); - if (ret != -ENOMEM) { - /* someone inserted the extent in the meantime */ - looped = 1; - } goto error; } for (i = 0; i < nzones; ++i) { @@ -450,9 +449,7 @@ error: } kfree(bbio); kfree(re); - if (looped) - goto again; - return NULL; + return re_exist; } static void reada_kref_dummy(struct kref *kr) -- cgit v1.2.3 From 207a232ccac0a8cb79d304bd17298dbc96e2e082 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Sat, 25 Feb 2012 09:09:47 +0100 Subject: btrfs: don't add both copies of DUP to reada extent tree Normally when there are 2 copies of a block, we add both to the reada extent tree and prefetch only the one that is easier to reach. This way we can better utilize multiple devices. In case of DUP this makes no sense as both copies reside on the same device. Signed-off-by: Arne Jansen --- fs/btrfs/reada.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 8dec650099c8..ac5d01085884 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -326,6 +326,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; struct btrfs_bio *bbio = NULL; struct btrfs_device *dev; + struct btrfs_device *prev_dev; u32 blocksize; u64 length; int nzones = 0; @@ -405,8 +406,20 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, spin_unlock(&fs_info->reada_lock); goto error; } + prev_dev = NULL; for (i = 0; i < nzones; ++i) { dev = bbio->stripes[i].dev; + if (dev == prev_dev) { + /* + * in case of DUP, just add the first zone. As both + * are on the same device, there's nothing to gain + * from adding both. + * Also, it wouldn't work, as the tree is per device + * and adding would fail with EEXIST + */ + continue; + } + prev_dev = dev; ret = radix_tree_insert(&dev->reada_extents, index, re); if (ret) { while (--i >= 0) { -- cgit v1.2.3 From 8d082fb727ac11930ea20bf1612e334ea7c2b697 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Tue, 3 Apr 2012 09:56:53 +0800 Subject: Btrfs: do not mount when we have a sectorsize unequal to PAGE_SIZE Our code is not ready to cope with a sectorsize that's not equal to PAGE_SIZE. It will lead to hanging-on while writing something. Signed-off-by: Liu Bo --- fs/btrfs/disk-io.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 20196f411206..b9866f27ebd7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2254,9 +2254,9 @@ int open_ctree(struct super_block *sb, goto fail_sb_buffer; } - if (sectorsize < PAGE_SIZE) { - printk(KERN_WARNING "btrfs: Incompatible sector size " - "found on %s\n", sb->s_id); + if (sectorsize != PAGE_SIZE) { + printk(KERN_WARNING "btrfs: Incompatible sector size(%lu) " + "found on %s\n", (unsigned long)sectorsize, sb->s_id); goto fail_sb_buffer; } -- cgit v1.2.3 From 871383be592ba7e819d27556591e315a0df38cee Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 2 Apr 2012 18:31:37 +0200 Subject: btrfs: add missing unlocks to transaction abort paths Added in commit 49b25e0540904be0bf558b84475c69d72e4de66e ("btrfs: enhance transaction abort infrastructure") Reported-by: Dan Carpenter Signed-off-by: David Sterba --- fs/btrfs/transaction.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 11b77a59db62..36422254ef67 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -73,8 +73,10 @@ loop: cur_trans = root->fs_info->running_transaction; if (cur_trans) { - if (cur_trans->aborted) + if (cur_trans->aborted) { + spin_unlock(&root->fs_info->trans_lock); return cur_trans->aborted; + } atomic_inc(&cur_trans->use_count); atomic_inc(&cur_trans->num_writers); cur_trans->num_joined++; @@ -1400,6 +1402,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = commit_fs_roots(trans, root); if (ret) { mutex_unlock(&root->fs_info->tree_log_mutex); + mutex_unlock(&root->fs_info->reloc_mutex); goto cleanup_transaction; } @@ -1411,6 +1414,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, ret = commit_cowonly_roots(trans, root); if (ret) { mutex_unlock(&root->fs_info->tree_log_mutex); + mutex_unlock(&root->fs_info->reloc_mutex); goto cleanup_transaction; } -- cgit v1.2.3 From 8e52acf70459020d7e9e9fda25066be4da520943 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 12 Mar 2012 16:39:28 +0800 Subject: Btrfs: retrurn void from clear_state_bit Currently it returns a set of bits that were cleared, but this return value is not used at all. Moreover it doesn't seem to be useful, because we may clear the bits of a few extent_states, but only the cleared bits of last one is returned. Signed-off-by: Li Zefan --- fs/btrfs/extent_io.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4789770f8eaf..05951bdf72cc 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -404,18 +404,16 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, /* * utility function to clear some bits in an extent state struct. - * it will optionally wake up any one waiting on this state (wake == 1), or - * forcibly remove the state from the tree (delete == 1). + * it will optionally wake up any one waiting on this state (wake == 1) * * If no bits are set on the state struct after clearing things, the * struct is freed and removed from the tree */ -static int clear_state_bit(struct extent_io_tree *tree, +static void clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, int *bits, int wake) { int bits_to_clear = *bits & ~EXTENT_CTLBITS; - int ret = state->state & bits_to_clear; if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { u64 range = state->end - state->start + 1; @@ -437,7 +435,6 @@ static int clear_state_bit(struct extent_io_tree *tree, } else { merge_state(tree, state); } - return ret; } static struct extent_state * -- cgit v1.2.3 From cdc6a3952558f00b1bc3b6401e1cf98797632fe2 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 12 Mar 2012 16:39:48 +0800 Subject: Btrfs: avoid possible use-after-free in clear_extent_bit() clear_extent_bit() { next_node = rb_next(&state->rb_node); ... clear_state_bit(state); <-- this may free next_node if (next_node) { state = rb_entry(next_node); ... } } clear_state_bit() calls merge_state() which may free the next node of the passing extent_state, so clear_extent_bit() may end up referencing freed memory. Signed-off-by: Li Zefan --- fs/btrfs/extent_io.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 05951bdf72cc..11eeb81fe695 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -402,6 +402,15 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, return 0; } +static struct extent_state *next_state(struct extent_state *state) +{ + struct rb_node *next = rb_next(&state->rb_node); + if (next) + return rb_entry(next, struct extent_state, rb_node); + else + return NULL; +} + /* * utility function to clear some bits in an extent state struct. * it will optionally wake up any one waiting on this state (wake == 1) @@ -409,10 +418,11 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, * If no bits are set on the state struct after clearing things, the * struct is freed and removed from the tree */ -static void clear_state_bit(struct extent_io_tree *tree, - struct extent_state *state, - int *bits, int wake) +static struct extent_state *clear_state_bit(struct extent_io_tree *tree, + struct extent_state *state, + int *bits, int wake) { + struct extent_state *next; int bits_to_clear = *bits & ~EXTENT_CTLBITS; if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { @@ -425,6 +435,7 @@ static void clear_state_bit(struct extent_io_tree *tree, if (wake) wake_up(&state->wq); if (state->state == 0) { + next = next_state(state); if (state->tree) { rb_erase(&state->rb_node, &tree->state); state->tree = NULL; @@ -434,7 +445,9 @@ static void clear_state_bit(struct extent_io_tree *tree, } } else { merge_state(tree, state); + next = next_state(state); } + return next; } static struct extent_state * @@ -473,7 +486,6 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state *state; struct extent_state *cached; struct extent_state *prealloc = NULL; - struct rb_node *next_node; struct rb_node *node; u64 last_end; int err; @@ -525,14 +537,11 @@ hit_next: WARN_ON(state->end < start); last_end = state->end; - if (state->end < end && !need_resched()) - next_node = rb_next(&state->rb_node); - else - next_node = NULL; - /* the state doesn't have the wanted bits, go ahead */ - if (!(state->state & bits)) + if (!(state->state & bits)) { + state = next_state(state); goto next; + } /* * | ---- desired range ---- | @@ -590,16 +599,13 @@ hit_next: goto out; } - clear_state_bit(tree, state, &bits, wake); + state = clear_state_bit(tree, state, &bits, wake); next: if (last_end == (u64)-1) goto out; start = last_end + 1; - if (start <= end && next_node) { - state = rb_entry(next_node, struct extent_state, - rb_node); + if (start <= end && state && !need_resched()) goto hit_next; - } goto search_again; out: -- cgit v1.2.3 From 4735fb282830c0966b301dabcccf4753fa6604bb Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Thu, 12 Apr 2012 22:47:52 +0200 Subject: Btrfs: Make free_ipath() deal gracefully with NULL pointers Make free_ipath() behave like most other freeing functions in the kernel and gracefully do nothing when passed a NULL pointer. Besides this making the bahaviour consistent with functions such as kfree(), vfree(), btrfs_free_path() etc etc, it also fixes a real NULL deref issue in fs/btrfs/ioctl.c::btrfs_ioctl_ino_to_path(). In that function we have this code: ... ipath = init_ipath(size, root, path); if (IS_ERR(ipath)) { ret = PTR_ERR(ipath); ipath = NULL; goto out; } ... out: btrfs_free_path(path); free_ipath(ipath); ... If we ever take the true branch of that 'if' statement we'll end up passing a NULL pointer to free_ipath() which will subsequently dereference it and we'll go "Boom" :-( This patch will avoid that. Signed-off-by: Jesper Juhl --- fs/btrfs/backref.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f4e90748940a..b332ff04c5ee 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1414,6 +1414,8 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root, void free_ipath(struct inode_fs_paths *ipath) { + if (!ipath) + return; kfree(ipath->fspath); kfree(ipath); } -- cgit v1.2.3 From aefc1eb13ebbb86c5ffade8a9e2425cd71032d7e Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Fri, 13 Apr 2012 12:28:00 +0200 Subject: Btrfs: don't call free_extent_buffer twice in iterate_irefs Avoid calling free_extent_buffer more than once when the iterator function returns non-zero. The only code that uses this is scrub repair for corrupted nodatasum blocks. Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index b332ff04c5ee..fb56bcc80377 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1247,7 +1247,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, struct btrfs_path *path, iterate_irefs_t *iterate, void *ctx) { - int ret; + int ret = 0; int slot; u32 cur; u32 len; @@ -1259,7 +1259,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, struct btrfs_inode_ref *iref; struct btrfs_key found_key; - while (1) { + while (!ret) { ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, &found_key); if (ret < 0) @@ -1288,10 +1288,8 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, (unsigned long long)found_key.objectid, (unsigned long long)fs_root->objectid); ret = iterate(parent, iref, eb, ctx); - if (ret) { - free_extent_buffer(eb); + if (ret) break; - } len = sizeof(*iref) + name_len; iref = (struct btrfs_inode_ref *)((char *)iref + len); } -- cgit v1.2.3 From b916a59adfdc875381b68ced258694b434cf43ae Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Fri, 13 Apr 2012 12:28:08 +0200 Subject: Btrfs: add missing read locks in backref.c iref_to_path and iterate_irefs both increment the eb's refcount to use it after releasing the path. Both depend on consistent data remaining in the extent buffer and need a read lock to protect it. Signed-off-by: Jan Schmidt --- fs/btrfs/backref.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index fb56bcc80377..bcec06750232 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -22,6 +22,7 @@ #include "ulist.h" #include "transaction.h" #include "delayed-ref.h" +#include "locking.h" /* * this structure records all encountered refs on the way up to the root @@ -893,18 +894,22 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, s64 bytes_left = size - 1; struct extent_buffer *eb = eb_in; struct btrfs_key found_key; + int leave_spinning = path->leave_spinning; if (bytes_left >= 0) dest[bytes_left] = '\0'; + path->leave_spinning = 1; while (1) { len = btrfs_inode_ref_name_len(eb, iref); bytes_left -= len; if (bytes_left >= 0) read_extent_buffer(eb, dest + bytes_left, (unsigned long)(iref + 1), len); - if (eb != eb_in) + if (eb != eb_in) { + btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); + } ret = inode_ref_info(parent, 0, fs_root, path, &found_key); if (ret > 0) ret = -ENOENT; @@ -919,8 +924,11 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, slot = path->slots[0]; eb = path->nodes[0]; /* make sure we can use eb after releasing the path */ - if (eb != eb_in) + if (eb != eb_in) { atomic_inc(&eb->refs); + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); + } btrfs_release_path(path); iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref); @@ -931,6 +939,7 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, } btrfs_release_path(path); + path->leave_spinning = leave_spinning; if (ret) return ERR_PTR(ret); @@ -1260,6 +1269,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, struct btrfs_key found_key; while (!ret) { + path->leave_spinning = 1; ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, &found_key); if (ret < 0) @@ -1275,6 +1285,8 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, eb = path->nodes[0]; /* make sure we can use eb after releasing the path */ atomic_inc(&eb->refs); + btrfs_tree_read_lock(eb); + btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); btrfs_release_path(path); item = btrfs_item_nr(eb, slot); @@ -1293,6 +1305,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root, len = sizeof(*iref) + name_len; iref = (struct btrfs_inode_ref *)((char *)iref + len); } + btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); } -- cgit v1.2.3 From 37db63a400d3bac467795aa43901065fd8d903b7 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 13 Apr 2012 17:05:08 +0300 Subject: Btrfs: fix max chunk size check in chunk allocator Fix a bug, where in case we need to adjust stripe_size so that the length of the resulting chunk is less than or equal to max_chunk_size, DUP chunks turn out to be only half as big as they could be. Cc: Arne Jansen Signed-off-by: Ilya Dryomov --- fs/btrfs/volumes.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 759d02486d7c..ce289af526f0 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3324,12 +3324,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, stripe_size = devices_info[ndevs-1].max_avail; num_stripes = ndevs * dev_stripes; - if (stripe_size * num_stripes > max_chunk_size * ncopies) { + if (stripe_size * ndevs > max_chunk_size * ncopies) { stripe_size = max_chunk_size * ncopies; - do_div(stripe_size, num_stripes); + do_div(stripe_size, ndevs); } do_div(stripe_size, dev_stripes); + + /* align to BTRFS_STRIPE_LEN */ do_div(stripe_size, BTRFS_STRIPE_LEN); stripe_size *= BTRFS_STRIPE_LEN; -- cgit v1.2.3 From 8a3db1849e9e2563727ea2dc32737502e0096641 Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Mon, 16 Apr 2012 06:44:37 +0300 Subject: btrfs: fix early abort in 'remount' Cc: Jeff Mahoney Cc: Chris Mason Cc: Josef Bacik Signed-off-by: Sergei Trofimovich --- fs/btrfs/super.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 84571d7da12e..43aa2dd0bc7d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1152,13 +1152,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (ret) goto restore; } else { - if (fs_info->fs_devices->rw_devices == 0) + if (fs_info->fs_devices->rw_devices == 0) { ret = -EACCES; goto restore; + } - if (btrfs_super_log_root(fs_info->super_copy) != 0) + if (btrfs_super_log_root(fs_info->super_copy) != 0) { ret = -EINVAL; goto restore; + } ret = btrfs_cleanup_fs_roots(fs_info); if (ret) -- cgit v1.2.3 From 48d282326b3ce5f435835f5fb0e3231c399f4f9a Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 14 Apr 2012 11:24:33 +0200 Subject: fs/btrfs/volumes.c: add missing free_fs_devices Free fs_devices as done in the error-handling code just below. Signed-off-by: Julia Lawall --- fs/btrfs/volumes.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ce289af526f0..3b984173d25b 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4352,8 +4352,10 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid) ret = __btrfs_open_devices(fs_devices, FMODE_READ, root->fs_info->bdev_holder); - if (ret) + if (ret) { + free_fs_devices(fs_devices); goto out; + } if (!fs_devices->seeding) { __btrfs_close_devices(fs_devices); -- cgit v1.2.3 From 5cf1ab56133ad7b712673c071b439d4a555a2d1e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 16 Apr 2012 09:42:26 -0400 Subject: Btrfs: always store the mirror we read the eb from A user reported a panic where we were trying to fix a bad mirror but the mirror number we were giving was 0, which is invalid. This is because we don't do the transid verification until after the read, so as far as the read code is concerned the read was a success. So instead store the mirror we read from so that if there is some failure post read we know which mirror to try next and which mirror needs to be fixed if we find a good copy of the block. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/disk-io.c | 16 ++++++++-------- fs/btrfs/extent_io.c | 15 ++++++--------- fs/btrfs/extent_io.h | 4 ++-- fs/btrfs/inode.c | 2 +- 4 files changed, 17 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b9866f27ebd7..d0c969beaad4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -383,17 +383,16 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags)) break; - if (!failed_mirror) { - failed = 1; - printk(KERN_ERR "failed mirror was %d\n", eb->failed_mirror); - failed_mirror = eb->failed_mirror; - } - num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, eb->start, eb->len); if (num_copies == 1) break; + if (!failed_mirror) { + failed = 1; + failed_mirror = eb->read_mirror; + } + mirror_num++; if (mirror_num == failed_mirror) mirror_num++; @@ -564,7 +563,7 @@ struct extent_buffer *find_eb_for_page(struct extent_io_tree *tree, } static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, - struct extent_state *state) + struct extent_state *state, int mirror) { struct extent_io_tree *tree; u64 found_start; @@ -589,6 +588,7 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, if (!reads_done) goto err; + eb->read_mirror = mirror; if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { ret = -EIO; goto err; @@ -652,7 +652,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) eb = (struct extent_buffer *)page->private; set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); - eb->failed_mirror = failed_mirror; + eb->read_mirror = failed_mirror; if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) btree_readahead_hook(root, eb, eb->start, -EIO); return -EIO; /* we fixed nothing */ diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 11eeb81fe695..0c23e57077c6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2304,7 +2304,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) u64 start; u64 end; int whole_page; - int failed_mirror; + int mirror; int ret; if (err) @@ -2343,20 +2343,18 @@ static void end_bio_extent_readpage(struct bio *bio, int err) } spin_unlock(&tree->lock); + mirror = (int)(unsigned long)bio->bi_bdev; if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { ret = tree->ops->readpage_end_io_hook(page, start, end, - state); + state, mirror); if (ret) uptodate = 0; else clean_io_failure(start, page); } - if (!uptodate) - failed_mirror = (int)(unsigned long)bio->bi_bdev; - if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) { - ret = tree->ops->readpage_io_failed_hook(page, failed_mirror); + ret = tree->ops->readpage_io_failed_hook(page, mirror); if (!ret && !err && test_bit(BIO_UPTODATE, &bio->bi_flags)) uptodate = 1; @@ -2371,8 +2369,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) * can't handle the error it will return -EIO and we * remain responsible for that page. */ - ret = bio_readpage_error(bio, page, start, end, - failed_mirror, NULL); + ret = bio_readpage_error(bio, page, start, end, mirror, NULL); if (ret == 0) { uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); @@ -4465,7 +4462,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, } clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); - eb->failed_mirror = 0; + eb->read_mirror = 0; atomic_set(&eb->io_pages, num_reads); for (i = start_i; i < num_pages; i++) { page = extent_buffer_page(eb, i); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index faf10eb57f75..b516c3b8dec6 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -79,7 +79,7 @@ struct extent_io_ops { u64 start, u64 end, struct extent_state *state); int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, - struct extent_state *state); + struct extent_state *state, int mirror); int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, struct extent_state *state, int uptodate); void (*set_bit_hook)(struct inode *inode, struct extent_state *state, @@ -135,7 +135,7 @@ struct extent_buffer { spinlock_t refs_lock; atomic_t refs; atomic_t io_pages; - int failed_mirror; + int read_mirror; struct list_head leak_list; struct rcu_head rcu_head; pid_t lock_owner; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 98ee5a51aa29..d953f8820464 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1947,7 +1947,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, * extent_io.c will try to find good copies for us. */ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, - struct extent_state *state) + struct extent_state *state, int mirror) { size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); struct inode *inode = page->mapping->host; -- cgit v1.2.3 From 253beebd5a255e07d6a8b65515491f33664e82a2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 18 Apr 2012 09:59:03 +0300 Subject: Btrfs: double unlock bug in error handling The caller expects this function to return with the lock held and releases it immediately on error. Signed-off-by: Dan Carpenter --- fs/btrfs/extent-tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2b35f8d14bb9..a0bb9dcd3c36 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2301,6 +2301,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, if (ret) { printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); + spin_lock(&delayed_refs->lock); return ret; } @@ -2331,6 +2332,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, if (ret) { printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); + spin_lock(&delayed_refs->lock); return ret; } -- cgit v1.2.3 From b9688bb8459b67e42327de6420edb405a9188775 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Wed, 18 Apr 2012 10:27:16 +0200 Subject: btrfs: don't return EINTR It is basically a good thing if we are interruptible when waiting for free space, but the generality in which it is implemented currently leads to system calls being interruptible that are not documented this way. For example git can't handle interrupted unlink(), leading to corrupt repos under space pressure. Instead we raise the bar to only be interruptible by SIGKILL. Thanks to David Sterba for suggesting this. Signed-off-by: Arne Jansen --- fs/btrfs/extent-tree.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a0bb9dcd3c36..84497f8eb043 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3771,13 +3771,10 @@ again: */ if (current->journal_info) return -EAGAIN; - ret = wait_event_interruptible(space_info->wait, - !space_info->flush); - /* Must have been interrupted, return */ - if (ret) { - printk(KERN_DEBUG "btrfs: %s returning -EINTR\n", __func__); + ret = wait_event_killable(space_info->wait, !space_info->flush); + /* Must have been killed, return */ + if (ret) return -EINTR; - } spin_lock(&space_info->lock); } -- cgit v1.2.3 From 99ba55ad696944b37d5557bc5b4816890854fdb9 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Mon, 19 Mar 2012 16:17:22 +0100 Subject: Btrfs: fix btrfs_ioctl_dev_info() crash on missing device When a filesystem is mounted with the degraded option, it is possible that some of the devices are not there. btrfs_ioctl_dev_info() crashs in this case because the device name is a NULL pointer. This ioctl was only used for scrub. Signed-off-by: Stefan Behrens --- fs/btrfs/ioctl.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 18cc23d164a8..14f8e1faa46e 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2262,7 +2262,10 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) di_args->bytes_used = dev->bytes_used; di_args->total_bytes = dev->total_bytes; memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); - strncpy(di_args->path, dev->name, sizeof(di_args->path)); + if (dev->name) + strncpy(di_args->path, dev->name, sizeof(di_args->path)); + else + di_args->path[0] = '\0'; out: if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args))) -- cgit v1.2.3 From 5c84fc3c3914e9adfa6155a167c6c0c2709e6a62 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 30 Mar 2012 13:58:31 +0200 Subject: Btrfs: don't count CRC or header errors twice while scrubbing Each CRC or header error was counted twice, this is now fixed. Signed-off-by: Stefan Behrens --- fs/btrfs/scrub.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 60f0e28db31e..b679bf68861e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1258,12 +1258,6 @@ static int scrub_checksum_data(struct scrub_block *sblock) if (memcmp(csum, on_disk_csum, sdev->csum_size)) fail = 1; - if (fail) { - spin_lock(&sdev->stat_lock); - ++sdev->stat.csum_errors; - spin_unlock(&sdev->stat_lock); - } - return fail; } @@ -1336,15 +1330,6 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size)) ++crc_fail; - if (crc_fail || fail) { - spin_lock(&sdev->stat_lock); - if (crc_fail) - ++sdev->stat.csum_errors; - if (fail) - ++sdev->stat.verify_errors; - spin_unlock(&sdev->stat_lock); - } - return fail || crc_fail; } -- cgit v1.2.3 From 25cd999e1a685dab65292afbe9fa24d790d8a859 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 30 Mar 2012 13:58:32 +0200 Subject: Btrfs: fix that check_int_data mount option was ignored The bitfield member mount_opt was too small by one bit to hold the mount option that enabled to include data extents in the integrity checker. Since the same issue happened when the BTRFS_MOUNT_PANIC_ON_FATAL_ERROR option was added (git rebase silently merges so that the increase of the size of the bitfield member is lost), the bit limit was removed entirely. Signed-off-by: Stefan Behrens --- fs/btrfs/ctree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5b8ef8eb3521..ec42a24e935e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1078,7 +1078,7 @@ struct btrfs_fs_info { * is required instead of the faster short fsync log commits */ u64 last_trans_log_full_commit; - unsigned long mount_opt:21; + unsigned long mount_opt; unsigned long compress_type:4; u64 max_inline; u64 alloc_start; -- cgit v1.2.3 From 05ffe24f5290dc095f98fbaf84afe51ef404ccc5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 18 Apr 2012 12:20:10 -0400 Subject: NFSv4: Ensure that the LOCK code sets exception->inode All callers of nfs4_handle_exception() that need to handle NFS4ERR_OPENMODE correctly should set exception->inode Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/nfs4proc.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f82bde005a82..3c787d02131b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4558,7 +4558,9 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request) { struct nfs_server *server = NFS_SERVER(state->inode); - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .inode = state->inode, + }; int err; do { @@ -4576,7 +4578,9 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request) { struct nfs_server *server = NFS_SERVER(state->inode); - struct nfs4_exception exception = { }; + struct nfs4_exception exception = { + .inode = state->inode, + }; int err; err = nfs4_set_lock_state(state, request); @@ -4676,6 +4680,7 @@ static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock * { struct nfs4_exception exception = { .state = state, + .inode = state->inode, }; int err; -- cgit v1.2.3 From 55725513b5ef9d462aa3e18527658a0362aaae83 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 18 Apr 2012 12:48:35 -0400 Subject: NFSv4: Ensure that we check lock exclusive/shared type against open modes Since we may be simulating flock() locks using NFS byte range locks, we can't rely on the VFS having checked the file open mode for us. Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/nfs4proc.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 3c787d02131b..ba837d977a1a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -4726,6 +4726,20 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) if (state == NULL) return -ENOLCK; + /* + * Don't rely on the VFS having checked the file open mode, + * since it won't do this for flock() locks. + */ + switch (request->fl_type & (F_RDLCK|F_WRLCK|F_UNLCK)) { + case F_RDLCK: + if (!(filp->f_mode & FMODE_READ)) + return -EBADF; + break; + case F_WRLCK: + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + } + do { status = nfs4_proc_setlk(state, cmd, request); if ((status != -EAGAIN) || IS_SETLK(cmd)) -- cgit v1.2.3 From 451146be933e26e21277852b5e40c6a52266ef96 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 18 Apr 2012 16:29:11 -0400 Subject: NFSv4: Fix open(O_TRUNC) and ftruncate() error handling If the file wasn't opened for writing, then truncate and ftruncate need to report the appropriate errors. Reported-by: Miklos Szeredi Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/dir.c | 4 ++-- fs/nfs/nfs4proc.c | 15 ++++++++++++--- 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 4aaf0316d76a..8789210c6905 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1429,7 +1429,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry } open_flags = nd->intent.open.flags; - attr.ia_valid = 0; + attr.ia_valid = ATTR_OPEN; ctx = create_nfs_open_context(dentry, open_flags); res = ERR_CAST(ctx); @@ -1536,7 +1536,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) if (IS_ERR(ctx)) goto out; - attr.ia_valid = 0; + attr.ia_valid = ATTR_OPEN; if (openflags & O_TRUNC) { attr.ia_valid |= ATTR_SIZE; attr.ia_size = 0; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ba837d977a1a..f875cf305237 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1954,10 +1954,19 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, }; int err; do { - err = nfs4_handle_exception(server, - _nfs4_do_setattr(inode, cred, fattr, sattr, state), - &exception); + err = _nfs4_do_setattr(inode, cred, fattr, sattr, state); + switch (err) { + case -NFS4ERR_OPENMODE: + if (state && !(state->state & FMODE_WRITE)) { + err = -EBADF; + if (sattr->ia_valid & ATTR_OPEN) + err = -EACCES; + goto out; + } + } + err = nfs4_handle_exception(server, err, &exception); } while (exception.retry); +out: return err; } -- cgit v1.2.3 From 3af9d8f227a31e25b3110ef175d105798fc147a6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 13 Apr 2012 17:16:59 -0400 Subject: cifs: fix offset handling in cifs_iovec_write In the recent update of the cifs_iovec_write code to use async writes, the handling of the file position was broken. That patch added a local "offset" variable to handle the offset, and then only updated the original "*poffset" before exiting. Unfortunately, it copied off the original offset from the beginning, instead of doing so after generic_write_checks had been called. Fix this by moving the initialization of "offset" after that in the function. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/file.c b/fs/cifs/file.c index fae765dac934..81725e9286e9 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2178,7 +2178,7 @@ cifs_iovec_write(struct file *file, const struct iovec *iov, unsigned long nr_pages, i; size_t copied, len, cur_len; ssize_t total_written = 0; - loff_t offset = *poffset; + loff_t offset; struct iov_iter it; struct cifsFileInfo *open_file; struct cifs_tcon *tcon; @@ -2200,6 +2200,7 @@ cifs_iovec_write(struct file *file, const struct iovec *iov, cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); open_file = file->private_data; tcon = tlink_tcon(open_file->tlink); + offset = *poffset; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD) pid = open_file->pid; -- cgit v1.2.3 From 73fb7bc7c57d971b11f2e00536ac2d3e316e0609 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Fri, 20 Apr 2012 14:47:34 -0400 Subject: NFS: put open context on error in nfs_pagein_multi Cc: Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/read.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 9a0e8ef4a409..0a4be28c2ea3 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -322,7 +322,7 @@ out_bad: while (!list_empty(res)) { data = list_entry(res->next, struct nfs_read_data, list); list_del(&data->list); - nfs_readdata_free(data); + nfs_readdata_release(data); } nfs_readpage_release(req); return -ENOMEM; -- cgit v1.2.3 From 8ccd271f7a3a846ce6f85ead0760d9d12994a611 Mon Sep 17 00:00:00 2001 From: Fred Isaman Date: Fri, 20 Apr 2012 14:47:35 -0400 Subject: NFS: put open context on error in nfs_flush_multi Cc: Signed-off-by: Fred Isaman Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 9b8d4d42a8af..c07462320f6b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1019,7 +1019,7 @@ out_bad: while (!list_empty(res)) { data = list_entry(res->next, struct nfs_write_data, list); list_del(&data->list); - nfs_writedata_free(data); + nfs_writedata_release(data); } nfs_redirty_request(req); return -ENOMEM; -- cgit v1.2.3 From 98a2139f4f4d7b5fcc3a54c7fddbe88612abed20 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sat, 3 Sep 2011 01:09:43 +0200 Subject: nfs: Enclose hostname in brackets when needed in nfs_do_root_mount When hostname contains colon (e.g. when it is an IPv6 address) it needs to be enclosed in brackets to make parsing of NFS device string possible. Fix nfs_do_root_mount() to enclose hostname properly when needed. NFS code actually does not need this as it does not parse the string passed by nfs_do_root_mount() but the device string is exposed to userspace in /proc/mounts. CC: Josh Boyer CC: Trond Myklebust Signed-off-by: Jan Kara Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 37412f706b32..1e6715f0616c 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2767,11 +2767,15 @@ static struct vfsmount *nfs_do_root_mount(struct file_system_type *fs_type, char *root_devname; size_t len; - len = strlen(hostname) + 3; + len = strlen(hostname) + 5; root_devname = kmalloc(len, GFP_KERNEL); if (root_devname == NULL) return ERR_PTR(-ENOMEM); - snprintf(root_devname, len, "%s:/", hostname); + /* Does hostname needs to be enclosed in brackets? */ + if (strchr(hostname, ':')) + snprintf(root_devname, len, "[%s]:/", hostname); + else + snprintf(root_devname, len, "%s:/", hostname); root_mnt = vfs_kern_mount(fs_type, flags, root_devname, data); kfree(root_devname); return root_mnt; -- cgit v1.2.3 From e4eb1ff61b323d6141614e5458a1f53c7046ff8e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 20 Apr 2012 15:35:40 -0700 Subject: VM: add "vm_brk()" helper function It does the same thing as "do_brk()", except it handles the VM locking too. It turns out that all external callers want that anyway, so we can make do_brk() static to just mm/mmap.c while at it. Signed-off-by: Linus Torvalds --- fs/binfmt_aout.c | 20 +++++--------------- fs/binfmt_elf.c | 15 ++++----------- 2 files changed, 9 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 2eb12f13593d..88527492b917 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -50,9 +50,7 @@ static int set_brk(unsigned long start, unsigned long end) end = PAGE_ALIGN(end); if (end > start) { unsigned long addr; - down_write(¤t->mm->mmap_sem); - addr = do_brk(start, end - start); - up_write(¤t->mm->mmap_sem); + addr = vm_brk(start, end - start); if (BAD_ADDR(addr)) return addr; } @@ -280,9 +278,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) pos = 32; map_size = ex.a_text+ex.a_data; #endif - down_write(¤t->mm->mmap_sem); - error = do_brk(text_addr & PAGE_MASK, map_size); - up_write(¤t->mm->mmap_sem); + error = vm_brk(text_addr & PAGE_MASK, map_size); if (error != (text_addr & PAGE_MASK)) { send_sig(SIGKILL, current, 0); return error; @@ -313,9 +309,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) { loff_t pos = fd_offset; - down_write(¤t->mm->mmap_sem); - do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); - up_write(¤t->mm->mmap_sem); + vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data); bprm->file->f_op->read(bprm->file, (char __user *)N_TXTADDR(ex), ex.a_text+ex.a_data, &pos); @@ -412,9 +406,7 @@ static int load_aout_library(struct file *file) "N_TXTOFF is not page aligned. Please convert library: %s\n", file->f_path.dentry->d_name.name); } - down_write(¤t->mm->mmap_sem); - do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); - up_write(¤t->mm->mmap_sem); + vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); file->f_op->read(file, (char __user *)start_addr, ex.a_text + ex.a_data, &pos); @@ -438,9 +430,7 @@ static int load_aout_library(struct file *file) len = PAGE_ALIGN(ex.a_text + ex.a_data); bss = ex.a_text + ex.a_data + ex.a_bss; if (bss > len) { - down_write(¤t->mm->mmap_sem); - error = do_brk(start_addr + len, bss - len); - up_write(¤t->mm->mmap_sem); + error = vm_brk(start_addr + len, bss - len); retval = error; if (error != start_addr + len) goto out; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 48ffb3dc610a..0708a0bf0ba9 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -82,9 +82,7 @@ static int set_brk(unsigned long start, unsigned long end) end = ELF_PAGEALIGN(end); if (end > start) { unsigned long addr; - down_write(¤t->mm->mmap_sem); - addr = do_brk(start, end - start); - up_write(¤t->mm->mmap_sem); + addr = vm_brk(start, end - start); if (BAD_ADDR(addr)) return addr; } @@ -514,9 +512,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1); /* Map the last of the bss segment */ - down_write(¤t->mm->mmap_sem); - error = do_brk(elf_bss, last_bss - elf_bss); - up_write(¤t->mm->mmap_sem); + error = vm_brk(elf_bss, last_bss - elf_bss); if (BAD_ADDR(error)) goto out_close; } @@ -1072,11 +1068,8 @@ static int load_elf_library(struct file *file) len = ELF_PAGESTART(eppnt->p_filesz + eppnt->p_vaddr + ELF_MIN_ALIGN - 1); bss = eppnt->p_memsz + eppnt->p_vaddr; - if (bss > len) { - down_write(¤t->mm->mmap_sem); - do_brk(len, bss - len); - up_write(¤t->mm->mmap_sem); - } + if (bss > len) + vm_brk(len, bss - len); error = 0; out_free_ph: -- cgit v1.2.3 From a46ef99d80817a167477ed1c8b4d90ee0c2e726f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 20 Apr 2012 16:20:01 -0700 Subject: VM: add "vm_munmap()" helper function Like the vm_brk() function, this is the same as "do_munmap()", except it does the VM locking for the caller. Signed-off-by: Linus Torvalds --- fs/aio.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index da887604dfc5..99bd790e8cd2 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -92,11 +92,8 @@ static void aio_free_ring(struct kioctx *ctx) for (i=0; inr_pages; i++) put_page(info->ring_pages[i]); - if (info->mmap_size) { - down_write(&ctx->mm->mmap_sem); - do_munmap(ctx->mm, info->mmap_base, info->mmap_size); - up_write(&ctx->mm->mmap_sem); - } + if (info->mmap_size) + vm_munmap(ctx->mm, info->mmap_base, info->mmap_size); if (info->ring_pages && info->ring_pages != info->internal_pages) kfree(info->ring_pages); -- cgit v1.2.3 From 6be5ceb02e98eaf6cfc4f8b12a896d04023f340d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 20 Apr 2012 17:13:58 -0700 Subject: VM: add "vm_mmap()" helper function This continues the theme started with vm_brk() and vm_munmap(): vm_mmap() does the same thing as do_mmap(), but additionally does the required VM locking. This uninlines (and rewrites it to be clearer) do_mmap(), which sadly duplicates it in mm/mmap.c and mm/nommu.c. But that way we don't have to export our internal do_mmap_pgoff() function. Some day we hopefully don't have to export do_mmap() either, if all modular users can become the simpler vm_mmap() instead. We're actually very close to that already, with the notable exception of the (broken) use in i810, and a couple of stragglers in binfmt_elf. Signed-off-by: Linus Torvalds --- fs/binfmt_aout.c | 12 +++--------- fs/binfmt_elf.c | 8 ++------ fs/binfmt_elf_fdpic.c | 18 ++++-------------- fs/binfmt_flat.c | 12 +++--------- fs/binfmt_som.c | 12 +++--------- 5 files changed, 15 insertions(+), 47 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index 88527492b917..d146e181d10d 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -319,24 +319,20 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) goto beyond_if; } - down_write(¤t->mm->mmap_sem); - error = do_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, + error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text, PROT_READ | PROT_EXEC, MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE, fd_offset); - up_write(¤t->mm->mmap_sem); if (error != N_TXTADDR(ex)) { send_sig(SIGKILL, current, 0); return error; } - down_write(¤t->mm->mmap_sem); - error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data, + error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE, fd_offset + ex.a_text); - up_write(¤t->mm->mmap_sem); if (error != N_DATADDR(ex)) { send_sig(SIGKILL, current, 0); return error; @@ -417,12 +413,10 @@ static int load_aout_library(struct file *file) goto out; } /* Now use mmap to map the library into memory. */ - down_write(¤t->mm->mmap_sem); - error = do_mmap(file, start_addr, ex.a_text + ex.a_data, + error = vm_mmap(file, start_addr, ex.a_text + ex.a_data, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, N_TXTOFF(ex)); - up_write(¤t->mm->mmap_sem); retval = error; if (error != start_addr) goto out; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 0708a0bf0ba9..16f735417072 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -958,10 +958,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) and some applications "depend" upon this behavior. Since we do not have the power to recompile these, we emulate the SVr4 behavior. Sigh. */ - down_write(¤t->mm->mmap_sem); - error = do_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC, + error = vm_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC, MAP_FIXED | MAP_PRIVATE, 0); - up_write(¤t->mm->mmap_sem); } #ifdef ELF_PLAT_INIT @@ -1046,8 +1044,7 @@ static int load_elf_library(struct file *file) eppnt++; /* Now use mmap to map the library into memory. */ - down_write(¤t->mm->mmap_sem); - error = do_mmap(file, + error = vm_mmap(file, ELF_PAGESTART(eppnt->p_vaddr), (eppnt->p_filesz + ELF_PAGEOFFSET(eppnt->p_vaddr)), @@ -1055,7 +1052,6 @@ static int load_elf_library(struct file *file) MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE, (eppnt->p_offset - ELF_PAGEOFFSET(eppnt->p_vaddr))); - up_write(¤t->mm->mmap_sem); if (error != ELF_PAGESTART(eppnt->p_vaddr)) goto out_free_ph; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 9bd5612a8224..d390a0fffc65 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -390,21 +390,17 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, (executable_stack == EXSTACK_DEFAULT && VM_STACK_FLAGS & VM_EXEC)) stack_prot |= PROT_EXEC; - down_write(¤t->mm->mmap_sem); - current->mm->start_brk = do_mmap(NULL, 0, stack_size, stack_prot, + current->mm->start_brk = vm_mmap(NULL, 0, stack_size, stack_prot, MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED | MAP_GROWSDOWN, 0); if (IS_ERR_VALUE(current->mm->start_brk)) { - up_write(¤t->mm->mmap_sem); retval = current->mm->start_brk; current->mm->start_brk = 0; goto error_kill; } - up_write(¤t->mm->mmap_sem); - current->mm->brk = current->mm->start_brk; current->mm->context.end_brk = current->mm->start_brk; current->mm->context.end_brk += @@ -955,10 +951,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux( if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE) mflags |= MAP_EXECUTABLE; - down_write(&mm->mmap_sem); - maddr = do_mmap(NULL, load_addr, top - base, + maddr = vm_mmap(NULL, load_addr, top - base, PROT_READ | PROT_WRITE | PROT_EXEC, mflags, 0); - up_write(&mm->mmap_sem); if (IS_ERR_VALUE(maddr)) return (int) maddr; @@ -1096,10 +1090,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, /* create the mapping */ disp = phdr->p_vaddr & ~PAGE_MASK; - down_write(&mm->mmap_sem); - maddr = do_mmap(file, maddr, phdr->p_memsz + disp, prot, flags, + maddr = vm_mmap(file, maddr, phdr->p_memsz + disp, prot, flags, phdr->p_offset - disp); - up_write(&mm->mmap_sem); kdebug("mmap[%d] sz=%lx pr=%x fl=%x of=%lx --> %08lx", loop, phdr->p_memsz + disp, prot, flags, @@ -1143,10 +1135,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, unsigned long xmaddr; flags |= MAP_FIXED | MAP_ANONYMOUS; - down_write(&mm->mmap_sem); - xmaddr = do_mmap(NULL, xaddr, excess - excess1, + xmaddr = vm_mmap(NULL, xaddr, excess - excess1, prot, flags, 0); - up_write(&mm->mmap_sem); kdebug("mmap[%d] " " ad=%lx sz=%lx pr=%x fl=%x of=0 --> %08lx", diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 024d20ee3ca3..6b2daf99fab8 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -542,10 +542,8 @@ static int load_flat_file(struct linux_binprm * bprm, */ DBG_FLT("BINFMT_FLAT: ROM mapping of file (we hope)\n"); - down_write(¤t->mm->mmap_sem); - textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, + textpos = vm_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_EXECUTABLE, 0); - up_write(¤t->mm->mmap_sem); if (!textpos || IS_ERR_VALUE(textpos)) { if (!textpos) textpos = (unsigned long) -ENOMEM; @@ -556,10 +554,8 @@ static int load_flat_file(struct linux_binprm * bprm, len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); len = PAGE_ALIGN(len); - down_write(¤t->mm->mmap_sem); - realdatastart = do_mmap(0, 0, len, + realdatastart = vm_mmap(0, 0, len, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); - up_write(¤t->mm->mmap_sem); if (realdatastart == 0 || IS_ERR_VALUE(realdatastart)) { if (!realdatastart) @@ -603,10 +599,8 @@ static int load_flat_file(struct linux_binprm * bprm, len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); len = PAGE_ALIGN(len); - down_write(¤t->mm->mmap_sem); - textpos = do_mmap(0, 0, len, + textpos = vm_mmap(0, 0, len, PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); - up_write(¤t->mm->mmap_sem); if (!textpos || IS_ERR_VALUE(textpos)) { if (!textpos) diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index e4fc746629a7..4517aaff61b4 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -147,10 +147,8 @@ static int map_som_binary(struct file *file, code_size = SOM_PAGEALIGN(hpuxhdr->exec_tsize); current->mm->start_code = code_start; current->mm->end_code = code_start + code_size; - down_write(¤t->mm->mmap_sem); - retval = do_mmap(file, code_start, code_size, prot, + retval = vm_mmap(file, code_start, code_size, prot, flags, SOM_PAGESTART(hpuxhdr->exec_tfile)); - up_write(¤t->mm->mmap_sem); if (retval < 0 && retval > -1024) goto out; @@ -158,20 +156,16 @@ static int map_som_binary(struct file *file, data_size = SOM_PAGEALIGN(hpuxhdr->exec_dsize); current->mm->start_data = data_start; current->mm->end_data = bss_start = data_start + data_size; - down_write(¤t->mm->mmap_sem); - retval = do_mmap(file, data_start, data_size, + retval = vm_mmap(file, data_start, data_size, prot | PROT_WRITE, flags, SOM_PAGESTART(hpuxhdr->exec_dfile)); - up_write(¤t->mm->mmap_sem); if (retval < 0 && retval > -1024) goto out; som_brk = bss_start + SOM_PAGEALIGN(hpuxhdr->exec_bsize); current->mm->start_brk = current->mm->brk = som_brk; - down_write(¤t->mm->mmap_sem); - retval = do_mmap(NULL, bss_start, som_brk - bss_start, + retval = vm_mmap(NULL, bss_start, som_brk - bss_start, prot | PROT_WRITE, MAP_FIXED | MAP_PRIVATE, 0); - up_write(¤t->mm->mmap_sem); if (retval > 0 || retval < -1024) retval = 0; out: -- cgit v1.2.3 From 95b72eb0bdef6476b7e73061f0382adf46c5495a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 20 Apr 2012 19:24:51 -0400 Subject: NFSv4: Ensure we do not reuse open owner names The NFSv4 spec is ambiguous about whether or not it is permissible to reuse open owner names, so play it safe. This patch adds a timestamp to the state_owner structure, and combines that with the IDA based uniquifier. Fixes a regression whereby the Linux server returns NFS4ERR_BAD_SEQID. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 1 + fs/nfs/nfs4proc.c | 6 +++--- fs/nfs/nfs4state.c | 1 + fs/nfs/nfs4xdr.c | 9 +++++---- 4 files changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 97ecc863dd76..b6db9e33fb7b 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -59,6 +59,7 @@ struct nfs_unique_id { #define NFS_SEQID_CONFIRMED 1 struct nfs_seqid_counter { + ktime_t create_time; int owner_id; int flags; u32 counter; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f875cf305237..60d5f4c26dda 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -838,7 +838,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->o_arg.open_flags = flags; p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE); p->o_arg.clientid = server->nfs_client->cl_clientid; - p->o_arg.id = sp->so_seqid.owner_id; + p->o_arg.id.create_time = ktime_to_ns(sp->so_seqid.create_time); + p->o_arg.id.uniquifier = sp->so_seqid.owner_id; p->o_arg.name = &dentry->d_name; p->o_arg.server = server; p->o_arg.bitmask = server->attr_bitmask; @@ -1466,8 +1467,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) goto unlock_no_action; rcu_read_unlock(); } - /* Update sequence id. */ - data->o_arg.id = sp->so_seqid.owner_id; + /* Update client id. */ data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid; if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) { task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 0f43414eb25a..3b07f094f3a9 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -393,6 +393,7 @@ nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp) static void nfs4_init_seqid_counter(struct nfs_seqid_counter *sc) { + sc->create_time = ktime_get(); sc->flags = 0; sc->counter = 0; spin_lock_init(&sc->lock); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index c74fdb114b48..77fc5f959c4e 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -74,7 +74,7 @@ static int nfs4_stat_to_errno(int); /* lock,open owner id: * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2) */ -#define open_owner_id_maxsz (1 + 1 + 4) +#define open_owner_id_maxsz (1 + 2 + 1 + 1 + 2) #define lock_owner_id_maxsz (1 + 1 + 4) #define decode_lockowner_maxsz (1 + XDR_QUADLEN(IDMAP_NAMESZ)) #define compound_encode_hdr_maxsz (3 + (NFS4_MAXTAGLEN >> 2)) @@ -1340,12 +1340,13 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena */ encode_nfs4_seqid(xdr, arg->seqid); encode_share_access(xdr, arg->fmode); - p = reserve_space(xdr, 32); + p = reserve_space(xdr, 36); p = xdr_encode_hyper(p, arg->clientid); - *p++ = cpu_to_be32(20); + *p++ = cpu_to_be32(24); p = xdr_encode_opaque_fixed(p, "open id:", 8); *p++ = cpu_to_be32(arg->server->s_dev); - xdr_encode_hyper(p, arg->id); + *p++ = cpu_to_be32(arg->id.uniquifier); + xdr_encode_hyper(p, arg->id.create_time); } static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) -- cgit v1.2.3 From 936af1576e4c24b466380fc2b8d93352161d13b0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 20 Apr 2012 21:49:41 -0400 Subject: aio: don't bother with unmapping when aio_free_ring() is coming from exit_aio() ... since exit_mmap() is coming and it will munmap() everything anyway. In all other cases aio_free_ring() has ctx->mm == current->mm; moreover, all other callers of vm_munmap() have mm == current->mm, so this will allow us to get rid of mm argument of vm_munmap(). Signed-off-by: Al Viro --- fs/aio.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 99bd790e8cd2..976e33d97413 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -92,8 +92,10 @@ static void aio_free_ring(struct kioctx *ctx) for (i=0; inr_pages; i++) put_page(info->ring_pages[i]); - if (info->mmap_size) + if (info->mmap_size) { + BUG_ON(ctx->mm != current->mm); vm_munmap(ctx->mm, info->mmap_base, info->mmap_size); + } if (info->ring_pages && info->ring_pages != info->internal_pages) kfree(info->ring_pages); @@ -386,6 +388,17 @@ void exit_aio(struct mm_struct *mm) "exit_aio:ioctx still alive: %d %d %d\n", atomic_read(&ctx->users), ctx->dead, ctx->reqs_active); + /* + * We don't need to bother with munmap() here - + * exit_mmap(mm) is coming and it'll unmap everything. + * Since aio_free_ring() uses non-zero ->mmap_size + * as indicator that it needs to unmap the area, + * just set it to 0; aio_free_ring() is the only + * place that uses ->mmap_size, so it's safe. + * That way we get all munmap done to current->mm - + * all other callers have ctx->mm == current->mm. + */ + ctx->ring_info.mmap_size = 0; put_ioctx(ctx); } } -- cgit v1.2.3 From bfce281c287a427d0841fadf5d59242757b4e620 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 20 Apr 2012 21:57:04 -0400 Subject: kill mm argument of vm_munmap() it's always current->mm Signed-off-by: Al Viro --- fs/aio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 976e33d97413..67a6db3e1b6f 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -94,7 +94,7 @@ static void aio_free_ring(struct kioctx *ctx) if (info->mmap_size) { BUG_ON(ctx->mm != current->mm); - vm_munmap(ctx->mm, info->mmap_base, info->mmap_size); + vm_munmap(info->mmap_base, info->mmap_size); } if (info->ring_pages && info->ring_pages != info->internal_pages) -- cgit v1.2.3 From c77365c963cf8703ecf1004cd3b298068a3e1b76 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 21 Apr 2012 12:31:05 -0400 Subject: NFSv4: Ensure that we don't drop a state owner more than once Retest the RB_EMPTY_NODE() condition under the spin lock to ensure that we don't call rb_erase() more than once on the same state owner. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 3b07f094f3a9..b300fb840b21 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -435,13 +435,17 @@ nfs4_alloc_state_owner(struct nfs_server *server, static void nfs4_drop_state_owner(struct nfs4_state_owner *sp) { - if (!RB_EMPTY_NODE(&sp->so_server_node)) { + struct rb_node *rb_node = &sp->so_server_node; + + if (!RB_EMPTY_NODE(rb_node)) { struct nfs_server *server = sp->so_server; struct nfs_client *clp = server->nfs_client; spin_lock(&clp->cl_lock); - rb_erase(&sp->so_server_node, &server->state_owners); - RB_CLEAR_NODE(&sp->so_server_node); + if (!RB_EMPTY_NODE(rb_node)) { + rb_erase(rb_node, &server->state_owners); + RB_CLEAR_NODE(rb_node); + } spin_unlock(&clp->cl_lock); } } -- cgit v1.2.3 From 7bf97bc27308cfdc7a8dadd40ae50f7c4cb09b01 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 21 Apr 2012 12:36:19 -0400 Subject: NFSv4: Keep dropped state owners on the LRU list for a while To ensure that we don't reuse their identifiers. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index b300fb840b21..7f0fcfc1fe9d 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -521,6 +521,14 @@ out: /** * nfs4_put_state_owner - Release a nfs4_state_owner * @sp: state owner data to release + * + * Note that we keep released state owners on an LRU + * list. + * This caches valid state owners so that they can be + * reused, to avoid the OPEN_CONFIRM on minor version 0. + * It also pins the uniquifier of dropped state owners for + * a while, to ensure that those state owner names are + * never reused. */ void nfs4_put_state_owner(struct nfs4_state_owner *sp) { @@ -530,15 +538,9 @@ void nfs4_put_state_owner(struct nfs4_state_owner *sp) if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) return; - if (!RB_EMPTY_NODE(&sp->so_server_node)) { - sp->so_expires = jiffies; - list_add_tail(&sp->so_lru, &server->state_owners_lru); - spin_unlock(&clp->cl_lock); - } else { - nfs4_remove_state_owner_locked(sp); - spin_unlock(&clp->cl_lock); - nfs4_free_state_owner(sp); - } + sp->so_expires = jiffies; + list_add_tail(&sp->so_lru, &server->state_owners_lru); + spin_unlock(&clp->cl_lock); } /** -- cgit v1.2.3 From 53ad1c980d4fb450722a575ca17c188808939340 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Wed, 4 Apr 2012 09:49:15 -0500 Subject: dlm: fix QUECVT when convert queue is empty The QUECVT flag should not prevent conversions from being granted immediately when the convert queue is empty. Signed-off-by: David Teigland --- fs/dlm/lock.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'fs') diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index fa5c07d51dcc..4c58d4a3adc4 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1736,6 +1736,18 @@ static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now) if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT)) return 1; + /* + * Even if the convert is compat with all granted locks, + * QUECVT forces it behind other locks on the convert queue. + */ + + if (now && conv && (lkb->lkb_exflags & DLM_LKF_QUECVT)) { + if (list_empty(&r->res_convertqueue)) + return 1; + else + goto out; + } + /* * The NOORDER flag is set to avoid the standard vms rules on grant * order. -- cgit v1.2.3 From 99aa78466777083255b876293e9e83dec7cd809a Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 13 Apr 2012 10:27:35 +0800 Subject: jbd2: use GFP_NOFS for blkdev_issue_flush flush request is issued in transaction commit code path, so looks using GFP_KERNEL to allocate memory for flush request bio falls into the classic deadlock issue. I saw btrfs and dm get it right, but ext4, xfs and md are using GFP. Signed-off-by: Shaohua Li Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Cc: stable@vger.kernel.org --- fs/jbd2/commit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 806525a7269c..840f70f50792 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -723,7 +723,7 @@ start_journal_io: if (commit_transaction->t_need_data_flush && (journal->j_fs_dev != journal->j_dev) && (journal->j_flags & JBD2_BARRIER)) - blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL); /* Done it all: now write the commit record asynchronously. */ if (JBD2_HAS_INCOMPAT_FEATURE(journal, @@ -859,7 +859,7 @@ wait_for_iobuf: if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) && journal->j_flags & JBD2_BARRIER) { - blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL); + blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL); } if (err) -- cgit v1.2.3 From db7e5c668e1b16061fe2d94d3cba022dd360c5d4 Mon Sep 17 00:00:00 2001 From: Eldad Zack Date: Sun, 22 Apr 2012 17:50:52 +0200 Subject: super.c: unused variable warning without CONFIG_QUOTA MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sb info is only checked with quota support. fs/ext4/super.c: In function ‘parse_options’: fs/ext4/super.c:1600:23: warning: unused variable ‘sbi’ [-Wunused-variable] Signed-off-by: Eldad Zack Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 6da193564e43..e1fb1d5de58e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1597,7 +1597,9 @@ static int parse_options(char *options, struct super_block *sb, unsigned int *journal_ioprio, int is_remount) { +#ifdef CONFIG_QUOTA struct ext4_sb_info *sbi = EXT4_SB(sb); +#endif char *p; substring_t args[MAX_OPT_ARGS]; int token; -- cgit v1.2.3 From 4c569a72c30dfee9b5133284aba67e3aa0c9505d Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Tue, 10 Apr 2012 14:45:24 -0400 Subject: GFS2: Instruct DLM to avoid queue convert slowdown This patch instructs DLM to prevent an "in place" conversion, where the lock just stays on the granted queue, and instead forces the conversion to the back of the convert queue. This is done on upward conversions only. This is useful in cases where, for example, a lock is frequently needed in PR on one node, but another node needs it temporarily in EX to update it. This may happen, for example, when the rindex is being updated by gfs2_grow. The gfs2_grow needs to have the lock in EX, but the other nodes need to re-read it to retrieve the updates. The glock is already granted in PR on the non-growing nodes, so this prevents them from continually re-granting the lock in PR, and forces the EX from gfs2_grow to go through. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/lock_dlm.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index f8411bd1b805..5f5e70e047dc 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -200,10 +200,11 @@ static int make_mode(const unsigned int lmstate) return -1; } -static u32 make_flags(const u32 lkid, const unsigned int gfs_flags, +static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, const int req) { u32 lkf = DLM_LKF_VALBLK; + u32 lkid = gl->gl_lksb.sb_lkid; if (gfs_flags & LM_FLAG_TRY) lkf |= DLM_LKF_NOQUEUE; @@ -227,8 +228,11 @@ static u32 make_flags(const u32 lkid, const unsigned int gfs_flags, BUG(); } - if (lkid != 0) + if (lkid != 0) { lkf |= DLM_LKF_CONVERT; + if (test_bit(GLF_BLOCKING, &gl->gl_flags)) + lkf |= DLM_LKF_QUECVT; + } return lkf; } @@ -250,7 +254,7 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state, char strname[GDLM_STRNAME_BYTES] = ""; req = make_mode(req_state); - lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req); + lkf = make_flags(gl, flags, req); gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT); gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); if (gl->gl_lksb.sb_lkid) { -- cgit v1.2.3 From 3c7c87fd5bd71f57c68a64d11a15170d0dc4f7aa Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Tue, 24 Apr 2012 15:28:14 +0100 Subject: CIFS: Show backupuid/gid in /proc/mounts Show backupuid/backupgid in /proc/mounts for cifs shares mounted with the backupuid/backupgid feature. Also consolidate the two separate checks for pvolume_info->backupuid_specified into a single if condition in cifs_setup_cifs_sb(). Signed-off-by: Sachin Prabhu Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 4 ++++ fs/cifs/connect.c | 12 ++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index d34212822444..ea8eb92b65b4 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -434,6 +434,10 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",noperm"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) seq_printf(s, ",strictcache"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID) + seq_printf(s, ",backupuid=%u", cifs_sb->mnt_backupuid); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) + seq_printf(s, ",backupgid=%u", cifs_sb->mnt_backupgid); seq_printf(s, ",rsize=%d", cifs_sb->rsize); seq_printf(s, ",wsize=%d", cifs_sb->wsize); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index f31dc9ac37b7..f4d381e331ce 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3228,10 +3228,6 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, cifs_sb->mnt_uid = pvolume_info->linux_uid; cifs_sb->mnt_gid = pvolume_info->linux_gid; - if (pvolume_info->backupuid_specified) - cifs_sb->mnt_backupuid = pvolume_info->backupuid; - if (pvolume_info->backupgid_specified) - cifs_sb->mnt_backupgid = pvolume_info->backupgid; cifs_sb->mnt_file_mode = pvolume_info->file_mode; cifs_sb->mnt_dir_mode = pvolume_info->dir_mode; cFYI(1, "file mode: 0x%hx dir mode: 0x%hx", @@ -3262,10 +3258,14 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RWPIDFORWARD; if (pvolume_info->cifs_acl) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL; - if (pvolume_info->backupuid_specified) + if (pvolume_info->backupuid_specified) { cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPUID; - if (pvolume_info->backupgid_specified) + cifs_sb->mnt_backupuid = pvolume_info->backupuid; + } + if (pvolume_info->backupgid_specified) { cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPGID; + cifs_sb->mnt_backupgid = pvolume_info->backupgid; + } if (pvolume_info->override_uid) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID; if (pvolume_info->override_gid) -- cgit v1.2.3 From 28f8881023c9713c303c0feda270929f9384c019 Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Tue, 24 Apr 2012 15:28:30 +0100 Subject: Use correct conversion specifiers in cifs_show_options cifs_show_options uses the wrong conversion specifier for uid, gid, rsize & wsize. Correct this to %u to match it to the variable type 'unsigned integer'. Signed-off-by: Sachin Prabhu Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index ea8eb92b65b4..811245b1ff2e 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -370,13 +370,13 @@ cifs_show_options(struct seq_file *s, struct dentry *root) (int)(srcaddr->sa_family)); } - seq_printf(s, ",uid=%d", cifs_sb->mnt_uid); + seq_printf(s, ",uid=%u", cifs_sb->mnt_uid); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) seq_printf(s, ",forceuid"); else seq_printf(s, ",noforceuid"); - seq_printf(s, ",gid=%d", cifs_sb->mnt_gid); + seq_printf(s, ",gid=%u", cifs_sb->mnt_gid); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) seq_printf(s, ",forcegid"); else @@ -439,8 +439,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID) seq_printf(s, ",backupgid=%u", cifs_sb->mnt_backupgid); - seq_printf(s, ",rsize=%d", cifs_sb->rsize); - seq_printf(s, ",wsize=%d", cifs_sb->wsize); + seq_printf(s, ",rsize=%u", cifs_sb->rsize); + seq_printf(s, ",wsize=%u", cifs_sb->wsize); /* convert actimeo and display it in seconds */ seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ); -- cgit v1.2.3 From 13d518074a952d33d47c428419693f63389547e9 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Wed, 25 Apr 2012 16:01:47 -0700 Subject: epoll: clear the tfile_check_list on -ELOOP An epoll_ctl(,EPOLL_CTL_ADD,,) operation can return '-ELOOP' to prevent circular epoll dependencies from being created. However, in that case we do not properly clear the 'tfile_check_list'. Thus, add a call to clear_tfile_check_list() for the -ELOOP case. Signed-off-by: Jason Baron Reported-by: Yurij M. Plotnikov Cc: Nelson Elhage Cc: Davide Libenzi Tested-by: Alexandra N. Kossovsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 739b0985b398..c0b3c70ee87a 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1663,8 +1663,10 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, if (op == EPOLL_CTL_ADD) { if (is_file_epoll(tfile)) { error = -ELOOP; - if (ep_loop_check(ep, tfile) != 0) + if (ep_loop_check(ep, tfile) != 0) { + clear_tfile_check_list(); goto error_tgt_fput; + } } else list_add(&tfile->f_tfile_llink, &tfile_check_list); } -- cgit v1.2.3 From 61065a30af8df4b8989c2ac7a1f4b4034e4df2d5 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Wed, 25 Apr 2012 16:01:48 -0700 Subject: fs/buffer.c: remove BUG() in possible but rare condition While stressing the kernel with with failing allocations today, I hit the following chain of events: alloc_page_buffers(): bh = alloc_buffer_head(GFP_NOFS); if (!bh) goto no_grow; <= path taken grow_dev_page(): bh = alloc_page_buffers(page, size, 0); if (!bh) goto failed; <= taken, consequence of the above and then the failed path BUG()s the kernel. The failure is inserted a litte bit artificially, but even then, I see no reason why it should be deemed impossible in a real box. Even though this is not a condition that we expect to see around every time, failed allocations are expected to be handled, and BUG() sounds just too much. As a matter of fact, grow_dev_page() can return NULL just fine in other circumstances, so I propose we just remove it, then. Signed-off-by: Glauber Costa Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 36d66653b931..351e18ea2e53 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -985,7 +985,6 @@ grow_dev_page(struct block_device *bdev, sector_t block, return page; failed: - BUG(); unlock_page(page); page_cache_release(page); return NULL; -- cgit v1.2.3 From 65ed76010dfed3cb75c863c9052c367a1bacf80a Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 25 Apr 2012 16:01:50 -0700 Subject: hugetlbfs: lockdep annotate root inode properly This fixes the below reported false lockdep warning. e096d0c7e2e4 ("lockdep: Add helper function for dir vs file i_mutex annotation") added a similar annotation for every other inode in hugetlbfs but missed the root inode because it was allocated by a separate function. For HugeTLB fs we allow taking i_mutex in mmap. HugeTLB fs doesn't support file write and its file read callback is modified in a05b0855fd ("hugetlbfs: avoid taking i_mutex from hugetlbfs_read()") to not take i_mutex. Hence for HugeTLB fs with regular files we really don't take i_mutex with mmap_sem held. ====================================================== [ INFO: possible circular locking dependency detected ] 3.4.0-rc1+ #322 Not tainted ------------------------------------------------------- bash/1572 is trying to acquire lock: (&mm->mmap_sem){++++++}, at: [] might_fault+0x40/0x90 but task is already holding lock: (&sb->s_type->i_mutex_key#12){+.+.+.}, at: [] vfs_readdir+0x56/0xa8 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&sb->s_type->i_mutex_key#12){+.+.+.}: [] lock_acquire+0xd5/0xfa [] __mutex_lock_common+0x48/0x350 [] mutex_lock_nested+0x2a/0x31 [] hugetlbfs_file_mmap+0x7d/0x104 [] mmap_region+0x272/0x47d [] do_mmap_pgoff+0x294/0x2ee [] sys_mmap_pgoff+0xd2/0x10e [] sys_mmap+0x1d/0x1f [] system_call_fastpath+0x16/0x1b -> #0 (&mm->mmap_sem){++++++}: [] __lock_acquire+0xa81/0xd75 [] lock_acquire+0xd5/0xfa [] might_fault+0x6d/0x90 [] filldir+0x6a/0xc2 [] dcache_readdir+0x5c/0x222 [] vfs_readdir+0x76/0xa8 [] sys_getdents+0x79/0xc9 [] system_call_fastpath+0x16/0x1b other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&sb->s_type->i_mutex_key#12); lock(&mm->mmap_sem); lock(&sb->s_type->i_mutex_key#12); lock(&mm->mmap_sem); *** DEADLOCK *** 1 lock held by bash/1572: #0: (&sb->s_type->i_mutex_key#12){+.+.+.}, at: [] vfs_readdir+0x56/0xa8 stack backtrace: Pid: 1572, comm: bash Not tainted 3.4.0-rc1+ #322 Call Trace: [] print_circular_bug+0x1f8/0x209 [] __lock_acquire+0xa81/0xd75 [] ? handle_pte_fault+0x5ff/0x614 [] ? mark_lock+0x2d/0x258 [] ? might_fault+0x40/0x90 [] lock_acquire+0xd5/0xfa [] ? might_fault+0x40/0x90 [] ? __mutex_lock_common+0x333/0x350 [] might_fault+0x6d/0x90 [] ? might_fault+0x40/0x90 [] filldir+0x6a/0xc2 [] dcache_readdir+0x5c/0x222 [] ? sys_ioctl+0x74/0x74 [] ? sys_ioctl+0x74/0x74 [] ? sys_ioctl+0x74/0x74 [] vfs_readdir+0x76/0xa8 [] sys_getdents+0x79/0xc9 [] system_call_fastpath+0x16/0x1b Signed-off-by: Aneesh Kumar K.V Cc: Dave Jones Cc: Al Viro Cc: Josh Boyer Cc: Peter Zijlstra Cc: Mimi Zohar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 28cf06e4ec84..001ef01d2fe2 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -485,6 +485,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, inode->i_fop = &simple_dir_operations; /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); + lockdep_annotate_inode_mutex_key(inode); } return inode; } -- cgit v1.2.3 From 63f61a6f4633ff34c17bea7a0ed827eaeb0733e1 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 25 Apr 2012 16:01:52 -0700 Subject: revert "proc: clear_refs: do not clear reserved pages" Revert commit 85e72aa5384 ("proc: clear_refs: do not clear reserved pages"), which was a quick fix suitable for -stable until ARM had been moved over to the gate_vma mechanism: https://lkml.org/lkml/2012/1/14/55 With commit f9d4861f ("ARM: 7294/1: vectors: use gate_vma for vectors user mapping"), ARM does now use the gate_vma, so the PageReserved check can be removed from the proc code. Signed-off-by: Will Deacon Cc: Nicolas Pitre Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 2b9a7607cbd5..2d60492d6df8 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -597,9 +597,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, if (!page) continue; - if (PageReserved(page)) - continue; - /* Clear accessed and referenced bits. */ ptep_test_and_clear_young(vma, addr, pte); ClearPageReferenced(page); -- cgit v1.2.3 From 996d282c7ff470f150a467eb4815b90159d04c47 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 23 Apr 2012 20:35:03 -0400 Subject: Btrfs: do not start delalloc inodes during sync btrfs_start_delalloc_inodes will just walk the list of delalloc inodes and start writing them out, but it doesn't splice the list or anything so as long as somebody is doing work on the box you could end up in this section _forever_. So just remove it, it's not needed anyway since sync will start writeback on all inodes anyway, all we need to do is wait for ordered extents and then we can commit the transaction. In my horrible torture test sync goes from taking 4 minutes to about 1.5 minutes. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/super.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 43aa2dd0bc7d..f267718cbd1a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -819,7 +819,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return 0; } - btrfs_start_delalloc_inodes(root, 0); btrfs_wait_ordered_extents(root, 0, 0); trans = btrfs_start_transaction(root, 0); -- cgit v1.2.3 From 3e74317ad773ba9df36db1fa32848cba41ac4d1a Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Fri, 27 Apr 2012 12:41:45 -0400 Subject: Btrfs: fix repair code for RAID10 btrfs_map_block sets mirror_num, so that the repair code knows eventually which device gave us the read error. For RAID10, mirror_num must be 1 or 2. Before this fix mirror_num was incorrectly related to our stripe index. Signed-off-by: Jan Schmidt Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3b984173d25b..1411b99555a4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3807,10 +3807,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, else if (mirror_num) stripe_index += mirror_num - 1; else { + int old_stripe_index = stripe_index; stripe_index = find_live_mirror(map, stripe_index, map->sub_stripes, stripe_index + current->pid % map->sub_stripes); - mirror_num = stripe_index + 1; + mirror_num = stripe_index - old_stripe_index + 1; } } else { /* -- cgit v1.2.3 From 1daf3540fa77faea2f91d96bcaf07ce48ee827be Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Fri, 27 Apr 2012 12:41:46 -0400 Subject: Btrfs: Prevent root_list corruption I was seeing root_list corruption on unmount during fs resize in 3.4-rc4; add correct locking to address this. Signed-off-by: Daniel J Blueman Signed-off-by: Chris Mason --- fs/btrfs/relocation.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 017281dbb2a7..5a105a086acf 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1279,7 +1279,9 @@ static int __update_reloc_root(struct btrfs_root *root, int del) if (rb_node) backref_tree_panic(rb_node, -EEXIST, node->bytenr); } else { + spin_lock(&root->fs_info->trans_lock); list_del_init(&root->root_list); + spin_unlock(&root->fs_info->trans_lock); kfree(node); } return 0; -- cgit v1.2.3 From 1f699d38b6556c393ac80f1c23c2053502a51631 Mon Sep 17 00:00:00 2001 From: Stefan Behrens Date: Fri, 27 Apr 2012 12:41:46 -0400 Subject: Btrfs: fix block_rsv and space_info lock ordering may_commit_transaction() calls spin_lock(&space_info->lock); spin_lock(&delayed_rsv->lock); and update_global_block_rsv() calls spin_lock(&block_rsv->lock); spin_lock(&sinfo->lock); Lockdep complains about this at run time. Everywhere except in update_global_block_rsv(), the space_info lock is the outer lock, therefore the locking order in update_global_block_rsv() is changed. Signed-off-by: Stefan Behrens Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 84497f8eb043..6fc2e6f5aab8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4214,8 +4214,8 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) num_bytes = calc_global_metadata_size(fs_info); - spin_lock(&block_rsv->lock); spin_lock(&sinfo->lock); + spin_lock(&block_rsv->lock); block_rsv->size = num_bytes; @@ -4241,8 +4241,8 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) block_rsv->full = 1; } - spin_unlock(&sinfo->lock); spin_unlock(&block_rsv->lock); + spin_unlock(&sinfo->lock); } static void init_global_block_rsv(struct btrfs_fs_info *fs_info) -- cgit v1.2.3 From 7654b72417e10e294563496e25211200f9b8b6d3 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Fri, 27 Apr 2012 12:41:46 -0400 Subject: Btrfs: Fix space checking during fs resize Fix out-of-space checking, addressing a warning and potential resource leak when resizing the filesystem down while allocating blocks. Signed-off-by: Daniel J Blueman Reviewed-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/relocation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 5a105a086acf..646ee21bb035 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3813,7 +3813,7 @@ restart: ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5); if (ret < 0) { - if (ret != -EAGAIN) { + if (ret != -ENOSPC) { err = ret; WARN_ON(1); break; -- cgit v1.2.3 From fede766f28dd766d4e8feb321fdb19edb21ef6fb Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 27 Apr 2012 14:23:22 -0400 Subject: Btrfs: avoid deadlocks from GFP_KERNEL allocations during btrfs_real_readdir Btrfs has an optimization where it will preallocate dentries during readdir to fill in enough information to open the inode without an extra lookup. But, we're calling d_alloc, which is doing GFP_KERNEL allocations, and that leads to deadlocks because our readdir code has tree locks held. For now, disable this optimization. We'll fix the gfp mask in the next merge window. Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d953f8820464..3ce7805d1117 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4192,7 +4192,6 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, struct btrfs_path *path; struct list_head ins_list; struct list_head del_list; - struct qstr q; int ret; struct extent_buffer *leaf; int slot; @@ -4283,7 +4282,6 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, while (di_cur < di_total) { struct btrfs_key location; - struct dentry *tmp; if (verify_dir_item(root, leaf, di)) break; @@ -4304,33 +4302,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent, d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; btrfs_dir_item_key_to_cpu(leaf, di, &location); - q.name = name_ptr; - q.len = name_len; - q.hash = full_name_hash(q.name, q.len); - tmp = d_lookup(filp->f_dentry, &q); - if (!tmp) { - struct btrfs_key *newkey; - - newkey = kzalloc(sizeof(struct btrfs_key), - GFP_NOFS); - if (!newkey) - goto no_dentry; - tmp = d_alloc(filp->f_dentry, &q); - if (!tmp) { - kfree(newkey); - dput(tmp); - goto no_dentry; - } - memcpy(newkey, &location, - sizeof(struct btrfs_key)); - tmp->d_fsdata = newkey; - tmp->d_flags |= DCACHE_NEED_LOOKUP; - d_rehash(tmp); - dput(tmp); - } else { - dput(tmp); - } -no_dentry: + /* is this a reference to our own snapshot? If so * skip it. * -- cgit v1.2.3 From dc7fdde39e4962b1a88741f7eba2a6b3be1285d8 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 27 Apr 2012 14:31:29 -0400 Subject: Btrfs: reduce lock contention during extent insertion We're spending huge amounts of time on lock contention during end_io processing because we unconditionally assume we are overwriting an existing extent in the file for each IO. This checks to see if we are outside i_size, and if so, it uses a less expensive readonly search of the btree to look for existing extents. Signed-off-by: Chris Mason --- fs/btrfs/file.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d83260d7498f..53bf2d764bbc 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -567,6 +567,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, int extent_type; int recow; int ret; + int modify_tree = -1; if (drop_cache) btrfs_drop_extent_cache(inode, start, end - 1, 0); @@ -575,10 +576,13 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, if (!path) return -ENOMEM; + if (start >= BTRFS_I(inode)->disk_i_size) + modify_tree = 0; + while (1) { recow = 0; ret = btrfs_lookup_file_extent(trans, root, path, ino, - search_start, -1); + search_start, modify_tree); if (ret < 0) break; if (ret > 0 && path->slots[0] > 0 && search_start == start) { @@ -634,7 +638,8 @@ next_slot: } search_start = max(key.offset, start); - if (recow) { + if (recow || !modify_tree) { + modify_tree = -1; btrfs_release_path(path); continue; } -- cgit v1.2.3 From fcbf94b9dedd2ce08e798a99aafc94fec8668161 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 28 Apr 2012 08:29:56 -0700 Subject: Revert "autofs: work around unhappy compat problem on x86-64" This reverts commit a32744d4abae24572eff7269bc17895c41bd0085. While that commit was technically the right thing to do, and made the x86-64 compat mode work identically to native 32-bit mode (and thus fixing the problem with a 32-bit systemd install on a 64-bit kernel), it turns out that the automount binaries had workarounds for this compat problem. Now, the workarounds are disgusting: doing an "uname()" to find out the architecture of the kernel, and then comparing it for the 64-bit cases and fixing up the size of the read() in automount for those. And they were confused: it's not actually a generic 64-bit issue at all, it's very much tied to just x86-64, which has different alignment for an 'u64' in 64-bit mode than in 32-bit mode. But the end result is that fixing the compat layer actually breaks the case of a 32-bit automount on a x86-64 kernel. There are various approaches to fix this (including just doing a "strcmp()" on current->comm and comparing it to "automount"), but I think that I will do the one that teaches pipes about a special "packet mode", which will allow user space to not have to care too deeply about the padding at the end of the autofs packet. That change will make the compat workaround unnecessary, so let's revert it first, and get automount working again in compat mode. The packetized pipes will then fix autofs for systemd. Reported-and-requested-by: Michael Tokarev Cc: Ian Kent Cc: stable@kernel.org # for 3.3 Signed-off-by: Linus Torvalds --- fs/autofs4/autofs_i.h | 1 - fs/autofs4/dev-ioctl.c | 1 - fs/autofs4/inode.c | 2 -- fs/autofs4/waitq.c | 22 +++------------------- 4 files changed, 3 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index eb1cc92cd67d..d8d8e7ba6a1e 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -110,7 +110,6 @@ struct autofs_sb_info { int sub_version; int min_proto; int max_proto; - int compat_daemon; unsigned long exp_timeout; unsigned int type; int reghost_enabled; diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 9dacb8586701..3dfd615afb6b 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -385,7 +385,6 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, sbi->pipefd = pipefd; sbi->pipe = pipe; sbi->catatonic = 0; - sbi->compat_daemon = is_compat_task(); } out: mutex_unlock(&sbi->wq_mutex); diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index d8dc002e9cc3..14c7bc02349e 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -19,7 +19,6 @@ #include #include #include -#include #include "autofs_i.h" #include @@ -225,7 +224,6 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) set_autofs_type_indirect(&sbi->type); sbi->min_proto = 0; sbi->max_proto = 0; - sbi->compat_daemon = is_compat_task(); mutex_init(&sbi->wq_mutex); mutex_init(&sbi->pipe_mutex); spin_lock_init(&sbi->fs_lock); diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 9c098db43344..da8876d38a7b 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -91,24 +91,7 @@ static int autofs4_write(struct autofs_sb_info *sbi, return (bytes > 0); } - -/* - * The autofs_v5 packet was misdesigned. - * - * The packets are identical on x86-32 and x86-64, but have different - * alignment. Which means that 'sizeof()' will give different results. - * Fix it up for the case of running 32-bit user mode on a 64-bit kernel. - */ -static noinline size_t autofs_v5_packet_size(struct autofs_sb_info *sbi) -{ - size_t pktsz = sizeof(struct autofs_v5_packet); -#if defined(CONFIG_X86_64) && defined(CONFIG_COMPAT) - if (sbi->compat_daemon > 0) - pktsz -= 4; -#endif - return pktsz; -} - + static void autofs4_notify_daemon(struct autofs_sb_info *sbi, struct autofs_wait_queue *wq, int type) @@ -172,7 +155,8 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, { struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet; - pktsz = autofs_v5_packet_size(sbi); + pktsz = sizeof(*packet); + packet->wait_queue_token = wq->wait_queue_token; packet->len = wq->name.len; memcpy(packet->name, wq->name.name, wq->name.len); -- cgit v1.2.3 From 9883035ae7edef3ec62ad215611cb8e17d6a1a5d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 29 Apr 2012 13:12:42 -0700 Subject: pipes: add a "packetized pipe" mode for writing The actual internal pipe implementation is already really about individual packets (called "pipe buffers"), and this simply exposes that as a special packetized mode. When we are in the packetized mode (marked by O_DIRECT as suggested by Alan Cox), a write() on a pipe will not merge the new data with previous writes, so each write will get a pipe buffer of its own. The pipe buffer is then marked with the PIPE_BUF_FLAG_PACKET flag, which in turn will tell the reader side to break the read at that boundary (and throw away any partial packet contents that do not fit in the read buffer). End result: as long as you do writes less than PIPE_BUF in size (so that the pipe doesn't have to split them up), you can now treat the pipe as a packet interface, where each read() system call will read one packet at a time. You can just use a sufficiently big read buffer (PIPE_BUF is sufficient, since bigger than that doesn't guarantee atomicity anyway), and the return value of the read() will naturally give you the size of the packet. NOTE! We do not support zero-sized packets, and zero-sized reads and writes to a pipe continue to be no-ops. Also note that big packets will currently be split at write time, but that the size at which that happens is not really specified (except that it's bigger than PIPE_BUF). Currently that limit is the system page size, but we might want to explicitly support bigger packets some day. The main user for this is going to be the autofs packet interface, allowing us to stop having to care so deeply about exact packet sizes (which have had bugs with 32/64-bit compatibility modes). But user space can create packetized pipes with "pipe2(fd, O_DIRECT)", which will fail with an EINVAL on kernels that do not support this interface. Tested-by: Michael Tokarev Cc: Alan Cox Cc: David Miller Cc: Ian Kent Cc: Thomas Meyer Cc: stable@kernel.org # needed for systemd/autofs interaction fix Signed-off-by: Linus Torvalds --- fs/pipe.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 25feaa3faac0..fec5e4ad071a 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -346,6 +346,16 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = { .get = generic_pipe_buf_get, }; +static const struct pipe_buf_operations packet_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = anon_pipe_buf_release, + .steal = generic_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + static ssize_t pipe_read(struct kiocb *iocb, const struct iovec *_iov, unsigned long nr_segs, loff_t pos) @@ -407,6 +417,13 @@ redo: ret += chars; buf->offset += chars; buf->len -= chars; + + /* Was it a packet buffer? Clean up and exit */ + if (buf->flags & PIPE_BUF_FLAG_PACKET) { + total_len = chars; + buf->len = 0; + } + if (!buf->len) { buf->ops = NULL; ops->release(pipe, buf); @@ -459,6 +476,11 @@ redo: return ret; } +static inline int is_packetized(struct file *file) +{ + return (file->f_flags & O_DIRECT) != 0; +} + static ssize_t pipe_write(struct kiocb *iocb, const struct iovec *_iov, unsigned long nr_segs, loff_t ppos) @@ -593,6 +615,11 @@ redo2: buf->ops = &anon_pipe_buf_ops; buf->offset = 0; buf->len = chars; + buf->flags = 0; + if (is_packetized(filp)) { + buf->ops = &packet_pipe_buf_ops; + buf->flags = PIPE_BUF_FLAG_PACKET; + } pipe->nrbufs = ++bufs; pipe->tmp_page = NULL; @@ -1013,7 +1040,7 @@ struct file *create_write_pipe(int flags) goto err_dentry; f->f_mapping = inode->i_mapping; - f->f_flags = O_WRONLY | (flags & O_NONBLOCK); + f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)); f->f_version = 0; return f; @@ -1057,7 +1084,7 @@ int do_pipe_flags(int *fd, int flags) int error; int fdw, fdr; - if (flags & ~(O_CLOEXEC | O_NONBLOCK)) + if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT)) return -EINVAL; fw = create_write_pipe(flags); -- cgit v1.2.3 From 64f371bc3107e69efce563a3d0f0e6880de0d537 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 29 Apr 2012 13:30:08 -0700 Subject: autofs: make the autofsv5 packet file descriptor use a packetized pipe The autofs packet size has had a very unfortunate size problem on x86: because the alignment of 'u64' differs in 32-bit and 64-bit modes, and because the packet data was not 8-byte aligned, the size of the autofsv5 packet structure differed between 32-bit and 64-bit modes despite looking otherwise identical (300 vs 304 bytes respectively). We first fixed that up by making the 64-bit compat mode know about this problem in commit a32744d4abae ("autofs: work around unhappy compat problem on x86-64"), and that made a 32-bit 'systemd' work happily on a 64-bit kernel because everything then worked the same way as on a 32-bit kernel. But it turned out that 'automount' had actually known and worked around this problem in user space, so fixing the kernel to do the proper 32-bit compatibility handling actually *broke* 32-bit automount on a 64-bit kernel, because it knew that the packet sizes were wrong and expected those incorrect sizes. As a result, we ended up reverting that compatibility mode fix, and thus breaking systemd again, in commit fcbf94b9dedd. With both automount and systemd doing a single read() system call, and verifying that they get *exactly* the size they expect but using different sizes, it seemed that fixing one of them inevitably seemed to break the other. At one point, a patch I seriously considered applying from Michael Tokarev did a "strcmp()" to see if it was automount that was doing the operation. Ugly, ugly. However, a prettier solution exists now thanks to the packetized pipe mode. By marking the communication pipe as being packetized (by simply setting the O_DIRECT flag), we can always just write the bigger packet size, and if user-space does a smaller read, it will just get that partial end result and the extra alignment padding will simply be thrown away. This makes both automount and systemd happy, since they now get the size they asked for, and the kernel side of autofs simply no longer needs to care - it could pad out the packet arbitrarily. Of course, if there is some *other* user of autofs (please, please, please tell me it ain't so - and we haven't heard of any) that tries to read the packets with multiple writes, that other user will now be broken - the whole point of the packetized mode is that one system call gets exactly one packet, and you cannot read a packet in pieces. Tested-by: Michael Tokarev Cc: Alan Cox Cc: David Miller Cc: Ian Kent Cc: Thomas Meyer Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/autofs4/autofs_i.h | 11 +++++++++++ fs/autofs4/dev-ioctl.c | 2 +- fs/autofs4/inode.c | 2 +- 3 files changed, 13 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index d8d8e7ba6a1e..908e18455413 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -269,6 +269,17 @@ int autofs4_fill_super(struct super_block *, void *, int); struct autofs_info *autofs4_new_ino(struct autofs_sb_info *); void autofs4_clean_ino(struct autofs_info *); +static inline int autofs_prepare_pipe(struct file *pipe) +{ + if (!pipe->f_op || !pipe->f_op->write) + return -EINVAL; + if (!S_ISFIFO(pipe->f_dentry->d_inode->i_mode)) + return -EINVAL; + /* We want a packet pipe */ + pipe->f_flags |= O_DIRECT; + return 0; +} + /* Queue management functions */ int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 3dfd615afb6b..aa9103f8f01b 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -376,7 +376,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, err = -EBADF; goto out; } - if (!pipe->f_op || !pipe->f_op->write) { + if (autofs_prepare_pipe(pipe) < 0) { err = -EPIPE; fput(pipe); goto out; diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 14c7bc02349e..6e488ebe7784 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -290,7 +290,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) printk("autofs: could not open pipe file descriptor\n"); goto fail_dput; } - if (!pipe->f_op || !pipe->f_op->write) + if (autofs_prepare_pipe(pipe) < 0) goto fail_fput; sbi->pipe = pipe; sbi->pipefd = pipefd; -- cgit v1.2.3