From ac45d61357e86b9a0cf14e45e8e09dfb626970ef Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Mon, 5 Mar 2012 15:48:11 +0100
Subject: fuse: fix nlink after unlink

Anand Avati reports that the following sequence of system calls fail on a fuse
filesystem:


 	create("filename") => 0
 	link("filename", "linkname") => 0
 	unlink("filename") => 0
 	link("linkname", "filename") => -ENOENT ### BUG ###

vfs_link() fails with ENOENT if i_nlink is zero, this is done to prevent
resurrecting already deleted files.

Fuse clears i_nlink on unlink even if there are other links pointing to the
file.

Reported-by: Anand Avati <avati@redhat.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/dir.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 206632887bb4..da379070fab5 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -644,13 +644,12 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
 	fuse_put_request(fc, req);
 	if (!err) {
 		struct inode *inode = entry->d_inode;
+		struct fuse_inode *fi = get_fuse_inode(inode);
 
-		/*
-		 * Set nlink to zero so the inode can be cleared, if the inode
-		 * does have more links this will be discovered at the next
-		 * lookup/getattr.
-		 */
-		clear_nlink(inode);
+		spin_lock(&fc->lock);
+		fi->attr_version = ++fc->attr_version;
+		drop_nlink(inode);
+		spin_unlock(&fc->lock);
 		fuse_invalidate_attr(inode);
 		fuse_invalidate_attr(dir);
 		fuse_invalidate_entry_cache(entry);
@@ -762,8 +761,17 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
 	   will reflect changes in the backing inode (link count,
 	   etc.)
 	*/
-	if (!err || err == -EINTR)
+	if (!err) {
+		struct fuse_inode *fi = get_fuse_inode(inode);
+
+		spin_lock(&fc->lock);
+		fi->attr_version = ++fc->attr_version;
+		inc_nlink(inode);
+		spin_unlock(&fc->lock);
 		fuse_invalidate_attr(inode);
+	} else if (err == -EINTR) {
+		fuse_invalidate_attr(inode);
+	}
 	return err;
 }
 
-- 
cgit v1.2.3


From 4273b793ec68753cc3fcf5be7cbfd88c2be2058d Mon Sep 17 00:00:00 2001
From: Anand Avati <avati@redhat.com>
Date: Fri, 17 Feb 2012 12:46:25 -0500
Subject: fuse: O_DIRECT support for files

Implement ->direct_IO() method in aops. The ->direct_IO() method combines
the existing fuse_direct_read/fuse_direct_write methods to implement
O_DIRECT functionality.

Reaching ->direct_IO() in the read path via generic_file_aio_read ensures
proper synchronization with page cache with its existing framework.

Reaching ->direct_IO() in the write path via fuse_file_aio_write is made
to come via generic_file_direct_write() which makes it play nice with
the page cache w.r.t other mmap pages etc.

On files marked 'direct_io' by the filesystem server, IO always follows
the fuse_direct_read/write path. There is no effect of fcntl(O_DIRECT)
and it always succeeds.

On files not marked with 'direct_io' by the filesystem server, the IO
path depends on O_DIRECT flag by the application. This can be passed
at the time of open() as well as via fcntl().

Note that asynchronous O_DIRECT iocb jobs are completed synchronously
always (this has been the case with FUSE even before this patch)

Signed-off-by: Anand Avati <avati@redhat.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/dir.c  |   3 --
 fs/fuse/file.c | 129 +++++++++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 112 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index da379070fab5..df5ac048dc74 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -387,9 +387,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
 	if (fc->no_create)
 		return -ENOSYS;
 
-	if (flags & O_DIRECT)
-		return -EINVAL;
-
 	forget = fuse_alloc_forget();
 	if (!forget)
 		return -ENOMEM;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 4a199fd93fbd..8ca007d09c96 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -194,10 +194,6 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	int err;
 
-	/* VFS checks this, but only _after_ ->open() */
-	if (file->f_flags & O_DIRECT)
-		return -EINVAL;
-
 	err = generic_file_open(inode, file);
 	if (err)
 		return err;
@@ -932,17 +928,23 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	size_t count = 0;
+	size_t ocount = 0;
 	ssize_t written = 0;
+	ssize_t written_buffered = 0;
 	struct inode *inode = mapping->host;
 	ssize_t err;
 	struct iov_iter i;
+	loff_t endbyte = 0;
 
 	WARN_ON(iocb->ki_pos != pos);
 
-	err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
+	ocount = 0;
+	err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
 	if (err)
 		return err;
 
+	count = ocount;
+
 	mutex_lock(&inode->i_mutex);
 	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
 
@@ -962,11 +964,41 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 
 	file_update_time(file);
 
-	iov_iter_init(&i, iov, nr_segs, count, 0);
-	written = fuse_perform_write(file, mapping, &i, pos);
-	if (written >= 0)
-		iocb->ki_pos = pos + written;
+	if (file->f_flags & O_DIRECT) {
+		written = generic_file_direct_write(iocb, iov, &nr_segs,
+						    pos, &iocb->ki_pos,
+						    count, ocount);
+		if (written < 0 || written == count)
+			goto out;
+
+		pos += written;
+		count -= written;
 
+		iov_iter_init(&i, iov, nr_segs, count, written);
+		written_buffered = fuse_perform_write(file, mapping, &i, pos);
+		if (written_buffered < 0) {
+			err = written_buffered;
+			goto out;
+		}
+		endbyte = pos + written_buffered - 1;
+
+		err = filemap_write_and_wait_range(file->f_mapping, pos,
+						   endbyte);
+		if (err)
+			goto out;
+
+		invalidate_mapping_pages(file->f_mapping,
+					 pos >> PAGE_CACHE_SHIFT,
+					 endbyte >> PAGE_CACHE_SHIFT);
+
+		written += written_buffered;
+		iocb->ki_pos = pos + written_buffered;
+	} else {
+		iov_iter_init(&i, iov, nr_segs, count, 0);
+		written = fuse_perform_write(file, mapping, &i, pos);
+		if (written >= 0)
+			iocb->ki_pos = pos + written;
+	}
 out:
 	current->backing_dev_info = NULL;
 	mutex_unlock(&inode->i_mutex);
@@ -1101,30 +1133,41 @@ static ssize_t fuse_direct_read(struct file *file, char __user *buf,
 	return res;
 }
 
-static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
-				 size_t count, loff_t *ppos)
+static ssize_t __fuse_direct_write(struct file *file, const char __user *buf,
+				   size_t count, loff_t *ppos)
 {
 	struct inode *inode = file->f_path.dentry->d_inode;
 	ssize_t res;
 
-	if (is_bad_inode(inode))
-		return -EIO;
-
-	/* Don't allow parallel writes to the same file */
-	mutex_lock(&inode->i_mutex);
 	res = generic_write_checks(file, ppos, &count, 0);
 	if (!res) {
 		res = fuse_direct_io(file, buf, count, ppos, 1);
 		if (res > 0)
 			fuse_write_update_size(inode, *ppos);
 	}
-	mutex_unlock(&inode->i_mutex);
 
 	fuse_invalidate_attr(inode);
 
 	return res;
 }
 
+static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
+				 size_t count, loff_t *ppos)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	ssize_t res;
+
+	if (is_bad_inode(inode))
+		return -EIO;
+
+	/* Don't allow parallel writes to the same file */
+	mutex_lock(&inode->i_mutex);
+	res = __fuse_direct_write(file, buf, count, ppos);
+	mutex_unlock(&inode->i_mutex);
+
+	return res;
+}
+
 static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
 {
 	__free_page(req->pages[0]);
@@ -2077,6 +2120,57 @@ int fuse_notify_poll_wakeup(struct fuse_conn *fc,
 	return 0;
 }
 
+static ssize_t fuse_loop_dio(struct file *filp, const struct iovec *iov,
+			     unsigned long nr_segs, loff_t *ppos, int rw)
+{
+	const struct iovec *vector = iov;
+	ssize_t ret = 0;
+
+	while (nr_segs > 0) {
+		void __user *base;
+		size_t len;
+		ssize_t nr;
+
+		base = vector->iov_base;
+		len = vector->iov_len;
+		vector++;
+		nr_segs--;
+
+		if (rw == WRITE)
+			nr = __fuse_direct_write(filp, base, len, ppos);
+		else
+			nr = fuse_direct_read(filp, base, len, ppos);
+
+		if (nr < 0) {
+			if (!ret)
+				ret = nr;
+			break;
+		}
+		ret += nr;
+		if (nr != len)
+			break;
+	}
+
+	return ret;
+}
+
+
+static ssize_t
+fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
+			loff_t offset, unsigned long nr_segs)
+{
+	ssize_t ret = 0;
+	struct file *file = NULL;
+	loff_t pos = 0;
+
+	file = iocb->ki_filp;
+	pos = offset;
+
+	ret = fuse_loop_dio(file, iov, nr_segs, &pos, rw);
+
+	return ret;
+}
+
 static const struct file_operations fuse_file_operations = {
 	.llseek		= fuse_file_llseek,
 	.read		= do_sync_read,
@@ -2120,6 +2214,7 @@ static const struct address_space_operations fuse_file_aops  = {
 	.readpages	= fuse_readpages,
 	.set_page_dirty	= __set_page_dirty_nobuffers,
 	.bmap		= fuse_bmap,
+	.direct_IO	= fuse_direct_IO,
 };
 
 void fuse_init_file_inode(struct inode *inode)
-- 
cgit v1.2.3


From c1ac539ed43f273cd4d92bf7350ffd783b920184 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Thu, 22 Mar 2012 08:58:30 -0400
Subject: GFS2: put glock reference in error patch of read_rindex_entry

This patch fixes the error path of function read_rindex_entry
so that it correctly gives up its glock reference in cases where
there is a race to re-read the rindex after gfs2_grow.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/rgrp.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 19bde40b4864..19354a20e5b1 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -640,6 +640,7 @@ static int read_rindex_entry(struct gfs2_inode *ip,
 		return 0;
 
 	error = 0; /* someone else read in the rgrp; free it and ignore it */
+	gfs2_glock_put(rgd->rd_gl);
 
 fail:
 	kfree(rgd->rd_bits);
-- 
cgit v1.2.3


From 97cc008aaa8c1f02699b478ca890e81810244131 Mon Sep 17 00:00:00 2001
From: Benjamin Poirier <bpoirier@suse.de>
Date: Fri, 23 Mar 2012 18:06:18 -0400
Subject: GFS2: use depends instead of select in kconfig

Avoids having to duplicate the dependencies of what is 'select'ed (and on
down...)

Those dependencies are currently incomplete, leading to broken builds with
GFS2_FS_LOCKING_DLM=y and IP_SCTP=n.

Signed-off-by: Benjamin Poirier <bpoirier@suse.de>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/Kconfig | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index c465ae066c62..eb08c9e43c2a 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,10 +1,6 @@
 config GFS2_FS
 	tristate "GFS2 file system support"
 	depends on (64BIT || LBDAF)
-	select DLM if GFS2_FS_LOCKING_DLM
-	select CONFIGFS_FS if GFS2_FS_LOCKING_DLM
-	select SYSFS if GFS2_FS_LOCKING_DLM
-	select IP_SCTP if DLM_SCTP
 	select FS_POSIX_ACL
 	select CRC32
 	select QUOTACTL
@@ -29,7 +25,8 @@ config GFS2_FS
 
 config GFS2_FS_LOCKING_DLM
 	bool "GFS2 DLM locking"
-	depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && HOTPLUG
+	depends on (GFS2_FS!=n) && NET && INET && (IPV6 || IPV6=n) && \
+		HOTPLUG && DLM && CONFIGFS_FS && SYSFS
 	help
 	  Multiple node locking module for GFS2
 
-- 
cgit v1.2.3


From cb85a6ed67e979c59a29b7b4e8217e755b951cf4 Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky <schwidefsky@de.ibm.com>
Date: Fri, 30 Mar 2012 12:23:08 +0200
Subject: proc: stats: Use arch_idle_time for idle and iowait times if
 available

Git commit a25cac5198d4ff28 "proc: Consider NO_HZ when printing idle and
iowait times" changes the code for /proc/stat to use get_cpu_idle_time_us
and get_cpu_iowait_time_us if the system is running with nohz enabled.
For architectures which define arch_idle_time (currently s390 only)
this is a change for the worse. The result of arch_idle_time is supposed
to be the exact sleep time of the target cpu and should be used instead
of the value kept by the scheduler.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Reviewed-by: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Link: http://lkml.kernel.org/r/20120330122308.18720283@de.ibm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 fs/proc/stat.c | 34 ++++++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 6a0c62d6e442..64c3b3172367 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -18,19 +18,39 @@
 #ifndef arch_irq_stat
 #define arch_irq_stat() 0
 #endif
-#ifndef arch_idle_time
-#define arch_idle_time(cpu) 0
-#endif
+
+#ifdef arch_idle_time
+
+static cputime64_t get_idle_time(int cpu)
+{
+	cputime64_t idle;
+
+	idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
+	if (cpu_online(cpu) && !nr_iowait_cpu(cpu))
+		idle += arch_idle_time(cpu);
+	return idle;
+}
+
+static cputime64_t get_iowait_time(int cpu)
+{
+	cputime64_t iowait;
+
+	iowait = kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
+	if (cpu_online(cpu) && nr_iowait_cpu(cpu))
+		iowait += arch_idle_time(cpu);
+	return iowait;
+}
+
+#else
 
 static u64 get_idle_time(int cpu)
 {
 	u64 idle, idle_time = get_cpu_idle_time_us(cpu, NULL);
 
-	if (idle_time == -1ULL) {
+	if (idle_time == -1ULL)
 		/* !NO_HZ so we can rely on cpustat.idle */
 		idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
-		idle += arch_idle_time(cpu);
-	} else
+	else
 		idle = usecs_to_cputime64(idle_time);
 
 	return idle;
@@ -49,6 +69,8 @@ static u64 get_iowait_time(int cpu)
 	return iowait;
 }
 
+#endif
+
 static int show_stat(struct seq_file *p, void *v)
 {
 	int i, j;
-- 
cgit v1.2.3


From 5e2f7d617b574dadf3ad125e4821ce1b180b1626 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Wed, 4 Apr 2012 22:11:16 -0400
Subject: GFS2: Make sure rindex is uptodate before starting transactions

This patch removes the call from gfs2_blk2rgrd to function
gfs2_rindex_update and replaces it with individual calls.
The former way turned out to be too problematic.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c  |  6 +++++-
 fs/gfs2/dir.c   |  4 ++++
 fs/gfs2/inode.c | 13 +++++++++++--
 fs/gfs2/rgrp.c  |  7 ++++---
 fs/gfs2/xattr.c | 12 ++++++++++++
 5 files changed, 36 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 197c5c47e577..03c04febe26f 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -724,7 +724,11 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 	int metadata;
 	unsigned int revokes = 0;
 	int x;
-	int error = 0;
+	int error;
+
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
 
 	if (!*top)
 		sm->sm_first = 0;
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index c35573abd371..a836056343f0 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1844,6 +1844,10 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
 	unsigned int x, size = len * sizeof(u64);
 	int error;
 
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
 	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
 
 	ht = kzalloc(size, GFP_NOFS);
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index c98a60ee6dfd..a9ba2444e077 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1031,7 +1031,13 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry)
 	struct buffer_head *bh;
 	struct gfs2_holder ghs[3];
 	struct gfs2_rgrpd *rgd;
-	int error = -EROFS;
+	int error;
+
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
+	error = -EROFS;
 
 	gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs);
 	gfs2_holder_init(ip->i_gl,  LM_ST_EXCLUSIVE, 0, ghs + 1);
@@ -1224,6 +1230,10 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 			return 0;
 	}
 
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
 	if (odip != ndip) {
 		error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE,
 					   0, &r_gh);
@@ -1345,7 +1355,6 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 	error = alloc_required;
 	if (error < 0)
 		goto out_gunlock;
-	error = 0;
 
 	if (alloc_required) {
 		struct gfs2_qadata *qa = gfs2_qadata_get(ndip);
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 19354a20e5b1..3df65c9ab73b 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -332,9 +332,6 @@ struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk, bool exact)
 	struct rb_node *n, *next;
 	struct gfs2_rgrpd *cur;
 
-	if (gfs2_rindex_update(sdp))
-		return NULL;
-
 	spin_lock(&sdp->sd_rindex_spin);
 	n = sdp->sd_rindex_tree.rb_node;
 	while (n) {
@@ -928,6 +925,10 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
 	} else if (copy_from_user(&r, argp, sizeof(r)))
 		return -EFAULT;
 
+	ret = gfs2_rindex_update(sdp);
+	if (ret)
+		return ret;
+
 	rgd = gfs2_blk2rgrpd(sdp, r.start, 0);
 	rgd_end = gfs2_blk2rgrpd(sdp, r.start + r.len, 0);
 
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 2e5ba425cae7..927f4df874ae 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -238,6 +238,10 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
 	unsigned int x;
 	int error;
 
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
 	if (GFS2_EA_IS_STUFFED(ea))
 		return 0;
 
@@ -1330,6 +1334,10 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
 	unsigned int x;
 	int error;
 
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
 	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
 
 	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh);
@@ -1439,6 +1447,10 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
 	struct gfs2_holder gh;
 	int error;
 
+	error = gfs2_rindex_update(sdp);
+	if (error)
+		return error;
+
 	rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr, 1);
 	if (!rgd) {
 		gfs2_consist_inode(ip);
-- 
cgit v1.2.3


From 640946f20390e492694f9d7470656f2262385951 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 2 Apr 2012 19:22:25 -0400
Subject: dentry leak in simple_fill_super() failure exit

d_genocide() does _not_ evict dentries; it just removes extra ref
pinning each of those.  Normally it's followed by shrinking the
tree (it's done just before generic_shutdown_super() by kill_litter_super()),
but in case of simple_fill_super() nothing of that kind will follow.
Just do shrink_dcache_parent() manually.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/libfs.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/libfs.c b/fs/libfs.c
index 358094f0433d..18d08f5db53a 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -529,6 +529,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
 	return 0;
 out:
 	d_genocide(root);
+	shrink_dcache_parent(root);
 	dput(root);
 	return -ENOMEM;
 }
-- 
cgit v1.2.3


From 70fa4a62e913dde2d100e0be2711562742f58bee Mon Sep 17 00:00:00 2001
From: Tom Goff <thomas.goff@boeing.com>
Date: Wed, 4 Apr 2012 12:06:20 -0700
Subject: sysfs: Update the name hash for an entry after changing the namespace

This is needed to allow renaming network devices that have been moved
to another network namespace.

Signed-off-by: Tom Goff <thomas.goff@boeing.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 2a7a3f5d1ca6..8ddc1021c79a 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -878,7 +878,6 @@ int sysfs_rename(struct sysfs_dirent *sd,
 
 		dup_name = sd->s_name;
 		sd->s_name = new_name;
-		sd->s_hash = sysfs_name_hash(sd->s_ns, sd->s_name);
 	}
 
 	/* Move to the appropriate place in the appropriate directories rbtree. */
@@ -886,6 +885,7 @@ int sysfs_rename(struct sysfs_dirent *sd,
 	sysfs_get(new_parent_sd);
 	sysfs_put(sd->s_parent);
 	sd->s_ns = new_ns;
+	sd->s_hash = sysfs_name_hash(sd->s_ns, sd->s_name);
 	sd->s_parent = new_parent_sd;
 	sysfs_link_sibling(sd);
 
-- 
cgit v1.2.3


From ca9248d8337d525c2d2b26a1d8314478d15707fb Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Tue, 10 Apr 2012 08:56:04 -0400
Subject: GFS2: Allow caching of rindex glock

This patch allows caching of the rindex glock. We were previously
setting the GL_NOCACHE bit when the glock was released. That forced
the rindex inode to be invalidated, which caused us to re-read
rindex at the next access. However, it caused the glock to be
unnecessarily bounced around the cluster. This patch allows
the glock to remain cached, but it still causes the rindex to be
re-read once it has been written to by gfs2_grow.

Ben and I have tested single-node gfs2_grow cases and I've tested
clustered gfs2_grow cases on my four-node cluster.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/aops.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 38b7a74a0f91..9b2ff0e851b1 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -807,7 +807,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
 
 	if (inode == sdp->sd_rindex) {
 		adjust_fs_space(inode);
-		ip->i_gh.gh_flags |= GL_NOCACHE;
+		sdp->sd_rindex_uptodate = 0;
 	}
 
 	brelse(dibh);
@@ -873,7 +873,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
 
 	if (inode == sdp->sd_rindex) {
 		adjust_fs_space(inode);
-		ip->i_gh.gh_flags |= GL_NOCACHE;
+		sdp->sd_rindex_uptodate = 0;
 	}
 
 	brelse(dibh);
-- 
cgit v1.2.3


From 5631f2c18f4b2845b3e97df1c659c5094a17605f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bruno=20Pr=C3=A9mont?= <bonbons@linux-vserver.org>
Date: Tue, 3 Apr 2012 09:59:48 +0200
Subject: sysfs: Prevent crash on unset sysfs group attributes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Do not let the kernel crash when a device is registered with
sysfs while group attributes are not set (aka NULL).

Warn about the offender with some information about the offending
device.

This would warn instead of trying NULL pointer deref like:
 BUG: unable to handle kernel NULL pointer dereference at (null)
 IP: [<ffffffff81152673>] internal_create_group+0x83/0x1a0
 PGD 0
 Oops: 0000 [#1] SMP
 CPU 0
 Modules linked in:

 Pid: 1, comm: swapper/0 Not tainted 3.4.0-rc1-x86_64 #3 HP ProLiant DL360 G4
 RIP: 0010:[<ffffffff81152673>]  [<ffffffff81152673>] internal_create_group+0x83/0x1a0
 RSP: 0018:ffff88019485fd70  EFLAGS: 00010202
 RAX: 0000000000000001 RBX: 0000000000000000 RCX: 0000000000000001
 RDX: ffff880192e99908 RSI: ffff880192e99630 RDI: ffffffff81a26c60
 RBP: ffff88019485fdc0 R08: 0000000000000000 R09: 0000000000000000
 R10: ffff880192e99908 R11: 0000000000000000 R12: ffffffff81a16a00
 R13: ffff880192e99908 R14: ffffffff81a16900 R15: 0000000000000000
 FS:  0000000000000000(0000) GS:ffff88019bc00000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
 CR2: 0000000000000000 CR3: 0000000001a0c000 CR4: 00000000000007f0
 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
 Process swapper/0 (pid: 1, threadinfo ffff88019485e000, task ffff880194878000)
 Stack:
  ffff88019485fdd0 ffff880192da9d60 0000000000000000 ffff880192e99908
  ffff880192e995d8 0000000000000001 ffffffff81a16a00 ffff880192da9d60
  0000000000000000 0000000000000000 ffff88019485fdd0 ffffffff811527be
 Call Trace:
  [<ffffffff811527be>] sysfs_create_group+0xe/0x10
  [<ffffffff81376ca6>] device_add_groups+0x46/0x80
  [<ffffffff81377d3d>] device_add+0x46d/0x6a0
  ...

Signed-off-by: Bruno Prémont <bonbons@linux-vserver.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/group.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index dd1701caecc9..2df555c66d57 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -67,7 +67,11 @@ static int internal_create_group(struct kobject *kobj, int update,
 	/* Updates may happen before the object has been instantiated */
 	if (unlikely(update && !kobj->sd))
 		return -EINVAL;
-
+	if (!grp->attrs) {
+		WARN(1, "sysfs: attrs not set by subsystem for group: %s/%s\n",
+			kobj->name, grp->name ? "" : grp->name);
+		return -EINVAL;
+	}
 	if (grp->name) {
 		error = sysfs_create_subdir(kobj, grp->name, &sd);
 		if (error)
-- 
cgit v1.2.3


From 3a198886ab5f228fcbebb9ace803d8b99721d49a Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Fri, 6 Apr 2012 13:41:06 -0700
Subject: sysfs: handle 'parent deleted before child added'

In scsi at least two cases of the parent device being deleted before the
child is added have been observed.

1/ scsi is performing async scans and the device is removed prior to the
   async can thread running (can happen with an in-opportune / unlikely
   unplug during initial scan).

2/ libsas discovery event running after the parent port has been torn
   down (this is a bug in libsas).

Result in crash signatures like:
 BUG: unable to handle kernel NULL pointer dereference at 0000000000000098
 IP: [<ffffffff8115e100>] sysfs_create_dir+0x32/0xb6
 ...
 Process scsi_scan_8 (pid: 5417, threadinfo ffff88080bd16000, task ffff880801b8a0b0)
 Stack:
  00000000fffffffe ffff880813470628 ffff88080bd17cd0 ffff88080614b7e8
  ffff88080b45c108 00000000fffffffe ffff88080bd17d20 ffffffff8125e4a8
  ffff88080bd17cf0 ffffffff81075149 ffff88080bd17d30 ffff88080614b7e8
 Call Trace:
  [<ffffffff8125e4a8>] kobject_add_internal+0x120/0x1e3
  [<ffffffff81075149>] ? trace_hardirqs_on+0xd/0xf
  [<ffffffff8125e641>] kobject_add_varg+0x41/0x50
  [<ffffffff8125e70b>] kobject_add+0x64/0x66
  [<ffffffff8131122b>] device_add+0x12d/0x63a

In this scenario the parent is still valid (because we have a
reference), but it has been device_del()'d which means its kobj->sd
pointer is NULL'd via:

 device_del()->kobject_del()->sysfs_remove_dir()

...and then sysfs_create_dir() (without this fix) goes ahead and
de-references parent_sd via sysfs_ns_type():

 return (sd->s_flags & SYSFS_NS_TYPE_MASK) >> SYSFS_NS_TYPE_SHIFT;

This scenario is being fixed in scsi/libsas, but if other subsystems
present the same ordering the system need not immediately crash.

Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: James Bottomley <JBottomley@parallels.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/sysfs/dir.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 8ddc1021c79a..35a36d39fa2c 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -729,6 +729,9 @@ int sysfs_create_dir(struct kobject * kobj)
 	else
 		parent_sd = &sysfs_root;
 
+	if (!parent_sd)
+		return -ENOENT;
+
 	if (sysfs_ns_type(parent_sd))
 		ns = kobj->ktype->namespace(kobj);
 	type = sysfs_read_ns_type(kobj);
-- 
cgit v1.2.3


From 0a2da9b2ef2ef76c09397597f260245b020e6522 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Wed, 11 Apr 2012 11:45:06 +0200
Subject: fuse: allow nanosecond granularity

Derrik Pates reports that an utimensat with a NULL argument results in the
current time being sent from the kernel with 1 second granularity.

Reported-by: Derrik Pates <demon@now.ai>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/inode.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 64cf8d07393e..c9a0c972a4b2 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -947,6 +947,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_magic = FUSE_SUPER_MAGIC;
 	sb->s_op = &fuse_super_operations;
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_time_gran = 1;
 	sb->s_export_op = &fuse_export_operations;
 
 	file = fget(d.fd);
-- 
cgit v1.2.3


From 9dc4e6c4d1182d34604ea40fef641775f5b15456 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Mon, 9 Apr 2012 18:06:49 -0400
Subject: nfsd: don't fail unchecked creates of non-special files

Allow a v3 unchecked open of a non-regular file succeed as if it were a
lookup; typically a client in such a case will want to fall back on a
local open, so succeeding and giving it the filehandle is more useful
than failing with nfserr_exist, which makes it appear that nothing at
all exists by that name.

Similarly for v4, on an open-create, return the same errors we would on
an attempt to open a non-regular file, instead of returning
nfserr_exist.

This fixes a problem found doing a v4 open of a symlink with
O_RDONLY|O_CREAT, which resulted in the current client returning EEXIST.

Thanks also to Trond for analysis.

Cc: stable@kernel.org
Reported-by: Orion Poplawski <orion@cora.nwra.com>
Tested-by: Orion Poplawski <orion@cora.nwra.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4proc.c | 8 ++++----
 fs/nfsd/vfs.c      | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 2ed14dfd00a2..8256efda3638 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -235,15 +235,15 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
 		 */
 		if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0)
 			open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS |
-						FATTR4_WORD1_TIME_MODIFY);
+							FATTR4_WORD1_TIME_MODIFY);
 	} else {
 		status = nfsd_lookup(rqstp, current_fh,
 				     open->op_fname.data, open->op_fname.len, resfh);
 		fh_unlock(current_fh);
-		if (status)
-			goto out;
-		status = nfsd_check_obj_isreg(resfh);
 	}
+	if (status)
+		goto out;
+	status = nfsd_check_obj_isreg(resfh);
 	if (status)
 		goto out;
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 296d671654d6..568666156ea4 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1458,7 +1458,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		switch (createmode) {
 		case NFS3_CREATE_UNCHECKED:
 			if (! S_ISREG(dchild->d_inode->i_mode))
-				err = nfserr_exist;
+				goto out;
 			else if (truncp) {
 				/* in nfsv4, we need to treat this case a little
 				 * differently.  we don't want to truncate the
-- 
cgit v1.2.3


From 4fe9e9639d95cd11de63afa353f2de320f26033a Mon Sep 17 00:00:00 2001
From: Sachin Prabhu <sprabhu@redhat.com>
Date: Tue, 10 Apr 2012 18:12:27 +0100
Subject: Cleanup handling of NULL value passed for a mount option

Allow blank user= and ip= mount option. Also clean up redundant
checks for NULL values since the token parser will not actually
match mount options with NULL values unless explicitly specified.

Signed-off-by: Sachin Prabhu <sprabhu@redhat.com>
Reported-by: Chris Clayton <chris2553@googlemail.com>
Acked-by: Jeff Layton <jlayton@samba.org>
Tested-by: Chris Clayton <chris2553@googlemail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 80 +++++++++++++------------------------------------------
 1 file changed, 19 insertions(+), 61 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index d81e933a796b..6a86f3d68182 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -109,6 +109,8 @@ enum {
 
 	/* Options which could be blank */
 	Opt_blank_pass,
+	Opt_blank_user,
+	Opt_blank_ip,
 
 	Opt_err
 };
@@ -183,11 +185,15 @@ static const match_table_t cifs_mount_option_tokens = {
 	{ Opt_wsize, "wsize=%s" },
 	{ Opt_actimeo, "actimeo=%s" },
 
+	{ Opt_blank_user, "user=" },
+	{ Opt_blank_user, "username=" },
 	{ Opt_user, "user=%s" },
 	{ Opt_user, "username=%s" },
 	{ Opt_blank_pass, "pass=" },
 	{ Opt_pass, "pass=%s" },
 	{ Opt_pass, "password=%s" },
+	{ Opt_blank_ip, "ip=" },
+	{ Opt_blank_ip, "addr=" },
 	{ Opt_ip, "ip=%s" },
 	{ Opt_ip, "addr=%s" },
 	{ Opt_unc, "unc=%s" },
@@ -1534,15 +1540,17 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 
 		/* String Arguments */
 
+		case Opt_blank_user:
+			/* null user, ie. anonymous authentication */
+			vol->nullauth = 1;
+			vol->username = NULL;
+			break;
 		case Opt_user:
 			string = match_strdup(args);
 			if (string == NULL)
 				goto out_nomem;
 
-			if (!*string) {
-				/* null user, ie. anonymous authentication */
-				vol->nullauth = 1;
-			} else if (strnlen(string, MAX_USERNAME_SIZE) >
+			if (strnlen(string, MAX_USERNAME_SIZE) >
 							MAX_USERNAME_SIZE) {
 				printk(KERN_WARNING "CIFS: username too long\n");
 				goto cifs_parse_mount_err;
@@ -1611,14 +1619,15 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 			}
 			vol->password[j] = '\0';
 			break;
+		case Opt_blank_ip:
+			vol->UNCip = NULL;
+			break;
 		case Opt_ip:
 			string = match_strdup(args);
 			if (string == NULL)
 				goto out_nomem;
 
-			if (!*string) {
-				vol->UNCip = NULL;
-			} else if (strnlen(string, INET6_ADDRSTRLEN) >
+			if (strnlen(string, INET6_ADDRSTRLEN) >
 						INET6_ADDRSTRLEN) {
 				printk(KERN_WARNING "CIFS: ip address "
 						    "too long\n");
@@ -1636,12 +1645,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 			if (string == NULL)
 				goto out_nomem;
 
-			if (!*string) {
-				printk(KERN_WARNING "CIFS: invalid path to "
-						    "network resource\n");
-				goto cifs_parse_mount_err;
-			}
-
 			temp_len = strnlen(string, 300);
 			if (temp_len  == 300) {
 				printk(KERN_WARNING "CIFS: UNC name too long\n");
@@ -1670,11 +1673,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 			if (string == NULL)
 				goto out_nomem;
 
-			if (!*string) {
-				printk(KERN_WARNING "CIFS: invalid domain"
-						    " name\n");
-				goto cifs_parse_mount_err;
-			} else if (strnlen(string, 256) == 256) {
+			if (strnlen(string, 256) == 256) {
 				printk(KERN_WARNING "CIFS: domain name too"
 						    " long\n");
 				goto cifs_parse_mount_err;
@@ -1693,11 +1692,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 			if (string == NULL)
 				goto out_nomem;
 
-			if (!*string) {
-				printk(KERN_WARNING "CIFS: srcaddr value not"
-						    " specified\n");
-				goto cifs_parse_mount_err;
-			} else if (!cifs_convert_address(
+			if (!cifs_convert_address(
 					(struct sockaddr *)&vol->srcaddr,
 					string, strlen(string))) {
 				printk(KERN_WARNING "CIFS:  Could not parse"
@@ -1710,11 +1705,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 			if (string == NULL)
 				goto out_nomem;
 
-			if (!*string) {
-				printk(KERN_WARNING "CIFS: Invalid path"
-						    " prefix\n");
-				goto cifs_parse_mount_err;
-			}
 			temp_len = strnlen(string, 1024);
 			if (string[0] != '/')
 				temp_len++; /* missing leading slash */
@@ -1742,11 +1732,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 			if (string == NULL)
 				goto out_nomem;
 
-			if (!*string) {
-				printk(KERN_WARNING "CIFS: Invalid iocharset"
-						    " specified\n");
-				goto cifs_parse_mount_err;
-			} else if (strnlen(string, 1024) >= 65) {
+			if (strnlen(string, 1024) >= 65) {
 				printk(KERN_WARNING "CIFS: iocharset name "
 						    "too long.\n");
 				goto cifs_parse_mount_err;
@@ -1771,11 +1757,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 			if (string == NULL)
 				goto out_nomem;
 
-			if (!*string) {
-				printk(KERN_WARNING "CIFS: No socket option"
-						    " specified\n");
-				goto cifs_parse_mount_err;
-			}
 			if (strnicmp(string, "TCP_NODELAY", 11) == 0)
 				vol->sockopt_tcp_nodelay = 1;
 			break;
@@ -1784,12 +1765,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 			if (string == NULL)
 				goto out_nomem;
 
-			if (!*string) {
-				printk(KERN_WARNING "CIFS: Invalid (empty)"
-						    " netbiosname\n");
-				break;
-			}
-
 			memset(vol->source_rfc1001_name, 0x20,
 				RFC1001_NAME_LEN);
 			/*
@@ -1817,11 +1792,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 			if (string == NULL)
 				goto out_nomem;
 
-			if (!*string) {
-				printk(KERN_WARNING "CIFS: Empty server"
-					" netbiosname specified\n");
-				break;
-			}
 			/* last byte, type, is 0x20 for servr type */
 			memset(vol->target_rfc1001_name, 0x20,
 				RFC1001_NAME_LEN_WITH_NULL);
@@ -1848,12 +1818,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 			if (string == NULL)
 				goto out_nomem;
 
-			if (!*string) {
-				cERROR(1, "no protocol version specified"
-					  " after vers= mount option");
-				goto cifs_parse_mount_err;
-			}
-
 			if (strnicmp(string, "cifs", 4) == 0 ||
 			    strnicmp(string, "1", 1) == 0) {
 				/* This is the default */
@@ -1868,12 +1832,6 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 			if (string == NULL)
 				goto out_nomem;
 
-			if (!*string) {
-				printk(KERN_WARNING "CIFS: no security flavor"
-						    " specified\n");
-				break;
-			}
-
 			if (cifs_parse_security_flavors(string, vol) != 0)
 				goto cifs_parse_mount_err;
 			break;
-- 
cgit v1.2.3


From 8e62c2de6e23e5c1fee04f59de51b54cc2868ca5 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 12 Apr 2012 13:46:48 -0400
Subject: Revert "Btrfs: increase the global block reserve estimates"

This reverts commit 5500cdbe14d7435e04f66ff3cfb8ecd8b8e44ebf.

We've had a number of complaints of early enospc that bisect down
to this patch.  We'll hae to fix the reservations differently.

CC: stable@kernel.org
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a84420491c11..ace5e8cef03e 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4205,7 +4205,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
 	num_bytes += div64_u64(data_used + meta_used, 50);
 
 	if (num_bytes * 3 > meta_used)
-		num_bytes = div64_u64(meta_used, 3) * 2;
+		num_bytes = div64_u64(meta_used, 3);
 
 	return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
 }
-- 
cgit v1.2.3


From d95603b262edb53d6016a8df0c150371d4d61e67 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 12 Apr 2012 15:55:15 -0400
Subject: Btrfs: fix uninit variable in repair_eb_io_failure

We'd have to be passing bogus extent buffers for this uninit variable to
actually be used, but set it to zero just in case.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0c3ec003f273..59ec105444fe 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1937,7 +1937,7 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
 	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
 	u64 start = eb->start;
 	unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
-	int ret;
+	int ret = 0;
 
 	for (i = 0; i < num_pages; i++) {
 		struct page *p = extent_buffer_page(eb, i);
-- 
cgit v1.2.3


From b89203f74bdfcb15407d54d3f257b16a2ea19e62 Mon Sep 17 00:00:00 2001
From: Liu Bo <liubo2009@cn.fujitsu.com>
Date: Thu, 12 Apr 2012 16:03:56 -0400
Subject: Btrfs: fix eof while discarding extents

We miscalculate the length of extents we're discarding, and it leads to
an eof of device.

Reported-by: Daniel Blueman <daniel@quora.org>
Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a872b48be0ae..759d02486d7c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3833,6 +3833,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 		int sub_stripes = 0;
 		u64 stripes_per_dev = 0;
 		u32 remaining_stripes = 0;
+		u32 last_stripe = 0;
 
 		if (map->type &
 		    (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
@@ -3846,6 +3847,8 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 						      stripe_nr_orig,
 						      factor,
 						      &remaining_stripes);
+			div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
+			last_stripe *= sub_stripes;
 		}
 
 		for (i = 0; i < num_stripes; i++) {
@@ -3858,16 +3861,29 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 					 BTRFS_BLOCK_GROUP_RAID10)) {
 				bbio->stripes[i].length = stripes_per_dev *
 							  map->stripe_len;
+
 				if (i / sub_stripes < remaining_stripes)
 					bbio->stripes[i].length +=
 						map->stripe_len;
+
+				/*
+				 * Special for the first stripe and
+				 * the last stripe:
+				 *
+				 * |-------|...|-------|
+				 *     |----------|
+				 *    off     end_off
+				 */
 				if (i < sub_stripes)
 					bbio->stripes[i].length -=
 						stripe_offset;
-				if ((i / sub_stripes + 1) %
-				    sub_stripes == remaining_stripes)
+
+				if (stripe_index >= last_stripe &&
+				    stripe_index <= (last_stripe +
+						     sub_stripes - 1))
 					bbio->stripes[i].length -=
 						stripe_end_offset;
+
 				if (i == sub_stripes - 1)
 					stripe_offset = 0;
 			} else
-- 
cgit v1.2.3


From c6664b42c4e567792abdb17c958fb01c5bcfcb3a Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 12 Apr 2012 16:03:56 -0400
Subject: Btrfs: remove lock assert from get_restripe_target()

This fixes a regression introduced by fc67c450.  spin_is_locked() always
returns 0 on UP kernels, which caused assert in get_restripe_target() to
be fired on every call from btrfs_reduce_alloc_profile() on UP systems.
Remove it completely for now, it's not clear if it's going to be needed
in future.

Reported-by: Bobby Powers <bobbypowers@gmail.com>
Reported-by: Mitch Harder <mitch.harder@sabayonlinux.org>
Tested-by: Mitch Harder <mitch.harder@sabayonlinux.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ace5e8cef03e..a2134d8141c1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3152,15 +3152,14 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 /*
  * returns target flags in extended format or 0 if restripe for this
  * chunk_type is not in progress
+ *
+ * should be called with either volume_mutex or balance_lock held
  */
 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
 {
 	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
 	u64 target = 0;
 
-	BUG_ON(!mutex_is_locked(&fs_info->volume_mutex) &&
-	       !spin_is_locked(&fs_info->balance_lock));
-
 	if (!bctl)
 		return 0;
 
-- 
cgit v1.2.3


From e627ee7bcd42b4e3a03ca01a8e46dcb4033c5ae0 Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Thu, 12 Apr 2012 16:03:56 -0400
Subject: Btrfs: check return value of bio_alloc() properly

bio_alloc() has the possibility of returning NULL.
So, it is necessary to check the return value.

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/compression.c | 2 ++
 fs/btrfs/extent_io.c   | 4 ++++
 fs/btrfs/scrub.c       | 4 ++++
 3 files changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index d11afa67c7d8..646f5e6f2566 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -405,6 +405,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 			bio_put(bio);
 
 			bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+			BUG_ON(!bio);
 			bio->bi_private = cb;
 			bio->bi_end_io = end_compressed_bio_write;
 			bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
@@ -687,6 +688,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 
 			comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
 							GFP_NOFS);
+			BUG_ON(!comp_bio);
 			comp_bio->bi_private = cb;
 			comp_bio->bi_end_io = end_compressed_bio_read;
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 59ec105444fe..4789770f8eaf 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2180,6 +2180,10 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
 	}
 
 	bio = bio_alloc(GFP_NOFS, 1);
+	if (!bio) {
+		free_io_failure(inode, failrec, 0);
+		return -EIO;
+	}
 	bio->bi_private = state;
 	bio->bi_end_io = failed_bio->bi_end_io;
 	bio->bi_sector = failrec->logical >> 9;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index c9a2c1aef4bd..60f0e28db31e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1044,6 +1044,8 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
 
 		BUG_ON(!page->page);
 		bio = bio_alloc(GFP_NOFS, 1);
+		if (!bio)
+			return -EIO;
 		bio->bi_bdev = page->bdev;
 		bio->bi_sector = page->physical >> 9;
 		bio->bi_end_io = scrub_complete_bio_end_io;
@@ -1172,6 +1174,8 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 		DECLARE_COMPLETION_ONSTACK(complete);
 
 		bio = bio_alloc(GFP_NOFS, 1);
+		if (!bio)
+			return -EIO;
 		bio->bi_bdev = page_bad->bdev;
 		bio->bi_sector = page_bad->physical >> 9;
 		bio->bi_end_io = scrub_complete_bio_end_io;
-- 
cgit v1.2.3


From 4edc2ca388d62abffe38149f6ac00e749ea721c5 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@redhat.com>
Date: Thu, 12 Apr 2012 16:03:56 -0400
Subject: Btrfs: fix use-after-free in __btrfs_end_transaction

49b25e0540904be0bf558b84475c69d72e4de66e introduced a use-after-free bug
that caused spurious -EIO's to be returned.

Do the check before we free the transaction.

Cc: David Sterba <dsterba@suse.cz>
Cc: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Dave Jones <davej@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/transaction.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8da29e8e4de1..11b77a59db62 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -480,6 +480,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	struct btrfs_transaction *cur_trans = trans->transaction;
 	struct btrfs_fs_info *info = root->fs_info;
 	int count = 0;
+	int err = 0;
 
 	if (--trans->use_count) {
 		trans->block_rsv = trans->orig_rsv;
@@ -532,18 +533,18 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 
 	if (current->journal_info == trans)
 		current->journal_info = NULL;
-	memset(trans, 0, sizeof(*trans));
-	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 
 	if (throttle)
 		btrfs_run_delayed_iputs(root);
 
 	if (trans->aborted ||
 	    root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
-		return -EIO;
+		err = -EIO;
 	}
 
-	return 0;
+	memset(trans, 0, sizeof(*trans));
+	kmem_cache_free(btrfs_trans_handle_cachep, trans);
+	return err;
 }
 
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
-- 
cgit v1.2.3


From d53ba47484ed6245e640ee4bfe9d21e9bfc15765 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 12 Apr 2012 16:03:57 -0400
Subject: Btrfs: use commit root when loading free space cache

A user reported that booting his box up with btrfs root on 3.4 was way
slower than on 3.3 because I removed the ideal caching code.  It turns out
that we don't load the free space cache if we're in a commit for deadlock
reasons, but since we're reading the cache and it hasn't changed yet we are
safe reading the inode and free space item from the commit root, so do that
and remove all of the deadlock checks so we don't unnecessarily skip loading
the free space cache.  The user reported this fixed the slowness.  Thanks,

Tested-by: Calvin Walton <calvin.walton@kepstin.ca>
Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c      | 4 +---
 fs/btrfs/free-space-cache.c | 9 ++-------
 2 files changed, 3 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a2134d8141c1..2b35f8d14bb9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -529,9 +529,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 	 * allocate blocks for the tree root we can't do the fast caching since
 	 * we likely hold important locks.
 	 */
-	if (trans && (!trans->transaction->in_commit) &&
-	    (root && root != root->fs_info->tree_root) &&
-	    btrfs_test_opt(root, SPACE_CACHE)) {
+	if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
 		ret = load_free_space_cache(fs_info, cache);
 
 		spin_lock(&cache->lock);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 054707ed5791..baaa518baaf8 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -747,13 +747,6 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
 	bool matched;
 	u64 used = btrfs_block_group_used(&block_group->item);
 
-	/*
-	 * If we're unmounting then just return, since this does a search on the
-	 * normal root and not the commit root and we could deadlock.
-	 */
-	if (btrfs_fs_closing(fs_info))
-		return 0;
-
 	/*
 	 * If this block group has been marked to be cleared for one reason or
 	 * another then we can't trust the on disk cache, so just return.
@@ -768,6 +761,8 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
 	path = btrfs_alloc_path();
 	if (!path)
 		return 0;
+	path->search_commit_root = 1;
+	path->skip_locking = 1;
 
 	inode = lookup_free_space_inode(root, block_group, path);
 	if (IS_ERR(inode)) {
-- 
cgit v1.2.3


From 96f6f98501196d46ce52c2697dd758d9300c63f5 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 12 Apr 2012 23:47:00 -0400
Subject: nfsd: fix b0rken error value for setattr on read-only mount

..._want_write() returns -EROFS on failure, _not_ an NFS error value.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfsd/nfs4proc.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 2ed14dfd00a2..694d526c8f19 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -841,6 +841,7 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	      struct nfsd4_setattr *setattr)
 {
 	__be32 status = nfs_ok;
+	int err;
 
 	if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
 		nfs4_lock_state();
@@ -852,9 +853,9 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 			return status;
 		}
 	}
-	status = fh_want_write(&cstate->current_fh);
-	if (status)
-		return status;
+	err = fh_want_write(&cstate->current_fh);
+	if (err)
+		return nfserrno(err);
 	status = nfs_ok;
 
 	status = check_attr_support(rqstp, cstate, setattr->sa_bmval,
-- 
cgit v1.2.3


From 04da6e9d63427b2d0fd04766712200c250b3278f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 13 Apr 2012 00:00:04 -0400
Subject: nfsd: fix error values returned by nfsd4_lockt() when nfsd_open()
 fails

nfsd_open() already returns an NFS error value; only vfs_test_lock()
result needs to be fed through nfserrno().  Broken by commit 55ef12
(nfsd: Ensure nfsv4 calls the underlying filesystem on LOCKT)
three years ago...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfsd/nfs4state.c | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 1841f8bf845e..7f71c69cdcdf 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4211,16 +4211,14 @@ out:
  * vfs_test_lock.  (Arguably perhaps test_lock should be done with an
  * inode operation.)
  */
-static int nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock)
+static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock)
 {
 	struct file *file;
-	int err;
-
-	err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
-	if (err)
-		return err;
-	err = vfs_test_lock(file, lock);
-	nfsd_close(file);
+	__be32 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
+	if (!err) {
+		err = nfserrno(vfs_test_lock(file, lock));
+		nfsd_close(file);
+	}
 	return err;
 }
 
@@ -4234,7 +4232,6 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct inode *inode;
 	struct file_lock file_lock;
 	struct nfs4_lockowner *lo;
-	int error;
 	__be32 status;
 
 	if (locks_in_grace())
@@ -4280,12 +4277,10 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfs4_transform_lock_offset(&file_lock);
 
-	status = nfs_ok;
-	error = nfsd_test_lock(rqstp, &cstate->current_fh, &file_lock);
-	if (error) {
-		status = nfserrno(error);
+	status = nfsd_test_lock(rqstp, &cstate->current_fh, &file_lock);
+	if (status)
 		goto out;
-	}
+
 	if (file_lock.fl_type != F_UNLCK) {
 		status = nfserr_denied;
 		nfs4_set_lock_denied(&file_lock, &lockt->lt_denied);
-- 
cgit v1.2.3


From 02f5fde5df0ea930e70f93763dd48beff182b208 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 13 Apr 2012 00:10:34 -0400
Subject: nfsd: fix endianness breakage in TEST_STATEID handling

->ts_id_status gets nfs errno, i.e. it's already big-endian; no need
to apply htonl() to it.  Broken by commit 174568 (NFSD: Added TEST_STATEID
operation) last year...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfsd/nfs4xdr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index bcd8904ab1e3..07a99b6d4bbe 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3410,7 +3410,7 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr,
 	*p++ = htonl(test_stateid->ts_num_ids);
 
 	list_for_each_entry_safe(stateid, next, &test_stateid->ts_stateid_list, ts_id_list) {
-		*p++ = htonl(stateid->ts_id_status);
+		*p++ = stateid->ts_id_status;
 	}
 
 	ADJUST_ARGS();
-- 
cgit v1.2.3


From afcf6792afd66209161495f691e19d4fc5460a93 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 13 Apr 2012 00:15:37 -0400
Subject: nfsd: fix error value on allocation failure in
 nfsd4_decode_test_stateid()

PTR_ERR(NULL) is going to be 0...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfsd/nfs4xdr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 07a99b6d4bbe..74c00bc92b9a 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1392,7 +1392,7 @@ nfsd4_decode_test_stateid(struct nfsd4_compoundargs *argp, struct nfsd4_test_sta
 	for (i = 0; i < test_stateid->ts_num_ids; i++) {
 		stateid = kmalloc(sizeof(struct nfsd4_test_stateid_id), GFP_KERNEL);
 		if (!stateid) {
-			status = PTR_ERR(stateid);
+			status = nfserrno(-ENOMEM);
 			goto out;
 		}
 
-- 
cgit v1.2.3


From efe39651f08813180f37dc508d950fc7d92b29a8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 13 Apr 2012 00:32:14 -0400
Subject: nfsd: fix compose_entry_fh() failure exits

Restore the original logics ("fail on mountpoints, negatives and in
case of fh_compose() failures").  Since commit 8177e (nfsd: clean up
readdirplus encoding) that got broken -
	rv = fh_compose(fhp, exp, dchild, &cd->fh);
	if (rv)
	       goto out;
	if (!dchild->d_inode)
		goto out;
	rv = 0;
out:
is equivalent to
	rv = fh_compose(fhp, exp, dchild, &cd->fh);
out:
and the second check has no effect whatsoever...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfsd/nfs3xdr.c | 22 ++++++++--------------
 1 file changed, 8 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 08c6e36ab2eb..43f46cd9edea 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -803,13 +803,13 @@ encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name,
 	return p;
 }
 
-static int
+static __be32
 compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
 		const char *name, int namlen)
 {
 	struct svc_export	*exp;
 	struct dentry		*dparent, *dchild;
-	int rv = 0;
+	__be32 rv = nfserr_noent;
 
 	dparent = cd->fh.fh_dentry;
 	exp  = cd->fh.fh_export;
@@ -817,26 +817,20 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
 	if (isdotent(name, namlen)) {
 		if (namlen == 2) {
 			dchild = dget_parent(dparent);
-			if (dchild == dparent) {
-				/* filesystem root - cannot return filehandle for ".." */
-				dput(dchild);
-				return -ENOENT;
-			}
+			/* filesystem root - cannot return filehandle for ".." */
+			if (dchild == dparent)
+				goto out;
 		} else
 			dchild = dget(dparent);
 	} else
 		dchild = lookup_one_len(name, dparent, namlen);
 	if (IS_ERR(dchild))
-		return -ENOENT;
-	rv = -ENOENT;
+		return rv;
 	if (d_mountpoint(dchild))
 		goto out;
-	rv = fh_compose(fhp, exp, dchild, &cd->fh);
-	if (rv)
-		goto out;
 	if (!dchild->d_inode)
 		goto out;
-	rv = 0;
+	rv = fh_compose(fhp, exp, dchild, &cd->fh);
 out:
 	dput(dchild);
 	return rv;
@@ -845,7 +839,7 @@ out:
 static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen)
 {
 	struct svc_fh	fh;
-	int err;
+	__be32 err;
 
 	fh_init(&fh, NFS3_FHSIZE);
 	err = compose_entry_fh(cd, &fh, name, namlen);
-- 
cgit v1.2.3


From af1584f570b19b0285e4402a0b54731495d31784 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 12 Apr 2012 20:32:25 -0400
Subject: ext4: fix endianness breakage in ext4_split_extent_at()

->ee_len is __le16, so assigning cpu_to_le32() to it is going to do
Bad Things(tm) on big-endian hosts...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext4/extents.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 1421938e6792..1d418387ffb0 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2882,7 +2882,7 @@ static int ext4_split_extent_at(handle_t *handle,
 		if (err)
 			goto fix_extent_len;
 		/* update the extent length and mark as initialized */
-		ex->ee_len = cpu_to_le32(ee_len);
+		ex->ee_len = cpu_to_le16(ee_len);
 		ext4_ext_try_to_merge(inode, path, ex);
 		err = ext4_ext_dirty(handle, inode, path + depth);
 		goto out;
-- 
cgit v1.2.3


From bfa890a3cdeed29eef53d54cd7f80cec0fd46b11 Mon Sep 17 00:00:00 2001
From: Sachin Prabhu <sprabhu@redhat.com>
Date: Fri, 13 Apr 2012 14:04:32 +0100
Subject: Fix number parsing in cifs_parse_mount_options

The function kstrtoul() used to parse number strings in the mount
option parser is set to expect a base 10 number . This treats the octal
numbers passed for mount options such as file_mode as base10 numbers
leading to incorrect behavior.

Change the 'base' argument passed to kstrtoul from 10 to 0 to
allow it to auto-detect the base of the number passed.

Signed-off-by: Sachin Prabhu <sprabhu@redhat.com>
Acked-by: Jeff Layton <jlayton@samba.org>
Reported-by: Chris Clayton <chris2553@googlemail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 6a86f3d68182..f31dc9ac37b7 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1123,7 +1123,7 @@ static int get_option_ul(substring_t args[], unsigned long *option)
 	string = match_strdup(args);
 	if (string == NULL)
 		return -ENOMEM;
-	rc = kstrtoul(string, 10, option);
+	rc = kstrtoul(string, 0, option);
 	kfree(string);
 
 	return rc;
-- 
cgit v1.2.3


From 6ed3cf2cdfce4c9f1d73171bd3f27d9cb77b734e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 13 Apr 2012 11:49:04 -0400
Subject: btrfs: btrfs_root_readonly() broken on big-endian

->root_flags is __le64 and all accesses to it go through the helpers
that do proper conversions.  Except for btrfs_root_readonly(), which
checks bit 0 as in host-endian...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/ctree.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5b8ef8eb3521..3f65a812e282 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2166,7 +2166,7 @@ BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
 
 static inline bool btrfs_root_readonly(struct btrfs_root *root)
 {
-	return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
+	return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0;
 }
 
 /* struct btrfs_root_backup */
-- 
cgit v1.2.3


From 3a251f04fe97c3d335b745c98e4b377e3c3899f2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 13 Apr 2012 12:22:00 -0400
Subject: ocfs2: ->l_next_free_req breakage on big-endian

It's le16, not le32...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ocfs2/alloc.c        | 2 +-
 fs/ocfs2/refcounttree.c | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 3165aebb43c8..31b9463fba1f 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1134,7 +1134,7 @@ static int ocfs2_adjust_rightmost_branch(handle_t *handle,
 	}
 
 	el = path_leaf_el(path);
-	rec = &el->l_recs[le32_to_cpu(el->l_next_free_rec) - 1];
+	rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec) - 1];
 
 	ocfs2_adjust_rightmost_records(handle, et, path, rec);
 
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index cf7823382664..a7b7217062b7 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -1036,14 +1036,14 @@ static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci,
 
 	tmp_el = left_path->p_node[subtree_root].el;
 	blkno = left_path->p_node[subtree_root+1].bh->b_blocknr;
-	for (i = 0; i < le32_to_cpu(tmp_el->l_next_free_rec); i++) {
+	for (i = 0; i < le16_to_cpu(tmp_el->l_next_free_rec); i++) {
 		if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) {
 			*cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos);
 			break;
 		}
 	}
 
-	BUG_ON(i == le32_to_cpu(tmp_el->l_next_free_rec));
+	BUG_ON(i == le16_to_cpu(tmp_el->l_next_free_rec));
 
 out:
 	ocfs2_free_path(left_path);
-- 
cgit v1.2.3


From e1bf4cc620fd143766ddfcee3b004a1d1bb34fd0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 13 Apr 2012 12:27:11 -0400
Subject: ocfs: ->rl_used breakage on big-endian

it's le16, not le32 or le64...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ocfs2/refcounttree.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index a7b7217062b7..bc90ebcec169 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -1468,7 +1468,7 @@ static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
 
 	trace_ocfs2_divide_leaf_refcount_block(
 		(unsigned long long)ref_leaf_bh->b_blocknr,
-		le32_to_cpu(rl->rl_count), le32_to_cpu(rl->rl_used));
+		le32_to_cpu(rl->rl_count), le16_to_cpu(rl->rl_used));
 
 	/*
 	 * XXX: Improvement later.
@@ -2411,7 +2411,7 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
 				rb = (struct ocfs2_refcount_block *)
 							prev_bh->b_data;
 
-				if (le64_to_cpu(rb->rf_records.rl_used) +
+				if (le16_to_cpu(rb->rf_records.rl_used) +
 				    recs_add >
 				    le16_to_cpu(rb->rf_records.rl_count))
 					ref_blocks++;
@@ -2476,7 +2476,7 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
 	if (prev_bh) {
 		rb = (struct ocfs2_refcount_block *)prev_bh->b_data;
 
-		if (le64_to_cpu(rb->rf_records.rl_used) + recs_add >
+		if (le16_to_cpu(rb->rf_records.rl_used) + recs_add >
 		    le16_to_cpu(rb->rf_records.rl_count))
 			ref_blocks++;
 
@@ -3629,7 +3629,7 @@ int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
 			 * one will split a refcount rec, so totally we need
 			 * clusters * 2 new refcount rec.
 			 */
-			if (le64_to_cpu(rb->rf_records.rl_used) + clusters * 2 >
+			if (le16_to_cpu(rb->rf_records.rl_used) + clusters * 2 >
 			    le16_to_cpu(rb->rf_records.rl_count))
 				ref_blocks++;
 
-- 
cgit v1.2.3


From 28748b325dc2d730ccc312830a91c4ae0c0d9379 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 13 Apr 2012 12:28:21 -0400
Subject: ocfs2: ->rl_count endianness breakage

le16, not le32...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ocfs2/refcounttree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index bc90ebcec169..9f32d7cbb7a3 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -1468,7 +1468,7 @@ static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
 
 	trace_ocfs2_divide_leaf_refcount_block(
 		(unsigned long long)ref_leaf_bh->b_blocknr,
-		le32_to_cpu(rl->rl_count), le16_to_cpu(rl->rl_used));
+		le16_to_cpu(rl->rl_count), le16_to_cpu(rl->rl_used));
 
 	/*
 	 * XXX: Improvement later.
-- 
cgit v1.2.3


From 72094e43e3af5020510f920321d71f1798fa896d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 13 Apr 2012 12:30:02 -0400
Subject: ocfs2: ->e_leaf_clusters endianness breakage

le16, not le32...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ocfs2/suballoc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index ba5d97e4a73e..f169da4624fd 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -600,7 +600,7 @@ static void ocfs2_bg_alloc_cleanup(handle_t *handle,
 		ret = ocfs2_free_clusters(handle, cluster_ac->ac_inode,
 					  cluster_ac->ac_bh,
 					  le64_to_cpu(rec->e_blkno),
-					  le32_to_cpu(rec->e_leaf_clusters));
+					  le16_to_cpu(rec->e_leaf_clusters));
 		if (ret)
 			mlog_errno(ret);
 		/* Try all the clusters to free */
@@ -1628,7 +1628,7 @@ static int ocfs2_bg_discontig_fix_by_rec(struct ocfs2_suballoc_result *res,
 {
 	unsigned int bpc = le16_to_cpu(cl->cl_bpc);
 	unsigned int bitoff = le32_to_cpu(rec->e_cpos) * bpc;
-	unsigned int bitcount = le32_to_cpu(rec->e_leaf_clusters) * bpc;
+	unsigned int bitcount = le16_to_cpu(rec->e_leaf_clusters) * bpc;
 
 	if (res->sr_bit_offset < bitoff)
 		return 0;
-- 
cgit v1.2.3


From e847469bf77a1d339274074ed068d461f0c872bc Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 13 Apr 2012 13:49:47 -0400
Subject: lockd: fix the endianness bug

comparing be32 values for < is not doing the right thing...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/lockd/clnt4xdr.c | 2 +-
 fs/lockd/clntxdr.c  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/clnt4xdr.c b/fs/lockd/clnt4xdr.c
index 3ddcbb1c0a43..13ad1539fbf2 100644
--- a/fs/lockd/clnt4xdr.c
+++ b/fs/lockd/clnt4xdr.c
@@ -241,7 +241,7 @@ static int decode_nlm4_stat(struct xdr_stream *xdr, __be32 *stat)
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(p == NULL))
 		goto out_overflow;
-	if (unlikely(*p > nlm4_failed))
+	if (unlikely(ntohl(*p) > ntohl(nlm4_failed)))
 		goto out_bad_xdr;
 	*stat = *p;
 	return 0;
diff --git a/fs/lockd/clntxdr.c b/fs/lockd/clntxdr.c
index 3d35e3e80c1c..d269ada7670e 100644
--- a/fs/lockd/clntxdr.c
+++ b/fs/lockd/clntxdr.c
@@ -236,7 +236,7 @@ static int decode_nlm_stat(struct xdr_stream *xdr,
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(p == NULL))
 		goto out_overflow;
-	if (unlikely(*p > nlm_lck_denied_grace_period))
+	if (unlikely(ntohl(*p) > ntohl(nlm_lck_denied_grace_period)))
 		goto out_enum;
 	*stat = *p;
 	return 0;
-- 
cgit v1.2.3


From 9cd70b347e9761ea2d2ac3d758c529a48a8193e6 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 16 Apr 2012 12:16:20 -0400
Subject: ext4: address scalability issue by removing extent cache statistics

Andi Kleen and Tim Chen have reported that under certain circumstances
the extent cache statistics are causing scalability problems due to
cache line bounces.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/ext4.h    |  3 ---
 fs/ext4/extents.c |  4 ----
 fs/ext4/super.c   | 16 ----------------
 3 files changed, 23 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ab2594a30f86..0e01e90add8b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1203,9 +1203,6 @@ struct ext4_sb_info {
 	unsigned long s_ext_blocks;
 	unsigned long s_ext_extents;
 #endif
-	/* ext4 extent cache stats */
-	unsigned long extent_cache_hits;
-	unsigned long extent_cache_misses;
 
 	/* for buddy allocator */
 	struct ext4_group_info ***s_group_info;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 1421938e6792..8a12cd207679 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2066,10 +2066,6 @@ static int ext4_ext_check_cache(struct inode *inode, ext4_lblk_t block,
 		ret = 1;
 	}
 errout:
-	if (!ret)
-		sbi->extent_cache_misses++;
-	else
-		sbi->extent_cache_hits++;
 	trace_ext4_ext_in_cache(inode, block, ret);
 	spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 	return ret;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ceebaf853beb..ef7db5044262 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2366,18 +2366,6 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
 			  EXT4_SB(sb)->s_sectors_written_start) >> 1)));
 }
 
-static ssize_t extent_cache_hits_show(struct ext4_attr *a,
-				      struct ext4_sb_info *sbi, char *buf)
-{
-	return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_hits);
-}
-
-static ssize_t extent_cache_misses_show(struct ext4_attr *a,
-					struct ext4_sb_info *sbi, char *buf)
-{
-	return snprintf(buf, PAGE_SIZE, "%lu\n", sbi->extent_cache_misses);
-}
-
 static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
 					  struct ext4_sb_info *sbi,
 					  const char *buf, size_t count)
@@ -2435,8 +2423,6 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
 EXT4_RO_ATTR(delayed_allocation_blocks);
 EXT4_RO_ATTR(session_write_kbytes);
 EXT4_RO_ATTR(lifetime_write_kbytes);
-EXT4_RO_ATTR(extent_cache_hits);
-EXT4_RO_ATTR(extent_cache_misses);
 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
 		 inode_readahead_blks_store, s_inode_readahead_blks);
 EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
@@ -2452,8 +2438,6 @@ static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(delayed_allocation_blocks),
 	ATTR_LIST(session_write_kbytes),
 	ATTR_LIST(lifetime_write_kbytes),
-	ATTR_LIST(extent_cache_hits),
-	ATTR_LIST(extent_cache_misses),
 	ATTR_LIST(inode_readahead_blks),
 	ATTR_LIST(inode_goal),
 	ATTR_LIST(mb_stats),
-- 
cgit v1.2.3


From 57f73c2c89a5d3b2ed87201c8100d1fa989a1a65 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 16 Apr 2012 18:55:26 -0400
Subject: ext4: fix handling of journalled quota options

Commit 26092bf5 broke handling of journalled quota mount options by
trying to parse argument of every mount option as a number.  Fix this
by dealing with the quota options before we call match_int().

Thanks to Jan Kara for discovering this regression.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/super.c | 32 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ef7db5044262..6da193564e43 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1305,20 +1305,20 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
 		ext4_msg(sb, KERN_ERR,
 			"Cannot change journaled "
 			"quota options when quota turned on");
-		return 0;
+		return -1;
 	}
 	qname = match_strdup(args);
 	if (!qname) {
 		ext4_msg(sb, KERN_ERR,
 			"Not enough memory for storing quotafile name");
-		return 0;
+		return -1;
 	}
 	if (sbi->s_qf_names[qtype] &&
 		strcmp(sbi->s_qf_names[qtype], qname)) {
 		ext4_msg(sb, KERN_ERR,
 			"%s quota file already specified", QTYPE2NAME(qtype));
 		kfree(qname);
-		return 0;
+		return -1;
 	}
 	sbi->s_qf_names[qtype] = qname;
 	if (strchr(sbi->s_qf_names[qtype], '/')) {
@@ -1326,7 +1326,7 @@ static int set_qf_name(struct super_block *sb, int qtype, substring_t *args)
 			"quotafile must be on filesystem root");
 		kfree(sbi->s_qf_names[qtype]);
 		sbi->s_qf_names[qtype] = NULL;
-		return 0;
+		return -1;
 	}
 	set_opt(sb, QUOTA);
 	return 1;
@@ -1341,7 +1341,7 @@ static int clear_qf_name(struct super_block *sb, int qtype)
 		sbi->s_qf_names[qtype]) {
 		ext4_msg(sb, KERN_ERR, "Cannot change journaled quota options"
 			" when quota turned on");
-		return 0;
+		return -1;
 	}
 	/*
 	 * The space will be released later when all options are confirmed
@@ -1450,6 +1450,16 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 	const struct mount_opts *m;
 	int arg = 0;
 
+#ifdef CONFIG_QUOTA
+	if (token == Opt_usrjquota)
+		return set_qf_name(sb, USRQUOTA, &args[0]);
+	else if (token == Opt_grpjquota)
+		return set_qf_name(sb, GRPQUOTA, &args[0]);
+	else if (token == Opt_offusrjquota)
+		return clear_qf_name(sb, USRQUOTA);
+	else if (token == Opt_offgrpjquota)
+		return clear_qf_name(sb, GRPQUOTA);
+#endif
 	if (args->from && match_int(args, &arg))
 		return -1;
 	switch (token) {
@@ -1549,18 +1559,6 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 				sbi->s_mount_opt |= m->mount_opt;
 			}
 #ifdef CONFIG_QUOTA
-		} else if (token == Opt_usrjquota) {
-			if (!set_qf_name(sb, USRQUOTA, &args[0]))
-				return -1;
-		} else if (token == Opt_grpjquota) {
-			if (!set_qf_name(sb, GRPQUOTA, &args[0]))
-				return -1;
-		} else if (token == Opt_offusrjquota) {
-			if (!clear_qf_name(sb, USRQUOTA))
-				return -1;
-		} else if (token == Opt_offgrpjquota) {
-			if (!clear_qf_name(sb, GRPQUOTA))
-				return -1;
 		} else if (m->flags & MOPT_QFMT) {
 			if (sb_any_quota_loaded(sb) &&
 			    sbi->s_jquota_fmt != m->mount_opt) {
-- 
cgit v1.2.3


From ca138f368a36dba40d3ef4a53d64af2011cda3c7 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Thu, 5 Apr 2012 15:26:36 -0400
Subject: NFS: check for req==NULL in nfs_try_to_update_request cleanup

Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/write.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 2c68818f68ac..9b8d4d42a8af 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -682,7 +682,8 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
 		req->wb_bytes = rqend - req->wb_offset;
 out_unlock:
 	spin_unlock(&inode->i_lock);
-	nfs_clear_request_commit(req);
+	if (req)
+		nfs_clear_request_commit(req);
 	return req;
 out_flushme:
 	spin_unlock(&inode->i_lock);
-- 
cgit v1.2.3


From 848cce0d4102b5b4b26b0987b43e1919d462afe2 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 21 Feb 2012 17:04:28 +0800
Subject: Btrfs: avoid setting ->d_op twice

Follow those instructions, and you'll trigger a warning in the
beginning of d_set_d_op():

  # mkfs.btrfs /dev/loop3
  # mount /dev/loop3 /mnt
  # btrfs sub create /mnt/sub
  # btrfs sub snap /mnt /mnt/snap
  # touch /mnt/snap/sub
  touch: cannot touch `tmp': Permission denied

__d_alloc() set d_op to sb->s_d_op (btrfs_dentry_operations), and
then simple_lookup() reset it to simple_dentry_operations, which
triggered the warning.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/inode.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1be31368e881..a682c267576d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4069,7 +4069,7 @@ static struct inode *new_simple_dir(struct super_block *s,
 	BTRFS_I(inode)->dummy_inode = 1;
 
 	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
-	inode->i_op = &simple_dir_inode_operations;
+	inode->i_op = &btrfs_dir_ro_inode_operations;
 	inode->i_fop = &simple_dir_operations;
 	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -4140,14 +4140,18 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 static int btrfs_dentry_delete(const struct dentry *dentry)
 {
 	struct btrfs_root *root;
+	struct inode *inode = dentry->d_inode;
 
-	if (!dentry->d_inode && !IS_ROOT(dentry))
-		dentry = dentry->d_parent;
+	if (!inode && !IS_ROOT(dentry))
+		inode = dentry->d_parent->d_inode;
 
-	if (dentry->d_inode) {
-		root = BTRFS_I(dentry->d_inode)->root;
+	if (inode) {
+		root = BTRFS_I(inode)->root;
 		if (btrfs_root_refs(&root->root_item) == 0)
 			return 1;
+
+		if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+			return 1;
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From 8c9c2bf7a3c4f7e9d158c0be9c49f372fb943ad2 Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Sat, 25 Feb 2012 09:09:30 +0100
Subject: btrfs: fix race in reada

When inserting into the radix tree returns EEXIST, get the existing
entry without giving up the spinlock in between.
There was a race for both the zones trees and the extent tree.

Signed-off-by: Arne Jansen <sensille@gmx.net>
---
 fs/btrfs/inode.c |  8 +++++++-
 fs/btrfs/reada.c | 35 ++++++++++++++++-------------------
 2 files changed, 23 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a682c267576d..98ee5a51aa29 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4332,7 +4332,13 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 			}
 no_dentry:
 			/* is this a reference to our own snapshot? If so
-			 * skip it
+			 * skip it.
+			 *
+			 * In contrast to old kernels, we insert the snapshot's
+			 * dir item and dir index after it has been created, so
+			 * we won't find a reference to our own snapshot. We
+			 * still keep the following code for backward
+			 * compatibility.
 			 */
 			if (location.type == BTRFS_ROOT_ITEM_KEY &&
 			    location.objectid == root->root_key.objectid) {
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index dc5d33146fdb..8dec650099c8 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -250,14 +250,12 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
 					  struct btrfs_bio *bbio)
 {
 	int ret;
-	int looped = 0;
 	struct reada_zone *zone;
 	struct btrfs_block_group_cache *cache = NULL;
 	u64 start;
 	u64 end;
 	int i;
 
-again:
 	zone = NULL;
 	spin_lock(&fs_info->reada_lock);
 	ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
@@ -274,9 +272,6 @@ again:
 		spin_unlock(&fs_info->reada_lock);
 	}
 
-	if (looped)
-		return NULL;
-
 	cache = btrfs_lookup_block_group(fs_info, logical);
 	if (!cache)
 		return NULL;
@@ -307,13 +302,15 @@ again:
 	ret = radix_tree_insert(&dev->reada_zones,
 				(unsigned long)(zone->end >> PAGE_CACHE_SHIFT),
 				zone);
-	spin_unlock(&fs_info->reada_lock);
 
-	if (ret) {
+	if (ret == -EEXIST) {
 		kfree(zone);
-		looped = 1;
-		goto again;
+		ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
+					     logical >> PAGE_CACHE_SHIFT, 1);
+		if (ret == 1)
+			kref_get(&zone->refcnt);
 	}
+	spin_unlock(&fs_info->reada_lock);
 
 	return zone;
 }
@@ -323,8 +320,8 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 					      struct btrfs_key *top, int level)
 {
 	int ret;
-	int looped = 0;
 	struct reada_extent *re = NULL;
+	struct reada_extent *re_exist = NULL;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
 	struct btrfs_bio *bbio = NULL;
@@ -335,14 +332,13 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 	int i;
 	unsigned long index = logical >> PAGE_CACHE_SHIFT;
 
-again:
 	spin_lock(&fs_info->reada_lock);
 	re = radix_tree_lookup(&fs_info->reada_tree, index);
 	if (re)
 		kref_get(&re->refcnt);
 	spin_unlock(&fs_info->reada_lock);
 
-	if (re || looped)
+	if (re)
 		return re;
 
 	re = kzalloc(sizeof(*re), GFP_NOFS);
@@ -398,12 +394,15 @@ again:
 	/* insert extent in reada_tree + all per-device trees, all or nothing */
 	spin_lock(&fs_info->reada_lock);
 	ret = radix_tree_insert(&fs_info->reada_tree, index, re);
+	if (ret == -EEXIST) {
+		re_exist = radix_tree_lookup(&fs_info->reada_tree, index);
+		BUG_ON(!re_exist);
+		kref_get(&re_exist->refcnt);
+		spin_unlock(&fs_info->reada_lock);
+		goto error;
+	}
 	if (ret) {
 		spin_unlock(&fs_info->reada_lock);
-		if (ret != -ENOMEM) {
-			/* someone inserted the extent in the meantime */
-			looped = 1;
-		}
 		goto error;
 	}
 	for (i = 0; i < nzones; ++i) {
@@ -450,9 +449,7 @@ error:
 	}
 	kfree(bbio);
 	kfree(re);
-	if (looped)
-		goto again;
-	return NULL;
+	return re_exist;
 }
 
 static void reada_kref_dummy(struct kref *kr)
-- 
cgit v1.2.3


From 207a232ccac0a8cb79d304bd17298dbc96e2e082 Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Sat, 25 Feb 2012 09:09:47 +0100
Subject: btrfs: don't add both copies of DUP to reada extent tree

Normally when there are 2 copies of a block, we add both to the
reada extent tree and prefetch only the one that is easier to reach.
This way we can better utilize multiple devices.
In case of DUP this makes no sense as both copies reside on the
same device.

Signed-off-by: Arne Jansen <sensille@gmx.net>
---
 fs/btrfs/reada.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 8dec650099c8..ac5d01085884 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -326,6 +326,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
 	struct btrfs_bio *bbio = NULL;
 	struct btrfs_device *dev;
+	struct btrfs_device *prev_dev;
 	u32 blocksize;
 	u64 length;
 	int nzones = 0;
@@ -405,8 +406,20 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 		spin_unlock(&fs_info->reada_lock);
 		goto error;
 	}
+	prev_dev = NULL;
 	for (i = 0; i < nzones; ++i) {
 		dev = bbio->stripes[i].dev;
+		if (dev == prev_dev) {
+			/*
+			 * in case of DUP, just add the first zone. As both
+			 * are on the same device, there's nothing to gain
+			 * from adding both.
+			 * Also, it wouldn't work, as the tree is per device
+			 * and adding would fail with EEXIST
+			 */
+			continue;
+		}
+		prev_dev = dev;
 		ret = radix_tree_insert(&dev->reada_extents, index, re);
 		if (ret) {
 			while (--i >= 0) {
-- 
cgit v1.2.3


From 8d082fb727ac11930ea20bf1612e334ea7c2b697 Mon Sep 17 00:00:00 2001
From: Liu Bo <liubo2009@cn.fujitsu.com>
Date: Tue, 3 Apr 2012 09:56:53 +0800
Subject: Btrfs: do not mount when we have a sectorsize unequal to PAGE_SIZE

Our code is not ready to cope with a sectorsize that's not equal to PAGE_SIZE.
It will lead to hanging-on while writing something.

Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
---
 fs/btrfs/disk-io.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 20196f411206..b9866f27ebd7 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2254,9 +2254,9 @@ int open_ctree(struct super_block *sb,
 		goto fail_sb_buffer;
 	}
 
-	if (sectorsize < PAGE_SIZE) {
-		printk(KERN_WARNING "btrfs: Incompatible sector size "
-		       "found on %s\n", sb->s_id);
+	if (sectorsize != PAGE_SIZE) {
+		printk(KERN_WARNING "btrfs: Incompatible sector size(%lu) "
+		       "found on %s\n", (unsigned long)sectorsize, sb->s_id);
 		goto fail_sb_buffer;
 	}
 
-- 
cgit v1.2.3


From 871383be592ba7e819d27556591e315a0df38cee Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Mon, 2 Apr 2012 18:31:37 +0200
Subject: btrfs: add missing unlocks to transaction abort paths

Added in commit 49b25e0540904be0bf558b84475c69d72e4de66e
("btrfs: enhance transaction abort infrastructure")

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/transaction.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 11b77a59db62..36422254ef67 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -73,8 +73,10 @@ loop:
 
 	cur_trans = root->fs_info->running_transaction;
 	if (cur_trans) {
-		if (cur_trans->aborted)
+		if (cur_trans->aborted) {
+			spin_unlock(&root->fs_info->trans_lock);
 			return cur_trans->aborted;
+		}
 		atomic_inc(&cur_trans->use_count);
 		atomic_inc(&cur_trans->num_writers);
 		cur_trans->num_joined++;
@@ -1400,6 +1402,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	ret = commit_fs_roots(trans, root);
 	if (ret) {
 		mutex_unlock(&root->fs_info->tree_log_mutex);
+		mutex_unlock(&root->fs_info->reloc_mutex);
 		goto cleanup_transaction;
 	}
 
@@ -1411,6 +1414,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	ret = commit_cowonly_roots(trans, root);
 	if (ret) {
 		mutex_unlock(&root->fs_info->tree_log_mutex);
+		mutex_unlock(&root->fs_info->reloc_mutex);
 		goto cleanup_transaction;
 	}
 
-- 
cgit v1.2.3


From 8e52acf70459020d7e9e9fda25066be4da520943 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 12 Mar 2012 16:39:28 +0800
Subject: Btrfs: retrurn void from clear_state_bit

Currently it returns a set of bits that were cleared, but this return
value is not used at all.

Moreover it doesn't seem to be useful, because we may clear the bits
of a few extent_states, but only the cleared bits of last one is
returned.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/extent_io.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4789770f8eaf..05951bdf72cc 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -404,18 +404,16 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 
 /*
  * utility function to clear some bits in an extent state struct.
- * it will optionally wake up any one waiting on this state (wake == 1), or
- * forcibly remove the state from the tree (delete == 1).
+ * it will optionally wake up any one waiting on this state (wake == 1)
  *
  * If no bits are set on the state struct after clearing things, the
  * struct is freed and removed from the tree
  */
-static int clear_state_bit(struct extent_io_tree *tree,
+static void clear_state_bit(struct extent_io_tree *tree,
 			    struct extent_state *state,
 			    int *bits, int wake)
 {
 	int bits_to_clear = *bits & ~EXTENT_CTLBITS;
-	int ret = state->state & bits_to_clear;
 
 	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
 		u64 range = state->end - state->start + 1;
@@ -437,7 +435,6 @@ static int clear_state_bit(struct extent_io_tree *tree,
 	} else {
 		merge_state(tree, state);
 	}
-	return ret;
 }
 
 static struct extent_state *
-- 
cgit v1.2.3


From cdc6a3952558f00b1bc3b6401e1cf98797632fe2 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 12 Mar 2012 16:39:48 +0800
Subject: Btrfs: avoid possible use-after-free in clear_extent_bit()

clear_extent_bit()
{
    next_node = rb_next(&state->rb_node);
    ...
    clear_state_bit(state);  <-- this may free next_node
    if (next_node) {
        state = rb_entry(next_node);
        ...
    }
}

clear_state_bit() calls merge_state() which may free the next node
of the passing extent_state, so clear_extent_bit() may end up
referencing freed memory.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/extent_io.c | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 05951bdf72cc..11eeb81fe695 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -402,6 +402,15 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 	return 0;
 }
 
+static struct extent_state *next_state(struct extent_state *state)
+{
+	struct rb_node *next = rb_next(&state->rb_node);
+	if (next)
+		return rb_entry(next, struct extent_state, rb_node);
+	else
+		return NULL;
+}
+
 /*
  * utility function to clear some bits in an extent state struct.
  * it will optionally wake up any one waiting on this state (wake == 1)
@@ -409,10 +418,11 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
  * If no bits are set on the state struct after clearing things, the
  * struct is freed and removed from the tree
  */
-static void clear_state_bit(struct extent_io_tree *tree,
-			    struct extent_state *state,
-			    int *bits, int wake)
+static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
+					    struct extent_state *state,
+					    int *bits, int wake)
 {
+	struct extent_state *next;
 	int bits_to_clear = *bits & ~EXTENT_CTLBITS;
 
 	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
@@ -425,6 +435,7 @@ static void clear_state_bit(struct extent_io_tree *tree,
 	if (wake)
 		wake_up(&state->wq);
 	if (state->state == 0) {
+		next = next_state(state);
 		if (state->tree) {
 			rb_erase(&state->rb_node, &tree->state);
 			state->tree = NULL;
@@ -434,7 +445,9 @@ static void clear_state_bit(struct extent_io_tree *tree,
 		}
 	} else {
 		merge_state(tree, state);
+		next = next_state(state);
 	}
+	return next;
 }
 
 static struct extent_state *
@@ -473,7 +486,6 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	struct extent_state *state;
 	struct extent_state *cached;
 	struct extent_state *prealloc = NULL;
-	struct rb_node *next_node;
 	struct rb_node *node;
 	u64 last_end;
 	int err;
@@ -525,14 +537,11 @@ hit_next:
 	WARN_ON(state->end < start);
 	last_end = state->end;
 
-	if (state->end < end && !need_resched())
-		next_node = rb_next(&state->rb_node);
-	else
-		next_node = NULL;
-
 	/* the state doesn't have the wanted bits, go ahead */
-	if (!(state->state & bits))
+	if (!(state->state & bits)) {
+		state = next_state(state);
 		goto next;
+	}
 
 	/*
 	 *     | ---- desired range ---- |
@@ -590,16 +599,13 @@ hit_next:
 		goto out;
 	}
 
-	clear_state_bit(tree, state, &bits, wake);
+	state = clear_state_bit(tree, state, &bits, wake);
 next:
 	if (last_end == (u64)-1)
 		goto out;
 	start = last_end + 1;
-	if (start <= end && next_node) {
-		state = rb_entry(next_node, struct extent_state,
-				 rb_node);
+	if (start <= end && state && !need_resched())
 		goto hit_next;
-	}
 	goto search_again;
 
 out:
-- 
cgit v1.2.3


From 4735fb282830c0966b301dabcccf4753fa6604bb Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Thu, 12 Apr 2012 22:47:52 +0200
Subject: Btrfs: Make free_ipath() deal gracefully with NULL pointers

Make free_ipath() behave like most other freeing functions in the
kernel and gracefully do nothing when passed a NULL pointer.

Besides this making the bahaviour consistent with functions such as
kfree(), vfree(), btrfs_free_path() etc etc, it also fixes a real NULL
deref issue in fs/btrfs/ioctl.c::btrfs_ioctl_ino_to_path(). In that
function we have this code:

...
        ipath = init_ipath(size, root, path);
        if (IS_ERR(ipath)) {
                ret = PTR_ERR(ipath);
                ipath = NULL;
                goto out;
        }
...
out:
        btrfs_free_path(path);
        free_ipath(ipath);
...

If we ever take the true branch of that 'if' statement we'll end up
passing a NULL pointer to free_ipath() which will subsequently
dereference it and we'll go "Boom" :-(
This patch will avoid that.

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
---
 fs/btrfs/backref.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index f4e90748940a..b332ff04c5ee 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1414,6 +1414,8 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
 
 void free_ipath(struct inode_fs_paths *ipath)
 {
+	if (!ipath)
+		return;
 	kfree(ipath->fspath);
 	kfree(ipath);
 }
-- 
cgit v1.2.3


From aefc1eb13ebbb86c5ffade8a9e2425cd71032d7e Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Fri, 13 Apr 2012 12:28:00 +0200
Subject: Btrfs: don't call free_extent_buffer twice in iterate_irefs

Avoid calling free_extent_buffer more than once when the iterator function
returns non-zero. The only code that uses this is scrub repair for corrupted
nodatasum blocks.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/backref.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index b332ff04c5ee..fb56bcc80377 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1247,7 +1247,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
 				struct btrfs_path *path,
 				iterate_irefs_t *iterate, void *ctx)
 {
-	int ret;
+	int ret = 0;
 	int slot;
 	u32 cur;
 	u32 len;
@@ -1259,7 +1259,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
 	struct btrfs_inode_ref *iref;
 	struct btrfs_key found_key;
 
-	while (1) {
+	while (!ret) {
 		ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
 					&found_key);
 		if (ret < 0)
@@ -1288,10 +1288,8 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
 				 (unsigned long long)found_key.objectid,
 				 (unsigned long long)fs_root->objectid);
 			ret = iterate(parent, iref, eb, ctx);
-			if (ret) {
-				free_extent_buffer(eb);
+			if (ret)
 				break;
-			}
 			len = sizeof(*iref) + name_len;
 			iref = (struct btrfs_inode_ref *)((char *)iref + len);
 		}
-- 
cgit v1.2.3


From b916a59adfdc875381b68ced258694b434cf43ae Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Fri, 13 Apr 2012 12:28:08 +0200
Subject: Btrfs: add missing read locks in backref.c

iref_to_path and iterate_irefs both increment the eb's refcount to use it
after releasing the path. Both depend on consistent data remaining in the
extent buffer and need a read lock to protect it.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/backref.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index fb56bcc80377..bcec06750232 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -22,6 +22,7 @@
 #include "ulist.h"
 #include "transaction.h"
 #include "delayed-ref.h"
+#include "locking.h"
 
 /*
  * this structure records all encountered refs on the way up to the root
@@ -893,18 +894,22 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 	s64 bytes_left = size - 1;
 	struct extent_buffer *eb = eb_in;
 	struct btrfs_key found_key;
+	int leave_spinning = path->leave_spinning;
 
 	if (bytes_left >= 0)
 		dest[bytes_left] = '\0';
 
+	path->leave_spinning = 1;
 	while (1) {
 		len = btrfs_inode_ref_name_len(eb, iref);
 		bytes_left -= len;
 		if (bytes_left >= 0)
 			read_extent_buffer(eb, dest + bytes_left,
 						(unsigned long)(iref + 1), len);
-		if (eb != eb_in)
+		if (eb != eb_in) {
+			btrfs_tree_read_unlock_blocking(eb);
 			free_extent_buffer(eb);
+		}
 		ret = inode_ref_info(parent, 0, fs_root, path, &found_key);
 		if (ret > 0)
 			ret = -ENOENT;
@@ -919,8 +924,11 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 		slot = path->slots[0];
 		eb = path->nodes[0];
 		/* make sure we can use eb after releasing the path */
-		if (eb != eb_in)
+		if (eb != eb_in) {
 			atomic_inc(&eb->refs);
+			btrfs_tree_read_lock(eb);
+			btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+		}
 		btrfs_release_path(path);
 
 		iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
@@ -931,6 +939,7 @@ static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 	}
 
 	btrfs_release_path(path);
+	path->leave_spinning = leave_spinning;
 
 	if (ret)
 		return ERR_PTR(ret);
@@ -1260,6 +1269,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
 	struct btrfs_key found_key;
 
 	while (!ret) {
+		path->leave_spinning = 1;
 		ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
 					&found_key);
 		if (ret < 0)
@@ -1275,6 +1285,8 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
 		eb = path->nodes[0];
 		/* make sure we can use eb after releasing the path */
 		atomic_inc(&eb->refs);
+		btrfs_tree_read_lock(eb);
+		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
 		btrfs_release_path(path);
 
 		item = btrfs_item_nr(eb, slot);
@@ -1293,6 +1305,7 @@ static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
 			len = sizeof(*iref) + name_len;
 			iref = (struct btrfs_inode_ref *)((char *)iref + len);
 		}
+		btrfs_tree_read_unlock_blocking(eb);
 		free_extent_buffer(eb);
 	}
 
-- 
cgit v1.2.3


From 37db63a400d3bac467795aa43901065fd8d903b7 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Fri, 13 Apr 2012 17:05:08 +0300
Subject: Btrfs: fix max chunk size check in chunk allocator

Fix a bug, where in case we need to adjust stripe_size so that the
length of the resulting chunk is less than or equal to max_chunk_size,
DUP chunks turn out to be only half as big as they could be.

Cc: Arne Jansen <sensille@gmx.net>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/volumes.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 759d02486d7c..ce289af526f0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3324,12 +3324,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	stripe_size = devices_info[ndevs-1].max_avail;
 	num_stripes = ndevs * dev_stripes;
 
-	if (stripe_size * num_stripes > max_chunk_size * ncopies) {
+	if (stripe_size * ndevs > max_chunk_size * ncopies) {
 		stripe_size = max_chunk_size * ncopies;
-		do_div(stripe_size, num_stripes);
+		do_div(stripe_size, ndevs);
 	}
 
 	do_div(stripe_size, dev_stripes);
+
+	/* align to BTRFS_STRIPE_LEN */
 	do_div(stripe_size, BTRFS_STRIPE_LEN);
 	stripe_size *= BTRFS_STRIPE_LEN;
 
-- 
cgit v1.2.3


From 8a3db1849e9e2563727ea2dc32737502e0096641 Mon Sep 17 00:00:00 2001
From: Sergei Trofimovich <slyfox@gentoo.org>
Date: Mon, 16 Apr 2012 06:44:37 +0300
Subject: btrfs: fix early abort in 'remount'

Cc: Jeff Mahoney <jeffm@suse.com>
Cc: Chris Mason <chris.mason@oracle.com>
Cc: Josef Bacik <josef@redhat.com>
Signed-off-by: Sergei Trofimovich <slyfox@gentoo.org>
---
 fs/btrfs/super.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 84571d7da12e..43aa2dd0bc7d 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1152,13 +1152,15 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 		if (ret)
 			goto restore;
 	} else {
-		if (fs_info->fs_devices->rw_devices == 0)
+		if (fs_info->fs_devices->rw_devices == 0) {
 			ret = -EACCES;
 			goto restore;
+		}
 
-		if (btrfs_super_log_root(fs_info->super_copy) != 0)
+		if (btrfs_super_log_root(fs_info->super_copy) != 0) {
 			ret = -EINVAL;
 			goto restore;
+		}
 
 		ret = btrfs_cleanup_fs_roots(fs_info);
 		if (ret)
-- 
cgit v1.2.3


From 48d282326b3ce5f435835f5fb0e3231c399f4f9a Mon Sep 17 00:00:00 2001
From: Julia Lawall <Julia.Lawall@lip6.fr>
Date: Sat, 14 Apr 2012 11:24:33 +0200
Subject: fs/btrfs/volumes.c: add missing free_fs_devices

Free fs_devices as done in the error-handling code just below.

Signed-off-by: Julia Lawall <Julia.Lawall@lip6.fr>
---
 fs/btrfs/volumes.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index ce289af526f0..3b984173d25b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4352,8 +4352,10 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
 
 	ret = __btrfs_open_devices(fs_devices, FMODE_READ,
 				   root->fs_info->bdev_holder);
-	if (ret)
+	if (ret) {
+		free_fs_devices(fs_devices);
 		goto out;
+	}
 
 	if (!fs_devices->seeding) {
 		__btrfs_close_devices(fs_devices);
-- 
cgit v1.2.3


From 5cf1ab56133ad7b712673c071b439d4a555a2d1e Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 16 Apr 2012 09:42:26 -0400
Subject: Btrfs: always store the mirror we read the eb from

A user reported a panic where we were trying to fix a bad mirror but the
mirror number we were giving was 0, which is invalid.  This is because we
don't do the transid verification until after the read, so as far as the
read code is concerned the read was a success.  So instead store the mirror
we read from so that if there is some failure post read we know which mirror
to try next and which mirror needs to be fixed if we find a good copy of the
block.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/disk-io.c   | 16 ++++++++--------
 fs/btrfs/extent_io.c | 15 ++++++---------
 fs/btrfs/extent_io.h |  4 ++--
 fs/btrfs/inode.c     |  2 +-
 4 files changed, 17 insertions(+), 20 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b9866f27ebd7..d0c969beaad4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -383,17 +383,16 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 		if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
 			break;
 
-		if (!failed_mirror) {
-			failed = 1;
-			printk(KERN_ERR "failed mirror was %d\n", eb->failed_mirror);
-			failed_mirror = eb->failed_mirror;
-		}
-
 		num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
 					      eb->start, eb->len);
 		if (num_copies == 1)
 			break;
 
+		if (!failed_mirror) {
+			failed = 1;
+			failed_mirror = eb->read_mirror;
+		}
+
 		mirror_num++;
 		if (mirror_num == failed_mirror)
 			mirror_num++;
@@ -564,7 +563,7 @@ struct extent_buffer *find_eb_for_page(struct extent_io_tree *tree,
 }
 
 static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
-			       struct extent_state *state)
+			       struct extent_state *state, int mirror)
 {
 	struct extent_io_tree *tree;
 	u64 found_start;
@@ -589,6 +588,7 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	if (!reads_done)
 		goto err;
 
+	eb->read_mirror = mirror;
 	if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
 		ret = -EIO;
 		goto err;
@@ -652,7 +652,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror)
 
 	eb = (struct extent_buffer *)page->private;
 	set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
-	eb->failed_mirror = failed_mirror;
+	eb->read_mirror = failed_mirror;
 	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
 		btree_readahead_hook(root, eb, eb->start, -EIO);
 	return -EIO;	/* we fixed nothing */
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 11eeb81fe695..0c23e57077c6 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2304,7 +2304,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 	u64 start;
 	u64 end;
 	int whole_page;
-	int failed_mirror;
+	int mirror;
 	int ret;
 
 	if (err)
@@ -2343,20 +2343,18 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		}
 		spin_unlock(&tree->lock);
 
+		mirror = (int)(unsigned long)bio->bi_bdev;
 		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
 			ret = tree->ops->readpage_end_io_hook(page, start, end,
-							      state);
+							      state, mirror);
 			if (ret)
 				uptodate = 0;
 			else
 				clean_io_failure(start, page);
 		}
 
-		if (!uptodate)
-			failed_mirror = (int)(unsigned long)bio->bi_bdev;
-
 		if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
-			ret = tree->ops->readpage_io_failed_hook(page, failed_mirror);
+			ret = tree->ops->readpage_io_failed_hook(page, mirror);
 			if (!ret && !err &&
 			    test_bit(BIO_UPTODATE, &bio->bi_flags))
 				uptodate = 1;
@@ -2371,8 +2369,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 			 * can't handle the error it will return -EIO and we
 			 * remain responsible for that page.
 			 */
-			ret = bio_readpage_error(bio, page, start, end,
-							failed_mirror, NULL);
+			ret = bio_readpage_error(bio, page, start, end, mirror, NULL);
 			if (ret == 0) {
 				uptodate =
 					test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -4465,7 +4462,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	}
 
 	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
-	eb->failed_mirror = 0;
+	eb->read_mirror = 0;
 	atomic_set(&eb->io_pages, num_reads);
 	for (i = start_i; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index faf10eb57f75..b516c3b8dec6 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -79,7 +79,7 @@ struct extent_io_ops {
 					u64 start, u64 end,
 				       struct extent_state *state);
 	int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
-				    struct extent_state *state);
+				    struct extent_state *state, int mirror);
 	int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
 				      struct extent_state *state, int uptodate);
 	void (*set_bit_hook)(struct inode *inode, struct extent_state *state,
@@ -135,7 +135,7 @@ struct extent_buffer {
 	spinlock_t refs_lock;
 	atomic_t refs;
 	atomic_t io_pages;
-	int failed_mirror;
+	int read_mirror;
 	struct list_head leak_list;
 	struct rcu_head rcu_head;
 	pid_t lock_owner;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 98ee5a51aa29..d953f8820464 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1947,7 +1947,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
  * extent_io.c will try to find good copies for us.
  */
 static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
-			       struct extent_state *state)
+			       struct extent_state *state, int mirror)
 {
 	size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
 	struct inode *inode = page->mapping->host;
-- 
cgit v1.2.3


From 253beebd5a255e07d6a8b65515491f33664e82a2 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 18 Apr 2012 09:59:03 +0300
Subject: Btrfs: double unlock bug in error handling

The caller expects this function to return with the lock held and
releases it immediately on error.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
---
 fs/btrfs/extent-tree.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2b35f8d14bb9..a0bb9dcd3c36 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2301,6 +2301,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 
 				if (ret) {
 					printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret);
+					spin_lock(&delayed_refs->lock);
 					return ret;
 				}
 
@@ -2331,6 +2332,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 
 		if (ret) {
 			printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret);
+			spin_lock(&delayed_refs->lock);
 			return ret;
 		}
 
-- 
cgit v1.2.3


From b9688bb8459b67e42327de6420edb405a9188775 Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Wed, 18 Apr 2012 10:27:16 +0200
Subject: btrfs: don't return EINTR

It is basically a good thing if we are interruptible when waiting for
free space, but the generality in which it is implemented currently
leads to system calls being interruptible that are not documented this
way. For example git can't handle interrupted unlink(), leading to
corrupt repos under space pressure.
Instead we raise the bar to only be interruptible by SIGKILL.
Thanks to David Sterba for suggesting this.

Signed-off-by: Arne Jansen <sensille@gmx.net>
---
 fs/btrfs/extent-tree.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a0bb9dcd3c36..84497f8eb043 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3771,13 +3771,10 @@ again:
 		 */
 		if (current->journal_info)
 			return -EAGAIN;
-		ret = wait_event_interruptible(space_info->wait,
-					       !space_info->flush);
-		/* Must have been interrupted, return */
-		if (ret) {
-			printk(KERN_DEBUG "btrfs: %s returning -EINTR\n", __func__);
+		ret = wait_event_killable(space_info->wait, !space_info->flush);
+		/* Must have been killed, return */
+		if (ret)
 			return -EINTR;
-		}
 
 		spin_lock(&space_info->lock);
 	}
-- 
cgit v1.2.3


From 99ba55ad696944b37d5557bc5b4816890854fdb9 Mon Sep 17 00:00:00 2001
From: Stefan Behrens <sbehrens@giantdisaster.de>
Date: Mon, 19 Mar 2012 16:17:22 +0100
Subject: Btrfs: fix btrfs_ioctl_dev_info() crash on missing device

When a filesystem is mounted with the degraded option, it is
possible that some of the devices are not there.
btrfs_ioctl_dev_info() crashs in this case because the device
name is a NULL pointer. This ioctl was only used for scrub.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
---
 fs/btrfs/ioctl.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 18cc23d164a8..14f8e1faa46e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2262,7 +2262,10 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
 	di_args->bytes_used = dev->bytes_used;
 	di_args->total_bytes = dev->total_bytes;
 	memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
-	strncpy(di_args->path, dev->name, sizeof(di_args->path));
+	if (dev->name)
+		strncpy(di_args->path, dev->name, sizeof(di_args->path));
+	else
+		di_args->path[0] = '\0';
 
 out:
 	if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
-- 
cgit v1.2.3


From 5c84fc3c3914e9adfa6155a167c6c0c2709e6a62 Mon Sep 17 00:00:00 2001
From: Stefan Behrens <sbehrens@giantdisaster.de>
Date: Fri, 30 Mar 2012 13:58:31 +0200
Subject: Btrfs: don't count CRC or header errors twice while scrubbing

Each CRC or header error was counted twice, this is now fixed.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
---
 fs/btrfs/scrub.c | 15 ---------------
 1 file changed, 15 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 60f0e28db31e..b679bf68861e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1258,12 +1258,6 @@ static int scrub_checksum_data(struct scrub_block *sblock)
 	if (memcmp(csum, on_disk_csum, sdev->csum_size))
 		fail = 1;
 
-	if (fail) {
-		spin_lock(&sdev->stat_lock);
-		++sdev->stat.csum_errors;
-		spin_unlock(&sdev->stat_lock);
-	}
-
 	return fail;
 }
 
@@ -1336,15 +1330,6 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
 	if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
 		++crc_fail;
 
-	if (crc_fail || fail) {
-		spin_lock(&sdev->stat_lock);
-		if (crc_fail)
-			++sdev->stat.csum_errors;
-		if (fail)
-			++sdev->stat.verify_errors;
-		spin_unlock(&sdev->stat_lock);
-	}
-
 	return fail || crc_fail;
 }
 
-- 
cgit v1.2.3


From 25cd999e1a685dab65292afbe9fa24d790d8a859 Mon Sep 17 00:00:00 2001
From: Stefan Behrens <sbehrens@giantdisaster.de>
Date: Fri, 30 Mar 2012 13:58:32 +0200
Subject: Btrfs: fix that check_int_data mount option was ignored

The bitfield member mount_opt was too small by one bit to hold the mount
option that enabled to include data extents in the integrity checker.
Since the same issue happened when the BTRFS_MOUNT_PANIC_ON_FATAL_ERROR
option was added (git rebase silently merges so that the increase of the
size of the bitfield member is lost), the bit limit was removed entirely.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
---
 fs/btrfs/ctree.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5b8ef8eb3521..ec42a24e935e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1078,7 +1078,7 @@ struct btrfs_fs_info {
 	 * is required instead of the faster short fsync log commits
 	 */
 	u64 last_trans_log_full_commit;
-	unsigned long mount_opt:21;
+	unsigned long mount_opt;
 	unsigned long compress_type:4;
 	u64 max_inline;
 	u64 alloc_start;
-- 
cgit v1.2.3


From 05ffe24f5290dc095f98fbaf84afe51ef404ccc5 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 18 Apr 2012 12:20:10 -0400
Subject: NFSv4: Ensure that the LOCK code sets exception->inode

All callers of nfs4_handle_exception() that need to handle
NFS4ERR_OPENMODE correctly should set exception->inode

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@vger.kernel.org
---
 fs/nfs/nfs4proc.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f82bde005a82..3c787d02131b 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4558,7 +4558,9 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
 static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request)
 {
 	struct nfs_server *server = NFS_SERVER(state->inode);
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.inode = state->inode,
+	};
 	int err;
 
 	do {
@@ -4576,7 +4578,9 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
 static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request)
 {
 	struct nfs_server *server = NFS_SERVER(state->inode);
-	struct nfs4_exception exception = { };
+	struct nfs4_exception exception = {
+		.inode = state->inode,
+	};
 	int err;
 
 	err = nfs4_set_lock_state(state, request);
@@ -4676,6 +4680,7 @@ static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *
 {
 	struct nfs4_exception exception = {
 		.state = state,
+		.inode = state->inode,
 	};
 	int err;
 
-- 
cgit v1.2.3


From 55725513b5ef9d462aa3e18527658a0362aaae83 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 18 Apr 2012 12:48:35 -0400
Subject: NFSv4: Ensure that we check lock exclusive/shared type against open
 modes

Since we may be simulating flock() locks using NFS byte range locks,
we can't rely on the VFS having checked the file open mode for us.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@vger.kernel.org
---
 fs/nfs/nfs4proc.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 3c787d02131b..ba837d977a1a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4726,6 +4726,20 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
 
 	if (state == NULL)
 		return -ENOLCK;
+	/*
+	 * Don't rely on the VFS having checked the file open mode,
+	 * since it won't do this for flock() locks.
+	 */
+	switch (request->fl_type & (F_RDLCK|F_WRLCK|F_UNLCK)) {
+	case F_RDLCK:
+		if (!(filp->f_mode & FMODE_READ))
+			return -EBADF;
+		break;
+	case F_WRLCK:
+		if (!(filp->f_mode & FMODE_WRITE))
+			return -EBADF;
+	}
+
 	do {
 		status = nfs4_proc_setlk(state, cmd, request);
 		if ((status != -EAGAIN) || IS_SETLK(cmd))
-- 
cgit v1.2.3


From 451146be933e26e21277852b5e40c6a52266ef96 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 18 Apr 2012 16:29:11 -0400
Subject: NFSv4: Fix open(O_TRUNC) and ftruncate() error handling

If the file wasn't opened for writing, then truncate and ftruncate
need to report the appropriate errors.

Reported-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@vger.kernel.org
---
 fs/nfs/dir.c      |  4 ++--
 fs/nfs/nfs4proc.c | 15 ++++++++++++---
 2 files changed, 14 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 4aaf0316d76a..8789210c6905 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1429,7 +1429,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
 	}
 
 	open_flags = nd->intent.open.flags;
-	attr.ia_valid = 0;
+	attr.ia_valid = ATTR_OPEN;
 
 	ctx = create_nfs_open_context(dentry, open_flags);
 	res = ERR_CAST(ctx);
@@ -1536,7 +1536,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
 	if (IS_ERR(ctx))
 		goto out;
 
-	attr.ia_valid = 0;
+	attr.ia_valid = ATTR_OPEN;
 	if (openflags & O_TRUNC) {
 		attr.ia_valid |= ATTR_SIZE;
 		attr.ia_size = 0;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ba837d977a1a..f875cf305237 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1954,10 +1954,19 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 	};
 	int err;
 	do {
-		err = nfs4_handle_exception(server,
-				_nfs4_do_setattr(inode, cred, fattr, sattr, state),
-				&exception);
+		err = _nfs4_do_setattr(inode, cred, fattr, sattr, state);
+		switch (err) {
+		case -NFS4ERR_OPENMODE:
+			if (state && !(state->state & FMODE_WRITE)) {
+				err = -EBADF;
+				if (sattr->ia_valid & ATTR_OPEN)
+					err = -EACCES;
+				goto out;
+			}
+		}
+		err = nfs4_handle_exception(server, err, &exception);
 	} while (exception.retry);
+out:
 	return err;
 }
 
-- 
cgit v1.2.3


From 3af9d8f227a31e25b3110ef175d105798fc147a6 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Fri, 13 Apr 2012 17:16:59 -0400
Subject: cifs: fix offset handling in cifs_iovec_write

In the recent update of the cifs_iovec_write code to use async writes,
the handling of the file position was broken. That patch added a local
"offset" variable to handle the offset, and then only updated the
original "*poffset" before exiting.

Unfortunately, it copied off the original offset from the beginning,
instead of doing so after generic_write_checks had been called. Fix
this by moving the initialization of "offset" after that in the
function.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/file.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index fae765dac934..81725e9286e9 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2178,7 +2178,7 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
 	unsigned long nr_pages, i;
 	size_t copied, len, cur_len;
 	ssize_t total_written = 0;
-	loff_t offset = *poffset;
+	loff_t offset;
 	struct iov_iter it;
 	struct cifsFileInfo *open_file;
 	struct cifs_tcon *tcon;
@@ -2200,6 +2200,7 @@ cifs_iovec_write(struct file *file, const struct iovec *iov,
 	cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
 	open_file = file->private_data;
 	tcon = tlink_tcon(open_file->tlink);
+	offset = *poffset;
 
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
 		pid = open_file->pid;
-- 
cgit v1.2.3


From 73fb7bc7c57d971b11f2e00536ac2d3e316e0609 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Fri, 20 Apr 2012 14:47:34 -0400
Subject: NFS: put open context on error in nfs_pagein_multi

Cc: <stable@vger.kernel.org>
Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/read.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 9a0e8ef4a409..0a4be28c2ea3 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -322,7 +322,7 @@ out_bad:
 	while (!list_empty(res)) {
 		data = list_entry(res->next, struct nfs_read_data, list);
 		list_del(&data->list);
-		nfs_readdata_free(data);
+		nfs_readdata_release(data);
 	}
 	nfs_readpage_release(req);
 	return -ENOMEM;
-- 
cgit v1.2.3


From 8ccd271f7a3a846ce6f85ead0760d9d12994a611 Mon Sep 17 00:00:00 2001
From: Fred Isaman <iisaman@netapp.com>
Date: Fri, 20 Apr 2012 14:47:35 -0400
Subject: NFS: put open context on error in nfs_flush_multi

Cc: <stable@vger.kernel.org>
Signed-off-by: Fred Isaman <iisaman@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/write.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 9b8d4d42a8af..c07462320f6b 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1019,7 +1019,7 @@ out_bad:
 	while (!list_empty(res)) {
 		data = list_entry(res->next, struct nfs_write_data, list);
 		list_del(&data->list);
-		nfs_writedata_free(data);
+		nfs_writedata_release(data);
 	}
 	nfs_redirty_request(req);
 	return -ENOMEM;
-- 
cgit v1.2.3


From 98a2139f4f4d7b5fcc3a54c7fddbe88612abed20 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Sat, 3 Sep 2011 01:09:43 +0200
Subject: nfs: Enclose hostname in brackets when needed in nfs_do_root_mount

When hostname contains colon (e.g. when it is an IPv6 address) it needs
to be enclosed in brackets to make parsing of NFS device string possible.
Fix nfs_do_root_mount() to enclose hostname properly when needed. NFS code
actually does not need this as it does not parse the string passed by
nfs_do_root_mount() but the device string is exposed to userspace in
/proc/mounts.

CC: Josh Boyer <jwboyer@redhat.com>
CC: Trond Myklebust <Trond.Myklebust@netapp.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Cc: stable@vger.kernel.org
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 37412f706b32..1e6715f0616c 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2767,11 +2767,15 @@ static struct vfsmount *nfs_do_root_mount(struct file_system_type *fs_type,
 	char *root_devname;
 	size_t len;
 
-	len = strlen(hostname) + 3;
+	len = strlen(hostname) + 5;
 	root_devname = kmalloc(len, GFP_KERNEL);
 	if (root_devname == NULL)
 		return ERR_PTR(-ENOMEM);
-	snprintf(root_devname, len, "%s:/", hostname);
+	/* Does hostname needs to be enclosed in brackets? */
+	if (strchr(hostname, ':'))
+		snprintf(root_devname, len, "[%s]:/", hostname);
+	else
+		snprintf(root_devname, len, "%s:/", hostname);
 	root_mnt = vfs_kern_mount(fs_type, flags, root_devname, data);
 	kfree(root_devname);
 	return root_mnt;
-- 
cgit v1.2.3


From e4eb1ff61b323d6141614e5458a1f53c7046ff8e Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 20 Apr 2012 15:35:40 -0700
Subject: VM: add "vm_brk()" helper function

It does the same thing as "do_brk()", except it handles the VM locking
too.

It turns out that all external callers want that anyway, so we can make
do_brk() static to just mm/mmap.c while at it.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_aout.c | 20 +++++---------------
 fs/binfmt_elf.c  | 15 ++++-----------
 2 files changed, 9 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 2eb12f13593d..88527492b917 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -50,9 +50,7 @@ static int set_brk(unsigned long start, unsigned long end)
 	end = PAGE_ALIGN(end);
 	if (end > start) {
 		unsigned long addr;
-		down_write(&current->mm->mmap_sem);
-		addr = do_brk(start, end - start);
-		up_write(&current->mm->mmap_sem);
+		addr = vm_brk(start, end - start);
 		if (BAD_ADDR(addr))
 			return addr;
 	}
@@ -280,9 +278,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 		pos = 32;
 		map_size = ex.a_text+ex.a_data;
 #endif
-		down_write(&current->mm->mmap_sem);
-		error = do_brk(text_addr & PAGE_MASK, map_size);
-		up_write(&current->mm->mmap_sem);
+		error = vm_brk(text_addr & PAGE_MASK, map_size);
 		if (error != (text_addr & PAGE_MASK)) {
 			send_sig(SIGKILL, current, 0);
 			return error;
@@ -313,9 +309,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 
 		if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) {
 			loff_t pos = fd_offset;
-			down_write(&current->mm->mmap_sem);
-			do_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
-			up_write(&current->mm->mmap_sem);
+			vm_brk(N_TXTADDR(ex), ex.a_text+ex.a_data);
 			bprm->file->f_op->read(bprm->file,
 					(char __user *)N_TXTADDR(ex),
 					ex.a_text+ex.a_data, &pos);
@@ -412,9 +406,7 @@ static int load_aout_library(struct file *file)
 			       "N_TXTOFF is not page aligned. Please convert library: %s\n",
 			       file->f_path.dentry->d_name.name);
 		}
-		down_write(&current->mm->mmap_sem);
-		do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
-		up_write(&current->mm->mmap_sem);
+		vm_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss);
 		
 		file->f_op->read(file, (char __user *)start_addr,
 			ex.a_text + ex.a_data, &pos);
@@ -438,9 +430,7 @@ static int load_aout_library(struct file *file)
 	len = PAGE_ALIGN(ex.a_text + ex.a_data);
 	bss = ex.a_text + ex.a_data + ex.a_bss;
 	if (bss > len) {
-		down_write(&current->mm->mmap_sem);
-		error = do_brk(start_addr + len, bss - len);
-		up_write(&current->mm->mmap_sem);
+		error = vm_brk(start_addr + len, bss - len);
 		retval = error;
 		if (error != start_addr + len)
 			goto out;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 48ffb3dc610a..0708a0bf0ba9 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -82,9 +82,7 @@ static int set_brk(unsigned long start, unsigned long end)
 	end = ELF_PAGEALIGN(end);
 	if (end > start) {
 		unsigned long addr;
-		down_write(&current->mm->mmap_sem);
-		addr = do_brk(start, end - start);
-		up_write(&current->mm->mmap_sem);
+		addr = vm_brk(start, end - start);
 		if (BAD_ADDR(addr))
 			return addr;
 	}
@@ -514,9 +512,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 		elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
 
 		/* Map the last of the bss segment */
-		down_write(&current->mm->mmap_sem);
-		error = do_brk(elf_bss, last_bss - elf_bss);
-		up_write(&current->mm->mmap_sem);
+		error = vm_brk(elf_bss, last_bss - elf_bss);
 		if (BAD_ADDR(error))
 			goto out_close;
 	}
@@ -1072,11 +1068,8 @@ static int load_elf_library(struct file *file)
 	len = ELF_PAGESTART(eppnt->p_filesz + eppnt->p_vaddr +
 			    ELF_MIN_ALIGN - 1);
 	bss = eppnt->p_memsz + eppnt->p_vaddr;
-	if (bss > len) {
-		down_write(&current->mm->mmap_sem);
-		do_brk(len, bss - len);
-		up_write(&current->mm->mmap_sem);
-	}
+	if (bss > len)
+		vm_brk(len, bss - len);
 	error = 0;
 
 out_free_ph:
-- 
cgit v1.2.3


From a46ef99d80817a167477ed1c8b4d90ee0c2e726f Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 20 Apr 2012 16:20:01 -0700
Subject: VM: add "vm_munmap()" helper function

Like the vm_brk() function, this is the same as "do_munmap()", except it
does the VM locking for the caller.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/aio.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index da887604dfc5..99bd790e8cd2 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -92,11 +92,8 @@ static void aio_free_ring(struct kioctx *ctx)
 	for (i=0; i<info->nr_pages; i++)
 		put_page(info->ring_pages[i]);
 
-	if (info->mmap_size) {
-		down_write(&ctx->mm->mmap_sem);
-		do_munmap(ctx->mm, info->mmap_base, info->mmap_size);
-		up_write(&ctx->mm->mmap_sem);
-	}
+	if (info->mmap_size)
+		vm_munmap(ctx->mm, info->mmap_base, info->mmap_size);
 
 	if (info->ring_pages && info->ring_pages != info->internal_pages)
 		kfree(info->ring_pages);
-- 
cgit v1.2.3


From 6be5ceb02e98eaf6cfc4f8b12a896d04023f340d Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Fri, 20 Apr 2012 17:13:58 -0700
Subject: VM: add "vm_mmap()" helper function

This continues the theme started with vm_brk() and vm_munmap():
vm_mmap() does the same thing as do_mmap(), but additionally does the
required VM locking.

This uninlines (and rewrites it to be clearer) do_mmap(), which sadly
duplicates it in mm/mmap.c and mm/nommu.c.  But that way we don't have
to export our internal do_mmap_pgoff() function.

Some day we hopefully don't have to export do_mmap() either, if all
modular users can become the simpler vm_mmap() instead.  We're actually
very close to that already, with the notable exception of the (broken)
use in i810, and a couple of stragglers in binfmt_elf.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/binfmt_aout.c      | 12 +++---------
 fs/binfmt_elf.c       |  8 ++------
 fs/binfmt_elf_fdpic.c | 18 ++++--------------
 fs/binfmt_flat.c      | 12 +++---------
 fs/binfmt_som.c       | 12 +++---------
 5 files changed, 15 insertions(+), 47 deletions(-)

(limited to 'fs')

diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 88527492b917..d146e181d10d 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -319,24 +319,20 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 			goto beyond_if;
 		}
 
-		down_write(&current->mm->mmap_sem);
-		error = do_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
+		error = vm_mmap(bprm->file, N_TXTADDR(ex), ex.a_text,
 			PROT_READ | PROT_EXEC,
 			MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE,
 			fd_offset);
-		up_write(&current->mm->mmap_sem);
 
 		if (error != N_TXTADDR(ex)) {
 			send_sig(SIGKILL, current, 0);
 			return error;
 		}
 
-		down_write(&current->mm->mmap_sem);
- 		error = do_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
+		error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data,
 				PROT_READ | PROT_WRITE | PROT_EXEC,
 				MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE,
 				fd_offset + ex.a_text);
-		up_write(&current->mm->mmap_sem);
 		if (error != N_DATADDR(ex)) {
 			send_sig(SIGKILL, current, 0);
 			return error;
@@ -417,12 +413,10 @@ static int load_aout_library(struct file *file)
 		goto out;
 	}
 	/* Now use mmap to map the library into memory. */
-	down_write(&current->mm->mmap_sem);
-	error = do_mmap(file, start_addr, ex.a_text + ex.a_data,
+	error = vm_mmap(file, start_addr, ex.a_text + ex.a_data,
 			PROT_READ | PROT_WRITE | PROT_EXEC,
 			MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE,
 			N_TXTOFF(ex));
-	up_write(&current->mm->mmap_sem);
 	retval = error;
 	if (error != start_addr)
 		goto out;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 0708a0bf0ba9..16f735417072 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -958,10 +958,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 		   and some applications "depend" upon this behavior.
 		   Since we do not have the power to recompile these, we
 		   emulate the SVr4 behavior. Sigh. */
-		down_write(&current->mm->mmap_sem);
-		error = do_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC,
+		error = vm_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC,
 				MAP_FIXED | MAP_PRIVATE, 0);
-		up_write(&current->mm->mmap_sem);
 	}
 
 #ifdef ELF_PLAT_INIT
@@ -1046,8 +1044,7 @@ static int load_elf_library(struct file *file)
 		eppnt++;
 
 	/* Now use mmap to map the library into memory. */
-	down_write(&current->mm->mmap_sem);
-	error = do_mmap(file,
+	error = vm_mmap(file,
 			ELF_PAGESTART(eppnt->p_vaddr),
 			(eppnt->p_filesz +
 			 ELF_PAGEOFFSET(eppnt->p_vaddr)),
@@ -1055,7 +1052,6 @@ static int load_elf_library(struct file *file)
 			MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE,
 			(eppnt->p_offset -
 			 ELF_PAGEOFFSET(eppnt->p_vaddr)));
-	up_write(&current->mm->mmap_sem);
 	if (error != ELF_PAGESTART(eppnt->p_vaddr))
 		goto out_free_ph;
 
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 9bd5612a8224..d390a0fffc65 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -390,21 +390,17 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm,
 	    (executable_stack == EXSTACK_DEFAULT && VM_STACK_FLAGS & VM_EXEC))
 		stack_prot |= PROT_EXEC;
 
-	down_write(&current->mm->mmap_sem);
-	current->mm->start_brk = do_mmap(NULL, 0, stack_size, stack_prot,
+	current->mm->start_brk = vm_mmap(NULL, 0, stack_size, stack_prot,
 					 MAP_PRIVATE | MAP_ANONYMOUS |
 					 MAP_UNINITIALIZED | MAP_GROWSDOWN,
 					 0);
 
 	if (IS_ERR_VALUE(current->mm->start_brk)) {
-		up_write(&current->mm->mmap_sem);
 		retval = current->mm->start_brk;
 		current->mm->start_brk = 0;
 		goto error_kill;
 	}
 
-	up_write(&current->mm->mmap_sem);
-
 	current->mm->brk = current->mm->start_brk;
 	current->mm->context.end_brk = current->mm->start_brk;
 	current->mm->context.end_brk +=
@@ -955,10 +951,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
 	if (params->flags & ELF_FDPIC_FLAG_EXECUTABLE)
 		mflags |= MAP_EXECUTABLE;
 
-	down_write(&mm->mmap_sem);
-	maddr = do_mmap(NULL, load_addr, top - base,
+	maddr = vm_mmap(NULL, load_addr, top - base,
 			PROT_READ | PROT_WRITE | PROT_EXEC, mflags, 0);
-	up_write(&mm->mmap_sem);
 	if (IS_ERR_VALUE(maddr))
 		return (int) maddr;
 
@@ -1096,10 +1090,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 
 		/* create the mapping */
 		disp = phdr->p_vaddr & ~PAGE_MASK;
-		down_write(&mm->mmap_sem);
-		maddr = do_mmap(file, maddr, phdr->p_memsz + disp, prot, flags,
+		maddr = vm_mmap(file, maddr, phdr->p_memsz + disp, prot, flags,
 				phdr->p_offset - disp);
-		up_write(&mm->mmap_sem);
 
 		kdebug("mmap[%d] <file> sz=%lx pr=%x fl=%x of=%lx --> %08lx",
 		       loop, phdr->p_memsz + disp, prot, flags,
@@ -1143,10 +1135,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 			unsigned long xmaddr;
 
 			flags |= MAP_FIXED | MAP_ANONYMOUS;
-			down_write(&mm->mmap_sem);
-			xmaddr = do_mmap(NULL, xaddr, excess - excess1,
+			xmaddr = vm_mmap(NULL, xaddr, excess - excess1,
 					 prot, flags, 0);
-			up_write(&mm->mmap_sem);
 
 			kdebug("mmap[%d] <anon>"
 			       " ad=%lx sz=%lx pr=%x fl=%x of=0 --> %08lx",
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 024d20ee3ca3..6b2daf99fab8 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -542,10 +542,8 @@ static int load_flat_file(struct linux_binprm * bprm,
 		 */
 		DBG_FLT("BINFMT_FLAT: ROM mapping of file (we hope)\n");
 
-		down_write(&current->mm->mmap_sem);
-		textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC,
+		textpos = vm_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC,
 				  MAP_PRIVATE|MAP_EXECUTABLE, 0);
-		up_write(&current->mm->mmap_sem);
 		if (!textpos || IS_ERR_VALUE(textpos)) {
 			if (!textpos)
 				textpos = (unsigned long) -ENOMEM;
@@ -556,10 +554,8 @@ static int load_flat_file(struct linux_binprm * bprm,
 
 		len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
 		len = PAGE_ALIGN(len);
-		down_write(&current->mm->mmap_sem);
-		realdatastart = do_mmap(0, 0, len,
+		realdatastart = vm_mmap(0, 0, len,
 			PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
-		up_write(&current->mm->mmap_sem);
 
 		if (realdatastart == 0 || IS_ERR_VALUE(realdatastart)) {
 			if (!realdatastart)
@@ -603,10 +599,8 @@ static int load_flat_file(struct linux_binprm * bprm,
 
 		len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
 		len = PAGE_ALIGN(len);
-		down_write(&current->mm->mmap_sem);
-		textpos = do_mmap(0, 0, len,
+		textpos = vm_mmap(0, 0, len,
 			PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
-		up_write(&current->mm->mmap_sem);
 
 		if (!textpos || IS_ERR_VALUE(textpos)) {
 			if (!textpos)
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index e4fc746629a7..4517aaff61b4 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -147,10 +147,8 @@ static int map_som_binary(struct file *file,
 	code_size = SOM_PAGEALIGN(hpuxhdr->exec_tsize);
 	current->mm->start_code = code_start;
 	current->mm->end_code = code_start + code_size;
-	down_write(&current->mm->mmap_sem);
-	retval = do_mmap(file, code_start, code_size, prot,
+	retval = vm_mmap(file, code_start, code_size, prot,
 			flags, SOM_PAGESTART(hpuxhdr->exec_tfile));
-	up_write(&current->mm->mmap_sem);
 	if (retval < 0 && retval > -1024)
 		goto out;
 
@@ -158,20 +156,16 @@ static int map_som_binary(struct file *file,
 	data_size = SOM_PAGEALIGN(hpuxhdr->exec_dsize);
 	current->mm->start_data = data_start;
 	current->mm->end_data = bss_start = data_start + data_size;
-	down_write(&current->mm->mmap_sem);
-	retval = do_mmap(file, data_start, data_size,
+	retval = vm_mmap(file, data_start, data_size,
 			prot | PROT_WRITE, flags,
 			SOM_PAGESTART(hpuxhdr->exec_dfile));
-	up_write(&current->mm->mmap_sem);
 	if (retval < 0 && retval > -1024)
 		goto out;
 
 	som_brk = bss_start + SOM_PAGEALIGN(hpuxhdr->exec_bsize);
 	current->mm->start_brk = current->mm->brk = som_brk;
-	down_write(&current->mm->mmap_sem);
-	retval = do_mmap(NULL, bss_start, som_brk - bss_start,
+	retval = vm_mmap(NULL, bss_start, som_brk - bss_start,
 			prot | PROT_WRITE, MAP_FIXED | MAP_PRIVATE, 0);
-	up_write(&current->mm->mmap_sem);
 	if (retval > 0 || retval < -1024)
 		retval = 0;
 out:
-- 
cgit v1.2.3


From 95b72eb0bdef6476b7e73061f0382adf46c5495a Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Fri, 20 Apr 2012 19:24:51 -0400
Subject: NFSv4: Ensure we do not reuse open owner names

The NFSv4 spec is ambiguous about whether or not it is permissible
to reuse open owner names, so play it safe. This patch adds a timestamp
to the state_owner structure, and combines that with the IDA based
uniquifier.
Fixes a regression whereby the Linux server returns NFS4ERR_BAD_SEQID.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4_fs.h   | 1 +
 fs/nfs/nfs4proc.c  | 6 +++---
 fs/nfs/nfs4state.c | 1 +
 fs/nfs/nfs4xdr.c   | 9 +++++----
 4 files changed, 10 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 97ecc863dd76..b6db9e33fb7b 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -59,6 +59,7 @@ struct nfs_unique_id {
 
 #define NFS_SEQID_CONFIRMED 1
 struct nfs_seqid_counter {
+	ktime_t create_time;
 	int owner_id;
 	int flags;
 	u32 counter;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index f875cf305237..60d5f4c26dda 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -838,7 +838,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
 	p->o_arg.open_flags = flags;
 	p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE);
 	p->o_arg.clientid = server->nfs_client->cl_clientid;
-	p->o_arg.id = sp->so_seqid.owner_id;
+	p->o_arg.id.create_time = ktime_to_ns(sp->so_seqid.create_time);
+	p->o_arg.id.uniquifier = sp->so_seqid.owner_id;
 	p->o_arg.name = &dentry->d_name;
 	p->o_arg.server = server;
 	p->o_arg.bitmask = server->attr_bitmask;
@@ -1466,8 +1467,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
 			goto unlock_no_action;
 		rcu_read_unlock();
 	}
-	/* Update sequence id. */
-	data->o_arg.id = sp->so_seqid.owner_id;
+	/* Update client id. */
 	data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid;
 	if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) {
 		task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR];
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 0f43414eb25a..3b07f094f3a9 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -393,6 +393,7 @@ nfs4_remove_state_owner_locked(struct nfs4_state_owner *sp)
 static void
 nfs4_init_seqid_counter(struct nfs_seqid_counter *sc)
 {
+	sc->create_time = ktime_get();
 	sc->flags = 0;
 	sc->counter = 0;
 	spin_lock_init(&sc->lock);
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index c74fdb114b48..77fc5f959c4e 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -74,7 +74,7 @@ static int nfs4_stat_to_errno(int);
 /* lock,open owner id:
  * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT  >> 2)
  */
-#define open_owner_id_maxsz	(1 + 1 + 4)
+#define open_owner_id_maxsz	(1 + 2 + 1 + 1 + 2)
 #define lock_owner_id_maxsz	(1 + 1 + 4)
 #define decode_lockowner_maxsz	(1 + XDR_QUADLEN(IDMAP_NAMESZ))
 #define compound_encode_hdr_maxsz	(3 + (NFS4_MAXTAGLEN >> 2))
@@ -1340,12 +1340,13 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
  */
 	encode_nfs4_seqid(xdr, arg->seqid);
 	encode_share_access(xdr, arg->fmode);
-	p = reserve_space(xdr, 32);
+	p = reserve_space(xdr, 36);
 	p = xdr_encode_hyper(p, arg->clientid);
-	*p++ = cpu_to_be32(20);
+	*p++ = cpu_to_be32(24);
 	p = xdr_encode_opaque_fixed(p, "open id:", 8);
 	*p++ = cpu_to_be32(arg->server->s_dev);
-	xdr_encode_hyper(p, arg->id);
+	*p++ = cpu_to_be32(arg->id.uniquifier);
+	xdr_encode_hyper(p, arg->id.create_time);
 }
 
 static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg)
-- 
cgit v1.2.3


From 936af1576e4c24b466380fc2b8d93352161d13b0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 20 Apr 2012 21:49:41 -0400
Subject: aio: don't bother with unmapping when aio_free_ring() is coming from
 exit_aio()

... since exit_mmap() is coming and it will munmap() everything anyway.
In all other cases aio_free_ring() has ctx->mm == current->mm; moreover,
all other callers of vm_munmap() have mm == current->mm, so this will
allow us to get rid of mm argument of vm_munmap().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/aio.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index 99bd790e8cd2..976e33d97413 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -92,8 +92,10 @@ static void aio_free_ring(struct kioctx *ctx)
 	for (i=0; i<info->nr_pages; i++)
 		put_page(info->ring_pages[i]);
 
-	if (info->mmap_size)
+	if (info->mmap_size) {
+		BUG_ON(ctx->mm != current->mm);
 		vm_munmap(ctx->mm, info->mmap_base, info->mmap_size);
+	}
 
 	if (info->ring_pages && info->ring_pages != info->internal_pages)
 		kfree(info->ring_pages);
@@ -386,6 +388,17 @@ void exit_aio(struct mm_struct *mm)
 				"exit_aio:ioctx still alive: %d %d %d\n",
 				atomic_read(&ctx->users), ctx->dead,
 				ctx->reqs_active);
+		/*
+		 * We don't need to bother with munmap() here -
+		 * exit_mmap(mm) is coming and it'll unmap everything.
+		 * Since aio_free_ring() uses non-zero ->mmap_size
+		 * as indicator that it needs to unmap the area,
+		 * just set it to 0; aio_free_ring() is the only
+		 * place that uses ->mmap_size, so it's safe.
+		 * That way we get all munmap done to current->mm -
+		 * all other callers have ctx->mm == current->mm.
+		 */
+		ctx->ring_info.mmap_size = 0;
 		put_ioctx(ctx);
 	}
 }
-- 
cgit v1.2.3


From bfce281c287a427d0841fadf5d59242757b4e620 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 20 Apr 2012 21:57:04 -0400
Subject: kill mm argument of vm_munmap()

it's always current->mm

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/aio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index 976e33d97413..67a6db3e1b6f 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -94,7 +94,7 @@ static void aio_free_ring(struct kioctx *ctx)
 
 	if (info->mmap_size) {
 		BUG_ON(ctx->mm != current->mm);
-		vm_munmap(ctx->mm, info->mmap_base, info->mmap_size);
+		vm_munmap(info->mmap_base, info->mmap_size);
 	}
 
 	if (info->ring_pages && info->ring_pages != info->internal_pages)
-- 
cgit v1.2.3


From c77365c963cf8703ecf1004cd3b298068a3e1b76 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 21 Apr 2012 12:31:05 -0400
Subject: NFSv4: Ensure that we don't drop a state owner more than once

Retest the RB_EMPTY_NODE() condition under the spin lock
to ensure that we don't call rb_erase() more than once on the
same state owner.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4state.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 3b07f094f3a9..b300fb840b21 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -435,13 +435,17 @@ nfs4_alloc_state_owner(struct nfs_server *server,
 static void
 nfs4_drop_state_owner(struct nfs4_state_owner *sp)
 {
-	if (!RB_EMPTY_NODE(&sp->so_server_node)) {
+	struct rb_node *rb_node = &sp->so_server_node;
+
+	if (!RB_EMPTY_NODE(rb_node)) {
 		struct nfs_server *server = sp->so_server;
 		struct nfs_client *clp = server->nfs_client;
 
 		spin_lock(&clp->cl_lock);
-		rb_erase(&sp->so_server_node, &server->state_owners);
-		RB_CLEAR_NODE(&sp->so_server_node);
+		if (!RB_EMPTY_NODE(rb_node)) {
+			rb_erase(rb_node, &server->state_owners);
+			RB_CLEAR_NODE(rb_node);
+		}
 		spin_unlock(&clp->cl_lock);
 	}
 }
-- 
cgit v1.2.3


From 7bf97bc27308cfdc7a8dadd40ae50f7c4cb09b01 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Sat, 21 Apr 2012 12:36:19 -0400
Subject: NFSv4: Keep dropped state owners on the LRU list for a while

To ensure that we don't reuse their identifiers.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4state.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index b300fb840b21..7f0fcfc1fe9d 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -521,6 +521,14 @@ out:
 /**
  * nfs4_put_state_owner - Release a nfs4_state_owner
  * @sp: state owner data to release
+ *
+ * Note that we keep released state owners on an LRU
+ * list.
+ * This caches valid state owners so that they can be
+ * reused, to avoid the OPEN_CONFIRM on minor version 0.
+ * It also pins the uniquifier of dropped state owners for
+ * a while, to ensure that those state owner names are
+ * never reused.
  */
 void nfs4_put_state_owner(struct nfs4_state_owner *sp)
 {
@@ -530,15 +538,9 @@ void nfs4_put_state_owner(struct nfs4_state_owner *sp)
 	if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock))
 		return;
 
-	if (!RB_EMPTY_NODE(&sp->so_server_node)) {
-		sp->so_expires = jiffies;
-		list_add_tail(&sp->so_lru, &server->state_owners_lru);
-		spin_unlock(&clp->cl_lock);
-	} else {
-		nfs4_remove_state_owner_locked(sp);
-		spin_unlock(&clp->cl_lock);
-		nfs4_free_state_owner(sp);
-	}
+	sp->so_expires = jiffies;
+	list_add_tail(&sp->so_lru, &server->state_owners_lru);
+	spin_unlock(&clp->cl_lock);
 }
 
 /**
-- 
cgit v1.2.3


From 53ad1c980d4fb450722a575ca17c188808939340 Mon Sep 17 00:00:00 2001
From: David Teigland <teigland@redhat.com>
Date: Wed, 4 Apr 2012 09:49:15 -0500
Subject: dlm: fix QUECVT when convert queue is empty

The QUECVT flag should not prevent conversions from
being granted immediately when the convert queue is
empty.

Signed-off-by: David Teigland <teigland@redhat.com>
---
 fs/dlm/lock.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'fs')

diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index fa5c07d51dcc..4c58d4a3adc4 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1736,6 +1736,18 @@ static int _can_be_granted(struct dlm_rsb *r, struct dlm_lkb *lkb, int now)
 	if (now && conv && !(lkb->lkb_exflags & DLM_LKF_QUECVT))
 		return 1;
 
+	/*
+	 * Even if the convert is compat with all granted locks,
+	 * QUECVT forces it behind other locks on the convert queue.
+	 */
+
+	if (now && conv && (lkb->lkb_exflags & DLM_LKF_QUECVT)) {
+		if (list_empty(&r->res_convertqueue))
+			return 1;
+		else
+			goto out;
+	}
+
 	/*
 	 * The NOORDER flag is set to avoid the standard vms rules on grant
 	 * order.
-- 
cgit v1.2.3


From 99aa78466777083255b876293e9e83dec7cd809a Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@kernel.org>
Date: Fri, 13 Apr 2012 10:27:35 +0800
Subject: jbd2: use GFP_NOFS for blkdev_issue_flush

flush request is issued in transaction commit code path, so looks using
GFP_KERNEL to allocate memory for flush request bio falls into the classic
deadlock issue.  I saw btrfs and dm get it right, but ext4, xfs and md are
using GFP.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: stable@vger.kernel.org
---
 fs/jbd2/commit.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 806525a7269c..840f70f50792 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -723,7 +723,7 @@ start_journal_io:
 	if (commit_transaction->t_need_data_flush &&
 	    (journal->j_fs_dev != journal->j_dev) &&
 	    (journal->j_flags & JBD2_BARRIER))
-		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
+		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
 
 	/* Done it all: now write the commit record asynchronously. */
 	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -859,7 +859,7 @@ wait_for_iobuf:
 	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
 				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
 	    journal->j_flags & JBD2_BARRIER) {
-		blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
+		blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
 	}
 
 	if (err)
-- 
cgit v1.2.3


From db7e5c668e1b16061fe2d94d3cba022dd360c5d4 Mon Sep 17 00:00:00 2001
From: Eldad Zack <eldad@fogrefinery.com>
Date: Sun, 22 Apr 2012 17:50:52 +0200
Subject: super.c: unused variable warning without CONFIG_QUOTA
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

sb info is only checked with quota support.

fs/ext4/super.c: In function ‘parse_options’:
fs/ext4/super.c:1600:23: warning: unused variable ‘sbi’ [-Wunused-variable]

Signed-off-by: Eldad Zack <eldad@fogrefinery.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/super.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 6da193564e43..e1fb1d5de58e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1597,7 +1597,9 @@ static int parse_options(char *options, struct super_block *sb,
 			 unsigned int *journal_ioprio,
 			 int is_remount)
 {
+#ifdef CONFIG_QUOTA
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
+#endif
 	char *p;
 	substring_t args[MAX_OPT_ARGS];
 	int token;
-- 
cgit v1.2.3


From 4c569a72c30dfee9b5133284aba67e3aa0c9505d Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Tue, 10 Apr 2012 14:45:24 -0400
Subject: GFS2: Instruct DLM to avoid queue convert slowdown

This patch instructs DLM to prevent an "in place" conversion, where the
lock just stays on the granted queue, and instead forces the conversion to
the back of the convert queue. This is done on upward conversions only.

This is useful in cases where, for example, a lock is frequently needed in
PR on one node, but another node needs it temporarily in EX to update it.
This may happen, for example, when the rindex is being updated by gfs2_grow.
The gfs2_grow needs to have the lock in EX, but the other nodes need to
re-read it to retrieve the updates. The glock is already granted in PR on
the non-growing nodes, so this prevents them from continually re-granting
the lock in PR, and forces the EX from gfs2_grow to go through.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/lock_dlm.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index f8411bd1b805..5f5e70e047dc 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -200,10 +200,11 @@ static int make_mode(const unsigned int lmstate)
 	return -1;
 }
 
-static u32 make_flags(const u32 lkid, const unsigned int gfs_flags,
+static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags,
 		      const int req)
 {
 	u32 lkf = DLM_LKF_VALBLK;
+	u32 lkid = gl->gl_lksb.sb_lkid;
 
 	if (gfs_flags & LM_FLAG_TRY)
 		lkf |= DLM_LKF_NOQUEUE;
@@ -227,8 +228,11 @@ static u32 make_flags(const u32 lkid, const unsigned int gfs_flags,
 			BUG();
 	}
 
-	if (lkid != 0) 
+	if (lkid != 0) {
 		lkf |= DLM_LKF_CONVERT;
+		if (test_bit(GLF_BLOCKING, &gl->gl_flags))
+			lkf |= DLM_LKF_QUECVT;
+	}
 
 	return lkf;
 }
@@ -250,7 +254,7 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
 	char strname[GDLM_STRNAME_BYTES] = "";
 
 	req = make_mode(req_state);
-	lkf = make_flags(gl->gl_lksb.sb_lkid, flags, req);
+	lkf = make_flags(gl, flags, req);
 	gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT);
 	gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT);
 	if (gl->gl_lksb.sb_lkid) {
-- 
cgit v1.2.3


From 3c7c87fd5bd71f57c68a64d11a15170d0dc4f7aa Mon Sep 17 00:00:00 2001
From: Sachin Prabhu <sprabhu@redhat.com>
Date: Tue, 24 Apr 2012 15:28:14 +0100
Subject: CIFS: Show backupuid/gid in /proc/mounts

Show  backupuid/backupgid in /proc/mounts for cifs shares mounted with
the backupuid/backupgid feature.

Also consolidate the two separate checks for
pvolume_info->backupuid_specified into a single if condition in
cifs_setup_cifs_sb().

Signed-off-by: Sachin Prabhu <sprabhu@redhat.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c  |  4 ++++
 fs/cifs/connect.c | 12 ++++++------
 2 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index d34212822444..ea8eb92b65b4 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -434,6 +434,10 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
 		seq_printf(s, ",noperm");
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)
 		seq_printf(s, ",strictcache");
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPUID)
+		seq_printf(s, ",backupuid=%u", cifs_sb->mnt_backupuid);
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID)
+		seq_printf(s, ",backupgid=%u", cifs_sb->mnt_backupgid);
 
 	seq_printf(s, ",rsize=%d", cifs_sb->rsize);
 	seq_printf(s, ",wsize=%d", cifs_sb->wsize);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index f31dc9ac37b7..f4d381e331ce 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3228,10 +3228,6 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
 
 	cifs_sb->mnt_uid = pvolume_info->linux_uid;
 	cifs_sb->mnt_gid = pvolume_info->linux_gid;
-	if (pvolume_info->backupuid_specified)
-		cifs_sb->mnt_backupuid = pvolume_info->backupuid;
-	if (pvolume_info->backupgid_specified)
-		cifs_sb->mnt_backupgid = pvolume_info->backupgid;
 	cifs_sb->mnt_file_mode = pvolume_info->file_mode;
 	cifs_sb->mnt_dir_mode = pvolume_info->dir_mode;
 	cFYI(1, "file mode: 0x%hx  dir mode: 0x%hx",
@@ -3262,10 +3258,14 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RWPIDFORWARD;
 	if (pvolume_info->cifs_acl)
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL;
-	if (pvolume_info->backupuid_specified)
+	if (pvolume_info->backupuid_specified) {
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPUID;
-	if (pvolume_info->backupgid_specified)
+		cifs_sb->mnt_backupuid = pvolume_info->backupuid;
+	}
+	if (pvolume_info->backupgid_specified) {
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPGID;
+		cifs_sb->mnt_backupgid = pvolume_info->backupgid;
+	}
 	if (pvolume_info->override_uid)
 		cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID;
 	if (pvolume_info->override_gid)
-- 
cgit v1.2.3


From 28f8881023c9713c303c0feda270929f9384c019 Mon Sep 17 00:00:00 2001
From: Sachin Prabhu <sprabhu@redhat.com>
Date: Tue, 24 Apr 2012 15:28:30 +0100
Subject: Use correct conversion specifiers in cifs_show_options

cifs_show_options uses the wrong conversion specifier for uid, gid,
rsize & wsize. Correct this to %u to match it to the variable type
'unsigned integer'.

Signed-off-by: Sachin Prabhu <sprabhu@redhat.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index ea8eb92b65b4..811245b1ff2e 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -370,13 +370,13 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
 				   (int)(srcaddr->sa_family));
 	}
 
-	seq_printf(s, ",uid=%d", cifs_sb->mnt_uid);
+	seq_printf(s, ",uid=%u", cifs_sb->mnt_uid);
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)
 		seq_printf(s, ",forceuid");
 	else
 		seq_printf(s, ",noforceuid");
 
-	seq_printf(s, ",gid=%d", cifs_sb->mnt_gid);
+	seq_printf(s, ",gid=%u", cifs_sb->mnt_gid);
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID)
 		seq_printf(s, ",forcegid");
 	else
@@ -439,8 +439,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_BACKUPGID)
 		seq_printf(s, ",backupgid=%u", cifs_sb->mnt_backupgid);
 
-	seq_printf(s, ",rsize=%d", cifs_sb->rsize);
-	seq_printf(s, ",wsize=%d", cifs_sb->wsize);
+	seq_printf(s, ",rsize=%u", cifs_sb->rsize);
+	seq_printf(s, ",wsize=%u", cifs_sb->wsize);
 	/* convert actimeo and display it in seconds */
 		seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ);
 
-- 
cgit v1.2.3


From 13d518074a952d33d47c428419693f63389547e9 Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Wed, 25 Apr 2012 16:01:47 -0700
Subject: epoll: clear the tfile_check_list on -ELOOP

An epoll_ctl(,EPOLL_CTL_ADD,,) operation can return '-ELOOP' to prevent
circular epoll dependencies from being created.  However, in that case we
do not properly clear the 'tfile_check_list'.  Thus, add a call to
clear_tfile_check_list() for the -ELOOP case.

Signed-off-by: Jason Baron <jbaron@redhat.com>
Reported-by: Yurij M. Plotnikov <Yurij.Plotnikov@oktetlabs.ru>
Cc: Nelson Elhage <nelhage@nelhage.com>
Cc: Davide Libenzi <davidel@xmailserver.org>
Tested-by: Alexandra N. Kossovsky <Alexandra.Kossovsky@oktetlabs.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 739b0985b398..c0b3c70ee87a 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1663,8 +1663,10 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	if (op == EPOLL_CTL_ADD) {
 		if (is_file_epoll(tfile)) {
 			error = -ELOOP;
-			if (ep_loop_check(ep, tfile) != 0)
+			if (ep_loop_check(ep, tfile) != 0) {
+				clear_tfile_check_list();
 				goto error_tgt_fput;
+			}
 		} else
 			list_add(&tfile->f_tfile_llink, &tfile_check_list);
 	}
-- 
cgit v1.2.3


From 61065a30af8df4b8989c2ac7a1f4b4034e4df2d5 Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@parallels.com>
Date: Wed, 25 Apr 2012 16:01:48 -0700
Subject: fs/buffer.c: remove BUG() in possible but rare condition

While stressing the kernel with with failing allocations today, I hit the
following chain of events:

alloc_page_buffers():

	bh = alloc_buffer_head(GFP_NOFS);
	if (!bh)
		goto no_grow; <= path taken

grow_dev_page():
        bh = alloc_page_buffers(page, size, 0);
        if (!bh)
                goto failed;  <= taken, consequence of the above

and then the failed path BUG()s the kernel.

The failure is inserted a litte bit artificially, but even then, I see no
reason why it should be deemed impossible in a real box.

Even though this is not a condition that we expect to see around every
time, failed allocations are expected to be handled, and BUG() sounds just
too much.  As a matter of fact, grow_dev_page() can return NULL just fine
in other circumstances, so I propose we just remove it, then.

Signed-off-by: Glauber Costa <glommer@parallels.com>
Cc: Michal Hocko <mhocko@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 36d66653b931..351e18ea2e53 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -985,7 +985,6 @@ grow_dev_page(struct block_device *bdev, sector_t block,
 	return page;
 
 failed:
-	BUG();
 	unlock_page(page);
 	page_cache_release(page);
 	return NULL;
-- 
cgit v1.2.3


From 65ed76010dfed3cb75c863c9052c367a1bacf80a Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 25 Apr 2012 16:01:50 -0700
Subject: hugetlbfs: lockdep annotate root inode properly

This fixes the below reported false lockdep warning.  e096d0c7e2e4
("lockdep: Add helper function for dir vs file i_mutex annotation") added
a similar annotation for every other inode in hugetlbfs but missed the
root inode because it was allocated by a separate function.

For HugeTLB fs we allow taking i_mutex in mmap.  HugeTLB fs doesn't
support file write and its file read callback is modified in a05b0855fd
("hugetlbfs: avoid taking i_mutex from hugetlbfs_read()") to not take
i_mutex.  Hence for HugeTLB fs with regular files we really don't take
i_mutex with mmap_sem held.

 ======================================================
 [ INFO: possible circular locking dependency detected ]
 3.4.0-rc1+ #322 Not tainted
 -------------------------------------------------------
 bash/1572 is trying to acquire lock:
  (&mm->mmap_sem){++++++}, at: [<ffffffff810f1618>] might_fault+0x40/0x90

 but task is already holding lock:
  (&sb->s_type->i_mutex_key#12){+.+.+.}, at: [<ffffffff81125f88>] vfs_readdir+0x56/0xa8

 which lock already depends on the new lock.

 the existing dependency chain (in reverse order) is:

 -> #1 (&sb->s_type->i_mutex_key#12){+.+.+.}:
        [<ffffffff810a09e5>] lock_acquire+0xd5/0xfa
        [<ffffffff816a2f5e>] __mutex_lock_common+0x48/0x350
        [<ffffffff816a3325>] mutex_lock_nested+0x2a/0x31
        [<ffffffff811fb8e1>] hugetlbfs_file_mmap+0x7d/0x104
        [<ffffffff810f859a>] mmap_region+0x272/0x47d
        [<ffffffff810f8a39>] do_mmap_pgoff+0x294/0x2ee
        [<ffffffff810f8b65>] sys_mmap_pgoff+0xd2/0x10e
        [<ffffffff8103d19e>] sys_mmap+0x1d/0x1f
        [<ffffffff816a5922>] system_call_fastpath+0x16/0x1b

 -> #0 (&mm->mmap_sem){++++++}:
        [<ffffffff810a0256>] __lock_acquire+0xa81/0xd75
        [<ffffffff810a09e5>] lock_acquire+0xd5/0xfa
        [<ffffffff810f1645>] might_fault+0x6d/0x90
        [<ffffffff81125d62>] filldir+0x6a/0xc2
        [<ffffffff81133a83>] dcache_readdir+0x5c/0x222
        [<ffffffff81125fa8>] vfs_readdir+0x76/0xa8
        [<ffffffff811260b6>] sys_getdents+0x79/0xc9
        [<ffffffff816a5922>] system_call_fastpath+0x16/0x1b

 other info that might help us debug this:

  Possible unsafe locking scenario:

        CPU0                    CPU1
        ----                    ----
   lock(&sb->s_type->i_mutex_key#12);
                                lock(&mm->mmap_sem);
                                lock(&sb->s_type->i_mutex_key#12);
   lock(&mm->mmap_sem);

  *** DEADLOCK ***

 1 lock held by bash/1572:
  #0:  (&sb->s_type->i_mutex_key#12){+.+.+.}, at: [<ffffffff81125f88>] vfs_readdir+0x56/0xa8

 stack backtrace:
 Pid: 1572, comm: bash Not tainted 3.4.0-rc1+ #322
 Call Trace:
  [<ffffffff81699a3c>] print_circular_bug+0x1f8/0x209
  [<ffffffff810a0256>] __lock_acquire+0xa81/0xd75
  [<ffffffff810f38aa>] ? handle_pte_fault+0x5ff/0x614
  [<ffffffff8109e622>] ? mark_lock+0x2d/0x258
  [<ffffffff810f1618>] ? might_fault+0x40/0x90
  [<ffffffff810a09e5>] lock_acquire+0xd5/0xfa
  [<ffffffff810f1618>] ? might_fault+0x40/0x90
  [<ffffffff816a3249>] ? __mutex_lock_common+0x333/0x350
  [<ffffffff810f1645>] might_fault+0x6d/0x90
  [<ffffffff810f1618>] ? might_fault+0x40/0x90
  [<ffffffff81125d62>] filldir+0x6a/0xc2
  [<ffffffff81133a83>] dcache_readdir+0x5c/0x222
  [<ffffffff81125cf8>] ? sys_ioctl+0x74/0x74
  [<ffffffff81125cf8>] ? sys_ioctl+0x74/0x74
  [<ffffffff81125cf8>] ? sys_ioctl+0x74/0x74
  [<ffffffff81125fa8>] vfs_readdir+0x76/0xa8
  [<ffffffff811260b6>] sys_getdents+0x79/0xc9
  [<ffffffff816a5922>] system_call_fastpath+0x16/0x1b

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Dave Jones <davej@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Josh Boyer <jwboyer@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mimi Zohar <zohar@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 28cf06e4ec84..001ef01d2fe2 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -485,6 +485,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
 		inode->i_fop = &simple_dir_operations;
 		/* directory inodes start off with i_nlink == 2 (for "." entry) */
 		inc_nlink(inode);
+		lockdep_annotate_inode_mutex_key(inode);
 	}
 	return inode;
 }
-- 
cgit v1.2.3


From 63f61a6f4633ff34c17bea7a0ed827eaeb0733e1 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Wed, 25 Apr 2012 16:01:52 -0700
Subject: revert "proc: clear_refs: do not clear reserved pages"

Revert commit 85e72aa5384 ("proc: clear_refs: do not clear reserved
pages"), which was a quick fix suitable for -stable until ARM had been
moved over to the gate_vma mechanism:

https://lkml.org/lkml/2012/1/14/55

With commit f9d4861f ("ARM: 7294/1: vectors: use gate_vma for vectors user
mapping"), ARM does now use the gate_vma, so the PageReserved check can be
removed from the proc code.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Cc: Nicolas Pitre <nico@linaro.org>
Acked-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/task_mmu.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 2b9a7607cbd5..2d60492d6df8 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -597,9 +597,6 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 		if (!page)
 			continue;
 
-		if (PageReserved(page))
-			continue;
-
 		/* Clear accessed and referenced bits. */
 		ptep_test_and_clear_young(vma, addr, pte);
 		ClearPageReferenced(page);
-- 
cgit v1.2.3


From 996d282c7ff470f150a467eb4815b90159d04c47 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 23 Apr 2012 20:35:03 -0400
Subject: Btrfs: do not start delalloc inodes during sync

btrfs_start_delalloc_inodes will just walk the list of delalloc inodes and
start writing them out, but it doesn't splice the list or anything so as
long as somebody is doing work on the box you could end up in this section
_forever_.  So just remove it, it's not needed anyway since sync will start
writeback on all inodes anyway, all we need to do is wait for ordered
extents and then we can commit the transaction.  In my horrible torture test
sync goes from taking 4 minutes to about 1.5 minutes.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 43aa2dd0bc7d..f267718cbd1a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -819,7 +819,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 		return 0;
 	}
 
-	btrfs_start_delalloc_inodes(root, 0);
 	btrfs_wait_ordered_extents(root, 0, 0);
 
 	trans = btrfs_start_transaction(root, 0);
-- 
cgit v1.2.3


From 3e74317ad773ba9df36db1fa32848cba41ac4d1a Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Fri, 27 Apr 2012 12:41:45 -0400
Subject: Btrfs: fix repair code for RAID10

btrfs_map_block sets mirror_num, so that the repair code knows eventually
which device gave us the read error. For RAID10, mirror_num must be 1 or 2.
Before this fix mirror_num was incorrectly related to our stripe index.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3b984173d25b..1411b99555a4 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3807,10 +3807,11 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 		else if (mirror_num)
 			stripe_index += mirror_num - 1;
 		else {
+			int old_stripe_index = stripe_index;
 			stripe_index = find_live_mirror(map, stripe_index,
 					      map->sub_stripes, stripe_index +
 					      current->pid % map->sub_stripes);
-			mirror_num = stripe_index + 1;
+			mirror_num = stripe_index - old_stripe_index + 1;
 		}
 	} else {
 		/*
-- 
cgit v1.2.3


From 1daf3540fa77faea2f91d96bcaf07ce48ee827be Mon Sep 17 00:00:00 2001
From: Daniel J Blueman <daniel@quora.org>
Date: Fri, 27 Apr 2012 12:41:46 -0400
Subject: Btrfs: Prevent root_list corruption

I was seeing root_list corruption on unmount during fs resize in 3.4-rc4; add
correct locking to address this.

Signed-off-by: Daniel J Blueman <daniel@quora.org>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/relocation.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 017281dbb2a7..5a105a086acf 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1279,7 +1279,9 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
 		if (rb_node)
 			backref_tree_panic(rb_node, -EEXIST, node->bytenr);
 	} else {
+		spin_lock(&root->fs_info->trans_lock);
 		list_del_init(&root->root_list);
+		spin_unlock(&root->fs_info->trans_lock);
 		kfree(node);
 	}
 	return 0;
-- 
cgit v1.2.3


From 1f699d38b6556c393ac80f1c23c2053502a51631 Mon Sep 17 00:00:00 2001
From: Stefan Behrens <sbehrens@giantdisaster.de>
Date: Fri, 27 Apr 2012 12:41:46 -0400
Subject: Btrfs: fix block_rsv and space_info lock ordering

may_commit_transaction() calls
        spin_lock(&space_info->lock);
        spin_lock(&delayed_rsv->lock);
and update_global_block_rsv() calls
        spin_lock(&block_rsv->lock);
        spin_lock(&sinfo->lock);

Lockdep complains about this at run time.
Everywhere except in update_global_block_rsv(), the space_info lock is
the outer lock, therefore the locking order in update_global_block_rsv()
is changed.

Signed-off-by: Stefan Behrens <sbehrens@giantdisaster.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 84497f8eb043..6fc2e6f5aab8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4214,8 +4214,8 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
 
 	num_bytes = calc_global_metadata_size(fs_info);
 
-	spin_lock(&block_rsv->lock);
 	spin_lock(&sinfo->lock);
+	spin_lock(&block_rsv->lock);
 
 	block_rsv->size = num_bytes;
 
@@ -4241,8 +4241,8 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
 		block_rsv->full = 1;
 	}
 
-	spin_unlock(&sinfo->lock);
 	spin_unlock(&block_rsv->lock);
+	spin_unlock(&sinfo->lock);
 }
 
 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
-- 
cgit v1.2.3


From 7654b72417e10e294563496e25211200f9b8b6d3 Mon Sep 17 00:00:00 2001
From: Daniel J Blueman <daniel@quora.org>
Date: Fri, 27 Apr 2012 12:41:46 -0400
Subject: Btrfs: Fix space checking during fs resize

Fix out-of-space checking, addressing a warning and potential resource
leak when resizing the filesystem down while allocating blocks.

Signed-off-by: Daniel J Blueman <daniel@quora.org>
Reviewed-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/relocation.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 5a105a086acf..646ee21bb035 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3813,7 +3813,7 @@ restart:
 
 		ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5);
 		if (ret < 0) {
-			if (ret != -EAGAIN) {
+			if (ret != -ENOSPC) {
 				err = ret;
 				WARN_ON(1);
 				break;
-- 
cgit v1.2.3


From fede766f28dd766d4e8feb321fdb19edb21ef6fb Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 27 Apr 2012 14:23:22 -0400
Subject: Btrfs: avoid deadlocks from GFP_KERNEL allocations during
 btrfs_real_readdir

Btrfs has an optimization where it will preallocate dentries during
readdir to fill in enough information to open the inode without an extra
lookup.

But, we're calling d_alloc, which is doing GFP_KERNEL allocations, and
that leads to deadlocks because our readdir code has tree locks held.

For now, disable this optimization.  We'll fix the gfp mask in the next
merge window.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 30 +-----------------------------
 1 file changed, 1 insertion(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d953f8820464..3ce7805d1117 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4192,7 +4192,6 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 	struct btrfs_path *path;
 	struct list_head ins_list;
 	struct list_head del_list;
-	struct qstr q;
 	int ret;
 	struct extent_buffer *leaf;
 	int slot;
@@ -4283,7 +4282,6 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 
 		while (di_cur < di_total) {
 			struct btrfs_key location;
-			struct dentry *tmp;
 
 			if (verify_dir_item(root, leaf, di))
 				break;
@@ -4304,33 +4302,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
 			d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
 			btrfs_dir_item_key_to_cpu(leaf, di, &location);
 
-			q.name = name_ptr;
-			q.len = name_len;
-			q.hash = full_name_hash(q.name, q.len);
-			tmp = d_lookup(filp->f_dentry, &q);
-			if (!tmp) {
-				struct btrfs_key *newkey;
-
-				newkey = kzalloc(sizeof(struct btrfs_key),
-						 GFP_NOFS);
-				if (!newkey)
-					goto no_dentry;
-				tmp = d_alloc(filp->f_dentry, &q);
-				if (!tmp) {
-					kfree(newkey);
-					dput(tmp);
-					goto no_dentry;
-				}
-				memcpy(newkey, &location,
-				       sizeof(struct btrfs_key));
-				tmp->d_fsdata = newkey;
-				tmp->d_flags |= DCACHE_NEED_LOOKUP;
-				d_rehash(tmp);
-				dput(tmp);
-			} else {
-				dput(tmp);
-			}
-no_dentry:
+
 			/* is this a reference to our own snapshot? If so
 			 * skip it.
 			 *
-- 
cgit v1.2.3


From dc7fdde39e4962b1a88741f7eba2a6b3be1285d8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 27 Apr 2012 14:31:29 -0400
Subject: Btrfs: reduce lock contention during extent insertion

We're spending huge amounts of time on lock contention during
end_io processing because we unconditionally assume we are overwriting
an existing extent in the file for each IO.

This checks to see if we are outside i_size, and if so, it uses a
less expensive readonly search of the btree to look for existing
extents.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index d83260d7498f..53bf2d764bbc 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -567,6 +567,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
 	int extent_type;
 	int recow;
 	int ret;
+	int modify_tree = -1;
 
 	if (drop_cache)
 		btrfs_drop_extent_cache(inode, start, end - 1, 0);
@@ -575,10 +576,13 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
 	if (!path)
 		return -ENOMEM;
 
+	if (start >= BTRFS_I(inode)->disk_i_size)
+		modify_tree = 0;
+
 	while (1) {
 		recow = 0;
 		ret = btrfs_lookup_file_extent(trans, root, path, ino,
-					       search_start, -1);
+					       search_start, modify_tree);
 		if (ret < 0)
 			break;
 		if (ret > 0 && path->slots[0] > 0 && search_start == start) {
@@ -634,7 +638,8 @@ next_slot:
 		}
 
 		search_start = max(key.offset, start);
-		if (recow) {
+		if (recow || !modify_tree) {
+			modify_tree = -1;
 			btrfs_release_path(path);
 			continue;
 		}
-- 
cgit v1.2.3


From fcbf94b9dedd2ce08e798a99aafc94fec8668161 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 28 Apr 2012 08:29:56 -0700
Subject: Revert "autofs: work around unhappy compat problem on x86-64"

This reverts commit a32744d4abae24572eff7269bc17895c41bd0085.

While that commit was technically the right thing to do, and made the
x86-64 compat mode work identically to native 32-bit mode (and thus
fixing the problem with a 32-bit systemd install on a 64-bit kernel), it
turns out that the automount binaries had workarounds for this compat
problem.

Now, the workarounds are disgusting: doing an "uname()" to find out the
architecture of the kernel, and then comparing it for the 64-bit cases
and fixing up the size of the read() in automount for those.  And they
were confused: it's not actually a generic 64-bit issue at all, it's
very much tied to just x86-64, which has different alignment for an
'u64' in 64-bit mode than in 32-bit mode.

But the end result is that fixing the compat layer actually breaks the
case of a 32-bit automount on a x86-64 kernel.

There are various approaches to fix this (including just doing a
"strcmp()" on current->comm and comparing it to "automount"), but I
think that I will do the one that teaches pipes about a special "packet
mode", which will allow user space to not have to care too deeply about
the padding at the end of the autofs packet.

That change will make the compat workaround unnecessary, so let's revert
it first, and get automount working again in compat mode.  The
packetized pipes will then fix autofs for systemd.

Reported-and-requested-by: Michael Tokarev <mjt@tls.msk.ru>
Cc: Ian Kent <raven@themaw.net>
Cc: stable@kernel.org # for 3.3
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs4/autofs_i.h  |  1 -
 fs/autofs4/dev-ioctl.c |  1 -
 fs/autofs4/inode.c     |  2 --
 fs/autofs4/waitq.c     | 22 +++-------------------
 4 files changed, 3 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index eb1cc92cd67d..d8d8e7ba6a1e 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -110,7 +110,6 @@ struct autofs_sb_info {
 	int sub_version;
 	int min_proto;
 	int max_proto;
-	int compat_daemon;
 	unsigned long exp_timeout;
 	unsigned int type;
 	int reghost_enabled;
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 9dacb8586701..3dfd615afb6b 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -385,7 +385,6 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
 		sbi->pipefd = pipefd;
 		sbi->pipe = pipe;
 		sbi->catatonic = 0;
-		sbi->compat_daemon = is_compat_task();
 	}
 out:
 	mutex_unlock(&sbi->wq_mutex);
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index d8dc002e9cc3..14c7bc02349e 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -19,7 +19,6 @@
 #include <linux/parser.h>
 #include <linux/bitops.h>
 #include <linux/magic.h>
-#include <linux/compat.h>
 #include "autofs_i.h"
 #include <linux/module.h>
 
@@ -225,7 +224,6 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 	set_autofs_type_indirect(&sbi->type);
 	sbi->min_proto = 0;
 	sbi->max_proto = 0;
-	sbi->compat_daemon = is_compat_task();
 	mutex_init(&sbi->wq_mutex);
 	mutex_init(&sbi->pipe_mutex);
 	spin_lock_init(&sbi->fs_lock);
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 9c098db43344..da8876d38a7b 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -91,24 +91,7 @@ static int autofs4_write(struct autofs_sb_info *sbi,
 
 	return (bytes > 0);
 }
-
-/*
- * The autofs_v5 packet was misdesigned.
- *
- * The packets are identical on x86-32 and x86-64, but have different
- * alignment. Which means that 'sizeof()' will give different results.
- * Fix it up for the case of running 32-bit user mode on a 64-bit kernel.
- */
-static noinline size_t autofs_v5_packet_size(struct autofs_sb_info *sbi)
-{
-	size_t pktsz = sizeof(struct autofs_v5_packet);
-#if defined(CONFIG_X86_64) && defined(CONFIG_COMPAT)
-	if (sbi->compat_daemon > 0)
-		pktsz -= 4;
-#endif
-	return pktsz;
-}
-
+	
 static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 				 struct autofs_wait_queue *wq,
 				 int type)
@@ -172,7 +155,8 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 	{
 		struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet;
 
-		pktsz = autofs_v5_packet_size(sbi);
+		pktsz = sizeof(*packet);
+
 		packet->wait_queue_token = wq->wait_queue_token;
 		packet->len = wq->name.len;
 		memcpy(packet->name, wq->name.name, wq->name.len);
-- 
cgit v1.2.3


From 9883035ae7edef3ec62ad215611cb8e17d6a1a5d Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 29 Apr 2012 13:12:42 -0700
Subject: pipes: add a "packetized pipe" mode for writing

The actual internal pipe implementation is already really about
individual packets (called "pipe buffers"), and this simply exposes that
as a special packetized mode.

When we are in the packetized mode (marked by O_DIRECT as suggested by
Alan Cox), a write() on a pipe will not merge the new data with previous
writes, so each write will get a pipe buffer of its own.  The pipe
buffer is then marked with the PIPE_BUF_FLAG_PACKET flag, which in turn
will tell the reader side to break the read at that boundary (and throw
away any partial packet contents that do not fit in the read buffer).

End result: as long as you do writes less than PIPE_BUF in size (so that
the pipe doesn't have to split them up), you can now treat the pipe as a
packet interface, where each read() system call will read one packet at
a time.  You can just use a sufficiently big read buffer (PIPE_BUF is
sufficient, since bigger than that doesn't guarantee atomicity anyway),
and the return value of the read() will naturally give you the size of
the packet.

NOTE! We do not support zero-sized packets, and zero-sized reads and
writes to a pipe continue to be no-ops.  Also note that big packets will
currently be split at write time, but that the size at which that
happens is not really specified (except that it's bigger than PIPE_BUF).
Currently that limit is the system page size, but we might want to
explicitly support bigger packets some day.

The main user for this is going to be the autofs packet interface,
allowing us to stop having to care so deeply about exact packet sizes
(which have had bugs with 32/64-bit compatibility modes).  But user
space can create packetized pipes with "pipe2(fd, O_DIRECT)", which will
fail with an EINVAL on kernels that do not support this interface.

Tested-by: Michael Tokarev <mjt@tls.msk.ru>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: David Miller <davem@davemloft.net>
Cc: Ian Kent <raven@themaw.net>
Cc: Thomas Meyer <thomas@m3y3r.de>
Cc: stable@kernel.org  # needed for systemd/autofs interaction fix
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/pipe.c | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/pipe.c b/fs/pipe.c
index 25feaa3faac0..fec5e4ad071a 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -346,6 +346,16 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = {
 	.get = generic_pipe_buf_get,
 };
 
+static const struct pipe_buf_operations packet_pipe_buf_ops = {
+	.can_merge = 0,
+	.map = generic_pipe_buf_map,
+	.unmap = generic_pipe_buf_unmap,
+	.confirm = generic_pipe_buf_confirm,
+	.release = anon_pipe_buf_release,
+	.steal = generic_pipe_buf_steal,
+	.get = generic_pipe_buf_get,
+};
+
 static ssize_t
 pipe_read(struct kiocb *iocb, const struct iovec *_iov,
 	   unsigned long nr_segs, loff_t pos)
@@ -407,6 +417,13 @@ redo:
 			ret += chars;
 			buf->offset += chars;
 			buf->len -= chars;
+
+			/* Was it a packet buffer? Clean up and exit */
+			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
+				total_len = chars;
+				buf->len = 0;
+			}
+
 			if (!buf->len) {
 				buf->ops = NULL;
 				ops->release(pipe, buf);
@@ -459,6 +476,11 @@ redo:
 	return ret;
 }
 
+static inline int is_packetized(struct file *file)
+{
+	return (file->f_flags & O_DIRECT) != 0;
+}
+
 static ssize_t
 pipe_write(struct kiocb *iocb, const struct iovec *_iov,
 	    unsigned long nr_segs, loff_t ppos)
@@ -593,6 +615,11 @@ redo2:
 			buf->ops = &anon_pipe_buf_ops;
 			buf->offset = 0;
 			buf->len = chars;
+			buf->flags = 0;
+			if (is_packetized(filp)) {
+				buf->ops = &packet_pipe_buf_ops;
+				buf->flags = PIPE_BUF_FLAG_PACKET;
+			}
 			pipe->nrbufs = ++bufs;
 			pipe->tmp_page = NULL;
 
@@ -1013,7 +1040,7 @@ struct file *create_write_pipe(int flags)
 		goto err_dentry;
 	f->f_mapping = inode->i_mapping;
 
-	f->f_flags = O_WRONLY | (flags & O_NONBLOCK);
+	f->f_flags = O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT));
 	f->f_version = 0;
 
 	return f;
@@ -1057,7 +1084,7 @@ int do_pipe_flags(int *fd, int flags)
 	int error;
 	int fdw, fdr;
 
-	if (flags & ~(O_CLOEXEC | O_NONBLOCK))
+	if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT))
 		return -EINVAL;
 
 	fw = create_write_pipe(flags);
-- 
cgit v1.2.3


From 64f371bc3107e69efce563a3d0f0e6880de0d537 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 29 Apr 2012 13:30:08 -0700
Subject: autofs: make the autofsv5 packet file descriptor use a packetized
 pipe

The autofs packet size has had a very unfortunate size problem on x86:
because the alignment of 'u64' differs in 32-bit and 64-bit modes, and
because the packet data was not 8-byte aligned, the size of the autofsv5
packet structure differed between 32-bit and 64-bit modes despite
looking otherwise identical (300 vs 304 bytes respectively).

We first fixed that up by making the 64-bit compat mode know about this
problem in commit a32744d4abae ("autofs: work around unhappy compat
problem on x86-64"), and that made a 32-bit 'systemd' work happily on a
64-bit kernel because everything then worked the same way as on a 32-bit
kernel.

But it turned out that 'automount' had actually known and worked around
this problem in user space, so fixing the kernel to do the proper 32-bit
compatibility handling actually *broke* 32-bit automount on a 64-bit
kernel, because it knew that the packet sizes were wrong and expected
those incorrect sizes.

As a result, we ended up reverting that compatibility mode fix, and
thus breaking systemd again, in commit fcbf94b9dedd.

With both automount and systemd doing a single read() system call, and
verifying that they get *exactly* the size they expect but using
different sizes, it seemed that fixing one of them inevitably seemed to
break the other.  At one point, a patch I seriously considered applying
from Michael Tokarev did a "strcmp()" to see if it was automount that
was doing the operation.  Ugly, ugly.

However, a prettier solution exists now thanks to the packetized pipe
mode.  By marking the communication pipe as being packetized (by simply
setting the O_DIRECT flag), we can always just write the bigger packet
size, and if user-space does a smaller read, it will just get that
partial end result and the extra alignment padding will simply be thrown
away.

This makes both automount and systemd happy, since they now get the size
they asked for, and the kernel side of autofs simply no longer needs to
care - it could pad out the packet arbitrarily.

Of course, if there is some *other* user of autofs (please, please,
please tell me it ain't so - and we haven't heard of any) that tries to
read the packets with multiple writes, that other user will now be
broken - the whole point of the packetized mode is that one system call
gets exactly one packet, and you cannot read a packet in pieces.

Tested-by: Michael Tokarev <mjt@tls.msk.ru>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: David Miller <davem@davemloft.net>
Cc: Ian Kent <raven@themaw.net>
Cc: Thomas Meyer <thomas@m3y3r.de>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/autofs4/autofs_i.h  | 11 +++++++++++
 fs/autofs4/dev-ioctl.c |  2 +-
 fs/autofs4/inode.c     |  2 +-
 3 files changed, 13 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index d8d8e7ba6a1e..908e18455413 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -269,6 +269,17 @@ int autofs4_fill_super(struct super_block *, void *, int);
 struct autofs_info *autofs4_new_ino(struct autofs_sb_info *);
 void autofs4_clean_ino(struct autofs_info *);
 
+static inline int autofs_prepare_pipe(struct file *pipe)
+{
+	if (!pipe->f_op || !pipe->f_op->write)
+		return -EINVAL;
+	if (!S_ISFIFO(pipe->f_dentry->d_inode->i_mode))
+		return -EINVAL;
+	/* We want a packet pipe */
+	pipe->f_flags |= O_DIRECT;
+	return 0;
+}
+
 /* Queue management functions */
 
 int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify);
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 3dfd615afb6b..aa9103f8f01b 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -376,7 +376,7 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp,
 			err = -EBADF;
 			goto out;
 		}
-		if (!pipe->f_op || !pipe->f_op->write) {
+		if (autofs_prepare_pipe(pipe) < 0) {
 			err = -EPIPE;
 			fput(pipe);
 			goto out;
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 14c7bc02349e..6e488ebe7784 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -290,7 +290,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 		printk("autofs: could not open pipe file descriptor\n");
 		goto fail_dput;
 	}
-	if (!pipe->f_op || !pipe->f_op->write)
+	if (autofs_prepare_pipe(pipe) < 0)
 		goto fail_fput;
 	sbi->pipe = pipe;
 	sbi->pipefd = pipefd;
-- 
cgit v1.2.3