From 0cf5537b158caae42bcc03f0f6db10f68585b1ec Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Fri, 11 Jun 2010 15:57:06 -0700
Subject: ceph: some endianity fixes

Fix some problems that came up with sparse.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/auth_x.c     | 2 +-
 fs/ceph/messenger.c  | 2 +-
 fs/ceph/mon_client.c | 3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index 83d4d2785ffe..3fe49042d8ad 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -493,7 +493,7 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
 		return -EAGAIN;
 	}
 
-	op = le32_to_cpu(head->op);
+	op = le16_to_cpu(head->op);
 	result = le32_to_cpu(head->result);
 	dout("handle_reply op %d result %d\n", op, result);
 	switch (op) {
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 64b8b1f7863d..cf1c7845d877 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -657,7 +657,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
 	dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
 	     con->connect_seq, global_seq, proto);
 
-	con->out_connect.features = CEPH_FEATURE_SUPPORTED_CLIENT;
+	con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED_CLIENT);
 	con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
 	con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
 	con->out_connect.global_seq = cpu_to_le32(global_seq);
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 07a539906e67..cc115eafae11 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -725,7 +725,8 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
 		dout("authenticated, starting session\n");
 
 		monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
-		monc->client->msgr->inst.name.num = monc->auth->global_id;
+		monc->client->msgr->inst.name.num =
+					cpu_to_le64(monc->auth->global_id);
 
 		__send_subscribe(monc);
 		__resend_generic_request(monc);
-- 
cgit v1.2.3


From 4a32f93d29b05cdab63c0e2979bc1524c8ea6bf5 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Sun, 13 Jun 2010 10:27:53 -0700
Subject: ceph: fix map handler error path

Don't leak message if we receive an unexpected message type.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osd_client.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index d25b4add85b4..92b7251a53f1 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -1344,7 +1344,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
 	int type = le16_to_cpu(msg->hdr.type);
 
 	if (!osd)
-		return;
+		goto out;
 	osdc = osd->o_osdc;
 
 	switch (type) {
@@ -1359,6 +1359,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
 		pr_err("received unknown message type %d %s\n", type,
 		       ceph_msg_type_name(type));
 	}
+out:
 	ceph_msg_put(msg);
 }
 
-- 
cgit v1.2.3


From ae32be31341a5fecfa16c5b3eb78095207182cce Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Sun, 13 Jun 2010 10:30:19 -0700
Subject: ceph: fix message memory leak, uninitialized variable

We need to properly initialize skip, as not all alloc_msg op instances
set it.

Also, BUG if someone says skip but also allocates a message.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index cf1c7845d877..9ad43a310a41 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -1396,10 +1396,12 @@ static int read_partial_message(struct ceph_connection *con)
 	if (!con->in_msg) {
 		dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
 		     con->in_hdr.front_len, con->in_hdr.data_len);
+		skip = 0;
 		con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
 		if (skip) {
 			/* skip this message */
 			dout("alloc_msg said skip message\n");
+			BUG_ON(con->in_msg);
 			con->in_base_pos = -front_len - middle_len - data_len -
 				sizeof(m->footer);
 			con->in_tag = CEPH_MSGR_TAG_READY;
-- 
cgit v1.2.3


From 6469272c350872980891dbe38e81c936c43f2d9b Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Wed, 26 May 2010 17:58:53 +0200
Subject: fs/ocfs2/dlm: Add missing spin_unlock

Add a spin_unlock missing on the error path.  Unlock as in the other code
that leads to the leave label.

The semantic match that finds this problem is as follows:
(http://coccinelle.lip6.fr/)

// <smpl>
@@
expression E1;
@@

* spin_lock(E1,...);
  <+... when != E1
  if (...) {
    ... when != E1
*   return ...;
  }
  ...+>
* spin_unlock(E1,...);
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dlm/dlmdomain.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 6b5a492e1749..2ccad86fb590 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1709,6 +1709,7 @@ retry:
 		}
 
 		if (dlm_protocol_compare(&dlm->fs_locking_proto, fs_proto)) {
+			spin_unlock(&dlm_domain_lock);
 			mlog(ML_ERROR,
 			     "Requested locking protocol version is not "
 			     "compatible with already registered domain "
-- 
cgit v1.2.3


From 40f165f416bde747d85cdf71bc9dde700912f71f Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Fri, 28 May 2010 14:22:59 +0800
Subject: ocfs2: Move orphan scan work to ocfs2_wq.

We used to let orphan scan work in the default work queue,
but there is a corner case which will make the system deadlock.
The scenario is like this:
1. set heartbeat threadshold to 200. this will allow us to have a
   great chance to have a orphan scan work before our quorum decision.
2. mount node 1.
3. after 1~2 minutes, mount node 2(in order to make the bug easier
   to reproduce, better add maxcpus=1 to kernel command line).
4. node 1 do orphan scan work.
5. node 2 do orphan scan work.
6. node 1 do orphan scan work. After this, node 1 hold the orphan scan
   lock while node 2 know node 1 is the master.
7. ifdown eth2 in node 2(eth2 is what we do ocfs2 interconnection).

Now when node 2 begins orphan scan, the system queue is blocked.

The root cause is that both orphan scan work and quorum decision work
will use the system event work queue. orphan scan has a chance of
blocking the event work queue(in dlm_wait_for_node_death) so that there
is no chance for quorum decision work to proceed.

This patch resolve it by moving orphan scan work to ocfs2_wq.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/journal.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 47878cf16418..39113b5e79e7 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1936,7 +1936,7 @@ void ocfs2_orphan_scan_work(struct work_struct *work)
 	mutex_lock(&os->os_lock);
 	ocfs2_queue_orphan_scan(osb);
 	if (atomic_read(&os->os_state) == ORPHAN_SCAN_ACTIVE)
-		schedule_delayed_work(&os->os_orphan_scan_work,
+		queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
 				      ocfs2_orphan_scan_timeout());
 	mutex_unlock(&os->os_lock);
 }
@@ -1976,8 +1976,8 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb)
 		atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE);
 	else {
 		atomic_set(&os->os_state, ORPHAN_SCAN_ACTIVE);
-		schedule_delayed_work(&os->os_orphan_scan_work,
-				      ocfs2_orphan_scan_timeout());
+		queue_delayed_work(ocfs2_wq, &os->os_orphan_scan_work,
+				   ocfs2_orphan_scan_timeout());
 	}
 }
 
-- 
cgit v1.2.3


From 1739da40543ed2129050ccfa8a076a851ab6ed00 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Wed, 9 Jun 2010 16:43:05 +0800
Subject: ocfs2: Limit default local alloc size within bitmap range.

In commit 6b82021b9e91cd689fdffadbcdb9a42597bbe764, we increase
our local alloc size and calculate how much megabytes we can
get according to group size and volume size.
But we also need to check the maximum bits a local alloc block
bitmap can have. With a bs=512, cs=32K, local volume with 160G,
it calculate 96MB while the maximum local alloc size is only
76M. So the bitmap will overflow and corrupt the system truncate
log file. See bug
http://oss.oracle.com/bugzilla/show_bug.cgi?id=1262

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Acked-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/localalloc.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 3d7419682dc0..ec6adbf8f551 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -118,6 +118,7 @@ unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
 {
 	unsigned int la_mb;
 	unsigned int gd_mb;
+	unsigned int la_max_mb;
 	unsigned int megs_per_slot;
 	struct super_block *sb = osb->sb;
 
@@ -182,6 +183,12 @@ unsigned int ocfs2_la_default_mb(struct ocfs2_super *osb)
 	if (megs_per_slot < la_mb)
 		la_mb = megs_per_slot;
 
+	/* We can't store more bits than we can in a block. */
+	la_max_mb = ocfs2_clusters_to_megabytes(osb->sb,
+						ocfs2_local_alloc_size(sb) * 8);
+	if (la_mb > la_max_mb)
+		la_mb = la_max_mb;
+
 	return la_mb;
 }
 
-- 
cgit v1.2.3


From cebc5be6b6c82a99231e9c9af451e9e3d3399ec6 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 17 Jun 2010 10:22:48 -0700
Subject: ceph: fix crush map update decoding

If the incremental osdmap has a new crush map, advance the position after
decoding so that we can parse the rest of the osdmap properly.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osdmap.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index ddc656fb5c05..50ce64ebd330 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -707,6 +707,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
 		newcrush = crush_decode(*p, min(*p+len, end));
 		if (IS_ERR(newcrush))
 			return ERR_CAST(newcrush);
+		*p += len;
 	}
 
 	/* new flags? */
-- 
cgit v1.2.3


From d69ed05a80f23b25f06e73af9b7e701ce4900edc Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 21 Jun 2010 10:38:14 -0700
Subject: ceph: handle splice_dentry/d_materialize_unique error in
 readdir_prepopulate

Handle a splice_dentry failure (due to a d_materialize_unique error)
without crashing.  (Also, report the error code.)

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index ab47f46ca282..8f9b9fe8ef9f 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -854,8 +854,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
 		d_drop(dn);
 	realdn = d_materialise_unique(dn, in);
 	if (IS_ERR(realdn)) {
-		pr_err("splice_dentry error %p inode %p ino %llx.%llx\n",
-		       dn, in, ceph_vinop(in));
+		pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
+		       PTR_ERR(realdn), dn, in, ceph_vinop(in));
 		if (prehash)
 			*prehash = false; /* don't rehash on error */
 		dn = realdn; /* note realdn contains the error */
@@ -1234,18 +1234,23 @@ retry_lookup:
 				goto out;
 			}
 			dn = splice_dentry(dn, in, NULL);
+			if (IS_ERR(dn))
+				dn = NULL;
 		}
 
 		if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
 			       req->r_request_started, -1,
 			       &req->r_caps_reservation) < 0) {
 			pr_err("fill_inode badness on %p\n", in);
-			dput(dn);
-			continue;
+			goto next_item;
 		}
-		update_dentry_lease(dn, rinfo->dir_dlease[i],
-				    req->r_session, req->r_request_started);
-		dput(dn);
+		if (dn)
+			update_dentry_lease(dn, rinfo->dir_dlease[i],
+					    req->r_session,
+					    req->r_request_started);
+next_item:
+		if (dn)
+			dput(dn);
 	}
 	req->r_did_prepopulate = true;
 
-- 
cgit v1.2.3


From 17c688c3dfffc274c87be00033da0050bb6eefc0 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 21 Jun 2010 16:12:26 -0700
Subject: ceph: delay umount until all mds requests drop inode+dentry refs

This fixes a race between handle_reply finishing an mds request, signalling
completion, and then dropping the request structing and its dentry+inode
refs, and pre_umount function waiting for requests to finish before
letting the vfs tear down the dcache.  If umount was delayed waiting for
mds requests, we could race and BUG in shrink_dcache_for_umount_subtree
because of a slow dput.

This delays umount until the msgr queue flushes, which means handle_reply
will exit and will have dropped the ceph_mds_request struct.  I'm assuming
the VFS has already ensured that its calls have all completed and those
request refs have thus been dropped as well (I haven't seen that race, at
least).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 1766947fc07a..3ab79f6c4ce8 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2783,6 +2783,12 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
 	drop_leases(mdsc);
 	ceph_flush_dirty_caps(mdsc);
 	wait_requests(mdsc);
+
+	/*
+	 * wait for reply handlers to drop their request refs and
+	 * their inode/dcache refs
+	 */
+	ceph_msgr_flush();
 }
 
 /*
-- 
cgit v1.2.3


From bfaf148eb2e42c00f1c79b2163f0804068ea0c5e Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Wed, 23 Jun 2010 15:52:27 -0700
Subject: ceph: fix caps debugfs entry

The ceph client structure was not set correctly.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/debugfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 3be33fb066cc..f2f5332ddbba 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -261,7 +261,7 @@ static int osdc_show(struct seq_file *s, void *pp)
 
 static int caps_show(struct seq_file *s, void *p)
 {
-	struct ceph_client *client = p;
+	struct ceph_client *client = s->private;
 	int total, avail, used, reserved, min;
 
 	ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
-- 
cgit v1.2.3


From 55bda7aacd13f5fdfeaafc16934953171405c692 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 24 Jun 2010 12:55:48 -0700
Subject: ceph: fix crush recursion

There was a longstanding problem with recursion through intervening
bucket types on complex hierarchies.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/crush/mapper.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
index 9ba54efb6543..804e6d53b77c 100644
--- a/fs/ceph/crush/mapper.c
+++ b/fs/ceph/crush/mapper.c
@@ -366,6 +366,7 @@ static int crush_choose(struct crush_map *map,
 					BUG_ON(item >= 0 ||
 					       (-1-item) >= map->max_buckets);
 					in = map->buckets[-1-item];
+					retry_bucket = 1;
 					continue;
 				}
 
-- 
cgit v1.2.3


From a1a31e734241aefcb2b30fb0cc0376977b6d2ba8 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 24 Jun 2010 12:58:14 -0700
Subject: ceph: fix crush CHOOSE_LEAF when type is already a leaf

We may not recurse for CHOOSE_LEAF if we start with a leaf node.  When
that happens, the out2 vector needs to be filled in with the result.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/crush/mapper.c | 38 +++++++++++++++++++++++++-------------
 1 file changed, 25 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
index 804e6d53b77c..374266bddd9c 100644
--- a/fs/ceph/crush/mapper.c
+++ b/fs/ceph/crush/mapper.c
@@ -238,7 +238,7 @@ static int bucket_straw_choose(struct crush_bucket_straw *bucket,
 
 static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
 {
-	dprintk("choose %d x=%d r=%d\n", in->id, x, r);
+	dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
 	switch (in->alg) {
 	case CRUSH_BUCKET_UNIFORM:
 		return bucket_uniform_choose((struct crush_bucket_uniform *)in,
@@ -305,7 +305,9 @@ static int crush_choose(struct crush_map *map,
 	int itemtype;
 	int collide, reject;
 	const int orig_tries = 5; /* attempts before we fall back to search */
-	dprintk("choose bucket %d x %d outpos %d\n", bucket->id, x, outpos);
+
+	dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
+		bucket->id, x, outpos, numrep);
 
 	for (rep = outpos; rep < numrep; rep++) {
 		/* keep trying until we get a non-out, non-colliding item */
@@ -378,15 +380,25 @@ static int crush_choose(struct crush_map *map,
 					}
 				}
 
-				if (recurse_to_leaf &&
-				    item < 0 &&
-				    crush_choose(map, map->buckets[-1-item],
-						 weight,
-						 x, outpos+1, 0,
-						 out2, outpos,
-						 firstn, 0, NULL) <= outpos) {
-					reject = 1;
-				} else {
+				reject = 0;
+				if (recurse_to_leaf) {
+					if (item < 0) {
+						if (crush_choose(map,
+							 map->buckets[-1-item],
+							 weight,
+							 x, outpos+1, 0,
+							 out2, outpos,
+							 firstn, 0,
+							 NULL) <= outpos)
+							/* didn't get leaf */
+							reject = 1;
+					} else {
+						/* we already have a leaf! */
+						out2[outpos] = item;
+					}
+				}
+
+				if (!reject) {
 					/* out? */
 					if (itemtype == 0)
 						reject = is_out(map, weight,
@@ -425,12 +437,12 @@ reject:
 			continue;
 		}
 
-		dprintk("choose got %d\n", item);
+		dprintk("CHOOSE got %d\n", item);
 		out[outpos] = item;
 		outpos++;
 	}
 
-	dprintk("choose returns %d\n", outpos);
+	dprintk("CHOOSE returns %d\n", outpos);
 	return outpos;
 }
 
-- 
cgit v1.2.3


From ec97f88ba6d4256927fde516033ee76d5d85b54a Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 24 Jun 2010 15:12:37 -0700
Subject: ceph: only release clean, unused caps with mds requests

We can drop caps with an mds request.  Ensure we only drop unused AND
clean caps, since the MDS doesn't support cap writeback in that context,
nor do we track it.  If caps are dirty, and the MDS needs them back, we
it will revoke and we will flush in the normal fashion.

This fixes a possibly loss of metadata.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 619b61655ee5..d4fcdda7676c 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2886,18 +2886,19 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_cap *cap;
 	struct ceph_mds_request_release *rel = *p;
+	int used, dirty;
 	int ret = 0;
-	int used = 0;
 
 	spin_lock(&inode->i_lock);
 	used = __ceph_caps_used(ci);
+	dirty = __ceph_caps_dirty(ci);
 
-	dout("encode_inode_release %p mds%d used %s drop %s unless %s\n", inode,
-	     mds, ceph_cap_string(used), ceph_cap_string(drop),
+	dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n",
+	     inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop),
 	     ceph_cap_string(unless));
 
-	/* only drop unused caps */
-	drop &= ~used;
+	/* only drop unused, clean caps */
+	drop &= ~(used | dirty);
 
 	cap = __get_cap_for_mds(ci, mds);
 	if (cap && __cap_is_valid(cap)) {
-- 
cgit v1.2.3


From 443b3760a06860187f135c1ecd56c2c7d4ad1022 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 29 Jun 2010 09:28:39 -0700
Subject: ceph: fix caps usage accounting for import (non-reserved) case

We need to increase the total and used counters when allocating a new cap
in the non-reserved (cap import) case.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index d4fcdda7676c..74144d6389f0 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -244,8 +244,14 @@ static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
 	struct ceph_cap *cap = NULL;
 
 	/* temporary, until we do something about cap import/export */
-	if (!ctx)
-		return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
+	if (!ctx) {
+		cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
+		if (cap) {
+			caps_use_count++;
+			caps_total_count++;
+		}
+		return cap;
+	}
 
 	spin_lock(&caps_list_lock);
 	dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
-- 
cgit v1.2.3


From 2cb4b05e7647891b46b91c07c9a60304803d1688 Mon Sep 17 00:00:00 2001
From: Changli Gao <xiaosuo@gmail.com>
Date: Tue, 29 Jun 2010 13:09:18 +0200
Subject: splice: direct_splice_actor() should not use pos in sd

direct_splice_actor() shouldn't use sd->pos, as sd->pos is for file reading,
file->f_pos should be used instead.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
----
 fs/splice.c |    3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/splice.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index 740e6b9faf7a..41900496d3bb 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1282,7 +1282,8 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
 {
 	struct file *file = sd->u.file;
 
-	return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags);
+	return do_splice_from(pipe, file, &file->f_pos, sd->total_len,
+			      sd->flags);
 }
 
 /**
-- 
cgit v1.2.3


From 19c9a49b432f245c6293508d164a4350f1f2c601 Mon Sep 17 00:00:00 2001
From: Changli Gao <xiaosuo@gmail.com>
Date: Tue, 29 Jun 2010 13:10:36 +0200
Subject: splice: check f_mode for seekable file

check f_mode for seekable file

As a seekable file is allowed without a llseek function, so the old way isn't
work any more.

Signed-off-by: Changli Gao <xiaosuo@gmail.com>
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
----
 fs/splice.c |    6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/splice.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/splice.c b/fs/splice.c
index 41900496d3bb..efdbfece9932 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1372,8 +1372,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		if (off_in)
 			return -ESPIPE;
 		if (off_out) {
-			if (!out->f_op || !out->f_op->llseek ||
-			    out->f_op->llseek == no_llseek)
+			if (!(out->f_mode & FMODE_PWRITE))
 				return -EINVAL;
 			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
 				return -EFAULT;
@@ -1393,8 +1392,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		if (off_out)
 			return -ESPIPE;
 		if (off_in) {
-			if (!in->f_op || !in->f_op->llseek ||
-			    in->f_op->llseek == no_llseek)
+			if (!(in->f_mode & FMODE_PREAD))
 				return -EINVAL;
 			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
 				return -EFAULT;
-- 
cgit v1.2.3


From 06d738fa9155ff16dba3d7e501ba4581d01a98cb Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Thu, 1 Jul 2010 08:26:34 +0200
Subject: fs-writeback: fix kernel-doc warnings

Fix kernel-doc to match the function's changed args.

Warning(fs/fs-writeback.c:190): No description found for parameter 'args'
Warning(fs/fs-writeback.c:190): Excess function parameter 'sb' description in 'bdi_queue_work_onstack'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/fs-writeback.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 0609607d3955..6981e4b7c148 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -179,7 +179,7 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
 
 /**
  * bdi_queue_work_onstack - start and wait for writeback
- * @sb: write inodes from this super_block
+ * @args: parameters to control the work queue writeback
  *
  * Description:
  *   This function initiates writeback and waits for the operation to
-- 
cgit v1.2.3


From 153a10939ea6e42e9c0115b0645060d0d7bb4697 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 5 Jul 2010 09:44:17 -0700
Subject: ceph: fix crush device 'out' threshold to 1.0, not 0.1

Fix a typo that made any OSD weighted between 0.1 and 1.0 effectively
weighted as 1.0 (fully in).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/crush/mapper.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
index 374266bddd9c..a4eec133258e 100644
--- a/fs/ceph/crush/mapper.c
+++ b/fs/ceph/crush/mapper.c
@@ -264,7 +264,7 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
  */
 static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
 {
-	if (weight[item] >= 0x1000)
+	if (weight[item] >= 0x10000)
 		return 0;
 	if (weight[item] == 0)
 		return 1;
-- 
cgit v1.2.3


From ed98adad3d87594c55347824e85137d1829c9e70 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 5 Jul 2010 12:15:14 -0700
Subject: ceph: fix message revocation

A message can be on a queue (pending or sent), or out_msg (sending), or
both.  We were assuming that if it's not on a queue it couldn't be out_msg,
but that was false in the case of lossy connections like the OSD.  Fix
ceph_con_revoke() to treat these cases independently.  Also, fix the
out_kvec_is_message check to only trigger if we are currently sending
_this_ message.

This fixes a GPF in tcp_sendpage, triggered by OSD restarts.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 9ad43a310a41..9692d08e2f88 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -2015,20 +2015,20 @@ void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
 {
 	mutex_lock(&con->mutex);
 	if (!list_empty(&msg->list_head)) {
-		dout("con_revoke %p msg %p\n", con, msg);
+		dout("con_revoke %p msg %p - was on queue\n", con, msg);
 		list_del_init(&msg->list_head);
 		ceph_msg_put(msg);
 		msg->hdr.seq = 0;
-		if (con->out_msg == msg) {
-			ceph_msg_put(con->out_msg);
-			con->out_msg = NULL;
-		}
+	}
+	if (con->out_msg == msg) {
+		dout("con_revoke %p msg %p - was sending\n", con, msg);
+		con->out_msg = NULL;
 		if (con->out_kvec_is_msg) {
 			con->out_skip = con->out_kvec_bytes;
 			con->out_kvec_is_msg = false;
 		}
-	} else {
-		dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg);
+		ceph_msg_put(msg);
+		msg->hdr.seq = 0;
 	}
 	mutex_unlock(&con->mutex);
 }
-- 
cgit v1.2.3


From 22b1de06c9fe128ca3de72560c3e8c2cabf2927a Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 5 Jul 2010 15:36:49 -0700
Subject: ceph: fix leak of mon authorizer

Fix leak of a struct ceph_buffer on umount.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/auth_x.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index 3fe49042d8ad..6d44053ecff1 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -613,6 +613,9 @@ static void ceph_x_destroy(struct ceph_auth_client *ac)
 		remove_ticket_handler(ac, th);
 	}
 
+	if (xi->auth_authorizer.buf)
+		ceph_buffer_put(xi->auth_authorizer.buf);
+
 	kfree(ac->private);
 	ac->private = NULL;
 }
-- 
cgit v1.2.3


From 9c3a8ee8a1d72c5c0d7fbdf426d80e270ddfa54c Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 10 Jun 2010 12:07:27 +0200
Subject: writeback: remove writeback_inodes_wbc

This was just an odd wrapper around writeback_inodes_wb.  Removing this
also allows to get rid of the bdi member of struct writeback_control
which was rather out of place there.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/afs/write.c       |  1 -
 fs/btrfs/extent_io.c |  2 --
 fs/fs-writeback.c    | 12 ++----------
 3 files changed, 2 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/afs/write.c b/fs/afs/write.c
index 3dab9e9948d0..722743b152d8 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -680,7 +680,6 @@ int afs_writeback_all(struct afs_vnode *vnode)
 {
 	struct address_space *mapping = vnode->vfs_inode.i_mapping;
 	struct writeback_control wbc = {
-		.bdi		= mapping->backing_dev_info,
 		.sync_mode	= WB_SYNC_ALL,
 		.nr_to_write	= LONG_MAX,
 		.range_cyclic	= 1,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index a4080c21ec55..d74e6af9b53a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2594,7 +2594,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
 	};
 	struct writeback_control wbc_writepages = {
-		.bdi		= wbc->bdi,
 		.sync_mode	= wbc->sync_mode,
 		.older_than_this = NULL,
 		.nr_to_write	= 64,
@@ -2628,7 +2627,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
 		.sync_io = mode == WB_SYNC_ALL,
 	};
 	struct writeback_control wbc_writepages = {
-		.bdi		= inode->i_mapping->backing_dev_info,
 		.sync_mode	= mode,
 		.older_than_this = NULL,
 		.nr_to_write	= nr_pages * 2,
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 6981e4b7c148..94a602e98bb5 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -614,8 +614,8 @@ static int writeback_sb_inodes(struct super_block *sb,
 	return 1;
 }
 
-static void writeback_inodes_wb(struct bdi_writeback *wb,
-				struct writeback_control *wbc)
+void writeback_inodes_wb(struct bdi_writeback *wb,
+		struct writeback_control *wbc)
 {
 	int ret = 0;
 
@@ -660,13 +660,6 @@ static void writeback_inodes_wb(struct bdi_writeback *wb,
 	/* Leave any unwritten inodes on b_io */
 }
 
-void writeback_inodes_wbc(struct writeback_control *wbc)
-{
-	struct backing_dev_info *bdi = wbc->bdi;
-
-	writeback_inodes_wb(&bdi->wb, wbc);
-}
-
 /*
  * The maximum number of pages to writeout in a single bdi flush/kupdate
  * operation.  We do this so we don't hold I_SYNC against an inode for
@@ -705,7 +698,6 @@ static long wb_writeback(struct bdi_writeback *wb,
 			 struct wb_writeback_args *args)
 {
 	struct writeback_control wbc = {
-		.bdi			= wb->bdi,
 		.sb			= args->sb,
 		.sync_mode		= args->sync_mode,
 		.older_than_this	= NULL,
-- 
cgit v1.2.3


From edadfb10ba35da7253541e4155aa92eff758ebe6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 10 Jun 2010 12:07:54 +0200
Subject: writeback: split writeback_inodes_wb

The case where we have a superblock doesn't require a loop here as we scan
over all inodes in writeback_sb_inodes. Split it out into a separate helper
to make the code simpler.  This also allows to get rid of the sb member in
struct writeback_control, which was rather out of place there.

Also update the comments in writeback_sb_inodes that explain the handling
of inodes from wrong superblocks.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/fs-writeback.c | 82 +++++++++++++++++++++++++++++++------------------------
 1 file changed, 46 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 94a602e98bb5..8cc06d5432b5 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -554,29 +554,41 @@ static bool pin_sb_for_writeback(struct super_block *sb)
 
 /*
  * Write a portion of b_io inodes which belong to @sb.
- * If @wbc->sb != NULL, then find and write all such
+ *
+ * If @only_this_sb is true, then find and write all such
  * inodes. Otherwise write only ones which go sequentially
  * in reverse order.
+ *
  * Return 1, if the caller writeback routine should be
  * interrupted. Otherwise return 0.
  */
-static int writeback_sb_inodes(struct super_block *sb,
-			       struct bdi_writeback *wb,
-			       struct writeback_control *wbc)
+static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
+		struct writeback_control *wbc, bool only_this_sb)
 {
 	while (!list_empty(&wb->b_io)) {
 		long pages_skipped;
 		struct inode *inode = list_entry(wb->b_io.prev,
 						 struct inode, i_list);
-		if (wbc->sb && sb != inode->i_sb) {
-			/* super block given and doesn't
-			   match, skip this inode */
-			redirty_tail(inode);
-			continue;
-		}
-		if (sb != inode->i_sb)
-			/* finish with this superblock */
+
+		if (inode->i_sb != sb) {
+			if (only_this_sb) {
+				/*
+				 * We only want to write back data for this
+				 * superblock, move all inodes not belonging
+				 * to it back onto the dirty list.
+				 */
+				redirty_tail(inode);
+				continue;
+			}
+
+			/*
+			 * The inode belongs to a different superblock.
+			 * Bounce back to the caller to unpin this and
+			 * pin the next superblock.
+			 */
 			return 0;
+		}
+
 		if (inode->i_state & (I_NEW | I_WILL_FREE)) {
 			requeue_io(inode);
 			continue;
@@ -629,29 +641,12 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
 						 struct inode, i_list);
 		struct super_block *sb = inode->i_sb;
 
-		if (wbc->sb) {
-			/*
-			 * We are requested to write out inodes for a specific
-			 * superblock.  This means we already have s_umount
-			 * taken by the caller which also waits for us to
-			 * complete the writeout.
-			 */
-			if (sb != wbc->sb) {
-				redirty_tail(inode);
-				continue;
-			}
-
-			WARN_ON(!rwsem_is_locked(&sb->s_umount));
-
-			ret = writeback_sb_inodes(sb, wb, wbc);
-		} else {
-			if (!pin_sb_for_writeback(sb)) {
-				requeue_io(inode);
-				continue;
-			}
-			ret = writeback_sb_inodes(sb, wb, wbc);
-			drop_super(sb);
+		if (!pin_sb_for_writeback(sb)) {
+			requeue_io(inode);
+			continue;
 		}
+		ret = writeback_sb_inodes(sb, wb, wbc, false);
+		drop_super(sb);
 
 		if (ret)
 			break;
@@ -660,6 +655,19 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
 	/* Leave any unwritten inodes on b_io */
 }
 
+static void __writeback_inodes_sb(struct super_block *sb,
+		struct bdi_writeback *wb, struct writeback_control *wbc)
+{
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+	wbc->wb_start = jiffies; /* livelock avoidance */
+	spin_lock(&inode_lock);
+	if (!wbc->for_kupdate || list_empty(&wb->b_io))
+		queue_io(wb, wbc->older_than_this);
+	writeback_sb_inodes(sb, wb, wbc, true);
+	spin_unlock(&inode_lock);
+}
+
 /*
  * The maximum number of pages to writeout in a single bdi flush/kupdate
  * operation.  We do this so we don't hold I_SYNC against an inode for
@@ -698,7 +706,6 @@ static long wb_writeback(struct bdi_writeback *wb,
 			 struct wb_writeback_args *args)
 {
 	struct writeback_control wbc = {
-		.sb			= args->sb,
 		.sync_mode		= args->sync_mode,
 		.older_than_this	= NULL,
 		.for_kupdate		= args->for_kupdate,
@@ -736,7 +743,10 @@ static long wb_writeback(struct bdi_writeback *wb,
 		wbc.more_io = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 		wbc.pages_skipped = 0;
-		writeback_inodes_wb(wb, &wbc);
+		if (args->sb)
+			__writeback_inodes_sb(args->sb, wb, &wbc);
+		else
+			writeback_inodes_wb(wb, &wbc);
 		args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 
-- 
cgit v1.2.3


From 83ba7b071f30f7c01f72518ad72d5cd203c27502 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 6 Jul 2010 08:59:53 +0200
Subject: writeback: simplify the write back thread queue

First remove items from work_list as soon as we start working on them.  This
means we don't have to track any pending or visited state and can get
rid of all the RCU magic freeing the work items - we can simply free
them once the operation has finished.  Second use a real completion for
tracking synchronous requests - if the caller sets the completion pointer
we complete it, otherwise use it as a boolean indicator that we can free
the work item directly.  Third unify struct wb_writeback_args and struct
bdi_work into a single data structure, wb_writeback_work.  Previous we
set all parameters into a struct wb_writeback_args, copied it into
struct bdi_work, copied it again on the stack to use it there.  Instead
of just allocate one structure dynamically or on the stack and use it
all the way through the stack.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
---
 fs/fs-writeback.c | 253 +++++++++++++++---------------------------------------
 1 file changed, 69 insertions(+), 184 deletions(-)

(limited to 'fs')

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 8cc06d5432b5..d5be1693ac93 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -38,43 +38,18 @@ int nr_pdflush_threads;
 /*
  * Passed into wb_writeback(), essentially a subset of writeback_control
  */
-struct wb_writeback_args {
+struct wb_writeback_work {
 	long nr_pages;
 	struct super_block *sb;
 	enum writeback_sync_modes sync_mode;
 	unsigned int for_kupdate:1;
 	unsigned int range_cyclic:1;
 	unsigned int for_background:1;
-};
 
-/*
- * Work items for the bdi_writeback threads
- */
-struct bdi_work {
 	struct list_head list;		/* pending work list */
-	struct rcu_head rcu_head;	/* for RCU free/clear of work */
-
-	unsigned long seen;		/* threads that have seen this work */
-	atomic_t pending;		/* number of threads still to do work */
-
-	struct wb_writeback_args args;	/* writeback arguments */
-
-	unsigned long state;		/* flag bits, see WS_* */
-};
-
-enum {
-	WS_INPROGRESS = 0,
-	WS_ONSTACK,
+	struct completion *done;	/* set if the caller waits */
 };
 
-static inline void bdi_work_init(struct bdi_work *work,
-				 struct wb_writeback_args *args)
-{
-	INIT_RCU_HEAD(&work->rcu_head);
-	work->args = *args;
-	__set_bit(WS_INPROGRESS, &work->state);
-}
-
 /**
  * writeback_in_progress - determine whether there is writeback in progress
  * @bdi: the device's backing_dev_info structure.
@@ -87,49 +62,11 @@ int writeback_in_progress(struct backing_dev_info *bdi)
 	return !list_empty(&bdi->work_list);
 }
 
-static void bdi_work_free(struct rcu_head *head)
-{
-	struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
-
-	clear_bit(WS_INPROGRESS, &work->state);
-	smp_mb__after_clear_bit();
-	wake_up_bit(&work->state, WS_INPROGRESS);
-
-	if (!test_bit(WS_ONSTACK, &work->state))
-		kfree(work);
-}
-
-static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
-{
-	/*
-	 * The caller has retrieved the work arguments from this work,
-	 * drop our reference. If this is the last ref, delete and free it
-	 */
-	if (atomic_dec_and_test(&work->pending)) {
-		struct backing_dev_info *bdi = wb->bdi;
-
-		spin_lock(&bdi->wb_lock);
-		list_del_rcu(&work->list);
-		spin_unlock(&bdi->wb_lock);
-
-		call_rcu(&work->rcu_head, bdi_work_free);
-	}
-}
-
-static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
+static void bdi_queue_work(struct backing_dev_info *bdi,
+		struct wb_writeback_work *work)
 {
-	work->seen = bdi->wb_mask;
-	BUG_ON(!work->seen);
-	atomic_set(&work->pending, bdi->wb_cnt);
-	BUG_ON(!bdi->wb_cnt);
-
-	/*
-	 * list_add_tail_rcu() contains the necessary barriers to
-	 * make sure the above stores are seen before the item is
-	 * noticed on the list
-	 */
 	spin_lock(&bdi->wb_lock);
-	list_add_tail_rcu(&work->list, &bdi->work_list);
+	list_add_tail(&work->list, &bdi->work_list);
 	spin_unlock(&bdi->wb_lock);
 
 	/*
@@ -146,55 +83,29 @@ static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
 	}
 }
 
-/*
- * Used for on-stack allocated work items. The caller needs to wait until
- * the wb threads have acked the work before it's safe to continue.
- */
-static void bdi_wait_on_work_done(struct bdi_work *work)
-{
-	wait_on_bit(&work->state, WS_INPROGRESS, bdi_sched_wait,
-		    TASK_UNINTERRUPTIBLE);
-}
-
-static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
-				 struct wb_writeback_args *args)
+static void
+__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
+		bool range_cyclic, bool for_background)
 {
-	struct bdi_work *work;
+	struct wb_writeback_work *work;
 
 	/*
 	 * This is WB_SYNC_NONE writeback, so if allocation fails just
 	 * wakeup the thread for old dirty data writeback
 	 */
-	work = kmalloc(sizeof(*work), GFP_ATOMIC);
-	if (work) {
-		bdi_work_init(work, args);
-		bdi_queue_work(bdi, work);
-	} else {
-		struct bdi_writeback *wb = &bdi->wb;
-
-		if (wb->task)
-			wake_up_process(wb->task);
+	work = kzalloc(sizeof(*work), GFP_ATOMIC);
+	if (!work) {
+		if (bdi->wb.task)
+			wake_up_process(bdi->wb.task);
+		return;
 	}
-}
 
-/**
- * bdi_queue_work_onstack - start and wait for writeback
- * @args: parameters to control the work queue writeback
- *
- * Description:
- *   This function initiates writeback and waits for the operation to
- *   complete. Callers must hold the sb s_umount semaphore for
- *   reading, to avoid having the super disappear before we are done.
- */
-static void bdi_queue_work_onstack(struct wb_writeback_args *args)
-{
-	struct bdi_work work;
+	work->sync_mode	= WB_SYNC_NONE;
+	work->nr_pages	= nr_pages;
+	work->range_cyclic = range_cyclic;
+	work->for_background = for_background;
 
-	bdi_work_init(&work, args);
-	__set_bit(WS_ONSTACK, &work.state);
-
-	bdi_queue_work(args->sb->s_bdi, &work);
-	bdi_wait_on_work_done(&work);
+	bdi_queue_work(bdi, work);
 }
 
 /**
@@ -210,13 +121,7 @@ static void bdi_queue_work_onstack(struct wb_writeback_args *args)
  */
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
 {
-	struct wb_writeback_args args = {
-		.sync_mode	= WB_SYNC_NONE,
-		.nr_pages	= nr_pages,
-		.range_cyclic	= 1,
-	};
-
-	bdi_alloc_queue_work(bdi, &args);
+	__bdi_start_writeback(bdi, nr_pages, true, false);
 }
 
 /**
@@ -230,13 +135,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
  */
 void bdi_start_background_writeback(struct backing_dev_info *bdi)
 {
-	struct wb_writeback_args args = {
-		.sync_mode	= WB_SYNC_NONE,
-		.nr_pages	= LONG_MAX,
-		.for_background = 1,
-		.range_cyclic	= 1,
-	};
-	bdi_alloc_queue_work(bdi, &args);
+	__bdi_start_writeback(bdi, LONG_MAX, true, true);
 }
 
 /*
@@ -703,14 +602,14 @@ static inline bool over_bground_thresh(void)
  * all dirty pages if they are all attached to "old" mappings.
  */
 static long wb_writeback(struct bdi_writeback *wb,
-			 struct wb_writeback_args *args)
+			 struct wb_writeback_work *work)
 {
 	struct writeback_control wbc = {
-		.sync_mode		= args->sync_mode,
+		.sync_mode		= work->sync_mode,
 		.older_than_this	= NULL,
-		.for_kupdate		= args->for_kupdate,
-		.for_background		= args->for_background,
-		.range_cyclic		= args->range_cyclic,
+		.for_kupdate		= work->for_kupdate,
+		.for_background		= work->for_background,
+		.range_cyclic		= work->range_cyclic,
 	};
 	unsigned long oldest_jif;
 	long wrote = 0;
@@ -730,24 +629,24 @@ static long wb_writeback(struct bdi_writeback *wb,
 		/*
 		 * Stop writeback when nr_pages has been consumed
 		 */
-		if (args->nr_pages <= 0)
+		if (work->nr_pages <= 0)
 			break;
 
 		/*
 		 * For background writeout, stop when we are below the
 		 * background dirty threshold
 		 */
-		if (args->for_background && !over_bground_thresh())
+		if (work->for_background && !over_bground_thresh())
 			break;
 
 		wbc.more_io = 0;
 		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
 		wbc.pages_skipped = 0;
-		if (args->sb)
-			__writeback_inodes_sb(args->sb, wb, &wbc);
+		if (work->sb)
+			__writeback_inodes_sb(work->sb, wb, &wbc);
 		else
 			writeback_inodes_wb(wb, &wbc);
-		args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+		work->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
 
 		/*
@@ -783,31 +682,21 @@ static long wb_writeback(struct bdi_writeback *wb,
 }
 
 /*
- * Return the next bdi_work struct that hasn't been processed by this
- * wb thread yet. ->seen is initially set for each thread that exists
- * for this device, when a thread first notices a piece of work it
- * clears its bit. Depending on writeback type, the thread will notify
- * completion on either receiving the work (WB_SYNC_NONE) or after
- * it is done (WB_SYNC_ALL).
+ * Return the next wb_writeback_work struct that hasn't been processed yet.
  */
-static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
-					   struct bdi_writeback *wb)
+static struct wb_writeback_work *
+get_next_work_item(struct backing_dev_info *bdi, struct bdi_writeback *wb)
 {
-	struct bdi_work *work, *ret = NULL;
-
-	rcu_read_lock();
-
-	list_for_each_entry_rcu(work, &bdi->work_list, list) {
-		if (!test_bit(wb->nr, &work->seen))
-			continue;
-		clear_bit(wb->nr, &work->seen);
+	struct wb_writeback_work *work = NULL;
 
-		ret = work;
-		break;
+	spin_lock(&bdi->wb_lock);
+	if (!list_empty(&bdi->work_list)) {
+		work = list_entry(bdi->work_list.next,
+				  struct wb_writeback_work, list);
+		list_del_init(&work->list);
 	}
-
-	rcu_read_unlock();
-	return ret;
+	spin_unlock(&bdi->wb_lock);
+	return work;
 }
 
 static long wb_check_old_data_flush(struct bdi_writeback *wb)
@@ -832,14 +721,14 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 
 	if (nr_pages) {
-		struct wb_writeback_args args = {
+		struct wb_writeback_work work = {
 			.nr_pages	= nr_pages,
 			.sync_mode	= WB_SYNC_NONE,
 			.for_kupdate	= 1,
 			.range_cyclic	= 1,
 		};
 
-		return wb_writeback(wb, &args);
+		return wb_writeback(wb, &work);
 	}
 
 	return 0;
@@ -851,33 +740,27 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
 long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 {
 	struct backing_dev_info *bdi = wb->bdi;
-	struct bdi_work *work;
+	struct wb_writeback_work *work;
 	long wrote = 0;
 
 	while ((work = get_next_work_item(bdi, wb)) != NULL) {
-		struct wb_writeback_args args = work->args;
-
 		/*
 		 * Override sync mode, in case we must wait for completion
+		 * because this thread is exiting now.
 		 */
 		if (force_wait)
-			work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
+			work->sync_mode = WB_SYNC_ALL;
 
-		/*
-		 * If this isn't a data integrity operation, just notify
-		 * that we have seen this work and we are now starting it.
-		 */
-		if (!test_bit(WS_ONSTACK, &work->state))
-			wb_clear_pending(wb, work);
-
-		wrote += wb_writeback(wb, &args);
+		wrote += wb_writeback(wb, work);
 
 		/*
-		 * This is a data integrity writeback, so only do the
-		 * notification when we have completed the work.
+		 * Notify the caller of completion if this is a synchronous
+		 * work item, otherwise just free it.
 		 */
-		if (test_bit(WS_ONSTACK, &work->state))
-			wb_clear_pending(wb, work);
+		if (work->done)
+			complete(work->done);
+		else
+			kfree(work);
 	}
 
 	/*
@@ -940,14 +823,9 @@ int bdi_writeback_task(struct bdi_writeback *wb)
 void wakeup_flusher_threads(long nr_pages)
 {
 	struct backing_dev_info *bdi;
-	struct wb_writeback_args args = {
-		.sync_mode	= WB_SYNC_NONE,
-	};
 
-	if (nr_pages) {
-		args.nr_pages = nr_pages;
-	} else {
-		args.nr_pages = global_page_state(NR_FILE_DIRTY) +
+	if (!nr_pages) {
+		nr_pages = global_page_state(NR_FILE_DIRTY) +
 				global_page_state(NR_UNSTABLE_NFS);
 	}
 
@@ -955,7 +833,7 @@ void wakeup_flusher_threads(long nr_pages)
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		if (!bdi_has_dirty_io(bdi))
 			continue;
-		bdi_alloc_queue_work(bdi, &args);
+		__bdi_start_writeback(bdi, nr_pages, false, false);
 	}
 	rcu_read_unlock();
 }
@@ -1164,17 +1042,20 @@ void writeback_inodes_sb(struct super_block *sb)
 {
 	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
 	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
-	struct wb_writeback_args args = {
+	DECLARE_COMPLETION_ONSTACK(done);
+	struct wb_writeback_work work = {
 		.sb		= sb,
 		.sync_mode	= WB_SYNC_NONE,
+		.done		= &done,
 	};
 
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	args.nr_pages = nr_dirty + nr_unstable +
+	work.nr_pages = nr_dirty + nr_unstable +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 
-	bdi_queue_work_onstack(&args);
+	bdi_queue_work(sb->s_bdi, &work);
+	wait_for_completion(&done);
 }
 EXPORT_SYMBOL(writeback_inodes_sb);
 
@@ -1206,16 +1087,20 @@ EXPORT_SYMBOL(writeback_inodes_sb_if_idle);
  */
 void sync_inodes_sb(struct super_block *sb)
 {
-	struct wb_writeback_args args = {
+	DECLARE_COMPLETION_ONSTACK(done);
+	struct wb_writeback_work work = {
 		.sb		= sb,
 		.sync_mode	= WB_SYNC_ALL,
 		.nr_pages	= LONG_MAX,
 		.range_cyclic	= 0,
+		.done		= &done,
 	};
 
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	bdi_queue_work_onstack(&args);
+	bdi_queue_work(sb->s_bdi, &work);
+	wait_for_completion(&done);
+
 	wait_sb_inodes(sb);
 }
 EXPORT_SYMBOL(sync_inodes_sb);
-- 
cgit v1.2.3


From b0bbb0be8f7fbf6d366b359e034c78a96c4e274d Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Thu, 8 Jul 2010 14:49:38 +0200
Subject: ceph: add kfree() to error path

We leak a "pi" on this error path.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osdmap.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index 50ce64ebd330..277f8b339577 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -568,6 +568,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
 		if (ev > CEPH_PG_POOL_VERSION) {
 			pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
 				   ev, CEPH_PG_POOL_VERSION);
+			kfree(pi);
 			goto bad;
 		}
 		__decode_pool(p, pi);
-- 
cgit v1.2.3


From a4bfb4cf11fd2211b788af59dc8a8b4394bca227 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Tue, 6 Jul 2010 14:36:06 -0700
Subject: ocfs2: When zero extending, do it by page.

ocfs2_zero_extend() does its zeroing block by block, but it calls a
function named ocfs2_write_zero_page().  Let's have
ocfs2_write_zero_page() handle the page level.  From
ocfs2_zero_extend()'s perspective, it is now page-at-a-time.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Cc: stable@kernel.org
---
 fs/ocfs2/aops.c |  30 --------------
 fs/ocfs2/file.c | 118 ++++++++++++++++++++++++++++++++++++++++----------------
 2 files changed, 84 insertions(+), 64 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 3623ca20cc18..9a5c931439bd 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -459,36 +459,6 @@ int walk_page_buffers(	handle_t *handle,
 	return ret;
 }
 
-handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
-							 struct page *page,
-							 unsigned from,
-							 unsigned to)
-{
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	handle_t *handle;
-	int ret = 0;
-
-	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-	if (IS_ERR(handle)) {
-		ret = -ENOMEM;
-		mlog_errno(ret);
-		goto out;
-	}
-
-	if (ocfs2_should_order_data(inode)) {
-		ret = ocfs2_jbd2_file_inode(handle, inode);
-		if (ret < 0)
-			mlog_errno(ret);
-	}
-out:
-	if (ret) {
-		if (!IS_ERR(handle))
-			ocfs2_commit_trans(osb, handle);
-		handle = ERR_PTR(ret);
-	}
-	return handle;
-}
-
 static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
 {
 	sector_t status;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 6a13ea64c447..4cfc976a9067 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -724,28 +724,55 @@ leave:
 	return status;
 }
 
+/*
+ * While a write will already be ordering the data, a truncate will not.
+ * Thus, we need to explicitly order the zeroed pages.
+ */
+static handle_t *ocfs2_zero_start_ordered_transaction(struct inode *inode)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	handle_t *handle = NULL;
+	int ret = 0;
+
+	if (!ocfs2_should_order_data(inode))
+		goto out;
+
+	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_jbd2_file_inode(handle, inode);
+	if (ret < 0)
+		mlog_errno(ret);
+
+out:
+	if (ret) {
+		if (!IS_ERR(handle))
+			ocfs2_commit_trans(osb, handle);
+		handle = ERR_PTR(ret);
+	}
+	return handle;
+}
+
 /* Some parts of this taken from generic_cont_expand, which turned out
  * to be too fragile to do exactly what we need without us having to
  * worry about recursive locking in ->write_begin() and ->write_end(). */
-static int ocfs2_write_zero_page(struct inode *inode,
-				 u64 size)
+static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
+				 u64 abs_to)
 {
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page;
-	unsigned long index;
-	unsigned int offset;
+	unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
 	handle_t *handle = NULL;
 	int ret;
+	unsigned zero_from, zero_to, block_start, block_end;
 
-	offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
-	/* ugh.  in prepare/commit_write, if from==to==start of block, we
-	** skip the prepare.  make sure we never send an offset for the start
-	** of a block
-	*/
-	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
-		offset++;
-	}
-	index = size >> PAGE_CACHE_SHIFT;
+	BUG_ON(abs_from >= abs_to);
+	BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT));
+	BUG_ON(abs_from & (inode->i_blkbits - 1));
 
 	page = grab_cache_page(mapping, index);
 	if (!page) {
@@ -754,31 +781,51 @@ static int ocfs2_write_zero_page(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out_unlock;
-	}
+	/* Get the offsets within the page that we want to zero */
+	zero_from = abs_from & (PAGE_CACHE_SIZE - 1);
+	zero_to = abs_to & (PAGE_CACHE_SIZE - 1);
+	if (!zero_to)
+		zero_to = PAGE_CACHE_SIZE;
 
-	if (ocfs2_should_order_data(inode)) {
-		handle = ocfs2_start_walk_page_trans(inode, page, offset,
-						     offset);
-		if (IS_ERR(handle)) {
-			ret = PTR_ERR(handle);
-			handle = NULL;
+	/* We know that zero_from is block aligned */
+	for (block_start = zero_from; block_start < zero_to;
+	     block_start = block_end) {
+		block_end = block_start + (1 << inode->i_blkbits);
+
+		/*
+		 * block_start is block-aligned.  Bump it by one to
+		 * force ocfs2_{prepare,commit}_write() to zero the
+		 * whole block.
+		 */
+		ret = ocfs2_prepare_write_nolock(inode, page,
+						 block_start + 1,
+						 block_start + 1);
+		if (ret < 0) {
+			mlog_errno(ret);
 			goto out_unlock;
 		}
-	}
 
-	/* must not update i_size! */
-	ret = block_commit_write(page, offset, offset);
-	if (ret < 0)
-		mlog_errno(ret);
-	else
-		ret = 0;
+		if (!handle) {
+			handle = ocfs2_zero_start_ordered_transaction(inode);
+			if (IS_ERR(handle)) {
+				ret = PTR_ERR(handle);
+				handle = NULL;
+				break;
+			}
+		}
+
+		/* must not update i_size! */
+		ret = block_commit_write(page, block_start + 1,
+					 block_start + 1);
+		if (ret < 0)
+			mlog_errno(ret);
+		else
+			ret = 0;
+	}
 
 	if (handle)
 		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+
 out_unlock:
 	unlock_page(page);
 	page_cache_release(page);
@@ -790,18 +837,21 @@ static int ocfs2_zero_extend(struct inode *inode,
 			     u64 zero_to_size)
 {
 	int ret = 0;
-	u64 start_off;
+	u64 start_off, next_off;
 	struct super_block *sb = inode->i_sb;
 
 	start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
 	while (start_off < zero_to_size) {
-		ret = ocfs2_write_zero_page(inode, start_off);
+		next_off = (start_off & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
+		if (next_off > zero_to_size)
+			next_off = zero_to_size;
+		ret = ocfs2_write_zero_page(inode, start_off, next_off);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out;
 		}
 
-		start_off += sb->s_blocksize;
+		start_off = next_off;
 
 		/*
 		 * Very large extends have the potential to lock up
-- 
cgit v1.2.3


From 5693486bad2bc2ac585a2c24f7e2f3964b478df9 Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Thu, 1 Jul 2010 15:13:31 -0700
Subject: ocfs2: Zero the tail cluster when extending past i_size.

ocfs2's allocation unit is the cluster.  This can be larger than a block
or even a memory page.  This means that a file may have many blocks in
its last extent that are beyond the block containing i_size.  There also
may be more unwritten extents after that.

When ocfs2 grows a file, it zeros the entire cluster in order to ensure
future i_size growth will see cleared blocks.  Unfortunately,
block_write_full_page() drops the pages past i_size.  This means that
ocfs2 is actually leaking garbage data into the tail end of that last
cluster.  This is a bug.

We adjust ocfs2_write_begin_nolock() and ocfs2_extend_file() to detect
when a write or truncate is past i_size.  They will use
ocfs2_zero_extend() to ensure the data is properly zeroed.

Older versions of ocfs2_zero_extend() simply zeroed every block between
i_size and the zeroing position.  This presumes three things:

1) There is allocation for all of these blocks.
2) The extents are not unwritten.
3) The extents are not refcounted.

(1) and (2) hold true for non-sparse filesystems, which used to be the
only users of ocfs2_zero_extend().  (3) is another bug.

Since we're now using ocfs2_zero_extend() for sparse filesystems as
well, we teach ocfs2_zero_extend() to check every extent between
i_size and the zeroing position.  If the extent is unwritten, it is
ignored.  If it is refcounted, it is CoWed.  Then it is zeroed.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Cc: stable@kernel.org
---
 fs/ocfs2/aops.c         |  42 ++++++----
 fs/ocfs2/file.c         | 201 +++++++++++++++++++++++++++++++++++++++---------
 fs/ocfs2/file.h         |   6 +-
 fs/ocfs2/quota_global.c |   2 +-
 fs/ocfs2/quota_local.c  |   4 +-
 fs/ocfs2/refcounttree.c |   6 ++
 6 files changed, 207 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 9a5c931439bd..742893ea7390 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -196,15 +196,14 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
 			dump_stack();
 			goto bail;
 		}
-
-		past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
-		mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
-		     (unsigned long long)past_eof);
-
-		if (create && (iblock >= past_eof))
-			set_buffer_new(bh_result);
 	}
 
+	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
+	mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
+	     (unsigned long long)past_eof);
+	if (create && (iblock >= past_eof))
+		set_buffer_new(bh_result);
+
 bail:
 	if (err < 0)
 		err = -EIO;
@@ -1590,21 +1589,20 @@ out:
  * write path can treat it as an non-allocating write, which has no
  * special case code for sparse/nonsparse files.
  */
-static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
-					unsigned len,
+static int ocfs2_expand_nonsparse_inode(struct inode *inode,
+					struct buffer_head *di_bh,
+					loff_t pos, unsigned len,
 					struct ocfs2_write_ctxt *wc)
 {
 	int ret;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	loff_t newsize = pos + len;
 
-	if (ocfs2_sparse_alloc(osb))
-		return 0;
+	BUG_ON(ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
 
 	if (newsize <= i_size_read(inode))
 		return 0;
 
-	ret = ocfs2_extend_no_holes(inode, newsize, pos);
+	ret = ocfs2_extend_no_holes(inode, di_bh, newsize, pos);
 	if (ret)
 		mlog_errno(ret);
 
@@ -1614,6 +1612,18 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos,
 	return ret;
 }
 
+static int ocfs2_zero_tail(struct inode *inode, struct buffer_head *di_bh,
+			   loff_t pos)
+{
+	int ret = 0;
+
+	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)));
+	if (pos > i_size_read(inode))
+		ret = ocfs2_zero_extend(inode, di_bh, pos);
+
+	return ret;
+}
+
 int ocfs2_write_begin_nolock(struct address_space *mapping,
 			     loff_t pos, unsigned len, unsigned flags,
 			     struct page **pagep, void **fsdata,
@@ -1649,7 +1659,11 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 		}
 	}
 
-	ret = ocfs2_expand_nonsparse_inode(inode, pos, len, wc);
+	if (ocfs2_sparse_alloc(osb))
+		ret = ocfs2_zero_tail(inode, di_bh, pos);
+	else
+		ret = ocfs2_expand_nonsparse_inode(inode, di_bh, pos, len,
+						   wc);
 	if (ret) {
 		mlog_errno(ret);
 		goto out;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4cfc976a9067..ac15911b31c4 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -787,6 +787,11 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 	if (!zero_to)
 		zero_to = PAGE_CACHE_SIZE;
 
+	mlog(0,
+	     "abs_from = %llu, abs_to = %llu, index = %lu, zero_from = %u, zero_to = %u\n",
+	     (unsigned long long)abs_from, (unsigned long long)abs_to,
+	     index, zero_from, zero_to);
+
 	/* We know that zero_from is block aligned */
 	for (block_start = zero_from; block_start < zero_to;
 	     block_start = block_end) {
@@ -833,25 +838,114 @@ out:
 	return ret;
 }
 
-static int ocfs2_zero_extend(struct inode *inode,
-			     u64 zero_to_size)
+/*
+ * Find the next range to zero.  We do this in terms of bytes because
+ * that's what ocfs2_zero_extend() wants, and it is dealing with the
+ * pagecache.  We may return multiple extents.
+ *
+ * zero_start and zero_end are ocfs2_zero_extend()s current idea of what
+ * needs to be zeroed.  range_start and range_end return the next zeroing
+ * range.  A subsequent call should pass the previous range_end as its
+ * zero_start.  If range_end is 0, there's nothing to do.
+ *
+ * Unwritten extents are skipped over.  Refcounted extents are CoWd.
+ */
+static int ocfs2_zero_extend_get_range(struct inode *inode,
+				       struct buffer_head *di_bh,
+				       u64 zero_start, u64 zero_end,
+				       u64 *range_start, u64 *range_end)
 {
-	int ret = 0;
-	u64 start_off, next_off;
-	struct super_block *sb = inode->i_sb;
+	int rc = 0, needs_cow = 0;
+	u32 p_cpos, zero_clusters = 0;
+	u32 zero_cpos =
+		zero_start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+	u32 last_cpos = ocfs2_clusters_for_bytes(inode->i_sb, zero_end);
+	unsigned int num_clusters = 0;
+	unsigned int ext_flags = 0;
 
-	start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
-	while (start_off < zero_to_size) {
-		next_off = (start_off & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
-		if (next_off > zero_to_size)
-			next_off = zero_to_size;
-		ret = ocfs2_write_zero_page(inode, start_off, next_off);
-		if (ret < 0) {
-			mlog_errno(ret);
+	while (zero_cpos < last_cpos) {
+		rc = ocfs2_get_clusters(inode, zero_cpos, &p_cpos,
+					&num_clusters, &ext_flags);
+		if (rc) {
+			mlog_errno(rc);
+			goto out;
+		}
+
+		if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) {
+			zero_clusters = num_clusters;
+			if (ext_flags & OCFS2_EXT_REFCOUNTED)
+				needs_cow = 1;
+			break;
+		}
+
+		zero_cpos += num_clusters;
+	}
+	if (!zero_clusters) {
+		*range_end = 0;
+		goto out;
+	}
+
+	while ((zero_cpos + zero_clusters) < last_cpos) {
+		rc = ocfs2_get_clusters(inode, zero_cpos + zero_clusters,
+					&p_cpos, &num_clusters,
+					&ext_flags);
+		if (rc) {
+			mlog_errno(rc);
+			goto out;
+		}
+
+		if (!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN))
+			break;
+		if (ext_flags & OCFS2_EXT_REFCOUNTED)
+			needs_cow = 1;
+		zero_clusters += num_clusters;
+	}
+	if ((zero_cpos + zero_clusters) > last_cpos)
+		zero_clusters = last_cpos - zero_cpos;
+
+	if (needs_cow) {
+		rc = ocfs2_refcount_cow(inode, di_bh, zero_cpos, zero_clusters,
+					UINT_MAX);
+		if (rc) {
+			mlog_errno(rc);
 			goto out;
 		}
+	}
 
-		start_off = next_off;
+	*range_start = ocfs2_clusters_to_bytes(inode->i_sb, zero_cpos);
+	*range_end = ocfs2_clusters_to_bytes(inode->i_sb,
+					     zero_cpos + zero_clusters);
+
+out:
+	return rc;
+}
+
+/*
+ * Zero one range returned from ocfs2_zero_extend_get_range().  The caller
+ * has made sure that the entire range needs zeroing.
+ */
+static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
+				   u64 range_end)
+{
+	int rc = 0;
+	u64 next_pos;
+	u64 zero_pos = range_start;
+
+	mlog(0, "range_start = %llu, range_end = %llu\n",
+	     (unsigned long long)range_start,
+	     (unsigned long long)range_end);
+	BUG_ON(range_start >= range_end);
+
+	while (zero_pos < range_end) {
+		next_pos = (zero_pos & PAGE_CACHE_MASK) + PAGE_CACHE_SIZE;
+		if (next_pos > range_end)
+			next_pos = range_end;
+		rc = ocfs2_write_zero_page(inode, zero_pos, next_pos);
+		if (rc < 0) {
+			mlog_errno(rc);
+			break;
+		}
+		zero_pos = next_pos;
 
 		/*
 		 * Very large extends have the potential to lock up
@@ -860,16 +954,63 @@ static int ocfs2_zero_extend(struct inode *inode,
 		cond_resched();
 	}
 
-out:
+	return rc;
+}
+
+int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
+		      loff_t zero_to_size)
+{
+	int ret = 0;
+	u64 zero_start, range_start = 0, range_end = 0;
+	struct super_block *sb = inode->i_sb;
+
+	zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
+	mlog(0, "zero_start %llu for i_size %llu\n",
+	     (unsigned long long)zero_start,
+	     (unsigned long long)i_size_read(inode));
+	while (zero_start < zero_to_size) {
+		ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
+						  zero_to_size,
+						  &range_start,
+						  &range_end);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+		if (!range_end)
+			break;
+		/* Trim the ends */
+		if (range_start < zero_start)
+			range_start = zero_start;
+		if (range_end > zero_to_size)
+			range_end = zero_to_size;
+
+		ret = ocfs2_zero_extend_range(inode, range_start,
+					      range_end);
+		if (ret) {
+			mlog_errno(ret);
+			break;
+		}
+		zero_start = range_end;
+	}
+
 	return ret;
 }
 
-int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
+int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
+			  u64 new_i_size, u64 zero_to)
 {
 	int ret;
 	u32 clusters_to_add;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
+	/*
+	 * Only quota files call this without a bh, and they can't be
+	 * refcounted.
+	 */
+	BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL));
+	BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE));
+
 	clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size);
 	if (clusters_to_add < oi->ip_clusters)
 		clusters_to_add = 0;
@@ -890,7 +1031,7 @@ int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to)
 	 * still need to zero the area between the old i_size and the
 	 * new i_size.
 	 */
-	ret = ocfs2_zero_extend(inode, zero_to);
+	ret = ocfs2_zero_extend(inode, di_bh, zero_to);
 	if (ret < 0)
 		mlog_errno(ret);
 
@@ -912,27 +1053,15 @@ static int ocfs2_extend_file(struct inode *inode,
 		goto out;
 
 	if (i_size_read(inode) == new_i_size)
-  		goto out;
+		goto out;
 	BUG_ON(new_i_size < i_size_read(inode));
 
-	/*
-	 * Fall through for converting inline data, even if the fs
-	 * supports sparse files.
-	 *
-	 * The check for inline data here is legal - nobody can add
-	 * the feature since we have i_mutex. We must check it again
-	 * after acquiring ip_alloc_sem though, as paths like mmap
-	 * might have raced us to converting the inode to extents.
-	 */
-	if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
-	    && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
-		goto out_update_size;
-
 	/*
 	 * The alloc sem blocks people in read/write from reading our
 	 * allocation until we're done changing it. We depend on
 	 * i_mutex to block other extend/truncate calls while we're
-	 * here.
+	 * here.  We even have to hold it for sparse files because there
+	 * might be some tail zeroing.
 	 */
 	down_write(&oi->ip_alloc_sem);
 
@@ -949,14 +1078,16 @@ static int ocfs2_extend_file(struct inode *inode,
 		ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
 		if (ret) {
 			up_write(&oi->ip_alloc_sem);
-
 			mlog_errno(ret);
 			goto out;
 		}
 	}
 
-	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
-		ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size);
+	if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+		ret = ocfs2_zero_extend(inode, di_bh, new_i_size);
+	else
+		ret = ocfs2_extend_no_holes(inode, di_bh, new_i_size,
+					    new_i_size);
 
 	up_write(&oi->ip_alloc_sem);
 
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index d66cf4f7c70e..97bf761c9e7c 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -54,8 +54,10 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
 int ocfs2_simple_size_update(struct inode *inode,
 			     struct buffer_head *di_bh,
 			     u64 new_i_size);
-int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
-			  u64 zero_to);
+int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
+			  u64 new_i_size, u64 zero_to);
+int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
+		      loff_t zero_to);
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
 int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		  struct kstat *stat);
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 2bb35fe00511..4607923eb24c 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -775,7 +775,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
 		 * locking allocators ranks above a transaction start
 		 */
 		WARN_ON(journal_current_handle());
-		status = ocfs2_extend_no_holes(gqinode,
+		status = ocfs2_extend_no_holes(gqinode, NULL,
 			gqinode->i_size + (need_alloc << sb->s_blocksize_bits),
 			gqinode->i_size);
 		if (status < 0)
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 8bd70d4d184d..dc78764ccc4c 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -971,7 +971,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
 	u64 p_blkno;
 
 	/* We are protected by dqio_sem so no locking needed */
-	status = ocfs2_extend_no_holes(lqinode,
+	status = ocfs2_extend_no_holes(lqinode, NULL,
 				       lqinode->i_size + 2 * sb->s_blocksize,
 				       lqinode->i_size);
 	if (status < 0) {
@@ -1114,7 +1114,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
 		return ocfs2_local_quota_add_chunk(sb, type, offset);
 
 	/* We are protected by dqio_sem so no locking needed */
-	status = ocfs2_extend_no_holes(lqinode,
+	status = ocfs2_extend_no_holes(lqinode, NULL,
 				       lqinode->i_size + sb->s_blocksize,
 				       lqinode->i_size);
 	if (status < 0) {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 4793f36f6518..32949df10694 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4166,6 +4166,12 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
 	struct inode *inode = old_dentry->d_inode;
 	struct buffer_head *new_bh = NULL;
 
+	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
+		ret = -EINVAL;
+		mlog_errno(ret);
+		goto out;
+	}
+
 	ret = filemap_fdatawrite(inode->i_mapping);
 	if (ret) {
 		mlog_errno(ret);
-- 
cgit v1.2.3


From d06dbaf6c2c7187938f3f6745d9e4938a2d0ec47 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 8 Jul 2010 10:47:16 -0700
Subject: ceph: fix printing of ipv6 addrs

The buffer was too small.  Make it bigger, use snprintf(), put brackets
around the ipv6 address to avoid mixing it up with the :port, and use the
ever-so-handy %pI[46] formats.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 24 ++++++------------------
 1 file changed, 6 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 9692d08e2f88..e8c5a2d0e88f 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -43,7 +43,8 @@ static void ceph_fault(struct ceph_connection *con);
  * nicely render a sockaddr as a string.
  */
 #define MAX_ADDR_STR 20
-static char addr_str[MAX_ADDR_STR][40];
+#define MAX_ADDR_STR_LEN 60
+static char addr_str[MAX_ADDR_STR][MAX_ADDR_STR_LEN];
 static DEFINE_SPINLOCK(addr_str_lock);
 static int last_addr_str;
 
@@ -52,7 +53,6 @@ const char *pr_addr(const struct sockaddr_storage *ss)
 	int i;
 	char *s;
 	struct sockaddr_in *in4 = (void *)ss;
-	unsigned char *quad = (void *)&in4->sin_addr.s_addr;
 	struct sockaddr_in6 *in6 = (void *)ss;
 
 	spin_lock(&addr_str_lock);
@@ -64,25 +64,13 @@ const char *pr_addr(const struct sockaddr_storage *ss)
 
 	switch (ss->ss_family) {
 	case AF_INET:
-		sprintf(s, "%u.%u.%u.%u:%u",
-			(unsigned int)quad[0],
-			(unsigned int)quad[1],
-			(unsigned int)quad[2],
-			(unsigned int)quad[3],
-			(unsigned int)ntohs(in4->sin_port));
+		snprintf(s, MAX_ADDR_STR_LEN, "%pI4:%u", &in4->sin_addr,
+			 (unsigned int)ntohs(in4->sin_port));
 		break;
 
 	case AF_INET6:
-		sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u",
-			in6->sin6_addr.s6_addr16[0],
-			in6->sin6_addr.s6_addr16[1],
-			in6->sin6_addr.s6_addr16[2],
-			in6->sin6_addr.s6_addr16[3],
-			in6->sin6_addr.s6_addr16[4],
-			in6->sin6_addr.s6_addr16[5],
-			in6->sin6_addr.s6_addr16[6],
-			in6->sin6_addr.s6_addr16[7],
-			(unsigned int)ntohs(in6->sin6_port));
+		snprintf(s, MAX_ADDR_STR_LEN, "[%pI6c]:%u", &in6->sin6_addr,
+			 (unsigned int)ntohs(in6->sin6_port));
 		break;
 
 	default:
-- 
cgit v1.2.3


From 39139f64e14684cf2370770deb79d929d27cfd9b Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 8 Jul 2010 09:54:52 -0700
Subject: ceph: fix parsing of ipv6 addresses

Check for brackets around the ipv6 address to avoid ambiguity with the port
number.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index e8c5a2d0e88f..3ddef1556457 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -997,19 +997,32 @@ int ceph_parse_ips(const char *c, const char *end,
 		struct sockaddr_in *in4 = (void *)ss;
 		struct sockaddr_in6 *in6 = (void *)ss;
 		int port;
+		char delim = ',';
+
+		if (*p == '[') {
+			delim = ']';
+			p++;
+		}
 
 		memset(ss, 0, sizeof(*ss));
 		if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
-			     ',', &ipend)) {
+			     delim, &ipend))
 			ss->ss_family = AF_INET;
-		} else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
-				    ',', &ipend)) {
+		else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
+				  delim, &ipend))
 			ss->ss_family = AF_INET6;
-		} else {
+		else
 			goto bad;
-		}
 		p = ipend;
 
+		if (delim == ']') {
+			if (*p != ']') {
+				dout("missing matching ']'\n");
+				goto bad;
+			}
+			p++;
+		}
+
 		/* port? */
 		if (p < end && *p == ':') {
 			port = 0;
@@ -1043,7 +1056,7 @@ int ceph_parse_ips(const char *c, const char *end,
 	return 0;
 
 bad:
-	pr_err("parse_ips bad ip '%s'\n", c);
+	pr_err("parse_ips bad ip '%.*s'\n", (int)(end - c), c);
 	return -EINVAL;
 }
 
-- 
cgit v1.2.3


From f91d3471ccf1ca9a795f46c94b1ded8dd219940c Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 1 Jul 2010 15:18:31 -0700
Subject: ceph: fix creation of ipv6 sockets

Use the address family from the peer address instead of assuming IPv4.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/messenger.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 3ddef1556457..15167b2daa55 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -203,12 +203,13 @@ static void set_sock_callbacks(struct socket *sock,
  */
 static struct socket *ceph_tcp_connect(struct ceph_connection *con)
 {
-	struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr;
+	struct sockaddr_storage *paddr = &con->peer_addr.in_addr;
 	struct socket *sock;
 	int ret;
 
 	BUG_ON(con->sock);
-	ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+	ret = sock_create_kern(con->peer_addr.in_addr.ss_family, SOCK_STREAM,
+			       IPPROTO_TCP, &sock);
 	if (ret)
 		return ERR_PTR(ret);
 	con->sock = sock;
@@ -222,7 +223,8 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con)
 
 	dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
 
-	ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK);
+	ret = sock->ops->connect(sock, (struct sockaddr *)paddr, sizeof(*paddr),
+				 O_NONBLOCK);
 	if (ret == -EINPROGRESS) {
 		dout("connect %s EINPROGRESS sk_state = %u\n",
 		     pr_addr(&con->peer_addr.in_addr),
-- 
cgit v1.2.3


From 693c241a5f6aa01417f5f4caf9f82e60e316398d Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 2 Jul 2010 17:20:27 -0700
Subject: ocfs2: No need to zero pages past i_size.

When ocfs2 fills a hole, it does so by allocating clusters.  When a
cluster is larger than the write, ocfs2 must zero the portions of the
cluster outside of the write.  If the clustersize is smaller than a
pagecache page, this is handled by the normal pagecache mechanisms, but
when the clustersize is larger than a page, ocfs2's write code will zero
the pages adjacent to the write.  This makes sure the entire cluster is
zeroed correctly.

Currently ocfs2 behaves exactly the same when writing past i_size.
However, this means ocfs2 is writing zeroed pages for portions of a new
cluster that are beyond i_size.  The page writeback code isn't expecting
this.  It treats all pages past the one containing i_size as left behind
due to a previous truncate operation.

Thankfully, ocfs2 calculates the number of pages it will be working on
up front.  The rest of the write code merely honors the original
calculation.  We can simply trim the number of pages to only cover the
actual file data.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
Cc: stable@kernel.org
---
 fs/ocfs2/aops.c | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 742893ea7390..356e976772bf 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1100,23 +1100,37 @@ out:
  */
 static int ocfs2_grab_pages_for_write(struct address_space *mapping,
 				      struct ocfs2_write_ctxt *wc,
-				      u32 cpos, loff_t user_pos, int new,
+				      u32 cpos, loff_t user_pos,
+				      unsigned user_len, int new,
 				      struct page *mmap_page)
 {
 	int ret = 0, i;
-	unsigned long start, target_index, index;
+	unsigned long start, target_index, end_index, index;
 	struct inode *inode = mapping->host;
+	loff_t last_byte;
 
 	target_index = user_pos >> PAGE_CACHE_SHIFT;
 
 	/*
 	 * Figure out how many pages we'll be manipulating here. For
 	 * non allocating write, we just change the one
-	 * page. Otherwise, we'll need a whole clusters worth.
+	 * page. Otherwise, we'll need a whole clusters worth.  If we're
+	 * writing past i_size, we only need enough pages to cover the
+	 * last page of the write.
 	 */
 	if (new) {
 		wc->w_num_pages = ocfs2_pages_per_cluster(inode->i_sb);
 		start = ocfs2_align_clusters_to_page_index(inode->i_sb, cpos);
+		/*
+		 * We need the index *past* the last page we could possibly
+		 * touch.  This is the page past the end of the write or
+		 * i_size, whichever is greater.
+		 */
+		last_byte = max(user_pos + user_len, i_size_read(inode));
+		BUG_ON(last_byte < 1);
+		end_index = ((last_byte - 1) >> PAGE_CACHE_SHIFT) + 1;
+		if ((start + wc->w_num_pages) > end_index)
+			wc->w_num_pages = end_index - start;
 	} else {
 		wc->w_num_pages = 1;
 		start = target_index;
@@ -1773,7 +1787,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
 	 * that we can zero and flush if we error after adding the
 	 * extent.
 	 */
-	ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos,
+	ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
 					 cluster_of_pages, mmap_page);
 	if (ret) {
 		mlog_errno(ret);
-- 
cgit v1.2.3


From f471c9df922a80ca9af1d9a490b4aab3f990ec19 Mon Sep 17 00:00:00 2001
From: Wengang Wang <wen.gang.wang@oracle.com>
Date: Wed, 30 Jun 2010 20:23:30 +0800
Subject: ocfs2/dlm: don't access beyond bitmap size

dlm->recovery_map is defined as
	unsigned long recovery_map[BITS_TO_LONGS(O2NM_MAX_NODES)];

We should treat O2NM_MAX_NODES as the bit map size in bits.
This patches fixes a bit operation that takes O2NM_MAX_NODES + 1 as bitmap size.

Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dlm/dlmrecovery.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index f8b75ce4be70..9dfaac73b36d 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -463,7 +463,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
 	if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
 		int bit;
 
-		bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0);
+		bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES, 0);
 		if (bit >= O2NM_MAX_NODES || bit < 0)
 			dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
 		else
-- 
cgit v1.2.3


From 0a463b74e7e6856b24e613de2b85237c6e11890b Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Thu, 8 Jul 2010 11:11:11 +0800
Subject: ocfs2: Remove the redundant cpu_to_le64.

In ocfs2_block_group_alloc, we set c_blkno by bg->bg_blkno.
But actually bg->bg_blkno is already changed to little endian
in ocfs2_block_group_fill. So remove the extra cpu_to_le64.

Reported-by: Marcos Matsunaga <Marcos.Matsunaga@oracle.com>
Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/suballoc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index f4c2a9eb8c4d..a8e6a95a353f 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -741,7 +741,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 		     le16_to_cpu(bg->bg_free_bits_count));
 	le32_add_cpu(&cl->cl_recs[alloc_rec].c_total,
 		     le16_to_cpu(bg->bg_bits));
-	cl->cl_recs[alloc_rec].c_blkno  = cpu_to_le64(bg->bg_blkno);
+	cl->cl_recs[alloc_rec].c_blkno = bg->bg_blkno;
 	if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
 		le16_add_cpu(&cl->cl_next_free_rec, 1);
 
-- 
cgit v1.2.3


From a78f9f4668949a6588b8872f162e86685c63d023 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Fri, 9 Jul 2010 14:53:11 +0800
Subject: ocfs2: make xattr extension work with new local alloc reservation.

The old ocfs2_xattr_extent_allocation is too optimistic about
the clusters we can get. So actually if the file system is
too fragmented, ocfs2_add_clusters_in_btree will return us
with EGAIN and we need to allocate clusters once again.

So this patch change it to a while loop so that we can allocate
clusters until we reach clusters_to_add.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
Cc: stable@kernel.org
---
 fs/ocfs2/xattr.c | 74 ++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 45 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index e97b34842cfe..2be19083713f 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -709,7 +709,7 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
 					 struct ocfs2_xattr_value_buf *vb,
 					 struct ocfs2_xattr_set_ctxt *ctxt)
 {
-	int status = 0;
+	int status = 0, credits;
 	handle_t *handle = ctxt->handle;
 	enum ocfs2_alloc_restarted why;
 	u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
@@ -719,38 +719,54 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
 
 	ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
 
-	status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
-			      OCFS2_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
+	while (clusters_to_add) {
+		status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
+				       OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			break;
+		}
 
-	prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
-	status = ocfs2_add_clusters_in_btree(handle,
-					     &et,
-					     &logical_start,
-					     clusters_to_add,
-					     0,
-					     ctxt->data_ac,
-					     ctxt->meta_ac,
-					     &why);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
+		prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
+		status = ocfs2_add_clusters_in_btree(handle,
+						     &et,
+						     &logical_start,
+						     clusters_to_add,
+						     0,
+						     ctxt->data_ac,
+						     ctxt->meta_ac,
+						     &why);
+		if ((status < 0) && (status != -EAGAIN)) {
+			if (status != -ENOSPC)
+				mlog_errno(status);
+			break;
+		}
 
-	ocfs2_journal_dirty(handle, vb->vb_bh);
+		ocfs2_journal_dirty(handle, vb->vb_bh);
 
-	clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters;
+		clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) -
+					 prev_clusters;
 
-	/*
-	 * We should have already allocated enough space before the transaction,
-	 * so no need to restart.
-	 */
-	BUG_ON(why != RESTART_NONE || clusters_to_add);
-
-leave:
+		if (why != RESTART_NONE && clusters_to_add) {
+			/*
+			 * We can only fail in case the alloc file doesn't give
+			 * up enough clusters.
+			 */
+			BUG_ON(why == RESTART_META);
+
+			mlog(0, "restarting xattr value extension for %u"
+			     " clusters,.\n", clusters_to_add);
+			credits = ocfs2_calc_extend_credits(inode->i_sb,
+							    &vb->vb_xv->xr_list,
+							    clusters_to_add);
+			status = ocfs2_extend_trans(handle, credits);
+			if (status < 0) {
+				status = -ENOMEM;
+				mlog_errno(status);
+				break;
+			}
+		}
+	}
 
 	return status;
 }
-- 
cgit v1.2.3


From 121a39bb00b421211f4f590c440a8f636d3ae807 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Fri, 9 Jul 2010 14:53:12 +0800
Subject: ocfs2: Make xattr reflink work with new local alloc reservation.

The new reservation code in local alloc has add the limitation
that the caller should handle the case that the local alloc
doesn't give use enough contiguous clusters. It make the old
xattr reflink code broken.

So this patch udpate the xattr reflink code so that it can
handle the case that local alloc give us one cluster at a time.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/xattr.c | 126 +++++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 90 insertions(+), 36 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 2be19083713f..d03469f61801 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -6804,16 +6804,15 @@ out:
 	return ret;
 }
 
-static int ocfs2_reflink_xattr_buckets(handle_t *handle,
+static int ocfs2_reflink_xattr_bucket(handle_t *handle,
 				u64 blkno, u64 new_blkno, u32 clusters,
+				u32 *cpos, int num_buckets,
 				struct ocfs2_alloc_context *meta_ac,
 				struct ocfs2_alloc_context *data_ac,
 				struct ocfs2_reflink_xattr_tree_args *args)
 {
 	int i, j, ret = 0;
 	struct super_block *sb = args->reflink->old_inode->i_sb;
-	u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb));
-	u32 num_buckets = clusters * bpc;
 	int bpb = args->old_bucket->bu_blocks;
 	struct ocfs2_xattr_value_buf vb = {
 		.vb_access = ocfs2_journal_access,
@@ -6832,14 +6831,6 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
 			break;
 		}
 
-		/*
-		 * The real bucket num in this series of blocks is stored
-		 * in the 1st bucket.
-		 */
-		if (i == 0)
-			num_buckets = le16_to_cpu(
-				bucket_xh(args->old_bucket)->xh_num_buckets);
-
 		ret = ocfs2_xattr_bucket_journal_access(handle,
 						args->new_bucket,
 						OCFS2_JOURNAL_ACCESS_CREATE);
@@ -6853,6 +6844,18 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
 			       bucket_block(args->old_bucket, j),
 			       sb->s_blocksize);
 
+		/*
+		 * Record the start cpos so that we can use it to initialize
+		 * our xattr tree we also set the xh_num_bucket for the new
+		 * bucket.
+		 */
+		if (i == 0) {
+			*cpos = le32_to_cpu(bucket_xh(args->new_bucket)->
+					    xh_entries[0].xe_name_hash);
+			bucket_xh(args->new_bucket)->xh_num_buckets =
+				cpu_to_le16(num_buckets);
+		}
+
 		ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
 
 		ret = ocfs2_reflink_xattr_header(handle, args->reflink,
@@ -6882,6 +6885,7 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
 		}
 
 		ocfs2_xattr_bucket_journal_dirty(handle, args->new_bucket);
+
 		ocfs2_xattr_bucket_relse(args->old_bucket);
 		ocfs2_xattr_bucket_relse(args->new_bucket);
 	}
@@ -6890,6 +6894,75 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
 	ocfs2_xattr_bucket_relse(args->new_bucket);
 	return ret;
 }
+
+static int ocfs2_reflink_xattr_buckets(handle_t *handle,
+				struct inode *inode,
+				struct ocfs2_reflink_xattr_tree_args *args,
+				struct ocfs2_extent_tree *et,
+				struct ocfs2_alloc_context *meta_ac,
+				struct ocfs2_alloc_context *data_ac,
+				u64 blkno, u32 cpos, u32 len)
+{
+	int ret, first_inserted = 0;
+	u32 p_cluster, num_clusters, reflink_cpos = 0;
+	u64 new_blkno;
+	unsigned int num_buckets, reflink_buckets;
+	unsigned int bpc =
+		ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
+
+	ret = ocfs2_read_xattr_bucket(args->old_bucket, blkno);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	num_buckets = le16_to_cpu(bucket_xh(args->old_bucket)->xh_num_buckets);
+	ocfs2_xattr_bucket_relse(args->old_bucket);
+
+	while (len && num_buckets) {
+		ret = ocfs2_claim_clusters(handle, data_ac,
+					   1, &p_cluster, &num_clusters);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		new_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+		reflink_buckets = min(num_buckets, bpc * num_clusters);
+
+		ret = ocfs2_reflink_xattr_bucket(handle, blkno,
+						 new_blkno, num_clusters,
+						 &reflink_cpos, reflink_buckets,
+						 meta_ac, data_ac, args);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * For the 1st allocated cluster, we make it use the same cpos
+		 * so that the xattr tree looks the same as the original one
+		 * in the most case.
+		 */
+		if (!first_inserted) {
+			reflink_cpos = cpos;
+			first_inserted = 1;
+		}
+		ret = ocfs2_insert_extent(handle, et, reflink_cpos, new_blkno,
+					  num_clusters, 0, meta_ac);
+		if (ret)
+			mlog_errno(ret);
+
+		mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
+		     (unsigned long long)new_blkno, num_clusters, reflink_cpos);
+
+		len -= num_clusters;
+		blkno += ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
+		num_buckets -= reflink_buckets;
+	}
+out:
+	return ret;
+}
+
 /*
  * Create the same xattr extent record in the new inode's xattr tree.
  */
@@ -6901,8 +6974,6 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
 				   void *para)
 {
 	int ret, credits = 0;
-	u32 p_cluster, num_clusters;
-	u64 new_blkno;
 	handle_t *handle;
 	struct ocfs2_reflink_xattr_tree_args *args =
 			(struct ocfs2_reflink_xattr_tree_args *)para;
@@ -6911,6 +6982,9 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
 	struct ocfs2_alloc_context *data_ac = NULL;
 	struct ocfs2_extent_tree et;
 
+	mlog(0, "reflink xattr buckets %llu len %u\n",
+	     (unsigned long long)blkno, len);
+
 	ocfs2_init_xattr_tree_extent_tree(&et,
 					  INODE_CACHE(args->reflink->new_inode),
 					  args->new_blk_bh);
@@ -6930,32 +7004,12 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
 		goto out;
 	}
 
-	ret = ocfs2_claim_clusters(handle, data_ac,
-				   len, &p_cluster, &num_clusters);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
-
-	new_blkno = ocfs2_clusters_to_blocks(osb->sb, p_cluster);
-
-	mlog(0, "reflink xattr buckets %llu to %llu, len %u\n",
-	     (unsigned long long)blkno, (unsigned long long)new_blkno, len);
-	ret = ocfs2_reflink_xattr_buckets(handle, blkno, new_blkno, len,
-					  meta_ac, data_ac, args);
-	if (ret) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
-
-	mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
-	     (unsigned long long)new_blkno, len, cpos);
-	ret = ocfs2_insert_extent(handle, &et, cpos, new_blkno,
-				  len, 0, meta_ac);
+	ret = ocfs2_reflink_xattr_buckets(handle, inode, args, &et,
+					  meta_ac, data_ac,
+					  blkno, cpos, len);
 	if (ret)
 		mlog_errno(ret);
 
-out_commit:
 	ocfs2_commit_trans(osb, handle);
 
 out:
-- 
cgit v1.2.3


From e372357ba55ae89307af15cd680467d8f0db4f01 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Sat, 10 Jul 2010 16:33:36 +0200
Subject: ocfs2: tighten up strlen() checking

This function is only called from one place and it's like this:
	dlm_register_domain(conn->cc_name, dlm_key, &fs_version);

The "conn->cc_name" is 64 characters long.  If strlen(conn->cc_name)
were equal to O2NM_MAX_NAME_LEN (64) that would be a bug because
strlen() doesn't count the NULL character.

In fact, if you look how O2NM_MAX_NAME_LEN is used, it mostly describes
64 character buffers.  The only exception is nd_name from struct
o2nm_node.

Anyway I looked into it and in this case the domain string comes from
osb->uuid_str in ocfs2_setup_osb_uuid().  That's 32 characters and NULL
which easily fits into O2NM_MAX_NAME_LEN.  This patch doesn't change how
the code works, but I think it makes the code a little cleaner.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dlm/dlmdomain.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 2ccad86fb590..153abb5abef0 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1671,7 +1671,7 @@ struct dlm_ctxt * dlm_register_domain(const char *domain,
 	struct dlm_ctxt *dlm = NULL;
 	struct dlm_ctxt *new_ctxt = NULL;
 
-	if (strlen(domain) > O2NM_MAX_NAME_LEN) {
+	if (strlen(domain) >= O2NM_MAX_NAME_LEN) {
 		ret = -ENAMETOOLONG;
 		mlog(ML_ERROR, "domain name length too long\n");
 		goto leave;
-- 
cgit v1.2.3


From a8bf2bc212e129dd59a8b06cdbc15079cc3bd876 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Thu, 24 Jun 2010 19:15:09 -0400
Subject: GFS2: O_TRUNC not working on stuffed files across cluster

This patch replaces a statement that got dropped out by accident.
Without the patch, truncates on stuffed (very small) files cause
those files to have an unpredictable size.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 4a48c0f4b402..84da64b551b2 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1041,6 +1041,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size)
 
 	if (gfs2_is_stuffed(ip)) {
 		u64 dsize = size + sizeof(struct gfs2_inode);
+		ip->i_disksize = size;
 		ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
 		gfs2_trans_add_bh(ip->i_gl, dibh, 1);
 		gfs2_dinode_out(ip, dibh->b_data);
-- 
cgit v1.2.3


From b7dc2df5725fe7355fd76000ead7e39728e1b8a9 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Wed, 23 Jun 2010 11:44:47 -0400
Subject: GFS2: recovery stuck on transaction lock

This patch fixes bugzilla bug #590878: GFS2: recovery stuck on
transaction lock.  We set the frozen flag on the glock when we receive
a completion that cannot be delivered due to blocked locks. At that
point we check to see whether the first waiting holder has the noexp
flag set. If the noexp lock is queued later, then we need to unfreeze
the glock at that point in time, namely, in the glock work function.

This patch was originally written by Steve Whitehouse, but since
he's on holiday, I'm submitting it.  It's been well tested with a
complex recovery test called revolver.

Signed-off-by: Steve Whitehouse <swhiteho@redhat.com>
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
---
 fs/gfs2/glock.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'fs')

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index ddcdbf493536..dbab3fdc2582 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -706,8 +706,18 @@ static void glock_work_func(struct work_struct *work)
 {
 	unsigned long delay = 0;
 	struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_work.work);
+	struct gfs2_holder *gh;
 	int drop_ref = 0;
 
+	if (unlikely(test_bit(GLF_FROZEN, &gl->gl_flags))) {
+		spin_lock(&gl->gl_spin);
+		gh = find_first_waiter(gl);
+		if (gh && (gh->gh_flags & LM_FLAG_NOEXP) &&
+		    test_and_clear_bit(GLF_FROZEN, &gl->gl_flags))
+			set_bit(GLF_REPLY_PENDING, &gl->gl_flags);
+		spin_unlock(&gl->gl_spin);
+	}
+
 	if (test_and_clear_bit(GLF_REPLY_PENDING, &gl->gl_flags)) {
 		finish_xmote(gl, gl->gl_reply);
 		drop_ref = 1;
-- 
cgit v1.2.3


From b1becbdee776b447f203aa8da9a40488d5a75e1d Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Thu, 17 Jun 2010 16:45:37 -0400
Subject: GFS2: Fix kernel NULL pointer dereference by dlm_astd

This patch fixes a problem in an error path when looking
up dinodes.  There are two sister-functions, gfs2_inode_lookup
and gfs2_process_unlinked_inode.  Both functions acquire and
hold the i_iopen glock for the dinode being looked up. The last
thing they try to do is hold the i_gl glock for the dinode.
If that glock fails for some reason, the error path was
incorrectly calling gfs2_glock_put for the i_iopen glock twice.
This resulted in the glock being prematurely freed.  The
"minimum hold time" usually kept the glock in memory, but the
lock interface to dlm (aka lock_dlm) freed its memory for the
glock.  In some circumstances, it would cause dlm's dlm_astd daemon
to try to call the bast function for the freed lock_dlm memory,
which resulted in a NULL pointer dereference.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/inode.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index b5612cbb62a5..f03afd9c44bc 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -169,7 +169,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb,
 {
 	struct inode *inode;
 	struct gfs2_inode *ip;
-	struct gfs2_glock *io_gl;
+	struct gfs2_glock *io_gl = NULL;
 	int error;
 
 	inode = gfs2_iget(sb, no_addr);
@@ -198,6 +198,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb,
 		ip->i_iopen_gh.gh_gl->gl_object = ip;
 
 		gfs2_glock_put(io_gl);
+		io_gl = NULL;
 
 		if ((type == DT_UNKNOWN) && (no_formal_ino == 0))
 			goto gfs2_nfsbypass;
@@ -228,7 +229,8 @@ gfs2_nfsbypass:
 fail_glock:
 	gfs2_glock_dq(&ip->i_iopen_gh);
 fail_iopen:
-	gfs2_glock_put(io_gl);
+	if (io_gl)
+		gfs2_glock_put(io_gl);
 fail_put:
 	if (inode->i_state & I_NEW)
 		ip->i_gl->gl_object = NULL;
@@ -256,7 +258,7 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
 {
 	struct gfs2_sbd *sdp;
 	struct gfs2_inode *ip;
-	struct gfs2_glock *io_gl;
+	struct gfs2_glock *io_gl = NULL;
 	int error;
 	struct gfs2_holder gh;
 	struct inode *inode;
@@ -293,6 +295,7 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
 
 	ip->i_iopen_gh.gh_gl->gl_object = ip;
 	gfs2_glock_put(io_gl);
+	io_gl = NULL;
 
 	inode->i_mode = DT2IF(DT_UNKNOWN);
 
@@ -319,7 +322,8 @@ void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
 fail_glock:
 	gfs2_glock_dq(&ip->i_iopen_gh);
 fail_iopen:
-	gfs2_glock_put(io_gl);
+	if (io_gl)
+		gfs2_glock_put(io_gl);
 fail_put:
 	ip->i_gl->gl_object = NULL;
 	gfs2_glock_put(ip->i_gl);
-- 
cgit v1.2.3


From 8b4216018bdbfbb1b76150d202b15ee68c38e991 Mon Sep 17 00:00:00 2001
From: Abhijith Das <adas@redhat.com>
Date: Sun, 4 Jul 2010 01:33:24 -0400
Subject: GFS2: BUG in gfs2_adjust_quota

HighMem pages on i686 do not get mapped to the buffer_heads and this was
causing a NULL pointer dereference when we were trying to memset page buffers
to zero.
We now use zero_user() that kmaps the page and directly manipulates page data.
This patch also fixes a boundary condition that was incorrect.

Signed-off-by: Abhi Das <adas@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/quota.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 49667d68769e..b256d6f24288 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -694,10 +694,8 @@ get_a_page:
 		if (!buffer_mapped(bh))
 			goto unlock_out;
 		/* If it's a newly allocated disk block for quota, zero it */
-		if (buffer_new(bh)) {
-			memset(bh->b_data, 0, bh->b_size);
-			set_buffer_uptodate(bh);
-		}
+		if (buffer_new(bh))
+			zero_user(page, pos - blocksize, bh->b_size);
 	}
 
 	if (PageUptodate(page))
@@ -723,7 +721,7 @@ get_a_page:
 
 	/* If quota straddles page boundary, we need to update the rest of the
 	 * quota at the beginning of the next page */
-	if (offset != 0) { /* first page, offset is closer to PAGE_CACHE_SIZE */
+	if ((offset + sizeof(struct gfs2_quota)) > PAGE_CACHE_SIZE) {
 		ptr = ptr + nbytes;
 		nbytes = sizeof(struct gfs2_quota) - nbytes;
 		offset = 0;
-- 
cgit v1.2.3


From 728a756b8fcd22d80e2dbba8117a8a3aafd3f203 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Wed, 14 Jul 2010 18:12:26 -0400
Subject: GFS2: rename causes kernel Oops

This patch fixes a kernel Oops in the GFS2 rename code.

The problem was in the way the gfs2 directory code was trying
to re-use sentinel directory entries.

In the failing case, gfs2's rename function was renaming a
file to another name that had the same non-trivial length.
The file being renamed happened to be the first directory
entry on the leaf block.

First, the rename code (gfs2_rename in ops_inode.c) found the
original directory entry and decided it could do its job by
simply replacing the directory entry with another.  Therefore
it determined correctly that no block allocations were needed.

Next, the rename code deleted the old directory entry prior to
replacing it with the new name.  Therefore, the soon-to-be
replaced directory entry was temporarily made into a directory
entry "sentinel" or a place holder at the start of a leaf block.

Lastly, it went to re-add the replacement directory entry in
that leaf block.  However, when gfs2_dirent_find_space was
looking for space in the leaf block, it used the wrong value
for the sentinel.  That threw off its calculations so later
it decides it can't really re-use the sentinel and therefore
must allocate a new leaf block.  But because it previously decided
to re-use the directory entry, it didn't waste the time to
grab a new block allocation for the inode.  Therefore, the
inode's i_alloc pointer was still NULL and it crashes trying to
reference it.

In the case of sentinel directory entries, the entire dirent is
reused, not just the "free space" portion of it, and therefore
the function gfs2_dirent_find_space should use the value 0
rather than GFS2_DIRENT_SIZE(0) for the actual dirent size.

Fixing this calculation enables the reproducer programs to work
properly.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index 8295c5b5d4a9..26ca3361a8bc 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -392,7 +392,7 @@ static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
 	unsigned totlen = be16_to_cpu(dent->de_rec_len);
 
 	if (gfs2_dirent_sentinel(dent))
-		actual = GFS2_DIRENT_SIZE(0);
+		actual = 0;
 	if (totlen - actual >= required)
 		return 1;
 	return 0;
-- 
cgit v1.2.3


From f5e27b6ddfbafdd9c9c2f06bbf28af12581409bc Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Wed, 14 Jul 2010 11:19:32 +0800
Subject: ocfs2: Don't duplicate pages past i_size during CoW.

During CoW, the pages after i_size don't contain valid data, so there's
no need to read and duplicate them.

Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/refcounttree.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 32949df10694..3ac5aa733e9c 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2931,6 +2931,12 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
 
 	offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
 	end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
+	/*
+	 * We only duplicate pages until we reach the page contains i_size - 1.
+	 * So trim 'end' to i_size.
+	 */
+	if (end > i_size_read(context->inode))
+		end = i_size_read(context->inode);
 
 	while (offset < end) {
 		page_index = offset >> PAGE_CACHE_SHIFT;
-- 
cgit v1.2.3


From a39953dd95ff10e311083d94f4f95c348cb22464 Mon Sep 17 00:00:00 2001
From: Wengang Wang <wen.gang.wang@oracle.com>
Date: Wed, 14 Jul 2010 22:38:21 +0800
Subject: ocfs2/dlm: Remove BUG_ON from migration in the rare case of a down
 node

For migration, we are waiting for DLM_LOCK_RES_MIGRATING flag to be set
before sending DLM_MIG_LOCKRES_MSG message to the target. We are using
dlm_migration_can_proceed() for that purpose.  However, if the node is
down, dlm_migration_can_proceed() will also return "go ahead".  In this
rare case, the DLM_LOCK_RES_MIGRATING flag might not be set yet. Remove
the BUG_ON() that trips over this condition.

Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/dlm/dlmmaster.c | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 4a7506a4e314..94b97fc6a88e 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2808,14 +2808,8 @@ again:
 		mlog(0, "trying again...\n");
 		goto again;
 	}
-	/* now that we are sure the MIGRATING state is there, drop
-	 * the unneded state which blocked threads trying to DIRTY */
-	spin_lock(&res->spinlock);
-	BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
-	BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
-	res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
-	spin_unlock(&res->spinlock);
 
+	ret = 0;
 	/* did the target go down or die? */
 	spin_lock(&dlm->spinlock);
 	if (!test_bit(target, dlm->domain_map)) {
@@ -2825,10 +2819,22 @@ again:
 	}
 	spin_unlock(&dlm->spinlock);
 
+	/*
+	 * if target is down, we need to clear DLM_LOCK_RES_BLOCK_DIRTY for
+	 * another try; otherwise, we are sure the MIGRATING state is there,
+	 * drop the unneded state which blocked threads trying to DIRTY
+	 */
+	spin_lock(&res->spinlock);
+	BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY));
+	res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY;
+	if (!ret)
+		BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING));
+	spin_unlock(&res->spinlock);
+
 	/*
 	 * at this point:
 	 *
-	 *   o the DLM_LOCK_RES_MIGRATING flag is set
+	 *   o the DLM_LOCK_RES_MIGRATING flag is set if target not down
 	 *   o there are no pending asts on this lockres
 	 *   o all processes trying to reserve an ast on this
 	 *     lockres must wait for the MIGRATING flag to clear
-- 
cgit v1.2.3


From 13ceef099edd2b70c5a6f3a9ef5d6d97cda2e096 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 14 Jul 2010 07:56:33 +0200
Subject: jbd2/ocfs2: Fix block checksumming when a buffer is used in several
 transactions

OCFS2 uses t_commit trigger to compute and store checksum of the just
committed blocks. When a buffer has b_frozen_data, checksum is computed
for it instead of b_data but this can result in an old checksum being
written to the filesystem in the following scenario:

1) transaction1 is opened
2) handle1 is opened
3) journal_access(handle1, bh)
    - This sets jh->b_transaction to transaction1
4) modify(bh)
5) journal_dirty(handle1, bh)
6) handle1 is closed
7) start committing transaction1, opening transaction2
8) handle2 is opened
9) journal_access(handle2, bh)
    - This copies off b_frozen_data to make it safe for transaction1 to commit.
      jh->b_next_transaction is set to transaction2.
10) jbd2_journal_write_metadata() checksums b_frozen_data
11) the journal correctly writes b_frozen_data to the disk journal
12) handle2 is closed
    - There was no dirty call for the bh on handle2, so it is never queued for
      any more journal operation
13) Checkpointing finally happens, and it just spools the bh via normal buffer
writeback.  This will write b_data, which was never triggered on and thus
contains a wrong (old) checksum.

This patch fixes the problem by calling the trigger at the moment data is
frozen for journal commit - i.e., either when b_frozen_data is created by
do_get_write_access or just before we write a buffer to the log if
b_frozen_data does not exist. We also rename the trigger to t_frozen as
that better describes when it is called.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/jbd2/journal.c     | 15 +++++++--------
 fs/jbd2/transaction.c |  9 ++++++---
 fs/ocfs2/journal.c    | 24 ++++++++++++------------
 3 files changed, 25 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index bc2ff5932769..036880895bfc 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -297,7 +297,6 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
 	struct page *new_page;
 	unsigned int new_offset;
 	struct buffer_head *bh_in = jh2bh(jh_in);
-	struct jbd2_buffer_trigger_type *triggers;
 	journal_t *journal = transaction->t_journal;
 
 	/*
@@ -328,21 +327,21 @@ repeat:
 		done_copy_out = 1;
 		new_page = virt_to_page(jh_in->b_frozen_data);
 		new_offset = offset_in_page(jh_in->b_frozen_data);
-		triggers = jh_in->b_frozen_triggers;
 	} else {
 		new_page = jh2bh(jh_in)->b_page;
 		new_offset = offset_in_page(jh2bh(jh_in)->b_data);
-		triggers = jh_in->b_triggers;
 	}
 
 	mapped_data = kmap_atomic(new_page, KM_USER0);
 	/*
-	 * Fire any commit trigger.  Do this before checking for escaping,
-	 * as the trigger may modify the magic offset.  If a copy-out
-	 * happens afterwards, it will have the correct data in the buffer.
+	 * Fire data frozen trigger if data already wasn't frozen.  Do this
+	 * before checking for escaping, as the trigger may modify the magic
+	 * offset.  If a copy-out happens afterwards, it will have the correct
+	 * data in the buffer.
 	 */
-	jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset,
-				   triggers);
+	if (!done_copy_out)
+		jbd2_buffer_frozen_trigger(jh_in, mapped_data + new_offset,
+					   jh_in->b_triggers);
 
 	/*
 	 * Check for escaping
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e214d68620ac..b8e0806681bb 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -725,6 +725,9 @@ done:
 		page = jh2bh(jh)->b_page;
 		offset = ((unsigned long) jh2bh(jh)->b_data) & ~PAGE_MASK;
 		source = kmap_atomic(page, KM_USER0);
+		/* Fire data frozen trigger just before we copy the data */
+		jbd2_buffer_frozen_trigger(jh, source + offset,
+					   jh->b_triggers);
 		memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
 		kunmap_atomic(source, KM_USER0);
 
@@ -963,15 +966,15 @@ void jbd2_journal_set_triggers(struct buffer_head *bh,
 	jh->b_triggers = type;
 }
 
-void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data,
+void jbd2_buffer_frozen_trigger(struct journal_head *jh, void *mapped_data,
 				struct jbd2_buffer_trigger_type *triggers)
 {
 	struct buffer_head *bh = jh2bh(jh);
 
-	if (!triggers || !triggers->t_commit)
+	if (!triggers || !triggers->t_frozen)
 		return;
 
-	triggers->t_commit(triggers, bh, mapped_data, bh->b_size);
+	triggers->t_frozen(triggers, bh, mapped_data, bh->b_size);
 }
 
 void jbd2_buffer_abort_trigger(struct journal_head *jh,
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 39113b5e79e7..625de9d7088c 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -472,7 +472,7 @@ static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger
 	return container_of(triggers, struct ocfs2_triggers, ot_triggers);
 }
 
-static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+static void ocfs2_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
 				 struct buffer_head *bh,
 				 void *data, size_t size)
 {
@@ -491,7 +491,7 @@ static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
  * Quota blocks have their own trigger because the struct ocfs2_block_check
  * offset depends on the blocksize.
  */
-static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+static void ocfs2_dq_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
 				 struct buffer_head *bh,
 				 void *data, size_t size)
 {
@@ -511,7 +511,7 @@ static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
  * Directory blocks also have their own trigger because the
  * struct ocfs2_block_check offset depends on the blocksize.
  */
-static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+static void ocfs2_db_frozen_trigger(struct jbd2_buffer_trigger_type *triggers,
 				 struct buffer_head *bh,
 				 void *data, size_t size)
 {
@@ -544,7 +544,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
 
 static struct ocfs2_triggers di_triggers = {
 	.ot_triggers = {
-		.t_commit = ocfs2_commit_trigger,
+		.t_frozen = ocfs2_frozen_trigger,
 		.t_abort = ocfs2_abort_trigger,
 	},
 	.ot_offset	= offsetof(struct ocfs2_dinode, i_check),
@@ -552,7 +552,7 @@ static struct ocfs2_triggers di_triggers = {
 
 static struct ocfs2_triggers eb_triggers = {
 	.ot_triggers = {
-		.t_commit = ocfs2_commit_trigger,
+		.t_frozen = ocfs2_frozen_trigger,
 		.t_abort = ocfs2_abort_trigger,
 	},
 	.ot_offset	= offsetof(struct ocfs2_extent_block, h_check),
@@ -560,7 +560,7 @@ static struct ocfs2_triggers eb_triggers = {
 
 static struct ocfs2_triggers rb_triggers = {
 	.ot_triggers = {
-		.t_commit = ocfs2_commit_trigger,
+		.t_frozen = ocfs2_frozen_trigger,
 		.t_abort = ocfs2_abort_trigger,
 	},
 	.ot_offset	= offsetof(struct ocfs2_refcount_block, rf_check),
@@ -568,7 +568,7 @@ static struct ocfs2_triggers rb_triggers = {
 
 static struct ocfs2_triggers gd_triggers = {
 	.ot_triggers = {
-		.t_commit = ocfs2_commit_trigger,
+		.t_frozen = ocfs2_frozen_trigger,
 		.t_abort = ocfs2_abort_trigger,
 	},
 	.ot_offset	= offsetof(struct ocfs2_group_desc, bg_check),
@@ -576,14 +576,14 @@ static struct ocfs2_triggers gd_triggers = {
 
 static struct ocfs2_triggers db_triggers = {
 	.ot_triggers = {
-		.t_commit = ocfs2_db_commit_trigger,
+		.t_frozen = ocfs2_db_frozen_trigger,
 		.t_abort = ocfs2_abort_trigger,
 	},
 };
 
 static struct ocfs2_triggers xb_triggers = {
 	.ot_triggers = {
-		.t_commit = ocfs2_commit_trigger,
+		.t_frozen = ocfs2_frozen_trigger,
 		.t_abort = ocfs2_abort_trigger,
 	},
 	.ot_offset	= offsetof(struct ocfs2_xattr_block, xb_check),
@@ -591,14 +591,14 @@ static struct ocfs2_triggers xb_triggers = {
 
 static struct ocfs2_triggers dq_triggers = {
 	.ot_triggers = {
-		.t_commit = ocfs2_dq_commit_trigger,
+		.t_frozen = ocfs2_dq_frozen_trigger,
 		.t_abort = ocfs2_abort_trigger,
 	},
 };
 
 static struct ocfs2_triggers dr_triggers = {
 	.ot_triggers = {
-		.t_commit = ocfs2_commit_trigger,
+		.t_frozen = ocfs2_frozen_trigger,
 		.t_abort = ocfs2_abort_trigger,
 	},
 	.ot_offset	= offsetof(struct ocfs2_dx_root_block, dr_check),
@@ -606,7 +606,7 @@ static struct ocfs2_triggers dr_triggers = {
 
 static struct ocfs2_triggers dl_triggers = {
 	.ot_triggers = {
-		.t_commit = ocfs2_commit_trigger,
+		.t_frozen = ocfs2_frozen_trigger,
 		.t_abort = ocfs2_abort_trigger,
 	},
 	.ot_offset	= offsetof(struct ocfs2_dx_leaf, dl_check),
-- 
cgit v1.2.3


From 01a92f174f8a3b99dbb5e02c86e7ee1e576737af Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 15 Jul 2010 13:24:32 -0700
Subject: ceph: reuse request message when replaying against recovering mds

Replayed rename operations (after an mds failure/recovery) were broken
because the request paths were regenerated from the dentry names, which
get mangled when d_move() is called.

Instead, resend the previous request message when replaying completed
operations.  Just make sure the REPLAY flag is set and the target ino is
filled in.

This fixes problems with workloads doing renames when the MDS restarts,
where the rename operation appears to succeed, but on mds restart then
fails (leading to client confusion, app breakage, etc.).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 27 ++++++++++++++++++++++-----
 1 file changed, 22 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 3ab79f6c4ce8..23332bc44515 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1580,6 +1580,27 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
 	dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
 	     req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
 
+	if (req->r_got_unsafe) {
+		/*
+		 * Replay.  Do not regenerate message (and rebuild
+		 * paths, etc.); just use the original message.
+		 * Rebuilding paths will break for renames because
+		 * d_move mangles the src name.
+		 */
+		msg = req->r_request;
+		rhead = msg->front.iov_base;
+
+		flags = le32_to_cpu(rhead->flags);
+		flags |= CEPH_MDS_FLAG_REPLAY;
+		rhead->flags = cpu_to_le32(flags);
+
+		if (req->r_target_inode)
+			rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
+
+		rhead->num_retry = req->r_attempts - 1;
+		return 0;
+	}
+
 	if (req->r_request) {
 		ceph_msg_put(req->r_request);
 		req->r_request = NULL;
@@ -1601,13 +1622,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
 	rhead->flags = cpu_to_le32(flags);
 	rhead->num_fwd = req->r_num_fwd;
 	rhead->num_retry = req->r_attempts - 1;
+	rhead->ino = 0;
 
 	dout(" r_locked_dir = %p\n", req->r_locked_dir);
-
-	if (req->r_target_inode && req->r_got_unsafe)
-		rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
-	else
-		rhead->ino = 0;
 	return 0;
 }
 
-- 
cgit v1.2.3


From e979cf50395e24c4bdd489f60e2d5dd5ae66d255 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 15 Jul 2010 14:58:39 -0700
Subject: ceph: do not include cap/dentry releases in replayed messages

Strip the cap and dentry releases from replayed messages.  They can
cause the shared state to get out of sync because they were generated
(with the request message) earlier, and no longer reflect the current
client state.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 8 ++++++++
 fs/ceph/mds_client.h | 1 +
 2 files changed, 9 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 23332bc44515..416c08d315db 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1514,6 +1514,9 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
 	ceph_encode_filepath(&p, end, ino1, path1);
 	ceph_encode_filepath(&p, end, ino2, path2);
 
+	/* make note of release offset, in case we need to replay */
+	req->r_request_release_offset = p - msg->front.iov_base;
+
 	/* cap releases */
 	releases = 0;
 	if (req->r_inode_drop)
@@ -1598,6 +1601,11 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
 			rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
 
 		rhead->num_retry = req->r_attempts - 1;
+
+		/* remove cap/dentry releases from message */
+		rhead->num_releases = 0;
+		msg->hdr.front_len = cpu_to_le32(req->r_request_release_offset);
+		msg->front.iov_len = req->r_request_release_offset;
 		return 0;
 	}
 
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index b292fa42a66d..952410c60d09 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -188,6 +188,7 @@ struct ceph_mds_request {
 	int r_old_inode_drop, r_old_inode_unless;
 
 	struct ceph_msg  *r_request;  /* original request */
+	int r_request_release_offset;
 	struct ceph_msg  *r_reply;
 	struct ceph_mds_reply_info_parsed r_reply_info;
 	int r_err;
-- 
cgit v1.2.3


From 5453258d532e72731b0829e4fefd36dd611a2fff Mon Sep 17 00:00:00 2001
From: Joel Becker <joel.becker@oracle.com>
Date: Fri, 16 Jul 2010 13:32:33 -0700
Subject: ocfs2: Silence gcc warning in ocfs2_write_zero_page().

ocfs2_write_zero_page() has a loop that won't ever be skipped, but gcc
doesn't know that.  Set ret=0 just to make gcc happy.

Signed-off-by: Joel Becker <joel.becker@oracle.com>
---
 fs/ocfs2/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ac15911b31c4..2b10b36d1577 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -767,7 +767,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 	struct page *page;
 	unsigned long index = abs_from >> PAGE_CACHE_SHIFT;
 	handle_t *handle = NULL;
-	int ret;
+	int ret = 0;
 	unsigned zero_from, zero_to, block_start, block_end;
 
 	BUG_ON(abs_from >= abs_to);
-- 
cgit v1.2.3


From 7f8275d0d660c146de6ee3017e1e2e594c49e820 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Mon, 19 Jul 2010 14:56:17 +1000
Subject: mm: add context argument to shrinker callback

The current shrinker implementation requires the registered callback
to have global state to work from. This makes it difficult to shrink
caches that are not global (e.g. per-filesystem caches). Pass the shrinker
structure to the callback so that users can embed the shrinker structure
in the context the shrinker needs to operate on and get back to it in the
callback via container_of().

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/dcache.c                 | 2 +-
 fs/gfs2/glock.c             | 2 +-
 fs/gfs2/quota.c             | 2 +-
 fs/gfs2/quota.h             | 2 +-
 fs/inode.c                  | 2 +-
 fs/mbcache.c                | 5 +++--
 fs/nfs/dir.c                | 2 +-
 fs/nfs/internal.h           | 3 ++-
 fs/quota/dquot.c            | 2 +-
 fs/ubifs/shrinker.c         | 2 +-
 fs/ubifs/ubifs.h            | 2 +-
 fs/xfs/linux-2.6/xfs_buf.c  | 5 +++--
 fs/xfs/linux-2.6/xfs_sync.c | 1 +
 fs/xfs/quota/xfs_qm.c       | 7 +++++--
 14 files changed, 23 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index c8c78ba07827..86d4db15473e 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -896,7 +896,7 @@ EXPORT_SYMBOL(shrink_dcache_parent);
  *
  * In this case we return -1 to tell the caller that we baled.
  */
-static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
+static int shrink_dcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
 	if (nr) {
 		if (!(gfp_mask & __GFP_FS))
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index dbab3fdc2582..0898f3ec8212 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1358,7 +1358,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
 }
 
 
-static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask)
+static int gfs2_shrink_glock_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
 	struct gfs2_glock *gl;
 	int may_demote;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index b256d6f24288..8f02d3db8f42 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -77,7 +77,7 @@ static LIST_HEAD(qd_lru_list);
 static atomic_t qd_lru_count = ATOMIC_INIT(0);
 static DEFINE_SPINLOCK(qd_lru_lock);
 
-int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask)
+int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
 	struct gfs2_quota_data *qd;
 	struct gfs2_sbd *sdp;
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 195f60c8bd14..e7d236ca48bd 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -51,7 +51,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
 	return ret;
 }
 
-extern int gfs2_shrink_qd_memory(int nr, gfp_t gfp_mask);
+extern int gfs2_shrink_qd_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask);
 extern const struct quotactl_ops gfs2_quotactl_ops;
 
 #endif /* __QUOTA_DOT_H__ */
diff --git a/fs/inode.c b/fs/inode.c
index 2bee20ae3d65..722860b323a9 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -512,7 +512,7 @@ static void prune_icache(int nr_to_scan)
  * This function is passed the number of inodes to scan, and it returns the
  * total number of remaining possibly-reclaimable inodes.
  */
-static int shrink_icache_memory(int nr, gfp_t gfp_mask)
+static int shrink_icache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
 	if (nr) {
 		/*
diff --git a/fs/mbcache.c b/fs/mbcache.c
index ec88ff3d04a9..e28f21b95344 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -115,7 +115,7 @@ mb_cache_indexes(struct mb_cache *cache)
  * What the mbcache registers as to get shrunk dynamically.
  */
 
-static int mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask);
+static int mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask);
 
 static struct shrinker mb_cache_shrinker = {
 	.shrink = mb_cache_shrink_fn,
@@ -191,13 +191,14 @@ forget:
  * This function is called by the kernel memory management when memory
  * gets low.
  *
+ * @shrink: (ignored)
  * @nr_to_scan: Number of objects to scan
  * @gfp_mask: (ignored)
  *
  * Returns the number of objects which are present in the cache.
  */
 static int
-mb_cache_shrink_fn(int nr_to_scan, gfp_t gfp_mask)
+mb_cache_shrink_fn(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 {
 	LIST_HEAD(free_list);
 	struct list_head *l, *ltmp;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 782b431ef91c..e60416d3f818 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1710,7 +1710,7 @@ static void nfs_access_free_list(struct list_head *head)
 	}
 }
 
-int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask)
+int nfs_access_cache_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 {
 	LIST_HEAD(head);
 	struct nfs_inode *nfsi;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index d8bd619e386c..e70f44b9b3f4 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -205,7 +205,8 @@ extern struct rpc_procinfo nfs4_procedures[];
 void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
 
 /* dir.c */
-extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
+extern int nfs_access_cache_shrinker(struct shrinker *shrink,
+					int nr_to_scan, gfp_t gfp_mask);
 
 /* inode.c */
 extern struct workqueue_struct *nfsiod_workqueue;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 12c233da1b6b..437d2ca2de97 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -676,7 +676,7 @@ static void prune_dqcache(int count)
  * This is called from kswapd when we think we need some
  * more memory
  */
-static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
+static int shrink_dqcache_memory(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
 	if (nr) {
 		spin_lock(&dq_list_lock);
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index 02feb59cefca..0b201114a5ad 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -277,7 +277,7 @@ static int kick_a_thread(void)
 	return 0;
 }
 
-int ubifs_shrinker(int nr, gfp_t gfp_mask)
+int ubifs_shrinker(struct shrinker *shrink, int nr, gfp_t gfp_mask)
 {
 	int freed, contention = 0;
 	long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 2eef553d50c8..04310878f449 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1575,7 +1575,7 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot);
 int ubifs_tnc_end_commit(struct ubifs_info *c);
 
 /* shrinker.c */
-int ubifs_shrinker(int nr_to_scan, gfp_t gfp_mask);
+int ubifs_shrinker(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask);
 
 /* commit.c */
 int ubifs_bg_thread(void *info);
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 649ade8ef598..2ee3f7a60163 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -45,7 +45,7 @@
 
 static kmem_zone_t *xfs_buf_zone;
 STATIC int xfsbufd(void *);
-STATIC int xfsbufd_wakeup(int, gfp_t);
+STATIC int xfsbufd_wakeup(struct shrinker *, int, gfp_t);
 STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
 static struct shrinker xfs_buf_shake = {
 	.shrink = xfsbufd_wakeup,
@@ -340,7 +340,7 @@ _xfs_buf_lookup_pages(
 					__func__, gfp_mask);
 
 			XFS_STATS_INC(xb_page_retries);
-			xfsbufd_wakeup(0, gfp_mask);
+			xfsbufd_wakeup(NULL, 0, gfp_mask);
 			congestion_wait(BLK_RW_ASYNC, HZ/50);
 			goto retry;
 		}
@@ -1762,6 +1762,7 @@ xfs_buf_runall_queues(
 
 STATIC int
 xfsbufd_wakeup(
+	struct shrinker		*shrink,
 	int			priority,
 	gfp_t			mask)
 {
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index ef7f0218bccb..be375827af98 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -838,6 +838,7 @@ static struct rw_semaphore xfs_mount_list_lock;
 
 static int
 xfs_reclaim_inode_shrink(
+	struct shrinker	*shrink,
 	int		nr_to_scan,
 	gfp_t		gfp_mask)
 {
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 8c117ff2e3ab..67c018392d62 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -69,7 +69,7 @@ STATIC void	xfs_qm_list_destroy(xfs_dqlist_t *);
 
 STATIC int	xfs_qm_init_quotainos(xfs_mount_t *);
 STATIC int	xfs_qm_init_quotainfo(xfs_mount_t *);
-STATIC int	xfs_qm_shake(int, gfp_t);
+STATIC int	xfs_qm_shake(struct shrinker *, int, gfp_t);
 
 static struct shrinker xfs_qm_shaker = {
 	.shrink = xfs_qm_shake,
@@ -2117,7 +2117,10 @@ xfs_qm_shake_freelist(
  */
 /* ARGSUSED */
 STATIC int
-xfs_qm_shake(int nr_to_scan, gfp_t gfp_mask)
+xfs_qm_shake(
+	struct shrinker	*shrink,
+	int		nr_to_scan,
+	gfp_t		gfp_mask)
 {
 	int	ndqused, nfree, n;
 
-- 
cgit v1.2.3


From cffab6bc5511cd6f67a60bf16b62de4267b68c4c Mon Sep 17 00:00:00 2001
From: Peter Oberparleiter <peter.oberparleiter@de.ibm.com>
Date: Mon, 19 Jul 2010 09:22:35 +0200
Subject: [S390] dasd: use correct label location for diag fba disks

Partition boundary calculation fails for DASD FBA disks under the
following conditions:
- disk is formatted with CMS FORMAT with a blocksize of more than
  512 bytes
- all of the disk is reserved to a single CMS file using CMS RESERVE
- the disk is accessed using the DIAG mode of the DASD driver

Under these circumstances, the partition detection code tries to
read the CMS label block containing partition-relevant information
from logical block offset 1, while it is in fact located at physical
block offset 1.

Fix this problem by using the correct CMS label block location
depending on the device type as determined by the DASD SENSE ID
information.

Signed-off-by: Peter Oberparleiter <peter.oberparleiter@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 fs/partitions/ibm.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index 3e73de5967ff..fc8497643fd0 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -74,6 +74,7 @@ int ibm_partition(struct parsed_partitions *state)
 	} *label;
 	unsigned char *data;
 	Sector sect;
+	sector_t labelsect;
 
 	res = 0;
 	blocksize = bdev_logical_block_size(bdev);
@@ -97,11 +98,20 @@ int ibm_partition(struct parsed_partitions *state)
 	    ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo) != 0)
 		goto out_freeall;
 
+	/*
+	 * Special case for FBA disks: label sector does not depend on
+	 * blocksize.
+	 */
+	if ((info->cu_type == 0x6310 && info->dev_type == 0x9336) ||
+	    (info->cu_type == 0x3880 && info->dev_type == 0x3370))
+		labelsect = info->label_block;
+	else
+		labelsect = info->label_block * (blocksize >> 9);
+
 	/*
 	 * Get volume label, extract name and type.
 	 */
-	data = read_part_sector(state, info->label_block*(blocksize/512),
-				&sect);
+	data = read_part_sector(state, labelsect, &sect);
 	if (data == NULL)
 		goto out_readerr;
 
-- 
cgit v1.2.3


From 99d8f83c98930100cd70437b0c81a935e7a14b0b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 7 Jul 2010 10:51:48 -0400
Subject: Btrfs: fix split_leaf double split corner case

split_leaf was not properly balancing leaves when it was forced to
split a leaf twice.  This commit adds an extra push left and right
before forcing the double split in hopes of getting the slot where
we want to insert at either the start or end of the leaf.

If the extra pushes do work, then we are able to avoid splitting twice
and we keep the tree properly balanced.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 129 +++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 111 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 0d1d966b0fe4..c3df14ce2cc2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2304,12 +2304,17 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
 	return ret;
 }
 
+/*
+ * min slot controls the lowest index we're willing to push to the
+ * right.  We'll push up to and including min_slot, but no lower
+ */
 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 				      struct btrfs_root *root,
 				      struct btrfs_path *path,
 				      int data_size, int empty,
 				      struct extent_buffer *right,
-				      int free_space, u32 left_nritems)
+				      int free_space, u32 left_nritems,
+				      u32 min_slot)
 {
 	struct extent_buffer *left = path->nodes[0];
 	struct extent_buffer *upper = path->nodes[1];
@@ -2327,7 +2332,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 	if (empty)
 		nr = 0;
 	else
-		nr = 1;
+		nr = max_t(u32, 1, min_slot);
 
 	if (path->slots[0] >= left_nritems)
 		push_space += data_size;
@@ -2469,10 +2474,14 @@ out_unlock:
  *
  * returns 1 if the push failed because the other node didn't have enough
  * room, 0 if everything worked out and < 0 if there were major errors.
+ *
+ * this will push starting from min_slot to the end of the leaf.  It won't
+ * push any slot lower than min_slot
  */
 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
-			   *root, struct btrfs_path *path, int data_size,
-			   int empty)
+			   *root, struct btrfs_path *path,
+			   int min_data_size, int data_size,
+			   int empty, u32 min_slot)
 {
 	struct extent_buffer *left = path->nodes[0];
 	struct extent_buffer *right;
@@ -2514,8 +2523,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (left_nritems == 0)
 		goto out_unlock;
 
-	return __push_leaf_right(trans, root, path, data_size, empty,
-				right, free_space, left_nritems);
+	return __push_leaf_right(trans, root, path, min_data_size, empty,
+				right, free_space, left_nritems, min_slot);
 out_unlock:
 	btrfs_tree_unlock(right);
 	free_extent_buffer(right);
@@ -2525,12 +2534,17 @@ out_unlock:
 /*
  * push some data in the path leaf to the left, trying to free up at
  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
+ *
+ * max_slot can put a limit on how far into the leaf we'll push items.  The
+ * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
+ * items
  */
 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root,
 				     struct btrfs_path *path, int data_size,
 				     int empty, struct extent_buffer *left,
-				     int free_space, int right_nritems)
+				     int free_space, u32 right_nritems,
+				     u32 max_slot)
 {
 	struct btrfs_disk_key disk_key;
 	struct extent_buffer *right = path->nodes[0];
@@ -2549,9 +2563,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 	slot = path->slots[1];
 
 	if (empty)
-		nr = right_nritems;
+		nr = min(right_nritems, max_slot);
 	else
-		nr = right_nritems - 1;
+		nr = min(right_nritems - 1, max_slot);
 
 	for (i = 0; i < nr; i++) {
 		item = btrfs_item_nr(right, i);
@@ -2712,10 +2726,14 @@ out:
 /*
  * push some data in the path leaf to the left, trying to free up at
  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
+ *
+ * max_slot can put a limit on how far into the leaf we'll push items.  The
+ * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
+ * items
  */
 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
-			  *root, struct btrfs_path *path, int data_size,
-			  int empty)
+			  *root, struct btrfs_path *path, int min_data_size,
+			  int data_size, int empty, u32 max_slot)
 {
 	struct extent_buffer *right = path->nodes[0];
 	struct extent_buffer *left;
@@ -2761,8 +2779,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
 		goto out;
 	}
 
-	return __push_leaf_left(trans, root, path, data_size,
-			       empty, left, free_space, right_nritems);
+	return __push_leaf_left(trans, root, path, min_data_size,
+			       empty, left, free_space, right_nritems,
+			       max_slot);
 out:
 	btrfs_tree_unlock(left);
 	free_extent_buffer(left);
@@ -2854,6 +2873,64 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+/*
+ * double splits happen when we need to insert a big item in the middle
+ * of a leaf.  A double split can leave us with 3 mostly empty leaves:
+ * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
+ *          A                 B                 C
+ *
+ * We avoid this by trying to push the items on either side of our target
+ * into the adjacent leaves.  If all goes well we can avoid the double split
+ * completely.
+ */
+static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  struct btrfs_path *path,
+					  int data_size)
+{
+	int ret;
+	int progress = 0;
+	int slot;
+	u32 nritems;
+
+	slot = path->slots[0];
+
+	/*
+	 * try to push all the items after our slot into the
+	 * right leaf
+	 */
+	ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
+	if (ret < 0)
+		return ret;
+
+	if (ret == 0)
+		progress++;
+
+	nritems = btrfs_header_nritems(path->nodes[0]);
+	/*
+	 * our goal is to get our slot at the start or end of a leaf.  If
+	 * we've done so we're done
+	 */
+	if (path->slots[0] == 0 || path->slots[0] == nritems)
+		return 0;
+
+	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
+		return 0;
+
+	/* try to push all the items before our slot into the next leaf */
+	slot = path->slots[0];
+	ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
+	if (ret < 0)
+		return ret;
+
+	if (ret == 0)
+		progress++;
+
+	if (progress)
+		return 0;
+	return 1;
+}
+
 /*
  * split the path's leaf in two, making sure there is at least data_size
  * available for the resulting leaf level of the path.
@@ -2876,6 +2953,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	int wret;
 	int split;
 	int num_doubles = 0;
+	int tried_avoid_double = 0;
 
 	l = path->nodes[0];
 	slot = path->slots[0];
@@ -2884,12 +2962,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 		return -EOVERFLOW;
 
 	/* first try to make some room by pushing left and right */
-	if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
-		wret = push_leaf_right(trans, root, path, data_size, 0);
+	if (data_size) {
+		wret = push_leaf_right(trans, root, path, data_size,
+				       data_size, 0, 0);
 		if (wret < 0)
 			return wret;
 		if (wret) {
-			wret = push_leaf_left(trans, root, path, data_size, 0);
+			wret = push_leaf_left(trans, root, path, data_size,
+					      data_size, 0, (u32)-1);
 			if (wret < 0)
 				return wret;
 		}
@@ -2923,6 +3003,8 @@ again:
 				if (mid != nritems &&
 				    leaf_space_used(l, mid, nritems - mid) +
 				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+					if (data_size && !tried_avoid_double)
+						goto push_for_double;
 					split = 2;
 				}
 			}
@@ -2939,6 +3021,8 @@ again:
 				if (mid != nritems &&
 				    leaf_space_used(l, mid, nritems - mid) +
 				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+					if (data_size && !tried_avoid_double)
+						goto push_for_double;
 					split = 2 ;
 				}
 			}
@@ -3019,6 +3103,13 @@ again:
 	}
 
 	return ret;
+
+push_for_double:
+	push_for_double_split(trans, root, path, data_size);
+	tried_avoid_double = 1;
+	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
+		return 0;
+	goto again;
 }
 
 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
@@ -3915,13 +4006,15 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			extent_buffer_get(leaf);
 
 			btrfs_set_path_blocking(path);
-			wret = push_leaf_left(trans, root, path, 1, 1);
+			wret = push_leaf_left(trans, root, path, 1, 1,
+					      1, (u32)-1);
 			if (wret < 0 && wret != -ENOSPC)
 				ret = wret;
 
 			if (path->nodes[0] == leaf &&
 			    btrfs_header_nritems(leaf)) {
-				wret = push_leaf_right(trans, root, path, 1, 1);
+				wret = push_leaf_right(trans, root, path, 1,
+						       1, 1, 0);
 				if (wret < 0 && wret != -ENOSPC)
 					ret = wret;
 			}
-- 
cgit v1.2.3


From b5384d48f4e74edec3ca1887cb65e378a72af9a1 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Sat, 12 Jun 2010 22:31:14 +0000
Subject: Btrfs: fix CLONE ioctl destination file size expansion to block
 boundary

The CLONE and CLONE_RANGE ioctls round up the range of extents being
cloned to the block size when the range to clone extends to the end of file
(this is always the case with CLONE).  It was then using that offset when
extending the destination file's i_size.  Fix this by not setting i_size
beyond the originally requested ending offset.

This bug was introduced by a22285a6 (2.6.35-rc1).

Signed-off-by: Sage Weil <sage@newdream.net>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4dbaf89b1337..2a8b3a7568ad 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1578,6 +1578,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 			u64 disko = 0, diskl = 0;
 			u64 datao = 0, datal = 0;
 			u8 comp;
+			u64 endoff;
 
 			size = btrfs_item_size_nr(leaf, slot);
 			read_extent_buffer(leaf, buf,
@@ -1712,9 +1713,18 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 			btrfs_release_path(root, path);
 
 			inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-			if (new_key.offset + datal > inode->i_size)
-				btrfs_i_size_write(inode,
-						   new_key.offset + datal);
+
+			/*
+			 * we round up to the block size at eof when
+			 * determining which extents to clone above,
+			 * but shouldn't round up the file size
+			 */
+			endoff = new_key.offset + datal;
+			if (endoff > off+olen)
+				endoff = off+olen;
+			if (endoff > inode->i_size)
+				btrfs_i_size_write(inode, endoff);
+
 			BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
 			ret = btrfs_update_inode(trans, root, inode);
 			BUG_ON(ret);
-- 
cgit v1.2.3


From 2ebc3464781ad24474abcbd2274e6254689853b5 Mon Sep 17 00:00:00 2001
From: Dan Rosenberg <dan.j.rosenberg@gmail.com>
Date: Mon, 19 Jul 2010 16:58:20 -0400
Subject: Btrfs: fix checks in BTRFS_IOC_CLONE_RANGE

1.  The BTRFS_IOC_CLONE and BTRFS_IOC_CLONE_RANGE ioctls should check
whether the donor file is append-only before writing to it.

2.  The BTRFS_IOC_CLONE_RANGE ioctl appears to have an integer
overflow that allows a user to specify an out-of-bounds range to copy
from the source file (if off + len wraps around).  I haven't been able
to successfully exploit this, but I'd imagine that a clever attacker
could use this to read things he shouldn't.  Even if it's not
exploitable, it couldn't hurt to be safe.

Signed-off-by: Dan Rosenberg <dan.j.rosenberg@gmail.com>
cc: stable@kernel.org
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2a8b3a7568ad..9254b3d58dbe 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1458,7 +1458,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	 */
 
 	/* the destination must be opened for writing */
-	if (!(file->f_mode & FMODE_WRITE))
+	if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
 		return -EINVAL;
 
 	ret = mnt_want_write(file->f_path.mnt);
@@ -1511,7 +1511,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 
 	/* determine range to clone */
 	ret = -EINVAL;
-	if (off >= src->i_size || off + len > src->i_size)
+	if (off + len > src->i_size || off + len < off)
 		goto out_unlock;
 	if (len == 0)
 		olen = len = src->i_size - off;
-- 
cgit v1.2.3


From 70e60ce71516c3a9e882edb70a09f696a05961db Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 20 Jul 2010 08:07:02 +1000
Subject: xfs: convert inode shrinker to per-filesystem contexts

Now the shrinker passes us a context, wire up a shrinker context per
filesystem. This allows us to remove the global mount list and the
locking problems that introduced. It also means that a shrinker call
does not need to traverse clean filesystems before finding a
filesystem with reclaimable inodes.  This significantly reduces
scanning overhead when lots of filesystems are present.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/linux-2.6/xfs_super.c |  2 --
 fs/xfs/linux-2.6/xfs_sync.c  | 62 ++++++++++----------------------------------
 fs/xfs/linux-2.6/xfs_sync.h  |  2 --
 fs/xfs/xfs_mount.h           |  2 +-
 4 files changed, 15 insertions(+), 53 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index f2d1718c9165..80938c736c27 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1883,7 +1883,6 @@ init_xfs_fs(void)
 		goto out_cleanup_procfs;
 
 	vfs_initquota();
-	xfs_inode_shrinker_init();
 
 	error = register_filesystem(&xfs_fs_type);
 	if (error)
@@ -1911,7 +1910,6 @@ exit_xfs_fs(void)
 {
 	vfs_exitquota();
 	unregister_filesystem(&xfs_fs_type);
-	xfs_inode_shrinker_destroy();
 	xfs_sysctl_unregister();
 	xfs_cleanup_procfs();
 	xfs_buf_terminate();
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index be375827af98..f433819611cb 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -828,14 +828,7 @@ xfs_reclaim_inodes(
 
 /*
  * Shrinker infrastructure.
- *
- * This is all far more complex than it needs to be. It adds a global list of
- * mounts because the shrinkers can only call a global context. We need to make
- * the shrinkers pass a context to avoid the need for global state.
  */
-static LIST_HEAD(xfs_mount_list);
-static struct rw_semaphore xfs_mount_list_lock;
-
 static int
 xfs_reclaim_inode_shrink(
 	struct shrinker	*shrink,
@@ -847,65 +840,38 @@ xfs_reclaim_inode_shrink(
 	xfs_agnumber_t	ag;
 	int		reclaimable = 0;
 
+	mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
 	if (nr_to_scan) {
 		if (!(gfp_mask & __GFP_FS))
 			return -1;
 
-		down_read(&xfs_mount_list_lock);
-		list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
-			xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
+		xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
 					XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);
-			if (nr_to_scan <= 0)
-				break;
-		}
-		up_read(&xfs_mount_list_lock);
-	}
+		/* if we don't exhaust the scan, don't bother coming back */
+		if (nr_to_scan > 0)
+			return -1;
+       }
 
-	down_read(&xfs_mount_list_lock);
-	list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
-		for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
-			pag = xfs_perag_get(mp, ag);
-			reclaimable += pag->pag_ici_reclaimable;
-			xfs_perag_put(pag);
-		}
+	for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
+		pag = xfs_perag_get(mp, ag);
+		reclaimable += pag->pag_ici_reclaimable;
+		xfs_perag_put(pag);
 	}
-	up_read(&xfs_mount_list_lock);
 	return reclaimable;
 }
 
-static struct shrinker xfs_inode_shrinker = {
-	.shrink = xfs_reclaim_inode_shrink,
-	.seeks = DEFAULT_SEEKS,
-};
-
-void __init
-xfs_inode_shrinker_init(void)
-{
-	init_rwsem(&xfs_mount_list_lock);
-	register_shrinker(&xfs_inode_shrinker);
-}
-
-void
-xfs_inode_shrinker_destroy(void)
-{
-	ASSERT(list_empty(&xfs_mount_list));
-	unregister_shrinker(&xfs_inode_shrinker);
-}
-
 void
 xfs_inode_shrinker_register(
 	struct xfs_mount	*mp)
 {
-	down_write(&xfs_mount_list_lock);
-	list_add_tail(&mp->m_mplist, &xfs_mount_list);
-	up_write(&xfs_mount_list_lock);
+	mp->m_inode_shrink.shrink = xfs_reclaim_inode_shrink;
+	mp->m_inode_shrink.seeks = DEFAULT_SEEKS;
+	register_shrinker(&mp->m_inode_shrink);
 }
 
 void
 xfs_inode_shrinker_unregister(
 	struct xfs_mount	*mp)
 {
-	down_write(&xfs_mount_list_lock);
-	list_del(&mp->m_mplist);
-	up_write(&xfs_mount_list_lock);
+	unregister_shrinker(&mp->m_inode_shrink);
 }
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
index cdcbaaca9880..e28139aaa4aa 100644
--- a/fs/xfs/linux-2.6/xfs_sync.h
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -55,8 +55,6 @@ int xfs_inode_ag_iterator(struct xfs_mount *mp,
 	int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
 	int flags, int tag, int write_lock, int *nr_to_scan);
 
-void xfs_inode_shrinker_init(void);
-void xfs_inode_shrinker_destroy(void);
 void xfs_inode_shrinker_register(struct xfs_mount *mp);
 void xfs_inode_shrinker_unregister(struct xfs_mount *mp);
 
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 1d2c7eed4eda..5761087ee8ea 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -259,7 +259,7 @@ typedef struct xfs_mount {
 	wait_queue_head_t	m_wait_single_sync_task;
 	__int64_t		m_update_flags;	/* sb flags we need to update
 						   on the next remount,rw */
-	struct list_head	m_mplist;	/* inode shrinker mount list */
+	struct shrinker		m_inode_shrink;	/* inode reclaim shrinker */
 } xfs_mount_t;
 
 /*
-- 
cgit v1.2.3


From 16fd5367370099b59d96e30bb7d9de8d419659f2 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 20 Jul 2010 09:43:39 +1000
Subject: xfs: track AGs with reclaimable inodes in per-ag radix tree

https://bugzilla.kernel.org/show_bug.cgi?id=16348

When the filesystem grows to a large number of allocation groups,
the summing of recalimable inodes gets expensive. In many cases,
most AGs won't have any reclaimable inodes and so we are wasting CPU
time aggregating over these AGs. This is particularly important for
the inode shrinker that gets called frequently under memory
pressure.

To avoid the overhead, track AGs with reclaimable inodes in the
per-ag radix tree so that we can find all the AGs with reclaimable
inodes via a simple gang tag lookup. This involves setting the tag
when the first reclaimable inode is tracked in the AG, and removing
the tag when the last reclaimable inode is removed from the tree.
Then the summation process becomes a loop walking the radix tree
summing AGs with the reclaim tag set.

This significantly reduces the overhead of scanning - a 6400 AG
filesystea now only uses about 25% of a cpu in kswapd while slab
reclaim progresses instead of being permanently stuck at 100% CPU
and making little progress. Clean filesystems filesystems will see
no overhead and the overhead only increases linearly with the number
of dirty AGs.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/linux-2.6/xfs_sync.c  | 71 +++++++++++++++++++++++++++++++++++++++-----
 fs/xfs/linux-2.6/xfs_trace.h |  3 ++
 2 files changed, 67 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index f433819611cb..a51a07c3a70c 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -144,6 +144,41 @@ restart:
 	return last_error;
 }
 
+/*
+ * Select the next per-ag structure to iterate during the walk. The reclaim
+ * walk is optimised only to walk AGs with reclaimable inodes in them.
+ */
+static struct xfs_perag *
+xfs_inode_ag_iter_next_pag(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		*first,
+	int			tag)
+{
+	struct xfs_perag	*pag = NULL;
+
+	if (tag == XFS_ICI_RECLAIM_TAG) {
+		int found;
+		int ref;
+
+		spin_lock(&mp->m_perag_lock);
+		found = radix_tree_gang_lookup_tag(&mp->m_perag_tree,
+				(void **)&pag, *first, 1, tag);
+		if (found <= 0) {
+			spin_unlock(&mp->m_perag_lock);
+			return NULL;
+		}
+		*first = pag->pag_agno + 1;
+		/* open coded pag reference increment */
+		ref = atomic_inc_return(&pag->pag_ref);
+		spin_unlock(&mp->m_perag_lock);
+		trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_);
+	} else {
+		pag = xfs_perag_get(mp, *first);
+		(*first)++;
+	}
+	return pag;
+}
+
 int
 xfs_inode_ag_iterator(
 	struct xfs_mount	*mp,
@@ -154,16 +189,15 @@ xfs_inode_ag_iterator(
 	int			exclusive,
 	int			*nr_to_scan)
 {
+	struct xfs_perag	*pag;
 	int			error = 0;
 	int			last_error = 0;
 	xfs_agnumber_t		ag;
 	int			nr;
 
 	nr = nr_to_scan ? *nr_to_scan : INT_MAX;
-	for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
-		struct xfs_perag	*pag;
-
-		pag = xfs_perag_get(mp, ag);
+	ag = 0;
+	while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) {
 		error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
 						exclusive, &nr);
 		xfs_perag_put(pag);
@@ -640,6 +674,17 @@ __xfs_inode_set_reclaim_tag(
 	radix_tree_tag_set(&pag->pag_ici_root,
 			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
 			   XFS_ICI_RECLAIM_TAG);
+
+	if (!pag->pag_ici_reclaimable) {
+		/* propagate the reclaim tag up into the perag radix tree */
+		spin_lock(&ip->i_mount->m_perag_lock);
+		radix_tree_tag_set(&ip->i_mount->m_perag_tree,
+				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+				XFS_ICI_RECLAIM_TAG);
+		spin_unlock(&ip->i_mount->m_perag_lock);
+		trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
+							-1, _RET_IP_);
+	}
 	pag->pag_ici_reclaimable++;
 }
 
@@ -674,6 +719,16 @@ __xfs_inode_clear_reclaim_tag(
 	radix_tree_tag_clear(&pag->pag_ici_root,
 			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
 	pag->pag_ici_reclaimable--;
+	if (!pag->pag_ici_reclaimable) {
+		/* clear the reclaim tag from the perag radix tree */
+		spin_lock(&ip->i_mount->m_perag_lock);
+		radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
+				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+				XFS_ICI_RECLAIM_TAG);
+		spin_unlock(&ip->i_mount->m_perag_lock);
+		trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
+							-1, _RET_IP_);
+	}
 }
 
 /*
@@ -838,7 +893,7 @@ xfs_reclaim_inode_shrink(
 	struct xfs_mount *mp;
 	struct xfs_perag *pag;
 	xfs_agnumber_t	ag;
-	int		reclaimable = 0;
+	int		reclaimable;
 
 	mp = container_of(shrink, struct xfs_mount, m_inode_shrink);
 	if (nr_to_scan) {
@@ -852,8 +907,10 @@ xfs_reclaim_inode_shrink(
 			return -1;
        }
 
-	for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
-		pag = xfs_perag_get(mp, ag);
+	reclaimable = 0;
+	ag = 0;
+	while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag,
+					XFS_ICI_RECLAIM_TAG))) {
 		reclaimable += pag->pag_ici_reclaimable;
 		xfs_perag_put(pag);
 	}
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 73d5aa117384..302820690904 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -124,7 +124,10 @@ DEFINE_EVENT(xfs_perag_class, name,	\
 		 unsigned long caller_ip),					\
 	TP_ARGS(mp, agno, refcount, caller_ip))
 DEFINE_PERAG_REF_EVENT(xfs_perag_get);
+DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim);
 DEFINE_PERAG_REF_EVENT(xfs_perag_put);
+DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
+DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
 
 TRACE_EVENT(xfs_attr_list_node_descend,
 	TP_PROTO(struct xfs_attr_list_context *ctx,
-- 
cgit v1.2.3


From a4ce96ac356e7024a7724ade9d18ba1bdf3c5c06 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 21 Jul 2010 09:25:42 -0700
Subject: Fix up trivial spelling errors ('taht' -> 'that')

Pointed out by Lucas who found the new one in a comment in
setup_percpu.c. And then I fixed the others that I grepped
for.

Reported-by: Lucas <canolucas@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/jffs2/xattr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index a2d58c96f1b4..d258e261bdc7 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -626,7 +626,7 @@ void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *i
 
 static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic)
 {
-	/* success of check_xattr_ref_inode() means taht inode (ic) dose not have
+	/* success of check_xattr_ref_inode() means that inode (ic) dose not have
 	 * duplicate name/value pairs. If duplicate name/value pair would be found,
 	 * one will be removed.
 	 */
-- 
cgit v1.2.3