ocfs2/dlm: wait for dlm recovery done when migrating all lock resources

Wait for dlm recovery done when migrating all lock resources in case that new lock resource left after leaving dlm domain. And the left lock resource will cause other nodes BUG. NodeA NodeB NodeC umount: dlm_unregister_domain() dlm_migrate_all_locks() NodeB down do recovery for NodeB and collect a new lockres form other live nodes: dlm_do_recovery dlm_remaster_locks dlm_request_all_locks: dlm_mig_lockres_handler dlm_new_lockres __dlm_insert_lockres at last NodeA become the master of the new lockres and leave domain: dlm_leave_domain() mount: dlm_join_domain() touch file and request for the owner of the new lockres, but all the other nodes said 'NO', so NodeC decide to be the owner, and send do assert msg to other nodes: dlmlock() dlm_get_lock_resource() dlm_do_assert_master() other nodes receive the msg and found two masters exist. at last cause BUG in dlm_assert_master_handler() -->BUG(); Link: http://lkml.kernel.org/r/5AAA6E25.7090303@huawei.com Fixes: bc9838c4d44a ("dlm: allow dlm do recovery during shutdown") Signed-off-by: Jun Piao <piaojun@huawei.com> Reviewed-by: Alex Chen <alex.chen@huawei.com> Reviewed-by: Yiwen Jiang <jiangyiwen@huawei.com> Acked-by: Joseph Qi <jiangqi903@gmail.com> Cc: Mark Fasheh <mark@fasheh.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Junxiao Bi <junxiao.bi@oracle.com> Cc: Changwei Ge <ge.changwei@h3c.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: piaojun <piaojun@huawei.com> 2018-04-05 16:19:11 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2018-04-05 21:36:22 -0700
commit: 60c7ec9ee4a3410c2cb08850102d363c7e207f48 (patch)
tree: 77b1cff7c9c386e7e338c07a1ad66ea249ab0d63 /fs/ocfs2
parent: a43d24cb3b0b395de8bb176355a907a9c9a1c42e (diff)
download: linux-60c7ec9ee4a3410c2cb08850102d363c7e207f48.tar.gz
linux-60c7ec9ee4a3410c2cb08850102d363c7e207f48.tar.bz2
linux-60c7ec9ee4a3410c2cb08850102d363c7e207f48.zip
3 files changed, 26 insertions, 3 deletions
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 953c200e1c30..d06e27ec4be4 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -140,6 +140,7 @@ struct dlm_ctxt
 	u8 node_num;
 	u32 key;
 	u8  joining_node;
+	u8 migrate_done; /* set to 1 means node has migrated all lock resources */
 	wait_queue_head_t dlm_join_events;
 	unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
 	unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 25b76f0d082b..425081be6161 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -461,6 +461,19 @@ redo_bucket:
 		cond_resched_lock(&dlm->spinlock);
 		num += n;
 	}
+
+	if (!num) {
+		if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
+			mlog(0, "%s: perhaps there are more lock resources "
+			     "need to be migrated after dlm recovery\n", dlm->name);
+			ret = -EAGAIN;
+		} else {
+			mlog(0, "%s: we won't do dlm recovery after migrating "
+			     "all lock resources\n", dlm->name);
+			dlm->migrate_done = 1;
+		}
+	}
+
 	spin_unlock(&dlm->spinlock);
 	wake_up(&dlm->dlm_thread_wq);
 
@@ -2038,6 +2051,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
 	init_waitqueue_head(&dlm->dlm_join_events);
 
+	dlm->migrate_done = 0;
+
 	dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
 	dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
 
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 86204b81ef34..b454eb371b77 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -423,12 +423,11 @@ void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
 
 static void dlm_begin_recovery(struct dlm_ctxt *dlm)
 {
-	spin_lock(&dlm->spinlock);
+	assert_spin_locked(&dlm->spinlock);
 	BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
 	printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n",
 	       dlm->name, dlm->reco.dead_node);
 	dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
-	spin_unlock(&dlm->spinlock);
 }
 
 static void dlm_end_recovery(struct dlm_ctxt *dlm)
@@ -456,6 +455,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
 
 	spin_lock(&dlm->spinlock);
 
+	if (dlm->migrate_done) {
+		mlog(0, "%s: no need do recovery after migrating all "
+		     "lock resources\n", dlm->name);
+		spin_unlock(&dlm->spinlock);
+		return 0;
+	}
+
 	/* check to see if the new master has died */
 	if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM &&
 	    test_bit(dlm->reco.new_master, dlm->recovery_map)) {
@@ -490,12 +496,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
 	mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n",
 	     dlm->name, task_pid_nr(dlm->dlm_reco_thread_task),
 	     dlm->reco.dead_node);
-	spin_unlock(&dlm->spinlock);
 
 	/* take write barrier */
 	/* (stops the list reshuffling thread, proxy ast handling) */
 	dlm_begin_recovery(dlm);
 
+	spin_unlock(&dlm->spinlock);
+
 	if (dlm->reco.new_master == dlm->node_num)
 		goto master_here;
author	piaojun <piaojun@huawei.com>	2018-04-05 16:19:11 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2018-04-05 21:36:22 -0700
commit	60c7ec9ee4a3410c2cb08850102d363c7e207f48 (patch)
tree	77b1cff7c9c386e7e338c07a1ad66ea249ab0d63 /fs/ocfs2
parent	a43d24cb3b0b395de8bb176355a907a9c9a1c42e (diff)
download	linux-60c7ec9ee4a3410c2cb08850102d363c7e207f48.tar.gz linux-60c7ec9ee4a3410c2cb08850102d363c7e207f48.tar.bz2 linux-60c7ec9ee4a3410c2cb08850102d363c7e207f48.zip