From 7adb072ca83d71103ecc8578bee5f73c4f1eba89 Mon Sep 17 00:00:00 2001
From: Tomasz Majchrzak <tomasz.majchrzak@intel.com>
Date: Wed, 26 Oct 2016 09:20:39 +0200
Subject: raid5: revert commit 11367799f3d1

Revert commit 11367799f3d1 ("md: Prevent IO hold during accessing to faulty
raid5 array") as it doesn't comply with commit c3cce6cda162 ("md/raid5:
ensure device failure recorded before write request returns."). That change
is not required anymore as the problem is resolved by commit 16f889499a52
("md: report 'write_pending' state when array in sync") - read request is
stuck as array state is not reported correctly via sysfs attribute.

Signed-off-by: Tomasz Majchrzak <tomasz.majchrzak@intel.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 92ac251e91e6..b6e4670624b8 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4645,9 +4645,7 @@ finish:
 	}
 
 	if (!bio_list_empty(&s.return_bi)) {
-		if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags) &&
-				(s.failed <= conf->max_degraded ||
-					conf->mddev->external == 0)) {
+		if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags)) {
 			spin_lock_irq(&conf->device_lock);
 			bio_list_merge(&conf->return_bi, &s.return_bi);
 			spin_unlock_irq(&conf->device_lock);
-- 
cgit v1.2.3


From cc6167b4f3b3caabe973ca12612ac7d60aae0cfc Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Wed, 2 Nov 2016 14:16:50 +1100
Subject: md/raid5: change printk() to pr_*()

Signed-off-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 207 ++++++++++++++++++++++-------------------------------
 1 file changed, 86 insertions(+), 121 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b6e4670624b8..df88656d8798 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -541,7 +541,7 @@ retry:
 
 		if (dev->toread || dev->read || dev->towrite || dev->written ||
 		    test_bit(R5_LOCKED, &dev->flags)) {
-			printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
+			pr_err("sector=%llx i=%d %p %p %p %p %d\n",
 			       (unsigned long long)sh->sector, i, dev->toread,
 			       dev->read, dev->towrite, dev->written,
 			       test_bit(R5_LOCKED, &dev->flags));
@@ -2347,10 +2347,8 @@ static void raid5_end_read_request(struct bio * bi)
 			 * replacement device.  We just fail those on
 			 * any error
 			 */
-			printk_ratelimited(
-				KERN_INFO
-				"md/raid:%s: read error corrected"
-				" (%lu sectors at %llu on %s)\n",
+			pr_info_ratelimited(
+				"md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n",
 				mdname(conf->mddev), STRIPE_SECTORS,
 				(unsigned long long)s,
 				bdevname(rdev->bdev, b));
@@ -2370,36 +2368,29 @@ static void raid5_end_read_request(struct bio * bi)
 		clear_bit(R5_UPTODATE, &sh->dev[i].flags);
 		atomic_inc(&rdev->read_errors);
 		if (test_bit(R5_ReadRepl, &sh->dev[i].flags))
-			printk_ratelimited(
-				KERN_WARNING
-				"md/raid:%s: read error on replacement device "
-				"(sector %llu on %s).\n",
+			pr_warn_ratelimited(
+				"md/raid:%s: read error on replacement device (sector %llu on %s).\n",
 				mdname(conf->mddev),
 				(unsigned long long)s,
 				bdn);
 		else if (conf->mddev->degraded >= conf->max_degraded) {
 			set_bad = 1;
-			printk_ratelimited(
-				KERN_WARNING
-				"md/raid:%s: read error not correctable "
-				"(sector %llu on %s).\n",
+			pr_warn_ratelimited(
+				"md/raid:%s: read error not correctable (sector %llu on %s).\n",
 				mdname(conf->mddev),
 				(unsigned long long)s,
 				bdn);
 		} else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) {
 			/* Oh, no!!! */
 			set_bad = 1;
-			printk_ratelimited(
-				KERN_WARNING
-				"md/raid:%s: read error NOT corrected!! "
-				"(sector %llu on %s).\n",
+			pr_warn_ratelimited(
+				"md/raid:%s: read error NOT corrected!! (sector %llu on %s).\n",
 				mdname(conf->mddev),
 				(unsigned long long)s,
 				bdn);
 		} else if (atomic_read(&rdev->read_errors)
 			 > conf->max_nr_stripes)
-			printk(KERN_WARNING
-			       "md/raid:%s: Too many read errors, failing device %s.\n",
+			pr_warn("md/raid:%s: Too many read errors, failing device %s.\n",
 			       mdname(conf->mddev), bdn);
 		else
 			retry = 1;
@@ -2533,13 +2524,12 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
 	set_bit(Faulty, &rdev->flags);
 	set_mask_bits(&mddev->flags, 0,
 		      BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
-	printk(KERN_ALERT
-	       "md/raid:%s: Disk failure on %s, disabling device.\n"
-	       "md/raid:%s: Operation continuing on %d devices.\n",
-	       mdname(mddev),
-	       bdevname(rdev->bdev, b),
-	       mdname(mddev),
-	       conf->raid_disks - mddev->degraded);
+	pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n"
+		"md/raid:%s: Operation continuing on %d devices.\n",
+		mdname(mddev),
+		bdevname(rdev->bdev, b),
+		mdname(mddev),
+		conf->raid_disks - mddev->degraded);
 }
 
 /*
@@ -2861,8 +2851,8 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
 				     previous, &dummy1, &sh2);
 	if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
 		|| sh2.qd_idx != sh->qd_idx) {
-		printk(KERN_ERR "md/raid:%s: compute_blocknr: map not correct\n",
-		       mdname(conf->mddev));
+		pr_warn("md/raid:%s: compute_blocknr: map not correct\n",
+			mdname(conf->mddev));
 		return 0;
 	}
 	return r_sector;
@@ -3782,7 +3772,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
 	case check_state_compute_run:
 		break;
 	default:
-		printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
+		pr_err("%s: unknown check_state: %d sector: %llu\n",
 		       __func__, sh->check_state,
 		       (unsigned long long) sh->sector);
 		BUG();
@@ -3946,9 +3936,9 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
 	case check_state_compute_run:
 		break;
 	default:
-		printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
-		       __func__, sh->check_state,
-		       (unsigned long long) sh->sector);
+		pr_warn("%s: unknown check_state: %d sector: %llu\n",
+			__func__, sh->check_state,
+			(unsigned long long) sh->sector);
 		BUG();
 	}
 }
@@ -6385,8 +6375,8 @@ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
 	struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
 
 	if (alloc_scratch_buffer(conf, percpu)) {
-		pr_err("%s: failed memory allocation for cpu%u\n",
-		       __func__, cpu);
+		pr_warn("%s: failed memory allocation for cpu%u\n",
+			__func__, cpu);
 		return -ENOMEM;
 	}
 	return 0;
@@ -6456,29 +6446,29 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	if (mddev->new_level != 5
 	    && mddev->new_level != 4
 	    && mddev->new_level != 6) {
-		printk(KERN_ERR "md/raid:%s: raid level not set to 4/5/6 (%d)\n",
-		       mdname(mddev), mddev->new_level);
+		pr_warn("md/raid:%s: raid level not set to 4/5/6 (%d)\n",
+			mdname(mddev), mddev->new_level);
 		return ERR_PTR(-EIO);
 	}
 	if ((mddev->new_level == 5
 	     && !algorithm_valid_raid5(mddev->new_layout)) ||
 	    (mddev->new_level == 6
 	     && !algorithm_valid_raid6(mddev->new_layout))) {
-		printk(KERN_ERR "md/raid:%s: layout %d not supported\n",
-		       mdname(mddev), mddev->new_layout);
+		pr_warn("md/raid:%s: layout %d not supported\n",
+			mdname(mddev), mddev->new_layout);
 		return ERR_PTR(-EIO);
 	}
 	if (mddev->new_level == 6 && mddev->raid_disks < 4) {
-		printk(KERN_ERR "md/raid:%s: not enough configured devices (%d, minimum 4)\n",
-		       mdname(mddev), mddev->raid_disks);
+		pr_warn("md/raid:%s: not enough configured devices (%d, minimum 4)\n",
+			mdname(mddev), mddev->raid_disks);
 		return ERR_PTR(-EINVAL);
 	}
 
 	if (!mddev->new_chunk_sectors ||
 	    (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
 	    !is_power_of_2(mddev->new_chunk_sectors)) {
-		printk(KERN_ERR "md/raid:%s: invalid chunk size %d\n",
-		       mdname(mddev), mddev->new_chunk_sectors << 9);
+		pr_warn("md/raid:%s: invalid chunk size %d\n",
+			mdname(mddev), mddev->new_chunk_sectors << 9);
 		return ERR_PTR(-EINVAL);
 	}
 
@@ -6569,9 +6559,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 
 		if (test_bit(In_sync, &rdev->flags)) {
 			char b[BDEVNAME_SIZE];
-			printk(KERN_INFO "md/raid:%s: device %s operational as raid"
-			       " disk %d\n",
-			       mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
+			pr_info("md/raid:%s: device %s operational as raid disk %d\n",
+				mdname(mddev), bdevname(rdev->bdev, b), raid_disk);
 		} else if (rdev->saved_raid_disk != raid_disk)
 			/* Cannot rely on bitmap to complete recovery */
 			conf->fullsync = 1;
@@ -6605,21 +6594,18 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 			((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
 		conf->min_nr_stripes = max(NR_STRIPES, stripes);
 		if (conf->min_nr_stripes != NR_STRIPES)
-			printk(KERN_INFO
-				"md/raid:%s: force stripe size %d for reshape\n",
+			pr_info("md/raid:%s: force stripe size %d for reshape\n",
 				mdname(mddev), conf->min_nr_stripes);
 	}
 	memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
 		 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
 	atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
 	if (grow_stripes(conf, conf->min_nr_stripes)) {
-		printk(KERN_ERR
-		       "md/raid:%s: couldn't allocate %dkB for buffers\n",
-		       mdname(mddev), memory);
+		pr_warn("md/raid:%s: couldn't allocate %dkB for buffers\n",
+			mdname(mddev), memory);
 		goto abort;
 	} else
-		printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
-		       mdname(mddev), memory);
+		pr_debug("md/raid:%s: allocated %dkB\n", mdname(mddev), memory);
 	/*
 	 * Losing a stripe head costs more than the time to refill it,
 	 * it reduces the queue depth and so can hurt throughput.
@@ -6631,18 +6617,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	conf->shrinker.batch = 128;
 	conf->shrinker.flags = 0;
 	if (register_shrinker(&conf->shrinker)) {
-		printk(KERN_ERR
-		       "md/raid:%s: couldn't register shrinker.\n",
-		       mdname(mddev));
+		pr_warn("md/raid:%s: couldn't register shrinker.\n",
+			mdname(mddev));
 		goto abort;
 	}
 
 	sprintf(pers_name, "raid%d", mddev->new_level);
 	conf->thread = md_register_thread(raid5d, mddev, pers_name);
 	if (!conf->thread) {
-		printk(KERN_ERR
-		       "md/raid:%s: couldn't allocate thread.\n",
-		       mdname(mddev));
+		pr_warn("md/raid:%s: couldn't allocate thread.\n",
+			mdname(mddev));
 		goto abort;
 	}
 
@@ -6695,9 +6679,8 @@ static int raid5_run(struct mddev *mddev)
 	int first = 1;
 
 	if (mddev->recovery_cp != MaxSector)
-		printk(KERN_NOTICE "md/raid:%s: not clean"
-		       " -- starting background reconstruction\n",
-		       mdname(mddev));
+		pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
+			  mdname(mddev));
 
 	rdev_for_each(rdev, mddev) {
 		long long diff;
@@ -6740,15 +6723,14 @@ static int raid5_run(struct mddev *mddev)
 		int new_data_disks;
 
 		if (journal_dev) {
-			printk(KERN_ERR "md/raid:%s: don't support reshape with journal - aborting.\n",
-			       mdname(mddev));
+			pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
+				mdname(mddev));
 			return -EINVAL;
 		}
 
 		if (mddev->new_level != mddev->level) {
-			printk(KERN_ERR "md/raid:%s: unsupported reshape "
-			       "required - aborting.\n",
-			       mdname(mddev));
+			pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
+				mdname(mddev));
 			return -EINVAL;
 		}
 		old_disks = mddev->raid_disks - mddev->delta_disks;
@@ -6763,8 +6745,8 @@ static int raid5_run(struct mddev *mddev)
 		chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
 		new_data_disks = mddev->raid_disks - max_degraded;
 		if (sector_div(here_new, chunk_sectors * new_data_disks)) {
-			printk(KERN_ERR "md/raid:%s: reshape_position not "
-			       "on a stripe boundary\n", mdname(mddev));
+			pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
+				mdname(mddev));
 			return -EINVAL;
 		}
 		reshape_offset = here_new * chunk_sectors;
@@ -6785,10 +6767,8 @@ static int raid5_run(struct mddev *mddev)
 			    abs(min_offset_diff) >= mddev->new_chunk_sectors)
 				/* not really in-place - so OK */;
 			else if (mddev->ro == 0) {
-				printk(KERN_ERR "md/raid:%s: in-place reshape "
-				       "must be started in read-only mode "
-				       "- aborting\n",
-				       mdname(mddev));
+				pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
+					mdname(mddev));
 				return -EINVAL;
 			}
 		} else if (mddev->reshape_backwards
@@ -6797,13 +6777,11 @@ static int raid5_run(struct mddev *mddev)
 		    : (here_new * chunk_sectors >=
 		       here_old * chunk_sectors + (-min_offset_diff))) {
 			/* Reading from the same stripe as writing to - bad */
-			printk(KERN_ERR "md/raid:%s: reshape_position too early for "
-			       "auto-recovery - aborting.\n",
-			       mdname(mddev));
+			pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
+				mdname(mddev));
 			return -EINVAL;
 		}
-		printk(KERN_INFO "md/raid:%s: reshape will continue\n",
-		       mdname(mddev));
+		pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
 		/* OK, we should be able to continue; */
 	} else {
 		BUG_ON(mddev->level != mddev->new_level);
@@ -6822,8 +6800,8 @@ static int raid5_run(struct mddev *mddev)
 
 	if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
 		if (!journal_dev) {
-			pr_err("md/raid:%s: journal disk is missing, force array readonly\n",
-			       mdname(mddev));
+			pr_warn("md/raid:%s: journal disk is missing, force array readonly\n",
+				mdname(mddev));
 			mddev->ro = 1;
 			set_disk_ro(mddev->gendisk, 1);
 		} else if (mddev->recovery_cp == MaxSector)
@@ -6850,8 +6828,7 @@ static int raid5_run(struct mddev *mddev)
 		if (conf->disks[i].replacement &&
 		    conf->reshape_progress != MaxSector) {
 			/* replacements and reshape simply do not mix. */
-			printk(KERN_ERR "md: cannot handle concurrent "
-			       "replacement and reshape.\n");
+			pr_warn("md: cannot handle concurrent replacement and reshape.\n");
 			goto abort;
 		}
 		if (test_bit(In_sync, &rdev->flags)) {
@@ -6893,8 +6870,7 @@ static int raid5_run(struct mddev *mddev)
 	mddev->degraded = calc_degraded(conf);
 
 	if (has_failed(conf)) {
-		printk(KERN_ERR "md/raid:%s: not enough operational devices"
-			" (%d/%d failed)\n",
+		pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n",
 			mdname(mddev), mddev->degraded, conf->raid_disks);
 		goto abort;
 	}
@@ -6906,29 +6882,19 @@ static int raid5_run(struct mddev *mddev)
 	if (mddev->degraded > dirty_parity_disks &&
 	    mddev->recovery_cp != MaxSector) {
 		if (mddev->ok_start_degraded)
-			printk(KERN_WARNING
-			       "md/raid:%s: starting dirty degraded array"
-			       " - data corruption possible.\n",
-			       mdname(mddev));
+			pr_crit("md/raid:%s: starting dirty degraded array - data corruption possible.\n",
+				mdname(mddev));
 		else {
-			printk(KERN_ERR
-			       "md/raid:%s: cannot start dirty degraded array.\n",
-			       mdname(mddev));
+			pr_crit("md/raid:%s: cannot start dirty degraded array.\n",
+				mdname(mddev));
 			goto abort;
 		}
 	}
 
-	if (mddev->degraded == 0)
-		printk(KERN_INFO "md/raid:%s: raid level %d active with %d out of %d"
-		       " devices, algorithm %d\n", mdname(mddev), conf->level,
-		       mddev->raid_disks-mddev->degraded, mddev->raid_disks,
-		       mddev->new_layout);
-	else
-		printk(KERN_ALERT "md/raid:%s: raid level %d active with %d"
-		       " out of %d devices, algorithm %d\n",
-		       mdname(mddev), conf->level,
-		       mddev->raid_disks - mddev->degraded,
-		       mddev->raid_disks, mddev->new_layout);
+	pr_info("md/raid:%s: raid level %d active with %d out of %d devices, algorithm %d\n",
+		mdname(mddev), conf->level,
+		mddev->raid_disks-mddev->degraded, mddev->raid_disks,
+		mddev->new_layout);
 
 	print_raid5_conf(conf);
 
@@ -6948,9 +6914,8 @@ static int raid5_run(struct mddev *mddev)
 		mddev->to_remove = NULL;
 	else if (mddev->kobj.sd &&
 	    sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
-		printk(KERN_WARNING
-		       "raid5: failed to create sysfs attributes for %s\n",
-		       mdname(mddev));
+		pr_warn("raid5: failed to create sysfs attributes for %s\n",
+			mdname(mddev));
 	md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
 
 	if (mddev->queue) {
@@ -7038,8 +7003,8 @@ static int raid5_run(struct mddev *mddev)
 	if (journal_dev) {
 		char b[BDEVNAME_SIZE];
 
-		printk(KERN_INFO"md/raid:%s: using device %s as journal\n",
-		       mdname(mddev), bdevname(journal_dev->bdev, b));
+		pr_debug("md/raid:%s: using device %s as journal\n",
+			 mdname(mddev), bdevname(journal_dev->bdev, b));
 		r5l_init_log(conf, journal_dev);
 	}
 
@@ -7049,7 +7014,7 @@ abort:
 	print_raid5_conf(conf);
 	free_conf(conf);
 	mddev->private = NULL;
-	printk(KERN_ALERT "md/raid:%s: failed to run raid set.\n", mdname(mddev));
+	pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
 	return -EIO;
 }
 
@@ -7083,12 +7048,12 @@ static void print_raid5_conf (struct r5conf *conf)
 	int i;
 	struct disk_info *tmp;
 
-	printk(KERN_DEBUG "RAID conf printout:\n");
+	pr_debug("RAID conf printout:\n");
 	if (!conf) {
-		printk("(conf==NULL)\n");
+		pr_debug("(conf==NULL)\n");
 		return;
 	}
-	printk(KERN_DEBUG " --- level:%d rd:%d wd:%d\n", conf->level,
+	pr_debug(" --- level:%d rd:%d wd:%d\n", conf->level,
 	       conf->raid_disks,
 	       conf->raid_disks - conf->mddev->degraded);
 
@@ -7096,7 +7061,7 @@ static void print_raid5_conf (struct r5conf *conf)
 		char b[BDEVNAME_SIZE];
 		tmp = conf->disks + i;
 		if (tmp->rdev)
-			printk(KERN_DEBUG " disk %d, o:%d, dev:%s\n",
+			pr_debug(" disk %d, o:%d, dev:%s\n",
 			       i, !test_bit(Faulty, &tmp->rdev->flags),
 			       bdevname(tmp->rdev->bdev, b));
 	}
@@ -7244,8 +7209,8 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 		 * write requests running. We should be safe
 		 */
 		r5l_init_log(conf, rdev);
-		printk(KERN_INFO"md/raid:%s: using device %s as journal\n",
-		       mdname(mddev), bdevname(rdev->bdev, b));
+		pr_debug("md/raid:%s: using device %s as journal\n",
+			 mdname(mddev), bdevname(rdev->bdev, b));
 		return 0;
 	}
 	if (mddev->recovery_disabled == conf->recovery_disabled)
@@ -7349,10 +7314,10 @@ static int check_stripe_cache(struct mddev *mddev)
 	    > conf->min_nr_stripes ||
 	    ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
 	    > conf->min_nr_stripes) {
-		printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes.  Needed %lu\n",
-		       mdname(mddev),
-		       ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
-			/ STRIPE_SIZE)*4);
+		pr_warn("md/raid:%s: reshape: not enough stripes.  Needed %lu\n",
+			mdname(mddev),
+			((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
+			 / STRIPE_SIZE)*4);
 		return 0;
 	}
 	return 1;
@@ -7433,8 +7398,8 @@ static int raid5_start_reshape(struct mddev *mddev)
 	 */
 	if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
 	    < mddev->array_sectors) {
-		printk(KERN_ERR "md/raid:%s: array size must be reduced "
-		       "before number of disks\n", mdname(mddev));
+		pr_warn("md/raid:%s: array size must be reduced before number of disks\n",
+			mdname(mddev));
 		return -EINVAL;
 	}
 
@@ -7652,8 +7617,8 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level)
 
 	/* for raid0 takeover only one zone is supported */
 	if (raid0_conf->nr_strip_zones > 1) {
-		printk(KERN_ERR "md/raid:%s: cannot takeover raid0 with more than one zone.\n",
-		       mdname(mddev));
+		pr_warn("md/raid:%s: cannot takeover raid0 with more than one zone.\n",
+			mdname(mddev));
 		return ERR_PTR(-EINVAL);
 	}
 
-- 
cgit v1.2.3


From 937621c36e0ea1af2aceeaea412ba3bd80247199 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Thu, 17 Nov 2016 15:24:37 -0800
Subject: md/r5cache: move some code to raid5.h

Move some define and inline functions to raid5.h, so they can be
used in raid5-cache.c

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 71 ------------------------------------------------------
 1 file changed, 71 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index df88656d8798..34895f3218d9 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -70,19 +70,6 @@ module_param(devices_handle_discard_safely, bool, 0644);
 MODULE_PARM_DESC(devices_handle_discard_safely,
 		 "Set to Y if all devices in each array reliably return zeroes on reads from discarded regions");
 static struct workqueue_struct *raid5_wq;
-/*
- * Stripe cache
- */
-
-#define NR_STRIPES		256
-#define STRIPE_SIZE		PAGE_SIZE
-#define STRIPE_SHIFT		(PAGE_SHIFT - 9)
-#define STRIPE_SECTORS		(STRIPE_SIZE>>9)
-#define	IO_THRESHOLD		1
-#define BYPASS_THRESHOLD	1
-#define NR_HASH			(PAGE_SIZE / sizeof(struct hlist_head))
-#define HASH_MASK		(NR_HASH - 1)
-#define MAX_STRIPE_BATCH	8
 
 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
 {
@@ -126,64 +113,6 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
 	local_irq_enable();
 }
 
-/* bio's attached to a stripe+device for I/O are linked together in bi_sector
- * order without overlap.  There may be several bio's per stripe+device, and
- * a bio could span several devices.
- * When walking this list for a particular stripe+device, we must never proceed
- * beyond a bio that extends past this device, as the next bio might no longer
- * be valid.
- * This function is used to determine the 'next' bio in the list, given the sector
- * of the current stripe+device
- */
-static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
-{
-	int sectors = bio_sectors(bio);
-	if (bio->bi_iter.bi_sector + sectors < sector + STRIPE_SECTORS)
-		return bio->bi_next;
-	else
-		return NULL;
-}
-
-/*
- * We maintain a biased count of active stripes in the bottom 16 bits of
- * bi_phys_segments, and a count of processed stripes in the upper 16 bits
- */
-static inline int raid5_bi_processed_stripes(struct bio *bio)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-	return (atomic_read(segments) >> 16) & 0xffff;
-}
-
-static inline int raid5_dec_bi_active_stripes(struct bio *bio)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-	return atomic_sub_return(1, segments) & 0xffff;
-}
-
-static inline void raid5_inc_bi_active_stripes(struct bio *bio)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-	atomic_inc(segments);
-}
-
-static inline void raid5_set_bi_processed_stripes(struct bio *bio,
-	unsigned int cnt)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-	int old, new;
-
-	do {
-		old = atomic_read(segments);
-		new = (old & 0xffff) | (cnt << 16);
-	} while (atomic_cmpxchg(segments, old, new) != old);
-}
-
-static inline void raid5_set_bi_stripes(struct bio *bio, unsigned int cnt)
-{
-	atomic_t *segments = (atomic_t *)&bio->bi_phys_segments;
-	atomic_set(segments, cnt);
-}
-
 /* Find first data disk in a raid6 stripe */
 static inline int raid6_d0(struct stripe_head *sh)
 {
-- 
cgit v1.2.3


From 2ded370373a400c20cf0c6e941e724e61582a867 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Thu, 17 Nov 2016 15:24:38 -0800
Subject: md/r5cache: State machine for raid5-cache write back mode

This patch adds state machine for raid5-cache. With log device, the
raid456 array could operate in two different modes (r5c_journal_mode):
  - write-back (R5C_MODE_WRITE_BACK)
  - write-through (R5C_MODE_WRITE_THROUGH)

Existing code of raid5-cache only has write-through mode. For write-back
cache, it is necessary to extend the state machine.

With write-back cache, every stripe could operate in two different
phases:
  - caching
  - writing-out

In caching phase, the stripe handles writes as:
  - write to journal
  - return IO

In writing-out phase, the stripe behaviors as a stripe in write through
mode R5C_MODE_WRITE_THROUGH.

STRIPE_R5C_CACHING is added to sh->state to differentiate caching and
writing-out phase.

Please note: this is a "no-op" patch for raid5-cache write-through
mode.

The following detailed explanation is copied from the raid5-cache.c:

/*
 * raid5 cache state machine
 *
 * With rhe RAID cache, each stripe works in two phases:
 *      - caching phase
 *      - writing-out phase
 *
 * These two phases are controlled by bit STRIPE_R5C_CACHING:
 *   if STRIPE_R5C_CACHING == 0, the stripe is in writing-out phase
 *   if STRIPE_R5C_CACHING == 1, the stripe is in caching phase
 *
 * When there is no journal, or the journal is in write-through mode,
 * the stripe is always in writing-out phase.
 *
 * For write-back journal, the stripe is sent to caching phase on write
 * (r5c_handle_stripe_dirtying). r5c_make_stripe_write_out() kicks off
 * the write-out phase by clearing STRIPE_R5C_CACHING.
 *
 * Stripes in caching phase do not write the raid disks. Instead, all
 * writes are committed from the log device. Therefore, a stripe in
 * caching phase handles writes as:
 *      - write to log device
 *      - return IO
 *
 * Stripes in writing-out phase handle writes as:
 *      - calculate parity
 *      - write pending data and parity to journal
 *      - write data and parity to raid disks
 *      - return IO for pending writes
 */

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 45 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 41 insertions(+), 4 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 34895f3218d9..7c98eb06d1b2 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4107,6 +4107,9 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 			if (rdev && !test_bit(Faulty, &rdev->flags))
 				do_recovery = 1;
 		}
+
+		if (test_bit(R5_InJournal, &dev->flags))
+			s->injournal++;
 	}
 	if (test_bit(STRIPE_SYNCING, &sh->state)) {
 		/* If there is a failed device being replaced,
@@ -4386,14 +4389,47 @@ static void handle_stripe(struct stripe_head *sh)
 	    || s.expanding)
 		handle_stripe_fill(sh, &s, disks);
 
-	/* Now to consider new write requests and what else, if anything
-	 * should be read.  We do not handle new writes when:
+	/*
+	 * When the stripe finishes full journal write cycle (write to journal
+	 * and raid disk), this is the clean up procedure so it is ready for
+	 * next operation.
+	 */
+	r5c_finish_stripe_write_out(conf, sh, &s);
+
+	/*
+	 * Now to consider new write requests, cache write back and what else,
+	 * if anything should be read.  We do not handle new writes when:
 	 * 1/ A 'write' operation (copy+xor) is already in flight.
 	 * 2/ A 'check' operation is in flight, as it may clobber the parity
 	 *    block.
+	 * 3/ A r5c cache log write is in flight.
 	 */
-	if (s.to_write && !sh->reconstruct_state && !sh->check_state)
-		handle_stripe_dirtying(conf, sh, &s, disks);
+
+	if (!sh->reconstruct_state && !sh->check_state && !sh->log_io) {
+		if (!r5c_is_writeback(conf->log)) {
+			if (s.to_write)
+				handle_stripe_dirtying(conf, sh, &s, disks);
+		} else { /* write back cache */
+			int ret = 0;
+
+			/* First, try handle writes in caching phase */
+			if (s.to_write)
+				ret = r5c_try_caching_write(conf, sh, &s,
+							    disks);
+			/*
+			 * If caching phase failed: ret == -EAGAIN
+			 *    OR
+			 * stripe under reclaim: !caching && injournal
+			 *
+			 * fall back to handle_stripe_dirtying()
+			 */
+			if (ret == -EAGAIN ||
+			    /* stripe under reclaim: !caching && injournal */
+			    (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
+			     s.injournal > 0))
+				handle_stripe_dirtying(conf, sh, &s, disks);
+		}
+	}
 
 	/* maybe we need to check and possibly fix the parity for this stripe
 	 * Any reads will already have been scheduled, so we just see if enough
@@ -5110,6 +5146,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	 * data on failed drives.
 	 */
 	if (rw == READ && mddev->degraded == 0 &&
+	    !r5c_is_writeback(conf->log) &&
 	    mddev->reshape_position == MaxSector) {
 		bi = chunk_aligned_read(mddev, bi);
 		if (!bi)
-- 
cgit v1.2.3


From 1e6d690b9334b7e1b31d25fd8d93e980e449a5f9 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Thu, 17 Nov 2016 15:24:39 -0800
Subject: md/r5cache: caching phase of r5cache

As described in previous patch, write back cache operates in two
phases: caching and writing-out. The caching phase works as:
1. write data to journal
   (r5c_handle_stripe_dirtying, r5c_cache_data)
2. call bio_endio
   (r5c_handle_data_cached, r5c_return_dev_pending_writes).

Then the writing-out phase is as:
1. Mark the stripe as write-out (r5c_make_stripe_write_out)
2. Calcualte parity (reconstruct or RMW)
3. Write parity (and maybe some other data) to journal device
4. Write data and parity to RAID disks

This patch implements caching phase. The cache is integrated with
stripe cache of raid456. It leverages code of r5l_log to write
data to journal device.

Writing-out phase of the cache is implemented in the next patch.

With r5cache, write operation does not wait for parity calculation
and write out, so the write latency is lower (1 write to journal
device vs. read and then write to raid disks). Also, r5cache will
reduce RAID overhead (multipile IO due to read-modify-write of
parity) and provide more opportunities of full stripe writes.

This patch adds 2 flags to stripe_head.state:
 - STRIPE_R5C_PARTIAL_STRIPE,
 - STRIPE_R5C_FULL_STRIPE,

Instead of inactive_list, stripes with cached data are tracked in
r5conf->r5c_full_stripe_list and r5conf->r5c_partial_stripe_list.
STRIPE_R5C_FULL_STRIPE and STRIPE_R5C_PARTIAL_STRIPE are flags for
stripes in these lists. Note: stripes in r5c_full/partial_stripe_list
are not considered as "active".

For RMW, the code allocates an extra page for each data block
being updated.  This is stored in r5dev->orig_page and the old data
is read into it.  Then the prexor calculation subtracts ->orig_page
from the parity block, and the reconstruct calculation adds the
->page data back into the parity block.

r5cache naturally excludes SkipCopy. When the array has write back
cache, async_copy_data() will not skip copy.

There are some known limitations of the cache implementation:

1. Write cache only covers full page writes (R5_OVERWRITE). Writes
   of smaller granularity are write through.
2. Only one log io (sh->log_io) for each stripe at anytime. Later
   writes for the same stripe have to wait. This can be improved by
   moving log_io to r5dev.
3. With writeback cache, read path must enter state machine, which
   is a significant bottleneck for some workloads.
4. There is no per stripe checkpoint (with r5l_payload_flush) in
   the log, so recovery code has to replay more than necessary data
   (sometimes all the log from last_checkpoint). This reduces
   availability of the array.

This patch includes a fix proposed by ZhengYuan Liu
<liuzhengyuan@kylinos.cn>

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 152 +++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 130 insertions(+), 22 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7c98eb06d1b2..f535ce2c267a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -218,8 +218,17 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
 static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 			      struct list_head *temp_inactive_list)
 {
+	int i;
+	int injournal = 0;	/* number of date pages with R5_InJournal */
+
 	BUG_ON(!list_empty(&sh->lru));
 	BUG_ON(atomic_read(&conf->active_stripes)==0);
+
+	if (r5c_is_writeback(conf->log))
+		for (i = sh->disks; i--; )
+			if (test_bit(R5_InJournal, &sh->dev[i].flags))
+				injournal++;
+
 	if (test_bit(STRIPE_HANDLE, &sh->state)) {
 		if (test_bit(STRIPE_DELAYED, &sh->state) &&
 		    !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
@@ -245,8 +254,29 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 			    < IO_THRESHOLD)
 				md_wakeup_thread(conf->mddev->thread);
 		atomic_dec(&conf->active_stripes);
-		if (!test_bit(STRIPE_EXPANDING, &sh->state))
-			list_add_tail(&sh->lru, temp_inactive_list);
+		if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
+			if (!r5c_is_writeback(conf->log))
+				list_add_tail(&sh->lru, temp_inactive_list);
+			else {
+				WARN_ON(test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags));
+				if (injournal == 0)
+					list_add_tail(&sh->lru, temp_inactive_list);
+				else if (injournal == conf->raid_disks - conf->max_degraded) {
+					/* full stripe */
+					if (!test_and_set_bit(STRIPE_R5C_FULL_STRIPE, &sh->state))
+						atomic_inc(&conf->r5c_cached_full_stripes);
+					if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
+						atomic_dec(&conf->r5c_cached_partial_stripes);
+					list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
+				} else {
+					/* partial stripe */
+					if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE,
+							      &sh->state))
+						atomic_inc(&conf->r5c_cached_partial_stripes);
+					list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list);
+				}
+			}
+		}
 	}
 }
 
@@ -830,8 +860,17 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 
 	might_sleep();
 
-	if (r5l_write_stripe(conf->log, sh) == 0)
-		return;
+	if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
+		/* writing out phase */
+		if (r5l_write_stripe(conf->log, sh) == 0)
+			return;
+	} else {  /* caching phase */
+		if (test_bit(STRIPE_LOG_TRAPPED, &sh->state)) {
+			r5c_cache_data(conf->log, sh, s);
+			return;
+		}
+	}
+
 	for (i = disks; i--; ) {
 		int op, op_flags = 0;
 		int replace_only = 0;
@@ -1044,7 +1083,7 @@ again:
 static struct dma_async_tx_descriptor *
 async_copy_data(int frombio, struct bio *bio, struct page **page,
 	sector_t sector, struct dma_async_tx_descriptor *tx,
-	struct stripe_head *sh)
+	struct stripe_head *sh, int no_skipcopy)
 {
 	struct bio_vec bvl;
 	struct bvec_iter iter;
@@ -1084,7 +1123,8 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
 			if (frombio) {
 				if (sh->raid_conf->skip_copy &&
 				    b_offset == 0 && page_offset == 0 &&
-				    clen == STRIPE_SIZE)
+				    clen == STRIPE_SIZE &&
+				    !no_skipcopy)
 					*page = bio_page;
 				else
 					tx = async_memcpy(*page, bio_page, page_offset,
@@ -1166,7 +1206,7 @@ static void ops_run_biofill(struct stripe_head *sh)
 			while (rbi && rbi->bi_iter.bi_sector <
 				dev->sector + STRIPE_SECTORS) {
 				tx = async_copy_data(0, rbi, &dev->page,
-					dev->sector, tx, sh);
+						     dev->sector, tx, sh, 0);
 				rbi = r5_next_bio(rbi, dev->sector);
 			}
 		}
@@ -1293,10 +1333,15 @@ static int set_syndrome_sources(struct page **srcs,
 		if (i == sh->qd_idx || i == sh->pd_idx ||
 		    (srctype == SYNDROME_SRC_ALL) ||
 		    (srctype == SYNDROME_SRC_WANT_DRAIN &&
-		     test_bit(R5_Wantdrain, &dev->flags)) ||
+		     (test_bit(R5_Wantdrain, &dev->flags) ||
+		      test_bit(R5_InJournal, &dev->flags))) ||
 		    (srctype == SYNDROME_SRC_WRITTEN &&
-		     dev->written))
-			srcs[slot] = sh->dev[i].page;
+		     dev->written)) {
+			if (test_bit(R5_InJournal, &dev->flags))
+				srcs[slot] = sh->dev[i].orig_page;
+			else
+				srcs[slot] = sh->dev[i].page;
+		}
 		i = raid6_next_disk(i, disks);
 	} while (i != d0_idx);
 
@@ -1475,6 +1520,13 @@ static void ops_complete_prexor(void *stripe_head_ref)
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
+
+	if (r5c_is_writeback(sh->raid_conf->log))
+		/*
+		 * raid5-cache write back uses orig_page during prexor.
+		 * After prexor, it is time to free orig_page
+		 */
+		r5c_release_extra_page(sh);
 }
 
 static struct dma_async_tx_descriptor *
@@ -1496,7 +1548,9 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
 	for (i = disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
 		/* Only process blocks that are known to be uptodate */
-		if (test_bit(R5_Wantdrain, &dev->flags))
+		if (test_bit(R5_InJournal, &dev->flags))
+			xor_srcs[count++] = dev->orig_page;
+		else if (test_bit(R5_Wantdrain, &dev->flags))
 			xor_srcs[count++] = dev->page;
 	}
 
@@ -1530,6 +1584,7 @@ ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
 static struct dma_async_tx_descriptor *
 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 {
+	struct r5conf *conf = sh->raid_conf;
 	int disks = sh->disks;
 	int i;
 	struct stripe_head *head_sh = sh;
@@ -1547,6 +1602,11 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 
 again:
 			dev = &sh->dev[i];
+			/*
+			 * clear R5_InJournal, so when rewriting a page in
+			 * journal, it is not skipped by r5l_log_stripe()
+			 */
+			clear_bit(R5_InJournal, &dev->flags);
 			spin_lock_irq(&sh->stripe_lock);
 			chosen = dev->towrite;
 			dev->towrite = NULL;
@@ -1566,8 +1626,10 @@ again:
 					set_bit(R5_Discard, &dev->flags);
 				else {
 					tx = async_copy_data(1, wbi, &dev->page,
-						dev->sector, tx, sh);
-					if (dev->page != dev->orig_page) {
+							     dev->sector, tx, sh,
+							     r5c_is_writeback(conf->log));
+					if (dev->page != dev->orig_page &&
+					    !r5c_is_writeback(conf->log)) {
 						set_bit(R5_SkipCopy, &dev->flags);
 						clear_bit(R5_UPTODATE, &dev->flags);
 						clear_bit(R5_OVERWRITE, &dev->flags);
@@ -1675,7 +1737,8 @@ again:
 		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if (head_sh->dev[i].written)
+			if (head_sh->dev[i].written ||
+			    test_bit(R5_InJournal, &head_sh->dev[i].flags))
 				xor_srcs[count++] = dev->page;
 		}
 	} else {
@@ -2796,6 +2859,13 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 	int level = conf->level;
 
 	if (rcw) {
+		/*
+		 * In some cases, handle_stripe_dirtying initially decided to
+		 * run rmw and allocates extra page for prexor. However, rcw is
+		 * cheaper later on. We need to free the extra page now,
+		 * because we won't be able to do that in ops_complete_prexor().
+		 */
+		r5c_release_extra_page(sh);
 
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
@@ -2806,6 +2876,9 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 				if (!expand)
 					clear_bit(R5_UPTODATE, &dev->flags);
 				s->locked++;
+			} else if (test_bit(R5_InJournal, &dev->flags)) {
+				set_bit(R5_LOCKED, &dev->flags);
+				s->locked++;
 			}
 		}
 		/* if we are not expanding this is a proper write request, and
@@ -2845,6 +2918,9 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 				set_bit(R5_LOCKED, &dev->flags);
 				clear_bit(R5_UPTODATE, &dev->flags);
 				s->locked++;
+			} else if (test_bit(R5_InJournal, &dev->flags)) {
+				set_bit(R5_LOCKED, &dev->flags);
+				s->locked++;
 			}
 		}
 		if (!s->locked)
@@ -3516,9 +3592,12 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 	} else for (i = disks; i--; ) {
 		/* would I have to read this buffer for read_modify_write */
 		struct r5dev *dev = &sh->dev[i];
-		if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
+		if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx ||
+		     test_bit(R5_InJournal, &dev->flags)) &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
-		    !(test_bit(R5_UPTODATE, &dev->flags) ||
+		    !((test_bit(R5_UPTODATE, &dev->flags) &&
+		       (!test_bit(R5_InJournal, &dev->flags) ||
+			dev->page != dev->orig_page)) ||
 		      test_bit(R5_Wantcompute, &dev->flags))) {
 			if (test_bit(R5_Insync, &dev->flags))
 				rmw++;
@@ -3530,13 +3609,15 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 		    i != sh->pd_idx && i != sh->qd_idx &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
 		    !(test_bit(R5_UPTODATE, &dev->flags) ||
-		    test_bit(R5_Wantcompute, &dev->flags))) {
+		      test_bit(R5_InJournal, &dev->flags) ||
+		      test_bit(R5_Wantcompute, &dev->flags))) {
 			if (test_bit(R5_Insync, &dev->flags))
 				rcw++;
 			else
 				rcw += 2*disks;
 		}
 	}
+
 	pr_debug("for sector %llu, rmw=%d rcw=%d\n",
 		(unsigned long long)sh->sector, rmw, rcw);
 	set_bit(STRIPE_HANDLE, &sh->state);
@@ -3548,10 +3629,24 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 					  (unsigned long long)sh->sector, rmw);
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
+			if (test_bit(R5_InJournal, &dev->flags) &&
+			    dev->page == dev->orig_page &&
+			    !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
+				/* alloc page for prexor */
+				dev->orig_page = alloc_page(GFP_NOIO);
+
+				/* will handle failure in a later patch*/
+				BUG_ON(!dev->orig_page);
+			}
+
+			if ((dev->towrite ||
+			     i == sh->pd_idx || i == sh->qd_idx ||
+			     test_bit(R5_InJournal, &dev->flags)) &&
 			    !test_bit(R5_LOCKED, &dev->flags) &&
-			    !(test_bit(R5_UPTODATE, &dev->flags) ||
-			    test_bit(R5_Wantcompute, &dev->flags)) &&
+			    !((test_bit(R5_UPTODATE, &dev->flags) &&
+			       (!test_bit(R5_InJournal, &dev->flags) ||
+				dev->page != dev->orig_page)) ||
+			      test_bit(R5_Wantcompute, &dev->flags)) &&
 			    test_bit(R5_Insync, &dev->flags)) {
 				if (test_bit(STRIPE_PREREAD_ACTIVE,
 					     &sh->state)) {
@@ -3577,6 +3672,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 			    i != sh->pd_idx && i != sh->qd_idx &&
 			    !test_bit(R5_LOCKED, &dev->flags) &&
 			    !(test_bit(R5_UPTODATE, &dev->flags) ||
+			      test_bit(R5_InJournal, &dev->flags) ||
 			      test_bit(R5_Wantcompute, &dev->flags))) {
 				rcw++;
 				if (test_bit(R5_Insync, &dev->flags) &&
@@ -3616,7 +3712,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 	 */
 	if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
 	    (s->locked == 0 && (rcw == 0 || rmw == 0) &&
-	    !test_bit(STRIPE_BIT_DELAY, &sh->state)))
+	     !test_bit(STRIPE_BIT_DELAY, &sh->state)))
 		schedule_reconstruction(sh, s, rcw == 0, 0);
 }
 
@@ -4110,6 +4206,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 
 		if (test_bit(R5_InJournal, &dev->flags))
 			s->injournal++;
+		if (test_bit(R5_InJournal, &dev->flags) && dev->written)
+			s->just_cached++;
 	}
 	if (test_bit(STRIPE_SYNCING, &sh->state)) {
 		/* If there is a failed device being replaced,
@@ -4338,7 +4436,8 @@ static void handle_stripe(struct stripe_head *sh)
 			struct r5dev *dev = &sh->dev[i];
 			if (test_bit(R5_LOCKED, &dev->flags) &&
 				(i == sh->pd_idx || i == sh->qd_idx ||
-				 dev->written)) {
+				 dev->written || test_bit(R5_InJournal,
+							  &dev->flags))) {
 				pr_debug("Writing block %d\n", i);
 				set_bit(R5_Wantwrite, &dev->flags);
 				if (prexor)
@@ -4378,6 +4477,10 @@ static void handle_stripe(struct stripe_head *sh)
 				 test_bit(R5_Discard, &qdev->flags))))))
 		handle_stripe_clean_event(conf, sh, disks, &s.return_bi);
 
+	if (s.just_cached)
+		r5c_handle_cached_data_endio(conf, sh, disks, &s.return_bi);
+	r5l_stripe_write_finished(sh);
+
 	/* Now we might consider reading some blocks, either to check/generate
 	 * parity, or to satisfy requests
 	 * or to load a block that is being partially written.
@@ -6499,6 +6602,11 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
 		INIT_LIST_HEAD(conf->temp_inactive_list + i);
 
+	atomic_set(&conf->r5c_cached_full_stripes, 0);
+	INIT_LIST_HEAD(&conf->r5c_full_stripe_list);
+	atomic_set(&conf->r5c_cached_partial_stripes, 0);
+	INIT_LIST_HEAD(&conf->r5c_partial_stripe_list);
+
 	conf->level = mddev->new_level;
 	conf->chunk_sectors = mddev->new_chunk_sectors;
 	if (raid5_alloc_percpu(conf) != 0)
-- 
cgit v1.2.3


From a39f7afde358ca89e9fc09a5525d3f8631a98a3a Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Thu, 17 Nov 2016 15:24:40 -0800
Subject: md/r5cache: write-out phase and reclaim support

There are two limited resources, stripe cache and journal disk space.
For better performance, we priotize reclaim of full stripe writes.
To free up more journal space, we free earliest data on the journal.

In current implementation, reclaim happens when:
1. Periodically (every R5C_RECLAIM_WAKEUP_INTERVAL, 30 seconds) reclaim
   if there is no reclaim in the past 5 seconds.
2. when there are R5C_FULL_STRIPE_FLUSH_BATCH (256) cached full stripes,
   or cached stripes is enough for a full stripe (chunk size / 4k)
   (r5c_check_cached_full_stripe)
3. when there is pressure on stripe cache (r5c_check_stripe_cache_usage)
4. when there is pressure on journal space (r5l_write_stripe, r5c_cache_data)

r5c_do_reclaim() contains new logic of reclaim.

For stripe cache:

When stripe cache pressure is high (more than 3/4 stripes are cached,
or there is empty inactive lists), flush all full stripe. If fewer
than R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2) full stripes
are flushed, flush some paritial stripes. When stripe cache pressure
is moderate (1/2 to 3/4 of stripes are cached), flush all full stripes.

For log space:

To avoid deadlock due to log space, we need to reserve enough space
to flush cached data. The size of required log space depends on total
number of cached stripes (stripe_in_journal_count). In current
implementation, the writing-out phase automatically include pending
data writes with parity writes (similar to write through case).
Therefore, we need up to (conf->raid_disks + 1) pages for each cached
stripe (1 page for meta data, raid_disks pages for all data and
parity). r5c_log_required_to_flush_cache() calculates log space
required to flush cache. In the following, we refer to the space
calculated by r5c_log_required_to_flush_cache() as
reclaim_required_space.

Two flags are added to r5conf->cache_state: R5C_LOG_TIGHT and
R5C_LOG_CRITICAL. R5C_LOG_TIGHT is set when free space on the log
device is less than 3x of reclaim_required_space. R5C_LOG_CRITICAL
is set when free space on the log device is less than 2x of
reclaim_required_space.

r5c_cache keeps all data in cache (not fully committed to RAID) in
a list (stripe_in_journal_list). These stripes are in the order of their
first appearance on the journal. So the log tail (last_checkpoint)
should point to the journal_start of the first item in the list.

When R5C_LOG_TIGHT is set, r5l_reclaim_thread starts flushing out
stripes at the head of stripe_in_journal. When R5C_LOG_CRITICAL is
set, the state machine only writes data that are already in the
log device (in stripe_in_journal_list).

This patch includes a fix to improve performance by
Shaohua Li <shli@fb.com>.

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f535ce2c267a..90638ba67812 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -228,6 +228,16 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 		for (i = sh->disks; i--; )
 			if (test_bit(R5_InJournal, &sh->dev[i].flags))
 				injournal++;
+	/*
+	 * When quiesce in r5c write back, set STRIPE_HANDLE for stripes with
+	 * data in journal, so they are not released to cached lists
+	 */
+	if (conf->quiesce && r5c_is_writeback(conf->log) &&
+	    !test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) {
+		if (test_bit(STRIPE_R5C_CACHING, &sh->state))
+			r5c_make_stripe_write_out(sh);
+		set_bit(STRIPE_HANDLE, &sh->state);
+	}
 
 	if (test_bit(STRIPE_HANDLE, &sh->state)) {
 		if (test_bit(STRIPE_DELAYED, &sh->state) &&
@@ -268,6 +278,7 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 					if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state))
 						atomic_dec(&conf->r5c_cached_partial_stripes);
 					list_add_tail(&sh->lru, &conf->r5c_full_stripe_list);
+					r5c_check_cached_full_stripe(conf);
 				} else {
 					/* partial stripe */
 					if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE,
@@ -639,9 +650,12 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
 			}
 			if (noblock && sh == NULL)
 				break;
+
+			r5c_check_stripe_cache_usage(conf);
 			if (!sh) {
 				set_bit(R5_INACTIVE_BLOCKED,
 					&conf->cache_state);
+				r5l_wake_reclaim(conf->log, 0);
 				wait_event_lock_irq(
 					conf->wait_for_stripe,
 					!list_empty(conf->inactive_list + hash) &&
@@ -1992,7 +2006,9 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
 		spin_lock_init(&sh->batch_lock);
 		INIT_LIST_HEAD(&sh->batch_list);
 		INIT_LIST_HEAD(&sh->lru);
+		INIT_LIST_HEAD(&sh->r5c);
 		atomic_set(&sh->count, 1);
+		sh->log_start = MaxSector;
 		for (i = 0; i < disks; i++) {
 			struct r5dev *dev = &sh->dev[i];
 
@@ -4759,6 +4775,10 @@ static int raid5_congested(struct mddev *mddev, int bits)
 
 	if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
 		return 1;
+
+	/* Also checks whether there is pressure on r5cache log space */
+	if (test_bit(R5C_LOG_TIGHT, &conf->cache_state))
+		return 1;
 	if (conf->quiesce)
 		return 1;
 	if (atomic_read(&conf->empty_inactive_list_nr))
@@ -7661,6 +7681,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
 		/* '2' tells resync/reshape to pause so that all
 		 * active stripes can drain
 		 */
+		r5c_flush_cache(conf, INT_MAX);
 		conf->quiesce = 2;
 		wait_event_cmd(conf->wait_for_quiescent,
 				    atomic_read(&conf->active_stripes) == 0 &&
-- 
cgit v1.2.3


From 2c7da14b90a01e48b17a028de6050a796cfd6d8d Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Thu, 17 Nov 2016 15:24:41 -0800
Subject: md/r5cache: sysfs entry journal_mode

With write cache, journal_mode is the knob to switch between
write-back and write-through.

Below is an example:

root@virt-test:~/# cat /sys/block/md0/md/journal_mode
[write-through] write-back
root@virt-test:~/# echo write-back > /sys/block/md0/md/journal_mode
root@virt-test:~/# cat /sys/block/md0/md/journal_mode
write-through [write-back]

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 90638ba67812..17cf98e93307 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6319,6 +6319,7 @@ static struct attribute *raid5_attrs[] =  {
 	&raid5_group_thread_cnt.attr,
 	&raid5_skip_copy.attr,
 	&raid5_rmw_level.attr,
+	&r5c_journal_mode.attr,
 	NULL,
 };
 static struct attribute_group raid5_attrs_group = {
-- 
cgit v1.2.3


From 5aabf7c49d9ebe54a318976276b187637177a03e Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Thu, 17 Nov 2016 15:24:44 -0800
Subject: md/r5cache: r5cache recovery: part 2

1. In previous patch, we:
      - add new data to r5l_recovery_ctx
      - add new functions to recovery write-back cache
   The new functions are not used in this patch, so this patch does not
   change the behavior of recovery.

2. In this patchpatch, we:
      - modify main recovery procedure r5l_recovery_log() to call new
        functions
      - remove old functions

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 17cf98e93307..aa4968c04055 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7100,7 +7100,8 @@ static int raid5_run(struct mddev *mddev)
 
 		pr_debug("md/raid:%s: using device %s as journal\n",
 			 mdname(mddev), bdevname(journal_dev->bdev, b));
-		r5l_init_log(conf, journal_dev);
+		if (r5l_init_log(conf, journal_dev))
+			goto abort;
 	}
 
 	return 0;
-- 
cgit v1.2.3


From 3bddb7f8f264ec58dc86e11ca97341c24f9d38f6 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Fri, 18 Nov 2016 16:46:50 -0800
Subject: md/r5cache: handle FLUSH and FUA

With raid5 cache, we committing data from journal device. When
there is flush request, we need to flush journal device's cache.
This was not needed in raid5 journal, because we will flush the
journal before committing data to raid disks.

This is similar to FUA, except that we also need flush journal for
FUA. Otherwise, corruptions in earlier meta data will stop recovery
from reaching FUA data.

slightly changed the code by Shaohua

Signed-off-by: Song Liu <songliubraving@fb.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index aa4968c04055..dbab8c7eccb0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5248,6 +5248,7 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 	int remaining;
 	DEFINE_WAIT(w);
 	bool do_prepare;
+	bool do_flush = false;
 
 	if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
 		int ret = r5l_handle_flush_request(conf->log, bi);
@@ -5259,6 +5260,11 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 			return;
 		}
 		/* ret == -EAGAIN, fallback */
+		/*
+		 * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
+		 * we need to flush journal device
+		 */
+		do_flush = bi->bi_opf & REQ_PREFLUSH;
 	}
 
 	md_write_start(mddev, bi);
@@ -5398,6 +5404,12 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi)
 				do_prepare = true;
 				goto retry;
 			}
+			if (do_flush) {
+				set_bit(STRIPE_R5C_PREFLUSH, &sh->state);
+				/* we only need flush for one stripe */
+				do_flush = false;
+			}
+
 			set_bit(STRIPE_HANDLE, &sh->state);
 			clear_bit(STRIPE_DELAYED, &sh->state);
 			if ((!sh->batch_head || sh == sh->batch_head) &&
-- 
cgit v1.2.3


From d7bd398e97f236a2353689eca5e8950f67cd34d5 Mon Sep 17 00:00:00 2001
From: Song Liu <songliubraving@fb.com>
Date: Wed, 23 Nov 2016 22:50:39 -0800
Subject: md/r5cache: handle alloc_page failure

RMW of r5c write back cache uses an extra page to store old data for
prexor. handle_stripe_dirtying() allocates this page by calling
alloc_page(). However, alloc_page() may fail.

To handle alloc_page() failures, this patch adds an extra page to
disk_info. When alloc_page fails, handle_stripe() trys to use these
pages. When these pages are used by other stripe (R5C_EXTRA_PAGE_IN_USE),
the stripe is added to delayed_list.

Signed-off-by: Song Liu <songliubraving@fb.com>
Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 78 +++++++++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 66 insertions(+), 12 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index dbab8c7eccb0..db909b9e37df 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -876,6 +876,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 
 	if (!test_bit(STRIPE_R5C_CACHING, &sh->state)) {
 		/* writing out phase */
+		if (s->waiting_extra_page)
+			return;
 		if (r5l_write_stripe(conf->log, sh) == 0)
 			return;
 	} else {  /* caching phase */
@@ -2007,6 +2009,7 @@ static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
 		INIT_LIST_HEAD(&sh->batch_list);
 		INIT_LIST_HEAD(&sh->lru);
 		INIT_LIST_HEAD(&sh->r5c);
+		INIT_LIST_HEAD(&sh->log_list);
 		atomic_set(&sh->count, 1);
 		sh->log_start = MaxSector;
 		for (i = 0; i < disks; i++) {
@@ -2253,10 +2256,24 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	 */
 	ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
 	if (ndisks) {
-		for (i=0; i<conf->raid_disks; i++)
+		for (i = 0; i < conf->pool_size; i++)
 			ndisks[i] = conf->disks[i];
-		kfree(conf->disks);
-		conf->disks = ndisks;
+
+		for (i = conf->pool_size; i < newsize; i++) {
+			ndisks[i].extra_page = alloc_page(GFP_NOIO);
+			if (!ndisks[i].extra_page)
+				err = -ENOMEM;
+		}
+
+		if (err) {
+			for (i = conf->pool_size; i < newsize; i++)
+				if (ndisks[i].extra_page)
+					put_page(ndisks[i].extra_page);
+			kfree(ndisks);
+		} else {
+			kfree(conf->disks);
+			conf->disks = ndisks;
+		}
 	} else
 		err = -ENOMEM;
 
@@ -3580,10 +3597,10 @@ unhash:
 		break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
 }
 
-static void handle_stripe_dirtying(struct r5conf *conf,
-				   struct stripe_head *sh,
-				   struct stripe_head_state *s,
-				   int disks)
+static int handle_stripe_dirtying(struct r5conf *conf,
+				  struct stripe_head *sh,
+				  struct stripe_head_state *s,
+				  int disks)
 {
 	int rmw = 0, rcw = 0, i;
 	sector_t recovery_cp = conf->mddev->recovery_cp;
@@ -3649,12 +3666,32 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 			    dev->page == dev->orig_page &&
 			    !test_bit(R5_LOCKED, &sh->dev[sh->pd_idx].flags)) {
 				/* alloc page for prexor */
-				dev->orig_page = alloc_page(GFP_NOIO);
+				struct page *p = alloc_page(GFP_NOIO);
+
+				if (p) {
+					dev->orig_page = p;
+					continue;
+				}
 
-				/* will handle failure in a later patch*/
-				BUG_ON(!dev->orig_page);
+				/*
+				 * alloc_page() failed, try use
+				 * disk_info->extra_page
+				 */
+				if (!test_and_set_bit(R5C_EXTRA_PAGE_IN_USE,
+						      &conf->cache_state)) {
+					r5c_use_extra_page(sh);
+					break;
+				}
+
+				/* extra_page in use, add to delayed_list */
+				set_bit(STRIPE_DELAYED, &sh->state);
+				s->waiting_extra_page = 1;
+				return -EAGAIN;
 			}
+		}
 
+		for (i = disks; i--; ) {
+			struct r5dev *dev = &sh->dev[i];
 			if ((dev->towrite ||
 			     i == sh->pd_idx || i == sh->qd_idx ||
 			     test_bit(R5_InJournal, &dev->flags)) &&
@@ -3730,6 +3767,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 	    (s->locked == 0 && (rcw == 0 || rmw == 0) &&
 	     !test_bit(STRIPE_BIT_DELAY, &sh->state)))
 		schedule_reconstruction(sh, s, rcw == 0, 0);
+	return 0;
 }
 
 static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
@@ -4545,8 +4583,12 @@ static void handle_stripe(struct stripe_head *sh)
 			if (ret == -EAGAIN ||
 			    /* stripe under reclaim: !caching && injournal */
 			    (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
-			     s.injournal > 0))
-				handle_stripe_dirtying(conf, sh, &s, disks);
+			     s.injournal > 0)) {
+				ret = handle_stripe_dirtying(conf, sh, &s,
+							     disks);
+				if (ret == -EAGAIN)
+					goto finish;
+			}
 		}
 	}
 
@@ -6458,6 +6500,8 @@ static void raid5_free_percpu(struct r5conf *conf)
 
 static void free_conf(struct r5conf *conf)
 {
+	int i;
+
 	if (conf->log)
 		r5l_exit_log(conf->log);
 	if (conf->shrinker.nr_deferred)
@@ -6466,6 +6510,9 @@ static void free_conf(struct r5conf *conf)
 	free_thread_groups(conf);
 	shrink_stripes(conf);
 	raid5_free_percpu(conf);
+	for (i = 0; i < conf->pool_size; i++)
+		if (conf->disks[i].extra_page)
+			put_page(conf->disks[i].extra_page);
 	kfree(conf->disks);
 	kfree(conf->stripe_hashtbl);
 	kfree(conf);
@@ -6612,9 +6659,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 
 	conf->disks = kzalloc(max_disks * sizeof(struct disk_info),
 			      GFP_KERNEL);
+
 	if (!conf->disks)
 		goto abort;
 
+	for (i = 0; i < max_disks; i++) {
+		conf->disks[i].extra_page = alloc_page(GFP_KERNEL);
+		if (!conf->disks[i].extra_page)
+			goto abort;
+	}
+
 	conf->mddev = mddev;
 
 	if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
-- 
cgit v1.2.3


From e8d7c33232e5fdfa761c3416539bc5b4acd12db5 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Date: Sun, 27 Nov 2016 19:32:32 +0300
Subject: md/raid5: limit request size according to implementation limits

Current implementation employ 16bit counter of active stripes in lower
bits of bio->bi_phys_segments. If request is big enough to overflow
this counter bio will be completed and freed too early.

Fortunately this not happens in default configuration because several
other limits prevent that: stripe_cache_size * nr_disks effectively
limits count of active stripes. And small max_sectors_kb at lower
disks prevent that during normal read/write operations.

Overflow easily happens in discard if it's enabled by module parameter
"devices_handle_discard_safely" and stripe_cache_size is set big enough.

This patch limits requests size with 256Mb - 8Kb to prevent overflows.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Cc: Shaohua Li <shli@kernel.org>
Cc: Neil Brown <neilb@suse.com>
Cc: stable@vger.kernel.org
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index db909b9e37df..6bf3c2604a2f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7108,6 +7108,15 @@ static int raid5_run(struct mddev *mddev)
 			stripe = (stripe | (stripe-1)) + 1;
 		mddev->queue->limits.discard_alignment = stripe;
 		mddev->queue->limits.discard_granularity = stripe;
+
+		/*
+		 * We use 16-bit counter of active stripes in bi_phys_segments
+		 * (minus one for over-loaded initialization)
+		 */
+		blk_queue_max_hw_sectors(mddev->queue, 0xfffe * STRIPE_SECTORS);
+		blk_queue_max_discard_sectors(mddev->queue,
+					      0xfffe * STRIPE_SECTORS);
+
 		/*
 		 * unaligned part of discard request will be ignored, so can't
 		 * guarantee discard_zeroes_data
-- 
cgit v1.2.3


From 6995f0b247e15e34fbcd10852c08b30bdc1a78da Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Thu, 8 Dec 2016 15:48:17 -0800
Subject: md: takeover should clear unrelated bits

When we change level from raid1 to raid5, the MD_FAILFAST_SUPPORTED bit
will be accidentally set, but raid5 doesn't support it. The same is true
for the MD_HAS_JOURNAL bit.

Fix: 46533ff (md: Use REQ_FAILFAST_* on metadata writes where appropriate)
Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6bf3c2604a2f..3e6a2a0d61e9 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7811,6 +7811,7 @@ static void *raid45_takeover_raid0(struct mddev *mddev, int level)
 static void *raid5_takeover_raid1(struct mddev *mddev)
 {
 	int chunksect;
+	void *ret;
 
 	if (mddev->raid_disks != 2 ||
 	    mddev->degraded > 1)
@@ -7832,7 +7833,10 @@ static void *raid5_takeover_raid1(struct mddev *mddev)
 	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
 	mddev->new_chunk_sectors = chunksect;
 
-	return setup_conf(mddev);
+	ret = setup_conf(mddev);
+	if (!IS_ERR_VALUE(ret))
+		clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
+	return ret;
 }
 
 static void *raid5_takeover_raid6(struct mddev *mddev)
-- 
cgit v1.2.3


From 2953079c692da067aeb6345659875b97378f9b0a Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Thu, 8 Dec 2016 15:48:19 -0800
Subject: md: separate flags for superblock changes

The mddev->flags are used for different purposes. There are a lot of
places we check/change the flags without masking unrelated flags, we
could check/change unrelated flags. These usage are most for superblock
write, so spearate superblock related flags. This should make the code
clearer and also fix real bugs.

Reviewed-by: NeilBrown <neilb@suse.com>
Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'drivers/md/raid5.c')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 3e6a2a0d61e9..d40e94d56410 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -961,7 +961,7 @@ again:
 			if (bad < 0) {
 				set_bit(BlockedBadBlocks, &rdev->flags);
 				if (!conf->mddev->external &&
-				    conf->mddev->flags) {
+				    conf->mddev->sb_flags) {
 					/* It is very unlikely, but we might
 					 * still need to write out the
 					 * bad block log - better give it
@@ -2547,8 +2547,8 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
 
 	set_bit(Blocked, &rdev->flags);
 	set_bit(Faulty, &rdev->flags);
-	set_mask_bits(&mddev->flags, 0,
-		      BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
+	set_mask_bits(&mddev->sb_flags, 0,
+		      BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
 	pr_crit("md/raid:%s: Disk failure on %s, disabling device.\n"
 		"md/raid:%s: Operation continuing on %d devices.\n",
 		mdname(mddev),
@@ -4761,7 +4761,7 @@ finish:
 	}
 
 	if (!bio_list_empty(&s.return_bi)) {
-		if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags)) {
+		if (test_bit(MD_SB_CHANGE_PENDING, &conf->mddev->sb_flags)) {
 			spin_lock_irq(&conf->device_lock);
 			bio_list_merge(&conf->return_bi, &s.return_bi);
 			spin_unlock_irq(&conf->device_lock);
@@ -5617,9 +5617,9 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 		mddev->reshape_position = conf->reshape_progress;
 		mddev->curr_resync_completed = sector_nr;
 		conf->reshape_checkpoint = jiffies;
-		set_bit(MD_CHANGE_DEVS, &mddev->flags);
+		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 		md_wakeup_thread(mddev->thread);
-		wait_event(mddev->sb_wait, mddev->flags == 0 ||
+		wait_event(mddev->sb_wait, mddev->sb_flags == 0 ||
 			   test_bit(MD_RECOVERY_INTR, &mddev->recovery));
 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
 			return 0;
@@ -5715,10 +5715,10 @@ finish:
 		mddev->reshape_position = conf->reshape_progress;
 		mddev->curr_resync_completed = sector_nr;
 		conf->reshape_checkpoint = jiffies;
-		set_bit(MD_CHANGE_DEVS, &mddev->flags);
+		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 		md_wakeup_thread(mddev->thread);
 		wait_event(mddev->sb_wait,
-			   !test_bit(MD_CHANGE_DEVS, &mddev->flags)
+			   !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)
 			   || test_bit(MD_RECOVERY_INTR, &mddev->recovery));
 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
 			goto ret;
@@ -5993,10 +5993,10 @@ static void raid5d(struct md_thread *thread)
 	md_check_recovery(mddev);
 
 	if (!bio_list_empty(&conf->return_bi) &&
-	    !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+	    !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
 		struct bio_list tmp = BIO_EMPTY_LIST;
 		spin_lock_irq(&conf->device_lock);
-		if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+		if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
 			bio_list_merge(&tmp, &conf->return_bi);
 			bio_list_init(&conf->return_bi);
 		}
@@ -6043,7 +6043,7 @@ static void raid5d(struct md_thread *thread)
 			break;
 		handled += batch_size;
 
-		if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) {
+		if (mddev->sb_flags & ~(1 << MD_SB_CHANGE_PENDING)) {
 			spin_unlock_irq(&conf->device_lock);
 			md_check_recovery(mddev);
 			spin_lock_irq(&conf->device_lock);
@@ -7640,7 +7640,7 @@ static int raid5_start_reshape(struct mddev *mddev)
 	}
 	mddev->raid_disks = conf->raid_disks;
 	mddev->reshape_position = conf->reshape_progress;
-	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 
 	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
@@ -7906,7 +7906,7 @@ static int raid5_check_reshape(struct mddev *mddev)
 			conf->chunk_sectors = new_chunk ;
 			mddev->chunk_sectors = new_chunk;
 		}
-		set_bit(MD_CHANGE_DEVS, &mddev->flags);
+		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 		md_wakeup_thread(mddev->thread);
 	}
 	return check_reshape(mddev);
-- 
cgit v1.2.3