diff options
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r-- | drivers/md/raid5.c | 819 |
1 files changed, 307 insertions, 512 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 087eee0cb809..55e7c56045a0 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -94,6 +94,8 @@ #define __inline__ #endif +#define printk_rl(args...) ((void) (printk_ratelimit() && printk(args))) + #if !RAID6_USE_EMPTY_ZERO_PAGE /* In .bss so it's zeroed */ const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256))); @@ -113,15 +115,20 @@ static void return_io(struct bio *return_bi) return_bi = bi->bi_next; bi->bi_next = NULL; bi->bi_size = 0; - bi->bi_end_io(bi, - test_bit(BIO_UPTODATE, &bi->bi_flags) - ? 0 : -EIO); + bio_endio(bi, 0); bi = return_bi; } } static void print_raid5_conf (raid5_conf_t *conf); +static int stripe_operations_active(struct stripe_head *sh) +{ + return sh->check_state || sh->reconstruct_state || + test_bit(STRIPE_BIOFILL_RUN, &sh->state) || + test_bit(STRIPE_COMPUTE_RUN, &sh->state); +} + static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) { if (atomic_dec_and_test(&sh->count)) { @@ -141,7 +148,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) } md_wakeup_thread(conf->mddev->thread); } else { - BUG_ON(sh->ops.pending); + BUG_ON(stripe_operations_active(sh)); if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { atomic_dec(&conf->preread_active_stripes); if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) @@ -243,7 +250,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int BUG_ON(atomic_read(&sh->count) != 0); BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); - BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete); + BUG_ON(stripe_operations_active(sh)); CHECK_DEVLOCK(); pr_debug("init_stripe called, stripe %llu\n", @@ -344,62 +351,18 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector return sh; } -/* test_and_ack_op() ensures that we only dequeue an operation once */ -#define test_and_ack_op(op, pend) \ -do { \ - if (test_bit(op, &sh->ops.pending) && \ - !test_bit(op, &sh->ops.complete)) { \ - if (test_and_set_bit(op, &sh->ops.ack)) \ - clear_bit(op, &pend); \ - else \ - ack++; \ - } else \ - clear_bit(op, &pend); \ -} while (0) - -/* find new work to run, do not resubmit work that is already - * in flight - */ -static unsigned long get_stripe_work(struct stripe_head *sh) -{ - unsigned long pending; - int ack = 0; - - pending = sh->ops.pending; - - test_and_ack_op(STRIPE_OP_BIOFILL, pending); - test_and_ack_op(STRIPE_OP_COMPUTE_BLK, pending); - test_and_ack_op(STRIPE_OP_PREXOR, pending); - test_and_ack_op(STRIPE_OP_BIODRAIN, pending); - test_and_ack_op(STRIPE_OP_POSTXOR, pending); - test_and_ack_op(STRIPE_OP_CHECK, pending); - if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending)) - ack++; - - sh->ops.count -= ack; - if (unlikely(sh->ops.count < 0)) { - printk(KERN_ERR "pending: %#lx ops.pending: %#lx ops.ack: %#lx " - "ops.complete: %#lx\n", pending, sh->ops.pending, - sh->ops.ack, sh->ops.complete); - BUG(); - } - - return pending; -} - static void raid5_end_read_request(struct bio *bi, int error); static void raid5_end_write_request(struct bio *bi, int error); -static void ops_run_io(struct stripe_head *sh) +static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) { raid5_conf_t *conf = sh->raid_conf; int i, disks = sh->disks; might_sleep(); - set_bit(STRIPE_IO_STARTED, &sh->state); for (i = disks; i--; ) { int rw; struct bio *bi; @@ -428,11 +391,11 @@ static void ops_run_io(struct stripe_head *sh) rcu_read_unlock(); if (rdev) { - if (test_bit(STRIPE_SYNCING, &sh->state) || - test_bit(STRIPE_EXPAND_SOURCE, &sh->state) || - test_bit(STRIPE_EXPAND_READY, &sh->state)) + if (s->syncing || s->expanding || s->expanded) md_sync_acct(rdev->bdev, STRIPE_SECTORS); + set_bit(STRIPE_IO_STARTED, &sh->state); + bi->bi_bdev = rdev->bdev; pr_debug("%s: for %llu schedule op %ld on disc %d\n", __func__, (unsigned long long)sh->sector, @@ -526,38 +489,34 @@ static void ops_complete_biofill(void *stripe_head_ref) (unsigned long long)sh->sector); /* clear completed biofills */ + spin_lock_irq(&conf->device_lock); for (i = sh->disks; i--; ) { struct r5dev *dev = &sh->dev[i]; /* acknowledge completion of a biofill operation */ /* and check if we need to reply to a read request, * new R5_Wantfill requests are held off until - * !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending) + * !STRIPE_BIOFILL_RUN */ if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { struct bio *rbi, *rbi2; - /* The access to dev->read is outside of the - * spin_lock_irq(&conf->device_lock), but is protected - * by the STRIPE_OP_BIOFILL pending bit - */ BUG_ON(!dev->read); rbi = dev->read; dev->read = NULL; while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { rbi2 = r5_next_bio(rbi, dev->sector); - spin_lock_irq(&conf->device_lock); if (--rbi->bi_phys_segments == 0) { rbi->bi_next = return_bi; return_bi = rbi; } - spin_unlock_irq(&conf->device_lock); rbi = rbi2; } } } - set_bit(STRIPE_OP_BIOFILL, &sh->ops.complete); + spin_unlock_irq(&conf->device_lock); + clear_bit(STRIPE_BIOFILL_RUN, &sh->state); return_io(return_bi); @@ -608,13 +567,14 @@ static void ops_complete_compute5(void *stripe_head_ref) set_bit(R5_UPTODATE, &tgt->flags); BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); clear_bit(R5_Wantcompute, &tgt->flags); - set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); + clear_bit(STRIPE_COMPUTE_RUN, &sh->state); + if (sh->check_state == check_state_compute_run) + sh->check_state = check_state_compute_result; set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } -static struct dma_async_tx_descriptor * -ops_run_compute5(struct stripe_head *sh, unsigned long pending) +static struct dma_async_tx_descriptor *ops_run_compute5(struct stripe_head *sh) { /* kernel stack size limits the total number of disks */ int disks = sh->disks; @@ -644,10 +604,6 @@ ops_run_compute5(struct stripe_head *sh, unsigned long pending) ASYNC_TX_XOR_ZERO_DST, NULL, ops_complete_compute5, sh); - /* ack now if postxor is not set to be run */ - if (tx && !test_bit(STRIPE_OP_POSTXOR, &pending)) - async_tx_ack(tx); - return tx; } @@ -657,8 +613,6 @@ static void ops_complete_prexor(void *stripe_head_ref) pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); - - set_bit(STRIPE_OP_PREXOR, &sh->ops.complete); } static struct dma_async_tx_descriptor * @@ -678,7 +632,7 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; /* Only process blocks that are known to be uptodate */ - if (dev->towrite && test_bit(R5_Wantprexor, &dev->flags)) + if (test_bit(R5_Wantdrain, &dev->flags)) xor_srcs[count++] = dev->page; } @@ -690,16 +644,10 @@ ops_run_prexor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) } static struct dma_async_tx_descriptor * -ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, - unsigned long pending) +ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) { int disks = sh->disks; - int pd_idx = sh->pd_idx, i; - - /* check if prexor is active which means only process blocks - * that are part of a read-modify-write (Wantprexor) - */ - int prexor = test_bit(STRIPE_OP_PREXOR, &pending); + int i; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -707,20 +655,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; struct bio *chosen; - int towrite; - towrite = 0; - if (prexor) { /* rmw */ - if (dev->towrite && - test_bit(R5_Wantprexor, &dev->flags)) - towrite = 1; - } else { /* rcw */ - if (i != pd_idx && dev->towrite && - test_bit(R5_LOCKED, &dev->flags)) - towrite = 1; - } - - if (towrite) { + if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { struct bio *wbi; spin_lock(&sh->lock); @@ -745,18 +681,6 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, static void ops_complete_postxor(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - - pr_debug("%s: stripe %llu\n", __func__, - (unsigned long long)sh->sector); - - set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); - set_bit(STRIPE_HANDLE, &sh->state); - release_stripe(sh); -} - -static void ops_complete_write(void *stripe_head_ref) -{ - struct stripe_head *sh = stripe_head_ref; int disks = sh->disks, i, pd_idx = sh->pd_idx; pr_debug("%s: stripe %llu\n", __func__, @@ -768,16 +692,21 @@ static void ops_complete_write(void *stripe_head_ref) set_bit(R5_UPTODATE, &dev->flags); } - set_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete); - set_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); + if (sh->reconstruct_state == reconstruct_state_drain_run) + sh->reconstruct_state = reconstruct_state_drain_result; + else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) + sh->reconstruct_state = reconstruct_state_prexor_drain_result; + else { + BUG_ON(sh->reconstruct_state != reconstruct_state_run); + sh->reconstruct_state = reconstruct_state_result; + } set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } static void -ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, - unsigned long pending) +ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) { /* kernel stack size limits the total number of disks */ int disks = sh->disks; @@ -785,9 +714,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, int count = 0, pd_idx = sh->pd_idx, i; struct page *xor_dest; - int prexor = test_bit(STRIPE_OP_PREXOR, &pending); + int prexor = 0; unsigned long flags; - dma_async_tx_callback callback; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -795,7 +723,8 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, /* check if prexor is active which means only process blocks * that are part of a read-modify-write (written) */ - if (prexor) { + if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { + prexor = 1; xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -811,10 +740,6 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, } } - /* check whether this postxor is part of a write */ - callback = test_bit(STRIPE_OP_BIODRAIN, &pending) ? - ops_complete_write : ops_complete_postxor; - /* 1/ if we prexor'd then the dest is reused as a source * 2/ if we did not prexor then we are redoing the parity * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST @@ -828,25 +753,20 @@ ops_run_postxor(struct stripe_head *sh, struct dma_async_tx_descriptor *tx, if (unlikely(count == 1)) { flags &= ~(ASYNC_TX_XOR_DROP_DST | ASYNC_TX_XOR_ZERO_DST); tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, - flags, tx, callback, sh); + flags, tx, ops_complete_postxor, sh); } else tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, - flags, tx, callback, sh); + flags, tx, ops_complete_postxor, sh); } static void ops_complete_check(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; - int pd_idx = sh->pd_idx; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); - if (test_and_clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending) && - sh->ops.zero_sum_result == 0) - set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); - - set_bit(STRIPE_OP_CHECK, &sh->ops.complete); + sh->check_state = check_state_check_result; set_bit(STRIPE_HANDLE, &sh->state); release_stripe(sh); } @@ -873,46 +793,42 @@ static void ops_run_check(struct stripe_head *sh) tx = async_xor_zero_sum(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &sh->ops.zero_sum_result, 0, NULL, NULL, NULL); - if (tx) - set_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending); - else - clear_bit(STRIPE_OP_MOD_DMA_CHECK, &sh->ops.pending); - atomic_inc(&sh->count); tx = async_trigger_callback(ASYNC_TX_DEP_ACK | ASYNC_TX_ACK, tx, ops_complete_check, sh); } -static void raid5_run_ops(struct stripe_head *sh, unsigned long pending) +static void raid5_run_ops(struct stripe_head *sh, unsigned long ops_request) { int overlap_clear = 0, i, disks = sh->disks; struct dma_async_tx_descriptor *tx = NULL; - if (test_bit(STRIPE_OP_BIOFILL, &pending)) { + if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { ops_run_biofill(sh); overlap_clear++; } - if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending)) - tx = ops_run_compute5(sh, pending); + if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { + tx = ops_run_compute5(sh); + /* terminate the chain if postxor is not set to be run */ + if (tx && !test_bit(STRIPE_OP_POSTXOR, &ops_request)) + async_tx_ack(tx); + } - if (test_bit(STRIPE_OP_PREXOR, &pending)) + if (test_bit(STRIPE_OP_PREXOR, &ops_request)) tx = ops_run_prexor(sh, tx); - if (test_bit(STRIPE_OP_BIODRAIN, &pending)) { - tx = ops_run_biodrain(sh, tx, pending); + if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { + tx = ops_run_biodrain(sh, tx); overlap_clear++; } - if (test_bit(STRIPE_OP_POSTXOR, &pending)) - ops_run_postxor(sh, tx, pending); + if (test_bit(STRIPE_OP_POSTXOR, &ops_request)) + ops_run_postxor(sh, tx); - if (test_bit(STRIPE_OP_CHECK, &pending)) + if (test_bit(STRIPE_OP_CHECK, &ops_request)) ops_run_check(sh); - if (test_bit(STRIPE_OP_IO, &pending)) - ops_run_io(sh); - if (overlap_clear) for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; @@ -995,14 +911,16 @@ static int resize_stripes(raid5_conf_t *conf, int newsize) struct stripe_head *osh, *nsh; LIST_HEAD(newstripes); struct disk_info *ndisks; - int err = 0; + int err; struct kmem_cache *sc; int i; if (newsize <= conf->pool_size) return 0; /* never bother to shrink */ - md_allow_write(conf->mddev); + err = md_allow_write(conf->mddev); + if (err) + return err; /* Step 1 */ sc = kmem_cache_create(conf->cache_name[1-conf->active_name], @@ -1143,10 +1061,12 @@ static void raid5_end_read_request(struct bio * bi, int error) set_bit(R5_UPTODATE, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) { rdev = conf->disks[i].rdev; - printk(KERN_INFO "raid5:%s: read error corrected (%lu sectors at %llu on %s)\n", - mdname(conf->mddev), STRIPE_SECTORS, - (unsigned long long)(sh->sector + rdev->data_offset), - bdevname(rdev->bdev, b)); + printk_rl(KERN_INFO "raid5:%s: read error corrected" + " (%lu sectors at %llu on %s)\n", + mdname(conf->mddev), STRIPE_SECTORS, + (unsigned long long)(sh->sector + + rdev->data_offset), + bdevname(rdev->bdev, b)); clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); } @@ -1160,16 +1080,22 @@ static void raid5_end_read_request(struct bio * bi, int error) clear_bit(R5_UPTODATE, &sh->dev[i].flags); atomic_inc(&rdev->read_errors); if (conf->mddev->degraded) - printk(KERN_WARNING "raid5:%s: read error not correctable (sector %llu on %s).\n", - mdname(conf->mddev), - (unsigned long long)(sh->sector + rdev->data_offset), - bdn); + printk_rl(KERN_WARNING + "raid5:%s: read error not correctable " + "(sector %llu on %s).\n", + mdname(conf->mddev), + (unsigned long long)(sh->sector + + rdev->data_offset), + bdn); else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) /* Oh, no!!! */ - printk(KERN_WARNING "raid5:%s: read error NOT corrected!! (sector %llu on %s).\n", - mdname(conf->mddev), - (unsigned long long)(sh->sector + rdev->data_offset), - bdn); + printk_rl(KERN_WARNING + "raid5:%s: read error NOT corrected!! " + "(sector %llu on %s).\n", + mdname(conf->mddev), + (unsigned long long)(sh->sector + + rdev->data_offset), + bdn); else if (atomic_read(&rdev->read_errors) > conf->max_nr_stripes) printk(KERN_WARNING @@ -1258,7 +1184,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) /* * if recovery was running, make sure it aborts. */ - set_bit(MD_RECOVERY_ERR, &mddev->recovery); + set_bit(MD_RECOVERY_INTR, &mddev->recovery); } set_bit(Faulty, &rdev->flags); printk (KERN_ALERT @@ -1693,11 +1619,11 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) } } -static int -handle_write_operations5(struct stripe_head *sh, int rcw, int expand) +static void +schedule_reconstruction5(struct stripe_head *sh, struct stripe_head_state *s, + int rcw, int expand) { int i, pd_idx = sh->pd_idx, disks = sh->disks; - int locked = 0; if (rcw) { /* if we are not expanding this is a proper write request, and @@ -1705,53 +1631,48 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand) * stripe cache */ if (!expand) { - set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); - sh->ops.count++; - } + sh->reconstruct_state = reconstruct_state_drain_run; + set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); + } else + sh->reconstruct_state = reconstruct_state_run; - set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); - sh->ops.count++; + set_bit(STRIPE_OP_POSTXOR, &s->ops_request); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (dev->towrite) { set_bit(R5_LOCKED, &dev->flags); + set_bit(R5_Wantdrain, &dev->flags); if (!expand) clear_bit(R5_UPTODATE, &dev->flags); - locked++; + s->locked++; } } - if (locked + 1 == disks) + if (s->locked + 1 == disks) if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) atomic_inc(&sh->raid_conf->pending_full_writes); } else { BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); - set_bit(STRIPE_OP_PREXOR, &sh->ops.pending); - set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); - set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); - - sh->ops.count += 3; + sh->reconstruct_state = reconstruct_state_prexor_drain_run; + set_bit(STRIPE_OP_PREXOR, &s->ops_request); + set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); + set_bit(STRIPE_OP_POSTXOR, &s->ops_request); for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (i == pd_idx) continue; - /* For a read-modify write there may be blocks that are - * locked for reading while others are ready to be - * written so we distinguish these blocks by the - * R5_Wantprexor bit - */ if (dev->towrite && (test_bit(R5_UPTODATE, &dev->flags) || - test_bit(R5_Wantcompute, &dev->flags))) { - set_bit(R5_Wantprexor, &dev->flags); + test_bit(R5_Wantcompute, &dev->flags))) { + set_bit(R5_Wantdrain, &dev->flags); set_bit(R5_LOCKED, &dev->flags); clear_bit(R5_UPTODATE, &dev->flags); - locked++; + s->locked++; } } } @@ -1761,13 +1682,11 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand) */ set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); - locked++; + s->locked++; - pr_debug("%s: stripe %llu locked: %d pending: %lx\n", + pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", __func__, (unsigned long long)sh->sector, - locked, sh->ops.pending); - - return locked; + s->locked, s->ops_request); } /* @@ -1866,7 +1785,7 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) } static void -handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, +handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks, struct bio **return_bi) { @@ -1957,47 +1876,38 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh, md_wakeup_thread(conf->mddev->thread); } -/* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks - * to process +/* fetch_block5 - checks the given member device to see if its data needs + * to be read or computed to satisfy a request. + * + * Returns 1 when no more member devices need to be checked, otherwise returns + * 0 to tell the loop in handle_stripe_fill5 to continue */ -static int __handle_issuing_new_read_requests5(struct stripe_head *sh, - struct stripe_head_state *s, int disk_idx, int disks) +static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, + int disk_idx, int disks) { struct r5dev *dev = &sh->dev[disk_idx]; struct r5dev *failed_dev = &sh->dev[s->failed_num]; - /* don't schedule compute operations or reads on the parity block while - * a check is in flight - */ - if ((disk_idx == sh->pd_idx) && - test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) - return ~0; - /* is the data in this block needed, and can we get it? */ if (!test_bit(R5_LOCKED, &dev->flags) && - !test_bit(R5_UPTODATE, &dev->flags) && (dev->toread || - (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || - s->syncing || s->expanding || (s->failed && - (failed_dev->toread || (failed_dev->towrite && - !test_bit(R5_OVERWRITE, &failed_dev->flags) - ))))) { - /* 1/ We would like to get this block, possibly by computing it, - * but we might not be able to. - * - * 2/ Since parity check operations potentially make the parity - * block !uptodate it will need to be refreshed before any - * compute operations on data disks are scheduled. - * - * 3/ We hold off parity block re-reads until check operations - * have quiesced. + !test_bit(R5_UPTODATE, &dev->flags) && + (dev->toread || + (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || + s->syncing || s->expanding || + (s->failed && + (failed_dev->toread || + (failed_dev->towrite && + !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) { + /* We would like to get this block, possibly by computing it, + * otherwise read it if the backing disk is insync */ if ((s->uptodate == disks - 1) && - !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { - set_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); + (s->failed && disk_idx == s->failed_num)) { + set_bit(STRIPE_COMPUTE_RUN, &sh->state); + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); set_bit(R5_Wantcompute, &dev->flags); sh->ops.target = disk_idx; s->req_compute = 1; - sh->ops.count++; /* Careful: from this point on 'uptodate' is in the eye * of raid5_run_ops which services 'compute' operations * before writes. R5_Wantcompute flags a block that will @@ -2005,58 +1915,40 @@ static int __handle_issuing_new_read_requests5(struct stripe_head *sh, * subsequent operation. */ s->uptodate++; - return 0; /* uptodate + compute == disks */ - } else if ((s->uptodate < disks - 1) && - test_bit(R5_Insync, &dev->flags)) { - /* Note: we hold off compute operations while checks are - * in flight, but we still prefer 'compute' over 'read' - * hence we only read if (uptodate < * disks-1) - */ + return 1; /* uptodate + compute == disks */ + } else if (test_bit(R5_Insync, &dev->flags)) { set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; s->locked++; pr_debug("Reading block %d (sync=%d)\n", disk_idx, s->syncing); } } - return ~0; + return 0; } -static void handle_issuing_new_read_requests5(struct stripe_head *sh, +/** + * handle_stripe_fill5 - read or compute data to satisfy pending requests. + */ +static void handle_stripe_fill5(struct stripe_head *sh, struct stripe_head_state *s, int disks) { int i; - /* Clear completed compute operations. Parity recovery - * (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled - * later on in this routine - */ - if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && - !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); - } - /* look for blocks to read/compute, skip this if a compute * is already in flight, or if the stripe contents are in the * midst of changing due to a write */ - if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && - !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) && - !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { + if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && + !sh->reconstruct_state) for (i = disks; i--; ) - if (__handle_issuing_new_read_requests5( - sh, s, i, disks) == 0) + if (fetch_block5(sh, s, i, disks)) break; - } set_bit(STRIPE_HANDLE, &sh->state); } -static void handle_issuing_new_read_requests6(struct stripe_head *sh, +static void handle_stripe_fill6(struct stripe_head *sh, struct stripe_head_state *s, struct r6_state *r6s, int disks) { @@ -2077,7 +1969,9 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh, /* we would like to get this block, possibly * by computing it, but we might not be able to */ - if (s->uptodate == disks-1) { + if ((s->uptodate == disks - 1) && + (s->failed && (i == r6s->failed_num[0] || + i == r6s->failed_num[1]))) { pr_debug("Computing stripe %llu block %d\n", (unsigned long long)sh->sector, i); compute_block_1(sh, i, 0); @@ -2113,12 +2007,12 @@ static void handle_issuing_new_read_requests6(struct stripe_head *sh, } -/* handle_completed_write_requests +/* handle_stripe_clean_event * any written block on an uptodate or failed drive can be returned. * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but * never LOCKED, so we don't need to test 'failed' directly. */ -static void handle_completed_write_requests(raid5_conf_t *conf, +static void handle_stripe_clean_event(raid5_conf_t *conf, struct stripe_head *sh, int disks, struct bio **return_bi) { int i; @@ -2163,7 +2057,7 @@ static void handle_completed_write_requests(raid5_conf_t *conf, md_wakeup_thread(conf->mddev->thread); } -static void handle_issuing_new_write_requests5(raid5_conf_t *conf, +static void handle_stripe_dirtying5(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks) { int rmw = 0, rcw = 0, i; @@ -2207,9 +2101,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, "%d for r-m-w\n", i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit( - STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; s->locked++; } else { set_bit(STRIPE_DELAYED, &sh->state); @@ -2233,9 +2124,6 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, "%d for Reconstruct\n", i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit( - STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; s->locked++; } else { set_bit(STRIPE_DELAYED, &sh->state); @@ -2253,14 +2141,13 @@ static void handle_issuing_new_write_requests5(raid5_conf_t *conf, * simultaneously. If this is not the case then new writes need to be * held off until the compute completes. */ - if ((s->req_compute || - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) && - (s->locked == 0 && (rcw == 0 || rmw == 0) && - !test_bit(STRIPE_BIT_DELAY, &sh->state))) - s->locked += handle_write_operations5(sh, rcw == 0, 0); + if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && + (s->locked == 0 && (rcw == 0 || rmw == 0) && + !test_bit(STRIPE_BIT_DELAY, &sh->state))) + schedule_reconstruction5(sh, s, rcw == 0, 0); } -static void handle_issuing_new_write_requests6(raid5_conf_t *conf, +static void handle_stripe_dirtying6(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, struct r6_state *r6s, int disks) { @@ -2363,91 +2250,86 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf, static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, struct stripe_head_state *s, int disks) { - int canceled_check = 0; + struct r5dev *dev = NULL; set_bit(STRIPE_HANDLE, &sh->state); - /* complete a check operation */ - if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) { - clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); - clear_bit(STRIPE_OP_CHECK, &sh->ops.pending); + switch (sh->check_state) { + case check_state_idle: + /* start a new check operation if there are no failures */ if (s->failed == 0) { - if (sh->ops.zero_sum_result == 0) - /* parity is correct (on disc, - * not in buffer any more) - */ - set_bit(STRIPE_INSYNC, &sh->state); - else { - conf->mddev->resync_mismatches += - STRIPE_SECTORS; - if (test_bit( - MD_RECOVERY_CHECK, &conf->mddev->recovery)) - /* don't try to repair!! */ - set_bit(STRIPE_INSYNC, &sh->state); - else { - set_bit(STRIPE_OP_COMPUTE_BLK, - &sh->ops.pending); - set_bit(STRIPE_OP_MOD_REPAIR_PD, - &sh->ops.pending); - set_bit(R5_Wantcompute, - &sh->dev[sh->pd_idx].flags); - sh->ops.target = sh->pd_idx; - sh->ops.count++; - s->uptodate++; - } - } - } else - canceled_check = 1; /* STRIPE_INSYNC is not set */ - } - - /* check if we can clear a parity disk reconstruct */ - if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && - test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { - - clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); - } - - /* start a new check operation if there are no failures, the stripe is - * not insync, and a repair is not in flight - */ - if (s->failed == 0 && - !test_bit(STRIPE_INSYNC, &sh->state) && - !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { - if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { BUG_ON(s->uptodate != disks); + sh->check_state = check_state_run; + set_bit(STRIPE_OP_CHECK, &s->ops_request); clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); - sh->ops.count++; s->uptodate--; + break; } - } + dev = &sh->dev[s->failed_num]; + /* fall through */ + case check_state_compute_result: + sh->check_state = check_state_idle; + if (!dev) + dev = &sh->dev[sh->pd_idx]; + + /* check that a write has not made the stripe insync */ + if (test_bit(STRIPE_INSYNC, &sh->state)) + break; - /* Wait for check parity and compute block operations to complete - * before write-back. If a failure occurred while the check operation - * was in flight we need to cycle this stripe through handle_stripe - * since the parity block may not be uptodate - */ - if (!canceled_check && !test_bit(STRIPE_INSYNC, &sh->state) && - !test_bit(STRIPE_OP_CHECK, &sh->ops.pending) && - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) { - struct r5dev *dev; /* either failed parity check, or recovery is happening */ - if (s->failed == 0) - s->failed_num = sh->pd_idx; - dev = &sh->dev[s->failed_num]; BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); BUG_ON(s->uptodate != disks); set_bit(R5_LOCKED, &dev->flags); + s->locked++; set_bit(R5_Wantwrite, &dev->flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; clear_bit(STRIPE_DEGRADED, &sh->state); - s->locked++; set_bit(STRIPE_INSYNC, &sh->state); + break; + case check_state_run: + break; /* we will be called again upon completion */ + case check_state_check_result: + sh->check_state = check_state_idle; + + /* if a failure occurred during the check operation, leave + * STRIPE_INSYNC not set and let the stripe be handled again + */ + if (s->failed) + break; + + /* handle a successful check operation, if parity is correct + * we are done. Otherwise update the mismatch count and repair + * parity if !MD_RECOVERY_CHECK + */ + if (sh->ops.zero_sum_result == 0) + /* parity is correct (on disc, + * not in buffer any more) + */ + set_bit(STRIPE_INSYNC, &sh->state); + else { + conf->mddev->resync_mismatches += STRIPE_SECTORS; + if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) + /* don't try to repair!! */ + set_bit(STRIPE_INSYNC, &sh->state); + else { + sh->check_state = check_state_compute_run; + set_bit(STRIPE_COMPUTE_RUN, &sh->state); + set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); + set_bit(R5_Wantcompute, + &sh->dev[sh->pd_idx].flags); + sh->ops.target = sh->pd_idx; + s->uptodate++; + } + } + break; + case check_state_compute_run: + break; + default: + printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", + __func__, sh->check_state, + (unsigned long long) sh->sector); + BUG(); } } @@ -2632,14 +2514,14 @@ static void handle_stripe5(struct stripe_head *sh) struct bio *return_bi = NULL; struct stripe_head_state s; struct r5dev *dev; - unsigned long pending = 0; mdk_rdev_t *blocked_rdev = NULL; + int prexor; memset(&s, 0, sizeof(s)); - pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d " - "ops=%lx:%lx:%lx\n", (unsigned long long)sh->sector, sh->state, - atomic_read(&sh->count), sh->pd_idx, - sh->ops.pending, sh->ops.ack, sh->ops.complete); + pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " + "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state, + atomic_read(&sh->count), sh->pd_idx, sh->check_state, + sh->reconstruct_state); spin_lock(&sh->lock); clear_bit(STRIPE_HANDLE, &sh->state); @@ -2648,15 +2530,8 @@ static void handle_stripe5(struct stripe_head *sh) s.syncing = test_bit(STRIPE_SYNCING, &sh->state); s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); - /* Now to look around and see what can be done */ - - /* clean-up completed biofill operations */ - if (test_bit(STRIPE_OP_BIOFILL, &sh->ops.complete)) { - clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending); - clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack); - clear_bit(STRIPE_OP_BIOFILL, &sh->ops.complete); - } + /* Now to look around and see what can be done */ rcu_read_lock(); for (i=disks; i--; ) { mdk_rdev_t *rdev; @@ -2670,10 +2545,10 @@ static void handle_stripe5(struct stripe_head *sh) /* maybe we can request a biofill operation * * new wantfill requests are only permitted while - * STRIPE_OP_BIOFILL is clear + * ops_complete_biofill is guaranteed to be inactive */ if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && - !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) + !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) set_bit(R5_Wantfill, &dev->flags); /* now count some things */ @@ -2717,8 +2592,10 @@ static void handle_stripe5(struct stripe_head *sh) goto unlock; } - if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) - sh->ops.count++; + if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { + set_bit(STRIPE_OP_BIOFILL, &s.ops_request); + set_bit(STRIPE_BIOFILL_RUN, &sh->state); + } pr_debug("locked=%d uptodate=%d to_read=%d" " to_write=%d failed=%d failed_num=%d\n", @@ -2728,8 +2605,7 @@ static void handle_stripe5(struct stripe_head *sh) * need to be failed */ if (s.failed > 1 && s.to_read+s.to_write+s.written) - handle_requests_to_failed_array(conf, sh, &s, disks, - &return_bi); + handle_failed_stripe(conf, sh, &s, disks, &return_bi); if (s.failed > 1 && s.syncing) { md_done_sync(conf->mddev, STRIPE_SECTORS,0); clear_bit(STRIPE_SYNCING, &sh->state); @@ -2745,46 +2621,25 @@ static void handle_stripe5(struct stripe_head *sh) !test_bit(R5_LOCKED, &dev->flags) && test_bit(R5_UPTODATE, &dev->flags)) || (s.failed == 1 && s.failed_num == sh->pd_idx))) - handle_completed_write_requests(conf, sh, disks, &return_bi); + handle_stripe_clean_event(conf, sh, disks, &return_bi); /* Now we might consider reading some blocks, either to check/generate * parity, or to satisfy requests * or to load a block that is being partially written. */ if (s.to_read || s.non_overwrite || - (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding || - test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) - handle_issuing_new_read_requests5(sh, &s, disks); + (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) + handle_stripe_fill5(sh, &s, disks); /* Now we check to see if any write operations have recently * completed */ - - /* leave prexor set until postxor is done, allows us to distinguish - * a rmw from a rcw during biodrain - */ - if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) && - test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { - - clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete); - clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack); - clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending); - - for (i = disks; i--; ) - clear_bit(R5_Wantprexor, &sh->dev[i].flags); - } - - /* if only POSTXOR is set then this is an 'expand' postxor */ - if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) && - test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { - - clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete); - clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack); - clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending); - - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack); - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); + prexor = 0; + if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) + prexor = 1; + if (sh->reconstruct_state == reconstruct_state_drain_result || + sh->reconstruct_state == reconstruct_state_prexor_drain_result) { + sh->reconstruct_state = reconstruct_state_idle; /* All the 'written' buffers and the parity block are ready to * be written back to disk @@ -2796,9 +2651,8 @@ static void handle_stripe5(struct stripe_head *sh) (i == sh->pd_idx || dev->written)) { pr_debug("Writing block %d\n", i); set_bit(R5_Wantwrite, &dev->flags); - if (!test_and_set_bit( - STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; + if (prexor) + continue; if (!test_bit(R5_Insync, &dev->flags) || (i == sh->pd_idx && s.failed == 0)) set_bit(STRIPE_INSYNC, &sh->state); @@ -2818,20 +2672,18 @@ static void handle_stripe5(struct stripe_head *sh) * 2/ A 'check' operation is in flight, as it may clobber the parity * block. */ - if (s.to_write && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending) && - !test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) - handle_issuing_new_write_requests5(conf, sh, &s, disks); + if (s.to_write && !sh->reconstruct_state && !sh->check_state) + handle_stripe_dirtying5(conf, sh, &s, disks); /* maybe we need to check and possibly fix the parity for this stripe * Any reads will already have been scheduled, so we just see if enough * data is available. The parity check is held off while parity * dependent operations are in flight. */ - if ((s.syncing && s.locked == 0 && - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && - !test_bit(STRIPE_INSYNC, &sh->state)) || - test_bit(STRIPE_OP_CHECK, &sh->ops.pending) || - test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) + if (sh->check_state || + (s.syncing && s.locked == 0 && + !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && + !test_bit(STRIPE_INSYNC, &sh->state))) handle_parity_checks5(conf, sh, &s, disks); if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { @@ -2850,49 +2702,35 @@ static void handle_stripe5(struct stripe_head *sh) dev = &sh->dev[s.failed_num]; if (!test_bit(R5_ReWrite, &dev->flags)) { set_bit(R5_Wantwrite, &dev->flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; set_bit(R5_ReWrite, &dev->flags); set_bit(R5_LOCKED, &dev->flags); s.locked++; } else { /* let's read it back */ set_bit(R5_Wantread, &dev->flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; set_bit(R5_LOCKED, &dev->flags); s.locked++; } } - /* Finish postxor operations initiated by the expansion - * process - */ - if (test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete) && - !test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) { - + /* Finish reconstruct operations initiated by the expansion process */ + if (sh->reconstruct_state == reconstruct_state_result) { + sh->reconstruct_state = reconstruct_state_idle; clear_bit(STRIPE_EXPANDING, &sh->state); - - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack); - clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); - - for (i = conf->raid_disks; i--; ) { + for (i = conf->raid_disks; i--; ) set_bit(R5_Wantwrite, &sh->dev[i].flags); - if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) - sh->ops.count++; - } + set_bit(R5_LOCKED, &dev->flags); + s.locked++; } if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && - !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { + !sh->reconstruct_state) { /* Need to write out all blocks after computing parity */ sh->disks = conf->raid_disks; sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks); - s.locked += handle_write_operations5(sh, 1, 1); - } else if (s.expanded && - !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { + schedule_reconstruction5(sh, &s, 1, 1); + } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); wake_up(&conf->wait_for_overlap); @@ -2900,12 +2738,9 @@ static void handle_stripe5(struct stripe_head *sh) } if (s.expanding && s.locked == 0 && - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) + !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) handle_stripe_expansion(conf, sh, NULL); - if (sh->ops.count) - pending = get_stripe_work(sh); - unlock: spin_unlock(&sh->lock); @@ -2913,11 +2748,12 @@ static void handle_stripe5(struct stripe_head *sh) if (unlikely(blocked_rdev)) md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); - if (pending) - raid5_run_ops(sh, pending); + if (s.ops_request) + raid5_run_ops(sh, s.ops_request); - return_io(return_bi); + ops_run_io(sh, &s); + return_io(return_bi); } static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) @@ -3025,8 +2861,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) * might need to be failed */ if (s.failed > 2 && s.to_read+s.to_write+s.written) - handle_requests_to_failed_array(conf, sh, &s, disks, - &return_bi); + handle_failed_stripe(conf, sh, &s, disks, &return_bi); if (s.failed > 2 && s.syncing) { md_done_sync(conf->mddev, STRIPE_SECTORS,0); clear_bit(STRIPE_SYNCING, &sh->state); @@ -3051,7 +2886,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) && !test_bit(R5_LOCKED, &qdev->flags) && test_bit(R5_UPTODATE, &qdev->flags))))) - handle_completed_write_requests(conf, sh, disks, &return_bi); + handle_stripe_clean_event(conf, sh, disks, &return_bi); /* Now we might consider reading some blocks, either to check/generate * parity, or to satisfy requests @@ -3059,11 +2894,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) */ if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || (s.syncing && (s.uptodate < disks)) || s.expanding) - handle_issuing_new_read_requests6(sh, &s, &r6s, disks); + handle_stripe_fill6(sh, &s, &r6s, disks); /* now to consider writing and what else, if anything should be read */ if (s.to_write) - handle_issuing_new_write_requests6(conf, sh, &s, &r6s, disks); + handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); /* maybe we need to check and possibly fix the parity for this stripe * Any reads will already have been scheduled, so we just see if enough @@ -3119,7 +2954,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) } if (s.expanding && s.locked == 0 && - !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) + !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) handle_stripe_expansion(conf, sh, &r6s); unlock: @@ -3129,68 +2964,9 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page) if (unlikely(blocked_rdev)) md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); - return_io(return_bi); - - for (i=disks; i-- ;) { - int rw; - struct bio *bi; - mdk_rdev_t *rdev; - if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) - rw = WRITE; - else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) - rw = READ; - else - continue; - - set_bit(STRIPE_IO_STARTED, &sh->state); - - bi = &sh->dev[i].req; + ops_run_io(sh, &s); - bi->bi_rw = rw; - if (rw == WRITE) - bi->bi_end_io = raid5_end_write_request; - else - bi->bi_end_io = raid5_end_read_request; - - rcu_read_lock(); - rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && test_bit(Faulty, &rdev->flags)) - rdev = NULL; - if (rdev) - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - - if (rdev) { - if (s.syncing || s.expanding || s.expanded) - md_sync_acct(rdev->bdev, STRIPE_SECTORS); - - bi->bi_bdev = rdev->bdev; - pr_debug("for %llu schedule op %ld on disc %d\n", - (unsigned long long)sh->sector, bi->bi_rw, i); - atomic_inc(&sh->count); - bi->bi_sector = sh->sector + rdev->data_offset; - bi->bi_flags = 1 << BIO_UPTODATE; - bi->bi_vcnt = 1; - bi->bi_max_vecs = 1; - bi->bi_idx = 0; - bi->bi_io_vec = &sh->dev[i].vec; - bi->bi_io_vec[0].bv_len = STRIPE_SIZE; - bi->bi_io_vec[0].bv_offset = 0; - bi->bi_size = STRIPE_SIZE; - bi->bi_next = NULL; - if (rw == WRITE && - test_bit(R5_ReWrite, &sh->dev[i].flags)) - atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); - generic_make_request(bi); - } else { - if (rw == WRITE) - set_bit(STRIPE_DEGRADED, &sh->state); - pr_debug("skip op %ld on disc %d for sector %llu\n", - bi->bi_rw, i, (unsigned long long)sh->sector); - clear_bit(R5_LOCKED, &sh->dev[i].flags); - set_bit(STRIPE_HANDLE, &sh->state); - } - } + return_io(return_bi); } static void handle_stripe(struct stripe_head *sh, struct page *tmp_page) @@ -3297,15 +3073,17 @@ static int raid5_congested(void *data, int bits) /* We want read requests to align with chunks where possible, * but write requests don't need to. */ -static int raid5_mergeable_bvec(struct request_queue *q, struct bio *bio, struct bio_vec *biovec) +static int raid5_mergeable_bvec(struct request_queue *q, + struct bvec_merge_data *bvm, + struct bio_vec *biovec) { mddev_t *mddev = q->queuedata; - sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); + sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); int max; unsigned int chunk_sectors = mddev->chunk_size >> 9; - unsigned int bio_sectors = bio->bi_size >> 9; + unsigned int bio_sectors = bvm->bi_size >> 9; - if (bio_data_dir(bio) == WRITE) + if ((bvm->bi_rw & 1) == WRITE) return biovec->bv_len; /* always allow writes to be mergeable */ max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; @@ -3678,9 +3456,7 @@ static int make_request(struct request_queue *q, struct bio * bi) if ( rw == WRITE ) md_write_end(mddev); - bi->bi_end_io(bi, - test_bit(BIO_UPTODATE, &bi->bi_flags) - ? 0 : -EIO); + bio_endio(bi, 0); } return 0; } @@ -3766,7 +3542,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped j == raid6_next_disk(sh->pd_idx, sh->disks)) continue; s = compute_blocknr(sh, j); - if (s < (mddev->array_size<<1)) { + if (s < mddev->array_sectors) { skipped = 1; continue; } @@ -3983,12 +3759,8 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) spin_lock_irq(&conf->device_lock); remaining = --raid_bio->bi_phys_segments; spin_unlock_irq(&conf->device_lock); - if (remaining == 0) { - - raid_bio->bi_end_io(raid_bio, - test_bit(BIO_UPTODATE, &raid_bio->bi_flags) - ? 0 : -EIO); - } + if (remaining == 0) + bio_endio(raid_bio, 0); if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_stripe); return handled; @@ -4075,6 +3847,8 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) { raid5_conf_t *conf = mddev_to_conf(mddev); unsigned long new; + int err; + if (len >= PAGE_SIZE) return -EINVAL; if (!conf) @@ -4090,7 +3864,9 @@ raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) else break; } - md_allow_write(mddev); + err = md_allow_write(mddev); + if (err) + return err; while (new > conf->max_nr_stripes) { if (grow_one_stripe(conf)) conf->max_nr_stripes++; @@ -4256,6 +4032,7 @@ static int run(mddev_t *mddev) goto abort; } spin_lock_init(&conf->device_lock); + mddev->queue->queue_lock = &conf->device_lock; init_waitqueue_head(&conf->wait_for_stripe); init_waitqueue_head(&conf->wait_for_overlap); INIT_LIST_HEAD(&conf->handle_list); @@ -4285,7 +4062,9 @@ static int run(mddev_t *mddev) " disk %d\n", bdevname(rdev->bdev,b), raid_disk); working_disks++; - } + } else + /* Cannot rely on bitmap to complete recovery */ + conf->fullsync = 1; } /* @@ -4412,7 +4191,7 @@ static int run(mddev_t *mddev) mddev->queue->backing_dev_info.congested_data = mddev; mddev->queue->backing_dev_info.congested_fn = raid5_congested; - mddev->array_size = mddev->size * (conf->previous_raid_disks - + mddev->array_sectors = 2 * mddev->size * (conf->previous_raid_disks - conf->max_degraded); blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); @@ -4562,6 +4341,14 @@ static int raid5_remove_disk(mddev_t *mddev, int number) err = -EBUSY; goto abort; } + /* Only remove non-faulty devices if recovery + * isn't possible. + */ + if (!test_bit(Faulty, &rdev->flags) && + mddev->degraded <= conf->max_degraded) { + err = -EBUSY; + goto abort; + } p->rdev = NULL; synchronize_rcu(); if (atomic_read(&rdev->nr_pending)) { @@ -4579,35 +4366,41 @@ abort: static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) { raid5_conf_t *conf = mddev->private; - int found = 0; + int err = -EEXIST; int disk; struct disk_info *p; + int first = 0; + int last = conf->raid_disks - 1; if (mddev->degraded > conf->max_degraded) /* no point adding a device */ - return 0; + return -EINVAL; + + if (rdev->raid_disk >= 0) + first = last = rdev->raid_disk; /* * find the disk ... but prefer rdev->saved_raid_disk * if possible. */ if (rdev->saved_raid_disk >= 0 && + rdev->saved_raid_disk >= first && conf->disks[rdev->saved_raid_disk].rdev == NULL) disk = rdev->saved_raid_disk; else - disk = 0; - for ( ; disk < conf->raid_disks; disk++) + disk = first; + for ( ; disk <= last ; disk++) if ((p=conf->disks + disk)->rdev == NULL) { clear_bit(In_sync, &rdev->flags); rdev->raid_disk = disk; - found = 1; + err = 0; if (rdev->saved_raid_disk != disk) conf->fullsync = 1; rcu_assign_pointer(p->rdev, rdev); break; } print_raid5_conf(conf); - return found; + return err; } static int raid5_resize(mddev_t *mddev, sector_t sectors) @@ -4622,8 +4415,9 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) raid5_conf_t *conf = mddev_to_conf(mddev); sectors &= ~((sector_t)mddev->chunk_size/512 - 1); - mddev->array_size = (sectors * (mddev->raid_disks-conf->max_degraded))>>1; - set_capacity(mddev->gendisk, mddev->array_size << 1); + mddev->array_sectors = sectors * (mddev->raid_disks + - conf->max_degraded); + set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; if (sectors/2 > mddev->size && mddev->recovery_cp == MaxSector) { mddev->recovery_cp = mddev->size << 1; @@ -4708,7 +4502,7 @@ static int raid5_start_reshape(mddev_t *mddev) rdev_for_each(rdev, rtmp, mddev) if (rdev->raid_disk < 0 && !test_bit(Faulty, &rdev->flags)) { - if (raid5_add_disk(mddev, rdev)) { + if (raid5_add_disk(mddev, rdev) == 0) { char nm[20]; set_bit(In_sync, &rdev->flags); added_devices++; @@ -4756,15 +4550,16 @@ static void end_reshape(raid5_conf_t *conf) struct block_device *bdev; if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { - conf->mddev->array_size = conf->mddev->size * + conf->mddev->array_sectors = 2 * conf->mddev->size * (conf->raid_disks - conf->max_degraded); - set_capacity(conf->mddev->gendisk, conf->mddev->array_size << 1); + set_capacity(conf->mddev->gendisk, conf->mddev->array_sectors); conf->mddev->changed = 1; bdev = bdget_disk(conf->mddev->gendisk, 0); if (bdev) { mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, (loff_t)conf->mddev->array_size << 10); + i_size_write(bdev->bd_inode, + (loff_t)conf->mddev->array_sectors << 9); mutex_unlock(&bdev->bd_inode->i_mutex); bdput(bdev); } |