Diffstat (limited to 'drivers/md')
29 files changed, 903 insertions, 447 deletions
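
Several hunks below (bcache btree/debug/movinggc/request/writeback, dm-log-writes, raid1) replace an open-coded walk that frees every page of a bio with the block layer's bio_free_pages() helper. A minimal sketch of the two equivalent forms follows; the function names are hypothetical, only bio_for_each_segment_all(), __free_page() and bio_free_pages() come from the kernel:

#include <linux/bio.h>
#include <linux/gfp.h>

/* Old call-site pattern: iterate the bio's segments and free each page. */
static void put_bio_pages_open_coded(struct bio *bio)
{
	struct bio_vec *bv;
	int i;

	bio_for_each_segment_all(bv, bio, i)
		__free_page(bv->bv_page);
}

/* New call-site pattern: one block-layer helper does the same walk. */
static void put_bio_pages_helper(struct bio *bio)
{
	bio_free_pages(bio);
}
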
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 76f7534d1dd1..81d3db40cd7b 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -361,12 +361,8 @@ static void __btree_node_write_done(struct closure *cl) static void btree_node_write_done(struct closure *cl) { struct btree *b = container_of(cl, struct btree, io); - struct bio_vec *bv; - int n; - - bio_for_each_segment_all(bv, b->bio, n) - __free_page(bv->bv_page); + bio_free_pages(b->bio); __btree_node_write_done(cl); } diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index c28df164701e..333a1e5f6ae6 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -107,9 +107,8 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) { char name[BDEVNAME_SIZE]; struct bio *check; - struct bio_vec bv, *bv2; + struct bio_vec bv; struct bvec_iter iter; - int i; check = bio_clone(bio, GFP_NOIO); if (!check) @@ -136,8 +135,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) kunmap_atomic(p1); } - bio_for_each_segment_all(bv2, check, i) - __free_page(bv2->bv_page); + bio_free_pages(check); out_put: bio_put(check); } diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 1881319f2298..5c4bddecfaf0 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -44,11 +44,8 @@ static void write_moving_finish(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); struct bio *bio = &io->bio.bio; - struct bio_vec *bv; - int i; - bio_for_each_segment_all(bv, bio, i) - __free_page(bv->bv_page); + bio_free_pages(bio); if (io->op.replace_collision) trace_bcache_gc_copy_collision(&io->w->key); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 4b177fe11ebb..40ffe5e424b3 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -694,13 +694,8 @@ static void cached_dev_cache_miss_done(struct closure *cl) if (s->iop.replace_collision) bch_mark_cache_miss_collision(s->iop.c, s->d); - if (s->iop.bio) { - int i; - struct bio_vec *bv; - - bio_for_each_segment_all(bv, s->iop.bio, i) - __free_page(bv->bv_page); - } + if (s->iop.bio) + bio_free_pages(s->iop.bio); cached_dev_bio_complete(cl); } diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index d9fd2a62e5f6..e51644e503a5 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -128,11 +128,8 @@ static void write_dirty_finish(struct closure *cl) struct dirty_io *io = container_of(cl, struct dirty_io, cl); struct keybuf_key *w = io->bio.bi_private; struct cached_dev *dc = io->dc; - struct bio_vec *bv; - int i; - bio_for_each_segment_all(bv, &io->bio, i) - __free_page(bv->bv_page); + bio_free_pages(&io->bio); /* This is kind of a dumb way of signalling errors. 
*/ if (KEY_DIRTY(&w->key)) { diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 13041ee37ad6..2d826927a3bf 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1903,10 +1903,8 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot, struct bitmap_counts *counts; struct bitmap *bitmap = bitmap_create(mddev, slot); - if (IS_ERR(bitmap)) { - bitmap_free(bitmap); + if (IS_ERR(bitmap)) return PTR_ERR(bitmap); - } rv = bitmap_init_from_disk(bitmap, 0); if (rv) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 8625040bae92..125aedc3875f 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -191,19 +191,6 @@ static void dm_bufio_unlock(struct dm_bufio_client *c) mutex_unlock(&c->lock); } -/* - * FIXME Move to sched.h? - */ -#ifdef CONFIG_PREEMPT_VOLUNTARY -# define dm_bufio_cond_resched() \ -do { \ - if (unlikely(need_resched())) \ - _cond_resched(); \ -} while (0) -#else -# define dm_bufio_cond_resched() do { } while (0) -#endif - /*----------------------------------------------------------------*/ /* @@ -741,7 +728,7 @@ static void __flush_write_list(struct list_head *write_list) list_entry(write_list->next, struct dm_buffer, write_list); list_del(&b->write_list); submit_io(b, WRITE, b->block, write_endio); - dm_bufio_cond_resched(); + cond_resched(); } blk_finish_plug(&plug); } @@ -780,7 +767,7 @@ static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c) __unlink_buffer(b); return b; } - dm_bufio_cond_resched(); + cond_resched(); } list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) { @@ -791,7 +778,7 @@ static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c) __unlink_buffer(b); return b; } - dm_bufio_cond_resched(); + cond_resched(); } return NULL; @@ -923,7 +910,7 @@ static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait, return; __write_dirty_buffer(b, write_list); - dm_bufio_cond_resched(); + cond_resched(); } } @@ -973,7 +960,7 @@ static void __check_watermark(struct dm_bufio_client *c, return; __free_buffer_wake(b); - dm_bufio_cond_resched(); + cond_resched(); } if (c->n_buffers[LIST_DIRTY] > threshold_buffers) @@ -1170,7 +1157,7 @@ void dm_bufio_prefetch(struct dm_bufio_client *c, submit_io(b, READ, b->block, read_endio); dm_bufio_release(b); - dm_bufio_cond_resched(); + cond_resched(); if (!n_blocks) goto flush_plug; @@ -1291,7 +1278,7 @@ again: !test_bit(B_WRITING, &b->state)) __relink_lru(b, LIST_CLEAN); - dm_bufio_cond_resched(); + cond_resched(); /* * If we dropped the lock, the list is no longer consistent, @@ -1574,7 +1561,7 @@ static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan, freed++; if (!--nr_to_scan || ((count - freed) <= retain_target)) return freed; - dm_bufio_cond_resched(); + cond_resched(); } } return freed; @@ -1808,7 +1795,7 @@ static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz) if (__try_evict_buffer(b, 0)) count--; - dm_bufio_cond_resched(); + cond_resched(); } dm_bufio_unlock(c); diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 3970cda10080..695577812cf6 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -140,6 +140,13 @@ struct dm_cache_metadata { * the device. */ bool fail_io:1; + + /* + * These structures are used when loading metadata. They're too + * big to put on the stack. 
+ */ + struct dm_array_cursor mapping_cursor; + struct dm_array_cursor hint_cursor; }; /*------------------------------------------------------------------- @@ -1171,31 +1178,37 @@ static bool hints_array_available(struct dm_cache_metadata *cmd, hints_array_initialized(cmd); } -static int __load_mapping(void *context, uint64_t cblock, void *leaf) +static int __load_mapping(struct dm_cache_metadata *cmd, + uint64_t cb, bool hints_valid, + struct dm_array_cursor *mapping_cursor, + struct dm_array_cursor *hint_cursor, + load_mapping_fn fn, void *context) { int r = 0; - bool dirty; - __le64 value; - __le32 hint_value = 0; + + __le64 mapping; + __le32 hint = 0; + + __le64 *mapping_value_le; + __le32 *hint_value_le; + dm_oblock_t oblock; unsigned flags; - struct thunk *thunk = context; - struct dm_cache_metadata *cmd = thunk->cmd; - memcpy(&value, leaf, sizeof(value)); - unpack_value(value, &oblock, &flags); + dm_array_cursor_get_value(mapping_cursor, (void **) &mapping_value_le); + memcpy(&mapping, mapping_value_le, sizeof(mapping)); + unpack_value(mapping, &oblock, &flags); if (flags & M_VALID) { - if (thunk->hints_valid) { - r = dm_array_get_value(&cmd->hint_info, cmd->hint_root, - cblock, &hint_value); - if (r && r != -ENODATA) - return r; + if (hints_valid) { + dm_array_cursor_get_value(hint_cursor, (void **) &hint_value_le); + memcpy(&hint, hint_value_le, sizeof(hint)); } - dirty = thunk->respect_dirty_flags ? (flags & M_DIRTY) : true; - r = thunk->fn(thunk->context, oblock, to_cblock(cblock), - dirty, le32_to_cpu(hint_value), thunk->hints_valid); + r = fn(context, oblock, to_cblock(cb), flags & M_DIRTY, + le32_to_cpu(hint), hints_valid); + if (r) + DMERR("policy couldn't load cblock"); } return r; @@ -1205,16 +1218,60 @@ static int __load_mappings(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy, load_mapping_fn fn, void *context) { - struct thunk thunk; + int r; + uint64_t cb; + + bool hints_valid = hints_array_available(cmd, policy); + + if (from_cblock(cmd->cache_blocks) == 0) + /* Nothing to do */ + return 0; + + r = dm_array_cursor_begin(&cmd->info, cmd->root, &cmd->mapping_cursor); + if (r) + return r; - thunk.fn = fn; - thunk.context = context; + if (hints_valid) { + r = dm_array_cursor_begin(&cmd->hint_info, cmd->hint_root, &cmd->hint_cursor); + if (r) { + dm_array_cursor_end(&cmd->mapping_cursor); + return r; + } + } + + for (cb = 0; ; cb++) { + r = __load_mapping(cmd, cb, hints_valid, + &cmd->mapping_cursor, &cmd->hint_cursor, + fn, context); + if (r) + goto out; + + /* + * We need to break out before we move the cursors. 
+ */ + if (cb >= (from_cblock(cmd->cache_blocks) - 1)) + break; - thunk.cmd = cmd; - thunk.respect_dirty_flags = cmd->clean_when_opened; - thunk.hints_valid = hints_array_available(cmd, policy); + r = dm_array_cursor_next(&cmd->mapping_cursor); + if (r) { + DMERR("dm_array_cursor_next for mapping failed"); + goto out; + } - return dm_array_walk(&cmd->info, cmd->root, __load_mapping, &thunk); + if (hints_valid) { + r = dm_array_cursor_next(&cmd->hint_cursor); + if (r) { + DMERR("dm_array_cursor_next for hint failed"); + goto out; + } + } + } +out: + dm_array_cursor_end(&cmd->mapping_cursor); + if (hints_valid) + dm_array_cursor_end(&cmd->hint_cursor); + + return r; } int dm_cache_load_mappings(struct dm_cache_metadata *cmd, @@ -1368,10 +1425,24 @@ int dm_cache_get_metadata_dev_size(struct dm_cache_metadata *cmd, /*----------------------------------------------------------------*/ -static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy) +static int get_hint(uint32_t index, void *value_le, void *context) +{ + uint32_t value; + struct dm_cache_policy *policy = context; + + value = policy_get_hint(policy, to_cblock(index)); + *((__le32 *) value_le) = cpu_to_le32(value); + + return 0; +} + +/* + * It's quicker to always delete the hint array, and recreate with + * dm_array_new(). + */ +static int write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy) { int r; - __le32 value; size_t hint_size; const char *policy_name = dm_cache_policy_get_name(policy); const unsigned *policy_version = dm_cache_policy_get_version(policy); @@ -1380,63 +1451,23 @@ static int begin_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *po (strlen(policy_name) > sizeof(cmd->policy_name) - 1)) return -EINVAL; - if (!policy_unchanged(cmd, policy)) { - strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name)); - memcpy(cmd->policy_version, policy_version, sizeof(cmd->policy_version)); - - hint_size = dm_cache_policy_get_hint_size(policy); - if (!hint_size) - return 0; /* short-circuit hints initialization */ - cmd->policy_hint_size = hint_size; + strncpy(cmd->policy_name, policy_name, sizeof(cmd->policy_name)); + memcpy(cmd->policy_version, policy_version, sizeof(cmd->policy_version)); - if (cmd->hint_root) { - r = dm_array_del(&cmd->hint_info, cmd->hint_root); - if (r) - return r; - } + hint_size = dm_cache_policy_get_hint_size(policy); + if (!hint_size) + return 0; /* short-circuit hints initialization */ + cmd->policy_hint_size = hint_size; - r = dm_array_empty(&cmd->hint_info, &cmd->hint_root); + if (cmd->hint_root) { + r = dm_array_del(&cmd->hint_info, cmd->hint_root); if (r) return r; - - value = cpu_to_le32(0); - __dm_bless_for_disk(&value); - r = dm_array_resize(&cmd->hint_info, cmd->hint_root, 0, - from_cblock(cmd->cache_blocks), - &value, &cmd->hint_root); - if (r) - return r; - } - - return 0; -} - -static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock, uint32_t hint) -{ - struct dm_cache_metadata *cmd = context; - __le32 value = cpu_to_le32(hint); - int r; - - __dm_bless_for_disk(&value); - - r = dm_array_set_value(&cmd->hint_info, cmd->hint_root, - from_cblock(cblock), &value, &cmd->hint_root); - cmd->changed = true; - - return r; -} - -static int write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy) -{ - int r; - - r = begin_hints(cmd, policy); - if (r) { - DMERR("begin_hints failed"); - return r; } - return policy_walk_mappings(policy, save_hint, cmd); + return dm_array_new(&cmd->hint_info, 
&cmd->hint_root, + from_cblock(cmd->cache_blocks), + get_hint, policy); } int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *policy) diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c index 14aaaf059f06..2e8a8f1d8358 100644 --- a/drivers/md/dm-cache-policy-cleaner.c +++ b/drivers/md/dm-cache-policy-cleaner.c @@ -395,7 +395,7 @@ static void init_policy_functions(struct policy *p) p->policy.set_dirty = wb_set_dirty; p->policy.clear_dirty = wb_clear_dirty; p->policy.load_mapping = wb_load_mapping; - p->policy.walk_mappings = NULL; + p->policy.get_hint = NULL; p->policy.remove_mapping = wb_remove_mapping; p->policy.writeback_work = wb_writeback_work; p->policy.force_mapping = wb_force_mapping; diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h index 2816018faa7f..808ee0e2b2c4 100644 --- a/drivers/md/dm-cache-policy-internal.h +++ b/drivers/md/dm-cache-policy-internal.h @@ -48,10 +48,10 @@ static inline int policy_load_mapping(struct dm_cache_policy *p, return p->load_mapping(p, oblock, cblock, hint, hint_valid); } -static inline int policy_walk_mappings(struct dm_cache_policy *p, - policy_walk_fn fn, void *context) +static inline uint32_t policy_get_hint(struct dm_cache_policy *p, + dm_cblock_t cblock) { - return p->walk_mappings ? p->walk_mappings(p, fn, context) : 0; + return p->get_hint ? p->get_hint(p, cblock) : 0; } static inline int policy_writeback_work(struct dm_cache_policy *p, diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c index cf48a617a3a4..c33f4a6e1d7d 100644 --- a/drivers/md/dm-cache-policy-smq.c +++ b/drivers/md/dm-cache-policy-smq.c @@ -1359,6 +1359,11 @@ static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) spin_unlock_irqrestore(&mq->lock, flags); } +static unsigned random_level(dm_cblock_t cblock) +{ + return hash_32_generic(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1); +} + static int smq_load_mapping(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t cblock, uint32_t hint, bool hint_valid) @@ -1369,47 +1374,21 @@ static int smq_load_mapping(struct dm_cache_policy *p, e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock)); e->oblock = oblock; e->dirty = false; /* this gets corrected in a minute */ - e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : 1; + e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock); push(mq, e); return 0; } -static int smq_save_hints(struct smq_policy *mq, struct queue *q, - policy_walk_fn fn, void *context) -{ - int r; - unsigned level; - struct entry *e; - - for (level = 0; level < q->nr_levels; level++) - for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) { - if (!e->sentinel) { - r = fn(context, infer_cblock(mq, e), - e->oblock, e->level); - if (r) - return r; - } - } - - return 0; -} - -static int smq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn, - void *context) +static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock) { struct smq_policy *mq = to_smq_policy(p); - int r = 0; + struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock)); - /* - * We don't need to lock here since this method is only called once - * the IO has stopped. 
- */ - r = smq_save_hints(mq, &mq->clean, fn, context); - if (!r) - r = smq_save_hints(mq, &mq->dirty, fn, context); + if (!e->allocated) + return 0; - return r; + return e->level; } static void __remove_mapping(struct smq_policy *mq, dm_oblock_t oblock) @@ -1616,7 +1595,7 @@ static void init_policy_functions(struct smq_policy *mq, bool mimic_mq) mq->policy.set_dirty = smq_set_dirty; mq->policy.clear_dirty = smq_clear_dirty; mq->policy.load_mapping = smq_load_mapping; - mq->policy.walk_mappings = smq_walk_mappings; + mq->policy.get_hint = smq_get_hint; mq->policy.remove_mapping = smq_remove_mapping; mq->policy.remove_cblock = smq_remove_cblock; mq->policy.writeback_work = smq_writeback_work; diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h index 05db56eedb6a..aa10b1493f34 100644 --- a/drivers/md/dm-cache-policy.h +++ b/drivers/md/dm-cache-policy.h @@ -90,9 +90,6 @@ struct policy_result { dm_cblock_t cblock; /* POLICY_HIT, POLICY_NEW, POLICY_REPLACE */ }; -typedef int (*policy_walk_fn)(void *context, dm_cblock_t cblock, - dm_oblock_t oblock, uint32_t hint); - /* * The cache policy object. Just a bunch of methods. It is envisaged that * this structure will be embedded in a bigger, policy specific structure @@ -158,8 +155,11 @@ struct dm_cache_policy { int (*load_mapping)(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t cblock, uint32_t hint, bool hint_valid); - int (*walk_mappings)(struct dm_cache_policy *p, policy_walk_fn fn, - void *context); + /* + * Gets the hint for a given cblock. Called in a single threaded + * context. So no locking required. + */ + uint32_t (*get_hint)(struct dm_cache_policy *p, dm_cblock_t cblock); /* * Override functions used on the error paths of the core target. diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 0448e7e35c8c..a2768835d394 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -113,8 +113,7 @@ struct iv_tcw_private { * and encrypts / decrypts at the same time. */ enum flags { DM_CRYPT_SUSPENDED, DM_CRYPT_KEY_VALID, - DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD, - DM_CRYPT_EXIT_THREAD}; + DM_CRYPT_SAME_CPU, DM_CRYPT_NO_OFFLOAD }; /* * The fields in here must be read only after initialization. 
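
/*
 * A minimal sketch (not the dm-crypt code itself) of the pattern the
 * surrounding dm-crypt hunks move to: instead of a private
 * DM_CRYPT_EXIT_THREAD flag signalled under the wait-queue lock, the
 * worker checks the stock kthread_should_stop() and teardown just calls
 * kthread_stop(). Names below (worker_fn, work_waitq) are hypothetical;
 * the real loop also tests its work tree under a spinlock before sleeping.
 */
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(work_waitq);	/* hypothetical work signal */

static int worker_fn(void *data)
{
	DECLARE_WAITQUEUE(wait, current);

	while (1) {
		set_current_state(TASK_INTERRUPTIBLE);
		add_wait_queue(&work_waitq, &wait);

		/*
		 * Checked after changing state and queueing ourselves:
		 * kthread_stop() sets the flag and wakes the task, so the
		 * request is caught either here or right after schedule().
		 */
		if (kthread_should_stop()) {
			set_current_state(TASK_RUNNING);
			remove_wait_queue(&work_waitq, &wait);
			break;
		}

		schedule();

		set_current_state(TASK_RUNNING);
		remove_wait_queue(&work_waitq, &wait);

		/* ... drain whatever work woke us up ... */
	}

	return 0;
}

/*
 * Teardown is then simply kthread_stop(task), which replaces the old
 * set_bit(DM_CRYPT_EXIT_THREAD) + wake_up_locked() dance.
 */
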
@@ -1207,18 +1206,20 @@ continue_locked: if (!RB_EMPTY_ROOT(&cc->write_tree)) goto pop_from_list; - if (unlikely(test_bit(DM_CRYPT_EXIT_THREAD, &cc->flags))) { - spin_unlock_irq(&cc->write_thread_wait.lock); - break; - } - - __set_current_state(TASK_INTERRUPTIBLE); + set_current_state(TASK_INTERRUPTIBLE); __add_wait_queue(&cc->write_thread_wait, &wait); spin_unlock_irq(&cc->write_thread_wait.lock); + if (unlikely(kthread_should_stop())) { + set_task_state(current, TASK_RUNNING); + remove_wait_queue(&cc->write_thread_wait, &wait); + break; + } + schedule(); + set_task_state(current, TASK_RUNNING); spin_lock_irq(&cc->write_thread_wait.lock); __remove_wait_queue(&cc->write_thread_wait, &wait); goto continue_locked; @@ -1533,13 +1534,8 @@ static void crypt_dtr(struct dm_target *ti) if (!cc) return; - if (cc->write_thread) { - spin_lock_irq(&cc->write_thread_wait.lock); - set_bit(DM_CRYPT_EXIT_THREAD, &cc->flags); - wake_up_locked(&cc->write_thread_wait); - spin_unlock_irq(&cc->write_thread_wait.lock); + if (cc->write_thread) kthread_stop(cc->write_thread); - } if (cc->io_queue) destroy_workqueue(cc->io_queue); diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 49e4d8d4558f..4dfe38655a49 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -149,8 +149,6 @@ static void put_io_block(struct log_writes_c *lc) static void log_end_io(struct bio *bio) { struct log_writes_c *lc = bio->bi_private; - struct bio_vec *bvec; - int i; if (bio->bi_error) { unsigned long flags; @@ -161,9 +159,7 @@ static void log_end_io(struct bio *bio) spin_unlock_irqrestore(&lc->blocks_lock, flags); } - bio_for_each_segment_all(bvec, bio, i) - __free_page(bvec->bv_page); - + bio_free_pages(bio); put_io_block(lc); bio_put(bio); } diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index ac734e5bbe48..e477af8596e2 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -550,9 +550,9 @@ static int __multipath_map(struct dm_target *ti, struct request *clone, pgpath = choose_pgpath(m, nr_bytes); if (!pgpath) { - if (!must_push_back_rq(m)) - r = -EIO; /* Failed */ - return r; + if (must_push_back_rq(m)) + return DM_MAPIO_DELAY_REQUEUE; + return -EIO; /* Failed */ } else if (test_bit(MPATHF_QUEUE_IO, &m->flags) || test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) { pg_init_all_paths(m); @@ -680,9 +680,11 @@ static int multipath_map_bio(struct dm_target *ti, struct bio *bio) return __multipath_map_bio(m, bio, mpio); } -static void process_queued_bios_list(struct multipath *m) +static void process_queued_io_list(struct multipath *m) { - if (m->queue_mode == DM_TYPE_BIO_BASED) + if (m->queue_mode == DM_TYPE_MQ_REQUEST_BASED) + dm_mq_kick_requeue_list(dm_table_get_md(m->ti->table)); + else if (m->queue_mode == DM_TYPE_BIO_BASED) queue_work(kmultipathd, &m->process_queued_bios); } @@ -752,7 +754,7 @@ static int queue_if_no_path(struct multipath *m, bool queue_if_no_path, if (!queue_if_no_path) { dm_table_run_md_queue_async(m->ti->table); - process_queued_bios_list(m); + process_queued_io_list(m); } return 0; @@ -1193,21 +1195,17 @@ static int multipath_ctr(struct dm_target *ti, unsigned argc, char **argv) static void multipath_wait_for_pg_init_completion(struct multipath *m) { - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue(&m->pg_init_wait, &wait); + DEFINE_WAIT(wait); while (1) { - set_current_state(TASK_UNINTERRUPTIBLE); + prepare_to_wait(&m->pg_init_wait, &wait, TASK_UNINTERRUPTIBLE); if (!atomic_read(&m->pg_init_in_progress)) break; io_schedule(); } - 
set_current_state(TASK_RUNNING); - - remove_wait_queue(&m->pg_init_wait, &wait); + finish_wait(&m->pg_init_wait, &wait); } static void flush_multipath_work(struct multipath *m) @@ -1308,7 +1306,7 @@ out: spin_unlock_irqrestore(&m->lock, flags); if (run_queue) { dm_table_run_md_queue_async(m->ti->table); - process_queued_bios_list(m); + process_queued_io_list(m); } return r; @@ -1506,7 +1504,7 @@ static void pg_init_done(void *data, int errors) } clear_bit(MPATHF_QUEUE_IO, &m->flags); - process_queued_bios_list(m); + process_queued_io_list(m); /* * Wake up any thread waiting to suspend. @@ -1521,10 +1519,10 @@ static void activate_path(struct work_struct *work) { struct pgpath *pgpath = container_of(work, struct pgpath, activate_path.work); + struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev); - if (pgpath->is_active) - scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev), - pg_init_done, pgpath); + if (pgpath->is_active && !blk_queue_dying(q)) + scsi_dh_activate(q, pg_init_done, pgpath); else pg_init_done(pgpath, SCSI_DH_DEV_OFFLINED); } @@ -1532,6 +1530,14 @@ static void activate_path(struct work_struct *work) static int noretry_error(int error) { switch (error) { + case -EBADE: + /* + * EBADE signals an reservation conflict. + * We shouldn't fail the path here as we can communicate with + * the target. We should failover to the next path, but in + * doing so we might be causing a ping-pong between paths. + * So just return the reservation conflict error. + */ case -EOPNOTSUPP: case -EREMOTEIO: case -EILSEQ: @@ -1576,9 +1582,6 @@ static int do_end_io(struct multipath *m, struct request *clone, if (!test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) { if (!must_push_back_rq(m)) r = -EIO; - } else { - if (error == -EBADE) - r = error; } } @@ -1627,9 +1630,6 @@ static int do_end_io_bio(struct multipath *m, struct bio *clone, if (!must_push_back_bio(m)) return -EIO; return DM_ENDIO_REQUEUE; - } else { - if (error == -EBADE) - return error; } } @@ -1941,7 +1941,7 @@ static int multipath_prepare_ioctl(struct dm_target *ti, if (test_bit(MPATHF_PG_INIT_REQUIRED, &m->flags)) pg_init_all_paths(m); dm_table_run_md_queue_async(m->ti->table); - process_queued_bios_list(m); + process_queued_io_list(m); } /* @@ -1994,11 +1994,14 @@ static int multipath_busy(struct dm_target *ti) struct priority_group *pg, *next_pg; struct pgpath *pgpath; - /* pg_init in progress or no paths available */ - if (atomic_read(&m->pg_init_in_progress) || - (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags))) + /* pg_init in progress */ + if (atomic_read(&m->pg_init_in_progress)) return true; + /* no paths available, for blk-mq: rely on IO mapping to delay requeue */ + if (!atomic_read(&m->nr_valid_paths) && test_bit(MPATHF_QUEUE_IF_NO_PATH, &m->flags)) + return (m->queue_mode != DM_TYPE_MQ_REQUEST_BASED); + /* Guess which priority_group will be used at next mapping time */ pg = lockless_dereference(m->current_pg); next_pg = lockless_dereference(m->next_pg); diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index d1c3645d5ce1..5eacce1ef88b 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -73,15 +73,24 @@ static void dm_old_start_queue(struct request_queue *q) spin_unlock_irqrestore(q->queue_lock, flags); } +static void dm_mq_start_queue(struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); + queue_flag_clear(QUEUE_FLAG_STOPPED, q); + spin_unlock_irqrestore(q->queue_lock, flags); + + blk_mq_start_stopped_hw_queues(q, true); + 
blk_mq_kick_requeue_list(q); +} + void dm_start_queue(struct request_queue *q) { if (!q->mq_ops) dm_old_start_queue(q); - else { - queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, q); - blk_mq_start_stopped_hw_queues(q, true); - blk_mq_kick_requeue_list(q); - } + else + dm_mq_start_queue(q); } static void dm_old_stop_queue(struct request_queue *q) @@ -89,27 +98,35 @@ static void dm_old_stop_queue(struct request_queue *q) unsigned long flags; spin_lock_irqsave(q->queue_lock, flags); + if (!blk_queue_stopped(q)) + blk_stop_queue(q); + spin_unlock_irqrestore(q->queue_lock, flags); +} + +static void dm_mq_stop_queue(struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(q->queue_lock, flags); if (blk_queue_stopped(q)) { spin_unlock_irqrestore(q->queue_lock, flags); return; } - blk_stop_queue(q); + queue_flag_set(QUEUE_FLAG_STOPPED, q); spin_unlock_irqrestore(q->queue_lock, flags); + + /* Avoid that requeuing could restart the queue. */ + blk_mq_cancel_requeue_work(q); + blk_mq_stop_hw_queues(q); } void dm_stop_queue(struct request_queue *q) { if (!q->mq_ops) dm_old_stop_queue(q); - else { - spin_lock_irq(q->queue_lock); - queue_flag_set(QUEUE_FLAG_STOPPED, q); - spin_unlock_irq(q->queue_lock); - - blk_mq_cancel_requeue_work(q); - blk_mq_stop_hw_queues(q); - } + else + dm_mq_stop_queue(q); } static struct dm_rq_target_io *alloc_old_rq_tio(struct mapped_device *md, @@ -319,21 +336,32 @@ static void dm_old_requeue_request(struct request *rq) spin_unlock_irqrestore(q->queue_lock, flags); } -static void dm_mq_requeue_request(struct request *rq) +static void __dm_mq_kick_requeue_list(struct request_queue *q, unsigned long msecs) { - struct request_queue *q = rq->q; unsigned long flags; - blk_mq_requeue_request(rq); spin_lock_irqsave(q->queue_lock, flags); if (!blk_queue_stopped(q)) - blk_mq_kick_requeue_list(q); + blk_mq_delay_kick_requeue_list(q, msecs); spin_unlock_irqrestore(q->queue_lock, flags); } -static void dm_requeue_original_request(struct mapped_device *md, - struct request *rq) +void dm_mq_kick_requeue_list(struct mapped_device *md) +{ + __dm_mq_kick_requeue_list(dm_get_md_queue(md), 0); +} +EXPORT_SYMBOL(dm_mq_kick_requeue_list); + +static void dm_mq_delay_requeue_request(struct request *rq, unsigned long msecs) +{ + blk_mq_requeue_request(rq); + __dm_mq_kick_requeue_list(rq->q, msecs); +} + +static void dm_requeue_original_request(struct dm_rq_target_io *tio, bool delay_requeue) { + struct mapped_device *md = tio->md; + struct request *rq = tio->orig; int rw = rq_data_dir(rq); rq_end_stats(md, rq); @@ -342,7 +370,7 @@ static void dm_requeue_original_request(struct mapped_device *md, if (!rq->q->mq_ops) dm_old_requeue_request(rq); else - dm_mq_requeue_request(rq); + dm_mq_delay_requeue_request(rq, delay_requeue ? 
5000 : 0); rq_completed(md, rw, false); } @@ -372,7 +400,7 @@ static void dm_done(struct request *clone, int error, bool mapped) return; else if (r == DM_ENDIO_REQUEUE) /* The target wants to requeue the I/O */ - dm_requeue_original_request(tio->md, tio->orig); + dm_requeue_original_request(tio, false); else { DMWARN("unimplemented target endio return value: %d", r); BUG(); @@ -612,20 +640,23 @@ static int dm_old_prep_fn(struct request_queue *q, struct request *rq) /* * Returns: - * 0 : the request has been processed - * DM_MAPIO_REQUEUE : the original request needs to be requeued + * DM_MAPIO_* : the request has been processed as indicated + * DM_MAPIO_REQUEUE : the original request needs to be immediately requeued * < 0 : the request was completed due to failure */ -static int map_request(struct dm_rq_target_io *tio, struct request *rq, - struct mapped_device *md) +static int map_request(struct dm_rq_target_io *tio) { int r; struct dm_target *ti = tio->ti; + struct mapped_device *md = tio->md; + struct request *rq = tio->orig; struct request *clone = NULL; if (tio->clone) { clone = tio->clone; r = ti->type->map_rq(ti, clone, &tio->info); + if (r == DM_MAPIO_DELAY_REQUEUE) + return DM_MAPIO_REQUEUE; /* .request_fn requeue is always immediate */ } else { r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone); if (r < 0) { @@ -633,9 +664,8 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq, dm_kill_unmapped_request(rq, r); return r; } - if (r != DM_MAPIO_REMAPPED) - return r; - if (setup_clone(clone, rq, tio, GFP_ATOMIC)) { + if (r == DM_MAPIO_REMAPPED && + setup_clone(clone, rq, tio, GFP_ATOMIC)) { /* -ENOMEM */ ti->type->release_clone_rq(clone); return DM_MAPIO_REQUEUE; @@ -654,7 +684,10 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq, break; case DM_MAPIO_REQUEUE: /* The target wants to requeue the I/O */ - dm_requeue_original_request(md, tio->orig); + break; + case DM_MAPIO_DELAY_REQUEUE: + /* The target wants to requeue the I/O after a delay */ + dm_requeue_original_request(tio, true); break; default: if (r > 0) { @@ -664,10 +697,9 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq, /* The target wants to complete the I/O */ dm_kill_unmapped_request(rq, r); - return r; } - return 0; + return r; } static void dm_start_request(struct mapped_device *md, struct request *orig) @@ -706,11 +738,9 @@ static void dm_start_request(struct mapped_device *md, struct request *orig) static void map_tio_request(struct kthread_work *work) { struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work); - struct request *rq = tio->orig; - struct mapped_device *md = tio->md; - if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) - dm_requeue_original_request(md, rq); + if (map_request(tio) == DM_MAPIO_REQUEUE) + dm_requeue_original_request(tio, false); } ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf) @@ -896,7 +926,7 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, tio->ti = ti; /* Direct call is fine since .queue_rq allows allocations */ - if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) { + if (map_request(tio) == DM_MAPIO_REQUEUE) { /* Undo dm_start_request() before requeuing */ rq_end_stats(md, rq); rq_completed(md, rq_data_dir(rq), false); @@ -954,7 +984,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) dm_init_md_queue(md); /* backfill 'mq' sysfs registration normally done in blk_register_queue */ - blk_mq_register_disk(md->disk); 
+ blk_mq_register_dev(disk_to_dev(md->disk), q); return 0; diff --git a/drivers/md/dm-rq.h b/drivers/md/dm-rq.h index 9e6f0a3773d4..4da06cae7bad 100644 --- a/drivers/md/dm-rq.h +++ b/drivers/md/dm-rq.h @@ -55,6 +55,8 @@ void dm_mq_cleanup_mapped_device(struct mapped_device *md); void dm_start_queue(struct request_queue *q); void dm_stop_queue(struct request_queue *q); +void dm_mq_kick_requeue_list(struct mapped_device *md); + unsigned dm_get_reserved_rq_based_ios(void); ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index fa9b1cb4438a..be35258324c1 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1648,6 +1648,8 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, struct request_queue *q = md->queue; sector_t size; + lockdep_assert_held(&md->suspend_lock); + size = dm_table_get_size(t); /* @@ -1873,6 +1875,7 @@ EXPORT_SYMBOL_GPL(dm_device_name); static void __dm_destroy(struct mapped_device *md, bool wait) { + struct request_queue *q = dm_get_md_queue(md); struct dm_table *map; int srcu_idx; @@ -1883,6 +1886,10 @@ static void __dm_destroy(struct mapped_device *md, bool wait) set_bit(DMF_FREEING, &md->flags); spin_unlock(&_minor_lock); + spin_lock_irq(q->queue_lock); + queue_flag_set(QUEUE_FLAG_DYING, q); + spin_unlock_irq(q->queue_lock); + if (dm_request_based(md) && md->kworker_task) flush_kthread_worker(&md->kworker); @@ -1934,30 +1941,25 @@ void dm_put(struct mapped_device *md) } EXPORT_SYMBOL_GPL(dm_put); -static int dm_wait_for_completion(struct mapped_device *md, int interruptible) +static int dm_wait_for_completion(struct mapped_device *md, long task_state) { int r = 0; - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue(&md->wait, &wait); + DEFINE_WAIT(wait); while (1) { - set_current_state(interruptible); + prepare_to_wait(&md->wait, &wait, task_state); if (!md_in_flight(md)) break; - if (interruptible == TASK_INTERRUPTIBLE && - signal_pending(current)) { + if (signal_pending_state(task_state, current)) { r = -EINTR; break; } io_schedule(); } - set_current_state(TASK_RUNNING); - - remove_wait_queue(&md->wait, &wait); + finish_wait(&md->wait, &wait); return r; } @@ -2075,6 +2077,10 @@ static void unlock_fs(struct mapped_device *md) } /* + * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG + * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE + * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY + * * If __dm_suspend returns 0, the device is completely quiescent * now. There is no request-processing activity. All new requests * are being added to md->deferred list. @@ -2082,13 +2088,15 @@ static void unlock_fs(struct mapped_device *md) * Caller must hold md->suspend_lock */ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, - unsigned suspend_flags, int interruptible, + unsigned suspend_flags, long task_state, int dmf_suspended_flag) { bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG; bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG; int r; + lockdep_assert_held(&md->suspend_lock); + /* * DMF_NOFLUSH_SUSPENDING must be set before presuspend. * This flag is cleared before dm_suspend returns. @@ -2149,7 +2157,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, * We call dm_wait_for_completion to wait for all existing requests * to finish. 
*/ - r = dm_wait_for_completion(md, interruptible); + r = dm_wait_for_completion(md, task_state); if (!r) set_bit(dmf_suspended_flag, &md->flags); @@ -2249,10 +2257,11 @@ static int __dm_resume(struct mapped_device *md, struct dm_table *map) int dm_resume(struct mapped_device *md) { - int r = -EINVAL; + int r; struct dm_table *map = NULL; retry: + r = -EINVAL; mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); if (!dm_suspended_md(md)) @@ -2276,8 +2285,6 @@ retry: goto out; clear_bit(DMF_SUSPENDED, &md->flags); - - r = 0; out: mutex_unlock(&md->suspend_lock); diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 34a840d9df76..2b13117fb918 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -10,6 +10,7 @@ #include <linux/module.h> +#include <linux/kthread.h> #include <linux/dlm.h> #include <linux/sched.h> #include <linux/raid/md_p.h> @@ -25,7 +26,8 @@ struct dlm_lock_resource { struct dlm_lksb lksb; char *name; /* lock name. */ uint32_t flags; /* flags to pass to dlm_lock() */ - struct completion completion; /* completion for synchronized locking */ + wait_queue_head_t sync_locking; /* wait queue for synchronized locking */ + bool sync_locking_done; void (*bast)(void *arg, int mode); /* blocking AST function pointer*/ struct mddev *mddev; /* pointing back to mddev. */ int mode; @@ -118,7 +120,8 @@ static void sync_ast(void *arg) struct dlm_lock_resource *res; res = arg; - complete(&res->completion); + res->sync_locking_done = true; + wake_up(&res->sync_locking); } static int dlm_lock_sync(struct dlm_lock_resource *res, int mode) @@ -130,7 +133,8 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode) 0, sync_ast, res, res->bast); if (ret) return ret; - wait_for_completion(&res->completion); + wait_event(res->sync_locking, res->sync_locking_done); + res->sync_locking_done = false; if (res->lksb.sb_status == 0) res->mode = mode; return res->lksb.sb_status; @@ -141,6 +145,44 @@ static int dlm_unlock_sync(struct dlm_lock_resource *res) return dlm_lock_sync(res, DLM_LOCK_NL); } +/* + * An variation of dlm_lock_sync, which make lock request could + * be interrupted + */ +static int dlm_lock_sync_interruptible(struct dlm_lock_resource *res, int mode, + struct mddev *mddev) +{ + int ret = 0; + + ret = dlm_lock(res->ls, mode, &res->lksb, + res->flags, res->name, strlen(res->name), + 0, sync_ast, res, res->bast); + if (ret) + return ret; + + wait_event(res->sync_locking, res->sync_locking_done + || kthread_should_stop() + || test_bit(MD_CLOSING, &mddev->flags)); + if (!res->sync_locking_done) { + /* + * the convert queue contains the lock request when request is + * interrupted, and sync_ast could still be run, so need to + * cancel the request and reset completion + */ + ret = dlm_unlock(res->ls, res->lksb.sb_lkid, DLM_LKF_CANCEL, + &res->lksb, res); + res->sync_locking_done = false; + if (unlikely(ret != 0)) + pr_info("failed to cancel previous lock request " + "%s return %d\n", res->name, ret); + return -EPERM; + } else + res->sync_locking_done = false; + if (res->lksb.sb_status == 0) + res->mode = mode; + return res->lksb.sb_status; +} + static struct dlm_lock_resource *lockres_init(struct mddev *mddev, char *name, void (*bastfn)(void *arg, int mode), int with_lvb) { @@ -151,7 +193,8 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev, res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL); if (!res) return NULL; - init_completion(&res->completion); + init_waitqueue_head(&res->sync_locking); + res->sync_locking_done = 
false; res->ls = cinfo->lockspace; res->mddev = mddev; res->mode = DLM_LOCK_IV; @@ -194,25 +237,21 @@ out_err: static void lockres_free(struct dlm_lock_resource *res) { - int ret; + int ret = 0; if (!res) return; - /* cancel a lock request or a conversion request that is blocked */ - res->flags |= DLM_LKF_CANCEL; -retry: - ret = dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res); - if (unlikely(ret != 0)) { - pr_info("%s: failed to unlock %s return %d\n", __func__, res->name, ret); - - /* if a lock conversion is cancelled, then the lock is put - * back to grant queue, need to ensure it is unlocked */ - if (ret == -DLM_ECANCEL) - goto retry; - } - res->flags &= ~DLM_LKF_CANCEL; - wait_for_completion(&res->completion); + /* + * use FORCEUNLOCK flag, so we can unlock even the lock is on the + * waiting or convert queue + */ + ret = dlm_unlock(res->ls, res->lksb.sb_lkid, DLM_LKF_FORCEUNLOCK, + &res->lksb, res); + if (unlikely(ret != 0)) + pr_err("failed to unlock %s return %d\n", res->name, ret); + else + wait_event(res->sync_locking, res->sync_locking_done); kfree(res->name); kfree(res->lksb.sb_lvbptr); @@ -279,7 +318,7 @@ static void recover_bitmaps(struct md_thread *thread) goto clear_bit; } - ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); + ret = dlm_lock_sync_interruptible(bm_lockres, DLM_LOCK_PW, mddev); if (ret) { pr_err("md-cluster: Could not DLM lock %s: %d\n", str, ret); @@ -288,7 +327,7 @@ static void recover_bitmaps(struct md_thread *thread) ret = bitmap_copy_from_slot(mddev, slot, &lo, &hi, true); if (ret) { pr_err("md-cluster: Could not copy data from bitmap %d\n", slot); - goto dlm_unlock; + goto clear_bit; } if (hi > 0) { if (lo < mddev->recovery_cp) @@ -300,8 +339,6 @@ static void recover_bitmaps(struct md_thread *thread) md_wakeup_thread(mddev->thread); } } -dlm_unlock: - dlm_unlock_sync(bm_lockres); clear_bit: lockres_free(bm_lockres); clear_bit(slot, &cinfo->recovery_map); @@ -495,9 +532,10 @@ static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg) { - struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, - le32_to_cpu(msg->raid_slot)); + struct md_rdev *rdev; + rcu_read_lock(); + rdev = md_find_rdev_nr_rcu(mddev, le32_to_cpu(msg->raid_slot)); if (rdev) { set_bit(ClusterRemove, &rdev->flags); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -506,18 +544,21 @@ static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg) else pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, le32_to_cpu(msg->raid_slot)); + rcu_read_unlock(); } static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg) { - struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, - le32_to_cpu(msg->raid_slot)); + struct md_rdev *rdev; + rcu_read_lock(); + rdev = md_find_rdev_nr_rcu(mddev, le32_to_cpu(msg->raid_slot)); if (rdev && test_bit(Faulty, &rdev->flags)) clear_bit(Faulty, &rdev->flags); else pr_warn("%s: %d Could not find disk(%d) which is faulty", __func__, __LINE__, le32_to_cpu(msg->raid_slot)); + rcu_read_unlock(); } static int process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) @@ -770,7 +811,6 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots) md_check_recovery(mddev); } - dlm_unlock_sync(bm_lockres); lockres_free(bm_lockres); } out: @@ -1006,7 +1046,7 @@ static void metadata_update_cancel(struct mddev *mddev) static int resync_start(struct mddev *mddev) { struct md_cluster_info *cinfo = mddev->cluster_info; 
- return dlm_lock_sync(cinfo->resync_lockres, DLM_LOCK_EX); + return dlm_lock_sync_interruptible(cinfo->resync_lockres, DLM_LOCK_EX, mddev); } static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) @@ -1186,7 +1226,6 @@ static void unlock_all_bitmaps(struct mddev *mddev) if (cinfo->other_bitmap_lockres) { for (i = 0; i < mddev->bitmap_info.nodes - 1; i++) { if (cinfo->other_bitmap_lockres[i]) { - dlm_unlock_sync(cinfo->other_bitmap_lockres[i]); lockres_free(cinfo->other_bitmap_lockres[i]); } } diff --git a/drivers/md/md.c b/drivers/md/md.c index 67642bacd597..eac84d8ff724 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5297,6 +5297,21 @@ int md_run(struct mddev *mddev) return err; } if (mddev->queue) { + bool nonrot = true; + + rdev_for_each(rdev, mddev) { + if (rdev->raid_disk >= 0 && + !blk_queue_nonrot(bdev_get_queue(rdev->bdev))) { + nonrot = false; + break; + } + } + if (mddev->degraded) + nonrot = false; + if (nonrot) + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mddev->queue); + else + queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, mddev->queue); mddev->queue->backing_dev_info.congested_data = mddev; mddev->queue->backing_dev_info.congested_fn = md_congested; } @@ -5454,12 +5469,14 @@ static void md_clean(struct mddev *mddev) mddev->degraded = 0; mddev->safemode = 0; mddev->private = NULL; + mddev->cluster_info = NULL; mddev->bitmap_info.offset = 0; mddev->bitmap_info.default_offset = 0; mddev->bitmap_info.default_space = 0; mddev->bitmap_info.chunksize = 0; mddev->bitmap_info.daemon_sleep = 0; mddev->bitmap_info.max_write_behind = 0; + mddev->bitmap_info.nodes = 0; } static void __md_stop_writes(struct mddev *mddev) @@ -5573,8 +5590,7 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) mutex_lock(&mddev->open_mutex); if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || mddev->sync_thread || - test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || - (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { printk("md: %s still in use.\n",mdname(mddev)); if (did_freeze) { clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); @@ -5636,8 +5652,7 @@ static int do_md_stop(struct mddev *mddev, int mode, if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || mddev->sysfs_active || mddev->sync_thread || - test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || - (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { printk("md: %s still in use.\n",mdname(mddev)); mutex_unlock(&mddev->open_mutex); if (did_freeze) { @@ -6101,9 +6116,14 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) export_rdev(rdev); if (mddev_is_clustered(mddev)) { - if (info->state & (1 << MD_DISK_CANDIDATE)) - md_cluster_ops->new_disk_ack(mddev, (err == 0)); - else { + if (info->state & (1 << MD_DISK_CANDIDATE)) { + if (!err) { + err = md_cluster_ops->new_disk_ack(mddev, + err == 0); + if (err) + md_kick_rdev_from_array(rdev); + } + } else { if (err) md_cluster_ops->add_new_disk_cancel(mddev); else @@ -6821,7 +6841,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode, err = -EBUSY; goto out; } - set_bit(MD_STILL_CLOSED, &mddev->flags); + set_bit(MD_CLOSING, &mddev->flags); mutex_unlock(&mddev->open_mutex); sync_blockdev(bdev); } @@ -7070,9 +7090,13 @@ static int md_open(struct block_device *bdev, fmode_t mode) if ((err = mutex_lock_interruptible(&mddev->open_mutex))) goto out; + if (test_bit(MD_CLOSING, &mddev->flags)) { + 
mutex_unlock(&mddev->open_mutex); + return -ENODEV; + } + err = 0; atomic_inc(&mddev->openers); - clear_bit(MD_STILL_CLOSED, &mddev->flags); mutex_unlock(&mddev->open_mutex); check_disk_change(bdev); @@ -7610,16 +7634,12 @@ EXPORT_SYMBOL(unregister_md_cluster_operations); int md_setup_cluster(struct mddev *mddev, int nodes) { - int err; - - err = request_module("md-cluster"); - if (err) { - pr_err("md-cluster module not found.\n"); - return -ENOENT; - } - + if (!md_cluster_ops) + request_module("md-cluster"); spin_lock(&pers_lock); + /* ensure module won't be unloaded */ if (!md_cluster_ops || !try_module_get(md_cluster_mod)) { + pr_err("can't find md-cluster module or get it's reference.\n"); spin_unlock(&pers_lock); return -ENOENT; } @@ -8877,7 +8897,9 @@ static void autostart_arrays(int part) list_del(&node_detected_dev->list); dev = node_detected_dev->dev; kfree(node_detected_dev); + mutex_unlock(&detected_devices_mutex); rdev = md_import_device(dev,0, 90); + mutex_lock(&detected_devices_mutex); if (IS_ERR(rdev)) continue; diff --git a/drivers/md/md.h b/drivers/md/md.h index 20c667579ede..2b2041773e79 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -201,9 +201,8 @@ struct mddev { #define MD_CHANGE_PENDING 2 /* switch from 'clean' to 'active' in progress */ #define MD_UPDATE_SB_FLAGS (1 | 2 | 4) /* If these are set, md_update_sb needed */ #define MD_ARRAY_FIRST_USE 3 /* First use of array, needs initialization */ -#define MD_STILL_CLOSED 4 /* If set, then array has not been opened since - * md_ioctl checked on it. - */ +#define MD_CLOSING 4 /* If set, we are closing the array, do not open + * it then */ #define MD_JOURNAL_CLEAN 5 /* A raid with journal is already clean */ #define MD_HAS_JOURNAL 6 /* The raid array has journal feature set */ #define MD_RELOAD_SB 7 /* Reload the superblock because another node diff --git a/drivers/md/persistent-data/dm-array.c b/drivers/md/persistent-data/dm-array.c index 431a03067d64..e83047cbb2da 100644 --- a/drivers/md/persistent-data/dm-array.c +++ b/drivers/md/persistent-data/dm-array.c @@ -277,6 +277,48 @@ static int insert_ablock(struct dm_array_info *info, uint64_t index, return dm_btree_insert(&info->btree_info, *root, &index, &block_le, root); } +/*----------------------------------------------------------------*/ + +static int __shadow_ablock(struct dm_array_info *info, dm_block_t b, + struct dm_block **block, struct array_block **ab) +{ + int inc; + int r = dm_tm_shadow_block(info->btree_info.tm, b, + &array_validator, block, &inc); + if (r) + return r; + + *ab = dm_block_data(*block); + if (inc) + inc_ablock_entries(info, *ab); + + return 0; +} + +/* + * The shadow op will often be a noop. Only insert if it really + * copied data. + */ +static int __reinsert_ablock(struct dm_array_info *info, unsigned index, + struct dm_block *block, dm_block_t b, + dm_block_t *root) +{ + int r = 0; + + if (dm_block_location(block) != b) { + /* + * dm_tm_shadow_block will have already decremented the old + * block, but it is still referenced by the btree. We + * increment to stop the insert decrementing it below zero + * when overwriting the old value. + */ + dm_tm_inc(info->btree_info.tm, b); + r = insert_ablock(info, index, block, root); + } + + return r; +} + /* * Looks up an array block in the btree. Then shadows it, and updates the * btree to point to this new shadow. 
'root' is an input/output parameter @@ -286,49 +328,21 @@ static int shadow_ablock(struct dm_array_info *info, dm_block_t *root, unsigned index, struct dm_block **block, struct array_block **ab) { - int r, inc; + int r; uint64_t key = index; dm_block_t b; __le64 block_le; - /* - * lookup - */ r = dm_btree_lookup(&info->btree_info, *root, &key, &block_le); if (r) return r; b = le64_to_cpu(block_le); - /* - * shadow - */ - r = dm_tm_shadow_block(info->btree_info.tm, b, - &array_validator, block, &inc); + r = __shadow_ablock(info, b, block, ab); if (r) return r; - *ab = dm_block_data(*block); - if (inc) - inc_ablock_entries(info, *ab); - - /* - * Reinsert. - * - * The shadow op will often be a noop. Only insert if it really - * copied data. - */ - if (dm_block_location(*block) != b) { - /* - * dm_tm_shadow_block will have already decremented the old - * block, but it is still referenced by the btree. We - * increment to stop the insert decrementing it below zero - * when overwriting the old value. - */ - dm_tm_inc(info->btree_info.tm, b); - r = insert_ablock(info, index, *block, root); - } - - return r; + return __reinsert_ablock(info, index, *block, b, root); } /* @@ -681,6 +695,72 @@ int dm_array_resize(struct dm_array_info *info, dm_block_t root, } EXPORT_SYMBOL_GPL(dm_array_resize); +static int populate_ablock_with_values(struct dm_array_info *info, struct array_block *ab, + value_fn fn, void *context, unsigned base, unsigned new_nr) +{ + int r; + unsigned i; + uint32_t nr_entries; + struct dm_btree_value_type *vt = &info->value_type; + + BUG_ON(le32_to_cpu(ab->nr_entries)); + BUG_ON(new_nr > le32_to_cpu(ab->max_entries)); + + nr_entries = le32_to_cpu(ab->nr_entries); + for (i = 0; i < new_nr; i++) { + r = fn(base + i, element_at(info, ab, i), context); + if (r) + return r; + + if (vt->inc) + vt->inc(vt->context, element_at(info, ab, i)); + } + + ab->nr_entries = cpu_to_le32(new_nr); + return 0; +} + +int dm_array_new(struct dm_array_info *info, dm_block_t *root, + uint32_t size, value_fn fn, void *context) +{ + int r; + struct dm_block *block; + struct array_block *ab; + unsigned block_index, end_block, size_of_block, max_entries; + + r = dm_array_empty(info, root); + if (r) + return r; + + size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm)); + max_entries = calc_max_entries(info->value_type.size, size_of_block); + end_block = dm_div_up(size, max_entries); + + for (block_index = 0; block_index != end_block; block_index++) { + r = alloc_ablock(info, size_of_block, max_entries, &block, &ab); + if (r) + break; + + r = populate_ablock_with_values(info, ab, fn, context, + block_index * max_entries, + min(max_entries, size)); + if (r) { + unlock_ablock(info, block); + break; + } + + r = insert_ablock(info, block_index, block, root); + unlock_ablock(info, block); + if (r) + break; + + size -= max_entries; + } + + return r; +} +EXPORT_SYMBOL_GPL(dm_array_new); + int dm_array_del(struct dm_array_info *info, dm_block_t root) { return dm_btree_del(&info->btree_info, root); @@ -819,3 +899,89 @@ int dm_array_walk(struct dm_array_info *info, dm_block_t root, EXPORT_SYMBOL_GPL(dm_array_walk); /*----------------------------------------------------------------*/ + +static int load_ablock(struct dm_array_cursor *c) +{ + int r; + __le64 value_le; + uint64_t key; + + if (c->block) + unlock_ablock(c->info, c->block); + + c->block = NULL; + c->ab = NULL; + c->index = 0; + + r = dm_btree_cursor_get_value(&c->cursor, &key, &value_le); + if (r) { + DMERR("dm_btree_cursor_get_value failed"); + 
dm_btree_cursor_end(&c->cursor); + + } else { + r = get_ablock(c->info, le64_to_cpu(value_le), &c->block, &c->ab); + if (r) { + DMERR("get_ablock failed"); + dm_btree_cursor_end(&c->cursor); + } + } + + return r; +} + +int dm_array_cursor_begin(struct dm_array_info *info, dm_block_t root, + struct dm_array_cursor *c) +{ + int r; + + memset(c, 0, sizeof(*c)); + c->info = info; + r = dm_btree_cursor_begin(&info->btree_info, root, true, &c->cursor); + if (r) { + DMERR("couldn't create btree cursor"); + return r; + } + + return load_ablock(c); +} +EXPORT_SYMBOL_GPL(dm_array_cursor_begin); + +void dm_array_cursor_end(struct dm_array_cursor *c) +{ + if (c->block) { + unlock_ablock(c->info, c->block); + dm_btree_cursor_end(&c->cursor); + } +} +EXPORT_SYMBOL_GPL(dm_array_cursor_end); + +int dm_array_cursor_next(struct dm_array_cursor *c) +{ + int r; + + if (!c->block) + return -ENODATA; + + c->index++; + + if (c->index >= le32_to_cpu(c->ab->nr_entries)) { + r = dm_btree_cursor_next(&c->cursor); + if (r) + return r; + + r = load_ablock(c); + if (r) + return r; + } + + return 0; +} +EXPORT_SYMBOL_GPL(dm_array_cursor_next); + +void dm_array_cursor_get_value(struct dm_array_cursor *c, void **value_le) +{ + *value_le = element_at(c->info, c->ab, c->index); +} +EXPORT_SYMBOL_GPL(dm_array_cursor_get_value); + +/*----------------------------------------------------------------*/ diff --git a/drivers/md/persistent-data/dm-array.h b/drivers/md/persistent-data/dm-array.h index ea177d6fa58f..27ee49a55473 100644 --- a/drivers/md/persistent-data/dm-array.h +++ b/drivers/md/persistent-data/dm-array.h @@ -112,6 +112,25 @@ int dm_array_resize(struct dm_array_info *info, dm_block_t root, __dm_written_to_disk(value); /* + * Creates a new array populated with values provided by a callback + * function. This is more efficient than creating an empty array, + * resizing, and then setting values since that process incurs a lot of + * copying. + * + * Assumes 32bit values for now since it's only used by the cache hint + * array. + * + * info - describes the array + * root - the root block of the array on disk + * size - the number of entries in the array + * fn - the callback + * context - passed to the callback + */ +typedef int (*value_fn)(uint32_t index, void *value_le, void *context); +int dm_array_new(struct dm_array_info *info, dm_block_t *root, + uint32_t size, value_fn fn, void *context); + +/* * Frees a whole array. The value_type's decrement operation will be called * for all values in the array */ @@ -163,4 +182,37 @@ int dm_array_walk(struct dm_array_info *info, dm_block_t root, /*----------------------------------------------------------------*/ +/* + * Cursor api. + * + * This lets you iterate through all the entries in an array efficiently + * (it will preload metadata). + * + * I'm using a cursor, rather than a walk function with a callback because + * the cache target needs to iterate both the mapping and hint arrays in + * unison. + */ +struct dm_array_cursor { + struct dm_array_info *info; + struct dm_btree_cursor cursor; + + struct dm_block *block; + struct array_block *ab; + unsigned index; +}; + +int dm_array_cursor_begin(struct dm_array_info *info, + dm_block_t root, struct dm_array_cursor *c); +void dm_array_cursor_end(struct dm_array_cursor *c); + +uint32_t dm_array_cursor_index(struct dm_array_cursor *c); +int dm_array_cursor_next(struct dm_array_cursor *c); + +/* + * value_le is only valid while the cursor points at the current value. 
+ */ +void dm_array_cursor_get_value(struct dm_array_cursor *c, void **value_le); + +/*----------------------------------------------------------------*/ + #endif /* _LINUX_DM_ARRAY_H */ diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index 2cc1877804c2..20a40329d84a 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c @@ -994,3 +994,165 @@ int dm_btree_walk(struct dm_btree_info *info, dm_block_t root, return walk_node(info, root, fn, context); } EXPORT_SYMBOL_GPL(dm_btree_walk); + +/*----------------------------------------------------------------*/ + +static void prefetch_values(struct dm_btree_cursor *c) +{ + unsigned i, nr; + __le64 value_le; + struct cursor_node *n = c->nodes + c->depth - 1; + struct btree_node *bn = dm_block_data(n->b); + struct dm_block_manager *bm = dm_tm_get_bm(c->info->tm); + + BUG_ON(c->info->value_type.size != sizeof(value_le)); + + nr = le32_to_cpu(bn->header.nr_entries); + for (i = 0; i < nr; i++) { + memcpy(&value_le, value_ptr(bn, i), sizeof(value_le)); + dm_bm_prefetch(bm, le64_to_cpu(value_le)); + } +} + +static bool leaf_node(struct dm_btree_cursor *c) +{ + struct cursor_node *n = c->nodes + c->depth - 1; + struct btree_node *bn = dm_block_data(n->b); + + return le32_to_cpu(bn->header.flags) & LEAF_NODE; +} + +static int push_node(struct dm_btree_cursor *c, dm_block_t b) +{ + int r; + struct cursor_node *n = c->nodes + c->depth; + + if (c->depth >= DM_BTREE_CURSOR_MAX_DEPTH - 1) { + DMERR("couldn't push cursor node, stack depth too high"); + return -EINVAL; + } + + r = bn_read_lock(c->info, b, &n->b); + if (r) + return r; + + n->index = 0; + c->depth++; + + if (c->prefetch_leaves || !leaf_node(c)) + prefetch_values(c); + + return 0; +} + +static void pop_node(struct dm_btree_cursor *c) +{ + c->depth--; + unlock_block(c->info, c->nodes[c->depth].b); +} + +static int inc_or_backtrack(struct dm_btree_cursor *c) +{ + struct cursor_node *n; + struct btree_node *bn; + + for (;;) { + if (!c->depth) + return -ENODATA; + + n = c->nodes + c->depth - 1; + bn = dm_block_data(n->b); + + n->index++; + if (n->index < le32_to_cpu(bn->header.nr_entries)) + break; + + pop_node(c); + } + + return 0; +} + +static int find_leaf(struct dm_btree_cursor *c) +{ + int r = 0; + struct cursor_node *n; + struct btree_node *bn; + __le64 value_le; + + for (;;) { + n = c->nodes + c->depth - 1; + bn = dm_block_data(n->b); + + if (le32_to_cpu(bn->header.flags) & LEAF_NODE) + break; + + memcpy(&value_le, value_ptr(bn, n->index), sizeof(value_le)); + r = push_node(c, le64_to_cpu(value_le)); + if (r) { + DMERR("push_node failed"); + break; + } + } + + if (!r && (le32_to_cpu(bn->header.nr_entries) == 0)) + return -ENODATA; + + return r; +} + +int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root, + bool prefetch_leaves, struct dm_btree_cursor *c) +{ + int r; + + c->info = info; + c->root = root; + c->depth = 0; + c->prefetch_leaves = prefetch_leaves; + + r = push_node(c, root); + if (r) + return r; + + return find_leaf(c); +} +EXPORT_SYMBOL_GPL(dm_btree_cursor_begin); + +void dm_btree_cursor_end(struct dm_btree_cursor *c) +{ + while (c->depth) + pop_node(c); +} +EXPORT_SYMBOL_GPL(dm_btree_cursor_end); + +int dm_btree_cursor_next(struct dm_btree_cursor *c) +{ + int r = inc_or_backtrack(c); + if (!r) { + r = find_leaf(c); + if (r) + DMERR("find_leaf failed"); + } + + return r; +} +EXPORT_SYMBOL_GPL(dm_btree_cursor_next); + +int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, 
void *value_le) +{ + if (c->depth) { + struct cursor_node *n = c->nodes + c->depth - 1; + struct btree_node *bn = dm_block_data(n->b); + + if (le32_to_cpu(bn->header.flags) & INTERNAL_NODE) + return -EINVAL; + + *key = le64_to_cpu(*key_ptr(bn, n->index)); + memcpy(value_le, value_ptr(bn, n->index), c->info->value_type.size); + return 0; + + } else + return -ENODATA; +} +EXPORT_SYMBOL_GPL(dm_btree_cursor_get_value); diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h index c74301fa5a37..db9bd26adf31 100644 --- a/drivers/md/persistent-data/dm-btree.h +++ b/drivers/md/persistent-data/dm-btree.h @@ -176,4 +176,39 @@ int dm_btree_walk(struct dm_btree_info *info, dm_block_t root, int (*fn)(void *context, uint64_t *keys, void *leaf), void *context); + +/*----------------------------------------------------------------*/ + +/* + * Cursor API. This does not follow the rolling lock convention. Since we + * know the order that values are required we can issue prefetches to speed + * up iteration. Use on a single level btree only. + */ +#define DM_BTREE_CURSOR_MAX_DEPTH 16 + +struct cursor_node { + struct dm_block *b; + unsigned index; +}; + +struct dm_btree_cursor { + struct dm_btree_info *info; + dm_block_t root; + + bool prefetch_leaves; + unsigned depth; + struct cursor_node nodes[DM_BTREE_CURSOR_MAX_DEPTH]; +}; + +/* + * Creates a fresh cursor. If prefetch_leaves is set then it is assumed + * the btree contains block indexes that will be prefetched. The cursor is + * quite large, so you probably don't want to put it on the stack. + */ +int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root, + bool prefetch_leaves, struct dm_btree_cursor *c); +void dm_btree_cursor_end(struct dm_btree_cursor *c); +int dm_btree_cursor_next(struct dm_btree_cursor *c); +int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le); + #endif /* _LINUX_DM_BTREE_H */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 21dc00eb1989..1961d827dbd1 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -145,12 +145,8 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) return r1_bio; out_free_pages: - while (--j >= 0) { - struct bio_vec *bv; - - bio_for_each_segment_all(bv, r1_bio->bios[j], i) - __free_page(bv->bv_page); - } + while (--j >= 0) + bio_free_pages(r1_bio->bios[j]); out_free_bio: while (++j < pi->raid_disks) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 51f76ddbe265..1b1ab4a1d132 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -96,7 +96,6 @@ struct r5l_log { spinlock_t no_space_stripes_lock; bool need_cache_flush; - bool in_teardown; }; /* @@ -704,31 +703,22 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log, mddev = log->rdev->mddev; /* - * This is to avoid a deadlock. r5l_quiesce holds reconfig_mutex and - * wait for this thread to finish. This thread waits for - * MD_CHANGE_PENDING clear, which is supposed to be done in - * md_check_recovery(). md_check_recovery() tries to get - * reconfig_mutex. Since r5l_quiesce already holds the mutex, - * md_check_recovery() fails, so the PENDING never get cleared. The - * in_teardown check workaround this issue. + * Discard could zero data, so before discard we must make sure + * superblock is updated to new log tail. Updating superblock (either + * directly call md_update_sb() or depend on md thread) must hold + * reconfig mutex. On the other hand, raid5_quiesce is called with + * reconfig_mutex hold. 
The first step of raid5_quiesce() is waitting + * for all IO finish, hence waitting for reclaim thread, while reclaim + * thread is calling this function and waitting for reconfig mutex. So + * there is a deadlock. We workaround this issue with a trylock. + * FIXME: we could miss discard if we can't take reconfig mutex */ - if (!log->in_teardown) { - set_mask_bits(&mddev->flags, 0, - BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); - md_wakeup_thread(mddev->thread); - wait_event(mddev->sb_wait, - !test_bit(MD_CHANGE_PENDING, &mddev->flags) || - log->in_teardown); - /* - * r5l_quiesce could run after in_teardown check and hold - * mutex first. Superblock might get updated twice. - */ - if (log->in_teardown) - md_update_sb(mddev, 1); - } else { - WARN_ON(!mddev_is_locked(mddev)); - md_update_sb(mddev, 1); - } + set_mask_bits(&mddev->flags, 0, + BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING)); + if (!mddev_trylock(mddev)) + return; + md_update_sb(mddev, 1); + mddev_unlock(mddev); /* discard IO error really doesn't matter, ignore it */ if (log->last_checkpoint < end) { @@ -827,7 +817,6 @@ void r5l_quiesce(struct r5l_log *log, int state) if (!log || state == 2) return; if (state == 0) { - log->in_teardown = 0; /* * This is a special case for hotadd. In suspend, the array has * no journal. In resume, journal is initialized as well as the @@ -838,11 +827,6 @@ void r5l_quiesce(struct r5l_log *log, int state) log->reclaim_thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev, "reclaim"); } else if (state == 1) { - /* - * at this point all stripes are finished, so io_unit is at - * least in STRIPE_END state - */ - log->in_teardown = 1; /* make sure r5l_write_super_and_discard_space exits */ mddev = log->rdev->mddev; wake_up(&mddev->sb_wait); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index da583bb43c84..92ac251e91e6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2423,10 +2423,10 @@ static void raid5_end_read_request(struct bio * bi) } } rdev_dec_pending(rdev, conf->mddev); + bio_reset(bi); clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); raid5_release_stripe(sh); - bio_reset(bi); } static void raid5_end_write_request(struct bio *bi) @@ -2498,6 +2498,7 @@ static void raid5_end_write_request(struct bio *bi) if (sh->batch_head && bi->bi_error && !replacement) set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state); + bio_reset(bi); if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) clear_bit(R5_LOCKED, &sh->dev[i].flags); set_bit(STRIPE_HANDLE, &sh->state); @@ -2505,7 +2506,6 @@ static void raid5_end_write_request(struct bio *bi) if (sh->batch_head && sh != sh->batch_head) raid5_release_stripe(sh->batch_head); - bio_reset(bi); } static void raid5_build_block(struct stripe_head *sh, int i, int previous) @@ -6349,22 +6349,20 @@ static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu return 0; } -static void raid5_free_percpu(struct r5conf *conf) +static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node) { - unsigned long cpu; + struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); + free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); + return 0; +} + +static void raid5_free_percpu(struct r5conf *conf) +{ if (!conf->percpu) return; -#ifdef CONFIG_HOTPLUG_CPU - unregister_cpu_notifier(&conf->cpu_notify); -#endif - - get_online_cpus(); - for_each_possible_cpu(cpu) - free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); - put_online_cpus(); - + 
cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); free_percpu(conf->percpu); } @@ -6372,7 +6370,7 @@ static void free_conf(struct r5conf *conf) { if (conf->log) r5l_exit_log(conf->log); - if (conf->shrinker.seeks) + if (conf->shrinker.nr_deferred) unregister_shrinker(&conf->shrinker); free_thread_groups(conf); @@ -6383,64 +6381,28 @@ static void free_conf(struct r5conf *conf) kfree(conf); } -#ifdef CONFIG_HOTPLUG_CPU -static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, - void *hcpu) +static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) { - struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify); - long cpu = (long)hcpu; + struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node); struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - if (alloc_scratch_buffer(conf, percpu)) { - pr_err("%s: failed memory allocation for cpu%ld\n", - __func__, cpu); - return notifier_from_errno(-ENOMEM); - } - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); - break; - default: - break; + if (alloc_scratch_buffer(conf, percpu)) { + pr_err("%s: failed memory allocation for cpu%u\n", + __func__, cpu); + return -ENOMEM; } - return NOTIFY_OK; + return 0; } -#endif static int raid5_alloc_percpu(struct r5conf *conf) { - unsigned long cpu; int err = 0; conf->percpu = alloc_percpu(struct raid5_percpu); if (!conf->percpu) return -ENOMEM; -#ifdef CONFIG_HOTPLUG_CPU - conf->cpu_notify.notifier_call = raid456_cpu_notify; - conf->cpu_notify.priority = 0; - err = register_cpu_notifier(&conf->cpu_notify); - if (err) - return err; -#endif - - get_online_cpus(); - for_each_present_cpu(cpu) { - err = alloc_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu)); - if (err) { - pr_err("%s: failed memory allocation for cpu%ld\n", - __func__, cpu); - break; - } - } - put_online_cpus(); - + err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node); if (!err) { conf->scribble_disks = max(conf->raid_disks, conf->previous_raid_disks); @@ -6639,6 +6601,16 @@ static struct r5conf *setup_conf(struct mddev *mddev) } conf->min_nr_stripes = NR_STRIPES; + if (mddev->reshape_position != MaxSector) { + int stripes = max_t(int, + ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4, + ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4); + conf->min_nr_stripes = max(NR_STRIPES, stripes); + if (conf->min_nr_stripes != NR_STRIPES) + printk(KERN_INFO + "md/raid:%s: force stripe size %d for reshape\n", + mdname(mddev), conf->min_nr_stripes); + } memory = conf->min_nr_stripes * (sizeof(struct stripe_head) + max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS); @@ -6660,7 +6632,12 @@ static struct r5conf *setup_conf(struct mddev *mddev) conf->shrinker.count_objects = raid5_cache_count; conf->shrinker.batch = 128; conf->shrinker.flags = 0; - register_shrinker(&conf->shrinker); + if (register_shrinker(&conf->shrinker)) { + printk(KERN_ERR + "md/raid:%s: couldn't register shrinker.\n", + mdname(mddev)); + goto abort; + } sprintf(pers_name, "raid%d", mddev->new_level); conf->thread = md_register_thread(raid5d, mddev, pers_name); @@ -7056,6 +7033,8 @@ static int raid5_run(struct mddev *mddev) else queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + + blk_queue_max_hw_sectors(mddev->queue, 
UINT_MAX); } if (journal_dev) { @@ -7975,10 +7954,21 @@ static struct md_personality raid4_personality = static int __init raid5_init(void) { + int ret; + raid5_wq = alloc_workqueue("raid5wq", WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0); if (!raid5_wq) return -ENOMEM; + + ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE, + "md/raid5:prepare", + raid456_cpu_up_prepare, + raid456_cpu_dead); + if (ret) { + destroy_workqueue(raid5_wq); + return ret; + } register_md_personality(&raid6_personality); register_md_personality(&raid5_personality); register_md_personality(&raid4_personality); @@ -7990,6 +7980,7 @@ static void raid5_exit(void) unregister_md_personality(&raid6_personality); unregister_md_personality(&raid5_personality); unregister_md_personality(&raid4_personality); + cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE); destroy_workqueue(raid5_wq); } diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 517d4b68a1be..57ec49f0839e 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -512,9 +512,7 @@ struct r5conf { } __percpu *percpu; int scribble_disks; int scribble_sectors; -#ifdef CONFIG_HOTPLUG_CPU - struct notifier_block cpu_notify; -#endif + struct hlist_node node; /* * Free stripes pool |
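
The cursor code added to dm-btree.c and dm-btree.h above is a read-only iterator: dm_btree_cursor_begin() read-locks its way down to the left-most leaf (prefetching child blocks, and leaf values as well when prefetch_leaves is set), dm_btree_cursor_next() steps through entries in key order, backtracking and re-descending at node boundaries, and dm_btree_cursor_end() drops the remaining read locks. The sketch below shows how a caller might drive it, assuming a btree whose values are little-endian 64-bit block addresses (the layout the BUG_ON in prefetch_values() expects); walk_mappings(), visit_block() and the include path are illustrative, not part of the patch.

#include <linux/errno.h>
#include <linux/types.h>

#include "persistent-data/dm-btree.h"	/* assumed include path */

/*
 * Hypothetical caller: visit every (key, value) pair in key order,
 * handing the decoded block address to visit_block().
 */
static int walk_mappings(struct dm_btree_info *info, dm_block_t root,
			 int (*visit_block)(void *ctx, uint64_t key, dm_block_t b),
			 void *ctx)
{
	int r;
	uint64_t key;
	__le64 value_le;
	/*
	 * The header warns the cursor is large; real code should embed it in
	 * a longer-lived structure rather than keep it on the stack.
	 */
	struct dm_btree_cursor c;

	r = dm_btree_cursor_begin(info, root, true, &c);
	if (r) {
		dm_btree_cursor_end(&c);	/* unwinds any partially built node stack */
		return r == -ENODATA ? 0 : r;	/* treat an empty tree as success here */
	}

	do {
		r = dm_btree_cursor_get_value(&c, &key, &value_le);
		if (r)
			break;

		r = visit_block(ctx, key, le64_to_cpu(value_le));
		if (r)
			break;

		r = dm_btree_cursor_next(&c);
	} while (!r);

	dm_btree_cursor_end(&c);

	/* -ENODATA from dm_btree_cursor_next() just means we ran off the end */
	return r == -ENODATA ? 0 : r;
}

Passing true for prefetch_leaves fits the assumption above that the leaf values are block addresses worth prefetching; for other value types the cursor's prefetching makes less sense and false is the safer choice.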
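
The raid5.c and raid5.h hunks above convert the driver from the old CONFIG_HOTPLUG_CPU notifier to the hotplug state machine's multi-instance API: raid5_init() registers the startup/teardown pair once against CPUHP_MD_RAID5_PREPARE, each array links its r5conf in through the new hlist_node member, and the cpuhp core replays raid456_cpu_up_prepare() and raid456_cpu_dead() for every online CPU when an instance is added or removed, which is why the get_online_cpus() loops disappear. A sketch of that lifecycle, with hypothetical foo_* names (only the CPUHP_MD_RAID5_PREPARE state, the "md/raid5:prepare" label and the cpuhp_* calls are taken from the patch):

#include <linux/cpuhotplug.h>
#include <linux/init.h>
#include <linux/list.h>
#include <linux/percpu.h>
#include <linux/slab.h>

struct foo_percpu {
	void *scratch;				/* stands in for raid5's scratch buffers */
};

struct foo_conf {
	struct hlist_node node;			/* cpuhp instance linkage, as added to r5conf */
	struct foo_percpu __percpu *percpu;
};

/* runs for each CPU coming up, and for all online CPUs at add_instance time */
static int foo_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
{
	struct foo_conf *conf = hlist_entry_safe(node, struct foo_conf, node);
	struct foo_percpu *p = per_cpu_ptr(conf->percpu, cpu);

	p->scratch = kzalloc(1024, GFP_KERNEL);
	return p->scratch ? 0 : -ENOMEM;
}

/* runs when a CPU goes down, and for all online CPUs at remove_instance time */
static int foo_cpu_dead(unsigned int cpu, struct hlist_node *node)
{
	struct foo_conf *conf = hlist_entry_safe(node, struct foo_conf, node);
	struct foo_percpu *p = per_cpu_ptr(conf->percpu, cpu);

	kfree(p->scratch);
	p->scratch = NULL;
	return 0;
}

/* module init: register the callback pair exactly once */
static int __init foo_init(void)
{
	return cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE, "md/raid5:prepare",
				       foo_cpu_up_prepare, foo_cpu_dead);
}

/* per-device setup: replaces the old for_each_present_cpu() allocation loop */
static int foo_alloc_percpu(struct foo_conf *conf)
{
	int err;

	conf->percpu = alloc_percpu(struct foo_percpu);
	if (!conf->percpu)
		return -ENOMEM;

	err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
	if (err)
		free_percpu(conf->percpu);
	return err;
}

/* per-device teardown: replaces unregister_cpu_notifier() plus manual freeing */
static void foo_free_percpu(struct foo_conf *conf)
{
	cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
	free_percpu(conf->percpu);
}

Module exit then mirrors raid5_exit(): cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE) once every instance has been removed.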
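
For scale, the new reshape branch in setup_conf() works out as follows: with a 512 KiB chunk (chunk_sectors == 1024) and the usual 4 KiB STRIPE_SIZE, (1024 << 9) / 4096 = 128, times 4 gives 512, so min_nr_stripes is raised from the default NR_STRIPES of 256 to 512 and the "force stripe size" message is printed. The figures assume a 4 KiB page; the point is simply that the stripe cache floor now scales with the larger of the old and new chunk sizes during a reshape.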