Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/bitmap.c | 6
-rw-r--r-- | drivers/md/dm-bio-prison.c | 70
-rw-r--r-- | drivers/md/dm-bio-prison.h | 2
-rw-r--r-- | drivers/md/dm-crypt.c | 4
-rw-r--r-- | drivers/md/dm-era-target.c | 3
-rw-r--r-- | drivers/md/dm-io.c | 22
-rw-r--r-- | drivers/md/dm-mpath.c | 16
-rw-r--r-- | drivers/md/dm-snap.c | 67
-rw-r--r-- | drivers/md/dm-table.c | 5
-rw-r--r-- | drivers/md/dm-thin.c | 93
-rw-r--r-- | drivers/md/dm-zero.c | 4
-rw-r--r-- | drivers/md/dm.c | 101
-rw-r--r-- | drivers/md/md.c | 27
-rw-r--r-- | drivers/md/raid5.c | 158
-rw-r--r-- | drivers/md/raid5.h | 4
15 files changed, 407 insertions, 175 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 9a8e66ae04f5..67f8b31e2054 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -669,17 +669,13 @@ static inline unsigned long file_page_offset(struct bitmap_storage *store, /* * return a pointer to the page in the filemap that contains the given bit * - * this lookup is complicated by the fact that the bitmap sb might be exactly - * 1 page (e.g., x86) or less than 1 page -- so the bitmap might start on page - * 0 or page 1 */ static inline struct page *filemap_get_page(struct bitmap_storage *store, unsigned long chunk) { if (file_page_index(store, chunk) >= store->file_pages) return NULL; - return store->filemap[file_page_index(store, chunk) - - file_page_index(store, 0)]; + return store->filemap[file_page_index(store, chunk)]; } static int bitmap_storage_alloc(struct bitmap_storage *store, diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c index 85f0b7074257..f752d12081ff 100644 --- a/drivers/md/dm-bio-prison.c +++ b/drivers/md/dm-bio-prison.c @@ -14,13 +14,17 @@ /*----------------------------------------------------------------*/ -struct dm_bio_prison { +struct bucket { spinlock_t lock; + struct hlist_head cells; +}; + +struct dm_bio_prison { mempool_t *cell_pool; unsigned nr_buckets; unsigned hash_mask; - struct hlist_head *cells; + struct bucket *buckets; }; /*----------------------------------------------------------------*/ @@ -40,6 +44,12 @@ static uint32_t calc_nr_buckets(unsigned nr_cells) static struct kmem_cache *_cell_cache; +static void init_bucket(struct bucket *b) +{ + spin_lock_init(&b->lock); + INIT_HLIST_HEAD(&b->cells); +} + /* * @nr_cells should be the number of cells you want in use _concurrently_. * Don't confuse it with the number of distinct keys. 
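The dm-bio-prison rework above trades the single prison-wide spinlock for one lock per hash bucket, so bios detained under different keys no longer contend on a global lock. A small user-space sketch of that lock-striping pattern, with invented names and a trivial hash (illustration only, not the kernel code):

#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>

#define NR_BUCKETS 64                           /* power of two, so key & (NR_BUCKETS - 1) works */

struct cell { uint64_t key; struct cell *next; };

struct bucket {
	pthread_spinlock_t lock;                /* protects only this bucket's chain */
	struct cell *cells;
};

static struct bucket buckets[NR_BUCKETS];

static void init_buckets(void)
{
	for (int i = 0; i < NR_BUCKETS; i++) {
		pthread_spin_init(&buckets[i].lock, PTHREAD_PROCESS_PRIVATE);
		buckets[i].cells = NULL;
	}
}

static struct bucket *get_bucket(uint64_t key)
{
	return &buckets[key & (NR_BUCKETS - 1)];  /* the driver uses a real hash here */
}

/* Returns 1 if the key was already detained, 0 if this caller claimed it. */
static int detain(uint64_t key)
{
	struct bucket *b = get_bucket(key);
	struct cell *c;

	pthread_spin_lock(&b->lock);            /* per-bucket lock, not prison-wide */
	for (c = b->cells; c; c = c->next)
		if (c->key == key) {
			pthread_spin_unlock(&b->lock);
			return 1;
		}
	c = malloc(sizeof(*c));
	c->key = key;
	c->next = b->cells;
	b->cells = c;
	pthread_spin_unlock(&b->lock);
	return 0;
}

Because each key hashes to exactly one bucket, taking only that bucket's lock is enough to keep the search and the insert atomic, which is why the patch can drop prison->lock entirely.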
@@ -49,13 +59,12 @@ struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells) unsigned i; uint32_t nr_buckets = calc_nr_buckets(nr_cells); size_t len = sizeof(struct dm_bio_prison) + - (sizeof(struct hlist_head) * nr_buckets); + (sizeof(struct bucket) * nr_buckets); struct dm_bio_prison *prison = kmalloc(len, GFP_KERNEL); if (!prison) return NULL; - spin_lock_init(&prison->lock); prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache); if (!prison->cell_pool) { kfree(prison); @@ -64,9 +73,9 @@ struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells) prison->nr_buckets = nr_buckets; prison->hash_mask = nr_buckets - 1; - prison->cells = (struct hlist_head *) (prison + 1); + prison->buckets = (struct bucket *) (prison + 1); for (i = 0; i < nr_buckets; i++) - INIT_HLIST_HEAD(prison->cells + i); + init_bucket(prison->buckets + i); return prison; } @@ -107,40 +116,44 @@ static int keys_equal(struct dm_cell_key *lhs, struct dm_cell_key *rhs) (lhs->block == rhs->block); } -static struct dm_bio_prison_cell *__search_bucket(struct hlist_head *bucket, +static struct bucket *get_bucket(struct dm_bio_prison *prison, + struct dm_cell_key *key) +{ + return prison->buckets + hash_key(prison, key); +} + +static struct dm_bio_prison_cell *__search_bucket(struct bucket *b, struct dm_cell_key *key) { struct dm_bio_prison_cell *cell; - hlist_for_each_entry(cell, bucket, list) + hlist_for_each_entry(cell, &b->cells, list) if (keys_equal(&cell->key, key)) return cell; return NULL; } -static void __setup_new_cell(struct dm_bio_prison *prison, +static void __setup_new_cell(struct bucket *b, struct dm_cell_key *key, struct bio *holder, - uint32_t hash, struct dm_bio_prison_cell *cell) { memcpy(&cell->key, key, sizeof(cell->key)); cell->holder = holder; bio_list_init(&cell->bios); - hlist_add_head(&cell->list, prison->cells + hash); + hlist_add_head(&cell->list, &b->cells); } -static int __bio_detain(struct dm_bio_prison *prison, +static int __bio_detain(struct bucket *b, struct dm_cell_key *key, struct bio *inmate, struct dm_bio_prison_cell *cell_prealloc, struct dm_bio_prison_cell **cell_result) { - uint32_t hash = hash_key(prison, key); struct dm_bio_prison_cell *cell; - cell = __search_bucket(prison->cells + hash, key); + cell = __search_bucket(b, key); if (cell) { if (inmate) bio_list_add(&cell->bios, inmate); @@ -148,7 +161,7 @@ static int __bio_detain(struct dm_bio_prison *prison, return 1; } - __setup_new_cell(prison, key, inmate, hash, cell_prealloc); + __setup_new_cell(b, key, inmate, cell_prealloc); *cell_result = cell_prealloc; return 0; } @@ -161,10 +174,11 @@ static int bio_detain(struct dm_bio_prison *prison, { int r; unsigned long flags; + struct bucket *b = get_bucket(prison, key); - spin_lock_irqsave(&prison->lock, flags); - r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result); - spin_unlock_irqrestore(&prison->lock, flags); + spin_lock_irqsave(&b->lock, flags); + r = __bio_detain(b, key, inmate, cell_prealloc, cell_result); + spin_unlock_irqrestore(&b->lock, flags); return r; } @@ -208,10 +222,11 @@ void dm_cell_release(struct dm_bio_prison *prison, struct bio_list *bios) { unsigned long flags; + struct bucket *b = get_bucket(prison, &cell->key); - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irqsave(&b->lock, flags); __cell_release(cell, bios); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irqrestore(&b->lock, flags); } EXPORT_SYMBOL_GPL(dm_cell_release); @@ -230,28 +245,25 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison, 
struct bio_list *inmates) { unsigned long flags; + struct bucket *b = get_bucket(prison, &cell->key); - spin_lock_irqsave(&prison->lock, flags); + spin_lock_irqsave(&b->lock, flags); __cell_release_no_holder(cell, inmates); - spin_unlock_irqrestore(&prison->lock, flags); + spin_unlock_irqrestore(&b->lock, flags); } EXPORT_SYMBOL_GPL(dm_cell_release_no_holder); void dm_cell_error(struct dm_bio_prison *prison, - struct dm_bio_prison_cell *cell) + struct dm_bio_prison_cell *cell, int error) { struct bio_list bios; struct bio *bio; - unsigned long flags; bio_list_init(&bios); - - spin_lock_irqsave(&prison->lock, flags); - __cell_release(cell, &bios); - spin_unlock_irqrestore(&prison->lock, flags); + dm_cell_release(prison, cell, &bios); while ((bio = bio_list_pop(&bios))) - bio_io_error(bio); + bio_endio(bio, error); } EXPORT_SYMBOL_GPL(dm_cell_error); diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h index 3f833190eadf..6805a142b750 100644 --- a/drivers/md/dm-bio-prison.h +++ b/drivers/md/dm-bio-prison.h @@ -85,7 +85,7 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison, struct dm_bio_prison_cell *cell, struct bio_list *inmates); void dm_cell_error(struct dm_bio_prison *prison, - struct dm_bio_prison_cell *cell); + struct dm_bio_prison_cell *cell, int error); /*----------------------------------------------------------------*/ diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 53b213226c01..4cba2d808afb 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2003 Christophe Saout <christophe@saout.de> + * Copyright (C) 2003 Jana Saout <jana@saout.de> * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org> * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved. 
* Copyright (C) 2013 Milan Broz <gmazyland@gmail.com> @@ -1996,6 +1996,6 @@ static void __exit dm_crypt_exit(void) module_init(dm_crypt_init); module_exit(dm_crypt_exit); -MODULE_AUTHOR("Christophe Saout <christophe@saout.de>"); +MODULE_AUTHOR("Jana Saout <jana@saout.de>"); MODULE_DESCRIPTION(DM_NAME " target for transparent encryption / decryption"); MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index 414dad4cb49b..ad913cd4aded 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -1391,7 +1391,8 @@ static int era_is_congested(struct dm_target_callbacks *cb, int bdi_bits) static void era_destroy(struct era *era) { - metadata_close(era->md); + if (era->md) + metadata_close(era->md); if (era->wq) destroy_workqueue(era->wq); diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index 3842ac738f98..db404a0f7e2c 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c @@ -10,6 +10,7 @@ #include <linux/device-mapper.h> #include <linux/bio.h> +#include <linux/completion.h> #include <linux/mempool.h> #include <linux/module.h> #include <linux/sched.h> @@ -32,7 +33,7 @@ struct dm_io_client { struct io { unsigned long error_bits; atomic_t count; - struct task_struct *sleeper; + struct completion *wait; struct dm_io_client *client; io_notify_fn callback; void *context; @@ -121,8 +122,8 @@ static void dec_count(struct io *io, unsigned int region, int error) invalidate_kernel_vmap_range(io->vma_invalidate_address, io->vma_invalidate_size); - if (io->sleeper) - wake_up_process(io->sleeper); + if (io->wait) + complete(io->wait); else { unsigned long r = io->error_bits; @@ -387,6 +388,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, */ volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1]; struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io)); + DECLARE_COMPLETION_ONSTACK(wait); if (num_regions > 1 && (rw & RW_MASK) != WRITE) { WARN_ON(1); @@ -395,7 +397,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, io->error_bits = 0; atomic_set(&io->count, 1); /* see dispatch_io() */ - io->sleeper = current; + io->wait = &wait; io->client = client; io->vma_invalidate_address = dp->vma_invalidate_address; @@ -403,15 +405,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions, dispatch_io(rw, num_regions, where, dp, io, 1); - while (1) { - set_current_state(TASK_UNINTERRUPTIBLE); - - if (!atomic_read(&io->count)) - break; - - io_schedule(); - } - set_current_state(TASK_RUNNING); + wait_for_completion_io(&wait); if (error_bits) *error_bits = io->error_bits; @@ -434,7 +428,7 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions, io = mempool_alloc(client->pool, GFP_NOIO); io->error_bits = 0; atomic_set(&io->count, 1); /* see dispatch_io() */ - io->sleeper = NULL; + io->wait = NULL; io->client = client; io->callback = fn; io->context = context; diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index ebfa411d1a7d..f4167b013d99 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1242,17 +1242,8 @@ static int do_end_io(struct multipath *m, struct request *clone, if (!error && !clone->errors) return 0; /* I/O complete */ - if (noretry_error(error)) { - if ((clone->cmd_flags & REQ_WRITE_SAME) && - !clone->q->limits.max_write_same_sectors) { - struct queue_limits *limits; - - /* device doesn't really support WRITE SAME, disable it */ - limits = dm_get_queue_limits(dm_table_get_md(m->ti->table)); - 
limits->max_write_same_sectors = 0; - } + if (noretry_error(error)) return error; - } if (mpio->pgpath) fail_path(mpio->pgpath); @@ -1620,8 +1611,9 @@ static int multipath_busy(struct dm_target *ti) spin_lock_irqsave(&m->lock, flags); - /* pg_init in progress, requeue until done */ - if (!pg_ready(m)) { + /* pg_init in progress or no paths available */ + if (m->pg_init_in_progress || + (!m->nr_valid_paths && m->queue_if_no_path)) { busy = 1; goto out; } diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 8e0caed0bf74..5bd2290cfb1e 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -2141,6 +2141,11 @@ static int origin_write_extent(struct dm_snapshot *merging_snap, * Origin: maps a linear range of a device, with hooks for snapshotting. */ +struct dm_origin { + struct dm_dev *dev; + unsigned split_boundary; +}; + /* * Construct an origin mapping: <dev_path> * The context for an origin is merely a 'struct dm_dev *' @@ -2149,41 +2154,65 @@ static int origin_write_extent(struct dm_snapshot *merging_snap, static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv) { int r; - struct dm_dev *dev; + struct dm_origin *o; if (argc != 1) { ti->error = "origin: incorrect number of arguments"; return -EINVAL; } - r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev); + o = kmalloc(sizeof(struct dm_origin), GFP_KERNEL); + if (!o) { + ti->error = "Cannot allocate private origin structure"; + r = -ENOMEM; + goto bad_alloc; + } + + r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &o->dev); if (r) { ti->error = "Cannot get target device"; - return r; + goto bad_open; } - ti->private = dev; + ti->private = o; ti->num_flush_bios = 1; return 0; + +bad_open: + kfree(o); +bad_alloc: + return r; } static void origin_dtr(struct dm_target *ti) { - struct dm_dev *dev = ti->private; - dm_put_device(ti, dev); + struct dm_origin *o = ti->private; + dm_put_device(ti, o->dev); + kfree(o); } static int origin_map(struct dm_target *ti, struct bio *bio) { - struct dm_dev *dev = ti->private; - bio->bi_bdev = dev->bdev; + struct dm_origin *o = ti->private; + unsigned available_sectors; - if (bio->bi_rw & REQ_FLUSH) + bio->bi_bdev = o->dev->bdev; + + if (unlikely(bio->bi_rw & REQ_FLUSH)) return DM_MAPIO_REMAPPED; + if (bio_rw(bio) != WRITE) + return DM_MAPIO_REMAPPED; + + available_sectors = o->split_boundary - + ((unsigned)bio->bi_iter.bi_sector & (o->split_boundary - 1)); + + if (bio_sectors(bio) > available_sectors) + dm_accept_partial_bio(bio, available_sectors); + /* Only tell snapshots if this is a write */ - return (bio_rw(bio) == WRITE) ? 
do_origin(dev, bio) : DM_MAPIO_REMAPPED; + return do_origin(o->dev, bio); } /* @@ -2192,15 +2221,15 @@ static int origin_map(struct dm_target *ti, struct bio *bio) */ static void origin_resume(struct dm_target *ti) { - struct dm_dev *dev = ti->private; + struct dm_origin *o = ti->private; - ti->max_io_len = get_origin_minimum_chunksize(dev->bdev); + o->split_boundary = get_origin_minimum_chunksize(o->dev->bdev); } static void origin_status(struct dm_target *ti, status_type_t type, unsigned status_flags, char *result, unsigned maxlen) { - struct dm_dev *dev = ti->private; + struct dm_origin *o = ti->private; switch (type) { case STATUSTYPE_INFO: @@ -2208,7 +2237,7 @@ static void origin_status(struct dm_target *ti, status_type_t type, break; case STATUSTYPE_TABLE: - snprintf(result, maxlen, "%s", dev->name); + snprintf(result, maxlen, "%s", o->dev->name); break; } } @@ -2216,13 +2245,13 @@ static void origin_status(struct dm_target *ti, status_type_t type, static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm, struct bio_vec *biovec, int max_size) { - struct dm_dev *dev = ti->private; - struct request_queue *q = bdev_get_queue(dev->bdev); + struct dm_origin *o = ti->private; + struct request_queue *q = bdev_get_queue(o->dev->bdev); if (!q->merge_bvec_fn) return max_size; - bvm->bi_bdev = dev->bdev; + bvm->bi_bdev = o->dev->bdev; return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); } @@ -2230,9 +2259,9 @@ static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm, static int origin_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data) { - struct dm_dev *dev = ti->private; + struct dm_origin *o = ti->private; - return fn(ti, dev, 0, ti->len, data); + return fn(ti, o->dev, 0, ti->len, data); } static struct target_type origin_target = { diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 50601ec7017a..5f59f1e3e5b1 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -465,8 +465,8 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, } EXPORT_SYMBOL(dm_get_device); -int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, - sector_t start, sector_t len, void *data) +static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, + sector_t start, sector_t len, void *data) { struct queue_limits *limits = data; struct block_device *bdev = dev->bdev; @@ -499,7 +499,6 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, (unsigned int) (PAGE_SIZE >> 9)); return 0; } -EXPORT_SYMBOL_GPL(dm_set_device_limits); /* * Decrement a device's use count and remove it if necessary. 
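The snapshot-origin rework above stops splitting writes via ti->max_io_len; instead origin_resume() caches the minimum chunk size in split_boundary and origin_map() accepts only the sectors up to the next boundary, letting dm resubmit the rest through dm_accept_partial_bio(). The boundary math relies on a power-of-two chunk size; a stand-alone sketch of just that calculation (values invented for the example):

#include <assert.h>

/* Sectors remaining before 'sector' reaches the next 'boundary'-aligned
 * edge; 'boundary' must be a power of two, as dm-snap chunk sizes are. */
static unsigned sectors_to_boundary(unsigned long long sector, unsigned boundary)
{
	return boundary - (unsigned)(sector & (boundary - 1));
}

int main(void)
{
	/* With a 16-sector chunk, a write starting at sector 10 has room for
	 * 6 sectors; origin_map() would accept 6 via dm_accept_partial_bio()
	 * and dm would send the remainder as a fresh bio. */
	assert(sectors_to_boundary(10, 16) == 6);
	/* A write starting exactly on a boundary can use the whole chunk. */
	assert(sectors_to_boundary(16, 16) == 16);
	return 0;
}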
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 242ac2ea5f29..fc9c848a60c9 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -310,13 +310,18 @@ static void cell_defer_no_holder_no_free(struct thin_c *tc, wake_worker(pool); } -static void cell_error(struct pool *pool, - struct dm_bio_prison_cell *cell) +static void cell_error_with_code(struct pool *pool, + struct dm_bio_prison_cell *cell, int error_code) { - dm_cell_error(pool->prison, cell); + dm_cell_error(pool->prison, cell, error_code); dm_bio_prison_free_cell(pool->prison, cell); } +static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell) +{ + cell_error_with_code(pool, cell, -EIO); +} + /*----------------------------------------------------------------*/ /* @@ -1027,7 +1032,7 @@ static void retry_on_resume(struct bio *bio) spin_unlock_irqrestore(&tc->lock, flags); } -static bool should_error_unserviceable_bio(struct pool *pool) +static int should_error_unserviceable_bio(struct pool *pool) { enum pool_mode m = get_pool_mode(pool); @@ -1035,25 +1040,27 @@ static bool should_error_unserviceable_bio(struct pool *pool) case PM_WRITE: /* Shouldn't get here */ DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode"); - return true; + return -EIO; case PM_OUT_OF_DATA_SPACE: - return pool->pf.error_if_no_space; + return pool->pf.error_if_no_space ? -ENOSPC : 0; case PM_READ_ONLY: case PM_FAIL: - return true; + return -EIO; default: /* Shouldn't get here */ DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode"); - return true; + return -EIO; } } static void handle_unserviceable_bio(struct pool *pool, struct bio *bio) { - if (should_error_unserviceable_bio(pool)) - bio_io_error(bio); + int error = should_error_unserviceable_bio(pool); + + if (error) + bio_endio(bio, error); else retry_on_resume(bio); } @@ -1062,18 +1069,21 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c { struct bio *bio; struct bio_list bios; + int error; - if (should_error_unserviceable_bio(pool)) { - cell_error(pool, cell); + error = should_error_unserviceable_bio(pool); + if (error) { + cell_error_with_code(pool, cell, error); return; } bio_list_init(&bios); cell_release(pool, cell, &bios); - if (should_error_unserviceable_bio(pool)) + error = should_error_unserviceable_bio(pool); + if (error) while ((bio = bio_list_pop(&bios))) - bio_io_error(bio); + bio_endio(bio, error); else while ((bio = bio_list_pop(&bios))) retry_on_resume(bio); @@ -1610,47 +1620,63 @@ static void do_no_space_timeout(struct work_struct *ws) /*----------------------------------------------------------------*/ -struct noflush_work { +struct pool_work { struct work_struct worker; - struct thin_c *tc; + struct completion complete; +}; + +static struct pool_work *to_pool_work(struct work_struct *ws) +{ + return container_of(ws, struct pool_work, worker); +} - atomic_t complete; - wait_queue_head_t wait; +static void pool_work_complete(struct pool_work *pw) +{ + complete(&pw->complete); +} + +static void pool_work_wait(struct pool_work *pw, struct pool *pool, + void (*fn)(struct work_struct *)) +{ + INIT_WORK_ONSTACK(&pw->worker, fn); + init_completion(&pw->complete); + queue_work(pool->wq, &pw->worker); + wait_for_completion(&pw->complete); +} + +/*----------------------------------------------------------------*/ + +struct noflush_work { + struct pool_work pw; + struct thin_c *tc; }; -static void complete_noflush_work(struct noflush_work *w) +static struct noflush_work *to_noflush(struct work_struct *ws) { - 
atomic_set(&w->complete, 1); - wake_up(&w->wait); + return container_of(to_pool_work(ws), struct noflush_work, pw); } static void do_noflush_start(struct work_struct *ws) { - struct noflush_work *w = container_of(ws, struct noflush_work, worker); + struct noflush_work *w = to_noflush(ws); w->tc->requeue_mode = true; requeue_io(w->tc); - complete_noflush_work(w); + pool_work_complete(&w->pw); } static void do_noflush_stop(struct work_struct *ws) { - struct noflush_work *w = container_of(ws, struct noflush_work, worker); + struct noflush_work *w = to_noflush(ws); w->tc->requeue_mode = false; - complete_noflush_work(w); + pool_work_complete(&w->pw); } static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *)) { struct noflush_work w; - INIT_WORK_ONSTACK(&w.worker, fn); w.tc = tc; - atomic_set(&w.complete, 0); - init_waitqueue_head(&w.wait); - - queue_work(tc->pool->wq, &w.worker); - - wait_event(w.wait, atomic_read(&w.complete)); + pool_work_wait(&w.pw, tc->pool, fn); } /*----------------------------------------------------------------*/ @@ -3068,7 +3094,8 @@ static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits) */ if (pt->adjusted_pf.discard_passdown) { data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits; - limits->discard_granularity = data_limits->discard_granularity; + limits->discard_granularity = max(data_limits->discard_granularity, + pool->sectors_per_block << SECTOR_SHIFT); } else limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; } diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c index c99003e0d47a..b9a64bbce304 100644 --- a/drivers/md/dm-zero.c +++ b/drivers/md/dm-zero.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2003 Christophe Saout <christophe@saout.de> + * Copyright (C) 2003 Jana Saout <jana@saout.de> * * This file is released under the GPL. */ @@ -79,6 +79,6 @@ static void __exit dm_zero_exit(void) module_init(dm_zero_init) module_exit(dm_zero_exit) -MODULE_AUTHOR("Christophe Saout <christophe@saout.de>"); +MODULE_AUTHOR("Jana Saout <jana@saout.de>"); MODULE_DESCRIPTION(DM_NAME " dummy target returning zeros"); MODULE_LICENSE("GPL"); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index aa9e093343d4..32b958dbc499 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -54,6 +54,8 @@ static void do_deferred_remove(struct work_struct *w); static DECLARE_WORK(deferred_remove_work, do_deferred_remove); +static struct workqueue_struct *deferred_remove_workqueue; + /* * For bio-based dm. * One of these is allocated per bio. 
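Two changes in this series, dm-io's sync_io() and dm-thin's noflush_work, replace hand-rolled sleep/wake loops with completions: the caller sets up an on-stack work item and completion, queues the work, and blocks until the work function signals it. A kernel-style sketch of that pattern with made-up names (not code from this patch set):

#include <linux/workqueue.h>
#include <linux/completion.h>

struct sync_work {
	struct work_struct worker;
	struct completion done;
};

static void sync_work_fn(struct work_struct *ws)
{
	struct sync_work *sw = container_of(ws, struct sync_work, worker);

	/* ... do the real work here, on the workqueue thread ... */

	complete(&sw->done);                    /* wake the waiter below */
}

static void run_on_wq_and_wait(struct workqueue_struct *wq)
{
	struct sync_work sw;

	INIT_WORK_ONSTACK(&sw.worker, sync_work_fn);
	init_completion(&sw.done);
	queue_work(wq, &sw.worker);
	wait_for_completion(&sw.done);          /* 'sw' stays valid on our stack
						   until the work has signalled */
	destroy_work_on_stack(&sw.worker);
}

Compared with the old open-coded set_current_state()/io_schedule() loop, the completion variant is harder to get wrong (no lost wakeups) and, in the sync_io() case, wait_for_completion_io() also accounts the sleep as I/O wait.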
@@ -276,16 +278,24 @@ static int __init local_init(void) if (r) goto out_free_rq_tio_cache; + deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1); + if (!deferred_remove_workqueue) { + r = -ENOMEM; + goto out_uevent_exit; + } + _major = major; r = register_blkdev(_major, _name); if (r < 0) - goto out_uevent_exit; + goto out_free_workqueue; if (!_major) _major = r; return 0; +out_free_workqueue: + destroy_workqueue(deferred_remove_workqueue); out_uevent_exit: dm_uevent_exit(); out_free_rq_tio_cache: @@ -299,6 +309,7 @@ out_free_io_cache: static void local_exit(void) { flush_scheduled_work(); + destroy_workqueue(deferred_remove_workqueue); kmem_cache_destroy(_rq_tio_cache); kmem_cache_destroy(_io_cache); @@ -407,7 +418,7 @@ static void dm_blk_close(struct gendisk *disk, fmode_t mode) if (atomic_dec_and_test(&md->open_count) && (test_bit(DMF_DEFERRED_REMOVE, &md->flags))) - schedule_work(&deferred_remove_work); + queue_work(deferred_remove_workqueue, &deferred_remove_work); dm_put(md); @@ -755,6 +766,14 @@ static void dec_pending(struct dm_io *io, int error) } } +static void disable_write_same(struct mapped_device *md) +{ + struct queue_limits *limits = dm_get_queue_limits(md); + + /* device doesn't really support WRITE SAME, disable it */ + limits->max_write_same_sectors = 0; +} + static void clone_endio(struct bio *bio, int error) { int r = 0; @@ -783,6 +802,10 @@ static void clone_endio(struct bio *bio, int error) } } + if (unlikely(r == -EREMOTEIO && (bio->bi_rw & REQ_WRITE_SAME) && + !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)) + disable_write_same(md); + free_tio(md, tio); dec_pending(io, error); } @@ -977,6 +1000,10 @@ static void dm_done(struct request *clone, int error, bool mapped) r = rq_end_io(tio->ti, clone, error, &tio->info); } + if (unlikely(r == -EREMOTEIO && (clone->cmd_flags & REQ_WRITE_SAME) && + !clone->q->limits.max_write_same_sectors)) + disable_write_same(tio->md); + if (r <= 0) /* The target wants to complete the I/O */ dm_end_request(clone, r); @@ -1110,6 +1137,46 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len) } EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); +/* + * A target may call dm_accept_partial_bio only from the map routine. It is + * allowed for all bio types except REQ_FLUSH. + * + * dm_accept_partial_bio informs the dm that the target only wants to process + * additional n_sectors sectors of the bio and the rest of the data should be + * sent in a next bio. + * + * A diagram that explains the arithmetics: + * +--------------------+---------------+-------+ + * | 1 | 2 | 3 | + * +--------------------+---------------+-------+ + * + * <-------------- *tio->len_ptr ---------------> + * <------- bi_size -------> + * <-- n_sectors --> + * + * Region 1 was already iterated over with bio_advance or similar function. + * (it may be empty if the target doesn't use bio_advance) + * Region 2 is the remaining bio size that the target wants to process. + * (it may be empty if region 1 is non-empty, although there is no reason + * to make it empty) + * The target requires that region 3 is to be sent in the next bio. + * + * If the target wants to receive multiple copies of the bio (via num_*bios, etc), + * the partially processed part (the sum of regions 1+2) must be the same for all + * copies of the bio. 
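To make the region bookkeeping described above concrete, here is a small stand-alone check of the arithmetic dm_accept_partial_bio() performs (pure illustration; the sector counts are invented):

#include <assert.h>

/* Mirrors the core of dm_accept_partial_bio(): the clone currently covers
 * bi_size sectors (regions 2+3); the target keeps n_sectors (region 2);
 * *len_ptr, the length dm planned to map with this clone, shrinks by the
 * rejected tail (region 3). */
static void accept_partial(unsigned *len_ptr, unsigned *bi_size, unsigned n_sectors)
{
	assert(*bi_size <= *len_ptr);
	assert(n_sectors <= *bi_size);
	*len_ptr -= *bi_size - n_sectors;
	*bi_size = n_sectors;
}

int main(void)
{
	unsigned len = 16, bi_size = 16;

	accept_partial(&len, &bi_size, 6);  /* target only wants 6 sectors */
	assert(len == 6 && bi_size == 6);   /* dm will send the other 10 in a new bio */
	return 0;
}

This is why the clone_info/len plumbing in the hunks below switches from passing a sector count by value to passing a pointer (tio->len_ptr): the target must be able to shrink the length the core believed it had mapped.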
+ */ +void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) +{ + struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); + unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT; + BUG_ON(bio->bi_rw & REQ_FLUSH); + BUG_ON(bi_size > *tio->len_ptr); + BUG_ON(n_sectors > bi_size); + *tio->len_ptr -= bi_size - n_sectors; + bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; +} +EXPORT_SYMBOL_GPL(dm_accept_partial_bio); + static void __map_bio(struct dm_target_io *tio) { int r; @@ -1152,10 +1219,10 @@ struct clone_info { struct bio *bio; struct dm_io *io; sector_t sector; - sector_t sector_count; + unsigned sector_count; }; -static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len) +static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len) { bio->bi_iter.bi_sector = sector; bio->bi_iter.bi_size = to_bytes(len); @@ -1200,11 +1267,13 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci, static void __clone_and_map_simple_bio(struct clone_info *ci, struct dm_target *ti, - unsigned target_bio_nr, sector_t len) + unsigned target_bio_nr, unsigned *len) { struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs, target_bio_nr); struct bio *clone = &tio->clone; + tio->len_ptr = len; + /* * Discard requests require the bio's inline iovecs be initialized. * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush @@ -1212,13 +1281,13 @@ static void __clone_and_map_simple_bio(struct clone_info *ci, */ __bio_clone_fast(clone, ci->bio); if (len) - bio_setup_sector(clone, ci->sector, len); + bio_setup_sector(clone, ci->sector, *len); __map_bio(tio); } static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, - unsigned num_bios, sector_t len) + unsigned num_bios, unsigned *len) { unsigned target_bio_nr; @@ -1233,13 +1302,13 @@ static int __send_empty_flush(struct clone_info *ci) BUG_ON(bio_has_data(ci->bio)); while ((ti = dm_table_get_target(ci->map, target_nr++))) - __send_duplicate_bios(ci, ti, ti->num_flush_bios, 0); + __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); return 0; } static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, - sector_t sector, unsigned len) + sector_t sector, unsigned *len) { struct bio *bio = ci->bio; struct dm_target_io *tio; @@ -1254,7 +1323,8 @@ static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { tio = alloc_tio(ci, ti, 0, target_bio_nr); - clone_bio(tio, bio, sector, len); + tio->len_ptr = len; + clone_bio(tio, bio, sector, *len); __map_bio(tio); } } @@ -1283,7 +1353,7 @@ static int __send_changing_extent_only(struct clone_info *ci, is_split_required_fn is_split_required) { struct dm_target *ti; - sector_t len; + unsigned len; unsigned num_bios; do { @@ -1302,11 +1372,11 @@ static int __send_changing_extent_only(struct clone_info *ci, return -EOPNOTSUPP; if (is_split_required && !is_split_required(ti)) - len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); + len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); else - len = min(ci->sector_count, max_io_len(ci->sector, ti)); + len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti)); - __send_duplicate_bios(ci, ti, num_bios, len); + __send_duplicate_bios(ci, ti, num_bios, &len); ci->sector += len; } while (ci->sector_count -= len); @@ -1345,7 +1415,7 @@ static int __split_and_process_non_flush(struct clone_info *ci) len = 
min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count); - __clone_and_map_data_bio(ci, ti, ci->sector, len); + __clone_and_map_data_bio(ci, ti, ci->sector, &len); ci->sector += len; ci->sector_count -= len; @@ -1439,7 +1509,6 @@ static int dm_merge_bvec(struct request_queue *q, * just one page. */ else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) - max_size = 0; out: diff --git a/drivers/md/md.c b/drivers/md/md.c index 2382cfc9bb3f..32fc19c540d4 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3448,6 +3448,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->level = LEVEL_NONE; return rv; } + if (mddev->ro) + return -EROFS; /* request to change the personality. Need to ensure: * - array is not engaged in resync/recovery/reshape @@ -3634,6 +3636,8 @@ layout_store(struct mddev *mddev, const char *buf, size_t len) int err; if (mddev->pers->check_reshape == NULL) return -EBUSY; + if (mddev->ro) + return -EROFS; mddev->new_layout = n; err = mddev->pers->check_reshape(mddev); if (err) { @@ -3723,6 +3727,8 @@ chunk_size_store(struct mddev *mddev, const char *buf, size_t len) int err; if (mddev->pers->check_reshape == NULL) return -EBUSY; + if (mddev->ro) + return -EROFS; mddev->new_chunk_sectors = n >> 9; err = mddev->pers->check_reshape(mddev); if (err) { @@ -5593,7 +5599,7 @@ static int get_array_info(struct mddev * mddev, void __user * arg) if (mddev->in_sync) info.state = (1<<MD_SB_CLEAN); if (mddev->bitmap && mddev->bitmap_info.offset) - info.state = (1<<MD_SB_BITMAP_PRESENT); + info.state |= (1<<MD_SB_BITMAP_PRESENT); info.active_disks = insync; info.working_disks = working; info.failed_disks = failed; @@ -6135,6 +6141,8 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) */ if (mddev->sync_thread) return -EBUSY; + if (mddev->ro) + return -EROFS; rdev_for_each(rdev, mddev) { sector_t avail = rdev->sectors; @@ -6157,6 +6165,8 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks) /* change the number of raid disks */ if (mddev->pers->check_reshape == NULL) return -EINVAL; + if (mddev->ro) + return -EROFS; if (raid_disks <= 0 || (mddev->max_disks && raid_disks >= mddev->max_disks)) return -EINVAL; @@ -7491,6 +7501,19 @@ void md_do_sync(struct md_thread *thread) rdev->recovery_offset < j) j = rdev->recovery_offset; rcu_read_unlock(); + + /* If there is a bitmap, we need to make sure all + * writes that started before we added a spare + * complete before we start doing a recovery. + * Otherwise the write might complete and (via + * bitmap_endwrite) set a bit in the bitmap after the + * recovery has checked that bit and skipped that + * region. 
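One of the md.c fixes above changes get_array_info() to accumulate state flags with |= instead of =, so reporting a bitmap no longer clobbers the MD_SB_CLEAN bit set just before it. A two-line demonstration of the difference (the flag values here are stand-ins, not the real MD constants):

#include <assert.h>

#define SB_CLEAN          (1 << 0)      /* stands in for (1 << MD_SB_CLEAN) */
#define SB_BITMAP_PRESENT (1 << 8)      /* stands in for (1 << MD_SB_BITMAP_PRESENT) */

int main(void)
{
	unsigned state;

	state = SB_CLEAN;                /* array is in_sync */
	state |= SB_BITMAP_PRESENT;      /* fixed code ORs the bit in... */
	assert(state & SB_CLEAN);        /* ...so CLEAN survives */

	state = SB_CLEAN;
	state = SB_BITMAP_PRESENT;       /* old code assigned, losing CLEAN */
	assert(!(state & SB_CLEAN));
	return 0;
}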
+ */ + if (mddev->bitmap) { + mddev->pers->quiesce(mddev, 1); + mddev->pers->quiesce(mddev, 0); + } } printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); @@ -8333,7 +8356,7 @@ static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors) if (a < s) { /* we need to split this range */ if (bb->count >= MD_MAX_BADBLOCKS) { - rv = 0; + rv = -ENOSPC; goto out; } memmove(p+lo+1, p+lo, (bb->count - lo) * 8); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2afef4ec9312..6234b2e84587 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -292,9 +292,12 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, BUG_ON(atomic_read(&conf->active_stripes)==0); if (test_bit(STRIPE_HANDLE, &sh->state)) { if (test_bit(STRIPE_DELAYED, &sh->state) && - !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) + !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { list_add_tail(&sh->lru, &conf->delayed_list); - else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && + if (atomic_read(&conf->preread_active_stripes) + < IO_THRESHOLD) + md_wakeup_thread(conf->mddev->thread); + } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && sh->bm_seq - conf->seq_write > 0) list_add_tail(&sh->lru, &conf->bitmap_list); else { @@ -413,6 +416,11 @@ static void release_stripe(struct stripe_head *sh) int hash; bool wakeup; + /* Avoid release_list until the last reference. + */ + if (atomic_add_unless(&sh->count, -1, 1)) + return; + if (unlikely(!conf->mddev->thread) || test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) goto slow_path; @@ -479,6 +487,7 @@ static void shrink_buffers(struct stripe_head *sh) int num = sh->raid_conf->pool_size; for (i = 0; i < num ; i++) { + WARN_ON(sh->dev[i].page != sh->dev[i].orig_page); p = sh->dev[i].page; if (!p) continue; @@ -499,6 +508,7 @@ static int grow_buffers(struct stripe_head *sh) return 1; } sh->dev[i].page = page; + sh->dev[i].orig_page = page; } return 0; } @@ -855,6 +865,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) bi->bi_rw |= REQ_NOMERGE; + if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) + WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); + sh->dev[i].vec.bv_page = sh->dev[i].page; bi->bi_vcnt = 1; bi->bi_io_vec[0].bv_len = STRIPE_SIZE; bi->bi_io_vec[0].bv_offset = 0; @@ -899,6 +912,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) else rbi->bi_iter.bi_sector = (sh->sector + rrdev->data_offset); + if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) + WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); + sh->dev[i].rvec.bv_page = sh->dev[i].page; rbi->bi_vcnt = 1; rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; rbi->bi_io_vec[0].bv_offset = 0; @@ -927,8 +943,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) } static struct dma_async_tx_descriptor * -async_copy_data(int frombio, struct bio *bio, struct page *page, - sector_t sector, struct dma_async_tx_descriptor *tx) +async_copy_data(int frombio, struct bio *bio, struct page **page, + sector_t sector, struct dma_async_tx_descriptor *tx, + struct stripe_head *sh) { struct bio_vec bvl; struct bvec_iter iter; @@ -965,11 +982,16 @@ async_copy_data(int frombio, struct bio *bio, struct page *page, if (clen > 0) { b_offset += bvl.bv_offset; bio_page = bvl.bv_page; - if (frombio) - tx = async_memcpy(page, bio_page, page_offset, + if (frombio) { + if (sh->raid_conf->skip_copy && + b_offset == 0 && page_offset == 0 && + clen == STRIPE_SIZE) + *page = bio_page; + 
else + tx = async_memcpy(*page, bio_page, page_offset, b_offset, clen, &submit); - else - tx = async_memcpy(bio_page, page, b_offset, + } else + tx = async_memcpy(bio_page, *page, b_offset, page_offset, clen, &submit); } /* chain the operations */ @@ -1045,8 +1067,8 @@ static void ops_run_biofill(struct stripe_head *sh) spin_unlock_irq(&sh->stripe_lock); while (rbi && rbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { - tx = async_copy_data(0, rbi, dev->page, - dev->sector, tx); + tx = async_copy_data(0, rbi, &dev->page, + dev->sector, tx, sh); rbi = r5_next_bio(rbi, dev->sector); } } @@ -1384,6 +1406,7 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) BUG_ON(dev->written); wbi = dev->written = chosen; spin_unlock_irq(&sh->stripe_lock); + WARN_ON(dev->page != dev->orig_page); while (wbi && wbi->bi_iter.bi_sector < dev->sector + STRIPE_SECTORS) { @@ -1393,9 +1416,15 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) set_bit(R5_SyncIO, &dev->flags); if (wbi->bi_rw & REQ_DISCARD) set_bit(R5_Discard, &dev->flags); - else - tx = async_copy_data(1, wbi, dev->page, - dev->sector, tx); + else { + tx = async_copy_data(1, wbi, &dev->page, + dev->sector, tx, sh); + if (dev->page != dev->orig_page) { + set_bit(R5_SkipCopy, &dev->flags); + clear_bit(R5_UPTODATE, &dev->flags); + clear_bit(R5_OVERWRITE, &dev->flags); + } + } wbi = r5_next_bio(wbi, dev->sector); } } @@ -1426,7 +1455,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref) struct r5dev *dev = &sh->dev[i]; if (dev->written || i == pd_idx || i == qd_idx) { - if (!discard) + if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) set_bit(R5_UPTODATE, &dev->flags); if (fua) set_bit(R5_WantFUA, &dev->flags); @@ -1839,8 +1868,10 @@ static int resize_stripes(struct r5conf *conf, int newsize) osh = get_free_stripe(conf, hash); unlock_device_hash_lock(conf, hash); atomic_set(&nsh->count, 1); - for(i=0; i<conf->pool_size; i++) + for(i=0; i<conf->pool_size; i++) { nsh->dev[i].page = osh->dev[i].page; + nsh->dev[i].orig_page = osh->dev[i].page; + } for( ; i<newsize; i++) nsh->dev[i].page = NULL; nsh->hash_lock_index = hash; @@ -1896,6 +1927,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) if (nsh->dev[i].page == NULL) { struct page *p = alloc_page(GFP_NOIO); nsh->dev[i].page = p; + nsh->dev[i].orig_page = p; if (!p) err = -ENOMEM; } @@ -2133,24 +2165,20 @@ static void raid5_end_write_request(struct bio *bi, int error) } static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); - + static void raid5_build_block(struct stripe_head *sh, int i, int previous) { struct r5dev *dev = &sh->dev[i]; bio_init(&dev->req); dev->req.bi_io_vec = &dev->vec; - dev->req.bi_vcnt++; - dev->req.bi_max_vecs++; + dev->req.bi_max_vecs = 1; dev->req.bi_private = sh; - dev->vec.bv_page = dev->page; bio_init(&dev->rreq); dev->rreq.bi_io_vec = &dev->rvec; - dev->rreq.bi_vcnt++; - dev->rreq.bi_max_vecs++; + dev->rreq.bi_max_vecs = 1; dev->rreq.bi_private = sh; - dev->rvec.bv_page = dev->page; dev->flags = 0; dev->sector = compute_blocknr(sh, i, previous); @@ -2750,6 +2778,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, /* and fail all 'written' */ bi = sh->dev[i].written; sh->dev[i].written = NULL; + if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) { + WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); + sh->dev[i].page = sh->dev[i].orig_page; + } + if (bi) bitmap_end = 1; while (bi && bi->bi_iter.bi_sector < sh->dev[i].sector + STRIPE_SECTORS) { 
@@ -2886,8 +2919,11 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, (s->failed >= 1 && fdev[0]->toread) || (s->failed >= 2 && fdev[1]->toread) || (sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite && + (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) && !test_bit(R5_OVERWRITE, &fdev[0]->flags)) || - (sh->raid_conf->level == 6 && s->failed && s->to_write))) { + (sh->raid_conf->level == 6 && s->failed && s->to_write && + s->to_write < sh->raid_conf->raid_disks - 2 && + (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) { /* we would like to get this block, possibly by computing it, * otherwise read it if the backing disk is insync */ @@ -2991,12 +3027,17 @@ static void handle_stripe_clean_event(struct r5conf *conf, dev = &sh->dev[i]; if (!test_bit(R5_LOCKED, &dev->flags) && (test_bit(R5_UPTODATE, &dev->flags) || - test_bit(R5_Discard, &dev->flags))) { + test_bit(R5_Discard, &dev->flags) || + test_bit(R5_SkipCopy, &dev->flags))) { /* We can return any write requests */ struct bio *wbi, *wbi2; pr_debug("Return write for disc %d\n", i); if (test_and_clear_bit(R5_Discard, &dev->flags)) clear_bit(R5_UPTODATE, &dev->flags); + if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) { + WARN_ON(test_bit(R5_UPTODATE, &dev->flags)); + dev->page = dev->orig_page; + } wbi = dev->written; dev->written = NULL; while (wbi && wbi->bi_iter.bi_sector < @@ -3015,6 +3056,8 @@ static void handle_stripe_clean_event(struct r5conf *conf, 0); } else if (test_bit(R5_Discard, &dev->flags)) discard_pending = 1; + WARN_ON(test_bit(R5_SkipCopy, &dev->flags)); + WARN_ON(dev->page != dev->orig_page); } if (!discard_pending && test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { @@ -3086,7 +3129,8 @@ static void handle_stripe_dirtying(struct r5conf *conf, !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) { - if (test_bit(R5_Insync, &dev->flags)) rcw++; + if (test_bit(R5_Insync, &dev->flags)) + rcw++; else rcw += 2*disks; } @@ -3107,10 +3151,10 @@ static void handle_stripe_dirtying(struct r5conf *conf, !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags)) && test_bit(R5_Insync, &dev->flags)) { - if ( - test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { - pr_debug("Read_old block " - "%d for r-m-w\n", i); + if (test_bit(STRIPE_PREREAD_ACTIVE, + &sh->state)) { + pr_debug("Read_old block %d for r-m-w\n", + i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); s->locked++; @@ -3133,10 +3177,9 @@ static void handle_stripe_dirtying(struct r5conf *conf, !(test_bit(R5_UPTODATE, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) { rcw++; - if (!test_bit(R5_Insync, &dev->flags)) - continue; /* it's a failed drive */ - if ( - test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { + if (test_bit(R5_Insync, &dev->flags) && + test_bit(STRIPE_PREREAD_ACTIVE, + &sh->state)) { pr_debug("Read_old block " "%d for Reconstruct\n", i); set_bit(R5_LOCKED, &dev->flags); @@ -5031,8 +5074,8 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); set_bit(STRIPE_SYNC_REQUESTED, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); - handle_stripe(sh); release_stripe(sh); return STRIPE_SECTORS; @@ -5072,7 +5115,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) /* already done this stripe */ continue; - sh = 
get_active_stripe(conf, sector, 0, 1, 0); + sh = get_active_stripe(conf, sector, 0, 1, 1); if (!sh) { /* failed to get a stripe - must wait */ @@ -5355,6 +5398,50 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, raid5_store_preread_threshold); static ssize_t +raid5_show_skip_copy(struct mddev *mddev, char *page) +{ + struct r5conf *conf = mddev->private; + if (conf) + return sprintf(page, "%d\n", conf->skip_copy); + else + return 0; +} + +static ssize_t +raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) +{ + struct r5conf *conf = mddev->private; + unsigned long new; + if (len >= PAGE_SIZE) + return -EINVAL; + if (!conf) + return -ENODEV; + + if (kstrtoul(page, 10, &new)) + return -EINVAL; + new = !!new; + if (new == conf->skip_copy) + return len; + + mddev_suspend(mddev); + conf->skip_copy = new; + if (new) + mddev->queue->backing_dev_info.capabilities |= + BDI_CAP_STABLE_WRITES; + else + mddev->queue->backing_dev_info.capabilities &= + ~BDI_CAP_STABLE_WRITES; + mddev_resume(mddev); + return len; +} + +static struct md_sysfs_entry +raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR, + raid5_show_skip_copy, + raid5_store_skip_copy); + + +static ssize_t stripe_cache_active_show(struct mddev *mddev, char *page) { struct r5conf *conf = mddev->private; @@ -5439,6 +5526,7 @@ static struct attribute *raid5_attrs[] = { &raid5_stripecache_active.attr, &raid5_preread_bypass_threshold.attr, &raid5_group_thread_cnt.attr, + &raid5_skip_copy.attr, NULL, }; static struct attribute_group raid5_attrs_group = { diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 01ad8ae8f578..bc72cd4be5f8 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -232,7 +232,7 @@ struct stripe_head { */ struct bio req, rreq; struct bio_vec vec, rvec; - struct page *page; + struct page *page, *orig_page; struct bio *toread, *read, *towrite, *written; sector_t sector; /* sector of this page */ unsigned long flags; @@ -299,6 +299,7 @@ enum r5dev_flags { * data in, and now is a good time to write it out. */ R5_Discard, /* Discard the stripe */ + R5_SkipCopy, /* Don't copy data from bio to stripe cache */ }; /* @@ -436,6 +437,7 @@ struct r5conf { atomic_t pending_full_writes; /* full write backlog */ int bypass_count; /* bypassed prereads */ int bypass_threshold; /* preread nice */ + int skip_copy; /* Don't copy data from bio to stripe cache */ struct list_head *last_hold; /* detect hold_list promotions */ atomic_t reshape_stripes; /* stripes with pending writes for reshape */ |
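The raid5 skip_copy work above lets a write bypass the copy into the stripe cache when a bio segment exactly covers a stripe page: the stripe borrows the bio's page, marks the device with R5_SkipCopy, and restores orig_page once the write completes; because the data is read straight from the caller's page, enabling the mode through the new skip_copy sysfs attribute also sets BDI_CAP_STABLE_WRITES. A simplified, stand-alone model of the predicate added to async_copy_data() (not the driver code itself):

#include <assert.h>
#include <stdbool.h>

#define STRIPE_SIZE 4096   /* one stripe page; PAGE_SIZE in the driver */

/* Only a segment that starts at offset 0 on both the bio side and the
 * stripe side and spans the whole page can be borrowed without a copy. */
static bool can_skip_copy(bool skip_copy_enabled, unsigned b_offset,
			  unsigned page_offset, unsigned clen)
{
	return skip_copy_enabled && b_offset == 0 && page_offset == 0 &&
	       clen == STRIPE_SIZE;
}

int main(void)
{
	assert(can_skip_copy(true, 0, 0, STRIPE_SIZE));    /* whole page, aligned: borrow it */
	assert(!can_skip_copy(true, 0, 0, 1024));          /* partial page: must memcpy */
	assert(!can_skip_copy(false, 0, 0, STRIPE_SIZE));  /* skip_copy disabled via sysfs */
	return 0;
}

Anything that does not satisfy this test still goes through async_memcpy(), and any device with R5_SkipCopy set is deliberately left !R5_UPTODATE, since the stripe cache page no longer holds the data.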