summaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/bitmap.c6
-rw-r--r--drivers/md/dm-bio-prison.c70
-rw-r--r--drivers/md/dm-bio-prison.h2
-rw-r--r--drivers/md/dm-crypt.c4
-rw-r--r--drivers/md/dm-era-target.c3
-rw-r--r--drivers/md/dm-io.c22
-rw-r--r--drivers/md/dm-mpath.c16
-rw-r--r--drivers/md/dm-snap.c67
-rw-r--r--drivers/md/dm-table.c5
-rw-r--r--drivers/md/dm-thin.c93
-rw-r--r--drivers/md/dm-zero.c4
-rw-r--r--drivers/md/dm.c101
-rw-r--r--drivers/md/md.c27
-rw-r--r--drivers/md/raid5.c158
-rw-r--r--drivers/md/raid5.h4
15 files changed, 407 insertions, 175 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 9a8e66ae04f5..67f8b31e2054 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -669,17 +669,13 @@ static inline unsigned long file_page_offset(struct bitmap_storage *store,
/*
* return a pointer to the page in the filemap that contains the given bit
*
- * this lookup is complicated by the fact that the bitmap sb might be exactly
- * 1 page (e.g., x86) or less than 1 page -- so the bitmap might start on page
- * 0 or page 1
*/
static inline struct page *filemap_get_page(struct bitmap_storage *store,
unsigned long chunk)
{
if (file_page_index(store, chunk) >= store->file_pages)
return NULL;
- return store->filemap[file_page_index(store, chunk)
- - file_page_index(store, 0)];
+ return store->filemap[file_page_index(store, chunk)];
}
static int bitmap_storage_alloc(struct bitmap_storage *store,
diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c
index 85f0b7074257..f752d12081ff 100644
--- a/drivers/md/dm-bio-prison.c
+++ b/drivers/md/dm-bio-prison.c
@@ -14,13 +14,17 @@
/*----------------------------------------------------------------*/
-struct dm_bio_prison {
+struct bucket {
spinlock_t lock;
+ struct hlist_head cells;
+};
+
+struct dm_bio_prison {
mempool_t *cell_pool;
unsigned nr_buckets;
unsigned hash_mask;
- struct hlist_head *cells;
+ struct bucket *buckets;
};
/*----------------------------------------------------------------*/
@@ -40,6 +44,12 @@ static uint32_t calc_nr_buckets(unsigned nr_cells)
static struct kmem_cache *_cell_cache;
+static void init_bucket(struct bucket *b)
+{
+ spin_lock_init(&b->lock);
+ INIT_HLIST_HEAD(&b->cells);
+}
+
/*
* @nr_cells should be the number of cells you want in use _concurrently_.
* Don't confuse it with the number of distinct keys.
@@ -49,13 +59,12 @@ struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells)
unsigned i;
uint32_t nr_buckets = calc_nr_buckets(nr_cells);
size_t len = sizeof(struct dm_bio_prison) +
- (sizeof(struct hlist_head) * nr_buckets);
+ (sizeof(struct bucket) * nr_buckets);
struct dm_bio_prison *prison = kmalloc(len, GFP_KERNEL);
if (!prison)
return NULL;
- spin_lock_init(&prison->lock);
prison->cell_pool = mempool_create_slab_pool(nr_cells, _cell_cache);
if (!prison->cell_pool) {
kfree(prison);
@@ -64,9 +73,9 @@ struct dm_bio_prison *dm_bio_prison_create(unsigned nr_cells)
prison->nr_buckets = nr_buckets;
prison->hash_mask = nr_buckets - 1;
- prison->cells = (struct hlist_head *) (prison + 1);
+ prison->buckets = (struct bucket *) (prison + 1);
for (i = 0; i < nr_buckets; i++)
- INIT_HLIST_HEAD(prison->cells + i);
+ init_bucket(prison->buckets + i);
return prison;
}
@@ -107,40 +116,44 @@ static int keys_equal(struct dm_cell_key *lhs, struct dm_cell_key *rhs)
(lhs->block == rhs->block);
}
-static struct dm_bio_prison_cell *__search_bucket(struct hlist_head *bucket,
+static struct bucket *get_bucket(struct dm_bio_prison *prison,
+ struct dm_cell_key *key)
+{
+ return prison->buckets + hash_key(prison, key);
+}
+
+static struct dm_bio_prison_cell *__search_bucket(struct bucket *b,
struct dm_cell_key *key)
{
struct dm_bio_prison_cell *cell;
- hlist_for_each_entry(cell, bucket, list)
+ hlist_for_each_entry(cell, &b->cells, list)
if (keys_equal(&cell->key, key))
return cell;
return NULL;
}
-static void __setup_new_cell(struct dm_bio_prison *prison,
+static void __setup_new_cell(struct bucket *b,
struct dm_cell_key *key,
struct bio *holder,
- uint32_t hash,
struct dm_bio_prison_cell *cell)
{
memcpy(&cell->key, key, sizeof(cell->key));
cell->holder = holder;
bio_list_init(&cell->bios);
- hlist_add_head(&cell->list, prison->cells + hash);
+ hlist_add_head(&cell->list, &b->cells);
}
-static int __bio_detain(struct dm_bio_prison *prison,
+static int __bio_detain(struct bucket *b,
struct dm_cell_key *key,
struct bio *inmate,
struct dm_bio_prison_cell *cell_prealloc,
struct dm_bio_prison_cell **cell_result)
{
- uint32_t hash = hash_key(prison, key);
struct dm_bio_prison_cell *cell;
- cell = __search_bucket(prison->cells + hash, key);
+ cell = __search_bucket(b, key);
if (cell) {
if (inmate)
bio_list_add(&cell->bios, inmate);
@@ -148,7 +161,7 @@ static int __bio_detain(struct dm_bio_prison *prison,
return 1;
}
- __setup_new_cell(prison, key, inmate, hash, cell_prealloc);
+ __setup_new_cell(b, key, inmate, cell_prealloc);
*cell_result = cell_prealloc;
return 0;
}
@@ -161,10 +174,11 @@ static int bio_detain(struct dm_bio_prison *prison,
{
int r;
unsigned long flags;
+ struct bucket *b = get_bucket(prison, key);
- spin_lock_irqsave(&prison->lock, flags);
- r = __bio_detain(prison, key, inmate, cell_prealloc, cell_result);
- spin_unlock_irqrestore(&prison->lock, flags);
+ spin_lock_irqsave(&b->lock, flags);
+ r = __bio_detain(b, key, inmate, cell_prealloc, cell_result);
+ spin_unlock_irqrestore(&b->lock, flags);
return r;
}
@@ -208,10 +222,11 @@ void dm_cell_release(struct dm_bio_prison *prison,
struct bio_list *bios)
{
unsigned long flags;
+ struct bucket *b = get_bucket(prison, &cell->key);
- spin_lock_irqsave(&prison->lock, flags);
+ spin_lock_irqsave(&b->lock, flags);
__cell_release(cell, bios);
- spin_unlock_irqrestore(&prison->lock, flags);
+ spin_unlock_irqrestore(&b->lock, flags);
}
EXPORT_SYMBOL_GPL(dm_cell_release);
@@ -230,28 +245,25 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison,
struct bio_list *inmates)
{
unsigned long flags;
+ struct bucket *b = get_bucket(prison, &cell->key);
- spin_lock_irqsave(&prison->lock, flags);
+ spin_lock_irqsave(&b->lock, flags);
__cell_release_no_holder(cell, inmates);
- spin_unlock_irqrestore(&prison->lock, flags);
+ spin_unlock_irqrestore(&b->lock, flags);
}
EXPORT_SYMBOL_GPL(dm_cell_release_no_holder);
void dm_cell_error(struct dm_bio_prison *prison,
- struct dm_bio_prison_cell *cell)
+ struct dm_bio_prison_cell *cell, int error)
{
struct bio_list bios;
struct bio *bio;
- unsigned long flags;
bio_list_init(&bios);
-
- spin_lock_irqsave(&prison->lock, flags);
- __cell_release(cell, &bios);
- spin_unlock_irqrestore(&prison->lock, flags);
+ dm_cell_release(prison, cell, &bios);
while ((bio = bio_list_pop(&bios)))
- bio_io_error(bio);
+ bio_endio(bio, error);
}
EXPORT_SYMBOL_GPL(dm_cell_error);
diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h
index 3f833190eadf..6805a142b750 100644
--- a/drivers/md/dm-bio-prison.h
+++ b/drivers/md/dm-bio-prison.h
@@ -85,7 +85,7 @@ void dm_cell_release_no_holder(struct dm_bio_prison *prison,
struct dm_bio_prison_cell *cell,
struct bio_list *inmates);
void dm_cell_error(struct dm_bio_prison *prison,
- struct dm_bio_prison_cell *cell);
+ struct dm_bio_prison_cell *cell, int error);
/*----------------------------------------------------------------*/
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 53b213226c01..4cba2d808afb 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2003 Christophe Saout <christophe@saout.de>
+ * Copyright (C) 2003 Jana Saout <jana@saout.de>
* Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
* Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved.
* Copyright (C) 2013 Milan Broz <gmazyland@gmail.com>
@@ -1996,6 +1996,6 @@ static void __exit dm_crypt_exit(void)
module_init(dm_crypt_init);
module_exit(dm_crypt_exit);
-MODULE_AUTHOR("Christophe Saout <christophe@saout.de>");
+MODULE_AUTHOR("Jana Saout <jana@saout.de>");
MODULE_DESCRIPTION(DM_NAME " target for transparent encryption / decryption");
MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c
index 414dad4cb49b..ad913cd4aded 100644
--- a/drivers/md/dm-era-target.c
+++ b/drivers/md/dm-era-target.c
@@ -1391,7 +1391,8 @@ static int era_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
static void era_destroy(struct era *era)
{
- metadata_close(era->md);
+ if (era->md)
+ metadata_close(era->md);
if (era->wq)
destroy_workqueue(era->wq);
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index 3842ac738f98..db404a0f7e2c 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -10,6 +10,7 @@
#include <linux/device-mapper.h>
#include <linux/bio.h>
+#include <linux/completion.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/sched.h>
@@ -32,7 +33,7 @@ struct dm_io_client {
struct io {
unsigned long error_bits;
atomic_t count;
- struct task_struct *sleeper;
+ struct completion *wait;
struct dm_io_client *client;
io_notify_fn callback;
void *context;
@@ -121,8 +122,8 @@ static void dec_count(struct io *io, unsigned int region, int error)
invalidate_kernel_vmap_range(io->vma_invalidate_address,
io->vma_invalidate_size);
- if (io->sleeper)
- wake_up_process(io->sleeper);
+ if (io->wait)
+ complete(io->wait);
else {
unsigned long r = io->error_bits;
@@ -387,6 +388,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
*/
volatile char io_[sizeof(struct io) + __alignof__(struct io) - 1];
struct io *io = (struct io *)PTR_ALIGN(&io_, __alignof__(struct io));
+ DECLARE_COMPLETION_ONSTACK(wait);
if (num_regions > 1 && (rw & RW_MASK) != WRITE) {
WARN_ON(1);
@@ -395,7 +397,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
io->error_bits = 0;
atomic_set(&io->count, 1); /* see dispatch_io() */
- io->sleeper = current;
+ io->wait = &wait;
io->client = client;
io->vma_invalidate_address = dp->vma_invalidate_address;
@@ -403,15 +405,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
dispatch_io(rw, num_regions, where, dp, io, 1);
- while (1) {
- set_current_state(TASK_UNINTERRUPTIBLE);
-
- if (!atomic_read(&io->count))
- break;
-
- io_schedule();
- }
- set_current_state(TASK_RUNNING);
+ wait_for_completion_io(&wait);
if (error_bits)
*error_bits = io->error_bits;
@@ -434,7 +428,7 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
io = mempool_alloc(client->pool, GFP_NOIO);
io->error_bits = 0;
atomic_set(&io->count, 1); /* see dispatch_io() */
- io->sleeper = NULL;
+ io->wait = NULL;
io->client = client;
io->callback = fn;
io->context = context;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index ebfa411d1a7d..f4167b013d99 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1242,17 +1242,8 @@ static int do_end_io(struct multipath *m, struct request *clone,
if (!error && !clone->errors)
return 0; /* I/O complete */
- if (noretry_error(error)) {
- if ((clone->cmd_flags & REQ_WRITE_SAME) &&
- !clone->q->limits.max_write_same_sectors) {
- struct queue_limits *limits;
-
- /* device doesn't really support WRITE SAME, disable it */
- limits = dm_get_queue_limits(dm_table_get_md(m->ti->table));
- limits->max_write_same_sectors = 0;
- }
+ if (noretry_error(error))
return error;
- }
if (mpio->pgpath)
fail_path(mpio->pgpath);
@@ -1620,8 +1611,9 @@ static int multipath_busy(struct dm_target *ti)
spin_lock_irqsave(&m->lock, flags);
- /* pg_init in progress, requeue until done */
- if (!pg_ready(m)) {
+ /* pg_init in progress or no paths available */
+ if (m->pg_init_in_progress ||
+ (!m->nr_valid_paths && m->queue_if_no_path)) {
busy = 1;
goto out;
}
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 8e0caed0bf74..5bd2290cfb1e 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -2141,6 +2141,11 @@ static int origin_write_extent(struct dm_snapshot *merging_snap,
* Origin: maps a linear range of a device, with hooks for snapshotting.
*/
+struct dm_origin {
+ struct dm_dev *dev;
+ unsigned split_boundary;
+};
+
/*
* Construct an origin mapping: <dev_path>
* The context for an origin is merely a 'struct dm_dev *'
@@ -2149,41 +2154,65 @@ static int origin_write_extent(struct dm_snapshot *merging_snap,
static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
int r;
- struct dm_dev *dev;
+ struct dm_origin *o;
if (argc != 1) {
ti->error = "origin: incorrect number of arguments";
return -EINVAL;
}
- r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev);
+ o = kmalloc(sizeof(struct dm_origin), GFP_KERNEL);
+ if (!o) {
+ ti->error = "Cannot allocate private origin structure";
+ r = -ENOMEM;
+ goto bad_alloc;
+ }
+
+ r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &o->dev);
if (r) {
ti->error = "Cannot get target device";
- return r;
+ goto bad_open;
}
- ti->private = dev;
+ ti->private = o;
ti->num_flush_bios = 1;
return 0;
+
+bad_open:
+ kfree(o);
+bad_alloc:
+ return r;
}
static void origin_dtr(struct dm_target *ti)
{
- struct dm_dev *dev = ti->private;
- dm_put_device(ti, dev);
+ struct dm_origin *o = ti->private;
+ dm_put_device(ti, o->dev);
+ kfree(o);
}
static int origin_map(struct dm_target *ti, struct bio *bio)
{
- struct dm_dev *dev = ti->private;
- bio->bi_bdev = dev->bdev;
+ struct dm_origin *o = ti->private;
+ unsigned available_sectors;
- if (bio->bi_rw & REQ_FLUSH)
+ bio->bi_bdev = o->dev->bdev;
+
+ if (unlikely(bio->bi_rw & REQ_FLUSH))
return DM_MAPIO_REMAPPED;
+ if (bio_rw(bio) != WRITE)
+ return DM_MAPIO_REMAPPED;
+
+ available_sectors = o->split_boundary -
+ ((unsigned)bio->bi_iter.bi_sector & (o->split_boundary - 1));
+
+ if (bio_sectors(bio) > available_sectors)
+ dm_accept_partial_bio(bio, available_sectors);
+
/* Only tell snapshots if this is a write */
- return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED;
+ return do_origin(o->dev, bio);
}
/*
@@ -2192,15 +2221,15 @@ static int origin_map(struct dm_target *ti, struct bio *bio)
*/
static void origin_resume(struct dm_target *ti)
{
- struct dm_dev *dev = ti->private;
+ struct dm_origin *o = ti->private;
- ti->max_io_len = get_origin_minimum_chunksize(dev->bdev);
+ o->split_boundary = get_origin_minimum_chunksize(o->dev->bdev);
}
static void origin_status(struct dm_target *ti, status_type_t type,
unsigned status_flags, char *result, unsigned maxlen)
{
- struct dm_dev *dev = ti->private;
+ struct dm_origin *o = ti->private;
switch (type) {
case STATUSTYPE_INFO:
@@ -2208,7 +2237,7 @@ static void origin_status(struct dm_target *ti, status_type_t type,
break;
case STATUSTYPE_TABLE:
- snprintf(result, maxlen, "%s", dev->name);
+ snprintf(result, maxlen, "%s", o->dev->name);
break;
}
}
@@ -2216,13 +2245,13 @@ static void origin_status(struct dm_target *ti, status_type_t type,
static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
struct bio_vec *biovec, int max_size)
{
- struct dm_dev *dev = ti->private;
- struct request_queue *q = bdev_get_queue(dev->bdev);
+ struct dm_origin *o = ti->private;
+ struct request_queue *q = bdev_get_queue(o->dev->bdev);
if (!q->merge_bvec_fn)
return max_size;
- bvm->bi_bdev = dev->bdev;
+ bvm->bi_bdev = o->dev->bdev;
return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}
@@ -2230,9 +2259,9 @@ static int origin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
static int origin_iterate_devices(struct dm_target *ti,
iterate_devices_callout_fn fn, void *data)
{
- struct dm_dev *dev = ti->private;
+ struct dm_origin *o = ti->private;
- return fn(ti, dev, 0, ti->len, data);
+ return fn(ti, o->dev, 0, ti->len, data);
}
static struct target_type origin_target = {
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 50601ec7017a..5f59f1e3e5b1 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -465,8 +465,8 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
}
EXPORT_SYMBOL(dm_get_device);
-int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
- sector_t start, sector_t len, void *data)
+static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
+ sector_t start, sector_t len, void *data)
{
struct queue_limits *limits = data;
struct block_device *bdev = dev->bdev;
@@ -499,7 +499,6 @@ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
(unsigned int) (PAGE_SIZE >> 9));
return 0;
}
-EXPORT_SYMBOL_GPL(dm_set_device_limits);
/*
* Decrement a device's use count and remove it if necessary.
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 242ac2ea5f29..fc9c848a60c9 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -310,13 +310,18 @@ static void cell_defer_no_holder_no_free(struct thin_c *tc,
wake_worker(pool);
}
-static void cell_error(struct pool *pool,
- struct dm_bio_prison_cell *cell)
+static void cell_error_with_code(struct pool *pool,
+ struct dm_bio_prison_cell *cell, int error_code)
{
- dm_cell_error(pool->prison, cell);
+ dm_cell_error(pool->prison, cell, error_code);
dm_bio_prison_free_cell(pool->prison, cell);
}
+static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell)
+{
+ cell_error_with_code(pool, cell, -EIO);
+}
+
/*----------------------------------------------------------------*/
/*
@@ -1027,7 +1032,7 @@ static void retry_on_resume(struct bio *bio)
spin_unlock_irqrestore(&tc->lock, flags);
}
-static bool should_error_unserviceable_bio(struct pool *pool)
+static int should_error_unserviceable_bio(struct pool *pool)
{
enum pool_mode m = get_pool_mode(pool);
@@ -1035,25 +1040,27 @@ static bool should_error_unserviceable_bio(struct pool *pool)
case PM_WRITE:
/* Shouldn't get here */
DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
- return true;
+ return -EIO;
case PM_OUT_OF_DATA_SPACE:
- return pool->pf.error_if_no_space;
+ return pool->pf.error_if_no_space ? -ENOSPC : 0;
case PM_READ_ONLY:
case PM_FAIL:
- return true;
+ return -EIO;
default:
/* Shouldn't get here */
DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
- return true;
+ return -EIO;
}
}
static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
{
- if (should_error_unserviceable_bio(pool))
- bio_io_error(bio);
+ int error = should_error_unserviceable_bio(pool);
+
+ if (error)
+ bio_endio(bio, error);
else
retry_on_resume(bio);
}
@@ -1062,18 +1069,21 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
{
struct bio *bio;
struct bio_list bios;
+ int error;
- if (should_error_unserviceable_bio(pool)) {
- cell_error(pool, cell);
+ error = should_error_unserviceable_bio(pool);
+ if (error) {
+ cell_error_with_code(pool, cell, error);
return;
}
bio_list_init(&bios);
cell_release(pool, cell, &bios);
- if (should_error_unserviceable_bio(pool))
+ error = should_error_unserviceable_bio(pool);
+ if (error)
while ((bio = bio_list_pop(&bios)))
- bio_io_error(bio);
+ bio_endio(bio, error);
else
while ((bio = bio_list_pop(&bios)))
retry_on_resume(bio);
@@ -1610,47 +1620,63 @@ static void do_no_space_timeout(struct work_struct *ws)
/*----------------------------------------------------------------*/
-struct noflush_work {
+struct pool_work {
struct work_struct worker;
- struct thin_c *tc;
+ struct completion complete;
+};
+
+static struct pool_work *to_pool_work(struct work_struct *ws)
+{
+ return container_of(ws, struct pool_work, worker);
+}
- atomic_t complete;
- wait_queue_head_t wait;
+static void pool_work_complete(struct pool_work *pw)
+{
+ complete(&pw->complete);
+}
+
+static void pool_work_wait(struct pool_work *pw, struct pool *pool,
+ void (*fn)(struct work_struct *))
+{
+ INIT_WORK_ONSTACK(&pw->worker, fn);
+ init_completion(&pw->complete);
+ queue_work(pool->wq, &pw->worker);
+ wait_for_completion(&pw->complete);
+}
+
+/*----------------------------------------------------------------*/
+
+struct noflush_work {
+ struct pool_work pw;
+ struct thin_c *tc;
};
-static void complete_noflush_work(struct noflush_work *w)
+static struct noflush_work *to_noflush(struct work_struct *ws)
{
- atomic_set(&w->complete, 1);
- wake_up(&w->wait);
+ return container_of(to_pool_work(ws), struct noflush_work, pw);
}
static void do_noflush_start(struct work_struct *ws)
{
- struct noflush_work *w = container_of(ws, struct noflush_work, worker);
+ struct noflush_work *w = to_noflush(ws);
w->tc->requeue_mode = true;
requeue_io(w->tc);
- complete_noflush_work(w);
+ pool_work_complete(&w->pw);
}
static void do_noflush_stop(struct work_struct *ws)
{
- struct noflush_work *w = container_of(ws, struct noflush_work, worker);
+ struct noflush_work *w = to_noflush(ws);
w->tc->requeue_mode = false;
- complete_noflush_work(w);
+ pool_work_complete(&w->pw);
}
static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
{
struct noflush_work w;
- INIT_WORK_ONSTACK(&w.worker, fn);
w.tc = tc;
- atomic_set(&w.complete, 0);
- init_waitqueue_head(&w.wait);
-
- queue_work(tc->pool->wq, &w.worker);
-
- wait_event(w.wait, atomic_read(&w.complete));
+ pool_work_wait(&w.pw, tc->pool, fn);
}
/*----------------------------------------------------------------*/
@@ -3068,7 +3094,8 @@ static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
*/
if (pt->adjusted_pf.discard_passdown) {
data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
- limits->discard_granularity = data_limits->discard_granularity;
+ limits->discard_granularity = max(data_limits->discard_granularity,
+ pool->sectors_per_block << SECTOR_SHIFT);
} else
limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
}
diff --git a/drivers/md/dm-zero.c b/drivers/md/dm-zero.c
index c99003e0d47a..b9a64bbce304 100644
--- a/drivers/md/dm-zero.c
+++ b/drivers/md/dm-zero.c
@@ -1,5 +1,5 @@
/*
- * Copyright (C) 2003 Christophe Saout <christophe@saout.de>
+ * Copyright (C) 2003 Jana Saout <jana@saout.de>
*
* This file is released under the GPL.
*/
@@ -79,6 +79,6 @@ static void __exit dm_zero_exit(void)
module_init(dm_zero_init)
module_exit(dm_zero_exit)
-MODULE_AUTHOR("Christophe Saout <christophe@saout.de>");
+MODULE_AUTHOR("Jana Saout <jana@saout.de>");
MODULE_DESCRIPTION(DM_NAME " dummy target returning zeros");
MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index aa9e093343d4..32b958dbc499 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -54,6 +54,8 @@ static void do_deferred_remove(struct work_struct *w);
static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
+static struct workqueue_struct *deferred_remove_workqueue;
+
/*
* For bio-based dm.
* One of these is allocated per bio.
@@ -276,16 +278,24 @@ static int __init local_init(void)
if (r)
goto out_free_rq_tio_cache;
+ deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
+ if (!deferred_remove_workqueue) {
+ r = -ENOMEM;
+ goto out_uevent_exit;
+ }
+
_major = major;
r = register_blkdev(_major, _name);
if (r < 0)
- goto out_uevent_exit;
+ goto out_free_workqueue;
if (!_major)
_major = r;
return 0;
+out_free_workqueue:
+ destroy_workqueue(deferred_remove_workqueue);
out_uevent_exit:
dm_uevent_exit();
out_free_rq_tio_cache:
@@ -299,6 +309,7 @@ out_free_io_cache:
static void local_exit(void)
{
flush_scheduled_work();
+ destroy_workqueue(deferred_remove_workqueue);
kmem_cache_destroy(_rq_tio_cache);
kmem_cache_destroy(_io_cache);
@@ -407,7 +418,7 @@ static void dm_blk_close(struct gendisk *disk, fmode_t mode)
if (atomic_dec_and_test(&md->open_count) &&
(test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
- schedule_work(&deferred_remove_work);
+ queue_work(deferred_remove_workqueue, &deferred_remove_work);
dm_put(md);
@@ -755,6 +766,14 @@ static void dec_pending(struct dm_io *io, int error)
}
}
+static void disable_write_same(struct mapped_device *md)
+{
+ struct queue_limits *limits = dm_get_queue_limits(md);
+
+ /* device doesn't really support WRITE SAME, disable it */
+ limits->max_write_same_sectors = 0;
+}
+
static void clone_endio(struct bio *bio, int error)
{
int r = 0;
@@ -783,6 +802,10 @@ static void clone_endio(struct bio *bio, int error)
}
}
+ if (unlikely(r == -EREMOTEIO && (bio->bi_rw & REQ_WRITE_SAME) &&
+ !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors))
+ disable_write_same(md);
+
free_tio(md, tio);
dec_pending(io, error);
}
@@ -977,6 +1000,10 @@ static void dm_done(struct request *clone, int error, bool mapped)
r = rq_end_io(tio->ti, clone, error, &tio->info);
}
+ if (unlikely(r == -EREMOTEIO && (clone->cmd_flags & REQ_WRITE_SAME) &&
+ !clone->q->limits.max_write_same_sectors))
+ disable_write_same(tio->md);
+
if (r <= 0)
/* The target wants to complete the I/O */
dm_end_request(clone, r);
@@ -1110,6 +1137,46 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
}
EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
+/*
+ * A target may call dm_accept_partial_bio only from the map routine. It is
+ * allowed for all bio types except REQ_FLUSH.
+ *
+ * dm_accept_partial_bio informs the dm that the target only wants to process
+ * additional n_sectors sectors of the bio and the rest of the data should be
+ * sent in a next bio.
+ *
+ * A diagram that explains the arithmetics:
+ * +--------------------+---------------+-------+
+ * | 1 | 2 | 3 |
+ * +--------------------+---------------+-------+
+ *
+ * <-------------- *tio->len_ptr --------------->
+ * <------- bi_size ------->
+ * <-- n_sectors -->
+ *
+ * Region 1 was already iterated over with bio_advance or similar function.
+ * (it may be empty if the target doesn't use bio_advance)
+ * Region 2 is the remaining bio size that the target wants to process.
+ * (it may be empty if region 1 is non-empty, although there is no reason
+ * to make it empty)
+ * The target requires that region 3 is to be sent in the next bio.
+ *
+ * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
+ * the partially processed part (the sum of regions 1+2) must be the same for all
+ * copies of the bio.
+ */
+void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
+{
+ struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
+ unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
+ BUG_ON(bio->bi_rw & REQ_FLUSH);
+ BUG_ON(bi_size > *tio->len_ptr);
+ BUG_ON(n_sectors > bi_size);
+ *tio->len_ptr -= bi_size - n_sectors;
+ bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
+}
+EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
+
static void __map_bio(struct dm_target_io *tio)
{
int r;
@@ -1152,10 +1219,10 @@ struct clone_info {
struct bio *bio;
struct dm_io *io;
sector_t sector;
- sector_t sector_count;
+ unsigned sector_count;
};
-static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len)
+static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
{
bio->bi_iter.bi_sector = sector;
bio->bi_iter.bi_size = to_bytes(len);
@@ -1200,11 +1267,13 @@ static struct dm_target_io *alloc_tio(struct clone_info *ci,
static void __clone_and_map_simple_bio(struct clone_info *ci,
struct dm_target *ti,
- unsigned target_bio_nr, sector_t len)
+ unsigned target_bio_nr, unsigned *len)
{
struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs, target_bio_nr);
struct bio *clone = &tio->clone;
+ tio->len_ptr = len;
+
/*
* Discard requests require the bio's inline iovecs be initialized.
* ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
@@ -1212,13 +1281,13 @@ static void __clone_and_map_simple_bio(struct clone_info *ci,
*/
__bio_clone_fast(clone, ci->bio);
if (len)
- bio_setup_sector(clone, ci->sector, len);
+ bio_setup_sector(clone, ci->sector, *len);
__map_bio(tio);
}
static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
- unsigned num_bios, sector_t len)
+ unsigned num_bios, unsigned *len)
{
unsigned target_bio_nr;
@@ -1233,13 +1302,13 @@ static int __send_empty_flush(struct clone_info *ci)
BUG_ON(bio_has_data(ci->bio));
while ((ti = dm_table_get_target(ci->map, target_nr++)))
- __send_duplicate_bios(ci, ti, ti->num_flush_bios, 0);
+ __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
return 0;
}
static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
- sector_t sector, unsigned len)
+ sector_t sector, unsigned *len)
{
struct bio *bio = ci->bio;
struct dm_target_io *tio;
@@ -1254,7 +1323,8 @@ static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti
for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
tio = alloc_tio(ci, ti, 0, target_bio_nr);
- clone_bio(tio, bio, sector, len);
+ tio->len_ptr = len;
+ clone_bio(tio, bio, sector, *len);
__map_bio(tio);
}
}
@@ -1283,7 +1353,7 @@ static int __send_changing_extent_only(struct clone_info *ci,
is_split_required_fn is_split_required)
{
struct dm_target *ti;
- sector_t len;
+ unsigned len;
unsigned num_bios;
do {
@@ -1302,11 +1372,11 @@ static int __send_changing_extent_only(struct clone_info *ci,
return -EOPNOTSUPP;
if (is_split_required && !is_split_required(ti))
- len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
+ len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
else
- len = min(ci->sector_count, max_io_len(ci->sector, ti));
+ len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
- __send_duplicate_bios(ci, ti, num_bios, len);
+ __send_duplicate_bios(ci, ti, num_bios, &len);
ci->sector += len;
} while (ci->sector_count -= len);
@@ -1345,7 +1415,7 @@ static int __split_and_process_non_flush(struct clone_info *ci)
len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
- __clone_and_map_data_bio(ci, ti, ci->sector, len);
+ __clone_and_map_data_bio(ci, ti, ci->sector, &len);
ci->sector += len;
ci->sector_count -= len;
@@ -1439,7 +1509,6 @@ static int dm_merge_bvec(struct request_queue *q,
* just one page.
*/
else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
-
max_size = 0;
out:
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 2382cfc9bb3f..32fc19c540d4 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3448,6 +3448,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
mddev->level = LEVEL_NONE;
return rv;
}
+ if (mddev->ro)
+ return -EROFS;
/* request to change the personality. Need to ensure:
* - array is not engaged in resync/recovery/reshape
@@ -3634,6 +3636,8 @@ layout_store(struct mddev *mddev, const char *buf, size_t len)
int err;
if (mddev->pers->check_reshape == NULL)
return -EBUSY;
+ if (mddev->ro)
+ return -EROFS;
mddev->new_layout = n;
err = mddev->pers->check_reshape(mddev);
if (err) {
@@ -3723,6 +3727,8 @@ chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
int err;
if (mddev->pers->check_reshape == NULL)
return -EBUSY;
+ if (mddev->ro)
+ return -EROFS;
mddev->new_chunk_sectors = n >> 9;
err = mddev->pers->check_reshape(mddev);
if (err) {
@@ -5593,7 +5599,7 @@ static int get_array_info(struct mddev * mddev, void __user * arg)
if (mddev->in_sync)
info.state = (1<<MD_SB_CLEAN);
if (mddev->bitmap && mddev->bitmap_info.offset)
- info.state = (1<<MD_SB_BITMAP_PRESENT);
+ info.state |= (1<<MD_SB_BITMAP_PRESENT);
info.active_disks = insync;
info.working_disks = working;
info.failed_disks = failed;
@@ -6135,6 +6141,8 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
*/
if (mddev->sync_thread)
return -EBUSY;
+ if (mddev->ro)
+ return -EROFS;
rdev_for_each(rdev, mddev) {
sector_t avail = rdev->sectors;
@@ -6157,6 +6165,8 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
/* change the number of raid disks */
if (mddev->pers->check_reshape == NULL)
return -EINVAL;
+ if (mddev->ro)
+ return -EROFS;
if (raid_disks <= 0 ||
(mddev->max_disks && raid_disks >= mddev->max_disks))
return -EINVAL;
@@ -7491,6 +7501,19 @@ void md_do_sync(struct md_thread *thread)
rdev->recovery_offset < j)
j = rdev->recovery_offset;
rcu_read_unlock();
+
+ /* If there is a bitmap, we need to make sure all
+ * writes that started before we added a spare
+ * complete before we start doing a recovery.
+ * Otherwise the write might complete and (via
+ * bitmap_endwrite) set a bit in the bitmap after the
+ * recovery has checked that bit and skipped that
+ * region.
+ */
+ if (mddev->bitmap) {
+ mddev->pers->quiesce(mddev, 1);
+ mddev->pers->quiesce(mddev, 0);
+ }
}
printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
@@ -8333,7 +8356,7 @@ static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
if (a < s) {
/* we need to split this range */
if (bb->count >= MD_MAX_BADBLOCKS) {
- rv = 0;
+ rv = -ENOSPC;
goto out;
}
memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2afef4ec9312..6234b2e84587 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -292,9 +292,12 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
BUG_ON(atomic_read(&conf->active_stripes)==0);
if (test_bit(STRIPE_HANDLE, &sh->state)) {
if (test_bit(STRIPE_DELAYED, &sh->state) &&
- !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+ !test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
list_add_tail(&sh->lru, &conf->delayed_list);
- else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
+ if (atomic_read(&conf->preread_active_stripes)
+ < IO_THRESHOLD)
+ md_wakeup_thread(conf->mddev->thread);
+ } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
sh->bm_seq - conf->seq_write > 0)
list_add_tail(&sh->lru, &conf->bitmap_list);
else {
@@ -413,6 +416,11 @@ static void release_stripe(struct stripe_head *sh)
int hash;
bool wakeup;
+ /* Avoid release_list until the last reference.
+ */
+ if (atomic_add_unless(&sh->count, -1, 1))
+ return;
+
if (unlikely(!conf->mddev->thread) ||
test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state))
goto slow_path;
@@ -479,6 +487,7 @@ static void shrink_buffers(struct stripe_head *sh)
int num = sh->raid_conf->pool_size;
for (i = 0; i < num ; i++) {
+ WARN_ON(sh->dev[i].page != sh->dev[i].orig_page);
p = sh->dev[i].page;
if (!p)
continue;
@@ -499,6 +508,7 @@ static int grow_buffers(struct stripe_head *sh)
return 1;
}
sh->dev[i].page = page;
+ sh->dev[i].orig_page = page;
}
return 0;
}
@@ -855,6 +865,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
bi->bi_rw |= REQ_NOMERGE;
+ if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
+ WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
+ sh->dev[i].vec.bv_page = sh->dev[i].page;
bi->bi_vcnt = 1;
bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
bi->bi_io_vec[0].bv_offset = 0;
@@ -899,6 +912,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
else
rbi->bi_iter.bi_sector = (sh->sector
+ rrdev->data_offset);
+ if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
+ WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
+ sh->dev[i].rvec.bv_page = sh->dev[i].page;
rbi->bi_vcnt = 1;
rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
rbi->bi_io_vec[0].bv_offset = 0;
@@ -927,8 +943,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
}
static struct dma_async_tx_descriptor *
-async_copy_data(int frombio, struct bio *bio, struct page *page,
- sector_t sector, struct dma_async_tx_descriptor *tx)
+async_copy_data(int frombio, struct bio *bio, struct page **page,
+ sector_t sector, struct dma_async_tx_descriptor *tx,
+ struct stripe_head *sh)
{
struct bio_vec bvl;
struct bvec_iter iter;
@@ -965,11 +982,16 @@ async_copy_data(int frombio, struct bio *bio, struct page *page,
if (clen > 0) {
b_offset += bvl.bv_offset;
bio_page = bvl.bv_page;
- if (frombio)
- tx = async_memcpy(page, bio_page, page_offset,
+ if (frombio) {
+ if (sh->raid_conf->skip_copy &&
+ b_offset == 0 && page_offset == 0 &&
+ clen == STRIPE_SIZE)
+ *page = bio_page;
+ else
+ tx = async_memcpy(*page, bio_page, page_offset,
b_offset, clen, &submit);
- else
- tx = async_memcpy(bio_page, page, b_offset,
+ } else
+ tx = async_memcpy(bio_page, *page, b_offset,
page_offset, clen, &submit);
}
/* chain the operations */
@@ -1045,8 +1067,8 @@ static void ops_run_biofill(struct stripe_head *sh)
spin_unlock_irq(&sh->stripe_lock);
while (rbi && rbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
- tx = async_copy_data(0, rbi, dev->page,
- dev->sector, tx);
+ tx = async_copy_data(0, rbi, &dev->page,
+ dev->sector, tx, sh);
rbi = r5_next_bio(rbi, dev->sector);
}
}
@@ -1384,6 +1406,7 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
BUG_ON(dev->written);
wbi = dev->written = chosen;
spin_unlock_irq(&sh->stripe_lock);
+ WARN_ON(dev->page != dev->orig_page);
while (wbi && wbi->bi_iter.bi_sector <
dev->sector + STRIPE_SECTORS) {
@@ -1393,9 +1416,15 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
set_bit(R5_SyncIO, &dev->flags);
if (wbi->bi_rw & REQ_DISCARD)
set_bit(R5_Discard, &dev->flags);
- else
- tx = async_copy_data(1, wbi, dev->page,
- dev->sector, tx);
+ else {
+ tx = async_copy_data(1, wbi, &dev->page,
+ dev->sector, tx, sh);
+ if (dev->page != dev->orig_page) {
+ set_bit(R5_SkipCopy, &dev->flags);
+ clear_bit(R5_UPTODATE, &dev->flags);
+ clear_bit(R5_OVERWRITE, &dev->flags);
+ }
+ }
wbi = r5_next_bio(wbi, dev->sector);
}
}
@@ -1426,7 +1455,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
struct r5dev *dev = &sh->dev[i];
if (dev->written || i == pd_idx || i == qd_idx) {
- if (!discard)
+ if (!discard && !test_bit(R5_SkipCopy, &dev->flags))
set_bit(R5_UPTODATE, &dev->flags);
if (fua)
set_bit(R5_WantFUA, &dev->flags);
@@ -1839,8 +1868,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
osh = get_free_stripe(conf, hash);
unlock_device_hash_lock(conf, hash);
atomic_set(&nsh->count, 1);
- for(i=0; i<conf->pool_size; i++)
+ for(i=0; i<conf->pool_size; i++) {
nsh->dev[i].page = osh->dev[i].page;
+ nsh->dev[i].orig_page = osh->dev[i].page;
+ }
for( ; i<newsize; i++)
nsh->dev[i].page = NULL;
nsh->hash_lock_index = hash;
@@ -1896,6 +1927,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
if (nsh->dev[i].page == NULL) {
struct page *p = alloc_page(GFP_NOIO);
nsh->dev[i].page = p;
+ nsh->dev[i].orig_page = p;
if (!p)
err = -ENOMEM;
}
@@ -2133,24 +2165,20 @@ static void raid5_end_write_request(struct bio *bi, int error)
}
static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
-
+
static void raid5_build_block(struct stripe_head *sh, int i, int previous)
{
struct r5dev *dev = &sh->dev[i];
bio_init(&dev->req);
dev->req.bi_io_vec = &dev->vec;
- dev->req.bi_vcnt++;
- dev->req.bi_max_vecs++;
+ dev->req.bi_max_vecs = 1;
dev->req.bi_private = sh;
- dev->vec.bv_page = dev->page;
bio_init(&dev->rreq);
dev->rreq.bi_io_vec = &dev->rvec;
- dev->rreq.bi_vcnt++;
- dev->rreq.bi_max_vecs++;
+ dev->rreq.bi_max_vecs = 1;
dev->rreq.bi_private = sh;
- dev->rvec.bv_page = dev->page;
dev->flags = 0;
dev->sector = compute_blocknr(sh, i, previous);
@@ -2750,6 +2778,11 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
/* and fail all 'written' */
bi = sh->dev[i].written;
sh->dev[i].written = NULL;
+ if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) {
+ WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
+ sh->dev[i].page = sh->dev[i].orig_page;
+ }
+
if (bi) bitmap_end = 1;
while (bi && bi->bi_iter.bi_sector <
sh->dev[i].sector + STRIPE_SECTORS) {
@@ -2886,8 +2919,11 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
(s->failed >= 1 && fdev[0]->toread) ||
(s->failed >= 2 && fdev[1]->toread) ||
(sh->raid_conf->level <= 5 && s->failed && fdev[0]->towrite &&
+ (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) &&
!test_bit(R5_OVERWRITE, &fdev[0]->flags)) ||
- (sh->raid_conf->level == 6 && s->failed && s->to_write))) {
+ (sh->raid_conf->level == 6 && s->failed && s->to_write &&
+ s->to_write < sh->raid_conf->raid_disks - 2 &&
+ (!test_bit(R5_Insync, &dev->flags) || test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))))) {
/* we would like to get this block, possibly by computing it,
* otherwise read it if the backing disk is insync
*/
@@ -2991,12 +3027,17 @@ static void handle_stripe_clean_event(struct r5conf *conf,
dev = &sh->dev[i];
if (!test_bit(R5_LOCKED, &dev->flags) &&
(test_bit(R5_UPTODATE, &dev->flags) ||
- test_bit(R5_Discard, &dev->flags))) {
+ test_bit(R5_Discard, &dev->flags) ||
+ test_bit(R5_SkipCopy, &dev->flags))) {
/* We can return any write requests */
struct bio *wbi, *wbi2;
pr_debug("Return write for disc %d\n", i);
if (test_and_clear_bit(R5_Discard, &dev->flags))
clear_bit(R5_UPTODATE, &dev->flags);
+ if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
+ WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
+ dev->page = dev->orig_page;
+ }
wbi = dev->written;
dev->written = NULL;
while (wbi && wbi->bi_iter.bi_sector <
@@ -3015,6 +3056,8 @@ static void handle_stripe_clean_event(struct r5conf *conf,
0);
} else if (test_bit(R5_Discard, &dev->flags))
discard_pending = 1;
+ WARN_ON(test_bit(R5_SkipCopy, &dev->flags));
+ WARN_ON(dev->page != dev->orig_page);
}
if (!discard_pending &&
test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
@@ -3086,7 +3129,8 @@ static void handle_stripe_dirtying(struct r5conf *conf,
!test_bit(R5_LOCKED, &dev->flags) &&
!(test_bit(R5_UPTODATE, &dev->flags) ||
test_bit(R5_Wantcompute, &dev->flags))) {
- if (test_bit(R5_Insync, &dev->flags)) rcw++;
+ if (test_bit(R5_Insync, &dev->flags))
+ rcw++;
else
rcw += 2*disks;
}
@@ -3107,10 +3151,10 @@ static void handle_stripe_dirtying(struct r5conf *conf,
!(test_bit(R5_UPTODATE, &dev->flags) ||
test_bit(R5_Wantcompute, &dev->flags)) &&
test_bit(R5_Insync, &dev->flags)) {
- if (
- test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
- pr_debug("Read_old block "
- "%d for r-m-w\n", i);
+ if (test_bit(STRIPE_PREREAD_ACTIVE,
+ &sh->state)) {
+ pr_debug("Read_old block %d for r-m-w\n",
+ i);
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags);
s->locked++;
@@ -3133,10 +3177,9 @@ static void handle_stripe_dirtying(struct r5conf *conf,
!(test_bit(R5_UPTODATE, &dev->flags) ||
test_bit(R5_Wantcompute, &dev->flags))) {
rcw++;
- if (!test_bit(R5_Insync, &dev->flags))
- continue; /* it's a failed drive */
- if (
- test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
+ if (test_bit(R5_Insync, &dev->flags) &&
+ test_bit(STRIPE_PREREAD_ACTIVE,
+ &sh->state)) {
pr_debug("Read_old block "
"%d for Reconstruct\n", i);
set_bit(R5_LOCKED, &dev->flags);
@@ -5031,8 +5074,8 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
+ set_bit(STRIPE_HANDLE, &sh->state);
- handle_stripe(sh);
release_stripe(sh);
return STRIPE_SECTORS;
@@ -5072,7 +5115,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
/* already done this stripe */
continue;
- sh = get_active_stripe(conf, sector, 0, 1, 0);
+ sh = get_active_stripe(conf, sector, 0, 1, 1);
if (!sh) {
/* failed to get a stripe - must wait */
@@ -5355,6 +5398,50 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
raid5_store_preread_threshold);
static ssize_t
+raid5_show_skip_copy(struct mddev *mddev, char *page)
+{
+ struct r5conf *conf = mddev->private;
+ if (conf)
+ return sprintf(page, "%d\n", conf->skip_copy);
+ else
+ return 0;
+}
+
+static ssize_t
+raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
+{
+ struct r5conf *conf = mddev->private;
+ unsigned long new;
+ if (len >= PAGE_SIZE)
+ return -EINVAL;
+ if (!conf)
+ return -ENODEV;
+
+ if (kstrtoul(page, 10, &new))
+ return -EINVAL;
+ new = !!new;
+ if (new == conf->skip_copy)
+ return len;
+
+ mddev_suspend(mddev);
+ conf->skip_copy = new;
+ if (new)
+ mddev->queue->backing_dev_info.capabilities |=
+ BDI_CAP_STABLE_WRITES;
+ else
+ mddev->queue->backing_dev_info.capabilities &=
+ ~BDI_CAP_STABLE_WRITES;
+ mddev_resume(mddev);
+ return len;
+}
+
+static struct md_sysfs_entry
+raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR,
+ raid5_show_skip_copy,
+ raid5_store_skip_copy);
+
+
+static ssize_t
stripe_cache_active_show(struct mddev *mddev, char *page)
{
struct r5conf *conf = mddev->private;
@@ -5439,6 +5526,7 @@ static struct attribute *raid5_attrs[] = {
&raid5_stripecache_active.attr,
&raid5_preread_bypass_threshold.attr,
&raid5_group_thread_cnt.attr,
+ &raid5_skip_copy.attr,
NULL,
};
static struct attribute_group raid5_attrs_group = {
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 01ad8ae8f578..bc72cd4be5f8 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -232,7 +232,7 @@ struct stripe_head {
*/
struct bio req, rreq;
struct bio_vec vec, rvec;
- struct page *page;
+ struct page *page, *orig_page;
struct bio *toread, *read, *towrite, *written;
sector_t sector; /* sector of this page */
unsigned long flags;
@@ -299,6 +299,7 @@ enum r5dev_flags {
* data in, and now is a good time to write it out.
*/
R5_Discard, /* Discard the stripe */
+ R5_SkipCopy, /* Don't copy data from bio to stripe cache */
};
/*
@@ -436,6 +437,7 @@ struct r5conf {
atomic_t pending_full_writes; /* full write backlog */
int bypass_count; /* bypassed prereads */
int bypass_threshold; /* preread nice */
+ int skip_copy; /* Don't copy data from bio to stripe cache */
struct list_head *last_hold; /* detect hold_list promotions */
atomic_t reshape_stripes; /* stripes with pending writes for reshape */